From 52807002c99563b65b087c7929cd0f5f8bf82176 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Sun, 30 Dec 2018 16:39:11 -0700 Subject: [PATCH 01/95] Format tweaks. --- .../standard/pipeline/advance_p_pipeline.cc | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/species_advance/standard/pipeline/advance_p_pipeline.cc b/src/species_advance/standard/pipeline/advance_p_pipeline.cc index a222d76f..e275b116 100644 --- a/src/species_advance/standard/pipeline/advance_p_pipeline.cc +++ b/src/species_advance/standard/pipeline/advance_p_pipeline.cc @@ -252,9 +252,13 @@ advance_p_pipeline( species_t * RESTRICT sp, int rank; - if ( !sp || !aa || !ia || sp->g != aa->g || sp->g != ia->g ) + if ( ! sp || + ! aa || + ! ia || + sp->g != aa->g || + sp->g != ia->g ) { - ERROR( ( "Bad args" ) ); + ERROR( ( "Bad args." ) ); } args->p0 = sp->p; @@ -264,10 +268,10 @@ advance_p_pipeline( species_t * RESTRICT sp, args->seg = seg; args->g = sp->g; - args->qdt_2mc = (sp->q*sp->g->dt)/(2*sp->m*sp->g->cvac); - args->cdt_dx = sp->g->cvac*sp->g->dt*sp->g->rdx; - args->cdt_dy = sp->g->cvac*sp->g->dt*sp->g->rdy; - args->cdt_dz = sp->g->cvac*sp->g->dt*sp->g->rdz; + args->qdt_2mc = ( sp->q * sp->g->dt ) / ( 2 * sp->m * sp->g->cvac ); + args->cdt_dx = sp->g->cvac * sp->g->dt * sp->g->rdx; + args->cdt_dy = sp->g->cvac * sp->g->dt * sp->g->rdy; + args->cdt_dz = sp->g->cvac * sp->g->dt * sp->g->rdz; args->qsp = sp->q; args->np = sp->np; @@ -300,7 +304,7 @@ advance_p_pipeline( species_t * RESTRICT sp, { if ( args->seg[rank].n_ignored ) { - WARNING( ( "Pipeline %i ran out of storage for %i movers", + WARNING( ( "Pipeline %i ran out of storage for %i movers.", rank, args->seg[rank].n_ignored ) ); } From 49a2f0ff3ff580bb8d8ad3168def1c14274618b8 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 7 Jan 2019 13:28:39 -0700 Subject: [PATCH 02/95] Fix what appears to be a memory error that dates back to the v407 version. 
--- .../standard/pipeline/advance_p_pipeline.cc | 2 ++ .../standard/pipeline/advance_p_pipeline_v16.cc | 7 +++++-- .../standard/pipeline/advance_p_pipeline_v4.cc | 7 +++++-- .../standard/pipeline/advance_p_pipeline_v8.cc | 7 +++++-- 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/species_advance/standard/pipeline/advance_p_pipeline.cc b/src/species_advance/standard/pipeline/advance_p_pipeline.cc index e275b116..8dde6f27 100644 --- a/src/species_advance/standard/pipeline/advance_p_pipeline.cc +++ b/src/species_advance/standard/pipeline/advance_p_pipeline.cc @@ -78,8 +78,10 @@ advance_p_pipeline_scalar( advance_p_pipeline_args_t * args, // The host gets the first accumulator array. if ( pipeline_rank != n_pipeline ) + { a0 += ( 1 + pipeline_rank ) * POW2_CEIL( (args->nx+2)*(args->ny+2)*(args->nz+2), 2 ); + } // Process particles for this pipeline. diff --git a/src/species_advance/standard/pipeline/advance_p_pipeline_v16.cc b/src/species_advance/standard/pipeline/advance_p_pipeline_v16.cc index bc152588..ef6f8b1a 100644 --- a/src/species_advance/standard/pipeline/advance_p_pipeline_v16.cc +++ b/src/species_advance/standard/pipeline/advance_p_pipeline_v16.cc @@ -101,8 +101,11 @@ advance_p_pipeline_v16( advance_p_pipeline_args_t * args, // Determine which accumulator array to use. // The host gets the first accumulator array. - a0 += ( 1 + pipeline_rank ) * - POW2_CEIL( (args->nx+2)*(args->ny+2)*(args->nz+2), 2 ); + if ( pipeline_rank != n_pipeline ) + { + a0 += ( 1 + pipeline_rank ) * + POW2_CEIL( (args->nx+2)*(args->ny+2)*(args->nz+2), 2 ); + } // Process the particle blocks for this pipeline. 
diff --git a/src/species_advance/standard/pipeline/advance_p_pipeline_v4.cc b/src/species_advance/standard/pipeline/advance_p_pipeline_v4.cc index 4e23770e..19d82ade 100644 --- a/src/species_advance/standard/pipeline/advance_p_pipeline_v4.cc +++ b/src/species_advance/standard/pipeline/advance_p_pipeline_v4.cc @@ -76,8 +76,11 @@ advance_p_pipeline_v4( advance_p_pipeline_args_t * args, // Determine which accumulator array to use. // The host gets the first accumulator array. - a0 += ( 1 + pipeline_rank ) * - POW2_CEIL( (args->nx+2)*(args->ny+2)*(args->nz+2), 2 ); + if ( pipeline_rank != n_pipeline ) + { + a0 += ( 1 + pipeline_rank ) * + POW2_CEIL( (args->nx+2)*(args->ny+2)*(args->nz+2), 2 ); + } // Process the particle blocks for this pipeline. diff --git a/src/species_advance/standard/pipeline/advance_p_pipeline_v8.cc b/src/species_advance/standard/pipeline/advance_p_pipeline_v8.cc index 84ed3916..0890d554 100644 --- a/src/species_advance/standard/pipeline/advance_p_pipeline_v8.cc +++ b/src/species_advance/standard/pipeline/advance_p_pipeline_v8.cc @@ -80,8 +80,11 @@ advance_p_pipeline_v8( advance_p_pipeline_args_t * args, // Determine which accumulator array to use. // The host gets the first accumulator array. - a0 += ( 1 + pipeline_rank ) * - POW2_CEIL( (args->nx+2)*(args->ny+2)*(args->nz+2), 2 ); + if ( pipeline_rank != n_pipeline ) + { + a0 += ( 1 + pipeline_rank ) * + POW2_CEIL( (args->nx+2)*(args->ny+2)*(args->nz+2), 2 ); + } // Process the particle blocks for this pipeline. From d1227049ad99c663034c61f9d2e160565d37307f Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Thu, 31 Jan 2019 01:34:43 -0700 Subject: [PATCH 03/95] Add configurable and documented build scripts for building VPIC on LANL ATS-1 and CTS-1 machines. Document how to use these two scripts. 
--- README.md | 22 +- arch/lanl-ats1 | 967 +++++++++++++++++++++++++++++++++++++++++++++++++ arch/lanl-cts1 | 829 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1815 insertions(+), 3 deletions(-) create mode 100755 arch/lanl-ats1 create mode 100755 arch/lanl-cts1 diff --git a/README.md b/README.md index c36de28b..d20a36b8 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ the top-level source directory: cd build ``` -The `./arch` directory also contains various cmake scripts (including specific build options) which can help with building +The `./arch` directory also contains various cmake scripts (including specific build options) which can help with building. They can be invoked using something like: @@ -97,9 +97,25 @@ After configuration, simply type: make ``` -Advanced users may chose to instead invoke `cmake` directly and hand select options +Two scripts in the `./arch` directory are of particular note: lanl-ats1 and lanl-cts1. These scripts provide a default way to build VPIC +on LANL ATS-1 clusters such as Trinity and Trinitite and LANL CTS-1 clusters. The LANL ATS-1 clusters are the first generation +of DOE Advanced Technology Systems and consist of a partition of dual socket Intel Haswell nodes and a partition of single socket +Intel Knights Landing nodes. The LANL CTS-1 clusters are the first generation of DOE Commodity Technology Systems and consist of +dual socket Intel Broadwell nodes running the TOSS 3.3 operating system. The lanl-ats1 and lanl-cts1 scripts are heavily +documented and can be configured to provide a large variety of custom builds for their respective platform types. These +scripts could also serve as a good starting point for development of a build script for other machine types. Because these +scripts also configure the users build environment via the use of module commands, the scripts run both the cmake and make +commands. 
-GCC users should ensure the `-fno-strict-aliasing` compiler flag is set (as shown in `./arch/generic-gcc-sse`) +From the user-created build directory, these scripts can be invoked as follows: + +```bash + ../arch/lanl-ats1 +``` + +Advanced users may choose to instead invoke `cmake` directly and hand select options. + +GCC users should ensure the `-fno-strict-aliasing` compiler flag is set (as shown in `./arch/generic-gcc-sse`). # Building an example input deck diff --git a/arch/lanl-ats1 b/arch/lanl-ats1 new file mode 100755 index 00000000..981377ae --- /dev/null +++ b/arch/lanl-ats1 @@ -0,0 +1,967 @@ +#! /usr/bin/env bash +#------------------------------------------------------------------------------# +# This script supports building VPIC on ATS-1 machines at Los Alamos National +# Laboratory (LANL). These machines run the Cray Linux Environment Operating +# System and have two compute partitions, a Haswell partition and a Knights +# Landing (KNL) partition. Both processor types are Intel processors. These +# machines provide three compiler choices: Intel, GNU and Cray compilers. Two +# MPI implementations are provided: Cray Mpich and Open MPI. +# +# Normal users should not need to change this script if building VPIC to run +# on the KNL nodes of ATS-1 machines and are happy with defaults. +# +# If normal users desire to build VPIC to run on the Haswell nodes of ATS-1 +# machines, they will need to change this script in two places: first in the +# section where a node type is chosen and second in the section where the type +# of vector intrinsics used is chosen. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Get the path to the project from which this script was called. +#------------------------------------------------------------------------------# + +src_dir="${0%/*}/.." 
+ +#------------------------------------------------------------------------------# +# Configure the type of build that we want to perform. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Choose a compiler. +#------------------------------------------------------------------------------# +# One of the compiler choices in this section must be chosen. Valid options +# are the following. +# +# INT: Intel compilers +# GNU: GNU compilers +# CCE: Cray compilers +# +# Note that selecting CCE for the Cray compilers currently does not work. The +# main reason why you might want to compile with the Cray compilers is to use +# some of the Cray specific tools like Reveal or a small set of features in +# the CrayPat profiling software. This is not a common use case for users. +#------------------------------------------------------------------------------# + +VCOM="INT" +#VCOM="GNU" +#VCOM="CCE" + +#------------------------------------------------------------------------------# +# Choose a processor node type. +#------------------------------------------------------------------------------# +# One of the node types must be chosen. Valid options are the following. +# +# KNL: Knights Landing nodes +# HSW: Haswell nodes +# +# If HSW, for Haswell, is chosen, you must also change the section on vector +# intrinsics support below to turn off support for V16_AVX512. Normally, you +# would also turn on support for V8_AVX2. See the documentation on the vector +# intrinsics section below for more details. +#------------------------------------------------------------------------------# + +KNL="yes" +#HSW="yes" + +#------------------------------------------------------------------------------# +# Choose an MPI implementation. +#------------------------------------------------------------------------------# +# One of the MPI library choices must be chosen. 
Valid options are the +# following. +# +# CMPI: Cray Mpich, the Cray supported MPI library +# OMPI: Open MPI +#------------------------------------------------------------------------------# + +VMPI="CMPI" +#VMPI="OMPI" + +#------------------------------------------------------------------------------# +# Choose a thread model. +#------------------------------------------------------------------------------# +# One of the two available thread models must be chosen. Valid options are the +# following. +# +# PTH: Pthreads +# OMP: OpenMP +#------------------------------------------------------------------------------# + +VTHR="PTH" +#VTHR="OMP" + +#------------------------------------------------------------------------------# +# Choose format of status update output. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. +# +# If SET_MORE_DIGITS=OFF, the output has two significant figures. +# +# If SET_MORE_DIGITS=ON, the output has four significant figures. +#------------------------------------------------------------------------------# + +SET_MORE_DIGITS="OFF" +#SET_MORE_DIGITS="ON" + +#------------------------------------------------------------------------------# +# Choose type of vector intrinsics support. +#------------------------------------------------------------------------------# +# Note the following constraints. +# +# Each of the nine variables in this section must have a configured value. +# This is because the corresponding "USE" cmake variable is set on the cmake +# command line below to allow any possible combinations to be configured using +# a single cmake command. +# +# If all values are configured as OFF, the scalar implementations of VPIC +# functions which are not vectorized will be used. +# +# It is possible to have a vector version configured as ON for each of the +# three vector widths i.e. V4, V8 and V16. 
In that scenario, if a given VPIC +# function has a V16 implementation, that will be used. If there is not a V16 +# implementation but there is a V8 implementation, that will be used. If there +# is not a V16 or V8 implementation but there is a V4 implementation, that +# will be used. Finally, for functions that have no vector implementations, +# the scalar version will be used. +# +# Currently, it is recommended to always configure the appropriate V4 version +# as on if using vector versions because there are key functions that only +# have a V4 version because the current algorithm does not generalize to +# longer vector lengths. An example is the move_p function. Since the V4 +# versions are generally more performant than the scalar versions, it makes +# sense to use them even when using the longer vector length implementations +# for other VPIC functions. +# +# In summary, when using vector versions on a machine with 256 bit SIMD, the +# V4 and V8 implementations should be configured as ON. When using a machine +# with 512 bit SIMD, V4 and V16 implementations should be configured as ON. +# +# First, we turn all of the vector options OFF. Then, we turn on the ones we +# want. +#------------------------------------------------------------------------------# + +SET_V4_PORTABLE="OFF" +SET_V4_SSE="OFF" +SET_V4_AVX="OFF" +SET_V4_AVX2="OFF" +SET_V8_PORTABLE="OFF" +SET_V8_AVX="OFF" +SET_V8_AVX2="OFF" +SET_V16_PORTABLE="OFF" +SET_V16_AVX512="OFF" + +#SET_V4_PORTABLE="ON" +#SET_V4_SSE="ON" +#SET_V4_AVX="ON" +SET_V4_AVX2="ON" +#SET_V8_PORTABLE="ON" +#SET_V8_AVX="ON" +#SET_V8_AVX2="ON" +#SET_V16_PORTABLE="ON" +SET_V16_AVX512="ON" + +#------------------------------------------------------------------------------# +# Choose a particle sort implementation. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are the +# following. 
+# +# LSORT: legacy, thread serial sort +# TSORT: thread parallel sort +# +# The LSORT particle sort implementation is the thread serial particle sort +# implementation from the legacy v407 version of VPIC. This implementation +# supports both in-place and out-of-place sorting of the particles. It is very +# competitive with the thread parallel sort implementation for a small number +# of threads per MPI rank, i.e. 4 or less, especially on KNL because sorting +# the particles in-place allows the fraction of particles stored in High +# Bandwidth Memory (HBM) to remain stored in HBM. Also, the memory footprint +# of VPIC is reduced by the memory of a particle array which can be significant +# for particle dominated problems. +# +# The TSORT particle sort implementation is a thread parallel implementation. +# Currently, it can only perform out-of-place sorting of the particles. It will +# be more performant than the LSORT implementation when using many threads per +# MPI rank but uses more memory because of the out-of-place sort. +#------------------------------------------------------------------------------# + +VSORT="LSORT" +#VSORT="TSORT" + +#------------------------------------------------------------------------------# +# Choose type of library to build. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is to build a static library, i.e. OFF. +#------------------------------------------------------------------------------# + +SET_SHARED_LIBS="OFF" +#SET_SHARED_LIBS="ON" + +#------------------------------------------------------------------------------# +# Choose integrated test support. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is not to build the integrated tests, i.e. OFF. 
+#------------------------------------------------------------------------------# + +SET_INTEGRATED_TESTS="OFF" +#SET_INTEGRATED_TESTS="ON" + +#------------------------------------------------------------------------------# +# Choose unit test support. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is not to build the unit tests, i.e. OFF. +#------------------------------------------------------------------------------# + +SET_UNIT_TESTS="OFF" +#SET_UNIT_TESTS="ON" + +#------------------------------------------------------------------------------# +# Choose the CMake build type. +#------------------------------------------------------------------------------# +# One of the available options must be chosen. Valid options depend on build +# types available in the CMake version but include at least the following. +# +# Release: In general, the default for CMake. +# None: Tells CMake not to use any pre-defined build type and gives VPIC build +# system total control of CMake variables defined on cmake command line. +#------------------------------------------------------------------------------# + +SET_BUILD_TYPE="Release" +#SET_BUILD_TYPE="None" + +#------------------------------------------------------------------------------# +# Choose number of parallel make processes for build. +#------------------------------------------------------------------------------# +# If NJ variable is not defined, "make" will perform a parallel build using +# maximum number of processors on the compilation machine. +# +# If using VERBOSE = 1 and NJ > 1, verbose output will be garbled by many +# processes writing to STDOUT at the same time and will be difficult to +# interpret. +# +# When using VERBOSE = 1, use of NJ = 1 is recommended. +# +# The default is to use a modest number of processes in the parallel build. 
+# +# Comment out default below to use all processors on compilation machine. +#------------------------------------------------------------------------------# + +NJ=8 +#NJ=1 + +#------------------------------------------------------------------------------# +# Choose verbosity of "make" output. +#------------------------------------------------------------------------------# +# Setting VERBOSE = 1 causes "make" to output commands it is executing. +# +# This information is useful if debugging a failed build. +# +# Setting VERBOSE = 0 or leaving VERBOSE undefined results in a quiet build. +# +# The default is a quiet build. +#------------------------------------------------------------------------------# + +SET_VERBOSE=0 +#SET_VERBOSE=1 + +#------------------------------------------------------------------------------# +# Choose versions of modules to use if default is not desired. +#------------------------------------------------------------------------------# +# No choice is required in this section. +# +# Some possible alternative module versions are provided below. Change as +# needed or desired. +# +# This section may need to be updated periodically as the module environment +# evolves because of updates to operating system and programming environment. +#------------------------------------------------------------------------------# + +#VERSION_CMAKE=3.12.1 + +#VERSION_INTEL=19.0.1 +#VERSION_INTEL_VTUNE_AMPLIFIER=2019.1.0 +#VERSION_INTEL_VECTOR_ADVISOR=2019.1.0 +#VERSION_INTEL_INSPECTOR=2019.1.0 +#VERSION_INTEL_TRACE_ANALYZER=2019.1.022 + +#VERSION_GNU=7.3.0 + +#VERSION_CCE=9.0.0.21672 +#VERSION_CRAY_MPICH=7.7.4.4 +#VERSION_CRAY_PERF_TOOLS=7.0.4 + +#VERSION_OPEN_MPI=3.1.2 + +#VERSION_FORGE=18.3 + +#------------------------------------------------------------------------------# +# Unless the user wants to modify options to the compiler, no changes should +# be needed below this point. 
+# +# If the user desires to configure compiler options, proceed to the section +# below for the chosen compiler. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Configure default compiler names to use Cray wrapper scripts. +#------------------------------------------------------------------------------# + +VPIC_COMPILER_C="cc" +VPIC_COMPILER_CXX="CC" + +if [ "$VMPI" = "OMPI" ] +then + VPIC_COMPILER_C="mpicc" + VPIC_COMPILER_CXX="mpicxx" +fi + +#------------------------------------------------------------------------------# +# Configure options for the Intel compilers. +#------------------------------------------------------------------------------# + +if [ "$VCOM" = "INT" ] +then + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O3" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O3" could be + # "-O2" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O3" + + #--------------------------------------------------------------------------# + # Use of "-inline-forceinline" overrides default heuristics of compiler + # and forces inlining of functions marked with inline keyword if compiler + # is able to inline. 
For VPIC, this option has mainly been used when using + # a portable implementation to force inlining by compiler and also when + # use of "-Winline" option identifies functions not being inlined that are + # marked with inline keyword. + # + # Use of "-qoverride-limits" causes certain internal compiler limits to be + # ignored that are used to limit memory usage and excessive compile times + # by the compiler. + # + # Use of "-vec-threshold0" ignores compiler heuristics and causes loops to + # be vectorized always, regardless of computation work volume. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -inline-forceinline" + #FLAGS_CXX_COMPILER+=" -vec-threshold0" + FLAGS_CXX_COMPILER+=" -qoverride-limits" + + #--------------------------------------------------------------------------# + # Use of "-no-ansi-alias" informs compiler that VPIC does not obey ANSI + # aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -no-ansi-alias" + + #--------------------------------------------------------------------------# + # Use of "-Winline" causes compiler to emit a warning when a function that + # is declared inline is not inlined. Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + # + # Use of "-craype-verbose" causes Cray compiler wrapper script to print + # command it is forwarding to actual compiler for invocation. This is very + # useful for producing a build log to make sure compiler is being invoked + # with expected options. 
+ #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -Winline" + FLAGS_CXX_COMPILER+=" -craype-verbose" + + #--------------------------------------------------------------------------# + # Use of "-qopt-report=5" specifies level of detail in compiler reports. + # This is the maximum level of detail. + # + # Use of "-qopt-report-phase=all" causes all phases of compilation process + # to provide output for compiler reports. Compiler reports are useful for + # understanding how compiler is optimizing various parts of VPIC. + # + # Use of "-diag-disable 10397" disables printing of diagnostic message + # that compiler reports are being generated. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -qopt-report=5" + FLAGS_CXX_COMPILER+=" -qopt-report-phase=all" + FLAGS_CXX_COMPILER+=" -diag-disable 10397" + + #--------------------------------------------------------------------------# + # Use of "-Wl,--export-dynamic" removes following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). + # Writing out the raw address instead and keeping my fingers crossed. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -Wl,--export-dynamic" + + #--------------------------------------------------------------------------# + # Use of "-dynamic" causes Cray compiler wrapper to direct compiler driver + # to link dynamic libraries at runtime instead of static libraries. The + # default on Cray systems is to link static libraries. It is important for + # many tools, especially performance analysis tools, to have an executable + # that has been linked dynamically to system libraries and MPI libraries. 
+ #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # Use of "-qopt-zmm-usage=high" causes the compiler to generate zmm code, + # i.e. AVX-512 code, without any restrictions. Extensive use of AVX-512 + # code causes the CPU core to down clock or throttle to avoid overheating. + # The default is for the compiler to use some internal limits on how much + # AVX-512 instructions are used. This is relevant on ATS-1 systems only + # for KNL processors. + #--------------------------------------------------------------------------# + + if [ "$KNL" = "yes" ] + then + FLAGS_CXX_COMPILER+=" -qopt-zmm-usage=high" + fi + + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O3" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O3" could be + # "-O2" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER="-g -O3" + + #--------------------------------------------------------------------------# + # Use of "-inline-forceinline" overrides default heuristics of compiler + # and forces inlining of functions marked with inline keyword if compiler + # is able to inline. For VPIC, this option has mainly been used when using + # a portable implementation to force inlining by compiler and also when + # use of "-Winline" option identifies functions not being inlined that are + # marked with inline keyword. 
+ # + # Use of "-qoverride-limits" causes certain internal compiler limits to be + # ignored that are used to limit memory usage and excessive compile times + # by the compiler. + # + # Use of "-vec-threshold0" ignores compiler heuristics and causes loops to + # be vectorized always, regardless of computation work volume. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -inline-forceinline" + #FLAGS_C_COMPILER+=" -vec-threshold0" + FLAGS_C_COMPILER+=" -qoverride-limits" + + #--------------------------------------------------------------------------# + # Use of "-no-ansi-alias" informs compiler that VPIC does not obey ANSI + # aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -no-ansi-alias" + + #--------------------------------------------------------------------------# + # Use of "-Winline" causes compiler to emit a warning when a function that + # is declared inline is not inlined. Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + # + # Use of "-craype-verbose" causes Cray compiler wrapper script to print + # command it is forwarding to actual compiler for invocation. This is very + # useful for producing a build log to make sure compiler is being invoked + # with expected options. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -Winline" + FLAGS_C_COMPILER+=" -craype-verbose" + + #--------------------------------------------------------------------------# + # Use of "-qopt-report=5" specifies level of detail in compiler reports. + # This is the maximum level of detail. + # + # Use of "-qopt-report-phase=all" causes all phases of compilation process + # to provide output for compiler reports. 
Compiler reports are useful for + # understanding how compiler is optimizing various parts of VPIC. + # + # Use of "-diag-disable 10397" disables printing of diagnostic message + # that compiler reports are being generated. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -qopt-report=5" + FLAGS_C_COMPILER+=" -qopt-report-phase=all" + FLAGS_C_COMPILER+=" -diag-disable 10397" + + #--------------------------------------------------------------------------# + # Use of "-Wl,--export-dynamic" removes following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). + # Writing out the raw address instead and keeping my fingers crossed. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -Wl,--export-dynamic" + + #--------------------------------------------------------------------------# + # Use of "-dynamic" causes Cray compiler wrapper to direct compiler driver + # to link dynamic libraries at runtime instead of static libraries. The + # default on Cray systems is to link static libraries. It is important for + # many tools, especially performance analysis tools, to have an executable + # that has been linked dynamically to system libraries and MPI libraries. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # Use of "-qopt-zmm-usage=high" causes the compiler to generate zmm code, + # i.e. AVX-512 code, without any restrictions. Extensive use of AVX-512 + # code causes the CPU core to down clock or throttle to avoid overheating. + # The default is for the compiler to use some internal limits on how much + # AVX-512 instructions are used. This is relevant on ATS-1 systems only + # for KNL processors. 
+ #--------------------------------------------------------------------------# + + if [ "$KNL" = "yes" ] + then + FLAGS_C_COMPILER+=" -qopt-zmm-usage=high" + fi +fi + +#------------------------------------------------------------------------------# +# Configure options for the GNU compilers. +#------------------------------------------------------------------------------# + +if [ "$VCOM" = "GNU" ] +then + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O2" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O2" could be + # "-O3" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O2" + + #--------------------------------------------------------------------------# + # Use of "-ffast-math" causes compiler to relax various IEEE or ISO rules + # and specifications for math functions which can result in faster code. + # + # Use of "-fno-unsafe-math-optimizations" turns off some unsafe math + # optimizations that got turned on by use of "-ffast-math" option. Some + # comments in VPIC source code indicate need for this with older compilers. + # This should be checked some time to see if it is still a relevant issue. 
+ #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -ffast-math" + FLAGS_CXX_COMPILER+=" -fno-unsafe-math-optimizations" + + #--------------------------------------------------------------------------# + # Use of "-fomit-frame-pointer" prevents keeping the frame pointer in a + # register for functions that do not need one. This can make an extra + # register available in many functions and reduce number of overall + # instructions. Some profiling should be done to measure the benefit of + # using this option. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -fomit-frame-pointer" + + #--------------------------------------------------------------------------# + # Use of "-fno-strict-aliasing" informs compiler that VPIC does not obey + # ANSI aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -fno-strict-aliasing" + + #--------------------------------------------------------------------------# + # Use of "-Winline" cause compiler to emit a warning when a function that + # is declared inline is not inlined. Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -Winline" + + #--------------------------------------------------------------------------# + # Use of "-rdynamic" removes the following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). + # Writing out the raw address instead and keeping my fingers crossed. + # + # From g++ man page: Pass the flag -export-dynamic to the ELF linker, on + # targets that support it. 
This instructs the linker to add all symbols, + # not only used ones, to the dynamic symbol table. This option is needed + # for some uses of "dlopen" or to allow obtaining backtraces from within + # a program. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -rdynamic" + + #--------------------------------------------------------------------------# + # Use of "-dynamic" causes Cray compiler wrapper to direct compiler driver + # to link dynamic libraries at runtime instead of static libraries. The + # default on Cray systems is to link static libraries. It is important for + # many tools, especially performance analysis tools, to have an executable + # that has been linked dynamically to system libraries and MPI libraries. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # Use of "-march=knl" or "-march=haswell" causes g++ to generate code + # specific to and optimized for the specific architecture of either KNL + # or Haswell. It appears that the Cray wrappers already do this correctly + # for KNL but it seems they may not for Haswell. + #--------------------------------------------------------------------------# + + if [ "$KNL" = "yes" ] + then + FLAGS_CXX_COMPILER+=" -march=knl" + else + FLAGS_CXX_COMPILER+=" -march=haswell" + fi + + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O2" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. 
Reasonable alternatives to "-O2" could be + # "-O3" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER="-g -O2" + + #--------------------------------------------------------------------------# + # Use of "-ffast-math" causes compiler to relax various IEEE or ISO rules + # and specifications for math functions which can result in faster code. + # + # Use of "-fno-unsafe-math-optimizations" turns off some unsafe math + # optimizations that got turned on by use of "-ffast-math" option. Some + # comments in VPIC source code indicate need for this with older compilers. + # This should be checked some time to see if it is still a relevant issue. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -ffast-math" + FLAGS_C_COMPILER+=" -fno-unsafe-math-optimizations" + + #--------------------------------------------------------------------------# + # Use of "-fomit-frame-pointer" prevents keeping the frame pointer in a + # register for functions that do not need one. This can make an extra + # register available in many functions and reduce number of overall + # instructions. Some profiling should be done to measure the benefit of + # using this option. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -fomit-frame-pointer" + + #--------------------------------------------------------------------------# + # Use of "-fno-strict-aliasing" informs compiler that VPIC does not obey + # ANSI aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -fno-strict-aliasing" + + #--------------------------------------------------------------------------# + # Use of "-Winline" cause compiler to emit a warning when a function that + # is declared inline is not inlined. 
Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -Winline" + + #--------------------------------------------------------------------------# + # Use of "-rdynamic" removes the following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). + # Writing out the raw address instead and keeping my fingers crossed. + # + # From gcc man page: Pass the flag -export-dynamic to the ELF linker, on + # targets that support it. This instructs the linker to add all symbols, + # not only used ones, to the dynamic symbol table. This option is needed + # for some uses of "dlopen" or to allow obtaining backtraces from within + # a program. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -rdynamic" + + #--------------------------------------------------------------------------# + # Use of "-dynamic" causes Cray compiler wrapper to direct compiler driver + # to link dynamic libraries at runtime instead of static libraries. The + # default on Cray systems is to link static libraries. It is important for + # many tools, especially performance analysis tools, to have an executable + # that has been linked dynamically to system libraries and MPI libraries. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # Use of "-march=knl" or "-march=haswell" causes gcc to generate code + # specific to and optimized for the specific architecture of either KNL + # or Haswell. It appears that the Cray wrappers already do this correctly + # for KNL but it seems they may not for Haswell. 
+ #--------------------------------------------------------------------------# + + if [ "$KNL" = "yes" ] + then + FLAGS_C_COMPILER+=" -march=knl" + else + FLAGS_C_COMPILER+=" -march=haswell" + fi +fi + +#------------------------------------------------------------------------------# +# Configure options for the Cray compilers. +#------------------------------------------------------------------------------# + +if [ "$VCOM" = "CCE" ] +then + #--------------------------------------------------------------------------# + # + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O2" + #FLAGS_CXX_COMPILER+=" -hlist=ad" + #FLAGS_CXX_COMPILER+=" -hipa5" + FLAGS_CXX_COMPILER+=" -Wl,--export-dynamic" + #FLAGS_CXX_COMPILER+=" -rdynamic" + FLAGS_CXX_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER="-g -O2" + #FLAGS_C_COMPILER+=" -hlist=ad" + #FLAGS_C_COMPILER+=" -hipa5" + FLAGS_C_COMPILER+=" -Wl,--export-dynamic" + #FLAGS_C_COMPILER+=" -rdynamic" + FLAGS_C_COMPILER+=" -dynamic" +fi + +#------------------------------------------------------------------------------# +# This ends user configuration section. +# +# No changes required below unless VPIC build system has been extended or the +# module system on ATS-1 machines has changed in some fundamental way. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Configure thread model. 
+#------------------------------------------------------------------------------# + +if [ "$VTHR" = "PTH" ] +then + SET_OPENMP="OFF" + SET_PTHREADS="ON" +fi + +if [ "$VTHR" = "OMP" ] +then + SET_OPENMP="ON" + SET_PTHREADS="OFF" +fi + +#------------------------------------------------------------------------------# +# Configure particle sort method. +#------------------------------------------------------------------------------# + +if [ "$VSORT" = "LSORT" ] +then + FLAGS_C_COMPILER+=" -DVPIC_USE_LEGACY_SORT" + + FLAGS_CXX_COMPILER+=" -DVPIC_USE_LEGACY_SORT" +fi + +#------------------------------------------------------------------------------# +# Configure environment using modules. +#------------------------------------------------------------------------------# +# Note that the user may want to modify the module configuration. +# +# Note that module commands used to define the build environment are captured +# in a Bash script named bashrc.modules which is written into the top level +# build directory. This script can be used in run scripts and other scenarios +# where there is a need to reproduce the environment used to build VPIC. +#------------------------------------------------------------------------------# + +echo '#!/bin/bash' >> bashrc.modules +echo "" >> bashrc.modules + +module load friendly-testing +echo "module load friendly-testing" >> bashrc.modules + +module load sandbox +echo "module load sandbox" >> bashrc.modules + +module load cmake +echo "module load cmake" >> bashrc.modules + +if [ ! "x$VERSION_CMAKE" = "x" ] +then + module swap cmake cmake/$VERSION_CMAKE + echo "module swap cmake cmake/$VERSION_CMAKE" >> bashrc.modules +fi + +module unload craype-hugepages2M +echo "module unload craype-hugepages2M" >> bashrc.modules + +if [ "$VCOM" = "INT" ] +then + if [ ! 
"x$VERSION_INTEL" = "x" ] + then + module swap intel intel/$VERSION_INTEL + echo "module swap intel intel/$VERSION_INTEL" >> bashrc.modules + fi +fi + +if [ "$VCOM" = "GNU" ] +then + module swap PrgEnv-intel PrgEnv-gnu + echo "module swap PrgEnv-intel PrgEnv-gnu" >> bashrc.modules + + if [ ! "x$VERSION_GNU" = "x" ] + then + module swap gcc gcc/$VERSION_GNU + echo "module swap gcc gcc/$VERSION_GNU" >> bashrc.modules + fi +fi + +if [ "$VCOM" = "CCE" ] +then + module swap PrgEnv-intel PrgEnv-cray + echo "module swap PrgEnv-intel PrgEnv-cray" >> bashrc.modules + + if [ ! "x$VERSION_CCE" = "x" ] + then + module swap cce cce/$VERSION_CCE + echo "module swap cce cce/$VERSION_CCE" >> bashrc.modules + fi +fi + +if [ "$KNL" = "yes" ] +then + module swap craype-haswell craype-mic-knl + echo "module swap craype-haswell craype-mic-knl" >> bashrc.modules +fi + +if [ "$VMPI" = "CMPI" ] +then + if [ ! "x$VERSION_CRAY_MPICH" = "x" ] + then + module swap cray-mpich cray-mpich/$VERSION_CRAY_MPICH + echo "module swap cray-mpich cray-mpich/$VERSION_CRAY_MPICH" >> bashrc.modules + fi + + export MPI_ROOT=$MPICH_DIR +fi + +if [ "$VMPI" = "OMPI" ] +then + module unload cray-mpich + echo "module unload cray-mpich" >> bashrc.modules + + module unload cray-libsci + echo "module unload cray-libsci" >> bashrc.modules + + module load openmpi + echo "module load openmpi" >> bashrc.modules + + if [ ! "x$VERSION_OPEN_MPI" = "x" ] + then + module swap openmpi openmpi/$VERSION_OPEN_MPI + echo "module swap openmpi openmpi/$VERSION_OPEN_MPI" >> bashrc.modules + fi +fi + +module list +echo "" >> bashrc.modules +echo "module list" >> bashrc.modules + +#------------------------------------------------------------------------------# +# Call cmake command. +#------------------------------------------------------------------------------# +# Notes: +# +# Use of the "-LAH" command line option to cmake causes cmake to output the +# values of all of its variables. 
This is useful information when debugging +# a failed build. +# +# Note that all of the possible VPIC cmake variables relevant to an ATS-1 +# system are set on the command line so that they can all be conditionally +# configured above through user selections. +#------------------------------------------------------------------------------# + +cmake \ + -LAH \ + -DCMAKE_BUILD_TYPE=$SET_BUILD_TYPE \ + -DENABLE_INTEGRATED_TESTS=$SET_INTEGRATED_TESTS \ + -DENABLE_UNIT_TESTS=$SET_UNIT_TESTS \ + -DUSE_V4_PORTABLE=$SET_V4_PORTABLE \ + -DUSE_V4_SSE=$SET_V4_SSE \ + -DUSE_V4_AVX=$SET_V4_AVX \ + -DUSE_V4_AVX2=$SET_V4_AVX2 \ + -DUSE_V8_PORTABLE=$SET_V8_PORTABLE \ + -DUSE_V8_AVX=$SET_V8_AVX \ + -DUSE_V8_AVX2=$SET_V8_AVX2 \ + -DUSE_V16_PORTABLE=$SET_V16_PORTABLE \ + -DUSE_V16_AVX512=$SET_V16_AVX512 \ + -DVPIC_PRINT_MORE_DIGITS=$SET_MORE_DIGITS \ + -DUSE_OPENMP=$SET_OPENMP \ + -DUSE_PTHREADS=$SET_PTHREADS \ + -DBUILD_SHARED_LIBS=$SET_SHARED_LIBS \ + -DCMAKE_C_COMPILER=$VPIC_COMPILER_C \ + -DCMAKE_CXX_COMPILER=$VPIC_COMPILER_CXX \ + -DCMAKE_C_FLAGS="$FLAGS_C_COMPILER" \ + -DCMAKE_CXX_FLAGS="$FLAGS_CXX_COMPILER" \ + $src_dir + +#------------------------------------------------------------------------------# +# Call make command. +#------------------------------------------------------------------------------# +# Notes: +# +# In general, it is necessary to call the "make" command within this script +# because the module environment has been configured within this script. +# +# Setting VERBOSE=1 causes "make" to output the commands it is executing. +# This information is useful if debugging a failed build. +# +# If the NJ variable is not defined, "make" will perform a parallel build +# using maximum number of processors on the compilation machine. If using +# VERBOSE=1, the verbose output will be garbled by many processes writing +# to STDOUT at the same time and will be difficult to interpret. When using +# VERBOSE=1, it can be helpful to also use NJ=1. 
+#------------------------------------------------------------------------------# + +make -j $NJ VERBOSE=$SET_VERBOSE + +#------------------------------------------------------------------------------# +# Done. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# vim: syntax=sh +#------------------------------------------------------------------------------# diff --git a/arch/lanl-cts1 b/arch/lanl-cts1 new file mode 100755 index 00000000..edafd8f9 --- /dev/null +++ b/arch/lanl-cts1 @@ -0,0 +1,829 @@ +#! /usr/bin/env bash +#------------------------------------------------------------------------------# +# This script supports building VPIC on CTS-1 machines at Los Alamos National +# Laboratory (LANL). These machines run the Tri-lab TOSS 3.3 Operating System, +# a customized version of Red Hat Enterprise Linux 7.5. CTS-1 machines have +# dual socket 18 core Broadwell nodes. These machines provide three compiler +# choices: Intel, GNU and PGI. Three MPI implementations are provided: Open +# MPI, Intel MPI and Mvapich. +# +# Normal users should not need to change this script if happy with defaults. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Get the path to the project from which this script was called. +#------------------------------------------------------------------------------# + +src_dir="${0%/*}/.." + +#------------------------------------------------------------------------------# +# Configure the type of build that we want to perform. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Choose a compiler. 
+#------------------------------------------------------------------------------# +# One of the compiler choices in this section must be chosen. Valid options +# are the following. +# +# INT: Intel compilers +# GNU: GNU compilers +# PGI: Portland Group compilers, now part of Nvidia +# +# Note that selecting PGI for Portland Group compilers has not been tested +# and probably does not work. +#------------------------------------------------------------------------------# + +VCOM="INT" +#VCOM="GNU" +#VCOM="PGI" + +#------------------------------------------------------------------------------# +# Choose an MPI implementation. +#------------------------------------------------------------------------------# +# One of the MPI library choices must be chosen. Valid options are the +# following. +# +# OMPI: Open MPI, most commonly used MPI implementation on LANL CTS-1 machines +# IMPI: Intel MPI +# +# Choose Intel MPI if you want to use the Intel Application Performance +# Snapshot performance analysis tool to analyze MPI performance of VPIC or +# other Intel analysis tools which provide analysis of MPI usage. +#------------------------------------------------------------------------------# + +VMPI="OMPI" +#VMPI="IMPI" + +#------------------------------------------------------------------------------# +# Choose a thread model. +#------------------------------------------------------------------------------# +# One of the two available thread models must be chosen. Valid options are the +# following. +# +# PTH: Pthreads +# OMP: OpenMP +#------------------------------------------------------------------------------# + +VTHR="PTH" +#VTHR="OMP" + +#------------------------------------------------------------------------------# +# Choose format of status update output. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. 
+# +# If SET_MORE_DIGITS=OFF, the output has two significant figures. +# +# If SET_MORE_DIGITS=ON, the output has four significant figures. +#------------------------------------------------------------------------------# + +SET_MORE_DIGITS="OFF" +#SET_MORE_DIGITS="ON" + +#------------------------------------------------------------------------------# +# Choose type of vector intrinsics support. +#------------------------------------------------------------------------------# +# Note the following constraints. +# +# Each of the eight variables in this section must have a configured value. +# This is because the corresponding "USE" cmake variable is set on the cmake +# command line below to allow any possible combinations to be configured using +# a single cmake command. +# +# If all values are configured as OFF, the scalar implementations of VPIC +# functions which are not vectorized will be used. +# +# It is possible to have a vector version configured as ON for each of the +# three vector widths i.e. V4, V8 and V16. In that scenario, if a given VPIC +# function has a V16 implementation, that will be used. If there is not a V16 +# implementation but there is a V8 implementation, that will be used. If there +# is not a V16 or V8 implementation but there is a V4 implementation, that +# will be used. Finally, for functions that have no vector implementations, +# the scalar version will be used. +# +# Currently, it is recommended to always configure the appropriate V4 version +# as on if using vector versions because there are key functions that only +# have a V4 version because the current algorithm does not generalize to +# longer vector lengths. An example is the move_p function. Since the V4 +# versions are generally more performant than the scalar versions, it makes +# sense to use them even when using the longer vector length implementations +# for other VPIC functions. 
+# +# In summary, when using vector versions on a machine with 256 bit SIMD, the +# V4 and V8 implementations should be configured as ON. +# +# First, we turn all of the vector options OFF. Then, we turn on the ones we +# want. +#------------------------------------------------------------------------------# + +SET_V4_PORTABLE="OFF" +SET_V4_SSE="OFF" +SET_V4_AVX="OFF" +SET_V4_AVX2="OFF" +SET_V8_PORTABLE="OFF" +SET_V8_AVX="OFF" +SET_V8_AVX2="OFF" +SET_V16_PORTABLE="OFF" + +#SET_V4_PORTABLE="ON" +#SET_V4_SSE="ON" +#SET_V4_AVX="ON" +SET_V4_AVX2="ON" +#SET_V8_PORTABLE="ON" +#SET_V8_AVX="ON" +SET_V8_AVX2="ON" +#SET_V16_PORTABLE="ON" + +#------------------------------------------------------------------------------# +# Choose a particle sort implementation. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are the +# following. +# +# LSORT: legacy, thread serial sort +# TSORT: thread parallel sort +# +# The LSORT particle sort implementation is the thread serial particle sort +# implementation from the legacy v407 version of VPIC. This implementation +# supports both in-place and out-of-place sorting of the particles. It is very +# competitive with the thread parallel sort implementation for a small number +# of threads per MPI rank, i.e. 4 or less. Also, the memory footprint of VPIC +# is reduced by the memory of a particle array which can be significant for +# particle dominated problems. +# +# The TSORT particle sort implementation is a thread parallel implementation. +# Currently, it can only perform out-of-place sorting of the particles. It will +# be more performant than the LSORT implementation when using many threads per +# MPI rank but uses more memory because of the out-of-place sort. 
+#------------------------------------------------------------------------------# + +VSORT="LSORT" +#VSORT="TSORT" + +#------------------------------------------------------------------------------# +# Choose type of library to build. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is to build a static library, i.e. OFF. +#------------------------------------------------------------------------------# + +SET_SHARED_LIBS="OFF" +#SET_SHARED_LIBS="ON" + +#------------------------------------------------------------------------------# +# Choose integrated test support. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is not to build the integrated tests, i.e. OFF. +#------------------------------------------------------------------------------# + +SET_INTEGRATED_TESTS="OFF" +#SET_INTEGRATED_TESTS="ON" + +#------------------------------------------------------------------------------# +# Choose unit test support. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is not to build the unit tests, i.e. OFF. +#------------------------------------------------------------------------------# + +SET_UNIT_TESTS="OFF" +#SET_UNIT_TESTS="ON" + +#------------------------------------------------------------------------------# +# Choose the CMake build type. +#------------------------------------------------------------------------------# +# One of the available options must be chosen. Valid options depend on build +# types available in the CMake version but include at least the following. +# +# Release: In general, the default for CMake. 
+# None: Tells CMake not to use any pre-defined build type and gives VPIC build +# system total control of CMake variables defined on cmake command line. +#------------------------------------------------------------------------------# + +SET_BUILD_TYPE="Release" +#SET_BUILD_TYPE="None" + +#------------------------------------------------------------------------------# +# Choose number of parallel make processes for build. +#------------------------------------------------------------------------------# +# If NJ variable is not defined, "make" will perform a parallel build using +# maximum number of processors on the compilation machine. +# +# If using VERBOSE = 1 and NJ > 1, verbose output will be garbled by many +# processes writing to STDOUT at the same time and will be difficult to +# interpret. +# +# When using VERBOSE = 1, use of NJ = 1 is recommended. +# +# The default is to use a modest number of processes in the parallel build. +# +# Comment out default below to use all processors on compilation machine. +#------------------------------------------------------------------------------# + +NJ=8 +#NJ=1 + +#------------------------------------------------------------------------------# +# Choose verbosity of "make" output. +#------------------------------------------------------------------------------# +# Setting VERBOSE = 1 causes "make" to output commands it is executing. +# +# This information is useful if debugging a failed build. +# +# Setting VERBOSE = 0 or leaving VERBOSE undefined results in a quiet build. +# +# The default is a quiet build. +#------------------------------------------------------------------------------# + +SET_VERBOSE=0 +#SET_VERBOSE=1 + +#------------------------------------------------------------------------------# +# Choose versions of modules to use if default is not desired. +#------------------------------------------------------------------------------# +# No choice is required in this section. 
+#
+# Some possible alternative module versions are provided below. Change as
+# needed or desired.
+#
+# This section may need to be updated periodically as the module environment
+# evolves because of updates to operating system and programming environment.
+#------------------------------------------------------------------------------#
+
+#VERSION_CMAKE=3.12.1
+
+#VERSION_INTEL=18.0.3
+#VERSION_INTEL_VTUNE_AMPLIFIER=2019.1.0
+#VERSION_INTEL_VECTOR_ADVISOR=2019.1.0
+#VERSION_INTEL_INSPECTOR=2019.1.0
+#VERSION_INTEL_TRACE_ANALYZER=2019.1.022
+#VERSION_INTEL_MPI=2019.1
+
+#VERSION_GNU=7.3.0
+
+#VERSION_PGI=18.10
+
+#VERSION_OPEN_MPI=3.1.2
+
+#VERSION_FORGE=18.3
+
+#------------------------------------------------------------------------------#
+# Unless the user wants to modify options to the compiler, no changes should
+# be needed below this point.
+#
+# If the user desires to configure compiler options, proceed to the section
+# below for the chosen compiler.
+#------------------------------------------------------------------------------#
+
+#------------------------------------------------------------------------------#
+# Configure default compiler names to use MPI compiler wrapper scripts.
+#------------------------------------------------------------------------------#
+
+VPIC_COMPILER_C="mpicc"
+VPIC_COMPILER_CXX="mpicxx"
+
+if [ "$VMPI" = "IMPI" ]
+then
+    VPIC_COMPILER_C="mpiicc"
+    VPIC_COMPILER_CXX="mpiicpc"
+fi
+
+#------------------------------------------------------------------------------#
+# Configure options for the Intel compilers.
+#------------------------------------------------------------------------------#
+
+if [ "$VCOM" = "INT" ]
+then
+    #--------------------------------------------------------------------------#
+    # Use "-g" to provide debug symbols in the executable. 
In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O3" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O3" could be + # "-O2" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O3" + + #--------------------------------------------------------------------------# + # Use of "-inline-forceinline" overrides default heuristics of compiler + # and forces inlining of functions marked with inline keyword if compiler + # is able to inline. For VPIC, this option has mainly been used when using + # a portable implementation to force inlining by compiler and also when + # use of "-Winline" option identifies functions not being inlined that are + # marked with inline keyword. + # + # Use of "-qoverride-limits" cause certain internal compiler limits to be + # ignored that are used to limit memory usage and excessive compile times + # by the compiler. + # + # Use of "-vec-threshold0" ignores compiler heuristics and causes loops to + # be vectorized always, regardless of computation work volume. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -inline-forceinline" + #FLAGS_CXX_COMPILER+=" -vec-threshold0" + FLAGS_CXX_COMPILER+=" -qoverride-limits" + + #--------------------------------------------------------------------------# + # Use of "-no-ansi-alias" informs compiler that VPIC does not obey ANSI + # aliasing rules which can reduce available optimizations. 
+    #--------------------------------------------------------------------------#
+
+    FLAGS_CXX_COMPILER+=" -no-ansi-alias"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-Winline" causes compiler to emit a warning when a function that
+    # is declared inline is not inlined. Inlining is very important to VPIC
+    # performance and it is useful to know if compiler has not inlined a
+    # function that was assumed to be inlined.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_CXX_COMPILER+=" -Winline"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-qopt-report=5" specifies level of detail in compiler reports.
+    # This is the maximum level of detail.
+    #
+    # Use of "-qopt-report-phase=all" causes all phases of compilation process
+    # to provide output for compiler reports. Compiler reports are useful for
+    # understanding how compiler is optimizing various parts of VPIC.
+    #
+    # Use of "-diag-disable 10397" disables printing of diagnostic message
+    # that compiler reports are being generated.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_CXX_COMPILER+=" -qopt-report=5"
+    FLAGS_CXX_COMPILER+=" -qopt-report-phase=all"
+    FLAGS_CXX_COMPILER+=" -diag-disable 10397"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-Wl,--export-dynamic" removes the following type of VPIC warnings.
+    #
+    # Unable to find a safely writable symbol that corresponds to address
+    # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux").
+    # Writing out the raw address instead and keeping my fingers crossed.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_CXX_COMPILER+=" -Wl,--export-dynamic"
+
+    #--------------------------------------------------------------------------#
+    # Use "-g" to provide debug symbols in the executable. 
In general, use of
+    # "-g" with modern compilers does not degrade performance and provides
+    # information required by many tools such as debugging and performance
+    # analysis tools.
+    #
+    # Use of "-O3" provides fairly aggressive optimization. When using vector
+    # intrinsics versions, most of the optimization is explicit in the
+    # intrinsics implementations. Reasonable alternatives to "-O3" could be
+    # "-O2" or "-Ofast". These alternatives should be benchmarked sometime.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_C_COMPILER="-g -O3"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-inline-forceinline" overrides default heuristics of compiler
+    # and forces inlining of functions marked with inline keyword if compiler
+    # is able to inline. For VPIC, this option has mainly been used when using
+    # a portable implementation to force inlining by compiler and also when
+    # use of "-Winline" option identifies functions not being inlined that are
+    # marked with inline keyword.
+    #
+    # Use of "-qoverride-limits" causes certain internal compiler limits to be
+    # ignored that are used to limit memory usage and excessive compile times
+    # by the compiler.
+    #
+    # Use of "-vec-threshold0" ignores compiler heuristics and causes loops to
+    # be vectorized always, regardless of computation work volume.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_C_COMPILER+=" -inline-forceinline"
+    #FLAGS_C_COMPILER+=" -vec-threshold0"
+    FLAGS_C_COMPILER+=" -qoverride-limits"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-no-ansi-alias" informs compiler that VPIC does not obey ANSI
+    # aliasing rules which can reduce available optimizations. 
+    #--------------------------------------------------------------------------#
+
+    FLAGS_C_COMPILER+=" -no-ansi-alias"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-Winline" causes compiler to emit a warning when a function that
+    # is declared inline is not inlined. Inlining is very important to VPIC
+    # performance and it is useful to know if compiler has not inlined a
+    # function that was assumed to be inlined.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_C_COMPILER+=" -Winline"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-qopt-report=5" specifies level of detail in compiler reports.
+    # This is the maximum level of detail.
+    #
+    # Use of "-qopt-report-phase=all" causes all phases of compilation process
+    # to provide output for compiler reports. Compiler reports are useful for
+    # understanding how compiler is optimizing various parts of VPIC.
+    #
+    # Use of "-diag-disable 10397" disables printing of diagnostic message
+    # that compiler reports are being generated.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_C_COMPILER+=" -qopt-report=5"
+    FLAGS_C_COMPILER+=" -qopt-report-phase=all"
+    FLAGS_C_COMPILER+=" -diag-disable 10397"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-Wl,--export-dynamic" removes the following type of VPIC warnings.
+    #
+    # Unable to find a safely writable symbol that corresponds to address
+    # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux").
+    # Writing out the raw address instead and keeping my fingers crossed.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_C_COMPILER+=" -Wl,--export-dynamic"
+fi
+
+#------------------------------------------------------------------------------#
+# Configure options for the GNU compilers. 
+#------------------------------------------------------------------------------# + +if [ "$VCOM" = "GNU" ] +then + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O2" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O2" could be + # "-O3" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O2" + + #--------------------------------------------------------------------------# + # Use of "-ffast-math" causes compiler to relax various IEEE or ISO rules + # and specifications for math functions which can result in faster code. + # + # Use of "-fno-unsafe-math-optimizations" turns off some unsafe math + # optimizations that got turned on by use of "-ffast-math" option. Some + # comments in VPIC source code indicate need for this with older compilers. + # This should be checked some time to see if it is still a relevant issue. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -ffast-math" + FLAGS_CXX_COMPILER+=" -fno-unsafe-math-optimizations" + + #--------------------------------------------------------------------------# + # Use of "-fomit-frame-pointer" prevents keeping the frame pointer in a + # register for functions that do not need one. This can make an extra + # register available in many functions and reduce number of overall + # instructions. Some profiling should be done to measure the benefit of + # using this option. 
+    #--------------------------------------------------------------------------#
+
+    FLAGS_CXX_COMPILER+=" -fomit-frame-pointer"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-fno-strict-aliasing" informs compiler that VPIC does not obey
+    # ANSI aliasing rules which can reduce available optimizations.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_CXX_COMPILER+=" -fno-strict-aliasing"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-Winline" causes compiler to emit a warning when a function that
+    # is declared inline is not inlined. Inlining is very important to VPIC
+    # performance and it is useful to know if compiler has not inlined a
+    # function that was assumed to be inlined.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_CXX_COMPILER+=" -Winline"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-rdynamic" removes the following type of VPIC warnings.
+    #
+    # Unable to find a safely writable symbol that corresponds to address
+    # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux").
+    # Writing out the raw address instead and keeping my fingers crossed.
+    #
+    # From g++ man page: Pass the flag -export-dynamic to the ELF linker, on
+    # targets that support it. This instructs the linker to add all symbols,
+    # not only used ones, to the dynamic symbol table. This option is needed
+    # for some uses of "dlopen" or to allow obtaining backtraces from within
+    # a program.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_CXX_COMPILER+=" -rdynamic"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-march=broadwell" causes g++ to generate code specific to and
+    # optimized for the architecture of Broadwell. 
+ #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -march=broadwell" + + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O2" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O2" could be + # "-O3" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER="-g -O2" + + #--------------------------------------------------------------------------# + # Use of "-ffast-math" causes compiler to relax various IEEE or ISO rules + # and specifications for math functions which can result in faster code. + # + # Use of "-fno-unsafe-math-optimizations" turns off some unsafe math + # optimizations that got turned on by use of "-ffast-math" option. Some + # comments in VPIC source code indicate need for this with older compilers. + # This should be checked some time to see if it is still a relevant issue. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -ffast-math" + FLAGS_C_COMPILER+=" -fno-unsafe-math-optimizations" + + #--------------------------------------------------------------------------# + # Use of "-fomit-frame-pointer" prevents keeping the frame pointer in a + # register for functions that do not need one. This can make an extra + # register available in many functions and reduce number of overall + # instructions. Some profiling should be done to measure the benefit of + # using this option. 
+    #--------------------------------------------------------------------------#
+
+    FLAGS_C_COMPILER+=" -fomit-frame-pointer"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-fno-strict-aliasing" informs compiler that VPIC does not obey
+    # ANSI aliasing rules which can reduce available optimizations.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_C_COMPILER+=" -fno-strict-aliasing"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-Winline" causes compiler to emit a warning when a function that
+    # is declared inline is not inlined. Inlining is very important to VPIC
+    # performance and it is useful to know if compiler has not inlined a
+    # function that was assumed to be inlined.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_C_COMPILER+=" -Winline"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-rdynamic" removes the following type of VPIC warnings.
+    #
+    # Unable to find a safely writable symbol that corresponds to address
+    # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux").
+    # Writing out the raw address instead and keeping my fingers crossed.
+    #
+    # From gcc man page: Pass the flag -export-dynamic to the ELF linker, on
+    # targets that support it. This instructs the linker to add all symbols,
+    # not only used ones, to the dynamic symbol table. This option is needed
+    # for some uses of "dlopen" or to allow obtaining backtraces from within
+    # a program.
+    #--------------------------------------------------------------------------#
+
+    FLAGS_C_COMPILER+=" -rdynamic"
+
+    #--------------------------------------------------------------------------#
+    # Use of "-march=broadwell" causes gcc to generate code specific to and
+    # optimized for the architecture of Broadwell. 
+ #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -march=broadwell" +fi + +#------------------------------------------------------------------------------# +# Configure options for the PGI compilers. +#------------------------------------------------------------------------------# + +if [ "$VCOM" = "PGI" ] +then + #--------------------------------------------------------------------------# + # + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O2" + FLAGS_CXX_COMPILER+=" -Wl,--export-dynamic" + + #--------------------------------------------------------------------------# + # + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER="-g -O2" + FLAGS_C_COMPILER+=" -Wl,--export-dynamic" +fi + +#------------------------------------------------------------------------------# +# This ends user configuration section. +# +# No changes required below unless VPIC build system has been extended or the +# module system on CTS-1 machines has changed in some fundamental way. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Configure thread model. +#------------------------------------------------------------------------------# + +if [ "$VTHR" = "PTH" ] +then + SET_OPENMP="OFF" + SET_PTHREADS="ON" +fi + +if [ "$VTHR" = "OMP" ] +then + SET_OPENMP="ON" + SET_PTHREADS="OFF" +fi + +#------------------------------------------------------------------------------# +# Configure particle sort method. 
+#------------------------------------------------------------------------------# + +if [ "$VSORT" = "LSORT" ] +then + FLAGS_C_COMPILER+=" -DVPIC_USE_LEGACY_SORT" + + FLAGS_CXX_COMPILER+=" -DVPIC_USE_LEGACY_SORT" +fi + +#------------------------------------------------------------------------------# +# Configure environment using modules. +#------------------------------------------------------------------------------# +# Note that the user may want to modify the module configuration. +# +# Note that module commands used to define the build environment are captured +# in a Bash script named bashrc.modules which is written into the top level +# build directory. This script can be used in run scripts and other scenarios +# where there is a need to reproduce the environment used to build VPIC. +#------------------------------------------------------------------------------# + +echo '#!/bin/bash' >> bashrc.modules +echo "" >> bashrc.modules + +module purge +echo "module purge" >> bashrc.modules + +module load friendly-testing +echo "module load friendly-testing" >> bashrc.modules + +module load sandbox +echo "module load sandbox" >> bashrc.modules + +if [ ! "x$VERSION_CMAKE" = "x" ] +then + module load cmake/$VERSION_CMAKE + echo "module load cmake/$VERSION_CMAKE" >> bashrc.modules +else + module load cmake + echo "module load cmake" >> bashrc.modules +fi + +if [ "$VCOM" = "INT" ] +then + if [ ! "x$VERSION_INTEL" = "x" ] + then + module load intel/$VERSION_INTEL + echo "module load intel/$VERSION_INTEL" >> bashrc.modules + else + module load intel + echo "module load intel" >> bashrc.modules + fi +fi + +if [ "$VCOM" = "GNU" ] +then + if [ ! "x$VERSION_GNU" = "x" ] + then + module load gcc/$VERSION_GNU + echo "module load gcc/$VERSION_GNU" >> bashrc.modules + else + module load gcc + echo "module load gcc" >> bashrc.modules + fi +fi + +if [ "$VCOM" = "PGI" ] +then + if [ ! 
"x$VERSION_PGI" = "x" ] + then + module load pgi/$VERSION_PGI + echo "module load pgi/$VERSION_PGI" >> bashrc.modules + else + module load pgi + echo "module load pgi" >> bashrc.modules + fi +fi + +if [ "$VMPI" = "OMPI" ] +then + if [ ! "x$VERSION_OPEN_MPI" = "x" ] + then + module load openmpi/$VERSION_OPEN_MPI + echo "module load openmpi/$VERSION_OPEN_MPI" >> bashrc.modules + else + module load openmpi + echo "module load openmpi" >> bashrc.modules + fi +fi + +if [ "$VMPI" = "IMPI" ] +then + if [ ! "x$VERSION_INTEL_MPI" = "x" ] + then + module load intel-mpi/$VERSION_INTEL_MPI + echo "module load intel-mpi/$VERSION_INTEL_MPI" >> bashrc.modules + else + module load intel-mpi + echo "module load intel-mpi" >> bashrc.modules + fi +fi + +module list +echo "" >> bashrc.modules +echo "module list" >> bashrc.modules + +#------------------------------------------------------------------------------# +# Call cmake command. +#------------------------------------------------------------------------------# +# Notes: +# +# Use of the "-LAH" command line option to cmake causes cmake to output the +# values of all of its variables. This is useful information when debugging +# a failed build. +# +# Note that all of the possible VPIC cmake variables relevant to a CTS-1 +# system are set on the command line so that they can all be conditionally +# configured above through user selections. 
+#------------------------------------------------------------------------------# + +cmake \ + -LAH \ + -DCMAKE_BUILD_TYPE=$SET_BUILD_TYPE \ + -DENABLE_INTEGRATED_TESTS=$SET_INTEGRATED_TESTS \ + -DENABLE_UNIT_TESTS=$SET_UNIT_TESTS \ + -DUSE_V4_PORTABLE=$SET_V4_PORTABLE \ + -DUSE_V4_SSE=$SET_V4_SSE \ + -DUSE_V4_AVX=$SET_V4_AVX \ + -DUSE_V4_AVX2=$SET_V4_AVX2 \ + -DUSE_V8_PORTABLE=$SET_V8_PORTABLE \ + -DUSE_V8_AVX=$SET_V8_AVX \ + -DUSE_V8_AVX2=$SET_V8_AVX2 \ + -DUSE_V16_PORTABLE=$SET_V16_PORTABLE \ + -DVPIC_PRINT_MORE_DIGITS=$SET_MORE_DIGITS \ + -DUSE_OPENMP=$SET_OPENMP \ + -DUSE_PTHREADS=$SET_PTHREADS \ + -DBUILD_SHARED_LIBS=$SET_SHARED_LIBS \ + -DCMAKE_C_COMPILER=$VPIC_COMPILER_C \ + -DCMAKE_CXX_COMPILER=$VPIC_COMPILER_CXX \ + -DCMAKE_C_FLAGS="$FLAGS_C_COMPILER" \ + -DCMAKE_CXX_FLAGS="$FLAGS_CXX_COMPILER" \ + $src_dir + +#------------------------------------------------------------------------------# +# Call make command. +#------------------------------------------------------------------------------# +# Notes: +# +# In general, it is necessary to call the "make" command within this script +# because the module environment has been configured within this script. +# +# Setting VERBOSE=1 causes "make" to output the commands it is executing. +# This information is useful if debugging a failed build. +# +# If the NJ variable is not defined, "make" will perform a parallel build +# using maximum number of processors on the compilation machine. If using +# VERBOSE=1, the verbose output will be garbled by many processes writing +# to STDOUT at the same time and will be difficult to interpret. When using +# VERBOSE=1, it can be helpful to also use NJ=1. +#------------------------------------------------------------------------------# + +make -j $NJ VERBOSE=$SET_VERBOSE + +#------------------------------------------------------------------------------# +# Done. 
+#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# vim: syntax=sh +#------------------------------------------------------------------------------# From c8d0849607394b86aaf95777dae75ab4c37bce31 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Thu, 31 Jan 2019 01:47:35 -0700 Subject: [PATCH 04/95] Additional updates to documentation. --- README.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d20a36b8..b8284968 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ of DOE Advanced Technology Systems and consist of a partition of dual socket Int Intel Knights Landing nodes. The LANL CTS-1 clusters are the first generation of DOE Commodity Technology Systems and consist of dual socket Intel Broadwell nodes running the TOSS 3.3 operating system. The lanl-ats1 and lanl-cts1 scripts are heavily documented and can be configured to provide a large variety of custom builds for their respective platform types. These -scripts could also serve as a good starting point for development of a build script for other machine types. Because these +scripts could also serve as a good starting point for development of a build script for other platform types. Because these scripts also configure the users build environment via the use of module commands, the scripts run both the cmake and make commands. @@ -113,7 +113,14 @@ From the user created build directory, these scripts can be invoked as follows: ../arch/lanl-ats1 ``` -Advanced users may chose to instead invoke `cmake` directly and hand select options. +or + +```bash + ../arch/lanl-cts1 +``` + +Advanced users may choose to instead invoke `cmake` directly and hand select options. Documentation on valid ways +to select these options may be found in the lanl-ats1 and lanl-cts1 build scripts mentioned above. 
GCC users should ensure the `-fno-strict-aliasing` compiler flag is set (as shown in `./arch/generic-gcc-sse`). From aeabefd0205828a82761ce9a095cf05acf8f5b81 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Thu, 31 Jan 2019 02:27:08 -0700 Subject: [PATCH 05/95] Update compiler option documentation to make more accurate. --- arch/lanl-ats1 | 10 ++++++---- arch/lanl-cts1 | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/lanl-ats1 b/arch/lanl-ats1 index 981377ae..b6dea6b9 100755 --- a/arch/lanl-ats1 +++ b/arch/lanl-ats1 @@ -353,8 +353,9 @@ then # ignored that are used to limit memory usage and excessive compile times # by the compiler. # - # Use of "-vec-threshold0" ignores compiler heuristics and causes loops to - # be vectorized always, regardless of computation work volume. + # Use of "-vec-threshold0" ignores compiler heuristics and causes loops + # which can be vectorized to always be vectorized, regardless of the + # amount of computational work in the loop. #--------------------------------------------------------------------------# FLAGS_CXX_COMPILER+=" -inline-forceinline" @@ -459,8 +460,9 @@ then # ignored that are used to limit memory usage and excessive compile times # by the compiler. # - # Use of "-vec-threshold0" ignores compiler heuristics and causes loops to - # be vectorized always, regardless of computation work volume. + # Use of "-vec-threshold0" ignores compiler heuristics and causes loops + # which can be vectorized to always be vectorized, regardless of the + # amount of computational work in the loop. #--------------------------------------------------------------------------# FLAGS_C_COMPILER+=" -inline-forceinline" diff --git a/arch/lanl-cts1 b/arch/lanl-cts1 index edafd8f9..c18d8999 100755 --- a/arch/lanl-cts1 +++ b/arch/lanl-cts1 @@ -326,8 +326,9 @@ then # ignored that are used to limit memory usage and excessive compile times # by the compiler. 
# - # Use of "-vec-threshold0" ignores compiler heuristics and causes loops to - # be vectorized always, regardless of computation work volume. + # Use of "-vec-threshold0" ignores compiler heuristics and causes loops + # which can be vectorized to always be vectorized, regardless of the + # amount of computational work in the loop. #--------------------------------------------------------------------------# FLAGS_CXX_COMPILER+=" -inline-forceinline" @@ -402,8 +403,9 @@ then # ignored that are used to limit memory usage and excessive compile times # by the compiler. # - # Use of "-vec-threshold0" ignores compiler heuristics and causes loops to - # be vectorized always, regardless of computation work volume. + # Use of "-vec-threshold0" ignores compiler heuristics and causes loops + # which can be vectorized to always be vectorized, regardless of the + # amount of computational work in the loop. #--------------------------------------------------------------------------# FLAGS_C_COMPILER+=" -inline-forceinline" From c3768115200ab19d65331e7c5a988d2dc7f4d236 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Thu, 31 Jan 2019 08:12:51 -0700 Subject: [PATCH 06/95] Reorder options a bit. --- arch/lanl-ats1 | 120 ++++++++++++++++++++++++------------------------- arch/lanl-cts1 | 108 ++++++++++++++++++++++---------------------- 2 files changed, 114 insertions(+), 114 deletions(-) diff --git a/arch/lanl-ats1 b/arch/lanl-ats1 index b6dea6b9..7a5cb90e 100755 --- a/arch/lanl-ats1 +++ b/arch/lanl-ats1 @@ -26,26 +26,6 @@ src_dir="${0%/*}/.." # Configure the type of build that we want to perform. #------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -# Choose a compiler. -#------------------------------------------------------------------------------# -# One of the compiler choices in this section must be chosen. Valid options -# are the following. 
-# -# INT: Intel compilers -# GNU: GNU compilers -# CCE: Cray compilers -# -# Note that selecting CCE for the Cray compilers currently does not work. The -# main reason why you might want to compile with the Cray compilers is to use -# some of the Cray specific tools like Reveal or a small set of features in -# the CrayPat profiling software. This is not a common use case for users. -#------------------------------------------------------------------------------# - -VCOM="INT" -#VCOM="GNU" -#VCOM="CCE" - #------------------------------------------------------------------------------# # Choose a processor node type. #------------------------------------------------------------------------------# @@ -63,46 +43,6 @@ VCOM="INT" KNL="yes" #HSW="yes" -#------------------------------------------------------------------------------# -# Choose an MPI implementation. -#------------------------------------------------------------------------------# -# One of the MPI library choices must be chosen. Valid options are the -# following. -# -# CMPI: Cray Mpich, the Cray supported MPI library -# OMPI: Open MPI -#------------------------------------------------------------------------------# - -VMPI="CMPI" -#VMPI="OMPI" - -#------------------------------------------------------------------------------# -# Choose a thread model. -#------------------------------------------------------------------------------# -# One of the two available thread models must be chosen. Valid options are the -# following. -# -# PTH: Pthreads -# OMP: OpenMP -#------------------------------------------------------------------------------# - -VTHR="PTH" -#VTHR="OMP" - -#------------------------------------------------------------------------------# -# Choose format of status update output. -#------------------------------------------------------------------------------# -# One of the two available options must be chosen. Valid options are ON and -# OFF. 
-# -# If SET_MORE_DIGITS=OFF, the output has two significant figures. -# -# If SET_MORE_DIGITS=ON, the output has four significant figures. -#------------------------------------------------------------------------------# - -SET_MORE_DIGITS="OFF" -#SET_MORE_DIGITS="ON" - #------------------------------------------------------------------------------# # Choose type of vector intrinsics support. #------------------------------------------------------------------------------# @@ -160,6 +100,66 @@ SET_V4_AVX2="ON" #SET_V16_PORTABLE="ON" SET_V16_AVX512="ON" +#------------------------------------------------------------------------------# +# Choose a compiler. +#------------------------------------------------------------------------------# +# One of the compiler choices in this section must be chosen. Valid options +# are the following. +# +# INT: Intel compilers +# GNU: GNU compilers +# CCE: Cray compilers +# +# Note that selecting CCE for the Cray compilers currently does not work. The +# main reason why you might want to compile with the Cray compilers is to use +# some of the Cray specific tools like Reveal or a small set of features in +# the CrayPat profiling software. This is not a common use case for users. +#------------------------------------------------------------------------------# + +VCOM="INT" +#VCOM="GNU" +#VCOM="CCE" + +#------------------------------------------------------------------------------# +# Choose an MPI implementation. +#------------------------------------------------------------------------------# +# One of the MPI library choices must be chosen. Valid options are the +# following. +# +# CMPI: Cray Mpich, the Cray supported MPI library +# OMPI: Open MPI +#------------------------------------------------------------------------------# + +VMPI="CMPI" +#VMPI="OMPI" + +#------------------------------------------------------------------------------# +# Choose a thread model. 
+#------------------------------------------------------------------------------# +# One of the two available thread models must be chosen. Valid options are the +# following. +# +# PTH: Pthreads +# OMP: OpenMP +#------------------------------------------------------------------------------# + +VTHR="PTH" +#VTHR="OMP" + +#------------------------------------------------------------------------------# +# Choose format of status update output. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. +# +# If SET_MORE_DIGITS=OFF, the output has two significant figures. +# +# If SET_MORE_DIGITS=ON, the output has four significant figures. +#------------------------------------------------------------------------------# + +SET_MORE_DIGITS="OFF" +#SET_MORE_DIGITS="ON" + #------------------------------------------------------------------------------# # Choose a particle sort implementation. #------------------------------------------------------------------------------# diff --git a/arch/lanl-cts1 b/arch/lanl-cts1 index c18d8999..a846c0ec 100755 --- a/arch/lanl-cts1 +++ b/arch/lanl-cts1 @@ -20,6 +20,60 @@ src_dir="${0%/*}/.." # Configure the type of build that we want to perform. #------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +# Choose type of vector intrinsics support. +#------------------------------------------------------------------------------# +# Note the following constraints. +# +# Each of the eight variables in this section must have a configured value. +# This is because the corresponding "USE" cmake variable is set on the cmake +# command line below to allow any possible combinations to be configured using +# a single cmake command. 
+# +# If all values are configured as OFF, the scalar implementations of VPIC +# functions which are not vectorized will be used. +# +# It is possible to have a vector version configured as ON for each of the +# three vector widths i.e. V4, V8 and V16. In that scenario, if a given VPIC +# function has a V16 implementation, that will be used. If there is not a V16 +# implementation but there is a V8 implementation, that will be used. If there +# is not a V16 or V8 implementation but there is a V4 implementation, that +# will be used. Finally, for functions that have no vector implementations, +# the scalar version will be used. +# +# Currently, it is recommended to always configure the appropriate V4 version +# as on if using vector versions because there are key functions that only +# have a V4 version because the current algorithm does not generalize to +# longer vector lengths. An example is the move_p function. Since the V4 +# versions are generally more performant than the scalar versions, it makes +# sense to use them even when using the longer vector length implementations +# for other VPIC functions. +# +# In summary, when using vector versions on a machine with 256 bit SIMD, the +# V4 and V8 implementations should be configured as ON. +# +# First, we turn all of the vector options OFF. Then, we turn on the ones we +# want. +#------------------------------------------------------------------------------# + +SET_V4_PORTABLE="OFF" +SET_V4_SSE="OFF" +SET_V4_AVX="OFF" +SET_V4_AVX2="OFF" +SET_V8_PORTABLE="OFF" +SET_V8_AVX="OFF" +SET_V8_AVX2="OFF" +SET_V16_PORTABLE="OFF" + +#SET_V4_PORTABLE="ON" +#SET_V4_SSE="ON" +#SET_V4_AVX="ON" +SET_V4_AVX2="ON" +#SET_V8_PORTABLE="ON" +#SET_V8_AVX="ON" +SET_V8_AVX2="ON" +#SET_V16_PORTABLE="ON" + #------------------------------------------------------------------------------# # Choose a compiler. 
#------------------------------------------------------------------------------# @@ -82,60 +136,6 @@ VTHR="PTH" SET_MORE_DIGITS="OFF" #SET_MORE_DIGITS="ON" -#------------------------------------------------------------------------------# -# Choose type of vector intrinsics support. -#------------------------------------------------------------------------------# -# Note the following constraints. -# -# Each of the eight variables in this section must have a configured value. -# This is because the corresponding "USE" cmake variable is set on the cmake -# command line below to allow any possible combinations to be configured using -# a single cmake command. -# -# If all values are configured as OFF, the scalar implementations of VPIC -# functions which are not vectorized will be used. -# -# It is possible to have a vector version configured as ON for each of the -# three vector widths i.e. V4, V8 and V16. In that scenario, if a given VPIC -# function has a V16 implementation, that will be used. If there is not a V16 -# implementation but there is a V8 implementation, that will be used. If there -# is not a V16 or V8 implementation but there is a V4 implementation, that -# will be used. Finally, for functions that have no vector implementations, -# the scalar version will be used. -# -# Currently, it is recommended to always configure the appropriate V4 version -# as on if using vector versions because there are key functions that only -# have a V4 version because the current algorithm does not generalize to -# longer vector lengths. An example is the move_p function. Since the V4 -# versions are generally more performant than the scalar versions, it makes -# sense to use them even when using the longer vector length implementations -# for other VPIC functions. -# -# In summary, when using vector versions on a machine with 256 bit SIMD, the -# V4 and V8 implementations should be configured as ON. -# -# First, we turn all of the vector options OFF. 
Then, we turn on the ones we -# want. -#------------------------------------------------------------------------------# - -SET_V4_PORTABLE="OFF" -SET_V4_SSE="OFF" -SET_V4_AVX="OFF" -SET_V4_AVX2="OFF" -SET_V8_PORTABLE="OFF" -SET_V8_AVX="OFF" -SET_V8_AVX2="OFF" -SET_V16_PORTABLE="OFF" - -#SET_V4_PORTABLE="ON" -#SET_V4_SSE="ON" -#SET_V4_AVX="ON" -SET_V4_AVX2="ON" -#SET_V8_PORTABLE="ON" -#SET_V8_AVX="ON" -SET_V8_AVX2="ON" -#SET_V16_PORTABLE="ON" - #------------------------------------------------------------------------------# # Choose a particle sort implementation. #------------------------------------------------------------------------------# From c96f8c5bbb8e5df7d6608e9274081f0ec56bc34d Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Thu, 31 Jan 2019 12:02:43 -0700 Subject: [PATCH 07/95] For the lanl-ats1 script, make sure that the Cray programming environment starts out as the Cray default of PrgEnv-intel. This change checks for the case where the user has modified their module environment and swaps it back to the case assumed by the build script. --- arch/lanl-ats1 | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/lanl-ats1 b/arch/lanl-ats1 index 7a5cb90e..d2a4af50 100755 --- a/arch/lanl-ats1 +++ b/arch/lanl-ats1 @@ -798,6 +798,21 @@ then FLAGS_CXX_COMPILER+=" -DVPIC_USE_LEGACY_SORT" fi +#------------------------------------------------------------------------------# +# Make sure the Cray programming environment is configured as the default of +# PrgEnv-intel. +#------------------------------------------------------------------------------# + +if [ "$CRAY_PRGENVGNU" = "loaded" ] +then + module swap PrgEnv-gnu PrgEnv-intel +fi + +if [ "$CRAY_PRGENVCRAY" = "loaded" ] +then + module swap PrgEnv-cray PrgEnv-intel +fi + #------------------------------------------------------------------------------# # Configure environment using modules. 
#------------------------------------------------------------------------------# From bb2baea590785e750c61ed52596f283da4b72d22 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 11 Feb 2019 14:03:34 -0700 Subject: [PATCH 08/95] Fix issues and errors in use of float literals introduced in a previous commit. --- src/species_advance/standard/move_p.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/species_advance/standard/move_p.cc b/src/species_advance/standard/move_p.cc index 085a59ca..dfba3785 100644 --- a/src/species_advance/standard/move_p.cc +++ b/src/species_advance/standard/move_p.cc @@ -240,13 +240,13 @@ move_p( particle_t * ALIGNED(128) p0, s_dir[0] = (s_dispx>0.0f) ? 1.0f : -1.0f; s_dir[1] = (s_dispy>0.0f) ? 1.0f : -1.0f; - s_dir[2] = (s_dispz>0.0) ? 1.0f : -1.0f; + s_dir[2] = (s_dispz>0.0f) ? 1.0f : -1.0f; // Compute the twice the fractional distance to each potential // streak/cell face intersection. - v0 = (s_dispx==0) ? 3.4e38f : (s_dir[0]-s_midx)/s_dispx; - v1 = (s_dispy==0) ? 3.4e38f : (s_dir[1]-s_midy)/s_dispy; - v2 = (s_dispz==0) ? 3.4e38f : (s_dir[2]-s_midz)/s_dispz; + v0 = (s_dispx==0.0f) ? 3.4e38f : (s_dir[0]-s_midx)/s_dispx; + v1 = (s_dispy==0.0f) ? 3.4e38f : (s_dir[1]-s_midy)/s_dispy; + v2 = (s_dispz==0.0f) ? 3.4e38f : (s_dir[2]-s_midz)/s_dispz; // Determine the fractional length and axis of current streak. The // streak ends on either the first face intersected by the @@ -254,10 +254,10 @@ move_p( particle_t * ALIGNED(128) p0, // // axis 0,1 or 2 ... streak ends on a x,y or z-face respectively // axis 3 ... streak ends at end of the particle track - /**/ v3=2.0f, axis=3.0f; - if(v0 Date: Mon, 11 Feb 2019 16:44:24 -0700 Subject: [PATCH 09/95] Add CMake support for configuring a VPIC build with the legacy particle sort implementation. Add build script support for a few more CMake variables that were missing and should be availble to users of the build scripts. 
--- CMakeLists.txt | 12 ++++++++++++ arch/lanl-ats1 | 50 +++++++++++++++++++++++++++++++++++++++++++++++--- arch/lanl-cts1 | 50 +++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 106 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b573cd57..2f9902c5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,6 +65,8 @@ option(USE_V16_PORTABLE "Enable V16 Portable" OFF) option(USE_V16_AVX512 "Enable V16 AVX512" OFF) +option(USE_LEGACY_SORT "Enable Legacy Sort Implementation" OFF) + #option(USE_ADVANCE_P_AUTOVEC "Enable Explicit Autovec" OFF) option(VPIC_PRINT_MORE_DIGITS "Print more digits in VPIC timer info" OFF) @@ -109,6 +111,7 @@ endif(DISABLE_DYNAMIC_RESIZING) if(NOT SET_MIN_NUM_PARTICLES STREQUAL "AUTO") add_definitions(-DMIN_NP=${SET_MIN_NUM_PARTICLES}) endif() + #------------------------------------------------------------------------------# # OpenSSL #------------------------------------------------------------------------------# @@ -127,6 +130,15 @@ find_package(Threads REQUIRED) # Act on build options set in project.cmake #------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +# Add options for building with the legacy particle sort implementation. +#------------------------------------------------------------------------------# + +if(USE_LEGACY_SORT) + add_definitions(-DVPIC_USE_LEGACY_SORT) + set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DVPIC_USE_LEGACY_SORT") +endif(USE_LEGACY_SORT) + #------------------------------------------------------------------------------# # Add options for building with a threading model. 
#------------------------------------------------------------------------------# diff --git a/arch/lanl-ats1 b/arch/lanl-ats1 index d2a4af50..46a2b34f 100755 --- a/arch/lanl-ats1 +++ b/arch/lanl-ats1 @@ -221,6 +221,48 @@ SET_INTEGRATED_TESTS="OFF" SET_UNIT_TESTS="OFF" #SET_UNIT_TESTS="ON" +#------------------------------------------------------------------------------# +# Choose OpenSSL support for checksums. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. +# +# If SET_ENABLE_OPENSSL=OFF, use of checksums is turned off. +# +# If SET_ENABLE_OPENSSL=ON, use of checksums is turned on. +#------------------------------------------------------------------------------# + +SET_ENABLE_OPENSSL="OFF" +#SET_ENABLE_OPENSSL="ON" + +#------------------------------------------------------------------------------# +# Choose support for dynamic resizing of particle arrays. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. +# +# If SET_DISABLE_DYNAMIC_RESIZING=OFF, particle arrays will be resized +# dynamically. +# +# If SET_DISABLE_DYNAMIC_RESIZING=ON, particle arrays will not be resized +# dynamically and the user will be responsible for ensuring that particle +# arrays have enough space to handle the evolution of a non-uniform particle +# distribution. +#------------------------------------------------------------------------------# + +SET_DISABLE_DYNAMIC_RESIZING="OFF" +#SET_DISABLE_DYNAMIC_RESIZING="ON" + +#------------------------------------------------------------------------------# +# Choose the minimum number of particles to dynamically allocate space for. +#------------------------------------------------------------------------------# +# A value must be chosen. 
The default is 128 particles which allocates space +# equal to a 4 KByte page size. +#------------------------------------------------------------------------------# + +SET_PARTICLE_MIN_NUM="128" +#SET_PARTICLE_MIN_NUM="32768" + #------------------------------------------------------------------------------# # Choose the CMake build type. #------------------------------------------------------------------------------# @@ -793,9 +835,7 @@ fi if [ "$VSORT" = "LSORT" ] then - FLAGS_C_COMPILER+=" -DVPIC_USE_LEGACY_SORT" - - FLAGS_CXX_COMPILER+=" -DVPIC_USE_LEGACY_SORT" + SET_LEGACY_SORT="ON" fi #------------------------------------------------------------------------------# @@ -936,6 +976,10 @@ cmake \ -DCMAKE_BUILD_TYPE=$SET_BUILD_TYPE \ -DENABLE_INTEGRATED_TESTS=$SET_INTEGRATED_TESTS \ -DENABLE_UNIT_TESTS=$SET_UNIT_TESTS \ + -DENABLE_OPENSSL=$SET_ENABLE_OPENSSL \ + -DDISABLE_DYNAMIC_RESIZING=$SET_DISABLE_DYNAMIC_RESIZING \ + -DSET_MIN_NUM_PARTICLES=$SET_PARTICLE_MIN_NUM \ + -DUSE_LEGACY_SORT=$SET_LEGACY_SORT \ -DUSE_V4_PORTABLE=$SET_V4_PORTABLE \ -DUSE_V4_SSE=$SET_V4_SSE \ -DUSE_V4_AVX=$SET_V4_AVX \ diff --git a/arch/lanl-cts1 b/arch/lanl-cts1 index a846c0ec..74364294 100755 --- a/arch/lanl-cts1 +++ b/arch/lanl-cts1 @@ -195,6 +195,48 @@ SET_INTEGRATED_TESTS="OFF" SET_UNIT_TESTS="OFF" #SET_UNIT_TESTS="ON" +#------------------------------------------------------------------------------# +# Choose OpenSSL support for checksums. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. +# +# If SET_ENABLE_OPENSSL=OFF, use of checksums is turned off. +# +# If SET_ENABLE_OPENSSL=ON, use of checksums is turned on. 
+#------------------------------------------------------------------------------# + +SET_ENABLE_OPENSSL="OFF" +#SET_ENABLE_OPENSSL="ON" + +#------------------------------------------------------------------------------# +# Choose support for dynamic resizing of particle arrays. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. +# +# If SET_DISABLE_DYNAMIC_RESIZING=OFF, particle arrays will be resized +# dynamically. +# +# If SET_DISABLE_DYNAMIC_RESIZING=ON, particle arrays will not be resized +# dynamically and the user will be responsible for ensuring that particle +# arrays have enough space to handle the evolution of a non-uniform particle +# distribution. +#------------------------------------------------------------------------------# + +SET_DISABLE_DYNAMIC_RESIZING="OFF" +#SET_DISABLE_DYNAMIC_RESIZING="ON" + +#------------------------------------------------------------------------------# +# Choose the minimum number of particles to dynamically allocate space for. +#------------------------------------------------------------------------------# +# A value must be chosen. The default is 128 particles which allocates space +# equal to a 4 KByte page size. +#------------------------------------------------------------------------------# + +SET_PARTICLE_MIN_NUM="128" +#SET_PARTICLE_MIN_NUM="32768" + #------------------------------------------------------------------------------# # Choose the CMake build type. 
#------------------------------------------------------------------------------# @@ -664,9 +706,7 @@ fi if [ "$VSORT" = "LSORT" ] then - FLAGS_C_COMPILER+=" -DVPIC_USE_LEGACY_SORT" - - FLAGS_CXX_COMPILER+=" -DVPIC_USE_LEGACY_SORT" + SET_LEGACY_SORT="ON" fi #------------------------------------------------------------------------------# @@ -784,6 +824,10 @@ cmake \ -DCMAKE_BUILD_TYPE=$SET_BUILD_TYPE \ -DENABLE_INTEGRATED_TESTS=$SET_INTEGRATED_TESTS \ -DENABLE_UNIT_TESTS=$SET_UNIT_TESTS \ + -DENABLE_OPENSSL=$SET_ENABLE_OPENSSL \ + -DDISABLE_DYNAMIC_RESIZING=$SET_DISABLE_DYNAMIC_RESIZING \ + -DSET_MIN_NUM_PARTICLES=$SET_PARTICLE_MIN_NUM \ + -DUSE_LEGACY_SORT=$SET_LEGACY_SORT \ -DUSE_V4_PORTABLE=$SET_V4_PORTABLE \ -DUSE_V4_SSE=$SET_V4_SSE \ -DUSE_V4_AVX=$SET_V4_AVX \ From 8515047e87990d170c310f7f8fd0b0587e67dc6b Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 18 Feb 2019 21:19:54 -0700 Subject: [PATCH 10/95] Separate lanl-ats1 script into two separate scripts, one for Haswell nodes and one for KNL nodes. --- README.md | 26 +- arch/lanl-ats1-hsw | 963 ++++++++++++++++++++++++++++++ arch/{lanl-ats1 => lanl-ats1-knl} | 66 +- 3 files changed, 992 insertions(+), 63 deletions(-) create mode 100755 arch/lanl-ats1-hsw rename arch/{lanl-ats1 => lanl-ats1-knl} (95%) diff --git a/README.md b/README.md index 4392ec9b..2383a792 100644 --- a/README.md +++ b/README.md @@ -97,20 +97,26 @@ After configuration, simply type: make ``` -Two scripts in the `./arch` directory are of particular note: lanl-ats1 and lanl-cts1. These scripts provide a default way to build VPIC -on LANL ATS-1 clusters such as Trinity and Trinitite and LANL CTS-1 clusters. The LANL ATS-1 clusters are the first generation -of DOE Advanced Technology Systems and consist of a partition of dual socket Intel Haswell nodes and a partition of single socket -Intel Knights Landing nodes. 
The LANL CTS-1 clusters are the first generation of DOE Commodity Technology Systems and consist of
-dual socket Intel Broadwell nodes running the TOSS 3.3 operating system. The lanl-ats1 and lanl-cts1 scripts are heavily
-documented and can be configured to provide a large variety of custom builds for their respective platform types. These
-scripts could also serve as a good starting point for development of a build script for other platform types. Because these
-scripts also configure the users build environment via the use of module commands, the scripts run both the cmake and make
-commands.
+Three scripts in the `./arch` directory are of particular note: lanl-ats1-hsw, lanl-ats1-knl and lanl-cts1. These scripts
+provide a default way to build VPIC on LANL ATS-1 clusters such as Trinity and Trinitite and LANL CTS-1 clusters. The LANL
+ATS-1 clusters are the first generation of DOE Advanced Technology Systems and consist of a partition of dual socket Intel
+Haswell nodes and a partition of single socket Intel Knights Landing nodes. The LANL CTS-1 clusters are the first generation
+of DOE Commodity Technology Systems and consist of dual socket Intel Broadwell nodes running the TOSS 3.3 operating system.
+The lanl-ats1-hsw, lanl-ats1-knl and lanl-cts1 scripts are heavily documented and can be configured to provide a large
+variety of custom builds for their respective platform types. These scripts could also serve as a good starting point for
+development of a build script for other platform types. Because these scripts also configure the users' build environment
+via the use of module commands, the scripts run both the cmake and make commands. 
From the user created build directory, these scripts can be invoked as follows: ```bash - ../arch/lanl-ats1 + ../arch/lanl-ats1-hsw +``` + +or + +```bash + ../arch/lanl-ats1-knl ``` or diff --git a/arch/lanl-ats1-hsw b/arch/lanl-ats1-hsw new file mode 100755 index 00000000..2b69ade9 --- /dev/null +++ b/arch/lanl-ats1-hsw @@ -0,0 +1,963 @@ +#! /usr/bin/env bash +#------------------------------------------------------------------------------# +# This script supports building VPIC on ATS-1 machines at Los Alamos National +# Laboratory (LANL) for Haswell nodes. These machines run the Cray Linux +# Environment Operating System and have two compute partitions, a Haswell +# partition and a Knights Landing (KNL) partition. Both processor types are +# Intel processors. These machines provide three compiler choices: Intel, GNU +# and Cray compilers. Two MPI implementations are provided: Cray Mpich and Open +# MPI. +# +# Normal users should not need to change this script if building VPIC to run +# on the Haswell nodes of ATS-1 machines and happy with defaults. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Get the path to the project from which this script was called. +#------------------------------------------------------------------------------# + +src_dir="${0%/*}/.." + +#------------------------------------------------------------------------------# +# Configure the type of build that we want to perform. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Choose type of vector intrinsics support. +#------------------------------------------------------------------------------# +# Note the following constraints. +# +# Each of the nine variables in this section must have a configured value. 
+# This is because the corresponding "USE" cmake variable is set on the cmake +# command line below to allow any possible combinations to be configured using +# a single cmake command. +# +# If all values are configured as OFF, the scalar implementations of VPIC +# functions which are not vectorized will be used. +# +# It is possible to have a vector version configured as ON for each of the +# three vector widths i.e. V4, V8 and V16. In that scenario, if a given VPIC +# function has a V16 implementation, that will be used. If there is not a V16 +# implementation but there is a V8 implementation, that will be used. If there +# is not a V16 or V8 implementation but there is a V4 implementation, that +# will be used. Finally, for functions that have no vector implementations, +# the scalar version will be used. +# +# Currently, it is recommended to always configure the appropriate V4 version +# as on if using vector versions because there are key functions that only +# have a V4 version because the current algorithm does not generalize to +# longer vector lengths. An example is the move_p function. Since the V4 +# versions are generally more performant than the scalar versions, it makes +# sense to use them even when using the longer vector length implementations +# for other VPIC functions. +# +# In summary, when using vector versions on a machine with 256 bit SIMD, the +# V4 and V8 implementations should be configured as ON. When using a machine +# with 512 bit SIMD, V4 and V16 implementations should be configured as ON. +# +# First, we turn all of the vector options OFF. Then, we turn on the ones we +# want. 
+#------------------------------------------------------------------------------# + +SET_V4_PORTABLE="OFF" +SET_V4_SSE="OFF" +SET_V4_AVX="OFF" +SET_V4_AVX2="OFF" +SET_V8_PORTABLE="OFF" +SET_V8_AVX="OFF" +SET_V8_AVX2="OFF" +SET_V16_PORTABLE="OFF" +SET_V16_AVX512="OFF" + +#SET_V4_PORTABLE="ON" +#SET_V4_SSE="ON" +#SET_V4_AVX="ON" +SET_V4_AVX2="ON" +#SET_V8_PORTABLE="ON" +#SET_V8_AVX="ON" +SET_V8_AVX2="ON" +#SET_V16_PORTABLE="ON" +#SET_V16_AVX512="ON" + +#------------------------------------------------------------------------------# +# Choose a compiler. +#------------------------------------------------------------------------------# +# One of the compiler choices in this section must be chosen. Valid options +# are the following. +# +# INT: Intel compilers +# GNU: GNU compilers +# CCE: Cray compilers +# +# Note that selecting CCE for the Cray compilers currently does not work. The +# main reason why you might want to compile with the Cray compilers is to use +# some of the Cray specific tools like Reveal or a small set of features in +# the CrayPat profiling software. This is not a common use case for users. +#------------------------------------------------------------------------------# + +VCOM="INT" +#VCOM="GNU" +#VCOM="CCE" + +#------------------------------------------------------------------------------# +# Choose an MPI implementation. +#------------------------------------------------------------------------------# +# One of the MPI library choices must be chosen. Valid options are the +# following. +# +# CMPI: Cray Mpich, the Cray supported MPI library +# OMPI: Open MPI +#------------------------------------------------------------------------------# + +VMPI="CMPI" +#VMPI="OMPI" + +#------------------------------------------------------------------------------# +# Choose a thread model. +#------------------------------------------------------------------------------# +# One of the two available thread models must be chosen. 
Valid options are the +# following. +# +# PTH: Pthreads +# OMP: OpenMP +#------------------------------------------------------------------------------# + +VTHR="PTH" +#VTHR="OMP" + +#------------------------------------------------------------------------------# +# Choose format of status update output. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. +# +# If SET_MORE_DIGITS=OFF, the output has two significant figures. +# +# If SET_MORE_DIGITS=ON, the output has four significant figures. +#------------------------------------------------------------------------------# + +SET_MORE_DIGITS="OFF" +#SET_MORE_DIGITS="ON" + +#------------------------------------------------------------------------------# +# Choose a particle sort implementation. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are the +# following. +# +# LSORT: legacy, thread serial sort +# TSORT: thread parallel sort +# +# The LSORT particle sort implementation is the thread serial particle sort +# implementation from the legacy v407 version of VPIC. This implementation +# supports both in-place and out-of-place sorting of the particles. It is very +# competitive with the thread parallel sort implementation for a small number +# of threads per MPI rank, i.e. 4 or less, especially on KNL because sorting +# the particles in-place allows the fraction of particles stored in High +# Bandwidth Memory (HBM) to remain stored in HBM. Also, the memory footprint +# of VPIC is reduced by the memory of a particle array which can be significant +# for particle dominated problems. +# +# The TSORT particle sort implementation is a thread parallel implementation. +# Currently, it can only perform out-of-place sorting of the particles. 
It will +# be more performant than the LSORT implementation when using many threads per +# MPI rank but uses more memory because of the out-of-place sort. +#------------------------------------------------------------------------------# + +VSORT="LSORT" +#VSORT="TSORT" + +#------------------------------------------------------------------------------# +# Choose type of library to build. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is to build a static library, i.e. OFF. +#------------------------------------------------------------------------------# + +SET_SHARED_LIBS="OFF" +#SET_SHARED_LIBS="ON" + +#------------------------------------------------------------------------------# +# Choose integrated test support. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is not to build the integrated tests, i.e. OFF. +#------------------------------------------------------------------------------# + +SET_INTEGRATED_TESTS="OFF" +#SET_INTEGRATED_TESTS="ON" + +#------------------------------------------------------------------------------# +# Choose unit test support. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON or OFF. +# +# The default is not to build the unit tests, i.e. OFF. +#------------------------------------------------------------------------------# + +SET_UNIT_TESTS="OFF" +#SET_UNIT_TESTS="ON" + +#------------------------------------------------------------------------------# +# Choose OpenSSL support for checksums. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. 
+# +# If SET_ENABLE_OPENSSL=OFF, use of checksums is turned off. +# +# If SET_ENABLE_OPENSSL=ON, use of checksums is turned on. +#------------------------------------------------------------------------------# + +SET_ENABLE_OPENSSL="OFF" +#SET_ENABLE_OPENSSL="ON" + +#------------------------------------------------------------------------------# +# Choose support for dynamic resizing of particle arrays. +#------------------------------------------------------------------------------# +# One of the two available options must be chosen. Valid options are ON and +# OFF. +# +# If SET_DISABLE_DYNAMIC_RESIZING=OFF, particle arrays will be resized +# dynamically. +# +# If SET_DISABLE_DYNAMIC_RESIZING=ON, particle arrays will not be resized +# dynamically and the user will be responsible for ensuring that particle +# arrays have enough space to handle the evolution of a non-uniform particle +# distribution. +#------------------------------------------------------------------------------# + +SET_DISABLE_DYNAMIC_RESIZING="OFF" +#SET_DISABLE_DYNAMIC_RESIZING="ON" + +#------------------------------------------------------------------------------# +# Choose the minimum number of particles to dynamically allocate space for. +#------------------------------------------------------------------------------# +# A value must be chosen. The default is 128 particles which allocates space +# equal to a 4 KByte page size. +#------------------------------------------------------------------------------# + +SET_PARTICLE_MIN_NUM="128" +#SET_PARTICLE_MIN_NUM="32768" + +#------------------------------------------------------------------------------# +# Choose the CMake build type. +#------------------------------------------------------------------------------# +# One of the available options must be chosen. Valid options depend on build +# types available in the CMake version but include at least the following. +# +# Release: In general, the default for CMake. 
+# None: Tells CMake not to use any pre-defined build type and gives VPIC build +# system total control of CMake variables defined on cmake command line. +#------------------------------------------------------------------------------# + +SET_BUILD_TYPE="Release" +#SET_BUILD_TYPE="None" + +#------------------------------------------------------------------------------# +# Choose number of parallel make processes for build. +#------------------------------------------------------------------------------# +# If NJ variable is not defined, "make" will perform a parallel build using +# maximum number of processors on the compilation machine. +# +# If using VERBOSE = 1 and NJ > 1, verbose output will be garbled by many +# processes writing to STDOUT at the same time and will be difficult to +# interpret. +# +# When using VERBOSE = 1, use of NJ = 1 is recommended. +# +# The default is to use a modest number of processes in the parallel build. +# +# Comment out default below to use all processors on compilation machine. +#------------------------------------------------------------------------------# + +NJ=8 +#NJ=1 + +#------------------------------------------------------------------------------# +# Choose verbosity of "make" output. +#------------------------------------------------------------------------------# +# Setting VERBOSE = 1 causes "make" to output commands it is executing. +# +# This information is useful if debugging a failed build. +# +# Setting VERBOSE = 0 or leaving VERBOSE undefined results in a quiet build. +# +# The default is a quiet build. +#------------------------------------------------------------------------------# + +SET_VERBOSE=0 +#SET_VERBOSE=1 + +#------------------------------------------------------------------------------# +# Choose versions of modules to use if default is not desired. +#------------------------------------------------------------------------------# +# No choice is required in this section. 
+#
+# Some possible alternative module versions are provided below. Change as
+# needed or desired.
+#
+# This section may need to be updated periodically as the module environment
+# evolves because of updates to operating system and programming environment.
+#------------------------------------------------------------------------------#
+
+#VERSION_CMAKE=3.12.1
+
+#VERSION_INTEL=19.0.1
+#VERSION_INTEL_VTUNE_AMPLIFIER=2019.1.0
+#VERSION_INTEL_VECTOR_ADVISOR=2019.1.0
+#VERSION_INTEL_INSPECTOR=2019.1.0
+#VERSION_INTEL_TRACE_ANALYZER=2019.1.022
+
+#VERSION_GNU=7.3.0
+
+#VERSION_CCE=9.0.0.21672
+#VERSION_CRAY_MPICH=7.7.4.4
+#VERSION_CRAY_PERF_TOOLS=7.0.4
+
+#VERSION_OPEN_MPI=3.1.2
+
+#VERSION_FORGE=18.3
+
+#------------------------------------------------------------------------------#
+# Unless the user wants to modify options to the compiler, no changes should
+# be needed below this point.
+#
+# If the user desires to configure compiler options, proceed to the section
+# below for the chosen compiler.
+#------------------------------------------------------------------------------#
+
+#------------------------------------------------------------------------------#
+# Configure default compiler names to use Cray wrapper scripts.
+#------------------------------------------------------------------------------#
+
+VPIC_COMPILER_C="cc"
+VPIC_COMPILER_CXX="CC"
+
+if [ "$VMPI" = "OMPI" ]
+then
+    VPIC_COMPILER_C="mpicc"
+    VPIC_COMPILER_CXX="mpicxx"
+fi
+
+#------------------------------------------------------------------------------#
+# Configure options for the Intel compilers.
+#------------------------------------------------------------------------------#
+
+if [ "$VCOM" = "INT" ]
+then
+    #--------------------------------------------------------------------------#
+    # Use "-g" to provide debug symbols in the executable. 
In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O3" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O3" could be + # "-O2" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O3" + + #--------------------------------------------------------------------------# + # Use of "-inline-forceinline" overrides default heuristics of compiler + # and forces inlining of functions marked with inline keyword if compiler + # is able to inline. For VPIC, this option has mainly been used when using + # a portable implementation to force inlining by compiler and also when + # use of "-Winline" option identifies functions not being inlined that are + # marked with inline keyword. + # + # Use of "-qoverride-limits" cause certain internal compiler limits to be + # ignored that are used to limit memory usage and excessive compile times + # by the compiler. + # + # Use of "-vec-threshold0" ignores compiler heuristics and causes loops + # which can be vectorized to always be vectorized, regardless of the + # amount of computational work in the loop. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -inline-forceinline" + #FLAGS_CXX_COMPILER+=" -vec-threshold0" + FLAGS_CXX_COMPILER+=" -qoverride-limits" + + #--------------------------------------------------------------------------# + # Use of "-no-ansi-alias" informs compiler that VPIC does not obey ANSI + # aliasing rules which can reduce available optimizations. 
+ #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -no-ansi-alias" + + #--------------------------------------------------------------------------# + # Use of "-Winline" cause compiler to emit a warning when a function that + # is declared inline is not inlined. Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + # + # Use of "-craype-verbose" causes Cray compiler wrapper script to print + # command it is forwarding to actual compiler for invocation. This is very + # useful for producing a build log to make sure compiler is being invoked + # with expected options. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -Winline" + FLAGS_CXX_COMPILER+=" -craype-verbose" + + #--------------------------------------------------------------------------# + # Use of "-qopt-report=5" specifies level of detail in compiler reports. + # This is the maximum level of detail. + # + # Use of "-qopt-report-phase=all" causes all phases of compilation process + # to provide output for compiler reports. Compiler reports are useful for + # understanding how compiler is optimizing various parts of VPIC. + # + # Use of "-diag-disable 10397" disables printing of diagnostic message + # that compiler reports are being generated. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -qopt-report=5" + FLAGS_CXX_COMPILER+=" -qopt-report-phase=all" + FLAGS_CXX_COMPILER+=" -diag-disable 10397" + + #--------------------------------------------------------------------------# + # Use of "-Wl,--export-dynamic" removes following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). 
+ # Writing out the raw address instead and keeping my fingers crossed. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -Wl,--export-dynamic" + + #--------------------------------------------------------------------------# + # Use of "-dynamic" causes Cray compiler wrapper to direct compiler driver + # to link dynamic libraries at runtime instead of static libraries. The + # default on Cray systems is to link static libraries. It is important for + # many tools, especially performance analysis tools, to have an executable + # that has been linked dynamically to system libraries and MPI libraries. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O3" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O3" could be + # "-O2" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER="-g -O3" + + #--------------------------------------------------------------------------# + # Use of "-inline-forceinline" overrides default heuristics of compiler + # and forces inlining of functions marked with inline keyword if compiler + # is able to inline. For VPIC, this option has mainly been used when using + # a portable implementation to force inlining by compiler and also when + # use of "-Winline" option identifies functions not being inlined that are + # marked with inline keyword. 
+ # + # Use of "-qoverride-limits" cause certain internal compiler limits to be + # ignored that are used to limit memory usage and excessive compile times + # by the compiler. + # + # Use of "-vec-threshold0" ignores compiler heuristics and causes loops + # which can be vectorized to always be vectorized, regardless of the + # amount of computational work in the loop. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -inline-forceinline" + #FLAGS_C_COMPILER+=" -vec-threshold0" + FLAGS_C_COMPILER+=" -qoverride-limits" + + #--------------------------------------------------------------------------# + # Use of "-no-ansi-alias" informs compiler that VPIC does not obey ANSI + # aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -no-ansi-alias" + + #--------------------------------------------------------------------------# + # Use of "-Winline" cause compiler to emit a warning when a function that + # is declared inline is not inlined. Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + # + # Use of "-craype-verbose" causes Cray compiler wrapper script to print + # command it is forwarding to actual compiler for invocation. This is very + # useful for producing a build log to make sure compiler is being invoked + # with expected options. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -Winline" + FLAGS_C_COMPILER+=" -craype-verbose" + + #--------------------------------------------------------------------------# + # Use of "-qopt-report=5" specifies level of detail in compiler reports. + # This is the maximum level of detail. + # + # Use of "-qopt-report-phase=all" causes all phases of compilation process + # to provide output for compiler reports. 
Compiler reports are useful for + # understanding how compiler is optimizing various parts of VPIC. + # + # Use of "-diag-disable 10397" disables printing of diagnostic message + # that compiler reports are being generated. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -qopt-report=5" + FLAGS_C_COMPILER+=" -qopt-report-phase=all" + FLAGS_C_COMPILER+=" -diag-disable 10397" + + #--------------------------------------------------------------------------# + # Use of "-Wl,--export-dynamic" removes following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). + # Writing out the raw address instead and keeping my fingers crossed. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -Wl,--export-dynamic" + + #--------------------------------------------------------------------------# + # Use of "-dynamic" causes Cray compiler wrapper to direct compiler driver + # to link dynamic libraries at runtime instead of static libraries. The + # default on Cray systems is to link static libraries. It is important for + # many tools, especially performance analysis tools, to have an executable + # that has been linked dynamically to system libraries and MPI libraries. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -dynamic" +fi + +#------------------------------------------------------------------------------# +# Configure options for the GNU compilers. +#------------------------------------------------------------------------------# + +if [ "$VCOM" = "GNU" ] +then + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. 
In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O2" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O2" could be + # "-O3" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O2" + + #--------------------------------------------------------------------------# + # Use of "-ffast-math" causes compiler to relax various IEEE or ISO rules + # and specifications for math functions which can result in faster code. + # + # Use of "-fno-unsafe-math-optimizations" turns off some unsafe math + # optimizations that got turned on by use of "-ffast-math" option. Some + # comments in VPIC source code indicate need for this with older compilers. + # This should be checked some time to see if it is still a relevant issue. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -ffast-math" + FLAGS_CXX_COMPILER+=" -fno-unsafe-math-optimizations" + + #--------------------------------------------------------------------------# + # Use of "-fomit-frame-pointer" prevents keeping the frame pointer in a + # register for functions that do not need one. This can make an extra + # register available in many functions and reduce number of overall + # instructions. Some profiling should be done to measure the benefit of + # using this option. 
+ #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -fomit-frame-pointer" + + #--------------------------------------------------------------------------# + # Use of "-fno-strict-aliasing" informs compiler that VPIC does not obey + # ANSI aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -fno-strict-aliasing" + + #--------------------------------------------------------------------------# + # Use of "-Winline" cause compiler to emit a warning when a function that + # is declared inline is not inlined. Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -Winline" + + #--------------------------------------------------------------------------# + # Use of "-rdynamic" removes the following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). + # Writing out the raw address instead and keeping my fingers crossed. + # + # From g++ man page: Pass the flag -export-dynamic to the ELF linker, on + # targets that support it. This instructs the linker to add all symbols, + # not only used ones, to the dynamic symbol table. This option is needed + # for some uses of "dlopen" or to allow obtaining backtraces from within + # a program. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -rdynamic" + + #--------------------------------------------------------------------------# + # Use of "-dynamic" causes Cray compiler wrapper to direct compiler driver + # to link dynamic libraries at runtime instead of static libraries. 
The + # default on Cray systems is to link static libraries. It is important for + # many tools, especially performance analysis tools, to have an executable + # that has been linked dynamically to system libraries and MPI libraries. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # Use of "-march=knl" or "-march=haswell" causes g++ to generate code + # specific to and optimized for the specific architecture of either KNL + # or Haswell. It appears that the Cray wrappers already do this correctly + # for KNL but it seems they may not for Haswell. + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER+=" -march=haswell" + + #--------------------------------------------------------------------------# + # Use "-g" to provide debug symbols in the executable. In general, use of + # "-g" with modern compilers does not degrade performance and provides + # information required by many tools such as debugging and performance + # analysis tools. + # + # Use of "-O2" provides fairly aggressive optimization. When using vector + # intrinsics versions, most of the optimization is explicit in the + # intrinsics implementations. Reasonable alternatives to "-O2" could be + # "-O3" or "-Ofast". These alternatives should be benchmarked sometime. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER="-g -O2" + + #--------------------------------------------------------------------------# + # Use of "-ffast-math" causes compiler to relax various IEEE or ISO rules + # and specifications for math functions which can result in faster code. + # + # Use of "-fno-unsafe-math-optimizations" turns off some unsafe math + # optimizations that got turned on by use of "-ffast-math" option. 
Some + # comments in VPIC source code indicate need for this with older compilers. + # This should be checked some time to see if it is still a relevant issue. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -ffast-math" + FLAGS_C_COMPILER+=" -fno-unsafe-math-optimizations" + + #--------------------------------------------------------------------------# + # Use of "-fomit-frame-pointer" prevents keeping the frame pointer in a + # register for functions that do not need one. This can make an extra + # register available in many functions and reduce number of overall + # instructions. Some profiling should be done to measure the benefit of + # using this option. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -fomit-frame-pointer" + + #--------------------------------------------------------------------------# + # Use of "-fno-strict-aliasing" informs compiler that VPIC does not obey + # ANSI aliasing rules which can reduce available optimizations. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -fno-strict-aliasing" + + #--------------------------------------------------------------------------# + # Use of "-Winline" cause compiler to emit a warning when a function that + # is declared inline is not inlined. Inlining is very important to VPIC + # performance and it is useful to know if compiler has not inlined a + # function that was assumed to be inlined. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -Winline" + + #--------------------------------------------------------------------------# + # Use of "-rdynamic" removes the following type of VPIC warnings. + # + # Unable to find a safely writable symbol that corresponds to address + # 432af0 (the closest match was "(null)" from "./lpi_2d_F6_test.Linux"). 
+ # Writing out the raw address instead and keeping my fingers crossed. + # + # From gcc man page: Pass the flag -export-dynamic to the ELF linker, on + # targets that support it. This instructs the linker to add all symbols, + # not only used ones, to the dynamic symbol table. This option is needed + # for some uses of "dlopen" or to allow obtaining backtraces from within + # a program. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -rdynamic" + + #--------------------------------------------------------------------------# + # Use of "-dynamic" causes Cray compiler wrapper to direct compiler driver + # to link dynamic libraries at runtime instead of static libraries. The + # default on Cray systems is to link static libraries. It is important for + # many tools, especially performance analysis tools, to have an executable + # that has been linked dynamically to system libraries and MPI libraries. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # Use of "-march=knl" or "-march=haswell" causes gcc to generate code + # specific to and optimized for the specific architecture of either KNL + # or Haswell. It appears that the Cray wrappers already do this correctly + # for KNL but it seems they may not for Haswell. + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER+=" -march=haswell" +fi + +#------------------------------------------------------------------------------# +# Configure options for the Cray compilers. 
+#------------------------------------------------------------------------------# + +if [ "$VCOM" = "CCE" ] +then + #--------------------------------------------------------------------------# + # + #--------------------------------------------------------------------------# + + FLAGS_CXX_COMPILER="-g -O2" + #FLAGS_CXX_COMPILER+=" -hlist=ad" + #FLAGS_CXX_COMPILER+=" -hipa5" + FLAGS_CXX_COMPILER+=" -Wl,--export-dynamic" + #FLAGS_CXX_COMPILER+=" -rdynamic" + FLAGS_CXX_COMPILER+=" -dynamic" + + #--------------------------------------------------------------------------# + # + #--------------------------------------------------------------------------# + + FLAGS_C_COMPILER="-g -O2" + #FLAGS_C_COMPILER+=" -hlist=ad" + #FLAGS_C_COMPILER+=" -hipa5" + FLAGS_C_COMPILER+=" -Wl,--export-dynamic" + #FLAGS_C_COMPILER+=" -rdynamic" + FLAGS_C_COMPILER+=" -dynamic" +fi + +#------------------------------------------------------------------------------# +# This ends user configuration section. +# +# No changes required below unless VPIC build system has been extended or the +# module system on ATS-1 machines has changed in some fundamental way. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# Configure thread model. +#------------------------------------------------------------------------------# + +if [ "$VTHR" = "PTH" ] +then + SET_OPENMP="OFF" + SET_PTHREADS="ON" +fi + +if [ "$VTHR" = "OMP" ] +then + SET_OPENMP="ON" + SET_PTHREADS="OFF" +fi + +#------------------------------------------------------------------------------# +# Configure particle sort method. 
+#------------------------------------------------------------------------------# + +if [ "$VSORT" = "LSORT" ] +then + SET_LEGACY_SORT="ON" +fi + +#------------------------------------------------------------------------------# +# Make sure the Cray programming environment is configured as the default of +# PrgEnv-intel. +#------------------------------------------------------------------------------# + +if [ "$CRAY_PRGENVGNU" = "loaded" ] +then + module swap PrgEnv-gnu PrgEnv-intel +fi + +if [ "$CRAY_PRGENVCRAY" = "loaded" ] +then + module swap PrgEnv-cray PrgEnv-intel +fi + +#------------------------------------------------------------------------------# +# Configure environment using modules. +#------------------------------------------------------------------------------# +# Note that the user may want to modify the module configuration. +# +# Note that module commands used to define the build environment are captured +# in a Bash script named bashrc.modules which is written into the top level +# build directory. This script can be used in run scripts and other scenarios +# where there is a need to reproduce the environment used to build VPIC. +#------------------------------------------------------------------------------# + +echo '#!/bin/bash' >> bashrc.modules +echo "" >> bashrc.modules + +module load friendly-testing +echo "module load friendly-testing" >> bashrc.modules + +module load sandbox +echo "module load sandbox" >> bashrc.modules + +module load cmake +echo "module load cmake" >> bashrc.modules + +if [ ! "x$VERSION_CMAKE" = "x" ] +then + module swap cmake cmake/$VERSION_CMAKE + echo "module swap cmake cmake/$VERSION_CMAKE" >> bashrc.modules +fi + +module unload craype-hugepages2M +echo "module unload craype-hugepages2M" >> bashrc.modules + +if [ "$VCOM" = "INT" ] +then + if [ ! 
"x$VERSION_INTEL" = "x" ] + then + module swap intel intel/$VERSION_INTEL + echo "module swap intel intel/$VERSION_INTEL" >> bashrc.modules + fi +fi + +if [ "$VCOM" = "GNU" ] +then + module swap PrgEnv-intel PrgEnv-gnu + echo "module swap PrgEnv-intel PrgEnv-gnu" >> bashrc.modules + + if [ ! "x$VERSION_GNU" = "x" ] + then + module swap gcc gcc/$VERSION_GNU + echo "module swap gcc gcc/$VERSION_GNU" >> bashrc.modules + fi +fi + +if [ "$VCOM" = "CCE" ] +then + module swap PrgEnv-intel PrgEnv-cray + echo "module swap PrgEnv-intel PrgEnv-cray" >> bashrc.modules + + if [ ! "x$VERSION_CCE" = "x" ] + then + module swap cce cce/$VERSION_CCE + echo "module swap cce cce/$VERSION_CCE" >> bashrc.modules + fi +fi + +if [ "$VMPI" = "CMPI" ] +then + if [ ! "x$VERSION_CRAY_MPICH" = "x" ] + then + module swap cray-mpich cray-mpich/$VERSION_CRAY_MPICH + echo "module swap cray-mpich cray-mpich/$VERSION_CRAY_MPICH" >> bashrc.modules + fi + + export MPI_ROOT=$MPICH_DIR +fi + +if [ "$VMPI" = "OMPI" ] +then + module unload cray-mpich + echo "module unload cray-mpich" >> bashrc.modules + + module unload cray-libsci + echo "module unload cray-libsci" >> bashrc.modules + + module load openmpi + echo "module load openmpi" >> bashrc.modules + + if [ ! "x$VERSION_OPEN_MPI" = "x" ] + then + module swap openmpi openmpi/$VERSION_OPEN_MPI + echo "module swap openmpi openmpi/$VERSION_OPEN_MPI" >> bashrc.modules + fi +fi + +module list +echo "" >> bashrc.modules +echo "module list" >> bashrc.modules + +#------------------------------------------------------------------------------# +# Call cmake command. +#------------------------------------------------------------------------------# +# Notes: +# +# Use of the "-LAH" command line option to cmake causes cmake to output the +# values of all of its variables. This is useful information when debugging +# a failed build. 
+# +# Note that all of the possible VPIC cmake variables relevant to an ATS-1 +# system are set on the command line so that they can all be conditionally +# configured above through user selections. +#------------------------------------------------------------------------------# + +cmake \ + -LAH \ + -DCMAKE_BUILD_TYPE=$SET_BUILD_TYPE \ + -DENABLE_INTEGRATED_TESTS=$SET_INTEGRATED_TESTS \ + -DENABLE_UNIT_TESTS=$SET_UNIT_TESTS \ + -DENABLE_OPENSSL=$SET_ENABLE_OPENSSL \ + -DDISABLE_DYNAMIC_RESIZING=$SET_DISABLE_DYNAMIC_RESIZING \ + -DSET_MIN_NUM_PARTICLES=$SET_PARTICLE_MIN_NUM \ + -DUSE_LEGACY_SORT=$SET_LEGACY_SORT \ + -DUSE_V4_PORTABLE=$SET_V4_PORTABLE \ + -DUSE_V4_SSE=$SET_V4_SSE \ + -DUSE_V4_AVX=$SET_V4_AVX \ + -DUSE_V4_AVX2=$SET_V4_AVX2 \ + -DUSE_V8_PORTABLE=$SET_V8_PORTABLE \ + -DUSE_V8_AVX=$SET_V8_AVX \ + -DUSE_V8_AVX2=$SET_V8_AVX2 \ + -DUSE_V16_PORTABLE=$SET_V16_PORTABLE \ + -DUSE_V16_AVX512=$SET_V16_AVX512 \ + -DVPIC_PRINT_MORE_DIGITS=$SET_MORE_DIGITS \ + -DUSE_OPENMP=$SET_OPENMP \ + -DUSE_PTHREADS=$SET_PTHREADS \ + -DBUILD_SHARED_LIBS=$SET_SHARED_LIBS \ + -DCMAKE_C_COMPILER=$VPIC_COMPILER_C \ + -DCMAKE_CXX_COMPILER=$VPIC_COMPILER_CXX \ + -DCMAKE_C_FLAGS="$FLAGS_C_COMPILER" \ + -DCMAKE_CXX_FLAGS="$FLAGS_CXX_COMPILER" \ + $src_dir + +#------------------------------------------------------------------------------# +# Call make command. +#------------------------------------------------------------------------------# +# Notes: +# +# In general, it is necessary to call the "make" command within this script +# because the module environment has been configured within this script. +# +# Setting VERBOSE=1 causes "make" to output the commands it is executing. +# This information is useful if debugging a failed build. +# +# If the NJ variable is not defined, "make" will perform a parallel build +# using maximum number of processors on the compilation machine. 
If using +# VERBOSE=1, the verbose output will be garbled by many processes writing +# to STDOUT at the same time and will be difficult to interpret. When using +# VERBOSE=1, it can be helpful to also use NJ=1. +#------------------------------------------------------------------------------# + +make -j $NJ VERBOSE=$SET_VERBOSE + +#------------------------------------------------------------------------------# +# Done. +#------------------------------------------------------------------------------# + +#------------------------------------------------------------------------------# +# vim: syntax=sh +#------------------------------------------------------------------------------# diff --git a/arch/lanl-ats1 b/arch/lanl-ats1-knl similarity index 95% rename from arch/lanl-ats1 rename to arch/lanl-ats1-knl index 46a2b34f..907dfb1b 100755 --- a/arch/lanl-ats1 +++ b/arch/lanl-ats1-knl @@ -1,19 +1,15 @@ #! /usr/bin/env bash #------------------------------------------------------------------------------# # This script supports building VPIC on ATS-1 machines at Los Alamos National -# Laboratory (LANL). These machines run the Cray Linux Environment Operating -# System and have two compute partitions, a Haswell partition and a Knights -# Landing (KNL) partition. Both processor types are Intel processors. These -# machines provide three compiler choices: Intel, GNU and Cray compilers. Two -# MPI implementations are provided: Cray Mpich and Open MPI. +# Laboratory (LANL) for Knights Landing nodes. These machines run the Cray +# Linux Environment Operating System and have two compute partitions, a Haswell +# partition and a Knights Landing (KNL) partition. Both processor types are +# Intel processors. These machines provide three compiler choices: Intel, GNU +# and Cray compilers. Two MPI implementations are provided: Cray Mpich and Open +# MPI. 
# # Normal users should not need to change this script if building VPIC to run # on the KNL nodes of ATS-1 machines and happy with defaults. -# -# If normal users desire to build VPIC to run on the Haswell nodes of ATS-1 -# machines, they will need to change this script in two places: first in the -# section where a node type is chosen and second in the section where the type -# of vector intrinsics used are chosen. #------------------------------------------------------------------------------# #------------------------------------------------------------------------------# @@ -26,23 +22,6 @@ src_dir="${0%/*}/.." # Configure the type of build that we want to perform. #------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -# Choose a processor node type. -#------------------------------------------------------------------------------# -# One of the node types must be chosen. Valid options are the following. -# -# KNL: Knights Landing nodes -# HSW: Haswell nodes -# -# If HSW, for Haswell, is chosen, you must also change the section on vector -# intrinsics support below to turn off support for V16_AVX512. Normally, you -# would also turn on support for V8_AVX2. See the documentation on the vector -# intrinsics section below for more details. -#------------------------------------------------------------------------------# - -KNL="yes" -#HSW="yes" - #------------------------------------------------------------------------------# # Choose type of vector intrinsics support. #------------------------------------------------------------------------------# @@ -471,10 +450,7 @@ then # for KNL processors. 
#--------------------------------------------------------------------------# - if [ "$KNL" = "yes" ] - then - FLAGS_CXX_COMPILER+=" -qopt-zmm-usage=high" - fi + FLAGS_CXX_COMPILER+=" -qopt-zmm-usage=high" #--------------------------------------------------------------------------# # Use "-g" to provide debug symbols in the executable. In general, use of @@ -578,10 +554,7 @@ then # for KNL processors. #--------------------------------------------------------------------------# - if [ "$KNL" = "yes" ] - then - FLAGS_C_COMPILER+=" -qopt-zmm-usage=high" - fi + FLAGS_C_COMPILER+=" -qopt-zmm-usage=high" fi #------------------------------------------------------------------------------# @@ -676,12 +649,7 @@ then # for KNL but it seems they may not for Haswell. #--------------------------------------------------------------------------# - if [ "$KNL" = "yes" ] - then - FLAGS_CXX_COMPILER+=" -march=knl" - else - FLAGS_CXX_COMPILER+=" -march=haswell" - fi + FLAGS_CXX_COMPILER+=" -march=knl" #--------------------------------------------------------------------------# # Use "-g" to provide debug symbols in the executable. In general, use of @@ -769,12 +737,7 @@ then # for KNL but it seems they may not for Haswell. #--------------------------------------------------------------------------# - if [ "$KNL" = "yes" ] - then - FLAGS_C_COMPILER+=" -march=knl" - else - FLAGS_C_COMPILER+=" -march=haswell" - fi + FLAGS_C_COMPILER+=" -march=knl" fi #------------------------------------------------------------------------------# @@ -885,6 +848,9 @@ fi module unload craype-hugepages2M echo "module unload craype-hugepages2M" >> bashrc.modules +module swap craype-haswell craype-mic-knl +echo "module swap craype-haswell craype-mic-knl" >> bashrc.modules + if [ "$VCOM" = "INT" ] then if [ ! 
"x$VERSION_INTEL" = "x" ] @@ -918,12 +884,6 @@ then fi fi -if [ "$KNL" = "yes" ] -then - module swap craype-haswell craype-mic-knl - echo "module swap craype-haswell craype-mic-knl" >> bashrc.modules -fi - if [ "$VMPI" = "CMPI" ] then if [ ! "x$VERSION_CRAY_MPICH" = "x" ] From 3782d9ae5cb710d100cc3c8c441cc0341b30678c Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Tue, 19 Feb 2019 22:58:54 -0700 Subject: [PATCH 11/95] Reorder build configuration options to something that seems more sensible. --- arch/lanl-ats1-hsw | 92 ++++++++++++++++++++++---------------------- arch/lanl-ats1-knl | 92 ++++++++++++++++++++++---------------------- arch/lanl-cts1 | 96 +++++++++++++++++++++++----------------------- 3 files changed, 140 insertions(+), 140 deletions(-) diff --git a/arch/lanl-ats1-hsw b/arch/lanl-ats1-hsw index 2b69ade9..56a18692 100755 --- a/arch/lanl-ats1-hsw +++ b/arch/lanl-ats1-hsw @@ -22,6 +22,52 @@ src_dir="${0%/*}/.." # Configure the type of build that we want to perform. #------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +# Choose a compiler. +#------------------------------------------------------------------------------# +# One of the compiler choices in this section must be chosen. Valid options +# are the following. +# +# INT: Intel compilers +# GNU: GNU compilers +# CCE: Cray compilers +# +# Note that selecting CCE for the Cray compilers currently does not work. The +# main reason why you might want to compile with the Cray compilers is to use +# some of the Cray specific tools like Reveal or a small set of features in +# the CrayPat profiling software. This is not a common use case for users. +#------------------------------------------------------------------------------# + +VCOM="INT" +#VCOM="GNU" +#VCOM="CCE" + +#------------------------------------------------------------------------------# +# Choose an MPI implementation. 
+#------------------------------------------------------------------------------# +# One of the MPI library choices must be chosen. Valid options are the +# following. +# +# CMPI: Cray Mpich, the Cray supported MPI library +# OMPI: Open MPI +#------------------------------------------------------------------------------# + +VMPI="CMPI" +#VMPI="OMPI" + +#------------------------------------------------------------------------------# +# Choose a thread model. +#------------------------------------------------------------------------------# +# One of the two available thread models must be chosen. Valid options are the +# following. +# +# PTH: Pthreads +# OMP: OpenMP +#------------------------------------------------------------------------------# + +VTHR="PTH" +#VTHR="OMP" + #------------------------------------------------------------------------------# # Choose type of vector intrinsics support. #------------------------------------------------------------------------------# @@ -79,52 +125,6 @@ SET_V8_AVX2="ON" #SET_V16_PORTABLE="ON" #SET_V16_AVX512="ON" -#------------------------------------------------------------------------------# -# Choose a compiler. -#------------------------------------------------------------------------------# -# One of the compiler choices in this section must be chosen. Valid options -# are the following. -# -# INT: Intel compilers -# GNU: GNU compilers -# CCE: Cray compilers -# -# Note that selecting CCE for the Cray compilers currently does not work. The -# main reason why you might want to compile with the Cray compilers is to use -# some of the Cray specific tools like Reveal or a small set of features in -# the CrayPat profiling software. This is not a common use case for users. -#------------------------------------------------------------------------------# - -VCOM="INT" -#VCOM="GNU" -#VCOM="CCE" - -#------------------------------------------------------------------------------# -# Choose an MPI implementation. 
-#------------------------------------------------------------------------------# -# One of the MPI library choices must be chosen. Valid options are the -# following. -# -# CMPI: Cray Mpich, the Cray supported MPI library -# OMPI: Open MPI -#------------------------------------------------------------------------------# - -VMPI="CMPI" -#VMPI="OMPI" - -#------------------------------------------------------------------------------# -# Choose a thread model. -#------------------------------------------------------------------------------# -# One of the two available thread models must be chosen. Valid options are the -# following. -# -# PTH: Pthreads -# OMP: OpenMP -#------------------------------------------------------------------------------# - -VTHR="PTH" -#VTHR="OMP" - #------------------------------------------------------------------------------# # Choose format of status update output. #------------------------------------------------------------------------------# diff --git a/arch/lanl-ats1-knl b/arch/lanl-ats1-knl index 907dfb1b..68c2e12a 100755 --- a/arch/lanl-ats1-knl +++ b/arch/lanl-ats1-knl @@ -22,6 +22,52 @@ src_dir="${0%/*}/.." # Configure the type of build that we want to perform. #------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +# Choose a compiler. +#------------------------------------------------------------------------------# +# One of the compiler choices in this section must be chosen. Valid options +# are the following. +# +# INT: Intel compilers +# GNU: GNU compilers +# CCE: Cray compilers +# +# Note that selecting CCE for the Cray compilers currently does not work. The +# main reason why you might want to compile with the Cray compilers is to use +# some of the Cray specific tools like Reveal or a small set of features in +# the CrayPat profiling software. This is not a common use case for users. 
+#------------------------------------------------------------------------------# + +VCOM="INT" +#VCOM="GNU" +#VCOM="CCE" + +#------------------------------------------------------------------------------# +# Choose an MPI implementation. +#------------------------------------------------------------------------------# +# One of the MPI library choices must be chosen. Valid options are the +# following. +# +# CMPI: Cray Mpich, the Cray supported MPI library +# OMPI: Open MPI +#------------------------------------------------------------------------------# + +VMPI="CMPI" +#VMPI="OMPI" + +#------------------------------------------------------------------------------# +# Choose a thread model. +#------------------------------------------------------------------------------# +# One of the two available thread models must be chosen. Valid options are the +# following. +# +# PTH: Pthreads +# OMP: OpenMP +#------------------------------------------------------------------------------# + +VTHR="PTH" +#VTHR="OMP" + #------------------------------------------------------------------------------# # Choose type of vector intrinsics support. #------------------------------------------------------------------------------# @@ -79,52 +125,6 @@ SET_V4_AVX2="ON" #SET_V16_PORTABLE="ON" SET_V16_AVX512="ON" -#------------------------------------------------------------------------------# -# Choose a compiler. -#------------------------------------------------------------------------------# -# One of the compiler choices in this section must be chosen. Valid options -# are the following. -# -# INT: Intel compilers -# GNU: GNU compilers -# CCE: Cray compilers -# -# Note that selecting CCE for the Cray compilers currently does not work. The -# main reason why you might want to compile with the Cray compilers is to use -# some of the Cray specific tools like Reveal or a small set of features in -# the CrayPat profiling software. This is not a common use case for users. 
-#------------------------------------------------------------------------------# - -VCOM="INT" -#VCOM="GNU" -#VCOM="CCE" - -#------------------------------------------------------------------------------# -# Choose an MPI implementation. -#------------------------------------------------------------------------------# -# One of the MPI library choices must be chosen. Valid options are the -# following. -# -# CMPI: Cray Mpich, the Cray supported MPI library -# OMPI: Open MPI -#------------------------------------------------------------------------------# - -VMPI="CMPI" -#VMPI="OMPI" - -#------------------------------------------------------------------------------# -# Choose a thread model. -#------------------------------------------------------------------------------# -# One of the two available thread models must be chosen. Valid options are the -# following. -# -# PTH: Pthreads -# OMP: OpenMP -#------------------------------------------------------------------------------# - -VTHR="PTH" -#VTHR="OMP" - #------------------------------------------------------------------------------# # Choose format of status update output. #------------------------------------------------------------------------------# diff --git a/arch/lanl-cts1 b/arch/lanl-cts1 index 74364294..6e17d194 100755 --- a/arch/lanl-cts1 +++ b/arch/lanl-cts1 @@ -20,6 +20,54 @@ src_dir="${0%/*}/.." # Configure the type of build that we want to perform. #------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +# Choose a compiler. +#------------------------------------------------------------------------------# +# One of the compiler choices in this section must be chosen. Valid options +# are the following. 
+# +# INT: Intel compilers +# GNU: GNU compilers +# PGI: Portland Group compilers, now part of Nvidia +# +# Note that selecting PGI for Portland Group compilers has not been tested +# and probably does not work. +#------------------------------------------------------------------------------# + +VCOM="INT" +#VCOM="GNU" +#VCOM="PGI" + +#------------------------------------------------------------------------------# +# Choose an MPI implementation. +#------------------------------------------------------------------------------# +# One of the MPI library choices must be chosen. Valid options are the +# following. +# +# OMPI: Open MPI, most commonly used MPI implementation on LANL CTS-1 machines +# IMPI: Intel MPI +# +# Choose Intel MPI if you want to use the Intel Application Performance +# Snapshot performance analysis tool to analyze MPI performance of VPIC or +# other Intel analysis tools which provide analysis of MPI usage. +#------------------------------------------------------------------------------# + +VMPI="OMPI" +#VMPI="IMPI" + +#------------------------------------------------------------------------------# +# Choose a thread model. +#------------------------------------------------------------------------------# +# One of the two available thread models must be chosen. Valid options are the +# following. +# +# PTH: Pthreads +# OMP: OpenMP +#------------------------------------------------------------------------------# + +VTHR="PTH" +#VTHR="OMP" + #------------------------------------------------------------------------------# # Choose type of vector intrinsics support. #------------------------------------------------------------------------------# @@ -74,54 +122,6 @@ SET_V4_AVX2="ON" SET_V8_AVX2="ON" #SET_V16_PORTABLE="ON" -#------------------------------------------------------------------------------# -# Choose a compiler. 
-#------------------------------------------------------------------------------# -# One of the compiler choices in this section must be chosen. Valid options -# are the following. -# -# INT: Intel compilers -# GNU: GNU compilers -# PGI: Portland Group compilers, now part of Nvidia -# -# Note that selecting PGI for Portland Group compilers has not been tested -# and probably does not work. -#------------------------------------------------------------------------------# - -VCOM="INT" -#VCOM="GNU" -#VCOM="PGI" - -#------------------------------------------------------------------------------# -# Choose an MPI implementation. -#------------------------------------------------------------------------------# -# One of the MPI library choices must be chosen. Valid options are the -# following. -# -# OMPI: Open MPI, most commonly used MPI implementation on LANL CTS-1 machines -# IMPI: Intel MPI -# -# Choose Intel MPI if you want to use the Intel Application Performance -# Snapshot performance analysis tool to analyze MPI performance of VPIC or -# other Intel analysis tools which provide analysis of MPI usage. -#------------------------------------------------------------------------------# - -VMPI="OMPI" -#VMPI="IMPI" - -#------------------------------------------------------------------------------# -# Choose a thread model. -#------------------------------------------------------------------------------# -# One of the two available thread models must be chosen. Valid options are the -# following. -# -# PTH: Pthreads -# OMP: OpenMP -#------------------------------------------------------------------------------# - -VTHR="PTH" -#VTHR="OMP" - #------------------------------------------------------------------------------# # Choose format of status update output. 
#------------------------------------------------------------------------------# From 7e396dc38d4c5e8339b471ba09d7b41349cc9021 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Thu, 28 Feb 2019 16:35:31 -0700 Subject: [PATCH 12/95] Add more documentation about various available CMake configuration variables. --- README.md | 84 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 68 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 2383a792..e3e634a7 100644 --- a/README.md +++ b/README.md @@ -206,32 +206,84 @@ Currently, the following options are exposed at compile time for the users consi ## Threading Model - - `USE_PTHREADS` (default `ON`): Use Pthreads for the threading model (default enabled) - - `USE_OPENMP`: Use OpenMP for the threading model + - `USE_PTHREADS`: Use Pthreads for threading model, (default `ON`) + - `USE_OPENMP`: Use OpenMP for threading model ## Vectorization - - `USE_V4_SSE`: Enable 4 wide (128-bit) SSE - - `USE_V4_AVX`: Enable 4 wide (128-bit) AVX - - `USE_V4_AVX2`: Enable 4 wide (128-bit) AVX2 - - `USE_V4_ALTIVEC`: Enable 4 wide (128-bit) Altivec - - `USE_V4_PORTABLE`: Enable 4 wide (128-bit) portable implementation +The following CMake variables are used to control the vector implementation that +VPIC uses for each SIMD width. Currently, there is support for 128 bit, 256 bit +and 512 bit SIMD widths. The default is for each of these CMake variables to be +disabled which means that an unvectorized reference implementation of functions +will be used. 
- - `USE_V8_AVX`: Enable 8 wide (256-bit) AVX - - `USE_V8_AVX2`: Enable 8 wide (256-bit) AVX2 - - `USE_V8_PORTABLE`: Enable 8 wide (256-bit) portable implementation + - `USE_V4_SSE`: Enable 4 wide (128-bit) SSE + - `USE_V4_AVX`: Enable 4 wide (128-bit) AVX + - `USE_V4_AVX2`: Enable 4 wide (128-bit) AVX2 + - `USE_V4_ALTIVEC`: Enable 4 wide (128-bit) Altivec + - `USE_V4_PORTABLE`: Enable 4 wide (128-bit) portable implementation - - `USE_V16_AVX512`: Enable 16 wide (512-bit) AVX512 - - `USE_V16_PORTABLE`: Enable 16 wide (512-bit) portable implementation + - `USE_V8_AVX`: Enable 8 wide (256-bit) AVX + - `USE_V8_AVX2`: Enable 8 wide (256-bit) AVX2 + - `USE_V8_PORTABLE`: Enable 8 wide (256-bit) portable implementation -If no combination of these are selected, the "reference" (read: unvectorized) -version of the pusher will be used + - `USE_V16_AVX512`: Enable 16 wide (512-bit) AVX512 + - `USE_V16_PORTABLE`: Enable 16 wide (512-bit) portable implementation -See example decks for how these are used together in combination. +Several functions in VPIC have vector implementations for each of the three SIMD +widths. Some only have a single implementation. An example of the latter is +move_p which only has a reference implementation and a V4 implementation. + +It is possible to have a single CMake vector variable configured as ON for each +of the three supported SIMD vector widths. It is recommended to always have a +CMake variable configured as ON for the 128 bit SIMD vector width so that move_p +will be vectorized. In addition, it is recommended to configure as ON the CMake +variable that is associated with the native SIMD vector width of the processor +that VPIC is targeting. If a CMake variable is configured as ON for each of the +three available SIMD vector widths, then for a given function in VPIC, the +implementation which supports the largest SIMD vector length will be chosen. If +a V16 implementation exists, it will be chosen. 
If a V16 implementation does not +exist but V8 and V4 implementations exist, the V8 implementation will be chosen. +If V16 and V8 implementations do not exist but a V4 implementation does, it will +be chosen. If no SIMD vector implementation exists, the unvectorized reference +implementation will be chosen. + +In summary, when using vector versions on a machine with 256 bit SIMD, the +V4 and V8 implementations should be configured as ON. When using a machine +with 512 bit SIMD, V4 and V16 implementations should be configured as ON. +When choosing a vector implementation for a given SIMD vector length, the +implementation that is closest to the SIMD instruction set for the targeted +processor should be chosen. The portable versions are most commonly used for +debugging the implementation of new intrinsics versions. However, the portable +versions are generally more performant than the unvectorized reference +implemenation. So, one might consider using the V4_PORTABLE version on ARM +processors until a V4_NEON implementation becomes available. ## Output - - `VPIC_PRINT_MORE_DIGITS`: Enable more digits in the debug timing implementation + - `VPIC_PRINT_MORE_DIGITS`: Enable more digits in timing output of status reports + +## Particle sorting implementation + +The CMake variable below allows building VPIC to use the legacy, thread serial +implementation of the particle sort algorithm. + + - `USE_LEGACY_SORT`: Use legacy thread serial particle sort, (default `OFF`) + +The legacy particle sort implementation is the thread serial particle sort +implementation from the legacy v407 version of VPIC. This implementation +supports both in-place and out-of-place sorting of the particles. It is very +competitive with the thread parallel sort implementation for a small number +of threads per MPI rank, i.e. 4 or less, especially on KNL because sorting +the particles in-place allows the fraction of particles stored in High +Bandwidth Memory (HBM) to remain stored in HBM. 
Also, the memory footprint +of VPIC is reduced by the memory of a particle array which can be significant +for particle dominated problems. + +The default particle sort implementation is a thread parallel implementation. +Currently, it can only perform out-of-place sorting of the particles. It will +be more performant than the legacy implementation when using many threads per +MPI rank but uses more memory because of the out-of-place sort. # Workflow From 61ad4721d59fbc380523f11eee510560723c9e65 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 11 Mar 2019 15:46:25 -0600 Subject: [PATCH 13/95] Switch all C files to C++ files unless it is explicit that they need to be C files. --- src/boundary/{absorb_tally.c => absorb_tally.cc} | 0 src/boundary/{boundary.c => boundary.cc} | 0 src/boundary/{link.c => link.cc} | 0 src/boundary/{maxwellian_reflux.c => maxwellian_reflux.cc} | 0 src/collision/{binary.c => binary.cc} | 0 src/collision/{collision.c => collision.cc} | 0 src/collision/{hard_sphere.c => hard_sphere.cc} | 0 src/collision/{langevin.c => langevin.cc} | 0 src/collision/{large_angle_coulomb.c => large_angle_coulomb.cc} | 0 src/collision/pipeline/{binary_pipeline.c => binary_pipeline.cc} | 0 .../pipeline/{langevin_pipeline.c => langevin_pipeline.cc} | 0 src/collision/pipeline/{unary_pipeline.c => unary_pipeline.cc} | 0 src/collision/{unary.c => unary.cc} | 0 src/emitter/{child_langmuir.c => child_langmuir.cc} | 0 src/emitter/{emitter.c => emitter.cc} | 0 src/field_advance/{field_advance.c => field_advance.cc} | 0 src/field_advance/standard/{clean_div_e.c => clean_div_e.cc} | 0 .../standard/{compute_div_e_err.c => compute_div_e_err.cc} | 0 src/field_advance/standard/{compute_rhob.c => compute_rhob.cc} | 0 .../{compute_rms_div_b_err.c => compute_rms_div_b_err.cc} | 0 .../{compute_rms_div_e_err.c => compute_rms_div_e_err.cc} | 0 src/field_advance/standard/{energy_f.c => energy_f.cc} | 0 src/field_advance/standard/{local.c => local.cc} | 0 
.../pipeline/{clean_div_e_pipeline.c => clean_div_e_pipeline.cc} | 0 ...compute_div_e_err_pipeline.c => compute_div_e_err_pipeline.cc} | 0 .../{compute_rhob_pipeline.c => compute_rhob_pipeline.cc} | 0 ...rms_div_b_err_pipeline.c => compute_rms_div_b_err_pipeline.cc} | 0 ...rms_div_e_err_pipeline.c => compute_rms_div_e_err_pipeline.cc} | 0 .../pipeline/{energy_f_pipeline.c => energy_f_pipeline.cc} | 0 ...cuum_clean_div_e_pipeline.c => vacuum_clean_div_e_pipeline.cc} | 0 ..._div_e_err_pipeline.c => vacuum_compute_div_e_err_pipeline.cc} | 0 ...um_compute_rhob_pipeline.c => vacuum_compute_rhob_pipeline.cc} | 0 .../{vacuum_energy_f_pipeline.c => vacuum_energy_f_pipeline.cc} | 0 src/field_advance/standard/{remote.c => remote.cc} | 0 src/field_advance/standard/{sfa.c => sfa.cc} | 0 .../standard/{vacuum_clean_div_e.c => vacuum_clean_div_e.cc} | 0 .../{vacuum_compute_div_e_err.c => vacuum_compute_div_e_err.cc} | 0 .../standard/{vacuum_compute_rhob.c => vacuum_compute_rhob.cc} | 0 .../standard/{vacuum_energy_f.c => vacuum_energy_f.cc} | 0 src/grid/{grid_comm.c => grid_comm.cc} | 0 src/grid/{grid_structors.c => grid_structors.cc} | 0 src/grid/{ops.c => ops.cc} | 0 src/grid/{partition.c => partition.cc} | 0 src/material/{material.c => material.cc} | 0 src/sf_interface/{accumulator_array.c => accumulator_array.cc} | 0 src/sf_interface/{clear_accumulators.c => clear_accumulators.cc} | 0 src/sf_interface/{hydro_array.c => hydro_array.cc} | 0 ...ear_accumulators_pipeline.c => clear_accumulators_pipeline.cc} | 0 src/species_advance/{species_advance.c => species_advance.cc} | 0 src/species_advance/standard/{hydro_p.c => hydro_p.cc} | 0 .../standard/pipeline/{sort_p_pipeline.c => sort_p_pipeline.cc} | 0 src/species_advance/standard/{sort_p.c => sort_p.cc} | 0 src/util/{boot.c => boot.cc} | 0 src/util/checkpt/{checkpt.c => checkpt.cc} | 0 src/util/pipelines/{pipelines_helper.c => pipelines_helper.cc} | 0 src/util/pipelines/{pipelines_serial.c => pipelines_serial.cc} | 0 
src/util/pipelines/{pipelines_thread.c => pipelines_thread.cc} | 0 src/util/profile/{profile.c => profile.cc} | 0 src/util/rng/{drandn_table.c => drandn_table.cc} | 0 src/util/rng/{frandn_table.c => frandn_table.cc} | 0 src/util/rng/{rng.c => rng.cc} | 0 src/util/rng/{rng_pool.c => rng_pool.cc} | 0 src/util/{util_base.c => util_base.cc} | 0 63 files changed, 0 insertions(+), 0 deletions(-) rename src/boundary/{absorb_tally.c => absorb_tally.cc} (100%) rename src/boundary/{boundary.c => boundary.cc} (100%) rename src/boundary/{link.c => link.cc} (100%) rename src/boundary/{maxwellian_reflux.c => maxwellian_reflux.cc} (100%) rename src/collision/{binary.c => binary.cc} (100%) rename src/collision/{collision.c => collision.cc} (100%) rename src/collision/{hard_sphere.c => hard_sphere.cc} (100%) rename src/collision/{langevin.c => langevin.cc} (100%) rename src/collision/{large_angle_coulomb.c => large_angle_coulomb.cc} (100%) rename src/collision/pipeline/{binary_pipeline.c => binary_pipeline.cc} (100%) rename src/collision/pipeline/{langevin_pipeline.c => langevin_pipeline.cc} (100%) rename src/collision/pipeline/{unary_pipeline.c => unary_pipeline.cc} (100%) rename src/collision/{unary.c => unary.cc} (100%) rename src/emitter/{child_langmuir.c => child_langmuir.cc} (100%) rename src/emitter/{emitter.c => emitter.cc} (100%) rename src/field_advance/{field_advance.c => field_advance.cc} (100%) rename src/field_advance/standard/{clean_div_e.c => clean_div_e.cc} (100%) rename src/field_advance/standard/{compute_div_e_err.c => compute_div_e_err.cc} (100%) rename src/field_advance/standard/{compute_rhob.c => compute_rhob.cc} (100%) rename src/field_advance/standard/{compute_rms_div_b_err.c => compute_rms_div_b_err.cc} (100%) rename src/field_advance/standard/{compute_rms_div_e_err.c => compute_rms_div_e_err.cc} (100%) rename src/field_advance/standard/{energy_f.c => energy_f.cc} (100%) rename src/field_advance/standard/{local.c => local.cc} (100%) rename 
src/field_advance/standard/pipeline/{clean_div_e_pipeline.c => clean_div_e_pipeline.cc} (100%) rename src/field_advance/standard/pipeline/{compute_div_e_err_pipeline.c => compute_div_e_err_pipeline.cc} (100%) rename src/field_advance/standard/pipeline/{compute_rhob_pipeline.c => compute_rhob_pipeline.cc} (100%) rename src/field_advance/standard/pipeline/{compute_rms_div_b_err_pipeline.c => compute_rms_div_b_err_pipeline.cc} (100%) rename src/field_advance/standard/pipeline/{compute_rms_div_e_err_pipeline.c => compute_rms_div_e_err_pipeline.cc} (100%) rename src/field_advance/standard/pipeline/{energy_f_pipeline.c => energy_f_pipeline.cc} (100%) rename src/field_advance/standard/pipeline/{vacuum_clean_div_e_pipeline.c => vacuum_clean_div_e_pipeline.cc} (100%) rename src/field_advance/standard/pipeline/{vacuum_compute_div_e_err_pipeline.c => vacuum_compute_div_e_err_pipeline.cc} (100%) rename src/field_advance/standard/pipeline/{vacuum_compute_rhob_pipeline.c => vacuum_compute_rhob_pipeline.cc} (100%) rename src/field_advance/standard/pipeline/{vacuum_energy_f_pipeline.c => vacuum_energy_f_pipeline.cc} (100%) rename src/field_advance/standard/{remote.c => remote.cc} (100%) rename src/field_advance/standard/{sfa.c => sfa.cc} (100%) rename src/field_advance/standard/{vacuum_clean_div_e.c => vacuum_clean_div_e.cc} (100%) rename src/field_advance/standard/{vacuum_compute_div_e_err.c => vacuum_compute_div_e_err.cc} (100%) rename src/field_advance/standard/{vacuum_compute_rhob.c => vacuum_compute_rhob.cc} (100%) rename src/field_advance/standard/{vacuum_energy_f.c => vacuum_energy_f.cc} (100%) rename src/grid/{grid_comm.c => grid_comm.cc} (100%) rename src/grid/{grid_structors.c => grid_structors.cc} (100%) rename src/grid/{ops.c => ops.cc} (100%) rename src/grid/{partition.c => partition.cc} (100%) rename src/material/{material.c => material.cc} (100%) rename src/sf_interface/{accumulator_array.c => accumulator_array.cc} (100%) rename 
src/sf_interface/{clear_accumulators.c => clear_accumulators.cc} (100%) rename src/sf_interface/{hydro_array.c => hydro_array.cc} (100%) rename src/sf_interface/pipeline/{clear_accumulators_pipeline.c => clear_accumulators_pipeline.cc} (100%) rename src/species_advance/{species_advance.c => species_advance.cc} (100%) rename src/species_advance/standard/{hydro_p.c => hydro_p.cc} (100%) rename src/species_advance/standard/pipeline/{sort_p_pipeline.c => sort_p_pipeline.cc} (100%) rename src/species_advance/standard/{sort_p.c => sort_p.cc} (100%) rename src/util/{boot.c => boot.cc} (100%) rename src/util/checkpt/{checkpt.c => checkpt.cc} (100%) rename src/util/pipelines/{pipelines_helper.c => pipelines_helper.cc} (100%) rename src/util/pipelines/{pipelines_serial.c => pipelines_serial.cc} (100%) rename src/util/pipelines/{pipelines_thread.c => pipelines_thread.cc} (100%) rename src/util/profile/{profile.c => profile.cc} (100%) rename src/util/rng/{drandn_table.c => drandn_table.cc} (100%) rename src/util/rng/{frandn_table.c => frandn_table.cc} (100%) rename src/util/rng/{rng.c => rng.cc} (100%) rename src/util/rng/{rng_pool.c => rng_pool.cc} (100%) rename src/util/{util_base.c => util_base.cc} (100%) diff --git a/src/boundary/absorb_tally.c b/src/boundary/absorb_tally.cc similarity index 100% rename from src/boundary/absorb_tally.c rename to src/boundary/absorb_tally.cc diff --git a/src/boundary/boundary.c b/src/boundary/boundary.cc similarity index 100% rename from src/boundary/boundary.c rename to src/boundary/boundary.cc diff --git a/src/boundary/link.c b/src/boundary/link.cc similarity index 100% rename from src/boundary/link.c rename to src/boundary/link.cc diff --git a/src/boundary/maxwellian_reflux.c b/src/boundary/maxwellian_reflux.cc similarity index 100% rename from src/boundary/maxwellian_reflux.c rename to src/boundary/maxwellian_reflux.cc diff --git a/src/collision/binary.c b/src/collision/binary.cc similarity index 100% rename from src/collision/binary.c 
rename to src/collision/binary.cc diff --git a/src/collision/collision.c b/src/collision/collision.cc similarity index 100% rename from src/collision/collision.c rename to src/collision/collision.cc diff --git a/src/collision/hard_sphere.c b/src/collision/hard_sphere.cc similarity index 100% rename from src/collision/hard_sphere.c rename to src/collision/hard_sphere.cc diff --git a/src/collision/langevin.c b/src/collision/langevin.cc similarity index 100% rename from src/collision/langevin.c rename to src/collision/langevin.cc diff --git a/src/collision/large_angle_coulomb.c b/src/collision/large_angle_coulomb.cc similarity index 100% rename from src/collision/large_angle_coulomb.c rename to src/collision/large_angle_coulomb.cc diff --git a/src/collision/pipeline/binary_pipeline.c b/src/collision/pipeline/binary_pipeline.cc similarity index 100% rename from src/collision/pipeline/binary_pipeline.c rename to src/collision/pipeline/binary_pipeline.cc diff --git a/src/collision/pipeline/langevin_pipeline.c b/src/collision/pipeline/langevin_pipeline.cc similarity index 100% rename from src/collision/pipeline/langevin_pipeline.c rename to src/collision/pipeline/langevin_pipeline.cc diff --git a/src/collision/pipeline/unary_pipeline.c b/src/collision/pipeline/unary_pipeline.cc similarity index 100% rename from src/collision/pipeline/unary_pipeline.c rename to src/collision/pipeline/unary_pipeline.cc diff --git a/src/collision/unary.c b/src/collision/unary.cc similarity index 100% rename from src/collision/unary.c rename to src/collision/unary.cc diff --git a/src/emitter/child_langmuir.c b/src/emitter/child_langmuir.cc similarity index 100% rename from src/emitter/child_langmuir.c rename to src/emitter/child_langmuir.cc diff --git a/src/emitter/emitter.c b/src/emitter/emitter.cc similarity index 100% rename from src/emitter/emitter.c rename to src/emitter/emitter.cc diff --git a/src/field_advance/field_advance.c b/src/field_advance/field_advance.cc similarity index 100% 
rename from src/field_advance/field_advance.c rename to src/field_advance/field_advance.cc diff --git a/src/field_advance/standard/clean_div_e.c b/src/field_advance/standard/clean_div_e.cc similarity index 100% rename from src/field_advance/standard/clean_div_e.c rename to src/field_advance/standard/clean_div_e.cc diff --git a/src/field_advance/standard/compute_div_e_err.c b/src/field_advance/standard/compute_div_e_err.cc similarity index 100% rename from src/field_advance/standard/compute_div_e_err.c rename to src/field_advance/standard/compute_div_e_err.cc diff --git a/src/field_advance/standard/compute_rhob.c b/src/field_advance/standard/compute_rhob.cc similarity index 100% rename from src/field_advance/standard/compute_rhob.c rename to src/field_advance/standard/compute_rhob.cc diff --git a/src/field_advance/standard/compute_rms_div_b_err.c b/src/field_advance/standard/compute_rms_div_b_err.cc similarity index 100% rename from src/field_advance/standard/compute_rms_div_b_err.c rename to src/field_advance/standard/compute_rms_div_b_err.cc diff --git a/src/field_advance/standard/compute_rms_div_e_err.c b/src/field_advance/standard/compute_rms_div_e_err.cc similarity index 100% rename from src/field_advance/standard/compute_rms_div_e_err.c rename to src/field_advance/standard/compute_rms_div_e_err.cc diff --git a/src/field_advance/standard/energy_f.c b/src/field_advance/standard/energy_f.cc similarity index 100% rename from src/field_advance/standard/energy_f.c rename to src/field_advance/standard/energy_f.cc diff --git a/src/field_advance/standard/local.c b/src/field_advance/standard/local.cc similarity index 100% rename from src/field_advance/standard/local.c rename to src/field_advance/standard/local.cc diff --git a/src/field_advance/standard/pipeline/clean_div_e_pipeline.c b/src/field_advance/standard/pipeline/clean_div_e_pipeline.cc similarity index 100% rename from src/field_advance/standard/pipeline/clean_div_e_pipeline.c rename to 
src/field_advance/standard/pipeline/clean_div_e_pipeline.cc diff --git a/src/field_advance/standard/pipeline/compute_div_e_err_pipeline.c b/src/field_advance/standard/pipeline/compute_div_e_err_pipeline.cc similarity index 100% rename from src/field_advance/standard/pipeline/compute_div_e_err_pipeline.c rename to src/field_advance/standard/pipeline/compute_div_e_err_pipeline.cc diff --git a/src/field_advance/standard/pipeline/compute_rhob_pipeline.c b/src/field_advance/standard/pipeline/compute_rhob_pipeline.cc similarity index 100% rename from src/field_advance/standard/pipeline/compute_rhob_pipeline.c rename to src/field_advance/standard/pipeline/compute_rhob_pipeline.cc diff --git a/src/field_advance/standard/pipeline/compute_rms_div_b_err_pipeline.c b/src/field_advance/standard/pipeline/compute_rms_div_b_err_pipeline.cc similarity index 100% rename from src/field_advance/standard/pipeline/compute_rms_div_b_err_pipeline.c rename to src/field_advance/standard/pipeline/compute_rms_div_b_err_pipeline.cc diff --git a/src/field_advance/standard/pipeline/compute_rms_div_e_err_pipeline.c b/src/field_advance/standard/pipeline/compute_rms_div_e_err_pipeline.cc similarity index 100% rename from src/field_advance/standard/pipeline/compute_rms_div_e_err_pipeline.c rename to src/field_advance/standard/pipeline/compute_rms_div_e_err_pipeline.cc diff --git a/src/field_advance/standard/pipeline/energy_f_pipeline.c b/src/field_advance/standard/pipeline/energy_f_pipeline.cc similarity index 100% rename from src/field_advance/standard/pipeline/energy_f_pipeline.c rename to src/field_advance/standard/pipeline/energy_f_pipeline.cc diff --git a/src/field_advance/standard/pipeline/vacuum_clean_div_e_pipeline.c b/src/field_advance/standard/pipeline/vacuum_clean_div_e_pipeline.cc similarity index 100% rename from src/field_advance/standard/pipeline/vacuum_clean_div_e_pipeline.c rename to src/field_advance/standard/pipeline/vacuum_clean_div_e_pipeline.cc diff --git 
a/src/field_advance/standard/pipeline/vacuum_compute_div_e_err_pipeline.c b/src/field_advance/standard/pipeline/vacuum_compute_div_e_err_pipeline.cc similarity index 100% rename from src/field_advance/standard/pipeline/vacuum_compute_div_e_err_pipeline.c rename to src/field_advance/standard/pipeline/vacuum_compute_div_e_err_pipeline.cc diff --git a/src/field_advance/standard/pipeline/vacuum_compute_rhob_pipeline.c b/src/field_advance/standard/pipeline/vacuum_compute_rhob_pipeline.cc similarity index 100% rename from src/field_advance/standard/pipeline/vacuum_compute_rhob_pipeline.c rename to src/field_advance/standard/pipeline/vacuum_compute_rhob_pipeline.cc diff --git a/src/field_advance/standard/pipeline/vacuum_energy_f_pipeline.c b/src/field_advance/standard/pipeline/vacuum_energy_f_pipeline.cc similarity index 100% rename from src/field_advance/standard/pipeline/vacuum_energy_f_pipeline.c rename to src/field_advance/standard/pipeline/vacuum_energy_f_pipeline.cc diff --git a/src/field_advance/standard/remote.c b/src/field_advance/standard/remote.cc similarity index 100% rename from src/field_advance/standard/remote.c rename to src/field_advance/standard/remote.cc diff --git a/src/field_advance/standard/sfa.c b/src/field_advance/standard/sfa.cc similarity index 100% rename from src/field_advance/standard/sfa.c rename to src/field_advance/standard/sfa.cc diff --git a/src/field_advance/standard/vacuum_clean_div_e.c b/src/field_advance/standard/vacuum_clean_div_e.cc similarity index 100% rename from src/field_advance/standard/vacuum_clean_div_e.c rename to src/field_advance/standard/vacuum_clean_div_e.cc diff --git a/src/field_advance/standard/vacuum_compute_div_e_err.c b/src/field_advance/standard/vacuum_compute_div_e_err.cc similarity index 100% rename from src/field_advance/standard/vacuum_compute_div_e_err.c rename to src/field_advance/standard/vacuum_compute_div_e_err.cc diff --git a/src/field_advance/standard/vacuum_compute_rhob.c 
b/src/field_advance/standard/vacuum_compute_rhob.cc similarity index 100% rename from src/field_advance/standard/vacuum_compute_rhob.c rename to src/field_advance/standard/vacuum_compute_rhob.cc diff --git a/src/field_advance/standard/vacuum_energy_f.c b/src/field_advance/standard/vacuum_energy_f.cc similarity index 100% rename from src/field_advance/standard/vacuum_energy_f.c rename to src/field_advance/standard/vacuum_energy_f.cc diff --git a/src/grid/grid_comm.c b/src/grid/grid_comm.cc similarity index 100% rename from src/grid/grid_comm.c rename to src/grid/grid_comm.cc diff --git a/src/grid/grid_structors.c b/src/grid/grid_structors.cc similarity index 100% rename from src/grid/grid_structors.c rename to src/grid/grid_structors.cc diff --git a/src/grid/ops.c b/src/grid/ops.cc similarity index 100% rename from src/grid/ops.c rename to src/grid/ops.cc diff --git a/src/grid/partition.c b/src/grid/partition.cc similarity index 100% rename from src/grid/partition.c rename to src/grid/partition.cc diff --git a/src/material/material.c b/src/material/material.cc similarity index 100% rename from src/material/material.c rename to src/material/material.cc diff --git a/src/sf_interface/accumulator_array.c b/src/sf_interface/accumulator_array.cc similarity index 100% rename from src/sf_interface/accumulator_array.c rename to src/sf_interface/accumulator_array.cc diff --git a/src/sf_interface/clear_accumulators.c b/src/sf_interface/clear_accumulators.cc similarity index 100% rename from src/sf_interface/clear_accumulators.c rename to src/sf_interface/clear_accumulators.cc diff --git a/src/sf_interface/hydro_array.c b/src/sf_interface/hydro_array.cc similarity index 100% rename from src/sf_interface/hydro_array.c rename to src/sf_interface/hydro_array.cc diff --git a/src/sf_interface/pipeline/clear_accumulators_pipeline.c b/src/sf_interface/pipeline/clear_accumulators_pipeline.cc similarity index 100% rename from src/sf_interface/pipeline/clear_accumulators_pipeline.c 
rename to src/sf_interface/pipeline/clear_accumulators_pipeline.cc diff --git a/src/species_advance/species_advance.c b/src/species_advance/species_advance.cc similarity index 100% rename from src/species_advance/species_advance.c rename to src/species_advance/species_advance.cc diff --git a/src/species_advance/standard/hydro_p.c b/src/species_advance/standard/hydro_p.cc similarity index 100% rename from src/species_advance/standard/hydro_p.c rename to src/species_advance/standard/hydro_p.cc diff --git a/src/species_advance/standard/pipeline/sort_p_pipeline.c b/src/species_advance/standard/pipeline/sort_p_pipeline.cc similarity index 100% rename from src/species_advance/standard/pipeline/sort_p_pipeline.c rename to src/species_advance/standard/pipeline/sort_p_pipeline.cc diff --git a/src/species_advance/standard/sort_p.c b/src/species_advance/standard/sort_p.cc similarity index 100% rename from src/species_advance/standard/sort_p.c rename to src/species_advance/standard/sort_p.cc diff --git a/src/util/boot.c b/src/util/boot.cc similarity index 100% rename from src/util/boot.c rename to src/util/boot.cc diff --git a/src/util/checkpt/checkpt.c b/src/util/checkpt/checkpt.cc similarity index 100% rename from src/util/checkpt/checkpt.c rename to src/util/checkpt/checkpt.cc diff --git a/src/util/pipelines/pipelines_helper.c b/src/util/pipelines/pipelines_helper.cc similarity index 100% rename from src/util/pipelines/pipelines_helper.c rename to src/util/pipelines/pipelines_helper.cc diff --git a/src/util/pipelines/pipelines_serial.c b/src/util/pipelines/pipelines_serial.cc similarity index 100% rename from src/util/pipelines/pipelines_serial.c rename to src/util/pipelines/pipelines_serial.cc diff --git a/src/util/pipelines/pipelines_thread.c b/src/util/pipelines/pipelines_thread.cc similarity index 100% rename from src/util/pipelines/pipelines_thread.c rename to src/util/pipelines/pipelines_thread.cc diff --git a/src/util/profile/profile.c b/src/util/profile/profile.cc 
similarity index 100% rename from src/util/profile/profile.c rename to src/util/profile/profile.cc diff --git a/src/util/rng/drandn_table.c b/src/util/rng/drandn_table.cc similarity index 100% rename from src/util/rng/drandn_table.c rename to src/util/rng/drandn_table.cc diff --git a/src/util/rng/frandn_table.c b/src/util/rng/frandn_table.cc similarity index 100% rename from src/util/rng/frandn_table.c rename to src/util/rng/frandn_table.cc diff --git a/src/util/rng/rng.c b/src/util/rng/rng.cc similarity index 100% rename from src/util/rng/rng.c rename to src/util/rng/rng.cc diff --git a/src/util/rng/rng_pool.c b/src/util/rng/rng_pool.cc similarity index 100% rename from src/util/rng/rng_pool.c rename to src/util/rng/rng_pool.cc diff --git a/src/util/util_base.c b/src/util/util_base.cc similarity index 100% rename from src/util/util_base.c rename to src/util/util_base.cc From 3a03a8810eb465c64739f9e6e0743168bc71ee33 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Tue, 9 Jul 2019 21:26:28 -0600 Subject: [PATCH 14/95] Format tweaks. 
--- deck/main.cc | 2 +- src/boundary/boundary_p.cc | 726 ++++++++++++------ src/boundary/link.cc | 3 +- src/collision/collision_private.h | 3 +- src/collision/hard_sphere.cc | 101 +-- src/collision/large_angle_coulomb.cc | 107 +-- src/emitter/child_langmuir.cc | 1 - src/species_advance/species_advance.cc | 27 +- src/species_advance/standard/hydro_p.cc | 3 +- src/species_advance/standard/move_p.cc | 107 ++- .../standard/pipeline/advance_p_pipeline.cc | 3 +- .../pipeline/advance_p_pipeline_v16.cc | 121 +-- .../standard/pipeline/center_p_pipeline.cc | 5 +- .../standard/pipeline/spa_private.h | 4 - .../standard/pipeline/uncenter_p_pipeline.cc | 8 +- src/species_advance/standard/rho_p.cc | 8 +- src/species_advance/standard/sort_p.cc | 4 +- src/util/profile/profile.h | 1 + src/util/v16/v16_avx512.h | 18 +- src/util/v4/v4_altivec.h | 42 +- src/util/v4/v4_avx.h | 39 +- src/util/v4/v4_avx2.h | 16 +- src/util/v4/v4_sse.h | 39 +- src/vpic/dump.cc | 3 +- src/vpic/initialize.cc | 1 - src/vpic/misc.cc | 21 +- src/vpic/vpic.h | 6 +- 27 files changed, 898 insertions(+), 521 deletions(-) diff --git a/deck/main.cc b/deck/main.cc index 001baff4..f9f7fb1b 100644 --- a/deck/main.cc +++ b/deck/main.cc @@ -103,7 +103,7 @@ int main(int argc, char** argv) // Do any post init/restore simulation modifications - // Detec if the "modify" option is passed, which allows users to change + // Detect if the "modify" option is passed, which allows users to change // options (such as quota, num_step, etc) when restoring fbase = strip_cmdline_string( &argc, &argv, "--modify", NULL ); if( fbase ) diff --git a/src/boundary/boundary_p.cc b/src/boundary/boundary_p.cc index 25d87b41..a50d6657 100644 --- a/src/boundary/boundary_p.cc +++ b/src/boundary/boundary_p.cc @@ -1,9 +1,11 @@ #define IN_boundary + #include "boundary_private.h" -// If this is defined particle and mover buffers will not resize dynamically -// (This is the common case for the users) -//#define DISABLE_DYNAMIC_RESIZING +// If this is 
defined particle and mover buffers will not resize dynamically. +// This is the common case for the users. + +#define DISABLE_DYNAMIC_RESIZING // FIXME: ARCHITECTURAL FLAW! CUSTOM BCS AND SHARED FACES CANNOT // COEXIST ON THE SAME FACE! THIS MEANS THAT CUSTOM BOUNDARYS MUST @@ -25,6 +27,10 @@ using namespace v4; #endif +#ifdef V8_ACCELERATION +using namespace v8; +#endif + #ifndef MIN_NP #define MIN_NP 128 // Default to 4kb (~1 page worth of memory) //#define MIN_NP 32768 // 32768 particles is 1 MiB of memory. @@ -33,13 +39,15 @@ using namespace v4; enum { MAX_PBC = 32, MAX_SP = 32 }; +// This is the AoS implementation. + void boundary_p( particle_bc_t * RESTRICT pbc_list, species_t * RESTRICT sp_list, field_array_t * RESTRICT fa, - accumulator_array_t * RESTRICT aa ) { - - // Gives the local mp port associated with a local face + accumulator_array_t * RESTRICT aa ) +{ + // Gives the local mp port associated with a local face. static const int f2b[6] = { BOUNDARY(-1, 0, 0), BOUNDARY( 0,-1, 0), BOUNDARY( 0, 0,-1), @@ -47,7 +55,7 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, BOUNDARY( 0, 1, 0), BOUNDARY( 0, 0, 1) }; - // Gives the remote mp port associated with a local face + // Gives the remote mp port associated with a local face. static const int f2rb[6] = { BOUNDARY( 1, 0, 0), BOUNDARY( 0, 1, 0), BOUNDARY( 0, 0, 1), @@ -55,122 +63,178 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, BOUNDARY( 0,-1, 0), BOUNDARY( 0, 0,-1) }; - // Gives the axis associated with a local face - static const int axis[6] = { 0, 1, 2, 0, 1, 2 }; + // Gives the axis associated with a local face. + static const int axis[6] = { 0, 1, 2, 0, 1, 2 }; - // Gives the location of sending face on the receiver + // Gives the location of sending face on the receiver. static const float dir[6] = { 1, 1, 1, -1, -1, -1 }; - // Temporary store for local particle injectors - // FIXME: Ugly static usage + // Temporary store for local particle injectors. + // FIXME: Ugly static usage. 
static particle_injector_t * RESTRICT ALIGNED(16) ci = NULL; + static int max_ci = 0; int n_send[6], n_recv[6], n_ci; species_t * sp; + int face; - // Check input args + // Check input args. - if( !sp_list ) return; // Nothing to do if no species - if( !fa || !aa || sp_list->g!=aa->g || fa->g!=aa->g ) - ERROR(( "Bad args" )); + if ( ! sp_list ) + { + return; // Nothing to do if no species. + } - // Unpack the particle boundary conditions + if ( ! fa || + ! aa || + sp_list->g != aa->g || + fa->g != aa->g ) + { + ERROR( ( "Bad args." ) ); + } + + // Unpack the particle boundary conditions. particle_bc_func_t pbc_interact[MAX_PBC]; + void * pbc_params[MAX_PBC]; + const int nb = num_particle_bc( pbc_list ); - if( nb>MAX_PBC ) ERROR(( "Update this to support more particle boundary conditions" )); - for( particle_bc_t * pbc=pbc_list; pbc; pbc=pbc->next ) { - pbc_interact[-pbc->id-3] = pbc->interact; - pbc_params[ -pbc->id-3] = pbc->params; - } - // Unpack fields + if ( nb > MAX_PBC ) + { + ERROR( ( "Update this to support more particle boundary conditions." ) ); + } + + for( particle_bc_t * pbc = pbc_list; pbc; pbc = pbc->next ) + { + pbc_interact[ -pbc->id - 3 ] = pbc->interact; + pbc_params [ -pbc->id - 3 ] = pbc->params; + } + + // Unpack fields. field_t * RESTRICT ALIGNED(128) f = fa->f; grid_t * RESTRICT g = fa->g; - // Unpack accumulator + // Unpack accumulator. accumulator_t * RESTRICT ALIGNED(128) a0 = aa->a; - // Unpack the grid + // Unpack the grid. 
const int64_t * RESTRICT ALIGNED(128) neighbor = g->neighbor; /**/ mp_t * RESTRICT mp = g->mp; + const int64_t rangel = g->rangel; const int64_t rangeh = g->rangeh; const int64_t rangem = g->range[world_size]; + /*const*/ int bc[6], shared[6]; /*const*/ int64_t range[6]; - for( face=0; face<6; face++ ) { - bc[face] = g->bc[f2b[face]]; - shared[face] = (bc[face]>=0) && (bc[face]range[bc[face]]; - } - // Begin receiving the particle counts + for( face = 0; face < 6; face++ ) + { + bc [ face ] = g->bc[ f2b[ face ] ]; + + shared[ face ] = ( bc[ face ] >= 0 ) && + ( bc[ face ] < world_size ) && + ( bc[ face ] != world_rank ); - for( face=0; face<6; face++ ) - if( shared[face] ) { - mp_size_recv_buffer( mp, f2b[face], sizeof(int) ); - mp_begin_recv( mp, f2b[face], sizeof(int), bc[face], f2rb[face] ); + if ( shared[ face ] ) + { + range[ face ] = g->range[ bc[ face ] ]; } + } - // Load the particle send and local injection buffers + // Begin receiving the particle counts. + + for( face = 0; face < 6; face++ ) + { + if ( shared[ face ] ) + { + mp_size_recv_buffer( mp, + f2b[ face ], + sizeof( int ) ); + + mp_begin_recv( mp, + f2b[ face ], + sizeof( int ), + bc[ face ], + f2rb[ face ] ); + } + } - do { + // Load the particle send and local injection buffers. + do + { particle_injector_t * RESTRICT ALIGNED(16) pi_send[6]; - // Presize the send and injection buffers + // Presize the send and injection buffers. // // Each buffer is large enough to hold one injector corresponding // to every mover in use (worst case, but plausible scenario in // beam simulations, is one buffer gets all the movers). // // FIXME: We could be several times more efficient in our particle - // injector buffer sizing here. Namely, we could create on local - // injector buffer of nm is size. All injection for all - // boundaries would be done here. The local buffer would then be - // counted to determine the size of each send buffer. 
The local - // buffer would then move all injectors into the approate send - // buffers (leaving only the local injectors). This would require - // some extra data motion though. (But would give a more robust - // implementation against variations in MP implementation.) + // injector buffer sizing here. Namely, we could create one local + // injector buffer of nm in size. All injection for all boundaries + // would be done here. The local buffer would then be counted to + // determine the size of each send buffer. The local buffer would + // then move all injectors into the appropriate send buffers, leaving + // only the local injectors. This would require some extra data + // motion though, but would give a more robust implementation against + // variations in MP implementation. // // FIXME: This presizing assumes that custom boundary conditions // inject at most one particle per incident particle. Currently, // the invocation of pbc_interact[*] insures that assumption will - // be satisfied (if the handlers conform that it). We should be - // more flexible though in the future (especially given above the - // above overalloc). + // be satisfied, if the handlers conform that it. We should be + // more flexible though in the future, especially given the above + // overalloc. 
- int nm = 0; LIST_FOR_EACH( sp, sp_list ) nm += sp->nm; + int nm = 0; - for( face=0; face<6; face++ ) - if( shared[face] ) { - mp_size_send_buffer( mp, f2b[face], 16+nm*sizeof(particle_injector_t) ); - pi_send[face] = (particle_injector_t *)(((char *)mp_send_buffer(mp,f2b[face]))+16); - n_send[face] = 0; + LIST_FOR_EACH( sp, sp_list ) nm += sp->nm; + + for( face = 0; face < 6; face++ ) + { + if ( shared[ face ] ) + { + mp_size_send_buffer( mp, + f2b[ face ], + 16 + nm * sizeof( particle_injector_t ) ); + + pi_send[ face ] = (particle_injector_t *) ( ( (char *) mp_send_buffer( mp, + f2b[ face ] ) + ) + 16 ); + + n_send[ face ] = 0; } + } - if( max_ciq; const int32_t sp_id = sp->id; @@ -187,49 +251,70 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, // Note that particle movers for each species are processed in // reverse order. This allows us to backfill holes in the // particle list created by boundary conditions and/or - // communication. This assumes particle on the mover list are + // communication. This assumes particles on the mover list are // monotonically increasing. That is: pm[n].i > pm[n-1].i for // n=1...nm-1. advance_p and inject_particle create movers with // property if all aged particle injection occurs after - // advance_p and before this + // advance_p and before this. - for( ; nm; pm--, nm-- ) { - i = pm->i; - voxel = p0[i].i; - face = voxel & 7; + for( ; nm; pm--, nm-- ) + { + i = pm->i; + voxel = p0[i].i; + face = voxel & 7; voxel >>= 3; p0[i].i = voxel; - nn = neighbor[ 6*voxel + face ]; + nn = neighbor[ 6 * voxel + face ]; - // Absorb + // Absorb. - if( nn==absorb_particles ) { + if ( nn == absorb_particles ) + { // Ideally, we would batch all rhob accumulations together - // for efficiency - accumulate_rhob( f, p0+i, g, sp_q ); + // for efficiency. + accumulate_rhob( f, p0 + i, g, sp_q ); + goto backfill; } - // Send to a neighboring node + // Send to a neighboring node. 
+ + if ( ( ( nn >= 0 ) & ( nn < rangel ) ) | + ( ( nn > rangeh ) & ( nn <= rangem ) ) ) + { + pi = &pi_send[ face ] [ n_send[ face ]++ ]; + + #ifdef V4_ACCELERATION - if( ((nn>=0) & (nn< rangel)) | ((nn>rangeh) & (nn<=rangem)) ) { - pi = &pi_send[face][n_send[face]++]; -# ifdef V4_ACCELERATION copy_4x1( &pi->dx, &p0[i].dx ); copy_4x1( &pi->ux, &p0[i].ux ); copy_4x1( &pi->dispx, &pm->dispx ); -# else - pi->dx=p0[i].dx; pi->dy=p0[i].dy; pi->dz=p0[i].dz; - pi->ux=p0[i].ux; pi->uy=p0[i].uy; pi->uz=p0[i].uz; pi->w=p0[i].w; - pi->dispx = pm->dispx; pi->dispy = pm->dispy; pi->dispz = pm->dispz; -# endif - (&pi->dx)[axis[face]] = dir[face]; - pi->i = nn - range[face]; - pi->sp_id = sp_id; + + #else + + pi->dx = p0[i].dx; + pi->dy = p0[i].dy; + pi->dz = p0[i].dz; + + pi->ux = p0[i].ux; + pi->uy = p0[i].uy; + pi->uz = p0[i].uz; + pi->w = p0[i].w; + + pi->dispx = pm->dispx; + pi->dispy = pm->dispy; + pi->dispz = pm->dispz; + + #endif + + ( &pi->dx )[ axis[ face ] ] = dir[ face ]; + pi->i = nn - range[ face ]; + pi->sp_id = sp_id; + goto backfill; } - // User-defined handling + // User-defined handling. // After a particle interacts with a boundary it is removed // from the local particle list. Thus, if a boundary handler @@ -248,27 +333,45 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, // nothing to rhob. nn = -nn - 3; // Assumes reflective/absorbing are -1, -2 - if( (nn>=0) & (nn= 0 ) & + ( nn < nb ) ) + { + n_ci += pbc_interact[ nn ]( pbc_params[ nn ], + sp, + p0 + i, + pm, + ci + n_ci, + 1, + face ); + goto backfill; } - // Uh-oh: We fell through + // Uh-oh: We fell through. - WARNING(( "Unknown boundary interaction ... dropping particle " - "(species=%s)", sp->name )); + WARNING( ( "Unknown boundary interaction ... 
dropping particle " + "(species=%s)", + sp->name ) ); backfill: np--; -# ifdef V4_ACCELERATION + + #if defined(V8_ACCELERATION) + + copy_8x1( &p0[i].dx, &p0[np].dx ); + + #elif defined(V4_ACCELERATION) + copy_4x1( &p0[i].dx, &p0[np].dx ); copy_4x1( &p0[i].ux, &p0[np].ux ); -# else + + #else + p0[i] = p0[np]; -# endif + #endif } sp->np = np; @@ -289,230 +392,399 @@ boundary_p( particle_bc_t * RESTRICT pbc_list, // equilvanet of a MPI_Getcount to determine how much data you // actually received. - for( face=0; face<6; face++ ) - if( shared[face] ) { - *((int *)mp_send_buffer( mp, f2b[face] )) = n_send[face]; - mp_begin_send( mp, f2b[face], sizeof(int), bc[face], f2b[face] ); + for( face = 0; face < 6; face++ ) + { + if ( shared[ face ] ) + { + *( (int *) mp_send_buffer( mp, + f2b[ face ] ) ) = n_send[ face ]; + + mp_begin_send( mp, + f2b[ face ], + sizeof( int ), + bc[ face ], + f2b[ face ] ); } + } - for( face=0; face<6; face++ ) - if( shared[face] ) { - mp_end_recv( mp, f2b[face] ); - n_recv[face] = *((int *)mp_recv_buffer( mp, f2b[face] )); - mp_size_recv_buffer( mp, f2b[face], - 16+n_recv[face]*sizeof(particle_injector_t) ); - mp_begin_recv( mp, f2b[face], 16+n_recv[face]*sizeof(particle_injector_t), - bc[face], f2rb[face] ); + for( face = 0; face < 6; face++ ) + { + if ( shared[ face ] ) + { + mp_end_recv( mp, + f2b[ face ] ); + + n_recv[ face ] = *( (int *) mp_recv_buffer( mp, + f2b[ face ] ) ); + + mp_size_recv_buffer( mp, + f2b[ face ], + 16 + n_recv[ face ] * sizeof( particle_injector_t ) ); + + mp_begin_recv( mp, + f2b[ face ], + 16 + n_recv[ face ] * sizeof( particle_injector_t ), + bc[ face ], + f2rb[ face ] ); } + } + + for( face = 0; face < 6; face++ ) + { + if ( shared[ face ] ) + { + mp_end_send( mp, + f2b[ face ] ); - for( face=0; face<6; face++ ) - if( shared[face] ) { - mp_end_send( mp, f2b[face] ); // FIXME: ASSUMES MP WON'T MUCK WITH REST OF SEND BUFFER. 
IF WE // DID MORE EFFICIENT MOVER ALLOCATION ABOVE, THIS WOULD BE - // ROBUSTED AGAINST MP IMPLEMENTATION VAGARIES - mp_begin_send( mp, f2b[face], 16+n_send[face]*sizeof(particle_injector_t), - bc[face], f2b[face] ); + // ROBUSTED AGAINST MP IMPLEMENTATION VAGARIES. + + mp_begin_send( mp, + f2b[ face ], + 16 + n_send[ face ] * sizeof( particle_injector_t ), + bc[ face ], + f2b[ face ] ); } + } -# ifndef DISABLE_DYNAMIC_RESIZING - // Resize particle storage to accomodate worst case inject + #ifndef DISABLE_DYNAMIC_RESIZING + // Resize particle storage to accomodate worst case inject. - do { + do + { int n, nm; // Resize each species's particle and mover storage to be large - // enough to guarantee successful injection. (If we broke down + // enough to guarantee successful injection. If we broke down // the n_recv[face] by species before sending it, we could be - // tighter on memory footprint here.) + // tighter on memory footprint here. int max_inj = n_ci; - for( face=0; face<6; face++ ) - if( shared[face] ) max_inj += n_recv[face]; - LIST_FOR_EACH( sp, sp_list ) { + for( face = 0; face < 6; face++ ) + { + if ( shared[ face ] ) + { + max_inj += n_recv[ face ]; + } + } + + LIST_FOR_EACH( sp, sp_list ) + { particle_mover_t * new_pm; - particle_t * new_p; + particle_t * new_p; n = sp->np + max_inj; - if( n>sp->max_np ) { - n += 0.3125*n; // Increase by 31.25% (~<"silver - /**/ // ratio") to minimize resizes (max - /**/ // rate that avoids excessive heap - /**/ // fragmentation) - //float resize_ratio = (float)n/sp->max_np; - WARNING(( "Resizing local %s particle storage from %i to %i", - sp->name, sp->max_np, n )); + + if ( n > sp->max_np ) + { + n += 0.3125 * n; // Increase by 31.25% (~<"silver + /**/ // ratio") to minimize resizes (max + /**/ // rate that avoids excessive heap + /**/ // fragmentation) + + // float resize_ratio = (float)n/sp->max_np; + + WARNING( ( "Resizing local %s particle storage from %i to %i", + sp->name, + sp->max_np, + n ) ); + 
MALLOC_ALIGNED( new_p, n, 128 ); + COPY( new_p, sp->p, sp->np ); + FREE_ALIGNED( sp->p ); - sp->p = new_p, sp->max_np = n; - /*nm = sp->max_nm * resize_ratio; + sp->p = new_p; + sp->max_np = n; + + /* + nm = sp->max_nm * resize_ratio; WARNING(( "Resizing local %s mover storage from %i to %i", sp->name, sp->max_nm, nm )); MALLOC_ALIGNED( new_pm, nm, 128 ); COPY( new_pm, sp->pm, sp->nm ); FREE_ALIGNED( sp->pm ); sp->pm = new_pm; - sp->max_nm = nm;*/ + sp->max_nm = nm; + */ } - else if(sp->max_np > MIN_NP && n < sp->max_np>>1) + + else if( sp->max_np > MIN_NP && + n < sp->max_np >> 1 ) { - n += 0.125*n; // Overallocate by less since this rank is decreasing - if (nmax_np; - WARNING(( "Resizing (shrinking) local %s particle storage from " - "%i to %i", sp->name, sp->max_np, n)); + n += 0.125 * n; // Overallocate by less since this rank is decreasing + + if ( n < MIN_NP ) + { + n = MIN_NP; + } + + // float resize_ratio = (float)n/sp->max_np; + + WARNING( ( "Resizing (shrinking) local %s particle storage from " + "%i to %i", + sp->name, + sp->max_np, + n ) ); + MALLOC_ALIGNED( new_p, n, 128 ); + COPY( new_p, sp->p, sp->np ); + FREE_ALIGNED( sp->p ); - sp->p = new_p, sp->max_np = n; - /*nm = sp->max_nm * resize_ratio; + sp->p = new_p; + sp->max_np = n; + + /* + nm = sp->max_nm * resize_ratio; + WARNING(( "Resizing (shrinking) local %s mover storage from " "%i to %i", sp->name, sp->max_nm, nm)); MALLOC_ALIGNED( new_pm, nm, 128 ); COPY( new_pm, sp->pm, sp->nm ); FREE_ALIGNED( sp->pm ); - sp->pm = new_pm, sp->max_nm = nm;*/ + sp->pm = new_pm, sp->max_nm = nm; + */ } // Feasibly, a vacuum-filled rank may receive a shock and need more movers - // than available from MIN_NP + // than available from MIN_NP. + nm = sp->nm + max_inj; - if( nm>sp->max_nm ) { - nm += 0.3125*nm; // See note above - //float resize_ratio = (float)nm/sp->max_nm; - WARNING(( "This happened. 
Resizing local %s mover storage from " - "%i to %i based on not enough movers", - sp->name, sp->max_nm, nm )); + + if ( nm > sp->max_nm ) + { + nm += 0.3125 * nm; // See note above + + // float resize_ratio = (float)nm/sp->max_nm; + + WARNING( ( "This happened. Resizing local %s mover storage from " + "%i to %i based on not enough movers", + sp->name, + sp->max_nm, + nm ) ); + MALLOC_ALIGNED( new_pm, nm, 128 ); + COPY( new_pm, sp->pm, sp->nm ); + FREE_ALIGNED( sp->pm ); - sp->pm = new_pm; + + sp->pm = new_pm; sp->max_nm = nm; - /*n = sp->max_np * resize_ratio; + /* + n = sp->max_np * resize_ratio; WARNING(( "Resizing local %s particle storage from %i to %i", sp->name, sp->max_np, n )); MALLOC_ALIGNED( new_p, n, 128 ); COPY( new_p, sp->p, sp->np ); FREE_ALIGNED( sp->p ); - sp->p = new_p, sp->max_np = n;*/ + sp->p = new_p, sp->max_np = n; + */ } } } while(0); -# endif + #endif - do { + do + { + // Unpack the species list for random acesss. - // Unpack the species list for random acesss + particle_t * RESTRICT ALIGNED(32) sp_p [ MAX_SP ]; + particle_mover_t * RESTRICT ALIGNED(32) sp_pm[ MAX_SP ]; - particle_t * RESTRICT ALIGNED(32) sp_p[ MAX_SP]; - particle_mover_t * RESTRICT ALIGNED(32) sp_pm[MAX_SP]; - float sp_q[MAX_SP]; - int sp_np[MAX_SP]; - int sp_nm[MAX_SP]; + float sp_q [ MAX_SP ]; + int sp_np[ MAX_SP ]; + int sp_nm[ MAX_SP ]; -# ifdef DISABLE_DYNAMIC_RESIZING + #ifdef DISABLE_DYNAMIC_RESIZING int sp_max_np[64], n_dropped_particles[64]; - int sp_max_nm[64], n_dropped_movers[64]; -# endif + int sp_max_nm[64], n_dropped_movers [64]; + #endif - if( num_species( sp_list ) > MAX_SP ) - ERROR(( "Update this to support more species" )); - LIST_FOR_EACH( sp, sp_list ) { - sp_p[ sp->id ] = sp->p; + if ( num_species( sp_list ) > MAX_SP ) + { + ERROR( ( "Update this to support more species." 
) ); + } + + LIST_FOR_EACH( sp, sp_list ) + { + sp_p [ sp->id ] = sp->p; sp_pm[ sp->id ] = sp->pm; - sp_q[ sp->id ] = sp->q; + sp_q [ sp->id ] = sp->q; sp_np[ sp->id ] = sp->np; sp_nm[ sp->id ] = sp->nm; -# ifdef DISABLE_DYNAMIC_RESIZING - sp_max_np[sp->id]=sp->max_np; n_dropped_particles[sp->id]=0; - sp_max_nm[sp->id]=sp->max_nm; n_dropped_movers[sp->id]=0; -# endif + + #ifdef DISABLE_DYNAMIC_RESIZING + sp_max_np[ sp->id ] = sp->max_np; + sp_max_nm[ sp->id ] = sp->max_nm; + + n_dropped_particles[ sp->id ] = 0; + n_dropped_movers [ sp->id ] = 0; + #endif } // Inject particles. We do custom local injection first to // increase message overlap opportunities. face = 5; - do { + + do + { /**/ particle_t * RESTRICT ALIGNED(32) p; /**/ particle_mover_t * RESTRICT ALIGNED(16) pm; const particle_injector_t * RESTRICT ALIGNED(16) pi; + int np, nm, n, id; - face++; if( face==7 ) face = 0; - if( face==6 ) pi = ci, n = n_ci; - else if( shared[face] ) { - mp_end_recv( mp, f2b[face] ); + face++; + + if ( face == 7 ) + { + face = 0; + } + + if ( face == 6 ) + { + pi = ci; + n = n_ci; + } + + else if ( shared[ face ] ) + { + mp_end_recv( mp, + f2b[ face ] ); + pi = (const particle_injector_t *) - (((char *)mp_recv_buffer(mp,f2b[face]))+16); - n = n_recv[face]; - } else continue; + ( ( (char *) mp_recv_buffer( mp, + f2b[ face ] ) ) + 16 ); + + n = n_recv[ face ]; + } + + else + { + continue; + } // Reverse order injection is done to reduce thrashing of the - // particle list (particles are removed reverse order so the + // particle list. Particles are removed in reverse order so the // overall impact of removal + injection is to keep injected - // particles in order). + // particles in order. // - // WARNING: THIS TRUSTS THAT THE INJECTORS (INCLUDING THOSE - // RECEIVED FROM OTHER NODES) HAVE VALID PARTICLE IDS. + // WARNING: THIS TRUSTS THAT THE INJECTORS, INCLUDING THOSE + // RECEIVED FROM OTHER NODES, HAVE VALID PARTICLE IDS. 
+ + pi += n - 1; - pi += n-1; - for( ; n; pi--, n-- ) { + for( ; n; pi--, n-- ) + { id = pi->sp_id; - p = sp_p[id]; np = sp_np[id]; - pm = sp_pm[id]; nm = sp_nm[id]; - -# ifdef DISABLE_DYNAMIC_RESIZING - if( np>=sp_max_np[id] ) { n_dropped_particles[id]++; continue; } -# endif -# ifdef V4_ACCELERATION - copy_4x1( &p[np].dx, &pi->dx ); - copy_4x1( &p[np].ux, &pi->ux ); -# else - p[np].dx=pi->dx; p[np].dy=pi->dy; p[np].dz=pi->dz; p[np].i=pi->i; - p[np].ux=pi->ux; p[np].uy=pi->uy; p[np].uz=pi->uz; p[np].w=pi->w; -# endif - sp_np[id] = np+1; - -# ifdef DISABLE_DYNAMIC_RESIZING - if( nm>=sp_max_nm[id] ) { n_dropped_movers[id]++; continue; } -# endif -# ifdef V4_ACCELERATION + + p = sp_p [id]; + np = sp_np[id]; + + pm = sp_pm[id]; + nm = sp_nm[id]; + + #ifdef DISABLE_DYNAMIC_RESIZING + if ( np >= sp_max_np[ id ] ) + { + n_dropped_particles[ id ]++; + + continue; + } + #endif + + #ifdef V4_ACCELERATION + + copy_4x1( &p[np].dx, &pi->dx ); + copy_4x1( &p[np].ux, &pi->ux ); + + #else + + p[np].dx = pi->dx; + p[np].dy = pi->dy; + p[np].dz = pi->dz; + p[np].i = pi->i; + + p[np].ux = pi->ux; + p[np].uy = pi->uy; + p[np].uz = pi->uz; + p[np].w = pi->w; + + #endif + + sp_np[id] = np + 1; + + #ifdef DISABLE_DYNAMIC_RESIZING + if ( nm >= sp_max_nm[ id ] ) + { + n_dropped_movers[ id ]++; + + continue; + } + #endif + + #ifdef V4_ACCELERATION + copy_4x1( &pm[nm].dispx, &pi->dispx ); + pm[nm].i = np; -# else - pm[nm].dispx=pi->dispx; pm[nm].dispy=pi->dispy; pm[nm].dispz=pi->dispz; - pm[nm].i=np; -# endif - sp_nm[id] = nm + move_p( p, pm+nm, a0, g, sp_q[id] ); + + #else + + pm[nm].dispx = pi->dispx; + pm[nm].dispy = pi->dispy; + pm[nm].dispz = pi->dispz; + pm[nm].i = np; + + #endif + + sp_nm[id] = nm + move_p( p, pm + nm, a0, g, sp_q[id] ); } - } while(face!=5); - - LIST_FOR_EACH( sp, sp_list ) { -# ifdef DISABLE_DYNAMIC_RESIZING - if( n_dropped_particles[sp->id] ) - WARNING(( "Dropped %i particles from species \"%s\". 
Use a larger " - "local particle allocation in your simulation setup for " - "this species on this node.", - n_dropped_particles[sp->id], sp->name )); - if( n_dropped_movers[sp->id] ) - WARNING(( "%i particles were not completed moved to their final " - "location this timestep for species \"%s\". Use a larger " - "local particle mover buffer in your simulation setup " - "for this species on this node.", - n_dropped_movers[sp->id], sp->name )); -# endif - sp->np=sp_np[sp->id]; - sp->nm=sp_nm[sp->id]; + } while( face != 5 ); + + LIST_FOR_EACH( sp, sp_list ) + { + #ifdef DISABLE_DYNAMIC_RESIZING + if ( n_dropped_particles[ sp->id ] ) + { + WARNING( ( "Dropped %i particles from species \"%s\". Use a larger " + "local particle allocation in your simulation setup for " + "this species on this node.", + n_dropped_particles[ sp->id ], + sp->name ) ); + } + + if ( n_dropped_movers[ sp->id ] ) + { + WARNING( ( "%i particles were not completed moved to their final " + "location this timestep for species \"%s\". 
Use a larger " + "local particle mover buffer in your simulation setup " + "for this species on this node.", + n_dropped_movers[ sp->id ], + sp->name ) ); + } + #endif + + sp->np = sp_np[ sp->id ]; + sp->nm = sp_nm[ sp->id ]; } } while(0); - for( face=0; face<6; face++ ) - if( shared[face] ) mp_end_send(mp,f2b[face]); + for( face = 0; face < 6; face++ ) + { + if ( shared[ face ] ) + { + mp_end_send( mp, + f2b[ face ] ); + } + } } diff --git a/src/boundary/link.cc b/src/boundary/link.cc index 6743cfeb..0e12cec4 100644 --- a/src/boundary/link.cc +++ b/src/boundary/link.cc @@ -24,7 +24,8 @@ link_boundary( link_boundary_t * lb, species_t * sp, particle_injector_t * pi, rng_t * rng, - int face ) { + int face ) +{ static FILE *fp = NULL; int ix, iy, iz; double x, y, z; diff --git a/src/collision/collision_private.h b/src/collision/collision_private.h index 52d6c8e4..2bbef1b6 100644 --- a/src/collision/collision_private.h +++ b/src/collision/collision_private.h @@ -44,7 +44,8 @@ END_C_DECLS /////////////////////////////////////////////////////////////////////////////// // Langevin pipeline interface -typedef struct langevin_pipeline_args { +typedef struct langevin_pipeline_args +{ MEM_PTR( particle_t, 128 ) p; MEM_PTR( rng_t, 128 ) rng[ MAX_PIPELINE ]; float decay; diff --git a/src/collision/hard_sphere.cc b/src/collision/hard_sphere.cc index 966d75c2..922ebfb5 100644 --- a/src/collision/hard_sphere.cc +++ b/src/collision/hard_sphere.cc @@ -2,7 +2,8 @@ /* Private interface *********************************************************/ -typedef struct hard_sphere { +typedef struct hard_sphere +{ float twomu_mi, twomu_mj, Kc; float udx, udy, udz, ut; float ut2, alpha_Kt2ut4, beta_Kt2ut2, gamma_Kt2; @@ -99,7 +100,8 @@ typedef struct hard_sphere { float hard_sphere_fluid_rate_constant( const hard_sphere_t * RESTRICT hs, const species_t * RESTRICT spi, - const particle_t * RESTRICT pi ) { + const particle_t * RESTRICT pi ) +{ static const float gamma = (3.*M_PI-8.)/(24.-6*M_PI); 
float urx = pi->ux - hs->udx; float ury = pi->uy - hs->udy; @@ -116,7 +118,8 @@ hard_sphere_rate_constant( const hard_sphere_t * RESTRICT hs, const species_t * RESTRICT spi, const species_t * RESTRICT spj, const particle_t * RESTRICT pi, - const particle_t * RESTRICT pj ) { + const particle_t * RESTRICT pj ) +{ float urx = pi->ux - pj->ux; float ury = pi->uy - pj->uy; float urz = pi->uz - pj->uz; @@ -236,45 +239,46 @@ hard_sphere_rate_constant( const hard_sphere_t * RESTRICT hs, #define CMOV(a,b) if(t0=1 ); \ - \ - /* There are lots of ways to formulate T vector formation */ \ - /* This has no branches (but uses L1 heavily) */ \ - \ +#define COMPUTE_MOMENTUM_TRANSFER(urx,ury,urz,ax,ay,az,rng) \ + do { \ + float bcs_R, bsn_R, b2_R2, ur, tx, ty, tz, t0, t1, t2, stack[3]; \ + int d0, d1, d2; \ + \ + do { \ + bcs_R = 2*frand_c0(rng) - 1; \ + bsn_R = 2*frand_c0(rng) - 1; \ + b2_R2 = bcs_R*bcs_R + bsn_R*bsn_R; \ + } while( b2_R2>=1 ); \ + \ + /* There are lots of ways to formulate T vector formation */ \ + /* This has no branches (but uses L1 heavily) */ \ + \ t0 = urx*urx; d0=0; d1=1; d2=2; t1=t0; ur = t0; \ t0 = ury*ury; CMOV(d0,1); CMOV(d1,2); CMOV(d2,0); CMOV(t1,t0); ur += t0; \ t0 = urz*urz; CMOV(d0,2); CMOV(d1,0); CMOV(d2,1); ur += t0; \ - ur = sqrtf( ur ); \ - \ - stack[0] = urx; \ - stack[1] = ury; \ - stack[2] = urz; \ - t1 = stack[d1]; \ - t2 = stack[d2]; \ - t0 = 1 / sqrtf( t1*t1 + t2*t2 + FLT_MIN ); \ - stack[d0] = 0; \ - stack[d1] = t0*t2; \ - stack[d2] = -t0*t1; \ - tx = stack[0]; \ - ty = stack[1]; \ - tz = stack[2]; \ - \ - t0 = 1 - b2_R2; \ - t2 = sqrtf( t0 ); \ - t1 = t2*bcs_R*ur; \ - t2 *= bsn_R; \ - \ - ax = (t0*urx - t1*tx) - t2*( ury*tz - urz*ty ); \ - ay = (t0*ury - t1*ty) - t2*( urz*tx - urx*tz ); \ - az = (t0*urz - t1*tz) - t2*( urx*ty - ury*tx ); \ + ur = sqrtf( ur ); \ + \ + stack[0] = urx; \ + stack[1] = ury; \ + stack[2] = urz; \ + t1 = stack[d1]; \ + t2 = stack[d2]; \ + t0 = 1 / sqrtf( t1*t1 + t2*t2 + FLT_MIN ); \ + stack[d0] = 0; \ + 
stack[d1] = t0*t2; \ + stack[d2] = -t0*t1; \ + tx = stack[0]; \ + ty = stack[1]; \ + tz = stack[2]; \ + \ + t0 = 1 - b2_R2; \ + t2 = sqrtf( t0 ); \ + t1 = t2*bcs_R*ur; \ + t2 *= bsn_R; \ + \ + ax = (t0*urx - t1*tx) - t2*( ury*tz - urz*ty ); \ + ay = (t0*ury - t1*ty) - t2*( urz*tx - urx*tz ); \ + az = (t0*urz - t1*tz) - t2*( urx*ty - ury*tx ); \ } while(0) /* It would be nice to preserve redundant rate constant @@ -284,7 +288,8 @@ void hard_sphere_fluid_collision( const hard_sphere_t * RESTRICT hs, const species_t * RESTRICT spi, /**/ particle_t * RESTRICT pi, - /**/ rng_t * RESTRICT rng ) { + /**/ rng_t * RESTRICT rng ) +{ float urx, ury, urz, ax, ay, az, w; urx = pi->ux - hs->udx; @@ -313,7 +318,8 @@ hard_sphere_collision( const hard_sphere_t * RESTRICT hs, /**/ particle_t * RESTRICT pi, /**/ particle_t * RESTRICT pj, /**/ rng_t * RESTRICT rng, - const int type ) { + const int type ) +{ float urx, ury, urz, ax, ay, az, w; urx = pi->ux - pj->ux; @@ -340,12 +346,14 @@ hard_sphere_collision( const hard_sphere_t * RESTRICT hs, #undef CMOV void -checkpt_hard_sphere( const hard_sphere_t * hs ) { +checkpt_hard_sphere( const hard_sphere_t * hs ) +{ CHECKPT( hs, 1 ); } hard_sphere_t * -restore_hard_sphere( void ) { +restore_hard_sphere( void ) +{ hard_sphere_t * hs; RESTORE( hs ); return hs; @@ -365,7 +373,8 @@ hard_sphere_fluid( const char * RESTRICT name, /* Model name */ species_t * RESTRICT sp, /* Species */ const float rsp, /* Species p. radius (LENGTH) */ rng_pool_t * RESTRICT rp, /* Entropy pool */ - const int interval ) { /* How often to apply this */ + const int interval ) /* How often to apply this */ +{ hard_sphere_t * hs; if( n0<0 || kT0<0 || m0<=0 || r0<0 || @@ -401,7 +410,8 @@ hard_sphere( const char * RESTRICT name, /* Model name */ const float rj, /* Species-j p. 
radius (LENGTH) */ rng_pool_t * RESTRICT rp, /* Entropy pool */ const double sample, /* Sampling density */ - const int interval ) { /* How often to apply this */ + const int interval ) /* How often to apply this */ +{ hard_sphere_t * hs; if( !spi || spi->m<=0 || ri<0 || @@ -419,4 +429,3 @@ hard_sphere( const char * RESTRICT name, /* Model name */ (binary_collision_func_t) hard_sphere_collision, hs, spi, spj, rp, sample, interval ); } - diff --git a/src/collision/large_angle_coulomb.cc b/src/collision/large_angle_coulomb.cc index 901ff072..bdae3beb 100644 --- a/src/collision/large_angle_coulomb.cc +++ b/src/collision/large_angle_coulomb.cc @@ -2,7 +2,8 @@ /* Private interface *********************************************************/ -typedef struct large_angle_coulomb { +typedef struct large_angle_coulomb +{ float cc, twomu_mi, twomu_mj, Kc; float udx, udy, udz, ut; float ut2, alpha_Kt2ut4, beta_Kt2ut2, gamma_Kt2; @@ -14,7 +15,8 @@ float large_angle_coulomb_fluid_rate_constant( const large_angle_coulomb_t * RESTRICT lac, const species_t * RESTRICT spi, - const particle_t * RESTRICT pi ) { + const particle_t * RESTRICT pi ) +{ static const float gamma = (3.*M_PI-8.)/(24.-6*M_PI); float urx = pi->ux - lac->udx; float ury = pi->uy - lac->udy; @@ -30,7 +32,8 @@ large_angle_coulomb_rate_constant( const species_t * RESTRICT spi, const species_t * RESTRICT spj, const particle_t * RESTRICT pi, - const particle_t * RESTRICT pj ) { + const particle_t * RESTRICT pj ) +{ float urx = pi->ux - pj->ux; float ury = pi->uy - pj->uy; float urz = pi->uz - pj->uz; @@ -79,48 +82,49 @@ large_angle_coulomb_rate_constant( #define CMOV(a,b) if(t0=1 ); \ - \ - /* There are lots of ways to formulate T vector formation */ \ - /* This has no branches (but uses L1 heavily) */ \ - \ +#define COMPUTE_MOMENTUM_TRANSFER(urx,ury,urz,ax,ay,az,rng) \ + do { \ + float bcs_bmax, bsn_bmax, b2_bmax2, ur2, ur, tx, ty, tz; \ + float t0, t1, t2, stack[3]; \ + int d0, d1, d2; \ + \ + do { \ + bcs_bmax = 
2*frand_c0(rng) - 1; \ + bsn_bmax = 2*frand_c0(rng) - 1; \ + b2_bmax2 = bcs_bmax*bcs_bmax + bsn_bmax*bsn_bmax; \ + } while( b2_bmax2>=1 ); \ + \ + /* There are lots of ways to formulate T vector formation */ \ + /* This has no branches (but uses L1 heavily) */ \ + \ t0 = urx*urx; d0=0; d1=1; d2=2; t1=t0; ur2 = t0; \ t0 = ury*ury; CMOV(d0,1); CMOV(d1,2); CMOV(d2,0); CMOV(t1,t0); ur2 += t0; \ t0 = urz*urz; CMOV(d0,2); CMOV(d1,0); CMOV(d2,1); ur2 += t0; \ - ur = sqrtf( ur2 ); \ - \ - stack[0] = urx; \ - stack[1] = ury; \ - stack[2] = urz; \ - t1 = stack[d1]; \ - t2 = stack[d2]; \ - t0 = 1 / sqrtf( t1*t1 + t2*t2 + FLT_MIN ); \ - stack[d0] = 0; \ - stack[d1] = t0*t2; \ - stack[d2] = -t0*t1; \ - tx = stack[0]; \ - ty = stack[1]; \ - tz = stack[2]; \ - \ - t2 = lac->cc; /* 4 pi eps0 mu c^2 bmax / (qi qj) */ \ - t1 = t2 * ur2; /* B (bmax / b) */ \ - t0 = 1/(1+(t1*t1)*b2_bmax2); /* 1 / ( B^2 + 1 ) */ \ - t2 = t0*t1; /* (B / ( B^2 + 1 ))(bmax / b) */ \ - t1 = t2*bcs_bmax*ur; /* (B / (B^2+1)) cos phi |ur0| */ \ - t2 = t2*bsn_bmax; /* (B / (B^2+1)) sin phi */ \ - \ - ax = (t0*urx - t1*tx) - t2*( ury*tz - urz*ty ); \ - ay = (t0*ury - t1*ty) - t2*( urz*tx - urx*tz ); \ - az = (t0*urz - t1*tz) - t2*( urx*ty - ury*tx ); \ + ur = sqrtf( ur2 ); \ + \ + stack[0] = urx; \ + stack[1] = ury; \ + stack[2] = urz; \ + t1 = stack[d1]; \ + t2 = stack[d2]; \ + t0 = 1 / sqrtf( t1*t1 + t2*t2 + FLT_MIN ); \ + stack[d0] = 0; \ + stack[d1] = t0*t2; \ + stack[d2] = -t0*t1; \ + tx = stack[0]; \ + ty = stack[1]; \ + tz = stack[2]; \ + \ + t2 = lac->cc; /* 4 pi eps0 mu c^2 bmax / (qi qj) */ \ + t1 = t2 * ur2; /* B (bmax / b) */ \ + t0 = 1/(1+(t1*t1)*b2_bmax2); /* 1 / ( B^2 + 1 ) */ \ + t2 = t0*t1; /* (B / ( B^2 + 1 ))(bmax / b) */ \ + t1 = t2*bcs_bmax*ur; /* (B / (B^2+1)) cos phi |ur0| */ \ + t2 = t2*bsn_bmax; /* (B / (B^2+1)) sin phi */ \ + \ + ax = (t0*urx - t1*tx) - t2*( ury*tz - urz*ty ); \ + ay = (t0*ury - t1*ty) - t2*( urz*tx - urx*tz ); \ + az = (t0*urz - t1*tz) - t2*( urx*ty - ury*tx ); \ } 
while(0) /* It would be nice to preserve redundant rate constant @@ -131,7 +135,8 @@ large_angle_coulomb_fluid_collision( const large_angle_coulomb_t * RESTRICT lac, const species_t * RESTRICT spi, /**/ particle_t * RESTRICT pi, - /**/ rng_t * RESTRICT rng ) { + /**/ rng_t * RESTRICT rng ) +{ float urx, ury, urz, ax, ay, az, w; urx = pi->ux - lac->udx; @@ -161,7 +166,8 @@ large_angle_coulomb_collision( /**/ particle_t * RESTRICT pi, /**/ particle_t * RESTRICT pj, /**/ rng_t * RESTRICT rng, - const int type ) { + const int type ) +{ float urx, ury, urz, ax, ay, az, w; urx = pi->ux - pj->ux; @@ -188,12 +194,14 @@ large_angle_coulomb_collision( #undef CMOV void -checkpt_large_angle_coulomb( const large_angle_coulomb_t * lac ) { +checkpt_large_angle_coulomb( const large_angle_coulomb_t * lac ) +{ CHECKPT( lac, 1 ); } large_angle_coulomb_t * -restore_large_angle_coulomb( void ) { +restore_large_angle_coulomb( void ) +{ large_angle_coulomb_t * lac; RESTORE( lac ); return lac; @@ -214,7 +222,8 @@ large_angle_coulomb_fluid( species_t * RESTRICT sp, /* Species */ const float bmax, /* Impact parameter cutoff */ rng_pool_t * RESTRICT rp, /* Entropy pool */ - const int interval ) { /* How often to apply this */ + const int interval ) /* How often to apply this */ +{ large_angle_coulomb_t * lac; if( n0<0 || kT0<0 || !q0 || m0<=0 || !sp || !sp->q || sp->m<=0 || bmax<0 ) @@ -255,7 +264,8 @@ large_angle_coulomb( const char * RESTRICT name, /* Model name */ const float bmax, /* Impact parameter cutoff */ rng_pool_t * RESTRICT rp, /* Entropy pool */ const double sample, /* Sampling density */ - const int interval ) { /* How often to apply this */ + const int interval ) /* How often to apply this */ +{ large_angle_coulomb_t * lac; if( !spi || !spi->q || spi->m<=0 || @@ -277,4 +287,3 @@ large_angle_coulomb( const char * RESTRICT name, /* Model name */ (binary_collision_func_t) large_angle_coulomb_collision, lac, spi, spj, rp, sample, interval ); } - diff --git 
a/src/emitter/child_langmuir.cc b/src/emitter/child_langmuir.cc index dd74178a..cbeb5c20 100644 --- a/src/emitter/child_langmuir.cc +++ b/src/emitter/child_langmuir.cc @@ -208,4 +208,3 @@ child_langmuir( /**/ species_t * RESTRICT sp, (restore_func_t)restore_child_langmuir, NULL ); } - diff --git a/src/species_advance/species_advance.cc b/src/species_advance/species_advance.cc index 33ffd435..0e85a646 100644 --- a/src/species_advance/species_advance.cc +++ b/src/species_advance/species_advance.cc @@ -13,7 +13,8 @@ /* Private interface *********************************************************/ void -checkpt_species( const species_t * sp ) { +checkpt_species( const species_t * sp ) +{ CHECKPT( sp, 1 ); CHECKPT_STR( sp->name ); checkpt_data( sp->p, @@ -28,7 +29,8 @@ checkpt_species( const species_t * sp ) { } species_t * -restore_species( void ) { +restore_species( void ) +{ species_t * sp; RESTORE( sp ); RESTORE_STR( sp->name ); @@ -41,7 +43,8 @@ restore_species( void ) { } void -delete_species( species_t * sp ) { +delete_species( species_t * sp ) +{ UNREGISTER_OBJECT( sp ); FREE_ALIGNED( sp->partition ); FREE_ALIGNED( sp->pm ); @@ -53,12 +56,14 @@ delete_species( species_t * sp ) { /* Public interface **********************************************************/ int -num_species( const species_t * sp_list ) { +num_species( const species_t * sp_list ) +{ return sp_list ? 
sp_list->id+1 : 0; } void -delete_species_list( species_t * sp_list ) { +delete_species_list( species_t * sp_list ) +{ species_t * sp; while( sp_list ) { sp = sp_list; @@ -69,7 +74,8 @@ delete_species_list( species_t * sp_list ) { species_t * find_species_id( species_id id, - species_t * sp_list ) { + species_t * sp_list ) +{ species_t * sp; LIST_FIND_FIRST( sp, sp_list, sp->id==id ); return sp; @@ -77,7 +83,8 @@ find_species_id( species_id id, species_t * find_species_name( const char * name, - species_t * sp_list ) { + species_t * sp_list ) +{ species_t * sp; if( !name ) return NULL; LIST_FIND_FIRST( sp, sp_list, strcmp( sp->name, name )==0 ); @@ -86,7 +93,8 @@ find_species_name( const char * name, species_t * append_species( species_t * sp, - species_t ** sp_list ) { + species_t ** sp_list ) +{ if( !sp || !sp_list ) ERROR(( "Bad args" )); if( sp->next ) ERROR(( "Species \"%s\" already in a list", sp->name )); if( find_species_name( sp->name, *sp_list ) ) @@ -107,7 +115,8 @@ species( const char * name, size_t max_local_nm, int sort_interval, int sort_out_of_place, - grid_t * g ) { + grid_t * g ) +{ species_t * sp; int len = name ? 
strlen(name) : 0; diff --git a/src/species_advance/standard/hydro_p.cc b/src/species_advance/standard/hydro_p.cc index f85a79a2..f81f989f 100644 --- a/src/species_advance/standard/hydro_p.cc +++ b/src/species_advance/standard/hydro_p.cc @@ -26,7 +26,8 @@ void accumulate_hydro_p( hydro_array_t * RESTRICT ha, const species_t * RESTRICT sp, - const interpolator_array_t * RESTRICT ia ) { + const interpolator_array_t * RESTRICT ia ) +{ /**/ hydro_t * RESTRICT ALIGNED(128) h; const particle_t * RESTRICT ALIGNED(128) p; const interpolator_t * RESTRICT ALIGNED(128) f; diff --git a/src/species_advance/standard/move_p.cc b/src/species_advance/standard/move_p.cc index dfba3785..507d1807 100644 --- a/src/species_advance/standard/move_p.cc +++ b/src/species_advance/standard/move_p.cc @@ -12,7 +12,7 @@ // position is updated to the point where the particle interacted and // m->dispx, m->dispy, m->dispz contains the remaining particle // displacement. The displacements are the physical displacments -// normalized current cell size. +// normalized by the current cell size. // // Because move_p is frequently called, it does not check its input // arguments. 
Higher level routines are responsible for insuring valid @@ -32,10 +32,10 @@ move_p( particle_t * RESTRICT ALIGNED(128) p, particle_mover_t * RESTRICT ALIGNED(16) pm, accumulator_t * RESTRICT ALIGNED(128) a, const grid_t * g, - const float qsp ) { - - /*const*/ v4float one( 1.f ); - /*const*/ v4float tiny( 1e-37f ); + const float qsp ) +{ + /*const*/ v4float one( 1.0f ); + /*const*/ v4float tiny( 1.0e-37f ); /*const*/ v4int sign_bits( 1<<31 ); v4float dr, r, u, q, q3; @@ -217,19 +217,21 @@ move_p( particle_t * ALIGNED(128) p0, particle_mover_t * ALIGNED(16) pm, accumulator_t * ALIGNED(128) a0, const grid_t * g, - const float qsp ) { + const float qsp ) +{ float s_midx, s_midy, s_midz; float s_dispx, s_dispy, s_dispz; float s_dir[3]; float v0, v1, v2, v3, v4, v5, q; int axis, face; int64_t neighbor; - float *a; + float * a; particle_t * ALIGNED(32) p = p0 + pm->i; - q = qsp*p->w; + q = qsp * p->w; - for(;;) { + for( ;; ) + { s_midx = p->dx; s_midy = p->dy; s_midz = p->dz; @@ -238,15 +240,15 @@ move_p( particle_t * ALIGNED(128) p0, s_dispy = pm->dispy; s_dispz = pm->dispz; - s_dir[0] = (s_dispx>0.0f) ? 1.0f : -1.0f; - s_dir[1] = (s_dispy>0.0f) ? 1.0f : -1.0f; - s_dir[2] = (s_dispz>0.0f) ? 1.0f : -1.0f; + s_dir[0] = ( s_dispx > 0.0f ) ? 1.0f : -1.0f; + s_dir[1] = ( s_dispy > 0.0f ) ? 1.0f : -1.0f; + s_dir[2] = ( s_dispz > 0.0f ) ? 1.0f : -1.0f; - // Compute the twice the fractional distance to each potential + // Compute twice the fractional distance to each potential // streak/cell face intersection. - v0 = (s_dispx==0.0f) ? 3.4e38f : (s_dir[0]-s_midx)/s_dispx; - v1 = (s_dispy==0.0f) ? 3.4e38f : (s_dir[1]-s_midy)/s_dispy; - v2 = (s_dispz==0.0f) ? 3.4e38f : (s_dir[2]-s_midz)/s_dispz; + v0 = ( s_dispx == 0.0f ) ? 3.4e38f : ( s_dir[0] - s_midx ) / s_dispx; + v1 = ( s_dispy == 0.0f ) ? 3.4e38f : ( s_dir[1] - s_midy ) / s_dispy; + v2 = ( s_dispz == 0.0f ) ? 3.4e38f : ( s_dir[2] - s_midz ) / s_dispz; // Determine the fractional length and axis of current streak. 
The // streak ends on either the first face intersected by the @@ -254,16 +256,17 @@ move_p( particle_t * ALIGNED(128) p0, // // axis 0,1 or 2 ... streak ends on a x,y or z-face respectively // axis 3 ... streak ends at end of the particle track - /**/ v3=2.0f, axis=3; - if(v0i); -# define accumulate_j(X,Y,Z) \ + v5 = q * s_dispx * s_dispy * s_dispz * ( 1.0 / 3.0 ); + + a = (float *) ( a0 + p->i ); + + #define accumulate_j(X,Y,Z) \ v4 = q*s_disp##X; /* v2 = q ux */ \ v1 = v4*s_mid##Y; /* v1 = q ux dy */ \ v0 = v4-v1; /* v0 = q ux (1-dy) */ \ @@ -292,24 +297,31 @@ move_p( particle_t * ALIGNED(128) p0, a[1] += v1; \ a[2] += v2; \ a[3] += v3 + accumulate_j(x,y,z); a += 4; accumulate_j(y,z,x); a += 4; accumulate_j(z,x,y); -# undef accumulate_j - // Compute the remaining particle displacment + #undef accumulate_j + + // Compute the remaining particle displacment. pm->dispx -= s_dispx; pm->dispy -= s_dispy; pm->dispz -= s_dispz; - // Compute the new particle offset - p->dx += s_dispx+s_dispx; - p->dy += s_dispy+s_dispy; - p->dz += s_dispz+s_dispz; + // Compute the new particle offset. + p->dx += s_dispx + s_dispx; + p->dy += s_dispy + s_dispy; + p->dz += s_dispz + s_dispz; - // If an end streak, return success (should be ~50% of the time) + // If an end streak, return success (should be ~50% of the time). This + // is the case where the particle moves to a voxel located within the + // same MPI domain. - if( axis==3 ) break; + if ( axis == 3 ) + { + break; + } // Determine if the particle crossed into a local cell or if it // hit a boundary and convert the coordinate system accordingly. @@ -319,25 +331,38 @@ move_p( particle_t * ALIGNED(128) p0, // +/-1 _exactly_ for the particle. v0 = s_dir[axis]; - (&(p->dx))[axis] = v0; // Avoid roundoff fiascos--put the particle - // _exactly_ on the boundary. 
- face = axis; if( v0>0 ) face += 3; - neighbor = g->neighbor[ 6*p->i + face ]; - if( UNLIKELY( neighbor==reflect_particles ) ) { + ( &( p->dx ) )[axis] = v0; // Avoid roundoff fiascos--put the particle + // _exactly_ on the boundary. + + face = axis; + + if ( v0 > 0 ) + { + face += 3; + } + + neighbor = g->neighbor[ 6 * p->i + face ]; + + if ( UNLIKELY( neighbor == reflect_particles ) ) + { // Hit a reflecting boundary condition. Reflect the particle // momentum and remaining displacement and keep moving the // particle. - (&(p->ux ))[axis] = -(&(p->ux ))[axis]; - (&(pm->dispx))[axis] = -(&(pm->dispx))[axis]; + ( &( p->ux ) )[axis] = - ( &( p->ux ) )[axis]; + ( &( pm->dispx ) )[axis] = - ( &( pm->dispx ) )[axis]; + continue; } - if( UNLIKELY( neighborrangel || neighbor>g->rangeh ) ) { + if ( UNLIKELY( neighbor < g->rangel || + neighbor > g->rangeh ) ) + { // Cannot handle the boundary condition here. Save the updated // particle position, face it hit and update the remaining // displacement in the particle mover. - p->i = 8*p->i + face; + p->i = 8 * p->i + face; + return 1; // Return "mover still in use" } @@ -346,7 +371,7 @@ move_p( particle_t * ALIGNED(128) p0, p->i = neighbor - g->rangel; // Compute local index of neighbor /**/ // Note: neighbor - g->rangel < 2^31 / 6 - (&(p->dx))[axis] = -v0; // Convert coordinate system + ( &( p->dx ) )[axis] = - v0; // Convert coordinate system } return 0; // Return "mover not in use" diff --git a/src/species_advance/standard/pipeline/advance_p_pipeline.cc b/src/species_advance/standard/pipeline/advance_p_pipeline.cc index 8dde6f27..3cdc4d10 100644 --- a/src/species_advance/standard/pipeline/advance_p_pipeline.cc +++ b/src/species_advance/standard/pipeline/advance_p_pipeline.cc @@ -13,7 +13,8 @@ //----------------------------------------------------------------------------// // Reference implementation for an advance_p pipeline function which does not -// make use of explicit calls to vector intrinsic functions. 
+// make use of explicit calls to vector intrinsic functions. This is the AoS +// version. //----------------------------------------------------------------------------// void diff --git a/src/species_advance/standard/pipeline/advance_p_pipeline_v16.cc b/src/species_advance/standard/pipeline/advance_p_pipeline_v16.cc index ef6f8b1a..49ea867e 100644 --- a/src/species_advance/standard/pipeline/advance_p_pipeline_v16.cc +++ b/src/species_advance/standard/pipeline/advance_p_pipeline_v16.cc @@ -254,7 +254,7 @@ advance_p_pipeline_v16( advance_p_pipeline_args_t * args, dy = v01; dz = v02; - v13 = q*ux*uy*uz*one_third; // Charge conservation correction + v15 = q*ux*uy*uz*one_third; // Charge conservation correction //-------------------------------------------------------------------------- // Set current density accumulation pointers. @@ -279,67 +279,82 @@ advance_p_pipeline_v16( advance_p_pipeline_args_t * args, //-------------------------------------------------------------------------- // Accumulate current density. //-------------------------------------------------------------------------- - // Accumulate Jx for 16 particles into the v0-v3 vectors. - v12 = q*ux; // v12 = q ux - v01 = v12*dy; // v01 = q ux dy - v00 = v12-v01; // v00 = q ux (1-dy) - v01 += v12; // v01 = q ux (1+dy) - v12 = one+dz; // v12 = 1+dz - v02 = v00*v12; // v02 = q ux (1-dy)(1+dz) - v03 = v01*v12; // v03 = q ux (1+dy)(1+dz) - v12 = one-dz; // v12 = 1-dz - v00 *= v12; // v00 = q ux (1-dy)(1-dz) - v01 *= v12; // v01 = q ux (1+dy)(1-dz) - v00 += v13; // v00 = q ux [ (1-dy)(1-dz) + uy*uz/3 ] - v01 -= v13; // v01 = q ux [ (1+dy)(1-dz) - uy*uz/3 ] - v02 -= v13; // v02 = q ux [ (1-dy)(1+dz) - uy*uz/3 ] - v03 += v13; // v03 = q ux [ (1+dy)(1+dz) + uy*uz/3 ] - - // Accumulate Jy for 16 particles into the v4-v7 vectors. 
- v12 = q*uy; // v12 = q uy - v05 = v12*dz; // v05 = q uy dz - v04 = v12-v05; // v04 = q uy (1-dz) - v05 += v12; // v05 = q uy (1+dz) - v12 = one+dx; // v12 = 1+dx - v06 = v04*v12; // v06 = q uy (1-dz)(1+dx) - v07 = v05*v12; // v07 = q uy (1+dz)(1+dx) - v12 = one-dx; // v12 = 1-dx - v04 *= v12; // v04 = q uy (1-dz)(1-dx) - v05 *= v12; // v05 = q uy (1+dz)(1-dx) - v04 += v13; // v04 = q uy [ (1-dz)(1-dx) + ux*uz/3 ] - v05 -= v13; // v05 = q uy [ (1+dz)(1-dx) - ux*uz/3 ] - v06 -= v13; // v06 = q uy [ (1-dz)(1+dx) - ux*uz/3 ] - v07 += v13; // v07 = q uy [ (1+dz)(1+dx) + ux*uz/3 ] - - // Accumulate Jz for 16 particles into the v8-v11 vectors. - v12 = q*uz; // v12 = q uz - v09 = v12*dx; // v09 = q uz dx - v08 = v12-v09; // v08 = q uz (1-dx) - v09 += v12; // v09 = q uz (1+dx) - v12 = one+dy; // v12 = 1+dy - v10 = v08*v12; // v10 = q uz (1-dx)(1+dy) - v11 = v09*v12; // v11 = q uz (1+dx)(1+dy) - v12 = one-dy; // v12 = 1-dy - v08 *= v12; // v08 = q uz (1-dx)(1-dy) - v09 *= v12; // v09 = q uz (1+dx)(1-dy) - v08 += v13; // v08 = q uz [ (1-dx)(1-dy) + ux*uy/3 ] - v09 -= v13; // v09 = q uz [ (1+dx)(1-dy) - ux*uy/3 ] - v10 -= v13; // v10 = q uz [ (1-dx)(1+dy) - ux*uy/3 ] - v11 += v13; // v11 = q uz [ (1+dx)(1+dy) + ux*uy/3 ] - - // Zero the v12-v15 vectors prior to transposing the data. + // Accumulate Jx for 16 particles into the v0 - v3 vectors. 
+ + v12 = q * ux; // v12 = q ux + v01 = v12 * dy; // v01 = q ux dy + v00 = v12 - v01; // v00 = q ux (1-dy) + v01 += v12; // v01 = q ux (1+dy) + + v13 = one + dz; // v13 = 1+dz + v02 = v00 * v13; // v02 = q ux (1-dy)(1+dz) + v03 = v01 * v13; // v03 = q ux (1+dy)(1+dz) + + v14 = one - dz; // v14 = 1-dz + v00 *= v14; // v00 = q ux (1-dy)(1-dz) + v01 *= v14; // v01 = q ux (1+dy)(1-dz) + + v00 += v15; // v00 = q ux [ (1-dy)(1-dz) + uy*uz/3 ] + v01 -= v15; // v01 = q ux [ (1+dy)(1-dz) - uy*uz/3 ] + v02 -= v15; // v02 = q ux [ (1-dy)(1+dz) - uy*uz/3 ] + v03 += v15; // v03 = q ux [ (1+dy)(1+dz) + uy*uz/3 ] + + // Accumulate Jy for 16 particles into the v4 - v7 vectors. + + v12 = q * uy; // v12 = q uy + v05 = v12 * dz; // v05 = q uy dz + v04 = v12 - v05; // v04 = q uy (1-dz) + v05 += v12; // v05 = q uy (1+dz) + + v13 = one + dx; // v13 = 1+dx + v06 = v04 * v13; // v06 = q uy (1-dz)(1+dx) + v07 = v05 * v13; // v07 = q uy (1+dz)(1+dx) + + v14 = one - dx; // v14 = 1-dx + v04 *= v14; // v04 = q uy (1-dz)(1-dx) + v05 *= v14; // v05 = q uy (1+dz)(1-dx) + + v04 += v15; // v04 = q uy [ (1-dz)(1-dx) + ux*uz/3 ] + v05 -= v15; // v05 = q uy [ (1+dz)(1-dx) - ux*uz/3 ] + v06 -= v15; // v06 = q uy [ (1-dz)(1+dx) - ux*uz/3 ] + v07 += v15; // v07 = q uy [ (1+dz)(1+dx) + ux*uz/3 ] + + // Accumulate Jz for 16 particles into the v8 - v11 vectors. 
+ + v12 = q * uz; // v12 = q uz + v09 = v12 * dx; // v09 = q uz dx + v08 = v12 - v09; // v08 = q uz (1-dx) + v09 += v12; // v09 = q uz (1+dx) + + v13 = one + dy; // v13 = 1+dy + v10 = v08 * v13; // v10 = q uz (1-dx)(1+dy) + v11 = v09 * v13; // v11 = q uz (1+dx)(1+dy) + + v14 = one - dy; // v14 = 1-dy + v08 *= v14; // v08 = q uz (1-dx)(1-dy) + v09 *= v14; // v09 = q uz (1+dx)(1-dy) + + v08 += v15; // v08 = q uz [ (1-dx)(1-dy) + ux*uy/3 ] + v09 -= v15; // v09 = q uz [ (1+dx)(1-dy) - ux*uy/3 ] + v10 -= v15; // v10 = q uz [ (1-dx)(1+dy) - ux*uy/3 ] + v11 += v15; // v11 = q uz [ (1+dx)(1+dy) + ux*uy/3 ] + + // Zero the v12 - v15 vectors prior to transposing the data. + v12 = 0.0; v13 = 0.0; v14 = 0.0; v15 = 0.0; - // Transpose the data in vectors v0-v15 so it can be added into the + // Transpose the data in vectors v0 - v15 so it can be added into the // accumulator arrays using vector operations. + transpose( v00, v01, v02, v03, v04, v05, v06, v07, v08, v09, v10, v11, v12, v13, v14, v15 ); // Add the contributions to Jx, Jy and Jz from 16 particles into the // accumulator arrays for Jx, Jy and Jz. + increment_16x1( vp00, v00 ); increment_16x1( vp01, v01 ); increment_16x1( vp02, v02 ); @@ -362,7 +377,7 @@ advance_p_pipeline_v16( advance_p_pipeline_args_t * args, // particles. 
//-------------------------------------------------------------------------- -# define MOVE_OUTBND(N) \ + #define MOVE_OUTBND(N) \ if ( outbnd(N) ) /* Unlikely */ \ { \ local_pm->dispx = ux(N); \ @@ -399,7 +414,7 @@ advance_p_pipeline_v16( advance_p_pipeline_args_t * args, MOVE_OUTBND(14); MOVE_OUTBND(15); -# undef MOVE_OUTBND + #undef MOVE_OUTBND } args->seg[pipeline_rank].pm = pm; diff --git a/src/species_advance/standard/pipeline/center_p_pipeline.cc b/src/species_advance/standard/pipeline/center_p_pipeline.cc index 64b8ea32..bec2cdcd 100644 --- a/src/species_advance/standard/pipeline/center_p_pipeline.cc +++ b/src/species_advance/standard/pipeline/center_p_pipeline.cc @@ -112,7 +112,7 @@ center_p_pipeline( species_t * RESTRICT sp, !ia || sp->g != ia->g ) { - ERROR( ( "Bad args" ) ); + ERROR( ( "Bad args." ) ); } // Have the pipelines do the bulk of particles in blocks and have the @@ -120,9 +120,10 @@ center_p_pipeline( species_t * RESTRICT sp, args->p0 = sp->p; args->f0 = ia->i; - args->qdt_2mc = (sp->q*sp->g->dt)/(2*sp->m*sp->g->cvac); + args->qdt_2mc = ( sp->q * sp->g->dt ) / ( 2 * sp->m * sp->g->cvac ); args->np = sp->np; EXEC_PIPELINES( center_p, args, 0 ); + WAIT_PIPELINES(); } diff --git a/src/species_advance/standard/pipeline/spa_private.h b/src/species_advance/standard/pipeline/spa_private.h index ed95881e..8f56a622 100644 --- a/src/species_advance/standard/pipeline/spa_private.h +++ b/src/species_advance/standard/pipeline/spa_private.h @@ -43,7 +43,6 @@ typedef struct advance_p_pipeline_args int nz; // z-mesh resolution PAD_STRUCT( 6*SIZEOF_MEM_PTR + 5*sizeof(float) + 5*sizeof(int) ) - } advance_p_pipeline_args_t; // PROTOTYPE_PIPELINE( advance_p, advance_p_pipeline_args_t ); @@ -79,7 +78,6 @@ typedef struct center_p_pipeline_args int np; // Number of particles PAD_STRUCT( 2*SIZEOF_MEM_PTR + sizeof(float) + sizeof(int) ) - } center_p_pipeline_args_t; // PROTOTYPE_PIPELINE( center_p, center_p_pipeline_args_t ); @@ -138,7 +136,6 @@ typedef struct 
energy_p_pipeline_args int np; // Number of particles PAD_STRUCT( 3*SIZEOF_MEM_PTR + 2*sizeof(float) + sizeof(int) ) - } energy_p_pipeline_args_t; // PROTOTYPE_PIPELINE( energy_p, energy_p_pipeline_args_t ); @@ -206,7 +203,6 @@ typedef struct sort_p_pipeline_args int n_voxel; // Number of voxels total (including ghosts) PAD_STRUCT( 5*SIZEOF_MEM_PTR + 5*sizeof(int) ) - } sort_p_pipeline_args_t; // PROTOTYPE_PIPELINE( coarse_count, sort_p_pipeline_args_t ); diff --git a/src/species_advance/standard/pipeline/uncenter_p_pipeline.cc b/src/species_advance/standard/pipeline/uncenter_p_pipeline.cc index f3b6d442..1b6a1cc3 100644 --- a/src/species_advance/standard/pipeline/uncenter_p_pipeline.cc +++ b/src/species_advance/standard/pipeline/uncenter_p_pipeline.cc @@ -108,11 +108,11 @@ uncenter_p_pipeline( species_t * RESTRICT sp, { DECLARE_ALIGNED_ARRAY( center_p_pipeline_args_t, 128, args, 1 ); - if ( !sp || - !ia || + if ( ! sp || + ! ia || sp->g != ia->g ) { - ERROR( ( "Bad args" ) ); + ERROR( ( "Bad args." ) ); } // Have the pipelines do the bulk of particles in blocks and have the @@ -120,7 +120,7 @@ uncenter_p_pipeline( species_t * RESTRICT sp, args->p0 = sp->p; args->f0 = ia->i; - args->qdt_2mc = (sp->q*sp->g->dt)/(2*sp->m*sp->g->cvac); + args->qdt_2mc = ( sp->q * sp->g->dt ) / ( 2 * sp->m * sp->g->cvac ); args->np = sp->np; EXEC_PIPELINES( uncenter_p, args, 0 ); diff --git a/src/species_advance/standard/rho_p.cc b/src/species_advance/standard/rho_p.cc index 629badc1..776594cd 100644 --- a/src/species_advance/standard/rho_p.cc +++ b/src/species_advance/standard/rho_p.cc @@ -17,11 +17,12 @@ // interpolation is used. rhof is known at the nodes at the same time // as particle positions. No effort is made to fix up edges of the // computational domain; see note in synchronize_rhob about why this -// is done this way. All particles on the list must be inbounds. +// is done this way. All particles on the list must be in bounds. 
void accumulate_rho_p( /**/ field_array_t * RESTRICT fa, - const species_t * RESTRICT sp ) { + const species_t * RESTRICT sp ) +{ if( !fa || !sp || fa->g!=sp->g ) ERROR(( "Bad args" )); /**/ field_t * RESTRICT ALIGNED(128) f = fa->f; @@ -126,7 +127,8 @@ void accumulate_rhob( field_t * RESTRICT ALIGNED(128) f, const particle_t * RESTRICT ALIGNED(32) p, const grid_t * RESTRICT g, - const float qsp ) { + const float qsp ) +{ # if 1 // See note in rhof for why this variant is used. diff --git a/src/species_advance/standard/sort_p.cc b/src/species_advance/standard/sort_p.cc index 3464011a..935d6fa9 100644 --- a/src/species_advance/standard/sort_p.cc +++ b/src/species_advance/standard/sort_p.cc @@ -153,9 +153,9 @@ sort_p( species_t * sp ) void sort_p( species_t * sp ) { - if ( !sp ) + if ( ! sp ) { - ERROR( ( "Bad args" ) ); + ERROR( ( "Bad args." ) ); } // Conditionally execute this when more abstractions are available. diff --git a/src/util/profile/profile.h b/src/util/profile/profile.h index 3175611f..f26de1e3 100644 --- a/src/util/profile/profile.h +++ b/src/util/profile/profile.h @@ -34,6 +34,7 @@ _( load_interpolator ) \ _( compute_curl_b ) \ _( compute_rhob ) \ + _( center_p ) \ _( uncenter_p ) \ _( user_initialization ) \ _( user_particle_collisions ) \ diff --git a/src/util/v16/v16_avx512.h b/src/util/v16/v16_avx512.h index b9331831..69d0922d 100644 --- a/src/util/v16/v16_avx512.h +++ b/src/util/v16/v16_avx512.h @@ -523,31 +523,33 @@ namespace v16 // v16 memory manipulation functions + // Portable version. inline void load_16x1( const void * ALIGNED(64) p, v16 &a ) { for( int j = 0; j < 16; j++ ) - a.i[j] = ((const int * ALIGNED(64))p)[j]; + a.i[j] = ( ( const int * ALIGNED(64) ) p )[j]; } + // Portable version. 
inline void store_16x1( const v16 &a, void * ALIGNED(64) p ) { for( int j = 0; j < 16; j++ ) - ((int * ALIGNED(64))p)[j] = a.i[j]; + ( ( int * ALIGNED(64) ) p )[j] = a.i[j]; } inline void stream_16x1( const v16 &a, void * ALIGNED(64) p ) { for( int j = 0; j < 16; j++ ) - ((int * ALIGNED(64))p)[j] = a.i[j]; + ( ( int * ALIGNED(64) ) p )[j] = a.i[j]; } inline void clear_16x1( void * ALIGNED(64) p ) { for( int j = 0; j < 16; j++ ) - ((int * ALIGNED(64))p)[j] = 0; + ( ( int * ALIGNED(64) ) p )[j] = 0; } // FIXME: Ordering semantics @@ -555,7 +557,7 @@ namespace v16 const void * ALIGNED(64) src ) { for( int j = 0; j < 16; j++ ) - ((int * ALIGNED(64))dst)[j] = ((const int * ALIGNED(64))src)[j]; + ( ( int * ALIGNED(64) ) dst )[j] = ( ( const int * ALIGNED(64) ) src )[j]; } inline void swap_16x1( void * ALIGNED(64) a, @@ -565,9 +567,9 @@ namespace v16 for( int j = 0; j < 16; j++ ) { - t = ((int * ALIGNED(64))a)[j]; - ((int * ALIGNED(64))a)[j] = ((int * ALIGNED(64))b)[j]; - ((int * ALIGNED(64))b)[j] = t; + t = ( ( int * ALIGNED(64) ) a )[j]; + ( ( int * ALIGNED(64) ) a )[j] = ( ( int * ALIGNED(64) ) b )[j]; + ( ( int * ALIGNED(64) ) b )[j] = t; } } diff --git a/src/util/v4/v4_altivec.h b/src/util/v4/v4_altivec.h index 2c52d963..6ff3f58c 100644 --- a/src/util/v4/v4_altivec.h +++ b/src/util/v4/v4_altivec.h @@ -232,33 +232,45 @@ namespace v4 { // v4 memory manipulation functions - inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) { - a.v = vec_ld( 0, (const float *)p ); + inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) + { + a.v = vec_ld( 0, ( const float * ) p ); } - inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) { - vec_st( a.v, 0, (float *)p ); + inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) + { + vec_st( a.v, 0, ( float * ) p ); } - inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) { - vec_stl( a.v, 0, (float *)p ); + inline void stream_4x1( const v4 &a, + void * ALIGNED(16) p ) + { + vec_stl( a.v, 0, ( float * ) p 
); } // FIXME: Ordering semantics - inline void clear_4x1( void * ALIGNED(16) d ) { - vec_st( _zero, 0, (float *)d ); + inline void clear_4x1( void * ALIGNED(16) d ) + { + vec_st( _zero, 0, ( float * ) d ); } // FIXME: Ordering semantics - inline void copy_4x1( void * ALIGNED(16) d, const void * ALIGNED(16) s ) { - vec_st( vec_ld( 0, (const float *)s ), 0, (float *)d ); + inline void copy_4x1( void * ALIGNED(16) d, + const void * ALIGNED(16) s ) + { + vec_st( vec_ld( 0, ( const float * ) s ), 0, ( float * ) d ); } - inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) { - _v4_float va = vec_ld( 0, (float *)a ); - _v4_float vb = vec_ld( 0, (float *)b ); - vec_st( vb, 0, (float *)a ); - vec_st( va, 0, (float *)b ); + inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) + { + _v4_float va = vec_ld( 0, ( float * ) a ); + _v4_float vb = vec_ld( 0, ( float * ) b ); + + vec_st( vb, 0, ( float * ) a ); + vec_st( va, 0, ( float * ) b ); } // v4 transposed memory manipulation functions diff --git a/src/util/v4/v4_avx.h b/src/util/v4/v4_avx.h index f2b47552..3c48096e 100644 --- a/src/util/v4/v4_avx.h +++ b/src/util/v4/v4_avx.h @@ -178,32 +178,43 @@ namespace v4 { // v4 memory manipulation functions - inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) { - a.v = _mm_load_ps((float *)p); + inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) + { + a.v = _mm_load_ps( ( float * ) p ); } - inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) { - _mm_store_ps((float *)p,a.v); + inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) + { + _mm_store_ps( ( float * ) p, a.v ); } - inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) { - _mm_stream_ps((float *)p,a.v); + inline void stream_4x1( const v4 &a, + void * ALIGNED(16) p ) + { + _mm_stream_ps( ( float * ) p, a.v ); } - inline void clear_4x1( void * ALIGNED(16) p ) { - _mm_store_ps( (float *)p, _mm_setzero_ps() ); + inline void clear_4x1( void * ALIGNED(16) p ) + { + 
_mm_store_ps( ( float * ) p, _mm_setzero_ps() ); } inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) { - _mm_store_ps( (float *)dst, _mm_load_ps( (const float *)src ) ); + const void * ALIGNED(16) src ) + { + _mm_store_ps( ( float * ) dst, _mm_load_ps( ( const float * ) src ) ); } /* FIXME: MAKE ROBUST AGAINST ALIASING ISSUES */ - inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) { - __m128 t = _mm_load_ps((float *)a); - _mm_store_ps( (float *)a, _mm_load_ps( (float *)b ) ); - _mm_store_ps( (float *)b, t ); + inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) + { + __m128 t = _mm_load_ps( ( float * ) a ); + + _mm_store_ps( ( float * ) a, _mm_load_ps( ( float * ) b ) ); + _mm_store_ps( ( float * ) b, t ); } // v4 transposed memory manipulation functions diff --git a/src/util/v4/v4_avx2.h b/src/util/v4/v4_avx2.h index abb7814f..a7b7b783 100644 --- a/src/util/v4/v4_avx2.h +++ b/src/util/v4/v4_avx2.h @@ -204,39 +204,39 @@ namespace v4 inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) { - a.v = _mm_load_ps( (float *)p ); + a.v = _mm_load_ps( ( float * ) p ); } inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) { - _mm_store_ps( (float *)p, a.v ); + _mm_store_ps( ( float * ) p, a.v ); } inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) { - _mm_stream_ps( (float *)p, a.v ); + _mm_stream_ps( ( float * ) p, a.v ); } inline void clear_4x1( void * ALIGNED(16) p ) { - _mm_store_ps( (float *)p, _mm_setzero_ps() ); + _mm_store_ps( ( float * ) p, _mm_setzero_ps() ); } inline void copy_4x1( void * ALIGNED(16) dst, const void * ALIGNED(16) src ) { - _mm_store_ps( (float *)dst, _mm_load_ps( (const float *)src ) ); + _mm_store_ps( ( float * ) dst, _mm_load_ps( ( const float * ) src ) ); } /* FIXME: MAKE ROBUST AGAINST ALIASING ISSUES */ inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) { - __m128 t = _mm_load_ps((float *)a); + __m128 t = _mm_load_ps( ( float * ) a ); - 
_mm_store_ps( (float *)a, _mm_load_ps( (float *)b ) ); - _mm_store_ps( (float *)b, t ); + _mm_store_ps( ( float * ) a, _mm_load_ps( ( float * ) b ) ); + _mm_store_ps( ( float * ) b, t ); } // v4 transposed memory manipulation functions diff --git a/src/util/v4/v4_sse.h b/src/util/v4/v4_sse.h index fe82058f..b2ed5dcb 100644 --- a/src/util/v4/v4_sse.h +++ b/src/util/v4/v4_sse.h @@ -178,32 +178,43 @@ namespace v4 { // v4 memory manipulation functions - inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) { - a.v = _mm_load_ps((float *)p); + inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) + { + a.v = _mm_load_ps( ( float * ) p ); } - inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) { - _mm_store_ps((float *)p,a.v); + inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) + { + _mm_store_ps( ( float * ) p, a.v ); } - inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) { - _mm_stream_ps((float *)p,a.v); + inline void stream_4x1( const v4 &a, + void * ALIGNED(16) p ) + { + _mm_stream_ps( ( float * ) p, a.v ); } - inline void clear_4x1( void * ALIGNED(16) p ) { - _mm_store_ps( (float *)p, _mm_setzero_ps() ); + inline void clear_4x1( void * ALIGNED(16) p ) + { + _mm_store_ps( ( float * ) p, _mm_setzero_ps() ); } inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) { - _mm_store_ps( (float *)dst, _mm_load_ps( (const float *)src ) ); + const void * ALIGNED(16) src ) + { + _mm_store_ps( ( float * ) dst, _mm_load_ps( ( const float * ) src ) ); } /* FIXME: MAKE ROBUST AGAINST ALIASING ISSUES */ - inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) { - __m128 t = _mm_load_ps((float *)a); - _mm_store_ps( (float *)a, _mm_load_ps( (float *)b ) ); - _mm_store_ps( (float *)b, t ); + inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) + { + __m128 t = _mm_load_ps( ( float * ) a ); + + _mm_store_ps( ( float * ) a, _mm_load_ps( ( float * ) b ) ); + _mm_store_ps( ( float * ) b, t ); } // v4 
transposed memory manipulation functions diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 31d689fb..62505147 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -259,7 +259,8 @@ vpic_simulation::dump_hydro( const char *sp_name, void vpic_simulation::dump_particles( const char *sp_name, const char *fbase, - int ftag ) { + int ftag ) +{ species_t *sp; char fname[256]; FileIO fileIO; diff --git a/src/vpic/initialize.cc b/src/vpic/initialize.cc index d49bbf57..8cc28da0 100644 --- a/src/vpic/initialize.cc +++ b/src/vpic/initialize.cc @@ -68,4 +68,3 @@ vpic_simulation::finalize( void ) { barrier(); update_profile( rank()==0 ); } - diff --git a/src/vpic/misc.cc b/src/vpic/misc.cc index 4cc3f0a5..28bb9e27 100644 --- a/src/vpic/misc.cc +++ b/src/vpic/misc.cc @@ -1,12 +1,9 @@ -/* - * Written by: - * Kevin J. Bowers, Ph.D. - * Plasma Physics Group (X-1) - * Applied Physics Division - * Los Alamos National Lab - * March/April 2004 - Original version - * - */ +// Written by: +// Kevin J. Bowers, Ph.D. +// Plasma Physics Group (X-1) +// Applied Physics Division +// Los Alamos National Lab +// March/April 2004 - Original version #include "vpic.h" @@ -17,7 +14,8 @@ vpic_simulation::inject_particle( species_t * sp, double x, double y, double z, double ux, double uy, double uz, double w, double age, - int update_rhob ) { + int update_rhob ) +{ int ix, iy, iz; // Check input parameters @@ -96,9 +94,8 @@ vpic_simulation::inject_particle( species_t * sp, pm->i = sp->np-1; sp->nm += move_p( sp->p, pm, accumulator_array->a, grid, sp->q ); } - } - + // Add capability to modify certain fields "on the fly" so that one // can, e.g., extend a run, change a quota, or modify a dump interval // without having to rerun from the start. 
diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index b309bd55..2c34d11f 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -531,7 +531,8 @@ class vpic_simulation { inline void inject_particle_raw( species_t * RESTRICT sp, float dx, float dy, float dz, int32_t i, - float ux, float uy, float uz, float w ) { + float ux, float uy, float uz, float w ) + { particle_t * RESTRICT p = sp->p + (sp->np++); p->dx = dx; p->dy = dy; p->dz = dz; p->i = i; p->ux = ux; p->uy = uy; p->uz = uz; p->w = w; @@ -544,7 +545,8 @@ class vpic_simulation { float dx, float dy, float dz, int32_t i, float ux, float uy, float uz, float w, float dispx, float dispy, float dispz, - int update_rhob ) { + int update_rhob ) + { particle_t * RESTRICT p = sp->p + (sp->np++); particle_mover_t * RESTRICT pm = sp->pm + sp->nm; p->dx = dx; p->dy = dy; p->dz = dz; p->i = i; From 58a6873837e4298a4cede42c074cc93eeb9d671c Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Tue, 9 Jul 2019 22:22:29 -0600 Subject: [PATCH 15/95] Initial beginning of ARM Neon intrinsics support. --- src/util/v4/v4.h | 15 + src/util/v4/v4_neon.h | 1184 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 1199 insertions(+) create mode 100644 src/util/v4/v4_neon.h diff --git a/src/util/v4/v4.h b/src/util/v4/v4.h index 3cf5183c..0b8cc4c1 100644 --- a/src/util/v4/v4.h +++ b/src/util/v4/v4.h @@ -1,20 +1,35 @@ #ifndef _v4_h_ #define _v4_h_ + /* FIXME: STYLE */ #define IN_v4_h + /* FIXME: SHOULDN'T THIS INCLUDE UTIL_BASE.H? 
*/ + #ifdef __cplusplus + # if defined USE_V4_ALTIVEC # include "v4_altivec.h" + # elif defined USE_V4_PORTABLE # include "v4_portable.h" + # elif defined USE_V4_SSE # include "v4_sse.h" + # elif defined USE_V4_AVX # include "v4_avx.h" + # elif defined USE_V4_AVX2 # include "v4_avx2.h" + +# elif defined USE_V4_NEON +# include "v4_neon.h" + # endif + #endif + #undef IN_v4_h + #endif // _v4_h_ diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h new file mode 100644 index 00000000..af16614a --- /dev/null +++ b/src/util/v4/v4_neon.h @@ -0,0 +1,1184 @@ +#ifndef _v4_neon_h_ +#define _v4_neon_h_ + +#ifndef IN_v4_h +#error "Do not include v4_neon.h directly; use v4.h" +#endif + +#include +#include + +#define V4_ACCELERATION +#define V4_NEON_ACCELERATION + +#ifndef ALIGNED +#define ALIGNED(n) +#endif + +// This does not work with gcc 5.3.1 and the -fopenmp-simd +// flag. Does not seem to work with -fopenmp either. Not +// sure why. It does work with the Intel compiler. Need +// to try later versions of gcc. 
+// #define ALWAYS_VECTORIZE _Pragma( "omp simd" ) + +// #define ALWAYS_VECTORIZE _Pragma( "simd" ) + +#define ALWAYS_VECTORIZE \ + _Pragma( "simd" ) \ + _Pragma( "vector aligned" ) + +#define ALWAYS_INLINE __attribute__((always_inline)) + +namespace v4 +{ + class v4; + class v4int; + class v4float; + + //////////////// + // v4 base class + + class v4 + { + friend class v4int; + friend class v4float; + + // v4 miscellaneous friends + + friend inline int any( const v4 &a ) ALWAYS_INLINE; + friend inline int all( const v4 &a ) ALWAYS_INLINE; + + template + friend inline v4 splat( const v4 &a ) ALWAYS_INLINE; + + template + friend inline v4 shuffle( const v4 &a ) ALWAYS_INLINE; + + friend inline void swap( v4 &a, v4 &b ) ALWAYS_INLINE; + friend inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) ALWAYS_INLINE; + + // v4int miscellaneous friends + + friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + + // v4 memory manipulation friends + + friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) ALWAYS_INLINE; + friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void copy_4x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) ALWAYS_INLINE; + friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + + // v4 transposed memory manipulation friends + + friend inline void load_4x1_tr( const void *a0, const void *a1, + const void *a2, const void *a3, + v4 &a ) ALWAYS_INLINE; + + friend inline void load_4x2_tr( const void * ALIGNED(8) a0, + const void * ALIGNED(8) a1, + const void * ALIGNED(8) a2, + const void * ALIGNED(8) a3, + v4 &a, v4 &b ) 
ALWAYS_INLINE; + + friend inline void load_4x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c ) ALWAYS_INLINE; + + friend inline void load_4x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c, v4 &d ) ALWAYS_INLINE; + + friend inline void store_4x1_tr( const v4 &a, + void *a0, void *a1, void *a2, void *a3 ) ALWAYS_INLINE; + + friend inline void store_4x2_tr( const v4 &a, const v4 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3 ) ALWAYS_INLINE; + + friend inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) ALWAYS_INLINE; + + friend inline void store_4x4_tr( const v4 &a, const v4 &b, + const v4 &c, const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) ALWAYS_INLINE; + + protected: + + union + { + int i[4]; + float f[4]; + }; + + public: + + v4() {} // Default constructor + + v4( const v4 &a ) // Copy constructor + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + i[j] = a.i[j]; + } + + ~v4() {} // Default destructor + }; + + // v4 miscellaneous functions + + inline int any( const v4 &a ) + { + return a.i[0] || a.i[1] || a.i[2] || a.i[3]; + } + + inline int all( const v4 &a ) + { + return a.i[0] && a.i[1] && a.i[2] && a.i[3]; + } + + template + inline v4 splat( const v4 & a ) + { + v4 b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = a.i[n]; + + return b; + } + + template + inline v4 shuffle( const v4 & a ) + { + v4 b; + + b.i[0] = a.i[i0]; + b.i[1] = a.i[i1]; + b.i[2] = a.i[i2]; + b.i[3] = a.i[i3]; + + return b; + } + +# define sw(x,y) x^=y, y^=x, x^=y + + inline void swap( v4 &a, v4 &b ) + { + ALWAYS_VECTORIZE + for( int j = 0; j 
< 4; j++ ) + sw( a.i[j], b.i[j] ); + } + + inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) + { + sw( a0.i[1],a1.i[0] ); sw( a0.i[2],a2.i[0] ); sw( a0.i[3],a3.i[0] ); + sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] ); + sw( a2.i[3],a3.i[2] ); + } + +# undef sw + + // v4 memory manipulation functions + + inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + a.i[j] = ((const int * ALIGNED(16))p)[j]; + } + + inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + ((int * ALIGNED(16))p)[j] = a.i[j]; + } + + inline void stream_4x1( const v4 &a, + void * ALIGNED(16) p ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + ((int * ALIGNED(16))p)[j] = a.i[j]; + } + + inline void clear_4x1( void * ALIGNED(16) p ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + ((int * ALIGNED(16))p)[j] = 0; + } + + // FIXME: Ordering semantics + inline void copy_4x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + ((int * ALIGNED(16))dst)[j] = ((const int * ALIGNED(16))src)[j]; + } + + inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) + { + int t; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + { + t = ((int * ALIGNED(16))a)[j]; + ((int * ALIGNED(16))a)[j] = ((int * ALIGNED(16))b)[j]; + ((int * ALIGNED(16))b)[j] = t; + } + } + + // v4 transposed memory manipulation functions + + inline void load_4x1_tr( const void *a0, const void *a1, + const void *a2, const void *a3, + v4 &a ) + { + a.i[0] = ((const int *)a0)[0]; + a.i[1] = ((const int *)a1)[0]; + a.i[2] = ((const int *)a2)[0]; + a.i[3] = ((const int *)a3)[0]; + } + + inline void load_4x2_tr( const void * ALIGNED(8) a0, + const void * ALIGNED(8) a1, + const void * ALIGNED(8) a2, + const void * ALIGNED(8) a3, + v4 &a, v4 &b ) + { + a.i[0] = ((const int * ALIGNED(8))a0)[0]; + b.i[0] = ((const int * ALIGNED(8))a0)[1]; 
+ + a.i[1] = ((const int * ALIGNED(8))a1)[0]; + b.i[1] = ((const int * ALIGNED(8))a1)[1]; + + a.i[2] = ((const int * ALIGNED(8))a2)[0]; + b.i[2] = ((const int * ALIGNED(8))a2)[1]; + + a.i[3] = ((const int * ALIGNED(8))a3)[0]; + b.i[3] = ((const int * ALIGNED(8))a3)[1]; + } + + inline void load_4x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c ) + { + a.i[0] = ((const int * ALIGNED(16))a0)[0]; + b.i[0] = ((const int * ALIGNED(16))a0)[1]; + c.i[0] = ((const int * ALIGNED(16))a0)[2]; + + a.i[1] = ((const int * ALIGNED(16))a1)[0]; + b.i[1] = ((const int * ALIGNED(16))a1)[1]; + c.i[1] = ((const int * ALIGNED(16))a1)[2]; + + a.i[2] = ((const int * ALIGNED(16))a2)[0]; + b.i[2] = ((const int * ALIGNED(16))a2)[1]; + c.i[2] = ((const int * ALIGNED(16))a2)[2]; + + a.i[3] = ((const int * ALIGNED(16))a3)[0]; + b.i[3] = ((const int * ALIGNED(16))a3)[1]; + c.i[3] = ((const int * ALIGNED(16))a3)[2]; + } + + inline void load_4x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c, v4 &d ) + { + a.i[0] = ((const int * ALIGNED(16))a0)[0]; + b.i[0] = ((const int * ALIGNED(16))a0)[1]; + c.i[0] = ((const int * ALIGNED(16))a0)[2]; + d.i[0] = ((const int * ALIGNED(16))a0)[3]; + + a.i[1] = ((const int * ALIGNED(16))a1)[0]; + b.i[1] = ((const int * ALIGNED(16))a1)[1]; + c.i[1] = ((const int * ALIGNED(16))a1)[2]; + d.i[1] = ((const int * ALIGNED(16))a1)[3]; + + a.i[2] = ((const int * ALIGNED(16))a2)[0]; + b.i[2] = ((const int * ALIGNED(16))a2)[1]; + c.i[2] = ((const int * ALIGNED(16))a2)[2]; + d.i[2] = ((const int * ALIGNED(16))a2)[3]; + + a.i[3] = ((const int * ALIGNED(16))a3)[0]; + b.i[3] = ((const int * ALIGNED(16))a3)[1]; + c.i[3] = ((const int * ALIGNED(16))a3)[2]; + d.i[3] = ((const int * ALIGNED(16))a3)[3]; + } + + inline void store_4x1_tr( const v4 &a, + void *a0, void *a1, + void *a2, 
void *a3 ) + { + ((int *)a0)[0] = a.i[0]; + ((int *)a1)[0] = a.i[1]; + ((int *)a2)[0] = a.i[2]; + ((int *)a3)[0] = a.i[3]; + } + + inline void store_4x2_tr( const v4 &a, const v4 &b, + void * ALIGNED(8) a0, void * ALIGNED(8) a1, + void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) + { + ((int * ALIGNED(8))a0)[0] = a.i[0]; + ((int * ALIGNED(8))a0)[1] = b.i[0]; + + ((int * ALIGNED(8))a1)[0] = a.i[1]; + ((int * ALIGNED(8))a1)[1] = b.i[1]; + + ((int * ALIGNED(8))a2)[0] = a.i[2]; + ((int * ALIGNED(8))a2)[1] = b.i[2]; + + ((int * ALIGNED(8))a3)[0] = a.i[3]; + ((int * ALIGNED(8))a3)[1] = b.i[3]; + } + + inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + { + ((int * ALIGNED(16))a0)[0] = a.i[0]; + ((int * ALIGNED(16))a0)[1] = b.i[0]; + ((int * ALIGNED(16))a0)[2] = c.i[0]; + + ((int * ALIGNED(16))a1)[0] = a.i[1]; + ((int * ALIGNED(16))a1)[1] = b.i[1]; + ((int * ALIGNED(16))a1)[2] = c.i[1]; + + ((int * ALIGNED(16))a2)[0] = a.i[2]; + ((int * ALIGNED(16))a2)[1] = b.i[2]; + ((int * ALIGNED(16))a2)[2] = c.i[2]; + + ((int * ALIGNED(16))a3)[0] = a.i[3]; + ((int * ALIGNED(16))a3)[1] = b.i[3]; + ((int * ALIGNED(16))a3)[2] = c.i[3]; + } + + inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, + void * ALIGNED(16) a0, void * ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + { + ((int * ALIGNED(16))a0)[0] = a.i[0]; + ((int * ALIGNED(16))a0)[1] = b.i[0]; + ((int * ALIGNED(16))a0)[2] = c.i[0]; + ((int * ALIGNED(16))a0)[3] = d.i[0]; + + ((int * ALIGNED(16))a1)[0] = a.i[1]; + ((int * ALIGNED(16))a1)[1] = b.i[1]; + ((int * ALIGNED(16))a1)[2] = c.i[1]; + ((int * ALIGNED(16))a1)[3] = d.i[1]; + + ((int * ALIGNED(16))a2)[0] = a.i[2]; + ((int * ALIGNED(16))a2)[1] = b.i[2]; + ((int * ALIGNED(16))a2)[2] = c.i[2]; + ((int * ALIGNED(16))a2)[3] = d.i[2]; + + ((int * ALIGNED(16))a3)[0] = a.i[3]; + ((int * ALIGNED(16))a3)[1] = b.i[3]; + ((int * 
ALIGNED(16))a3)[2] = c.i[3]; + ((int * ALIGNED(16))a3)[3] = d.i[3]; + } + + ////////////// + // v4int class + + class v4int : public v4 + { + // v4int prefix unary operator friends + + friend inline v4int operator +( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator -( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator ~( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator !( const v4int & a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v4int prefix increment / decrement operator friends + + friend inline v4int operator ++( v4int & a ) ALWAYS_INLINE; + friend inline v4int operator --( v4int & a ) ALWAYS_INLINE; + + // v4int postfix increment / decrement operator friends + + friend inline v4int operator ++( v4int & a, int ) ALWAYS_INLINE; + friend inline v4int operator --( v4int & a, int ) ALWAYS_INLINE; + + // v4int binary operator friends + + friend inline v4int operator +( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator -( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator *( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator /( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator %( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ^( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator &( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator |( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator <<( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >>( const v4int &a, const v4int &b ) ALWAYS_INLINE; + + // v4int logical operator friends + + friend inline v4int operator <( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int 
operator ==( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4int &a, const v4int &b ) ALWAYS_INLINE; + + // v4int miscellaneous friends + + friend inline v4int abs( const v4int &a ) ALWAYS_INLINE; + friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + // FIXME: cswap, notcswap! + friend inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) ALWAYS_INLINE; + + // v4float unary operator friends + + friend inline v4int operator !( const v4float & a ) ALWAYS_INLINE; + + // v4float logical operator friends + + friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + + // v4float miscellaneous friends + + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + + public: + + // v4int 
constructors / destructors + + v4int() {} // Default constructor + + v4int( const v4int &a ) // Copy constructor + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + i[j] = a.i[j]; + } + + v4int( const v4 &a ) // Init from mixed + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + i[j] = a.i[j]; + } + + v4int( int a ) // Init from scalar + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + i[j] = a; + } + + v4int( int i0, int i1, int i2, int i3 ) // Init from scalars + { + i[0] = i0; + i[1] = i1; + i[2] = i2; + i[3] = i3; + } + + ~v4int() {} // Destructor + + // v4int assignment operators + +# define ASSIGN(op) \ + inline v4int &operator op( const v4int &b ) \ + { \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + i[j] op b.i[j]; \ + return *this; \ + } + + ASSIGN( =) + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + ASSIGN(%=) + ASSIGN(^=) + ASSIGN(&=) + ASSIGN(|=) + ASSIGN(<<=) + ASSIGN(>>=) + +# undef ASSIGN + + // v4int member access operator + + inline int &operator []( int n ) + { + return i[n]; + } + + inline int operator ()( int n ) + { + return i[n]; + } + }; + + // v4int prefix unary operators + +# define PREFIX_UNARY(op) \ + inline v4int operator op( const v4int & a ) \ + { \ + v4int b; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + b.i[j] = ( op a.i[j] ); \ + return b; \ + } + + PREFIX_UNARY(+) + PREFIX_UNARY(-) + + inline v4int operator !( const v4int & a ) + { + v4int b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = - ( !a.i[j] ); + + return b; + } + + PREFIX_UNARY(~) + +# undef PREFIX_UNARY + + // v4int prefix increment / decrement + +# define PREFIX_INCDEC(op) \ + inline v4int operator op( v4int & a ) \ + { \ + v4int b; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + b.i[j] = ( op a.i[j] ); \ + return b; \ + } + + PREFIX_INCDEC(++) + PREFIX_INCDEC(--) + +# undef PREFIX_INCDEC + + // v4int postfix increment / decrement + +# define POSTFIX_INCDEC(op) \ + inline v4int operator op( v4int & a, int 
) \ + { \ + v4int b; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + b.i[j] = ( a.i[j] op ); \ + return b; \ + } + + POSTFIX_INCDEC(++) + POSTFIX_INCDEC(--) + +# undef POSTFIX_INCDEC + + // v4int binary operators + +# define BINARY(op) \ + inline v4int operator op( const v4int &a, const v4int &b ) \ + { \ + v4int c; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + c.i[j] = a.i[j] op b.i[j]; \ + return c; \ + } + + BINARY(+) + BINARY(-) + BINARY(*) + BINARY(/) + BINARY(%) + BINARY(^) + BINARY(&) + BINARY(|) + BINARY(<<) + BINARY(>>) + +# undef BINARY + + // v4int logical operators + +# define LOGICAL(op) \ + inline v4int operator op( const v4int &a, const v4int &b ) \ + { \ + v4int c; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + c.i[j] = - ( a.i[j] op b.i[j] ); \ + return c; \ + } + + LOGICAL(<) + LOGICAL(>) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + + // v4int miscellaneous functions + + inline v4int abs( const v4int &a ) + { + v4int b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = ( a.i[j] >= 0 ) ? 
a.i[j] : -a.i[j]; + + return b; + } + + inline v4 czero( const v4int &c, const v4 &a ) + { + v4 b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = a.i[j] & ~c.i[j]; + + return b; + } + + inline v4 notczero( const v4int &c, const v4 &a ) + { + v4 b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = a.i[j] & c.i[j]; + + return b; + } + + inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) + { + v4 m; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + m.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); + + return m; + } + + //////////////// + // v4float class + + class v4float : public v4 + { + // v4float prefix unary operator friends + + friend inline v4float operator +( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator -( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator ~( const v4float &a ) ALWAYS_INLINE; + friend inline v4int operator !( const v4float &a ) ALWAYS_INLINE; + // Note: Referencing (*) and dereferencing (&) apply to the whole vector + + // v4float prefix increment / decrement operator friends + + friend inline v4float operator ++( v4float &a ) ALWAYS_INLINE; + friend inline v4float operator --( v4float &a ) ALWAYS_INLINE; + + // v4float postfix increment / decrement operator friends + + friend inline v4float operator ++( v4float &a, int ) ALWAYS_INLINE; + friend inline v4float operator --( v4float &a, int ) ALWAYS_INLINE; + + // v4float binary operator friends + + friend inline v4float operator +( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator -( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator *( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator /( const v4float &a, const v4float &b ) ALWAYS_INLINE; + + // v4float logical operator friends + + friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >( const 
v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; + + // v4float math library friends + +# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE +# define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ + const v4float &b ) ALWAYS_INLINE + + CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); + CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); + CMATH_FR1(fabs); CMATH_FR1(floor); CMATH_FR2(fmod); CMATH_FR1(log); + CMATH_FR1(log10); CMATH_FR2(pow); CMATH_FR1(sin); CMATH_FR1(sinh); + CMATH_FR1(sqrt); CMATH_FR1(tan); CMATH_FR1(tanh); + + CMATH_FR2(copysign); + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v4float miscellaneous friends + + friend inline v4float rsqrt_approx( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rsqrt ( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rcp_approx( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rcp ( const v4float &a ) ALWAYS_INLINE; + friend inline v4float fma ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float fms ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float 
toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void trilinear( v4float &wl, v4float &wh ) ALWAYS_INLINE; + + public: + + // v4float constructors / destructors + + v4float() {} // Default constructor + + v4float( const v4float &a ) // Copy constructor + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + f[j] = a.f[j]; + } + + v4float( const v4 &a ) // Init from mixed + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + f[j] = a.f[j]; + } + + v4float( float a ) // Init from scalar + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + f[j] = a; + } + + v4float( float f0, float f1, float f2, float f3 ) // Init from scalars + { + f[0] = f0; + f[1] = f1; + f[2] = f2; + f[3] = f3; + } + + ~v4float() {} // Destructor + + // v4float assignment operators + +# define ASSIGN(op) \ + inline v4float &operator op( const v4float &b ) \ + { \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + f[j] op b.f[j]; \ + return *this; \ + } + + ASSIGN(=) + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + +# undef ASSIGN + + // v4float member access operator + + inline float &operator []( int n ) + { + return f[n]; + } + + inline float operator ()( int n ) + { + return f[n]; + } + }; + + // v4float prefix unary operators + + inline v4float operator +( const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = +a.f[j]; + + return b; + } + + inline v4float operator -( const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = -a.f[j]; + + return b; + } + + inline v4int operator !( const v4float &a ) + { + v4int b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = a.i[j] ? 
0 : -1; + + return b; + } + + // v4float prefix increment / decrement operators + + inline v4float operator ++( v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = ++a.f[j]; + + return b; + } + + inline v4float operator --( v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = --a.f[j]; + + return b; + } + + // v4float postfix increment / decrement operators + + inline v4float operator ++( v4float &a, int ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = a.f[j]++; + + return b; + } + + inline v4float operator --( v4float &a, int ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = a.f[j]--; + + return b; + } + + // v4float binary operators + +# define BINARY(op) \ + inline v4float operator op( const v4float &a, const v4float &b ) \ + { \ + v4float c; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + c.f[j] = a.f[j] op b.f[j]; \ + return c; \ + } + + BINARY(+) + BINARY(-) + BINARY(*) + BINARY(/) + +# undef BINARY + + // v4float logical operators + +# define LOGICAL(op) \ + inline v4int operator op( const v4float &a, const v4float &b ) \ + { \ + v4int c; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + c.i[j] = - ( a.f[j] op b.f[j] ); \ + return c; \ + } + + LOGICAL(< ) + LOGICAL(> ) + LOGICAL(==) + LOGICAL(!=) + LOGICAL(<=) + LOGICAL(>=) + LOGICAL(&&) + LOGICAL(||) + +# undef LOGICAL + + // v4float math library functions + +# define CMATH_FR1(fn) \ + inline v4float fn( const v4float &a ) \ + { \ + v4float b; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + b.f[j] = ::fn( a.f[j] ); \ + return b; \ + } + +# define CMATH_FR2(fn) \ + inline v4float fn( const v4float &a, const v4float &b ) \ + { \ + v4float c; \ + ALWAYS_VECTORIZE \ + for( int j = 0; j < 4; j++ ) \ + c.f[j] = ::fn( a.f[j], b.f[j] ); \ + return c; \ + } + + CMATH_FR1(acos) CMATH_FR1(asin) CMATH_FR1(atan) CMATH_FR2(atan2) + CMATH_FR1(ceil) 
CMATH_FR1(cos) CMATH_FR1(cosh) CMATH_FR1(exp) + CMATH_FR1(fabs) CMATH_FR1(floor) CMATH_FR2(fmod) CMATH_FR1(log) + CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) + CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) + + inline v4float copysign( const v4float &a, const v4float &b ) + { + v4float c; + float t; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + { + t = ::fabs( a.f[j] ); + if( b.f[j] < 0 ) t = -t; + c.f[j] = t; + } + + return c; + } + +# undef CMATH_FR1 +# undef CMATH_FR2 + + // v4float miscellaneous functions + + inline v4float rsqrt_approx( const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = ::sqrt( 1.0f / a.f[j] ); + + return b; + } + + inline v4float rsqrt( const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = ::sqrt( 1.0f / a.f[j] ); + + return b; + } + + inline v4float rcp_approx( const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = 1.0f / a.f[j]; + + return b; + } + + inline v4float rcp( const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = 1.0f / a.f[j]; + + return b; + } + + inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) + { + v4float d; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + d.f[j] = a.f[j] * b.f[j] + c.f[j]; + + return d; + } + + inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) + { + v4float d; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + d.f[j] = a.f[j] * b.f[j] - c.f[j]; + + return d; + } + + inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) + { + v4float d; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + d.f[j] = c.f[j] - a.f[j] * b.f[j]; + + return d; + } + + inline v4float clear_bits( const v4int &m, const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = ( ~m.i[j] ) & a.i[j]; + + return b; + } 
+ + inline v4float set_bits( const v4int &m, const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = m.i[j] | a.i[j]; + + return b; + } + + inline v4float toggle_bits( const v4int &m, const v4float &a ) + { + v4float b; + + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = m.i[j] ^ a.i[j]; + + return b; + } + + inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + p[j] += a.f[j]; + } + + inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + p[j] -= a.f[j]; + } + + inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) + { + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + p[j] *= a.f[j]; + } + + inline void trilinear( v4float & wl, v4float & wh ) + { + float x = wl.f[0], y = wl.f[1], z = wl.f[2]; + + wl.f[0] = ( ( 1.0f - x ) * ( 1.0f - y ) ) * ( 1.0f - z ); + wl.f[1] = ( ( 1.0f + x ) * ( 1.0f - y ) ) * ( 1.0f - z ); + wl.f[2] = ( ( 1.0f - x ) * ( 1.0f + y ) ) * ( 1.0f - z ); + wl.f[3] = ( ( 1.0f + x ) * ( 1.0f + y ) ) * ( 1.0f - z ); + + wh.f[0] = ( ( 1.0f - x ) * ( 1.0f - y ) ) * ( 1.0f + z ); + wh.f[1] = ( ( 1.0f + x ) * ( 1.0f - y ) ) * ( 1.0f + z ); + wh.f[2] = ( ( 1.0f - x ) * ( 1.0f + y ) ) * ( 1.0f + z ); + wh.f[3] = ( ( 1.0f + x ) * ( 1.0f + y ) ) * ( 1.0f + z ); + } + +} // namespace v4 + +#endif // _v4_neon_h_ From 304922e7ad8a8e4a9684aa4b9d53adaa7667248d Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Sun, 14 Jul 2019 18:58:43 -0600 Subject: [PATCH 16/95] Add CMake support for ARM NEON intrinsics. 
--- CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2f9902c5..797eb136 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,6 +47,8 @@ option(USE_PTHREADS "Use Pthreads" ON) option(USE_V4_ALTIVEC "Enable V4 Altivec" OFF) +option(USE_V4_NEON "Enable V4 NEON" OFF) + option(USE_V4_PORTABLE "Enable V4 Portable" OFF) option(USE_V4_SSE "Enable V4 SSE" OFF) @@ -201,6 +203,12 @@ if(USE_V4_ALTIVEC) set(USE_V4 True) endif(USE_V4_ALTIVEC) +if(USE_V4_NEON) + add_definitions(-DUSE_V4_NEON) + set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DUSE_V4_NEON") + set(USE_V4 True) +endif(USE_V4_NEON) + #------------------------------------------------------------------------------# # Add options for building with v8 simd vector support. #------------------------------------------------------------------------------# From c68b58acb924312cac89372f1adf0d2b132b8649 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Sun, 14 Jul 2019 21:01:33 -0600 Subject: [PATCH 17/95] Add NEON support for v4float binary operators. 
--- src/util/v4/v4_neon.h | 87 ++++++++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 35 deletions(-) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index af16614a..1dce1cb3 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -135,7 +135,7 @@ namespace v4 { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - i[j] = a.i[j]; + i[j] = a.i[j]; } ~v4() {} // Default destructor @@ -199,7 +199,7 @@ namespace v4 // v4 memory manipulation functions inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) + v4 &a ) { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) @@ -207,7 +207,7 @@ namespace v4 } inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) @@ -215,7 +215,7 @@ namespace v4 } inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) @@ -239,7 +239,7 @@ namespace v4 } inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) + void * ALIGNED(16) b ) { int t; @@ -256,7 +256,7 @@ namespace v4 inline void load_4x1_tr( const void *a0, const void *a1, const void *a2, const void *a3, - v4 &a ) + v4 &a ) { a.i[0] = ((const int *)a0)[0]; a.i[1] = ((const int *)a1)[0]; @@ -335,7 +335,7 @@ namespace v4 inline void store_4x1_tr( const v4 &a, void *a0, void *a1, - void *a2, void *a3 ) + void *a2, void *a3 ) { ((int *)a0)[0] = a.i[0]; ((int *)a1)[0] = a.i[1]; @@ -492,21 +492,21 @@ namespace v4 { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - i[j] = a.i[j]; + i[j] = a.i[j]; } v4int( const v4 &a ) // Init from mixed { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - i[j] = a.i[j]; + i[j] = a.i[j]; } v4int( int a ) // Init from scalar { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - i[j] = a; + i[j] = a; } v4int( int i0, int i1, int i2, int i3 ) // Init from scalars @@ -521,9 +521,9 @@ namespace v4 // v4int assignment operators -# define ASSIGN(op) \ +# define ASSIGN(op) 
\ inline v4int &operator op( const v4int &b ) \ - { \ + { \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ i[j] op b.i[j]; \ @@ -561,7 +561,7 @@ namespace v4 # define PREFIX_UNARY(op) \ inline v4int operator op( const v4int & a ) \ - { \ + { \ v4int b; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -591,7 +591,7 @@ namespace v4 # define PREFIX_INCDEC(op) \ inline v4int operator op( v4int & a ) \ - { \ + { \ v4int b; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -608,7 +608,7 @@ namespace v4 # define POSTFIX_INCDEC(op) \ inline v4int operator op( v4int & a, int ) \ - { \ + { \ v4int b; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -625,7 +625,7 @@ namespace v4 # define BINARY(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ + { \ v4int c; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -650,7 +650,7 @@ namespace v4 # define LOGICAL(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ + { \ v4int c; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -800,21 +800,21 @@ namespace v4 { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - f[j] = a.f[j]; + f[j] = a.f[j]; } v4float( const v4 &a ) // Init from mixed { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - f[j] = a.f[j]; + f[j] = a.f[j]; } v4float( float a ) // Init from scalar { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - f[j] = a; + f[j] = a; } v4float( float f0, float f1, float f2, float f3 ) // Init from scalars @@ -831,10 +831,10 @@ namespace v4 # define ASSIGN(op) \ inline v4float &operator op( const v4float &b ) \ - { \ + { \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ - f[j] op b.f[j]; \ + f[j] op b.f[j]; \ return *this; \ } @@ -944,28 +944,43 @@ namespace v4 // v4float binary operators -# define BINARY(op) \ + #define BINARY(op,intrin) \ inline v4float operator op( const v4float &a, const v4float &b ) \ - { \ + { \ v4float c; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 4; j++ ) \ - c.f[j] = a.f[j] op b.f[j]; \ + c.v = 
intrin( a.v, b.v ); \ return c; \ } - BINARY(+) - BINARY(-) - BINARY(*) - BINARY(/) + BINARY( +, vaddq_f32 ) + BINARY( -, vsubq_f32 ) + BINARY( *, vmulq_f32 ) + BINARY( /, vdivq_f32 ) -# undef BINARY + #undef BINARY + + // #define BINARY(op) \ + // inline v4float operator op( const v4float &a, const v4float &b ) \ + // { \ + // v4float c; \ + // ALWAYS_VECTORIZE \ + // for( int j = 0; j < 4; j++ ) \ + // c.f[j] = a.f[j] op b.f[j]; \ + // return c; \ + // } + + // BINARY(+) + // BINARY(-) + // BINARY(*) + // BINARY(/) + + // #undef BINARY // v4float logical operators # define LOGICAL(op) \ inline v4int operator op( const v4float &a, const v4float &b ) \ - { \ + { \ v4int c; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -988,7 +1003,7 @@ namespace v4 # define CMATH_FR1(fn) \ inline v4float fn( const v4float &a ) \ - { \ + { \ v4float b; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -998,7 +1013,7 @@ namespace v4 # define CMATH_FR2(fn) \ inline v4float fn( const v4float &a, const v4float &b ) \ - { \ + { \ v4float c; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -1081,6 +1096,8 @@ namespace v4 { v4float d; + // d.v = _mm_fmadd_ps( a.v, b.v, c.v ); + ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) d.f[j] = a.f[j] * b.f[j] + c.f[j]; From ab4dd1044993a7496afd1a2ec3e031035004c366 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Sun, 14 Jul 2019 21:06:12 -0600 Subject: [PATCH 18/95] Add float32x4_t type to the v4_neon union. --- src/util/v4/v4_neon.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 1dce1cb3..bd340b55 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -125,6 +125,7 @@ namespace v4 { int i[4]; float f[4]; + float32x4_t v; }; public: From 66ad58a91cbc3dc55bfdfd29705fd54994c5a61a Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Sun, 14 Jul 2019 21:33:15 -0600 Subject: [PATCH 19/95] Add ARM NEON intrinsics support for fma and fms. 
--- src/util/v4/v4_neon.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index bd340b55..57886e87 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -1097,11 +1097,11 @@ namespace v4 { v4float d; - // d.v = _mm_fmadd_ps( a.v, b.v, c.v ); + d.v = vfmaq_f32( a.v, b.v, c.v ); - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - d.f[j] = a.f[j] * b.f[j] + c.f[j]; + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // d.f[j] = a.f[j] * b.f[j] + c.f[j]; return d; } @@ -1110,9 +1110,11 @@ namespace v4 { v4float d; - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - d.f[j] = a.f[j] * b.f[j] - c.f[j]; + d.v = vfmsq_f32( a.v, b.v, c.v ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // d.f[j] = a.f[j] * b.f[j] - c.f[j]; return d; } From f891774fdd3609c2bf381d23329af4c164244b85 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 15 Jul 2019 11:05:08 -0600 Subject: [PATCH 20/95] Remove some tabs. Format tweaks. Add NEON intrinsics support for contructors. 
--- src/util/v4/v4_avx2.h | 112 +++++++++++++++++------------------ src/util/v4/v4_neon.h | 84 ++++++++++++++++++-------- src/util/v4/v4_portable.h | 38 ++++++------ src/util/v4/v4_portable_v0.h | 38 ++++++------ src/util/v4/v4_portable_v1.h | 52 ++++++++-------- 5 files changed, 179 insertions(+), 145 deletions(-) diff --git a/src/util/v4/v4_avx2.h b/src/util/v4/v4_avx2.h index a7b7b783..104121db 100644 --- a/src/util/v4/v4_avx2.h +++ b/src/util/v4/v4_avx2.h @@ -130,7 +130,7 @@ namespace v4 v4( const v4 &a ) // Copy constructor { - v=a.v; + v = a.v; } ~v4() {} // Default destructor @@ -202,19 +202,19 @@ namespace v4 // v4 memory manipulation functions inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) + v4 &a ) { a.v = _mm_load_ps( ( float * ) p ); } inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { _mm_store_ps( ( float * ) p, a.v ); } inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { _mm_stream_ps( ( float * ) p, a.v ); } @@ -243,7 +243,7 @@ namespace v4 inline void load_4x1_tr( const void *a0, const void *a1, const void *a2, const void *a3, - v4 &a ) + v4 &a ) { a.v = _mm_setr_ps( ((const float *)a0)[0], ((const float *)a1)[0], @@ -397,7 +397,7 @@ namespace v4 inline void store_4x1_tr( const v4 &a, void *a0, void *a1, - void *a2, void *a3 ) + void *a2, void *a3 ) { ((float *)a0)[0] = a.f[0]; ((float *)a1)[0] = a.f[1]; @@ -446,7 +446,7 @@ namespace v4 // FIXME: IS THIS FASTER THAN THE OLD WAY (HAD MORE STORE INSTR) inline void store_4x4_tr( const v4 &a, const v4 &b, - const v4 &c, const v4 &d, + const v4 &c, const v4 &d, void * ALIGNED(16) a0, void * ALIGNED(16) a1, void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) { @@ -564,8 +564,8 @@ namespace v4 { union { - int i; - float f; + int i; + float f; } u; u.i = a; @@ -576,8 +576,8 @@ namespace v4 { union { - int i; - float f; + int i; + float f; } u0, u1, u2, u3; u0.i = i0; @@ -592,9 +592,9 @@ namespace v4 // v4int assignment 
operators -# define ASSIGN(op) \ +# define ASSIGN(op) \ inline v4int &operator op( const v4int &b ) \ - { \ + { \ i[0] op b.i[0]; \ i[1] op b.i[1]; \ i[2] op b.i[2]; \ @@ -658,7 +658,7 @@ namespace v4 # define PREFIX_UNARY(op) \ inline v4int operator op( const v4int & a ) \ - { \ + { \ v4int b; \ b.i[0] = ( op a.i[0] ); \ b.i[1] = ( op a.i[1] ); \ @@ -712,7 +712,7 @@ namespace v4 # define PREFIX_INCDEC(op) \ inline v4int operator op( v4int & a ) \ - { \ + { \ v4int b; \ b.i[0] = ( op a.i[0] ); \ b.i[1] = ( op a.i[1] ); \ @@ -730,7 +730,7 @@ namespace v4 # define POSTFIX_INCDEC(op) \ inline v4int operator op( v4int & a, int ) \ - { \ + { \ v4int b; \ b.i[0] = ( a.i[0] op ); \ b.i[1] = ( a.i[1] op ); \ @@ -748,7 +748,7 @@ namespace v4 # define BINARY(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ + { \ v4int c; \ c.i[0] = a.i[0] op b.i[0]; \ c.i[1] = a.i[1] op b.i[1]; \ @@ -799,7 +799,7 @@ namespace v4 # define LOGICAL(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ + { \ v4int c; \ c.i[0] = - ( a.i[0] op b.i[0] ); \ c.i[1] = - ( a.i[1] op b.i[1] ); \ @@ -857,7 +857,7 @@ namespace v4 v4 tf; tf.v = _mm_or_ps( _mm_andnot_ps( c_v, f.v ), - _mm_and_ps( c_v, t.v ) ); + _mm_and_ps( c_v, t.v ) ); return tf; } @@ -967,11 +967,11 @@ namespace v4 // v4float assignment operators -# define ASSIGN(op,intrin) \ +# define ASSIGN(op,intrin) \ inline v4float &operator op( const v4float &b ) \ - { \ + { \ v = intrin( v, b.v ); \ - return *this; \ + return *this; \ } inline v4float &operator =( const v4float &b ) @@ -1086,7 +1086,7 @@ namespace v4 # define BINARY(op,intrin) \ inline v4float operator op( const v4float &a, const v4float &b ) \ - { \ + { \ v4float c; \ c.v = intrin( a.v, b.v ); \ return c; \ @@ -1103,7 +1103,7 @@ namespace v4 # define LOGICAL(op,intrin) \ inline v4int operator op( const v4float &a, const v4float &b ) \ - { \ + { \ v4int c; \ c.v = intrin( a.v, b.v ); \ return c; \ @@ -1123,7 +1123,7 @@ namespace v4 __m128 
vzero = _mm_setzero_ps(); c.v = _mm_and_ps( _mm_cmpneq_ps( a.v, vzero ), - _mm_cmpneq_ps( b.v, vzero ) ); + _mm_cmpneq_ps( b.v, vzero ) ); return c; } @@ -1135,7 +1135,7 @@ namespace v4 __m128 vzero = _mm_setzero_ps(); c.v = _mm_or_ps( _mm_cmpneq_ps( a.v, vzero ), - _mm_cmpneq_ps( b.v, vzero ) ); + _mm_cmpneq_ps( b.v, vzero ) ); return c; } @@ -1146,7 +1146,7 @@ namespace v4 # define CMATH_FR1(fn) \ inline v4float fn( const v4float &a ) \ - { \ + { \ v4float b; \ b.f[0] = ::fn( a.f[0] ); \ b.f[1] = ::fn( a.f[1] ); \ @@ -1157,7 +1157,7 @@ namespace v4 # define CMATH_FR2(fn) \ inline v4float fn( const v4float &a, const v4float &b ) \ - { \ + { \ v4float c; \ c.f[0] = ::fn( a.f[0], b.f[0] ); \ c.f[1] = ::fn( a.f[1], b.f[1] ); \ @@ -1197,7 +1197,7 @@ namespace v4 __m128 t = _mm_set1_ps( -0.0f ); c.v = _mm_or_ps( _mm_and_ps( t, b.v ), - _mm_andnot_ps( t, a.v ) ); + _mm_andnot_ps( t, a.v ) ); return c; } @@ -1228,15 +1228,15 @@ namespace v4 // Note: It is quicker to just call div_ps and sqrt_ps if more // refinement desired! b.v = _mm_add_ps( b_v, _mm_mul_ps( _mm_set1_ps( 0.5f ), - _mm_sub_ps( b_v, - _mm_mul_ps( a_v, - _mm_mul_ps( b_v, - _mm_mul_ps( b_v, b_v ) - ) - ) - ) - ) - ); + _mm_sub_ps( b_v, + _mm_mul_ps( a_v, + _mm_mul_ps( b_v, + _mm_mul_ps( b_v, b_v ) + ) + ) + ) + ) + ); return b; } @@ -1255,11 +1255,11 @@ namespace v4 // refinement desired! b.v = _mm_fmadd_ps( _mm_set1_ps( 0.5f ), - _mm_fnmadd_ps( a_v, - _mm_mul_ps( b_v, - _mm_mul_ps( b_v, b_v ) ), - b_v ), - b_v ); + _mm_fnmadd_ps( a_v, + _mm_mul_ps( b_v, + _mm_mul_ps( b_v, b_v ) ), + b_v ), + b_v ); return b; } @@ -1277,11 +1277,11 @@ namespace v4 // refinement desired! 
b.v = _mm_fmadd_ps( _mm_set1_ps( 0.5f ), - _mm_fnmadd_ps( a.v, - _mm_mul_ps( b_v, - _mm_mul_ps( b_v, b_v ) ), - b_v ), - b_v ); + _mm_fnmadd_ps( a.v, + _mm_mul_ps( b_v, + _mm_mul_ps( b_v, b_v ) ), + b_v ), + b_v ); return b; } @@ -1305,10 +1305,10 @@ namespace v4 b_v = _mm_rcp_ps( a_v ); b.v = _mm_sub_ps( _mm_add_ps( b_v, b_v ), - _mm_mul_ps( a_v, - _mm_mul_ps( b_v, b_v ) - ) - ); + _mm_mul_ps( a_v, + _mm_mul_ps( b_v, b_v ) + ) + ); return b; } @@ -1324,8 +1324,8 @@ namespace v4 b_v = _mm_rcp_ps( a_v ); b.v = _mm_fnmadd_ps( a_v, - _mm_mul_ps( b_v, b_v ), - _mm_add_ps( b_v, b_v ) ); + _mm_mul_ps( b_v, b_v ), + _mm_add_ps( b_v, b_v ) ); return b; } @@ -1340,8 +1340,8 @@ namespace v4 b_v = _mm_rcp_ps( a.v ); b.v = _mm_fnmadd_ps( a.v, - _mm_mul_ps( b_v, b_v ), - _mm_add_ps( b_v, b_v ) ); + _mm_mul_ps( b_v, b_v ), + _mm_add_ps( b_v, b_v ) ); return b; } diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 57886e87..44aa4648 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -134,9 +134,11 @@ namespace v4 v4( const v4 &a ) // Copy constructor { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - i[j] = a.i[j]; + v = a.v; + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // i[j] = a.i[j]; } ~v4() {} // Default destructor @@ -179,7 +181,7 @@ namespace v4 return b; } -# define sw(x,y) x^=y, y^=x, x^=y + #define sw(x,y) x^=y, y^=x, x^=y inline void swap( v4 &a, v4 &b ) { @@ -195,7 +197,7 @@ namespace v4 sw( a2.i[3],a3.i[2] ); } -# undef sw + #undef sw // v4 memory manipulation functions @@ -491,27 +493,53 @@ namespace v4 v4int( const v4int &a ) // Copy constructor { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - i[j] = a.i[j]; + v = a.v; + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // i[j] = a.i[j]; } v4int( const v4 &a ) // Init from mixed { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - i[j] = a.i[j]; + v = a.v; + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // i[j] = a.i[j]; } v4int( int a ) // 
Init from scalar { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - i[j] = a; + union + { + int i; + float f; + } u; + + u.i = a; + v = vdupq_n_f32( u.f ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // i[j] = a; } v4int( int i0, int i1, int i2, int i3 ) // Init from scalars { + // union + // { + // int i; + // float f; + // } u0, u1, u2, u3; + + // u0.i = i0; + // u1.i = i1; + // u2.i = i2; + // u3.i = i3; + + // v = _mm_setr_ps( u0.f, u1.f, u2.f, u3.f ); + i[0] = i0; i[1] = i1; i[2] = i2; @@ -799,23 +827,29 @@ namespace v4 v4float( const v4float &a ) // Copy constructor { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - f[j] = a.f[j]; + v = a.v; + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // f[j] = a.f[j]; } v4float( const v4 &a ) // Init from mixed { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - f[j] = a.f[j]; + v = a.v; + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // f[j] = a.f[j]; } v4float( float a ) // Init from scalar { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - f[j] = a; + v = vdupq_n_f32( a ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // f[j] = a; } v4float( float f0, float f1, float f2, float f3 ) // Init from scalars @@ -830,7 +864,7 @@ namespace v4 // v4float assignment operators -# define ASSIGN(op) \ + #define ASSIGN(op) \ inline v4float &operator op( const v4float &b ) \ { \ ALWAYS_VECTORIZE \ @@ -845,7 +879,7 @@ namespace v4 ASSIGN(*=) ASSIGN(/=) -# undef ASSIGN + #undef ASSIGN // v4float member access operator diff --git a/src/util/v4/v4_portable.h b/src/util/v4/v4_portable.h index 9f199697..6dbb790b 100644 --- a/src/util/v4/v4_portable.h +++ b/src/util/v4/v4_portable.h @@ -189,7 +189,7 @@ namespace v4 // v4 memory manipulation functions inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) + v4 &a ) { a.i[0] = ((const int * ALIGNED(16))p)[0]; a.i[1] = ((const int * ALIGNED(16))p)[1]; @@ -198,7 +198,7 @@ namespace v4 } inline void store_4x1( const v4 &a, - 
void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { ((int * ALIGNED(16))p)[0] = a.i[0]; ((int * ALIGNED(16))p)[1] = a.i[1]; @@ -207,7 +207,7 @@ namespace v4 } inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { ((int * ALIGNED(16))p)[0] = a.i[0]; ((int * ALIGNED(16))p)[1] = a.i[1]; @@ -234,7 +234,7 @@ namespace v4 } inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) + void * ALIGNED(16) b ) { int t; @@ -259,7 +259,7 @@ namespace v4 inline void load_4x1_tr( const void *a0, const void *a1, const void *a2, const void *a3, - v4 &a ) + v4 &a ) { a.i[0] = ((const int *)a0)[0]; a.i[1] = ((const int *)a1)[0]; @@ -338,7 +338,7 @@ namespace v4 inline void store_4x1_tr( const v4 &a, void *a0, void *a1, - void *a2, void *a3 ) + void *a2, void *a3 ) { ((int *)a0)[0] = a.i[0]; ((int *)a1)[0] = a.i[1]; @@ -527,9 +527,9 @@ namespace v4 // v4int assignment operators -# define ASSIGN(op) \ +# define ASSIGN(op) \ inline v4int &operator op( const v4int &b ) \ - { \ + { \ i[0] op b.i[0]; \ i[1] op b.i[1]; \ i[2] op b.i[2]; \ @@ -568,7 +568,7 @@ namespace v4 # define PREFIX_UNARY(op) \ inline v4int operator op( const v4int & a ) \ - { \ + { \ v4int b; \ b.i[0] = ( op a.i[0] ); \ b.i[1] = ( op a.i[1] ); \ @@ -600,7 +600,7 @@ namespace v4 # define PREFIX_INCDEC(op) \ inline v4int operator op( v4int & a ) \ - { \ + { \ v4int b; \ b.i[0] = ( op a.i[0] ); \ b.i[1] = ( op a.i[1] ); \ @@ -618,7 +618,7 @@ namespace v4 # define POSTFIX_INCDEC(op) \ inline v4int operator op( v4int & a, int ) \ - { \ + { \ v4int b; \ b.i[0] = ( a.i[0] op ); \ b.i[1] = ( a.i[1] op ); \ @@ -636,7 +636,7 @@ namespace v4 # define BINARY(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ + { \ v4int c; \ c.i[0] = a.i[0] op b.i[0]; \ c.i[1] = a.i[1] op b.i[1]; \ @@ -662,7 +662,7 @@ namespace v4 # define LOGICAL(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ + { \ v4int c; \ c.i[0] = -(a.i[0] op b.i[0]); \ c.i[1] = 
-(a.i[1] op b.i[1]); \ @@ -851,8 +851,8 @@ namespace v4 # define ASSIGN(op) \ inline v4float &operator op( const v4float &b ) \ - { \ - f[0] op b.f[0]; \ + { \ + f[0] op b.f[0]; \ f[1] op b.f[1]; \ f[2] op b.f[2]; \ f[3] op b.f[3]; \ @@ -974,7 +974,7 @@ namespace v4 # define BINARY(op) \ inline v4float operator op( const v4float &a, const v4float &b ) \ - { \ + { \ v4float c; \ c.f[0] = a.f[0] op b.f[0]; \ c.f[1] = a.f[1] op b.f[1]; \ @@ -994,7 +994,7 @@ namespace v4 # define LOGICAL(op) \ inline v4int operator op( const v4float &a, const v4float &b ) \ - { \ + { \ v4int c; \ c.i[0] = - ( a.f[0] op b.f[0] ); \ c.i[1] = - ( a.f[1] op b.f[1] ); \ @@ -1018,7 +1018,7 @@ namespace v4 # define CMATH_FR1(fn) \ inline v4float fn( const v4float &a ) \ - { \ + { \ v4float b; \ b.f[0] = ::fn( a.f[0] ); \ b.f[1] = ::fn( a.f[1] ); \ @@ -1029,7 +1029,7 @@ namespace v4 # define CMATH_FR2(fn) \ inline v4float fn( const v4float &a, const v4float &b ) \ - { \ + { \ v4float c; \ c.f[0] = ::fn( a.f[0], b.f[0] ); \ c.f[1] = ::fn( a.f[1], b.f[1] ); \ diff --git a/src/util/v4/v4_portable_v0.h b/src/util/v4/v4_portable_v0.h index 6b2555e8..6a89939e 100644 --- a/src/util/v4/v4_portable_v0.h +++ b/src/util/v4/v4_portable_v0.h @@ -189,7 +189,7 @@ namespace v4 // v4 memory manipulation functions inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) + v4 &a ) { a.i[0] = ((const int * ALIGNED(16))p)[0]; a.i[1] = ((const int * ALIGNED(16))p)[1]; @@ -198,7 +198,7 @@ namespace v4 } inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { ((int * ALIGNED(16))p)[0] = a.i[0]; ((int * ALIGNED(16))p)[1] = a.i[1]; @@ -207,7 +207,7 @@ namespace v4 } inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { ((int * ALIGNED(16))p)[0] = a.i[0]; ((int * ALIGNED(16))p)[1] = a.i[1]; @@ -234,7 +234,7 @@ namespace v4 } inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) + void * ALIGNED(16) b ) { int t; @@ -259,7 +259,7 @@ 
namespace v4 inline void load_4x1_tr( const void *a0, const void *a1, const void *a2, const void *a3, - v4 &a ) + v4 &a ) { a.i[0] = ((const int *)a0)[0]; a.i[1] = ((const int *)a1)[0]; @@ -338,7 +338,7 @@ namespace v4 inline void store_4x1_tr( const v4 &a, void *a0, void *a1, - void *a2, void *a3 ) + void *a2, void *a3 ) { ((int *)a0)[0] = a.i[0]; ((int *)a1)[0] = a.i[1]; @@ -527,9 +527,9 @@ namespace v4 // v4int assignment operators -# define ASSIGN(op) \ +# define ASSIGN(op) \ inline v4int &operator op( const v4int &b ) \ - { \ + { \ i[0] op b.i[0]; \ i[1] op b.i[1]; \ i[2] op b.i[2]; \ @@ -568,7 +568,7 @@ namespace v4 # define PREFIX_UNARY(op) \ inline v4int operator op( const v4int & a ) \ - { \ + { \ v4int b; \ b.i[0] = ( op a.i[0] ); \ b.i[1] = ( op a.i[1] ); \ @@ -600,7 +600,7 @@ namespace v4 # define PREFIX_INCDEC(op) \ inline v4int operator op( v4int & a ) \ - { \ + { \ v4int b; \ b.i[0] = ( op a.i[0] ); \ b.i[1] = ( op a.i[1] ); \ @@ -618,7 +618,7 @@ namespace v4 # define POSTFIX_INCDEC(op) \ inline v4int operator op( v4int & a, int ) \ - { \ + { \ v4int b; \ b.i[0] = ( a.i[0] op ); \ b.i[1] = ( a.i[1] op ); \ @@ -636,7 +636,7 @@ namespace v4 # define BINARY(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ + { \ v4int c; \ c.i[0] = a.i[0] op b.i[0]; \ c.i[1] = a.i[1] op b.i[1]; \ @@ -662,7 +662,7 @@ namespace v4 # define LOGICAL(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ + { \ v4int c; \ c.i[0] = - ( a.i[0] op b.i[0] ); \ c.i[1] = - ( a.i[1] op b.i[1] ); \ @@ -851,8 +851,8 @@ namespace v4 # define ASSIGN(op) \ inline v4float &operator op( const v4float &b ) \ - { \ - f[0] op b.f[0]; \ + { \ + f[0] op b.f[0]; \ f[1] op b.f[1]; \ f[2] op b.f[2]; \ f[3] op b.f[3]; \ @@ -974,7 +974,7 @@ namespace v4 # define BINARY(op) \ inline v4float operator op( const v4float &a, const v4float &b ) \ - { \ + { \ v4float c; \ c.f[0] = a.f[0] op b.f[0]; \ c.f[1] = a.f[1] op b.f[1]; \ @@ -994,7 +994,7 @@ namespace v4 # 
define LOGICAL(op) \ inline v4int operator op( const v4float &a, const v4float &b ) \ - { \ + { \ v4int c; \ c.i[0] = - ( a.f[0] op b.f[0] ); \ c.i[1] = - ( a.f[1] op b.f[1] ); \ @@ -1018,7 +1018,7 @@ namespace v4 # define CMATH_FR1(fn) \ inline v4float fn( const v4float &a ) \ - { \ + { \ v4float b; \ b.f[0] = ::fn( a.f[0] ); \ b.f[1] = ::fn( a.f[1] ); \ @@ -1029,7 +1029,7 @@ namespace v4 # define CMATH_FR2(fn) \ inline v4float fn( const v4float &a, const v4float &b ) \ - { \ + { \ v4float c; \ c.f[0] = ::fn( a.f[0], b.f[0] ); \ c.f[1] = ::fn( a.f[1], b.f[1] ); \ diff --git a/src/util/v4/v4_portable_v1.h b/src/util/v4/v4_portable_v1.h index 4d3c4b20..d67bf4b8 100644 --- a/src/util/v4/v4_portable_v1.h +++ b/src/util/v4/v4_portable_v1.h @@ -134,7 +134,7 @@ namespace v4 { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - i[j] = a.i[j]; + i[j] = a.i[j]; } ~v4() {} // Default destructor @@ -198,7 +198,7 @@ namespace v4 // v4 memory manipulation functions inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) + v4 &a ) { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) @@ -206,7 +206,7 @@ namespace v4 } inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) @@ -214,7 +214,7 @@ namespace v4 } inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) @@ -238,7 +238,7 @@ namespace v4 } inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) + void * ALIGNED(16) b ) { int t; @@ -255,7 +255,7 @@ namespace v4 inline void load_4x1_tr( const void *a0, const void *a1, const void *a2, const void *a3, - v4 &a ) + v4 &a ) { a.i[0] = ((const int *)a0)[0]; a.i[1] = ((const int *)a1)[0]; @@ -334,7 +334,7 @@ namespace v4 inline void store_4x1_tr( const v4 &a, void *a0, void *a1, - void *a2, void *a3 ) + void *a2, void *a3 ) { ((int *)a0)[0] = a.i[0]; ((int *)a1)[0] = a.i[1]; @@ -491,21 +491,21 @@ namespace v4 { 
ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - i[j] = a.i[j]; + i[j] = a.i[j]; } v4int( const v4 &a ) // Init from mixed { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - i[j] = a.i[j]; + i[j] = a.i[j]; } v4int( int a ) // Init from scalar { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - i[j] = a; + i[j] = a; } v4int( int i0, int i1, int i2, int i3 ) // Init from scalars @@ -520,9 +520,9 @@ namespace v4 // v4int assignment operators -# define ASSIGN(op) \ +# define ASSIGN(op) \ inline v4int &operator op( const v4int &b ) \ - { \ + { \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ i[j] op b.i[j]; \ @@ -560,7 +560,7 @@ namespace v4 # define PREFIX_UNARY(op) \ inline v4int operator op( const v4int & a ) \ - { \ + { \ v4int b; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -590,7 +590,7 @@ namespace v4 # define PREFIX_INCDEC(op) \ inline v4int operator op( v4int & a ) \ - { \ + { \ v4int b; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -607,7 +607,7 @@ namespace v4 # define POSTFIX_INCDEC(op) \ inline v4int operator op( v4int & a, int ) \ - { \ + { \ v4int b; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -624,7 +624,7 @@ namespace v4 # define BINARY(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ + { \ v4int c; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -649,7 +649,7 @@ namespace v4 # define LOGICAL(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ - { \ + { \ v4int c; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -799,21 +799,21 @@ namespace v4 { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - f[j] = a.f[j]; + f[j] = a.f[j]; } v4float( const v4 &a ) // Init from mixed { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - f[j] = a.f[j]; + f[j] = a.f[j]; } v4float( float a ) // Init from scalar { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - f[j] = a; + f[j] = a; } v4float( float f0, float f1, float f2, float f3 ) // Init from scalars @@ -830,10 +830,10 @@ namespace v4 # define 
ASSIGN(op) \ inline v4float &operator op( const v4float &b ) \ - { \ + { \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ - f[j] op b.f[j]; \ + f[j] op b.f[j]; \ return *this; \ } @@ -945,7 +945,7 @@ namespace v4 # define BINARY(op) \ inline v4float operator op( const v4float &a, const v4float &b ) \ - { \ + { \ v4float c; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -964,7 +964,7 @@ namespace v4 # define LOGICAL(op) \ inline v4int operator op( const v4float &a, const v4float &b ) \ - { \ + { \ v4int c; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -987,7 +987,7 @@ namespace v4 # define CMATH_FR1(fn) \ inline v4float fn( const v4float &a ) \ - { \ + { \ v4float b; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ @@ -997,7 +997,7 @@ namespace v4 # define CMATH_FR2(fn) \ inline v4float fn( const v4float &a, const v4float &b ) \ - { \ + { \ v4float c; \ ALWAYS_VECTORIZE \ for( int j = 0; j < 4; j++ ) \ From 2ddef89abb86a58e8738ac467654b8e70aab1a61 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 15 Jul 2019 12:34:39 -0600 Subject: [PATCH 21/95] Add NEON intrinsics support for load_4x4_tr and store_4x4_tr. 
--- src/util/v4/v4_neon.h | 285 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 247 insertions(+), 38 deletions(-) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 44aa4648..f1a691a5 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -273,6 +273,19 @@ namespace v4 const void * ALIGNED(8) a3, v4 &a, v4 &b ) { + // __m128 a_v, b_v, t; + + // b_v = _mm_setzero_ps(); + + // t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a0 ), (__m64 *)a1 ); + // b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a2 ), (__m64 *)a3 ); + + // a_v = _mm_shuffle_ps( t, b_v, 0x88 ); + // b_v = _mm_shuffle_ps( t, b_v, 0xdd ); + + // a.v = a_v; + // b.v = b_v; + a.i[0] = ((const int * ALIGNED(8))a0)[0]; b.i[0] = ((const int * ALIGNED(8))a0)[1]; @@ -315,25 +328,123 @@ namespace v4 const void * ALIGNED(16) a3, v4 &a, v4 &b, v4 &c, v4 &d ) { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; - d.i[0] = ((const int * ALIGNED(16))a0)[3]; - - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; - d.i[1] = ((const int * ALIGNED(16))a1)[3]; - - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; - d.i[2] = ((const int * ALIGNED(16))a2)[3]; - - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; - d.i[3] = ((const int * ALIGNED(16))a3)[3]; + //----------------------------------------------------------------- + float32x4_t a_v, b_v, c_v, d_v, t, u; + //----------------------------------------------------------------- + // __m128 a_v, b_v, c_v, d_v, t, u; + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + a_v = vld1q_f32( (const float *) a0 ); + b_v = 
vld1q_f32( (const float *) a1 ); + c_v = vld1q_f32( (const float *) a2 ); + d_v = vld1q_f32( (const float *) a3 ); + //----------------------------------------------------------------- + // a_v = _mm_load_ps( (const float *) a0 ); + // b_v = _mm_load_ps( (const float *) a1 ); + // c_v = _mm_load_ps( (const float *) a2 ); + // d_v = _mm_load_ps( (const float *) a3 ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + float32x2_t a_vh = vget_high_f32( a_v ); + float32x2_t b_vh = vget_high_f32( b_v ); + + float32x2x2_t res_ab_h = vzip_f32( a_vh, b_vh ); + + t = vcombine_f32( res_ab_h.val[0], res_ab_h.val[1] ); + //----------------------------------------------------------------- + // t = _mm_unpackhi_ps( a_v, b_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + float32x2_t c_vh = vget_high_f32( c_v ); + float32x2_t d_vh = vget_high_f32( d_v ); + + float32x2x2_t res_cd_h = vzip_f32( c_vh, d_vh ); + + u = vcombine_f32( res_cd_h.val[0], res_cd_h.val[1] ); + //----------------------------------------------------------------- + // u = _mm_unpackhi_ps( c_v, d_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + float32x2_t a_vl = vget_low_f32( a_v ); + float32x2_t b_vl = vget_low_f32( b_v ); + + float32x2x2_t res_ab_l = vzip_f32( a_vl, b_vl ); + + a_v = vcombine_f32( res_ab_l.val[0], res_ab_l.val[1] ); + //----------------------------------------------------------------- + // a_v = _mm_unpacklo_ps( a_v, b_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + float32x2_t c_vl = vget_low_f32( c_v ); + float32x2_t d_vl = vget_low_f32( d_v ); + + float32x2x2_t res_cd_l = vzip_f32( c_vl, d_vl ); + + c_v 
= vcombine_f32( res_cd_l.val[0], res_cd_l.val[1] ); + //----------------------------------------------------------------- + // c_v = _mm_unpacklo_ps( c_v, d_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + a.v[0] = a_v[0]; + a.v[1] = a_v[1]; + a.v[2] = c_v[0]; + a.v[3] = c_v[1]; + //----------------------------------------------------------------- + // a.v = _mm_movelh_ps( a_v, c_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + c.v[0] = t[0]; + c.v[1] = t[1]; + c.v[2] = u[0]; + c.v[3] = u[1]; + //----------------------------------------------------------------- + // c.v = _mm_movelh_ps( t, u ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + b.v[0] = a_v[2]; + b.v[1] = a_v[3]; + b.v[2] = c_v[2]; + b.v[3] = c_v[3]; + //----------------------------------------------------------------- + // b.v = _mm_movehl_ps( c_v, a_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + d.v[0] = t[2]; + d.v[1] = t[3]; + d.v[2] = u[2]; + d.v[3] = u[3]; + //----------------------------------------------------------------- + // d.v = _mm_movehl_ps( u, t ); + //----------------------------------------------------------------- + + // a.i[0] = ((const int * ALIGNED(16))a0)[0]; + // b.i[0] = ((const int * ALIGNED(16))a0)[1]; + // c.i[0] = ((const int * ALIGNED(16))a0)[2]; + // d.i[0] = ((const int * ALIGNED(16))a0)[3]; + + // a.i[1] = ((const int * ALIGNED(16))a1)[0]; + // b.i[1] = ((const int * ALIGNED(16))a1)[1]; + // c.i[1] = ((const int * ALIGNED(16))a1)[2]; + // d.i[1] = ((const int * ALIGNED(16))a1)[3]; + + // a.i[2] = ((const int * ALIGNED(16))a2)[0]; + // b.i[2] = ((const int * ALIGNED(16))a2)[1]; + 
// c.i[2] = ((const int * ALIGNED(16))a2)[2]; + // d.i[2] = ((const int * ALIGNED(16))a2)[3]; + + // a.i[3] = ((const int * ALIGNED(16))a3)[0]; + // b.i[3] = ((const int * ALIGNED(16))a3)[1]; + // c.i[3] = ((const int * ALIGNED(16))a3)[2]; + // d.i[3] = ((const int * ALIGNED(16))a3)[3]; } inline void store_4x1_tr( const v4 &a, @@ -388,25 +499,123 @@ namespace v4 void * ALIGNED(16) a0, void * ALIGNED(16) a1, void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; - ((int * ALIGNED(16))a0)[3] = d.i[0]; - - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; - ((int * ALIGNED(16))a1)[3] = d.i[1]; - - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; - ((int * ALIGNED(16))a2)[3] = d.i[2]; - - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; - ((int * ALIGNED(16))a3)[3] = d.i[3]; + //----------------------------------------------------------------- + float32x4_t a_v = a.v, b_v = b.v, c_v = c.v, d_v = d.v, t, u; + //----------------------------------------------------------------- + // __m128 a_v = a.v, b_v = b.v, c_v = c.v, d_v = d.v, t, u; + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + float32x2_t a_vh = vget_high_f32( a_v ); + float32x2_t b_vh = vget_high_f32( b_v ); + + float32x2x2_t res_ab_h = vzip_f32( a_vh, b_vh ); + + t = vcombine_f32( res_ab_h.val[0], res_ab_h.val[1] ); + //----------------------------------------------------------------- + // t = _mm_unpackhi_ps( a_v, b_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + float32x2_t a_vl = vget_low_f32( a_v ); + 
float32x2_t b_vl = vget_low_f32( b_v ); + + float32x2x2_t res_ab_l = vzip_f32( a_vl, b_vl ); + + a_v = vcombine_f32( res_ab_l.val[0], res_ab_l.val[1] ); + //----------------------------------------------------------------- + // a_v = _mm_unpacklo_ps( a_v, b_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + float32x2_t c_vh = vget_high_f32( c_v ); + float32x2_t d_vh = vget_high_f32( d_v ); + + float32x2x2_t res_cd_h = vzip_f32( c_vh, d_vh ); + + u = vcombine_f32( res_cd_h.val[0], res_cd_h.val[1] ); + //----------------------------------------------------------------- + // u = _mm_unpackhi_ps( c_v, d_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + float32x2_t c_vl = vget_low_f32( c_v ); + float32x2_t d_vl = vget_low_f32( d_v ); + + float32x2x2_t res_cd_l = vzip_f32( c_vl, d_vl ); + + c_v = vcombine_f32( res_cd_l.val[0], res_cd_l.val[1] ); + //----------------------------------------------------------------- + // c_v = _mm_unpacklo_ps( c_v, d_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + b_v[0] = a_v[2]; + b_v[1] = a_v[3]; + b_v[2] = c_v[2]; + b_v[3] = c_v[3]; + //----------------------------------------------------------------- + // b_v = _mm_movehl_ps( c_v, a_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + a_v[0] = a_v[0]; + a_v[1] = a_v[1]; + a_v[2] = c_v[0]; + a_v[3] = c_v[1]; + //----------------------------------------------------------------- + // a_v = _mm_movelh_ps( a_v, c_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + c_v[0] = t[0]; + c_v[1] = t[1]; + c_v[2] = 
u[0]; + c_v[3] = u[1]; + //----------------------------------------------------------------- + // c_v = _mm_movelh_ps( t, u ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + d_v[0] = t[2]; + d_v[1] = t[3]; + d_v[2] = u[2]; + d_v[3] = u[3]; + //----------------------------------------------------------------- + // d_v = _mm_movehl_ps( u, t ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + vst1q_f32( (float *) a0, a_v ); + vst1q_f32( (float *) a1, b_v ); + vst1q_f32( (float *) a2, c_v ); + vst1q_f32( (float *) a3, d_v ); + //----------------------------------------------------------------- + // _mm_store_ps( (float *) a0, a_v ); + // _mm_store_ps( (float *) a1, b_v ); + // _mm_store_ps( (float *) a2, c_v ); + // _mm_store_ps( (float *) a3, d_v ); + //----------------------------------------------------------------- + + // ((int * ALIGNED(16))a0)[0] = a.i[0]; + // ((int * ALIGNED(16))a0)[1] = b.i[0]; + // ((int * ALIGNED(16))a0)[2] = c.i[0]; + // ((int * ALIGNED(16))a0)[3] = d.i[0]; + + // ((int * ALIGNED(16))a1)[0] = a.i[1]; + // ((int * ALIGNED(16))a1)[1] = b.i[1]; + // ((int * ALIGNED(16))a1)[2] = c.i[1]; + // ((int * ALIGNED(16))a1)[3] = d.i[1]; + + // ((int * ALIGNED(16))a2)[0] = a.i[2]; + // ((int * ALIGNED(16))a2)[1] = b.i[2]; + // ((int * ALIGNED(16))a2)[2] = c.i[2]; + // ((int * ALIGNED(16))a2)[3] = d.i[2]; + + // ((int * ALIGNED(16))a3)[0] = a.i[3]; + // ((int * ALIGNED(16))a3)[1] = b.i[3]; + // ((int * ALIGNED(16))a3)[2] = c.i[3]; + // ((int * ALIGNED(16))a3)[3] = d.i[3]; } ////////////// From 2a2fedde10e73d5fffabb5173a2b46746f36308a Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 15 Jul 2019 14:07:06 -0600 Subject: [PATCH 22/95] Add NEON intrinsics support for v4float assignment operators. 
--- src/util/v4/v4_avx2.h | 8 ++++---- src/util/v4/v4_neon.h | 43 ++++++++++++++++++++++++++++++++----------- 2 files changed, 36 insertions(+), 15 deletions(-) diff --git a/src/util/v4/v4_avx2.h b/src/util/v4/v4_avx2.h index 104121db..1fdde94a 100644 --- a/src/util/v4/v4_avx2.h +++ b/src/util/v4/v4_avx2.h @@ -592,7 +592,7 @@ namespace v4 // v4int assignment operators -# define ASSIGN(op) \ + #define ASSIGN(op) \ inline v4int &operator op( const v4int &b ) \ { \ i[0] op b.i[0]; \ @@ -639,7 +639,7 @@ namespace v4 ASSIGN(<<=) ASSIGN(>>=) -# undef ASSIGN + #undef ASSIGN // v4int member access operator @@ -967,7 +967,7 @@ namespace v4 // v4float assignment operators -# define ASSIGN(op,intrin) \ + #define ASSIGN(op,intrin) \ inline v4float &operator op( const v4float &b ) \ { \ v = intrin( v, b.v ); \ @@ -986,7 +986,7 @@ namespace v4 ASSIGN( *=, _mm_mul_ps ) ASSIGN( /=, _mm_div_ps ) -# undef ASSIGN + #undef ASSIGN // v4float member access operator diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index f1a691a5..2ec33500 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -759,7 +759,7 @@ namespace v4 // v4int assignment operators -# define ASSIGN(op) \ + #define ASSIGN(op) \ inline v4int &operator op( const v4int &b ) \ { \ ALWAYS_VECTORIZE \ @@ -780,7 +780,7 @@ namespace v4 ASSIGN(<<=) ASSIGN(>>=) -# undef ASSIGN + #undef ASSIGN // v4int member access operator @@ -1073,23 +1073,44 @@ namespace v4 // v4float assignment operators - #define ASSIGN(op) \ + #define ASSIGN(op,intrin) \ inline v4float &operator op( const v4float &b ) \ { \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 4; j++ ) \ - f[j] op b.f[j]; \ + v = intrin( v, b.v ); \ return *this; \ } - ASSIGN(=) - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) + inline v4float &operator =( const v4float &b ) + { + v = b.v; + + return *this; + } + + ASSIGN( +=, vaddq_f32 ) + ASSIGN( -=, vsubq_f32 ) + ASSIGN( *=, vmulq_f32 ) + ASSIGN( /=, vdivq_f32 ) #undef ASSIGN + // #define ASSIGN(op) \ + // 
inline v4float &operator op( const v4float &b ) \ + // { \ + // ALWAYS_VECTORIZE \ + // for( int j = 0; j < 4; j++ ) \ + // f[j] op b.f[j]; \ + // return *this; \ + // } + + // ASSIGN(=) + // ASSIGN(+=) + // ASSIGN(-=) + // ASSIGN(*=) + // ASSIGN(/=) + + // #undef ASSIGN + // v4float member access operator inline float &operator []( int n ) From 29c7a566202f5e5262d724ff7760826d68c9f3c4 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 15 Jul 2019 15:38:59 -0600 Subject: [PATCH 23/95] Add NEON intrinsics support for rsqrt and rcp functions. --- src/util/v4/v4_avx2.h | 4 +- src/util/v4/v4_neon.h | 91 +++++++++++++++++++++++++++++-------------- 2 files changed, 63 insertions(+), 32 deletions(-) diff --git a/src/util/v4/v4_avx2.h b/src/util/v4/v4_avx2.h index 1fdde94a..c8132a3c 100644 --- a/src/util/v4/v4_avx2.h +++ b/src/util/v4/v4_avx2.h @@ -1202,8 +1202,8 @@ namespace v4 return c; } -# undef CMATH_FR1 -# undef CMATH_FR2 + #undef CMATH_FR1 + #undef CMATH_FR2 // v4float miscelleanous functions diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 2ec33500..4c6efd3c 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -797,7 +797,7 @@ namespace v4 // v4int prefix unary operators -# define PREFIX_UNARY(op) \ + #define PREFIX_UNARY(op) \ inline v4int operator op( const v4int & a ) \ { \ v4int b; \ @@ -823,11 +823,11 @@ namespace v4 PREFIX_UNARY(~) -# undef PREFIX_UNARY + #undef PREFIX_UNARY // v4int prefix increment / decrement -# define PREFIX_INCDEC(op) \ + #define PREFIX_INCDEC(op) \ inline v4int operator op( v4int & a ) \ { \ v4int b; \ @@ -840,11 +840,11 @@ namespace v4 PREFIX_INCDEC(++) PREFIX_INCDEC(--) -# undef PREFIX_INCDEC + #undef PREFIX_INCDEC // v4int postfix increment / decrement -# define POSTFIX_INCDEC(op) \ + #define POSTFIX_INCDEC(op) \ inline v4int operator op( v4int & a, int ) \ { \ v4int b; \ @@ -857,11 +857,11 @@ namespace v4 POSTFIX_INCDEC(++) POSTFIX_INCDEC(--) -# undef POSTFIX_INCDEC + #undef POSTFIX_INCDEC // v4int 
binary operators -# define BINARY(op) \ + #define BINARY(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ { \ v4int c; \ @@ -882,11 +882,11 @@ namespace v4 BINARY(<<) BINARY(>>) -# undef BINARY + #undef BINARY // v4int logical operators -# define LOGICAL(op) \ + #define LOGICAL(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ { \ v4int c; \ @@ -905,7 +905,7 @@ namespace v4 LOGICAL(&&) LOGICAL(||) -# undef LOGICAL + #undef LOGICAL // v4int miscellaneous functions @@ -996,8 +996,8 @@ namespace v4 // v4float math library friends -# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE -# define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ + #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE + #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ const v4float &b ) ALWAYS_INLINE CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); @@ -1008,8 +1008,8 @@ namespace v4 CMATH_FR2(copysign); -# undef CMATH_FR1 -# undef CMATH_FR2 + #undef CMATH_FR1 + #undef CMATH_FR2 // v4float miscellaneous friends @@ -1266,7 +1266,7 @@ namespace v4 // v4float math library functions -# define CMATH_FR1(fn) \ + #define CMATH_FR1(fn) \ inline v4float fn( const v4float &a ) \ { \ v4float b; \ @@ -1276,7 +1276,7 @@ namespace v4 return b; \ } -# define CMATH_FR2(fn) \ + #define CMATH_FR2(fn) \ inline v4float fn( const v4float &a, const v4float &b ) \ { \ v4float c; \ @@ -1308,8 +1308,8 @@ namespace v4 return c; } -# undef CMATH_FR1 -# undef CMATH_FR2 + #undef CMATH_FR1 + #undef CMATH_FR2 // v4float miscellaneous functions @@ -1317,9 +1317,11 @@ namespace v4 { v4float b; - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = ::sqrt( 1.0f / a.f[j] ); + b.v = vrsqrteq_f32( a.v ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // b.f[j] = ::sqrt( 1.0f / a.f[j] ); return b; } @@ -1328,9 +1330,26 @@ namespace v4 { v4float b; - ALWAYS_VECTORIZE - 
for( int j = 0; j < 4; j++ ) - b.f[j] = ::sqrt( 1.0f / a.f[j] ); + float32x4_t a_v = a.v, b_v; + + b_v = vrsqrteq_f32( a_v ); + + // Note: It is quicker to just call div_ps and sqrt_ps if more + // refinement desired! + b.v = vaddq_f32( b_v, vmulq_f32( vdupq_n_f32( 0.5f ), + vsubq_f32( b_v, + vmulq_f32( a_v, + vmulq_f32( b_v, + vmulq_f32( b_v, b_v ) + ) + ) + ) + ) + ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // b.f[j] = ::sqrt( 1.0f / a.f[j] ); return b; } @@ -1339,9 +1358,11 @@ namespace v4 { v4float b; - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = 1.0f / a.f[j]; + b.v = vrecpeq_f32( a.v ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // b.f[j] = 1.0f / a.f[j]; return b; } @@ -1350,9 +1371,19 @@ namespace v4 { v4float b; - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = 1.0f / a.f[j]; + float32x4_t a_v = a.v, b_v; + + b_v = vrecpeq_f32( a_v ); + + b.v = vsubq_f32( vaddq_f32( b_v, b_v ), + vmulq_f32( a_v, + vmulq_f32( b_v, b_v ) + ) + ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // b.f[j] = 1.0f / a.f[j]; return b; } From a482375d202992045250f884ced3194ff5e5f09d Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 15 Jul 2019 17:31:49 -0600 Subject: [PATCH 24/95] Add NEON intrinsics support for transpose function. 
--- src/util/v4/v4_neon.h | 88 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 85 insertions(+), 3 deletions(-) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 4c6efd3c..964510c9 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -192,9 +192,91 @@ namespace v4 inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) { - sw( a0.i[1],a1.i[0] ); sw( a0.i[2],a2.i[0] ); sw( a0.i[3],a3.i[0] ); - sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] ); - sw( a2.i[3],a3.i[2] ); + fload32x4_t a0_v, a2_v, t, u; + + //----------------------------------------------------------------- + float32x2_t a0_vh = vget_high_f32( a0.v ); + float32x2_t a1_vh = vget_high_f32( a1.v ); + + float32x2x2_t res_a0a1_h = vzip_f32( a0_vh, a1_vh ); + + t = vcombine_f32( res_a0a1_h.val[0], res_a0a1_h.val[1] ); + //----------------------------------------------------------------- + // t = _mm_unpackhi_ps( a0.v, a1.v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + float32x2_t a0_vl = vget_low_f32( a0.v ); + float32x2_t a1_vl = vget_low_f32( a1.v ); + + float32x2x2_t res_a0a1_l = vzip_f32( a0_vl, a1_vl ); + + a0_v = vcombine_f32( res_a0a1_l.val[0], res_a0a1_l.val[1] ); + //----------------------------------------------------------------- + // a0_v = _mm_unpacklo_ps( a0.v, a1.v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + float32x2_t a2_vh = vget_high_f32( a2.v ); + float32x2_t a3_vh = vget_high_f32( a3.v ); + + float32x2x2_t res_a2a3_h = vzip_f32( a2_vh, a3_vh ); + + u = vcombine_f32( res_a2a3_h.val[0], res_a2a3_h.val[1] ); + //----------------------------------------------------------------- + // u = _mm_unpackhi_ps( a2.v, a3.v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + 
float32x2_t a2_vl = vget_low_f32( a2.v ); + float32x2_t a3_vl = vget_low_f32( a3.v ); + + float32x2x2_t res_a2a3_l = vzip_f32( a2_vl, a3_vl ); + + a2_v = vcombine_f32( res_a2a3_l.val[0], res_a2a3_l.val[1] ); + //----------------------------------------------------------------- + // a2_v = _mm_unpacklo_ps( a2.v, a3.v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + a0.v[0] = a0_v[0]; + a0.v[1] = a0_v[1]; + a0.v[2] = a2_v[0]; + a0.v[3] = a2_v[1]; + //----------------------------------------------------------------- + // a0.v = _mm_movelh_ps( a0_v, a2_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + a1.v[0] = a0_v[2]; + a1.v[1] = a0_v[3]; + a1.v[2] = a2_v[2]; + a1.v[3] = a2_v[3]; + //----------------------------------------------------------------- + // a1.v = _mm_movehl_ps( a2_v, a0_v ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + a2.v[0] = t[0]; + a2.v[1] = t[1]; + a2.v[2] = u[0]; + a2.v[3] = u[1]; + //----------------------------------------------------------------- + // a2.v = _mm_movelh_ps( t, u ); + //----------------------------------------------------------------- + + //----------------------------------------------------------------- + a3.v[0] = t[2]; + a3.v[1] = t[3]; + a3.v[2] = u[2]; + a3.v[3] = u[3]; + //----------------------------------------------------------------- + // a3.v = _mm_movehl_ps( u, t ); + //----------------------------------------------------------------- + + // sw( a0.i[1],a1.i[0] ); sw( a0.i[2],a2.i[0] ); sw( a0.i[3],a3.i[0] ); + // sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] ); + // sw( a2.i[3],a3.i[2] ); } #undef sw From 7353705152f52f28e8c888393e091f6c13dbebdd Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 15 Jul 2019 17:39:16 -0600 
Subject: [PATCH 25/95] Fix a typo. --- src/util/v4/v4_neon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 964510c9..5f3a48ac 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -192,7 +192,7 @@ namespace v4 inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) { - fload32x4_t a0_v, a2_v, t, u; + float32x4_t a0_v, a2_v, t, u; //----------------------------------------------------------------- float32x2_t a0_vh = vget_high_f32( a0.v ); From 4a80b0170363386b2bf37fa94f4487415b33646c Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 15 Jul 2019 21:02:58 -0600 Subject: [PATCH 26/95] Add support for benchmarking center_p and uncenter_p. --- src/vpic/initialize.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/vpic/initialize.cc b/src/vpic/initialize.cc index 8cc28da0..4aa7f0e7 100644 --- a/src/vpic/initialize.cc +++ b/src/vpic/initialize.cc @@ -51,6 +51,12 @@ vpic_simulation::initialize( int argc, if( rank()==0 ) MESSAGE(( "Uncentering particles" )); TIC load_interpolator_array( interpolator_array, field_array ); TOC( load_interpolator, 1 ); } + LIST_FOR_EACH( sp, species_list ) TIC sort_p( sp ); TOC( sort_p, 1 ); + for( int iwdn = 0; iwdn < 1000; iwdn++ ) + { + LIST_FOR_EACH( sp, species_list ) TIC uncenter_p( sp, interpolator_array ); TOC( uncenter_p, 1 ); + LIST_FOR_EACH( sp, species_list ) TIC center_p( sp, interpolator_array ); TOC( center_p, 1 ); + } LIST_FOR_EACH( sp, species_list ) TIC uncenter_p( sp, interpolator_array ); TOC( uncenter_p, 1 ); if( rank()==0 ) MESSAGE(( "Performing initial diagnostics" )); From 0c16a2024b7d382b91718461ab3ff74625718240 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Tue, 16 Jul 2019 11:18:01 -0600 Subject: [PATCH 27/95] Change number of iterations for uncenter/center loop. 
--- src/vpic/initialize.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vpic/initialize.cc b/src/vpic/initialize.cc index 4aa7f0e7..4961559b 100644 --- a/src/vpic/initialize.cc +++ b/src/vpic/initialize.cc @@ -52,7 +52,7 @@ vpic_simulation::initialize( int argc, TIC load_interpolator_array( interpolator_array, field_array ); TOC( load_interpolator, 1 ); } LIST_FOR_EACH( sp, species_list ) TIC sort_p( sp ); TOC( sort_p, 1 ); - for( int iwdn = 0; iwdn < 1000; iwdn++ ) + for( int iwdn = 0; iwdn < 100; iwdn++ ) { LIST_FOR_EACH( sp, species_list ) TIC uncenter_p( sp, interpolator_array ); TOC( uncenter_p, 1 ); LIST_FOR_EACH( sp, species_list ) TIC center_p( sp, interpolator_array ); TOC( center_p, 1 ); From ec22f2c32a76e2cd028d503e26032dd0b8c1a530 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Tue, 16 Jul 2019 21:00:19 -0600 Subject: [PATCH 28/95] Add NEON intrinsic support for v4float logical operators to v4_neon.h. --- src/util/v4/v4_avx2.h | 2 +- src/util/v4/v4_neon.h | 102 +++++++++++++++++++++++++++++++++++------- 2 files changed, 88 insertions(+), 16 deletions(-) diff --git a/src/util/v4/v4_avx2.h b/src/util/v4/v4_avx2.h index c8132a3c..4800c98e 100644 --- a/src/util/v4/v4_avx2.h +++ b/src/util/v4/v4_avx2.h @@ -1101,7 +1101,7 @@ namespace v4 // v4float logical operators -# define LOGICAL(op,intrin) \ + #define LOGICAL(op,intrin) \ inline v4int operator op( const v4float &a, const v4float &b ) \ { \ v4int c; \ diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 5f3a48ac..763519f9 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -123,8 +123,10 @@ namespace v4 union { - int i[4]; - float f[4]; + int i[4]; + float f[4]; + int32x4_t vsi; + uint32x4_t vui; float32x4_t v; }; @@ -1325,26 +1327,96 @@ namespace v4 // v4float logical operators -# define LOGICAL(op) \ + #define LOGICAL(op,intrin) \ inline v4int operator op( const v4float &a, const v4float &b ) \ { \ v4int c; \ - ALWAYS_VECTORIZE \ - for( int j = 0; j < 
4; j++ ) \ - c.i[j] = - ( a.f[j] op b.f[j] ); \ + c.v = intrin( a.v, b.v ); \ return c; \ } - LOGICAL(< ) - LOGICAL(> ) - LOGICAL(==) - LOGICAL(!=) - LOGICAL(<=) - LOGICAL(>=) - LOGICAL(&&) - LOGICAL(||) + LOGICAL( <, vcltq_f32 ) + LOGICAL( >, vcgtq_f32 ) + LOGICAL( ==, vceqq_f32 ) + LOGICAL( <=, vcleq_f32 ) + LOGICAL( >=, vcgeq_f32 ) + // LOGICAL( !=, _mm_cmpneq_ps ) + + inline v4int operator !=( const v4float &a, const v4float &b ) + { + v4int c; + + // r.neon_u32 = vmvnq_u32(vceqq_f32(a.neon_f32, b.neon_f32)); + // return type looks wrong here. try adding uint32x4_t vi to + // the union. may need to do a cast. + + c.vui = vmvnq_u32( vceqq_f32( a.v, b.v ) ); + + return c; + } + + inline v4int operator &&( const v4float &a, const v4float &b ) + { + v4int c; + + float32x4_t vzero = vdupq_n_f32(0.0f); + + // __m128 vzero = _mm_setzero_ps(); + + // Is there a better way to do this than the SSE way? + c.vsi = vandq_s32( vmvnq_u32( vceqq_f32( a.v, + vzero ) ), + vmvnq_u32( vceqq_f32( b.v, + vzero ) ) ); + + // c.v = _mm_and_ps( _mm_cmpneq_ps( a.v, vzero ), + // _mm_cmpneq_ps( b.v, vzero ) ); + + return c; + } + + inline v4int operator ||( const v4float &a, const v4float &b ) + { + v4int c; + + float32x4_t vzero = vdupq_n_f32(0.0f); + + // __m128 vzero = _mm_setzero_ps(); + + // Is there a better way to do this than the SSE way? 
+ c.vsi = vorrq_s32( vmvnq_u32( vceqq_f32( a.v, + vzero ) ), + vmvnq_u32( vceqq_f32( b.v, + vzero ) ) ); + + // c.v = _mm_or_ps( _mm_cmpneq_ps( a.v, vzero ), + // _mm_cmpneq_ps( b.v, vzero ) ); + + return c; + } + + #undef LOGICAL + + // #define LOGICAL(op) \ + // inline v4int operator op( const v4float &a, const v4float &b ) \ + // { \ + // v4int c; \ + // ALWAYS_VECTORIZE \ + // for( int j = 0; j < 4; j++ ) \ + // c.i[j] = - ( a.f[j] op b.f[j] ); \ + // return c; \ + // } + + // LOGICAL(< ) + // LOGICAL(> ) + // LOGICAL(==) + // LOGICAL(!=) + // LOGICAL(<=) + // LOGICAL(>=) + // LOGICAL(&&) + // LOGICAL(||) -# undef LOGICAL + // #undef LOGICAL // v4float math library functions From 6be57c295d5519329d54a8692d8c605ff7170672 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Wed, 17 Jul 2019 09:31:12 -0600 Subject: [PATCH 29/95] Try an implementation of load/store transpose operations using vld4q_f32 and vst4q_f32. --- src/util/v4/v4_neon.h | 142 ++++++++++++++++++++++++++++++++---------- 1 file changed, 108 insertions(+), 34 deletions(-) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 763519f9..1fd3d609 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -345,10 +345,14 @@ namespace v4 const void *a2, const void *a3, v4 &a ) { - a.i[0] = ((const int *)a0)[0]; - a.i[1] = ((const int *)a1)[0]; - a.i[2] = ((const int *)a2)[0]; - a.i[3] = ((const int *)a3)[0]; + float32x4x4_t mat = vld4q_f32( (const float *) a0 ); + + a.v = mat.val[0]; + + // a.i[0] = ((const int *)a0)[0]; + // a.i[1] = ((const int *)a1)[0]; + // a.i[2] = ((const int *)a2)[0]; + // a.i[3] = ((const int *)a3)[0]; } inline void load_4x2_tr( const void * ALIGNED(8) a0, @@ -357,55 +361,88 @@ namespace v4 const void * ALIGNED(8) a3, v4 &a, v4 &b ) { - // __m128 a_v, b_v, t; + float32x4x4_t mat = vld4q_f32( (const float *) a0 ); - // b_v = _mm_setzero_ps(); + a.v = mat.val[0]; + b.v = mat.val[1]; - // t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a0 ), (__m64 *)a1 ); - // b_v = 
_mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a2 ), (__m64 *)a3 ); + // a.i[0] = ((const int * ALIGNED(8))a0)[0]; + // b.i[0] = ((const int * ALIGNED(8))a0)[1]; - // a_v = _mm_shuffle_ps( t, b_v, 0x88 ); - // b_v = _mm_shuffle_ps( t, b_v, 0xdd ); + // a.i[1] = ((const int * ALIGNED(8))a1)[0]; + // b.i[1] = ((const int * ALIGNED(8))a1)[1]; - // a.v = a_v; - // b.v = b_v; + // a.i[2] = ((const int * ALIGNED(8))a2)[0]; + // b.i[2] = ((const int * ALIGNED(8))a2)[1]; - a.i[0] = ((const int * ALIGNED(8))a0)[0]; - b.i[0] = ((const int * ALIGNED(8))a0)[1]; + // a.i[3] = ((const int * ALIGNED(8))a3)[0]; + // b.i[3] = ((const int * ALIGNED(8))a3)[1]; + } - a.i[1] = ((const int * ALIGNED(8))a1)[0]; - b.i[1] = ((const int * ALIGNED(8))a1)[1]; + inline void load_4x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, v4 &b, v4 &c ) + { + float32x4x4_t mat = vld4q_f32( (const float *) a0 ); - a.i[2] = ((const int * ALIGNED(8))a2)[0]; - b.i[2] = ((const int * ALIGNED(8))a2)[1]; + a.v = mat.val[0]; + b.v = mat.val[1]; + c.v = mat.val[2]; - a.i[3] = ((const int * ALIGNED(8))a3)[0]; - b.i[3] = ((const int * ALIGNED(8))a3)[1]; + // a.i[0] = ((const int * ALIGNED(16))a0)[0]; + // b.i[0] = ((const int * ALIGNED(16))a0)[1]; + // c.i[0] = ((const int * ALIGNED(16))a0)[2]; + + // a.i[1] = ((const int * ALIGNED(16))a1)[0]; + // b.i[1] = ((const int * ALIGNED(16))a1)[1]; + // c.i[1] = ((const int * ALIGNED(16))a1)[2]; + + // a.i[2] = ((const int * ALIGNED(16))a2)[0]; + // b.i[2] = ((const int * ALIGNED(16))a2)[1]; + // c.i[2] = ((const int * ALIGNED(16))a2)[2]; + + // a.i[3] = ((const int * ALIGNED(16))a3)[0]; + // b.i[3] = ((const int * ALIGNED(16))a3)[1]; + // c.i[3] = ((const int * ALIGNED(16))a3)[2]; } - inline void load_4x3_tr( const void * ALIGNED(16) a0, + inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 
&b, v4 &c ) + v4 &a, v4 &b, v4 &c, v4 &d ) { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; + float32x4x4_t mat = vld4q_f32( (const float *) a0 ); + + a.v = mat.val[0]; + b.v = mat.val[1]; + c.v = mat.val[2]; + d.v = mat.val[3]; - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; + // a.i[0] = ((const int * ALIGNED(16))a0)[0]; + // b.i[0] = ((const int * ALIGNED(16))a0)[1]; + // c.i[0] = ((const int * ALIGNED(16))a0)[2]; + // d.i[0] = ((const int * ALIGNED(16))a0)[3]; - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; + // a.i[1] = ((const int * ALIGNED(16))a1)[0]; + // b.i[1] = ((const int * ALIGNED(16))a1)[1]; + // c.i[1] = ((const int * ALIGNED(16))a1)[2]; + // d.i[1] = ((const int * ALIGNED(16))a1)[3]; - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; + // a.i[2] = ((const int * ALIGNED(16))a2)[0]; + // b.i[2] = ((const int * ALIGNED(16))a2)[1]; + // c.i[2] = ((const int * ALIGNED(16))a2)[2]; + // d.i[2] = ((const int * ALIGNED(16))a2)[3]; + + // a.i[3] = ((const int * ALIGNED(16))a3)[0]; + // b.i[3] = ((const int * ALIGNED(16))a3)[1]; + // c.i[3] = ((const int * ALIGNED(16))a3)[2]; + // d.i[3] = ((const int * ALIGNED(16))a3)[3]; } + #if 0 inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, @@ -530,6 +567,7 @@ namespace v4 // c.i[3] = ((const int * ALIGNED(16))a3)[2]; // d.i[3] = ((const int * ALIGNED(16))a3)[3]; } + #endif inline void store_4x1_tr( const v4 &a, void *a0, void *a1, @@ -579,6 +617,41 @@ namespace v4 ((int * ALIGNED(16))a3)[2] = c.i[3]; } + inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, + void * ALIGNED(16) a0, void * 
ALIGNED(16) a1, + void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + { + float32x4x4_t mat; + + mat.val[0] = a.v; + mat.val[1] = b.v; + mat.val[2] = c.v; + mat.val[3] = d.v; + + vst4q_f32( (const float *) a0, mat ); + + // ((int * ALIGNED(16))a0)[0] = a.i[0]; + // ((int * ALIGNED(16))a0)[1] = b.i[0]; + // ((int * ALIGNED(16))a0)[2] = c.i[0]; + // ((int * ALIGNED(16))a0)[3] = d.i[0]; + + // ((int * ALIGNED(16))a1)[0] = a.i[1]; + // ((int * ALIGNED(16))a1)[1] = b.i[1]; + // ((int * ALIGNED(16))a1)[2] = c.i[1]; + // ((int * ALIGNED(16))a1)[3] = d.i[1]; + + // ((int * ALIGNED(16))a2)[0] = a.i[2]; + // ((int * ALIGNED(16))a2)[1] = b.i[2]; + // ((int * ALIGNED(16))a2)[2] = c.i[2]; + // ((int * ALIGNED(16))a2)[3] = d.i[2]; + + // ((int * ALIGNED(16))a3)[0] = a.i[3]; + // ((int * ALIGNED(16))a3)[1] = b.i[3]; + // ((int * ALIGNED(16))a3)[2] = c.i[3]; + // ((int * ALIGNED(16))a3)[3] = d.i[3]; + } + + #if 0 inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, void * ALIGNED(16) a0, void * ALIGNED(16) a1, void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) @@ -701,6 +774,7 @@ namespace v4 // ((int * ALIGNED(16))a3)[2] = c.i[3]; // ((int * ALIGNED(16))a3)[3] = d.i[3]; } + #endif ////////////// // v4int class From e7fd3e09bed07c7d477bddf0157cfe358b767b12 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Thu, 18 Jul 2019 16:26:03 -0600 Subject: [PATCH 30/95] Comment out new implementations for load_4x4_tr and store_4x4_tr since they will not work. 
--- src/util/v4/v4_neon.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 1fd3d609..bdd30925 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -408,6 +408,7 @@ namespace v4 // c.i[3] = ((const int * ALIGNED(16))a3)[2]; } + #if 0 inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, @@ -441,8 +442,9 @@ namespace v4 // c.i[3] = ((const int * ALIGNED(16))a3)[2]; // d.i[3] = ((const int * ALIGNED(16))a3)[3]; } + #endif - #if 0 + #if 1 inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, @@ -617,6 +619,7 @@ namespace v4 ((int * ALIGNED(16))a3)[2] = c.i[3]; } + #if 0 inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, void * ALIGNED(16) a0, void * ALIGNED(16) a1, void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) @@ -650,8 +653,9 @@ namespace v4 // ((int * ALIGNED(16))a3)[2] = c.i[3]; // ((int * ALIGNED(16))a3)[3] = d.i[3]; } + #endif - #if 0 + #if 1 inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, void * ALIGNED(16) a0, void * ALIGNED(16) a1, void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) From d9b7adb51a0460c5d413d0406b06239d06959f16 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 22 Jul 2019 15:11:50 -0600 Subject: [PATCH 31/95] More work on v4_neon support. 
--- src/util/v4/v4_avx2.h | 182 ++++++++------ src/util/v4/v4_neon.h | 553 +++++++++++++++++++++++++++++++----------- 2 files changed, 518 insertions(+), 217 deletions(-) diff --git a/src/util/v4/v4_avx2.h b/src/util/v4/v4_avx2.h index 4800c98e..023ba95a 100644 --- a/src/util/v4/v4_avx2.h +++ b/src/util/v4/v4_avx2.h @@ -29,7 +29,7 @@ namespace v4 constexpr static int value = i0 + i1*4 + i2*16 + i3*64; }; -# define PERM(i0,i1,i2,i3) ((i0) + (i1)*4 + (i2)*16 + (i3)*64) + #define PERM(i0,i1,i2,i3) ((i0) + (i1)*4 + (i2)*16 + (i3)*64) //////////////// // v4 base class @@ -151,8 +151,8 @@ namespace v4 template inline v4 splat( const v4 & a ) { - __m128 a_v = a.v; v4 b; + __m128 a_v = a.v; b.v = _mm_shuffle_ps( a_v, a_v, ( n*permute<1,1,1,1>::value ) ); @@ -162,8 +162,8 @@ namespace v4 template inline v4 shuffle( const v4 & a ) { - __m128 a_v = a.v; v4 b; + __m128 a_v = a.v; b.v = _mm_shuffle_ps( a_v, a_v, ( permute::value ) ); @@ -231,7 +231,8 @@ namespace v4 } /* FIXME: MAKE ROBUST AGAINST ALIASING ISSUES */ - inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) + inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) { __m128 t = _mm_load_ps( ( float * ) a ); @@ -241,21 +242,24 @@ namespace v4 // v4 transposed memory manipulation functions - inline void load_4x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, + inline void load_4x1_tr( const void *a0, + const void *a1, + const void *a2, + const void *a3, v4 &a ) { - a.v = _mm_setr_ps( ((const float *)a0)[0], - ((const float *)a1)[0], - ((const float *)a2)[0], - ((const float *)a3)[0] ); + a.v = _mm_setr_ps( ( (const float *) a0 )[0], + ( (const float *) a1 )[0], + ( (const float *) a2 )[0], + ( (const float *) a3 )[0] ); } inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, - v4 &a, v4 &b ) + v4 &a, + v4 &b ) { __m128 a_v, b_v, t; @@ -275,7 +279,9 @@ namespace v4 const void * 
ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ) + v4 &a, + v4 &b, + v4 &c ) { __m128 a_v, b_v, c_v, t, u; @@ -298,12 +304,16 @@ namespace v4 c.v = c_v; } -#if 0 + #if 0 inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) { + v4 &a, + v4 &b, + v4 &c, + v4 &d ) + { __m128 a_v, b_v, c_v, d_v, t, u; a_v = _mm_load_ps( (const float *)a0 ); b_v = _mm_load_ps( (const float *)a1 ); @@ -319,14 +329,18 @@ namespace v4 d_v = _mm_movehl_ps( u, t ); a.v = a_v; b.v = b_v; c.v = c_v; d.v = d_v; } -#endif + #endif -#if 0 + #if 0 inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) { + v4 &a, + v4 &b, + v4 &c, + v4 &d ) + { __m128 a_v, b_v, c_v, d_v, t, u; a_v = _mm_load_ps( (const float *)a0 ); @@ -344,14 +358,18 @@ namespace v4 c.v = _mm_movelh_ps( t, u ); d.v = _mm_movehl_ps( u, t ); } -#endif + #endif -#if 0 + #if 0 inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) { + v4 &a, + v4 &b, + v4 &c, + v4 &d ) + { __m128 a_v, b_v, c_v, d_v, t, u; a_v = _mm_load_ps( (const float *)a0 ); @@ -369,13 +387,16 @@ namespace v4 d.v = _mm_movehl_ps( u, t ); c.v = _mm_movelh_ps( t, u ); } -#endif + #endif inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) + v4 &a, + v4 &b, + v4 &c, + v4 &d ) { __m128 a_v, b_v, c_v, d_v, t, u; @@ -396,18 +417,23 @@ namespace v4 } inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, - void *a2, void *a3 ) + void *a0, + void *a1, + void *a2, + void *a3 ) { - ((float *)a0)[0] = a.f[0]; - ((float *)a1)[0] = a.f[1]; - ((float *)a2)[0] = 
a.f[2]; - ((float *)a3)[0] = a.f[3]; - } - - inline void store_4x2_tr( const v4 &a, const v4 &b, - void * ALIGNED(8) a0, void * ALIGNED(8) a1, - void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) + ( (float *) a0 )[0] = a.f[0]; + ( (float *) a1 )[0] = a.f[1]; + ( (float *) a2 )[0] = a.f[2]; + ( (float *) a3 )[0] = a.f[3]; + } + + inline void store_4x2_tr( const v4 &a, + const v4 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3 ) { __m128 a_v = a.v, b_v = b.v, t; @@ -422,9 +448,13 @@ namespace v4 _mm_storeh_pi( (__m64 *)a3, t ); // a3 b3 -> a3 } - inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + inline void store_4x3_tr( const v4 &a, + const v4 &b, + const v4 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) { __m128 a_v = a.v, b_v = b.v, t; @@ -445,10 +475,14 @@ namespace v4 } // FIXME: IS THIS FASTER THAN THE OLD WAY (HAD MORE STORE INSTR) - inline void store_4x4_tr( const v4 &a, const v4 &b, - const v4 &c, const v4 &d, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + inline void store_4x4_tr( const v4 &a, + const v4 &b, + const v4 &c, + const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) { __m128 a_v = a.v, b_v = b.v, c_v = c.v, d_v = d.v, t, u; @@ -602,6 +636,14 @@ namespace v4 return *this; \ } + ASSIGN(+=) + ASSIGN(-=) + ASSIGN(*=) + ASSIGN(/=) + ASSIGN(%=) + ASSIGN(<<=) + ASSIGN(>>=) + inline v4int &operator =( const v4int &b ) { v = b.v; @@ -609,12 +651,6 @@ namespace v4 return *this; } - ASSIGN(+=) - ASSIGN(-=) - ASSIGN(*=) - ASSIGN(/=) - ASSIGN(%=) - inline v4int &operator ^=( const v4int &b ) { v = _mm_xor_ps( v, b.v ); @@ -636,9 +672,6 @@ namespace v4 return *this; } - ASSIGN(<<=) - ASSIGN(>>=) - #undef ASSIGN // v4int member access 
operator @@ -656,7 +689,7 @@ namespace v4 // v4int prefix unary operators -# define PREFIX_UNARY(op) \ + #define PREFIX_UNARY(op) \ inline v4int operator op( const v4int & a ) \ { \ v4int b; \ @@ -706,11 +739,11 @@ namespace v4 return b; } -# undef PREFIX_UNARY + #undef PREFIX_UNARY // v4int prefix increment / decrement -# define PREFIX_INCDEC(op) \ + #define PREFIX_INCDEC(op) \ inline v4int operator op( v4int & a ) \ { \ v4int b; \ @@ -724,11 +757,11 @@ namespace v4 PREFIX_INCDEC(++) PREFIX_INCDEC(--) -# undef PREFIX_INCDEC + #undef PREFIX_INCDEC // v4int postfix increment / decrement -# define POSTFIX_INCDEC(op) \ + #define POSTFIX_INCDEC(op) \ inline v4int operator op( v4int & a, int ) \ { \ v4int b; \ @@ -742,11 +775,11 @@ namespace v4 POSTFIX_INCDEC(++) POSTFIX_INCDEC(--) -# undef POSTFIX_INCDEC + #undef POSTFIX_INCDEC // v4int binary operators -# define BINARY(op) \ + #define BINARY(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ { \ v4int c; \ @@ -762,6 +795,8 @@ namespace v4 BINARY(*) BINARY(/) BINARY(%) + BINARY(<<) + BINARY(>>) inline v4int operator ^( const v4int &a, const v4int &b ) { @@ -790,14 +825,11 @@ namespace v4 return c; } - BINARY(<<) - BINARY(>>) - -# undef BINARY + #undef BINARY // v4int logical operators -# define LOGICAL(op) \ + #define LOGICAL(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ { \ v4int c; \ @@ -817,7 +849,7 @@ namespace v4 LOGICAL(&&) LOGICAL(||) -# undef LOGICAL + #undef LOGICAL // v4int miscellaneous functions @@ -905,9 +937,9 @@ namespace v4 // v4float math library friends -# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE -# define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) ALWAYS_INLINE + #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE + #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ + const v4float &b ) ALWAYS_INLINE CMATH_FR1(acos); CMATH_FR1(asin); 
CMATH_FR1(atan); CMATH_FR2(atan2); CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); @@ -917,8 +949,8 @@ namespace v4 CMATH_FR2(copysign); -# undef CMATH_FR1 -# undef CMATH_FR2 + #undef CMATH_FR1 + #undef CMATH_FR2 // v4float miscellaneous friends @@ -974,6 +1006,11 @@ namespace v4 return *this; \ } + ASSIGN( +=, _mm_add_ps ) + ASSIGN( -=, _mm_sub_ps ) + ASSIGN( *=, _mm_mul_ps ) + ASSIGN( /=, _mm_div_ps ) + inline v4float &operator =( const v4float &b ) { v = b.v; @@ -981,11 +1018,6 @@ namespace v4 return *this; } - ASSIGN( +=, _mm_add_ps ) - ASSIGN( -=, _mm_sub_ps ) - ASSIGN( *=, _mm_mul_ps ) - ASSIGN( /=, _mm_div_ps ) - #undef ASSIGN // v4float member access operator @@ -1140,11 +1172,11 @@ namespace v4 return c; } -# undef LOGICAL + #undef LOGICAL // v4float math library functions -# define CMATH_FR1(fn) \ + #define CMATH_FR1(fn) \ inline v4float fn( const v4float &a ) \ { \ v4float b; \ @@ -1155,7 +1187,7 @@ namespace v4 return b; \ } -# define CMATH_FR2(fn) \ + #define CMATH_FR2(fn) \ inline v4float fn( const v4float &a, const v4float &b ) \ { \ v4float c; \ diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index bdd30925..1734f62a 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -162,6 +162,10 @@ namespace v4 inline v4 splat( const v4 & a ) { v4 b; + // __m128 a_v = a.v; + + // b.v = _mm_shuffle_ps( a_v, a_v, ( n*permute<1,1,1,1>::value ) ); + ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) @@ -174,6 +178,9 @@ namespace v4 inline v4 shuffle( const v4 & a ) { v4 b; + // __m128 a_v = a.v; + + // b.v = _mm_shuffle_ps( a_v, a_v, ( permute::value ) ); b.i[0] = a.i[i0]; b.i[1] = a.i[i1]; @@ -187,6 +194,12 @@ namespace v4 inline void swap( v4 &a, v4 &b ) { + // __m128 a_v = a.v; + + // a.v = b.v; + + // b.v = a_v; + ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) sw( a.i[j], b.i[j] ); @@ -288,22 +301,32 @@ namespace v4 inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - 
a.i[j] = ((const int * ALIGNED(16))p)[j]; + a.v = vld1q_f32( ( float * ) p ); + + // a.v = _mm_load_ps( ( float * ) p ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // a.i[j] = ((const int * ALIGNED(16))p)[j]; } inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - ((int * ALIGNED(16))p)[j] = a.i[j]; + vst1q_f32( ( float * ) p, a.v ); + + // _mm_store_ps( ( float * ) p, a.v ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // ((int * ALIGNED(16))p)[j] = a.i[j]; } inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) { + // _mm_stream_ps( ( float * ) p, a.v ); + ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) ((int * ALIGNED(16))p)[j] = a.i[j]; @@ -311,101 +334,165 @@ namespace v4 inline void clear_4x1( void * ALIGNED(16) p ) { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - ((int * ALIGNED(16))p)[j] = 0; + vst1q_f32( ( float * ) p, vdupq_n_f32( 0.0f ) ); + + // _mm_store_ps( ( float * ) p, _mm_setzero_ps() ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // ((int * ALIGNED(16))p)[j] = 0; } // FIXME: Ordering semantics inline void copy_4x1( void * ALIGNED(16) dst, const void * ALIGNED(16) src ) { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - ((int * ALIGNED(16))dst)[j] = ((const int * ALIGNED(16))src)[j]; + vst1q_f32( ( float * ) dst, vld1q_f32( ( const float * ) src ) ); + + // _mm_store_ps( ( float * ) dst, _mm_load_ps( ( const float * ) src ) ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // ((int * ALIGNED(16))dst)[j] = ((const int * ALIGNED(16))src)[j]; } inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) { - int t; + float32x4_t t = vld1q_f32( ( float * ) a ); - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - { - t = ((int * ALIGNED(16))a)[j]; - ((int * ALIGNED(16))a)[j] = ((int * ALIGNED(16))b)[j]; - ((int * ALIGNED(16))b)[j] = t; - } + vst1q_f32( ( float * ) a, vld1q_f32( ( float * ) b ) ); + vst1q_f32( ( float 
* ) b, t ); + + // __m128 t = _mm_load_ps( ( float * ) a ); + + // _mm_store_ps( ( float * ) a, _mm_load_ps( ( float * ) b ) ); + // _mm_store_ps( ( float * ) b, t ); + + // int t; + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // { + // t = ((int * ALIGNED(16))a)[j]; + // ((int * ALIGNED(16))a)[j] = ((int * ALIGNED(16))b)[j]; + // ((int * ALIGNED(16))b)[j] = t; + // } } // v4 transposed memory manipulation functions - inline void load_4x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, + inline void load_4x1_tr( const void *a0, + const void *a1, + const void *a2, + const void *a3, v4 &a ) { - float32x4x4_t mat = vld4q_f32( (const float *) a0 ); + // a.v = _mm_setr_ps( ( (const float *) a0 )[0], + // ( (const float *) a1 )[0], + // ( (const float *) a2 )[0], + // ( (const float *) a3 )[0] ); - a.v = mat.val[0]; + // Not correct. + // float32x4x4_t mat = vld4q_f32( (const float *) a0 ); - // a.i[0] = ((const int *)a0)[0]; - // a.i[1] = ((const int *)a1)[0]; - // a.i[2] = ((const int *)a2)[0]; - // a.i[3] = ((const int *)a3)[0]; + // a.v = mat.val[0]; + + a.i[0] = ((const int *)a0)[0]; + a.i[1] = ((const int *)a1)[0]; + a.i[2] = ((const int *)a2)[0]; + a.i[3] = ((const int *)a3)[0]; } inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, - v4 &a, v4 &b ) + v4 &a, + v4 &b ) { - float32x4x4_t mat = vld4q_f32( (const float *) a0 ); + // __m128 a_v, b_v, t; - a.v = mat.val[0]; - b.v = mat.val[1]; + // b_v = _mm_setzero_ps(); + + // t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a0 ), (__m64 *)a1 ); + // b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a2 ), (__m64 *)a3 ); + + // a_v = _mm_shuffle_ps( t, b_v, 0x88 ); + // b_v = _mm_shuffle_ps( t, b_v, 0xdd ); + + // a.v = a_v; + // b.v = b_v; + + // Not correct. 
+ // float32x4x4_t mat = vld4q_f32( (const float *) a0 ); + + // a.v = mat.val[0]; + // b.v = mat.val[1]; - // a.i[0] = ((const int * ALIGNED(8))a0)[0]; - // b.i[0] = ((const int * ALIGNED(8))a0)[1]; + a.i[0] = ((const int * ALIGNED(8))a0)[0]; + b.i[0] = ((const int * ALIGNED(8))a0)[1]; - // a.i[1] = ((const int * ALIGNED(8))a1)[0]; - // b.i[1] = ((const int * ALIGNED(8))a1)[1]; + a.i[1] = ((const int * ALIGNED(8))a1)[0]; + b.i[1] = ((const int * ALIGNED(8))a1)[1]; - // a.i[2] = ((const int * ALIGNED(8))a2)[0]; - // b.i[2] = ((const int * ALIGNED(8))a2)[1]; + a.i[2] = ((const int * ALIGNED(8))a2)[0]; + b.i[2] = ((const int * ALIGNED(8))a2)[1]; - // a.i[3] = ((const int * ALIGNED(8))a3)[0]; - // b.i[3] = ((const int * ALIGNED(8))a3)[1]; + a.i[3] = ((const int * ALIGNED(8))a3)[0]; + b.i[3] = ((const int * ALIGNED(8))a3)[1]; } inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ) + v4 &a, + v4 &b, + v4 &c ) { - float32x4x4_t mat = vld4q_f32( (const float *) a0 ); + // __m128 a_v, b_v, c_v, t, u; - a.v = mat.val[0]; - b.v = mat.val[1]; - c.v = mat.val[2]; + // t = _mm_load_ps( (const float *)a0 ); + // b_v = _mm_load_ps( (const float *)a1 ); + // c_v = _mm_load_ps( (const float *)a2 ); + // u = _mm_load_ps( (const float *)a3 ); - // a.i[0] = ((const int * ALIGNED(16))a0)[0]; - // b.i[0] = ((const int * ALIGNED(16))a0)[1]; - // c.i[0] = ((const int * ALIGNED(16))a0)[2]; + // a_v = _mm_unpacklo_ps( t, b_v ); + // b_v = _mm_unpackhi_ps( t, b_v ); + // t = _mm_unpacklo_ps( c_v, u ); + // u = _mm_unpackhi_ps( c_v, u ); - // a.i[1] = ((const int * ALIGNED(16))a1)[0]; - // b.i[1] = ((const int * ALIGNED(16))a1)[1]; - // c.i[1] = ((const int * ALIGNED(16))a1)[2]; + // c_v = _mm_movelh_ps( b_v, u ); + // b_v = _mm_movehl_ps( t, a_v ); + // a_v = _mm_movelh_ps( a_v, t ); - // a.i[2] = ((const int * ALIGNED(16))a2)[0]; - // b.i[2] = ((const int * ALIGNED(16))a2)[1]; - // 
c.i[2] = ((const int * ALIGNED(16))a2)[2]; + // a.v = a_v; + // b.v = b_v; + // c.v = c_v; - // a.i[3] = ((const int * ALIGNED(16))a3)[0]; - // b.i[3] = ((const int * ALIGNED(16))a3)[1]; - // c.i[3] = ((const int * ALIGNED(16))a3)[2]; + // Not correct. + // float32x4x4_t mat = vld4q_f32( (const float *) a0 ); + + // a.v = mat.val[0]; + // b.v = mat.val[1]; + // c.v = mat.val[2]; + + a.i[0] = ((const int * ALIGNED(16))a0)[0]; + b.i[0] = ((const int * ALIGNED(16))a0)[1]; + c.i[0] = ((const int * ALIGNED(16))a0)[2]; + + a.i[1] = ((const int * ALIGNED(16))a1)[0]; + b.i[1] = ((const int * ALIGNED(16))a1)[1]; + c.i[1] = ((const int * ALIGNED(16))a1)[2]; + + a.i[2] = ((const int * ALIGNED(16))a2)[0]; + b.i[2] = ((const int * ALIGNED(16))a2)[1]; + c.i[2] = ((const int * ALIGNED(16))a2)[2]; + + a.i[3] = ((const int * ALIGNED(16))a3)[0]; + b.i[3] = ((const int * ALIGNED(16))a3)[1]; + c.i[3] = ((const int * ALIGNED(16))a3)[2]; } #if 0 @@ -413,8 +500,12 @@ namespace v4 const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) + v4 &a, + v4 &b, + v4 &c, + v4 &d ) { + // Not correct. 
float32x4x4_t mat = vld4q_f32( (const float *) a0 ); a.v = mat.val[0]; @@ -449,7 +540,10 @@ namespace v4 const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) + v4 &a, + v4 &b, + v4 &c, + v4 &d ) { //----------------------------------------------------------------- float32x4_t a_v, b_v, c_v, d_v, t, u; @@ -572,58 +666,102 @@ namespace v4 #endif inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, - void *a2, void *a3 ) + void *a0, + void *a1, + void *a2, + void *a3 ) { - ((int *)a0)[0] = a.i[0]; - ((int *)a1)[0] = a.i[1]; - ((int *)a2)[0] = a.i[2]; - ((int *)a3)[0] = a.i[3]; + ( (int *) a0 )[0] = a.i[0]; + ( (int *) a1 )[0] = a.i[1]; + ( (int *) a2 )[0] = a.i[2]; + ( (int *) a3 )[0] = a.i[3]; } - inline void store_4x2_tr( const v4 &a, const v4 &b, - void * ALIGNED(8) a0, void * ALIGNED(8) a1, - void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) + inline void store_4x2_tr( const v4 &a, + const v4 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3 ) { - ((int * ALIGNED(8))a0)[0] = a.i[0]; - ((int * ALIGNED(8))a0)[1] = b.i[0]; + // __m128 a_v = a.v, b_v = b.v, t; + + // t = _mm_unpacklo_ps( a_v, b_v ); // a0 b0 a1 b1 -> t - ((int * ALIGNED(8))a1)[0] = a.i[1]; - ((int * ALIGNED(8))a1)[1] = b.i[1]; + // _mm_storel_pi( (__m64 *)a0, t ); // a0 b0 -> a0 + // _mm_storeh_pi( (__m64 *)a1, t ); // a1 b1 -> a1 - ((int * ALIGNED(8))a2)[0] = a.i[2]; - ((int * ALIGNED(8))a2)[1] = b.i[2]; + // t = _mm_unpackhi_ps( a_v, b_v ); // a2 b2 a3 b3 -> t - ((int * ALIGNED(8))a3)[0] = a.i[3]; - ((int * ALIGNED(8))a3)[1] = b.i[3]; + // _mm_storel_pi( (__m64 *)a2, t ); // a2 b2 -> a2 + // _mm_storeh_pi( (__m64 *)a3, t ); // a3 b3 -> a3 + + ( ( int * ALIGNED(8) ) a0 )[0] = a.i[0]; + ( ( int * ALIGNED(8) ) a0 )[1] = b.i[0]; + + ( ( int * ALIGNED(8) ) a1 )[0] = a.i[1]; + ( ( int * ALIGNED(8) ) a1 )[1] = b.i[1]; + + ( ( int * ALIGNED(8) ) a2 )[0] = a.i[2]; + ( ( int * ALIGNED(8) ) a2 
)[1] = b.i[2]; + + ( ( int * ALIGNED(8) ) a3 )[0] = a.i[3]; + ( ( int * ALIGNED(8) ) a3 )[1] = b.i[3]; } - inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + inline void store_4x3_tr( const v4 &a, + const v4 &b, + const v4 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; + // __m128 a_v = a.v, b_v = b.v, t; + + // t = _mm_unpacklo_ps( a_v, b_v ); // a0 b0 a1 b1 -> t + + // _mm_storel_pi( (__m64 *)a0, t ); // a0 b0 -> a0 + // _mm_storeh_pi( (__m64 *)a1, t ); // a1 b1 -> a1 - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; + // t = _mm_unpackhi_ps( a_v, b_v ); // a2 b2 a3 b3 -> t - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; + // _mm_storel_pi( (__m64 *)a2, t ); // a2 b2 -> a2 + // _mm_storeh_pi( (__m64 *)a3, t ); // a3 b3 -> a3 - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; + // ((float *)a0)[2] = c.f[0]; + // ((float *)a1)[2] = c.f[1]; + // ((float *)a2)[2] = c.f[2]; + // ((float *)a3)[2] = c.f[3]; + + ( ( int * ALIGNED(16) ) a0 )[0] = a.i[0]; + ( ( int * ALIGNED(16) ) a0 )[1] = b.i[0]; + ( ( int * ALIGNED(16) ) a0 )[2] = c.i[0]; + + ( ( int * ALIGNED(16) ) a1 )[0] = a.i[1]; + ( ( int * ALIGNED(16) ) a1 )[1] = b.i[1]; + ( ( int * ALIGNED(16) ) a1 )[2] = c.i[1]; + + ( ( int * ALIGNED(16) ) a2 )[0] = a.i[2]; + ( ( int * ALIGNED(16) ) a2 )[1] = b.i[2]; + ( ( int * ALIGNED(16) ) a2 )[2] = c.i[2]; + + ( ( int * ALIGNED(16) ) a3 )[0] = a.i[3]; + ( ( int * ALIGNED(16) ) a3 )[1] = b.i[3]; + ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; } #if 0 - inline void store_4x4_tr( const v4 &a, const 
v4 &b, const v4 &c, const v4 &d, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + inline void store_4x4_tr( const v4 &a, + const v4 &b, + const v4 &c, + const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) { + // Not correct. float32x4x4_t mat; mat.val[0] = a.v; @@ -656,9 +794,14 @@ namespace v4 #endif #if 1 - inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + inline void store_4x4_tr( const v4 &a, + const v4 &b, + const v4 &c, + const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) { //----------------------------------------------------------------- float32x4_t a_v = a.v, b_v = b.v, c_v = c.v, d_v = d.v, t, u; @@ -930,17 +1073,45 @@ namespace v4 return *this; \ } - ASSIGN( =) ASSIGN(+=) ASSIGN(-=) ASSIGN(*=) ASSIGN(/=) ASSIGN(%=) - ASSIGN(^=) - ASSIGN(&=) - ASSIGN(|=) ASSIGN(<<=) ASSIGN(>>=) + // ASSIGN( =) + // ASSIGN(^=) + // ASSIGN(&=) + // ASSIGN(|=) + + inline v4int &operator =( const v4int &b ) + { + v = b.v; + + return *this; + } + + inline v4int &operator ^=( const v4int &b ) + { + vsi = veorq_s32( vsi, b.vsi ); + + return *this; + } + + inline v4int &operator &=( const v4int &b ) + { + vsi = vandq_s32( vsi, b.vsi ); + + return *this; + } + + inline v4int &operator |=( const v4int &b ) + { + vsi = vorrq_s32( vsi, b.vsi ); + + return *this; + } #undef ASSIGN @@ -1038,11 +1209,38 @@ namespace v4 BINARY(*) BINARY(/) BINARY(%) - BINARY(^) - BINARY(&) - BINARY(|) BINARY(<<) BINARY(>>) + // BINARY(^) + // BINARY(&) + // BINARY(|) + + inline v4int operator ^( const v4int &a, const v4int &b ) + { + v4int c; + + c.vsi = veorq_s32( a.vsi, b.vsi ); + + return c; + } + + inline v4int operator &( const v4int &a, const v4int &b ) + { + v4int c; + + c.vsi = vandq_s32( a.vsi, 
b.vsi ); + + return c; + } + + inline v4int operator |( const v4int &a, const v4int &b ) + { + v4int c; + + c.vsi = vorrq_s32( a.vsi, b.vsi ); + + return c; + } #undef BINARY @@ -1086,9 +1284,13 @@ namespace v4 { v4 b; - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.i[j] = a.i[j] & ~c.i[j]; + b.vsi = vbicq_s32( c.vsi, a.vsi ); + + // b.v = _mm_andnot_ps( c.v, a.v ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // b.i[j] = a.i[j] & ~c.i[j]; return b; } @@ -1097,9 +1299,13 @@ namespace v4 { v4 b; - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.i[j] = a.i[j] & c.i[j]; + b.vsi = vandq_s32( c.vsi, a.vsi ); + + // b.v = _mm_and_ps( c.v, a.v ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // b.i[j] = a.i[j] & c.i[j]; return b; } @@ -1108,9 +1314,18 @@ namespace v4 { v4 m; - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - m.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); + m.vsi = vorrq_s32( vbicq_s32( c.vsi, f.vsi ), + vandq_s32( c.vsi, t.vsi ) ); + + // __m128 c_v = c.v; + // v4 tf; + + // tf.v = _mm_or_ps( _mm_andnot_ps( c_v, f.v ), + // _mm_and_ps( c_v, t.v ) ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // m.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); return m; } @@ -1225,6 +1440,8 @@ namespace v4 v4float( float f0, float f1, float f2, float f3 ) // Init from scalars { + // v = _mm_setr_ps( f0, f1, f2, f3 ); + f[0] = f0; f[1] = f1; f[2] = f2; @@ -1242,6 +1459,11 @@ namespace v4 return *this; \ } + ASSIGN( +=, vaddq_f32 ) + ASSIGN( -=, vsubq_f32 ) + ASSIGN( *=, vmulq_f32 ) + ASSIGN( /=, vdivq_f32 ) + inline v4float &operator =( const v4float &b ) { v = b.v; @@ -1249,11 +1471,6 @@ namespace v4 return *this; } - ASSIGN( +=, vaddq_f32 ) - ASSIGN( -=, vsubq_f32 ) - ASSIGN( *=, vmulq_f32 ) - ASSIGN( /=, vdivq_f32 ) - #undef ASSIGN // #define ASSIGN(op) \ @@ -1292,9 +1509,11 @@ namespace v4 { v4float b; - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = +a.f[j]; + b.v = a.v; + + // 
ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // b.f[j] = +a.f[j]; return b; } @@ -1303,6 +1522,8 @@ namespace v4 { v4float b; + // b.v = _mm_sub_ps( _mm_setzero_ps(), a.v ); + ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) b.f[j] = -a.f[j]; @@ -1314,6 +1535,8 @@ namespace v4 { v4int b; + // b.v = _mm_cmpeq_ps( _mm_setzero_ps(), a.v ); + ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) b.i[j] = a.i[j] ? 0 : -1; @@ -1327,6 +1550,11 @@ namespace v4 { v4float b; + // __m128 t = _mm_add_ps( a.v, _mm_set1_ps( 1 ) ); + + // a.v = t; + // b.v = t; + ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) b.f[j] = ++a.f[j]; @@ -1338,6 +1566,11 @@ namespace v4 { v4float b; + // __m128 t = _mm_sub_ps( a.v, _mm_set1_ps( 1 ) ); + + // a.v = t; + // b.v = t; + ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) b.f[j] = --a.f[j]; @@ -1351,6 +1584,11 @@ namespace v4 { v4float b; + // __m128 a_v = a.v; + + // a.v = _mm_add_ps( a_v, _mm_set1_ps( 1 ) ); + // b.v = a_v; + ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) b.f[j] = a.f[j]++; @@ -1362,6 +1600,11 @@ namespace v4 { v4float b; + // __m128 a_v = a.v; + + // a.v = _mm_sub_ps( a_v, _mm_set1_ps( 1 ) ); + // b.v = a_v; + ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) b.f[j] = a.f[j]--; @@ -1650,9 +1893,11 @@ namespace v4 { v4float d; - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - d.f[j] = c.f[j] - a.f[j] * b.f[j]; + d.v = vsubq_f32( vdupq_n_f32( 0.0f ), vfmsq_f32( a.v, b.v, c.v ) ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // d.f[j] = c.f[j] - a.f[j] * b.f[j]; return d; } @@ -1661,9 +1906,13 @@ namespace v4 { v4float b; - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.i[j] = ( ~m.i[j] ) & a.i[j]; + b.vsi = vbicq_s32( m.vsi, a.vsi ); + + // b.v = _mm_andnot_ps( m.v, a.v ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // b.i[j] = ( ~m.i[j] ) & a.i[j]; return b; } @@ -1672,9 +1921,13 @@ namespace v4 { v4float b; - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.i[j] = m.i[j] | a.i[j]; + b.vsi = 
vorrq_s32( m.vsi, a.vsi ); + + // b.v = _mm_or_ps( m.v, a.v ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // b.i[j] = m.i[j] | a.i[j]; return b; } @@ -1683,32 +1936,48 @@ namespace v4 { v4float b; - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.i[j] = m.i[j] ^ a.i[j]; + b.vsi = veorq_s32( m.vsi, a.vsi ); + + // b.v = _mm_xor_ps( m.v, a.v ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // b.i[j] = m.i[j] ^ a.i[j]; return b; } inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - p[j] += a.f[j]; + vst1q_f32( p, vaddq_f32( vld1q_f32( p ), a.v ) ); + + // _mm_store_ps( p, _mm_add_ps( _mm_load_ps( p ), a.v ) ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // p[j] += a.f[j]; } inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - p[j] -= a.f[j]; + vst1q_f32( p, vsubq_f32( vld1q_f32( p ), a.v ) ); + + // _mm_store_ps( p, _mm_sub_ps( _mm_load_ps( p ), a.v ) ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // p[j] -= a.f[j]; } inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) { - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - p[j] *= a.f[j]; + vst1q_f32( p, vmulq_f32( vld1q_f32( p ), a.v ) ); + + // _mm_store_ps( p, _mm_mul_ps( _mm_load_ps( p ), a.v ) ); + + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // p[j] *= a.f[j]; } inline void trilinear( v4float & wl, v4float & wh ) From b62ab7a96997754474148c07c0e49c783c113dfc Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 22 Jul 2019 17:27:25 -0600 Subject: [PATCH 32/95] Add another NEON implementation of transpose. 
--- src/util/v4/v4_neon.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 1734f62a..6272d322 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -205,6 +205,24 @@ namespace v4 sw( a.i[j], b.i[j] ); } + inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) + { + float32x4_t r, s, t, u; + + r = vtrn1q_f32( a0.v, a1.v ); + s = vtrn2q_f32( a0.v, a1.v ); + + t = vtrn1q_f32( a2.v, a3.v ); + u = vtrn2q_f32( a2.v, a3.v ); + + a0.v = vtrn1q_f64( r, t ); + a2.v = vtrn2q_f64( r, t ); + + a1.v = vtrn1q_f64( s, u ); + a3.v = vtrn2q_f64( s, u ); + } + + #if 0 inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) { float32x4_t a0_v, a2_v, t, u; @@ -293,6 +311,17 @@ namespace v4 // sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] ); // sw( a2.i[3],a3.i[2] ); } + #endif + + #if 0 + // Portable version. + inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) + { + sw( a0.i[1],a1.i[0] ); sw( a0.i[2],a2.i[0] ); sw( a0.i[3],a3.i[0] ); + sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] ); + sw( a2.i[3],a3.i[2] ); + } + #endif #undef sw From 70191a65e580b9e5200ce7d1fa7e407dbd68bad3 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Tue, 23 Jul 2019 09:43:50 -0600 Subject: [PATCH 33/95] Add new NEON implementations for load and store transpose functions. 
--- src/util/v4/v4_neon.h | 193 +++++++++++++++++++++++++++++++++--------- 1 file changed, 151 insertions(+), 42 deletions(-) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 6272d322..86fe7446 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -205,6 +205,7 @@ namespace v4 sw( a.i[j], b.i[j] ); } + #if 1 inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) { float32x4_t r, s, t, u; @@ -221,6 +222,7 @@ namespace v4 a1.v = vtrn1q_f64( s, u ); a3.v = vtrn2q_f64( s, u ); } + #endif #if 0 inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) @@ -306,10 +308,6 @@ namespace v4 //----------------------------------------------------------------- // a3.v = _mm_movehl_ps( u, t ); //----------------------------------------------------------------- - - // sw( a0.i[1],a1.i[0] ); sw( a0.i[2],a2.i[0] ); sw( a0.i[3],a3.i[0] ); - // sw( a1.i[2],a2.i[1] ); sw( a1.i[3],a3.i[1] ); - // sw( a2.i[3],a3.i[2] ); } #endif @@ -433,6 +431,7 @@ namespace v4 a.i[3] = ((const int *)a3)[0]; } + #if 1 inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, @@ -440,37 +439,46 @@ namespace v4 v4 &a, v4 &b ) { - // __m128 a_v, b_v, t; + float32x4_t r, s, t, u, a2_v, a3_v; - // b_v = _mm_setzero_ps(); + a.v = vld1q_f32( (const float *) a0 ); + b.v = vld1q_f32( (const float *) a1 ); + a2_v = vld1q_f32( (const float *) a2 ); + a3_v = vld1q_f32( (const float *) a3 ); - // t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a0 ), (__m64 *)a1 ); - // b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a2 ), (__m64 *)a3 ); - - // a_v = _mm_shuffle_ps( t, b_v, 0x88 ); - // b_v = _mm_shuffle_ps( t, b_v, 0xdd ); - - // a.v = a_v; - // b.v = b_v; + r = vtrn1q_f32( a.v, b.v ); + s = vtrn2q_f32( a.v, b.v ); - // Not correct. 
- // float32x4x4_t mat = vld4q_f32( (const float *) a0 ); + t = vtrn1q_f32( a2_v, a3_v ); + u = vtrn2q_f32( a2_v, a3_v ); - // a.v = mat.val[0]; - // b.v = mat.val[1]; + a.v = vtrn1q_f64( r, t ); + b.v = vtrn1q_f64( s, u ); + } + #endif - a.i[0] = ((const int * ALIGNED(8))a0)[0]; - b.i[0] = ((const int * ALIGNED(8))a0)[1]; + #if 0 + // Portable version. + inline void load_4x2_tr( const void * ALIGNED(8) a0, + const void * ALIGNED(8) a1, + const void * ALIGNED(8) a2, + const void * ALIGNED(8) a3, + v4 &a, + v4 &b ) + { + a.i[0] = ( ( const int * ALIGNED(8) ) a0 )[0]; + b.i[0] = ( ( const int * ALIGNED(8) ) a0 )[1]; - a.i[1] = ((const int * ALIGNED(8))a1)[0]; - b.i[1] = ((const int * ALIGNED(8))a1)[1]; + a.i[1] = ( ( const int * ALIGNED(8) ) a1 )[0]; + b.i[1] = ( ( const int * ALIGNED(8) ) a1 )[1]; - a.i[2] = ((const int * ALIGNED(8))a2)[0]; - b.i[2] = ((const int * ALIGNED(8))a2)[1]; + a.i[2] = ( ( const int * ALIGNED(8) ) a2 )[0]; + b.i[2] = ( ( const int * ALIGNED(8) ) a2 )[1]; - a.i[3] = ((const int * ALIGNED(8))a3)[0]; - b.i[3] = ((const int * ALIGNED(8))a3)[1]; + a.i[3] = ( ( const int * ALIGNED(8) ) a3 )[0]; + b.i[3] = ( ( const int * ALIGNED(8) ) a3 )[1]; } + #endif inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, @@ -565,6 +573,36 @@ namespace v4 #endif #if 1 + inline void load_4x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, + v4 &b, + v4 &c, + v4 &d ) + { + float32x4_t r, s, t, u; + + a.v = vld1q_f32( (const float *) a0 ); + b.v = vld1q_f32( (const float *) a1 ); + c.v = vld1q_f32( (const float *) a2 ); + d.v = vld1q_f32( (const float *) a3 ); + + r = vtrn1q_f32( a.v, b.v ); + s = vtrn2q_f32( a.v, b.v ); + + t = vtrn1q_f32( c.v, d.v ); + u = vtrn2q_f32( c.v, d.v ); + + a.v = vtrn1q_f64( r, t ); + b.v = vtrn1q_f64( s, u ); + c.v = vtrn2q_f64( r, t ); + d.v = vtrn2q_f64( s, u ); + } + #endif + + #if 0 inline void load_4x4_tr( const void 
* ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, @@ -671,26 +709,39 @@ namespace v4 //----------------------------------------------------------------- // d.v = _mm_movehl_ps( u, t ); //----------------------------------------------------------------- + } + #endif - // a.i[0] = ((const int * ALIGNED(16))a0)[0]; - // b.i[0] = ((const int * ALIGNED(16))a0)[1]; - // c.i[0] = ((const int * ALIGNED(16))a0)[2]; - // d.i[0] = ((const int * ALIGNED(16))a0)[3]; + #if 0 + // Portable version. + inline void load_4x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, + v4 &b, + v4 &c, + v4 &d ) + { + a.i[0] = ((const int * ALIGNED(16))a0)[0]; + b.i[0] = ((const int * ALIGNED(16))a0)[1]; + c.i[0] = ((const int * ALIGNED(16))a0)[2]; + d.i[0] = ((const int * ALIGNED(16))a0)[3]; - // a.i[1] = ((const int * ALIGNED(16))a1)[0]; - // b.i[1] = ((const int * ALIGNED(16))a1)[1]; - // c.i[1] = ((const int * ALIGNED(16))a1)[2]; - // d.i[1] = ((const int * ALIGNED(16))a1)[3]; + a.i[1] = ((const int * ALIGNED(16))a1)[0]; + b.i[1] = ((const int * ALIGNED(16))a1)[1]; + c.i[1] = ((const int * ALIGNED(16))a1)[2]; + d.i[1] = ((const int * ALIGNED(16))a1)[3]; - // a.i[2] = ((const int * ALIGNED(16))a2)[0]; - // b.i[2] = ((const int * ALIGNED(16))a2)[1]; - // c.i[2] = ((const int * ALIGNED(16))a2)[2]; - // d.i[2] = ((const int * ALIGNED(16))a2)[3]; + a.i[2] = ((const int * ALIGNED(16))a2)[0]; + b.i[2] = ((const int * ALIGNED(16))a2)[1]; + c.i[2] = ((const int * ALIGNED(16))a2)[2]; + d.i[2] = ((const int * ALIGNED(16))a2)[3]; - // a.i[3] = ((const int * ALIGNED(16))a3)[0]; - // b.i[3] = ((const int * ALIGNED(16))a3)[1]; - // c.i[3] = ((const int * ALIGNED(16))a3)[2]; - // d.i[3] = ((const int * ALIGNED(16))a3)[3]; + a.i[3] = ((const int * ALIGNED(16))a3)[0]; + b.i[3] = ((const int * ALIGNED(16))a3)[1]; + c.i[3] = ((const int * ALIGNED(16))a3)[2]; + d.i[3] = ((const int * 
ALIGNED(16))a3)[3]; } #endif @@ -823,6 +874,31 @@ namespace v4 #endif #if 1 + inline void store_4x4_tr( const v4 &a, + const v4 &b, + const v4 &c, + const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) + { + float32x4_t r, s, t, u; + + r = vtrn1q_f32( a.v, b.v ); + s = vtrn2q_f32( a.v, b.v ); + + t = vtrn1q_f32( c.v, d.v ); + u = vtrn2q_f32( c.v, d.v ); + + vst1q_f32( (float *) a0, vtrn1q_f64( r, t ) ); + vst1q_f32( (float *) a1, vtrn1q_f64( s, u ) ); + vst1q_f32( (float *) a2, vtrn2q_f64( r, t ) ); + vst1q_f32( (float *) a3, vtrn2q_f64( s, u ) ); + } + #endif + + #if 0 inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, @@ -952,6 +1028,39 @@ namespace v4 } #endif + #if 0 + // Portable version. + inline void store_4x4_tr( const v4 &a, + const v4 &b, + const v4 &c, + const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) + { + ( ( int * ALIGNED(16) ) a0 )[0] = a.i[0]; + ( ( int * ALIGNED(16) ) a0 )[1] = b.i[0]; + ( ( int * ALIGNED(16) ) a0 )[2] = c.i[0]; + ( ( int * ALIGNED(16) ) a0 )[3] = d.i[0]; + + ( ( int * ALIGNED(16) ) a1 )[0] = a.i[1]; + ( ( int * ALIGNED(16) ) a1 )[1] = b.i[1]; + ( ( int * ALIGNED(16) ) a1 )[2] = c.i[1]; + ( ( int * ALIGNED(16) ) a1 )[3] = d.i[1]; + + ( ( int * ALIGNED(16) ) a2 )[0] = a.i[2]; + ( ( int * ALIGNED(16) ) a2 )[1] = b.i[2]; + ( ( int * ALIGNED(16) ) a2 )[2] = c.i[2]; + ( ( int * ALIGNED(16) ) a2 )[3] = d.i[2]; + + ( ( int * ALIGNED(16) ) a3 )[0] = a.i[3]; + ( ( int * ALIGNED(16) ) a3 )[1] = b.i[3]; + ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; + ( ( int * ALIGNED(16) ) a3 )[3] = d.i[3]; + } + #endif + ////////////// // v4int class From 2762aece44025ae6f671fd9ccc62084d20208900 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Tue, 23 Jul 2019 11:41:48 -0600 Subject: [PATCH 34/95] Try another idea for implementing the transpose function. 
--- src/util/v4/v4_neon.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 86fe7446..9d99bbf7 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -207,6 +207,22 @@ namespace v4 #if 1 inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) + { + float32x4x2_t r, s; + + r = vtrnq_f32( a0.v, a1.v ); + s = vtrnq_f32( a2.v, a3.v ); + + a0.v = vtrn1q_f64( r.val[0], s.val[0] ); + a2.v = vtrn2q_f64( r.val[0], s.val[0] ); + + a1.v = vtrn1q_f64( r.val[1], s.val[1] ); + a3.v = vtrn2q_f64( r.val[1], s.val[1] ); + } + #endif + + #if 0 + inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) { float32x4_t r, s, t, u; From d82f07d2bd87127e89f6ec55e3202d75285bdfc4 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Thu, 25 Jul 2019 13:35:23 -0600 Subject: [PATCH 35/95] Add support for using load_4x8_tr and load_4x16_tr for the ARM NEON case. --- .../standard/pipeline/center_p_pipeline_v4.cc | 146 +++++++++++ .../pipeline/uncenter_p_pipeline_v4.cc | 162 +++++++++++- src/util/v4/v4_neon.h | 243 ++++++++++++------ 3 files changed, 461 insertions(+), 90 deletions(-) diff --git a/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc b/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc index dc6d5e18..611bd5d5 100644 --- a/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc +++ b/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc @@ -6,6 +6,150 @@ using namespace v4; +#ifdef V4_NEON_ACCELERATION + +void +center_p_pipeline_v4( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + const interpolator_t * ALIGNED(128) f0 = args->f0; + + particle_t * ALIGNED(128) p; + + const float * ALIGNED(16) vp00; + const float * ALIGNED(16) vp01; + const float * ALIGNED(16) vp02; + const float * ALIGNED(16) vp03; + + const v4float qdt_2mc( args->qdt_2mc); + const v4float qdt_4mc(0.5*args->qdt_2mc); // For half Boris rotate. 
+ const v4float one(1.0); + const v4float one_third(1.0/3.0); + const v4float two_fifteenths(2.0/15.0); + + v4float dx, dy, dz, ux, uy, uz, q; + v4float hax, hay, haz, cbx, cby, cbz; + v4float v00, v01, v02, v03, v04, v05; + v4float v06, v07, v08, v09, v10; + v4int ii; + + int itmp, nq; + + // Determine which particle blocks this pipeline processes. + + DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, itmp, nq ); + + p = args->p0 + itmp; + + nq >>= 2; + + // Process the particle blocks for this pipeline. + + for( ; nq; nq--, p+=4 ) + { + //-------------------------------------------------------------------------- + // Load particle position data. + //-------------------------------------------------------------------------- + load_4x8_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, + dx, dy, dz, ii, ux, uy, uz, q ); + + // load_4x4_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, + // dx, dy, dz, ii ); + + //-------------------------------------------------------------------------- + // Set field interpolation pointers. + //-------------------------------------------------------------------------- + vp00 = ( const float * ALIGNED(16) ) ( f0 + ii(0) ); + vp01 = ( const float * ALIGNED(16) ) ( f0 + ii(1) ); + vp02 = ( const float * ALIGNED(16) ) ( f0 + ii(2) ); + vp03 = ( const float * ALIGNED(16) ) ( f0 + ii(3) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + load_4x16_tr( vp00, vp01, vp02, vp03, + hax, v00, v01, v02, + hay, v03, v04, v05, + haz, v06, v07, v08, + cbx, v09, cby, v10 ); + + // load_4x4_tr( vp00, vp01, vp02, vp03, + // hax, v00, v01, v02 ); + + hax = qdt_2mc * fma( fma( dy, v02, v01 ), dz, fma( dy, v00, hax ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. 
+ //-------------------------------------------------------------------------- + // load_4x4_tr( vp00+4, vp01+4, vp02+4, vp03+4, + // hay, v03, v04, v05 ); + + hay = qdt_2mc * fma( fma( dz, v05, v04 ), dx, fma( dz, v03, hay ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + // load_4x4_tr( vp00+8, vp01+8, vp02+8, vp03+8, + // haz, v00, v01, v02 ); + + haz = qdt_2mc * fma( fma( dx, v08, v07 ), dy, fma( dx, v06, haz ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + // load_4x4_tr( vp00+12, vp01+12, vp02+12, vp03+12, + // cbx, v03, cby, v04 ); + + cbx = fma( v09, dx, cbx ); + cby = fma( v10, dy, cby ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles, final. + //-------------------------------------------------------------------------- + load_4x2_tr( vp00+16, vp01+16, vp02+16, vp03+16, + cbz, v05 ); + + cbz = fma( v05, dz, cbz ); + + //-------------------------------------------------------------------------- + // Load particle momentum data. Could use load_4x3_tr. + //-------------------------------------------------------------------------- + // load_4x4_tr( &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux, + // ux, uy, uz, q ); + + //-------------------------------------------------------------------------- + // Update momentum. 
+ //-------------------------------------------------------------------------- + ux += hax; + uy += hay; + uz += haz; + + v00 = qdt_4mc * rsqrt( one + fma( ux, ux, fma( uy, uy, uz * uz ) ) ); + v01 = fma( cbx, cbx, fma( cby, cby, cbz * cbz ) ); + v02 = ( v00 * v00 ) * v01; + v03 = v00 * fma( v02, fma( v02, two_fifteenths, one_third ), one ); + v04 = v03 * rcp( fma( v03 * v03, v01, one ) ); + v04 += v04; + + v00 = fma( fms( uy, cbz, uz * cby ), v03, ux ); + v01 = fma( fms( uz, cbx, ux * cbz ), v03, uy ); + v02 = fma( fms( ux, cby, uy * cbx ), v03, uz ); + + ux = fma( fms( v01, cbz, v02 * cby ), v04, ux ); + uy = fma( fms( v02, cbx, v00 * cbz ), v04, uy ); + uz = fma( fms( v00, cby, v01 * cbx ), v04, uz ); + + //-------------------------------------------------------------------------- + // Store particle momentum data. Could use store_4x3_tr. + //-------------------------------------------------------------------------- + store_4x4_tr( ux, uy, uz, q, + &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux ); + } +} + +#else + void center_p_pipeline_v4( center_p_pipeline_args_t * args, int pipeline_rank, @@ -136,6 +280,8 @@ center_p_pipeline_v4( center_p_pipeline_args_t * args, } } +#endif + #else void diff --git a/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc b/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc index 908fedec..8d33c7a0 100644 --- a/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc +++ b/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc @@ -6,6 +6,150 @@ using namespace v4; +#ifdef V4_NEON_ACCELERATION + +void +uncenter_p_pipeline_v4( center_p_pipeline_args_t * args, + int pipeline_rank, + int n_pipeline ) +{ + const interpolator_t * ALIGNED(128) f0 = args->f0; + + particle_t * ALIGNED(128) p; + + const float * ALIGNED(16) vp00; + const float * ALIGNED(16) vp01; + const float * ALIGNED(16) vp02; + const float * ALIGNED(16) vp03; + + const v4float qdt_2mc( -args->qdt_2mc); // For backward half advance. 
+ const v4float qdt_4mc(-0.5*args->qdt_2mc); // For backward half Boris rotate. + const v4float one(1.0); + const v4float one_third(1.0/3.0); + const v4float two_fifteenths(2.0/15.0); + + v4float dx, dy, dz, ux, uy, uz, q; + v4float hax, hay, haz, cbx, cby, cbz; + v4float v00, v01, v02, v03, v04, v05; + v4float v06, v07, v08, v09, v10; + v4int ii; + + int first, nq; + + // Determine which particle quads this pipeline processes. + + DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, first, nq ); + + p = args->p0 + first; + + nq >>= 2; + + // Process the particle quads for this pipeline. + + for( ; nq; nq--, p+=4 ) + { + //-------------------------------------------------------------------------- + // Load particle position data. + //-------------------------------------------------------------------------- + load_4x8_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, + dx, dy, dz, ii, ux, uy, uz, q ); + + // load_4x4_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, + // dx, dy, dz, ii ); + + //-------------------------------------------------------------------------- + // Set field interpolation pointers. + //-------------------------------------------------------------------------- + vp00 = ( const float * ALIGNED(16) ) ( f0 + ii(0) ); + vp01 = ( const float * ALIGNED(16) ) ( f0 + ii(1) ); + vp02 = ( const float * ALIGNED(16) ) ( f0 + ii(2) ); + vp03 = ( const float * ALIGNED(16) ) ( f0 + ii(3) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. 
+ //-------------------------------------------------------------------------- + load_4x16_tr( vp00, vp01, vp02, vp03, + hax, v00, v01, v02, + hay, v03, v04, v05, + haz, v06, v07, v08, + cbx, v09, cby, v10 ); + + // load_4x4_tr( vp00, vp01, vp02, vp03, + // hax, v00, v01, v02 ); + + hax = qdt_2mc * fma( fma( dy, v02, v01 ), dz, fma( dy, v00, hax ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + // load_4x4_tr( vp00+4, vp01+4, vp02+4, vp03+4, + // hay, v03, v04, v05 ); + + hay = qdt_2mc * fma( fma( dz, v05, v04 ), dx, fma( dz, v03, hay ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + // load_4x4_tr( vp00+8, vp01+8, vp02+8, vp03+8, + // haz, v00, v01, v02 ); + + haz = qdt_2mc * fma( fma( dx, v08, v07 ), dy, fma( dx, v06, haz ) ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles. + //-------------------------------------------------------------------------- + // load_4x4_tr( vp00+12, vp01+12, vp02+12, vp03+12, + // cbx, v03, cby, v04 ); + + cbx = fma( v09, dx, cbx ); + cby = fma( v10, dy, cby ); + + //-------------------------------------------------------------------------- + // Load interpolation data for particles, final. + //-------------------------------------------------------------------------- + load_4x2_tr( vp00+16, vp01+16, vp02+16, vp03+16, + cbz, v05 ); + + cbz = fma( v05, dz, cbz ); + + //-------------------------------------------------------------------------- + // Load particle momentum data. Could use load_4x3_tr. 
+ //-------------------------------------------------------------------------- + // load_4x4_tr( &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux, + // ux, uy, uz, q ); + + //-------------------------------------------------------------------------- + // Update momentum. + //-------------------------------------------------------------------------- + v00 = qdt_4mc * rsqrt( one + fma( ux, ux, fma( uy, uy, uz * uz ) ) ); + v01 = fma( cbx, cbx, fma( cby, cby, cbz * cbz ) ); + v02 = ( v00 * v00 ) * v01; + v03 = v00 * fma( v02, fma( v02, two_fifteenths, one_third ), one ); + v04 = v03 * rcp( fma( v03 * v03, v01, one ) ); + v04 += v04; + + v00 = fma( fms( uy, cbz, uz * cby ), v03, ux ); + v01 = fma( fms( uz, cbx, ux * cbz ), v03, uy ); + v02 = fma( fms( ux, cby, uy * cbx ), v03, uz ); + + ux = fma( fms( v01, cbz, v02 * cby ), v04, ux ); + uy = fma( fms( v02, cbx, v00 * cbz ), v04, uy ); + uz = fma( fms( v00, cby, v01 * cbx ), v04, uz ); + + ux += hax; + uy += hay; + uz += haz; + + //-------------------------------------------------------------------------- + // Store particle data. Could use store_4x3_tr. + //-------------------------------------------------------------------------- + store_4x4_tr( ux, uy, uz, q, + &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux ); + } +} + +#else + void uncenter_p_pipeline_v4( center_p_pipeline_args_t * args, int pipeline_rank, @@ -49,7 +193,7 @@ uncenter_p_pipeline_v4( center_p_pipeline_args_t * args, // Load particle position data. //-------------------------------------------------------------------------- load_4x4_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, - dx, dy, dz, ii ); + dx, dy, dz, ii ); //-------------------------------------------------------------------------- // Set field interpolation pointers. @@ -63,31 +207,31 @@ uncenter_p_pipeline_v4( center_p_pipeline_args_t * args, // Load interpolation data for particles. 
//-------------------------------------------------------------------------- load_4x4_tr( vp00, vp01, vp02, vp03, - hax, v00, v01, v02 ); + hax, v00, v01, v02 ); - hax = qdt_2mc*fma( fma( dy, v02, v01 ), dz, fma( dy, v00, hax ) ); + hax = qdt_2mc * fma( fma( dy, v02, v01 ), dz, fma( dy, v00, hax ) ); //-------------------------------------------------------------------------- // Load interpolation data for particles. //-------------------------------------------------------------------------- load_4x4_tr( vp00+4, vp01+4, vp02+4, vp03+4, - hay, v03, v04, v05 ); + hay, v03, v04, v05 ); - hay = qdt_2mc*fma( fma( dz, v05, v04 ), dx, fma( dz, v03, hay ) ); + hay = qdt_2mc * fma( fma( dz, v05, v04 ), dx, fma( dz, v03, hay ) ); //-------------------------------------------------------------------------- // Load interpolation data for particles. //-------------------------------------------------------------------------- load_4x4_tr( vp00+8, vp01+8, vp02+8, vp03+8, - haz, v00, v01, v02 ); + haz, v00, v01, v02 ); - haz = qdt_2mc*fma( fma( dx, v02, v01 ), dy, fma( dx, v00, haz ) ); + haz = qdt_2mc * fma( fma( dx, v02, v01 ), dy, fma( dx, v00, haz ) ); //-------------------------------------------------------------------------- // Load interpolation data for particles. 
//-------------------------------------------------------------------------- load_4x4_tr( vp00+12, vp01+12, vp02+12, vp03+12, - cbx, v03, cby, v04 ); + cbx, v03, cby, v04 ); cbx = fma( v03, dx, cbx ); cby = fma( v04, dy, cby ); @@ -136,6 +280,8 @@ uncenter_p_pipeline_v4( center_p_pipeline_args_t * args, } } +#endif + #else void diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 9d99bbf7..50142a2f 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -97,6 +97,28 @@ namespace v4 const void * ALIGNED(16) a3, v4 &a, v4 &b, v4 &c, v4 &d ) ALWAYS_INLINE; + friend inline void load_4x8_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &b00, v4 &b01, + v4 &b02, v4 &b03, + v4 &b04, v4 &b05, + v4 &b06, v4 &b07 ) ALWAYS_INLINE; + + friend inline void load_4x16_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &b00, v4 &b01, + v4 &b02, v4 &b03, + v4 &b04, v4 &b05, + v4 &b06, v4 &b07, + v4 &b08, v4 &b09, + v4 &b10, v4 &b11, + v4 &b12, v4 &b13, + v4 &b14, v4 &b15 ) ALWAYS_INLINE; + friend inline void store_4x1_tr( const v4 &a, void *a0, void *a1, void *a2, void *a3 ) ALWAYS_INLINE; @@ -119,6 +141,15 @@ namespace v4 void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) ALWAYS_INLINE; + friend inline void store_4x8_tr( const v4 &b00, const v4 &b01, + const v4 &b02, const v4 &b03, + const v4 &b04, const v4 &b05, + const v4 &b06, const v4 &b07, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) ALWAYS_INLINE; + protected: union @@ -548,46 +579,6 @@ namespace v4 c.i[3] = ((const int * ALIGNED(16))a3)[2]; } - #if 0 - inline void load_4x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, - v4 &b, - v4 &c, - v4 &d ) - { - // Not correct. 
- float32x4x4_t mat = vld4q_f32( (const float *) a0 ); - - a.v = mat.val[0]; - b.v = mat.val[1]; - c.v = mat.val[2]; - d.v = mat.val[3]; - - // a.i[0] = ((const int * ALIGNED(16))a0)[0]; - // b.i[0] = ((const int * ALIGNED(16))a0)[1]; - // c.i[0] = ((const int * ALIGNED(16))a0)[2]; - // d.i[0] = ((const int * ALIGNED(16))a0)[3]; - - // a.i[1] = ((const int * ALIGNED(16))a1)[0]; - // b.i[1] = ((const int * ALIGNED(16))a1)[1]; - // c.i[1] = ((const int * ALIGNED(16))a1)[2]; - // d.i[1] = ((const int * ALIGNED(16))a1)[3]; - - // a.i[2] = ((const int * ALIGNED(16))a2)[0]; - // b.i[2] = ((const int * ALIGNED(16))a2)[1]; - // c.i[2] = ((const int * ALIGNED(16))a2)[2]; - // d.i[2] = ((const int * ALIGNED(16))a2)[3]; - - // a.i[3] = ((const int * ALIGNED(16))a3)[0]; - // b.i[3] = ((const int * ALIGNED(16))a3)[1]; - // c.i[3] = ((const int * ALIGNED(16))a3)[2]; - // d.i[3] = ((const int * ALIGNED(16))a3)[3]; - } - #endif - #if 1 inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, @@ -761,6 +752,105 @@ namespace v4 } #endif + #if 1 + inline void load_4x8_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &b00, + v4 &b01, + v4 &b02, + v4 &b03, + v4 &b04, + v4 &b05, + v4 &b06, + v4 &b07 ) + { + float32x4x4_t mat0 = vld4q_f32( (const float *) a0 ); + float32x4x4_t mat2 = vld4q_f32( (const float *) a2 ); + + b00.v = vuzp1q_f32( mat0.val[0], mat2.val[0] ); + b01.v = vuzp1q_f32( mat0.val[1], mat2.val[1] ); + b02.v = vuzp1q_f32( mat0.val[2], mat2.val[2] ); + b03.v = vuzp1q_f32( mat0.val[3], mat2.val[3] ); + + b04.v = vuzp2q_f32( mat0.val[0], mat2.val[0] ); + b05.v = vuzp2q_f32( mat0.val[1], mat2.val[1] ); + b06.v = vuzp2q_f32( mat0.val[2], mat2.val[2] ); + b07.v = vuzp2q_f32( mat0.val[3], mat2.val[3] ); + } + #endif + + #if 1 + inline void load_4x16_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, 
+ v4 &b00, + v4 &b01, + v4 &b02, + v4 &b03, + v4 &b04, + v4 &b05, + v4 &b06, + v4 &b07, + v4 &b08, + v4 &b09, + v4 &b10, + v4 &b11, + v4 &b12, + v4 &b13, + v4 &b14, + v4 &b15 ) + { + float32x4 c00, c01, c02, c03, c04, c05, c06, c07; + float32x4 c08, c09, c10, c11, c12, c13, c14, c15; + + float32x4x4_t mat0 = vld4q_f32( (const float *) a0 ); + float32x4x4_t mat1 = vld4q_f32( (const float *) a1 ); + float32x4x4_t mat2 = vld4q_f32( (const float *) a2 ); + float32x4x4_t mat3 = vld4q_f32( (const float *) a3 ); + + c00 = vuzp1q_f32( mat0.val[0], mat1.val[0] ); + c01 = vuzp1q_f32( mat0.val[1], mat1.val[1] ); + c02 = vuzp1q_f32( mat0.val[2], mat1.val[2] ); + c03 = vuzp1q_f32( mat0.val[3], mat1.val[3] ); + + c04 = vuzp2q_f32( mat0.val[0], mat1.val[0] ); + c05 = vuzp2q_f32( mat0.val[1], mat1.val[1] ); + c06 = vuzp2q_f32( mat0.val[2], mat1.val[2] ); + c07 = vuzp2q_f32( mat0.val[3], mat1.val[3] ); + + c08 = vuzp1q_f32( mat2.val[0], mat3.val[0] ); + c09 = vuzp1q_f32( mat2.val[1], mat3.val[1] ); + c10 = vuzp1q_f32( mat2.val[2], mat3.val[2] ); + c11 = vuzp1q_f32( mat2.val[3], mat3.val[3] ); + + c12 = vuzp2q_f32( mat2.val[0], mat3.val[0] ); + c13 = vuzp2q_f32( mat2.val[1], mat3.val[1] ); + c14 = vuzp2q_f32( mat2.val[2], mat3.val[2] ); + c15 = vuzp2q_f32( mat2.val[3], mat3.val[3] ); + + b00.v = vuzp1q_f32( c00, c08 ); + b01.v = vuzp1q_f32( c01, c09 ); + b02.v = vuzp1q_f32( c02, c10 ); + b03.v = vuzp1q_f32( c03, c11 ); + b04.v = vuzp1q_f32( c04, c12 ); + b05.v = vuzp1q_f32( c05, c13 ); + b06.v = vuzp1q_f32( c06, c14 ); + b07.v = vuzp1q_f32( c07, c15 ); + + b08.v = vuzp2q_f32( c00, c08 ); + b09.v = vuzp2q_f32( c01, c09 ); + b10.v = vuzp2q_f32( c02, c10 ); + b11.v = vuzp2q_f32( c03, c11 ); + b12.v = vuzp2q_f32( c04, c12 ); + b13.v = vuzp2q_f32( c05, c13 ); + b14.v = vuzp2q_f32( c06, c14 ); + b15.v = vuzp2q_f32( c07, c15 ); + } + #endif + inline void store_4x1_tr( const v4 &a, void *a0, void *a1, @@ -847,48 +937,6 @@ namespace v4 ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; } - #if 0 - 
inline void store_4x4_tr( const v4 &a, - const v4 &b, - const v4 &c, - const v4 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) - { - // Not correct. - float32x4x4_t mat; - - mat.val[0] = a.v; - mat.val[1] = b.v; - mat.val[2] = c.v; - mat.val[3] = d.v; - - vst4q_f32( (const float *) a0, mat ); - - // ((int * ALIGNED(16))a0)[0] = a.i[0]; - // ((int * ALIGNED(16))a0)[1] = b.i[0]; - // ((int * ALIGNED(16))a0)[2] = c.i[0]; - // ((int * ALIGNED(16))a0)[3] = d.i[0]; - - // ((int * ALIGNED(16))a1)[0] = a.i[1]; - // ((int * ALIGNED(16))a1)[1] = b.i[1]; - // ((int * ALIGNED(16))a1)[2] = c.i[1]; - // ((int * ALIGNED(16))a1)[3] = d.i[1]; - - // ((int * ALIGNED(16))a2)[0] = a.i[2]; - // ((int * ALIGNED(16))a2)[1] = b.i[2]; - // ((int * ALIGNED(16))a2)[2] = c.i[2]; - // ((int * ALIGNED(16))a2)[3] = d.i[2]; - - // ((int * ALIGNED(16))a3)[0] = a.i[3]; - // ((int * ALIGNED(16))a3)[1] = b.i[3]; - // ((int * ALIGNED(16))a3)[2] = c.i[3]; - // ((int * ALIGNED(16))a3)[3] = d.i[3]; - } - #endif - #if 1 inline void store_4x4_tr( const v4 &a, const v4 &b, @@ -1077,6 +1125,37 @@ namespace v4 } #endif + #if 1 + inline void store_4x8_tr( const v4 &b00, + const v4 &b01, + const v4 &b02, + const v4 &b03, + const v4 &b04, + const v4 &b05, + const v4 &b06, + const v4 &b07, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) + { + float32x4x4_t mat0, mat2; + + mat0.val[0] = vuzp1q_f32( b00.v, b04.v ); + mat0.val[1] = vuzp1q_f32( b01.v, b05.v ); + mat0.val[2] = vuzp1q_f32( b02.v, b06.v ); + mat0.val[3] = vuzp1q_f32( b03.v, b07.v ); + + mat2.val[0] = vuzp2q_f32( b00.v, b04.v ); + mat2.val[1] = vuzp2q_f32( b01.v, b05.v ); + mat2.val[2] = vuzp2q_f32( b02.v, b06.v ); + mat2.val[3] = vuzp2q_f32( b03.v, b07.v ); + + vst4q_f32( (float *) a0, mat0 ); + vst4q_f32( (float *) a2, mat2 ); + } + #endif + ////////////// // v4int class From b2f9017dbbe341357ee608268d3b4e03ee0fdd31 Mon Sep 17 00:00:00 2001 
From: Dave Nystrom Date: Thu, 25 Jul 2019 15:47:55 -0600 Subject: [PATCH 36/95] Add test cases to V4 unit tests. --- src/util/v4/test/v4.cc | 57 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/src/util/v4/test/v4.cc b/src/util/v4/test/v4.cc index 7e0db0e5..73a51540 100644 --- a/src/util/v4/test/v4.cc +++ b/src/util/v4/test/v4.cc @@ -274,6 +274,63 @@ TEST_CASE("TEST_CASE_load_4x4_tr", "[v4]") { REQUIRE( i==16 ); } // TEST_CASE +#ifdef V4_NEON_ACCELERATION +TEST_CASE("TEST_CASE_load_4x8_tr", "[v4]") { + DECLARE_ALIGNED_ARRAY( int, 64, mem, 32 ); + v4int a0, a1, a2, a3, a4, a5, a6, a7; + int i; + for( i=0; i<32; i++ ) mem[i] = i; + load_4x8_tr(mem,mem+8,mem+16,mem+24,a0,a1,a2,a3,a4,a5,a6,a7); + for( i=0; i<32; i++ ) if( mem[i]!=i ) break; + //ASSERT_FALSE( any(a0!=v4int( 0, 4, 8,12)) || any(a1!=v4int( 1, 5, 9,13)) || + //any(a2!=v4int( 2, 6,10,14)) || any(a3!=v4int( 3, 7,11,15)) || i!=16 ); + + REQUIRE( any(a0==v4int( 0, 8, 16, 24 )) ); + REQUIRE( any(a1==v4int( 1, 9, 17, 25 )) ); + REQUIRE( any(a2==v4int( 2, 10, 18, 26 )) ); + REQUIRE( any(a3==v4int( 3, 11, 19, 27 )) ); + REQUIRE( any(a4==v4int( 4, 12, 20, 28 )) ); + REQUIRE( any(a5==v4int( 5, 13, 21, 29 )) ); + REQUIRE( any(a6==v4int( 6, 14, 22, 30 )) ); + REQUIRE( any(a7==v4int( 7, 15, 23, 31 )) ); + REQUIRE( i==32 ); +} // TEST_CASE +#endif + +#ifdef V4_NEON_ACCELERATION +TEST_CASE("TEST_CASE_load_4x16_tr", "[v4]") { + DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); + v4int a00, a01, a02, a03, a04, a05, a06, a07; + v4int a08, a09, a10, a11, a12, a13, a14, a15; + int i; + for( i=0; i<64; i++ ) mem[i] = i; + load_4x16_tr(mem,mem+16,mem+32,mem+48, + a00,a01,a02,a03,a04,a05,a06,a07, + a08,a09,a10,a11,a12,a13,a14,a15); + for( i=0; i<64; i++ ) if( mem[i]!=i ) break; + //ASSERT_FALSE( any(a0!=v4int( 0, 4, 8,12)) || any(a1!=v4int( 1, 5, 9,13)) || + //any(a2!=v4int( 2, 6,10,14)) || any(a3!=v4int( 3, 7,11,15)) || i!=16 ); + + REQUIRE( any(a00==v4int( 0, 16, 32, 48 )) ); + REQUIRE( 
any(a01==v4int( 1, 17, 33, 49 )) ); + REQUIRE( any(a02==v4int( 2, 18, 34, 50 )) ); + REQUIRE( any(a03==v4int( 3, 19, 35, 51 )) ); + REQUIRE( any(a04==v4int( 4, 20, 36, 52 )) ); + REQUIRE( any(a05==v4int( 5, 21, 37, 53 )) ); + REQUIRE( any(a06==v4int( 6, 22, 38, 54 )) ); + REQUIRE( any(a07==v4int( 7, 23, 39, 55 )) ); + REQUIRE( any(a08==v4int( 8, 24, 40, 56 )) ); + REQUIRE( any(a09==v4int( 9, 25, 41, 57 )) ); + REQUIRE( any(a10==v4int( 10, 26, 42, 58 )) ); + REQUIRE( any(a11==v4int( 11, 27, 43, 59 )) ); + REQUIRE( any(a12==v4int( 12, 28, 44, 60 )) ); + REQUIRE( any(a13==v4int( 13, 29, 45, 61 )) ); + REQUIRE( any(a14==v4int( 14, 30, 46, 62 )) ); + REQUIRE( any(a15==v4int( 15, 31, 47, 63 )) ); + REQUIRE( i==64 ); +} // TEST_CASE +#endif + TEST_CASE("TEST_CASE_store_4x1_tr", "[v4]") { DECLARE_ALIGNED_ARRAY( int, 16, mem, 16 ); v4int a0( 0, 4, 8,12), a1( 1, 5, 9,13), a2( 2, 6,10,14), a3( 3, 7,11,15); From 15b8da39e69cd1ae7e9c5413e8000e760b1676e1 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Thu, 25 Jul 2019 16:02:31 -0600 Subject: [PATCH 37/95] Fix a declaration error. --- src/util/v4/v4_neon.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 50142a2f..f4c7a8a7 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -803,8 +803,8 @@ namespace v4 v4 &b14, v4 &b15 ) { - float32x4 c00, c01, c02, c03, c04, c05, c06, c07; - float32x4 c08, c09, c10, c11, c12, c13, c14, c15; + float32x4_t c00, c01, c02, c03, c04, c05, c06, c07; + float32x4_t c08, c09, c10, c11, c12, c13, c14, c15; float32x4x4_t mat0 = vld4q_f32( (const float *) a0 ); float32x4x4_t mat1 = vld4q_f32( (const float *) a1 ); From 60703bded811ce00b52a51a194760ee04a3eb2b4 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Thu, 25 Jul 2019 16:17:12 -0600 Subject: [PATCH 38/95] Test different memory alignment for V4 NEON implementation. 
--- src/sf_interface/sf_interface.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/sf_interface/sf_interface.h b/src/sf_interface/sf_interface.h index 6dc86883..f49be9eb 100644 --- a/src/sf_interface/sf_interface.h +++ b/src/sf_interface/sf_interface.h @@ -52,6 +52,15 @@ #endif +// Temporary hack. +#ifdef V4_NEON_ACCELERATION + +#define PAD_SIZE_INTERPOLATOR 14 +#define PAD_SIZE_ACCUMULATOR 4 +#define PAD_SIZE_HYDRO 2 + +#endif + /*****************************************************************************/ // Interpolator arrays shall be a (nx+2) x (ny+2) x (nz+2) allocation From cbdc2ccf0dcd36878512b07b39b95a14191f1622 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Sat, 27 Jul 2019 19:57:52 -0600 Subject: [PATCH 39/95] Format tweak. --- .../standard/pipeline/advance_p_pipeline.cc | 4 ++-- .../standard/pipeline/advance_p_pipeline_v4.cc | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/species_advance/standard/pipeline/advance_p_pipeline.cc b/src/species_advance/standard/pipeline/advance_p_pipeline.cc index 3cdc4d10..e2ccfd93 100644 --- a/src/species_advance/standard/pipeline/advance_p_pipeline.cc +++ b/src/species_advance/standard/pipeline/advance_p_pipeline.cc @@ -183,7 +183,7 @@ advance_p_pipeline_scalar( advance_p_pipeline_args_t * args, a = (float *)( a0 + ii ); // Get accumulator -# define ACCUMULATE_J(X,Y,Z,offset) \ + #define ACCUMULATE_J(X,Y,Z,offset) \ v4 = q*u##X; /* v2 = q ux */ \ v1 = v4*d##Y; /* v1 = q ux dy */ \ v0 = v4-v1; /* v0 = q ux (1-dy) */ \ @@ -207,7 +207,7 @@ advance_p_pipeline_scalar( advance_p_pipeline_args_t * args, ACCUMULATE_J( y, z, x, 4 ); ACCUMULATE_J( z, x, y, 8 ); -# undef ACCUMULATE_J + #undef ACCUMULATE_J } else // Unlikely diff --git a/src/species_advance/standard/pipeline/advance_p_pipeline_v4.cc b/src/species_advance/standard/pipeline/advance_p_pipeline_v4.cc index 19d82ade..88f83b34 100644 --- a/src/species_advance/standard/pipeline/advance_p_pipeline_v4.cc +++ 
b/src/species_advance/standard/pipeline/advance_p_pipeline_v4.cc @@ -245,7 +245,7 @@ advance_p_pipeline_v4( advance_p_pipeline_args_t * args, //-------------------------------------------------------------------------- // Accumulate current density. //-------------------------------------------------------------------------- -# define ACCUMULATE_J(X,Y,Z,offset) \ + #define ACCUMULATE_J(X,Y,Z,offset) \ v04 = q*u##X; /* v04 = q ux */ \ v01 = v04*d##Y; /* v01 = q ux dy */ \ v00 = v04-v01; /* v00 = q ux (1-dy) */ \ @@ -270,14 +270,14 @@ advance_p_pipeline_v4( advance_p_pipeline_args_t * args, ACCUMULATE_J( y, z, x, 4 ); ACCUMULATE_J( z, x, y, 8 ); -# undef ACCUMULATE_J + #undef ACCUMULATE_J //-------------------------------------------------------------------------- // Update position and accumulate current density for out of bounds // particles. //-------------------------------------------------------------------------- -# define MOVE_OUTBND(N) \ + #define MOVE_OUTBND(N) \ if ( outbnd(N) ) /* Unlikely */ \ { \ local_pm->dispx = ux(N); \ @@ -302,7 +302,7 @@ advance_p_pipeline_v4( advance_p_pipeline_args_t * args, MOVE_OUTBND( 2); MOVE_OUTBND( 3); -# undef MOVE_OUTBND + #undef MOVE_OUTBND } args->seg[pipeline_rank].pm = pm; From 577b42eefa87c7f0f09ad25c327f66a2c95792f7 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Sun, 28 Jul 2019 22:45:59 -0600 Subject: [PATCH 40/95] Do not use special test version of V4 NEON implementations of center_p and uncenter_p. 
--- src/species_advance/standard/pipeline/center_p_pipeline_v4.cc | 2 +- src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc b/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc index 611bd5d5..2a25611f 100644 --- a/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc +++ b/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc @@ -6,7 +6,7 @@ using namespace v4; -#ifdef V4_NEON_ACCELERATION +#ifdef V4_NEON_ACCELERATION_SNOUT void center_p_pipeline_v4( center_p_pipeline_args_t * args, diff --git a/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc b/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc index 8d33c7a0..d4bfc425 100644 --- a/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc +++ b/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc @@ -6,7 +6,7 @@ using namespace v4; -#ifdef V4_NEON_ACCELERATION +#ifdef V4_NEON_ACCELERATION_SNOUT void uncenter_p_pipeline_v4( center_p_pipeline_args_t * args, From 7c6c53dc40ed33f8f985c3ed96cff7361630ab95 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Sun, 28 Jul 2019 23:24:26 -0600 Subject: [PATCH 41/95] Revert rcp and rsqrt functions back to their portable versions to try to isolate NaN problem. --- src/sf_interface/sf_interface.h | 2 +- src/util/v4/v4_neon.h | 58 ++++++++++++++++----------------- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/src/sf_interface/sf_interface.h b/src/sf_interface/sf_interface.h index f49be9eb..fd9b72eb 100644 --- a/src/sf_interface/sf_interface.h +++ b/src/sf_interface/sf_interface.h @@ -53,7 +53,7 @@ #endif // Temporary hack. 
-#ifdef V4_NEON_ACCELERATION +#ifdef V4_NEON_ACCELERATION_SNOUT #define PAD_SIZE_INTERPOLATOR 14 #define PAD_SIZE_ACCUMULATOR 4 diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index f4c7a8a7..22e8dff6 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -2038,26 +2038,26 @@ namespace v4 { v4float b; - float32x4_t a_v = a.v, b_v; - - b_v = vrsqrteq_f32( a_v ); - - // Note: It is quicker to just call div_ps and sqrt_ps if more - // refinement desired! - b.v = vaddq_f32( b_v, vmulq_f32( vdupq_n_f32( 0.5f ), - vsubq_f32( b_v, - vmulq_f32( a_v, - vmulq_f32( b_v, - vmulq_f32( b_v, b_v ) - ) - ) - ) - ) - ); + // float32x4_t a_v = a.v, b_v; + + // b_v = vrsqrteq_f32( a_v ); + + // // Note: It is quicker to just call div_ps and sqrt_ps if more + // // refinement desired! + // b.v = vaddq_f32( b_v, vmulq_f32( vdupq_n_f32( 0.5f ), + // vsubq_f32( b_v, + // vmulq_f32( a_v, + // vmulq_f32( b_v, + // vmulq_f32( b_v, b_v ) + // ) + // ) + // ) + // ) + // ); - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // b.f[j] = ::sqrt( 1.0f / a.f[j] ); + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = ::sqrt( 1.0f / a.f[j] ); return b; } @@ -2079,19 +2079,19 @@ namespace v4 { v4float b; - float32x4_t a_v = a.v, b_v; + // float32x4_t a_v = a.v, b_v; - b_v = vrecpeq_f32( a_v ); + // b_v = vrecpeq_f32( a_v ); - b.v = vsubq_f32( vaddq_f32( b_v, b_v ), - vmulq_f32( a_v, - vmulq_f32( b_v, b_v ) - ) - ); + // b.v = vsubq_f32( vaddq_f32( b_v, b_v ), + // vmulq_f32( a_v, + // vmulq_f32( b_v, b_v ) + // ) + // ); - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // b.f[j] = 1.0f / a.f[j]; + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.f[j] = 1.0f / a.f[j]; return b; } From bd000c43119140bb51e936722de8812236dcce36 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 5 Aug 2019 10:01:38 -0600 Subject: [PATCH 42/95] Format tweaks and cleanup before merging into other branches. 
--- src/util/v4/v4_altivec.h | 1241 +++++++++++++++++++--------------- src/util/v4/v4_avx.h | 1041 ++++++++++++++++++---------- src/util/v4/v4_avx2.h | 414 +++++++----- src/util/v4/v4_neon.h | 897 ++++-------------------- src/util/v4/v4_portable.h | 436 ++++++------ src/util/v4/v4_portable_v0.h | 428 ++++++------ src/util/v4/v4_portable_v1.h | 338 ++++----- src/util/v4/v4_sse.h | 1025 ++++++++++++++++++---------- 8 files changed, 3088 insertions(+), 2732 deletions(-) diff --git a/src/util/v4/v4_altivec.h b/src/util/v4/v4_altivec.h index 6ff3f58c..f1361278 100644 --- a/src/util/v4/v4_altivec.h +++ b/src/util/v4/v4_altivec.h @@ -1,10 +1,13 @@ -#ifndef _v4_altivec_h_ + #ifndef _v4_altivec_h_ #define _v4_altivec_h_ #ifndef IN_v4_h #error "Do not include v4_altivec.h directly; use v4.h" #endif +#include +#include + #define V4_ACCELERATION #define V4_ALTIVEC_ACCELERATION @@ -12,31 +15,30 @@ #define ALIGNED(n) #endif -#include -#include - // See if this fixes a problem when compiling with GNU compilers. 
#ifdef __GNUC__ #undef bool #undef vector #endif -namespace v4 { +#define ALWAYS_INLINE __attribute__((always_inline)) +namespace v4 +{ class v4; class v4int; class v4float; -# define _v4_int __vector int -# define _v4_uint __vector unsigned int -# define _v4_float __vector float -# define _v16_uchar __vector unsigned char + #define _v4_int __vector int + #define _v4_uint __vector unsigned int + #define _v4_float __vector float + #define _v16_uchar __vector unsigned char -# define _PERM(i0,i1,i2,i3) \ - ((_v16_uchar){ 4*(i0), 4*(i0)+1, 4*(i0)+2, 4*(i0)+3, \ - 4*(i1), 4*(i1)+1, 4*(i1)+2, 4*(i1)+3, \ - 4*(i2), 4*(i2)+1, 4*(i2)+2, 4*(i2)+3, \ - 4*(i3), 4*(i3)+1, 4*(i3)+2, 4*(i3)+3 }) + #define _PERM(i0,i1,i2,i3) \ + ( (_v16_uchar) { 4*(i0), 4*(i0)+1, 4*(i0)+2, 4*(i0)+3, \ + 4*(i1), 4*(i1)+1, 4*(i1)+2, 4*(i1)+3, \ + 4*(i2), 4*(i2)+1, 4*(i2)+2, 4*(i2)+3, \ + 4*(i3), 4*(i3)+1, 4*(i3)+2, 4*(i3)+3 } ) // FIXME: IS IT FASTER TO SPLAT THESE ON THE FLY @@ -44,123 +46,104 @@ namespace v4 { const _v4_int _true = { -1, -1, -1, -1 }; const _v4_int _ione = { 1, 1, 1, 1 }; - const _v4_float _zero = { 0.0f, 0.0f, 0.0f, 0.0f }; - const _v4_float _half = { 0.5f, 0.5f, 0.5f, 0.5f }; - const _v4_float _one = { 1.0f, 1.0f, 1.0f, 1.0f }; - const _v4_float _sign = {-0.0f,-0.0f,-0.0f,-0.0f }; - const _v4_float _n02 = {-0.0f,+0.0f,-0.0f,+0.0f }; + const _v4_float _zero = { 0.0f, 0.0f, 0.0f, 0.0f }; + const _v4_float _half = { 0.5f, 0.5f, 0.5f, 0.5f }; + const _v4_float _one = { 1.0f, 1.0f, 1.0f, 1.0f }; + const _v4_float _sign = { -0.0f, -0.0f, -0.0f, -0.0f }; + const _v4_float _n02 = { -0.0f, +0.0f, -0.0f, +0.0f }; //////////////// // v4 base class - - class v4 { - + + class v4 + { friend class v4int; friend class v4float; - // ----------------------------------------------------------------------------- - // hacks that need to be resolved more elegantly + // v4 miscellaneous friends -/* friend inline v4 operator *( const v4 &a, const v4 &b ); */ + friend inline int any( const v4 &a ) 
ALWAYS_INLINE; + friend inline int all( const v4 &a ) ALWAYS_INLINE; -/* # define ASSIGN(op,instr) \ */ -/* inline v4 &operator op( const v4 &b ) \ */ -/* { \ */ -/* instr; \ */ -/* return *this; \ */ -/* } */ + template + friend inline v4 splat( const v4 &a ) ALWAYS_INLINE; -/* ASSIGN(=, v = b.v ); */ + template + friend inline v4 shuffle( const v4 &a ) ALWAYS_INLINE; -/* # undef ASSIGN */ + friend inline void swap( v4 &a, v4 &b ) ALWAYS_INLINE; + friend inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) ALWAYS_INLINE; -/* # define BINARY(op,instr) \ */ -/* inline v4 operator op( const v4 &a, const v4 &b ) \ */ -/* { \ */ -/* v4 c; \ */ -/* instr; \ */ -/* return c; \ */ -/* } */ + // v4int miscellaneous friends -/* BINARY(+, c.v = vec_add( a.v, b.v ) ) */ -/* BINARY(-, c.v = vec_sub( a.v, b.v ) ) */ -/* BINARY(*, c.v = vec_mul( a.v, b.v ) ) */ -/* // BINARY(*, c.v = vec_madd( a.v, b.v, _zero ) ) */ + friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; -/* # undef BINARY */ - // end hacks - // ----------------------------------------------------------------------------- + // v4 memory manipulation friends - // v4 miscellenous friends + friend inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) ALWAYS_INLINE; - friend inline int any( const v4 &a ); - friend inline int all( const v4 &a ); - template - friend inline v4 splat( const v4 &a ); - // friend inline v4 splat( const v4 &a, int n ); - template - friend inline v4 shuffle( const v4 &a ); - // friend inline v4 shuffle( const v4 &a, - // int i0, int i1, int i2, int i3 ); - friend inline void swap( v4 &a, v4 &b ); - friend inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ); + friend inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; - // v4int miscellaneous friends + friend inline void stream_4x1( 
const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline v4 czero( const v4int &c, const v4 &a ); - friend inline v4 notczero( const v4int &c, const v4 &a ); - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ); + friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; - // v4 memory manipulation friends - - friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ); - friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ); - friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ); - friend inline void clear_4x1( void * ALIGNED(16) dst ); - friend inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ); - friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ); + friend inline void copy_4x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) ALWAYS_INLINE; + + friend inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends - // Note: Half aligned values are permissible in the 4x2_tr variants! 
friend inline void load_4x1_tr( const void *a0, const void *a1, const void *a2, const void *a3, - v4 &a ); + v4 &a ) ALWAYS_INLINE; + friend inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, - v4 &a, v4 &b ); + v4 &a, v4 &b ) ALWAYS_INLINE; + friend inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ); + v4 &a, v4 &b, v4 &c ) ALWAYS_INLINE; + friend inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ); - + v4 &a, v4 &b, v4 &c, v4 &d ) ALWAYS_INLINE; + friend inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, void *a2, void *a3 ); + void *a0, void *a1, void *a2, void *a3 ) ALWAYS_INLINE; + friend inline void store_4x2_tr( const v4 &a, const v4 &b, void * ALIGNED(8) a0, void * ALIGNED(8) a1, void * ALIGNED(8) a2, - void * ALIGNED(8) a3 ); + void * ALIGNED(8) a3 ) ALWAYS_INLINE; + friend inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, void * ALIGNED(16) a0, void * ALIGNED(16) a1, void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ); + void * ALIGNED(16) a3 ) ALWAYS_INLINE; + friend inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, void * ALIGNED(16) a0, void * ALIGNED(16) a1, void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ); + void * ALIGNED(16) a3 ) ALWAYS_INLINE; protected: public: // wdn @@ -170,46 +153,58 @@ namespace v4 { public: v4() {} // Default constructor - v4(const v4 &a) { // Copy constructor + + v4( const v4 &a ) // Copy constructor + { v = a.v; } - ~v4() {} // Default destructor + ~v4() {} // Default destructor }; - + // v4 miscellaneous functions - inline int any( const v4 &a ) { - return vec_any_ne( (_v4_int)a.v, _false ); + inline int any( const v4 &a ) + { + return vec_any_ne( (_v4_int) 
a.v, _false ); } - - inline int all( const v4 &a ) { - return vec_all_ne( (_v4_int)a.v, _false ); + + inline int all( const v4 &a ) + { + return vec_all_ne( (_v4_int) a.v, _false ); } template - inline v4 splat( const v4 & a ) { + inline v4 splat( const v4 & a ) + { v4 b; + b.v = vec_splat( a.v, n ); + return b; } template - inline v4 shuffle( const v4 & a ) { - _v4_float a_v = a.v; + inline v4 shuffle( const v4 & a ) + { v4 b; - b.v = vec_perm( a_v, a_v, _PERM( i0, i1, i2, i3 ) ); + + b.v = vec_perm( a.v, a.v, _PERM( i0, i1, i2, i3 ) ); + return b; } - inline void swap( v4 &a, v4 &b ) { - _v4_float t; - t = a.v; + inline void swap( v4 &a, v4 &b ) + { + _v4_float t = a.v; + a.v = b.v; + b.v = t; } - inline void transpose( v4 &a, v4 &b, v4 &c, v4 &d ) { + inline void transpose( v4 &a, v4 &b, v4 &c, v4 &d ) + { _v4_float a0 = a.v; // a0 = 0 1 2 3 _v4_float b0 = b.v; // b0 = 4 5 6 7 _v4_float c1 = c.v; // c1 = 8 9 10 11 @@ -231,40 +226,38 @@ namespace v4 { } // v4 memory manipulation functions - + inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) + v4 &a ) { a.v = vec_ld( 0, ( const float * ) p ); } inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { vec_st( a.v, 0, ( float * ) p ); } inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { vec_stl( a.v, 0, ( float * ) p ); } - // FIXME: Ordering semantics - inline void clear_4x1( void * ALIGNED(16) d ) + inline void clear_4x1( void * ALIGNED(16) p ) { - vec_st( _zero, 0, ( float * ) d ); + vec_st( _zero, 0, ( float * ) p ); } - // FIXME: Ordering semantics - inline void copy_4x1( void * ALIGNED(16) d, - const void * ALIGNED(16) s ) + inline void copy_4x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) { - vec_st( vec_ld( 0, ( const float * ) s ), 0, ( float * ) d ); + vec_st( vec_ld( 0, ( const float * ) src ), 0, ( float * ) dst ); } inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) + void * ALIGNED(16) b ) { 
_v4_float va = vec_ld( 0, ( float * ) a ); _v4_float vb = vec_ld( 0, ( float * ) b ); @@ -275,285 +268,330 @@ namespace v4 { // v4 transposed memory manipulation functions - inline void load_4x1_tr( const void *pa, - const void *pb, - const void *pc, - const void *pd, - v4 &a ) { - a.v = (_v4_float){ ((const float *)pa)[0], - ((const float *)pb)[0], - ((const float *)pc)[0], - ((const float *)pd)[0] }; - } - - #if 0 - inline void load_4x2_tr( const void * ALIGNED(8) pa, - const void * ALIGNED(8) pb, - const void * ALIGNED(8) pc, - const void * ALIGNED(8) pd, - v4 &a, v4 &b ) { // FIXME: UGLY!! - a.v = (_v4_float){ ((const float *)pa)[0], - ((const float *)pb)[0], - ((const float *)pc)[0], - ((const float *)pd)[0] }; - b.v = (_v4_float){ ((const float *)pa)[1], - ((const float *)pb)[1], - ((const float *)pc)[1], - ((const float *)pd)[1] }; - } - #endif - - inline void load_4x2_tr( const void * ALIGNED(8) pa, - const void * ALIGNED(8) pb, - const void * ALIGNED(8) pc, - const void * ALIGNED(8) pd, - v4 &a, v4 &b ) + inline void load_4x1_tr( const void *a0, + const void *a1, + const void *a2, + const void *a3, + v4 &a ) + { + a.v = (_v4_float){ ( (const float *) a0 )[0], + ( (const float *) a1 )[0], + ( (const float *) a2 )[0], + ( (const float *) a3 )[0] }; + } + + inline void load_4x2_tr( const void * ALIGNED(8) a0, + const void * ALIGNED(8) a1, + const void * ALIGNED(8) a2, + const void * ALIGNED(8) a3, + v4 &a, + v4 &b ) { - _v4_float a0 = vec_ld( 0, (const float *)pa ); // a0 = 0 1 2 3 - _v4_float b0 = vec_ld( 0, (const float *)pb ); // b0 = 4 5 6 7 - _v4_float c1 = vec_ld( 0, (const float *)pc ); // c1 = 8 9 10 11 - _v4_float d1 = vec_ld( 0, (const float *)pd ); // d1 = 12 13 14 15 + _v4_float r = vec_ld( 0, (const float *) a0 ); // r = 0 1 2 3 + _v4_float s = vec_ld( 0, (const float *) a1 ); // s = 4 5 6 7 + _v4_float t = vec_ld( 0, (const float *) a2 ); // t = 8 9 10 11 + _v4_float u = vec_ld( 0, (const float *) a3 ); // u = 12 13 14 15 // Step 1: Interleave 
top and bottom half - _v4_float a1 = vec_mergeh( a0, c1 ); // a1 = 0 8 1 9 - _v4_float b1 = vec_mergeh( b0, d1 ); // b1 = 4 12 5 13 + _v4_float v = vec_mergeh( r, t ); // v = 0 8 1 9 + _v4_float w = vec_mergeh( s, u ); // w = 4 12 5 13 // Step 2: Interleave even and odd rows - a.v = vec_mergeh( a1, b1 ); // a = 0 4 8 12 - b.v = vec_mergel( a1, b1 ); // b = 1 5 9 13 + a.v = vec_mergeh( v, w ); // a = 0 4 8 12 + b.v = vec_mergel( v, w ); // b = 1 5 9 13 } - - inline void load_4x3_tr( const void * ALIGNED(16) pa, - const void * ALIGNED(16) pb, - const void * ALIGNED(16) pc, - const void * ALIGNED(16) pd, - v4 &a, v4 &b, v4 &c ) { - _v4_float a0 = vec_ld( 0, (const float *)pa ); // a0 = 0 1 2 x - _v4_float b0 = vec_ld( 0, (const float *)pb ); // b0 = 4 5 6 x - _v4_float c1 = vec_ld( 0, (const float *)pc ); // c1 = 8 9 10 x - _v4_float d1 = vec_ld( 0, (const float *)pd ); // d1 = 12 13 14 x + + inline void load_4x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, + v4 &b, + v4 &c ) + { + _v4_float r, s, t, u, d_v; + + a.v = vec_ld( 0, (const float *) a0 ); // a = 0 1 2 x + b.v = vec_ld( 0, (const float *) a1 ); // b = 4 5 6 x + c.v = vec_ld( 0, (const float *) a2 ); // c = 8 9 10 x + d_v = vec_ld( 0, (const float *) a3 ); // d = 12 13 14 x // Step 1: Interleave top and bottom half - _v4_float a1 = vec_mergeh( a0, c1 ); // a1 = 0 8 1 9 - _v4_float b1 = vec_mergeh( b0, d1 ); // b1 = 4 12 5 13 - c1 = vec_mergel( a0, c1 ); // c1 = 2 10 x x - d1 = vec_mergel( b0, d1 ); // d1 = 6 14 x x + r = vec_mergeh( a.v, c.v ); // r = 0 8 1 9 + s = vec_mergeh( b.v, d_v ); // s = 4 12 5 13 + + t = vec_mergel( a.v, c.v ); // t = 2 10 x x + u = vec_mergel( b.v, d_v ); // u = 6 14 x x // Step 2: Interleave even and odd rows - a.v = vec_mergeh( a1, b1 ); // a = 0 4 8 12 - b.v = vec_mergel( a1, b1 ); // b = 1 5 9 13 - c.v = vec_mergeh( c1, d1 ); // c = 2 6 10 14 + a.v = vec_mergeh( r, s ); // a = 0 4 8 12 + 
b.v = vec_mergel( r, s ); // b = 1 5 9 13 + c.v = vec_mergeh( t, u ); // c = 2 6 10 14 } - inline void load_4x4_tr( const void * ALIGNED(16) pa, - const void * ALIGNED(16) pb, - const void * ALIGNED(16) pc, - const void * ALIGNED(16) pd, - v4 &a, v4 &b, v4 &c, v4 &d ) + inline void load_4x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, + v4 &b, + v4 &c, + v4 &d ) { - _v4_float a0 = vec_ld( 0, (const float *)pa ); // a0 = 0 1 2 3 - _v4_float b0 = vec_ld( 0, (const float *)pb ); // b0 = 4 5 6 7 - _v4_float c1 = vec_ld( 0, (const float *)pc ); // c1 = 8 9 10 11 - _v4_float d1 = vec_ld( 0, (const float *)pd ); // d1 = 12 13 14 15 + _v4_float r, s, t, u; + + a.v = vec_ld( 0, (const float *) a0 ); // a = 0 1 2 3 + b.v = vec_ld( 0, (const float *) a1 ); // b = 4 5 6 7 + c.v = vec_ld( 0, (const float *) a2 ); // c = 8 9 10 11 + d.v = vec_ld( 0, (const float *) a3 ); // d = 12 13 14 15 // Step 1: Interleave top and bottom half - _v4_float a1 = vec_mergeh( a0, c1 ); // a1 = 0 8 1 9 - _v4_float b1 = vec_mergeh( b0, d1 ); // b1 = 4 12 5 13 - c1 = vec_mergel( a0, c1 ); // c1 = 2 10 3 11 - d1 = vec_mergel( b0, d1 ); // d1 = 6 14 7 15 + r = vec_mergeh( a.v, c.v ); // r = 0 8 1 9 + s = vec_mergeh( b.v, d.v ); // s = 4 12 5 13 + + t = vec_mergel( a.v, c.v ); // t = 2 10 3 11 + u = vec_mergel( b.v, d.v ); // u = 6 14 7 15 // Step 2: Interleave even and odd rows - a.v = vec_mergeh( a1, b1 ); // a = 0 4 8 12 - b.v = vec_mergel( a1, b1 ); // b = 1 5 9 13 - c.v = vec_mergeh( c1, d1 ); // c = 2 6 10 14 - d.v = vec_mergel( c1, d1 ); // d = 3 7 11 15 + a.v = vec_mergeh( r, s ); // a = 0 4 8 12 + b.v = vec_mergel( r, s ); // b = 1 5 9 13 + c.v = vec_mergeh( t, u ); // c = 2 6 10 14 + d.v = vec_mergel( t, u ); // d = 3 7 11 15 } inline void store_4x1_tr( const v4 &a, - void * pa, - void * pb, - void * pc, - void * pd ) { - _v4_float a_v = a.v; - vec_ste( vec_splat(a_v,0), 0, (float *)pa ); - vec_ste( 
vec_splat(a_v,1), 0, (float *)pb ); - vec_ste( vec_splat(a_v,2), 0, (float *)pc ); - vec_ste( vec_splat(a_v,3), 0, (float *)pd ); - } - - inline void store_4x2_tr( const v4 &a, const v4 &b, - void * ALIGNED(8) pa, - void * ALIGNED(8) pb, - void * ALIGNED(8) pc, - void * ALIGNED(8) pd ) { - _v4_float t, a_v = a.v, b_v = b.v; - t = vec_perm( a_v, b_v, _PERM(0,4,0,4) ); vec_ste( t, 0, (float *)pa ); - vec_ste( t, 4, (float *)pa ); - t = vec_perm( a_v, b_v, _PERM(1,5,1,5) ); vec_ste( t, 0, (float *)pb ); - vec_ste( t, 4, (float *)pb ); - t = vec_perm( a_v, b_v, _PERM(2,6,2,6) ); vec_ste( t, 0, (float *)pc ); - vec_ste( t, 4, (float *)pc ); - t = vec_perm( a_v, b_v, _PERM(3,7,3,7) ); vec_ste( t, 0, (float *)pd ); - vec_ste( t, 4, (float *)pd ); - } - - inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, - void * ALIGNED(16) pa, - void * ALIGNED(16) pb, - void * ALIGNED(16) pc, - void * ALIGNED(16) pd ) { + void *a0, + void *a1, + void *a2, + void *a3 ) + { + vec_ste( vec_splat( a.v, 0 ), 0, (float *) a0 ); + vec_ste( vec_splat( a.v, 1 ), 0, (float *) a1 ); + vec_ste( vec_splat( a.v, 2 ), 0, (float *) a2 ); + vec_ste( vec_splat( a.v, 3 ), 0, (float *) a3 ); + } + + inline void store_4x2_tr( const v4 &a, + const v4 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3 ) + { + _v4_float t; + + t = vec_perm( a.v, b.v, _PERM(0,4,0,4) ); + + vec_ste( t, 0, (float *) a0 ); + vec_ste( t, 4, (float *) a0 ); + + t = vec_perm( a.v, b.v, _PERM(1,5,1,5) ); + + vec_ste( t, 0, (float *) a1 ); + vec_ste( t, 4, (float *) a1 ); + + t = vec_perm( a.v, b.v, _PERM(2,6,2,6) ); + + vec_ste( t, 0, (float *) a2 ); + vec_ste( t, 4, (float *) a2 ); + + t = vec_perm( a.v, b.v, _PERM(3,7,3,7) ); + + vec_ste( t, 0, (float *) a3 ); + vec_ste( t, 4, (float *) a3 ); + } + + inline void store_4x3_tr( const v4 &a, + const v4 &b, + const v4 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) + { 
_v4_float a_v = a.v; // a = 0 1 2 3 _v4_float b_v = b.v; // b = 4 5 6 7 _v4_float c_v = c.v; // c = 8 9 10 11 + _v4_float t, u, v; + t = vec_mergeh( a_v, c_v ); // t = 0 8 1 9 u = vec_mergeh( b_v, b_v ); // u = 4 x 5 x - v = vec_mergeh( t, u ); vec_ste( v, 0, (float *)pa ); - vec_ste( v, 4, (float *)pa ); - vec_ste( v, 8, (float *)pa ); - v = vec_mergel( t, u ); vec_ste( v, 0, (float *)pb ); - vec_ste( v, 4, (float *)pb ); - vec_ste( v, 8, (float *)pb ); + + v = vec_mergeh( t, u ); + + vec_ste( v, 0, (float *) a0 ); + vec_ste( v, 4, (float *) a0 ); + vec_ste( v, 8, (float *) a0 ); + + v = vec_mergel( t, u ); + + vec_ste( v, 0, (float *) a1 ); + vec_ste( v, 4, (float *) a1 ); + vec_ste( v, 8, (float *) a1 ); + t = vec_mergel( a_v, c_v ); // t = 2 10 3 11 u = vec_mergel( b_v, b_v ); // u = 6 x 7 x - v = vec_mergeh( t, u ); vec_ste( v, 0, (float *)pc ); - vec_ste( v, 4, (float *)pc ); - vec_ste( v, 8, (float *)pc ); - v = vec_mergel( t, u ); vec_ste( v, 0, (float *)pd ); - vec_ste( v, 4, (float *)pd ); - vec_ste( v, 8, (float *)pd ); + + v = vec_mergeh( t, u ); + + vec_ste( v, 0, (float *) a2 ); + vec_ste( v, 4, (float *) a2 ); + vec_ste( v, 8, (float *) a2 ); + + v = vec_mergel( t, u ); + + vec_ste( v, 0, (float *) a3 ); + vec_ste( v, 4, (float *) a3 ); + vec_ste( v, 8, (float *) a3 ); } - - inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, - void * ALIGNED(16) pa, - void * ALIGNED(16) pb, - void * ALIGNED(16) pc, - void * ALIGNED(16) pd ) { - _v4_float a0 = a.v; // a0 = 0 1 2 3 - _v4_float b0 = b.v; // b0 = 4 5 6 7 - _v4_float c1 = c.v; // c1 = 8 9 10 11 - _v4_float d1 = d.v; // d1 = 12 13 14 15 + + inline void store_4x4_tr( const v4 &a, + const v4 &b, + const v4 &c, + const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) + { + _v4_float r, s, t, u; + + // a = 0 1 2 3 + // b = 4 5 6 7 + // c = 8 9 10 11 + // d = 12 13 14 15 // Step 1: Interleave top and bottom half - _v4_float a1 
= vec_mergeh( a0, c1 ); // a1 = 0 8 1 9 - _v4_float b1 = vec_mergeh( b0, d1 ); // b1 = 4 12 5 13 - c1 = vec_mergel( a0, c1 ); // c1 = 2 10 3 11 - d1 = vec_mergel( b0, d1 ); // d1 = 6 14 7 15 + r = vec_mergeh( a.v, c.v ); // r = 0 8 1 9 + s = vec_mergeh( b.v, d.v ); // s = 4 12 5 13 + t = vec_mergel( a.v, c.v ); // t = 2 10 3 11 + u = vec_mergel( b.v, d.v ); // u = 6 14 7 15 // Step 2: Interleave even and odd rows - vec_st( vec_mergeh( a1, b1 ), 0, (float *)pa ); // a = 0 4 8 12 - vec_st( vec_mergel( a1, b1 ), 0, (float *)pb ); // b = 1 5 9 13 - vec_st( vec_mergeh( c1, d1 ), 0, (float *)pc ); // c = 2 6 10 14 - vec_st( vec_mergel( c1, d1 ), 0, (float *)pd ); // d = 3 7 11 15 + vec_st( vec_mergeh( r, s ), 0, (float *) a0 ); // a0 = 0 4 8 12 + vec_st( vec_mergel( r, s ), 0, (float *) a1 ); // a1 = 1 5 9 13 + vec_st( vec_mergeh( t, u ), 0, (float *) a2 ); // a2 = 2 6 10 14 + vec_st( vec_mergel( t, u ), 0, (float *) a3 ); // a3 = 3 7 11 15 } ////////////// // v4int class - class v4int : public v4 { - + class v4int : public v4 + { // v4int prefix unary operator friends - friend inline v4int operator +( const v4int & a ); - friend inline v4int operator -( const v4int & a ); - friend inline v4int operator ~( const v4int & a ); - friend inline v4int operator !( const v4int & a ); + friend inline v4int operator +( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator -( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator ~( const v4int & a ) ALWAYS_INLINE; + friend inline v4int operator !( const v4int & a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v4int prefix increment / decrement operator friends - friend inline v4int operator ++( v4int & a ); - friend inline v4int operator --( v4int & a ); + friend inline v4int operator ++( v4int & a ) ALWAYS_INLINE; + friend inline v4int operator --( v4int & a ) ALWAYS_INLINE; // v4int postfix increment / decrement operator friends - friend inline v4int operator ++( 
v4int & a, int ); - friend inline v4int operator --( v4int & a, int ); + friend inline v4int operator ++( v4int & a, int ) ALWAYS_INLINE; + friend inline v4int operator --( v4int & a, int ) ALWAYS_INLINE; // v4int binary operator friends - friend inline v4int operator +( const v4int &a, const v4int &b ); - friend inline v4int operator -( const v4int &a, const v4int &b ); - friend inline v4int operator *( const v4int &a, const v4int &b ); - friend inline v4int operator /( const v4int &a, const v4int &b ); - friend inline v4int operator %( const v4int &a, const v4int &b ); - friend inline v4int operator ^( const v4int &a, const v4int &b ); - friend inline v4int operator &( const v4int &a, const v4int &b ); - friend inline v4int operator |( const v4int &a, const v4int &b ); - friend inline v4int operator <<( const v4int &a, const v4int &b ); - friend inline v4int operator >>( const v4int &a, const v4int &b ); + friend inline v4int operator +( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator -( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator *( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator /( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator %( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ^( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator &( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator |( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator <<( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >>( const v4int &a, const v4int &b ) ALWAYS_INLINE; // v4int logical operator friends - friend inline v4int operator <( const v4int &a, const v4int &b ); - friend inline v4int operator >( const v4int &a, const v4int &b ); - friend inline v4int operator ==( const v4int &a, const v4int &b ); - friend inline 
v4int operator !=( const v4int &a, const v4int &b ); - friend inline v4int operator <=( const v4int &a, const v4int &b ); - friend inline v4int operator >=( const v4int &a, const v4int &b ); - friend inline v4int operator &&( const v4int &a, const v4int &b ); - friend inline v4int operator ||( const v4int &a, const v4int &b ); + friend inline v4int operator <( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4int &a, const v4int &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4int &a, const v4int &b ) ALWAYS_INLINE; // v4int miscellaneous friends - friend inline v4int abs( const v4int &a ); - friend inline v4 czero( const v4int &c, const v4 &a ); - friend inline v4 notczero( const v4int &c, const v4 &a ); + friend inline v4int abs( const v4int &a ) ALWAYS_INLINE; + friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; // FIXME: cswap, notcswap! 
- friend inline v4 merge( const v4int &c, const v4 &t, const v4 &f ); + friend inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) ALWAYS_INLINE; // v4float unary operator friends - friend inline v4int operator !( const v4float & a ); + friend inline v4int operator !( const v4float & a ) ALWAYS_INLINE; // v4float logical operator friends - friend inline v4int operator <( const v4float &a, const v4float &b ); - friend inline v4int operator >( const v4float &a, const v4float &b ); - friend inline v4int operator ==( const v4float &a, const v4float &b ); - friend inline v4int operator !=( const v4float &a, const v4float &b ); - friend inline v4int operator <=( const v4float &a, const v4float &b ); - friend inline v4int operator >=( const v4float &a, const v4float &b ); - friend inline v4int operator &&( const v4float &a, const v4float &b ); - friend inline v4int operator ||( const v4float &a, const v4float &b ); + friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; // v4float miscellaneous friends - friend inline v4float clear_bits( const v4int &m, const v4float &a ); - friend inline v4float set_bits( const v4int &m, const v4float &a ); - friend inline v4float toggle_bits( const v4int &m, const v4float &a ); + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const 
v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; public: // v4int constructors / destructors - + v4int() {} // Default constructor - v4int( const v4int &a ) { // Copy constructor + + v4int( const v4int &a ) // Copy constructor + { v = a.v; } - v4int( const v4 &a ) { // Init from mixed + + v4int( const v4 &a ) // Init from mixed + { v = a.v; } - v4int( int a ) { // Init from scalar - v = (_v4_float)((_v4_int){ a, a, a, a }); + + v4int( int a ) // Init from scalar + { + v = (_v4_float) ( (_v4_int) { a, a, a, a } ); } - v4int( int i0, int i1, int i2, int i3 ) { // Init from scalars - v = (_v4_float)((_v4_int){ i0, i1, i2, i3 }); + + v4int( int i0, int i1, int i2, int i3 ) // Init from scalars + { + v = (_v4_float) ( (_v4_int) { i0, i1, i2, i3 } ); } + ~v4int() {} // Destructor - + // v4int assignment operators - -# define ASSIGN(op,instr) \ - inline v4int &operator op( const v4int &b ) { \ + + #define ASSIGN(op,instr) \ + inline v4int &operator op( const v4int &b ) \ + { \ instr; \ return *this; \ } @@ -588,76 +626,103 @@ namespace v4 { ASSIGN(<<=, v = (_v4_float)vec_sl( (_v4_int)v, (_v4_uint)b.v ) ) ASSIGN(>>=, v = (_v4_float)vec_sr( (_v4_int)v, (_v4_uint)b.v ) ) -# undef ASSIGN + #undef ASSIGN // v4int member access operator - - // FIXME: [] operation probably breaks the compiler if used to modify - // a vector! 
- inline int &operator []( int n ) { return ((int *)&v)[n]; } - inline int operator ()( int n ) { - union { int i[4]; _v4_float v; } t; t.v = v; return t.i[n]; + inline int &operator []( int n ) + { + return ( (int *) &v )[n]; } + inline int operator ()( int n ) + { + union + { + int i[4]; + _v4_float v; + } t; + + t.v = v; + + return t.i[n]; + } }; // v4int prefix unary operators -# define PREFIX_UNARY(op,instr) \ - inline v4int operator op( const v4int & a ) { \ + #define PREFIX_UNARY(op,instr) \ + inline v4int operator op( const v4int &a ) \ + { \ v4int b; \ instr; \ return b; \ } - PREFIX_UNARY(+, b.v = a.v ) - PREFIX_UNARY(-, b.v = (_v4_float)vec_sub( _false, (_v4_int)a.v ) ) - PREFIX_UNARY(!, b.v = (_v4_float)vec_cmpeq( _false, (_v4_int)a.v ) ) - PREFIX_UNARY(~, b.v = (_v4_float)vec_xor( _true, (_v4_int)a.v ) ) - -# undef PREFIX_UNARY + PREFIX_UNARY( +, b.v = a.v ) + PREFIX_UNARY( -, b.v = (_v4_float) vec_sub( _false, (_v4_int) a.v ) ) + PREFIX_UNARY( !, b.v = (_v4_float) vec_cmpeq( _false, (_v4_int) a.v ) ) + PREFIX_UNARY( ~, b.v = (_v4_float) vec_xor( _true, (_v4_int) a.v ) ) + + #undef PREFIX_UNARY // v4int prefix increment / decrement operators - inline v4int operator ++( v4int &a ) { - _v4_float a_v = (_v4_float)vec_add( (_v4_int)a.v, _ione ); + inline v4int operator ++( v4int &a ) + { + _v4_float a_v = (_v4_float) vec_add( (_v4_int) a.v, _ione ); + v4int b; + a.v = a_v; b.v = a_v; + return b; } - inline v4int operator --( v4int &a ) { - _v4_float a_v = (_v4_float)vec_sub( (_v4_int)a.v, _ione ); + inline v4int operator --( v4int &a ) + { + _v4_float a_v = (_v4_float) vec_sub( (_v4_int) a.v, _ione ); + v4int b; + a.v = a_v; b.v = a_v; + return b; } // v4int postfix increment / decrement operators - inline v4int operator ++( v4int &a, int ) { + inline v4int operator ++( v4int &a, int ) + { _v4_float a_v = a.v; + v4int b; - a.v = (_v4_float)vec_add( (_v4_int)a_v, _ione ); + + a.v = (_v4_float) vec_add( (_v4_int) a_v, _ione ); b.v = a_v; + return b; } - 
inline v4int operator --( v4int &a, int ) { + inline v4int operator --( v4int &a, int ) + { _v4_float a_v = a.v; + v4int b; - a.v = (_v4_float)vec_sub( (_v4_int)a_v, _ione ); + + a.v = (_v4_float) vec_sub( (_v4_int) a_v, _ione ); b.v = a_v; + return b; } // v4int binary operators - -# define BINARY(op,instr) \ - inline v4int operator op( const v4int &a, const v4int &b ) { \ + + #define BINARY(op,instr) \ + inline v4int operator op( const v4int &a, const v4int &b ) \ + { \ v4int c; \ instr; \ return c; \ @@ -692,12 +757,13 @@ namespace v4 { BINARY(<<, c.v = (_v4_float)vec_sl( (_v4_int)a.v, (_v4_uint)b.v ) ) BINARY(>>, c.v = (_v4_float)vec_sr( (_v4_int)a.v, (_v4_uint)b.v ) ) -# undef BINARY + #undef BINARY // v4int logical operators -# define LOGICAL(op,instr) \ - inline v4int operator op( const v4int &a, const v4int &b ) { \ + #define LOGICAL(op,instr) \ + inline v4int operator op( const v4int &a, const v4int &b ) \ + { \ v4int c; \ instr; \ return c; \ @@ -723,86 +789,92 @@ namespace v4 { vec_cmpeq( (_v4_int)b.v, _false ) ) ) ) -# undef LOGICAL + #undef LOGICAL // v4int miscellaneous functions - inline v4int abs( const v4int &a ) { + inline v4int abs( const v4int &a ) + { v4int b; - b.v = (_v4_float)vec_abs( (_v4_int)a.v ); + + b.v = (_v4_float) vec_abs( (_v4_int) a.v ); + return b; } - inline v4 czero( const v4int &c, const v4 &a ) { + inline v4 czero( const v4int &c, const v4 &a ) + { v4 b; + b.v = vec_andc( a.v, c.v ); + return b; } - inline v4 notczero( const v4int &c, const v4 &a ) { + inline v4 notczero( const v4int &c, const v4 &a ) + { v4 b; + b.v = vec_and( a.v, c.v ); + return b; } - - inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) { - v4 m; - m.v = vec_sel( f.v, t.v, (_v4_uint)c.v ); - return m; + + inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) + { + v4 tf; + + tf.v = vec_sel( f.v, t.v, (_v4_uint) c.v ); + + return tf; } //////////////// // v4float class - class v4float : public v4 { - + class v4float : public v4 + { // v4float 
prefix unary operator friends - friend inline v4float operator +( const v4float &a ); - friend inline v4float operator -( const v4float &a ); - friend inline v4float operator ~( const v4float &a ); - friend inline v4int operator !( const v4float &a ); + friend inline v4float operator +( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator -( const v4float &a ) ALWAYS_INLINE; + friend inline v4float operator ~( const v4float &a ) ALWAYS_INLINE; + friend inline v4int operator !( const v4float &a ) ALWAYS_INLINE; // Note: Referencing (*) and dereferencing (&) apply to the whole vector // v4float prefix increment / decrement operator friends - friend inline v4float operator ++( v4float &a ); - friend inline v4float operator --( v4float &a ); + friend inline v4float operator ++( v4float &a ) ALWAYS_INLINE; + friend inline v4float operator --( v4float &a ) ALWAYS_INLINE; // v4float postfix increment / decrement operator friends - friend inline v4float operator ++( v4float &a, int ); - friend inline v4float operator --( v4float &a, int ); + friend inline v4float operator ++( v4float &a, int ) ALWAYS_INLINE; + friend inline v4float operator --( v4float &a, int ) ALWAYS_INLINE; // v4float binary operator friends - friend inline v4float operator +( const v4float &a, const v4float &b ); - friend inline v4float operator -( const v4float &a, const v4float &b ); - friend inline v4float operator *( const v4float &a, const v4float &b ); - friend inline v4float operator /( const v4float &a, const v4float &b ); - - // ------------------------------------------------------------------------- - // begin hacks - // friend inline v4float operator *( const v4float &a, const v4 &b ); - // end hacks - // ------------------------------------------------------------------------- + friend inline v4float operator +( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator -( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float 
operator *( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4float operator /( const v4float &a, const v4float &b ) ALWAYS_INLINE; // v4float logical operator friends - friend inline v4int operator <( const v4float &a, const v4float &b ); - friend inline v4int operator >( const v4float &a, const v4float &b ); - friend inline v4int operator ==( const v4float &a, const v4float &b ); - friend inline v4int operator !=( const v4float &a, const v4float &b ); - friend inline v4int operator <=( const v4float &a, const v4float &b ); - friend inline v4int operator >=( const v4float &a, const v4float &b ); - friend inline v4int operator &&( const v4float &a, const v4float &b ); - friend inline v4int operator ||( const v4float &a, const v4float &b ); + friend inline v4int operator <( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ==( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator !=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator <=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator >=( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator &&( const v4float &a, const v4float &b ) ALWAYS_INLINE; + friend inline v4int operator ||( const v4float &a, const v4float &b ) ALWAYS_INLINE; // v4float math library friends -# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) -# define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) + #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE + #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ + const v4float &b ) ALWAYS_INLINE CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); @@ -812,61 +884,73 
@@ namespace v4 { CMATH_FR2(copysign); -# undef CMATH_FR1 -# undef CMATH_FR2 + #undef CMATH_FR1 + #undef CMATH_FR2 // v4float miscellaneous friends - friend inline v4float rsqrt_approx( const v4float &a ); - friend inline v4float rsqrt( const v4float &a ); - friend inline v4float rcp_approx( const v4float &a ); - friend inline v4float rcp( const v4float &a ); - friend inline v4float fma( const v4float &a, const v4float &b, const v4float &c ); - friend inline v4float fms( const v4float &a, const v4float &b, const v4float &c ); - friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ); - friend inline v4float clear_bits( const v4int &m, const v4float &a ); - friend inline v4float set_bits( const v4int &m, const v4float &a ); - friend inline v4float toggle_bits( const v4int &m, const v4float &a ); - friend inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ); - friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ); - friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ); - // FIXME: crack - friend inline void trilinear( v4float & wl, v4float & wh ); - + friend inline v4float rsqrt_approx( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rsqrt ( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rcp_approx( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rcp ( const v4float &a ) ALWAYS_INLINE; + friend inline v4float fma ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float fms ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline void 
increment_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; + friend inline void trilinear( v4float &wl, v4float &wh ) ALWAYS_INLINE; + public: // v4float constructors / destructors - + v4float() {} // Default constructor - v4float( const v4float &a ) { // Copy constructor + + v4float( const v4float &a ) // Copy constructor + { v = a.v; } - v4float( const v4 &a ) { // Init from mixed + + v4float( const v4 &a ) // Init from mixed + { v = a.v; } - v4float( float a ) { // Init from scalar - v = (_v4_float){ a, a, a, a }; + + v4float( float a ) // Init from scalar + { + v = (_v4_float) { a, a, a, a }; } - v4float( float f0, float f1, float f2, float f3 ) { // Init from scalars - v = (_v4_float){ f0, f1, f2, f3 }; + + v4float( float f0, float f1, float f2, float f3 ) // Init from scalars + { + v = (_v4_float) { f0, f1, f2, f3 }; } + ~v4float() {} // Destructor // v4float assignment operators -# define ASSIGN(op,instr) \ - inline v4float &operator op( const v4float &b ) { \ + #define ASSIGN(op,instr) \ + inline v4float &operator op( const v4float &b ) \ + { \ instr; \ return *this; \ } - ASSIGN(=, v = b.v ); - ASSIGN(+=, v = vec_add(v,b.v) ); - ASSIGN(-=, v = vec_sub(v,b.v) ); - ASSIGN(*=, v = vec_madd(v,b.v,_zero) ); + ASSIGN( =, v = b.v ); + ASSIGN( +=, v = vec_add( v, b.v ) ); + ASSIGN( -=, v = vec_sub( v, b.v ) ); + ASSIGN( *=, v = vec_madd( v, b.v, _zero ) ); + + #undef ASSIGN // This does one NR iteration and is supposed to be accurate enough. - inline v4float &operator /=( const v4float &a ) { + inline v4float &operator /=( const v4float &a ) + { _v4_float a_v = a.v, b_v; // Compute an estimate of the reciprocal of a (??-bit accurate) @@ -892,7 +976,8 @@ namespace v4 { #if 0 // This is a more accurate version that does two NR iterations. 
- inline v4float &operator /=( const v4float &a ) { + inline v4float &operator /=( const v4float &a ) + { _v4_float a_v = a.v, b_v; // Compute an estimate of the reciprocal of a (??-bit accurate) @@ -918,93 +1003,130 @@ namespace v4 { } #endif -# undef ASSIGN - // v4float member access operator - // FIXME: [] operation probably breaks the compiler if used to modify - // a vector! - - inline float &operator []( int n ) { return ((float *)&v)[n]; } - inline float operator ()( int n ) { - union { float f[4]; _v4_float v; } t; t.v = v; return t.f[n]; + inline float &operator []( int n ) + { + return ( (float *) &v )[n]; } + inline float operator ()( int n ) + { + union + { + float f[4]; + _v4_float v; + } t; + + t.v = v; + + return t.f[n]; + } }; // v4float prefix unary operators - inline v4float operator +( const v4float &a ) { + inline v4float operator +( const v4float &a ) + { v4float b; + b.v = a.v; + return b; } - inline v4float operator -( const v4float &a ) { + inline v4float operator -( const v4float &a ) + { v4float b; + b.v = vec_sub( _zero, a.v ); + return b; } - inline v4int operator !( const v4float &a ) { + inline v4int operator !( const v4float &a ) + { v4int b; - b.v = (_v4_float)vec_cmpeq( a.v, _zero ); + + b.v = (_v4_float) vec_cmpeq( a.v, _zero ); + return b; } // v4float prefix increment / decrement operators - inline v4float operator ++( v4float &a ) { - _v4_float a_v = vec_add( a.v, _one ); + inline v4float operator ++( v4float &a ) + { v4float b; - a.v = a_v; - b.v = a_v; + + _v4_float t = vec_add( a.v, _one ); + + a.v = t; + b.v = t; + return b; } - inline v4float operator --( v4float &a ) { - _v4_float a_v = vec_sub( a.v, _one ); + inline v4float operator --( v4float &a ) + { v4float b; - a.v = a_v; - b.v = a_v; + + _v4_float t = vec_sub( a.v, _one ); + + a.v = t; + b.v = t; + return b; } // v4float postfix increment / decrement operators - inline v4float operator ++( v4float &a, int ) { - _v4_float a_v = a.v; + inline v4float operator ++( 
v4float &a, int ) + { v4float b; + + _v4_float a_v = a.v; + a.v = vec_add( a_v, _one ); b.v = a_v; + return b; } - inline v4float operator --( v4float &a, int ) { - _v4_float a_v = a.v; + inline v4float operator --( v4float &a, int ) + { v4float b; + + _v4_float a_v = a.v; + a.v = vec_sub( a_v, _one ); b.v = a_v; + return b; } // v4float binary operators -# define BINARY(op,instr) \ - inline v4float operator op( const v4float &a, const v4float &b ) { \ + #define BINARY(op,instr) \ + inline v4float operator op( const v4float &a, const v4float &b ) \ + { \ v4float c; \ instr; \ return c; \ } - BINARY(+, c.v = vec_add( a.v, b.v ) ) - BINARY(-, c.v = vec_sub( a.v, b.v ) ) - BINARY(*, c.v = vec_madd( a.v, b.v, _zero ) ) + BINARY( +, c.v = vec_add( a.v, b.v ) ) + BINARY( -, c.v = vec_sub( a.v, b.v ) ) + BINARY( *, c.v = vec_madd( a.v, b.v, _zero ) ) - inline v4float operator /( const v4float &n, const v4float &a ) { - _v4_float a_v = a.v, b_v; + #undef BINARY + + inline v4float operator /( const v4float &n, const v4float &a ) + { v4float c; + _v4_float a_v = a.v, b_v; + // Compute an estimate of the reciprocal of a (??-bit accurate) b_v = vec_re( a_v ); @@ -1028,10 +1150,12 @@ namespace v4 { #if 0 // This is a more accurate version that does two NR iterations. 
- inline v4float operator /( const v4float &n, const v4float &a ) { - _v4_float a_v = a.v, b_v; + inline v4float operator /( const v4float &n, const v4float &a ) + { v4float c; + _v4_float a_v = a.v, b_v; + // Compute an estimate of the reciprocal of a (??-bit accurate) b_v = vec_re( a_v ); @@ -1055,72 +1179,59 @@ namespace v4 { } #endif -# undef BINARY - - // ------------------------------------------------------------------------- - // begin hacks -/* # define BINARY(op,instr) \ */ -/* inline v4float operator op( const v4float &a, const v4 &b ) { \ */ -/* v4float c; \ */ -/* instr; \ */ -/* return c; \ */ -/* } */ - -/* BINARY(*, c.v = vec_madd( a.v, b.v, _zero ) ) */ - -/* # undef BINARY */ - // end hacks - // ------------------------------------------------------------------------- - // v4float logical operators -# define LOGICAL(op,instr) \ - inline v4int operator op( const v4float &a, const v4float &b ) { \ + #define LOGICAL(op,instr) \ + inline v4int operator op( const v4float &a, const v4float &b ) \ + { \ v4int c; \ instr; \ return c; \ } - LOGICAL(<, c.v = (_v4_float)vec_cmplt( a.v, b.v ) ) - LOGICAL(>, c.v = (_v4_float)vec_cmpgt( a.v, b.v ) ) - LOGICAL(==, c.v = (_v4_float)vec_cmpeq( a.v, b.v ) ) - LOGICAL(!=, c.v = (_v4_float)vec_xor( vec_cmpeq( a.v, b.v ), _true ) ) - LOGICAL(<=, c.v = (_v4_float)vec_cmple( a.v, b.v ) ) - LOGICAL(>=, c.v = (_v4_float)vec_cmpge( a.v, b.v ) ) - LOGICAL(&&, c.v = (_v4_float)vec_xor( vec_or( vec_cmpeq( a.v, _zero ), - vec_cmpeq( b.v, _zero ) ), - _true ) ) - LOGICAL(||, c.v = (_v4_float)vec_xor( vec_and( vec_cmpeq( a.v, _zero ), - vec_cmpeq( b.v, _zero ) ), - _true ) ) + LOGICAL( <, c.v = (_v4_float) vec_cmplt( a.v, b.v ) ) + LOGICAL( >, c.v = (_v4_float) vec_cmpgt( a.v, b.v ) ) + LOGICAL( ==, c.v = (_v4_float) vec_cmpeq( a.v, b.v ) ) + LOGICAL( <=, c.v = (_v4_float) vec_cmple( a.v, b.v ) ) + LOGICAL( >=, c.v = (_v4_float) vec_cmpge( a.v, b.v ) ) + LOGICAL( !=, c.v = (_v4_float) vec_xor( vec_cmpeq( a.v, b.v ), + _true ) ) 
+ LOGICAL( &&, c.v = (_v4_float) vec_xor( vec_or( vec_cmpeq( a.v, _zero ), + vec_cmpeq( b.v, _zero ) ), + _true ) ) + LOGICAL( ||, c.v = (_v4_float) vec_xor( vec_and( vec_cmpeq( a.v, _zero ), + vec_cmpeq( b.v, _zero ) ), + _true ) ) -# undef LOGICAL + #undef LOGICAL // v4float math library functions -# define CMATH_FR1(fn) \ - inline v4float fn( const v4float &a ) { \ - union { float f[4]; _v4_float v; } t; \ - v4float b; \ - t.v = a.v; \ - b.v = (_v4_float){ (float) ::fn( t.f[0] ), \ + #define CMATH_FR1(fn) \ + inline v4float fn( const v4float &a ) \ + { \ + union { float f[4]; _v4_float v; } t; \ + v4float b; \ + t.v = a.v; \ + b.v = (_v4_float){ (float) ::fn( t.f[0] ), \ (float) ::fn( t.f[1] ), \ (float) ::fn( t.f[2] ), \ (float) ::fn( t.f[3] ) }; \ - return b; \ + return b; \ } - -# define CMATH_FR2(fn) \ - inline v4float fn( const v4float &a, const v4float &b ) { \ + + #define CMATH_FR2(fn) \ + inline v4float fn( const v4float &a, const v4float &b ) \ + { \ union { float f[4]; _v4_float v; } t; \ union { float f[4]; _v4_float v; } u; \ v4float c; \ t.v = a.v; \ u.v = b.v; \ - c.v = (_v4_float){ (float) ::fn( t.f[0], u.f[0] ), \ - (float) ::fn( t.f[1], u.f[1] ), \ - (float) ::fn( t.f[2], u.f[2] ), \ - (float) ::fn( t.f[3], u.f[3] ) }; \ + c.v = (_v4_float){ (float) ::fn( t.f[0], u.f[0] ), \ + (float) ::fn( t.f[1], u.f[1] ), \ + (float) ::fn( t.f[2], u.f[2] ), \ + (float) ::fn( t.f[3], u.f[3] ) }; \ return c; \ } @@ -1130,17 +1241,25 @@ namespace v4 { CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) /*CMATH_FR1(sqrt)*/ CMATH_FR1(tan) CMATH_FR1(tanh) - inline v4float fabs( const v4float &a ) { + #undef CMATH_FR1 + #undef CMATH_FR2 + + inline v4float fabs( const v4float &a ) + { v4float b; + b.v = vec_andc( a.v, _sign ); + return b; } // This version does one NR iteration and is supposed to be accurate enough. 
- inline v4float sqrt( const v4float &a ) { - _v4_float a_v = a.v, b_v; + inline v4float sqrt( const v4float &a ) + { v4float b; + _v4_float a_v = a.v, b_v; + // Compute an estimate of the rsqrt (??-bit accurate) b_v = vec_rsqrte( a_v ); @@ -1148,7 +1267,7 @@ namespace v4 { // FIXME: CHECK NUMERICS. HOW MANY N-R STEPS NECESSARY? // APPLE'S ALTIVEC PAGE SUGGESTS TWO. - b_v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), + b_v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), vec_madd( b_v, _half, _zero ), b_v ); @@ -1161,10 +1280,12 @@ namespace v4 { #if 0 // This is a more accurate version that does two NR iterations. - inline v4float sqrt( const v4float &a ) { - _v4_float a_v = a.v, b_v; + inline v4float sqrt( const v4float &a ) + { v4float b; + _v4_float a_v = a.v, b_v; + // Compute an estimate of the rsqrt (??-bit accurate) b_v = vec_rsqrte( a_v ); @@ -1172,10 +1293,10 @@ namespace v4 { // FIXME: CHECK NUMERICS. HOW MANY N-R STEPS NECESSARY? // APPLE'S ALTIVEC PAGE SUGGESTS TWO. 
- b_v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), + b_v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), vec_madd( b_v, _half, _zero ), b_v ); - b_v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), + b_v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), vec_madd( b_v, _half, _zero ), b_v ); @@ -1187,28 +1308,33 @@ namespace v4 { } #endif - inline v4float copysign( const v4float &a, const v4float &b ) { + inline v4float copysign( const v4float &a, const v4float &b ) + { v4float c; + c.v = vec_or( vec_andc( a.v, _sign ), vec_and( b.v, _sign ) ); + return c; } -# undef CMATH_FR1 -# undef CMATH_FR2 + // v4float miscellaneous functions - // v4float miscelleanous functions - - inline v4float rsqrt_approx( const v4float &a ) { + inline v4float rsqrt_approx( const v4float &a ) + { v4float b; + b.v = vec_rsqrte( a.v ); + return b; } // This version does one NR iteration and is supposed to be accurate enough. - inline v4float rsqrt( const v4float &a ) { - _v4_float a_v = a.v, b_v; + inline v4float rsqrt( const v4float &a ) + { v4float b; + _v4_float a_v = a.v, b_v; + // Compute an estimate of the rsqrt (??-bit accurate) b_v = vec_rsqrte( a_v ); @@ -1219,7 +1345,8 @@ namespace v4 { // b_v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), // vec_madd( b_v, _half, _zero ), // b_v ); - b.v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), + + b.v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), vec_madd( b_v, _half, _zero ), b_v ); @@ -1228,10 +1355,12 @@ namespace v4 { #if 0 // This is a more accurate version that does two NR iterations. - inline v4float rsqrt( const v4float &a ) { - _v4_float a_v = a.v, b_v; + inline v4float rsqrt( const v4float &a ) + { v4float b; + _v4_float a_v = a.v, b_v; + // Compute an estimate of the rsqrt (??-bit accurate) b_v = vec_rsqrte( a_v ); @@ -1239,10 +1368,11 @@ namespace v4 { // FIXME: CHECK NUMERICS. HOW MANY N-R STEPS NECESSARY? 
// APPLE'S ALTIVEC PAGE SUGGESTS TWO. - b_v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), + b_v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), vec_madd( b_v, _half, _zero ), b_v ); - b.v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), + + b.v = vec_madd( vec_nmsub( vec_madd( b_v, b_v, _zero ), a_v, _one ), vec_madd( b_v, _half, _zero ), b_v ); @@ -1250,17 +1380,22 @@ namespace v4 { } #endif - inline v4float rcp_approx( const v4float &a ) { + inline v4float rcp_approx( const v4float &a ) + { v4float b; + b.v = vec_re( a.v ); + return b; } // This version does one NR iteration and is supposed to be accurate enough. - inline v4float rcp( const v4float &a ) { - _v4_float a_v = a.v, b_v; + inline v4float rcp( const v4float &a ) + { v4float b; + _v4_float a_v = a.v, b_v; + // Compute an estimate of the reciprocal of a (??-bit accurate) b_v = vec_re( a_v ); @@ -1273,7 +1408,8 @@ namespace v4 { // THE SPU IMPLEMENTATION HAS AN ALTERNATE THAT MAY BE MORE // ACCURATE (OR AT LEAST USES FEWER CONSTANTS). - // b_v = vec_madd( vec_nmsub( b_v, a_v, _one ), b_v, b_v ); + // b_v = vec_madd( vec_nmsub( b_v, a_v, _one ), b_v, b_v ); + b.v = vec_madd( vec_nmsub( b_v, a_v, _one ), b_v, b_v ); return b; @@ -1281,10 +1417,12 @@ namespace v4 { #if 0 // This is a more accurate version that does two NR iterations. - inline v4float rcp( const v4float &a ) { - _v4_float a_v = a.v, b_v; + inline v4float rcp( const v4float &a ) + { v4float b; + _v4_float a_v = a.v, b_v; + // Compute an estimate of the reciprocal of a (??-bit accurate) b_v = vec_re( a_v ); @@ -1298,76 +1436,113 @@ namespace v4 { // ACCURATE (OR AT LEAST USES FEWER CONSTANTS). 
b_v = vec_madd( vec_nmsub( b_v, a_v, _one ), b_v, b_v ); + b.v = vec_madd( vec_nmsub( b_v, a_v, _one ), b_v, b_v ); return b; } #endif - inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) { + inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) + { v4float d; + d.v = vec_madd( a.v, b.v, c.v ); + return d; } - inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) { + inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) + { v4float d; - // d.v = vec_sub( _zero, vec_nmsub( a.v, b.v, c.v ) ); // FIXME: Sigh ... + + // d.v = vec_sub( _zero, vec_nmsub( a.v, b.v, c.v ) ); // FIXME: Sigh ... + d.v = vec_msub( a.v, b.v, c.v ) ; + return d; } - inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) { + inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) + { v4float d; + d.v = vec_nmsub( a.v, b.v, c.v ); + return d; } - inline v4float clear_bits( const v4int &m, const v4float &a ) { + inline v4float clear_bits( const v4int &m, const v4float &a ) + { v4float b; + b.v = vec_andc( a.v, m.v ); + return b; } - inline v4float set_bits( const v4int &m, const v4float &a ) { + inline v4float set_bits( const v4int &m, const v4float &a ) + { v4float b; + b.v = vec_or( a.v, m.v ); + return b; } - inline v4float toggle_bits( const v4int &m, const v4float &a ) { + inline v4float toggle_bits( const v4int &m, const v4float &a ) + { v4float b; + b.v = vec_xor( a.v, m.v ); + return b; } - - inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) { + + inline void increment_4x1( float * ALIGNED(16) p, + const v4float &a ) + { vec_st( vec_add( vec_ld( 0, p ), a.v ), 0, p ); } - inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) { + inline void decrement_4x1( float * ALIGNED(16) p, + const v4float &a ) + { vec_st( vec_sub( vec_ld( 0, p ), a.v ), 0, p ); } - inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) { + inline 
void scale_4x1( float * ALIGNED(16) p, + const v4float &a ) + { vec_st( vec_madd( vec_ld( 0, p ), a.v, _zero ), 0, p ); } - inline void trilinear( v4float & wl, v4float & wh ) { + // Given wl = x y z w, compute: + // wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) + // wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) + inline void trilinear( v4float &wl, v4float &wh ) + { _v4_float z = wl.v, xy; - xy = vec_add( _one, vec_xor( _n02, vec_mergeh(z,z) ) ); - z = vec_add( _one, vec_xor( _n02, vec_splat(z,2) ) ); - xy = vec_madd( vec_perm(xy,xy,_PERM(0,1,0,1)), vec_mergel(xy,xy), _zero ); - wl.v = vec_madd( xy, vec_splat(z,0), _zero ); - wh.v = vec_madd( xy, vec_splat(z,1), _zero ); + + xy = vec_add( _one, vec_xor( _n02, vec_mergeh( z, z ) ) ); + + z = vec_add( _one, vec_xor( _n02, vec_splat( z, 2 ) ) ); + + xy = vec_madd( vec_perm( xy, xy, _PERM(0,1,0,1) ), + vec_mergel( xy, xy ), + _zero ); + + wl.v = vec_madd( xy, vec_splat( z, 0 ), _zero ); + + wh.v = vec_madd( xy, vec_splat( z, 1 ), _zero ); } -# undef _v4_int -# undef _v4_uint -# undef _v4_float -# undef _v16_uchar + #undef _v4_int + #undef _v4_uint + #undef _v4_float + #undef _v16_uchar -# undef _PERM + #undef _PERM } // namespace v4 diff --git a/src/util/v4/v4_avx.h b/src/util/v4/v4_avx.h index 3c48096e..29612f45 100644 --- a/src/util/v4/v4_avx.h +++ b/src/util/v4/v4_avx.h @@ -5,47 +5,41 @@ #error "Do not include v4_avx.h directly; use v4.h" #endif -#define V4_ACCELERATION -#define V4_AVX_ACCELERATION - #include #include +#define V4_ACCELERATION +#define V4_AVX_ACCELERATION + #ifndef ALIGNED #define ALIGNED(n) #endif -// FIXME: IN PORTABLE, ALTIVEC, SPU -// - UPDATE V4INT, V4FLOAT - -// This requires gcc-3.3 and up -// Also, Bug 12902 has not been resolved on gcc-3.x.x. See README.patches for -// details. 
gcc-4.x.x does not seem to have this bug but may suffer from -// other problems (use "-fno-strict-aliasing" on these platforms) - #define ALWAYS_INLINE __attribute__((always_inline)) -namespace v4 { - +namespace v4 +{ class v4; class v4int; class v4float; - template struct permute { + template + struct permute + { constexpr static int value = i0 + i1*4 + i2*16 + i3*64; - }; // permute + }; + + #define PERM(i0,i1,i2,i3) ((i0) + (i1)*4 + (i2)*16 + (i3)*64) -# define PERM(i0,i1,i2,i3) ((i0) + (i1)*4 + (i2)*16 + (i3)*64) - //////////////// // v4 base class - - class v4 { - + + class v4 + { friend class v4int; friend class v4float; - - // v4 miscellenous friends + + // v4 miscellaneous friends friend inline int any( const v4 &a ) ALWAYS_INLINE; friend inline int all( const v4 &a ) ALWAYS_INLINE; @@ -61,53 +55,68 @@ namespace v4 { // v4int miscellaneous friends - friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; // v4 memory manipulation friends - - friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) ALWAYS_INLINE; - friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; - friend inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) ALWAYS_INLINE; - friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + + friend inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) ALWAYS_INLINE; + + friend inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + + friend inline void 
stream_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + + friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + + friend inline void copy_4x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) ALWAYS_INLINE; + + friend inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends friend inline void load_4x1_tr( const void *a0, const void *a1, const void *a2, const void *a3, v4 &a ) ALWAYS_INLINE; + friend inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, v4 &a, v4 &b ) ALWAYS_INLINE; + friend inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, v4 &a, v4 &b, v4 &c ) ALWAYS_INLINE; + friend inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, v4 &a, v4 &b, v4 &c, v4 &d ) ALWAYS_INLINE; - + friend inline void store_4x1_tr( const v4 &a, void *a0, void *a1, void *a2, void *a3 ) ALWAYS_INLINE; + friend inline void store_4x2_tr( const v4 &a, const v4 &b, void * ALIGNED(8) a0, void * ALIGNED(8) a1, void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) ALWAYS_INLINE; + friend inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, void * ALIGNED(16) a0, void * ALIGNED(16) a1, void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) ALWAYS_INLINE; + friend inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, void * ALIGNED(16) a0, @@ -117,81 +126,102 @@ namespace v4 { protected: - union { + union + { int i[4]; float f[4]; __m128 v; }; - + public: v4() {} // Default constructor - v4(const v4 &a) { v=a.v; } // Copy constructor - ~v4() {} // Default destructor + v4( const v4 &a ) // Copy constructor + { + v = a.v; + } + + ~v4() {} // Default destructor }; - + // v4 miscellaneous functions - inline int 
any( const v4 &a ) { + inline int any( const v4 &a ) + { return a.i[0] || a.i[1] || a.i[2] || a.i[3]; } - - inline int all( const v4 &a ) { + + inline int all( const v4 &a ) + { return a.i[0] && a.i[1] && a.i[2] && a.i[3]; } - // Note: n MUST BE AN IMMEDIATE! template - inline v4 splat(const v4 & a) { - __m128 a_v = a.v; + inline v4 splat( const v4 & a ) + { v4 b; - b.v = _mm_shuffle_ps( a_v, a_v, (n*permute<1,1,1,1>::value)); + + b.v = _mm_shuffle_ps( a.v, a.v, ( n * permute<1,1,1,1>::value ) ); + return b; } - // Note: i0:3 MUST BE IMMEDIATES! */ template - inline v4 shuffle( const v4 & a ) { - __m128 a_v = a.v; + inline v4 shuffle( const v4 & a ) + { v4 b; - b.v = _mm_shuffle_ps( a_v, a_v, (permute::value) ); + + b.v = _mm_shuffle_ps( a.v, a.v, ( permute::value ) ); + return b; } - inline void swap( v4 &a, v4 &b ) { - __m128 a_v = a.v; a.v = b.v; b.v = a_v; + inline void swap( v4 &a, v4 &b ) + { + __m128 t = a.v; + + a.v = b.v; + + b.v = t; } - inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) { + inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) + { __m128 a0_v = a0.v, a1_v = a1.v, a2_v = a2.v, a3_v = a3.v, t, u; + t = _mm_unpackhi_ps( a0_v, a1_v ); a0_v = _mm_unpacklo_ps( a0_v, a1_v ); u = _mm_unpackhi_ps( a2_v, a3_v ); a2_v = _mm_unpacklo_ps( a2_v, a3_v ); + a1_v = _mm_movehl_ps( a2_v, a0_v ); a0_v = _mm_movelh_ps( a0_v, a2_v ); a2_v = _mm_movelh_ps( t, u ); a3_v = _mm_movehl_ps( u, t ); - a0.v = a0_v; a1.v = a1_v; a2.v = a2_v; a3.v = a3_v; + + a0.v = a0_v; + a1.v = a1_v; + a2.v = a2_v; + a3.v = a3_v; } // v4 memory manipulation functions - + inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) + v4 &a ) { a.v = _mm_load_ps( ( float * ) p ); } inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { _mm_store_ps( ( float * ) p, a.v ); } inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { _mm_stream_ps( ( float * ) p, a.v ); } @@ -207,9 +237,8 @@ namespace v4 { _mm_store_ps( 
( float * ) dst, _mm_load_ps( ( const float * ) src ) ); } - /* FIXME: MAKE ROBUST AGAINST ALIASING ISSUES */ inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) + void * ALIGNED(16) b ) { __m128 t = _mm_load_ps( ( float * ) a ); @@ -219,129 +248,180 @@ namespace v4 { // v4 transposed memory manipulation functions - inline void load_4x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, v4 &a ) { - a.v = _mm_setr_ps( ((const float *)a0)[0], - ((const float *)a1)[0], - ((const float *)a2)[0], - ((const float *)a3)[0] ); + inline void load_4x1_tr( const void *a0, + const void *a1, + const void *a2, + const void *a3, + v4 &a ) + { + a.v = _mm_setr_ps( ( (const float *) a0 )[0], + ( (const float *) a1 )[0], + ( (const float *) a2 )[0], + ( (const float *) a3 )[0] ); } inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, - v4 &a, v4 &b ) { + v4 &a, + v4 &b ) + { __m128 a_v, b_v, t; + b_v = _mm_setzero_ps(); - t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a0 ), (__m64 *)a1 ); - b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a2 ), (__m64 *)a3 ); - a_v = _mm_shuffle_ps( t, b_v, 0x88 ); - b_v = _mm_shuffle_ps( t, b_v, 0xdd ); - a.v = a_v; b.v = b_v; + + t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *) a0 ), (__m64 *) a1 ); + b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *) a2 ), (__m64 *) a3 ); + + a.v = _mm_shuffle_ps( t, b_v, 0x88 ); + b.v = _mm_shuffle_ps( t, b_v, 0xdd ); } inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ) { - __m128 a_v, b_v, c_v, t, u; - t = _mm_load_ps( (const float *)a0 ); - b_v = _mm_load_ps( (const float *)a1 ); - c_v = _mm_load_ps( (const float *)a2 ); - u = _mm_load_ps( (const float *)a3 ); - a_v = _mm_unpacklo_ps( t, b_v ); - b_v = _mm_unpackhi_ps( t, b_v ); - t = _mm_unpacklo_ps( c_v, u ); - u = _mm_unpackhi_ps( 
c_v, u ); - c_v = _mm_movelh_ps( b_v, u ); - b_v = _mm_movehl_ps( t, a_v ); - a_v = _mm_movelh_ps( a_v, t ); - a.v = a_v; b.v = b_v; c.v = c_v; + v4 &a, + v4 &b, + v4 &c ) + { + __m128 r, s, t, u, d_v; + + a.v = _mm_load_ps( (const float *) a0 ); + b.v = _mm_load_ps( (const float *) a1 ); + c.v = _mm_load_ps( (const float *) a2 ); + d_v = _mm_load_ps( (const float *) a3 ); + + r = _mm_unpacklo_ps( a.v, b.v ); + s = _mm_unpackhi_ps( a.v, b.v ); + + t = _mm_unpacklo_ps( c.v, d_v ); + u = _mm_unpackhi_ps( c.v, d_v ); + + a.v = _mm_movelh_ps( r, t ); + b.v = _mm_movehl_ps( t, r ); + c.v = _mm_movelh_ps( s, u ); } inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) { - __m128 a_v, b_v, c_v, d_v, t, u; - a_v = _mm_load_ps( (const float *)a0 ); - b_v = _mm_load_ps( (const float *)a1 ); - c_v = _mm_load_ps( (const float *)a2 ); - d_v = _mm_load_ps( (const float *)a3 ); - t = _mm_unpackhi_ps( a_v, b_v ); - a_v = _mm_unpacklo_ps( a_v, b_v ); - u = _mm_unpackhi_ps( c_v, d_v ); - c_v = _mm_unpacklo_ps( c_v, d_v ); - b_v = _mm_movehl_ps( c_v, a_v ); - a_v = _mm_movelh_ps( a_v, c_v ); - c_v = _mm_movelh_ps( t, u ); - d_v = _mm_movehl_ps( u, t ); - a.v = a_v; b.v = b_v; c.v = c_v; d.v = d_v; + v4 &a, + v4 &b, + v4 &c, + v4 &d ) + { + __m128 r, s, t, u; + + a.v = _mm_load_ps( (const float *) a0 ); + b.v = _mm_load_ps( (const float *) a1 ); + c.v = _mm_load_ps( (const float *) a2 ); + d.v = _mm_load_ps( (const float *) a3 ); + + r = _mm_unpackhi_ps( a.v, b.v ); + s = _mm_unpacklo_ps( a.v, b.v ); + + t = _mm_unpackhi_ps( c.v, d.v ); + u = _mm_unpacklo_ps( c.v, d.v ); + + a.v = _mm_movelh_ps( s, u ); + b.v = _mm_movehl_ps( u, s ); + c.v = _mm_movelh_ps( r, t ); + d.v = _mm_movehl_ps( t, r ); } inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, void *a2, void *a3 ) { - ((float *)a0)[0] = a.f[0]; - ((float *)a1)[0] = a.f[1]; - ((float *)a2)[0] = a.f[2]; - 
((float *)a3)[0] = a.f[3]; - } - - inline void store_4x2_tr( const v4 &a, const v4 &b, - void * ALIGNED(8) a0, void * ALIGNED(8) a1, - void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) { - __m128 a_v = a.v, b_v = b.v, t; - t = _mm_unpacklo_ps(a_v,b_v); // a0 b0 a1 b1 -> t - _mm_storel_pi((__m64 *)a0,t); // a0 b0 -> a0 - _mm_storeh_pi((__m64 *)a1,t); // a1 b1 -> a1 - t = _mm_unpackhi_ps(a_v,b_v); // a2 b2 a3 b3 -> t - _mm_storel_pi((__m64 *)a2,t); // a2 b2 -> a2 - _mm_storeh_pi((__m64 *)a3,t); // a3 b3 -> a3 - } - - inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) { - __m128 a_v = a.v, b_v = b.v, t; - t = _mm_unpacklo_ps(a_v,b_v); // a0 b0 a1 b1 -> t - _mm_storel_pi((__m64 *)a0,t); // a0 b0 -> a0 - _mm_storeh_pi((__m64 *)a1,t); // a1 b1 -> a1 - t = _mm_unpackhi_ps(a_v,b_v); // a2 b2 a3 b3 -> t - _mm_storel_pi((__m64 *)a2,t); // a2 b2 -> a2 - _mm_storeh_pi((__m64 *)a3,t); // a3 b3 -> a3 - ((float *)a0)[2] = c.f[0]; - ((float *)a1)[2] = c.f[1]; - ((float *)a2)[2] = c.f[2]; - ((float *)a3)[2] = c.f[3]; - } - - /* FIXME: IS THIS FASTER THAN THE OLD WAY (HAD MORE STORE INSTR) */ - inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) { - __m128 a_v = a.v, b_v = b.v, c_v = c.v, d_v = d.v, t, u; - t = _mm_unpackhi_ps( a_v, b_v ); - a_v = _mm_unpacklo_ps( a_v, b_v ); - u = _mm_unpackhi_ps( c_v, d_v ); - c_v = _mm_unpacklo_ps( c_v, d_v ); + void *a0, + void *a1, + void *a2, + void *a3 ) + { + ( (float *) a0 )[0] = a.f[0]; + ( (float *) a1 )[0] = a.f[1]; + ( (float *) a2 )[0] = a.f[2]; + ( (float *) a3 )[0] = a.f[3]; + } + + inline void store_4x2_tr( const v4 &a, + const v4 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3 ) + { + __m128 t; + + t = _mm_unpacklo_ps( a.v, b.v ); // a0 b0 a1 b1 -> t + + 
_mm_storel_pi( (__m64 *) a0, t ); // a0 b0 -> a0 + _mm_storeh_pi( (__m64 *) a1, t ); // a1 b1 -> a1 + + t = _mm_unpackhi_ps( a.v, b.v ); // a2 b2 a3 b3 -> t + + _mm_storel_pi( (__m64 *) a2, t ); // a2 b2 -> a2 + _mm_storeh_pi( (__m64 *) a3, t ); // a3 b3 -> a3 + } + + inline void store_4x3_tr( const v4 &a, + const v4 &b, + const v4 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) + { + __m128 t; + + t = _mm_unpacklo_ps( a.v, b.v ); // a0 b0 a1 b1 -> t + + _mm_storel_pi( (__m64 *) a0, t ); // a0 b0 -> a0 + _mm_storeh_pi( (__m64 *) a1, t ); // a1 b1 -> a1 + + t = _mm_unpackhi_ps( a.v, b.v ); // a2 b2 a3 b3 -> t + + _mm_storel_pi( (__m64 *) a2, t ); // a2 b2 -> a2 + _mm_storeh_pi( (__m64 *) a3, t ); // a3 b3 -> a3 + + ( (float *) a0 )[2] = c.f[0]; + ( (float *) a1 )[2] = c.f[1]; + ( (float *) a2 )[2] = c.f[2]; + ( (float *) a3 )[2] = c.f[3]; + } + + inline void store_4x4_tr( const v4 &a, + const v4 &b, + const v4 &c, + const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) + { + __m128 a_v, b_v, c_v, d_v, t, u; + + t = _mm_unpackhi_ps( a.v, b.v ); + a_v = _mm_unpacklo_ps( a.v, b.v ); + u = _mm_unpackhi_ps( c.v, d.v ); + c_v = _mm_unpacklo_ps( c.v, d.v ); + b_v = _mm_movehl_ps( c_v, a_v ); a_v = _mm_movelh_ps( a_v, c_v ); c_v = _mm_movelh_ps( t, u ); d_v = _mm_movehl_ps( u, t ); - _mm_store_ps( (float *)a0, a_v ); - _mm_store_ps( (float *)a1, b_v ); - _mm_store_ps( (float *)a2, c_v ); - _mm_store_ps( (float *)a3, d_v ); + + _mm_store_ps( (float *) a0, a_v ); + _mm_store_ps( (float *) a1, b_v ); + _mm_store_ps( (float *) a2, c_v ); + _mm_store_ps( (float *) a3, d_v ); } ////////////// // v4int class - class v4int : public v4 { - + class v4int : public v4 + { // v4int prefix unary operator friends friend inline v4int operator +( const v4int & a ) ALWAYS_INLINE; @@ -409,33 +489,61 @@ namespace v4 { // v4float miscellaneous friends - friend inline v4float 
clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; public: // v4int constructors / destructors - + v4int() {} // Default constructor - v4int( const v4int &a ) { v = a.v; } // Copy constructor - v4int( const v4 &a ) { v = a.v; } // Init from mixed - v4int( int a ) { // Init from scalar - union { int i; float f; } u; + + v4int( const v4int &a ) // Copy constructor + { + v = a.v; + } + + v4int( const v4 &a ) // Init from mixed + { + v = a.v; + } + + v4int( int a ) // Init from scalar + { + union + { + int i; + float f; + } u; + u.i = a; - v = _mm_set1_ps( u.f ); + v = _mm_set1_ps( u.f ); } - v4int( int i0, int i1, int i2, int i3 ) { // Init from scalars - union { int i; float f; } u0, u1, u2, u3; - u0.i = i0; u1.i = i1; u2.i = i2; u3.i = i3; + + v4int( int i0, int i1, int i2, int i3 ) // Init from scalars + { + union + { + int i; + float f; + } u0, u1, u2, u3; + + u0.i = i0; + u1.i = i1; + u2.i = i2; + u3.i = i3; + v = _mm_setr_ps( u0.f, u1.f, u2.f, u3.f ); } - ~v4int() {}; // Destructor - + + ~v4int() {} // Destructor + // v4int assignment operators - -# define ASSIGN(op) \ - inline v4int &operator op( const v4int &b ) { \ + + #define ASSIGN(op) \ + inline v4int &operator op( const v4int &b ) \ + { \ i[0] op b.i[0]; \ i[1] op b.i[1]; \ i[2] op b.i[2]; \ @@ -443,121 +551,153 @@ namespace v4 { return *this; \ } - inline v4int &operator =(const v4int &b) { - v = b.v; - return *this; - } - ASSIGN(+=) ASSIGN(-=) ASSIGN(*=) ASSIGN(/=) ASSIGN(%=) + ASSIGN(<<=) + ASSIGN(>>=) + + #undef ASSIGN - inline v4int &operator ^=(const v4int &b) { + inline v4int &operator =( const v4int &b ) + { + v = b.v; + + return *this; + } + + inline 
v4int &operator ^=( const v4int &b ) + { v = _mm_xor_ps( v, b.v ); + return *this; } - inline v4int &operator &=(const v4int &b) { + inline v4int &operator &=( const v4int &b ) + { v = _mm_and_ps( v, b.v ); + return *this; } - inline v4int &operator |=(const v4int &b) { + inline v4int &operator |=( const v4int &b ) + { v = _mm_or_ps( v, b.v ); + return *this; } - ASSIGN(<<=) - ASSIGN(>>=) - -# undef ASSIGN - // v4int member access operator - - inline int &operator []( int n ) { return i[n]; } - inline int operator ()( int n ) { return i[n]; } + inline int &operator []( int n ) + { + return i[n]; + } + + inline int operator ()( int n ) + { + return i[n]; + } }; // v4int prefix unary operators -# define PREFIX_UNARY(op) \ - inline v4int operator op( const v4int & a ) { \ + #define PREFIX_UNARY(op) \ + inline v4int operator op( const v4int &a ) \ + { \ v4int b; \ - b.i[0] = (op a.i[0]); \ - b.i[1] = (op a.i[1]); \ - b.i[2] = (op a.i[2]); \ - b.i[3] = (op a.i[3]); \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ return b; \ } - inline v4int operator +( const v4int & a ) { + inline v4int operator +( const v4int &a ) + { v4int b; + b.v = a.v; + return b; } PREFIX_UNARY(-) - inline v4int operator !( const v4int & a ) { + inline v4int operator !( const v4int &a ) + { v4int b; - b.i[0] = -(!a.i[0]); - b.i[1] = -(!a.i[1]); - b.i[2] = -(!a.i[2]); - b.i[3] = -(!a.i[3]); + + b.i[0] = - ( ! a.i[0] ); + b.i[1] = - ( ! a.i[1] ); + b.i[2] = - ( ! a.i[2] ); + b.i[3] = - ( ! 
a.i[3] ); + return b; } - inline v4int operator ~( const v4int & a ) { + inline v4int operator ~( const v4int &a ) + { v4int b; - union { int i; float f; } u; + + union + { + int i; + float f; + } u; + u.i = -1; + b.v = _mm_xor_ps( a.v, _mm_set1_ps( u.f ) ); + return b; } - -# undef PREFIX_UNARY + + #undef PREFIX_UNARY // v4int prefix increment / decrement -# define PREFIX_INCDEC(op) \ - inline v4int operator op( v4int & a ) { \ + #define PREFIX_INCDEC(op) \ + inline v4int operator op( v4int &a ) \ + { \ v4int b; \ - b.i[0] = (op a.i[0]); \ - b.i[1] = (op a.i[1]); \ - b.i[2] = (op a.i[2]); \ - b.i[3] = (op a.i[3]); \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ return b; \ } PREFIX_INCDEC(++) PREFIX_INCDEC(--) -# undef PREFIX_INCDEC + #undef PREFIX_INCDEC // v4int postfix increment / decrement -# define POSTFIX_INCDEC(op) \ - inline v4int operator op( v4int & a, int ) { \ + #define POSTFIX_INCDEC(op) \ + inline v4int operator op( v4int &a, int ) \ + { \ v4int b; \ - b.i[0] = (a.i[0] op); \ - b.i[1] = (a.i[1] op); \ - b.i[2] = (a.i[2] op); \ - b.i[3] = (a.i[3] op); \ + b.i[0] = ( a.i[0] op ); \ + b.i[1] = ( a.i[1] op ); \ + b.i[2] = ( a.i[2] op ); \ + b.i[3] = ( a.i[3] op ); \ return b; \ } POSTFIX_INCDEC(++) POSTFIX_INCDEC(--) -# undef POSTFIX_INCDEC + #undef POSTFIX_INCDEC // v4int binary operators - -# define BINARY(op) \ - inline v4int operator op( const v4int &a, const v4int &b ) { \ + + #define BINARY(op) \ + inline v4int operator op( const v4int &a, const v4int &b ) \ + { \ v4int c; \ c.i[0] = a.i[0] op b.i[0]; \ c.i[1] = a.i[1] op b.i[1]; \ @@ -571,39 +711,48 @@ namespace v4 { BINARY(*) BINARY(/) BINARY(%) + BINARY(<<) + BINARY(>>) - inline v4int operator ^( const v4int &a, const v4int &b ) { + #undef BINARY + + inline v4int operator ^( const v4int &a, const v4int &b ) + { v4int c; + c.v = _mm_xor_ps( a.v, b.v ); + return c; } - inline v4int operator &( const v4int &a, const v4int &b ) { + 
inline v4int operator &( const v4int &a, const v4int &b ) + { v4int c; + c.v = _mm_and_ps( a.v, b.v ); + return c; } - inline v4int operator |( const v4int &a, const v4int &b ) { + inline v4int operator |( const v4int &a, const v4int &b ) + { v4int c; + c.v = _mm_or_ps( a.v, b.v ); + return c; } - BINARY(<<) - BINARY(>>) - -# undef BINARY - // v4int logical operators -# define LOGICAL(op) \ - inline v4int operator op( const v4int &a, const v4int &b ) { \ + #define LOGICAL(op) \ + inline v4int operator op( const v4int &a, const v4int &b ) \ + { \ v4int c; \ - c.i[0] = -(a.i[0] op b.i[0]); \ - c.i[1] = -(a.i[1] op b.i[1]); \ - c.i[2] = -(a.i[2] op b.i[2]); \ - c.i[3] = -(a.i[3] op b.i[3]); \ + c.i[0] = - ( a.i[0] op b.i[0] ); \ + c.i[1] = - ( a.i[1] op b.i[1] ); \ + c.i[2] = - ( a.i[2] op b.i[2] ); \ + c.i[3] = - ( a.i[3] op b.i[3] ); \ return c; \ } @@ -615,44 +764,58 @@ namespace v4 { LOGICAL(>=) LOGICAL(&&) LOGICAL(||) - -# undef LOGICAL + + #undef LOGICAL // v4int miscellaneous functions - inline v4int abs( const v4int &a ) { + inline v4int abs( const v4int &a ) + { v4int b; - b.i[0] = (a.i[0]>=0) ? a.i[0] : -a.i[0]; - b.i[1] = (a.i[1]>=0) ? a.i[1] : -a.i[1]; - b.i[2] = (a.i[2]>=0) ? a.i[2] : -a.i[2]; - b.i[3] = (a.i[3]>=0) ? a.i[3] : -a.i[3]; + + b.i[0] = ( a.i[0] >= 0 ) ? a.i[0] : -a.i[0]; + b.i[1] = ( a.i[1] >= 0 ) ? a.i[1] : -a.i[1]; + b.i[2] = ( a.i[2] >= 0 ) ? a.i[2] : -a.i[2]; + b.i[3] = ( a.i[3] >= 0 ) ? 
a.i[3] : -a.i[3]; + return b; } - inline v4 czero( const v4int &c, const v4 &a ) { + inline v4 czero( const v4int &c, const v4 &a ) + { v4 b; - b.v = _mm_andnot_ps(c.v,a.v); + + b.v = _mm_andnot_ps( c.v, a.v ); + return b; } - inline v4 notczero( const v4int &c, const v4 &a ) { + inline v4 notczero( const v4int &c, const v4 &a ) + { v4 b; - b.v = _mm_and_ps(c.v,a.v); + + b.v = _mm_and_ps( c.v, a.v ); + return b; } - - inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) { - __m128 c_v = c.v; + + inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) + { v4 tf; - tf.v = _mm_or_ps(_mm_andnot_ps(c_v,f.v),_mm_and_ps(c_v,t.v)); + + __m128 c_v = c.v; + + tf.v = _mm_or_ps( _mm_andnot_ps( c_v, f.v ), + _mm_and_ps( c_v, t.v ) ); + return tf; } //////////////// // v4float class - class v4float : public v4 { - + class v4float : public v4 + { // v4float prefix unary operator friends friend inline v4float operator +( const v4float &a ) ALWAYS_INLINE; @@ -691,9 +854,9 @@ namespace v4 { // v4float math library friends -# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE -# define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) ALWAYS_INLINE + #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE + #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ + const v4float &b ) ALWAYS_INLINE CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); @@ -703,192 +866,252 @@ namespace v4 { CMATH_FR2(copysign); -# undef CMATH_FR1 -# undef CMATH_FR2 + #undef CMATH_FR1 + #undef CMATH_FR2 // v4float miscellaneous friends friend inline v4float rsqrt_approx( const v4float &a ) ALWAYS_INLINE; - friend inline v4float rsqrt( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rsqrt ( const v4float &a ) ALWAYS_INLINE; friend inline v4float rcp_approx( const v4float &a ) ALWAYS_INLINE; - friend inline 
v4float rcp( const v4float &a ) ALWAYS_INLINE; - friend inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float rcp ( const v4float &a ) ALWAYS_INLINE; + friend inline v4float fma ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float fms ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; friend inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - // FIXME: crack + friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; friend inline void trilinear( v4float &wl, v4float &wh ) ALWAYS_INLINE; - + public: // v4float constructors / destructors - + v4float() {} // Default constructor - v4float( const v4float &a ) { v = a.v; } // Copy constructor - v4float( const v4 &a ) { v = a.v; } // Init from mixed - v4float( float a ) { // Init from scalar + + v4float( const v4float &a ) // Copy constructor + { + v = a.v; + } + + v4float( const v4 &a ) // Init from mixed + { + v = a.v; + } + + v4float( float a ) // Init from scalar + { v = _mm_set1_ps( a ); } - v4float( float f0, float f1, float f2, float f3 ) { // 
Init from scalars + + v4float( float f0, float f1, float f2, float f3 ) // Init from scalars + { v = _mm_setr_ps( f0, f1, f2, f3 ); } + ~v4float() {} // Destructor // v4float assignment operators -# define ASSIGN(op,intrin) \ - inline v4float &operator op(const v4float &b) { \ - v = intrin(v,b.v); \ - return *this; \ + #define ASSIGN(op,intrin) \ + inline v4float &operator op( const v4float &b ) \ + { \ + v = intrin( v, b.v ); \ + return *this; \ } - inline v4float &operator =(const v4float &b) { + ASSIGN( +=, _mm_add_ps ) + ASSIGN( -=, _mm_sub_ps ) + ASSIGN( *=, _mm_mul_ps ) + ASSIGN( /=, _mm_div_ps ) + + #undef ASSIGN + + inline v4float &operator =( const v4float &b ) + { v = b.v; + return *this; } - ASSIGN(+=,_mm_add_ps) - ASSIGN(-=,_mm_sub_ps) - ASSIGN(*=,_mm_mul_ps) - ASSIGN(/=,_mm_div_ps) - -# undef ASSIGN - // v4float member access operator - inline float &operator []( int n ) { return f[n]; } - inline float operator ()( int n ) { return f[n]; } + inline float &operator []( int n ) + { + return f[n]; + } + inline float operator ()( int n ) + { + return f[n]; + } }; // v4float prefix unary operators - inline v4float operator +( const v4float &a ) { + inline v4float operator +( const v4float &a ) + { v4float b; + b.v = a.v; + return b; } - inline v4float operator -( const v4float &a ) { + inline v4float operator -( const v4float &a ) + { v4float b; - b.v = _mm_sub_ps(_mm_setzero_ps(),a.v); + + b.v = _mm_sub_ps( _mm_setzero_ps(), a.v ); + return b; } - inline v4int operator !( const v4float &a ) { + inline v4int operator !( const v4float &a ) + { v4int b; - b.v = _mm_cmpeq_ps(_mm_setzero_ps(),a.v); + + b.v = _mm_cmpeq_ps( _mm_setzero_ps(), a.v ); + return b; } // v4float prefix increment / decrement operators - inline v4float operator ++( v4float &a ) { + inline v4float operator ++( v4float &a ) + { v4float b; + __m128 t = _mm_add_ps( a.v, _mm_set1_ps( 1 ) ); + a.v = t; b.v = t; + return b; } - inline v4float operator --( v4float &a ) { + inline v4float 
operator --( v4float &a ) + { v4float b; + __m128 t = _mm_sub_ps( a.v, _mm_set1_ps( 1 ) ); + a.v = t; b.v = t; + return b; } // v4float postfix increment / decrement operators - inline v4float operator ++( v4float &a, int ) { + inline v4float operator ++( v4float &a, int ) + { v4float b; + __m128 a_v = a.v; + a.v = _mm_add_ps( a_v, _mm_set1_ps( 1 ) ); b.v = a_v; + return b; } - inline v4float operator --( v4float &a, int ) { + inline v4float operator --( v4float &a, int ) + { v4float b; + __m128 a_v = a.v; - a.v = _mm_sub_ps(a_v, _mm_set1_ps( 1 ) ); + + a.v = _mm_sub_ps( a_v, _mm_set1_ps( 1 ) ); b.v = a_v; + return b; } // v4float binary operators - -# define BINARY(op,intrin) \ - inline v4float operator op( const v4float &a, const v4float &b ) { \ + + #define BINARY(op,intrin) \ + inline v4float operator op( const v4float &a, const v4float &b ) \ + { \ v4float c; \ - c.v = intrin(a.v,b.v); \ + c.v = intrin( a.v, b.v ); \ return c; \ } - BINARY(+,_mm_add_ps) - BINARY(-,_mm_sub_ps) - BINARY(*,_mm_mul_ps) - BINARY(/,_mm_div_ps) + BINARY( +, _mm_add_ps ) + BINARY( -, _mm_sub_ps ) + BINARY( *, _mm_mul_ps ) + BINARY( /, _mm_div_ps ) -# undef BINARY + #undef BINARY // v4float logical operators -# define LOGICAL(op,intrin) \ - inline v4int operator op( const v4float &a, const v4float &b ) { \ + #define LOGICAL(op,intrin) \ + inline v4int operator op( const v4float &a, const v4float &b ) \ + { \ v4int c; \ - c.v = intrin(a.v,b.v); \ + c.v = intrin( a.v, b.v ); \ return c; \ } - LOGICAL(<, _mm_cmplt_ps ) - LOGICAL(>, _mm_cmpgt_ps ) - LOGICAL(==,_mm_cmpeq_ps ) - LOGICAL(!=,_mm_cmpneq_ps) - LOGICAL(<=,_mm_cmple_ps ) - LOGICAL(>=,_mm_cmpge_ps ) + LOGICAL( <, _mm_cmplt_ps ) + LOGICAL( >, _mm_cmpgt_ps ) + LOGICAL( ==, _mm_cmpeq_ps ) + LOGICAL( <=, _mm_cmple_ps ) + LOGICAL( >=, _mm_cmpge_ps ) + LOGICAL( !=, _mm_cmpneq_ps ) - inline v4int operator &&( const v4float &a, const v4float &b ) { + #undef LOGICAL + + inline v4int operator &&( const v4float &a, const v4float &b ) + { 
v4int c; + __m128 vzero = _mm_setzero_ps(); - c.v = _mm_and_ps(_mm_cmpneq_ps(a.v,vzero),_mm_cmpneq_ps(b.v,vzero)); + + c.v = _mm_and_ps( _mm_cmpneq_ps( a.v, vzero ), + _mm_cmpneq_ps( b.v, vzero ) ); + return c; } - inline v4int operator ||( const v4float &a, const v4float &b ) { + inline v4int operator ||( const v4float &a, const v4float &b ) + { v4int c; + __m128 vzero = _mm_setzero_ps(); - c.v = _mm_or_ps(_mm_cmpneq_ps(a.v,vzero),_mm_cmpneq_ps(b.v,vzero)); + + c.v = _mm_or_ps( _mm_cmpneq_ps( a.v, vzero ), + _mm_cmpneq_ps( b.v, vzero ) ); + return c; } -# undef LOGICAL - // v4float math library functions -# define CMATH_FR1(fn) \ - inline v4float fn( const v4float &a ) { \ + #define CMATH_FR1(fn) \ + inline v4float fn( const v4float &a ) \ + { \ v4float b; \ - b.f[0] = ::fn(a.f[0]); \ - b.f[1] = ::fn(a.f[1]); \ - b.f[2] = ::fn(a.f[2]); \ - b.f[3] = ::fn(a.f[3]); \ + b.f[0] = ::fn( a.f[0] ); \ + b.f[1] = ::fn( a.f[1] ); \ + b.f[2] = ::fn( a.f[2] ); \ + b.f[3] = ::fn( a.f[3] ); \ return b; \ } -# define CMATH_FR2(fn) \ - inline v4float fn( const v4float &a, const v4float &b ) { \ + #define CMATH_FR2(fn) \ + inline v4float fn( const v4float &a, const v4float &b ) \ + { \ v4float c; \ - c.f[0] = ::fn(a.f[0],b.f[0]); \ - c.f[1] = ::fn(a.f[1],b.f[1]); \ - c.f[2] = ::fn(a.f[2],b.f[2]); \ - c.f[3] = ::fn(a.f[3],b.f[3]); \ + c.f[0] = ::fn( a.f[0], b.f[0] ); \ + c.f[1] = ::fn( a.f[1], b.f[1] ); \ + c.f[2] = ::fn( a.f[2], b.f[2] ); \ + c.f[3] = ::fn( a.f[3], b.f[3] ); \ return c; \ } @@ -898,126 +1121,202 @@ namespace v4 { CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) /*CMATH_FR1(sqrt)*/ CMATH_FR1(tan) CMATH_FR1(tanh) - inline v4float fabs( const v4float &a ) { + #undef CMATH_FR1 + #undef CMATH_FR2 + + inline v4float fabs( const v4float &a ) + { v4float b; - b.v = _mm_andnot_ps( _mm_set1_ps( -0.f ), a.v ); + + b.v = _mm_andnot_ps( _mm_set1_ps( -0.0f ), a.v ); + return b; } - inline v4float sqrt( const v4float &a ) { + inline v4float sqrt( const v4float &a ) 
+ { v4float b; - b.v = _mm_sqrt_ps(a.v); + + b.v = _mm_sqrt_ps( a.v ); + return b; } - inline v4float copysign( const v4float &a, const v4float &b ) { + inline v4float copysign( const v4float &a, const v4float &b ) + { v4float c; - __m128 t = _mm_set1_ps( -0.f ); - c.v = _mm_or_ps( _mm_and_ps( t, b.v ), _mm_andnot_ps( t, a.v ) ); + + __m128 t = _mm_set1_ps( -0.0f ); + + c.v = _mm_or_ps( _mm_and_ps( t, b.v ), + _mm_andnot_ps( t, a.v ) ); + return c; } -# undef CMATH_FR1 -# undef CMATH_FR2 + // v4float miscellaneous functions - // v4float miscelleanous functions - - inline v4float rsqrt_approx( const v4float &a ) { + inline v4float rsqrt_approx( const v4float &a ) + { v4float b; - b.v = _mm_rsqrt_ps(a.v); + + b.v = _mm_rsqrt_ps( a.v ); + return b; } - - inline v4float rsqrt( const v4float &a ) { + + inline v4float rsqrt( const v4float &a ) + { v4float b; + __m128 a_v = a.v, b_v; - b_v = _mm_rsqrt_ps(a_v); - // Note: It is quicker to just call div_ps and sqrt_ps if more - // refinement desired! 
- b.v = _mm_add_ps(b_v,_mm_mul_ps(_mm_set1_ps(0.5f), - _mm_sub_ps(b_v,_mm_mul_ps(a_v, - _mm_mul_ps(b_v, - _mm_mul_ps(b_v,b_v)))))); + + b_v = _mm_rsqrt_ps( a_v ); + + b.v = _mm_add_ps( b_v, _mm_mul_ps( _mm_set1_ps( 0.5f ), + _mm_sub_ps( b_v, + _mm_mul_ps( a_v, + _mm_mul_ps( b_v, + _mm_mul_ps( b_v, b_v ) + ) + ) + ) + ) + ); + return b; } - inline v4float rcp_approx( const v4float &a ) { + inline v4float rcp_approx( const v4float &a ) + { v4float b; - b.v = _mm_rcp_ps(a.v); + + b.v = _mm_rcp_ps( a.v ); + return b; } - - inline v4float rcp( const v4float &a ) { + + inline v4float rcp( const v4float &a ) + { v4float b; + __m128 a_v = a.v, b_v; - b_v = _mm_rcp_ps(a_v); - b.v = _mm_sub_ps(_mm_add_ps(b_v,b_v),_mm_mul_ps(a_v,_mm_mul_ps(b_v,b_v))); + + b_v = _mm_rcp_ps( a_v ); + + b.v = _mm_sub_ps( _mm_add_ps( b_v, b_v ), + _mm_mul_ps( a_v, + _mm_mul_ps( b_v, b_v ) + ) + ); + return b; } - inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) { + inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) + { v4float d; + d.v = _mm_add_ps( _mm_mul_ps( a.v, b.v ), c.v ); + return d; } - inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) { + inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) + { v4float d; + d.v = _mm_sub_ps( _mm_mul_ps( a.v, b.v ), c.v ); + return d; } - inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) { + inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) + { v4float d; + d.v = _mm_sub_ps( c.v, _mm_mul_ps( a.v, b.v ) ); + return d; } - inline v4float clear_bits( const v4int &m, const v4float &a ) { + inline v4float clear_bits( const v4int &m, const v4float &a ) + { v4float b; + b.v = _mm_andnot_ps( m.v, a.v ); + return b; } - inline v4float set_bits( const v4int &m, const v4float &a ) { + inline v4float set_bits( const v4int &m, const v4float &a ) + { v4float b; + b.v = _mm_or_ps( m.v, a.v ); + return b; } - inline v4float 
toggle_bits( const v4int &m, const v4float &a ) { + inline v4float toggle_bits( const v4int &m, const v4float &a ) + { v4float b; + b.v = _mm_xor_ps( m.v, a.v ); + return b; } - inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) { + inline void increment_4x1( float * ALIGNED(16) p, + const v4float &a ) + { _mm_store_ps( p, _mm_add_ps( _mm_load_ps( p ), a.v ) ); } - inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) { + inline void decrement_4x1( float * ALIGNED(16) p, + const v4float &a ) + { _mm_store_ps( p, _mm_sub_ps( _mm_load_ps( p ), a.v ) ); } - inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) { + inline void scale_4x1( float * ALIGNED(16) p, + const v4float &a ) + { _mm_store_ps( p, _mm_mul_ps( _mm_load_ps( p ), a.v ) ); } // Given wl = x y z w, compute: // wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) // wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) - inline void trilinear( v4float &wl, v4float &wh ) { - __m128 l = _mm_set1_ps(1), s = _mm_setr_ps(-0.f,+0.f,-0.f,+0.f); + inline void trilinear( v4float &wl, v4float &wh ) + { + __m128 l = _mm_set1_ps( 1.0f ); + + __m128 s = _mm_setr_ps( -0.0f, +0.0f, -0.0f, +0.0f ); + __m128 z = wl.v, xy; - xy = _mm_add_ps( l, _mm_xor_ps( s, _mm_shuffle_ps( z,z, PERM(0,0,1,1) ) ) ); - z = _mm_add_ps( l, _mm_xor_ps( s, _mm_shuffle_ps( z,z, PERM(2,2,2,2) ) ) ); - xy = _mm_mul_ps( _mm_shuffle_ps( xy,xy, PERM(0,1,0,1) ), - _mm_shuffle_ps( xy,xy, PERM(2,2,3,3) ) ); - wl.v = _mm_mul_ps( xy, _mm_shuffle_ps( z,z, PERM(0,0,0,0) ) ); - wh.v = _mm_mul_ps( xy, _mm_shuffle_ps( z,z, PERM(1,1,1,1) ) ); + + xy = _mm_add_ps( l, + _mm_xor_ps( s, + _mm_shuffle_ps( z, z, PERM(0,0,1,1) ) + ) + ); + + z = _mm_add_ps( l, + _mm_xor_ps( s, + _mm_shuffle_ps( z, z, PERM(2,2,2,2) ) + ) + ); + + xy = _mm_mul_ps( _mm_shuffle_ps( xy, xy, PERM(0,1,0,1) ), + _mm_shuffle_ps( xy, xy, PERM(2,2,3,3) ) ); + + wl.v = _mm_mul_ps( xy, _mm_shuffle_ps( z, z, PERM(0,0,0,0) 
) ); + + wh.v = _mm_mul_ps( xy, _mm_shuffle_ps( z, z, PERM(1,1,1,1) ) ); } -# undef PERM + #undef PERM } // namespace v4 diff --git a/src/util/v4/v4_avx2.h b/src/util/v4/v4_avx2.h index 023ba95a..2cab8b9c 100644 --- a/src/util/v4/v4_avx2.h +++ b/src/util/v4/v4_avx2.h @@ -57,17 +57,26 @@ namespace v4 friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; // v4 memory manipulation friends - friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) ALWAYS_INLINE; - friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) ALWAYS_INLINE; + + friend inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + + friend inline void stream_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void copy_4x1( void * ALIGNED(16) dst, const void * ALIGNED(16) src ) ALWAYS_INLINE; - friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + + friend inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends @@ -152,9 +161,8 @@ namespace v4 inline v4 splat( const v4 & a ) { v4 b; - __m128 a_v = a.v; - b.v = _mm_shuffle_ps( a_v, a_v, ( n*permute<1,1,1,1>::value ) ); + b.v = _mm_shuffle_ps( a.v, a.v, ( n * permute<1,1,1,1>::value ) ); return b; } @@ -163,20 +171,19 @@ namespace v4 inline v4 shuffle( const v4 & a ) { v4 b; - __m128 a_v = a.v; - b.v = _mm_shuffle_ps( a_v, a_v, ( permute::value ) ); + b.v = _mm_shuffle_ps( a.v, a.v, ( 
permute::value ) ); return b; } inline void swap( v4 &a, v4 &b ) - { - __m128 a_v = a.v; + { + __m128 t = a.v; a.v = b.v; - b.v = a_v; + b.v = t; } inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) @@ -230,9 +237,8 @@ namespace v4 _mm_store_ps( ( float * ) dst, _mm_load_ps( ( const float * ) src ) ); } - /* FIXME: MAKE ROBUST AGAINST ALIASING ISSUES */ inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) + void * ALIGNED(16) b ) { __m128 t = _mm_load_ps( ( float * ) a ); @@ -243,9 +249,9 @@ namespace v4 // v4 transposed memory manipulation functions inline void load_4x1_tr( const void *a0, - const void *a1, + const void *a1, const void *a2, - const void *a3, + const void *a3, v4 &a ) { a.v = _mm_setr_ps( ( (const float *) a0 )[0], @@ -259,29 +265,53 @@ namespace v4 const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, v4 &a, - v4 &b ) + v4 &b ) { __m128 a_v, b_v, t; b_v = _mm_setzero_ps(); - t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a0 ), (__m64 *)a1 ); - b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a2 ), (__m64 *)a3 ); + t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *) a0 ), (__m64 *) a1 ); + b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *) a2 ), (__m64 *) a3 ); - a_v = _mm_shuffle_ps( t, b_v, 0x88 ); - b_v = _mm_shuffle_ps( t, b_v, 0xdd ); + a.v = _mm_shuffle_ps( t, b_v, 0x88 ); + b.v = _mm_shuffle_ps( t, b_v, 0xdd ); + } - a.v = a_v; - b.v = b_v; + inline void load_4x3_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, + v4 &b, + v4 &c ) + { + __m128 r, s, t, u, d_v; + + a.v = _mm_load_ps( (const float *) a0 ); + b.v = _mm_load_ps( (const float *) a1 ); + c.v = _mm_load_ps( (const float *) a2 ); + d_v = _mm_load_ps( (const float *) a3 ); + + r = _mm_unpacklo_ps( a.v, b.v ); + s = _mm_unpackhi_ps( a.v, b.v ); + + t = _mm_unpacklo_ps( c.v, d_v ); + u = _mm_unpackhi_ps( c.v, d_v ); + + a.v = _mm_movelh_ps( r, t ); + b.v = _mm_movehl_ps( t, r ); + c.v = 
_mm_movelh_ps( s, u ); } + #if 0 inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, v4 &a, - v4 &b, - v4 &c ) + v4 &b, + v4 &c ) { __m128 a_v, b_v, c_v, t, u; @@ -303,6 +333,35 @@ namespace v4 b.v = b_v; c.v = c_v; } + #endif + + inline void load_4x4_tr( const void * ALIGNED(16) a0, + const void * ALIGNED(16) a1, + const void * ALIGNED(16) a2, + const void * ALIGNED(16) a3, + v4 &a, + v4 &b, + v4 &c, + v4 &d ) + { + __m128 r, s, t, u; + + a.v = _mm_load_ps( (const float *) a0 ); + b.v = _mm_load_ps( (const float *) a1 ); + c.v = _mm_load_ps( (const float *) a2 ); + d.v = _mm_load_ps( (const float *) a3 ); + + r = _mm_unpackhi_ps( a.v, b.v ); + s = _mm_unpacklo_ps( a.v, b.v ); + + t = _mm_unpackhi_ps( c.v, d.v ); + u = _mm_unpacklo_ps( c.v, d.v ); + + a.v = _mm_movelh_ps( s, u ); + b.v = _mm_movehl_ps( u, s ); + c.v = _mm_movelh_ps( r, t ); + d.v = _mm_movehl_ps( t, r ); + } #if 0 inline void load_4x4_tr( const void * ALIGNED(16) a0, @@ -310,24 +369,26 @@ namespace v4 const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, v4 &a, - v4 &b, - v4 &c, - v4 &d ) + v4 &b, + v4 &c, + v4 &d ) { __m128 a_v, b_v, c_v, d_v, t, u; + a_v = _mm_load_ps( (const float *)a0 ); b_v = _mm_load_ps( (const float *)a1 ); c_v = _mm_load_ps( (const float *)a2 ); d_v = _mm_load_ps( (const float *)a3 ); + t = _mm_unpackhi_ps( a_v, b_v ); - a_v = _mm_unpacklo_ps( a_v, b_v ); u = _mm_unpackhi_ps( c_v, d_v ); + a_v = _mm_unpacklo_ps( a_v, b_v ); c_v = _mm_unpacklo_ps( c_v, d_v ); - b_v = _mm_movehl_ps( c_v, a_v ); - a_v = _mm_movelh_ps( a_v, c_v ); - c_v = _mm_movelh_ps( t, u ); - d_v = _mm_movehl_ps( u, t ); - a.v = a_v; b.v = b_v; c.v = c_v; d.v = d_v; + + a.v = _mm_movelh_ps( a_v, c_v ); + c.v = _mm_movelh_ps( t, u ); + b.v = _mm_movehl_ps( c_v, a_v ); + d.v = _mm_movehl_ps( u, t ); } #endif @@ -337,26 +398,24 @@ namespace v4 const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, v4 &a, - v4 &b, - 
v4 &c, - v4 &d ) + v4 &b, + v4 &c, + v4 &d ) { __m128 a_v, b_v, c_v, d_v, t, u; - a_v = _mm_load_ps( (const float *)a0 ); b_v = _mm_load_ps( (const float *)a1 ); c_v = _mm_load_ps( (const float *)a2 ); d_v = _mm_load_ps( (const float *)a3 ); - t = _mm_unpackhi_ps( a_v, b_v ); a_v = _mm_unpacklo_ps( a_v, b_v ); u = _mm_unpackhi_ps( c_v, d_v ); c_v = _mm_unpacklo_ps( c_v, d_v ); - - b.v = _mm_movehl_ps( c_v, a_v ); - a.v = _mm_movelh_ps( a_v, c_v ); - c.v = _mm_movelh_ps( t, u ); - d.v = _mm_movehl_ps( u, t ); + b_v = _mm_movehl_ps( c_v, a_v ); + a_v = _mm_movelh_ps( a_v, c_v ); + c_v = _mm_movelh_ps( t, u ); + d_v = _mm_movehl_ps( u, t ); + a.v = a_v; b.v = b_v; c.v = c_v; d.v = d_v; } #endif @@ -366,9 +425,9 @@ namespace v4 const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, v4 &a, - v4 &b, - v4 &c, - v4 &d ) + v4 &b, + v4 &c, + v4 &d ) { __m128 a_v, b_v, c_v, d_v, t, u; @@ -378,25 +437,26 @@ namespace v4 d_v = _mm_load_ps( (const float *)a3 ); t = _mm_unpackhi_ps( a_v, b_v ); - u = _mm_unpackhi_ps( c_v, d_v ); a_v = _mm_unpacklo_ps( a_v, b_v ); + u = _mm_unpackhi_ps( c_v, d_v ); c_v = _mm_unpacklo_ps( c_v, d_v ); - a.v = _mm_movelh_ps( a_v, c_v ); b.v = _mm_movehl_ps( c_v, a_v ); - d.v = _mm_movehl_ps( u, t ); + a.v = _mm_movelh_ps( a_v, c_v ); c.v = _mm_movelh_ps( t, u ); + d.v = _mm_movehl_ps( u, t ); } #endif + #if 0 inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, v4 &a, - v4 &b, - v4 &c, - v4 &d ) + v4 &b, + v4 &c, + v4 &d ) { __m128 a_v, b_v, c_v, d_v, t, u; @@ -411,16 +471,17 @@ namespace v4 c_v = _mm_unpacklo_ps( c_v, d_v ); a.v = _mm_movelh_ps( a_v, c_v ); - c.v = _mm_movelh_ps( t, u ); b.v = _mm_movehl_ps( c_v, a_v ); d.v = _mm_movehl_ps( u, t ); + c.v = _mm_movelh_ps( t, u ); } + #endif inline void store_4x1_tr( const v4 &a, void *a0, - void *a1, + void *a1, void *a2, - void *a3 ) + void *a3 ) { ( (float *) a0 )[0] = a.f[0]; ( (float *) a1 )[0] = a.f[1]; @@ 
-429,77 +490,76 @@ namespace v4 } inline void store_4x2_tr( const v4 &a, - const v4 &b, + const v4 &b, void * ALIGNED(8) a0, - void * ALIGNED(8) a1, + void * ALIGNED(8) a1, void * ALIGNED(8) a2, - void * ALIGNED(8) a3 ) + void * ALIGNED(8) a3 ) { - __m128 a_v = a.v, b_v = b.v, t; + __m128 t; - t = _mm_unpacklo_ps( a_v, b_v ); // a0 b0 a1 b1 -> t + t = _mm_unpacklo_ps( a.v, b.v ); // a0 b0 a1 b1 -> t - _mm_storel_pi( (__m64 *)a0, t ); // a0 b0 -> a0 - _mm_storeh_pi( (__m64 *)a1, t ); // a1 b1 -> a1 + _mm_storel_pi( (__m64 *) a0, t ); // a0 b0 -> a0 + _mm_storeh_pi( (__m64 *) a1, t ); // a1 b1 -> a1 - t = _mm_unpackhi_ps( a_v, b_v ); // a2 b2 a3 b3 -> t + t = _mm_unpackhi_ps( a.v, b.v ); // a2 b2 a3 b3 -> t - _mm_storel_pi( (__m64 *)a2, t ); // a2 b2 -> a2 - _mm_storeh_pi( (__m64 *)a3, t ); // a3 b3 -> a3 + _mm_storel_pi( (__m64 *) a2, t ); // a2 b2 -> a2 + _mm_storeh_pi( (__m64 *) a3, t ); // a3 b3 -> a3 } inline void store_4x3_tr( const v4 &a, - const v4 &b, - const v4 &c, + const v4 &b, + const v4 &c, void * ALIGNED(16) a0, - void * ALIGNED(16) a1, + void * ALIGNED(16) a1, void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) + void * ALIGNED(16) a3 ) { - __m128 a_v = a.v, b_v = b.v, t; + __m128 t; - t = _mm_unpacklo_ps( a_v, b_v ); // a0 b0 a1 b1 -> t + t = _mm_unpacklo_ps( a.v, b.v ); // a0 b0 a1 b1 -> t - _mm_storel_pi( (__m64 *)a0, t ); // a0 b0 -> a0 - _mm_storeh_pi( (__m64 *)a1, t ); // a1 b1 -> a1 + _mm_storel_pi( (__m64 *) a0, t ); // a0 b0 -> a0 + _mm_storeh_pi( (__m64 *) a1, t ); // a1 b1 -> a1 - t = _mm_unpackhi_ps( a_v, b_v ); // a2 b2 a3 b3 -> t + t = _mm_unpackhi_ps( a.v, b.v ); // a2 b2 a3 b3 -> t - _mm_storel_pi( (__m64 *)a2, t ); // a2 b2 -> a2 - _mm_storeh_pi( (__m64 *)a3, t ); // a3 b3 -> a3 + _mm_storel_pi( (__m64 *) a2, t ); // a2 b2 -> a2 + _mm_storeh_pi( (__m64 *) a3, t ); // a3 b3 -> a3 - ((float *)a0)[2] = c.f[0]; - ((float *)a1)[2] = c.f[1]; - ((float *)a2)[2] = c.f[2]; - ((float *)a3)[2] = c.f[3]; + ( (float *) a0 )[2] = c.f[0]; + ( (float *) 
a1 )[2] = c.f[1]; + ( (float *) a2 )[2] = c.f[2]; + ( (float *) a3 )[2] = c.f[3]; } - // FIXME: IS THIS FASTER THAN THE OLD WAY (HAD MORE STORE INSTR) inline void store_4x4_tr( const v4 &a, - const v4 &b, + const v4 &b, const v4 &c, - const v4 &d, + const v4 &d, void * ALIGNED(16) a0, - void * ALIGNED(16) a1, + void * ALIGNED(16) a1, void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) + void * ALIGNED(16) a3 ) { - __m128 a_v = a.v, b_v = b.v, c_v = c.v, d_v = d.v, t, u; + __m128 a_v, b_v, c_v, d_v, t, u; - t = _mm_unpackhi_ps( a_v, b_v ); - a_v = _mm_unpacklo_ps( a_v, b_v ); - u = _mm_unpackhi_ps( c_v, d_v ); - c_v = _mm_unpacklo_ps( c_v, d_v ); + t = _mm_unpackhi_ps( a.v, b.v ); + a_v = _mm_unpacklo_ps( a.v, b.v ); + u = _mm_unpackhi_ps( c.v, d.v ); + c_v = _mm_unpacklo_ps( c.v, d.v ); b_v = _mm_movehl_ps( c_v, a_v ); a_v = _mm_movelh_ps( a_v, c_v ); c_v = _mm_movelh_ps( t, u ); d_v = _mm_movehl_ps( u, t ); - _mm_store_ps( (float *)a0, a_v ); - _mm_store_ps( (float *)a1, b_v ); - _mm_store_ps( (float *)a2, c_v ); - _mm_store_ps( (float *)a3, d_v ); + _mm_store_ps( (float *) a0, a_v ); + _mm_store_ps( (float *) a1, b_v ); + _mm_store_ps( (float *) a2, c_v ); + _mm_store_ps( (float *) a3, d_v ); } ////////////// @@ -644,6 +704,8 @@ namespace v4 ASSIGN(<<=) ASSIGN(>>=) + #undef ASSIGN + inline v4int &operator =( const v4int &b ) { v = b.v; @@ -672,8 +734,6 @@ namespace v4 return *this; } - #undef ASSIGN - // v4int member access operator inline int &operator []( int n ) @@ -690,7 +750,7 @@ namespace v4 // v4int prefix unary operators #define PREFIX_UNARY(op) \ - inline v4int operator op( const v4int & a ) \ + inline v4int operator op( const v4int &a ) \ { \ v4int b; \ b.i[0] = ( op a.i[0] ); \ @@ -700,7 +760,7 @@ namespace v4 return b; \ } - inline v4int operator +( const v4int & a ) + inline v4int operator +( const v4int &a ) { v4int b; @@ -711,19 +771,19 @@ namespace v4 PREFIX_UNARY(-) - inline v4int operator !( const v4int & a ) + inline v4int operator !( const v4int 
&a ) { v4int b; - b.i[0] = - ( !a.i[0] ); - b.i[1] = - ( !a.i[1] ); - b.i[2] = - ( !a.i[2] ); - b.i[3] = - ( !a.i[3] ); + b.i[0] = - ( ! a.i[0] ); + b.i[1] = - ( ! a.i[1] ); + b.i[2] = - ( ! a.i[2] ); + b.i[3] = - ( ! a.i[3] ); return b; } - inline v4int operator ~( const v4int & a ) + inline v4int operator ~( const v4int &a ) { v4int b; @@ -734,6 +794,7 @@ namespace v4 } u; u.i = -1; + b.v = _mm_xor_ps( a.v, _mm_set1_ps( u.f ) ); return b; @@ -744,7 +805,7 @@ namespace v4 // v4int prefix increment / decrement #define PREFIX_INCDEC(op) \ - inline v4int operator op( v4int & a ) \ + inline v4int operator op( v4int &a ) \ { \ v4int b; \ b.i[0] = ( op a.i[0] ); \ @@ -762,7 +823,7 @@ namespace v4 // v4int postfix increment / decrement #define POSTFIX_INCDEC(op) \ - inline v4int operator op( v4int & a, int ) \ + inline v4int operator op( v4int &a, int ) \ { \ v4int b; \ b.i[0] = ( a.i[0] op ); \ @@ -798,6 +859,8 @@ namespace v4 BINARY(<<) BINARY(>>) + #undef BINARY + inline v4int operator ^( const v4int &a, const v4int &b ) { v4int c; @@ -825,8 +888,6 @@ namespace v4 return c; } - #undef BINARY - // v4int logical operators #define LOGICAL(op) \ @@ -885,9 +946,10 @@ namespace v4 inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) { - __m128 c_v = c.v; v4 tf; + __m128 c_v = c.v; + tf.v = _mm_or_ps( _mm_andnot_ps( c_v, f.v ), _mm_and_ps( c_v, t.v ) ); @@ -1011,6 +1073,8 @@ namespace v4 ASSIGN( *=, _mm_mul_ps ) ASSIGN( /=, _mm_div_ps ) + #undef ASSIGN + inline v4float &operator =( const v4float &b ) { v = b.v; @@ -1018,8 +1082,6 @@ namespace v4 return *this; } - #undef ASSIGN - // v4float member access operator inline float &operator []( int n ) @@ -1116,7 +1178,7 @@ namespace v4 // v4float binary operators -# define BINARY(op,intrin) \ + #define BINARY(op,intrin) \ inline v4float operator op( const v4float &a, const v4float &b ) \ { \ v4float c; \ @@ -1129,7 +1191,7 @@ namespace v4 BINARY( *, _mm_mul_ps ) BINARY( /, _mm_div_ps ) -# undef BINARY + #undef BINARY // 
v4float logical operators @@ -1141,12 +1203,14 @@ namespace v4 return c; \ } - LOGICAL( <, _mm_cmplt_ps ) - LOGICAL( >, _mm_cmpgt_ps ) - LOGICAL( ==, _mm_cmpeq_ps ) + LOGICAL( <, _mm_cmplt_ps ) + LOGICAL( >, _mm_cmpgt_ps ) + LOGICAL( ==, _mm_cmpeq_ps ) + LOGICAL( <=, _mm_cmple_ps ) + LOGICAL( >=, _mm_cmpge_ps ) LOGICAL( !=, _mm_cmpneq_ps ) - LOGICAL( <=, _mm_cmple_ps ) - LOGICAL( >=, _mm_cmpge_ps ) + + #undef LOGICAL inline v4int operator &&( const v4float &a, const v4float &b ) { @@ -1172,8 +1236,6 @@ namespace v4 return c; } - #undef LOGICAL - // v4float math library functions #define CMATH_FR1(fn) \ @@ -1204,6 +1266,9 @@ namespace v4 CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) /*CMATH_FR1(sqrt)*/ CMATH_FR1(tan) CMATH_FR1(tanh) + #undef CMATH_FR1 + #undef CMATH_FR2 + inline v4float fabs( const v4float &a ) { v4float b; @@ -1234,13 +1299,10 @@ namespace v4 return c; } - #undef CMATH_FR1 - #undef CMATH_FR2 - - // v4float miscelleanous functions + // v4float miscellaneous functions inline v4float rsqrt_approx( const v4float &a ) - { + { v4float b; b.v = _mm_rsqrt_ps( a.v ); @@ -1248,6 +1310,24 @@ namespace v4 return b; } + inline v4float rsqrt( const v4float &a ) + { + v4float b; + + __m128 b_v; + + b_v = _mm_rsqrt_ps( a.v ); + + b.v = _mm_fmadd_ps( _mm_set1_ps( 0.5f ), + _mm_fnmadd_ps( a.v, + _mm_mul_ps( b_v, + _mm_mul_ps( b_v, b_v ) ), + b_v ), + b_v ); + + return b; + } + #if 0 inline v4float rsqrt( const v4float &a ) { @@ -1257,8 +1337,6 @@ namespace v4 b_v = _mm_rsqrt_ps( a_v ); - // Note: It is quicker to just call div_ps and sqrt_ps if more - // refinement desired! b.v = _mm_add_ps( b_v, _mm_mul_ps( _mm_set1_ps( 0.5f ), _mm_sub_ps( b_v, _mm_mul_ps( a_v, @@ -1283,9 +1361,6 @@ namespace v4 b_v = _mm_rsqrt_ps( a_v ); - // Note: It is quicker to just call div_ps and sqrt_ps if more - // refinement desired! 
- b.v = _mm_fmadd_ps( _mm_set1_ps( 0.5f ), _mm_fnmadd_ps( a_v, _mm_mul_ps( b_v, @@ -1297,32 +1372,26 @@ namespace v4 } #endif - inline v4float rsqrt( const v4float &a ) + inline v4float rcp_approx( const v4float &a ) { v4float b; - __m128 b_v; - - b_v = _mm_rsqrt_ps( a.v ); - - // Note: It is quicker to just call div_ps and sqrt_ps if more - // refinement desired! - - b.v = _mm_fmadd_ps( _mm_set1_ps( 0.5f ), - _mm_fnmadd_ps( a.v, - _mm_mul_ps( b_v, - _mm_mul_ps( b_v, b_v ) ), - b_v ), - b_v ); + b.v = _mm_rcp_ps( a.v ); return b; } - inline v4float rcp_approx( const v4float &a ) + inline v4float rcp( const v4float &a ) { v4float b; - b.v = _mm_rcp_ps( a.v ); + __m128 b_v; + + b_v = _mm_rcp_ps( a.v ); + + b.v = _mm_fnmadd_ps( a.v, + _mm_mul_ps( b_v, b_v ), + _mm_add_ps( b_v, b_v ) ); return b; } @@ -1363,21 +1432,6 @@ namespace v4 } #endif - inline v4float rcp( const v4float &a ) - { - v4float b; - - __m128 b_v; - - b_v = _mm_rcp_ps( a.v ); - - b.v = _mm_fnmadd_ps( a.v, - _mm_mul_ps( b_v, b_v ), - _mm_add_ps( b_v, b_v ) ); - - return b; - } - inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) { v4float d; @@ -1432,17 +1486,20 @@ namespace v4 return b; } - inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void increment_4x1( float * ALIGNED(16) p, + const v4float &a ) { _mm_store_ps( p, _mm_add_ps( _mm_load_ps( p ), a.v ) ); } - inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void decrement_4x1( float * ALIGNED(16) p, + const v4float &a ) { _mm_store_ps( p, _mm_sub_ps( _mm_load_ps( p ), a.v ) ); } - inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void scale_4x1( float * ALIGNED(16) p, + const v4float &a ) { _mm_store_ps( p, _mm_mul_ps( _mm_load_ps( p ), a.v ) ); } @@ -1452,22 +1509,33 @@ namespace v4 // wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) inline void trilinear( v4float &wl, v4float &wh ) { - __m128 l = _mm_set1_ps( 1.0f ), s = 
_mm_setr_ps( -0.0f, +0.0f, -0.0f, +0.0f ); + __m128 l = _mm_set1_ps( 1.0f ); + + __m128 s = _mm_setr_ps( -0.0f, +0.0f, -0.0f, +0.0f ); + __m128 z = wl.v, xy; - xy = _mm_add_ps( l, _mm_xor_ps( s, _mm_shuffle_ps( z,z, PERM(0,0,1,1) ) ) ); + xy = _mm_add_ps( l, + _mm_xor_ps( s, + _mm_shuffle_ps( z, z, PERM(0,0,1,1) ) + ) + ); - z = _mm_add_ps( l, _mm_xor_ps( s, _mm_shuffle_ps( z,z, PERM(2,2,2,2) ) ) ); + z = _mm_add_ps( l, + _mm_xor_ps( s, + _mm_shuffle_ps( z, z, PERM(2,2,2,2) ) + ) + ); - xy = _mm_mul_ps( _mm_shuffle_ps( xy,xy, PERM(0,1,0,1) ), - _mm_shuffle_ps( xy,xy, PERM(2,2,3,3) ) ); + xy = _mm_mul_ps( _mm_shuffle_ps( xy, xy, PERM(0,1,0,1) ), + _mm_shuffle_ps( xy, xy, PERM(2,2,3,3) ) ); - wl.v = _mm_mul_ps( xy, _mm_shuffle_ps( z,z, PERM(0,0,0,0) ) ); + wl.v = _mm_mul_ps( xy, _mm_shuffle_ps( z, z, PERM(0,0,0,0) ) ); - wh.v = _mm_mul_ps( xy, _mm_shuffle_ps( z,z, PERM(1,1,1,1) ) ); + wh.v = _mm_mul_ps( xy, _mm_shuffle_ps( z, z, PERM(1,1,1,1) ) ); } -# undef PERM + #undef PERM } // namespace v4 diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 22e8dff6..0152ad2b 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -61,17 +61,26 @@ namespace v4 friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; // v4 memory manipulation friends - friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) ALWAYS_INLINE; - friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) ALWAYS_INLINE; + + friend inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + + friend inline void stream_4x1( const v4 &a, 
+ void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void copy_4x1( void * ALIGNED(16) dst, const void * ALIGNED(16) src ) ALWAYS_INLINE; - friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + + friend inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends @@ -168,10 +177,6 @@ namespace v4 v4( const v4 &a ) // Copy constructor { v = a.v; - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // i[j] = a.i[j]; } ~v4() {} // Default destructor @@ -193,10 +198,6 @@ namespace v4 inline v4 splat( const v4 & a ) { v4 b; - // __m128 a_v = a.v; - - // b.v = _mm_shuffle_ps( a_v, a_v, ( n*permute<1,1,1,1>::value ) ); - ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) @@ -209,9 +210,6 @@ namespace v4 inline v4 shuffle( const v4 & a ) { v4 b; - // __m128 a_v = a.v; - - // b.v = _mm_shuffle_ps( a_v, a_v, ( permute::value ) ); b.i[0] = a.i[i0]; b.i[1] = a.i[i1]; @@ -271,93 +269,6 @@ namespace v4 } #endif - #if 0 - inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) - { - float32x4_t a0_v, a2_v, t, u; - - //----------------------------------------------------------------- - float32x2_t a0_vh = vget_high_f32( a0.v ); - float32x2_t a1_vh = vget_high_f32( a1.v ); - - float32x2x2_t res_a0a1_h = vzip_f32( a0_vh, a1_vh ); - - t = vcombine_f32( res_a0a1_h.val[0], res_a0a1_h.val[1] ); - //----------------------------------------------------------------- - // t = _mm_unpackhi_ps( a0.v, a1.v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - float32x2_t a0_vl = vget_low_f32( a0.v ); - float32x2_t a1_vl = vget_low_f32( a1.v ); - - float32x2x2_t res_a0a1_l = vzip_f32( a0_vl, a1_vl ); - - a0_v = vcombine_f32( res_a0a1_l.val[0], res_a0a1_l.val[1] ); - 
//----------------------------------------------------------------- - // a0_v = _mm_unpacklo_ps( a0.v, a1.v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - float32x2_t a2_vh = vget_high_f32( a2.v ); - float32x2_t a3_vh = vget_high_f32( a3.v ); - - float32x2x2_t res_a2a3_h = vzip_f32( a2_vh, a3_vh ); - - u = vcombine_f32( res_a2a3_h.val[0], res_a2a3_h.val[1] ); - //----------------------------------------------------------------- - // u = _mm_unpackhi_ps( a2.v, a3.v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - float32x2_t a2_vl = vget_low_f32( a2.v ); - float32x2_t a3_vl = vget_low_f32( a3.v ); - - float32x2x2_t res_a2a3_l = vzip_f32( a2_vl, a3_vl ); - - a2_v = vcombine_f32( res_a2a3_l.val[0], res_a2a3_l.val[1] ); - //----------------------------------------------------------------- - // a2_v = _mm_unpacklo_ps( a2.v, a3.v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - a0.v[0] = a0_v[0]; - a0.v[1] = a0_v[1]; - a0.v[2] = a2_v[0]; - a0.v[3] = a2_v[1]; - //----------------------------------------------------------------- - // a0.v = _mm_movelh_ps( a0_v, a2_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - a1.v[0] = a0_v[2]; - a1.v[1] = a0_v[3]; - a1.v[2] = a2_v[2]; - a1.v[3] = a2_v[3]; - //----------------------------------------------------------------- - // a1.v = _mm_movehl_ps( a2_v, a0_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - a2.v[0] = t[0]; - a2.v[1] = t[1]; - a2.v[2] = u[0]; - a2.v[3] = u[1]; - //----------------------------------------------------------------- - // 
a2.v = _mm_movelh_ps( t, u ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - a3.v[0] = t[2]; - a3.v[1] = t[3]; - a3.v[2] = u[2]; - a3.v[3] = u[3]; - //----------------------------------------------------------------- - // a3.v = _mm_movehl_ps( u, t ); - //----------------------------------------------------------------- - } - #endif - #if 0 // Portable version. inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) @@ -376,58 +287,31 @@ namespace v4 v4 &a ) { a.v = vld1q_f32( ( float * ) p ); - - // a.v = _mm_load_ps( ( float * ) p ); - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // a.i[j] = ((const int * ALIGNED(16))p)[j]; } inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) { vst1q_f32( ( float * ) p, a.v ); - - // _mm_store_ps( ( float * ) p, a.v ); - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // ((int * ALIGNED(16))p)[j] = a.i[j]; } inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) { - // _mm_stream_ps( ( float * ) p, a.v ); - ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - ((int * ALIGNED(16))p)[j] = a.i[j]; + ( (int * ALIGNED(16) ) p )[j] = a.i[j]; } inline void clear_4x1( void * ALIGNED(16) p ) { vst1q_f32( ( float * ) p, vdupq_n_f32( 0.0f ) ); - - // _mm_store_ps( ( float * ) p, _mm_setzero_ps() ); - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // ((int * ALIGNED(16))p)[j] = 0; } - // FIXME: Ordering semantics inline void copy_4x1( void * ALIGNED(16) dst, const void * ALIGNED(16) src ) { vst1q_f32( ( float * ) dst, vld1q_f32( ( const float * ) src ) ); - - // _mm_store_ps( ( float * ) dst, _mm_load_ps( ( const float * ) src ) ); - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // ((int * ALIGNED(16))dst)[j] = ((const int * ALIGNED(16))src)[j]; } inline void swap_4x1( void * ALIGNED(16) a, @@ -437,54 +321,28 @@ namespace v4 vst1q_f32( ( float * ) a, vld1q_f32( ( float * ) b ) ); vst1q_f32( ( 
float * ) b, t ); - - // __m128 t = _mm_load_ps( ( float * ) a ); - - // _mm_store_ps( ( float * ) a, _mm_load_ps( ( float * ) b ) ); - // _mm_store_ps( ( float * ) b, t ); - - // int t; - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // { - // t = ((int * ALIGNED(16))a)[j]; - // ((int * ALIGNED(16))a)[j] = ((int * ALIGNED(16))b)[j]; - // ((int * ALIGNED(16))b)[j] = t; - // } } // v4 transposed memory manipulation functions inline void load_4x1_tr( const void *a0, - const void *a1, + const void *a1, const void *a2, - const void *a3, + const void *a3, v4 &a ) { - // a.v = _mm_setr_ps( ( (const float *) a0 )[0], - // ( (const float *) a1 )[0], - // ( (const float *) a2 )[0], - // ( (const float *) a3 )[0] ); - - // Not correct. - // float32x4x4_t mat = vld4q_f32( (const float *) a0 ); - - // a.v = mat.val[0]; - - a.i[0] = ((const int *)a0)[0]; - a.i[1] = ((const int *)a1)[0]; - a.i[2] = ((const int *)a2)[0]; - a.i[3] = ((const int *)a3)[0]; + a.i[0] = ( (const int *) a0 )[0]; + a.i[1] = ( (const int *) a1 )[0]; + a.i[2] = ( (const int *) a2 )[0]; + a.i[3] = ( (const int *) a3 )[0]; } - #if 1 inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, v4 &a, - v4 &b ) + v4 &b ) { float32x4_t r, s, t, u, a2_v, a3_v; @@ -502,92 +360,41 @@ namespace v4 a.v = vtrn1q_f64( r, t ); b.v = vtrn1q_f64( s, u ); } - #endif - - #if 0 - // Portable version. 
- inline void load_4x2_tr( const void * ALIGNED(8) a0, - const void * ALIGNED(8) a1, - const void * ALIGNED(8) a2, - const void * ALIGNED(8) a3, - v4 &a, - v4 &b ) - { - a.i[0] = ( ( const int * ALIGNED(8) ) a0 )[0]; - b.i[0] = ( ( const int * ALIGNED(8) ) a0 )[1]; - - a.i[1] = ( ( const int * ALIGNED(8) ) a1 )[0]; - b.i[1] = ( ( const int * ALIGNED(8) ) a1 )[1]; - - a.i[2] = ( ( const int * ALIGNED(8) ) a2 )[0]; - b.i[2] = ( ( const int * ALIGNED(8) ) a2 )[1]; - - a.i[3] = ( ( const int * ALIGNED(8) ) a3 )[0]; - b.i[3] = ( ( const int * ALIGNED(8) ) a3 )[1]; - } - #endif inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, v4 &a, - v4 &b, - v4 &c ) + v4 &b, + v4 &c ) { - // __m128 a_v, b_v, c_v, t, u; - - // t = _mm_load_ps( (const float *)a0 ); - // b_v = _mm_load_ps( (const float *)a1 ); - // c_v = _mm_load_ps( (const float *)a2 ); - // u = _mm_load_ps( (const float *)a3 ); - - // a_v = _mm_unpacklo_ps( t, b_v ); - // b_v = _mm_unpackhi_ps( t, b_v ); - // t = _mm_unpacklo_ps( c_v, u ); - // u = _mm_unpackhi_ps( c_v, u ); + float32x4_t r, s, t, u, d_v; - // c_v = _mm_movelh_ps( b_v, u ); - // b_v = _mm_movehl_ps( t, a_v ); - // a_v = _mm_movelh_ps( a_v, t ); - - // a.v = a_v; - // b.v = b_v; - // c.v = c_v; - - // Not correct. 
- // float32x4x4_t mat = vld4q_f32( (const float *) a0 ); - - // a.v = mat.val[0]; - // b.v = mat.val[1]; - // c.v = mat.val[2]; - - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; + a.v = vld1q_f32( (const float *) a0 ); + b.v = vld1q_f32( (const float *) a1 ); + c.v = vld1q_f32( (const float *) a2 ); + d_v = vld1q_f32( (const float *) a3 ); - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; + r = vtrn1q_f32( a.v, b.v ); + s = vtrn2q_f32( a.v, b.v ); - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; + t = vtrn1q_f32( c.v, d_v ); + u = vtrn2q_f32( c.v, d_v ); - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; + a.v = vtrn1q_f64( r, t ); + b.v = vtrn1q_f64( s, u ); + c.v = vtrn2q_f64( r, t ); } - #if 1 inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, v4 &a, - v4 &b, - v4 &c, - v4 &d ) + v4 &b, + v4 &c, + v4 &d ) { float32x4_t r, s, t, u; @@ -607,150 +414,6 @@ namespace v4 c.v = vtrn2q_f64( r, t ); d.v = vtrn2q_f64( s, u ); } - #endif - - #if 0 - inline void load_4x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, - v4 &b, - v4 &c, - v4 &d ) - { - //----------------------------------------------------------------- - float32x4_t a_v, b_v, c_v, d_v, t, u; - //----------------------------------------------------------------- - // __m128 a_v, b_v, c_v, d_v, t, u; - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - a_v = vld1q_f32( (const float *) a0 ); - b_v = vld1q_f32( 
(const float *) a1 ); - c_v = vld1q_f32( (const float *) a2 ); - d_v = vld1q_f32( (const float *) a3 ); - //----------------------------------------------------------------- - // a_v = _mm_load_ps( (const float *) a0 ); - // b_v = _mm_load_ps( (const float *) a1 ); - // c_v = _mm_load_ps( (const float *) a2 ); - // d_v = _mm_load_ps( (const float *) a3 ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - float32x2_t a_vh = vget_high_f32( a_v ); - float32x2_t b_vh = vget_high_f32( b_v ); - - float32x2x2_t res_ab_h = vzip_f32( a_vh, b_vh ); - - t = vcombine_f32( res_ab_h.val[0], res_ab_h.val[1] ); - //----------------------------------------------------------------- - // t = _mm_unpackhi_ps( a_v, b_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - float32x2_t c_vh = vget_high_f32( c_v ); - float32x2_t d_vh = vget_high_f32( d_v ); - - float32x2x2_t res_cd_h = vzip_f32( c_vh, d_vh ); - - u = vcombine_f32( res_cd_h.val[0], res_cd_h.val[1] ); - //----------------------------------------------------------------- - // u = _mm_unpackhi_ps( c_v, d_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - float32x2_t a_vl = vget_low_f32( a_v ); - float32x2_t b_vl = vget_low_f32( b_v ); - - float32x2x2_t res_ab_l = vzip_f32( a_vl, b_vl ); - - a_v = vcombine_f32( res_ab_l.val[0], res_ab_l.val[1] ); - //----------------------------------------------------------------- - // a_v = _mm_unpacklo_ps( a_v, b_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - float32x2_t c_vl = vget_low_f32( c_v ); - float32x2_t d_vl = vget_low_f32( d_v ); - - float32x2x2_t res_cd_l = vzip_f32( c_vl, d_vl ); - - c_v = 
vcombine_f32( res_cd_l.val[0], res_cd_l.val[1] ); - //----------------------------------------------------------------- - // c_v = _mm_unpacklo_ps( c_v, d_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - a.v[0] = a_v[0]; - a.v[1] = a_v[1]; - a.v[2] = c_v[0]; - a.v[3] = c_v[1]; - //----------------------------------------------------------------- - // a.v = _mm_movelh_ps( a_v, c_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - c.v[0] = t[0]; - c.v[1] = t[1]; - c.v[2] = u[0]; - c.v[3] = u[1]; - //----------------------------------------------------------------- - // c.v = _mm_movelh_ps( t, u ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - b.v[0] = a_v[2]; - b.v[1] = a_v[3]; - b.v[2] = c_v[2]; - b.v[3] = c_v[3]; - //----------------------------------------------------------------- - // b.v = _mm_movehl_ps( c_v, a_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - d.v[0] = t[2]; - d.v[1] = t[3]; - d.v[2] = u[2]; - d.v[3] = u[3]; - //----------------------------------------------------------------- - // d.v = _mm_movehl_ps( u, t ); - //----------------------------------------------------------------- - } - #endif - - #if 0 - // Portable version. 
- inline void load_4x4_tr( const void * ALIGNED(16) a0, - const void * ALIGNED(16) a1, - const void * ALIGNED(16) a2, - const void * ALIGNED(16) a3, - v4 &a, - v4 &b, - v4 &c, - v4 &d ) - { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; - d.i[0] = ((const int * ALIGNED(16))a0)[3]; - - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; - d.i[1] = ((const int * ALIGNED(16))a1)[3]; - - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; - d.i[2] = ((const int * ALIGNED(16))a2)[3]; - - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; - d.i[3] = ((const int * ALIGNED(16))a3)[3]; - } - #endif #if 1 inline void load_4x8_tr( const void * ALIGNED(16) a0, @@ -853,9 +516,9 @@ namespace v4 inline void store_4x1_tr( const v4 &a, void *a0, - void *a1, + void *a1, void *a2, - void *a3 ) + void *a3 ) { ( (int *) a0 )[0] = a.i[0]; ( (int *) a1 )[0] = a.i[1]; @@ -864,11 +527,11 @@ namespace v4 } inline void store_4x2_tr( const v4 &a, - const v4 &b, + const v4 &b, void * ALIGNED(8) a0, - void * ALIGNED(8) a1, + void * ALIGNED(8) a1, void * ALIGNED(8) a2, - void * ALIGNED(8) a3 ) + void * ALIGNED(8) a3 ) { // __m128 a_v = a.v, b_v = b.v, t; @@ -896,12 +559,12 @@ namespace v4 } inline void store_4x3_tr( const v4 &a, - const v4 &b, - const v4 &c, + const v4 &b, + const v4 &c, void * ALIGNED(16) a0, - void * ALIGNED(16) a1, + void * ALIGNED(16) a1, void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) + void * ALIGNED(16) a3 ) { // __m128 a_v = a.v, b_v = b.v, t; @@ -937,15 +600,14 @@ namespace v4 ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; } - #if 1 inline void store_4x4_tr( const v4 &a, - const v4 &b, - const v4 &c, - const v4 &d, + const v4 &b, + const v4 &c, + 
const v4 &d, void * ALIGNED(16) a0, - void * ALIGNED(16) a1, + void * ALIGNED(16) a1, void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) + void * ALIGNED(16) a3 ) { float32x4_t r, s, t, u; @@ -960,184 +622,20 @@ namespace v4 vst1q_f32( (float *) a2, vtrn2q_f64( r, t ) ); vst1q_f32( (float *) a3, vtrn2q_f64( s, u ) ); } - #endif - - #if 0 - inline void store_4x4_tr( const v4 &a, - const v4 &b, - const v4 &c, - const v4 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) - { - //----------------------------------------------------------------- - float32x4_t a_v = a.v, b_v = b.v, c_v = c.v, d_v = d.v, t, u; - //----------------------------------------------------------------- - // __m128 a_v = a.v, b_v = b.v, c_v = c.v, d_v = d.v, t, u; - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - float32x2_t a_vh = vget_high_f32( a_v ); - float32x2_t b_vh = vget_high_f32( b_v ); - - float32x2x2_t res_ab_h = vzip_f32( a_vh, b_vh ); - - t = vcombine_f32( res_ab_h.val[0], res_ab_h.val[1] ); - //----------------------------------------------------------------- - // t = _mm_unpackhi_ps( a_v, b_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - float32x2_t a_vl = vget_low_f32( a_v ); - float32x2_t b_vl = vget_low_f32( b_v ); - - float32x2x2_t res_ab_l = vzip_f32( a_vl, b_vl ); - - a_v = vcombine_f32( res_ab_l.val[0], res_ab_l.val[1] ); - //----------------------------------------------------------------- - // a_v = _mm_unpacklo_ps( a_v, b_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - float32x2_t c_vh = vget_high_f32( c_v ); - float32x2_t d_vh = vget_high_f32( d_v ); - - float32x2x2_t res_cd_h = vzip_f32( c_vh, d_vh ); - - u = vcombine_f32( 
res_cd_h.val[0], res_cd_h.val[1] ); - //----------------------------------------------------------------- - // u = _mm_unpackhi_ps( c_v, d_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - float32x2_t c_vl = vget_low_f32( c_v ); - float32x2_t d_vl = vget_low_f32( d_v ); - - float32x2x2_t res_cd_l = vzip_f32( c_vl, d_vl ); - - c_v = vcombine_f32( res_cd_l.val[0], res_cd_l.val[1] ); - //----------------------------------------------------------------- - // c_v = _mm_unpacklo_ps( c_v, d_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - b_v[0] = a_v[2]; - b_v[1] = a_v[3]; - b_v[2] = c_v[2]; - b_v[3] = c_v[3]; - //----------------------------------------------------------------- - // b_v = _mm_movehl_ps( c_v, a_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - a_v[0] = a_v[0]; - a_v[1] = a_v[1]; - a_v[2] = c_v[0]; - a_v[3] = c_v[1]; - //----------------------------------------------------------------- - // a_v = _mm_movelh_ps( a_v, c_v ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - c_v[0] = t[0]; - c_v[1] = t[1]; - c_v[2] = u[0]; - c_v[3] = u[1]; - //----------------------------------------------------------------- - // c_v = _mm_movelh_ps( t, u ); - //----------------------------------------------------------------- - - //----------------------------------------------------------------- - d_v[0] = t[2]; - d_v[1] = t[3]; - d_v[2] = u[2]; - d_v[3] = u[3]; - //----------------------------------------------------------------- - // d_v = _mm_movehl_ps( u, t ); - //----------------------------------------------------------------- - - 
//----------------------------------------------------------------- - vst1q_f32( (float *) a0, a_v ); - vst1q_f32( (float *) a1, b_v ); - vst1q_f32( (float *) a2, c_v ); - vst1q_f32( (float *) a3, d_v ); - //----------------------------------------------------------------- - // _mm_store_ps( (float *) a0, a_v ); - // _mm_store_ps( (float *) a1, b_v ); - // _mm_store_ps( (float *) a2, c_v ); - // _mm_store_ps( (float *) a3, d_v ); - //----------------------------------------------------------------- - - // ((int * ALIGNED(16))a0)[0] = a.i[0]; - // ((int * ALIGNED(16))a0)[1] = b.i[0]; - // ((int * ALIGNED(16))a0)[2] = c.i[0]; - // ((int * ALIGNED(16))a0)[3] = d.i[0]; - - // ((int * ALIGNED(16))a1)[0] = a.i[1]; - // ((int * ALIGNED(16))a1)[1] = b.i[1]; - // ((int * ALIGNED(16))a1)[2] = c.i[1]; - // ((int * ALIGNED(16))a1)[3] = d.i[1]; - - // ((int * ALIGNED(16))a2)[0] = a.i[2]; - // ((int * ALIGNED(16))a2)[1] = b.i[2]; - // ((int * ALIGNED(16))a2)[2] = c.i[2]; - // ((int * ALIGNED(16))a2)[3] = d.i[2]; - - // ((int * ALIGNED(16))a3)[0] = a.i[3]; - // ((int * ALIGNED(16))a3)[1] = b.i[3]; - // ((int * ALIGNED(16))a3)[2] = c.i[3]; - // ((int * ALIGNED(16))a3)[3] = d.i[3]; - } - #endif - - #if 0 - // Portable version. 
- inline void store_4x4_tr( const v4 &a, - const v4 &b, - const v4 &c, - const v4 &d, - void * ALIGNED(16) a0, - void * ALIGNED(16) a1, - void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) - { - ( ( int * ALIGNED(16) ) a0 )[0] = a.i[0]; - ( ( int * ALIGNED(16) ) a0 )[1] = b.i[0]; - ( ( int * ALIGNED(16) ) a0 )[2] = c.i[0]; - ( ( int * ALIGNED(16) ) a0 )[3] = d.i[0]; - - ( ( int * ALIGNED(16) ) a1 )[0] = a.i[1]; - ( ( int * ALIGNED(16) ) a1 )[1] = b.i[1]; - ( ( int * ALIGNED(16) ) a1 )[2] = c.i[1]; - ( ( int * ALIGNED(16) ) a1 )[3] = d.i[1]; - - ( ( int * ALIGNED(16) ) a2 )[0] = a.i[2]; - ( ( int * ALIGNED(16) ) a2 )[1] = b.i[2]; - ( ( int * ALIGNED(16) ) a2 )[2] = c.i[2]; - ( ( int * ALIGNED(16) ) a2 )[3] = d.i[2]; - - ( ( int * ALIGNED(16) ) a3 )[0] = a.i[3]; - ( ( int * ALIGNED(16) ) a3 )[1] = b.i[3]; - ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; - ( ( int * ALIGNED(16) ) a3 )[3] = d.i[3]; - } - #endif #if 1 inline void store_4x8_tr( const v4 &b00, - const v4 &b01, - const v4 &b02, - const v4 &b03, - const v4 &b04, - const v4 &b05, - const v4 &b06, - const v4 &b07, + const v4 &b01, + const v4 &b02, + const v4 &b03, + const v4 &b04, + const v4 &b05, + const v4 &b06, + const v4 &b07, void * ALIGNED(16) a0, - void * ALIGNED(16) a1, + void * ALIGNED(16) a1, void * ALIGNED(16) a2, - void * ALIGNED(16) a3 ) + void * ALIGNED(16) a3 ) { float32x4x4_t mat0, mat2; @@ -1241,19 +739,11 @@ namespace v4 v4int( const v4int &a ) // Copy constructor { v = a.v; - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // i[j] = a.i[j]; } v4int( const v4 &a ) // Init from mixed { v = a.v; - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // i[j] = a.i[j]; } v4int( int a ) // Init from scalar @@ -1266,10 +756,6 @@ namespace v4 u.i = a; v = vdupq_n_f32( u.f ); - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // i[j] = a; } v4int( int i0, int i1, int i2, int i3 ) // Init from scalars @@ -1313,10 +799,8 @@ namespace v4 ASSIGN(%=) ASSIGN(<<=) ASSIGN(>>=) - // ASSIGN( 
=) - // ASSIGN(^=) - // ASSIGN(&=) - // ASSIGN(|=) + + #undef ASSIGN inline v4int &operator =( const v4int &b ) { @@ -1346,8 +830,6 @@ namespace v4 return *this; } - #undef ASSIGN - // v4int member access operator inline int &operator []( int n ) @@ -1364,7 +846,7 @@ namespace v4 // v4int prefix unary operators #define PREFIX_UNARY(op) \ - inline v4int operator op( const v4int & a ) \ + inline v4int operator op( const v4int &a ) \ { \ v4int b; \ ALWAYS_VECTORIZE \ @@ -1376,7 +858,7 @@ namespace v4 PREFIX_UNARY(+) PREFIX_UNARY(-) - inline v4int operator !( const v4int & a ) + inline v4int operator !( const v4int &a ) { v4int b; @@ -1394,7 +876,7 @@ namespace v4 // v4int prefix increment / decrement #define PREFIX_INCDEC(op) \ - inline v4int operator op( v4int & a ) \ + inline v4int operator op( v4int &a ) \ { \ v4int b; \ ALWAYS_VECTORIZE \ @@ -1411,7 +893,7 @@ namespace v4 // v4int postfix increment / decrement #define POSTFIX_INCDEC(op) \ - inline v4int operator op( v4int & a, int ) \ + inline v4int operator op( v4int &a, int ) \ { \ v4int b; \ ALWAYS_VECTORIZE \ @@ -1444,9 +926,8 @@ namespace v4 BINARY(%) BINARY(<<) BINARY(>>) - // BINARY(^) - // BINARY(&) - // BINARY(|) + + #undef BINARY inline v4int operator ^( const v4int &a, const v4int &b ) { @@ -1475,8 +956,6 @@ namespace v4 return c; } - #undef BINARY - // v4int logical operators #define LOGICAL(op) \ @@ -1517,13 +996,14 @@ namespace v4 { v4 b; - b.vsi = vbicq_s32( c.vsi, a.vsi ); + // This seems broken. + // b.vsi = vbicq_s32( c.vsi, a.vsi ); // b.v = _mm_andnot_ps( c.v, a.v ); - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // b.i[j] = a.i[j] & ~c.i[j]; + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + b.i[j] = a.i[j] & ~c.i[j]; return b; } @@ -1545,10 +1025,11 @@ namespace v4 inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) { - v4 m; + v4 tf; - m.vsi = vorrq_s32( vbicq_s32( c.vsi, f.vsi ), - vandq_s32( c.vsi, t.vsi ) ); + // This seems broken. 
+ // tf.vsi = vorrq_s32( vbicq_s32( c.vsi, f.vsi ), + // vandq_s32( c.vsi, t.vsi ) ); // __m128 c_v = c.v; // v4 tf; @@ -1556,11 +1037,11 @@ namespace v4 // tf.v = _mm_or_ps( _mm_andnot_ps( c_v, f.v ), // _mm_and_ps( c_v, t.v ) ); - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // m.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); + ALWAYS_VECTORIZE + for( int j = 0; j < 4; j++ ) + m.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); - return m; + return tf; } //////////////// @@ -1608,7 +1089,7 @@ namespace v4 #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) ALWAYS_INLINE + const v4float &b ) ALWAYS_INLINE CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); @@ -1647,28 +1128,16 @@ namespace v4 v4float( const v4float &a ) // Copy constructor { v = a.v; - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // f[j] = a.f[j]; } v4float( const v4 &a ) // Init from mixed { v = a.v; - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // f[j] = a.f[j]; } v4float( float a ) // Init from scalar { v = vdupq_n_f32( a ); - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // f[j] = a; } v4float( float f0, float f1, float f2, float f3 ) // Init from scalars @@ -1697,6 +1166,8 @@ namespace v4 ASSIGN( *=, vmulq_f32 ) ASSIGN( /=, vdivq_f32 ) + #undef ASSIGN + inline v4float &operator =( const v4float &b ) { v = b.v; @@ -1704,25 +1175,6 @@ namespace v4 return *this; } - #undef ASSIGN - - // #define ASSIGN(op) \ - // inline v4float &operator op( const v4float &b ) \ - // { \ - // ALWAYS_VECTORIZE \ - // for( int j = 0; j < 4; j++ ) \ - // f[j] op b.f[j]; \ - // return *this; \ - // } - - // ASSIGN(=) - // ASSIGN(+=) - // ASSIGN(-=) - // ASSIGN(*=) - // ASSIGN(/=) - - // #undef ASSIGN - // v4float member access operator inline float &operator []( int 
n ) @@ -1744,10 +1196,6 @@ namespace v4 b.v = a.v; - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // b.f[j] = +a.f[j]; - return b; } @@ -1862,23 +1310,6 @@ namespace v4 #undef BINARY - // #define BINARY(op) \ - // inline v4float operator op( const v4float &a, const v4float &b ) \ - // { \ - // v4float c; \ - // ALWAYS_VECTORIZE \ - // for( int j = 0; j < 4; j++ ) \ - // c.f[j] = a.f[j] op b.f[j]; \ - // return c; \ - // } - - // BINARY(+) - // BINARY(-) - // BINARY(*) - // BINARY(/) - - // #undef BINARY - // v4float logical operators #define LOGICAL(op,intrin) \ @@ -1894,7 +1325,8 @@ namespace v4 LOGICAL( ==, vceqq_f32 ) LOGICAL( <=, vcleq_f32 ) LOGICAL( >=, vcgeq_f32 ) - // LOGICAL( !=, _mm_cmpneq_ps ) + + #undef LOGICAL inline v4int operator !=( const v4float &a, const v4float &b ) { @@ -1919,9 +1351,9 @@ namespace v4 // Is there a better way to do this than the SSE way? c.vsi = vandq_s32( vmvnq_u32( vceqq_f32( a.v, - vzero ) ), - vmvnq_u32( vceqq_f32( b.v, - vzero ) ) ); + vzero ) ), + vmvnq_u32( vceqq_f32( b.v, + vzero ) ) ); // c.v = _mm_and_ps( _mm_cmpneq_ps( a.v, vzero ), // _mm_cmpneq_ps( b.v, vzero ) ); @@ -1939,9 +1371,9 @@ namespace v4 // Is there a better way to do this than the SSE way? 
c.vsi = vorrq_s32( vmvnq_u32( vceqq_f32( a.v, - vzero ) ), - vmvnq_u32( vceqq_f32( b.v, - vzero ) ) ); + vzero ) ), + vmvnq_u32( vceqq_f32( b.v, + vzero ) ) ); // c.v = _mm_or_ps( _mm_cmpneq_ps( a.v, vzero ), // _mm_cmpneq_ps( b.v, vzero ) ); @@ -1949,29 +1381,6 @@ namespace v4 return c; } - #undef LOGICAL - - // #define LOGICAL(op) \ - // inline v4int operator op( const v4float &a, const v4float &b ) \ - // { \ - // v4int c; \ - // ALWAYS_VECTORIZE \ - // for( int j = 0; j < 4; j++ ) \ - // c.i[j] = - ( a.f[j] op b.f[j] ); \ - // return c; \ - // } - - // LOGICAL(< ) - // LOGICAL(> ) - // LOGICAL(==) - // LOGICAL(!=) - // LOGICAL(<=) - // LOGICAL(>=) - // LOGICAL(&&) - // LOGICAL(||) - - // #undef LOGICAL - // v4float math library functions #define CMATH_FR1(fn) \ @@ -2000,6 +1409,9 @@ namespace v4 CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) + #undef CMATH_FR1 + #undef CMATH_FR2 + inline v4float copysign( const v4float &a, const v4float &b ) { v4float c; @@ -2016,9 +1428,6 @@ namespace v4 return c; } - #undef CMATH_FR1 - #undef CMATH_FR2 - // v4float miscellaneous functions inline v4float rsqrt_approx( const v4float &a ) @@ -2027,10 +1436,6 @@ namespace v4 b.v = vrsqrteq_f32( a.v ); - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // b.f[j] = ::sqrt( 1.0f / a.f[j] ); - return b; } @@ -2038,26 +1443,24 @@ namespace v4 { v4float b; - // float32x4_t a_v = a.v, b_v; + float32x4_t a_v = a.v, b_v; - // b_v = vrsqrteq_f32( a_v ); + b_v = vrsqrteq_f32( a_v ); - // // Note: It is quicker to just call div_ps and sqrt_ps if more - // // refinement desired! 
- // b.v = vaddq_f32( b_v, vmulq_f32( vdupq_n_f32( 0.5f ), - // vsubq_f32( b_v, - // vmulq_f32( a_v, - // vmulq_f32( b_v, - // vmulq_f32( b_v, b_v ) - // ) - // ) - // ) - // ) - // ); + b.v = vaddq_f32( b_v, vmulq_f32( vdupq_n_f32( 0.5f ), + vsubq_f32( b_v, + vmulq_f32( a_v, + vmulq_f32( b_v, + vmulq_f32( b_v, b_v ) + ) + ) + ) + ) + ); - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = ::sqrt( 1.0f / a.f[j] ); + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // b.f[j] = ::sqrt( 1.0f / a.f[j] ); return b; } @@ -2068,10 +1471,6 @@ namespace v4 b.v = vrecpeq_f32( a.v ); - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // b.f[j] = 1.0f / a.f[j]; - return b; } @@ -2079,19 +1478,19 @@ namespace v4 { v4float b; - // float32x4_t a_v = a.v, b_v; + float32x4_t a_v = a.v, b_v; - // b_v = vrecpeq_f32( a_v ); + b_v = vrecpeq_f32( a_v ); - // b.v = vsubq_f32( vaddq_f32( b_v, b_v ), - // vmulq_f32( a_v, - // vmulq_f32( b_v, b_v ) - // ) - // ); + b.v = vsubq_f32( vaddq_f32( b_v, b_v ), + vmulq_f32( a_v, + vmulq_f32( b_v, b_v ) + ) + ); - ALWAYS_VECTORIZE - for( int j = 0; j < 4; j++ ) - b.f[j] = 1.0f / a.f[j]; + // ALWAYS_VECTORIZE + // for( int j = 0; j < 4; j++ ) + // b.f[j] = 1.0f / a.f[j]; return b; } @@ -2100,11 +1499,10 @@ namespace v4 { v4float d; - d.v = vfmaq_f32( a.v, b.v, c.v ); + d.v = vaddq_f32( vmulq_f32( a.v, b.v ), c.v ); - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // d.f[j] = a.f[j] * b.f[j] + c.f[j]; + // This seems broken. + // d.v = vfmaq_f32( a.v, b.v, c.v ); return d; } @@ -2113,11 +1511,10 @@ namespace v4 { v4float d; - d.v = vfmsq_f32( a.v, b.v, c.v ); + d.v = vsubq_f32( vmulq_f32( a.v, b.v ), c.v ); - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // d.f[j] = a.f[j] * b.f[j] - c.f[j]; + // This seems broken. 
+ // d.v = vfmsq_f32( a.v, b.v, c.v ); return d; } @@ -2126,11 +1523,7 @@ namespace v4 { v4float d; - d.v = vsubq_f32( vdupq_n_f32( 0.0f ), vfmsq_f32( a.v, b.v, c.v ) ); - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // d.f[j] = c.f[j] - a.f[j] * b.f[j]; + d.v = vsubq_f32( c.v, vmulq_f32( a.v, b.v ) ); return d; } @@ -2180,40 +1573,28 @@ namespace v4 return b; } - inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void increment_4x1( float * ALIGNED(16) p, + const v4float &a ) { vst1q_f32( p, vaddq_f32( vld1q_f32( p ), a.v ) ); - - // _mm_store_ps( p, _mm_add_ps( _mm_load_ps( p ), a.v ) ); - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // p[j] += a.f[j]; } - inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void decrement_4x1( float * ALIGNED(16) p, + const v4float &a ) { vst1q_f32( p, vsubq_f32( vld1q_f32( p ), a.v ) ); - - // _mm_store_ps( p, _mm_sub_ps( _mm_load_ps( p ), a.v ) ); - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // p[j] -= a.f[j]; } - inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void scale_4x1( float * ALIGNED(16) p, + const v4float &a ) { vst1q_f32( p, vmulq_f32( vld1q_f32( p ), a.v ) ); - - // _mm_store_ps( p, _mm_mul_ps( _mm_load_ps( p ), a.v ) ); - - // ALWAYS_VECTORIZE - // for( int j = 0; j < 4; j++ ) - // p[j] *= a.f[j]; } - inline void trilinear( v4float & wl, v4float & wh ) + // Given wl = x y z w, compute: + // wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) + // wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) + inline void trilinear( v4float &wl, v4float &wh ) { float x = wl.f[0], y = wl.f[1], z = wl.f[2]; diff --git a/src/util/v4/v4_portable.h b/src/util/v4/v4_portable.h index 6dbb790b..b192c514 100644 --- a/src/util/v4/v4_portable.h +++ b/src/util/v4/v4_portable.h @@ -48,17 +48,26 @@ namespace v4 friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; friend 
inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; // v4 memory manipulation friends - friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) ALWAYS_INLINE; - friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) ALWAYS_INLINE; + + friend inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + + friend inline void stream_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void copy_4x1( void * ALIGNED(16) dst, const void * ALIGNED(16) src ) ALWAYS_INLINE; - friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + + friend inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends @@ -167,7 +176,7 @@ namespace v4 return b; } -# define sw(x,y) x^=y, y^=x, x^=y + #define sw(x,y) x^=y, y^=x, x^=y inline void swap( v4 &a, v4 &b ) { @@ -184,53 +193,52 @@ namespace v4 sw( a2.i[3],a3.i[2] ); } -# undef sw + #undef sw // v4 memory manipulation functions inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) { - a.i[0] = ((const int * ALIGNED(16))p)[0]; - a.i[1] = ((const int * ALIGNED(16))p)[1]; - a.i[2] = ((const int * ALIGNED(16))p)[2]; - a.i[3] = ((const int * ALIGNED(16))p)[3]; + a.i[0] = ( ( const int * ALIGNED(16) ) p )[0]; + a.i[1] = ( ( const int * ALIGNED(16) ) p )[1]; + a.i[2] = ( ( const int * ALIGNED(16) ) p )[2]; + a.i[3] = ( ( const int * ALIGNED(16) ) p )[3]; } inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) { - ((int * ALIGNED(16))p)[0] = a.i[0]; - ((int * 
ALIGNED(16))p)[1] = a.i[1]; - ((int * ALIGNED(16))p)[2] = a.i[2]; - ((int * ALIGNED(16))p)[3] = a.i[3]; + ( ( int * ALIGNED(16) ) p )[0] = a.i[0]; + ( ( int * ALIGNED(16) ) p )[1] = a.i[1]; + ( ( int * ALIGNED(16) ) p )[2] = a.i[2]; + ( ( int * ALIGNED(16) ) p )[3] = a.i[3]; } inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) { - ((int * ALIGNED(16))p)[0] = a.i[0]; - ((int * ALIGNED(16))p)[1] = a.i[1]; - ((int * ALIGNED(16))p)[2] = a.i[2]; - ((int * ALIGNED(16))p)[3] = a.i[3]; + ( ( int * ALIGNED(16) ) p )[0] = a.i[0]; + ( ( int * ALIGNED(16) ) p )[1] = a.i[1]; + ( ( int * ALIGNED(16) ) p )[2] = a.i[2]; + ( ( int * ALIGNED(16) ) p )[3] = a.i[3]; } inline void clear_4x1( void * ALIGNED(16) p ) { - ((int * ALIGNED(16))p)[0] = 0; - ((int * ALIGNED(16))p)[1] = 0; - ((int * ALIGNED(16))p)[2] = 0; - ((int * ALIGNED(16))p)[3] = 0; + ( ( int * ALIGNED(16) ) p )[0] = 0; + ( ( int * ALIGNED(16) ) p )[1] = 0; + ( ( int * ALIGNED(16) ) p )[2] = 0; + ( ( int * ALIGNED(16) ) p )[3] = 0; } - // FIXME: Ordering semantics inline void copy_4x1( void * ALIGNED(16) dst, const void * ALIGNED(16) src ) { - ((int * ALIGNED(16))dst)[0] = ((const int * ALIGNED(16))src)[0]; - ((int * ALIGNED(16))dst)[1] = ((const int * ALIGNED(16))src)[1]; - ((int * ALIGNED(16))dst)[2] = ((const int * ALIGNED(16))src)[2]; - ((int * ALIGNED(16))dst)[3] = ((const int * ALIGNED(16))src)[3]; + ( ( int * ALIGNED(16) ) dst )[0] = ( ( const int * ALIGNED(16) ) src )[0]; + ( ( int * ALIGNED(16) ) dst )[1] = ( ( const int * ALIGNED(16) ) src )[1]; + ( ( int * ALIGNED(16) ) dst )[2] = ( ( const int * ALIGNED(16) ) src )[2]; + ( ( int * ALIGNED(16) ) dst )[3] = ( ( const int * ALIGNED(16) ) src )[3]; } inline void swap_4x1( void * ALIGNED(16) a, @@ -238,175 +246,201 @@ namespace v4 { int t; - t = ((int * ALIGNED(16))a)[0]; - ((int * ALIGNED(16))a)[0] = ((int * ALIGNED(16))b)[0]; - ((int * ALIGNED(16))b)[0] = t; + t = ( ( int * ALIGNED(16) ) a )[0]; + + ( ( int * ALIGNED(16) ) a )[0] = ( ( int * ALIGNED(16) ) 
b )[0]; + ( ( int * ALIGNED(16) ) b )[0] = t; - t = ((int * ALIGNED(16))a)[1]; - ((int * ALIGNED(16))a)[1] = ((int * ALIGNED(16))b)[1]; - ((int * ALIGNED(16))b)[1] = t; + t = ( ( int * ALIGNED(16) ) a )[1]; - t = ((int * ALIGNED(16))a)[2]; - ((int * ALIGNED(16))a)[2] = ((int * ALIGNED(16))b)[2]; - ((int * ALIGNED(16))b)[2] = t; + ( ( int * ALIGNED(16) ) a )[1] = ( ( int * ALIGNED(16) ) b )[1]; + ( ( int * ALIGNED(16) ) b )[1] = t; - t = ((int * ALIGNED(16))a)[3]; - ((int * ALIGNED(16))a)[3] = ((int * ALIGNED(16))b)[3]; - ((int * ALIGNED(16))b)[3] = t; + t = ( ( int * ALIGNED(16) ) a )[2]; + + ( ( int * ALIGNED(16) ) a )[2] = ( ( int * ALIGNED(16) ) b )[2]; + ( ( int * ALIGNED(16) ) b )[2] = t; + + t = ( ( int * ALIGNED(16) ) a )[3]; + + ( ( int * ALIGNED(16) ) a )[3] = ( ( int * ALIGNED(16) ) b )[3]; + ( ( int * ALIGNED(16) ) b )[3] = t; } // v4 transposed memory manipulation functions - inline void load_4x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, + inline void load_4x1_tr( const void *a0, + const void *a1, + const void *a2, + const void *a3, v4 &a ) { - a.i[0] = ((const int *)a0)[0]; - a.i[1] = ((const int *)a1)[0]; - a.i[2] = ((const int *)a2)[0]; - a.i[3] = ((const int *)a3)[0]; + a.i[0] = ( (const int *) a0 )[0]; + a.i[1] = ( (const int *) a1 )[0]; + a.i[2] = ( (const int *) a2 )[0]; + a.i[3] = ( (const int *) a3 )[0]; } inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, - v4 &a, v4 &b ) + v4 &a, + v4 &b ) { - a.i[0] = ((const int * ALIGNED(8))a0)[0]; - b.i[0] = ((const int * ALIGNED(8))a0)[1]; + a.i[0] = ( ( const int * ALIGNED(8) ) a0 )[0]; + b.i[0] = ( ( const int * ALIGNED(8) ) a0 )[1]; - a.i[1] = ((const int * ALIGNED(8))a1)[0]; - b.i[1] = ((const int * ALIGNED(8))a1)[1]; + a.i[1] = ( ( const int * ALIGNED(8) ) a1 )[0]; + b.i[1] = ( ( const int * ALIGNED(8) ) a1 )[1]; - a.i[2] = ((const int * ALIGNED(8))a2)[0]; - b.i[2] = ((const int * 
ALIGNED(8))a2)[1]; + a.i[2] = ( ( const int * ALIGNED(8) ) a2 )[0]; + b.i[2] = ( ( const int * ALIGNED(8) ) a2 )[1]; - a.i[3] = ((const int * ALIGNED(8))a3)[0]; - b.i[3] = ((const int * ALIGNED(8))a3)[1]; + a.i[3] = ( ( const int * ALIGNED(8) ) a3 )[0]; + b.i[3] = ( ( const int * ALIGNED(8) ) a3 )[1]; } inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ) + v4 &a, + v4 &b, + v4 &c ) { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; + a.i[0] = ( ( const int * ALIGNED(16) ) a0 )[0]; + b.i[0] = ( ( const int * ALIGNED(16) ) a0 )[1]; + c.i[0] = ( ( const int * ALIGNED(16) ) a0 )[2]; - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; + a.i[1] = ( ( const int * ALIGNED(16) ) a1 )[0]; + b.i[1] = ( ( const int * ALIGNED(16) ) a1 )[1]; + c.i[1] = ( ( const int * ALIGNED(16) ) a1 )[2]; - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; + a.i[2] = ( ( const int * ALIGNED(16) ) a2 )[0]; + b.i[2] = ( ( const int * ALIGNED(16) ) a2 )[1]; + c.i[2] = ( ( const int * ALIGNED(16) ) a2 )[2]; - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; + a.i[3] = ( ( const int * ALIGNED(16) ) a3 )[0]; + b.i[3] = ( ( const int * ALIGNED(16) ) a3 )[1]; + c.i[3] = ( ( const int * ALIGNED(16) ) a3 )[2]; } inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) + v4 &a, + v4 &b, + v4 &c, + v4 &d ) { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; - d.i[0] = 
((const int * ALIGNED(16))a0)[3]; - - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; - d.i[1] = ((const int * ALIGNED(16))a1)[3]; - - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; - d.i[2] = ((const int * ALIGNED(16))a2)[3]; - - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; - d.i[3] = ((const int * ALIGNED(16))a3)[3]; + a.i[0] = ( ( const int * ALIGNED(16) ) a0 )[0]; + b.i[0] = ( ( const int * ALIGNED(16) ) a0 )[1]; + c.i[0] = ( ( const int * ALIGNED(16) ) a0 )[2]; + d.i[0] = ( ( const int * ALIGNED(16) ) a0 )[3]; + + a.i[1] = ( ( const int * ALIGNED(16) ) a1 )[0]; + b.i[1] = ( ( const int * ALIGNED(16) ) a1 )[1]; + c.i[1] = ( ( const int * ALIGNED(16) ) a1 )[2]; + d.i[1] = ( ( const int * ALIGNED(16) ) a1 )[3]; + + a.i[2] = ( ( const int * ALIGNED(16) ) a2 )[0]; + b.i[2] = ( ( const int * ALIGNED(16) ) a2 )[1]; + c.i[2] = ( ( const int * ALIGNED(16) ) a2 )[2]; + d.i[2] = ( ( const int * ALIGNED(16) ) a2 )[3]; + + a.i[3] = ( ( const int * ALIGNED(16) ) a3 )[0]; + b.i[3] = ( ( const int * ALIGNED(16) ) a3 )[1]; + c.i[3] = ( ( const int * ALIGNED(16) ) a3 )[2]; + d.i[3] = ( ( const int * ALIGNED(16) ) a3 )[3]; } inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, - void *a2, void *a3 ) + void *a0, + void *a1, + void *a2, + void *a3 ) { - ((int *)a0)[0] = a.i[0]; - ((int *)a1)[0] = a.i[1]; - ((int *)a2)[0] = a.i[2]; - ((int *)a3)[0] = a.i[3]; + ( (int *) a0 )[0] = a.i[0]; + ( (int *) a1 )[0] = a.i[1]; + ( (int *) a2 )[0] = a.i[2]; + ( (int *) a3 )[0] = a.i[3]; } - inline void store_4x2_tr( const v4 &a, const v4 &b, - void * ALIGNED(8) a0, void * ALIGNED(8) a1, - void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) + inline void store_4x2_tr( const v4 &a, + const v4 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) 
a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3 ) { - ((int * ALIGNED(8))a0)[0] = a.i[0]; - ((int * ALIGNED(8))a0)[1] = b.i[0]; + ( ( int * ALIGNED(8) ) a0 )[0] = a.i[0]; + ( ( int * ALIGNED(8) ) a0 )[1] = b.i[0]; - ((int * ALIGNED(8))a1)[0] = a.i[1]; - ((int * ALIGNED(8))a1)[1] = b.i[1]; + ( ( int * ALIGNED(8) ) a1 )[0] = a.i[1]; + ( ( int * ALIGNED(8) ) a1 )[1] = b.i[1]; - ((int * ALIGNED(8))a2)[0] = a.i[2]; - ((int * ALIGNED(8))a2)[1] = b.i[2]; + ( ( int * ALIGNED(8) ) a2 )[0] = a.i[2]; + ( ( int * ALIGNED(8) ) a2 )[1] = b.i[2]; - ((int * ALIGNED(8))a3)[0] = a.i[3]; - ((int * ALIGNED(8))a3)[1] = b.i[3]; + ( ( int * ALIGNED(8) ) a3 )[0] = a.i[3]; + ( ( int * ALIGNED(8) ) a3 )[1] = b.i[3]; } - inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + inline void store_4x3_tr( const v4 &a, + const v4 &b, + const v4 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; + ( ( int * ALIGNED(16) ) a0 )[0] = a.i[0]; + ( ( int * ALIGNED(16) ) a0 )[1] = b.i[0]; + ( ( int * ALIGNED(16) ) a0 )[2] = c.i[0]; - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; + ( ( int * ALIGNED(16) ) a1 )[0] = a.i[1]; + ( ( int * ALIGNED(16) ) a1 )[1] = b.i[1]; + ( ( int * ALIGNED(16) ) a1 )[2] = c.i[1]; - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; + ( ( int * ALIGNED(16) ) a2 )[0] = a.i[2]; + ( ( int * ALIGNED(16) ) a2 )[1] = b.i[2]; + ( ( int * ALIGNED(16) ) a2 )[2] = c.i[2]; - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; + ( ( int * ALIGNED(16) ) a3 )[0] = a.i[3]; + ( ( int * ALIGNED(16) ) a3 )[1] = b.i[3]; + ( 
( int * ALIGNED(16) ) a3 )[2] = c.i[3]; } - inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + inline void store_4x4_tr( const v4 &a, + const v4 &b, + const v4 &c, + const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; - ((int * ALIGNED(16))a0)[3] = d.i[0]; - - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; - ((int * ALIGNED(16))a1)[3] = d.i[1]; - - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; - ((int * ALIGNED(16))a2)[3] = d.i[2]; - - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; - ((int * ALIGNED(16))a3)[3] = d.i[3]; + ( ( int * ALIGNED(16) ) a0 )[0] = a.i[0]; + ( ( int * ALIGNED(16) ) a0 )[1] = b.i[0]; + ( ( int * ALIGNED(16) ) a0 )[2] = c.i[0]; + ( ( int * ALIGNED(16) ) a0 )[3] = d.i[0]; + + ( ( int * ALIGNED(16) ) a1 )[0] = a.i[1]; + ( ( int * ALIGNED(16) ) a1 )[1] = b.i[1]; + ( ( int * ALIGNED(16) ) a1 )[2] = c.i[1]; + ( ( int * ALIGNED(16) ) a1 )[3] = d.i[1]; + + ( ( int * ALIGNED(16) ) a2 )[0] = a.i[2]; + ( ( int * ALIGNED(16) ) a2 )[1] = b.i[2]; + ( ( int * ALIGNED(16) ) a2 )[2] = c.i[2]; + ( ( int * ALIGNED(16) ) a2 )[3] = d.i[2]; + + ( ( int * ALIGNED(16) ) a3 )[0] = a.i[3]; + ( ( int * ALIGNED(16) ) a3 )[1] = b.i[3]; + ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; + ( ( int * ALIGNED(16) ) a3 )[3] = d.i[3]; } ////////////// @@ -527,7 +561,7 @@ namespace v4 // v4int assignment operators -# define ASSIGN(op) \ + #define ASSIGN(op) \ inline v4int &operator op( const v4int &b ) \ { \ i[0] op b.i[0]; \ @@ -537,19 +571,19 @@ namespace v4 return *this; \ } - 
ASSIGN( =) ASSIGN(+=) ASSIGN(-=) ASSIGN(*=) ASSIGN(/=) ASSIGN(%=) + ASSIGN(<<=) + ASSIGN(>>=) + ASSIGN( =) ASSIGN(^=) ASSIGN(&=) ASSIGN(|=) - ASSIGN(<<=) - ASSIGN(>>=) -# undef ASSIGN + #undef ASSIGN // v4int member access operator @@ -566,7 +600,7 @@ namespace v4 // v4int prefix unary operators -# define PREFIX_UNARY(op) \ + #define PREFIX_UNARY(op) \ inline v4int operator op( const v4int & a ) \ { \ v4int b; \ @@ -584,21 +618,21 @@ namespace v4 { v4int b; - b.i[0] = - ( !a.i[0] ); - b.i[1] = - ( !a.i[1] ); - b.i[2] = - ( !a.i[2] ); - b.i[3] = - ( !a.i[3] ); + b.i[0] = - ( ! a.i[0] ); + b.i[1] = - ( ! a.i[1] ); + b.i[2] = - ( ! a.i[2] ); + b.i[3] = - ( ! a.i[3] ); return b; } PREFIX_UNARY(~) -# undef PREFIX_UNARY + #undef PREFIX_UNARY // v4int prefix increment / decrement -# define PREFIX_INCDEC(op) \ + #define PREFIX_INCDEC(op) \ inline v4int operator op( v4int & a ) \ { \ v4int b; \ @@ -612,11 +646,11 @@ namespace v4 PREFIX_INCDEC(++) PREFIX_INCDEC(--) -# undef PREFIX_INCDEC + #undef PREFIX_INCDEC // v4int postfix increment / decrement -# define POSTFIX_INCDEC(op) \ + #define POSTFIX_INCDEC(op) \ inline v4int operator op( v4int & a, int ) \ { \ v4int b; \ @@ -630,11 +664,11 @@ namespace v4 POSTFIX_INCDEC(++) POSTFIX_INCDEC(--) -# undef POSTFIX_INCDEC + #undef POSTFIX_INCDEC // v4int binary operators -# define BINARY(op) \ + #define BINARY(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ { \ v4int c; \ @@ -650,24 +684,24 @@ namespace v4 BINARY(*) BINARY(/) BINARY(%) + BINARY(<<) + BINARY(>>) BINARY(^) BINARY(&) BINARY(|) - BINARY(<<) - BINARY(>>) -# undef BINARY + #undef BINARY // v4int logical operators -# define LOGICAL(op) \ + #define LOGICAL(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ { \ v4int c; \ - c.i[0] = -(a.i[0] op b.i[0]); \ - c.i[1] = -(a.i[1] op b.i[1]); \ - c.i[2] = -(a.i[2] op b.i[2]); \ - c.i[3] = -(a.i[3] op b.i[3]); \ + c.i[0] = - ( a.i[0] op b.i[0] ); \ + c.i[1] = - ( a.i[1] op b.i[1] ); \ + c.i[2] = - 
( a.i[2] op b.i[2] ); \ + c.i[3] = - ( a.i[3] op b.i[3] ); \ return c; \ } @@ -680,7 +714,7 @@ namespace v4 LOGICAL(&&) LOGICAL(||) -# undef LOGICAL + #undef LOGICAL // v4int miscellaneous functions @@ -722,14 +756,14 @@ namespace v4 inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) { - v4 m; + v4 tf; - m.i[0] = ( f.i[0] & ~c.i[0] ) | ( t.i[0] & c.i[0] ); - m.i[1] = ( f.i[1] & ~c.i[1] ) | ( t.i[1] & c.i[1] ); - m.i[2] = ( f.i[2] & ~c.i[2] ) | ( t.i[2] & c.i[2] ); - m.i[3] = ( f.i[3] & ~c.i[3] ) | ( t.i[3] & c.i[3] ); + tf.i[0] = ( f.i[0] & ~c.i[0] ) | ( t.i[0] & c.i[0] ); + tf.i[1] = ( f.i[1] & ~c.i[1] ) | ( t.i[1] & c.i[1] ); + tf.i[2] = ( f.i[2] & ~c.i[2] ) | ( t.i[2] & c.i[2] ); + tf.i[3] = ( f.i[3] & ~c.i[3] ) | ( t.i[3] & c.i[3] ); - return m; + return tf; } //////////////// @@ -775,9 +809,9 @@ namespace v4 // v4float math library friends -# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE -# define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) ALWAYS_INLINE + #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE + #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ + const v4float &b ) ALWAYS_INLINE CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); @@ -787,8 +821,8 @@ namespace v4 CMATH_FR2(copysign); -# undef CMATH_FR1 -# undef CMATH_FR2 + #undef CMATH_FR1 + #undef CMATH_FR2 // v4float miscellaneous friends @@ -849,7 +883,7 @@ namespace v4 // v4float assignment operators -# define ASSIGN(op) \ + #define ASSIGN(op) \ inline v4float &operator op( const v4float &b ) \ { \ f[0] op b.f[0]; \ @@ -865,7 +899,7 @@ namespace v4 ASSIGN(*=) ASSIGN(/=) -# undef ASSIGN + #undef ASSIGN // v4float member access operator @@ -972,7 +1006,7 @@ namespace v4 // v4float binary operators -# define BINARY(op) \ + #define BINARY(op) \ inline v4float operator op( const v4float &a, 
const v4float &b ) \ { \ v4float c; \ @@ -988,11 +1022,11 @@ namespace v4 BINARY(*) BINARY(/) -# undef BINARY + #undef BINARY // v4float logical operators -# define LOGICAL(op) \ + #define LOGICAL(op) \ inline v4int operator op( const v4float &a, const v4float &b ) \ { \ v4int c; \ @@ -1006,17 +1040,17 @@ namespace v4 LOGICAL(< ) LOGICAL(> ) LOGICAL(==) - LOGICAL(!=) LOGICAL(<=) LOGICAL(>=) + LOGICAL(!=) LOGICAL(&&) LOGICAL(||) -# undef LOGICAL + #undef LOGICAL // v4float math library functions -# define CMATH_FR1(fn) \ + #define CMATH_FR1(fn) \ inline v4float fn( const v4float &a ) \ { \ v4float b; \ @@ -1027,7 +1061,7 @@ namespace v4 return b; \ } -# define CMATH_FR2(fn) \ + #define CMATH_FR2(fn) \ inline v4float fn( const v4float &a, const v4float &b ) \ { \ v4float c; \ @@ -1044,33 +1078,33 @@ namespace v4 CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) + #undef CMATH_FR1 + #undef CMATH_FR2 + inline v4float copysign( const v4float &a, const v4float &b ) { v4float c; float t; t = ::fabs( a.f[0] ); - if( b.f[0] < 0 ) t = -t; + if ( b.f[0] < 0 ) t = -t; c.f[0] = t; t = ::fabs( a.f[1] ); - if( b.f[1] < 0 ) t = -t; + if ( b.f[1] < 0 ) t = -t; c.f[1] = t; t = ::fabs( a.f[2] ); - if( b.f[2] < 0 ) t = -t; + if ( b.f[2] < 0 ) t = -t; c.f[2] = t; t = ::fabs( a.f[3] ); - if( b.f[3] < 0 ) t = -t; + if ( b.f[3] < 0 ) t = -t; c.f[3] = t; return c; } -# undef CMATH_FR1 -# undef CMATH_FR2 - // v4float miscellaneous functions inline v4float rsqrt_approx( const v4float &a ) @@ -1193,7 +1227,8 @@ namespace v4 return b; } - inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void increment_4x1( float * ALIGNED(16) p, + const v4float &a ) { p[0] += a.f[0]; p[1] += a.f[1]; @@ -1201,7 +1236,8 @@ namespace v4 p[3] += a.f[3]; } - inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void decrement_4x1( float * ALIGNED(16) p, + const v4float &a ) { p[0] -= a.f[0]; p[1] -= 
a.f[1]; @@ -1209,7 +1245,8 @@ namespace v4 p[3] -= a.f[3]; } - inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void scale_4x1( float * ALIGNED(16) p, + const v4float &a ) { p[0] *= a.f[0]; p[1] *= a.f[1]; @@ -1217,7 +1254,10 @@ namespace v4 p[3] *= a.f[3]; } - inline void trilinear( v4float & wl, v4float & wh ) + // Given wl = x y z w, compute: + // wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) + // wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) + inline void trilinear( v4float &wl, v4float &wh ) { float x = wl.f[0], y = wl.f[1], z = wl.f[2]; diff --git a/src/util/v4/v4_portable_v0.h b/src/util/v4/v4_portable_v0.h index 6a89939e..b192c514 100644 --- a/src/util/v4/v4_portable_v0.h +++ b/src/util/v4/v4_portable_v0.h @@ -48,17 +48,26 @@ namespace v4 friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; // v4 memory manipulation friends - friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) ALWAYS_INLINE; - friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) ALWAYS_INLINE; + + friend inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + + friend inline void stream_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void copy_4x1( void * ALIGNED(16) dst, const void * ALIGNED(16) src ) ALWAYS_INLINE; - friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + + friend inline void swap_4x1( void * ALIGNED(16) a, + 
void * ALIGNED(16) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends @@ -167,7 +176,7 @@ namespace v4 return b; } -# define sw(x,y) x^=y, y^=x, x^=y + #define sw(x,y) x^=y, y^=x, x^=y inline void swap( v4 &a, v4 &b ) { @@ -184,53 +193,52 @@ namespace v4 sw( a2.i[3],a3.i[2] ); } -# undef sw + #undef sw // v4 memory manipulation functions inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) { - a.i[0] = ((const int * ALIGNED(16))p)[0]; - a.i[1] = ((const int * ALIGNED(16))p)[1]; - a.i[2] = ((const int * ALIGNED(16))p)[2]; - a.i[3] = ((const int * ALIGNED(16))p)[3]; + a.i[0] = ( ( const int * ALIGNED(16) ) p )[0]; + a.i[1] = ( ( const int * ALIGNED(16) ) p )[1]; + a.i[2] = ( ( const int * ALIGNED(16) ) p )[2]; + a.i[3] = ( ( const int * ALIGNED(16) ) p )[3]; } inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) { - ((int * ALIGNED(16))p)[0] = a.i[0]; - ((int * ALIGNED(16))p)[1] = a.i[1]; - ((int * ALIGNED(16))p)[2] = a.i[2]; - ((int * ALIGNED(16))p)[3] = a.i[3]; + ( ( int * ALIGNED(16) ) p )[0] = a.i[0]; + ( ( int * ALIGNED(16) ) p )[1] = a.i[1]; + ( ( int * ALIGNED(16) ) p )[2] = a.i[2]; + ( ( int * ALIGNED(16) ) p )[3] = a.i[3]; } inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) { - ((int * ALIGNED(16))p)[0] = a.i[0]; - ((int * ALIGNED(16))p)[1] = a.i[1]; - ((int * ALIGNED(16))p)[2] = a.i[2]; - ((int * ALIGNED(16))p)[3] = a.i[3]; + ( ( int * ALIGNED(16) ) p )[0] = a.i[0]; + ( ( int * ALIGNED(16) ) p )[1] = a.i[1]; + ( ( int * ALIGNED(16) ) p )[2] = a.i[2]; + ( ( int * ALIGNED(16) ) p )[3] = a.i[3]; } inline void clear_4x1( void * ALIGNED(16) p ) { - ((int * ALIGNED(16))p)[0] = 0; - ((int * ALIGNED(16))p)[1] = 0; - ((int * ALIGNED(16))p)[2] = 0; - ((int * ALIGNED(16))p)[3] = 0; + ( ( int * ALIGNED(16) ) p )[0] = 0; + ( ( int * ALIGNED(16) ) p )[1] = 0; + ( ( int * ALIGNED(16) ) p )[2] = 0; + ( ( int * ALIGNED(16) ) p )[3] = 0; } - // FIXME: Ordering semantics inline void copy_4x1( void * ALIGNED(16) dst, const void * ALIGNED(16) 
src ) { - ((int * ALIGNED(16))dst)[0] = ((const int * ALIGNED(16))src)[0]; - ((int * ALIGNED(16))dst)[1] = ((const int * ALIGNED(16))src)[1]; - ((int * ALIGNED(16))dst)[2] = ((const int * ALIGNED(16))src)[2]; - ((int * ALIGNED(16))dst)[3] = ((const int * ALIGNED(16))src)[3]; + ( ( int * ALIGNED(16) ) dst )[0] = ( ( const int * ALIGNED(16) ) src )[0]; + ( ( int * ALIGNED(16) ) dst )[1] = ( ( const int * ALIGNED(16) ) src )[1]; + ( ( int * ALIGNED(16) ) dst )[2] = ( ( const int * ALIGNED(16) ) src )[2]; + ( ( int * ALIGNED(16) ) dst )[3] = ( ( const int * ALIGNED(16) ) src )[3]; } inline void swap_4x1( void * ALIGNED(16) a, @@ -238,175 +246,201 @@ namespace v4 { int t; - t = ((int * ALIGNED(16))a)[0]; - ((int * ALIGNED(16))a)[0] = ((int * ALIGNED(16))b)[0]; - ((int * ALIGNED(16))b)[0] = t; + t = ( ( int * ALIGNED(16) ) a )[0]; + + ( ( int * ALIGNED(16) ) a )[0] = ( ( int * ALIGNED(16) ) b )[0]; + ( ( int * ALIGNED(16) ) b )[0] = t; - t = ((int * ALIGNED(16))a)[1]; - ((int * ALIGNED(16))a)[1] = ((int * ALIGNED(16))b)[1]; - ((int * ALIGNED(16))b)[1] = t; + t = ( ( int * ALIGNED(16) ) a )[1]; - t = ((int * ALIGNED(16))a)[2]; - ((int * ALIGNED(16))a)[2] = ((int * ALIGNED(16))b)[2]; - ((int * ALIGNED(16))b)[2] = t; + ( ( int * ALIGNED(16) ) a )[1] = ( ( int * ALIGNED(16) ) b )[1]; + ( ( int * ALIGNED(16) ) b )[1] = t; - t = ((int * ALIGNED(16))a)[3]; - ((int * ALIGNED(16))a)[3] = ((int * ALIGNED(16))b)[3]; - ((int * ALIGNED(16))b)[3] = t; + t = ( ( int * ALIGNED(16) ) a )[2]; + + ( ( int * ALIGNED(16) ) a )[2] = ( ( int * ALIGNED(16) ) b )[2]; + ( ( int * ALIGNED(16) ) b )[2] = t; + + t = ( ( int * ALIGNED(16) ) a )[3]; + + ( ( int * ALIGNED(16) ) a )[3] = ( ( int * ALIGNED(16) ) b )[3]; + ( ( int * ALIGNED(16) ) b )[3] = t; } // v4 transposed memory manipulation functions - inline void load_4x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, + inline void load_4x1_tr( const void *a0, + const void *a1, + const void *a2, + const void *a3, v4 &a ) { - 
a.i[0] = ((const int *)a0)[0]; - a.i[1] = ((const int *)a1)[0]; - a.i[2] = ((const int *)a2)[0]; - a.i[3] = ((const int *)a3)[0]; + a.i[0] = ( (const int *) a0 )[0]; + a.i[1] = ( (const int *) a1 )[0]; + a.i[2] = ( (const int *) a2 )[0]; + a.i[3] = ( (const int *) a3 )[0]; } inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, - v4 &a, v4 &b ) + v4 &a, + v4 &b ) { - a.i[0] = ((const int * ALIGNED(8))a0)[0]; - b.i[0] = ((const int * ALIGNED(8))a0)[1]; + a.i[0] = ( ( const int * ALIGNED(8) ) a0 )[0]; + b.i[0] = ( ( const int * ALIGNED(8) ) a0 )[1]; - a.i[1] = ((const int * ALIGNED(8))a1)[0]; - b.i[1] = ((const int * ALIGNED(8))a1)[1]; + a.i[1] = ( ( const int * ALIGNED(8) ) a1 )[0]; + b.i[1] = ( ( const int * ALIGNED(8) ) a1 )[1]; - a.i[2] = ((const int * ALIGNED(8))a2)[0]; - b.i[2] = ((const int * ALIGNED(8))a2)[1]; + a.i[2] = ( ( const int * ALIGNED(8) ) a2 )[0]; + b.i[2] = ( ( const int * ALIGNED(8) ) a2 )[1]; - a.i[3] = ((const int * ALIGNED(8))a3)[0]; - b.i[3] = ((const int * ALIGNED(8))a3)[1]; + a.i[3] = ( ( const int * ALIGNED(8) ) a3 )[0]; + b.i[3] = ( ( const int * ALIGNED(8) ) a3 )[1]; } inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ) + v4 &a, + v4 &b, + v4 &c ) { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; + a.i[0] = ( ( const int * ALIGNED(16) ) a0 )[0]; + b.i[0] = ( ( const int * ALIGNED(16) ) a0 )[1]; + c.i[0] = ( ( const int * ALIGNED(16) ) a0 )[2]; - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; + a.i[1] = ( ( const int * ALIGNED(16) ) a1 )[0]; + b.i[1] = ( ( const int * ALIGNED(16) ) a1 )[1]; + c.i[1] = ( ( const int * ALIGNED(16) ) a1 )[2]; - a.i[2] = ((const int * 
ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; + a.i[2] = ( ( const int * ALIGNED(16) ) a2 )[0]; + b.i[2] = ( ( const int * ALIGNED(16) ) a2 )[1]; + c.i[2] = ( ( const int * ALIGNED(16) ) a2 )[2]; - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; + a.i[3] = ( ( const int * ALIGNED(16) ) a3 )[0]; + b.i[3] = ( ( const int * ALIGNED(16) ) a3 )[1]; + c.i[3] = ( ( const int * ALIGNED(16) ) a3 )[2]; } inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) + v4 &a, + v4 &b, + v4 &c, + v4 &d ) { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; - d.i[0] = ((const int * ALIGNED(16))a0)[3]; - - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; - d.i[1] = ((const int * ALIGNED(16))a1)[3]; - - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; - d.i[2] = ((const int * ALIGNED(16))a2)[3]; - - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; - d.i[3] = ((const int * ALIGNED(16))a3)[3]; + a.i[0] = ( ( const int * ALIGNED(16) ) a0 )[0]; + b.i[0] = ( ( const int * ALIGNED(16) ) a0 )[1]; + c.i[0] = ( ( const int * ALIGNED(16) ) a0 )[2]; + d.i[0] = ( ( const int * ALIGNED(16) ) a0 )[3]; + + a.i[1] = ( ( const int * ALIGNED(16) ) a1 )[0]; + b.i[1] = ( ( const int * ALIGNED(16) ) a1 )[1]; + c.i[1] = ( ( const int * ALIGNED(16) ) a1 )[2]; + d.i[1] = ( ( const int * ALIGNED(16) ) a1 )[3]; + + a.i[2] = ( ( const int * ALIGNED(16) ) a2 )[0]; + b.i[2] = ( ( const int * ALIGNED(16) ) a2 )[1]; + c.i[2] = 
( ( const int * ALIGNED(16) ) a2 )[2]; + d.i[2] = ( ( const int * ALIGNED(16) ) a2 )[3]; + + a.i[3] = ( ( const int * ALIGNED(16) ) a3 )[0]; + b.i[3] = ( ( const int * ALIGNED(16) ) a3 )[1]; + c.i[3] = ( ( const int * ALIGNED(16) ) a3 )[2]; + d.i[3] = ( ( const int * ALIGNED(16) ) a3 )[3]; } inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, - void *a2, void *a3 ) + void *a0, + void *a1, + void *a2, + void *a3 ) { - ((int *)a0)[0] = a.i[0]; - ((int *)a1)[0] = a.i[1]; - ((int *)a2)[0] = a.i[2]; - ((int *)a3)[0] = a.i[3]; + ( (int *) a0 )[0] = a.i[0]; + ( (int *) a1 )[0] = a.i[1]; + ( (int *) a2 )[0] = a.i[2]; + ( (int *) a3 )[0] = a.i[3]; } - inline void store_4x2_tr( const v4 &a, const v4 &b, - void * ALIGNED(8) a0, void * ALIGNED(8) a1, - void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) + inline void store_4x2_tr( const v4 &a, + const v4 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3 ) { - ((int * ALIGNED(8))a0)[0] = a.i[0]; - ((int * ALIGNED(8))a0)[1] = b.i[0]; + ( ( int * ALIGNED(8) ) a0 )[0] = a.i[0]; + ( ( int * ALIGNED(8) ) a0 )[1] = b.i[0]; - ((int * ALIGNED(8))a1)[0] = a.i[1]; - ((int * ALIGNED(8))a1)[1] = b.i[1]; + ( ( int * ALIGNED(8) ) a1 )[0] = a.i[1]; + ( ( int * ALIGNED(8) ) a1 )[1] = b.i[1]; - ((int * ALIGNED(8))a2)[0] = a.i[2]; - ((int * ALIGNED(8))a2)[1] = b.i[2]; + ( ( int * ALIGNED(8) ) a2 )[0] = a.i[2]; + ( ( int * ALIGNED(8) ) a2 )[1] = b.i[2]; - ((int * ALIGNED(8))a3)[0] = a.i[3]; - ((int * ALIGNED(8))a3)[1] = b.i[3]; + ( ( int * ALIGNED(8) ) a3 )[0] = a.i[3]; + ( ( int * ALIGNED(8) ) a3 )[1] = b.i[3]; } - inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + inline void store_4x3_tr( const v4 &a, + const v4 &b, + const v4 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * 
ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; + ( ( int * ALIGNED(16) ) a0 )[0] = a.i[0]; + ( ( int * ALIGNED(16) ) a0 )[1] = b.i[0]; + ( ( int * ALIGNED(16) ) a0 )[2] = c.i[0]; - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; + ( ( int * ALIGNED(16) ) a1 )[0] = a.i[1]; + ( ( int * ALIGNED(16) ) a1 )[1] = b.i[1]; + ( ( int * ALIGNED(16) ) a1 )[2] = c.i[1]; - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; + ( ( int * ALIGNED(16) ) a2 )[0] = a.i[2]; + ( ( int * ALIGNED(16) ) a2 )[1] = b.i[2]; + ( ( int * ALIGNED(16) ) a2 )[2] = c.i[2]; - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; + ( ( int * ALIGNED(16) ) a3 )[0] = a.i[3]; + ( ( int * ALIGNED(16) ) a3 )[1] = b.i[3]; + ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; } - inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + inline void store_4x4_tr( const v4 &a, + const v4 &b, + const v4 &c, + const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; - ((int * ALIGNED(16))a0)[3] = d.i[0]; - - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; - ((int * ALIGNED(16))a1)[3] = d.i[1]; - - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; - ((int * ALIGNED(16))a2)[3] = d.i[2]; - - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; - ((int * ALIGNED(16))a3)[3] = d.i[3]; + ( ( int * ALIGNED(16) ) a0 )[0] = a.i[0]; + ( ( int 
* ALIGNED(16) ) a0 )[1] = b.i[0]; + ( ( int * ALIGNED(16) ) a0 )[2] = c.i[0]; + ( ( int * ALIGNED(16) ) a0 )[3] = d.i[0]; + + ( ( int * ALIGNED(16) ) a1 )[0] = a.i[1]; + ( ( int * ALIGNED(16) ) a1 )[1] = b.i[1]; + ( ( int * ALIGNED(16) ) a1 )[2] = c.i[1]; + ( ( int * ALIGNED(16) ) a1 )[3] = d.i[1]; + + ( ( int * ALIGNED(16) ) a2 )[0] = a.i[2]; + ( ( int * ALIGNED(16) ) a2 )[1] = b.i[2]; + ( ( int * ALIGNED(16) ) a2 )[2] = c.i[2]; + ( ( int * ALIGNED(16) ) a2 )[3] = d.i[2]; + + ( ( int * ALIGNED(16) ) a3 )[0] = a.i[3]; + ( ( int * ALIGNED(16) ) a3 )[1] = b.i[3]; + ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; + ( ( int * ALIGNED(16) ) a3 )[3] = d.i[3]; } ////////////// @@ -527,7 +561,7 @@ namespace v4 // v4int assignment operators -# define ASSIGN(op) \ + #define ASSIGN(op) \ inline v4int &operator op( const v4int &b ) \ { \ i[0] op b.i[0]; \ @@ -537,19 +571,19 @@ namespace v4 return *this; \ } - ASSIGN( =) ASSIGN(+=) ASSIGN(-=) ASSIGN(*=) ASSIGN(/=) ASSIGN(%=) + ASSIGN(<<=) + ASSIGN(>>=) + ASSIGN( =) ASSIGN(^=) ASSIGN(&=) ASSIGN(|=) - ASSIGN(<<=) - ASSIGN(>>=) -# undef ASSIGN + #undef ASSIGN // v4int member access operator @@ -566,7 +600,7 @@ namespace v4 // v4int prefix unary operators -# define PREFIX_UNARY(op) \ + #define PREFIX_UNARY(op) \ inline v4int operator op( const v4int & a ) \ { \ v4int b; \ @@ -584,21 +618,21 @@ namespace v4 { v4int b; - b.i[0] = - ( !a.i[0] ); - b.i[1] = - ( !a.i[1] ); - b.i[2] = - ( !a.i[2] ); - b.i[3] = - ( !a.i[3] ); + b.i[0] = - ( ! a.i[0] ); + b.i[1] = - ( ! a.i[1] ); + b.i[2] = - ( ! a.i[2] ); + b.i[3] = - ( ! 
a.i[3] ); return b; } PREFIX_UNARY(~) -# undef PREFIX_UNARY + #undef PREFIX_UNARY // v4int prefix increment / decrement -# define PREFIX_INCDEC(op) \ + #define PREFIX_INCDEC(op) \ inline v4int operator op( v4int & a ) \ { \ v4int b; \ @@ -612,11 +646,11 @@ namespace v4 PREFIX_INCDEC(++) PREFIX_INCDEC(--) -# undef PREFIX_INCDEC + #undef PREFIX_INCDEC // v4int postfix increment / decrement -# define POSTFIX_INCDEC(op) \ + #define POSTFIX_INCDEC(op) \ inline v4int operator op( v4int & a, int ) \ { \ v4int b; \ @@ -630,11 +664,11 @@ namespace v4 POSTFIX_INCDEC(++) POSTFIX_INCDEC(--) -# undef POSTFIX_INCDEC + #undef POSTFIX_INCDEC // v4int binary operators -# define BINARY(op) \ + #define BINARY(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ { \ v4int c; \ @@ -650,17 +684,17 @@ namespace v4 BINARY(*) BINARY(/) BINARY(%) + BINARY(<<) + BINARY(>>) BINARY(^) BINARY(&) BINARY(|) - BINARY(<<) - BINARY(>>) -# undef BINARY + #undef BINARY // v4int logical operators -# define LOGICAL(op) \ + #define LOGICAL(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ { \ v4int c; \ @@ -680,7 +714,7 @@ namespace v4 LOGICAL(&&) LOGICAL(||) -# undef LOGICAL + #undef LOGICAL // v4int miscellaneous functions @@ -722,14 +756,14 @@ namespace v4 inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) { - v4 m; + v4 tf; - m.i[0] = ( f.i[0] & ~c.i[0] ) | ( t.i[0] & c.i[0] ); - m.i[1] = ( f.i[1] & ~c.i[1] ) | ( t.i[1] & c.i[1] ); - m.i[2] = ( f.i[2] & ~c.i[2] ) | ( t.i[2] & c.i[2] ); - m.i[3] = ( f.i[3] & ~c.i[3] ) | ( t.i[3] & c.i[3] ); + tf.i[0] = ( f.i[0] & ~c.i[0] ) | ( t.i[0] & c.i[0] ); + tf.i[1] = ( f.i[1] & ~c.i[1] ) | ( t.i[1] & c.i[1] ); + tf.i[2] = ( f.i[2] & ~c.i[2] ) | ( t.i[2] & c.i[2] ); + tf.i[3] = ( f.i[3] & ~c.i[3] ) | ( t.i[3] & c.i[3] ); - return m; + return tf; } //////////////// @@ -775,9 +809,9 @@ namespace v4 // v4float math library friends -# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE -# define 
CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) ALWAYS_INLINE + #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE + #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ + const v4float &b ) ALWAYS_INLINE CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); @@ -787,8 +821,8 @@ namespace v4 CMATH_FR2(copysign); -# undef CMATH_FR1 -# undef CMATH_FR2 + #undef CMATH_FR1 + #undef CMATH_FR2 // v4float miscellaneous friends @@ -849,7 +883,7 @@ namespace v4 // v4float assignment operators -# define ASSIGN(op) \ + #define ASSIGN(op) \ inline v4float &operator op( const v4float &b ) \ { \ f[0] op b.f[0]; \ @@ -865,7 +899,7 @@ namespace v4 ASSIGN(*=) ASSIGN(/=) -# undef ASSIGN + #undef ASSIGN // v4float member access operator @@ -972,7 +1006,7 @@ namespace v4 // v4float binary operators -# define BINARY(op) \ + #define BINARY(op) \ inline v4float operator op( const v4float &a, const v4float &b ) \ { \ v4float c; \ @@ -988,11 +1022,11 @@ namespace v4 BINARY(*) BINARY(/) -# undef BINARY + #undef BINARY // v4float logical operators -# define LOGICAL(op) \ + #define LOGICAL(op) \ inline v4int operator op( const v4float &a, const v4float &b ) \ { \ v4int c; \ @@ -1006,17 +1040,17 @@ namespace v4 LOGICAL(< ) LOGICAL(> ) LOGICAL(==) - LOGICAL(!=) LOGICAL(<=) LOGICAL(>=) + LOGICAL(!=) LOGICAL(&&) LOGICAL(||) -# undef LOGICAL + #undef LOGICAL // v4float math library functions -# define CMATH_FR1(fn) \ + #define CMATH_FR1(fn) \ inline v4float fn( const v4float &a ) \ { \ v4float b; \ @@ -1027,7 +1061,7 @@ namespace v4 return b; \ } -# define CMATH_FR2(fn) \ + #define CMATH_FR2(fn) \ inline v4float fn( const v4float &a, const v4float &b ) \ { \ v4float c; \ @@ -1044,33 +1078,33 @@ namespace v4 CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) + #undef CMATH_FR1 + #undef 
CMATH_FR2 + inline v4float copysign( const v4float &a, const v4float &b ) { v4float c; float t; t = ::fabs( a.f[0] ); - if( b.f[0] < 0 ) t = -t; + if ( b.f[0] < 0 ) t = -t; c.f[0] = t; t = ::fabs( a.f[1] ); - if( b.f[1] < 0 ) t = -t; + if ( b.f[1] < 0 ) t = -t; c.f[1] = t; t = ::fabs( a.f[2] ); - if( b.f[2] < 0 ) t = -t; + if ( b.f[2] < 0 ) t = -t; c.f[2] = t; t = ::fabs( a.f[3] ); - if( b.f[3] < 0 ) t = -t; + if ( b.f[3] < 0 ) t = -t; c.f[3] = t; return c; } -# undef CMATH_FR1 -# undef CMATH_FR2 - // v4float miscellaneous functions inline v4float rsqrt_approx( const v4float &a ) @@ -1193,7 +1227,8 @@ namespace v4 return b; } - inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void increment_4x1( float * ALIGNED(16) p, + const v4float &a ) { p[0] += a.f[0]; p[1] += a.f[1]; @@ -1201,7 +1236,8 @@ namespace v4 p[3] += a.f[3]; } - inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void decrement_4x1( float * ALIGNED(16) p, + const v4float &a ) { p[0] -= a.f[0]; p[1] -= a.f[1]; @@ -1209,7 +1245,8 @@ namespace v4 p[3] -= a.f[3]; } - inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void scale_4x1( float * ALIGNED(16) p, + const v4float &a ) { p[0] *= a.f[0]; p[1] *= a.f[1]; @@ -1217,7 +1254,10 @@ namespace v4 p[3] *= a.f[3]; } - inline void trilinear( v4float & wl, v4float & wh ) + // Given wl = x y z w, compute: + // wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) + // wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) + inline void trilinear( v4float &wl, v4float &wh ) { float x = wl.f[0], y = wl.f[1], z = wl.f[2]; diff --git a/src/util/v4/v4_portable_v1.h b/src/util/v4/v4_portable_v1.h index d67bf4b8..9a6cca87 100644 --- a/src/util/v4/v4_portable_v1.h +++ b/src/util/v4/v4_portable_v1.h @@ -60,17 +60,26 @@ namespace v4 friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; friend inline v4 notczero( const v4int &c, const v4 &a ) 
ALWAYS_INLINE; - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; // v4 memory manipulation friends - friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) ALWAYS_INLINE; - friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) ALWAYS_INLINE; + + friend inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + + friend inline void stream_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + friend inline void copy_4x1( void * ALIGNED(16) dst, const void * ALIGNED(16) src ) ALWAYS_INLINE; - friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + + friend inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends @@ -177,7 +186,7 @@ namespace v4 return b; } -# define sw(x,y) x^=y, y^=x, x^=y + #define sw(x,y) x^=y, y^=x, x^=y inline void swap( v4 &a, v4 &b ) { @@ -193,7 +202,7 @@ namespace v4 sw( a2.i[3],a3.i[2] ); } -# undef sw + #undef sw // v4 memory manipulation functions @@ -228,7 +237,6 @@ namespace v4 ((int * ALIGNED(16))p)[j] = 0; } - // FIXME: Ordering semantics inline void copy_4x1( void * ALIGNED(16) dst, const void * ALIGNED(16) src ) { @@ -253,156 +261,178 @@ namespace v4 // v4 transposed memory manipulation functions - inline void load_4x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, + inline void load_4x1_tr( const void *a0, + const void *a1, + const void *a2, + const void *a3, v4 &a ) { - a.i[0] = ((const int *)a0)[0]; - a.i[1] = ((const int *)a1)[0]; - a.i[2] = ((const int *)a2)[0]; - a.i[3] = ((const int *)a3)[0]; + a.i[0] = 
( (const int *) a0 )[0]; + a.i[1] = ( (const int *) a1 )[0]; + a.i[2] = ( (const int *) a2 )[0]; + a.i[3] = ( (const int *) a3 )[0]; } inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, - v4 &a, v4 &b ) + v4 &a, + v4 &b ) { - a.i[0] = ((const int * ALIGNED(8))a0)[0]; - b.i[0] = ((const int * ALIGNED(8))a0)[1]; + a.i[0] = ( ( const int * ALIGNED(8) ) a0 )[0]; + b.i[0] = ( ( const int * ALIGNED(8) ) a0 )[1]; - a.i[1] = ((const int * ALIGNED(8))a1)[0]; - b.i[1] = ((const int * ALIGNED(8))a1)[1]; + a.i[1] = ( ( const int * ALIGNED(8) ) a1 )[0]; + b.i[1] = ( ( const int * ALIGNED(8) ) a1 )[1]; - a.i[2] = ((const int * ALIGNED(8))a2)[0]; - b.i[2] = ((const int * ALIGNED(8))a2)[1]; + a.i[2] = ( ( const int * ALIGNED(8) ) a2 )[0]; + b.i[2] = ( ( const int * ALIGNED(8) ) a2 )[1]; - a.i[3] = ((const int * ALIGNED(8))a3)[0]; - b.i[3] = ((const int * ALIGNED(8))a3)[1]; + a.i[3] = ( ( const int * ALIGNED(8) ) a3 )[0]; + b.i[3] = ( ( const int * ALIGNED(8) ) a3 )[1]; } inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ) + v4 &a, + v4 &b, + v4 &c ) { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; + a.i[0] = ( ( const int * ALIGNED(16) ) a0 )[0]; + b.i[0] = ( ( const int * ALIGNED(16) ) a0 )[1]; + c.i[0] = ( ( const int * ALIGNED(16) ) a0 )[2]; - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; + a.i[1] = ( ( const int * ALIGNED(16) ) a1 )[0]; + b.i[1] = ( ( const int * ALIGNED(16) ) a1 )[1]; + c.i[1] = ( ( const int * ALIGNED(16) ) a1 )[2]; - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; + a.i[2] = ( ( const int * ALIGNED(16) ) a2 
)[0]; + b.i[2] = ( ( const int * ALIGNED(16) ) a2 )[1]; + c.i[2] = ( ( const int * ALIGNED(16) ) a2 )[2]; - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; + a.i[3] = ( ( const int * ALIGNED(16) ) a3 )[0]; + b.i[3] = ( ( const int * ALIGNED(16) ) a3 )[1]; + c.i[3] = ( ( const int * ALIGNED(16) ) a3 )[2]; } inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) + v4 &a, + v4 &b, + v4 &c, + v4 &d ) { - a.i[0] = ((const int * ALIGNED(16))a0)[0]; - b.i[0] = ((const int * ALIGNED(16))a0)[1]; - c.i[0] = ((const int * ALIGNED(16))a0)[2]; - d.i[0] = ((const int * ALIGNED(16))a0)[3]; - - a.i[1] = ((const int * ALIGNED(16))a1)[0]; - b.i[1] = ((const int * ALIGNED(16))a1)[1]; - c.i[1] = ((const int * ALIGNED(16))a1)[2]; - d.i[1] = ((const int * ALIGNED(16))a1)[3]; - - a.i[2] = ((const int * ALIGNED(16))a2)[0]; - b.i[2] = ((const int * ALIGNED(16))a2)[1]; - c.i[2] = ((const int * ALIGNED(16))a2)[2]; - d.i[2] = ((const int * ALIGNED(16))a2)[3]; - - a.i[3] = ((const int * ALIGNED(16))a3)[0]; - b.i[3] = ((const int * ALIGNED(16))a3)[1]; - c.i[3] = ((const int * ALIGNED(16))a3)[2]; - d.i[3] = ((const int * ALIGNED(16))a3)[3]; + a.i[0] = ( ( const int * ALIGNED(16) ) a0 )[0]; + b.i[0] = ( ( const int * ALIGNED(16) ) a0 )[1]; + c.i[0] = ( ( const int * ALIGNED(16) ) a0 )[2]; + d.i[0] = ( ( const int * ALIGNED(16) ) a0 )[3]; + + a.i[1] = ( ( const int * ALIGNED(16) ) a1 )[0]; + b.i[1] = ( ( const int * ALIGNED(16) ) a1 )[1]; + c.i[1] = ( ( const int * ALIGNED(16) ) a1 )[2]; + d.i[1] = ( ( const int * ALIGNED(16) ) a1 )[3]; + + a.i[2] = ( ( const int * ALIGNED(16) ) a2 )[0]; + b.i[2] = ( ( const int * ALIGNED(16) ) a2 )[1]; + c.i[2] = ( ( const int * ALIGNED(16) ) a2 )[2]; + d.i[2] = ( ( const int * ALIGNED(16) ) a2 )[3]; + + a.i[3] = ( ( const int * ALIGNED(16) ) a3 )[0]; + b.i[3] = ( 
( const int * ALIGNED(16) ) a3 )[1]; + c.i[3] = ( ( const int * ALIGNED(16) ) a3 )[2]; + d.i[3] = ( ( const int * ALIGNED(16) ) a3 )[3]; } inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, - void *a2, void *a3 ) + void *a0, + void *a1, + void *a2, + void *a3 ) { - ((int *)a0)[0] = a.i[0]; - ((int *)a1)[0] = a.i[1]; - ((int *)a2)[0] = a.i[2]; - ((int *)a3)[0] = a.i[3]; + ( (int *) a0 )[0] = a.i[0]; + ( (int *) a1 )[0] = a.i[1]; + ( (int *) a2 )[0] = a.i[2]; + ( (int *) a3 )[0] = a.i[3]; } - inline void store_4x2_tr( const v4 &a, const v4 &b, - void * ALIGNED(8) a0, void * ALIGNED(8) a1, - void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) + inline void store_4x2_tr( const v4 &a, + const v4 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3 ) { - ((int * ALIGNED(8))a0)[0] = a.i[0]; - ((int * ALIGNED(8))a0)[1] = b.i[0]; + ( ( int * ALIGNED(8) ) a0 )[0] = a.i[0]; + ( ( int * ALIGNED(8) ) a0 )[1] = b.i[0]; - ((int * ALIGNED(8))a1)[0] = a.i[1]; - ((int * ALIGNED(8))a1)[1] = b.i[1]; + ( ( int * ALIGNED(8) ) a1 )[0] = a.i[1]; + ( ( int * ALIGNED(8) ) a1 )[1] = b.i[1]; - ((int * ALIGNED(8))a2)[0] = a.i[2]; - ((int * ALIGNED(8))a2)[1] = b.i[2]; + ( ( int * ALIGNED(8) ) a2 )[0] = a.i[2]; + ( ( int * ALIGNED(8) ) a2 )[1] = b.i[2]; - ((int * ALIGNED(8))a3)[0] = a.i[3]; - ((int * ALIGNED(8))a3)[1] = b.i[3]; + ( ( int * ALIGNED(8) ) a3 )[0] = a.i[3]; + ( ( int * ALIGNED(8) ) a3 )[1] = b.i[3]; } - inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + inline void store_4x3_tr( const v4 &a, + const v4 &b, + const v4 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; + ( ( int * ALIGNED(16) ) a0 )[0] = a.i[0]; + ( ( int * ALIGNED(16) ) a0 )[1] = b.i[0]; 
+ ( ( int * ALIGNED(16) ) a0 )[2] = c.i[0]; - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; + ( ( int * ALIGNED(16) ) a1 )[0] = a.i[1]; + ( ( int * ALIGNED(16) ) a1 )[1] = b.i[1]; + ( ( int * ALIGNED(16) ) a1 )[2] = c.i[1]; - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; + ( ( int * ALIGNED(16) ) a2 )[0] = a.i[2]; + ( ( int * ALIGNED(16) ) a2 )[1] = b.i[2]; + ( ( int * ALIGNED(16) ) a2 )[2] = c.i[2]; - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; + ( ( int * ALIGNED(16) ) a3 )[0] = a.i[3]; + ( ( int * ALIGNED(16) ) a3 )[1] = b.i[3]; + ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; } - inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) + inline void store_4x4_tr( const v4 &a, + const v4 &b, + const v4 &c, + const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) { - ((int * ALIGNED(16))a0)[0] = a.i[0]; - ((int * ALIGNED(16))a0)[1] = b.i[0]; - ((int * ALIGNED(16))a0)[2] = c.i[0]; - ((int * ALIGNED(16))a0)[3] = d.i[0]; - - ((int * ALIGNED(16))a1)[0] = a.i[1]; - ((int * ALIGNED(16))a1)[1] = b.i[1]; - ((int * ALIGNED(16))a1)[2] = c.i[1]; - ((int * ALIGNED(16))a1)[3] = d.i[1]; - - ((int * ALIGNED(16))a2)[0] = a.i[2]; - ((int * ALIGNED(16))a2)[1] = b.i[2]; - ((int * ALIGNED(16))a2)[2] = c.i[2]; - ((int * ALIGNED(16))a2)[3] = d.i[2]; - - ((int * ALIGNED(16))a3)[0] = a.i[3]; - ((int * ALIGNED(16))a3)[1] = b.i[3]; - ((int * ALIGNED(16))a3)[2] = c.i[3]; - ((int * ALIGNED(16))a3)[3] = d.i[3]; + ( ( int * ALIGNED(16) ) a0 )[0] = a.i[0]; + ( ( int * ALIGNED(16) ) a0 )[1] = b.i[0]; + ( ( int * ALIGNED(16) ) a0 )[2] = c.i[0]; + ( ( int * ALIGNED(16) ) a0 )[3] = d.i[0]; + + ( ( int * ALIGNED(16) ) a1 
)[0] = a.i[1]; + ( ( int * ALIGNED(16) ) a1 )[1] = b.i[1]; + ( ( int * ALIGNED(16) ) a1 )[2] = c.i[1]; + ( ( int * ALIGNED(16) ) a1 )[3] = d.i[1]; + + ( ( int * ALIGNED(16) ) a2 )[0] = a.i[2]; + ( ( int * ALIGNED(16) ) a2 )[1] = b.i[2]; + ( ( int * ALIGNED(16) ) a2 )[2] = c.i[2]; + ( ( int * ALIGNED(16) ) a2 )[3] = d.i[2]; + + ( ( int * ALIGNED(16) ) a3 )[0] = a.i[3]; + ( ( int * ALIGNED(16) ) a3 )[1] = b.i[3]; + ( ( int * ALIGNED(16) ) a3 )[2] = c.i[3]; + ( ( int * ALIGNED(16) ) a3 )[3] = d.i[3]; } ////////////// @@ -520,7 +550,7 @@ namespace v4 // v4int assignment operators -# define ASSIGN(op) \ + #define ASSIGN(op) \ inline v4int &operator op( const v4int &b ) \ { \ ALWAYS_VECTORIZE \ @@ -529,19 +559,19 @@ namespace v4 return *this; \ } - ASSIGN( =) ASSIGN(+=) ASSIGN(-=) ASSIGN(*=) ASSIGN(/=) ASSIGN(%=) + ASSIGN(<<=) + ASSIGN(>>=) + ASSIGN( =) ASSIGN(^=) ASSIGN(&=) ASSIGN(|=) - ASSIGN(<<=) - ASSIGN(>>=) -# undef ASSIGN + #undef ASSIGN // v4int member access operator @@ -558,7 +588,7 @@ namespace v4 // v4int prefix unary operators -# define PREFIX_UNARY(op) \ + #define PREFIX_UNARY(op) \ inline v4int operator op( const v4int & a ) \ { \ v4int b; \ @@ -584,11 +614,11 @@ namespace v4 PREFIX_UNARY(~) -# undef PREFIX_UNARY + #undef PREFIX_UNARY // v4int prefix increment / decrement -# define PREFIX_INCDEC(op) \ + #define PREFIX_INCDEC(op) \ inline v4int operator op( v4int & a ) \ { \ v4int b; \ @@ -601,11 +631,11 @@ namespace v4 PREFIX_INCDEC(++) PREFIX_INCDEC(--) -# undef PREFIX_INCDEC + #undef PREFIX_INCDEC // v4int postfix increment / decrement -# define POSTFIX_INCDEC(op) \ + #define POSTFIX_INCDEC(op) \ inline v4int operator op( v4int & a, int ) \ { \ v4int b; \ @@ -618,11 +648,11 @@ namespace v4 POSTFIX_INCDEC(++) POSTFIX_INCDEC(--) -# undef POSTFIX_INCDEC + #undef POSTFIX_INCDEC // v4int binary operators -# define BINARY(op) \ + #define BINARY(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ { \ v4int c; \ @@ -637,17 +667,17 @@ namespace v4 
BINARY(*) BINARY(/) BINARY(%) + BINARY(<<) + BINARY(>>) BINARY(^) BINARY(&) BINARY(|) - BINARY(<<) - BINARY(>>) -# undef BINARY + #undef BINARY // v4int logical operators -# define LOGICAL(op) \ + #define LOGICAL(op) \ inline v4int operator op( const v4int &a, const v4int &b ) \ { \ v4int c; \ @@ -666,7 +696,7 @@ namespace v4 LOGICAL(&&) LOGICAL(||) -# undef LOGICAL + #undef LOGICAL // v4int miscellaneous functions @@ -705,13 +735,13 @@ namespace v4 inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) { - v4 m; + v4 tf; ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - m.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); + tf.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); - return m; + return tf; } //////////////// @@ -757,9 +787,9 @@ namespace v4 // v4float math library friends -# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE -# define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) ALWAYS_INLINE + #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE + #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ + const v4float &b ) ALWAYS_INLINE CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); @@ -769,8 +799,8 @@ namespace v4 CMATH_FR2(copysign); -# undef CMATH_FR1 -# undef CMATH_FR2 + #undef CMATH_FR1 + #undef CMATH_FR2 // v4float miscellaneous friends @@ -828,7 +858,7 @@ namespace v4 // v4float assignment operators -# define ASSIGN(op) \ + #define ASSIGN(op) \ inline v4float &operator op( const v4float &b ) \ { \ ALWAYS_VECTORIZE \ @@ -843,7 +873,7 @@ namespace v4 ASSIGN(*=) ASSIGN(/=) -# undef ASSIGN + #undef ASSIGN // v4float member access operator @@ -943,7 +973,7 @@ namespace v4 // v4float binary operators -# define BINARY(op) \ + #define BINARY(op) \ inline v4float operator op( const v4float &a, const v4float &b ) \ { \ v4float c; \ @@ -958,11 +988,11 @@ 
namespace v4 BINARY(*) BINARY(/) -# undef BINARY + #undef BINARY // v4float logical operators -# define LOGICAL(op) \ + #define LOGICAL(op) \ inline v4int operator op( const v4float &a, const v4float &b ) \ { \ v4int c; \ @@ -975,17 +1005,17 @@ namespace v4 LOGICAL(< ) LOGICAL(> ) LOGICAL(==) - LOGICAL(!=) LOGICAL(<=) LOGICAL(>=) + LOGICAL(!=) LOGICAL(&&) LOGICAL(||) -# undef LOGICAL + #undef LOGICAL // v4float math library functions -# define CMATH_FR1(fn) \ + #define CMATH_FR1(fn) \ inline v4float fn( const v4float &a ) \ { \ v4float b; \ @@ -995,7 +1025,7 @@ namespace v4 return b; \ } -# define CMATH_FR2(fn) \ + #define CMATH_FR2(fn) \ inline v4float fn( const v4float &a, const v4float &b ) \ { \ v4float c; \ @@ -1011,6 +1041,9 @@ namespace v4 CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) CMATH_FR1(sqrt) CMATH_FR1(tan) CMATH_FR1(tanh) + #undef CMATH_FR1 + #undef CMATH_FR2 + inline v4float copysign( const v4float &a, const v4float &b ) { v4float c; @@ -1027,9 +1060,6 @@ namespace v4 return c; } -# undef CMATH_FR1 -# undef CMATH_FR2 - // v4float miscellaneous functions inline v4float rsqrt_approx( const v4float &a ) @@ -1142,28 +1172,34 @@ namespace v4 return b; } - inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void increment_4x1( float * ALIGNED(16) p, + const v4float &a ) { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) p[j] += a.f[j]; } - inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void decrement_4x1( float * ALIGNED(16) p, + const v4float &a ) { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) p[j] -= a.f[j]; } - inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) + inline void scale_4x1( float * ALIGNED(16) p, + const v4float &a ) { ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) p[j] *= a.f[j]; } - inline void trilinear( v4float & wl, v4float & wh ) + // Given wl = x y z w, compute: + // wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) + // wh = 
(1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) + inline void trilinear( v4float &wl, v4float &wh ) { float x = wl.f[0], y = wl.f[1], z = wl.f[2]; diff --git a/src/util/v4/v4_sse.h b/src/util/v4/v4_sse.h index b2ed5dcb..5f9e7d9d 100644 --- a/src/util/v4/v4_sse.h +++ b/src/util/v4/v4_sse.h @@ -5,47 +5,41 @@ #error "Do not include v4_sse.h directly; use v4.h" #endif -#define V4_ACCELERATION -#define V4_SSE_ACCELERATION - #include #include +#define V4_ACCELERATION +#define V4_SSE_ACCELERATION + #ifndef ALIGNED #define ALIGNED(n) #endif -// FIXME: IN PORTABLE, ALTIVEC, SPU -// - UPDATE V4INT, V4FLOAT - -// This requires gcc-3.3 and up -// Also, Bug 12902 has not been resolved on gcc-3.x.x. See README.patches for -// details. gcc-4.x.x does not seem to have this bug but may suffer from -// other problems (use "-fno-strict-aliasing" on these platforms) - #define ALWAYS_INLINE __attribute__((always_inline)) -namespace v4 { - +namespace v4 +{ class v4; class v4int; class v4float; - template struct permute { + template + struct permute + { constexpr static int value = i0 + i1*4 + i2*16 + i3*64; - }; // permute + }; + + #define PERM(i0,i1,i2,i3) ((i0) + (i1)*4 + (i2)*16 + (i3)*64) -# define PERM(i0,i1,i2,i3) ((i0) + (i1)*4 + (i2)*16 + (i3)*64) - //////////////// // v4 base class - - class v4 { - + + class v4 + { friend class v4int; friend class v4float; - - // v4 miscellenous friends + + // v4 miscellaneous friends friend inline int any( const v4 &a ) ALWAYS_INLINE; friend inline int all( const v4 &a ) ALWAYS_INLINE; @@ -61,53 +55,68 @@ namespace v4 { // v4int miscellaneous friends - friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; + friend inline v4 czero( const v4int &c, const v4 &a ) ALWAYS_INLINE; friend inline v4 notczero( const v4int &c, const v4 &a ) ALWAYS_INLINE; - friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; + friend inline v4 merge( const v4int &c, const v4 &a, const v4 &b ) ALWAYS_INLINE; // 
v4 memory manipulation friends - - friend inline void load_4x1( const void * ALIGNED(16) p, v4 &a ) ALWAYS_INLINE; - friend inline void store_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void stream_4x1( const v4 &a, void * ALIGNED(16) p ) ALWAYS_INLINE; - friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; - friend inline void copy_4x1( void * ALIGNED(16) dst, - const void * ALIGNED(16) src ) ALWAYS_INLINE; - friend inline void swap_4x1( void * ALIGNED(16) a, void * ALIGNED(16) b ) ALWAYS_INLINE; + + friend inline void load_4x1( const void * ALIGNED(16) p, + v4 &a ) ALWAYS_INLINE; + + friend inline void store_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + + friend inline void stream_4x1( const v4 &a, + void * ALIGNED(16) p ) ALWAYS_INLINE; + + friend inline void clear_4x1( void * ALIGNED(16) dst ) ALWAYS_INLINE; + + friend inline void copy_4x1( void * ALIGNED(16) dst, + const void * ALIGNED(16) src ) ALWAYS_INLINE; + + friend inline void swap_4x1( void * ALIGNED(16) a, + void * ALIGNED(16) b ) ALWAYS_INLINE; // v4 transposed memory manipulation friends friend inline void load_4x1_tr( const void *a0, const void *a1, const void *a2, const void *a3, v4 &a ) ALWAYS_INLINE; + friend inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, v4 &a, v4 &b ) ALWAYS_INLINE; + friend inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, v4 &a, v4 &b, v4 &c ) ALWAYS_INLINE; + friend inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, v4 &a, v4 &b, v4 &c, v4 &d ) ALWAYS_INLINE; - + friend inline void store_4x1_tr( const v4 &a, void *a0, void *a1, void *a2, void *a3 ) ALWAYS_INLINE; + friend inline void store_4x2_tr( const v4 &a, const v4 &b, void * ALIGNED(8) a0, void * 
ALIGNED(8) a1, void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) ALWAYS_INLINE; + friend inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, void * ALIGNED(16) a0, void * ALIGNED(16) a1, void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) ALWAYS_INLINE; + friend inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, void * ALIGNED(16) a0, @@ -117,81 +126,102 @@ namespace v4 { protected: - union { + union + { int i[4]; float f[4]; __m128 v; }; - + public: v4() {} // Default constructor - v4(const v4 &a) { v=a.v; } // Copy constructor - ~v4() {} // Default destructor + v4( const v4 &a ) // Copy constructor + { + v = a.v; + } + + ~v4() {} // Default destructor }; - + // v4 miscellaneous functions - inline int any( const v4 &a ) { + inline int any( const v4 &a ) + { return a.i[0] || a.i[1] || a.i[2] || a.i[3]; } - - inline int all( const v4 &a ) { + + inline int all( const v4 &a ) + { return a.i[0] && a.i[1] && a.i[2] && a.i[3]; } - // Note: n MUST BE AN IMMEDIATE! template - inline v4 splat(const v4 & a) { - __m128 a_v = a.v; + inline v4 splat( const v4 & a ) + { v4 b; - b.v = _mm_shuffle_ps( a_v, a_v, (n*permute<1,1,1,1>::value)); + + b.v = _mm_shuffle_ps( a.v, a.v, ( n * permute<1,1,1,1>::value ) ); + return b; } - // Note: i0:3 MUST BE IMMEDIATES! 
*/ template - inline v4 shuffle( const v4 & a ) { - __m128 a_v = a.v; + inline v4 shuffle( const v4 & a ) + { v4 b; - b.v = _mm_shuffle_ps( a_v, a_v, (permute::value) ); + + b.v = _mm_shuffle_ps( a.v, a.v, ( permute::value ) ); + return b; } - inline void swap( v4 &a, v4 &b ) { - __m128 a_v = a.v; a.v = b.v; b.v = a_v; + inline void swap( v4 &a, v4 &b ) + { + __m128 t = a.v; + + a.v = b.v; + + b.v = t; } - inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) { + inline void transpose( v4 &a0, v4 &a1, v4 &a2, v4 &a3 ) + { __m128 a0_v = a0.v, a1_v = a1.v, a2_v = a2.v, a3_v = a3.v, t, u; + t = _mm_unpackhi_ps( a0_v, a1_v ); a0_v = _mm_unpacklo_ps( a0_v, a1_v ); u = _mm_unpackhi_ps( a2_v, a3_v ); a2_v = _mm_unpacklo_ps( a2_v, a3_v ); + a1_v = _mm_movehl_ps( a2_v, a0_v ); a0_v = _mm_movelh_ps( a0_v, a2_v ); a2_v = _mm_movelh_ps( t, u ); a3_v = _mm_movehl_ps( u, t ); - a0.v = a0_v; a1.v = a1_v; a2.v = a2_v; a3.v = a3_v; + + a0.v = a0_v; + a1.v = a1_v; + a2.v = a2_v; + a3.v = a3_v; } // v4 memory manipulation functions - + inline void load_4x1( const void * ALIGNED(16) p, - v4 &a ) + v4 &a ) { a.v = _mm_load_ps( ( float * ) p ); } inline void store_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { _mm_store_ps( ( float * ) p, a.v ); } inline void stream_4x1( const v4 &a, - void * ALIGNED(16) p ) + void * ALIGNED(16) p ) { _mm_stream_ps( ( float * ) p, a.v ); } @@ -207,9 +237,8 @@ namespace v4 { _mm_store_ps( ( float * ) dst, _mm_load_ps( ( const float * ) src ) ); } - /* FIXME: MAKE ROBUST AGAINST ALIASING ISSUES */ inline void swap_4x1( void * ALIGNED(16) a, - void * ALIGNED(16) b ) + void * ALIGNED(16) b ) { __m128 t = _mm_load_ps( ( float * ) a ); @@ -219,129 +248,192 @@ namespace v4 { // v4 transposed memory manipulation functions - inline void load_4x1_tr( const void *a0, const void *a1, - const void *a2, const void *a3, v4 &a ) { - a.v = _mm_setr_ps( ((const float *)a0)[0], - ((const float *)a1)[0], - ((const float *)a2)[0], - ((const float 
*)a3)[0] ); + inline void load_4x1_tr( const void *a0, + const void *a1, + const void *a2, + const void *a3, + v4 &a ) + { + a.v = _mm_setr_ps( ( (const float *) a0 )[0], + ( (const float *) a1 )[0], + ( (const float *) a2 )[0], + ( (const float *) a3 )[0] ); } inline void load_4x2_tr( const void * ALIGNED(8) a0, const void * ALIGNED(8) a1, const void * ALIGNED(8) a2, const void * ALIGNED(8) a3, - v4 &a, v4 &b ) { + v4 &a, + v4 &b ) + { __m128 a_v, b_v, t; + b_v = _mm_setzero_ps(); - t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a0 ), (__m64 *)a1 ); - b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *)a2 ), (__m64 *)a3 ); + + t = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *) a0 ), (__m64 *) a1 ); + b_v = _mm_loadh_pi( _mm_loadl_pi( b_v, (__m64 *) a2 ), (__m64 *) a3 ); + a_v = _mm_shuffle_ps( t, b_v, 0x88 ); b_v = _mm_shuffle_ps( t, b_v, 0xdd ); - a.v = a_v; b.v = b_v; + + a.v = a_v; + b.v = b_v; } inline void load_4x3_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c ) { + v4 &a, + v4 &b, + v4 &c ) + { __m128 a_v, b_v, c_v, t, u; - t = _mm_load_ps( (const float *)a0 ); - b_v = _mm_load_ps( (const float *)a1 ); - c_v = _mm_load_ps( (const float *)a2 ); - u = _mm_load_ps( (const float *)a3 ); + + t = _mm_load_ps( (const float *) a0 ); + b_v = _mm_load_ps( (const float *) a1 ); + c_v = _mm_load_ps( (const float *) a2 ); + u = _mm_load_ps( (const float *) a3 ); + a_v = _mm_unpacklo_ps( t, b_v ); b_v = _mm_unpackhi_ps( t, b_v ); + t = _mm_unpacklo_ps( c_v, u ); u = _mm_unpackhi_ps( c_v, u ); + c_v = _mm_movelh_ps( b_v, u ); b_v = _mm_movehl_ps( t, a_v ); a_v = _mm_movelh_ps( a_v, t ); - a.v = a_v; b.v = b_v; c.v = c_v; + + a.v = a_v; + b.v = b_v; + c.v = c_v; } inline void load_4x4_tr( const void * ALIGNED(16) a0, const void * ALIGNED(16) a1, const void * ALIGNED(16) a2, const void * ALIGNED(16) a3, - v4 &a, v4 &b, v4 &c, v4 &d ) { + v4 &a, + v4 &b, + v4 &c, + v4 &d ) + { __m128 a_v, 
b_v, c_v, d_v, t, u; - a_v = _mm_load_ps( (const float *)a0 ); - b_v = _mm_load_ps( (const float *)a1 ); - c_v = _mm_load_ps( (const float *)a2 ); - d_v = _mm_load_ps( (const float *)a3 ); + + a_v = _mm_load_ps( (const float *) a0 ); + b_v = _mm_load_ps( (const float *) a1 ); + c_v = _mm_load_ps( (const float *) a2 ); + d_v = _mm_load_ps( (const float *) a3 ); + t = _mm_unpackhi_ps( a_v, b_v ); a_v = _mm_unpacklo_ps( a_v, b_v ); + u = _mm_unpackhi_ps( c_v, d_v ); c_v = _mm_unpacklo_ps( c_v, d_v ); + b_v = _mm_movehl_ps( c_v, a_v ); a_v = _mm_movelh_ps( a_v, c_v ); c_v = _mm_movelh_ps( t, u ); d_v = _mm_movehl_ps( u, t ); - a.v = a_v; b.v = b_v; c.v = c_v; d.v = d_v; + + a.v = a_v; + b.v = b_v; + c.v = c_v; + d.v = d_v; } inline void store_4x1_tr( const v4 &a, - void *a0, void *a1, void *a2, void *a3 ) { - ((float *)a0)[0] = a.f[0]; - ((float *)a1)[0] = a.f[1]; - ((float *)a2)[0] = a.f[2]; - ((float *)a3)[0] = a.f[3]; + void *a0, + void *a1, + void *a2, + void *a3 ) + { + ( (float *) a0 )[0] = a.f[0]; + ( (float *) a1 )[0] = a.f[1]; + ( (float *) a2 )[0] = a.f[2]; + ( (float *) a3 )[0] = a.f[3]; + } + + inline void store_4x2_tr( const v4 &a, + const v4 &b, + void * ALIGNED(8) a0, + void * ALIGNED(8) a1, + void * ALIGNED(8) a2, + void * ALIGNED(8) a3 ) + { + __m128 a_v = a.v, b_v = b.v, t; + + t = _mm_unpacklo_ps( a_v, b_v ); // a0 b0 a1 b1 -> t + + _mm_storel_pi( (__m64 *) a0, t ); // a0 b0 -> a0 + _mm_storeh_pi( (__m64 *) a1, t ); // a1 b1 -> a1 + + t = _mm_unpackhi_ps( a_v, b_v ); // a2 b2 a3 b3 -> t + + _mm_storel_pi( (__m64 *) a2, t ); // a2 b2 -> a2 + _mm_storeh_pi( (__m64 *) a3, t ); // a3 b3 -> a3 } - inline void store_4x2_tr( const v4 &a, const v4 &b, - void * ALIGNED(8) a0, void * ALIGNED(8) a1, - void * ALIGNED(8) a2, void * ALIGNED(8) a3 ) { + inline void store_4x3_tr( const v4 &a, + const v4 &b, + const v4 &c, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) + { __m128 a_v = a.v, b_v = b.v, t; - t = 
_mm_unpacklo_ps(a_v,b_v); // a0 b0 a1 b1 -> t - _mm_storel_pi((__m64 *)a0,t); // a0 b0 -> a0 - _mm_storeh_pi((__m64 *)a1,t); // a1 b1 -> a1 - t = _mm_unpackhi_ps(a_v,b_v); // a2 b2 a3 b3 -> t - _mm_storel_pi((__m64 *)a2,t); // a2 b2 -> a2 - _mm_storeh_pi((__m64 *)a3,t); // a3 b3 -> a3 + + t = _mm_unpacklo_ps( a_v, b_v ); // a0 b0 a1 b1 -> t + + _mm_storel_pi( (__m64 *) a0, t ); // a0 b0 -> a0 + _mm_storeh_pi( (__m64 *) a1, t ); // a1 b1 -> a1 + + t = _mm_unpackhi_ps( a_v, b_v ); // a2 b2 a3 b3 -> t + + _mm_storel_pi( (__m64 *) a2, t ); // a2 b2 -> a2 + _mm_storeh_pi( (__m64 *) a3, t ); // a3 b3 -> a3 + + ( (float *) a0 )[2] = c.f[0]; + ( (float *) a1 )[2] = c.f[1]; + ( (float *) a2 )[2] = c.f[2]; + ( (float *) a3 )[2] = c.f[3]; } - inline void store_4x3_tr( const v4 &a, const v4 &b, const v4 &c, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) { - __m128 a_v = a.v, b_v = b.v, t; - t = _mm_unpacklo_ps(a_v,b_v); // a0 b0 a1 b1 -> t - _mm_storel_pi((__m64 *)a0,t); // a0 b0 -> a0 - _mm_storeh_pi((__m64 *)a1,t); // a1 b1 -> a1 - t = _mm_unpackhi_ps(a_v,b_v); // a2 b2 a3 b3 -> t - _mm_storel_pi((__m64 *)a2,t); // a2 b2 -> a2 - _mm_storeh_pi((__m64 *)a3,t); // a3 b3 -> a3 - ((float *)a0)[2] = c.f[0]; - ((float *)a1)[2] = c.f[1]; - ((float *)a2)[2] = c.f[2]; - ((float *)a3)[2] = c.f[3]; - } - - /* FIXME: IS THIS FASTER THAN THE OLD WAY (HAD MORE STORE INSTR) */ - inline void store_4x4_tr( const v4 &a, const v4 &b, const v4 &c, const v4 &d, - void * ALIGNED(16) a0, void * ALIGNED(16) a1, - void * ALIGNED(16) a2, void * ALIGNED(16) a3 ) { + inline void store_4x4_tr( const v4 &a, + const v4 &b, + const v4 &c, + const v4 &d, + void * ALIGNED(16) a0, + void * ALIGNED(16) a1, + void * ALIGNED(16) a2, + void * ALIGNED(16) a3 ) + { __m128 a_v = a.v, b_v = b.v, c_v = c.v, d_v = d.v, t, u; + t = _mm_unpackhi_ps( a_v, b_v ); a_v = _mm_unpacklo_ps( a_v, b_v ); u = _mm_unpackhi_ps( c_v, d_v ); c_v = _mm_unpacklo_ps( c_v, d_v ); + b_v = 
_mm_movehl_ps( c_v, a_v ); a_v = _mm_movelh_ps( a_v, c_v ); c_v = _mm_movelh_ps( t, u ); d_v = _mm_movehl_ps( u, t ); - _mm_store_ps( (float *)a0, a_v ); - _mm_store_ps( (float *)a1, b_v ); - _mm_store_ps( (float *)a2, c_v ); - _mm_store_ps( (float *)a3, d_v ); + + _mm_store_ps( (float *) a0, a_v ); + _mm_store_ps( (float *) a1, b_v ); + _mm_store_ps( (float *) a2, c_v ); + _mm_store_ps( (float *) a3, d_v ); } ////////////// // v4int class - class v4int : public v4 { - + class v4int : public v4 + { // v4int prefix unary operator friends friend inline v4int operator +( const v4int & a ) ALWAYS_INLINE; @@ -409,33 +501,61 @@ namespace v4 { // v4float miscellaneous friends - friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; public: // v4int constructors / destructors - + v4int() {} // Default constructor - v4int( const v4int &a ) { v = a.v; } // Copy constructor - v4int( const v4 &a ) { v = a.v; } // Init from mixed - v4int( int a ) { // Init from scalar - union { int i; float f; } u; + + v4int( const v4int &a ) // Copy constructor + { + v = a.v; + } + + v4int( const v4 &a ) // Init from mixed + { + v = a.v; + } + + v4int( int a ) // Init from scalar + { + union + { + int i; + float f; + } u; + u.i = a; - v = _mm_set1_ps( u.f ); + v = _mm_set1_ps( u.f ); } - v4int( int i0, int i1, int i2, int i3 ) { // Init from scalars - union { int i; float f; } u0, u1, u2, u3; - u0.i = i0; u1.i = i1; u2.i = i2; u3.i = i3; + + v4int( int i0, int i1, int i2, int i3 ) // Init from scalars + { + union + { + int i; + float f; + } u0, u1, u2, u3; + + u0.i = i0; + u1.i = i1; + u2.i = i2; + u3.i = i3; + v = _mm_setr_ps( 
u0.f, u1.f, u2.f, u3.f ); } - ~v4int() {}; // Destructor - + + ~v4int() {} // Destructor + // v4int assignment operators - -# define ASSIGN(op) \ - inline v4int &operator op( const v4int &b ) { \ + + #define ASSIGN(op) \ + inline v4int &operator op( const v4int &b ) \ + { \ i[0] op b.i[0]; \ i[1] op b.i[1]; \ i[2] op b.i[2]; \ @@ -443,121 +563,153 @@ namespace v4 { return *this; \ } - inline v4int &operator =(const v4int &b) { - v = b.v; - return *this; - } - ASSIGN(+=) ASSIGN(-=) ASSIGN(*=) ASSIGN(/=) ASSIGN(%=) + ASSIGN(<<=) + ASSIGN(>>=) - inline v4int &operator ^=(const v4int &b) { + #undef ASSIGN + + inline v4int &operator =( const v4int &b ) + { + v = b.v; + + return *this; + } + + inline v4int &operator ^=( const v4int &b ) + { v = _mm_xor_ps( v, b.v ); + return *this; } - inline v4int &operator &=(const v4int &b) { + inline v4int &operator &=( const v4int &b ) + { v = _mm_and_ps( v, b.v ); + return *this; } - inline v4int &operator |=(const v4int &b) { + inline v4int &operator |=( const v4int &b ) + { v = _mm_or_ps( v, b.v ); + return *this; } - ASSIGN(<<=) - ASSIGN(>>=) - -# undef ASSIGN - // v4int member access operator - - inline int &operator []( int n ) { return i[n]; } - inline int operator ()( int n ) { return i[n]; } + inline int &operator []( int n ) + { + return i[n]; + } + + inline int operator ()( int n ) + { + return i[n]; + } }; // v4int prefix unary operators -# define PREFIX_UNARY(op) \ - inline v4int operator op( const v4int & a ) { \ + #define PREFIX_UNARY(op) \ + inline v4int operator op( const v4int &a ) \ + { \ v4int b; \ - b.i[0] = (op a.i[0]); \ - b.i[1] = (op a.i[1]); \ - b.i[2] = (op a.i[2]); \ - b.i[3] = (op a.i[3]); \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ return b; \ } - inline v4int operator +( const v4int & a ) { + inline v4int operator +( const v4int &a ) + { v4int b; + b.v = a.v; + return b; } PREFIX_UNARY(-) - inline v4int operator !( const v4int & a ) 
{ + inline v4int operator !( const v4int &a ) + { v4int b; - b.i[0] = -(!a.i[0]); - b.i[1] = -(!a.i[1]); - b.i[2] = -(!a.i[2]); - b.i[3] = -(!a.i[3]); + + b.i[0] = - ( ! a.i[0] ); + b.i[1] = - ( ! a.i[1] ); + b.i[2] = - ( ! a.i[2] ); + b.i[3] = - ( ! a.i[3] ); + return b; } - inline v4int operator ~( const v4int & a ) { + inline v4int operator ~( const v4int &a ) + { v4int b; - union { int i; float f; } u; + + union + { + int i; + float f; + } u; + u.i = -1; + b.v = _mm_xor_ps( a.v, _mm_set1_ps( u.f ) ); + return b; } - -# undef PREFIX_UNARY + + #undef PREFIX_UNARY // v4int prefix increment / decrement -# define PREFIX_INCDEC(op) \ - inline v4int operator op( v4int & a ) { \ + #define PREFIX_INCDEC(op) \ + inline v4int operator op( v4int &a ) \ + { \ v4int b; \ - b.i[0] = (op a.i[0]); \ - b.i[1] = (op a.i[1]); \ - b.i[2] = (op a.i[2]); \ - b.i[3] = (op a.i[3]); \ + b.i[0] = ( op a.i[0] ); \ + b.i[1] = ( op a.i[1] ); \ + b.i[2] = ( op a.i[2] ); \ + b.i[3] = ( op a.i[3] ); \ return b; \ } PREFIX_INCDEC(++) PREFIX_INCDEC(--) -# undef PREFIX_INCDEC + #undef PREFIX_INCDEC // v4int postfix increment / decrement -# define POSTFIX_INCDEC(op) \ - inline v4int operator op( v4int & a, int ) { \ + #define POSTFIX_INCDEC(op) \ + inline v4int operator op( v4int &a, int ) \ + { \ v4int b; \ - b.i[0] = (a.i[0] op); \ - b.i[1] = (a.i[1] op); \ - b.i[2] = (a.i[2] op); \ - b.i[3] = (a.i[3] op); \ + b.i[0] = ( a.i[0] op ); \ + b.i[1] = ( a.i[1] op ); \ + b.i[2] = ( a.i[2] op ); \ + b.i[3] = ( a.i[3] op ); \ return b; \ } POSTFIX_INCDEC(++) POSTFIX_INCDEC(--) -# undef POSTFIX_INCDEC + #undef POSTFIX_INCDEC // v4int binary operators - -# define BINARY(op) \ - inline v4int operator op( const v4int &a, const v4int &b ) { \ + + #define BINARY(op) \ + inline v4int operator op( const v4int &a, const v4int &b ) \ + { \ v4int c; \ c.i[0] = a.i[0] op b.i[0]; \ c.i[1] = a.i[1] op b.i[1]; \ @@ -571,39 +723,48 @@ namespace v4 { BINARY(*) BINARY(/) BINARY(%) + BINARY(<<) + BINARY(>>) - inline v4int 
operator ^( const v4int &a, const v4int &b ) { + #undef BINARY + + inline v4int operator ^( const v4int &a, const v4int &b ) + { v4int c; + c.v = _mm_xor_ps( a.v, b.v ); + return c; } - inline v4int operator &( const v4int &a, const v4int &b ) { + inline v4int operator &( const v4int &a, const v4int &b ) + { v4int c; + c.v = _mm_and_ps( a.v, b.v ); + return c; } - inline v4int operator |( const v4int &a, const v4int &b ) { + inline v4int operator |( const v4int &a, const v4int &b ) + { v4int c; + c.v = _mm_or_ps( a.v, b.v ); + return c; } - BINARY(<<) - BINARY(>>) - -# undef BINARY - // v4int logical operators -# define LOGICAL(op) \ - inline v4int operator op( const v4int &a, const v4int &b ) { \ + #define LOGICAL(op) \ + inline v4int operator op( const v4int &a, const v4int &b ) \ + { \ v4int c; \ - c.i[0] = -(a.i[0] op b.i[0]); \ - c.i[1] = -(a.i[1] op b.i[1]); \ - c.i[2] = -(a.i[2] op b.i[2]); \ - c.i[3] = -(a.i[3] op b.i[3]); \ + c.i[0] = - ( a.i[0] op b.i[0] ); \ + c.i[1] = - ( a.i[1] op b.i[1] ); \ + c.i[2] = - ( a.i[2] op b.i[2] ); \ + c.i[3] = - ( a.i[3] op b.i[3] ); \ return c; \ } @@ -615,44 +776,58 @@ namespace v4 { LOGICAL(>=) LOGICAL(&&) LOGICAL(||) - -# undef LOGICAL + + #undef LOGICAL // v4int miscellaneous functions - inline v4int abs( const v4int &a ) { + inline v4int abs( const v4int &a ) + { v4int b; - b.i[0] = (a.i[0]>=0) ? a.i[0] : -a.i[0]; - b.i[1] = (a.i[1]>=0) ? a.i[1] : -a.i[1]; - b.i[2] = (a.i[2]>=0) ? a.i[2] : -a.i[2]; - b.i[3] = (a.i[3]>=0) ? a.i[3] : -a.i[3]; + + b.i[0] = ( a.i[0] >= 0 ) ? a.i[0] : -a.i[0]; + b.i[1] = ( a.i[1] >= 0 ) ? a.i[1] : -a.i[1]; + b.i[2] = ( a.i[2] >= 0 ) ? a.i[2] : -a.i[2]; + b.i[3] = ( a.i[3] >= 0 ) ? 
a.i[3] : -a.i[3]; + return b; } - inline v4 czero( const v4int &c, const v4 &a ) { + inline v4 czero( const v4int &c, const v4 &a ) + { v4 b; - b.v = _mm_andnot_ps(c.v,a.v); + + b.v = _mm_andnot_ps( c.v, a.v ); + return b; } - inline v4 notczero( const v4int &c, const v4 &a ) { + inline v4 notczero( const v4int &c, const v4 &a ) + { v4 b; - b.v = _mm_and_ps(c.v,a.v); + + b.v = _mm_and_ps( c.v, a.v ); + return b; } - - inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) { - __m128 c_v = c.v; + + inline v4 merge( const v4int &c, const v4 &t, const v4 &f ) + { v4 tf; - tf.v = _mm_or_ps(_mm_andnot_ps(c_v,f.v),_mm_and_ps(c_v,t.v)); + + __m128 c_v = c.v; + + tf.v = _mm_or_ps( _mm_andnot_ps( c_v, f.v ), + _mm_and_ps( c_v, t.v ) ); + return tf; } //////////////// // v4float class - class v4float : public v4 { - + class v4float : public v4 + { // v4float prefix unary operator friends friend inline v4float operator +( const v4float &a ) ALWAYS_INLINE; @@ -691,9 +866,9 @@ namespace v4 { // v4float math library friends -# define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE -# define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ - const v4float &b ) ALWAYS_INLINE + #define CMATH_FR1(fn) friend inline v4float fn( const v4float &a ) ALWAYS_INLINE + #define CMATH_FR2(fn) friend inline v4float fn( const v4float &a, \ + const v4float &b ) ALWAYS_INLINE CMATH_FR1(acos); CMATH_FR1(asin); CMATH_FR1(atan); CMATH_FR2(atan2); CMATH_FR1(ceil); CMATH_FR1(cos); CMATH_FR1(cosh); CMATH_FR1(exp); @@ -703,192 +878,252 @@ namespace v4 { CMATH_FR2(copysign); -# undef CMATH_FR1 -# undef CMATH_FR2 + #undef CMATH_FR1 + #undef CMATH_FR2 // v4float miscellaneous friends friend inline v4float rsqrt_approx( const v4float &a ) ALWAYS_INLINE; - friend inline v4float rsqrt( const v4float &a ) ALWAYS_INLINE; + friend inline v4float rsqrt ( const v4float &a ) ALWAYS_INLINE; friend inline v4float rcp_approx( const v4float &a ) ALWAYS_INLINE; - friend inline 
v4float rcp( const v4float &a ) ALWAYS_INLINE; - friend inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float rcp ( const v4float &a ) ALWAYS_INLINE; + friend inline v4float fma ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; + friend inline v4float fms ( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; friend inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) ALWAYS_INLINE; - friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; - friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float clear_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; + friend inline v4float set_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; friend inline v4float toggle_bits( const v4int &m, const v4float &a ) ALWAYS_INLINE; friend inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; friend inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; - // FIXME: crack + friend inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) ALWAYS_INLINE; friend inline void trilinear( v4float &wl, v4float &wh ) ALWAYS_INLINE; - + public: // v4float constructors / destructors - + v4float() {} // Default constructor - v4float( const v4float &a ) { v = a.v; } // Copy constructor - v4float( const v4 &a ) { v = a.v; } // Init from mixed - v4float( float a ) { // Init from scalar + + v4float( const v4float &a ) // Copy constructor + { + v = a.v; + } + + v4float( const v4 &a ) // Init from mixed + { + v = a.v; + } + + v4float( float a ) // Init from scalar + { v = _mm_set1_ps( a ); } - v4float( float f0, float f1, float f2, float f3 ) { // 
Init from scalars + + v4float( float f0, float f1, float f2, float f3 ) // Init from scalars + { v = _mm_setr_ps( f0, f1, f2, f3 ); } + ~v4float() {} // Destructor // v4float assignment operators -# define ASSIGN(op,intrin) \ - inline v4float &operator op(const v4float &b) { \ - v = intrin(v,b.v); \ - return *this; \ + #define ASSIGN(op,intrin) \ + inline v4float &operator op( const v4float &b ) \ + { \ + v = intrin( v, b.v ); \ + return *this; \ } - inline v4float &operator =(const v4float &b) { + ASSIGN( +=, _mm_add_ps ) + ASSIGN( -=, _mm_sub_ps ) + ASSIGN( *=, _mm_mul_ps ) + ASSIGN( /=, _mm_div_ps ) + + #undef ASSIGN + + inline v4float &operator =( const v4float &b ) + { v = b.v; + return *this; } - ASSIGN(+=,_mm_add_ps) - ASSIGN(-=,_mm_sub_ps) - ASSIGN(*=,_mm_mul_ps) - ASSIGN(/=,_mm_div_ps) - -# undef ASSIGN - // v4float member access operator - inline float &operator []( int n ) { return f[n]; } - inline float operator ()( int n ) { return f[n]; } + inline float &operator []( int n ) + { + return f[n]; + } + inline float operator ()( int n ) + { + return f[n]; + } }; // v4float prefix unary operators - inline v4float operator +( const v4float &a ) { + inline v4float operator +( const v4float &a ) + { v4float b; + b.v = a.v; + return b; } - inline v4float operator -( const v4float &a ) { + inline v4float operator -( const v4float &a ) + { v4float b; - b.v = _mm_sub_ps(_mm_setzero_ps(),a.v); + + b.v = _mm_sub_ps( _mm_setzero_ps(), a.v ); + return b; } - inline v4int operator !( const v4float &a ) { + inline v4int operator !( const v4float &a ) + { v4int b; - b.v = _mm_cmpeq_ps(_mm_setzero_ps(),a.v); + + b.v = _mm_cmpeq_ps( _mm_setzero_ps(), a.v ); + return b; } // v4float prefix increment / decrement operators - inline v4float operator ++( v4float &a ) { + inline v4float operator ++( v4float &a ) + { v4float b; + __m128 t = _mm_add_ps( a.v, _mm_set1_ps( 1 ) ); + a.v = t; b.v = t; + return b; } - inline v4float operator --( v4float &a ) { + inline v4float 
operator --( v4float &a ) + { v4float b; + __m128 t = _mm_sub_ps( a.v, _mm_set1_ps( 1 ) ); + a.v = t; b.v = t; + return b; } // v4float postfix increment / decrement operators - inline v4float operator ++( v4float &a, int ) { + inline v4float operator ++( v4float &a, int ) + { v4float b; + __m128 a_v = a.v; + a.v = _mm_add_ps( a_v, _mm_set1_ps( 1 ) ); b.v = a_v; + return b; } - inline v4float operator --( v4float &a, int ) { + inline v4float operator --( v4float &a, int ) + { v4float b; + __m128 a_v = a.v; - a.v = _mm_sub_ps(a_v, _mm_set1_ps( 1 ) ); + + a.v = _mm_sub_ps( a_v, _mm_set1_ps( 1 ) ); b.v = a_v; + return b; } // v4float binary operators - -# define BINARY(op,intrin) \ - inline v4float operator op( const v4float &a, const v4float &b ) { \ + + #define BINARY(op,intrin) \ + inline v4float operator op( const v4float &a, const v4float &b ) \ + { \ v4float c; \ - c.v = intrin(a.v,b.v); \ + c.v = intrin( a.v, b.v ); \ return c; \ } - BINARY(+,_mm_add_ps) - BINARY(-,_mm_sub_ps) - BINARY(*,_mm_mul_ps) - BINARY(/,_mm_div_ps) + BINARY( +, _mm_add_ps ) + BINARY( -, _mm_sub_ps ) + BINARY( *, _mm_mul_ps ) + BINARY( /, _mm_div_ps ) -# undef BINARY + #undef BINARY // v4float logical operators -# define LOGICAL(op,intrin) \ - inline v4int operator op( const v4float &a, const v4float &b ) { \ + #define LOGICAL(op,intrin) \ + inline v4int operator op( const v4float &a, const v4float &b ) \ + { \ v4int c; \ - c.v = intrin(a.v,b.v); \ + c.v = intrin( a.v, b.v ); \ return c; \ } - LOGICAL(<, _mm_cmplt_ps ) - LOGICAL(>, _mm_cmpgt_ps ) - LOGICAL(==,_mm_cmpeq_ps ) - LOGICAL(!=,_mm_cmpneq_ps) - LOGICAL(<=,_mm_cmple_ps ) - LOGICAL(>=,_mm_cmpge_ps ) + LOGICAL( <, _mm_cmplt_ps ) + LOGICAL( >, _mm_cmpgt_ps ) + LOGICAL( ==, _mm_cmpeq_ps ) + LOGICAL( <=, _mm_cmple_ps ) + LOGICAL( >=, _mm_cmpge_ps ) + LOGICAL( !=, _mm_cmpneq_ps ) - inline v4int operator &&( const v4float &a, const v4float &b ) { + #undef LOGICAL + + inline v4int operator &&( const v4float &a, const v4float &b ) + { 
v4int c; + __m128 vzero = _mm_setzero_ps(); - c.v = _mm_and_ps(_mm_cmpneq_ps(a.v,vzero),_mm_cmpneq_ps(b.v,vzero)); + + c.v = _mm_and_ps( _mm_cmpneq_ps( a.v, vzero ), + _mm_cmpneq_ps( b.v, vzero ) ); + return c; } - inline v4int operator ||( const v4float &a, const v4float &b ) { + inline v4int operator ||( const v4float &a, const v4float &b ) + { v4int c; + __m128 vzero = _mm_setzero_ps(); - c.v = _mm_or_ps(_mm_cmpneq_ps(a.v,vzero),_mm_cmpneq_ps(b.v,vzero)); + + c.v = _mm_or_ps( _mm_cmpneq_ps( a.v, vzero ), + _mm_cmpneq_ps( b.v, vzero ) ); + return c; } -# undef LOGICAL - // v4float math library functions -# define CMATH_FR1(fn) \ - inline v4float fn( const v4float &a ) { \ + #define CMATH_FR1(fn) \ + inline v4float fn( const v4float &a ) \ + { \ v4float b; \ - b.f[0] = ::fn(a.f[0]); \ - b.f[1] = ::fn(a.f[1]); \ - b.f[2] = ::fn(a.f[2]); \ - b.f[3] = ::fn(a.f[3]); \ + b.f[0] = ::fn( a.f[0] ); \ + b.f[1] = ::fn( a.f[1] ); \ + b.f[2] = ::fn( a.f[2] ); \ + b.f[3] = ::fn( a.f[3] ); \ return b; \ } -# define CMATH_FR2(fn) \ - inline v4float fn( const v4float &a, const v4float &b ) { \ + #define CMATH_FR2(fn) \ + inline v4float fn( const v4float &a, const v4float &b ) \ + { \ v4float c; \ - c.f[0] = ::fn(a.f[0],b.f[0]); \ - c.f[1] = ::fn(a.f[1],b.f[1]); \ - c.f[2] = ::fn(a.f[2],b.f[2]); \ - c.f[3] = ::fn(a.f[3],b.f[3]); \ + c.f[0] = ::fn( a.f[0], b.f[0] ); \ + c.f[1] = ::fn( a.f[1], b.f[1] ); \ + c.f[2] = ::fn( a.f[2], b.f[2] ); \ + c.f[3] = ::fn( a.f[3], b.f[3] ); \ return c; \ } @@ -898,148 +1133,230 @@ namespace v4 { CMATH_FR1(log10) CMATH_FR2(pow) CMATH_FR1(sin) CMATH_FR1(sinh) /*CMATH_FR1(sqrt)*/ CMATH_FR1(tan) CMATH_FR1(tanh) - inline v4float fabs( const v4float &a ) { + #undef CMATH_FR1 + #undef CMATH_FR2 + + inline v4float fabs( const v4float &a ) + { v4float b; - b.v = _mm_andnot_ps( _mm_set1_ps( -0.f ), a.v ); + + b.v = _mm_andnot_ps( _mm_set1_ps( -0.0f ), a.v ); + return b; } - inline v4float sqrt( const v4float &a ) { + inline v4float sqrt( const v4float &a ) 
+ { v4float b; - b.v = _mm_sqrt_ps(a.v); + + b.v = _mm_sqrt_ps( a.v ); + return b; } - inline v4float copysign( const v4float &a, const v4float &b ) { + inline v4float copysign( const v4float &a, const v4float &b ) + { v4float c; - __m128 t = _mm_set1_ps( -0.f ); - c.v = _mm_or_ps( _mm_and_ps( t, b.v ), _mm_andnot_ps( t, a.v ) ); + + __m128 t = _mm_set1_ps( -0.0f ); + + c.v = _mm_or_ps( _mm_and_ps( t, b.v ), + _mm_andnot_ps( t, a.v ) ); + return c; } -# undef CMATH_FR1 -# undef CMATH_FR2 + // v4float miscellaneous functions - // v4float miscelleanous functions - - inline v4float rsqrt_approx( const v4float &a ) { + inline v4float rsqrt_approx( const v4float &a ) + { v4float b; - b.v = _mm_rsqrt_ps(a.v); + + b.v = _mm_rsqrt_ps( a.v ); + return b; } - #if 0 - inline v4float rsqrt( const v4float &a ) { + inline v4float rsqrt( const v4float &a ) + { v4float b; + __m128 a_v = a.v, b_v; - b_v = _mm_rsqrt_ps(a_v); - // Note: It is quicker to just call div_ps and sqrt_ps if more - // refinement desired! 
- b.v = _mm_add_ps(b_v,_mm_mul_ps(_mm_set1_ps(0.5f), - _mm_sub_ps(b_v,_mm_mul_ps(a_v, - _mm_mul_ps(b_v, - _mm_mul_ps(b_v,b_v)))))); + + b_v = _mm_rsqrt_ps( a_v ); + + b.v = _mm_add_ps( b_v, _mm_mul_ps( _mm_set1_ps( 0.5f ), + _mm_sub_ps( b_v, + _mm_mul_ps( a_v, + _mm_mul_ps( b_v, + _mm_mul_ps( b_v, b_v ) + ) + ) + ) + ) + ); + return b; } - #endif - inline v4float rsqrt( const v4float &a ) { + #if 0 + inline v4float rsqrt( const v4float &a ) + { v4float b; - b.f[0] = ::sqrt( 1/a.f[0] ); - b.f[1] = ::sqrt( 1/a.f[1] ); - b.f[2] = ::sqrt( 1/a.f[2] ); - b.f[3] = ::sqrt( 1/a.f[3] ); + + b.f[0] = ::sqrt( 1 / a.f[0] ); + b.f[1] = ::sqrt( 1 / a.f[1] ); + b.f[2] = ::sqrt( 1 / a.f[2] ); + b.f[3] = ::sqrt( 1 / a.f[3] ); + return b; } + #endif - inline v4float rcp_approx( const v4float &a ) { + inline v4float rcp_approx( const v4float &a ) + { v4float b; - b.v = _mm_rcp_ps(a.v); + + b.v = _mm_rcp_ps( a.v ); + return b; } - #if 0 - inline v4float rcp( const v4float &a ) { + inline v4float rcp( const v4float &a ) + { v4float b; + __m128 a_v = a.v, b_v; - b_v = _mm_rcp_ps(a_v); - b.v = _mm_sub_ps(_mm_add_ps(b_v,b_v),_mm_mul_ps(a_v,_mm_mul_ps(b_v,b_v))); + + b_v = _mm_rcp_ps( a_v ); + + b.v = _mm_sub_ps( _mm_add_ps( b_v, b_v ), + _mm_mul_ps( a_v, + _mm_mul_ps( b_v, b_v ) + ) + ); + return b; } - #endif - inline v4float rcp( const v4float &a ) { + #if 0 + inline v4float rcp( const v4float &a ) + { v4float b; - b.f[0] = 1/a.f[0]; - b.f[1] = 1/a.f[1]; - b.f[2] = 1/a.f[2]; - b.f[3] = 1/a.f[3]; + + b.f[0] = 1 / a.f[0]; + b.f[1] = 1 / a.f[1]; + b.f[2] = 1 / a.f[2]; + b.f[3] = 1 / a.f[3]; + return b; } + #endif - inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) { + inline v4float fma( const v4float &a, const v4float &b, const v4float &c ) + { v4float d; + d.v = _mm_add_ps( _mm_mul_ps( a.v, b.v ), c.v ); + return d; } - inline v4float fms( const v4float &a, const v4float &b, const v4float &c ) { + inline v4float fms( const v4float &a, const v4float &b, const 
v4float &c ) + { v4float d; + d.v = _mm_sub_ps( _mm_mul_ps( a.v, b.v ), c.v ); + return d; } - inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) { + inline v4float fnms( const v4float &a, const v4float &b, const v4float &c ) + { v4float d; + d.v = _mm_sub_ps( c.v, _mm_mul_ps( a.v, b.v ) ); + return d; } - inline v4float clear_bits( const v4int &m, const v4float &a ) { + inline v4float clear_bits( const v4int &m, const v4float &a ) + { v4float b; + b.v = _mm_andnot_ps( m.v, a.v ); + return b; } - inline v4float set_bits( const v4int &m, const v4float &a ) { + inline v4float set_bits( const v4int &m, const v4float &a ) + { v4float b; + b.v = _mm_or_ps( m.v, a.v ); + return b; } - inline v4float toggle_bits( const v4int &m, const v4float &a ) { + inline v4float toggle_bits( const v4int &m, const v4float &a ) + { v4float b; + b.v = _mm_xor_ps( m.v, a.v ); + return b; } - inline void increment_4x1( float * ALIGNED(16) p, const v4float &a ) { + inline void increment_4x1( float * ALIGNED(16) p, + const v4float &a ) + { _mm_store_ps( p, _mm_add_ps( _mm_load_ps( p ), a.v ) ); } - inline void decrement_4x1( float * ALIGNED(16) p, const v4float &a ) { + inline void decrement_4x1( float * ALIGNED(16) p, + const v4float &a ) + { _mm_store_ps( p, _mm_sub_ps( _mm_load_ps( p ), a.v ) ); } - inline void scale_4x1( float * ALIGNED(16) p, const v4float &a ) { + inline void scale_4x1( float * ALIGNED(16) p, + const v4float &a ) + { _mm_store_ps( p, _mm_mul_ps( _mm_load_ps( p ), a.v ) ); } // Given wl = x y z w, compute: // wl = (1-x)(1-y)(1-z) (1+x)(1-y)(1-z) (1-x)(1+y)(1-z) (1+x)(1+y)(1-z) // wh = (1-x)(1-y)(1+z) (1+x)(1-y)(1+z) (1-x)(1+y)(1+z) (1+x)(1+y)(1+z) - inline void trilinear( v4float &wl, v4float &wh ) { - __m128 l = _mm_set1_ps(1), s = _mm_setr_ps(-0.f,+0.f,-0.f,+0.f); + inline void trilinear( v4float &wl, v4float &wh ) + { + __m128 l = _mm_set1_ps( 1.0f ); + + __m128 s = _mm_setr_ps( -0.0f, +0.0f, -0.0f, +0.0f ); + __m128 z = wl.v, xy; - xy = 
_mm_add_ps( l, _mm_xor_ps( s, _mm_shuffle_ps( z,z, PERM(0,0,1,1) ) ) ); - z = _mm_add_ps( l, _mm_xor_ps( s, _mm_shuffle_ps( z,z, PERM(2,2,2,2) ) ) ); - xy = _mm_mul_ps( _mm_shuffle_ps( xy,xy, PERM(0,1,0,1) ), - _mm_shuffle_ps( xy,xy, PERM(2,2,3,3) ) ); - wl.v = _mm_mul_ps( xy, _mm_shuffle_ps( z,z, PERM(0,0,0,0) ) ); - wh.v = _mm_mul_ps( xy, _mm_shuffle_ps( z,z, PERM(1,1,1,1) ) ); + + xy = _mm_add_ps( l, + _mm_xor_ps( s, + _mm_shuffle_ps( z, z, PERM(0,0,1,1) ) + ) + ); + + z = _mm_add_ps( l, + _mm_xor_ps( s, + _mm_shuffle_ps( z, z, PERM(2,2,2,2) ) + ) + ); + + xy = _mm_mul_ps( _mm_shuffle_ps( xy, xy, PERM(0,1,0,1) ), + _mm_shuffle_ps( xy, xy, PERM(2,2,3,3) ) ); + + wl.v = _mm_mul_ps( xy, _mm_shuffle_ps( z, z, PERM(0,0,0,0) ) ); + + wh.v = _mm_mul_ps( xy, _mm_shuffle_ps( z, z, PERM(1,1,1,1) ) ); } -# undef PERM + #undef PERM } // namespace v4 From d938848c6e942aa2871f6c81a3b79a1c5ed0f3f2 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 5 Aug 2019 10:48:42 -0600 Subject: [PATCH 43/95] Fix issue with undefined symbol. --- src/util/v4/v4_neon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/v4/v4_neon.h b/src/util/v4/v4_neon.h index 0152ad2b..d20dfc67 100644 --- a/src/util/v4/v4_neon.h +++ b/src/util/v4/v4_neon.h @@ -1039,7 +1039,7 @@ namespace v4 ALWAYS_VECTORIZE for( int j = 0; j < 4; j++ ) - m.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); + tf.i[j] = ( f.i[j] & ~c.i[j] ) | ( t.i[j] & c.i[j] ); return tf; } From 78aa8a4f00f85f95b17dc3ec4f8a94e2650c94e1 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 5 Aug 2019 12:18:07 -0600 Subject: [PATCH 44/95] Try a fix to a unit test failure for the Altivec case. 
--- src/util/v4/v4_altivec.h | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/src/util/v4/v4_altivec.h b/src/util/v4/v4_altivec.h index f1361278..d9438fc1 100644 --- a/src/util/v4/v4_altivec.h +++ b/src/util/v4/v4_altivec.h @@ -287,20 +287,37 @@ namespace v4 v4 &a, v4 &b ) { - _v4_float r = vec_ld( 0, (const float *) a0 ); // r = 0 1 2 3 - _v4_float s = vec_ld( 0, (const float *) a1 ); // s = 4 5 6 7 - _v4_float t = vec_ld( 0, (const float *) a2 ); // t = 8 9 10 11 - _v4_float u = vec_ld( 0, (const float *) a3 ); // u = 12 13 14 15 + _v4_float r, s, t, u; + + a.v = vec_ld( 0, (const float *) a0 ); // a = 0 1 2 3 + b.v = vec_ld( 0, (const float *) a1 ); // b = 4 5 6 7 + t = vec_ld( 0, (const float *) a2 ); // c = 8 9 10 11 + u = vec_ld( 0, (const float *) a3 ); // d = 12 13 14 15 + + // Step 1: Interleave top and bottom half + + r = vec_mergeh( a.v, t ); // r = 0 8 1 9 + s = vec_mergeh( b.v, u ); // s = 4 12 5 13 + + // Step 2: Interleave even and odd rows + + a.v = vec_mergeh( r, s ); // a = 0 4 8 12 + b.v = vec_mergel( r, s ); // b = 1 5 9 13 + + // _v4_float r = vec_ld( 0, (const float *) a0 ); // r = 0 1 2 3 + // _v4_float s = vec_ld( 0, (const float *) a1 ); // s = 4 5 6 7 + // _v4_float t = vec_ld( 0, (const float *) a2 ); // t = 8 9 10 11 + // _v4_float u = vec_ld( 0, (const float *) a3 ); // u = 12 13 14 15 // Step 1: Interleave top and bottom half - _v4_float v = vec_mergeh( r, t ); // v = 0 8 1 9 - _v4_float w = vec_mergeh( s, u ); // w = 4 12 5 13 + // _v4_float w = vec_mergeh( r, t ); // v = 0 8 1 9 + // _v4_float x = vec_mergeh( s, u ); // w = 4 12 5 13 // Step 2: Interleave even and odd rows - a.v = vec_mergeh( v, w ); // a = 0 4 8 12 - b.v = vec_mergel( v, w ); // b = 1 5 9 13 + // a.v = vec_mergeh( w, x ); // a = 0 4 8 12 + // b.v = vec_mergel( w, x ); // b = 1 5 9 13 } inline void load_4x3_tr( const void * ALIGNED(16) a0, From 562ec80a5e286fae704545653d37fde1ea8f88ef Mon Sep 17 00:00:00 2001 From: 
Dave Nystrom Date: Mon, 5 Aug 2019 14:52:06 -0600 Subject: [PATCH 45/95] Modify TEST_CASE_load_4x2_tr to see if I can get it to pass on IBM with Altivec. --- src/util/v4/test/v4.cc | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/util/v4/test/v4.cc b/src/util/v4/test/v4.cc index 73a51540..d2945cea 100644 --- a/src/util/v4/test/v4.cc +++ b/src/util/v4/test/v4.cc @@ -222,22 +222,24 @@ TEST_CASE("TEST_CASE_load_4x1_tr", "[v4]") { } // TEST_CASE TEST_CASE("TEST_CASE_load_4x2_tr", "[v4]") { - DECLARE_ALIGNED_ARRAY( int, 16, mem, 16 ); + DECLARE_ALIGNED_ARRAY( int, 16, mem, 32 ); v4int a0, a1, a2, a3; int i; - for( i=0; i<16; i++ ) mem[i] = i; - load_4x2_tr(mem, mem+4,mem+8, mem+12,a0,a1); - load_4x2_tr(mem+2,mem+6,mem+10,mem+14,a2,a3); - for( i=0; i<16; i++ ) if( mem[i]!=i ) break; + for( i=0; i<32; i++ ) mem[i] = i; + load_4x2_tr(mem, mem+4, mem+8, mem+12,a0,a1); + load_4x2_tr(mem+16,mem+20,mem+24,mem+28,a2,a3); + for( i=0; i<32; i++ ) if( mem[i]!=i ) break; //ASSERT_FALSE( any(a0!=v4int( 0, 4, 8,12)) || any(a1!=v4int( 1, 5, 9,13)) || //any(a2!=v4int( 2, 6,10,14)) || any(a3!=v4int( 3, 7,11,15)) || i!=16 ); REQUIRE( any(a0==v4int( 0, 4, 8,12)) ); REQUIRE( any(a1==v4int( 1, 5, 9,13)) ); - REQUIRE( any(a2==v4int( 2, 6,10,14)) ); - REQUIRE( any(a3==v4int( 3, 7,11,15)) ); - REQUIRE( i==16 ); + REQUIRE( any(a2==v4int(16,20,24,28)) ); + REQUIRE( any(a3==v4int(17,21,25,29)) ); + // REQUIRE( any(a2==v4int( 2, 6,10,14)) ); + // REQUIRE( any(a3==v4int( 3, 7,11,15)) ); + REQUIRE( i==32 ); } // TEST_CASE From cbcc9582a265747fae6fa3459b4c1a805d687ce9 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 5 Aug 2019 18:09:17 -0600 Subject: [PATCH 46/95] Remove a test hack. 
--- src/sf_interface/sf_interface.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/sf_interface/sf_interface.h b/src/sf_interface/sf_interface.h index fd9b72eb..6dc86883 100644 --- a/src/sf_interface/sf_interface.h +++ b/src/sf_interface/sf_interface.h @@ -52,15 +52,6 @@ #endif -// Temporary hack. -#ifdef V4_NEON_ACCELERATION_SNOUT - -#define PAD_SIZE_INTERPOLATOR 14 -#define PAD_SIZE_ACCUMULATOR 4 -#define PAD_SIZE_HYDRO 2 - -#endif - /*****************************************************************************/ // Interpolator arrays shall be a (nx+2) x (ny+2) x (nz+2) allocation From 10c2adba8404b2a0fefad398b8b72094e5ec2f81 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 5 Aug 2019 19:30:41 -0600 Subject: [PATCH 47/95] Remove some experimental NEON intrinsic code. --- .../standard/pipeline/center_p_pipeline_v4.cc | 146 ------------------ .../pipeline/uncenter_p_pipeline_v4.cc | 146 ------------------ 2 files changed, 292 deletions(-) diff --git a/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc b/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc index 2a25611f..dc6d5e18 100644 --- a/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc +++ b/src/species_advance/standard/pipeline/center_p_pipeline_v4.cc @@ -6,150 +6,6 @@ using namespace v4; -#ifdef V4_NEON_ACCELERATION_SNOUT - -void -center_p_pipeline_v4( center_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) -{ - const interpolator_t * ALIGNED(128) f0 = args->f0; - - particle_t * ALIGNED(128) p; - - const float * ALIGNED(16) vp00; - const float * ALIGNED(16) vp01; - const float * ALIGNED(16) vp02; - const float * ALIGNED(16) vp03; - - const v4float qdt_2mc( args->qdt_2mc); - const v4float qdt_4mc(0.5*args->qdt_2mc); // For half Boris rotate. 
- const v4float one(1.0); - const v4float one_third(1.0/3.0); - const v4float two_fifteenths(2.0/15.0); - - v4float dx, dy, dz, ux, uy, uz, q; - v4float hax, hay, haz, cbx, cby, cbz; - v4float v00, v01, v02, v03, v04, v05; - v4float v06, v07, v08, v09, v10; - v4int ii; - - int itmp, nq; - - // Determine which particle blocks this pipeline processes. - - DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, itmp, nq ); - - p = args->p0 + itmp; - - nq >>= 2; - - // Process the particle blocks for this pipeline. - - for( ; nq; nq--, p+=4 ) - { - //-------------------------------------------------------------------------- - // Load particle position data. - //-------------------------------------------------------------------------- - load_4x8_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, - dx, dy, dz, ii, ux, uy, uz, q ); - - // load_4x4_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, - // dx, dy, dz, ii ); - - //-------------------------------------------------------------------------- - // Set field interpolation pointers. - //-------------------------------------------------------------------------- - vp00 = ( const float * ALIGNED(16) ) ( f0 + ii(0) ); - vp01 = ( const float * ALIGNED(16) ) ( f0 + ii(1) ); - vp02 = ( const float * ALIGNED(16) ) ( f0 + ii(2) ); - vp03 = ( const float * ALIGNED(16) ) ( f0 + ii(3) ); - - //-------------------------------------------------------------------------- - // Load interpolation data for particles. - //-------------------------------------------------------------------------- - load_4x16_tr( vp00, vp01, vp02, vp03, - hax, v00, v01, v02, - hay, v03, v04, v05, - haz, v06, v07, v08, - cbx, v09, cby, v10 ); - - // load_4x4_tr( vp00, vp01, vp02, vp03, - // hax, v00, v01, v02 ); - - hax = qdt_2mc * fma( fma( dy, v02, v01 ), dz, fma( dy, v00, hax ) ); - - //-------------------------------------------------------------------------- - // Load interpolation data for particles. 
- //-------------------------------------------------------------------------- - // load_4x4_tr( vp00+4, vp01+4, vp02+4, vp03+4, - // hay, v03, v04, v05 ); - - hay = qdt_2mc * fma( fma( dz, v05, v04 ), dx, fma( dz, v03, hay ) ); - - //-------------------------------------------------------------------------- - // Load interpolation data for particles. - //-------------------------------------------------------------------------- - // load_4x4_tr( vp00+8, vp01+8, vp02+8, vp03+8, - // haz, v00, v01, v02 ); - - haz = qdt_2mc * fma( fma( dx, v08, v07 ), dy, fma( dx, v06, haz ) ); - - //-------------------------------------------------------------------------- - // Load interpolation data for particles. - //-------------------------------------------------------------------------- - // load_4x4_tr( vp00+12, vp01+12, vp02+12, vp03+12, - // cbx, v03, cby, v04 ); - - cbx = fma( v09, dx, cbx ); - cby = fma( v10, dy, cby ); - - //-------------------------------------------------------------------------- - // Load interpolation data for particles, final. - //-------------------------------------------------------------------------- - load_4x2_tr( vp00+16, vp01+16, vp02+16, vp03+16, - cbz, v05 ); - - cbz = fma( v05, dz, cbz ); - - //-------------------------------------------------------------------------- - // Load particle momentum data. Could use load_4x3_tr. - //-------------------------------------------------------------------------- - // load_4x4_tr( &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux, - // ux, uy, uz, q ); - - //-------------------------------------------------------------------------- - // Update momentum. 
- //-------------------------------------------------------------------------- - ux += hax; - uy += hay; - uz += haz; - - v00 = qdt_4mc * rsqrt( one + fma( ux, ux, fma( uy, uy, uz * uz ) ) ); - v01 = fma( cbx, cbx, fma( cby, cby, cbz * cbz ) ); - v02 = ( v00 * v00 ) * v01; - v03 = v00 * fma( v02, fma( v02, two_fifteenths, one_third ), one ); - v04 = v03 * rcp( fma( v03 * v03, v01, one ) ); - v04 += v04; - - v00 = fma( fms( uy, cbz, uz * cby ), v03, ux ); - v01 = fma( fms( uz, cbx, ux * cbz ), v03, uy ); - v02 = fma( fms( ux, cby, uy * cbx ), v03, uz ); - - ux = fma( fms( v01, cbz, v02 * cby ), v04, ux ); - uy = fma( fms( v02, cbx, v00 * cbz ), v04, uy ); - uz = fma( fms( v00, cby, v01 * cbx ), v04, uz ); - - //-------------------------------------------------------------------------- - // Store particle momentum data. Could use store_4x3_tr. - //-------------------------------------------------------------------------- - store_4x4_tr( ux, uy, uz, q, - &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux ); - } -} - -#else - void center_p_pipeline_v4( center_p_pipeline_args_t * args, int pipeline_rank, @@ -280,8 +136,6 @@ center_p_pipeline_v4( center_p_pipeline_args_t * args, } } -#endif - #else void diff --git a/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc b/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc index d4bfc425..3be32773 100644 --- a/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc +++ b/src/species_advance/standard/pipeline/uncenter_p_pipeline_v4.cc @@ -6,150 +6,6 @@ using namespace v4; -#ifdef V4_NEON_ACCELERATION_SNOUT - -void -uncenter_p_pipeline_v4( center_p_pipeline_args_t * args, - int pipeline_rank, - int n_pipeline ) -{ - const interpolator_t * ALIGNED(128) f0 = args->f0; - - particle_t * ALIGNED(128) p; - - const float * ALIGNED(16) vp00; - const float * ALIGNED(16) vp01; - const float * ALIGNED(16) vp02; - const float * ALIGNED(16) vp03; - - const v4float qdt_2mc( -args->qdt_2mc); // For backward half advance. 
- const v4float qdt_4mc(-0.5*args->qdt_2mc); // For backward half Boris rotate. - const v4float one(1.0); - const v4float one_third(1.0/3.0); - const v4float two_fifteenths(2.0/15.0); - - v4float dx, dy, dz, ux, uy, uz, q; - v4float hax, hay, haz, cbx, cby, cbz; - v4float v00, v01, v02, v03, v04, v05; - v4float v06, v07, v08, v09, v10; - v4int ii; - - int first, nq; - - // Determine which particle quads this pipeline processes. - - DISTRIBUTE( args->np, 16, pipeline_rank, n_pipeline, first, nq ); - - p = args->p0 + first; - - nq >>= 2; - - // Process the particle quads for this pipeline. - - for( ; nq; nq--, p+=4 ) - { - //-------------------------------------------------------------------------- - // Load particle position data. - //-------------------------------------------------------------------------- - load_4x8_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, - dx, dy, dz, ii, ux, uy, uz, q ); - - // load_4x4_tr( &p[0].dx, &p[1].dx, &p[2].dx, &p[3].dx, - // dx, dy, dz, ii ); - - //-------------------------------------------------------------------------- - // Set field interpolation pointers. - //-------------------------------------------------------------------------- - vp00 = ( const float * ALIGNED(16) ) ( f0 + ii(0) ); - vp01 = ( const float * ALIGNED(16) ) ( f0 + ii(1) ); - vp02 = ( const float * ALIGNED(16) ) ( f0 + ii(2) ); - vp03 = ( const float * ALIGNED(16) ) ( f0 + ii(3) ); - - //-------------------------------------------------------------------------- - // Load interpolation data for particles. 
- //-------------------------------------------------------------------------- - load_4x16_tr( vp00, vp01, vp02, vp03, - hax, v00, v01, v02, - hay, v03, v04, v05, - haz, v06, v07, v08, - cbx, v09, cby, v10 ); - - // load_4x4_tr( vp00, vp01, vp02, vp03, - // hax, v00, v01, v02 ); - - hax = qdt_2mc * fma( fma( dy, v02, v01 ), dz, fma( dy, v00, hax ) ); - - //-------------------------------------------------------------------------- - // Load interpolation data for particles. - //-------------------------------------------------------------------------- - // load_4x4_tr( vp00+4, vp01+4, vp02+4, vp03+4, - // hay, v03, v04, v05 ); - - hay = qdt_2mc * fma( fma( dz, v05, v04 ), dx, fma( dz, v03, hay ) ); - - //-------------------------------------------------------------------------- - // Load interpolation data for particles. - //-------------------------------------------------------------------------- - // load_4x4_tr( vp00+8, vp01+8, vp02+8, vp03+8, - // haz, v00, v01, v02 ); - - haz = qdt_2mc * fma( fma( dx, v08, v07 ), dy, fma( dx, v06, haz ) ); - - //-------------------------------------------------------------------------- - // Load interpolation data for particles. - //-------------------------------------------------------------------------- - // load_4x4_tr( vp00+12, vp01+12, vp02+12, vp03+12, - // cbx, v03, cby, v04 ); - - cbx = fma( v09, dx, cbx ); - cby = fma( v10, dy, cby ); - - //-------------------------------------------------------------------------- - // Load interpolation data for particles, final. - //-------------------------------------------------------------------------- - load_4x2_tr( vp00+16, vp01+16, vp02+16, vp03+16, - cbz, v05 ); - - cbz = fma( v05, dz, cbz ); - - //-------------------------------------------------------------------------- - // Load particle momentum data. Could use load_4x3_tr. 
- //-------------------------------------------------------------------------- - // load_4x4_tr( &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux, - // ux, uy, uz, q ); - - //-------------------------------------------------------------------------- - // Update momentum. - //-------------------------------------------------------------------------- - v00 = qdt_4mc * rsqrt( one + fma( ux, ux, fma( uy, uy, uz * uz ) ) ); - v01 = fma( cbx, cbx, fma( cby, cby, cbz * cbz ) ); - v02 = ( v00 * v00 ) * v01; - v03 = v00 * fma( v02, fma( v02, two_fifteenths, one_third ), one ); - v04 = v03 * rcp( fma( v03 * v03, v01, one ) ); - v04 += v04; - - v00 = fma( fms( uy, cbz, uz * cby ), v03, ux ); - v01 = fma( fms( uz, cbx, ux * cbz ), v03, uy ); - v02 = fma( fms( ux, cby, uy * cbx ), v03, uz ); - - ux = fma( fms( v01, cbz, v02 * cby ), v04, ux ); - uy = fma( fms( v02, cbx, v00 * cbz ), v04, uy ); - uz = fma( fms( v00, cby, v01 * cbx ), v04, uz ); - - ux += hax; - uy += hay; - uz += haz; - - //-------------------------------------------------------------------------- - // Store particle data. Could use store_4x3_tr. - //-------------------------------------------------------------------------- - store_4x4_tr( ux, uy, uz, q, - &p[0].ux, &p[1].ux, &p[2].ux, &p[3].ux ); - } -} - -#else - void uncenter_p_pipeline_v4( center_p_pipeline_args_t * args, int pipeline_rank, @@ -280,8 +136,6 @@ uncenter_p_pipeline_v4( center_p_pipeline_args_t * args, } } -#endif - #else void From 23144b8ecb82e1b8803b566232053b4e25dd46d5 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 5 Aug 2019 19:43:06 -0600 Subject: [PATCH 48/95] Remove more code that was used for testing ARM NEON intrinsics version. 
--- src/vpic/initialize.cc | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/vpic/initialize.cc b/src/vpic/initialize.cc index 4961559b..8cc28da0 100644 --- a/src/vpic/initialize.cc +++ b/src/vpic/initialize.cc @@ -51,12 +51,6 @@ vpic_simulation::initialize( int argc, if( rank()==0 ) MESSAGE(( "Uncentering particles" )); TIC load_interpolator_array( interpolator_array, field_array ); TOC( load_interpolator, 1 ); } - LIST_FOR_EACH( sp, species_list ) TIC sort_p( sp ); TOC( sort_p, 1 ); - for( int iwdn = 0; iwdn < 100; iwdn++ ) - { - LIST_FOR_EACH( sp, species_list ) TIC uncenter_p( sp, interpolator_array ); TOC( uncenter_p, 1 ); - LIST_FOR_EACH( sp, species_list ) TIC center_p( sp, interpolator_array ); TOC( center_p, 1 ); - } LIST_FOR_EACH( sp, species_list ) TIC uncenter_p( sp, interpolator_array ); TOC( uncenter_p, 1 ); if( rank()==0 ) MESSAGE(( "Performing initial diagnostics" )); From 679b6bbaee09b99e3aef15606d3c4097e93d08b1 Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 5 Aug 2019 19:52:16 -0600 Subject: [PATCH 49/95] Remove some dead code. 
--- src/util/v4/test/v4.cc | 59 ------------------------------------------ 1 file changed, 59 deletions(-) diff --git a/src/util/v4/test/v4.cc b/src/util/v4/test/v4.cc index d2945cea..b43ee5af 100644 --- a/src/util/v4/test/v4.cc +++ b/src/util/v4/test/v4.cc @@ -237,8 +237,6 @@ TEST_CASE("TEST_CASE_load_4x2_tr", "[v4]") { REQUIRE( any(a1==v4int( 1, 5, 9,13)) ); REQUIRE( any(a2==v4int(16,20,24,28)) ); REQUIRE( any(a3==v4int(17,21,25,29)) ); - // REQUIRE( any(a2==v4int( 2, 6,10,14)) ); - // REQUIRE( any(a3==v4int( 3, 7,11,15)) ); REQUIRE( i==32 ); } // TEST_CASE @@ -276,63 +274,6 @@ TEST_CASE("TEST_CASE_load_4x4_tr", "[v4]") { REQUIRE( i==16 ); } // TEST_CASE -#ifdef V4_NEON_ACCELERATION -TEST_CASE("TEST_CASE_load_4x8_tr", "[v4]") { - DECLARE_ALIGNED_ARRAY( int, 64, mem, 32 ); - v4int a0, a1, a2, a3, a4, a5, a6, a7; - int i; - for( i=0; i<32; i++ ) mem[i] = i; - load_4x8_tr(mem,mem+8,mem+16,mem+24,a0,a1,a2,a3,a4,a5,a6,a7); - for( i=0; i<32; i++ ) if( mem[i]!=i ) break; - //ASSERT_FALSE( any(a0!=v4int( 0, 4, 8,12)) || any(a1!=v4int( 1, 5, 9,13)) || - //any(a2!=v4int( 2, 6,10,14)) || any(a3!=v4int( 3, 7,11,15)) || i!=16 ); - - REQUIRE( any(a0==v4int( 0, 8, 16, 24 )) ); - REQUIRE( any(a1==v4int( 1, 9, 17, 25 )) ); - REQUIRE( any(a2==v4int( 2, 10, 18, 26 )) ); - REQUIRE( any(a3==v4int( 3, 11, 19, 27 )) ); - REQUIRE( any(a4==v4int( 4, 12, 20, 28 )) ); - REQUIRE( any(a5==v4int( 5, 13, 21, 29 )) ); - REQUIRE( any(a6==v4int( 6, 14, 22, 30 )) ); - REQUIRE( any(a7==v4int( 7, 15, 23, 31 )) ); - REQUIRE( i==32 ); -} // TEST_CASE -#endif - -#ifdef V4_NEON_ACCELERATION -TEST_CASE("TEST_CASE_load_4x16_tr", "[v4]") { - DECLARE_ALIGNED_ARRAY( int, 64, mem, 64 ); - v4int a00, a01, a02, a03, a04, a05, a06, a07; - v4int a08, a09, a10, a11, a12, a13, a14, a15; - int i; - for( i=0; i<64; i++ ) mem[i] = i; - load_4x16_tr(mem,mem+16,mem+32,mem+48, - a00,a01,a02,a03,a04,a05,a06,a07, - a08,a09,a10,a11,a12,a13,a14,a15); - for( i=0; i<64; i++ ) if( mem[i]!=i ) break; - //ASSERT_FALSE( 
any(a0!=v4int( 0, 4, 8,12)) || any(a1!=v4int( 1, 5, 9,13)) || - //any(a2!=v4int( 2, 6,10,14)) || any(a3!=v4int( 3, 7,11,15)) || i!=16 ); - - REQUIRE( any(a00==v4int( 0, 16, 32, 48 )) ); - REQUIRE( any(a01==v4int( 1, 17, 33, 49 )) ); - REQUIRE( any(a02==v4int( 2, 18, 34, 50 )) ); - REQUIRE( any(a03==v4int( 3, 19, 35, 51 )) ); - REQUIRE( any(a04==v4int( 4, 20, 36, 52 )) ); - REQUIRE( any(a05==v4int( 5, 21, 37, 53 )) ); - REQUIRE( any(a06==v4int( 6, 22, 38, 54 )) ); - REQUIRE( any(a07==v4int( 7, 23, 39, 55 )) ); - REQUIRE( any(a08==v4int( 8, 24, 40, 56 )) ); - REQUIRE( any(a09==v4int( 9, 25, 41, 57 )) ); - REQUIRE( any(a10==v4int( 10, 26, 42, 58 )) ); - REQUIRE( any(a11==v4int( 11, 27, 43, 59 )) ); - REQUIRE( any(a12==v4int( 12, 28, 44, 60 )) ); - REQUIRE( any(a13==v4int( 13, 29, 45, 61 )) ); - REQUIRE( any(a14==v4int( 14, 30, 46, 62 )) ); - REQUIRE( any(a15==v4int( 15, 31, 47, 63 )) ); - REQUIRE( i==64 ); -} // TEST_CASE -#endif - TEST_CASE("TEST_CASE_store_4x1_tr", "[v4]") { DECLARE_ALIGNED_ARRAY( int, 16, mem, 16 ); v4int a0( 0, 4, 8,12), a1( 1, 5, 9,13), a2( 2, 6,10,14), a3( 3, 7,11,15); From 41b7d3ee27992d17e90ebcd533ab14ff98bd2abc Mon Sep 17 00:00:00 2001 From: Dave Nystrom Date: Mon, 5 Aug 2019 20:13:01 -0600 Subject: [PATCH 50/95] Do not disable dynamic resizing since it is not disabled in github. --- src/boundary/boundary_p.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/boundary/boundary_p.cc b/src/boundary/boundary_p.cc index a50d6657..cfcb699e 100644 --- a/src/boundary/boundary_p.cc +++ b/src/boundary/boundary_p.cc @@ -5,7 +5,7 @@ // If this is defined particle and mover buffers will not resize dynamically. // This is the common case for the users. -#define DISABLE_DYNAMIC_RESIZING +// #define DISABLE_DYNAMIC_RESIZING // FIXME: ARCHITECTURAL FLAW! CUSTOM BCS AND SHARED FACES CANNOT // COEXIST ON THE SAME FACE! 
THIS MEANS THAT CUSTOM BOUNDARYS MUST From 8599273b5cc242fa6643443ec47535e94010b0ed Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 13 Aug 2019 10:40:43 -0600 Subject: [PATCH 51/95] added global partition data to grid --- src/grid/grid.h | 13 ++++++++----- src/grid/partition.cc | 25 +++++++++++++++---------- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/src/grid/grid.h b/src/grid/grid.h index 7654fe94..3167c7e6 100644 --- a/src/grid/grid.h +++ b/src/grid/grid.h @@ -1,4 +1,4 @@ -/* +/* * Written by: * Kevin J. Bowers, Ph.D. * Plasma Physics Group (X-1) @@ -46,7 +46,7 @@ enum grid_enums { // B_tang -> Symmetric | B_tang -> Anti-symmetric // E_norm -> Symmetric | E_norm -> Anti-symmetric (see note) // div B -> Symmetric | div B -> Anti-symmetric - // + // // Note: B_norm is tricky. For a symmetry plane, B_norm on the // boundary must be zero as there are no magnetic charges (a // non-zero B_norm would imply an infinitesimal layer of magnetic @@ -80,7 +80,7 @@ typedef struct grid { int64_t step; // Current timestep double t0; // Simulation time corresponding to step 0 - // Phase 2 grid data structures + // Phase 2 grid data structures float x0, y0, z0; // Min corner local domain (must be coherent) float x1, y1, z1; // Max corner local domain (must be coherent) int nx, ny, nz; // Local voxel mesh resolution. Voxels are @@ -99,6 +99,9 @@ typedef struct grid { // 0 ... nproc-1 ... comm boundary condition // <0 ... locally applied boundary condition + int gpx, gpy, gpz = -1; // Store global processor decomposition to let us figure + // out where we are in the global decomposition + // Phase 3 grid data structures // NOTE: VOXEL INDEXING LIMITS NUMBER OF VOXELS TO 2^31 (INCLUDING // GHOSTS) PER NODE. NEIGHBOR INDEXING FURTHER LIMITS TO @@ -147,7 +150,7 @@ typedef struct grid { // inner loops.) 
// // This is written with seeming extraneously if tests in order to get -// the compiler to generate branceless conditional move and add +// the compiler to generate branceless conditional move and add // instructions (none of the branches below are actual branches in // assembly). @@ -311,7 +314,7 @@ end_send_port( int i, // x port coord ([-1,0,1]) // ordering (e.g. inner loop increments x-index). // // jobs are indexed from 0 to n_job-1. jobs are _always_ have the -// number of voxels an integer multiple of the bundle size. If job +// number of voxels an integer multiple of the bundle size. If job // is set to n_job, this function will determine the parameters of // the final incomplete bundle. diff --git a/src/grid/partition.cc b/src/grid/partition.cc index 96664b78..fc554c2d 100644 --- a/src/grid/partition.cc +++ b/src/grid/partition.cc @@ -1,4 +1,4 @@ -/* +/* * Written by: * Kevin J. Bowers, Ph.D. * Plasma Physics Group (X-1) @@ -39,7 +39,7 @@ partition_periodic_box( grid_t * g, int gnx, int gny, int gnz, int gpx, int gpy, int gpz ) { double f; - int rank, px, py, pz; + int rank, px, py, pz; // Make sure the grid can be setup @@ -55,6 +55,11 @@ partition_periodic_box( grid_t * g, // Setup basic variables RANK_TO_INDEX( world_rank, px,py,pz ); + // Capture global processor decomposition + g->gpx = gpx; + g->gpy = gpy; + g->gpz = gpz; + g->dx = (gx1-gx0)/(double)gnx; g->dy = (gy1-gy0)/(double)gny; g->dz = (gz1-gz0)/(double)gnz; @@ -96,7 +101,7 @@ partition_absorbing_box( grid_t * g, int gnx, int gny, int gnz, int gpx, int gpy, int gpz, int pbc ) { - int px, py, pz; + int px, py, pz; partition_periodic_box( g, gx0, gy0, gz0, @@ -108,30 +113,30 @@ partition_absorbing_box( grid_t * g, RANK_TO_INDEX( world_rank, px,py,pz ); - if( px==0 && gnx>1 ) { + if( px==0 && gnx>1 ) { set_fbc(g,BOUNDARY(-1,0,0),absorb_fields); set_pbc(g,BOUNDARY(-1,0,0),pbc); - } + } if( px==gpx-1 && gnx>1 ) { set_fbc(g,BOUNDARY( 1,0,0),absorb_fields); set_pbc(g,BOUNDARY( 1,0,0),pbc); } - if( 
py==0 && gny>1 ) { + if( py==0 && gny>1 ) { set_fbc(g,BOUNDARY(0,-1,0),absorb_fields); set_pbc(g,BOUNDARY(0,-1,0),pbc); - } + } if( py==gpy-1 && gny>1 ) { set_fbc(g,BOUNDARY(0, 1,0),absorb_fields); set_pbc(g,BOUNDARY(0, 1,0),pbc); } - if( pz==0 && gnz>1 ) { + if( pz==0 && gnz>1 ) { set_fbc(g,BOUNDARY(0,0,-1),absorb_fields); set_pbc(g,BOUNDARY(0,0,-1),pbc); - } + } if( pz==gpz-1 && gnz>1 ) { set_fbc(g,BOUNDARY(0,0, 1),absorb_fields); @@ -148,7 +153,7 @@ partition_metal_box( grid_t * g, double gx1, double gy1, double gz1, int gnx, int gny, int gnz, int gpx, int gpy, int gpz ) { - int px, py, pz; + int px, py, pz; partition_periodic_box( g, gx0, gy0, gz0, From 08a2d2d84596d16240119ec64239066b3a43c1c5 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 13 Aug 2019 12:11:40 -0600 Subject: [PATCH 52/95] first pass adding a compiling port of viou for HDF5 vpic IO --- CMakeLists.txt | 56 +-- sample/harrisHDF5 | 432 +++++++++++++++++ src/vpic/dump.cc | 899 +++++++++++++++++++++++++++++++++++- src/vpic/hdf5_header_info.h | 259 +++++++++++ src/vpic/vpic.h | 8 + 5 files changed, 1619 insertions(+), 35 deletions(-) create mode 100644 sample/harrisHDF5 create mode 100644 src/vpic/hdf5_header_info.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 7fcc1027..f7fd9d84 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,10 +73,10 @@ option(USE_LEGACY_SORT "Enable Legacy Sort Implementation" OFF) option(VPIC_PRINT_MORE_DIGITS "Print more digits in VPIC timer info" OFF) -option(ENABLE_OPENSSL "Enable OpenSSL support for checksums" OFF) - option(DISABLE_DYNAMIC_RESIZING "Prevent particle arrays from dynamically resizing during a run" OFF) +option(USE_HDF5 "Enable HDF5 for use during IO. 
VPIC does not help you install HDF5" OFF) + # option to set minimum number of particles set(SET_MIN_NUM_PARTICLES AUTO CACHE STRING "Select minimum number of particles to use, if using dynamic particle array resizing") @@ -114,24 +114,11 @@ if(NOT SET_MIN_NUM_PARTICLES STREQUAL "AUTO") add_definitions(-DMIN_NP=${SET_MIN_NUM_PARTICLES}) endif() -#------------------------------------------------------------------------------# -# OpenSSL -#------------------------------------------------------------------------------# - -if(ENABLE_OPENSSL) - find_package(OpenSSL REQUIRED) - - include_directories(${OPENSSL_INCLUDE_DIR}) - string(REPLACE ";" " " string_libraries "${OPENSSL_LIBRARIES}") - set(VPIC_CXX_LIBRARIES "${VPIC_CXX_LIBRARIES} ${string_libraries}") -endif(ENABLE_OPENSSL) - find_package(Threads REQUIRED) #------------------------------------------------------------------------------# # Act on build options set in project.cmake #------------------------------------------------------------------------------# - #------------------------------------------------------------------------------# # Add options for building with the legacy particle sort implementation. #------------------------------------------------------------------------------# @@ -277,10 +264,6 @@ endif() # Miscellaneous options. 
#------------------------------------------------------------------------------# -if(ENABLE_OPENSSL) - add_definitions(-DENABLE_OPENSSL) -endif(ENABLE_OPENSSL) - if(VPIC_PRINT_MORE_DIGITS) add_definitions(-DVPIC_PRINT_MORE_DIGITS) set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DVPIC_PRINT_MORE_DIGITS") @@ -323,18 +306,6 @@ install(FILES ${CMAKE_SOURCE_DIR}/deck/main.cc install(FILES ${CMAKE_SOURCE_DIR}/deck/wrapper.cc DESTINATION share/vpic) -# local script -configure_file(${CMAKE_SOURCE_DIR}/bin/vpic-local.in - ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic) - -file(COPY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic - DESTINATION ${CMAKE_BINARY_DIR}/bin - FILE_PERMISSIONS - OWNER_READ OWNER_WRITE OWNER_EXECUTE - GROUP_READ GROUP_EXECUTE - WORLD_READ WORLD_EXECUTE -) - #------------------------------------------------------------------------------# # Add library target #------------------------------------------------------------------------------# @@ -358,8 +329,29 @@ else() set(VPIC_SRC) install(TARGETS vpic LIBRARY DESTINATION lib ARCHIVE DESTINATION lib) endif() + +if(USE_HDF5) + # Enable HDF5, and the relevant defines + find_package(HDF5 REQUIRED) + add_definitions(-DVPIC_ENABLE_HDF5) + string(REPLACE ";" " " string_libraries "${HDF5_C_LIBRARIES}") + set(VPIC_CXX_LIBRARIES "${VPIC_CXX_LIBRARIES} ${string_libraries}") +endif(USE_HDF5) + +# Configure local script to generate bin/vpic +configure_file(${CMAKE_SOURCE_DIR}/bin/vpic-local.in + ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic) + +file(COPY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic + DESTINATION ${CMAKE_BINARY_DIR}/bin + FILE_PERMISSIONS + OWNER_READ OWNER_WRITE OWNER_EXECUTE + GROUP_READ GROUP_EXECUTE + WORLD_READ WORLD_EXECUTE +) + target_include_directories(vpic INTERFACE ${CMAKE_SOURCE_DIR}/src) -target_link_libraries(vpic ${VPIC_EXPOSE} ${MPI_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} ${OPENSSL_LIBRARIES} ${CMAKE_DL_LIBS}) +target_link_libraries(vpic ${VPIC_EXPOSE} ${MPI_CXX_LIBRARIES} 
${CMAKE_THREAD_LIBS_INIT} ${CMAKE_DL_LIBS} ${HDF5_C_LIBRARIES}) target_compile_options(vpic ${VPIC_EXPOSE} ${MPI_C_COMPILE_FLAGS}) macro(build_a_vpic name deck) diff --git a/sample/harrisHDF5 b/sample/harrisHDF5 new file mode 100644 index 00000000..ff565f33 --- /dev/null +++ b/sample/harrisHDF5 @@ -0,0 +1,432 @@ +// Magnetic reconnection in a Harris equilibrium thin current sheet +// +// This input deck reproduces the PIC simulations found in: +// William Daughton. "Nonlinear dynamics of thin current sheets." Phys. +// Plasmas. 9(9): 3668-3678. September 2002. +// +// This input deck was written by: +// Kevin J Bowers, Ph.D. +// Plasma Physics Group (X-1) +// Applied Physics Division +// Los Alamos National Lab +// August 2003 - original version +// October 2003 - heavily revised to utilize input deck syntactic sugar +// March/April 2004 - rewritten for domain decomposition V4PIC + +// If you want to use global variables (for example, to store the dump +// intervals for your diagnostics section), it must be done in the globals +// section. Variables declared the globals section will be preserved across +// restart dumps. For example, if the globals section is: +// begin_globals { +// double variable; +// } end_globals +// the double "variable" will be visible to other input deck sections as +// "global->variable". Note: Variables declared in the globals section are set +// to zero before the user's initialization block is executed. Up to 16K +// of global variables can be defined. + +begin_globals { + double energies_interval; + double fields_interval; + double ehydro_interval; + double ihydro_interval; + double eparticle_interval; + double iparticle_interval; + double restart_interval; +}; + +begin_initialization { + // At this point, there is an empty grid and the random number generator is + // seeded with the rank. The grid, materials, species need to be defined. 
+ // Then the initial non-zero fields need to be loaded at time level 0 and the + // particles (position and momentum both) need to be loaded at time level 0. + + double input_mass_ratio; + int input_seed; + + // Arguments can be passed from the command line to the input deck + if( num_cmdline_arguments!=3 ) { + // Set sensible defaults + input_mass_ratio = 1.0; + input_seed = 0; + + sim_log( "Defaulting to mass_ratio of " << input_mass_ratio << " and seed of " << input_seed ); + sim_log( "For Custom Usage: " << cmdline_argument[0] << " mass_ratio seed" ); + } + else { + input_mass_ratio = atof(cmdline_argument[1]); // Ion mass / electron mass + input_seed = atof(cmdline_argument[2]); // Ion mass / electron mass + sim_log( "Detected input mass_ratio of " << input_mass_ratio << " and seed of " << input_seed ); + } + seed_entropy( input_seed ); + + // Diagnostic messages can be passed written (usually to stderr) + sim_log( "Computing simulation parameters"); + + // Define the system of units for this problem (natural units) + double L = 1; // Length normalization (sheet thickness) + double ec = 1; // Charge normalization + double me = 1; // Mass normalization + double c = 1; // Speed of light + double eps0 = 1; // Permittivity of space + + // Physics parameters + double mi_me = input_mass_ratio; // Ion mass / electron mass + double rhoi_L = 1; // Ion thermal gyroradius / Sheet thickness + double Ti_Te = 1; // Ion temperature / electron temperature + double wpe_wce = 3; // Electron plasma freq / electron cycltron freq + double theta = 0; // Orientation of the simulation wrt current sheet + double taui = 100; // Simulation wci's to run + + // Numerical parameters + double Lx = 16*L; // How big should the box be in the x direction + double Ly = 16*L; // How big should the box be in the y direction + double Lz = 16*L; // How big should the box be in the z direction + double nx = 64; // Global resolution in the x direction + double ny = 64; // Global resolution in the y 
direction + double nz = 1; // Global resolution in the z direction + double nppc = 64; // Average number of macro particles per cell (both species combined!) + double cfl_req = 0.99; // How close to Courant should we try to run + double wpedt_max = 0.36; // How big a timestep is allowed if Courant is not too restrictive + double damp = 0.001; // Level of radiation damping + + // Derived quantities + double mi = me*mi_me; // Ion mass + double kTe = me*c*c/(2*wpe_wce*wpe_wce*(1+Ti_Te)); // Electron temperature + double kTi = kTe*Ti_Te; // Ion temperature + double vthe = sqrt(2*kTe/me); // Electron thermal velocity (B.D. convention) + double vthi = sqrt(2*kTi/mi); // Ion thermal velocity (B.D. convention) + double wci = vthi/(rhoi_L*L); // Ion cyclotron frequency + double wce = wci*mi_me; // Electron cyclotron frequency + double wpe = wce*wpe_wce; // Electron plasma frequency + double wpi = wpe/sqrt(mi_me); // Ion plasma frequency + double vdre = c*c*wce/(wpe*wpe*L*(1+Ti_Te)); // Electron drift velocity + double vdri = -Ti_Te*vdre; // Ion drift velocity + double b0 = me*wce/ec; // Asymptotic magnetic field strength + double n0 = me*eps0*wpe*wpe/(ec*ec); // Peak electron density (also peak ion density) + double Npe = 2*n0*Ly*Lz*L*tanh(0.5*Lx/L); // Number of physical electrons in box + double Npi = Npe; // Number of physical ions in box + double Ne = 0.5*nppc*nx*ny*nz; // Total macro electrons in box + Ne = trunc_granular(Ne,nproc()); // Make it divisible by number of processors + double Ni = Ne; // Total macro ions in box + double we = Npe/Ne; // Weight of a macro electron + double wi = Npi/Ni; // Weight of a macro ion + double gdri = 1/sqrt(1-vdri*vdri/(c*c)); // gamma of ion drift frame + double gdre = 1/sqrt(1-vdre*vdre/(c*c)); // gamma of electron drift frame + double udri = vdri*gdri; // 4-velocity of ion drift frame + double udre = vdre*gdre; // 4-velocity of electron drift frame + double uthi = sqrt(kTi/mi)/c; // Normalized ion thermal velocity (K.B. 
convention) + double uthe = sqrt(kTe/me)/c; // Normalized electron thermal velocity (K.B. convention) + double cs = cos(theta); + double sn = sin(theta); + + // Determine the timestep + double dg = courant_length(Lx,Ly,Lz,nx,ny,nz); // Courant length + double dt = cfl_req*dg/c; // Courant limited time step + if( wpe*dt>wpedt_max ) dt=wpedt_max/wpe; // Override time step if plasma frequency limited + + //////////////////////////////////////// + // Setup high level simulation parmeters + + num_step = int(0.2*taui/(wci*dt)); + status_interval = int(1./(wci*dt)); + sync_shared_interval = status_interval; + clean_div_e_interval = status_interval; + clean_div_b_interval = status_interval; + + global->energies_interval = status_interval; + global->fields_interval = status_interval; + global->ehydro_interval = status_interval; + global->ihydro_interval = status_interval; + global->eparticle_interval = status_interval; + global->iparticle_interval = status_interval; + global->restart_interval = status_interval; + + /////////////////////////// + // Setup the space and time + + // Setup basic grid parameters + define_units( c, eps0 ); + define_timestep( dt ); + + // Parition a periodic box among the processors sliced uniformly along y + define_periodic_grid( -0.5*Lx, 0, 0, // Low corner + 0.5*Lx, Ly, Lz, // High corner + nx, ny, nz, // Resolution + 1, nproc(), 1 ); // Topology + + // Override some of the boundary conditions to put a particle reflecting + // perfect electrical conductor on the -x and +x boundaries + set_domain_field_bc( BOUNDARY(-1,0,0), pec_fields ); + set_domain_field_bc( BOUNDARY( 1,0,0), pec_fields ); + set_domain_particle_bc( BOUNDARY(-1,0,0), reflect_particles ); + set_domain_particle_bc( BOUNDARY( 1,0,0), reflect_particles ); + + define_material( "vacuum", 1 ); + // Note: define_material defaults to isotropic materials with mu=1,sigma=0 + // Tensor electronic, magnetic and conductive materials are supported + // though. 
See "shapes" for how to define them and assign them to regions. + // Also, space is initially filled with the first material defined. + + // If you pass NULL to define field array, the standard field array will + // be used (if damp is not provided, no radiation damping will be used). + define_field_array( NULL, damp ); + + //////////////////// + // Setup the species + + // Allow 50% more local_particles in case of non-uniformity + // VPIC will pick the number of movers to use for each species + // Both species use out-of-place sorting + species_t * ion = define_species( "ion", ec, mi, 1.5*Ni/nproc(), -1, 40, 1 ); + species_t * electron = define_species( "electron", -ec, me, 1.5*Ne/nproc(), -1, 20, 1 ); + + /////////////////////////////////////////////////// + // Log diagnostic information about this simulation + + sim_log( "" ); + sim_log( "System of units" ); + sim_log( "L = " << L ); + sim_log( "ec = " << ec ); + sim_log( "me = " << me ); + sim_log( "c = " << c ); + sim_log( "eps0 = " << eps0 ); + sim_log( "" ); + sim_log( "Physics parameters" ); + sim_log( "rhoi/L = " << rhoi_L ); + sim_log( "Ti/Te = " << Ti_Te ); + sim_log( "wpe/wce = " << wpe_wce ); + sim_log( "mi/me = " << mi_me ); + sim_log( "theta = " << theta ); + sim_log( "taui = " << taui ); + sim_log( "" ); + sim_log( "Numerical parameters" ); + sim_log( "num_step = " << num_step ); + sim_log( "dt = " << dt ); + sim_log( "Lx = " << Lx << ", Lx/L = " << Lx/L ); + sim_log( "Ly = " << Ly << ", Ly/L = " << Ly/L ); + sim_log( "Lz = " << Lz << ", Lz/L = " << Lz/L ); + sim_log( "nx = " << nx << ", dx = " << Lx/nx << ", L/dx = " << L*nx/Lx ); + sim_log( "ny = " << ny << ", dy = " << Ly/ny << ", L/dy = " << L*ny/Ly ); + sim_log( "nz = " << nz << ", dz = " << Lz/nz << ", L/dz = " << L*nz/Lz ); + sim_log( "nppc = " << nppc ); + sim_log( "courant = " << c*dt/dg ); + sim_log( "damp = " << damp ); + sim_log( "" ); + sim_log( "Ion parameters" ); + sim_log( "qpi = " << ec << ", mi = " << mi << ", qpi/mi = " << ec/mi 
); + sim_log( "vthi = " << vthi << ", vthi/c = " << vthi/c << ", kTi = " << kTi ); + sim_log( "vdri = " << vdri << ", vdri/c = " << vdri/c ); + sim_log( "wpi = " << wpi << ", wpi dt = " << wpi*dt << ", n0 = " << n0 ); + sim_log( "wci = " << wci << ", wci dt = " << wci*dt ); + sim_log( "rhoi = " << vthi/wci << ", L/rhoi = " << L/(vthi/wci) << ", dx/rhoi = " << (Lx/nx)/(vthi/wci) ); + sim_log( "debyei = " << vthi/wpi << ", L/debyei = " << L/(vthi/wpi) << ", dx/debyei = " << (Lx/nx)/(vthi/wpi) ); + sim_log( "Npi = " << Npi << ", Ni = " << Ni << ", Npi/Ni = " << Npi/Ni << ", wi = " << wi ); + sim_log( "" ); + sim_log( "Electron parameters" ); + sim_log( "qpe = " << -ec << ", me = " << me << ", qpe/me = " << -ec/me ); + sim_log( "vthe = " << vthe << ", vthe/c = " << vthe/c << ", kTe = " << kTe ); + sim_log( "vdre = " << vdre << ", vdre/c = " << vdre/c ); + sim_log( "wpe = " << wpe << ", wpe dt = " << wpe*dt << ", n0 = " << n0 ); + sim_log( "wce = " << wce << ", wce dt = " << wce*dt ); + sim_log( "rhoe = " << vthe/wce << ", L/rhoe = " << L/(vthe/wce) << ", dx/rhoe = " << (Lx/nx)/(vthe/wce) ); + sim_log( "debyee = " << vthe/wpe << ", L/debyee = " << L/(vthe/wpe) << ", dx/debyee = " << (Lx/nx)/(vthe/wpe) ); + sim_log( "Npe = " << Npe << ", Ne = " << Ne << ", Npe/Ne = " << Npe/Ne << ", we = " << we ); + sim_log( "" ); + sim_log( "Miscellaneous" ); + sim_log( "nptotal = " << Ni + Ne ); + sim_log( "nproc = " << nproc() ); + sim_log( "" ); + + //////////////////////////// + // Load fields and particles + + sim_log( "Loading fields" ); + + set_region_field( everywhere, 0, 0, 0, // Electric field + 0, -sn*b0*tanh(x/L), cs*b0*tanh(x/L) ); // Magnetic field + // Note: everywhere is a region that encompasses the entire simulation + // In general, regions are specied as logical equations (i.e. 
x>0 && x+y<2) + + sim_log( "Loading particles" ); + + double ymin = rank()*Ly/nproc(), ymax = (rank()+1)*Ly/nproc(); + + repeat( Ni/nproc() ) { + double x, y, z, ux, uy, uz, d0; + + // Pick an appropriately distributed random location for the pair + do { + x = L*atanh( uniform( rng(0), -1, 1 ) ); + } while( x<=-0.5*Lx || x>=0.5*Lx ); + y = uniform( rng(0), ymin, ymax ); + z = uniform( rng(0), 0, Lz ); + + // For the ion, pick an isothermal normalized momentum in the drift frame + // (this is a proper thermal equilibrium in the non-relativistic limit), + // boost it from the drift frame to the frame with the magnetic field + // along z and then rotate it into the lab frame. Then load the particle. + // Repeat the process for the electron. + + ux = normal( rng(0), 0, uthi ); + uy = normal( rng(0), 0, uthi ); + uz = normal( rng(0), 0, uthi ); + d0 = gdri*uy + sqrt(ux*ux+uy*uy+uz*uz+1)*udri; + uy = d0*cs - uz*sn; + uz = d0*sn + uz*cs; + inject_particle( ion, x, y, z, ux, uy, uz, wi, 0, 0 ); + + ux = normal( rng(0), 0, uthe ); + uy = normal( rng(0), 0, uthe ); + uz = normal( rng(0), 0, uthe ); + d0 = gdre*uy + sqrt(ux*ux+uy*uy+uz*uz+1)*udre; + uy = d0*cs - uz*sn; + uz = d0*sn + uz*cs; + inject_particle( electron, x, y, z, ux, uy, uz, we, 0, 0 ); + } + + // Upon completion of the initialization, the following occurs: + // - The synchronization error (tang E, norm B) is computed between domains + // and tang E / norm B are synchronized by averaging where discrepancies + // are encountered. + // - The initial divergence error of the magnetic field is computed and + // one pass of cleaning is done (for good measure) + // - The bound charge density necessary to give the simulation an initially + // clean divergence e is computed. 
+ // - The particle momentum is uncentered from u_0 to u_{-1/2} + // - The user diagnostics are called on the initial state + // - The physics loop is started + // + // The physics loop consists of: + // - Advance particles from x_0,u_{-1/2} to x_1,u_{1/2} + // - User particle injection at x_{1-age}, u_{1/2} (use inject_particles) + // - User current injection (adjust field(x,y,z).jfx, jfy, jfz) + // - Advance B from B_0 to B_{1/2} + // - Advance E from E_0 to E_1 + // - User field injection to E_1 (adjust field(x,y,z).ex,ey,ez,cbx,cby,cbz) + // - Advance B from B_{1/2} to B_1 + // - (periodically) Divergence clean electric field + // - (periodically) Divergence clean magnetic field + // - (periodically) Synchronize shared tang e and norm b + // - Increment the time step + // - Call user diagnostics + // - (periodically) Print a status message +} + +begin_diagnostics { + +# define should_dump(x) (global->x##_interval>0 && remainder(step(),global->x##_interval)==0) + + if( step()==-10 ) { + // A grid dump contains all grid parameters, field boundary conditions, + // particle boundary conditions and domain connectivity information. This + // is stored in a binary format. Each rank makes a grid dump + dump_grid("grid"); + + // A materials dump contains all the materials parameters. This is in a + // text format. Only rank 0 makes the materials dump + dump_materials("materials"); + + // A species dump contains the physics parameters of a species. This is in + // a text format. Only rank 0 makes the species dump + dump_species("species"); + } + + // Energy dumps store all the energies in various directions of E and B + // and the total kinetic (not including rest mass) energies of each species + // species in a simple text format. By default, the energies are appended to + // the file. However, if a "0" is added to the dump_energies call, a new + // energies dump file will be created. 
The energies are in the units of the + // problem and are all time centered appropriately. Note: When restarting a + // simulation from a restart dump made at a prior time step to the last + // energies dump, the energies file will have a "hiccup" of intervening + // time levels. This "hiccup" will not occur if the simulation is aborted + // immediately following a restart dump. Energies dumps are in a text + // format and the layout is documented at the top of the file. Only rank 0 + // makes makes an energies dump. + if( should_dump(energies) ) dump_energies( "energies", step()==0 ? 0 : 1 ); + + // Field dumps store the raw electromagnetic fields, sources and material + // placement and a number of auxilliary fields. E, B and RHOB are + // timecentered, JF and TCA are half a step old. Material fields are static + // and the remaining fields (DIV E ERR, DIV B ERR and RHOF) are for + // debugging purposes. By default, field dump filenames are tagged with + // step(). However, if a "0" is added to the call, the filename will not be + // tagged. The JF that gets stored is accumulated with a charge-conserving + // algorithm. As a result, JF is not valid until at least one timestep has + // been completed. Field dumps are in a binary format. Each rank makes a + // field dump. + if( step()==-10 ) dump_fields_hdf5("fields"); // Get first valid total J + if( should_dump(fields) ) dump_fields_hdf5("fields"); + + // Hydro dumps store particle charge density, current density and + // stress-energy tensor. All these quantities are known at the time + // t = time(). All these quantities are accumulated trilinear + // node-centered. By default, species dump filenames are tagged with + // step(). However, if a "0" is added to the call, the filename will not + // be tagged. Note that the current density accumulated by this routine is + // purely diagnostic. It is not used by the simulation and it is not + // accumulated using a self-consistent charge-conserving method. 
Hydro dumps + // are in a binary format. Each rank makes a hydro dump. + if( should_dump(ehydro) ) dump_hydro_hdf5("electron","ehydro"); + if( should_dump(ihydro) ) dump_hydro_hdf5("ion", "ihydro"); + + // Particle dumps store the particle data for a given species. The data + // written is known at the time t = time(). By default, particle dumps + // are tagged with step(). However, if a "0" is added to the call, the + // filename will not be tagged. Particle dumps are in a binary format. + // Each rank makes a particle dump. + if( should_dump(eparticle) ) dump_particles_hdf5("electron","eparticle"); + if( should_dump(iparticle) ) dump_particles_hdf5("ion", "iparticle"); + + // A checkpt is made by calling checkpt( fbase, tag ) where fname is a string + // and tag is an integer. A typical usage is: + // checkpt( "checkpt", step() ). + // This will cause each process to write their simulation state to a file + // whose name is based on fbase, tag and the node's rank. For the above + // usage, if called on step 314 on a 4 process run, the four files: + // checkpt.314.0, checkpt.314.1, checkpt.314.2, checkpt.314.3 + // to be written. The simulation can then be restarted from this point by + // invoking the application with "--restore checkpt.314". checkpt must be + // the _VERY_ LAST_ diagnostic called. If not, diagnostics performed after + // the checkpt but before the next timestep will be missed on restore. + // Restart dumps are in a binary format unique to the each simulation. + + if( should_dump(restart) ) checkpt( "checkpt", step() ); + + // If you want to write a checkpt after a certain amount of simulation time, + // use uptime() in conjunction with checkpt. For example, this will cause + // the simulation state to be written after 7.5 hours of running to the + // same file every time (useful for dealing with quotas on big machines). 
+ //if( uptime()>=27000 ) { + // checkpt( "timeout", 0 ); + // abort(0); + //} + +# undef should_dump + +} + +begin_particle_injection { + + // No particle injection for this simulation + +} + +begin_current_injection { + + // No current injection for this simulation + +} + +begin_field_injection { + + // No field injection for this simulation + +} + +begin_particle_collisions{ + + // No collisions for this simulation + +} diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 62505147..1639a044 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -15,6 +15,11 @@ #include "dumpmacros.h" #include "../util/io/FileUtils.h" +#ifdef VPIC_ENABLE_HDF5 +#include "hdf5.h" // from the lib +#include "hdf5_header_info.h" // from vpic +#endif + /* -1 means no ranks talk */ #define VERBOSE_rank -1 @@ -256,6 +261,897 @@ vpic_simulation::dump_hydro( const char *sp_name, if( fileIO.close() ) ERROR(( "File close failed on dump hydro!!!" )); } +#ifdef VPIC_ENABLE_HDF5 +#define DUMP_DIR_FORMAT "./%s" + +/* define to do C-style indexing */ +#define hydro(x, y, z) hydro_array->h[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] + +// TODO: make function? 
+#define invert_field_xml_item(xml_file_name, speciesname_p, time_step, dims_4d, dims_3d, add_footer_flag) \ + { \ + FILE *fp; \ + fp = fopen(xml_file_name, "a"); \ + fprintf(fp, main_body_head, time_step); \ + if (field_dump_flag.enabledE()) \ + write_main_body_attribute(fp, main_body_attributeV, "E", dims_4d, dims_3d, speciesname_p, time_step, "ex", "ey", "ez"); \ + if (field_dump_flag.div_e_err) \ + fprintf(fp, main_body_attributeS, "div_e_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_e_err"); \ + if (field_dump_flag.enabledCB()) \ + write_main_body_attribute(fp, main_body_attributeV, "B", dims_4d, dims_3d, speciesname_p, time_step, "cbx", "cby", "cbz"); \ + if (field_dump_flag.div_b_err) \ + fprintf(fp, main_body_attributeS, "div_b_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_b_err"); \ + if (field_dump_flag.enabledTCA()) \ + write_main_body_attribute(fp, main_body_attributeV, "TCA", dims_4d, dims_3d, speciesname_p, time_step, "tcax", "tcay", "tcaz"); \ + if (field_dump_flag.rhob) \ + fprintf(fp, main_body_attributeS, "rhob", dims_3d, time_step, speciesname_p, time_step, time_step, "rhob"); \ + if (field_dump_flag.enabledJF()) \ + write_main_body_attribute(fp, main_body_attributeV, "JF", dims_4d, dims_3d, speciesname_p, time_step, "jfx", "jfy", "jfz"); \ + if (field_dump_flag.rhof) \ + fprintf(fp, main_body_attributeS, "rhof", dims_3d, time_step, speciesname_p, time_step, time_step, "rhof"); \ + if (field_dump_flag.enabledEMAT()) \ + write_main_body_attribute(fp, main_body_attributeV, "EMAT", dims_4d, dims_3d, speciesname_p, time_step, "ematx", "ematy", "ematz"); \ + if (field_dump_flag.nmat) \ + fprintf(fp, main_body_attributeS, "nmat", dims_3d, time_step, speciesname_p, time_step, time_step, "nmat"); \ + if (field_dump_flag.enabledFMAT()) \ + write_main_body_attribute(fp, main_body_attributeV, "FMAT", dims_4d, dims_3d, speciesname_p, time_step, "fmatx", "fmaty", "fmatz"); \ + if (field_dump_flag.cmat) \ + 
fprintf(fp, main_body_attributeS, "cmat", dims_3d, time_step, speciesname_p, time_step, time_step, "cmat"); \ + fprintf(fp, "%s", main_body_foot); \ + if (add_footer_flag) \ + fputs(footer, fp); \ + fclose(fp); \ + } +void +vpic_simulation::dump_fields_hdf5( const char *fbase, int ftag ) +{ + size_t step_for_viou = step(); + + int mpi_size, mpi_rank; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + + +#ifdef DUMP_INFO_DEBUG + printf("MPI rank = %d, size = %d \n", mpi_rank, mpi_size); + printf("base dir for field: %s \n", global->fdParams.baseDir); + printf("stride x y z = (%ld, %ld, %ld)\n", global->fdParams.stride_x, global->fdParams.stride_y, global->fdParams.stride_z); + printf("grid x, y z = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); + printf("domain loc (x0, y0, z0) -> (x1, y1, z1) = (%f, %f, %f) -> (%f, %f, %f) \n", grid->x0, grid->y0, grid->z0, grid->x1, grid->y1, grid->z1); + printf("global->topology_x, y, z = %f, %f, %f \n ", global->topology_x, global->topology_y, global->topology_z); + printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid->sz, grid->nv); +#endif + +#define DUMP_FIELD_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ + { \ + dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ + temp_buf_index = 0; \ + for (size_t i(1); i < grid->nx + 1; i++) \ + { \ + for (size_t j(1); j < grid->ny + 1; j++) \ + { \ + for (size_t k(1); k < grid->nz + 1; k++) \ + { \ + temp_buf[temp_buf_index] = FIELD_ARRAY_NAME->fpp(i, j, k).ATTRIBUTE_NAME; \ + temp_buf_index = temp_buf_index + 1; \ + } \ + } \ + } \ + dataspace_id = H5Dget_space(dset_id); \ + H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); \ + H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, temp_buf); \ + H5Sclose(dataspace_id); \ + H5Dclose(dset_id); \ + } + + char fname[256]; + char field_scratch[128]; + char 
subfield_scratch[128]; + + sprintf(field_scratch, DUMP_DIR_FORMAT, "field_hdf5"); + dump_mkdir(field_scratch); + sprintf(subfield_scratch, "%s/T.%lld/", field_scratch, step_for_viou); + dump_mkdir(subfield_scratch); + + sprintf(fname, "%s/%s_%lld.h5", subfield_scratch, "fields", step_for_viou); + double el1 = uptime(); + hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); + H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); + hid_t file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); + H5Pclose(plist_id); + + sprintf(fname, "Timestep_%lld", step_for_viou); + hid_t group_id = H5Gcreate(file_id, fname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + + el1 = uptime() - el1; + //sim_log("TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts + double el2 = uptime(); + + /* +// Create a variable list of field values to output. +size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); +size_t * varlist = new size_t[numvars]; + +for(size_t i(0), c(0); ifdParams.output_vars.bitset(i)) varlist[c++] = i; + +printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ + +#define fpp(x, y, z) f[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] + /* + typedef struct field { + float ex, ey, ez, div_e_err; // Electric field and div E error + float cbx, cby, cbz, div_b_err; // Magnetic field and div B error + float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density + float jfx, jfy, jfz, rhof; // Free current and charge density + material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes + material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers + } field_t;*/ + // Local voxel mesh resolution. Voxels are + // indexed FORTRAN style 0:nx+1,0:ny+1,0:nz+1 + // with voxels 1:nx,1:ny,1:nz being non-ghost + // voxels. 
+ + float *temp_buf = (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); + hsize_t temp_buf_index; + hid_t dset_id; + //char *field_var_name[] = {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; + plist_id = H5Pcreate(H5P_DATASET_XFER); + //Comment out for test only + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); + //H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, (hsize_t *) &numparticles, NULL); + + //global->topology_x + + hsize_t field_global_size[3], field_local_size[3], global_offset[3], global_count[3]; + field_global_size[0] = (grid->nx * grid->gpx); + field_global_size[1] = (grid->ny * grid->gpy); + field_global_size[2] = (grid->nz * grid->gpz); + + field_local_size[0] = grid->nx; + field_local_size[1] = grid->ny; + field_local_size[2] = grid->nz; + + // TODO: delete this +#define RANK_TO_INDEX2(rank, ix, iy, iz) \ + BEGIN_PRIMITIVE \ + { \ + int _ix, _iy, _iz; \ + _ix = (rank); /* ix = ix+gpx*( iy+gpy*iz ) */ \ + _iy = _ix / int(grid->gpx); /* iy = iy+gpy*iz */ \ + _ix -= _iy * int(grid->gpx); /* ix = ix */ \ + _iz = _iy / int(grid->gpy); /* iz = iz */ \ + _iy -= _iz * int(grid->gpy); /* iy = iy */ \ + (ix) = _ix; \ + (iy) = _iy; \ + (iz) = _iz; \ + } \ + END_PRIMITIVE + + int mpi_rank_x, mpi_rank_y, mpi_rank_z; + RANK_TO_INDEX2(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + + printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + + global_offset[0] = (grid->nx) * mpi_rank_x; + global_offset[1] = (grid->ny) * mpi_rank_y; + global_offset[2] = (grid->nz) * mpi_rank_z; + + global_count[0] = (grid->nx); + global_count[1] = (grid->ny); + global_count[2] = (grid->nz); + +#ifdef DUMP_INFO_DEBUG + printf("global size = " HSIZE_T ", " HSIZE_T ", " HSIZE_T "\n", field_global_size[0], field_global_size[1], field_global_size[2]); + printf("global_offset = " HSIZE_T ", " HSIZE_T ", " HSIZE_T "\n", 
global_offset[0], global_offset[1], global_offset[2]); + printf("global_count = " HSIZE_T ", " HSIZE_T ", " HSIZE_T "\n", global_count[0], global_count[1], global_count[2]); + printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + fflush(stdout); +#endif + + hid_t filespace = H5Screate_simple(3, field_global_size, NULL); + hid_t memspace = H5Screate_simple(3, field_local_size, NULL); + hid_t dataspace_id; + + /* + typedef struct field { + float ex, ey, ez, div_e_err; // Electric field and div E error + float cbx, cby, cbz, div_b_err; // Magnetic field and div B error + float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density + float jfx, jfy, jfz, rhof; // Free current and charge density + material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes + material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers + } field_t;*/ + + if (field_dump_flag.ex) + DUMP_FIELD_TO_HDF5("ex", ex, H5T_NATIVE_FLOAT); + if (field_dump_flag.ey) + DUMP_FIELD_TO_HDF5("ey", ey, H5T_NATIVE_FLOAT); + if (field_dump_flag.ez) + DUMP_FIELD_TO_HDF5("ez", ez, H5T_NATIVE_FLOAT); + if (field_dump_flag.div_e_err) + DUMP_FIELD_TO_HDF5("div_e_err", div_e_err, H5T_NATIVE_FLOAT); + + if (field_dump_flag.cbx) + DUMP_FIELD_TO_HDF5("cbx", cbx, H5T_NATIVE_FLOAT); + if (field_dump_flag.cby) + DUMP_FIELD_TO_HDF5("cby", cby, H5T_NATIVE_FLOAT); + if (field_dump_flag.cbz) + DUMP_FIELD_TO_HDF5("cbz", cbz, H5T_NATIVE_FLOAT); + if (field_dump_flag.div_b_err) + DUMP_FIELD_TO_HDF5("div_b_err", div_b_err, H5T_NATIVE_FLOAT); + + if (field_dump_flag.tcax) + DUMP_FIELD_TO_HDF5("tcax", tcax, H5T_NATIVE_FLOAT); + if (field_dump_flag.tcay) + DUMP_FIELD_TO_HDF5("tcay", tcay, H5T_NATIVE_FLOAT); + if (field_dump_flag.tcaz) + DUMP_FIELD_TO_HDF5("tcaz", tcaz, H5T_NATIVE_FLOAT); + if (field_dump_flag.rhob) + DUMP_FIELD_TO_HDF5("rhob", rhob, H5T_NATIVE_FLOAT); + + if (field_dump_flag.jfx) + DUMP_FIELD_TO_HDF5("jfx", jfx, 
H5T_NATIVE_FLOAT); + if (field_dump_flag.jfy) + DUMP_FIELD_TO_HDF5("jfy", jfy, H5T_NATIVE_FLOAT); + if (field_dump_flag.jfz) + DUMP_FIELD_TO_HDF5("jfz", jfz, H5T_NATIVE_FLOAT); + if (field_dump_flag.rhof) + DUMP_FIELD_TO_HDF5("rhof", rhof, H5T_NATIVE_FLOAT); + + //H5T_NATIVE_SHORT for material_id (typedef int16_t material_id) + if (field_dump_flag.ematx) + DUMP_FIELD_TO_HDF5("ematx", ematx, H5T_NATIVE_SHORT); + if (field_dump_flag.ematy) + DUMP_FIELD_TO_HDF5("ematy", ematy, H5T_NATIVE_SHORT); + if (field_dump_flag.ematz) + DUMP_FIELD_TO_HDF5("ematz", ematz, H5T_NATIVE_SHORT); + if (field_dump_flag.nmat) + DUMP_FIELD_TO_HDF5("nmat", nmat, H5T_NATIVE_SHORT); + + if (field_dump_flag.fmatx) + DUMP_FIELD_TO_HDF5("fmatx", fmatx, H5T_NATIVE_SHORT); + if (field_dump_flag.fmaty) + DUMP_FIELD_TO_HDF5("fmaty", fmaty, H5T_NATIVE_SHORT); + if (field_dump_flag.fmatz) + DUMP_FIELD_TO_HDF5("fmatz", fmatz, H5T_NATIVE_SHORT); + if (field_dump_flag.cmat) + DUMP_FIELD_TO_HDF5("cmat", cmat, H5T_NATIVE_SHORT); + + el2 = uptime() - el2; + //sim_log("TimeHDF5Write: " << el2 << " s"); + + double el3 = uptime(); + + //Write metadata (geo original and geo dx/dy/dz) for ArrayUDF + float attr_data[2][3]; + attr_data[0][0] = grid->x0; + attr_data[0][1] = grid->y0; + attr_data[0][2] = grid->z0; + attr_data[1][0] = grid->dx; + attr_data[1][1] = grid->dy; + attr_data[1][2] = grid->dz; + hsize_t dims[2]; + dims[0] = 2; + dims[1] = 3; + hid_t va_geo_dataspace_id = H5Screate_simple(2, dims, NULL); + hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); + H5Awrite(va_geo_attribute_id, H5T_NATIVE_FLOAT, attr_data); + H5Sclose(va_geo_dataspace_id); + H5Aclose(va_geo_attribute_id); + + free(temp_buf); + H5Sclose(filespace); + H5Sclose(memspace); + H5Pclose(plist_id); + H5Gclose(group_id); + H5Fclose(file_id); + + el3 = uptime() - el3; + //sim_log("TimeHDF5Close: " << el3 << " s"); + + if (mpi_rank == 0) + { + char const 
*output_xml_file = "./field_hdf5/hdf5_field.xdmf"; + char dimensions_3d[128]; + sprintf(dimensions_3d, "%lld %lld %lld", field_global_size[0], field_global_size[1], field_global_size[2]); + char dimensions_4d[128]; + sprintf(dimensions_4d, "%lld %lld %lld %d", field_global_size[0], field_global_size[1], field_global_size[2], 3); + char orignal[128]; + sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); + char dxdydz[128]; + sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); + + //int fields_interval = global->fields_interval; + // TODO: make sure field interval is set + int nframes = num_step / field_interval + 1; + static int field_tframe = 0; + +#ifdef DUMP_INFO_DEBUG + printf(" meta file : %s \n", output_xml_file); + printf(" array dims per var: %s \n", dimensions_3d); + printf("array dims all vars: %s \n", dimensions_4d); + printf(" orignal: %s \n", orignal); + printf(" dxdydz: %s \n", dxdydz); + printf(" nframes: %d \n", nframes); + printf(" field_interval: %d \n", field_interval); + printf(" current step: %lld \n", step_for_viou); + printf(" current step: %lld \n", step_for_viou); + + //printf(" Simulation time: %f \n", grid->t0); + printf(" tframe: %d \n", field_tframe); +#endif + + if (field_tframe >= 1) + { + if (field_tframe == (nframes - 1)) + { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 1); + } + else + { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 0); + } + } + else + { + create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, field_interval); + if (field_tframe == (nframes - 1)) + { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 1); + } + else + { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 0); + } + } + field_tframe++; + } +} +void vpic_simulation::dump_hydro_hdf5( const char *speciesname, + const char 
*fbase, + int ftag ) +{ + size_t step_for_viou = step(); + +#define DUMP_HYDRO_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ + { \ + dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ + temp_buf_index = 0; \ + for (size_t i(1); i < grid->nx + 1; i++) \ + { \ + for (size_t j(1); j < grid->ny + 1; j++) \ + { \ + for (size_t k(1); k < grid->nz + 1; k++) \ + { \ + temp_buf[temp_buf_index] = hydro(i, j, k).ATTRIBUTE_NAME; \ + temp_buf_index = temp_buf_index + 1; \ + } \ + } \ + } \ + dataspace_id = H5Dget_space(dset_id); \ + H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); \ + H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, temp_buf); \ + H5Sclose(dataspace_id); \ + H5Dclose(dset_id); \ + } + //#define DUMP_INFO_DEBUG 1 + int mpi_size, mpi_rank; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + + species_t *sp = find_species_name(speciesname, species_list); + if (!sp) + ERROR(("Invalid species name: %s", speciesname)); + +#ifdef ENABLE_V407_SCIDAC + clear_hydro( hydro, grid ); + accumulate_hydro_p( hydro, sp->p, sp->np, sp->q_m, interpolator, grid ); + synchronize_hydro( hydro, grid ); +#else + clear_hydro_array(hydro_array); + accumulate_hydro_p(hydro_array, sp, interpolator_array); + synchronize_hydro_array(hydro_array); +#endif + /*#ifdef DUMP_INFO_DEBUG +printf("MPI rank = %d, size = %d \n", mpi_rank, mpi_size); +printf("base dir for field: %s \n", global->fdParams.baseDir); +printf("stride x y z = (%ld, %ld, %ld)\n", global->fdParams.stride_x, global->fdParams.stride_y, global->fdParams.stride_z); +printf("grid x, y z = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); +printf("domain loc (x0, y0, z0) -> (x1, y1, z1) = (%f, %f, %f) -> (%f, %f, %f) \n", grid->x0, grid->y0, grid->z0, grid->x1, grid->y1, grid->z1); +printf("global->topology_x, y, z = %f, %f, %f \n ", global->topology_x, global->topology_y, 
global->topology_z); +printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid->sz, grid->nv); +#endif*/ + + char hname[256]; + char hydro_scratch[128]; + char subhydro_scratch[128]; + + sprintf(hydro_scratch, "./%s", "hydro_hdf5"); + dump_mkdir(hydro_scratch); + sprintf(subhydro_scratch, "%s/T.%lld/", hydro_scratch, step_for_viou); + dump_mkdir(subhydro_scratch); + + sprintf(hname, "%s/hydro_%s_%lld.h5", subhydro_scratch, speciesname, step_for_viou); + double el1 = uptime(); + hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); + H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); + hid_t file_id = H5Fcreate(hname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); + H5Pclose(plist_id); + + sprintf(hname, "Timestep_%lld", step_for_viou); + hid_t group_id = H5Gcreate(file_id, hname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + + el1 = uptime() - el1; + //sim_log("TimeHDF5Open: " << el1 << " s"); //Easy to handle results for scripts + double el2 = uptime(); + + // Create a variable list of field values to output. + //size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); + //size_t *varlist = new size_t[numvars]; + + //for (size_t i(0), c(0); i < total_field_variables; i++) + // if (global->fdParams.output_vars.bitset(i)) + // varlist[c++] = i; + + //printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars); + + + //typedef struct hydro { + // float jx, jy, jz, rho; // Current and charge density => , + // float px, py, pz, ke; // Momentum and K.E. 
density => , + // float txx, tyy, tzz; // Stress diagonal => , i==j + // float tyz, tzx, txy; // Stress off-diagonal => , i!=j + // float _pad[2]; // 16-byte align + //} hydro_t; + + //typedef struct hydro_array { + // hydro_t * ALIGNED(128) h; + // grid_t * g; + //} hydro_array_t; + + float *temp_buf = (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); + hsize_t temp_buf_index; + hid_t dset_id; + //char *field_var_name[] = {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; + plist_id = H5Pcreate(H5P_DATASET_XFER); + //Comment out for test only + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); + //H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, (hsize_t *) &numparticles, NULL); + + //global->topology_x + + hsize_t hydro_global_size[3], hydro_local_size[3], global_offset[3], global_count[3]; + hydro_global_size[0] = (grid->nx * grid->gpx); + hydro_global_size[1] = (grid->ny * grid->gpy); + hydro_global_size[2] = (grid->nz * grid->gpz); + + hydro_local_size[0] = grid->nx; + hydro_local_size[1] = grid->ny; + hydro_local_size[2] = grid->nz; + +#define RANK_TO_INDEX2(rank, ix, iy, iz) \ + BEGIN_PRIMITIVE \ + { \ + int _ix, _iy, _iz; \ + _ix = (rank); /* ix = ix+gpx*( iy+gpy*iz ) */ \ + _iy = _ix / int(grid->gpx); /* iy = iy+gpy*iz */ \ + _ix -= _iy * int(grid->gpx); /* ix = ix */ \ + _iz = _iy / int(grid->gpy); /* iz = iz */ \ + _iy -= _iz * int(grid->gpy); /* iy = iy */ \ + (ix) = _ix; \ + (iy) = _iy; \ + (iz) = _iz; \ + } \ + END_PRIMITIVE + + int mpi_rank_x, mpi_rank_y, mpi_rank_z; + RANK_TO_INDEX2(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + + global_offset[0] = (grid->nx) * mpi_rank_x; + global_offset[1] = (grid->ny) * mpi_rank_y; + global_offset[2] = (grid->nz) * mpi_rank_z; + + global_count[0] = (grid->nx); + global_count[1] = (grid->ny); + global_count[2] = (grid->nz); + +#ifdef DUMP_INFO_DEBUG + printf("global size = " HSIZE_T ", " HSIZE_T ", " 
HSIZE_T "\n", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); + printf("global_offset = " HSIZE_T ", " HSIZE_T ", " HSIZE_T "\n", global_offset[0], global_offset[1], global_offset[2]); + printf("global_count = " HSIZE_T ", " HSIZE_T ", " HSIZE_T "\n", global_count[0], global_count[1], global_count[2]); + printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + fflush(stdout); +#endif + + hid_t filespace = H5Screate_simple(3, hydro_global_size, NULL); + hid_t memspace = H5Screate_simple(3, hydro_local_size, NULL); + hid_t dataspace_id; + + //typedef struct hydro { + // float jx, jy, jz, rho; // Current and charge density => , + // float px, py, pz, ke; // Momentum and K.E. density => , + // float txx, tyy, tzz; // Stress diagonal => , i==j + // float tyz, tzx, txy; // Stress off-diagonal => , i!=j + // float _pad[2]; // 16-byte align + //} hydro_t; + + if (hydro_dump_flag.jx) + DUMP_HYDRO_TO_HDF5("jx", jx, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.jy) + DUMP_HYDRO_TO_HDF5("jy", jy, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.jz) + DUMP_HYDRO_TO_HDF5("jz", jz, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.rho) + DUMP_HYDRO_TO_HDF5("rho", rho, H5T_NATIVE_FLOAT); + + if (hydro_dump_flag.px) + DUMP_HYDRO_TO_HDF5("px", px, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.py) + DUMP_HYDRO_TO_HDF5("py", py, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.pz) + DUMP_HYDRO_TO_HDF5("pz", pz, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.ke) + DUMP_HYDRO_TO_HDF5("ke", ke, H5T_NATIVE_FLOAT); + + if (hydro_dump_flag.txx) + DUMP_HYDRO_TO_HDF5("txx", txx, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.tyy) + DUMP_HYDRO_TO_HDF5("tyy", tyy, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.tzz) + DUMP_HYDRO_TO_HDF5("tzz", tzz, H5T_NATIVE_FLOAT); + + if (hydro_dump_flag.tyz) + DUMP_HYDRO_TO_HDF5("tyz", tyz, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.tzx) + DUMP_HYDRO_TO_HDF5("tzx", tzx, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.txy) + DUMP_HYDRO_TO_HDF5("txy", txy, 
H5T_NATIVE_FLOAT); + + el2 = uptime() - el2; + //sim_log("TimeHDF5Write: " << el2 << " s"); + + double el3 = uptime(); + + //Write metadata (geo original and geo dx/dy/dz) for ArrayUDF + float attr_data[2][3]; + attr_data[0][0] = grid->x0; + attr_data[0][1] = grid->y0; + attr_data[0][2] = grid->z0; + attr_data[1][0] = grid->dx; + attr_data[1][1] = grid->dy; + attr_data[1][2] = grid->dz; + hsize_t dims[2]; + dims[0] = 2; + dims[1] = 3; + hid_t va_geo_dataspace_id = H5Screate_simple(2, dims, NULL); + hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); + H5Awrite(va_geo_attribute_id, H5T_NATIVE_FLOAT, attr_data); + H5Sclose(va_geo_dataspace_id); + H5Aclose(va_geo_attribute_id); + + free(temp_buf); + H5Sclose(filespace); + H5Sclose(memspace); + H5Pclose(plist_id); + H5Gclose(group_id); + H5Fclose(file_id); + + el3 = uptime() - el3; + //sim_log("TimeHDF5Close: " << el3 << " s"); + + if (mpi_rank == 0) + { + char output_xml_file[128]; + sprintf(output_xml_file, "./%s/%s%s%s", "hydro_hdf5", "hydro-", speciesname, ".xdmf"); + char dimensions_3d[128]; + sprintf(dimensions_3d, "%lld %lld %lld", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); + char dimensions_4d[128]; + sprintf(dimensions_4d, "%lld %lld %lld %d", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2], 3); + char orignal[128]; + sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); + char dxdydz[128]; + sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); + + int nframes = num_step / field_interval + 1; + int fields_interval = field_interval; + static int tframe = 0; + +#ifdef DUMP_INFO_DEBUG + printf(" meta file : %s \n", output_xml_file); + printf(" array dims per var: %s \n", dimensions_3d); + printf("array dims all vars: %s \n", dimensions_4d); + printf(" orignal: %s \n", orignal); + printf(" dxdydz: %s \n", dxdydz); + printf(" nframes: %d \n", nframes); + printf(" fields_interval: %d 
\n", fields_interval); + printf(" current step: %lld \n", step_for_viou); + printf(" Simulation time: %f \n", grid->t0); + printf(" tframe: %d \n", tframe); +#endif + + char speciesname_new[128]; + sprintf(speciesname_new, "hydro_%s", speciesname); + if (tframe >= 1) + { + if (tframe == (nframes - 1)) + { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 1); + } + else + { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); + } + } + else + { + create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, fields_interval); + if (tframe == (nframes - 1)) + { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 1); + } + else + { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); + } + } + tframe++; + } +} + +// TODO": make the sp_name and speciesname varailbe naming consistent +void +vpic_simulation::dump_particles_hdf5( const char *sp_name, + const char *fbase, + int ftag ) +{ + size_t step_for_viou = step(); + char fname[256]; + char group_name[256]; + char particle_scratch[128]; + char subparticle_scratch[128]; + + int np_local; + species_t *sp; + + float *Pf; + int *Pi; + + // get the total number of particles. 
in this example, output only electrons + sp = species_list; + sprintf(particle_scratch, DUMP_DIR_FORMAT, "particle_hdf5"); + dump_mkdir(particle_scratch); + sprintf(subparticle_scratch, "%s/T.%ld/", particle_scratch, step_for_viou); + dump_mkdir(subparticle_scratch); + + // TODO: Allow the user to set this + + int stride_particle_dump = 1; + while (sp) + { + np_local = (sp->np + stride_particle_dump - 1) / stride_particle_dump; + + // make a copy of the part of particle data to be dumped + double ec1 = uptime(); + + int sp_np = sp->np; + int sp_max_np = sp->max_np; + particle_t *ALIGNED(128) p_buf = NULL; + if (!p_buf) + MALLOC_ALIGNED(p_buf, np_local, 128); + particle_t *sp_p = sp->p; + sp->p = p_buf; + sp->np = np_local; + sp->max_np = np_local; + + for (long long iptl = 0, i = 0; iptl < sp_np; iptl += stride_particle_dump, ++i) + { + COPY(&sp->p[i], &sp_p[iptl], 1); + } + #ifdef ENABLE_V407_SCIDAC + # define PBUF_SIZE 32768 // 1MB of particles + for( int buf_start=0; buf_start np_local ) n_buf = np_local - buf_start; + COPY( p_buf, &sp->p[buf_start], n_buf ); + center_p( p_buf, n_buf, sp->q_m, interpolator, grid ); + } + #else + center_p(sp, interpolator_array); + #endif + ec1 = uptime() - ec1; + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + + //std::cout << "on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local << std::endl; + //sim_log("on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local); + + Pf = (float *)sp->p; + Pi = (int *)sp->p; + + // open HDF5 file in "particle/T./" subdirectory + // filename: eparticle.h5p + sprintf(fname, "%s/%s_%ld.h5", subparticle_scratch, sp->name, step_for_viou); + sprintf(group_name, "/Timestep_%ld", step_for_viou); + double el1 = uptime(); + + hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); + H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); + hid_t file_id = H5Fcreate(fname, H5F_ACC_TRUNC, 
H5P_DEFAULT, plist_id); + hid_t group_id = H5Gcreate(file_id, group_name, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + + H5Pclose(plist_id); + + long long total_particles, offset; + long long numparticles = np_local; + MPI_Allreduce(&numparticles, &total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + MPI_Scan(&numparticles, &offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + offset -= numparticles; + + hid_t filespace = H5Screate_simple(1, (hsize_t *)&total_particles, NULL); + + hsize_t memspace_count_temp = numparticles * 8; + hid_t memspace = H5Screate_simple(1, &memspace_count_temp, NULL); + plist_id = H5Pcreate(H5P_DATASET_XFER); + + //Comment out for test only + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); + H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *)&offset, NULL, (hsize_t *)&numparticles, NULL); + + hsize_t memspace_start = 0, memspace_stride = 8, memspace_count = np_local; + H5Sselect_hyperslab(memspace, H5S_SELECT_SET, &memspace_start, &memspace_stride, &memspace_count, NULL); + + el1 = uptime() - el1; + //sim_log("Particle TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts + + double el2 = uptime(); + + hid_t dset_id = H5Dcreate(group_id, "dX", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + int ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf); + H5Dclose(dset_id); + //if (rank == 0) printf ("Written variable dX \n"); + + dset_id = H5Dcreate(group_id, "dY", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 1); + H5Dclose(dset_id); + //if (rank == 0) printf ("Written variable dY \n"); + + dset_id = H5Dcreate(group_id, "dZ", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 2); + H5Dclose(dset_id); + //if (rank == 0) printf ("Written variable dZ \n"); + + 
dset_id = H5Dcreate(group_id, "i", H5T_NATIVE_INT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, memspace, filespace, plist_id, Pi + 3); + H5Dclose(dset_id); + //if (rank == 0) printf ("Written variable i \n"); + + dset_id = H5Dcreate(group_id, "Ux", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 4); + H5Dclose(dset_id); + //if (rank == 0) printf ("Written variable Ux \n"); + + dset_id = H5Dcreate(group_id, "Uy", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 5); + H5Dclose(dset_id); + //if (rank == 0) printf ("Written variable Uy \n"); + + dset_id = H5Dcreate(group_id, "Uz", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 6); + H5Dclose(dset_id); + //if (rank == 0) printf ("Written variable Uz \n"); + + dset_id = H5Dcreate(group_id, "q", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 7); + H5Dclose(dset_id); + //if (rank == 0) printf ("Written variable q \n"); + + el2 = uptime() - el2; + //sim_log("Particle TimeHDF5Write: " << el2 << " s"); + + double el3 = uptime(); + H5Sclose(memspace); + H5Sclose(filespace); + H5Pclose(plist_id); + H5Gclose(group_id); + H5Fclose(file_id); + el3 = uptime() - el3; + //sim_log("Particle TimeHDF5Close: " << el3 << " s"); + + sp->p = sp_p; + sp->np = sp_np; + sp->max_np = sp_max_np; + FREE_ALIGNED(p_buf); + + // Write metadata if step() == 0 + char meta_fname[256]; + + sprintf(meta_fname, "%s/grid_metadata_%s_%ld.h5", subparticle_scratch, sp->name, step_for_viou); + + double meta_el1 = uptime(); + + hid_t meta_plist_id = H5Pcreate(H5P_FILE_ACCESS); + H5Pset_fapl_mpio(meta_plist_id, 
MPI_COMM_WORLD, MPI_INFO_NULL); + hid_t meta_file_id = H5Fcreate(meta_fname, H5F_ACC_TRUNC, H5P_DEFAULT, meta_plist_id); + hid_t meta_group_id = H5Gcreate(meta_file_id, group_name, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + H5Pclose(meta_plist_id); + + long long meta_total_particles, meta_offset; + long long meta_numparticles = 1; + MPI_Allreduce(&meta_numparticles, &meta_total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + MPI_Scan(&meta_numparticles, &meta_offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + meta_offset -= meta_numparticles; + + hid_t meta_filespace = H5Screate_simple(1, (hsize_t *)&meta_total_particles, NULL); + hid_t meta_memspace = H5Screate_simple(1, (hsize_t *)&meta_numparticles, NULL); + meta_plist_id = H5Pcreate(H5P_DATASET_XFER); + H5Pset_dxpl_mpio(meta_plist_id, H5FD_MPIO_COLLECTIVE); + H5Sselect_hyperslab(meta_filespace, H5S_SELECT_SET, (hsize_t *)&meta_offset, NULL, (hsize_t *)&meta_numparticles, NULL); + meta_el1 = uptime() - meta_el1; + //sim_log("Metafile TimeHDF5Open): " << meta_el1 << " s"); //Easy to handle results for scripts + + double meta_el2 = uptime(); + + hid_t meta_dset_id = H5Dcreate(meta_group_id, "np_local", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, (int32_t *)&np_local); + H5Dclose(meta_dset_id); + //if (rank == 0) printf ("Written variable dX \n"); + + meta_dset_id = H5Dcreate(meta_group_id, "nx", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, &grid->nx); + H5Dclose(meta_dset_id); + //if (rank == 0) printf ("Written variable dY \n"); + + meta_dset_id = H5Dcreate(meta_group_id, "ny", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, &grid->ny); + 
H5Dclose(meta_dset_id); + //if (rank == 0) printf ("Written variable dZ \n"); + + meta_dset_id = H5Dcreate(meta_group_id, "nz", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, &grid->nz); + H5Dclose(meta_dset_id); + //if (rank == 0) printf ("Written variable i \n"); + + meta_dset_id = H5Dcreate(meta_group_id, "x0", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->x0); + H5Dclose(meta_dset_id); + + meta_dset_id = H5Dcreate(meta_group_id, "y0", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->y0); + H5Dclose(meta_dset_id); + + meta_dset_id = H5Dcreate(meta_group_id, "z0", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->z0); + H5Dclose(meta_dset_id); + + meta_dset_id = H5Dcreate(meta_group_id, "dx", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->dx); + H5Dclose(meta_dset_id); + + meta_dset_id = H5Dcreate(meta_group_id, "dy", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->dy); + H5Dclose(meta_dset_id); + + meta_dset_id = H5Dcreate(meta_group_id, "dz", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->dz); + H5Dclose(meta_dset_id); + + meta_el2 = uptime() - meta_el2; + //sim_log("Metafile 
TimeHDF5Write: " << meta_el2 << " s"); + double meta_el3 = uptime(); + H5Sclose(meta_memspace); + H5Sclose(meta_filespace); + H5Pclose(meta_plist_id); + H5Gclose(meta_group_id); + H5Fclose(meta_file_id); + meta_el3 = uptime() - meta_el3; + //sim_log("Metafile TimeHDF5Close: " << meta_el3 << " s"); + + sp = sp->next; + } +} +#endif + void vpic_simulation::dump_particles( const char *sp_name, const char *fbase, @@ -699,9 +1595,6 @@ vpic_simulation::hydro_dump( const char * speciesname, int dim[3]; - /* define to do C-style indexing */ -# define hydro(x,y,z) hydro_array->h[VOXEL(x,y,z, grid->nx,grid->ny,grid->nz)] - /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ nxout = (grid->nx)/istride; nyout = (grid->ny)/jstride; diff --git a/src/vpic/hdf5_header_info.h b/src/vpic/hdf5_header_info.h new file mode 100644 index 00000000..4f1ee934 --- /dev/null +++ b/src/vpic/hdf5_header_info.h @@ -0,0 +1,259 @@ +#ifndef VPIC_HDF5_HEAD_INFO +#define VPIC_HDF5_HEAD_INFO + +#define FIELD_ARRAY_NAME field_array +struct field_dump_flag_t +{ + bool ex = true, ey = true, ez = true, div_e_err = true; + bool cbx = true, cby = true, cbz = true, div_b_err = true; + bool tcax = true, tcay = true, tcaz = true, rhob = true; + bool jfx = true, jfy = true, jfz = true, rhof = true; + bool ematx = true, ematy = true, ematz = true, nmat = true; + bool fmatx = true, fmaty = true, fmatz = true, cmat = true; + void disableE() + { + ex = false, ey = false, ez = false, div_e_err = false; + } + + void disableCB() + { + cbx = false, cby = false, cbz = false, div_b_err = false; + } + + void disableTCA() + { + tcax = false, tcay = false, tcaz = false, rhob = false; + } + + void disableJF() + { + jfx = false, jfy = false, jfz = false, rhof = false; + } + + void disableEMAT() + { + ematx = false, ematy = false, ematz = false, nmat = false; + } + + void disableFMAT() + { + fmatx = false, fmaty = false, fmatz = false, cmat = false; + } + + void resetToDefaults() + { + ex = true, ey = true, ez = true, 
div_e_err = true; + cbx = true, cby = true, cbz = true, div_b_err = true; + tcax = true, tcay = true, tcaz = true, rhob = true; + jfx = true, jfy = true, jfz = true, rhof = true; + ematx = true, ematy = true, ematz = true, nmat = true; + fmatx = true, fmaty = true, fmatz = true, cmat = true; + } + + bool enabledE() + { + return ex && ey && ez; + } + + bool enabledCB() + { + return cbx && cby && cbz; + } + + bool enabledTCA() + { + return tcax && tcay && tcaz; + } + + bool enabledJF() + { + return jfx && jfy && jfz; + } + + bool enabledEMAT() + { + return ematx && ematy && ematz; + } + + bool enabledFMAT() + { + return fmatx && fmaty && fmatz; + } +}; + +struct hydro_dump_flag_t +{ + bool jx = true, jy = true, jz = true, rho = true; + bool px = true, py = true, pz = true, ke = true; + bool txx = true, tyy = true, tzz = true; + bool tyz = true, tzx = true, txy = true; + + void disableJ() + { + jx = false, jy = false, jz = false, rho = false; + } + + void disableP() + { + px = false, py = false, pz = false, ke = false; + } + + void disableTD() //Stress diagonal + { + txx = false, tyy = false, tzz = false; + } + + void disableTOD() //Stress off-diagonal + { + tyz = false, tzx = false, txy = false; + } + void resetToDefaults() + { + jx = true, jy = true, jz = true, rho = true; + px = true, py = true, pz = true, ke = true; + txx = true, tyy = true, tzz = true; + tyz = true, tzx = true, txy = true; + } + + bool enabledJ() + { + return jx && jy && jz; + } + + bool enabledP() + { + return px && py && pz; + } + + bool enabledTD() + { + return txx && tyy && tzz; + } + + bool enabledTOD() + { + return tyz && tzx && txy; + } +}; + +// Declare vars to use +hydro_dump_flag_t hydro_dump_flag; +field_dump_flag_t field_dump_flag; + +// XML header stuff +const char *header = "\n\n\n\t\n"; +const char *header_topology = "\t\t\n"; +const char *header_geom = "\t\t\n"; +const char *header_origin = "\t\t\t \n\t\t\t%s\n"; +const char *header_dxdydz = "\t\t\t \n\t\t\t%s\n"; +const char 
*footer_geom = "\t\t\n"; +const char *grid_line = "\t\t \n \ +\t\t\t\n"; +const char *footer = "\t\t\n\t\n\n"; + +const char *main_body_head = "\t\t\t \n \ +\t\t\t\t \n \ +\t\t\t\t \n"; +const char *main_body_foot = "\t\t\t\n"; + +const char *main_body_attributeV = "\ + \t\t\t\t \n \ + \t\t\t\t\t \n \ + \t\t\t\t\t\t T.%d/%s_%d.h5:/Timestep_%d/%s \n \ + \t\t\t\t\t\t T.%d/%s_%d.h5:/Timestep_%d/%s \n \ + \t\t\t\t\t\t T.%d/%s_%d.h5:/Timestep_%d/%s \n \ + \t\t\t\t\t \n \ + \t\t\t\t \n "; + +const char *main_body_attributeS = "\ + \t\t\t\t \n \ + \t\t\t\t\t\t T.%d/%s_%d.h5:/Timestep_%d/%s \n \ + \t\t\t\t \n "; + +#define create_file_with_header(xml_file_name, dimensions, orignal, dxdydz, nframes, fields_interval) \ + { \ + FILE *fp; \ + fp = fopen(xml_file_name, "w"); \ + fputs(header, fp); \ + fprintf(fp, header_topology, dimensions); \ + fputs(header_geom, fp); \ + fprintf(fp, header_origin, orignal); \ + fprintf(fp, header_dxdydz, dxdydz); \ + fputs(footer_geom, fp); \ + fprintf(fp, grid_line, nframes); \ + int i; \ + for (i = 0; i < nframes; i++) \ + fprintf(fp, "%d ", i*fields_interval); \ + fputs(grid_line_footer, fp); \ + fclose(fp); \ + } +#define write_main_body_attribute(fpp, main_body_attribute_p, attribute_name, dims_4d_p, dims_3d_p, file_name_pre_p, time_step_p, a1, a2, a3) \ + { \ + fprintf(fpp, main_body_attribute_p, attribute_name, dims_4d_p, \ + dims_3d_p, time_step_p, file_name_pre_p, time_step_p, time_step_p, a1, \ + dims_3d_p, time_step_p, file_name_pre_p, time_step_p, time_step_p, a2, \ + dims_3d_p, time_step_p, file_name_pre_p, time_step_p, time_step_p, a3); \ + } + +#define invert_field_xml_item(xml_file_name, speciesname_p, time_step, dims_4d, dims_3d, add_footer_flag) \ + { \ + FILE *fp; \ + fp = fopen(xml_file_name, "a"); \ + fprintf(fp, main_body_head, time_step); \ + if (field_dump_flag.enabledE()) \ + write_main_body_attribute(fp, main_body_attributeV, "E", dims_4d, dims_3d, speciesname_p, time_step, "ex", "ey", "ez"); \ + if 
(field_dump_flag.div_e_err) \ + fprintf(fp, main_body_attributeS, "div_e_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_e_err"); \ + if (field_dump_flag.enabledCB()) \ + write_main_body_attribute(fp, main_body_attributeV, "B", dims_4d, dims_3d, speciesname_p, time_step, "cbx", "cby", "cbz"); \ + if (field_dump_flag.div_b_err) \ + fprintf(fp, main_body_attributeS, "div_b_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_b_err"); \ + if (field_dump_flag.enabledTCA()) \ + write_main_body_attribute(fp, main_body_attributeV, "TCA", dims_4d, dims_3d, speciesname_p, time_step, "tcax", "tcay", "tcaz"); \ + if (field_dump_flag.rhob) \ + fprintf(fp, main_body_attributeS, "rhob", dims_3d, time_step, speciesname_p, time_step, time_step, "rhob"); \ + if (field_dump_flag.enabledJF()) \ + write_main_body_attribute(fp, main_body_attributeV, "JF", dims_4d, dims_3d, speciesname_p, time_step, "jfx", "jfy", "jfz"); \ + if (field_dump_flag.rhof) \ + fprintf(fp, main_body_attributeS, "rhof", dims_3d, time_step, speciesname_p, time_step, time_step, "rhof"); \ + if (field_dump_flag.enabledEMAT()) \ + write_main_body_attribute(fp, main_body_attributeV, "EMAT", dims_4d, dims_3d, speciesname_p, time_step, "ematx", "ematy", "ematz"); \ + if (field_dump_flag.nmat) \ + fprintf(fp, main_body_attributeS, "nmat", dims_3d, time_step, speciesname_p, time_step, time_step, "nmat"); \ + if (field_dump_flag.enabledFMAT()) \ + write_main_body_attribute(fp, main_body_attributeV, "FMAT", dims_4d, dims_3d, speciesname_p, time_step, "fmatx", "fmaty", "fmatz"); \ + if (field_dump_flag.cmat) \ + fprintf(fp, main_body_attributeS, "cmat", dims_3d, time_step, speciesname_p, time_step, time_step, "cmat"); \ + fprintf(fp, "%s", main_body_foot); \ + if (add_footer_flag) \ + fputs(footer, fp); \ + fclose(fp); \ + } +#define invert_hydro_xml_item(xml_file_name, speciesname_p, time_step, dims_4d, dims_3d, add_footer_flag) \ + { \ + FILE *fp; \ + fp = fopen(xml_file_name, "a"); \ + 
fprintf(fp, main_body_head, time_step); \ + if (hydro_dump_flag.enabledJ()) \ + write_main_body_attribute(fp, main_body_attributeV, "J", dims_4d, dims_3d, speciesname_p, time_step, "jx", "jy", "jz"); \ + if (hydro_dump_flag.rho) \ + fprintf(fp, main_body_attributeS, "rho", dims_3d, time_step, speciesname_p, time_step, time_step, "rho"); \ + if (hydro_dump_flag.enabledP()) \ + write_main_body_attribute(fp, main_body_attributeV, "P", dims_4d, dims_3d, speciesname_p, time_step, "px", "py", "pz"); \ + if (hydro_dump_flag.ke) \ + fprintf(fp, main_body_attributeS, "ke", dims_3d, time_step, speciesname_p, time_step, time_step, "ke"); \ + if (hydro_dump_flag.enabledTD()) \ + write_main_body_attribute(fp, main_body_attributeV, "TD", dims_4d, dims_3d, speciesname_p, time_step, "txx", "tyy", "tzz"); \ + if (hydro_dump_flag.enabledTOD()) \ + write_main_body_attribute(fp, main_body_attributeV, "TOD", dims_4d, dims_3d, speciesname_p, time_step, "tyz", "tzx", "txy"); \ + fprintf(fp, "%s", main_body_foot); \ + if (add_footer_flag) \ + fputs(footer, fp); \ + fclose(fp); \ + } + + +#endif // VPIC_HDF5_HEAD_INFO diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index f7518836..80c2aaca 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -233,11 +233,19 @@ class vpic_simulation { // Binary dumps void dump_grid( const char *fbase ); + void dump_fields( const char *fbase, int fname_tag = 1 ); + void dump_fields_hdf5( const char *fbase, int fname_tag = 1 ); + void dump_hydro( const char *sp_name, const char *fbase, int fname_tag = 1 ); + void dump_hydro_hdf5( const char *sp_name, const char *fbase, + int fname_tag = 1 ); + void dump_particles( const char *sp_name, const char *fbase, int fname_tag = 1 ); + void dump_particles_hdf5( const char *sp_name, const char *fbase, + int fname_tag = 1 ); // convenience functions for simlog output void create_field_list(char * strlist, DumpParameters & dumpParams); From d1ba88c976c90fc279d54f27f8c673e8dd5f7021 Mon Sep 17 00:00:00 2001 From: Robert 
Bird Date: Tue, 13 Aug 2019 14:01:34 -0600 Subject: [PATCH 53/95] fixed small bug in global topoly setting and add field_interval value to test deck --- sample/harrisHDF5 | 1 + src/grid/partition.cc | 4 +-- src/vpic/dump.cc | 69 ++++++++++++++++++++++--------------------- src/vpic/vpic.h | 2 +- 4 files changed, 39 insertions(+), 37 deletions(-) diff --git a/sample/harrisHDF5 b/sample/harrisHDF5 index ff565f33..2b43e3d2 100644 --- a/sample/harrisHDF5 +++ b/sample/harrisHDF5 @@ -130,6 +130,7 @@ begin_initialization { num_step = int(0.2*taui/(wci*dt)); status_interval = int(1./(wci*dt)); + field_interval = 1; sync_shared_interval = status_interval; clean_div_e_interval = status_interval; clean_div_b_interval = status_interval; diff --git a/src/grid/partition.cc b/src/grid/partition.cc index fc554c2d..ff9b09f4 100644 --- a/src/grid/partition.cc +++ b/src/grid/partition.cc @@ -57,8 +57,8 @@ partition_periodic_box( grid_t * g, // Capture global processor decomposition g->gpx = gpx; - g->gpx = gpy; - g->gpx = gpz; + g->gpy = gpy; + g->gpz = gpz; g->dx = (gx1-gx0)/(double)gnx; g->dy = (gy1-gy0)/(double)gny; diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 1639a044..cbdb1289 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -264,6 +264,22 @@ vpic_simulation::dump_hydro( const char *sp_name, #ifdef VPIC_ENABLE_HDF5 #define DUMP_DIR_FORMAT "./%s" +// TODO: rename or remove this +#define RANK_TO_INDEX2(rank, ix, iy, iz) \ + BEGIN_PRIMITIVE \ + { \ + int _ix, _iy, _iz; \ + _ix = (rank); /* ix = ix+gpx*( iy+gpy*iz ) */ \ + _iy = _ix / grid->gpx; /* iy = iy+gpy*iz */ \ + _ix -= _iy * grid->gpx; /* ix = ix */ \ + _iz = _iy / grid->gpy; /* iz = iz */ \ + _iy -= _iz * grid->gpy; /* iy = iy */ \ + (ix) = _ix; \ + (iy) = _iy; \ + (iz) = _iz; \ + } \ + END_PRIMITIVE + /* define to do C-style indexing */ #define hydro(x, y, z) hydro_array->h[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] @@ -412,26 +428,26 @@ printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ 
field_local_size[1] = grid->ny; field_local_size[2] = grid->nz; - // TODO: delete this -#define RANK_TO_INDEX2(rank, ix, iy, iz) \ - BEGIN_PRIMITIVE \ - { \ - int _ix, _iy, _iz; \ - _ix = (rank); /* ix = ix+gpx*( iy+gpy*iz ) */ \ - _iy = _ix / int(grid->gpx); /* iy = iy+gpy*iz */ \ - _ix -= _iy * int(grid->gpx); /* ix = ix */ \ - _iz = _iy / int(grid->gpy); /* iz = iz */ \ - _iy -= _iz * int(grid->gpy); /* iy = iy */ \ - (ix) = _ix; \ - (iy) = _iy; \ - (iz) = _iz; \ - } \ - END_PRIMITIVE + int gpx = grid->gpx; + int gpy = grid->gpy; + int gpz = grid->gpz; int mpi_rank_x, mpi_rank_y, mpi_rank_z; - RANK_TO_INDEX2(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); - - printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + //RANK_TO_INDEX2(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + + int _ix, _iy, _iz; + _ix = (mpi_rank); + _iy = _ix / grid->gpx; + _ix -= _iy * grid->gpx; + _iz = _iy / grid->gpy; + _iy -= _iz * grid->gpy; + int ix = _ix; + int iy = _iy; + int iz = _iz; + + mpi_rank_x = ix; + mpi_rank_y = iy; + mpi_rank_z = iz; global_offset[0] = (grid->nx) * mpi_rank_x; global_offset[1] = (grid->ny) * mpi_rank_y; @@ -730,21 +746,6 @@ printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid- hydro_local_size[1] = grid->ny; hydro_local_size[2] = grid->nz; -#define RANK_TO_INDEX2(rank, ix, iy, iz) \ - BEGIN_PRIMITIVE \ - { \ - int _ix, _iy, _iz; \ - _ix = (rank); /* ix = ix+gpx*( iy+gpy*iz ) */ \ - _iy = _ix / int(grid->gpx); /* iy = iy+gpy*iz */ \ - _ix -= _iy * int(grid->gpx); /* ix = ix */ \ - _iz = _iy / int(grid->gpy); /* iz = iz */ \ - _iy -= _iz * int(grid->gpy); /* iy = iy */ \ - (ix) = _ix; \ - (iy) = _iy; \ - (iz) = _iz; \ - } \ - END_PRIMITIVE - int mpi_rank_x, mpi_rank_y, mpi_rank_z; RANK_TO_INDEX2(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); @@ -948,7 +949,7 @@ vpic_simulation::dump_particles_hdf5( const char *sp_name, { COPY(&sp->p[i], &sp_p[iptl], 1); } - #ifdef ENABLE_V407_SCIDAC + 
#ifdef ENABLE_V407_SCIDAC # define PBUF_SIZE 32768 // 1MB of particles for( int buf_start=0; buf_start Date: Tue, 13 Aug 2019 14:37:05 -0600 Subject: [PATCH 54/95] updated field interval to be the correct value --- sample/harrisHDF5 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sample/harrisHDF5 b/sample/harrisHDF5 index 2b43e3d2..e8bf115d 100644 --- a/sample/harrisHDF5 +++ b/sample/harrisHDF5 @@ -130,7 +130,7 @@ begin_initialization { num_step = int(0.2*taui/(wci*dt)); status_interval = int(1./(wci*dt)); - field_interval = 1; + field_interval = status_interval; sync_shared_interval = status_interval; clean_div_e_interval = status_interval; clean_div_b_interval = status_interval; From 7a10247e5617000c5249341c3b46d4bded434cfc Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Thu, 10 Oct 2019 09:56:23 -0600 Subject: [PATCH 55/95] clean up hdf5 build system. require parallel, and includes and guard example deck against no hdf5 --- CMakeLists.txt | 4 ++++ sample/harrisHDF5 | 6 ++++++ src/vpic/dump.cc | 2 ++ 3 files changed, 12 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index f7fd9d84..46b15741 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -333,9 +333,13 @@ endif() if(USE_HDF5) # Enable HDF5, and the relevant defines find_package(HDF5 REQUIRED) + if (NOT HDF5_IS_PARALLEL) + message(FATAL_ERROR "HDF5 Parallel support is required: ${HDF5_IS_PARALLEL}") + endif() add_definitions(-DVPIC_ENABLE_HDF5) string(REPLACE ";" " " string_libraries "${HDF5_C_LIBRARIES}") set(VPIC_CXX_LIBRARIES "${VPIC_CXX_LIBRARIES} ${string_libraries}") + include_directories(${HDF5_INCLUDE_DIRS}) endif(USE_HDF5) # Configure local script to generate bin/vpic diff --git a/sample/harrisHDF5 b/sample/harrisHDF5 index e8bf115d..6beedeed 100644 --- a/sample/harrisHDF5 +++ b/sample/harrisHDF5 @@ -25,6 +25,12 @@ // to zero before the user's initialization block is executed. Up to 16K // of global variables can be defined.
+ +// Deck only works if VPIC was build with HDF support. Check for that: +#ifndef VPIC_ENABLE_HDF5 +#error "VPIC_ENABLE_HDF5" is required +#endif + begin_globals { double energies_interval; double fields_interval; diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index cbdb1289..837ddda3 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -15,6 +15,8 @@ #include "dumpmacros.h" #include "../util/io/FileUtils.h" +#include + #ifdef VPIC_ENABLE_HDF5 #include "hdf5.h" // from the lib #include "hdf5_header_info.h" // from vpic From 5f8211b0c7c699ffda7ea63af7c89a0c433895fa Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Thu, 10 Oct 2019 10:22:34 -0600 Subject: [PATCH 56/95] allow user to change dumping flags in the deck --- sample/harrisHDF5 | 5 ++ src/vpic/dump.cc | 2 - src/vpic/hdf5_header_info.h | 137 -------------------------------- src/vpic/vpic.h | 150 +++++++++++++++++++++++++++++++++++- 4 files changed, 152 insertions(+), 142 deletions(-) diff --git a/sample/harrisHDF5 b/sample/harrisHDF5 index 6beedeed..0c084b07 100644 --- a/sample/harrisHDF5 +++ b/sample/harrisHDF5 @@ -47,6 +47,11 @@ begin_initialization { // Then the initial non-zero fields need to be loaded at time level 0 and the // particles (position and momentum both) need to be loaded at time level 0. 
+ + // Example of how to call / set dumping + field_dump_flag.disableEMAT(); + + double input_mass_ratio; int input_seed; diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 837ddda3..cbdb1289 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -15,8 +15,6 @@ #include "dumpmacros.h" #include "../util/io/FileUtils.h" -#include - #ifdef VPIC_ENABLE_HDF5 #include "hdf5.h" // from the lib #include "hdf5_header_info.h" // from vpic diff --git a/src/vpic/hdf5_header_info.h b/src/vpic/hdf5_header_info.h index 4f1ee934..baed8f7d 100644 --- a/src/vpic/hdf5_header_info.h +++ b/src/vpic/hdf5_header_info.h @@ -2,143 +2,6 @@ #define VPIC_HDF5_HEAD_INFO #define FIELD_ARRAY_NAME field_array -struct field_dump_flag_t -{ - bool ex = true, ey = true, ez = true, div_e_err = true; - bool cbx = true, cby = true, cbz = true, div_b_err = true; - bool tcax = true, tcay = true, tcaz = true, rhob = true; - bool jfx = true, jfy = true, jfz = true, rhof = true; - bool ematx = true, ematy = true, ematz = true, nmat = true; - bool fmatx = true, fmaty = true, fmatz = true, cmat = true; - void disableE() - { - ex = false, ey = false, ez = false, div_e_err = false; - } - - void disableCB() - { - cbx = false, cby = false, cbz = false, div_b_err = false; - } - - void disableTCA() - { - tcax = false, tcay = false, tcaz = false, rhob = false; - } - - void disableJF() - { - jfx = false, jfy = false, jfz = false, rhof = false; - } - - void disableEMAT() - { - ematx = false, ematy = false, ematz = false, nmat = false; - } - - void disableFMAT() - { - fmatx = false, fmaty = false, fmatz = false, cmat = false; - } - - void resetToDefaults() - { - ex = true, ey = true, ez = true, div_e_err = true; - cbx = true, cby = true, cbz = true, div_b_err = true; - tcax = true, tcay = true, tcaz = true, rhob = true; - jfx = true, jfy = true, jfz = true, rhof = true; - ematx = true, ematy = true, ematz = true, nmat = true; - fmatx = true, fmaty = true, fmatz = true, cmat = true; - } - - bool enabledE() - { - 
return ex && ey && ez; - } - - bool enabledCB() - { - return cbx && cby && cbz; - } - - bool enabledTCA() - { - return tcax && tcay && tcaz; - } - - bool enabledJF() - { - return jfx && jfy && jfz; - } - - bool enabledEMAT() - { - return ematx && ematy && ematz; - } - - bool enabledFMAT() - { - return fmatx && fmaty && fmatz; - } -}; - -struct hydro_dump_flag_t -{ - bool jx = true, jy = true, jz = true, rho = true; - bool px = true, py = true, pz = true, ke = true; - bool txx = true, tyy = true, tzz = true; - bool tyz = true, tzx = true, txy = true; - - void disableJ() - { - jx = false, jy = false, jz = false, rho = false; - } - - void disableP() - { - px = false, py = false, pz = false, ke = false; - } - - void disableTD() //Stress diagonal - { - txx = false, tyy = false, tzz = false; - } - - void disableTOD() //Stress off-diagonal - { - tyz = false, tzx = false, txy = false; - } - void resetToDefaults() - { - jx = true, jy = true, jz = true, rho = true; - px = true, py = true, pz = true, ke = true; - txx = true, tyy = true, tzz = true; - tyz = true, tzx = true, txy = true; - } - - bool enabledJ() - { - return jx && jy && jz; - } - - bool enabledP() - { - return px && py && pz; - } - - bool enabledTD() - { - return txx && tyy && tzz; - } - - bool enabledTOD() - { - return tyz && tzx && txy; - } -}; - -// Declare vars to use -hydro_dump_flag_t hydro_dump_flag; -field_dump_flag_t field_dump_flag; // XML header stuff const char *header = "\n\n\n\t\n"; diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index bed8ab97..e90808bb 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -34,6 +34,144 @@ #endif // #include "dumpvars.h" + +// TODO: move these to a better header? 
+#ifdef VPIC_ENABLE_HDF5 +struct field_dump_flag_t +{ + bool ex = true, ey = true, ez = true, div_e_err = true; + bool cbx = true, cby = true, cbz = true, div_b_err = true; + bool tcax = true, tcay = true, tcaz = true, rhob = true; + bool jfx = true, jfy = true, jfz = true, rhof = true; + bool ematx = true, ematy = true, ematz = true, nmat = true; + bool fmatx = true, fmaty = true, fmatz = true, cmat = true; + void disableE() + { + ex = false, ey = false, ez = false, div_e_err = false; + } + + void disableCB() + { + cbx = false, cby = false, cbz = false, div_b_err = false; + } + + void disableTCA() + { + tcax = false, tcay = false, tcaz = false, rhob = false; + } + + void disableJF() + { + jfx = false, jfy = false, jfz = false, rhof = false; + } + + void disableEMAT() + { + ematx = false, ematy = false, ematz = false, nmat = false; + } + + void disableFMAT() + { + fmatx = false, fmaty = false, fmatz = false, cmat = false; + } + + void resetToDefaults() + { + ex = true, ey = true, ez = true, div_e_err = true; + cbx = true, cby = true, cbz = true, div_b_err = true; + tcax = true, tcay = true, tcaz = true, rhob = true; + jfx = true, jfy = true, jfz = true, rhof = true; + ematx = true, ematy = true, ematz = true, nmat = true; + fmatx = true, fmaty = true, fmatz = true, cmat = true; + } + + bool enabledE() + { + return ex && ey && ez; + } + + bool enabledCB() + { + return cbx && cby && cbz; + } + + bool enabledTCA() + { + return tcax && tcay && tcaz; + } + + bool enabledJF() + { + return jfx && jfy && jfz; + } + + bool enabledEMAT() + { + return ematx && ematy && ematz; + } + + bool enabledFMAT() + { + return fmatx && fmaty && fmatz; + } +}; + +struct hydro_dump_flag_t +{ + bool jx = true, jy = true, jz = true, rho = true; + bool px = true, py = true, pz = true, ke = true; + bool txx = true, tyy = true, tzz = true; + bool tyz = true, tzx = true, txy = true; + + void disableJ() + { + jx = false, jy = false, jz = false, rho = false; + } + + void disableP() + { + px = 
false, py = false, pz = false, ke = false; + } + + void disableTD() //Stress diagonal + { + txx = false, tyy = false, tzz = false; + } + + void disableTOD() //Stress off-diagonal + { + tyz = false, tzx = false, txy = false; + } + void resetToDefaults() + { + jx = true, jy = true, jz = true, rho = true; + px = true, py = true, pz = true, ke = true; + txx = true, tyy = true, tzz = true; + tyz = true, tzx = true, txy = true; + } + + bool enabledJ() + { + return jx && jy && jz; + } + + bool enabledP() + { + return px && py && pz; + } + + bool enabledTD() + { + return txx && tyy && tzz; + } + + bool enabledTOD() + { + return tyz && tzx && txy; + } +}; +#endif + typedef FileIO FILETYPE; const uint32_t all (0xffffffff); @@ -235,17 +373,23 @@ class vpic_simulation { void dump_grid( const char *fbase ); void dump_fields( const char *fbase, int fname_tag = 1 ); - void dump_fields_hdf5( const char *fbase, int fname_tag = 1 ); void dump_hydro( const char *sp_name, const char *fbase, int fname_tag = 1 ); - void dump_hydro_hdf5( const char *sp_name, const char *fbase, - int fname_tag = 1 ); void dump_particles( const char *sp_name, const char *fbase, int fname_tag = 1 ); +#ifdef VPIC_ENABLE_HDF5 void dump_particles_hdf5( const char *sp_name, const char *fbase, int fname_tag = 1 ); + void dump_hydro_hdf5( const char *sp_name, const char *fbase, + int fname_tag = 1 ); + void dump_fields_hdf5( const char *fbase, int fname_tag = 1 ); + + // Declare vars to use + hydro_dump_flag_t hydro_dump_flag; + field_dump_flag_t field_dump_flag; +#endif // convenience functions for simlog output void create_field_list(char * strlist, DumpParameters & dumpParams); From 15d8b89a5f8058d79d5a8a716f7fb8815abe30b1 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Mon, 14 Oct 2019 17:40:05 -0600 Subject: [PATCH 57/95] default init hdf5 dumping structs --- src/vpic/vpic.cc | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/vpic/vpic.cc b/src/vpic/vpic.cc index 
4f150afc..a97d7978 100644 --- a/src/vpic/vpic.cc +++ b/src/vpic/vpic.cc @@ -7,9 +7,9 @@ * March/April 2004 - Heavily revised and extended from earlier V4PIC versions * */ - + #include "vpic.h" - + /* Note that, when a vpic_simulation is created (and thus registered with the checkpt service), it is created empty; none of the simulation objects on which it depends have been created yet. (These get created @@ -99,7 +99,7 @@ vpic_simulation::vpic_simulation() { // if( n_rng Date: Mon, 14 Oct 2019 18:39:44 -0600 Subject: [PATCH 58/95] temporarily disable second hydro dumping species as it breaks the current ported approach --- sample/harrisHDF5 | 3 ++- src/vpic/dump.cc | 21 ++++++++++++--------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/sample/harrisHDF5 b/sample/harrisHDF5 index 0c084b07..d6cb208e 100644 --- a/sample/harrisHDF5 +++ b/sample/harrisHDF5 @@ -142,6 +142,7 @@ begin_initialization { num_step = int(0.2*taui/(wci*dt)); status_interval = int(1./(wci*dt)); field_interval = status_interval; + hydro_interval = status_interval; sync_shared_interval = status_interval; clean_div_e_interval = status_interval; clean_div_b_interval = status_interval; @@ -381,7 +382,7 @@ begin_diagnostics { // accumulated using a self-consistent charge-conserving method. Hydro dumps // are in a binary format. Each rank makes a hydro dump. if( should_dump(ehydro) ) dump_hydro_hdf5("electron","ehydro"); - if( should_dump(ihydro) ) dump_hydro_hdf5("ion", "ihydro"); + //if( should_dump(ihydro) ) dump_hydro_hdf5("ion", "ihydro"); // Particle dumps store the particle data for a given species. The data // written is known at the time t = time().
By default, particle dumps diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index cbdb1289..698c2283 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -330,11 +330,11 @@ vpic_simulation::dump_fields_hdf5( const char *fbase, int ftag ) #ifdef DUMP_INFO_DEBUG printf("MPI rank = %d, size = %d \n", mpi_rank, mpi_size); - printf("base dir for field: %s \n", global->fdParams.baseDir); - printf("stride x y z = (%ld, %ld, %ld)\n", global->fdParams.stride_x, global->fdParams.stride_y, global->fdParams.stride_z); + //printf("base dir for field: %s \n", fdParams.baseDir); + //printf("stride x y z = (%ld, %ld, %ld)\n", fdParams.stride_x, fdParams.stride_y, fdParams.stride_z); printf("grid x, y z = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); printf("domain loc (x0, y0, z0) -> (x1, y1, z1) = (%f, %f, %f) -> (%f, %f, %f) \n", grid->x0, grid->y0, grid->z0, grid->x1, grid->y1, grid->z1); - printf("global->topology_x, y, z = %f, %f, %f \n ", global->topology_x, global->topology_y, global->topology_z); + //printf("global->topology_x, y, z = %f, %f, %f \n ", global->topology_x, global->topology_y, global->topology_z); printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid->sz, grid->nv); #endif @@ -458,9 +458,9 @@ printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ global_count[2] = (grid->nz); #ifdef DUMP_INFO_DEBUG - printf("global size = " HSIZE_T ", " HSIZE_T ", " HSIZE_T "\n", field_global_size[0], field_global_size[1], field_global_size[2]); - printf("global_offset = " HSIZE_T ", " HSIZE_T ", " HSIZE_T "\n", global_offset[0], global_offset[1], global_offset[2]); - printf("global_count = " HSIZE_T ", " HSIZE_T ", " HSIZE_T "\n", global_count[0], global_count[1], global_count[2]); + printf("global size = %d %d %d \n", field_global_size[0], field_global_size[1], field_global_size[2]); + printf("global_offset = %d %d %d \n", global_offset[0], global_offset[1], global_offset[2]); + printf("global_count = %d %d %d \n", global_count[0], 
global_count[1], global_count[2]); printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); fflush(stdout); #endif @@ -624,6 +624,9 @@ printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ field_tframe++; } } + +// TODO: fix this, it currently uses a static global and the logic only +// supports 1 species otherwise things get out of sync void vpic_simulation::dump_hydro_hdf5( const char *speciesname, const char *fbase, int ftag ) @@ -758,9 +761,9 @@ printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid- global_count[2] = (grid->nz); #ifdef DUMP_INFO_DEBUG - printf("global size = " HSIZE_T ", " HSIZE_T ", " HSIZE_T "\n", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); - printf("global_offset = " HSIZE_T ", " HSIZE_T ", " HSIZE_T "\n", global_offset[0], global_offset[1], global_offset[2]); - printf("global_count = " HSIZE_T ", " HSIZE_T ", " HSIZE_T "\n", global_count[0], global_count[1], global_count[2]); + printf("global size = %d %d %d \n", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); + printf("global_offset = %d %d %d \n", global_offset[0], global_offset[1], global_offset[2]); + printf("global_count = %d %d %d \n", global_count[0], global_count[1], global_count[2]); printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); fflush(stdout); #endif From 8143ab3e69cfb375b8ed2c9b498d3a1375ad4f28 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 15 Oct 2019 13:03:17 -0600 Subject: [PATCH 59/95] modify sprinf of size_t to be the correct zu --- src/vpic/dump.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 698c2283..eb3be0ba 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -366,17 +366,17 @@ vpic_simulation::dump_fields_hdf5( const char *fbase, int ftag ) sprintf(field_scratch, DUMP_DIR_FORMAT, "field_hdf5"); 
dump_mkdir(field_scratch); - sprintf(subfield_scratch, "%s/T.%lld/", field_scratch, step_for_viou); + sprintf(subfield_scratch, "%s/T.%zu/", field_scratch, step_for_viou); dump_mkdir(subfield_scratch); - sprintf(fname, "%s/%s_%lld.h5", subfield_scratch, "fields", step_for_viou); + sprintf(fname, "%s/%s_%zu.h5", subfield_scratch, "fields", step_for_viou); double el1 = uptime(); hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); hid_t file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); H5Pclose(plist_id); - sprintf(fname, "Timestep_%lld", step_for_viou); + sprintf(fname, "Timestep_%zu", step_for_viou); hid_t group_id = H5Gcreate(file_id, fname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); el1 = uptime() - el1; @@ -688,17 +688,17 @@ printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid- sprintf(hydro_scratch, "./%s", "hydro_hdf5"); dump_mkdir(hydro_scratch); - sprintf(subhydro_scratch, "%s/T.%lld/", hydro_scratch, step_for_viou); + sprintf(subhydro_scratch, "%s/T.%zu/", hydro_scratch, step_for_viou); dump_mkdir(subhydro_scratch); - sprintf(hname, "%s/hydro_%s_%lld.h5", subhydro_scratch, speciesname, step_for_viou); + sprintf(hname, "%s/hydro_%s_%zu.h5", subhydro_scratch, speciesname, step_for_viou); double el1 = uptime(); hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); hid_t file_id = H5Fcreate(hname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); H5Pclose(plist_id); - sprintf(hname, "Timestep_%lld", step_for_viou); + sprintf(hname, "Timestep_%zu", step_for_viou); hid_t group_id = H5Gcreate(file_id, hname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); el1 = uptime() - el1; @@ -713,7 +713,7 @@ printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid- // if (global->fdParams.output_vars.bitset(i)) // varlist[c++] = i; - //printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars); + //printf("\nBEGIN_OUTPUT: numvars = %zu \n", 
numvars); //typedef struct hydro { From eaa4c3cd96be699ed85fcccb0f92e8965d05866d Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 15 Oct 2019 13:30:21 -0600 Subject: [PATCH 60/95] remove scidac 407 ifdefs --- src/vpic/dump.cc | 27 ++------------------------- 1 file changed, 2 insertions(+), 25 deletions(-) diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index eb3be0ba..2b990533 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -663,24 +663,9 @@ void vpic_simulation::dump_hydro_hdf5( const char *speciesname, if (!sp) ERROR(("Invalid species name: %s", speciesname)); -#ifdef ENABLE_V407_SCIDAC - clear_hydro( hydro, grid ); - accumulate_hydro_p( hydro, sp->p, sp->np, sp->q_m, interpolator, grid ); - synchronize_hydro( hydro, grid ); -#else clear_hydro_array(hydro_array); accumulate_hydro_p(hydro_array, sp, interpolator_array); synchronize_hydro_array(hydro_array); -#endif - /*#ifdef DUMP_INFO_DEBUG -printf("MPI rank = %d, size = %d \n", mpi_rank, mpi_size); -printf("base dir for field: %s \n", global->fdParams.baseDir); -printf("stride x y z = (%ld, %ld, %ld)\n", global->fdParams.stride_x, global->fdParams.stride_y, global->fdParams.stride_z); -printf("grid x, y z = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); -printf("domain loc (x0, y0, z0) -> (x1, y1, z1) = (%f, %f, %f) -> (%f, %f, %f) \n", grid->x0, grid->y0, grid->z0, grid->x1, grid->y1, grid->z1); -printf("global->topology_x, y, z = %f, %f, %f \n ", global->topology_x, global->topology_y, global->topology_z); -printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid->sz, grid->nv); -#endif*/ char hname[256]; char hydro_scratch[128]; @@ -952,17 +937,9 @@ vpic_simulation::dump_particles_hdf5( const char *sp_name, { COPY(&sp->p[i], &sp_p[iptl], 1); } - #ifdef ENABLE_V407_SCIDAC - # define PBUF_SIZE 32768 // 1MB of particles - for( int buf_start=0; buf_start np_local ) n_buf = np_local - buf_start; - COPY( p_buf, &sp->p[buf_start], n_buf ); - center_p( p_buf, n_buf, sp->q_m, 
interpolator, grid ); - } - #else + center_p(sp, interpolator_array); - #endif + ec1 = uptime() - ec1; int mpi_rank; MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); From 6ad27e80d0585fa785696739902a69edefda3bb1 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 15 Oct 2019 16:36:14 -0600 Subject: [PATCH 61/95] fix global static and re-enable multiple species dumping --- sample/harrisHDF5 | 4 ++-- src/vpic/dump.cc | 21 +++++++++++++-------- src/vpic/vpic.cc | 2 ++ src/vpic/vpic.h | 2 +- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/sample/harrisHDF5 b/sample/harrisHDF5 index d6cb208e..2b3b21bf 100644 --- a/sample/harrisHDF5 +++ b/sample/harrisHDF5 @@ -381,8 +381,8 @@ begin_diagnostics { // purely diagnostic. It is not used by the simulation and it is not // accumulated using a self-consistent charge-conserving method. Hydro dumps // are in a binary format. Each rank makes a hydro dump. - if( should_dump(ehydro) ) dump_hydro_hdf5("electron","ehydro"); - //if( should_dump(ihydro) ) dump_hydro_hdf5("ion", "ihydro"); + if(should_dump(ehydro) ) dump_hydro_hdf5("electron","ehydro"); + if( should_dump(ihydro) ) dump_hydro_hdf5("ion", "ihydro"); // Particle dumps store the particle data for a given species. The data // written is known at the time t = time(). By default, particle dumps diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 2b990533..8ed36449 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -10,6 +10,7 @@ */ #include +#include #include "vpic.h" #include "dumpmacros.h" @@ -27,6 +28,10 @@ // COMPATIBLE WITH EXISTING EXTERNAL 3RD PARTY VISUALIZATION SOFTWARE. // IN THE LONG RUN, THIS EXTERNAL SOFTWARE WILL NEED TO BE UPDATED. 
+// TODO: this should live somewhere more sensible, but it's better than the +// global static it replaces +std::unordered_map tframe_map; + int vpic_simulation::dump_mkdir(const char * dname) { return FileUtils::makeDirectory(dname); } // dump_mkdir @@ -578,8 +583,6 @@ printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ char dxdydz[128]; sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); - //int fields_interval = global->fields_interval; - // TODO: make sure field interval is set int nframes = num_step / field_interval + 1; static int field_tframe = 0; @@ -598,6 +601,8 @@ printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ printf(" tframe: %d \n", field_tframe); #endif + // TODO: this footer dumping is more likely better done in a + // destructor, rather than hoping a multiple division works out if (field_tframe >= 1) { if (field_tframe == (nframes - 1)) @@ -842,9 +847,9 @@ void vpic_simulation::dump_hydro_hdf5( const char *speciesname, char dxdydz[128]; sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); - int nframes = num_step / field_interval + 1; - int fields_interval = field_interval; - static int tframe = 0; + int nframes = num_step / hydro_interval + 1; + + const int tframe = tframe_map[sp->id]; #ifdef DUMP_INFO_DEBUG printf(" meta file : %s \n", output_xml_file); @@ -853,7 +858,7 @@ void vpic_simulation::dump_hydro_hdf5( const char *speciesname, printf(" orignal: %s \n", orignal); printf(" dxdydz: %s \n", dxdydz); printf(" nframes: %d \n", nframes); - printf(" fields_interval: %d \n", fields_interval); + printf(" hydro_fields_interval: %d \n", hydro_interval); printf(" current step: %lld \n", step_for_viou); printf(" Simulation time: %f \n", grid->t0); printf(" tframe: %d \n", tframe); @@ -874,7 +879,7 @@ void vpic_simulation::dump_hydro_hdf5( const char *speciesname, } else { - create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, fields_interval); + create_file_with_header(output_xml_file, dimensions_3d, 
orignal, dxdydz, nframes, hydro_interval); if (tframe == (nframes - 1)) { invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 1); @@ -884,7 +889,7 @@ void vpic_simulation::dump_hydro_hdf5( const char *speciesname, invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); } } - tframe++; + tframe_map[sp->id]++; } } diff --git a/src/vpic/vpic.cc b/src/vpic/vpic.cc index a97d7978..bfd18767 100644 --- a/src/vpic/vpic.cc +++ b/src/vpic/vpic.cc @@ -110,6 +110,8 @@ vpic_simulation::vpic_simulation() { #ifdef VPIC_ENABLE_HDF5 // Default init hdf5 dump flags + field_interval = 1; + hydro_interval = 1; field_dump_flag = field_dump_flag_t(); hydro_dump_flag = hydro_dump_flag_t(); #endif diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index e90808bb..ce1a2454 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -284,7 +284,7 @@ class vpic_simulation { double quota; int checkpt_interval; int hydro_interval; - int field_interval = 1; + int field_interval; int particle_interval; size_t nxout, nyout, nzout; From f81849115cb20b1538b127eeac183b27c4e450a0 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Wed, 16 Oct 2019 12:33:22 -0600 Subject: [PATCH 62/95] first pass adding support for converting p->i to a global i --- src/vpic/dump.cc | 56 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 12 deletions(-) diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 8ed36449..97253c79 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -693,7 +693,7 @@ void vpic_simulation::dump_hydro_hdf5( const char *speciesname, el1 = uptime() - el1; //sim_log("TimeHDF5Open: " << el1 << " s"); //Easy to handle results for scripts - double el2 = uptime(); + //double el2 = uptime(); // Create a variable list of field values to output. 
//size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); @@ -802,7 +802,7 @@ void vpic_simulation::dump_hydro_hdf5( const char *speciesname, if (hydro_dump_flag.txy) DUMP_HYDRO_TO_HDF5("txy", txy, H5T_NATIVE_FLOAT); - el2 = uptime() - el2; + //el2 = uptime() - el2; //sim_log("TimeHDF5Write: " << el2 << " s"); double el3 = uptime(); @@ -990,49 +990,81 @@ vpic_simulation::dump_particles_hdf5( const char *sp_name, el1 = uptime() - el1; //sim_log("Particle TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts - double el2 = uptime(); + //double el2 = uptime(); + // This point offset is silly, and loses the type safety (pf+1) hid_t dset_id = H5Dcreate(group_id, "dX", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); int ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf); H5Dclose(dset_id); - //if (rank == 0) printf ("Written variable dX \n"); dset_id = H5Dcreate(group_id, "dY", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 1); H5Dclose(dset_id); - //if (rank == 0) printf ("Written variable dY \n"); dset_id = H5Dcreate(group_id, "dZ", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 2); H5Dclose(dset_id); - //if (rank == 0) printf ("Written variable dZ \n"); + + int local_i = *(Pi + 3); + int write_i = local_i; + +#ifdef OUTPUT_CONVERT_GLOBAL_ID +# define UNVOXEL(rank, ix, iy, iz, nx, ny, nz) BEGIN_PRIMITIVE { \ + int _ix, _iy, _iz; \ + _ix = (rank); /* ix = ix+gpx*( iy+gpy*iz ) */ \ + _iy = _ix/int(nx); /* iy = iy+gpy*iz */ \ + _ix -= _iy*int(nx); /* ix = ix */ \ + _iz = _iy/int(ny); /* iz = iz */ \ + _iy -= _iz*int(ny); /* iy = iy */ \ + (ix) = _ix; \ + (iy) = _iy; \ + (iz) = _iz; \ + } END_PRIMITIVE + int ix, iy, iz, rx, ry, rz; + // Convert rank to local x/y/z + 
UNVOXEL(rank(), rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + + // Calculate local ix/iy/iz + UNVOXEL(local_i, ix, iy, iz, grid->nx+2, grid->ny+2, grid->nz+2); + + // Convert ix/iy/iz to global + int gix = ix + (grid->nx * (rx)); + int giy = iy + (grid->ny * (ry)); + int giz = iz + (grid->nz * (rz)); + + // calculate global grid sizes + int gnx = grid->nx * grid->gpx; + int gny = grid->ny * grid->gpy; + int gnz = grid->nz * grid->gpz; + + int global_i = VOXEL(gix, giy, giz, gnx, gny, gnz); + + write_i = global_i; + // TODO: update the address written below, it requires something more stable than a statced int +#undef UNVOXEL +#endif dset_id = H5Dcreate(group_id, "i", H5T_NATIVE_INT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, memspace, filespace, plist_id, Pi + 3); H5Dclose(dset_id); - //if (rank == 0) printf ("Written variable i \n"); dset_id = H5Dcreate(group_id, "Ux", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 4); H5Dclose(dset_id); - //if (rank == 0) printf ("Written variable Ux \n"); dset_id = H5Dcreate(group_id, "Uy", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 5); H5Dclose(dset_id); - //if (rank == 0) printf ("Written variable Uy \n"); dset_id = H5Dcreate(group_id, "Uz", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 6); H5Dclose(dset_id); - //if (rank == 0) printf ("Written variable Uz \n"); dset_id = H5Dcreate(group_id, "q", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 7); H5Dclose(dset_id); - //if (rank == 0) printf ("Written variable q \n"); - el2 = uptime() - el2; + //el2 = uptime() - el2; 
//sim_log("Particle TimeHDF5Write: " << el2 << " s"); double el3 = uptime(); From 73e071690489ec7269bf7cd415881be672204626 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Wed, 16 Oct 2019 15:32:07 -0600 Subject: [PATCH 63/95] add loop ovr particles and ability to write custom global pi to file --- src/vpic/dump.cc | 58 +++++++++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 97253c79..733a2913 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -978,6 +978,11 @@ vpic_simulation::dump_particles_hdf5( const char *sp_name, hsize_t memspace_count_temp = numparticles * 8; hid_t memspace = H5Screate_simple(1, &memspace_count_temp, NULL); + + // Don't need, can just use H5S_ALL + //hsize_t linearspace_count_temp = numparticles; + //hid_t linearspace = H5Screate_simple(1, &linearspace_count_temp, NULL); + plist_id = H5Pcreate(H5P_DATASET_XFER); //Comment out for test only @@ -1005,9 +1010,6 @@ vpic_simulation::dump_particles_hdf5( const char *sp_name, ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 2); H5Dclose(dset_id); - int local_i = *(Pi + 3); - int write_i = local_i; - #ifdef OUTPUT_CONVERT_GLOBAL_ID # define UNVOXEL(rank, ix, iy, iz, nx, ny, nz) BEGIN_PRIMITIVE { \ int _ix, _iy, _iz; \ @@ -1020,33 +1022,49 @@ vpic_simulation::dump_particles_hdf5( const char *sp_name, (iy) = _iy; \ (iz) = _iz; \ } END_PRIMITIVE - int ix, iy, iz, rx, ry, rz; - // Convert rank to local x/y/z - UNVOXEL(rank(), rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); - // Calculate local ix/iy/iz - UNVOXEL(local_i, ix, iy, iz, grid->nx+2, grid->ny+2, grid->nz+2); + std::vector global_pi; + global_pi.reserve(numparticles); + // TODO: this could be parallel + for (int i = 0; i < numparticles; i++) + { + int local_i = *(Pi + 3); + int write_i = local_i; + + int ix, iy, iz, rx, ry, rz; + // Convert rank to local x/y/z + UNVOXEL(rank(), rx, ry, rz, grid->gpx, grid->gpy, 
grid->gpz); + + // Calculate local ix/iy/iz + UNVOXEL(local_i, ix, iy, iz, grid->nx+2, grid->ny+2, grid->nz+2); - // Convert ix/iy/iz to global - int gix = ix + (grid->nx * (rx)); - int giy = iy + (grid->ny * (ry)); - int giz = iz + (grid->nz * (rz)); + // Convert ix/iy/iz to global + int gix = ix + (grid->nx * (rx)); + int giy = iy + (grid->ny * (ry)); + int giz = iz + (grid->nz * (rz)); - // calculate global grid sizes - int gnx = grid->nx * grid->gpx; - int gny = grid->ny * grid->gpy; - int gnz = grid->nz * grid->gpz; + // calculate global grid sizes + int gnx = grid->nx * grid->gpx; + int gny = grid->ny * grid->gpy; + int gnz = grid->nz * grid->gpz; - int global_i = VOXEL(gix, giy, giz, gnx, gny, gnz); + int global_i = VOXEL(gix, giy, giz, gnx, gny, gnz); + int* hmm = new int(); + *hmm = 10; + + global_pi[i] = global_i; + } - write_i = global_i; - // TODO: update the address written below, it requires something more stable than a statced int #undef UNVOXEL -#endif + dset_id = H5Dcreate(group_id, "i", H5T_NATIVE_INT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, H5S_ALL, filespace, plist_id, global_pi.data()); + H5Dclose(dset_id); +#else dset_id = H5Dcreate(group_id, "i", H5T_NATIVE_INT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, memspace, filespace, plist_id, Pi + 3); H5Dclose(dset_id); +#endif dset_id = H5Dcreate(group_id, "Ux", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 4); From 843babb51429a496279371821c8f0e84a7844c78 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 22 Oct 2019 09:16:04 -0600 Subject: [PATCH 64/95] tidied up global particle id convert, close to correct now --- src/vpic/dump.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 733a2913..a63a13a2 100644 --- 
a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -1010,6 +1010,7 @@ vpic_simulation::dump_particles_hdf5( const char *sp_name, ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 2); H5Dclose(dset_id); +#define OUTPUT_CONVERT_GLOBAL_ID 1 #ifdef OUTPUT_CONVERT_GLOBAL_ID # define UNVOXEL(rank, ix, iy, iz, nx, ny, nz) BEGIN_PRIMITIVE { \ int _ix, _iy, _iz; \ @@ -1028,8 +1029,7 @@ vpic_simulation::dump_particles_hdf5( const char *sp_name, // TODO: this could be parallel for (int i = 0; i < numparticles; i++) { - int local_i = *(Pi + 3); - int write_i = local_i; + int local_i = sp->p[i].i; int ix, iy, iz, rx, ry, rz; // Convert rank to local x/y/z @@ -1048,10 +1048,10 @@ vpic_simulation::dump_particles_hdf5( const char *sp_name, int gny = grid->ny * grid->gpy; int gnz = grid->nz * grid->gpz; - int global_i = VOXEL(gix, giy, giz, gnx, gny, gnz); - int* hmm = new int(); - *hmm = 10; + // TODO: find a better way to account for the hard coded ghosts in VOXEL + int global_i = VOXEL(gix, giy, giz, gnx-2, gny-2, gnz-2); + //std::cout << rank() << " local i " << local_i << " becomes " << global_i << std::endl; global_pi[i] = global_i; } From 351a785d46bb8088191ac90e2a5711aa897600f4 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 29 Oct 2019 14:27:46 -0600 Subject: [PATCH 65/95] first pass where build system does something useful --- CMakeLists.txt | 94 +++++---- bin/vpic-local.in | 2 +- sample/harrisOpenPMD | 443 +++++++++++++++++++++++++++++++++++++++++++ src/vpic/dump.cc | 16 ++ src/vpic/vpic.h | 3 + 5 files changed, 516 insertions(+), 42 deletions(-) create mode 100644 sample/harrisOpenPMD diff --git a/CMakeLists.txt b/CMakeLists.txt index 8c354de5..57f1ed13 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,6 +88,8 @@ option(DISABLE_DYNAMIC_RESIZING "Prevent particle arrays from dynamically resizi option(USE_HDF5 "Enable HDF5 for use during IO. VPIC does not help you install HDF5" OFF) +option(USE_OPENPMD "Enable OpenPMD for use during IO. 
VPIC does not help you install OpenPM" OFF) + # option to set minimum number of particles set(SET_MIN_NUM_PARTICLES AUTO CACHE STRING "Select minimum number of particles to use, if using dynamic particle array resizing") @@ -136,7 +138,7 @@ find_package(Threads REQUIRED) if(USE_LEGACY_SORT) add_definitions(-DVPIC_USE_LEGACY_SORT) - set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DVPIC_USE_LEGACY_SORT") + set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DVPIC_USE_LEGACY_SORT") endif(USE_LEGACY_SORT) #------------------------------------------------------------------------------# @@ -151,7 +153,7 @@ endif() if(USE_PTHREADS) add_definitions(-DVPIC_USE_PTHREADS) - set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DVPIC_USE_PTHREADS") + set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} -DVPIC_USE_PTHREADS") endif(USE_PTHREADS) if(USE_OPENMP) @@ -307,30 +309,6 @@ if(ENABLE_COVERAGE_BUILD) set(VPIC_CXX_FLAGS "${VPIC_CXX_FLAGS} --coverage") endif(ENABLE_COVERAGE_BUILD) -# process Makefile.run.in to get a simple Makefile.run for a run. Points to -# local built exe wrapper, and has example deck/platform. 
-configure_file(${CMAKE_SOURCE_DIR}/sample/Makefile.run.in - ${CMAKE_BINARY_DIR}/bin/Makefile.run) - -# Append all defines to VPIC_DEFINES, so it can be seen during input deck building -get_directory_property(ALL_DEFINES DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS) -#string(REPLACE ";" " -D" EEK "${ALL_DEFINES}") -foreach(d ${ALL_DEFINES}) - set(VPIC_DEFINES "${VPIC_DEFINES} -D${d}") -endforeach() - -# install script -configure_file(${CMAKE_SOURCE_DIR}/bin/vpic.in - ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic-install) -install(FILES ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic-install - DESTINATION bin - RENAME vpic - PERMISSIONS - OWNER_READ OWNER_WRITE OWNER_EXECUTE - GROUP_READ GROUP_EXECUTE - WORLD_READ WORLD_EXECUTE - ) - install(FILES deck/main.cc deck/wrapper.cc DESTINATION share/vpic) install(FILES deck/wrapper.h DESTINATION include/vpic) install(DIRECTORY src/ DESTINATION include/vpic @@ -361,22 +339,22 @@ else() install(TARGETS vpic LIBRARY DESTINATION lib ARCHIVE DESTINATION lib) endif() -# Configure local script to generate bin/vpic -configure_file(${CMAKE_SOURCE_DIR}/bin/vpic-local.in - ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic) - -file(COPY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic - DESTINATION ${CMAKE_BINARY_DIR}/bin - FILE_PERMISSIONS - OWNER_READ OWNER_WRITE OWNER_EXECUTE - GROUP_READ GROUP_EXECUTE - WORLD_READ WORLD_EXECUTE -) - target_include_directories(vpic INTERFACE ${CMAKE_SOURCE_DIR}/src) target_link_libraries(vpic ${VPIC_EXPOSE} ${MPI_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} ${CMAKE_DL_LIBS} ${HDF5_C_LIBRARIES}) target_compile_options(vpic ${VPIC_EXPOSE} ${MPI_C_COMPILE_FLAGS}) +if(USE_OPENPMD) + # Enable OpenPMD, and the relevant defines + message("Using OpenPMD") + find_package(openPMD REQUIRED) + add_definitions(-DVPIC_ENABLE_OPENPMD) + link_libraries(openPMD::openPMD) + get_target_property(openPMD_LIBRARIES openPMD::openPMD LOCATION) + string(REPLACE ";" " " string_libraries ${openPMD_LIBRARIES}) + 
set(VPIC_CXX_LIBRARIES "${VPIC_CXX_LIBRARIES} ${string_libraries}") +endif(USE_OPENPMD) +message(${VPIC_CXX_LIBRARIES}) + macro(build_a_vpic name deck) if(NOT EXISTS ${deck}) message(FATAL_ERROR "Could not find deck '${deck}'") @@ -434,6 +412,40 @@ if(ENABLE_PERFORMANCE_TESTS) include_directories(${CATCH_DIR}) add_subdirectory(test/performance) endif(ENABLE_PERFORMANCE_TESTS) -#~---------------------------------------------------------------------------~-# -# vim: set tabstop=2 shiftwidth=2 expandtab : -#~---------------------------------------------------------------------------~-# + +# process Makefile.run.in to get a simple Makefile.run for a run. Points to +# local built exe wrapper, and has example deck/platform. +configure_file(${CMAKE_SOURCE_DIR}/sample/Makefile.run.in + ${CMAKE_BINARY_DIR}/bin/Makefile.run) + +# Append all defines to VPIC_DEFINES, so it can be seen during input deck building +get_directory_property(ALL_DEFINES DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS) +#string(REPLACE ";" " -D" EEK "${ALL_DEFINES}") +foreach(d ${ALL_DEFINES}) + set(VPIC_DEFINES "${VPIC_DEFINES} -D${d}") +endforeach() + +# install script +configure_file(${CMAKE_SOURCE_DIR}/bin/vpic.in + ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic-install) +install(FILES ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic-install + DESTINATION bin + RENAME vpic + PERMISSIONS + OWNER_READ OWNER_WRITE OWNER_EXECUTE + GROUP_READ GROUP_EXECUTE + WORLD_READ WORLD_EXECUTE +) + +# Configure local script to generate bin/vpic +configure_file(${CMAKE_SOURCE_DIR}/bin/vpic-local.in + ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic) + +file(COPY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/vpic + DESTINATION ${CMAKE_BINARY_DIR}/bin + FILE_PERMISSIONS + OWNER_READ OWNER_WRITE OWNER_EXECUTE + GROUP_READ GROUP_EXECUTE + WORLD_READ WORLD_EXECUTE +) + diff --git a/bin/vpic-local.in b/bin/vpic-local.in index f0e64ded..e372c91f 100644 --- a/bin/vpic-local.in +++ b/bin/vpic-local.in @@ -4,4 +4,4 @@ 
deck=`echo $1 | sed 's,\.cxx,,g;s,\.cc,,g;s,\.cpp,,g;s,.*\/,,g'` echo "${CMAKE_CXX_COMPILER} ${VPIC_DEFINES} ${CMAKE_CXX_FLAGS} -I. -I${CMAKE_SOURCE_DIR}/src ${VPIC_CXX_FLAGS} -DINPUT_DECK=$1 ${CMAKE_SOURCE_DIR}/deck/main.cc ${CMAKE_SOURCE_DIR}/deck/wrapper.cc -o $deck.${CMAKE_SYSTEM_NAME} -Wl,-rpath,${CMAKE_BINARY_DIR} -L${CMAKE_BINARY_DIR} -lvpic ${VPIC_CXX_LIBRARIES} -lpthread -ldl" -${CMAKE_CXX_COMPILER} ${VPIC_DEFINES}${CMAKE_CXX_FLAGS} -I. -I${CMAKE_SOURCE_DIR}/src ${VPIC_CXX_FLAGS} -DINPUT_DECK=$1 ${CMAKE_SOURCE_DIR}/deck/main.cc ${CMAKE_SOURCE_DIR}/deck/wrapper.cc -o $deck.${CMAKE_SYSTEM_NAME} -Wl,-rpath,${CMAKE_BINARY_DIR} -L${CMAKE_BINARY_DIR} -lvpic ${VPIC_CXX_LIBRARIES} -lpthread -ldl +${CMAKE_CXX_COMPILER} ${VPIC_DEFINES} ${CMAKE_CXX_FLAGS} -I. -I${CMAKE_SOURCE_DIR}/src ${VPIC_CXX_FLAGS} -DINPUT_DECK=$1 ${CMAKE_SOURCE_DIR}/deck/main.cc ${CMAKE_SOURCE_DIR}/deck/wrapper.cc -o $deck.${CMAKE_SYSTEM_NAME} -Wl,-rpath,${CMAKE_BINARY_DIR} -L${CMAKE_BINARY_DIR} -lvpic ${VPIC_CXX_LIBRARIES} -lpthread -ldl diff --git a/sample/harrisOpenPMD b/sample/harrisOpenPMD new file mode 100644 index 00000000..f8c932e9 --- /dev/null +++ b/sample/harrisOpenPMD @@ -0,0 +1,443 @@ +// Magnetic reconnection in a Harris equilibrium thin current sheet +// +// This input deck reproduces the PIC simulations found in: +// William Daughton. "Nonlinear dynamics of thin current sheets." Phys. +// Plasmas. 9(9): 3668-3678. September 2002. +// +// This input deck was written by: +// Kevin J Bowers, Ph.D. +// Plasma Physics Group (X-1) +// Applied Physics Division +// Los Alamos National Lab +// August 2003 - original version +// October 2003 - heavily revised to utilize input deck syntactic sugar +// March/April 2004 - rewritten for domain decomposition V4PIC + +// If you want to use global variables (for example, to store the dump +// intervals for your diagnostics section), it must be done in the globals +// section. 
Variables declared the globals section will be preserved across +// restart dumps. For example, if the globals section is: +// begin_globals { +// double variable; +// } end_globals +// the double "variable" will be visible to other input deck sections as +// "global->variable". Note: Variables declared in the globals section are set +// to zero before the user's initialization block is executed. Up to 16K +// of global variables can be defined. + + +// Deck only works if VPIC was build with HDF support. Check for that: +#ifndef VPIC_ENABLE_OPENPMD +#error "VPIC_ENABLE_OPENPMD" is required +#endif + +begin_globals { + double energies_interval; + double fields_interval; + double ehydro_interval; + double ihydro_interval; + double eparticle_interval; + double iparticle_interval; + double restart_interval; +}; + +begin_initialization { + // At this point, there is an empty grid and the random number generator is + // seeded with the rank. The grid, materials, species need to be defined. + // Then the initial non-zero fields need to be loaded at time level 0 and the + // particles (position and momentum both) need to be loaded at time level 0. 
+ + // Example of how to call / set dumping + //field_dump_flag.disableEMAT(); + + double input_mass_ratio; + int input_seed; + + // Arguments can be passed from the command line to the input deck + if( num_cmdline_arguments!=3 ) { + // Set sensible defaults + input_mass_ratio = 1.0; + input_seed = 0; + + sim_log( "Defaulting to mass_ratio of " << input_mass_ratio << " and seed of " << input_seed ); + sim_log( "For Custom Usage: " << cmdline_argument[0] << " mass_ratio seed" ); + } + else { + input_mass_ratio = atof(cmdline_argument[1]); // Ion mass / electron mass + input_seed = atof(cmdline_argument[2]); // Ion mass / electron mass + sim_log( "Detected input mass_ratio of " << input_mass_ratio << " and seed of " << input_seed ); + } + seed_entropy( input_seed ); + + // Diagnostic messages can be passed written (usually to stderr) + sim_log( "Computing simulation parameters"); + + // Define the system of units for this problem (natural units) + double L = 1; // Length normalization (sheet thickness) + double ec = 1; // Charge normalization + double me = 1; // Mass normalization + double c = 1; // Speed of light + double eps0 = 1; // Permittivity of space + + // Physics parameters + double mi_me = input_mass_ratio; // Ion mass / electron mass + double rhoi_L = 1; // Ion thermal gyroradius / Sheet thickness + double Ti_Te = 1; // Ion temperature / electron temperature + double wpe_wce = 3; // Electron plasma freq / electron cycltron freq + double theta = 0; // Orientation of the simulation wrt current sheet + double taui = 100; // Simulation wci's to run + + // Numerical parameters + double Lx = 16*L; // How big should the box be in the x direction + double Ly = 16*L; // How big should the box be in the y direction + double Lz = 16*L; // How big should the box be in the z direction + double nx = 64; // Global resolution in the x direction + double ny = 64; // Global resolution in the y direction + double nz = 1; // Global resolution in the z direction + double nppc 
= 64; // Average number of macro particles per cell (both species combined!) + double cfl_req = 0.99; // How close to Courant should we try to run + double wpedt_max = 0.36; // How big a timestep is allowed if Courant is not too restrictive + double damp = 0.001; // Level of radiation damping + + // Derived quantities + double mi = me*mi_me; // Ion mass + double kTe = me*c*c/(2*wpe_wce*wpe_wce*(1+Ti_Te)); // Electron temperature + double kTi = kTe*Ti_Te; // Ion temperature + double vthe = sqrt(2*kTe/me); // Electron thermal velocity (B.D. convention) + double vthi = sqrt(2*kTi/mi); // Ion thermal velocity (B.D. convention) + double wci = vthi/(rhoi_L*L); // Ion cyclotron frequency + double wce = wci*mi_me; // Electron cyclotron frequency + double wpe = wce*wpe_wce; // Electron plasma frequency + double wpi = wpe/sqrt(mi_me); // Ion plasma frequency + double vdre = c*c*wce/(wpe*wpe*L*(1+Ti_Te)); // Electron drift velocity + double vdri = -Ti_Te*vdre; // Ion drift velocity + double b0 = me*wce/ec; // Asymptotic magnetic field strength + double n0 = me*eps0*wpe*wpe/(ec*ec); // Peak electron density (also peak ion density) + double Npe = 2*n0*Ly*Lz*L*tanh(0.5*Lx/L); // Number of physical electrons in box + double Npi = Npe; // Number of physical ions in box + double Ne = 0.5*nppc*nx*ny*nz; // Total macro electrons in box + Ne = trunc_granular(Ne,nproc()); // Make it divisible by number of processors + double Ni = Ne; // Total macro ions in box + double we = Npe/Ne; // Weight of a macro electron + double wi = Npi/Ni; // Weight of a macro ion + double gdri = 1/sqrt(1-vdri*vdri/(c*c)); // gamma of ion drift frame + double gdre = 1/sqrt(1-vdre*vdre/(c*c)); // gamma of electron drift frame + double udri = vdri*gdri; // 4-velocity of ion drift frame + double udre = vdre*gdre; // 4-velocity of electron drift frame + double uthi = sqrt(kTi/mi)/c; // Normalized ion thermal velocity (K.B. convention) + double uthe = sqrt(kTe/me)/c; // Normalized electron thermal velocity (K.B. 
convention) + double cs = cos(theta); + double sn = sin(theta); + + // Determine the timestep + double dg = courant_length(Lx,Ly,Lz,nx,ny,nz); // Courant length + double dt = cfl_req*dg/c; // Courant limited time step + if( wpe*dt>wpedt_max ) dt=wpedt_max/wpe; // Override time step if plasma frequency limited + + //////////////////////////////////////// + // Setup high level simulation parmeters + + num_step = int(0.2*taui/(wci*dt)); + status_interval = int(1./(wci*dt)); + field_interval = status_interval; + hydro_interval = status_interval; + sync_shared_interval = status_interval; + clean_div_e_interval = status_interval; + clean_div_b_interval = status_interval; + + global->energies_interval = status_interval; + global->fields_interval = status_interval; + global->ehydro_interval = status_interval; + global->ihydro_interval = status_interval; + global->eparticle_interval = status_interval; + global->iparticle_interval = status_interval; + global->restart_interval = status_interval; + + /////////////////////////// + // Setup the space and time + + // Setup basic grid parameters + define_units( c, eps0 ); + define_timestep( dt ); + + // Parition a periodic box among the processors sliced uniformly along y + define_periodic_grid( -0.5*Lx, 0, 0, // Low corner + 0.5*Lx, Ly, Lz, // High corner + nx, ny, nz, // Resolution + 1, nproc(), 1 ); // Topology + + // Override some of the boundary conditions to put a particle reflecting + // perfect electrical conductor on the -x and +x boundaries + set_domain_field_bc( BOUNDARY(-1,0,0), pec_fields ); + set_domain_field_bc( BOUNDARY( 1,0,0), pec_fields ); + set_domain_particle_bc( BOUNDARY(-1,0,0), reflect_particles ); + set_domain_particle_bc( BOUNDARY( 1,0,0), reflect_particles ); + + define_material( "vacuum", 1 ); + // Note: define_material defaults to isotropic materials with mu=1,sigma=0 + // Tensor electronic, magnetic and conductive materials are supported + // though. 
See "shapes" for how to define them and assign them to regions. + // Also, space is initially filled with the first material defined. + + // If you pass NULL to define field array, the standard field array will + // be used (if damp is not provided, no radiation damping will be used). + define_field_array( NULL, damp ); + + //////////////////// + // Setup the species + + // Allow 50% more local_particles in case of non-uniformity + // VPIC will pick the number of movers to use for each species + // Both species use out-of-place sorting + species_t * ion = define_species( "ion", ec, mi, 1.5*Ni/nproc(), -1, 40, 1 ); + species_t * electron = define_species( "electron", -ec, me, 1.5*Ne/nproc(), -1, 20, 1 ); + + /////////////////////////////////////////////////// + // Log diagnostic information about this simulation + + sim_log( "" ); + sim_log( "System of units" ); + sim_log( "L = " << L ); + sim_log( "ec = " << ec ); + sim_log( "me = " << me ); + sim_log( "c = " << c ); + sim_log( "eps0 = " << eps0 ); + sim_log( "" ); + sim_log( "Physics parameters" ); + sim_log( "rhoi/L = " << rhoi_L ); + sim_log( "Ti/Te = " << Ti_Te ); + sim_log( "wpe/wce = " << wpe_wce ); + sim_log( "mi/me = " << mi_me ); + sim_log( "theta = " << theta ); + sim_log( "taui = " << taui ); + sim_log( "" ); + sim_log( "Numerical parameters" ); + sim_log( "num_step = " << num_step ); + sim_log( "dt = " << dt ); + sim_log( "Lx = " << Lx << ", Lx/L = " << Lx/L ); + sim_log( "Ly = " << Ly << ", Ly/L = " << Ly/L ); + sim_log( "Lz = " << Lz << ", Lz/L = " << Lz/L ); + sim_log( "nx = " << nx << ", dx = " << Lx/nx << ", L/dx = " << L*nx/Lx ); + sim_log( "ny = " << ny << ", dy = " << Ly/ny << ", L/dy = " << L*ny/Ly ); + sim_log( "nz = " << nz << ", dz = " << Lz/nz << ", L/dz = " << L*nz/Lz ); + sim_log( "nppc = " << nppc ); + sim_log( "courant = " << c*dt/dg ); + sim_log( "damp = " << damp ); + sim_log( "" ); + sim_log( "Ion parameters" ); + sim_log( "qpi = " << ec << ", mi = " << mi << ", qpi/mi = " << ec/mi 
); + sim_log( "vthi = " << vthi << ", vthi/c = " << vthi/c << ", kTi = " << kTi ); + sim_log( "vdri = " << vdri << ", vdri/c = " << vdri/c ); + sim_log( "wpi = " << wpi << ", wpi dt = " << wpi*dt << ", n0 = " << n0 ); + sim_log( "wci = " << wci << ", wci dt = " << wci*dt ); + sim_log( "rhoi = " << vthi/wci << ", L/rhoi = " << L/(vthi/wci) << ", dx/rhoi = " << (Lx/nx)/(vthi/wci) ); + sim_log( "debyei = " << vthi/wpi << ", L/debyei = " << L/(vthi/wpi) << ", dx/debyei = " << (Lx/nx)/(vthi/wpi) ); + sim_log( "Npi = " << Npi << ", Ni = " << Ni << ", Npi/Ni = " << Npi/Ni << ", wi = " << wi ); + sim_log( "" ); + sim_log( "Electron parameters" ); + sim_log( "qpe = " << -ec << ", me = " << me << ", qpe/me = " << -ec/me ); + sim_log( "vthe = " << vthe << ", vthe/c = " << vthe/c << ", kTe = " << kTe ); + sim_log( "vdre = " << vdre << ", vdre/c = " << vdre/c ); + sim_log( "wpe = " << wpe << ", wpe dt = " << wpe*dt << ", n0 = " << n0 ); + sim_log( "wce = " << wce << ", wce dt = " << wce*dt ); + sim_log( "rhoe = " << vthe/wce << ", L/rhoe = " << L/(vthe/wce) << ", dx/rhoe = " << (Lx/nx)/(vthe/wce) ); + sim_log( "debyee = " << vthe/wpe << ", L/debyee = " << L/(vthe/wpe) << ", dx/debyee = " << (Lx/nx)/(vthe/wpe) ); + sim_log( "Npe = " << Npe << ", Ne = " << Ne << ", Npe/Ne = " << Npe/Ne << ", we = " << we ); + sim_log( "" ); + sim_log( "Miscellaneous" ); + sim_log( "nptotal = " << Ni + Ne ); + sim_log( "nproc = " << nproc() ); + sim_log( "" ); + + //////////////////////////// + // Load fields and particles + + sim_log( "Loading fields" ); + + set_region_field( everywhere, 0, 0, 0, // Electric field + 0, -sn*b0*tanh(x/L), cs*b0*tanh(x/L) ); // Magnetic field + // Note: everywhere is a region that encompasses the entire simulation + // In general, regions are specied as logical equations (i.e. 
x>0 && x+y<2) + + sim_log( "Loading particles" ); + + double ymin = rank()*Ly/nproc(), ymax = (rank()+1)*Ly/nproc(); + + repeat( Ni/nproc() ) { + double x, y, z, ux, uy, uz, d0; + + // Pick an appropriately distributed random location for the pair + do { + x = L*atanh( uniform( rng(0), -1, 1 ) ); + } while( x<=-0.5*Lx || x>=0.5*Lx ); + y = uniform( rng(0), ymin, ymax ); + z = uniform( rng(0), 0, Lz ); + + // For the ion, pick an isothermal normalized momentum in the drift frame + // (this is a proper thermal equilibrium in the non-relativistic limit), + // boost it from the drift frame to the frame with the magnetic field + // along z and then rotate it into the lab frame. Then load the particle. + // Repeat the process for the electron. + + ux = normal( rng(0), 0, uthi ); + uy = normal( rng(0), 0, uthi ); + uz = normal( rng(0), 0, uthi ); + d0 = gdri*uy + sqrt(ux*ux+uy*uy+uz*uz+1)*udri; + uy = d0*cs - uz*sn; + uz = d0*sn + uz*cs; + inject_particle( ion, x, y, z, ux, uy, uz, wi, 0, 0 ); + + ux = normal( rng(0), 0, uthe ); + uy = normal( rng(0), 0, uthe ); + uz = normal( rng(0), 0, uthe ); + d0 = gdre*uy + sqrt(ux*ux+uy*uy+uz*uz+1)*udre; + uy = d0*cs - uz*sn; + uz = d0*sn + uz*cs; + inject_particle( electron, x, y, z, ux, uy, uz, we, 0, 0 ); + } + + // Upon completion of the initialization, the following occurs: + // - The synchronization error (tang E, norm B) is computed between domains + // and tang E / norm B are synchronized by averaging where discrepancies + // are encountered. + // - The initial divergence error of the magnetic field is computed and + // one pass of cleaning is done (for good measure) + // - The bound charge density necessary to give the simulation an initially + // clean divergence e is computed. 
+ // - The particle momentum is uncentered from u_0 to u_{-1/2} + // - The user diagnostics are called on the initial state + // - The physics loop is started + // + // The physics loop consists of: + // - Advance particles from x_0,u_{-1/2} to x_1,u_{1/2} + // - User particle injection at x_{1-age}, u_{1/2} (use inject_particles) + // - User current injection (adjust field(x,y,z).jfx, jfy, jfz) + // - Advance B from B_0 to B_{1/2} + // - Advance E from E_0 to E_1 + // - User field injection to E_1 (adjust field(x,y,z).ex,ey,ez,cbx,cby,cbz) + // - Advance B from B_{1/2} to B_1 + // - (periodically) Divergence clean electric field + // - (periodically) Divergence clean magnetic field + // - (periodically) Synchronize shared tang e and norm b + // - Increment the time step + // - Call user diagnostics + // - (periodically) Print a status message +} + +begin_diagnostics { + +# define should_dump(x) (global->x##_interval>0 && remainder(step(),global->x##_interval)==0) + + if( step()==-10 ) { + // A grid dump contains all grid parameters, field boundary conditions, + // particle boundary conditions and domain connectivity information. This + // is stored in a binary format. Each rank makes a grid dump + dump_grid("grid"); + + // A materials dump contains all the materials parameters. This is in a + // text format. Only rank 0 makes the materials dump + dump_materials("materials"); + + // A species dump contains the physics parameters of a species. This is in + // a text format. Only rank 0 makes the species dump + dump_species("species"); + } + + // Energy dumps store all the energies in various directions of E and B + // and the total kinetic (not including rest mass) energies of each species + // species in a simple text format. By default, the energies are appended to + // the file. However, if a "0" is added to the dump_energies call, a new + // energies dump file will be created. 
The energies are in the units of the + // problem and are all time centered appropriately. Note: When restarting a + // simulation from a restart dump made at a prior time step to the last + // energies dump, the energies file will have a "hiccup" of intervening + // time levels. This "hiccup" will not occur if the simulation is aborted + // immediately following a restart dump. Energies dumps are in a text + // format and the layout is documented at the top of the file. Only rank 0 + // makes makes an energies dump. + if( should_dump(energies) ) dump_energies( "energies", step()==0 ? 0 : 1 ); + + // Field dumps store the raw electromagnetic fields, sources and material + // placement and a number of auxilliary fields. E, B and RHOB are + // timecentered, JF and TCA are half a step old. Material fields are static + // and the remaining fields (DIV E ERR, DIV B ERR and RHOF) are for + // debugging purposes. By default, field dump filenames are tagged with + // step(). However, if a "0" is added to the call, the filename will not be + // tagged. The JF that gets stored is accumulated with a charge-conserving + // algorithm. As a result, JF is not valid until at least one timestep has + // been completed. Field dumps are in a binary format. Each rank makes a + // field dump. + if( step()==-10 ) dump_fields_openpmd("fields"); // Get first valid total J + if( should_dump(fields) ) dump_fields_openpmd("fields"); + + // Hydro dumps store particle charge density, current density and + // stress-energy tensor. All these quantities are known at the time + // t = time(). All these quantities are accumulated trilinear + // node-centered. By default, species dump filenames are tagged with + // step(). However, if a "0" is added to the call, the filename will not + // be tagged. Note that the current density accumulated by this routine is + // purely diagnostic. It is not used by the simulation and it is not + // accumulated using a self-consistent charge-conserving method. 
Hydro dumps + // are in a binary format. Each rank makes a hydro dump. + //if(should_dump(ehydro) ) dump_hydro_hdf5("electron","ehydro"); + //if( should_dump(ihydro) ) dump_hydro_hdf5("ion", "ihydro"); + + // Particle dumps store the particle data for a given species. The data + // written is known at the time t = time(). By default, particle dumps + // are tagged with step(). However, if a "0" is added to the call, the + // filename will not be tagged. Particle dumps are in a binary format. + // Each rank makes a particle dump. + //if( should_dump(eparticle) ) dump_particles_hdf5("electron","eparticle"); + //if( should_dump(iparticle) ) dump_particles_hdf5("ion", "iparticle"); + + // A checkpt is made by calling checkpt( fbase, tag ) where fname is a string + // and tag is an integer. A typical usage is: + // checkpt( "checkpt", step() ). + // This will cause each process to write their simulation state to a file + // whose name is based on fbase, tag and the node's rank. For the above + // usage, if called on step 314 on a 4 process run, the four files: + // checkpt.314.0, checkpt.314.1, checkpt.314.2, checkpt.314.3 + // to be written. The simulation can then be restarted from this point by + // invoking the application with "--restore checkpt.314". checkpt must be + // the _VERY_ LAST_ diagnostic called. If not, diagnostics performed after + // the checkpt but before the next timestep will be missed on restore. + // Restart dumps are in a binary format unique to the each simulation. + + if( should_dump(restart) ) checkpt( "checkpt", step() ); + + // If you want to write a checkpt after a certain amount of simulation time, + // use uptime() in conjunction with checkpt. For example, this will cause + // the simulation state to be written after 7.5 hours of running to the + // same file every time (useful for dealing with quotas on big machines). 
+ //if( uptime()>=27000 ) { + // checkpt( "timeout", 0 ); + // abort(0); + //} + +# undef should_dump + +} + +begin_particle_injection { + + // No particle injection for this simulation + +} + +begin_current_injection { + + // No current injection for this simulation + +} + +begin_field_injection { + + // No field injection for this simulation + +} + +begin_particle_collisions{ + + // No collisions for this simulation + +} diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index a63a13a2..86749b5f 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -21,6 +21,11 @@ #include "hdf5_header_info.h" // from vpic #endif +#define VPIC_ENABLE_OPENPMD 1 +#ifdef VPIC_ENABLE_OPENPMD +#include +#endif + /* -1 means no ranks talk */ #define VERBOSE_rank -1 @@ -266,6 +271,17 @@ vpic_simulation::dump_hydro( const char *sp_name, if( fileIO.close() ) ERROR(( "File close failed on dump hydro!!!" )); } +#ifdef VPIC_ENABLE_OPENPMD +void vpic_simulation::dump_fields_openpmd( const char *fbase, int ftag ) +{ + openPMD::Series series = openPMD::Series( + "../samples/5_parallel_write.h5", + openPMD::AccessType::CREATE, + MPI_COMM_WORLD + ); +} +#endif + #ifdef VPIC_ENABLE_HDF5 #define DUMP_DIR_FORMAT "./%s" diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index ce1a2454..ead1b631 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -379,6 +379,9 @@ class vpic_simulation { void dump_particles( const char *sp_name, const char *fbase, int fname_tag = 1 ); + +//#ifdef // TODO: add ifdef + void dump_fields_openpmd( const char *fbase, int fname_tag = 1 ); #ifdef VPIC_ENABLE_HDF5 void dump_particles_hdf5( const char *sp_name, const char *fbase, int fname_tag = 1 ); From 9dc9c401cb4a421e99263801599affb0b3ba4104 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Wed, 30 Oct 2019 17:57:28 -0600 Subject: [PATCH 66/95] First pass where data dumps cbx --- CMakeLists.txt | 34 ++++++++-- sample/harrisOpenPMD | 5 +- sample/read_openpmd.py | 39 +++++++++++ src/grid/grid.h | 14 ++++ src/vpic/dump.cc | 147 
+++++++++++++++++++++++++++-------------- 5 files changed, 181 insertions(+), 58 deletions(-) create mode 100644 sample/read_openpmd.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 57f1ed13..2c912f52 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -343,17 +343,39 @@ target_include_directories(vpic INTERFACE ${CMAKE_SOURCE_DIR}/src) target_link_libraries(vpic ${VPIC_EXPOSE} ${MPI_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} ${CMAKE_DL_LIBS} ${HDF5_C_LIBRARIES}) target_compile_options(vpic ${VPIC_EXPOSE} ${MPI_C_COMPILE_FLAGS}) +# get absolute paths to linked libraries, and their transitive dependencies +function(openpmdreclibs tgtname outname) + get_target_property(PC_PRIVATE_LIBS_TGT ${tgtname} INTERFACE_LINK_LIBRARIES) + foreach(PC_LIB IN LISTS PC_PRIVATE_LIBS_TGT) + if(TARGET ${PC_LIB}) + openpmdreclibs(${PC_LIB} ${outname}) + else() + if(PC_LIB) + string(APPEND ${outname} " ${PC_LIB}") + endif() + endif() + endforeach() + set(${outname} ${${outname}} PARENT_SCOPE) +endfunction() + if(USE_OPENPMD) - # Enable OpenPMD, and the relevant defines - message("Using OpenPMD") - find_package(openPMD REQUIRED) + # Enable openPMD, and the relevant defines + find_package(openPMD REQUIRED CONFIG COMPONENTS MPI) + target_link_libraries(vpic PRIVATE openPMD::openPMD) + target_compile_definitions(vpic PRIVATE "-DVPIC_ENABLE_OPENPMD") + add_definitions(-DVPIC_ENABLE_OPENPMD) - link_libraries(openPMD::openPMD) + + # legacy stuff for 2-phase compile get_target_property(openPMD_LIBRARIES openPMD::openPMD LOCATION) - string(REPLACE ";" " " string_libraries ${openPMD_LIBRARIES}) + string(REPLACE ";" " " string_libraries "${openPMD_LIBRARIES}") set(VPIC_CXX_LIBRARIES "${VPIC_CXX_LIBRARIES} ${string_libraries}") + get_target_property(openPMD_TYPE openPMD::openPMD TYPE) + if("${openPMD_TYPE}" STREQUAL "STATIC_LIBRARY") + openpmdreclibs(openPMD openPMD_TRANSITIVE_LIBS) + set(VPIC_CXX_LIBRARIES "${VPIC_CXX_LIBRARIES} ${openPMD_TRANSITIVE_LIBS}") + endif() endif(USE_OPENPMD) 
-message(${VPIC_CXX_LIBRARIES}) macro(build_a_vpic name deck) if(NOT EXISTS ${deck}) diff --git a/sample/harrisOpenPMD b/sample/harrisOpenPMD index f8c932e9..a93965f8 100644 --- a/sample/harrisOpenPMD +++ b/sample/harrisOpenPMD @@ -367,8 +367,9 @@ begin_diagnostics { // algorithm. As a result, JF is not valid until at least one timestep has // been completed. Field dumps are in a binary format. Each rank makes a // field dump. - if( step()==-10 ) dump_fields_openpmd("fields"); // Get first valid total J - if( should_dump(fields) ) dump_fields_openpmd("fields"); + std::string openpm_field_name = "fields.h5"; + if( step()==-10 ) dump_fields_openpmd(openpm_field_name.c_str()); // Get first valid total J + if( should_dump(fields) ) dump_fields_openpmd(openpm_field_name.c_str()); // Hydro dumps store particle charge density, current density and // stress-energy tensor. All these quantities are known at the time diff --git a/sample/read_openpmd.py b/sample/read_openpmd.py new file mode 100644 index 00000000..84b19009 --- /dev/null +++ b/sample/read_openpmd.py @@ -0,0 +1,39 @@ + +import openpmd_api as api + +# example: data handling +import numpy as np + +file_name = "./fields.h5" +series = api.Series( file_name, api.Access_Type.read_only) + +print(list(series.iterations)) + +from pprint import pprint +#pprint(vars(series)) +#pprint(vars(series.iterations)) + +i = series.iterations[1]; + +print("openPMD version: ", + series.openPMD) + +# record +cB = i.meshes["B"] + +# record components +cbx = cB["x"] + +x_data = cbx.load_chunk() + +series.flush() + +extent = cbx.shape + +print( + "First values in E_x " + "of shape: ", + extent) + + +print(x_data) diff --git a/src/grid/grid.h b/src/grid/grid.h index 3167c7e6..b95a034e 100644 --- a/src/grid/grid.h +++ b/src/grid/grid.h @@ -138,6 +138,20 @@ typedef struct grid { #define VOXEL(x,y,z, nx,ny,nz) ((x) + ((nx)+2)*((y) + ((ny)+2)*(z))) +// TODO: make the asymmetry in how nx+2 is handled more obvious +#define UNVOXEL(rank, ix, iy, 
iz, nx, ny, nz) BEGIN_PRIMITIVE { \ + int _ix, _iy, _iz; \ + _ix = (rank); /* ix = ix+gpx*( iy+gpy*iz ) */ \ + _iy = _ix/int(nx); /* iy = iy+gpy*iz */ \ + _ix -= _iy*int(nx); /* ix = ix */ \ + _iz = _iy/int(ny); /* iz = iz */ \ + _iy -= _iz*int(ny); /* iy = iy */ \ + (ix) = _ix; \ + (iy) = _iy; \ + (iz) = _iz; \ + } END_PRIMITIVE + + // Advance the voxel mesh index (v) and corresponding voxel mesh // coordinates (x,y,z) in a region with min- and max-corners of // (xl,yl,zl) and (xh,yh,zh) of a (nx,ny,nz) resolution voxel mesh in diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 86749b5f..8fa2a4de 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -271,36 +271,103 @@ vpic_simulation::dump_hydro( const char *sp_name, if( fileIO.close() ) ERROR(( "File close failed on dump hydro!!!" )); } +// TODO: remove this hack +static openPMD::Series* series; + #ifdef VPIC_ENABLE_OPENPMD -void vpic_simulation::dump_fields_openpmd( const char *fbase, int ftag ) +void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) { - openPMD::Series series = openPMD::Series( - "../samples/5_parallel_write.h5", - openPMD::AccessType::CREATE, - MPI_COMM_WORLD - ); + + // TODO: recreating the series every time is probably not what we want? + std::cout << "Writing openPMD data" << std::endl; + + if (series == nullptr) { + std::cout << "init series" << std::endl; + series = new openPMD::Series( + fbase, + //"test_parallel_write.h5", + //"test_parallel_write.bp", + openPMD::AccessType::CREATE, + MPI_COMM_WORLD + ); + } + + std::cout << "Writing itration " << step() << std::endl; + auto i = series->iterations[ step() + 0 ]; + // TODO: it would be nice to set these... 
+ //series.setAuthor( "Axel Huebl "); + //series.setMachine( "Hall Probe 5000, Model 3"); + i.setAttribute( "vacuum", true); + + auto cB = i.meshes["B"]; + + // record components + auto cbx = cB["x"]; + //auto B_y = B["y"]; + //auto B_z = B["z"]; + + // TODO: set unitDimension so the anaylsis software knows what fields + // things are + + //auto dataset = api::Dataset( api::determineDatatype(), {150, 300}); + size_t gnx = (grid->nx * grid->gpx); + size_t gny = (grid->ny * grid->gpy); + size_t gnz = (grid->nz * grid->gpz); + openPMD::Extent global_extent = {gny, gny, gnz}; + + openPMD::Datatype datatype = openPMD::determineDatatype(); + openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); + + cbx.resetDataset(dataset); + //B_y.resetDataset(dataset); + //B_z.resetDataset(dataset); + + // Convert rank to local x/y/z + int rx, ry, rz; + UNVOXEL(rank(), rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + + size_t nx = grid->nx; + size_t ny = grid->ny; + size_t nz = grid->nz; + + // NOTE: this assumes a static mesh decomposition in nx/ny/nz + size_t global_offset_x = (nx) * rx; + size_t global_offset_y = (ny) * ry; + size_t global_offset_z = (nz) * rz; + + openPMD::Offset chunk_offset = {global_offset_x, global_offset_y, global_offset_z}; + openPMD::Extent chunk_extent = {nx, ny, nz}; + + // Store a local copy of the data which we pull out of the AoS + std::vector cbx_data; + cbx_data.reserve(nx * ny * nz); + + // We could do 1D here, but we don't really care about the ghosts, and we + // can thread over nz/ny (collapsed?) 
+ // Go over non-ghosts and grab just that data into a dense array + for (size_t k = 1; k < grid->nz + 1; k++) + { + for (size_t j = 1; j < grid->ny + 1; j++) + { + for (size_t i = 1; i < grid->nx + 1; i++) + { + int local_index = VOXEL(i-1, j-1, k-1, grid->nx-2, grid->ny-2, grid->nz-2); + int global_index = VOXEL(i, j, k, grid->nx, grid->ny, grid->nz); + cbx_data[local_index] = field_array->f[global_index].cbx; + } + } + } + + cbx.storeChunk( cbx_data, chunk_offset, chunk_extent); + //B_y.storeChunk( y_data, chunk_offset, chunk_extent); + //B_z.storeChunk( z_data, chunk_offset, chunk_extent); + series->flush(); } #endif #ifdef VPIC_ENABLE_HDF5 #define DUMP_DIR_FORMAT "./%s" -// TODO: rename or remove this -#define RANK_TO_INDEX2(rank, ix, iy, iz) \ - BEGIN_PRIMITIVE \ - { \ - int _ix, _iy, _iz; \ - _ix = (rank); /* ix = ix+gpx*( iy+gpy*iz ) */ \ - _iy = _ix / grid->gpx; /* iy = iy+gpy*iz */ \ - _ix -= _iy * grid->gpx; /* ix = ix */ \ - _iz = _iy / grid->gpy; /* iz = iz */ \ - _iy -= _iz * grid->gpy; /* iy = iy */ \ - (ix) = _ix; \ - (iy) = _iy; \ - (iz) = _iz; \ - } \ - END_PRIMITIVE - /* define to do C-style indexing */ #define hydro(x, y, z) hydro_array->h[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] @@ -453,22 +520,13 @@ printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ int gpy = grid->gpy; int gpz = grid->gpz; - int mpi_rank_x, mpi_rank_y, mpi_rank_z; - //RANK_TO_INDEX2(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); - - int _ix, _iy, _iz; - _ix = (mpi_rank); - _iy = _ix / grid->gpx; - _ix -= _iy * grid->gpx; - _iz = _iy / grid->gpy; - _iy -= _iz * grid->gpy; - int ix = _ix; - int iy = _iy; - int iz = _iz; - - mpi_rank_x = ix; - mpi_rank_y = iy; - mpi_rank_z = iz; + // Convert rank to local decomposition + int rx, ry, rz; + UNVOXEL(mpi_rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + + mpi_rank_x = rx; + mpi_rank_y = ry; + mpi_rank_z = rz; global_offset[0] = (grid->nx) * mpi_rank_x; global_offset[1] = (grid->ny) * mpi_rank_y; @@ -1028,18 +1086,6 @@ 
vpic_simulation::dump_particles_hdf5( const char *sp_name, #define OUTPUT_CONVERT_GLOBAL_ID 1 #ifdef OUTPUT_CONVERT_GLOBAL_ID -# define UNVOXEL(rank, ix, iy, iz, nx, ny, nz) BEGIN_PRIMITIVE { \ - int _ix, _iy, _iz; \ - _ix = (rank); /* ix = ix+gpx*( iy+gpy*iz ) */ \ - _iy = _ix/int(nx); /* iy = iy+gpy*iz */ \ - _ix -= _iy*int(nx); /* ix = ix */ \ - _iz = _iy/int(ny); /* iz = iz */ \ - _iy -= _iz*int(ny); /* iy = iy */ \ - (ix) = _ix; \ - (iy) = _iy; \ - (iz) = _iz; \ - } END_PRIMITIVE - std::vector global_pi; global_pi.reserve(numparticles); // TODO: this could be parallel @@ -1048,6 +1094,7 @@ vpic_simulation::dump_particles_hdf5( const char *sp_name, int local_i = sp->p[i].i; int ix, iy, iz, rx, ry, rz; + // Convert rank to local x/y/z UNVOXEL(rank(), rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); From 0662a19ed71ef909516414c4624af13d572b44d4 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Mon, 4 Nov 2019 11:09:18 -0700 Subject: [PATCH 67/95] data verified using python --- sample/harrisOpenPMD | 2 ++ src/vpic/dump.cc | 19 +++++++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/sample/harrisOpenPMD b/sample/harrisOpenPMD index a93965f8..b60b6fdd 100644 --- a/sample/harrisOpenPMD +++ b/sample/harrisOpenPMD @@ -367,7 +367,9 @@ begin_diagnostics { // algorithm. As a result, JF is not valid until at least one timestep has // been completed. Field dumps are in a binary format. Each rank makes a // field dump. 
+ std::string openpm_field_name = "fields.h5"; + //std::string openpm_field_name = "fields.bp"; if( step()==-10 ) dump_fields_openpmd(openpm_field_name.c_str()); // Get first valid total J if( should_dump(fields) ) dump_fields_openpmd(openpm_field_name.c_str()); diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 8fa2a4de..6671496e 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -303,8 +303,8 @@ void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) // record components auto cbx = cB["x"]; - //auto B_y = B["y"]; - //auto B_z = B["z"]; + auto cby = cB["y"]; + auto cbz = cB["z"]; // TODO: set unitDimension so the anaylsis software knows what fields // things are @@ -319,6 +319,8 @@ void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); cbx.resetDataset(dataset); + cby.resetDataset(dataset); + cbz.resetDataset(dataset); //B_y.resetDataset(dataset); //B_z.resetDataset(dataset); @@ -340,7 +342,12 @@ void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) // Store a local copy of the data which we pull out of the AoS std::vector cbx_data; + std::vector cby_data; + std::vector cbz_data; + cbx_data.reserve(nx * ny * nz); + cby_data.reserve(nx * ny * nz); + cbz_data.reserve(nx * ny * nz); // We could do 1D here, but we don't really care about the ghosts, and we // can thread over nz/ny (collapsed?) 
@@ -353,14 +360,18 @@ void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) { int local_index = VOXEL(i-1, j-1, k-1, grid->nx-2, grid->ny-2, grid->nz-2); int global_index = VOXEL(i, j, k, grid->nx, grid->ny, grid->nz); + cbx_data[local_index] = field_array->f[global_index].cbx; + cby_data[local_index] = field_array->f[global_index].cby; + cbz_data[local_index] = field_array->f[global_index].cbz; } } } cbx.storeChunk( cbx_data, chunk_offset, chunk_extent); - //B_y.storeChunk( y_data, chunk_offset, chunk_extent); - //B_z.storeChunk( z_data, chunk_offset, chunk_extent); + cby.storeChunk( cby_data, chunk_offset, chunk_extent); + cbz.storeChunk( cbz_data, chunk_offset, chunk_extent); + series->flush(); } #endif From 6b8416372db93081ceae03ac03ddf8b7b49a44ac Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Mon, 4 Nov 2019 11:27:38 -0700 Subject: [PATCH 68/95] quick tidy up and add j and e --- src/vpic/dump.cc | 78 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 62 insertions(+), 16 deletions(-) diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 6671496e..fbbd31d3 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -271,25 +271,22 @@ vpic_simulation::dump_hydro( const char *sp_name, if( fileIO.close() ) ERROR(( "File close failed on dump hydro!!!" )); } -// TODO: remove this hack +#ifdef VPIC_ENABLE_OPENPMD + +// TODO: remove this hack, and actually store the state properly static openPMD::Series* series; -#ifdef VPIC_ENABLE_OPENPMD void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) { - - // TODO: recreating the series every time is probably not what we want? 
std::cout << "Writing openPMD data" << std::endl; if (series == nullptr) { std::cout << "init series" << std::endl; series = new openPMD::Series( - fbase, - //"test_parallel_write.h5", - //"test_parallel_write.bp", - openPMD::AccessType::CREATE, - MPI_COMM_WORLD - ); + fbase, + openPMD::AccessType::CREATE, + MPI_COMM_WORLD + ); } std::cout << "Writing itration " << step() << std::endl; @@ -300,16 +297,25 @@ void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) i.setAttribute( "vacuum", true); auto cB = i.meshes["B"]; + auto E = i.meshes["E"]; + auto J = i.meshes["J"]; // record components auto cbx = cB["x"]; auto cby = cB["y"]; auto cbz = cB["z"]; + auto Ex = E["x"]; + auto Ey = E["y"]; + auto Ez = E["z"]; + + auto Jx = J["x"]; + auto Jy = J["y"]; + auto Jz = J["z"]; + // TODO: set unitDimension so the anaylsis software knows what fields // things are - //auto dataset = api::Dataset( api::determineDatatype(), {150, 300}); size_t gnx = (grid->nx * grid->gpx); size_t gny = (grid->ny * grid->gpy); size_t gnz = (grid->nz * grid->gpz); @@ -321,8 +327,14 @@ void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) cbx.resetDataset(dataset); cby.resetDataset(dataset); cbz.resetDataset(dataset); - //B_y.resetDataset(dataset); - //B_z.resetDataset(dataset); + + Ex.resetDataset(dataset); + Ey.resetDataset(dataset); + Ez.resetDataset(dataset); + + Jx.resetDataset(dataset); + Jy.resetDataset(dataset); + Jz.resetDataset(dataset); // Convert rank to local x/y/z int rx, ry, rz; @@ -345,9 +357,27 @@ void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) std::vector cby_data; std::vector cbz_data; - cbx_data.reserve(nx * ny * nz); - cby_data.reserve(nx * ny * nz); - cbz_data.reserve(nx * ny * nz); + std::vector ex_data; + std::vector ey_data; + std::vector ez_data; + + std::vector jx_data; + std::vector jy_data; + std::vector jz_data; + + size_t nv = nx * ny * nz; + + cbx_data.reserve(nv); + cby_data.reserve(nv); + cbz_data.reserve(nv); 
+ + ex_data.reserve(nv); + ey_data.reserve(nv); + ez_data.reserve(nv); + + jx_data.reserve(nv); + jy_data.reserve(nv); + jz_data.reserve(nv); // We could do 1D here, but we don't really care about the ghosts, and we // can thread over nz/ny (collapsed?) @@ -364,6 +394,14 @@ void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) cbx_data[local_index] = field_array->f[global_index].cbx; cby_data[local_index] = field_array->f[global_index].cby; cbz_data[local_index] = field_array->f[global_index].cbz; + + ex_data[local_index] = field_array->f[global_index].ex; + ey_data[local_index] = field_array->f[global_index].ey; + ez_data[local_index] = field_array->f[global_index].ez; + + jx_data[local_index] = field_array->f[global_index].jfx; + jy_data[local_index] = field_array->f[global_index].jfy; + jz_data[local_index] = field_array->f[global_index].jfz; } } } @@ -372,6 +410,14 @@ void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) cby.storeChunk( cby_data, chunk_offset, chunk_extent); cbz.storeChunk( cbz_data, chunk_offset, chunk_extent); + Ex.storeChunk( ex_data, chunk_offset, chunk_extent); + Ey.storeChunk( ey_data, chunk_offset, chunk_extent); + Ez.storeChunk( ez_data, chunk_offset, chunk_extent); + + Jx.storeChunk( jx_data, chunk_offset, chunk_extent); + Jy.storeChunk( jy_data, chunk_offset, chunk_extent); + Jz.storeChunk( jz_data, chunk_offset, chunk_extent); + series->flush(); } #endif From 572ee50f5670540c3075c8fe7d731627bbddd510 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 5 Nov 2019 11:01:52 -0700 Subject: [PATCH 69/95] add particle dumping via openpmd (close but not passing openpmd file validator becasue of missing timeOffset) --- sample/harrisOpenPMD | 2 +- src/vpic/dump.cc | 119 ++++++++++++++++++++++++++++++++++++++++++- src/vpic/vpic.h | 9 +++- 3 files changed, 126 insertions(+), 4 deletions(-) diff --git a/sample/harrisOpenPMD b/sample/harrisOpenPMD index b60b6fdd..16abb432 100644 --- a/sample/harrisOpenPMD +++ 
b/sample/harrisOpenPMD @@ -390,7 +390,7 @@ begin_diagnostics { // are tagged with step(). However, if a "0" is added to the call, the // filename will not be tagged. Particle dumps are in a binary format. // Each rank makes a particle dump. - //if( should_dump(eparticle) ) dump_particles_hdf5("electron","eparticle"); + if( should_dump(eparticle) ) dump_particles_openpmd("electron","eparticle"); //if( should_dump(iparticle) ) dump_particles_hdf5("ion", "iparticle"); // A checkpt is made by calling checkpt( fbase, tag ) where fname is a string diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index fbbd31d3..45ebf853 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -21,7 +21,6 @@ #include "hdf5_header_info.h" // from vpic #endif -#define VPIC_ENABLE_OPENPMD 1 #ifdef VPIC_ENABLE_OPENPMD #include #endif @@ -45,6 +44,39 @@ int vpic_simulation::dump_cwd(char * dname, size_t size) { return FileUtils::getCurrentWorkingDirectory(dname, size); } // dump_mkdir + +// TODO: move this somewhere more sensible +std::array global_particle_index(int local_i, grid_t* grid, int rank) +{ + int ix, iy, iz, rx, ry, rz; + // Convert rank to local x/y/z + UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + + // Calculate local ix/iy/iz + UNVOXEL(local_i, ix, iy, iz, grid->nx+2, grid->ny+2, grid->nz+2); + + // Account for the "first" ghost cell + ix = ix - 1; + iy = iy - 1; + iz = iz - 1; + + // Convert ix/iy/iz to global + int gix = ix + (grid->nx * (rx)); + int giy = iy + (grid->ny * (ry)); + int giz = iz + (grid->nz * (rz)); + + // calculate global grid sizes + int gnx = grid->nx * grid->gpx; + int gny = grid->ny * grid->gpy; + int gnz = grid->nz * grid->gpz; + + // TODO: find a better way to account for the hard coded ghosts in VOXEL + int global_i = VOXEL(gix, giy, giz, gnx-2, gny-2, gnz-2); + + return { global_i, gix, giy, giz }; +} + + /***************************************************************************** * ASCII dump IO 
*****************************************************************************/ @@ -276,6 +308,86 @@ vpic_simulation::dump_hydro( const char *sp_name, // TODO: remove this hack, and actually store the state properly static openPMD::Series* series; +void +vpic_simulation::dump_particles_openpmd( const char *sp_name, + const char *fbase, + int ftag ) +{ + + species_t *sp = find_species_name( sp_name, species_list ); + + if (series == nullptr) { + std::cout << "init series" << std::endl; + series = new openPMD::Series( + fbase, + openPMD::AccessType::CREATE, + MPI_COMM_WORLD + ); + } + + auto i = series->iterations[ step() ]; + + // TODO: set these + i.setTime( (float)step() ); + i.setDt(1.0); + i.setTimeUnitSI(1.0); + + auto& p = i.particles[sp_name]; + //openPMD::ParticleSpecies& p = i.particles[sp_name]; + + const int np = sp->np; + + // TODO: this could be a function call as it's used elsewhere (in hdf5) + unsigned long long total_particles, offset; + unsigned long long numparticles = np; + MPI_Allreduce(&numparticles, &total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + MPI_Scan(&numparticles, &offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + offset -= numparticles; + + openPMD::Extent global_extent = {total_particles}; + openPMD::Datatype datatype = openPMD::determineDatatype(); + openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); + + auto px = p["position"]["x"]; + auto pxo = p["positionOffset"]["x"]; + + px.resetDataset(dataset); + pxo.resetDataset(dataset); + + // convert data to SoA, allowing the user to chunk the operation + const int max_chunk = 32768*8; // 1MB SoA + // Loop over all particles in chunks + for (int i = 0; i < np; i += max_chunk) + { + // We have to be careful as the last chunk may not be full + // Find how many are left and do that many + size_t to_write = std::min(np-i, max_chunk); + + // Convert the chunk ready to write + std::vector x_pos; + std::vector x_off; + x_pos.reserve(to_write); + 
x_off.reserve(to_write); + + for (int j = 0; j < to_write; j++) + { + // TODO: do I need to center the particles? + auto& particle = sp->p[i+j]; + x_pos[j] = particle.dx; + std::array gi = global_particle_index(particle.i, grid, rank()); + x_off[j] = (float)gi[1]; + } + + // Base offset plus i to account for chunks + auto o = openPMD::Offset{offset + i}; + auto e = openPMD::Extent{to_write}; + px.storeChunk(x_pos, o, e); + pxo.storeChunk(x_off, o, e); + } + + +} + void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) { std::cout << "Writing openPMD data" << std::endl; @@ -290,7 +402,7 @@ void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) } std::cout << "Writing itration " << step() << std::endl; - auto i = series->iterations[ step() + 0 ]; + auto i = series->iterations[ step() ]; // TODO: it would be nice to set these... //series.setAuthor( "Axel Huebl "); //series.setMachine( "Hall Probe 5000, Model 3"); @@ -379,6 +491,8 @@ void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) jy_data.reserve(nv); jz_data.reserve(nv); + // TODO: make this AoS to SoA conversion a function + // We could do 1D here, but we don't really care about the ghosts, and we // can thread over nz/ny (collapsed?) 
// Go over non-ghosts and grab just that data into a dense array @@ -1143,6 +1257,7 @@ vpic_simulation::dump_particles_hdf5( const char *sp_name, #define OUTPUT_CONVERT_GLOBAL_ID 1 #ifdef OUTPUT_CONVERT_GLOBAL_ID + // TODO: make a function out of this too, its used in openpmd std::vector global_pi; global_pi.reserve(numparticles); // TODO: this could be parallel diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index ead1b631..6b657a16 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -380,8 +380,15 @@ class vpic_simulation { void dump_particles( const char *sp_name, const char *fbase, int fname_tag = 1 ); -//#ifdef // TODO: add ifdef +#ifdef VPIC_ENABLE_OPENPMD void dump_fields_openpmd( const char *fbase, int fname_tag = 1 ); + void dump_particles_openpmd( + const char *sp_name, + const char *fbase, + int ftag = 1 + ); +#endif + #ifdef VPIC_ENABLE_HDF5 void dump_particles_hdf5( const char *sp_name, const char *fbase, int fname_tag = 1 ); From f197e3ea35694f22e389cefdee1bb80d36e60f32 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 5 Nov 2019 19:01:22 -0700 Subject: [PATCH 70/95] start refactoring dump into seperate class to better support user selection --- src/vpic/dump.cc | 1393 ++------------------------------------ src/vpic/dump.h | 46 ++ src/vpic/dump_strategy.h | 1365 +++++++++++++++++++++++++++++++++++++ src/vpic/dumpmacros.h | 197 +++--- src/vpic/vpic.cc | 8 +- src/vpic/vpic.h | 25 +- 6 files changed, 1570 insertions(+), 1464 deletions(-) create mode 100644 src/vpic/dump.h create mode 100644 src/vpic/dump_strategy.h diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 45ebf853..35d7441d 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -12,19 +12,10 @@ #include #include -#include "vpic.h" #include "dumpmacros.h" +#include "vpic.h" #include "../util/io/FileUtils.h" -#ifdef VPIC_ENABLE_HDF5 -#include "hdf5.h" // from the lib -#include "hdf5_header_info.h" // from vpic -#endif - -#ifdef VPIC_ENABLE_OPENPMD -#include -#endif - /* -1 means no 
ranks talk */ #define VERBOSE_rank -1 @@ -44,42 +35,32 @@ int vpic_simulation::dump_cwd(char * dname, size_t size) { return FileUtils::getCurrentWorkingDirectory(dname, size); } // dump_mkdir +/***************************************************************************** + * ASCII dump IO + *****************************************************************************/ -// TODO: move this somewhere more sensible -std::array global_particle_index(int local_i, grid_t* grid, int rank) +void vpic_simulation::dump_particles( const char *sp_name, + const char *fbase, + int ftag ) { - int ix, iy, iz, rx, ry, rz; - // Convert rank to local x/y/z - UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); - - // Calculate local ix/iy/iz - UNVOXEL(local_i, ix, iy, iz, grid->nx+2, grid->ny+2, grid->nz+2); - - // Account for the "first" ghost cell - ix = ix - 1; - iy = iy - 1; - iz = iz - 1; - - // Convert ix/iy/iz to global - int gix = ix + (grid->nx * (rx)); - int giy = iy + (grid->ny * (ry)); - int giz = iz + (grid->nz * (rz)); - - // calculate global grid sizes - int gnx = grid->nx * grid->gpx; - int gny = grid->ny * grid->gpy; - int gnz = grid->nz * grid->gpz; - - // TODO: find a better way to account for the hard coded ghosts in VOXEL - int global_i = VOXEL(gix, giy, giz, gnx-2, gny-2, gnz-2); - - return { global_i, gix, giy, giz }; + species_t * sp = find_species_name(sp_name, species_list); + dump_strategy.dump_particles( + fbase, + sp, + grid, + step(), + interpolator_array, + ftag + ); } +void dump_fields( const char *fbase, int fname_tag = 1 ) +{ +} -/***************************************************************************** - * ASCII dump IO - *****************************************************************************/ +void dump_hydro( const char *sp_name, const char *fbase, int fname_tag = 1 ) +{ +} void vpic_simulation::dump_energies( const char *fname, @@ -162,26 +143,6 @@ vpic_simulation::dump_materials( const char *fname ) { * Binary dump IO 
*****************************************************************************/ -/* -enum dump_types { - grid_dump = 0, - field_dump = 1, - hydro_dump = 2, - particle_dump = 3, - restart_dump = 4 -}; -*/ - -// TODO: should this be an enum? -namespace dump_type { - const int grid_dump = 0; - const int field_dump = 1; - const int hydro_dump = 2; - const int particle_dump = 3; - const int restart_dump = 4; - const int history_dump = 5; -} // namespace - void vpic_simulation::dump_grid( const char *fbase ) { char fname[256]; @@ -196,14 +157,14 @@ vpic_simulation::dump_grid( const char *fbase ) { if( status==fail ) ERROR(( "Could not open \"%s\".", fname )); /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ - nxout = grid->nx; - nyout = grid->ny; - nzout = grid->nz; - dxout = grid->dx; - dyout = grid->dy; - dzout = grid->dz; + size_t nxout = grid->nx; + size_t nyout = grid->ny; + size_t nzout = grid->nz; + float dxout = grid->dx; + float dyout = grid->dy; + float dzout = grid->dz; - WRITE_HEADER_V0( dump_type::grid_dump, -1, 0, fileIO ); + WRITE_HEADER_V0( dump_type::grid_dump, -1, 0, fileIO, step(), rank(), nproc()); dim[0] = 3; dim[1] = 3; @@ -225,1268 +186,6 @@ vpic_simulation::dump_grid( const char *fbase ) { if( fileIO.close() ) ERROR(( "File close failed on dump grid!!!" 
)); } -void -vpic_simulation::dump_fields( const char *fbase, int ftag ) { - char fname[256]; - FileIO fileIO; - int dim[3]; - - if( !fbase ) ERROR(( "Invalid filename" )); - - if( rank()==0 ) MESSAGE(( "Dumping fields to \"%s\"", fbase )); - - if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step(), rank() ); - else sprintf( fname, "%s.%i", fbase, rank() ); - - FileIOStatus status = fileIO.open(fname, io_write); - if( status==fail ) ERROR(( "Could not open \"%s\".", fname )); - - /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ - nxout = grid->nx; - nyout = grid->ny; - nzout = grid->nz; - dxout = grid->dx; - dyout = grid->dy; - dzout = grid->dz; - - WRITE_HEADER_V0( dump_type::field_dump, -1, 0, fileIO ); - - dim[0] = grid->nx+2; - dim[1] = grid->ny+2; - dim[2] = grid->nz+2; - WRITE_ARRAY_HEADER( field_array->f, 3, dim, fileIO ); - fileIO.write( field_array->f, dim[0]*dim[1]*dim[2] ); - if( fileIO.close() ) ERROR(( "File close failed on dump fields!!!" )); -} - -void -vpic_simulation::dump_hydro( const char *sp_name, - const char *fbase, - int ftag ) { - species_t *sp; - char fname[256]; - FileIO fileIO; - int dim[3]; - - sp = find_species_name( sp_name, species_list ); - if( !sp ) ERROR(( "Invalid species \"%s\"", sp_name )); - - clear_hydro_array( hydro_array ); - accumulate_hydro_p( hydro_array, sp, interpolator_array ); - synchronize_hydro_array( hydro_array ); - - if( !fbase ) ERROR(( "Invalid filename" )); - - if( rank()==0 ) - MESSAGE(("Dumping \"%s\" hydro fields to \"%s\"",sp->name,fbase)); - - if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step(), rank() ); - else sprintf( fname, "%s.%i", fbase, rank() ); - FileIOStatus status = fileIO.open(fname, io_write); - if( status==fail) ERROR(( "Could not open \"%s\".", fname )); - - /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ - nxout = grid->nx; - nyout = grid->ny; - nzout = grid->nz; - dxout = grid->dx; - dyout = grid->dy; - dzout = grid->dz; - - WRITE_HEADER_V0( 
dump_type::hydro_dump,sp->id,sp->q/sp->m,fileIO); - - dim[0] = grid->nx+2; - dim[1] = grid->ny+2; - dim[2] = grid->nz+2; - WRITE_ARRAY_HEADER( hydro_array->h, 3, dim, fileIO ); - fileIO.write( hydro_array->h, dim[0]*dim[1]*dim[2] ); - if( fileIO.close() ) ERROR(( "File close failed on dump hydro!!!" )); -} - -#ifdef VPIC_ENABLE_OPENPMD - -// TODO: remove this hack, and actually store the state properly -static openPMD::Series* series; - -void -vpic_simulation::dump_particles_openpmd( const char *sp_name, - const char *fbase, - int ftag ) -{ - - species_t *sp = find_species_name( sp_name, species_list ); - - if (series == nullptr) { - std::cout << "init series" << std::endl; - series = new openPMD::Series( - fbase, - openPMD::AccessType::CREATE, - MPI_COMM_WORLD - ); - } - - auto i = series->iterations[ step() ]; - - // TODO: set these - i.setTime( (float)step() ); - i.setDt(1.0); - i.setTimeUnitSI(1.0); - - auto& p = i.particles[sp_name]; - //openPMD::ParticleSpecies& p = i.particles[sp_name]; - - const int np = sp->np; - - // TODO: this could be a function call as it's used elsewhere (in hdf5) - unsigned long long total_particles, offset; - unsigned long long numparticles = np; - MPI_Allreduce(&numparticles, &total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - MPI_Scan(&numparticles, &offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - offset -= numparticles; - - openPMD::Extent global_extent = {total_particles}; - openPMD::Datatype datatype = openPMD::determineDatatype(); - openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); - - auto px = p["position"]["x"]; - auto pxo = p["positionOffset"]["x"]; - - px.resetDataset(dataset); - pxo.resetDataset(dataset); - - // convert data to SoA, allowing the user to chunk the operation - const int max_chunk = 32768*8; // 1MB SoA - // Loop over all particles in chunks - for (int i = 0; i < np; i += max_chunk) - { - // We have to be careful as the last chunk may not be full - // Find how many are 
left and do that many - size_t to_write = std::min(np-i, max_chunk); - - // Convert the chunk ready to write - std::vector x_pos; - std::vector x_off; - x_pos.reserve(to_write); - x_off.reserve(to_write); - - for (int j = 0; j < to_write; j++) - { - // TODO: do I need to center the particles? - auto& particle = sp->p[i+j]; - x_pos[j] = particle.dx; - std::array gi = global_particle_index(particle.i, grid, rank()); - x_off[j] = (float)gi[1]; - } - - // Base offset plus i to account for chunks - auto o = openPMD::Offset{offset + i}; - auto e = openPMD::Extent{to_write}; - px.storeChunk(x_pos, o, e); - pxo.storeChunk(x_off, o, e); - } - - -} - -void vpic_simulation::dump_fields_openpmd(const char *fbase, int ftag) -{ - std::cout << "Writing openPMD data" << std::endl; - - if (series == nullptr) { - std::cout << "init series" << std::endl; - series = new openPMD::Series( - fbase, - openPMD::AccessType::CREATE, - MPI_COMM_WORLD - ); - } - - std::cout << "Writing itration " << step() << std::endl; - auto i = series->iterations[ step() ]; - // TODO: it would be nice to set these... 
- //series.setAuthor( "Axel Huebl "); - //series.setMachine( "Hall Probe 5000, Model 3"); - i.setAttribute( "vacuum", true); - - auto cB = i.meshes["B"]; - auto E = i.meshes["E"]; - auto J = i.meshes["J"]; - - // record components - auto cbx = cB["x"]; - auto cby = cB["y"]; - auto cbz = cB["z"]; - - auto Ex = E["x"]; - auto Ey = E["y"]; - auto Ez = E["z"]; - - auto Jx = J["x"]; - auto Jy = J["y"]; - auto Jz = J["z"]; - - // TODO: set unitDimension so the anaylsis software knows what fields - // things are - - size_t gnx = (grid->nx * grid->gpx); - size_t gny = (grid->ny * grid->gpy); - size_t gnz = (grid->nz * grid->gpz); - openPMD::Extent global_extent = {gny, gny, gnz}; - - openPMD::Datatype datatype = openPMD::determineDatatype(); - openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); - - cbx.resetDataset(dataset); - cby.resetDataset(dataset); - cbz.resetDataset(dataset); - - Ex.resetDataset(dataset); - Ey.resetDataset(dataset); - Ez.resetDataset(dataset); - - Jx.resetDataset(dataset); - Jy.resetDataset(dataset); - Jz.resetDataset(dataset); - - // Convert rank to local x/y/z - int rx, ry, rz; - UNVOXEL(rank(), rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); - - size_t nx = grid->nx; - size_t ny = grid->ny; - size_t nz = grid->nz; - - // NOTE: this assumes a static mesh decomposition in nx/ny/nz - size_t global_offset_x = (nx) * rx; - size_t global_offset_y = (ny) * ry; - size_t global_offset_z = (nz) * rz; - - openPMD::Offset chunk_offset = {global_offset_x, global_offset_y, global_offset_z}; - openPMD::Extent chunk_extent = {nx, ny, nz}; - - // Store a local copy of the data which we pull out of the AoS - std::vector cbx_data; - std::vector cby_data; - std::vector cbz_data; - - std::vector ex_data; - std::vector ey_data; - std::vector ez_data; - - std::vector jx_data; - std::vector jy_data; - std::vector jz_data; - - size_t nv = nx * ny * nz; - - cbx_data.reserve(nv); - cby_data.reserve(nv); - cbz_data.reserve(nv); - - ex_data.reserve(nv); - 
ey_data.reserve(nv); - ez_data.reserve(nv); - - jx_data.reserve(nv); - jy_data.reserve(nv); - jz_data.reserve(nv); - - // TODO: make this AoS to SoA conversion a function - - // We could do 1D here, but we don't really care about the ghosts, and we - // can thread over nz/ny (collapsed?) - // Go over non-ghosts and grab just that data into a dense array - for (size_t k = 1; k < grid->nz + 1; k++) - { - for (size_t j = 1; j < grid->ny + 1; j++) - { - for (size_t i = 1; i < grid->nx + 1; i++) - { - int local_index = VOXEL(i-1, j-1, k-1, grid->nx-2, grid->ny-2, grid->nz-2); - int global_index = VOXEL(i, j, k, grid->nx, grid->ny, grid->nz); - - cbx_data[local_index] = field_array->f[global_index].cbx; - cby_data[local_index] = field_array->f[global_index].cby; - cbz_data[local_index] = field_array->f[global_index].cbz; - - ex_data[local_index] = field_array->f[global_index].ex; - ey_data[local_index] = field_array->f[global_index].ey; - ez_data[local_index] = field_array->f[global_index].ez; - - jx_data[local_index] = field_array->f[global_index].jfx; - jy_data[local_index] = field_array->f[global_index].jfy; - jz_data[local_index] = field_array->f[global_index].jfz; - } - } - } - - cbx.storeChunk( cbx_data, chunk_offset, chunk_extent); - cby.storeChunk( cby_data, chunk_offset, chunk_extent); - cbz.storeChunk( cbz_data, chunk_offset, chunk_extent); - - Ex.storeChunk( ex_data, chunk_offset, chunk_extent); - Ey.storeChunk( ey_data, chunk_offset, chunk_extent); - Ez.storeChunk( ez_data, chunk_offset, chunk_extent); - - Jx.storeChunk( jx_data, chunk_offset, chunk_extent); - Jy.storeChunk( jy_data, chunk_offset, chunk_extent); - Jz.storeChunk( jz_data, chunk_offset, chunk_extent); - - series->flush(); -} -#endif - -#ifdef VPIC_ENABLE_HDF5 -#define DUMP_DIR_FORMAT "./%s" - -/* define to do C-style indexing */ -#define hydro(x, y, z) hydro_array->h[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] - -// TODO: make function? 
-#define invert_field_xml_item(xml_file_name, speciesname_p, time_step, dims_4d, dims_3d, add_footer_flag) \ - { \ - FILE *fp; \ - fp = fopen(xml_file_name, "a"); \ - fprintf(fp, main_body_head, time_step); \ - if (field_dump_flag.enabledE()) \ - write_main_body_attribute(fp, main_body_attributeV, "E", dims_4d, dims_3d, speciesname_p, time_step, "ex", "ey", "ez"); \ - if (field_dump_flag.div_e_err) \ - fprintf(fp, main_body_attributeS, "div_e_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_e_err"); \ - if (field_dump_flag.enabledCB()) \ - write_main_body_attribute(fp, main_body_attributeV, "B", dims_4d, dims_3d, speciesname_p, time_step, "cbx", "cby", "cbz"); \ - if (field_dump_flag.div_b_err) \ - fprintf(fp, main_body_attributeS, "div_b_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_b_err"); \ - if (field_dump_flag.enabledTCA()) \ - write_main_body_attribute(fp, main_body_attributeV, "TCA", dims_4d, dims_3d, speciesname_p, time_step, "tcax", "tcay", "tcaz"); \ - if (field_dump_flag.rhob) \ - fprintf(fp, main_body_attributeS, "rhob", dims_3d, time_step, speciesname_p, time_step, time_step, "rhob"); \ - if (field_dump_flag.enabledJF()) \ - write_main_body_attribute(fp, main_body_attributeV, "JF", dims_4d, dims_3d, speciesname_p, time_step, "jfx", "jfy", "jfz"); \ - if (field_dump_flag.rhof) \ - fprintf(fp, main_body_attributeS, "rhof", dims_3d, time_step, speciesname_p, time_step, time_step, "rhof"); \ - if (field_dump_flag.enabledEMAT()) \ - write_main_body_attribute(fp, main_body_attributeV, "EMAT", dims_4d, dims_3d, speciesname_p, time_step, "ematx", "ematy", "ematz"); \ - if (field_dump_flag.nmat) \ - fprintf(fp, main_body_attributeS, "nmat", dims_3d, time_step, speciesname_p, time_step, time_step, "nmat"); \ - if (field_dump_flag.enabledFMAT()) \ - write_main_body_attribute(fp, main_body_attributeV, "FMAT", dims_4d, dims_3d, speciesname_p, time_step, "fmatx", "fmaty", "fmatz"); \ - if (field_dump_flag.cmat) \ - 
fprintf(fp, main_body_attributeS, "cmat", dims_3d, time_step, speciesname_p, time_step, time_step, "cmat"); \ - fprintf(fp, "%s", main_body_foot); \ - if (add_footer_flag) \ - fputs(footer, fp); \ - fclose(fp); \ - } -void -vpic_simulation::dump_fields_hdf5( const char *fbase, int ftag ) -{ - size_t step_for_viou = step(); - - int mpi_size, mpi_rank; - MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - - -#ifdef DUMP_INFO_DEBUG - printf("MPI rank = %d, size = %d \n", mpi_rank, mpi_size); - //printf("base dir for field: %s \n", fdParams.baseDir); - //printf("stride x y z = (%ld, %ld, %ld)\n", fdParams.stride_x, fdParams.stride_y, fdParams.stride_z); - printf("grid x, y z = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); - printf("domain loc (x0, y0, z0) -> (x1, y1, z1) = (%f, %f, %f) -> (%f, %f, %f) \n", grid->x0, grid->y0, grid->z0, grid->x1, grid->y1, grid->z1); - //printf("global->topology_x, y, z = %f, %f, %f \n ", global->topology_x, global->topology_y, global->topology_z); - printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid->sz, grid->nv); -#endif - -#define DUMP_FIELD_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ - { \ - dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ - temp_buf_index = 0; \ - for (size_t i(1); i < grid->nx + 1; i++) \ - { \ - for (size_t j(1); j < grid->ny + 1; j++) \ - { \ - for (size_t k(1); k < grid->nz + 1; k++) \ - { \ - temp_buf[temp_buf_index] = FIELD_ARRAY_NAME->fpp(i, j, k).ATTRIBUTE_NAME; \ - temp_buf_index = temp_buf_index + 1; \ - } \ - } \ - } \ - dataspace_id = H5Dget_space(dset_id); \ - H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); \ - H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, temp_buf); \ - H5Sclose(dataspace_id); \ - H5Dclose(dset_id); \ - } - - char fname[256]; - char field_scratch[128]; - char subfield_scratch[128]; - - 
sprintf(field_scratch, DUMP_DIR_FORMAT, "field_hdf5"); - dump_mkdir(field_scratch); - sprintf(subfield_scratch, "%s/T.%zu/", field_scratch, step_for_viou); - dump_mkdir(subfield_scratch); - - sprintf(fname, "%s/%s_%zu.h5", subfield_scratch, "fields", step_for_viou); - double el1 = uptime(); - hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); - H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); - hid_t file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); - H5Pclose(plist_id); - - sprintf(fname, "Timestep_%zu", step_for_viou); - hid_t group_id = H5Gcreate(file_id, fname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - - el1 = uptime() - el1; - //sim_log("TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts - double el2 = uptime(); - - /* -// Create a variable list of field values to output. -size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); -size_t * varlist = new size_t[numvars]; - -for(size_t i(0), c(0); ifdParams.output_vars.bitset(i)) varlist[c++] = i; - -printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ - -#define fpp(x, y, z) f[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] - /* - typedef struct field { - float ex, ey, ez, div_e_err; // Electric field and div E error - float cbx, cby, cbz, div_b_err; // Magnetic field and div B error - float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density - float jfx, jfy, jfz, rhof; // Free current and charge density - material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes - material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers - } field_t;*/ - // Local voxel mesh resolution. Voxels are - // indexed FORTRAN style 0:nx+1,0:ny+1,0:nz+1 - // with voxels 1:nx,1:ny,1:nz being non-ghost - // voxels. 
- - float *temp_buf = (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); - hsize_t temp_buf_index; - hid_t dset_id; - //char *field_var_name[] = {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; - plist_id = H5Pcreate(H5P_DATASET_XFER); - //Comment out for test only - H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); - //H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, (hsize_t *) &numparticles, NULL); - - //global->topology_x - - hsize_t field_global_size[3], field_local_size[3], global_offset[3], global_count[3]; - field_global_size[0] = (grid->nx * grid->gpx); - field_global_size[1] = (grid->ny * grid->gpy); - field_global_size[2] = (grid->nz * grid->gpz); - - field_local_size[0] = grid->nx; - field_local_size[1] = grid->ny; - field_local_size[2] = grid->nz; - - int gpx = grid->gpx; - int gpy = grid->gpy; - int gpz = grid->gpz; - - // Convert rank to local decomposition - int rx, ry, rz; - UNVOXEL(mpi_rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); - - mpi_rank_x = rx; - mpi_rank_y = ry; - mpi_rank_z = rz; - - global_offset[0] = (grid->nx) * mpi_rank_x; - global_offset[1] = (grid->ny) * mpi_rank_y; - global_offset[2] = (grid->nz) * mpi_rank_z; - - global_count[0] = (grid->nx); - global_count[1] = (grid->ny); - global_count[2] = (grid->nz); - -#ifdef DUMP_INFO_DEBUG - printf("global size = %d %d %d \n", field_global_size[0], field_global_size[1], field_global_size[2]); - printf("global_offset = %d %d %d \n", global_offset[0], global_offset[1], global_offset[2]); - printf("global_count = %d %d %d \n", global_count[0], global_count[1], global_count[2]); - printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); - fflush(stdout); -#endif - - hid_t filespace = H5Screate_simple(3, field_global_size, NULL); - hid_t memspace = H5Screate_simple(3, field_local_size, NULL); - hid_t dataspace_id; - - /* - typedef struct 
field { - float ex, ey, ez, div_e_err; // Electric field and div E error - float cbx, cby, cbz, div_b_err; // Magnetic field and div B error - float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density - float jfx, jfy, jfz, rhof; // Free current and charge density - material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes - material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers - } field_t;*/ - - if (field_dump_flag.ex) - DUMP_FIELD_TO_HDF5("ex", ex, H5T_NATIVE_FLOAT); - if (field_dump_flag.ey) - DUMP_FIELD_TO_HDF5("ey", ey, H5T_NATIVE_FLOAT); - if (field_dump_flag.ez) - DUMP_FIELD_TO_HDF5("ez", ez, H5T_NATIVE_FLOAT); - if (field_dump_flag.div_e_err) - DUMP_FIELD_TO_HDF5("div_e_err", div_e_err, H5T_NATIVE_FLOAT); - - if (field_dump_flag.cbx) - DUMP_FIELD_TO_HDF5("cbx", cbx, H5T_NATIVE_FLOAT); - if (field_dump_flag.cby) - DUMP_FIELD_TO_HDF5("cby", cby, H5T_NATIVE_FLOAT); - if (field_dump_flag.cbz) - DUMP_FIELD_TO_HDF5("cbz", cbz, H5T_NATIVE_FLOAT); - if (field_dump_flag.div_b_err) - DUMP_FIELD_TO_HDF5("div_b_err", div_b_err, H5T_NATIVE_FLOAT); - - if (field_dump_flag.tcax) - DUMP_FIELD_TO_HDF5("tcax", tcax, H5T_NATIVE_FLOAT); - if (field_dump_flag.tcay) - DUMP_FIELD_TO_HDF5("tcay", tcay, H5T_NATIVE_FLOAT); - if (field_dump_flag.tcaz) - DUMP_FIELD_TO_HDF5("tcaz", tcaz, H5T_NATIVE_FLOAT); - if (field_dump_flag.rhob) - DUMP_FIELD_TO_HDF5("rhob", rhob, H5T_NATIVE_FLOAT); - - if (field_dump_flag.jfx) - DUMP_FIELD_TO_HDF5("jfx", jfx, H5T_NATIVE_FLOAT); - if (field_dump_flag.jfy) - DUMP_FIELD_TO_HDF5("jfy", jfy, H5T_NATIVE_FLOAT); - if (field_dump_flag.jfz) - DUMP_FIELD_TO_HDF5("jfz", jfz, H5T_NATIVE_FLOAT); - if (field_dump_flag.rhof) - DUMP_FIELD_TO_HDF5("rhof", rhof, H5T_NATIVE_FLOAT); - - //H5T_NATIVE_SHORT for material_id (typedef int16_t material_id) - if (field_dump_flag.ematx) - DUMP_FIELD_TO_HDF5("ematx", ematx, H5T_NATIVE_SHORT); - if (field_dump_flag.ematy) - DUMP_FIELD_TO_HDF5("ematy", ematy, 
H5T_NATIVE_SHORT); - if (field_dump_flag.ematz) - DUMP_FIELD_TO_HDF5("ematz", ematz, H5T_NATIVE_SHORT); - if (field_dump_flag.nmat) - DUMP_FIELD_TO_HDF5("nmat", nmat, H5T_NATIVE_SHORT); - - if (field_dump_flag.fmatx) - DUMP_FIELD_TO_HDF5("fmatx", fmatx, H5T_NATIVE_SHORT); - if (field_dump_flag.fmaty) - DUMP_FIELD_TO_HDF5("fmaty", fmaty, H5T_NATIVE_SHORT); - if (field_dump_flag.fmatz) - DUMP_FIELD_TO_HDF5("fmatz", fmatz, H5T_NATIVE_SHORT); - if (field_dump_flag.cmat) - DUMP_FIELD_TO_HDF5("cmat", cmat, H5T_NATIVE_SHORT); - - el2 = uptime() - el2; - //sim_log("TimeHDF5Write: " << el2 << " s"); - - double el3 = uptime(); - - //Write metadata (geo original and geo dx/dy/dz) for ArrayUDF - float attr_data[2][3]; - attr_data[0][0] = grid->x0; - attr_data[0][1] = grid->y0; - attr_data[0][2] = grid->z0; - attr_data[1][0] = grid->dx; - attr_data[1][1] = grid->dy; - attr_data[1][2] = grid->dz; - hsize_t dims[2]; - dims[0] = 2; - dims[1] = 3; - hid_t va_geo_dataspace_id = H5Screate_simple(2, dims, NULL); - hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); - H5Awrite(va_geo_attribute_id, H5T_NATIVE_FLOAT, attr_data); - H5Sclose(va_geo_dataspace_id); - H5Aclose(va_geo_attribute_id); - - free(temp_buf); - H5Sclose(filespace); - H5Sclose(memspace); - H5Pclose(plist_id); - H5Gclose(group_id); - H5Fclose(file_id); - - el3 = uptime() - el3; - //sim_log("TimeHDF5Close: " << el3 << " s"); - - if (mpi_rank == 0) - { - char const *output_xml_file = "./field_hdf5/hdf5_field.xdmf"; - char dimensions_3d[128]; - sprintf(dimensions_3d, "%lld %lld %lld", field_global_size[0], field_global_size[1], field_global_size[2]); - char dimensions_4d[128]; - sprintf(dimensions_4d, "%lld %lld %lld %d", field_global_size[0], field_global_size[1], field_global_size[2], 3); - char orignal[128]; - sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); - char dxdydz[128]; - sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, 
grid->dz); - - int nframes = num_step / field_interval + 1; - static int field_tframe = 0; - -#ifdef DUMP_INFO_DEBUG - printf(" meta file : %s \n", output_xml_file); - printf(" array dims per var: %s \n", dimensions_3d); - printf("array dims all vars: %s \n", dimensions_4d); - printf(" orignal: %s \n", orignal); - printf(" dxdydz: %s \n", dxdydz); - printf(" nframes: %d \n", nframes); - printf(" field_interval: %d \n", field_interval); - printf(" current step: %lld \n", step_for_viou); - printf(" current step: %lld \n", step_for_viou); - - //printf(" Simulation time: %f \n", grid->t0); - printf(" tframe: %d \n", field_tframe); -#endif - - // TODO: this footer dumping is more likely better done in a - // destructor, rather than hoping a multiple division works out - if (field_tframe >= 1) - { - if (field_tframe == (nframes - 1)) - { - invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 1); - } - else - { - invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 0); - } - } - else - { - create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, field_interval); - if (field_tframe == (nframes - 1)) - { - invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 1); - } - else - { - invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 0); - } - } - field_tframe++; - } -} - -// TODO: fix this, it currently uses a static global and the logic only -// supports 1 species otherwise things get out of sync -void vpic_simulation::dump_hydro_hdf5( const char *speciesname, - const char *fbase, - int ftag ) -{ - size_t step_for_viou = step(); - -#define DUMP_HYDRO_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ - { \ - dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ - temp_buf_index = 0; \ - for (size_t i(1); i < grid->nx + 1; i++) \ - { \ - 
for (size_t j(1); j < grid->ny + 1; j++) \ - { \ - for (size_t k(1); k < grid->nz + 1; k++) \ - { \ - temp_buf[temp_buf_index] = hydro(i, j, k).ATTRIBUTE_NAME; \ - temp_buf_index = temp_buf_index + 1; \ - } \ - } \ - } \ - dataspace_id = H5Dget_space(dset_id); \ - H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); \ - H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, temp_buf); \ - H5Sclose(dataspace_id); \ - H5Dclose(dset_id); \ - } - //#define DUMP_INFO_DEBUG 1 - int mpi_size, mpi_rank; - MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - - species_t *sp = find_species_name(speciesname, species_list); - if (!sp) - ERROR(("Invalid species name: %s", speciesname)); - - clear_hydro_array(hydro_array); - accumulate_hydro_p(hydro_array, sp, interpolator_array); - synchronize_hydro_array(hydro_array); - - char hname[256]; - char hydro_scratch[128]; - char subhydro_scratch[128]; - - sprintf(hydro_scratch, "./%s", "hydro_hdf5"); - dump_mkdir(hydro_scratch); - sprintf(subhydro_scratch, "%s/T.%zu/", hydro_scratch, step_for_viou); - dump_mkdir(subhydro_scratch); - - sprintf(hname, "%s/hydro_%s_%zu.h5", subhydro_scratch, speciesname, step_for_viou); - double el1 = uptime(); - hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); - H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); - hid_t file_id = H5Fcreate(hname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); - H5Pclose(plist_id); - - sprintf(hname, "Timestep_%zu", step_for_viou); - hid_t group_id = H5Gcreate(file_id, hname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - - el1 = uptime() - el1; - //sim_log("TimeHDF5Open: " << el1 << " s"); //Easy to handle results for scripts - //double el2 = uptime(); - - // Create a variable list of field values to output. 
- //size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); - //size_t *varlist = new size_t[numvars]; - - //for (size_t i(0), c(0); i < total_field_variables; i++) - // if (global->fdParams.output_vars.bitset(i)) - // varlist[c++] = i; - - //printf("\nBEGIN_OUTPUT: numvars = %zu \n", numvars); - - - //typedef struct hydro { - // float jx, jy, jz, rho; // Current and charge density => , - // float px, py, pz, ke; // Momentum and K.E. density => , - // float txx, tyy, tzz; // Stress diagonal => , i==j - // float tyz, tzx, txy; // Stress off-diagonal => , i!=j - // float _pad[2]; // 16-byte align - //} hydro_t; - - //typedef struct hydro_array { - // hydro_t * ALIGNED(128) h; - // grid_t * g; - //} hydro_array_t; - - float *temp_buf = (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); - hsize_t temp_buf_index; - hid_t dset_id; - //char *field_var_name[] = {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; - plist_id = H5Pcreate(H5P_DATASET_XFER); - //Comment out for test only - H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); - //H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, (hsize_t *) &numparticles, NULL); - - //global->topology_x - - hsize_t hydro_global_size[3], hydro_local_size[3], global_offset[3], global_count[3]; - hydro_global_size[0] = (grid->nx * grid->gpx); - hydro_global_size[1] = (grid->ny * grid->gpy); - hydro_global_size[2] = (grid->nz * grid->gpz); - - hydro_local_size[0] = grid->nx; - hydro_local_size[1] = grid->ny; - hydro_local_size[2] = grid->nz; - - int mpi_rank_x, mpi_rank_y, mpi_rank_z; - RANK_TO_INDEX2(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); - - global_offset[0] = (grid->nx) * mpi_rank_x; - global_offset[1] = (grid->ny) * mpi_rank_y; - global_offset[2] = (grid->nz) * mpi_rank_z; - - global_count[0] = (grid->nx); - global_count[1] = (grid->ny); - global_count[2] = (grid->nz); - -#ifdef 
DUMP_INFO_DEBUG - printf("global size = %d %d %d \n", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); - printf("global_offset = %d %d %d \n", global_offset[0], global_offset[1], global_offset[2]); - printf("global_count = %d %d %d \n", global_count[0], global_count[1], global_count[2]); - printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); - fflush(stdout); -#endif - - hid_t filespace = H5Screate_simple(3, hydro_global_size, NULL); - hid_t memspace = H5Screate_simple(3, hydro_local_size, NULL); - hid_t dataspace_id; - - //typedef struct hydro { - // float jx, jy, jz, rho; // Current and charge density => , - // float px, py, pz, ke; // Momentum and K.E. density => , - // float txx, tyy, tzz; // Stress diagonal => , i==j - // float tyz, tzx, txy; // Stress off-diagonal => , i!=j - // float _pad[2]; // 16-byte align - //} hydro_t; - - if (hydro_dump_flag.jx) - DUMP_HYDRO_TO_HDF5("jx", jx, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.jy) - DUMP_HYDRO_TO_HDF5("jy", jy, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.jz) - DUMP_HYDRO_TO_HDF5("jz", jz, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.rho) - DUMP_HYDRO_TO_HDF5("rho", rho, H5T_NATIVE_FLOAT); - - if (hydro_dump_flag.px) - DUMP_HYDRO_TO_HDF5("px", px, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.py) - DUMP_HYDRO_TO_HDF5("py", py, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.pz) - DUMP_HYDRO_TO_HDF5("pz", pz, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.ke) - DUMP_HYDRO_TO_HDF5("ke", ke, H5T_NATIVE_FLOAT); - - if (hydro_dump_flag.txx) - DUMP_HYDRO_TO_HDF5("txx", txx, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.tyy) - DUMP_HYDRO_TO_HDF5("tyy", tyy, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.tzz) - DUMP_HYDRO_TO_HDF5("tzz", tzz, H5T_NATIVE_FLOAT); - - if (hydro_dump_flag.tyz) - DUMP_HYDRO_TO_HDF5("tyz", tyz, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.tzx) - DUMP_HYDRO_TO_HDF5("tzx", tzx, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.txy) - DUMP_HYDRO_TO_HDF5("txy", txy, 
H5T_NATIVE_FLOAT); - - //el2 = uptime() - el2; - //sim_log("TimeHDF5Write: " << el2 << " s"); - - double el3 = uptime(); - - //Write metadata (geo original and geo dx/dy/dz) for ArrayUDF - float attr_data[2][3]; - attr_data[0][0] = grid->x0; - attr_data[0][1] = grid->y0; - attr_data[0][2] = grid->z0; - attr_data[1][0] = grid->dx; - attr_data[1][1] = grid->dy; - attr_data[1][2] = grid->dz; - hsize_t dims[2]; - dims[0] = 2; - dims[1] = 3; - hid_t va_geo_dataspace_id = H5Screate_simple(2, dims, NULL); - hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); - H5Awrite(va_geo_attribute_id, H5T_NATIVE_FLOAT, attr_data); - H5Sclose(va_geo_dataspace_id); - H5Aclose(va_geo_attribute_id); - - free(temp_buf); - H5Sclose(filespace); - H5Sclose(memspace); - H5Pclose(plist_id); - H5Gclose(group_id); - H5Fclose(file_id); - - el3 = uptime() - el3; - //sim_log("TimeHDF5Close: " << el3 << " s"); - - if (mpi_rank == 0) - { - char output_xml_file[128]; - sprintf(output_xml_file, "./%s/%s%s%s", "hydro_hdf5", "hydro-", speciesname, ".xdmf"); - char dimensions_3d[128]; - sprintf(dimensions_3d, "%lld %lld %lld", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); - char dimensions_4d[128]; - sprintf(dimensions_4d, "%lld %lld %lld %d", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2], 3); - char orignal[128]; - sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); - char dxdydz[128]; - sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); - - int nframes = num_step / hydro_interval + 1; - - const int tframe = tframe_map[sp->id]; - -#ifdef DUMP_INFO_DEBUG - printf(" meta file : %s \n", output_xml_file); - printf(" array dims per var: %s \n", dimensions_3d); - printf("array dims all vars: %s \n", dimensions_4d); - printf(" orignal: %s \n", orignal); - printf(" dxdydz: %s \n", dxdydz); - printf(" nframes: %d \n", nframes); - printf(" hydro_fields_interval: %d \n", 
hydro_interval); - printf(" current step: %lld \n", step_for_viou); - printf(" Simulation time: %f \n", grid->t0); - printf(" tframe: %d \n", tframe); -#endif - - char speciesname_new[128]; - sprintf(speciesname_new, "hydro_%s", speciesname); - if (tframe >= 1) - { - if (tframe == (nframes - 1)) - { - invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 1); - } - else - { - invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); - } - } - else - { - create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, hydro_interval); - if (tframe == (nframes - 1)) - { - invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 1); - } - else - { - invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); - } - } - tframe_map[sp->id]++; - } -} - -// TODO": make the sp_name and speciesname varailbe naming consistent -void -vpic_simulation::dump_particles_hdf5( const char *sp_name, - const char *fbase, - int ftag ) -{ - size_t step_for_viou = step(); - char fname[256]; - char group_name[256]; - char particle_scratch[128]; - char subparticle_scratch[128]; - - int np_local; - species_t *sp; - - float *Pf; - int *Pi; - - // get the total number of particles. 
in this example, output only electrons - sp = species_list; - sprintf(particle_scratch, DUMP_DIR_FORMAT, "particle_hdf5"); - dump_mkdir(particle_scratch); - sprintf(subparticle_scratch, "%s/T.%ld/", particle_scratch, step_for_viou); - dump_mkdir(subparticle_scratch); - - // TODO: Allow the user to set this - - int stride_particle_dump = 1; - while (sp) - { - np_local = (sp->np + stride_particle_dump - 1) / stride_particle_dump; - - // make a copy of the part of particle data to be dumped - double ec1 = uptime(); - - int sp_np = sp->np; - int sp_max_np = sp->max_np; - particle_t *ALIGNED(128) p_buf = NULL; - if (!p_buf) - MALLOC_ALIGNED(p_buf, np_local, 128); - particle_t *sp_p = sp->p; - sp->p = p_buf; - sp->np = np_local; - sp->max_np = np_local; - - for (long long iptl = 0, i = 0; iptl < sp_np; iptl += stride_particle_dump, ++i) - { - COPY(&sp->p[i], &sp_p[iptl], 1); - } - - center_p(sp, interpolator_array); - - ec1 = uptime() - ec1; - int mpi_rank; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - - //std::cout << "on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local << std::endl; - //sim_log("on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local); - - Pf = (float *)sp->p; - Pi = (int *)sp->p; - - // open HDF5 file in "particle/T./" subdirectory - // filename: eparticle.h5p - sprintf(fname, "%s/%s_%ld.h5", subparticle_scratch, sp->name, step_for_viou); - sprintf(group_name, "/Timestep_%ld", step_for_viou); - double el1 = uptime(); - - hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); - H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); - hid_t file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); - hid_t group_id = H5Gcreate(file_id, group_name, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - - H5Pclose(plist_id); - - long long total_particles, offset; - long long numparticles = np_local; - MPI_Allreduce(&numparticles, &total_particles, 1, 
MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - MPI_Scan(&numparticles, &offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - offset -= numparticles; - - hid_t filespace = H5Screate_simple(1, (hsize_t *)&total_particles, NULL); - - hsize_t memspace_count_temp = numparticles * 8; - hid_t memspace = H5Screate_simple(1, &memspace_count_temp, NULL); - - // Don't need, can just use H5S_ALL - //hsize_t linearspace_count_temp = numparticles; - //hid_t linearspace = H5Screate_simple(1, &linearspace_count_temp, NULL); - - plist_id = H5Pcreate(H5P_DATASET_XFER); - - //Comment out for test only - H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); - H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *)&offset, NULL, (hsize_t *)&numparticles, NULL); - - hsize_t memspace_start = 0, memspace_stride = 8, memspace_count = np_local; - H5Sselect_hyperslab(memspace, H5S_SELECT_SET, &memspace_start, &memspace_stride, &memspace_count, NULL); - - el1 = uptime() - el1; - //sim_log("Particle TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts - - //double el2 = uptime(); - - // This point offset is silly, and loses the type safety (pf+1) - hid_t dset_id = H5Dcreate(group_id, "dX", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - int ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf); - H5Dclose(dset_id); - - dset_id = H5Dcreate(group_id, "dY", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 1); - H5Dclose(dset_id); - - dset_id = H5Dcreate(group_id, "dZ", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 2); - H5Dclose(dset_id); - -#define OUTPUT_CONVERT_GLOBAL_ID 1 -#ifdef OUTPUT_CONVERT_GLOBAL_ID - // TODO: make a function out of this too, its used in openpmd - std::vector global_pi; - global_pi.reserve(numparticles); - // 
TODO: this could be parallel - for (int i = 0; i < numparticles; i++) - { - int local_i = sp->p[i].i; - - int ix, iy, iz, rx, ry, rz; - - // Convert rank to local x/y/z - UNVOXEL(rank(), rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); - - // Calculate local ix/iy/iz - UNVOXEL(local_i, ix, iy, iz, grid->nx+2, grid->ny+2, grid->nz+2); - - // Convert ix/iy/iz to global - int gix = ix + (grid->nx * (rx)); - int giy = iy + (grid->ny * (ry)); - int giz = iz + (grid->nz * (rz)); - - // calculate global grid sizes - int gnx = grid->nx * grid->gpx; - int gny = grid->ny * grid->gpy; - int gnz = grid->nz * grid->gpz; - - // TODO: find a better way to account for the hard coded ghosts in VOXEL - int global_i = VOXEL(gix, giy, giz, gnx-2, gny-2, gnz-2); - - //std::cout << rank() << " local i " << local_i << " becomes " << global_i << std::endl; - global_pi[i] = global_i; - } - -#undef UNVOXEL - dset_id = H5Dcreate(group_id, "i", H5T_NATIVE_INT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, H5S_ALL, filespace, plist_id, global_pi.data()); - H5Dclose(dset_id); - -#else - dset_id = H5Dcreate(group_id, "i", H5T_NATIVE_INT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, memspace, filespace, plist_id, Pi + 3); - H5Dclose(dset_id); -#endif - - dset_id = H5Dcreate(group_id, "Ux", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 4); - H5Dclose(dset_id); - - dset_id = H5Dcreate(group_id, "Uy", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 5); - H5Dclose(dset_id); - - dset_id = H5Dcreate(group_id, "Uz", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 6); - H5Dclose(dset_id); - - dset_id = 
H5Dcreate(group_id, "q", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 7); - H5Dclose(dset_id); - - //el2 = uptime() - el2; - //sim_log("Particle TimeHDF5Write: " << el2 << " s"); - - double el3 = uptime(); - H5Sclose(memspace); - H5Sclose(filespace); - H5Pclose(plist_id); - H5Gclose(group_id); - H5Fclose(file_id); - el3 = uptime() - el3; - //sim_log("Particle TimeHDF5Close: " << el3 << " s"); - - sp->p = sp_p; - sp->np = sp_np; - sp->max_np = sp_max_np; - FREE_ALIGNED(p_buf); - - // Write metadata if step() == 0 - char meta_fname[256]; - - sprintf(meta_fname, "%s/grid_metadata_%s_%ld.h5", subparticle_scratch, sp->name, step_for_viou); - - double meta_el1 = uptime(); - - hid_t meta_plist_id = H5Pcreate(H5P_FILE_ACCESS); - H5Pset_fapl_mpio(meta_plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); - hid_t meta_file_id = H5Fcreate(meta_fname, H5F_ACC_TRUNC, H5P_DEFAULT, meta_plist_id); - hid_t meta_group_id = H5Gcreate(meta_file_id, group_name, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - H5Pclose(meta_plist_id); - - long long meta_total_particles, meta_offset; - long long meta_numparticles = 1; - MPI_Allreduce(&meta_numparticles, &meta_total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - MPI_Scan(&meta_numparticles, &meta_offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - meta_offset -= meta_numparticles; - - hid_t meta_filespace = H5Screate_simple(1, (hsize_t *)&meta_total_particles, NULL); - hid_t meta_memspace = H5Screate_simple(1, (hsize_t *)&meta_numparticles, NULL); - meta_plist_id = H5Pcreate(H5P_DATASET_XFER); - H5Pset_dxpl_mpio(meta_plist_id, H5FD_MPIO_COLLECTIVE); - H5Sselect_hyperslab(meta_filespace, H5S_SELECT_SET, (hsize_t *)&meta_offset, NULL, (hsize_t *)&meta_numparticles, NULL); - meta_el1 = uptime() - meta_el1; - //sim_log("Metafile TimeHDF5Open): " << meta_el1 << " s"); //Easy to handle results for scripts - - double meta_el2 = uptime(); - - hid_t 
meta_dset_id = H5Dcreate(meta_group_id, "np_local", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, (int32_t *)&np_local); - H5Dclose(meta_dset_id); - //if (rank == 0) printf ("Written variable dX \n"); - - meta_dset_id = H5Dcreate(meta_group_id, "nx", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, &grid->nx); - H5Dclose(meta_dset_id); - //if (rank == 0) printf ("Written variable dY \n"); - - meta_dset_id = H5Dcreate(meta_group_id, "ny", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, &grid->ny); - H5Dclose(meta_dset_id); - //if (rank == 0) printf ("Written variable dZ \n"); - - meta_dset_id = H5Dcreate(meta_group_id, "nz", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, &grid->nz); - H5Dclose(meta_dset_id); - //if (rank == 0) printf ("Written variable i \n"); - - meta_dset_id = H5Dcreate(meta_group_id, "x0", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->x0); - H5Dclose(meta_dset_id); - - meta_dset_id = H5Dcreate(meta_group_id, "y0", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->y0); - H5Dclose(meta_dset_id); - - meta_dset_id = H5Dcreate(meta_group_id, "z0", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->z0); - 
H5Dclose(meta_dset_id); - - meta_dset_id = H5Dcreate(meta_group_id, "dx", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->dx); - H5Dclose(meta_dset_id); - - meta_dset_id = H5Dcreate(meta_group_id, "dy", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->dy); - H5Dclose(meta_dset_id); - - meta_dset_id = H5Dcreate(meta_group_id, "dz", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->dz); - H5Dclose(meta_dset_id); - - meta_el2 = uptime() - meta_el2; - //sim_log("Metafile TimeHDF5Write: " << meta_el2 << " s"); - double meta_el3 = uptime(); - H5Sclose(meta_memspace); - H5Sclose(meta_filespace); - H5Pclose(meta_plist_id); - H5Gclose(meta_group_id); - H5Fclose(meta_file_id); - meta_el3 = uptime() - meta_el3; - //sim_log("Metafile TimeHDF5Close: " << meta_el3 << " s"); - - sp = sp->next; - } -} -#endif - -void -vpic_simulation::dump_particles( const char *sp_name, - const char *fbase, - int ftag ) -{ - species_t *sp; - char fname[256]; - FileIO fileIO; - int dim[1], buf_start; - static particle_t * ALIGNED(128) p_buf = NULL; -# define PBUF_SIZE 32768 // 1MB of particles - - sp = find_species_name( sp_name, species_list ); - if( !sp ) ERROR(( "Invalid species name \"%s\".", sp_name )); - - if( !fbase ) ERROR(( "Invalid filename" )); - - if( !p_buf ) MALLOC_ALIGNED( p_buf, PBUF_SIZE, 128 ); - - if( rank()==0 ) - MESSAGE(("Dumping \"%s\" particles to \"%s\"",sp->name,fbase)); - - if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step(), rank() ); - else sprintf( fname, "%s.%i", fbase, rank() ); - FileIOStatus status = fileIO.open(fname, io_write); - if( status==fail ) ERROR(( "Could not open 
\"%s\"", fname )); - - /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ - nxout = grid->nx; - nyout = grid->ny; - nzout = grid->nz; - dxout = grid->dx; - dyout = grid->dy; - dzout = grid->dz; - - WRITE_HEADER_V0( dump_type::particle_dump, sp->id, sp->q/sp->m, fileIO ); - - dim[0] = sp->np; - WRITE_ARRAY_HEADER( p_buf, 1, dim, fileIO ); - - // Copy a PBUF_SIZE hunk of the particle list into the particle - // buffer, timecenter it and write it out. This is done this way to - // guarantee the particle list unchanged while not requiring too - // much memory. - - // FIXME: WITH A PIPELINED CENTER_P, PBUF NOMINALLY SHOULD BE QUITE - // LARGE. - - particle_t * sp_p = sp->p; sp->p = p_buf; - int sp_np = sp->np; sp->np = 0; - int sp_max_np = sp->max_np; sp->max_np = PBUF_SIZE; - for( buf_start=0; buf_startnp = sp_np-buf_start; if( sp->np > PBUF_SIZE ) sp->np = PBUF_SIZE; - COPY( sp->p, &sp_p[buf_start], sp->np ); - center_p( sp, interpolator_array ); - fileIO.write( sp->p, sp->np ); - } - sp->p = sp_p; - sp->np = sp_np; - sp->max_np = sp_max_np; - - if( fileIO.close() ) ERROR(("File close failed on dump particles!!!")); -} - /*------------------------------------------------------------------------------ * New dump logic *---------------------------------------------------------------------------*/ @@ -1683,6 +382,8 @@ vpic_simulation::global_header( const char * base, if( fileIO.close() ) ERROR(( "File close failed on global header!!!" )); } +// TODO: why is there field_dump and dump_fields? 
+// TODO: this could probably move into the dump_strategy void vpic_simulation::field_dump( DumpParameters & dumpParams ) { @@ -1721,12 +422,12 @@ vpic_simulation::field_dump( DumpParameters & dumpParams ) { # define f(x,y,z) f[ VOXEL(x,y,z, grid->nx,grid->ny,grid->nz) ] /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ - nxout = (grid->nx)/istride; - nyout = (grid->ny)/jstride; - nzout = (grid->nz)/kstride; - dxout = (grid->dx)*istride; - dyout = (grid->dy)*jstride; - dzout = (grid->dz)*kstride; + size_t nxout = (grid->nx)/istride; + size_t nyout = (grid->ny)/jstride; + size_t nzout = (grid->nz)/kstride; + float dxout = (grid->dx)*istride; + float dyout = (grid->dy)*jstride; + float dzout = (grid->dz)*kstride; /* Banded output will write data as a single block-array as opposed to * the Array-of-Structure format that is used for native storage. @@ -1738,7 +439,7 @@ vpic_simulation::field_dump( DumpParameters & dumpParams ) { if(dumpParams.format == band) { - WRITE_HEADER_V0(dump_type::field_dump, -1, 0, fileIO); + WRITE_HEADER_V0(dump_type::field_dump, -1, 0, fileIO, step(), rank(), nproc()); dim[0] = nxout+2; dim[1] = nyout+2; @@ -1799,7 +500,7 @@ vpic_simulation::field_dump( DumpParameters & dumpParams ) { } else { // band_interleave - WRITE_HEADER_V0(dump_type::field_dump, -1, 0, fileIO); + WRITE_HEADER_V0(dump_type::field_dump, -1, 0, fileIO, step(), rank(), nproc()); dim[0] = nxout+2; dim[1] = nyout+2; @@ -1867,12 +568,12 @@ vpic_simulation::hydro_dump( const char * speciesname, int dim[3]; /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ - nxout = (grid->nx)/istride; - nyout = (grid->ny)/jstride; - nzout = (grid->nz)/kstride; - dxout = (grid->dx)*istride; - dyout = (grid->dy)*jstride; - dzout = (grid->dz)*kstride; + size_t nxout = (grid->nx)/istride; + size_t nyout = (grid->ny)/jstride; + size_t nzout = (grid->nz)/kstride; + float dxout = (grid->dx)*istride; + float dyout = (grid->dy)*jstride; + float dzout = (grid->dz)*kstride; /* 
Banded output will write data as a single block-array as opposed to * the Array-of-Structure format that is used for native storage. @@ -1884,7 +585,7 @@ vpic_simulation::hydro_dump( const char * speciesname, */ if(dumpParams.format == band) { - WRITE_HEADER_V0(dump_type::hydro_dump, sp->id, sp->q/sp->m, fileIO); + WRITE_HEADER_V0(dump_type::hydro_dump, sp->id, sp->q/sp->m, fileIO, step(), rank(), nproc()); dim[0] = nxout+2; dim[1] = nyout+2; @@ -1928,7 +629,7 @@ vpic_simulation::hydro_dump( const char * speciesname, } else { // band_interleave - WRITE_HEADER_V0(dump_type::hydro_dump, sp->id, sp->q/sp->m, fileIO); + WRITE_HEADER_V0(dump_type::hydro_dump, sp->id, sp->q/sp->m, fileIO, step(), rank(), nproc()); dim[0] = nxout; dim[1] = nyout; diff --git a/src/vpic/dump.h b/src/vpic/dump.h new file mode 100644 index 00000000..966e627e --- /dev/null +++ b/src/vpic/dump.h @@ -0,0 +1,46 @@ +#ifndef dump_h +#define dump_h + +#include + +// TODO: should this be an enum? +namespace dump_type { + const int grid_dump = 0; + const int field_dump = 1; + const int hydro_dump = 2; + const int particle_dump = 3; + const int restart_dump = 4; + const int history_dump = 5; +} // namespace + +// TODO: namesapce? 
+std::array global_particle_index(int local_i, grid_t* grid, int rank) +{ + int ix, iy, iz, rx, ry, rz; + // Convert rank to local x/y/z + UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + + // Calculate local ix/iy/iz + UNVOXEL(local_i, ix, iy, iz, grid->nx+2, grid->ny+2, grid->nz+2); + + // Account for the "first" ghost cell + ix = ix - 1; + iy = iy - 1; + iz = iz - 1; + + // Convert ix/iy/iz to global + int gix = ix + (grid->nx * (rx)); + int giy = iy + (grid->ny * (ry)); + int giz = iz + (grid->nz * (rz)); + + // calculate global grid sizes + int gnx = grid->nx * grid->gpx; + int gny = grid->ny * grid->gpy; + int gnz = grid->nz * grid->gpz; + + // TODO: find a better way to account for the hard coded ghosts in VOXEL + int global_i = VOXEL(gix, giy, giz, gnx-2, gny-2, gnz-2); + + return { global_i, gix, giy, giz }; +} +#endif diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h new file mode 100644 index 00000000..c745df7c --- /dev/null +++ b/src/vpic/dump_strategy.h @@ -0,0 +1,1365 @@ +#ifndef Dump_Strategy_h +#define Dump_Strategy_h + +#include "../util/io/FileIO.h" +#include "../util/util_base.h" +#include "../util/io/FileUtils.h" +#include "dump.h" +#include "dumpmacros.h" + +#ifdef VPIC_ENABLE_HDF5 +#include "hdf5.h" // from the lib +#include "hdf5_header_info.h" // from vpic +#endif + +#ifdef VPIC_ENABLE_OPENPMD +#include +#endif + +class Dump_Strategy { + public: + int rank, nproc; + + Dump_Strategy(int _rank, int _nproc) : rank(_rank), nproc(_nproc) { } // empty + + virtual void dump_fields( + const char *fbase, + int step, + grid_t* grid, + field_array_t* field_array, + int ftag + ); + virtual void dump_hydro( + const char *fbase, + int step, + hydro_array_t* hydro_array, + species_t* sp, + interpolator_array_t* interpolator_array, + grid_t* grid, + int ftag + ); + virtual void dump_particles( + const char *fbase, + species_t* sp, + grid_t* grid, + int step, + interpolator_array_t* interpolator_array, + int ftag + ); +}; + +class 
BinaryDump : public Dump_Strategy { + public: + using Dump_Strategy::Dump_Strategy; // inherit constructor + + // TODO: now we pass rank and step, ftag has odd semanticds + void dump_fields( + const char *fbase, + int step, + grid_t* grid, + field_array_t* field_array, + int ftag + ) + { + char fname[256]; + FileIO fileIO; + int dim[3]; + + if( !fbase ) ERROR(( "Invalid filename" )); + + if( rank==0 ) MESSAGE(( "Dumping fields to \"%s\"", fbase )); + + if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step, rank ); + else sprintf( fname, "%s.%i", fbase, rank ); + + FileIOStatus status = fileIO.open(fname, io_write); + if( status==fail ) ERROR(( "Could not open \"%s\".", fname )); + + /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ + size_t nxout = grid->nx; + size_t nyout = grid->ny; + size_t nzout = grid->nz; + float dxout = grid->dx; + float dyout = grid->dy; + float dzout = grid->dz; + + WRITE_HEADER_V0( dump_type::field_dump, -1, 0, fileIO, step , rank, nproc); + + dim[0] = grid->nx+2; + dim[1] = grid->ny+2; + dim[2] = grid->nz+2; + WRITE_ARRAY_HEADER( field_array->f, 3, dim, fileIO ); + fileIO.write( field_array->f, dim[0]*dim[1]*dim[2] ); + if( fileIO.close() ) ERROR(( "File close failed on dump fields!!!" 
)); + } + void dump_particles( + const char *fbase, + species_t* sp, + grid_t* grid, + int step, + interpolator_array_t* interpolator_array, + int ftag + ) + { + char fname[256]; + FileIO fileIO; + int dim[1], buf_start; + static particle_t * ALIGNED(128) p_buf = NULL; +# define PBUF_SIZE 32768 // 1MB of particles + + if( !sp ) ERROR(( "Invalid species name \"%s\".", sp->name )); + + if( !fbase ) ERROR(( "Invalid filename" )); + + if( !p_buf ) MALLOC_ALIGNED( p_buf, PBUF_SIZE, 128 ); + + if( rank==0 ) + MESSAGE(("Dumping \"%s\" particles to \"%s\"",sp->name,fbase)); + + if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step, rank ); + else sprintf( fname, "%s.%i", fbase, rank ); + FileIOStatus status = fileIO.open(fname, io_write); + if( status==fail ) ERROR(( "Could not open \"%s\"", fname )); + + /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ + size_t nxout = grid->nx; + size_t nyout = grid->ny; + size_t nzout = grid->nz; + float dxout = grid->dx; + float dyout = grid->dy; + float dzout = grid->dz; + + WRITE_HEADER_V0( dump_type::particle_dump, sp->id, sp->q/sp->m, fileIO, step, rank, nproc); + + dim[0] = sp->np; + WRITE_ARRAY_HEADER( p_buf, 1, dim, fileIO ); + + // Copy a PBUF_SIZE hunk of the particle list into the particle + // buffer, timecenter it and write it out. This is done this way to + // guarantee the particle list unchanged while not requiring too + // much memory. + + // FIXME: WITH A PIPELINED CENTER_P, PBUF NOMINALLY SHOULD BE QUITE + // LARGE. 
+ + particle_t * sp_p = sp->p; sp->p = p_buf; + int sp_np = sp->np; sp->np = 0; + int sp_max_np = sp->max_np; sp->max_np = PBUF_SIZE; + for( buf_start=0; buf_startnp = sp_np-buf_start; if( sp->np > PBUF_SIZE ) sp->np = PBUF_SIZE; + COPY( sp->p, &sp_p[buf_start], sp->np ); + center_p( sp, interpolator_array ); + fileIO.write( sp->p, sp->np ); + } + sp->p = sp_p; + sp->np = sp_np; + sp->max_np = sp_max_np; + + if( fileIO.close() ) ERROR(("File close failed on dump particles!!!")); + } + void dump_hydro( + const char *fbase, + int step, + hydro_array_t* hydro_array, + species_t* sp, + interpolator_array_t* interpolator_array, + grid_t* grid, + int ftag + ) + { + char fname[256]; + FileIO fileIO; + int dim[3]; + + if( !sp ) ERROR(( "Invalid species \"%s\"", sp->name )); + + clear_hydro_array( hydro_array ); + accumulate_hydro_p( hydro_array, sp, interpolator_array ); + synchronize_hydro_array( hydro_array ); + + if( !fbase ) ERROR(( "Invalid filename" )); + + if( rank==0 ) + MESSAGE(("Dumping \"%s\" hydro fields to \"%s\"",sp->name,fbase)); + + if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step, rank ); + else sprintf( fname, "%s.%i", fbase, rank ); + FileIOStatus status = fileIO.open(fname, io_write); + if( status==fail) ERROR(( "Could not open \"%s\".", fname )); + + /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ + size_t nxout = grid->nx; + size_t nyout = grid->ny; + size_t nzout = grid->nz; + float dxout = grid->dx; + float dyout = grid->dy; + float dzout = grid->dz; + + WRITE_HEADER_V0(dump_type::hydro_dump, sp->id, sp->q/sp->m, fileIO, step, rank, nproc); + + dim[0] = grid->nx+2; + dim[1] = grid->ny+2; + dim[2] = grid->nz+2; + WRITE_ARRAY_HEADER( hydro_array->h, 3, dim, fileIO ); + fileIO.write( hydro_array->h, dim[0]*dim[1]*dim[2] ); + if( fileIO.close() ) ERROR(( "File close failed on dump hydro!!!" 
)); + } +}; + +#ifdef VPIC_ENABLE_HDF5 +class HDF5Dump : public Dump_Strategy { + public: + using Dump_Strategy::Dump_Strategy; // inherit constructor +#define DUMP_DIR_FORMAT "./%s" + + /* define to do C-style indexing */ +#define hydro(x, y, z) hydro_array->h[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] + + // TODO: make function? +#define invert_field_xml_item(xml_file_name, speciesname_p, time_step, dims_4d, dims_3d, add_footer_flag) \ + { \ + FILE *fp; \ + fp = fopen(xml_file_name, "a"); \ + fprintf(fp, main_body_head, time_step); \ + if (field_dump_flag.enabledE()) \ + write_main_body_attribute(fp, main_body_attributeV, "E", dims_4d, dims_3d, speciesname_p, time_step, "ex", "ey", "ez"); \ + if (field_dump_flag.div_e_err) \ + fprintf(fp, main_body_attributeS, "div_e_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_e_err"); \ + if (field_dump_flag.enabledCB()) \ + write_main_body_attribute(fp, main_body_attributeV, "B", dims_4d, dims_3d, speciesname_p, time_step, "cbx", "cby", "cbz"); \ + if (field_dump_flag.div_b_err) \ + fprintf(fp, main_body_attributeS, "div_b_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_b_err"); \ + if (field_dump_flag.enabledTCA()) \ + write_main_body_attribute(fp, main_body_attributeV, "TCA", dims_4d, dims_3d, speciesname_p, time_step, "tcax", "tcay", "tcaz"); \ + if (field_dump_flag.rhob) \ + fprintf(fp, main_body_attributeS, "rhob", dims_3d, time_step, speciesname_p, time_step, time_step, "rhob"); \ + if (field_dump_flag.enabledJF()) \ + write_main_body_attribute(fp, main_body_attributeV, "JF", dims_4d, dims_3d, speciesname_p, time_step, "jfx", "jfy", "jfz"); \ + if (field_dump_flag.rhof) \ + fprintf(fp, main_body_attributeS, "rhof", dims_3d, time_step, speciesname_p, time_step, time_step, "rhof"); \ + if (field_dump_flag.enabledEMAT()) \ + write_main_body_attribute(fp, main_body_attributeV, "EMAT", dims_4d, dims_3d, speciesname_p, time_step, "ematx", "ematy", "ematz"); \ + if 
(field_dump_flag.nmat) \ + fprintf(fp, main_body_attributeS, "nmat", dims_3d, time_step, speciesname_p, time_step, time_step, "nmat"); \ + if (field_dump_flag.enabledFMAT()) \ + write_main_body_attribute(fp, main_body_attributeV, "FMAT", dims_4d, dims_3d, speciesname_p, time_step, "fmatx", "fmaty", "fmatz"); \ + if (field_dump_flag.cmat) \ + fprintf(fp, main_body_attributeS, "cmat", dims_3d, time_step, speciesname_p, time_step, time_step, "cmat"); \ + fprintf(fp, "%s", main_body_foot); \ + if (add_footer_flag) \ + fputs(footer, fp); \ + fclose(fp); \ + } + void dump_fields( + const char *fbase, + int step, + grid_t* grid, + field_array_t* field_array, + int ftag + ) + { + size_t step_for_viou = step; + + int mpi_size, mpi_rank; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + + +#ifdef DUMP_INFO_DEBUG + printf("MPI rank = %d, size = %d \n", mpi_rank, mpi_size); + //printf("base dir for field: %s \n", fdParams.baseDir); + //printf("stride x y z = (%ld, %ld, %ld)\n", fdParams.stride_x, fdParams.stride_y, fdParams.stride_z); + printf("grid x, y z = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); + printf("domain loc (x0, y0, z0) -> (x1, y1, z1) = (%f, %f, %f) -> (%f, %f, %f) \n", grid->x0, grid->y0, grid->z0, grid->x1, grid->y1, grid->z1); + //printf("global->topology_x, y, z = %f, %f, %f \n ", global->topology_x, global->topology_y, global->topology_z); + printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid->sz, grid->nv); +#endif + +#define DUMP_FIELD_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ + { \ + dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ + temp_buf_index = 0; \ + for (size_t i(1); i < grid->nx + 1; i++) \ + { \ + for (size_t j(1); j < grid->ny + 1; j++) \ + { \ + for (size_t k(1); k < grid->nz + 1; k++) \ + { \ + temp_buf[temp_buf_index] = FIELD_ARRAY_NAME->fpp(i, j, k).ATTRIBUTE_NAME; \ + temp_buf_index = temp_buf_index 
+ 1; \ + } \ + } \ + } \ + dataspace_id = H5Dget_space(dset_id); \ + H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); \ + H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, temp_buf); \ + H5Sclose(dataspace_id); \ + H5Dclose(dset_id); \ + } + + char fname[256]; + char field_scratch[128]; + char subfield_scratch[128]; + + sprintf(field_scratch, DUMP_DIR_FORMAT, "field_hdf5"); + FileUtils::makeDirectory(field_scratch); + sprintf(subfield_scratch, "%s/T.%zu/", field_scratch, step_for_viou); + FileUtils::makeDirectory(subfield_scratch); + + sprintf(fname, "%s/%s_%zu.h5", subfield_scratch, "fields", step_for_viou); + double el1 = uptime(); + hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); + H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); + hid_t file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); + H5Pclose(plist_id); + + sprintf(fname, "Timestep_%zu", step_for_viou); + hid_t group_id = H5Gcreate(file_id, fname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + + el1 = uptime() - el1; + //sim_log("TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts + double el2 = uptime(); + + /* + // Create a variable list of field values to output. 
+ size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); + size_t * varlist = new size_t[numvars]; + + for(size_t i(0), c(0); ifdParams.output_vars.bitset(i)) varlist[c++] = i; + + printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ + +#define fpp(x, y, z) f[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] + /* + typedef struct field { + float ex, ey, ez, div_e_err; // Electric field and div E error + float cbx, cby, cbz, div_b_err; // Magnetic field and div B error + float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density + float jfx, jfy, jfz, rhof; // Free current and charge density + material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes + material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers + } field_t;*/ + // Local voxel mesh resolution. Voxels are + // indexed FORTRAN style 0:nx+1,0:ny+1,0:nz+1 + // with voxels 1:nx,1:ny,1:nz being non-ghost + // voxels. + + float *temp_buf = (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); + hsize_t temp_buf_index; + hid_t dset_id; + //char *field_var_name[] = {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; + plist_id = H5Pcreate(H5P_DATASET_XFER); + //Comment out for test only + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); + //H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, (hsize_t *) &numparticles, NULL); + + //global->topology_x + + hsize_t field_global_size[3], field_local_size[3], global_offset[3], global_count[3]; + field_global_size[0] = (grid->nx * grid->gpx); + field_global_size[1] = (grid->ny * grid->gpy); + field_global_size[2] = (grid->nz * grid->gpz); + + field_local_size[0] = grid->nx; + field_local_size[1] = grid->ny; + field_local_size[2] = grid->nz; + + int gpx = grid->gpx; + int gpy = grid->gpy; + int gpz = grid->gpz; + + // Convert rank to local decomposition + int rx, ry, rz; + UNVOXEL(mpi_rank, 
rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + + mpi_rank_x = rx; + mpi_rank_y = ry; + mpi_rank_z = rz; + + global_offset[0] = (grid->nx) * mpi_rank_x; + global_offset[1] = (grid->ny) * mpi_rank_y; + global_offset[2] = (grid->nz) * mpi_rank_z; + + global_count[0] = (grid->nx); + global_count[1] = (grid->ny); + global_count[2] = (grid->nz); + +#ifdef DUMP_INFO_DEBUG + printf("global size = %d %d %d \n", field_global_size[0], field_global_size[1], field_global_size[2]); + printf("global_offset = %d %d %d \n", global_offset[0], global_offset[1], global_offset[2]); + printf("global_count = %d %d %d \n", global_count[0], global_count[1], global_count[2]); + printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + fflush(stdout); +#endif + + hid_t filespace = H5Screate_simple(3, field_global_size, NULL); + hid_t memspace = H5Screate_simple(3, field_local_size, NULL); + hid_t dataspace_id; + + /* + typedef struct field { + float ex, ey, ez, div_e_err; // Electric field and div E error + float cbx, cby, cbz, div_b_err; // Magnetic field and div B error + float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density + float jfx, jfy, jfz, rhof; // Free current and charge density + material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes + material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers + } field_t;*/ + + if (field_dump_flag.ex) + DUMP_FIELD_TO_HDF5("ex", ex, H5T_NATIVE_FLOAT); + if (field_dump_flag.ey) + DUMP_FIELD_TO_HDF5("ey", ey, H5T_NATIVE_FLOAT); + if (field_dump_flag.ez) + DUMP_FIELD_TO_HDF5("ez", ez, H5T_NATIVE_FLOAT); + if (field_dump_flag.div_e_err) + DUMP_FIELD_TO_HDF5("div_e_err", div_e_err, H5T_NATIVE_FLOAT); + + if (field_dump_flag.cbx) + DUMP_FIELD_TO_HDF5("cbx", cbx, H5T_NATIVE_FLOAT); + if (field_dump_flag.cby) + DUMP_FIELD_TO_HDF5("cby", cby, H5T_NATIVE_FLOAT); + if (field_dump_flag.cbz) + DUMP_FIELD_TO_HDF5("cbz", cbz, H5T_NATIVE_FLOAT); + if 
(field_dump_flag.div_b_err) + DUMP_FIELD_TO_HDF5("div_b_err", div_b_err, H5T_NATIVE_FLOAT); + + if (field_dump_flag.tcax) + DUMP_FIELD_TO_HDF5("tcax", tcax, H5T_NATIVE_FLOAT); + if (field_dump_flag.tcay) + DUMP_FIELD_TO_HDF5("tcay", tcay, H5T_NATIVE_FLOAT); + if (field_dump_flag.tcaz) + DUMP_FIELD_TO_HDF5("tcaz", tcaz, H5T_NATIVE_FLOAT); + if (field_dump_flag.rhob) + DUMP_FIELD_TO_HDF5("rhob", rhob, H5T_NATIVE_FLOAT); + + if (field_dump_flag.jfx) + DUMP_FIELD_TO_HDF5("jfx", jfx, H5T_NATIVE_FLOAT); + if (field_dump_flag.jfy) + DUMP_FIELD_TO_HDF5("jfy", jfy, H5T_NATIVE_FLOAT); + if (field_dump_flag.jfz) + DUMP_FIELD_TO_HDF5("jfz", jfz, H5T_NATIVE_FLOAT); + if (field_dump_flag.rhof) + DUMP_FIELD_TO_HDF5("rhof", rhof, H5T_NATIVE_FLOAT); + + //H5T_NATIVE_SHORT for material_id (typedef int16_t material_id) + if (field_dump_flag.ematx) + DUMP_FIELD_TO_HDF5("ematx", ematx, H5T_NATIVE_SHORT); + if (field_dump_flag.ematy) + DUMP_FIELD_TO_HDF5("ematy", ematy, H5T_NATIVE_SHORT); + if (field_dump_flag.ematz) + DUMP_FIELD_TO_HDF5("ematz", ematz, H5T_NATIVE_SHORT); + if (field_dump_flag.nmat) + DUMP_FIELD_TO_HDF5("nmat", nmat, H5T_NATIVE_SHORT); + + if (field_dump_flag.fmatx) + DUMP_FIELD_TO_HDF5("fmatx", fmatx, H5T_NATIVE_SHORT); + if (field_dump_flag.fmaty) + DUMP_FIELD_TO_HDF5("fmaty", fmaty, H5T_NATIVE_SHORT); + if (field_dump_flag.fmatz) + DUMP_FIELD_TO_HDF5("fmatz", fmatz, H5T_NATIVE_SHORT); + if (field_dump_flag.cmat) + DUMP_FIELD_TO_HDF5("cmat", cmat, H5T_NATIVE_SHORT); + + el2 = uptime() - el2; + //sim_log("TimeHDF5Write: " << el2 << " s"); + + double el3 = uptime(); + + //Write metadata (geo original and geo dx/dy/dz) for ArrayUDF + float attr_data[2][3]; + attr_data[0][0] = grid->x0; + attr_data[0][1] = grid->y0; + attr_data[0][2] = grid->z0; + attr_data[1][0] = grid->dx; + attr_data[1][1] = grid->dy; + attr_data[1][2] = grid->dz; + hsize_t dims[2]; + dims[0] = 2; + dims[1] = 3; + hid_t va_geo_dataspace_id = H5Screate_simple(2, dims, NULL); + hid_t va_geo_attribute_id 
= H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); + H5Awrite(va_geo_attribute_id, H5T_NATIVE_FLOAT, attr_data); + H5Sclose(va_geo_dataspace_id); + H5Aclose(va_geo_attribute_id); + + free(temp_buf); + H5Sclose(filespace); + H5Sclose(memspace); + H5Pclose(plist_id); + H5Gclose(group_id); + H5Fclose(file_id); + + el3 = uptime() - el3; + //sim_log("TimeHDF5Close: " << el3 << " s"); + + if (mpi_rank == 0) + { + char const *output_xml_file = "./field_hdf5/hdf5_field.xdmf"; + char dimensions_3d[128]; + sprintf(dimensions_3d, "%lld %lld %lld", field_global_size[0], field_global_size[1], field_global_size[2]); + char dimensions_4d[128]; + sprintf(dimensions_4d, "%lld %lld %lld %d", field_global_size[0], field_global_size[1], field_global_size[2], 3); + char orignal[128]; + sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); + char dxdydz[128]; + sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); + + int nframes = num_step / field_interval + 1; + static int field_tframe = 0; + +#ifdef DUMP_INFO_DEBUG + printf(" meta file : %s \n", output_xml_file); + printf(" array dims per var: %s \n", dimensions_3d); + printf("array dims all vars: %s \n", dimensions_4d); + printf(" orignal: %s \n", orignal); + printf(" dxdydz: %s \n", dxdydz); + printf(" nframes: %d \n", nframes); + printf(" field_interval: %d \n", field_interval); + printf(" current step: %lld \n", step_for_viou); + printf(" current step: %lld \n", step_for_viou); + + //printf(" Simulation time: %f \n", grid->t0); + printf(" tframe: %d \n", field_tframe); +#endif + + // TODO: this footer dumping is more likely better done in a + // destructor, rather than hoping a multiple division works out + if (field_tframe >= 1) + { + if (field_tframe == (nframes - 1)) + { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 1); + } + else + { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, 
dimensions_4d, dimensions_3d, 0); + } + } + else + { + create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, field_interval); + if (field_tframe == (nframes - 1)) + { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 1); + } + else + { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 0); + } + } + field_tframe++; + } + } + void dump_particles( + const char *fbase, + species_t* sp, + grid_t* grid, + int step, + interpolator_array_t* interpolator_array, + int ftag + ) + { + size_t step_for_viou = step; + char fname[256]; + char group_name[256]; + char particle_scratch[128]; + char subparticle_scratch[128]; + + int np_local; + + float *Pf; + int *Pi; + + // get the total number of particles. in this example, output only electrons + //sp = species_list; + sprintf(particle_scratch, DUMP_DIR_FORMAT, "particle_hdf5"); + FileUtils::makeDirector(particle_scratch); + sprintf(subparticle_scratch, "%s/T.%ld/", particle_scratch, step_for_viou); + FileUtils::makeDirector(subparticle_scratch); + + // TODO: Allow the user to set this + int stride_particle_dump = 1; + + np_local = (sp->np + stride_particle_dump - 1) / stride_particle_dump; + + // make a copy of the part of particle data to be dumped + double ec1 = uptime(); + + int sp_np = sp->np; + int sp_max_np = sp->max_np; + particle_t *ALIGNED(128) p_buf = NULL; + if (!p_buf) + MALLOC_ALIGNED(p_buf, np_local, 128); + particle_t *sp_p = sp->p; + sp->p = p_buf; + sp->np = np_local; + sp->max_np = np_local; + + for (long long iptl = 0, i = 0; iptl < sp_np; iptl += stride_particle_dump, ++i) + { + COPY(&sp->p[i], &sp_p[iptl], 1); + } + + center_p(sp, interpolator_array); + + ec1 = uptime() - ec1; + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + + //std::cout << "on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local << std::endl; + 
//sim_log("on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local); + + Pf = (float *)sp->p; + Pi = (int *)sp->p; + + // open HDF5 file in "particle/T./" subdirectory + // filename: eparticle.h5p + sprintf(fname, "%s/%s_%ld.h5", subparticle_scratch, sp->name, step_for_viou); + sprintf(group_name, "/Timestep_%ld", step_for_viou); + double el1 = uptime(); + + hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); + H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); + hid_t file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); + hid_t group_id = H5Gcreate(file_id, group_name, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + + H5Pclose(plist_id); + + long long total_particles, offset; + long long numparticles = np_local; + MPI_Allreduce(&numparticles, &total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + MPI_Scan(&numparticles, &offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + offset -= numparticles; + + hid_t filespace = H5Screate_simple(1, (hsize_t *)&total_particles, NULL); + + hsize_t memspace_count_temp = numparticles * 8; + hid_t memspace = H5Screate_simple(1, &memspace_count_temp, NULL); + + // Don't need, can just use H5S_ALL + //hsize_t linearspace_count_temp = numparticles; + //hid_t linearspace = H5Screate_simple(1, &linearspace_count_temp, NULL); + + plist_id = H5Pcreate(H5P_DATASET_XFER); + + //Comment out for test only + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); + H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *)&offset, NULL, (hsize_t *)&numparticles, NULL); + + hsize_t memspace_start = 0, memspace_stride = 8, memspace_count = np_local; + H5Sselect_hyperslab(memspace, H5S_SELECT_SET, &memspace_start, &memspace_stride, &memspace_count, NULL); + + el1 = uptime() - el1; + //sim_log("Particle TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts + + //double el2 = uptime(); + + // This point offset is silly, and loses the type safety (pf+1) + hid_t 
dset_id = H5Dcreate(group_id, "dX", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + int ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf); + H5Dclose(dset_id); + + dset_id = H5Dcreate(group_id, "dY", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 1); + H5Dclose(dset_id); + + dset_id = H5Dcreate(group_id, "dZ", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 2); + H5Dclose(dset_id); + +#define OUTPUT_CONVERT_GLOBAL_ID 1 +#ifdef OUTPUT_CONVERT_GLOBAL_ID + // TODO: make a function out of this too, its used in openpmd + std::vector global_pi; + global_pi.reserve(numparticles); + // TODO: this could be parallel + for (int i = 0; i < numparticles; i++) + { + int local_i = sp->p[i].i; + + int ix, iy, iz, rx, ry, rz; + + // Convert rank to local x/y/z + UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + + // Calculate local ix/iy/iz + UNVOXEL(local_i, ix, iy, iz, grid->nx+2, grid->ny+2, grid->nz+2); + + // Convert ix/iy/iz to global + int gix = ix + (grid->nx * (rx)); + int giy = iy + (grid->ny * (ry)); + int giz = iz + (grid->nz * (rz)); + + // calculate global grid sizes + int gnx = grid->nx * grid->gpx; + int gny = grid->ny * grid->gpy; + int gnz = grid->nz * grid->gpz; + + // TODO: find a better way to account for the hard coded ghosts in VOXEL + int global_i = VOXEL(gix, giy, giz, gnx-2, gny-2, gnz-2); + + //std::cout << rank << " local i " << local_i << " becomes " << global_i << std::endl; + global_pi[i] = global_i; + } + +#undef UNVOXEL + dset_id = H5Dcreate(group_id, "i", H5T_NATIVE_INT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, H5S_ALL, filespace, plist_id, global_pi.data()); + H5Dclose(dset_id); + +#else + dset_id = H5Dcreate(group_id, "i", 
H5T_NATIVE_INT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, memspace, filespace, plist_id, Pi + 3); + H5Dclose(dset_id); +#endif + + dset_id = H5Dcreate(group_id, "Ux", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 4); + H5Dclose(dset_id); + + dset_id = H5Dcreate(group_id, "Uy", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 5); + H5Dclose(dset_id); + + dset_id = H5Dcreate(group_id, "Uz", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 6); + H5Dclose(dset_id); + + dset_id = H5Dcreate(group_id, "q", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 7); + H5Dclose(dset_id); + + //el2 = uptime() - el2; + //sim_log("Particle TimeHDF5Write: " << el2 << " s"); + + double el3 = uptime(); + H5Sclose(memspace); + H5Sclose(filespace); + H5Pclose(plist_id); + H5Gclose(group_id); + H5Fclose(file_id); + el3 = uptime() - el3; + //sim_log("Particle TimeHDF5Close: " << el3 << " s"); + + sp->p = sp_p; + sp->np = sp_np; + sp->max_np = sp_max_np; + FREE_ALIGNED(p_buf); + + // Write metadata if step() == 0 + char meta_fname[256]; + + sprintf(meta_fname, "%s/grid_metadata_%s_%ld.h5", subparticle_scratch, sp->name, step_for_viou); + + double meta_el1 = uptime(); + + hid_t meta_plist_id = H5Pcreate(H5P_FILE_ACCESS); + H5Pset_fapl_mpio(meta_plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); + hid_t meta_file_id = H5Fcreate(meta_fname, H5F_ACC_TRUNC, H5P_DEFAULT, meta_plist_id); + hid_t meta_group_id = H5Gcreate(meta_file_id, group_name, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + H5Pclose(meta_plist_id); + + long long meta_total_particles, 
meta_offset; + long long meta_numparticles = 1; + MPI_Allreduce(&meta_numparticles, &meta_total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + MPI_Scan(&meta_numparticles, &meta_offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + meta_offset -= meta_numparticles; + + hid_t meta_filespace = H5Screate_simple(1, (hsize_t *)&meta_total_particles, NULL); + hid_t meta_memspace = H5Screate_simple(1, (hsize_t *)&meta_numparticles, NULL); + meta_plist_id = H5Pcreate(H5P_DATASET_XFER); + H5Pset_dxpl_mpio(meta_plist_id, H5FD_MPIO_COLLECTIVE); + H5Sselect_hyperslab(meta_filespace, H5S_SELECT_SET, (hsize_t *)&meta_offset, NULL, (hsize_t *)&meta_numparticles, NULL); + meta_el1 = uptime() - meta_el1; + //sim_log("Metafile TimeHDF5Open): " << meta_el1 << " s"); //Easy to handle results for scripts + + double meta_el2 = uptime(); + + hid_t meta_dset_id = H5Dcreate(meta_group_id, "np_local", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, (int32_t *)&np_local); + H5Dclose(meta_dset_id); + //if (rank == 0) printf ("Written variable dX \n"); + + meta_dset_id = H5Dcreate(meta_group_id, "nx", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, &grid->nx); + H5Dclose(meta_dset_id); + //if (rank == 0) printf ("Written variable dY \n"); + + meta_dset_id = H5Dcreate(meta_group_id, "ny", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, &grid->ny); + H5Dclose(meta_dset_id); + //if (rank == 0) printf ("Written variable dZ \n"); + + meta_dset_id = H5Dcreate(meta_group_id, "nz", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, 
&grid->nz); + H5Dclose(meta_dset_id); + //if (rank == 0) printf ("Written variable i \n"); + + meta_dset_id = H5Dcreate(meta_group_id, "x0", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->x0); + H5Dclose(meta_dset_id); + + meta_dset_id = H5Dcreate(meta_group_id, "y0", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->y0); + H5Dclose(meta_dset_id); + + meta_dset_id = H5Dcreate(meta_group_id, "z0", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->z0); + H5Dclose(meta_dset_id); + + meta_dset_id = H5Dcreate(meta_group_id, "dx", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->dx); + H5Dclose(meta_dset_id); + + meta_dset_id = H5Dcreate(meta_group_id, "dy", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->dy); + H5Dclose(meta_dset_id); + + meta_dset_id = H5Dcreate(meta_group_id, "dz", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->dz); + H5Dclose(meta_dset_id); + + meta_el2 = uptime() - meta_el2; + //sim_log("Metafile TimeHDF5Write: " << meta_el2 << " s"); + double meta_el3 = uptime(); + H5Sclose(meta_memspace); + H5Sclose(meta_filespace); + H5Pclose(meta_plist_id); + H5Gclose(meta_group_id); + H5Fclose(meta_file_id); + meta_el3 = uptime() - meta_el3; + //sim_log("Metafile TimeHDF5Close: " << meta_el3 << " s"); + + 
} + + void dump_hydro( + const char *fbase, + int step, + hydro_array_t* hydro_array, + species_t* sp, + interpolator_array_t* interpolator_array, + grid_t* grid, + int ftag + ) + { + size_t step_for_viou = step; + +#define DUMP_HYDRO_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ + { \ + dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ + temp_buf_index = 0; \ + for (size_t i(1); i < grid->nx + 1; i++) \ + { \ + for (size_t j(1); j < grid->ny + 1; j++) \ + { \ + for (size_t k(1); k < grid->nz + 1; k++) \ + { \ + temp_buf[temp_buf_index] = hydro(i, j, k).ATTRIBUTE_NAME; \ + temp_buf_index = temp_buf_index + 1; \ + } \ + } \ + } \ + dataspace_id = H5Dget_space(dset_id); \ + H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); \ + H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, temp_buf); \ + H5Sclose(dataspace_id); \ + H5Dclose(dset_id); \ + } + //#define DUMP_INFO_DEBUG 1 + int mpi_size, mpi_rank; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + + species_t *sp = find_species_name(speciesname, species_list); + if (!sp) + ERROR(("Invalid species name: %s", speciesname)); + + clear_hydro_array(hydro_array); + accumulate_hydro_p(hydro_array, sp, interpolator_array); + synchronize_hydro_array(hydro_array); + + char hname[256]; + char hydro_scratch[128]; + char subhydro_scratch[128]; + + sprintf(hydro_scratch, "./%s", "hydro_hdf5"); + FileUtils::makeDirector(hydro_scratch); + sprintf(subhydro_scratch, "%s/T.%zu/", hydro_scratch, step_for_viou); + FileUtils::makeDirector(subhydro_scratch); + + sprintf(hname, "%s/hydro_%s_%zu.h5", subhydro_scratch, speciesname, step_for_viou); + double el1 = uptime(); + hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); + H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); + hid_t file_id = H5Fcreate(hname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); + H5Pclose(plist_id); + + 
sprintf(hname, "Timestep_%zu", step_for_viou); + hid_t group_id = H5Gcreate(file_id, hname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + + el1 = uptime() - el1; + //sim_log("TimeHDF5Open: " << el1 << " s"); //Easy to handle results for scripts + //double el2 = uptime(); + + // Create a variable list of field values to output. + //size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); + //size_t *varlist = new size_t[numvars]; + + //for (size_t i(0), c(0); i < total_field_variables; i++) + // if (global->fdParams.output_vars.bitset(i)) + // varlist[c++] = i; + + //printf("\nBEGIN_OUTPUT: numvars = %zu \n", numvars); + + + //typedef struct hydro { + // float jx, jy, jz, rho; // Current and charge density => , + // float px, py, pz, ke; // Momentum and K.E. density => , + // float txx, tyy, tzz; // Stress diagonal => , i==j + // float tyz, tzx, txy; // Stress off-diagonal => , i!=j + // float _pad[2]; // 16-byte align + //} hydro_t; + + //typedef struct hydro_array { + // hydro_t * ALIGNED(128) h; + // grid_t * g; + //} hydro_array_t; + + float *temp_buf = (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); + hsize_t temp_buf_index; + hid_t dset_id; + //char *field_var_name[] = {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; + plist_id = H5Pcreate(H5P_DATASET_XFER); + //Comment out for test only + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); + //H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, (hsize_t *) &numparticles, NULL); + + //global->topology_x + + hsize_t hydro_global_size[3], hydro_local_size[3], global_offset[3], global_count[3]; + hydro_global_size[0] = (grid->nx * grid->gpx); + hydro_global_size[1] = (grid->ny * grid->gpy); + hydro_global_size[2] = (grid->nz * grid->gpz); + + hydro_local_size[0] = grid->nx; + hydro_local_size[1] = grid->ny; + hydro_local_size[2] = grid->nz; + + int mpi_rank_x, mpi_rank_y, 
mpi_rank_z; + RANK_TO_INDEX2(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + + global_offset[0] = (grid->nx) * mpi_rank_x; + global_offset[1] = (grid->ny) * mpi_rank_y; + global_offset[2] = (grid->nz) * mpi_rank_z; + + global_count[0] = (grid->nx); + global_count[1] = (grid->ny); + global_count[2] = (grid->nz); + +#ifdef DUMP_INFO_DEBUG + printf("global size = %d %d %d \n", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); + printf("global_offset = %d %d %d \n", global_offset[0], global_offset[1], global_offset[2]); + printf("global_count = %d %d %d \n", global_count[0], global_count[1], global_count[2]); + printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + fflush(stdout); +#endif + + hid_t filespace = H5Screate_simple(3, hydro_global_size, NULL); + hid_t memspace = H5Screate_simple(3, hydro_local_size, NULL); + hid_t dataspace_id; + + //typedef struct hydro { + // float jx, jy, jz, rho; // Current and charge density => , + // float px, py, pz, ke; // Momentum and K.E. 
density => , + // float txx, tyy, tzz; // Stress diagonal => , i==j + // float tyz, tzx, txy; // Stress off-diagonal => , i!=j + // float _pad[2]; // 16-byte align + //} hydro_t; + + if (hydro_dump_flag.jx) + DUMP_HYDRO_TO_HDF5("jx", jx, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.jy) + DUMP_HYDRO_TO_HDF5("jy", jy, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.jz) + DUMP_HYDRO_TO_HDF5("jz", jz, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.rho) + DUMP_HYDRO_TO_HDF5("rho", rho, H5T_NATIVE_FLOAT); + + if (hydro_dump_flag.px) + DUMP_HYDRO_TO_HDF5("px", px, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.py) + DUMP_HYDRO_TO_HDF5("py", py, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.pz) + DUMP_HYDRO_TO_HDF5("pz", pz, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.ke) + DUMP_HYDRO_TO_HDF5("ke", ke, H5T_NATIVE_FLOAT); + + if (hydro_dump_flag.txx) + DUMP_HYDRO_TO_HDF5("txx", txx, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.tyy) + DUMP_HYDRO_TO_HDF5("tyy", tyy, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.tzz) + DUMP_HYDRO_TO_HDF5("tzz", tzz, H5T_NATIVE_FLOAT); + + if (hydro_dump_flag.tyz) + DUMP_HYDRO_TO_HDF5("tyz", tyz, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.tzx) + DUMP_HYDRO_TO_HDF5("tzx", tzx, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.txy) + DUMP_HYDRO_TO_HDF5("txy", txy, H5T_NATIVE_FLOAT); + + //el2 = uptime() - el2; + //sim_log("TimeHDF5Write: " << el2 << " s"); + + double el3 = uptime(); + + //Write metadata (geo original and geo dx/dy/dz) for ArrayUDF + float attr_data[2][3]; + attr_data[0][0] = grid->x0; + attr_data[0][1] = grid->y0; + attr_data[0][2] = grid->z0; + attr_data[1][0] = grid->dx; + attr_data[1][1] = grid->dy; + attr_data[1][2] = grid->dz; + hsize_t dims[2]; + dims[0] = 2; + dims[1] = 3; + hid_t va_geo_dataspace_id = H5Screate_simple(2, dims, NULL); + hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); + H5Awrite(va_geo_attribute_id, H5T_NATIVE_FLOAT, attr_data); + H5Sclose(va_geo_dataspace_id); + 
H5Aclose(va_geo_attribute_id); + + free(temp_buf); + H5Sclose(filespace); + H5Sclose(memspace); + H5Pclose(plist_id); + H5Gclose(group_id); + H5Fclose(file_id); + + el3 = uptime() - el3; + //sim_log("TimeHDF5Close: " << el3 << " s"); + + if (mpi_rank == 0) + { + char output_xml_file[128]; + sprintf(output_xml_file, "./%s/%s%s%s", "hydro_hdf5", "hydro-", speciesname, ".xdmf"); + char dimensions_3d[128]; + sprintf(dimensions_3d, "%lld %lld %lld", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); + char dimensions_4d[128]; + sprintf(dimensions_4d, "%lld %lld %lld %d", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2], 3); + char orignal[128]; + sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); + char dxdydz[128]; + sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); + + int nframes = num_step / hydro_interval + 1; + + const int tframe = tframe_map[sp->id]; + +#ifdef DUMP_INFO_DEBUG + printf(" meta file : %s \n", output_xml_file); + printf(" array dims per var: %s \n", dimensions_3d); + printf("array dims all vars: %s \n", dimensions_4d); + printf(" orignal: %s \n", orignal); + printf(" dxdydz: %s \n", dxdydz); + printf(" nframes: %d \n", nframes); + printf(" hydro_fields_interval: %d \n", hydro_interval); + printf(" current step: %lld \n", step_for_viou); + printf(" Simulation time: %f \n", grid->t0); + printf(" tframe: %d \n", tframe); +#endif + + char speciesname_new[128]; + sprintf(speciesname_new, "hydro_%s", speciesname); + if (tframe >= 1) + { + if (tframe == (nframes - 1)) + { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 1); + } + else + { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); + } + } + else + { + create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, hydro_interval); + if (tframe == (nframes - 1)) + { + invert_hydro_xml_item(output_xml_file, speciesname_new, 
step_for_viou, dimensions_4d, dimensions_3d, 1); + } + else + { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); + } + } + tframe_map[sp->id]++; + } + } +}; +#endif + +#ifdef VPIC_ENABLE_OPENPMD +class OpenPMDDump : public Dump_Strategy { + public: + static openPMD::Series* series; + using Dump_Strategy::Dump_Strategy; // inherit constructor + void dump_fields( + const char *fbase, + int step, + grid_t* grid, + field_array_t* field_array, + int ftag + ) + { + std::cout << "Writing openPMD data" << std::endl; + + if (series == nullptr) { + std::cout << "init series" << std::endl; + series = new openPMD::Series( + fbase, + openPMD::AccessType::CREATE, + MPI_COMM_WORLD + ); + } + + std::cout << "Writing itration " << step << std::endl; + auto i = series->iterations[ step ]; + // TODO: it would be nice to set these... + //series.setAuthor( "Axel Huebl "); + //series.setMachine( "Hall Probe 5000, Model 3"); + i.setAttribute( "vacuum", true); + + auto cB = i.meshes["B"]; + auto E = i.meshes["E"]; + auto J = i.meshes["J"]; + + // record components + auto cbx = cB["x"]; + auto cby = cB["y"]; + auto cbz = cB["z"]; + + auto Ex = E["x"]; + auto Ey = E["y"]; + auto Ez = E["z"]; + + auto Jx = J["x"]; + auto Jy = J["y"]; + auto Jz = J["z"]; + + // TODO: set unitDimension so the anaylsis software knows what fields + // things are + + size_t gnx = (grid->nx * grid->gpx); + size_t gny = (grid->ny * grid->gpy); + size_t gnz = (grid->nz * grid->gpz); + openPMD::Extent global_extent = {gny, gny, gnz}; + + openPMD::Datatype datatype = openPMD::determineDatatype(); + openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); + + cbx.resetDataset(dataset); + cby.resetDataset(dataset); + cbz.resetDataset(dataset); + + Ex.resetDataset(dataset); + Ey.resetDataset(dataset); + Ez.resetDataset(dataset); + + Jx.resetDataset(dataset); + Jy.resetDataset(dataset); + Jz.resetDataset(dataset); + + // Convert rank to local x/y/z + 
int rx, ry, rz; + UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + + size_t nx = grid->nx; + size_t ny = grid->ny; + size_t nz = grid->nz; + + // NOTE: this assumes a static mesh decomposition in nx/ny/nz + size_t global_offset_x = (nx) * rx; + size_t global_offset_y = (ny) * ry; + size_t global_offset_z = (nz) * rz; + + openPMD::Offset chunk_offset = {global_offset_x, global_offset_y, global_offset_z}; + openPMD::Extent chunk_extent = {nx, ny, nz}; + + // Store a local copy of the data which we pull out of the AoS + std::vector cbx_data; + std::vector cby_data; + std::vector cbz_data; + + std::vector ex_data; + std::vector ey_data; + std::vector ez_data; + + std::vector jx_data; + std::vector jy_data; + std::vector jz_data; + + size_t nv = nx * ny * nz; + + cbx_data.reserve(nv); + cby_data.reserve(nv); + cbz_data.reserve(nv); + + ex_data.reserve(nv); + ey_data.reserve(nv); + ez_data.reserve(nv); + + jx_data.reserve(nv); + jy_data.reserve(nv); + jz_data.reserve(nv); + + // TODO: make this AoS to SoA conversion a function + + // We could do 1D here, but we don't really care about the ghosts, and we + // can thread over nz/ny (collapsed?) 
+ // Go over non-ghosts and grab just that data into a dense array + for (size_t k = 1; k < grid->nz + 1; k++) + { + for (size_t j = 1; j < grid->ny + 1; j++) + { + for (size_t i = 1; i < grid->nx + 1; i++) + { + int local_index = VOXEL(i-1, j-1, k-1, grid->nx-2, grid->ny-2, grid->nz-2); + int global_index = VOXEL(i, j, k, grid->nx, grid->ny, grid->nz); + + cbx_data[local_index] = field_array->f[global_index].cbx; + cby_data[local_index] = field_array->f[global_index].cby; + cbz_data[local_index] = field_array->f[global_index].cbz; + + ex_data[local_index] = field_array->f[global_index].ex; + ey_data[local_index] = field_array->f[global_index].ey; + ez_data[local_index] = field_array->f[global_index].ez; + + jx_data[local_index] = field_array->f[global_index].jfx; + jy_data[local_index] = field_array->f[global_index].jfy; + jz_data[local_index] = field_array->f[global_index].jfz; + } + } + } + + cbx.storeChunk( cbx_data, chunk_offset, chunk_extent); + cby.storeChunk( cby_data, chunk_offset, chunk_extent); + cbz.storeChunk( cbz_data, chunk_offset, chunk_extent); + + Ex.storeChunk( ex_data, chunk_offset, chunk_extent); + Ey.storeChunk( ey_data, chunk_offset, chunk_extent); + Ez.storeChunk( ez_data, chunk_offset, chunk_extent); + + Jx.storeChunk( jx_data, chunk_offset, chunk_extent); + Jy.storeChunk( jy_data, chunk_offset, chunk_extent); + Jz.storeChunk( jz_data, chunk_offset, chunk_extent); + + series->flush(); + } + void dump_particles( + const char *fbase, + species_t* sp, + grid_t* grid, + int step, + interpolator_array_t* interpolator_array, + int ftag + ) + { + if (series == nullptr) { + std::cout << "init series" << std::endl; + series = new openPMD::Series( + fbase, + openPMD::AccessType::CREATE, + MPI_COMM_WORLD + ); + } + + auto i = series->iterations[ step ]; + + // TODO: set these + i.setTime( (float)step ); + i.setDt(1.0); + i.setTimeUnitSI(1.0); + + auto& p = i.particles[sp->name]; + + const int np = sp->np; + + // TODO: this could be a function call as 
it's used elsewhere (in hdf5) + unsigned long long total_particles, offset; + unsigned long long numparticles = np; + MPI_Allreduce(&numparticles, &total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + MPI_Scan(&numparticles, &offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + offset -= numparticles; + + openPMD::Extent global_extent = {total_particles}; + openPMD::Datatype datatype = openPMD::determineDatatype(); + openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); + + auto px = p["position"]["x"]; + auto pxo = p["positionOffset"]["x"]; + + px.resetDataset(dataset); + pxo.resetDataset(dataset); + + // convert data to SoA, allowing the user to chunk the operation + const int max_chunk = 32768*8; // 1MB SoA + // Loop over all particles in chunks + for (int i = 0; i < np; i += max_chunk) + { + // We have to be careful as the last chunk may not be full + // Find how many are left and do that many + size_t to_write = std::min(np-i, max_chunk); + + // Convert the chunk ready to write + std::vector x_pos; + std::vector x_off; + x_pos.reserve(to_write); + x_off.reserve(to_write); + + for (int j = 0; j < to_write; j++) + { + // TODO: do I need to center the particles? 
+ auto& particle = sp->p[i+j]; + x_pos[j] = particle.dx; + std::array gi = global_particle_index(particle.i, grid, rank); + x_off[j] = (float)gi[1]; + } + + // Base offset plus i to account for chunks + auto o = openPMD::Offset{offset + i}; + auto e = openPMD::Extent{to_write}; + px.storeChunk(x_pos, o, e); + pxo.storeChunk(x_off, o, e); + } + + + } + void dump_hydro( + const char *fbase, + int step, + hydro_array_t* hydro_array, + species_t* sp, + interpolator_array_t* interpolator_array, + grid_t* grid, + int ftag + ) + { + } +}; +#endif + +/* + template + struct IODump : private Policy { + using Policy::dump_particles; + using Policy::dump_fields; + using Policy::dump_hydro; + }; + */ + +#endif diff --git a/src/vpic/dumpmacros.h b/src/vpic/dumpmacros.h index 9e46bf6b..bbb1b743 100644 --- a/src/vpic/dumpmacros.h +++ b/src/vpic/dumpmacros.h @@ -4,7 +4,7 @@ /* FIXME: WHEN THESE MACROS WERE HOISTED AND VARIOUS HACKS DONE TO THEM THEY BECAME _VERY_ _DANGEROUS. */ -#define WRITE_HEADER_V0(dump_type,sp_id,q_m,fileIO) do { \ +#define WRITE_HEADER_V0(dump_type,sp_id,q_m,fileIO,step,rank,nproc) do { \ /* Binary compatibility information */ \ WRITE( char, CHAR_BIT, fileIO ); \ WRITE( char, sizeof(short int), fileIO ); \ @@ -19,7 +19,7 @@ WRITE( int, 0 /* Version */, fileIO ); \ WRITE( int, dump_type, fileIO ); \ /* High level information */ \ - WRITE( int, step(), fileIO ); \ + WRITE( int, step, fileIO ); \ WRITE( int, nxout, fileIO ); \ WRITE( int, nyout, fileIO ); \ WRITE( int, nzout, fileIO ); \ @@ -33,21 +33,21 @@ WRITE( float, grid->cvac, fileIO ); \ WRITE( float, grid->eps0, fileIO ); \ WRITE( float, 0 /* damp */, fileIO ); \ - WRITE( int, rank(), fileIO ); \ - WRITE( int, nproc(), fileIO ); \ + WRITE( int, rank, fileIO ); \ + WRITE( int, nproc, fileIO ); \ /* Species parameters */ \ WRITE( int, sp_id, fileIO ); \ WRITE( float, q_m, fileIO ); \ } while(0) - + // Note dim _MUST_ be a pointer to an int - + #define WRITE_ARRAY_HEADER(p,ndim,dim,fileIO) do { \ WRITE( 
int, sizeof(p[0]), fileIO ); \ WRITE( int, ndim, fileIO ); \ fileIO.write( dim, ndim ); \ } while(0) - + // The WRITE macro copies the output "value" into a temporary variable // of the requested output "type" so that the write to the "file" // occurs from a known binary data type. For example, if grid.dx were @@ -60,12 +60,12 @@ // single precision write copies. However, specialty types could be // created so that the type cast __WRITE_tmp = (type)(value) // automatically does the underlying conversion in C++ - + #define WRITE(type,value,fileIO) do { \ type __WRITE_tmp = (type)(value); \ fileIO.write( &__WRITE_tmp, 1 ); \ } while(0) - + // Note: strlen does not include the terminating \0 #define WRITE_STRING(string,fileIO) do { \ int __WRITE_STRING_len = 0; \ @@ -74,103 +74,102 @@ if( __WRITE_STRING_len>0 ) \ fileIO.write( string, __WRITE_STRING_len ); \ } while(0) - + #define READ(type,value,fileIO) do { \ type __READ_tmp; \ fileIO.read(&__READ_tmp, 1 ); \ (value) = __READ_tmp; \ } while(0) -#define F_WRITE_HEADER_V0(dump_type,sp_id,q_m,fileIO) do { \ - /* Binary compatibility information */ \ - F_WRITE( char, CHAR_BIT, fileIO ); \ - F_WRITE( char, sizeof(short int), fileIO ); \ - F_WRITE( char, sizeof(int), fileIO ); \ - F_WRITE( char, sizeof(float), fileIO ); \ - F_WRITE( char, sizeof(double), fileIO ); \ - F_WRITE( short int, 0xcafe, fileIO ); \ - F_WRITE( int, 0xdeadbeef, fileIO ); \ - F_WRITE( float, 1.0, fileIO ); \ - F_WRITE( double, 1.0, fileIO ); \ - /* Dump type and header format version */ \ - F_WRITE( int, 0 /* Version */, fileIO ); \ - F_WRITE( int, dump_type, fileIO ); \ - /* High level information */ \ - F_WRITE( int, step(), fileIO ); \ - F_WRITE( int, imxstr-2, fileIO ); \ - F_WRITE( int, jmxstr-2, fileIO ); \ - F_WRITE( int, kmxstr-2, fileIO ); \ - F_WRITE( float, grid->dt, fileIO ); \ - F_WRITE( float, dxstr, fileIO ); \ - F_WRITE( float, dystr, fileIO ); \ - F_WRITE( float, dzstr, fileIO ); \ - F_WRITE( float, grid->x0, fileIO ); \ - F_WRITE( 
float, grid->y0, fileIO ); \ - F_WRITE( float, grid->z0, fileIO ); \ - F_WRITE( float, grid->cvac, fileIO ); \ - F_WRITE( float, grid->eps0, fileIO ); \ - F_WRITE( float, 0 /*damp*/, fileIO ); \ - F_WRITE( int, rank(), fileIO ); \ - F_WRITE( int, nproc(), fileIO ); \ - /* Species parameters */ \ - F_WRITE( int, sp_id, fileIO ); \ - F_WRITE( float, q_m, fileIO ); \ - } while(0) - -#define F_WRITE_HEADER_PAR(dump_type,sp_id,q_m,fileIO) do { \ - /* Binary compatibility information */ \ - F_WRITE( char, CHAR_BIT, fileIO ); \ - F_WRITE( char, sizeof(short int), fileIO ); \ - F_WRITE( char, sizeof(int), fileIO ); \ - F_WRITE( char, sizeof(float), fileIO ); \ - F_WRITE( char, sizeof(double), fileIO ); \ - F_WRITE( short int, 0xcafe, fileIO ); \ - F_WRITE( int, 0xdeadbeef, fileIO ); \ - F_WRITE( float, 1.0, fileIO ); \ - F_WRITE( double, 1.0, fileIO ); \ - /* Dump type and header format version */ \ - F_WRITE( int, 0 /* Version */, fileIO ); \ - F_WRITE( int, dump_type, fileIO ); \ - /* High level information */ \ - F_WRITE( int, step(), fileIO ); \ - F_WRITE( int, grid->nx, fileIO ); \ - F_WRITE( int, grid->ny, fileIO ); \ - F_WRITE( int, grid->nz, fileIO ); \ - F_WRITE( float, grid->dt, fileIO ); \ - F_WRITE( float, grid->dx, fileIO ); \ - F_WRITE( float, grid->dy, fileIO ); \ - F_WRITE( float, grid->dz, fileIO ); \ - F_WRITE( float, grid->x0, fileIO ); \ - F_WRITE( float, grid->y0, fileIO ); \ - F_WRITE( float, grid->z0, fileIO ); \ - F_WRITE( float, grid->cvac, fileIO ); \ - F_WRITE( float, grid->eps0, fileIO ); \ - F_WRITE( float, 0 /*damp*/, fileIO ); \ - F_WRITE( int, rank(), fileIO ); \ - F_WRITE( int, nproc(), fileIO ); \ - /* Species parameters */ \ - F_WRITE( int, sp_id, fileIO ); \ - F_WRITE( float, q_m, fileIO ); \ - } while(0) - +//#define F_WRITE_HEADER_V0(dump_type,sp_id,q_m,fileIO) do { \ + ///* Binary compatibility information */ \ + //F_WRITE( char, CHAR_BIT, fileIO ); \ + //F_WRITE( char, sizeof(short int), fileIO ); \ + //F_WRITE( char, sizeof(int), 
fileIO ); \ + //F_WRITE( char, sizeof(float), fileIO ); \ + //F_WRITE( char, sizeof(double), fileIO ); \ + //F_WRITE( short int, 0xcafe, fileIO ); \ + //F_WRITE( int, 0xdeadbeef, fileIO ); \ + //F_WRITE( float, 1.0, fileIO ); \ + //F_WRITE( double, 1.0, fileIO ); \ + ///* Dump type and header format version */ \ + //F_WRITE( int, 0 /* Version */, fileIO ); \ + //F_WRITE( int, dump_type, fileIO ); \ + ///* High level information */ \ + //F_WRITE( int, step(), fileIO ); \ + //F_WRITE( int, imxstr-2, fileIO ); \ + //F_WRITE( int, jmxstr-2, fileIO ); \ + //F_WRITE( int, kmxstr-2, fileIO ); \ + //F_WRITE( float, grid->dt, fileIO ); \ + //F_WRITE( float, dxstr, fileIO ); \ + //F_WRITE( float, dystr, fileIO ); \ + //F_WRITE( float, dzstr, fileIO ); \ + //F_WRITE( float, grid->x0, fileIO ); \ + //F_WRITE( float, grid->y0, fileIO ); \ + //F_WRITE( float, grid->z0, fileIO ); \ + //F_WRITE( float, grid->cvac, fileIO ); \ + //F_WRITE( float, grid->eps0, fileIO ); \ + //F_WRITE( float, 0 /*damp*/, fileIO ); \ + //F_WRITE( int, rank(), fileIO ); \ + //F_WRITE( int, nproc(), fileIO ); \ + ///* Species parameters */ \ + //F_WRITE( int, sp_id, fileIO ); \ + //F_WRITE( float, q_m, fileIO ); \ + //} while(0) + +//#define F_WRITE_HEADER_PAR(dump_type,sp_id,q_m,fileIO) do { \ + ///* Binary compatibility information */ \ + //F_WRITE( char, CHAR_BIT, fileIO ); \ + //F_WRITE( char, sizeof(short int), fileIO ); \ + //F_WRITE( char, sizeof(int), fileIO ); \ + //F_WRITE( char, sizeof(float), fileIO ); \ + //F_WRITE( char, sizeof(double), fileIO ); \ + //F_WRITE( short int, 0xcafe, fileIO ); \ + //F_WRITE( int, 0xdeadbeef, fileIO ); \ + //F_WRITE( float, 1.0, fileIO ); \ + //F_WRITE( double, 1.0, fileIO ); \ + ///* Dump type and header format version */ \ + //F_WRITE( int, 0 /* Version */, fileIO ); \ + //F_WRITE( int, dump_type, fileIO ); \ + ///* High level information */ \ + //F_WRITE( int, step(), fileIO ); \ + //F_WRITE( int, grid->nx, fileIO ); \ + //F_WRITE( int, grid->ny, fileIO ); \ 
+ //F_WRITE( int, grid->nz, fileIO ); \ + //F_WRITE( float, grid->dt, fileIO ); \ + //F_WRITE( float, grid->dx, fileIO ); \ + //F_WRITE( float, grid->dy, fileIO ); \ + //F_WRITE( float, grid->dz, fileIO ); \ + //F_WRITE( float, grid->x0, fileIO ); \ + //F_WRITE( float, grid->y0, fileIO ); \ + //F_WRITE( float, grid->z0, fileIO ); \ + //F_WRITE( float, grid->cvac, fileIO ); \ + //F_WRITE( float, grid->eps0, fileIO ); \ + //F_WRITE( float, 0 /*damp*/, fileIO ); \ + //F_WRITE( int, rank(), fileIO ); \ + //F_WRITE( int, nproc(), fileIO ); \ + ///* Species parameters */ \ + //F_WRITE( int, sp_id, fileIO ); \ + //F_WRITE( float, q_m, fileIO ); \ + //} while(0) + // Note dim _MUST_ be a pointer to an int - -#define F_WRITE_ARRAY_HEADER(psiz,ndim,dim,fileIO) do { \ - F_WRITE( int, psiz, fileIO ); \ - F_WRITE( int, ndim, fileIO ); \ - fileIO.write( dim, ndim ); \ - } while(0) - -#define F_WRITE(type,value,fileIO) do { \ - type __F_WRITE_tmp = (type)(value); \ - fileIO.write( &__F_WRITE_tmp, 1 ); \ - } while(0) - -#define F_READ(type,value,fileIO) do { \ - type __F_READ_tmp; \ - fileIO.read( &__F_READ_tmp, 1 ); \ - (value) = __F_READ_tmp; \ - } while(0) +//#define F_WRITE_ARRAY_HEADER(psiz,ndim,dim,fileIO) do { \ + //F_WRITE( int, psiz, fileIO ); \ + //F_WRITE( int, ndim, fileIO ); \ + //fileIO.write( dim, ndim ); \ + //} while(0) + +//#define F_WRITE(type,value,fileIO) do { \ + //type __F_WRITE_tmp = (type)(value); \ + //fileIO.write( &__F_WRITE_tmp, 1 ); \ + //} while(0) + +//#define F_READ(type,value,fileIO) do { \ + //type __F_READ_tmp; \ + //fileIO.read( &__F_READ_tmp, 1 ); \ + //(value) = __F_READ_tmp; \ + //} while(0) #define ABORT(cond) if( cond ) ERROR(( #cond )) diff --git a/src/vpic/vpic.cc b/src/vpic/vpic.cc index bfd18767..3e3b3812 100644 --- a/src/vpic/vpic.cc +++ b/src/vpic/vpic.cc @@ -9,6 +9,7 @@ */ #include "vpic.h" +#include "dump_strategy.h" /* Note that, when a vpic_simulation is created (and thus registered with the checkpt service), it is created empty; 
none of the simulation @@ -71,8 +72,11 @@ reanimate_vpic_simulation( vpic_simulation * vpic ) { } -vpic_simulation::vpic_simulation() { - CLEAR( this, 1 ); +vpic_simulation::vpic_simulation() : dump_strategy(BinaryDump( rank(), nproc() )) +{ + // TODO: why is this a good idea? + // Is this just trying to 0 initialize everything? + // CLEAR( this, 1 ); /* Set non-zero defaults */ verbose = 1; diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index 6b657a16..73dfab29 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -24,6 +24,7 @@ #include "../util/bitfield.h" #include "../util/checksum.h" #include "../util/system.h" +#include "dump_strategy.h" #ifndef USER_GLOBAL_SIZE #define USER_GLOBAL_SIZE 16384 @@ -287,9 +288,12 @@ class vpic_simulation { int field_interval; int particle_interval; - size_t nxout, nyout, nzout; + // TODO: these can probably now be removed, as they should only be used by dump? + // TODO: check if any decks used them + //size_t nxout, nyout, nzout; + //float dxout, dyout, dzout; + size_t px, py, pz; - float dxout, dyout, dzout; int ndfld; int ndhyd; @@ -361,7 +365,7 @@ class vpic_simulation { /////////////// // Dump helpers - int dump_mkdir(const char * dname); + static int dump_mkdir(const char * dname); int dump_cwd(char * dname, size_t size); // Text dumps @@ -380,22 +384,9 @@ class vpic_simulation { void dump_particles( const char *sp_name, const char *fbase, int fname_tag = 1 ); -#ifdef VPIC_ENABLE_OPENPMD - void dump_fields_openpmd( const char *fbase, int fname_tag = 1 ); - void dump_particles_openpmd( - const char *sp_name, - const char *fbase, - int ftag = 1 - ); -#endif + Dump_Strategy dump_strategy; #ifdef VPIC_ENABLE_HDF5 - void dump_particles_hdf5( const char *sp_name, const char *fbase, - int fname_tag = 1 ); - void dump_hydro_hdf5( const char *sp_name, const char *fbase, - int fname_tag = 1 ); - void dump_fields_hdf5( const char *fbase, int fname_tag = 1 ); - // Declare vars to use hydro_dump_flag_t hydro_dump_flag; field_dump_flag_t 
field_dump_flag; From 686c143c8dfefb18fcea92c0675ebc0714fdede2 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Wed, 6 Nov 2019 09:50:09 -0700 Subject: [PATCH 71/95] first compile and linking build of dump strategy --- src/field_advance/field_advance.h | 12 +- src/species_advance/species_advance.cc | 4 +- src/species_advance/species_advance_aos.h | 2 + src/util/io/FileIO.h | 1 + src/util/util_base.h | 12 +- src/vpic/dump.cc | 22 ++- src/vpic/dump.h | 31 +--- src/vpic/dump_strategy.cc | 158 +++++++++++++++++++++ src/vpic/dump_strategy.h | 163 +++------------------- src/vpic/vpic.cc | 7 +- src/vpic/vpic.h | 4 +- 11 files changed, 227 insertions(+), 189 deletions(-) create mode 100644 src/vpic/dump_strategy.cc diff --git a/src/field_advance/field_advance.h b/src/field_advance/field_advance.h index d1cee710..0d435e8a 100644 --- a/src/field_advance/field_advance.h +++ b/src/field_advance/field_advance.h @@ -13,7 +13,7 @@ // // This module implements the following the difference equations on a // superhexahedral domain decomposed Yee-mesh: -// +// // advance_b -> Finite Differenced Faraday // cB_new = cB_old - frac c dt curl E // @@ -32,7 +32,7 @@ // rapidly reduce RMS divergence error assuming divergences errors // are due to accumulation of numerical roundoff when integrating // Faraday. See clean_div.c for details. -// +// // div_clean_e -> Modified Marder pass on electric fields // E_new = E_old + drive D dt grad err_mul div ( epsr E_old - rho/eps0 ) // Since the total rho may not be known everywhere (for example in @@ -65,7 +65,7 @@ // fmatx,fmaty,fmatz are all on the "face // mesh". rhof,rhob,div_e_err,nmat are on the "nodes mesh". // div_b_err,cmat are on the "cell mesh". -// +// // Above, for "edge mesh" quantities, interior means that the // component is not a tangential field directly on the surface of the // domain. For "face mesh" quantities, interior means that the @@ -97,7 +97,7 @@ // ... 
// material_coefficients = new_material_coefficients(grid,material_list); // fields = new_fields(grid); -// +// // ... Set the initial field values and place materials ... // // synchronize_fields(fields,grid); @@ -107,7 +107,7 @@ // initial fields or errors in the source terms or different floating // point properties on different nodes cause the shared faces to have // different fields). -// +// // To advance the fields in a PIC simulation with TCA radation damping // and periodic divergence cleaning, the following sequence is // suggested: @@ -118,7 +118,7 @@ // if( should_clean_div_e ) { // ... adjust rho_f, rho_b and/or rho_c as necessary // do { -// rms_err = clean_div_e( fields, material_coefficients, grid ); +// rms_err = clean_div_e( fields, material_coefficients, grid ); // } while( rms_err_too_high ); // } // if( should_clean_div_b ) { diff --git a/src/species_advance/species_advance.cc b/src/species_advance/species_advance.cc index 0e85a646..2ed53cbb 100644 --- a/src/species_advance/species_advance.cc +++ b/src/species_advance/species_advance.cc @@ -1,4 +1,4 @@ -/* +/* * Written by: * Kevin J. Bowers, Ph.D. * Plasma Physics Group (X-1) @@ -146,7 +146,7 @@ species( const char * name, sp->sort_out_of_place = sort_out_of_place; MALLOC_ALIGNED( sp->partition, g->nv+1, 128 ); - sp->g = g; + sp->g = g; /* id, next are set by append species */ diff --git a/src/species_advance/species_advance_aos.h b/src/species_advance/species_advance_aos.h index 3e1af9ad..47fa1a78 100644 --- a/src/species_advance/species_advance_aos.h +++ b/src/species_advance/species_advance_aos.h @@ -12,6 +12,8 @@ #ifndef _species_advance_aos_h_ #define _species_advance_aos_h_ +// TODO: should we restrict the direct include of this header? 
+ typedef int32_t species_id; // Must be 32-bit wide for particle_injector_t // FIXME: Eventually particle_t (definitely) and their other formats diff --git a/src/util/io/FileIO.h b/src/util/io/FileIO.h index 0d8ed6da..74221451 100644 --- a/src/util/io/FileIO.h +++ b/src/util/io/FileIO.h @@ -13,6 +13,7 @@ #define FileIO_h #include +#include #include "FileIOData.h" /*! diff --git a/src/util/util_base.h b/src/util/util_base.h index bc9db329..4f2ada6c 100644 --- a/src/util/util_base.h +++ b/src/util/util_base.h @@ -1,4 +1,4 @@ -/* +/* * Written by: * Kevin J. Bowers, Ph.D. * Plasma Physics Group (X-1) @@ -21,7 +21,7 @@ #endif // C99 does requires some key macros of stdint to only be defined in -// C++ implementations if explicitly requested. +// C++ implementations if explicitly requested. #define __STDC_LIMIT_MACROS @@ -102,7 +102,7 @@ typedef struct collective collective_t; #ifndef RESTRICT #define RESTRICT __restrict -#endif +#endif // Normal pointers (e.g. a *) are in whatever address space the given // compile unit uses. However, sometimes it is necessary to declare @@ -154,7 +154,7 @@ typedef struct collective collective_t; // allow correct autogeneration when no alignment necessary ... sigh // ... -#define PAD(s,a) ( (a) - ( (s) & ( (a)-1 ) ) ) +#define PAD(s,a) ( (a) - ( (s) & ( (a)-1 ) ) ) // POW2_CEIL rounds "u" up to the nearest multiple of the power of two // "a". If u is a multiple of "a", its value is unchanged. 
"a" should @@ -344,7 +344,7 @@ void detect_old_style_arguments(int* pargc, char *** pargv); #define MALLOC(x,n) \ util_malloc( "MALLOC( "#x", "#n" (%lu bytes) ) at " \ __FILE__ "(" EXPAND_AND_STRINGIFY(__LINE__) ") failed", \ - &(x), (n)*sizeof(*(x)) ) + &(x), (n)*sizeof(*(x)) ) void util_malloc( const char * err_fmt, // Has exactly one %lu in it @@ -370,7 +370,7 @@ util_free( void * mem_ref ); #n" (%lu bytes), " \ #a" (%lu bytes) ) at " \ __FILE__ "(" EXPAND_AND_STRINGIFY(__LINE__) ") failed", \ - &(x), (n)*sizeof(*(x)), (a) ) + &(x), (n)*sizeof(*(x)), (a) ) void diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 35d7441d..e4fd86ae 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -44,7 +44,7 @@ void vpic_simulation::dump_particles( const char *sp_name, int ftag ) { species_t * sp = find_species_name(sp_name, species_list); - dump_strategy.dump_particles( + dump_strategy->dump_particles( fbase, sp, grid, @@ -54,12 +54,28 @@ void vpic_simulation::dump_particles( const char *sp_name, ); } -void dump_fields( const char *fbase, int fname_tag = 1 ) +void vpic_simulation::dump_fields( const char *fbase, int ftag ) { + dump_strategy->dump_fields( + fbase, + step(),grid, + field_array, + ftag + ); } -void dump_hydro( const char *sp_name, const char *fbase, int fname_tag = 1 ) +void vpic_simulation::dump_hydro( const char *sp_name, const char *fbase, int ftag ) { + species_t * sp = find_species_name(sp_name, species_list); + dump_strategy->dump_hydro( + fbase, + step(), + hydro_array, + sp, + interpolator_array, + grid, + ftag + ); } void diff --git a/src/vpic/dump.h b/src/vpic/dump.h index 966e627e..1d17ee8a 100644 --- a/src/vpic/dump.h +++ b/src/vpic/dump.h @@ -2,6 +2,7 @@ #define dump_h #include +#include "../grid/grid.h" // TODO: should this be an enum? namespace dump_type { @@ -14,33 +15,5 @@ namespace dump_type { } // namespace // TODO: namesapce? 
-std::array global_particle_index(int local_i, grid_t* grid, int rank) -{ - int ix, iy, iz, rx, ry, rz; - // Convert rank to local x/y/z - UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); - - // Calculate local ix/iy/iz - UNVOXEL(local_i, ix, iy, iz, grid->nx+2, grid->ny+2, grid->nz+2); - - // Account for the "first" ghost cell - ix = ix - 1; - iy = iy - 1; - iz = iz - 1; - - // Convert ix/iy/iz to global - int gix = ix + (grid->nx * (rx)); - int giy = iy + (grid->ny * (ry)); - int giz = iz + (grid->nz * (rz)); - - // calculate global grid sizes - int gnx = grid->nx * grid->gpx; - int gny = grid->ny * grid->gpy; - int gnz = grid->nz * grid->gpz; - - // TODO: find a better way to account for the hard coded ghosts in VOXEL - int global_i = VOXEL(gix, giy, giz, gnx-2, gny-2, gnz-2); - - return { global_i, gix, giy, giz }; -} +std::array global_particle_index(int local_i, grid_t* grid, int rank); #endif diff --git a/src/vpic/dump_strategy.cc b/src/vpic/dump_strategy.cc new file mode 100644 index 00000000..adea2714 --- /dev/null +++ b/src/vpic/dump_strategy.cc @@ -0,0 +1,158 @@ +//BinaryDump::BinaryDump(int _rank, int _nproc) : Dump_Strategy(_rank, _nproc) +//{ + //// empty +//} +#include "dump_strategy.h" + +void BinaryDump::dump_fields( + const char *fbase, + int step, + grid_t* grid, + field_array_t* field_array, + int ftag + ) +{ + char fname[256]; + FileIO fileIO; + int dim[3]; + + if( !fbase ) ERROR(( "Invalid filename" )); + + if( rank==0 ) MESSAGE(( "Dumping fields to \"%s\"", fbase )); + + if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step, rank ); + else sprintf( fname, "%s.%i", fbase, rank ); + + FileIOStatus status = fileIO.open(fname, io_write); + if( status==fail ) ERROR(( "Could not open \"%s\".", fname )); + + /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ + size_t nxout = grid->nx; + size_t nyout = grid->ny; + size_t nzout = grid->nz; + float dxout = grid->dx; + float dyout = grid->dy; + float dzout = grid->dz; + + 
WRITE_HEADER_V0( dump_type::field_dump, -1, 0, fileIO, step , rank, nproc); + + dim[0] = grid->nx+2; + dim[1] = grid->ny+2; + dim[2] = grid->nz+2; + WRITE_ARRAY_HEADER( field_array->f, 3, dim, fileIO ); + fileIO.write( field_array->f, dim[0]*dim[1]*dim[2] ); + if( fileIO.close() ) ERROR(( "File close failed on dump fields!!!" )); +} + +void BinaryDump::dump_particles( + const char *fbase, + species_t* sp, + grid_t* grid, + int step, + interpolator_array_t* interpolator_array, + int ftag + ) +{ + char fname[256]; + FileIO fileIO; + int dim[1], buf_start; + static particle_t * ALIGNED(128) p_buf = NULL; +# define PBUF_SIZE 32768 // 1MB of particles + + if( !sp ) ERROR(( "Invalid species name \"%s\".", sp->name )); + + if( !fbase ) ERROR(( "Invalid filename" )); + + if( !p_buf ) MALLOC_ALIGNED( p_buf, PBUF_SIZE, 128 ); + + if( rank==0 ) + MESSAGE(("Dumping \"%s\" particles to \"%s\"",sp->name,fbase)); + + if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step, rank ); + else sprintf( fname, "%s.%i", fbase, rank ); + FileIOStatus status = fileIO.open(fname, io_write); + if( status==fail ) ERROR(( "Could not open \"%s\"", fname )); + + /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ + size_t nxout = grid->nx; + size_t nyout = grid->ny; + size_t nzout = grid->nz; + float dxout = grid->dx; + float dyout = grid->dy; + float dzout = grid->dz; + + WRITE_HEADER_V0( dump_type::particle_dump, sp->id, sp->q/sp->m, fileIO, step, rank, nproc); + + dim[0] = sp->np; + WRITE_ARRAY_HEADER( p_buf, 1, dim, fileIO ); + + // Copy a PBUF_SIZE hunk of the particle list into the particle + // buffer, timecenter it and write it out. This is done this way to + // guarantee the particle list unchanged while not requiring too + // much memory. + + // FIXME: WITH A PIPELINED CENTER_P, PBUF NOMINALLY SHOULD BE QUITE + // LARGE. 
+ + particle_t * sp_p = sp->p; sp->p = p_buf; + int sp_np = sp->np; sp->np = 0; + int sp_max_np = sp->max_np; sp->max_np = PBUF_SIZE; + for( buf_start=0; buf_startnp = sp_np-buf_start; if( sp->np > PBUF_SIZE ) sp->np = PBUF_SIZE; + COPY( sp->p, &sp_p[buf_start], sp->np ); + center_p( sp, interpolator_array ); + fileIO.write( sp->p, sp->np ); + } + sp->p = sp_p; + sp->np = sp_np; + sp->max_np = sp_max_np; + + if( fileIO.close() ) ERROR(("File close failed on dump particles!!!")); +} +void BinaryDump::dump_hydro( + const char *fbase, + int step, + hydro_array_t* hydro_array, + species_t* sp, + interpolator_array_t* interpolator_array, + grid_t* grid, + int ftag + ) +{ + char fname[256]; + FileIO fileIO; + int dim[3]; + + if( !sp ) ERROR(( "Invalid species \"%s\"", sp->name )); + + clear_hydro_array( hydro_array ); + accumulate_hydro_p( hydro_array, sp, interpolator_array ); + synchronize_hydro_array( hydro_array ); + + if( !fbase ) ERROR(( "Invalid filename" )); + + if( rank==0 ) + MESSAGE(("Dumping \"%s\" hydro fields to \"%s\"",sp->name,fbase)); + + if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step, rank ); + else sprintf( fname, "%s.%i", fbase, rank ); + FileIOStatus status = fileIO.open(fname, io_write); + if( status==fail) ERROR(( "Could not open \"%s\".", fname )); + + /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ + size_t nxout = grid->nx; + size_t nyout = grid->ny; + size_t nzout = grid->nz; + float dxout = grid->dx; + float dyout = grid->dy; + float dzout = grid->dz; + + WRITE_HEADER_V0(dump_type::hydro_dump, sp->id, sp->q/sp->m, fileIO, step, rank, nproc); + + dim[0] = grid->nx+2; + dim[1] = grid->ny+2; + dim[2] = grid->nz+2; + WRITE_ARRAY_HEADER( hydro_array->h, 3, dim, fileIO ); + fileIO.write( hydro_array->h, dim[0]*dim[1]*dim[2] ); + if( fileIO.close() ) ERROR(( "File close failed on dump hydro!!!" 
)); +} diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index c745df7c..6a5ba92c 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -1,12 +1,18 @@ #ifndef Dump_Strategy_h #define Dump_Strategy_h +// TODO: should I drop the ./src here? #include "../util/io/FileIO.h" #include "../util/util_base.h" #include "../util/io/FileUtils.h" +#include "../field_advance/field_advance.h" +#include "../sf_interface/sf_interface.h" +#include "../species_advance/species_advance.h" + #include "dump.h" #include "dumpmacros.h" + #ifdef VPIC_ENABLE_HDF5 #include "hdf5.h" // from the lib #include "hdf5_header_info.h" // from vpic @@ -21,6 +27,7 @@ class Dump_Strategy { int rank, nproc; Dump_Strategy(int _rank, int _nproc) : rank(_rank), nproc(_nproc) { } // empty + virtual ~Dump_Strategy() { }; virtual void dump_fields( const char *fbase, @@ -28,7 +35,7 @@ class Dump_Strategy { grid_t* grid, field_array_t* field_array, int ftag - ); + ) = 0; virtual void dump_hydro( const char *fbase, int step, @@ -37,7 +44,7 @@ class Dump_Strategy { interpolator_array_t* interpolator_array, grid_t* grid, int ftag - ); + ) = 0; virtual void dump_particles( const char *fbase, species_t* sp, @@ -45,12 +52,13 @@ class Dump_Strategy { int step, interpolator_array_t* interpolator_array, int ftag - ); + ) = 0; }; class BinaryDump : public Dump_Strategy { public: using Dump_Strategy::Dump_Strategy; // inherit constructor + BinaryDump(int _rank, int _nproc) : Dump_Strategy(_rank, _nproc){ } // empty // TODO: now we pass rank and step, ftag has odd semanticds void dump_fields( @@ -59,39 +67,16 @@ class BinaryDump : public Dump_Strategy { grid_t* grid, field_array_t* field_array, int ftag - ) - { - char fname[256]; - FileIO fileIO; - int dim[3]; - - if( !fbase ) ERROR(( "Invalid filename" )); - - if( rank==0 ) MESSAGE(( "Dumping fields to \"%s\"", fbase )); - - if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step, rank ); - else sprintf( fname, "%s.%i", fbase, rank ); - - 
FileIOStatus status = fileIO.open(fname, io_write); - if( status==fail ) ERROR(( "Could not open \"%s\".", fname )); - - /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ - size_t nxout = grid->nx; - size_t nyout = grid->ny; - size_t nzout = grid->nz; - float dxout = grid->dx; - float dyout = grid->dy; - float dzout = grid->dz; - - WRITE_HEADER_V0( dump_type::field_dump, -1, 0, fileIO, step , rank, nproc); - - dim[0] = grid->nx+2; - dim[1] = grid->ny+2; - dim[2] = grid->nz+2; - WRITE_ARRAY_HEADER( field_array->f, 3, dim, fileIO ); - fileIO.write( field_array->f, dim[0]*dim[1]*dim[2] ); - if( fileIO.close() ) ERROR(( "File close failed on dump fields!!!" )); - } + ); + void dump_hydro( + const char *fbase, + int step, + hydro_array_t* hydro_array, + species_t* sp, + interpolator_array_t* interpolator_array, + grid_t* grid, + int ftag + ); void dump_particles( const char *fbase, species_t* sp, @@ -99,111 +84,7 @@ class BinaryDump : public Dump_Strategy { int step, interpolator_array_t* interpolator_array, int ftag - ) - { - char fname[256]; - FileIO fileIO; - int dim[1], buf_start; - static particle_t * ALIGNED(128) p_buf = NULL; -# define PBUF_SIZE 32768 // 1MB of particles - - if( !sp ) ERROR(( "Invalid species name \"%s\".", sp->name )); - - if( !fbase ) ERROR(( "Invalid filename" )); - - if( !p_buf ) MALLOC_ALIGNED( p_buf, PBUF_SIZE, 128 ); - - if( rank==0 ) - MESSAGE(("Dumping \"%s\" particles to \"%s\"",sp->name,fbase)); - - if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step, rank ); - else sprintf( fname, "%s.%i", fbase, rank ); - FileIOStatus status = fileIO.open(fname, io_write); - if( status==fail ) ERROR(( "Could not open \"%s\"", fname )); - - /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ - size_t nxout = grid->nx; - size_t nyout = grid->ny; - size_t nzout = grid->nz; - float dxout = grid->dx; - float dyout = grid->dy; - float dzout = grid->dz; - - WRITE_HEADER_V0( dump_type::particle_dump, sp->id, sp->q/sp->m, fileIO, 
step, rank, nproc); - - dim[0] = sp->np; - WRITE_ARRAY_HEADER( p_buf, 1, dim, fileIO ); - - // Copy a PBUF_SIZE hunk of the particle list into the particle - // buffer, timecenter it and write it out. This is done this way to - // guarantee the particle list unchanged while not requiring too - // much memory. - - // FIXME: WITH A PIPELINED CENTER_P, PBUF NOMINALLY SHOULD BE QUITE - // LARGE. - - particle_t * sp_p = sp->p; sp->p = p_buf; - int sp_np = sp->np; sp->np = 0; - int sp_max_np = sp->max_np; sp->max_np = PBUF_SIZE; - for( buf_start=0; buf_startnp = sp_np-buf_start; if( sp->np > PBUF_SIZE ) sp->np = PBUF_SIZE; - COPY( sp->p, &sp_p[buf_start], sp->np ); - center_p( sp, interpolator_array ); - fileIO.write( sp->p, sp->np ); - } - sp->p = sp_p; - sp->np = sp_np; - sp->max_np = sp_max_np; - - if( fileIO.close() ) ERROR(("File close failed on dump particles!!!")); - } - void dump_hydro( - const char *fbase, - int step, - hydro_array_t* hydro_array, - species_t* sp, - interpolator_array_t* interpolator_array, - grid_t* grid, - int ftag - ) - { - char fname[256]; - FileIO fileIO; - int dim[3]; - - if( !sp ) ERROR(( "Invalid species \"%s\"", sp->name )); - - clear_hydro_array( hydro_array ); - accumulate_hydro_p( hydro_array, sp, interpolator_array ); - synchronize_hydro_array( hydro_array ); - - if( !fbase ) ERROR(( "Invalid filename" )); - - if( rank==0 ) - MESSAGE(("Dumping \"%s\" hydro fields to \"%s\"",sp->name,fbase)); - - if( ftag ) sprintf( fname, "%s.%li.%i", fbase, (long)step, rank ); - else sprintf( fname, "%s.%i", fbase, rank ); - FileIOStatus status = fileIO.open(fname, io_write); - if( status==fail) ERROR(( "Could not open \"%s\".", fname )); - - /* IMPORTANT: these values are written in WRITE_HEADER_V0 */ - size_t nxout = grid->nx; - size_t nyout = grid->ny; - size_t nzout = grid->nz; - float dxout = grid->dx; - float dyout = grid->dy; - float dzout = grid->dz; - - WRITE_HEADER_V0(dump_type::hydro_dump, sp->id, sp->q/sp->m, fileIO, step, rank, nproc); 
- - dim[0] = grid->nx+2; - dim[1] = grid->ny+2; - dim[2] = grid->nz+2; - WRITE_ARRAY_HEADER( hydro_array->h, 3, dim, fileIO ); - fileIO.write( hydro_array->h, dim[0]*dim[1]*dim[2] ); - if( fileIO.close() ) ERROR(( "File close failed on dump hydro!!!" )); - } + ); }; #ifdef VPIC_ENABLE_HDF5 diff --git a/src/vpic/vpic.cc b/src/vpic/vpic.cc index 3e3b3812..0dd2a418 100644 --- a/src/vpic/vpic.cc +++ b/src/vpic/vpic.cc @@ -72,7 +72,7 @@ reanimate_vpic_simulation( vpic_simulation * vpic ) { } -vpic_simulation::vpic_simulation() : dump_strategy(BinaryDump( rank(), nproc() )) +vpic_simulation::vpic_simulation() { // TODO: why is this a good idea? // Is this just trying to 0 initialize everything? @@ -112,6 +112,11 @@ vpic_simulation::vpic_simulation() : dump_strategy(BinaryDump( rank(), nproc() ) REGISTER_OBJECT( this, checkpt_vpic_simulation, restore_vpic_simulation, reanimate_vpic_simulation ); + // Initialize the dump strategy to use the binary dumpin, assuming the user + // may overwrite this later + dump_strategy = std::unique_ptr(new BinaryDump( rank(), nproc() )); + + // TODO: this this still makes sense now we have a dump strategy #ifdef VPIC_ENABLE_HDF5 // Default init hdf5 dump flags field_interval = 1; diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index 73dfab29..939480f7 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -384,7 +384,9 @@ class vpic_simulation { void dump_particles( const char *sp_name, const char *fbase, int fname_tag = 1 ); - Dump_Strategy dump_strategy; + // Very likely a user will forgot to delete this if they change the strategy, + // a smart ptr will save us from the small leak + std::unique_ptr dump_strategy; #ifdef VPIC_ENABLE_HDF5 // Declare vars to use From 461598513e39a21c4f4482dbb8a1391a94da5037 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Wed, 6 Nov 2019 10:45:28 -0700 Subject: [PATCH 72/95] fixed include for unique pointer and add comment on inheritance --- src/vpic/dump.cc | 29 +++++++++++++++++++++++++++++ 
src/vpic/dump_strategy.h | 3 +++ src/vpic/vpic.h | 1 + 3 files changed, 33 insertions(+) diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index e4fd86ae..005bb048 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -23,6 +23,35 @@ // COMPATIBLE WITH EXISTING EXTERNAL 3RD PARTY VISUALIZATION SOFTWARE. // IN THE LONG RUN, THIS EXTERNAL SOFTWARE WILL NEED TO BE UPDATED. +std::array global_particle_index(int local_i, grid_t* grid, int rank) +{ + int ix, iy, iz, rx, ry, rz; + // Convert rank to local x/y/z + UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + + // Calculate local ix/iy/iz + UNVOXEL(local_i, ix, iy, iz, grid->nx+2, grid->ny+2, grid->nz+2); + + // Account for the "first" ghost cell + ix = ix - 1; + iy = iy - 1; + iz = iz - 1; + + // Convert ix/iy/iz to global + int gix = ix + (grid->nx * (rx)); + int giy = iy + (grid->ny * (ry)); + int giz = iz + (grid->nz * (rz)); + + // calculate global grid sizes + int gnx = grid->nx * grid->gpx; + int gny = grid->ny * grid->gpy; + int gnz = grid->nz * grid->gpz; + + // TODO: find a better way to account for the hard coded ghosts in VOXEL + int global_i = VOXEL(gix, giy, giz, gnx-2, gny-2, gnz-2); + + return { global_i, gix, giy, giz }; +} // TODO: this should live somewhere more sensible, but it's better than the // global static it replaces std::unordered_map tframe_map; diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 6a5ba92c..d3c3586a 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -22,6 +22,9 @@ #include #endif +// Runtime inheritance is obviously not very "VPIC like", as we will [probably] +// incur a penalty for the vtable lookup, but given we're about to do IO this +// is very negligible. 
class Dump_Strategy { public: int rank, nproc; diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index 939480f7..5c6c72c3 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -15,6 +15,7 @@ #include #include +#include // unique_ptr #include "../boundary/boundary.h" #include "../collision/collision.h" From acd77ce95801abc905a00e7afb9d28e93c759dfb Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 12 Nov 2019 12:34:34 -0700 Subject: [PATCH 73/95] hdf5 backend seems to be working, including example of how to enable --- sample/harrisHDF5 | 29 +++-- src/vpic/dump.cc | 21 ++- src/vpic/dump_strategy.h | 253 +++++++++++++++++++++++++++--------- src/vpic/hdf5_header_info.h | 215 +++++++++++++++--------------- src/vpic/vpic.cc | 14 +- src/vpic/vpic.h | 154 ++-------------------- 6 files changed, 356 insertions(+), 330 deletions(-) diff --git a/sample/harrisHDF5 b/sample/harrisHDF5 index 2b3b21bf..c6c326de 100644 --- a/sample/harrisHDF5 +++ b/sample/harrisHDF5 @@ -49,7 +49,7 @@ begin_initialization { // Example of how to call / set dumping - field_dump_flag.disableEMAT(); + //field_dump_flag.disableEMAT(); double input_mass_ratio; @@ -141,16 +141,16 @@ begin_initialization { num_step = int(0.2*taui/(wci*dt)); status_interval = int(1./(wci*dt)); - field_interval = status_interval; - hydro_interval = status_interval; + field_interval = 1; //status_interval; + hydro_interval = 1; //status_interval; sync_shared_interval = status_interval; clean_div_e_interval = status_interval; clean_div_b_interval = status_interval; global->energies_interval = status_interval; - global->fields_interval = status_interval; - global->ehydro_interval = status_interval; - global->ihydro_interval = status_interval; + global->fields_interval = 1; //status_interval; + global->ehydro_interval = 1; //status_interval; + global->ihydro_interval = 1; //status_interval; global->eparticle_interval = status_interval; global->iparticle_interval = status_interval; global->restart_interval = status_interval; @@ 
-324,6 +324,11 @@ begin_initialization { // - Increment the time step // - Call user diagnostics // - (periodically) Print a status message + + // Explicitly enable HDF5 backend for IO dump + // WARNING: Call this after you have set `num_step` (for now.. soon fixed) + + enable_hdf5_dump(); } begin_diagnostics { @@ -369,8 +374,8 @@ begin_diagnostics { // algorithm. As a result, JF is not valid until at least one timestep has // been completed. Field dumps are in a binary format. Each rank makes a // field dump. - if( step()==-10 ) dump_fields_hdf5("fields"); // Get first valid total J - if( should_dump(fields) ) dump_fields_hdf5("fields"); + if( step()==-10 ) dump_fields("fields"); // Get first valid total J + if( should_dump(fields) ) dump_fields("fields"); // Hydro dumps store particle charge density, current density and // stress-energy tensor. All these quantities are known at the time @@ -381,16 +386,16 @@ begin_diagnostics { // purely diagnostic. It is not used by the simulation and it is not // accumulated using a self-consistent charge-conserving method. Hydro dumps // are in a binary format. Each rank makes a hydro dump. - if(should_dump(ehydro) ) dump_hydro_hdf5("electron","ehydro"); - if( should_dump(ihydro) ) dump_hydro_hdf5("ion", "ihydro"); + if(should_dump(ehydro) ) dump_hydro("electron","ehydro"); + if( should_dump(ihydro) ) dump_hydro("ion", "ihydro"); // Particle dumps store the particle data for a given species. The data // written is known at the time t = time(). By default, particle dumps // are tagged with step(). However, if a "0" is added to the call, the // filename will not be tagged. Particle dumps are in a binary format. // Each rank makes a particle dump. 
- if( should_dump(eparticle) ) dump_particles_hdf5("electron","eparticle"); - if( should_dump(iparticle) ) dump_particles_hdf5("ion", "iparticle"); + if( should_dump(eparticle) ) dump_particles("electron","eparticle"); + if( should_dump(iparticle) ) dump_particles("ion", "iparticle"); // A checkpt is made by calling checkpt( fbase, tag ) where fname is a string // and tag is an integer. A typical usage is: diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 005bb048..9ac120a2 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -52,9 +52,6 @@ std::array global_particle_index(int local_i, grid_t* grid, int rank) return { global_i, gix, giy, giz }; } -// TODO: this should live somewhere more sensible, but it's better than the -// global static it replaces -std::unordered_map tframe_map; int vpic_simulation::dump_mkdir(const char * dname) { return FileUtils::makeDirectory(dname); @@ -68,6 +65,24 @@ int vpic_simulation::dump_cwd(char * dname, size_t size) { * ASCII dump IO *****************************************************************************/ +void vpic_simulation::enable_binary_dump() { + dump_strategy = std::unique_ptr(new BinaryDump( rank(), nproc(), num_step )); +} + +#ifdef VPIC_ENABLE_HDF5 +void vpic_simulation::enable_hdf5_dump() { + std::cout << "Enabling HDF5 IO backend" << std::endl; + dump_strategy = std::unique_ptr(new HDF5Dump( rank(), nproc(), num_step )); +} +#endif + +#ifdef VPIC_ENABLE_OPENPMD +void vpic_simulation::enable_openpmd_dump() { + std::cout << "Enabling openPMD IO backend" << std::endl; + dump_strategy = std::unique_ptr(new OpenPMDDump( rank(), nproc(), num_step )); +} +#endif + void vpic_simulation::dump_particles( const char *sp_name, const char *fbase, int ftag ) diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index d3c3586a..08fe6e60 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -1,6 +1,13 @@ #ifndef Dump_Strategy_h #define Dump_Strategy_h +#include +#include + +#include // TODO: 
it would be good if this didn't have to know about MPI + +#define DUMP_INFO_DEBUG 1 + // TODO: should I drop the ./src here? #include "../util/io/FileIO.h" #include "../util/util_base.h" @@ -27,9 +34,14 @@ // is very negligible. class Dump_Strategy { public: - int rank, nproc; + int rank, nproc, num_step; + + Dump_Strategy(int _rank, int _nproc, int total_steps) : + rank(_rank), + nproc(_nproc), + num_step(total_steps) // TODO: remove the need for this + { } // empty - Dump_Strategy(int _rank, int _nproc) : rank(_rank), nproc(_nproc) { } // empty virtual ~Dump_Strategy() { }; virtual void dump_fields( @@ -61,7 +73,7 @@ class Dump_Strategy { class BinaryDump : public Dump_Strategy { public: using Dump_Strategy::Dump_Strategy; // inherit constructor - BinaryDump(int _rank, int _nproc) : Dump_Strategy(_rank, _nproc){ } // empty + BinaryDump(int _rank, int _nproc, int total_steps) : Dump_Strategy(_rank, _nproc, total_steps){ } // empty // TODO: now we pass rank and step, ftag has odd semanticds void dump_fields( @@ -91,49 +103,157 @@ class BinaryDump : public Dump_Strategy { }; #ifdef VPIC_ENABLE_HDF5 + +struct field_dump_flag_t +{ + bool ex = true, ey = true, ez = true, div_e_err = true; + bool cbx = true, cby = true, cbz = true, div_b_err = true; + bool tcax = true, tcay = true, tcaz = true, rhob = true; + bool jfx = true, jfy = true, jfz = true, rhof = true; + bool ematx = true, ematy = true, ematz = true, nmat = true; + bool fmatx = true, fmaty = true, fmatz = true, cmat = true; + void disableE() + { + ex = false, ey = false, ez = false, div_e_err = false; + } + + void disableCB() + { + cbx = false, cby = false, cbz = false, div_b_err = false; + } + + void disableTCA() + { + tcax = false, tcay = false, tcaz = false, rhob = false; + } + + void disableJF() + { + jfx = false, jfy = false, jfz = false, rhof = false; + } + + void disableEMAT() + { + ematx = false, ematy = false, ematz = false, nmat = false; + } + + void disableFMAT() + { + fmatx = false, fmaty = false, 
fmatz = false, cmat = false; + } + + void resetToDefaults() + { + ex = true, ey = true, ez = true, div_e_err = true; + cbx = true, cby = true, cbz = true, div_b_err = true; + tcax = true, tcay = true, tcaz = true, rhob = true; + jfx = true, jfy = true, jfz = true, rhof = true; + ematx = true, ematy = true, ematz = true, nmat = true; + fmatx = true, fmaty = true, fmatz = true, cmat = true; + } + + bool enabledE() + { + return ex && ey && ez; + } + + bool enabledCB() + { + return cbx && cby && cbz; + } + + bool enabledTCA() + { + return tcax && tcay && tcaz; + } + + bool enabledJF() + { + return jfx && jfy && jfz; + } + + bool enabledEMAT() + { + return ematx && ematy && ematz; + } + + bool enabledFMAT() + { + return fmatx && fmaty && fmatz; + } +}; + +struct hydro_dump_flag_t +{ + bool jx = true, jy = true, jz = true, rho = true; + bool px = true, py = true, pz = true, ke = true; + bool txx = true, tyy = true, tzz = true; + bool tyz = true, tzx = true, txy = true; + + void disableJ() + { + jx = false, jy = false, jz = false, rho = false; + } + + void disableP() + { + px = false, py = false, pz = false, ke = false; + } + + void disableTD() //Stress diagonal + { + txx = false, tyy = false, tzz = false; + } + + void disableTOD() //Stress off-diagonal + { + tyz = false, tzx = false, txy = false; + } + void resetToDefaults() + { + jx = true, jy = true, jz = true, rho = true; + px = true, py = true, pz = true, ke = true; + txx = true, tyy = true, tzz = true; + tyz = true, tzx = true, txy = true; + } + + bool enabledJ() + { + return jx && jy && jz; + } + + bool enabledP() + { + return px && py && pz; + } + + bool enabledTD() + { + return txx && tyy && tzz; + } + + bool enabledTOD() + { + return tyz && tzx && txy; + } +}; class HDF5Dump : public Dump_Strategy { + std::unordered_map tframe_map; public: using Dump_Strategy::Dump_Strategy; // inherit constructor + + // TODO: replace these with a common dump interface + // Declare vars to use + hydro_dump_flag_t 
hydro_dump_flag; + field_dump_flag_t field_dump_flag; + #define DUMP_DIR_FORMAT "./%s" - /* define to do C-style indexing */ -#define hydro(x, y, z) hydro_array->h[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] +// TODO: naming a macro so close to existing functions AND data is not a good +// define to do C-style indexing +#define _hydro(x, y, z) hydro_array->h[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] // TODO: make function? -#define invert_field_xml_item(xml_file_name, speciesname_p, time_step, dims_4d, dims_3d, add_footer_flag) \ - { \ - FILE *fp; \ - fp = fopen(xml_file_name, "a"); \ - fprintf(fp, main_body_head, time_step); \ - if (field_dump_flag.enabledE()) \ - write_main_body_attribute(fp, main_body_attributeV, "E", dims_4d, dims_3d, speciesname_p, time_step, "ex", "ey", "ez"); \ - if (field_dump_flag.div_e_err) \ - fprintf(fp, main_body_attributeS, "div_e_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_e_err"); \ - if (field_dump_flag.enabledCB()) \ - write_main_body_attribute(fp, main_body_attributeV, "B", dims_4d, dims_3d, speciesname_p, time_step, "cbx", "cby", "cbz"); \ - if (field_dump_flag.div_b_err) \ - fprintf(fp, main_body_attributeS, "div_b_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_b_err"); \ - if (field_dump_flag.enabledTCA()) \ - write_main_body_attribute(fp, main_body_attributeV, "TCA", dims_4d, dims_3d, speciesname_p, time_step, "tcax", "tcay", "tcaz"); \ - if (field_dump_flag.rhob) \ - fprintf(fp, main_body_attributeS, "rhob", dims_3d, time_step, speciesname_p, time_step, time_step, "rhob"); \ - if (field_dump_flag.enabledJF()) \ - write_main_body_attribute(fp, main_body_attributeV, "JF", dims_4d, dims_3d, speciesname_p, time_step, "jfx", "jfy", "jfz"); \ - if (field_dump_flag.rhof) \ - fprintf(fp, main_body_attributeS, "rhof", dims_3d, time_step, speciesname_p, time_step, time_step, "rhof"); \ - if (field_dump_flag.enabledEMAT()) \ - write_main_body_attribute(fp, main_body_attributeV, "EMAT", 
dims_4d, dims_3d, speciesname_p, time_step, "ematx", "ematy", "ematz"); \ - if (field_dump_flag.nmat) \ - fprintf(fp, main_body_attributeS, "nmat", dims_3d, time_step, speciesname_p, time_step, time_step, "nmat"); \ - if (field_dump_flag.enabledFMAT()) \ - write_main_body_attribute(fp, main_body_attributeV, "FMAT", dims_4d, dims_3d, speciesname_p, time_step, "fmatx", "fmaty", "fmatz"); \ - if (field_dump_flag.cmat) \ - fprintf(fp, main_body_attributeS, "cmat", dims_3d, time_step, speciesname_p, time_step, time_step, "cmat"); \ - fprintf(fp, "%s", main_body_foot); \ - if (add_footer_flag) \ - fputs(footer, fp); \ - fclose(fp); \ - } void dump_fields( const char *fbase, int step, @@ -257,6 +377,7 @@ class HDF5Dump : public Dump_Strategy { int rx, ry, rz; UNVOXEL(mpi_rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + int mpi_rank_x, mpi_rank_y, mpi_rank_z; mpi_rank_x = rx; mpi_rank_y = ry; mpi_rank_z = rz; @@ -270,9 +391,9 @@ class HDF5Dump : public Dump_Strategy { global_count[2] = (grid->nz); #ifdef DUMP_INFO_DEBUG - printf("global size = %d %d %d \n", field_global_size[0], field_global_size[1], field_global_size[2]); - printf("global_offset = %d %d %d \n", global_offset[0], global_offset[1], global_offset[2]); - printf("global_count = %d %d %d \n", global_count[0], global_count[1], global_count[2]); + printf("global size = %llu %llu %llu \n", field_global_size[0], field_global_size[1], field_global_size[2]); + printf("global_offset = %llu %llu %llu \n", global_offset[0], global_offset[1], global_offset[2]); + printf("global_count = %llu %llu %llu \n", global_count[0], global_count[1], global_count[2]); printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); fflush(stdout); #endif @@ -390,6 +511,13 @@ class HDF5Dump : public Dump_Strategy { char dxdydz[128]; sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); + + // TODO: remove or let the user set + int field_interval = 1; + + // TODO: remove this dependence on 
number of steps + std::cout << "num_step " << num_step << std::endl; + int nframes = num_step / field_interval + 1; static int field_tframe = 0; @@ -401,8 +529,8 @@ class HDF5Dump : public Dump_Strategy { printf(" dxdydz: %s \n", dxdydz); printf(" nframes: %d \n", nframes); printf(" field_interval: %d \n", field_interval); - printf(" current step: %lld \n", step_for_viou); - printf(" current step: %lld \n", step_for_viou); + printf(" current step: %zd \n", step_for_viou); + printf(" current step: %zd \n", step_for_viou); //printf(" Simulation time: %f \n", grid->t0); printf(" tframe: %d \n", field_tframe); @@ -459,9 +587,9 @@ class HDF5Dump : public Dump_Strategy { // get the total number of particles. in this example, output only electrons //sp = species_list; sprintf(particle_scratch, DUMP_DIR_FORMAT, "particle_hdf5"); - FileUtils::makeDirector(particle_scratch); + FileUtils::makeDirectory(particle_scratch); sprintf(subparticle_scratch, "%s/T.%ld/", particle_scratch, step_for_viou); - FileUtils::makeDirector(subparticle_scratch); + FileUtils::makeDirectory(subparticle_scratch); // TODO: Allow the user to set this int stride_particle_dump = 1; @@ -588,7 +716,6 @@ class HDF5Dump : public Dump_Strategy { global_pi[i] = global_i; } -#undef UNVOXEL dset_id = H5Dcreate(group_id, "i", H5T_NATIVE_INT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, H5S_ALL, filespace, plist_id, global_pi.data()); H5Dclose(dset_id); @@ -740,7 +867,7 @@ class HDF5Dump : public Dump_Strategy { { \ for (size_t k(1); k < grid->nz + 1; k++) \ { \ - temp_buf[temp_buf_index] = hydro(i, j, k).ATTRIBUTE_NAME; \ + temp_buf[temp_buf_index] = _hydro(i, j, k).ATTRIBUTE_NAME; \ temp_buf_index = temp_buf_index + 1; \ } \ } \ @@ -756,9 +883,10 @@ class HDF5Dump : public Dump_Strategy { MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - species_t *sp = find_species_name(speciesname, species_list); if (!sp) - ERROR(("Invalid 
species name: %s", speciesname)); + { + ERROR(("Invalid species")); + } clear_hydro_array(hydro_array); accumulate_hydro_p(hydro_array, sp, interpolator_array); @@ -769,11 +897,11 @@ class HDF5Dump : public Dump_Strategy { char subhydro_scratch[128]; sprintf(hydro_scratch, "./%s", "hydro_hdf5"); - FileUtils::makeDirector(hydro_scratch); + FileUtils::makeDirectory(hydro_scratch); sprintf(subhydro_scratch, "%s/T.%zu/", hydro_scratch, step_for_viou); - FileUtils::makeDirector(subhydro_scratch); + FileUtils::makeDirectory(subhydro_scratch); - sprintf(hname, "%s/hydro_%s_%zu.h5", subhydro_scratch, speciesname, step_for_viou); + sprintf(hname, "%s/hydro_%s_%zu.h5", subhydro_scratch, sp->name, step_for_viou); double el1 = uptime(); hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); @@ -832,7 +960,7 @@ class HDF5Dump : public Dump_Strategy { hydro_local_size[2] = grid->nz; int mpi_rank_x, mpi_rank_y, mpi_rank_z; - RANK_TO_INDEX2(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + UNVOXEL(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z, grid->gpx, grid->gpy, grid->gpz); global_offset[0] = (grid->nx) * mpi_rank_x; global_offset[1] = (grid->ny) * mpi_rank_y; @@ -843,9 +971,9 @@ class HDF5Dump : public Dump_Strategy { global_count[2] = (grid->nz); #ifdef DUMP_INFO_DEBUG - printf("global size = %d %d %d \n", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); - printf("global_offset = %d %d %d \n", global_offset[0], global_offset[1], global_offset[2]); - printf("global_count = %d %d %d \n", global_count[0], global_count[1], global_count[2]); + printf("global size = %llu %llu %llu \n", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); + printf("global_offset = %llu %llu %llu \n", global_offset[0], global_offset[1], global_offset[2]); + printf("global_count = %llu %llu %llu \n", global_count[0], global_count[1], global_count[2]); printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, 
mpi_rank_x, mpi_rank_y, mpi_rank_z); fflush(stdout); #endif @@ -929,7 +1057,7 @@ class HDF5Dump : public Dump_Strategy { if (mpi_rank == 0) { char output_xml_file[128]; - sprintf(output_xml_file, "./%s/%s%s%s", "hydro_hdf5", "hydro-", speciesname, ".xdmf"); + sprintf(output_xml_file, "./%s/%s%s%s", "hydro_hdf5", "hydro-", sp->name, ".xdmf"); char dimensions_3d[128]; sprintf(dimensions_3d, "%lld %lld %lld", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); char dimensions_4d[128]; @@ -939,6 +1067,10 @@ class HDF5Dump : public Dump_Strategy { char dxdydz[128]; sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); + // TODO: remove or let user set + int hydro_interval = 1; + + // TODO: remove this dependence on number of steps int nframes = num_step / hydro_interval + 1; const int tframe = tframe_map[sp->id]; @@ -951,13 +1083,14 @@ class HDF5Dump : public Dump_Strategy { printf(" dxdydz: %s \n", dxdydz); printf(" nframes: %d \n", nframes); printf(" hydro_fields_interval: %d \n", hydro_interval); - printf(" current step: %lld \n", step_for_viou); + printf(" current step: %zu \n", step_for_viou); printf(" Simulation time: %f \n", grid->t0); printf(" tframe: %d \n", tframe); #endif + // TODO: why doesnt this just use the cstr? 
char speciesname_new[128]; - sprintf(speciesname_new, "hydro_%s", speciesname); + sprintf(speciesname_new, "hydro_%s", sp->name); if (tframe >= 1) { if (tframe == (nframes - 1)) diff --git a/src/vpic/hdf5_header_info.h b/src/vpic/hdf5_header_info.h index baed8f7d..e3810612 100644 --- a/src/vpic/hdf5_header_info.h +++ b/src/vpic/hdf5_header_info.h @@ -3,120 +3,123 @@ #define FIELD_ARRAY_NAME field_array -// XML header stuff -const char *header = "\n\n\n\t\n"; -const char *header_topology = "\t\t\n"; -const char *header_geom = "\t\t\n"; -const char *header_origin = "\t\t\t \n\t\t\t%s\n"; -const char *header_dxdydz = "\t\t\t \n\t\t\t%s\n"; -const char *footer_geom = "\t\t\n"; -const char *grid_line = "\t\t \n \ -\t\t\t\n"; -const char *footer = "\t\t\n\t\n\n"; +namespace VPIC_HDF { + // XML header stuff + static const char *header = "\n\n\n\t\n"; + static const char *header_topology = "\t\t\n"; + static const char *header_geom = "\t\t\n"; + static const char *header_origin = "\t\t\t \n\t\t\t%s\n"; + static const char *header_dxdydz = "\t\t\t \n\t\t\t%s\n"; + static const char *footer_geom = "\t\t\n"; + static const char *grid_line = "\t\t \n \ + \t\t\t\n"; + static const char *footer = "\t\t\n\t\n\n"; -const char *main_body_head = "\t\t\t \n \ -\t\t\t\t \n \ -\t\t\t\t \n"; -const char *main_body_foot = "\t\t\t\n"; + static const char *main_body_head = "\t\t\t \n \ + \t\t\t\t \n \ + \t\t\t\t \n"; + static const char *main_body_foot = "\t\t\t\n"; -const char *main_body_attributeV = "\ - \t\t\t\t \n \ - \t\t\t\t\t \n \ - \t\t\t\t\t\t T.%d/%s_%d.h5:/Timestep_%d/%s \n \ - \t\t\t\t\t\t T.%d/%s_%d.h5:/Timestep_%d/%s \n \ - \t\t\t\t\t\t T.%d/%s_%d.h5:/Timestep_%d/%s \n \ - \t\t\t\t\t \n \ - \t\t\t\t \n "; + static const char *main_body_attributeV = "\ + \t\t\t\t \n \ + \t\t\t\t\t \n \ + \t\t\t\t\t\t T.%d/%s_%d.h5:/Timestep_%d/%s \n \ + \t\t\t\t\t\t T.%d/%s_%d.h5:/Timestep_%d/%s \n \ + \t\t\t\t\t\t T.%d/%s_%d.h5:/Timestep_%d/%s \n \ + \t\t\t\t\t \n \ + \t\t\t\t \n "; -const 
char *main_body_attributeS = "\ - \t\t\t\t \n \ - \t\t\t\t\t\t T.%d/%s_%d.h5:/Timestep_%d/%s \n \ - \t\t\t\t \n "; + static const char *main_body_attributeS = "\ + \t\t\t\t \n \ + \t\t\t\t\t\t T.%d/%s_%d.h5:/Timestep_%d/%s \n \ + \t\t\t\t \n "; + +} // end namespace #define create_file_with_header(xml_file_name, dimensions, orignal, dxdydz, nframes, fields_interval) \ - { \ - FILE *fp; \ - fp = fopen(xml_file_name, "w"); \ - fputs(header, fp); \ - fprintf(fp, header_topology, dimensions); \ - fputs(header_geom, fp); \ - fprintf(fp, header_origin, orignal); \ - fprintf(fp, header_dxdydz, dxdydz); \ - fputs(footer_geom, fp); \ - fprintf(fp, grid_line, nframes); \ - int i; \ - for (i = 0; i < nframes; i++) \ - fprintf(fp, "%d ", i*fields_interval); \ - fputs(grid_line_footer, fp); \ - fclose(fp); \ - } + { \ + FILE *fp; \ + fp = fopen(xml_file_name, "w"); \ + fputs(VPIC_HDF::header, fp); \ + fprintf(fp, VPIC_HDF::header_topology, dimensions); \ + fputs(VPIC_HDF::header_geom, fp); \ + fprintf(fp, VPIC_HDF::header_origin, orignal); \ + fprintf(fp, VPIC_HDF::header_dxdydz, dxdydz); \ + fputs(VPIC_HDF::footer_geom, fp); \ + fprintf(fp, VPIC_HDF::grid_line, nframes); \ + int i; \ + for (i = 0; i < nframes; i++) \ + fprintf(fp, "%d ", i*fields_interval); \ + fputs(VPIC_HDF::grid_line_footer, fp); \ + fclose(fp); \ + } #define write_main_body_attribute(fpp, main_body_attribute_p, attribute_name, dims_4d_p, dims_3d_p, file_name_pre_p, time_step_p, a1, a2, a3) \ - { \ - fprintf(fpp, main_body_attribute_p, attribute_name, dims_4d_p, \ - dims_3d_p, time_step_p, file_name_pre_p, time_step_p, time_step_p, a1, \ - dims_3d_p, time_step_p, file_name_pre_p, time_step_p, time_step_p, a2, \ - dims_3d_p, time_step_p, file_name_pre_p, time_step_p, time_step_p, a3); \ - } + { \ + fprintf(fpp, main_body_attribute_p, attribute_name, dims_4d_p, \ + dims_3d_p, time_step_p, file_name_pre_p, time_step_p, time_step_p, a1, \ + dims_3d_p, time_step_p, file_name_pre_p, time_step_p, time_step_p, a2, 
\ + dims_3d_p, time_step_p, file_name_pre_p, time_step_p, time_step_p, a3); \ + } #define invert_field_xml_item(xml_file_name, speciesname_p, time_step, dims_4d, dims_3d, add_footer_flag) \ - { \ - FILE *fp; \ - fp = fopen(xml_file_name, "a"); \ - fprintf(fp, main_body_head, time_step); \ - if (field_dump_flag.enabledE()) \ - write_main_body_attribute(fp, main_body_attributeV, "E", dims_4d, dims_3d, speciesname_p, time_step, "ex", "ey", "ez"); \ - if (field_dump_flag.div_e_err) \ - fprintf(fp, main_body_attributeS, "div_e_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_e_err"); \ - if (field_dump_flag.enabledCB()) \ - write_main_body_attribute(fp, main_body_attributeV, "B", dims_4d, dims_3d, speciesname_p, time_step, "cbx", "cby", "cbz"); \ - if (field_dump_flag.div_b_err) \ - fprintf(fp, main_body_attributeS, "div_b_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_b_err"); \ - if (field_dump_flag.enabledTCA()) \ - write_main_body_attribute(fp, main_body_attributeV, "TCA", dims_4d, dims_3d, speciesname_p, time_step, "tcax", "tcay", "tcaz"); \ - if (field_dump_flag.rhob) \ - fprintf(fp, main_body_attributeS, "rhob", dims_3d, time_step, speciesname_p, time_step, time_step, "rhob"); \ - if (field_dump_flag.enabledJF()) \ - write_main_body_attribute(fp, main_body_attributeV, "JF", dims_4d, dims_3d, speciesname_p, time_step, "jfx", "jfy", "jfz"); \ - if (field_dump_flag.rhof) \ - fprintf(fp, main_body_attributeS, "rhof", dims_3d, time_step, speciesname_p, time_step, time_step, "rhof"); \ - if (field_dump_flag.enabledEMAT()) \ - write_main_body_attribute(fp, main_body_attributeV, "EMAT", dims_4d, dims_3d, speciesname_p, time_step, "ematx", "ematy", "ematz"); \ - if (field_dump_flag.nmat) \ - fprintf(fp, main_body_attributeS, "nmat", dims_3d, time_step, speciesname_p, time_step, time_step, "nmat"); \ - if (field_dump_flag.enabledFMAT()) \ - write_main_body_attribute(fp, main_body_attributeV, "FMAT", dims_4d, dims_3d, speciesname_p, 
time_step, "fmatx", "fmaty", "fmatz"); \ - if (field_dump_flag.cmat) \ - fprintf(fp, main_body_attributeS, "cmat", dims_3d, time_step, speciesname_p, time_step, time_step, "cmat"); \ - fprintf(fp, "%s", main_body_foot); \ - if (add_footer_flag) \ - fputs(footer, fp); \ - fclose(fp); \ - } + { \ + FILE *fp; \ + fp = fopen(xml_file_name, "a"); \ + fprintf(fp, VPIC_HDF::main_body_head, time_step); \ + if (field_dump_flag.enabledE()) \ + write_main_body_attribute(fp, VPIC_HDF::main_body_attributeV, "E", dims_4d, dims_3d, speciesname_p, time_step, "ex", "ey", "ez"); \ + if (field_dump_flag.div_e_err) \ + fprintf(fp, VPIC_HDF::main_body_attributeS, "div_e_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_e_err"); \ + if (field_dump_flag.enabledCB()) \ + write_main_body_attribute(fp, VPIC_HDF::main_body_attributeV, "B", dims_4d, dims_3d, speciesname_p, time_step, "cbx", "cby", "cbz"); \ + if (field_dump_flag.div_b_err) \ + fprintf(fp, VPIC_HDF::main_body_attributeS, "div_b_err", dims_3d, time_step, speciesname_p, time_step, time_step, "div_b_err"); \ + if (field_dump_flag.enabledTCA()) \ + write_main_body_attribute(fp, VPIC_HDF::main_body_attributeV, "TCA", dims_4d, dims_3d, speciesname_p, time_step, "tcax", "tcay", "tcaz"); \ + if (field_dump_flag.rhob) \ + fprintf(fp, VPIC_HDF::main_body_attributeS, "rhob", dims_3d, time_step, speciesname_p, time_step, time_step, "rhob"); \ + if (field_dump_flag.enabledJF()) \ + write_main_body_attribute(fp, VPIC_HDF::main_body_attributeV, "JF", dims_4d, dims_3d, speciesname_p, time_step, "jfx", "jfy", "jfz"); \ + if (field_dump_flag.rhof) \ + fprintf(fp, VPIC_HDF::main_body_attributeS, "rhof", dims_3d, time_step, speciesname_p, time_step, time_step, "rhof"); \ + if (field_dump_flag.enabledEMAT()) \ + write_main_body_attribute(fp, VPIC_HDF::main_body_attributeV, "EMAT", dims_4d, dims_3d, speciesname_p, time_step, "ematx", "ematy", "ematz"); \ + if (field_dump_flag.nmat) \ + fprintf(fp, VPIC_HDF::main_body_attributeS, 
"nmat", dims_3d, time_step, speciesname_p, time_step, time_step, "nmat"); \ + if (field_dump_flag.enabledFMAT()) \ + write_main_body_attribute(fp, VPIC_HDF::main_body_attributeV, "FMAT", dims_4d, dims_3d, speciesname_p, time_step, "fmatx", "fmaty", "fmatz"); \ + if (field_dump_flag.cmat) \ + fprintf(fp, VPIC_HDF::main_body_attributeS, "cmat", dims_3d, time_step, speciesname_p, time_step, time_step, "cmat"); \ + fprintf(fp, "%s", VPIC_HDF::main_body_foot); \ + if (add_footer_flag) \ + fputs(VPIC_HDF::footer, fp); \ + fclose(fp); \ + } #define invert_hydro_xml_item(xml_file_name, speciesname_p, time_step, dims_4d, dims_3d, add_footer_flag) \ - { \ - FILE *fp; \ - fp = fopen(xml_file_name, "a"); \ - fprintf(fp, main_body_head, time_step); \ - if (hydro_dump_flag.enabledJ()) \ - write_main_body_attribute(fp, main_body_attributeV, "J", dims_4d, dims_3d, speciesname_p, time_step, "jx", "jy", "jz"); \ - if (hydro_dump_flag.rho) \ - fprintf(fp, main_body_attributeS, "rho", dims_3d, time_step, speciesname_p, time_step, time_step, "rho"); \ - if (hydro_dump_flag.enabledP()) \ - write_main_body_attribute(fp, main_body_attributeV, "P", dims_4d, dims_3d, speciesname_p, time_step, "px", "py", "pz"); \ - if (hydro_dump_flag.ke) \ - fprintf(fp, main_body_attributeS, "ke", dims_3d, time_step, speciesname_p, time_step, time_step, "ke"); \ - if (hydro_dump_flag.enabledTD()) \ - write_main_body_attribute(fp, main_body_attributeV, "TD", dims_4d, dims_3d, speciesname_p, time_step, "txx", "tyy", "tzz"); \ - if (hydro_dump_flag.enabledTOD()) \ - write_main_body_attribute(fp, main_body_attributeV, "TOD", dims_4d, dims_3d, speciesname_p, time_step, "tyz", "tzx", "txy"); \ - fprintf(fp, "%s", main_body_foot); \ - if (add_footer_flag) \ - fputs(footer, fp); \ - fclose(fp); \ - } - + { \ + FILE *fp; \ + fp = fopen(xml_file_name, "a"); \ + fprintf(fp, VPIC_HDF::main_body_head, time_step); \ + if (hydro_dump_flag.enabledJ()) \ + write_main_body_attribute(fp, VPIC_HDF::main_body_attributeV, "J", 
dims_4d, dims_3d, speciesname_p, time_step, "jx", "jy", "jz"); \ + if (hydro_dump_flag.rho) \ + fprintf(fp, VPIC_HDF::main_body_attributeS, "rho", dims_3d, time_step, speciesname_p, time_step, time_step, "rho"); \ + if (hydro_dump_flag.enabledP()) \ + write_main_body_attribute(fp, VPIC_HDF::main_body_attributeV, "P", dims_4d, dims_3d, speciesname_p, time_step, "px", "py", "pz"); \ + if (hydro_dump_flag.ke) \ + fprintf(fp, VPIC_HDF::main_body_attributeS, "ke", dims_3d, time_step, speciesname_p, time_step, time_step, "ke"); \ + if (hydro_dump_flag.enabledTD()) \ + write_main_body_attribute(fp, VPIC_HDF::main_body_attributeV, "TD", dims_4d, dims_3d, speciesname_p, time_step, "txx", "tyy", "tzz"); \ + if (hydro_dump_flag.enabledTOD()) \ + write_main_body_attribute(fp, VPIC_HDF::main_body_attributeV, "TOD", dims_4d, dims_3d, speciesname_p, time_step, "tyz", "tzx", "txy"); \ + fprintf(fp, "%s", VPIC_HDF::main_body_foot); \ + if (add_footer_flag) { \ + fputs(VPIC_HDF::footer, fp); \ + } \ + fclose(fp); \ + } #endif // VPIC_HDF5_HEAD_INFO diff --git a/src/vpic/vpic.cc b/src/vpic/vpic.cc index 0dd2a418..9d36ff3e 100644 --- a/src/vpic/vpic.cc +++ b/src/vpic/vpic.cc @@ -114,16 +114,20 @@ vpic_simulation::vpic_simulation() // Initialize the dump strategy to use the binary dumpin, assuming the user // may overwrite this later - dump_strategy = std::unique_ptr(new BinaryDump( rank(), nproc() )); + //dump_strategy = std::unique_ptr(new BinaryDump( rank(), nproc() )); + enable_binary_dump(); // TODO: this this still makes sense now we have a dump strategy -#ifdef VPIC_ENABLE_HDF5 +//#ifdef VPIC_ENABLE_HDF5 // Default init hdf5 dump flags + //field_interval = 1; + //hydro_interval = 1; + //field_dump_flag = field_dump_flag_t(); + //hydro_dump_flag = hydro_dump_flag_t(); +//#endif + field_interval = 1; hydro_interval = 1; - field_dump_flag = field_dump_flag_t(); - hydro_dump_flag = hydro_dump_flag_t(); -#endif } vpic_simulation::~vpic_simulation() { diff --git a/src/vpic/vpic.h 
b/src/vpic/vpic.h index 5c6c72c3..118567d0 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -36,144 +36,6 @@ #endif // #include "dumpvars.h" - -// TODO: move these to a better header? -#ifdef VPIC_ENABLE_HDF5 -struct field_dump_flag_t -{ - bool ex = true, ey = true, ez = true, div_e_err = true; - bool cbx = true, cby = true, cbz = true, div_b_err = true; - bool tcax = true, tcay = true, tcaz = true, rhob = true; - bool jfx = true, jfy = true, jfz = true, rhof = true; - bool ematx = true, ematy = true, ematz = true, nmat = true; - bool fmatx = true, fmaty = true, fmatz = true, cmat = true; - void disableE() - { - ex = false, ey = false, ez = false, div_e_err = false; - } - - void disableCB() - { - cbx = false, cby = false, cbz = false, div_b_err = false; - } - - void disableTCA() - { - tcax = false, tcay = false, tcaz = false, rhob = false; - } - - void disableJF() - { - jfx = false, jfy = false, jfz = false, rhof = false; - } - - void disableEMAT() - { - ematx = false, ematy = false, ematz = false, nmat = false; - } - - void disableFMAT() - { - fmatx = false, fmaty = false, fmatz = false, cmat = false; - } - - void resetToDefaults() - { - ex = true, ey = true, ez = true, div_e_err = true; - cbx = true, cby = true, cbz = true, div_b_err = true; - tcax = true, tcay = true, tcaz = true, rhob = true; - jfx = true, jfy = true, jfz = true, rhof = true; - ematx = true, ematy = true, ematz = true, nmat = true; - fmatx = true, fmaty = true, fmatz = true, cmat = true; - } - - bool enabledE() - { - return ex && ey && ez; - } - - bool enabledCB() - { - return cbx && cby && cbz; - } - - bool enabledTCA() - { - return tcax && tcay && tcaz; - } - - bool enabledJF() - { - return jfx && jfy && jfz; - } - - bool enabledEMAT() - { - return ematx && ematy && ematz; - } - - bool enabledFMAT() - { - return fmatx && fmaty && fmatz; - } -}; - -struct hydro_dump_flag_t -{ - bool jx = true, jy = true, jz = true, rho = true; - bool px = true, py = true, pz = true, ke = true; - bool txx 
= true, tyy = true, tzz = true; - bool tyz = true, tzx = true, txy = true; - - void disableJ() - { - jx = false, jy = false, jz = false, rho = false; - } - - void disableP() - { - px = false, py = false, pz = false, ke = false; - } - - void disableTD() //Stress diagonal - { - txx = false, tyy = false, tzz = false; - } - - void disableTOD() //Stress off-diagonal - { - tyz = false, tzx = false, txy = false; - } - void resetToDefaults() - { - jx = true, jy = true, jz = true, rho = true; - px = true, py = true, pz = true, ke = true; - txx = true, tyy = true, tzz = true; - tyz = true, tzx = true, txy = true; - } - - bool enabledJ() - { - return jx && jy && jz; - } - - bool enabledP() - { - return px && py && pz; - } - - bool enabledTD() - { - return txx && tyy && tzz; - } - - bool enabledTOD() - { - return tyz && tzx && txy; - } -}; -#endif - typedef FileIO FILETYPE; const uint32_t all (0xffffffff); @@ -266,6 +128,16 @@ class vpic_simulation { int advance( void ); void finalize( void ); + // TODO: decide if I should collapse this to an enum + // An enum would stop these ifdefs being so leaky + void enable_binary_dump(); +#ifdef VPIC_ENABLE_HDF5 + void enable_hdf5_dump(); +#endif +#ifdef VPIC_ENABLE_OPENPMD + void enable_openpmd_dump(); +#endif + protected: // Directly initialized by user @@ -389,12 +261,6 @@ class vpic_simulation { // a smart ptr will save us from the small leak std::unique_ptr dump_strategy; -#ifdef VPIC_ENABLE_HDF5 - // Declare vars to use - hydro_dump_flag_t hydro_dump_flag; - field_dump_flag_t field_dump_flag; -#endif - // convenience functions for simlog output void create_field_list(char * strlist, DumpParameters & dumpParams); void create_hydro_list(char * strlist, DumpParameters & dumpParams); From a90666677cd2e898606468fe58eb8e63d5130fed Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 12 Nov 2019 12:44:53 -0700 Subject: [PATCH 74/95] get pmd backend working too --- sample/harrisOpenPMD | 9 ++++++--- src/vpic/dump_strategy.h | 2 +- 2 files 
changed, 7 insertions(+), 4 deletions(-) diff --git a/sample/harrisOpenPMD b/sample/harrisOpenPMD index 16abb432..5b3274dc 100644 --- a/sample/harrisOpenPMD +++ b/sample/harrisOpenPMD @@ -42,6 +42,9 @@ begin_globals { }; begin_initialization { + + enable_openpmd_dump(); + // At this point, there is an empty grid and the random number generator is // seeded with the rank. The grid, materials, species need to be defined. // Then the initial non-zero fields need to be loaded at time level 0 and the @@ -370,8 +373,8 @@ begin_diagnostics { std::string openpm_field_name = "fields.h5"; //std::string openpm_field_name = "fields.bp"; - if( step()==-10 ) dump_fields_openpmd(openpm_field_name.c_str()); // Get first valid total J - if( should_dump(fields) ) dump_fields_openpmd(openpm_field_name.c_str()); + if( step()==-10 ) dump_fields(openpm_field_name.c_str()); // Get first valid total J + if( should_dump(fields) ) dump_fields(openpm_field_name.c_str()); // Hydro dumps store particle charge density, current density and // stress-energy tensor. All these quantities are known at the time @@ -390,7 +393,7 @@ begin_diagnostics { // are tagged with step(). However, if a "0" is added to the call, the // filename will not be tagged. Particle dumps are in a binary format. // Each rank makes a particle dump. 
- if( should_dump(eparticle) ) dump_particles_openpmd("electron","eparticle"); + if( should_dump(eparticle) ) dump_particles("electron","eparticle"); //if( should_dump(iparticle) ) dump_particles_hdf5("ion", "iparticle"); // A checkpt is made by calling checkpt( fbase, tag ) where fname is a string diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 08fe6e60..be22689f 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -1123,7 +1123,7 @@ class HDF5Dump : public Dump_Strategy { #ifdef VPIC_ENABLE_OPENPMD class OpenPMDDump : public Dump_Strategy { public: - static openPMD::Series* series; + openPMD::Series* series; using Dump_Strategy::Dump_Strategy; // inherit constructor void dump_fields( const char *fbase, From 8fe8ace50c54ae6a7a52279d492ddd85fb93b88a Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 12 Nov 2019 16:49:02 -0700 Subject: [PATCH 75/95] Adding explicitly deleted constructors to better follow rule of 3. Adding missing header too --- src/vpic/dump_strategy.h | 1 + src/vpic/vpic.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index be22689f..aaf0bda6 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -3,6 +3,7 @@ #include #include +#include #include // TODO: it would be good if this didn't have to know about MPI diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index 118567d0..1513de41 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -123,6 +123,9 @@ class vpic_simulation { public: vpic_simulation(); ~vpic_simulation(); + vpic_simulation(const vpic_simulation&) = delete; + vpic_simulation& operator=(const vpic_simulation&) = delete; + void initialize( int argc, char **argv ); void modify( const char *fname ); int advance( void ); From 79faebb9247a8d80fb76cb3ff5d93fd219bfe06e Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 12 Nov 2019 18:00:30 -0700 Subject: [PATCH 76/95] remove needless call to copy constructor in unit tests 
--- test/unit/energy_comparison/3d_test.cc | 2 +- test/unit/energy_comparison/weibel_driver.cc | 2 +- test/unit/grid_heating/gridHeatingTestElec.cxx | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/unit/energy_comparison/3d_test.cc b/test/unit/energy_comparison/3d_test.cc index 4c44736b..d84e4627 100644 --- a/test/unit/energy_comparison/3d_test.cc +++ b/test/unit/energy_comparison/3d_test.cc @@ -312,7 +312,7 @@ TEST_CASE( "Check if Weibel gives correct energy (within tol)", "[energy]" ) ofs.close(); // Init and run sim - vpic_simulation simulation = vpic_simulation(); + vpic_simulation simulation; // TODO: We should do this in a safer manner simulation.initialize( 0, NULL ); diff --git a/test/unit/energy_comparison/weibel_driver.cc b/test/unit/energy_comparison/weibel_driver.cc index eb4702db..cbc64e2c 100644 --- a/test/unit/energy_comparison/weibel_driver.cc +++ b/test/unit/energy_comparison/weibel_driver.cc @@ -310,7 +310,7 @@ TEST_CASE( "Check if Weibel gives correct energy (within tol)", "[energy]" ) ofs.close(); // Init and run sim - vpic_simulation simulation = vpic_simulation(); + vpic_simulation simulation; // TODO: We should do this in a safer manner simulation.initialize( 0, NULL ); diff --git a/test/unit/grid_heating/gridHeatingTestElec.cxx b/test/unit/grid_heating/gridHeatingTestElec.cxx index dc0e1ca5..9e804588 100644 --- a/test/unit/grid_heating/gridHeatingTestElec.cxx +++ b/test/unit/grid_heating/gridHeatingTestElec.cxx @@ -249,7 +249,7 @@ begin_initialization { TEST_CASE( "Check if Weibel gives correct energy (within tol)", "[energy]" ) { // Init and run sim - vpic_simulation simulation = vpic_simulation(); + vpic_simulation simulation; // TODO: We should do this in a safer manner simulation.initialize( 0, NULL ); From 37af4279abd3d7c494b3b74db8174f3b91002c7c Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Thu, 14 Nov 2019 08:49:57 -0700 Subject: [PATCH 77/95] change pmd series object to be a stack object --- 
src/vpic/dump_strategy.h | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index aaf0bda6..2a9356dd 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -1124,7 +1124,7 @@ class HDF5Dump : public Dump_Strategy { #ifdef VPIC_ENABLE_OPENPMD class OpenPMDDump : public Dump_Strategy { public: - openPMD::Series* series; + //openPMD::Series* series; using Dump_Strategy::Dump_Strategy; // inherit constructor void dump_fields( const char *fbase, @@ -1136,17 +1136,17 @@ class OpenPMDDump : public Dump_Strategy { { std::cout << "Writing openPMD data" << std::endl; - if (series == nullptr) { + //if (series == nullptr) { std::cout << "init series" << std::endl; - series = new openPMD::Series( + openPMD::Series series = openPMD::Series( fbase, openPMD::AccessType::CREATE, MPI_COMM_WORLD - ); - } + ); + //} std::cout << "Writing itration " << step << std::endl; - auto i = series->iterations[ step ]; + auto i = series.iterations[ step ]; // TODO: it would be nice to set these... 
//series.setAuthor( "Axel Huebl "); //series.setMachine( "Hall Probe 5000, Model 3"); @@ -1175,7 +1175,7 @@ class OpenPMDDump : public Dump_Strategy { size_t gnx = (grid->nx * grid->gpx); size_t gny = (grid->ny * grid->gpy); size_t gnz = (grid->nz * grid->gpz); - openPMD::Extent global_extent = {gny, gny, gnz}; + openPMD::Extent global_extent = {gnx, gny, gnz}; openPMD::Datatype datatype = openPMD::determineDatatype(); openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); @@ -1208,6 +1208,12 @@ class OpenPMDDump : public Dump_Strategy { openPMD::Offset chunk_offset = {global_offset_x, global_offset_y, global_offset_z}; openPMD::Extent chunk_extent = {nx, ny, nz}; + std::cout << "Local offset " << + " x: " << global_offset_x << + " y: " << global_offset_y << + " z: " << global_offset_z << + std::endl; + // Store a local copy of the data which we pull out of the AoS std::vector cbx_data; std::vector cby_data; @@ -1276,7 +1282,7 @@ class OpenPMDDump : public Dump_Strategy { Jy.storeChunk( jy_data, chunk_offset, chunk_extent); Jz.storeChunk( jz_data, chunk_offset, chunk_extent); - series->flush(); + series.flush(); } void dump_particles( const char *fbase, @@ -1287,16 +1293,16 @@ class OpenPMDDump : public Dump_Strategy { int ftag ) { - if (series == nullptr) { + //if (series == nullptr) { std::cout << "init series" << std::endl; - series = new openPMD::Series( + openPMD::Series series = openPMD::Series( fbase, openPMD::AccessType::CREATE, MPI_COMM_WORLD ); - } + //} - auto i = series->iterations[ step ]; + auto i = series.iterations[ step ]; // TODO: set these i.setTime( (float)step ); @@ -1356,6 +1362,7 @@ class OpenPMDDump : public Dump_Strategy { } + series.flush(); } void dump_hydro( const char *fbase, From 208729730c8c5dce23cac8f1444f24918e78f596 Mon Sep 17 00:00:00 2001 From: Robert Francis Bird - 294511 Date: Wed, 15 Jan 2020 11:43:05 -0700 Subject: [PATCH 78/95] fix bug where vpic_simulation class variables were not inited. 
If a deck forgot to do it we ran with UB --- src/vpic/vpic.cc | 11 ++++---- src/vpic/vpic.h | 69 ++++++++++++++++++++++++++---------------------- 2 files changed, 43 insertions(+), 37 deletions(-) diff --git a/src/vpic/vpic.cc b/src/vpic/vpic.cc index 9d36ff3e..cc081358 100644 --- a/src/vpic/vpic.cc +++ b/src/vpic/vpic.cc @@ -78,11 +78,12 @@ vpic_simulation::vpic_simulation() // Is this just trying to 0 initialize everything? // CLEAR( this, 1 ); - /* Set non-zero defaults */ - verbose = 1; - num_comm_round = 3; - num_div_e_round = 2; - num_div_b_round = 2; + // Now done in the class def / header + ///* Set non-zero defaults */ + //verbose = 1; + //num_comm_round = 3; + //num_div_e_round = 2; + //num_div_b_round = 2; #if defined(VPIC_USE_PTHREADS) // Pthreads case. int n_rng = serial.n_pipeline; diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index 1513de41..615b6191 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -145,50 +145,55 @@ class vpic_simulation { // Directly initialized by user - int verbose; // Should system be verbose - int num_step; // Number of steps to take - int num_comm_round; // Num comm round - int status_interval; // How often to print status messages - int clean_div_e_interval; // How often to clean div e - int num_div_e_round; // How many clean div e rounds per div e interval - int clean_div_b_interval; // How often to clean div b - int num_div_b_round; // How many clean div b rounds per div b interval - int sync_shared_interval; // How often to synchronize shared faces + int verbose = 1; // Should system be verbose + int num_step = 1; // Number of steps to take + int num_comm_round = 3; // Num comm round + int status_interval = 0; // How often to print status messages + + int clean_div_e_interval = 0; // How often to clean div e + int num_div_e_round = 2; // How many clean div e rounds per div e interval + + int clean_div_b_interval = 0; // How often to clean div b + int num_div_b_round = 2; // How many clean div b rounds per div b 
interval + + int sync_shared_interval = 0; // How often to synchronize shared faces // FIXME: THESE INTERVALS SHOULDN'T BE PART OF vpic_simulation // THE BIG LIST FOLLOWING IT SHOULD BE CLEANED UP TOO - double quota; - int checkpt_interval; - int hydro_interval; - int field_interval; - int particle_interval; + double quota = 0; + int checkpt_interval = 0; + int hydro_interval = 0; + int field_interval = 0; + int particle_interval = 0; // TODO: these can probably now be removed, as they should only be used by dump? // TODO: check if any decks used them //size_t nxout, nyout, nzout; //float dxout, dyout, dzout; - size_t px, py, pz; - - int ndfld; - int ndhyd; - int ndpar; - int ndhis; - int ndgrd; - int head_option; - int istride; - int jstride; - int kstride; - int stride_option; - int pstride; - int nprobe; + size_t px = 0; + size_t py = 0; + size_t pz = 0; + + int ndfld = 0; + int ndhyd = 0; + int ndpar = 0; + int ndhis = 0; + int ndgrd = 0; + int head_option = 0; + int istride = 0; + int jstride = 0; + int kstride = 0; + int stride_option = 0; + int pstride = 0; + int nprobe = 0; int ijkprobe[NVARHISMX][4]; float xyzprobe[NVARHISMX][3]; - int block_dump; - int stepdigit; - int rankdigit; - int ifenergies; + int block_dump = 0; + int stepdigit = 0; + int rankdigit = 0; + int ifenergies = 0; // Helper initialized by user From 6a753346f5b4edd6dacf901c5ae132105d989575 Mon Sep 17 00:00:00 2001 From: Robert Francis Bird - 294511 Date: Wed, 15 Jan 2020 11:52:20 -0700 Subject: [PATCH 79/95] change dump_Strategy constructor interface to not take num_steps, as it's not known when it's currently constructed. 
I instead now pass it in after user init, but the interface is a mess --- deck/main.cc | 6 ++++++ src/vpic/dump.cc | 6 +++--- src/vpic/dump_strategy.h | 7 +++---- src/vpic/vpic.h | 12 ++++++++---- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/deck/main.cc b/deck/main.cc index f9f7fb1b..8a9d2352 100644 --- a/deck/main.cc +++ b/deck/main.cc @@ -98,6 +98,12 @@ int main(int argc, char** argv) } simulation = new vpic_simulation(); simulation->initialize( argc, argv ); + + // do post init setup to consume deck values + // which includes setting dump starts steps, as we didn't know it sooner + // TODO: make this use sane functions + simulation->dump_strategy->num_step = simulation->num_step; + REGISTER_OBJECT( &simulation, checkpt_main, restore_main, NULL ); } diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 9ac120a2..46c5ad02 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -66,20 +66,20 @@ int vpic_simulation::dump_cwd(char * dname, size_t size) { *****************************************************************************/ void vpic_simulation::enable_binary_dump() { - dump_strategy = std::unique_ptr(new BinaryDump( rank(), nproc(), num_step )); + dump_strategy = std::unique_ptr(new BinaryDump( rank(), nproc() )); } #ifdef VPIC_ENABLE_HDF5 void vpic_simulation::enable_hdf5_dump() { std::cout << "Enabling HDF5 IO backend" << std::endl; - dump_strategy = std::unique_ptr(new HDF5Dump( rank(), nproc(), num_step )); + dump_strategy = std::unique_ptr(new HDF5Dump( rank(), nproc() )); } #endif #ifdef VPIC_ENABLE_OPENPMD void vpic_simulation::enable_openpmd_dump() { std::cout << "Enabling openPMD IO backend" << std::endl; - dump_strategy = std::unique_ptr(new OpenPMDDump( rank(), nproc(), num_step )); + dump_strategy = std::unique_ptr(new OpenPMDDump( rank(), nproc() )); } #endif diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 2a9356dd..f603cb34 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -37,10 
+37,9 @@ class Dump_Strategy { public: int rank, nproc, num_step; - Dump_Strategy(int _rank, int _nproc, int total_steps) : + Dump_Strategy(int _rank, int _nproc ) : rank(_rank), - nproc(_nproc), - num_step(total_steps) // TODO: remove the need for this + nproc(_nproc) { } // empty virtual ~Dump_Strategy() { }; @@ -74,7 +73,7 @@ class Dump_Strategy { class BinaryDump : public Dump_Strategy { public: using Dump_Strategy::Dump_Strategy; // inherit constructor - BinaryDump(int _rank, int _nproc, int total_steps) : Dump_Strategy(_rank, _nproc, total_steps){ } // empty + //BinaryDump(int _rank, int _nproc ) : Dump_Strategy(_rank, _nproc ){ } // empty // TODO: now we pass rank and step, ftag has odd semanticds void dump_fields( diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index 615b6191..bbc75235 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -141,12 +141,19 @@ class vpic_simulation { void enable_openpmd_dump(); #endif + // TODO: remake these protected + + // Very likely a user will forgot to delete this if they change the strategy, + // a smart ptr will save us from the small leak + std::unique_ptr dump_strategy; + + int num_step = 1; // Number of steps to take + protected: // Directly initialized by user int verbose = 1; // Should system be verbose - int num_step = 1; // Number of steps to take int num_comm_round = 3; // Num comm round int status_interval = 0; // How often to print status messages @@ -265,9 +272,6 @@ class vpic_simulation { void dump_particles( const char *sp_name, const char *fbase, int fname_tag = 1 ); - // Very likely a user will forgot to delete this if they change the strategy, - // a smart ptr will save us from the small leak - std::unique_ptr dump_strategy; // convenience functions for simlog output void create_field_list(char * strlist, DumpParameters & dumpParams); From 831b774a72d76d308cd5217c7af4d1719f55651c Mon Sep 17 00:00:00 2001 From: Robert Francis Bird - 294511 Date: Wed, 15 Jan 2020 15:04:14 -0700 Subject: [PATCH 80/95] port 
H5_ALL bug found by Patrick, and replace with linearspace --- src/vpic/dump_strategy.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index f603cb34..014ab0d8 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -650,9 +650,8 @@ class HDF5Dump : public Dump_Strategy { hsize_t memspace_count_temp = numparticles * 8; hid_t memspace = H5Screate_simple(1, &memspace_count_temp, NULL); - // Don't need, can just use H5S_ALL - //hsize_t linearspace_count_temp = numparticles; - //hid_t linearspace = H5Screate_simple(1, &linearspace_count_temp, NULL); + hsize_t linearspace_count_temp = numparticles; + hid_t linearspace = H5Screate_simple(1, &linearspace_count_temp, NULL); plist_id = H5Pcreate(H5P_DATASET_XFER); @@ -717,7 +716,7 @@ class HDF5Dump : public Dump_Strategy { } dset_id = H5Dcreate(group_id, "i", H5T_NATIVE_INT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, H5S_ALL, filespace, plist_id, global_pi.data()); + ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, linearspace, filespace, plist_id, global_pi.data()); H5Dclose(dset_id); #else From 2e108b95229e6101e8b105f7eb24f56faead23d6 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 21 Jan 2020 11:45:55 -0700 Subject: [PATCH 81/95] added particle and field dump code for openpmd --- src/vpic/dump_strategy.h | 158 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 157 insertions(+), 1 deletion(-) diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 014ab0d8..df3be2a8 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -1153,6 +1153,11 @@ class OpenPMDDump : public Dump_Strategy { auto cB = i.meshes["B"]; auto E = i.meshes["E"]; auto J = i.meshes["J"]; + auto Tca = i.meshes["Tca"]; + auto Emat = i.meshes["Emat"]; + auto Fmat = i.meshes["Fmat"]; + auto Rho = i.meshes["Rho"]; + auto DivErr = i.meshes["DivErr"]; // record components auto cbx = 
cB["x"]; @@ -1167,8 +1172,28 @@ class OpenPMDDump : public Dump_Strategy { auto Jy = J["y"]; auto Jz = J["z"]; + auto Tcax = Tca["x"]; + auto Tcay = Tca["y"]; + auto Tcaz = Tca["z"]; + + auto Ematx = Emat["x"]; + auto Ematy = Emat["y"]; + auto Ematz = Emat["z"]; + + auto Fmatx = Fmat["x"]; + auto Fmaty = Fmat["y"]; + auto Fmatz = Fmat["z"]; + + auto RhoB = Rho["B"]; + auto RhoF = Rho["F"]; + + auto DivEErr = DivErr["E"]; + auto DivBErr = DivErr["B"]; + // TODO: set unitDimension so the anaylsis software knows what fields // things are + // + // // TODO: add timers for the convert and for the write size_t gnx = (grid->nx * grid->gpx); size_t gny = (grid->ny * grid->gpy); @@ -1190,6 +1215,24 @@ class OpenPMDDump : public Dump_Strategy { Jy.resetDataset(dataset); Jz.resetDataset(dataset); + Tcax.resetDataset(dataset); + Tcay.resetDataset(dataset); + Tcaz.resetDataset(dataset); + + Ematx.resetDataset(dataset); + Ematy.resetDataset(dataset); + Ematz.resetDataset(dataset); + + Fmatx.resetDataset(dataset); + Fmaty.resetDataset(dataset); + Fmatz.resetDataset(dataset); + + RhoB.resetDataset(dataset); + RhoF.resetDataset(dataset); + + DivEErr.resetDataset(dataset); + DivBErr.resetDataset(dataset); + // Convert rank to local x/y/z int rx, ry, rz; UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); @@ -1225,6 +1268,26 @@ class OpenPMDDump : public Dump_Strategy { std::vector jy_data; std::vector jz_data; + std::vector tcax_data; + std::vector tcay_data; + std::vector tcaz_data; + + // TODO: these are material_id (ints not floats) + std::vector ematx_data; + std::vector ematy_data; + std::vector ematz_data; + + std::vector fmatx_data; + std::vector fmaty_data; + std::vector fmatz_data; + // end todo + + std::vector rhob_data; + std::vector rhof_data; + + std::vector divb_data; + std::vector dive_data; + size_t nv = nx * ny * nz; cbx_data.reserve(nv); @@ -1239,6 +1302,24 @@ class OpenPMDDump : public Dump_Strategy { jy_data.reserve(nv); jz_data.reserve(nv); + 
tcax_data.reserve(nv); + tcay_data.reserve(nv); + tcaz_data.reserve(nv); + + ematx_data.reserve(nv); + ematy_data.reserve(nv); + ematz_data.reserve(nv); + + fmatx_data.reserve(nv); + fmaty_data.reserve(nv); + fmatz_data.reserve(nv); + + rhob_data.reserve(nv); + rhof_data.reserve(nv); + + divb_data.reserve(nv); + dive_data.reserve(nv); + // TODO: make this AoS to SoA conversion a function // We could do 1D here, but we don't really care about the ghosts, and we @@ -1264,6 +1345,24 @@ class OpenPMDDump : public Dump_Strategy { jx_data[local_index] = field_array->f[global_index].jfx; jy_data[local_index] = field_array->f[global_index].jfy; jz_data[local_index] = field_array->f[global_index].jfz; + + tcax_data[local_index] = field_array->f[global_index].tcax; + tcay_data[local_index] = field_array->f[global_index].tcay; + tcaz_data[local_index] = field_array->f[global_index].tcaz; + + ematx_data[local_index] = field_array->f[global_index].ematx; + ematy_data[local_index] = field_array->f[global_index].ematy; + ematz_data[local_index] = field_array->f[global_index].ematz; + + fmatx_data[local_index] = field_array->f[global_index].fmatx; + fmaty_data[local_index] = field_array->f[global_index].fmaty; + fmatz_data[local_index] = field_array->f[global_index].fmatz; + + rhob_data[local_index] = field_array->f[global_index].rhob; + rhof_data[local_index] = field_array->f[global_index].rhof; + + dive_data[local_index] = field_array->f[global_index].dive; + divb_data[local_index] = field_array->f[global_index].divb; } } } @@ -1280,8 +1379,27 @@ class OpenPMDDump : public Dump_Strategy { Jy.storeChunk( jy_data, chunk_offset, chunk_extent); Jz.storeChunk( jz_data, chunk_offset, chunk_extent); + Tcax.storeChunk( tcax_data, chunk_offset, chunk_extent); + Tcay.storeChunk( tcay_data, chunk_offset, chunk_extent); + Tcaz.storeChunk( tcaz_data, chunk_offset, chunk_extent); + + Ematx.storeChunk( ematx_data, chunk_offset, chunk_extent); + Ematy.storeChunk( ematy_data, chunk_offset, 
chunk_extent); + Ematz.storeChunk( ematz_data, chunk_offset, chunk_extent); + + Fmatx.storeChunk( fmatx_data, chunk_offset, chunk_extent); + Fmaty.storeChunk( fmaty_data, chunk_offset, chunk_extent); + Fmatz.storeChunk( fmatz_data, chunk_offset, chunk_extent); + + RhoB.storeChunk( rhob_data, chunk_offset, chunk_extent); + RhoF.storeChunk( rhof, chunk_offset, chunk_extent); + + DivEErr.storeChunk( dive_data, chunk_offset, chunk_extent); + DivBErr.storeChunk( divb_data, chunk_offset, chunk_extent); + series.flush(); } + void dump_particles( const char *fbase, species_t* sp, @@ -1343,13 +1461,42 @@ class OpenPMDDump : public Dump_Strategy { x_pos.reserve(to_write); x_off.reserve(to_write); + std::vector y_pos; + std::vector y_off; + y_pos.reserve(to_write); + y_off.reserve(to_write); + + std::vector z_pos; + std::vector z_off; + z_pos.reserve(to_write); + z_off.reserve(to_write); + + std::vector ux_pos; + ux_pos.reserve(to_write); + + std::vector uy_pos; + uy_pos.reserve(to_write); + + std::vector uz_pos; + uz_pos.reserve(to_write); + for (int j = 0; j < to_write; j++) { // TODO: do I need to center the particles? 
auto& particle = sp->p[i+j]; + x_pos[j] = particle.dx; + y_pos[j] = particle.dy; + z_pos[j] = particle.dz; + + ux_pos[j] = particle.ux; + uy_pos[j] = particle.uy; + uz_pos[j] = particle.uz; + std::array gi = global_particle_index(particle.i, grid, rank); x_off[j] = (float)gi[1]; + y_off[j] = (float)gi[2]; + z_off[j] = (float)gi[3]; } // Base offset plus i to account for chunks @@ -1357,8 +1504,17 @@ class OpenPMDDump : public Dump_Strategy { auto e = openPMD::Extent{to_write}; px.storeChunk(x_pos, o, e); pxo.storeChunk(x_off, o, e); - } + py.storeChunk(y_pos, o, e); + pyo.storeChunk(y_off, o, e); + + pz.storeChunk(z_pos, o, e); + pzo.storeChunk(z_off, o, e); + + ux.storeChunk(ux_pos, o, e); + uy.storeChunk(uy_pos, o, e); + uz.storeChunk(uz_pos, o, e); + } series.flush(); } From 5abef64053efae9cbdff5b37ac1f6d1fff6f9b23 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 21 Jan 2020 11:46:34 -0700 Subject: [PATCH 82/95] add note on compile time option --- src/vpic/dump_strategy.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 2a9356dd..97dadfe5 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -682,8 +682,10 @@ class HDF5Dump : public Dump_Strategy { ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 2); H5Dclose(dset_id); + // TODO: make this a compile time option #define OUTPUT_CONVERT_GLOBAL_ID 1 #ifdef OUTPUT_CONVERT_GLOBAL_ID + // TODO: make a function out of this too, its used in openpmd std::vector global_pi; global_pi.reserve(numparticles); From 6e16fa001eb82174f5ce85f95c1e2fd7af21c6a4 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 21 Jan 2020 11:50:49 -0700 Subject: [PATCH 83/95] merge local code and fix up compilation of new openpmd code --- src/vpic/dump_strategy.h | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 9eaec6d9..65447c04 
100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -1363,8 +1363,8 @@ class OpenPMDDump : public Dump_Strategy { rhob_data[local_index] = field_array->f[global_index].rhob; rhof_data[local_index] = field_array->f[global_index].rhof; - dive_data[local_index] = field_array->f[global_index].dive; - divb_data[local_index] = field_array->f[global_index].divb; + dive_data[local_index] = field_array->f[global_index].div_e_err; + divb_data[local_index] = field_array->f[global_index].div_b_err; } } } @@ -1394,7 +1394,7 @@ class OpenPMDDump : public Dump_Strategy { Fmatz.storeChunk( fmatz_data, chunk_offset, chunk_extent); RhoB.storeChunk( rhob_data, chunk_offset, chunk_extent); - RhoF.storeChunk( rhof, chunk_offset, chunk_extent); + RhoF.storeChunk( rhof_data, chunk_offset, chunk_extent); DivEErr.storeChunk( dive_data, chunk_offset, chunk_extent); DivBErr.storeChunk( divb_data, chunk_offset, chunk_extent); @@ -1445,9 +1445,29 @@ class OpenPMDDump : public Dump_Strategy { auto px = p["position"]["x"]; auto pxo = p["positionOffset"]["x"]; + auto py = p["position"]["y"]; + auto pyo = p["positionOffset"]["y"]; + + auto pz = p["position"]["z"]; + auto pzo = p["positionOffset"]["z"]; + + auto ux = p["velocity"]["x"]; + auto uy = p["velocity"]["y"]; + auto uz = p["velocity"]["z"]; + px.resetDataset(dataset); pxo.resetDataset(dataset); + py.resetDataset(dataset); + pyo.resetDataset(dataset); + + pz.resetDataset(dataset); + pzo.resetDataset(dataset); + + ux.resetDataset(dataset); + uy.resetDataset(dataset); + uz.resetDataset(dataset); + // convert data to SoA, allowing the user to chunk the operation const int max_chunk = 32768*8; // 1MB SoA // Loop over all particles in chunks From 3a2e2cc4c0466ad63543db9d86b63cbaccaf8e28 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 21 Jan 2020 12:15:07 -0700 Subject: [PATCH 84/95] particles and fields both pass pmd validator, with no errors. 
Some warnings (such as author) need fixing --- sample/harrisOpenPMD | 2 ++ src/vpic/dump.cc | 2 +- src/vpic/dump_strategy.h | 14 ++++++++++---- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/sample/harrisOpenPMD b/sample/harrisOpenPMD index 5b3274dc..568710d9 100644 --- a/sample/harrisOpenPMD +++ b/sample/harrisOpenPMD @@ -371,6 +371,8 @@ begin_diagnostics { // been completed. Field dumps are in a binary format. Each rank makes a // field dump. + // TODO: passing in the field extension as part of the name doesn't work for + // the other functions, as they use it to look up species std::string openpm_field_name = "fields.h5"; //std::string openpm_field_name = "fields.bp"; if( step()==-10 ) dump_fields(openpm_field_name.c_str()); // Get first valid total J diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc index 46c5ad02..65d15910 100644 --- a/src/vpic/dump.cc +++ b/src/vpic/dump.cc @@ -87,7 +87,7 @@ void vpic_simulation::dump_particles( const char *sp_name, const char *fbase, int ftag ) { - species_t * sp = find_species_name(sp_name, species_list); + species_t* sp = find_species_name(sp_name, species_list); dump_strategy->dump_particles( fbase, sp, diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 65447c04..5efa5c4f 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -1145,7 +1145,7 @@ class OpenPMDDump : public Dump_Strategy { ); //} - std::cout << "Writing itration " << step << std::endl; + std::cout << "Writing iteration " << step << std::endl; auto i = series.iterations[ step ]; // TODO: it would be nice to set these... 
//series.setAuthor( "Axel Huebl "); @@ -1411,13 +1411,19 @@ class OpenPMDDump : public Dump_Strategy { int ftag ) { + // TODO: move this to class level, and make it user settable, so it + // can be used more widely + std::string file_type = ".h5"; + std::string full_file_name = fbase + file_type; + + std::cout << "writing particles to " << full_file_name << std::endl; + //if (series == nullptr) { - std::cout << "init series" << std::endl; openPMD::Series series = openPMD::Series( - fbase, + full_file_name, openPMD::AccessType::CREATE, MPI_COMM_WORLD - ); + ); //} auto i = series.iterations[ step ]; From f8b45c83c5191e293ba039cfcfc21c1bd0324766 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 21 Jan 2020 12:34:44 -0700 Subject: [PATCH 85/95] first pass adding hydro dump for openpmd --- sample/harrisOpenPMD | 2 +- src/vpic/dump_strategy.h | 221 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 213 insertions(+), 10 deletions(-) diff --git a/sample/harrisOpenPMD b/sample/harrisOpenPMD index 568710d9..fdddccab 100644 --- a/sample/harrisOpenPMD +++ b/sample/harrisOpenPMD @@ -387,7 +387,7 @@ begin_diagnostics { // purely diagnostic. It is not used by the simulation and it is not // accumulated using a self-consistent charge-conserving method. Hydro dumps // are in a binary format. Each rank makes a hydro dump. - //if(should_dump(ehydro) ) dump_hydro_hdf5("electron","ehydro"); + if(should_dump(ehydro) ) dump_hydro("electron","ehydro"); //if( should_dump(ihydro) ) dump_hydro_hdf5("ion", "ihydro"); // Particle dumps store the particle data for a given species. 
The data diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 5efa5c4f..56aacb4e 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -1162,9 +1162,9 @@ class OpenPMDDump : public Dump_Strategy { auto DivErr = i.meshes["DivErr"]; // record components - auto cbx = cB["x"]; - auto cby = cB["y"]; - auto cbz = cB["z"]; + auto Cbx = cB["x"]; + auto Cby = cB["y"]; + auto Cbz = cB["z"]; auto Ex = E["x"]; auto Ey = E["y"]; @@ -1205,9 +1205,9 @@ class OpenPMDDump : public Dump_Strategy { openPMD::Datatype datatype = openPMD::determineDatatype(); openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); - cbx.resetDataset(dataset); - cby.resetDataset(dataset); - cbz.resetDataset(dataset); + Cbx.resetDataset(dataset); + Cby.resetDataset(dataset); + Cbz.resetDataset(dataset); Ex.resetDataset(dataset); Ey.resetDataset(dataset); @@ -1235,6 +1235,7 @@ class OpenPMDDump : public Dump_Strategy { DivEErr.resetDataset(dataset); DivBErr.resetDataset(dataset); + // TODO: hoist this conversion code, as is it used elsewhere // Convert rank to local x/y/z int rx, ry, rz; UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); @@ -1369,9 +1370,9 @@ class OpenPMDDump : public Dump_Strategy { } } - cbx.storeChunk( cbx_data, chunk_offset, chunk_extent); - cby.storeChunk( cby_data, chunk_offset, chunk_extent); - cbz.storeChunk( cbz_data, chunk_offset, chunk_extent); + Cbx.storeChunk( cbx_data, chunk_offset, chunk_extent); + Cby.storeChunk( cby_data, chunk_offset, chunk_extent); + Cbz.storeChunk( cbz_data, chunk_offset, chunk_extent); Ex.storeChunk( ex_data, chunk_offset, chunk_extent); Ey.storeChunk( ey_data, chunk_offset, chunk_extent); @@ -1556,6 +1557,208 @@ class OpenPMDDump : public Dump_Strategy { int ftag ) { + // TODO: move this to class level, and make it user settable, so it + // can be used more widely + std::string file_type = ".h5"; + std::string full_file_name = fbase + file_type; + + std::cout << "OpenPMD dumping hydro to " << 
full_file_name << std::endl; + + //if (series == nullptr) { + openPMD::Series series = openPMD::Series( + full_file_name, + openPMD::AccessType::CREATE, + MPI_COMM_WORLD + ); + //} + + auto i = series.iterations[ step ]; + + // TODO: set these + i.setTime( (float)step ); + i.setDt(1.0); + i.setTimeUnitSI(1.0); + + if( !sp ) ERROR(( "Invalid species \"%s\"", sp->name )); + + // TODO: do we want each backend to have to explicitly call these + // manually? Or, as it is common, should we hoist it to the VPIC + // call-site + clear_hydro_array( hydro_array ); + accumulate_hydro_p( hydro_array, sp, interpolator_array ); + synchronize_hydro_array( hydro_array ); + + if( !fbase ) ERROR(( "Invalid filename" )); + + if( rank==0 ) + MESSAGE(("Dumping \"%s\" hydro fields to \"%s\"",sp->name,fbase)); + + // Write data + //float jx, jy, jz, rho; // Current and charge density => , + //float px, py, pz, ke; // Momentum and K.E. density => , + //float txx, tyy, tzz; // Stress diagonal => , i==j + //float tyz, tzx, txy; // Stress off-diagonal => , i!=j + auto J = i.meshes["J"]; + auto P = i.meshes["P"]; + auto T = i.meshes["T"]; + auto _Ke = i.meshes["Ke"]; + auto _Rho = i.meshes["Rho"]; + + auto Jx = J["x"]; + auto Jy = J["y"]; + auto Jz = J["z"]; + + auto Px = P["x"]; + auto Py = P["y"]; + auto Pz = P["z"]; + + auto Txx = T["xx"]; + auto Tyy = T["yy"]; + auto Tzz = T["zz"]; + auto Tyz = T["yz"]; + auto Tzx = T["zx"]; + auto Txy = T["xy"]; + + auto Rho = _Rho["rho"]; // TODO: bad name.. + auto Ke = _Ke["ke"]; // TODO: bad name.. 
+ + size_t gnx = (grid->nx * grid->gpx); + size_t gny = (grid->ny * grid->gpy); + size_t gnz = (grid->nz * grid->gpz); + openPMD::Extent global_extent = {gnx, gny, gnz}; + + openPMD::Datatype datatype = openPMD::determineDatatype(); + openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); + + Jx.resetDataset(dataset); + Jy.resetDataset(dataset); + Jz.resetDataset(dataset); + + Px.resetDataset(dataset); + Py.resetDataset(dataset); + Pz.resetDataset(dataset); + + Txx.resetDataset(dataset); + Tyy.resetDataset(dataset); + Tzz.resetDataset(dataset); + Tyz.resetDataset(dataset); + Tzx.resetDataset(dataset); + Txy.resetDataset(dataset); + + Rho.resetDataset(dataset); + Ke.resetDataset(dataset); + + // TODO: hoist this conversion code, as is it used elsewhere + // Convert rank to local x/y/z + int rx, ry, rz; + UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + + size_t nx = grid->nx; + size_t ny = grid->ny; + size_t nz = grid->nz; + + // NOTE: this assumes a static mesh decomposition in nx/ny/nz + size_t global_offset_x = (nx) * rx; + size_t global_offset_y = (ny) * ry; + size_t global_offset_z = (nz) * rz; + + openPMD::Offset chunk_offset = {global_offset_x, global_offset_y, global_offset_z}; + openPMD::Extent chunk_extent = {nx, ny, nz}; + + std::cout << "Local offset " << + " x: " << global_offset_x << + " y: " << global_offset_y << + " z: " << global_offset_z << + std::endl; + + std::vector jx_data; + std::vector jy_data; + std::vector jz_data; + + std::vector px_data; + std::vector py_data; + std::vector pz_data; + + std::vector txx_data; + std::vector tyy_data; + std::vector tzz_data; + std::vector tyz_data; + std::vector tzx_data; + std::vector txy_data; + + std::vector rho_data; + std::vector ke_data; + + size_t nv = nx * ny * nz; + + jx_data.reserve(nv); + jy_data.reserve(nv); + jz_data.reserve(nv); + + px_data.reserve(nv); + py_data.reserve(nv); + pz_data.reserve(nv); + + txx_data.reserve(nv); + tyy_data.reserve(nv); + 
tzz_data.reserve(nv); + tyz_data.reserve(nv); + tzx_data.reserve(nv); + txy_data.reserve(nv); + + rho_data.reserve(nv); + ke_data.reserve(nv); + + // Transpose AoS to SoAs + for (size_t k = 1; k < grid->nz + 1; k++) + { + for (size_t j = 1; j < grid->ny + 1; j++) + { + for (size_t i = 1; i < grid->nx + 1; i++) + { + int local_index = VOXEL(i-1, j-1, k-1, grid->nx-2, grid->ny-2, grid->nz-2); + int global_index = VOXEL(i, j, k, grid->nx, grid->ny, grid->nz); + + jx_data[local_index] = hydro_array->h[global_index].jx; + jy_data[local_index] = hydro_array->h[global_index].jy; + jz_data[local_index] = hydro_array->h[global_index].jz; + + px_data[local_index] = hydro_array->h[global_index].px; + py_data[local_index] = hydro_array->h[global_index].py; + pz_data[local_index] = hydro_array->h[global_index].pz; + + txx_data[local_index] = hydro_array->h[global_index].txx; + tyy_data[local_index] = hydro_array->h[global_index].tyy; + tzz_data[local_index] = hydro_array->h[global_index].tzz; + tyz_data[local_index] = hydro_array->h[global_index].tyz; + tzx_data[local_index] = hydro_array->h[global_index].tzx; + txy_data[local_index] = hydro_array->h[global_index].txy; + + rho_data[local_index] = hydro_array->h[global_index].rho; + ke_data[local_index] = hydro_array->h[global_index].ke; + } + } + } + + Jx.storeChunk( jx_data, chunk_offset, chunk_extent); + Jy.storeChunk( jy_data, chunk_offset, chunk_extent); + Jz.storeChunk( jz_data, chunk_offset, chunk_extent); + + Px.storeChunk( px_data, chunk_offset, chunk_extent); + Py.storeChunk( py_data, chunk_offset, chunk_extent); + Pz.storeChunk( pz_data, chunk_offset, chunk_extent); + + Txx.storeChunk( txx_data, chunk_offset, chunk_extent); + Tyy.storeChunk( tyy_data, chunk_offset, chunk_extent); + Tzz.storeChunk( tzz_data, chunk_offset, chunk_extent); + Tyz.storeChunk( tyz_data, chunk_offset, chunk_extent); + Tzx.storeChunk( tzx_data, chunk_offset, chunk_extent); + Txy.storeChunk( txy_data, chunk_offset, chunk_extent); + + 
Rho.storeChunk( rho_data, chunk_offset, chunk_extent); + Ke.storeChunk( ke_data, chunk_offset, chunk_extent); + + series.flush(); } }; #endif From 0ff039587ba199838eba60f067cb2e6a8fe95e4a Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Thu, 30 Jan 2020 11:14:06 -0700 Subject: [PATCH 86/95] fix vector semantic where reserve was used instead of resize and increase write chunk size to 256MB. also fix bug where pmd particles were not being flushed at proper times --- src/vpic/dump_strategy.h | 100 ++++++++++++++++++++------------------- 1 file changed, 51 insertions(+), 49 deletions(-) diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 56aacb4e..a1b96199 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -686,7 +686,7 @@ class HDF5Dump : public Dump_Strategy { // TODO: make a function out of this too, its used in openpmd std::vector global_pi; - global_pi.reserve(numparticles); + global_pi.resize(numparticles); // TODO: this could be parallel for (int i = 0; i < numparticles; i++) { @@ -1293,35 +1293,37 @@ class OpenPMDDump : public Dump_Strategy { size_t nv = nx * ny * nz; - cbx_data.reserve(nv); - cby_data.reserve(nv); - cbz_data.reserve(nv); + // TODO: resize here will zero out the data which we don't need, we + // could change to a different semantic to avoid this + cbx_data.resize(nv); + cby_data.resize(nv); + cbz_data.resize(nv); - ex_data.reserve(nv); - ey_data.reserve(nv); - ez_data.reserve(nv); + ex_data.resize(nv); + ey_data.resize(nv); + ez_data.resize(nv); - jx_data.reserve(nv); - jy_data.reserve(nv); - jz_data.reserve(nv); + jx_data.resize(nv); + jy_data.resize(nv); + jz_data.resize(nv); - tcax_data.reserve(nv); - tcay_data.reserve(nv); - tcaz_data.reserve(nv); + tcax_data.resize(nv); + tcay_data.resize(nv); + tcaz_data.resize(nv); - ematx_data.reserve(nv); - ematy_data.reserve(nv); - ematz_data.reserve(nv); + ematx_data.resize(nv); + ematy_data.resize(nv); + ematz_data.resize(nv); - fmatx_data.reserve(nv); - 
fmaty_data.reserve(nv); - fmatz_data.reserve(nv); + fmatx_data.resize(nv); + fmaty_data.resize(nv); + fmatz_data.resize(nv); - rhob_data.reserve(nv); - rhof_data.reserve(nv); + rhob_data.resize(nv); + rhof_data.resize(nv); - divb_data.reserve(nv); - dive_data.reserve(nv); + divb_data.resize(nv); + dive_data.resize(nv); // TODO: make this AoS to SoA conversion a function @@ -1476,7 +1478,7 @@ class OpenPMDDump : public Dump_Strategy { uz.resetDataset(dataset); // convert data to SoA, allowing the user to chunk the operation - const int max_chunk = 32768*8; // 1MB SoA + const int max_chunk = 32768*256*8; // 256MB SoA // Loop over all particles in chunks for (int i = 0; i < np; i += max_chunk) { @@ -1487,27 +1489,27 @@ class OpenPMDDump : public Dump_Strategy { // Convert the chunk ready to write std::vector x_pos; std::vector x_off; - x_pos.reserve(to_write); - x_off.reserve(to_write); + x_pos.resize(to_write); + x_off.resize(to_write); std::vector y_pos; std::vector y_off; - y_pos.reserve(to_write); - y_off.reserve(to_write); + y_pos.resize(to_write); + y_off.resize(to_write); std::vector z_pos; std::vector z_off; - z_pos.reserve(to_write); - z_off.reserve(to_write); + z_pos.resize(to_write); + z_off.resize(to_write); std::vector ux_pos; - ux_pos.reserve(to_write); + ux_pos.resize(to_write); std::vector uy_pos; - uy_pos.reserve(to_write); + uy_pos.resize(to_write); std::vector uz_pos; - uz_pos.reserve(to_write); + uz_pos.resize(to_write); for (int j = 0; j < to_write; j++) { @@ -1543,9 +1545,9 @@ class OpenPMDDump : public Dump_Strategy { ux.storeChunk(ux_pos, o, e); uy.storeChunk(uy_pos, o, e); uz.storeChunk(uz_pos, o, e); - } - series.flush(); + series.flush(); + } } void dump_hydro( const char *fbase, @@ -1691,23 +1693,23 @@ class OpenPMDDump : public Dump_Strategy { size_t nv = nx * ny * nz; - jx_data.reserve(nv); - jy_data.reserve(nv); - jz_data.reserve(nv); + jx_data.resize(nv); + jy_data.resize(nv); + jz_data.resize(nv); - px_data.reserve(nv); - 
py_data.reserve(nv); - pz_data.reserve(nv); + px_data.resize(nv); + py_data.resize(nv); + pz_data.resize(nv); - txx_data.reserve(nv); - tyy_data.reserve(nv); - tzz_data.reserve(nv); - tyz_data.reserve(nv); - tzx_data.reserve(nv); - txy_data.reserve(nv); + txx_data.resize(nv); + tyy_data.resize(nv); + tzz_data.resize(nv); + tyz_data.resize(nv); + tzx_data.resize(nv); + txy_data.resize(nv); - rho_data.reserve(nv); - ke_data.reserve(nv); + rho_data.resize(nv); + ke_data.resize(nv); // Transpose AoS to SoAs for (size_t k = 1; k < grid->nz + 1; k++) From c6d01c668f35686a30498e8156d21d9781cf1793 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Sun, 2 Feb 2020 11:34:20 -0700 Subject: [PATCH 87/95] add code to expose user definable max openpmd particle write chunk size --- CMakeLists.txt | 2 ++ src/vpic/dump_strategy.cc | 3 +++ src/vpic/dump_strategy.h | 10 ++++++++-- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2c912f52..8af21869 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -93,6 +93,8 @@ option(USE_OPENPMD "Enable OpenPMD for use during IO. VPIC does not help you ins # option to set minimum number of particles set(SET_MIN_NUM_PARTICLES AUTO CACHE STRING "Select minimum number of particles to use, if using dynamic particle array resizing") +# TODO: better name for this? +set(PMD_MAX_IO_CHUNK AUTO CACHE STRING "Select the maxiumum IO write size to use when writing -- applies to particles only, and is specified as number of particles. 
currently only honored by OpenPMD backend") #------------------------------------------------------------------------------# # Create include and link aggregates diff --git a/src/vpic/dump_strategy.cc b/src/vpic/dump_strategy.cc index adea2714..e03ca36c 100644 --- a/src/vpic/dump_strategy.cc +++ b/src/vpic/dump_strategy.cc @@ -57,6 +57,9 @@ void BinaryDump::dump_particles( FileIO fileIO; int dim[1], buf_start; static particle_t * ALIGNED(128) p_buf = NULL; + + // TODO: reconcile this with MAX_IO_CHUNK, and update Cmake option + // description to explain what backends use it # define PBUF_SIZE 32768 // 1MB of particles if( !sp ) ERROR(( "Invalid species name \"%s\".", sp->name )); diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index a1b96199..4e823e34 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -680,6 +680,7 @@ class HDF5Dump : public Dump_Strategy { ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 2); H5Dclose(dset_id); + // TODO: should we add the ability to chunk the particle write? 
// TODO: make this a compile time option #define OUTPUT_CONVERT_GLOBAL_ID 1 #ifdef OUTPUT_CONVERT_GLOBAL_ID @@ -1476,9 +1477,14 @@ class OpenPMDDump : public Dump_Strategy { ux.resetDataset(dataset); uy.resetDataset(dataset); uz.resetDataset(dataset); - // convert data to SoA, allowing the user to chunk the operation - const int max_chunk = 32768*256*8; // 256MB SoA + + // TODO: Add code the convert to global offsets +#ifndef PMD_MAX_IO_CHUNK // in particles +#define PMD_MAX_IO_CHUNK 16777216; // 512MB total write +#endif + const int max_chunk = PMD_MAX_IO_CHUNK; + // Loop over all particles in chunks for (int i = 0; i < np; i += max_chunk) { From fd1559a410f2c3e4682fa879984fd8b73e31d3b3 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Sun, 2 Feb 2020 11:43:26 -0700 Subject: [PATCH 88/95] add missing define for max io chunk, and hoist global id output option to cmake level --- CMakeLists.txt | 11 +++++++++++ src/vpic/dump_strategy.h | 3 +-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8af21869..ec4907a8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -90,12 +90,15 @@ option(USE_HDF5 "Enable HDF5 for use during IO. VPIC does not help you install H option(USE_OPENPMD "Enable OpenPMD for use during IO. VPIC does not help you install OpenPM" OFF) +option(OUTPUT_CONVERT_GLOBAL_ID "Convert particle cell id to be global, such that it tells you a unique global location instead of a local offset" ON) + # option to set minimum number of particles set(SET_MIN_NUM_PARTICLES AUTO CACHE STRING "Select minimum number of particles to use, if using dynamic particle array resizing") # TODO: better name for this? set(PMD_MAX_IO_CHUNK AUTO CACHE STRING "Select the maxiumum IO write size to use when writing -- applies to particles only, and is specified as number of particles. 
currently only honored by OpenPMD backend") + #------------------------------------------------------------------------------# # Create include and link aggregates # @@ -129,6 +132,10 @@ if(NOT SET_MIN_NUM_PARTICLES STREQUAL "AUTO") add_definitions(-DMIN_NP=${SET_MIN_NUM_PARTICLES}) endif() +if(NOT PMD_MAX_IO_CHUNK STREQUAL "AUTO") + add_definitions(-DPMD_MAX_IO_CHUNK=${PMD_MAX_IO_CHUNK}) +endif() + find_package(Threads REQUIRED) #------------------------------------------------------------------------------# @@ -379,6 +386,10 @@ if(USE_OPENPMD) endif() endif(USE_OPENPMD) +if(OUTPUT_CONVERT_GLOBAL_ID) + add_definitions(-DOUTPUT_CONVERT_GLOBAL_ID) +endif(OUTPUT_CONVERT_GLOBAL_ID) + macro(build_a_vpic name deck) if(NOT EXISTS ${deck}) message(FATAL_ERROR "Could not find deck '${deck}'") diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 4e823e34..0451ce55 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -681,8 +681,7 @@ class HDF5Dump : public Dump_Strategy { H5Dclose(dset_id); // TODO: should we add the ability to chunk the particle write? 
- // TODO: make this a compile time option -#define OUTPUT_CONVERT_GLOBAL_ID 1 + #ifdef OUTPUT_CONVERT_GLOBAL_ID // TODO: make a function out of this too, its used in openpmd From 2cc7b1d8a587eb53dd8f8cd243bed7411cb766f1 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Sun, 2 Feb 2020 11:51:14 -0700 Subject: [PATCH 89/95] first effort to expose user configrable file extension for openmpd --- sample/harrisOpenPMD | 4 ++++ src/vpic/dump_strategy.h | 10 ++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/sample/harrisOpenPMD b/sample/harrisOpenPMD index fdddccab..197dd38a 100644 --- a/sample/harrisOpenPMD +++ b/sample/harrisOpenPMD @@ -45,6 +45,10 @@ begin_initialization { enable_openpmd_dump(); + // TODO: this should be done through a setter once we have a common options + // interface + dump_strategy->file_type = ".bp"; + // At this point, there is an empty grid and the random number generator is // seeded with the rank. The grid, materials, species need to be defined. 
// Then the initial non-zero fields need to be loaded at time level 0 and the diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 0451ce55..560f5813 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -1126,6 +1126,10 @@ class OpenPMDDump : public Dump_Strategy { public: //openPMD::Series* series; using Dump_Strategy::Dump_Strategy; // inherit constructor + + //std::string file_type = ".h5"; + std::string file_type = ".bp"; + void dump_fields( const char *fbase, int step, @@ -1414,9 +1418,6 @@ class OpenPMDDump : public Dump_Strategy { int ftag ) { - // TODO: move this to class level, and make it user settable, so it - // can be used more widely - std::string file_type = ".h5"; std::string full_file_name = fbase + file_type; std::cout << "writing particles to " << full_file_name << std::endl; @@ -1564,9 +1565,6 @@ class OpenPMDDump : public Dump_Strategy { int ftag ) { - // TODO: move this to class level, and make it user settable, so it - // can be used more widely - std::string file_type = ".h5"; std::string full_file_name = fbase + file_type; std::cout << "OpenPMD dumping hydro to " << full_file_name << std::endl; From a648e47da34ced2e72bfe617a90beb4dc4c07493 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 4 Feb 2020 15:55:27 -0700 Subject: [PATCH 90/95] renable hdf5 debug timing prints --- sample/harrisOpenPMD | 2 +- src/vpic/dump_strategy.h | 44 ++++++++++++++++++++++++---------------- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/sample/harrisOpenPMD b/sample/harrisOpenPMD index 197dd38a..697e0250 100644 --- a/sample/harrisOpenPMD +++ b/sample/harrisOpenPMD @@ -47,7 +47,7 @@ begin_initialization { // TODO: this should be done through a setter once we have a common options // interface - dump_strategy->file_type = ".bp"; + //dump_strategy->file_type = ".bp"; // At this point, there is an empty grid and the random number generator is // seeded with the rank. 
The grid, materials, species need to be defined. diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 560f5813..3ca057a4 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -30,6 +30,16 @@ #include #endif +// TODO: delete this +#define _LOG_PREFIX \ + __FILE__ "(" EXPAND_AND_STRINGIFY(__LINE__) ")[" << rank << "]: " +#define io_log(x) do { \ + if( rank==0 ) { \ + std::cerr << _LOG_PREFIX << x << std::endl; \ + std::cerr.flush(); \ + } \ + } while(0) + // Runtime inheritance is obviously not very "VPIC like", as we will [probably] // incur a penalty for the vtable lookup, but given we're about to do IO this // is very negligible. @@ -321,7 +331,7 @@ class HDF5Dump : public Dump_Strategy { hid_t group_id = H5Gcreate(file_id, fname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); el1 = uptime() - el1; - //sim_log("TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts + io_log("TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts double el2 = uptime(); /* @@ -468,7 +478,7 @@ class HDF5Dump : public Dump_Strategy { DUMP_FIELD_TO_HDF5("cmat", cmat, H5T_NATIVE_SHORT); el2 = uptime() - el2; - //sim_log("TimeHDF5Write: " << el2 << " s"); + io_log("TimeHDF5Write: " << el2 << " s"); double el3 = uptime(); @@ -497,7 +507,7 @@ class HDF5Dump : public Dump_Strategy { H5Fclose(file_id); el3 = uptime() - el3; - //sim_log("TimeHDF5Close: " << el3 << " s"); + io_log("TimeHDF5Close: " << el3 << " s"); if (mpi_rank == 0) { @@ -621,7 +631,7 @@ class HDF5Dump : public Dump_Strategy { MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); //std::cout << "on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local << std::endl; - //sim_log("on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local); + io_log("on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local); Pf = (float 
*)sp->p; Pi = (int *)sp->p; @@ -663,9 +673,9 @@ class HDF5Dump : public Dump_Strategy { H5Sselect_hyperslab(memspace, H5S_SELECT_SET, &memspace_start, &memspace_stride, &memspace_count, NULL); el1 = uptime() - el1; - //sim_log("Particle TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts + io_log("Particle TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts - //double el2 = uptime(); + double el2 = uptime(); // This point offset is silly, and loses the type safety (pf+1) hid_t dset_id = H5Dcreate(group_id, "dX", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); @@ -743,8 +753,8 @@ class HDF5Dump : public Dump_Strategy { ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 7); H5Dclose(dset_id); - //el2 = uptime() - el2; - //sim_log("Particle TimeHDF5Write: " << el2 << " s"); + el2 = uptime() - el2; + io_log("Particle TimeHDF5Write: " << el2 << " s"); double el3 = uptime(); H5Sclose(memspace); @@ -753,7 +763,7 @@ class HDF5Dump : public Dump_Strategy { H5Gclose(group_id); H5Fclose(file_id); el3 = uptime() - el3; - //sim_log("Particle TimeHDF5Close: " << el3 << " s"); + io_log("Particle TimeHDF5Close: " << el3 << " s"); sp->p = sp_p; sp->np = sp_np; @@ -785,7 +795,7 @@ class HDF5Dump : public Dump_Strategy { H5Pset_dxpl_mpio(meta_plist_id, H5FD_MPIO_COLLECTIVE); H5Sselect_hyperslab(meta_filespace, H5S_SELECT_SET, (hsize_t *)&meta_offset, NULL, (hsize_t *)&meta_numparticles, NULL); meta_el1 = uptime() - meta_el1; - //sim_log("Metafile TimeHDF5Open): " << meta_el1 << " s"); //Easy to handle results for scripts + io_log("Metafile TimeHDF5Open): " << meta_el1 << " s"); //Easy to handle results for scripts double meta_el2 = uptime(); @@ -834,7 +844,7 @@ class HDF5Dump : public Dump_Strategy { H5Dclose(meta_dset_id); meta_el2 = uptime() - meta_el2; - //sim_log("Metafile TimeHDF5Write: " << meta_el2 << " s"); + io_log("Metafile TimeHDF5Write: " << meta_el2 << " s"); double meta_el3 = 
uptime(); H5Sclose(meta_memspace); H5Sclose(meta_filespace); @@ -842,7 +852,7 @@ class HDF5Dump : public Dump_Strategy { H5Gclose(meta_group_id); H5Fclose(meta_file_id); meta_el3 = uptime() - meta_el3; - //sim_log("Metafile TimeHDF5Close: " << meta_el3 << " s"); + io_log("Metafile TimeHDF5Close: " << meta_el3 << " s"); } @@ -913,8 +923,8 @@ class HDF5Dump : public Dump_Strategy { hid_t group_id = H5Gcreate(file_id, hname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); el1 = uptime() - el1; - //sim_log("TimeHDF5Open: " << el1 << " s"); //Easy to handle results for scripts - //double el2 = uptime(); + io_log("TimeHDF5Open: " << el1 << " s"); //Easy to handle results for scripts + double el2 = uptime(); // Create a variable list of field values to output. //size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); @@ -1023,8 +1033,8 @@ class HDF5Dump : public Dump_Strategy { if (hydro_dump_flag.txy) DUMP_HYDRO_TO_HDF5("txy", txy, H5T_NATIVE_FLOAT); - //el2 = uptime() - el2; - //sim_log("TimeHDF5Write: " << el2 << " s"); + el2 = uptime() - el2; + io_log("TimeHDF5Write: " << el2 << " s"); double el3 = uptime(); @@ -1053,7 +1063,7 @@ class HDF5Dump : public Dump_Strategy { H5Fclose(file_id); el3 = uptime() - el3; - //sim_log("TimeHDF5Close: " << el3 << " s"); + io_log("TimeHDF5Close: " << el3 << " s"); if (mpi_rank == 0) { From 46ae346d13e211afb3d655f579bb5501e384fbfe Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 27 Oct 2020 12:18:41 -0600 Subject: [PATCH 91/95] default vpic_simulation ptrs to nullptr --- src/vpic/vpic.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index bbc75235..fd025ee8 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -210,21 +210,21 @@ class vpic_simulation { random numbers. Keeping the synchronous generators in sync is the generator users responsibility. 
*/ - rng_pool_t * entropy; // Local entropy pool - rng_pool_t * sync_entropy; // Synchronous entropy pool - grid_t * grid; // define_*_grid et al - material_t * material_list; // define_material - field_array_t * field_array; // define_field_array - interpolator_array_t * interpolator_array; // define_interpolator_array - accumulator_array_t * accumulator_array; // define_accumulator_array - hydro_array_t * hydro_array; // define_hydro_array - species_t * species_list; // define_species / - // species helpers - particle_bc_t * particle_bc_list; // define_particle_bc / - // boundary helpers - emitter_t * emitter_list; // define_emitter / - // emitter helpers - collision_op_t * collision_op_list; // collision helpers + rng_pool_t * entropy = nullptr; // Local entropy pool + rng_pool_t * sync_entropy = nullptr; // Synchronous entropy pool + grid_t * grid = nullptr; // define_*_grid et al + material_t * material_list = nullptr; // define_material + field_array_t * field_array = nullptr; // define_field_array + interpolator_array_t * interpolator_array = nullptr; // define_interpolator_array + accumulator_array_t * accumulator_array = nullptr; // define_accumulator_array + hydro_array_t * hydro_array = nullptr; // define_hydro_array + species_t * species_list = nullptr; // define_species / + // species helpers + particle_bc_t * particle_bc_list = nullptr; // define_particle_bc / + // boundary helpers + emitter_t * emitter_list = nullptr; // define_emitter / + // emitter helpers + collision_op_t * collision_op_list = nullptr; // collision helpers // User defined checkpt preserved variables // Note: user_global is aliased with user_global_t (see deck_wrapper.cxx) From ade0a0f0ce5f242ecd29c76b31acff76b3404c0d Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Tue, 10 Nov 2020 14:35:10 -0700 Subject: [PATCH 92/95] disable restore test until we add logic to perform restore of dump strat --- src/vpic/dump_strategy.h | 4 +++- src/vpic/vpic.h | 1 + 
test/integrated/to_completion/CMakeLists.txt | 12 ++++++------ 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 3ca057a4..0eb5425f 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -1150,10 +1150,12 @@ class OpenPMDDump : public Dump_Strategy { { std::cout << "Writing openPMD data" << std::endl; + std::string full_file_name = fbase + file_type; + //if (series == nullptr) { std::cout << "init series" << std::endl; openPMD::Series series = openPMD::Series( - fbase, + full_file_name, openPMD::AccessType::CREATE, MPI_COMM_WORLD ); diff --git a/src/vpic/vpic.h b/src/vpic/vpic.h index fd025ee8..2a4b1767 100644 --- a/src/vpic/vpic.h +++ b/src/vpic/vpic.h @@ -145,6 +145,7 @@ class vpic_simulation { // Very likely a user will forgot to delete this if they change the strategy, // a smart ptr will save us from the small leak + // TODO: this does not survive the dump right now std::unique_ptr dump_strategy; int num_step = 1; // Number of steps to take diff --git a/test/integrated/to_completion/CMakeLists.txt b/test/integrated/to_completion/CMakeLists.txt index 5baecf43..690e738c 100644 --- a/test/integrated/to_completion/CMakeLists.txt +++ b/test/integrated/to_completion/CMakeLists.txt @@ -58,11 +58,11 @@ add_test(${generate_restore} ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} # Run using the restore file # TODO: caps? 
-set(perform_restore "perform_${RESTART_BINARY}") -build_a_vpic(${perform_restore} ${CMAKE_CURRENT_SOURCE_DIR}/${RESTART_DECK}.deck) -add_test(${perform_restore} ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} - ${MPIEXEC_NUMPROC} ${MPIEXEC_PREFLAGS} ${perform_restore} - ${MPIEXEC_POSTFLAGS} ${RESTART_ARGS}) +#set(perform_restore "perform_${RESTART_BINARY}") +#build_a_vpic(${perform_restore} ${CMAKE_CURRENT_SOURCE_DIR}/${RESTART_DECK}.deck) +#add_test(${perform_restore} ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} +#${MPIEXEC_NUMPROC} ${MPIEXEC_PREFLAGS} ${perform_restore} +#${MPIEXEC_POSTFLAGS} ${RESTART_ARGS}) # TODO: re-enable modify test #list(APPEND MODIFY_BINARY restore-modify) @@ -76,4 +76,4 @@ add_test(${perform_restore} ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} set(RESTORE_LABEL "restore_group") set_tests_properties(${perform_restore} PROPERTIES DEPENDS ${generate_restore}) #set_property(TEST ${generate_restore} PROPERTY FIXTURES_SETUP ${RESTORE_LABEL}) -#set_property(TEST ${perform_restore} PROPERTY FIXTURES_REQUIRED ${RESTORE_LABEL}) \ No newline at end of file +#set_property(TEST ${perform_restore} PROPERTY FIXTURES_REQUIRED ${RESTORE_LABEL}) From 34bdedd1fb234cf42764d2a86f6aef25477dd47f Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Mon, 23 Nov 2020 10:33:05 -0700 Subject: [PATCH 93/95] apply Bin's changed --- src/vpic/dump_strategy.h | 2276 ++++++++++++++++++++++---------------- 1 file changed, 1302 insertions(+), 974 deletions(-) diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index 0eb5425f..c6fb7613 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -5,9 +5,27 @@ #include #include +//#define DUMP_INFO_DEBUG 1 +//#define H5_ASYNC 1 +#ifdef H5_ASYNC +#include "h5_vol_external_async_native.h" +#endif +//#define CHUNK_FLAG 1 + + +//#define METADATA_COLL_WRITE 1 +//#define TRUE 1 + + +#define HAS_FIELD_COMP 1 +#define HAS_PARTICLE_COMP 1 +#define HAS_HYDRO_COMP 1 + +//#define HAS_INDEPENDENT_IO 1 + #include // TODO: it would be good if this 
didn't have to know about MPI +#include -#define DUMP_INFO_DEBUG 1 // TODO: should I drop the ./src here? #include "../util/io/FileIO.h" @@ -30,54 +48,63 @@ #include #endif + +//#define N_FILE_N_PROCESS 1 +//#define TEST_MPIIO 1 + // TODO: delete this #define _LOG_PREFIX \ - __FILE__ "(" EXPAND_AND_STRINGIFY(__LINE__) ")[" << rank << "]: " + __FILE__ "(" EXPAND_AND_STRINGIFY(__LINE__) ")[" << rank << "]: " + +/* #define io_log(x) do { \ - if( rank==0 ) { \ - std::cerr << _LOG_PREFIX << x << std::endl; \ - std::cerr.flush(); \ - } \ - } while(0) +if( rank==0 ) { \ +std::cerr << _LOG_PREFIX << x << std::endl; \ +std::cerr.flush(); \ +} \ +} while(0) +*/ + + // Runtime inheritance is obviously not very "VPIC like", as we will [probably] // incur a penalty for the vtable lookup, but given we're about to do IO this // is very negligible. class Dump_Strategy { public: - int rank, nproc, num_step; + int rank, nproc, num_step; - Dump_Strategy(int _rank, int _nproc ) : - rank(_rank), - nproc(_nproc) + Dump_Strategy(int _rank, int _nproc ) : + rank(_rank), + nproc(_nproc) { } // empty - virtual ~Dump_Strategy() { }; + virtual ~Dump_Strategy() { }; - virtual void dump_fields( - const char *fbase, - int step, - grid_t* grid, - field_array_t* field_array, - int ftag - ) = 0; - virtual void dump_hydro( - const char *fbase, - int step, - hydro_array_t* hydro_array, - species_t* sp, - interpolator_array_t* interpolator_array, - grid_t* grid, - int ftag - ) = 0; - virtual void dump_particles( - const char *fbase, - species_t* sp, - grid_t* grid, - int step, - interpolator_array_t* interpolator_array, - int ftag - ) = 0; + virtual void dump_fields( + const char *fbase, + int step, + grid_t* grid, + field_array_t* field_array, + int ftag + ) = 0; + virtual void dump_hydro( + const char *fbase, + int step, + hydro_array_t* hydro_array, + species_t* sp, + interpolator_array_t* interpolator_array, + grid_t* grid, + int ftag + ) = 0; + virtual void dump_particles( + const char *fbase, + 
species_t* sp, + grid_t* grid, + int step, + interpolator_array_t* interpolator_array, + int ftag + ) = 0; }; class BinaryDump : public Dump_Strategy { @@ -92,7 +119,7 @@ class BinaryDump : public Dump_Strategy { grid_t* grid, field_array_t* field_array, int ftag - ); + ); void dump_hydro( const char *fbase, int step, @@ -101,7 +128,7 @@ class BinaryDump : public Dump_Strategy { interpolator_array_t* interpolator_array, grid_t* grid, int ftag - ); + ); void dump_particles( const char *fbase, species_t* sp, @@ -109,1025 +136,1326 @@ class BinaryDump : public Dump_Strategy { int step, interpolator_array_t* interpolator_array, int ftag - ); + ); }; #ifdef VPIC_ENABLE_HDF5 struct field_dump_flag_t { - bool ex = true, ey = true, ez = true, div_e_err = true; - bool cbx = true, cby = true, cbz = true, div_b_err = true; - bool tcax = true, tcay = true, tcaz = true, rhob = true; - bool jfx = true, jfy = true, jfz = true, rhof = true; - bool ematx = true, ematy = true, ematz = true, nmat = true; - bool fmatx = true, fmaty = true, fmatz = true, cmat = true; - void disableE() - { - ex = false, ey = false, ez = false, div_e_err = false; - } - - void disableCB() - { - cbx = false, cby = false, cbz = false, div_b_err = false; - } - - void disableTCA() - { - tcax = false, tcay = false, tcaz = false, rhob = false; - } - - void disableJF() - { - jfx = false, jfy = false, jfz = false, rhof = false; - } - - void disableEMAT() - { - ematx = false, ematy = false, ematz = false, nmat = false; - } - - void disableFMAT() - { - fmatx = false, fmaty = false, fmatz = false, cmat = false; - } - - void resetToDefaults() - { - ex = true, ey = true, ez = true, div_e_err = true; - cbx = true, cby = true, cbz = true, div_b_err = true; - tcax = true, tcay = true, tcaz = true, rhob = true; - jfx = true, jfy = true, jfz = true, rhof = true; - ematx = true, ematy = true, ematz = true, nmat = true; - fmatx = true, fmaty = true, fmatz = true, cmat = true; - } - - bool enabledE() - { - return ex && ey && 
ez; - } - - bool enabledCB() - { - return cbx && cby && cbz; - } - - bool enabledTCA() - { - return tcax && tcay && tcaz; - } - - bool enabledJF() - { - return jfx && jfy && jfz; - } - - bool enabledEMAT() - { - return ematx && ematy && ematz; - } - - bool enabledFMAT() - { - return fmatx && fmaty && fmatz; - } + bool ex = true, ey = true, ez = true, div_e_err = true; + bool cbx = true, cby = true, cbz = true, div_b_err = true; + bool tcax = true, tcay = true, tcaz = true, rhob = true; + bool jfx = true, jfy = true, jfz = true, rhof = true; + bool ematx = true, ematy = true, ematz = true, nmat = true; + bool fmatx = true, fmaty = true, fmatz = true, cmat = true; + void disableE() + { + ex = false, ey = false, ez = false, div_e_err = false; + } + + void disableCB() + { + cbx = false, cby = false, cbz = false, div_b_err = false; + } + + void disableTCA() + { + tcax = false, tcay = false, tcaz = false, rhob = false; + } + + void disableJF() + { + jfx = false, jfy = false, jfz = false, rhof = false; + } + + void disableEMAT() + { + ematx = false, ematy = false, ematz = false, nmat = false; + } + + void disableFMAT() + { + fmatx = false, fmaty = false, fmatz = false, cmat = false; + } + + void resetToDefaults() + { + ex = true, ey = true, ez = true, div_e_err = true; + cbx = true, cby = true, cbz = true, div_b_err = true; + tcax = true, tcay = true, tcaz = true, rhob = true; + jfx = true, jfy = true, jfz = true, rhof = true; + ematx = true, ematy = true, ematz = true, nmat = true; + fmatx = true, fmaty = true, fmatz = true, cmat = true; + } + + bool enabledE() + { + return ex && ey && ez; + } + + bool enabledCB() + { + return cbx && cby && cbz; + } + + bool enabledTCA() + { + return tcax && tcay && tcaz; + } + + bool enabledJF() + { + return jfx && jfy && jfz; + } + + bool enabledEMAT() + { + return ematx && ematy && ematz; + } + + bool enabledFMAT() + { + return fmatx && fmaty && fmatz; + } }; struct hydro_dump_flag_t { - bool jx = true, jy = true, jz = true, rho = 
true; - bool px = true, py = true, pz = true, ke = true; - bool txx = true, tyy = true, tzz = true; - bool tyz = true, tzx = true, txy = true; - - void disableJ() - { - jx = false, jy = false, jz = false, rho = false; - } - - void disableP() - { - px = false, py = false, pz = false, ke = false; - } - - void disableTD() //Stress diagonal - { - txx = false, tyy = false, tzz = false; - } - - void disableTOD() //Stress off-diagonal - { - tyz = false, tzx = false, txy = false; - } - void resetToDefaults() - { - jx = true, jy = true, jz = true, rho = true; - px = true, py = true, pz = true, ke = true; - txx = true, tyy = true, tzz = true; - tyz = true, tzx = true, txy = true; - } - - bool enabledJ() - { - return jx && jy && jz; - } - - bool enabledP() - { - return px && py && pz; - } - - bool enabledTD() - { - return txx && tyy && tzz; - } - - bool enabledTOD() - { - return tyz && tzx && txy; - } + bool jx = true, jy = true, jz = true, rho = true; + bool px = true, py = true, pz = true, ke = true; + bool txx = true, tyy = true, tzz = true; + bool tyz = true, tzx = true, txy = true; + + void disableJ() + { + jx = false, jy = false, jz = false, rho = false; + } + + void disableP() + { + px = false, py = false, pz = false, ke = false; + } + + void disableTD() //Stress diagonal + { + txx = false, tyy = false, tzz = false; + } + + void disableTOD() //Stress off-diagonal + { + tyz = false, tzx = false, txy = false; + } + void resetToDefaults() + { + jx = true, jy = true, jz = true, rho = true; + px = true, py = true, pz = true, ke = true; + txx = true, tyy = true, tzz = true; + tyz = true, tzx = true, txy = true; + } + + bool enabledJ() + { + return jx && jy && jz; + } + + bool enabledP() + { + return px && py && pz; + } + + bool enabledTD() + { + return txx && tyy && tzz; + } + + bool enabledTOD() + { + return tyz && tzx && txy; + } }; class HDF5Dump : public Dump_Strategy { std::unordered_map tframe_map; public: - using Dump_Strategy::Dump_Strategy; // inherit constructor + 
using Dump_Strategy::Dump_Strategy; // inherit constructor - // TODO: replace these with a common dump interface - // Declare vars to use - hydro_dump_flag_t hydro_dump_flag; - field_dump_flag_t field_dump_flag; + // TODO: replace these with a common dump interface + // Declare vars to use + hydro_dump_flag_t hydro_dump_flag; + field_dump_flag_t field_dump_flag; #define DUMP_DIR_FORMAT "./%s" -// TODO: naming a macro so close to existing functions AND data is not a good -// define to do C-style indexing + // TODO: naming a macro so close to existing functions AND data is not a good + // define to do C-style indexing #define _hydro(x, y, z) hydro_array->h[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] - // TODO: make function? - void dump_fields( + + /** + * @brief Dump field data to the HDf5 file + * Author: Bin Dong dbin@lbl.gov + * https://crd.lbl.gov/bin-dong + * Nov 2020 + * @param fbase + * @param step + * @param grid + * @param field_array + * @param ftag + */ + void dump_fields( const char *fbase, int step, grid_t* grid, field_array_t* field_array, int ftag - ) - { - size_t step_for_viou = step; + ) + { + size_t step_for_viou = step; - int mpi_size, mpi_rank; - MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + int mpi_size, mpi_rank; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); #ifdef DUMP_INFO_DEBUG - printf("MPI rank = %d, size = %d \n", mpi_rank, mpi_size); - //printf("base dir for field: %s \n", fdParams.baseDir); - //printf("stride x y z = (%ld, %ld, %ld)\n", fdParams.stride_x, fdParams.stride_y, fdParams.stride_z); - printf("grid x, y z = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); - printf("domain loc (x0, y0, z0) -> (x1, y1, z1) = (%f, %f, %f) -> (%f, %f, %f) \n", grid->x0, grid->y0, grid->z0, grid->x1, grid->y1, grid->z1); - //printf("global->topology_x, y, z = %f, %f, %f \n ", global->topology_x, global->topology_y, global->topology_z); - printf("grid -> sx, sy, sz 
= (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid->sz, grid->nv); + printf("MPI rank = %d, size = %d \n", mpi_rank, mpi_size); + //printf("base dir for field: %s \n", fdParams.baseDir); + //printf("stride x y z = (%ld, %ld, %ld)\n", fdParams.stride_x, fdParams.stride_y, fdParams.stride_z); + printf("grid x, y z = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); + printf("domain loc (x0, y0, z0) -> (x1, y1, z1) = (%f, %f, %f) -> (%f, %f, %f) \n", grid->x0, grid->y0, grid->z0, grid->x1, grid->y1, grid->z1); + //printf("global->topology_x, y, z = %f, %f, %f \n ", global->topology_x, global->topology_y, global->topology_z); + printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid->sz, grid->nv); #endif -#define DUMP_FIELD_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ - { \ - dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ - temp_buf_index = 0; \ - for (size_t i(1); i < grid->nx + 1; i++) \ - { \ - for (size_t j(1); j < grid->ny + 1; j++) \ - { \ - for (size_t k(1); k < grid->nz + 1; k++) \ - { \ - temp_buf[temp_buf_index] = FIELD_ARRAY_NAME->fpp(i, j, k).ATTRIBUTE_NAME; \ - temp_buf_index = temp_buf_index + 1; \ - } \ - } \ - } \ - dataspace_id = H5Dget_space(dset_id); \ - H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); \ - H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, temp_buf); \ - H5Sclose(dataspace_id); \ - H5Dclose(dset_id); \ - } - - char fname[256]; - char field_scratch[128]; - char subfield_scratch[128]; - sprintf(field_scratch, DUMP_DIR_FORMAT, "field_hdf5"); - FileUtils::makeDirectory(field_scratch); - sprintf(subfield_scratch, "%s/T.%zu/", field_scratch, step_for_viou); - FileUtils::makeDirectory(subfield_scratch); - sprintf(fname, "%s/%s_%zu.h5", subfield_scratch, "fields", step_for_viou); - double el1 = uptime(); - hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); - H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, 
MPI_INFO_NULL); - hid_t file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); - H5Pclose(plist_id); + char fname[256]; + char field_scratch[128]; + char subfield_scratch[128]; + + sprintf(field_scratch, DUMP_DIR_FORMAT, "field_hdf5"); + FileUtils::makeDirectory(field_scratch); + sprintf(subfield_scratch, "%s/T.%zu/", field_scratch, step_for_viou); + FileUtils::makeDirectory(subfield_scratch); + + sprintf(fname, "%s/%s_%zu.h5", subfield_scratch, "fields", step_for_viou); + double el1 = uptime(); + + // int file_exist(const char *filename) + //{ + // struct stat buffer; + // return (stat(filename, &buffer) == 0); + //} + + //https://support.hdfgroup.org/ftp/HDF5/current/src/unpacked/examples/h5_compound.c +#ifdef HAS_FIELD_COMP + if(!mpi_rank) + printf("Using Field Compund type !\n"); + hid_t field_comp_type_it = H5Tcreate (H5T_COMPOUND, sizeof(field_t)); + H5Tinsert(field_comp_type_it, "ex", HOFFSET(field_t, ex), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "ey", HOFFSET(field_t, ey), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "ez", HOFFSET(field_t, ez), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "div_e_err", HOFFSET(field_t, div_e_err), H5T_NATIVE_FLOAT); + + H5Tinsert(field_comp_type_it, "cbx", HOFFSET(field_t, cbx), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "cby", HOFFSET(field_t, cby), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "cbz", HOFFSET(field_t, cbz), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "div_b_err", HOFFSET(field_t, div_b_err), H5T_NATIVE_FLOAT); + + H5Tinsert(field_comp_type_it, "tcax", HOFFSET(field_t, tcax), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "tcay", HOFFSET(field_t, tcay), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "tcaz", HOFFSET(field_t, tcaz), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "rhob", HOFFSET(field_t, rhob), H5T_NATIVE_FLOAT); + + H5Tinsert(field_comp_type_it, "jfx", HOFFSET(field_t, jfx), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, 
"jfy", HOFFSET(field_t, jfy), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "jfz", HOFFSET(field_t, jfz), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "rhof", HOFFSET(field_t, rhof), H5T_NATIVE_FLOAT); + + H5Tinsert(field_comp_type_it, "ematx", HOFFSET(field_t, ematx), H5T_NATIVE_SHORT); + H5Tinsert(field_comp_type_it, "ematy", HOFFSET(field_t, ematy), H5T_NATIVE_SHORT); + H5Tinsert(field_comp_type_it, "ematz", HOFFSET(field_t, ematz), H5T_NATIVE_SHORT); + H5Tinsert(field_comp_type_it, "nmat", HOFFSET(field_t, nmat), H5T_NATIVE_SHORT); + + H5Tinsert(field_comp_type_it, "fmatx", HOFFSET(field_t, fmatx), H5T_NATIVE_SHORT); + H5Tinsert(field_comp_type_it, "fmaty", HOFFSET(field_t, fmaty), H5T_NATIVE_SHORT); + H5Tinsert(field_comp_type_it, "fmatz", HOFFSET(field_t, fmatz), H5T_NATIVE_SHORT); + H5Tinsert(field_comp_type_it, "cmat", HOFFSET(field_t, cmat), H5T_NATIVE_SHORT); +#endif - sprintf(fname, "Timestep_%zu", step_for_viou); - hid_t group_id = H5Gcreate(file_id, fname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - el1 = uptime() - el1; - io_log("TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts - double el2 = uptime(); + //struct stat buffer; + //if((stat(fname, &buffer) == 0)){ + // file_exist_flag = 1; + // if(!mpi_rank) + // printf("Write original files /w HDF5! \n"); + // } + // file_exist_flag = 0; + + hid_t plist_id; + hid_t file_id; + plist_id = H5Pcreate(H5P_FILE_ACCESS); + H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); + //H5Pset_alignment(plist_id, 4194304, 4194304); + /*if(H5Pset_libver_bounds(plist_id, H5F_LIBVER_LATEST, H5F_LIBVER_LATEST) < 0){ + exit(-1); + }*/ + +#ifdef METADATA_COLL_WRITE + if(!mpi_rank) printf("Enable collective metadata write !\n"); + H5Pset_coll_metadata_write(plist_id, TRUE); +#endif + file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); + H5Pclose(plist_id); - /* - // Create a variable list of field values to output. 
- size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); - size_t * varlist = new size_t[numvars]; - for(size_t i(0), c(0); ifdParams.output_vars.bitset(i)) varlist[c++] = i; - printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ + sprintf(fname, "Timestep_%zu", step_for_viou); + hid_t group_id; + group_id = H5Gcreate(file_id, fname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); -#define fpp(x, y, z) f[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] - /* - typedef struct field { - float ex, ey, ez, div_e_err; // Electric field and div E error - float cbx, cby, cbz, div_b_err; // Magnetic field and div B error - float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density - float jfx, jfy, jfz, rhof; // Free current and charge density - material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes - material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers - } field_t;*/ - // Local voxel mesh resolution. Voxels are - // indexed FORTRAN style 0:nx+1,0:ny+1,0:nz+1 - // with voxels 1:nx,1:ny,1:nz being non-ghost - // voxels. 
- - float *temp_buf = (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); - hsize_t temp_buf_index; - hid_t dset_id; - //char *field_var_name[] = {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; - plist_id = H5Pcreate(H5P_DATASET_XFER); - //Comment out for test only - H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); - //H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, (hsize_t *) &numparticles, NULL); - - //global->topology_x - - hsize_t field_global_size[3], field_local_size[3], global_offset[3], global_count[3]; - field_global_size[0] = (grid->nx * grid->gpx); - field_global_size[1] = (grid->ny * grid->gpy); - field_global_size[2] = (grid->nz * grid->gpz); - - field_local_size[0] = grid->nx; - field_local_size[1] = grid->ny; - field_local_size[2] = grid->nz; - - int gpx = grid->gpx; - int gpy = grid->gpy; - int gpz = grid->gpz; - - // Convert rank to local decomposition - int rx, ry, rz; - UNVOXEL(mpi_rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + el1 = uptime() - el1; + //io_log("TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts + double el2 = uptime(); - int mpi_rank_x, mpi_rank_y, mpi_rank_z; - mpi_rank_x = rx; - mpi_rank_y = ry; - mpi_rank_z = rz; + /* + // Create a variable list of field values to output. 
+ size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); + size_t * varlist = new size_t[numvars]; - global_offset[0] = (grid->nx) * mpi_rank_x; - global_offset[1] = (grid->ny) * mpi_rank_y; - global_offset[2] = (grid->nz) * mpi_rank_z; + for(size_t i(0), c(0); ifdParams.output_vars.bitset(i)) varlist[c++] = i; - global_count[0] = (grid->nx); - global_count[1] = (grid->ny); - global_count[2] = (grid->nz); + printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ -#ifdef DUMP_INFO_DEBUG - printf("global size = %llu %llu %llu \n", field_global_size[0], field_global_size[1], field_global_size[2]); - printf("global_offset = %llu %llu %llu \n", global_offset[0], global_offset[1], global_offset[2]); - printf("global_count = %llu %llu %llu \n", global_count[0], global_count[1], global_count[2]); - printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); - fflush(stdout); +#define fpp(x, y, z) f[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] + /* + typedef struct field { + float ex, ey, ez, div_e_err; // Electric field and div E error + float cbx, cby, cbz, div_b_err; // Magnetic field and div B error + float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density + float jfx, jfy, jfz, rhof; // Free current and charge density + material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes + material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers + } field_t;*/ + // Local voxel mesh resolution. Voxels are + // indexed FORTRAN style 0:nx+1,0:ny+1,0:nz+1 + // with voxels 1:nx,1:ny,1:nz being non-ghost + // voxels. 
+ + float *temp_buf = (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); + hsize_t temp_buf_index; + hid_t dset_id; + //char *field_var_name[] = {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; + //Comment out for test only + + plist_id = H5Pcreate(H5P_DATASET_XFER); +#ifdef HAS_INDEPENDENT_IO + if(!mpi_rank) printf("\n ###\n VPIC Independent I/O! \n ###\n"); + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_INDEPENDENT); +#else + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); #endif - hid_t filespace = H5Screate_simple(3, field_global_size, NULL); - hid_t memspace = H5Screate_simple(3, field_local_size, NULL); - hid_t dataspace_id; - - /* - typedef struct field { - float ex, ey, ez, div_e_err; // Electric field and div E error - float cbx, cby, cbz, div_b_err; // Magnetic field and div B error - float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density - float jfx, jfy, jfz, rhof; // Free current and charge density - material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes - material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers - } field_t;*/ - - if (field_dump_flag.ex) - DUMP_FIELD_TO_HDF5("ex", ex, H5T_NATIVE_FLOAT); - if (field_dump_flag.ey) - DUMP_FIELD_TO_HDF5("ey", ey, H5T_NATIVE_FLOAT); - if (field_dump_flag.ez) - DUMP_FIELD_TO_HDF5("ez", ez, H5T_NATIVE_FLOAT); - if (field_dump_flag.div_e_err) - DUMP_FIELD_TO_HDF5("div_e_err", div_e_err, H5T_NATIVE_FLOAT); - - if (field_dump_flag.cbx) - DUMP_FIELD_TO_HDF5("cbx", cbx, H5T_NATIVE_FLOAT); - if (field_dump_flag.cby) - DUMP_FIELD_TO_HDF5("cby", cby, H5T_NATIVE_FLOAT); - if (field_dump_flag.cbz) - DUMP_FIELD_TO_HDF5("cbz", cbz, H5T_NATIVE_FLOAT); - if (field_dump_flag.div_b_err) - DUMP_FIELD_TO_HDF5("div_b_err", div_b_err, H5T_NATIVE_FLOAT); - - if (field_dump_flag.tcax) - DUMP_FIELD_TO_HDF5("tcax", tcax, H5T_NATIVE_FLOAT); - if (field_dump_flag.tcay) - DUMP_FIELD_TO_HDF5("tcay", tcay, 
H5T_NATIVE_FLOAT); - if (field_dump_flag.tcaz) - DUMP_FIELD_TO_HDF5("tcaz", tcaz, H5T_NATIVE_FLOAT); - if (field_dump_flag.rhob) - DUMP_FIELD_TO_HDF5("rhob", rhob, H5T_NATIVE_FLOAT); - - if (field_dump_flag.jfx) - DUMP_FIELD_TO_HDF5("jfx", jfx, H5T_NATIVE_FLOAT); - if (field_dump_flag.jfy) - DUMP_FIELD_TO_HDF5("jfy", jfy, H5T_NATIVE_FLOAT); - if (field_dump_flag.jfz) - DUMP_FIELD_TO_HDF5("jfz", jfz, H5T_NATIVE_FLOAT); - if (field_dump_flag.rhof) - DUMP_FIELD_TO_HDF5("rhof", rhof, H5T_NATIVE_FLOAT); - - //H5T_NATIVE_SHORT for material_id (typedef int16_t material_id) - if (field_dump_flag.ematx) - DUMP_FIELD_TO_HDF5("ematx", ematx, H5T_NATIVE_SHORT); - if (field_dump_flag.ematy) - DUMP_FIELD_TO_HDF5("ematy", ematy, H5T_NATIVE_SHORT); - if (field_dump_flag.ematz) - DUMP_FIELD_TO_HDF5("ematz", ematz, H5T_NATIVE_SHORT); - if (field_dump_flag.nmat) - DUMP_FIELD_TO_HDF5("nmat", nmat, H5T_NATIVE_SHORT); - - if (field_dump_flag.fmatx) - DUMP_FIELD_TO_HDF5("fmatx", fmatx, H5T_NATIVE_SHORT); - if (field_dump_flag.fmaty) - DUMP_FIELD_TO_HDF5("fmaty", fmaty, H5T_NATIVE_SHORT); - if (field_dump_flag.fmatz) - DUMP_FIELD_TO_HDF5("fmatz", fmatz, H5T_NATIVE_SHORT); - if (field_dump_flag.cmat) - DUMP_FIELD_TO_HDF5("cmat", cmat, H5T_NATIVE_SHORT); - - el2 = uptime() - el2; - io_log("TimeHDF5Write: " << el2 << " s"); - - double el3 = uptime(); - - //Write metadata (geo original and geo dx/dy/dz) for ArrayUDF - float attr_data[2][3]; - attr_data[0][0] = grid->x0; - attr_data[0][1] = grid->y0; - attr_data[0][2] = grid->z0; - attr_data[1][0] = grid->dx; - attr_data[1][1] = grid->dy; - attr_data[1][2] = grid->dz; - hsize_t dims[2]; - dims[0] = 2; - dims[1] = 3; - hid_t va_geo_dataspace_id = H5Screate_simple(2, dims, NULL); - hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); - H5Awrite(va_geo_attribute_id, H5T_NATIVE_FLOAT, attr_data); - H5Sclose(va_geo_dataspace_id); - H5Aclose(va_geo_attribute_id); - - 
free(temp_buf); - H5Sclose(filespace); - H5Sclose(memspace); - H5Pclose(plist_id); - H5Gclose(group_id); - H5Fclose(file_id); - - el3 = uptime() - el3; - io_log("TimeHDF5Close: " << el3 << " s"); - - if (mpi_rank == 0) - { - char const *output_xml_file = "./field_hdf5/hdf5_field.xdmf"; - char dimensions_3d[128]; - sprintf(dimensions_3d, "%lld %lld %lld", field_global_size[0], field_global_size[1], field_global_size[2]); - char dimensions_4d[128]; - sprintf(dimensions_4d, "%lld %lld %lld %d", field_global_size[0], field_global_size[1], field_global_size[2], 3); - char orignal[128]; - sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); - char dxdydz[128]; - sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); - // TODO: remove or let the user set - int field_interval = 1; - // TODO: remove this dependence on number of steps - std::cout << "num_step " << num_step << std::endl; + //H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, (hsize_t *) &numparticles, NULL); - int nframes = num_step / field_interval + 1; - static int field_tframe = 0; + //global->topology_x -#ifdef DUMP_INFO_DEBUG - printf(" meta file : %s \n", output_xml_file); - printf(" array dims per var: %s \n", dimensions_3d); - printf("array dims all vars: %s \n", dimensions_4d); - printf(" orignal: %s \n", orignal); - printf(" dxdydz: %s \n", dxdydz); - printf(" nframes: %d \n", nframes); - printf(" field_interval: %d \n", field_interval); - printf(" current step: %zd \n", step_for_viou); - printf(" current step: %zd \n", step_for_viou); - - //printf(" Simulation time: %f \n", grid->t0); - printf(" tframe: %d \n", field_tframe); -#endif + hsize_t field_global_size[3], field_local_size[3], global_offset[3], global_count[3]; + field_global_size[0] = (grid->nx * grid->gpx); + field_global_size[1] = (grid->ny * grid->gpy); + field_global_size[2] = (grid->nz * grid->gpz); - // TODO: this footer dumping is more likely better done in a - // destructor, rather than hoping a 
multiple division works out - if (field_tframe >= 1) - { - if (field_tframe == (nframes - 1)) - { - invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 1); - } - else - { - invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 0); - } - } - else - { - create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, field_interval); - if (field_tframe == (nframes - 1)) - { - invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 1); - } - else - { - invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 0); - } - } - field_tframe++; - } - } - void dump_particles( - const char *fbase, - species_t* sp, - grid_t* grid, - int step, - interpolator_array_t* interpolator_array, - int ftag - ) - { - size_t step_for_viou = step; - char fname[256]; - char group_name[256]; - char particle_scratch[128]; - char subparticle_scratch[128]; - - int np_local; - - float *Pf; - int *Pi; - - // get the total number of particles. 
in this example, output only electrons - //sp = species_list; - sprintf(particle_scratch, DUMP_DIR_FORMAT, "particle_hdf5"); - FileUtils::makeDirectory(particle_scratch); - sprintf(subparticle_scratch, "%s/T.%ld/", particle_scratch, step_for_viou); - FileUtils::makeDirectory(subparticle_scratch); - - // TODO: Allow the user to set this - int stride_particle_dump = 1; - - np_local = (sp->np + stride_particle_dump - 1) / stride_particle_dump; - - // make a copy of the part of particle data to be dumped - double ec1 = uptime(); - - int sp_np = sp->np; - int sp_max_np = sp->max_np; - particle_t *ALIGNED(128) p_buf = NULL; - if (!p_buf) - MALLOC_ALIGNED(p_buf, np_local, 128); - particle_t *sp_p = sp->p; - sp->p = p_buf; - sp->np = np_local; - sp->max_np = np_local; - - for (long long iptl = 0, i = 0; iptl < sp_np; iptl += stride_particle_dump, ++i) - { - COPY(&sp->p[i], &sp_p[iptl], 1); - } + field_local_size[0] = grid->nx; + field_local_size[1] = grid->ny; + field_local_size[2] = grid->nz; - center_p(sp, interpolator_array); + int gpx = grid->gpx; + int gpy = grid->gpy; + int gpz = grid->gpz; - ec1 = uptime() - ec1; - int mpi_rank; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + // Convert rank to local decomposition + int rx, ry, rz; + UNVOXEL(mpi_rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); - //std::cout << "on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local << std::endl; - io_log("on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local); + int mpi_rank_x, mpi_rank_y, mpi_rank_z; + mpi_rank_x = rx; + mpi_rank_y = ry; + mpi_rank_z = rz; - Pf = (float *)sp->p; - Pi = (int *)sp->p; + global_offset[0] = (grid->nx) * mpi_rank_x; + global_offset[1] = (grid->ny) * mpi_rank_y; + global_offset[2] = (grid->nz) * mpi_rank_z; - // open HDF5 file in "particle/T./" subdirectory - // filename: eparticle.h5p - sprintf(fname, "%s/%s_%ld.h5", subparticle_scratch, 
sp->name, step_for_viou); - sprintf(group_name, "/Timestep_%ld", step_for_viou); - double el1 = uptime(); + global_count[0] = (grid->nx); + global_count[1] = (grid->ny); + global_count[2] = (grid->nz); - hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); - H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); - hid_t file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); - hid_t group_id = H5Gcreate(file_id, group_name, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); +#ifdef DUMP_INFO_DEBUG + if(mpi_rank < 4){ + printf("grid nx, ny nz = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); + printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + printf("global size = %llu %llu %llu \n", field_global_size[0], field_global_size[1], field_global_size[2]); + printf("global_offset = %llu %llu %llu \n", global_offset[0], global_offset[1], global_offset[2]); + printf("global_count = %llu %llu %llu \n", global_count[0], global_count[1], global_count[2]); + fflush(stdout); + } +#endif - H5Pclose(plist_id); + hid_t filespace; //= H5Screate_simple(3, field_global_size, NULL); + hid_t memspace; // = H5Screate_simple(3, field_local_size, NULL); + //if(!file_exist_flag){ + filespace = H5Screate_simple(3, field_global_size, NULL); + //} + memspace = H5Screate_simple(3, field_local_size, NULL); - long long total_particles, offset; - long long numparticles = np_local; - MPI_Allreduce(&numparticles, &total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - MPI_Scan(&numparticles, &offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - offset -= numparticles; + hsize_t chunk_dims[3]; + chunk_dims[0] = 288; //grid->nx; //8 x 8 x 8 + chunk_dims[1] = 24; //grid->ny; // + chunk_dims[2] = 24; //grid->nz; - hid_t filespace = H5Screate_simple(1, (hsize_t *)&total_particles, NULL); - hsize_t memspace_count_temp = numparticles * 8; - hid_t memspace = H5Screate_simple(1, &memspace_count_temp, NULL); - hsize_t linearspace_count_temp = 
numparticles; - hid_t linearspace = H5Screate_simple(1, &linearspace_count_temp, NULL); + hid_t dataspace_id; + hid_t dcpl_id = H5Pcreate(H5P_DATASET_CREATE); +#ifdef CHUNK_FLAG + H5Pset_chunk(dcpl_id, 3, chunk_dims); + if(!mpi_rank) printf("Enable chunking !\n"); +#endif - plist_id = H5Pcreate(H5P_DATASET_XFER); +#define DUMP_FIELD_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ + { \ + dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, H5P_DEFAULT, dcpl_id, H5P_DEFAULT); \ + temp_buf_index = 0; \ + for (size_t i(1); i < grid->nx + 1; i++) \ + { \ + for (size_t j(1); j < grid->ny + 1; j++) \ + { \ + for (size_t k(1); k < grid->nz + 1; k++) \ + { \ + temp_buf[temp_buf_index] = FIELD_ARRAY_NAME->fpp(i, j, k).ATTRIBUTE_NAME; \ + temp_buf_index = temp_buf_index + 1; \ + } \ + } \ + } \ + dataspace_id = H5Dget_space(dset_id); \ + H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); \ + H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, temp_buf); \ + H5Sclose(dataspace_id); \ + H5Dclose(dset_id); \ + } + /* + typedef struct field { + float ex, ey, ez, div_e_err; // Electric field and div E error + float cbx, cby, cbz, div_b_err; // Magnetic field and div B error + float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density + float jfx, jfy, jfz, rhof; // Free current and charge density + material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes + material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers + } field_t;*/ + + +#ifdef HAS_FIELD_COMP + field_t * field_buf; + temp_buf_index = 0; + int global_index; + field_buf = (field_t *)malloc(sizeof(field_t) * (grid->nx) * (grid->ny) * (grid->nz)); + for (size_t i(1); i < grid->nx + 1; i++){ + for (size_t j(1); j < grid->ny + 1; j++){ + for (size_t k(1); k < grid->nz + 1; k++){ + field_buf[temp_buf_index] = FIELD_ARRAY_NAME->fpp(i, j, k); + temp_buf_index++; + } + } + } + dset_id = H5Dcreate(group_id, 
"field", field_comp_type_it, filespace, H5P_DEFAULT, dcpl_id, H5P_DEFAULT); + dataspace_id = H5Dget_space(dset_id); + H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); + H5Dwrite(dset_id, field_comp_type_it, memspace, dataspace_id, plist_id, field_buf); + free(field_buf); + H5Sclose(dataspace_id); + H5Dclose(dset_id); + H5Tclose(field_comp_type_it); +#else - //Comment out for test only - H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); - H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *)&offset, NULL, (hsize_t *)&numparticles, NULL); + if (field_dump_flag.ex) + DUMP_FIELD_TO_HDF5("ex", ex, H5T_NATIVE_FLOAT); + if (field_dump_flag.ey) + DUMP_FIELD_TO_HDF5("ey", ey, H5T_NATIVE_FLOAT); + if (field_dump_flag.ez) + DUMP_FIELD_TO_HDF5("ez", ez, H5T_NATIVE_FLOAT); + if (field_dump_flag.div_e_err) + DUMP_FIELD_TO_HDF5("div_e_err", div_e_err, H5T_NATIVE_FLOAT); + + if (field_dump_flag.cbx) + DUMP_FIELD_TO_HDF5("cbx", cbx, H5T_NATIVE_FLOAT); + if (field_dump_flag.cby) + DUMP_FIELD_TO_HDF5("cby", cby, H5T_NATIVE_FLOAT); + if (field_dump_flag.cbz) + DUMP_FIELD_TO_HDF5("cbz", cbz, H5T_NATIVE_FLOAT); + if (field_dump_flag.div_b_err) + DUMP_FIELD_TO_HDF5("div_b_err", div_b_err, H5T_NATIVE_FLOAT); + + if (field_dump_flag.tcax) + DUMP_FIELD_TO_HDF5("tcax", tcax, H5T_NATIVE_FLOAT); + if (field_dump_flag.tcay) + DUMP_FIELD_TO_HDF5("tcay", tcay, H5T_NATIVE_FLOAT); + if (field_dump_flag.tcaz) + DUMP_FIELD_TO_HDF5("tcaz", tcaz, H5T_NATIVE_FLOAT); + if (field_dump_flag.rhob) + DUMP_FIELD_TO_HDF5("rhob", rhob, H5T_NATIVE_FLOAT); + + if (field_dump_flag.jfx) + DUMP_FIELD_TO_HDF5("jfx", jfx, H5T_NATIVE_FLOAT); + if (field_dump_flag.jfy) + DUMP_FIELD_TO_HDF5("jfy", jfy, H5T_NATIVE_FLOAT); + if (field_dump_flag.jfz) + DUMP_FIELD_TO_HDF5("jfz", jfz, H5T_NATIVE_FLOAT); + if (field_dump_flag.rhof) + DUMP_FIELD_TO_HDF5("rhof", rhof, H5T_NATIVE_FLOAT); + + //H5T_NATIVE_SHORT for material_id (typedef int16_t material_id) + if 
(field_dump_flag.ematx) + DUMP_FIELD_TO_HDF5("ematx", ematx, H5T_NATIVE_SHORT); + if (field_dump_flag.ematy) + DUMP_FIELD_TO_HDF5("ematy", ematy, H5T_NATIVE_SHORT); + if (field_dump_flag.ematz) + DUMP_FIELD_TO_HDF5("ematz", ematz, H5T_NATIVE_SHORT); + if (field_dump_flag.nmat) + DUMP_FIELD_TO_HDF5("nmat", nmat, H5T_NATIVE_SHORT); + + if (field_dump_flag.fmatx) + DUMP_FIELD_TO_HDF5("fmatx", fmatx, H5T_NATIVE_SHORT); + if (field_dump_flag.fmaty) + DUMP_FIELD_TO_HDF5("fmaty", fmaty, H5T_NATIVE_SHORT); + if (field_dump_flag.fmatz) + DUMP_FIELD_TO_HDF5("fmatz", fmatz, H5T_NATIVE_SHORT); + if (field_dump_flag.cmat) + DUMP_FIELD_TO_HDF5("cmat", cmat, H5T_NATIVE_SHORT); - hsize_t memspace_start = 0, memspace_stride = 8, memspace_count = np_local; - H5Sselect_hyperslab(memspace, H5S_SELECT_SET, &memspace_start, &memspace_stride, &memspace_count, NULL); +#endif - el1 = uptime() - el1; - io_log("Particle TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts - double el2 = uptime(); + H5D_mpio_actual_io_mode_t actual_io_mode; + H5Pget_mpio_actual_io_mode(plist_id, &actual_io_mode); + /* + + switch(actual_io_mode){ + case H5D_MPIO_NO_COLLECTIVE: + io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_NO_COLLECTIVE: "); + break; + case H5D_MPIO_CHUNK_INDEPENDENT: + io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_CHUNK_INDEPENDENT: "); + break; + case H5D_MPIO_CHUNK_COLLECTIVE: + io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_CHUNK_COLLECTIVE: "); + break; + case H5D_MPIO_CHUNK_MIXED: + io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_CHUNK_MIXED: "); + break; + case H5D_MPIO_CONTIGUOUS_COLLECTIVE: + io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_CONTIGUOUS_COLLECTIVE: "); + break; + default : + io_log("H5Pget_mpio_actual_io_mode: None returend: "); + break; + } + + H5D_mpio_actual_chunk_opt_mode_t actual_chunk_opt_mode; + H5Pget_mpio_actual_chunk_opt_mode(plist_id, &actual_chunk_opt_mode); + switch(actual_chunk_opt_mode){ + case H5D_MPIO_NO_CHUNK_OPTIMIZATION: + 
io_log("H5Pget_mpio_actual_chunk_opt_mode: H5D_MPIO_NO_CHUNK_OPTIMIZATION: "); + break; + case H5D_MPIO_MULTI_CHUNK: + io_log("H5Pget_mpio_actual_chunk_opt_mode: H5D_MPIO_MULTI_CHUNK: "); + break; + // case H5D_MPIO_MULTI_CHUNK_NO_OPT: + // io_log("H5Pget_mpio_actual_chunk_opt_mode: H5D_MPIO_MULTI_CHUNK_NO_OPT: "); + // break; + case H5D_MPIO_LINK_CHUNK: + io_log("H5Pget_mpio_actual_chunk_opt_mode: H5D_MPIO_LINK_CHUNK: "); + break; + default : + io_log("H5Pget_mpio_actual_chunk_opt_mode: None returend: "); + break; + } - // This point offset is silly, and loses the type safety (pf+1) - hid_t dset_id = H5Dcreate(group_id, "dX", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - int ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf); - H5Dclose(dset_id); + uint32_t local_no_collective_cause, global_no_collective_cause; + H5Pget_mpio_no_collective_cause(plist_id, &local_no_collective_cause, &global_no_collective_cause); + + switch(local_no_collective_cause){ + case H5D_MPIO_COLLECTIVE: + io_log("local_no_collective_cause: H5D_MPIO_COLLECTIVE: "); + break; + case H5D_MPIO_SET_INDEPENDENT: + io_log("local_no_collective_cause: H5D_MPIO_SET_INDEPENDENT: "); + break; + case H5D_MPIO_DATA_TRANSFORMS: + io_log("local_no_collective_cause: H5D_MPIO_DATA_TRANSFORMS: "); + break; + //case H5D_MPIO_SET_MPIPOSIX: + // io_log("local_no_collective_cause: H5D_MPIO_SET_MPIPOSIX: "); + // break; + case H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES: + io_log("local_no_collective_cause: H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES: "); + break; + //case H5D_MPIO_POINT_SELECTIONS: + // io_log("local_no_collective_cause: H5D_MPIO_POINT_SELECTIONS: "); + // break; + // case H5D_MPIO_FILTERS: + // io_log("local_no_collective_cause: H5D_MPIO_FILTERS: "); + // break; + default : + io_log("local_no_collective_cause: None returend: "); + break; + } + + + switch(global_no_collective_cause){ + case H5D_MPIO_COLLECTIVE: + io_log("global_no_collective_cause: 
H5D_MPIO_COLLECTIVE: "); + break; + case H5D_MPIO_SET_INDEPENDENT: + io_log("global_no_collective_cause: H5D_MPIO_SET_INDEPENDENT: "); + break; + case H5D_MPIO_DATA_TRANSFORMS: + io_log("global_no_collective_cause: H5D_MPIO_DATA_TRANSFORMS: "); + break; + //case H5D_MPIO_SET_MPIPOSIX: + // io_log("global_no_collective_cause: H5D_MPIO_SET_MPIPOSIX: "); + // break; + case H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES: + io_log("global_no_collective_cause: H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES: "); + break; + //case H5D_MPIO_POINT_SELECTIONS: + // io_log("global_no_collective_cause: H5D_MPIO_POINT_SELECTIONS: "); + // break; + // case H5D_MPIO_FILTERS: + // io_log("global_no_collective_cause: H5D_MPIO_FILTERS: "); + // break; + default : + io_log("global_no_collective_cause: None returend: "); + break; + } + */ + + el2 = uptime() - el2; + //io_log("TimeHDF5Write: " << el2 << " s"); + + double el3 = uptime(); + + /* + //Write metadata (geo original and geo dx/dy/dz) for ArrayUDF + float attr_data[2][3]; + attr_data[0][0] = grid->x0; + attr_data[0][1] = grid->y0; + attr_data[0][2] = grid->z0; + attr_data[1][0] = grid->dx; + attr_data[1][1] = grid->dy; + attr_data[1][2] = grid->dz; + hsize_t dims[2]; + dims[0] = 2; + dims[1] = 3; + if(!file_exist_flag){ + hid_t va_geo_dataspace_id = H5Screate_simple(2, dims, NULL); + hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); + H5Awrite(va_geo_attribute_id, H5T_NATIVE_FLOAT, attr_data); + H5Sclose(va_geo_dataspace_id); + H5Aclose(va_geo_attribute_id); + } + */ + free(temp_buf); + //if(!file_exist_flag) + H5Sclose(filespace); + H5Sclose(memspace); + H5Pclose(plist_id); + H5Gclose(group_id); + H5Fclose(file_id); + + el3 = uptime() - el3; + //io_log("TimeHDF5Close: " << el3 << " s"); + + if (mpi_rank == 0) + { + char const *output_xml_file = "./field_hdf5/hdf5_field.xdmf"; + char dimensions_3d[128]; + sprintf(dimensions_3d, "%lld %lld %lld", 
field_global_size[0], field_global_size[1], field_global_size[2]); + char dimensions_4d[128]; + sprintf(dimensions_4d, "%lld %lld %lld %d", field_global_size[0], field_global_size[1], field_global_size[2], 3); + char orignal[128]; + sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); + char dxdydz[128]; + sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); + + + // TODO: remove or let the user set + int field_interval = 1; + + // TODO: remove this dependence on number of steps + //std::cout << "num_step " << num_step << std::endl; + + int nframes = num_step / field_interval + 1; + static int field_tframe = 0; - dset_id = H5Dcreate(group_id, "dY", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 1); - H5Dclose(dset_id); +#ifdef DUMP_INFO_DEBUG + printf(" meta file : %s \n", output_xml_file); + printf(" array dims per var: %s \n", dimensions_3d); + printf("array dims all vars: %s \n", dimensions_4d); + printf(" orignal: %s \n", orignal); + printf(" dxdydz: %s \n", dxdydz); + printf(" nframes: %d \n", nframes); + printf(" field_interval: %d \n", field_interval); + printf(" current step: %zd \n", step_for_viou); + printf(" current step: %zd \n", step_for_viou); + + //printf(" Simulation time: %f \n", grid->t0); + printf(" tframe: %d \n", field_tframe); +#endif - dset_id = H5Dcreate(group_id, "dZ", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 2); - H5Dclose(dset_id); + // TODO: this footer dumping is more likely better done in a + // destructor, rather than hoping a multiple division works out + if (field_tframe >= 1) + { + if (field_tframe == (nframes - 1)) + { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 1); + } + else + { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, 
dimensions_4d, dimensions_3d, 0); + } + } + else + { + create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, field_interval); + if (field_tframe == (nframes - 1)) + { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 1); + } + else + { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 0); + } + } + field_tframe++; + } + } + /** + * @brief dump_particles to the HDF5 file + * Author: Bin Dong dbin@lbl.gov + * https://crd.lbl.gov/bin-dong + * Nov 2020 + * @param fbase + * @param sp + * @param grid + * @param step + * @param interpolator_array + * @param ftag + */ + void dump_particles( + const char *fbase, + species_t* sp, + grid_t* grid, + int step, + interpolator_array_t* interpolator_array, + int ftag + ) + { + static int file_index = 0; + file_index ++; + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + double dump_particles_uptime = uptime(); + time_t seconds = time(NULL); + // printf("Atrank = %d, file_index = %d, dump_particles_uptime = %f, epoch_seconds = %ld \n ", mpi_rank, file_index, dump_particles_uptime, seconds); + + + size_t step_for_viou = step; + char fname[256]; + char group_name[256]; + char particle_scratch[128]; + char subparticle_scratch[128]; + + int np_local; + + float *Pf; + int *Pi; + + // get the total number of particles. 
in this example, output only electrons + //sp = species_list; + sprintf(particle_scratch, DUMP_DIR_FORMAT, "particle_hdf5"); + FileUtils::makeDirectory(particle_scratch); + sprintf(subparticle_scratch, "%s/T.%ld/", particle_scratch, step_for_viou); + FileUtils::makeDirectory(subparticle_scratch); + + // TODO: Allow the user to set this + int stride_particle_dump = 1; + + np_local = (sp->np + stride_particle_dump - 1) / stride_particle_dump; + + // make a copy of the part of particle data to be dumped + double ec1 = uptime(); + + int sp_np = sp->np; + int sp_max_np = sp->max_np; + particle_t *ALIGNED(128) p_buf = NULL; + if (!p_buf) + MALLOC_ALIGNED(p_buf, np_local, 128); + particle_t *sp_p = sp->p; + sp->p = p_buf; + sp->np = np_local; + sp->max_np = np_local; + + for (long long iptl = 0, i = 0; iptl < sp_np; iptl += stride_particle_dump, ++i) + { + COPY(&sp->p[i], &sp_p[iptl], 1); + } - // TODO: should we add the ability to chunk the particle write? + center_p(sp, interpolator_array); -#ifdef OUTPUT_CONVERT_GLOBAL_ID + ec1 = uptime() - ec1; - // TODO: make a function out of this too, its used in openpmd - std::vector global_pi; - global_pi.resize(numparticles); - // TODO: this could be parallel - for (int i = 0; i < numparticles; i++) - { - int local_i = sp->p[i].i; - int ix, iy, iz, rx, ry, rz; + //if(!mpi_rank || mpi_rank == 2047 ) + // std::cout << "on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local << std::endl; - // Convert rank to local x/y/z - UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); +#ifndef N_FILE_N_PROCESS + int np_local_max, np_local_min; + MPI_Reduce(&np_local, &np_local_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD); + MPI_Reduce(&np_local, &np_local_min, 1, MPI_INT, MPI_MIN, 0, MPI_COMM_WORLD); + //io_log("on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local << ",np_local_max = " << np_local_max << ", local_min = 
"<< np_local_min); +#endif - // Calculate local ix/iy/iz - UNVOXEL(local_i, ix, iy, iz, grid->nx+2, grid->ny+2, grid->nz+2); + Pf = (float *)sp->p; + Pi = (int *)sp->p; - // Convert ix/iy/iz to global - int gix = ix + (grid->nx * (rx)); - int giy = iy + (grid->ny * (ry)); - int giz = iz + (grid->nz * (rz)); + // open HDF5 file in "particle/T./" subdirectory + // filename: eparticle.h5p +#ifndef N_FILE_N_PROCESS + sprintf(fname, "%s/%s_%ld.h5", subparticle_scratch, sp->name, step_for_viou); +#else + sprintf(fname, "%s/%s_%ld_p%d.h5", subparticle_scratch, sp->name, step_for_viou, mpi_rank); +#endif - // calculate global grid sizes - int gnx = grid->nx * grid->gpx; - int gny = grid->ny * grid->gpy; - int gnz = grid->nz * grid->gpz; + sprintf(group_name, "/Timestep_%ld", step_for_viou); + double el1 = uptime(); - // TODO: find a better way to account for the hard coded ghosts in VOXEL - int global_i = VOXEL(gix, giy, giz, gnx-2, gny-2, gnz-2); - //std::cout << rank << " local i " << local_i << " becomes " << global_i << std::endl; - global_pi[i] = global_i; - } - dset_id = H5Dcreate(group_id, "i", H5T_NATIVE_INT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, linearspace, filespace, plist_id, global_pi.data()); - H5Dclose(dset_id); + long long total_particles, offset; + long long numparticles = np_local; +#ifndef N_FILE_N_PROCESS + MPI_Allreduce(&numparticles, &total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + MPI_Scan(&numparticles, &offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + offset -= numparticles; #else - dset_id = H5Dcreate(group_id, "i", H5T_NATIVE_INT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_INT, memspace, filespace, plist_id, Pi + 3); - H5Dclose(dset_id); + total_particles = np_local; + offset = 0; #endif - dset_id = H5Dcreate(group_id, "Ux", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, 
H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 4); - H5Dclose(dset_id); - - dset_id = H5Dcreate(group_id, "Uy", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 5); - H5Dclose(dset_id); - - dset_id = H5Dcreate(group_id, "Uz", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 6); - H5Dclose(dset_id); - - dset_id = H5Dcreate(group_id, "q", H5T_NATIVE_FLOAT, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(dset_id, H5T_NATIVE_FLOAT, memspace, filespace, plist_id, Pf + 7); - H5Dclose(dset_id); - - el2 = uptime() - el2; - io_log("Particle TimeHDF5Write: " << el2 << " s"); - - double el3 = uptime(); - H5Sclose(memspace); - H5Sclose(filespace); - H5Pclose(plist_id); - H5Gclose(group_id); - H5Fclose(file_id); - el3 = uptime() - el3; - io_log("Particle TimeHDF5Close: " << el3 << " s"); - - sp->p = sp_p; - sp->np = sp_np; - sp->max_np = sp_max_np; - FREE_ALIGNED(p_buf); - - // Write metadata if step() == 0 - char meta_fname[256]; - - sprintf(meta_fname, "%s/grid_metadata_%s_%ld.h5", subparticle_scratch, sp->name, step_for_viou); - - double meta_el1 = uptime(); - - hid_t meta_plist_id = H5Pcreate(H5P_FILE_ACCESS); - H5Pset_fapl_mpio(meta_plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); - hid_t meta_file_id = H5Fcreate(meta_fname, H5F_ACC_TRUNC, H5P_DEFAULT, meta_plist_id); - hid_t meta_group_id = H5Gcreate(meta_file_id, group_name, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - H5Pclose(meta_plist_id); - - long long meta_total_particles, meta_offset; - long long meta_numparticles = 1; - MPI_Allreduce(&meta_numparticles, &meta_total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - MPI_Scan(&meta_numparticles, &meta_offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - meta_offset -= meta_numparticles; - - hid_t meta_filespace = H5Screate_simple(1, (hsize_t 
*)&meta_total_particles, NULL); - hid_t meta_memspace = H5Screate_simple(1, (hsize_t *)&meta_numparticles, NULL); - meta_plist_id = H5Pcreate(H5P_DATASET_XFER); - H5Pset_dxpl_mpio(meta_plist_id, H5FD_MPIO_COLLECTIVE); - H5Sselect_hyperslab(meta_filespace, H5S_SELECT_SET, (hsize_t *)&meta_offset, NULL, (hsize_t *)&meta_numparticles, NULL); - meta_el1 = uptime() - meta_el1; - io_log("Metafile TimeHDF5Open): " << meta_el1 << " s"); //Easy to handle results for scripts - - double meta_el2 = uptime(); - - hid_t meta_dset_id = H5Dcreate(meta_group_id, "np_local", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, (int32_t *)&np_local); - H5Dclose(meta_dset_id); - //if (rank == 0) printf ("Written variable dX \n"); - - meta_dset_id = H5Dcreate(meta_group_id, "nx", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, &grid->nx); - H5Dclose(meta_dset_id); - //if (rank == 0) printf ("Written variable dY \n"); - - meta_dset_id = H5Dcreate(meta_group_id, "ny", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, &grid->ny); - H5Dclose(meta_dset_id); - //if (rank == 0) printf ("Written variable dZ \n"); - - meta_dset_id = H5Dcreate(meta_group_id, "nz", H5T_NATIVE_INT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_INT, meta_memspace, meta_filespace, meta_plist_id, &grid->nz); - H5Dclose(meta_dset_id); - //if (rank == 0) printf ("Written variable i \n"); - - meta_dset_id = H5Dcreate(meta_group_id, "x0", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->x0); - 
H5Dclose(meta_dset_id); - - meta_dset_id = H5Dcreate(meta_group_id, "y0", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->y0); - H5Dclose(meta_dset_id); - - meta_dset_id = H5Dcreate(meta_group_id, "z0", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->z0); - H5Dclose(meta_dset_id); - - meta_dset_id = H5Dcreate(meta_group_id, "dx", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->dx); - H5Dclose(meta_dset_id); - - meta_dset_id = H5Dcreate(meta_group_id, "dy", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->dy); - H5Dclose(meta_dset_id); - - meta_dset_id = H5Dcreate(meta_group_id, "dz", H5T_NATIVE_FLOAT, meta_filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - ierr = H5Dwrite(meta_dset_id, H5T_NATIVE_FLOAT, meta_memspace, meta_filespace, meta_plist_id, &grid->dz); - H5Dclose(meta_dset_id); - - meta_el2 = uptime() - meta_el2; - io_log("Metafile TimeHDF5Write: " << meta_el2 << " s"); - double meta_el3 = uptime(); - H5Sclose(meta_memspace); - H5Sclose(meta_filespace); - H5Pclose(meta_plist_id); - H5Gclose(meta_group_id); - H5Fclose(meta_file_id); - meta_el3 = uptime() - meta_el3; - io_log("Metafile TimeHDF5Close: " << meta_el3 << " s"); - } + hid_t file_plist_id = H5Pcreate(H5P_FILE_ACCESS); - void dump_hydro( - const char *fbase, - int step, - hydro_array_t* hydro_array, - species_t* sp, - interpolator_array_t* interpolator_array, - grid_t* grid, - int ftag - ) - { - size_t step_for_viou = step; +#ifndef N_FILE_N_PROCESS + H5Pset_fapl_mpio(file_plist_id, 
MPI_COMM_WORLD, MPI_INFO_NULL); +#endif -#define DUMP_HYDRO_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ - { \ - dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ - temp_buf_index = 0; \ - for (size_t i(1); i < grid->nx + 1; i++) \ - { \ - for (size_t j(1); j < grid->ny + 1; j++) \ - { \ - for (size_t k(1); k < grid->nz + 1; k++) \ - { \ - temp_buf[temp_buf_index] = _hydro(i, j, k).ATTRIBUTE_NAME; \ - temp_buf_index = temp_buf_index + 1; \ - } \ - } \ - } \ - dataspace_id = H5Dget_space(dset_id); \ - H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); \ - H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, temp_buf); \ - H5Sclose(dataspace_id); \ - H5Dclose(dset_id); \ - } - //#define DUMP_INFO_DEBUG 1 - int mpi_size, mpi_rank; - MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); +#ifdef H5_ASYNC + if(!mpi_rank) printf("Enable async on particle data"); - if (!sp) - { - ERROR(("Invalid species")); - } + assert(H5Pset_vol_async(file_plist_id)); +#endif - clear_hydro_array(hydro_array); - accumulate_hydro_p(hydro_array, sp, interpolator_array); - synchronize_hydro_array(hydro_array); + hid_t file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, file_plist_id); + //if(!mpi_rank ) + //io_log("++Particle H5Fcreate) "); - char hname[256]; - char hydro_scratch[128]; - char subhydro_scratch[128]; - sprintf(hydro_scratch, "./%s", "hydro_hdf5"); - FileUtils::makeDirectory(hydro_scratch); - sprintf(subhydro_scratch, "%s/T.%zu/", hydro_scratch, step_for_viou); - FileUtils::makeDirectory(subhydro_scratch); + hid_t group_id = H5Gcreate(file_id, group_name, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + //if(!mpi_rank ) + //io_log("++Particle H5Gcreate) "); - sprintf(hname, "%s/hydro_%s_%zu.h5", subhydro_scratch, sp->name, step_for_viou); - double el1 = uptime(); - hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); - 
H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); - hid_t file_id = H5Fcreate(hname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); - H5Pclose(plist_id); +#ifdef HAS_PARTICLE_COMP + if(!mpi_rank) + printf("Using Partilce Compund type !\n"); + hid_t particle_comp_type_it = H5Tcreate(H5T_COMPOUND, sizeof(particle_t)); + H5Tinsert(particle_comp_type_it, "dx", HOFFSET(particle_t, dx), H5T_NATIVE_FLOAT); + H5Tinsert(particle_comp_type_it, "dy", HOFFSET(particle_t, dy), H5T_NATIVE_FLOAT); + H5Tinsert(particle_comp_type_it, "dz", HOFFSET(particle_t, dz), H5T_NATIVE_FLOAT); - sprintf(hname, "Timestep_%zu", step_for_viou); - hid_t group_id = H5Gcreate(file_id, hname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + H5Tinsert(particle_comp_type_it, "i", HOFFSET(particle_t, i), H5T_NATIVE_INT); - el1 = uptime() - el1; - io_log("TimeHDF5Open: " << el1 << " s"); //Easy to handle results for scripts - double el2 = uptime(); + H5Tinsert(particle_comp_type_it, "ux", HOFFSET(particle_t, ux), H5T_NATIVE_FLOAT); + H5Tinsert(particle_comp_type_it, "uy", HOFFSET(particle_t, uy), H5T_NATIVE_FLOAT); + H5Tinsert(particle_comp_type_it, "uz", HOFFSET(particle_t, uz), H5T_NATIVE_FLOAT); + H5Tinsert(particle_comp_type_it, "w", HOFFSET(particle_t, w), H5T_NATIVE_FLOAT); +#endif - // Create a variable list of field values to output. 
- //size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); - //size_t *varlist = new size_t[numvars]; + hid_t filespace = H5Screate_simple(1, (hsize_t *)&total_particles, NULL); + H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *)&offset, NULL, (hsize_t *)&numparticles, NULL); - //for (size_t i(0), c(0); i < total_field_variables; i++) - // if (global->fdParams.output_vars.bitset(i)) - // varlist[c++] = i; + //if(!mpi_rank ) + //io_log("++Particle H5Sselect_hyperslab) "); - //printf("\nBEGIN_OUTPUT: numvars = %zu \n", numvars); + //plist_id = H5P_DEFAULT; + hid_t io_plist_id = H5Pcreate(H5P_DATASET_XFER); +#ifndef N_FILE_N_PROCESS +#ifdef HAS_INDEPENDENT_IO + if(!mpi_rank) { + printf("\n ###\n VPIC Independent I/O! \n ###\n"); + } + H5Pset_dxpl_mpio(io_plist_id, H5FD_MPIO_INDEPENDENT); +#else + H5Pset_dxpl_mpio(io_plist_id, H5FD_MPIO_COLLECTIVE); +#endif +#endif - //typedef struct hydro { - // float jx, jy, jz, rho; // Current and charge density => , - // float px, py, pz, ke; // Momentum and K.E. 
density => , - // float txx, tyy, tzz; // Stress diagonal => , i==j - // float tyz, tzx, txy; // Stress off-diagonal => , i!=j - // float _pad[2]; // 16-byte align - //} hydro_t; +#ifdef H5_ASYNC + H5Pset_dxpl_async(io_plist_id, true); +#endif + hsize_t linearspace_count_temp = numparticles; + hid_t linearspace = H5Screate_simple(1, &linearspace_count_temp, NULL); + + hsize_t memspace_count_temp; + hid_t memspace; +#ifdef HAS_PARTICLE_COMP + memspace_count_temp = numparticles ; + memspace = H5Screate_simple(1, &memspace_count_temp, NULL); +#else + memspace_count_temp = numparticles * 8; + memspace = H5Screate_simple(1, &memspace_count_temp, NULL); + hsize_t memspace_start = 0, memspace_stride = 8, memspace_count = np_local; + H5Sselect_hyperslab(memspace, H5S_SELECT_SET, &memspace_start, &memspace_stride, &memspace_count, NULL); +#endif + el1 = uptime() - el1; + //if(!mpi_rank || mpi_rank == 2047 ) + //io_log("Particle TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts + double el2 = uptime(); + int ierr; + +#define WRITE_H5_FILE(group_id_p, data_buf_p, type_p, dname_p){\ + hid_t dset_id = H5Dcreate(group_id_p, dname_p, type_p, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ + H5Dwrite(dset_id, type_p, memspace, filespace, io_plist_id, data_buf_p);\ + H5Dclose(dset_id);\ +} + + + //MPI_Info_set(info, "romio_cb_write", "disable"); +#define WRITE_MPI_FILE(dname_p, offset_p, data_buf_p, count_p, type_p){\ + MPI_File fh;\ + MPI_Status status;\ + sprintf(fname, "%s/%s_%ld_%s.h5", subparticle_scratch, sp->name, step_for_viou, dname_p);\ + if(mpi_rank == 0) printf("fname= %s \n", fname);\ + MPI_Info info;\ + MPI_Info_create(&info);\ + MPI_File_open(MPI_COMM_WORLD, fname, MPI_MODE_WRONLY | MPI_MODE_CREATE, info, &fh);\ + MPI_File_write_at(fh, offset_p, data_buf_p, count_p,type_p, &status);\ + MPI_Info_free(&info);\ + MPI_File_close(&fh);\ +} + +#ifdef HAS_PARTICLE_COMP + hid_t dset_id = H5Dcreate(group_id, "particle", particle_comp_type_it, 
filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + H5Dwrite(dset_id, particle_comp_type_it, memspace, filespace, io_plist_id, sp->p); + H5Dclose(dset_id); +#else +#ifdef TEST_MPIIO + //Here we don't use the stripe but just for performance test + if(!mpi_rank) printf("Test MPI-IO\n"); + WRITE_MPI_FILE("dX", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + WRITE_MPI_FILE("dY", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + WRITE_MPI_FILE("dZ", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + WRITE_MPI_FILE("i", offset * sizeof(int), Pf, numparticles, MPI_INT); + WRITE_MPI_FILE("ux", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + WRITE_MPI_FILE("uy", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + WRITE_MPI_FILE("uz", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + WRITE_MPI_FILE("q", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); +#else +#ifndef N_FILE_N_PROCESS + if(!mpi_rank) printf("Test HDF5-IO Single \n"); +#else + if(!mpi_rank) printf("Test HDF5-IO N Files N Process\n"); +#endif + //if(!mpi_rank ) + //io_log("++Particle Starting to write ) "); + WRITE_H5_FILE(group_id, Pf, H5T_NATIVE_FLOAT, "dX") + WRITE_H5_FILE(group_id, Pf+1, H5T_NATIVE_FLOAT, "dY") + WRITE_H5_FILE(group_id, Pf+2, H5T_NATIVE_FLOAT, "dZ") + WRITE_H5_FILE(group_id, Pi+3, H5T_NATIVE_INT, "i") + WRITE_H5_FILE(group_id, Pf+4, H5T_NATIVE_FLOAT, "ux") + WRITE_H5_FILE(group_id, Pf+5, H5T_NATIVE_FLOAT, "uy") + WRITE_H5_FILE(group_id, Pf+6, H5T_NATIVE_FLOAT, "uz") + WRITE_H5_FILE(group_id, Pf+7, H5T_NATIVE_FLOAT, "q") +#endif +#endif + el2 = uptime() - el2; + //io_log("Particle TimeHDF5Write: " << el2 << " s"); - //typedef struct hydro_array { - // hydro_t * ALIGNED(128) h; - // grid_t * g; - //} hydro_array_t; + double el3 = uptime(); + H5Sclose(memspace); + H5Sclose(filespace); + H5Pclose(file_plist_id); + H5Pclose(io_plist_id); + H5Gclose(group_id); - float *temp_buf = (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); - 
hsize_t temp_buf_index; - hid_t dset_id; - //char *field_var_name[] = {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; - plist_id = H5Pcreate(H5P_DATASET_XFER); - //Comment out for test only - H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); - //H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, (hsize_t *) &numparticles, NULL); - //global->topology_x - hsize_t hydro_global_size[3], hydro_local_size[3], global_offset[3], global_count[3]; - hydro_global_size[0] = (grid->nx * grid->gpx); - hydro_global_size[1] = (grid->ny * grid->gpy); - hydro_global_size[2] = (grid->nz * grid->gpz); + H5Fclose(file_id); - hydro_local_size[0] = grid->nx; - hydro_local_size[1] = grid->ny; - hydro_local_size[2] = grid->nz; +#ifdef H5_ASYNC + H5VLasync_finalize(); +#endif + el3 = uptime() - el3; + //io_log("Particle TimeHDF5Close: " << el3 << " s"); - int mpi_rank_x, mpi_rank_y, mpi_rank_z; - UNVOXEL(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z, grid->gpx, grid->gpy, grid->gpz); + } - global_offset[0] = (grid->nx) * mpi_rank_x; - global_offset[1] = (grid->ny) * mpi_rank_y; - global_offset[2] = (grid->nz) * mpi_rank_z; +/** + * @brief Dump hydro data to the HDf5 file + * Author: Bin Dong dbin@lbl.gov + * https://crd.lbl.gov/bin-dong + * Nov 2020 + * @param fbase + * @param step + * @param hydro_array + * @param sp + * @param interpolator_array + * @param grid + * @param ftag + */ +void dump_hydro( + const char *fbase, + int step, + hydro_array_t* hydro_array, + species_t* sp, + interpolator_array_t* interpolator_array, + grid_t* grid, + int ftag + ) +{ + size_t step_for_viou = step; - global_count[0] = (grid->nx); - global_count[1] = (grid->ny); - global_count[2] = (grid->nz); +#define DUMP_HYDRO_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ + { \ + dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ + temp_buf_index = 0; \ + for (size_t 
i(1); i < grid->nx + 1; i++) \ + { \ + for (size_t j(1); j < grid->ny + 1; j++) \ + { \ + for (size_t k(1); k < grid->nz + 1; k++) \ + { \ + temp_buf[temp_buf_index] = _hydro(i, j, k).ATTRIBUTE_NAME; \ + temp_buf_index = temp_buf_index + 1; \ + } \ + } \ + } \ + dataspace_id = H5Dget_space(dset_id); \ + H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); \ + H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, temp_buf); \ + H5Sclose(dataspace_id); \ + H5Dclose(dset_id); \ + } + //#define DUMP_INFO_DEBUG 1 + int mpi_size, mpi_rank; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + + if (!sp) + { + ERROR(("Invalid species")); + } + + clear_hydro_array(hydro_array); + accumulate_hydro_p(hydro_array, sp, interpolator_array); + synchronize_hydro_array(hydro_array); + + char hname[256]; + char hydro_scratch[128]; + char subhydro_scratch[128]; + + sprintf(hydro_scratch, "./%s", "hydro_hdf5"); + FileUtils::makeDirectory(hydro_scratch); + sprintf(subhydro_scratch, "%s/T.%zu/", hydro_scratch, step_for_viou); + FileUtils::makeDirectory(subhydro_scratch); + + sprintf(hname, "%s/hydro_%s_%zu.h5", subhydro_scratch, sp->name, step_for_viou); + double el1 = uptime(); + hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); + + /* + if(H5Pset_libver_bounds(plist_id, H5F_LIBVER_LATEST, H5F_LIBVER_LATEST) < 0){ + exit(-1); + }*/ + //if((fid = H5Fcreate(FILENAME, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id)) < 0) + // ERROR_RETURN; + + H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); + hid_t file_id = H5Fcreate(hname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); + H5Pclose(plist_id); + + sprintf(hname, "Timestep_%zu", step_for_viou); + hid_t group_id = H5Gcreate(file_id, hname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + + el1 = uptime() - el1; + //io_log("TimeHDF5Open: " << el1 << " s"); //Easy to handle results for scripts + double el2 = uptime(); + + // Create a variable list of field values to output. 
+ //size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); + //size_t *varlist = new size_t[numvars]; + + //for (size_t i(0), c(0); i < total_field_variables; i++) + // if (global->fdParams.output_vars.bitset(i)) + // varlist[c++] = i; + + //printf("\nBEGIN_OUTPUT: numvars = %zu \n", numvars); + + + //typedef struct hydro { + // float jx, jy, jz, rho; // Current and charge density => , + // float px, py, pz, ke; // Momentum and K.E. density => , + // float txx, tyy, tzz; // Stress diagonal => , i==j + // float tyz, tzx, txy; // Stress off-diagonal => , i!=j + // float _pad[2]; // 16-byte align + //} hydro_t; +#ifdef HAS_HYDRO_COMP + //if(!mpi_rank) + //printf("Using Field Compund type !\n"); + hid_t hydro_comp_type_it = H5Tcreate (H5T_COMPOUND, sizeof(hydro_t)); + H5Tinsert(hydro_comp_type_it, "jx", HOFFSET(hydro_t, jx), H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "jy", HOFFSET(hydro_t, jy), H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "jz", HOFFSET(hydro_t, jz), H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "rho", HOFFSET(hydro_t, rho), H5T_NATIVE_FLOAT); + + H5Tinsert(hydro_comp_type_it, "px", HOFFSET(hydro_t, px), H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "py", HOFFSET(hydro_t, py), H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "pz", HOFFSET(hydro_t, pz), H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "ke", HOFFSET(hydro_t, ke), H5T_NATIVE_FLOAT); + + H5Tinsert(hydro_comp_type_it, "txx", HOFFSET(hydro_t, txx), H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "tyy", HOFFSET(hydro_t, tyy), H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "tzz", HOFFSET(hydro_t, tzz), H5T_NATIVE_FLOAT); + + H5Tinsert(hydro_comp_type_it, "tyz", HOFFSET(hydro_t, tyz), H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "tzx", HOFFSET(hydro_t, tzx), H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "txy", HOFFSET(hydro_t, txy), H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "pad", HOFFSET(hydro_t, _pad), 
H5T_NATIVE_DOUBLE); +#endif + //typedef struct hydro_array { + // hydro_t * ALIGNED(128) h; + // grid_t * g; + //} hydro_array_t; + + float *temp_buf = (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); + hsize_t temp_buf_index; + hid_t dset_id; + //char *field_var_name[] = {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; + plist_id = H5Pcreate(H5P_DATASET_XFER); + //Comment out for test only + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); + //H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, (hsize_t *) &numparticles, NULL); + + //global->topology_x + + hsize_t hydro_global_size[3], hydro_local_size[3], global_offset[3], global_count[3]; + hydro_global_size[0] = (grid->nx * grid->gpx); + hydro_global_size[1] = (grid->ny * grid->gpy); + hydro_global_size[2] = (grid->nz * grid->gpz); + + hydro_local_size[0] = grid->nx; + hydro_local_size[1] = grid->ny; + hydro_local_size[2] = grid->nz; + + int mpi_rank_x, mpi_rank_y, mpi_rank_z; + UNVOXEL(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z, grid->gpx, grid->gpy, grid->gpz); + + global_offset[0] = (grid->nx) * mpi_rank_x; + global_offset[1] = (grid->ny) * mpi_rank_y; + global_offset[2] = (grid->nz) * mpi_rank_z; + + global_count[0] = (grid->nx); + global_count[1] = (grid->ny); + global_count[2] = (grid->nz); #ifdef DUMP_INFO_DEBUG - printf("global size = %llu %llu %llu \n", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); - printf("global_offset = %llu %llu %llu \n", global_offset[0], global_offset[1], global_offset[2]); - printf("global_count = %llu %llu %llu \n", global_count[0], global_count[1], global_count[2]); - printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); - fflush(stdout); + printf("global size = %llu %llu %llu \n", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); + printf("global_offset = %llu %llu %llu \n", 
global_offset[0], global_offset[1], global_offset[2]); + printf("global_count = %llu %llu %llu \n", global_count[0], global_count[1], global_count[2]); + printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + fflush(stdout); #endif - hid_t filespace = H5Screate_simple(3, hydro_global_size, NULL); - hid_t memspace = H5Screate_simple(3, hydro_local_size, NULL); - hid_t dataspace_id; - - //typedef struct hydro { - // float jx, jy, jz, rho; // Current and charge density => , - // float px, py, pz, ke; // Momentum and K.E. density => , - // float txx, tyy, tzz; // Stress diagonal => , i==j - // float tyz, tzx, txy; // Stress off-diagonal => , i!=j - // float _pad[2]; // 16-byte align - //} hydro_t; - - if (hydro_dump_flag.jx) - DUMP_HYDRO_TO_HDF5("jx", jx, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.jy) - DUMP_HYDRO_TO_HDF5("jy", jy, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.jz) - DUMP_HYDRO_TO_HDF5("jz", jz, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.rho) - DUMP_HYDRO_TO_HDF5("rho", rho, H5T_NATIVE_FLOAT); - - if (hydro_dump_flag.px) - DUMP_HYDRO_TO_HDF5("px", px, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.py) - DUMP_HYDRO_TO_HDF5("py", py, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.pz) - DUMP_HYDRO_TO_HDF5("pz", pz, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.ke) - DUMP_HYDRO_TO_HDF5("ke", ke, H5T_NATIVE_FLOAT); - - if (hydro_dump_flag.txx) - DUMP_HYDRO_TO_HDF5("txx", txx, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.tyy) - DUMP_HYDRO_TO_HDF5("tyy", tyy, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.tzz) - DUMP_HYDRO_TO_HDF5("tzz", tzz, H5T_NATIVE_FLOAT); - - if (hydro_dump_flag.tyz) - DUMP_HYDRO_TO_HDF5("tyz", tyz, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.tzx) - DUMP_HYDRO_TO_HDF5("tzx", tzx, H5T_NATIVE_FLOAT); - if (hydro_dump_flag.txy) - DUMP_HYDRO_TO_HDF5("txy", txy, H5T_NATIVE_FLOAT); - - el2 = uptime() - el2; - io_log("TimeHDF5Write: " << el2 << " s"); - - double el3 = uptime(); - - //Write metadata (geo original and geo dx/dy/dz) for 
ArrayUDF - float attr_data[2][3]; - attr_data[0][0] = grid->x0; - attr_data[0][1] = grid->y0; - attr_data[0][2] = grid->z0; - attr_data[1][0] = grid->dx; - attr_data[1][1] = grid->dy; - attr_data[1][2] = grid->dz; - hsize_t dims[2]; - dims[0] = 2; - dims[1] = 3; - hid_t va_geo_dataspace_id = H5Screate_simple(2, dims, NULL); - hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); - H5Awrite(va_geo_attribute_id, H5T_NATIVE_FLOAT, attr_data); - H5Sclose(va_geo_dataspace_id); - H5Aclose(va_geo_attribute_id); - - free(temp_buf); - H5Sclose(filespace); - H5Sclose(memspace); - H5Pclose(plist_id); - H5Gclose(group_id); - H5Fclose(file_id); - - el3 = uptime() - el3; - io_log("TimeHDF5Close: " << el3 << " s"); - - if (mpi_rank == 0) - { - char output_xml_file[128]; - sprintf(output_xml_file, "./%s/%s%s%s", "hydro_hdf5", "hydro-", sp->name, ".xdmf"); - char dimensions_3d[128]; - sprintf(dimensions_3d, "%lld %lld %lld", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); - char dimensions_4d[128]; - sprintf(dimensions_4d, "%lld %lld %lld %d", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2], 3); - char orignal[128]; - sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); - char dxdydz[128]; - sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); - - // TODO: remove or let user set - int hydro_interval = 1; + hid_t filespace = H5Screate_simple(3, hydro_global_size, NULL); + hid_t memspace = H5Screate_simple(3, hydro_local_size, NULL); + hid_t dataspace_id; + + //typedef struct hydro { + // float jx, jy, jz, rho; // Current and charge density => , + // float px, py, pz, ke; // Momentum and K.E. 
density => , + // float txx, tyy, tzz; // Stress diagonal => , i==j + // float tyz, tzx, txy; // Stress off-diagonal => , i!=j + // float _pad[2]; // 16-byte align + //} hydro_t; + + +#ifdef HAS_HYDRO_COMP + hydro_t *hydro_buf = (hydro_t *)malloc(sizeof(hydro_t) * (grid->nx) * (grid->ny) * (grid->nz)); + temp_buf_index = 0; + for (size_t i(1); i < grid->nx + 1; i++){ + for (size_t j(1); j < grid->ny + 1; j++){ + for (size_t k(1); k < grid->nz + 1; k++){ + hydro_buf[temp_buf_index] = _hydro(i, j, k); + temp_buf_index = temp_buf_index + 1; + } + } + } + dset_id = H5Dcreate(group_id, "hydro", hydro_comp_type_it, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + + dataspace_id = H5Dget_space(dset_id); + H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); + H5Dwrite(dset_id, hydro_comp_type_it, memspace, dataspace_id, plist_id, hydro_buf); + free(hydro_buf); + H5Sclose(dataspace_id); + H5Dclose(dset_id); + H5Tclose(hydro_comp_type_it); +#else - // TODO: remove this dependence on number of steps - int nframes = num_step / hydro_interval + 1; + if (hydro_dump_flag.jx) + DUMP_HYDRO_TO_HDF5("jx", jx, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.jy) + DUMP_HYDRO_TO_HDF5("jy", jy, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.jz) + DUMP_HYDRO_TO_HDF5("jz", jz, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.rho) + DUMP_HYDRO_TO_HDF5("rho", rho, H5T_NATIVE_FLOAT); + + if (hydro_dump_flag.px) + DUMP_HYDRO_TO_HDF5("px", px, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.py) + DUMP_HYDRO_TO_HDF5("py", py, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.pz) + DUMP_HYDRO_TO_HDF5("pz", pz, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.ke) + DUMP_HYDRO_TO_HDF5("ke", ke, H5T_NATIVE_FLOAT); + + if (hydro_dump_flag.txx) + DUMP_HYDRO_TO_HDF5("txx", txx, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.tyy) + DUMP_HYDRO_TO_HDF5("tyy", tyy, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.tzz) + DUMP_HYDRO_TO_HDF5("tzz", tzz, H5T_NATIVE_FLOAT); + + if (hydro_dump_flag.tyz) + 
DUMP_HYDRO_TO_HDF5("tyz", tyz, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.tzx) + DUMP_HYDRO_TO_HDF5("tzx", tzx, H5T_NATIVE_FLOAT); + if (hydro_dump_flag.txy) + DUMP_HYDRO_TO_HDF5("txy", txy, H5T_NATIVE_FLOAT); + + el2 = uptime() - el2; + //io_log("TimeHDF5Write: " << el2 << " s"); - const int tframe = tframe_map[sp->id]; +#endif + double el3 = uptime(); + + //Write metadata (geo original and geo dx/dy/dz) for ArrayUDF + /* + float attr_data[2][3]; + attr_data[0][0] = grid->x0; + attr_data[0][1] = grid->y0; + attr_data[0][2] = grid->z0; + attr_data[1][0] = grid->dx; + attr_data[1][1] = grid->dy; + attr_data[1][2] = grid->dz; + hsize_t dims[2]; + dims[0] = 2; + dims[1] = 3; + hid_t va_geo_dataspace_id = H5Screate_simple(2, dims, NULL); + hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); + H5Awrite(va_geo_attribute_id, H5T_NATIVE_FLOAT, attr_data); + H5Sclose(va_geo_dataspace_id); + H5Aclose(va_geo_attribute_id);*/ + + free(temp_buf); + H5Sclose(filespace); + H5Sclose(memspace); + H5Pclose(plist_id); + H5Gclose(group_id); + H5Fclose(file_id); + + el3 = uptime() - el3; + //io_log("TimeHDF5Close: " << el3 << " s"); + + if (mpi_rank == 0) + { + char output_xml_file[128]; + sprintf(output_xml_file, "./%s/%s%s%s", "hydro_hdf5", "hydro-", sp->name, ".xdmf"); + char dimensions_3d[128]; + sprintf(dimensions_3d, "%lld %lld %lld", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); + char dimensions_4d[128]; + sprintf(dimensions_4d, "%lld %lld %lld %d", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2], 3); + char orignal[128]; + sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); + char dxdydz[128]; + sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); + + // TODO: remove or let user set + int hydro_interval = 1; + + // TODO: remove this dependence on number of steps + int nframes = num_step / hydro_interval + 1; + + const int tframe = 
tframe_map[sp->id]; #ifdef DUMP_INFO_DEBUG - printf(" meta file : %s \n", output_xml_file); - printf(" array dims per var: %s \n", dimensions_3d); - printf("array dims all vars: %s \n", dimensions_4d); - printf(" orignal: %s \n", orignal); - printf(" dxdydz: %s \n", dxdydz); - printf(" nframes: %d \n", nframes); - printf(" hydro_fields_interval: %d \n", hydro_interval); - printf(" current step: %zu \n", step_for_viou); - printf(" Simulation time: %f \n", grid->t0); - printf(" tframe: %d \n", tframe); + printf(" meta file : %s \n", output_xml_file); + printf(" array dims per var: %s \n", dimensions_3d); + printf("array dims all vars: %s \n", dimensions_4d); + printf(" orignal: %s \n", orignal); + printf(" dxdydz: %s \n", dxdydz); + printf(" nframes: %d \n", nframes); + printf(" hydro_fields_interval: %d \n", hydro_interval); + printf(" current step: %zu \n", step_for_viou); + printf(" Simulation time: %f \n", grid->t0); + printf(" tframe: %d \n", tframe); #endif - // TODO: why doesnt this just use the cstr? - char speciesname_new[128]; - sprintf(speciesname_new, "hydro_%s", sp->name); - if (tframe >= 1) - { - if (tframe == (nframes - 1)) - { - invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 1); - } - else - { - invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); - } - } - else - { - create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, hydro_interval); - if (tframe == (nframes - 1)) - { - invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 1); - } - else - { - invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); - } - } - tframe_map[sp->id]++; + // TODO: why doesnt this just use the cstr? 
+ char speciesname_new[128]; + sprintf(speciesname_new, "hydro_%s", sp->name); + if (tframe >= 1) + { + if (tframe == (nframes - 1)) + { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 1); + } + else + { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); + } + } + else + { + create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, hydro_interval); + if (tframe == (nframes - 1)) + { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 1); + } + else + { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); } } + tframe_map[sp->id]++; + } +} }; #endif @@ -1141,24 +1469,24 @@ class OpenPMDDump : public Dump_Strategy { std::string file_type = ".bp"; void dump_fields( - const char *fbase, - int step, - grid_t* grid, - field_array_t* field_array, - int ftag - ) + const char *fbase, + int step, + grid_t* grid, + field_array_t* field_array, + int ftag + ) { std::cout << "Writing openPMD data" << std::endl; std::string full_file_name = fbase + file_type; //if (series == nullptr) { - std::cout << "init series" << std::endl; - openPMD::Series series = openPMD::Series( - full_file_name, - openPMD::AccessType::CREATE, - MPI_COMM_WORLD - ); + std::cout << "init series" << std::endl; + openPMD::Series series = openPMD::Series( + full_file_name, + openPMD::AccessType::CREATE, + MPI_COMM_WORLD + ); //} std::cout << "Writing iteration " << step << std::endl; @@ -1422,24 +1750,24 @@ class OpenPMDDump : public Dump_Strategy { } void dump_particles( - const char *fbase, - species_t* sp, - grid_t* grid, - int step, - interpolator_array_t* interpolator_array, - int ftag - ) + const char *fbase, + species_t* sp, + grid_t* grid, + int step, + interpolator_array_t* interpolator_array, + int ftag + ) { std::string full_file_name = fbase + file_type; 
std::cout << "writing particles to " << full_file_name << std::endl; //if (series == nullptr) { - openPMD::Series series = openPMD::Series( - full_file_name, - openPMD::AccessType::CREATE, - MPI_COMM_WORLD - ); + openPMD::Series series = openPMD::Series( + full_file_name, + openPMD::AccessType::CREATE, + MPI_COMM_WORLD + ); //} auto i = series.iterations[ step ]; @@ -1568,25 +1896,25 @@ class OpenPMDDump : public Dump_Strategy { } } void dump_hydro( - const char *fbase, - int step, - hydro_array_t* hydro_array, - species_t* sp, - interpolator_array_t* interpolator_array, - grid_t* grid, - int ftag - ) + const char *fbase, + int step, + hydro_array_t* hydro_array, + species_t* sp, + interpolator_array_t* interpolator_array, + grid_t* grid, + int ftag + ) { std::string full_file_name = fbase + file_type; std::cout << "OpenPMD dumping hydro to " << full_file_name << std::endl; //if (series == nullptr) { - openPMD::Series series = openPMD::Series( - full_file_name, - openPMD::AccessType::CREATE, - MPI_COMM_WORLD - ); + openPMD::Series series = openPMD::Series( + full_file_name, + openPMD::AccessType::CREATE, + MPI_COMM_WORLD + ); //} auto i = series.iterations[ step ]; From 657500e86ee36849666d5c57c37bea543889d8a1 Mon Sep 17 00:00:00 2001 From: Robert Bird Date: Mon, 23 Nov 2020 10:33:57 -0700 Subject: [PATCH 94/95] apply clang format to dump_strategy.h --- src/vpic/dump_strategy.h | 3239 +++++++++++++++++++------------------- 1 file changed, 1583 insertions(+), 1656 deletions(-) diff --git a/src/vpic/dump_strategy.h b/src/vpic/dump_strategy.h index c6fb7613..85f70b1e 100644 --- a/src/vpic/dump_strategy.h +++ b/src/vpic/dump_strategy.h @@ -1,9 +1,9 @@ #ifndef Dump_Strategy_h #define Dump_Strategy_h +#include #include #include -#include //#define DUMP_INFO_DEBUG 1 //#define H5_ASYNC 1 @@ -12,35 +12,31 @@ #endif //#define CHUNK_FLAG 1 - //#define METADATA_COLL_WRITE 1 //#define TRUE 1 - #define HAS_FIELD_COMP 1 #define HAS_PARTICLE_COMP 1 #define HAS_HYDRO_COMP 1 
//#define HAS_INDEPENDENT_IO 1 -#include // TODO: it would be good if this didn't have to know about MPI #include - +#include // TODO: it would be good if this didn't have to know about MPI // TODO: should I drop the ./src here? -#include "../util/io/FileIO.h" -#include "../util/util_base.h" -#include "../util/io/FileUtils.h" #include "../field_advance/field_advance.h" #include "../sf_interface/sf_interface.h" #include "../species_advance/species_advance.h" +#include "../util/io/FileIO.h" +#include "../util/io/FileUtils.h" +#include "../util/util_base.h" #include "dump.h" #include "dumpmacros.h" - #ifdef VPIC_ENABLE_HDF5 -#include "hdf5.h" // from the lib +#include "hdf5.h" // from the lib #include "hdf5_header_info.h" // from vpic #endif @@ -48,13 +44,12 @@ #include #endif - //#define N_FILE_N_PROCESS 1 //#define TEST_MPIIO 1 // TODO: delete this -#define _LOG_PREFIX \ - __FILE__ "(" EXPAND_AND_STRINGIFY(__LINE__) ")[" << rank << "]: " +#define _LOG_PREFIX \ + __FILE__ "(" EXPAND_AND_STRINGIFY(__LINE__) ")[" << rank << "]: " /* #define io_log(x) do { \ @@ -65,686 +60,623 @@ std::cerr.flush(); \ } while(0) */ - - // Runtime inheritance is obviously not very "VPIC like", as we will [probably] // incur a penalty for the vtable lookup, but given we're about to do IO this // is very negligible. 
class Dump_Strategy { - public: - int rank, nproc, num_step; - - Dump_Strategy(int _rank, int _nproc ) : - rank(_rank), - nproc(_nproc) - { } // empty - - virtual ~Dump_Strategy() { }; - - virtual void dump_fields( - const char *fbase, - int step, - grid_t* grid, - field_array_t* field_array, - int ftag - ) = 0; - virtual void dump_hydro( - const char *fbase, - int step, - hydro_array_t* hydro_array, - species_t* sp, - interpolator_array_t* interpolator_array, - grid_t* grid, - int ftag - ) = 0; - virtual void dump_particles( - const char *fbase, - species_t* sp, - grid_t* grid, - int step, - interpolator_array_t* interpolator_array, - int ftag - ) = 0; +public: + int rank, nproc, num_step; + + Dump_Strategy(int _rank, int _nproc) : rank(_rank), nproc(_nproc) {} // empty + + virtual ~Dump_Strategy(){}; + + virtual void dump_fields(const char *fbase, int step, grid_t *grid, + field_array_t *field_array, int ftag) = 0; + virtual void dump_hydro(const char *fbase, int step, + hydro_array_t *hydro_array, species_t *sp, + interpolator_array_t *interpolator_array, + grid_t *grid, int ftag) = 0; + virtual void dump_particles(const char *fbase, species_t *sp, grid_t *grid, + int step, + interpolator_array_t *interpolator_array, + int ftag) = 0; }; class BinaryDump : public Dump_Strategy { - public: - using Dump_Strategy::Dump_Strategy; // inherit constructor - //BinaryDump(int _rank, int _nproc ) : Dump_Strategy(_rank, _nproc ){ } // empty - - // TODO: now we pass rank and step, ftag has odd semanticds - void dump_fields( - const char *fbase, - int step, - grid_t* grid, - field_array_t* field_array, - int ftag - ); - void dump_hydro( - const char *fbase, - int step, - hydro_array_t* hydro_array, - species_t* sp, - interpolator_array_t* interpolator_array, - grid_t* grid, - int ftag - ); - void dump_particles( - const char *fbase, - species_t* sp, - grid_t* grid, - int step, - interpolator_array_t* interpolator_array, - int ftag - ); +public: + using 
Dump_Strategy::Dump_Strategy; // inherit constructor + // BinaryDump(int _rank, int _nproc ) : Dump_Strategy(_rank, _nproc ){ } // + // empty + + // TODO: now we pass rank and step, ftag has odd semanticds + void dump_fields(const char *fbase, int step, grid_t *grid, + field_array_t *field_array, int ftag); + void dump_hydro(const char *fbase, int step, hydro_array_t *hydro_array, + species_t *sp, interpolator_array_t *interpolator_array, + grid_t *grid, int ftag); + void dump_particles(const char *fbase, species_t *sp, grid_t *grid, int step, + interpolator_array_t *interpolator_array, int ftag); }; #ifdef VPIC_ENABLE_HDF5 -struct field_dump_flag_t -{ - bool ex = true, ey = true, ez = true, div_e_err = true; - bool cbx = true, cby = true, cbz = true, div_b_err = true; - bool tcax = true, tcay = true, tcaz = true, rhob = true; - bool jfx = true, jfy = true, jfz = true, rhof = true; - bool ematx = true, ematy = true, ematz = true, nmat = true; - bool fmatx = true, fmaty = true, fmatz = true, cmat = true; - void disableE() - { - ex = false, ey = false, ez = false, div_e_err = false; - } +struct field_dump_flag_t { + bool ex = true, ey = true, ez = true, div_e_err = true; + bool cbx = true, cby = true, cbz = true, div_b_err = true; + bool tcax = true, tcay = true, tcaz = true, rhob = true; + bool jfx = true, jfy = true, jfz = true, rhof = true; + bool ematx = true, ematy = true, ematz = true, nmat = true; + bool fmatx = true, fmaty = true, fmatz = true, cmat = true; + void disableE() { ex = false, ey = false, ez = false, div_e_err = false; } - void disableCB() - { - cbx = false, cby = false, cbz = false, div_b_err = false; - } + void disableCB() { cbx = false, cby = false, cbz = false, div_b_err = false; } - void disableTCA() - { - tcax = false, tcay = false, tcaz = false, rhob = false; - } + void disableTCA() { tcax = false, tcay = false, tcaz = false, rhob = false; } - void disableJF() - { - jfx = false, jfy = false, jfz = false, rhof = false; - } + void disableJF() 
{ jfx = false, jfy = false, jfz = false, rhof = false; } - void disableEMAT() - { - ematx = false, ematy = false, ematz = false, nmat = false; - } + void disableEMAT() { + ematx = false, ematy = false, ematz = false, nmat = false; + } - void disableFMAT() - { - fmatx = false, fmaty = false, fmatz = false, cmat = false; - } + void disableFMAT() { + fmatx = false, fmaty = false, fmatz = false, cmat = false; + } - void resetToDefaults() - { - ex = true, ey = true, ez = true, div_e_err = true; - cbx = true, cby = true, cbz = true, div_b_err = true; - tcax = true, tcay = true, tcaz = true, rhob = true; - jfx = true, jfy = true, jfz = true, rhof = true; - ematx = true, ematy = true, ematz = true, nmat = true; - fmatx = true, fmaty = true, fmatz = true, cmat = true; - } + void resetToDefaults() { + ex = true, ey = true, ez = true, div_e_err = true; + cbx = true, cby = true, cbz = true, div_b_err = true; + tcax = true, tcay = true, tcaz = true, rhob = true; + jfx = true, jfy = true, jfz = true, rhof = true; + ematx = true, ematy = true, ematz = true, nmat = true; + fmatx = true, fmaty = true, fmatz = true, cmat = true; + } - bool enabledE() - { - return ex && ey && ez; - } + bool enabledE() { return ex && ey && ez; } - bool enabledCB() - { - return cbx && cby && cbz; - } + bool enabledCB() { return cbx && cby && cbz; } - bool enabledTCA() - { - return tcax && tcay && tcaz; - } + bool enabledTCA() { return tcax && tcay && tcaz; } - bool enabledJF() - { - return jfx && jfy && jfz; - } + bool enabledJF() { return jfx && jfy && jfz; } - bool enabledEMAT() - { - return ematx && ematy && ematz; - } + bool enabledEMAT() { return ematx && ematy && ematz; } - bool enabledFMAT() - { - return fmatx && fmaty && fmatz; - } + bool enabledFMAT() { return fmatx && fmaty && fmatz; } }; -struct hydro_dump_flag_t -{ - bool jx = true, jy = true, jz = true, rho = true; - bool px = true, py = true, pz = true, ke = true; - bool txx = true, tyy = true, tzz = true; - bool tyz = true, tzx = true, 
txy = true; +struct hydro_dump_flag_t { + bool jx = true, jy = true, jz = true, rho = true; + bool px = true, py = true, pz = true, ke = true; + bool txx = true, tyy = true, tzz = true; + bool tyz = true, tzx = true, txy = true; - void disableJ() - { - jx = false, jy = false, jz = false, rho = false; - } + void disableJ() { jx = false, jy = false, jz = false, rho = false; } - void disableP() - { - px = false, py = false, pz = false, ke = false; - } + void disableP() { px = false, py = false, pz = false, ke = false; } - void disableTD() //Stress diagonal - { - txx = false, tyy = false, tzz = false; - } + void disableTD() // Stress diagonal + { + txx = false, tyy = false, tzz = false; + } - void disableTOD() //Stress off-diagonal - { - tyz = false, tzx = false, txy = false; - } - void resetToDefaults() - { - jx = true, jy = true, jz = true, rho = true; - px = true, py = true, pz = true, ke = true; - txx = true, tyy = true, tzz = true; - tyz = true, tzx = true, txy = true; - } + void disableTOD() // Stress off-diagonal + { + tyz = false, tzx = false, txy = false; + } + void resetToDefaults() { + jx = true, jy = true, jz = true, rho = true; + px = true, py = true, pz = true, ke = true; + txx = true, tyy = true, tzz = true; + tyz = true, tzx = true, txy = true; + } - bool enabledJ() - { - return jx && jy && jz; - } + bool enabledJ() { return jx && jy && jz; } - bool enabledP() - { - return px && py && pz; - } + bool enabledP() { return px && py && pz; } - bool enabledTD() - { - return txx && tyy && tzz; - } + bool enabledTD() { return txx && tyy && tzz; } - bool enabledTOD() - { - return tyz && tzx && txy; - } + bool enabledTOD() { return tyz && tzx && txy; } }; class HDF5Dump : public Dump_Strategy { - std::unordered_map tframe_map; - public: - using Dump_Strategy::Dump_Strategy; // inherit constructor + std::unordered_map tframe_map; - // TODO: replace these with a common dump interface - // Declare vars to use - hydro_dump_flag_t hydro_dump_flag; - field_dump_flag_t 
field_dump_flag; +public: + using Dump_Strategy::Dump_Strategy; // inherit constructor + + // TODO: replace these with a common dump interface + // Declare vars to use + hydro_dump_flag_t hydro_dump_flag; + field_dump_flag_t field_dump_flag; #define DUMP_DIR_FORMAT "./%s" - // TODO: naming a macro so close to existing functions AND data is not a good - // define to do C-style indexing -#define _hydro(x, y, z) hydro_array->h[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] - - - /** - * @brief Dump field data to the HDf5 file - * Author: Bin Dong dbin@lbl.gov - * https://crd.lbl.gov/bin-dong - * Nov 2020 - * @param fbase - * @param step - * @param grid - * @param field_array - * @param ftag - */ - void dump_fields( - const char *fbase, - int step, - grid_t* grid, - field_array_t* field_array, - int ftag - ) - { - size_t step_for_viou = step; - - int mpi_size, mpi_rank; - MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + // TODO: naming a macro so close to existing functions AND data is not a good + // define to do C-style indexing +#define _hydro(x, y, z) \ + hydro_array->h[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] + + /** + * @brief Dump field data to the HDf5 file + * Author: Bin Dong dbin@lbl.gov + * https://crd.lbl.gov/bin-dong + * Nov 2020 + * @param fbase + * @param step + * @param grid + * @param field_array + * @param ftag + */ + void dump_fields(const char *fbase, int step, grid_t *grid, + field_array_t *field_array, int ftag) { + size_t step_for_viou = step; + int mpi_size, mpi_rank; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); #ifdef DUMP_INFO_DEBUG - printf("MPI rank = %d, size = %d \n", mpi_rank, mpi_size); - //printf("base dir for field: %s \n", fdParams.baseDir); - //printf("stride x y z = (%ld, %ld, %ld)\n", fdParams.stride_x, fdParams.stride_y, fdParams.stride_z); - printf("grid x, y z = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); - printf("domain loc (x0, y0, z0) 
-> (x1, y1, z1) = (%f, %f, %f) -> (%f, %f, %f) \n", grid->x0, grid->y0, grid->z0, grid->x1, grid->y1, grid->z1); - //printf("global->topology_x, y, z = %f, %f, %f \n ", global->topology_x, global->topology_y, global->topology_z); - printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, grid->sz, grid->nv); + printf("MPI rank = %d, size = %d \n", mpi_rank, mpi_size); + // printf("base dir for field: %s \n", fdParams.baseDir); + // printf("stride x y z = (%ld, %ld, %ld)\n", fdParams.stride_x, + // fdParams.stride_y, fdParams.stride_z); + printf("grid x, y z = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); + printf("domain loc (x0, y0, z0) -> (x1, y1, z1) = (%f, %f, %f) -> (%f, %f, " + "%f) \n", + grid->x0, grid->y0, grid->z0, grid->x1, grid->y1, grid->z1); + // printf("global->topology_x, y, z = %f, %f, %f \n ", global->topology_x, + // global->topology_y, global->topology_z); + printf("grid -> sx, sy, sz = (%d, %d, %d), nv=%d \n", grid->sx, grid->sy, + grid->sz, grid->nv); #endif + char fname[256]; + char field_scratch[128]; + char subfield_scratch[128]; + sprintf(field_scratch, DUMP_DIR_FORMAT, "field_hdf5"); + FileUtils::makeDirectory(field_scratch); + sprintf(subfield_scratch, "%s/T.%zu/", field_scratch, step_for_viou); + FileUtils::makeDirectory(subfield_scratch); - char fname[256]; - char field_scratch[128]; - char subfield_scratch[128]; - - sprintf(field_scratch, DUMP_DIR_FORMAT, "field_hdf5"); - FileUtils::makeDirectory(field_scratch); - sprintf(subfield_scratch, "%s/T.%zu/", field_scratch, step_for_viou); - FileUtils::makeDirectory(subfield_scratch); - - sprintf(fname, "%s/%s_%zu.h5", subfield_scratch, "fields", step_for_viou); - double el1 = uptime(); - - // int file_exist(const char *filename) - //{ - // struct stat buffer; - // return (stat(filename, &buffer) == 0); - //} - - //https://support.hdfgroup.org/ftp/HDF5/current/src/unpacked/examples/h5_compound.c -#ifdef HAS_FIELD_COMP - if(!mpi_rank) - printf("Using Field Compund type 
!\n"); - hid_t field_comp_type_it = H5Tcreate (H5T_COMPOUND, sizeof(field_t)); - H5Tinsert(field_comp_type_it, "ex", HOFFSET(field_t, ex), H5T_NATIVE_FLOAT); - H5Tinsert(field_comp_type_it, "ey", HOFFSET(field_t, ey), H5T_NATIVE_FLOAT); - H5Tinsert(field_comp_type_it, "ez", HOFFSET(field_t, ez), H5T_NATIVE_FLOAT); - H5Tinsert(field_comp_type_it, "div_e_err", HOFFSET(field_t, div_e_err), H5T_NATIVE_FLOAT); - - H5Tinsert(field_comp_type_it, "cbx", HOFFSET(field_t, cbx), H5T_NATIVE_FLOAT); - H5Tinsert(field_comp_type_it, "cby", HOFFSET(field_t, cby), H5T_NATIVE_FLOAT); - H5Tinsert(field_comp_type_it, "cbz", HOFFSET(field_t, cbz), H5T_NATIVE_FLOAT); - H5Tinsert(field_comp_type_it, "div_b_err", HOFFSET(field_t, div_b_err), H5T_NATIVE_FLOAT); - - H5Tinsert(field_comp_type_it, "tcax", HOFFSET(field_t, tcax), H5T_NATIVE_FLOAT); - H5Tinsert(field_comp_type_it, "tcay", HOFFSET(field_t, tcay), H5T_NATIVE_FLOAT); - H5Tinsert(field_comp_type_it, "tcaz", HOFFSET(field_t, tcaz), H5T_NATIVE_FLOAT); - H5Tinsert(field_comp_type_it, "rhob", HOFFSET(field_t, rhob), H5T_NATIVE_FLOAT); - - H5Tinsert(field_comp_type_it, "jfx", HOFFSET(field_t, jfx), H5T_NATIVE_FLOAT); - H5Tinsert(field_comp_type_it, "jfy", HOFFSET(field_t, jfy), H5T_NATIVE_FLOAT); - H5Tinsert(field_comp_type_it, "jfz", HOFFSET(field_t, jfz), H5T_NATIVE_FLOAT); - H5Tinsert(field_comp_type_it, "rhof", HOFFSET(field_t, rhof), H5T_NATIVE_FLOAT); - - H5Tinsert(field_comp_type_it, "ematx", HOFFSET(field_t, ematx), H5T_NATIVE_SHORT); - H5Tinsert(field_comp_type_it, "ematy", HOFFSET(field_t, ematy), H5T_NATIVE_SHORT); - H5Tinsert(field_comp_type_it, "ematz", HOFFSET(field_t, ematz), H5T_NATIVE_SHORT); - H5Tinsert(field_comp_type_it, "nmat", HOFFSET(field_t, nmat), H5T_NATIVE_SHORT); - - H5Tinsert(field_comp_type_it, "fmatx", HOFFSET(field_t, fmatx), H5T_NATIVE_SHORT); - H5Tinsert(field_comp_type_it, "fmaty", HOFFSET(field_t, fmaty), H5T_NATIVE_SHORT); - H5Tinsert(field_comp_type_it, "fmatz", HOFFSET(field_t, fmatz), 
H5T_NATIVE_SHORT); - H5Tinsert(field_comp_type_it, "cmat", HOFFSET(field_t, cmat), H5T_NATIVE_SHORT); -#endif + sprintf(fname, "%s/%s_%zu.h5", subfield_scratch, "fields", step_for_viou); + double el1 = uptime(); + // int file_exist(const char *filename) + //{ + // struct stat buffer; + // return (stat(filename, &buffer) == 0); + //} - //struct stat buffer; - //if((stat(fname, &buffer) == 0)){ - // file_exist_flag = 1; - // if(!mpi_rank) - // printf("Write original files /w HDF5! \n"); - // } - // file_exist_flag = 0; + // https://support.hdfgroup.org/ftp/HDF5/current/src/unpacked/examples/h5_compound.c +#ifdef HAS_FIELD_COMP + if (!mpi_rank) + printf("Using Field Compund type !\n"); + hid_t field_comp_type_it = H5Tcreate(H5T_COMPOUND, sizeof(field_t)); + H5Tinsert(field_comp_type_it, "ex", HOFFSET(field_t, ex), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "ey", HOFFSET(field_t, ey), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "ez", HOFFSET(field_t, ez), H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "div_e_err", HOFFSET(field_t, div_e_err), + H5T_NATIVE_FLOAT); + + H5Tinsert(field_comp_type_it, "cbx", HOFFSET(field_t, cbx), + H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "cby", HOFFSET(field_t, cby), + H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "cbz", HOFFSET(field_t, cbz), + H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "div_b_err", HOFFSET(field_t, div_b_err), + H5T_NATIVE_FLOAT); + + H5Tinsert(field_comp_type_it, "tcax", HOFFSET(field_t, tcax), + H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "tcay", HOFFSET(field_t, tcay), + H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "tcaz", HOFFSET(field_t, tcaz), + H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "rhob", HOFFSET(field_t, rhob), + H5T_NATIVE_FLOAT); + + H5Tinsert(field_comp_type_it, "jfx", HOFFSET(field_t, jfx), + H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "jfy", HOFFSET(field_t, jfy), + H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "jfz", 
HOFFSET(field_t, jfz), + H5T_NATIVE_FLOAT); + H5Tinsert(field_comp_type_it, "rhof", HOFFSET(field_t, rhof), + H5T_NATIVE_FLOAT); + + H5Tinsert(field_comp_type_it, "ematx", HOFFSET(field_t, ematx), + H5T_NATIVE_SHORT); + H5Tinsert(field_comp_type_it, "ematy", HOFFSET(field_t, ematy), + H5T_NATIVE_SHORT); + H5Tinsert(field_comp_type_it, "ematz", HOFFSET(field_t, ematz), + H5T_NATIVE_SHORT); + H5Tinsert(field_comp_type_it, "nmat", HOFFSET(field_t, nmat), + H5T_NATIVE_SHORT); + + H5Tinsert(field_comp_type_it, "fmatx", HOFFSET(field_t, fmatx), + H5T_NATIVE_SHORT); + H5Tinsert(field_comp_type_it, "fmaty", HOFFSET(field_t, fmaty), + H5T_NATIVE_SHORT); + H5Tinsert(field_comp_type_it, "fmatz", HOFFSET(field_t, fmatz), + H5T_NATIVE_SHORT); + H5Tinsert(field_comp_type_it, "cmat", HOFFSET(field_t, cmat), + H5T_NATIVE_SHORT); +#endif - hid_t plist_id; - hid_t file_id; - plist_id = H5Pcreate(H5P_FILE_ACCESS); - H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); - //H5Pset_alignment(plist_id, 4194304, 4194304); - /*if(H5Pset_libver_bounds(plist_id, H5F_LIBVER_LATEST, H5F_LIBVER_LATEST) < 0){ - exit(-1); - }*/ + // struct stat buffer; + // if((stat(fname, &buffer) == 0)){ + // file_exist_flag = 1; + // if(!mpi_rank) + // printf("Write original files /w HDF5! 
\n"); + // } + // file_exist_flag = 0; + + hid_t plist_id; + hid_t file_id; + plist_id = H5Pcreate(H5P_FILE_ACCESS); + H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); + // H5Pset_alignment(plist_id, 4194304, 4194304); + /*if(H5Pset_libver_bounds(plist_id, H5F_LIBVER_LATEST, H5F_LIBVER_LATEST) < + 0){ exit(-1); + }*/ #ifdef METADATA_COLL_WRITE - if(!mpi_rank) printf("Enable collective metadata write !\n"); - H5Pset_coll_metadata_write(plist_id, TRUE); + if (!mpi_rank) + printf("Enable collective metadata write !\n"); + H5Pset_coll_metadata_write(plist_id, TRUE); #endif - file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); - H5Pclose(plist_id); - - + file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); + H5Pclose(plist_id); - sprintf(fname, "Timestep_%zu", step_for_viou); - hid_t group_id; - group_id = H5Gcreate(file_id, fname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + sprintf(fname, "Timestep_%zu", step_for_viou); + hid_t group_id; + group_id = H5Gcreate(file_id, fname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - el1 = uptime() - el1; - //io_log("TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts - double el2 = uptime(); + el1 = uptime() - el1; + // io_log("TimeHDF5Open): " << el1 << " s"); //Easy to handle results for + // scripts + double el2 = uptime(); - /* - // Create a variable list of field values to output. - size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); - size_t * varlist = new size_t[numvars]; + /* + // Create a variable list of field values to output. 
+ size_t numvars = std::min(global->fdParams.output_vars.bitsum(), + total_field_variables); size_t * varlist = new size_t[numvars]; - for(size_t i(0), c(0); ifdParams.output_vars.bitset(i)) varlist[c++] = i; + for(size_t i(0), c(0); ifdParams.output_vars.bitset(i)) varlist[c++] = i; - printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ + printf("\nBEGIN_OUTPUT: numvars = %zd \n", numvars);*/ #define fpp(x, y, z) f[VOXEL(x, y, z, grid->nx, grid->ny, grid->nz)] - /* - typedef struct field { - float ex, ey, ez, div_e_err; // Electric field and div E error - float cbx, cby, cbz, div_b_err; // Magnetic field and div B error - float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density - float jfx, jfy, jfz, rhof; // Free current and charge density - material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes - material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers - } field_t;*/ - // Local voxel mesh resolution. Voxels are - // indexed FORTRAN style 0:nx+1,0:ny+1,0:nz+1 - // with voxels 1:nx,1:ny,1:nz being non-ghost - // voxels. - - float *temp_buf = (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); - hsize_t temp_buf_index; - hid_t dset_id; - //char *field_var_name[] = {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; - //Comment out for test only - - plist_id = H5Pcreate(H5P_DATASET_XFER); + /* + typedef struct field { + float ex, ey, ez, div_e_err; // Electric field and div E error + float cbx, cby, cbz, div_b_err; // Magnetic field and div B error + float tcax, tcay, tcaz, rhob; // TCA fields and bound charge + density float jfx, jfy, jfz, rhof; // Free current and charge + density material_id ematx, ematy, ematz, nmat; // Material at edge + centers and nodes material_id fmatx, fmaty, fmatz, cmat; // Material at + face and cell centers } field_t;*/ + // Local voxel mesh resolution. 
Voxels are + // indexed FORTRAN style 0:nx+1,0:ny+1,0:nz+1 + // with voxels 1:nx,1:ny,1:nz being non-ghost + // voxels. + + float *temp_buf = + (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); + hsize_t temp_buf_index; + hid_t dset_id; + // char *field_var_name[] = + // {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; + // Comment out for test only + + plist_id = H5Pcreate(H5P_DATASET_XFER); #ifdef HAS_INDEPENDENT_IO - if(!mpi_rank) printf("\n ###\n VPIC Independent I/O! \n ###\n"); - H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_INDEPENDENT); + if (!mpi_rank) + printf("\n ###\n VPIC Independent I/O! \n ###\n"); + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_INDEPENDENT); #else - H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); #endif + // H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, + // (hsize_t *) &numparticles, NULL); + // global->topology_x + hsize_t field_global_size[3], field_local_size[3], global_offset[3], + global_count[3]; + field_global_size[0] = (grid->nx * grid->gpx); + field_global_size[1] = (grid->ny * grid->gpy); + field_global_size[2] = (grid->nz * grid->gpz); - //H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, (hsize_t *) &numparticles, NULL); - - //global->topology_x - - hsize_t field_global_size[3], field_local_size[3], global_offset[3], global_count[3]; - field_global_size[0] = (grid->nx * grid->gpx); - field_global_size[1] = (grid->ny * grid->gpy); - field_global_size[2] = (grid->nz * grid->gpz); - - field_local_size[0] = grid->nx; - field_local_size[1] = grid->ny; - field_local_size[2] = grid->nz; + field_local_size[0] = grid->nx; + field_local_size[1] = grid->ny; + field_local_size[2] = grid->nz; - int gpx = grid->gpx; - int gpy = grid->gpy; - int gpz = grid->gpz; + int gpx = grid->gpx; + int gpy = grid->gpy; + int gpz = grid->gpz; - // Convert rank to local 
decomposition - int rx, ry, rz; - UNVOXEL(mpi_rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + // Convert rank to local decomposition + int rx, ry, rz; + UNVOXEL(mpi_rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); - int mpi_rank_x, mpi_rank_y, mpi_rank_z; - mpi_rank_x = rx; - mpi_rank_y = ry; - mpi_rank_z = rz; + int mpi_rank_x, mpi_rank_y, mpi_rank_z; + mpi_rank_x = rx; + mpi_rank_y = ry; + mpi_rank_z = rz; - global_offset[0] = (grid->nx) * mpi_rank_x; - global_offset[1] = (grid->ny) * mpi_rank_y; - global_offset[2] = (grid->nz) * mpi_rank_z; + global_offset[0] = (grid->nx) * mpi_rank_x; + global_offset[1] = (grid->ny) * mpi_rank_y; + global_offset[2] = (grid->nz) * mpi_rank_z; - global_count[0] = (grid->nx); - global_count[1] = (grid->ny); - global_count[2] = (grid->nz); + global_count[0] = (grid->nx); + global_count[1] = (grid->ny); + global_count[2] = (grid->nz); #ifdef DUMP_INFO_DEBUG - if(mpi_rank < 4){ - printf("grid nx, ny nz = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); - printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); - printf("global size = %llu %llu %llu \n", field_global_size[0], field_global_size[1], field_global_size[2]); - printf("global_offset = %llu %llu %llu \n", global_offset[0], global_offset[1], global_offset[2]); - printf("global_count = %llu %llu %llu \n", global_count[0], global_count[1], global_count[2]); - fflush(stdout); - } + if (mpi_rank < 4) { + printf("grid nx, ny nz = (%d, %d, %d) \n", grid->nx, grid->ny, grid->nz); + printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, + mpi_rank_x, mpi_rank_y, mpi_rank_z); + printf("global size = %llu %llu %llu \n", field_global_size[0], + field_global_size[1], field_global_size[2]); + printf("global_offset = %llu %llu %llu \n", global_offset[0], + global_offset[1], global_offset[2]); + printf("global_count = %llu %llu %llu \n", global_count[0], + global_count[1], global_count[2]); + fflush(stdout); + } #endif - hid_t 
filespace; //= H5Screate_simple(3, field_global_size, NULL); - hid_t memspace; // = H5Screate_simple(3, field_local_size, NULL); - //if(!file_exist_flag){ - filespace = H5Screate_simple(3, field_global_size, NULL); - //} - memspace = H5Screate_simple(3, field_local_size, NULL); + hid_t filespace; //= H5Screate_simple(3, field_global_size, NULL); + hid_t memspace; // = H5Screate_simple(3, field_local_size, NULL); + // if(!file_exist_flag){ + filespace = H5Screate_simple(3, field_global_size, NULL); + //} + memspace = H5Screate_simple(3, field_local_size, NULL); - hsize_t chunk_dims[3]; - chunk_dims[0] = 288; //grid->nx; //8 x 8 x 8 - chunk_dims[1] = 24; //grid->ny; // - chunk_dims[2] = 24; //grid->nz; + hsize_t chunk_dims[3]; + chunk_dims[0] = 288; // grid->nx; //8 x 8 x 8 + chunk_dims[1] = 24; // grid->ny; // + chunk_dims[2] = 24; // grid->nz; - - - hid_t dataspace_id; - hid_t dcpl_id = H5Pcreate(H5P_DATASET_CREATE); + hid_t dataspace_id; + hid_t dcpl_id = H5Pcreate(H5P_DATASET_CREATE); #ifdef CHUNK_FLAG - H5Pset_chunk(dcpl_id, 3, chunk_dims); - if(!mpi_rank) printf("Enable chunking !\n"); + H5Pset_chunk(dcpl_id, 3, chunk_dims); + if (!mpi_rank) + printf("Enable chunking !\n"); #endif -#define DUMP_FIELD_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ - { \ - dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, H5P_DEFAULT, dcpl_id, H5P_DEFAULT); \ - temp_buf_index = 0; \ - for (size_t i(1); i < grid->nx + 1; i++) \ - { \ - for (size_t j(1); j < grid->ny + 1; j++) \ - { \ - for (size_t k(1); k < grid->nz + 1; k++) \ - { \ - temp_buf[temp_buf_index] = FIELD_ARRAY_NAME->fpp(i, j, k).ATTRIBUTE_NAME; \ - temp_buf_index = temp_buf_index + 1; \ - } \ - } \ - } \ - dataspace_id = H5Dget_space(dset_id); \ - H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); \ - H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, temp_buf); \ - H5Sclose(dataspace_id); \ - H5Dclose(dset_id); \ - } - /* - typedef struct 
field { - float ex, ey, ez, div_e_err; // Electric field and div E error - float cbx, cby, cbz, div_b_err; // Magnetic field and div B error - float tcax, tcay, tcaz, rhob; // TCA fields and bound charge density - float jfx, jfy, jfz, rhof; // Free current and charge density - material_id ematx, ematy, ematz, nmat; // Material at edge centers and nodes - material_id fmatx, fmaty, fmatz, cmat; // Material at face and cell centers - } field_t;*/ - +#define DUMP_FIELD_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ + { \ + dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, \ + H5P_DEFAULT, dcpl_id, H5P_DEFAULT); \ + temp_buf_index = 0; \ + for (size_t i(1); i < grid->nx + 1; i++) { \ + for (size_t j(1); j < grid->ny + 1; j++) { \ + for (size_t k(1); k < grid->nz + 1; k++) { \ + temp_buf[temp_buf_index] = \ + FIELD_ARRAY_NAME->fpp(i, j, k).ATTRIBUTE_NAME; \ + temp_buf_index = temp_buf_index + 1; \ + } \ + } \ + } \ + dataspace_id = H5Dget_space(dset_id); \ + H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, \ + global_count, NULL); \ + H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, \ + temp_buf); \ + H5Sclose(dataspace_id); \ + H5Dclose(dset_id); \ + } + /* + typedef struct field { + float ex, ey, ez, div_e_err; // Electric field and div E + error float cbx, cby, cbz, div_b_err; // Magnetic field and div + B error float tcax, tcay, tcaz, rhob; // TCA fields and bound + charge density float jfx, jfy, jfz, rhof; // Free current + and charge density material_id ematx, ematy, ematz, nmat; // Material + at edge centers and nodes material_id fmatx, fmaty, fmatz, cmat; // + Material at face and cell centers } field_t;*/ #ifdef HAS_FIELD_COMP - field_t * field_buf; - temp_buf_index = 0; - int global_index; - field_buf = (field_t *)malloc(sizeof(field_t) * (grid->nx) * (grid->ny) * (grid->nz)); - for (size_t i(1); i < grid->nx + 1; i++){ - for (size_t j(1); j < grid->ny + 1; j++){ - for (size_t k(1); k < grid->nz + 1; 
k++){ - field_buf[temp_buf_index] = FIELD_ARRAY_NAME->fpp(i, j, k); - temp_buf_index++; - } - } + field_t *field_buf; + temp_buf_index = 0; + int global_index; + field_buf = (field_t *)malloc(sizeof(field_t) * (grid->nx) * (grid->ny) * + (grid->nz)); + for (size_t i(1); i < grid->nx + 1; i++) { + for (size_t j(1); j < grid->ny + 1; j++) { + for (size_t k(1); k < grid->nz + 1; k++) { + field_buf[temp_buf_index] = FIELD_ARRAY_NAME->fpp(i, j, k); + temp_buf_index++; } - dset_id = H5Dcreate(group_id, "field", field_comp_type_it, filespace, H5P_DEFAULT, dcpl_id, H5P_DEFAULT); - dataspace_id = H5Dget_space(dset_id); - H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); - H5Dwrite(dset_id, field_comp_type_it, memspace, dataspace_id, plist_id, field_buf); - free(field_buf); - H5Sclose(dataspace_id); - H5Dclose(dset_id); - H5Tclose(field_comp_type_it); + } + } + dset_id = H5Dcreate(group_id, "field", field_comp_type_it, filespace, + H5P_DEFAULT, dcpl_id, H5P_DEFAULT); + dataspace_id = H5Dget_space(dset_id); + H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, + global_count, NULL); + H5Dwrite(dset_id, field_comp_type_it, memspace, dataspace_id, plist_id, + field_buf); + free(field_buf); + H5Sclose(dataspace_id); + H5Dclose(dset_id); + H5Tclose(field_comp_type_it); #else - if (field_dump_flag.ex) - DUMP_FIELD_TO_HDF5("ex", ex, H5T_NATIVE_FLOAT); - if (field_dump_flag.ey) - DUMP_FIELD_TO_HDF5("ey", ey, H5T_NATIVE_FLOAT); - if (field_dump_flag.ez) - DUMP_FIELD_TO_HDF5("ez", ez, H5T_NATIVE_FLOAT); - if (field_dump_flag.div_e_err) - DUMP_FIELD_TO_HDF5("div_e_err", div_e_err, H5T_NATIVE_FLOAT); - - if (field_dump_flag.cbx) - DUMP_FIELD_TO_HDF5("cbx", cbx, H5T_NATIVE_FLOAT); - if (field_dump_flag.cby) - DUMP_FIELD_TO_HDF5("cby", cby, H5T_NATIVE_FLOAT); - if (field_dump_flag.cbz) - DUMP_FIELD_TO_HDF5("cbz", cbz, H5T_NATIVE_FLOAT); - if (field_dump_flag.div_b_err) - DUMP_FIELD_TO_HDF5("div_b_err", div_b_err, 
H5T_NATIVE_FLOAT); - - if (field_dump_flag.tcax) - DUMP_FIELD_TO_HDF5("tcax", tcax, H5T_NATIVE_FLOAT); - if (field_dump_flag.tcay) - DUMP_FIELD_TO_HDF5("tcay", tcay, H5T_NATIVE_FLOAT); - if (field_dump_flag.tcaz) - DUMP_FIELD_TO_HDF5("tcaz", tcaz, H5T_NATIVE_FLOAT); - if (field_dump_flag.rhob) - DUMP_FIELD_TO_HDF5("rhob", rhob, H5T_NATIVE_FLOAT); - - if (field_dump_flag.jfx) - DUMP_FIELD_TO_HDF5("jfx", jfx, H5T_NATIVE_FLOAT); - if (field_dump_flag.jfy) - DUMP_FIELD_TO_HDF5("jfy", jfy, H5T_NATIVE_FLOAT); - if (field_dump_flag.jfz) - DUMP_FIELD_TO_HDF5("jfz", jfz, H5T_NATIVE_FLOAT); - if (field_dump_flag.rhof) - DUMP_FIELD_TO_HDF5("rhof", rhof, H5T_NATIVE_FLOAT); - - //H5T_NATIVE_SHORT for material_id (typedef int16_t material_id) - if (field_dump_flag.ematx) - DUMP_FIELD_TO_HDF5("ematx", ematx, H5T_NATIVE_SHORT); - if (field_dump_flag.ematy) - DUMP_FIELD_TO_HDF5("ematy", ematy, H5T_NATIVE_SHORT); - if (field_dump_flag.ematz) - DUMP_FIELD_TO_HDF5("ematz", ematz, H5T_NATIVE_SHORT); - if (field_dump_flag.nmat) - DUMP_FIELD_TO_HDF5("nmat", nmat, H5T_NATIVE_SHORT); - - if (field_dump_flag.fmatx) - DUMP_FIELD_TO_HDF5("fmatx", fmatx, H5T_NATIVE_SHORT); - if (field_dump_flag.fmaty) - DUMP_FIELD_TO_HDF5("fmaty", fmaty, H5T_NATIVE_SHORT); - if (field_dump_flag.fmatz) - DUMP_FIELD_TO_HDF5("fmatz", fmatz, H5T_NATIVE_SHORT); - if (field_dump_flag.cmat) - DUMP_FIELD_TO_HDF5("cmat", cmat, H5T_NATIVE_SHORT); + if (field_dump_flag.ex) + DUMP_FIELD_TO_HDF5("ex", ex, H5T_NATIVE_FLOAT); + if (field_dump_flag.ey) + DUMP_FIELD_TO_HDF5("ey", ey, H5T_NATIVE_FLOAT); + if (field_dump_flag.ez) + DUMP_FIELD_TO_HDF5("ez", ez, H5T_NATIVE_FLOAT); + if (field_dump_flag.div_e_err) + DUMP_FIELD_TO_HDF5("div_e_err", div_e_err, H5T_NATIVE_FLOAT); + + if (field_dump_flag.cbx) + DUMP_FIELD_TO_HDF5("cbx", cbx, H5T_NATIVE_FLOAT); + if (field_dump_flag.cby) + DUMP_FIELD_TO_HDF5("cby", cby, H5T_NATIVE_FLOAT); + if (field_dump_flag.cbz) + DUMP_FIELD_TO_HDF5("cbz", cbz, H5T_NATIVE_FLOAT); + if 
(field_dump_flag.div_b_err) + DUMP_FIELD_TO_HDF5("div_b_err", div_b_err, H5T_NATIVE_FLOAT); + + if (field_dump_flag.tcax) + DUMP_FIELD_TO_HDF5("tcax", tcax, H5T_NATIVE_FLOAT); + if (field_dump_flag.tcay) + DUMP_FIELD_TO_HDF5("tcay", tcay, H5T_NATIVE_FLOAT); + if (field_dump_flag.tcaz) + DUMP_FIELD_TO_HDF5("tcaz", tcaz, H5T_NATIVE_FLOAT); + if (field_dump_flag.rhob) + DUMP_FIELD_TO_HDF5("rhob", rhob, H5T_NATIVE_FLOAT); + + if (field_dump_flag.jfx) + DUMP_FIELD_TO_HDF5("jfx", jfx, H5T_NATIVE_FLOAT); + if (field_dump_flag.jfy) + DUMP_FIELD_TO_HDF5("jfy", jfy, H5T_NATIVE_FLOAT); + if (field_dump_flag.jfz) + DUMP_FIELD_TO_HDF5("jfz", jfz, H5T_NATIVE_FLOAT); + if (field_dump_flag.rhof) + DUMP_FIELD_TO_HDF5("rhof", rhof, H5T_NATIVE_FLOAT); + + // H5T_NATIVE_SHORT for material_id (typedef int16_t material_id) + if (field_dump_flag.ematx) + DUMP_FIELD_TO_HDF5("ematx", ematx, H5T_NATIVE_SHORT); + if (field_dump_flag.ematy) + DUMP_FIELD_TO_HDF5("ematy", ematy, H5T_NATIVE_SHORT); + if (field_dump_flag.ematz) + DUMP_FIELD_TO_HDF5("ematz", ematz, H5T_NATIVE_SHORT); + if (field_dump_flag.nmat) + DUMP_FIELD_TO_HDF5("nmat", nmat, H5T_NATIVE_SHORT); + + if (field_dump_flag.fmatx) + DUMP_FIELD_TO_HDF5("fmatx", fmatx, H5T_NATIVE_SHORT); + if (field_dump_flag.fmaty) + DUMP_FIELD_TO_HDF5("fmaty", fmaty, H5T_NATIVE_SHORT); + if (field_dump_flag.fmatz) + DUMP_FIELD_TO_HDF5("fmatz", fmatz, H5T_NATIVE_SHORT); + if (field_dump_flag.cmat) + DUMP_FIELD_TO_HDF5("cmat", cmat, H5T_NATIVE_SHORT); #endif + H5D_mpio_actual_io_mode_t actual_io_mode; + H5Pget_mpio_actual_io_mode(plist_id, &actual_io_mode); + /* - H5D_mpio_actual_io_mode_t actual_io_mode; - H5Pget_mpio_actual_io_mode(plist_id, &actual_io_mode); - /* - - switch(actual_io_mode){ - case H5D_MPIO_NO_COLLECTIVE: - io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_NO_COLLECTIVE: "); - break; - case H5D_MPIO_CHUNK_INDEPENDENT: - io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_CHUNK_INDEPENDENT: "); - break; - case H5D_MPIO_CHUNK_COLLECTIVE: - 
io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_CHUNK_COLLECTIVE: "); - break; - case H5D_MPIO_CHUNK_MIXED: - io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_CHUNK_MIXED: "); - break; - case H5D_MPIO_CONTIGUOUS_COLLECTIVE: - io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_CONTIGUOUS_COLLECTIVE: "); - break; - default : - io_log("H5Pget_mpio_actual_io_mode: None returend: "); - break; - } - - H5D_mpio_actual_chunk_opt_mode_t actual_chunk_opt_mode; - H5Pget_mpio_actual_chunk_opt_mode(plist_id, &actual_chunk_opt_mode); - switch(actual_chunk_opt_mode){ - case H5D_MPIO_NO_CHUNK_OPTIMIZATION: - io_log("H5Pget_mpio_actual_chunk_opt_mode: H5D_MPIO_NO_CHUNK_OPTIMIZATION: "); - break; - case H5D_MPIO_MULTI_CHUNK: - io_log("H5Pget_mpio_actual_chunk_opt_mode: H5D_MPIO_MULTI_CHUNK: "); - break; - // case H5D_MPIO_MULTI_CHUNK_NO_OPT: - // io_log("H5Pget_mpio_actual_chunk_opt_mode: H5D_MPIO_MULTI_CHUNK_NO_OPT: "); - // break; - case H5D_MPIO_LINK_CHUNK: - io_log("H5Pget_mpio_actual_chunk_opt_mode: H5D_MPIO_LINK_CHUNK: "); - break; - default : - io_log("H5Pget_mpio_actual_chunk_opt_mode: None returend: "); - break; - } + switch(actual_io_mode){ + case H5D_MPIO_NO_COLLECTIVE: + io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_NO_COLLECTIVE: "); + break; + case H5D_MPIO_CHUNK_INDEPENDENT: + io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_CHUNK_INDEPENDENT: "); + break; + case H5D_MPIO_CHUNK_COLLECTIVE: + io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_CHUNK_COLLECTIVE: "); + break; + case H5D_MPIO_CHUNK_MIXED: + io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_CHUNK_MIXED: "); + break; + case H5D_MPIO_CONTIGUOUS_COLLECTIVE: + io_log("H5Pget_mpio_actual_io_mode: H5D_MPIO_CONTIGUOUS_COLLECTIVE: "); + break; + default : + io_log("H5Pget_mpio_actual_io_mode: None returend: "); + break; + } + + H5D_mpio_actual_chunk_opt_mode_t actual_chunk_opt_mode; + H5Pget_mpio_actual_chunk_opt_mode(plist_id, &actual_chunk_opt_mode); + switch(actual_chunk_opt_mode){ + case H5D_MPIO_NO_CHUNK_OPTIMIZATION: + 
io_log("H5Pget_mpio_actual_chunk_opt_mode: +H5D_MPIO_NO_CHUNK_OPTIMIZATION: "); break; case H5D_MPIO_MULTI_CHUNK: + io_log("H5Pget_mpio_actual_chunk_opt_mode: H5D_MPIO_MULTI_CHUNK: "); + break; + // case H5D_MPIO_MULTI_CHUNK_NO_OPT: + // io_log("H5Pget_mpio_actual_chunk_opt_mode: +H5D_MPIO_MULTI_CHUNK_NO_OPT: "); + // break; + case H5D_MPIO_LINK_CHUNK: + io_log("H5Pget_mpio_actual_chunk_opt_mode: H5D_MPIO_LINK_CHUNK: "); + break; + default : + io_log("H5Pget_mpio_actual_chunk_opt_mode: None returend: "); + break; + } + + uint32_t local_no_collective_cause, global_no_collective_cause; + H5Pget_mpio_no_collective_cause(plist_id, &local_no_collective_cause, +&global_no_collective_cause); + + switch(local_no_collective_cause){ + case H5D_MPIO_COLLECTIVE: + io_log("local_no_collective_cause: H5D_MPIO_COLLECTIVE: "); + break; + case H5D_MPIO_SET_INDEPENDENT: + io_log("local_no_collective_cause: H5D_MPIO_SET_INDEPENDENT: "); + break; + case H5D_MPIO_DATA_TRANSFORMS: + io_log("local_no_collective_cause: H5D_MPIO_DATA_TRANSFORMS: "); + break; + //case H5D_MPIO_SET_MPIPOSIX: + // io_log("local_no_collective_cause: H5D_MPIO_SET_MPIPOSIX: "); + // break; + case H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES: + io_log("local_no_collective_cause: H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES: +"); break; + //case H5D_MPIO_POINT_SELECTIONS: + // io_log("local_no_collective_cause: H5D_MPIO_POINT_SELECTIONS: "); + // break; + // case H5D_MPIO_FILTERS: + // io_log("local_no_collective_cause: H5D_MPIO_FILTERS: "); + // break; + default : + io_log("local_no_collective_cause: None returend: "); + break; +} - uint32_t local_no_collective_cause, global_no_collective_cause; - H5Pget_mpio_no_collective_cause(plist_id, &local_no_collective_cause, &global_no_collective_cause); - switch(local_no_collective_cause){ - case H5D_MPIO_COLLECTIVE: - io_log("local_no_collective_cause: H5D_MPIO_COLLECTIVE: "); +switch(global_no_collective_cause){ + case H5D_MPIO_COLLECTIVE: + io_log("global_no_collective_cause: 
H5D_MPIO_COLLECTIVE: "); break; - case H5D_MPIO_SET_INDEPENDENT: - io_log("local_no_collective_cause: H5D_MPIO_SET_INDEPENDENT: "); + case H5D_MPIO_SET_INDEPENDENT: + io_log("global_no_collective_cause: H5D_MPIO_SET_INDEPENDENT: "); break; - case H5D_MPIO_DATA_TRANSFORMS: - io_log("local_no_collective_cause: H5D_MPIO_DATA_TRANSFORMS: "); + case H5D_MPIO_DATA_TRANSFORMS: + io_log("global_no_collective_cause: H5D_MPIO_DATA_TRANSFORMS: "); break; //case H5D_MPIO_SET_MPIPOSIX: - // io_log("local_no_collective_cause: H5D_MPIO_SET_MPIPOSIX: "); + // io_log("global_no_collective_cause: H5D_MPIO_SET_MPIPOSIX: "); // break; - case H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES: - io_log("local_no_collective_cause: H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES: "); - break; + case H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES: + io_log("global_no_collective_cause: +H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES: "); break; //case H5D_MPIO_POINT_SELECTIONS: - // io_log("local_no_collective_cause: H5D_MPIO_POINT_SELECTIONS: "); + // io_log("global_no_collective_cause: H5D_MPIO_POINT_SELECTIONS: "); // break; // case H5D_MPIO_FILTERS: - // io_log("local_no_collective_cause: H5D_MPIO_FILTERS: "); - // break; - default : - io_log("local_no_collective_cause: None returend: "); + // io_log("global_no_collective_cause: H5D_MPIO_FILTERS: "); + // break; + default : + io_log("global_no_collective_cause: None returend: "); break; - } - - - switch(global_no_collective_cause){ - case H5D_MPIO_COLLECTIVE: - io_log("global_no_collective_cause: H5D_MPIO_COLLECTIVE: "); - break; - case H5D_MPIO_SET_INDEPENDENT: - io_log("global_no_collective_cause: H5D_MPIO_SET_INDEPENDENT: "); - break; - case H5D_MPIO_DATA_TRANSFORMS: - io_log("global_no_collective_cause: H5D_MPIO_DATA_TRANSFORMS: "); - break; - //case H5D_MPIO_SET_MPIPOSIX: - // io_log("global_no_collective_cause: H5D_MPIO_SET_MPIPOSIX: "); - // break; - case H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES: - io_log("global_no_collective_cause: 
H5D_MPIO_NOT_SIMPLE_OR_SCALAR_DATASPACES: "); - break; - //case H5D_MPIO_POINT_SELECTIONS: - // io_log("global_no_collective_cause: H5D_MPIO_POINT_SELECTIONS: "); - // break; - // case H5D_MPIO_FILTERS: - // io_log("global_no_collective_cause: H5D_MPIO_FILTERS: "); - // break; - default : - io_log("global_no_collective_cause: None returend: "); - break; - } - */ +} +*/ el2 = uptime() - el2; - //io_log("TimeHDF5Write: " << el2 << " s"); + // io_log("TimeHDF5Write: " << el2 << " s"); double el3 = uptime(); @@ -762,14 +694,15 @@ class HDF5Dump : public Dump_Strategy { dims[1] = 3; if(!file_exist_flag){ hid_t va_geo_dataspace_id = H5Screate_simple(2, dims, NULL); - hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); + hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", + H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); H5Awrite(va_geo_attribute_id, H5T_NATIVE_FLOAT, attr_data); H5Sclose(va_geo_dataspace_id); H5Aclose(va_geo_attribute_id); } */ free(temp_buf); - //if(!file_exist_flag) + // if(!file_exist_flag) H5Sclose(filespace); H5Sclose(memspace); H5Pclose(plist_id); @@ -777,399 +710,405 @@ class HDF5Dump : public Dump_Strategy { H5Fclose(file_id); el3 = uptime() - el3; - //io_log("TimeHDF5Close: " << el3 << " s"); - - if (mpi_rank == 0) - { - char const *output_xml_file = "./field_hdf5/hdf5_field.xdmf"; - char dimensions_3d[128]; - sprintf(dimensions_3d, "%lld %lld %lld", field_global_size[0], field_global_size[1], field_global_size[2]); - char dimensions_4d[128]; - sprintf(dimensions_4d, "%lld %lld %lld %d", field_global_size[0], field_global_size[1], field_global_size[2], 3); - char orignal[128]; - sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); - char dxdydz[128]; - sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); - - - // TODO: remove or let the user set - int field_interval = 1; - - // TODO: remove this dependence on number of 
steps - //std::cout << "num_step " << num_step << std::endl; - - int nframes = num_step / field_interval + 1; - static int field_tframe = 0; + // io_log("TimeHDF5Close: " << el3 << " s"); + + if (mpi_rank == 0) { + char const *output_xml_file = "./field_hdf5/hdf5_field.xdmf"; + char dimensions_3d[128]; + sprintf(dimensions_3d, "%lld %lld %lld", field_global_size[0], + field_global_size[1], field_global_size[2]); + char dimensions_4d[128]; + sprintf(dimensions_4d, "%lld %lld %lld %d", field_global_size[0], + field_global_size[1], field_global_size[2], 3); + char orignal[128]; + sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); + char dxdydz[128]; + sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); + + // TODO: remove or let the user set + int field_interval = 1; + + // TODO: remove this dependence on number of steps + // std::cout << "num_step " << num_step << std::endl; + + int nframes = num_step / field_interval + 1; + static int field_tframe = 0; #ifdef DUMP_INFO_DEBUG - printf(" meta file : %s \n", output_xml_file); - printf(" array dims per var: %s \n", dimensions_3d); - printf("array dims all vars: %s \n", dimensions_4d); - printf(" orignal: %s \n", orignal); - printf(" dxdydz: %s \n", dxdydz); - printf(" nframes: %d \n", nframes); - printf(" field_interval: %d \n", field_interval); - printf(" current step: %zd \n", step_for_viou); - printf(" current step: %zd \n", step_for_viou); - - //printf(" Simulation time: %f \n", grid->t0); - printf(" tframe: %d \n", field_tframe); + printf(" meta file : %s \n", output_xml_file); + printf(" array dims per var: %s \n", dimensions_3d); + printf("array dims all vars: %s \n", dimensions_4d); + printf(" orignal: %s \n", orignal); + printf(" dxdydz: %s \n", dxdydz); + printf(" nframes: %d \n", nframes); + printf(" field_interval: %d \n", field_interval); + printf(" current step: %zd \n", step_for_viou); + printf(" current step: %zd \n", step_for_viou); + + // printf(" Simulation time: %f \n", grid->t0); + 
printf(" tframe: %d \n", field_tframe); #endif - // TODO: this footer dumping is more likely better done in a - // destructor, rather than hoping a multiple division works out - if (field_tframe >= 1) - { - if (field_tframe == (nframes - 1)) - { - invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 1); - } - else - { - invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 0); - } + // TODO: this footer dumping is more likely better done in a + // destructor, rather than hoping a multiple division works out + if (field_tframe >= 1) { + if (field_tframe == (nframes - 1)) { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, + dimensions_4d, dimensions_3d, 1); + } else { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, + dimensions_4d, dimensions_3d, 0); } - else - { - create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, field_interval); - if (field_tframe == (nframes - 1)) - { - invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 1); - } - else - { - invert_field_xml_item(output_xml_file, "fields", step_for_viou, dimensions_4d, dimensions_3d, 0); - } + } else { + create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, + nframes, field_interval); + if (field_tframe == (nframes - 1)) { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, + dimensions_4d, dimensions_3d, 1); + } else { + invert_field_xml_item(output_xml_file, "fields", step_for_viou, + dimensions_4d, dimensions_3d, 0); } - field_tframe++; - } + } + field_tframe++; } - /** - * @brief dump_particles to the HDF5 file - * Author: Bin Dong dbin@lbl.gov - * https://crd.lbl.gov/bin-dong - * Nov 2020 - * @param fbase - * @param sp - * @param grid - * @param step - * @param interpolator_array - * @param ftag - */ - void dump_particles( - const char *fbase, - species_t* sp, - grid_t* grid, - int step, - 
interpolator_array_t* interpolator_array, - int ftag - ) - { - static int file_index = 0; - file_index ++; - int mpi_rank; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - double dump_particles_uptime = uptime(); - time_t seconds = time(NULL); - // printf("Atrank = %d, file_index = %d, dump_particles_uptime = %f, epoch_seconds = %ld \n ", mpi_rank, file_index, dump_particles_uptime, seconds); - - - size_t step_for_viou = step; - char fname[256]; - char group_name[256]; - char particle_scratch[128]; - char subparticle_scratch[128]; - - int np_local; - - float *Pf; - int *Pi; - - // get the total number of particles. in this example, output only electrons - //sp = species_list; - sprintf(particle_scratch, DUMP_DIR_FORMAT, "particle_hdf5"); - FileUtils::makeDirectory(particle_scratch); - sprintf(subparticle_scratch, "%s/T.%ld/", particle_scratch, step_for_viou); - FileUtils::makeDirectory(subparticle_scratch); - - // TODO: Allow the user to set this - int stride_particle_dump = 1; - - np_local = (sp->np + stride_particle_dump - 1) / stride_particle_dump; - - // make a copy of the part of particle data to be dumped - double ec1 = uptime(); - - int sp_np = sp->np; - int sp_max_np = sp->max_np; - particle_t *ALIGNED(128) p_buf = NULL; - if (!p_buf) - MALLOC_ALIGNED(p_buf, np_local, 128); - particle_t *sp_p = sp->p; - sp->p = p_buf; - sp->np = np_local; - sp->max_np = np_local; - - for (long long iptl = 0, i = 0; iptl < sp_np; iptl += stride_particle_dump, ++i) - { - COPY(&sp->p[i], &sp_p[iptl], 1); - } + } + /** + * @brief dump_particles to the HDF5 file + * Author: Bin Dong dbin@lbl.gov + * https://crd.lbl.gov/bin-dong + * Nov 2020 + * @param fbase + * @param sp + * @param grid + * @param step + * @param interpolator_array + * @param ftag + */ + void dump_particles(const char *fbase, species_t *sp, grid_t *grid, int step, + interpolator_array_t *interpolator_array, int ftag) { + static int file_index = 0; + file_index++; + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, 
&mpi_rank); + double dump_particles_uptime = uptime(); + time_t seconds = time(NULL); + // printf("Atrank = %d, file_index = %d, dump_particles_uptime = %f, + // epoch_seconds = %ld \n ", mpi_rank, file_index, dump_particles_uptime, + // seconds); - center_p(sp, interpolator_array); + size_t step_for_viou = step; + char fname[256]; + char group_name[256]; + char particle_scratch[128]; + char subparticle_scratch[128]; + + int np_local; + + float *Pf; + int *Pi; + + // get the total number of particles. in this example, output only electrons + // sp = species_list; + sprintf(particle_scratch, DUMP_DIR_FORMAT, "particle_hdf5"); + FileUtils::makeDirectory(particle_scratch); + sprintf(subparticle_scratch, "%s/T.%ld/", particle_scratch, step_for_viou); + FileUtils::makeDirectory(subparticle_scratch); + + // TODO: Allow the user to set this + int stride_particle_dump = 1; + + np_local = (sp->np + stride_particle_dump - 1) / stride_particle_dump; + + // make a copy of the part of particle data to be dumped + double ec1 = uptime(); + + int sp_np = sp->np; + int sp_max_np = sp->max_np; + particle_t *ALIGNED(128) p_buf = NULL; + if (!p_buf) + MALLOC_ALIGNED(p_buf, np_local, 128); + particle_t *sp_p = sp->p; + sp->p = p_buf; + sp->np = np_local; + sp->max_np = np_local; + + for (long long iptl = 0, i = 0; iptl < sp_np; + iptl += stride_particle_dump, ++i) { + COPY(&sp->p[i], &sp_p[iptl], 1); + } - ec1 = uptime() - ec1; + center_p(sp, interpolator_array); + ec1 = uptime() - ec1; - //if(!mpi_rank || mpi_rank == 2047 ) - // std::cout << "on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local << std::endl; + // if(!mpi_rank || mpi_rank == 2047 ) + // std::cout << "on mpi_rank: " << mpi_rank << ", time in copying + // particle data: " << ec1 << " s" << ", np_local = " << np_local << + // std::endl; #ifndef N_FILE_N_PROCESS - int np_local_max, np_local_min; - MPI_Reduce(&np_local, &np_local_max, 1, MPI_INT, MPI_MAX, 0, 
MPI_COMM_WORLD); - MPI_Reduce(&np_local, &np_local_min, 1, MPI_INT, MPI_MIN, 0, MPI_COMM_WORLD); - //io_log("on mpi_rank: " << mpi_rank << ", time in copying particle data: " << ec1 << " s" << ", np_local = " << np_local << ",np_local_max = " << np_local_max << ", local_min = "<< np_local_min); + int np_local_max, np_local_min; + MPI_Reduce(&np_local, &np_local_max, 1, MPI_INT, MPI_MAX, 0, + MPI_COMM_WORLD); + MPI_Reduce(&np_local, &np_local_min, 1, MPI_INT, MPI_MIN, 0, + MPI_COMM_WORLD); + // io_log("on mpi_rank: " << mpi_rank << ", time in copying particle data: " + // << ec1 << " s" << ", np_local = " << np_local << ",np_local_max = " << + // np_local_max << ", local_min = "<< np_local_min); #endif - Pf = (float *)sp->p; - Pi = (int *)sp->p; + Pf = (float *)sp->p; + Pi = (int *)sp->p; - // open HDF5 file in "particle/T./" subdirectory - // filename: eparticle.h5p + // open HDF5 file in "particle/T./" subdirectory + // filename: eparticle.h5p #ifndef N_FILE_N_PROCESS - sprintf(fname, "%s/%s_%ld.h5", subparticle_scratch, sp->name, step_for_viou); + sprintf(fname, "%s/%s_%ld.h5", subparticle_scratch, sp->name, + step_for_viou); #else - sprintf(fname, "%s/%s_%ld_p%d.h5", subparticle_scratch, sp->name, step_for_viou, mpi_rank); + sprintf(fname, "%s/%s_%ld_p%d.h5", subparticle_scratch, sp->name, + step_for_viou, mpi_rank); #endif - sprintf(group_name, "/Timestep_%ld", step_for_viou); - double el1 = uptime(); - - + sprintf(group_name, "/Timestep_%ld", step_for_viou); + double el1 = uptime(); - long long total_particles, offset; - long long numparticles = np_local; + long long total_particles, offset; + long long numparticles = np_local; #ifndef N_FILE_N_PROCESS - MPI_Allreduce(&numparticles, &total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - MPI_Scan(&numparticles, &offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - offset -= numparticles; + MPI_Allreduce(&numparticles, &total_particles, 1, MPI_LONG_LONG, MPI_SUM, + MPI_COMM_WORLD); + 
MPI_Scan(&numparticles, &offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + offset -= numparticles; #else - total_particles = np_local; - offset = 0; + total_particles = np_local; + offset = 0; #endif - - hid_t file_plist_id = H5Pcreate(H5P_FILE_ACCESS); + hid_t file_plist_id = H5Pcreate(H5P_FILE_ACCESS); #ifndef N_FILE_N_PROCESS - H5Pset_fapl_mpio(file_plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); + H5Pset_fapl_mpio(file_plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); #endif #ifdef H5_ASYNC - if(!mpi_rank) printf("Enable async on particle data"); + if (!mpi_rank) + printf("Enable async on particle data"); - assert(H5Pset_vol_async(file_plist_id)); + assert(H5Pset_vol_async(file_plist_id)); #endif - hid_t file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, file_plist_id); - //if(!mpi_rank ) - //io_log("++Particle H5Fcreate) "); - + hid_t file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, file_plist_id); + // if(!mpi_rank ) + // io_log("++Particle H5Fcreate) "); - hid_t group_id = H5Gcreate(file_id, group_name, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - //if(!mpi_rank ) - //io_log("++Particle H5Gcreate) "); + hid_t group_id = + H5Gcreate(file_id, group_name, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + // if(!mpi_rank ) + // io_log("++Particle H5Gcreate) "); -#ifdef HAS_PARTICLE_COMP - if(!mpi_rank) - printf("Using Partilce Compund type !\n"); - hid_t particle_comp_type_it = H5Tcreate(H5T_COMPOUND, sizeof(particle_t)); - H5Tinsert(particle_comp_type_it, "dx", HOFFSET(particle_t, dx), H5T_NATIVE_FLOAT); - H5Tinsert(particle_comp_type_it, "dy", HOFFSET(particle_t, dy), H5T_NATIVE_FLOAT); - H5Tinsert(particle_comp_type_it, "dz", HOFFSET(particle_t, dz), H5T_NATIVE_FLOAT); - - H5Tinsert(particle_comp_type_it, "i", HOFFSET(particle_t, i), H5T_NATIVE_INT); - - H5Tinsert(particle_comp_type_it, "ux", HOFFSET(particle_t, ux), H5T_NATIVE_FLOAT); - H5Tinsert(particle_comp_type_it, "uy", HOFFSET(particle_t, uy), H5T_NATIVE_FLOAT); - H5Tinsert(particle_comp_type_it, "uz", 
HOFFSET(particle_t, uz), H5T_NATIVE_FLOAT); - H5Tinsert(particle_comp_type_it, "w", HOFFSET(particle_t, w), H5T_NATIVE_FLOAT); +#ifdef HAS_PARTICLE_COMP + if (!mpi_rank) + printf("Using Partilce Compund type !\n"); + hid_t particle_comp_type_it = H5Tcreate(H5T_COMPOUND, sizeof(particle_t)); + H5Tinsert(particle_comp_type_it, "dx", HOFFSET(particle_t, dx), + H5T_NATIVE_FLOAT); + H5Tinsert(particle_comp_type_it, "dy", HOFFSET(particle_t, dy), + H5T_NATIVE_FLOAT); + H5Tinsert(particle_comp_type_it, "dz", HOFFSET(particle_t, dz), + H5T_NATIVE_FLOAT); + + H5Tinsert(particle_comp_type_it, "i", HOFFSET(particle_t, i), + H5T_NATIVE_INT); + + H5Tinsert(particle_comp_type_it, "ux", HOFFSET(particle_t, ux), + H5T_NATIVE_FLOAT); + H5Tinsert(particle_comp_type_it, "uy", HOFFSET(particle_t, uy), + H5T_NATIVE_FLOAT); + H5Tinsert(particle_comp_type_it, "uz", HOFFSET(particle_t, uz), + H5T_NATIVE_FLOAT); + H5Tinsert(particle_comp_type_it, "w", HOFFSET(particle_t, w), + H5T_NATIVE_FLOAT); #endif - hid_t filespace = H5Screate_simple(1, (hsize_t *)&total_particles, NULL); - H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *)&offset, NULL, (hsize_t *)&numparticles, NULL); + hid_t filespace = H5Screate_simple(1, (hsize_t *)&total_particles, NULL); + H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *)&offset, NULL, + (hsize_t *)&numparticles, NULL); - //if(!mpi_rank ) - //io_log("++Particle H5Sselect_hyperslab) "); + // if(!mpi_rank ) + // io_log("++Particle H5Sselect_hyperslab) "); - //plist_id = H5P_DEFAULT; - hid_t io_plist_id = H5Pcreate(H5P_DATASET_XFER); + // plist_id = H5P_DEFAULT; + hid_t io_plist_id = H5Pcreate(H5P_DATASET_XFER); #ifndef N_FILE_N_PROCESS #ifdef HAS_INDEPENDENT_IO - if(!mpi_rank) { - printf("\n ###\n VPIC Independent I/O! \n ###\n"); - } - H5Pset_dxpl_mpio(io_plist_id, H5FD_MPIO_INDEPENDENT); + if (!mpi_rank) { + printf("\n ###\n VPIC Independent I/O! 
\n ###\n"); + } + H5Pset_dxpl_mpio(io_plist_id, H5FD_MPIO_INDEPENDENT); #else - H5Pset_dxpl_mpio(io_plist_id, H5FD_MPIO_COLLECTIVE); + H5Pset_dxpl_mpio(io_plist_id, H5FD_MPIO_COLLECTIVE); #endif #endif #ifdef H5_ASYNC - H5Pset_dxpl_async(io_plist_id, true); + H5Pset_dxpl_async(io_plist_id, true); #endif - hsize_t linearspace_count_temp = numparticles; - hid_t linearspace = H5Screate_simple(1, &linearspace_count_temp, NULL); + hsize_t linearspace_count_temp = numparticles; + hid_t linearspace = H5Screate_simple(1, &linearspace_count_temp, NULL); - hsize_t memspace_count_temp; - hid_t memspace; + hsize_t memspace_count_temp; + hid_t memspace; #ifdef HAS_PARTICLE_COMP - memspace_count_temp = numparticles ; - memspace = H5Screate_simple(1, &memspace_count_temp, NULL); + memspace_count_temp = numparticles; + memspace = H5Screate_simple(1, &memspace_count_temp, NULL); #else - memspace_count_temp = numparticles * 8; - memspace = H5Screate_simple(1, &memspace_count_temp, NULL); - hsize_t memspace_start = 0, memspace_stride = 8, memspace_count = np_local; - H5Sselect_hyperslab(memspace, H5S_SELECT_SET, &memspace_start, &memspace_stride, &memspace_count, NULL); + memspace_count_temp = numparticles * 8; + memspace = H5Screate_simple(1, &memspace_count_temp, NULL); + hsize_t memspace_start = 0, memspace_stride = 8, memspace_count = np_local; + H5Sselect_hyperslab(memspace, H5S_SELECT_SET, &memspace_start, + &memspace_stride, &memspace_count, NULL); #endif - el1 = uptime() - el1; - //if(!mpi_rank || mpi_rank == 2047 ) - //io_log("Particle TimeHDF5Open): " << el1 << " s"); //Easy to handle results for scripts - double el2 = uptime(); - int ierr; - -#define WRITE_H5_FILE(group_id_p, data_buf_p, type_p, dname_p){\ - hid_t dset_id = H5Dcreate(group_id_p, dname_p, type_p, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ - H5Dwrite(dset_id, type_p, memspace, filespace, io_plist_id, data_buf_p);\ - H5Dclose(dset_id);\ -} - - - //MPI_Info_set(info, "romio_cb_write", "disable"); 
-#define WRITE_MPI_FILE(dname_p, offset_p, data_buf_p, count_p, type_p){\ - MPI_File fh;\ - MPI_Status status;\ - sprintf(fname, "%s/%s_%ld_%s.h5", subparticle_scratch, sp->name, step_for_viou, dname_p);\ - if(mpi_rank == 0) printf("fname= %s \n", fname);\ - MPI_Info info;\ - MPI_Info_create(&info);\ - MPI_File_open(MPI_COMM_WORLD, fname, MPI_MODE_WRONLY | MPI_MODE_CREATE, info, &fh);\ - MPI_File_write_at(fh, offset_p, data_buf_p, count_p,type_p, &status);\ - MPI_Info_free(&info);\ - MPI_File_close(&fh);\ -} + el1 = uptime() - el1; + // if(!mpi_rank || mpi_rank == 2047 ) + // io_log("Particle TimeHDF5Open): " << el1 << " s"); //Easy to handle + // results for scripts + double el2 = uptime(); + int ierr; + +#define WRITE_H5_FILE(group_id_p, data_buf_p, type_p, dname_p) \ + { \ + hid_t dset_id = H5Dcreate(group_id_p, dname_p, type_p, filespace, \ + H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ + H5Dwrite(dset_id, type_p, memspace, filespace, io_plist_id, data_buf_p); \ + H5Dclose(dset_id); \ + } + + // MPI_Info_set(info, "romio_cb_write", "disable"); +#define WRITE_MPI_FILE(dname_p, offset_p, data_buf_p, count_p, type_p) \ + { \ + MPI_File fh; \ + MPI_Status status; \ + sprintf(fname, "%s/%s_%ld_%s.h5", subparticle_scratch, sp->name, \ + step_for_viou, dname_p); \ + if (mpi_rank == 0) \ + printf("fname= %s \n", fname); \ + MPI_Info info; \ + MPI_Info_create(&info); \ + MPI_File_open(MPI_COMM_WORLD, fname, MPI_MODE_WRONLY | MPI_MODE_CREATE, \ + info, &fh); \ + MPI_File_write_at(fh, offset_p, data_buf_p, count_p, type_p, &status); \ + MPI_Info_free(&info); \ + MPI_File_close(&fh); \ + } #ifdef HAS_PARTICLE_COMP - hid_t dset_id = H5Dcreate(group_id, "particle", particle_comp_type_it, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - H5Dwrite(dset_id, particle_comp_type_it, memspace, filespace, io_plist_id, sp->p); - H5Dclose(dset_id); + hid_t dset_id = H5Dcreate(group_id, "particle", particle_comp_type_it, + filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + 
H5Dwrite(dset_id, particle_comp_type_it, memspace, filespace, io_plist_id, + sp->p); + H5Dclose(dset_id); #else #ifdef TEST_MPIIO - //Here we don't use the stripe but just for performance test - if(!mpi_rank) printf("Test MPI-IO\n"); - WRITE_MPI_FILE("dX", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); - WRITE_MPI_FILE("dY", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); - WRITE_MPI_FILE("dZ", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); - WRITE_MPI_FILE("i", offset * sizeof(int), Pf, numparticles, MPI_INT); - WRITE_MPI_FILE("ux", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); - WRITE_MPI_FILE("uy", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); - WRITE_MPI_FILE("uz", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); - WRITE_MPI_FILE("q", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + // Here we don't use the stripe but just for performance test + if (!mpi_rank) + printf("Test MPI-IO\n"); + WRITE_MPI_FILE("dX", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + WRITE_MPI_FILE("dY", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + WRITE_MPI_FILE("dZ", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + WRITE_MPI_FILE("i", offset * sizeof(int), Pf, numparticles, MPI_INT); + WRITE_MPI_FILE("ux", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + WRITE_MPI_FILE("uy", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + WRITE_MPI_FILE("uz", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); + WRITE_MPI_FILE("q", offset * sizeof(float), Pf, numparticles, MPI_FLOAT); #else #ifndef N_FILE_N_PROCESS - if(!mpi_rank) printf("Test HDF5-IO Single \n"); + if (!mpi_rank) + printf("Test HDF5-IO Single \n"); #else - if(!mpi_rank) printf("Test HDF5-IO N Files N Process\n"); + if (!mpi_rank) + printf("Test HDF5-IO N Files N Process\n"); #endif - //if(!mpi_rank ) - //io_log("++Particle Starting to write ) "); - WRITE_H5_FILE(group_id, Pf, H5T_NATIVE_FLOAT, "dX") - WRITE_H5_FILE(group_id, Pf+1, 
H5T_NATIVE_FLOAT, "dY") - WRITE_H5_FILE(group_id, Pf+2, H5T_NATIVE_FLOAT, "dZ") - WRITE_H5_FILE(group_id, Pi+3, H5T_NATIVE_INT, "i") - WRITE_H5_FILE(group_id, Pf+4, H5T_NATIVE_FLOAT, "ux") - WRITE_H5_FILE(group_id, Pf+5, H5T_NATIVE_FLOAT, "uy") - WRITE_H5_FILE(group_id, Pf+6, H5T_NATIVE_FLOAT, "uz") - WRITE_H5_FILE(group_id, Pf+7, H5T_NATIVE_FLOAT, "q") + // if(!mpi_rank ) + // io_log("++Particle Starting to write ) "); + WRITE_H5_FILE(group_id, Pf, H5T_NATIVE_FLOAT, "dX") + WRITE_H5_FILE(group_id, Pf + 1, H5T_NATIVE_FLOAT, "dY") + WRITE_H5_FILE(group_id, Pf + 2, H5T_NATIVE_FLOAT, "dZ") + WRITE_H5_FILE(group_id, Pi + 3, H5T_NATIVE_INT, "i") + WRITE_H5_FILE(group_id, Pf + 4, H5T_NATIVE_FLOAT, "ux") + WRITE_H5_FILE(group_id, Pf + 5, H5T_NATIVE_FLOAT, "uy") + WRITE_H5_FILE(group_id, Pf + 6, H5T_NATIVE_FLOAT, "uz") + WRITE_H5_FILE(group_id, Pf + 7, H5T_NATIVE_FLOAT, "q") #endif #endif - el2 = uptime() - el2; - //io_log("Particle TimeHDF5Write: " << el2 << " s"); - - double el3 = uptime(); - H5Sclose(memspace); - H5Sclose(filespace); - H5Pclose(file_plist_id); - H5Pclose(io_plist_id); - H5Gclose(group_id); - + el2 = uptime() - el2; + // io_log("Particle TimeHDF5Write: " << el2 << " s"); + double el3 = uptime(); + H5Sclose(memspace); + H5Sclose(filespace); + H5Pclose(file_plist_id); + H5Pclose(io_plist_id); + H5Gclose(group_id); - H5Fclose(file_id); + H5Fclose(file_id); #ifdef H5_ASYNC - H5VLasync_finalize(); + H5VLasync_finalize(); #endif - el3 = uptime() - el3; - //io_log("Particle TimeHDF5Close: " << el3 << " s"); - - } - -/** - * @brief Dump hydro data to the HDf5 file - * Author: Bin Dong dbin@lbl.gov - * https://crd.lbl.gov/bin-dong - * Nov 2020 - * @param fbase - * @param step - * @param hydro_array - * @param sp - * @param interpolator_array - * @param grid - * @param ftag - */ -void dump_hydro( - const char *fbase, - int step, - hydro_array_t* hydro_array, - species_t* sp, - interpolator_array_t* interpolator_array, - grid_t* grid, - int ftag - ) -{ + el3 = 
uptime() - el3; + // io_log("Particle TimeHDF5Close: " << el3 << " s"); + } + + /** + * @brief Dump hydro data to the HDf5 file + * Author: Bin Dong dbin@lbl.gov + * https://crd.lbl.gov/bin-dong + * Nov 2020 + * @param fbase + * @param step + * @param hydro_array + * @param sp + * @param interpolator_array + * @param grid + * @param ftag + */ + void dump_hydro(const char *fbase, int step, hydro_array_t *hydro_array, + species_t *sp, interpolator_array_t *interpolator_array, + grid_t *grid, int ftag) { size_t step_for_viou = step; -#define DUMP_HYDRO_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ - { \ - dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ - temp_buf_index = 0; \ - for (size_t i(1); i < grid->nx + 1; i++) \ - { \ - for (size_t j(1); j < grid->ny + 1; j++) \ - { \ - for (size_t k(1); k < grid->nz + 1; k++) \ - { \ - temp_buf[temp_buf_index] = _hydro(i, j, k).ATTRIBUTE_NAME; \ - temp_buf_index = temp_buf_index + 1; \ - } \ - } \ - } \ - dataspace_id = H5Dget_space(dset_id); \ - H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); \ - H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, temp_buf); \ - H5Sclose(dataspace_id); \ - H5Dclose(dset_id); \ - } +#define DUMP_HYDRO_TO_HDF5(DSET_NAME, ATTRIBUTE_NAME, ELEMENT_TYPE) \ + { \ + dset_id = H5Dcreate(group_id, DSET_NAME, ELEMENT_TYPE, filespace, \ + H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); \ + temp_buf_index = 0; \ + for (size_t i(1); i < grid->nx + 1; i++) { \ + for (size_t j(1); j < grid->ny + 1; j++) { \ + for (size_t k(1); k < grid->nz + 1; k++) { \ + temp_buf[temp_buf_index] = _hydro(i, j, k).ATTRIBUTE_NAME; \ + temp_buf_index = temp_buf_index + 1; \ + } \ + } \ + } \ + dataspace_id = H5Dget_space(dset_id); \ + H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, \ + global_count, NULL); \ + H5Dwrite(dset_id, ELEMENT_TYPE, memspace, dataspace_id, plist_id, \ + temp_buf); 
\ + H5Sclose(dataspace_id); \ + H5Dclose(dset_id); \ + } //#define DUMP_INFO_DEBUG 1 int mpi_size, mpi_rank; MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - if (!sp) - { - ERROR(("Invalid species")); + if (!sp) { + ERROR(("Invalid species")); } clear_hydro_array(hydro_array); @@ -1185,15 +1124,16 @@ void dump_hydro( sprintf(subhydro_scratch, "%s/T.%zu/", hydro_scratch, step_for_viou); FileUtils::makeDirectory(subhydro_scratch); - sprintf(hname, "%s/hydro_%s_%zu.h5", subhydro_scratch, sp->name, step_for_viou); + sprintf(hname, "%s/hydro_%s_%zu.h5", subhydro_scratch, sp->name, + step_for_viou); double el1 = uptime(); hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); /* - if(H5Pset_libver_bounds(plist_id, H5F_LIBVER_LATEST, H5F_LIBVER_LATEST) < 0){ - exit(-1); + if(H5Pset_libver_bounds(plist_id, H5F_LIBVER_LATEST, H5F_LIBVER_LATEST) < + 0){ exit(-1); }*/ - //if((fid = H5Fcreate(FILENAME, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id)) < 0) + // if((fid = H5Fcreate(FILENAME, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id)) < 0) // ERROR_RETURN; H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL); @@ -1201,70 +1141,83 @@ void dump_hydro( H5Pclose(plist_id); sprintf(hname, "Timestep_%zu", step_for_viou); - hid_t group_id = H5Gcreate(file_id, hname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + hid_t group_id = + H5Gcreate(file_id, hname, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); el1 = uptime() - el1; - //io_log("TimeHDF5Open: " << el1 << " s"); //Easy to handle results for scripts + // io_log("TimeHDF5Open: " << el1 << " s"); //Easy to handle results for + // scripts double el2 = uptime(); // Create a variable list of field values to output. 
- //size_t numvars = std::min(global->fdParams.output_vars.bitsum(), total_field_variables); - //size_t *varlist = new size_t[numvars]; + // size_t numvars = std::min(global->fdParams.output_vars.bitsum(), + // total_field_variables); size_t *varlist = new size_t[numvars]; - //for (size_t i(0), c(0); i < total_field_variables; i++) + // for (size_t i(0), c(0); i < total_field_variables; i++) // if (global->fdParams.output_vars.bitset(i)) // varlist[c++] = i; - //printf("\nBEGIN_OUTPUT: numvars = %zu \n", numvars); + // printf("\nBEGIN_OUTPUT: numvars = %zu \n", numvars); - - //typedef struct hydro { + // typedef struct hydro { // float jx, jy, jz, rho; // Current and charge density => , - // float px, py, pz, ke; // Momentum and K.E. density => , - // float txx, tyy, tzz; // Stress diagonal => , i==j - // float tyz, tzx, txy; // Stress off-diagonal => , i!=j - // float _pad[2]; // 16-byte align + // float px, py, pz, ke; // Momentum and K.E. density => , float txx, tyy, tzz; // Stress diagonal => + // , i==j float tyz, tzx, txy; // Stress off-diagonal => , i!=j float _pad[2]; // 16-byte align //} hydro_t; #ifdef HAS_HYDRO_COMP - //if(!mpi_rank) - //printf("Using Field Compund type !\n"); - hid_t hydro_comp_type_it = H5Tcreate (H5T_COMPOUND, sizeof(hydro_t)); + // if(!mpi_rank) + // printf("Using Field Compund type !\n"); + hid_t hydro_comp_type_it = H5Tcreate(H5T_COMPOUND, sizeof(hydro_t)); H5Tinsert(hydro_comp_type_it, "jx", HOFFSET(hydro_t, jx), H5T_NATIVE_FLOAT); H5Tinsert(hydro_comp_type_it, "jy", HOFFSET(hydro_t, jy), H5T_NATIVE_FLOAT); H5Tinsert(hydro_comp_type_it, "jz", HOFFSET(hydro_t, jz), H5T_NATIVE_FLOAT); - H5Tinsert(hydro_comp_type_it, "rho", HOFFSET(hydro_t, rho), H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "rho", HOFFSET(hydro_t, rho), + H5T_NATIVE_FLOAT); H5Tinsert(hydro_comp_type_it, "px", HOFFSET(hydro_t, px), H5T_NATIVE_FLOAT); H5Tinsert(hydro_comp_type_it, "py", HOFFSET(hydro_t, py), H5T_NATIVE_FLOAT); H5Tinsert(hydro_comp_type_it, "pz", 
HOFFSET(hydro_t, pz), H5T_NATIVE_FLOAT); H5Tinsert(hydro_comp_type_it, "ke", HOFFSET(hydro_t, ke), H5T_NATIVE_FLOAT); - H5Tinsert(hydro_comp_type_it, "txx", HOFFSET(hydro_t, txx), H5T_NATIVE_FLOAT); - H5Tinsert(hydro_comp_type_it, "tyy", HOFFSET(hydro_t, tyy), H5T_NATIVE_FLOAT); - H5Tinsert(hydro_comp_type_it, "tzz", HOFFSET(hydro_t, tzz), H5T_NATIVE_FLOAT); - - H5Tinsert(hydro_comp_type_it, "tyz", HOFFSET(hydro_t, tyz), H5T_NATIVE_FLOAT); - H5Tinsert(hydro_comp_type_it, "tzx", HOFFSET(hydro_t, tzx), H5T_NATIVE_FLOAT); - H5Tinsert(hydro_comp_type_it, "txy", HOFFSET(hydro_t, txy), H5T_NATIVE_FLOAT); - H5Tinsert(hydro_comp_type_it, "pad", HOFFSET(hydro_t, _pad), H5T_NATIVE_DOUBLE); + H5Tinsert(hydro_comp_type_it, "txx", HOFFSET(hydro_t, txx), + H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "tyy", HOFFSET(hydro_t, tyy), + H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "tzz", HOFFSET(hydro_t, tzz), + H5T_NATIVE_FLOAT); + + H5Tinsert(hydro_comp_type_it, "tyz", HOFFSET(hydro_t, tyz), + H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "tzx", HOFFSET(hydro_t, tzx), + H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "txy", HOFFSET(hydro_t, txy), + H5T_NATIVE_FLOAT); + H5Tinsert(hydro_comp_type_it, "pad", HOFFSET(hydro_t, _pad), + H5T_NATIVE_DOUBLE); #endif - //typedef struct hydro_array { + // typedef struct hydro_array { // hydro_t * ALIGNED(128) h; // grid_t * g; //} hydro_array_t; - float *temp_buf = (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); + float *temp_buf = + (float *)malloc(sizeof(float) * (grid->nx) * (grid->ny) * (grid->nz)); hsize_t temp_buf_index; hid_t dset_id; - //char *field_var_name[] = {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; + // char *field_var_name[] = + // {"ex","ey","ez","div_e_err","cbx","cby","cbz","div_b_err","tcax","tcay","tcaz","rhob","jfx","jfy","jfz","rhof"}; plist_id = H5Pcreate(H5P_DATASET_XFER); - //Comment out for test only + // 
Comment out for test only H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); - //H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, (hsize_t *) &numparticles, NULL); + // H5Sselect_hyperslab(filespace, H5S_SELECT_SET, (hsize_t *) &offset, NULL, + // (hsize_t *) &numparticles, NULL); - //global->topology_x + // global->topology_x - hsize_t hydro_global_size[3], hydro_local_size[3], global_offset[3], global_count[3]; + hsize_t hydro_global_size[3], hydro_local_size[3], global_offset[3], + global_count[3]; hydro_global_size[0] = (grid->nx * grid->gpx); hydro_global_size[1] = (grid->ny * grid->gpy); hydro_global_size[2] = (grid->nz * grid->gpz); @@ -1274,7 +1227,8 @@ void dump_hydro( hydro_local_size[2] = grid->nz; int mpi_rank_x, mpi_rank_y, mpi_rank_z; - UNVOXEL(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z, grid->gpx, grid->gpy, grid->gpz); + UNVOXEL(mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z, grid->gpx, grid->gpy, + grid->gpz); global_offset[0] = (grid->nx) * mpi_rank_x; global_offset[1] = (grid->ny) * mpi_rank_y; @@ -1285,10 +1239,14 @@ void dump_hydro( global_count[2] = (grid->nz); #ifdef DUMP_INFO_DEBUG - printf("global size = %llu %llu %llu \n", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); - printf("global_offset = %llu %llu %llu \n", global_offset[0], global_offset[1], global_offset[2]); - printf("global_count = %llu %llu %llu \n", global_count[0], global_count[1], global_count[2]); - printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, mpi_rank_y, mpi_rank_z); + printf("global size = %llu %llu %llu \n", hydro_global_size[0], + hydro_global_size[1], hydro_global_size[2]); + printf("global_offset = %llu %llu %llu \n", global_offset[0], + global_offset[1], global_offset[2]); + printf("global_count = %llu %llu %llu \n", global_count[0], + global_count[1], global_count[2]); + printf("mpi-rank = %d, rank index = (%d, %d, %d) \n", mpi_rank, mpi_rank_x, + mpi_rank_y, mpi_rank_z); fflush(stdout); 
#endif @@ -1296,31 +1254,34 @@ void dump_hydro( hid_t memspace = H5Screate_simple(3, hydro_local_size, NULL); hid_t dataspace_id; - //typedef struct hydro { + // typedef struct hydro { // float jx, jy, jz, rho; // Current and charge density => , - // float px, py, pz, ke; // Momentum and K.E. density => , - // float txx, tyy, tzz; // Stress diagonal => , i==j - // float tyz, tzx, txy; // Stress off-diagonal => , i!=j - // float _pad[2]; // 16-byte align + // float px, py, pz, ke; // Momentum and K.E. density => , float txx, tyy, tzz; // Stress diagonal => + // , i==j float tyz, tzx, txy; // Stress off-diagonal => , i!=j float _pad[2]; // 16-byte align //} hydro_t; - #ifdef HAS_HYDRO_COMP - hydro_t *hydro_buf = (hydro_t *)malloc(sizeof(hydro_t) * (grid->nx) * (grid->ny) * (grid->nz)); + hydro_t *hydro_buf = (hydro_t *)malloc(sizeof(hydro_t) * (grid->nx) * + (grid->ny) * (grid->nz)); temp_buf_index = 0; - for (size_t i(1); i < grid->nx + 1; i++){ - for (size_t j(1); j < grid->ny + 1; j++){ - for (size_t k(1); k < grid->nz + 1; k++){ - hydro_buf[temp_buf_index] = _hydro(i, j, k); - temp_buf_index = temp_buf_index + 1; - } + for (size_t i(1); i < grid->nx + 1; i++) { + for (size_t j(1); j < grid->ny + 1; j++) { + for (size_t k(1); k < grid->nz + 1; k++) { + hydro_buf[temp_buf_index] = _hydro(i, j, k); + temp_buf_index = temp_buf_index + 1; } + } } - dset_id = H5Dcreate(group_id, "hydro", hydro_comp_type_it, filespace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + dset_id = H5Dcreate(group_id, "hydro", hydro_comp_type_it, filespace, + H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); dataspace_id = H5Dget_space(dset_id); - H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, global_count, NULL); - H5Dwrite(dset_id, hydro_comp_type_it, memspace, dataspace_id, plist_id, hydro_buf); + H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, global_offset, NULL, + global_count, NULL); + H5Dwrite(dset_id, hydro_comp_type_it, memspace, dataspace_id, plist_id, + hydro_buf); 
free(hydro_buf); H5Sclose(dataspace_id); H5Dclose(dset_id); @@ -1328,44 +1289,44 @@ void dump_hydro( #else if (hydro_dump_flag.jx) - DUMP_HYDRO_TO_HDF5("jx", jx, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("jx", jx, H5T_NATIVE_FLOAT); if (hydro_dump_flag.jy) - DUMP_HYDRO_TO_HDF5("jy", jy, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("jy", jy, H5T_NATIVE_FLOAT); if (hydro_dump_flag.jz) - DUMP_HYDRO_TO_HDF5("jz", jz, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("jz", jz, H5T_NATIVE_FLOAT); if (hydro_dump_flag.rho) - DUMP_HYDRO_TO_HDF5("rho", rho, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("rho", rho, H5T_NATIVE_FLOAT); if (hydro_dump_flag.px) - DUMP_HYDRO_TO_HDF5("px", px, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("px", px, H5T_NATIVE_FLOAT); if (hydro_dump_flag.py) - DUMP_HYDRO_TO_HDF5("py", py, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("py", py, H5T_NATIVE_FLOAT); if (hydro_dump_flag.pz) - DUMP_HYDRO_TO_HDF5("pz", pz, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("pz", pz, H5T_NATIVE_FLOAT); if (hydro_dump_flag.ke) - DUMP_HYDRO_TO_HDF5("ke", ke, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("ke", ke, H5T_NATIVE_FLOAT); if (hydro_dump_flag.txx) - DUMP_HYDRO_TO_HDF5("txx", txx, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("txx", txx, H5T_NATIVE_FLOAT); if (hydro_dump_flag.tyy) - DUMP_HYDRO_TO_HDF5("tyy", tyy, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("tyy", tyy, H5T_NATIVE_FLOAT); if (hydro_dump_flag.tzz) - DUMP_HYDRO_TO_HDF5("tzz", tzz, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("tzz", tzz, H5T_NATIVE_FLOAT); if (hydro_dump_flag.tyz) - DUMP_HYDRO_TO_HDF5("tyz", tyz, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("tyz", tyz, H5T_NATIVE_FLOAT); if (hydro_dump_flag.tzx) - DUMP_HYDRO_TO_HDF5("tzx", tzx, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("tzx", tzx, H5T_NATIVE_FLOAT); if (hydro_dump_flag.txy) - DUMP_HYDRO_TO_HDF5("txy", txy, H5T_NATIVE_FLOAT); + DUMP_HYDRO_TO_HDF5("txy", txy, H5T_NATIVE_FLOAT); el2 = uptime() - el2; - //io_log("TimeHDF5Write: " << el2 << " s"); + // io_log("TimeHDF5Write: " << el2 << " s"); #endif 
double el3 = uptime(); - //Write metadata (geo original and geo dx/dy/dz) for ArrayUDF + // Write metadata (geo original and geo dx/dy/dz) for ArrayUDF /* float attr_data[2][3]; attr_data[0][0] = grid->x0; @@ -1378,7 +1339,8 @@ void dump_hydro( dims[0] = 2; dims[1] = 3; hid_t va_geo_dataspace_id = H5Screate_simple(2, dims, NULL); - hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); + hid_t va_geo_attribute_id = H5Acreate2(file_id, "VPIC-ArrayUDF-GEO", + H5T_IEEE_F32BE, va_geo_dataspace_id, H5P_DEFAULT, H5P_DEFAULT); H5Awrite(va_geo_attribute_id, H5T_NATIVE_FLOAT, attr_data); H5Sclose(va_geo_dataspace_id); H5Aclose(va_geo_attribute_id);*/ @@ -1391,720 +1353,685 @@ void dump_hydro( H5Fclose(file_id); el3 = uptime() - el3; - //io_log("TimeHDF5Close: " << el3 << " s"); - - if (mpi_rank == 0) - { - char output_xml_file[128]; - sprintf(output_xml_file, "./%s/%s%s%s", "hydro_hdf5", "hydro-", sp->name, ".xdmf"); - char dimensions_3d[128]; - sprintf(dimensions_3d, "%lld %lld %lld", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2]); - char dimensions_4d[128]; - sprintf(dimensions_4d, "%lld %lld %lld %d", hydro_global_size[0], hydro_global_size[1], hydro_global_size[2], 3); - char orignal[128]; - sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); - char dxdydz[128]; - sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); - - // TODO: remove or let user set - int hydro_interval = 1; - - // TODO: remove this dependence on number of steps - int nframes = num_step / hydro_interval + 1; - - const int tframe = tframe_map[sp->id]; + // io_log("TimeHDF5Close: " << el3 << " s"); + + if (mpi_rank == 0) { + char output_xml_file[128]; + sprintf(output_xml_file, "./%s/%s%s%s", "hydro_hdf5", "hydro-", sp->name, + ".xdmf"); + char dimensions_3d[128]; + sprintf(dimensions_3d, "%lld %lld %lld", hydro_global_size[0], + hydro_global_size[1], hydro_global_size[2]); + char 
dimensions_4d[128]; + sprintf(dimensions_4d, "%lld %lld %lld %d", hydro_global_size[0], + hydro_global_size[1], hydro_global_size[2], 3); + char orignal[128]; + sprintf(orignal, "%f %f %f", grid->x0, grid->y0, grid->z0); + char dxdydz[128]; + sprintf(dxdydz, "%f %f %f", grid->dx, grid->dy, grid->dz); + + // TODO: remove or let user set + int hydro_interval = 1; + + // TODO: remove this dependence on number of steps + int nframes = num_step / hydro_interval + 1; + + const int tframe = tframe_map[sp->id]; #ifdef DUMP_INFO_DEBUG - printf(" meta file : %s \n", output_xml_file); - printf(" array dims per var: %s \n", dimensions_3d); - printf("array dims all vars: %s \n", dimensions_4d); - printf(" orignal: %s \n", orignal); - printf(" dxdydz: %s \n", dxdydz); - printf(" nframes: %d \n", nframes); - printf(" hydro_fields_interval: %d \n", hydro_interval); - printf(" current step: %zu \n", step_for_viou); - printf(" Simulation time: %f \n", grid->t0); - printf(" tframe: %d \n", tframe); + printf(" meta file : %s \n", output_xml_file); + printf(" array dims per var: %s \n", dimensions_3d); + printf("array dims all vars: %s \n", dimensions_4d); + printf(" orignal: %s \n", orignal); + printf(" dxdydz: %s \n", dxdydz); + printf(" nframes: %d \n", nframes); + printf(" hydro_fields_interval: %d \n", hydro_interval); + printf(" current step: %zu \n", step_for_viou); + printf(" Simulation time: %f \n", grid->t0); + printf(" tframe: %d \n", tframe); #endif - // TODO: why doesnt this just use the cstr? - char speciesname_new[128]; - sprintf(speciesname_new, "hydro_%s", sp->name); - if (tframe >= 1) - { - if (tframe == (nframes - 1)) - { - invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 1); - } - else - { - invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); - } + // TODO: why doesnt this just use the cstr? 
+ char speciesname_new[128]; + sprintf(speciesname_new, "hydro_%s", sp->name); + if (tframe >= 1) { + if (tframe == (nframes - 1)) { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, + dimensions_4d, dimensions_3d, 1); + } else { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, + dimensions_4d, dimensions_3d, 0); } - else - { - create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, nframes, hydro_interval); - if (tframe == (nframes - 1)) - { - invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 1); - } - else - { - invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, dimensions_4d, dimensions_3d, 0); - } + } else { + create_file_with_header(output_xml_file, dimensions_3d, orignal, dxdydz, + nframes, hydro_interval); + if (tframe == (nframes - 1)) { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, + dimensions_4d, dimensions_3d, 1); + } else { + invert_hydro_xml_item(output_xml_file, speciesname_new, step_for_viou, + dimensions_4d, dimensions_3d, 0); } - tframe_map[sp->id]++; + } + tframe_map[sp->id]++; } -} + } }; #endif #ifdef VPIC_ENABLE_OPENPMD class OpenPMDDump : public Dump_Strategy { - public: - //openPMD::Series* series; - using Dump_Strategy::Dump_Strategy; // inherit constructor - - //std::string file_type = ".h5"; - std::string file_type = ".bp"; - - void dump_fields( - const char *fbase, - int step, - grid_t* grid, - field_array_t* field_array, - int ftag - ) - { - std::cout << "Writing openPMD data" << std::endl; - - std::string full_file_name = fbase + file_type; - - //if (series == nullptr) { - std::cout << "init series" << std::endl; - openPMD::Series series = openPMD::Series( - full_file_name, - openPMD::AccessType::CREATE, - MPI_COMM_WORLD - ); - //} - - std::cout << "Writing iteration " << step << std::endl; - auto i = series.iterations[ step ]; - // TODO: it would be nice to set these... 
- //series.setAuthor( "Axel Huebl "); - //series.setMachine( "Hall Probe 5000, Model 3"); - i.setAttribute( "vacuum", true); - - auto cB = i.meshes["B"]; - auto E = i.meshes["E"]; - auto J = i.meshes["J"]; - auto Tca = i.meshes["Tca"]; - auto Emat = i.meshes["Emat"]; - auto Fmat = i.meshes["Fmat"]; - auto Rho = i.meshes["Rho"]; - auto DivErr = i.meshes["DivErr"]; - - // record components - auto Cbx = cB["x"]; - auto Cby = cB["y"]; - auto Cbz = cB["z"]; - - auto Ex = E["x"]; - auto Ey = E["y"]; - auto Ez = E["z"]; - - auto Jx = J["x"]; - auto Jy = J["y"]; - auto Jz = J["z"]; - - auto Tcax = Tca["x"]; - auto Tcay = Tca["y"]; - auto Tcaz = Tca["z"]; - - auto Ematx = Emat["x"]; - auto Ematy = Emat["y"]; - auto Ematz = Emat["z"]; - - auto Fmatx = Fmat["x"]; - auto Fmaty = Fmat["y"]; - auto Fmatz = Fmat["z"]; - - auto RhoB = Rho["B"]; - auto RhoF = Rho["F"]; - - auto DivEErr = DivErr["E"]; - auto DivBErr = DivErr["B"]; - - // TODO: set unitDimension so the anaylsis software knows what fields - // things are - // - // // TODO: add timers for the convert and for the write - - size_t gnx = (grid->nx * grid->gpx); - size_t gny = (grid->ny * grid->gpy); - size_t gnz = (grid->nz * grid->gpz); - openPMD::Extent global_extent = {gnx, gny, gnz}; - - openPMD::Datatype datatype = openPMD::determineDatatype(); - openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); - - Cbx.resetDataset(dataset); - Cby.resetDataset(dataset); - Cbz.resetDataset(dataset); - - Ex.resetDataset(dataset); - Ey.resetDataset(dataset); - Ez.resetDataset(dataset); - - Jx.resetDataset(dataset); - Jy.resetDataset(dataset); - Jz.resetDataset(dataset); - - Tcax.resetDataset(dataset); - Tcay.resetDataset(dataset); - Tcaz.resetDataset(dataset); - - Ematx.resetDataset(dataset); - Ematy.resetDataset(dataset); - Ematz.resetDataset(dataset); - - Fmatx.resetDataset(dataset); - Fmaty.resetDataset(dataset); - Fmatz.resetDataset(dataset); - - RhoB.resetDataset(dataset); - RhoF.resetDataset(dataset); - - 
DivEErr.resetDataset(dataset); - DivBErr.resetDataset(dataset); - - // TODO: hoist this conversion code, as is it used elsewhere - // Convert rank to local x/y/z - int rx, ry, rz; - UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); - - size_t nx = grid->nx; - size_t ny = grid->ny; - size_t nz = grid->nz; - - // NOTE: this assumes a static mesh decomposition in nx/ny/nz - size_t global_offset_x = (nx) * rx; - size_t global_offset_y = (ny) * ry; - size_t global_offset_z = (nz) * rz; - - openPMD::Offset chunk_offset = {global_offset_x, global_offset_y, global_offset_z}; - openPMD::Extent chunk_extent = {nx, ny, nz}; - - std::cout << "Local offset " << - " x: " << global_offset_x << - " y: " << global_offset_y << - " z: " << global_offset_z << - std::endl; - - // Store a local copy of the data which we pull out of the AoS - std::vector cbx_data; - std::vector cby_data; - std::vector cbz_data; - - std::vector ex_data; - std::vector ey_data; - std::vector ez_data; - - std::vector jx_data; - std::vector jy_data; - std::vector jz_data; - - std::vector tcax_data; - std::vector tcay_data; - std::vector tcaz_data; - - // TODO: these are material_id (ints not floats) - std::vector ematx_data; - std::vector ematy_data; - std::vector ematz_data; - - std::vector fmatx_data; - std::vector fmaty_data; - std::vector fmatz_data; - // end todo - - std::vector rhob_data; - std::vector rhof_data; - - std::vector divb_data; - std::vector dive_data; - - size_t nv = nx * ny * nz; - - // TODO: resize here will zero out the data which we don't need, we - // could change to a different semantic to avoid this - cbx_data.resize(nv); - cby_data.resize(nv); - cbz_data.resize(nv); - - ex_data.resize(nv); - ey_data.resize(nv); - ez_data.resize(nv); - - jx_data.resize(nv); - jy_data.resize(nv); - jz_data.resize(nv); - - tcax_data.resize(nv); - tcay_data.resize(nv); - tcaz_data.resize(nv); - - ematx_data.resize(nv); - ematy_data.resize(nv); - ematz_data.resize(nv); - - 
fmatx_data.resize(nv); - fmaty_data.resize(nv); - fmatz_data.resize(nv); - - rhob_data.resize(nv); - rhof_data.resize(nv); - - divb_data.resize(nv); - dive_data.resize(nv); +public: + // openPMD::Series* series; + using Dump_Strategy::Dump_Strategy; // inherit constructor + + // std::string file_type = ".h5"; + std::string file_type = ".bp"; + + void dump_fields(const char *fbase, int step, grid_t *grid, + field_array_t *field_array, int ftag) { + std::cout << "Writing openPMD data" << std::endl; + + std::string full_file_name = fbase + file_type; + + // if (series == nullptr) { + std::cout << "init series" << std::endl; + openPMD::Series series = openPMD::Series( + full_file_name, openPMD::AccessType::CREATE, MPI_COMM_WORLD); + //} + + std::cout << "Writing iteration " << step << std::endl; + auto i = series.iterations[step]; + // TODO: it would be nice to set these... + // series.setAuthor( "Axel Huebl "); + // series.setMachine( "Hall Probe 5000, Model 3"); + i.setAttribute("vacuum", true); + + auto cB = i.meshes["B"]; + auto E = i.meshes["E"]; + auto J = i.meshes["J"]; + auto Tca = i.meshes["Tca"]; + auto Emat = i.meshes["Emat"]; + auto Fmat = i.meshes["Fmat"]; + auto Rho = i.meshes["Rho"]; + auto DivErr = i.meshes["DivErr"]; + + // record components + auto Cbx = cB["x"]; + auto Cby = cB["y"]; + auto Cbz = cB["z"]; + + auto Ex = E["x"]; + auto Ey = E["y"]; + auto Ez = E["z"]; + + auto Jx = J["x"]; + auto Jy = J["y"]; + auto Jz = J["z"]; + + auto Tcax = Tca["x"]; + auto Tcay = Tca["y"]; + auto Tcaz = Tca["z"]; + + auto Ematx = Emat["x"]; + auto Ematy = Emat["y"]; + auto Ematz = Emat["z"]; + + auto Fmatx = Fmat["x"]; + auto Fmaty = Fmat["y"]; + auto Fmatz = Fmat["z"]; + + auto RhoB = Rho["B"]; + auto RhoF = Rho["F"]; + + auto DivEErr = DivErr["E"]; + auto DivBErr = DivErr["B"]; + + // TODO: set unitDimension so the anaylsis software knows what fields + // things are + // + // // TODO: add timers for the convert and for the write + + size_t gnx = (grid->nx * 
grid->gpx); + size_t gny = (grid->ny * grid->gpy); + size_t gnz = (grid->nz * grid->gpz); + openPMD::Extent global_extent = {gnx, gny, gnz}; + + openPMD::Datatype datatype = openPMD::determineDatatype(); + openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); + + Cbx.resetDataset(dataset); + Cby.resetDataset(dataset); + Cbz.resetDataset(dataset); + + Ex.resetDataset(dataset); + Ey.resetDataset(dataset); + Ez.resetDataset(dataset); + + Jx.resetDataset(dataset); + Jy.resetDataset(dataset); + Jz.resetDataset(dataset); + + Tcax.resetDataset(dataset); + Tcay.resetDataset(dataset); + Tcaz.resetDataset(dataset); + + Ematx.resetDataset(dataset); + Ematy.resetDataset(dataset); + Ematz.resetDataset(dataset); + + Fmatx.resetDataset(dataset); + Fmaty.resetDataset(dataset); + Fmatz.resetDataset(dataset); + + RhoB.resetDataset(dataset); + RhoF.resetDataset(dataset); + + DivEErr.resetDataset(dataset); + DivBErr.resetDataset(dataset); + + // TODO: hoist this conversion code, as is it used elsewhere + // Convert rank to local x/y/z + int rx, ry, rz; + UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); + + size_t nx = grid->nx; + size_t ny = grid->ny; + size_t nz = grid->nz; + + // NOTE: this assumes a static mesh decomposition in nx/ny/nz + size_t global_offset_x = (nx)*rx; + size_t global_offset_y = (ny)*ry; + size_t global_offset_z = (nz)*rz; + + openPMD::Offset chunk_offset = {global_offset_x, global_offset_y, + global_offset_z}; + openPMD::Extent chunk_extent = {nx, ny, nz}; + + std::cout << "Local offset " + << " x: " << global_offset_x << " y: " << global_offset_y + << " z: " << global_offset_z << std::endl; + + // Store a local copy of the data which we pull out of the AoS + std::vector cbx_data; + std::vector cby_data; + std::vector cbz_data; + + std::vector ex_data; + std::vector ey_data; + std::vector ez_data; + + std::vector jx_data; + std::vector jy_data; + std::vector jz_data; + + std::vector tcax_data; + std::vector tcay_data; + std::vector 
tcaz_data; + + // TODO: these are material_id (ints not floats) + std::vector ematx_data; + std::vector ematy_data; + std::vector ematz_data; - // TODO: make this AoS to SoA conversion a function - - // We could do 1D here, but we don't really care about the ghosts, and we - // can thread over nz/ny (collapsed?) - // Go over non-ghosts and grab just that data into a dense array - for (size_t k = 1; k < grid->nz + 1; k++) - { - for (size_t j = 1; j < grid->ny + 1; j++) - { - for (size_t i = 1; i < grid->nx + 1; i++) - { - int local_index = VOXEL(i-1, j-1, k-1, grid->nx-2, grid->ny-2, grid->nz-2); - int global_index = VOXEL(i, j, k, grid->nx, grid->ny, grid->nz); - - cbx_data[local_index] = field_array->f[global_index].cbx; - cby_data[local_index] = field_array->f[global_index].cby; - cbz_data[local_index] = field_array->f[global_index].cbz; - - ex_data[local_index] = field_array->f[global_index].ex; - ey_data[local_index] = field_array->f[global_index].ey; - ez_data[local_index] = field_array->f[global_index].ez; - - jx_data[local_index] = field_array->f[global_index].jfx; - jy_data[local_index] = field_array->f[global_index].jfy; - jz_data[local_index] = field_array->f[global_index].jfz; - - tcax_data[local_index] = field_array->f[global_index].tcax; - tcay_data[local_index] = field_array->f[global_index].tcay; - tcaz_data[local_index] = field_array->f[global_index].tcaz; - - ematx_data[local_index] = field_array->f[global_index].ematx; - ematy_data[local_index] = field_array->f[global_index].ematy; - ematz_data[local_index] = field_array->f[global_index].ematz; - - fmatx_data[local_index] = field_array->f[global_index].fmatx; - fmaty_data[local_index] = field_array->f[global_index].fmaty; - fmatz_data[local_index] = field_array->f[global_index].fmatz; + std::vector fmatx_data; + std::vector fmaty_data; + std::vector fmatz_data; + // end todo - rhob_data[local_index] = field_array->f[global_index].rhob; - rhof_data[local_index] = field_array->f[global_index].rhof; 
+ std::vector rhob_data; + std::vector rhof_data; - dive_data[local_index] = field_array->f[global_index].div_e_err; - divb_data[local_index] = field_array->f[global_index].div_b_err; - } - } - } - - Cbx.storeChunk( cbx_data, chunk_offset, chunk_extent); - Cby.storeChunk( cby_data, chunk_offset, chunk_extent); - Cbz.storeChunk( cbz_data, chunk_offset, chunk_extent); - - Ex.storeChunk( ex_data, chunk_offset, chunk_extent); - Ey.storeChunk( ey_data, chunk_offset, chunk_extent); - Ez.storeChunk( ez_data, chunk_offset, chunk_extent); + std::vector divb_data; + std::vector dive_data; + + size_t nv = nx * ny * nz; - Jx.storeChunk( jx_data, chunk_offset, chunk_extent); - Jy.storeChunk( jy_data, chunk_offset, chunk_extent); - Jz.storeChunk( jz_data, chunk_offset, chunk_extent); + // TODO: resize here will zero out the data which we don't need, we + // could change to a different semantic to avoid this + cbx_data.resize(nv); + cby_data.resize(nv); + cbz_data.resize(nv); + + ex_data.resize(nv); + ey_data.resize(nv); + ez_data.resize(nv); - Tcax.storeChunk( tcax_data, chunk_offset, chunk_extent); - Tcay.storeChunk( tcay_data, chunk_offset, chunk_extent); - Tcaz.storeChunk( tcaz_data, chunk_offset, chunk_extent); + jx_data.resize(nv); + jy_data.resize(nv); + jz_data.resize(nv); + + tcax_data.resize(nv); + tcay_data.resize(nv); + tcaz_data.resize(nv); + + ematx_data.resize(nv); + ematy_data.resize(nv); + ematz_data.resize(nv); + + fmatx_data.resize(nv); + fmaty_data.resize(nv); + fmatz_data.resize(nv); + + rhob_data.resize(nv); + rhof_data.resize(nv); - Ematx.storeChunk( ematx_data, chunk_offset, chunk_extent); - Ematy.storeChunk( ematy_data, chunk_offset, chunk_extent); - Ematz.storeChunk( ematz_data, chunk_offset, chunk_extent); - - Fmatx.storeChunk( fmatx_data, chunk_offset, chunk_extent); - Fmaty.storeChunk( fmaty_data, chunk_offset, chunk_extent); - Fmatz.storeChunk( fmatz_data, chunk_offset, chunk_extent); - - RhoB.storeChunk( rhob_data, chunk_offset, chunk_extent); - 
RhoF.storeChunk( rhof_data, chunk_offset, chunk_extent); + divb_data.resize(nv); + dive_data.resize(nv); - DivEErr.storeChunk( dive_data, chunk_offset, chunk_extent); - DivBErr.storeChunk( divb_data, chunk_offset, chunk_extent); + // TODO: make this AoS to SoA conversion a function + + // We could do 1D here, but we don't really care about the ghosts, and we + // can thread over nz/ny (collapsed?) + // Go over non-ghosts and grab just that data into a dense array + for (size_t k = 1; k < grid->nz + 1; k++) { + for (size_t j = 1; j < grid->ny + 1; j++) { + for (size_t i = 1; i < grid->nx + 1; i++) { + int local_index = VOXEL(i - 1, j - 1, k - 1, grid->nx - 2, + grid->ny - 2, grid->nz - 2); + int global_index = VOXEL(i, j, k, grid->nx, grid->ny, grid->nz); + + cbx_data[local_index] = field_array->f[global_index].cbx; + cby_data[local_index] = field_array->f[global_index].cby; + cbz_data[local_index] = field_array->f[global_index].cbz; - series.flush(); + ex_data[local_index] = field_array->f[global_index].ex; + ey_data[local_index] = field_array->f[global_index].ey; + ez_data[local_index] = field_array->f[global_index].ez; + + jx_data[local_index] = field_array->f[global_index].jfx; + jy_data[local_index] = field_array->f[global_index].jfy; + jz_data[local_index] = field_array->f[global_index].jfz; + + tcax_data[local_index] = field_array->f[global_index].tcax; + tcay_data[local_index] = field_array->f[global_index].tcay; + tcaz_data[local_index] = field_array->f[global_index].tcaz; + + ematx_data[local_index] = field_array->f[global_index].ematx; + ematy_data[local_index] = field_array->f[global_index].ematy; + ematz_data[local_index] = field_array->f[global_index].ematz; + + fmatx_data[local_index] = field_array->f[global_index].fmatx; + fmaty_data[local_index] = field_array->f[global_index].fmaty; + fmatz_data[local_index] = field_array->f[global_index].fmatz; + + rhob_data[local_index] = field_array->f[global_index].rhob; + rhof_data[local_index] = 
field_array->f[global_index].rhof; + + dive_data[local_index] = field_array->f[global_index].div_e_err; + divb_data[local_index] = field_array->f[global_index].div_b_err; } + } + } + + Cbx.storeChunk(cbx_data, chunk_offset, chunk_extent); + Cby.storeChunk(cby_data, chunk_offset, chunk_extent); + Cbz.storeChunk(cbz_data, chunk_offset, chunk_extent); + + Ex.storeChunk(ex_data, chunk_offset, chunk_extent); + Ey.storeChunk(ey_data, chunk_offset, chunk_extent); + Ez.storeChunk(ez_data, chunk_offset, chunk_extent); + + Jx.storeChunk(jx_data, chunk_offset, chunk_extent); + Jy.storeChunk(jy_data, chunk_offset, chunk_extent); + Jz.storeChunk(jz_data, chunk_offset, chunk_extent); + + Tcax.storeChunk(tcax_data, chunk_offset, chunk_extent); + Tcay.storeChunk(tcay_data, chunk_offset, chunk_extent); + Tcaz.storeChunk(tcaz_data, chunk_offset, chunk_extent); + + Ematx.storeChunk(ematx_data, chunk_offset, chunk_extent); + Ematy.storeChunk(ematy_data, chunk_offset, chunk_extent); + Ematz.storeChunk(ematz_data, chunk_offset, chunk_extent); - void dump_particles( - const char *fbase, - species_t* sp, - grid_t* grid, - int step, - interpolator_array_t* interpolator_array, - int ftag - ) - { - std::string full_file_name = fbase + file_type; + Fmatx.storeChunk(fmatx_data, chunk_offset, chunk_extent); + Fmaty.storeChunk(fmaty_data, chunk_offset, chunk_extent); + Fmatz.storeChunk(fmatz_data, chunk_offset, chunk_extent); - std::cout << "writing particles to " << full_file_name << std::endl; + RhoB.storeChunk(rhob_data, chunk_offset, chunk_extent); + RhoF.storeChunk(rhof_data, chunk_offset, chunk_extent); - //if (series == nullptr) { - openPMD::Series series = openPMD::Series( - full_file_name, - openPMD::AccessType::CREATE, - MPI_COMM_WORLD - ); - //} + DivEErr.storeChunk(dive_data, chunk_offset, chunk_extent); + DivBErr.storeChunk(divb_data, chunk_offset, chunk_extent); - auto i = series.iterations[ step ]; + series.flush(); + } - // TODO: set these - i.setTime( (float)step ); - 
i.setDt(1.0); - i.setTimeUnitSI(1.0); + void dump_particles(const char *fbase, species_t *sp, grid_t *grid, int step, + interpolator_array_t *interpolator_array, int ftag) { + std::string full_file_name = fbase + file_type; - auto& p = i.particles[sp->name]; + std::cout << "writing particles to " << full_file_name << std::endl; - const int np = sp->np; + // if (series == nullptr) { + openPMD::Series series = openPMD::Series( + full_file_name, openPMD::AccessType::CREATE, MPI_COMM_WORLD); + //} - // TODO: this could be a function call as it's used elsewhere (in hdf5) - unsigned long long total_particles, offset; - unsigned long long numparticles = np; - MPI_Allreduce(&numparticles, &total_particles, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - MPI_Scan(&numparticles, &offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); - offset -= numparticles; + auto i = series.iterations[step]; - openPMD::Extent global_extent = {total_particles}; - openPMD::Datatype datatype = openPMD::determineDatatype(); - openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); + // TODO: set these + i.setTime((float)step); + i.setDt(1.0); + i.setTimeUnitSI(1.0); - auto px = p["position"]["x"]; - auto pxo = p["positionOffset"]["x"]; + auto &p = i.particles[sp->name]; - auto py = p["position"]["y"]; - auto pyo = p["positionOffset"]["y"]; + const int np = sp->np; - auto pz = p["position"]["z"]; - auto pzo = p["positionOffset"]["z"]; + // TODO: this could be a function call as it's used elsewhere (in hdf5) + unsigned long long total_particles, offset; + unsigned long long numparticles = np; + MPI_Allreduce(&numparticles, &total_particles, 1, MPI_LONG_LONG, MPI_SUM, + MPI_COMM_WORLD); + MPI_Scan(&numparticles, &offset, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); + offset -= numparticles; - auto ux = p["velocity"]["x"]; - auto uy = p["velocity"]["y"]; - auto uz = p["velocity"]["z"]; + openPMD::Extent global_extent = {total_particles}; + openPMD::Datatype datatype = 
openPMD::determineDatatype(); + openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); - px.resetDataset(dataset); - pxo.resetDataset(dataset); + auto px = p["position"]["x"]; + auto pxo = p["positionOffset"]["x"]; - py.resetDataset(dataset); - pyo.resetDataset(dataset); + auto py = p["position"]["y"]; + auto pyo = p["positionOffset"]["y"]; - pz.resetDataset(dataset); - pzo.resetDataset(dataset); + auto pz = p["position"]["z"]; + auto pzo = p["positionOffset"]["z"]; - ux.resetDataset(dataset); - uy.resetDataset(dataset); - uz.resetDataset(dataset); - // convert data to SoA, allowing the user to chunk the operation + auto ux = p["velocity"]["x"]; + auto uy = p["velocity"]["y"]; + auto uz = p["velocity"]["z"]; - // TODO: Add code the convert to global offsets -#ifndef PMD_MAX_IO_CHUNK // in particles + px.resetDataset(dataset); + pxo.resetDataset(dataset); + + py.resetDataset(dataset); + pyo.resetDataset(dataset); + + pz.resetDataset(dataset); + pzo.resetDataset(dataset); + + ux.resetDataset(dataset); + uy.resetDataset(dataset); + uz.resetDataset(dataset); + // convert data to SoA, allowing the user to chunk the operation + + // TODO: Add code the convert to global offsets +#ifndef PMD_MAX_IO_CHUNK // in particles #define PMD_MAX_IO_CHUNK 16777216; // 512MB total write #endif - const int max_chunk = PMD_MAX_IO_CHUNK; - - // Loop over all particles in chunks - for (int i = 0; i < np; i += max_chunk) - { - // We have to be careful as the last chunk may not be full - // Find how many are left and do that many - size_t to_write = std::min(np-i, max_chunk); - - // Convert the chunk ready to write - std::vector x_pos; - std::vector x_off; - x_pos.resize(to_write); - x_off.resize(to_write); - - std::vector y_pos; - std::vector y_off; - y_pos.resize(to_write); - y_off.resize(to_write); - - std::vector z_pos; - std::vector z_off; - z_pos.resize(to_write); - z_off.resize(to_write); - - std::vector ux_pos; - ux_pos.resize(to_write); - - std::vector uy_pos; - 
uy_pos.resize(to_write); - - std::vector uz_pos; - uz_pos.resize(to_write); - - for (int j = 0; j < to_write; j++) - { - // TODO: do I need to center the particles? - auto& particle = sp->p[i+j]; - - x_pos[j] = particle.dx; - y_pos[j] = particle.dy; - z_pos[j] = particle.dz; - - ux_pos[j] = particle.ux; - uy_pos[j] = particle.uy; - uz_pos[j] = particle.uz; - - std::array gi = global_particle_index(particle.i, grid, rank); - x_off[j] = (float)gi[1]; - y_off[j] = (float)gi[2]; - z_off[j] = (float)gi[3]; - } - - // Base offset plus i to account for chunks - auto o = openPMD::Offset{offset + i}; - auto e = openPMD::Extent{to_write}; - px.storeChunk(x_pos, o, e); - pxo.storeChunk(x_off, o, e); - - py.storeChunk(y_pos, o, e); - pyo.storeChunk(y_off, o, e); - - pz.storeChunk(z_pos, o, e); - pzo.storeChunk(z_off, o, e); - - ux.storeChunk(ux_pos, o, e); - uy.storeChunk(uy_pos, o, e); - uz.storeChunk(uz_pos, o, e); - - series.flush(); - } - } - void dump_hydro( - const char *fbase, - int step, - hydro_array_t* hydro_array, - species_t* sp, - interpolator_array_t* interpolator_array, - grid_t* grid, - int ftag - ) - { - std::string full_file_name = fbase + file_type; - - std::cout << "OpenPMD dumping hydro to " << full_file_name << std::endl; - - //if (series == nullptr) { - openPMD::Series series = openPMD::Series( - full_file_name, - openPMD::AccessType::CREATE, - MPI_COMM_WORLD - ); - //} - - auto i = series.iterations[ step ]; - - // TODO: set these - i.setTime( (float)step ); - i.setDt(1.0); - i.setTimeUnitSI(1.0); - - if( !sp ) ERROR(( "Invalid species \"%s\"", sp->name )); - - // TODO: do we want each backend to have to explicitly call these - // manually? 
Or, as it is common, should we hoist it to the VPIC - // call-site - clear_hydro_array( hydro_array ); - accumulate_hydro_p( hydro_array, sp, interpolator_array ); - synchronize_hydro_array( hydro_array ); - - if( !fbase ) ERROR(( "Invalid filename" )); - - if( rank==0 ) - MESSAGE(("Dumping \"%s\" hydro fields to \"%s\"",sp->name,fbase)); - - // Write data - //float jx, jy, jz, rho; // Current and charge density => , - //float px, py, pz, ke; // Momentum and K.E. density => , - //float txx, tyy, tzz; // Stress diagonal => , i==j - //float tyz, tzx, txy; // Stress off-diagonal => , i!=j - auto J = i.meshes["J"]; - auto P = i.meshes["P"]; - auto T = i.meshes["T"]; - auto _Ke = i.meshes["Ke"]; - auto _Rho = i.meshes["Rho"]; - - auto Jx = J["x"]; - auto Jy = J["y"]; - auto Jz = J["z"]; - - auto Px = P["x"]; - auto Py = P["y"]; - auto Pz = P["z"]; - - auto Txx = T["xx"]; - auto Tyy = T["yy"]; - auto Tzz = T["zz"]; - auto Tyz = T["yz"]; - auto Tzx = T["zx"]; - auto Txy = T["xy"]; - - auto Rho = _Rho["rho"]; // TODO: bad name.. - auto Ke = _Ke["ke"]; // TODO: bad name.. 
- - size_t gnx = (grid->nx * grid->gpx); - size_t gny = (grid->ny * grid->gpy); - size_t gnz = (grid->nz * grid->gpz); - openPMD::Extent global_extent = {gnx, gny, gnz}; - - openPMD::Datatype datatype = openPMD::determineDatatype(); - openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent); - - Jx.resetDataset(dataset); - Jy.resetDataset(dataset); - Jz.resetDataset(dataset); - - Px.resetDataset(dataset); - Py.resetDataset(dataset); - Pz.resetDataset(dataset); - - Txx.resetDataset(dataset); - Tyy.resetDataset(dataset); - Tzz.resetDataset(dataset); - Tyz.resetDataset(dataset); - Tzx.resetDataset(dataset); - Txy.resetDataset(dataset); - - Rho.resetDataset(dataset); - Ke.resetDataset(dataset); - - // TODO: hoist this conversion code, as is it used elsewhere - // Convert rank to local x/y/z - int rx, ry, rz; - UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz); - - size_t nx = grid->nx; - size_t ny = grid->ny; - size_t nz = grid->nz; - - // NOTE: this assumes a static mesh decomposition in nx/ny/nz - size_t global_offset_x = (nx) * rx; - size_t global_offset_y = (ny) * ry; - size_t global_offset_z = (nz) * rz; - - openPMD::Offset chunk_offset = {global_offset_x, global_offset_y, global_offset_z}; - openPMD::Extent chunk_extent = {nx, ny, nz}; - - std::cout << "Local offset " << - " x: " << global_offset_x << - " y: " << global_offset_y << - " z: " << global_offset_z << - std::endl; - - std::vector jx_data; - std::vector jy_data; - std::vector jz_data; - - std::vector px_data; - std::vector py_data; - std::vector pz_data; - - std::vector txx_data; - std::vector tyy_data; - std::vector tzz_data; - std::vector tyz_data; - std::vector tzx_data; - std::vector txy_data; - - std::vector rho_data; - std::vector ke_data; - - size_t nv = nx * ny * nz; - - jx_data.resize(nv); - jy_data.resize(nv); - jz_data.resize(nv); - - px_data.resize(nv); - py_data.resize(nv); - pz_data.resize(nv); - - txx_data.resize(nv); - tyy_data.resize(nv); - tzz_data.resize(nv); - 
tyz_data.resize(nv); - tzx_data.resize(nv); - txy_data.resize(nv); - - rho_data.resize(nv); - ke_data.resize(nv); - - // Transpose AoS to SoAs - for (size_t k = 1; k < grid->nz + 1; k++) - { - for (size_t j = 1; j < grid->ny + 1; j++) - { - for (size_t i = 1; i < grid->nx + 1; i++) - { - int local_index = VOXEL(i-1, j-1, k-1, grid->nx-2, grid->ny-2, grid->nz-2); - int global_index = VOXEL(i, j, k, grid->nx, grid->ny, grid->nz); - - jx_data[local_index] = hydro_array->h[global_index].jx; - jy_data[local_index] = hydro_array->h[global_index].jy; - jz_data[local_index] = hydro_array->h[global_index].jz; - - px_data[local_index] = hydro_array->h[global_index].px; - py_data[local_index] = hydro_array->h[global_index].py; - pz_data[local_index] = hydro_array->h[global_index].pz; - - txx_data[local_index] = hydro_array->h[global_index].txx; - tyy_data[local_index] = hydro_array->h[global_index].tyy; - tzz_data[local_index] = hydro_array->h[global_index].tzz; - tyz_data[local_index] = hydro_array->h[global_index].tyz; - tzx_data[local_index] = hydro_array->h[global_index].tzx; - txy_data[local_index] = hydro_array->h[global_index].txy; - - rho_data[local_index] = hydro_array->h[global_index].rho; - ke_data[local_index] = hydro_array->h[global_index].ke; - } - } - } - - Jx.storeChunk( jx_data, chunk_offset, chunk_extent); - Jy.storeChunk( jy_data, chunk_offset, chunk_extent); - Jz.storeChunk( jz_data, chunk_offset, chunk_extent); - - Px.storeChunk( px_data, chunk_offset, chunk_extent); - Py.storeChunk( py_data, chunk_offset, chunk_extent); - Pz.storeChunk( pz_data, chunk_offset, chunk_extent); - - Txx.storeChunk( txx_data, chunk_offset, chunk_extent); - Tyy.storeChunk( tyy_data, chunk_offset, chunk_extent); - Tzz.storeChunk( tzz_data, chunk_offset, chunk_extent); - Tyz.storeChunk( tyz_data, chunk_offset, chunk_extent); - Tzx.storeChunk( tzx_data, chunk_offset, chunk_extent); - Txy.storeChunk( txy_data, chunk_offset, chunk_extent); - - Rho.storeChunk( rho_data, chunk_offset, 
chunk_extent);
-  Ke.storeChunk( ke_data, chunk_offset, chunk_extent);
-
-  series.flush();
+    const int max_chunk = PMD_MAX_IO_CHUNK;
+
+    // Loop over all particles in chunks
+    for (int i = 0; i < np; i += max_chunk) {
+      // We have to be careful as the last chunk may not be full
+      // Find how many are left and do that many
+      size_t to_write = std::min(np - i, max_chunk);
+
+      // Convert the chunk ready to write
+      std::vector<float> x_pos;
+      std::vector<float> x_off;
+      x_pos.resize(to_write);
+      x_off.resize(to_write);
+
+      std::vector<float> y_pos;
+      std::vector<float> y_off;
+      y_pos.resize(to_write);
+      y_off.resize(to_write);
+
+      std::vector<float> z_pos;
+      std::vector<float> z_off;
+      z_pos.resize(to_write);
+      z_off.resize(to_write);
+
+      std::vector<float> ux_pos;
+      ux_pos.resize(to_write);
+
+      std::vector<float> uy_pos;
+      uy_pos.resize(to_write);
+
+      std::vector<float> uz_pos;
+      uz_pos.resize(to_write);
+
+      for (int j = 0; j < to_write; j++) {
+        // TODO: do I need to center the particles?
+        auto &particle = sp->p[i + j];
+
+        x_pos[j] = particle.dx;
+        y_pos[j] = particle.dy;
+        z_pos[j] = particle.dz;
+
+        ux_pos[j] = particle.ux;
+        uy_pos[j] = particle.uy;
+        uz_pos[j] = particle.uz;
+
+        std::array<int, 4> gi = global_particle_index(particle.i, grid, rank);
+        x_off[j] = (float)gi[1];
+        y_off[j] = (float)gi[2];
+        z_off[j] = (float)gi[3];
+      }
+
+      // Base offset plus i to account for chunks
+      auto o = openPMD::Offset{offset + i};
+      auto e = openPMD::Extent{to_write};
+      px.storeChunk(x_pos, o, e);
+      pxo.storeChunk(x_off, o, e);
+
+      py.storeChunk(y_pos, o, e);
+      pyo.storeChunk(y_off, o, e);
+
+      pz.storeChunk(z_pos, o, e);
+      pzo.storeChunk(z_off, o, e);
+
+      ux.storeChunk(ux_pos, o, e);
+      uy.storeChunk(uy_pos, o, e);
+      uz.storeChunk(uz_pos, o, e);
+
+      series.flush();
+    }
+  }
+  void dump_hydro(const char *fbase, int step, hydro_array_t *hydro_array,
+                  species_t *sp, interpolator_array_t *interpolator_array,
+                  grid_t *grid, int ftag) {
+    std::string full_file_name = fbase + file_type;
+
+    std::cout << "OpenPMD dumping hydro to " << full_file_name <<
std::endl;
+
+    // if (series == nullptr) {
+    openPMD::Series series = openPMD::Series(
+        full_file_name, openPMD::AccessType::CREATE, MPI_COMM_WORLD);
+    //}
+
+    auto i = series.iterations[step];
+
+    // TODO: set these
+    i.setTime((float)step);
+    i.setDt(1.0);
+    i.setTimeUnitSI(1.0);
+
+    if (!sp)
+      ERROR(("Invalid species \"%s\"", sp->name));
+
+    // TODO: do we want each backend to have to explicitly call these
+    // manually? Or, as it is common, should we hoist it to the VPIC
+    // call-site
+    clear_hydro_array(hydro_array);
+    accumulate_hydro_p(hydro_array, sp, interpolator_array);
+    synchronize_hydro_array(hydro_array);
+
+    if (!fbase)
+      ERROR(("Invalid filename"));
+
+    if (rank == 0)
+      MESSAGE(("Dumping \"%s\" hydro fields to \"%s\"", sp->name, fbase));
+
+    // Write data
+    // float jx, jy, jz, rho; // Current and charge density => <q v_i f>, <q f>
+    // float px, py, pz, ke;  // Momentum and K.E. density  => <p_i f>, <m c^2 (gamma-1) f>
+    // float txx, tyy, tzz;   // Stress diagonal            => <p_i v_j f>, i==j
+    // float tyz, tzx, txy;   // Stress off-diagonal        => <p_i v_j f>, i!=j
+    auto J = i.meshes["J"];
+    auto P = i.meshes["P"];
+    auto T = i.meshes["T"];
+    auto _Ke = i.meshes["Ke"];
+    auto _Rho = i.meshes["Rho"];
+
+    auto Jx = J["x"];
+    auto Jy = J["y"];
+    auto Jz = J["z"];
+
+    auto Px = P["x"];
+    auto Py = P["y"];
+    auto Pz = P["z"];
+
+    auto Txx = T["xx"];
+    auto Tyy = T["yy"];
+    auto Tzz = T["zz"];
+    auto Tyz = T["yz"];
+    auto Tzx = T["zx"];
+    auto Txy = T["xy"];
+
+    auto Rho = _Rho["rho"]; // TODO: bad name..
+    auto Ke = _Ke["ke"];    // TODO: bad name..
+
+    size_t gnx = (grid->nx * grid->gpx);
+    size_t gny = (grid->ny * grid->gpy);
+    size_t gnz = (grid->nz * grid->gpz);
+    openPMD::Extent global_extent = {gnx, gny, gnz};
+
+    openPMD::Datatype datatype = openPMD::determineDatatype<float>();
+    openPMD::Dataset dataset = openPMD::Dataset(datatype, global_extent);
+
+    Jx.resetDataset(dataset);
+    Jy.resetDataset(dataset);
+    Jz.resetDataset(dataset);
+
+    Px.resetDataset(dataset);
+    Py.resetDataset(dataset);
+    Pz.resetDataset(dataset);
+
+    Txx.resetDataset(dataset);
+    Tyy.resetDataset(dataset);
+    Tzz.resetDataset(dataset);
+    Tyz.resetDataset(dataset);
+    Tzx.resetDataset(dataset);
+    Txy.resetDataset(dataset);
+
+    Rho.resetDataset(dataset);
+    Ke.resetDataset(dataset);
+
+    // TODO: hoist this conversion code, as is it used elsewhere
+    // Convert rank to local x/y/z
+    int rx, ry, rz;
+    UNVOXEL(rank, rx, ry, rz, grid->gpx, grid->gpy, grid->gpz);
+
+    size_t nx = grid->nx;
+    size_t ny = grid->ny;
+    size_t nz = grid->nz;
+
+    // NOTE: this assumes a static mesh decomposition in nx/ny/nz
+    size_t global_offset_x = (nx)*rx;
+    size_t global_offset_y = (ny)*ry;
+    size_t global_offset_z = (nz)*rz;
+
+    openPMD::Offset chunk_offset = {global_offset_x, global_offset_y,
+                                    global_offset_z};
+    openPMD::Extent chunk_extent = {nx, ny, nz};
+
+    std::cout << "Local offset "
+              << " x: " << global_offset_x << " y: " << global_offset_y
+              << " z: " << global_offset_z << std::endl;
+
+    std::vector<float> jx_data;
+    std::vector<float> jy_data;
+    std::vector<float> jz_data;
+
+    std::vector<float> px_data;
+    std::vector<float> py_data;
+    std::vector<float> pz_data;
+
+    std::vector<float> txx_data;
+    std::vector<float> tyy_data;
+    std::vector<float> tzz_data;
+    std::vector<float> tyz_data;
+    std::vector<float> tzx_data;
+    std::vector<float> txy_data;
+
+    std::vector<float> rho_data;
+    std::vector<float> ke_data;
+
+    size_t nv = nx * ny * nz;
+
+    jx_data.resize(nv);
+    jy_data.resize(nv);
+    jz_data.resize(nv);
+
+    px_data.resize(nv);
+    py_data.resize(nv);
+    pz_data.resize(nv);
+
+    txx_data.resize(nv);
+    tyy_data.resize(nv);
+    tzz_data.resize(nv);
+
tyz_data.resize(nv); + tzx_data.resize(nv); + txy_data.resize(nv); + + rho_data.resize(nv); + ke_data.resize(nv); + + // Transpose AoS to SoAs + for (size_t k = 1; k < grid->nz + 1; k++) { + for (size_t j = 1; j < grid->ny + 1; j++) { + for (size_t i = 1; i < grid->nx + 1; i++) { + int local_index = VOXEL(i - 1, j - 1, k - 1, grid->nx - 2, + grid->ny - 2, grid->nz - 2); + int global_index = VOXEL(i, j, k, grid->nx, grid->ny, grid->nz); + + jx_data[local_index] = hydro_array->h[global_index].jx; + jy_data[local_index] = hydro_array->h[global_index].jy; + jz_data[local_index] = hydro_array->h[global_index].jz; + + px_data[local_index] = hydro_array->h[global_index].px; + py_data[local_index] = hydro_array->h[global_index].py; + pz_data[local_index] = hydro_array->h[global_index].pz; + + txx_data[local_index] = hydro_array->h[global_index].txx; + tyy_data[local_index] = hydro_array->h[global_index].tyy; + tzz_data[local_index] = hydro_array->h[global_index].tzz; + tyz_data[local_index] = hydro_array->h[global_index].tyz; + tzx_data[local_index] = hydro_array->h[global_index].tzx; + txy_data[local_index] = hydro_array->h[global_index].txy; + + rho_data[local_index] = hydro_array->h[global_index].rho; + ke_data[local_index] = hydro_array->h[global_index].ke; } + } + } + + Jx.storeChunk(jx_data, chunk_offset, chunk_extent); + Jy.storeChunk(jy_data, chunk_offset, chunk_extent); + Jz.storeChunk(jz_data, chunk_offset, chunk_extent); + + Px.storeChunk(px_data, chunk_offset, chunk_extent); + Py.storeChunk(py_data, chunk_offset, chunk_extent); + Pz.storeChunk(pz_data, chunk_offset, chunk_extent); + + Txx.storeChunk(txx_data, chunk_offset, chunk_extent); + Tyy.storeChunk(tyy_data, chunk_offset, chunk_extent); + Tzz.storeChunk(tzz_data, chunk_offset, chunk_extent); + Tyz.storeChunk(tyz_data, chunk_offset, chunk_extent); + Tzx.storeChunk(tzx_data, chunk_offset, chunk_extent); + Txy.storeChunk(txy_data, chunk_offset, chunk_extent); + + Rho.storeChunk(rho_data, chunk_offset, 
chunk_extent);
+    Ke.storeChunk(ke_data, chunk_offset, chunk_extent);
+
+    series.flush();
+  }
 };
 
 #endif

From c3a63fc1b5edb6753812c6a6d98fb81f9ccd7121 Mon Sep 17 00:00:00 2001
From: Robert Bird
Date: Tue, 1 Dec 2020 12:03:33 -0700
Subject: [PATCH 95/95] revert change where hdf5 backend stopped tracking
 num_steps

---
 src/vpic/dump.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/vpic/dump.cc b/src/vpic/dump.cc
index 65d15910..21804882 100644
--- a/src/vpic/dump.cc
+++ b/src/vpic/dump.cc
@@ -73,6 +73,7 @@ void vpic_simulation::enable_binary_dump() {
 void vpic_simulation::enable_hdf5_dump() {
   std::cout << "Enabling HDF5 IO backend" << std::endl;
   dump_strategy = std::unique_ptr<Dump_Strategy>(new HDF5Dump( rank(), nproc() ));
+  dump_strategy->num_step = num_step;
 }
 #endif