diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..8dd6c091a --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "simde"] + path = simde + url = https://github.com/simd-everywhere/simde.git diff --git a/CHANGELOG-vectorscan.md b/CHANGELOG-vectorscan.md new file mode 100644 index 000000000..de0a6149e --- /dev/null +++ b/CHANGELOG-vectorscan.md @@ -0,0 +1,66 @@ +# Vectorscan Change Log + +This is a list of notable changes to Vectorscan, in reverse chronological order. For Hyperscan Changelog, check CHANGELOG.md + +## [5.4.11] 2023-11-19 + +- Refactor CMake build system to be much more modular. +- version in hs.h fell out of sync again #175 +- Fix compile failures with recent compilers, namely clang-15 and gcc-13 +- Fix clang 15,16 compilation errors on all platforms, refactor CMake build system #181 +- Fix signed/unsigned char issue on Arm with Ragel generated code. +- Correct set_source_files_properties usage #189 +- Fix build failure on Ubuntu 20.04 +- Support building on Ubuntu 20.04 #180 +- Require pkg-config during Cmake +- make pkgconfig a requirement #188 +- Fix segfault on Fat runtimes with SVE2 code +- Move VERM16 enums to the end of the list #191 +- Update README.md, add CHANGELOG-vectorscan.md and Contributors-vectorscan.md files + +## [5.4.10] 2023-09-23 +- Fix compilation with libcxx 16 by @rschu1ze in #144 +- Fix use-of-uninitialized-value due to getData128() by @azat in #148 +- Use std::vector instead of boost::container::small_vector under MSan by @azat in #149 +- Feature/enable fat runtime arm by @markos in #165 +- adding ifndef around HS_PUBLIC_API definition so that vectorscan can be statically linked into another shared library without exporting symbols by @jeffplaisance in #164 +- Feature/backport hyperscan 2023 q3 by @markos in #169 +- Prepare for 5.4.10 by @markos in #167 + +## [5.4.9] 2023-03-23 +- Major change: Enable SVE & SVE2 builds and make it a supported architecture! (thanks to @abondarev84) +- Fix various clang-related bugs +- Fix Aarch64 bug in Parser.rl because of char signedness. Make unsigned char the default in the Parser for all architectures. +- Fix Power bug, multiple tests were failing. +- C++20 related change, use prefixed assume_aligned to avoid conflict with C++20 std::assume_aligned. + +## [5.4.8] 2022-09-13 +- CMake: Use non-deprecated method for finding python by @jth in #108 +- Optimize vectorscan for aarch64 by using shrn instruction by @danlark1 in #113 +- Fixed the PCRE download location by @pareenaverma in #116 +- Bugfix/hyperscan backport 202208 by @markos in #118 +- VSX optimizations by @markos in #119 +- when compiling with mingw64, use __mingw_aligned_malloc() and __mingw_aligned_free() by @liquidaty in #121 +- [NEON] simplify/optimize shift/align primitives by @markos in #123 +- Merge develop to master by @markos in #124 + +## [5.4.7] 2022-05-05 +- Fix word boundary assertions under C++20 by @BigRedEye in #90 +- Fix all ASAN issues in vectorscan by @danlark1 in #93 +- change FAT_RUNTIME to a normal option so it can be set to off by @a16bitsysop in #94 +- Optimized and correct version of movemask128 for ARM by @danlark1 in #102 + +## [5.4.6] 2022-01-21 +- Major refactoring of many engines to use internal SuperVector C++ templates library. Code size reduced to 1/3rd with no loss of performance in most cases. +- Microbenchmarking tool added for performance finetuning +- Arm Advanced SIMD/NEON fully ported. Initial work on SVE2 for a couple of engines. +- Power9 VSX ppc64le fully ported. 
Initial port needs some optimization. +- Clang compiler support added. +- Apple M1 support added. +- CI added, the following configurations are tested on every PR: + gcc-debug, gcc-release, clang-debug, clang-release: + Linux Intel: SSE4.2, AVX2, AVX512, FAT + Linux Arm + Linux Power9 + clang-debug, clang-release: + MacOS Apple M1 diff --git a/CHANGELOG.md b/CHANGELOG.md index 19a92b909..09b4a95cf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,55 @@ This is a list of notable changes to Hyperscan, in reverse chronological order. +## [5.4.2] 2023-04-19 +- Roll back bugfix for github issue #350: Besides using scratch for + corresponding database, Hyperscan also allows user to use larger scratch + allocated for another database. Users can leverage this property to achieve + safe scratch usage in multi-database scenarios. Behaviors beyond these are + discouraged and results are undefined. +- Fix hsdump issue due to invalid nfa type. + +## [5.4.1] 2023-02-20 +- The Intel Hyperscan team is pleased to provide a bug fix release to our open source library. + Intel also maintains an upgraded version available through your Intel sales representative. +- Bugfix for issue #184: fix random char value of UTF-8. +- Bugfix for issue #291: bypass logical combination flag in hs_expression_info(). +- Bugfix for issue #292: fix build error due to libc symbol parsing. +- Bugfix for issue #302/304: add empty string check for pure literal API. +- Bugfix for issue #303: fix unknown instruction error in pure literal API. +- Bugfix for issue #303: avoid memory leak in stream close stage. +- Bugfix for issue #305: fix assertion failure in DFA construction. +- Bugfix for issue #317: fix aligned allocator segment faults. +- Bugfix for issue #350: add quick validity check for scratch. +- Bugfix for issue #359: fix glibc-2.34 stack size issue. +- Bugfix for issue #360: fix SKIP flag issue in chimera. +- Bugfix for issue #362: fix one cotec check corner issue in UTF-8 validation. +- Fix other compile issues. + +## [5.4.0] 2020-12-31 +- Improvement on literal matcher "Fat Teddy" performance, including + support for Intel(R) AVX-512 Vector Byte Manipulation Instructions (Intel(R) + AVX-512 VBMI). +- Introduce a new 32-state shuffle-based DFA engine ("Sheng32"). This improves + scanning performance by leveraging AVX-512 VBMI. +- Introduce a new 64-state shuffle-based DFA engine ("Sheng64"). This improves + scanning performance by leveraging AVX-512 VBMI. +- Introduce a new shuffle-based hybrid DFA engine ("McSheng64"). This improves + scanning performance by leveraging AVX-512 VBMI. +- Improvement on exceptional state handling performance for LimEx NFA, including + support for AVX-512 VBMI. +- Improvement on lookaround performance with new models, including support for + AVX-512. +- Improvement on DFA state space efficiency. +- Optimization on decision of NFA/DFA generation. +- hsbench: add CSV dump support for hsbench. +- Bugfix for cmake error on Icelake under release mode. +- Bugfix in find_vertices_in_cycles() to avoid self-loop checking in SCC. +- Bugfix for issue #270: fix return value handling in chimera. +- Bugfix for issue #284: use correct free function in logical combination. +- Add BUILD_EXAMPLES cmake option to enable example code compilation. (#260) +- Some typo fixing. 
(#242, #259) + ## [5.3.0] 2020-05-15 - Improvement on literal matcher "Teddy" performance, including support for Intel(R) AVX-512 Vector Byte Manipulation Instructions (Intel(R) AVX-512 diff --git a/CMakeLists.txt b/CMakeLists.txt index 59c6e6e2f..d256e7ed9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,11 +1,17 @@ -cmake_minimum_required (VERSION 2.8.11) -project (hyperscan C CXX) +cmake_minimum_required (VERSION 3.18.4) + +project (vectorscan C CXX) set (HS_MAJOR_VERSION 5) -set (HS_MINOR_VERSION 3) -set (HS_PATCH_VERSION 0) +set (HS_MINOR_VERSION 4) +set (HS_PATCH_VERSION 11) set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION}) +string (TIMESTAMP BUILD_DATE "%Y-%m-%d") +message(STATUS "Build date: ${BUILD_DATE}") + +# Dependencies check + set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) include(CheckCCompilerFlag) include(CheckCXXCompilerFlag) @@ -18,9 +24,25 @@ INCLUDE (CheckSymbolExists) include (CMakeDependentOption) include (GNUInstallDirs) include (${CMAKE_MODULE_PATH}/platform.cmake) +include (${CMAKE_MODULE_PATH}/boost.cmake) include (${CMAKE_MODULE_PATH}/ragel.cmake) -find_package(PkgConfig QUIET) +find_package(PkgConfig REQUIRED) + +find_program(RAGEL ragel) + +if(${RAGEL} STREQUAL "RAGEL-NOTFOUND") + message(FATAL_ERROR "Ragel state machine compiler not found") +endif() + +# Add ccache to speed builds +find_program(CCACHE_FOUND ccache) +if(CCACHE_FOUND) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache) +endif(CCACHE_FOUND) + +# Build type check if (NOT CMAKE_BUILD_TYPE) message(STATUS "Default build type 'Release with debug info'") @@ -30,7 +52,7 @@ else() message(STATUS "Build type ${CMAKE_BUILD_TYPE}") endif() -if(CMAKE_BUILD_TYPE MATCHES RELEASE|RELWITHDEBINFO|MINSIZEREL) +if(CMAKE_BUILD_TYPE MATCHES NONE|RELEASE|RELWITHDEBINFO|MINSIZEREL) message(STATUS "using release build") set(RELEASE_BUILD TRUE) else() @@ -54,411 +76,120 @@ foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} "${LIBDIR}") endforeach (OUTPUTCONFIG CMAKE_CONFIGURATION_TYPES) - -if(CMAKE_GENERATOR STREQUAL Xcode) - set(XCODE TRUE) -endif() - -# older versions of cmake don't know things support isystem -if (XCODE OR CMAKE_CXX_COMPILER_ID MATCHES "Intel") - set(CMAKE_INCLUDE_SYSTEM_FLAG_CXX "-isystem") -endif () - set(CMAKE_INCLUDE_CURRENT_DIR 1) include_directories(${PROJECT_SOURCE_DIR}/src) include_directories(${PROJECT_BINARY_DIR}) include_directories(SYSTEM include) -include (${CMAKE_MODULE_PATH}/boost.cmake) +# Compiler detection -# -- make this work? 
set(python_ADDITIONAL_VERSIONS 2.7 2.6) -find_package(PythonInterp) -find_program(RAGEL ragel) +include (${CMAKE_MODULE_PATH}/compiler.cmake) -if(PYTHONINTERP_FOUND) - set(PYTHON ${PYTHON_EXECUTABLE}) -else() - message(FATAL_ERROR "No python interpreter found") -endif() +# CMake options -# allow for reproducible builds - python for portability -if (DEFINED ENV{SOURCE_DATE_EPOCH}) - execute_process( - COMMAND "${PYTHON}" "${CMAKE_MODULE_PATH}/formatdate.py" "$ENV{SOURCE_DATE_EPOCH}" - OUTPUT_VARIABLE BUILD_DATE - OUTPUT_STRIP_TRAILING_WHITESPACE) -else () - string (TIMESTAMP BUILD_DATE "%Y-%m-%d") -endif () -message(STATUS "Build date: ${BUILD_DATE}") +if (BUILD_STATIC_AND_SHARED) + message(FATAL_ERROR "This option is no longer supported, please set at least one of BUILD_STATIC_LIBS and BUILD_SHARED_LIBS") +endif() +option(BUILD_SHARED_LIBS "Build shared libs" OFF) +option(BUILD_STATIC_LIBS "Build static libs" OFF) -if(${RAGEL} STREQUAL "RAGEL-NOTFOUND") - message(FATAL_ERROR "Ragel state machine compiler not found") +if (BUILD_SHARED_LIBS) + message(STATUS "Building shared libraries") +endif() +if (BUILD_STATIC_LIBS) + message(STATUS "Building static libraries") endif() -option(OPTIMISE "Turns off compiler optimizations (on by default unless debug output enabled or coverage testing)" TRUE) +if (NOT BUILD_STATIC_LIBS AND NOT BUILD_SHARED_LIBS) + # if none are set build static libs + message(STATUS "Neither shared nor static libraries were requested, building static libraries") + set(BUILD_STATIC_LIBS ON) +endif () -option(DEBUG_OUTPUT "Enable debug output (warning: very verbose)" FALSE) +CMAKE_DEPENDENT_OPTION(DUMP_SUPPORT "Dump code support; normally on, except in release builds" ON "NOT RELEASE_BUILD" OFF) +CMAKE_DEPENDENT_OPTION(DISABLE_ASSERTS "Disable assert(); Asserts are enabled in debug builds, disabled in release builds" OFF "NOT RELEASE_BUILD" ON) +option(DEBUG_OUTPUT "Enable debug output (warning: very verbose)" OFF) if(DEBUG_OUTPUT) add_definitions(-DDEBUG) - set(OPTIMISE FALSE) + set(RELEASE_BUILD FALSE) endif(DEBUG_OUTPUT) -option(BUILD_SHARED_LIBS "Build shared libs instead of static" OFF) -option(BUILD_STATIC_AND_SHARED "Build shared libs as well as static" OFF) - -if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) - message(STATUS "Building shared libraries") -else() - message(STATUS "Building static libraries") -endif() - -if (NOT BUILD_SHARED_LIBS) - # build static libs - set(BUILD_STATIC_LIBS ON) - mark_as_advanced(BUILD_STATIC_LIBS) -endif () #for config -if (OPTIMISE) +if (RELEASE_BUILD) set(HS_OPTIMIZE ON) + add_definitions(-DNDEBUG) endif() -CMAKE_DEPENDENT_OPTION(DUMP_SUPPORT "Dump code support; normally on, except in release builds" ON "NOT RELEASE_BUILD" OFF) - -CMAKE_DEPENDENT_OPTION(DISABLE_ASSERTS "Disable assert(); Asserts are enabled in debug builds, disabled in release builds" OFF "NOT RELEASE_BUILD" ON) - -option(BUILD_AVX512 "Experimental: support avx512 in the fat runtime" - OFF) - -option(WINDOWS_ICC "Use Intel C++ Compiler on Windows, default off, requires ICC to be set in project" OFF) - -# TODO: per platform config files? 
- -# TODO: windows generator on cmake always uses msvc, even if we plan to build with icc -if(MSVC OR MSVC_IDE) - message(STATUS "Building for Windows") - - if (MSVC_VERSION LESS 1700) - message(FATAL_ERROR "The project requires C++11 features.") - else() - if (WINDOWS_ICC) - set(ARCH_C_FLAGS "/QxHost") - set(ARCH_CXX_FLAGS "/QxHost") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O3 /Qstd=c99 /Qrestrict /wd4267 /Qdiag-disable:remark") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /Qstd=c++11 /Qrestrict /QxHost /wd4267 /wd4800 /Qdiag-disable:remark -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS") - else() - set(MSVC_WARNS "/wd4101 /wd4146 /wd4172 /wd4200 /wd4244 /wd4267 /wd4307 /wd4334 /wd4805 /wd4996 -D_CRT_SECURE_NO_WARNINGS") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 ${MSVC_WARNS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 ${MSVC_WARNS} /wd4800 -DBOOST_DETAIL_NO_CONTAINER_FWD") - endif() - string(REPLACE "/RTC1" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") - string(REPLACE "/RTC1" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}") - - if (DISABLE_ASSERTS) - set(CMAKE_C_FLAGS_DEBUG "/DNDEBUG ${CMAKE_C_FLAGS_DEBUG}") - set(CMAKE_CXX_FLAGS_DEBUG "/DNDEBUG ${CMAKE_CXX_FLAGS_DEBUG}") - endif () +# Detect OS and if Fat Runtime is available +include (${CMAKE_MODULE_PATH}/osdetection.cmake) + +if(SIMDE_BACKEND) + include (${CMAKE_MODULE_PATH}/simde.cmake) +elseif (ARCH_IA32 OR ARCH_X86_64) + include (${CMAKE_MODULE_PATH}/cflags-x86.cmake) +elseif (ARCH_ARM32 OR ARCH_AARCH64) + include (${CMAKE_MODULE_PATH}/cflags-arm.cmake) +elseif (ARCH_PPC64EL) + include (${CMAKE_MODULE_PATH}/cflags-ppc64le.cmake) +else () + message(FATAL_ERROR "Unsupported platform") +endif () - # flags only used to build hs libs - set(HS_C_FLAGS "/Gv") - set(HS_CXX_FLAGS "/Gv") - endif() +if (ARCH_PPC64EL) + set(ARCH_FLAG mcpu) +else () + set(ARCH_FLAG march) +endif () -else() +# Detect Native arch flags if requested +include (${CMAKE_MODULE_PATH}/archdetect.cmake) - # remove CMake's idea of optimisation - foreach (CONFIG ${CMAKE_BUILD_TYPE} ${CMAKE_CONFIGURATION_TYPES}) - string(REGEX REPLACE "-O[^ ]*" "" CMAKE_C_FLAGS_${CONFIG} "${CMAKE_C_FLAGS_${CONFIG}}") - string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}") - endforeach () - - if (CMAKE_COMPILER_IS_GNUCC) - message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}") - # If gcc doesn't recognise the host cpu, then mtune=native becomes - # generic, which isn't very good in some cases. march=native looks at - # cpuid info and then chooses the best microarch it can (and replaces - # the flag), so use that for tune. 
- - # arg1 might exist if using ccache - string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1) - set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -march=native -mtune=native) - execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS} - OUTPUT_VARIABLE _GCC_OUTPUT) - string(FIND "${_GCC_OUTPUT}" "march" POS) - string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT) - string(REGEX REPLACE "march=[ \t]*([^ \n]*)[ \n].*" "\\1" - GNUCC_ARCH "${_GCC_OUTPUT}") - - # test the parsed flag - set (EXEC_ARGS ${CC_ARG1} -E - -mtune=${GNUCC_ARCH}) - execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS} - OUTPUT_QUIET ERROR_QUIET - INPUT_FILE /dev/null - RESULT_VARIABLE GNUCC_TUNE_TEST) - if (NOT GNUCC_TUNE_TEST EQUAL 0) - message(SEND_ERROR "Something went wrong determining gcc tune: -mtune=${GNUCC_ARCH} not valid") - endif() - set(TUNE_FLAG ${GNUCC_ARCH}) - else () - set(TUNE_FLAG native) - endif() +# Configure Compiler flags (Generic) - # compiler version checks TODO: test more compilers - if (CMAKE_COMPILER_IS_GNUCXX) - set(GNUCXX_MINVER "4.8.1") - message(STATUS "g++ version ${CMAKE_CXX_COMPILER_VERSION}") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS GNUCXX_MINVER) - message(FATAL_ERROR "A minimum of g++ ${GNUCXX_MINVER} is required for C++11 support") - endif() - endif() +include (${CMAKE_MODULE_PATH}/sanitize.cmake) - if(OPTIMISE) - if (NOT CMAKE_BUILD_TYPE MATCHES MINSIZEREL) - set(OPT_C_FLAG "-O3") - set(OPT_CXX_FLAG "-O2") - else () - set(OPT_C_FLAG "-Os") - set(OPT_CXX_FLAG "-Os") - endif () +if (NOT FAT_RUNTIME) + if (GNUCC_TUNE) + set(ARCH_C_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -${TUNE_FLAG}=${GNUCC_TUNE}") + set(ARCH_CXX_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -${TUNE_FLAG}=${GNUCC_TUNE}") else() - set(OPT_C_FLAG "-O0") - set(OPT_CXX_FLAG "-O0") - endif(OPTIMISE) - - # set compiler flags - more are tested and added later - set(EXTRA_C_FLAGS "${OPT_C_FLAG} -std=c99 -Wall -Wextra -Wshadow -Wcast-qual -fno-strict-aliasing") - set(EXTRA_CXX_FLAGS "${OPT_CXX_FLAG} -std=c++11 -Wall -Wextra -Wshadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor -fno-strict-aliasing") - - if (NOT RELEASE_BUILD) - # -Werror is most useful during development, don't potentially break - # release builds - set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Werror") - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Werror") - endif() - - if (DISABLE_ASSERTS) - set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -DNDEBUG") - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -DNDEBUG") - endif() - - if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) - set(ARCH_C_FLAGS "-march=native -mtune=${TUNE_FLAG}") - endif() - - if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) - set(ARCH_CXX_FLAGS "-march=native -mtune=${TUNE_FLAG}") - endif() - - if(CMAKE_COMPILER_IS_GNUCC) - # spurious warnings? 
- set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-array-bounds -Wno-maybe-uninitialized") - endif() - - if(CMAKE_COMPILER_IS_GNUCXX) - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-maybe-uninitialized") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fabi-version=0") - endif () - # don't complain about abi - set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-abi") - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-abi") + set(ARCH_C_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -mtune=${TUNE_FLAG} ${ARCH_C_FLAGS}") + set(ARCH_CXX_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -mtune=${TUNE_FLAG} ${ARCH_CXX_FLAGS}") endif() +endif() - if (NOT(ARCH_IA32 AND RELEASE_BUILD)) - set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fno-omit-frame-pointer") - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-omit-frame-pointer") - endif() +# remove CMake's idea of optimisation +foreach (CONFIG ${CMAKE_BUILD_TYPE} ${CMAKE_CONFIGURATION_TYPES}) + string(REGEX REPLACE "-O[^ ]*" "" CMAKE_C_FLAGS_${CONFIG} "${CMAKE_C_FLAGS_${CONFIG}}") + string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}") +endforeach () +message(STATUS "ARCH_C_FLAGS : ${ARCH_C_FLAGS}") +message(STATUS "ARCH_CXX_FLAGS : ${ARCH_CXX_FLAGS}") - if (CMAKE_C_COMPILER_ID MATCHES "Intel") - set(SKYLAKE_FLAG "-xCORE-AVX512") +if(RELEASE_BUILD) + if (NOT CMAKE_BUILD_TYPE MATCHES MINSIZEREL) + set(OPT_C_FLAG "-O3") + set(OPT_CXX_FLAG "-O3") else () - set(SKYLAKE_FLAG "-march=skylake-avx512") + set(OPT_C_FLAG "-Os") + set(OPT_CXX_FLAG "-Os") endif () -endif() - -CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H) -CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H) -CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H) -CHECK_INCLUDE_FILES(x86intrin.h HAVE_C_X86INTRIN_H) -CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H) - -CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN) -CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC) - -# these end up in the config file -CHECK_C_COMPILER_FLAG(-fvisibility=hidden HAS_C_HIDDEN) -CHECK_CXX_COMPILER_FLAG(-fvisibility=hidden HAS_CXX_HIDDEN) - -# are we using libc++ -CHECK_CXX_SYMBOL_EXISTS(_LIBCPP_VERSION ciso646 HAVE_LIBCPP) - -if (RELEASE_BUILD) - if (HAS_C_HIDDEN) - set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fvisibility=hidden") - endif() - if (HAS_CXX_HIDDEN) - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fvisibility=hidden") - endif() -endif() - -if (CMAKE_SYSTEM_NAME MATCHES "Linux") - # This is a Linux-only feature for now - requires platform support - # elsewhere - message(STATUS "generator is ${CMAKE_GENERATOR}") - if (CMAKE_C_COMPILER_ID MATCHES "Clang" AND - CMAKE_C_COMPILER_VERSION VERSION_LESS "3.9") - message (STATUS "Clang v3.9 or higher required for fat runtime, cannot build fat runtime") - set (FAT_RUNTIME_REQUISITES FALSE) - elseif (NOT (CMAKE_GENERATOR MATCHES "Unix Makefiles" OR - (CMAKE_VERSION VERSION_GREATER "3.0" AND CMAKE_GENERATOR MATCHES "Ninja"))) - message (STATUS "Building the fat runtime requires the Unix Makefiles generator, or Ninja with CMake v3.0 or higher") - set (FAT_RUNTIME_REQUISITES FALSE) - else() - include (${CMAKE_MODULE_PATH}/attrib.cmake) - if (NOT HAS_C_ATTR_IFUNC) - message(STATUS "Compiler does not support ifunc attribute, cannot build fat runtime") - set (FAT_RUNTIME_REQUISITES FALSE) - else () - set (FAT_RUNTIME_REQUISITES TRUE) - endif() - endif() - CMAKE_DEPENDENT_OPTION(FAT_RUNTIME "Build a library that supports multiple microarchitectures" ${RELEASE_BUILD} "FAT_RUNTIME_REQUISITES" OFF) -endif () - -include (${CMAKE_MODULE_PATH}/arch.cmake) - 
-# testing a builtin takes a little more work -CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED) -CHECK_CXX_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED) -CHECK_C_SOURCE_COMPILES("int main(void) { __builtin_constant_p(0); }" HAVE__BUILTIN_CONSTANT_P) - -if (NOT WIN32) -set(C_FLAGS_TO_CHECK -# Variable length arrays are way bad, most especially at run time -"-Wvla" -# Pointer arith on void pointers is doing it wrong. - "-Wpointer-arith" -# Build our C code with -Wstrict-prototypes -Wmissing-prototypes - "-Wstrict-prototypes" - "-Wmissing-prototypes" -) -foreach (FLAG ${C_FLAGS_TO_CHECK}) - # munge the name so it doesn't break things - string(REPLACE "-" "_" FNAME C_FLAG${FLAG}) - CHECK_C_COMPILER_FLAG("${FLAG}" ${FNAME}) - if (${FNAME}) - set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} ${FLAG}") - endif() -endforeach() - -set(CXX_FLAGS_TO_CHECK -"-Wvla" -"-Wpointer-arith" -) -foreach (FLAG ${CXX_FLAGS_TO_CHECK}) - string(REPLACE "-" "_" FNAME CXX_FLAG${FLAG}) - CHECK_CXX_COMPILER_FLAG("${FLAG}" ${FNAME}) - if (${FNAME}) - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} ${FLAG}") - endif() -endforeach() - -# self-assign should be thrown away, but clang whinges -CHECK_C_COMPILER_FLAG("-Wself-assign" CC_SELF_ASSIGN) -if (CC_SELF_ASSIGN) - set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-self-assign") -endif() -CHECK_CXX_COMPILER_FLAG("-Wself-assign" CXX_SELF_ASSIGN) -if (CXX_SELF_ASSIGN) - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-self-assign") -endif() - -# clang gets up in our face for going paren crazy with macros -CHECK_C_COMPILER_FLAG("-Wparentheses-equality" CC_PAREN_EQUALITY) -if (CC_PAREN_EQUALITY) - set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-parentheses-equality") -endif() - -# clang complains about unused const vars in our Ragel-generated code. 
-CHECK_CXX_COMPILER_FLAG("-Wunused-const-variable" CXX_UNUSED_CONST_VAR) -if (CXX_UNUSED_CONST_VAR) - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-unused-const-variable") -endif() - -# gcc 6 complains about type attributes that get ignored, like alignment -CHECK_CXX_COMPILER_FLAG("-Wignored-attributes" CXX_IGNORED_ATTR) -if (CXX_IGNORED_ATTR) - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-ignored-attributes") -endif() - -# gcc 9 complains about redundant move for returned variable -CHECK_CXX_COMPILER_FLAG("-Wredundant-move" CXX_REDUNDANT_MOVE) -if (CXX_REDUNDANT_MOVE) - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-redundant-move") -endif() - -# note this for later -# g++ doesn't have this flag but clang does -CHECK_CXX_COMPILER_FLAG("-Wweak-vtables" CXX_WEAK_VTABLES) -if (CXX_WEAK_VTABLES) - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wweak-vtables") -endif() - -CHECK_CXX_COMPILER_FLAG("-Wmissing-declarations" CXX_MISSING_DECLARATIONS) -if (CXX_MISSING_DECLARATIONS) - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wmissing-declarations") -endif() - -CHECK_CXX_COMPILER_FLAG("-Wunused-local-typedefs" CXX_UNUSED_LOCAL_TYPEDEFS) - -# gcc5 complains about this -CHECK_CXX_COMPILER_FLAG("-Wunused-variable" CXX_WUNUSED_VARIABLE) - -# gcc 10 complains about this -CHECK_C_COMPILER_FLAG("-Wstringop-overflow" CC_STRINGOP_OVERFLOW) -if(CC_STRINGOP_OVERFLOW) - set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-stringop-overflow") -endif() +else() + set(OPT_C_FLAG "-O0") + set(OPT_CXX_FLAG "-O0") +endif(RELEASE_BUILD) -endif() +include (${CMAKE_MODULE_PATH}/cflags-generic.cmake) include_directories(SYSTEM ${Boost_INCLUDE_DIRS}) -if(CMAKE_SYSTEM_NAME MATCHES "Linux") - set(LINUX TRUE) -endif(CMAKE_SYSTEM_NAME MATCHES "Linux") - -if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") - set(FREEBSD true) -endif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") - -if(NOT WIN32) -if(CMAKE_C_COMPILER_ID MATCHES "Intel") - SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -diag-error 10006 -diag-disable 68 -diag-disable 177 -diag-disable 186 -diag-disable 2304 -diag-disable 2305 -diag-disable 2338 -diag-disable 1418 -diag-disable 279 -diag-disable=remark") -endif() -if(CMAKE_CXX_COMPILER_ID MATCHES "Intel") - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -diag-error 10006 -diag-disable 68 -diag-disable 177 -diag-disable 186 -diag-disable 2304 -diag-disable 2305 -diag-disable 2338 -diag-disable 1418 -diag-disable 1170 -diag-disable 3373 -diag-disable 279 -diag-disable=remark") -endif() -endif() - -if (NOT FAT_RUNTIME) - message(STATUS "Building for current host CPU: ${ARCH_C_FLAGS}") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARCH_C_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_CXX_FLAGS}") -else() - message(STATUS "Building runtime for multiple microarchitectures") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") -endif() +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARCH_C_FLAGS}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_CXX_FLAGS}") -add_subdirectory(util) -add_subdirectory(doc/dev-reference) - -if (NOT WIN32) # PCRE check, we have a fixed requirement for PCRE to use Chimera # and hscollider set(PCRE_REQUIRED_MAJOR_VERSION 8) @@ -474,32 +205,29 @@ if (CORRECT_PCRE_VERSION AND PCRE_BUILD_SOURCE AND BUILD_STATIC_LIBS) set(BUILD_CHIMERA TRUE) endif() -add_subdirectory(unit) -if (EXISTS ${CMAKE_SOURCE_DIR}/tools/CMakeLists.txt) - add_subdirectory(tools) -endif() -if (EXISTS ${CMAKE_SOURCE_DIR}/chimera/CMakeLists.txt AND BUILD_CHIMERA) - add_subdirectory(chimera) -endif() -endif() +set(RAGEL_C_FLAGS "-Wno-unused -funsigned-char") + 
+set_source_files_properties( + src/parser/Parser.cpp + PROPERTIES + COMPILE_FLAGS "${RAGEL_C_FLAGS}") + +ragelmaker(src/parser/Parser.rl) + +set_source_files_properties( + src/parser/control_verbs.cpp + PROPERTIES + COMPILE_FLAGS "${RAGEL_C_FLAGS}") + +ragelmaker(src/parser/control_verbs.rl) # do substitutions configure_file(${CMAKE_MODULE_PATH}/config.h.in ${PROJECT_BINARY_DIR}/config.h) configure_file(src/hs_version.h.in ${PROJECT_BINARY_DIR}/hs_version.h) -if (NOT WIN32) - # expand out library names for pkgconfig static link info - foreach (LIB ${CMAKE_CXX_IMPLICIT_LINK_LIBRARIES}) - # this is fragile, but protects us from toolchain specific files - if (NOT EXISTS ${LIB}) - set(PRIVATE_LIBS "${PRIVATE_LIBS} -l${LIB}") - endif() - endforeach() - - configure_file(libhs.pc.in libhs.pc @ONLY) # only replace @ quoted vars - install(FILES ${CMAKE_BINARY_DIR}/libhs.pc - DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig") -endif() +configure_file(libhs.pc.in libhs.pc @ONLY) # only replace @ quoted vars +install(FILES ${CMAKE_BINARY_DIR}/libhs.pc + DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig") # only set these after all tests are done if (NOT FAT_RUNTIME) @@ -510,50 +238,8 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") endif() -if (WIN32) -# PCRE check, we have a fixed requirement for PCRE to use Chimera -# and hscollider -set(PCRE_REQUIRED_MAJOR_VERSION 8) -set(PCRE_REQUIRED_MINOR_VERSION 41) -set(PCRE_REQUIRED_VERSION ${PCRE_REQUIRED_MAJOR_VERSION}.${PCRE_REQUIRED_MINOR_VERSION}) -include (${CMAKE_MODULE_PATH}/pcre.cmake) -if (NOT CORRECT_PCRE_VERSION) - message(STATUS "PCRE ${PCRE_REQUIRED_VERSION} or above not found") -endif() - -# we need static libs for Chimera - too much deep magic for shared libs -if (CORRECT_PCRE_VERSION AND PCRE_BUILD_SOURCE AND BUILD_STATIC_LIBS) - set(BUILD_CHIMERA TRUE) -endif() - -add_subdirectory(unit) -if (EXISTS ${CMAKE_SOURCE_DIR}/tools/CMakeLists.txt) - add_subdirectory(tools) -endif() -if (EXISTS ${CMAKE_SOURCE_DIR}/chimera/CMakeLists.txt AND BUILD_CHIMERA) - add_subdirectory(chimera) -endif() -endif() - -if(NOT WIN32) -set(RAGEL_C_FLAGS "-Wno-unused") -endif() - -set_source_files_properties( - ${CMAKE_BINARY_DIR}/src/parser/Parser.cpp - PROPERTIES - COMPILE_FLAGS "${RAGEL_C_FLAGS}") - -ragelmaker(src/parser/Parser.rl) - -set_source_files_properties( - ${CMAKE_BINARY_DIR}/src/parser/control_verbs.cpp - PROPERTIES - COMPILE_FLAGS "${RAGEL_C_FLAGS}") - -ragelmaker(src/parser/control_verbs.rl) - SET(hs_HEADERS + ${PROJECT_BINARY_DIR}/hs_version.h src/hs.h src/hs_common.h src/hs_compile.h @@ -564,14 +250,32 @@ install(FILES ${hs_HEADERS} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/hs") set (hs_exec_common_SRCS src/alloc.c src/scratch.c - src/util/cpuid_flags.c - src/util/cpuid_flags.h + src/util/arch/common/cpuid_flags.h src/util/multibit.c ) +if (SIMDE_BACKEND) +set (hs_exec_common_SRCS + ${hs_exec_common_SRCS} + src/util/arch/simde/cpuid_flags.c) +elseif (ARCH_IA32 OR ARCH_X86_64) +set (hs_exec_common_SRCS + ${hs_exec_common_SRCS} + src/util/arch/x86/cpuid_flags.c + ) +elseif (ARCH_ARM32 OR ARCH_AARCH64) +set (hs_exec_common_SRCS + ${hs_exec_common_SRCS} + src/util/arch/arm/cpuid_flags.c + ) +elseif (ARCH_PPC64EL) +set (hs_exec_common_SRCS + ${hs_exec_common_SRCS} + src/util/arch/ppc64el/cpuid_flags.c) +endif () set (hs_exec_SRCS ${hs_HEADERS} - src/hs_version.h + src/hs_version.h.in src/ue2common.h src/allocator.h src/crc32.c @@ -595,7 +299,7 @@ set (hs_exec_SRCS src/hwlm/hwlm.c 
src/hwlm/hwlm.h src/hwlm/hwlm_internal.h - src/hwlm/noodle_engine.c + src/hwlm/noodle_engine.cpp src/hwlm/noodle_engine.h src/hwlm/noodle_internal.h src/nfa/accel.c @@ -651,16 +355,15 @@ set (hs_exec_SRCS src/nfa/sheng_impl.h src/nfa/sheng_impl4.h src/nfa/sheng_internal.h - src/nfa/shufti.c + src/nfa/shufti.cpp src/nfa/shufti.h src/nfa/tamarama.c src/nfa/tamarama.h src/nfa/tamarama_internal.h - src/nfa/truffle.c + src/nfa/truffle.cpp src/nfa/truffle.h - src/nfa/vermicelli.h + src/nfa/vermicelli.hpp src/nfa/vermicelli_run.h - src/nfa/vermicelli_sse.h src/som/som.h src/som/som_operation.h src/som/som_runtime.h @@ -694,7 +397,6 @@ set (hs_exec_SRCS src/util/exhaust.h src/util/fatbit.h src/util/join.h - src/util/masked_move.h src/util/multibit.h src/util/multibit.c src/util/multibit_compress.h @@ -705,7 +407,6 @@ set (hs_exec_SRCS src/util/scatter.h src/util/scatter_runtime.h src/util/simd_utils.h - src/util/simd_utils.c src/util/state_compress.h src/util/state_compress.c src/util/unaligned.h @@ -714,11 +415,41 @@ set (hs_exec_SRCS src/database.h ) -set (hs_exec_avx2_SRCS - src/fdr/teddy_avx2.c - src/util/masked_move.c -) +if (SIMDE_BACKEND) +set (hs_exec_SRCS + ${hs_exec_SRCS} + src/nfa/vermicelli_simd.cpp + src/util/supervector/arch/x86/impl.cpp) +elseif (ARCH_IA32 OR ARCH_X86_64) +set (hs_exec_SRCS + ${hs_exec_SRCS} + src/nfa/vermicelli_simd.cpp + src/util/supervector/arch/x86/impl.cpp) +elseif (ARCH_ARM32 OR ARCH_AARCH64) +set (hs_exec_SRCS + ${hs_exec_SRCS} + src/util/supervector/arch/arm/impl.cpp) +elseif (ARCH_PPC64EL) +set (hs_exec_SRCS + ${hs_exec_SRCS} + src/nfa/vermicelli_simd.cpp + src/util/supervector/arch/ppc64el/impl.cpp) +endif() +if (ARCH_IA32 OR ARCH_X86_64) + set (hs_exec_avx2_SRCS + src/fdr/teddy_avx2.c + src/util/arch/x86/masked_move.c + src/util/arch/x86/masked_move.h + ) +endif() + +if (ARCH_ARM32 OR ARCH_AARCH64) + set (hs_exec_neon_SRCS + src/nfa/vermicelli_simd.cpp) + set (hs_exec_sve_SRCS + src/nfa/vermicelli_simd.cpp) +endif() SET (hs_compile_SRCS ${hs_HEADERS} @@ -728,7 +459,7 @@ SET (hs_compile_SRCS src/grey.h src/hs.cpp src/hs_internal.h - src/hs_version.h + src/hs_version.h.in src/scratch.h src/state.h src/ue2common.h @@ -821,6 +552,8 @@ SET (hs_compile_SRCS src/nfa/tamaramacompile.h src/nfa/trufflecompile.cpp src/nfa/trufflecompile.h + src/nfa/vermicellicompile.cpp + src/nfa/vermicellicompile.h src/nfagraph/ng.cpp src/nfagraph/ng.h src/nfagraph/ng_anchored_acyclic.cpp @@ -1162,11 +895,20 @@ set (LIB_VERSION ${HS_VERSION}) set (LIB_SOVERSION ${HS_MAJOR_VERSION}) if (NOT FAT_RUNTIME) - set(hs_exec_SRCS ${hs_exec_SRCS} ${hs_exec_common_SRCS}) - if (HAVE_AVX2) - set(hs_exec_SRCS ${hs_exec_SRCS} ${hs_exec_avx2_SRCS}) + if (ARCH_IA32 OR ARCH_X86_64) + if (BUILD_AVX2) + set(hs_exec_SRCS ${hs_exec_SRCS} ${hs_exec_avx2_SRCS}) + endif() + elseif (ARCH_AARCH64) + if (BUILD_SVE2) + set(hs_exec_SRCS ${hs_exec_SRCS} ${hs_exec_sve2_SRCS}) + elseif (BUILD_SVE) + set(hs_exec_SRCS ${hs_exec_SRCS} ${hs_exec_sve_SRCS}) + else() + set(hs_exec_SRCS ${hs_exec_SRCS} ${hs_exec_neon_SRCS}) + endif() endif() if (BUILD_STATIC_LIBS) @@ -1176,7 +918,6 @@ if (NOT FAT_RUNTIME) set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C) add_library(hs_compile OBJECT ${hs_compile_SRCS}) - add_library(hs STATIC src/hs_version.c src/hs_valid_platform.c @@ -1184,126 +925,227 @@ if (NOT FAT_RUNTIME) $) endif (BUILD_STATIC_LIBS) - if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) + if (BUILD_SHARED_LIBS) add_library(hs_exec_shared OBJECT ${hs_exec_SRCS}) set_target_properties(hs_exec_shared 
PROPERTIES POSITION_INDEPENDENT_CODE TRUE) add_library(hs_compile_shared OBJECT ${hs_compile_SRCS}) set_target_properties(hs_compile_shared PROPERTIES POSITION_INDEPENDENT_CODE TRUE) endif() +else () + if (ARCH_IA32 OR ARCH_X86_64) + set(BUILD_WRAPPER "${PROJECT_SOURCE_DIR}/cmake/build_wrapper.sh") + if (NOT BUILD_AVX512) + set (DISPATCHER_DEFINE "-DDISABLE_AVX512_DISPATCH") + endif (NOT BUILD_AVX512) + if (NOT BUILD_AVX512VBMI) + set (DISPATCHER_DEFINE "${DISPATCHER_DEFINE} -DDISABLE_AVX512VBMI_DISPATCH") + endif (NOT BUILD_AVX512VBMI) + set_source_files_properties(src/dispatcher.c PROPERTIES + COMPILE_FLAGS "-Wno-unused-parameter -Wno-unused-function ${DISPATCHER_DEFINE}") + + if (BUILD_STATIC_LIBS) + add_library(hs_exec_core2 OBJECT ${hs_exec_SRCS}) + list(APPEND RUNTIME_LIBS $) + set_target_properties(hs_exec_core2 PROPERTIES + COMPILE_FLAGS "-march=core2 -msse4.2" + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in" + ) -else (FAT_RUNTIME) + add_library(hs_exec_corei7 OBJECT ${hs_exec_SRCS}) + list(APPEND RUNTIME_LIBS $) + set_target_properties(hs_exec_corei7 PROPERTIES + COMPILE_FLAGS "-march=corei7 -msse4.2" + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} corei7 ${CMAKE_MODULE_PATH}/keep.syms.in" + ) - set(BUILD_WRAPPER "${PROJECT_SOURCE_DIR}/cmake/build_wrapper.sh") - if (NOT BUILD_AVX512) - set (DISPATCHER_DEFINE "-DDISABLE_AVX512_DISPATCH") - endif (NOT BUILD_AVX512) - set_source_files_properties(src/dispatcher.c PROPERTIES - COMPILE_FLAGS "-Wno-unused-parameter -Wno-unused-function ${DISPATCHER_DEFINE}") + if (BUILD_AVX2) + add_library(hs_exec_avx2 OBJECT ${hs_exec_SRCS} ${hs_exec_avx2_SRCS}) + list(APPEND RUNTIME_LIBS $) + set_target_properties(hs_exec_avx2 PROPERTIES + COMPILE_FLAGS "-march=core-avx2 -mavx2" + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx2 ${CMAKE_MODULE_PATH}/keep.syms.in" + ) + endif (BUILD_AVX2) + if (BUILD_AVX512) + add_library(hs_exec_avx512 OBJECT ${hs_exec_SRCS} ${hs_exec_avx2_SRCS}) + list(APPEND RUNTIME_LIBS $) + set_target_properties(hs_exec_avx512 PROPERTIES + COMPILE_FLAGS "${SKYLAKE_FLAG}" + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx512 ${CMAKE_MODULE_PATH}/keep.syms.in" + ) + endif (BUILD_AVX512) + if (BUILD_AVX512VBMI) + add_library(hs_exec_avx512vbmi OBJECT ${hs_exec_SRCS} ${hs_exec_avx2_SRCS}) + list(APPEND RUNTIME_LIBS $) + set_target_properties(hs_exec_avx512vbmi PROPERTIES + COMPILE_FLAGS "${ICELAKE_FLAG}" + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx512vbmi ${CMAKE_MODULE_PATH}/keep.syms.in" + ) + endif (BUILD_AVX512VBMI) + + add_library(hs_exec_common OBJECT + ${hs_exec_common_SRCS} + src/dispatcher.c + ) - if (BUILD_STATIC_LIBS) - add_library(hs_exec_core2 OBJECT ${hs_exec_SRCS}) - list(APPEND RUNTIME_LIBS $) - set_target_properties(hs_exec_core2 PROPERTIES - COMPILE_FLAGS "-march=core2" - RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in" - ) - - add_library(hs_exec_corei7 OBJECT ${hs_exec_SRCS}) - list(APPEND RUNTIME_LIBS $) - set_target_properties(hs_exec_corei7 PROPERTIES - COMPILE_FLAGS "-march=corei7" - RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} corei7 ${CMAKE_MODULE_PATH}/keep.syms.in" - ) - - add_library(hs_exec_avx2 OBJECT ${hs_exec_SRCS} ${hs_exec_avx2_SRCS}) - list(APPEND RUNTIME_LIBS $) - set_target_properties(hs_exec_avx2 PROPERTIES - COMPILE_FLAGS "-march=core-avx2" - RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx2 ${CMAKE_MODULE_PATH}/keep.syms.in" - ) - if (BUILD_AVX512) - add_library(hs_exec_avx512 OBJECT ${hs_exec_SRCS} ${hs_exec_avx2_SRCS}) - list(APPEND RUNTIME_LIBS $) - 
set_target_properties(hs_exec_avx512 PROPERTIES - COMPILE_FLAGS "${SKYLAKE_FLAG}" - RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx512 ${CMAKE_MODULE_PATH}/keep.syms.in" - ) - endif (BUILD_AVX512) - - add_library(hs_exec_common OBJECT - ${hs_exec_common_SRCS} - src/dispatcher.c - ) - - # hs_version.c is added explicitly to avoid some build systems that refuse to - # create a lib without any src (I'm looking at you Xcode) - - add_library(hs_runtime STATIC src/hs_version.c - $ - ${RUNTIME_LIBS}) - set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C) - add_library(hs_compile OBJECT ${hs_compile_SRCS}) + # hs_version.c is added explicitly to avoid some build systems that refuse to + # create a lib without any src (I'm looking at you Xcode) + + add_library(hs_runtime STATIC src/hs_version.c + $ + ${RUNTIME_LIBS}) + set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C) + add_library(hs_compile OBJECT ${hs_compile_SRCS}) + + # we want the static lib for testing + add_library(hs STATIC src/hs_version.c src/hs_valid_platform.c + $ + $ + ${RUNTIME_LIBS}) + endif (BUILD_STATIC_LIBS) + + if (BUILD_SHARED_LIBS) + # build shared libs + add_library(hs_compile_shared OBJECT ${hs_compile_SRCS}) + set_target_properties(hs_compile_shared PROPERTIES POSITION_INDEPENDENT_CODE TRUE) + add_library(hs_exec_shared_core2 OBJECT ${hs_exec_SRCS}) + list(APPEND RUNTIME_SHLIBS $) + set_target_properties(hs_exec_shared_core2 PROPERTIES + COMPILE_FLAGS "-march=core2 -msse4.2" + POSITION_INDEPENDENT_CODE TRUE + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in" + ) + add_library(hs_exec_shared_corei7 OBJECT ${hs_exec_SRCS}) + list(APPEND RUNTIME_SHLIBS $) + set_target_properties(hs_exec_shared_corei7 PROPERTIES + COMPILE_FLAGS "-march=corei7 -msse4.2" + POSITION_INDEPENDENT_CODE TRUE + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} corei7 ${CMAKE_MODULE_PATH}/keep.syms.in" + ) - # we want the static lib for testing - add_library(hs STATIC src/hs_version.c src/hs_valid_platform.c - $ - $ - ${RUNTIME_LIBS}) + if (BUILD_AVX2) + add_library(hs_exec_shared_avx2 OBJECT ${hs_exec_SRCS} ${hs_exec_avx2_SRCS}) + list(APPEND RUNTIME_SHLIBS $) + set_target_properties(hs_exec_shared_avx2 PROPERTIES + COMPILE_FLAGS "-march=core-avx2 -mavx2" + POSITION_INDEPENDENT_CODE TRUE + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx2 ${CMAKE_MODULE_PATH}/keep.syms.in" + ) + endif (BUILD_AVX2) + if (BUILD_AVX512) + add_library(hs_exec_shared_avx512 OBJECT ${hs_exec_SRCS} ${hs_exec_avx2_SRCS}) + list(APPEND RUNTIME_SHLIBS $) + set_target_properties(hs_exec_shared_avx512 PROPERTIES + COMPILE_FLAGS "${SKYLAKE_FLAG}" + POSITION_INDEPENDENT_CODE TRUE + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx512 ${CMAKE_MODULE_PATH}/keep.syms.in" + ) + endif (BUILD_AVX512) + if (BUILD_AVX512VBMI) + add_library(hs_exec_shared_avx512vbmi OBJECT ${hs_exec_SRCS} ${hs_exec_avx2_SRCS}) + list(APPEND RUNTIME_SHLIBS $) + set_target_properties(hs_exec_shared_avx512vbmi PROPERTIES + COMPILE_FLAGS "${ICELAKE_FLAG}" + POSITION_INDEPENDENT_CODE TRUE + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx512vbmi ${CMAKE_MODULE_PATH}/keep.syms.in" + ) + endif (BUILD_AVX512VBMI) + add_library(hs_exec_common_shared OBJECT + ${hs_exec_common_SRCS} + src/dispatcher.c + ) + set_target_properties(hs_exec_common_shared PROPERTIES + OUTPUT_NAME hs_exec_common + POSITION_INDEPENDENT_CODE TRUE) + + endif() # SHARED + endif (ARCH_IA32 OR ARCH_X86_64) + if (ARCH_AARCH64) + set(BUILD_WRAPPER "${PROJECT_SOURCE_DIR}/cmake/build_wrapper.sh") + if (BUILD_STATIC_LIBS) + 
add_library(hs_exec_neon OBJECT ${hs_exec_SRCS} ${hs_exec_neon_SRCS}) + list(APPEND RUNTIME_LIBS $) + set_target_properties(hs_exec_neon PROPERTIES + COMPILE_FLAGS "-march=${ARMV8_ARCH}" + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} neon ${CMAKE_MODULE_PATH}/keep.syms.in" + ) - endif (BUILD_STATIC_LIBS) + add_library(hs_exec_sve OBJECT ${hs_exec_SRCS} ${hs_exec_sve_SRCS}) + list(APPEND RUNTIME_LIBS $) + set_target_properties(hs_exec_sve PROPERTIES + COMPILE_FLAGS "-march=${SVE_ARCH}" + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} sve ${CMAKE_MODULE_PATH}/keep.syms.in" + ) + add_library(hs_exec_sve2 OBJECT ${hs_exec_SRCS} ${hs_exec_sve2_SRCS}) + list(APPEND RUNTIME_LIBS $) + set_target_properties(hs_exec_sve2 PROPERTIES + COMPILE_FLAGS "-march=${SVE2_BITPERM_ARCH}" + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} sve2 ${CMAKE_MODULE_PATH}/keep.syms.in" + ) - if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) - # build shared libs - add_library(hs_compile_shared OBJECT ${hs_compile_SRCS}) - set_target_properties(hs_compile_shared PROPERTIES POSITION_INDEPENDENT_CODE TRUE) - add_library(hs_exec_shared_core2 OBJECT ${hs_exec_SRCS}) - list(APPEND RUNTIME_SHLIBS $) - set_target_properties(hs_exec_shared_core2 PROPERTIES - COMPILE_FLAGS "-march=core2" - POSITION_INDEPENDENT_CODE TRUE - RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in" - ) - add_library(hs_exec_shared_corei7 OBJECT ${hs_exec_SRCS}) - list(APPEND RUNTIME_SHLIBS $) - set_target_properties(hs_exec_shared_corei7 PROPERTIES - COMPILE_FLAGS "-march=corei7" - POSITION_INDEPENDENT_CODE TRUE - RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} corei7 ${CMAKE_MODULE_PATH}/keep.syms.in" - ) - add_library(hs_exec_shared_avx2 OBJECT ${hs_exec_SRCS} ${hs_exec_avx2_SRCS}) - list(APPEND RUNTIME_SHLIBS $) - set_target_properties(hs_exec_shared_avx2 PROPERTIES - COMPILE_FLAGS "-march=core-avx2" - POSITION_INDEPENDENT_CODE TRUE - RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx2 ${CMAKE_MODULE_PATH}/keep.syms.in" - ) + add_library(hs_exec_common OBJECT + ${hs_exec_common_SRCS} + src/dispatcher.c + ) - if (BUILD_AVX512) - add_library(hs_exec_shared_avx512 OBJECT ${hs_exec_SRCS} ${hs_exec_avx2_SRCS}) - list(APPEND RUNTIME_SHLIBS $) - set_target_properties(hs_exec_shared_avx512 PROPERTIES - COMPILE_FLAGS "${SKYLAKE_FLAG}" + # hs_version.c is added explicitly to avoid some build systems that refuse to + # create a lib without any src (I'm looking at you Xcode) + + add_library(hs_runtime STATIC src/hs_version.c + $ + ${RUNTIME_LIBS}) + set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C) + add_library(hs_compile OBJECT ${hs_compile_SRCS}) + + # we want the static lib for testing + add_library(hs STATIC src/hs_version.c src/hs_valid_platform.c + $ + $ + ${RUNTIME_LIBS}) + endif (BUILD_STATIC_LIBS) + + if (BUILD_SHARED_LIBS) + # build shared libs + add_library(hs_compile_shared OBJECT ${hs_compile_SRCS}) + set_target_properties(hs_compile_shared PROPERTIES POSITION_INDEPENDENT_CODE TRUE) + add_library(hs_exec_shared_neon OBJECT ${hs_exec_SRCS} ${hs_exec_neon_SRCS}) + list(APPEND RUNTIME_SHLIBS $) + set_target_properties(hs_exec_shared_neon PROPERTIES + COMPILE_FLAGS "-march=${ARMV8_ARCH}" POSITION_INDEPENDENT_CODE TRUE - RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx512 ${CMAKE_MODULE_PATH}/keep.syms.in" + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} neon ${CMAKE_MODULE_PATH}/keep.syms.in" ) - endif (BUILD_AVX512) - add_library(hs_exec_common_shared OBJECT - ${hs_exec_common_SRCS} - src/dispatcher.c - ) - set_target_properties(hs_exec_common_shared PROPERTIES - 
OUTPUT_NAME hs_exec_common - POSITION_INDEPENDENT_CODE TRUE) - endif() # SHARED - + add_library(hs_exec_shared_sve OBJECT ${hs_exec_SRCS} ${hs_exec_sve_SRCS}) + list(APPEND RUNTIME_SHLIBS $) + set_target_properties(hs_exec_shared_sve PROPERTIES + COMPILE_FLAGS "-march=${SVE_ARCH}" + POSITION_INDEPENDENT_CODE TRUE + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} sve ${CMAKE_MODULE_PATH}/keep.syms.in" + ) + add_library(hs_exec_shared_sve2 OBJECT ${hs_exec_SRCS} ${hs_exec_sve2_SRCS}) + list(APPEND RUNTIME_SHLIBS $) + set_target_properties(hs_exec_shared_sve2 PROPERTIES + COMPILE_FLAGS "-march=${SVE2_BITPERM_ARCH}" + POSITION_INDEPENDENT_CODE TRUE + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} sve2 ${CMAKE_MODULE_PATH}/keep.syms.in" + ) + add_library(hs_exec_common_shared OBJECT + ${hs_exec_common_SRCS} + src/dispatcher.c + ) + set_target_properties(hs_exec_common_shared PROPERTIES + OUTPUT_NAME hs_exec_common + POSITION_INDEPENDENT_CODE TRUE) + endif() # SHARED + endif (ARCH_AARCH64) endif (NOT FAT_RUNTIME) -if (NOT BUILD_SHARED_LIBS) +if (BUILD_STATIC_LIBS) install(TARGETS hs_runtime DESTINATION ${CMAKE_INSTALL_LIBDIR}) endif() -if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) +if (BUILD_SHARED_LIBS) if (NOT FAT_RUNTIME) add_library(hs_runtime_shared SHARED src/hs_version.c src/hs_valid_platform.c $ @@ -1331,23 +1173,16 @@ if (BUILD_STATIC_LIBS) add_dependencies(hs ragel_Parser) endif () -if (NOT BUILD_SHARED_LIBS) +if (BUILD_STATIC_LIBS) install(TARGETS hs DESTINATION ${CMAKE_INSTALL_LIBDIR}) endif() -if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) +if (BUILD_SHARED_LIBS) set(hs_shared_SRCS src/hs_version.c src/hs_valid_platform.c $) - if (XCODE) - # force this lib to use C++ linkage - add_custom_command(OUTPUT empty.cxx - COMMAND ${CMAKE_COMMAND} -E touch empty.cxx) - set (hs_shared_SRCS ${hs_shared_SRCS} empty.cxx) - endif (XCODE) - if (NOT FAT_RUNTIME) set(hs_shared_SRCS ${hs_shared_SRCS} @@ -1380,7 +1215,24 @@ if (NOT BUILD_STATIC_LIBS) add_library(hs ALIAS hs_shared) endif () +add_subdirectory(util) +add_subdirectory(unit) + +if (EXISTS ${CMAKE_SOURCE_DIR}/tools/CMakeLists.txt) + add_subdirectory(tools) +endif() +if (EXISTS ${CMAKE_SOURCE_DIR}/chimera/CMakeLists.txt AND BUILD_CHIMERA) + add_subdirectory(chimera) +endif() -if(NOT WIN32) +option(BUILD_EXAMPLES "Build Hyperscan example code (default TRUE)" TRUE) +if(BUILD_EXAMPLES) add_subdirectory(examples) endif() + +option(BUILD_BENCHMARKS "Build benchmarks (default TRUE)" TRUE) +if(BUILD_BENCHMARKS) + add_subdirectory(benchmarks) +endif() + +add_subdirectory(doc/dev-reference) diff --git a/COPYING b/COPYING index ef9b24fb9..908843a01 100644 --- a/COPYING +++ b/COPYING @@ -1,4 +1,5 @@ Copyright (c) 2015, Intel Corporation +Copyright (c) 2019-20, VectorCamp PC Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/Contributors-vectorscan.md b/Contributors-vectorscan.md new file mode 100644 index 000000000..b28f4a585 --- /dev/null +++ b/Contributors-vectorscan.md @@ -0,0 +1,25 @@ + 394 Konstantinos Margaritis + 59 apostolos + 25 Hong, Yang A + 19 George Wort + 16 Chang, Harry + 7 Danila Kutenin + 7 Wang Xiang W + 6 Alex Bondarev + 5 Konstantinos Margaritis + 3 Duncan Bellamy + 2 Azat Khuzhin + 2 Jan Henning + 1 BigRedEye + 1 Daniel Kutenin + 1 Danila Kutenin + 1 Liu Zixian + 1 Mitchell Wasson + 1 Piotr Skamruk + 1 Robbie Williamson + 1 Robert Schulze + 1 Walt Stoneburner + 1 Zhu,Wenjun + 1 hongyang7 + 1 jplaisance + 1 liquidaty diff --git a/LICENSE 
b/LICENSE
index 30c57a801..8324617bf 100644
--- a/LICENSE
+++ b/LICENSE
@@ -2,6 +2,11 @@ Hyperscan is licensed under the BSD License.
 Copyright (c) 2015, Intel Corporation
+Vectorscan is licensed under the BSD License.
+
+Copyright (c) 2020, VectorCamp PC
+Copyright (c) 2021, Arm Limited
+
 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
diff --git a/README.md b/README.md
index 9f4c03723..7f7c2f531 100644
--- a/README.md
+++ b/README.md
@@ -1,43 +1,189 @@
-# Hyperscan
+# About Vectorscan
-Hyperscan is a high-performance multiple regex matching library. It follows the
+Vectorscan is a fork of Intel's Hyperscan, modified to run on more platforms. Currently ARM NEON/ASIMD
+and Power VSX are 100% functional. ARM SVE2 support is ongoing, with
+access to hardware now. More platforms will follow in the future.
+Furthermore, starting with 5.4.12 there is now a [SIMDe](https://github.com/simd-everywhere/simde)
+port, which can either be used for platforms without official SIMD support,
+as SIMDe can emulate SIMD instructions, or as an alternative backend for existing architectures,
+for reference and comparison purposes.
+
+Vectorscan will follow Intel's API and internal algorithms where possible, but will not
+hesitate to make code changes where this is thought to give better performance or better
+portability. In addition, the code will be gradually simplified and made more uniform, and
+all architecture-specific (currently Intel) #ifdefs will be removed and abstracted away.
+
+# Why was there a need for a fork?
+
+Originally, the ARM porting was intended to be merged into Intel's own Hyperscan, and relevant
+Pull Requests were made to the project for this reason. Unfortunately, the
+PRs were rejected for now and the foreseeable future, thus we have created Vectorscan for
+our own multi-architectural and open source collaborative needs.
+
+The recent license change of Hyperscan makes Vectorscan even more relevant for the FLOSS ecosystem.
+
+# What is Vectorscan/Hyperscan?
+
+Hyperscan, and by extension Vectorscan, is a high-performance multiple regex matching library. It follows the regular expression syntax of the commonly-used libpcre library, but is a standalone library with its own C API.
-Hyperscan uses hybrid automata techniques to allow simultaneous matching of
+Hyperscan/Vectorscan uses hybrid automata techniques to allow simultaneous matching of large numbers (up to tens of thousands) of regular expressions and for the matching of regular expressions across streams of data.
-Hyperscan is typically used in a DPI library stack.
+Vectorscan is typically used in a DPI library stack, just like Hyperscan. (A minimal example of the C API is sketched a little further down, just before the note on Hyperscan's license change.)
-# Documentation
+# License
-Information on building the Hyperscan library and using its API is available in
-the [Developer Reference Guide](http://intel.github.io/hyperscan/dev-reference/).
-# License
+Vectorscan follows a BSD License like the original Hyperscan (up to 5.4).
+Vectorscan continues to be an open source project and we are committed to keeping it that way.
+See the LICENSE file in the project repository.
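+As promised above, here is a minimal sketch of the C API in block mode. It compiles one
+illustrative pattern into a database, allocates scratch space and scans a short buffer; the
+pattern, the input data and the error handling are simplified examples, not part of any shipped
+sample, but all the functions used below belong to the existing public API:
+
+```c
+#include <stdio.h>
+#include <hs.h>
+
+/* Called once per match; returning non-zero would stop the scan early. */
+static int on_match(unsigned int id, unsigned long long from,
+                    unsigned long long to, unsigned int flags, void *ctx) {
+    (void)from; (void)flags; (void)ctx;
+    printf("pattern %u matched, ending at offset %llu\n", id, to);
+    return 0;
+}
+
+int main(void) {
+    hs_database_t *db = NULL;
+    hs_compile_error_t *err = NULL;
+    if (hs_compile("foo.*bar", HS_FLAG_DOTALL, HS_MODE_BLOCK, NULL,
+                   &db, &err) != HS_SUCCESS) {
+        fprintf(stderr, "compile failed: %s\n", err->message);
+        hs_free_compile_error(err);
+        return 1;
+    }
+
+    hs_scratch_t *scratch = NULL;
+    if (hs_alloc_scratch(db, &scratch) != HS_SUCCESS) {
+        hs_free_database(db);
+        return 1;
+    }
+
+    const char data[] = "xxfooyyybarzz";
+    hs_scan(db, data, sizeof(data) - 1, 0, scratch, on_match, NULL);
+
+    hs_free_scratch(scratch);
+    hs_free_database(db);
+    return 0;
+}
+```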
+
+## Hyperscan License Change after 5.4
+
+According to
+[Accelerate Snort Performance with Hyperscan and Intel Xeon Processors on Public Clouds](https://networkbuilders.intel.com/docs/networkbuilders/accelerate-snort-performance-with-hyperscan-and-intel-xeon-processors-on-public-clouds-1680176363.pdf), versions of Hyperscan later than 5.4 are
+going to be closed-source:
-Hyperscan is licensed under the BSD License. See the LICENSE file in the
-project repository.
+> The latest open-source version (BSD-3 license) of Hyperscan on Github is 5.4. Intel conducts continuous internal
+> development and delivers new Hyperscan releases under Intel Proprietary License (IPL) beginning from 5.5 for interested
+> customers. Please contact authors to learn more about getting new Hyperscan releases.
 # Versioning
-The `master` branch on Github will always contain the most recent release of
+The `master` branch on Github will always contain the most recent stable release of Hyperscan. Each version released to `master` goes through QA and testing before it is released; if you're a user, rather than a developer, this is the version you should be using. Further development towards the next release takes place on the `develop`
-branch.
+branch. All PRs are first made against the `develop` branch and, if they pass the [Vectorscan CI](https://buildbot-ci.vectorcamp.gr/#/grid), they get merged. The same applies to PRs from `develop` to `master`.
+
+# Compatibility with Hyperscan
+
+Vectorscan aims to be ABI and API compatible with the last open source version of Intel Hyperscan 5.4.
+After careful consideration we decided that we will **NOT** aim to achieve compatibility with the later Hyperscan versions 5.5/5.6 that have extended Hyperscan's API.
+If you need to keep up to date with the latest Hyperscan API, you should talk to Intel and get a license for it.
+However, we intend to extend Vectorscan's API with user-requested changes or API extensions and improvements that we think are best for the project.
+
+# Installation
+
+## Debian/Ubuntu
+
+On recent Debian/Ubuntu systems, Vectorscan should be directly available for installation:
+
+```
+$ sudo apt install libvectorscan5
+```
+
+Or, to get the development package, install `libvectorscan-dev`:
+
+```
+$ sudo apt install libvectorscan-dev
+```
+
+For other distributions/OSes please check the [Wiki](https://github.com/VectorCamp/vectorscan/wiki/Installation-from-package).
+
+
+# Build Instructions
+
+The build system has recently been refactored to be more modular and easier to extend. For that reason,
+some small but necessary changes were made that might break compatibility with how Hyperscan was built.
+
+## Install Common Dependencies
+
+### Debian/Ubuntu
+In order to build on Debian/Ubuntu, make sure you install the following build dependencies:
+
+```
+$ sudo apt install build-essential cmake ragel pkg-config libsqlite3-dev libpcap-dev
+```
+
+### Other distributions
+
+TBD
+
+### MacOS X (M1/M2/M3 CPUs only)
-# Get Involved
Assuming an existing HomeBrew installation:
-The official homepage for Hyperscan is at [www.hyperscan.io](https://www.hyperscan.io).
+``` +% brew install boost cmake gcc libpcap pkg-config ragel sqlite +``` + +## Configure & build + +In order to configure with `cmake` first create and cd into a build directory: + +``` +$ mkdir build +$ cd build +``` + +Then call `cmake` from inside the `build` directory: + +``` +$ cmake ../ +``` + +Common options for Cmake are: + +* `-DBUILD_STATIC_LIBS=[On|Off]` Build static libraries +* `-DBUILD_SHARED_LIBS=[On|Off]` Build shared libraries (if none are set static libraries are built by default) +* `-DCMAKE_BUILD_TYPE=[Release|Debug|RelWithDebInfo|MinSizeRel]` Configure build type and determine optimizations and certain features. +* `-DUSE_CPU_NATIVE=[On|Off]` Native CPU detection is off by default, however it is possible to build a performance-oriented non-fat library tuned to your CPU +* `-DFAT_RUNTIME=[On|Off]` Fat Runtime is only available for X86 32-bit/64-bit and AArch64 architectures and only on Linux. It is incompatible with `Debug` type and `USE_CPU_NATIVE`. + +### Specific options for X86 32-bit/64-bit (Intel/AMD) CPUs + +* `-DBUILD_AVX2=[On|Off]` Enable code for AVX2. +* `-DBUILD_AVX512=[On|Off]` Enable code for AVX512. Implies `BUILD_AVX2`. +* `-DBUILD_AVX512VBMI=[On|Off]` Enable code for AVX512 with VBMI extension. Implies `BUILD_AVX512`. + +### Specific options for Arm 64-bit CPUs + +* `-DBUILD_SVE=[On|Off]` Enable code for SVE, like on AWS Graviton3 CPUs. Not much code is ported just for SVE , but enabling SVE code production, does improve code generation, see [Benchmarks](https://github.com/VectorCamp/vectorscan/wiki/Benchmarks). +* `-DBUILD_SVE2=[On|Off]` Enable code for SVE2, implies `BUILD_SVE`. Most non-Neon code is written for SVE2 +* `-DBUILD_SVE2_BITPERM=[On|Off]` Enable code for SVE2_BITPERM harwdare feature, implies `BUILD_SVE2`. + +## Other options + +* `SANITIZE=[address|memory|undefined]` (experimental) Use `libasan` sanitizer to detect possible bugs. For now only `address` is tested. This will eventually be integrated in the CI. + +## SIMDe options + +* `SIMDE_BACKEND=[On|Off]` Enable SIMDe backend. If this is chosen all native (SSE/AVX/AVX512/Neon/SVE/VSX) backends will be disabled and a SIMDe SSE4.2 emulation backend will be enabled. This will enable Vectorscan to build and run on architectures without SIMD. +* `SIMDE_NATIVE=[On|Off]` Enable SIMDe native emulation of x86 SSE4.2 intrinsics on the building platform. That is, SSE4.2 intrinsics will be emulated using Neon on an Arm platform, or VSX on a Power platform, etc. + +## Build + +If `cmake` has completed successfully you can run `make` in the same directory, if you have a multi-core system with `N` cores, running + +``` +$ make -j +``` + +will speed up the process. If all goes well, you should have the vectorscan library compiled. + + +# Contributions + +The official homepage for Vectorscan is at [www.github.com/VectorCamp/vectorscan](https://www.github.com/VectorCamp/vectorscan). + +# Vectorscan Development + +All development of Vectorscan is done in public. + +# Original Hyperscan links +For reference, the official homepage for Hyperscan is at [www.hyperscan.io](https://www.hyperscan.io). + +# Hyperscan Documentation + +Information on building the Hyperscan library and using its API is available in +the [Developer Reference Guide](http://intel.github.io/hyperscan/dev-reference/). -If you have questions or comments, we encourage you to [join the mailing -list](https://lists.01.org/mailman/listinfo/hyperscan). Bugs can be filed by -sending email to the list, or by creating an issue on Github. 
+And you can find the source code [on Github](https://github.com/intel/hyperscan). -If you wish to contact the Hyperscan team at Intel directly, without posting -publicly to the mailing list, send email to -[hyperscan@intel.com](mailto:hyperscan@intel.com). +For Intel Hyperscan related issues and questions, please follow the relevant links there. diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt new file mode 100644 index 000000000..63391a68c --- /dev/null +++ b/benchmarks/CMakeLists.txt @@ -0,0 +1,9 @@ +include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${PROJECT_SOURCE_DIR}) + +if (NOT FAT_RUNTIME AND (BUILD_SHARED_LIBS OR BUILD_STATIC_LIBS)) + add_executable(benchmarks benchmarks.cpp) + set_source_files_properties(benchmarks.cpp PROPERTIES COMPILE_FLAGS + "-Wall -Wno-unused-variable") + target_link_libraries(benchmarks hs) +endif() diff --git a/benchmarks/benchmarks.cpp b/benchmarks/benchmarks.cpp new file mode 100644 index 000000000..91cab3f8d --- /dev/null +++ b/benchmarks/benchmarks.cpp @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2020, 2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "benchmarks.hpp" + +#define MAX_LOOPS 1000000000 +#define MAX_MATCHES 5 +#define N 8 + +struct hlmMatchEntry { + size_t to; + u32 id; + hlmMatchEntry(size_t end, u32 identifier) : + to(end), id(identifier) {} +}; + +std::vector ctxt; + +static +hwlmcb_rv_t hlmSimpleCallback(size_t to, u32 id, + UNUSED struct hs_scratch *scratch) { + DEBUG_PRINTF("match @%zu = %u\n", to, id); + + ctxt.push_back(hlmMatchEntry(to, id)); + + return HWLM_CONTINUE_MATCHING; +} + +template +static void run_benchmarks(int size, int loops, int max_matches, bool is_reverse, MicroBenchmark &bench, InitFunc &&init, BenchFunc &&func) { + init(bench); + double total_sec = 0.0; + u64a total_size = 0; + double bw = 0.0; + double avg_bw = 0.0; + double max_bw = 0.0; + double avg_time = 0.0; + if (max_matches) { + int pos = 0; + for(int j = 0; j < max_matches - 1; j++) { + bench.buf[pos] = 'b'; + pos = (j+1) *size / max_matches ; + bench.buf[pos] = 'a'; + u64a actual_size = 0; + auto start = std::chrono::steady_clock::now(); + for(int i = 0; i < loops; i++) { + const u8 *res = func(bench); + if (is_reverse) + actual_size += bench.buf.data() + size - res; + else + actual_size += res - bench.buf.data(); + } + auto end = std::chrono::steady_clock::now(); + double dt = std::chrono::duration_cast(end - start).count(); + total_sec += dt; + /*convert microseconds to seconds*/ + /*calculate bandwidth*/ + bw = (actual_size / dt) * 1000000.0 / 1048576.0; + /*std::cout << "act_size = " << act_size << std::endl; + std::cout << "dt = " << dt << std::endl; + std::cout << "bw = " << bw << std::endl;*/ + avg_bw += bw; + /*convert to MB/s*/ + max_bw = std::max(bw, max_bw); + /*calculate average time*/ + avg_time += total_sec / loops; + } + avg_time /= max_matches; + avg_bw /= max_matches; + total_sec /= 1000000.0; + /*convert average time to us*/ + printf(KMAG "%s: %u matches, %u * %u iterations," KBLU " total elapsed time =" RST " %.3f s, " + KBLU "average time per call =" RST " %.3f μs," KBLU " max bandwidth = " RST " %.3f MB/s," KBLU " average bandwidth =" RST " %.3f MB/s \n", + bench.label, max_matches, size ,loops, total_sec, avg_time, max_bw, avg_bw); + } else { + auto start = std::chrono::steady_clock::now(); + for (int i = 0; i < loops; i++) { + const u8 *res = func(bench); + } + auto end = std::chrono::steady_clock::now(); + total_sec += std::chrono::duration_cast(end - start).count(); + /*calculate transferred size*/ + total_size = size * loops; + /*calculate average time*/ + avg_time = total_sec / loops; + /*convert microseconds to seconds*/ + total_sec /= 1000000.0; + /*calculate maximum bandwidth*/ + max_bw = total_size / total_sec; + /*convert to MB/s*/ + max_bw /= 1048576.0; + printf(KMAG "%s: no matches, %u * %u iterations," KBLU " total elapsed time =" RST " %.3f s, " + KBLU "average time per call =" RST " %.3f μs ," KBLU " bandwidth = " RST " %.3f MB/s \n", + bench.label, size ,loops, total_sec, avg_time, max_bw ); + } +} + +int main(){ + int matches[] = {0, MAX_MATCHES}; + std::vector sizes; + for (size_t i = 0; i < N; i++) sizes.push_back(16000 << i*2); + const char charset[] = "aAaAaAaAAAaaaaAAAAaaaaAAAAAAaaaAAaaa"; + + for (int m = 0; m < 2; m++) { + for (size_t i = 0; i < std::size(sizes); i++) { + MicroBenchmark bench("Shufti", sizes[i]); + run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench, + [&](MicroBenchmark &b) { + b.chars.set('a'); + ue2::shuftiBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi); + 
memset(b.buf.data(), 'b', b.size); + }, + [&](MicroBenchmark &b) { + return shuftiExec(b.lo, b.hi, b.buf.data(), b.buf.data() + b.size); + } + ); + } + + for (size_t i = 0; i < std::size(sizes); i++) { + MicroBenchmark bench("Reverse Shufti", sizes[i]); + run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench, + [&](MicroBenchmark &b) { + b.chars.set('a'); + ue2::shuftiBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi); + memset(b.buf.data(), 'b', b.size); + }, + [&](MicroBenchmark &b) { + return rshuftiExec(b.lo, b.hi, b.buf.data(), b.buf.data() + b.size); + } + ); + } + + for (size_t i = 0; i < std::size(sizes); i++) { + MicroBenchmark bench("Truffle", sizes[i]); + run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench, + [&](MicroBenchmark &b) { + b.chars.set('a'); + ue2::truffleBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi); + memset(b.buf.data(), 'b', b.size); + }, + [&](MicroBenchmark &b) { + return truffleExec(b.lo, b.hi, b.buf.data(), b.buf.data() + b.size); + } + ); + } + + for (size_t i = 0; i < std::size(sizes); i++) { + MicroBenchmark bench("Reverse Truffle", sizes[i]); + run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench, + [&](MicroBenchmark &b) { + b.chars.set('a'); + ue2::truffleBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi); + memset(b.buf.data(), 'b', b.size); + }, + [&](MicroBenchmark &b) { + return rtruffleExec(b.lo, b.hi, b.buf.data(), b.buf.data() + b.size); + } + ); + } + + for (size_t i = 0; i < std::size(sizes); i++) { + MicroBenchmark bench("Vermicelli", sizes[i]); + run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench, + [&](MicroBenchmark &b) { + b.chars.set('a'); + ue2::truffleBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi); + memset(b.buf.data(), 'b', b.size); + }, + [&](MicroBenchmark &b) { + return vermicelliExec('a', 'b', b.buf.data(), b.buf.data() + b.size); + } + ); + } + + for (size_t i = 0; i < std::size(sizes); i++) { + MicroBenchmark bench("Reverse Vermicelli", sizes[i]); + run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench, + [&](MicroBenchmark &b) { + b.chars.set('a'); + ue2::truffleBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi); + memset(b.buf.data(), 'b', b.size); + }, + [&](MicroBenchmark &b) { + return rvermicelliExec('a', 'b', b.buf.data(), b.buf.data() + b.size); + } + ); + } + + for (size_t i = 0; i < std::size(sizes); i++) { + //we imitate the noodle unit tests + std::string str; + const size_t char_len = 5; + str.resize(char_len + 1); + for (size_t j=0; j < char_len; j++) { + srand (time(NULL)); + int key = rand() % + 36 ; + str[char_len] = charset[key]; + str[char_len + 1] = '\0'; + } + + MicroBenchmark bench("Noodle", sizes[i]); + run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench, + [&](MicroBenchmark &b) { + ctxt.clear(); + memset(b.buf.data(), 'a', b.size); + u32 id = 1000; + ue2::hwlmLiteral lit(str, true, id); + b.nt = ue2::noodBuildTable(lit); + assert(b.nt != nullptr); + }, + [&](MicroBenchmark &b) { + noodExec(b.nt.get(), b.buf.data(), b.size, 0, hlmSimpleCallback, &b.scratch); + return b.buf.data() + b.size; + } + ); + } + } + + return 0; +} diff --git a/benchmarks/benchmarks.hpp b/benchmarks/benchmarks.hpp new file mode 100644 index 000000000..974d22344 --- /dev/null +++ b/benchmarks/benchmarks.hpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2020, 2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "nfa/shufti.h" +#include "nfa/shufticompile.h" +#include "nfa/truffle.h" +#include "nfa/trufflecompile.h" +#include "nfa/vermicelli.hpp" +#include "hwlm/noodle_build.h" +#include "hwlm/noodle_engine.h" +#include "hwlm/noodle_internal.h" +#include "hwlm/hwlm_literal.h" +#include "util/bytecode_ptr.h" +#include "scratch.h" + +/*define colour control characters*/ +#define RST "\x1B[0m" +#define KRED "\x1B[31m" +#define KGRN "\x1B[32m" +#define KYEL "\x1B[33m" +#define KBLU "\x1B[34m" +#define KMAG "\x1B[35m" +#define KCYN "\x1B[36m" +#define KWHT "\x1B[37m" + +class MicroBenchmark +{ +public: + char const *label; + size_t size; + + // Shufti/Truffle + m128 lo, hi; + ue2::CharReach chars; + std::vector buf; + + // Noodle + struct hs_scratch scratch; + ue2::bytecode_ptr nt; + + MicroBenchmark(char const *label_, size_t size_) + :label(label_), size(size_), buf(size_) { + }; +}; diff --git a/chimera/CMakeLists.txt b/chimera/CMakeLists.txt index 1cd66a3f5..c3c50c3b4 100644 --- a/chimera/CMakeLists.txt +++ b/chimera/CMakeLists.txt @@ -33,17 +33,15 @@ target_link_libraries(chimera hs pcre) install(TARGETS chimera DESTINATION ${CMAKE_INSTALL_LIBDIR}) -if (NOT WIN32) - # expand out library names for pkgconfig static link info - foreach (LIB ${CMAKE_CXX_IMPLICIT_LINK_LIBRARIES}) - # this is fragile, but protects us from toolchain specific files - if (NOT EXISTS ${LIB}) - set(PRIVATE_LIBS "${PRIVATE_LIBS} -l${LIB}") - endif() - endforeach() - set(PRIVATE_LIBS "${PRIVATE_LIBS} -L${LIBDIR} -lpcre") - - configure_file(libch.pc.in libch.pc @ONLY) # only replace @ quoted vars - install(FILES ${CMAKE_BINARY_DIR}/chimera/libch.pc - DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig") -endif() +# expand out library names for pkgconfig static link info +foreach (LIB ${CMAKE_CXX_IMPLICIT_LINK_LIBRARIES}) + # this is fragile, but protects us from toolchain specific files + if (NOT EXISTS ${LIB}) + set(PRIVATE_LIBS "${PRIVATE_LIBS} -l${LIB}") + endif() +endforeach() +set(PRIVATE_LIBS "${PRIVATE_LIBS} -L${LIBDIR} -lpcre") + +configure_file(libch.pc.in libch.pc @ONLY) # only replace @ quoted vars +install(FILES ${CMAKE_BINARY_DIR}/chimera/libch.pc + DESTINATION 
"${CMAKE_INSTALL_LIBDIR}/pkgconfig") \ No newline at end of file diff --git a/chimera/ch_common.h b/chimera/ch_common.h index 8caa44407..bdb0bafa9 100644 --- a/chimera/ch_common.h +++ b/chimera/ch_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Intel Corporation + * Copyright (c) 2018-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -345,6 +345,16 @@ ch_error_t HS_CDECL ch_set_scratch_allocator(ch_alloc_t alloc_func, */ #define CH_SCRATCH_IN_USE (-10) +/** + * Unexpected internal error from Hyperscan. + * + * This error indicates that there was unexpected matching behaviors from + * Hyperscan. This could be related to invalid usage of scratch space or + * invalid memory operations by users. + * + */ +#define CH_UNKNOWN_HS_ERROR (-13) + /** * Returned when pcre_exec (called for some expressions internally from @ref * ch_scan) failed due to a fatal error. diff --git a/chimera/ch_compile.cpp b/chimera/ch_compile.cpp index 46536f312..fbe8fe534 100644 --- a/chimera/ch_compile.cpp +++ b/chimera/ch_compile.cpp @@ -39,7 +39,6 @@ #include "hs_internal.h" #include "ue2common.h" #include "util/compile_error.h" -#include "util/make_unique.h" #include "util/multibit_build.h" #include "util/target_info.h" @@ -495,7 +494,7 @@ void ch_compile_multi_int(const char *const *expressions, const unsigned *flags, // First, build with libpcre. A build failure from libpcre will throw // an exception up to the caller. auto patternData = - ue2::make_unique(myExpr, myFlags, i, myId, mode, match_limit, + std::make_unique(myExpr, myFlags, i, myId, mode, match_limit, match_limit_recursion, platform); pcres.push_back(move(patternData)); PatternData &curr = *pcres.back(); diff --git a/chimera/ch_runtime.c b/chimera/ch_runtime.c index 212bbc7be..af7d1f080 100644 --- a/chimera/ch_runtime.c +++ b/chimera/ch_runtime.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Intel Corporation + * Copyright (c) 2018-2022, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -326,6 +326,10 @@ ch_error_t catchupPcre(struct HybridContext *hyctx, unsigned int id, } else if (cbrv == CH_CALLBACK_SKIP_PATTERN) { DEBUG_PRINTF("user callback told us to skip this pattern\n"); pd->scanStart = hyctx->length; + if (top_id == id) { + break; + } + continue; } if (top_id == id) { @@ -419,6 +423,7 @@ int HS_CDECL multiCallback(unsigned int id, unsigned long long from, DEBUG_PRINTF("user callback told us to skip this pattern\n"); pd->scanStart = hyctx->length; ret = HS_SUCCESS; + hyctx->scratch->ret = ret; } else if (ret == CH_FAIL_INTERNAL) { return ret; } @@ -590,11 +595,24 @@ ch_error_t ch_scan_i(const ch_database_t *hydb, if (!(db->flags & CHIMERA_FLAG_NO_MULTIMATCH)) { ret = scanHyperscan(&hyctx, data, length); - if (ret != HS_SUCCESS && scratch->ret != CH_SUCCESS) { - DEBUG_PRINTF("Hyperscan returned error %d\n", scratch->ret); + // Errors from pcre scan. + if (scratch->ret == CH_CALLBACK_TERMINATE) { + DEBUG_PRINTF("Pcre terminates scan\n"); + unmarkScratchInUse(scratch); + return CH_SCAN_TERMINATED; + } else if (scratch->ret != CH_SUCCESS) { + DEBUG_PRINTF("Pcre internal error\n"); unmarkScratchInUse(scratch); return scratch->ret; } + // Errors from Hyperscan scan. Note Chimera could terminate + // Hyperscan callback on purpose so this is not counted as an error. 
+ if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { + assert(scratch->ret == CH_SUCCESS); + DEBUG_PRINTF("Hyperscan returned error %d\n", ret); + unmarkScratchInUse(scratch); + return ret; + } } DEBUG_PRINTF("Flush priority queue\n"); diff --git a/cmake/arch.cmake b/cmake/arch.cmake deleted file mode 100644 index cced49c69..000000000 --- a/cmake/arch.cmake +++ /dev/null @@ -1,96 +0,0 @@ -# detect architecture features -# -# must be called after determining where compiler intrinsics are defined - -if (HAVE_C_X86INTRIN_H) - set (INTRIN_INC_H "x86intrin.h") -elseif (HAVE_C_INTRIN_H) - set (INTRIN_INC_H "intrin.h") -else () - message (FATAL_ERROR "No intrinsics header found") -endif () - -if (BUILD_AVX512) - CHECK_C_COMPILER_FLAG(${SKYLAKE_FLAG} HAS_ARCH_SKYLAKE) - if (NOT HAS_ARCH_SKYLAKE) - message (FATAL_ERROR "AVX512 not supported by compiler") - endif () -endif () - -if (FAT_RUNTIME) - # test the highest level microarch to make sure everything works - if (BUILD_AVX512) - set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${SKYLAKE_FLAG}") - else () - set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} -march=core-avx2") - endif () -else (NOT FAT_RUNTIME) - # if not fat runtime, then test given cflags - set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${ARCH_C_FLAGS}") -endif () - -# ensure we have the minimum of SSSE3 - call a SSSE3 intrinsic -CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> -int main() { - __m128i a = _mm_set1_epi8(1); - (void)_mm_shuffle_epi8(a, a); -}" HAVE_SSSE3) - -# now look for AVX2 -CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> -#if !defined(__AVX2__) -#error no avx2 -#endif - -int main(){ - __m256i z = _mm256_setzero_si256(); - (void)_mm256_xor_si256(z, z); -}" HAVE_AVX2) - -# and now for AVX512 -CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> -#if !defined(__AVX512BW__) -#error no avx512bw -#endif - -int main(){ - __m512i z = _mm512_setzero_si512(); - (void)_mm512_abs_epi8(z); -}" HAVE_AVX512) - -# and now for AVX512VBMI -CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> -#if !defined(__AVX512VBMI__) -#error no avx512vbmi -#endif - -int main(){ - __m512i a = _mm512_set1_epi8(0xFF); - __m512i idx = _mm512_set_epi64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); - (void)_mm512_permutexvar_epi8(idx, a); -}" HAVE_AVX512VBMI) - -if (FAT_RUNTIME) - if (NOT HAVE_SSSE3) - message(FATAL_ERROR "SSSE3 support required to build fat runtime") - endif () - if (NOT HAVE_AVX2) - message(FATAL_ERROR "AVX2 support required to build fat runtime") - endif () - if (BUILD_AVX512 AND NOT HAVE_AVX512) - message(FATAL_ERROR "AVX512 support requested but not supported") - endif () -else (NOT FAT_RUNTIME) - if (NOT HAVE_AVX2) - message(STATUS "Building without AVX2 support") - endif () - if (NOT HAVE_AVX512) - message(STATUS "Building without AVX512 support") - endif () - if (NOT HAVE_SSSE3) - message(FATAL_ERROR "A minimum of SSSE3 compiler support is required") - endif () -endif () - -unset (CMAKE_REQUIRED_FLAGS) -unset (INTRIN_INC_H) diff --git a/cmake/archdetect.cmake b/cmake/archdetect.cmake new file mode 100644 index 000000000..bd0d088cc --- /dev/null +++ b/cmake/archdetect.cmake @@ -0,0 +1,111 @@ +if (USE_CPU_NATIVE) + # Detect best GNUCC_ARCH to tune for + if (CMAKE_COMPILER_IS_GNUCC) + message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}") + + # If gcc doesn't recognise the host cpu, then mtune=native becomes + # generic, which isn't very good in some cases. 
march=native looks at + # cpuid info and then chooses the best microarch it can (and replaces + # the flag), so use that for tune. + + set(TUNE_FLAG "mtune") + set(GNUCC_TUNE "") + message(STATUS "ARCH_FLAG '${ARCH_FLAG}' '${GNUCC_ARCH}', TUNE_FLAG '${TUNE_FLAG}' '${GNUCC_TUNE}' ") + + # arg1 might exist if using ccache + string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1) + set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -${ARCH_FLAG}=native -${TUNE_FLAG}=native) + execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS} + OUTPUT_VARIABLE _GCC_OUTPUT) + set(_GCC_OUTPUT_TUNE ${_GCC_OUTPUT}) + string(FIND "${_GCC_OUTPUT}" "${ARCH_FLAG}=" POS) + string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT) + string(REGEX REPLACE "${ARCH_FLAG}=[ \t]*([^ \n]*)[ \n].*" "\\1" GNUCC_ARCH "${_GCC_OUTPUT}") + + string(FIND "${_GCC_OUTPUT_TUNE}" "${TUNE_FLAG}=" POS_TUNE) + string(SUBSTRING "${_GCC_OUTPUT_TUNE}" ${POS_TUNE} -1 _GCC_OUTPUT_TUNE) + string(REGEX REPLACE "${TUNE_FLAG}=[ \t]*([^ \n]*)[ \n].*" "\\1" GNUCC_TUNE "${_GCC_OUTPUT_TUNE}") + + message(STATUS "ARCH_FLAG '${ARCH_FLAG}' '${GNUCC_ARCH}', TUNE_FLAG '${TUNE_FLAG}' '${GNUCC_TUNE}' ") + + # test the parsed flag + set (EXEC_ARGS ${CC_ARG1} -E - -${ARCH_FLAG}=${GNUCC_ARCH} -${TUNE_FLAG}=${GNUCC_TUNE}) + execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS} + OUTPUT_QUIET ERROR_QUIET + INPUT_FILE /dev/null + RESULT_VARIABLE GNUCC_TUNE_TEST) + + if (NOT GNUCC_TUNE_TEST EQUAL 0) + message(WARNING "Something went wrong determining gcc tune: -mtune=${GNUCC_TUNE} not valid, falling back to -mtune=native") + set(GNUCC_TUNE native) + else() + set(GNUCC_TUNE ${GNUCC_TUNE}) + message(STATUS "gcc will tune for ${GNUCC_ARCH}, ${GNUCC_TUNE}") + endif() + elseif (CMAKE_COMPILER_IS_CLANG) + if (ARCH_IA32 OR ARCH_X86_64) + set(GNUCC_ARCH x86-64-v2) + set(TUNE_FLAG generic) + elseif(ARCH_AARCH64) + if (BUILD_SVE2_BITPERM) + set(GNUCC_ARCH ${SVE2_BITPERM_ARCH}) + elseif (BUILD_SVE2) + set(GNUCC_ARCH ${SVE2_ARCH}) + elseif (BUILD_SVE) + set(GNUCC_ARCH ${SVE_ARCH}) + else () + set(GNUCC_ARCH ${ARMV8_ARCH}) + endif() + set(TUNE_FLAG generic) + elseif(ARCH_ARM32) + set(GNUCC_ARCH armv7a) + set(TUNE_FLAG generic) + else() + set(GNUCC_ARCH native) + set(TUNE_FLAG generic) + endif() + message(STATUS "clang will tune for ${GNUCC_ARCH}, ${TUNE_FLAG}") + endif() +else() + if (SIMDE_BACKEND) + if (ARCH_IA32 OR ARCH_X86_64) + set(GNUCC_ARCH x86-64-v2) + set(TUNE_FLAG generic) + elseif(ARCH_AARCH64) + set(GNUCC_ARCH armv8-a) + set(TUNE_FLAG generic) + elseif(ARCH_ARM32) + set(GNUCC_ARCH armv7a) + set(TUNE_FLAG generic) + elseif(ARCH_PPC64EL) + set(GNUCC_ARCH power8) + set(TUNE_FLAG power8) + else() + set(GNUCC_ARCH native) + set(TUNE_FLAG generic) + endif() + elseif (ARCH_IA32 OR ARCH_X86_64) + set(GNUCC_ARCH native) + set(TUNE_FLAG generic) + elseif(ARCH_AARCH64) + if (BUILD_SVE2_BITPERM) + set(GNUCC_ARCH ${SVE2_BITPERM_ARCH}) + elseif (BUILD_SVE2) + set(GNUCC_ARCH ${SVE2_ARCH}) + elseif (BUILD_SVE) + set(GNUCC_ARCH ${SVE_ARCH}) + else () + set(GNUCC_ARCH ${ARMV8_ARCH}) + endif() + set(TUNE_FLAG generic) + elseif(ARCH_ARM32) + set(GNUCC_ARCH armv7a) + set(TUNE_FLAG generic) + elseif(ARCH_PPC64EL) + set(GNUCC_ARCH power8) + set(TUNE_FLAG power8) + else() + set(GNUCC_ARCH native) + set(TUNE_FLAG native) + endif() +endif() diff --git a/cmake/build_wrapper.sh b/cmake/build_wrapper.sh index 1962813fe..895610c00 100755 --- a/cmake/build_wrapper.sh +++ b/cmake/build_wrapper.sh @@ -17,7 +17,7 @@ KEEPSYMS=$(mktemp -p /tmp keep.syms.XXXXX) LIBC_SO=$("$@" --print-file-name=libc.so.6) 
cp ${KEEPSYMS_IN} ${KEEPSYMS} # get all symbols from libc and turn them into patterns -nm -f p -g -D ${LIBC_SO} | sed -s 's/\([^ ]*\).*/^\1$/' >> ${KEEPSYMS} +nm -f p -g -D ${LIBC_SO} | sed -s 's/\([^ @]*\).*/^\1$/' >> ${KEEPSYMS} # build the object "$@" # rename the symbols in the object diff --git a/cmake/cflags-arm.cmake b/cmake/cflags-arm.cmake new file mode 100644 index 000000000..c6943bbfd --- /dev/null +++ b/cmake/cflags-arm.cmake @@ -0,0 +1,93 @@ +if (NOT FAT_RUNTIME) + if (BUILD_SVE2_BITPERM) + message (STATUS "SVE2_BITPERM implies SVE2, enabling BUILD_SVE2") + set(BUILD_SVE2 ON) + endif () + if (BUILD_SVE2) + message (STATUS "SVE2 implies SVE, enabling BUILD_SVE") + set(BUILD_SVE ON) + endif () +endif () + + +if (CMAKE_COMPILER_IS_GNUCXX) + set(ARMV9BASE_MINVER "12") + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS ARMV9BASE_MINVER) + set(SVE2_ARCH "armv8-a+sve2") + else() + set(SVE2_ARCH "armv9-a") + endif() +else() + set(SVE2_ARCH "armv9-a") +endif() + +set(ARMV8_ARCH "armv8-a") +set(SVE_ARCH "${ARMV8_ARCH}+sve") +set(SVE2_BITPERM_ARCH "${SVE2_ARCH}+sve2-bitperm") + +CHECK_INCLUDE_FILE_CXX(arm_neon.h HAVE_C_ARM_NEON_H) +if (BUILD_SVE OR BUILD_SVE2 OR BUILD_SVE2_BITPERM OR FAT_RUNTIME) + set(CMAKE_REQUIRED_FLAGS "-march=${SVE_ARCH}") + CHECK_INCLUDE_FILE_CXX(arm_sve.h HAVE_C_ARM_SVE_H) + if (NOT HAVE_C_ARM_SVE_H) + message(FATAL_ERROR "arm_sve.h is required to build for SVE.") + endif() +endif() + +CHECK_C_SOURCE_COMPILES("#include +int main() { + int32x4_t a = vdupq_n_s32(1); + (void)a; +}" HAVE_NEON) + +if (BUILD_SVE2_BITPERM) + set(CMAKE_REQUIRED_FLAGS "-march=${SVE2_BITPERM_ARCH}") + CHECK_C_SOURCE_COMPILES("#include + int main() { + svuint8_t a = svbext(svdup_u8(1), svdup_u8(2)); + (void)a; + }" HAVE_SVE2_BITPERM) +endif() +if (BUILD_SVE2) + set(CMAKE_REQUIRED_FLAGS "-march=${SVE2_ARCH}") + CHECK_C_SOURCE_COMPILES("#include + int main() { + svuint8_t a = svbsl(svdup_u8(1), svdup_u8(2), svdup_u8(3)); + (void)a; + }" HAVE_SVE2) +endif() +if (BUILD_SVE) + set(CMAKE_REQUIRED_FLAGS "-march=${SVE_ARCH}") + CHECK_C_SOURCE_COMPILES("#include + int main() { + svuint8_t a = svdup_u8(1); + (void)a; + }" HAVE_SVE) +endif () + +if (FAT_RUNTIME) + if (NOT HAVE_NEON) + message(FATAL_ERROR "NEON support required to build fat runtime") + endif () + if (BUILD_SVE AND NOT HAVE_SVE) + message(FATAL_ERROR "SVE support required to build fat runtime") + endif () + if (BUILD_SVE2 AND NOT HAVE_SVE2) + message(FATAL_ERROR "SVE2 support required to build fat runtime") + endif () + if (BUILD_SVE2_BITPERM AND NOT HAVE_SVE2_BITPERM) + message(FATAL_ERROR "SVE2 support required to build fat runtime") + endif () +else (NOT FAT_RUNTIME) + if (NOT BUILD_SVE) + message(STATUS "Building without SVE support") + endif () + if (NOT BUILD_SVE2) + message(STATUS "Building without SVE2 support") + endif () + if (NOT HAVE_NEON) + message(FATAL_ERROR "Neon/ASIMD support required for Arm support") + endif () +endif () + + diff --git a/cmake/cflags-generic.cmake b/cmake/cflags-generic.cmake new file mode 100644 index 000000000..4eabcdb55 --- /dev/null +++ b/cmake/cflags-generic.cmake @@ -0,0 +1,164 @@ +# set compiler flags - more are tested and added later +set(EXTRA_C_FLAGS "${OPT_C_FLAG} -std=c17 -Wall -Wextra -Wshadow -Wcast-qual -fno-strict-aliasing") +set(EXTRA_CXX_FLAGS "${OPT_CXX_FLAG} -std=c++17 -Wall -Wextra -Wshadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor -fno-strict-aliasing") +if (NOT CMAKE_COMPILER_IS_CLANG) + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-new-ttp-matching") 
+endif() + +if (NOT RELEASE_BUILD) + # -Werror is most useful during development, don't potentially break + # release builds + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Werror") + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Werror") + if (CMAKE_COMPILER_IS_CLANG) + if (CMAKE_C_COMPILER_VERSION VERSION_GREATER "13.0") + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-unused-but-set-variable") + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-unused-but-set-variable") + endif() + endif() +endif() + +if (DISABLE_ASSERTS) + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -DNDEBUG") + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -DNDEBUG") +endif() + +if(CMAKE_COMPILER_IS_GNUCC) + # spurious warnings? + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-array-bounds -Wno-maybe-uninitialized") +endif() + +if(CMAKE_COMPILER_IS_GNUCXX) + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-maybe-uninitialized") + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fabi-version=0") + endif () + # don't complain about abi + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-abi") + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-abi") +endif() + +if (NOT(ARCH_IA32 AND RELEASE_BUILD)) + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fno-omit-frame-pointer") + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-omit-frame-pointer") +endif() + +CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H) +CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN) +CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC) + +# these end up in the config file +CHECK_C_COMPILER_FLAG(-fvisibility=hidden HAS_C_HIDDEN) +CHECK_CXX_COMPILER_FLAG(-fvisibility=hidden HAS_CXX_HIDDEN) + +# are we using libc++ +CHECK_CXX_SYMBOL_EXISTS(_LIBCPP_VERSION ciso646 HAVE_LIBCPP) + +if (RELEASE_BUILD) + if (HAS_C_HIDDEN) + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fvisibility=hidden") + endif() + if (HAS_CXX_HIDDEN) + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fvisibility=hidden") + endif() +endif() + +# testing a builtin takes a little more work +CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED) +CHECK_CXX_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED) +# Clang does not use __builtin_constant_p() the same way as gcc +if (NOT CMAKE_COMPILER_IS_CLANG) + CHECK_C_SOURCE_COMPILES("int main(void) { __builtin_constant_p(0); }" HAVE__BUILTIN_CONSTANT_P) +endif() + +set(C_FLAGS_TO_CHECK +# Variable length arrays are way bad, most especially at run time +"-Wvla" +# Pointer arith on void pointers is doing it wrong. 
+ "-Wpointer-arith" +# Build our C code with -Wstrict-prototypes -Wmissing-prototypes + "-Wstrict-prototypes" + "-Wmissing-prototypes" +) +foreach (FLAG ${C_FLAGS_TO_CHECK}) + # munge the name so it doesn't break things + string(REPLACE "-" "_" FNAME C_FLAG${FLAG}) + CHECK_C_COMPILER_FLAG("${FLAG}" ${FNAME}) + if (${FNAME}) + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} ${FLAG}") + endif() +endforeach() + +# self-assign should be thrown away, but clang whinges +CHECK_C_COMPILER_FLAG("-Wself-assign" CC_SELF_ASSIGN) +if (CC_SELF_ASSIGN) + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-self-assign") +endif() +CHECK_CXX_COMPILER_FLAG("-Wself-assign" CXX_SELF_ASSIGN) +if (CXX_SELF_ASSIGN) + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-self-assign") +endif() + +# clang gets up in our face for going paren crazy with macros +CHECK_C_COMPILER_FLAG("-Wparentheses-equality" CC_PAREN_EQUALITY) +if (CC_PAREN_EQUALITY) + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-parentheses-equality") +endif() + +# clang complains about unused const vars in our Ragel-generated code. +CHECK_CXX_COMPILER_FLAG("-Wunused-const-variable" CXX_UNUSED_CONST_VAR) +if (CXX_UNUSED_CONST_VAR) + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-unused-const-variable") +endif() + +# clang-14 complains about unused-but-set variable. +CHECK_CXX_COMPILER_FLAG("-Wunused-but-set-variable" CXX_UNUSED_BUT_SET_VAR) +if (CXX_UNUSED_BUT_SET_VAR) + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-unused-but-set-variable") +endif() + +# clang-14 complains about using bitwise operator instead of logical ones. +CHECK_CXX_COMPILER_FLAG("-Wbitwise-instead-of-logical" CXX_BITWISE_INSTEAD_OF_LOGICAL) +if (CXX_BITWISE_INSTEAD_OF_LOGICAL) + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-bitwise-instead-of-logical") +endif() + +# clang-14 complains about using bitwise operator instead of logical ones. 
+CHECK_CXX_COMPILER_FLAG("-Wbitwise-instead-of-logical" CXX_BITWISE_INSTEAD_OF_LOGICAL) +if (CXX_BITWISE_INSTEAD_OF_LOGICAL) + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-bitwise-instead-of-logical") +endif() + +CHECK_CXX_COMPILER_FLAG("-Wignored-attributes" CXX_IGNORED_ATTR) +if (CXX_IGNORED_ATTR) + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-ignored-attributes") +endif() + +# gcc 9 complains about redundant move for returned variable +CHECK_CXX_COMPILER_FLAG("-Wredundant-move" CXX_REDUNDANT_MOVE) +if (CXX_REDUNDANT_MOVE) + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-redundant-move") +endif() + +# note this for later, g++ doesn't have this flag but clang does +CHECK_CXX_COMPILER_FLAG("-Wweak-vtables" CXX_WEAK_VTABLES) +if (CXX_WEAK_VTABLES) + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wweak-vtables") +endif() + +CHECK_CXX_COMPILER_FLAG("-Wmissing-declarations" CXX_MISSING_DECLARATIONS) +if (CXX_MISSING_DECLARATIONS) + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wmissing-declarations") +endif() + +CHECK_CXX_COMPILER_FLAG("-Wunused-local-typedefs" CXX_UNUSED_LOCAL_TYPEDEFS) + +CHECK_CXX_COMPILER_FLAG("-Wunused-variable" CXX_WUNUSED_VARIABLE) + +# gcc 10 complains about this +CHECK_C_COMPILER_FLAG("-Wstringop-overflow" CC_STRINGOP_OVERFLOW) +CHECK_CXX_COMPILER_FLAG("-Wstringop-overflow" CXX_STRINGOP_OVERFLOW) +if(CC_STRINGOP_OVERFLOW OR CXX_STRINGOP_OVERFLOW) + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-stringop-overflow") + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-stringop-overflow") +endif() diff --git a/cmake/cflags-ppc64le.cmake b/cmake/cflags-ppc64le.cmake new file mode 100644 index 000000000..2ea9f1ba6 --- /dev/null +++ b/cmake/cflags-ppc64le.cmake @@ -0,0 +1,18 @@ + +CHECK_INCLUDE_FILE_CXX(altivec.h HAVE_C_PPC64EL_ALTIVEC_H) + +if (HAVE_C_PPC64EL_ALTIVEC_H) + set (INTRIN_INC_H "altivec.h") +else() + message (FATAL_ERROR "No intrinsics header found for VSX") +endif () + +CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> +int main() { + vector int a = vec_splat_s32(1); + (void)a; +}" HAVE_VSX) + +if (NOT HAVE_VSX) + message(FATAL_ERROR "VSX support required for Power support") +endif () diff --git a/cmake/cflags-x86.cmake b/cmake/cflags-x86.cmake new file mode 100644 index 000000000..7b9cbf81a --- /dev/null +++ b/cmake/cflags-x86.cmake @@ -0,0 +1,133 @@ +option(BUILD_AVX512 "Enabling support for AVX512" OFF) +option(BUILD_AVX512VBMI "Enabling support for AVX512VBMI" OFF) + +set(SKYLAKE_FLAG "-march=skylake-avx512") +set(ICELAKE_FLAG "-march=icelake-server") + +if (NOT FAT_RUNTIME) + if (BUILD_AVX512VBMI) + message (STATUS "AVX512VBMI implies AVX512, enabling BUILD_AVX512") + set(BUILD_AVX512 ON) + set(ARCH_C_FLAGS "${ICELAKE_FLAG}") + set(ARCH_CXX_FLAGS "${ICELAKE_FLAG}") + endif () + if (BUILD_AVX512) + message (STATUS "AVX512 implies AVX2, enabling BUILD_AVX2") + set(BUILD_AVX2 ON) + set(ARCH_C_FLAGS "${SKYLAKE_FLAG}") + set(ARCH_CXX_FLAGS "${SKYLAKE_FLAG}") + endif () + if (BUILD_AVX2) + message (STATUS "Enabling BUILD_AVX2") + set(ARCH_C_FLAGS "-mavx2") + set(ARCH_CXX_FLAGS "-mavx2") + else() + set(ARCH_C_FLAGS "-msse4.2") + set(ARCH_CXX_FLAGS "-msse4.2") + endif() +else() + set(ARCH_C_FLAGS "-msse4.2") + set(ARCH_CXX_FLAGS "-msse4.2") +endif() + +set(CMAKE_REQUIRED_FLAGS "${ARCH_C_FLAGS}") +CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H) +CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H) +CHECK_INCLUDE_FILES(x86intrin.h HAVE_C_X86INTRIN_H) +CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H) + +if (HAVE_C_X86INTRIN_H) + set (INTRIN_INC_H "x86intrin.h") +elseif 
(HAVE_C_INTRIN_H) + set (INTRIN_INC_H "intrin.h") +else() + message (FATAL_ERROR "No intrinsics header found for SSE/AVX2/AVX512") +endif () + +if (BUILD_AVX512) + CHECK_C_COMPILER_FLAG(${SKYLAKE_FLAG} HAS_ARCH_SKYLAKE) + if (NOT HAS_ARCH_SKYLAKE) + message (FATAL_ERROR "AVX512 not supported by compiler") + endif () +endif () + +if (BUILD_AVX512VBMI) + CHECK_C_COMPILER_FLAG(${ICELAKE_FLAG} HAS_ARCH_ICELAKE) + if (NOT HAS_ARCH_ICELAKE) + message (FATAL_ERROR "AVX512VBMI not supported by compiler") + endif () +endif () + +# ensure we have the minimum of SSE4.2 - call a SSE4.2 intrinsic +CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> +int main() { + __m128i a = _mm_set1_epi8(1); + (void)_mm_shuffle_epi8(a, a); +}" HAVE_SSE42) + +# now look for AVX2 +set(CMAKE_REQUIRED_FLAGS "-mavx2") +CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> +#if !defined(__AVX2__) +#error no avx2 +#endif + +int main(){ + __m256i z = _mm256_setzero_si256(); + (void)_mm256_xor_si256(z, z); +}" HAVE_AVX2) + +# and now for AVX512 +set(CMAKE_REQUIRED_FLAGS "${SKYLAKE_FLAG}") +CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> +#if !defined(__AVX512BW__) +#error no avx512bw +#endif + +int main(){ + __m512i z = _mm512_setzero_si512(); + (void)_mm512_abs_epi8(z); +}" HAVE_AVX512) + +# and now for AVX512VBMI +set(CMAKE_REQUIRED_FLAGS "${ICELAKE_FLAG}") +CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> +#if !defined(__AVX512VBMI__) +#error no avx512vbmi +#endif + +int main(){ + __m512i a = _mm512_set1_epi8(0xFF); + __m512i idx = _mm512_set_epi64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); + (void)_mm512_permutexvar_epi8(idx, a); +}" HAVE_AVX512VBMI) + +if (FAT_RUNTIME) + if (NOT HAVE_SSE42) + message(FATAL_ERROR "SSE4.2 support required to build fat runtime") + endif () + if (BUILD_AVX2 AND NOT HAVE_AVX2) + message(FATAL_ERROR "AVX2 support required to build fat runtime") + endif () + if (BUILD_AVX512 AND NOT HAVE_AVX512) + message(FATAL_ERROR "AVX512 support requested but not supported") + endif () + if (BUILD_AVX512VBMI AND NOT HAVE_AVX512VBMI) + message(FATAL_ERROR "AVX512VBMI support requested but not supported") + endif () +else (NOT FAT_RUNTIME) + if (NOT BUILD_AVX2) + message(STATUS "Building without AVX2 support") + endif () + if (NOT HAVE_AVX512) + message(STATUS "Building without AVX512 support") + endif () + if (NOT HAVE_AVX512VBMI) + message(STATUS "Building without AVX512VBMI support") + endif () + if (NOT HAVE_SSE42) + message(FATAL_ERROR "A minimum of SSE4.2 compiler support is required") + endif () +endif () + + diff --git a/cmake/compiler.cmake b/cmake/compiler.cmake new file mode 100644 index 000000000..4b174c722 --- /dev/null +++ b/cmake/compiler.cmake @@ -0,0 +1,19 @@ +# determine compiler +if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + set(CMAKE_COMPILER_IS_CLANG TRUE) + set(CLANGCXX_MINVER "5") + message(STATUS "clang++ version ${CMAKE_CXX_COMPILER_VERSION}") + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS CLANGCXX_MINVER) + message(FATAL_ERROR "A minimum of clang++ ${CLANGCXX_MINVER} is required for C++17 support") + endif() +endif() + +# compiler version checks TODO: test more compilers +if (CMAKE_COMPILER_IS_GNUCXX) + set(GNUCXX_MINVER "9") + message(STATUS "g++ version ${CMAKE_CXX_COMPILER_VERSION}") + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS GNUCXX_MINVER) + message(FATAL_ERROR "A minimum of g++ ${GNUCXX_MINVER} is required for C++17 support") + endif() +endif() + diff --git a/cmake/config.h.in b/cmake/config.h.in index 203f0afde..dbd72445c 100644 --- a/cmake/config.h.in +++ 
b/cmake/config.h.in @@ -15,15 +15,42 @@ /* "Define if building for EM64T" */ #cmakedefine ARCH_X86_64 +/* "Define if building for ARM32" */ +#cmakedefine ARCH_ARM32 + +/* "Define if building for AARCH64" */ +#cmakedefine ARCH_AARCH64 + +/* "Define if building for PPC64EL" */ +#cmakedefine ARCH_PPC64EL + +/* "Define if cross compiling for AARCH64" */ +#cmakedefine CROSS_COMPILE_AARCH64 + +/* Define if building SVE for AARCH64. */ +#cmakedefine BUILD_SVE + +/* Define if building SVE2 for AARCH64. */ +#cmakedefine BUILD_SVE2 + +/* Define if building SVE2+BITPERM for AARCH64. */ +#cmakedefine BUILD_SVE2_BITPERM + /* internal build, switch on dump support. */ #cmakedefine DUMP_SUPPORT /* Define if building "fat" runtime. */ #cmakedefine FAT_RUNTIME +/* Define if building AVX2 in the fat runtime. */ +#cmakedefine BUILD_AVX2 + /* Define if building AVX-512 in the fat runtime. */ #cmakedefine BUILD_AVX512 +/* Define if building AVX512VBMI in the fat runtime. */ +#cmakedefine BUILD_AVX512VBMI + /* Define to 1 if `backtrace' works. */ #cmakedefine HAVE_BACKTRACE @@ -45,6 +72,15 @@ /* C compiler has intrin.h */ #cmakedefine HAVE_C_INTRIN_H +/* C compiler has arm_neon.h */ +#cmakedefine HAVE_C_ARM_NEON_H + +/* C compiler has arm_sve.h */ +#cmakedefine HAVE_C_ARM_SVE_H + +/* C compiler has arm_neon.h */ +#cmakedefine HAVE_C_PPC64EL_ALTIVEC_H + /* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to 0 if you don't. */ #cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP diff --git a/cmake/formatdate.py b/cmake/formatdate.py index 1b9c62d2b..b9845687b 100755 --- a/cmake/formatdate.py +++ b/cmake/formatdate.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -from __future__ import print_function + import os import sys import datetime diff --git a/cmake/osdetection.cmake b/cmake/osdetection.cmake new file mode 100644 index 000000000..343e16b50 --- /dev/null +++ b/cmake/osdetection.cmake @@ -0,0 +1,38 @@ +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + set(LINUX TRUE) +endif(CMAKE_SYSTEM_NAME MATCHES "Linux") + +if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") + set(FREEBSD true) +endif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") + +option(FAT_RUNTIME "Build a library that supports multiple microarchitectures" OFF) +if (FAT_RUNTIME) + message("Checking Fat Runtime Requirements...") + if (NOT LINUX) + message(FATAL_ERROR "Fat runtime is only supported on Linux OS") + else() + if (USE_CPU_NATIVE AND FAT_RUNTIME) + message(FATAL_ERROR "Fat runtime is not compatible with Native CPU detection") + endif() + + if (NOT (ARCH_IA32 OR ARCH_X86_64 OR ARCH_AARCH64)) + message(FATAL_ERROR "Fat runtime is only supported on Intel and Aarch64 architectures") + else() + message(STATUS "Building Fat runtime for multiple microarchitectures") + message(STATUS "generator is ${CMAKE_GENERATOR}") + if (NOT (CMAKE_GENERATOR MATCHES "Unix Makefiles" OR + (CMAKE_VERSION VERSION_GREATER "3.0" AND CMAKE_GENERATOR MATCHES "Ninja"))) + message (FATAL_ERROR "Building the fat runtime requires the Unix Makefiles generator, or Ninja with CMake v3.0 or higher") + else() + include (${CMAKE_MODULE_PATH}/attrib.cmake) + if (NOT HAS_C_ATTR_IFUNC) + message(FATAL_ERROR "Compiler does not support ifunc attribute, cannot build fat runtime") + endif() + endif() + endif() + endif() + if (NOT RELEASE_BUILD) + message(FATAL_ERROR "Fat runtime is only built on Release builds") + endif() +endif () diff --git a/cmake/platform.cmake b/cmake/platform.cmake index 593c544b5..30f6da92d 100644 --- a/cmake/platform.cmake +++ b/cmake/platform.cmake @@ -1,9 +1,12 @@ # determine the 
target arch - # really only interested in the preprocessor here -CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_64_BIT) - -CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_32_BIT) - -set(ARCH_X86_64 ${ARCH_64_BIT}) -set(ARCH_IA32 ${ARCH_32_BIT}) +CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_X86_64) +CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32) +CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_A64)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64) +CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_ARM)\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32) +CHECK_C_SOURCE_COMPILES("#if !defined(__PPC64__) && !(defined(__LITTLE_ENDIAN__) && defined(__VSX__))\n#error not ppc64el\n#endif\nint main(void) { return 0; }" ARCH_PPC64EL) +if (ARCH_X86_64 OR ARCH_AARCH64 OR ARCH_PPC64EL) + set(ARCH_64_BIT TRUE) +else() + set(ARCH_32_BIT TRUE) +endif() diff --git a/cmake/ragel.cmake b/cmake/ragel.cmake index d3f0b9269..3697195b6 100644 --- a/cmake/ragel.cmake +++ b/cmake/ragel.cmake @@ -7,7 +7,7 @@ function(ragelmaker src_rl) add_custom_command( OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${src_dir}/${src_file}.cpp COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/${src_dir} - COMMAND ${RAGEL} ${CMAKE_CURRENT_SOURCE_DIR}/${src_rl} -o ${rl_out} + COMMAND ${RAGEL} ${CMAKE_CURRENT_SOURCE_DIR}/${src_rl} -o ${rl_out} -G0 DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${src_rl} ) add_custom_target(ragel_${src_file} DEPENDS ${rl_out}) diff --git a/cmake/sanitize.cmake b/cmake/sanitize.cmake new file mode 100644 index 000000000..2c1ce0685 --- /dev/null +++ b/cmake/sanitize.cmake @@ -0,0 +1,40 @@ +# Possible values: +# - `address` (ASan) +# - `memory` (MSan) +# - `undefined` (UBSan) +# - "" (no sanitizing) +option (SANITIZE "Enable one of the code sanitizers" "") + +set (SAN_FLAGS "${SAN_FLAGS} -g -fno-omit-frame-pointer -DSANITIZER") + +if (SANITIZE) + if (SANITIZE STREQUAL "address") + set (ASAN_FLAGS "-fsanitize=address -fsanitize-address-use-after-scope") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} ${ASAN_FLAGS}") + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} ${ASAN_FLAGS}") + + if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ASAN_FLAGS}") + endif() + + elseif (SANITIZE STREQUAL "memory") + if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + set (FATAL_ERROR "GCC does not have memory sanitizer") + endif() + # MemorySanitizer flags are set according to the official documentation: + # https://clang.llvm.org/docs/MemorySanitizer.html#usage + set (MSAN_FLAGS "-fsanitize=memory -fsanitize-memory-use-after-dtor -fsanitize-memory-track-origins -fno-optimize-sibling-calls") + + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} ${MSAN_FLAGS}") + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} ${MSAN_FLAGS}") + elseif (SANITIZE STREQUAL "undefined") + set (UBSAN_FLAGS "-fsanitize=undefined") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} ${UBSAN_FLAGS}") + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} ${UBSAN_FLAGS}") + if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=undefined") + endif() 
+ else () + message (FATAL_ERROR "Unknown sanitizer type: ${SANITIZE}") + endif () +endif() diff --git a/cmake/simde.cmake b/cmake/simde.cmake new file mode 100644 index 000000000..0ac52832f --- /dev/null +++ b/cmake/simde.cmake @@ -0,0 +1,21 @@ +LIST(APPEND CMAKE_REQUIRED_INCLUDES ${PROJECT_SOURCE_DIR}/simde) + +CHECK_INCLUDE_FILES(simde/x86/sse4.2.h SIMDE_SSE42_H_FOUND) + +if (SIMDE_SSE42_H_FOUND) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DVS_SIMDE_BACKEND") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_BACKEND") + include_directories(${PROJECT_SOURCE_DIR}/simde) + + if (CMAKE_COMPILER_IS_CLANG) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DSIMDE_NO_CHECK_IMMEDIATE_CONSTANT") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSIMDE_NO_CHECK_IMMEDIATE_CONSTANT") + endif() + + if (SIMDE_NATIVE) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DVS_SIMDE_NATIVE -DSIMDE_ENABLE_OPENMP -fopenmp-simd") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_NATIVE -DSIMDE_ENABLE_OPENMP -fopenmp-simd") + endif() +else() + message(FATAL_ERROR "SIMDe backend requested but SIMDe is not available on the system") +endif() diff --git a/cmake/sqlite3.cmake b/cmake/sqlite3.cmake index a58362da7..92b18ce19 100644 --- a/cmake/sqlite3.cmake +++ b/cmake/sqlite3.cmake @@ -4,35 +4,21 @@ option(SQLITE_PREFER_STATIC "Build sqlite3 statically instead of using an installed lib" OFF) -if(NOT WIN32 AND NOT SQLITE_PREFER_STATIC) +if(NOT SQLITE_PREFER_STATIC) find_package(PkgConfig QUIET) # first check for sqlite on the system pkg_check_modules(SQLITE3 sqlite3) endif() -if (NOT SQLITE3_FOUND) - message(STATUS "looking for sqlite3 in source tree") - # look in the source tree - if (EXISTS "${PROJECT_SOURCE_DIR}/sqlite3/sqlite3.h" AND - EXISTS "${PROJECT_SOURCE_DIR}/sqlite3/sqlite3.c") - message(STATUS " found sqlite3 in source tree") - set(SQLITE3_FOUND TRUE) - set(SQLITE3_BUILD_SOURCE TRUE) - set(SQLITE3_INCLUDE_DIRS "${PROJECT_SOURCE_DIR}/sqlite3") - set(SQLITE3_LDFLAGS sqlite3_static) - else() - message(STATUS " no sqlite3 in source tree") - endif() -endif() - # now do version checks if (SQLITE3_FOUND) list(INSERT CMAKE_REQUIRED_INCLUDES 0 "${SQLITE3_INCLUDE_DIRS}") - CHECK_C_SOURCE_COMPILES("#include \n#if SQLITE_VERSION_NUMBER >= 3008007 && SQLITE_VERSION_NUMBER < 3008010\n#error broken sqlite\n#endif\nint main() {return 0;}" SQLITE_VERSION_OK) - if (NOT SQLITE_VERSION_OK) + if (SQLITE_VERSION LESS "3.8.10") message(FATAL_ERROR "sqlite3 is broken from 3.8.7 to 3.8.10 - please find a working version") endif() +endif() + if (NOT SQLITE3_BUILD_SOURCE) set(_SAVED_FLAGS ${CMAKE_REQUIRED_FLAGS}) list(INSERT CMAKE_REQUIRED_LIBRARIES 0 ${SQLITE3_LDFLAGS}) @@ -43,11 +29,8 @@ else() if (NOT TARGET sqlite3_static) # build sqlite as a static lib to compile into our test programs add_library(sqlite3_static STATIC "${PROJECT_SOURCE_DIR}/sqlite3/sqlite3.c") - if (NOT WIN32) - set_target_properties(sqlite3_static PROPERTIES COMPILE_FLAGS "-Wno-error -Wno-extra -Wno-unused -Wno-cast-qual -DSQLITE_OMIT_LOAD_EXTENSION") - endif() + set_target_properties(sqlite3_static PROPERTIES COMPILE_FLAGS "-Wno-error -Wno-extra -Wno-unused -Wno-cast-qual -DSQLITE_OMIT_LOAD_EXTENSION") endif() endif() -endif() # that's enough about sqlite diff --git a/doc/dev-reference/chimera.rst b/doc/dev-reference/chimera.rst index 883cb5a0a..d35b116f5 100644 --- a/doc/dev-reference/chimera.rst +++ b/doc/dev-reference/chimera.rst @@ -212,7 +212,7 @@ space is required for that context. 
In the absence of recursive scanning, only one such space is required per thread and can (and indeed should) be allocated before data scanning is to commence. -In a scenario where a set of expressions are compiled by a single "master" +In a scenario where a set of expressions are compiled by a single "main" thread and data will be scanned by multiple "worker" threads, the convenience function :c:func:`ch_clone_scratch` allows multiple copies of an existing scratch space to be made for each thread (rather than forcing the caller to pass diff --git a/doc/dev-reference/compilation.rst b/doc/dev-reference/compilation.rst index 205b7348b..6f5541ecf 100644 --- a/doc/dev-reference/compilation.rst +++ b/doc/dev-reference/compilation.rst @@ -64,21 +64,21 @@ interpreted independently. No syntax association happens between any adjacent characters. For example, given an expression written as :regexp:`/bc?/`. We could say it is -a regluar expression, with the meaning that character ``b`` followed by nothing +a regular expression, with the meaning that character ``b`` followed by nothing or by one character ``c``. On the other view, we could also say it is a pure literal expression, with the meaning that this is a character sequence of 3-byte length, containing characters ``b``, ``c`` and ``?``. In regular case, the question mark character ``?`` has a particular syntax role called 0-1 quantifier, -which has an syntax association with the character ahead of it. Similar -characters exist in regular grammer like ``[``, ``]``, ``(``, ``)``, ``{``, +which has a syntax association with the character ahead of it. Similar +characters exist in regular grammar like ``[``, ``]``, ``(``, ``)``, ``{``, ``}``, ``-``, ``*``, ``+``, ``\``, ``|``, ``/``, ``:``, ``^``, ``.``, ``$``. While in pure literal case, all these meta characters lost extra meanings expect for that they are just common ASCII codes. Hyperscan is initially designed to process common regular expressions. It is -hence embedded with a complex parser to do comprehensive regular grammer -interpretion. Particularly, the identification of above meta characters is the -basic step for the interpretion of far more complex regular grammers. +hence embedded with a complex parser to do comprehensive regular grammar +interpretation. Particularly, the identification of above meta characters is the +basic step for the interpretation of far more complex regular grammars. However in real cases, patterns may not always be regular expressions. They could just be pure literals. Problem will come if the pure literals contain @@ -165,7 +165,7 @@ The following regex constructs are supported by Hyperscan: :regexp:`{n,}` are supported with limitations. * For arbitrary repeated sub-patterns: *n* and *m* should be either small - or infinite, e.g. :regexp:`(a|b}{4}`, :regexp:`(ab?c?d){4,10}` or + or infinite, e.g. :regexp:`(a|b){4}`, :regexp:`(ab?c?d){4,10}` or :regexp:`(ab(cd)*){6,}`. * For single-character width sub-patterns such as :regexp:`[^\\a]` or diff --git a/doc/dev-reference/getting_started.rst b/doc/dev-reference/getting_started.rst index b38128733..aaff15ba2 100644 --- a/doc/dev-reference/getting_started.rst +++ b/doc/dev-reference/getting_started.rst @@ -263,17 +263,19 @@ the current platform is supported by Hyperscan. 
As of this release, the variants of the runtime that are built, and the CPU capability that is required, are the following: -+----------+-------------------------------+---------------------------+ -| Variant | CPU Feature Flag(s) Required | gcc arch flag | -+==========+===============================+===========================+ -| Core 2 | ``SSSE3`` | ``-march=core2`` | -+----------+-------------------------------+---------------------------+ -| Core i7 | ``SSE4_2`` and ``POPCNT`` | ``-march=corei7`` | -+----------+-------------------------------+---------------------------+ -| AVX 2 | ``AVX2`` | ``-march=core-avx2`` | -+----------+-------------------------------+---------------------------+ -| AVX 512 | ``AVX512BW`` (see note below) | ``-march=skylake-avx512`` | -+----------+-------------------------------+---------------------------+ ++--------------+---------------------------------+---------------------------+ +| Variant | CPU Feature Flag(s) Required | gcc arch flag | ++==============+=================================+===========================+ +| Core 2 | ``SSSE3`` | ``-march=core2`` | ++--------------+---------------------------------+---------------------------+ +| Core i7 | ``SSE4_2`` and ``POPCNT`` | ``-march=corei7`` | ++--------------+---------------------------------+---------------------------+ +| AVX 2 | ``AVX2`` | ``-march=core-avx2`` | ++--------------+---------------------------------+---------------------------+ +| AVX 512 | ``AVX512BW`` (see note below) | ``-march=skylake-avx512`` | ++--------------+---------------------------------+---------------------------+ +| AVX 512 VBMI | ``AVX512VBMI`` (see note below) | ``-march=icelake-server`` | ++--------------+---------------------------------+---------------------------+ .. note:: @@ -287,6 +289,16 @@ capability that is required, are the following: cmake -DBUILD_AVX512=on <...> + Hyperscan v5.3 adds support for AVX512VBMI instructions - in particular the + ``AVX512VBMI`` instruction set that was introduced on Intel "Icelake" Xeon + processors - however the AVX512VBMI runtime variant is **not** enabled by + default in fat runtime builds as not all toolchains support AVX512VBMI + instruction sets. To build an AVX512VBMI runtime, the CMake variable + ``BUILD_AVX512VBMI`` must be enabled manually during configuration. For + example: :: + + cmake -DBUILD_AVX512VBMI=on <...> + As the fat runtime requires compiler, libc, and binutils support, at this time it will only be enabled for Linux builds where the compiler supports the `indirect function "ifunc" function attribute diff --git a/doc/dev-reference/runtime.rst b/doc/dev-reference/runtime.rst index d64ec540d..396521c94 100644 --- a/doc/dev-reference/runtime.rst +++ b/doc/dev-reference/runtime.rst @@ -178,7 +178,7 @@ space is required for that context. In the absence of recursive scanning, only one such space is required per thread and can (and indeed should) be allocated before data scanning is to commence. 
-In a scenario where a set of expressions are compiled by a single "master" +In a scenario where a set of expressions are compiled by a single "main" thread and data will be scanned by multiple "worker" threads, the convenience function :c:func:`hs_clone_scratch` allows multiple copies of an existing scratch space to be made for each thread (rather than forcing the caller to pass diff --git a/examples/patbench.cc b/examples/patbench.cc index 20de5745e..8180d2a9d 100644 --- a/examples/patbench.cc +++ b/examples/patbench.cc @@ -112,6 +112,7 @@ * */ +#include #include #include #include @@ -151,6 +152,8 @@ using std::set; using std::min; using std::max; using std::copy; +using std::random_device; +using std::mt19937; enum Criterion { CRITERION_THROUGHPUT, @@ -731,7 +734,9 @@ int main(int argc, char **argv) { count++; cout << "." << std::flush; vector sv(s.begin(), s.end()); - random_shuffle(sv.begin(), sv.end()); + random_device rng; + mt19937 urng(rng()); + shuffle(sv.begin(), sv.end(), urng); unsigned groups = factor_max + 1; for (unsigned current_group = 0; current_group < groups; current_group++) { diff --git a/libhs.pc.in b/libhs.pc.in index fed4db454..3ad2b90cc 100644 --- a/libhs.pc.in +++ b/libhs.pc.in @@ -7,5 +7,4 @@ Name: libhs Description: Intel(R) Hyperscan Library Version: @HS_VERSION@ Libs: -L${libdir} -lhs -Libs.private: @PRIVATE_LIBS@ Cflags: -I${includedir}/hs diff --git a/simde b/simde new file mode 160000 index 000000000..aae22459f --- /dev/null +++ b/simde @@ -0,0 +1 @@ +Subproject commit aae22459fa284e9fc2b7d4b8e4571afa0418125f diff --git a/src/compiler/asserts.cpp b/src/compiler/asserts.cpp index 444422260..51a052b04 100644 --- a/src/compiler/asserts.cpp +++ b/src/compiler/asserts.cpp @@ -231,7 +231,7 @@ void checkForMultilineStart(ReportManager &rm, NGHolder &g, * required so that ^ doesn't match trailing \n */ for (const auto &e : out_edges_range(v, g)) { if (target(e, g) == g.accept) { - dead.push_back(e); + dead.emplace_back(e); } } /* assert has been resolved; clear flag */ diff --git a/src/compiler/compiler.cpp b/src/compiler/compiler.cpp index 666eefc9c..35f46b3fe 100644 --- a/src/compiler/compiler.cpp +++ b/src/compiler/compiler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -323,7 +323,8 @@ void addExpression(NG &ng, unsigned index, const char *expression, } // Ensure that our pattern isn't too long (in characters). - if (strlen(expression) > cc.grey.limitPatternLength) { + size_t maxlen = cc.grey.limitPatternLength + 1; + if (strnlen(expression, maxlen) >= maxlen) { throw CompileError("Pattern length exceeds limit."); } @@ -416,6 +417,10 @@ void addLitExpression(NG &ng, unsigned index, const char *expression, "HS_FLAG_SOM_LEFTMOST are supported in literal API."); } + if (!strcmp(expression, "")) { + throw CompileError("Pure literal API doesn't support empty string."); + } + // This expression must be a pure literal, we can build ue2_literal // directly based on expression text. 
ParsedLitExpression ple(index, expression, expLength, flags, id); @@ -458,6 +463,9 @@ platform_t target_to_platform(const target_t &target_info) { if (!target_info.has_avx512()) { p |= HS_PLATFORM_NOAVX512; } + if (!target_info.has_avx512vbmi()) { + p |= HS_PLATFORM_NOAVX512VBMI; + } return p; } diff --git a/src/crc32.c b/src/crc32.c index 1dae47b4e..19c7b7fa9 100644 --- a/src/crc32.c +++ b/src/crc32.c @@ -30,7 +30,6 @@ #include "config.h" #include "ue2common.h" #include "util/arch.h" -#include "util/intrinsics.h" #if !defined(HAVE_SSE42) @@ -579,53 +578,7 @@ u32 crc32c_sb8_64_bit(u32 running_crc, const unsigned char* p_buf, } #else // HAVE_SSE42 - -#ifdef ARCH_64_BIT -#define CRC_WORD 8 -#define CRC_TYPE u64a -#define CRC_FUNC _mm_crc32_u64 -#else -#define CRC_WORD 4 -#define CRC_TYPE u32 -#define CRC_FUNC _mm_crc32_u32 -#endif - -/* - * Use the crc32 instruction from SSE4.2 to compute our checksum - same - * polynomial as the above function. - */ -static really_inline -u32 crc32c_sse42(u32 running_crc, const unsigned char* p_buf, - const size_t length) { - u32 crc = running_crc; - - // Process byte-by-byte until p_buf is aligned - - const unsigned char *aligned_buf = ROUNDUP_PTR(p_buf, CRC_WORD); - size_t init_bytes = aligned_buf - p_buf; - size_t running_length = ((length - init_bytes)/CRC_WORD)*CRC_WORD; - size_t end_bytes = length - init_bytes - running_length; - - while (p_buf < aligned_buf) { - crc = _mm_crc32_u8(crc, *p_buf++); - } - - // Main aligned loop, processes a word at a time. - - for (size_t li = 0; li < running_length/CRC_WORD; li++) { - CRC_TYPE block = *(const CRC_TYPE *)p_buf; - crc = CRC_FUNC(crc, block); - p_buf += CRC_WORD; - } - - // Remaining bytes - - for(size_t li = 0; li < end_bytes; li++) { - crc = _mm_crc32_u8(crc, *p_buf++); - } - - return crc; -} +#include "util/arch/x86/crc32.h" #endif #ifdef VERIFY_ASSERTION diff --git a/src/database.c b/src/database.c index 1a79800e2..62e0b5e3f 100644 --- a/src/database.c +++ b/src/database.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -115,7 +115,8 @@ static hs_error_t db_check_platform(const u64a p) { if (p != hs_current_platform && p != (hs_current_platform | hs_current_platform_no_avx2) - && p != (hs_current_platform | hs_current_platform_no_avx512)) { + && p != (hs_current_platform | hs_current_platform_no_avx512) + && p != (hs_current_platform | hs_current_platform_no_avx512vbmi)) { return HS_DB_PLATFORM_ERROR; } // passed all checks @@ -352,12 +353,6 @@ hs_error_t dbIsValid(const hs_database_t *db) { return HS_SUCCESS; } -#if defined(_WIN32) -#define SNPRINTF_COMPAT _snprintf -#else -#define SNPRINTF_COMPAT snprintf -#endif - /** Allocate a buffer and prints the database info into it. Returns an * appropriate error code on failure, or HS_SUCCESS on success. */ static @@ -370,9 +365,11 @@ hs_error_t print_database_string(char **s, u32 version, const platform_t plat, u8 minor = (version >> 16) & 0xff; u8 major = (version >> 24) & 0xff; - const char *features = (plat & HS_PLATFORM_NOAVX512) - ? (plat & HS_PLATFORM_NOAVX2) ? "" : "AVX2" - : "AVX512"; + const char *features = (plat & HS_PLATFORM_NOAVX512VBMI) + ? (plat & HS_PLATFORM_NOAVX512) + ? (plat & HS_PLATFORM_NOAVX2) ? 
"" : "AVX2" + : "AVX512" + : "AVX512VBMI"; const char *mode = NULL; @@ -397,9 +394,7 @@ hs_error_t print_database_string(char **s, u32 version, const platform_t plat, return ret; } - // Note: SNPRINTF_COMPAT is a macro defined above, to cope with systems - // that don't have snprintf but have a workalike. - int p_len = SNPRINTF_COMPAT( + int p_len = snprintf( buf, len, "Version: %u.%u.%u Features: %s Mode: %s", major, minor, release, features, mode); if (p_len < 0) { diff --git a/src/database.h b/src/database.h index 5715ed677..a4d6e4dca 100644 --- a/src/database.h +++ b/src/database.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -51,10 +51,12 @@ extern "C" // CPU type is the low 6 bits (we can't need more than 64, surely!) #define HS_PLATFORM_INTEL 1 +#define HS_PLATFORM_ARM 2 #define HS_PLATFORM_CPU_MASK 0x3F #define HS_PLATFORM_NOAVX2 (4<<13) #define HS_PLATFORM_NOAVX512 (8<<13) +#define HS_PLATFORM_NOAVX512VBMI (0x10<<13) /** \brief Platform features bitmask. */ typedef u64a platform_t; @@ -66,6 +68,9 @@ const platform_t hs_current_platform = { #endif #if !defined(HAVE_AVX512) HS_PLATFORM_NOAVX512 | +#endif +#if !defined(HAVE_AVX512VBMI) + HS_PLATFORM_NOAVX512VBMI | #endif 0, }; @@ -74,12 +79,20 @@ static UNUSED const platform_t hs_current_platform_no_avx2 = { HS_PLATFORM_NOAVX2 | HS_PLATFORM_NOAVX512 | + HS_PLATFORM_NOAVX512VBMI | 0, }; static UNUSED const platform_t hs_current_platform_no_avx512 = { HS_PLATFORM_NOAVX512 | + HS_PLATFORM_NOAVX512VBMI | + 0, +}; + +static UNUSED +const platform_t hs_current_platform_no_avx512vbmi = { + HS_PLATFORM_NOAVX512VBMI | 0, }; diff --git a/src/dispatcher.c b/src/dispatcher.c index a786b806d..a817e7441 100644 --- a/src/dispatcher.c +++ b/src/dispatcher.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,7 +30,8 @@ #include "hs_common.h" #include "hs_runtime.h" #include "ue2common.h" -#include "util/cpuid_inline.h" +#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#include "util/arch/x86/cpuid_inline.h" #include "util/join.h" #if defined(DISABLE_AVX512_DISPATCH) @@ -38,8 +39,14 @@ #define check_avx512() (0) #endif +#if defined(DISABLE_AVX512VBMI_DISPATCH) +#define avx512vbmi_ disabled_ +#define check_avx512vbmi() (0) +#endif + #define CREATE_DISPATCH(RTYPE, NAME, ...) \ /* create defns */ \ + RTYPE JOIN(avx512vbmi_, NAME)(__VA_ARGS__); \ RTYPE JOIN(avx512_, NAME)(__VA_ARGS__); \ RTYPE JOIN(avx2_, NAME)(__VA_ARGS__); \ RTYPE JOIN(corei7_, NAME)(__VA_ARGS__); \ @@ -52,6 +59,9 @@ \ /* resolver */ \ static RTYPE (*JOIN(resolve_, NAME)(void))(__VA_ARGS__) { \ + if (check_avx512vbmi()) { \ + return JOIN(avx512vbmi_, NAME); \ + } \ if (check_avx512()) { \ return JOIN(avx512_, NAME); \ } \ @@ -72,6 +82,46 @@ HS_PUBLIC_API \ RTYPE NAME(__VA_ARGS__) __attribute__((ifunc("resolve_" #NAME))) +#elif defined(ARCH_AARCH64) +#include "util/arch/arm/cpuid_inline.h" +#include "util/join.h" + +#define CREATE_DISPATCH(RTYPE, NAME, ...) 
\ + /* create defns */ \ + RTYPE JOIN(sve2_, NAME)(__VA_ARGS__); \ + RTYPE JOIN(sve_, NAME)(__VA_ARGS__); \ + RTYPE JOIN(neon_, NAME)(__VA_ARGS__); \ + \ + /* error func */ \ + static inline RTYPE JOIN(error_, NAME)(__VA_ARGS__) { \ + return (RTYPE)HS_ARCH_ERROR; \ + } \ + \ + /* resolver */ \ + static RTYPE (*JOIN(resolve_, NAME)(void))(__VA_ARGS__) { \ + if (check_sve2()) { \ + return JOIN(sve2_, NAME); \ + } \ + if (check_sve()) { \ + return JOIN(sve_, NAME); \ + } \ + if (check_neon()) { \ + return JOIN(neon_, NAME); \ + } \ + /* anything else is fail */ \ + return JOIN(error_, NAME); \ + } \ + \ + /* function */ \ + HS_PUBLIC_API \ + RTYPE NAME(__VA_ARGS__) __attribute__((ifunc("resolve_" #NAME))) + +#endif + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-function" CREATE_DISPATCH(hs_error_t, hs_scan, const hs_database_t *db, const char *data, unsigned length, unsigned flags, hs_scratch_t *scratch, match_event_handler onEvent, void *userCtx); @@ -140,3 +190,6 @@ CREATE_DISPATCH(hs_error_t, hs_reset_and_expand_stream, hs_stream_t *to_stream, /** INTERNALS **/ CREATE_DISPATCH(u32, Crc32c_ComputeBuf, u32 inCrc32, const void *buf, size_t bufLen); + +#pragma GCC diagnostic pop +#pragma GCC diagnostic pop diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index d33756d35..561e8f986 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -36,6 +36,7 @@ #include "teddy.h" #include "teddy_internal.h" #include "util/arch.h" +#include "util/bitutils.h" #include "util/simd_utils.h" #include "util/uniform_ops.h" @@ -119,20 +120,6 @@ const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } }; -/* compilers don't reliably synthesize the 32-bit ANDN instruction here, - * so we force its generation. - */ -static really_inline -u64a andn(const u32 a, const u8 *b) { - u64a r; -#if defined(HAVE_BMI) && !defined(NO_ASM) - __asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b)); -#else - r = unaligned_load_u32(b) & ~a; -#endif - return r; -} - /* generates an initial state mask based on the last byte-ish of history rather * than being all accepting. 
If there is no history to consider, the state is * generated based on the minimum length of each bucket in order to prevent @@ -160,33 +147,43 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { /* +1: the zones ensure that we can read the byte at z->end */ assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); - u64a reach0 = andn(domain_mask_flipped, itPtr); - u64a reach1 = andn(domain_mask_flipped, itPtr + 1); - u64a reach2 = andn(domain_mask_flipped, itPtr + 2); - u64a reach3 = andn(domain_mask_flipped, itPtr + 3); - - m128 st0 = load_m128_from_u64a(ft + reach0); - m128 st1 = load_m128_from_u64a(ft + reach1); - m128 st2 = load_m128_from_u64a(ft + reach2); - m128 st3 = load_m128_from_u64a(ft + reach3); - - u64a reach4 = andn(domain_mask_flipped, itPtr + 4); - u64a reach5 = andn(domain_mask_flipped, itPtr + 5); - u64a reach6 = andn(domain_mask_flipped, itPtr + 6); - u64a reach7 = andn(domain_mask_flipped, itPtr + 7); - - m128 st4 = load_m128_from_u64a(ft + reach4); - m128 st5 = load_m128_from_u64a(ft + reach5); - m128 st6 = load_m128_from_u64a(ft + reach6); - m128 st7 = load_m128_from_u64a(ft + reach7); - - st1 = lshiftbyte_m128(st1, 1); - st2 = lshiftbyte_m128(st2, 2); - st3 = lshiftbyte_m128(st3, 3); - st4 = lshiftbyte_m128(st4, 4); - st5 = lshiftbyte_m128(st5, 5); - st6 = lshiftbyte_m128(st6, 6); - st7 = lshiftbyte_m128(st7, 7); + u64a domain_mask = ~domain_mask_flipped; + + u64a it_hi = *(const u64a *)itPtr; + u64a it_lo = *(const u64a *)(itPtr + 8); + u64a reach0 = domain_mask & it_hi; + u64a reach1 = domain_mask & (it_hi >> 8); + u64a reach2 = domain_mask & (it_hi >> 16); + u64a reach3 = domain_mask & (it_hi >> 24); + u64a reach4 = domain_mask & (it_hi >> 32); + u64a reach5 = domain_mask & (it_hi >> 40); + u64a reach6 = domain_mask & (it_hi >> 48); + u64a reach7 = domain_mask & ((it_hi >> 56) | (it_lo << 8)); + u64a reach8 = domain_mask & it_lo; + u64a reach9 = domain_mask & (it_lo >> 8); + u64a reach10 = domain_mask & (it_lo >> 16); + u64a reach11 = domain_mask & (it_lo >> 24); + u64a reach12 = domain_mask & (it_lo >> 32); + u64a reach13 = domain_mask & (it_lo >> 40); + u64a reach14 = domain_mask & (it_lo >> 48); + u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15); + + m128 st0 = load_m128_from_u64a(ft + reach0); + m128 st1 = lshiftbyte_m128(load_m128_from_u64a(ft + reach1), 1); + m128 st2 = lshiftbyte_m128(load_m128_from_u64a(ft + reach2), 2); + m128 st3 = lshiftbyte_m128(load_m128_from_u64a(ft + reach3), 3); + m128 st4 = lshiftbyte_m128(load_m128_from_u64a(ft + reach4), 4); + m128 st5 = lshiftbyte_m128(load_m128_from_u64a(ft + reach5), 5); + m128 st6 = lshiftbyte_m128(load_m128_from_u64a(ft + reach6), 6); + m128 st7 = lshiftbyte_m128(load_m128_from_u64a(ft + reach7), 7); + m128 st8 = load_m128_from_u64a(ft + reach8); + m128 st9 = lshiftbyte_m128(load_m128_from_u64a(ft + reach9), 1); + m128 st10 = lshiftbyte_m128(load_m128_from_u64a(ft + reach10), 2); + m128 st11 = lshiftbyte_m128(load_m128_from_u64a(ft + reach11), 3); + m128 st12 = lshiftbyte_m128(load_m128_from_u64a(ft + reach12), 4); + m128 st13 = lshiftbyte_m128(load_m128_from_u64a(ft + reach13), 5); + m128 st14 = lshiftbyte_m128(load_m128_from_u64a(ft + reach14), 6); + m128 st15 = lshiftbyte_m128(load_m128_from_u64a(ft + reach15), 7); st0 = or128(st0, st1); st2 = or128(st2, st3); @@ -195,39 +192,6 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, st0 = or128(st0, st2); st4 = or128(st4, st6); st0 = or128(st0, st4); - 
*s = or128(*s, st0); - - *conf0 = movq(*s); - *s = rshiftbyte_m128(*s, 8); - *conf0 ^= ~0ULL; - - u64a reach8 = andn(domain_mask_flipped, itPtr + 8); - u64a reach9 = andn(domain_mask_flipped, itPtr + 9); - u64a reach10 = andn(domain_mask_flipped, itPtr + 10); - u64a reach11 = andn(domain_mask_flipped, itPtr + 11); - - m128 st8 = load_m128_from_u64a(ft + reach8); - m128 st9 = load_m128_from_u64a(ft + reach9); - m128 st10 = load_m128_from_u64a(ft + reach10); - m128 st11 = load_m128_from_u64a(ft + reach11); - - u64a reach12 = andn(domain_mask_flipped, itPtr + 12); - u64a reach13 = andn(domain_mask_flipped, itPtr + 13); - u64a reach14 = andn(domain_mask_flipped, itPtr + 14); - u64a reach15 = andn(domain_mask_flipped, itPtr + 15); - - m128 st12 = load_m128_from_u64a(ft + reach12); - m128 st13 = load_m128_from_u64a(ft + reach13); - m128 st14 = load_m128_from_u64a(ft + reach14); - m128 st15 = load_m128_from_u64a(ft + reach15); - - st9 = lshiftbyte_m128(st9, 1); - st10 = lshiftbyte_m128(st10, 2); - st11 = lshiftbyte_m128(st11, 3); - st12 = lshiftbyte_m128(st12, 4); - st13 = lshiftbyte_m128(st13, 5); - st14 = lshiftbyte_m128(st14, 6); - st15 = lshiftbyte_m128(st15, 7); st8 = or128(st8, st9); st10 = or128(st10, st11); @@ -236,11 +200,14 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, st8 = or128(st8, st10); st12 = or128(st12, st14); st8 = or128(st8, st12); - *s = or128(*s, st8); - *conf8 = movq(*s); - *s = rshiftbyte_m128(*s, 8); - *conf8 ^= ~0ULL; + m128 st = or128(*s, st0); + *conf0 = movq(st) ^ ~0ULL; + st = rshiftbyte_m128(st, 8); + st = or128(st, st8); + + *conf8 = movq(st) ^ ~0ULL; + *s = rshiftbyte_m128(st, 8); } static really_inline @@ -248,6 +215,7 @@ void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr, UNUSED const u8 *end_ptr, u32 domain_mask_flipped, const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); + u64a reach0 = andn(domain_mask_flipped, itPtr); u64a reach2 = andn(domain_mask_flipped, itPtr + 2); u64a reach4 = andn(domain_mask_flipped, itPtr + 4); @@ -300,6 +268,7 @@ void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr, UNUSED const u8 *end_ptr, u32 domain_mask_flipped, const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); + u64a reach0 = andn(domain_mask_flipped, itPtr); u64a reach4 = andn(domain_mask_flipped, itPtr + 4); u64a reach8 = andn(domain_mask_flipped, itPtr + 8); @@ -696,6 +665,10 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, const u8 *tryFloodDetect = zz->floodPtr; \ const u8 *start_ptr = zz->start; \ const u8 *end_ptr = zz->end; \ + for (const u8 *itPtr = ROUNDDOWN_PTR(start_ptr, 64); itPtr + 4*ITER_BYTES <= end_ptr; \ + itPtr += 4*ITER_BYTES) { \ + __builtin_prefetch(itPtr); \ + } \ \ for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \ itPtr += ITER_BYTES) { \ @@ -739,6 +712,7 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr, assert(ISALIGNED_CL(confBase)); struct zone zones[ZONE_MAX]; assert(fdr->domain > 8 && fdr->domain < 16); + memset(zones, 0, sizeof(zones)); size_t numZone = prepareZones(a->buf, a->len, a->buf_history + a->len_history, diff --git a/src/fdr/fdr_compile.cpp b/src/fdr/fdr_compile.cpp index fcfc08638..d15e4537b 100644 --- a/src/fdr/fdr_compile.cpp +++ b/src/fdr/fdr_compile.cpp @@ -44,7 +44,6 @@ #include "util/compare.h" #include "util/container.h" #include "util/dump_mask.h" -#include "util/make_unique.h" #include "util/math.h" #include 
"util/noncopyable.h" #include "util/target_info.h" @@ -99,7 +98,7 @@ class FDRCompiler : noncopyable { const FDREngineDescription &eng_in, bool make_small_in, const Grey &grey_in) : eng(eng_in), grey(grey_in), tab(eng_in.getTabSizeBytes()), - lits(move(lits_in)), bucketToLits(move(bucketToLits_in)), + lits(std::move(lits_in)), bucketToLits(std::move(bucketToLits_in)), make_small(make_small_in) {} bytecode_ptr build(); @@ -494,18 +493,18 @@ map> assignStringsToBuckets( u32 cnt = last_id - first_id; // long literals first for included literals checking for (u32 k = 0; k < cnt; k++) { - litIds.push_back(last_id - k - 1); + litIds.emplace_back(last_id - k - 1); } i = j; - buckets.push_back(litIds); + buckets.emplace_back(litIds); } // reverse bucket id, longer literals come first map> bucketToLits; size_t bucketCnt = buckets.size(); for (size_t i = 0; i < bucketCnt; i++) { - bucketToLits.emplace(bucketCnt - i - 1, move(buckets[i])); + bucketToLits.emplace(bucketCnt - i - 1, std::move(buckets[i])); } return bucketToLits; @@ -868,7 +867,7 @@ unique_ptr fdrBuildProtoInternal(u8 engType, auto bucketToLits = assignStringsToBuckets(lits, *des); addIncludedInfo(lits, des->getNumBuckets(), bucketToLits); auto proto = - ue2::make_unique(engType, move(des), lits, bucketToLits, + std::make_unique(engType, std::move(des), lits, bucketToLits, make_small); return proto; } diff --git a/src/fdr/fdr_confirm_compile.cpp b/src/fdr/fdr_confirm_compile.cpp index 8e3690895..75b237b06 100644 --- a/src/fdr/fdr_confirm_compile.cpp +++ b/src/fdr/fdr_confirm_compile.cpp @@ -162,7 +162,7 @@ bytecode_ptr getFDRConfirm(const vector &lits, LitInfo & li = tmpLitInfo[i]; u32 hash = CONF_HASH_CALL(li.v, andmsk, mult, nBits); DEBUG_PRINTF("%016llx --> %u\n", li.v, hash); - res2lits[hash].push_back(i); + res2lits[hash].emplace_back(i); gm |= li.groups; } @@ -303,13 +303,13 @@ setupFullConfs(const vector &lits, if (contains(bucketToLits, b)) { vector vl; for (const LiteralIndex &lit_idx : bucketToLits.at(b)) { - vl.push_back(lits[lit_idx]); + vl.emplace_back(lits[lit_idx]); } DEBUG_PRINTF("b %d sz %zu\n", b, vl.size()); auto fc = getFDRConfirm(vl, make_small); totalConfirmSize += fc.size(); - bc2Conf.emplace(b, move(fc)); + bc2Conf.emplace(b, std::move(fc)); } } diff --git a/src/fdr/fdr_dump.cpp b/src/fdr/fdr_dump.cpp index f4cd1f44e..1dda751ac 100644 --- a/src/fdr/fdr_dump.cpp +++ b/src/fdr/fdr_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -107,6 +107,25 @@ void dumpTeddyReinforced(const u8 *rmsk, const u32 num_tables, FILE *f) { } } +static +void dumpTeddyDupMasks(const u8 *dmsk, u32 numMasks, FILE *f) { + // dump nibble masks + u32 maskWidth = 2; + fprintf(f, " dup nibble masks:\n"); + for (u32 i = 0; i < numMasks * 2; i++) { + fprintf(f, " -%d%s: ", 1 + i / 2, (i % 2) ? "hi" : "lo"); + for (u32 j = 0; j < 16 * maskWidth * 2; j++) { + u8 val = dmsk[i * 16 * maskWidth * 2 + j]; + for (u32 k = 0; k < 8; k++) { + fprintf(f, "%s", ((val >> k) & 0x1) ? 
"1" : "0"); + } + fprintf(f, " "); + } + fprintf(f, "\n"); + } + fprintf(f, "\n"); +} + static void dumpTeddyMasks(const u8 *baseMsk, u32 numMasks, u32 maskWidth, FILE *f) { // dump nibble masks @@ -146,12 +165,17 @@ void dumpTeddy(const Teddy *teddy, FILE *f) { u32 maskWidth = des->getNumBuckets() / 8; size_t headerSize = sizeof(Teddy); - size_t maskLen = des->numMasks * 16 * 2 * maskWidth; const u8 *teddy_base = (const u8 *)teddy; const u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize); - const u8 *rmsk = baseMsk + ROUNDUP_CL(maskLen); dumpTeddyMasks(baseMsk, des->numMasks, maskWidth, f); - dumpTeddyReinforced(rmsk, maskWidth, f); + size_t maskLen = des->numMasks * 16 * 2 * maskWidth; + const u8 *rdmsk = baseMsk + ROUNDUP_CL(maskLen); + if (maskWidth == 1) { // reinforcement table in Teddy + dumpTeddyReinforced(rdmsk, maskWidth, f); + } else { // dup nibble mask table in Fat Teddy + assert(maskWidth == 2); + dumpTeddyDupMasks(rdmsk, des->numMasks, f); + } dumpConfirms(teddy, teddy->confOffset, des->getNumBuckets(), f); } diff --git a/src/fdr/fdr_engine_description.cpp b/src/fdr/fdr_engine_description.cpp index 2f9ba420c..c4f592588 100644 --- a/src/fdr/fdr_engine_description.cpp +++ b/src/fdr/fdr_engine_description.cpp @@ -31,7 +31,6 @@ #include "hs_compile.h" #include "util/target_info.h" #include "util/compare.h" // for ourisalpha() -#include "util/make_unique.h" #include #include @@ -196,7 +195,7 @@ unique_ptr chooseEngine(const target_t &target, } DEBUG_PRINTF("using engine %u\n", best->getID()); - return ue2::make_unique(*best); + return std::make_unique(*best); } SchemeBitIndex FDREngineDescription::getSchemeBit(BucketIndex b, @@ -222,7 +221,7 @@ unique_ptr getFdrDescription(u32 engineID) { return nullptr; } - return ue2::make_unique(allDescs[engineID]); + return std::make_unique(allDescs[engineID]); } } // namespace ue2 diff --git a/src/fdr/teddy.c b/src/fdr/teddy.c index 960e2a415..65db3dff0 100644 --- a/src/fdr/teddy.c +++ b/src/fdr/teddy.c @@ -284,14 +284,6 @@ m512 prep_conf_teddy_m4(const m512 *lo_mask, const m512 *dup_mask, #define PREP_CONF_FN(val, n) \ prep_conf_teddy_m##n(&lo_mask, dup_mask, sl_msk, val) -const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = { - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, - 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, - 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f -}; - #define TEDDY_VBMI_SL1_POS 15 #define TEDDY_VBMI_SL2_POS 14 #define TEDDY_VBMI_SL3_POS 13 @@ -311,26 +303,26 @@ const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = { sl_msk[2] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL3_POS); #define PREPARE_MASKS_1 \ - dup_mask[0] = set4x128(maskBase[0]); \ - dup_mask[1] = set4x128(maskBase[1]); + dup_mask[0] = set1_4x128(maskBase[0]); \ + dup_mask[1] = set1_4x128(maskBase[1]); #define PREPARE_MASKS_2 \ PREPARE_MASKS_1 \ - dup_mask[2] = set4x128(maskBase[2]); \ - dup_mask[3] = set4x128(maskBase[3]); + dup_mask[2] = set1_4x128(maskBase[2]); \ + dup_mask[3] = set1_4x128(maskBase[3]); #define PREPARE_MASKS_3 \ PREPARE_MASKS_2 \ - dup_mask[4] = set4x128(maskBase[4]); \ - dup_mask[5] = set4x128(maskBase[5]); + dup_mask[4] = set1_4x128(maskBase[4]); \ + dup_mask[5] = set1_4x128(maskBase[5]); #define PREPARE_MASKS_4 \ 
PREPARE_MASKS_3 \ - dup_mask[6] = set4x128(maskBase[6]); \ - dup_mask[7] = set4x128(maskBase[7]); + dup_mask[6] = set1_4x128(maskBase[6]); \ + dup_mask[7] = set1_4x128(maskBase[7]); #define PREPARE_MASKS(n) \ - m512 lo_mask = set64x8(0xf); \ + m512 lo_mask = set1_64x8(0xf); \ m512 dup_mask[n * 2]; \ m512 sl_msk[n - 1]; \ PREPARE_MASKS_##n \ @@ -476,7 +468,7 @@ do { \ *c_16 = *(ptr + 15); \ *c_32 = *(ptr + 31); \ *c_48 = *(ptr + 47); \ - m512 r_msk = set512_64(0ULL, r_msk_base[*c_48], 0ULL, r_msk_base[*c_32],\ + m512 r_msk = set8x64(0ULL, r_msk_base[*c_48], 0ULL, r_msk_base[*c_32],\ 0ULL, r_msk_base[*c_16], 0ULL, r_msk_base[*c_0]);\ *c_0 = *(ptr + 63) @@ -570,26 +562,26 @@ m512 prep_conf_teddy_m4(const m512 *lo_mask, const m512 *dup_mask, &c_0, &c_16, &c_32, &c_48) #define PREPARE_MASKS_1 \ - dup_mask[0] = set4x128(maskBase[0]); \ - dup_mask[1] = set4x128(maskBase[1]); + dup_mask[0] = set1_4x128(maskBase[0]); \ + dup_mask[1] = set1_4x128(maskBase[1]); #define PREPARE_MASKS_2 \ PREPARE_MASKS_1 \ - dup_mask[2] = set4x128(maskBase[2]); \ - dup_mask[3] = set4x128(maskBase[3]); + dup_mask[2] = set1_4x128(maskBase[2]); \ + dup_mask[3] = set1_4x128(maskBase[3]); #define PREPARE_MASKS_3 \ PREPARE_MASKS_2 \ - dup_mask[4] = set4x128(maskBase[4]); \ - dup_mask[5] = set4x128(maskBase[5]); + dup_mask[4] = set1_4x128(maskBase[4]); \ + dup_mask[5] = set1_4x128(maskBase[5]); #define PREPARE_MASKS_4 \ PREPARE_MASKS_3 \ - dup_mask[6] = set4x128(maskBase[6]); \ - dup_mask[7] = set4x128(maskBase[7]); + dup_mask[6] = set1_4x128(maskBase[6]); \ + dup_mask[7] = set1_4x128(maskBase[7]); #define PREPARE_MASKS(n) \ - m512 lo_mask = set64x8(0xf); \ + m512 lo_mask = set1_64x8(0xf); \ m512 dup_mask[n * 2]; \ PREPARE_MASKS_##n @@ -713,7 +705,7 @@ do { \ #define PREP_SHUF_MASK \ PREP_SHUF_MASK_NO_REINFORCEMENT(load256(ptr)); \ *c_128 = *(ptr + 15); \ - m256 r_msk = set64x4(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]); \ + m256 r_msk = set4x64(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]); \ *c_0 = *(ptr + 31) #define SHIFT_OR_M1 \ @@ -805,26 +797,26 @@ m256 prep_conf_teddy_m4(const m256 *lo_mask, const m256 *dup_mask, prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base, &c_0, &c_128) #define PREPARE_MASKS_1 \ - dup_mask[0] = set2x128(maskBase[0]); \ - dup_mask[1] = set2x128(maskBase[1]); + dup_mask[0] = set1_2x128(maskBase[0]); \ + dup_mask[1] = set1_2x128(maskBase[1]); #define PREPARE_MASKS_2 \ PREPARE_MASKS_1 \ - dup_mask[2] = set2x128(maskBase[2]); \ - dup_mask[3] = set2x128(maskBase[3]); + dup_mask[2] = set1_2x128(maskBase[2]); \ + dup_mask[3] = set1_2x128(maskBase[3]); #define PREPARE_MASKS_3 \ PREPARE_MASKS_2 \ - dup_mask[4] = set2x128(maskBase[4]); \ - dup_mask[5] = set2x128(maskBase[5]); + dup_mask[4] = set1_2x128(maskBase[4]); \ + dup_mask[5] = set1_2x128(maskBase[5]); #define PREPARE_MASKS_4 \ PREPARE_MASKS_3 \ - dup_mask[6] = set2x128(maskBase[6]); \ - dup_mask[7] = set2x128(maskBase[7]); + dup_mask[6] = set1_2x128(maskBase[6]); \ + dup_mask[7] = set1_2x128(maskBase[7]); #define PREPARE_MASKS(n) \ - m256 lo_mask = set32x8(0xf); \ + m256 lo_mask = set1_32x8(0xf); \ m256 dup_mask[n * 2]; \ PREPARE_MASKS_##n @@ -901,8 +893,10 @@ do { \ #define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ do { \ if (unlikely(diff128(var, ones128()))) { \ - u64a lo = movq(var); \ - u64a hi = movq(rshiftbyte_m128(var, 8)); \ + u64a __attribute__((aligned(16))) vec[2]; \ + store128(vec, var); \ + u64a lo = vec[0]; \ + u64a hi = vec[1]; \ CONF_CHUNK_64(lo, bucket, offset, reason, conf_fn); \ CONF_CHUNK_64(hi, 
bucket, offset + 8, reason, conf_fn); \ } \ @@ -925,7 +919,7 @@ do { \ static really_inline m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) { - m128 mask = set16x8(0xf); + m128 mask = set1_16x8(0xf); m128 lo = and128(val, mask); m128 hi = and128(rshift64_m128(val, 4), mask); return or128(pshufb_m128(maskBase[0 * 2], lo), @@ -934,7 +928,7 @@ m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) { static really_inline m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) { - m128 mask = set16x8(0xf); + m128 mask = set1_16x8(0xf); m128 lo = and128(val, mask); m128 hi = and128(rshift64_m128(val, 4), mask); m128 r = prep_conf_teddy_m1(maskBase, val); @@ -949,7 +943,7 @@ m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) { static really_inline m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2, m128 val) { - m128 mask = set16x8(0xf); + m128 mask = set1_16x8(0xf); m128 lo = and128(val, mask); m128 hi = and128(rshift64_m128(val, 4), mask); m128 r = prep_conf_teddy_m2(maskBase, old_1, val); @@ -964,7 +958,7 @@ m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2, static really_inline m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2, m128 *old_3, m128 val) { - m128 mask = set16x8(0xf); + m128 mask = set1_16x8(0xf); m128 lo = and128(val, mask); m128 hi = and128(rshift64_m128(val, 4), mask); m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, val); diff --git a/src/fdr/teddy_avx2.c b/src/fdr/teddy_avx2.c index 20ea938cf..e17e78726 100644 --- a/src/fdr/teddy_avx2.c +++ b/src/fdr/teddy_avx2.c @@ -109,6 +109,36 @@ const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00} }; +#if defined(HAVE_AVX512VBMI) // VBMI strong fat teddy + +#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, pt, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u64a)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, pt, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, pt, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u32)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, pt, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +static really_inline +const m512 *getDupMaskBase(const struct Teddy *teddy, u8 numMask) { + return (const m512 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)) + + ROUNDUP_CL(2 * numMask * sizeof(m256))); +} + +#else + #define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, conf_fn) \ do { \ if (unlikely(chunk != ones_u64a)) { \ @@ -134,203 +164,200 @@ const m256 *getMaskBase_fat(const struct Teddy *teddy) { return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))); } -#if defined(HAVE_AVX512_REVERT) // revert to AVX2 Fat Teddy +#endif -static really_inline -const u64a *getReinforcedMaskBase_fat(const struct Teddy *teddy, u8 numMask) { - return (const u64a *)((const u8 *)getMaskBase_fat(teddy) - + ROUNDUP_CL(2 * numMask * sizeof(m256))); -} +#if defined(HAVE_AVX512VBMI) // VBMI strong fat teddy + +const u8 ALIGN_AVX_DIRECTIVE p_mask_interleave[64] = { + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, + 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 
22, 54, 23, 55, + 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 +}; #ifdef ARCH_64_BIT -#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \ +#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, pt, conf_fn) \ do { \ if (unlikely(diff512(var, ones512()))) { \ - m512 swap = swap256in512(var); \ - m512 r = interleave512lo(var, swap); \ + m512 msk_interleave = load512(p_mask_interleave); \ + m512 r = vpermb512(msk_interleave, var); \ m128 r0 = extract128from512(r, 0); \ m128 r1 = extract128from512(r, 1); \ + m128 r2 = extract128from512(r, 2); \ + m128 r3 = extract128from512(r, 3); \ u64a part1 = movq(r0); \ u64a part2 = extract64from128(r0, 1); \ - u64a part5 = movq(r1); \ - u64a part6 = extract64from128(r1, 1); \ - r = interleave512hi(var, swap); \ - r0 = extract128from512(r, 0); \ - r1 = extract128from512(r, 1); \ - u64a part3 = movq(r0); \ - u64a part4 = extract64from128(r0, 1); \ - u64a part7 = movq(r1); \ - u64a part8 = extract64from128(r1, 1); \ - CONF_FAT_CHUNK_64(part1, bucket, offset, reason, conf_fn); \ - CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, conf_fn); \ - CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, conf_fn); \ - CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, conf_fn); \ - CONF_FAT_CHUNK_64(part5, bucket, offset + 16, reason, conf_fn); \ - CONF_FAT_CHUNK_64(part6, bucket, offset + 20, reason, conf_fn); \ - CONF_FAT_CHUNK_64(part7, bucket, offset + 24, reason, conf_fn); \ - CONF_FAT_CHUNK_64(part8, bucket, offset + 28, reason, conf_fn); \ + u64a part3 = movq(r1); \ + u64a part4 = extract64from128(r1, 1); \ + u64a part5 = movq(r2); \ + u64a part6 = extract64from128(r2, 1); \ + u64a part7 = movq(r3); \ + u64a part8 = extract64from128(r3, 1); \ + CONF_FAT_CHUNK_64(part1, bucket, offset, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part5, bucket, offset + 16, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part6, bucket, offset + 20, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part7, bucket, offset + 24, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part8, bucket, offset + 28, reason, pt, conf_fn); \ } \ } while(0) #else -#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \ +#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, pt, conf_fn) \ do { \ if (unlikely(diff512(var, ones512()))) { \ - m512 swap = swap256in512(var); \ - m512 r = interleave512lo(var, swap); \ + m512 msk_interleave = load512(p_mask_interleave); \ + m512 r = vpermb512(msk_interleave, var); \ m128 r0 = extract128from512(r, 0); \ m128 r1 = extract128from512(r, 1); \ + m128 r2 = extract128from512(r, 2); \ + m128 r3 = extract128from512(r, 3); \ u32 part1 = movd(r0); \ u32 part2 = extract32from128(r0, 1); \ u32 part3 = extract32from128(r0, 2); \ u32 part4 = extract32from128(r0, 3); \ - u32 part9 = movd(r1); \ - u32 part10 = extract32from128(r1, 1); \ - u32 part11 = extract32from128(r1, 2); \ - u32 part12 = extract32from128(r1, 3); \ - r = interleave512hi(var, swap); \ - r0 = extract128from512(r, 0); \ - r1 = extract128from512(r, 1); \ - u32 part5 = movd(r0); \ - u32 part6 = extract32from128(r0, 1); \ - u32 part7 = extract32from128(r0, 2); \ - u32 part8 = extract32from128(r0, 3); \ - u32 part13 = movd(r1); \ - u32 part14 = extract32from128(r1, 1); \ - u32 part15 = extract32from128(r1, 2); \ - u32 part16 = extract32from128(r1, 3); \ - 
CONF_FAT_CHUNK_32(part1, bucket, offset, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part9, bucket, offset + 16, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part10, bucket, offset + 18, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part11, bucket, offset + 20, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part12, bucket, offset + 22, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part13, bucket, offset + 24, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part14, bucket, offset + 26, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part15, bucket, offset + 28, reason, conf_fn); \ - CONF_FAT_CHUNK_32(part16, bucket, offset + 30, reason, conf_fn); \ + u32 part5 = movd(r1); \ + u32 part6 = extract32from128(r1, 1); \ + u32 part7 = extract32from128(r1, 2); \ + u32 part8 = extract32from128(r1, 3); \ + u32 part9 = movd(r2); \ + u32 part10 = extract32from128(r2, 1); \ + u32 part11 = extract32from128(r2, 2); \ + u32 part12 = extract32from128(r2, 3); \ + u32 part13 = movd(r3); \ + u32 part14 = extract32from128(r3, 1); \ + u32 part15 = extract32from128(r3, 2); \ + u32 part16 = extract32from128(r3, 3); \ + CONF_FAT_CHUNK_32(part1, bucket, offset, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part9, bucket, offset + 16, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part10, bucket, offset + 18, reason, pt, conf_fn);\ + CONF_FAT_CHUNK_32(part11, bucket, offset + 20, reason, pt, conf_fn);\ + CONF_FAT_CHUNK_32(part12, bucket, offset + 22, reason, pt, conf_fn);\ + CONF_FAT_CHUNK_32(part13, bucket, offset + 24, reason, pt, conf_fn);\ + CONF_FAT_CHUNK_32(part14, bucket, offset + 26, reason, pt, conf_fn);\ + CONF_FAT_CHUNK_32(part15, bucket, offset + 28, reason, pt, conf_fn);\ + CONF_FAT_CHUNK_32(part16, bucket, offset + 30, reason, pt, conf_fn);\ } \ } while(0) #endif -static really_inline -m512 vectoredLoad2x256(m512 *p_mask, const u8 *ptr, const size_t start_offset, - const u8 *lo, const u8 *hi, - const u8 *buf_history, size_t len_history, - const u32 nMasks) { - m256 p_mask256; - m512 ret = set2x256(vectoredLoad256(&p_mask256, ptr, start_offset, lo, hi, - buf_history, len_history, nMasks)); - *p_mask = set2x256(p_mask256); - return ret; -} - -#define PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val) \ +#define PREP_FAT_SHUF_MASK \ m512 lo = and512(val, *lo_mask); \ m512 hi = and512(rshift64_m512(val, 4), *lo_mask) -#define PREP_FAT_SHUF_MASK \ - PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(set2x256(load256(ptr))); \ - *c_16 = *(ptr + 15); \ - m512 r_msk = set512_64(0ULL, r_msk_base_hi[*c_16], \ - 0ULL, r_msk_base_hi[*c_0], \ - 0ULL, r_msk_base_lo[*c_16], \ - 0ULL, r_msk_base_lo[*c_0]); \ - *c_0 = *(ptr + 31) - 
-#define FAT_SHIFT_OR_M1 \ - or512(pshufb_m512(dup_mask[0], lo), pshufb_m512(dup_mask[1], hi)) - -#define FAT_SHIFT_OR_M2 \ - or512(lshift128_m512(or512(pshufb_m512(dup_mask[2], lo), \ - pshufb_m512(dup_mask[3], hi)), \ - 1), FAT_SHIFT_OR_M1) - -#define FAT_SHIFT_OR_M3 \ - or512(lshift128_m512(or512(pshufb_m512(dup_mask[4], lo), \ - pshufb_m512(dup_mask[5], hi)), \ - 2), FAT_SHIFT_OR_M2) - -#define FAT_SHIFT_OR_M4 \ - or512(lshift128_m512(or512(pshufb_m512(dup_mask[6], lo), \ - pshufb_m512(dup_mask[7], hi)), \ - 3), FAT_SHIFT_OR_M3) +#define FAT_TEDDY_VBMI_PSHUFB_OR_M1 \ + m512 shuf_or_b0 = or512(pshufb_m512(dup_mask[0], lo), \ + pshufb_m512(dup_mask[1], hi)); -static really_inline -m512 prep_conf_fat_teddy_no_reinforcement_m1(const m512 *lo_mask, - const m512 *dup_mask, - const m512 val) { - PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val); - return FAT_SHIFT_OR_M1; -} +#define FAT_TEDDY_VBMI_PSHUFB_OR_M2 \ + FAT_TEDDY_VBMI_PSHUFB_OR_M1 \ + m512 shuf_or_b1 = or512(pshufb_m512(dup_mask[2], lo), \ + pshufb_m512(dup_mask[3], hi)); -static really_inline -m512 prep_conf_fat_teddy_no_reinforcement_m2(const m512 *lo_mask, - const m512 *dup_mask, - const m512 val) { - PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val); - return FAT_SHIFT_OR_M2; -} +#define FAT_TEDDY_VBMI_PSHUFB_OR_M3 \ + FAT_TEDDY_VBMI_PSHUFB_OR_M2 \ + m512 shuf_or_b2 = or512(pshufb_m512(dup_mask[4], lo), \ + pshufb_m512(dup_mask[5], hi)); -static really_inline -m512 prep_conf_fat_teddy_no_reinforcement_m3(const m512 *lo_mask, - const m512 *dup_mask, - const m512 val) { - PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val); - return FAT_SHIFT_OR_M3; -} +#define FAT_TEDDY_VBMI_PSHUFB_OR_M4 \ + FAT_TEDDY_VBMI_PSHUFB_OR_M3 \ + m512 shuf_or_b3 = or512(pshufb_m512(dup_mask[6], lo), \ + pshufb_m512(dup_mask[7], hi)); -static really_inline -m512 prep_conf_fat_teddy_no_reinforcement_m4(const m512 *lo_mask, - const m512 *dup_mask, - const m512 val) { - PREP_FAT_SHUF_MASK_NO_REINFORCEMENT(val); - return FAT_SHIFT_OR_M4; -} +#define FAT_TEDDY_VBMI_SL1_MASK 0xfffffffefffffffeULL +#define FAT_TEDDY_VBMI_SL2_MASK 0xfffffffcfffffffcULL +#define FAT_TEDDY_VBMI_SL3_MASK 0xfffffff8fffffff8ULL + +#define FAT_TEDDY_VBMI_SHIFT_M1 + +#define FAT_TEDDY_VBMI_SHIFT_M2 \ + FAT_TEDDY_VBMI_SHIFT_M1 \ + m512 sl1 = maskz_vpermb512(FAT_TEDDY_VBMI_SL1_MASK, sl_msk[0], shuf_or_b1); + +#define FAT_TEDDY_VBMI_SHIFT_M3 \ + FAT_TEDDY_VBMI_SHIFT_M2 \ + m512 sl2 = maskz_vpermb512(FAT_TEDDY_VBMI_SL2_MASK, sl_msk[1], shuf_or_b2); + +#define FAT_TEDDY_VBMI_SHIFT_M4 \ + FAT_TEDDY_VBMI_SHIFT_M3 \ + m512 sl3 = maskz_vpermb512(FAT_TEDDY_VBMI_SL3_MASK, sl_msk[2], shuf_or_b3); + +#define FAT_SHIFT_OR_M1 \ + shuf_or_b0 + +#define FAT_SHIFT_OR_M2 \ + or512(sl1, FAT_SHIFT_OR_M1) + +#define FAT_SHIFT_OR_M3 \ + or512(sl2, FAT_SHIFT_OR_M2) + +#define FAT_SHIFT_OR_M4 \ + or512(sl3, FAT_SHIFT_OR_M3) static really_inline m512 prep_conf_fat_teddy_m1(const m512 *lo_mask, const m512 *dup_mask, - const u8 *ptr, const u64a *r_msk_base_lo, - const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) { + UNUSED const m512 *sl_msk, const m512 val) { PREP_FAT_SHUF_MASK; - return or512(FAT_SHIFT_OR_M1, r_msk); + FAT_TEDDY_VBMI_PSHUFB_OR_M1; + FAT_TEDDY_VBMI_SHIFT_M1; + return FAT_SHIFT_OR_M1; } static really_inline m512 prep_conf_fat_teddy_m2(const m512 *lo_mask, const m512 *dup_mask, - const u8 *ptr, const u64a *r_msk_base_lo, - const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) { + const m512 *sl_msk, const m512 val) { PREP_FAT_SHUF_MASK; - return or512(FAT_SHIFT_OR_M2, r_msk); + FAT_TEDDY_VBMI_PSHUFB_OR_M2; + FAT_TEDDY_VBMI_SHIFT_M2; 
+ return FAT_SHIFT_OR_M2; } static really_inline m512 prep_conf_fat_teddy_m3(const m512 *lo_mask, const m512 *dup_mask, - const u8 *ptr, const u64a *r_msk_base_lo, - const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) { + const m512 *sl_msk, const m512 val) { PREP_FAT_SHUF_MASK; - return or512(FAT_SHIFT_OR_M3, r_msk); + FAT_TEDDY_VBMI_PSHUFB_OR_M3; + FAT_TEDDY_VBMI_SHIFT_M3; + return FAT_SHIFT_OR_M3; } static really_inline m512 prep_conf_fat_teddy_m4(const m512 *lo_mask, const m512 *dup_mask, - const u8 *ptr, const u64a *r_msk_base_lo, - const u64a *r_msk_base_hi, u32 *c_0, u32 *c_16) { + const m512 *sl_msk, const m512 val) { PREP_FAT_SHUF_MASK; - return or512(FAT_SHIFT_OR_M4, r_msk); + FAT_TEDDY_VBMI_PSHUFB_OR_M4; + FAT_TEDDY_VBMI_SHIFT_M4; + return FAT_SHIFT_OR_M4; } -#define PREP_CONF_FAT_FN_NO_REINFORCEMENT(val, n) \ - prep_conf_fat_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val) +#define PREP_CONF_FAT_FN(val, n) \ + prep_conf_fat_teddy_m##n(&lo_mask, dup_mask, sl_msk, val) -#define PREP_CONF_FAT_FN(ptr, n) \ - prep_conf_fat_teddy_m##n(&lo_mask, dup_mask, ptr, \ - r_msk_base_lo, r_msk_base_hi, &c_0, &c_16) +#define FAT_TEDDY_VBMI_SL1_POS 15 +#define FAT_TEDDY_VBMI_SL2_POS 14 +#define FAT_TEDDY_VBMI_SL3_POS 13 + +#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M1 + +#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M2 \ + FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M1 \ + sl_msk[0] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL1_POS); + +#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M3 \ + FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M2 \ + sl_msk[1] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL2_POS); + +#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M4 \ + FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M3 \ + sl_msk[2] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL3_POS); /* * In FAT teddy, it needs 2 bytes to represent result of each position, @@ -355,31 +382,15 @@ m512 prep_conf_fat_teddy_m4(const m512 *lo_mask, const m512 *dup_mask, * then do pshufb_m512(AABB, XYXY). 
*/ -#define DUP_FAT_MASK(a) mask_set2x256(set2x256(swap128in256(a)), 0xC3, a) - -#define PREPARE_FAT_MASKS_1 \ - dup_mask[0] = DUP_FAT_MASK(maskBase[0]); \ - dup_mask[1] = DUP_FAT_MASK(maskBase[1]); - -#define PREPARE_FAT_MASKS_2 \ - PREPARE_FAT_MASKS_1 \ - dup_mask[2] = DUP_FAT_MASK(maskBase[2]); \ - dup_mask[3] = DUP_FAT_MASK(maskBase[3]); - -#define PREPARE_FAT_MASKS_3 \ - PREPARE_FAT_MASKS_2 \ - dup_mask[4] = DUP_FAT_MASK(maskBase[4]); \ - dup_mask[5] = DUP_FAT_MASK(maskBase[5]); - -#define PREPARE_FAT_MASKS_4 \ - PREPARE_FAT_MASKS_3 \ - dup_mask[6] = DUP_FAT_MASK(maskBase[6]); \ - dup_mask[7] = DUP_FAT_MASK(maskBase[7]); - #define PREPARE_FAT_MASKS(n) \ - m512 lo_mask = set64x8(0xf); \ - m512 dup_mask[n * 2]; \ - PREPARE_FAT_MASKS_##n + m512 lo_mask = set1_64x8(0xf); \ + m512 sl_msk[n - 1]; \ + FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M##n + +#define FAT_TEDDY_VBMI_CONF_MASK_HEAD (0xffffffffULL >> n_sh) +#define FAT_TEDDY_VBMI_CONF_MASK_FULL ((0xffffffffULL << n_sh) & 0xffffffffULL) +#define FAT_TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffULL >> (32 - n) << overlap) +#define FAT_TEDDY_VBMI_LOAD_MASK_PATCH (0xffffffffULL >> (32 - n_sh)) #define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn) \ do { \ @@ -389,67 +400,53 @@ do { \ const u8 *tryFloodDetect = a->firstFloodDetect; \ u32 last_match = ones_u32; \ const struct Teddy *teddy = (const struct Teddy *)fdr; \ - const size_t iterBytes = 64; \ + const size_t iterBytes = 32; \ + u32 n_sh = n_msk - 1; \ + const size_t loopBytes = 32 - n_sh; \ DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \ a->buf, a->len, a->start_offset); \ \ - const m256 *maskBase = getMaskBase_fat(teddy); \ + const m512 *dup_mask = getDupMaskBase(teddy, n_msk); \ PREPARE_FAT_MASKS(n_msk); \ const u32 *confBase = getConfBase(teddy); \ \ - const u64a *r_msk_base_lo = getReinforcedMaskBase_fat(teddy, n_msk); \ - const u64a *r_msk_base_hi = r_msk_base_lo + (N_CHARS + 1); \ - u32 c_0 = 0x100; \ - u32 c_16 = 0x100; \ - const u8 *mainStart = ROUNDUP_PTR(ptr, 32); \ - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \ - if (ptr < mainStart) { \ - ptr = mainStart - 32; \ - m512 p_mask; \ - m512 val_0 = vectoredLoad2x256(&p_mask, ptr, a->start_offset, \ - a->buf, buf_end, \ - a->buf_history, a->len_history, n_msk); \ - m512 r_0 = PREP_CONF_FAT_FN_NO_REINFORCEMENT(val_0, n_msk); \ - r_0 = or512(r_0, p_mask); \ - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \ - ptr += 32; \ + u64a k = FAT_TEDDY_VBMI_CONF_MASK_FULL; \ + m512 p_mask = set_mask_m512(~((k << 32) | k)); \ + u32 overlap = 0; \ + u64a patch = 0; \ + if (likely(ptr + loopBytes <= buf_end)) { \ + u64a k0 = FAT_TEDDY_VBMI_CONF_MASK_HEAD; \ + m512 p_mask0 = set_mask_m512(~((k0 << 32) | k0)); \ + m512 r_0 = PREP_CONF_FAT_FN(set2x256(loadu256(ptr)), n_msk); \ + r_0 = or512(r_0, p_mask0); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, ptr, conf_fn); \ + ptr += loopBytes; \ + overlap = n_sh; \ + patch = FAT_TEDDY_VBMI_LOAD_MASK_PATCH; \ } \ \ - if (ptr + 32 <= buf_end) { \ - m512 r_0 = PREP_CONF_FAT_FN(ptr, n_msk); \ - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \ - ptr += 32; \ - } \ - \ - for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { \ - __builtin_prefetch(ptr + (iterBytes * 4)); \ + for (; ptr + loopBytes <= buf_end; ptr += loopBytes) { \ CHECK_FLOOD; \ - m512 r_0 = PREP_CONF_FAT_FN(ptr, n_msk); \ - CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \ - m512 r_1 = PREP_CONF_FAT_FN(ptr + 32, n_msk); \ - CONFIRM_FAT_TEDDY(r_1, 16, 32, NOT_CAUTIOUS, conf_fn); \ - } \ - \ - if (ptr + 32 <= 
buf_end) { \ - m512 r_0 = PREP_CONF_FAT_FN(ptr, n_msk); \ - CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \ - ptr += 32; \ + m512 r_0 = PREP_CONF_FAT_FN(set2x256(loadu256(ptr - n_sh)), n_msk); \ + r_0 = or512(r_0, p_mask); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, ptr - n_sh, conf_fn); \ } \ \ - assert(ptr + 32 > buf_end); \ + assert(ptr + loopBytes > buf_end); \ if (ptr < buf_end) { \ - m512 p_mask; \ - m512 val_0 = vectoredLoad2x256(&p_mask, ptr, 0, ptr, buf_end, \ - a->buf_history, a->len_history, n_msk); \ - m512 r_0 = PREP_CONF_FAT_FN_NO_REINFORCEMENT(val_0, n_msk); \ - r_0 = or512(r_0, p_mask); \ - CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \ + u32 left = (u32)(buf_end - ptr); \ + u64a k1 = FAT_TEDDY_VBMI_CONF_MASK_VAR(left); \ + m512 p_mask1 = set_mask_m512(~((k1 << 32) | k1)); \ + m512 val_0 = set2x256(loadu_maskz_m256(k1 | patch, ptr - overlap)); \ + m512 r_0 = PREP_CONF_FAT_FN(val_0, n_msk); \ + r_0 = or512(r_0, p_mask1); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, ptr - overlap, conf_fn); \ } \ \ return HWLM_SUCCESS; \ } while(0) -#else // HAVE_AVX512 +#else // !HAVE_AVX512VBMI, AVX2 normal fat teddy #ifdef ARCH_64_BIT #define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \ @@ -501,15 +498,15 @@ m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset, const u8 *buf_history, size_t len_history, const u32 nMasks) { m128 p_mask128; - m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi, + m256 ret = set1_2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi, buf_history, len_history, nMasks)); - *p_mask = set2x128(p_mask128); + *p_mask = set1_2x128(p_mask128); return ret; } static really_inline m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) { - m256 mask = set32x8(0xf); + m256 mask = set1_32x8(0xf); m256 lo = and256(val, mask); m256 hi = and256(rshift64_m256(val, 4), mask); return or256(pshufb_m256(maskBase[0 * 2], lo), @@ -518,7 +515,7 @@ m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) { static really_inline m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) { - m256 mask = set32x8(0xf); + m256 mask = set1_32x8(0xf); m256 lo = and256(val, mask); m256 hi = and256(rshift64_m256(val, 4), mask); m256 r = prep_conf_fat_teddy_m1(maskBase, val); @@ -533,7 +530,7 @@ m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) { static really_inline m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2, m256 val) { - m256 mask = set32x8(0xf); + m256 mask = set1_32x8(0xf); m256 lo = and256(val, mask); m256 hi = and256(rshift64_m256(val, 4), mask); m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val); @@ -548,7 +545,7 @@ m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2, static really_inline m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2, m256 *old_3, m256 val) { - m256 mask = set32x8(0xf); + m256 mask = set1_32x8(0xf); m256 lo = and256(val, mask); m256 hi = and256(rshift64_m256(val, 4), mask); m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val); @@ -659,7 +656,7 @@ do { \ return HWLM_SUCCESS; \ } while(0) -#endif // HAVE_AVX512 +#endif // HAVE_AVX512VBMI hwlm_error_t fdr_exec_fat_teddy_msks1(const struct FDR *fdr, const struct FDR_Runtime_Args *a, diff --git a/src/fdr/teddy_compile.cpp b/src/fdr/teddy_compile.cpp index 9a1e54a15..e7398b6fa 100644 --- a/src/fdr/teddy_compile.cpp +++ b/src/fdr/teddy_compile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel 
Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -46,7 +46,6 @@ #include "util/alloc.h" #include "util/compare.h" #include "util/container.h" -#include "util/make_unique.h" #include "util/noncopyable.h" #include "util/popcount.h" #include "util/small_vector.h" @@ -89,7 +88,7 @@ class TeddyCompiler : noncopyable { const TeddyEngineDescription &eng_in, bool make_small_in, const Grey &grey_in) : eng(eng_in), grey(grey_in), lits(lits_in), - bucketToLits(move(bucketToLits_in)), make_small(make_small_in) {} + bucketToLits(std::move(bucketToLits_in)), make_small(make_small_in) {} bytecode_ptr build(); }; @@ -166,7 +165,7 @@ class TeddySet { nibbleSets[i * 2] = nibbleSets[i * 2 + 1] = 0xffff; } } - litIds.push_back(lit_id); + litIds.emplace_back(lit_id); sort_and_unique(litIds); } @@ -353,6 +352,89 @@ void fillReinforcedMsk(u8 *rmsk, u16 c, u32 j, u8 bmsk) { } } +static +void fillDupNibbleMasks(const map> &bucketToLits, + const vector &lits, + u32 numMasks, size_t maskLen, + u8 *baseMsk) { + u32 maskWidth = 2; + memset(baseMsk, 0xff, maskLen); + + for (const auto &b2l : bucketToLits) { + const u32 &bucket_id = b2l.first; + const vector &ids = b2l.second; + const u8 bmsk = 1U << (bucket_id % 8); + + for (const LiteralIndex &lit_id : ids) { + const hwlmLiteral &l = lits[lit_id]; + DEBUG_PRINTF("putting lit %u into bucket %u\n", lit_id, bucket_id); + const u32 sz = verify_u32(l.s.size()); + + // fill in masks + for (u32 j = 0; j < numMasks; j++) { + const u32 msk_id_lo = j * 2 * maskWidth + (bucket_id / 8); + const u32 msk_id_hi = (j * 2 + 1) * maskWidth + (bucket_id / 8); + const u32 lo_base0 = msk_id_lo * 32; + const u32 lo_base1 = msk_id_lo * 32 + 16; + const u32 hi_base0 = msk_id_hi * 32; + const u32 hi_base1 = msk_id_hi * 32 + 16; + + // if we don't have a char at this position, fill in i + // locations in these masks with '1' + if (j >= sz) { + for (u32 n = 0; n < 16; n++) { + baseMsk[lo_base0 + n] &= ~bmsk; + baseMsk[lo_base1 + n] &= ~bmsk; + baseMsk[hi_base0 + n] &= ~bmsk; + baseMsk[hi_base1 + n] &= ~bmsk; + } + } else { + u8 c = l.s[sz - 1 - j]; + // if we do have a char at this position + const u32 hiShift = 4; + u32 n_hi = (c >> hiShift) & 0xf; + u32 n_lo = c & 0xf; + + if (j < l.msk.size() && l.msk[l.msk.size() - 1 - j]) { + u8 m = l.msk[l.msk.size() - 1 - j]; + u8 m_hi = (m >> hiShift) & 0xf; + u8 m_lo = m & 0xf; + u8 cmp = l.cmp[l.msk.size() - 1 - j]; + u8 cmp_lo = cmp & 0xf; + u8 cmp_hi = (cmp >> hiShift) & 0xf; + + for (u8 cm = 0; cm < 0x10; cm++) { + if ((cm & m_lo) == (cmp_lo & m_lo)) { + baseMsk[lo_base0 + cm] &= ~bmsk; + baseMsk[lo_base1 + cm] &= ~bmsk; + } + if ((cm & m_hi) == (cmp_hi & m_hi)) { + baseMsk[hi_base0 + cm] &= ~bmsk; + baseMsk[hi_base1 + cm] &= ~bmsk; + } + } + } else { + if (l.nocase && ourisalpha(c)) { + u32 cmHalfClear = (0xdf >> hiShift) & 0xf; + u32 cmHalfSet = (0x20 >> hiShift) & 0xf; + baseMsk[hi_base0 + (n_hi & cmHalfClear)] &= ~bmsk; + baseMsk[hi_base1 + (n_hi & cmHalfClear)] &= ~bmsk; + baseMsk[hi_base0 + (n_hi | cmHalfSet)] &= ~bmsk; + baseMsk[hi_base1 + (n_hi | cmHalfSet)] &= ~bmsk; + } else { + baseMsk[hi_base0 + n_hi] &= ~bmsk; + baseMsk[hi_base1 + n_hi] &= ~bmsk; + } + baseMsk[lo_base0 + n_lo] &= ~bmsk; + baseMsk[lo_base1 + n_lo] &= ~bmsk; + } + } + } + } + } +} + static void fillNibbleMasks(const map> &bucketToLits, @@ -432,7 +514,7 @@ void fillReinforcedTable(const map tables; for (u32 i = 
0; i < num_tables; i++) { - tables.push_back(rtable_base + i * RTABLE_SIZE); + tables.emplace_back(rtable_base + i * RTABLE_SIZE); } for (auto t : tables) { @@ -479,14 +561,17 @@ bytecode_ptr TeddyCompiler::build() { size_t headerSize = sizeof(Teddy); size_t maskLen = eng.numMasks * 16 * 2 * maskWidth; - size_t reinforcedMaskLen = RTABLE_SIZE * maskWidth; + size_t reinforcedDupMaskLen = RTABLE_SIZE * maskWidth; + if (maskWidth == 2) { // dup nibble mask table in Fat Teddy + reinforcedDupMaskLen = maskLen * 2; + } auto floodTable = setupFDRFloodControl(lits, eng, grey); auto confirmTable = setupFullConfs(lits, eng, bucketToLits, make_small); // Note: we place each major structure here on a cacheline boundary. size_t size = ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen) + - ROUNDUP_CL(reinforcedMaskLen) + + ROUNDUP_CL(reinforcedDupMaskLen) + ROUNDUP_CL(confirmTable.size()) + floodTable.size(); auto fdr = make_zeroed_bytecode_ptr(size, 64); @@ -502,7 +587,7 @@ bytecode_ptr TeddyCompiler::build() { // Write confirm structures. u8 *ptr = teddy_base + ROUNDUP_CL(headerSize) + ROUNDUP_CL(maskLen) + - ROUNDUP_CL(reinforcedMaskLen); + ROUNDUP_CL(reinforcedDupMaskLen); assert(ISALIGNED_CL(ptr)); teddy->confOffset = verify_u32(ptr - teddy_base); memcpy(ptr, confirmTable.get(), confirmTable.size()); @@ -519,9 +604,16 @@ bytecode_ptr TeddyCompiler::build() { fillNibbleMasks(bucketToLits, lits, eng.numMasks, maskWidth, maskLen, baseMsk); - // Write reinforcement masks. - u8 *reinforcedMsk = baseMsk + ROUNDUP_CL(maskLen); - fillReinforcedTable(bucketToLits, lits, reinforcedMsk, maskWidth); + if (maskWidth == 1) { // reinforcement table in Teddy + // Write reinforcement masks. + u8 *reinforcedMsk = baseMsk + ROUNDUP_CL(maskLen); + fillReinforcedTable(bucketToLits, lits, reinforcedMsk, maskWidth); + } else { // dup nibble mask table in Fat Teddy + assert(maskWidth == 2); + u8 *dupMsk = baseMsk + ROUNDUP_CL(maskLen); + fillDupNibbleMasks(bucketToLits, lits, eng.numMasks, + reinforcedDupMaskLen, dupMsk); + } return fdr; } @@ -584,7 +676,7 @@ unique_ptr teddyBuildProtoHinted( return nullptr; } - return ue2::make_unique(engType, move(des), lits, + return std::make_unique(engType, std::move(des), lits, bucketToLits, make_small); } diff --git a/src/fdr/teddy_engine_description.cpp b/src/fdr/teddy_engine_description.cpp index 88ae0f538..7cd33ab23 100644 --- a/src/fdr/teddy_engine_description.cpp +++ b/src/fdr/teddy_engine_description.cpp @@ -34,7 +34,6 @@ #include "fdr_engine_description.h" #include "teddy_internal.h" #include "teddy_engine_description.h" -#include "util/make_unique.h" #include @@ -197,7 +196,7 @@ chooseTeddyEngine(const target_t &target, const vector &vl) { } DEBUG_PRINTF("using engine %u\n", best->getID()); - return ue2::make_unique(*best); + return std::make_unique(*best); } unique_ptr getTeddyDescription(u32 engineID) { @@ -206,7 +205,7 @@ unique_ptr getTeddyDescription(u32 engineID) { for (const auto &desc : descs) { if (desc.getID() == engineID) { - return ue2::make_unique(desc); + return std::make_unique(desc); } } diff --git a/src/fdr/teddy_runtime_common.h b/src/fdr/teddy_runtime_common.h index 730850cb7..d27be994e 100644 --- a/src/fdr/teddy_runtime_common.h +++ b/src/fdr/teddy_runtime_common.h @@ -45,6 +45,16 @@ extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32]; extern const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64]; #endif +#if defined(HAVE_AVX512VBMI) +static const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f +}; +#endif + #ifdef ARCH_64_BIT #define TEDDY_CONF_TYPE u64a #define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_64(conf) @@ -338,7 +348,7 @@ static really_inline m512 vectoredLoad512(m512 *p_mask, const u8 *ptr, const size_t start_offset, const u8 *lo, const u8 *hi, const u8 *hbuf, size_t hlen, const u32 nMasks) { - m512 val; + m512 val = zeroes512(); uintptr_t copy_start; uintptr_t copy_len; diff --git a/src/hs.cpp b/src/hs.cpp index ab54105c5..61e46148c 100644 --- a/src/hs.cpp +++ b/src/hs.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2019, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -44,8 +44,11 @@ #include "parser/prefilter.h" #include "parser/unsupported.h" #include "util/compile_error.h" -#include "util/cpuid_flags.h" -#include "util/cpuid_inline.h" +#include "util/arch/common/cpuid_flags.h" +#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#include "util/arch/x86/cpuid_inline.h" +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#endif #include "util/depth.h" #include "util/popcount.h" #include "util/target_info.h" @@ -120,9 +123,10 @@ bool checkMode(unsigned int mode, hs_compile_error **comp_error) { static bool checkPlatform(const hs_platform_info *p, hs_compile_error **comp_error) { - static constexpr u32 HS_TUNE_LAST = HS_TUNE_FAMILY_GLM; + static constexpr u32 HS_TUNE_LAST = HS_TUNE_FAMILY_ICX; static constexpr u32 HS_CPU_FEATURES_ALL = - HS_CPU_FEATURES_AVX2 | HS_CPU_FEATURES_AVX512; + HS_CPU_FEATURES_AVX2 | HS_CPU_FEATURES_AVX512 | + HS_CPU_FEATURES_AVX512VBMI; if (!p) { return true; @@ -195,11 +199,13 @@ hs_compile_multi_int(const char *const *expressions, const unsigned *flags, } #if defined(FAT_RUNTIME) +#if defined(ARCH_IA32) || defined(ARCH_X86_64) if (!check_ssse3()) { *db = nullptr; *comp_error = generateCompileError("Unsupported architecture", -1); return HS_ARCH_ERROR; } +#endif #endif if (!checkMode(mode, comp_error)) { @@ -316,13 +322,14 @@ hs_compile_lit_multi_int(const char *const *expressions, const unsigned *flags, *comp_error = generateCompileError("Invalid parameter: elements is zero", -1); return HS_COMPILER_ERROR; } - #if defined(FAT_RUNTIME) +#if defined(ARCH_IA32) || defined(ARCH_X86_64) if (!check_ssse3()) { *db = nullptr; *comp_error = generateCompileError("Unsupported architecture", -1); return HS_ARCH_ERROR; } +#endif #endif if (!checkMode(mode, comp_error)) { @@ -496,10 +503,12 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags, } #if defined(FAT_RUNTIME) +#if defined(ARCH_IA32) || defined(ARCH_X86_64) if (!check_ssse3()) { *error = generateCompileError("Unsupported architecture", -1); return HS_ARCH_ERROR; } +#endif #endif if (!info) { @@ -513,6 +522,12 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags, return HS_COMPILER_ERROR; } + if (flags & HS_FLAG_COMBINATION) { + *error = generateCompileError("Invalid parameter: unsupported " + "logical combination expression", -1); + return HS_COMPILER_ERROR; + } + 
*info = nullptr; *error = nullptr; @@ -621,9 +636,11 @@ hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform) { extern "C" HS_PUBLIC_API hs_error_t HS_CDECL hs_free_compile_error(hs_compile_error_t *error) { #if defined(FAT_RUNTIME) +#if defined(ARCH_IA32) || defined(ARCH_X86_64) if (!check_ssse3()) { return HS_ARCH_ERROR; } +#endif #endif freeCompileError(error); return HS_SUCCESS; diff --git a/src/hs.h b/src/hs.h index 105919fb8..5f363a608 100644 --- a/src/hs.h +++ b/src/hs.h @@ -39,12 +39,7 @@ * the individual component headers for documentation. */ -/* The current Hyperscan version information. */ - -#define HS_MAJOR 5 -#define HS_MINOR 3 -#define HS_PATCH 0 - +#include "hs_version.h" #include "hs_compile.h" #include "hs_runtime.h" diff --git a/src/hs_common.h b/src/hs_common.h index 93dc1fe8a..3078ad7bb 100644 --- a/src/hs_common.h +++ b/src/hs_common.h @@ -29,11 +29,7 @@ #ifndef HS_COMMON_H_ #define HS_COMMON_H_ -#if defined(_WIN32) -#define HS_CDECL __cdecl -#else #define HS_CDECL -#endif #include /** diff --git a/src/hs_compile.h b/src/hs_compile.h index 081d46387..5aa241886 100644 --- a/src/hs_compile.h +++ b/src/hs_compile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -748,10 +748,7 @@ hs_error_t HS_CDECL hs_free_compile_error(hs_compile_error_t *error); * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset * when a match is found. - * - HS_FLAG_COMBINATION - Parse the expression in logical combination - * syntax. - * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for - * the sub-expressions in logical combinations. + * - HS_FLAG_QUIET - This flag will be ignored. * * @param info * On success, a pointer to the pattern information will be returned in @@ -814,10 +811,7 @@ hs_error_t HS_CDECL hs_expression_info(const char *expression, * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset * when a match is found. - * - HS_FLAG_COMBINATION - Parse the expression in logical combination - * syntax. - * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for - * the sub-expressions in logical combinations. + * - HS_FLAG_QUIET - This flag will be ignored. * * @param ext * A pointer to a filled @ref hs_expr_ext_t structure that defines @@ -1034,6 +1028,15 @@ hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform); */ #define HS_CPU_FEATURES_AVX512 (1ULL << 3) +/** + * CPU features flag - Intel(R) Advanced Vector Extensions 512 + * Vector Byte Manipulation Instructions (Intel(R) AVX512VBMI) + * + * Setting this flag indicates that the target platform supports AVX512VBMI + * instructions. Using AVX512VBMI implies the use of AVX512. + */ +#define HS_CPU_FEATURES_AVX512VBMI (1ULL << 4) + /** @} */ /** @@ -1114,6 +1117,22 @@ hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform); */ #define HS_TUNE_FAMILY_GLM 8 +/** + * Tuning Parameter - Intel(R) microarchitecture code name Icelake + * + * This indicates that the compiled database should be tuned for the + * Icelake microarchitecture. 
+ */ +#define HS_TUNE_FAMILY_ICL 9 + +/** + * Tuning Parameter - Intel(R) microarchitecture code name Icelake Server + * + * This indicates that the compiled database should be tuned for the + * Icelake Server microarchitecture. + */ +#define HS_TUNE_FAMILY_ICX 10 + /** @} */ /** diff --git a/src/hs_internal.h b/src/hs_internal.h index adf07b22c..4eb5e157c 100644 --- a/src/hs_internal.h +++ b/src/hs_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Intel Corporation + * Copyright (c) 2019-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -80,7 +80,9 @@ extern "C" | HS_FLAG_PREFILTER \ | HS_FLAG_SINGLEMATCH \ | HS_FLAG_ALLOWEMPTY \ - | HS_FLAG_SOM_LEFTMOST) + | HS_FLAG_SOM_LEFTMOST \ + | HS_FLAG_COMBINATION \ + | HS_FLAG_QUIET) #ifdef __cplusplus } /* extern "C" */ diff --git a/src/hs_valid_platform.c b/src/hs_valid_platform.c index 59ad3f3ab..74a8fc1ec 100644 --- a/src/hs_valid_platform.c +++ b/src/hs_valid_platform.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2020-2023, VectorCamp PC * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,16 +27,33 @@ * POSSIBILITY OF SUCH DAMAGE. */ +#include "config.h" #include "hs_common.h" -#include "util/cpuid_flags.h" -#include "util/cpuid_inline.h" +#include "ue2common.h" +#if !defined(VS_SIMDE_BACKEND) +#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#include "util/arch/x86/cpuid_inline.h" +#elif defined(ARCH_AARCH64) +#include "util/arch/arm/cpuid_inline.h" +#endif +#endif HS_PUBLIC_API hs_error_t HS_CDECL hs_valid_platform(void) { /* Hyperscan requires SSSE3, anything else is a bonus */ +#if !defined(VS_SIMDE_BACKEND) && (defined(ARCH_IA32) || defined(ARCH_X86_64)) if (check_ssse3()) { return HS_SUCCESS; } else { return HS_ARCH_ERROR; } +#elif !defined(VS_SIMDE_BACKEND) && (defined(ARCH_ARM32) || defined(ARCH_AARCH64)) + if (check_neon()) { + return HS_SUCCESS; + } else { + return HS_ARCH_ERROR; + } +#elif defined(ARCH_PPC64EL) || defined(VS_SIMDE_BACKEND) + return HS_SUCCESS; +#endif } diff --git a/src/hs_version.h.in b/src/hs_version.h.in index 4412730dd..678422194 100644 --- a/src/hs_version.h.in +++ b/src/hs_version.h.in @@ -36,5 +36,9 @@ #define HS_VERSION_32BIT ((@HS_MAJOR_VERSION@ << 24) | (@HS_MINOR_VERSION@ << 16) | (@HS_PATCH_VERSION@ << 8) | 0) +#define HS_MAJOR @HS_MAJOR_VERSION@ +#define HS_MINOR @HS_MINOR_VERSION@ +#define HS_PATCH @HS_PATCH_VERSION@ + #endif /* HS_VERSION_H_C6428FAF8E3713 */ diff --git a/src/hwlm/hwlm.c b/src/hwlm/hwlm.c index 8cf585a98..e50deff71 100644 --- a/src/hwlm/hwlm.c +++ b/src/hwlm/hwlm.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,7 +39,7 @@ #include "nfa/accel.h" #include "nfa/shufti.h" #include "nfa/truffle.h" -#include "nfa/vermicelli.h" +#include "nfa/vermicelli.hpp" #include #define MIN_ACCEL_LEN_BLOCK 16 @@ -62,6 +63,11 @@ const u8 *run_hwlm_accel(const union AccelAux *aux, const u8 *ptr, DEBUG_PRINTF("double vermicelli-nocase for 0x%02hhx%02hhx\n", aux->dverm.c1, aux->dverm.c2); return vermicelliDoubleExec(aux->dverm.c1, aux->dverm.c2, 1, ptr, end); +#ifdef HAVE_SVE2 + case ACCEL_VERM16: + DEBUG_PRINTF("single 
vermicelli16\n"); + return vermicelli16Exec(aux->verm16.mask, ptr, end); +#endif // HAVE_SVE2 case ACCEL_SHUFTI: DEBUG_PRINTF("single shufti\n"); return shuftiExec(aux->shufti.lo, aux->shufti.hi, ptr, end); diff --git a/src/hwlm/hwlm_build.cpp b/src/hwlm/hwlm_build.cpp index 1b3328152..7837819ac 100644 --- a/src/hwlm/hwlm_build.cpp +++ b/src/hwlm/hwlm_build.cpp @@ -46,7 +46,6 @@ #include "fdr/teddy_engine_description.h" #include "util/compile_context.h" #include "util/compile_error.h" -#include "util/make_unique.h" #include "util/ue2string.h" #include @@ -58,24 +57,24 @@ using namespace std; namespace ue2 { HWLMProto::HWLMProto(u8 engType_in, vector lits_in) - : engType(engType_in), lits(move(lits_in)) {} + : engType(engType_in), lits(std::move(lits_in)) {} HWLMProto::HWLMProto(u8 engType_in, unique_ptr eng_in, vector lits_in, map> bucketToLits_in, bool make_small_in) - : engType(engType_in), fdrEng(move(eng_in)), lits(move(lits_in)), - bucketToLits(move(bucketToLits_in)), make_small(make_small_in) {} + : engType(engType_in), fdrEng(std::move(eng_in)), lits(std::move(lits_in)), + bucketToLits(std::move(bucketToLits_in)), make_small(make_small_in) {} HWLMProto::HWLMProto(u8 engType_in, unique_ptr eng_in, vector lits_in, map> bucketToLits_in, bool make_small_in) - : engType(engType_in), teddyEng(move(eng_in)), - lits(move(lits_in)), - bucketToLits(move(bucketToLits_in)), make_small(make_small_in) {} + : engType(engType_in), teddyEng(std::move(eng_in)), + lits(std::move(lits_in)), + bucketToLits(std::move(bucketToLits_in)), make_small(make_small_in) {} HWLMProto::~HWLMProto() {} @@ -133,14 +132,14 @@ bytecode_ptr hwlmBuild(const HWLMProto &proto, const CompileContext &cc, if (noodle) { engSize = noodle.size(); } - eng = move(noodle); + eng = std::move(noodle); } else { DEBUG_PRINTF("building a new deal\n"); auto fdr = fdrBuildTable(proto, cc.grey); if (fdr) { engSize = fdr.size(); } - eng = move(fdr); + eng = std::move(fdr); } if (!eng) { @@ -201,7 +200,7 @@ hwlmBuildProto(vector &lits, bool make_small, if (isNoodleable(lits, cc)) { DEBUG_PRINTF("build noodle table\n"); - proto = ue2::make_unique(HWLM_ENGINE_NOOD, lits); + proto = std::make_unique(HWLM_ENGINE_NOOD, lits); } else { DEBUG_PRINTF("building a new deal\n"); proto = fdrBuildProto(HWLM_ENGINE_FDR, lits, make_small, diff --git a/src/hwlm/noodle_engine.c b/src/hwlm/noodle_engine.c deleted file mode 100644 index d4f6902a2..000000000 --- a/src/hwlm/noodle_engine.c +++ /dev/null @@ -1,442 +0,0 @@ -/* - * Copyright (c) 2015-2017, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief Noodle literal matcher: runtime. - */ -#include "hwlm.h" -#include "noodle_engine.h" -#include "noodle_internal.h" -#include "scratch.h" -#include "ue2common.h" -#include "util/arch.h" -#include "util/bitutils.h" -#include "util/compare.h" -#include "util/intrinsics.h" -#include "util/join.h" -#include "util/masked_move.h" -#include "util/partial_store.h" -#include "util/simd_utils.h" - -#include -#include -#include - -/** \brief Noodle runtime context. */ -struct cb_info { - HWLMCallback cb; //!< callback function called on match - u32 id; //!< ID to pass to callback on match - struct hs_scratch *scratch; //!< scratch to pass to callback - size_t offsetAdj; //!< used in streaming mode -}; - -#if defined(HAVE_AVX512) -#define CHUNKSIZE 64 -#define MASK_TYPE m512 -#define Z_BITS 64 -#define Z_TYPE u64a -#elif defined(HAVE_AVX2) -#define CHUNKSIZE 32 -#define MASK_TYPE m256 -#define Z_BITS 32 -#define Z_TYPE u32 -#else -#define CHUNKSIZE 16 -#define MASK_TYPE m128 -#define Z_BITS 32 -#define Z_TYPE u32 -#endif - - -#define RETURN_IF_TERMINATED(x) \ - { \ - if ((x) == HWLM_TERMINATED) { \ - return HWLM_TERMINATED; \ - } \ - } - -#define SINGLE_ZSCAN() \ - do { \ - while (unlikely(z)) { \ - Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); \ - size_t matchPos = d - buf + pos; \ - DEBUG_PRINTF("match pos %zu\n", matchPos); \ - hwlmcb_rv_t rv = final(n, buf, len, 1, cbi, matchPos); \ - RETURN_IF_TERMINATED(rv); \ - } \ - } while (0) - -#define DOUBLE_ZSCAN() \ - do { \ - while (unlikely(z)) { \ - Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); \ - size_t matchPos = d - buf + pos - 1; \ - DEBUG_PRINTF("match pos %zu\n", matchPos); \ - hwlmcb_rv_t rv = final(n, buf, len, 0, cbi, matchPos); \ - RETURN_IF_TERMINATED(rv); \ - } \ - } while (0) - -static really_inline -u8 caseClear8(u8 x, bool noCase) { - return (u8)(noCase ? (x & (u8)0xdf) : x); -} - -// Make sure the rest of the string is there. The single character scanner -// is used only for single chars with case insensitivity used correctly, -// so it can go straight to the callback if we get this far. 
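/* Illustrative sketch, not part of the original source: the msk/cmp check in
 * final() below is a masked compare over the msk_len bytes that end at the
 * candidate match. For a caseless 3-byte key "abc", one plausible encoding
 * (byte values invented for illustration) would be:
 *
 *     u64a msk = 0x0000000000dfdfdfULL;   // three bytes of CASE_CLEAR (0xdf)
 *     u64a cmp = 0x0000000000434241ULL;   // case-cleared "abc", 'A' in the low byte
 *     u64a v   = partial_load_u64a(p, 3); // p = last 3 bytes of the candidate
 *     bool confirmed = ((v & msk) == cmp);
 */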
-static really_inline -hwlm_error_t final(const struct noodTable *n, const u8 *buf, UNUSED size_t len, - char single, const struct cb_info *cbi, size_t pos) { - if (single) { - if (n->msk_len == 1) { - goto match; - } - } - assert(len >= n->msk_len); - u64a v = - partial_load_u64a(buf + pos + n->key_offset - n->msk_len, n->msk_len); - DEBUG_PRINTF("v %016llx msk %016llx cmp %016llx\n", v, n->msk, n->cmp); - if ((v & n->msk) != n->cmp) { - /* mask didn't match */ - return HWLM_SUCCESS; - } - -match: - pos -= cbi->offsetAdj; - DEBUG_PRINTF("match @ %zu\n", pos + n->key_offset); - hwlmcb_rv_t rv = cbi->cb(pos + n->key_offset - 1, cbi->id, cbi->scratch); - if (rv == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATED; - } - return HWLM_SUCCESS; -} - -#if defined(HAVE_AVX512) -#define CHUNKSIZE 64 -#define MASK_TYPE m512 -#include "noodle_engine_avx512.c" -#elif defined(HAVE_AVX2) -#define CHUNKSIZE 32 -#define MASK_TYPE m256 -#include "noodle_engine_avx2.c" -#else -#define CHUNKSIZE 16 -#define MASK_TYPE m128 -#include "noodle_engine_sse.c" -#endif - -static really_inline -hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf, - size_t len, size_t start, bool noCase, - const struct cb_info *cbi) { - - const MASK_TYPE mask1 = getMask(n->key0, noCase); - const MASK_TYPE caseMask = getCaseMask(); - - size_t offset = start + n->msk_len - 1; - size_t end = len; - assert(offset < end); - -#if !defined(HAVE_AVX512) - hwlm_error_t rv; - - if (end - offset < CHUNKSIZE) { - rv = scanSingleShort(n, buf, len, noCase, caseMask, mask1, cbi, offset, - end); - return rv; - } - - if (end - offset == CHUNKSIZE) { - rv = scanSingleUnaligned(n, buf, len, offset, noCase, caseMask, mask1, - cbi, offset, end); - return rv; - } - - uintptr_t data = (uintptr_t)buf; - uintptr_t s2Start = ROUNDUP_N(data + offset, CHUNKSIZE) - data; - uintptr_t last = data + end; - uintptr_t s2End = ROUNDDOWN_N(last, CHUNKSIZE) - data; - uintptr_t s3Start = end - CHUNKSIZE; - - if (offset != s2Start) { - // first scan out to the fast scan starting point - DEBUG_PRINTF("stage 1: -> %zu\n", s2Start); - rv = scanSingleUnaligned(n, buf, len, offset, noCase, caseMask, mask1, - cbi, offset, s2Start); - RETURN_IF_TERMINATED(rv); - } - - if (likely(s2Start != s2End)) { - // scan as far as we can, bounded by the last point this key can - // possibly match - DEBUG_PRINTF("fast: ~ %zu -> %zu\n", s2Start, s2End); - rv = scanSingleFast(n, buf, len, noCase, caseMask, mask1, cbi, s2Start, - s2End); - RETURN_IF_TERMINATED(rv); - } - - // if we are done bail out - if (s2End == len) { - return HWLM_SUCCESS; - } - - DEBUG_PRINTF("stage 3: %zu -> %zu\n", s2End, len); - rv = scanSingleUnaligned(n, buf, len, s3Start, noCase, caseMask, mask1, cbi, - s2End, len); - - return rv; -#else // HAVE_AVX512 - return scanSingle512(n, buf, len, noCase, caseMask, mask1, cbi, offset, - end); -#endif -} - -static really_inline -hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf, - size_t len, size_t start, bool noCase, - const struct cb_info *cbi) { - // we stop scanning for the key-fragment when the rest of the key can't - // possibly fit in the remaining buffer - size_t end = len - n->key_offset + 2; - - // the first place the key can match - size_t offset = start + n->msk_len - n->key_offset; - - const MASK_TYPE caseMask = getCaseMask(); - const MASK_TYPE mask1 = getMask(n->key0, noCase); - const MASK_TYPE mask2 = getMask(n->key1, noCase); - -#if !defined(HAVE_AVX512) - hwlm_error_t rv; - - if (end - offset < CHUNKSIZE) { - rv = 
scanDoubleShort(n, buf, len, noCase, caseMask, mask1, mask2, cbi, - offset, end); - return rv; - } - if (end - offset == CHUNKSIZE) { - rv = scanDoubleUnaligned(n, buf, len, offset, noCase, caseMask, mask1, - mask2, cbi, offset, end); - return rv; - } - - uintptr_t data = (uintptr_t)buf; - uintptr_t s2Start = ROUNDUP_N(data + offset, CHUNKSIZE) - data; - uintptr_t s1End = s2Start + 1; - uintptr_t last = data + end; - uintptr_t s2End = ROUNDDOWN_N(last, CHUNKSIZE) - data; - uintptr_t s3Start = end - CHUNKSIZE; - uintptr_t off = offset; - - if (s2Start != off) { - // first scan out to the fast scan starting point plus one char past to - // catch the key on the overlap - DEBUG_PRINTF("stage 1: %zu -> %zu\n", off, s2Start); - rv = scanDoubleUnaligned(n, buf, len, offset, noCase, caseMask, mask1, - mask2, cbi, off, s1End); - RETURN_IF_TERMINATED(rv); - } - off = s1End; - - if (s2Start >= end) { - DEBUG_PRINTF("s2 == mL %zu\n", end); - return HWLM_SUCCESS; - } - - if (likely(s2Start != s2End)) { - // scan as far as we can, bounded by the last point this key can - // possibly match - DEBUG_PRINTF("fast: ~ %zu -> %zu\n", s2Start, s3Start); - rv = scanDoubleFast(n, buf, len, noCase, caseMask, mask1, mask2, cbi, - s2Start, s2End); - RETURN_IF_TERMINATED(rv); - off = s2End; - } - - // if there isn't enough data left to match the key, bail out - if (s2End == end) { - return HWLM_SUCCESS; - } - - DEBUG_PRINTF("stage 3: %zu -> %zu\n", s3Start, end); - rv = scanDoubleUnaligned(n, buf, len, s3Start, noCase, caseMask, mask1, - mask2, cbi, off, end); - - return rv; -#else // AVX512 - return scanDouble512(n, buf, len, noCase, caseMask, mask1, mask2, cbi, - offset, end); -#endif // AVX512 -} - - -static really_inline -hwlm_error_t scanSingleNoCase(const struct noodTable *n, const u8 *buf, - size_t len, size_t start, - const struct cb_info *cbi) { - return scanSingleMain(n, buf, len, start, 1, cbi); -} - -static really_inline -hwlm_error_t scanSingleCase(const struct noodTable *n, const u8 *buf, - size_t len, size_t start, - const struct cb_info *cbi) { - return scanSingleMain(n, buf, len, start, 0, cbi); -} - -// Single-character specialisation, used when keyLen = 1 -static really_inline -hwlm_error_t scanSingle(const struct noodTable *n, const u8 *buf, size_t len, - size_t start, bool noCase, const struct cb_info *cbi) { - if (!ourisalpha(n->key0)) { - noCase = 0; // force noCase off if we don't have an alphabetic char - } - - // kinda ugly, but this forces constant propagation - if (noCase) { - return scanSingleNoCase(n, buf, len, start, cbi); - } else { - return scanSingleCase(n, buf, len, start, cbi); - } -} - - -static really_inline -hwlm_error_t scanDoubleNoCase(const struct noodTable *n, const u8 *buf, - size_t len, size_t start, - const struct cb_info *cbi) { - return scanDoubleMain(n, buf, len, start, 1, cbi); -} - -static really_inline -hwlm_error_t scanDoubleCase(const struct noodTable *n, const u8 *buf, - size_t len, size_t start, - const struct cb_info *cbi) { - return scanDoubleMain(n, buf, len, start, 0, cbi); -} - - -static really_inline -hwlm_error_t scanDouble(const struct noodTable *n, const u8 *buf, size_t len, - size_t start, bool noCase, const struct cb_info *cbi) { - // kinda ugly, but this forces constant propagation - if (noCase) { - return scanDoubleNoCase(n, buf, len, start, cbi); - } else { - return scanDoubleCase(n, buf, len, start, cbi); - } -} - -// main entry point for the scan code -static really_inline -hwlm_error_t scan(const struct noodTable *n, const u8 *buf, size_t len, - 
size_t start, char single, bool noCase, - const struct cb_info *cbi) { - if (len - start < n->msk_len) { - // can't find string of length keyLen in a shorter buffer - return HWLM_SUCCESS; - } - - if (single) { - return scanSingle(n, buf, len, start, noCase, cbi); - } else { - return scanDouble(n, buf, len, start, noCase, cbi); - } -} - -/** \brief Block-mode scanner. */ -hwlm_error_t noodExec(const struct noodTable *n, const u8 *buf, size_t len, - size_t start, HWLMCallback cb, - struct hs_scratch *scratch) { - assert(n && buf); - - struct cb_info cbi = {cb, n->id, scratch, 0}; - DEBUG_PRINTF("nood scan of %zu bytes for %*s @ %p\n", len, n->msk_len, - (const char *)&n->cmp, buf); - - return scan(n, buf, len, start, n->single, n->nocase, &cbi); -} - -/** \brief Streaming-mode scanner. */ -hwlm_error_t noodExecStreaming(const struct noodTable *n, const u8 *hbuf, - size_t hlen, const u8 *buf, size_t len, - HWLMCallback cb, struct hs_scratch *scratch) { - assert(n); - - if (len + hlen < n->msk_len) { - DEBUG_PRINTF("not enough bytes for a match\n"); - return HWLM_SUCCESS; - } - - struct cb_info cbi = {cb, n->id, scratch, 0}; - DEBUG_PRINTF("nood scan of %zu bytes (%zu hlen) for %*s @ %p\n", len, hlen, - n->msk_len, (const char *)&n->cmp, buf); - - if (hlen && n->msk_len > 1) { - /* - * we have history, so build up a buffer from enough of the history - * buffer plus what we've been given to scan. Since this is relatively - * short, just check against msk+cmp per byte offset for matches. - */ - assert(hbuf); - u8 ALIGN_DIRECTIVE temp_buf[HWLM_LITERAL_MAX_LEN * 2]; - memset(temp_buf, 0, sizeof(temp_buf)); - - assert(n->msk_len); - size_t tl1 = MIN((size_t)n->msk_len - 1, hlen); - size_t tl2 = MIN((size_t)n->msk_len - 1, len); - - assert(tl1 + tl2 <= sizeof(temp_buf)); - assert(tl1 + tl2 >= n->msk_len); - assert(tl1 <= sizeof(u64a)); - assert(tl2 <= sizeof(u64a)); - DEBUG_PRINTF("using %zu bytes of hist and %zu bytes of buf\n", tl1, tl2); - - unaligned_store_u64a(temp_buf, - partial_load_u64a(hbuf + hlen - tl1, tl1)); - unaligned_store_u64a(temp_buf + tl1, partial_load_u64a(buf, tl2)); - - for (size_t i = 0; i <= tl1 + tl2 - n->msk_len; i++) { - u64a v = unaligned_load_u64a(temp_buf + i); - if ((v & n->msk) == n->cmp) { - size_t m_end = -tl1 + i + n->msk_len - 1; - DEBUG_PRINTF("match @ %zu (i %zu)\n", m_end, i); - hwlmcb_rv_t rv = cb(m_end, n->id, scratch); - if (rv == HWLM_TERMINATE_MATCHING) { - return HWLM_TERMINATED; - } - } - } - } - - assert(buf); - - cbi.offsetAdj = 0; - return scan(n, buf, len, 0, n->single, n->nocase, &cbi); -} diff --git a/src/hwlm/noodle_engine.cpp b/src/hwlm/noodle_engine.cpp new file mode 100644 index 000000000..33788ab42 --- /dev/null +++ b/src/hwlm/noodle_engine.cpp @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020, 2021, VectorCamp PC + * Copyright (c) 2021, Arm Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Noodle literal matcher: runtime. + */ +#include "hwlm.h" +#include "noodle_engine.h" +#include "noodle_internal.h" +#include "scratch.h" +#include "ue2common.h" +#include "util/arch.h" +#include "util/bitutils.h" +#include "util/compare.h" +#include "util/intrinsics.h" +#include "util/join.h" +#include "util/partial_store.h" +#include "util/simd_utils.h" + +#if defined(HAVE_AVX2) +#include "util/arch/x86/masked_move.h" +#endif + +#include +#include +#include + +/** \brief Noodle runtime context. */ +struct cb_info { + HWLMCallback cb; //!< callback function called on match + u32 id; //!< ID to pass to callback on match + struct hs_scratch *scratch; //!< scratch to pass to callback + size_t offsetAdj; //!< used in streaming mode +}; + + +#define RETURN_IF_TERMINATED(x) \ + { \ + if ((x) == HWLM_TERMINATED) { \ + return HWLM_TERMINATED; \ + } \ + } + +// Make sure the rest of the string is there. The single character scanner +// is used only for single chars with case insensitivity used correctly, +// so it can go straight to the callback if we get this far. +static really_inline +hwlm_error_t final(const struct noodTable *n, const u8 *buf, UNUSED size_t len, + bool needsConfirm, const struct cb_info *cbi, size_t pos) { + u64a v{0}; + if (!needsConfirm) { + goto match; + } + assert(len >= n->msk_len); + v = partial_load_u64a(buf + pos + n->key_offset - n->msk_len, n->msk_len); + DEBUG_PRINTF("v %016llx msk %016llx cmp %016llx\n", v, n->msk, n->cmp); + if ((v & n->msk) != n->cmp) { + /* mask didn't match */ + return HWLM_SUCCESS; + } + +match: + pos -= cbi->offsetAdj; + DEBUG_PRINTF("match @ %zu\n", pos + n->key_offset); + hwlmcb_rv_t rv = cbi->cb(pos + n->key_offset - 1, cbi->id, cbi->scratch); + if (rv == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATED; + } + return HWLM_SUCCESS; +} + +#ifdef HAVE_SVE2 +#include "noodle_engine_sve.hpp" +#else +#include "noodle_engine_simd.hpp" +#endif + +// main entry point for the scan code +static really_inline +hwlm_error_t scan(const struct noodTable *n, const u8 *buf, size_t len, + size_t start, char single, bool noCase, + const struct cb_info *cbi) { + if (len - start < n->msk_len) { + // can't find string of length keyLen in a shorter buffer + return HWLM_SUCCESS; + } + + if (single) { + return scanSingle(n, buf, len, start, noCase, cbi); + } else { + return scanDouble(n, buf, len, start, noCase, cbi); + } +} + +/** \brief Block-mode scanner. 
*/ +hwlm_error_t noodExec(const struct noodTable *n, const u8 *buf, size_t len, + size_t start, HWLMCallback cb, + struct hs_scratch *scratch) { + assert(n && buf); + + struct cb_info cbi = {cb, n->id, scratch, 0}; + DEBUG_PRINTF("nood scan of %zu bytes for %*s @ %p\n", len, n->msk_len, + (const char *)&n->cmp, buf); + + return scan(n, buf, len, start, n->single, n->nocase, &cbi); +} + +/** \brief Streaming-mode scanner. */ +hwlm_error_t noodExecStreaming(const struct noodTable *n, const u8 *hbuf, + size_t hlen, const u8 *buf, size_t len, + HWLMCallback cb, struct hs_scratch *scratch) { + assert(n); + + if (len + hlen < n->msk_len) { + DEBUG_PRINTF("not enough bytes for a match\n"); + return HWLM_SUCCESS; + } + + struct cb_info cbi = {cb, n->id, scratch, 0}; + DEBUG_PRINTF("nood scan of %zu bytes (%zu hlen) for %*s @ %p\n", len, hlen, + n->msk_len, (const char *)&n->cmp, buf); + + if (hlen && n->msk_len > 1) { + /* + * we have history, so build up a buffer from enough of the history + * buffer plus what we've been given to scan. Since this is relatively + * short, just check against msk+cmp per byte offset for matches. + */ + assert(hbuf); + u8 ALIGN_DIRECTIVE temp_buf[HWLM_LITERAL_MAX_LEN * 2]; + memset(temp_buf, 0, sizeof(temp_buf)); + + assert(n->msk_len); + size_t tl1 = MIN((size_t)n->msk_len - 1, hlen); + size_t tl2 = MIN((size_t)n->msk_len - 1, len); + + assert(tl1 + tl2 <= sizeof(temp_buf)); + assert(tl1 + tl2 >= n->msk_len); + assert(tl1 <= sizeof(u64a)); + assert(tl2 <= sizeof(u64a)); + DEBUG_PRINTF("using %zu bytes of hist and %zu bytes of buf\n", tl1, tl2); + + unaligned_store_u64a(temp_buf, + partial_load_u64a(hbuf + hlen - tl1, tl1)); + unaligned_store_u64a(temp_buf + tl1, partial_load_u64a(buf, tl2)); + + for (size_t i = 0; i <= tl1 + tl2 - n->msk_len; i++) { + u64a v = unaligned_load_u64a(temp_buf + i); + if ((v & n->msk) == n->cmp) { + size_t m_end = -tl1 + i + n->msk_len - 1; + DEBUG_PRINTF("match @ %zu (i %zu)\n", m_end, i); + hwlmcb_rv_t rv = cb(m_end, n->id, scratch); + if (rv == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATED; + } + } + } + } + + assert(buf); + + cbi.offsetAdj = 0; + return scan(n, buf, len, 0, n->single, n->nocase, &cbi); +} diff --git a/src/hwlm/noodle_engine_avx2.c b/src/hwlm/noodle_engine_avx2.c deleted file mode 100644 index 5edc646af..000000000 --- a/src/hwlm/noodle_engine_avx2.c +++ /dev/null @@ -1,233 +0,0 @@ -/* - * Copyright (c) 2015-2017, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/* noodle scan parts for AVX */ - -static really_inline m256 getMask(u8 c, bool noCase) { - u8 k = caseClear8(c, noCase); - return set32x8(k); -} - -static really_inline m256 getCaseMask(void) { - return set32x8(0xdf); -} - -static really_inline -hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf, - size_t len, size_t offset, bool noCase, - m256 caseMask, m256 mask1, - const struct cb_info *cbi, size_t start, - size_t end) { - const u8 *d = buf + offset; - DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); - const size_t l = end - start; - - m256 v = loadu256(d); - - if (noCase) { - v = and256(v, caseMask); - } - - u32 z = movemask256(eq256(mask1, v)); - - u32 buf_off = start - offset; - u32 mask = (u32)((u64a)(1ULL << l) - 1) << buf_off; - DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z); - - z &= mask; - - SINGLE_ZSCAN(); - - return HWLM_SUCCESS; -} - -static really_inline -hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf, - size_t len, size_t offset, bool noCase, - m256 caseMask, m256 mask1, m256 mask2, - const struct cb_info *cbi, size_t start, - size_t end) { - const u8 *d = buf + offset; - DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); - size_t l = end - start; - - m256 v = loadu256(d); - - if (noCase) { - v = and256(v, caseMask); - } - - u32 z0 = movemask256(eq256(mask1, v)); - u32 z1 = movemask256(eq256(mask2, v)); - u32 z = (z0 << 1) & z1; - - // mask out where we can't match - u32 buf_off = start - offset; - u32 mask = (u32)((u64a)(1ULL << l) - 1) << buf_off; - DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z); - z &= mask; - - DOUBLE_ZSCAN(); - - return HWLM_SUCCESS; -} - -// The short scan routine. 
It is used both to scan data up to an -// alignment boundary if needed and to finish off data that the aligned scan -// function can't handle (due to small/unaligned chunk at end) -static really_inline -hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, - size_t len, bool noCase, m256 caseMask, m256 mask1, - const struct cb_info *cbi, size_t start, - size_t end) { - const u8 *d = buf + start; - size_t l = end - start; - DEBUG_PRINTF("l %zu\n", l); - assert(l <= 32); - if (!l) { - return HWLM_SUCCESS; - } - - m256 v; - - if (l < 4) { - u8 *vp = (u8*)&v; - switch (l) { - case 3: vp[2] = d[2]; // fallthrough - case 2: vp[1] = d[1]; // fallthrough - case 1: vp[0] = d[0]; // fallthrough - } - } else { - v = masked_move256_len(d, l); - } - - if (noCase) { - v = and256(v, caseMask); - } - - // mask out where we can't match - u32 mask = (0xFFFFFFFF >> (32 - l)); - - u32 z = mask & movemask256(eq256(mask1, v)); - - SINGLE_ZSCAN(); - - return HWLM_SUCCESS; -} - -static really_inline -hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, - size_t len, bool noCase, m256 caseMask, m256 mask1, - m256 mask2, const struct cb_info *cbi, - size_t start, size_t end) { - const u8 *d = buf + start; - size_t l = end - start; - if (!l) { - return HWLM_SUCCESS; - } - assert(l <= 32); - m256 v; - - DEBUG_PRINTF("d %zu\n", d - buf); - if (l < 4) { - u8 *vp = (u8*)&v; - switch (l) { - case 3: vp[2] = d[2]; // fallthrough - case 2: vp[1] = d[1]; // fallthrough - case 1: vp[0] = d[0]; // fallthrough - } - } else { - v = masked_move256_len(d, l); - } - if (noCase) { - v = and256(v, caseMask); - } - - u32 z0 = movemask256(eq256(mask1, v)); - u32 z1 = movemask256(eq256(mask2, v)); - u32 z = (z0 << 1) & z1; - - // mask out where we can't match - u32 mask = (0xFFFFFFFF >> (32 - l)); - z &= mask; - - DOUBLE_ZSCAN(); - - return HWLM_SUCCESS; -} - -static really_inline -hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf, - size_t len, bool noCase, m256 caseMask, m256 mask1, - const struct cb_info *cbi, size_t start, - size_t end) { - const u8 *d = buf + start, *e = buf + end; - assert(d < e); - - for (; d < e; d += 32) { - m256 v = noCase ? and256(load256(d), caseMask) : load256(d); - - u32 z = movemask256(eq256(mask1, v)); - - // On large packet buffers, this prefetch appears to get us about 2%. - __builtin_prefetch(d + 128); - - SINGLE_ZSCAN(); - } - return HWLM_SUCCESS; -} - -static really_inline -hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf, - size_t len, bool noCase, m256 caseMask, m256 mask1, - m256 mask2, const struct cb_info *cbi, size_t start, - size_t end) { - const u8 *d = buf + start, *e = buf + end; - DEBUG_PRINTF("start %zu end %zu \n", start, end); - assert(d < e); - u32 lastz0 = 0; - - for (; d < e; d += 32) { - m256 v = noCase ? and256(load256(d), caseMask) : load256(d); - - // we have to pull the masks out of the AVX registers because we can't - // byte shift between the lanes - u32 z0 = movemask256(eq256(mask1, v)); - u32 z1 = movemask256(eq256(mask2, v)); - u32 z = (lastz0 | (z0 << 1)) & z1; - lastz0 = z0 >> 31; - - // On large packet buffers, this prefetch appears to get us about 2%. 
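/* Illustrative worked example, not part of the original source: the shifted
 * combine above lines the two per-byte match masks up on the second key byte.
 * For the two-byte key "ab" scanning the input "xxab":
 *
 *     z0 = 0b0100                   // 'a' found at offset 2
 *     z1 = 0b1000                   // 'b' found at offset 3
 *     z  = (z0 << 1) & z1 = 0b1000  // bit lands on the second key byte
 *
 * DOUBLE_ZSCAN() then subtracts one when turning the bit position into the
 * match offset, and lastz0 = z0 >> 31 carries a trailing 'a' into the next
 * 32-byte block so keys straddling a block boundary are still found.
 */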
- __builtin_prefetch(d + 128); - - DOUBLE_ZSCAN(); - - } - return HWLM_SUCCESS; -} - diff --git a/src/hwlm/noodle_engine_avx512.c b/src/hwlm/noodle_engine_avx512.c deleted file mode 100644 index 8cac1b15c..000000000 --- a/src/hwlm/noodle_engine_avx512.c +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (c) 2017, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/* noodle scan parts for AVX512 */ - -static really_inline -m512 getMask(u8 c, bool noCase) { - u8 k = caseClear8(c, noCase); - return set64x8(k); -} - -static really_inline -m512 getCaseMask(void) { - return set64x8(CASE_CLEAR); -} - -// The short scan routine. 
It is used both to scan data up to an -// alignment boundary if needed and to finish off data that the aligned scan -// function can't handle (due to small/unaligned chunk at end) -static really_inline -hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, - size_t len, bool noCase, m512 caseMask, m512 mask1, - const struct cb_info *cbi, size_t start, - size_t end) { - const u8 *d = buf + start; - ptrdiff_t scan_len = end - start; - DEBUG_PRINTF("scan_len %zu\n", scan_len); - assert(scan_len <= 64); - if (!scan_len) { - return HWLM_SUCCESS; - } - - __mmask64 k = (~0ULL) >> (64 - scan_len); - DEBUG_PRINTF("load mask 0x%016llx\n", k); - - m512 v = loadu_maskz_m512(k, d); - - if (noCase) { - v = and512(v, caseMask); - } - - // reuse the load mask to indicate valid bytes - u64a z = masked_eq512mask(k, mask1, v); - - SINGLE_ZSCAN(); - - return HWLM_SUCCESS; -} - -static really_inline -hwlm_error_t scanSingle512(const struct noodTable *n, const u8 *buf, size_t len, - bool noCase, m512 caseMask, m512 mask1, - const struct cb_info *cbi, size_t start, - size_t end) { - const u8 *d = buf + start; - const u8 *e = buf + end; - DEBUG_PRINTF("start %p end %p \n", d, e); - assert(d < e); - if (d + 64 >= e) { - goto tail; - } - - // peel off first part to cacheline boundary - const u8 *d1 = ROUNDUP_PTR(d, 64); - if (scanSingleShort(n, buf, len, noCase, caseMask, mask1, cbi, start, - d1 - buf) == HWLM_TERMINATED) { - return HWLM_TERMINATED; - } - d = d1; - - for (; d + 64 < e; d += 64) { - DEBUG_PRINTF("d %p e %p \n", d, e); - m512 v = noCase ? and512(load512(d), caseMask) : load512(d); - - u64a z = eq512mask(mask1, v); - __builtin_prefetch(d + 128); - - SINGLE_ZSCAN(); - } - -tail: - DEBUG_PRINTF("d %p e %p \n", d, e); - // finish off tail - - return scanSingleShort(n, buf, len, noCase, caseMask, mask1, cbi, d - buf, - e - buf); -} - -static really_inline -hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, - size_t len, bool noCase, m512 caseMask, m512 mask1, - m512 mask2, const struct cb_info *cbi, - u64a *lastz0, size_t start, size_t end) { - DEBUG_PRINTF("start %zu end %zu last 0x%016llx\n", start, end, *lastz0); - const u8 *d = buf + start; - ptrdiff_t scan_len = end - start; - if (!scan_len) { - return HWLM_SUCCESS; - } - assert(scan_len <= 64); - __mmask64 k = (~0ULL) >> (64 - scan_len); - DEBUG_PRINTF("load mask 0x%016llx scan_len %zu\n", k, scan_len); - - m512 v = loadu_maskz_m512(k, d); - if (noCase) { - v = and512(v, caseMask); - } - - u64a z0 = masked_eq512mask(k, mask1, v); - u64a z1 = masked_eq512mask(k, mask2, v); - u64a z = (*lastz0 | (z0 << 1)) & z1; - DEBUG_PRINTF("z 0x%016llx\n", z); - - DOUBLE_ZSCAN(); - *lastz0 = z0 >> (scan_len - 1); - return HWLM_SUCCESS; -} - -static really_inline -hwlm_error_t scanDouble512(const struct noodTable *n, const u8 *buf, size_t len, - bool noCase, m512 caseMask, m512 mask1, m512 mask2, - const struct cb_info *cbi, size_t start, - size_t end) { - const u8 *d = buf + start; - const u8 *e = buf + end; - u64a lastz0 = 0; - DEBUG_PRINTF("start %zu end %zu \n", start, end); - assert(d < e); - if (d + 64 >= e) { - goto tail; - } - - // peel off first part to cacheline boundary - const u8 *d1 = ROUNDUP_PTR(d, 64); - if (scanDoubleShort(n, buf, len, noCase, caseMask, mask1, mask2, cbi, - &lastz0, start, d1 - buf) == HWLM_TERMINATED) { - return HWLM_TERMINATED; - } - d = d1; - - for (; d + 64 < e; d += 64) { - DEBUG_PRINTF("d %p e %p 0x%016llx\n", d, e, lastz0); - m512 v = noCase ? 
and512(load512(d), caseMask) : load512(d); - - /* we have to pull the masks out of the AVX registers because we can't - byte shift between the lanes */ - u64a z0 = eq512mask(mask1, v); - u64a z1 = eq512mask(mask2, v); - u64a z = (lastz0 | (z0 << 1)) & z1; - lastz0 = z0 >> 63; - - // On large packet buffers, this prefetch appears to get us about 2%. - __builtin_prefetch(d + 256); - - DEBUG_PRINTF("z 0x%016llx\n", z); - - DOUBLE_ZSCAN(); - } - -tail: - DEBUG_PRINTF("d %p e %p off %zu \n", d, e, d - buf); - // finish off tail - - return scanDoubleShort(n, buf, len, noCase, caseMask, mask1, mask2, cbi, - &lastz0, d - buf, end); -} diff --git a/src/hwlm/noodle_engine_simd.hpp b/src/hwlm/noodle_engine_simd.hpp new file mode 100644 index 000000000..23827873f --- /dev/null +++ b/src/hwlm/noodle_engine_simd.hpp @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2017, Intel Corporation + * Copyright (c) 2020-2023, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* SIMD engine agnostic noodle scan parts */ + +#include "util/supervector/supervector.hpp" +#include "util/supervector/casemask.hpp" + +template +static really_really_inline +hwlm_error_t single_zscan(const struct noodTable *n,const u8 *d, const u8 *buf, + typename SuperVector::comparemask_type z, size_t len, const struct cb_info *cbi) { + while (unlikely(z)) { + typename SuperVector::comparemask_type pos = SuperVector::findLSB(z); + size_t matchPos = d - buf + pos; + DEBUG_PRINTF("match pos %zu\n", matchPos); + hwlmcb_rv_t rv = final(n, buf, len, n->msk_len != 1, cbi, matchPos); + RETURN_IF_TERMINATED(rv); + } + return HWLM_SUCCESS; +} + +template +static really_really_inline +hwlm_error_t double_zscan(const struct noodTable *n,const u8 *d, const u8 *buf, + typename SuperVector::comparemask_type z, size_t len, const struct cb_info *cbi) { + while (unlikely(z)) { + typename SuperVector::comparemask_type pos = SuperVector::findLSB(z); + size_t matchPos = d - buf + pos - 1; + DEBUG_PRINTF("match pos %zu\n", matchPos); + hwlmcb_rv_t rv = final(n, buf, len, true, cbi, matchPos); + RETURN_IF_TERMINATED(rv); + } + return HWLM_SUCCESS; +} + +template +static really_inline +hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf, + size_t len, size_t offset, + SuperVector caseMask, SuperVector mask1, + const struct cb_info *cbi) { + size_t start = offset + n->msk_len - 1; + + const u8 *d = buf + start; + const u8 *buf_end = buf + len; + assert(d < buf_end); + + DEBUG_PRINTF("noodle %p start %zu len %zu\n", buf, start, buf_end - buf); + DEBUG_PRINTF("b %s\n", buf); + DEBUG_PRINTF("start %p end %p \n", d, buf_end); + + __builtin_prefetch(d + 16*64); + assert(d < buf_end); + if (d + S <= buf_end) { + // Reach vector aligned boundaries + DEBUG_PRINTF("until aligned %p, S: %d \n", ROUNDUP_PTR(d, S), S); + if (!ISALIGNED_N(d, S)) { + const u8 *d0 = ROUNDDOWN_PTR(d, S); + DEBUG_PRINTF("d - d0: %ld \n", d - d0); +#if defined(HAVE_MASKED_LOADS) + uint8_t l = d - d0; + typename SuperVector::comparemask_type mask = ~SuperVector::load_mask(l); + SuperVector chars = SuperVector::loadu_maskz(d0, mask) & caseMask; + typename SuperVector::comparemask_type z = mask1.eqmask(chars); + DEBUG_PRINTF("mask: %08llx\n", mask); + hwlm_error_t rv = single_zscan(n, d0, buf, z, len, cbi); +#else + uint8_t l = d0 + S - d; + DEBUG_PRINTF("l: %d \n", l); + SuperVector chars = SuperVector::loadu_maskz(d, l) & caseMask; + chars.print8("chars"); + typename SuperVector::comparemask_type z = mask1.eqmask(chars); + DEBUG_PRINTF("z: %08llx\n", (u64a) z); + z = SuperVector::iteration_mask(z); + DEBUG_PRINTF("z: %08llx\n", (u64a) z); + + hwlm_error_t rv = single_zscan(n, d, buf, z, len, cbi); +#endif + chars.print32("chars"); + DEBUG_PRINTF("z: %08llx\n", (u64a) z); + + RETURN_IF_TERMINATED(rv); + d = d0 + S; + } + + while(d + S <= buf_end) { + __builtin_prefetch(d + 16*64); + DEBUG_PRINTF("d %p \n", d); + + SuperVector v = SuperVector::load(d) & caseMask; + typename SuperVector::comparemask_type z = mask1.eqmask(v); + z = SuperVector::iteration_mask(z); + + hwlm_error_t rv = single_zscan(n, d, buf, z, len, cbi); + RETURN_IF_TERMINATED(rv); + d += S; + } + } + + DEBUG_PRINTF("d %p e %p \n", d, buf_end); + // finish off tail + + if (d != buf_end) { + uint8_t l = buf_end - d; + SuperVector chars = SuperVector::loadu_maskz(d, l) & caseMask; + typename SuperVector::comparemask_type z = mask1.eqmask(chars); + z = SuperVector::iteration_mask(z); + + hwlm_error_t rv = single_zscan(n, d, buf, z, len, cbi); + 
RETURN_IF_TERMINATED(rv); + } + + return HWLM_SUCCESS; +} + +template +static really_inline +hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf, + size_t len, size_t offset, + SuperVector caseMask, SuperVector mask1, SuperVector mask2, + const struct cb_info *cbi) { + size_t end = len - n->key_offset + 2; + size_t start = offset + n->msk_len - n->key_offset; + + const u8 *d = buf + start; + const u8 *buf_end = buf + end; + assert(d < buf_end); + + DEBUG_PRINTF("noodle %p start %zu len %zu\n", buf, start, buf_end - buf); + DEBUG_PRINTF("b %s\n", buf); + DEBUG_PRINTF("start %p end %p \n", d, buf_end); + + typename SuperVector::comparemask_type lastz1{0}; + + __builtin_prefetch(d + 16*64); + assert(d < buf_end); + if (d + S <= buf_end) { + // Reach vector aligned boundaries + DEBUG_PRINTF("until aligned %p, S: %d \n", ROUNDUP_PTR(d, S), S); + if (!ISALIGNED_N(d, S)) { + const u8 *d0 = ROUNDDOWN_PTR(d, S); +#if defined(HAVE_MASKED_LOADS) + uint8_t l = d - d0; + typename SuperVector::comparemask_type mask = ~SuperVector::load_mask(l); + SuperVector chars = SuperVector::loadu_maskz(d0, mask) & caseMask; + typename SuperVector::comparemask_type z1 = mask1.eqmask(chars); + typename SuperVector::comparemask_type z2 = mask2.eqmask(chars); + typename SuperVector::comparemask_type z = (z1 << SuperVector::mask_width()) & z2; + z = SuperVector::iteration_mask(z); + lastz1 = z1 >> (S - 1); + + DEBUG_PRINTF("mask: %08llx\n", mask); + hwlm_error_t rv = double_zscan(n, d0, buf, z, len, cbi); +#else + uint8_t l = d0 + S - d; + SuperVector chars = SuperVector::loadu_maskz(d, l) & caseMask; + chars.print8("chars"); + typename SuperVector::comparemask_type z1 = mask1.eqmask(chars); + typename SuperVector::comparemask_type z2 = mask2.eqmask(chars); + typename SuperVector::comparemask_type z = (z1 << SuperVector::mask_width()) & z2; + z = SuperVector::iteration_mask(z); + + hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi); + lastz1 = z1 >> (l - 1); +#endif + RETURN_IF_TERMINATED(rv); + d = d0 + S; + } + + while(d + S <= buf_end) { + __builtin_prefetch(d + 16*64); + DEBUG_PRINTF("d %p \n", d); + + SuperVector chars = SuperVector::load(d) & caseMask; + typename SuperVector::comparemask_type z1 = mask1.eqmask(chars); + typename SuperVector::comparemask_type z2 = mask2.eqmask(chars); + typename SuperVector::comparemask_type z = (z1 << SuperVector::mask_width() | lastz1) & z2; + lastz1 = z1 >> (S - 1); + z = SuperVector::iteration_mask(z); + + hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi); + RETURN_IF_TERMINATED(rv); + d += S; + } + } + + DEBUG_PRINTF("d %p e %p \n", d, buf_end); + // finish off tail + if (d != buf_end) { + uint8_t l = buf_end - d; + SuperVector chars = SuperVector::loadu_maskz(d, l) & caseMask; + typename SuperVector::comparemask_type z1 = mask1.eqmask(chars); + typename SuperVector::comparemask_type z2 = mask2.eqmask(chars); + typename SuperVector::comparemask_type z = (z1 << SuperVector::mask_width() | lastz1) & z2; + z = SuperVector::iteration_mask(z); + + hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi); + RETURN_IF_TERMINATED(rv); + } + + return HWLM_SUCCESS; +} + +// Single-character specialisation, used when keyLen = 1 +static really_inline +hwlm_error_t scanSingle(const struct noodTable *n, const u8 *buf, size_t len, + size_t start, bool noCase, const struct cb_info *cbi) { + /* + * TODO: Investigate adding scalar case for smaller sizes + if (len < VECTORSIZE) { + return scanSingleSlow(n, buf, len, start, noCase, n->key0, cbi); + }*/ + + if 
(!ourisalpha(n->key0)) { + noCase = 0; // force noCase off if we don't have an alphabetic char + } + + const SuperVector caseMask{noCase ? getCaseMask() : SuperVector::Ones()}; + const SuperVector mask1{getMask(n->key0, noCase)}; + + return scanSingleMain(n, buf, len, start, caseMask, mask1, cbi); +} + + +static really_inline +hwlm_error_t scanDouble(const struct noodTable *n, const u8 *buf, size_t len, + size_t start, bool noCase, const struct cb_info *cbi) { + + const SuperVector caseMask{noCase ? getCaseMask() : SuperVector::Ones()}; + const SuperVector mask1{getMask(n->key0, noCase)}; + const SuperVector mask2{getMask(n->key1, noCase)}; + + return scanDoubleMain(n, buf, len, start, caseMask, mask1, mask2, cbi); +} diff --git a/src/hwlm/noodle_engine_sse.c b/src/hwlm/noodle_engine_sse.c deleted file mode 100644 index 7cd53d7ce..000000000 --- a/src/hwlm/noodle_engine_sse.c +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright (c) 2015-2017, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -/* noodle scan parts for SSE */ - -static really_inline m128 getMask(u8 c, bool noCase) { - u8 k = caseClear8(c, noCase); - return set16x8(k); -} - -static really_inline m128 getCaseMask(void) { - return set16x8(0xdf); -} - -static really_inline -hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, - size_t len, bool noCase, m128 caseMask, m128 mask1, - const struct cb_info *cbi, size_t start, - size_t end) { - const u8 *d = buf + start; - size_t l = end - start; - DEBUG_PRINTF("l %zu\n", l); - assert(l <= 16); - if (!l) { - return HWLM_SUCCESS; - } - m128 v = zeroes128(); - // we don't have a clever way of doing this move yet - memcpy(&v, d, l); - if (noCase) { - v = and128(v, caseMask); - } - - // mask out where we can't match - u32 mask = (0xFFFF >> (16 - l)); - - u32 z = mask & movemask128(eq128(mask1, v)); - - SINGLE_ZSCAN(); - - return HWLM_SUCCESS; -} - -static really_inline -hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf, - size_t len, size_t offset, bool noCase, - m128 caseMask, m128 mask1, - const struct cb_info *cbi, size_t start, - size_t end) { - const u8 *d = buf + offset; - DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); - const size_t l = end - start; - - m128 v = loadu128(d); - - if (noCase) { - v = and128(v, caseMask); - } - - u32 buf_off = start - offset; - u32 mask = ((1 << l) - 1) << buf_off; - - u32 z = mask & movemask128(eq128(mask1, v)); - - DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z); - - z &= mask; - - SINGLE_ZSCAN(); - - return HWLM_SUCCESS; -} - -static really_inline -hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, - size_t len, bool noCase, m128 caseMask, m128 mask1, - m128 mask2, const struct cb_info *cbi, - size_t start, size_t end) { - const u8 *d = buf + start; - size_t l = end - start; - if (!l) { - return HWLM_SUCCESS; - } - assert(l <= 32); - - DEBUG_PRINTF("d %zu\n", d - buf); - m128 v = zeroes128(); - memcpy(&v, d, l); - if (noCase) { - v = and128(v, caseMask); - } - - u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1), - eq128(mask2, v))); - - // mask out where we can't match - u32 mask = (0xFFFF >> (16 - l)); - z &= mask; - - DOUBLE_ZSCAN(); - - return HWLM_SUCCESS; -} - -static really_inline -hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf, - size_t len, size_t offset, bool noCase, - m128 caseMask, m128 mask1, m128 mask2, - const struct cb_info *cbi, size_t start, - size_t end) { - const u8 *d = buf + offset; - DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); - size_t l = end - start; - - m128 v = loadu128(d); - - if (noCase) { - v = and128(v, caseMask); - } - - u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1), - eq128(mask2, v))); - - // mask out where we can't match - u32 buf_off = start - offset; - u32 mask = ((1 << l) - 1) << buf_off; - DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z); - z &= mask; - - DOUBLE_ZSCAN(); - - return HWLM_SUCCESS; -} - -static really_inline -hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf, - size_t len, bool noCase, m128 caseMask, m128 mask1, - const struct cb_info *cbi, size_t start, - size_t end) { - const u8 *d = buf + start, *e = buf + end; - assert(d < e); - - for (; d < e; d += 16) { - m128 v = noCase ? and128(load128(d), caseMask) : load128(d); - - u32 z = movemask128(eq128(mask1, v)); - - // On large packet buffers, this prefetch appears to get us about 2%. 
- __builtin_prefetch(d + 128); - - SINGLE_ZSCAN(); - } - return HWLM_SUCCESS; -} - -static really_inline -hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf, - size_t len, bool noCase, m128 caseMask, m128 mask1, - m128 mask2, const struct cb_info *cbi, size_t start, - size_t end) { - const u8 *d = buf + start, *e = buf + end; - assert(d < e); - m128 lastz1 = zeroes128(); - - for (; d < e; d += 16) { - m128 v = noCase ? and128(load128(d), caseMask) : load128(d); - m128 z1 = eq128(mask1, v); - m128 z2 = eq128(mask2, v); - u32 z = movemask128(and128(palignr(z1, lastz1, 15), z2)); - lastz1 = z1; - - // On large packet buffers, this prefetch appears to get us about 2%. - __builtin_prefetch(d + 128); - DEBUG_PRINTF("z 0x%08x\n", z); - DOUBLE_ZSCAN(); - } - return HWLM_SUCCESS; -} diff --git a/src/hwlm/noodle_engine_sve.hpp b/src/hwlm/noodle_engine_sve.hpp new file mode 100644 index 000000000..cc2d77002 --- /dev/null +++ b/src/hwlm/noodle_engine_sve.hpp @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2021, Arm Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +static really_inline +hwlm_error_t checkMatched(const struct noodTable *n, const u8 *buf, size_t len, + const struct cb_info *cbi, const u8 *d, + svbool_t matched, bool needsConfirm) { + assert(d >= buf); + size_t basePos = d - buf; + svbool_t next_match = svpnext_b8(matched, svpfalse()); + do { + svbool_t brk = svbrkb_z(svptrue_b8(), next_match); + size_t matchPos = basePos + svcntp_b8(svptrue_b8(), brk); + DEBUG_PRINTF("match pos %zu\n", matchPos); + assert(matchPos < len); + hwlmcb_rv_t rv = final(n, buf, len, needsConfirm, cbi, matchPos); + RETURN_IF_TERMINATED(rv); + next_match = svpnext_b8(matched, next_match); + } while (unlikely(svptest_any(svptrue_b8(), next_match))); + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t singleCheckMatched(const struct noodTable *n, const u8 *buf, + size_t len, const struct cb_info *cbi, + const u8 *d, svbool_t matched) { + if (unlikely(svptest_any(svptrue_b8(), matched))) { + hwlmcb_rv_t rv = checkMatched(n, buf, len, cbi, d, matched, + n->msk_len != 1); + RETURN_IF_TERMINATED(rv); + } + return HWLM_SUCCESS; +} + +static really_inline +svbool_t singleMatched(svuint8_t chars, const u8 *d, svbool_t pg) { + return svmatch(pg, svld1_u8(pg, d), chars); +} + +static really_inline +hwlm_error_t scanSingleOnce(const struct noodTable *n, const u8 *buf, + size_t len, const struct cb_info *cbi, + svuint8_t chars, const u8 *d, const u8 *e) { + DEBUG_PRINTF("start %p end %p\n", d, e); + assert(d < e); + assert(d >= buf); + DEBUG_PRINTF("l = %td\n", e - d); + svbool_t pg = svwhilelt_b8_s64(0, e - d); + svbool_t matched = singleMatched(chars, d, pg); + return singleCheckMatched(n, buf, len, cbi, d, matched); +} + +static really_inline +hwlm_error_t scanSingleLoop(const struct noodTable *n, const u8 *buf, + size_t len, const struct cb_info *cbi, + svuint8_t chars, const u8 *d, const u8 *e) { + assert(d < e); + assert(d >= buf); + size_t loops = (e - d) / svcntb(); + DEBUG_PRINTF("loops %zu \n", loops); + assert(d + (loops * svcntb()) <= e); + + for (size_t i = 0; i < loops; i++, d += svcntb()) { + DEBUG_PRINTF("d %p \n", d); + svbool_t matched = singleMatched(chars, d, svptrue_b8()); + hwlmcb_rv_t rv = singleCheckMatched(n, buf, len, cbi, d, matched); + RETURN_IF_TERMINATED(rv); + } + DEBUG_PRINTF("d %p e %p \n", d, e); + return d == e ? 
HWLM_SUCCESS + : scanSingleOnce(n, buf, len, cbi, chars, d, e); +} + +static really_inline +hwlm_error_t scanSingle(const struct noodTable *n, const u8 *buf, size_t len, + size_t offset, bool noCase, const struct cb_info *cbi) { + if (!ourisalpha(n->key0)) { + noCase = false; // force noCase off if we don't have an alphabetic char + } + + size_t start = offset + n->msk_len - 1; + const u8 *d = buf + start; + const u8 *e = buf + len; + DEBUG_PRINTF("start %p end %p \n", d, e); + assert(d < e); + assert(d >= buf); + + svuint8_t chars = getCharMaskSingle(n->key0, noCase); + + size_t scan_len = e - d; + if (scan_len <= svcntb()) { + return scanSingleOnce(n, buf, len, cbi, chars, d, e); + } + // peel off first part to align to the vector size + const u8 *d1 = ROUNDUP_PTR(d, svcntb_pat(SV_POW2)); + if (d != d1) { + DEBUG_PRINTF("until aligned %p \n", d1); + hwlmcb_rv_t rv = scanSingleOnce(n, buf, len, cbi, chars, d, d1); + RETURN_IF_TERMINATED(rv); + } + return scanSingleLoop(n, buf, len, cbi, chars, d1, e); +} + +static really_inline +hwlm_error_t doubleCheckMatched(const struct noodTable *n, const u8 *buf, + size_t len, const struct cb_info *cbi, + const u8 *d, svbool_t matched, + svbool_t matched_rot, svbool_t any) { + if (unlikely(svptest_any(svptrue_b8(), any))) { + // Project predicate onto vector. + svuint8_t matched_vec = svdup_u8_z(matched, 1); + // Shift vector to right by one and project back to the predicate. + matched = svcmpeq_n_u8(svptrue_b8(), svinsr_n_u8(matched_vec, 0), 1); + matched = svorr_z(svptrue_b8(), matched, matched_rot); + // d - 1 won't underflow as the first position in buf has been dealt + // with meaning that d > buf + assert(d > buf); + hwlmcb_rv_t rv = checkMatched(n, buf, len, cbi, d - 1, matched, + n->msk_len != 2); + RETURN_IF_TERMINATED(rv); + } + return HWLM_SUCCESS; +} + +static really_inline +svbool_t doubleMatched(svuint16_t chars, const u8 *d, + svbool_t pg, svbool_t pg_rot, + svbool_t * const matched, svbool_t * const matched_rot) { + svuint16_t vec = svreinterpret_u16(svld1_u8(pg, d)); + // d - 1 won't underflow as the first position in buf has been dealt + // with meaning that d > buf + svuint16_t vec_rot = svreinterpret_u16(svld1_u8(pg_rot, d - 1)); + *matched = svmatch(pg, vec, chars); + *matched_rot = svmatch(pg_rot, vec_rot, chars); + return svorr_z(svptrue_b8(), *matched, *matched_rot); +} + +static really_inline +hwlm_error_t scanDoubleOnce(const struct noodTable *n, const u8 *buf, + size_t len, const struct cb_info *cbi, + svuint8_t chars, const u8 *d, const u8 *e) { + DEBUG_PRINTF("start %p end %p\n", d, e); + assert(d < e); + assert(d > buf); + svbool_t pg = svwhilelt_b8_s64(0, e - d); + svbool_t pg_rot = svwhilelt_b8_s64(0, e - d + 1); + svbool_t matched, matched_rot; + svbool_t any = doubleMatched(svreinterpret_u16(chars), d, pg, pg_rot, &matched, &matched_rot); + return doubleCheckMatched(n, buf, len, cbi, d, matched, matched_rot, any); +} + +static really_inline +hwlm_error_t scanDoubleLoop(const struct noodTable *n, const u8 *buf, + size_t len, const struct cb_info *cbi, + svuint8_t chars, const u8 *d, const u8 *e) { + assert(d < e); + assert(d > buf); + size_t loops = (e - d) / svcntb(); + DEBUG_PRINTF("loops %zu \n", loops); + assert(d + (loops * svcntb()) <= e); + + for (size_t i = 0; i < loops; i++, d += svcntb()) { + DEBUG_PRINTF("d %p \n", d); + svbool_t matched, matched_rot; + svbool_t any = doubleMatched(svreinterpret_u16(chars), d, svptrue_b8(), svptrue_b8(), + &matched, &matched_rot); + hwlm_error_t rv = doubleCheckMatched(n, 
buf, len, cbi, d, + matched, matched_rot, any); + RETURN_IF_TERMINATED(rv); + } + DEBUG_PRINTF("d %p e %p \n", d, e); + + return d == e ? HWLM_SUCCESS + : scanDoubleOnce(n, buf, len, cbi, chars, d, e); +} + +static really_inline +hwlm_error_t scanDouble(const struct noodTable *n, const u8 *buf, size_t len, + size_t offset, bool noCase, const struct cb_info *cbi) { + // we stop scanning for the key-fragment when the rest of the key can't + // possibly fit in the remaining buffer + size_t end = len - n->key_offset + 2; + + size_t start = offset + n->msk_len - n->key_offset; + + const u8 *d = buf + start; + const u8 *e = buf + end; + DEBUG_PRINTF("start %p end %p \n", d, e); + assert(d < e); + assert(d >= buf); + + size_t scan_len = e - d; + if (scan_len < 2) { + return HWLM_SUCCESS; + } + ++d; + + svuint8_t chars = svreinterpret_u8(getCharMaskDouble(n->key0, n->key1, noCase)); + + if (scan_len <= svcntb()) { + return scanDoubleOnce(n, buf, len, cbi, chars, d, e); + } + // peel off first part to align to the vector size + const u8 *d1 = ROUNDUP_PTR(d, svcntb_pat(SV_POW2)); + if (d != d1) { + DEBUG_PRINTF("until aligned %p \n", d1); + hwlmcb_rv_t rv = scanDoubleOnce(n, buf, len, cbi, chars, + d, d1); + RETURN_IF_TERMINATED(rv); + } + return scanDoubleLoop(n, buf, len, cbi, chars, d1, e); +} diff --git a/src/nfa/accel.c b/src/nfa/accel.c index 2bc60945f..7661b7a79 100644 --- a/src/nfa/accel.c +++ b/src/nfa/accel.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -29,7 +30,7 @@ #include "accel.h" #include "shufti.h" #include "truffle.h" -#include "vermicelli.h" +#include "vermicelli.hpp" #include "ue2common.h" const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) { @@ -81,6 +82,39 @@ const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) { c_end - 1); break; +#ifdef HAVE_SVE2 + case ACCEL_VERM16: + DEBUG_PRINTF("accel verm16 %p %p\n", c, c_end); + if (c_end - c < 16) { + return c; + } + + rv = vermicelli16Exec(accel->verm16.mask, c, c_end); + break; + + case ACCEL_DVERM16: + DEBUG_PRINTF("accel dverm16 %p %p\n", c, c_end); + if (c_end - c < 18) { + return c; + } + + /* need to stop one early to get an accurate end state */ + rv = vermicelliDouble16Exec(accel->dverm16.mask, accel->dverm16.firsts, + c, c_end - 1); + break; + + case ACCEL_DVERM16_MASKED: + DEBUG_PRINTF("accel dverm16 masked %p %p\n", c, c_end); + if (c_end - c < 18) { + return c; + } + + /* need to stop one early to get an accurate end state */ + rv = vermicelliDoubleMasked16Exec(accel->mdverm16.mask, accel->mdverm16.c1, + accel->mdverm16.m1, c, c_end - 1); + break; +#endif // HAVE_SVE2 + case ACCEL_DVERM_MASKED: DEBUG_PRINTF("accel dverm masked %p %p\n", c, c_end); if (c + 16 + 1 >= c_end) { diff --git a/src/nfa/accel.h b/src/nfa/accel.h index 3a03d0596..3fccdd7bf 100644 --- a/src/nfa/accel.h +++ b/src/nfa/accel.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -62,6 +63,9 @@ enum AccelType { ACCEL_TRUFFLE, ACCEL_RED_TAPE, ACCEL_DVERM_MASKED, + ACCEL_VERM16, + ACCEL_DVERM16, + ACCEL_DVERM16_MASKED, }; /** \brief Structure for accel framework. 
*/ @@ -97,6 +101,24 @@ union AccelAux { u8 len1; u8 len2; } mdverm; + struct { + u8 accel_type; + u8 offset; + m128 mask; + } verm16; + struct { + u8 accel_type; + u8 offset; + u64a firsts; + m128 mask; + } dverm16; + struct { + u8 accel_type; + u8 offset; + u8 c1; // used for partial match + u8 m1; // used for partial match + m128 mask; + } mdverm16; struct { u8 accel_type; u8 offset; diff --git a/src/nfa/accel_dfa_build_strat.cpp b/src/nfa/accel_dfa_build_strat.cpp index ae71e141a..7139d5bea 100644 --- a/src/nfa/accel_dfa_build_strat.cpp +++ b/src/nfa/accel_dfa_build_strat.cpp @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,6 +34,7 @@ #include "nfagraph/ng_limex_accel.h" #include "shufticompile.h" #include "trufflecompile.h" +#include "vermicellicompile.h" #include "util/accel_scheme.h" #include "util/charreach.h" #include "util/container.h" @@ -105,7 +107,7 @@ static path append(const path &orig, const CharReach &cr, u32 new_dest) { path p(new_dest); p.reach = orig.reach; - p.reach.push_back(cr); + p.reach.emplace_back(cr); return p; } @@ -117,25 +119,25 @@ void extend(const raw_dfa &rdfa, const vector &rev_map, const dstate &s = rdfa.states[p.dest]; if (!p.reach.empty() && p.reach.back().none()) { - out.push_back(p); + out.emplace_back(p); return; } if (!s.reports.empty()) { if (generates_callbacks(rdfa.kind)) { - out.push_back(p); + out.emplace_back(p); return; } else { path pp = append(p, CharReach(), p.dest); - all[p.dest].push_back(pp); - out.push_back(move(pp)); + all[p.dest].emplace_back(pp); + out.emplace_back(std::move(pp)); } } if (!s.reports_eod.empty()) { path pp = append(p, CharReach(), p.dest); - all[p.dest].push_back(pp); - out.push_back(move(pp)); + all[p.dest].emplace_back(pp); + out.emplace_back(std::move(pp)); } flat_map dest; @@ -154,8 +156,8 @@ void extend(const raw_dfa &rdfa, const vector &rev_map, DEBUG_PRINTF("----good: [%s] -> %u\n", describeClasses(pp.reach).c_str(), pp.dest); - all[e.first].push_back(pp); - out.push_back(move(pp)); + all[e.first].emplace_back(pp); + out.emplace_back(std::move(pp)); } } @@ -165,14 +167,14 @@ vector> generate_paths(const raw_dfa &rdfa, const vector rev_map = reverse_alpha_remapping(rdfa); vector paths{path(base)}; unordered_map> all; - all[base].push_back(path(base)); + all[base].emplace_back(path(base)); for (u32 i = 0; i < len && paths.size() < PATHS_LIMIT; i++) { vector next_gen; for (const auto &p : paths) { extend(rdfa, rev_map, p, all, next_gen); } - paths = move(next_gen); + paths = std::move(next_gen); } dump_paths(paths); @@ -180,7 +182,7 @@ vector> generate_paths(const raw_dfa &rdfa, vector> rv; rv.reserve(paths.size()); for (auto &p : paths) { - rv.push_back(vector(std::make_move_iterator(p.reach.begin()), + rv.emplace_back(vector(std::make_move_iterator(p.reach.begin()), std::make_move_iterator(p.reach.end()))); } return rv; @@ -318,7 +320,7 @@ set find_region(const raw_dfa &rdfa, dstate_id_t base, DEBUG_PRINTF(" %hu is in region\n", t); region.insert(t); - pending.push_back(t); + pending.emplace_back(t); } } @@ -440,45 +442,75 @@ accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx, return; } - if (double_byte_ok(info) && info.double_cr.none() && - (info.double_byte.size() == 2 || info.double_byte.size() == 4)) { - bool ok = true; + if (double_byte_ok(info) && info.double_cr.none()) { + if 
((info.double_byte.size() == 2 || info.double_byte.size() == 4)) { + bool ok = true; - assert(!info.double_byte.empty()); - u8 firstC = info.double_byte.begin()->first & CASE_CLEAR; - u8 secondC = info.double_byte.begin()->second & CASE_CLEAR; + assert(!info.double_byte.empty()); + u8 firstC = info.double_byte.begin()->first & CASE_CLEAR; + u8 secondC = info.double_byte.begin()->second & CASE_CLEAR; - for (const pair &p : info.double_byte) { - if ((p.first & CASE_CLEAR) != firstC || - (p.second & CASE_CLEAR) != secondC) { - ok = false; - break; + for (const pair &p : info.double_byte) { + if ((p.first & CASE_CLEAR) != firstC || + (p.second & CASE_CLEAR) != secondC) { + ok = false; + break; + } } - } - if (ok) { - accel->accel_type = ACCEL_DVERM_NOCASE; - accel->dverm.c1 = firstC; - accel->dverm.c2 = secondC; - accel->dverm.offset = verify_u8(info.double_offset); - DEBUG_PRINTF("state %hu is nc double vermicelli\n", this_idx); - return; - } + if (ok) { + accel->accel_type = ACCEL_DVERM_NOCASE; + accel->dverm.c1 = firstC; + accel->dverm.c2 = secondC; + accel->dverm.offset = verify_u8(info.double_offset); + DEBUG_PRINTF("state %hu is nc double vermicelli\n", this_idx); + return; + } - u8 m1; - u8 m2; - if (buildDvermMask(info.double_byte, &m1, &m2)) { - accel->accel_type = ACCEL_DVERM_MASKED; - accel->dverm.offset = verify_u8(info.double_offset); - accel->dverm.c1 = info.double_byte.begin()->first & m1; - accel->dverm.c2 = info.double_byte.begin()->second & m2; - accel->dverm.m1 = m1; - accel->dverm.m2 = m2; - DEBUG_PRINTF( - "building maskeddouble-vermicelli for 0x%02hhx%02hhx\n", - accel->dverm.c1, accel->dverm.c2); + u8 m1; + u8 m2; + if (buildDvermMask(info.double_byte, &m1, &m2)) { + u8 c1 = info.double_byte.begin()->first & m1; + u8 c2 = info.double_byte.begin()->second & m2; +#ifdef HAVE_SVE2 + if (vermicelliDoubleMasked16Build(c1, c2, m1, m2, (u8 *)&accel->mdverm16.mask)) { + accel->accel_type = ACCEL_DVERM16_MASKED; + accel->mdverm16.offset = verify_u8(info.double_offset); + accel->mdverm16.c1 = c1; + accel->mdverm16.m1 = m1; + DEBUG_PRINTF("building maskeddouble16-vermicelli for 0x%02hhx%02hhx\n", + c1, c2); + return; + } else if (info.double_byte.size() <= 8 && + vermicelliDouble16Build(info.double_byte, (u8 *)&accel->dverm16.mask, + (u8 *)&accel->dverm16.firsts)) { + accel->accel_type = ACCEL_DVERM16; + accel->dverm16.offset = verify_u8(info.double_offset); + DEBUG_PRINTF("building double16-vermicelli\n"); + return; + } +#endif // HAVE_SVE2 + accel->accel_type = ACCEL_DVERM_MASKED; + accel->dverm.offset = verify_u8(info.double_offset); + accel->dverm.c1 = c1; + accel->dverm.c2 = c2; + accel->dverm.m1 = m1; + accel->dverm.m2 = m2; + DEBUG_PRINTF( + "building maskeddouble-vermicelli for 0x%02hhx%02hhx\n", c1, c2); + return; + } + } +#ifdef HAVE_SVE2 + if (info.double_byte.size() <= 8 && + vermicelliDouble16Build(info.double_byte, (u8 *)&accel->dverm16.mask, + (u8 *)&accel->dverm16.firsts)) { + accel->accel_type = ACCEL_DVERM16; + accel->dverm16.offset = verify_u8(info.double_offset); + DEBUG_PRINTF("building double16-vermicelli\n"); return; } +#endif // HAVE_SVE2 } if (double_byte_ok(info) && @@ -514,6 +546,15 @@ accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx, return; } +#ifdef HAVE_SVE2 + if (info.cr.count() <= 16) { + accel->accel_type = ACCEL_VERM16; + vermicelli16Build(info.cr, (u8 *)&accel->verm16.mask); + DEBUG_PRINTF("state %hu is vermicelli16\n", this_idx); + return; + } +#endif // HAVE_SVE2 + if (info.cr.count() > max_floating_stop_char()) { accel->accel_type 
= ACCEL_NONE; DEBUG_PRINTF("state %hu is too broad\n", this_idx); diff --git a/src/nfa/accelcompile.cpp b/src/nfa/accelcompile.cpp index a224410dc..e0be910d8 100644 --- a/src/nfa/accelcompile.cpp +++ b/src/nfa/accelcompile.cpp @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -29,6 +30,7 @@ #include "accel.h" #include "accelcompile.h" #include "shufticompile.h" +#include "vermicellicompile.h" #include "trufflecompile.h" #include "nfagraph/ng_limex_accel.h" /* for constants */ #include "util/bitutils.h" @@ -71,6 +73,16 @@ void buildAccelSingle(const AccelInfo &info, AccelAux *aux) { return; } +#ifdef HAVE_SVE2 + if (outs <= 16) { + aux->accel_type = ACCEL_VERM16; + aux->verm16.offset = offset; + vermicelli16Build(info.single_stops, (u8 *)&aux->verm16.mask); + DEBUG_PRINTF("building vermicelli16\n"); + return; + } +#endif + DEBUG_PRINTF("attempting shufti for %zu chars\n", outs); if (-1 != shuftiBuildMasks(info.single_stops, (u8 *)&aux->shufti.lo, (u8 *)&aux->shufti.hi)) { @@ -195,16 +207,45 @@ void buildAccelDouble(const AccelInfo &info, AccelAux *aux) { u8 m2; if (buildDvermMask(info.double_stop2, &m1, &m2)) { + u8 c1 = info.double_stop2.begin()->first & m1; + u8 c2 = info.double_stop2.begin()->second & m2; +#ifdef HAVE_SVE2 + if (vermicelliDoubleMasked16Build(c1, c2, m1, m2, (u8 *)&aux->mdverm16.mask)) { + aux->accel_type = ACCEL_DVERM16_MASKED; + aux->mdverm16.offset = offset; + aux->mdverm16.c1 = c1; + aux->mdverm16.m1 = m1; + DEBUG_PRINTF("building maskeddouble16-vermicelli for 0x%02hhx%02hhx\n", + c1, c2); + return; + } else if (outs2 <= 8 && + vermicelliDouble16Build(info.double_stop2, (u8 *)&aux->dverm16.mask, + (u8 *)&aux->dverm16.firsts)) { + aux->accel_type = ACCEL_DVERM16; + aux->dverm16.offset = offset; + DEBUG_PRINTF("building double16-vermicelli\n"); + return; + } +#endif // HAVE_SVE2 aux->accel_type = ACCEL_DVERM_MASKED; aux->dverm.offset = offset; - aux->dverm.c1 = info.double_stop2.begin()->first & m1; - aux->dverm.c2 = info.double_stop2.begin()->second & m2; + aux->dverm.c1 = c1; + aux->dverm.c2 = c2; aux->dverm.m1 = m1; aux->dverm.m2 = m2; - DEBUG_PRINTF("building maskeddouble-vermicelli for 0x%02hhx%02hhx\n", - aux->dverm.c1, aux->dverm.c2); + DEBUG_PRINTF("building maskeddouble-vermicelli for 0x%02hhx%02hhx\n", c1, c2); + return; + } +#ifdef HAVE_SVE2 + if (outs2 <= 8 && + vermicelliDouble16Build(info.double_stop2, (u8 *)&aux->dverm16.mask, + (u8 *)&aux->dverm16.firsts)) { + aux->accel_type = ACCEL_DVERM16; + aux->dverm16.offset = offset; + DEBUG_PRINTF("building double16-vermicelli\n"); return; } +#endif // HAVE_SVE2 } if (outs1 < outs2 && outs1 <= 2) { // Heuristic from UE-438. diff --git a/src/nfa/arm/shufti.hpp b/src/nfa/arm/shufti.hpp new file mode 100644 index 000000000..e710fd16a --- /dev/null +++ b/src/nfa/arm/shufti.hpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Shufti: character class acceleration. + */ + +template +static really_inline +const SuperVector blockSingleMask(SuperVector mask_lo, SuperVector mask_hi, SuperVector chars) { + const SuperVector low4bits = SuperVector::dup_u8(0xf); + + SuperVector c_lo = chars & low4bits; + SuperVector c_hi = chars.template vshr_8_imm<4>(); + c_lo = mask_lo.template pshufb(c_lo); + c_hi = mask_hi.template pshufb(c_hi); + + return (c_lo & c_hi) > (SuperVector::Zeroes()); +} + +template +static really_inline +SuperVector blockDoubleMask(SuperVector mask1_lo, SuperVector mask1_hi, SuperVector mask2_lo, SuperVector mask2_hi, SuperVector chars) { + + const SuperVector low4bits = SuperVector::dup_u8(0xf); + SuperVector chars_lo = chars & low4bits; + chars_lo.print8("chars_lo"); + SuperVector chars_hi = chars.template vshr_64_imm<4>() & low4bits; + chars_hi.print8("chars_hi"); + SuperVector c1_lo = mask1_lo.template pshufb(chars_lo); + c1_lo.print8("c1_lo"); + SuperVector c1_hi = mask1_hi.template pshufb(chars_hi); + c1_hi.print8("c1_hi"); + SuperVector t1 = c1_lo | c1_hi; + t1.print8("t1"); + + SuperVector c2_lo = mask2_lo.template pshufb(chars_lo); + c2_lo.print8("c2_lo"); + SuperVector c2_hi = mask2_hi.template pshufb(chars_hi); + c2_hi.print8("c2_hi"); + SuperVector t2 = c2_lo | c2_hi; + t2.print8("t2"); + t2.template vshr_128_imm<1>().print8("t2.vshr_128(1)"); + SuperVector t = t1 | (t2.template vshr_128_imm<1>()); + t.print8("t"); + + return !t.eq(SuperVector::Ones()); +} \ No newline at end of file diff --git a/src/nfa/arm/truffle.hpp b/src/nfa/arm/truffle.hpp new file mode 100644 index 000000000..923332611 --- /dev/null +++ b/src/nfa/arm/truffle.hpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Truffle: character class acceleration. + * + */ + +template +static really_inline +const SuperVector blockSingleMask(SuperVector shuf_mask_lo_highclear, SuperVector shuf_mask_lo_highset, SuperVector chars) { + + chars.print8("chars"); + shuf_mask_lo_highclear.print8("shuf_mask_lo_highclear"); + shuf_mask_lo_highset.print8("shuf_mask_lo_highset"); + + SuperVector highconst = SuperVector::dup_u8(0x80); + highconst.print8("highconst"); + SuperVector shuf_mask_hi = SuperVector::dup_u64(0x8040201008040201); + shuf_mask_hi.print8("shuf_mask_hi"); + + SuperVector shuf1 = shuf_mask_lo_highclear.pshufb(chars); + shuf1.print8("shuf1"); + SuperVector t1 = chars ^ highconst; + t1.print8("t1"); + SuperVector shuf2 = shuf_mask_lo_highset.pshufb(t1); + shuf2.print8("shuf2"); + SuperVector t2 = highconst.opandnot(chars.template vshr_64_imm<4>()); + t2.print8("t2"); + SuperVector shuf3 = shuf_mask_hi.pshufb(t2); + shuf3.print8("shuf3"); + SuperVector res = (shuf1 | shuf2) & shuf3; + res.print8("(shuf1 | shuf2) & shuf3"); + + return !res.eq(SuperVector::Zeroes()); +} diff --git a/src/nfa/arm/vermicelli.hpp b/src/nfa/arm/vermicelli.hpp new file mode 100644 index 000000000..496468e0d --- /dev/null +++ b/src/nfa/arm/vermicelli.hpp @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Vermicelli: single-byte and double-byte acceleration. + */ + +template +static really_inline +const u8 *vermicelliBlock(SuperVector const data, SuperVector const chars, SuperVector const casemask, u8 const *buf, u16 const len) { + + SuperVector mask = chars.eq(casemask & data); + return first_non_zero_match(buf, mask, len); +} + +template +static really_inline +const u8 *vermicelliBlockNeg(SuperVector const data, SuperVector const chars, SuperVector const casemask, u8 const *buf, u16 const len) { + + SuperVector mask = !chars.eq(casemask & data); + return first_zero_match_inverted(buf, mask, len); +} + +template +static really_inline +const u8 *rvermicelliBlock(SuperVector const data, SuperVector const chars, SuperVector const casemask, u8 const *buf, u16 const len) { + + SuperVector mask = chars.eq(casemask & data); + return last_non_zero_match(buf, mask, len); +} + +template +static really_inline +const u8 *rvermicelliBlockNeg(SuperVector const data, SuperVector const chars, SuperVector const casemask, const u8 *buf, u16 const len) { + + data.print8("data"); + chars.print8("chars"); + casemask.print8("casemask"); + SuperVector mask = !chars.eq(casemask & data); + mask.print8("mask"); + return last_zero_match_inverted(buf, mask, len); +} + +template +static really_inline +const u8 *vermicelliDoubleBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, SuperVector const casemask, + u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) { + + SuperVector v = casemask & data; + SuperVector mask1 = chars1.eq(v); + SuperVector mask2 = chars2.eq(v); + SuperVector mask = mask1 & (mask2 >> 1); + + DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]); + bool partial_match = (check_partial && ((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1)); + DEBUG_PRINTF("partial = %d\n", partial_match); + if (partial_match) { + mask = mask | ((SuperVector::Ones() >> (S-1)) << (S-1)); + } + + return first_non_zero_match(buf, mask, len); +} + +template +static really_inline +const u8 *rvermicelliDoubleBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, SuperVector const casemask, + u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) { + + SuperVector v = casemask & data; + SuperVector mask1 = chars1.eq(v); + SuperVector mask2 = chars2.eq(v); + SuperVector mask = (mask1 << 1)& mask2; + + DEBUG_PRINTF("buf[0] = %02hhx, buf[-1] = %02hhx\n", buf[0], buf[-1]); + bool partial_match = (check_partial && ((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1)); + DEBUG_PRINTF("partial = %d\n", partial_match); + if (partial_match) { + mask = mask | (SuperVector::Ones() >> (S-1)); + } + + return last_non_zero_match(buf, mask, len); +} + +template +static really_inline +const u8 *vermicelliDoubleMaskedBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, + SuperVector const mask1, SuperVector const mask2, + u8 
const c1, u8 const c2, u8 const m1, u8 const m2, u8 const *buf, u16 const len) { + + SuperVector v1 = chars1.eq(data & mask1); + SuperVector v2 = chars2.eq(data & mask2); + SuperVector mask = v1 & (v2 >> 1); + + DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]); + bool partial_match = (check_partial && ((buf[0] & m2) == c2) && ((buf[-1] & m1) == c1)); + DEBUG_PRINTF("partial = %d\n", partial_match); + if (partial_match) { + mask = mask | ((SuperVector::Ones() >> (S-1)) << (S-1)); + } + + return first_non_zero_match(buf, mask, len); +} + diff --git a/src/nfa/castle.c b/src/nfa/castle.c index 7c158b31c..29208f8d4 100644 --- a/src/nfa/castle.c +++ b/src/nfa/castle.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -39,12 +40,16 @@ #include "repeat.h" #include "shufti.h" #include "truffle.h" -#include "vermicelli.h" +#include "vermicelli.hpp" #include "util/bitutils.h" #include "util/multibit.h" #include "util/partial_store.h" #include "ue2common.h" +#ifdef HAVE_SVE2 +#include "castle_sve.h" +#endif + static really_inline const struct SubCastle *getSubCastle(const struct Castle *c, u32 num) { assert(num < c->numRepeats); @@ -604,6 +609,12 @@ char castleScan(const struct Castle *c, const u8 *buf, const size_t begin, return castleScanVerm(c, buf, begin, end, loc); case CASTLE_NVERM: return castleScanNVerm(c, buf, begin, end, loc); +#ifdef HAVE_SVE2 + case CASTLE_VERM16: + return castleScanVerm16(c, buf, begin, end, loc); + case CASTLE_NVERM16: + return castleScanNVerm16(c, buf, begin, end, loc); +#endif // HAVE_SVE2 case CASTLE_SHUFTI: return castleScanShufti(c, buf, begin, end, loc); case CASTLE_TRUFFLE: @@ -699,6 +710,12 @@ char castleRevScan(const struct Castle *c, const u8 *buf, const size_t begin, return castleRevScanVerm(c, buf, begin, end, loc); case CASTLE_NVERM: return castleRevScanNVerm(c, buf, begin, end, loc); +#ifdef HAVE_SVE2 + case CASTLE_VERM16: + return castleRevScanVerm16(c, buf, begin, end, loc); + case CASTLE_NVERM16: + return castleRevScanNVerm16(c, buf, begin, end, loc); +#endif // HAVE_SVE2 case CASTLE_SHUFTI: return castleRevScanShufti(c, buf, begin, end, loc); case CASTLE_TRUFFLE: diff --git a/src/nfa/castle_internal.h b/src/nfa/castle_internal.h index 429c232ff..ea135f8d6 100644 --- a/src/nfa/castle_internal.h +++ b/src/nfa/castle_internal.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -52,6 +53,8 @@ struct SubCastle { #define CASTLE_NVERM 2 #define CASTLE_SHUFTI 3 #define CASTLE_TRUFFLE 4 +#define CASTLE_VERM16 5 +#define CASTLE_NVERM16 6 enum ExclusiveType { NOT_EXCLUSIVE, //!< no subcastles are exclusive @@ -129,6 +132,9 @@ struct ALIGN_AVX_DIRECTIVE Castle { struct { char c; } verm; + struct { + m128 mask; + } verm16; struct { m128 mask_lo; m128 mask_hi; diff --git a/src/nfa/castle_sve.h b/src/nfa/castle_sve.h new file mode 100644 index 000000000..a8f6452d0 --- /dev/null +++ b/src/nfa/castle_sve.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2021, Arm Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are 
met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Castle for SVE: multi-tenant repeat engine, runtime code. + */ + +static really_inline +char castleScanVerm16(const struct Castle *c, const u8 *buf, const size_t begin, + const size_t end, size_t *loc) { + const u8 *ptr = vermicelli16Exec(c->u.verm16.mask, buf + begin, buf + end); + if (ptr == buf + end) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + assert(ptr >= buf && ptr < buf + end); + *loc = ptr - buf; + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char castleScanNVerm16(const struct Castle *c, const u8 *buf, const size_t begin, + const size_t end, size_t *loc) { + const u8 *ptr = nvermicelli16Exec(c->u.verm16.mask, buf + begin, buf + end); + if (ptr == buf + end) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + assert(ptr >= buf && ptr < buf + end); + *loc = ptr - buf; + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char castleRevScanVerm16(const struct Castle *c, const u8 *buf, + const size_t begin, const size_t end, size_t *loc) { + const u8 *ptr = rvermicelli16Exec(c->u.verm16.mask, buf + begin, buf + end); + if (ptr == buf + begin - 1) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + assert(ptr >= buf && ptr < buf + end); + *loc = ptr - buf; + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char castleRevScanNVerm16(const struct Castle *c, const u8 *buf, + const size_t begin, const size_t end, size_t *loc) { + const u8 *ptr = rnvermicelli16Exec(c->u.verm16.mask, buf + begin, buf + end); + if (ptr == buf + begin - 1) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + assert(ptr >= buf && ptr < buf + end); + *loc = ptr - buf; + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} \ No newline at end of file diff --git a/src/nfa/castlecompile.cpp b/src/nfa/castlecompile.cpp index 5884ebb21..56b12700f 100644 --- a/src/nfa/castlecompile.cpp +++ b/src/nfa/castlecompile.cpp @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2021, Arm Limited * 
* Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -39,6 +40,7 @@ #include "repeatcompile.h" #include "shufticompile.h" #include "trufflecompile.h" +#include "vermicellicompile.h" #include "nfagraph/ng_dump.h" #include "nfagraph/ng_equivalence.h" #include "nfagraph/ng_repeat.h" @@ -50,7 +52,6 @@ #include "util/dump_charclass.h" #include "util/flat_containers.h" #include "util/graph.h" -#include "util/make_unique.h" #include "util/multibit_build.h" #include "util/report_manager.h" #include "util/verify_types.h" @@ -102,6 +103,19 @@ void writeCastleScanEngine(const CharReach &cr, Castle *c) { return; } +#ifdef HAVE_SVE2 + if (cr.count() <= 16) { + c->type = CASTLE_NVERM16; + vermicelli16Build(cr, (u8 *)&c->u.verm16.mask); + return; + } + if (negated.count() <= 16) { + c->type = CASTLE_VERM16; + vermicelli16Build(negated, (u8 *)&c->u.verm16.mask); + return; + } +#endif // HAVE_SVE2 + if (shuftiBuildMasks(negated, (u8 *)&c->u.shuf.mask_lo, (u8 *)&c->u.shuf.mask_hi) != -1) { c->type = CASTLE_SHUFTI; @@ -157,7 +171,7 @@ void getNeighborInfo(const CliqueGraph &g, vector &neighbor, // find neighbors for cv for (const auto &v : adjacent_vertices_range(cv, g)) { if (g[v].stateId != id && contains(group, g[v].stateId)) { - neighbor.push_back(g[v].stateId); + neighbor.emplace_back(g[v].stateId); DEBUG_PRINTF("Neighbor:%u\n", g[v].stateId); } } @@ -172,7 +186,7 @@ void findCliqueGroup(CliqueGraph &cg, vector &clique) { vector init; for (const auto &v : vertices_range(cg)) { vertexMap[cg[v].stateId] = v; - init.push_back(cg[v].stateId); + init.emplace_back(cg[v].stateId); } gStack.push(init); @@ -186,7 +200,7 @@ void findCliqueGroup(CliqueGraph &cg, vector &clique) { // Choose a vertex from the graph u32 id = g[0]; const CliqueVertex &n = vertexMap.at(id); - clique.push_back(id); + clique.emplace_back(id); // Corresponding vertex in the original graph vector neighbor; set subgraphId(g.begin(), g.end()); @@ -215,7 +229,7 @@ vector removeClique(CliqueGraph &cg) { vector dead; for (const auto &v : vertices_range(cg)) { if (find(c.begin(), c.end(), cg[v].stateId) != c.end()) { - dead.push_back(v); + dead.emplace_back(v); } } for (const auto &v : dead) { @@ -227,7 +241,7 @@ vector removeClique(CliqueGraph &cg) { } vector clique; findCliqueGroup(cg, clique); - cliquesVec.push_back(clique); + cliquesVec.emplace_back(clique); } // get the independent set with max size @@ -288,11 +302,11 @@ vector> checkExclusion(u32 &streamStateSize, // get min reset distance for each repeat for (size_t i = lower; i < upper; i++) { CliqueVertex v = add_vertex(CliqueVertexProps(i), *cg); - vertices.push_back(v); + vertices.emplace_back(v); const vector &tmp_dist = minResetDistToEnd(triggers[i], cr); - min_reset_dist.push_back(tmp_dist); + min_reset_dist.emplace_back(tmp_dist); } // find exclusive pair for each repeat @@ -311,7 +325,7 @@ vector> checkExclusion(u32 &streamStateSize, auto clique = removeClique(*cg); size_t cliqueSize = clique.size(); if (cliqueSize > 1) { - groups.push_back(clique); + groups.emplace_back(clique); exclusive = EXCLUSIVE; total += cliqueSize; } @@ -387,7 +401,7 @@ void buildSubcastles(const CastleProto &proto, vector &subs, } if (pr.bounds.max.is_finite()) { - may_stale.push_back(i); + may_stale.emplace_back(i); } info.type = verify_u8(rtype); @@ -411,7 +425,7 @@ void buildSubcastles(const CastleProto &proto, vector &subs, if (rtype == REPEAT_SPARSE_OPTIMAL_P) { for (u32 j = 0; j < rsi.patchSize; j++) { 
- tables.push_back(rsi.table[j]); + tables.emplace_back(rsi.table[j]); } sparseRepeats++; patchSize[i] = rsi.patchSize; @@ -509,10 +523,10 @@ buildCastle(const CastleProto &proto, is_reset = true; } - repeatInfoPair.push_back(make_pair(min_period, is_reset)); + repeatInfoPair.emplace_back(make_pair(min_period, is_reset)); - candidateTriggers.push_back(triggers.at(top)); - candidateRepeats.push_back(i); + candidateTriggers.emplace_back(triggers.at(top)); + candidateRepeats.emplace_back(i); } // Case 1: exclusive repeats @@ -977,7 +991,7 @@ unique_ptr makeHolder(const CastleProto &proto, } } - auto g = ue2::make_unique(proto.kind); + auto g = std::make_unique(proto.kind); for (const auto &m : proto.repeats) { addToHolder(*g, m.first, m.second); diff --git a/src/nfa/dfa_min.cpp b/src/nfa/dfa_min.cpp index 1a07e8a7d..757fffbe9 100644 --- a/src/nfa/dfa_min.cpp +++ b/src/nfa/dfa_min.cpp @@ -152,7 +152,7 @@ HopcroftInfo::HopcroftInfo(const raw_dfa &rdfa) for (size_t i = 0; i < states.size(); i++) { // i is the previous state for (size_t sym = 0; sym < alpha_size; sym++) { dstate_id_t present_state = rdfa.states[i].next[sym]; - states[present_state].prev[sym].push_back(i); + states[present_state].prev[sym].emplace_back(i); } } } @@ -263,7 +263,7 @@ void mapping_new_states(const HopcroftInfo &info, new_states.reserve(num_partitions); for (const auto &m : ordering) { - new_states.push_back(rdfa.states[m.first]); + new_states.emplace_back(rdfa.states[m.first]); } rdfa.states = std::move(new_states); } diff --git a/src/nfa/goughcompile.cpp b/src/nfa/goughcompile.cpp index d41c6f423..343a793b8 100644 --- a/src/nfa/goughcompile.cpp +++ b/src/nfa/goughcompile.cpp @@ -39,7 +39,6 @@ #include "util/container.h" #include "util/flat_containers.h" #include "util/graph_range.h" -#include "util/make_unique.h" #include "util/order_check.h" #include "util/report_manager.h" #include "util/verify_types.h" @@ -147,7 +146,7 @@ void translateRawReports(UNUSED GoughGraph &cfg, UNUSED const raw_som_dfa &raw, } else { var = joins_at_s.at(sr.slot); } - reports_out->push_back(make_pair(sr.report, var)); + reports_out->emplace_back(make_pair(sr.report, var)); } } @@ -190,7 +189,7 @@ void makeCFG_top_edge(GoughGraph &cfg, const vector &vertices, shared_ptr vnew; if (slot_id == trigger_slot) { vnew = make_shared(0U); - cfg[e].vars.push_back(vnew); + cfg[e].vars.emplace_back(vnew); } else { assert(contains(src_slots, slot_id)); } @@ -207,7 +206,11 @@ void makeCFG_top_edge(GoughGraph &cfg, const vector &vertices, assert(contains(src_slots, slot_id)); shared_ptr vmin = make_shared(); - cfg[e].vars.push_back(vmin); + if (!vmin) { + assert(0); + throw std::bad_alloc(); + } + cfg[e].vars.emplace_back(vmin); final_var = vmin.get(); DEBUG_PRINTF("slot %u gets a new value\n", slot_id); @@ -280,7 +283,7 @@ void makeCFG_edge(GoughGraph &cfg, const map &som_creators, vnew = vnew_by_adj[adjust]; } else { vnew = make_shared(adjust); - cfg[e].vars.push_back(vnew); + cfg[e].vars.emplace_back(vnew); vnew_by_adj[adjust] = vnew; } assert(vnew); @@ -318,7 +321,11 @@ void makeCFG_edge(GoughGraph &cfg, const map &som_creators, DEBUG_PRINTF("bypassing min on join %u\n", slot_id); } else { shared_ptr vmin = make_shared(); - cfg[e].vars.push_back(vmin); + if (!vmin) { + assert(0); + throw std::bad_alloc(); + } + cfg[e].vars.emplace_back(vmin); final_var = vmin.get(); if (vnew) { @@ -348,17 +355,17 @@ static never_inline unique_ptr makeCFG(const raw_som_dfa &raw) { vector vertices; vertices.reserve(raw.states.size()); - unique_ptr cfg = 
ue2::make_unique(); + unique_ptr cfg = std::make_unique(); u32 min_state = !is_triggered(raw.kind); if (min_state) { - vertices.push_back(GoughGraph::null_vertex()); /* skip dead state */ + vertices.emplace_back(GoughGraph::null_vertex()); /* skip dead state */ } vector > joins(raw.states.size()); for (u32 i = min_state; i < raw.states.size(); ++i) { GoughVertex v = add_vertex(GoughVertexProps(i), *cfg); - vertices.push_back(v); + vertices.emplace_back(v); /* create JOIN variables */ for (som_tran_info::const_iterator it = raw.state_som[i].preds.begin(); @@ -366,7 +373,7 @@ unique_ptr makeCFG(const raw_som_dfa &raw) { u32 slot_id = it->first; if (!contains(raw.new_som_nfa_states, slot_id) || raw.new_som_nfa_states.at(slot_id)) { - (*cfg)[v].vars.push_back(make_shared()); + (*cfg)[v].vars.emplace_back(make_shared()); joins[get(vertex_index, *cfg, v)][slot_id] = (*cfg)[v].vars.back().get(); DEBUG_PRINTF("dfa %u:: slot %u\n", i, slot_id); @@ -525,7 +532,7 @@ void mark_live_reports(const vector > &reps, continue; } var->seen = true; - queue->push_back(var); + queue->emplace_back(var); } } @@ -546,7 +553,7 @@ void remove_dead(GoughGraph &g) { continue; } var->seen = true; - queue.push_back(var); + queue.emplace_back(var); } } @@ -589,7 +596,7 @@ gough_ins make_gough_ins(u8 op, u32 dest = INVALID_SLOT, void GoughSSAVarNew::generate(vector *out) const { assert(slot != INVALID_SLOT); - out->push_back(make_gough_ins(GOUGH_INS_NEW, slot, adjust)); + out->emplace_back(make_gough_ins(GOUGH_INS_NEW, slot, adjust)); } #ifndef NDEBUG @@ -616,7 +623,7 @@ void GoughSSAVarMin::generate(vector *out) const { /* if the destination is one of the sources, no need to move it */ first = false; } else { - input_slots.push_back(var->slot); + input_slots.emplace_back(var->slot); } } @@ -624,10 +631,10 @@ void GoughSSAVarMin::generate(vector *out) const { for (const u32 &input_slot : input_slots) { if (first) { - out->push_back(make_gough_ins(GOUGH_INS_MOV, slot, input_slot)); + out->emplace_back(make_gough_ins(GOUGH_INS_MOV, slot, input_slot)); first = false; } else { - out->push_back(make_gough_ins(GOUGH_INS_MIN, slot, input_slot)); + out->emplace_back(make_gough_ins(GOUGH_INS_MIN, slot, input_slot)); } } } @@ -842,7 +849,7 @@ void add_simple_joins(edge_join_info &eji, vector *out) { /* value of destination slot is not used by any remaining joins; * we can output this join immediately */ DEBUG_PRINTF("out %u<-%u\n", dest, src); - out->push_back(make_gough_ins(GOUGH_INS_MOV, dest, src)); + out->emplace_back(make_gough_ins(GOUGH_INS_MOV, dest, src)); eji.erase(src, dest); @@ -877,14 +884,14 @@ void add_joins_to_block(edge_join_info &eji, vector *out, /* stash the initial value of the split register in a temp register */ u32 temp = base_temp_slot++; DEBUG_PRINTF("out %u<-%u\n", temp, split); - out->push_back(make_gough_ins(GOUGH_INS_MOV, temp, split)); + out->emplace_back(make_gough_ins(GOUGH_INS_MOV, temp, split)); eji.remap_src(split, temp); /* update maps */ /* split can now be safely written out to as all the uses of it as an * input now refer to temp instead */ DEBUG_PRINTF("out %u<-%u\n", split, input_for_split); - out->push_back(make_gough_ins(GOUGH_INS_MOV, split, input_for_split)); + out->emplace_back(make_gough_ins(GOUGH_INS_MOV, split, input_for_split)); eji.erase(input_for_split, split); /* handle any uncovered simple cases */ @@ -931,7 +938,7 @@ void build_blocks(const GoughGraph &g, for (vector &ins_list : *blocks | map_values) { assert(!ins_list.empty()); - 
ins_list.push_back(make_gough_ins(GOUGH_INS_END)); + ins_list.emplace_back(make_gough_ins(GOUGH_INS_END)); } } @@ -1235,7 +1242,7 @@ unique_ptr gough_build_strat::gatherReports( const bool remap_reports = has_managed_reports(rdfa.kind); - auto ri = ue2::make_unique(); + auto ri = std::make_unique(); map rev; assert(!rdfa.states.empty()); @@ -1252,39 +1259,39 @@ unique_ptr gough_build_strat::gatherReports( DEBUG_PRINTF("i = %zu [%zu]\n", reports.size(), gg[v].reports.size()); if (v == GoughGraph::null_vertex() || gg[v].reports.empty()) { - reports.push_back(MO_INVALID_IDX); + reports.emplace_back(MO_INVALID_IDX); continue; } raw_gough_report_list rrl(gg[v].reports, rm, remap_reports); DEBUG_PRINTF("non empty r %zu\n", reports.size()); if (rev.find(rrl) != rev.end()) { - reports.push_back(rev[rrl]); + reports.emplace_back(rev[rrl]); } else { DEBUG_PRINTF("adding to rl\n"); rev[rrl] = ri->size(); - reports.push_back(ri->size()); - ri->rl.push_back(rrl); + reports.emplace_back(ri->size()); + ri->rl.emplace_back(rrl); } } for (auto v : verts) { if (v == GoughGraph::null_vertex() || gg[v].reports_eod.empty()) { - reports_eod.push_back(MO_INVALID_IDX); + reports_eod.emplace_back(MO_INVALID_IDX); continue; } DEBUG_PRINTF("non empty r eod\n"); raw_gough_report_list rrl(gg[v].reports_eod, rm, remap_reports); if (rev.find(rrl) != rev.end()) { - reports_eod.push_back(rev[rrl]); + reports_eod.emplace_back(rev[rrl]); continue; } DEBUG_PRINTF("adding to rl eod %zu\n", gg[v].reports_eod.size()); rev[rrl] = ri->size(); - reports_eod.push_back(ri->size()); - ri->rl.push_back(rrl); + reports_eod.emplace_back(ri->size()); + ri->rl.emplace_back(rrl); } /* TODO: support single report in gough */ @@ -1292,7 +1299,7 @@ unique_ptr gough_build_strat::gatherReports( *arbReport = MO_INVALID_IDX; assert(!ri->rl.empty()); /* all components should be able to generate reports */ - return move(ri); + return std::move(ri); } u32 raw_gough_report_info_impl::getReportListSize() const { @@ -1313,7 +1320,7 @@ size_t raw_gough_report_info_impl::size() const { void raw_gough_report_info_impl::fillReportLists(NFA *n, size_t base_offset, vector &ro) const { for (const raw_gough_report_list &r : rl) { - ro.push_back(base_offset); + ro.emplace_back(base_offset); gough_report_list *p = (gough_report_list *)((char *)n + base_offset); u32 i = 0; diff --git a/src/nfa/goughcompile_dump.cpp b/src/nfa/goughcompile_dump.cpp index 96ab196e3..ca94b69f3 100644 --- a/src/nfa/goughcompile_dump.cpp +++ b/src/nfa/goughcompile_dump.cpp @@ -145,7 +145,7 @@ void dump_var_mapping(const GoughGraph &g, const string &base, fprintf(f, "\tuses:"); vector used_id; for (const GoughSSAVar *var : used) { - used_id.push_back(var->slot); + used_id.emplace_back(var->slot); } for (const u32 &id : used_id) { fprintf(f, " %u", id); @@ -167,7 +167,7 @@ void dump_var_mapping(const GoughGraph &g, const string &base, fprintf(f, "\tuses:"); vector used_id; for (const GoughSSAVar *var : used) { - used_id.push_back(var->slot); + used_id.emplace_back(var->slot); } for (const u32 &id : used_id) { fprintf(f, " %u", id); @@ -194,7 +194,7 @@ void gather_vars(const GoughGraph &g, vector *vars, const GoughSSAVar *vp = g[v].vars[i].get(); stringstream ss; ss << dump_name(g[v]) << "_" << i; - vars->push_back(vp); + vars->emplace_back(vp); names->insert(make_pair(vp, ss.str())); src_label->insert(make_pair(vp, dump_name(g[v]))); } @@ -205,7 +205,7 @@ void gather_vars(const GoughGraph &g, vector *vars, const GoughSSAVar *vp = g[e].vars[i].get(); stringstream ss; ss << dump_name(g, e) 
<< "_" << i; - vars->push_back(vp); + vars->emplace_back(vp); names->insert(make_pair(vp, ss.str())); src_label->insert(make_pair(vp, dump_name(g, e))); } diff --git a/src/nfa/goughcompile_reg.cpp b/src/nfa/goughcompile_reg.cpp index 48e515b9a..d088e1c04 100644 --- a/src/nfa/goughcompile_reg.cpp +++ b/src/nfa/goughcompile_reg.cpp @@ -49,19 +49,19 @@ using boost::adaptors::map_values; namespace ue2 { template -void push_back_all_raw(vector *out, const vector &in) { +void emplace_back_all_raw(vector *out, const vector &in) { for (const auto &var : in) { - out->push_back(var.get()); + out->emplace_back(var.get()); } } static void all_vars(const GoughGraph &g, vector *out) { for (auto v : vertices_range(g)) { - push_back_all_raw(out, g[v].vars); + emplace_back_all_raw(out, g[v].vars); } for (const auto &e : edges_range(g)) { - push_back_all_raw(out, g[e].vars); + emplace_back_all_raw(out, g[e].vars); } } @@ -380,7 +380,7 @@ template void add_to_dom_ordering(const vector &vars, vector *out) { for (const auto &var : vars) { - out->push_back(var.get()); + out->emplace_back(var.get()); } } @@ -389,7 +389,7 @@ class FinishVisitor : public boost::default_dfs_visitor { public: explicit FinishVisitor(vector *o) : out(o) {} void finish_vertex(const GoughVertex v, const GoughGraph &) { - out->push_back(v); + out->emplace_back(v); } vector *out; }; diff --git a/src/nfa/lbr.c b/src/nfa/lbr.c index d403733a6..52e81ad67 100644 --- a/src/nfa/lbr.c +++ b/src/nfa/lbr.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -39,7 +40,7 @@ #include "repeat_internal.h" #include "shufti.h" #include "truffle.h" -#include "vermicelli.h" +#include "vermicelli.hpp" #include "util/partial_store.h" #include "util/unaligned.h" @@ -529,3 +530,7 @@ char lbrFwdScanTruf(const struct NFA *nfa, const u8 *buf, #define ENGINE_ROOT_NAME Truf #include "lbr_common_impl.h" + +#ifdef HAVE_SVE2 +#include "lbr_sve.h" +#endif diff --git a/src/nfa/lbr.h b/src/nfa/lbr.h index a9e42046d..b6718c05b 100644 --- a/src/nfa/lbr.h +++ b/src/nfa/lbr.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -101,6 +102,52 @@ char nfaExecLbrNVerm_expandState(const struct NFA *nfa, void *dest, #define nfaExecLbrNVerm_B_Reverse NFA_API_NO_IMPL #define nfaExecLbrNVerm_zombie_status NFA_API_ZOMBIE_NO_IMPL +#ifdef HAVE_SVE2 + +// LBR Verm16 + +char nfaExecLbrVerm16_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecLbrVerm16_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecLbrVerm16_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecLbrVerm16_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecLbrVerm16_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecLbrVerm16_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecLbrVerm16_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecLbrVerm16_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecLbrVerm16_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecLbrVerm16_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + 
+#define nfaExecLbrVerm16_testEOD NFA_API_NO_IMPL +#define nfaExecLbrVerm16_B_Reverse NFA_API_NO_IMPL +#define nfaExecLbrVerm16_zombie_status NFA_API_ZOMBIE_NO_IMPL + +// LBR Negated Verm16 + +char nfaExecLbrNVerm16_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecLbrNVerm16_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecLbrNVerm16_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecLbrNVerm16_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecLbrNVerm16_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecLbrNVerm16_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecLbrNVerm16_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecLbrNVerm16_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecLbrNVerm16_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecLbrNVerm16_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecLbrNVerm16_testEOD NFA_API_NO_IMPL +#define nfaExecLbrNVerm16_B_Reverse NFA_API_NO_IMPL +#define nfaExecLbrNVerm16_zombie_status NFA_API_ZOMBIE_NO_IMPL + +#endif // HAVE_SVE2 + // LBR Shuf char nfaExecLbrShuf_Q(const struct NFA *n, struct mq *q, s64a end); diff --git a/src/nfa/lbr_internal.h b/src/nfa/lbr_internal.h index 8ba11dd4d..beb1a50b1 100644 --- a/src/nfa/lbr_internal.h +++ b/src/nfa/lbr_internal.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -56,6 +57,11 @@ struct lbr_verm { char c; //!< escape char }; +struct lbr_verm16 { + struct lbr_common common; + m128 mask; +}; + struct lbr_shuf { struct lbr_common common; m128 mask_lo; //!< shufti lo mask for escape chars diff --git a/src/nfa/lbr_sve.h b/src/nfa/lbr_sve.h new file mode 100644 index 000000000..8f5948b56 --- /dev/null +++ b/src/nfa/lbr_sve.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2021, Arm Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Large Bounded Repeat (LBR) engine for SVE: runtime code. + */ + +static really_inline +char lbrRevScanVerm16(const struct NFA *nfa, const u8 *buf, + size_t begin, size_t end, size_t *loc) { + assert(begin <= end); + assert(nfa->type == LBR_NFA_VERM16); + const struct lbr_verm16 *l = getImplNfa(nfa); + + if (begin == end) { + return 0; + } + + const u8 *ptr = rvermicelli16Exec(l->mask, buf + begin, buf + end); + if (ptr == buf + begin - 1) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + *loc = ptr - buf; + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char lbrRevScanNVerm16(const struct NFA *nfa, const u8 *buf, + size_t begin, size_t end, size_t *loc) { + assert(begin <= end); + assert(nfa->type == LBR_NFA_NVERM16); + const struct lbr_verm16 *l = getImplNfa(nfa); + + if (begin == end) { + return 0; + } + + const u8 *ptr = rnvermicelli16Exec(l->mask, buf + begin, buf + end); + if (ptr == buf + begin - 1) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + *loc = ptr - buf; + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char lbrFwdScanVerm16(const struct NFA *nfa, const u8 *buf, + size_t begin, size_t end, size_t *loc) { + assert(begin <= end); + assert(nfa->type == LBR_NFA_VERM16); + const struct lbr_verm16 *l = getImplNfa(nfa); + + if (begin == end) { + return 0; + } + + const u8 *ptr = vermicelli16Exec(l->mask, buf + begin, buf + end); + if (ptr == buf + end) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + *loc = ptr - buf; + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char lbrFwdScanNVerm16(const struct NFA *nfa, const u8 *buf, + size_t begin, size_t end, size_t *loc) { + assert(begin <= end); + assert(nfa->type == LBR_NFA_NVERM16); + const struct lbr_verm16 *l = getImplNfa(nfa); + + if (begin == end) { + return 0; + } + + const u8 *ptr = nvermicelli16Exec(l->mask, buf + begin, buf + end); + if (ptr == buf + end) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + *loc = ptr - buf; + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +#define ENGINE_ROOT_NAME Verm16 +#include "lbr_common_impl.h" + +#define ENGINE_ROOT_NAME NVerm16 +#include "lbr_common_impl.h" \ No newline at end of file diff --git a/src/nfa/limex_accel.c b/src/nfa/limex_accel.c index 4834b6a54..a85d5a077 100644 --- a/src/nfa/limex_accel.c +++ b/src/nfa/limex_accel.c @@ -40,7 +40,7 @@ #include "shufti.h" #include "truffle.h" #include "ue2common.h" -#include "vermicelli.h" +#include "vermicelli.hpp" #include "util/arch.h" #include "util/bitutils.h" #include "util/simd_utils.h" diff --git a/src/nfa/limex_compile.cpp b/src/nfa/limex_compile.cpp index bbb266051..f84cdc32f 100644 --- a/src/nfa/limex_compile.cpp +++ b/src/nfa/limex_compile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * 
Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -85,6 +85,18 @@ namespace ue2 { */ static constexpr u32 NO_STATE = ~0; +/* Maximum number of states taken as a small NFA */ +static constexpr u32 MAX_SMALL_NFA_STATES = 64; + +/* Maximum bounded repeat upper bound to consider as a fast NFA */ +static constexpr u64a MAX_REPEAT_SIZE = 200; + +/* Maximum bounded repeat char reach size to consider as a fast NFA */ +static constexpr u32 MAX_REPEAT_CHAR_REACH = 26; + +/* Minimum bounded repeat trigger distance to consider as a fast NFA */ +static constexpr u8 MIN_REPEAT_TRIGGER_DISTANCE = 6; + namespace { struct precalcAccel { @@ -319,7 +331,7 @@ void buildReachMapping(const build_info &args, vector &reach, verts.reserve(args.num_states); for (auto v : vertices_range(h)) { if (state_ids.at(v) != NO_STATE) { - verts.push_back(v); + verts.emplace_back(v); } } @@ -350,7 +362,7 @@ void buildReachMapping(const build_info &args, vector &reach, u8 num = 0; for (auto mi = mapping.begin(), me = mapping.end(); mi != me; ++mi, ++num) { // Reach entry. - reach.push_back(mi->first); + reach.emplace_back(mi->first); // Character mapping. const CharReach &cr = mi->second; @@ -415,7 +427,7 @@ void gatherAccelStates(const build_info &bi, vector &accelStates) { DEBUG_PRINTF("state %u is accelerable\n", bi.state_ids.at(v)); AccelBuild a; findStopLiterals(bi, v, a); - accelStates.push_back(a); + accelStates.emplace_back(a); } // AccelStates should be sorted by state number, so that we build our accel @@ -536,7 +548,7 @@ void filterAccelStates(NGHolder &g, const map> &tops, for (const auto &vv : tops | map_values) { for (NFAVertex v : vv) { if (!edge(g.start, v, g).second) { - tempEdges.push_back(add_edge(g.start, v, g).first); + tempEdges.emplace_back(add_edge(g.start, v, g).first); } } } @@ -544,7 +556,7 @@ void filterAccelStates(NGHolder &g, const map> &tops, // Similarly, connect (start, startDs) if necessary. if (!edge(g.start, g.startDs, g).second) { NFAEdge e = add_edge(g.start, g.startDs, g); - tempEdges.push_back(e); // Remove edge later. + tempEdges.emplace_back(e); // Remove edge later. } unordered_map out; @@ -611,7 +623,7 @@ void fillAccelInfo(build_info &bi) { vector astates; for (const auto &m : accel_map) { - astates.push_back(m.first); + astates.emplace_back(m.first); } NFAStateSet useful(num_states); @@ -632,7 +644,7 @@ void fillAccelInfo(build_info &bi) { for (u32 j = 0, j_end = astates.size(); j < j_end; j++) { if (i & (1U << j)) { NFAVertex v = astates[j]; - states.push_back(v); + states.emplace_back(v); state_set.set(state_ids.at(v)); } } @@ -874,12 +886,12 @@ void buildAccel(const build_info &args, NFAStateSet &accelMask, // bits in accelStates. vector accelOuts(accelCount); vector effective_accel_set; - effective_accel_set.push_back(0); /* empty is effectively empty */ + effective_accel_set.emplace_back(0); /* empty is effectively empty */ for (u32 i = 1; i < accelCount; i++) { u32 effective_i = getEffectiveAccelStates(args, dom_map, i, accelStates); - effective_accel_set.push_back(effective_i); + effective_accel_set.emplace_back(effective_i); if (effective_i == IMPOSSIBLE_ACCEL_MASK) { DEBUG_PRINTF("this combination of accel states is not possible\n"); @@ -901,7 +913,7 @@ void buildAccel(const build_info &args, NFAStateSet &accelMask, // an index. // Start with the NONE case. 
- auxvec.push_back(AccelAux()); + auxvec.emplace_back(AccelAux()); memset(&auxvec[0], 0, sizeof(AccelAux)); auxvec[0].accel_type = ACCEL_NONE; // no states on. @@ -937,7 +949,7 @@ void buildAccel(const build_info &args, NFAStateSet &accelMask, auto it = find_if(auxvec.begin(), auxvec.end(), AccelAuxCmp(aux)); if (it == auxvec.end()) { accelTable[i] = verify_u8(auxvec.size()); - auxvec.push_back(aux); + auxvec.emplace_back(aux); } else { accelTable[i] = verify_u8(it - auxvec.begin()); } @@ -983,7 +995,7 @@ u32 addSquashMask(const build_info &args, const NFAVertex &v, return verify_u32(std::distance(squash.begin(), it)); } u32 idx = verify_u32(squash.size()); - squash.push_back(sit->second); + squash.emplace_back(sit->second); return idx; } @@ -995,7 +1007,7 @@ u32 addReports(const flat_set &r, vector &reports, assert(!r.empty()); vector my_reports(begin(r), end(r)); - my_reports.push_back(MO_INVALID_IDX); // sentinel + my_reports.emplace_back(MO_INVALID_IDX); // sentinel auto cache_it = reports_cache.find(my_reports); if (cache_it != end(reports_cache)) { @@ -1014,7 +1026,7 @@ u32 addReports(const flat_set &r, vector &reports, u32 offset = verify_u32(reports.size()); insert(&reports, reports.end(), my_reports); - reports_cache.emplace(move(my_reports), offset); + reports_cache.emplace(std::move(my_reports), offset); return offset; } @@ -1052,7 +1064,7 @@ void buildAcceptsList(const build_info &args, ReportListCache &reports_cache, a.reports = addReports(h[v].reports, reports, reports_cache); } a.squash = addSquashMask(args, v, squash); - accepts.push_back(move(a)); + accepts.emplace_back(std::move(a)); } } @@ -1077,11 +1089,11 @@ void buildAccepts(const build_info &args, ReportListCache &reports_cache, if (edge(v, h.accept, h).second) { acceptMask.set(state_id); - verts_accept.push_back(v); + verts_accept.emplace_back(v); } else { assert(edge(v, h.acceptEod, h).second); acceptEodMask.set(state_id); - verts_accept_eod.push_back(v); + verts_accept_eod.emplace_back(v); } } @@ -1498,7 +1510,7 @@ u32 buildExceptionMap(const build_info &args, ReportListCache &reports_cache, // of states. 
assert(e.succ_states.size() == num_states); assert(e.squash_states.size() == num_states); - exceptionMap[e].push_back(i); + exceptionMap[e].emplace_back(i); exceptionCount++; } } @@ -1807,7 +1819,7 @@ struct Factory { *streamState += streamStateLen; *scratchStateSize += sizeof(RepeatControl); - out.emplace_back(move(info)); + out.emplace_back(std::move(info)); } } @@ -1910,7 +1922,8 @@ struct Factory { } static - void writeExceptions(const map> &exceptionMap, + void writeExceptions(const build_info &args, + const map> &exceptionMap, const vector &repeatOffsets, implNFA_t *limex, const u32 exceptionsOffset, const u32 reportListOffset) { @@ -1962,6 +1975,59 @@ struct Factory { limex->exceptionOffset = exceptionsOffset; limex->exceptionCount = ecount; + + if (args.num_states > 64 && args.cc.target_info.has_avx512vbmi()) { + const u8 *exceptionMask = (const u8 *)(&limex->exceptionMask); + u8 *shufMask = (u8 *)&limex->exceptionShufMask; + u8 *bitMask = (u8 *)&limex->exceptionBitMask; + u8 *andMask = (u8 *)&limex->exceptionAndMask; + + u32 tot_cnt = 0; + u32 pos = 0; + bool valid = true; + size_t tot = sizeof(limex->exceptionMask); + size_t base = 0; + + // We normally have up to 64 exceptions to handle, + // but treat 384 state Limex differently to simplify operations + size_t limit = 64; + if (args.num_states > 256 && args.num_states <= 384) { + limit = 48; + } + + for (size_t i = 0; i < tot; i++) { + if (!exceptionMask[i]) { + continue; + } + u32 bit_cnt = popcount32(exceptionMask[i]); + + tot_cnt += bit_cnt; + if (tot_cnt > limit) { + valid = false; + break; + } + + u32 emsk = exceptionMask[i]; + while (emsk) { + u32 t = findAndClearLSB_32(&emsk); + bitMask[pos] = 1U << t; + andMask[pos] = 1U << t; + shufMask[pos++] = i + base; + + if (pos == 32 && + (args.num_states > 128 && args.num_states <= 256)) { + base += 32; + } + } + } + // Avoid matching unused bytes + for (u32 i = pos; i < 64; i++) { + bitMask[i] = 0xff; + } + if (valid) { + setLimexFlag(limex, LIMEX_FLAG_EXTRACT_EXP); + } + } } static @@ -2287,7 +2353,7 @@ struct Factory { writeRepeats(repeats, repeatOffsets, limex, repeatOffsetsOffset, repeatsOffset); - writeExceptions(exceptionMap, repeatOffsets, limex, exceptionsOffset, + writeExceptions(args, exceptionMap, repeatOffsets, limex, exceptionsOffset, reportListOffset); writeLimexMasks(args, limex); @@ -2422,6 +2488,68 @@ bool isSane(const NGHolder &h, const map> &tops, } #endif // NDEBUG +static +bool isFast(const build_info &args) { + const NGHolder &h = args.h; + const u32 num_states = args.num_states; + + if (num_states > MAX_SMALL_NFA_STATES) { + return false; + } + + unordered_map pos_trigger; + for (u32 i = 0; i < args.repeats.size(); i++) { + const BoundedRepeatData &br = args.repeats[i]; + assert(!contains(pos_trigger, br.pos_trigger)); + pos_trigger[br.pos_trigger] = br.repeatMax <= MAX_REPEAT_SIZE; + } + + // Small NFA without bounded repeat should be fast. 
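+    // If bounded repeats are present, the walk below starts from the top
+    // vertices and explores at most MIN_REPEAT_TRIGGER_DISTANCE levels of
+    // successors: finding a large repeat (repeatMax > MAX_REPEAT_SIZE) whose
+    // trigger state has a wide reach (> MAX_REPEAT_CHAR_REACH characters)
+    // that close to the start disqualifies the NFA from the fast path.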
+ if (pos_trigger.empty()) { + return true; + } + + vector cur; + unordered_set visited; + for (const auto &m : args.tops) { + for (NFAVertex v : m.second) { + cur.emplace_back(v); + visited.insert(v); + } + } + + u8 pos_dist = 0; + while (!cur.empty()) { + vector next; + for (const auto &v : cur) { + if (contains(pos_trigger, v)) { + const CharReach &cr = h[v].char_reach; + if (!pos_trigger[v] && cr.count() > MAX_REPEAT_CHAR_REACH) { + return false; + } + } + for (const auto &w : adjacent_vertices_range(v, h)) { + if (w == v) { + continue; + } + u32 j = args.state_ids.at(w); + if (j == NO_STATE) { + continue; + } + if (!contains(visited, w)) { + next.emplace_back(w); + visited.insert(w); + } + } + } + if (++pos_dist >= MIN_REPEAT_TRIGGER_DISTANCE) { + break; + } + swap(cur, next); + } + return true; +} + static u32 max_state(const unordered_map &state_ids) { u32 rv = 0; @@ -2442,7 +2570,7 @@ bytecode_ptr generate(NGHolder &h, const unordered_map &squashMap, const map> &tops, const set &zombies, bool do_accel, - bool stateCompression, u32 hint, + bool stateCompression, bool &fast, u32 hint, const CompileContext &cc) { const u32 num_states = max_state(states) + 1; DEBUG_PRINTF("total states: %u\n", num_states); @@ -2497,6 +2625,7 @@ bytecode_ptr generate(NGHolder &h, if (nfa) { DEBUG_PRINTF("successful build with NFA engine: %s\n", nfa_type_name(limex_model)); + fast = isFast(arg); return nfa; } } diff --git a/src/nfa/limex_compile.h b/src/nfa/limex_compile.h index a08e0ae56..4afdcdb3e 100644 --- a/src/nfa/limex_compile.h +++ b/src/nfa/limex_compile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -78,6 +78,7 @@ bytecode_ptr generate(NGHolder &g, const std::set &zombies, bool do_accel, bool stateCompression, + bool &fast, u32 hint, const CompileContext &cc); diff --git a/src/nfa/limex_dump.cpp b/src/nfa/limex_dump.cpp index 9256c841c..a22392b34 100644 --- a/src/nfa/limex_dump.cpp +++ b/src/nfa/limex_dump.cpp @@ -354,7 +354,7 @@ static void setupReach(const u8 *reachMap, const u8 *reachBase, u32 size, u32 state_count, vector *perStateReach) { for (u32 i = 0; i < state_count; i++) { - perStateReach->push_back(CharReach()); + perStateReach->emplace_back(CharReach()); for (u32 j = 0; j < N_CHARS; j++) { u8 k = reachMap[j]; const u8 *r = reachBase + k * (size/8); diff --git a/src/nfa/limex_exceptional.h b/src/nfa/limex_exceptional.h index e770c3278..c9de3aed4 100644 --- a/src/nfa/limex_exceptional.h +++ b/src/nfa/limex_exceptional.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -47,6 +47,8 @@ #define AND_STATE JOIN(and_, STATE_T) #define EQ_STATE(a, b) (!JOIN(noteq_, STATE_T)((a), (b))) #define OR_STATE JOIN(or_, STATE_T) +#define EXPAND_STATE JOIN(broadcast_, STATE_T) +#define SHUFFLE_BYTE_STATE JOIN(shuffle_byte_, STATE_T) #define TESTBIT_STATE JOIN(testbit_, STATE_T) #define EXCEPTION_T JOIN(struct NFAException, SIZE) #define CONTEXT_T JOIN(NFAContext, SIZE) @@ -208,7 +210,7 @@ int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG, /** \brief Process all of the exceptions associated with the states in the \a * estate. 
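 * On AVX-512 VBMI targets (LimEx models wider than 64 states built with
 * LIMEX_FLAG_EXTRACT_EXP), the active exception bits are gathered in one go
 * via a byte broadcast, shuffle and compare into a single 64-bit word;
 * otherwise the exception state is scanned chunk by chunk in general-purpose
 * registers.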
*/ static really_inline -int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ, +int PE_FN(STATE_ARG, ESTATE_ARG, UNUSED u32 diffmask, STATE_T *succ, const struct IMPL_NFA_T *limex, const EXCEPTION_T *exceptions, u64a offset, struct CONTEXT_T *ctx, char in_rev, char flags) { assert(diffmask > 0); // guaranteed by caller macro @@ -233,6 +235,72 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ, ctx->local_succ = ZERO_STATE; #endif + struct proto_cache new_cache = {0, NULL}; + enum CacheResult cacheable = CACHE_RESULT; + +#if defined(HAVE_AVX512VBMI) && SIZE > 64 + if (likely(limex->flags & LIMEX_FLAG_EXTRACT_EXP)) { + m512 emask = EXPAND_STATE(*STATE_ARG_P); + emask = SHUFFLE_BYTE_STATE(load_m512(&limex->exceptionShufMask), emask); + emask = and512(emask, load_m512(&limex->exceptionAndMask)); + u64a word = eq512mask(emask, load_m512(&limex->exceptionBitMask)); + + do { + u32 bit = FIND_AND_CLEAR_FN(&word); + const EXCEPTION_T *e = &exceptions[bit]; + + if (!RUN_EXCEPTION_FN(e, STATE_ARG_NAME, succ, +#ifndef BIG_MODEL + &local_succ, +#endif + limex, offset, ctx, &new_cache, &cacheable, + in_rev, flags)) { + return PE_RV_HALT; + } + } while (word); + } else { + // A copy of the estate as an array of GPR-sized chunks. + CHUNK_T chunks[sizeof(STATE_T) / sizeof(CHUNK_T)]; + CHUNK_T emask_chunks[sizeof(STATE_T) / sizeof(CHUNK_T)]; +#ifdef ESTATE_ON_STACK + memcpy(chunks, &estate, sizeof(STATE_T)); +#else + memcpy(chunks, estatep, sizeof(STATE_T)); +#endif + memcpy(emask_chunks, &limex->exceptionMask, sizeof(STATE_T)); + + u32 base_index[sizeof(STATE_T) / sizeof(CHUNK_T)]; + base_index[0] = 0; + for (s32 i = 0; i < (s32)ARRAY_LENGTH(base_index) - 1; i++) { + base_index[i + 1] = base_index[i] + POPCOUNT_FN(emask_chunks[i]); + } + + do { + u32 t = findAndClearLSB_32(&diffmask); +#ifdef ARCH_64_BIT + t >>= 1; // Due to diffmask64, which leaves holes in the bitmask. +#endif + assert(t < ARRAY_LENGTH(chunks)); + CHUNK_T word = chunks[t]; + assert(word != 0); + do { + u32 bit = FIND_AND_CLEAR_FN(&word); + u32 local_index = RANK_IN_MASK_FN(emask_chunks[t], bit); + u32 idx = local_index + base_index[t]; + const EXCEPTION_T *e = &exceptions[idx]; + + if (!RUN_EXCEPTION_FN(e, STATE_ARG_NAME, succ, +#ifndef BIG_MODEL + &local_succ, +#endif + limex, offset, ctx, &new_cache, &cacheable, + in_rev, flags)) { + return PE_RV_HALT; + } + } while (word); + } while (diffmask); + } +#else // A copy of the estate as an array of GPR-sized chunks. 
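    // Fallback path, used when the VBMI extraction above is compiled out
    // (no AVX-512 VBMI, or LimEx models of 64 states or fewer): each set bit
    // in the chunked exception state is mapped to its exception index via
    // rank-in-mask over the corresponding exception mask chunk.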
CHUNK_T chunks[sizeof(STATE_T) / sizeof(CHUNK_T)]; CHUNK_T emask_chunks[sizeof(STATE_T) / sizeof(CHUNK_T)]; @@ -243,9 +311,6 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ, #endif memcpy(emask_chunks, &limex->exceptionMask, sizeof(STATE_T)); - struct proto_cache new_cache = {0, NULL}; - enum CacheResult cacheable = CACHE_RESULT; - u32 base_index[sizeof(STATE_T) / sizeof(CHUNK_T)]; base_index[0] = 0; for (s32 i = 0; i < (s32)ARRAY_LENGTH(base_index) - 1; i++) { @@ -276,6 +341,7 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ, } } while (word); } while (diffmask); +#endif #ifndef BIG_MODEL *succ = OR_STATE(*succ, local_succ); @@ -307,6 +373,8 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ, #undef AND_STATE #undef EQ_STATE #undef OR_STATE +#undef EXPAND_STATE +#undef SHUFFLE_BYTE_STATE #undef TESTBIT_STATE #undef PE_FN #undef RUN_EXCEPTION_FN diff --git a/src/nfa/limex_internal.h b/src/nfa/limex_internal.h index db703f039..23b1bd970 100644 --- a/src/nfa/limex_internal.h +++ b/src/nfa/limex_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -86,6 +86,7 @@ #define LIMEX_FLAG_COMPRESS_STATE 1 /**< pack state into stream state */ #define LIMEX_FLAG_COMPRESS_MASKED 2 /**< use reach mask-based compression */ #define LIMEX_FLAG_CANNOT_DIE 4 /**< limex cannot have no states on */ +#define LIMEX_FLAG_EXTRACT_EXP 8 /**< use limex exception bit extraction */ enum LimExTrigger { LIMEX_TRIGGER_NONE = 0, @@ -157,6 +158,9 @@ struct LimExNFA##size { \ u_##size shift[MAX_SHIFT_COUNT]; \ u32 shiftCount; /**< number of shift masks used */ \ u8 shiftAmount[MAX_SHIFT_COUNT]; /**< shift amount for each mask */ \ + m512 exceptionShufMask; /**< exception byte shuffle mask */ \ + m512 exceptionBitMask; /**< exception bit mask */ \ + m512 exceptionAndMask; /**< exception and mask */ \ }; CREATE_NFA_LIMEX(32) diff --git a/src/nfa/limex_shuffle.hpp b/src/nfa/limex_shuffle.hpp new file mode 100644 index 000000000..367d400ba --- /dev/null +++ b/src/nfa/limex_shuffle.hpp @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Naive dynamic shuffles. + * + * These are written with the assumption that the provided masks are sparsely + * populated and never contain more than 32 on bits. Other implementations will + * be faster and actually correct if these assumptions don't hold true. + */ + +#ifndef LIMEX_SHUFFLE_HPP +#define LIMEX_SHUFFLE_HPP + +#include "ue2common.h" +#include "util/arch.h" +#include "util/bitutils.h" +#include "util/unaligned.h" +#include "util/supervector/supervector.hpp" + +template +u32 packedExtract(SuperVector s, const SuperVector permute, const SuperVector compare); + + +template <> +really_really_inline +u32 packedExtract<16>(SuperVector<16> s, const SuperVector<16> permute, const SuperVector<16> compare) { + SuperVector<16> shuffled = s.pshufb(permute); + SuperVector<16> compared = shuffled & compare; + u64a rv = (~compared.eqmask(shuffled)) & 0xffff; + if (SuperVector<16>::mask_width() != 1) { + u32 ans = 0; + for (u32 i = 0; i < 16; ++i) { + ans |= (rv & (1ull << (i * SuperVector<16>::mask_width()))) >> + (i * SuperVector<16>::mask_width() - i); + } + return ans; + } + return (u32)rv; +} + +template <> +really_really_inline +u32 packedExtract<32>(SuperVector<32> s, const SuperVector<32> permute, const SuperVector<32> compare) { + SuperVector<32> shuffled = s.pshufb(permute); + SuperVector<32> compared = shuffled & compare; + // TODO(danlark1): Future ARM support might have a bug. + u64a rv = (~compared.eqmask(shuffled)) & 0xffffffff; + return (u32)((rv >> 16) | (rv & 0xffffU)); +} + +template <> +really_really_inline +u32 packedExtract<64>(SuperVector<64> s, const SuperVector<64> permute, const SuperVector<64> compare) { + SuperVector<64> shuffled = s.pshufb(permute); + SuperVector<64> compared = shuffled & compare; + // TODO(danlark1): Future ARM support might have a bug. 
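+    // As in the 16- and 32-byte cases: shuffle, mask, then compare; the wider
+    // equality mask is OR-folded (32-bit halves, then 16-bit halves) down to
+    // the 16 result bits callers expect.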
+ u64a rv = ~compared.eqmask(shuffled); + rv = rv >> 32 | rv; + return (u32)(((rv >> 16) | rv) & 0xffffU); +} + + +#endif // LIMEX_SHUFFLE_HPP diff --git a/src/nfa/mcclellan.c b/src/nfa/mcclellan.c index 71f71e327..a7fcb06a4 100644 --- a/src/nfa/mcclellan.c +++ b/src/nfa/mcclellan.c @@ -634,6 +634,12 @@ char nfaExecMcClellan16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, assert(ISALIGNED_N(q->state, 2)); u32 s = *(u16 *)q->state; + const u8 *base = ROUNDDOWN_PTR(&m->remap[0], 64); + __builtin_prefetch(base); + __builtin_prefetch(base + 64); + __builtin_prefetch(base + 128); + __builtin_prefetch(base + 192); + if (q->report_current) { assert(s); assert(get_aux(m, s)->accept); @@ -790,6 +796,12 @@ char nfaExecMcClellan8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, u32 s = *(u8 *)q->state; + const u8 *base = ROUNDDOWN_PTR(&m->remap[0], 64); + __builtin_prefetch(base); + __builtin_prefetch(base + 64); + __builtin_prefetch(base + 128); + __builtin_prefetch(base + 192); + if (q->report_current) { assert(s); assert(s >= m->accept_limit_8); diff --git a/src/nfa/mcclellan_common_impl.h b/src/nfa/mcclellan_common_impl.h index 7b0e7f48c..6ec1b1f15 100644 --- a/src/nfa/mcclellan_common_impl.h +++ b/src/nfa/mcclellan_common_impl.h @@ -59,7 +59,7 @@ u32 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table, if (len) { m128 ss_char = load128(sherman_state); - m128 cur_char = set16x8(cprime); + m128 cur_char = set1_16x8(cprime); u32 z = movemask128(eq128(ss_char, cur_char)); diff --git a/src/nfa/mcclellancompile.cpp b/src/nfa/mcclellancompile.cpp index c1a4f87fc..d165b1faf 100644 --- a/src/nfa/mcclellancompile.cpp +++ b/src/nfa/mcclellancompile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -43,7 +43,6 @@ #include "util/compare.h" #include "util/compile_context.h" #include "util/container.h" -#include "util/make_unique.h" #include "util/order_check.h" #include "util/report_manager.h" #include "util/flat_containers.h" @@ -162,7 +161,7 @@ DfaPrevInfo::DfaPrevInfo(raw_dfa &rdfa) for (size_t i = 0; i < states.size(); i++) { for (symbol_t sym = 0; sym < impl_alpha_size; sym++) { dstate_id_t curr = rdfa.states[i].next[sym]; - states[curr].prev_vec[sym].push_back(i); + states[curr].prev_vec[sym].emplace_back(i); } if (!rdfa.states[i].reports.empty() || !rdfa.states[i].reports_eod.empty()) { @@ -393,12 +392,12 @@ unique_ptr mcclellan_build_strat::gatherReports( const bool remap_reports = has_managed_reports(rdfa.kind); - auto ri = ue2::make_unique(); + auto ri = std::make_unique(); map rev; for (const dstate &s : rdfa.states) { if (s.reports.empty()) { - reports.push_back(MO_INVALID_IDX); + reports.emplace_back(MO_INVALID_IDX); continue; } @@ -406,18 +405,18 @@ unique_ptr mcclellan_build_strat::gatherReports( DEBUG_PRINTF("non empty r\n"); auto it = rev.find(rrl); if (it != rev.end()) { - reports.push_back(it->second); + reports.emplace_back(it->second); } else { DEBUG_PRINTF("adding to rl %zu\n", ri->size()); rev.emplace(rrl, ri->size()); - reports.push_back(ri->size()); - ri->rl.push_back(rrl); + reports.emplace_back(ri->size()); + ri->rl.emplace_back(rrl); } } for (const dstate &s : rdfa.states) { if (s.reports_eod.empty()) { - reports_eod.push_back(MO_INVALID_IDX); + reports_eod.emplace_back(MO_INVALID_IDX); continue; } @@ -425,14 +424,14 @@ unique_ptr 
mcclellan_build_strat::gatherReports( raw_report_list rrl(s.reports_eod, rm, remap_reports); auto it = rev.find(rrl); if (it != rev.end()) { - reports_eod.push_back(it->second); + reports_eod.emplace_back(it->second); continue; } DEBUG_PRINTF("adding to rl eod %zu\n", s.reports_eod.size()); rev.emplace(rrl, ri->size()); - reports_eod.push_back(ri->size()); - ri->rl.push_back(rrl); + reports_eod.emplace_back(ri->size()); + ri->rl.emplace_back(rrl); } assert(!ri->rl.empty()); /* all components should be able to generate @@ -463,7 +462,7 @@ unique_ptr mcclellan_build_strat::gatherReports( *isSingleReport = 0; } - return move(ri); + return std::move(ri); } u32 raw_report_info_impl::getReportListSize() const { @@ -484,7 +483,7 @@ size_t raw_report_info_impl::size() const { void raw_report_info_impl::fillReportLists(NFA *n, size_t base_offset, vector &ro) const { for (const auto &reps : rl) { - ro.push_back(base_offset); + ro.emplace_back(base_offset); report_list *p = (report_list *)((char *)n + base_offset); @@ -569,13 +568,13 @@ bool allocateFSN16(dfa_info &info, dstate_id_t *sherman_base, for (u32 i = 1; i < info.size(); i++) { if (info.is_widehead(i)) { - wideHead.push_back(i); + wideHead.emplace_back(i); } else if (info.is_widestate(i)) { - wideState.push_back(i); + wideState.emplace_back(i); } else if (info.is_sherman(i)) { - sherm.push_back(i); + sherm.emplace_back(i); } else { - norm.push_back(i); + norm.emplace_back(i); } } @@ -893,11 +892,11 @@ void allocateFSN8(dfa_info &info, for (u32 i = 1; i < info.size(); i++) { if (!info.states[i].reports.empty()) { - accept.push_back(i); + accept.emplace_back(i); } else if (contains(accel_escape_info, i)) { - accel.push_back(i); + accel.emplace_back(i); } else { - norm.push_back(i); + norm.emplace_back(i); } } @@ -1082,7 +1081,9 @@ void find_better_daddy(dfa_info &info, dstate_id_t curr_id, bool using8bit, // Use the daddy already set for this state so long as it isn't already // a Sherman state. dstate_id_t daddy = currState.daddy; - if (!info.is_sherman(daddy) && !info.is_widestate(daddy)) { + if (info.is_widestate(daddy)) { + return; + } else if (!info.is_sherman(daddy)) { hinted.insert(currState.daddy); } else { // Fall back to granddaddy, which has already been processed (due @@ -1248,7 +1249,7 @@ dstate_id_t find_chain_candidate(const raw_dfa &rdfa, const DfaPrevInfo &info, const symbol_t curr_sym, vector &temp_chain) { //Record current id first. - temp_chain.push_back(curr_id); + temp_chain.emplace_back(curr_id); const u16 size = info.impl_alpha_size; @@ -1311,7 +1312,7 @@ bool store_chain_longest(vector> &candidate_chain, DEBUG_PRINTF("This is a new chain!\n"); // Add this new chain and get it marked. 
- candidate_chain.push_back(temp_chain); + candidate_chain.emplace_back(temp_chain); for (auto &id : temp_chain) { DEBUG_PRINTF("(Marking s%u ...)\n", id); @@ -1385,18 +1386,18 @@ void generate_symbol_chain(dfa_info &info, vector &chain_tail) { // The tail symbol comes from vector chain_tail; if (j == width - 1) { - symbol_chain.push_back(chain_tail[i]); + symbol_chain.emplace_back(chain_tail[i]); } else { for (symbol_t sym = 0; sym < info.impl_alpha_size; sym++) { if (rdfa.states[curr_id].next[sym] == next_id) { - symbol_chain.push_back(sym); + symbol_chain.emplace_back(sym); break; } } } } - info.wide_symbol_chain.push_back(symbol_chain); + info.wide_symbol_chain.emplace_back(symbol_chain); } } @@ -1445,12 +1446,12 @@ void find_wide_state(dfa_info &info) { } reverse(temp_chain.begin(), temp_chain.end()); - temp_chain.push_back(curr_id); + temp_chain.emplace_back(curr_id); assert(head > 0 && head == temp_chain.front()); if (store_chain_longest(info.wide_state_chain, temp_chain, added, head_is_new)) { - chain_tail.push_back(sym); + chain_tail.emplace_back(sym); } } } @@ -1477,28 +1478,32 @@ bytecode_ptr mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &strat, bytecode_ptr nfa; if (!using8bit) { + // Wide state optimization if (cc.grey.allowWideStates && strat.getType() == McClellan && !is_triggered(raw.kind)) { find_wide_state(info); } - u16 total_daddy = 0; bool any_cyclic_near_anchored_state = is_cyclic_near(raw, raw.start_anchored); - for (u32 i = 0; i < info.size(); i++) { - if (info.is_widestate(i)) { - continue; + // Sherman optimization + if (info.impl_alpha_size > 16) { + u16 total_daddy = 0; + for (u32 i = 0; i < info.size(); i++) { + if (info.is_widestate(i)) { + continue; + } + find_better_daddy(info, i, using8bit, + any_cyclic_near_anchored_state, + trust_daddy_states, cc.grey); + total_daddy += info.extra[i].daddytaken; } - find_better_daddy(info, i, using8bit, - any_cyclic_near_anchored_state, - trust_daddy_states, cc.grey); - total_daddy += info.extra[i].daddytaken; - } - DEBUG_PRINTF("daddy %hu/%zu states=%zu alpha=%hu\n", total_daddy, - info.size() * info.impl_alpha_size, info.size(), - info.impl_alpha_size); + DEBUG_PRINTF("daddy %hu/%zu states=%zu alpha=%hu\n", total_daddy, + info.size() * info.impl_alpha_size, info.size(), + info.impl_alpha_size); + } nfa = mcclellanCompile16(info, cc, accel_states); } else { diff --git a/src/nfa/mcclellancompile_util.cpp b/src/nfa/mcclellancompile_util.cpp index 3e299b81e..d0df0319a 100644 --- a/src/nfa/mcclellancompile_util.cpp +++ b/src/nfa/mcclellancompile_util.cpp @@ -150,7 +150,7 @@ u32 calc_min_dist_from_bob(raw_dfa &raw, vector *dist_in) { continue; } if (dist[t] == ~0U) { - to_visit.push_back(t); + to_visit.emplace_back(t); dist[t] = d + 1; } else { assert(dist[t] <= d + 1); diff --git a/src/nfa/mcsheng.c b/src/nfa/mcsheng.c index 4619ff6fd..5c97d73a4 100644 --- a/src/nfa/mcsheng.c +++ b/src/nfa/mcsheng.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -72,7 +72,7 @@ u32 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table, if (len) { m128 ss_char = load128(sherman_state); - m128 cur_char = set16x8(cprime); + m128 cur_char = set1_16x8(cprime); u32 z = movemask128(eq128(ss_char, cur_char)); @@ -153,7 +153,7 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end, 
assert(s_in); /* should not already be dead */ assert(soft_c_end <= hard_c_end); DEBUG_PRINTF("s_in = %u (adjusted %u)\n", s_in, s_in - 1); - m128 s = set16x8(s_in - 1); + m128 s = set1_16x8(s_in - 1); const u8 *c = *c_inout; const u8 *c_end = hard_c_end - SHENG_CHUNK + 1; if (!do_accel) { @@ -171,8 +171,8 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end, #if defined(HAVE_BMI2) && defined(ARCH_64_BIT) u32 sheng_limit_x4 = sheng_limit * 0x01010101; - m128 simd_stop_limit = set4x32(sheng_stop_limit_x4); - m128 accel_delta = set16x8(sheng_limit - sheng_stop_limit); + m128 simd_stop_limit = set1_4x32(sheng_stop_limit_x4); + m128 accel_delta = set1_16x8(sheng_limit - sheng_stop_limit); DEBUG_PRINTF("end %hhu, accel %hu --> limit %hhu\n", sheng_limit, m->sheng_accel_limit, sheng_stop_limit); #endif @@ -889,6 +889,12 @@ char nfaExecMcSheng16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, return MO_ALIVE; } + const u8 *base = ROUNDDOWN_PTR(&m->remap[0], 64); + __builtin_prefetch(base); + __builtin_prefetch(base + 64); + __builtin_prefetch(base + 128); + __builtin_prefetch(base + 192); + while (1) { assert(q->cur < q->end); s64a ep = q->items[q->cur].location; @@ -1017,6 +1023,12 @@ char nfaExecMcSheng8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, return MO_ALIVE; } + const u8 *base = ROUNDDOWN_PTR(&m->remap[0], 64); + __builtin_prefetch(base); + __builtin_prefetch(base + 64); + __builtin_prefetch(base + 128); + __builtin_prefetch(base + 192); + while (1) { DEBUG_PRINTF("%s @ %llu\n", q->items[q->cur].type == MQE_TOP ? "TOP" : q->items[q->cur].type == MQE_END ? "END" : "???", @@ -1184,7 +1196,7 @@ char nfaExecMcSheng16_reportCurrent(const struct NFA *n, struct mq *q) { static char mcshengHasAccept(const struct mcsheng *m, const struct mstate_aux *aux, - ReportID report) { + ReportID report) { assert(m && aux); if (!aux->accept) { @@ -1405,3 +1417,1332 @@ char nfaExecMcSheng16_expandState(UNUSED const struct NFA *nfa, void *dest, *(u16 *)dest = unaligned_load_u16(src); return 0; } + +#if defined(HAVE_AVX512VBMI) +static really_inline +const struct mstate_aux *get_aux64(const struct mcsheng64 *m, u32 s) { + const char *nfa = (const char *)m - sizeof(struct NFA); + const struct mstate_aux *aux + = s + (const struct mstate_aux *)(nfa + m->aux_offset); + + assert(ISALIGNED(aux)); + return aux; +} + +static really_inline +u32 mcshengEnableStarts64(const struct mcsheng64 *m, u32 s) { + const struct mstate_aux *aux = get_aux64(m, s); + + DEBUG_PRINTF("enabling starts %u->%hu\n", s, aux->top); + return aux->top; +} + +static really_inline +char doComplexReport64(NfaCallback cb, void *ctxt, const struct mcsheng64 *m, + u32 s, u64a loc, char eod, u32 *cached_accept_state, + u32 *cached_accept_id) { + DEBUG_PRINTF("reporting state = %u, loc=%llu, eod %hhu\n", + s & STATE_MASK, loc, eod); + + if (!eod && s == *cached_accept_state) { + if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + const struct mstate_aux *aux = get_aux64(m, s); + size_t offset = eod ? 
aux->accept_eod : aux->accept; + + assert(offset); + const struct report_list *rl + = (const void *)((const char *)m + offset - sizeof(struct NFA)); + assert(ISALIGNED(rl)); + + DEBUG_PRINTF("report list size %u\n", rl->count); + u32 count = rl->count; + + if (!eod && count == 1) { + *cached_accept_state = s; + *cached_accept_id = rl->report[0]; + + DEBUG_PRINTF("reporting %u\n", rl->report[0]); + if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + for (u32 i = 0; i < count; i++) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + } + + return MO_CONTINUE_MATCHING; /* continue execution */ +} + +static really_inline +u32 doSheng64(const struct mcsheng64 *m, const u8 **c_inout, const u8 *soft_c_end, + const u8 *hard_c_end, u32 s_in, char do_accel) { + assert(s_in < m->sheng_end); + assert(s_in); /* should not already be dead */ + assert(soft_c_end <= hard_c_end); + DEBUG_PRINTF("s_in = %u (adjusted %u)\n", s_in, s_in - 1); + m512 s = set1_64x8(s_in - 1); + const u8 *c = *c_inout; + const u8 *c_end = hard_c_end - SHENG_CHUNK + 1; + if (!do_accel) { + c_end = MIN(soft_c_end, hard_c_end - SHENG_CHUNK + 1); + } + + const m512 *masks = m->sheng_succ_masks; + u8 sheng_limit = m->sheng_end - 1; /* - 1: no dead state */ + u8 sheng_stop_limit = do_accel ? m->sheng_accel_limit : sheng_limit; + + /* When we use movd to get a u32 containing our state, it will have 4 lanes + * all duplicating the state. We can create versions of our limits with 4 + * copies to directly compare against, this prevents us generating code to + * extract a single copy of the state from the u32 for checking. 
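+     * For example, state 5 is held as 0x05050505; because every byte lane is
+     * identical, the unsigned compare against sheng_stop_limit * 0x01010101
+     * answers the same question as comparing one byte against
+     * sheng_stop_limit.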
*/ + u32 sheng_stop_limit_x4 = sheng_stop_limit * 0x01010101; + +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) + u32 sheng_limit_x4 = sheng_limit * 0x01010101; + m512 simd_stop_limit = set1_16x32(sheng_stop_limit_x4); + m512 accel_delta = set1_64x8(sheng_limit - sheng_stop_limit); + DEBUG_PRINTF("end %hhu, accel %hu --> limit %hhu\n", sheng_limit, + m->sheng_accel_limit, sheng_stop_limit); +#endif + +#define SHENG64_SINGLE_ITER do { \ + m512 succ_mask = masks[*(c++)]; \ + s = vpermb512(s, succ_mask); \ + u32 s_gpr_x4 = movd512(s); /* convert to u8 */ \ + DEBUG_PRINTF("c %hhu (%c) --> s %u\n", c[-1], c[-1], s_gpr_x4); \ + if (s_gpr_x4 >= sheng_stop_limit_x4) { \ + s_gpr = s_gpr_x4; \ + goto exit; \ + } \ + } while (0) + + u8 s_gpr; + while (c < c_end) { +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) + /* This version uses pext for efficiently bitbashing out scaled + * versions of the bytes to process from a u64a */ + + u64a data_bytes = unaligned_load_u64a(c); + u64a cc0 = pdep64(data_bytes, 0x3fc0); /* extract scaled low byte */ + data_bytes &= ~0xffULL; /* clear low bits for scale space */ + + m512 succ_mask0 = load512((const char *)masks + cc0); + s = vpermb512(s, succ_mask0); + m512 s_max = s; + m512 s_max0 = s_max; + DEBUG_PRINTF("c %02llx --> s %u\n", cc0 >> 6, movd512(s)); + +#define SHENG64_SINGLE_UNROLL_ITER(iter) \ + assert(iter); \ + u64a cc##iter = pext64(data_bytes, mcsheng64_pext_mask[iter]); \ + assert(cc##iter == (u64a)c[iter] << 6); \ + m512 succ_mask##iter = load512((const char *)masks + cc##iter); \ + s = vpermb512(s, succ_mask##iter); \ + if (do_accel && iter == 7) { \ + /* in the final iteration we also have to check against accel */ \ + m512 s_temp = sadd_u8_m512(s, accel_delta); \ + s_max = max_u8_m512(s_max, s_temp); \ + } else { \ + s_max = max_u8_m512(s_max, s); \ + } \ + m512 s_max##iter = s_max; \ + DEBUG_PRINTF("c %02llx --> s %u max %u\n", cc##iter >> 6, \ + movd512(s), movd512(s_max)); + + SHENG64_SINGLE_UNROLL_ITER(1); + SHENG64_SINGLE_UNROLL_ITER(2); + SHENG64_SINGLE_UNROLL_ITER(3); + SHENG64_SINGLE_UNROLL_ITER(4); + SHENG64_SINGLE_UNROLL_ITER(5); + SHENG64_SINGLE_UNROLL_ITER(6); + SHENG64_SINGLE_UNROLL_ITER(7); + + if (movd512(s_max7) >= sheng_limit_x4) { + DEBUG_PRINTF("exit found\n"); + + /* Explicitly check the last byte as it is more likely as it also + * checks for acceleration. 
*/ + if (movd512(s_max6) < sheng_limit_x4) { + c += SHENG_CHUNK; + s_gpr = movq512(s); + assert(s_gpr >= sheng_stop_limit); + goto exit; + } + + /* use shift-xor to create a register containing all of the max + * values */ + m512 blended = rshift64_m512(s_max0, 56); + blended = xor512(blended, rshift64_m512(s_max1, 48)); + blended = xor512(blended, rshift64_m512(s_max2, 40)); + blended = xor512(blended, rshift64_m512(s_max3, 32)); + blended = xor512(blended, rshift64_m512(s_max4, 24)); + blended = xor512(blended, rshift64_m512(s_max5, 16)); + blended = xor512(blended, rshift64_m512(s_max6, 8)); + blended = xor512(blended, s); + blended = xor512(blended, rshift64_m512(blended, 8)); + DEBUG_PRINTF("blended %016llx\n", movq512(blended)); + + m512 final = min_u8_m512(blended, simd_stop_limit); + m512 cmp = sub_u8_m512(final, simd_stop_limit); + m128 tmp = cast512to128(cmp); + u64a stops = ~movemask128(tmp); + assert(stops); + u32 earliest = ctz32(stops); + DEBUG_PRINTF("stops %02llx, earliest %u\n", stops, earliest); + assert(earliest < 8); + c += earliest + 1; + s_gpr = movq512(blended) >> (earliest * 8); + assert(s_gpr >= sheng_stop_limit); + goto exit; + } else { + c += SHENG_CHUNK; + } +#else + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; + + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; +#endif + } + + assert(c_end - c < SHENG_CHUNK); + if (c < soft_c_end) { + assert(soft_c_end - c < SHENG_CHUNK); + switch (soft_c_end - c) { + case 7: + SHENG64_SINGLE_ITER; // fallthrough + case 6: + SHENG64_SINGLE_ITER; // fallthrough + case 5: + SHENG64_SINGLE_ITER; // fallthrough + case 4: + SHENG64_SINGLE_ITER; // fallthrough + case 3: + SHENG64_SINGLE_ITER; // fallthrough + case 2: + SHENG64_SINGLE_ITER; // fallthrough + case 1: + SHENG64_SINGLE_ITER; // fallthrough + } + } + + assert(c >= soft_c_end); + + s_gpr = movq512(s); +exit: + assert(c <= hard_c_end); + DEBUG_PRINTF("%zu from end; s %hhu\n", c_end - c, s_gpr); + assert(c >= soft_c_end || s_gpr >= sheng_stop_limit); + /* undo state adjustment to match mcclellan view */ + if (s_gpr == sheng_limit) { + s_gpr = 0; + } else if (s_gpr < sheng_limit) { + s_gpr++; + } + + *c_inout = c; + return s_gpr; +} + +static really_inline +const char *findShermanState64(UNUSED const struct mcsheng64 *m, + const char *sherman_base_offset, + u32 sherman_base, u32 s) { + const char *rv + = sherman_base_offset + SHERMAN_FIXED_SIZE * (s - sherman_base); + assert(rv < (const char *)m + m->length - sizeof(struct NFA)); + UNUSED u8 type = *(const u8 *)(rv + SHERMAN_TYPE_OFFSET); + assert(type == SHERMAN_STATE); + return rv; +} + +static really_inline +const u8 *run_mcsheng_accel64(const struct mcsheng64 *m, + const struct mstate_aux *aux, u32 s, + const u8 **min_accel_offset, + const u8 *c, const u8 *c_end) { + DEBUG_PRINTF("skipping\n"); + u32 accel_offset = aux[s].accel_offset; + + assert(aux[s].accel_offset); + assert(accel_offset >= m->aux_offset); + assert(!m->sherman_offset || accel_offset < m->sherman_offset); + + const union AccelAux *aaux = (const void *)((const char *)m + accel_offset); + const u8 *c2 = run_accel(aaux, c, c_end); + + if (c2 < *min_accel_offset + BAD_ACCEL_DIST) { + *min_accel_offset = c2 + BIG_ACCEL_PENALTY; + } else { + *min_accel_offset = c2 + SMALL_ACCEL_PENALTY; + } + + if (*min_accel_offset >= c_end - ACCEL_MIN_LEN) { + *min_accel_offset = c_end; + } + + DEBUG_PRINTF("advanced %zd, next accel chance in %zd/%zd\n", + c2 - c, *min_accel_offset - c2, 
c_end - c2); + + return c2; +} + +static really_inline +u32 doNormal64_16(const struct mcsheng64 *m, const u8 **c_inout, const u8 *end, + u32 s, char do_accel, enum MatchMode mode) { + const u8 *c = *c_inout; + const u16 *succ_table + = (const u16 *)((const char *)m + sizeof(struct mcsheng64)); + assert(ISALIGNED_N(succ_table, 2)); + u32 sheng_end = m->sheng_end; + u32 sherman_base = m->sherman_limit; + const char *sherman_base_offset + = (const char *)m - sizeof(struct NFA) + m->sherman_offset; + u32 as = m->alphaShift; + + /* Adjust start of succ table so we can index into using state id (rather + * than adjust to normal id). As we will not be processing states with low + * state ids, we will not be accessing data before the succ table. Note: due + * to the size of the sheng tables, the succ_table pointer will still be + * inside the engine.*/ + succ_table -= sheng_end << as; + s &= STATE_MASK; + while (c < end && s >= sheng_end) { + u8 cprime = m->remap[*c]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx (s=%u)\n", *c, + ourisprint(*c) ? *c : '?', cprime, s); + if (s < sherman_base) { + DEBUG_PRINTF("doing normal\n"); + assert(s < m->state_count); + s = succ_table[(s << as) + cprime]; + } else { + const char *sherman_state + = findShermanState64(m, sherman_base_offset, sherman_base, s); + DEBUG_PRINTF("doing sherman (%u)\n", s); + s = doSherman16(sherman_state, cprime, succ_table, as); + } + + DEBUG_PRINTF("s: %u (%u)\n", s, s & STATE_MASK); + c++; + + if (do_accel && (s & ACCEL_FLAG)) { + break; + } + if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) { + break; + } + + s &= STATE_MASK; + } + + *c_inout = c; + return s; +} + +static really_inline +char mcsheng64Exec16_i(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **c_final, enum MatchMode mode) { + assert(ISALIGNED_N(state, 2)); + if (!len) { + if (mode == STOP_AT_MATCH) { + *c_final = buf; + } + return MO_ALIVE; + } + + u32 s = *state; + const u8 *c = buf; + const u8 *c_end = buf + len; + const u8 sheng_end = m->sheng_end; + const struct mstate_aux *aux + = (const struct mstate_aux *)((const char *)m + m->aux_offset + - sizeof(struct NFA)); + + s &= STATE_MASK; + + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + DEBUG_PRINTF("s: %u, len %zu\n", s, len); + + const u8 *min_accel_offset = c; + if (!m->has_accel || len < ACCEL_MIN_LEN) { + min_accel_offset = c_end; + goto without_accel; + } + + goto with_accel; + +without_accel: + do { + assert(c < min_accel_offset); + int do_accept; + if (!s) { + goto exit; + } else if (s < sheng_end) { + s = doSheng64(m, &c, min_accel_offset, c_end, s, 0); + do_accept = mode != NO_MATCHES && get_aux64(m, s)->accept; + } else { + s = doNormal64_16(m, &c, min_accel_offset, s, 0, mode); + + do_accept = mode != NO_MATCHES && (s & ACCEPT_FLAG); + } + + if (do_accept) { + if (mode == STOP_AT_MATCH) { + *state = s & STATE_MASK; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; /* termination requested */ + } + } else if (doComplexReport64(cb, ctxt, m, s & STATE_MASK, loc, 0, + &cached_accept_state, + &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); /* sheng is fuzzy for min_accel_offset */ + } while (c < min_accel_offset); + + if (c == c_end) { + goto exit; + } + +with_accel: + do { + 
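+        // In this loop we attempt acceleration whenever the current state is
+        // accelerable; run_mcsheng_accel64() skips ahead using the
+        // precomputed scheme, then we drop back to the plain loop until the
+        // (penalised) min_accel_offset has been passed again.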
assert(c < c_end); + int do_accept; + + if (!s) { + goto exit; + } else if (s < sheng_end) { + if (s > m->sheng_accel_limit) { + c = run_mcsheng_accel64(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + s = doSheng64(m, &c, c_end, c_end, s, 1); + do_accept = mode != NO_MATCHES && get_aux64(m, s)->accept; + } else { + if (s & ACCEL_FLAG) { + DEBUG_PRINTF("skipping\n"); + s &= STATE_MASK; + c = run_mcsheng_accel64(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + + s = doNormal64_16(m, &c, c_end, s, 1, mode); + do_accept = mode != NO_MATCHES && (s & ACCEPT_FLAG); + } + + if (do_accept) { + if (mode == STOP_AT_MATCH) { + *state = s & STATE_MASK; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; /* termination requested */ + } + } else if (doComplexReport64(cb, ctxt, m, s & STATE_MASK, loc, 0, + &cached_accept_state, + &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); + } while (c < c_end); + +exit: + s &= STATE_MASK; + + if (mode == STOP_AT_MATCH) { + *c_final = c_end; + } + *state = s; + + return MO_ALIVE; +} + +static never_inline +char mcsheng64Exec16_i_cb(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec16_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, CALLBACK_OUTPUT); +} + +static never_inline +char mcsheng64Exec16_i_sam(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec16_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, STOP_AT_MATCH); +} + +static never_inline +char mcsheng64Exec16_i_nm(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec16_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, NO_MATCHES); +} + +static really_inline +char mcsheng64Exec16_i_ni(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point, + enum MatchMode mode) { + if (mode == CALLBACK_OUTPUT) { + return mcsheng64Exec16_i_cb(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else if (mode == STOP_AT_MATCH) { + return mcsheng64Exec16_i_sam(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else { + assert (mode == NO_MATCHES); + return mcsheng64Exec16_i_nm(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } +} + +static really_inline +u32 doNormal64_8(const struct mcsheng64 *m, const u8 **c_inout, const u8 *end, u32 s, + char do_accel, enum MatchMode mode) { + const u8 *c = *c_inout; + u32 sheng_end = m->sheng_end; + u32 accel_limit = m->accel_limit_8; + u32 accept_limit = m->accept_limit_8; + + const u32 as = m->alphaShift; + const u8 *succ_table = (const u8 *)((const char *)m + + sizeof(struct mcsheng64)); + /* Adjust start of succ table so we can index into using state id (rather + * than adjust to normal id). 
As we will not be processing states with low + * state ids, we will not be accessing data before the succ table. Note: due + * to the size of the sheng tables, the succ_table pointer will still be + * inside the engine.*/ + succ_table -= sheng_end << as; + + assert(s >= sheng_end); + while (c < end && s >= sheng_end) { + u8 cprime = m->remap[*c]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx\n", *c, + ourisprint(*c) ? *c : '?', cprime); + s = succ_table[(s << as) + cprime]; + + DEBUG_PRINTF("s: %u\n", s); + c++; + if (do_accel) { + if (s >= accel_limit) { + break; + } + } else { + if (mode != NO_MATCHES && s >= accept_limit) { + break; + } + } + } + *c_inout = c; + return s; +} + +static really_inline +char mcsheng64Exec8_i(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **c_final, enum MatchMode mode) { + if (!len) { + *c_final = buf; + return MO_ALIVE; + } + u32 s = *state; + const u8 *c = buf; + const u8 *c_end = buf + len; + const u8 sheng_end = m->sheng_end; + + const struct mstate_aux *aux + = (const struct mstate_aux *)((const char *)m + m->aux_offset + - sizeof(struct NFA)); + u32 accept_limit = m->accept_limit_8; + + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + DEBUG_PRINTF("accel %hu, accept %u\n", m->accel_limit_8, accept_limit); + + DEBUG_PRINTF("s: %u, len %zu\n", s, len); + + const u8 *min_accel_offset = c; + if (!m->has_accel || len < ACCEL_MIN_LEN) { + min_accel_offset = c_end; + goto without_accel; + } + + goto with_accel; + +without_accel: + do { + assert(c < min_accel_offset); + if (!s) { + goto exit; + } else if (s < sheng_end) { + s = doSheng64(m, &c, min_accel_offset, c_end, s, 0); + } else { + s = doNormal64_8(m, &c, min_accel_offset, s, 0, mode); + assert(c <= min_accel_offset); + } + + if (mode != NO_MATCHES && s >= accept_limit) { + if (mode == STOP_AT_MATCH) { + DEBUG_PRINTF("match - pausing\n"); + *state = s; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; + } + } else if (doComplexReport64(cb, ctxt, m, s, loc, 0, + &cached_accept_state, + &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); /* sheng is fuzzy for min_accel_offset */ + } while (c < min_accel_offset); + + if (c == c_end) { + goto exit; + } + +with_accel: + do { + u32 accel_limit = m->accel_limit_8; + + assert(c < c_end); + if (!s) { + goto exit; + } else if (s < sheng_end) { + if (s > m->sheng_accel_limit) { + c = run_mcsheng_accel64(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + s = doSheng64(m, &c, c_end, c_end, s, 1); + } else { + if (s >= accel_limit && aux[s].accel_offset) { + c = run_mcsheng_accel64(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + s = doNormal64_8(m, &c, c_end, s, 1, mode); + } + + if (mode != NO_MATCHES && s >= accept_limit) { + if (mode == STOP_AT_MATCH) { + DEBUG_PRINTF("match - pausing\n"); + *state = s; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; + } + } else if (doComplexReport64(cb, ctxt, m, s, loc, 0, + 
&cached_accept_state, + &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); + } while (c < c_end); + +exit: + *state = s; + if (mode == STOP_AT_MATCH) { + *c_final = c_end; + } + return MO_ALIVE; +} + +static never_inline +char mcsheng64Exec8_i_cb(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, CALLBACK_OUTPUT); +} + +static never_inline +char mcsheng64Exec8_i_sam(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, STOP_AT_MATCH); +} + +static never_inline +char mcsheng64Exec8_i_nm(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, NO_MATCHES); +} + +static really_inline +char mcsheng64Exec8_i_ni(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point, + enum MatchMode mode) { + if (mode == CALLBACK_OUTPUT) { + return mcsheng64Exec8_i_cb(m, state, buf, len, offAdj, cb, ctxt, single, + final_point); + } else if (mode == STOP_AT_MATCH) { + return mcsheng64Exec8_i_sam(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else { + assert(mode == NO_MATCHES); + return mcsheng64Exec8_i_nm(m, state, buf, len, offAdj, cb, ctxt, single, + final_point); + } +} + +static really_inline +char mcshengCheckEOD64(const struct NFA *nfa, u32 s, u64a offset, + NfaCallback cb, void *ctxt) { + const struct mcsheng64 *m = getImplNfa(nfa); + const struct mstate_aux *aux = get_aux64(m, s); + + if (!aux->accept_eod) { + return MO_CONTINUE_MATCHING; + } + return doComplexReport64(cb, ctxt, m, s, offset, 1, NULL, NULL); +} + +static really_inline +char nfaExecMcSheng64_16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, + const u8 *hend, NfaCallback cb, void *context, + struct mq *q, char single, s64a end, + enum MatchMode mode) { + assert(n->type == MCSHENG_64_NFA_16); + const struct mcsheng64 *m = getImplNfa(n); + s64a sp; + + assert(ISALIGNED_N(q->state, 2)); + u32 s = *(u16 *)q->state; + + if (q->report_current) { + assert(s); + assert(get_aux64(m, s)->accept); + + int rv; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + rv = cb(0, q_cur_offset(q), m->arb_report, context); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + rv = doComplexReport64(cb, context, m, s, q_cur_offset(q), 0, + &cached_accept_state, &cached_accept_id); + } + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + sp = q_cur_loc(q); + q->cur++; + + const u8 *cur_buf = sp < 0 ? 
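+ /* negative queue locations lie in the history buffer: with cur_buf == hend, cur_buf + sp points back into history until sp reaches 0 and the scan switches to the main buffer */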
hend : buffer; + + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u16 *)q->state = s; + return MO_ALIVE; + } + + while (1) { + assert(q->cur < q->end); + s64a ep = q->items[q->cur].location; + if (mode != NO_MATCHES) { + ep = MIN(ep, end); + } + + assert(ep >= sp); + + s64a local_ep = ep; + if (sp < 0) { + local_ep = MIN(0, ep); + } + + /* do main buffer region */ + const u8 *final_look; + char rv = mcsheng64Exec16_i_ni(m, &s, cur_buf + sp, local_ep - sp, + offset + sp, cb, context, single, + &final_look, mode); + if (rv == MO_DEAD) { + *(u16 *)q->state = 0; + return MO_DEAD; + } + if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) { + DEBUG_PRINTF("this is as far as we go\n"); + DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf); + + assert(q->cur); + assert(final_look != cur_buf + local_ep); + + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = final_look - cur_buf + 1; /* due to + * early -1 */ + *(u16 *)q->state = s; + return MO_MATCHES_PENDING; + } + + assert(rv == MO_ALIVE); + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u16 *)q->state = s; + return MO_ALIVE; + } + + sp = local_ep; + + if (sp == 0) { + cur_buf = buffer; + } + + if (sp != ep) { + continue; + } + + switch (q->items[q->cur].type) { + case MQE_TOP: + assert(sp + offset || !s); + if (sp + offset == 0) { + s = m->start_anchored; + break; + } + s = mcshengEnableStarts64(m, s); + break; + case MQE_END: + *(u16 *)q->state = s; + q->cur++; + return s ? MO_ALIVE : MO_DEAD; + default: + assert(!"invalid queue event"); + } + + q->cur++; + } +} + +static really_inline +char nfaExecMcSheng64_8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, + const u8 *hend, NfaCallback cb, void *context, + struct mq *q, char single, s64a end, + enum MatchMode mode) { + assert(n->type == MCSHENG_64_NFA_8); + const struct mcsheng64 *m = getImplNfa(n); + s64a sp; + + u32 s = *(u8 *)q->state; + + if (q->report_current) { + assert(s); + assert(s >= m->accept_limit_8); + + int rv; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + + rv = cb(0, q_cur_offset(q), m->arb_report, context); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + rv = doComplexReport64(cb, context, m, s, q_cur_offset(q), 0, + &cached_accept_state, &cached_accept_id); + } + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + sp = q_cur_loc(q); + q->cur++; + + const u8 *cur_buf = sp < 0 ? hend : buffer; + + if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u8 *)q->state = s; + return MO_ALIVE; + } + + while (1) { + DEBUG_PRINTF("%s @ %llu\n", q->items[q->cur].type == MQE_TOP ? "TOP" : + q->items[q->cur].type == MQE_END ? 
"END" : "???", + q->items[q->cur].location + offset); + assert(q->cur < q->end); + s64a ep = q->items[q->cur].location; + if (mode != NO_MATCHES) { + ep = MIN(ep, end); + } + + assert(ep >= sp); + + s64a local_ep = ep; + if (sp < 0) { + local_ep = MIN(0, ep); + } + + const u8 *final_look; + char rv = mcsheng64Exec8_i_ni(m, &s, cur_buf + sp, local_ep - sp, + offset + sp, cb, context, single, + &final_look, mode); + if (rv == MO_HALT_MATCHING) { + *(u8 *)q->state = 0; + return MO_DEAD; + } + if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) { + DEBUG_PRINTF("this is as far as we go\n"); + DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf); + + assert(q->cur); + assert(final_look != cur_buf + local_ep); + + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = final_look - cur_buf + 1; /* due to + * early -1 */ + *(u8 *)q->state = s; + return MO_MATCHES_PENDING; + } + + assert(rv == MO_ALIVE); + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + assert(q->cur); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u8 *)q->state = s; + return MO_ALIVE; + } + + sp = local_ep; + + if (sp == 0) { + cur_buf = buffer; + } + + if (sp != ep) { + continue; + } + + switch (q->items[q->cur].type) { + case MQE_TOP: + assert(sp + offset || !s); + if (sp + offset == 0) { + s = (u8)m->start_anchored; + break; + } + s = mcshengEnableStarts64(m, s); + break; + case MQE_END: + *(u8 *)q->state = s; + q->cur++; + return s ? MO_ALIVE : MO_DEAD; + default: + assert(!"invalid queue event"); + } + + q->cur++; + } +} + +char nfaExecMcSheng64_8_Q(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_8); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng64_8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + CALLBACK_OUTPUT); +} + +char nfaExecMcSheng64_16_Q(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_16); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng64_16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + CALLBACK_OUTPUT); +} + +char nfaExecMcSheng64_8_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mcsheng64 *m = getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u32 s = *(u8 *)q->state; + u8 single = m->flags & MCSHENG_FLAG_SINGLE; + u64a offset = q_cur_offset(q); + assert(q_cur_type(q) == MQE_START); + assert(s); + + if (s >= m->accept_limit_8) { + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + cb(0, offset, m->arb_report, ctxt); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + doComplexReport64(cb, ctxt, m, s, offset, 0, &cached_accept_state, + &cached_accept_id); + } + } + + return 0; +} + +char nfaExecMcSheng64_16_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mcsheng64 *m = getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u32 s = *(u16 *)q->state; + const struct mstate_aux *aux = get_aux64(m, s); + u8 single = m->flags & MCSHENG_FLAG_SINGLE; + u64a 
offset = q_cur_offset(q); + assert(q_cur_type(q) == MQE_START); + DEBUG_PRINTF("state %u\n", s); + assert(s); + + if (aux->accept) { + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + cb(0, offset, m->arb_report, ctxt); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + doComplexReport64(cb, ctxt, m, s, offset, 0, &cached_accept_state, + &cached_accept_id); + } + } + + return 0; +} + +static +char mcshengHasAccept64(const struct mcsheng64 *m, const struct mstate_aux *aux, + ReportID report) { + assert(m && aux); + + if (!aux->accept) { + return 0; + } + + const struct report_list *rl = (const struct report_list *) + ((const char *)m + aux->accept - sizeof(struct NFA)); + assert(ISALIGNED_N(rl, 4)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + + for (u32 i = 0; i < rl->count; i++) { + if (rl->report[i] == report) { + return 1; + } + } + + return 0; +} + +char nfaExecMcSheng64_8_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct mcsheng64 *m = getImplNfa(n); + u8 s = *(u8 *)q->state; + DEBUG_PRINTF("checking accepts for %hhu\n", s); + + return mcshengHasAccept64(m, get_aux64(m, s), report); +} + +char nfaExecMcSheng64_8_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct mcsheng64 *m = getImplNfa(n); + u8 s = *(u8 *)q->state; + DEBUG_PRINTF("checking accepts for %hhu\n", s); + + return !!get_aux64(m, s)->accept; +} + +char nfaExecMcSheng64_16_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct mcsheng64 *m = getImplNfa(n); + u16 s = *(u16 *)q->state; + DEBUG_PRINTF("checking accepts for %hu\n", s); + + return mcshengHasAccept64(m, get_aux64(m, s), report); +} + +char nfaExecMcSheng64_16_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct mcsheng64 *m = getImplNfa(n); + u16 s = *(u16 *)q->state; + DEBUG_PRINTF("checking accepts for %hu\n", s); + + return !!get_aux64(m, s)->accept; +} + +char nfaExecMcSheng64_8_Q2(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_8); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng64_8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + STOP_AT_MATCH); +} + +char nfaExecMcSheng64_16_Q2(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_16); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng64_16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + STOP_AT_MATCH); +} + +char nfaExecMcSheng64_8_QR(const struct NFA *n, struct mq *q, ReportID report) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_8); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + char rv = nfaExecMcSheng64_8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, + 0 /* end */, NO_MATCHES); + if (rv && nfaExecMcSheng64_8_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } else { + return rv; + } +} + +char 
nfaExecMcSheng64_16_QR(const struct NFA *n, struct mq *q, ReportID report) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_16); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + char rv = nfaExecMcSheng64_16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, + 0 /* end */, NO_MATCHES); + + if (rv && nfaExecMcSheng64_16_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } else { + return rv; + } +} + +char nfaExecMcSheng64_8_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct mcsheng64 *m = getImplNfa(nfa); + u8 s = offset ? m->start_floating : m->start_anchored; + if (s) { + *(u8 *)state = s; + return 1; + } + return 0; +} + +char nfaExecMcSheng64_16_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct mcsheng64 *m = getImplNfa(nfa); + u16 s = offset ? m->start_floating : m->start_anchored; + if (s) { + unaligned_store_u16(state, s); + return 1; + } + return 0; +} + +char nfaExecMcSheng64_8_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { + return mcshengCheckEOD64(nfa, *(const u8 *)state, offset, callback, + context); +} + +char nfaExecMcSheng64_16_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { + assert(ISALIGNED_N(state, 2)); + return mcshengCheckEOD64(nfa, *(const u16 *)state, offset, callback, + context); +} + +char nfaExecMcSheng64_8_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 1); + *(u8 *)q->state = 0; + return 0; +} + +char nfaExecMcSheng64_16_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 2); + assert(ISALIGNED_N(q->state, 2)); + *(u16 *)q->state = 0; + return 0; +} + +char nfaExecMcSheng64_8_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecMcSheng64_8_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecMcSheng64_16_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, + UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 2); + assert(nfa->streamStateSize == 2); + assert(ISALIGNED_N(src, 2)); + unaligned_store_u16(dest, *(const u16 *)(src)); + return 0; +} + +char nfaExecMcSheng64_16_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 2); + assert(nfa->streamStateSize == 2); + assert(ISALIGNED_N(dest, 2)); + *(u16 *)dest = unaligned_load_u16(src); + return 0; +} +#endif diff --git a/src/nfa/mcsheng.h b/src/nfa/mcsheng.h index 19fd69614..0329e1212 100644 --- a/src/nfa/mcsheng.h +++ b/src/nfa/mcsheng.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and 
use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -80,5 +80,78 @@ char nfaExecMcSheng16_expandState(const struct NFA *nfa, void *dest, #define nfaExecMcSheng16_B_Reverse NFA_API_NO_IMPL #define nfaExecMcSheng16_zombie_status NFA_API_ZOMBIE_NO_IMPL +#if defined(HAVE_AVX512VBMI) +/* 64-8 bit Sheng-McClellan hybrid */ +char nfaExecMcSheng64_8_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecMcSheng64_8_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng64_8_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng64_8_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecMcSheng64_8_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_8_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecMcSheng64_8_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_8_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_8_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecMcSheng64_8_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecMcSheng64_8_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecMcSheng64_8_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_zombie_status NFA_API_ZOMBIE_NO_IMPL + +/* 64-16 bit Sheng-McClellan hybrid */ +char nfaExecMcSheng64_16_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecMcSheng64_16_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng64_16_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng64_16_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecMcSheng64_16_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_16_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecMcSheng64_16_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_16_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_16_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecMcSheng64_16_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecMcSheng64_16_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); +#define nfaExecMcSheng64_16_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_zombie_status NFA_API_ZOMBIE_NO_IMPL +#else // !HAVE_AVX512VBMI +#define nfaExecMcSheng64_8_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_zombie_status NFA_API_ZOMBIE_NO_IMPL +#define nfaExecMcSheng64_8_Q NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_Q2 NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_QR NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_inAccept NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_inAnyAccept NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_queueInitState NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_queueCompressState NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_expandState NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_initCompressedState NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_testEOD NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_reportCurrent NFA_API_NO_IMPL + +#define nfaExecMcSheng64_16_B_Reverse NFA_API_NO_IMPL +#define 
nfaExecMcSheng64_16_zombie_status NFA_API_ZOMBIE_NO_IMPL +#define nfaExecMcSheng64_16_Q NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_Q2 NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_QR NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_inAccept NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_inAnyAccept NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_queueInitState NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_queueCompressState NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_expandState NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_initCompressedState NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_testEOD NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_reportCurrent NFA_API_NO_IMPL + +#endif //end of HAVE_AVX512VBMI #endif diff --git a/src/nfa/mcsheng_compile.cpp b/src/nfa/mcsheng_compile.cpp index 871ca4fb1..622362bea 100644 --- a/src/nfa/mcsheng_compile.cpp +++ b/src/nfa/mcsheng_compile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -48,7 +48,6 @@ #include "util/flat_containers.h" #include "util/graph.h" #include "util/graph_range.h" -#include "util/make_unique.h" #include "util/order_check.h" #include "util/report_manager.h" #include "util/unaligned.h" @@ -64,7 +63,6 @@ #include #include #include - #include using namespace std; @@ -244,6 +242,106 @@ void populateBasicInfo(size_t state_size, const dfa_info &info, } } +static +mstate_aux *getAux64(NFA *n, dstate_id_t i) { + mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(n); + mstate_aux *aux_base = (mstate_aux *)((char *)n + m->aux_offset); + + mstate_aux *aux = aux_base + i; + assert((const char *)aux < (const char *)n + m->length); + return aux; +} + +static +void createShuffleMasks64(mcsheng64 *m, const dfa_info &info, + dstate_id_t sheng_end, + const map &accel_escape_info) { + DEBUG_PRINTF("using first %hu states for a sheng\n", sheng_end); + assert(sheng_end > DEAD_STATE + 1); + assert(sheng_end <= sizeof(m512) + 1); + vector> masks; + masks.resize(info.alpha_size); + /* -1 to avoid wasting a slot as we do not include dead state */ + vector raw_ids; + raw_ids.resize(sheng_end - 1); + for (dstate_id_t s = DEAD_STATE + 1; s < info.states.size(); s++) { + assert(info.implId(s)); /* should not map to DEAD_STATE */ + if (info.is_sheng(s)) { + raw_ids[info.extra[s].sheng_id] = s; + } + } + for (u32 i = 0; i < info.alpha_size; i++) { + if (i == info.alpha_remap[TOP]) { + continue; + } + auto &mask = masks[i]; + assert(sizeof(mask) == sizeof(m512)); + mask.fill(0); + + for (dstate_id_t sheng_id = 0; sheng_id < sheng_end - 1; sheng_id++) { + dstate_id_t raw_id = raw_ids[sheng_id]; + dstate_id_t next_id = info.implId(info.states[raw_id].next[i]); + if (next_id == DEAD_STATE) { + next_id = sheng_end - 1; + } else if (next_id < sheng_end) { + next_id--; + } + DEBUG_PRINTF("%hu: %u->next %hu\n", sheng_id, i, next_id); + mask[sheng_id] = verify_u8(next_id); + } + } + for (u32 i = 0; i < N_CHARS; i++) { + assert(info.alpha_remap[i] != info.alpha_remap[TOP]); + memcpy((u8 *)&m->sheng_succ_masks[i], + (u8 *)masks[info.alpha_remap[i]].data(), sizeof(m512)); + } + m->sheng_end = sheng_end; + m->sheng_accel_limit = sheng_end - 1; + + for (dstate_id_t s : raw_ids) { + if (contains(accel_escape_info, s)) { + LIMIT_TO_AT_MOST(&m->sheng_accel_limit, info.extra[s].sheng_id); + } + } +} + +static +void populateBasicInfo64(size_t state_size, const dfa_info &info, 
+ u32 total_size, u32 aux_offset, u32 accel_offset, + u32 accel_count, ReportID arb, bool single, NFA *nfa) { + assert(state_size == sizeof(u16) || state_size == sizeof(u8)); + + nfa->length = total_size; + nfa->nPositions = info.states.size(); + + nfa->scratchStateSize = verify_u32(state_size); + nfa->streamStateSize = verify_u32(state_size); + + if (state_size == sizeof(u8)) { + nfa->type = MCSHENG_64_NFA_8; + } else { + nfa->type = MCSHENG_64_NFA_16; + } + + mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa); + for (u32 i = 0; i < 256; i++) { + m->remap[i] = verify_u8(info.alpha_remap[i]); + } + m->alphaShift = info.getAlphaShift(); + m->length = total_size; + m->aux_offset = aux_offset; + m->accel_offset = accel_offset; + m->arb_report = arb; + m->state_count = verify_u16(info.size()); + m->start_anchored = info.implId(info.raw.start_anchored); + m->start_floating = info.implId(info.raw.start_floating); + m->has_accel = accel_count ? 1 : 0; + + if (single) { + m->flags |= MCSHENG_FLAG_SINGLE; + } +} + static size_t calcShermanRegionSize(const dfa_info &info) { size_t rv = 0; @@ -272,7 +370,7 @@ void fillInAux(mstate_aux *aux, dstate_id_t i, const dfa_info &info, /* returns false on error */ static bool allocateImplId16(dfa_info &info, dstate_id_t sheng_end, - dstate_id_t *sherman_base) { + dstate_id_t *sherman_base) { info.states[0].impl_id = 0; /* dead is always 0 */ vector norm; @@ -291,15 +389,15 @@ bool allocateImplId16(dfa_info &info, dstate_id_t sheng_end, continue; /* sheng impl ids have already been allocated */ } if (info.is_sherman(i)) { if (info.is_sheng_succ(i)) { - sherm_sheng_succ.push_back(i); + sherm_sheng_succ.emplace_back(i); } else { - sherm.push_back(i); + sherm.emplace_back(i); } } else { if (info.is_sheng_succ(i)) { - norm_sheng_succ.push_back(i); + norm_sheng_succ.emplace_back(i); } else { - norm.push_back(i); + norm.emplace_back(i); } } } @@ -382,6 +480,7 @@ CharReach get_edge_reach(dstate_id_t u, dstate_id_t v, const dfa_info &info) { } #define MAX_SHENG_STATES 16 +#define MAX_SHENG64_STATES 64 #define MAX_SHENG_LEAKINESS 0.05 using LeakinessCache = ue2_unordered_map, double>; @@ -435,7 +534,8 @@ double leakiness(const RdfaGraph &g, dfa_info &info, static dstate_id_t find_sheng_states(dfa_info &info, - map &accel_escape_info) { + map &accel_escape_info, + size_t max_sheng_states) { RdfaGraph g(info.raw); auto cyclics = find_vertices_in_cycles(g); @@ -470,7 +570,7 @@ dstate_id_t find_sheng_states(dfa_info &info, flat_set considered = { DEAD_STATE }; bool seen_back_edge = false; while (!to_consider.empty() - && sheng_states.size() < MAX_SHENG_STATES) { + && sheng_states.size() < max_sheng_states) { auto v = to_consider.front(); to_consider.pop_front(); if (!considered.insert(g[v].index).second) { @@ -488,7 +588,7 @@ dstate_id_t find_sheng_states(dfa_info &info, sheng_states.insert(v); for (const auto &t : adjacent_vertices_range(v, g)) { if (!contains(considered, g[t].index)) { - to_consider.push_back(t); + to_consider.emplace_back(t); } if (t == base_cyclic) { seen_back_edge = true; @@ -616,6 +716,80 @@ void fill_in_succ_table_16(NFA *nfa, const dfa_info &info, } } +static +void fill_in_aux_info64(NFA *nfa, const dfa_info &info, + const map &accel_escape_info, + u32 accel_offset, UNUSED u32 accel_end_offset, + const vector &reports, + const vector &reports_eod, + u32 report_base_offset, + const raw_report_info &ri) { + mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa); + + vector reportOffsets; + + ri.fillReportLists(nfa, report_base_offset, reportOffsets); + + 
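+ /* one mstate_aux entry is filled per DFA state; states with an accel escape also get an AccelAux blob written at their accel_offset */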
for (u32 i = 0; i < info.size(); i++) { + u16 impl_id = info.implId(i); + mstate_aux *this_aux = getAux64(nfa, impl_id); + + fillInAux(this_aux, i, info, reports, reports_eod, reportOffsets); + if (contains(accel_escape_info, i)) { + this_aux->accel_offset = accel_offset; + accel_offset += info.strat.accelSize(); + assert(accel_offset <= accel_end_offset); + assert(ISALIGNED_N(accel_offset, alignof(union AccelAux))); + info.strat.buildAccel(i, accel_escape_info.at(i), + (void *)((char *)m + this_aux->accel_offset)); + } + } +} + +static +u16 get_edge_flags64(NFA *nfa, dstate_id_t target_impl_id) { + mstate_aux *aux = getAux64(nfa, target_impl_id); + u16 flags = 0; + + if (aux->accept) { + flags |= ACCEPT_FLAG; + } + + if (aux->accel_offset) { + flags |= ACCEL_FLAG; + } + + return flags; +} + +static +void fill_in_succ_table_64_16(NFA *nfa, const dfa_info &info, + dstate_id_t sheng_end, + UNUSED dstate_id_t sherman_base) { + u16 *succ_table = (u16 *)((char *)nfa + sizeof(NFA) + sizeof(mcsheng64)); + + u8 alphaShift = info.getAlphaShift(); + assert(alphaShift <= 8); + + for (size_t i = 0; i < info.size(); i++) { + if (!info.is_normal(i)) { + assert(info.implId(i) < sheng_end || info.is_sherman(i)); + continue; + } + + assert(info.implId(i) < sherman_base); + u16 normal_id = verify_u16(info.implId(i) - sheng_end); + + for (size_t s = 0; s < info.impl_alpha_size; s++) { + dstate_id_t raw_succ = info.states[i].next[s]; + u16 &entry = succ_table[((size_t)normal_id << alphaShift) + s]; + + entry = info.implId(raw_succ); + entry |= get_edge_flags64(nfa, entry); + } + } +} + #define MAX_SHERMAN_LIST_LEN 8 static @@ -842,17 +1016,20 @@ bytecode_ptr mcshengCompile16(dfa_info &info, dstate_id_t sheng_end, assert(info.getAlphaShift() <= 8); - u16 total_daddy = 0; - for (u32 i = 0; i < info.size(); i++) { - find_better_daddy(info, i, - is_cyclic_near(info.raw, info.raw.start_anchored), - grey); - total_daddy += info.extra[i].daddytaken; - } + // Sherman optimization + if (info.impl_alpha_size > 16) { + u16 total_daddy = 0; + for (u32 i = 0; i < info.size(); i++) { + find_better_daddy(info, i, + is_cyclic_near(info.raw, info.raw.start_anchored), + grey); + total_daddy += info.extra[i].daddytaken; + } - DEBUG_PRINTF("daddy %hu/%zu states=%zu alpha=%hu\n", total_daddy, - info.size() * info.impl_alpha_size, info.size(), - info.impl_alpha_size); + DEBUG_PRINTF("daddy %hu/%zu states=%zu alpha=%hu\n", total_daddy, + info.size() * info.impl_alpha_size, info.size(), + info.impl_alpha_size); + } u16 sherman_limit; if (!allocateImplId16(info, sheng_end, &sherman_limit)) { @@ -931,6 +1108,160 @@ void fill_in_succ_table_8(NFA *nfa, const dfa_info &info, } } +static +void fill_in_sherman64(NFA *nfa, dfa_info &info, UNUSED u16 sherman_limit) { + char *nfa_base = (char *)nfa; + mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa); + char *sherman_table = nfa_base + m->sherman_offset; + + assert(ISALIGNED_16(sherman_table)); + for (size_t i = 0; i < info.size(); i++) { + if (!info.is_sherman(i)) { + continue; + } + u16 fs = verify_u16(info.implId(i)); + DEBUG_PRINTF("building sherman %zu impl %hu\n", i, fs); + + assert(fs >= sherman_limit); + + char *curr_sherman_entry + = sherman_table + (fs - m->sherman_limit) * SHERMAN_FIXED_SIZE; + assert(curr_sherman_entry <= nfa_base + m->length); + + u8 len = verify_u8(info.impl_alpha_size - info.extra[i].daddytaken); + assert(len <= 9); + dstate_id_t d = info.states[i].daddy; + + *(u8 *)(curr_sherman_entry + SHERMAN_TYPE_OFFSET) = SHERMAN_STATE; + *(u8 *)(curr_sherman_entry + 
SHERMAN_LEN_OFFSET) = len; + *(u16 *)(curr_sherman_entry + SHERMAN_DADDY_OFFSET) = info.implId(d); + u8 *chars = (u8 *)(curr_sherman_entry + SHERMAN_CHARS_OFFSET); + + for (u16 s = 0; s < info.impl_alpha_size; s++) { + if (info.states[i].next[s] != info.states[d].next[s]) { + *(chars++) = (u8)s; + } + } + + u16 *states = (u16 *)(curr_sherman_entry + SHERMAN_STATES_OFFSET(len)); + for (u16 s = 0; s < info.impl_alpha_size; s++) { + if (info.states[i].next[s] != info.states[d].next[s]) { + DEBUG_PRINTF("s overrider %hu dad %hu char next %hu\n", fs, + info.implId(d), + info.implId(info.states[i].next[s])); + u16 entry_val = info.implId(info.states[i].next[s]); + entry_val |= get_edge_flags64(nfa, entry_val); + unaligned_store_u16((u8 *)states++, entry_val); + } + } + } +} + +static +bytecode_ptr mcsheng64Compile16(dfa_info&info, dstate_id_t sheng_end, + const map&accel_escape_info, + const Grey &grey) { + DEBUG_PRINTF("building mcsheng 64-16\n"); + + vector reports; /* index in ri for the appropriate report list */ + vector reports_eod; /* as above */ + ReportID arb; + u8 single; + + assert(info.getAlphaShift() <= 8); + + // Sherman optimization + if (info.impl_alpha_size > 16) { + u16 total_daddy = 0; + for (u32 i = 0; i < info.size(); i++) { + find_better_daddy(info, i, + is_cyclic_near(info.raw, info.raw.start_anchored), + grey); + total_daddy += info.extra[i].daddytaken; + } + + DEBUG_PRINTF("daddy %hu/%zu states=%zu alpha=%hu\n", total_daddy, + info.size() * info.impl_alpha_size, info.size(), + info.impl_alpha_size); + } + + u16 sherman_limit; + if (!allocateImplId16(info, sheng_end, &sherman_limit)) { + DEBUG_PRINTF("failed to allocate state numbers, %zu states total\n", + info.size()); + return nullptr; + } + u16 count_real_states = sherman_limit - sheng_end; + + auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb); + + size_t tran_size = (1 << info.getAlphaShift()) * sizeof(u16) + * count_real_states; + + size_t aux_size = sizeof(mstate_aux) * info.size(); + + size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcsheng64) + tran_size); + size_t accel_size = info.strat.accelSize() * accel_escape_info.size(); + size_t accel_offset = ROUNDUP_N(aux_offset + aux_size + + ri->getReportListSize(), 32); + size_t sherman_offset = ROUNDUP_16(accel_offset + accel_size); + size_t sherman_size = calcShermanRegionSize(info); + + size_t total_size = sherman_offset + sherman_size; + + accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */ + assert(ISALIGNED_N(accel_offset, alignof(union AccelAux))); + + auto nfa = make_zeroed_bytecode_ptr(total_size); + mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa.get()); + + populateBasicInfo64(sizeof(u16), info, total_size, aux_offset, accel_offset, + accel_escape_info.size(), arb, single, nfa.get()); + createShuffleMasks64(m, info, sheng_end, accel_escape_info); + + /* copy in the mc header information */ + m->sherman_offset = sherman_offset; + m->sherman_end = total_size; + m->sherman_limit = sherman_limit; + + DEBUG_PRINTF("%hu sheng, %hu norm, %zu total\n", sheng_end, + count_real_states, info.size()); + + fill_in_aux_info64(nfa.get(), info, accel_escape_info, accel_offset, + sherman_offset - sizeof(NFA), reports, reports_eod, + aux_offset + aux_size, *ri); + + fill_in_succ_table_64_16(nfa.get(), info, sheng_end, sherman_limit); + + fill_in_sherman64(nfa.get(), info, sherman_limit); + + return nfa; +} + +static +void fill_in_succ_table_64_8(NFA *nfa, const dfa_info &info, + dstate_id_t sheng_end) { + u8 *succ_table = (u8 
*)nfa + sizeof(NFA) + sizeof(mcsheng64); + + u8 alphaShift = info.getAlphaShift(); + assert(alphaShift <= 8); + + for (size_t i = 0; i < info.size(); i++) { + assert(!info.is_sherman(i)); + if (!info.is_normal(i)) { + assert(info.implId(i) < sheng_end); + continue; + } + u8 normal_id = verify_u8(info.implId(i) - sheng_end); + + for (size_t s = 0; s < info.impl_alpha_size; s++) { + dstate_id_t raw_succ = info.states[i].next[s]; + succ_table[((size_t)normal_id << alphaShift) + s] + = info.implId(raw_succ); + } + } +} + static void allocateImplId8(dfa_info &info, dstate_id_t sheng_end, const map &accel_escape_info, @@ -947,11 +1278,11 @@ void allocateImplId8(dfa_info &info, dstate_id_t sheng_end, if (info.is_sheng(i)) { continue; /* already allocated */ } else if (!info.states[i].reports.empty()) { - accept.push_back(i); + accept.emplace_back(i); } else if (contains(accel_escape_info, i)) { - accel.push_back(i); + accel.emplace_back(i); } else { - norm.push_back(i); + norm.emplace_back(i); } } @@ -1028,6 +1359,58 @@ bytecode_ptr mcshengCompile8(dfa_info &info, dstate_id_t sheng_end, return nfa; } +static +bytecode_ptr mcsheng64Compile8(dfa_info &info, dstate_id_t sheng_end, + const map &accel_escape_info) { + DEBUG_PRINTF("building mcsheng 64-8\n"); + + vector reports; + vector reports_eod; + ReportID arb; + u8 single; + + auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb); + + size_t normal_count = info.size() - sheng_end; + + size_t tran_size = sizeof(u8) * (1 << info.getAlphaShift()) * normal_count; + size_t aux_size = sizeof(mstate_aux) * info.size(); + size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcsheng64) + tran_size); + size_t accel_size = info.strat.accelSize() * accel_escape_info.size(); + size_t accel_offset = ROUNDUP_N(aux_offset + aux_size + + ri->getReportListSize(), 32); + size_t total_size = accel_offset + accel_size; + + DEBUG_PRINTF("aux_size %zu\n", aux_size); + DEBUG_PRINTF("aux_offset %zu\n", aux_offset); + DEBUG_PRINTF("rl size %u\n", ri->getReportListSize()); + DEBUG_PRINTF("accel_size %zu\n", accel_size); + DEBUG_PRINTF("accel_offset %zu\n", accel_offset); + DEBUG_PRINTF("total_size %zu\n", total_size); + + accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */ + assert(ISALIGNED_N(accel_offset, alignof(union AccelAux))); + + auto nfa = make_zeroed_bytecode_ptr(total_size); + mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa.get()); + + allocateImplId8(info, sheng_end, accel_escape_info, &m->accel_limit_8, + &m->accept_limit_8); + + populateBasicInfo64(sizeof(u8), info, total_size, aux_offset, accel_offset, + accel_escape_info.size(), arb, single, nfa.get()); + createShuffleMasks64(m, info, sheng_end, accel_escape_info); + + fill_in_aux_info64(nfa.get(), info, accel_escape_info, accel_offset, + total_size - sizeof(NFA), reports, reports_eod, + aux_offset + aux_size, *ri); + + fill_in_succ_table_64_8(nfa.get(), info, sheng_end); + DEBUG_PRINTF("rl size %zu\n", ri->size()); + + return nfa; +} + bytecode_ptr mcshengCompile(raw_dfa &raw, const CompileContext &cc, const ReportManager &rm) { if (!cc.grey.allowMcSheng) { @@ -1047,19 +1430,83 @@ bytecode_ptr mcshengCompile(raw_dfa &raw, const CompileContext &cc, map accel_escape_info = info.strat.getAccelInfo(cc.grey); + auto old_states = info.states; + dstate_id_t sheng_end = find_sheng_states(info, accel_escape_info, MAX_SHENG_STATES); - dstate_id_t sheng_end = find_sheng_states(info, accel_escape_info); if (sheng_end <= DEAD_STATE + 1) { + info.states = old_states; return 
nullptr; } bytecode_ptr nfa; + if (!using8bit) { nfa = mcshengCompile16(info, sheng_end, accel_escape_info, cc.grey); } else { nfa = mcshengCompile8(info, sheng_end, accel_escape_info); } + if (!nfa) { + info.states = old_states; + return nfa; + } + + if (has_eod_reports) { + nfa->flags |= NFA_ACCEPTS_EOD; + } + + DEBUG_PRINTF("compile done\n"); + return nfa; +} + +bytecode_ptr mcshengCompile64(raw_dfa &raw, const CompileContext &cc, + const ReportManager &rm) { + if (!cc.grey.allowMcSheng) { + return nullptr; + } + + if (!cc.target_info.has_avx512vbmi()) { + DEBUG_PRINTF("McSheng64 failed, no HS_CPU_FEATURES_AVX512VBMI!\n"); + return nullptr; + } + + mcclellan_build_strat mbs(raw, rm, false); + dfa_info info(mbs); + bool using8bit = cc.grey.allowMcClellan8 && info.size() <= 256; + + if (!cc.streaming) { /* TODO: work out if we can do the strip in streaming + * mode with our semantics */ + raw.stripExtraEodReports(); + } + + bool has_eod_reports = raw.hasEodReports(); + + map accel_escape_info + = info.strat.getAccelInfo(cc.grey); + bool using64state = false; /*default flag*/ + dstate_id_t sheng_end64; + sheng_end64 = find_sheng_states(info, accel_escape_info, MAX_SHENG64_STATES); + + if (sheng_end64 <= DEAD_STATE + 1) { + return nullptr; + } else { + using64state = true; + } + + bytecode_ptr nfa; + + if (using64state) { + assert((sheng_end64 > 17) && (sheng_end64 <= 65)); + if (!using8bit) { + nfa = mcsheng64Compile16(info, sheng_end64, accel_escape_info, cc.grey); + } else { + assert(using8bit); + nfa = mcsheng64Compile8(info, sheng_end64, accel_escape_info); + assert(nfa); + assert(nfa->type == MCSHENG_64_NFA_8); + } + } + if (!nfa) { return nfa; } diff --git a/src/nfa/mcsheng_compile.h b/src/nfa/mcsheng_compile.h index 487ab45f4..3a79b46a2 100644 --- a/src/nfa/mcsheng_compile.h +++ b/src/nfa/mcsheng_compile.h @@ -42,7 +42,8 @@ struct raw_dfa; bytecode_ptr mcshengCompile(raw_dfa &raw, const CompileContext &cc, const ReportManager &rm); - +bytecode_ptr mcshengCompile64(raw_dfa &raw, const CompileContext &cc, + const ReportManager &rm); bool has_accel_mcsheng(const NFA *nfa); } // namespace ue2 diff --git a/src/nfa/mcsheng_data.c b/src/nfa/mcsheng_data.c index eaf3cbbb3..0701b4b31 100644 --- a/src/nfa/mcsheng_data.c +++ b/src/nfa/mcsheng_data.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -41,3 +41,15 @@ const u64a mcsheng_pext_mask[8] = { 0x00ff00000000000f, 0xff0000000000000f, }; +#if defined(HAVE_AVX512VBMI) +const u64a mcsheng64_pext_mask[8] = { + 0, /* dummy */ + 0x000000000000ff3f, + 0x0000000000ff003f, + 0x00000000ff00003f, + 0x000000ff0000003f, + 0x0000ff000000003f, + 0x00ff00000000003f, + 0xff0000000000003f, +}; +#endif diff --git a/src/nfa/mcsheng_dump.cpp b/src/nfa/mcsheng_dump.cpp index 2b5630799..7cef82f4d 100644 --- a/src/nfa/mcsheng_dump.cpp +++ b/src/nfa/mcsheng_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -174,6 +174,124 @@ void describeEdge(FILE *f, const mcsheng *m, const u16 *t, u16 i) { } } +static +const mstate_aux *getAux64(const NFA *n, dstate_id_t i) { + auto *m = (const mcsheng64 *)getImplNfa(n); + auto *aux_base = (const 
mstate_aux *)((const char *)n + m->aux_offset); + + const mstate_aux *aux = aux_base + i; + + assert((const char *)aux < (const char *)n + m->length); + return aux; +} + +static +void next_states64(const NFA *n, u16 s, u16 *t) { + const mcsheng64 *m = (const mcsheng64 *)getImplNfa(n); + const mstate_aux *aux = getAux64(n, s); + const u32 as = m->alphaShift; + assert(s != DEAD_STATE); + + if (s < m->sheng_end) { + for (u16 c = 0; c < N_CHARS; c++) { + u8 sheng_s = s - 1; + auto trans_for_c = (const char *)&m->sheng_succ_masks[c]; + assert(sheng_s < sizeof(m512)); + u8 raw_succ = trans_for_c[sheng_s]; + if (raw_succ == m->sheng_end - 1) { + t[c] = DEAD_STATE; + } else if (raw_succ < m->sheng_end) { + t[c] = raw_succ + 1; + } else { + t[c] = raw_succ; + } + } + } else if (n->type == MCSHENG_64_NFA_8) { + const u8 *succ_table = (const u8 *)((const char *)m + sizeof(mcsheng64)); + for (u16 c = 0; c < N_CHARS; c++) { + u32 normal_id = s - m->sheng_end; + t[c] = succ_table[(normal_id << as) + m->remap[c]]; + } + } else { + u16 base_s = s; + const char *winfo_base = (const char *)n + m->sherman_offset; + const char *state_base + = winfo_base + SHERMAN_FIXED_SIZE * (s - m->sherman_limit); + + if (s >= m->sherman_limit) { + base_s = unaligned_load_u16(state_base + SHERMAN_DADDY_OFFSET); + assert(base_s >= m->sheng_end); + } + + const u16 *succ_table = (const u16 *)((const char *)m + + sizeof(mcsheng64)); + for (u16 c = 0; c < N_CHARS; c++) { + u32 normal_id = base_s - m->sheng_end; + t[c] = succ_table[(normal_id << as) + m->remap[c]]; + } + + if (s >= m->sherman_limit) { + UNUSED char type = *(state_base + SHERMAN_TYPE_OFFSET); + assert(type == SHERMAN_STATE); + u8 len = *(const u8 *)(SHERMAN_LEN_OFFSET + state_base); + const char *chars = state_base + SHERMAN_CHARS_OFFSET; + const u16 *states = (const u16 *)(state_base + + SHERMAN_STATES_OFFSET(len)); + + for (u8 i = 0; i < len; i++) { + for (u16 c = 0; c < N_CHARS; c++) { + if (m->remap[c] == chars[i]) { + t[c] = unaligned_load_u16((const u8*)&states[i]); + } + } + } + } + + for (u16 c = 0; c < N_CHARS; c++) { + t[c] &= STATE_MASK; + } + + } + + t[TOP] = aux->top & STATE_MASK; +} + +static +void describeEdge64(FILE *f, const mcsheng64 *m, const u16 *t, u16 i) { + for (u16 s = 0; s < N_CHARS; s++) { + if (!t[s]) { + continue; + } + + u16 ss; + for (ss = 0; ss < s; ss++) { + if (t[s] == t[ss]) { + break; + } + } + + if (ss != s) { + continue; + } + + CharReach reach; + for (ss = s; ss < 256; ss++) { + if (t[s] == t[ss]) { + reach.set(ss); + } + } + + fprintf(f, "%u -> %u [ ", i, t[s]); + if (i < m->sheng_end && t[s] < m->sheng_end) { + fprintf(f, "color = red, fontcolor = red "); + } + fprintf(f, "label = \""); + describeClass(f, reach, 5, CC_OUT_DOT); + + fprintf(f, "\" ];\n"); + } +} + static void dumpAccelDot(FILE *f, u16 i, const union AccelAux *accel) { switch(accel->accel_type) { @@ -256,6 +374,66 @@ void describeNode(const NFA *n, const mcsheng *m, u16 i, FILE *f) { } +static +void describeNode64(const NFA *n, const mcsheng64 *m, u16 i, FILE *f) { + const mstate_aux *aux = getAux64(n, i); + + bool isSherman = m->sherman_limit && i >= m->sherman_limit; + + fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, " + "label = \"%u%s\" ]; \n", i, i, isSherman ? 
"w":""); + + if (aux->accel_offset) { + dumpAccelDot(f, i, (const union AccelAux *) + ((const char *)m + aux->accel_offset)); + } + + if (i && i < m->sheng_end) { + fprintf(f, "%u [color = red, fontcolor = red]; \n", i); + } + + if (aux->accept_eod) { + fprintf(f, "%u [ color = darkorchid ];\n", i); + } + + if (aux->accept) { + fprintf(f, "%u [ shape = doublecircle ];\n", i); + } + + if (aux->top && aux->top != i) { + fprintf(f, "%u -> %u [color = darkgoldenrod weight=0.1 ]\n", i, + aux->top); + } + + if (i == m->start_anchored) { + fprintf(f, "STARTA -> %u [color = blue ]\n", i); + } + + if (i == m->start_floating) { + fprintf(f, "STARTF -> %u [color = red ]\n", i); + } + + if (isSherman) { + const char *winfo_base = (const char *)n + m->sherman_offset; + const char *state_base + = winfo_base + SHERMAN_FIXED_SIZE * (i - m->sherman_limit); + assert(state_base < (const char *)m + m->length - sizeof(NFA)); + UNUSED u8 type = *(const u8 *)(state_base + SHERMAN_TYPE_OFFSET); + assert(type == SHERMAN_STATE); + fprintf(f, "%u [ fillcolor = lightblue style=filled ];\n", i); + u16 daddy = *(const u16 *)(state_base + SHERMAN_DADDY_OFFSET); + if (daddy) { + fprintf(f, "%u -> %u [ color=royalblue style=dashed weight=0.1]\n", + i, daddy); + } + } + + if (i && i < m->sheng_end) { + fprintf(f, "subgraph cluster_sheng { %u } \n", i); + } + +} + static void dumpDotPreambleDfa(FILE *f) { dumpDotPreamble(f); @@ -392,6 +570,131 @@ void dump_text_8(const NFA *nfa, FILE *f) { dumpTextReverse(nfa, f); } +static +void dump64_dot_16(const NFA *nfa, FILE *f) { + auto *m = (const mcsheng64 *)getImplNfa(nfa); + + dumpDotPreambleDfa(f); + + for (u16 i = 1; i < m->state_count; i++) { + describeNode64(nfa, m, i, f); + + u16 t[ALPHABET_SIZE]; + + next_states64(nfa, i, t); + + describeEdge64(f, m, t, i); + } + + fprintf(f, "}\n"); +} + +static +void dump64_dot_8(const NFA *nfa, FILE *f) { + auto m = (const mcsheng64 *)getImplNfa(nfa); + + dumpDotPreambleDfa(f); + + for (u16 i = 1; i < m->state_count; i++) { + describeNode64(nfa, m, i, f); + + u16 t[ALPHABET_SIZE]; + + next_states64(nfa, i, t); + + describeEdge64(f, m, t, i); + } + + fprintf(f, "}\n"); +} + +static +void dumpAccelMasks64(FILE *f, const mcsheng64 *m, const mstate_aux *aux) { + fprintf(f, "\n"); + fprintf(f, "Acceleration\n"); + fprintf(f, "------------\n"); + + for (u16 i = 0; i < m->state_count; i++) { + if (!aux[i].accel_offset) { + continue; + } + + auto accel = (const AccelAux *)((const char *)m + aux[i].accel_offset); + fprintf(f, "%05hu ", i); + dumpAccelInfo(f, *accel); + } +} + +static +void describeAlphabet64(FILE *f, const mcsheng64 *m) { + map rev; + + for (u16 i = 0; i < N_CHARS; i++) { + rev[m->remap[i]].clear(); + } + + for (u16 i = 0; i < N_CHARS; i++) { + rev[m->remap[i]].set(i); + } + + map::const_iterator it; + fprintf(f, "\nAlphabet\n"); + for (it = rev.begin(); it != rev.end(); ++it) { + fprintf(f, "%3hhu: ", it->first); + describeClass(f, it->second, 10240, CC_OUT_TEXT); + fprintf(f, "\n"); + } + fprintf(f, "\n"); +} + +static +void dumpCommonHeader64(FILE *f, const mcsheng64 *m) { + fprintf(f, "report: %u, states: %u, length: %u\n", m->arb_report, + m->state_count, m->length); + fprintf(f, "astart: %hu, fstart: %hu\n", m->start_anchored, + m->start_floating); + fprintf(f, "single accept: %d, has_accel: %d\n", + !!(int)m->flags & MCSHENG_FLAG_SINGLE, m->has_accel); + fprintf(f, "sheng_end: %hu\n", m->sheng_end); + fprintf(f, "sheng_accel_limit: %hu\n", m->sheng_accel_limit); +} + +static +void dump64_text_8(const NFA *nfa, FILE *f) { + 
auto m = (const mcsheng64 *)getImplNfa(nfa); + auto aux = (const mstate_aux *)((const char *)nfa + m->aux_offset); + + fprintf(f, "mcsheng 64-8\n"); + dumpCommonHeader64(f, m); + fprintf(f, "accel_limit: %hu, accept_limit %hu\n", m->accel_limit_8, + m->accept_limit_8); + fprintf(f, "\n"); + + describeAlphabet64(f, m); + dumpAccelMasks64(f, m, aux); + + fprintf(f, "\n"); + dumpTextReverse(nfa, f); +} + +static +void dump64_text_16(const NFA *nfa, FILE *f) { + auto *m = (const mcsheng64 *)getImplNfa(nfa); + auto *aux = (const mstate_aux *)((const char *)nfa + m->aux_offset); + + fprintf(f, "mcsheng 64-16\n"); + dumpCommonHeader64(f, m); + fprintf(f, "sherman_limit: %d, sherman_end: %d\n", (int)m->sherman_limit, + (int)m->sherman_end); + fprintf(f, "\n"); + + describeAlphabet64(f, m); + dumpAccelMasks64(f, m, aux); + + fprintf(f, "\n"); + dumpTextReverse(nfa, f); +} + void nfaExecMcSheng16_dump(const NFA *nfa, const string &base) { assert(nfa->type == MCSHENG_NFA_16); dump_text_16(nfa, StdioFile(base + ".txt", "w")); @@ -404,4 +707,16 @@ void nfaExecMcSheng8_dump(const NFA *nfa, const string &base) { dump_dot_8(nfa, StdioFile(base + ".dot", "w")); } +void nfaExecMcSheng64_16_dump(UNUSED const NFA *nfa, UNUSED const string &base) { + assert(nfa->type == MCSHENG_64_NFA_16); + dump64_text_16(nfa, StdioFile(base + ".txt", "w")); + dump64_dot_16(nfa, StdioFile(base + ".dot", "w")); +} + +void nfaExecMcSheng64_8_dump(UNUSED const NFA *nfa, UNUSED const string &base) { + assert(nfa->type == MCSHENG_64_NFA_8); + dump64_text_8(nfa, StdioFile(base + ".txt", "w")); + dump64_dot_8(nfa, StdioFile(base + ".dot", "w")); +} + } // namespace ue2 diff --git a/src/nfa/mcsheng_dump.h b/src/nfa/mcsheng_dump.h index 1b6993674..26e6cfda7 100644 --- a/src/nfa/mcsheng_dump.h +++ b/src/nfa/mcsheng_dump.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -42,7 +42,8 @@ namespace ue2 { void nfaExecMcSheng8_dump(const struct NFA *nfa, const std::string &base); void nfaExecMcSheng16_dump(const struct NFA *nfa, const std::string &base); - +void nfaExecMcSheng64_8_dump(const struct NFA *nfa, const std::string &base); +void nfaExecMcSheng64_16_dump(const struct NFA *nfa, const std::string &base); } // namespace ue2 #endif // DUMP_SUPPORT diff --git a/src/nfa/mcsheng_internal.h b/src/nfa/mcsheng_internal.h index bb45ae23f..d98557462 100644 --- a/src/nfa/mcsheng_internal.h +++ b/src/nfa/mcsheng_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -92,4 +92,33 @@ struct mcsheng { * representing the data from a u64a. 
*/ extern const u64a mcsheng_pext_mask[8]; +struct mcsheng64 { + u16 state_count; /**< total number of states */ + u32 length; /**< length of dfa in bytes */ + u16 start_anchored; /**< anchored start state */ + u16 start_floating; /**< floating start state */ + u32 aux_offset; /**< offset of the aux structures relative to the start of + * the nfa structure */ + u32 sherman_offset; /**< offset of array of sherman state offsets the + * state_info structures relative to the start of the + * nfa structure */ + u32 sherman_end; /**< offset of the end of the state_info structures + * relative to the start of the nfa structure */ + u16 sheng_end; /**< first non-sheng state */ + u16 sheng_accel_limit; /**< first sheng accel state. state given in terms of + * internal sheng ids */ + u16 accel_limit_8; /**< 8 bit, lowest accelerable state */ + u16 accept_limit_8; /**< 8 bit, lowest accept state */ + u16 sherman_limit; /**< lowest sherman state */ + u8 alphaShift; + u8 flags; + u8 has_accel; /**< 1 iff there are any accel plans */ + u8 remap[256]; /**< remaps characters to a smaller alphabet */ + ReportID arb_report; /**< one of the accepts that this dfa may raise */ + u32 accel_offset; /**< offset of accel structures from start of McClellan */ + m512 sheng_succ_masks[N_CHARS]; +}; + +extern const u64a mcsheng64_pext_mask[8]; + #endif diff --git a/src/nfa/mpv.c b/src/nfa/mpv.c index 552754d60..cba3d159e 100644 --- a/src/nfa/mpv.c +++ b/src/nfa/mpv.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,7 +36,7 @@ #include "shufti.h" #include "truffle.h" #include "ue2common.h" -#include "vermicelli.h" +#include "vermicelli.hpp" #include "vermicelli_run.h" #include "util/multibit.h" #include "util/partial_store.h" @@ -260,6 +261,13 @@ size_t limitByReach(const struct mpv_kilopuff *kp, const u8 *buf, } else if (kp->type == MPV_NVERM) { return nvermicelliExec(kp->u.verm.c, 0, buf, buf + length) - buf; } +#ifdef HAVE_SVE2 + else if (kp->type == MPV_VERM16) { + return vermicelli16Exec(kp->u.verm16.mask, buf, buf + length) - buf; + } else if (kp->type == MPV_NVERM16) { + return nvermicelli16Exec(kp->u.verm16.mask, buf, buf + length) - buf; + } +#endif // HAVE_SVE2 assert(kp->type == MPV_DOT); return length; diff --git a/src/nfa/mpv_internal.h b/src/nfa/mpv_internal.h index a52853dce..b6b925043 100644 --- a/src/nfa/mpv_internal.h +++ b/src/nfa/mpv_internal.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,6 +37,8 @@ #define MPV_SHUFTI 2 #define MPV_TRUFFLE 3 #define MPV_NVERM 4 +#define MPV_VERM16 5 +#define MPV_NVERM16 6 struct mpv_puffette { u32 repeats; @@ -65,6 +68,9 @@ struct mpv_kilopuff { struct { char c; } verm; + struct { + m128 mask; + } verm16; struct { m128 mask_lo; m128 mask_hi; diff --git a/src/nfa/mpvcompile.cpp b/src/nfa/mpvcompile.cpp index 8497c6487..d85c90b02 100644 --- a/src/nfa/mpvcompile.cpp +++ b/src/nfa/mpvcompile.cpp @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,6 +34,7 @@ 
#include "nfa_internal.h" #include "shufticompile.h" #include "trufflecompile.h" +#include "vermicellicompile.h" #include "util/alloc.h" #include "util/multibit_build.h" #include "util/order_check.h" @@ -140,12 +142,12 @@ void populateClusters(const vector &puffs_in, u32 e = MQE_TOP_FIRST; for (const auto &puff : triggered_puffs) { - puff_clusters[ClusterKey(e, puff)].push_back(puff); + puff_clusters[ClusterKey(e, puff)].emplace_back(puff); e++; } for (const auto &puff : puffs_in) { - puff_clusters[ClusterKey(puff)].push_back(puff); + puff_clusters[ClusterKey(puff)].emplace_back(puff); } @@ -175,6 +177,14 @@ void writeKiloPuff(const map>::const_iterator &it, size_t set = reach.find_first(); assert(set != CharReach::npos); kp->u.verm.c = (char)set; +#ifdef HAVE_SVE2 + } else if (reach.count() >= 240) { + kp->type = MPV_VERM16; + vermicelli16Build(~reach, (u8 *)&kp->u.verm16.mask); + } else if (reach.count() <= 16) { + kp->type = MPV_NVERM16; + vermicelli16Build(reach, (u8 *)&kp->u.verm16.mask); +#endif // HAVE_SVE2 } else if (shuftiBuildMasks(~reach, (u8 *)&kp->u.shuf.mask_lo, (u8 *)&kp->u.shuf.mask_hi) != -1) { kp->type = MPV_SHUFTI; @@ -264,7 +274,7 @@ void fillCounterInfos(vector *out, u32 *curr_decomp_offset, assert(it->first.trigger_event == MQE_TOP_FIRST + distance(kilopuffs.begin(), it)); - out->push_back(mpv_counter_info()); + out->emplace_back(mpv_counter_info()); map>::const_iterator it_o = it; ++it; fillCounterInfo(&out->back(), curr_decomp_offset, curr_comp_offset, @@ -282,14 +292,14 @@ void fillCounterInfos(vector *out, u32 *curr_decomp_offset, ++it; } if (it != trig_ite) { - out->push_back(mpv_counter_info()); + out->emplace_back(mpv_counter_info()); fillCounterInfo(&out->back(), curr_decomp_offset, curr_comp_offset, kilopuffs, kilopuffs.begin(), it); } while (it != kilopuffs.end() && it->first.auto_restart) { assert(it->first.trigger_event == MQE_INVALID); - out->push_back(mpv_counter_info()); + out->emplace_back(mpv_counter_info()); map>::const_iterator it_o = it; ++it; fillCounterInfo(&out->back(), curr_decomp_offset, curr_comp_offset, diff --git a/src/nfa/nfa_api_dispatch.c b/src/nfa/nfa_api_dispatch.c index f4b7552ef..6785e9390 100644 --- a/src/nfa/nfa_api_dispatch.c +++ b/src/nfa/nfa_api_dispatch.c @@ -1,5 +1,6 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -53,6 +54,14 @@ // general framework calls +#ifdef HAVE_SVE2 +#define VERM16_CASES(dbnt_func) \ + DISPATCH_CASE(LBR_NFA_VERM16, LbrVerm16, dbnt_func); \ + DISPATCH_CASE(LBR_NFA_NVERM16, LbrNVerm16, dbnt_func); +#else +#define VERM16_CASES(dbnt_func) +#endif + #define DISPATCH_BY_NFA_TYPE(dbnt_func) \ switch (nfa->type) { \ DISPATCH_CASE(LIMEX_NFA_32, LimEx32, dbnt_func); \ @@ -76,6 +85,11 @@ DISPATCH_CASE(TAMARAMA_NFA, Tamarama, dbnt_func); \ DISPATCH_CASE(MCSHENG_NFA_8, McSheng8, dbnt_func); \ DISPATCH_CASE(MCSHENG_NFA_16, McSheng16, dbnt_func); \ + DISPATCH_CASE(SHENG_NFA_32, Sheng32, dbnt_func); \ + DISPATCH_CASE(SHENG_NFA_64, Sheng64, dbnt_func); \ + DISPATCH_CASE(MCSHENG_64_NFA_8, McSheng64_8, dbnt_func); \ + DISPATCH_CASE(MCSHENG_64_NFA_16, McSheng64_16, dbnt_func); \ + VERM16_CASES(dbnt_func) \ default: \ assert(0); \ } diff --git a/src/nfa/nfa_build_util.cpp b/src/nfa/nfa_build_util.cpp index 9185ccdd7..ed0e2f013 100644 --- a/src/nfa/nfa_build_util.cpp +++ 
b/src/nfa/nfa_build_util.cpp @@ -1,5 +1,6 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -181,7 +182,6 @@ enum NFACategory {NFA_LIMEX, NFA_OTHER}; static const nfa_dispatch_fn has_repeats_other_than_firsts; \ static const u32 stateAlign = \ MAX(mlt_align, alignof(RepeatControl)); \ - static const bool fast = mlt_size <= 64; \ }; \ const nfa_dispatch_fn NFATraits::has_accel \ = has_accel_limex; \ @@ -210,7 +210,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 1; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -226,7 +225,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 2; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -242,7 +240,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -258,7 +255,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -274,7 +270,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -290,7 +285,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -306,7 +300,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -322,7 +315,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -338,7 +330,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; - static const bool fast = true; static const 
nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -350,7 +341,25 @@ const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = const char *NFATraits::name = "Lim Bounded Repeat (NV)"; #endif -template<> struct NFATraits { +#ifdef HAVE_SVE2 + +template<> struct NFATraits { + UNUSED static const char *name; + static const NFACategory category = NFA_OTHER; + static const u32 stateAlign = 8; + static const bool fast = true; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; +}; +const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +#if defined(DUMP_SUPPORT) +const char *NFATraits::name = "Lim Bounded Repeat (V16)"; +#endif + +template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; @@ -359,6 +368,23 @@ template<> struct NFATraits { static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; }; +const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +#if defined(DUMP_SUPPORT) +const char *NFATraits::name = "Lim Bounded Repeat (NV16)"; +#endif + +#endif // HAVE_SVE2 + +template<> struct NFATraits { + UNUSED static const char *name; + static const NFACategory category = NFA_OTHER; + static const u32 stateAlign = 8; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; +}; const nfa_dispatch_fn NFATraits::has_accel = dispatch_false; const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; @@ -370,7 +396,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 8; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -386,7 +411,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 1; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -402,7 +426,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 64; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -418,7 +441,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; static const u32 stateAlign = 1; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -434,7 +456,6 @@ template<> struct NFATraits { UNUSED static const char *name; static const 
NFACategory category = NFA_OTHER; static const u32 stateAlign = 2; - static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; static const nfa_dispatch_fn has_repeats_other_than_firsts; @@ -446,6 +467,65 @@ const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = const char *NFATraits::name = "Shengy McShengFace 16"; #endif +template<> struct NFATraits { + UNUSED static const char *name; + static const NFACategory category = NFA_OTHER; + static const u32 stateAlign = 1; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; +}; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_sheng; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +#if defined(DUMP_SUPPORT) +const char *NFATraits::name = "Sheng 32"; +#endif + +template<> struct NFATraits { + UNUSED static const char *name; + static const NFACategory category = NFA_OTHER; + static const u32 stateAlign = 1; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; +}; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_sheng; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +#if defined(DUMP_SUPPORT) +const char *NFATraits::name = "Sheng 64"; +#endif + +template<> struct NFATraits { + UNUSED static const char *name; + static const NFACategory category = NFA_OTHER; + static const u32 stateAlign = 1; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; +}; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_mcsheng; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +#if defined(DUMP_SUPPORT) +const char *NFATraits::name = "Shengy64 McShengFace 8"; +#endif + +template<> struct NFATraits { + UNUSED static const char *name; + static const NFACategory category = NFA_OTHER; + static const u32 stateAlign = 2; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; +}; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_mcsheng; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +#if defined(DUMP_SUPPORT) +const char *NFATraits::name = "Shengy64 McShengFace 16"; +#endif } // namespace #if defined(DUMP_SUPPORT) @@ -473,20 +553,6 @@ u32 state_alignment(const NFA &nfa) { return DISPATCH_BY_NFA_TYPE((NFAEngineType)nfa.type, getStateAlign, nullptr); } -namespace { -template -struct getFastness { - static u32 call(void *) { - return NFATraits::fast; - } -}; -} - -bool is_fast(const NFA &nfa) { - NFAEngineType t = (NFAEngineType)nfa.type; - return DISPATCH_BY_NFA_TYPE(t, getFastness, nullptr); -} - namespace { template struct is_limex { diff --git a/src/nfa/nfa_build_util.h b/src/nfa/nfa_build_util.h index 92a1091ec..ee7a30949 100644 --- a/src/nfa/nfa_build_util.h +++ b/src/nfa/nfa_build_util.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in 
source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -47,10 +47,6 @@ std::string describe(const NFA &nfa); // For a given NFA, retrieve the alignment required by its uncompressed state. u32 state_alignment(const NFA &nfa); -/* returns true if the nfa is considered 'fast'. TODO: work out what we mean by - * fast. */ -bool is_fast(const NFA &n); - bool has_bounded_repeats_other_than_firsts(const NFA &n); bool has_bounded_repeats(const NFA &n); diff --git a/src/nfa/nfa_dump_dispatch.cpp b/src/nfa/nfa_dump_dispatch.cpp index 5607ed27a..bc8c175d3 100644 --- a/src/nfa/nfa_dump_dispatch.cpp +++ b/src/nfa/nfa_dump_dispatch.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -81,6 +81,10 @@ namespace ue2 { DISPATCH_CASE(TAMARAMA_NFA, Tamarama, dbnt_func); \ DISPATCH_CASE(MCSHENG_NFA_8, McSheng8, dbnt_func); \ DISPATCH_CASE(MCSHENG_NFA_16, McSheng16, dbnt_func); \ + DISPATCH_CASE(SHENG_NFA_32, Sheng32, dbnt_func); \ + DISPATCH_CASE(SHENG_NFA_64, Sheng64, dbnt_func); \ + DISPATCH_CASE(MCSHENG_64_NFA_8, McSheng64_8, dbnt_func); \ + DISPATCH_CASE(MCSHENG_64_NFA_16, McSheng64_16, dbnt_func); \ default: \ assert(0); \ } diff --git a/src/nfa/nfa_internal.h b/src/nfa/nfa_internal.h index 9d2808225..8cc701b6e 100644 --- a/src/nfa/nfa_internal.h +++ b/src/nfa/nfa_internal.h @@ -1,5 +1,6 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -72,6 +73,14 @@ enum NFAEngineType { TAMARAMA_NFA, /**< magic nfa container */ MCSHENG_NFA_8, /**< magic pseudo nfa */ MCSHENG_NFA_16, /**< magic pseudo nfa */ + SHENG_NFA_32, /**< magic pseudo nfa */ + SHENG_NFA_64, /**< magic pseudo nfa */ + MCSHENG_64_NFA_8, /**< magic pseudo nfa */ + MCSHENG_64_NFA_16, /**< magic pseudo nfa */ +#ifdef HAVE_SVE2 + LBR_NFA_VERM16, /**< magic pseudo nfa */ + LBR_NFA_NVERM16, /**< magic pseudo nfa */ +#endif // HAVE_SVE2 /** \brief bogus NFA - not used */ INVALID_NFA }; @@ -148,7 +157,8 @@ static really_inline int isMcClellanType(u8 t) { /** \brief True if the given type (from NFA::type) is a Sheng-McClellan hybrid * DFA. */ static really_inline int isShengMcClellanType(u8 t) { - return t == MCSHENG_NFA_8 || t == MCSHENG_NFA_16; + return t == MCSHENG_NFA_8 || t == MCSHENG_NFA_16 || + t == MCSHENG_64_NFA_8 || t == MCSHENG_64_NFA_16; } /** \brief True if the given type (from NFA::type) is a Gough DFA. */ @@ -157,10 +167,25 @@ static really_inline int isGoughType(u8 t) { } /** \brief True if the given type (from NFA::type) is a Sheng DFA. */ -static really_inline int isShengType(u8 t) { +static really_inline int isSheng16Type(u8 t) { return t == SHENG_NFA; } +/** \brief True if the given type (from NFA::type) is a Sheng32 DFA. */ +static really_inline int isSheng32Type(u8 t) { + return t == SHENG_NFA_32; +} + +/** \brief True if the given type (from NFA::type) is a Sheng64 DFA. */ +static really_inline int isSheng64Type(u8 t) { + return t == SHENG_NFA_64; +} + +/** \brief True if the given type (from NFA::type) is a Sheng16/32/64 DFA. 
*/ +static really_inline int isShengType(u8 t) { + return t == SHENG_NFA || t == SHENG_NFA_32 || t == SHENG_NFA_64; +} + /** * \brief True if the given type (from NFA::type) is a McClellan, Gough or * Sheng DFA. @@ -198,6 +223,9 @@ static really_inline int isNfaType(u8 t) { static really_inline int isLbrType(u8 t) { return t == LBR_NFA_DOT || t == LBR_NFA_VERM || t == LBR_NFA_NVERM || +#ifdef HAVE_SVE2 + t == LBR_NFA_VERM16 || t == LBR_NFA_NVERM16 || +#endif // HAVE_SVE2 t == LBR_NFA_SHUF || t == LBR_NFA_TRUF; } @@ -214,7 +242,6 @@ int isMultiTopType(u8 t) { /** Macros used in place of unimplemented NFA API functions for a given * engine. */ -#if !defined(_WIN32) /* Use for functions that return an integer. */ #define NFA_API_NO_IMPL(...) \ @@ -230,14 +257,6 @@ int isMultiTopType(u8 t) { NFA_ZOMBIE_NO; \ }) -#else - -/* Simpler implementation for compilers that don't like the GCC extension used - * above. */ -#define NFA_API_NO_IMPL(...) 0 -#define NFA_API_ZOMBIE_NO_IMPL(...) NFA_ZOMBIE_NO - -#endif #ifdef __cplusplus } diff --git a/src/nfa/nfa_rev_api.h b/src/nfa/nfa_rev_api.h index 370f96ef6..d82c52a45 100644 --- a/src/nfa/nfa_rev_api.h +++ b/src/nfa/nfa_rev_api.h @@ -35,7 +35,7 @@ #include "accel.h" #include "nfa_internal.h" -#include "vermicelli.h" +#include "vermicelli.hpp" #include "util/unaligned.h" static really_inline diff --git a/src/nfa/ppc64el/shufti.hpp b/src/nfa/ppc64el/shufti.hpp new file mode 100644 index 000000000..dedeb52de --- /dev/null +++ b/src/nfa/ppc64el/shufti.hpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * Copyright (c) 2021, Arm Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Shufti: character class acceleration. 
+ * + */ + +template +static really_inline +const SuperVector blockSingleMask(SuperVector mask_lo, SuperVector mask_hi, SuperVector chars) { + const SuperVector low4bits = SuperVector::dup_u8(0xf); + + SuperVector c_lo = chars & low4bits; + SuperVector c_hi = chars.template vshr_8_imm<4>(); + c_lo = mask_lo.template pshufb(c_lo); + c_hi = mask_hi.template pshufb(c_hi); + + return (c_lo & c_hi).eq(SuperVector::Zeroes()); +} + +template +static really_inline +SuperVector blockDoubleMask(SuperVector mask1_lo, SuperVector mask1_hi, SuperVector mask2_lo, SuperVector mask2_hi, SuperVector chars) { + + const SuperVector low4bits = SuperVector::dup_u8(0xf); + SuperVector chars_lo = chars & low4bits; + chars_lo.print8("chars_lo"); + SuperVector chars_hi = chars.template vshr_64_imm<4>() & low4bits; + chars_hi.print8("chars_hi"); + SuperVector c1_lo = mask1_lo.template pshufb(chars_lo); + c1_lo.print8("c1_lo"); + SuperVector c1_hi = mask1_hi.template pshufb(chars_hi); + c1_hi.print8("c1_hi"); + SuperVector t1 = c1_lo | c1_hi; + t1.print8("t1"); + + SuperVector c2_lo = mask2_lo.template pshufb(chars_lo); + c2_lo.print8("c2_lo"); + SuperVector c2_hi = mask2_hi.template pshufb(chars_hi); + c2_hi.print8("c2_hi"); + SuperVector t2 = c2_lo | c2_hi; + t2.print8("t2"); + t2.template vshr_128_imm<1>().print8("t2.vshr_128(1)"); + SuperVector t = t1 | (t2.template vshr_128_imm<1>()); + t.print8("t"); + + return t.eq(SuperVector::Ones()); +} diff --git a/src/nfa/ppc64el/truffle.hpp b/src/nfa/ppc64el/truffle.hpp new file mode 100644 index 000000000..7dc711f4e --- /dev/null +++ b/src/nfa/ppc64el/truffle.hpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Truffle: character class acceleration. 
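/*
 * A minimal scalar reading of the shufti blockSingleMask() above, under the
 * usual shufti mask convention (as produced by shuftiBuildMasks): a byte c
 * belongs to the character class iff its low-nibble table entry and its
 * high-nibble table entry share at least one bit. blockSingleMask() evaluates
 * this for a whole vector with two pshufb lookups and returns the lanes where
 * the AND is zero, i.e. the bytes that fall outside the class. The helper
 * name below is illustrative only and is not part of the patch.
 */
static int shufti_byte_in_class(const unsigned char mask_lo[16],
                                const unsigned char mask_hi[16],
                                unsigned char c) {
    return (mask_lo[c & 0xf] & mask_hi[c >> 4]) != 0;
}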
+ * + */ + +template +static really_inline +const SuperVector blockSingleMask(SuperVector shuf_mask_lo_highclear, SuperVector shuf_mask_lo_highset, SuperVector chars) { + + chars.print8("chars"); + shuf_mask_lo_highclear.print8("shuf_mask_lo_highclear"); + shuf_mask_lo_highset.print8("shuf_mask_lo_highset"); + + SuperVector highconst = SuperVector::dup_u8(0x80); + highconst.print8("highconst"); + SuperVector shuf_mask_hi = SuperVector::dup_u64(0x8040201008040201); + shuf_mask_hi.print8("shuf_mask_hi"); + + SuperVector shuf1 = shuf_mask_lo_highclear.pshufb(chars); + shuf1.print8("shuf1"); + SuperVector t1 = chars ^ highconst; + t1.print8("t1"); + SuperVector shuf2 = shuf_mask_lo_highset.pshufb(t1); + shuf2.print8("shuf2"); + SuperVector t2 = highconst.opandnot(chars.template vshr_64_imm<4>()); + t2.print8("t2"); + SuperVector shuf3 = shuf_mask_hi.pshufb(t2); + shuf3.print8("shuf3"); + SuperVector res = (shuf1 | shuf2) & shuf3; + res.print8("(shuf1 | shuf2) & shuf3"); + + return res.eq(SuperVector::Zeroes()); +} diff --git a/src/nfa/ppc64el/vermicelli.hpp b/src/nfa/ppc64el/vermicelli.hpp new file mode 100644 index 000000000..1f3de25f2 --- /dev/null +++ b/src/nfa/ppc64el/vermicelli.hpp @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Vermicelli: single-byte and double-byte acceleration. 
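/*
 * A minimal scalar reading of the truffle blockSingleMask() above, following
 * the mask layout implied by the code (an assumption, not stated in the
 * patch): shuf_mask_lo_highclear covers bytes 0x00-0x7f and
 * shuf_mask_lo_highset covers 0x80-0xff, each indexed by the low nibble and
 * holding one bit per value of the remaining high-nibble bits, so membership
 * is a single bit test. Lanes where the combined lookup is zero are the bytes
 * outside the class. The helper name below is illustrative only.
 */
static int truffle_byte_in_class(const unsigned char lo_highclear[16],
                                 const unsigned char lo_highset[16],
                                 unsigned char c) {
    const unsigned char *table = (c & 0x80) ? lo_highset : lo_highclear;
    return (table[c & 0xf] >> ((c >> 4) & 0x7)) & 1;
}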
+ */ + +template +static really_inline +const u8 *vermicelliBlock(SuperVector const data, SuperVector const chars, SuperVector const casemask, u8 const *buf, u16 const len) { + + SuperVector mask = chars.eq(casemask & data); + return first_non_zero_match(buf, mask, len); +} + +template +static really_inline +const u8 *vermicelliBlockNeg(SuperVector const data, SuperVector const chars, SuperVector const casemask, u8 const *buf, u16 const len) { + + SuperVector mask = chars.eq(casemask & data); + return first_zero_match_inverted(buf, mask, len); +} + +template +static really_inline +const u8 *rvermicelliBlock(SuperVector const data, SuperVector const chars, SuperVector const casemask, u8 const *buf, u16 const len) { + + SuperVector mask = chars.eq(casemask & data); + return last_non_zero_match(buf, mask, len); +} + +template +static really_inline +const u8 *rvermicelliBlockNeg(SuperVector const data, SuperVector const chars, SuperVector const casemask, const u8 *buf, u16 const len) { + + data.print8("data"); + chars.print8("chars"); + casemask.print8("casemask"); + SuperVector mask = chars.eq(casemask & data); + mask.print8("mask"); + return last_zero_match_inverted(buf, mask, len); +} + +template +static really_inline +const u8 *vermicelliDoubleBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, SuperVector const casemask, + u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) { + + SuperVector v = casemask & data; + SuperVector mask1 = chars1.eq(v); + SuperVector mask2 = chars2.eq(v); + SuperVector mask = mask1 & (mask2 >> 1); + + DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]); + bool partial_match = (check_partial && ((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1)); + DEBUG_PRINTF("partial = %d\n", partial_match); + if (partial_match) { + mask = mask | ((SuperVector::Ones() >> (S-1)) << (S-1)); + } + + return first_non_zero_match(buf, mask, len); +} + +template +static really_inline +const u8 *rvermicelliDoubleBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, SuperVector const casemask, + u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) { + + SuperVector v = casemask & data; + SuperVector mask1 = chars1.eq(v); + SuperVector mask2 = chars2.eq(v); + SuperVector mask = (mask1 << 1)& mask2; + + DEBUG_PRINTF("buf[0] = %02hhx, buf[-1] = %02hhx\n", buf[0], buf[-1]); + bool partial_match = (check_partial && ((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1)); + DEBUG_PRINTF("partial = %d\n", partial_match); + if (partial_match) { + mask = mask | (SuperVector::Ones() >> (S-1)); + } + + return last_non_zero_match(buf, mask, len); +} + +template +static really_inline +const u8 *vermicelliDoubleMaskedBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, + SuperVector const mask1, SuperVector const mask2, + u8 const c1, u8 const c2, u8 const m1, u8 const m2, u8 const *buf, u16 const len) { + + SuperVector v1 = chars1.eq(data & mask1); + SuperVector v2 = chars2.eq(data & mask2); + SuperVector mask = v1 & (v2 >> 1); + + DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]); + bool partial_match = (check_partial && ((buf[0] & m2) == c2) && ((buf[-1] & m1) == c1)); + DEBUG_PRINTF("partial = %d\n", partial_match); + if (partial_match) { + mask = mask | ((SuperVector::Ones() >> (S-1)) << (S-1)); + } + + return first_non_zero_match(buf, mask, len); +} + + diff --git a/src/nfa/rdfa_graph.cpp b/src/nfa/rdfa_graph.cpp 
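/*
 * A minimal scalar sketch of the per-byte test used by the vermicelli block
 * helpers above: each lane is matched as (data & casemask) == chr, where the
 * casemask is all-ones for case-sensitive scans and clears the ASCII case bit
 * for caseless scans (chars being pre-masked the same way). The double-byte
 * variants combine a shifted copy of the second-character mask so a hit marks
 * a c1,c2 pair, and patch in a pair that straddles the block boundary via the
 * buf[-1]/buf[0] check. The helper names below are illustrative only and are
 * not part of the patch.
 */
#include <stddef.h>

static int verm_match(unsigned char data, unsigned char chr,
                      unsigned char casemask) {
    return (unsigned char)(data & casemask) == chr;
}

/* double-byte variant: a match at position i requires both bytes of the pair */
static int verm_double_match(const unsigned char *buf, size_t i,
                             unsigned char c1, unsigned char c2,
                             unsigned char casemask) {
    return verm_match(buf[i], c1, casemask) &&
           verm_match(buf[i + 1], c2, casemask);
}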
index 2467748b9..d925d1b4a 100644 --- a/src/nfa/rdfa_graph.cpp +++ b/src/nfa/rdfa_graph.cpp @@ -44,7 +44,7 @@ RdfaGraph::RdfaGraph(const raw_dfa &rdfa) { vector verts; verts.reserve(rdfa.states.size()); for (dstate_id_t i = 0; i < rdfa.states.size(); i++) { - verts.push_back(add_vertex(g)); + verts.emplace_back(add_vertex(g)); assert(g[verts.back()].index == i); } diff --git a/src/nfa/rdfa_merge.cpp b/src/nfa/rdfa_merge.cpp index 2ad871234..588f94e38 100644 --- a/src/nfa/rdfa_merge.cpp +++ b/src/nfa/rdfa_merge.cpp @@ -37,7 +37,6 @@ #include "util/container.h" #include "util/determinise.h" #include "util/flat_containers.h" -#include "util/make_unique.h" #include "util/report_manager.h" #include "util/unordered.h" @@ -132,7 +131,7 @@ class Automaton_Merge { if (t.any() && t != esets[i]) { esets[i] &= ~t; - esets.push_back(t); + esets.emplace_back(t); } } } @@ -204,7 +203,7 @@ class Automaton_Merge { const vector initial() { vector rv = {as}; if (start_floating != DEAD_STATE && start_floating != start_anchored) { - rv.push_back(fs); + rv.emplace_back(fs); } return rv; } @@ -287,7 +286,7 @@ unique_ptr mergeTwoDfas(const raw_dfa *d1, const raw_dfa *d2, assert(d1->kind == d2->kind); assert(max_states <= MAX_DFA_STATES); - auto rdfa = ue2::make_unique(d1->kind); + auto rdfa = std::make_unique(d1->kind); Automaton_Merge autom(d1, d2, rm, grey); if (determinise(autom, rdfa->states, max_states)) { @@ -320,7 +319,7 @@ void mergeDfas(vector> &dfas, size_t max_states, queue> q; for (auto &dfa : dfas) { - q.push(move(dfa)); + q.push(std::move(dfa)); } // All DFAs are now on the queue, so we'll clear the vector and use it for @@ -329,30 +328,30 @@ void mergeDfas(vector> &dfas, size_t max_states, while (q.size() > 1) { // Attempt to merge the two front elements of the queue. - unique_ptr d1 = move(q.front()); + unique_ptr d1 = std::move(q.front()); q.pop(); - unique_ptr d2 = move(q.front()); + unique_ptr d2 = std::move(q.front()); q.pop(); auto rdfa = mergeTwoDfas(d1.get(), d2.get(), max_states, rm, grey); if (rdfa) { - q.push(move(rdfa)); + q.push(std::move(rdfa)); } else { DEBUG_PRINTF("failed to merge\n"); // Put the larger of the two DFAs on the output list, retain the // smaller one on the queue for further merge attempts. if (d2->states.size() > d1->states.size()) { - dfas.push_back(move(d2)); - q.push(move(d1)); + dfas.emplace_back(std::move(d2)); + q.push(std::move(d1)); } else { - dfas.push_back(move(d1)); - q.push(move(d2)); + dfas.emplace_back(std::move(d1)); + q.push(std::move(d2)); } } } while (!q.empty()) { - dfas.push_back(move(q.front())); + dfas.emplace_back(std::move(q.front())); q.pop(); } @@ -370,7 +369,7 @@ unique_ptr mergeAllDfas(const vector &dfas, assert(all_of(begin(dfas), end(dfas), [&kind](const raw_dfa *rdfa) { return rdfa->kind == kind; })); - auto rdfa = ue2::make_unique(kind); + auto rdfa = std::make_unique(kind); Automaton_Merge n(dfas, rm, grey); DEBUG_PRINTF("merging dfa\n"); diff --git a/src/nfa/repeatcompile.cpp b/src/nfa/repeatcompile.cpp index 934dd29e6..60b513524 100644 --- a/src/nfa/repeatcompile.cpp +++ b/src/nfa/repeatcompile.cpp @@ -80,10 +80,10 @@ u32 repeatRecurTable(struct RepeatStateInfo *info, const depth &repeatMax, u32 repeatTmp = info->patchCount > 2 ? 64 : (u32)repeatMax; u32 repeat_index = repeatTmp < minPeriod ? 
repeatTmp : minPeriod; for (u32 i = 0; i <= repeat_index; i++) { - info->table.push_back(i + 1); + info->table.emplace_back(i + 1); } for (u32 i = minPeriod + 1; i <= repeatTmp; i++) { - info->table.push_back(info->table[i - 1] + info->table[i - minPeriod]); + info->table.emplace_back(info->table[i - 1] + info->table[i - minPeriod]); if (info->table[i] < info->table[i - 1]) { return i - 1; } @@ -124,6 +124,10 @@ RepeatStateInfo::RepeatStateInfo(enum RepeatType type, const depth &repeatMin, const depth &repeatMax, u32 minPeriod) : stateSize(0), packedCtrlSize(0), horizon(0), patchCount(0), patchSize(0), encodingSize(0), patchesOffset(0) { + if (type == REPEAT_SPARSE_OPTIMAL_P && minPeriod == 0) { + assert(0); + throw std::domain_error("SPARSE_OPTIMAL_P must have non-zero minPeriod."); + } assert(repeatMin <= repeatMax); assert(repeatMax.is_reachable()); assert(minPeriod || type != REPEAT_SPARSE_OPTIMAL_P); @@ -341,7 +345,7 @@ vector minResetDistToEnd(const vector> &triggers, break; } } - out.push_back(i); + out.emplace_back(i); } return out; diff --git a/src/nfa/sheng.c b/src/nfa/sheng.c index 4f30910b5..3f36e2189 100644 --- a/src/nfa/sheng.c +++ b/src/nfa/sheng.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -154,6 +154,205 @@ char fireReports(const struct sheng *sh, NfaCallback cb, void *ctxt, return MO_CONTINUE_MATCHING; /* continue execution */ } +#if defined(HAVE_AVX512VBMI) +// Sheng32 +static really_inline +const struct sheng32 *get_sheng32(const struct NFA *n) { + return (const struct sheng32 *)getImplNfa(n); +} + +static really_inline +const struct sstate_aux *get_aux32(const struct sheng32 *sh, u8 id) { + u32 offset = sh->aux_offset - sizeof(struct NFA) + + (id & SHENG32_STATE_MASK) * sizeof(struct sstate_aux); + DEBUG_PRINTF("Getting aux for state %u at offset %llu\n", + id & SHENG32_STATE_MASK, (u64a)offset + sizeof(struct NFA)); + return (const struct sstate_aux *)((const char *) sh + offset); +} + +static really_inline +const union AccelAux *get_accel32(const struct sheng32 *sh, u8 id) { + const struct sstate_aux *saux = get_aux32(sh, id); + DEBUG_PRINTF("Getting accel aux at offset %u\n", saux->accel); + const union AccelAux *aux = (const union AccelAux *) + ((const char *)sh + saux->accel - sizeof(struct NFA)); + return aux; +} + +static really_inline +const struct report_list *get_rl32(const struct sheng32 *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting report list at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept - sizeof(struct NFA)); +} + +static really_inline +const struct report_list *get_eod_rl32(const struct sheng32 *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting EOD report list at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept_eod - sizeof(struct NFA)); +} + +static really_inline +char sheng32HasAccept(const struct sheng32 *sh, const struct sstate_aux *aux, + ReportID report) { + assert(sh && aux); + + const struct report_list *rl = get_rl32(sh, aux); + assert(ISALIGNED_N(rl, 4)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + + for (u32 i = 0; i < rl->count; i++) { + if (rl->report[i] == report) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + return 1; + } + } + + return 0; +} + +static 
really_inline +char fireReports32(const struct sheng32 *sh, NfaCallback cb, void *ctxt, + const u8 state, u64a loc, u8 *const cached_accept_state, + ReportID *const cached_accept_id, char eod) { + DEBUG_PRINTF("reporting matches @ %llu\n", loc); + + if (!eod && state == *cached_accept_state) { + DEBUG_PRINTF("reporting %u\n", *cached_accept_id); + if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + const struct sstate_aux *aux = get_aux32(sh, state); + const struct report_list *rl = eod ? get_eod_rl32(sh, aux) : + get_rl32(sh, aux); + assert(ISALIGNED(rl)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + u32 count = rl->count; + + if (!eod && count == 1) { + *cached_accept_state = state; + *cached_accept_id = rl->report[0]; + + DEBUG_PRINTF("reporting %u\n", rl->report[0]); + if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + for (u32 i = 0; i < count; i++) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + } + return MO_CONTINUE_MATCHING; /* continue execution */ +} + +// Sheng64 +static really_inline +const struct sheng64 *get_sheng64(const struct NFA *n) { + return (const struct sheng64 *)getImplNfa(n); +} + +static really_inline +const struct sstate_aux *get_aux64(const struct sheng64 *sh, u8 id) { + u32 offset = sh->aux_offset - sizeof(struct NFA) + + (id & SHENG64_STATE_MASK) * sizeof(struct sstate_aux); + DEBUG_PRINTF("Getting aux for state %u at offset %llu\n", + id & SHENG64_STATE_MASK, (u64a)offset + sizeof(struct NFA)); + return (const struct sstate_aux *)((const char *) sh + offset); +} + +static really_inline +const struct report_list *get_rl64(const struct sheng64 *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting report list at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept - sizeof(struct NFA)); +} + +static really_inline +const struct report_list *get_eod_rl64(const struct sheng64 *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting EOD report list at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept_eod - sizeof(struct NFA)); +} + +static really_inline +char sheng64HasAccept(const struct sheng64 *sh, const struct sstate_aux *aux, + ReportID report) { + assert(sh && aux); + + const struct report_list *rl = get_rl64(sh, aux); + assert(ISALIGNED_N(rl, 4)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + + for (u32 i = 0; i < rl->count; i++) { + if (rl->report[i] == report) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + return 1; + } + } + + return 0; +} + +static really_inline +char fireReports64(const struct sheng64 *sh, NfaCallback cb, void *ctxt, + const u8 state, u64a loc, u8 *const cached_accept_state, + ReportID *const cached_accept_id, char eod) { + DEBUG_PRINTF("reporting matches @ %llu\n", loc); + + if (!eod && state == *cached_accept_state) { + DEBUG_PRINTF("reporting %u\n", *cached_accept_id); + if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + const struct sstate_aux *aux = get_aux64(sh, 
state); + const struct report_list *rl = eod ? get_eod_rl64(sh, aux) : + get_rl64(sh, aux); + assert(ISALIGNED(rl)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + u32 count = rl->count; + + if (!eod && count == 1) { + *cached_accept_state = state; + *cached_accept_id = rl->report[0]; + + DEBUG_PRINTF("reporting %u\n", rl->report[0]); + if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + for (u32 i = 0; i < count; i++) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + } + return MO_CONTINUE_MATCHING; /* continue execution */ +} +#endif // end of HAVE_AVX512VBMI + /* include Sheng function definitions */ #include "sheng_defs.h" @@ -671,3 +870,1008 @@ char nfaExecSheng_expandState(UNUSED const struct NFA *nfa, void *dest, *(u8 *)dest = *(const u8 *)src; return 0; } + +#if defined(HAVE_AVX512VBMI) +// Sheng32 +static really_inline +char runSheng32Cb(const struct sheng32 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, + u8 has_accel, u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in callback mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + int rv; + /* scan and report all matches */ + if (can_die) { + if (has_accel) { + rv = sheng32_4_coda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } else { + rv = sheng32_4_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng32_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } else { + if (has_accel) { + rv = sheng32_4_coa(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } else { + rv = sheng32_4_co(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng32_co(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + return MO_ALIVE; +} + +static really_inline +void runSheng32Nm(const struct sheng32 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, u8 has_accel, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in nomatch mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + /* just scan the buffer */ + if (can_die) { + if (has_accel) { + sheng32_4_nmda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, 
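/*
 * A minimal sketch of the accept-report caching used by fireReports32() and
 * fireReports64() above: when a state's report list has exactly one entry,
 * the state and report id are remembered so that revisiting the same accept
 * state can fire the callback without re-walking the report list. The types
 * and helper below are illustrative stand-ins (the EOD handling of the real
 * functions is omitted), not the patch's own API.
 */
#include <stdint.h>

typedef int (*report_cb)(uint64_t loc, uint32_t report, void *ctxt);

/* returns nonzero if the callback asked to halt matching */
static int fire_cached_or_scan(uint8_t state, uint64_t loc,
                               uint8_t *cached_state, uint32_t *cached_id,
                               const uint32_t *reports, uint32_t count,
                               report_cb cb, void *ctxt) {
    if (state == *cached_state) {
        return cb(loc, *cached_id, ctxt) != 0;  /* fast path: cached accept */
    }
    if (count == 1) {                           /* cache single-report states */
        *cached_state = state;
        *cached_id = reports[0];
    }
    for (uint32_t i = 0; i < count; i++) {      /* general path: fire them all */
        if (cb(loc, reports[i], ctxt) != 0) {
            return 1;
        }
    }
    return 0;
}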
offset, cur_buf, + start, end, scanned); + } else { + sheng32_4_nmd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } + sheng32_nmd(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, *scanned, end, scanned); + } else { + sheng32_4_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, start, end, scanned); + sheng32_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, *scanned, end, scanned); + } +} + +static really_inline +char runSheng32Sam(const struct sheng32 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, u8 has_accel, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in stop at match mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + int rv; + /* scan until first match */ + if (can_die) { + if (has_accel) { + rv = sheng32_4_samda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } else { + rv = sheng32_4_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng32_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } else { + if (has_accel) { + rv = sheng32_4_sama(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } else { + rv = sheng32_4_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng32_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + return MO_ALIVE; +} + +static never_inline +char runSheng32(const struct sheng32 *sh, struct mq *q, s64a b_end, + enum MatchMode mode) { + u8 state = *(u8 *)q->state; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 has_accel = sh->flags & SHENG_FLAG_HAS_ACCEL; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + DEBUG_PRINTF("starting Sheng32 execution in state %u\n", + state & SHENG32_STATE_MASK); + + if (q->report_current) { + DEBUG_PRINTF("reporting current pending matches\n"); + assert(sh); + + q->report_current = 0; + + int rv; + if (single) { + rv = fireSingleReport(q->cb, q->context, sh->report, + q_cur_offset(q)); + } else { + rv = fireReports32(sh, q->cb, q->context, state, q_cur_offset(q), + &cached_accept_state, &cached_accept_id, 0); + } + if (rv == MO_HALT_MATCHING) { + DEBUG_PRINTF("exiting in 
state %u\n", state & SHENG32_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("proceeding with matching\n"); + } + + assert(q_cur_type(q) == MQE_START); + s64a start = q_cur_loc(q); + + DEBUG_PRINTF("offset: %lli, location: %lli, mode: %s\n", q->offset, start, + mode == CALLBACK_OUTPUT ? "CALLBACK OUTPUT" : + mode == NO_MATCHES ? "NO MATCHES" : + mode == STOP_AT_MATCH ? "STOP AT MATCH" : "???"); + + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ? "END" : "???"); + + const u8* cur_buf; + if (start < 0) { + DEBUG_PRINTF("negative location, scanning history\n"); + DEBUG_PRINTF("min location: %zd\n", -q->hlength); + cur_buf = q->history + q->hlength; + } else { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max location: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* if we our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", state & SHENG32_STATE_MASK); + q->items[q->cur].location = b_end; + return MO_ALIVE; + } + + q->cur++; + + s64a cur_start = start; + + while (1) { + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ? "END" : "???"); + s64a end = q_cur_loc(q); + if (mode != NO_MATCHES) { + end = MIN(end, b_end); + } + assert(end <= (s64a) q->length); + s64a cur_end = end; + + /* we may cross the border between history and current buffer */ + if (cur_start < 0) { + cur_end = MIN(0, cur_end); + } + + DEBUG_PRINTF("start: %lli end: %lli\n", start, end); + + /* don't scan zero length buffer */ + if (cur_start != cur_end) { + const u8 * scanned = cur_buf; + char rv; + + if (mode == NO_MATCHES) { + runSheng32Nm(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, cur_buf, + cur_buf + cur_start, cur_buf + cur_end, can_die, + has_accel, single, &scanned, &state); + } else if (mode == CALLBACK_OUTPUT) { + rv = runSheng32Cb(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, cur_buf + cur_end, + can_die, has_accel, single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG32_STATE_MASK); + return MO_DEAD; + } + } else if (mode == STOP_AT_MATCH) { + rv = runSheng32Sam(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, + cur_buf + cur_end, can_die, has_accel, single, + &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG32_STATE_MASK); + return rv; + } else if (rv == MO_MATCHES_PENDING) { + assert(q->cur); + DEBUG_PRINTF("found a match, setting q location to %zd\n", + scanned - cur_buf + 1); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = + scanned - cur_buf + 1; /* due to exiting early */ + *(u8 *)q->state = state; + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG32_STATE_MASK); + return rv; + } + } else { + assert(!"invalid scanning mode!"); + } + assert(scanned == cur_buf + cur_end); + + cur_start = cur_end; + } + + /* if we our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + 
DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", state & SHENG32_STATE_MASK); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = b_end; + *(u8 *)q->state = state; + return MO_ALIVE; + } + + /* crossing over into actual buffer */ + if (cur_start == 0) { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max offset: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* continue scanning the same buffer */ + if (end != cur_end) { + continue; + } + + switch (q_cur_type(q)) { + case MQE_END: + *(u8 *)q->state = state; + q->cur++; + DEBUG_PRINTF("exiting in state %u\n", state & SHENG32_STATE_MASK); + if (can_die) { + return (state & SHENG32_STATE_DEAD) ? MO_DEAD : MO_ALIVE; + } + return MO_ALIVE; + case MQE_TOP: + if (q->offset + cur_start == 0) { + DEBUG_PRINTF("Anchored start, going to state %u\n", + sh->anchored); + state = sh->anchored; + } else { + u8 new_state = get_aux32(sh, state)->top; + DEBUG_PRINTF("Top event %u->%u\n", state & SHENG32_STATE_MASK, + new_state & SHENG32_STATE_MASK); + state = new_state; + } + break; + default: + assert(!"invalid queue event"); + break; + } + q->cur++; + } +} + +char nfaExecSheng32_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context) { + DEBUG_PRINTF("smallwrite Sheng32\n"); + assert(n->type == SHENG_NFA_32); + const struct sheng32 *sh = getImplNfa(n); + u8 state = sh->anchored; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 has_accel = sh->flags & SHENG_FLAG_HAS_ACCEL; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + /* scan and report all matches */ + int rv; + s64a end = length; + const u8 *scanned; + + rv = runSheng32Cb(sh, cb, context, offset, &cached_accept_state, + &cached_accept_id, buffer, buffer, buffer + end, can_die, + has_accel, single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG32_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("%u\n", state & SHENG32_STATE_MASK); + + const struct sstate_aux *aux = get_aux32(sh, state); + + if (aux->accept_eod) { + DEBUG_PRINTF("Reporting EOD matches\n"); + fireReports32(sh, cb, context, state, end + offset, + &cached_accept_state, &cached_accept_id, 1); + } + + return state & SHENG32_STATE_DEAD ? 
MO_DEAD : MO_ALIVE; +} + +char nfaExecSheng32_Q(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng32 *sh = get_sheng32(n); + char rv = runSheng32(sh, q, end, CALLBACK_OUTPUT); + return rv; +} + +char nfaExecSheng32_Q2(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng32 *sh = get_sheng32(n); + char rv = runSheng32(sh, q, end, STOP_AT_MATCH); + return rv; +} + +char nfaExecSheng32_QR(const struct NFA *n, struct mq *q, ReportID report) { + assert(q_cur_type(q) == MQE_START); + + const struct sheng32 *sh = get_sheng32(n); + char rv = runSheng32(sh, q, 0 /* end */, NO_MATCHES); + + if (rv && nfaExecSheng32_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } + return rv; +} + +char nfaExecSheng32_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct sheng32 *sh = get_sheng32(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG32_STATE_MASK)); + + const struct sstate_aux *aux = get_aux32(sh, s); + + if (!aux->accept) { + return 0; + } + + return sheng32HasAccept(sh, aux, report); +} + +char nfaExecSheng32_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct sheng32 *sh = get_sheng32(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG32_STATE_MASK)); + + const struct sstate_aux *aux = get_aux32(sh, s); + return !!aux->accept; +} + +char nfaExecSheng32_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback cb, void *ctxt) { + assert(nfa); + + const struct sheng32 *sh = get_sheng32(nfa); + u8 s = *(const u8 *)state; + DEBUG_PRINTF("checking EOD accepts for %u\n", (u8)(s & SHENG32_STATE_MASK)); + + const struct sstate_aux *aux = get_aux32(sh, s); + + if (!aux->accept_eod) { + return MO_CONTINUE_MATCHING; + } + + return fireReports32(sh, cb, ctxt, s, offset, NULL, NULL, 1); +} + +char nfaExecSheng32_reportCurrent(const struct NFA *n, struct mq *q) { + const struct sheng32 *sh = (const struct sheng32 *)getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u8 s = *(u8 *)q->state; + const struct sstate_aux *aux = get_aux32(sh, s); + u64a offset = q_cur_offset(q); + u8 cached_state_id = 0; + ReportID cached_report_id = 0; + assert(q_cur_type(q) == MQE_START); + + if (aux->accept) { + if (sh->flags & SHENG_FLAG_SINGLE_REPORT) { + fireSingleReport(cb, ctxt, sh->report, offset); + } else { + fireReports32(sh, cb, ctxt, s, offset, &cached_state_id, + &cached_report_id, 0); + } + } + + return 0; +} + +char nfaExecSheng32_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct sheng32 *sh = get_sheng32(nfa); + u8 *s = (u8 *)state; + *s = offset ? 
sh->floating: sh->anchored; + return !(*s & SHENG32_STATE_DEAD); +} + +char nfaExecSheng32_queueInitState(const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 1); + + /* starting in floating state */ + const struct sheng32 *sh = get_sheng32(nfa); + *(u8 *)q->state = sh->floating; + DEBUG_PRINTF("starting in floating state\n"); + return 0; +} + +char nfaExecSheng32_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecSheng32_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +// Sheng64 +static really_inline +char runSheng64Cb(const struct sheng64 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in callback mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u single: %u\n", !!can_die, !!single); + int rv; + /* scan and report all matches */ + if (can_die) { + rv = sheng64_4_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng64_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } else { + rv = sheng64_4_co(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng64_co(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + return MO_ALIVE; +} + +static really_inline +void runSheng64Nm(const struct sheng64 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in nomatch mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u single: %u\n", !!can_die, !!single); + /* just scan the buffer */ + if (can_die) { + sheng64_4_nmd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + sheng64_nmd(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, *scanned, end, scanned); + } else { + sheng64_4_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, start, end, scanned); + sheng64_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, *scanned, end, scanned); + } +} + +static really_inline +char runSheng64Sam(const struct sheng64 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const 
cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in stop at match mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u single: %u\n", !!can_die, !!single); + int rv; + /* scan until first match */ + if (can_die) { + rv = sheng64_4_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng64_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } else { + rv = sheng64_4_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng64_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + return MO_ALIVE; +} + +static never_inline +char runSheng64(const struct sheng64 *sh, struct mq *q, s64a b_end, + enum MatchMode mode) { + u8 state = *(u8 *)q->state; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + DEBUG_PRINTF("starting Sheng64 execution in state %u\n", + state & SHENG64_STATE_MASK); + + if (q->report_current) { + DEBUG_PRINTF("reporting current pending matches\n"); + assert(sh); + + q->report_current = 0; + + int rv; + if (single) { + rv = fireSingleReport(q->cb, q->context, sh->report, + q_cur_offset(q)); + } else { + rv = fireReports64(sh, q->cb, q->context, state, q_cur_offset(q), + &cached_accept_state, &cached_accept_id, 0); + } + if (rv == MO_HALT_MATCHING) { + DEBUG_PRINTF("exiting in state %u\n", state & SHENG64_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("proceeding with matching\n"); + } + + assert(q_cur_type(q) == MQE_START); + s64a start = q_cur_loc(q); + + DEBUG_PRINTF("offset: %lli, location: %lli, mode: %s\n", q->offset, start, + mode == CALLBACK_OUTPUT ? "CALLBACK OUTPUT" : + mode == NO_MATCHES ? "NO MATCHES" : + mode == STOP_AT_MATCH ? "STOP AT MATCH" : "???"); + + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ? 
"END" : "???"); + + const u8* cur_buf; + if (start < 0) { + DEBUG_PRINTF("negative location, scanning history\n"); + DEBUG_PRINTF("min location: %zd\n", -q->hlength); + cur_buf = q->history + q->hlength; + } else { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max location: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* if we our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", state & SHENG64_STATE_MASK); + q->items[q->cur].location = b_end; + return MO_ALIVE; + } + + q->cur++; + + s64a cur_start = start; + + while (1) { + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ? "END" : "???"); + s64a end = q_cur_loc(q); + if (mode != NO_MATCHES) { + end = MIN(end, b_end); + } + assert(end <= (s64a) q->length); + s64a cur_end = end; + + /* we may cross the border between history and current buffer */ + if (cur_start < 0) { + cur_end = MIN(0, cur_end); + } + + DEBUG_PRINTF("start: %lli end: %lli\n", start, end); + + /* don't scan zero length buffer */ + if (cur_start != cur_end) { + const u8 * scanned = cur_buf; + char rv; + + if (mode == NO_MATCHES) { + runSheng64Nm(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, cur_buf, + cur_buf + cur_start, cur_buf + cur_end, can_die, + single, &scanned, &state); + } else if (mode == CALLBACK_OUTPUT) { + rv = runSheng64Cb(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, cur_buf + cur_end, + can_die, single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG64_STATE_MASK); + return MO_DEAD; + } + } else if (mode == STOP_AT_MATCH) { + rv = runSheng64Sam(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, + cur_buf + cur_end, can_die, single, + &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG64_STATE_MASK); + return rv; + } else if (rv == MO_MATCHES_PENDING) { + assert(q->cur); + DEBUG_PRINTF("found a match, setting q location to %zd\n", + scanned - cur_buf + 1); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = + scanned - cur_buf + 1; /* due to exiting early */ + *(u8 *)q->state = state; + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG64_STATE_MASK); + return rv; + } + } else { + assert(!"invalid scanning mode!"); + } + assert(scanned == cur_buf + cur_end); + + cur_start = cur_end; + } + + /* if we our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", state & SHENG64_STATE_MASK); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = b_end; + *(u8 *)q->state = state; + return MO_ALIVE; + } + + /* crossing over into actual buffer */ + if (cur_start == 0) { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max offset: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* continue scanning the same buffer */ + if (end != cur_end) { + continue; + } + + switch (q_cur_type(q)) { + case MQE_END: + *(u8 *)q->state = state; + q->cur++; + 
DEBUG_PRINTF("exiting in state %u\n", state & SHENG64_STATE_MASK); + if (can_die) { + return (state & SHENG64_STATE_DEAD) ? MO_DEAD : MO_ALIVE; + } + return MO_ALIVE; + case MQE_TOP: + if (q->offset + cur_start == 0) { + DEBUG_PRINTF("Anchored start, going to state %u\n", + sh->anchored); + state = sh->anchored; + } else { + u8 new_state = get_aux64(sh, state)->top; + DEBUG_PRINTF("Top event %u->%u\n", state & SHENG64_STATE_MASK, + new_state & SHENG64_STATE_MASK); + state = new_state; + } + break; + default: + assert(!"invalid queue event"); + break; + } + q->cur++; + } +} + +char nfaExecSheng64_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context) { + DEBUG_PRINTF("smallwrite Sheng64\n"); + assert(n->type == SHENG_NFA_64); + const struct sheng64 *sh = getImplNfa(n); + u8 state = sh->anchored; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + /* scan and report all matches */ + int rv; + s64a end = length; + const u8 *scanned; + + rv = runSheng64Cb(sh, cb, context, offset, &cached_accept_state, + &cached_accept_id, buffer, buffer, buffer + end, can_die, + single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG64_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("%u\n", state & SHENG64_STATE_MASK); + + const struct sstate_aux *aux = get_aux64(sh, state); + + if (aux->accept_eod) { + DEBUG_PRINTF("Reporting EOD matches\n"); + fireReports64(sh, cb, context, state, end + offset, + &cached_accept_state, &cached_accept_id, 1); + } + + return state & SHENG64_STATE_DEAD ? MO_DEAD : MO_ALIVE; +} + +char nfaExecSheng64_Q(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng64 *sh = get_sheng64(n); + char rv = runSheng64(sh, q, end, CALLBACK_OUTPUT); + return rv; +} + +char nfaExecSheng64_Q2(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng64 *sh = get_sheng64(n); + char rv = runSheng64(sh, q, end, STOP_AT_MATCH); + return rv; +} + +char nfaExecSheng64_QR(const struct NFA *n, struct mq *q, ReportID report) { + assert(q_cur_type(q) == MQE_START); + + const struct sheng64 *sh = get_sheng64(n); + char rv = runSheng64(sh, q, 0 /* end */, NO_MATCHES); + + if (rv && nfaExecSheng64_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } + return rv; +} + +char nfaExecSheng64_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct sheng64 *sh = get_sheng64(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG64_STATE_MASK)); + + const struct sstate_aux *aux = get_aux64(sh, s); + + if (!aux->accept) { + return 0; + } + + return sheng64HasAccept(sh, aux, report); +} + +char nfaExecSheng64_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct sheng64 *sh = get_sheng64(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG64_STATE_MASK)); + + const struct sstate_aux *aux = get_aux64(sh, s); + return !!aux->accept; +} + +char nfaExecSheng64_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback cb, void *ctxt) { + assert(nfa); + + const struct sheng64 *sh = get_sheng64(nfa); + u8 s = *(const u8 *)state; + DEBUG_PRINTF("checking EOD accepts for %u\n", (u8)(s & SHENG64_STATE_MASK)); + + const struct sstate_aux *aux = get_aux64(sh, s); + + if (!aux->accept_eod) { 
+ return MO_CONTINUE_MATCHING; + } + + return fireReports64(sh, cb, ctxt, s, offset, NULL, NULL, 1); +} + +char nfaExecSheng64_reportCurrent(const struct NFA *n, struct mq *q) { + const struct sheng64 *sh = (const struct sheng64 *)getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u8 s = *(u8 *)q->state; + const struct sstate_aux *aux = get_aux64(sh, s); + u64a offset = q_cur_offset(q); + u8 cached_state_id = 0; + ReportID cached_report_id = 0; + assert(q_cur_type(q) == MQE_START); + + if (aux->accept) { + if (sh->flags & SHENG_FLAG_SINGLE_REPORT) { + fireSingleReport(cb, ctxt, sh->report, offset); + } else { + fireReports64(sh, cb, ctxt, s, offset, &cached_state_id, + &cached_report_id, 0); + } + } + + return 0; +} + +char nfaExecSheng64_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct sheng64 *sh = get_sheng64(nfa); + u8 *s = (u8 *)state; + *s = offset ? sh->floating: sh->anchored; + return !(*s & SHENG64_STATE_DEAD); +} + +char nfaExecSheng64_queueInitState(const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 1); + + /* starting in floating state */ + const struct sheng64 *sh = get_sheng64(nfa); + *(u8 *)q->state = sh->floating; + DEBUG_PRINTF("starting in floating state\n"); + return 0; +} + +char nfaExecSheng64_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecSheng64_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} +#endif // end of HAVE_AVX512VBMI diff --git a/src/nfa/sheng.h b/src/nfa/sheng.h index 84a2b6b51..7b90e3034 100644 --- a/src/nfa/sheng.h +++ b/src/nfa/sheng.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -58,4 +58,86 @@ char nfaExecSheng_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecSheng_B(const struct NFA *n, u64a offset, const u8 *buffer, size_t length, NfaCallback cb, void *context); +#if defined(HAVE_AVX512VBMI) +#define nfaExecSheng32_B_Reverse NFA_API_NO_IMPL +#define nfaExecSheng32_zombie_status NFA_API_ZOMBIE_NO_IMPL + +char nfaExecSheng32_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng32_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng32_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecSheng32_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecSheng32_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecSheng32_queueInitState(const struct NFA *nfa, struct mq *q); +char nfaExecSheng32_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecSheng32_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); +char nfaExecSheng32_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, u8 key); +char nfaExecSheng32_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char 
nfaExecSheng32_reportCurrent(const struct NFA *n, struct mq *q); + +char nfaExecSheng32_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context); + +#define nfaExecSheng64_B_Reverse NFA_API_NO_IMPL +#define nfaExecSheng64_zombie_status NFA_API_ZOMBIE_NO_IMPL + +char nfaExecSheng64_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng64_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng64_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecSheng64_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecSheng64_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecSheng64_queueInitState(const struct NFA *nfa, struct mq *q); +char nfaExecSheng64_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecSheng64_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); +char nfaExecSheng64_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, u8 key); +char nfaExecSheng64_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecSheng64_reportCurrent(const struct NFA *n, struct mq *q); + +char nfaExecSheng64_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context); + +#else // !HAVE_AVX512VBMI + +#define nfaExecSheng32_B_Reverse NFA_API_NO_IMPL +#define nfaExecSheng32_zombie_status NFA_API_ZOMBIE_NO_IMPL +#define nfaExecSheng32_Q NFA_API_NO_IMPL +#define nfaExecSheng32_Q2 NFA_API_NO_IMPL +#define nfaExecSheng32_QR NFA_API_NO_IMPL +#define nfaExecSheng32_inAccept NFA_API_NO_IMPL +#define nfaExecSheng32_inAnyAccept NFA_API_NO_IMPL +#define nfaExecSheng32_queueInitState NFA_API_NO_IMPL +#define nfaExecSheng32_queueCompressState NFA_API_NO_IMPL +#define nfaExecSheng32_expandState NFA_API_NO_IMPL +#define nfaExecSheng32_initCompressedState NFA_API_NO_IMPL +#define nfaExecSheng32_testEOD NFA_API_NO_IMPL +#define nfaExecSheng32_reportCurrent NFA_API_NO_IMPL +#define nfaExecSheng32_B NFA_API_NO_IMPL + +#define nfaExecSheng64_B_Reverse NFA_API_NO_IMPL +#define nfaExecSheng64_zombie_status NFA_API_ZOMBIE_NO_IMPL +#define nfaExecSheng64_Q NFA_API_NO_IMPL +#define nfaExecSheng64_Q2 NFA_API_NO_IMPL +#define nfaExecSheng64_QR NFA_API_NO_IMPL +#define nfaExecSheng64_inAccept NFA_API_NO_IMPL +#define nfaExecSheng64_inAnyAccept NFA_API_NO_IMPL +#define nfaExecSheng64_queueInitState NFA_API_NO_IMPL +#define nfaExecSheng64_queueCompressState NFA_API_NO_IMPL +#define nfaExecSheng64_expandState NFA_API_NO_IMPL +#define nfaExecSheng64_initCompressedState NFA_API_NO_IMPL +#define nfaExecSheng64_testEOD NFA_API_NO_IMPL +#define nfaExecSheng64_reportCurrent NFA_API_NO_IMPL +#define nfaExecSheng64_B NFA_API_NO_IMPL +#endif // end of HAVE_AVX512VBMI + #endif /* SHENG_H_ */ diff --git a/src/nfa/sheng_defs.h b/src/nfa/sheng_defs.h index 26bdbcee2..390af7522 100644 --- a/src/nfa/sheng_defs.h +++ b/src/nfa/sheng_defs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -52,6 +52,43 @@ u8 hasInterestingStates(const u8 a, const u8 b, const u8 c, const u8 d) { return (a | b | c | d) & (SHENG_STATE_FLAG_MASK); } +#if defined(HAVE_AVX512VBMI) +static really_inline +u8 isDeadState32(const u8 a) { + return 
a & SHENG32_STATE_DEAD; +} + +static really_inline +u8 isAcceptState32(const u8 a) { + return a & SHENG32_STATE_ACCEPT; +} + +static really_inline +u8 isAccelState32(const u8 a) { + return a & SHENG32_STATE_ACCEL; +} + +static really_inline +u8 hasInterestingStates32(const u8 a, const u8 b, const u8 c, const u8 d) { + return (a | b | c | d) & (SHENG32_STATE_FLAG_MASK); +} + +static really_inline +u8 isDeadState64(const u8 a) { + return a & SHENG64_STATE_DEAD; +} + +static really_inline +u8 isAcceptState64(const u8 a) { + return a & SHENG64_STATE_ACCEPT; +} + +static really_inline +u8 hasInterestingStates64(const u8 a, const u8 b, const u8 c, const u8 d) { + return (a | b | c | d) & (SHENG64_STATE_FLAG_MASK); +} +#endif + /* these functions should be optimized out, used by NO_MATCHES mode */ static really_inline u8 dummyFunc4(UNUSED const u8 a, UNUSED const u8 b, UNUSED const u8 c, @@ -71,66 +108,162 @@ u8 dummyFunc(UNUSED const u8 a) { #define SHENG_IMPL sheng_cod #define DEAD_FUNC isDeadState #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_cod +#define DEAD_FUNC32 isDeadState32 +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_cod +#define DEAD_FUNC64 isDeadState64 +#define ACCEPT_FUNC64 isAcceptState64 +#endif #define STOP_AT_MATCH 0 #include "sheng_impl.h" #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif #undef STOP_AT_MATCH /* callback output, can't die */ #define SHENG_IMPL sheng_co #define DEAD_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_co +#define DEAD_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_co +#define DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 isAcceptState64 +#endif #define STOP_AT_MATCH 0 #include "sheng_impl.h" #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif #undef STOP_AT_MATCH /* stop at match, can die */ #define SHENG_IMPL sheng_samd #define DEAD_FUNC isDeadState #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_samd +#define DEAD_FUNC32 isDeadState32 +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_samd +#define DEAD_FUNC64 isDeadState64 +#define ACCEPT_FUNC64 isAcceptState64 +#endif #define STOP_AT_MATCH 1 #include "sheng_impl.h" #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif #undef STOP_AT_MATCH /* stop at match, can't die */ #define SHENG_IMPL sheng_sam #define DEAD_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_sam +#define DEAD_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_sam +#define DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 isAcceptState64 +#endif #define STOP_AT_MATCH 1 #include "sheng_impl.h" #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif #undef 
STOP_AT_MATCH /* no match, can die */ #define SHENG_IMPL sheng_nmd #define DEAD_FUNC isDeadState #define ACCEPT_FUNC dummyFunc +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_nmd +#define DEAD_FUNC32 isDeadState32 +#define ACCEPT_FUNC32 dummyFunc +#define SHENG64_IMPL sheng64_nmd +#define DEAD_FUNC64 isDeadState64 +#define ACCEPT_FUNC64 dummyFunc +#endif #define STOP_AT_MATCH 0 #include "sheng_impl.h" #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif #undef STOP_AT_MATCH /* no match, can't die */ #define SHENG_IMPL sheng_nm #define DEAD_FUNC dummyFunc #define ACCEPT_FUNC dummyFunc +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_nm +#define DEAD_FUNC32 dummyFunc +#define ACCEPT_FUNC32 dummyFunc +#define SHENG64_IMPL sheng64_nm +#define DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 dummyFunc +#endif #define STOP_AT_MATCH 0 #include "sheng_impl.h" #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif #undef STOP_AT_MATCH /* @@ -144,6 +277,16 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC isAccelState #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_coda +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 isDeadState32 +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 isAccelState32 +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define NO_SHENG64_IMPL +#endif #define STOP_AT_MATCH 0 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -153,6 +296,16 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef NO_SHENG64_IMPL +#endif #undef STOP_AT_MATCH /* callback output, can die, not accelerated */ @@ -163,6 +316,20 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_cod +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 isDeadState32 +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_4_cod +#define INTERESTING_FUNC64 hasInterestingStates64 +#define INNER_DEAD_FUNC64 isDeadState64 +#define OUTER_DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 isAcceptState64 +#endif #define STOP_AT_MATCH 0 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -172,6 +339,20 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef INTERESTING_FUNC64 +#undef INNER_DEAD_FUNC64 +#undef OUTER_DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif #undef STOP_AT_MATCH /* callback output, can't die, 
accelerated */ @@ -182,6 +363,16 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC isAccelState #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_coa +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 isAccelState32 +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define NO_SHENG64_IMPL +#endif #define STOP_AT_MATCH 0 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -191,6 +382,16 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef NO_SHENG64_IMPL +#endif #undef STOP_AT_MATCH /* callback output, can't die, not accelerated */ @@ -201,6 +402,20 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_co +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_4_co +#define INTERESTING_FUNC64 hasInterestingStates64 +#define INNER_DEAD_FUNC64 dummyFunc +#define OUTER_DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 isAcceptState64 +#endif #define STOP_AT_MATCH 0 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -210,6 +425,20 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef INTERESTING_FUNC64 +#undef INNER_DEAD_FUNC64 +#undef OUTER_DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif #undef STOP_AT_MATCH /* stop at match, can die, accelerated */ @@ -220,6 +449,16 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC isAccelState #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_samda +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 isDeadState32 +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 isAccelState32 +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define NO_SHENG64_IMPL +#endif #define STOP_AT_MATCH 1 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -229,6 +468,16 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef NO_SHENG64_IMPL +#endif #undef STOP_AT_MATCH /* stop at match, can die, not accelerated */ @@ -239,6 +488,20 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_samd +#define INTERESTING_FUNC32 
hasInterestingStates32 +#define INNER_DEAD_FUNC32 isDeadState32 +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_4_samd +#define INTERESTING_FUNC64 hasInterestingStates64 +#define INNER_DEAD_FUNC64 isDeadState64 +#define OUTER_DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 isAcceptState64 +#endif #define STOP_AT_MATCH 1 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -248,6 +511,20 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef INTERESTING_FUNC64 +#undef INNER_DEAD_FUNC64 +#undef OUTER_DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif #undef STOP_AT_MATCH /* stop at match, can't die, accelerated */ @@ -258,6 +535,16 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC isAccelState #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_sama +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 isAccelState32 +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define NO_SHENG64_IMPL +#endif #define STOP_AT_MATCH 1 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -267,6 +554,16 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef NO_SHENG64_IMPL +#endif #undef STOP_AT_MATCH /* stop at match, can't die, not accelerated */ @@ -277,6 +574,20 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_sam +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_4_sam +#define INTERESTING_FUNC64 hasInterestingStates64 +#define INNER_DEAD_FUNC64 dummyFunc +#define OUTER_DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 isAcceptState64 +#endif #define STOP_AT_MATCH 1 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -286,6 +597,20 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef INTERESTING_FUNC64 +#undef INNER_DEAD_FUNC64 +#undef OUTER_DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif #undef STOP_AT_MATCH /* no-match have interesting func as dummy, and die/accel checks are outer */ @@ -298,6 +623,16 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC isAccelState #define ACCEPT_FUNC dummyFunc +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_nmda +#define 
INTERESTING_FUNC32 dummyFunc4 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 isDeadState32 +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 isAccelState32 +#define ACCEPT_FUNC32 dummyFunc +#define NO_SHENG64_IMPL +#endif #define STOP_AT_MATCH 0 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -307,6 +642,16 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef NO_SHENG64_IMPL +#endif #undef STOP_AT_MATCH /* no match, can die, not accelerated */ @@ -317,6 +662,20 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC dummyFunc +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_nmd +#define INTERESTING_FUNC32 dummyFunc4 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 isDeadState32 +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 dummyFunc +#define SHENG64_IMPL sheng64_4_nmd +#define INTERESTING_FUNC64 dummyFunc4 +#define INNER_DEAD_FUNC64 dummyFunc +#define OUTER_DEAD_FUNC64 isDeadState64 +#define ACCEPT_FUNC64 dummyFunc +#endif #define STOP_AT_MATCH 0 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -326,6 +685,20 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef INTERESTING_FUNC64 +#undef INNER_DEAD_FUNC64 +#undef OUTER_DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif #undef STOP_AT_MATCH /* there is no performance benefit in accelerating a no-match case that can't @@ -339,6 +712,20 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC dummyFunc +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_nm +#define INTERESTING_FUNC32 dummyFunc4 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 dummyFunc +#define SHENG64_IMPL sheng64_4_nm +#define INTERESTING_FUNC64 dummyFunc4 +#define INNER_DEAD_FUNC64 dummyFunc +#define OUTER_DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 dummyFunc +#endif #define STOP_AT_MATCH 0 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -348,6 +735,20 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef INTERESTING_FUNC64 +#undef INNER_DEAD_FUNC64 +#undef OUTER_DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif #undef STOP_AT_MATCH #endif // SHENG_DEFS_H diff --git a/src/nfa/sheng_impl.h b/src/nfa/sheng_impl.h index 9552fe15d..1fa5c8317 100644 --- a/src/nfa/sheng_impl.h +++ b/src/nfa/sheng_impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, 
are permitted provided that the following conditions are met: @@ -52,7 +52,7 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, } DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start)); - m128 cur_state = set16x8(*state); + m128 cur_state = set1_16x8(*state); const m128 *masks = s->shuffle_masks; while (likely(cur_buf != end)) { @@ -95,3 +95,127 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, *scan_end = cur_buf; return MO_CONTINUE_MATCHING; } + +#if defined(HAVE_AVX512VBMI) +static really_inline +char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, + const struct sheng32 *s, + u8 *const cached_accept_state, + ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFA execution in state %u\n", + *state & SHENG32_STATE_MASK); + const u8 *cur_buf = start; + if (DEAD_FUNC32(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start)); + + m512 cur_state = set1_64x8(*state); + const m512 *masks = s->succ_masks; + + while (likely(cur_buf != end)) { + const u8 c = *cur_buf; + const m512 succ_mask = masks[c]; + cur_state = vpermb512(cur_state, succ_mask); + const u8 tmp = movd512(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? c : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", tmp & SHENG32_STATE_MASK, + tmp & SHENG32_STATE_FLAG_MASK); + + if (unlikely(ACCEPT_FUNC32(tmp))) { + DEBUG_PRINTF("Accept state %u reached\n", tmp & SHENG32_STATE_MASK); + u64a match_offset = base_offset + (cur_buf - buf) + 1; + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (u64a)(cur_buf - start)); + *state = tmp; + *scan_end = cur_buf; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, tmp, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + cur_buf++; + } + *state = movd512(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} + +static really_inline +char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt, + const struct sheng64 *s, + u8 *const cached_accept_state, + ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFA execution in state %u\n", + *state & SHENG64_STATE_MASK); + const u8 *cur_buf = start; + if (DEAD_FUNC64(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start)); + + m512 cur_state = set1_64x8(*state); + const m512 *masks = s->succ_masks; + + while (likely(cur_buf != end)) { + const u8 c = *cur_buf; + const m512 succ_mask = masks[c]; + cur_state = vpermb512(cur_state, succ_mask); + const u8 tmp = movd512(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? 
c : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", tmp & SHENG64_STATE_MASK, + tmp & SHENG64_STATE_FLAG_MASK); + + if (unlikely(ACCEPT_FUNC64(tmp))) { + DEBUG_PRINTF("Accept state %u reached\n", tmp & SHENG64_STATE_MASK); + u64a match_offset = base_offset + (cur_buf - buf) + 1; + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (u64a)(cur_buf - start)); + *state = tmp; + *scan_end = cur_buf; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports64(s, cb, ctxt, tmp, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + cur_buf++; + } + *state = movd512(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} +#endif diff --git a/src/nfa/sheng_impl4.h b/src/nfa/sheng_impl4.h index 740322010..e5d3468f4 100644 --- a/src/nfa/sheng_impl4.h +++ b/src/nfa/sheng_impl4.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -86,7 +86,7 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, return MO_CONTINUE_MATCHING; } - m128 cur_state = set16x8(*state); + m128 cur_state = set1_16x8(*state); const m128 *masks = s->shuffle_masks; while (likely(end - cur_buf >= 4)) { @@ -282,3 +282,430 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, *scan_end = cur_buf; return MO_CONTINUE_MATCHING; } + +#if defined(HAVE_AVX512VBMI) +static really_inline +char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, + const struct sheng32 *s, + u8 *const cached_accept_state, + ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFAx4 execution in state %u\n", + *state & SHENG32_STATE_MASK); + const u8 *cur_buf = start; + const u8 *min_accel_dist = start; + base_offset++; + DEBUG_PRINTF("Scanning %llu bytes\n", (u64a)(end - start)); + + if (INNER_ACCEL_FUNC32(*state) || OUTER_ACCEL_FUNC32(*state)) { + DEBUG_PRINTF("Accel state reached @ 0\n"); + const union AccelAux *aaux = + get_accel32(s, *state & SHENG32_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf, end); + if (new_offset < cur_buf + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %zu bytes\n", new_offset - cur_buf); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %lli\n", (s64a)(cur_buf - start)); + } + if (INNER_DEAD_FUNC32(*state) || OUTER_DEAD_FUNC32(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + + m512 cur_state = set1_64x8(*state); + const m512 *masks = s->succ_masks; + + while (likely(end - cur_buf >= 4)) { + const u8 *b1 = cur_buf; + const u8 *b2 = cur_buf + 1; + const u8 *b3 = cur_buf + 2; + const u8 *b4 = cur_buf + 3; + const u8 c1 = *b1; + const u8 c2 = *b2; + const u8 c3 = *b3; + const u8 c4 = *b4; + + const m512 succ_mask1 = masks[c1]; + cur_state = vpermb512(cur_state, succ_mask1); + const u8 a1 = movd512(cur_state); + + const m512 
succ_mask2 = masks[c2]; + cur_state = vpermb512(cur_state, succ_mask2); + const u8 a2 = movd512(cur_state); + + const m512 succ_mask3 = masks[c3]; + cur_state = vpermb512(cur_state, succ_mask3); + const u8 a3 = movd512(cur_state); + + const m512 succ_mask4 = masks[c4]; + cur_state = vpermb512(cur_state, succ_mask4); + const u8 a4 = movd512(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a1 & SHENG32_STATE_MASK, + a1 & SHENG32_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c2, ourisprint(c2) ? c2 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a2 & SHENG32_STATE_MASK, + a2 & SHENG32_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c3, ourisprint(c3) ? c3 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a3 & SHENG32_STATE_MASK, + a3 & SHENG32_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c4, ourisprint(c4) ? c4 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a4 & SHENG32_STATE_MASK, + a4 & SHENG32_STATE_FLAG_MASK); + + if (unlikely(INTERESTING_FUNC32(a1, a2, a3, a4))) { + if (ACCEPT_FUNC32(a1)) { + u64a match_offset = base_offset + b1 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a1 & SHENG32_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b1 - start)); + *scan_end = b1; + *state = a1; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, a1, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC32(a2)) { + u64a match_offset = base_offset + b2 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a2 & SHENG32_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b2 - start)); + *scan_end = b2; + *state = a2; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, a2, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC32(a3)) { + u64a match_offset = base_offset + b3 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a3 & SHENG32_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b3 - start)); + *scan_end = b3; + *state = a3; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, a3, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC32(a4)) { + u64a match_offset = base_offset + b4 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a4 & SHENG32_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b4 - start)); + *scan_end = b4; + *state = a4; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, a4, 
match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (INNER_DEAD_FUNC32(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(b4 - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + } + if (cur_buf > min_accel_dist && INNER_ACCEL_FUNC32(a4)) { + DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf)); + const union AccelAux *aaux = + get_accel32(s, a4 & SHENG32_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf + 4, end); + if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %llu bytes\n", + (u64a)(new_offset - cur_buf - 4)); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf)); + continue; + } + } + if (OUTER_DEAD_FUNC32(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(cur_buf - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + }; + if (cur_buf > min_accel_dist && OUTER_ACCEL_FUNC32(a4)) { + DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf)); + const union AccelAux *aaux = + get_accel32(s, a4 & SHENG32_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf + 4, end); + if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %llu bytes\n", + (u64a)(new_offset - cur_buf - 4)); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf)); + continue; + }; + cur_buf += 4; + } + *state = movd512(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} + +#ifndef NO_SHENG64_IMPL +static really_inline +char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt, + const struct sheng64 *s, + u8 *const cached_accept_state, + ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFAx4 execution in state %u\n", + *state & SHENG64_STATE_MASK); + const u8 *cur_buf = start; + base_offset++; + DEBUG_PRINTF("Scanning %llu bytes\n", (u64a)(end - start)); + + if (INNER_DEAD_FUNC64(*state) || OUTER_DEAD_FUNC64(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + + m512 cur_state = set1_64x8(*state); + const m512 *masks = s->succ_masks; + + while (likely(end - cur_buf >= 4)) { + const u8 *b1 = cur_buf; + const u8 *b2 = cur_buf + 1; + const u8 *b3 = cur_buf + 2; + const u8 *b4 = cur_buf + 3; + const u8 c1 = *b1; + const u8 c2 = *b2; + const u8 c3 = *b3; + const u8 c4 = *b4; + + const m512 succ_mask1 = masks[c1]; + cur_state = vpermb512(cur_state, succ_mask1); + const u8 a1 = movd512(cur_state); + + const m512 succ_mask2 = masks[c2]; + cur_state = vpermb512(cur_state, succ_mask2); + const u8 a2 = movd512(cur_state); + + const m512 succ_mask3 = masks[c3]; + cur_state = vpermb512(cur_state, succ_mask3); + const u8 a3 = movd512(cur_state); + + const m512 succ_mask4 = masks[c4]; + cur_state = vpermb512(cur_state, succ_mask4); + const u8 a4 = movd512(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? 
c1 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a1 & SHENG64_STATE_MASK, + a1 & SHENG64_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c2, ourisprint(c2) ? c2 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a2 & SHENG64_STATE_MASK, + a2 & SHENG64_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c3, ourisprint(c3) ? c3 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a3 & SHENG64_STATE_MASK, + a3 & SHENG64_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c4, ourisprint(c4) ? c4 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a4 & SHENG64_STATE_MASK, + a4 & SHENG64_STATE_FLAG_MASK); + + if (unlikely(INTERESTING_FUNC64(a1, a2, a3, a4))) { + if (ACCEPT_FUNC64(a1)) { + u64a match_offset = base_offset + b1 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a1 & SHENG64_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b1 - start)); + *scan_end = b1; + *state = a1; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports64(s, cb, ctxt, a1, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC64(a2)) { + u64a match_offset = base_offset + b2 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a2 & SHENG64_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b2 - start)); + *scan_end = b2; + *state = a2; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports64(s, cb, ctxt, a2, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC64(a3)) { + u64a match_offset = base_offset + b3 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a3 & SHENG64_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b3 - start)); + *scan_end = b3; + *state = a3; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports64(s, cb, ctxt, a3, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC64(a4)) { + u64a match_offset = base_offset + b4 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a4 & SHENG64_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b4 - start)); + *scan_end = b4; + *state = a4; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports64(s, cb, ctxt, a4, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (INNER_DEAD_FUNC64(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(b4 - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + } + } + if (OUTER_DEAD_FUNC64(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(cur_buf - buf)); + *scan_end = end; + *state = 
a4; + return MO_CONTINUE_MATCHING; + } + cur_buf += 4; + } + *state = movd512(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} +#endif // !NO_SHENG64_IMPL +#endif diff --git a/src/nfa/sheng_internal.h b/src/nfa/sheng_internal.h index ff843ebee..98536886c 100644 --- a/src/nfa/sheng_internal.h +++ b/src/nfa/sheng_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,6 +38,17 @@ #define SHENG_STATE_MASK 0xF #define SHENG_STATE_FLAG_MASK 0x70 +#define SHENG32_STATE_ACCEPT 0x20 +#define SHENG32_STATE_DEAD 0x40 +#define SHENG32_STATE_ACCEL 0x80 +#define SHENG32_STATE_MASK 0x1F +#define SHENG32_STATE_FLAG_MASK 0xE0 + +#define SHENG64_STATE_ACCEPT 0x40 +#define SHENG64_STATE_DEAD 0x80 +#define SHENG64_STATE_MASK 0x3F +#define SHENG64_STATE_FLAG_MASK 0xC0 + #define SHENG_FLAG_SINGLE_REPORT 0x1 #define SHENG_FLAG_CAN_DIE 0x2 #define SHENG_FLAG_HAS_ACCEL 0x4 @@ -67,4 +78,30 @@ struct sheng { ReportID report; }; +struct sheng32 { + m512 succ_masks[256]; + u32 length; + u32 aux_offset; + u32 report_offset; + u32 accel_offset; + u8 n_states; + u8 anchored; + u8 floating; + u8 flags; + ReportID report; +}; + +struct sheng64 { + m512 succ_masks[256]; + u32 length; + u32 aux_offset; + u32 report_offset; + u32 accel_offset; + u8 n_states; + u8 anchored; + u8 floating; + u8 flags; + ReportID report; +}; + #endif /* SHENG_INTERNAL_H_ */ diff --git a/src/nfa/shengcompile.cpp b/src/nfa/shengcompile.cpp index c4094cedc..aa3537839 100644 --- a/src/nfa/shengcompile.cpp +++ b/src/nfa/shengcompile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -46,7 +46,6 @@ #include "sheng_internal.h" #include "ue2common.h" #include "util/compile_context.h" -#include "util/make_unique.h" #include "util/verify_types.h" #include "util/simd_types.h" @@ -179,7 +178,7 @@ size_t raw_report_info_impl::size() const { void raw_report_info_impl::fillReportLists(NFA *n, size_t base_offset, vector &ro) const { for (const auto &reps : rl) { - ro.push_back(base_offset); + ro.emplace_back(base_offset); report_list *p = (report_list *)((char *)n + base_offset); @@ -203,44 +202,44 @@ unique_ptr sheng_build_strat::gatherReports( const bool remap_reports = has_managed_reports(rdfa.kind); - auto ri = ue2::make_unique(); + auto ri = std::make_unique(); map rev; for (const dstate &s : rdfa.states) { if (s.reports.empty()) { - reports.push_back(MO_INVALID_IDX); + reports.emplace_back(MO_INVALID_IDX); continue; } raw_report_list rrl(s.reports, rm, remap_reports); DEBUG_PRINTF("non empty r\n"); if (rev.find(rrl) != rev.end()) { - reports.push_back(rev[rrl]); + reports.emplace_back(rev[rrl]); } else { DEBUG_PRINTF("adding to rl %zu\n", ri->size()); rev[rrl] = ri->size(); - reports.push_back(ri->size()); - ri->rl.push_back(rrl); + reports.emplace_back(ri->size()); + ri->rl.emplace_back(rrl); } } for (const dstate &s : rdfa.states) { if (s.reports_eod.empty()) { - reports_eod.push_back(MO_INVALID_IDX); + reports_eod.emplace_back(MO_INVALID_IDX); continue; } DEBUG_PRINTF("non empty r eod\n"); raw_report_list rrl(s.reports_eod, rm, remap_reports); if (rev.find(rrl) != rev.end()) { - 
reports_eod.push_back(rev[rrl]); + reports_eod.emplace_back(rev[rrl]); continue; } DEBUG_PRINTF("adding to rl eod %zu\n", s.reports_eod.size()); rev[rrl] = ri->size(); - reports_eod.push_back(ri->size()); - ri->rl.push_back(rrl); + reports_eod.emplace_back(ri->size()); + ri->rl.emplace_back(rrl); } assert(!ri->rl.empty()); /* all components should be able to generate @@ -271,7 +270,7 @@ unique_ptr sheng_build_strat::gatherReports( *isSingleReport = 0; } - return move(ri); + return std::move(ri); } u32 sheng_build_strat::max_allowed_offset_accel() const { @@ -301,6 +300,28 @@ void dumpShuffleMask(const u8 chr, const u8 *buf, unsigned sz) { } DEBUG_PRINTF("chr %3u: %s\n", chr, o.str().c_str()); } + +static really_inline +void dumpShuffleMask32(const u8 chr, const u8 *buf, unsigned sz) { + stringstream o; + + for (unsigned i = 0; i < sz; i++) { + o.width(2); + o << (buf[i] & SHENG32_STATE_MASK) << " "; + } + DEBUG_PRINTF("chr %3u: %s\n", chr, o.str().c_str()); +} + +static really_inline +void dumpShuffleMask64(const u8 chr, const u8 *buf, unsigned sz) { + stringstream o; + + for (unsigned i = 0; i < sz; i++) { + o.width(2); + o << (buf[i] & SHENG64_STATE_MASK) << " "; + } + DEBUG_PRINTF("chr %3u: %s\n", chr, o.str().c_str()); +} #endif static @@ -311,9 +332,16 @@ void fillAccelOut(const map &accel_escape_info, } } +template static -u8 getShengState(dstate &state, dfa_info &info, - map &accelInfo) { +u8 getShengState(UNUSED dstate &state, UNUSED dfa_info &info, + UNUSED map &accelInfo) { + return 0; +} + +template <> +u8 getShengState(dstate &state, dfa_info &info, + map &accelInfo) { u8 s = state.impl_id; if (!state.reports.empty()) { s |= SHENG_STATE_ACCEPT; @@ -327,11 +355,41 @@ u8 getShengState(dstate &state, dfa_info &info, return s; } +template <> +u8 getShengState(dstate &state, dfa_info &info, + map &accelInfo) { + u8 s = state.impl_id; + if (!state.reports.empty()) { + s |= SHENG32_STATE_ACCEPT; + } + if (info.isDead(state)) { + s |= SHENG32_STATE_DEAD; + } + if (accelInfo.find(info.raw_id(state.impl_id)) != accelInfo.end()) { + s |= SHENG32_STATE_ACCEL; + } + return s; +} + +template <> +u8 getShengState(dstate &state, dfa_info &info, + UNUSED map &accelInfo) { + u8 s = state.impl_id; + if (!state.reports.empty()) { + s |= SHENG64_STATE_ACCEPT; + } + if (info.isDead(state)) { + s |= SHENG64_STATE_DEAD; + } + return s; +} + +template static void fillAccelAux(struct NFA *n, dfa_info &info, map &accelInfo) { DEBUG_PRINTF("Filling accel aux structures\n"); - sheng *s = (sheng *)getMutableImplNfa(n); + T *s = (T *)getMutableImplNfa(n); u32 offset = s->accel_offset; for (dstate_id_t i = 0; i < info.size(); i++) { @@ -349,11 +407,21 @@ void fillAccelAux(struct NFA *n, dfa_info &info, } } +template static -void populateBasicInfo(struct NFA *n, dfa_info &info, - map &accelInfo, u32 aux_offset, - u32 report_offset, u32 accel_offset, u32 total_size, - u32 dfa_size) { +void populateBasicInfo(UNUSED struct NFA *n, UNUSED dfa_info &info, + UNUSED map &accelInfo, + UNUSED u32 aux_offset, UNUSED u32 report_offset, + UNUSED u32 accel_offset, UNUSED u32 total_size, + UNUSED u32 dfa_size) { +} + +template <> +void populateBasicInfo(struct NFA *n, dfa_info &info, + map &accelInfo, + u32 aux_offset, u32 report_offset, + u32 accel_offset, u32 total_size, + u32 dfa_size) { n->length = total_size; n->scratchStateSize = 1; n->streamStateSize = 1; @@ -369,14 +437,65 @@ void populateBasicInfo(struct NFA *n, dfa_info &info, s->length = dfa_size; s->flags |= info.can_die ? 
SHENG_FLAG_CAN_DIE : 0; - s->anchored = getShengState(info.anchored, info, accelInfo); - s->floating = getShengState(info.floating, info, accelInfo); + s->anchored = getShengState(info.anchored, info, accelInfo); + s->floating = getShengState(info.floating, info, accelInfo); +} + +template <> +void populateBasicInfo(struct NFA *n, dfa_info &info, + map &accelInfo, + u32 aux_offset, u32 report_offset, + u32 accel_offset, u32 total_size, + u32 dfa_size) { + n->length = total_size; + n->scratchStateSize = 1; + n->streamStateSize = 1; + n->nPositions = info.size(); + n->type = SHENG_NFA_32; + n->flags |= info.raw.hasEodReports() ? NFA_ACCEPTS_EOD : 0; + + sheng32 *s = (sheng32 *)getMutableImplNfa(n); + s->aux_offset = aux_offset; + s->report_offset = report_offset; + s->accel_offset = accel_offset; + s->n_states = info.size(); + s->length = dfa_size; + s->flags |= info.can_die ? SHENG_FLAG_CAN_DIE : 0; + + s->anchored = getShengState(info.anchored, info, accelInfo); + s->floating = getShengState(info.floating, info, accelInfo); } +template <> +void populateBasicInfo(struct NFA *n, dfa_info &info, + map &accelInfo, + u32 aux_offset, u32 report_offset, + u32 accel_offset, u32 total_size, + u32 dfa_size) { + n->length = total_size; + n->scratchStateSize = 1; + n->streamStateSize = 1; + n->nPositions = info.size(); + n->type = SHENG_NFA_64; + n->flags |= info.raw.hasEodReports() ? NFA_ACCEPTS_EOD : 0; + + sheng64 *s = (sheng64 *)getMutableImplNfa(n); + s->aux_offset = aux_offset; + s->report_offset = report_offset; + s->accel_offset = accel_offset; + s->n_states = info.size(); + s->length = dfa_size; + s->flags |= info.can_die ? SHENG_FLAG_CAN_DIE : 0; + + s->anchored = getShengState(info.anchored, info, accelInfo); + s->floating = getShengState(info.floating, info, accelInfo); +} + +template static void fillTops(NFA *n, dfa_info &info, dstate_id_t id, map &accelInfo) { - sheng *s = (sheng *)getMutableImplNfa(n); + T *s = (T *)getMutableImplNfa(n); u32 aux_base = s->aux_offset; DEBUG_PRINTF("Filling tops for state %u\n", id); @@ -393,13 +512,14 @@ void fillTops(NFA *n, dfa_info &info, dstate_id_t id, DEBUG_PRINTF("Top transition for state %u: %u\n", id, top_state.impl_id); - aux->top = getShengState(top_state, info, accelInfo); + aux->top = getShengState(top_state, info, accelInfo); } +template static void fillAux(NFA *n, dfa_info &info, dstate_id_t id, vector &reports, vector &reports_eod, vector &report_offsets) { - sheng *s = (sheng *)getMutableImplNfa(n); + T *s = (T *)getMutableImplNfa(n); u32 aux_base = s->aux_offset; auto raw_id = info.raw_id(id); @@ -419,60 +539,97 @@ void fillAux(NFA *n, dfa_info &info, dstate_id_t id, vector &reports, DEBUG_PRINTF("EOD report list offset: %u\n", aux->accept_eod); } +template static void fillSingleReport(NFA *n, ReportID r_id) { - sheng *s = (sheng *)getMutableImplNfa(n); + T *s = (T *)getMutableImplNfa(n); DEBUG_PRINTF("Single report ID: %u\n", r_id); s->report = r_id; s->flags |= SHENG_FLAG_SINGLE_REPORT; } +template static -void createShuffleMasks(sheng *s, dfa_info &info, - map &accelInfo) { +bool createShuffleMasks(UNUSED T *s, UNUSED dfa_info &info, + UNUSED map &accelInfo) { + return true; +} + +template <> +bool createShuffleMasks(sheng *s, dfa_info &info, + map &accelInfo) { for (u16 chr = 0; chr < 256; chr++) { u8 buf[16] = {0}; for (dstate_id_t idx = 0; idx < info.size(); idx++) { auto &succ_state = info.next(idx, chr); - buf[idx] = getShengState(succ_state, info, accelInfo); + buf[idx] = getShengState(succ_state, info, accelInfo); } #ifdef 
DEBUG dumpShuffleMask(chr, buf, sizeof(buf)); #endif memcpy(&s->shuffle_masks[chr], buf, sizeof(m128)); } + return true; } -bool has_accel_sheng(const NFA *) { - return true; /* consider the sheng region as accelerated */ -} +template <> +bool createShuffleMasks(sheng32 *s, dfa_info &info, + map &accelInfo) { + for (u16 chr = 0; chr < 256; chr++) { + u8 buf[64] = {0}; -bytecode_ptr shengCompile(raw_dfa &raw, const CompileContext &cc, - const ReportManager &rm, bool only_accel_init, - set *accel_states) { - if (!cc.grey.allowSheng) { - DEBUG_PRINTF("Sheng is not allowed!\n"); - return nullptr; - } + assert(info.size() <= 32); + for (dstate_id_t idx = 0; idx < info.size(); idx++) { + auto &succ_state = info.next(idx, chr); - sheng_build_strat strat(raw, rm, only_accel_init); - dfa_info info(strat); + buf[idx] = getShengState(succ_state, info, accelInfo); + buf[32 + idx] = buf[idx]; + } +#ifdef DEBUG + dumpShuffleMask32(chr, buf, sizeof(buf)); +#endif + memcpy(&s->succ_masks[chr], buf, sizeof(m512)); + } + return true; +} - DEBUG_PRINTF("Trying to compile a %zu state Sheng\n", raw.states.size()); +template <> +bool createShuffleMasks(sheng64 *s, dfa_info &info, + map &accelInfo) { + for (u16 chr = 0; chr < 256; chr++) { + u8 buf[64] = {0}; - DEBUG_PRINTF("Anchored start state id: %u, floating start state id: %u\n", - raw.start_anchored, raw.start_floating); + assert(info.size() <= 64); + for (dstate_id_t idx = 0; idx < info.size(); idx++) { + auto &succ_state = info.next(idx, chr); - DEBUG_PRINTF("This DFA %s die so effective number of states is %zu\n", - info.can_die ? "can" : "cannot", info.size()); - if (info.size() > 16) { - DEBUG_PRINTF("Too many states\n"); - return nullptr; + if (accelInfo.find(info.raw_id(succ_state.impl_id)) + != accelInfo.end()) { + return false; + } + buf[idx] = getShengState(succ_state, info, accelInfo); + } +#ifdef DEBUG + dumpShuffleMask64(chr, buf, sizeof(buf)); +#endif + memcpy(&s->succ_masks[chr], buf, sizeof(m512)); } + return true; +} + +bool has_accel_sheng(const NFA *) { + return true; /* consider the sheng region as accelerated */ +} +template +static +bytecode_ptr shengCompile_int(raw_dfa &raw, const CompileContext &cc, + set *accel_states, + sheng_build_strat &strat, + dfa_info &info) { if (!cc.streaming) { /* TODO: work out if we can do the strip in streaming * mode with our semantics */ raw.stripExtraEodReports(); @@ -487,7 +644,7 @@ bytecode_ptr shengCompile(raw_dfa &raw, const CompileContext &cc, DEBUG_PRINTF("Anchored start state: %u, floating start state: %u\n", info.anchored.impl_id, info.floating.impl_id); - u32 nfa_size = ROUNDUP_16(sizeof(NFA) + sizeof(sheng)); + u32 nfa_size = ROUNDUP_16(sizeof(NFA) + sizeof(T)); vector reports, eod_reports, report_offsets; u8 isSingle = 0; ReportID single_report = 0; @@ -509,29 +666,128 @@ bytecode_ptr shengCompile(raw_dfa &raw, const CompileContext &cc, auto nfa = make_zeroed_bytecode_ptr(total_size); - populateBasicInfo(nfa.get(), info, accelInfo, nfa_size, reports_offset, - accel_offset, total_size, total_size - sizeof(NFA)); + populateBasicInfo(nfa.get(), info, accelInfo, nfa_size, + reports_offset, accel_offset, total_size, + total_size - sizeof(NFA)); DEBUG_PRINTF("Setting up aux and report structures\n"); ri->fillReportLists(nfa.get(), reports_offset, report_offsets); for (dstate_id_t idx = 0; idx < info.size(); idx++) { - fillTops(nfa.get(), info, idx, accelInfo); - fillAux(nfa.get(), info, idx, reports, eod_reports, report_offsets); + fillTops(nfa.get(), info, idx, accelInfo); + fillAux(nfa.get(), 
info, idx, reports, eod_reports, + report_offsets); } if (isSingle) { - fillSingleReport(nfa.get(), single_report); + fillSingleReport(nfa.get(), single_report); } - fillAccelAux(nfa.get(), info, accelInfo); + fillAccelAux(nfa.get(), info, accelInfo); if (accel_states) { fillAccelOut(accelInfo, accel_states); } - createShuffleMasks((sheng *)getMutableImplNfa(nfa.get()), info, accelInfo); + if (!createShuffleMasks((T *)getMutableImplNfa(nfa.get()), info, accelInfo)) { + return nullptr; + } + + return nfa; +} + +bytecode_ptr shengCompile(raw_dfa &raw, const CompileContext &cc, + const ReportManager &rm, bool only_accel_init, + set *accel_states) { + if (!cc.grey.allowSheng) { + DEBUG_PRINTF("Sheng is not allowed!\n"); + return nullptr; + } + + sheng_build_strat strat(raw, rm, only_accel_init); + dfa_info info(strat); + + DEBUG_PRINTF("Trying to compile a %zu state Sheng\n", raw.states.size()); + + DEBUG_PRINTF("Anchored start state id: %u, floating start state id: %u\n", + raw.start_anchored, raw.start_floating); + + DEBUG_PRINTF("This DFA %s die so effective number of states is %zu\n", + info.can_die ? "can" : "cannot", info.size()); + if (info.size() > 16) { + DEBUG_PRINTF("Too many states\n"); + return nullptr; + } + return shengCompile_int(raw, cc, accel_states, strat, info); +} + +bytecode_ptr sheng32Compile(raw_dfa &raw, const CompileContext &cc, + const ReportManager &rm, bool only_accel_init, + set *accel_states) { + if (!cc.grey.allowSheng) { + DEBUG_PRINTF("Sheng is not allowed!\n"); + return nullptr; + } + + if (!cc.target_info.has_avx512vbmi()) { + DEBUG_PRINTF("Sheng32 failed, no HS_CPU_FEATURES_AVX512VBMI!\n"); + return nullptr; + } + + sheng_build_strat strat(raw, rm, only_accel_init); + dfa_info info(strat); + + DEBUG_PRINTF("Trying to compile a %zu state Sheng\n", raw.states.size()); + + DEBUG_PRINTF("Anchored start state id: %u, floating start state id: %u\n", + raw.start_anchored, raw.start_floating); + + DEBUG_PRINTF("This DFA %s die so effective number of states is %zu\n", + info.can_die ? "can" : "cannot", info.size()); + assert(info.size() > 16); + if (info.size() > 32) { + DEBUG_PRINTF("Too many states\n"); + return nullptr; + } + + return shengCompile_int(raw, cc, accel_states, strat, info); +} + +bytecode_ptr sheng64Compile(raw_dfa &raw, const CompileContext &cc, + const ReportManager &rm, bool only_accel_init, + set *accel_states) { + if (!cc.grey.allowSheng) { + DEBUG_PRINTF("Sheng is not allowed!\n"); + return nullptr; + } + + if (!cc.target_info.has_avx512vbmi()) { + DEBUG_PRINTF("Sheng64 failed, no HS_CPU_FEATURES_AVX512VBMI!\n"); + return nullptr; + } + + sheng_build_strat strat(raw, rm, only_accel_init); + dfa_info info(strat); + + DEBUG_PRINTF("Trying to compile a %zu state Sheng\n", raw.states.size()); + + DEBUG_PRINTF("Anchored start state id: %u, floating start state id: %u\n", + raw.start_anchored, raw.start_floating); + + DEBUG_PRINTF("This DFA %s die so effective number of states is %zu\n", + info.can_die ? 
"can" : "cannot", info.size()); + assert(info.size() > 32); + if (info.size() > 64) { + DEBUG_PRINTF("Too many states\n"); + return nullptr; + } + vector old_states; + old_states = info.states; + auto nfa = shengCompile_int(raw, cc, accel_states, strat, info); + if (!nfa) { + info.states = old_states; + } return nfa; } diff --git a/src/nfa/shengcompile.h b/src/nfa/shengcompile.h index d795b3623..256f4a4e5 100644 --- a/src/nfa/shengcompile.h +++ b/src/nfa/shengcompile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -71,6 +71,14 @@ bytecode_ptr shengCompile(raw_dfa &raw, const CompileContext &cc, const ReportManager &rm, bool only_accel_init, std::set *accel_states = nullptr); +bytecode_ptr sheng32Compile(raw_dfa &raw, const CompileContext &cc, + const ReportManager &rm, bool only_accel_init, + std::set *accel_states = nullptr); + +bytecode_ptr sheng64Compile(raw_dfa &raw, const CompileContext &cc, + const ReportManager &rm, bool only_accel_init, + std::set *accel_states = nullptr); + struct sheng_escape_info { CharReach outs; CharReach outs2_single; diff --git a/src/nfa/shengdump.cpp b/src/nfa/shengdump.cpp index 99fda76fd..6eb784077 100644 --- a/src/nfa/shengdump.cpp +++ b/src/nfa/shengdump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -51,7 +51,7 @@ namespace ue2 { static const sstate_aux *get_aux(const NFA *n, dstate_id_t i) { - assert(n && isShengType(n->type)); + assert(n && isSheng16Type(n->type)); const sheng *s = (const sheng *)getImplNfa(n); const sstate_aux *aux_base = @@ -64,6 +64,36 @@ const sstate_aux *get_aux(const NFA *n, dstate_id_t i) { return aux; } +static +const sstate_aux *get_aux32(const NFA *n, dstate_id_t i) { + assert(n && isSheng32Type(n->type)); + + const sheng32 *s = (const sheng32 *)getImplNfa(n); + const sstate_aux *aux_base = + (const sstate_aux *)((const char *)n + s->aux_offset); + + const sstate_aux *aux = aux_base + i; + + assert((const char *)aux < (const char *)s + s->length); + + return aux; +} + +static +const sstate_aux *get_aux64(const NFA *n, dstate_id_t i) { + assert(n && isSheng64Type(n->type)); + + const sheng64 *s = (const sheng64 *)getImplNfa(n); + const sstate_aux *aux_base = + (const sstate_aux *)((const char *)n + s->aux_offset); + + const sstate_aux *aux = aux_base + i; + + assert((const char *)aux < (const char *)s + s->length); + + return aux; +} + static void dumpHeader(FILE *f, const sheng *s) { fprintf(f, "number of states: %u, DFA engine size: %u\n", s->n_states, @@ -79,6 +109,36 @@ void dumpHeader(FILE *f, const sheng *s) { !!(s->flags & SHENG_FLAG_SINGLE_REPORT)); } +static +void dumpHeader32(FILE *f, const sheng32 *s) { + fprintf(f, "number of states: %u, DFA engine size: %u\n", s->n_states, + s->length); + fprintf(f, "aux base offset: %u, reports base offset: %u, " + "accel offset: %u\n", + s->aux_offset, s->report_offset, s->accel_offset); + fprintf(f, "anchored start state: %u, floating start state: %u\n", + s->anchored & SHENG32_STATE_MASK, s->floating & SHENG32_STATE_MASK); + fprintf(f, "has accel: %u can die: %u single report: %u\n", + !!(s->flags & SHENG_FLAG_HAS_ACCEL), + !!(s->flags & 
SHENG_FLAG_CAN_DIE), + !!(s->flags & SHENG_FLAG_SINGLE_REPORT)); +} + +static +void dumpHeader64(FILE *f, const sheng64 *s) { + fprintf(f, "number of states: %u, DFA engine size: %u\n", s->n_states, + s->length); + fprintf(f, "aux base offset: %u, reports base offset: %u, " + "accel offset: %u\n", + s->aux_offset, s->report_offset, s->accel_offset); + fprintf(f, "anchored start state: %u, floating start state: %u\n", + s->anchored & SHENG64_STATE_MASK, s->floating & SHENG64_STATE_MASK); + fprintf(f, "has accel: %u can die: %u single report: %u\n", + !!(s->flags & SHENG_FLAG_HAS_ACCEL), + !!(s->flags & SHENG_FLAG_CAN_DIE), + !!(s->flags & SHENG_FLAG_SINGLE_REPORT)); +} + static void dumpAux(FILE *f, u32 state, const sstate_aux *aux) { fprintf(f, "state id: %u, reports offset: %u, EOD reports offset: %u, " @@ -87,6 +147,22 @@ void dumpAux(FILE *f, u32 state, const sstate_aux *aux) { aux->top & SHENG_STATE_MASK); } +static +void dumpAux32(FILE *f, u32 state, const sstate_aux *aux) { + fprintf(f, "state id: %u, reports offset: %u, EOD reports offset: %u, " + "accel offset: %u, top: %u\n", + state, aux->accept, aux->accept_eod, aux->accel, + aux->top & SHENG32_STATE_MASK); +} + +static +void dumpAux64(FILE *f, u32 state, const sstate_aux *aux) { + fprintf(f, "state id: %u, reports offset: %u, EOD reports offset: %u, " + "accel offset: %u, top: %u\n", + state, aux->accept, aux->accept_eod, aux->accel, + aux->top & SHENG64_STATE_MASK); +} + static void dumpReports(FILE *f, const report_list *rl) { fprintf(f, "reports count: %u\n", rl->count); @@ -115,6 +191,46 @@ void dumpMasks(FILE *f, const sheng *s) { } } +static +void dumpMasks32(FILE *f, const sheng32 *s) { + for (u32 chr = 0; chr < 256; chr++) { + u8 buf[64]; + m512 succ_mask = s->succ_masks[chr]; + memcpy(buf, &succ_mask, sizeof(m512)); + + fprintf(f, "%3u: ", chr); + for (u32 pos = 0; pos < 64; pos++) { + u8 c = buf[pos]; + if (c & SHENG32_STATE_FLAG_MASK) { + fprintf(f, "%2u* ", c & SHENG32_STATE_MASK); + } else { + fprintf(f, "%2u ", c & SHENG32_STATE_MASK); + } + } + fprintf(f, "\n"); + } +} + +static +void dumpMasks64(FILE *f, const sheng64 *s) { + for (u32 chr = 0; chr < 256; chr++) { + u8 buf[64]; + m512 succ_mask = s->succ_masks[chr]; + memcpy(buf, &succ_mask, sizeof(m512)); + + fprintf(f, "%3u: ", chr); + for (u32 pos = 0; pos < 64; pos++) { + u8 c = buf[pos]; + if (c & SHENG64_STATE_FLAG_MASK) { + fprintf(f, "%2u* ", c & SHENG64_STATE_MASK); + } else { + fprintf(f, "%2u ", c & SHENG64_STATE_MASK); + } + } + fprintf(f, "\n"); + } +} + static void nfaExecSheng_dumpText(const NFA *nfa, FILE *f) { assert(nfa->type == SHENG_NFA); @@ -153,6 +269,82 @@ void nfaExecSheng_dumpText(const NFA *nfa, FILE *f) { fprintf(f, "\n"); } +static +void nfaExecSheng32_dumpText(const NFA *nfa, FILE *f) { + assert(nfa->type == SHENG_NFA_32); + const sheng32 *s = (const sheng32 *)getImplNfa(nfa); + + fprintf(f, "sheng32 DFA\n"); + dumpHeader32(f, s); + + for (u32 state = 0; state < s->n_states; state++) { + const sstate_aux *aux = get_aux32(nfa, state); + dumpAux32(f, state, aux); + if (aux->accept) { + fprintf(f, "report list:\n"); + const report_list *rl = + (const report_list *)((const char *)nfa + aux->accept); + dumpReports(f, rl); + } + if (aux->accept_eod) { + fprintf(f, "EOD report list:\n"); + const report_list *rl = + (const report_list *)((const char *)nfa + aux->accept_eod); + dumpReports(f, rl); + } + if (aux->accel) { + fprintf(f, "accel:\n"); + const AccelAux *accel = + (const AccelAux *)((const char *)nfa + aux->accel); + dumpAccelInfo(f, 
*accel); + } + } + + fprintf(f, "\n"); + + dumpMasks32(f, s); + + fprintf(f, "\n"); +} + +static +void nfaExecSheng64_dumpText(const NFA *nfa, FILE *f) { + assert(nfa->type == SHENG_NFA_64); + const sheng64 *s = (const sheng64 *)getImplNfa(nfa); + + fprintf(f, "sheng64 DFA\n"); + dumpHeader64(f, s); + + for (u32 state = 0; state < s->n_states; state++) { + const sstate_aux *aux = get_aux64(nfa, state); + dumpAux64(f, state, aux); + if (aux->accept) { + fprintf(f, "report list:\n"); + const report_list *rl = + (const report_list *)((const char *)nfa + aux->accept); + dumpReports(f, rl); + } + if (aux->accept_eod) { + fprintf(f, "EOD report list:\n"); + const report_list *rl = + (const report_list *)((const char *)nfa + aux->accept_eod); + dumpReports(f, rl); + } + if (aux->accel) { + fprintf(f, "accel:\n"); + const AccelAux *accel = + (const AccelAux *)((const char *)nfa + aux->accel); + dumpAccelInfo(f, *accel); + } + } + + fprintf(f, "\n"); + + dumpMasks64(f, s); + + fprintf(f, "\n"); +} + static void dumpDotPreambleDfa(FILE *f) { dumpDotPreamble(f); @@ -163,8 +355,14 @@ void dumpDotPreambleDfa(FILE *f) { fprintf(f, "0 [style=invis];\n"); } +template static -void describeNode(const NFA *n, const sheng *s, u16 i, FILE *f) { +void describeNode(UNUSED const NFA *n, UNUSED const T *s, UNUSED u16 i, + UNUSED FILE *f) { +} + +template <> +void describeNode(const NFA *n, const sheng *s, u16 i, FILE *f) { const sstate_aux *aux = get_aux(n, i); fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, " @@ -193,6 +391,66 @@ void describeNode(const NFA *n, const sheng *s, u16 i, FILE *f) { } } +template <> +void describeNode(const NFA *n, const sheng32 *s, u16 i, FILE *f) { + const sstate_aux *aux = get_aux32(n, i); + + fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, " + "label = \"%u\" ]; \n", + i, i); + + if (aux->accept_eod) { + fprintf(f, "%u [ color = darkorchid ];\n", i); + } + + if (aux->accept) { + fprintf(f, "%u [ shape = doublecircle ];\n", i); + } + + if (aux->top && (aux->top & SHENG32_STATE_MASK) != i) { + fprintf(f, "%u -> %u [color = darkgoldenrod weight=0.1 ]\n", i, + aux->top & SHENG32_STATE_MASK); + } + + if (i == (s->anchored & SHENG32_STATE_MASK)) { + fprintf(f, "STARTA -> %u [color = blue ]\n", i); + } + + if (i == (s->floating & SHENG32_STATE_MASK)) { + fprintf(f, "STARTF -> %u [color = red ]\n", i); + } +} + +template <> +void describeNode(const NFA *n, const sheng64 *s, u16 i, FILE *f) { + const sstate_aux *aux = get_aux64(n, i); + + fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, " + "label = \"%u\" ]; \n", + i, i); + + if (aux->accept_eod) { + fprintf(f, "%u [ color = darkorchid ];\n", i); + } + + if (aux->accept) { + fprintf(f, "%u [ shape = doublecircle ];\n", i); + } + + if (aux->top && (aux->top & SHENG64_STATE_MASK) != i) { + fprintf(f, "%u -> %u [color = darkgoldenrod weight=0.1 ]\n", i, + aux->top & SHENG64_STATE_MASK); + } + + if (i == (s->anchored & SHENG64_STATE_MASK)) { + fprintf(f, "STARTA -> %u [color = blue ]\n", i); + } + + if (i == (s->floating & SHENG64_STATE_MASK)) { + fprintf(f, "STARTF -> %u [color = red ]\n", i); + } +} + static void describeEdge(FILE *f, const u16 *t, u16 i) { for (u16 s = 0; s < N_CHARS; s++) { @@ -228,7 +486,7 @@ void describeEdge(FILE *f, const u16 *t, u16 i) { static void shengGetTransitions(const NFA *n, u16 state, u16 *t) { - assert(isShengType(n->type)); + assert(isSheng16Type(n->type)); const sheng *s = (const sheng *)getImplNfa(n); const sstate_aux *aux = get_aux(n, state); @@ -244,6 +502,42 @@ 
void shengGetTransitions(const NFA *n, u16 state, u16 *t) { t[TOP] = aux->top & SHENG_STATE_MASK; } +static +void sheng32GetTransitions(const NFA *n, u16 state, u16 *t) { + assert(isSheng32Type(n->type)); + const sheng32 *s = (const sheng32 *)getImplNfa(n); + const sstate_aux *aux = get_aux32(n, state); + + for (unsigned i = 0; i < N_CHARS; i++) { + u8 buf[64]; + m512 succ_mask = s->succ_masks[i]; + + memcpy(buf, &succ_mask, sizeof(m512)); + + t[i] = buf[state] & SHENG32_STATE_MASK; + } + + t[TOP] = aux->top & SHENG32_STATE_MASK; +} + +static +void sheng64GetTransitions(const NFA *n, u16 state, u16 *t) { + assert(isSheng64Type(n->type)); + const sheng64 *s = (const sheng64 *)getImplNfa(n); + const sstate_aux *aux = get_aux64(n, state); + + for (unsigned i = 0; i < N_CHARS; i++) { + u8 buf[64]; + m512 succ_mask = s->succ_masks[i]; + + memcpy(buf, &succ_mask, sizeof(m512)); + + t[i] = buf[state] & SHENG64_STATE_MASK; + } + + t[TOP] = aux->top & SHENG64_STATE_MASK; +} + static void nfaExecSheng_dumpDot(const NFA *nfa, FILE *f) { assert(nfa->type == SHENG_NFA); @@ -252,7 +546,7 @@ void nfaExecSheng_dumpDot(const NFA *nfa, FILE *f) { dumpDotPreambleDfa(f); for (u16 i = 1; i < s->n_states; i++) { - describeNode(nfa, s, i, f); + describeNode(nfa, s, i, f); u16 t[ALPHABET_SIZE]; @@ -264,10 +558,62 @@ void nfaExecSheng_dumpDot(const NFA *nfa, FILE *f) { fprintf(f, "}\n"); } +static +void nfaExecSheng32_dumpDot(const NFA *nfa, FILE *f) { + assert(nfa->type == SHENG_NFA_32); + const sheng32 *s = (const sheng32 *)getImplNfa(nfa); + + dumpDotPreambleDfa(f); + + for (u16 i = 1; i < s->n_states; i++) { + describeNode(nfa, s, i, f); + + u16 t[ALPHABET_SIZE]; + + sheng32GetTransitions(nfa, i, t); + + describeEdge(f, t, i); + } + + fprintf(f, "}\n"); +} + +static +void nfaExecSheng64_dumpDot(const NFA *nfa, FILE *f) { + assert(nfa->type == SHENG_NFA_64); + const sheng64 *s = (const sheng64 *)getImplNfa(nfa); + + dumpDotPreambleDfa(f); + + for (u16 i = 1; i < s->n_states; i++) { + describeNode(nfa, s, i, f); + + u16 t[ALPHABET_SIZE]; + + sheng64GetTransitions(nfa, i, t); + + describeEdge(f, t, i); + } + + fprintf(f, "}\n"); +} + void nfaExecSheng_dump(const NFA *nfa, const string &base) { assert(nfa->type == SHENG_NFA); nfaExecSheng_dumpText(nfa, StdioFile(base + ".txt", "w")); nfaExecSheng_dumpDot(nfa, StdioFile(base + ".dot", "w")); } +void nfaExecSheng32_dump(UNUSED const NFA *nfa, UNUSED const string &base) { + assert(nfa->type == SHENG_NFA_32); + nfaExecSheng32_dumpText(nfa, StdioFile(base + ".txt", "w")); + nfaExecSheng32_dumpDot(nfa, StdioFile(base + ".dot", "w")); +} + +void nfaExecSheng64_dump(UNUSED const NFA *nfa, UNUSED const string &base) { + assert(nfa->type == SHENG_NFA_64); + nfaExecSheng64_dumpText(nfa, StdioFile(base + ".txt", "w")); + nfaExecSheng64_dumpDot(nfa, StdioFile(base + ".dot", "w")); +} + } // namespace ue2 diff --git a/src/nfa/shengdump.h b/src/nfa/shengdump.h index 2bdffeb9a..321536742 100644 --- a/src/nfa/shengdump.h +++ b/src/nfa/shengdump.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,6 +38,8 @@ struct NFA; namespace ue2 { void nfaExecSheng_dump(const struct NFA *nfa, const std::string &base); +void nfaExecSheng32_dump(const struct NFA *nfa, const std::string &base); +void nfaExecSheng64_dump(const struct NFA *nfa, const std::string &base); } // namespace ue2 
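
For orientation before the shufti changes below: the sheng32/sheng64 structures introduced above each hold one 64-byte successor table per input byte (succ_masks[256]), where the low bits of every state byte carry the state index (SHENG32_STATE_MASK / SHENG64_STATE_MASK) and the high bits carry the ACCEPT/DEAD/ACCEL flags filled in by getShengState() and createShuffleMasks(). The sketch below is a minimal scalar model of that lookup, not code from this patch; the model_* names are invented for illustration, and the real engines (which require AVX-512 VBMI, per the has_avx512vbmi() checks in sheng32Compile/sheng64Compile) perform the same per-byte lookup with a 512-bit byte shuffle instead of an array index.

/* Scalar model of a sheng32-style DFA step -- illustrative only. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MODEL_STATE_MASK   0x1F  /* cf. SHENG32_STATE_MASK */
#define MODEL_STATE_ACCEPT 0x20  /* cf. SHENG32_STATE_ACCEPT */
#define MODEL_STATE_DEAD   0x40  /* cf. SHENG32_STATE_DEAD */

/* One 64-byte successor table per input byte, modelling sheng32::succ_masks. */
typedef struct {
    uint8_t succ[256][64];
} model_dfa;

/* Step the DFA: the current state index selects the next state byte from the
 * table owned by the input character; the flag bits ride in the high bits. */
static void model_scan(const model_dfa *d, uint8_t state,
                       const uint8_t *buf, size_t len) {
    for (size_t i = 0; i < len; i++) {
        state = d->succ[buf[i]][state & MODEL_STATE_MASK];
        if (state & MODEL_STATE_DEAD) {
            return; /* dead state: no further matches possible */
        }
        if (state & MODEL_STATE_ACCEPT) {
            printf("match ending at offset %zu\n", i);
        }
    }
}

int main(void) {
    /* Toy 2-state DFA: any 'a' is a match; every other byte returns to state 0. */
    static model_dfa d; /* zero-initialised: all transitions go to state 0 */
    for (int s = 0; s < 64; s++) {
        d.succ['a'][s] = 1 | MODEL_STATE_ACCEPT;
    }
    const char *text = "banana";
    model_scan(&d, 0, (const uint8_t *)text, strlen(text));
    return 0;
}

Compiled as plain C, the toy driver above prints matches at offsets 1, 3 and 5 for the 'a' bytes in "banana"; the point is only to show how the state index and flag bits coexist in one successor byte.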
diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c deleted file mode 100644 index 09ffc0cf9..000000000 --- a/src/nfa/shufti.c +++ /dev/null @@ -1,1097 +0,0 @@ -/* - * Copyright (c) 2015-2017, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief Shufti: character class acceleration. - * - * Utilises the SSSE3 pshufb shuffle instruction - */ - -#include "shufti.h" -#include "ue2common.h" -#include "util/arch.h" -#include "util/bitutils.h" -#include "util/simd_utils.h" -#include "util/unaligned.h" - -#ifdef DEBUG -#include - -#define DUMP_MSK(_t) \ -static UNUSED \ -void dumpMsk##_t(m##_t msk) { \ - u8 * mskAsU8 = (u8 *)&msk; \ - for (unsigned i = 0; i < sizeof(msk); i++) { \ - u8 c = mskAsU8[i]; \ - for (int j = 0; j < 8; j++) { \ - if ((c >> (7-j)) & 0x1) \ - printf("1"); \ - else \ - printf("0"); \ - } \ - printf(" "); \ - } \ -} \ -static UNUSED \ -void dumpMsk##_t##AsChars(m##_t msk) { \ - u8 * mskAsU8 = (u8 *)&msk; \ - for (unsigned i = 0; i < sizeof(msk); i++) { \ - u8 c = mskAsU8[i]; \ - if (isprint(c)) \ - printf("%c",c); \ - else \ - printf("."); \ - } \ -} - -#endif - -/** \brief Naive byte-by-byte implementation. */ -static really_inline -const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf, - const u8 *buf_end) { - assert(buf < buf_end); - - for (; buf < buf_end; ++buf) { - u8 c = *buf; - if (lo[c & 0xf] & hi[c >> 4]) { - break; - } - } - return buf; -} - -/** \brief Naive byte-by-byte implementation. 
*/ -static really_inline -const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf, - const u8 *buf_end) { - assert(buf < buf_end); - - for (buf_end--; buf_end >= buf; buf_end--) { - u8 c = *buf_end; - if (lo[c & 0xf] & hi[c >> 4]) { - break; - } - } - return buf_end; -} - -#if !defined(HAVE_AVX2) -/* Normal SSSE3 shufti */ - -#ifdef DEBUG -DUMP_MSK(128) -#endif - -#define GET_LO_4(chars) and128(chars, low4bits) -#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4) - -static really_inline -u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits, - const m128 compare) { - m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); - m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); - m128 t = and128(c_lo, c_hi); - -#ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); -#endif - return movemask128(eq128(t, compare)); -} - -static really_inline -const u8 *firstMatch(const u8 *buf, u32 z) { - if (unlikely(z != 0xffff)) { - u32 pos = ctz32(~z & 0xffff); - assert(pos < 16); - return buf + pos; - } else { - return NULL; // no match - } -} - -static really_inline -const u8 *fwdBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf, - const m128 low4bits, const m128 zeroes) { - u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes); - - return firstMatch(buf, z); -} - -const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end) { - assert(buf && buf_end); - assert(buf < buf_end); - - // Slow path for small cases. - if (buf_end - buf < 16) { - return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, - buf, buf_end); - } - - const m128 zeroes = zeroes128(); - const m128 low4bits = _mm_set1_epi8(0xf); - const u8 *rv; - - size_t min = (size_t)buf % 16; - assert(buf_end - buf >= 16); - - // Preconditioning: most of the time our buffer won't be aligned. - m128 chars = loadu128(buf); - rv = fwdBlock(mask_lo, mask_hi, chars, buf, low4bits, zeroes); - if (rv) { - return rv; - } - buf += (16 - min); - - // Unrolling was here, but it wasn't doing anything but taking up space. - // Reroll FTW. - - const u8 *last_block = buf_end - 16; - while (buf < last_block) { - m128 lchars = load128(buf); - rv = fwdBlock(mask_lo, mask_hi, lchars, buf, low4bits, zeroes); - if (rv) { - return rv; - } - buf += 16; - } - - // Use an unaligned load to mop up the last 16 bytes and get an accurate - // picture to buf_end. 
- assert(buf <= buf_end && buf >= buf_end - 16); - chars = loadu128(buf_end - 16); - rv = fwdBlock(mask_lo, mask_hi, chars, buf_end - 16, low4bits, zeroes); - if (rv) { - return rv; - } - - return buf_end; -} - -static really_inline -const u8 *lastMatch(const u8 *buf, m128 t, m128 compare) { -#ifdef DEBUG - DEBUG_PRINTF("confirming match in:"); dumpMsk128(t); printf("\n"); -#endif - - u32 z = movemask128(eq128(t, compare)); - if (unlikely(z != 0xffff)) { - u32 pos = clz32(~z & 0xffff); - DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos); - assert(pos >= 16 && pos < 32); - return buf + (31 - pos); - } else { - return NULL; // no match - } -} - - -static really_inline -const u8 *revBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf, - const m128 low4bits, const m128 zeroes) { - m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); - m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); - m128 t = and128(c_lo, c_hi); - -#ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); -#endif - - return lastMatch(buf, t, zeroes); -} - -const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end) { - assert(buf && buf_end); - assert(buf < buf_end); - - // Slow path for small cases. - if (buf_end - buf < 16) { - return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, - buf, buf_end); - } - - const m128 zeroes = zeroes128(); - const m128 low4bits = _mm_set1_epi8(0xf); - const u8 *rv; - - assert(buf_end - buf >= 16); - - // Preconditioning: most of the time our buffer won't be aligned. - m128 chars = loadu128(buf_end - 16); - rv = revBlock(mask_lo, mask_hi, chars, buf_end - 16, low4bits, zeroes); - if (rv) { - return rv; - } - buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0xf)); - - // Unrolling was here, but it wasn't doing anything but taking up space. - // Reroll FTW. - - const u8 *last_block = buf + 16; - while (buf_end > last_block) { - buf_end -= 16; - m128 lchars = load128(buf_end); - rv = revBlock(mask_lo, mask_hi, lchars, buf_end, low4bits, zeroes); - if (rv) { - return rv; - } - } - - // Use an unaligned load to mop up the last 16 bytes and get an accurate - // picture to buf. 
- chars = loadu128(buf); - rv = revBlock(mask_lo, mask_hi, chars, buf, low4bits, zeroes); - if (rv) { - return rv; - } - - return buf - 1; -} - -static really_inline -const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi, - m128 chars, const u8 *buf, const m128 low4bits, - const m128 ones) { - m128 chars_lo = GET_LO_4(chars); - m128 chars_hi = GET_HI_4(chars); - m128 c_lo = pshufb_m128(mask1_lo, chars_lo); - m128 c_hi = pshufb_m128(mask1_hi, chars_hi); - m128 t = or128(c_lo, c_hi); - -#ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); -#endif - - m128 c2_lo = pshufb_m128(mask2_lo, chars_lo); - m128 c2_hi = pshufb_m128(mask2_hi, chars_hi); - m128 t2 = or128(t, rshiftbyte_m128(or128(c2_lo, c2_hi), 1)); - -#ifdef DEBUG - DEBUG_PRINTF(" c2_lo: "); dumpMsk128(c2_lo); printf("\n"); - DEBUG_PRINTF(" c2_hi: "); dumpMsk128(c2_hi); printf("\n"); - DEBUG_PRINTF(" t2: "); dumpMsk128(t2); printf("\n"); -#endif - - u32 z = movemask128(eq128(t2, ones)); - DEBUG_PRINTF(" z: 0x%08x\n", z); - return firstMatch(buf, z); -} - -const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, - m128 mask2_lo, m128 mask2_hi, - const u8 *buf, const u8 *buf_end) { - const m128 ones = ones128(); - const m128 low4bits = _mm_set1_epi8(0xf); - const u8 *rv; - - size_t min = (size_t)buf % 16; - - // Preconditioning: most of the time our buffer won't be aligned. - m128 chars = loadu128(buf); - rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, - chars, buf, low4bits, ones); - if (rv) { - return rv; - } - buf += (16 - min); - - // Unrolling was here, but it wasn't doing anything but taking up space. - // Reroll FTW. - - const u8 *last_block = buf_end - 16; - while (buf < last_block) { - m128 lchars = load128(buf); - rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, - lchars, buf, low4bits, ones); - if (rv) { - return rv; - } - buf += 16; - } - - // Use an unaligned load to mop up the last 16 bytes and get an accurate - // picture to buf_end. 
- chars = loadu128(buf_end - 16); - rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, - chars, buf_end - 16, low4bits, ones); - if (rv) { - return rv; - } - - return buf_end; -} - -#elif !defined(HAVE_AVX512) -// AVX2 - 256 wide shuftis - -#ifdef DEBUG -DUMP_MSK(256) -#endif - -#define GET_LO_4(chars) and256(chars, low4bits) -#define GET_HI_4(chars) rshift64_m256(andnot256(low4bits, chars), 4) - -static really_inline -u32 block(m256 mask_lo, m256 mask_hi, m256 chars, const m256 low4bits, - const m256 compare) { - m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); - m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); - m256 t = and256(c_lo, c_hi); - -#ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); -#endif - - return movemask256(eq256(t, compare)); -} - -static really_inline -const u8 *firstMatch(const u8 *buf, u32 z) { - DEBUG_PRINTF("z 0x%08x\n", z); - if (unlikely(z != 0xffffffff)) { - u32 pos = ctz32(~z); - assert(pos < 32); - DEBUG_PRINTF("match @ pos %u\n", pos); - return buf + pos; - } else { - return NULL; // no match - } -} - -static really_inline -const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf, - const m256 low4bits) { - // do the hi and lo shuffles in the one avx register - m256 c = combine2x128(rshift64_m128(chars, 4), chars); - c = and256(c, low4bits); - m256 c_shuf = pshufb_m256(mask, c); - m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf)); - // the upper 32-bits can't match - u32 z = 0xffff0000U | movemask128(eq128(t, zeroes128())); - - return firstMatch(buf, z); -} - -static really_inline -const u8 *shuftiFwdShort(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end, const m256 low4bits) { - // run shufti over two overlapping 16-byte unaligned reads - const m256 mask = combine2x128(mask_hi, mask_lo); - m128 chars = loadu128(buf); - const u8 *rv = fwdBlockShort(mask, chars, buf, low4bits); - if (rv) { - return rv; - } - - chars = loadu128(buf_end - 16); - rv = fwdBlockShort(mask, chars, buf_end - 16, low4bits); - if (rv) { - return rv; - } - return buf_end; -} - -static really_inline -const u8 *fwdBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf, - const m256 low4bits, const m256 zeroes) { - u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes); - - return firstMatch(buf, z); -} - -/* takes 128 bit masks, but operates on 256 bits of data */ -const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end) { - assert(buf && buf_end); - assert(buf < buf_end); - DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf); - - // Slow path for small cases. - if (buf_end - buf < 16) { - return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, - buf, buf_end); - } - - const m256 low4bits = set32x8(0xf); - - if (buf_end - buf <= 32) { - return shuftiFwdShort(mask_lo, mask_hi, buf, buf_end, low4bits); - } - - const m256 zeroes = zeroes256(); - const m256 wide_mask_lo = set2x128(mask_lo); - const m256 wide_mask_hi = set2x128(mask_hi); - const u8 *rv; - - size_t min = (size_t)buf % 32; - assert(buf_end - buf >= 32); - - // Preconditioning: most of the time our buffer won't be aligned. 
- m256 chars = loadu256(buf); - rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf, low4bits, zeroes); - if (rv) { - return rv; - } - buf += (32 - min); - - // Unrolling was here, but it wasn't doing anything but taking up space. - // Reroll FTW. - - const u8 *last_block = buf_end - 32; - while (buf < last_block) { - m256 lchars = load256(buf); - rv = fwdBlock(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits, zeroes); - if (rv) { - return rv; - } - buf += 32; - } - - // Use an unaligned load to mop up the last 32 bytes and get an accurate - // picture to buf_end. - assert(buf <= buf_end && buf >= buf_end - 32); - chars = loadu256(buf_end - 32); - rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, zeroes); - if (rv) { - return rv; - } - - return buf_end; -} - -static really_inline -const u8 *lastMatch(const u8 *buf, u32 z) { - if (unlikely(z != 0xffffffff)) { - u32 pos = clz32(~z); - DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos); - return buf + (31 - pos); - } else { - return NULL; // no match - } -} - -static really_inline -const u8 *revBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf, - const m256 low4bits, const m256 zeroes) { - m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); - m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); - m256 t = and256(c_lo, c_hi); - -#ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); -#endif - - u32 z = movemask256(eq256(t, zeroes)); - return lastMatch(buf, z); -} - -static really_inline -const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf, - const m256 low4bits) { - // do the hi and lo shuffles in the one avx register - m256 c = combine2x128(rshift64_m128(chars, 4), chars); - c = and256(c, low4bits); - m256 c_shuf = pshufb_m256(mask, c); - m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf)); - // the upper 32-bits can't match - u32 z = 0xffff0000U | movemask128(eq128(t, zeroes128())); - - return lastMatch(buf, z); -} - -static really_inline -const u8 *shuftiRevShort(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end, const m256 low4bits) { - // run shufti over two overlapping 16-byte unaligned reads - const m256 mask = combine2x128(mask_hi, mask_lo); - - m128 chars = loadu128(buf_end - 16); - const u8 *rv = revBlockShort(mask, chars, buf_end - 16, low4bits); - if (rv) { - return rv; - } - - chars = loadu128(buf); - rv = revBlockShort(mask, chars, buf, low4bits); - if (rv) { - return rv; - } - return buf - 1; -} - - -/* takes 128 bit masks, but operates on 256 bits of data */ -const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end) { - assert(buf && buf_end); - assert(buf < buf_end); - - // Slow path for small cases. - if (buf_end - buf < 16) { - return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, - buf, buf_end); - } - - const m256 low4bits = set32x8(0xf); - - if (buf_end - buf <= 32) { - return shuftiRevShort(mask_lo, mask_hi, buf, buf_end, low4bits); - } - - const m256 zeroes = zeroes256(); - const m256 wide_mask_lo = set2x128(mask_lo); - const m256 wide_mask_hi = set2x128(mask_hi); - const u8 *rv; - - assert(buf_end - buf >= 32); - - // Preconditioning: most of the time our buffer won't be aligned. 
- m256 chars = loadu256(buf_end - 32); - rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, zeroes); - if (rv) { - return rv; - } - buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0x1f)); - - // Unrolling was here, but it wasn't doing anything but taking up space. - // Reroll FTW. - const u8 *last_block = buf + 32; - while (buf_end > last_block) { - buf_end -= 32; - m256 lchars = load256(buf_end); - rv = revBlock(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits, zeroes); - if (rv) { - return rv; - } - } - - // Use an unaligned load to mop up the last 32 bytes and get an accurate - // picture to buf. - chars = loadu256(buf); - rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf, low4bits, zeroes); - if (rv) { - return rv; - } - - return buf - 1; -} - -static really_inline -const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, m256 mask2_lo, m256 mask2_hi, - m256 chars, const u8 *buf, const m256 low4bits, - const m256 ones) { - DEBUG_PRINTF("buf %p\n", buf); - m256 chars_lo = GET_LO_4(chars); - m256 chars_hi = GET_HI_4(chars); - m256 c_lo = pshufb_m256(mask1_lo, chars_lo); - m256 c_hi = pshufb_m256(mask1_hi, chars_hi); - m256 t = or256(c_lo, c_hi); - -#ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); -#endif - - m256 c2_lo = pshufb_m256(mask2_lo, chars_lo); - m256 c2_hi = pshufb_m256(mask2_hi, chars_hi); - m256 t2 = or256(t, rshift128_m256(or256(c2_lo, c2_hi), 1)); - -#ifdef DEBUG - DEBUG_PRINTF(" c2_lo: "); dumpMsk256(c2_lo); printf("\n"); - DEBUG_PRINTF(" c2_hi: "); dumpMsk256(c2_hi); printf("\n"); - DEBUG_PRINTF(" t2: "); dumpMsk256(t2); printf("\n"); -#endif - u32 z = movemask256(eq256(t2, ones)); - - return firstMatch(buf, z); -} - -static really_inline -const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf, - const m256 low4bits) { - // do the hi and lo shuffles in the one avx register - m256 c = combine2x128(rshift64_m128(chars, 4), chars); - c = and256(c, low4bits); - m256 c_shuf1 = pshufb_m256(mask1, c); - m256 c_shuf2 = rshift128_m256(pshufb_m256(mask2, c), 1); - m256 t0 = or256(c_shuf1, c_shuf2); - m128 t = or128(movdq_hi(t0), cast256to128(t0)); - // the upper 32-bits can't match - u32 z = 0xffff0000U | movemask128(eq128(t, ones128())); - - return firstMatch(buf, z); -} - -static really_inline -const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, - m128 mask2_hi, const u8 *buf, const u8 *buf_end) { - DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); - const m256 low4bits = set32x8(0xf); - // run shufti over two overlapping 16-byte unaligned reads - const m256 mask1 = combine2x128(mask1_hi, mask1_lo); - const m256 mask2 = combine2x128(mask2_hi, mask2_lo); - m128 chars = loadu128(buf); - const u8 *rv = fwdBlockShort2(mask1, mask2, chars, buf, low4bits); - if (rv) { - return rv; - } - - chars = loadu128(buf_end - 16); - rv = fwdBlockShort2(mask1, mask2, chars, buf_end - 16, low4bits); - if (rv) { - return rv; - } - return buf_end; -} - -/* takes 128 bit masks, but operates on 256 bits of data */ -const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, - m128 mask2_lo, m128 mask2_hi, - const u8 *buf, const u8 *buf_end) { - /* we should always have at least 16 bytes */ - assert(buf_end - buf >= 16); - DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); - 
- if (buf_end - buf < 32) { - return shuftiDoubleShort(mask1_lo, mask1_hi, mask2_lo, mask2_hi, buf, - buf_end); - } - - const m256 ones = ones256(); - const m256 low4bits = set32x8(0xf); - const m256 wide_mask1_lo = set2x128(mask1_lo); - const m256 wide_mask1_hi = set2x128(mask1_hi); - const m256 wide_mask2_lo = set2x128(mask2_lo); - const m256 wide_mask2_hi = set2x128(mask2_hi); - const u8 *rv; - - size_t min = (size_t)buf % 32; - - // Preconditioning: most of the time our buffer won't be aligned. - m256 chars = loadu256(buf); - rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, - chars, buf, low4bits, ones); - if (rv) { - return rv; - } - buf += (32 - min); - - // Unrolling was here, but it wasn't doing anything but taking up space. - // Reroll FTW. - const u8 *last_block = buf_end - 32; - while (buf < last_block) { - m256 lchars = load256(buf); - rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, - lchars, buf, low4bits, ones); - if (rv) { - return rv; - } - buf += 32; - } - - // Use an unaligned load to mop up the last 32 bytes and get an accurate - // picture to buf_end. - chars = loadu256(buf_end - 32); - rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, - chars, buf_end - 32, low4bits, ones); - if (rv) { - return rv; - } - - return buf_end; -} - -#else // defined(HAVE_AVX512) - -#ifdef DEBUG -DUMP_MSK(512) -#endif - -static really_inline -u64a block(m512 mask_lo, m512 mask_hi, m512 chars, const m512 low4bits, - const m512 compare) { - m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); - m512 c_hi = pshufb_m512(mask_hi, - rshift64_m512(andnot512(low4bits, chars), 4)); - m512 t = and512(c_lo, c_hi); - -#ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); -#endif - - return eq512mask(t, compare); -} -static really_inline -const u8 *firstMatch64(const u8 *buf, u64a z) { - DEBUG_PRINTF("z 0x%016llx\n", z); - if (unlikely(z != ~0ULL)) { - u32 pos = ctz64(~z); - DEBUG_PRINTF("match @ pos %u\n", pos); - assert(pos < 64); - return buf + pos; - } else { - return NULL; // no match - } -} - -static really_inline -const u8 *fwdBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf, - const m512 low4bits, const m512 zeroes) { - u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes); - - return firstMatch64(buf, z); -} - -static really_inline -const u8 *shortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf, - const u8 *buf_end, const m512 low4bits, - const m512 zeroes) { - DEBUG_PRINTF("short shufti %p len %zu\n", buf, buf_end - buf); - uintptr_t len = buf_end - buf; - assert(len <= 64); - - // load mask - u64a k = (~0ULL) >> (64 - len); - DEBUG_PRINTF("load mask 0x%016llx\n", k); - - m512 chars = loadu_maskz_m512(k, buf); - - u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes); - - // reuse the load mask to indicate valid bytes - return firstMatch64(buf, z | ~k); -} - -/* takes 128 bit masks, but operates on 512 bits of data */ -const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end) { - assert(buf && buf_end); - assert(buf < buf_end); - DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf); - DEBUG_PRINTF("b %s\n", buf); - - const m512 low4bits = set64x8(0xf); - const m512 zeroes = zeroes512(); - const m512 
wide_mask_lo = set4x128(mask_lo); - const m512 wide_mask_hi = set4x128(mask_hi); - const u8 *rv; - - // small cases. - if (buf_end - buf <= 64) { - rv = shortShufti512(wide_mask_lo, wide_mask_hi, buf, buf_end, low4bits, - zeroes); - return rv ? rv : buf_end; - } - - assert(buf_end - buf >= 64); - - // Preconditioning: most of the time our buffer won't be aligned. - if ((uintptr_t)buf % 64) { - rv = shortShufti512(wide_mask_lo, wide_mask_hi, buf, - ROUNDUP_PTR(buf, 64), low4bits, zeroes); - if (rv) { - return rv; - } - buf = ROUNDUP_PTR(buf, 64); - } - - const u8 *last_block = ROUNDDOWN_PTR(buf_end, 64); - while (buf < last_block) { - m512 lchars = load512(buf); - rv = fwdBlock512(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits, - zeroes); - if (rv) { - return rv; - } - buf += 64; - } - - if (buf == buf_end) { - goto done; - } - - // Use an unaligned load to mop up the last 64 bytes and get an accurate - // picture to buf_end. - assert(buf <= buf_end && buf >= buf_end - 64); - m512 chars = loadu512(buf_end - 64); - rv = fwdBlock512(wide_mask_lo, wide_mask_hi, chars, buf_end - 64, low4bits, - zeroes); - if (rv) { - return rv; - } -done: - return buf_end; -} - -static really_inline -const u8 *lastMatch64(const u8 *buf, u64a z) { - DEBUG_PRINTF("z 0x%016llx\n", z); - if (unlikely(z != ~0ULL)) { - u32 pos = clz64(~z); - DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos); - return buf + (63 - pos); - } else { - return NULL; // no match - } -} - -static really_inline -const u8 *rshortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf, - const u8 *buf_end, const m512 low4bits, - const m512 zeroes) { - DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf); - uintptr_t len = buf_end - buf; - assert(len <= 64); - - // load mask - u64a k = (~0ULL) >> (64 - len); - DEBUG_PRINTF("load mask 0x%016llx\n", k); - - m512 chars = loadu_maskz_m512(k, buf); - - u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes); - - // reuse the load mask to indicate valid bytes - return lastMatch64(buf, z | ~k); -} - -static really_inline -const u8 *revBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf, - const m512 low4bits, const m512 zeroes) { - m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); - m512 c_hi = pshufb_m512(mask_hi, - rshift64_m512(andnot512(low4bits, chars), 4)); - m512 t = and512(c_lo, c_hi); - -#ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); -#endif - - u64a z = eq512mask(t, zeroes); - return lastMatch64(buf, z); -} - -/* takes 128 bit masks, but operates on 512 bits of data */ -const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end) { - DEBUG_PRINTF("buf %p buf_end %p\n", buf, buf_end); - assert(buf && buf_end); - assert(buf < buf_end); - - const m512 low4bits = set64x8(0xf); - const m512 zeroes = zeroes512(); - const m512 wide_mask_lo = set4x128(mask_lo); - const m512 wide_mask_hi = set4x128(mask_hi); - const u8 *rv; - - if (buf_end - buf < 64) { - rv = rshortShufti512(wide_mask_lo, wide_mask_hi, buf, buf_end, low4bits, - zeroes); - return rv ? 
rv : buf - 1; - } - - if (ROUNDDOWN_PTR(buf_end, 64) != buf_end) { - // peel off unaligned portion - assert(buf_end - buf >= 64); - DEBUG_PRINTF("start\n"); - rv = rshortShufti512(wide_mask_lo, wide_mask_hi, - ROUNDDOWN_PTR(buf_end, 64), buf_end, low4bits, - zeroes); - if (rv) { - return rv; - } - buf_end = ROUNDDOWN_PTR(buf_end, 64); - } - - const u8 *last_block = ROUNDUP_PTR(buf, 64); - while (buf_end > last_block) { - buf_end -= 64; - m512 lchars = load512(buf_end); - rv = revBlock512(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits, - zeroes); - if (rv) { - return rv; - } - } - if (buf_end == buf) { - goto done; - } - // Use an unaligned load to mop up the last 64 bytes and get an accurate - // picture to buf. - m512 chars = loadu512(buf); - rv = revBlock512(wide_mask_lo, wide_mask_hi, chars, buf, low4bits, zeroes); - if (rv) { - return rv; - } -done: - return buf - 1; -} - -static really_inline -const u8 *fwdBlock2(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, m512 mask2_hi, - m512 chars, const u8 *buf, const m512 low4bits, - const m512 ones, __mmask64 k) { - DEBUG_PRINTF("buf %p %.64s\n", buf, buf); - m512 chars_lo = and512(chars, low4bits); - m512 chars_hi = rshift64_m512(andnot512(low4bits, chars), 4); - m512 c_lo = maskz_pshufb_m512(k, mask1_lo, chars_lo); - m512 c_hi = maskz_pshufb_m512(k, mask1_hi, chars_hi); - m512 t = or512(c_lo, c_hi); - -#ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); -#endif - - m512 c2_lo = maskz_pshufb_m512(k, mask2_lo, chars_lo); - m512 c2_hi = maskz_pshufb_m512(k, mask2_hi, chars_hi); - m512 t2 = or512(t, rshift128_m512(or512(c2_lo, c2_hi), 1)); - -#ifdef DEBUG - DEBUG_PRINTF(" c2_lo: "); dumpMsk512(c2_lo); printf("\n"); - DEBUG_PRINTF(" c2_hi: "); dumpMsk512(c2_hi); printf("\n"); - DEBUG_PRINTF(" t2: "); dumpMsk512(t2); printf("\n"); -#endif - u64a z = eq512mask(t2, ones); - - return firstMatch64(buf, z | ~k); -} - -static really_inline -const u8 *shortDoubleShufti512(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, - m512 mask2_hi, const u8 *buf, const u8 *buf_end, - const m512 low4bits, const m512 ones) { - DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf); - uintptr_t len = buf_end - buf; - assert(len <= 64); - - u64a k = (~0ULL) >> (64 - len); - DEBUG_PRINTF("load mask 0x%016llx\n", k); - - m512 chars = loadu_mask_m512(ones, k, buf); - - const u8 *rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, chars, buf, - low4bits, ones, k); - - return rv; -} - -/* takes 128 bit masks, but operates on 512 bits of data */ -const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, - m128 mask2_lo, m128 mask2_hi, - const u8 *buf, const u8 *buf_end) { - /* we should always have at least 16 bytes */ - assert(buf_end - buf >= 16); - DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); - - const m512 ones = ones512(); - const m512 low4bits = set64x8(0xf); - const m512 wide_mask1_lo = set4x128(mask1_lo); - const m512 wide_mask1_hi = set4x128(mask1_hi); - const m512 wide_mask2_lo = set4x128(mask2_lo); - const m512 wide_mask2_hi = set4x128(mask2_hi); - const u8 *rv; - - if (buf_end - buf <= 64) { - rv = shortDoubleShufti512(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, - wide_mask2_hi, buf, buf_end, low4bits, ones); - DEBUG_PRINTF("rv %p\n", rv); - return rv ? 
rv : buf_end; - } - - // Preconditioning: most of the time our buffer won't be aligned. - if ((uintptr_t)buf % 64) { - rv = shortDoubleShufti512(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, - wide_mask2_hi, buf, ROUNDUP_PTR(buf, 64), - low4bits, ones); - if (rv) { - return rv; - } - - buf = ROUNDUP_PTR(buf, 64); - } - - const u8 *last_block = buf_end - 64; - while (buf < last_block) { - m512 lchars = load512(buf); - rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, - wide_mask2_hi, lchars, buf, low4bits, ones, ~0); - if (rv) { - return rv; - } - buf += 64; - } - - // Use an unaligned load to mop up the last 64 bytes and get an accurate - // picture to buf_end. - m512 chars = loadu512(buf_end - 64); - rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, - chars, buf_end - 64, low4bits, ones, ~0); - if (rv) { - return rv; - } - - return buf_end; -} -#endif diff --git a/src/nfa/shufti.cpp b/src/nfa/shufti.cpp new file mode 100644 index 000000000..2d858c665 --- /dev/null +++ b/src/nfa/shufti.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020, 2021, VectorCamp PC + * Copyright (c) 2021, Arm Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Shufti: character class acceleration. + * + * Utilises the SSSE3 pshufb shuffle instruction + */ + +#include "shufti.h" +#include "ue2common.h" +#include "util/arch.h" +#include "util/bitutils.h" + +/** \brief Naive byte-by-byte implementation. */ +static really_inline +const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("buf %p end %p \n", buf, buf_end); + for (; buf < buf_end; ++buf) { + u8 c = *buf; + if (lo[c & 0xf] & hi[c >> 4]) { + break; + } + } + return buf; +} + +/** \brief Naive byte-by-byte implementation. 
*/ +static really_inline +const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf, + const u8 *buf_end) { + for (buf_end--; buf_end >= buf; buf_end--) { + u8 c = *buf_end; + if (lo[c & 0xf] & hi[c >> 4]) { + break; + } + } + return buf_end; +} + +#ifdef HAVE_SVE +#include "shufti_sve.hpp" +#else +#include "shufti_simd.hpp" +#endif diff --git a/src/nfa/shufti.h b/src/nfa/shufti.h index 1ebf776cc..a6f9bc793 100644 --- a/src/nfa/shufti.h +++ b/src/nfa/shufti.h @@ -36,7 +36,7 @@ #define SHUFTI_H #include "ue2common.h" -#include "util/simd_utils.h" +#include "util/simd_types.h" #ifdef __cplusplus extern "C" diff --git a/src/nfa/shufti_simd.hpp b/src/nfa/shufti_simd.hpp new file mode 100644 index 000000000..feeb54abd --- /dev/null +++ b/src/nfa/shufti_simd.hpp @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2023, VectorCamp PC + * Copyright (c) 2021, Arm Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Shufti: character class acceleration. 
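+ *
+ * A byte is accepted when the shuffle-table entry selected by its low nibble
+ * ANDs non-zero with the entry selected by its high nibble, matching the
+ * scalar shuftiFwdSlow/shuftiRevSlow reference loops.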
+ * + * Utilises the SSSE3 pshufb shuffle instruction + */ + +#include + +#include "shufti.h" +#include "ue2common.h" +#include "util/arch.h" +#include "util/bitutils.h" +#include "util/unaligned.h" + +#include "util/supervector/supervector.hpp" +#include "util/match.hpp" + +template +static really_inline +const SuperVector blockSingleMask(SuperVector mask_lo, SuperVector mask_hi, SuperVector chars); +template +static really_inline +SuperVector blockDoubleMask(SuperVector mask1_lo, SuperVector mask1_hi, SuperVector mask2_lo, SuperVector mask2_hi, SuperVector chars); + +#if defined(VS_SIMDE_BACKEND) +#include "x86/shufti.hpp" +#else +#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#include "x86/shufti.hpp" +#elif (defined(ARCH_ARM32) || defined(ARCH_AARCH64)) +#include "arm/shufti.hpp" +#elif defined(ARCH_PPC64EL) +#include "ppc64el/shufti.hpp" +#endif +#endif + +template +static really_inline +const u8 *fwdBlock(SuperVector mask_lo, SuperVector mask_hi, SuperVector chars, const u8 *buf) { + SuperVector v = blockSingleMask(mask_lo, mask_hi, chars); + + return first_zero_match_inverted(buf, v); +} + +template +static really_inline +const u8 *revBlock(SuperVector mask_lo, SuperVector mask_hi, SuperVector chars, const u8 *buf) { + SuperVector v = blockSingleMask(mask_lo, mask_hi, chars); + + return last_zero_match_inverted(buf, v); +} + +template +static really_inline +const u8 *fwdBlockDouble(SuperVector mask1_lo, SuperVector mask1_hi, SuperVector mask2_lo, SuperVector mask2_hi, SuperVector chars, const u8 *buf) { + + SuperVector mask = blockDoubleMask(mask1_lo, mask1_hi, mask2_lo, mask2_hi, chars); + + return first_zero_match_inverted(buf, mask); +} + +template +const u8 *shuftiExecReal(m128 mask_lo, m128 mask_hi, const u8 *buf, const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf); + DEBUG_PRINTF("b %s\n", buf); + + const SuperVector wide_mask_lo(mask_lo); + const SuperVector wide_mask_hi(mask_hi); + + const u8 *d = buf; + const u8 *rv; + + __builtin_prefetch(d + 64); + __builtin_prefetch(d + 2*64); + __builtin_prefetch(d + 3*64); + __builtin_prefetch(d + 4*64); + DEBUG_PRINTF("start %p end %p \n", d, buf_end); + assert(d < buf_end); + if (d + S <= buf_end) { + // Reach vector aligned boundaries + DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S)); + if (!ISALIGNED_N(d, S)) { + SuperVector chars = SuperVector::loadu(d); + rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, d); + if (rv) return rv; + d = ROUNDUP_PTR(d, S); + } + + while(d + S <= buf_end) { + __builtin_prefetch(d + 64); + DEBUG_PRINTF("d %p \n", d); + SuperVector chars = SuperVector::load(d); + rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, d); + if (rv) return rv; + d += S; + } + } + + DEBUG_PRINTF("d %p e %p \n", d, buf_end); + // finish off tail + + if (d != buf_end) { + SuperVector chars = SuperVector::loadu(buf_end - S); + rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - S); + DEBUG_PRINTF("rv %p \n", rv); + if (rv && rv < buf_end) return rv; + } + + return buf_end; +} + +template +const u8 *rshuftiExecReal(m128 mask_lo, m128 mask_hi, const u8 *buf, const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + DEBUG_PRINTF("rshufti %p len %zu\n", buf, buf_end - buf); + DEBUG_PRINTF("b %s\n", buf); + + const SuperVector wide_mask_lo(mask_lo); + const SuperVector wide_mask_hi(mask_hi); + + const u8 *d = buf_end; + const u8 *rv; + + __builtin_prefetch(d - 64); + __builtin_prefetch(d - 2*64); + __builtin_prefetch(d - 3*64); + 
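+    // Reverse scan: peel an unaligned vector off the tail, then walk back one
+    // full vector at a time, and finish with an unaligned load at the head.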
__builtin_prefetch(d - 4*64); + DEBUG_PRINTF("start %p end %p \n", buf, d); + assert(d > buf); + if (d - S >= buf) { + // Reach vector aligned boundaries + DEBUG_PRINTF("until aligned %p \n", ROUNDDOWN_PTR(d, S)); + if (!ISALIGNED_N(d, S)) { + SuperVector chars = SuperVector::loadu(d - S); + rv = revBlock(wide_mask_lo, wide_mask_hi, chars, d - S); + DEBUG_PRINTF("rv %p \n", rv); + if (rv) return rv; + d = ROUNDDOWN_PTR(d, S); + } + + while (d - S >= buf) { + DEBUG_PRINTF("aligned %p \n", d); + // On large packet buffers, this prefetch appears to get us about 2%. + __builtin_prefetch(d - 64); + + d -= S; + SuperVector chars = SuperVector::load(d); + rv = revBlock(wide_mask_lo, wide_mask_hi, chars, d); + if (rv) return rv; + } + } + + DEBUG_PRINTF("tail d %p e %p \n", buf, d); + // finish off head + + if (d != buf) { + SuperVector chars = SuperVector::loadu(buf); + rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf); + DEBUG_PRINTF("rv %p \n", rv); + if (rv && rv < buf_end) return rv; + } + + return buf - 1; +} + +template +const u8 *shuftiDoubleExecReal(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi, + const u8 *buf, const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf); + DEBUG_PRINTF("b %s\n", buf); + + const SuperVector wide_mask1_lo(mask1_lo); + const SuperVector wide_mask1_hi(mask1_hi); + const SuperVector wide_mask2_lo(mask2_lo); + const SuperVector wide_mask2_hi(mask2_hi); + + const u8 *d = buf; + const u8 *rv; + + __builtin_prefetch(d + 64); + __builtin_prefetch(d + 2*64); + __builtin_prefetch(d + 3*64); + __builtin_prefetch(d + 4*64); + DEBUG_PRINTF("start %p end %p \n", d, buf_end); + assert(d < buf_end); + if (d + S <= buf_end) { + // peel off first part to cacheline boundary + DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S)); + if (!ISALIGNED_N(d, S)) { + SuperVector chars = SuperVector::loadu(d); + rv = fwdBlockDouble(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, chars, d); + DEBUG_PRINTF("rv %p \n", rv); + if (rv) return rv; + d = ROUNDUP_PTR(d, S); + } + + while(d + S <= buf_end) { + __builtin_prefetch(d + 64); + DEBUG_PRINTF("d %p \n", d); + + SuperVector chars = SuperVector::load(d); + rv = fwdBlockDouble(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, chars, d); + if (rv) return rv; + d += S; + } + } + + DEBUG_PRINTF("tail d %p e %p \n", d, buf_end); + // finish off tail + + if (d != buf_end) { + SuperVector chars = SuperVector::Zeroes(); + const u8 *end_buf; + if (buf_end - buf < S) { + memcpy(&chars.u, buf, buf_end - buf); + end_buf = buf; + } else { + chars = SuperVector::loadu(buf_end - S); + end_buf = buf_end - S; + } + rv = fwdBlockDouble(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, chars, end_buf); + DEBUG_PRINTF("rv %p \n", rv); + if (rv && rv < buf_end) return rv; + } + + return buf_end; +} + +const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end) { + if (buf_end - buf < VECTORSIZE) { + return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, buf_end); + } + return shuftiExecReal(mask_lo, mask_hi, buf, buf_end); +} + +const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end) { + if (buf_end - buf < VECTORSIZE) { + return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, buf, buf_end); + } + return rshuftiExecReal(mask_lo, mask_hi, buf, buf_end); +} + +const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, + m128 mask2_lo, m128 mask2_hi, + const u8 
*buf, const u8 *buf_end) { + return shuftiDoubleExecReal(mask1_lo, mask1_hi, mask2_lo, mask2_hi, buf, buf_end); +} diff --git a/src/nfa/shufti_sve.hpp b/src/nfa/shufti_sve.hpp new file mode 100644 index 000000000..76f1e7adb --- /dev/null +++ b/src/nfa/shufti_sve.hpp @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2021, Arm Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Shufti: character class acceleration. 
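+ *
+ * The low- and high-nibble table lookups are done with svtbl, and inputs
+ * shorter than one vector are handled with svwhilelt predicates instead of a
+ * scalar fallback.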
+ * + * Utilises the SVE tbl shuffle instruction + */ + +static really_inline +svbool_t singleMatched(svuint8_t mask_lo, svuint8_t mask_hi, + const u8 *buf, svbool_t pg) { + svuint8_t vec = svld1_u8(pg, buf); + svuint8_t c_lo = svtbl(mask_lo, svand_z(svptrue_b8(), vec, (uint8_t)0xf)); + svuint8_t c_hi = svtbl(mask_hi, svlsr_z(svptrue_b8(), vec, 4)); + svuint8_t t = svand_z(svptrue_b8(), c_lo, c_hi); + return svcmpne(pg, t, (uint8_t)0); +} + +static really_inline +const u8 *shuftiOnce(svuint8_t mask_lo, svuint8_t mask_hi, + const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("start %p end %p\n", buf, buf_end); + assert(buf <= buf_end); + svbool_t pg = svwhilelt_b8_s64(0, buf_end - buf); + svbool_t matched = singleMatched(mask_lo, mask_hi, buf, pg); + return accelSearchCheckMatched(buf, matched); +} + +static really_inline +const u8 *shuftiLoopBody(svuint8_t mask_lo, svuint8_t mask_hi, const u8 *buf) { + DEBUG_PRINTF("start %p end %p\n", buf, buf + svcntb()); + svbool_t matched = singleMatched(mask_lo, mask_hi, buf, svptrue_b8()); + return accelSearchCheckMatched(buf, matched); +} + +static really_inline +const u8 *rshuftiOnce(svuint8_t mask_lo, svuint8_t mask_hi, + const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("start %p end %p\n", buf, buf_end); + assert(buf <= buf_end); + DEBUG_PRINTF("l = %td\n", buf_end - buf); + svbool_t pg = svwhilelt_b8_s64(0, buf_end - buf); + svbool_t matched = singleMatched(mask_lo, mask_hi, buf, pg); + return accelRevSearchCheckMatched(buf, matched); +} + +static really_inline +const u8 *rshuftiLoopBody(svuint8_t mask_lo, svuint8_t mask_hi, + const u8 *buf) { + svbool_t matched = singleMatched(mask_lo, mask_hi, buf, svptrue_b8()); + return accelRevSearchCheckMatched(buf, matched); +} + +static really_inline +const u8 *shuftiSearch(svuint8_t mask_lo, svuint8_t mask_hi, + const u8 *buf, const u8 *buf_end) { + assert(buf < buf_end); + size_t len = buf_end - buf; + if (len <= svcntb()) { + return shuftiOnce(mask_lo, mask_hi, buf, buf_end); + } + // peel off first part to align to the vector size + const u8 *aligned_buf = ROUNDUP_PTR(buf, svcntb_pat(SV_POW2)); + assert(aligned_buf < buf_end); + if (buf != aligned_buf) { + const u8 *ptr = shuftiLoopBody(mask_lo, mask_hi, buf); + if (ptr) return ptr; + } + buf = aligned_buf; + size_t loops = (buf_end - buf) / svcntb(); + DEBUG_PRINTF("loops %zu \n", loops); + for (size_t i = 0; i < loops; i++, buf += svcntb()) { + const u8 *ptr = shuftiLoopBody(mask_lo, mask_hi, buf); + if (ptr) return ptr; + } + DEBUG_PRINTF("buf %p buf_end %p \n", buf, buf_end); + return buf == buf_end ? 
NULL : shuftiLoopBody(mask_lo, mask_hi, + buf_end - svcntb()); +} + +static really_inline +const u8 *rshuftiSearch(svuint8_t mask_lo, svuint8_t mask_hi, + const u8 *buf, const u8 *buf_end) { + assert(buf < buf_end); + size_t len = buf_end - buf; + if (len <= svcntb()) { + return rshuftiOnce(mask_lo, mask_hi, buf, buf_end); + } + // peel off first part to align to the vector size + const u8 *aligned_buf_end = ROUNDDOWN_PTR(buf_end, svcntb_pat(SV_POW2)); + assert(buf < aligned_buf_end); + if (buf_end != aligned_buf_end) { + const u8 *ptr = rshuftiLoopBody(mask_lo, mask_hi, buf_end - svcntb()); + if (ptr) return ptr; + } + buf_end = aligned_buf_end; + size_t loops = (buf_end - buf) / svcntb(); + DEBUG_PRINTF("loops %zu \n", loops); + for (size_t i = 0; i < loops; i++) { + buf_end -= svcntb(); + const u8 *ptr = rshuftiLoopBody(mask_lo, mask_hi, buf_end); + if (ptr) return ptr; + } + DEBUG_PRINTF("buf %p buf_end %p \n", buf, buf_end); + return buf == buf_end ? NULL : rshuftiLoopBody(mask_lo, mask_hi, buf); +} + +const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("shufti scan over %td bytes\n", buf_end - buf); + svuint8_t sve_mask_lo = getSVEMaskFrom128(mask_lo); + svuint8_t sve_mask_hi = getSVEMaskFrom128(mask_hi); + const u8 *ptr = shuftiSearch(sve_mask_lo, sve_mask_hi, buf, buf_end); + return ptr ? ptr : buf_end; +} + +const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("rshufti scan over %td bytes\n", buf_end - buf); + svuint8_t sve_mask_lo = getSVEMaskFrom128(mask_lo); + svuint8_t sve_mask_hi = getSVEMaskFrom128(mask_hi); + const u8 *ptr = rshuftiSearch(sve_mask_lo, sve_mask_hi, buf, buf_end); + return ptr ? ptr : buf - 1; +} + +static really_inline +svbool_t doubleMatched(svuint8_t mask1_lo, svuint8_t mask1_hi, + svuint8_t mask2_lo, svuint8_t mask2_hi, + const u8 *buf, const svbool_t pg) { + svuint8_t vec = svld1_u8(pg, buf); + + svuint8_t chars_lo = svand_x(svptrue_b8(), vec, (uint8_t)0xf); + svuint8_t chars_hi = svlsr_x(svptrue_b8(), vec, 4); + + svuint8_t c1_lo = svtbl(mask1_lo, chars_lo); + svuint8_t c1_hi = svtbl(mask1_hi, chars_hi); + svuint8_t t1 = svorr_x(svptrue_b8(), c1_lo, c1_hi); + + svuint8_t c2_lo = svtbl(mask2_lo, chars_lo); + svuint8_t c2_hi = svtbl(mask2_hi, chars_hi); + svuint8_t t2 = svext(svorr_z(pg, c2_lo, c2_hi), svdup_u8(0), 1); + + svuint8_t t = svorr_x(svptrue_b8(), t1, t2); + + return svnot_z(svptrue_b8(), svcmpeq(svptrue_b8(), t, (uint8_t)0xff)); +} + +static really_inline +const u8 *dshuftiOnce(svuint8_t mask1_lo, svuint8_t mask1_hi, + svuint8_t mask2_lo, svuint8_t mask2_hi, + const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("start %p end %p\n", buf, buf_end); + assert(buf < buf_end); + DEBUG_PRINTF("l = %td\n", buf_end - buf); + svbool_t pg = svwhilelt_b8_s64(0, buf_end - buf); + svbool_t matched = doubleMatched(mask1_lo, mask1_hi, mask2_lo, mask2_hi, + buf, pg); + return accelSearchCheckMatched(buf, matched); +} + +static really_inline +const u8 *dshuftiLoopBody(svuint8_t mask1_lo, svuint8_t mask1_hi, + svuint8_t mask2_lo, svuint8_t mask2_hi, + const u8 *buf) { + DEBUG_PRINTF("start %p end %p\n", buf, buf + svcntb()); + svbool_t matched = doubleMatched(mask1_lo, mask1_hi, mask2_lo, mask2_hi, + buf, svptrue_b8()); + return accelSearchCheckMatched(buf, matched); +} + +static really_inline +const u8 *dshuftiSearch(svuint8_t mask1_lo, svuint8_t mask1_hi, + svuint8_t mask2_lo, svuint8_t mask2_hi, + const u8 *buf, const u8 *buf_end) { + assert(buf < buf_end); + 
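+    // Inputs shorter than one vector take a single predicated pass; longer
+    // inputs are aligned up and scanned a full SVE vector at a time, with one
+    // final (possibly overlapping) vector for the tail.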
size_t len = buf_end - buf; + if (len <= svcntb()) { + return dshuftiOnce(mask1_lo, mask1_hi, + mask2_lo, mask2_hi, buf, buf_end); + } + // peel off first part to align to the vector size + const u8 *aligned_buf = ROUNDUP_PTR(buf, svcntb_pat(SV_POW2)); + assert(aligned_buf < buf_end); + if (buf != aligned_buf) { + const u8 *ptr = dshuftiLoopBody(mask1_lo, mask1_hi, + mask2_lo, mask2_hi, buf); + if (ptr) return ptr; + } + buf = aligned_buf; + size_t loops = (buf_end - buf) / svcntb(); + DEBUG_PRINTF("loops %zu \n", loops); + for (size_t i = 0; i < loops; i++, buf += svcntb()) { + const u8 *ptr = dshuftiLoopBody(mask1_lo, mask1_hi, + mask2_lo, mask2_hi, buf); + if (ptr) return ptr; + } + DEBUG_PRINTF("buf %p buf_end %p \n", buf, buf_end); + return buf == buf_end ? NULL : dshuftiLoopBody(mask1_lo, mask1_hi, + mask2_lo, mask2_hi, + buf_end - svcntb()); +} + +const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, + m128 mask2_lo, m128 mask2_hi, + const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("double shufti scan %td bytes\n", buf_end - buf); + DEBUG_PRINTF("buf %p buf_end %p \n", buf, buf_end); + svuint8_t sve_mask1_lo = getSVEMaskFrom128(mask1_lo); + svuint8_t sve_mask1_hi = getSVEMaskFrom128(mask1_hi); + svuint8_t sve_mask2_lo = getSVEMaskFrom128(mask2_lo); + svuint8_t sve_mask2_hi = getSVEMaskFrom128(mask2_hi); + const u8 *ptr = dshuftiSearch(sve_mask1_lo, sve_mask1_hi, + sve_mask2_lo, sve_mask2_hi, buf, buf_end); + return ptr ? ptr : buf_end; +} \ No newline at end of file diff --git a/src/nfa/shufticompile.cpp b/src/nfa/shufticompile.cpp index f712ef94a..5385a8ce0 100644 --- a/src/nfa/shufticompile.cpp +++ b/src/nfa/shufticompile.cpp @@ -182,7 +182,7 @@ bool shuftiBuildDoubleMasks(const CharReach &onechar, } nibble_masks.clear(); for (const auto &e : new_masks) { - nibble_masks.push_back(e.second); + nibble_masks.emplace_back(e.second); } } diff --git a/src/nfa/tamaramacompile.cpp b/src/nfa/tamaramacompile.cpp index 1a6e8beff..6f8c3dbe4 100644 --- a/src/nfa/tamaramacompile.cpp +++ b/src/nfa/tamaramacompile.cpp @@ -54,7 +54,7 @@ void remapTops(const TamaInfo &tamaInfo, u32 cur = 0; for (const auto &sub : tamaInfo.subengines) { u32 base = cur; - top_base.push_back(base + MQE_TOP_FIRST); + top_base.emplace_back(base + MQE_TOP_FIRST); DEBUG_PRINTF("subengine:%u\n", i); for (const auto &t : tamaInfo.tops[i++]) { cur = base + t; @@ -163,8 +163,8 @@ set all_reports(const TamaProto &proto) { void TamaInfo::add(NFA *sub, const set &top) { assert(subengines.size() < max_occupancy); - subengines.push_back(sub); - tops.push_back(top); + subengines.emplace_back(sub); + tops.emplace_back(top); } void TamaProto::add(const NFA *n, const u32 id, const u32 top, diff --git a/src/nfa/truffle.c b/src/nfa/truffle.c deleted file mode 100644 index be6b312cf..000000000 --- a/src/nfa/truffle.c +++ /dev/null @@ -1,608 +0,0 @@ -/* - * Copyright (c) 2015-2017, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * Matches a byte in a charclass using three shuffles - */ - - -#include "ue2common.h" -#include "truffle.h" -#include "util/arch.h" -#include "util/bitutils.h" -#include "util/simd_utils.h" - -#if !defined(HAVE_AVX2) - -static really_inline -const u8 *lastMatch(const u8 *buf, u32 z) { - if (unlikely(z != 0xffff)) { - u32 pos = clz32(~z & 0xffff); - assert(pos >= 16 && pos < 32); - return buf + (31 - pos); - } - - return NULL; // no match -} - -static really_inline -const u8 *firstMatch(const u8 *buf, u32 z) { - if (unlikely(z != 0xffff)) { - u32 pos = ctz32(~z & 0xffff); - assert(pos < 16); - return buf + pos; - } - - return NULL; // no match -} - -static really_inline -u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) { - - m128 highconst = _mm_set1_epi8(0x80); - m128 shuf_mask_hi = _mm_set1_epi64x(0x8040201008040201); - - // and now do the real work - m128 shuf1 = pshufb_m128(shuf_mask_lo_highclear, v); - m128 t1 = xor128(v, highconst); - m128 shuf2 = pshufb_m128(shuf_mask_lo_highset, t1); - m128 t2 = andnot128(highconst, rshift64_m128(v, 4)); - m128 shuf3 = pshufb_m128(shuf_mask_hi, t2); - m128 tmp = and128(or128(shuf1, shuf2), shuf3); - m128 tmp2 = eq128(tmp, zeroes128()); - u32 z = movemask128(tmp2); - - return z; -} - -static -const u8 *truffleMini(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, - const u8 *buf, const u8 *buf_end) { - uintptr_t len = buf_end - buf; - assert(len < 16); - - m128 chars = zeroes128(); - memcpy(&chars, buf, len); - - u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); - // can't be these bytes in z - u32 mask = (0xffff >> (16 - len)) ^ 0xffff; - const u8 *rv = firstMatch(buf, z | mask); - - if (rv) { - return rv; - } else { - return buf_end; - } -} - -static really_inline -const u8 *fwdBlock(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, - m128 v, const u8 *buf) { - u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); - return firstMatch(buf, z); -} - -static really_inline -const u8 *revBlock(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, - m128 v, const u8 *buf) { - u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); - return lastMatch(buf, z); -} - -const u8 *truffleExec(m128 shuf_mask_lo_highclear, - m128 shuf_mask_lo_highset, - const u8 *buf, const u8 *buf_end) { - DEBUG_PRINTF("len %zu\n", buf_end - buf); - - assert(buf && buf_end); - assert(buf < buf_end); - const u8 *rv; - - if (buf_end - buf < 16) { - return truffleMini(shuf_mask_lo_highclear, shuf_mask_lo_highset, 
buf, - buf_end); - } - - size_t min = (size_t)buf % 16; - assert(buf_end - buf >= 16); - - // Preconditioning: most of the time our buffer won't be aligned. - m128 chars = loadu128(buf); - rv = fwdBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars, buf); - if (rv) { - return rv; - } - buf += (16 - min); - - const u8 *last_block = buf_end - 16; - while (buf < last_block) { - m128 lchars = load128(buf); - rv = fwdBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, lchars, - buf); - if (rv) { - return rv; - } - buf += 16; - } - - // Use an unaligned load to mop up the last 16 bytes and get an accurate - // picture to buf_end. - assert(buf <= buf_end && buf >= buf_end - 16); - chars = loadu128(buf_end - 16); - rv = fwdBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars, - buf_end - 16); - if (rv) { - return rv; - } - - return buf_end; -} - -static -const u8 *truffleRevMini(m128 shuf_mask_lo_highclear, - m128 shuf_mask_lo_highset, const u8 *buf, - const u8 *buf_end) { - uintptr_t len = buf_end - buf; - assert(len < 16); - - m128 chars = zeroes128(); - memcpy(&chars, buf, len); - - u32 mask = (0xffff >> (16 - len)) ^ 0xffff; - u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); - const u8 *rv = lastMatch(buf, z | mask); - - if (rv) { - return rv; - } - return buf - 1; -} - -const u8 *rtruffleExec(m128 shuf_mask_lo_highclear, - m128 shuf_mask_lo_highset, - const u8 *buf, const u8 *buf_end) { - assert(buf && buf_end); - assert(buf < buf_end); - const u8 *rv; - - DEBUG_PRINTF("len %zu\n", buf_end - buf); - - if (buf_end - buf < 16) { - return truffleRevMini(shuf_mask_lo_highclear, shuf_mask_lo_highset, buf, - buf_end); - } - - assert(buf_end - buf >= 16); - - // Preconditioning: most of the time our buffer won't be aligned. - m128 chars = loadu128(buf_end - 16); - rv = revBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars, - buf_end - 16); - if (rv) { - return rv; - } - buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0xf)); - - const u8 *last_block = buf + 16; - while (buf_end > last_block) { - buf_end -= 16; - m128 lchars = load128(buf_end); - rv = revBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, lchars, - buf_end); - if (rv) { - return rv; - } - } - - // Use an unaligned load to mop up the last 16 bytes and get an accurate - // picture to buf_end. 
- chars = loadu128(buf); - rv = revBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars, buf); - if (rv) { - return rv; - } - - return buf - 1; -} - -#elif !defined(HAVE_AVX512) - -// AVX2 - -static really_inline -const u8 *lastMatch(const u8 *buf, u32 z) { - if (unlikely(z != 0xffffffff)) { - u32 pos = clz32(~z); - assert(pos < 32); - return buf + (31 - pos); - } - - return NULL; // no match -} - -static really_inline -const u8 *firstMatch(const u8 *buf, u32 z) { - if (unlikely(z != 0xffffffff)) { - u32 pos = ctz32(~z); - assert(pos < 32); - return buf + pos; - } - - return NULL; // no match -} - -static really_inline -u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) { - - m256 highconst = _mm256_set1_epi8(0x80); - m256 shuf_mask_hi = _mm256_set1_epi64x(0x8040201008040201); - - // and now do the real work - m256 shuf1 = pshufb_m256(shuf_mask_lo_highclear, v); - m256 t1 = xor256(v, highconst); - m256 shuf2 = pshufb_m256(shuf_mask_lo_highset, t1); - m256 t2 = andnot256(highconst, rshift64_m256(v, 4)); - m256 shuf3 = pshufb_m256(shuf_mask_hi, t2); - m256 tmp = and256(or256(shuf1, shuf2), shuf3); - m256 tmp2 = eq256(tmp, zeroes256()); - u32 z = movemask256(tmp2); - - return z; -} - -static -const u8 *truffleMini(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, - const u8 *buf, const u8 *buf_end) { - uintptr_t len = buf_end - buf; - assert(len < 32); - - m256 chars = zeroes256(); - memcpy(&chars, buf, len); - - u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); - // can't be these bytes in z - u32 mask = (0xffffffff >> (32 - len)) ^ 0xffffffff; - const u8 *rv = firstMatch(buf, z | mask); - - if (rv) { - return rv; - } else { - return buf_end; - } -} - -static really_inline -const u8 *fwdBlock(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, - m256 v, const u8 *buf) { - u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); - return firstMatch(buf, z); -} - -static really_inline -const u8 *revBlock(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, - m256 v, const u8 *buf) { - u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); - return lastMatch(buf, z); -} - -const u8 *truffleExec(m128 shuf_mask_lo_highclear, - m128 shuf_mask_lo_highset, - const u8 *buf, const u8 *buf_end) { - DEBUG_PRINTF("len %zu\n", buf_end - buf); - const m256 wide_clear = set2x128(shuf_mask_lo_highclear); - const m256 wide_set = set2x128(shuf_mask_lo_highset); - - assert(buf && buf_end); - assert(buf < buf_end); - const u8 *rv; - - if (buf_end - buf < 32) { - return truffleMini(wide_clear, wide_set, buf, buf_end); - } - - size_t min = (size_t)buf % 32; - assert(buf_end - buf >= 32); - - // Preconditioning: most of the time our buffer won't be aligned. - m256 chars = loadu256(buf); - rv = fwdBlock(wide_clear, wide_set, chars, buf); - if (rv) { - return rv; - } - buf += (32 - min); - - const u8 *last_block = buf_end - 32; - while (buf < last_block) { - m256 lchars = load256(buf); - rv = fwdBlock(wide_clear, wide_set, lchars, buf); - if (rv) { - return rv; - } - buf += 32; - } - - // Use an unaligned load to mop up the last 32 bytes and get an accurate - // picture to buf_end. 
- assert(buf <= buf_end && buf >= buf_end - 32); - chars = loadu256(buf_end - 32); - rv = fwdBlock(wide_clear, wide_set, chars, buf_end - 32); - if (rv) { - return rv; - } - return buf_end; -} - -static -const u8 *truffleRevMini(m256 shuf_mask_lo_highclear, - m256 shuf_mask_lo_highset, const u8 *buf, - const u8 *buf_end) { - uintptr_t len = buf_end - buf; - assert(len < 32); - - m256 chars = zeroes256(); - memcpy(&chars, buf, len); - - u32 mask = (0xffffffff >> (32 - len)) ^ 0xffffffff; - u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); - const u8 *rv = lastMatch(buf, z | mask); - - if (rv) { - return rv; - } - return buf - 1; -} - - -const u8 *rtruffleExec(m128 shuf_mask_lo_highclear, - m128 shuf_mask_lo_highset, - const u8 *buf, const u8 *buf_end) { - const m256 wide_clear = set2x128(shuf_mask_lo_highclear); - const m256 wide_set = set2x128(shuf_mask_lo_highset); - assert(buf && buf_end); - assert(buf < buf_end); - const u8 *rv; - - DEBUG_PRINTF("len %zu\n", buf_end - buf); - - if (buf_end - buf < 32) { - return truffleRevMini(wide_clear, wide_set, buf, buf_end); - } - - assert(buf_end - buf >= 32); - - // Preconditioning: most of the time our buffer won't be aligned. - m256 chars = loadu256(buf_end - 32); - rv = revBlock(wide_clear, wide_set, chars, - buf_end - 32); - if (rv) { - return rv; - } - buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0x1f)); - - const u8 *last_block = buf + 32; - while (buf_end > last_block) { - buf_end -= 32; - m256 lchars = load256(buf_end); - rv = revBlock(wide_clear, wide_set, lchars, buf_end); - if (rv) { - return rv; - } - } - - // Use an unaligned load to mop up the last 32 bytes and get an accurate - // picture to buf_end. - chars = loadu256(buf); - rv = revBlock(wide_clear, wide_set, chars, buf); - if (rv) { - return rv; - } - return buf - 1; -} - -#else // AVX512 - -static really_inline -const u8 *lastMatch(const u8 *buf, u64a z) { - if (unlikely(z != ~0ULL)) { - u64a pos = clz64(~z); - assert(pos < 64); - return buf + (63 - pos); - } - - return NULL; // no match -} - -static really_inline -const u8 *firstMatch(const u8 *buf, u64a z) { - if (unlikely(z != ~0ULL)) { - u64a pos = ctz64(~z); - assert(pos < 64); - DEBUG_PRINTF("pos %llu\n", pos); - return buf + pos; - } - - return NULL; // no match -} - -static really_inline -u64a block(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset, m512 v) { - m512 highconst = set64x8(0x80); - m512 shuf_mask_hi = set8x64(0x8040201008040201); - - // and now do the real work - m512 shuf1 = pshufb_m512(shuf_mask_lo_highclear, v); - m512 t1 = xor512(v, highconst); - m512 shuf2 = pshufb_m512(shuf_mask_lo_highset, t1); - m512 t2 = andnot512(highconst, rshift64_m512(v, 4)); - m512 shuf3 = pshufb_m512(shuf_mask_hi, t2); - m512 tmp = and512(or512(shuf1, shuf2), shuf3); - u64a z = eq512mask(tmp, zeroes512()); - - return z; -} - -static really_inline -const u8 *truffleMini(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset, - const u8 *buf, const u8 *buf_end) { - uintptr_t len = buf_end - buf; - assert(len <= 64); - - __mmask64 mask = (~0ULL) >> (64 - len); - - m512 chars = loadu_maskz_m512(mask, buf); - - u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); - - const u8 *rv = firstMatch(buf, z | ~mask); - - return rv; -} - -static really_inline -const u8 *fwdBlock(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset, - m512 v, const u8 *buf) { - u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); - return firstMatch(buf, z); -} - -static really_inline -const u8 
*revBlock(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset, - m512 v, const u8 *buf) { - u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); - return lastMatch(buf, z); -} - -const u8 *truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, - const u8 *buf, const u8 *buf_end) { - DEBUG_PRINTF("len %zu\n", buf_end - buf); - const m512 wide_clear = set4x128(shuf_mask_lo_highclear); - const m512 wide_set = set4x128(shuf_mask_lo_highset); - - assert(buf && buf_end); - assert(buf < buf_end); - const u8 *rv; - - if (buf_end - buf <= 64) { - rv = truffleMini(wide_clear, wide_set, buf, buf_end); - return rv ? rv : buf_end; - } - - assert(buf_end - buf >= 64); - if ((uintptr_t)buf % 64) { - // Preconditioning: most of the time our buffer won't be aligned. - rv = truffleMini(wide_clear, wide_set, buf, ROUNDUP_PTR(buf, 64)); - if (rv) { - return rv; - } - buf = ROUNDUP_PTR(buf, 64); - } - const u8 *last_block = buf_end - 64; - while (buf < last_block) { - m512 lchars = load512(buf); - rv = fwdBlock(wide_clear, wide_set, lchars, buf); - if (rv) { - return rv; - } - buf += 64; - } - - // Use an unaligned load to mop up the last 64 bytes and get an accurate - // picture to buf_end. - assert(buf <= buf_end && buf >= buf_end - 64); - m512 chars = loadu512(buf_end - 64); - rv = fwdBlock(wide_clear, wide_set, chars, buf_end - 64); - if (rv) { - return rv; - } - return buf_end; -} - -static really_inline -const u8 *truffleRevMini(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset, - const u8 *buf, const u8 *buf_end) { - uintptr_t len = buf_end - buf; - assert(len < 64); - - __mmask64 mask = (~0ULL) >> (64 - len); - m512 chars = loadu_maskz_m512(mask, buf); - u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); - DEBUG_PRINTF("mask 0x%016llx z 0x%016llx\n", mask, z); - const u8 *rv = lastMatch(buf, z | ~mask); - - if (rv) { - return rv; - } - return buf - 1; -} - -const u8 *rtruffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, - const u8 *buf, const u8 *buf_end) { - const m512 wide_clear = set4x128(shuf_mask_lo_highclear); - const m512 wide_set = set4x128(shuf_mask_lo_highset); - assert(buf && buf_end); - assert(buf < buf_end); - const u8 *rv; - - DEBUG_PRINTF("len %zu\n", buf_end - buf); - - if (buf_end - buf < 64) { - return truffleRevMini(wide_clear, wide_set, buf, buf_end); - } - - assert(buf_end - buf >= 64); - - // Preconditioning: most of the time our buffer won't be aligned. - m512 chars = loadu512(buf_end - 64); - rv = revBlock(wide_clear, wide_set, chars, buf_end - 64); - if (rv) { - return rv; - } - buf_end = (const u8 *)ROUNDDOWN_N((uintptr_t)buf_end, 64); - - const u8 *last_block = buf + 64; - while (buf_end > last_block) { - buf_end -= 64; - m512 lchars = load512(buf_end); - rv = revBlock(wide_clear, wide_set, lchars, buf_end); - if (rv) { - return rv; - } - } - - // Use an unaligned load to mop up the last 64 bytes and get an accurate - // picture to buf_end. 
- chars = loadu512(buf); - rv = revBlock(wide_clear, wide_set, chars, buf); - if (rv) { - return rv; - } - return buf - 1; -} - -#endif diff --git a/src/nfa/truffle.cpp b/src/nfa/truffle.cpp new file mode 100644 index 000000000..c83914455 --- /dev/null +++ b/src/nfa/truffle.cpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020, 2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Truffle: character class acceleration. + */ + +#include "truffle.h" +#include "ue2common.h" +#include "util/arch.h" +#include "util/bitutils.h" + +#include "truffle_simd.hpp" + +const u8 *truffleExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end) { + return truffleExecReal(mask_lo, mask_hi, buf, buf_end); +} + +const u8 *rtruffleExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end) { + return rtruffleExecReal(mask_lo, mask_hi, buf, buf_end); +} diff --git a/src/nfa/truffle_simd.hpp b/src/nfa/truffle_simd.hpp new file mode 100644 index 000000000..c1028156e --- /dev/null +++ b/src/nfa/truffle_simd.hpp @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2023, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Truffle: character class acceleration. + * + */ + +#include "truffle.h" +#include "ue2common.h" +#include "util/arch.h" +#include "util/bitutils.h" +#include "util/unaligned.h" + +#include "util/supervector/supervector.hpp" +#include "util/match.hpp" + +template +static really_inline +const SuperVector blockSingleMask(SuperVector shuf_mask_lo_highclear, SuperVector shuf_mask_lo_highset, SuperVector chars); + +#if defined(VS_SIMDE_BACKEND) +#include "x86/truffle.hpp" +#else +#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#include "x86/truffle.hpp" +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#include "arm/truffle.hpp" +#elif defined(ARCH_PPC64EL) +#include "ppc64el/truffle.hpp" +#endif +#endif + +template +static really_inline +const u8 *fwdBlock(SuperVector shuf_mask_lo_highclear, SuperVector shuf_mask_lo_highset, SuperVector chars, const u8 *buf) { + SuperVector res = blockSingleMask(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); + return first_zero_match_inverted(buf, res); +} + +template +const u8 *truffleExecReal(m128 &shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, const u8 *buf, const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + DEBUG_PRINTF("truffle %p len %zu\n", buf, buf_end - buf); + DEBUG_PRINTF("b %s\n", buf); + + const SuperVector wide_shuf_mask_lo_highclear(shuf_mask_lo_highclear); + const SuperVector wide_shuf_mask_lo_highset(shuf_mask_lo_highset); + + const u8 *d = buf; + const u8 *rv; + + DEBUG_PRINTF("start %p end %p \n", d, buf_end); + assert(d < buf_end); + + __builtin_prefetch(d + 64); + __builtin_prefetch(d + 2*64); + __builtin_prefetch(d + 3*64); + __builtin_prefetch(d + 4*64); + DEBUG_PRINTF("start %p end %p \n", d, buf_end); + assert(d < buf_end); + if (d + S <= buf_end) { + // Reach vector aligned boundaries + DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S)); + if (!ISALIGNED_N(d, S)) { + SuperVector chars = SuperVector::loadu(d); + const u8 *dup = ROUNDUP_PTR(d, S); + rv = fwdBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, d); + if (rv && rv < dup) return rv; + d = dup; + } + + while(d + S <= buf_end) { + __builtin_prefetch(d + 64); + DEBUG_PRINTF("d %p \n", d); + SuperVector chars = SuperVector::load(d); + rv = fwdBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, d); + if (rv) return rv; + d += S; + } + } + + DEBUG_PRINTF("d %p e %p \n", d, buf_end); + // finish off tail + + if (d != buf_end) { + SuperVector chars = SuperVector::Zeroes(); + const u8* end_buf; + if (buf_end - buf < S) { + memcpy(&chars.u, buf, buf_end - buf); + end_buf = buf; + } else { + chars = SuperVector::loadu(buf_end - S); + end_buf = buf_end - S; + } + rv = 
fwdBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, end_buf); + DEBUG_PRINTF("rv %p \n", rv); + if (rv && rv < buf_end) return rv; + } + + return buf_end; +} + +template +static really_inline +const u8 *revBlock(SuperVector shuf_mask_lo_highclear, SuperVector shuf_mask_lo_highset, SuperVector v, + const u8 *buf) { + SuperVector res = blockSingleMask(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); + return last_zero_match_inverted(buf, res); +} + +template +const u8 *rtruffleExecReal(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, const u8 *buf, const u8 *buf_end){ + assert(buf && buf_end); + assert(buf < buf_end); + DEBUG_PRINTF("trufle %p len %zu\n", buf, buf_end - buf); + DEBUG_PRINTF("b %s\n", buf); + + const SuperVector wide_shuf_mask_lo_highclear(shuf_mask_lo_highclear); + const SuperVector wide_shuf_mask_lo_highset(shuf_mask_lo_highset); + + const u8 *d = buf_end; + const u8 *rv; + + __builtin_prefetch(d - 64); + __builtin_prefetch(d - 2*64); + __builtin_prefetch(d - 3*64); + __builtin_prefetch(d - 4*64); + DEBUG_PRINTF("start %p end %p \n", buf, d); + assert(d > buf); + if (d - S >= buf) { + // Reach vector aligned boundaries + DEBUG_PRINTF("until aligned %p \n", ROUNDDOWN_PTR(d, S)); + if (!ISALIGNED_N(d, S)) { + SuperVector chars = SuperVector::loadu(d - S); + const u8 *dbot = ROUNDDOWN_PTR(d, S); + rv = revBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, d - S); + DEBUG_PRINTF("rv %p \n", rv); + if (rv >= dbot) return rv; + d = dbot; + } + + while (d - S >= buf) { + DEBUG_PRINTF("aligned %p \n", d); + // On large packet buffers, this prefetch appears to get us about 2%. + __builtin_prefetch(d - 64); + + d -= S; + SuperVector chars = SuperVector::load(d); + rv = revBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, d); + if (rv) return rv; + } + } + + DEBUG_PRINTF("tail d %p e %p \n", buf, d); + // finish off head + + if (d != buf) { + SuperVector chars = SuperVector::Zeroes(); + if (buf_end - buf < S) { + memcpy(&chars.u, buf, buf_end - buf); + } else { + chars = SuperVector::loadu(buf); + } + rv = revBlock(wide_shuf_mask_lo_highclear, wide_shuf_mask_lo_highset, chars, buf); + DEBUG_PRINTF("rv %p \n", rv); + if (rv && rv < buf_end) return rv; + } + + return buf - 1; +} diff --git a/src/nfa/vermicelli.h b/src/nfa/vermicelli.h deleted file mode 100644 index ed797d83f..000000000 --- a/src/nfa/vermicelli.h +++ /dev/null @@ -1,518 +0,0 @@ -/* - * Copyright (c) 2015-2020, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief Vermicelli: single-byte and double-byte acceleration. - */ - -#ifndef VERMICELLI_H -#define VERMICELLI_H - -#include "util/bitutils.h" -#include "util/simd_utils.h" -#include "util/unaligned.h" - -#include "vermicelli_sse.h" - -static really_inline -const u8 *vermicelliExec(char c, char nocase, const u8 *buf, - const u8 *buf_end) { - DEBUG_PRINTF("verm scan %s\\x%02hhx over %zu bytes\n", - nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); - assert(buf < buf_end); - - VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ - - // Handle small scans. -#ifdef HAVE_AVX512 - if (buf_end - buf <= VERM_BOUNDARY) { - const u8 *ptr = nocase - ? vermMiniNocase(chars, buf, buf_end, 0) - : vermMini(chars, buf, buf_end, 0); - if (ptr) { - return ptr; - } - return buf_end; - } -#else - if (buf_end - buf < VERM_BOUNDARY) { - for (; buf < buf_end; buf++) { - char cur = (char)*buf; - if (nocase) { - cur &= CASE_CLEAR; - } - if (cur == c) { - break; - } - } - return buf; - } -#endif - - uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; - if (min) { - // Input isn't aligned, so we need to run one iteration with an - // unaligned load, then skip buf forward to the next aligned address. - // There's some small overlap here, but we don't mind scanning it twice - // if we can do it quickly, do we? - const u8 *ptr = nocase ? vermUnalignNocase(chars, buf, 0) - : vermUnalign(chars, buf, 0); - if (ptr) { - return ptr; - } - - buf += VERM_BOUNDARY - min; - assert(buf < buf_end); - } - - // Aligned loops from here on in - const u8 *ptr = nocase ? vermSearchAlignedNocase(chars, buf, buf_end - 1, 0) - : vermSearchAligned(chars, buf, buf_end - 1, 0); - if (ptr) { - return ptr; - } - - // Tidy up the mess at the end - ptr = nocase ? vermUnalignNocase(chars, buf_end - VERM_BOUNDARY, 0) - : vermUnalign(chars, buf_end - VERM_BOUNDARY, 0); - return ptr ? ptr : buf_end; -} - -/* like vermicelliExec except returns the address of the first character which - * is not c */ -static really_inline -const u8 *nvermicelliExec(char c, char nocase, const u8 *buf, - const u8 *buf_end) { - DEBUG_PRINTF("nverm scan %s\\x%02hhx over %zu bytes\n", - nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); - assert(buf < buf_end); - - VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ - - // Handle small scans. -#ifdef HAVE_AVX512 - if (buf_end - buf <= VERM_BOUNDARY) { - const u8 *ptr = nocase - ? vermMiniNocase(chars, buf, buf_end, 1) - : vermMini(chars, buf, buf_end, 1); - if (ptr) { - return ptr; - } - return buf_end; - } -#else - if (buf_end - buf < VERM_BOUNDARY) { - for (; buf < buf_end; buf++) { - char cur = (char)*buf; - if (nocase) { - cur &= CASE_CLEAR; - } - if (cur != c) { - break; - } - } - return buf; - } -#endif - - size_t min = (size_t)buf % VERM_BOUNDARY; - if (min) { - // Input isn't aligned, so we need to run one iteration with an - // unaligned load, then skip buf forward to the next aligned address. 
- // There's some small overlap here, but we don't mind scanning it twice - // if we can do it quickly, do we? - const u8 *ptr = nocase ? vermUnalignNocase(chars, buf, 1) - : vermUnalign(chars, buf, 1); - if (ptr) { - return ptr; - } - - buf += VERM_BOUNDARY - min; - assert(buf < buf_end); - } - - // Aligned loops from here on in - const u8 *ptr = nocase ? vermSearchAlignedNocase(chars, buf, buf_end - 1, 1) - : vermSearchAligned(chars, buf, buf_end - 1, 1); - if (ptr) { - return ptr; - } - - // Tidy up the mess at the end - ptr = nocase ? vermUnalignNocase(chars, buf_end - VERM_BOUNDARY, 1) - : vermUnalign(chars, buf_end - VERM_BOUNDARY, 1); - return ptr ? ptr : buf_end; -} - -static really_inline -const u8 *vermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, - const u8 *buf_end) { - DEBUG_PRINTF("double verm scan %s\\x%02hhx%02hhx over %zu bytes\n", - nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf)); - assert(buf < buf_end); - - VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */ - VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */ - -#ifdef HAVE_AVX512 - if (buf_end - buf <= VERM_BOUNDARY) { - const u8 *ptr = nocase - ? dvermMiniNocase(chars1, chars2, buf, buf_end) - : dvermMini(chars1, chars2, buf, buf_end); - if (ptr) { - return ptr; - } - - /* check for partial match at end */ - u8 mask = nocase ? CASE_CLEAR : 0xff; - if ((buf_end[-1] & mask) == (u8)c1) { - DEBUG_PRINTF("partial!!!\n"); - return buf_end - 1; - } - - return buf_end; - } -#endif - - assert((buf_end - buf) >= VERM_BOUNDARY); - uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; - if (min) { - // Input isn't aligned, so we need to run one iteration with an - // unaligned load, then skip buf forward to the next aligned address. - // There's some small overlap here, but we don't mind scanning it twice - // if we can do it quickly, do we? - const u8 *ptr = nocase - ? dvermPreconditionNocase(chars1, chars2, buf) - : dvermPrecondition(chars1, chars2, buf); - if (ptr) { - return ptr; - } - - buf += VERM_BOUNDARY - min; - assert(buf < buf_end); - } - - // Aligned loops from here on in - const u8 *ptr = nocase ? dvermSearchAlignedNocase(chars1, chars2, c1, c2, - buf, buf_end) - : dvermSearchAligned(chars1, chars2, c1, c2, buf, - buf_end); - if (ptr) { - return ptr; - } - - // Tidy up the mess at the end - ptr = nocase ? dvermPreconditionNocase(chars1, chars2, - buf_end - VERM_BOUNDARY) - : dvermPrecondition(chars1, chars2, buf_end - VERM_BOUNDARY); - - if (ptr) { - return ptr; - } - - /* check for partial match at end */ - u8 mask = nocase ? 
CASE_CLEAR : 0xff; - if ((buf_end[-1] & mask) == (u8)c1) { - DEBUG_PRINTF("partial!!!\n"); - return buf_end - 1; - } - - return buf_end; -} - -static really_inline -const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2, - const u8 *buf, const u8 *buf_end) { - DEBUG_PRINTF("double verm scan (\\x%02hhx&\\x%02hhx)(\\x%02hhx&\\x%02hhx) " - "over %zu bytes\n", c1, m1, c2, m2, (size_t)(buf_end - buf)); - assert(buf < buf_end); - - VERM_TYPE chars1 = VERM_SET_FN(c1); - VERM_TYPE chars2 = VERM_SET_FN(c2); - VERM_TYPE mask1 = VERM_SET_FN(m1); - VERM_TYPE mask2 = VERM_SET_FN(m2); - -#ifdef HAVE_AVX512 - if (buf_end - buf <= VERM_BOUNDARY) { - const u8 *ptr = dvermMiniMasked(chars1, chars2, mask1, mask2, buf, - buf_end); - if (ptr) { - return ptr; - } - - /* check for partial match at end */ - if ((buf_end[-1] & m1) == (u8)c1) { - DEBUG_PRINTF("partial!!!\n"); - return buf_end - 1; - } - - return buf_end; - } -#endif - - assert((buf_end - buf) >= VERM_BOUNDARY); - uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; - if (min) { - // Input isn't aligned, so we need to run one iteration with an - // unaligned load, then skip buf forward to the next aligned address. - // There's some small overlap here, but we don't mind scanning it twice - // if we can do it quickly, do we? - const u8 *p = dvermPreconditionMasked(chars1, chars2, mask1, mask2, buf); - if (p) { - return p; - } - - buf += VERM_BOUNDARY - min; - assert(buf < buf_end); - } - - // Aligned loops from here on in - const u8 *ptr = dvermSearchAlignedMasked(chars1, chars2, mask1, mask2, c1, - c2, m1, m2, buf, buf_end); - if (ptr) { - return ptr; - } - - // Tidy up the mess at the end - ptr = dvermPreconditionMasked(chars1, chars2, mask1, mask2, - buf_end - VERM_BOUNDARY); - - if (ptr) { - return ptr; - } - - /* check for partial match at end */ - if ((buf_end[-1] & m1) == (u8)c1) { - DEBUG_PRINTF("partial!!!\n"); - return buf_end - 1; - } - - return buf_end; -} - -// Reverse vermicelli scan. Provides exact semantics and returns (buf - 1) if -// character not found. -static really_inline -const u8 *rvermicelliExec(char c, char nocase, const u8 *buf, - const u8 *buf_end) { - DEBUG_PRINTF("rev verm scan %s\\x%02hhx over %zu bytes\n", - nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); - assert(buf < buf_end); - - VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ - - // Handle small scans. -#ifdef HAVE_AVX512 - if (buf_end - buf <= VERM_BOUNDARY) { - const u8 *ptr = nocase - ? rvermMiniNocase(chars, buf, buf_end, 0) - : rvermMini(chars, buf, buf_end, 0); - if (ptr) { - return ptr; - } - return buf - 1; - } -#else - if (buf_end - buf < VERM_BOUNDARY) { - for (buf_end--; buf_end >= buf; buf_end--) { - char cur = (char)*buf_end; - if (nocase) { - cur &= CASE_CLEAR; - } - if (cur == c) { - break; - } - } - return buf_end; - } -#endif - - size_t min = (size_t)buf_end % VERM_BOUNDARY; - if (min) { - // Input isn't aligned, so we need to run one iteration with an - // unaligned load, then skip buf backward to the next aligned address. - // There's some small overlap here, but we don't mind scanning it twice - // if we can do it quickly, do we? - const u8 *ptr = nocase ? rvermUnalignNocase(chars, - buf_end - VERM_BOUNDARY, - 0) - : rvermUnalign(chars, buf_end - VERM_BOUNDARY, - 0); - - if (ptr) { - return ptr; - } - - buf_end -= min; - if (buf >= buf_end) { - return buf_end; - } - } - - // Aligned loops from here on in. - const u8 *ptr = nocase ? 
rvermSearchAlignedNocase(chars, buf, buf_end, 0) - : rvermSearchAligned(chars, buf, buf_end, 0); - if (ptr) { - return ptr; - } - - // Tidy up the mess at the end, return buf - 1 if not found. - ptr = nocase ? rvermUnalignNocase(chars, buf, 0) - : rvermUnalign(chars, buf, 0); - return ptr ? ptr : buf - 1; -} - -/* like rvermicelliExec except returns the address of the last character which - * is not c */ -static really_inline -const u8 *rnvermicelliExec(char c, char nocase, const u8 *buf, - const u8 *buf_end) { - DEBUG_PRINTF("rev verm scan %s\\x%02hhx over %zu bytes\n", - nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); - assert(buf < buf_end); - - VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ - - // Handle small scans. -#ifdef HAVE_AVX512 - if (buf_end - buf <= VERM_BOUNDARY) { - const u8 *ptr = nocase - ? rvermMiniNocase(chars, buf, buf_end, 1) - : rvermMini(chars, buf, buf_end, 1); - if (ptr) { - return ptr; - } - return buf - 1; - } -#else - if (buf_end - buf < VERM_BOUNDARY) { - for (buf_end--; buf_end >= buf; buf_end--) { - char cur = (char)*buf_end; - if (nocase) { - cur &= CASE_CLEAR; - } - if (cur != c) { - break; - } - } - return buf_end; - } -#endif - - size_t min = (size_t)buf_end % VERM_BOUNDARY; - if (min) { - // Input isn't aligned, so we need to run one iteration with an - // unaligned load, then skip buf backward to the next aligned address. - // There's some small overlap here, but we don't mind scanning it twice - // if we can do it quickly, do we? - const u8 *ptr = nocase ? rvermUnalignNocase(chars, - buf_end - VERM_BOUNDARY, - 1) - : rvermUnalign(chars, buf_end - VERM_BOUNDARY, - 1); - - if (ptr) { - return ptr; - } - - buf_end -= min; - if (buf >= buf_end) { - return buf_end; - } - } - - // Aligned loops from here on in. - const u8 *ptr = nocase ? rvermSearchAlignedNocase(chars, buf, buf_end, 1) - : rvermSearchAligned(chars, buf, buf_end, 1); - if (ptr) { - return ptr; - } - - // Tidy up the mess at the end, return buf - 1 if not found. - ptr = nocase ? rvermUnalignNocase(chars, buf, 1) - : rvermUnalign(chars, buf, 1); - return ptr ? ptr : buf - 1; -} - -/* returns highest offset of c2 (NOTE: not c1) */ -static really_inline -const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, - const u8 *buf_end) { - DEBUG_PRINTF("rev double verm scan %s\\x%02hhx%02hhx over %zu bytes\n", - nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf)); - assert(buf < buf_end); - - VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */ - VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */ - -#ifdef HAVE_AVX512 - if (buf_end - buf <= VERM_BOUNDARY) { - const u8 *ptr = nocase - ? rdvermMiniNocase(chars1, chars2, buf, buf_end) - : rdvermMini(chars1, chars2, buf, buf_end); - - if (ptr) { - return ptr; - } - - // check for partial match at end ??? - return buf - 1; - } -#endif - - assert((buf_end - buf) >= VERM_BOUNDARY); - size_t min = (size_t)buf_end % VERM_BOUNDARY; - if (min) { - // input not aligned, so we need to run one iteration with an unaligned - // load, then skip buf forward to the next aligned address. There's - // some small overlap here, but we don't mind scanning it twice if we - // can do it quickly, do we? - const u8 *ptr = nocase ? 
rdvermPreconditionNocase(chars1, chars2, - buf_end - VERM_BOUNDARY) - : rdvermPrecondition(chars1, chars2, - buf_end - VERM_BOUNDARY); - - if (ptr) { - return ptr; - } - - buf_end -= min; - if (buf >= buf_end) { - return buf_end; - } - } - - // Aligned loops from here on in - if (nocase) { - return rdvermSearchAlignedNocase(chars1, chars2, c1, c2, buf, buf_end); - } else { - return rdvermSearchAligned(chars1, chars2, c1, c2, buf, buf_end); - } -} - -#endif /* VERMICELLI_H */ diff --git a/src/nfa/vermicelli.hpp b/src/nfa/vermicelli.hpp new file mode 100644 index 000000000..f4958ada3 --- /dev/null +++ b/src/nfa/vermicelli.hpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * Copyright (c) 2021, Arm Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Vermicelli: single-byte and double-byte acceleration. 
+ */ + +#ifndef VERMICELLI_HPP +#define VERMICELLI_HPP + +#include "util/bitutils.h" + +#ifdef HAVE_SVE2 +#include "vermicelli_sve.h" +#else + +#ifdef __cplusplus +extern "C" { +#endif +const u8 *vermicelliExec(char c, char noCase, const u8 *buf, const u8 *buf_end); +#ifdef __cplusplus +} +#endif + +#ifdef __cplusplus +extern "C" { +#endif +const u8 *nvermicelliExec(char c, char noCase, const u8 *buf, const u8 *buf_end); +#ifdef __cplusplus +} +#endif + +#ifdef __cplusplus +extern "C" { +#endif +const u8 *rvermicelliExec(char c, char nocase, const u8 *buf, const u8 *buf_end); +#ifdef __cplusplus +} +#endif + +#ifdef __cplusplus +extern "C" { +#endif +const u8 *rnvermicelliExec(char c, char nocase, const u8 *buf, const u8 *buf_end); +#ifdef __cplusplus +} +#endif + +#ifdef __cplusplus +extern "C" { +#endif +const u8 *vermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, const u8 *buf_end); +#ifdef __cplusplus +} +#endif + +#ifdef __cplusplus +extern "C" { +#endif +const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, const u8 *buf_end); +#ifdef __cplusplus +} +#endif + +#ifdef __cplusplus +extern "C" { +#endif +const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2, const u8 *buf, const u8 *buf_end); +#ifdef __cplusplus +} +#endif + +#endif + +#endif /* VERMICELLI_HPP */ diff --git a/src/nfa/vermicelli_run.h b/src/nfa/vermicelli_run.h index d6fe7ec78..1deda48ae 100644 --- a/src/nfa/vermicelli_run.h +++ b/src/nfa/vermicelli_run.h @@ -26,7 +26,10 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#include "vermicelli.h" +#include "vermicelli.hpp" + +#define VERM_BOUNDARY 16 +#define VERM_TYPE m128 static really_inline const u8 *find_xverm_run(char c, char nocase, u32 repeat, UNUSED const u8 *buf, diff --git a/src/nfa/vermicelli_simd.cpp b/src/nfa/vermicelli_simd.cpp new file mode 100644 index 000000000..67ac1dac8 --- /dev/null +++ b/src/nfa/vermicelli_simd.cpp @@ -0,0 +1,626 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2020-2023, VectorCamp PC + * Copyright (c) 2021, Arm Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Vermicelli: single-byte and double-byte acceleration. + */ + +#include "util/bitutils.h" +#include "util/simd_utils.h" + +#include "vermicelli.hpp" +#include "util/supervector/casemask.hpp" +#include "util/match.hpp" + +template +static really_inline +const u8 *vermicelliBlock(SuperVector const data, SuperVector const chars, SuperVector const casemask, u8 const *buf, u16 const len); + +template +static really_inline +const u8 *vermicelliBlockNeg(SuperVector const data, SuperVector const chars, SuperVector const casemask, u8 const *buf, u16 const len); + +template +static really_inline +const u8 *rvermicelliBlock(SuperVector const data, SuperVector const chars, SuperVector const casemask, u8 const *buf, u16 const len); + +template +static really_inline +const u8 *rvermicelliBlockNeg(SuperVector const data, SuperVector const chars, SuperVector const casemask, const u8 *buf, u16 const len); + +template +static really_inline +const u8 *vermicelliDoubleBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, SuperVector const casemask, + u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len); + +template +static really_inline +const u8 *rvermicelliDoubleBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, SuperVector const casemask, + u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len); + +template +static really_inline +const u8 *vermicelliDoubleMaskedBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, + SuperVector const mask1, SuperVector const mask2, + u8 const c1, u8 const c2, u8 const m1, u8 const m2, u8 const *buf, u16 const len); + +#if defined(VS_SIMDE_BACKEND) +#include "x86/vermicelli.hpp" +#else +#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#include "x86/vermicelli.hpp" +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#include "arm/vermicelli.hpp" +#elif defined(ARCH_PPC64EL) +#include "ppc64el/vermicelli.hpp" +#endif +#endif + +template +static const u8 *vermicelliExecReal(SuperVector const chars, SuperVector const casemask, u8 const *buf, u8 const *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + DEBUG_PRINTF("verm %p len %zu\n", buf, buf_end - buf); + DEBUG_PRINTF("b %s\n", buf); + + const u8 *d = buf; + const u8 *rv; + + __builtin_prefetch(d + 64); + __builtin_prefetch(d + 2*64); + __builtin_prefetch(d + 3*64); + __builtin_prefetch(d + 4*64); + DEBUG_PRINTF("start %p end %p \n", d, buf_end); + assert(d < buf_end); + if (d + S <= buf_end) { + // Reach vector aligned boundaries + DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S)); + if (!ISALIGNED_N(d, S)) { + u8 const *d1 = ROUNDUP_PTR(d, S); + SuperVector data = SuperVector::loadu(d); + rv = vermicelliBlock(data, chars, casemask, d, S); + if (rv) return rv; + d = d1; + } + + while(d + S <= buf_end) { + __builtin_prefetch(d + 64); + DEBUG_PRINTF("d %p \n", d); + SuperVector data = SuperVector::load(d); + rv = 
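/* scan one full aligned vector; a non-NULL result is the first matching position */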
vermicelliBlock(data, chars, casemask, d, S); + if (rv) return rv; + d += S; + } + } + + DEBUG_PRINTF("d %p e %p \n", d, buf_end); + // finish off tail + + if (d != buf_end) { + SuperVector data = SuperVector::loadu(buf_end - S); + rv = vermicelliBlock(data, chars, casemask, buf_end - S, buf_end - d); + DEBUG_PRINTF("rv %p \n", rv); + if (rv && rv < buf_end) return rv; + } + + return buf_end; +} + +template +static const u8 *nvermicelliExecReal(SuperVector const chars, SuperVector const casemask, const u8 *buf, const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + DEBUG_PRINTF("verm %p len %zu\n", buf, buf_end - buf); + DEBUG_PRINTF("b %s\n", buf); + + const u8 *d = buf; + const u8 *rv; + + __builtin_prefetch(d + 64); + __builtin_prefetch(d + 2*64); + __builtin_prefetch(d + 3*64); + __builtin_prefetch(d + 4*64); + DEBUG_PRINTF("start %p end %p \n", d, buf_end); + assert(d < buf_end); + if (d + S <= buf_end) { + // Reach vector aligned boundaries + DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S)); + if (!ISALIGNED_N(d, S)) { + u8 const *d1 = ROUNDUP_PTR(d, S); + SuperVector data = SuperVector::loadu(d); + rv = vermicelliBlockNeg(data, chars, casemask, d, S); + if (rv) return rv; + d = d1; + } + + while(d + S <= buf_end) { + __builtin_prefetch(d + 64); + DEBUG_PRINTF("d %p \n", d); + SuperVector data = SuperVector::load(d); + rv = vermicelliBlockNeg(data, chars, casemask, d, S); + if (rv) return rv; + d += S; + } + } + + DEBUG_PRINTF("d %p e %p \n", d, buf_end); + // finish off tail + + if (d != buf_end) { + SuperVector data = SuperVector::loadu(buf_end - S); + rv = vermicelliBlockNeg(data, chars, casemask, buf_end - S, buf_end - d); + DEBUG_PRINTF("rv %p \n", rv); + if (rv && rv < buf_end) return rv; + } + + return buf_end; +} + +// Reverse vermicelli scan. Provides exact semantics and returns (buf - 1) if +// character not found. +template +const u8 *rvermicelliExecReal(SuperVector const chars, SuperVector const casemask, const u8 *buf, const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + DEBUG_PRINTF("rverm %p len %zu\n", buf, buf_end - buf); + DEBUG_PRINTF("b %s\n", buf); + + const u8 *d = buf_end; + const u8 *rv; + + __builtin_prefetch(d - 64); + __builtin_prefetch(d - 2*64); + __builtin_prefetch(d - 3*64); + __builtin_prefetch(d - 4*64); + DEBUG_PRINTF("start %p end %p \n", buf, d); + assert(d > buf); + if (d - S >= buf) { + // Reach vector aligned boundaries + DEBUG_PRINTF("until aligned %p \n", ROUNDDOWN_PTR(d, S)); + if (!ISALIGNED_N(d, S)) { + u8 const *d1 = ROUNDDOWN_PTR(d, S); + SuperVector data = SuperVector::loadu(d - S); + rv = rvermicelliBlock(data, chars, casemask, d - S, S); + DEBUG_PRINTF("rv %p \n", rv); + if (rv) return rv; + d = d1; + } + + while (d - S >= buf) { + DEBUG_PRINTF("aligned %p \n", d); + // On large packet buffers, this prefetch appears to get us about 2%. + __builtin_prefetch(d - 64); + + d -= S; + SuperVector data = SuperVector::load(d); + rv = rvermicelliBlock(data, chars, casemask, d, S); + if (rv) return rv; + } + } + + DEBUG_PRINTF("tail d %p e %p \n", buf, d); + // finish off head + + if (d != buf) { + SuperVector data = SuperVector::loadu(buf); + rv = rvermicelliBlock(data, chars, casemask, buf, d - buf); + DEBUG_PRINTF("rv %p \n", rv); + if (rv && rv < buf_end) return rv; + } + + return buf - 1; +} + +// Reverse vermicelli scan. Provides exact semantics and returns (buf - 1) if +// character not found. 
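+// This negated variant returns the address of the last character which is not c.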
+template +const u8 *rnvermicelliExecReal(SuperVector const chars, SuperVector const casemask, const u8 *buf, const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + DEBUG_PRINTF("rverm %p len %zu\n", buf, buf_end - buf); + DEBUG_PRINTF("b %s\n", buf); + + const u8 *d = buf_end; + const u8 *rv; + + __builtin_prefetch(d - 64); + __builtin_prefetch(d - 2*64); + __builtin_prefetch(d - 3*64); + __builtin_prefetch(d - 4*64); + DEBUG_PRINTF("start %p end %p \n", buf, d); + assert(d > buf); + if (d - S >= buf) { + // Reach vector aligned boundaries + DEBUG_PRINTF("until aligned %p \n", ROUNDDOWN_PTR(d, S)); + if (!ISALIGNED_N(d, S)) { + u8 const *d1 = ROUNDDOWN_PTR(d, S); + SuperVector data = SuperVector::loadu(d - S); + rv = rvermicelliBlockNeg(data, chars, casemask, d - S, S); + DEBUG_PRINTF("rv %p \n", rv); + if (rv) return rv; + d = d1; + } + + while (d - S >= buf) { + DEBUG_PRINTF("aligned %p \n", d); + // On large packet buffers, this prefetch appears to get us about 2%. + __builtin_prefetch(d - 64); + + d -= S; + SuperVector data = SuperVector::load(d); + rv = rvermicelliBlockNeg(data, chars, casemask, d, S); + if (rv) return rv; + } + } + + DEBUG_PRINTF("tail d %p e %p \n", buf, d); + // finish off head + + if (d != buf) { + SuperVector data = SuperVector::loadu(buf); + rv = rvermicelliBlockNeg(data, chars, casemask, buf, d - buf); + DEBUG_PRINTF("rv %p \n", rv); + if (rv && rv < buf_end) return rv; + } + + return buf - 1; +} + +template +static const u8 *vermicelliDoubleExecReal(u8 const c1, u8 const c2, SuperVector const casemask, + const u8 *buf, const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + DEBUG_PRINTF("verm %p len %zu\n", buf, buf_end - buf); + DEBUG_PRINTF("b %s\n", buf); + + const u8 *d = buf; + const u8 *rv; + // SuperVector lastmask1{0}; + const SuperVector chars1 = SuperVector::dup_u8(c1); + const SuperVector chars2 = SuperVector::dup_u8(c2); + const u8 casechar = casemask.u.u8[0]; + + __builtin_prefetch(d + 64); + __builtin_prefetch(d + 2*64); + __builtin_prefetch(d + 3*64); + __builtin_prefetch(d + 4*64); + DEBUG_PRINTF("start %p end %p \n", d, buf_end); + assert(d < buf_end); + if (d + S < buf_end) { + // Reach vector aligned boundaries + DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S)); + if (!ISALIGNED_N(d, S)) { + u8 const *d1 = ROUNDUP_PTR(d, S); + SuperVector data = SuperVector::loadu(d); + rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d + S, S); + if (rv) return rv - S; + d = d1; + } + + while(d + S < buf_end) { + __builtin_prefetch(d + 64); + DEBUG_PRINTF("d %p \n", d); + SuperVector data = SuperVector::load(d); + rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d + S, S); + if (rv) return rv - S; + d += S; + } + } + + DEBUG_PRINTF("tail d %p e %p \n", d, buf_end); + // finish off tail + + if (d != buf_end) { + SuperVector data = SuperVector::Zeroes(); + if (buf_end - d < S) { + memcpy(&data.u, d, buf_end - d); + } else { + data = SuperVector::loadu(d); + } + rv = vermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d, buf_end - d); + DEBUG_PRINTF("rv %p \n", rv); + if (rv && rv < buf_end) return rv; + } + + DEBUG_PRINTF("real tail d %p e %p \n", d, buf_end); + /* check for partial match at end */ + u8 mask = casemask.u.u8[0]; + if ((buf_end[-1] & mask) == (u8)c1) { + DEBUG_PRINTF("partial!!!\n"); + return buf_end - 1; + } + + return buf_end; +} + +// /* returns highest offset of c2 (NOTE: not c1) */ +template +const u8 
*rvermicelliDoubleExecReal(char c1, char c2, SuperVector const casemask, const u8 *buf, const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + DEBUG_PRINTF("rverm %p len %zu\n", buf, buf_end - buf); + DEBUG_PRINTF("b %s\n", buf); + char s[255]; + snprintf(s, buf_end - buf + 1, "%s", buf); + DEBUG_PRINTF("b %s\n", s); + + const u8 *d = buf_end; + const u8 *rv; + const SuperVector chars1 = SuperVector::dup_u8(c1); + const SuperVector chars2 = SuperVector::dup_u8(c2); + const u8 casechar = casemask.u.u8[0]; + + __builtin_prefetch(d - 64); + __builtin_prefetch(d - 2*64); + __builtin_prefetch(d - 3*64); + __builtin_prefetch(d - 4*64); + DEBUG_PRINTF("start %p end %p \n", buf, d); + assert(d > buf); + if (d - S > buf) { + // Reach vector aligned boundaries + DEBUG_PRINTF("until aligned %p \n", ROUNDDOWN_PTR(d, S)); + if (!ISALIGNED_N(d, S)) { + u8 const *d1 = ROUNDDOWN_PTR(d, S); + SuperVector data = SuperVector::loadu(d - S); + rv = rvermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d - S, S); + DEBUG_PRINTF("rv %p \n", rv); + if (rv && rv < buf_end) return rv; + d = d1; + } + + while (d - S > buf) { + DEBUG_PRINTF("aligned %p \n", d); + // On large packet buffers, this prefetch appears to get us about 2%. + __builtin_prefetch(d - 64); + + d -= S; + SuperVector data = SuperVector::load(d); + rv = rvermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, d, S); + if (rv) return rv; + } + } + + DEBUG_PRINTF("tail d %p e %p \n", buf, d); + // finish off head + + if (d != buf) { + SuperVector data = SuperVector::Zeroes(); + if (d - buf < S) { + memcpy(&data.u, buf, d - buf); + } else { + data = SuperVector::loadu(buf); + } + rv = rvermicelliDoubleBlock(data, chars1, chars2, casemask, c1, c2, casechar, buf, d - buf); + DEBUG_PRINTF("rv %p \n", rv); + if (rv && rv < buf_end) return rv; + } + + return buf - 1; +} + +template +static const u8 *vermicelliDoubleMaskedExecReal(u8 const c1, u8 const c2, u8 const m1, u8 const m2, + const u8 *buf, const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + DEBUG_PRINTF("verm %p len %zu\n", buf, buf_end - buf); + DEBUG_PRINTF("b %s\n", buf); + + const u8 *d = buf; + const u8 *rv; + // SuperVector lastmask1{0}; + const SuperVector chars1 = SuperVector::dup_u8(c1); + const SuperVector chars2 = SuperVector::dup_u8(c2); + const SuperVector mask1 = SuperVector::dup_u8(m1); + const SuperVector mask2 = SuperVector::dup_u8(m2); + + __builtin_prefetch(d + 64); + __builtin_prefetch(d + 2*64); + __builtin_prefetch(d + 3*64); + __builtin_prefetch(d + 4*64); + DEBUG_PRINTF("start %p end %p \n", d, buf_end); + assert(d < buf_end); + if (d + S < buf_end) { + // Reach vector aligned boundaries + DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S)); + if (!ISALIGNED_N(d, S)) { + u8 const *d1 = ROUNDUP_PTR(d, S); + SuperVector data = SuperVector::loadu(d); + rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d + S, S); + if (rv) return rv - S; + d = d1; + } + + while(d + S < buf_end) { + __builtin_prefetch(d + 64); + DEBUG_PRINTF("d %p \n", d); + SuperVector data = SuperVector::load(d); + rv = vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d + S, S); + if (rv) return rv - S; + d += S; + } + } + + DEBUG_PRINTF("tail d %p e %p \n", d, buf_end); + // finish off tail + + if (d != buf_end) { + SuperVector data = SuperVector::Zeroes(); + if (buf_end - d < S) { + memcpy(&data.u, d, buf_end - d); + } else { + data = SuperVector::loadu(d); + } + rv 
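/* masked double-byte check over the final, possibly partial, block */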
= vermicelliDoubleMaskedBlock(data, chars1, chars2, mask1, mask2, c1, c2, m1, m2, d, buf_end - d); + DEBUG_PRINTF("rv %p \n", rv); + if (rv && rv < buf_end) return rv; + } + + DEBUG_PRINTF("real tail d %p e %p \n", d, buf_end); + /* check for partial match at end */ + if ((buf_end[-1] & m1) == (u8)c1) { + DEBUG_PRINTF("partial!!!\n"); + return buf_end - 1; + } + + return buf_end; +} + +extern "C" const u8 *vermicelliExec(char c, char nocase, const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("verm scan %s\\x%02hhx over %zu bytes\n", + nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + // Small ranges. + if (buf_end - buf < VECTORSIZE) { + for (; buf < buf_end; buf++) { + char cur = (char)*buf; + if (nocase) { + cur &= CASE_CLEAR; + } + if (cur == c) { + break; + } + } + return buf; + } + + const SuperVector chars = SuperVector::dup_u8(c); + const SuperVector casemask{nocase ? getCaseMask() : SuperVector::Ones()}; + + return vermicelliExecReal(chars, casemask, buf, buf_end); +} + +/* like vermicelliExec except returns the address of the first character which + * is not c */ +extern "C" const u8 *nvermicelliExec(char c, char nocase, const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("nverm scan %s\\x%02hhx over %zu bytes\n", + nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + // Small ranges. + if (buf_end - buf < VECTORSIZE) { + for (; buf < buf_end; buf++) { + char cur = *buf; + if (nocase) { + cur &= CASE_CLEAR; + } + if (cur != c) { + break; + } + } + return buf; + } + + const SuperVector chars = SuperVector::dup_u8(c); + const SuperVector casemask{nocase ? getCaseMask() : SuperVector::Ones()}; + + return nvermicelliExecReal(chars, casemask, buf, buf_end); +} + +extern "C" const u8 *rvermicelliExec(char c, char nocase, const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("rev verm scan %s\\x%02hhx over %zu bytes\n", + nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + // Small ranges. + if (buf_end - buf < VECTORSIZE) { + for (buf_end--; buf_end >= buf; buf_end--) { + char cur = (char)*buf_end; + if (nocase) { + cur &= CASE_CLEAR; + } + if (cur == c) { + break; + } + } + return buf_end; + } + + const SuperVector chars = SuperVector::dup_u8(c); + const SuperVector casemask{nocase ? getCaseMask() : SuperVector::Ones()}; + + return rvermicelliExecReal(chars, casemask, buf, buf_end); +} + +extern "C" const u8 *rnvermicelliExec(char c, char nocase, const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("rev verm scan %s\\x%02hhx over %zu bytes\n", + nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + // Small ranges. + if (buf_end - buf < VECTORSIZE) { + for (buf_end--; buf_end >= buf; buf_end--) { + char cur = (char)*buf_end; + if (nocase) { + cur &= CASE_CLEAR; + } + if (cur != c) { + break; + } + } + return buf_end; + } + + const SuperVector chars = SuperVector::dup_u8(c); + const SuperVector casemask{nocase ? getCaseMask() : SuperVector::Ones()}; + + return rnvermicelliExecReal(chars, casemask, buf, buf_end); +} + +extern "C" const u8 *vermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("double verm scan %s\\x%02hhx%02hhx over %zu bytes\n", + nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + const SuperVector casemask{nocase ? 
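/* caseless scans compare against case-cleared bytes; an all-ones mask leaves the data unchanged */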
getCaseMask() : SuperVector::Ones()}; + + return vermicelliDoubleExecReal(c1, c2, casemask, buf, buf_end); +} + +extern "C" const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("rev double verm scan %s\\x%02hhx%02hhx over %zu bytes\n", + nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + const SuperVector casemask{nocase ? getCaseMask() : SuperVector::Ones()}; + + return rvermicelliDoubleExecReal(c1, c2, casemask, buf, buf_end); +} + +extern "C" const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2, + const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("double verm scan (\\x%02hhx&\\x%02hhx)(\\x%02hhx&\\x%02hhx) " + "over %zu bytes\n", c1, m1, c2, m2, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + return vermicelliDoubleMaskedExecReal(c1, c2, m1, m2, buf, buf_end); +} diff --git a/src/nfa/vermicelli_sse.h b/src/nfa/vermicelli_sse.h deleted file mode 100644 index 3307486cf..000000000 --- a/src/nfa/vermicelli_sse.h +++ /dev/null @@ -1,889 +0,0 @@ -/* - * Copyright (c) 2015-2020, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief Vermicelli: Intel SSE implementation. 
- * - * (users should include vermicelli.h) - */ - -#if !defined(HAVE_AVX512) - -#define VERM_BOUNDARY 16 -#define VERM_TYPE m128 -#define VERM_SET_FN set16x8 - -static really_inline -const u8 *vermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end, - char negate) { - assert((size_t)buf % 16 == 0); - for (; buf + 31 < buf_end; buf += 32) { - m128 data = load128(buf); - u32 z1 = movemask128(eq128(chars, data)); - m128 data2 = load128(buf + 16); - u32 z2 = movemask128(eq128(chars, data2)); - u32 z = z1 | (z2 << 16); - if (negate) { - z = ~z; - } - if (unlikely(z)) { - u32 pos = ctz32(z); - return buf + pos; - } - } - for (; buf + 15 < buf_end; buf += 16) { - m128 data = load128(buf); - u32 z = movemask128(eq128(chars, data)); - if (negate) { - z = ~z & 0xffff; - } - if (unlikely(z)) { - u32 pos = ctz32(z); - return buf + pos; - } - } - return NULL; -} - -static really_inline -const u8 *vermSearchAlignedNocase(m128 chars, const u8 *buf, - const u8 *buf_end, char negate) { - assert((size_t)buf % 16 == 0); - m128 casemask = set16x8(CASE_CLEAR); - - for (; buf + 31 < buf_end; buf += 32) { - m128 data = load128(buf); - u32 z1 = movemask128(eq128(chars, and128(casemask, data))); - m128 data2 = load128(buf + 16); - u32 z2 = movemask128(eq128(chars, and128(casemask, data2))); - u32 z = z1 | (z2 << 16); - if (negate) { - z = ~z; - } - if (unlikely(z)) { - u32 pos = ctz32(z); - return buf + pos; - } - } - - for (; buf + 15 < buf_end; buf += 16) { - m128 data = load128(buf); - u32 z = movemask128(eq128(chars, and128(casemask, data))); - if (negate) { - z = ~z & 0xffff; - } - if (unlikely(z)) { - u32 pos = ctz32(z); - return buf + pos; - } - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *vermUnalign(m128 chars, const u8 *buf, char negate) { - m128 data = loadu128(buf); // unaligned - u32 z = movemask128(eq128(chars, data)); - if (negate) { - z = ~z & 0xffff; - } - if (unlikely(z)) { - return buf + ctz32(z); - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *vermUnalignNocase(m128 chars, const u8 *buf, char negate) { - m128 casemask = set16x8(CASE_CLEAR); - m128 data = loadu128(buf); // unaligned - u32 z = movemask128(eq128(chars, and128(casemask, data))); - if (negate) { - z = ~z & 0xffff; - } - if (unlikely(z)) { - return buf + ctz32(z); - } - return NULL; -} - -static really_inline -const u8 *dvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2, - const u8 *buf, const u8 *buf_end) { - for (; buf + 16 < buf_end; buf += 16) { - m128 data = load128(buf); - u32 z = movemask128(and128(eq128(chars1, data), - rshiftbyte_m128(eq128(chars2, data), 1))); - if (buf[15] == c1 && buf[16] == c2) { - z |= (1 << 15); - } - if (unlikely(z)) { - u32 pos = ctz32(z); - return buf + pos; - } - } - - return NULL; -} - -static really_inline -const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, - const u8 *buf, const u8 *buf_end) { - assert((size_t)buf % 16 == 0); - m128 casemask = set16x8(CASE_CLEAR); - - for (; buf + 16 < buf_end; buf += 16) { - m128 data = load128(buf); - m128 v = and128(casemask, data); - u32 z = movemask128(and128(eq128(chars1, v), - rshiftbyte_m128(eq128(chars2, v), 1))); - if ((buf[15] & CASE_CLEAR) == c1 && (buf[16] & CASE_CLEAR) == c2) { - z |= (1 << 15); - } - if (unlikely(z)) { - u32 pos = ctz32(z); - return buf + pos; - } - } - - return NULL; -} - -static really_inline -const u8 *dvermSearchAlignedMasked(m128 chars1, m128 chars2, - m128 mask1, m128 mask2, u8 c1, u8 c2, u8 m1, - u8 m2, 
const u8 *buf, const u8 *buf_end) { - assert((size_t)buf % 16 == 0); - - for (; buf + 16 < buf_end; buf += 16) { - m128 data = load128(buf); - m128 v1 = eq128(chars1, and128(data, mask1)); - m128 v2 = eq128(chars2, and128(data, mask2)); - u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1))); - - if ((buf[15] & m1) == c1 && (buf[16] & m2) == c2) { - z |= (1 << 15); - } - if (unlikely(z)) { - u32 pos = ctz32(z); - return buf + pos; - } - } - - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { - m128 data = loadu128(buf); // unaligned - u32 z = movemask128(and128(eq128(chars1, data), - rshiftbyte_m128(eq128(chars2, data), 1))); - - /* no fixup of the boundary required - the aligned run will pick it up */ - if (unlikely(z)) { - u32 pos = ctz32(z); - return buf + pos; - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { - /* due to laziness, nonalphas and nocase having interesting behaviour */ - m128 casemask = set16x8(CASE_CLEAR); - m128 data = loadu128(buf); // unaligned - m128 v = and128(casemask, data); - u32 z = movemask128(and128(eq128(chars1, v), - rshiftbyte_m128(eq128(chars2, v), 1))); - - /* no fixup of the boundary required - the aligned run will pick it up */ - if (unlikely(z)) { - u32 pos = ctz32(z); - return buf + pos; - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *dvermPreconditionMasked(m128 chars1, m128 chars2, - m128 mask1, m128 mask2, const u8 *buf) { - m128 data = loadu128(buf); // unaligned - m128 v1 = eq128(chars1, and128(data, mask1)); - m128 v2 = eq128(chars2, and128(data, mask2)); - u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1))); - - /* no fixup of the boundary required - the aligned run will pick it up */ - if (unlikely(z)) { - u32 pos = ctz32(z); - return buf + pos; - } - return NULL; -} - -static really_inline -const u8 *lastMatchOffset(const u8 *buf_end, u32 z) { - assert(z); - return buf_end - 16 + 31 - clz32(z); -} - -static really_inline -const u8 *rvermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end, - char negate) { - assert((size_t)buf_end % 16 == 0); - for (; buf + 15 < buf_end; buf_end -= 16) { - m128 data = load128(buf_end - 16); - u32 z = movemask128(eq128(chars, data)); - if (negate) { - z = ~z & 0xffff; - } - if (unlikely(z)) { - return lastMatchOffset(buf_end, z); - } - } - return NULL; -} - -static really_inline -const u8 *rvermSearchAlignedNocase(m128 chars, const u8 *buf, - const u8 *buf_end, char negate) { - assert((size_t)buf_end % 16 == 0); - m128 casemask = set16x8(CASE_CLEAR); - - for (; buf + 15 < buf_end; buf_end -= 16) { - m128 data = load128(buf_end - 16); - u32 z = movemask128(eq128(chars, and128(casemask, data))); - if (negate) { - z = ~z & 0xffff; - } - if (unlikely(z)) { - return lastMatchOffset(buf_end, z); - } - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *rvermUnalign(m128 chars, const u8 *buf, char negate) { - m128 data = loadu128(buf); // unaligned - u32 z = movemask128(eq128(chars, data)); - if (negate) { - z = ~z & 0xffff; - } - if (unlikely(z)) { - return lastMatchOffset(buf + 16, z); - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *rvermUnalignNocase(m128 chars, const u8 *buf, char negate) { - m128 casemask = set16x8(CASE_CLEAR); - m128 data = loadu128(buf); // unaligned - u32 z = 
movemask128(eq128(chars, and128(casemask, data))); - if (negate) { - z = ~z & 0xffff; - } - if (unlikely(z)) { - return lastMatchOffset(buf + 16, z); - } - return NULL; -} - -static really_inline -const u8 *rdvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2, - const u8 *buf, const u8 *buf_end) { - assert((size_t)buf_end % 16 == 0); - - for (; buf + 16 < buf_end; buf_end -= 16) { - m128 data = load128(buf_end - 16); - u32 z = movemask128(and128(eq128(chars2, data), - lshiftbyte_m128(eq128(chars1, data), 1))); - if (buf_end[-17] == c1 && buf_end[-16] == c2) { - z |= 1; - } - if (unlikely(z)) { - return lastMatchOffset(buf_end, z); - } - } - return buf_end; -} - -static really_inline -const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, - const u8 *buf, const u8 *buf_end) { - assert((size_t)buf_end % 16 == 0); - m128 casemask = set16x8(CASE_CLEAR); - - for (; buf + 16 < buf_end; buf_end -= 16) { - m128 data = load128(buf_end - 16); - m128 v = and128(casemask, data); - u32 z = movemask128(and128(eq128(chars2, v), - lshiftbyte_m128(eq128(chars1, v), 1))); - if ((buf_end[-17] & CASE_CLEAR) == c1 - && (buf_end[-16] & CASE_CLEAR) == c2) { - z |= 1; - } - if (unlikely(z)) { - return lastMatchOffset(buf_end, z); - } - } - return buf_end; -} - -// returns NULL if not found -static really_inline -const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { - m128 data = loadu128(buf); - u32 z = movemask128(and128(eq128(chars2, data), - lshiftbyte_m128(eq128(chars1, data), 1))); - - /* no fixup of the boundary required - the aligned run will pick it up */ - if (unlikely(z)) { - return lastMatchOffset(buf + 16, z); - } - - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { - /* due to laziness, nonalphas and nocase having interesting behaviour */ - m128 casemask = set16x8(CASE_CLEAR); - m128 data = loadu128(buf); - m128 v = and128(casemask, data); - u32 z = movemask128(and128(eq128(chars2, v), - lshiftbyte_m128(eq128(chars1, v), 1))); - /* no fixup of the boundary required - the aligned run will pick it up */ - if (unlikely(z)) { - return lastMatchOffset(buf + 16, z); - } - - return NULL; -} - -#else // HAVE_AVX512 - -#define VERM_BOUNDARY 64 -#define VERM_TYPE m512 -#define VERM_SET_FN set64x8 - -static really_inline -const u8 *vermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) { - uintptr_t len = buf_end - buf; - __mmask64 mask = (~0ULL) >> (64 - len); - m512 data = loadu_maskz_m512(mask, buf); - - u64a z = eq512mask(chars, data); - - if (negate) { - z = ~z & mask; - } - z &= mask; - if (unlikely(z)) { - return buf + ctz64(z); - } - return NULL; -} - -static really_inline -const u8 *vermMiniNocase(m512 chars, const u8 *buf, const u8 *buf_end, - char negate) { - uintptr_t len = buf_end - buf; - __mmask64 mask = (~0ULL) >> (64 - len); - m512 data = loadu_maskz_m512(mask, buf); - m512 casemask = set64x8(CASE_CLEAR); - m512 v = and512(casemask, data); - - u64a z = eq512mask(chars, v); - - if (negate) { - z = ~z & mask; - } - z &= mask; - if (unlikely(z)) { - return buf + ctz64(z); - } - return NULL; -} - -static really_inline -const u8 *vermSearchAligned(m512 chars, const u8 *buf, const u8 *buf_end, - char negate) { - assert((size_t)buf % 64 == 0); - for (; buf + 63 < buf_end; buf += 64) { - m512 data = load512(buf); - u64a z = eq512mask(chars, data); - if (negate) { - z = ~z & ~0ULL; - } - if (unlikely(z)) { - u64a pos = ctz64(z); - return buf + pos; 
- } - } - return NULL; -} - -static really_inline -const u8 *vermSearchAlignedNocase(m512 chars, const u8 *buf, - const u8 *buf_end, char negate) { - assert((size_t)buf % 64 == 0); - m512 casemask = set64x8(CASE_CLEAR); - - for (; buf + 63 < buf_end; buf += 64) { - m512 data = load512(buf); - u64a z = eq512mask(chars, and512(casemask, data)); - if (negate) { - z = ~z & ~0ULL; - } - if (unlikely(z)) { - u64a pos = ctz64(z); - return buf + pos; - } - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *vermUnalign(m512 chars, const u8 *buf, char negate) { - m512 data = loadu512(buf); // unaligned - u64a z = eq512mask(chars, data); - if (negate) { - z = ~z & ~0ULL; - } - if (unlikely(z)) { - return buf + ctz64(z); - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *vermUnalignNocase(m512 chars, const u8 *buf, char negate) { - m512 casemask = set64x8(CASE_CLEAR); - m512 data = loadu512(buf); // unaligned - u64a z = eq512mask(chars, and512(casemask, data)); - if (negate) { - z = ~z & ~0ULL; - } - if (unlikely(z)) { - return buf + ctz64(z); - } - return NULL; -} - -static really_inline -const u8 *dvermMini(m512 chars1, m512 chars2, const u8 *buf, - const u8 *buf_end) { - uintptr_t len = buf_end - buf; - __mmask64 mask = (~0ULL) >> (64 - len); - m512 data = loadu_maskz_m512(mask, buf); - - u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1); - - z &= mask; - if (unlikely(z)) { - u64a pos = ctz64(z); - return buf + pos; - } - return NULL; -} - -static really_inline -const u8 *dvermMiniNocase(m512 chars1, m512 chars2, const u8 *buf, - const u8 *buf_end) { - uintptr_t len = buf_end - buf; - __mmask64 mask = (~0ULL) >> (64 - len); - m512 data = loadu_maskz_m512(mask, buf); - m512 casemask = set64x8(CASE_CLEAR); - m512 v = and512(casemask, data); - - u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1); - - z &= mask; - if (unlikely(z)) { - u64a pos = ctz64(z); - return buf + pos; - } - return NULL; -} - -static really_inline -const u8 *dvermMiniMasked(m512 chars1, m512 chars2, m512 mask1, m512 mask2, - const u8 *buf, const u8 *buf_end) { - uintptr_t len = buf_end - buf; - __mmask64 mask = (~0ULL) >> (64 - len); - m512 data = loadu_maskz_m512(mask, buf); - m512 v1 = and512(data, mask1); - m512 v2 = and512(data, mask2); - - u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1); - - z &= mask; - if (unlikely(z)) { - u64a pos = ctz64(z); - return buf + pos; - } - return NULL; -} - -static really_inline -const u8 *dvermSearchAligned(m512 chars1, m512 chars2, u8 c1, u8 c2, - const u8 *buf, const u8 *buf_end) { - for (; buf + 64 < buf_end; buf += 64) { - m512 data = load512(buf); - u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1); - if (buf[63] == c1 && buf[64] == c2) { - z |= (1ULL << 63); - } - if (unlikely(z)) { - u64a pos = ctz64(z); - return buf + pos; - } - } - - return NULL; -} - -static really_inline -const u8 *dvermSearchAlignedNocase(m512 chars1, m512 chars2, u8 c1, u8 c2, - const u8 *buf, const u8 *buf_end) { - assert((size_t)buf % 64 == 0); - m512 casemask = set64x8(CASE_CLEAR); - - for (; buf + 64 < buf_end; buf += 64) { - m512 data = load512(buf); - m512 v = and512(casemask, data); - u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1); - if ((buf[63] & CASE_CLEAR) == c1 && (buf[64] & CASE_CLEAR) == c2) { - z |= (1ULL << 63); - } - if (unlikely(z)) { - u64a pos = ctz64(z); - return buf + pos; - } - } - - return NULL; -} - -static really_inline -const u8 
*dvermSearchAlignedMasked(m512 chars1, m512 chars2, - m512 mask1, m512 mask2, u8 c1, u8 c2, u8 m1, - u8 m2, const u8 *buf, const u8 *buf_end) { - assert((size_t)buf % 64 == 0); - - for (; buf + 64 < buf_end; buf += 64) { - m512 data = load512(buf); - m512 v1 = and512(data, mask1); - m512 v2 = and512(data, mask2); - u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1); - - if ((buf[63] & m1) == c1 && (buf[64] & m2) == c2) { - z |= (1ULL << 63); - } - if (unlikely(z)) { - u64a pos = ctz64(z); - return buf + pos; - } - } - - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *dvermPrecondition(m512 chars1, m512 chars2, const u8 *buf) { - m512 data = loadu512(buf); // unaligned - u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1); - - /* no fixup of the boundary required - the aligned run will pick it up */ - if (unlikely(z)) { - u64a pos = ctz64(z); - return buf + pos; - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *dvermPreconditionNocase(m512 chars1, m512 chars2, const u8 *buf) { - /* due to laziness, nonalphas and nocase having interesting behaviour */ - m512 casemask = set64x8(CASE_CLEAR); - m512 data = loadu512(buf); // unaligned - m512 v = and512(casemask, data); - u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1); - - /* no fixup of the boundary required - the aligned run will pick it up */ - if (unlikely(z)) { - u64a pos = ctz64(z); - return buf + pos; - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *dvermPreconditionMasked(m512 chars1, m512 chars2, - m512 mask1, m512 mask2, const u8 *buf) { - m512 data = loadu512(buf); // unaligned - m512 v1 = and512(data, mask1); - m512 v2 = and512(data, mask2); - u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1); - - /* no fixup of the boundary required - the aligned run will pick it up */ - if (unlikely(z)) { - u64a pos = ctz64(z); - return buf + pos; - } - return NULL; -} - -static really_inline -const u8 *lastMatchOffset(const u8 *buf_end, u64a z) { - assert(z); - return buf_end - 64 + 63 - clz64(z); -} - -static really_inline -const u8 *rvermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) { - uintptr_t len = buf_end - buf; - __mmask64 mask = (~0ULL) >> (64 - len); - m512 data = loadu_maskz_m512(mask, buf); - - u64a z = eq512mask(chars, data); - - if (negate) { - z = ~z & mask; - } - z &= mask; - if (unlikely(z)) { - return lastMatchOffset(buf + 64, z); - } - return NULL; -} - -static really_inline -const u8 *rvermMiniNocase(m512 chars, const u8 *buf, const u8 *buf_end, - char negate) { - uintptr_t len = buf_end - buf; - __mmask64 mask = (~0ULL) >> (64 - len); - m512 data = loadu_maskz_m512(mask, buf); - m512 casemask = set64x8(CASE_CLEAR); - m512 v = and512(casemask, data); - - u64a z = eq512mask(chars, v); - - if (negate) { - z = ~z & mask; - } - z &= mask; - if (unlikely(z)) { - return lastMatchOffset(buf + 64, z); - } - return NULL; -} - -static really_inline -const u8 *rvermSearchAligned(m512 chars, const u8 *buf, const u8 *buf_end, - char negate) { - assert((size_t)buf_end % 64 == 0); - for (; buf + 63 < buf_end; buf_end -= 64) { - m512 data = load512(buf_end - 64); - u64a z = eq512mask(chars, data); - if (negate) { - z = ~z & ~0ULL; - } - if (unlikely(z)) { - return lastMatchOffset(buf_end, z); - } - } - return NULL; -} - -static really_inline -const u8 *rvermSearchAlignedNocase(m512 chars, const u8 *buf, - const u8 *buf_end, char negate) { - assert((size_t)buf_end % 64 == 
0); - m512 casemask = set64x8(CASE_CLEAR); - - for (; buf + 63 < buf_end; buf_end -= 64) { - m512 data = load512(buf_end - 64); - u64a z = eq512mask(chars, and512(casemask, data)); - if (negate) { - z = ~z & ~0ULL; - } - if (unlikely(z)) { - return lastMatchOffset(buf_end, z); - } - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *rvermUnalign(m512 chars, const u8 *buf, char negate) { - m512 data = loadu512(buf); // unaligned - u64a z = eq512mask(chars, data); - if (negate) { - z = ~z & ~0ULL; - } - if (unlikely(z)) { - return lastMatchOffset(buf + 64, z); - } - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *rvermUnalignNocase(m512 chars, const u8 *buf, char negate) { - m512 casemask = set64x8(CASE_CLEAR); - m512 data = loadu512(buf); // unaligned - u64a z = eq512mask(chars, and512(casemask, data)); - if (negate) { - z = ~z & ~0ULL; - } - if (unlikely(z)) { - return lastMatchOffset(buf + 64, z); - } - return NULL; -} - -static really_inline -const u8 *rdvermMini(m512 chars1, m512 chars2, const u8 *buf, - const u8 *buf_end) { - uintptr_t len = buf_end - buf; - __mmask64 mask = (~0ULL) >> (64 - len); - m512 data = loadu_maskz_m512(mask, buf); - - u64a z = eq512mask(chars2, data) & (eq512mask(chars1, data) << 1); - - z &= mask; - if (unlikely(z)) { - return lastMatchOffset(buf + 64, z); - } - return NULL; -} - -static really_inline -const u8 *rdvermMiniNocase(m512 chars1, m512 chars2, const u8 *buf, - const u8 *buf_end) { - uintptr_t len = buf_end - buf; - __mmask64 mask = (~0ULL) >> (64 - len); - m512 data = loadu_maskz_m512(mask, buf); - m512 casemask = set64x8(CASE_CLEAR); - m512 v = and512(casemask, data); - - u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1); - - z &= mask; - if (unlikely(z)) { - return lastMatchOffset(buf + 64, z); - } - return NULL; -} - -static really_inline -const u8 *rdvermSearchAligned(m512 chars1, m512 chars2, u8 c1, u8 c2, - const u8 *buf, const u8 *buf_end) { - assert((size_t)buf_end % 64 == 0); - - for (; buf + 64 < buf_end; buf_end -= 64) { - m512 data = load512(buf_end - 64); - u64a z = eq512mask(chars2, data) & (eq512mask(chars1, data) << 1); - if (buf_end[-65] == c1 && buf_end[-64] == c2) { - z |= 1; - } - if (unlikely(z)) { - return lastMatchOffset(buf_end, z); - } - } - return buf_end; -} - -static really_inline -const u8 *rdvermSearchAlignedNocase(m512 chars1, m512 chars2, u8 c1, u8 c2, - const u8 *buf, const u8 *buf_end) { - assert((size_t)buf_end % 64 == 0); - m512 casemask = set64x8(CASE_CLEAR); - - for (; buf + 64 < buf_end; buf_end -= 64) { - m512 data = load512(buf_end - 64); - m512 v = and512(casemask, data); - u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1); - if ((buf_end[-65] & CASE_CLEAR) == c1 - && (buf_end[-64] & CASE_CLEAR) == c2) { - z |= 1; - } - if (unlikely(z)) { - return lastMatchOffset(buf_end, z); - } - } - return buf_end; -} - -// returns NULL if not found -static really_inline -const u8 *rdvermPrecondition(m512 chars1, m512 chars2, const u8 *buf) { - m512 data = loadu512(buf); - u64a z = eq512mask(chars2, data) & (eq512mask(chars1, data) << 1); - - // no fixup of the boundary required - the aligned run will pick it up - if (unlikely(z)) { - return lastMatchOffset(buf + 64, z); - } - - return NULL; -} - -// returns NULL if not found -static really_inline -const u8 *rdvermPreconditionNocase(m512 chars1, m512 chars2, const u8 *buf) { - // due to laziness, nonalphas and nocase having interesting behaviour - m512 casemask = set64x8(CASE_CLEAR); - 
m512 data = loadu512(buf); - m512 v = and512(casemask, data); - u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1); - // no fixup of the boundary required - the aligned run will pick it up - if (unlikely(z)) { - return lastMatchOffset(buf + 64, z); - } - - return NULL; -} - -#endif // HAVE_AVX512 diff --git a/src/nfa/vermicelli_sve.h b/src/nfa/vermicelli_sve.h new file mode 100644 index 000000000..13f843417 --- /dev/null +++ b/src/nfa/vermicelli_sve.h @@ -0,0 +1,588 @@ +/* + * Copyright (c) 2021, Arm Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Vermicelli: AArch64 SVE implementation. + * + * (users should include vermicelli.h instead of this) + */ + +static really_inline +int dvermSearchGetOffset(svbool_t matched, svbool_t matched_rot) { + int offset = accelSearchGetOffset(matched); + int offset_rot = accelSearchGetOffset(matched_rot) - 1; + return (offset_rot < offset) ? offset_rot : offset; +} + +static really_inline +uint64_t rdvermSearchGetSingleOffset(svbool_t matched) { + return svcntp_b8(svptrue_b8(), svbrkb_z(svptrue_b8(), svrev_b8(matched))); +} + +static really_inline +uint64_t rdvermSearchGetOffset(svbool_t matched, svbool_t matched_rot) { + uint64_t offset = rdvermSearchGetSingleOffset(matched); + uint64_t offset_rot = rdvermSearchGetSingleOffset(matched_rot) - 1; + return (offset_rot < offset) ? 
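/* the smaller offset is the match nearest the end of the vector */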
offset_rot : offset; +} + +static really_inline +const u8 *dvermSearchCheckMatched(const u8 *buf, svbool_t matched, + svbool_t matched_rot, svbool_t any) { + if (unlikely(svptest_any(svptrue_b8(), any))) { + const u8 *matchPos = buf + dvermSearchGetOffset(matched, matched_rot); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; + } + return NULL; +} + +static really_inline +const u8 *rdvermSearchCheckMatched(const u8 *buf, svbool_t matched, + svbool_t matched_rot, svbool_t any) { + if (unlikely(svptest_any(svptrue_b8(), any))) { + const u8 *matchPos = buf + (svcntb() - + rdvermSearchGetOffset(matched, matched_rot)); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; + } + return NULL; +} + +static really_inline +svbool_t singleMatched(svuint8_t chars, const u8 *buf, svbool_t pg, + bool negate, const int64_t vnum) { + svuint8_t vec = svld1_vnum_u8(pg, buf, vnum); + if (negate) { + return svnmatch(pg, vec, chars); + } else { + return svmatch(pg, vec, chars); + } +} + +static really_inline +svbool_t doubleMatched(svuint16_t chars, const u8 *buf, const u8 *buf_rot, + svbool_t pg, svbool_t pg_rot, svbool_t * const matched, + svbool_t * const matched_rot) { + svuint16_t vec = svreinterpret_u16(svld1_u8(pg, buf)); + svuint16_t vec_rot = svreinterpret_u16(svld1_u8(pg_rot, buf_rot)); + *matched = svmatch(pg, vec, chars); + *matched_rot = svmatch(pg_rot, vec_rot, chars); + return svorr_z(svptrue_b8(), *matched, *matched_rot); +} + +static really_inline +const u8 *vermSearchOnce(svuint8_t chars, const u8 *buf, const u8 *buf_end, + bool negate) { + DEBUG_PRINTF("start %p end %p\n", buf, buf_end); + assert(buf <= buf_end); + DEBUG_PRINTF("l = %td\n", buf_end - buf); + svbool_t pg = svwhilelt_b8_s64(0, buf_end - buf); + svbool_t matched = singleMatched(chars, buf, pg, negate, 0); + return accelSearchCheckMatched(buf, matched); +} + +static really_inline +const u8 *vermSearchLoopBody(svuint8_t chars, const u8 *buf, bool negate) { + DEBUG_PRINTF("start %p end %p\n", buf, buf + svcntb()); + svbool_t matched = singleMatched(chars, buf, svptrue_b8(), negate, 0); + return accelSearchCheckMatched(buf, matched); +} + +static really_inline +const u8 *vermSearchLoopBodyUnrolled(svuint8_t chars, const u8 *buf, + bool negate) { + DEBUG_PRINTF("start %p end %p\n", buf, buf + (2 * svcntb())); + svbool_t matched0 = singleMatched(chars, buf, svptrue_b8(), negate, 0); + svbool_t matched1 = singleMatched(chars, buf, svptrue_b8(), negate, 1); + svbool_t any = svorr_z(svptrue_b8(), matched0, matched1); + if (unlikely(svptest_any(svptrue_b8(), any))) { + if (svptest_any(svptrue_b8(), matched0)) { + return buf + accelSearchGetOffset(matched0); + } else { + return buf + svcntb() + accelSearchGetOffset(matched1); + } + } + return NULL; +} + +static really_inline +const u8 *rvermSearchOnce(svuint8_t chars, const u8 *buf, const u8 *buf_end, + bool negate) { + DEBUG_PRINTF("start %p end %p\n", buf, buf_end); + assert(buf <= buf_end); + DEBUG_PRINTF("l = %td\n", buf_end - buf); + svbool_t pg = svwhilelt_b8_s64(0, buf_end - buf); + svbool_t matched = singleMatched(chars, buf, pg, negate, 0); + return accelRevSearchCheckMatched(buf, matched); +} + +static really_inline +const u8 *rvermSearchLoopBody(svuint8_t chars, const u8 *buf, bool negate) { + DEBUG_PRINTF("start %p end %p\n", buf, buf + svcntb()); + svbool_t matched = singleMatched(chars, buf, svptrue_b8(), negate, 0); + return accelRevSearchCheckMatched(buf, matched); +} + +static really_inline +const u8 *dvermSearchOnce(svuint16_t chars, const u8 *buf, 
const u8 *buf_end) { + DEBUG_PRINTF("start %p end %p\n", buf, buf_end); + assert(buf < buf_end); + DEBUG_PRINTF("l = %td\n", buf_end - buf); + svbool_t pg = svwhilelt_b8_s64(0, buf_end - buf); + svbool_t pg_rot = svwhilele_b8_s64(0, buf_end - buf); + svbool_t matched, matched_rot; + // buf - 1 won't underflow as the first position in the buffer has been + // dealt with meaning that buf - 1 is within the buffer. + svbool_t any = doubleMatched(chars, buf, buf - 1, pg, pg_rot, + &matched, &matched_rot); + return dvermSearchCheckMatched(buf, matched, matched_rot, any); +} + +static really_inline +const u8 *dvermSearchLoopBody(svuint16_t chars, const u8 *buf) { + DEBUG_PRINTF("start %p end %p\n", buf, buf + svcntb()); + svbool_t matched, matched_rot; + // buf - 1 won't underflow as the first position in the buffer has been + // dealt with meaning that buf - 1 is within the buffer. + svbool_t any = doubleMatched(chars, buf, buf - 1, svptrue_b8(), + svptrue_b8(), &matched, &matched_rot); + return dvermSearchCheckMatched(buf, matched, matched_rot, any); +} + +static really_inline +const u8 *rdvermSearchOnce(svuint16_t chars, const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("start %p end %p\n", buf, buf_end); + assert(buf < buf_end); + + DEBUG_PRINTF("l = %td\n", buf_end - buf); + // buf_end can be read as the last position in the buffer has been + // dealt with meaning that buf_end is within the buffer. + // buf_end needs to be read by both the buf load and the buf + 1 load, + // this is because buf_end must be the upper 8 bits of the 16 bit element + // to be matched. + svbool_t pg = svwhilele_b8_s64(0, buf_end - buf); + svbool_t pg_rot = svwhilelt_b8_s64(0, buf_end - buf); + svbool_t matched, matched_rot; + svbool_t any = doubleMatched(chars, buf, buf + 1, pg, pg_rot, + &matched, &matched_rot); + return rdvermSearchCheckMatched(buf, matched, matched_rot, any); +} + +static really_inline +const u8 *rdvermSearchLoopBody(svuint16_t chars, const u8 *buf) { + DEBUG_PRINTF("start %p end %p\n", buf, buf + svcntb()); + svbool_t matched, matched_rot; + // buf + svcntb() can be read as the last position in the buffer has + // been dealt with meaning that buf + svcntb() is within the buffer. + svbool_t any = doubleMatched(chars, buf, buf + 1, svptrue_b8(), + svptrue_b8(), &matched, &matched_rot); + return rdvermSearchCheckMatched(buf, matched, matched_rot, any); +} + +static really_inline +const u8 *vermSearch(svuint8_t chars, const u8 *buf, const u8 *buf_end, + bool negate) { + assert(buf < buf_end); + size_t len = buf_end - buf; + if (len <= svcntb()) { + return vermSearchOnce(chars, buf, buf_end, negate); + } + // peel off first part to align to the vector size + const u8 *aligned_buf = ROUNDUP_PTR(buf, svcntb_pat(SV_POW2)); + assert(aligned_buf < buf_end); + if (buf != aligned_buf) { + const u8 *ptr = vermSearchLoopBody(chars, buf, negate); + if (ptr) return ptr; + } + buf = aligned_buf; + uint64_t unrolled_cntb = 2 * svcntb(); + size_t unrolled_loops = (buf_end - buf) / unrolled_cntb; + DEBUG_PRINTF("unrolled_loops %zu \n", unrolled_loops); + for (size_t i = 0; i < unrolled_loops; i++, buf += unrolled_cntb) { + const u8 *ptr = vermSearchLoopBodyUnrolled(chars, buf, negate); + if (ptr) return ptr; + } + size_t loops = (buf_end - buf) / svcntb(); + DEBUG_PRINTF("loops %zu \n", loops); + for (size_t i = 0; i < loops; i++, buf += svcntb()) { + const u8 *ptr = vermSearchLoopBody(chars, buf, negate); + if (ptr) return ptr; + } + DEBUG_PRINTF("buf %p buf_end %p \n", buf, buf_end); + return buf == buf_end ? 
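/* a remaining tail is covered by rescanning the last full vector; the overlap is harmless */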
NULL : vermSearchLoopBody(chars, buf_end - svcntb(), + negate); +} + +static really_inline +const u8 *rvermSearch(svuint8_t chars, const u8 *buf, const u8 *buf_end, + bool negate) { + assert(buf < buf_end); + size_t len = buf_end - buf; + if (len <= svcntb()) { + return rvermSearchOnce(chars, buf, buf_end, negate); + } + // peel off first part to align to the vector size + const u8 *aligned_buf_end = ROUNDDOWN_PTR(buf_end, svcntb_pat(SV_POW2)); + assert(buf < aligned_buf_end); + if (buf_end != aligned_buf_end) { + const u8 *ptr = rvermSearchLoopBody(chars, buf_end - svcntb(), negate); + if (ptr) return ptr; + } + buf_end = aligned_buf_end; + size_t loops = (buf_end - buf) / svcntb(); + DEBUG_PRINTF("loops %zu \n", loops); + for (size_t i = 0; i < loops; i++) { + buf_end -= svcntb(); + const u8 *ptr = rvermSearchLoopBody(chars, buf_end, negate); + if (ptr) return ptr; + } + DEBUG_PRINTF("buf %p buf_end %p \n", buf, buf_end); + return buf == buf_end ? NULL : rvermSearchLoopBody(chars, buf, negate); +} + +static really_inline +const u8 *dvermSearch(svuint8_t chars, const u8 *buf, const u8 *buf_end) { + size_t len = buf_end - buf; + if (len <= svcntb()) { + return dvermSearchOnce(svreinterpret_u16(chars), buf, buf_end); + } + // peel off first part to align to the vector size + const u8 *aligned_buf = ROUNDUP_PTR(buf, svcntb_pat(SV_POW2)); + assert(aligned_buf < buf_end); + if (buf != aligned_buf) { + const u8 *ptr = dvermSearchLoopBody(svreinterpret_u16(chars), buf); + if (ptr) return ptr; + } + buf = aligned_buf; + size_t loops = (buf_end - buf) / svcntb(); + DEBUG_PRINTF("loops %zu \n", loops); + for (size_t i = 0; i < loops; i++, buf += svcntb()) { + const u8 *ptr = dvermSearchLoopBody(svreinterpret_u16(chars), buf); + if (ptr) return ptr; + } + DEBUG_PRINTF("buf %p buf_end %p \n", buf, buf_end); + return buf == buf_end ? NULL : dvermSearchLoopBody(svreinterpret_u16(chars), buf_end - svcntb()); +} + +static really_inline +const u8 *rdvermSearch(char c1, char c2, bool nocase, const u8 *buf, + const u8 *buf_end) { + svuint16_t chars = getCharMaskDouble(c1, c2, nocase); + size_t len = buf_end - buf; + if (len <= svcntb()) { + return rdvermSearchOnce(chars, buf, buf_end); + } + // peel off first part to align to the vector size + const u8 *aligned_buf_end = ROUNDDOWN_PTR(buf_end, svcntb_pat(SV_POW2)); + assert(buf < aligned_buf_end); + if (buf_end != aligned_buf_end) { + const u8 *rv = rdvermSearchLoopBody(chars, buf_end - svcntb()); + if (rv) return rv; + } + buf_end = aligned_buf_end; + size_t loops = (buf_end - buf) / svcntb(); + DEBUG_PRINTF("loops %zu \n", loops); + for (size_t i = 0; i < loops; i++) { + buf_end -= svcntb(); + const u8 *rv = rdvermSearchLoopBody(chars, buf_end); + if (rv) return rv; + } + DEBUG_PRINTF("buf %p buf_end %p \n", buf, buf_end); + return buf == buf_end ? NULL : rdvermSearchLoopBody(chars, buf); +} + +static really_inline +const u8 *vermicelliExec(char c, bool nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("verm scan %s\\x%02hhx over %td bytes\n", + nocase ? "nocase " : "", c, buf_end - buf); + svuint8_t chars = getCharMaskSingle(c, nocase); + const u8 *ptr = vermSearch(chars, buf, buf_end, false); + return ptr ? ptr : buf_end; +} + +/* like vermicelliExec except returns the address of the first character which + * is not c */ +static really_inline +const u8 *nvermicelliExec(char c, bool nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("nverm scan %s\\x%02hhx over %td bytes\n", + nocase ? 
"nocase " : "", c, buf_end - buf); + svuint8_t chars = getCharMaskSingle(c, nocase); + const u8 *ptr = vermSearch(chars, buf, buf_end, true); + return ptr ? ptr : buf_end; +} + +// Reverse vermicelli scan. Provides exact semantics and returns (buf - 1) if +// character not found. +static really_inline +const u8 *rvermicelliExec(char c, bool nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("rev verm scan %s\\x%02hhx over %td bytes\n", + nocase ? "nocase " : "", c, buf_end - buf); + svuint8_t chars = getCharMaskSingle(c, nocase); + const u8 *ptr = rvermSearch(chars, buf, buf_end, false); + return ptr ? ptr : buf - 1; +} + +/* like rvermicelliExec except returns the address of the last character which + * is not c */ +static really_inline +const u8 *rnvermicelliExec(char c, bool nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("rev verm scan %s\\x%02hhx over %td bytes\n", + nocase ? "nocase " : "", c, buf_end - buf); + svuint8_t chars = getCharMaskSingle(c, nocase); + const u8 *ptr = rvermSearch(chars, buf, buf_end, true); + return ptr ? ptr : buf - 1; +} + +static really_inline +const u8 *vermicelliDoubleExec(char c1, char c2, bool nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("double verm scan %s\\x%02hhx%02hhx over %td bytes\n", + nocase ? "nocase " : "", c1, c2, buf_end - buf); + assert(buf < buf_end); + if (buf_end - buf > 1) { + ++buf; + svuint8_t chars = svreinterpret_u8(getCharMaskDouble(c1, c2, nocase)); + const u8 *ptr = dvermSearch(chars, buf, buf_end); + if (ptr) { + return ptr; + } + } + /* check for partial match at end */ + u8 mask = nocase ? CASE_CLEAR : 0xff; + if ((buf_end[-1] & mask) == (u8)c1) { + DEBUG_PRINTF("partial!!!\n"); + return buf_end - 1; + } + return buf_end; +} + +/* returns highest offset of c2 (NOTE: not c1) */ +static really_inline +const u8 *rvermicelliDoubleExec(char c1, char c2, bool nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("rev double verm scan %s\\x%02hhx%02hhx over %td bytes\n", + nocase ? "nocase " : "", c1, c2, buf_end - buf); + assert(buf < buf_end); + if (buf_end - buf > 1) { + --buf_end; + const u8 *ptr = rdvermSearch(c1, c2, nocase, buf, buf_end); + if (ptr) { + return ptr; + } + } + return buf - 1; +} + +static really_inline +svuint8_t getDupSVEMaskFrom128(m128 mask) { + return svld1rq_u8(svptrue_b8(), (const uint8_t *)&mask); +} + +static really_inline +const u8 *vermicelli16Exec(const m128 mask, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("verm16 scan over %td bytes\n", buf_end - buf); + svuint8_t chars = getDupSVEMaskFrom128(mask); + const u8 *ptr = vermSearch(chars, buf, buf_end, false); + return ptr ? ptr : buf_end; +} + +static really_inline +const u8 *nvermicelli16Exec(const m128 mask, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("nverm16 scan over %td bytes\n", buf_end - buf); + svuint8_t chars = getDupSVEMaskFrom128(mask); + const u8 *ptr = vermSearch(chars, buf, buf_end, true); + return ptr ? ptr : buf_end; +} + +static really_inline +const u8 *rvermicelli16Exec(const m128 mask, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("rverm16 scan over %td bytes\n", buf_end - buf); + svuint8_t chars = getDupSVEMaskFrom128(mask); + const u8 *ptr = rvermSearch(chars, buf, buf_end, false); + return ptr ? 
ptr : buf - 1; +} + +static really_inline +const u8 *rnvermicelli16Exec(const m128 mask, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("rnverm16 scan over %td bytes\n", buf_end - buf); + svuint8_t chars = getDupSVEMaskFrom128(mask); + const u8 *ptr = rvermSearch(chars, buf, buf_end, true); + return ptr ? ptr : buf - 1; +} + +static really_inline +bool vermicelliDouble16CheckPartial(const u64a first_chars, const u8 *buf_end) { + svuint8_t firsts = svreinterpret_u8(svdup_u64(first_chars)); + svbool_t matches = svcmpeq(svptrue_b8(), firsts, svdup_u8(buf_end[-1])); + return svptest_any(svptrue_b8(), matches); +} + +static really_inline +const u8 *vermicelliDouble16Exec(const m128 mask, const u64a firsts, + const u8 *buf, const u8 *buf_end) { + assert(buf < buf_end); + DEBUG_PRINTF("double verm16 scan over %td bytes\n", buf_end - buf); + if (buf_end - buf > 1) { + ++buf; + svuint8_t chars = svreinterpret_u8(getDupSVEMaskFrom128(mask)); + const u8 *ptr = dvermSearch(chars, buf, buf_end); + if (ptr) { + return ptr; + } + } + /* check for partial match at end */ + if (vermicelliDouble16CheckPartial(firsts, buf_end)) { + DEBUG_PRINTF("partial!!!\n"); + return buf_end - 1; + } + return buf_end; +} + +static really_inline +const u8 *vermicelliDoubleMasked16Exec(const m128 mask, char c1, char m1, + const u8 *buf, const u8 *buf_end) { + assert(buf < buf_end); + DEBUG_PRINTF("double verm16 masked scan over %td bytes\n", buf_end - buf); + if (buf_end - buf > 1) { + ++buf; + svuint8_t chars = getDupSVEMaskFrom128(mask); + const u8 *ptr = dvermSearch(chars, buf, buf_end); + if (ptr) { + return ptr; + } + } + /* check for partial match at end */ + if ((buf_end[-1] & m1) == (u8)c1) { + DEBUG_PRINTF("partial!!!\n"); + return buf_end - 1; + } + + return buf_end; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPreconditionMasked(m128 chars1, m128 chars2, + m128 mask1, m128 mask2, const u8 *buf) { + m128 data = loadu128(buf); // unaligned + m128 v1 = eq128(chars1, and128(data, mask1)); + m128 v2 = eq128(chars2, and128(data, mask2)); + u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1))); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + return NULL; +} + +static really_inline +const u8 *dvermSearchAlignedMasked(m128 chars1, m128 chars2, + m128 mask1, m128 mask2, u8 c1, u8 c2, u8 m1, + u8 m2, const u8 *buf, const u8 *buf_end) { + assert((size_t)buf % 16 == 0); + + for (; buf + 16 < buf_end; buf += 16) { + m128 data = load128(buf); + m128 v1 = eq128(chars1, and128(data, mask1)); + m128 v2 = eq128(chars2, and128(data, mask2)); + u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1))); + + if ((buf[15] & m1) == c1 && (buf[16] & m2) == c2) { + z |= (1 << 15); + } + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + } + + return NULL; +} + +static really_inline +const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2, + const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("double verm scan (\\x%02hhx&\\x%02hhx)(\\x%02hhx&\\x%02hhx) " + "over %zu bytes\n", c1, m1, c2, m2, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + m128 chars1 = set1_16x8(c1); + m128 chars2 = set1_16x8(c2); + m128 mask1 = set1_16x8(m1); + m128 mask2 = set1_16x8(m2); + + assert((buf_end - buf) >= 16); + uintptr_t min = (uintptr_t)buf % 16; + if (min) { + // Input isn't aligned, so we need to run one iteration with an + // unaligned load, then skip buf forward to the next aligned 
address. + // There's some small overlap here, but we don't mind scanning it twice + // if we can do it quickly, do we? + const u8 *p = dvermPreconditionMasked(chars1, chars2, mask1, mask2, buf); + if (p) { + return p; + } + + buf += 16 - min; + assert(buf < buf_end); + } + + // Aligned loops from here on in + const u8 *ptr = dvermSearchAlignedMasked(chars1, chars2, mask1, mask2, c1, + c2, m1, m2, buf, buf_end); + if (ptr) { + return ptr; + } + + // Tidy up the mess at the end + ptr = dvermPreconditionMasked(chars1, chars2, mask1, mask2, + buf_end - 16); + + if (ptr) { + return ptr; + } + + /* check for partial match at end */ + if ((buf_end[-1] & m1) == (u8)c1) { + DEBUG_PRINTF("partial!!!\n"); + return buf_end - 1; + } + + return buf_end; +} diff --git a/src/nfa/vermicellicompile.cpp b/src/nfa/vermicellicompile.cpp new file mode 100644 index 000000000..d72ecece2 --- /dev/null +++ b/src/nfa/vermicellicompile.cpp @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2021, Arm Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Vermicelli acceleration: compile code. 
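The runtime entry points above share a fixed "not found" convention: forward scans return buf_end, reverse scans return buf - 1, and the double-character variants additionally report a trailing partial match at buf_end - 1. A scalar model of the forward contract follows; it is purely illustrative and is not the SVE implementation, and the helper name is made up.

```cpp
#include <cstdint>

// Scalar model of the forward vermicelli contract: return the first position
// holding byte c, or buf_end when no byte of [buf, buf_end) matches.
static const uint8_t *scalarVerm(uint8_t c, const uint8_t *buf,
                                 const uint8_t *buf_end) {
    for (const uint8_t *p = buf; p < buf_end; ++p) {
        if (*p == c) {
            return p;
        }
    }
    return buf_end;
}
```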
+ */ +#include "vermicellicompile.h" +#include "util/charreach.h" + +#include + +namespace ue2 { + +bool vermicelli16Build(const CharReach &chars, u8 *rv) { + size_t i = chars.find_first(); + u8 arr[16]; + std::memset(arr, i, sizeof(arr)); + size_t count = 1; + for (i = chars.find_next(i); i != CharReach::npos; i = chars.find_next(i)) { + if (count == sizeof(arr)) return false; + arr[count] = i; + ++count; + } + std::memcpy(rv, arr, sizeof(arr)); + return true; +} + +bool vermicelliDouble16Build(const flat_set> &twochar, + u8 *chars, u8 *firsts) { + constexpr size_t count_limit = 8; + if (twochar.size() > count_limit) return false; + size_t count = 0; + for (const auto &p : twochar) { + firsts[count] = p.first; + chars[2 * count] = p.first; + chars[(2 * count) + 1] = p.second; + ++count; + } + for(; count < count_limit; ++count) { + firsts[count] = chars[0]; + chars[2 * count] = chars[0]; + chars[(2 * count) + 1] = chars[1]; + } + return true; +} + +static really_inline +void fillMask(u8 matches[], size_t len, u8 *rv) { + for (size_t i = 0; i < 16; ++i) { + rv[i] = matches[i % len]; + } +} + +static really_inline +void getTwoCases(u8 cases[2], u8 bit, char c) { + const u8 set = 1UL << bit; + cases[0] = c & (~set); + cases[1] = c | set; +} + +static really_inline +void getFourCases(u8 cases[4], u8 bit, char case1, char case2) { + const u8 set = 1UL << bit; + cases[0] = case1 & (~set); + cases[1] = case1 | set; + cases[2] = case2 & (~set); + cases[3] = case2 | set; +} + +static really_inline +void getEightCases(u8 cases[8], u8 bit, char case1, char case2, + char case3, char case4) { + const u8 set = 1UL << bit; + cases[0] = case1 & (~set); + cases[1] = case1 | set; + cases[2] = case2 & (~set); + cases[3] = case2 | set; + cases[4] = case3 & (~set); + cases[5] = case3 | set; + cases[6] = case4 & (~set); + cases[7] = case4 | set; +} + +static really_inline +bool getDoubleMatchesForBits(u8 c1, u8 c2, u8 holes[3], u8 c1_holes, + u8 c2_holes, u8 *rv) { + u8 cases[8]; + switch (c1_holes) { + case 0: + switch (c2_holes) { + case 0: { + u8 matches[2] = { c1, c2 }; + fillMask(matches, 2, rv); + return true; + } + case 1: { + getTwoCases(cases, holes[0], c2); + u8 matches[4] = { c1, cases[0], c1, cases[1] }; + fillMask(matches, 4, rv); + return true; + } + case 2: { + getTwoCases(cases, holes[0], c2); + getFourCases(&cases[2], holes[1], cases[0], cases[1]); + u8 matches[8] = { c1, cases[2], c1, cases[3], + c1, cases[4], c1, cases[5] }; + fillMask(matches, 8, rv); + return true; + } + case 3: { + getTwoCases(cases, holes[0], c2); + getFourCases(&cases[4], holes[1], cases[0], cases[1]); + getEightCases(cases, holes[2], cases[4], cases[5], + cases[6], cases[7]); + u8 matches[16] = { c1, cases[0], c1, cases[1], + c1, cases[2], c1, cases[3], + c1, cases[4], c1, cases[5], + c1, cases[6], c1, cases[7] }; + memcpy(rv, matches, sizeof(matches)); + return true; + } + default: + assert(c2_holes < 4); + break; + } + break; + case 1: + getTwoCases(cases, holes[0], c1); + switch (c2_holes) { + case 0: { + u8 matches[4] = { cases[0] , c2, cases[1], c2 }; + fillMask(matches, 4, rv); + return true; + } + case 1: { + getTwoCases(&cases[2], holes[1], c2); + u8 matches[8] = { cases[0], cases[2], + cases[0], cases[3], + cases[1], cases[2], + cases[1], cases[3] }; + fillMask(matches, 8, rv); + return true; + } + case 2: { + getTwoCases(&cases[2], holes[1], c2); + getFourCases(&cases[4], holes[2], cases[2], cases[3]); + u8 matches[16] = { cases[0], cases[4], cases[0], cases[5], + cases[0], cases[6], cases[0], cases[7], + 
cases[1], cases[4], cases[1], cases[5], + cases[1], cases[6], cases[1], cases[7] }; + memcpy(rv, matches, sizeof(matches)); + return true; + } + default: + assert(c2_holes < 3); + break; + } + break; + case 2: + getTwoCases(cases, holes[0], c1); + getFourCases(&cases[2], holes[1], cases[0], cases[1]); + switch (c2_holes) { + case 0: { + u8 matches[8] = { cases[2], c2, cases[3], c2, + cases[4], c2, cases[5], c2 }; + fillMask(matches, 8, rv); + return true; + } + case 1: { + getTwoCases(&cases[6], holes[2], c2); + u8 matches[16] = { cases[2], cases[6], cases[3], cases[6], + cases[4], cases[6], cases[5], cases[6], + cases[2], cases[7], cases[3], cases[7], + cases[4], cases[7], cases[5], cases[7] }; + memcpy(rv, matches, sizeof(matches)); + return true; + } + default: + assert(c2_holes < 2); + break; + } + break; + case 3: { + assert(!c2_holes); + getTwoCases(cases, holes[0], c1); + getFourCases(&cases[4], holes[1], cases[0], cases[1]); + getEightCases(cases, holes[2], cases[4], cases[5], + cases[6], cases[7]); + u8 matches[16] = { cases[0], c2, cases[1], c2, + cases[2], c2, cases[3], c2, + cases[4], c2, cases[5], c2, + cases[6], c2, cases[7], c2 }; + memcpy(rv, matches, sizeof(matches)); + return true; + } + } + return false; +} + +static really_inline +bool getDoubleMatchesForMask(char c1, char c2, char m1, char m2, + u8 c1_holes, u8 c2_holes, u8 *rv) { + u8 holes[3] = { 0 }; + int count = 0; + if (c1_holes) { + for (int i = 0; i < 8; ++i) { + if (!(m1 & (1UL << i))) { + holes[count++] = i; + } + } + } + if (c2_holes) { + for (int i = 0; i < 8; ++i) { + if (!(m2 & (1UL << i))) { + holes[count++] = i; + } + } + } + return getDoubleMatchesForBits(c1, c2, holes, c1_holes, c2_holes, rv); +} + +bool vermicelliDoubleMasked16Build(char c1, char c2, char m1, char m2, u8 *rv) { + u8 c1_holes = 8 - __builtin_popcount(m1); + u8 c2_holes = 8 - __builtin_popcount(m2); + if (c1_holes + c2_holes > 3) { + return false; + } + return getDoubleMatchesForMask(c1, c2, m1, m2, c1_holes, c2_holes, rv); +} + +} // namespace ue2 diff --git a/src/util/simd_utils.c b/src/nfa/vermicellicompile.h similarity index 59% rename from src/util/simd_utils.c rename to src/nfa/vermicellicompile.h index 25a81412e..0075273c9 100644 --- a/src/util/simd_utils.c +++ b/src/nfa/vermicellicompile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -27,36 +27,27 @@ */ /** \file - * \brief Lookup tables to support SIMD operations. + * \brief Vermicelli acceleration: compile code. */ -#include "simd_utils.h" - -ALIGN_CL_DIRECTIVE const char vbs_mask_data[] = { - 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, - 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, - - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, - 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, - - 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, - 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, -}; - -#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 -#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 -#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 - -/** \brief LUT for the mask1bit functions. 
*/ -ALIGN_CL_DIRECTIVE const u8 simd_onebit_masks[] = { - ZEROES_32, ZEROES_32, - ZEROES_31, 0x01, ZEROES_32, - ZEROES_31, 0x02, ZEROES_32, - ZEROES_31, 0x04, ZEROES_32, - ZEROES_31, 0x08, ZEROES_32, - ZEROES_31, 0x10, ZEROES_32, - ZEROES_31, 0x20, ZEROES_32, - ZEROES_31, 0x40, ZEROES_32, - ZEROES_31, 0x80, ZEROES_32, - ZEROES_32, ZEROES_32, -}; +#ifndef VERM_COMPILE_H +#define VERM_COMPILE_H + +#include "ue2common.h" +#include "util/charreach.h" +#include "util/flat_containers.h" + +#include + +namespace ue2 { + +bool vermicelli16Build(const CharReach &chars, u8 *rv); + +bool vermicelliDouble16Build(const flat_set> &twochar, + u8 *chars, u8 *firsts); + +bool vermicelliDoubleMasked16Build(char c1, char c2, char m1, char m2, u8 *rv); + +} // namespace ue2 + +#endif // VERM_COMPILE_H diff --git a/src/nfa/x86/shufti.hpp b/src/nfa/x86/shufti.hpp new file mode 100644 index 000000000..6fb34b2f2 --- /dev/null +++ b/src/nfa/x86/shufti.hpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Shufti: character class acceleration. 
+ */ + +template +static really_inline +const SuperVector blockSingleMask(SuperVector mask_lo, SuperVector mask_hi, SuperVector chars) { + const SuperVector low4bits = SuperVector::dup_u8(0xf); + + SuperVector c_lo = chars & low4bits; + SuperVector c_hi = chars.template vshr_64_imm<4>() & low4bits; + c_lo = mask_lo.pshufb(c_lo); + c_hi = mask_hi.pshufb(c_hi); + + return (c_lo & c_hi).eq(SuperVector::Zeroes()); +} + +template +static really_inline +SuperVector blockDoubleMask(SuperVector mask1_lo, SuperVector mask1_hi, SuperVector mask2_lo, SuperVector mask2_hi, SuperVector chars) { + + const SuperVector low4bits = SuperVector::dup_u8(0xf); + SuperVector chars_lo = chars & low4bits; + chars_lo.print8("chars_lo"); + SuperVector chars_hi = low4bits.opandnot(chars).template vshr_64_imm<4>(); + chars_hi.print8("chars_hi"); + SuperVector c1_lo = mask1_lo.pshufb(chars_lo); + c1_lo.print8("c1_lo"); + SuperVector c1_hi = mask1_hi.pshufb(chars_hi); + c1_hi.print8("c1_hi"); + SuperVector c1 = c1_lo | c1_hi; + c1.print8("c1"); + + SuperVector c2_lo = mask2_lo.pshufb(chars_lo); + c2_lo.print8("c2_lo"); + SuperVector c2_hi = mask2_hi.pshufb(chars_hi); + c2_hi.print8("c2_hi"); + SuperVector c2 = c2_lo | c2_hi; + c2.print8("c2"); + c2.template vshr_128_imm<1>().print8("c2.vshr_128(1)"); + SuperVector c = c1 | (c2.template vshr_128_imm<1>()); + c.print8("c"); + + return c.eq(SuperVector::Ones()); +} diff --git a/src/nfa/x86/truffle.hpp b/src/nfa/x86/truffle.hpp new file mode 100644 index 000000000..7dc711f4e --- /dev/null +++ b/src/nfa/x86/truffle.hpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Truffle: character class acceleration. 
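The shufti block above decides class membership per byte from two 16-entry nibble tables: the low nibble indexes mask_lo, the high nibble indexes mask_hi, and the byte is in the class only when the two looked-up values share a set bit. A scalar model of that test, independent of the SuperVector types and with an illustrative name, is sketched below; the vector routine performs the same lookup for a whole block with two PSHUFB-style shuffles and reports the lanes where the AND is zero, i.e. the bytes outside the class.

```cpp
#include <cstdint>

// Scalar model of the shufti nibble lookup: a byte b belongs to the class iff
// mask_lo[b & 0xf] and mask_hi[b >> 4] share a set bit.
static bool shuftiScalarHit(const uint8_t mask_lo[16],
                            const uint8_t mask_hi[16], uint8_t b) {
    return (mask_lo[b & 0xf] & mask_hi[(b >> 4) & 0xf]) != 0;
}
```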
+ * + */ + +template +static really_inline +const SuperVector blockSingleMask(SuperVector shuf_mask_lo_highclear, SuperVector shuf_mask_lo_highset, SuperVector chars) { + + chars.print8("chars"); + shuf_mask_lo_highclear.print8("shuf_mask_lo_highclear"); + shuf_mask_lo_highset.print8("shuf_mask_lo_highset"); + + SuperVector highconst = SuperVector::dup_u8(0x80); + highconst.print8("highconst"); + SuperVector shuf_mask_hi = SuperVector::dup_u64(0x8040201008040201); + shuf_mask_hi.print8("shuf_mask_hi"); + + SuperVector shuf1 = shuf_mask_lo_highclear.pshufb(chars); + shuf1.print8("shuf1"); + SuperVector t1 = chars ^ highconst; + t1.print8("t1"); + SuperVector shuf2 = shuf_mask_lo_highset.pshufb(t1); + shuf2.print8("shuf2"); + SuperVector t2 = highconst.opandnot(chars.template vshr_64_imm<4>()); + t2.print8("t2"); + SuperVector shuf3 = shuf_mask_hi.pshufb(t2); + shuf3.print8("shuf3"); + SuperVector res = (shuf1 | shuf2) & shuf3; + res.print8("(shuf1 | shuf2) & shuf3"); + + return res.eq(SuperVector::Zeroes()); +} diff --git a/src/nfa/x86/vermicelli.hpp b/src/nfa/x86/vermicelli.hpp new file mode 100644 index 000000000..2f219f319 --- /dev/null +++ b/src/nfa/x86/vermicelli.hpp @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Vermicelli: single-byte and double-byte acceleration. 
+ */ + +template +static really_inline +const u8 *vermicelliBlock(SuperVector const data, SuperVector const chars, SuperVector const casemask, u8 const *buf, u16 const len) { + + SuperVector mask = chars.eq(casemask & data); + return first_non_zero_match(buf, mask, len); +} + +template +static really_inline +const u8 *vermicelliBlockNeg(SuperVector const data, SuperVector const chars, SuperVector const casemask, u8 const *buf, u16 const len) { + + SuperVector mask = chars.eq(casemask & data); + return first_zero_match_inverted(buf, mask, len); +} + +template +static really_inline +const u8 *rvermicelliBlock(SuperVector const data, SuperVector const chars, SuperVector const casemask, u8 const *buf, u16 const len) { + + SuperVector mask = chars.eq(casemask & data); + return last_non_zero_match(buf, mask, len); +} + +template +static really_inline +const u8 *rvermicelliBlockNeg(SuperVector const data, SuperVector const chars, SuperVector const casemask, const u8 *buf, u16 const len) { + + data.print8("data"); + chars.print8("chars"); + casemask.print8("casemask"); + SuperVector mask = chars.eq(casemask & data); + mask.print8("mask"); + return last_zero_match_inverted(buf, mask, len); +} + +template +static really_inline +const u8 *vermicelliDoubleBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, SuperVector const casemask, + u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) { + + SuperVector v = casemask & data; + SuperVector mask1 = chars1.eq(v); + SuperVector mask2 = chars2.eq(v); + SuperVector mask = mask1 & (mask2 >> 1); + + DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]); + bool partial_match = (check_partial && ((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1)); + DEBUG_PRINTF("partial = %d\n", partial_match); + if (partial_match) { + mask = mask | ((SuperVector::Ones() >> (S-1)) << (S-1)); + } + + return first_non_zero_match(buf, mask, len); +} + +template +static really_inline +const u8 *rvermicelliDoubleBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, SuperVector const casemask, + u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) { + + SuperVector v = casemask & data; + SuperVector mask1 = chars1.eq(v); + SuperVector mask2 = chars2.eq(v); + SuperVector mask = (mask1 << 1)& mask2; + + DEBUG_PRINTF("buf[0] = %02hhx, buf[-1] = %02hhx\n", buf[0], buf[-1]); + bool partial_match = (check_partial && ((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1)); + DEBUG_PRINTF("partial = %d\n", partial_match); + if (partial_match) { + mask = mask | (SuperVector::Ones() >> (S-1)); + } + + return last_non_zero_match(buf, mask, len); +} + +template +static really_inline +const u8 *vermicelliDoubleMaskedBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, + SuperVector const mask1, SuperVector const mask2, + u8 const c1, u8 const c2, u8 const m1, u8 const m2, u8 const *buf, u16 const len) { + + SuperVector v1 = chars1.eq(data & mask1); + SuperVector v2 = chars2.eq(data & mask2); + SuperVector mask = v1 & (v2 >> 1); + + DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]); + bool partial_match = (check_partial && ((buf[0] & m2) == c2) && ((buf[-1] & m1) == c1)); + DEBUG_PRINTF("partial = %d\n", partial_match); + if (partial_match) { + mask = mask | ((SuperVector::Ones() >> (S-1)) << (S-1)); + } + + return first_non_zero_match(buf, mask, len); +} + diff --git a/src/nfagraph/ng.cpp b/src/nfagraph/ng.cpp index 
8dccf9863..b2a875236 100644 --- a/src/nfagraph/ng.cpp +++ b/src/nfagraph/ng.cpp @@ -71,7 +71,6 @@ #include "util/container.h" #include "util/depth.h" #include "util/graph_range.h" -#include "util/make_unique.h" #include "util/ue2string.h" using namespace std; diff --git a/src/nfagraph/ng_asserts.cpp b/src/nfagraph/ng_asserts.cpp index 8812afadb..764ebed1b 100644 --- a/src/nfagraph/ng_asserts.cpp +++ b/src/nfagraph/ng_asserts.cpp @@ -94,7 +94,7 @@ vector getAsserts(const NGHolder &g) { vector out; for (const auto &e : edges_range(g)) { if (g[e].assert_flags) { - out.push_back(e); + out.emplace_back(e); } } return out; diff --git a/src/nfagraph/ng_builder.cpp b/src/nfagraph/ng_builder.cpp index 60f667f49..e867bbde6 100644 --- a/src/nfagraph/ng_builder.cpp +++ b/src/nfagraph/ng_builder.cpp @@ -39,7 +39,6 @@ #include "ue2common.h" #include "compiler/compiler.h" // for ParsedExpression #include "util/compile_error.h" -#include "util/make_unique.h" #include @@ -114,7 +113,7 @@ class NFABuilderImpl : public NFABuilder { NFABuilderImpl::NFABuilderImpl(ReportManager &rm_in, const Grey &grey_in, const ParsedExpression &parsed) - : rm(rm_in), grey(grey_in), graph(ue2::make_unique()), + : rm(rm_in), grey(grey_in), graph(std::make_unique()), expr(parsed.expr), vertIdx(N_SPECIALS) { // Reserve space for a reasonably-sized NFA @@ -163,7 +162,7 @@ BuiltExpression NFABuilderImpl::getGraph() { throw CompileError("Pattern too large."); } - return { expr, move(graph) }; + return { expr, std::move(graph) }; } void NFABuilderImpl::setNodeReportID(Position pos, int offsetAdjust) { @@ -270,7 +269,7 @@ void NFABuilderImpl::cloneRegion(Position first, Position last, unsigned posOffs unique_ptr makeNFABuilder(ReportManager &rm, const CompileContext &cc, const ParsedExpression &expr) { - return ue2::make_unique(rm, cc.grey, expr); + return std::make_unique(rm, cc.grey, expr); } NFABuilder::~NFABuilder() { } diff --git a/src/nfagraph/ng_calc_components.cpp b/src/nfagraph/ng_calc_components.cpp index 3e9454eee..c5e93cc0b 100644 --- a/src/nfagraph/ng_calc_components.cpp +++ b/src/nfagraph/ng_calc_components.cpp @@ -58,7 +58,6 @@ #include "ue2common.h" #include "util/graph_range.h" #include "util/graph_undirected.h" -#include "util/make_unique.h" #include #include @@ -213,7 +212,7 @@ vector findShellEdges(const NGHolder &g, (is_special(v, g) || contains(tail_shell, v))) { DEBUG_PRINTF("edge (%zu,%zu) is a shell edge\n", g[u].index, g[v].index); - shell_edges.push_back(e); + shell_edges.emplace_back(e); } } @@ -291,7 +290,7 @@ void splitIntoComponents(unique_ptr g, if (head_shell.size() + tail_shell.size() + N_SPECIALS >= num_vertices(*g)) { DEBUG_PRINTF("all in shell component\n"); - comps.push_back(std::move(g)); + comps.emplace_back(std::move(g)); *shell_comp = true; return; } @@ -306,7 +305,7 @@ void splitIntoComponents(unique_ptr g, // into the tail shell, we aren't going to find more than one component. 
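The nfagraph hunks above and below apply two mechanical modernisations: ue2::make_unique gives way to std::make_unique, and push_back to emplace_back, with move-only payloads such as the component holders still moved in explicitly. A minimal self-contained sketch of the resulting idiom, using a stand-in type in place of NGHolder:

```cpp
#include <deque>
#include <memory>
#include <utility>

struct HolderStub {};  // stand-in for NGHolder, for illustration only

int main() {
    std::deque<std::unique_ptr<HolderStub>> comps;
    auto g = std::make_unique<HolderStub>();
    comps.emplace_back(std::move(g));  // unique_ptr is move-only, so std::move stays
    return comps.empty() ? 1 : 0;
}
```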
if (shell_edges.empty() && shellHasOnePath(*g, head_shell, tail_shell)) { DEBUG_PRINTF("single component\n"); - comps.push_back(std::move(g)); + comps.emplace_back(std::move(g)); return; } @@ -329,7 +328,7 @@ void splitIntoComponents(unique_ptr g, assert(num > 0); if (num == 1 && shell_edges.empty()) { DEBUG_PRINTF("single component\n"); - comps.push_back(std::move(g)); + comps.emplace_back(std::move(g)); return; } @@ -341,7 +340,7 @@ void splitIntoComponents(unique_ptr g, for (const auto &m : split_components) { NFAVertex v = m.first; u32 c = m.second; - verts[c].push_back(v); + verts[c].emplace_back(v); DEBUG_PRINTF("vertex %zu is in comp %u\n", (*g)[v].index, c); } @@ -355,7 +354,7 @@ void splitIntoComponents(unique_ptr g, * no deterministic ordering (split_components map). */ sort(begin(vv), end(vv)); - auto gc = ue2::make_unique(); + auto gc = std::make_unique(); v_map.clear(); fillHolder(gc.get(), *g, vv, &v_map); @@ -370,7 +369,7 @@ void splitIntoComponents(unique_ptr g, pruneUseless(*gc); DEBUG_PRINTF("component %zu has %zu vertices\n", comps.size(), num_vertices(*gc)); - comps.push_back(move(gc)); + comps.emplace_back(std::move(gc)); } // Another component to handle the direct shell-to-shell edges. @@ -379,14 +378,14 @@ void splitIntoComponents(unique_ptr g, vv.insert(vv.end(), begin(head_shell), end(head_shell)); vv.insert(vv.end(), begin(tail_shell), end(tail_shell)); - auto gc = ue2::make_unique(); + auto gc = std::make_unique(); v_map.clear(); fillHolder(gc.get(), *g, vv, &v_map); pruneUseless(*gc); DEBUG_PRINTF("shell edge component %zu has %zu vertices\n", comps.size(), num_vertices(*gc)); - comps.push_back(move(gc)); + comps.emplace_back(std::move(gc)); *shell_comp = true; } @@ -410,7 +409,7 @@ deque> calcComponents(unique_ptr g, // For trivial cases, we needn't bother running the full // connected_components algorithm. 
if (!grey.calcComponents || isAlternationOfClasses(*g)) { - comps.push_back(std::move(g)); + comps.emplace_back(std::move(g)); return comps; } @@ -444,7 +443,7 @@ void recalcComponents(deque> &comps, const Grey &grey) { } if (isAlternationOfClasses(*gc)) { - out.push_back(std::move(gc)); + out.emplace_back(std::move(gc)); continue; } diff --git a/src/nfagraph/ng_edge_redundancy.cpp b/src/nfagraph/ng_edge_redundancy.cpp index b8354bd42..d6e9895b7 100644 --- a/src/nfagraph/ng_edge_redundancy.cpp +++ b/src/nfagraph/ng_edge_redundancy.cpp @@ -493,7 +493,7 @@ bool removeSiblingsOfStartDotStar(NGHolder &g) { continue; } DEBUG_PRINTF("removing %zu->%zu\n", g[u].index, g[v].index); - dead.push_back(e); + dead.emplace_back(e); } } @@ -520,7 +520,7 @@ bool optimiseVirtualStarts(NGHolder &g) { for (const auto &e : in_edges_range(v, g)) { if (!is_any_start(source(e, g), g)) { - dead.push_back(e); + dead.emplace_back(e); } } } diff --git a/src/nfagraph/ng_equivalence.cpp b/src/nfagraph/ng_equivalence.cpp index a42a0ac71..7bfe3c933 100644 --- a/src/nfagraph/ng_equivalence.cpp +++ b/src/nfagraph/ng_equivalence.cpp @@ -39,7 +39,6 @@ #include "util/compile_context.h" #include "util/flat_containers.h" #include "util/graph_range.h" -#include "util/make_unique.h" #include "util/unordered.h" #include @@ -148,7 +147,7 @@ class WorkQueue { // unique push void push(unsigned id) { if (ids.insert(id).second) { - q.push_back(id); + q.emplace_back(id); } } @@ -269,7 +268,7 @@ vector> getVertexInfos(const NGHolder &g) { vertex_map.resize(num_verts); for (auto v : vertices_range(g)) { - infos.push_back(make_unique(v, g)); + infos.emplace_back(std::make_unique(v, g)); vertex_map[g[v].index] = infos.back().get(); } @@ -350,7 +349,7 @@ vector partitionGraph(vector> &infos, unsigned eq_class = classes.size(); vi->equivalence_class = eq_class; classes.push_back({vi.get()}); - classinfomap.emplace(move(ci), eq_class); + classinfomap.emplace(std::move(ci), eq_class); } else { // vertex is added to an existing class. 
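The equivalence pass's WorkQueue above implements a "unique push": the id is first inserted into a set and only appended to the pending queue when that insertion actually took place, so a class is never queued twice. A small self-contained sketch of the same idiom, with std::unordered_set standing in for the queue's real id container:

```cpp
#include <deque>
#include <unordered_set>

// Sketch of the unique-push idiom: insert() reports whether the id was new,
// and only a new id is appended to the pending queue.
struct UniqueWorkQueue {
    std::deque<unsigned> q;
    std::unordered_set<unsigned> ids;

    void push(unsigned id) {
        if (ids.insert(id).second) {
            q.emplace_back(id);
        }
    }
};
```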
unsigned eq_class = ii->second; @@ -442,7 +441,7 @@ void equivalence(vector &classes, WorkQueue &work_queue, classes[cur_class].erase(vi); new_class_vertices.insert(vi); } - classes.push_back(move(new_class_vertices)); + classes.emplace_back(std::move(new_class_vertices)); if (contains(tmi->first, cur_class)) { reval_queue.push(new_class); @@ -516,7 +515,7 @@ void mergeClass(vector> &infos, NGHolder &g, g[new_v].reports.clear(); /* populated as we pull in succs */ // store this vertex in our global vertex list - infos.push_back(make_unique(new_v, g)); + infos.emplace_back(std::make_unique(new_v, g)); VertexInfo *new_vertex_info = infos.back().get(); NFAVertex new_v_eod = NGHolder::null_vertex(); @@ -525,7 +524,7 @@ void mergeClass(vector> &infos, NGHolder &g, if (require_separate_eod_vertex(cur_class_vertices, g)) { new_v_eod = clone_vertex(g, old_v); g[new_v_eod].reports.clear(); - infos.push_back(make_unique(new_v_eod, g)); + infos.emplace_back(std::make_unique(new_v_eod, g)); new_vertex_info_eod = infos.back().get(); } diff --git a/src/nfagraph/ng_expr_info.cpp b/src/nfagraph/ng_expr_info.cpp index f8abbd04a..4d4678336 100644 --- a/src/nfagraph/ng_expr_info.cpp +++ b/src/nfagraph/ng_expr_info.cpp @@ -68,7 +68,7 @@ void removeLeadingVirtualVerticesFromRoot(NGHolder &g, NFAVertex root) { for (auto v : adjacent_vertices_range(root, g)) { if (g[v].assert_flags & POS_FLAG_VIRTUAL_START) { DEBUG_PRINTF("(?m)^ vertex or leading \\[bB] vertex\n"); - victims.push_back(v); + victims.emplace_back(v); } } diff --git a/src/nfagraph/ng_extparam.cpp b/src/nfagraph/ng_extparam.cpp index 6eb23113f..65e30a140 100644 --- a/src/nfagraph/ng_extparam.cpp +++ b/src/nfagraph/ng_extparam.cpp @@ -353,7 +353,7 @@ bool anchorPatternWithBoundedRepeat(NGHolder &g, ReportManager &rm) { if (v == g.startDs) { continue; } - initials.push_back(v); + initials.emplace_back(v); } if (initials.empty()) { DEBUG_PRINTF("no initial vertices\n"); @@ -576,13 +576,13 @@ bool transformMinLengthToRepeat(NGHolder &g, ReportManager &rm) { if (u == cyclic) { continue; } - preds.push_back(u); + preds.emplace_back(u); // We want to delete the out-edges of each predecessor, but need to // make sure we don't delete the startDs self loop. for (const auto &e : out_edges_range(u, g)) { if (target(e, g) != g.startDs) { - dead.push_back(e); + dead.emplace_back(e); } } } @@ -601,7 +601,7 @@ bool transformMinLengthToRepeat(NGHolder &g, ReportManager &rm) { add_edge(u, v, g); } preds.clear(); - preds.push_back(v); + preds.emplace_back(v); } assert(!preds.empty()); for (auto u : preds) { @@ -732,7 +732,7 @@ void pruneExtUnreachable(NGHolder &g, const ReportManager &rm) { for (const auto &e : edges_range(g)) { if (isEdgePrunable(g, report, depths, e)) { DEBUG_PRINTF("pruning\n"); - dead.push_back(e); + dead.emplace_back(e); } } @@ -775,14 +775,14 @@ void pruneVacuousEdges(NGHolder &g, const ReportManager &rm) { // a min_offset. if (u == g.start && is_any_accept(v, g) && has_min_offset(u)) { DEBUG_PRINTF("vacuous edge in graph with min_offset!\n"); - dead.push_back(e); + dead.emplace_back(e); continue; } // If a min_length is set, vacuous edges can be removed. 
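pruneVacuousEdges and the surrounding extparam passes all follow the same collect-then-remove pattern: candidate edges are only recorded in a dead vector while the graph is being walked, and removal happens after iteration finishes, so the traversal never observes a graph that is being mutated underneath it. A reduced sketch of the pattern over a plain edge list; the vacuous-edge test here is a placeholder.

```cpp
#include <algorithm>
#include <utility>
#include <vector>

using Edge = std::pair<int, int>;  // (source, target) stand-in for an NFA edge

// Collect candidates during a read-only pass, erase them afterwards.
void pruneDeadEdges(std::vector<Edge> &edges) {
    std::vector<Edge> dead;
    for (const Edge &e : edges) {
        if (e.first == 0) {          // placeholder for the "vacuous edge" test
            dead.emplace_back(e);
        }
    }
    for (const Edge &e : dead) {
        edges.erase(std::remove(edges.begin(), edges.end(), e), edges.end());
    }
}
```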
if (is_any_start(u, g) && is_any_accept(v, g) && has_min_length(u)) { DEBUG_PRINTF("vacuous edge in graph with min_length!\n"); - dead.push_back(e); + dead.emplace_back(e); continue; } } @@ -825,14 +825,14 @@ void pruneUnmatchable(NGHolder &g, const vector &depths, if (d.max.is_finite() && d.max < report.minLength) { DEBUG_PRINTF("prune, max match length %s < min_length=%llu\n", d.max.str().c_str(), report.minLength); - dead.push_back(e); + dead.emplace_back(e); continue; } if (report.maxOffset != MAX_OFFSET && d.min > report.maxOffset) { DEBUG_PRINTF("prune, min match length %s > max_offset=%llu\n", d.min.str().c_str(), report.maxOffset); - dead.push_back(e); + dead.emplace_back(e); continue; } } diff --git a/src/nfagraph/ng_fixed_width.cpp b/src/nfagraph/ng_fixed_width.cpp index 8fb264d8a..f901a534d 100644 --- a/src/nfagraph/ng_fixed_width.cpp +++ b/src/nfagraph/ng_fixed_width.cpp @@ -88,7 +88,7 @@ bool findMask(const NGHolder &g, vector *mask, bool *anchored, return true; } - mask->push_back(g[v].char_reach); + mask->emplace_back(g[v].char_reach); if (out_degree(v, g) != 1) { DEBUG_PRINTF("out_degree != 1\n"); diff --git a/src/nfagraph/ng_haig.cpp b/src/nfagraph/ng_haig.cpp index 805454477..bf951a0b0 100644 --- a/src/nfagraph/ng_haig.cpp +++ b/src/nfagraph/ng_haig.cpp @@ -44,7 +44,6 @@ #include "util/graph.h" #include "util/graph_range.h" #include "util/hash_dynamic_bitset.h" -#include "util/make_unique.h" #include "util/unordered.h" #include @@ -194,7 +193,7 @@ class Automaton_Base { const vector initial() { vector rv = {init}; if (start_floating != DEAD_STATE && start_floating != start_anchored) { - rv.push_back(initDS); + rv.emplace_back(initDS); } return rv; } @@ -354,7 +353,7 @@ class Automaton_Haig_Merge { if (t.any() && t != esets[i]) { esets[i] &= ~t; - esets.push_back(t); + esets.emplace_back(t); } } } @@ -380,7 +379,7 @@ class Automaton_Haig_Merge { const vector initial() { vector rv(1, as); if (start_floating != DEAD_STATE && start_floating != start_anchored) { - rv.push_back(fs); + rv.emplace_back(fs); } return rv; } @@ -454,7 +453,7 @@ void haig_do_preds(const NGHolder &g, const stateset &nfa_states, DEBUG_PRINTF("d vertex %zu\n", g[v].index); vector &out_map = preds[slot_id]; for (auto u : inv_adjacent_vertices_range(v, g)) { - out_map.push_back(g[u].index); + out_map.emplace_back(g[u].index); } sort(out_map.begin(), out_map.end()); @@ -536,7 +535,7 @@ bool doHaig(const NGHolder &g, som_type som, rdfa->state_som.reserve(rdfa->states.size()); for (u32 i = 0; i < rdfa->states.size(); i++) { - rdfa->state_som.push_back(dstate_som()); + rdfa->state_som.emplace_back(dstate_som()); const StateSet &source_states = nfa_state_map[i]; if (source_states.count() > HAIG_MAX_LIVE_SOM_SLOTS) { DEBUG_PRINTF("too many live states\n"); @@ -581,7 +580,7 @@ attemptToBuildHaig(const NGHolder &g, som_type som, u32 somPrecision, return nullptr; } - auto rdfa = ue2::make_unique(g.kind, unordered_som, NODE_START, + auto rdfa = std::make_unique(g.kind, unordered_som, NODE_START, somPrecision); DEBUG_PRINTF("determinising nfa with %u vertices\n", numStates); @@ -632,9 +631,9 @@ void haig_merge_do_preds(const vector &dfas, for (vector::const_iterator jt = it->second.begin(); jt != it->second.end(); ++jt) { if (*jt < N_SPECIALS || *jt == CREATE_NEW_SOM) { - out.push_back(*jt); + out.emplace_back(*jt); } else { - out.push_back(*jt + adj); + out.emplace_back(*jt + adj); } } } @@ -724,7 +723,7 @@ unique_ptr attemptToMergeHaig(const vector &df using StateSet = Automaton_Haig_Merge::StateSet; vector 
nfa_state_map; - auto rdfa = ue2::make_unique(dfas[0]->kind, unordered_som, + auto rdfa = std::make_unique(dfas[0]->kind, unordered_som, NODE_START, dfas[0]->stream_som_loc_width); @@ -741,7 +740,7 @@ unique_ptr attemptToMergeHaig(const vector &df vector per_dfa_adj; u32 curr_adj = 0; for (const auto &haig : dfas) { - per_dfa_adj.push_back(curr_adj); + per_dfa_adj.emplace_back(curr_adj); curr_adj += total_slots_used(*haig); if (curr_adj < per_dfa_adj.back()) { /* overflowed our som slot count */ @@ -751,7 +750,7 @@ unique_ptr attemptToMergeHaig(const vector &df rdfa->state_som.reserve(rdfa->states.size()); for (u32 i = 0; i < rdfa->states.size(); i++) { - rdfa->state_som.push_back(dstate_som()); + rdfa->state_som.emplace_back(dstate_som()); const vector &source_nfa_states = nfa_state_map[i]; DEBUG_PRINTF("finishing state %u\n", i); diff --git a/src/nfagraph/ng_is_equal.cpp b/src/nfagraph/ng_is_equal.cpp index 35a09d0ea..ca6e30b3f 100644 --- a/src/nfagraph/ng_is_equal.cpp +++ b/src/nfagraph/ng_is_equal.cpp @@ -41,7 +41,6 @@ #include "util/container.h" #include "util/flat_containers.h" #include "util/graph_range.h" -#include "util/make_unique.h" using namespace std; diff --git a/src/nfagraph/ng_lbr.cpp b/src/nfagraph/ng_lbr.cpp index d8ba503ce..039eeb3b4 100644 --- a/src/nfagraph/ng_lbr.cpp +++ b/src/nfagraph/ng_lbr.cpp @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -43,6 +44,7 @@ #include "nfa/repeatcompile.h" #include "nfa/shufticompile.h" #include "nfa/trufflecompile.h" +#include "nfa/vermicellicompile.h" #include "util/alloc.h" #include "util/bitutils.h" // for lg2 #include "util/compile_context.h" @@ -143,6 +145,10 @@ bytecode_ptr makeLbrNfa(NFAEngineType nfa_type, enum RepeatType rtype, return nfa; } +#ifdef HAVE_SVE2 +#include "ng_lbr_sve.hpp" +#endif + static bytecode_ptr buildLbrDot(const CharReach &cr, const depth &repeatMin, const depth &repeatMax, u32 minPeriod, @@ -269,6 +275,16 @@ bytecode_ptr constructLBR(const CharReach &cr, const depth &repeatMin, nfa = buildLbrNVerm(cr, repeatMin, repeatMax, minPeriod, is_reset, report); } +#ifdef HAVE_SVE2 + if (!nfa) { + nfa = buildLbrVerm16(cr, repeatMin, repeatMax, minPeriod, is_reset, + report); + } + if (!nfa) { + nfa = buildLbrNVerm16(cr, repeatMin, repeatMax, minPeriod, is_reset, + report); + } +#endif // HAVE_SVE2 if (!nfa) { nfa = buildLbrShuf(cr, repeatMin, repeatMax, minPeriod, is_reset, report); diff --git a/src/nfagraph/ng_lbr_sve.hpp b/src/nfagraph/ng_lbr_sve.hpp new file mode 100644 index 000000000..82df3ea19 --- /dev/null +++ b/src/nfagraph/ng_lbr_sve.hpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2021, Arm Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief Large Bounded Repeat (LBR) engine build code for SVE. + */ + +static +bytecode_ptr buildLbrVerm16(const CharReach &cr, const depth &repeatMin, + const depth &repeatMax, u32 minPeriod, + bool is_reset, ReportID report) { + const CharReach escapes(~cr); + + if (escapes.count() > 16) { + return nullptr; + } + + enum RepeatType rtype = chooseRepeatType(repeatMin, repeatMax, minPeriod, + is_reset); + auto nfa = makeLbrNfa(LBR_NFA_VERM16, rtype, repeatMax); + struct lbr_verm16 *lv = (struct lbr_verm16 *)getMutableImplNfa(nfa.get()); + vermicelli16Build(escapes, (u8 *)&lv->mask); + + fillNfa(nfa.get(), &lv->common, report, repeatMin, repeatMax, + minPeriod, rtype); + + DEBUG_PRINTF("built verm16 lbr\n"); + return nfa; +} + +static +bytecode_ptr buildLbrNVerm16(const CharReach &cr, const depth &repeatMin, + const depth &repeatMax, u32 minPeriod, + bool is_reset, ReportID report) { + const CharReach escapes(cr); + + if (escapes.count() > 16) { + return nullptr; + } + + enum RepeatType rtype = chooseRepeatType(repeatMin, repeatMax, minPeriod, + is_reset); + auto nfa = makeLbrNfa(LBR_NFA_NVERM16, rtype, repeatMax); + struct lbr_verm16 *lv = (struct lbr_verm16 *)getMutableImplNfa(nfa.get()); + vermicelli16Build(escapes, (u8 *)&lv->mask); + + fillNfa(nfa.get(), &lv->common, report, repeatMin, repeatMax, + minPeriod, rtype); + + DEBUG_PRINTF("built negated verm16 lbr\n"); + return nfa; +} \ No newline at end of file diff --git a/src/nfagraph/ng_limex.cpp b/src/nfagraph/ng_limex.cpp index 922100e7a..27d8c5244 100644 --- a/src/nfagraph/ng_limex.cpp +++ b/src/nfagraph/ng_limex.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -391,7 +391,7 @@ void reusePredsAsStarts(const NGHolder &g, const map &top_reach, vector cand_starts; for (NFAVertex u : unhandled_succ_tops | map_keys) { if (hasSelfLoop(u, g)) { - cand_starts.push_back(u); + cand_starts.emplace_back(u); } } @@ -525,7 +525,7 @@ void reverseStateOrdering(unordered_map &state_ids) { if (e.second == NO_STATE) { continue; } - ordering.push_back(e.first); + ordering.emplace_back(e.first); } // Sort in reverse order by state ID. 
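The SVE LBR constructors added earlier in this diff, buildLbrVerm16 and buildLbrNVerm16, only accept repeats whose escape set fits in 16 distinct bytes; the mask handed to the runtime is produced by vermicelli16Build, which pads unused slots by repeating the first accepted byte. A scalar model of that mask layout is sketched below, taking a std::set purely for illustration where the real builder takes a CharReach. On the scan side each input byte is compared against all 16 mask bytes at once, so the duplicated padding entries are harmless.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <set>

// Scalar model of the 16-byte verm16 mask layout: every slot holds an accepted
// byte, and sets smaller than 16 are padded by repeating the first byte.
bool buildVerm16Mask(const std::set<uint8_t> &chars, uint8_t mask[16]) {
    if (chars.empty() || chars.size() > 16) {
        return false;               // cannot encode more than 16 distinct bytes
    }
    std::memset(mask, *chars.begin(), 16);
    std::size_t i = 0;
    for (uint8_t c : chars) {
        mask[i++] = c;
    }
    return true;
}
```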
@@ -632,8 +632,8 @@ bytecode_ptr constructNFA(const NGHolder &h_in, const ReportManager *rm, const map &fixed_depth_tops, const map>> &triggers, - bool compress_state, bool do_accel, bool impl_test_only, u32 hint, - const CompileContext &cc) { + bool compress_state, bool do_accel, bool impl_test_only, + bool &fast, u32 hint, const CompileContext &cc) { if (!has_managed_reports(h_in)) { rm = nullptr; } else { @@ -684,19 +684,19 @@ constructNFA(const NGHolder &h_in, const ReportManager *rm, } return generate(*h, state_ids, repeats, reportSquashMap, squashMap, tops, - zombies, do_accel, compress_state, hint, cc); + zombies, do_accel, compress_state, fast, hint, cc); } bytecode_ptr constructNFA(const NGHolder &h_in, const ReportManager *rm, const map &fixed_depth_tops, const map>> &triggers, - bool compress_state, const CompileContext &cc) { + bool compress_state, bool &fast, const CompileContext &cc) { const u32 hint = INVALID_NFA; const bool do_accel = cc.grey.accelerateNFA; const bool impl_test_only = false; return constructNFA(h_in, rm, fixed_depth_tops, triggers, compress_state, - do_accel, impl_test_only, hint, cc); + do_accel, impl_test_only, fast, hint, cc); } #ifndef RELEASE_BUILD @@ -705,11 +705,11 @@ bytecode_ptr constructNFA(const NGHolder &h_in, const ReportManager *rm, const map &fixed_depth_tops, const map>> &triggers, - bool compress_state, u32 hint, const CompileContext &cc) { + bool compress_state, bool &fast, u32 hint, const CompileContext &cc) { const bool do_accel = cc.grey.accelerateNFA; const bool impl_test_only = false; - return constructNFA(h_in, rm, fixed_depth_tops, triggers, - compress_state, do_accel, impl_test_only, hint, cc); + return constructNFA(h_in, rm, fixed_depth_tops, triggers, compress_state, + do_accel, impl_test_only, fast, hint, cc); } #endif // RELEASE_BUILD @@ -739,9 +739,10 @@ bytecode_ptr constructReversedNFA_i(const NGHolder &h_in, u32 hint, vector repeats; unordered_map reportSquashMap; unordered_map squashMap; + UNUSED bool fast = false; return generate(h, state_ids, repeats, reportSquashMap, squashMap, tops, - zombies, false, false, hint, cc); + zombies, false, false, fast, hint, cc); } bytecode_ptr constructReversedNFA(const NGHolder &h_in, diff --git a/src/nfagraph/ng_limex.h b/src/nfagraph/ng_limex.h index 9bf46d693..7eba2eff0 100644 --- a/src/nfagraph/ng_limex.h +++ b/src/nfagraph/ng_limex.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -100,7 +100,7 @@ bytecode_ptr constructNFA(const NGHolder &g, const ReportManager *rm, const std::map &fixed_depth_tops, const std::map>> &triggers, - bool compress_state, const CompileContext &cc); + bool compress_state, bool &fast, const CompileContext &cc); /** * \brief Build a reverse NFA from the graph given, which should have already @@ -129,7 +129,7 @@ bytecode_ptr constructNFA(const NGHolder &g, const ReportManager *rm, const std::map &fixed_depth_tops, const std::map>> &triggers, - bool compress_state, u32 hint, const CompileContext &cc); + bool compress_state, bool &fast, u32 hint, const CompileContext &cc); /** * \brief Build a reverse NFA (with model type hint) from the graph given, diff --git a/src/nfagraph/ng_limex_accel.cpp b/src/nfagraph/ng_limex_accel.cpp index f1f829f2c..8bac753d9 100644 --- a/src/nfagraph/ng_limex_accel.cpp +++ b/src/nfagraph/ng_limex_accel.cpp @@ -148,7 
+148,7 @@ void findPaths(const NGHolder &g, NFAVertex v, if (v == g.accept || v == g.acceptEod) { paths->push_back({}); if (!generates_callbacks(g) || v == g.acceptEod) { - paths->back().push_back(CharReach()); /* red tape options */ + paths->back().emplace_back(CharReach()); /* red tape options */ } return; } @@ -181,8 +181,8 @@ void findPaths(const NGHolder &g, NFAVertex v, } while (new_depth-- && curr.size() >= MAGIC_TOO_WIDE_NUMBER); for (auto &c : curr) { - c.push_back(cr); - paths->push_back(std::move(c)); + c.emplace_back(cr); + paths->emplace_back(std::move(c)); } } } @@ -254,7 +254,7 @@ void findBestInternal(vector>::const_iterator pb, DEBUG_PRINTF("worse\n"); continue; } - priority_path.push_back(move(as)); + priority_path.emplace_back(std::move(as)); } sort(priority_path.begin(), priority_path.end()); @@ -422,7 +422,7 @@ void findDoubleBest(vector >::const_iterator pb, DEBUG_PRINTF("worse\n"); continue; } - priority_path.push_back(move(as)); + priority_path.emplace_back(std::move(as)); } sort(priority_path.begin(), priority_path.end()); @@ -569,7 +569,7 @@ AccelScheme findBestAccelScheme(vector> paths, DAccelScheme da = findBestDoubleAccelScheme(paths, terminating); if (da.double_byte.size() <= DOUBLE_SHUFTI_LIMIT) { rv.double_byte = std::move(da.double_byte); - rv.double_cr = move(da.double_cr); + rv.double_cr = std::move(da.double_cr); rv.double_offset = da.double_offset; } } diff --git a/src/nfagraph/ng_literal_analysis.cpp b/src/nfagraph/ng_literal_analysis.cpp index ea0def021..77964b812 100644 --- a/src/nfagraph/ng_literal_analysis.cpp +++ b/src/nfagraph/ng_literal_analysis.cpp @@ -67,16 +67,16 @@ namespace { struct LitGraphVertexProps { LitGraphVertexProps() = default; - explicit LitGraphVertexProps(ue2_literal::elem c_in) : c(move(c_in)) {} + explicit LitGraphVertexProps(ue2_literal::elem c_in) : c(std::move(c_in)) {} ue2_literal::elem c; // string element (char + bool) - size_t index; // managed by ue2_graph + size_t index = 0; // managed by ue2_graph }; struct LitGraphEdgeProps { LitGraphEdgeProps() = default; explicit LitGraphEdgeProps(u64a score_in) : score(score_in) {} u64a score = NO_LITERAL_AT_EDGE_SCORE; - size_t index; // managed by ue2_graph + size_t index = 0; // managed by ue2_graph }; struct LitGraph @@ -558,12 +558,12 @@ void findMinCut(LitGraph &lg, vector &cutset) { if (ucolor != small_color::white && vcolor == small_color::white) { assert(v != lg.sink); - white_cut.push_back(e); + white_cut.emplace_back(e); white_flow += lg[e].score; } if (ucolor == small_color::black && vcolor != small_color::black) { assert(v != lg.sink); - black_cut.push_back(e); + black_cut.emplace_back(e); black_flow += lg[e].score; } } @@ -657,7 +657,7 @@ u64a sanitizeAndCompressAndScore(set &lits) { continue; dont_explode: make_nocase(&s); - replacements.push_back(s); + replacements.emplace_back(s); } insert(&lits, replacements); diff --git a/src/nfagraph/ng_literal_decorated.cpp b/src/nfagraph/ng_literal_decorated.cpp index 61a31dbf3..7fa2416a1 100644 --- a/src/nfagraph/ng_literal_decorated.cpp +++ b/src/nfagraph/ng_literal_decorated.cpp @@ -39,7 +39,6 @@ #include "rose/rose_in_util.h" #include "util/compile_context.h" #include "util/dump_charclass.h" -#include "util/make_unique.h" #include #include @@ -102,8 +101,8 @@ bool findPaths(const NGHolder &g, vector &paths) { assert(read_count[g[u].index]); for (const auto &p : built[g[u].index]) { - out.push_back(p); - out.back().push_back(v); + out.emplace_back(p); + out.back().emplace_back(v); if (out.size() > MAX_PATHS) { // All 
these paths should eventually end up at a sink, so @@ -182,7 +181,7 @@ struct PathMask { if (is_special(v, g)) { continue; } - mask.push_back(g[v].char_reach); + mask.emplace_back(g[v].char_reach); } // Reports are attached to the second-to-last vertex. @@ -238,7 +237,7 @@ bool handleDecoratedLiterals(RoseBuild &rose, const NGHolder &g, DEBUG_PRINTF("failed validation\n"); return false; } - masks.push_back(move(pm)); + masks.emplace_back(std::move(pm)); } for (const auto &pm : masks) { diff --git a/src/nfagraph/ng_mcclellan.cpp b/src/nfagraph/ng_mcclellan.cpp index 4ce5dc153..1e4b743f7 100644 --- a/src/nfagraph/ng_mcclellan.cpp +++ b/src/nfagraph/ng_mcclellan.cpp @@ -45,7 +45,6 @@ #include "util/graph_range.h" #include "util/hash.h" #include "util/hash_dynamic_bitset.h" -#include "util/make_unique.h" #include "util/report_manager.h" #include @@ -116,7 +115,7 @@ void calculateAlphabet(const NGHolder &g, array &alpha, CharReach t = cr & esets[i]; if (t.any() && t != esets[i]) { esets[i] &= ~t; - esets.push_back(t); + esets.emplace_back(t); } } } @@ -401,7 +400,7 @@ class Automaton_Base { const vector initial() { vector rv = {init}; if (start_floating != DEAD_STATE && start_floating != start_anchored) { - rv.push_back(initDS); + rv.emplace_back(initDS); } return rv; } @@ -568,7 +567,7 @@ unique_ptr buildMcClellan(const NGHolder &graph, return nullptr; } - auto rdfa = ue2::make_unique(graph.kind); + auto rdfa = std::make_unique(graph.kind); if (numStates <= NFA_STATE_LIMIT) { /* Fast path. Automaton_Graph uses a bitfield internally to represent diff --git a/src/nfagraph/ng_misc_opt.cpp b/src/nfagraph/ng_misc_opt.cpp index 8aaaf99fd..2b898cf76 100644 --- a/src/nfagraph/ng_misc_opt.cpp +++ b/src/nfagraph/ng_misc_opt.cpp @@ -112,7 +112,7 @@ void findCandidates(NGHolder &g, const vector &ordering, } } DEBUG_PRINTF("vertex %zu is a candidate\n", g[v].index); - cand->push_back(v); + cand->emplace_back(v); next_cand:; } } @@ -143,7 +143,7 @@ void findCandidates_rev(NGHolder &g, const vector &ordering, } } DEBUG_PRINTF("vertex %zu is a candidate\n", g[v].index); - cand->push_back(v); + cand->emplace_back(v); next_cand:; } } @@ -385,8 +385,7 @@ bool improveGraph(NGHolder &g, som_type som) { const vector ordering = getTopoOrdering(g); - return enlargeCyclicCR(g, som, ordering) - | enlargeCyclicCR_rev(g, ordering); + return enlargeCyclicCR(g, som, ordering) || enlargeCyclicCR_rev(g, ordering); } /** finds a smaller reachability for a state by the reverse transformation of @@ -525,7 +524,7 @@ bool mergeCyclicDotStars(NGHolder &g) { add_edge_if_not_present(g.startDs, t, g); // mark this edge for removal - deadEdges.push_back(e); + deadEdges.emplace_back(e); } // if the number of edges to be removed equals out degree, vertex // needs to be removed; else, only remove the edges @@ -641,7 +640,7 @@ bool pruneUsingSuccessors(NGHolder &g, PrunePathsInfo &info, NFAVertex u, * existing in progress matches. 
*/ continue; } - u_succs.push_back(v); + u_succs.emplace_back(v); } stable_sort(u_succs.begin(), u_succs.end(), diff --git a/src/nfagraph/ng_netflow.cpp b/src/nfagraph/ng_netflow.cpp index 780a319f5..b48e33c46 100644 --- a/src/nfagraph/ng_netflow.cpp +++ b/src/nfagraph/ng_netflow.cpp @@ -193,14 +193,14 @@ vector findMinCut(NGHolder &h, const vector &scores) { DEBUG_PRINTF("found white cut edge %zu->%zu cap %llu\n", h[from].index, h[to].index, ec); observed_white_flow += ec; - picked_white.push_back(e); + picked_white.emplace_back(e); } if (fromColor == small_color::black && toColor != small_color::black) { assert(ec <= INVALID_EDGE_CAP); DEBUG_PRINTF("found black cut edge %zu->%zu cap %llu\n", h[from].index, h[to].index, ec); observed_black_flow += ec; - picked_black.push_back(e); + picked_black.emplace_back(e); } } diff --git a/src/nfagraph/ng_prefilter.cpp b/src/nfagraph/ng_prefilter.cpp index 04611872a..d26939455 100644 --- a/src/nfagraph/ng_prefilter.cpp +++ b/src/nfagraph/ng_prefilter.cpp @@ -183,7 +183,7 @@ map findRegionInfo(const NGHolder &h, } u32 id = region_map.at(v); RegionInfo &ri = regions.emplace(id, RegionInfo(id)).first->second; - ri.vertices.push_back(v); + ri.vertices.emplace_back(v); ri.reach |= h[v].char_reach; } @@ -283,7 +283,7 @@ void replaceRegion(NGHolder &g, const RegionInfo &ri, if (i > 0) { add_edge(verts.back(), v, g); } - verts.push_back(v); + verts.emplace_back(v); } if (maxWidth.is_infinite()) { diff --git a/src/nfagraph/ng_prune.cpp b/src/nfagraph/ng_prune.cpp index adda70312..73d7e64b2 100644 --- a/src/nfagraph/ng_prune.cpp +++ b/src/nfagraph/ng_prune.cpp @@ -64,7 +64,7 @@ void pruneUnreachable(NGHolder &g) { // accept->acceptEod), so all non-specials are unreachable. for (auto v : vertices_range(g)) { if (!is_special(v, g)) { - dead.push_back(v); + dead.emplace_back(v); } } } else { @@ -88,7 +88,7 @@ void pruneUnreachable(NGHolder &g) { continue; } if (!contains(colours, v)) { - dead.push_back(v); + dead.emplace_back(v); } } } @@ -120,7 +120,7 @@ bool pruneForwardUseless(NGHolder &h, const nfag_t &g, if (!is_special(v, g) && get(colors, v) == small_color::white) { DEBUG_PRINTF("vertex %zu is unreachable from %zu\n", g[v].index, g[s].index); - dead.push_back(NFAVertex(v)); + dead.emplace_back(NFAVertex(v)); } } @@ -169,7 +169,7 @@ void pruneEmptyVertices(NGHolder &g) { const CharReach &cr = g[v].char_reach; if (cr.none()) { DEBUG_PRINTF("empty: %zu\n", g[v].index); - dead.push_back(v); + dead.emplace_back(v); } } @@ -207,7 +207,7 @@ void pruneHighlanderAccepts(NGHolder &g, const ReportManager &rm) { // We can prune any out-edges that aren't accepts for (const auto &e : out_edges_range(u, g)) { if (!is_any_accept(target(e, g), g)) { - dead.push_back(e); + dead.emplace_back(e); } } } @@ -272,7 +272,7 @@ void pruneHighlanderDominated(NGHolder &g, const ReportManager &rm) { for (const auto &report_id : g[v].reports) { const Report &r = rm.getReport(report_id); if (isSimpleExhaustible(r)) { - reporters.push_back(v); + reporters.emplace_back(v); break; } } @@ -281,7 +281,7 @@ void pruneHighlanderDominated(NGHolder &g, const ReportManager &rm) { for (const auto &report_id : g[v].reports) { const Report &r = rm.getReport(report_id); if (isSimpleExhaustible(r)) { - reporters.push_back(v); + reporters.emplace_back(v); break; } } diff --git a/src/nfagraph/ng_puff.cpp b/src/nfagraph/ng_puff.cpp index 984518b0f..9b03f4c07 100644 --- a/src/nfagraph/ng_puff.cpp +++ b/src/nfagraph/ng_puff.cpp @@ -346,7 +346,7 @@ bool doComponent(RoseBuild &rose, ReportManager &rm, 
NGHolder &g, NFAVertex a, unbounded = true; } - nodes.push_back(a); + nodes.emplace_back(a); DEBUG_PRINTF("vertex %zu has in_degree %zu\n", g[a].index, in_degree(a, g)); @@ -379,7 +379,7 @@ bool doComponent(RoseBuild &rose, ReportManager &rm, NGHolder &g, NFAVertex a, if (a != g.startDs && edge(g.startDs, a, g).second && proper_out_degree(a, g) == 1 && g[a].char_reach == cr) { - nodes.push_back(a); + nodes.emplace_back(a); a = g.startDs; } diff --git a/src/nfagraph/ng_redundancy.cpp b/src/nfagraph/ng_redundancy.cpp index 06b9daeec..a499a40d4 100644 --- a/src/nfagraph/ng_redundancy.cpp +++ b/src/nfagraph/ng_redundancy.cpp @@ -207,7 +207,7 @@ void succPredIntersection(const NFAVertex v, const flat_set &predSet, // Break out if we've reduced our intersection to [v] if (best->size() == 1) { assert(*(best->begin()) == v); - intersection.push_back(v); + intersection.emplace_back(v); return; } } @@ -256,7 +256,7 @@ void predSuccIntersection(const NFAVertex v, // Break out if we've reduced our intersection to [v] if (best->size() == 1) { assert(*(best->begin()) == v); - intersection.push_back(v); + intersection.emplace_back(v); return; } } diff --git a/src/nfagraph/ng_region.cpp b/src/nfagraph/ng_region.cpp index 2675be643..1d5bc164b 100644 --- a/src/nfagraph/ng_region.cpp +++ b/src/nfagraph/ng_region.cpp @@ -100,7 +100,7 @@ void checkAndAddExitCandidate(const AcyclicGraph &g, if (!open.empty()) { DEBUG_PRINTF("exit %zu\n", g[v].index); - exits.push_back(move(v_exit)); + exits.emplace_back(std::move(v_exit)); } } @@ -210,7 +210,7 @@ void buildInitialCandidate(const AcyclicGraph &g, if (it != ite) { enters.erase(*it); - open_jumps = move(enters); + open_jumps = std::move(enters); DEBUG_PRINTF("oj size = %zu\n", open_jumps.size()); ++it; } else { diff --git a/src/nfagraph/ng_region_redundancy.cpp b/src/nfagraph/ng_region_redundancy.cpp index 1126d4d6c..a3ea558f8 100644 --- a/src/nfagraph/ng_region_redundancy.cpp +++ b/src/nfagraph/ng_region_redundancy.cpp @@ -256,7 +256,7 @@ void removeRegionRedundancy(NGHolder &g, som_type som) { } u32 region = region_map.at(v); if (contains(deadRegions, region)) { - dead.push_back(v); + dead.emplace_back(v); } } diff --git a/src/nfagraph/ng_repeat.cpp b/src/nfagraph/ng_repeat.cpp index 1f63ad3c6..2aa318089 100644 --- a/src/nfagraph/ng_repeat.cpp +++ b/src/nfagraph/ng_repeat.cpp @@ -320,7 +320,7 @@ void splitSubgraph(const NGHolder &g, const deque &verts, } u32 comp_id = rit->second; assert(comp_id < num); - rs[comp_id].vertices.push_back(v); + rs[comp_id].vertices.emplace_back(v); } for (const auto &rsi : rs) { @@ -409,7 +409,7 @@ void checkReachSubgraphs(const NGHolder &g, vector &rs, continue; } - verts.push_back(v); + verts.emplace_back(v); } if (recalc) { @@ -421,7 +421,7 @@ void checkReachSubgraphs(const NGHolder &g, vector &rs, splitSubgraph(g, verts, minNumVertices, q); } else { DEBUG_PRINTF("subgraph is ok\n"); - rs_out.push_back(rsi); + rs_out.emplace_back(rsi); } q.pop(); } @@ -638,7 +638,7 @@ void buildTugTrigger(NGHolder &g, NFAVertex cyclic, NFAVertex v, DEBUG_PRINTF("all preds in subgraph, vertex %zu becomes tug\n", g[v].index); add_edge(cyclic, v, g); - tugs.push_back(v); + tugs.emplace_back(v); return; } @@ -650,7 +650,7 @@ void buildTugTrigger(NGHolder &g, NFAVertex cyclic, NFAVertex v, DEBUG_PRINTF("there are other paths, cloned tug %zu from vertex %zu\n", g[t].index, g[v].index); - tugs.push_back(t); + tugs.emplace_back(t); add_edge(cyclic, t, g); // New vertex gets all of v's successors, including v itself if it's @@ -738,7 +738,7 @@ void 
unpeelNearEnd(NGHolder &g, ReachSubgraph &rsi, } succs->clear(); - succs->push_back(d); + succs->emplace_back(d); rsi.repeatMax -= 1; @@ -761,7 +761,7 @@ void getSuccessors(const NGHolder &g, const ReachSubgraph &rsi, if (v == last) { /* ignore self loop */ continue; } - succs->push_back(v); + succs->emplace_back(v); } } @@ -837,7 +837,7 @@ void replaceSubgraphWithSpecial(NGHolder &g, ReachSubgraph &rsi, remove_vertices(rsi.vertices, g, false); erase_all(&depths, rsi.vertices); - repeats->push_back(BoundedRepeatData(rsi.historyType, rsi.repeatMin, + repeats->emplace_back(BoundedRepeatData(rsi.historyType, rsi.repeatMin, rsi.repeatMax, rsi.minPeriod, cyclic, pos_trigger, tugs)); } @@ -905,7 +905,7 @@ void replaceSubgraphWithLazySpecial(NGHolder &g, ReachSubgraph &rsi, remove_vertices(rsi.vertices, g, false); erase_all(&depths, rsi.vertices); - repeats->push_back(BoundedRepeatData(rsi.historyType, rsi.repeatMin, + repeats->emplace_back(BoundedRepeatData(rsi.historyType, rsi.repeatMin, rsi.repeatMax, rsi.minPeriod, cyclic, pos_trigger, tugs)); } @@ -1057,7 +1057,7 @@ void buildReachSubgraphs(const NGHolder &g, vector &rs, } u32 comp_id = rit->second; assert(comp_id < num); - rs[comp_id].vertices.push_back(v); + rs[comp_id].vertices.emplace_back(v); } #ifdef DEBUG @@ -1176,9 +1176,9 @@ void addTriggers(NGHolder &g, goto next_edge; } - starts_by_top[top].push_back(v); + starts_by_top[top].emplace_back(v); } - dead.push_back(e); + dead.emplace_back(e); next_edge:; } @@ -1519,7 +1519,7 @@ struct StrawWalker { } v = next; - straw.push_back(v); + straw.emplace_back(v); } straw.clear(); @@ -1615,13 +1615,13 @@ vector getUnionedTrigger(const NGHolder &g, const NFAVertex v) { if (contains(curr, g.start)) { DEBUG_PRINTF("start in repeat's immediate preds\n"); - trigger.push_back(CharReach::dot()); // Trigger could be anything! + trigger.emplace_back(CharReach::dot()); // Trigger could be anything! return trigger; } for (size_t num_steps = 0; num_steps < MAX_TRIGGER_STEPS; num_steps++) { next.clear(); - trigger.push_back(CharReach()); + trigger.emplace_back(CharReach()); CharReach &cr = trigger.back(); for (auto v_c : curr) { @@ -1664,7 +1664,7 @@ vector> getRepeatTriggers(const NGHolder &g, triggers.push_back({}); // empty return triggers; } - q.push_back(Path(1, u)); + q.emplace_back(Path(1, u)); } while (!q.empty()) { @@ -1673,7 +1673,7 @@ vector> getRepeatTriggers(const NGHolder &g, if (path.size() >= max_len) { max_len = min(max_len, path.size()); - done.push_back(path); + done.emplace_back(path); goto next_path; } @@ -1682,16 +1682,16 @@ vector> getRepeatTriggers(const NGHolder &g, // Found an accept. There's no point expanding this path any // further, we're done. 
max_len = min(max_len, path.size()); - done.push_back(path); + done.emplace_back(path); goto next_path; } if (path.size() + 1 >= max_len) { - done.push_back(path); - done.back().push_back(u); + done.emplace_back(path); + done.back().emplace_back(u); } else { - q.push_back(path); // copy - q.back().push_back(u); + q.emplace_back(path); // copy + q.back().emplace_back(u); } } @@ -1703,7 +1703,7 @@ vector> getRepeatTriggers(const NGHolder &g, if (q.size() + done.size() > UNIONED_FALLBACK_THRESHOLD) { DEBUG_PRINTF("search too large, fall back to union trigger\n"); triggers.clear(); - triggers.push_back(getUnionedTrigger(g, sink)); + triggers.emplace_back(getUnionedTrigger(g, sink)); return triggers; } } @@ -1715,7 +1715,7 @@ vector> getRepeatTriggers(const NGHolder &g, for (const auto &path : done) { vector reach_path; for (auto jt = path.rbegin(), jte = path.rend(); jt != jte; ++jt) { - reach_path.push_back(g[*jt].char_reach); + reach_path.emplace_back(g[*jt].char_reach); } unique_triggers.insert(reach_path); } @@ -1960,7 +1960,7 @@ vector makeOwnStraw(NGHolder &g, BoundedRepeatData &rd, if (!own_straw.empty()) { add_edge(own_straw.back(), v2, g); } - own_straw.push_back(v2); + own_straw.emplace_back(v2); } // Wire our straw to start, not startDs. @@ -2536,7 +2536,7 @@ void findRepeats(const NGHolder &h, u32 minRepeatVertices, repeatMax = depth::infinity(); /* will continue to pump out matches */ } - repeats_out->push_back(GraphRepeatInfo()); + repeats_out->emplace_back(GraphRepeatInfo()); GraphRepeatInfo &ri = repeats_out->back(); ri.vertices.swap(rsi.vertices); ri.repeatMin = rsi.repeatMin; diff --git a/src/nfagraph/ng_restructuring.cpp b/src/nfagraph/ng_restructuring.cpp index 704697e57..73b4d23e5 100644 --- a/src/nfagraph/ng_restructuring.cpp +++ b/src/nfagraph/ng_restructuring.cpp @@ -56,7 +56,7 @@ void wireStartToTops(NGHolder &g, const flat_set &tops, assert(!isLeafNode(v, g)); const NFAEdge &e = add_edge(g.start, v, g); - tempEdges.push_back(e); + tempEdges.emplace_back(e); } } @@ -109,10 +109,10 @@ void getStateOrdering(NGHolder &g, const flat_set &tops, temp.erase(remove(temp.begin(), temp.end(), g.startDs)); temp.erase(remove(temp.begin(), temp.end(), g.start)); if (proper_out_degree(g.startDs, g)) { - temp.push_back(g.startDs); + temp.emplace_back(g.startDs); } if (!startIsRedundant(g)) { - temp.push_back(g.start); + temp.emplace_back(g.start); } // Walk ordering, remove vertices that shouldn't be participating in state @@ -122,7 +122,7 @@ void getStateOrdering(NGHolder &g, const flat_set &tops, continue; // accepts don't need states } - ordering.push_back(v); + ordering.emplace_back(v); } // Output of topo order was in reverse. 
@@ -167,7 +167,7 @@ void optimiseTightLoops(const NGHolder &g, vector &ordering) { continue; } if (edge(t, v, g).second && find(start, it, t) != ite) { - candidates.push_back(make_pair(v, t)); + candidates.emplace_back(make_pair(v, t)); } } } diff --git a/src/nfagraph/ng_som.cpp b/src/nfagraph/ng_som.cpp index d23ac408b..359fa17bc 100644 --- a/src/nfagraph/ng_som.cpp +++ b/src/nfagraph/ng_som.cpp @@ -65,7 +65,6 @@ #include "util/container.h" #include "util/dump_charclass.h" #include "util/graph_range.h" -#include "util/make_unique.h" #include #include @@ -166,12 +165,12 @@ void buildRegionMapping(const NGHolder &g, } if (isRegionEntry(g, v, regions)) { - info[region].enters.push_back(v); + info[region].enters.emplace_back(v); } if (isRegionExit(g, v, regions)) { - info[region].exits.push_back(v); + info[region].exits.emplace_back(v); } - info[region].full.push_back(v); + info[region].full.emplace_back(v); } for (auto &m : info) { @@ -364,7 +363,7 @@ makePrefix(const NGHolder &g, const unordered_map ®ions, assert(!next_enters.empty()); assert(!curr_exits.empty()); - unique_ptr prefix_ptr = ue2::make_unique(); + unique_ptr prefix_ptr = std::make_unique(); NGHolder &prefix = *prefix_ptr; deque lhs_verts; @@ -410,7 +409,7 @@ makePrefix(const NGHolder &g, const unordered_map ®ions, if (p_v == prefix.accept || regions.at(v) < dead_region) { continue; } - to_clear.push_back(p_v); + to_clear.emplace_back(p_v); } for (auto v : to_clear) { @@ -1045,7 +1044,7 @@ void addReporterVertices(const region_info &r, const NGHolder &g, for (auto v : r.exits) { if (edge(v, g.accept, g).second || edge(v, g.acceptEod, g).second) { DEBUG_PRINTF("add reporter %zu\n", g[v].index); - reporters.push_back(v); + reporters.emplace_back(v); } } } @@ -1060,7 +1059,7 @@ void addMappedReporterVertices(const region_info &r, const NGHolder &g, DEBUG_PRINTF("adding v=%zu\n", g[v].index); auto it = mapping.find(v); assert(it != mapping.end()); - reporters.push_back(it->second); + reporters.emplace_back(it->second); } } } @@ -1109,7 +1108,7 @@ void expandGraph(NGHolder &g, unordered_map ®ions, if (is_special(v, g) || regions.at(v) < split_region) { continue; } - tail_vertices.push_back(v); + tail_vertices.emplace_back(v); } for (auto enter : enters) { @@ -1166,7 +1165,7 @@ void expandGraph(NGHolder &g, unordered_map ®ions, }, g); } - new_enters.push_back(orig_to_copy[enter]); + new_enters.emplace_back(orig_to_copy[enter]); } // Remove the original set of tail vertices. @@ -1659,7 +1658,7 @@ void anchorStarts(NGHolder &g) { continue; } add_edge_if_not_present(g.start, v, g[e], g); - dead.push_back(e); + dead.emplace_back(e); } remove_edges(dead, g); } @@ -1720,7 +1719,7 @@ void clearProperInEdges(NGHolder &g, const NFAVertex sink) { if (source(e, g) == g.accept) { continue; } - dead.push_back(e); + dead.emplace_back(e); } if (dead.empty()) { @@ -1734,7 +1733,7 @@ void clearProperInEdges(NGHolder &g, const NFAVertex sink) { namespace { struct SomRevNfa { SomRevNfa(NFAVertex s, ReportID r, bytecode_ptr n) - : sink(s), report(r), nfa(move(n)) {} + : sink(s), report(r), nfa(std::move(n)) {} NFAVertex sink; ReportID report; bytecode_ptr nfa; @@ -1800,7 +1799,7 @@ bool makeSomRevNfa(vector &som_nfas, const NGHolder &g, return false; } - som_nfas.emplace_back(sink, report, move(nfa)); + som_nfas.emplace_back(sink, report, std::move(nfa)); return true; } @@ -1840,7 +1839,7 @@ bool doSomRevNfa(NG &ng, NGHolder &g, const CompileContext &cc) { assert(som_nfa.nfa); // Transfer ownership of the NFA to the SOM slot manager. 
- u32 comp_id = ng.ssm.addRevNfa(move(som_nfa.nfa), maxWidth); + u32 comp_id = ng.ssm.addRevNfa(std::move(som_nfa.nfa), maxWidth); // Replace this report on 'g' with a SOM_REV_NFA report pointing at our // new component. @@ -1873,7 +1872,7 @@ u32 doSomRevNfaPrefix(NG &ng, const ExpressionInfo &expr, NGHolder &g, max(cc.grey.maxHistoryAvailable, ng.maxSomRevHistoryAvailable)); } - return ng.ssm.addRevNfa(move(nfa), maxWidth); + return ng.ssm.addRevNfa(std::move(nfa), maxWidth); } static @@ -2214,7 +2213,7 @@ bool leadingLiterals(const NGHolder &g, set *lits, sds_succ.erase(g.startDs); map > curr; - curr[g.startDs].push_back(ue2_literal()); + curr[g.startDs].emplace_back(ue2_literal()); map > seen; map > next; @@ -2273,7 +2272,7 @@ bool leadingLiterals(const NGHolder &g, set *lits, goto exit; } did_expansion = true; - out.push_back(lit); + out.emplace_back(lit); out.back().push_back(c, nocase); count++; if (out.back().length() > MAX_MASK2_WIDTH @@ -2446,6 +2445,10 @@ static bool doLitHaigSom(NG &ng, NGHolder &g, som_type som) { ue2_literal lit; shared_ptr rhs = make_shared(); + if (!rhs) { + assert(0); + throw std::bad_alloc(); + } if (!ng.cc.grey.allowLitHaig) { return false; } @@ -2469,7 +2472,7 @@ bool doLitHaigSom(NG &ng, NGHolder &g, som_type som) { dumpHolder(*rhs, 91, "lithaig_rhs", ng.cc.grey); vector > triggers; - triggers.push_back(as_cr_seq(lit)); + triggers.emplace_back(as_cr_seq(lit)); assert(rhs->kind == NFA_SUFFIX); shared_ptr haig @@ -2510,6 +2513,11 @@ bool doHaigLitHaigSom(NG &ng, NGHolder &g, ue2_literal lit; shared_ptr rhs = make_shared(); shared_ptr lhs = make_shared(); + if (!rhs || !lhs) { + assert(0); + throw std::bad_alloc(); + } + if (!splitOffBestLiteral(g, regions, &lit, &*lhs, &*rhs, ng.cc)) { return false; } @@ -2579,7 +2587,7 @@ bool doHaigLitHaigSom(NG &ng, NGHolder &g, assert(rhs->kind == NFA_SUFFIX); vector > triggers; - triggers.push_back(as_cr_seq(lit)); + triggers.emplace_back(as_cr_seq(lit)); ue2_literal lit2; if (getTrailingLiteral(g, &lit2) @@ -2677,7 +2685,7 @@ bool doMultiLitHaigSom(NG &ng, NGHolder &g, som_type som) { } assert(lit.length() <= MAX_MASK2_WIDTH || !mixed_sensitivity(lit)); - triggers.push_back(as_cr_seq(lit)); + triggers.emplace_back(as_cr_seq(lit)); } bool unordered_som_triggers = true; /* TODO: check overlaps to ensure that @@ -2791,7 +2799,7 @@ map::const_iterator tryForLaterRevNfaCut(const NGHolder &g, continue; } - cands.push_back(it); + cands.emplace_back(it); } while (!cands.empty()) { @@ -3023,7 +3031,7 @@ sombe_rv doSom(NG &ng, NGHolder &g, const ExpressionInfo &expr, u32 comp_id, vector plan; retry: // Note: no-one should ever pay attention to the root plan's parent. 
- plan.push_back(som_plan(prefix, escapes, false, 0)); + plan.emplace_back(som_plan(prefix, escapes, false, 0)); dumpHolder(*plan.back().prefix, 12, "som_prefix", cc.grey); if (!prefix_by_rev) { if (!doSomPlanning(g, stuck, regions, info, picked, plan, cc.grey)) { diff --git a/src/nfagraph/ng_som_add_redundancy.cpp b/src/nfagraph/ng_som_add_redundancy.cpp index 33544ec17..871679d99 100644 --- a/src/nfagraph/ng_som_add_redundancy.cpp +++ b/src/nfagraph/ng_som_add_redundancy.cpp @@ -102,7 +102,7 @@ bool forkVertex(NFAVertex v, NGHolder &g, vector &depths, for (const auto &e : in_edges_range(v, g)) { const DepthMinMax &d = getDepth(source(e, g), g, depths); assert(d.min == d.max); - predGroups[d.min].push_back(e); + predGroups[d.min].emplace_back(e); } DEBUG_PRINTF("forking vertex with %zu pred groups\n", predGroups.size()); @@ -121,7 +121,7 @@ bool forkVertex(NFAVertex v, NGHolder &g, vector &depths, NFAVertex clone = add_vertex(g[v], g); depth clone_depth = predDepth + 1; g[clone].index = clone_idx; - depths.push_back(DepthMinMax(clone_depth, clone_depth)); + depths.emplace_back(DepthMinMax(clone_depth, clone_depth)); DEBUG_PRINTF("cloned vertex %u with depth %s\n", clone_idx, clone_depth.str().c_str()); diff --git a/src/nfagraph/ng_som_util.cpp b/src/nfagraph/ng_som_util.cpp index 1e7a41bb0..82277c061 100644 --- a/src/nfagraph/ng_som_util.cpp +++ b/src/nfagraph/ng_som_util.cpp @@ -60,10 +60,10 @@ vector getDistancesFromSOM(const NGHolder &g_orig) { vector vstarts; for (auto v : vertices_range(g)) { if (is_virtual_start(v, g)) { - vstarts.push_back(v); + vstarts.emplace_back(v); } } - vstarts.push_back(g.startDs); + vstarts.emplace_back(g.startDs); // wire the successors of every virtual start or startDs to g.start. for (auto v : vstarts) { diff --git a/src/nfagraph/ng_squash.cpp b/src/nfagraph/ng_squash.cpp index 03495d144..0b51792b1 100644 --- a/src/nfagraph/ng_squash.cpp +++ b/src/nfagraph/ng_squash.cpp @@ -281,7 +281,7 @@ void findDerivedSquashers(const NGHolder &g, const vector &vByIndex, smgb_cache &cache) { deque remaining; for (const auto &m : *squash) { - remaining.push_back(m.first); + remaining.emplace_back(m.first); } while (!remaining.empty()) { @@ -313,7 +313,7 @@ void findDerivedSquashers(const NGHolder &g, const vector &vByIndex, DEBUG_PRINTF("%zu is an upstream squasher of %zu\n", u_index, g[v].index); (*squash)[u] = u_squash; - remaining.push_back(u); + remaining.emplace_back(u); } } } @@ -639,7 +639,7 @@ vector findUnreachable(const NGHolder &g) { vector unreach; for (auto v : vertices_range(revg)) { if (!contains(colours, v)) { - unreach.push_back(NFAVertex(v)); + unreach.emplace_back(NFAVertex(v)); } } return unreach; diff --git a/src/nfagraph/ng_uncalc_components.cpp b/src/nfagraph/ng_uncalc_components.cpp index 4ad5ff787..a10673e69 100644 --- a/src/nfagraph/ng_uncalc_components.cpp +++ b/src/nfagraph/ng_uncalc_components.cpp @@ -92,7 +92,7 @@ struct ranking_info { u32 add_to_tail(NFAVertex v) { u32 rank = size(); to_rank[v] = rank; - to_vertex.push_back(v); + to_vertex.emplace_back(v); return rank; } diff --git a/src/nfagraph/ng_utf8.cpp b/src/nfagraph/ng_utf8.cpp index 89500fe39..72b4ba9b6 100644 --- a/src/nfagraph/ng_utf8.cpp +++ b/src/nfagraph/ng_utf8.cpp @@ -178,7 +178,7 @@ void findSeeds(const NGHolder &h, const bool som, vector *seeds) { } DEBUG_PRINTF("%zu is a seed\n", h[v].index); - seeds->push_back(v); + seeds->emplace_back(v); already_seeds.insert(v); } } diff --git a/src/nfagraph/ng_util.cpp b/src/nfagraph/ng_util.cpp index cb2b71035..b1d39d2e3 100644 --- 
a/src/nfagraph/ng_util.cpp +++ b/src/nfagraph/ng_util.cpp @@ -39,7 +39,6 @@ #include "parser/position.h" #include "util/graph_range.h" #include "util/graph_small_color_map.h" -#include "util/make_unique.h" #include "util/order_check.h" #include "util/ue2string.h" #include "util/report_manager.h" @@ -407,7 +406,7 @@ void appendLiteral(NGHolder &h, const ue2_literal &s) { vector tail; assert(in_degree(h.acceptEod, h) == 1); for (auto v : inv_adjacent_vertices_range(h.accept, h)) { - tail.push_back(v); + tail.emplace_back(v); } assert(!tail.empty()); @@ -422,7 +421,7 @@ void appendLiteral(NGHolder &h, const ue2_literal &s) { add_edge(u, v, h); } tail.clear(); - tail.push_back(v); + tail.emplace_back(v); } for (auto v : tail) { @@ -596,7 +595,7 @@ void cloneHolder(NGHolder &out, const NGHolder &in, } unique_ptr cloneHolder(const NGHolder &in) { - unique_ptr h = ue2::make_unique(); + unique_ptr h = std::make_unique(); cloneHolder(*h, in); return h; } diff --git a/src/nfagraph/ng_violet.cpp b/src/nfagraph/ng_violet.cpp index 78d73082a..02461e981 100644 --- a/src/nfagraph/ng_violet.cpp +++ b/src/nfagraph/ng_violet.cpp @@ -62,7 +62,6 @@ #include "util/graph_range.h" #include "util/graph_small_color_map.h" #include "util/insertion_ordered.h" -#include "util/make_unique.h" #include "util/order_check.h" #include "util/target_info.h" #include "util/ue2string.h" @@ -70,6 +69,7 @@ #include #include #include +#include #include #include @@ -375,7 +375,7 @@ void getSimpleRoseLiterals(const NGHolder &g, bool seeking_anchored, DEBUG_PRINTF("candidate is a candidate\n"); scores[v] = score; - lit_info[v] = make_unique(v, s, anchored); + lit_info[v] = std::make_unique(v, s, anchored); } /* try to filter out cases where appending some characters produces worse @@ -394,7 +394,7 @@ void getSimpleRoseLiterals(const NGHolder &g, bool seeking_anchored, lits->reserve(lit_info.size()); for (auto &m : lit_info) { - lits->push_back(move(m.second)); + lits->emplace_back(std::move(m.second)); } DEBUG_PRINTF("%zu candidate literal sets\n", lits->size()); } @@ -434,7 +434,7 @@ void getRegionRoseLiterals(const NGHolder &g, bool seeking_anchored, } if (isRegionExit(g, v, regions)) { - exits[region].push_back(v); + exits[region].emplace_back(v); } if (isRegionEntry(g, v, regions)) { @@ -531,7 +531,7 @@ void getRegionRoseLiterals(const NGHolder &g, bool seeking_anchored, } DEBUG_PRINTF("candidate is a candidate\n"); - lits->push_back(make_unique(vv, s, anchored)); + lits->emplace_back(std::make_unique(vv, s, anchored)); } } @@ -592,7 +592,7 @@ void getCandidatePivots(const NGHolder &g, set *cand, assert(ait != accepts.end()); NFAVertex curr = *ait; while (curr && !is_special(curr, g)) { - dom_trace.push_back(curr); + dom_trace.emplace_back(curr); curr = dominators[curr]; } reverse(dom_trace.begin(), dom_trace.end()); @@ -600,7 +600,7 @@ void getCandidatePivots(const NGHolder &g, set *cand, curr = *ait; vector dom_trace2; while (curr && !is_special(curr, g)) { - dom_trace2.push_back(curr); + dom_trace2.emplace_back(curr); curr = dominators[curr]; } reverse(dom_trace2.begin(), dom_trace2.end()); @@ -707,11 +707,11 @@ unique_ptr findBestSplit(const NGHolder &g, auto cmp = LitComparator(g, seeking_anchored, seeking_transient, last_chance); - unique_ptr best = move(lits.back()); + unique_ptr best = std::move(lits.back()); lits.pop_back(); while (!lits.empty()) { if (cmp(best, lits.back())) { - best = move(lits.back()); + best = std::move(lits.back()); } lits.pop_back(); } @@ -945,7 +945,7 @@ unique_ptr findSimplePrefixSplit(const 
NGHolder &g, sanitizeAndCompressAndScore(best_lit_set); } - return ue2::make_unique(best_v, best_lit_set, anchored, true); + return std::make_unique(best_v, best_lit_set, anchored, true); } static @@ -1036,6 +1036,11 @@ bool splitRoseEdge(const NGHolder &base_graph, RoseInGraph &vg, shared_ptr lhs = make_shared(); shared_ptr rhs = make_shared(); + if (!lhs || !rhs) { + assert(0); + throw std::bad_alloc(); + } + unordered_map lhs_map; unordered_map rhs_map; @@ -1095,7 +1100,7 @@ bool splitRoseEdge(const NGHolder &base_graph, RoseInGraph &vg, for (const RoseInEdge &e : ee) { RoseInVertex src = source(e, vg); RoseInVertex dest = target(e, vg); - images[src].push_back(dest); + images[src].emplace_back(dest); remove_edge(e, vg); } @@ -1149,7 +1154,7 @@ bool splitRoseEdge(const NGHolder &base_graph, RoseInGraph &vg, add_edge(v, dest, RoseInEdgeProps(rhs, 0U), vg); } } - verts_by_image[image].push_back(v); + verts_by_image[image].emplace_back(v); } } @@ -1229,6 +1234,10 @@ void splitEdgesByCut(NGHolder &h, RoseInGraph &vg, DEBUG_PRINTF("splitting on pivot %zu\n", h[pivot].index); unordered_map temp_map; shared_ptr new_lhs = make_shared(); + if (!new_lhs) { + assert(0); + throw std::bad_alloc(); + } splitLHS(h, pivot, new_lhs.get(), &temp_map); /* want to cut off paths to pivot from things other than the pivot - @@ -1310,6 +1319,10 @@ void splitEdgesByCut(NGHolder &h, RoseInGraph &vg, if (!contains(done_rhs, adj)) { unordered_map temp_map; shared_ptr new_rhs = make_shared(); + if (!new_rhs) { + assert(0); + throw std::bad_alloc(); + } splitRHS(h, adj, new_rhs.get(), &temp_map); remove_edge(new_rhs->start, new_rhs->accept, *new_rhs); remove_edge(new_rhs->start, new_rhs->acceptEod, *new_rhs); @@ -1598,7 +1611,7 @@ void removeRedundantLiteralsFromPrefixes(RoseInGraph &g, if (delay == lit.length() && edge(h->start, h->accept, *h).second && num_vertices(*h) == N_SPECIALS) { - to_anchor.push_back(e); + to_anchor.emplace_back(e); continue; } @@ -1608,7 +1621,7 @@ void removeRedundantLiteralsFromPrefixes(RoseInGraph &g, if (delay && delay != MO_INVALID_IDX) { DEBUG_PRINTF("setting delay %u on lhs %p\n", delay, h.get()); - g[e].graph = move(h); + g[e].graph = std::move(h); g[e].graph_lag = delay; } } @@ -1775,7 +1788,7 @@ void removeRedundantLiteralsFromInfixes(RoseInGraph &g, } NGHolder *h = g[e].graph.get(); - infixes[h].push_back(e); + infixes[h].emplace_back(e); } for (const auto &m : infixes) { @@ -1835,7 +1848,7 @@ static unique_ptr make_chain(u32 count) { assert(count); - auto rv = make_unique(NFA_INFIX); + auto rv = std::make_unique(NFA_INFIX); NGHolder &h = *rv; @@ -2110,7 +2123,7 @@ void findBetterPrefixes(RoseInGraph &vg, const CompileContext &cc) { assert(vg[target(e, vg)].type == RIV_LITERAL); if (vg[e].graph) { NGHolder *h = vg[e].graph.get(); - prefixes[h].push_back(e); + prefixes[h].emplace_back(e); } } @@ -2174,7 +2187,7 @@ void extractStrongLiterals(RoseInGraph &vg, const CompileContext &cc) { if (vg[ve].graph) { NGHolder *h = vg[ve].graph.get(); - edges_by_graph[h].push_back(ve); + edges_by_graph[h].emplace_back(ve); } } @@ -2262,7 +2275,7 @@ void improveWeakInfixes(RoseInGraph &vg, const CompileContext &cc) { for (const RoseInEdge &ve : edges_range(vg)) { NGHolder *h = vg[ve].graph.get(); if (contains(weak, h)) { - weak_edges[h].push_back(ve); + weak_edges[h].emplace_back(ve); } } @@ -2281,6 +2294,10 @@ void splitEdgesForSuffix(const NGHolder &base_graph, RoseInGraph &vg, assert(!splitters.empty()); shared_ptr lhs = make_shared(); + if (!lhs) { + assert(0); + throw bad_alloc(); + } 
unordered_map v_map; cloneHolder(*lhs, base_graph, &v_map); lhs->kind = NFA_INFIX; @@ -2366,7 +2383,7 @@ bool replaceSuffixWithInfix(const NGHolder &h, RoseInGraph &vg, VertLitInfo &vli = by_reports[make_pair(false, h[v].reports)]; insert(&vli.lit, ss); - vli.vv.push_back(v); + vli.vv.emplace_back(v); seen.insert(v); } @@ -2384,7 +2401,7 @@ bool replaceSuffixWithInfix(const NGHolder &h, RoseInGraph &vg, VertLitInfo &vli = by_reports[make_pair(true, h[v].reports)]; insert(&vli.lit, ss); - vli.vv.push_back(v); + vli.vv.emplace_back(v); } assert(!by_reports.empty()); @@ -2435,7 +2452,7 @@ void avoidSuffixes(RoseInGraph &vg, const CompileContext &cc) { assert(vg[e].graph); /* non suffix paths should be wired to other accepts */ const NGHolder *h = vg[e].graph.get(); - suffixes[h].push_back(e); + suffixes[h].emplace_back(e); } /* look at suffixes and try to split */ @@ -2530,7 +2547,7 @@ void lookForDoubleCut(RoseInGraph &vg, const CompileContext &cc) { for (const RoseInEdge &ve : edges_range(vg)) { if (vg[ve].graph && vg[source(ve, vg)].type == RIV_LITERAL) { const NGHolder *h = vg[ve].graph.get(); - right_edges[h].push_back(ve); + right_edges[h].emplace_back(ve); } } @@ -2671,7 +2688,7 @@ void decomposeLiteralChains(RoseInGraph &vg, const CompileContext &cc) { for (const RoseInEdge &ve : edges_range(vg)) { if (vg[ve].graph && vg[source(ve, vg)].type == RIV_LITERAL) { const NGHolder *h = vg[ve].graph.get(); - right_edges[h].push_back(ve); + right_edges[h].emplace_back(ve); } } @@ -2721,7 +2738,7 @@ void lookForCleanEarlySplits(RoseInGraph &vg, const CompileContext &cc) { for (const RoseInEdge &e : out_edges_range(v, vg)) { if (vg[e].graph) { NGHolder *h = vg[e].graph.get(); - rightfixes[h].push_back(e); + rightfixes[h].emplace_back(e); } } } @@ -2757,7 +2774,7 @@ void rehomeEodSuffixes(RoseInGraph &vg) { continue; } - acc_edges.push_back(e); + acc_edges.emplace_back(e); } for (const RoseInEdge &e : acc_edges) { @@ -2797,7 +2814,7 @@ vector> getDfaTriggers(RoseInGraph &vg, for (const auto &e : edges) { RoseInVertex s = source(e, vg); if (vg[s].type == RIV_LITERAL) { - triggers.push_back(as_cr_seq(vg[s].s)); + triggers.emplace_back(as_cr_seq(vg[s].s)); } ENSURE_AT_LEAST(&max_offset, vg[s].max_offset); LIMIT_TO_AT_MOST(&min_offset, vg[s].min_offset); @@ -2911,7 +2928,7 @@ bool ensureImplementable(RoseBuild &rose, RoseInGraph &vg, bool allow_changes, for (const RoseInEdge &ve : edges_range(vg)) { if (vg[ve].graph && !vg[ve].dfa) { auto &h = vg[ve].graph; - edges_by_graph[h].push_back(ve); + edges_by_graph[h].emplace_back(ve); } } for (auto &m : edges_by_graph) { diff --git a/src/parser/ComponentAlternation.cpp b/src/parser/ComponentAlternation.cpp index 3e6515fa4..e38c9ce79 100644 --- a/src/parser/ComponentAlternation.cpp +++ b/src/parser/ComponentAlternation.cpp @@ -57,7 +57,7 @@ ComponentAlternation::ComponentAlternation(const ComponentAlternation &other) : Component(other) { for (const auto &c : other.children) { assert(c); - children.push_back(unique_ptr(c->clone())); + children.emplace_back(unique_ptr(c->clone())); } } @@ -103,7 +103,7 @@ void ComponentAlternation::accept(ConstComponentVisitor &v) const { } void ComponentAlternation::append(unique_ptr component) { - children.push_back(move(component)); + children.emplace_back(std::move(component)); } vector ComponentAlternation::first() const { diff --git a/src/parser/ComponentBoundary.cpp b/src/parser/ComponentBoundary.cpp index efd6bf88d..e8eafc8cb 100644 --- a/src/parser/ComponentBoundary.cpp +++ b/src/parser/ComponentBoundary.cpp @@ 
-94,11 +94,11 @@ void ComponentBoundary::notePositions(GlushkovBuildState & bs) { { PositionInfo epsilon(GlushkovBuildState::POS_EPSILON); epsilon.flags = POS_FLAG_NOFLOAT; - m_first.push_back(epsilon); + m_first.emplace_back(epsilon); // We have the start vertex in firsts so that we can discourage // the mid-pattern use of boundaries. - m_first.push_back(startState); + m_first.emplace_back(startState); break; } @@ -106,11 +106,11 @@ void ComponentBoundary::notePositions(GlushkovBuildState & bs) { { PositionInfo epsilon(GlushkovBuildState::POS_EPSILON); epsilon.flags = POS_FLAG_NOFLOAT; - m_first.push_back(epsilon); + m_first.emplace_back(epsilon); // We have the start vertex in firsts so that we can discourage // the mid-pattern use of boundaries. - m_first.push_back(startState); + m_first.emplace_back(startState); // Newline m_newline = makeNewline(bs); @@ -118,8 +118,8 @@ void ComponentBoundary::notePositions(GlushkovBuildState & bs) { builder.setAssertFlag(m_newline, POS_FLAG_VIRTUAL_START); PositionInfo nl(m_newline); nl.flags = POS_FLAG_MUST_FLOAT | POS_FLAG_FIDDLE_ACCEPT; - m_first.push_back(nl); - m_last.push_back(nl); + m_first.emplace_back(nl); + m_last.emplace_back(nl); recordPosBounds(m_newline, m_newline + 1); break; } @@ -128,7 +128,7 @@ void ComponentBoundary::notePositions(GlushkovBuildState & bs) { PositionInfo epsilon(GlushkovBuildState::POS_EPSILON); epsilon.flags = POS_FLAG_WIRE_EOD | POS_FLAG_NO_NL_EOD | POS_FLAG_NO_NL_ACCEPT | POS_FLAG_ONLY_ENDS; - m_first.push_back(epsilon); + m_first.emplace_back(epsilon); break; } case END_STRING_OPTIONAL_LF: // end of data with optional LF ('$') @@ -136,7 +136,7 @@ void ComponentBoundary::notePositions(GlushkovBuildState & bs) { PositionInfo epsilon(GlushkovBuildState::POS_EPSILON); epsilon.flags = POS_FLAG_WIRE_EOD | POS_FLAG_WIRE_NL_EOD | POS_FLAG_NO_NL_ACCEPT | POS_FLAG_ONLY_ENDS; - m_first.push_back(epsilon); + m_first.emplace_back(epsilon); break; } case END_LINE: // multiline anchor: end of data or a newline @@ -144,7 +144,7 @@ void ComponentBoundary::notePositions(GlushkovBuildState & bs) { PositionInfo epsilon(GlushkovBuildState::POS_EPSILON); epsilon.flags = POS_FLAG_WIRE_EOD | POS_FLAG_WIRE_NL_EOD | POS_FLAG_WIRE_NL_ACCEPT | POS_FLAG_ONLY_ENDS; - m_first.push_back(epsilon); + m_first.emplace_back(epsilon); break; } default: diff --git a/src/parser/ComponentClass.cpp b/src/parser/ComponentClass.cpp index a91ae979f..106c1dab7 100644 --- a/src/parser/ComponentClass.cpp +++ b/src/parser/ComponentClass.cpp @@ -35,7 +35,6 @@ #include "ucp_table.h" #include "Utf8ComponentClass.h" #include "util/charreach.h" -#include "util/make_unique.h" #include @@ -399,9 +398,9 @@ CharReach getPredefinedCharReach(PredefinedClass c, const ParseMode &mode) { unique_ptr getComponentClass(const ParseMode &mode) { if (mode.utf8) { - return ue2::make_unique(mode); + return std::make_unique(mode); } else { - return ue2::make_unique(mode); + return std::make_unique(mode); } } diff --git a/src/parser/ComponentCondReference.cpp b/src/parser/ComponentCondReference.cpp index 2a2ed4e09..b6ff44db9 100644 --- a/src/parser/ComponentCondReference.cpp +++ b/src/parser/ComponentCondReference.cpp @@ -50,7 +50,7 @@ ComponentCondReference::ComponentCondReference(const string &name) : kind(CONDITION_NAME), ref_id(0), ref_name(name), hasBothBranches(false) {} ComponentCondReference::ComponentCondReference(unique_ptr c) - : kind(CONDITION_ASSERTION), ref_id(0), assertion(move(c)), + : kind(CONDITION_ASSERTION), ref_id(0), assertion(std::move(c)), 
hasBothBranches(false) {} ComponentCondReference::~ComponentCondReference() {} diff --git a/src/parser/ComponentRepeat.cpp b/src/parser/ComponentRepeat.cpp index 09f59d05e..7090459f5 100644 --- a/src/parser/ComponentRepeat.cpp +++ b/src/parser/ComponentRepeat.cpp @@ -41,7 +41,6 @@ #include "position_dump.h" #include "position_info.h" #include "ue2common.h" -#include "util/make_unique.h" #include #include @@ -61,7 +60,7 @@ static constexpr u32 MAX_POSITIONS_EXPANDED = 500000; // arbitrarily huge * extent is effectively zero. */ ComponentRepeat::ComponentRepeat(unique_ptr sub_comp_in, u32 min, u32 max, enum RepeatType t) - : type(t), sub_comp(move(sub_comp_in)), m_min(min), m_max(max), + : type(t), sub_comp(std::move(sub_comp_in)), m_min(min), m_max(max), posFirst(GlushkovBuildState::POS_UNINITIALIZED), posLast(GlushkovBuildState::POS_UNINITIALIZED) { assert(sub_comp); @@ -177,7 +176,7 @@ void ComponentRepeat::notePositions(GlushkovBuildState &bs) { // Each optional repeat has an epsilon at the end of its firsts list. for (u32 i = m_min; i < m_firsts.size(); i++) { - m_firsts[i].push_back(GlushkovBuildState::POS_EPSILON); + m_firsts[i].emplace_back(GlushkovBuildState::POS_EPSILON); } } @@ -362,7 +361,7 @@ void ComponentRepeat::postSubNotePositionHook() { unique_ptr makeComponentRepeat(unique_ptr sub_comp, u32 min, u32 max, ComponentRepeat::RepeatType t) { - return ue2::make_unique(move(sub_comp), min, max, t); + return std::make_unique(std::move(sub_comp), min, max, t); } } // namespace ue2 diff --git a/src/parser/ComponentSequence.cpp b/src/parser/ComponentSequence.cpp index b0b5b1393..2b78177b8 100644 --- a/src/parser/ComponentSequence.cpp +++ b/src/parser/ComponentSequence.cpp @@ -43,7 +43,6 @@ #include "position_info.h" #include "nfagraph/ng_builder.h" #include "util/container.h" -#include "util/make_unique.h" #include #include @@ -61,7 +60,7 @@ ComponentSequence::ComponentSequence(const ComponentSequence &other) // Deep copy children. 
for (const auto &c : other.children) { assert(c); - children.push_back(unique_ptr(c->clone())); + children.emplace_back(unique_ptr(c->clone())); } if (other.alternation) { const ComponentAlternation &c = *other.alternation; @@ -117,7 +116,7 @@ void ComponentSequence::accept(ConstComponentVisitor &v) const { } void ComponentSequence::addComponent(unique_ptr comp) { - children.push_back(move(comp)); + children.emplace_back(std::move(comp)); } bool ComponentSequence::addRepeat(u32 min, u32 max, @@ -132,7 +131,7 @@ bool ComponentSequence::addRepeat(u32 min, u32 max, return false; } - children.back() = makeComponentRepeat(move(children.back()), min, max, + children.back() = makeComponentRepeat(std::move(children.back()), min, max, type); assert(children.back()); return true; @@ -140,19 +139,19 @@ bool ComponentSequence::addRepeat(u32 min, u32 max, void ComponentSequence::addAlternation() { if (!alternation) { - alternation = ue2::make_unique(); + alternation = std::make_unique(); } - auto seq = ue2::make_unique(); + auto seq = std::make_unique(); seq->children.swap(children); - alternation->append(move(seq)); + alternation->append(std::move(seq)); } void ComponentSequence::finalize() { if (alternation) { addAlternation(); assert(children.empty()); - children.push_back(move(alternation)); + children.emplace_back(std::move(alternation)); alternation = nullptr; } } @@ -171,7 +170,7 @@ vector ComponentSequence::first() const { if (firsts.empty()) { DEBUG_PRINTF("trivial empty sequence %zu\n", firsts.size()); assert(children.empty()); - firsts.push_back(GlushkovBuildState::POS_EPSILON); + firsts.emplace_back(GlushkovBuildState::POS_EPSILON); } DEBUG_PRINTF("%zu firsts\n", firsts.size()); @@ -202,7 +201,7 @@ void epsilonVisit(vector *info, const vector &f) { continue; } - out.push_back(*it); + out.emplace_back(*it); out.back().flags = flags; seen_flags.insert(flags); } @@ -220,7 +219,7 @@ void applyEpsilonVisits(vector &lasts, for (const auto &last : lasts) { for (const auto &e : eps_visits) { - out.push_back(last); + out.emplace_back(last); out.back().flags |= e.flags; } } diff --git a/src/parser/ComponentWordBoundary.cpp b/src/parser/ComponentWordBoundary.cpp index 168a2aad8..347202a09 100644 --- a/src/parser/ComponentWordBoundary.cpp +++ b/src/parser/ComponentWordBoundary.cpp @@ -55,7 +55,7 @@ ComponentWordBoundary * ComponentWordBoundary::clone() const { vector ComponentWordBoundary::first() const { vector firsts; - firsts.push_back(position); + firsts.emplace_back(position); return firsts; } diff --git a/src/parser/Parser.rl b/src/parser/Parser.rl index 8643aebfc..ba01511a8 100644 --- a/src/parser/Parser.rl +++ b/src/parser/Parser.rl @@ -54,7 +54,6 @@ #include "ue2common.h" #include "util/compare.h" #include "util/flat_containers.h" -#include "util/make_unique.h" #include "util/unicode_def.h" #include "util/verify_types.h" @@ -164,7 +163,7 @@ ComponentSequence *enterSequence(ComponentSequence *parent, assert(child); ComponentSequence *seq = child.get(); - parent->addComponent(move(child)); + parent->addComponent(std::move(child)); return seq; } @@ -176,7 +175,7 @@ void addLiteral(ComponentSequence *currentSeq, char c, const ParseMode &mode) { assert(cc); cc->add(c); cc->finalize(); - currentSeq->addComponent(move(cc)); + currentSeq->addComponent(std::move(cc)); } else { currentSeq->addComponent(getLiteralComponentClass(c, mode.caseless)); } @@ -191,7 +190,7 @@ void addEscaped(ComponentSequence *currentSeq, unichar accum, assert(cc); cc->add(accum); cc->finalize(); - 
currentSeq->addComponent(move(cc)); + currentSeq->addComponent(std::move(cc)); } else { if (accum > 255) { throw LocatedParseError(err_msg); @@ -273,6 +272,7 @@ unichar readUtf8CodePoint4c(const char *s) { %%{ machine regex; + alphtype unsigned char; action throwUnsupportedEscape { ostringstream str; @@ -328,9 +328,9 @@ unichar readUtf8CodePoint4c(const char *s) { # enter a CAPTURING group ( e.g. '(blah)' ) action enterCapturingGroup { PUSH_SEQUENCE; - auto seq = ue2::make_unique(); + auto seq = std::make_unique(); seq->setCaptureIndex(groupIndex++); - currentSeq = enterSequence(currentSeq, move(seq)); + currentSeq = enterSequence(currentSeq, std::move(seq)); } # enter a NAMED CAPTURING group ( e.g. (?'blah) ) @@ -344,10 +344,10 @@ unichar readUtf8CodePoint4c(const char *s) { throw LocatedParseError("Two named subpatterns use the name '" + label + "'"); } PUSH_SEQUENCE; - auto seq = ue2::make_unique(); + auto seq = std::make_unique(); seq->setCaptureIndex(groupIndex++); seq->setCaptureName(label); - currentSeq = enterSequence(currentSeq, move(seq)); + currentSeq = enterSequence(currentSeq, std::move(seq)); } # enter a NON-CAPTURING group where we're modifying flags @@ -357,7 +357,7 @@ unichar readUtf8CodePoint4c(const char *s) { PUSH_SEQUENCE; mode = newMode; currentSeq = - enterSequence(currentSeq, ue2::make_unique()); + enterSequence(currentSeq, std::make_unique()); } action exitGroup { @@ -370,25 +370,25 @@ unichar readUtf8CodePoint4c(const char *s) { action enterZWLookAhead { PUSH_SEQUENCE; currentSeq = enterSequence(currentSeq, - ue2::make_unique(ComponentAssertion::LOOKAHEAD, + std::make_unique(ComponentAssertion::LOOKAHEAD, ComponentAssertion::POS)); } action enterZWNegLookAhead { PUSH_SEQUENCE; currentSeq = enterSequence(currentSeq, - ue2::make_unique(ComponentAssertion::LOOKAHEAD, + std::make_unique(ComponentAssertion::LOOKAHEAD, ComponentAssertion::NEG)); } action enterZWLookBehind { PUSH_SEQUENCE; currentSeq = enterSequence(currentSeq, - ue2::make_unique(ComponentAssertion::LOOKBEHIND, + std::make_unique(ComponentAssertion::LOOKBEHIND, ComponentAssertion::POS)); } action enterZWNegLookBehind { PUSH_SEQUENCE; currentSeq = enterSequence(currentSeq, - ue2::make_unique(ComponentAssertion::LOOKBEHIND, + std::make_unique(ComponentAssertion::LOOKBEHIND, ComponentAssertion::NEG)); } action enterEmbeddedCode { @@ -406,18 +406,18 @@ unichar readUtf8CodePoint4c(const char *s) { } PUSH_SEQUENCE; currentSeq = enterSequence(currentSeq, - ue2::make_unique(accumulator)); + std::make_unique(accumulator)); } action enterNamedConditionalRef { PUSH_SEQUENCE; assert(!label.empty()); currentSeq = enterSequence(currentSeq, - ue2::make_unique(label)); + std::make_unique(label)); } action enterAtomicGroup { PUSH_SEQUENCE; currentSeq = enterSequence(currentSeq, - ue2::make_unique()); + std::make_unique()); } action eatClass { assert(!currentCls); @@ -433,7 +433,7 @@ unichar readUtf8CodePoint4c(const char *s) { } action applyModifiers { mode = newMode; - currentSeq->addComponent(ue2::make_unique()); + currentSeq->addComponent(std::make_unique()); } action modifyMatchPositive { switch (fc) { @@ -481,7 +481,7 @@ unichar readUtf8CodePoint4c(const char *s) { if (accumulator == 0) { throw LocatedParseError("Numbered reference cannot be zero"); } - currentSeq->addComponent(ue2::make_unique(accumulator)); + currentSeq->addComponent(std::make_unique(accumulator)); } action addNegativeNumberedBackRef { @@ -493,11 +493,11 @@ unichar readUtf8CodePoint4c(const char *s) { throw LocatedParseError("Invalid reference"); 
} unsigned idx = groupIndex - accumulator; - currentSeq->addComponent(ue2::make_unique(idx)); + currentSeq->addComponent(std::make_unique(idx)); } action addNamedBackRef { - currentSeq->addComponent(ue2::make_unique(label)); + currentSeq->addComponent(std::make_unique(label)); } escapedOctal0 = '\\0' @clearOctAccumulator [0-7]{0,2} $appendAccumulatorOctDigit; @@ -724,7 +724,7 @@ unichar readUtf8CodePoint4c(const char *s) { ([^^] ${ fhold; fcall readUCP; }) '}' ${ if (!inCharClass) { // not inside [..] currentCls->finalize(); - currentSeq->addComponent(move(currentCls)); + currentSeq->addComponent(std::move(currentCls)); } fret; }) @@ -735,7 +735,7 @@ unichar readUtf8CodePoint4c(const char *s) { currentCls->add(CLASS_UCP_C, negated); if (!inCharClass) { currentCls->finalize(); - currentSeq->addComponent(move(currentCls)); + currentSeq->addComponent(std::move(currentCls)); } fret; }; @@ -743,7 +743,7 @@ unichar readUtf8CodePoint4c(const char *s) { currentCls->add(CLASS_UCP_L, negated); if (!inCharClass) { currentCls->finalize(); - currentSeq->addComponent(move(currentCls)); + currentSeq->addComponent(std::move(currentCls)); } fret; }; @@ -751,7 +751,7 @@ unichar readUtf8CodePoint4c(const char *s) { currentCls->add(CLASS_UCP_M, negated); if (!inCharClass) { currentCls->finalize(); - currentSeq->addComponent(move(currentCls)); + currentSeq->addComponent(std::move(currentCls)); } fret; }; @@ -759,7 +759,7 @@ unichar readUtf8CodePoint4c(const char *s) { currentCls->add(CLASS_UCP_N, negated); if (!inCharClass) { currentCls->finalize(); - currentSeq->addComponent(move(currentCls)); + currentSeq->addComponent(std::move(currentCls)); } fret; }; @@ -767,7 +767,7 @@ unichar readUtf8CodePoint4c(const char *s) { currentCls->add(CLASS_UCP_P, negated); if (!inCharClass) { currentCls->finalize(); - currentSeq->addComponent(move(currentCls)); + currentSeq->addComponent(std::move(currentCls)); } fret; }; @@ -775,7 +775,7 @@ unichar readUtf8CodePoint4c(const char *s) { currentCls->add(CLASS_UCP_S, negated); if (!inCharClass) { currentCls->finalize(); - currentSeq->addComponent(move(currentCls)); + currentSeq->addComponent(std::move(currentCls)); } fret; }; @@ -783,7 +783,7 @@ unichar readUtf8CodePoint4c(const char *s) { currentCls->add(CLASS_UCP_Z, negated); if (!inCharClass) { currentCls->finalize(); - currentSeq->addComponent(move(currentCls)); + currentSeq->addComponent(std::move(currentCls)); } fret; }; @@ -1106,7 +1106,7 @@ unichar readUtf8CodePoint4c(const char *s) { ']' => { currentCls->finalize(); - currentSeq->addComponent(move(currentCls)); + currentSeq->addComponent(std::move(currentCls)); inCharClass = false; fgoto main; }; @@ -1163,7 +1163,7 @@ unichar readUtf8CodePoint4c(const char *s) { auto cc = getComponentClass(mode); cc->add(readUtf8CodePoint2c(ts)); cc->finalize(); - currentSeq->addComponent(move(cc)); + currentSeq->addComponent(std::move(cc)); }; utf8_3c when is_utf8 => { @@ -1172,7 +1172,7 @@ unichar readUtf8CodePoint4c(const char *s) { auto cc = getComponentClass(mode); cc->add(readUtf8CodePoint3c(ts)); cc->finalize(); - currentSeq->addComponent(move(cc)); + currentSeq->addComponent(std::move(cc)); }; utf8_4c when is_utf8 => { @@ -1181,7 +1181,7 @@ unichar readUtf8CodePoint4c(const char *s) { auto cc = getComponentClass(mode); cc->add(readUtf8CodePoint4c(ts)); cc->finalize(); - currentSeq->addComponent(move(cc)); + currentSeq->addComponent(std::move(cc)); }; hi_byte when is_utf8 => { @@ -1305,7 +1305,7 @@ unichar readUtf8CodePoint4c(const char *s) { if (mode.utf8) { throw 
LocatedParseError("\\C is unsupported in UTF8"); } - currentSeq->addComponent(ue2::make_unique()); + currentSeq->addComponent(std::make_unique()); }; # Match 0 or more times (greedy) '\*' => { @@ -1422,39 +1422,39 @@ unichar readUtf8CodePoint4c(const char *s) { '\^' => { auto bound = mode.multiline ? ComponentBoundary::BEGIN_LINE : ComponentBoundary::BEGIN_STRING; - currentSeq->addComponent(ue2::make_unique(bound)); + currentSeq->addComponent(std::make_unique(bound)); }; # End of data (with optional internal newline); also before # internal newline in multiline mode '\$' => { auto bound = mode.multiline ? ComponentBoundary::END_LINE : ComponentBoundary::END_STRING_OPTIONAL_LF; - currentSeq->addComponent(ue2::make_unique(bound)); + currentSeq->addComponent(std::make_unique(bound)); }; # Beginning of data '\\A' => { auto bound = ComponentBoundary::BEGIN_STRING; - currentSeq->addComponent(ue2::make_unique(bound)); + currentSeq->addComponent(std::make_unique(bound)); }; # End of data (with optional internal newline) '\\Z' => { auto bound = ComponentBoundary::END_STRING_OPTIONAL_LF; - currentSeq->addComponent(ue2::make_unique(bound)); + currentSeq->addComponent(std::make_unique(bound)); }; # End of data '\\z' => { auto bound = ComponentBoundary::END_STRING; - currentSeq->addComponent(ue2::make_unique(bound)); + currentSeq->addComponent(std::make_unique(bound)); }; # Word boundary '\\b' => { currentSeq->addComponent( - ue2::make_unique(ts - ptr, false, mode)); + std::make_unique(ts - ptr, false, mode)); }; # Non-word boundary '\\B' => { currentSeq->addComponent( - ue2::make_unique(ts - ptr, true, mode)); + std::make_unique(ts - ptr, true, mode)); }; ############################################################# @@ -1494,7 +1494,7 @@ unichar readUtf8CodePoint4c(const char *s) { // a back reference accumulator = parseAsDecimal(octAccumulator); if (accumulator < groupIndex) { - currentSeq->addComponent(ue2::make_unique(accumulator)); + currentSeq->addComponent(std::make_unique(accumulator)); } else { addEscapedOctal(currentSeq, octAccumulator, mode); } @@ -1509,7 +1509,7 @@ unichar readUtf8CodePoint4c(const char *s) { '\\' backRefId => { // if there are enough left parens to this point, back ref if (accumulator < groupIndex) { - currentSeq->addComponent(ue2::make_unique(accumulator)); + currentSeq->addComponent(std::make_unique(accumulator)); } else { // Otherwise, we interpret the first three digits as an // octal escape, and the remaining characters stand for @@ -1618,52 +1618,52 @@ unichar readUtf8CodePoint4c(const char *s) { # Word character '\\w' => { auto cc = generateComponent(CLASS_WORD, false, mode); - currentSeq->addComponent(move(cc)); + currentSeq->addComponent(std::move(cc)); }; # Non word character '\\W' => { auto cc = generateComponent(CLASS_WORD, true, mode); - currentSeq->addComponent(move(cc)); + currentSeq->addComponent(std::move(cc)); }; # Whitespace character '\\s' => { auto cc = generateComponent(CLASS_SPACE, false, mode); - currentSeq->addComponent(move(cc)); + currentSeq->addComponent(std::move(cc)); }; # Non whitespace character '\\S' => { auto cc = generateComponent(CLASS_SPACE, true, mode); - currentSeq->addComponent(move(cc)); + currentSeq->addComponent(std::move(cc)); }; # Digit character '\\d' => { auto cc = generateComponent(CLASS_DIGIT, false, mode); - currentSeq->addComponent(move(cc)); + currentSeq->addComponent(std::move(cc)); }; # Non digit character '\\D' => { auto cc = generateComponent(CLASS_DIGIT, true, mode); - currentSeq->addComponent(move(cc)); + 
currentSeq->addComponent(std::move(cc)); }; # Horizontal whitespace '\\h' => { auto cc = generateComponent(CLASS_HORZ, false, mode); - currentSeq->addComponent(move(cc)); + currentSeq->addComponent(std::move(cc)); }; # Not horizontal whitespace '\\H' => { auto cc = generateComponent(CLASS_HORZ, true, mode); - currentSeq->addComponent(move(cc)); + currentSeq->addComponent(std::move(cc)); }; # Vertical whitespace '\\v' => { auto cc = generateComponent(CLASS_VERT, false, mode); - currentSeq->addComponent(move(cc)); + currentSeq->addComponent(std::move(cc)); }; # Not vertical whitespace '\\V' => { auto cc = generateComponent(CLASS_VERT, true, mode); - currentSeq->addComponent(move(cc)); + currentSeq->addComponent(std::move(cc)); }; '\\p{' => { @@ -1731,7 +1731,7 @@ unichar readUtf8CodePoint4c(const char *s) { }; '\\X' => { - currentSeq->addComponent(ue2::make_unique(ts - ptr, mode)); + currentSeq->addComponent(std::make_unique(ts - ptr, mode)); }; # Fall through general escaped character @@ -1782,45 +1782,45 @@ unichar readUtf8CodePoint4c(const char *s) { # Conditional reference with a positive lookahead assertion '(?(?=' => { - auto a = ue2::make_unique( + auto a = std::make_unique( ComponentAssertion::LOOKAHEAD, ComponentAssertion::POS); ComponentAssertion *a_seq = a.get(); PUSH_SEQUENCE; currentSeq = enterSequence(currentSeq, - ue2::make_unique(move(a))); + std::make_unique(std::move(a))); PUSH_SEQUENCE; currentSeq = a_seq; }; # Conditional reference with a negative lookahead assertion '(?(?!' => { - auto a = ue2::make_unique( + auto a = std::make_unique( ComponentAssertion::LOOKAHEAD, ComponentAssertion::NEG); ComponentAssertion *a_seq = a.get(); PUSH_SEQUENCE; currentSeq = enterSequence(currentSeq, - ue2::make_unique(move(a))); + std::make_unique(std::move(a))); PUSH_SEQUENCE; currentSeq = a_seq; }; # Conditional reference with a positive lookbehind assertion '(?(?<=' => { - auto a = ue2::make_unique( + auto a = std::make_unique( ComponentAssertion::LOOKBEHIND, ComponentAssertion::POS); ComponentAssertion *a_seq = a.get(); PUSH_SEQUENCE; currentSeq = enterSequence(currentSeq, - ue2::make_unique(move(a))); + std::make_unique(std::move(a))); PUSH_SEQUENCE; currentSeq = a_seq; }; # Conditional reference with a negative lookbehind assertion '(?(? { - auto a = ue2::make_unique( + auto a = std::make_unique( ComponentAssertion::LOOKBEHIND, ComponentAssertion::NEG); ComponentAssertion *a_seq = a.get(); PUSH_SEQUENCE; currentSeq = enterSequence(currentSeq, - ue2::make_unique(move(a))); + std::make_unique(std::move(a))); PUSH_SEQUENCE; currentSeq = a_seq; }; @@ -1861,7 +1861,7 @@ unichar readUtf8CodePoint4c(const char *s) { auto cc = getComponentClass(mode); cc->add(readUtf8CodePoint2c(ts)); cc->finalize(); - currentSeq->addComponent(move(cc)); + currentSeq->addComponent(std::move(cc)); }; utf8_3c when is_utf8 => { @@ -1870,7 +1870,7 @@ unichar readUtf8CodePoint4c(const char *s) { auto cc = getComponentClass(mode); cc->add(readUtf8CodePoint3c(ts)); cc->finalize(); - currentSeq->addComponent(move(cc)); + currentSeq->addComponent(std::move(cc)); }; utf8_4c when is_utf8 => { @@ -1879,7 +1879,7 @@ unichar readUtf8CodePoint4c(const char *s) { auto cc = getComponentClass(mode); cc->add(readUtf8CodePoint4c(ts)); cc->finalize(); - currentSeq->addComponent(move(cc)); + currentSeq->addComponent(std::move(cc)); }; hi_byte when is_utf8 => { @@ -1953,7 +1953,7 @@ unique_ptr parse(const char *ptr, ParseMode &globalMode) { flat_set groupNames; // Root sequence. 
- unique_ptr rootSeq = ue2::make_unique(); + unique_ptr rootSeq = std::make_unique(); rootSeq->setCaptureIndex(0); // Current sequence being appended to @@ -2024,7 +2024,7 @@ unique_ptr parse(const char *ptr, ParseMode &globalMode) { // Ensure that all references are valid. checkReferences(*rootSeq, groupIndex, groupNames); - return move(rootSeq); + return std::move(rootSeq); } catch (LocatedParseError &error) { if (ts >= ptr && ts <= pe) { error.locate(ts - ptr); diff --git a/src/parser/Utf8ComponentClass.cpp b/src/parser/Utf8ComponentClass.cpp index cdfc974ac..867bb6ef4 100644 --- a/src/parser/Utf8ComponentClass.cpp +++ b/src/parser/Utf8ComponentClass.cpp @@ -1145,20 +1145,20 @@ void UTF8ComponentClass::buildFollowSet(GlushkovBuildState &, vector UTF8ComponentClass::first(void) const { vector rv; if (single_pos != GlushkovBuildState::POS_UNINITIALIZED) { - rv.push_back(single_pos); + rv.emplace_back(single_pos); } if (two_char_dot_head != GlushkovBuildState::POS_UNINITIALIZED) { - rv.push_back(two_char_dot_head); + rv.emplace_back(two_char_dot_head); } if (three_char_dot_head != GlushkovBuildState::POS_UNINITIALIZED) { - rv.push_back(three_char_dot_head); + rv.emplace_back(three_char_dot_head); } if (four_char_dot_head != GlushkovBuildState::POS_UNINITIALIZED) { - rv.push_back(four_char_dot_head); + rv.emplace_back(four_char_dot_head); } for (auto it = heads.begin(); it != heads.end(); ++it) { - rv.push_back(it->second); + rv.emplace_back(it->second); } return rv; } diff --git a/src/parser/buildstate.cpp b/src/parser/buildstate.cpp index 75cfbb7b2..96f91cb6c 100644 --- a/src/parser/buildstate.cpp +++ b/src/parser/buildstate.cpp @@ -41,7 +41,6 @@ #include "util/container.h" #include "util/flat_containers.h" #include "util/hash.h" -#include "util/make_unique.h" #include "util/unordered.h" #include @@ -155,9 +154,9 @@ GlushkovBuildStateImpl::GlushkovBuildStateImpl(NFABuilder &b, vector lasts, firsts; // start->startDs and startDs self-loop. - lasts.push_back(startState); - lasts.push_back(startDotstarState); - firsts.push_back(startDotstarState); + lasts.emplace_back(startState); + lasts.emplace_back(startDotstarState); + firsts.emplace_back(startDotstarState); connectRegions(lasts, firsts); // accept to acceptEod edges already wired @@ -255,7 +254,7 @@ void generateAccepts(GlushkovBuildStateImpl &bs, const PositionInfo &from, bool require_accept = !(flags & POS_FLAG_ONLY_ENDS); if (require_eod) { - tolist->push_back(bs.acceptEodState); + tolist->emplace_back(bs.acceptEodState); } if (require_nl_accept) { @@ -264,7 +263,7 @@ void generateAccepts(GlushkovBuildStateImpl &bs, const PositionInfo &from, bs.addSuccessor(newline, builder.getAccept()); bs.acceptNlState = newline; } - tolist->push_back(bs.acceptNlState); + tolist->emplace_back(bs.acceptNlState); } if (require_nl_eod) { @@ -273,11 +272,11 @@ void generateAccepts(GlushkovBuildStateImpl &bs, const PositionInfo &from, bs.addSuccessor(newline, builder.getAcceptEOD()); bs.acceptNlEodState = newline; } - tolist->push_back(bs.acceptNlEodState); + tolist->emplace_back(bs.acceptNlEodState); } if (require_accept) { - tolist->push_back(bs.acceptState); + tolist->emplace_back(bs.acceptState); } } @@ -441,7 +440,7 @@ void GlushkovBuildStateImpl::buildEdges() { // Construct a usable GlushkovBuildState for the outside world. 
unique_ptr makeGlushkovBuildState(NFABuilder &b, bool prefilter) { - return ue2::make_unique(b, prefilter); + return std::make_unique(b, prefilter); } // free functions for utility use @@ -458,7 +457,7 @@ void cleanupPositions(vector &a) { for (const auto &p : a) { if (seen.emplace(p.pos, p.flags).second) { - out.push_back(p); // first encounter + out.emplace_back(p); // first encounter } } diff --git a/src/parser/control_verbs.rl b/src/parser/control_verbs.rl index 1d3e33a9a..09b0bfd7b 100644 --- a/src/parser/control_verbs.rl +++ b/src/parser/control_verbs.rl @@ -54,6 +54,7 @@ const char *read_control_verbs(const char *ptr, const char *end, size_t start, %%{ machine ControlVerbs; + alphtype unsigned char; # Verbs that we recognise but do not support. unhandledVerbs = '(*' ( diff --git a/src/parser/logical_combination.cpp b/src/parser/logical_combination.cpp index 49e060c98..b75ca34fc 100644 --- a/src/parser/logical_combination.cpp +++ b/src/parser/logical_combination.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019, Intel Corporation + * Copyright (c) 2018-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,6 +33,7 @@ #include "parser/parse_error.h" #include "util/container.h" #include "hs_compile.h" +#include "allocator.h" #include @@ -91,7 +92,7 @@ u32 ParsedLogical::logicalTreeAdd(u32 op, u32 left, u32 right) { lop.op = op; lop.lo = left; lop.ro = right; - logicalTree.push_back(lop); + logicalTree.emplace_back(lop); return lop.id; } @@ -106,7 +107,7 @@ void ParsedLogical::combinationInfoAdd(UNUSED u32 ckey, u32 id, u32 ekey, ci.result = lkey_result; ci.min_offset = min_offset; ci.max_offset = max_offset; - combInfoMap.push_back(ci); + combInfoMap.emplace_back(ci); DEBUG_PRINTF("ckey %u (id %u) -> lkey %u..%u, ekey=0x%x\n", ckey, ci.id, ci.start, ci.result, ci.ekey); @@ -139,7 +140,8 @@ void ParsedLogical::validateSubIDs(const unsigned *ids, } hs_compile_error_t *compile_err = NULL; hs_expr_info_t *info = NULL; - hs_error_t err = hs_expression_info(expressions[i], flags[i], &info, + hs_error_t err = hs_expression_info(expressions[i], + flags ? 
flags[i] : 0, &info, &compile_err); if (err != HS_SUCCESS) { hs_free_compile_error(compile_err); @@ -151,7 +153,7 @@ void ParsedLogical::validateSubIDs(const unsigned *ids, if (info->unordered_matches) { throw CompileError("Have unordered match in sub-expressions."); } - free(info); + hs_misc_free(info); } } } @@ -250,7 +252,7 @@ void popOperator(vector &op_stack, vector &subid_stack, left = subid_stack.back(); subid_stack.pop_back(); } - subid_stack.push_back(pl.logicalTreeAdd(op_stack.back().op, left, right)); + subid_stack.emplace_back(pl.logicalTreeAdd(op_stack.back().op, left, right)); op_stack.pop_back(); } @@ -273,7 +275,7 @@ void ParsedLogical::parseLogicalCombination(unsigned id, const char *logical, } } else { if ((subid = fetchSubID(logical, digit, i)) != (u32)-1) { - subid_stack.push_back(getLogicalKey(subid)); + subid_stack.emplace_back(getLogicalKey(subid)); addRelateCKey(subid_stack.back(), ckey); } if (logical[i] == ' ') { // skip whitespace @@ -297,7 +299,7 @@ void ParsedLogical::parseLogicalCombination(unsigned id, const char *logical, lkey_start = subid_stack.back(); } } - op_stack.push_back(op); + op_stack.emplace_back(op); } else { throw LocatedParseError("Unknown character"); } @@ -308,7 +310,7 @@ void ParsedLogical::parseLogicalCombination(unsigned id, const char *logical, throw LocatedParseError("Not enough right parentheses"); } if ((subid = fetchSubID(logical, digit, i)) != (u32)-1) { - subid_stack.push_back(getLogicalKey(subid)); + subid_stack.emplace_back(getLogicalKey(subid)); addRelateCKey(subid_stack.back(), ckey); } while (!op_stack.empty()) { diff --git a/src/parser/utf8_validate.cpp b/src/parser/utf8_validate.cpp index 50aa06d8e..54c9755e8 100644 --- a/src/parser/utf8_validate.cpp +++ b/src/parser/utf8_validate.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2022, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -72,7 +72,7 @@ bool isValidUtf8(const char *expression, const size_t len) { while (i < len) { DEBUG_PRINTF("byte %zu: 0x%02x\n", i, s[i]); // One octet. - if (s[i] < 0x7f) { + if (s[i] <= 0x7f) { DEBUG_PRINTF("one octet\n"); i++; continue; diff --git a/src/rose/counting_miracle.h b/src/rose/counting_miracle.h index 976208b73..602907cb8 100644 --- a/src/rose/counting_miracle.h +++ b/src/rose/counting_miracle.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -39,6 +40,10 @@ * stop character. 
*/ #define COUNTING_MIRACLE_LEN_MAX 256 +#ifdef HAVE_SVE2 +#include "counting_miracle_sve.h" +#else + static really_inline char roseCountingMiracleScan(u8 c, const u8 *d, const u8 *d_end, u32 target_count, u32 *count_inout, @@ -47,7 +52,7 @@ char roseCountingMiracleScan(u8 c, const u8 *d, const u8 *d_end, u32 count = *count_inout; - m128 chars = set16x8(c); + m128 chars = set1_16x8(c); for (; d + 16 <= d_end; d_end -= 16) { m128 data = loadu128(d_end - 16); @@ -81,6 +86,12 @@ char roseCountingMiracleScan(u8 c, const u8 *d, const u8 *d_end, return 0; } +#endif + +#ifdef HAVE_SVE +#include "counting_miracle_shufti_sve.h" +#else + #define GET_LO_4(chars) and128(chars, low4bits) #define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4) @@ -94,7 +105,7 @@ u32 roseCountingMiracleScanShufti(m128 mask_lo, m128 mask_hi, u8 poison, u32 count = *count_inout; const m128 zeroes = zeroes128(); - const m128 low4bits = _mm_set1_epi8(0xf); + const m128 low4bits = set1_16x8(0xf); for (; d + 16 <= d_end; d_end -= 16) { m128 data = loadu128(d_end - 16); @@ -134,6 +145,8 @@ u32 roseCountingMiracleScanShufti(m128 mask_lo, m128 mask_hi, u8 poison, return 0; } +#endif + /** * \brief "Counting Miracle" scan: If we see more than N instances of a * particular character class we know that the engine must be dead. @@ -213,8 +226,13 @@ int roseCountingMiracleOccurs(const struct RoseEngine *t, } } } else { +#ifdef HAVE_SVE + svuint8_t lo = getSVEMaskFrom128(cm->lo); + svuint8_t hi = getSVEMaskFrom128(cm->hi); +#else m128 lo = cm->lo; m128 hi = cm->hi; +#endif u8 poison = cm->poison; // Scan buffer. diff --git a/src/rose/counting_miracle_shufti_sve.h b/src/rose/counting_miracle_shufti_sve.h new file mode 100644 index 000000000..26991a82f --- /dev/null +++ b/src/rose/counting_miracle_shufti_sve.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2021, Arm Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +static really_inline +size_t countShuftiMatches(svuint8_t mask_lo, svuint8_t mask_hi, + const svbool_t pg, const u8 *buf) { + svuint8_t vec = svld1_u8(pg, buf); + svuint8_t c_lo = svtbl(mask_lo, svand_z(svptrue_b8(), vec, (uint8_t)0xf)); + svuint8_t c_hi = svtbl(mask_hi, svlsr_z(svptrue_b8(), vec, 4)); + svuint8_t t = svand_z(svptrue_b8(), c_lo, c_hi); + return svcntp_b8(svptrue_b8(), svcmpne(pg, t, (uint8_t)0)); +} + +static really_inline +bool countShuftiLoopBody(svuint8_t mask_lo, svuint8_t mask_hi, + const svbool_t pg, const u8 *d, u32 target_count, + u32 *count_inout, const u8 **d_out) { + *count_inout += countShuftiMatches(mask_lo, mask_hi, pg, d); + if (*count_inout >= target_count) { + *d_out = d; + return true; + } + return false; +} + +static really_inline +bool countShuftiOnce(svuint8_t mask_lo, svuint8_t mask_hi, + const u8 *d, const u8 *d_end, u32 target_count, + u32 *count_inout, const u8 **d_out) { + svbool_t pg = svwhilelt_b8_s64(0, d_end - d); + return countShuftiLoopBody(mask_lo, mask_hi, pg, d, target_count, + count_inout, d_out); +} + +static really_inline +bool roseCountingMiracleScanShufti(svuint8_t mask_lo, svuint8_t mask_hi, + UNUSED u8 poison, const u8 *d, + const u8 *d_end, u32 target_count, + u32 *count_inout, const u8 **d_out) { + assert(d <= d_end); + size_t len = d_end - d; + if (len <= svcntb()) { + char rv = countShuftiOnce(mask_lo, mask_hi, d, d_end, target_count, + count_inout, d_out); + return rv; + } + // peel off first part to align to the vector size + const u8 *aligned_d_end = ROUNDDOWN_PTR(d_end, svcntb_pat(SV_POW2)); + assert(d < aligned_d_end); + if (d_end != aligned_d_end) { + if (countShuftiOnce(mask_lo, mask_hi, aligned_d_end, d_end, + target_count, count_inout, d_out)) return true; + d_end = aligned_d_end; + } + size_t loops = (d_end - d) / svcntb(); + for (size_t i = 0; i < loops; i++) { + d_end -= svcntb(); + if (countShuftiLoopBody(mask_lo, mask_hi, svptrue_b8(), d_end, + target_count, count_inout, d_out)) return true; + } + if (d != d_end) { + if (countShuftiOnce(mask_lo, mask_hi, d, d_end, + target_count, count_inout, d_out)) return true; + } + return false; +} \ No newline at end of file diff --git a/src/rose/counting_miracle_sve.h b/src/rose/counting_miracle_sve.h new file mode 100644 index 000000000..8a7114f29 --- /dev/null +++ b/src/rose/counting_miracle_sve.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2021, Arm Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +static really_inline +size_t countMatches(svuint8_t chars, svbool_t pg, const u8 *buf) { + svuint8_t vec = svld1_u8(pg, buf); + return svcntp_b8(svptrue_b8(), svmatch(pg, vec, chars)); +} + +static really_inline +bool countLoopBody(svuint8_t chars, svbool_t pg, const u8 *d, + u32 target_count, u32 *count_inout, const u8 **d_out) { + *count_inout += countMatches(chars, pg, d); + if (*count_inout >= target_count) { + *d_out = d; + return true; + } + return false; +} + +static really_inline +bool countOnce(svuint8_t chars, const u8 *d, const u8 *d_end, + u32 target_count, u32 *count_inout, const u8 **d_out) { + assert(d <= d_end); + svbool_t pg = svwhilelt_b8_s64(0, d_end - d); + return countLoopBody(chars, pg, d, target_count, count_inout, d_out); +} + +static really_inline +bool roseCountingMiracleScan(u8 c, const u8 *d, const u8 *d_end, + u32 target_count, u32 *count_inout, + const u8 **d_out) { + assert(d <= d_end); + svuint8_t chars = svdup_u8(c); + size_t len = d_end - d; + if (len <= svcntb()) { + bool rv = countOnce(chars, d, d_end, target_count, count_inout, d_out); + return rv; + } + // peel off first part to align to the vector size + const u8 *aligned_d_end = ROUNDDOWN_PTR(d_end, svcntb_pat(SV_POW2)); + assert(d < aligned_d_end); + if (d_end != aligned_d_end) { + if (countOnce(chars, aligned_d_end, d_end, + target_count, count_inout, d_out)) return true; + d_end = aligned_d_end; + } + size_t loops = (d_end - d) / svcntb(); + for (size_t i = 0; i < loops; i++) { + d_end -= svcntb(); + if (countLoopBody(chars, svptrue_b8(), d_end, + target_count, count_inout, d_out)) return true; + } + if (d != d_end) { + if (countOnce(chars, d, d_end, + target_count, count_inout, d_out)) return true; + } + return false; +} \ No newline at end of file diff --git a/src/rose/program_runtime.c b/src/rose/program_runtime.c index 0f2d1083b..87dc0c4d9 100644 --- a/src/rose/program_runtime.c +++ b/src/rose/program_runtime.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2019, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -767,10 +767,10 @@ int roseCheckMask32(const struct core_info *ci, const u8 *and_mask, c_shift = c_len - ci->len; c_len = ci->len; } - copy_upto_32_bytes((u8 *)&data - offset, ci->buf, c_len); + copy_upto_64_bytes((u8 *)&data - offset, ci->buf, c_len); } assert(h_shift + h_len + c_len + c_shift == 32); - copy_upto_32_bytes((u8 *)&data + h_shift, ci->hbuf + h_offset, h_len); + copy_upto_64_bytes((u8 *)&data + h_shift, ci->hbuf + h_offset, h_len); } else { if (offset + 32 > (s64a)ci->len) { if (offset >= (s64a)ci->len) { @@ -779,7 +779,7 @@ int roseCheckMask32(const struct core_info *ci, const u8 *and_mask, } c_len = ci->len - offset; c_shift = 32 - c_len; - copy_upto_32_bytes((u8 *)&data, ci->buf + offset, c_len); + copy_upto_64_bytes((u8 *)&data, ci->buf + offset, c_len); } else { data = 
loadu256(ci->buf + offset); } @@ -800,12 +800,90 @@ int roseCheckMask32(const struct core_info *ci, const u8 *and_mask, return 0; } -// get 128/256 bits data from history and current buffer. +#ifdef HAVE_AVX512 +static rose_inline +int roseCheckMask64(const struct core_info *ci, const u8 *and_mask, + const u8 *cmp_mask, const u64a neg_mask, + s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + m512 data = zeroes512(); // consists of the following four parts. + s32 c_shift = 0; // blank bytes after current. + s32 h_shift = 0; // blank bytes before history. + s32 h_len = 64; // number of bytes from history buffer. + s32 c_len = 0; // number of bytes from current buffer. + /* h_shift + h_len + c_len + c_shift = 64 need to be hold.*/ + + if (offset < 0) { + s32 h_offset = 0; // the start offset in history buffer. + if (offset < -(s64a)ci->hlen) { + if (offset + 64 <= -(s64a)ci->hlen) { + DEBUG_PRINTF("all before history\n"); + return 1; + } + h_shift = -(offset + (s64a)ci->hlen); + h_len = 64 - h_shift; + } else { + h_offset = ci->hlen + offset; + } + if (offset + 64 > 0) { + // part in current buffer. + c_len = offset + 64; + h_len = -(offset + h_shift); + if (c_len > (s64a)ci->len) { + // out of current buffer. + c_shift = c_len - ci->len; + c_len = ci->len; + } + copy_upto_64_bytes((u8 *)&data - offset, ci->buf, c_len); + } + assert(h_shift + h_len + c_len + c_shift == 64); + copy_upto_64_bytes((u8 *)&data + h_shift, ci->hbuf + h_offset, h_len); + } else { + if (offset + 64 > (s64a)ci->len) { + if (offset >= (s64a)ci->len) { + DEBUG_PRINTF("all in the future.\n"); + return 1; + } + c_len = ci->len - offset; + c_shift = 64 - c_len; + copy_upto_64_bytes((u8 *)&data, ci->buf + offset, c_len); + } else { + data = loadu512(ci->buf + offset); + } + } + DEBUG_PRINTF("h_shift %d c_shift %d\n", h_shift, c_shift); + DEBUG_PRINTF("h_len %d c_len %d\n", h_len, c_len); + // we use valid_data_mask to blind bytes before history/in the future. + u64a valid_data_mask; + valid_data_mask = (~0ULL) << (h_shift + c_shift) >> (c_shift); + + m512 and_mask_m512 = loadu512(and_mask); + m512 cmp_mask_m512 = loadu512(cmp_mask); + + if (validateMask64(data, valid_data_mask, and_mask_m512, + cmp_mask_m512, neg_mask)) { + DEBUG_PRINTF("Mask64 passed\n"); + return 1; + } + return 0; +} +#endif + +// get 128/256/512 bits data from history and current buffer. // return data and valid_data_mask. static rose_inline -u32 getBufferDataComplex(const struct core_info *ci, const s64a loc, +u64a getBufferDataComplex(const struct core_info *ci, const s64a loc, u8 *data, const u32 data_len) { - assert(data_len == 16 || data_len == 32); + assert(data_len == 16 || data_len == 32 || data_len == 64); s32 c_shift = 0; // blank bytes after current. s32 h_shift = 0; // blank bytes before history. s32 h_len = data_len; // number of bytes from history buffer. 
@@ -831,10 +909,10 @@ u32 getBufferDataComplex(const struct core_info *ci, const s64a loc, c_shift = c_len - ci->len; c_len = ci->len; } - copy_upto_32_bytes(data - loc, ci->buf, c_len); + copy_upto_64_bytes(data - loc, ci->buf, c_len); } assert(h_shift + h_len + c_len + c_shift == (s32)data_len); - copy_upto_32_bytes(data + h_shift, ci->hbuf + h_offset, h_len); + copy_upto_64_bytes(data + h_shift, ci->hbuf + h_offset, h_len); } else { if (loc + data_len > (s64a)ci->len) { if (loc >= (s64a)ci->len) { @@ -843,8 +921,14 @@ u32 getBufferDataComplex(const struct core_info *ci, const s64a loc, } c_len = ci->len - loc; c_shift = data_len - c_len; - copy_upto_32_bytes(data, ci->buf + loc, c_len); + copy_upto_64_bytes(data, ci->buf + loc, c_len); } else { +#ifdef HAVE_AVX512 + if (data_len == 64) { + storeu512(data, loadu512(ci->buf + loc)); + return ~0ULL; + } +#endif if (data_len == 16) { storeu128(data, loadu128(ci->buf + loc)); return 0xffff; @@ -857,6 +941,11 @@ u32 getBufferDataComplex(const struct core_info *ci, const s64a loc, DEBUG_PRINTF("h_shift %d c_shift %d\n", h_shift, c_shift); DEBUG_PRINTF("h_len %d c_len %d\n", h_len, c_len); +#ifdef HAVE_AVX512 + if (data_len == 64) { + return (~0ULL) << (h_shift + c_shift) >> c_shift; + } +#endif if (data_len == 16) { return (u16)(0xffff << (h_shift + c_shift)) >> c_shift; } else { @@ -870,7 +959,7 @@ m128 getData128(const struct core_info *ci, s64a offset, u32 *valid_data_mask) { *valid_data_mask = 0xffff; return loadu128(ci->buf + offset); } - ALIGN_DIRECTIVE u8 data[sizeof(m128)]; + ALIGN_DIRECTIVE u8 data[sizeof(m128)] = { 0 }; *valid_data_mask = getBufferDataComplex(ci, offset, data, 16); return *(m128 *)data; } @@ -886,6 +975,19 @@ m256 getData256(const struct core_info *ci, s64a offset, u32 *valid_data_mask) { return *(m256 *)data; } +#ifdef HAVE_AVX512 +static rose_inline +m512 getData512(const struct core_info *ci, s64a offset, u64a *valid_data_mask) { + if (offset > 0 && offset + sizeof(m512) <= ci->len) { + *valid_data_mask = ~0ULL; + return loadu512(ci->buf + offset); + } + ALIGN_CL_DIRECTIVE u8 data[sizeof(m512)]; + *valid_data_mask = getBufferDataComplex(ci, offset, data, 64); + return *(m512 *)data; +} +#endif + static rose_inline int roseCheckShufti16x8(const struct core_info *ci, const u8 *nib_mask, const u8 *bucket_select_mask, u32 neg_mask, @@ -938,7 +1040,7 @@ int roseCheckShufti16x16(const struct core_info *ci, const u8 *hi_mask, return 1; } - m256 data_m256 = set2x128(data); + m256 data_m256 = set1_2x128(data); m256 hi_mask_m256 = loadu256(hi_mask); m256 lo_mask_m256 = loadu256(lo_mask); m256 bucket_select_mask_m256 = loadu256(bucket_select_mask); @@ -974,8 +1076,8 @@ int roseCheckShufti32x8(const struct core_info *ci, const u8 *hi_mask, m128 hi_mask_m128 = loadu128(hi_mask); m128 lo_mask_m128 = loadu128(lo_mask); - m256 hi_mask_m256 = set2x128(hi_mask_m128); - m256 lo_mask_m256 = set2x128(lo_mask_m128); + m256 hi_mask_m256 = set1_2x128(hi_mask_m128); + m256 lo_mask_m256 = set1_2x128(lo_mask_m128); m256 bucket_select_mask_m256 = loadu256(bucket_select_mask); if (validateShuftiMask32x8(data, hi_mask_m256, lo_mask_m256, bucket_select_mask_m256, @@ -1025,6 +1127,83 @@ int roseCheckShufti32x16(const struct core_info *ci, const u8 *hi_mask, } } +#ifdef HAVE_AVX512 +static rose_inline +int roseCheckShufti64x8(const struct core_info *ci, const u8 *hi_mask, + const u8 *lo_mask, const u8 *bucket_select_mask, + u64a neg_mask, s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = 
base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u64a valid_data_mask = 0; + m512 data = getData512(ci, offset, &valid_data_mask); + + if (unlikely(!valid_data_mask)) { + return 1; + } + + m512 hi_mask_m512 = loadu512(hi_mask); + m512 lo_mask_m512 = loadu512(lo_mask); + m512 bucket_select_mask_m512 = loadu512(bucket_select_mask); + if (validateShuftiMask64x8(data, hi_mask_m512, lo_mask_m512, + bucket_select_mask_m512, + neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check shufti 64x8 successfully\n"); + return 1; + } else { + return 0; + } +} + +static rose_inline +int roseCheckShufti64x16(const struct core_info *ci, const u8 *hi_mask_1, + const u8 *hi_mask_2, const u8 *lo_mask_1, + const u8 *lo_mask_2, const u8 *bucket_select_mask_hi, + const u8 *bucket_select_mask_lo, u64a neg_mask, + s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u64a valid_data_mask = 0; + m512 data = getData512(ci, offset, &valid_data_mask); + if (unlikely(!valid_data_mask)) { + return 1; + } + + m512 hi_mask_1_m512 = loadu512(hi_mask_1); + m512 hi_mask_2_m512 = loadu512(hi_mask_2); + m512 lo_mask_1_m512 = loadu512(lo_mask_1); + m512 lo_mask_2_m512 = loadu512(lo_mask_2); + + m512 bucket_select_mask_hi_m512 = loadu512(bucket_select_mask_hi); + m512 bucket_select_mask_lo_m512 = loadu512(bucket_select_mask_lo); + if (validateShuftiMask64x16(data, hi_mask_1_m512, hi_mask_2_m512, + lo_mask_1_m512, lo_mask_2_m512, + bucket_select_mask_hi_m512, + bucket_select_mask_lo_m512, + neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check shufti 64x16 successfully\n"); + return 1; + } else { + return 0; + } +} +#endif + static rose_inline int roseCheckSingleLookaround(const struct RoseEngine *t, const struct hs_scratch *scratch, @@ -1287,7 +1466,7 @@ int roseCheckMultipathShufti16x8(const struct hs_scratch *scratch, u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); DEBUG_PRINTF("expand_hi %llx\n", valid_hi); DEBUG_PRINTF("expand_lo %llx\n", valid_lo); - expand_valid = set64x2(valid_hi, valid_lo); + expand_valid = set2x64(valid_hi, valid_lo); valid_path_mask = ~movemask128(pshufb_m128(expand_valid, data_select_mask)); } @@ -1332,7 +1511,7 @@ int roseCheckMultipathShufti32x8(const struct hs_scratch *scratch, u32 valid_data_mask; m128 data_m128 = getData128(ci, offset, &valid_data_mask); - m256 data_double = set2x128(data_m128); + m256 data_double = set1_2x128(data_m128); m256 data_select_mask = loadu256(ri->data_select_mask); u32 valid_path_mask = 0; @@ -1346,7 +1525,7 @@ int roseCheckMultipathShufti32x8(const struct hs_scratch *scratch, u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); DEBUG_PRINTF("expand_hi %llx\n", valid_hi); DEBUG_PRINTF("expand_lo %llx\n", valid_lo); - expand_valid = set64x4(valid_hi, valid_lo, valid_hi, + expand_valid = set4x64(valid_hi, valid_lo, valid_hi, valid_lo); valid_path_mask = ~movemask256(pshufb_m256(expand_valid, data_select_mask)); @@ -1393,7 +1572,7 @@ int roseCheckMultipathShufti32x16(const struct hs_scratch 
*scratch, u32 valid_data_mask; m128 data_m128 = getData128(ci, offset, &valid_data_mask); - m256 data_double = set2x128(data_m128); + m256 data_double = set1_2x128(data_m128); m256 data_select_mask = loadu256(ri->data_select_mask); u32 valid_path_mask = 0; @@ -1407,7 +1586,7 @@ int roseCheckMultipathShufti32x16(const struct hs_scratch *scratch, u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); DEBUG_PRINTF("expand_hi %llx\n", valid_hi); DEBUG_PRINTF("expand_lo %llx\n", valid_lo); - expand_valid = set64x4(valid_hi, valid_lo, valid_hi, + expand_valid = set4x64(valid_hi, valid_lo, valid_hi, valid_lo); valid_path_mask = ~movemask256(pshufb_m256(expand_valid, data_select_mask)); @@ -1460,7 +1639,7 @@ int roseCheckMultipathShufti64(const struct hs_scratch *scratch, u32 valid_data_mask; m128 data_m128 = getData128(ci, offset, &valid_data_mask); - m256 data_m256 = set2x128(data_m128); + m256 data_m256 = set1_2x128(data_m128); m256 data_select_mask_1 = loadu256(ri->data_select_mask); m256 data_select_mask_2 = loadu256(ri->data_select_mask + 32); @@ -1475,7 +1654,7 @@ int roseCheckMultipathShufti64(const struct hs_scratch *scratch, u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); DEBUG_PRINTF("expand_hi %llx\n", valid_hi); DEBUG_PRINTF("expand_lo %llx\n", valid_lo); - expand_valid = set64x4(valid_hi, valid_lo, valid_hi, + expand_valid = set4x64(valid_hi, valid_lo, valid_hi, valid_lo); u32 valid_path_1 = movemask256(pshufb_m256(expand_valid, data_select_mask_1)); @@ -1937,7 +2116,6 @@ hwlmcb_rv_t checkPurelyNegatives(const struct RoseEngine *t, return HWLM_CONTINUE_MATCHING; } -#if !defined(_WIN32) #define PROGRAM_CASE(name) \ case ROSE_INSTR_##name: { \ LABEL_ROSE_INSTR_##name: \ @@ -1953,21 +2131,6 @@ hwlmcb_rv_t checkPurelyNegatives(const struct RoseEngine *t, #define PROGRAM_NEXT_INSTRUCTION_JUMP \ goto *(next_instr[*(const u8 *)pc]); -#else -#define PROGRAM_CASE(name) \ - case ROSE_INSTR_##name: { \ - DEBUG_PRINTF("instruction: " #name " (pc=%u)\n", \ - programOffset + (u32)(pc - pc_base)); \ - const struct ROSE_STRUCT_##name *ri = \ - (const struct ROSE_STRUCT_##name *)pc; - -#define PROGRAM_NEXT_INSTRUCTION \ - pc += ROUNDUP_N(sizeof(*ri), ROSE_INSTR_MIN_ALIGN); \ - break; \ - } - -#define PROGRAM_NEXT_INSTRUCTION_JUMP continue; -#endif hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, struct hs_scratch *scratch, u32 programOffset, @@ -1999,7 +2162,6 @@ hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, assert(*(const u8 *)pc != ROSE_INSTR_END); -#if !defined(_WIN32) static const void *next_instr[] = { &&LABEL_ROSE_INSTR_END, //!< End of program. &&LABEL_ROSE_INSTR_ANCHORED_DELAY, //!< Delay until after anchored matcher. @@ -2068,8 +2230,13 @@ hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, &&LABEL_ROSE_INSTR_FLUSH_COMBINATION, &&LABEL_ROSE_INSTR_SET_EXHAUST, &&LABEL_ROSE_INSTR_LAST_FLUSH_COMBINATION - }; +#ifdef HAVE_AVX512 + , + &&LABEL_ROSE_INSTR_CHECK_SHUFTI_64x8, //!< Check 64-byte data by 8-bucket shufti. + &&LABEL_ROSE_INSTR_CHECK_SHUFTI_64x16, //!< Check 64-byte data by 16-bucket shufti. + &&LABEL_ROSE_INSTR_CHECK_MASK_64 //!< 64-bytes and/cmp/neg mask check. 
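        // next_instr[] is indexed by the opcode byte *pc, so these AVX512-only labels are
        // appended after the common entries; builds without HAVE_AVX512 keep the shared part
        // of the table unchanged.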
#endif + }; for (;;) { assert(ISALIGNED_N(pc, ROSE_INSTR_MIN_ALIGN)); @@ -2258,6 +2425,45 @@ hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, } PROGRAM_NEXT_INSTRUCTION +#ifdef HAVE_AVX512 + PROGRAM_CASE(CHECK_MASK_64) { + struct core_info *ci = &scratch->core_info; + if (!roseCheckMask64(ci, ri->and_mask, ri->cmp_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_64x8) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckShufti64x8(ci, ri->hi_mask, ri->lo_mask, + ri->bucket_select_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_64x16) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckShufti64x16(ci, ri->hi_mask_1, ri->hi_mask_2, + ri->lo_mask_1, ri->lo_mask_2, + ri->bucket_select_mask_hi, + ri->bucket_select_mask_lo, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP; + } + } + PROGRAM_NEXT_INSTRUCTION +#endif + PROGRAM_CASE(CHECK_INFIX) { if (!roseTestInfix(t, scratch, ri->queue, ri->lag, ri->report, end)) { @@ -2886,6 +3092,7 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, const char in_catchup = prog_flags & ROSE_PROG_FLAG_IN_CATCHUP; const char from_mpv = prog_flags & ROSE_PROG_FLAG_FROM_MPV; + const char skip_mpv_catchup = prog_flags & ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; const char *pc_base = getByOffset(t, programOffset); const char *pc = pc_base; @@ -2945,6 +3152,19 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, } L_PROGRAM_NEXT_INSTRUCTION +#ifdef HAVE_AVX512 + L_PROGRAM_CASE(CHECK_MASK_64) { + struct core_info *ci = &scratch->core_info; + if (!roseCheckMask64(ci, ri->and_mask, ri->cmp_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION +#endif + L_PROGRAM_CASE(CHECK_BYTE) { const struct core_info *ci = &scratch->core_info; if (!roseCheckByte(ci, ri->and_mask, ri->cmp_mask, @@ -2969,6 +3189,17 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, } L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(CATCH_UP_MPV) { + if (from_mpv || skip_mpv_catchup) { + DEBUG_PRINTF("skipping mpv catchup\n"); + } else if (roseCatchUpMPV(t, + end - scratch->core_info.buf_offset, + scratch) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(SOM_FROM_REPORT) { som = handleSomExternal(scratch, &ri->som, end); DEBUG_PRINTF("som from report %u is %llu\n", ri->som.onmatch, @@ -2976,6 +3207,15 @@ hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, } L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(TRIGGER_SUFFIX) { + if (roseTriggerSuffix(t, scratch, ri->queue, ri->event, som, + end) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + L_PROGRAM_NEXT_INSTRUCTION + L_PROGRAM_CASE(DEDUPE) { updateSeqPoint(tctxt, end, from_mpv); const char do_som = t->hasSom; // TODO: constant propagate diff --git a/src/rose/rose_build_add.cpp b/src/rose/rose_build_add.cpp index aa043fade..5aed21f57 100644 --- a/src/rose/rose_build_add.cpp +++ b/src/rose/rose_build_add.cpp @@ -56,7 +56,6 @@ #include "util/dump_charclass.h" #include "util/graph_range.h" #include "util/insertion_ordered.h" -#include "util/make_unique.h" 
#include "util/noncopyable.h" #include "util/order_check.h" #include "util/report_manager.h" @@ -217,9 +216,9 @@ RoseRoleHistory selectHistory(const RoseBuildImpl &tbi, const RoseBuildData &bd, const bool fixed_offset_src = g[u].fixedOffset(); const bool has_bounds = g[e].minBound || (g[e].maxBound != ROSE_BOUND_INF); - DEBUG_PRINTF("edge %zu->%zu, bounds=[%u,%u], fixed_u=%d, prefix=%d\n", + /*DEBUG_PRINTF("edge %zu->%zu, bounds=[%u,%u], fixed_u=%d, prefix=%d\n", g[u].index, g[v].index, g[e].minBound, g[e].maxBound, - (int)g[u].fixedOffset(), (int)g[v].left); + (int)g[u].fixedOffset(), (int)g[v].left);*/ if (g[v].left) { // Roles with prefix engines have their history handled by that prefix. @@ -301,7 +300,7 @@ void createVertices(RoseBuildImpl *tbi, } DEBUG_PRINTF(" adding new vertex index=%zu\n", tbi->g[w].index); - vertex_map[iv].push_back(w); + vertex_map[iv].emplace_back(w); } else { w = created[key]; } @@ -612,7 +611,7 @@ void doRoseLiteralVertex(RoseBuildImpl *tbi, bool use_eod_table, RoseVertex v = tryForAnchoredVertex(tbi, iv_info, ep); if (v != RoseGraph::null_vertex()) { DEBUG_PRINTF("add anchored literal vertex\n"); - vertex_map[iv].push_back(v); + vertex_map[iv].emplace_back(v); return; } } @@ -656,7 +655,7 @@ unique_ptr makeRoseEodPrefix(const NGHolder &h, RoseBuildImpl &build, continue; } add_edge_if_not_present(u, g.accept, g); - dead.push_back(e); + dead.emplace_back(e); if (!contains(remap, g[u].reports)) { remap[g[u].reports] = build.getNewNfaReport(); @@ -967,11 +966,11 @@ void populateRoseGraph(RoseBuildImpl *tbi, RoseBuildData &bd) { if (ig[iv].type == RIV_START) { DEBUG_PRINTF("is root\n"); - vertex_map[iv].push_back(tbi->root); + vertex_map[iv].emplace_back(tbi->root); continue; } else if (ig[iv].type == RIV_ANCHORED_START) { DEBUG_PRINTF("is anchored root\n"); - vertex_map[iv].push_back(tbi->anchored_root); + vertex_map[iv].emplace_back(tbi->anchored_root); continue; } @@ -1544,7 +1543,7 @@ bool RoseBuildImpl::addRose(const RoseInGraph &ig, bool prefilter) { NGHolder *h = in[e].graph.get(); assert(isCorrectlyTopped(*h)); - graphs[h].push_back(e); + graphs[h].emplace_back(e); } vector graph_edges; @@ -1624,7 +1623,7 @@ bool roseCheckRose(const RoseInGraph &ig, bool prefilter, continue; } - graphs.push_back(ig[e].graph.get()); + graphs.emplace_back(ig[e].graph.get()); } for (const auto &g : graphs) { @@ -1781,9 +1780,9 @@ bool RoseBuildImpl::addOutfix(const NGHolder &h) { } if (rdfa) { - outfixes.push_back(OutfixInfo(move(rdfa))); + outfixes.emplace_back(OutfixInfo(std::move(rdfa))); } else { - outfixes.push_back(OutfixInfo(cloneHolder(h))); + outfixes.emplace_back(OutfixInfo(cloneHolder(h))); } populateOutfixInfo(outfixes.back(), h, *this); @@ -1794,7 +1793,7 @@ bool RoseBuildImpl::addOutfix(const NGHolder &h) { bool RoseBuildImpl::addOutfix(const NGHolder &h, const raw_som_dfa &haig) { DEBUG_PRINTF("haig with %zu states\n", haig.states.size()); - outfixes.push_back(OutfixInfo(ue2::make_unique(haig))); + outfixes.emplace_back(OutfixInfo(std::make_unique(haig))); populateOutfixInfo(outfixes.back(), h, *this); return true; /* failure is not yet an option */ @@ -1802,12 +1801,12 @@ bool RoseBuildImpl::addOutfix(const NGHolder &h, const raw_som_dfa &haig) { bool RoseBuildImpl::addOutfix(const raw_puff &rp) { if (!mpv_outfix) { - mpv_outfix = make_unique(MpvProto()); + mpv_outfix = std::make_unique(MpvProto()); } auto *mpv = mpv_outfix->mpv(); assert(mpv); - mpv->puffettes.push_back(rp); + mpv->puffettes.emplace_back(rp); mpv_outfix->maxBAWidth = ROSE_BOUND_INF; /* not 
ba */ mpv_outfix->minWidth = min(mpv_outfix->minWidth, depth(rp.repeats)); @@ -1827,12 +1826,12 @@ bool RoseBuildImpl::addOutfix(const raw_puff &rp) { bool RoseBuildImpl::addChainTail(const raw_puff &rp, u32 *queue_out, u32 *event_out) { if (!mpv_outfix) { - mpv_outfix = make_unique(MpvProto()); + mpv_outfix = std::make_unique(MpvProto()); } auto *mpv = mpv_outfix->mpv(); assert(mpv); - mpv->triggered_puffettes.push_back(rp); + mpv->triggered_puffettes.emplace_back(rp); mpv_outfix->maxBAWidth = ROSE_BOUND_INF; /* not ba */ mpv_outfix->minWidth = min(mpv_outfix->minWidth, depth(rp.repeats)); diff --git a/src/rose/rose_build_add_mask.cpp b/src/rose/rose_build_add_mask.cpp index 0a7e44c37..c3736f62f 100644 --- a/src/rose/rose_build_add_mask.cpp +++ b/src/rose/rose_build_add_mask.cpp @@ -48,7 +48,6 @@ #include "util/container.h" #include "util/dump_charclass.h" #include "util/graph.h" -#include "util/make_unique.h" #include "util/ue2string.h" #include "util/verify_types.h" @@ -185,7 +184,7 @@ bool expandFmlCandidates(const CharReach &cr, vector &curr, return false; } - curr.push_back(lit); + curr.emplace_back(lit); curr.back().push_back(c, nocase); } } @@ -300,7 +299,7 @@ unique_ptr buildMaskLhs(bool anchored, u32 prefix_len, DEBUG_PRINTF("build %slhs len %u/%zu\n", anchored ? "anc " : "", prefix_len, mask.size()); - unique_ptr lhs = ue2::make_unique(NFA_PREFIX); + unique_ptr lhs = std::make_unique(NFA_PREFIX); assert(prefix_len); assert(mask.size() >= prefix_len); @@ -335,8 +334,8 @@ void buildLiteralMask(const vector &mask, vector &msk, auto it = ite - min(size_t{HWLM_MASKLEN}, mask.size() - delay); for (; it != ite; ++it) { - msk.push_back(0); - cmp.push_back(0); + msk.emplace_back(0); + cmp.emplace_back(0); make_and_cmp_mask(*it, &msk.back(), &cmp.back()); } @@ -568,7 +567,7 @@ unique_ptr buildMaskRhs(const flat_set &reports, assert(suffix_len); assert(mask.size() > suffix_len); - unique_ptr rhs = ue2::make_unique(NFA_SUFFIX); + unique_ptr rhs = std::make_unique(NFA_SUFFIX); NGHolder &h = *rhs; NFAVertex succ = h.accept; @@ -751,7 +750,7 @@ static unique_ptr makeAnchoredGraph(const vector &mask, const flat_set &reports, bool eod) { - auto gp = ue2::make_unique(); + auto gp = std::make_unique(); NGHolder &g = *gp; NFAVertex u = g.start; diff --git a/src/rose/rose_build_anchored.cpp b/src/rose/rose_build_anchored.cpp index 8ea07c95d..027aefd0b 100644 --- a/src/rose/rose_build_anchored.cpp +++ b/src/rose/rose_build_anchored.cpp @@ -51,7 +51,6 @@ #include "util/determinise.h" #include "util/flat_containers.h" #include "util/graph_range.h" -#include "util/make_unique.h" #include "util/order_check.h" #include "util/ue2string.h" #include "util/unordered.h" @@ -145,9 +144,9 @@ void mergeAnchoredDfas(vector> &dfas, for (auto &rdfa : dfas) { u32 start_size = mcclellanStartReachSize(rdfa.get()); if (start_size <= MAX_SMALL_START_REACH) { - small_starts.push_back(move(rdfa)); + small_starts.emplace_back(std::move(rdfa)); } else { - big_starts.push_back(move(rdfa)); + big_starts.emplace_back(std::move(rdfa)); } } dfas.clear(); @@ -159,10 +158,10 @@ void mergeAnchoredDfas(vector> &dfas, // Rehome our groups into one vector. 
for (auto &rdfa : small_starts) { - dfas.push_back(move(rdfa)); + dfas.emplace_back(std::move(rdfa)); } for (auto &rdfa : big_starts) { - dfas.push_back(move(rdfa)); + dfas.emplace_back(std::move(rdfa)); } // Final test: if we've built two DFAs here that are small enough, we can @@ -300,7 +299,7 @@ class Automaton_Holder { explicit Automaton_Holder(const NGHolder &g_in) : g(g_in) { for (auto v : vertices_range(g)) { vertexToIndex[v] = indexToVertex.size(); - indexToVertex.push_back(v); + indexToVertex.emplace_back(v); } assert(indexToVertex.size() <= ANCHORED_NFA_STATE_LIMIT); @@ -331,7 +330,7 @@ class Automaton_Holder { if (t.any() && t != esets[i]) { esets[i] &= ~t; - esets.push_back(t); + esets.emplace_back(t); } } } @@ -511,7 +510,7 @@ NFAVertex extractLiteral(const NGHolder &h, ue2_literal *lit) { if (cr.count() > 1 && !cr.isCaselessChar()) { break; } - lit_verts.push_back(v); + lit_verts.emplace_back(v); } if (lit_verts.empty()) { @@ -686,7 +685,7 @@ int finalise_out(RoseBuildImpl &build, const NGHolder &h, if (check_dupe(*out_dfa, build.anchored_nfas[hash], remap)) { return ANCHORED_REMAP; } - build.anchored_nfas[hash].push_back(move(out_dfa)); + build.anchored_nfas[hash].emplace_back(std::move(out_dfa)); return ANCHORED_SUCCESS; } @@ -699,9 +698,9 @@ int addAutomaton(RoseBuildImpl &build, const NGHolder &h, ReportID *remap) { Automaton_Holder autom(h); - auto out_dfa = ue2::make_unique(NFA_OUTFIX_RAW); + auto out_dfa = std::make_unique(NFA_OUTFIX_RAW); if (determinise(autom, out_dfa->states, MAX_DFA_STATES)) { - return finalise_out(build, h, autom, move(out_dfa), remap); + return finalise_out(build, h, autom, std::move(out_dfa), remap); } DEBUG_PRINTF("determinise failed\n"); @@ -761,14 +760,14 @@ void buildSimpleDfas(const RoseBuildImpl &build, const vector &frag_map, } auto h = populate_holder(simple.first, exit_ids); Automaton_Holder autom(*h); - auto rdfa = ue2::make_unique(NFA_OUTFIX_RAW); + auto rdfa = std::make_unique(NFA_OUTFIX_RAW); UNUSED bool rv = determinise(autom, rdfa->states, MAX_DFA_STATES); assert(rv); rdfa->start_anchored = INIT_STATE; rdfa->start_floating = DEAD_STATE; rdfa->alpha_size = autom.alphasize; rdfa->alpha_remap = autom.alpha; - anchored_dfas->push_back(move(rdfa)); + anchored_dfas->emplace_back(std::move(rdfa)); } } @@ -785,7 +784,7 @@ vector> getAnchoredDfas(RoseBuildImpl &build, // DFAs that already exist as raw_dfas. for (auto &anch_dfas : build.anchored_nfas) { for (auto &rdfa : anch_dfas.second) { - dfas.push_back(move(rdfa)); + dfas.emplace_back(std::move(rdfa)); } } build.anchored_nfas.clear(); @@ -823,7 +822,7 @@ size_t buildNfas(vector &anchored_dfas, for (auto &rdfa : anchored_dfas) { u32 removed_dots = remove_leading_dots(rdfa); - start_offset->push_back(removed_dots); + start_offset->emplace_back(removed_dots); minimize_hopcroft(rdfa, cc.grey); @@ -835,7 +834,7 @@ size_t buildNfas(vector &anchored_dfas, assert(nfa->length); total_size += ROUNDUP_CL(sizeof(anchored_matcher_info) + nfa->length); - nfas->push_back(move(nfa)); + nfas->emplace_back(std::move(nfa)); } // We no longer need to keep the raw_dfa structures around. 
@@ -862,7 +861,7 @@ vector buildAnchoredDfas(RoseBuildImpl &build, dfas.reserve(anch_dfas.size()); for (auto &rdfa : anch_dfas) { assert(rdfa); - dfas.push_back(move(*rdfa)); + dfas.emplace_back(std::move(*rdfa)); } return dfas; } diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 5cbb5c848..06f36582b 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2019, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -87,7 +87,6 @@ #include "util/fatbit_build.h" #include "util/graph_range.h" #include "util/insertion_ordered.h" -#include "util/make_unique.h" #include "util/multibit_build.h" #include "util/noncopyable.h" #include "util/order_check.h" @@ -554,7 +553,8 @@ void findFixedDepthTops(const RoseGraph &g, const set &triggers, */ static bytecode_ptr pickImpl(bytecode_ptr dfa_impl, - bytecode_ptr nfa_impl) { + bytecode_ptr nfa_impl, + bool fast_nfa) { assert(nfa_impl); assert(dfa_impl); assert(isDfaType(dfa_impl->type)); @@ -584,7 +584,7 @@ bytecode_ptr pickImpl(bytecode_ptr dfa_impl, return nfa_impl; } } else { - if (n_accel) { + if (n_accel && fast_nfa) { return nfa_impl; } else { return dfa_impl; @@ -632,6 +632,15 @@ bytecode_ptr getDfa(raw_dfa &rdfa, bool is_transient, * bytecode and that they are usually run on small blocks */ dfa = mcshengCompile(rdfa, cc, rm); } + if (!dfa) { + dfa = sheng32Compile(rdfa, cc, rm, false); + } + if (!dfa) { + dfa = sheng64Compile(rdfa, cc, rm, false); + } + if (!dfa && !is_transient) { + dfa = mcshengCompile64(rdfa, cc, rm); + } if (!dfa) { // Sheng wasn't successful, so unleash McClellan! 
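        // Each compile call above yields an empty bytecode_ptr when it cannot build an
        // implementation for this DFA, so the candidates are tried in order; McClellan below
        // is the unconditional fallback.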
dfa = mcclellanCompile(rdfa, cc, rm, false); @@ -678,22 +687,23 @@ buildSuffix(const ReportManager &rm, const SomSlotManager &ssm, } } + bool fast_nfa = false; auto n = constructNFA(holder, &rm, fixed_depth_tops, triggers, - compress_state, cc); + compress_state, fast_nfa, cc); assert(n); if (oneTop && cc.grey.roseMcClellanSuffix) { if (cc.grey.roseMcClellanSuffix == 2 || n->nPositions > 128 || - !has_bounded_repeats_other_than_firsts(*n)) { + !has_bounded_repeats_other_than_firsts(*n) || !fast_nfa) { auto rdfa = buildMcClellan(holder, &rm, false, triggers.at(0), cc.grey); if (rdfa) { auto d = getDfa(*rdfa, false, cc, rm); assert(d); if (cc.grey.roseMcClellanSuffix != 2) { - n = pickImpl(move(d), move(n)); + n = pickImpl(std::move(d), std::move(n), fast_nfa); } else { - n = move(d); + n = std::move(d); } assert(n); @@ -739,7 +749,7 @@ static vector as_cr_seq(const rose_literal_id &lit) { vector rv = as_cr_seq(lit.s); for (u32 i = 0; i < lit.delay; i++) { - rv.push_back(CharReach::dot()); + rv.emplace_back(CharReach::dot()); } /* TODO: take into account cmp/msk */ @@ -765,7 +775,7 @@ void findTriggerSequences(const RoseBuildImpl &tbi, for (u32 id : lit_ids) { const rose_literal_id &lit = tbi.literals.at(id); - (*trigger_lits)[top].push_back(as_cr_seq(lit)); + (*trigger_lits)[top].emplace_back(as_cr_seq(lit)); } } } @@ -826,23 +836,24 @@ bytecode_ptr makeLeftNfa(const RoseBuildImpl &tbi, left_id &left, n = constructLBR(*left.graph(), triggers.begin()->second, cc, rm); } + bool fast_nfa = false; if (!n && left.graph()) { map>> triggers; if (left.graph()->kind == NFA_INFIX) { findTriggerSequences(tbi, infixTriggers.at(left), &triggers); } n = constructNFA(*left.graph(), nullptr, fixed_depth_tops, triggers, - compress_state, cc); + compress_state, fast_nfa, cc); } if (cc.grey.roseMcClellanPrefix == 1 && is_prefix && !left.dfa() && left.graph() - && (!n || !has_bounded_repeats_other_than_firsts(*n) || !is_fast(*n))) { + && (!n || !has_bounded_repeats_other_than_firsts(*n) || !fast_nfa)) { auto rdfa = buildMcClellan(*left.graph(), nullptr, cc.grey); if (rdfa) { auto d = getDfa(*rdfa, is_transient, cc, rm); assert(d); - n = pickImpl(move(d), move(n)); + n = pickImpl(std::move(d), std::move(n), fast_nfa); } } @@ -902,7 +913,7 @@ void appendTailToHolder(NGHolder &h, const vector &tail) { map, vector > reporters; for (auto v : inv_adjacent_vertices_range(h.accept, h)) { - reporters[h[v].reports].push_back(v); + reporters[h[v].reports].emplace_back(v); } for (const auto &e : reporters) { @@ -1191,7 +1202,7 @@ static unique_ptr constructTamaInfo(const RoseGraph &g, const vector &subengines, const bool is_suffix) { - unique_ptr tamaInfo = ue2::make_unique(); + unique_ptr tamaInfo = std::make_unique(); for (const auto &sub : subengines) { const auto &rose_vertices = sub.vertices; NFA *nfa = sub.nfa.get(); @@ -1411,12 +1422,12 @@ void buildExclusiveInfixes(RoseBuildImpl &build, build_context &bc, setLeftNfaProperties(*n, leftfix); ExclusiveSubengine engine; - engine.nfa = move(n); + engine.nfa = std::move(n); engine.vertices = verts; - info.subengines.push_back(move(engine)); + info.subengines.emplace_back(std::move(engine)); } info.queue = qif.get_queue(); - exclusive_info.push_back(move(info)); + exclusive_info.emplace_back(std::move(info)); } updateExclusiveInfixProperties(build, exclusive_info, bc.leftfix_info, no_retrigger_queues); @@ -1450,7 +1461,7 @@ void findExclusiveInfixes(RoseBuildImpl &build, build_context &bc, // NFA already built. 
u32 id = leftfixes[leftfix]; if (contains(vertex_map, id)) { - vertex_map[id].push_back(v); + vertex_map[id].emplace_back(v); } DEBUG_PRINTF("sharing leftfix, id=%u\n", id); continue; @@ -1462,7 +1473,7 @@ void findExclusiveInfixes(RoseBuildImpl &build, build_context &bc, if (leftfix.graph() || leftfix.castle()) { leftfixes.emplace(leftfix, role_id); - vertex_map[role_id].push_back(v); + vertex_map[role_id].emplace_back(v); map>> triggers; findTriggerSequences(build, infixTriggers.at(leftfix), &triggers); @@ -1533,7 +1544,7 @@ bool buildLeftfixes(RoseBuildImpl &tbi, build_context &bc, } } - succs[leftfix].push_back(v); + succs[leftfix].emplace_back(v); } rose_group initial_groups = tbi.getInitialGroups(); @@ -1627,17 +1638,18 @@ class OutfixBuilder : public boost::static_visitor> { const map fixed_depth_tops; /* no tops */ const map>> triggers; /* no tops */ bool compress_state = cc.streaming; + bool fast_nfa = false; auto n = constructNFA(h, &rm, fixed_depth_tops, triggers, - compress_state, cc); + compress_state, fast_nfa, cc); // Try for a DFA upgrade. if (n && cc.grey.roseMcClellanOutfix && - !has_bounded_repeats_other_than_firsts(*n)) { + (!has_bounded_repeats_other_than_firsts(*n) || !fast_nfa)) { auto rdfa = buildMcClellan(h, &rm, cc.grey); if (rdfa) { auto d = getDfa(*rdfa, false, cc, rm); if (d) { - n = pickImpl(move(d), move(n)); + n = pickImpl(std::move(d), std::move(n), fast_nfa); } } } @@ -1852,15 +1864,15 @@ void buildExclusiveSuffixes(RoseBuildImpl &build, build_context &bc, setSuffixProperties(*n, s, build.rm); ExclusiveSubengine engine; - engine.nfa = move(n); + engine.nfa = std::move(n); engine.vertices = verts; - info.subengines.push_back(move(engine)); + info.subengines.emplace_back(std::move(engine)); const auto &reports = all_reports(s); info.reports.insert(reports.begin(), reports.end()); } info.queue = qif.get_queue(); - exclusive_info.push_back(move(info)); + exclusive_info.emplace_back(std::move(info)); } updateExclusiveSuffixProperties(build, exclusive_info, no_retrigger_queues); @@ -1891,7 +1903,7 @@ void findExclusiveSuffixes(RoseBuildImpl &tbi, build_context &bc, if (contains(suffixes, s)) { u32 id = suffixes[s]; if (!tbi.isInETable(v)) { - vertex_map[id].push_back(v); + vertex_map[id].emplace_back(v); } continue; } @@ -1905,7 +1917,7 @@ void findExclusiveSuffixes(RoseBuildImpl &tbi, build_context &bc, DEBUG_PRINTF("assigning %p to id %u\n", s.graph(), role_id); suffixes.emplace(s, role_id); - vertex_map[role_id].push_back(v); + vertex_map[role_id].emplace_back(v); const set &s_triggers = suffixTriggers.at(s); map>> triggers; findTriggerSequences(tbi, s_triggers, &triggers); @@ -2178,7 +2190,7 @@ u32 buildLastByteIter(const RoseGraph &g, build_context &bc) { // Eager EOD reporters won't have state indices. 
auto it = bc.roleStateIndices.find(v); if (it != end(bc.roleStateIndices)) { - lb_roles.push_back(it->second); + lb_roles.emplace_back(it->second); DEBUG_PRINTF("last byte %u\n", it->second); } } @@ -2251,7 +2263,7 @@ vector buildSuffixEkeyLists(const RoseBuildImpl &build, build_context &bc, u32 qi = e.first; auto &ekeys = e.second; assert(!ekeys.empty()); - ekeys.push_back(INVALID_EKEY); /* terminator */ + ekeys.emplace_back(INVALID_EKEY); /* terminator */ out[qi] = bc.engine_blob.add_range(ekeys); } @@ -2266,7 +2278,7 @@ u32 buildEodNfaIterator(build_context &bc, const u32 activeQueueCount) { const auto &eng_info = bc.engine_info_by_queue.at(qi); if (eng_info.accepts_eod) { DEBUG_PRINTF("nfa qi=%u accepts eod\n", qi); - keys.push_back(qi); + keys.emplace_back(qi); } } @@ -2341,7 +2353,7 @@ void addSomRevNfas(build_context &bc, RoseEngine &proto, u32 offset = bc.engine_blob.add(*nfa, nfa->length); DEBUG_PRINTF("wrote SOM rev NFA %zu (len %u) to offset %u\n", nfa_offsets.size(), nfa->length, offset); - nfa_offsets.push_back(offset); + nfa_offsets.emplace_back(offset); /* note: som rev nfas don't need a queue assigned as only run in block * mode reverse */ } @@ -2404,7 +2416,7 @@ u32 writeProgram(build_context &bc, RoseProgram &&program) { u32 offset = bc.engine_blob.add(prog_bytecode); DEBUG_PRINTF("prog len %zu written at offset %u\n", prog_bytecode.size(), offset); - bc.program_cache.emplace(move(program), offset); + bc.program_cache.emplace(std::move(program), offset); return offset; } @@ -2415,7 +2427,7 @@ u32 writeActiveLeftIter(RoseEngineBlob &engine_blob, for (size_t i = 0; i < leftInfoTable.size(); i++) { if (!leftInfoTable[i].transient) { DEBUG_PRINTF("leftfix %zu is active\n", i); - keys.push_back(verify_u32(i)); + keys.emplace_back(verify_u32(i)); } } @@ -2569,13 +2581,13 @@ void makeBoundaryPrograms(const RoseBuildImpl &build, build_context &bc, DEBUG_PRINTF("report ^$: %zu\n", dboundary.report_at_0_eod_full.size()); auto eod_prog = makeBoundaryProgram(build, boundary.report_at_eod); - out.reportEodOffset = writeProgram(bc, move(eod_prog)); + out.reportEodOffset = writeProgram(bc, std::move(eod_prog)); auto zero_prog = makeBoundaryProgram(build, boundary.report_at_0); - out.reportZeroOffset = writeProgram(bc, move(zero_prog)); + out.reportZeroOffset = writeProgram(bc, std::move(zero_prog)); auto zeod_prog = makeBoundaryProgram(build, dboundary.report_at_0_eod_full); - out.reportZeroEodOffset = writeProgram(bc, move(zeod_prog)); + out.reportZeroEodOffset = writeProgram(bc, std::move(zeod_prog)); } static @@ -2740,10 +2752,10 @@ RoseProgram makeFragmentProgram(const RoseBuildImpl &build, build_context &bc, for (const auto &lit_id : lit_ids) { auto prog = makeLiteralProgram(build, bc, prog_build, lit_id, lit_edge_map, false); - blocks.push_back(move(prog)); + blocks.emplace_back(std::move(prog)); } - return assembleProgramBlocks(move(blocks)); + return assembleProgramBlocks(std::move(blocks)); } /** @@ -2844,7 +2856,7 @@ vector groupByFragment(const RoseBuildImpl &build) { DEBUG_PRINTF("fragment candidate: lit_id=%u %s\n", lit_id, dumpString(lit.s).c_str()); auto &fi = frag_info[getFragment(lit)]; - fi.lit_ids.push_back(lit_id); + fi.lit_ids.emplace_back(lit_id); fi.groups |= groups; } @@ -2853,7 +2865,7 @@ vector groupByFragment(const RoseBuildImpl &build) { auto &fi = m.second; DEBUG_PRINTF("frag %s -> ids: %s\n", dumpString(m.first.s).c_str(), as_string_list(fi.lit_ids).c_str()); - fragments.emplace_back(frag_id, lit.s, fi.groups, move(fi.lit_ids)); + 
fragments.emplace_back(frag_id, lit.s, fi.groups, std::move(fi.lit_ids)); frag_id++; assert(frag_id == fragments.size()); } @@ -2906,7 +2918,7 @@ void findInclusionGroups(vector &fragments, u32 id = j; if (contains(includedIdMap, id) || contains(includedDelayIdMap, id)) { - candidates.push_back(j); + candidates.emplace_back(j); DEBUG_PRINTF("find candidate\n"); } } @@ -2969,7 +2981,7 @@ void buildFragmentPrograms(const RoseBuildImpl &build, child_offset); addIncludedJumpProgram(lit_prog, child_offset, pfrag.squash); } - pfrag.lit_program_offset = writeProgram(bc, move(lit_prog)); + pfrag.lit_program_offset = writeProgram(bc, std::move(lit_prog)); // We only do delayed rebuild in streaming mode. if (!build.cc.streaming) { @@ -2989,7 +3001,7 @@ void buildFragmentPrograms(const RoseBuildImpl &build, addIncludedJumpProgram(rebuild_prog, child_offset, pfrag.delay_squash); } - pfrag.delay_program_offset = writeProgram(bc, move(rebuild_prog)); + pfrag.delay_program_offset = writeProgram(bc, std::move(rebuild_prog)); } } @@ -3078,7 +3090,7 @@ pair writeDelayPrograms(const RoseBuildImpl &build, auto prog = makeLiteralProgram(build, bc, prog_build, delayed_lit_id, lit_edge_map, false); - u32 offset = writeProgram(bc, move(prog)); + u32 offset = writeProgram(bc, std::move(prog)); u32 delay_id; auto it = cache.find(offset); @@ -3088,7 +3100,7 @@ pair writeDelayPrograms(const RoseBuildImpl &build, delay_id, offset); } else { delay_id = verify_u32(programs.size()); - programs.push_back(offset); + programs.emplace_back(offset); cache.emplace(offset, delay_id); DEBUG_PRINTF("assigned new delay_id %u for offset %u\n", delay_id, offset); @@ -3138,7 +3150,7 @@ pair writeAnchoredPrograms(const RoseBuildImpl &build, auto prog = makeLiteralProgram(build, bc, prog_build, lit_id, lit_edge_map, true); - u32 offset = writeProgram(bc, move(prog)); + u32 offset = writeProgram(bc, std::move(prog)); DEBUG_PRINTF("lit_id=%u -> anch prog at %u\n", lit_id, offset); u32 anch_id; @@ -3149,7 +3161,7 @@ pair writeAnchoredPrograms(const RoseBuildImpl &build, offset); } else { anch_id = verify_u32(programs.size()); - programs.push_back(offset); + programs.emplace_back(offset); cache.emplace(offset, anch_id); DEBUG_PRINTF("assigned new anch_id %u for offset %u\n", anch_id, offset); @@ -3198,8 +3210,8 @@ pair buildReportPrograms(const RoseBuildImpl &build, for (ReportID id : reports) { auto program = makeReportProgram(build, bc.needs_mpv_catchup, id); - u32 offset = writeProgram(bc, move(program)); - programs.push_back(offset); + u32 offset = writeProgram(bc, std::move(program)); + programs.emplace_back(offset); build.rm.setProgramOffset(id, offset); DEBUG_PRINTF("program for report %u @ %u (%zu instructions)\n", id, programs.back(), program.size()); @@ -3265,7 +3277,7 @@ void addEodAnchorProgram(const RoseBuildImpl &build, const build_context &bc, g[u].index); continue; } - edge_list.push_back(e); + edge_list.emplace_back(e); } const bool multiple_preds = edge_list.size() > 1; @@ -3298,7 +3310,7 @@ void addEodEventProgram(const RoseBuildImpl &build, build_context &bc, vector edge_list; for (const auto &v : lit_info.vertices) { for (const auto &e : in_edges_range(v, g)) { - edge_list.push_back(e); + edge_list.emplace_back(e); } } @@ -3314,7 +3326,7 @@ void addEodEventProgram(const RoseBuildImpl &build, build_context &bc, bc.roleStateIndices, prog_build, build.eod_event_literal_id, edge_list, false); - program.add_block(move(block)); + program.add_block(std::move(block)); } static @@ -3465,7 +3477,7 @@ u32 
writeEagerQueueIter(const set &eager, u32 leftfixBeginQueue, vector vec; for (u32 q : eager) { assert(q >= leftfixBeginQueue); - vec.push_back(q - leftfixBeginQueue); + vec.emplace_back(q - leftfixBeginQueue); } auto iter = mmbBuildSparseIterator(vec, queue_count - leftfixBeginQueue); @@ -3703,7 +3715,7 @@ bytecode_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { drproto.get(), eproto.get(), sbproto.get()); auto eod_prog = makeEodProgram(*this, bc, prog_build, eodNfaIterOffset); - proto.eodProgramOffset = writeProgram(bc, move(eod_prog)); + proto.eodProgramOffset = writeProgram(bc, std::move(eod_prog)); size_t longLitStreamStateRequired = 0; proto.longLitTableOffset @@ -3722,11 +3734,11 @@ bytecode_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { writeLogicalInfo(rm, bc.engine_blob, proto); auto flushComb_prog = makeFlushCombProgram(proto); - proto.flushCombProgramOffset = writeProgram(bc, move(flushComb_prog)); + proto.flushCombProgramOffset = writeProgram(bc, std::move(flushComb_prog)); auto lastFlushComb_prog = makeLastFlushCombProgram(proto); proto.lastFlushCombProgramOffset = - writeProgram(bc, move(lastFlushComb_prog)); + writeProgram(bc, std::move(lastFlushComb_prog)); // Build anchored matcher. auto atable = buildAnchoredMatcher(*this, fragments, anchored_dfas); @@ -3870,7 +3882,7 @@ bytecode_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { bc.engine_blob.write_bytes(engine.get()); // Add a small write engine if appropriate. - engine = addSmallWriteEngine(*this, bc.resources, move(engine)); + engine = addSmallWriteEngine(*this, bc.resources, std::move(engine)); DEBUG_PRINTF("rose done %p\n", engine.get()); diff --git a/src/rose/rose_build_castle.cpp b/src/rose/rose_build_castle.cpp index 59bab3b1f..f3357982e 100644 --- a/src/rose/rose_build_castle.cpp +++ b/src/rose/rose_build_castle.cpp @@ -130,7 +130,7 @@ vector literals_for_vertex(const RoseBuildImpl &tbi, vector rv; for (const u32 id : tbi.g[v].literals) { - rv.push_back(tbi.literals.at(id)); + rv.emplace_back(tbi.literals.at(id)); } return rv; @@ -227,7 +227,7 @@ void makeCastles(RoseBuildImpl &tbi) { if (g[v].left && !tbi.isRootSuccessor(v)) { makeCastle(g[v].left, left_cache); if (g[v].left.castle) { - rev[g[v].left.castle.get()].push_back(v); + rev[g[v].left.castle.get()].emplace_back(v); } } @@ -253,11 +253,11 @@ bool unmakeCastles(RoseBuildImpl &tbi) { for (auto v : vertices_range(g)) { const LeftEngInfo &left = g[v].left; if (left.castle && left.castle->repeats.size() > 1) { - left_castles[left].push_back(v); + left_castles[left].emplace_back(v); } const RoseSuffixInfo &suffix = g[v].suffix; if (suffix.castle && suffix.castle->repeats.size() > 1) { - suffix_castles[suffix].push_back(v); + suffix_castles[suffix].emplace_back(v); } } @@ -303,10 +303,10 @@ void remapCastleTops(RoseBuildImpl &tbi) { RoseGraph &g = tbi.g; for (auto v : vertices_range(g)) { if (g[v].left.castle) { - rose_castles[g[v].left.castle.get()].push_back(v); + rose_castles[g[v].left.castle.get()].emplace_back(v); } if (g[v].suffix.castle) { - suffix_castles[g[v].suffix.castle.get()].push_back(v); + suffix_castles[g[v].suffix.castle.get()].emplace_back(v); } } diff --git a/src/rose/rose_build_compile.cpp b/src/rose/rose_build_compile.cpp index 1cf3bbe69..e67c9149a 100644 --- a/src/rose/rose_build_compile.cpp +++ b/src/rose/rose_build_compile.cpp @@ -194,7 +194,7 @@ void RoseBuildImpl::handleMixedSensitivity(void) { limited_explosion(lit.s) && literal_info[id].delayed_ids.empty()) { DEBUG_PRINTF("need to explode existing string '%s'\n", 
dumpString(lit.s).c_str()); - explode.push_back(id); + explode.emplace_back(id); } else { literal_info[id].requires_benefits = true; } @@ -734,9 +734,9 @@ void stealEodVertices(RoseBuildImpl &tbi) { if (lit.table == ROSE_EOD_ANCHORED) { if (suitableForAnchored(tbi, lit, info)) { - eodLiteralsForAnchored.push_back(i); + eodLiteralsForAnchored.emplace_back(i); } else { - eodLiteralsForFloating.push_back(i); + eodLiteralsForFloating.emplace_back(i); } } else if (lit.table == ROSE_FLOATING) { numFloatingLiterals++; @@ -863,7 +863,7 @@ map> findLeftSucc(const RoseBuildImpl &build) { for (auto v : vertices_range(build.g)) { if (build.g[v].left) { const LeftEngInfo &lei = build.g[v].left; - leftfixes[lei].push_back(v); + leftfixes[lei].emplace_back(v); } } return leftfixes; @@ -1046,7 +1046,7 @@ void packInfixTops(NGHolder &h, RoseGraph &g, h[e].tops = std::move(updated_tops); if (h[e].tops.empty()) { DEBUG_PRINTF("edge (start,%zu) has only unused tops\n", h[v].index); - dead.push_back(e); + dead.emplace_back(e); } } @@ -1481,7 +1481,7 @@ bool extractSEPLiterals(const raw_dfa &rdfa, if (!stateIsSEPLiteral(next, i, rdfa)) { return false; } - lits[rdfa.states[next].reports].push_back(i); + lits[rdfa.states[next].reports].emplace_back(i); } // Map from symbols back to character reachability. @@ -1577,7 +1577,7 @@ void addAnchoredSmallBlockLiterals(RoseBuildImpl &tbi) { dumpString(sai.literal).c_str(), sai.min_bound); } - anchored_lits.push_back(make_pair(sai, lit_ids)); + anchored_lits.emplace_back(make_pair(sai, lit_ids)); if (sai.literal.length() == 1) { oneByteLiterals++; } @@ -1588,7 +1588,7 @@ void addAnchoredSmallBlockLiterals(RoseBuildImpl &tbi) { map> sep_literals; for (OutfixInfo &oi : tbi.outfixes) { if (extractSEPLiterals(oi, tbi.rm, sep_literals)) { - sep_outfixes.push_back(&oi); + sep_outfixes.emplace_back(&oi); } } @@ -1782,7 +1782,7 @@ bytecode_ptr RoseBuildImpl::buildRose(u32 minWidth) { /* transfer mpv outfix to main queue */ if (mpv_outfix) { - outfixes.push_back(move(*mpv_outfix)); + outfixes.emplace_back(std::move(*mpv_outfix)); mpv_outfix = nullptr; } diff --git a/src/rose/rose_build_convert.cpp b/src/rose/rose_build_convert.cpp index 33351099f..c89c6ddd2 100644 --- a/src/rose/rose_build_convert.cpp +++ b/src/rose/rose_build_convert.cpp @@ -49,7 +49,6 @@ #include "util/compile_context.h" #include "util/depth.h" #include "util/graph_range.h" -#include "util/make_unique.h" #include "util/order_check.h" #include "util/ue2string.h" @@ -95,7 +94,7 @@ unique_ptr makeFloodProneSuffix(const ue2_literal &s, size_t len, assert(len < s.length()); assert(!reports.empty()); - unique_ptr h = ue2::make_unique(NFA_SUFFIX); + unique_ptr h = std::make_unique(NFA_SUFFIX); NFAVertex u = h->start; for (auto it = s.begin() + s.length() - len; it != s.end(); ++it) { @@ -114,7 +113,7 @@ unique_ptr makeFloodProneSuffix(const ue2_literal &s, size_t len, static unique_ptr makeRosePrefix(const ue2_literal &s) { - unique_ptr h = ue2::make_unique(NFA_PREFIX); + unique_ptr h = std::make_unique(NFA_PREFIX); NFAVertex u = h->startDs; for (const auto &c : s) { @@ -213,7 +212,7 @@ void convertFloodProneSuffix(RoseBuildImpl &tbi, RoseVertex v, u32 lit_id, // Apply the NFA. assert(!g[v].suffix); - g[v].suffix.graph = move(h); + g[v].suffix.graph = std::move(h); g[v].reports.clear(); // Swap v's literal for a shorter one. 
@@ -413,7 +412,7 @@ bool handleStartPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, } else { RoseEdge e_new = add_edge(ar, v, g); setEdgeBounds(g, e_new, bound_min, bound_max); - to_delete->push_back(e_old); + to_delete->emplace_back(e_old); } g[v].left.reset(); /* clear the prefix info */ @@ -562,6 +561,10 @@ bool handleMixedPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, DEBUG_PRINTF("woot?\n"); shared_ptr h_new = make_shared(); + if (!h_new) { + assert(0); + throw std::bad_alloc(); + } unordered_map rhs_map; vector exits_vec; insert(&exits_vec, exits_vec.end(), exits); @@ -605,7 +608,7 @@ bool handleMixedPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, } else { RoseEdge e_new = add_edge(ar, v, g); setEdgeBounds(g, e_new, ri.repeatMin + width, ri.repeatMax + width); - to_delete->push_back(e_old); + to_delete->emplace_back(e_old); } } else { diff --git a/src/rose/rose_build_dedupe.cpp b/src/rose/rose_build_dedupe.cpp index d5d002d43..0a19480a4 100644 --- a/src/rose/rose_build_dedupe.cpp +++ b/src/rose/rose_build_dedupe.cpp @@ -32,7 +32,6 @@ #include "smallwrite/smallwrite_build.h" #include "util/compile_context.h" #include "util/boundary_reports.h" -#include "util/make_unique.h" #include "util/report_manager.h" using namespace std; @@ -100,7 +99,7 @@ class RoseDedupeAuxImpl : public RoseDedupeAux { }; unique_ptr RoseBuildImpl::generateDedupeAux() const { - return ue2::make_unique(*this); + return std::make_unique(*this); } RoseDedupeAux::~RoseDedupeAux() = default; @@ -177,7 +176,7 @@ static vector makePath(const rose_literal_id &lit) { vector path(begin(lit.s), end(lit.s)); for (u32 i = 0; i < lit.delay; i++) { - path.push_back(CharReach::dot()); + path.emplace_back(CharReach::dot()); } return path; } diff --git a/src/rose/rose_build_dump.cpp b/src/rose/rose_build_dump.cpp index 8999daef2..e63d41039 100644 --- a/src/rose/rose_build_dump.cpp +++ b/src/rose/rose_build_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2019, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -696,7 +696,7 @@ vector sparseIterValues(const mmbit_sparse_iter *it, u32 num_bits) { u32 idx = 0; u32 i = mmbit_sparse_iter_begin(b, num_bits, &idx, it, s); while (i != MMB_INVALID) { - keys.push_back(i); + keys.emplace_back(i); i = mmbit_sparse_iter_next(b, num_bits, i, &idx, it, s); } @@ -757,13 +757,12 @@ CharReach shufti2cr(const u8 *lo, const u8 *hi, u8 bucket_mask) { static void dumpLookaroundShufti(ofstream &os, u32 len, const u8 *lo, const u8 *hi, - const u8 *bucket_mask, u32 neg_mask, s32 offset) { - assert(len == 16 || len == 32); + const u8 *bucket_mask, u64a neg_mask, s32 offset) { + assert(len == 16 || len == 32 || len == 64); os << " contents:" << endl; for (u32 idx = 0; idx < len; idx++) { CharReach cr = shufti2cr(lo, hi, bucket_mask[idx]); - - if (neg_mask & (1U << idx)) { + if (neg_mask & (1ULL << idx)) { cr.flip(); } @@ -779,14 +778,13 @@ void dumpLookaroundShufti(ofstream &os, u32 len, const u8 *lo, const u8 *hi, static void dumpLookaroundShufti(ofstream &os, u32 len, const u8 *lo, const u8 *hi, const u8 *lo_2, const u8 *hi_2, const u8 *bucket_mask, - const u8 *bucket_mask_2, u32 neg_mask, s32 offset) { - assert(len == 16 || len == 32); + const u8 *bucket_mask_2, u64a neg_mask, s32 offset) { + assert(len == 16 || len == 32 || len == 64); os << " contents:" << endl; for (u32 idx = 0; idx < len; idx++) { 
CharReach cr = shufti2cr(lo, hi, bucket_mask[idx]); cr |= shufti2cr(lo_2, hi_2, bucket_mask_2[idx]); - - if (neg_mask & (1U << idx)) { + if (neg_mask & (1ULL << idx)) { cr.flip(); } @@ -970,6 +968,20 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_MASK_64) { + os << " and_mask " + << dumpStrMask(ri->and_mask, sizeof(ri->and_mask)) + << endl; + os << " cmp_mask " + << dumpStrMask(ri->cmp_mask, sizeof(ri->cmp_mask)) + << endl; + os << " neg_mask 0x" << std::hex << std::setw(8) + << std::setfill('0') << ri->neg_mask << std::dec << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_BYTE) { os << " and_mask 0x" << std::hex << std::setw(2) << std::setfill('0') << u32{ri->and_mask} << std::dec @@ -1072,6 +1084,60 @@ void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { } PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_SHUFTI_64x8) { + os << " hi_mask " + << dumpStrMask(ri->hi_mask, sizeof(ri->hi_mask)) + << endl; + os << " lo_mask " + << dumpStrMask(ri->lo_mask, sizeof(ri->lo_mask)) + << endl; + os << " bucket_select_mask " + << dumpStrMask(ri->bucket_select_mask, + sizeof(ri->bucket_select_mask)) + << endl; + os << " neg_mask 0x" << std::hex << std::setw(8) + << std::setfill('0') << ri->neg_mask << std::dec << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + dumpLookaroundShufti(os, 64, ri->lo_mask, ri->hi_mask, + ri->bucket_select_mask, ri->neg_mask, + ri->offset); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_64x16) { + os << " hi_mask_1 " + << dumpStrMask(ri->hi_mask_1, sizeof(ri->hi_mask_1)) + << endl; + os << " hi_mask_2 " + << dumpStrMask(ri->hi_mask_2, sizeof(ri->hi_mask_2)) + << endl; + os << " lo_mask_1 " + << dumpStrMask(ri->lo_mask_1, sizeof(ri->lo_mask_1)) + << endl; + os << " lo_mask_2 " + << dumpStrMask(ri->lo_mask_2, sizeof(ri->lo_mask_2)) + << endl; + os << " bucket_select_mask_hi " + << dumpStrMask(ri->bucket_select_mask_hi, + sizeof(ri->bucket_select_mask_hi)) + << endl; + os << " bucket_select_mask_lo " + << dumpStrMask(ri->bucket_select_mask_lo, + sizeof(ri->bucket_select_mask_lo)) + << endl; + os << " neg_mask 0x" << std::hex << std::setw(8) + << std::setfill('0') << ri->neg_mask << std::dec << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + dumpLookaroundShufti(os, 64, ri->lo_mask_1, ri->hi_mask_1, + ri->lo_mask_2, ri->hi_mask_2, + ri->bucket_select_mask_lo, + ri->bucket_select_mask_hi, + ri->neg_mask, ri->offset); + } + PROGRAM_NEXT_INSTRUCTION + PROGRAM_CASE(CHECK_INFIX) { os << " queue " << ri->queue << endl; os << " lag " << ri->lag << endl; @@ -1509,10 +1575,10 @@ void dumpRoseLitPrograms(const vector &fragments, vector programs; for (const auto &frag : fragments) { if (frag.lit_program_offset) { - programs.push_back(frag.lit_program_offset); + programs.emplace_back(frag.lit_program_offset); } if (frag.delay_program_offset) { - programs.push_back(frag.delay_program_offset); + programs.emplace_back(frag.delay_program_offset); } } sort_and_unique(programs); diff --git a/src/rose/rose_build_exclusive.cpp b/src/rose/rose_build_exclusive.cpp index 6a5a710d0..bc9b15582 100644 --- a/src/rose/rose_build_exclusive.cpp +++ b/src/rose/rose_build_exclusive.cpp @@ -39,7 +39,6 @@ #include "util/container.h" #include "util/flat_containers.h" #include "util/graph.h" 
-#include "util/make_unique.h" using namespace std; @@ -72,15 +71,15 @@ vector> divideIntoChunks(const RoseBuildImpl &build, for (const auto &roleInfo : roleInfoSet) { if (cnt == chunkSize) { cnt -= chunkSize; - chunks.push_back(roleChunk); + chunks.emplace_back(roleChunk); roleChunk.roles.clear(); } - roleChunk.roles.push_back(roleInfo); + roleChunk.roles.emplace_back(roleInfo); cnt++; } if (cnt > 1) { - chunks.push_back(roleChunk); + chunks.emplace_back(roleChunk); } return chunks; @@ -106,14 +105,14 @@ bool addPrefixLiterals(NGHolder &h, unordered_set &tailId, NFAVertex u = add_vertex(h); h[u].char_reach = c; if (!i++) { - heads.push_back(u); + heads.emplace_back(u); last = u; continue; } add_edge(last, u, h); last = u; } - tails.push_back(last); + tails.emplace_back(last); tailId.insert(h[last].index); } @@ -280,7 +279,7 @@ void findCliques(const map> &exclusiveGroups, } // Construct the exclusivity graph map vertex_map; - unique_ptr cg = make_unique(); + unique_ptr cg = std::make_unique(); // Add vertices representing infixes/suffixes for (const auto &e : exclusiveGroups) { @@ -309,7 +308,7 @@ void findCliques(const map> &exclusiveGroups, for (const auto &i : clique) { DEBUG_PRINTF("cliq:%zu\n", i.size()); if (i.size() > 1) { - exclusive_roles.push_back(i); + exclusive_roles.emplace_back(i); } } DEBUG_PRINTF("Clique graph size:%zu\n", exclusive_roles.size()); @@ -359,7 +358,7 @@ bool setTriggerLiterals(RoleInfo &roleInfo, for (const auto &c : lit) { roleInfo.prefix_cr |= c; } - roleInfo.literals.push_back(lit); + roleInfo.literals.emplace_back(lit); } } diff --git a/src/rose/rose_build_groups.cpp b/src/rose/rose_build_groups.cpp index c670e6033..d8b9c9514 100644 --- a/src/rose/rose_build_groups.cpp +++ b/src/rose/rose_build_groups.cpp @@ -96,7 +96,7 @@ bool eligibleForAlwaysOnGroup(const RoseBuildImpl &build, u32 id) { static bool requires_group_assignment(const rose_literal_id &lit, const rose_literal_info &info) { - if (lit.delay) { /* we will check the shadow's master */ + if (lit.delay) { /* we will check the shadow's leader */ return false; } @@ -326,7 +326,7 @@ void assignGroupsToLiterals(RoseBuildImpl &build) { /* long literals will either be stuck in a mega group or spread around * depending on availability */ if (superStrong(lit)) { - long_lits.push_back(id); + long_lits.emplace_back(id); continue; } diff --git a/src/rose/rose_build_impl.h b/src/rose/rose_build_impl.h index 7780848b1..d0ed84dfa 100644 --- a/src/rose/rose_build_impl.h +++ b/src/rose/rose_build_impl.h @@ -343,7 +343,7 @@ class RoseLiteralMap { return {it->second, false}; } u32 id = verify_u32(lits.size()); - lits.push_back(lit); + lits.emplace_back(lit); lits_index.emplace(lit, id); return {id, true}; } diff --git a/src/rose/rose_build_infix.cpp b/src/rose/rose_build_infix.cpp index 80e125423..48c11c0f3 100644 --- a/src/rose/rose_build_infix.cpp +++ b/src/rose/rose_build_infix.cpp @@ -163,7 +163,7 @@ u32 findMaxLiteralMatches(const NGHolder &h, const set &lits) { } contractVertex(g, v, all_edges); - dead.push_back(v); + dead.emplace_back(v); } remove_vertices(dead, g); diff --git a/src/rose/rose_build_instructions.cpp b/src/rose/rose_build_instructions.cpp index c503f7311..0baaa7449 100644 --- a/src/rose/rose_build_instructions.cpp +++ b/src/rose/rose_build_instructions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019, Intel Corporation + * Copyright (c) 2017-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following 
conditions are met: @@ -131,8 +131,8 @@ void RoseInstrCheckLookaround::write(void *dest, RoseEngineBlob &blob, vector look_offsets; vector reaches; for (const auto &le : look) { - look_offsets.push_back(le.offset); - reaches.push_back(le.reach); + look_offsets.emplace_back(le.offset); + reaches.emplace_back(le.reach); } inst->look_index = blob.lookaround_cache.get_offset_of(look_offsets, blob); inst->reach_index = blob.lookaround_cache.get_offset_of(reaches, blob); @@ -162,6 +162,17 @@ void RoseInstrCheckMask32::write(void *dest, RoseEngineBlob &blob, inst->fail_jump = calc_jump(offset_map, this, target); } +void RoseInstrCheckMask64::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + copy(begin(and_mask), end(and_mask), inst->and_mask); + copy(begin(cmp_mask), end(cmp_mask), inst->cmp_mask); + inst->neg_mask = neg_mask; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + void RoseInstrCheckByte::write(void *dest, RoseEngineBlob &blob, const OffsetMap &offset_map) const { RoseInstrBase::write(dest, blob, offset_map); @@ -227,6 +238,36 @@ void RoseInstrCheckShufti32x16::write(void *dest, RoseEngineBlob &blob, inst->fail_jump = calc_jump(offset_map, this, target); } +void RoseInstrCheckShufti64x8::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + copy(begin(hi_mask), end(hi_mask), inst->hi_mask); + copy(begin(lo_mask), end(lo_mask), inst->lo_mask); + copy(begin(bucket_select_mask), end(bucket_select_mask), + inst->bucket_select_mask); + inst->neg_mask = neg_mask; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckShufti64x16::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + copy(begin(hi_mask_1), end(hi_mask_1), inst->hi_mask_1); + copy(begin(hi_mask_2), end(hi_mask_2), inst->hi_mask_2); + copy(begin(lo_mask_1), end(lo_mask_1), inst->lo_mask_1); + copy(begin(lo_mask_2), end(lo_mask_2), inst->lo_mask_2); + copy(begin(bucket_select_mask_hi), end(bucket_select_mask_hi), + inst->bucket_select_mask_hi); + copy(begin(bucket_select_mask_lo), end(bucket_select_mask_lo), + inst->bucket_select_mask_lo); + inst->neg_mask = neg_mask; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + void RoseInstrCheckInfix::write(void *dest, RoseEngineBlob &blob, const OffsetMap &offset_map) const { RoseInstrBase::write(dest, blob, offset_map); @@ -445,9 +486,9 @@ void RoseInstrSparseIterBegin::write(void *dest, RoseEngineBlob &blob, vector keys; vector jump_offsets; for (const auto &jump : jump_table) { - keys.push_back(jump.first); + keys.emplace_back(jump.first); assert(contains(offset_map, jump.second)); - jump_offsets.push_back(offset_map.at(jump.second)); + jump_offsets.emplace_back(offset_map.at(jump.second)); } auto iter = mmbBuildSparseIterator(keys, num_keys); @@ -548,11 +589,11 @@ void RoseInstrMultipathLookaround::write(void *dest, RoseEngineBlob &blob, bool done_offset = false; for (const auto &le : vle) { - reaches.back().push_back(le.reach); + reaches.back().emplace_back(le.reach); /* empty reaches don't have valid offsets */ if (!done_offset && le.reach.any()) { - look_offsets.push_back(le.offset); + look_offsets.emplace_back(le.offset); 
done_offset = true; } } diff --git a/src/rose/rose_build_instructions.h b/src/rose/rose_build_instructions.h index 306a4166c..f18f4a471 100644 --- a/src/rose/rose_build_instructions.h +++ b/src/rose/rose_build_instructions.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019, Intel Corporation + * Copyright (c) 2017-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -519,6 +519,43 @@ class RoseInstrCheckMask32 } }; +class RoseInstrCheckMask64 + : public RoseInstrBaseOneTarget { +public: + std::array and_mask; + std::array cmp_mask; + u64a neg_mask; + s32 offset; + const RoseInstruction *target; + + RoseInstrCheckMask64(std::array and_mask_in, + std::array cmp_mask_in, u64a neg_mask_in, + s32 offset_in, const RoseInstruction *target_in) + : and_mask(std::move(and_mask_in)), cmp_mask(std::move(cmp_mask_in)), + neg_mask(neg_mask_in), offset(offset_in), target(target_in) {} + bool operator==(const RoseInstrCheckMask64 &ri) const { + return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + target == ri.target; + } + + size_t hash() const override { + return hash_all(opcode, and_mask, cmp_mask, neg_mask, offset); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckMask64 &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + class RoseInstrCheckByte : public RoseInstrBaseOneTarget { +public: + std::array hi_mask; + std::array lo_mask; + std::array bucket_select_mask; + u64a neg_mask; + s32 offset; + const RoseInstruction *target; + + RoseInstrCheckShufti64x8(std::array hi_mask_in, + std::array lo_mask_in, + std::array bucket_select_mask_in, + u64a neg_mask_in, s32 offset_in, + const RoseInstruction *target_in) + : hi_mask(std::move(hi_mask_in)), lo_mask(std::move(lo_mask_in)), + bucket_select_mask(std::move(bucket_select_mask_in)), + neg_mask(neg_mask_in), offset(offset_in), target(target_in) {} + + bool operator==(const RoseInstrCheckShufti64x8 &ri) const { + return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask && + bucket_select_mask == ri.bucket_select_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + target == ri.target; + } + + size_t hash() const override { + return hash_all(opcode, hi_mask, lo_mask, bucket_select_mask, neg_mask, + offset); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckShufti64x8 &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask && + bucket_select_mask == ri.bucket_select_mask && + neg_mask == ri.neg_mask && offset == ri.offset && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + +class RoseInstrCheckShufti64x16 + : public RoseInstrBaseOneTarget { +public: + std::array hi_mask_1; + std::array hi_mask_2; + std::array lo_mask_1; + std::array lo_mask_2; + std::array bucket_select_mask_hi; + std::array bucket_select_mask_lo; + u64a neg_mask; + s32 offset; + const RoseInstruction *target; + + RoseInstrCheckShufti64x16(std::array hi_mask_1_in, + std::array hi_mask_2_in, + std::array lo_mask_1_in, + std::array lo_mask_2_in, + 
std::array bucket_select_mask_hi_in, + std::array bucket_select_mask_lo_in, + u64a neg_mask_in, s32 offset_in, + const RoseInstruction *target_in) + : hi_mask_1(std::move(hi_mask_1_in)), hi_mask_2(std::move(hi_mask_2_in)), + lo_mask_1(std::move(lo_mask_1_in)), lo_mask_2(std::move(lo_mask_2_in)), + bucket_select_mask_hi(std::move(bucket_select_mask_hi_in)), + bucket_select_mask_lo(std::move(bucket_select_mask_lo_in)), + neg_mask(neg_mask_in), offset(offset_in), target(target_in) {} + + bool operator==(const RoseInstrCheckShufti64x16 &ri) const { + return hi_mask_1 == ri.hi_mask_1 && hi_mask_2 == ri.hi_mask_2 && + lo_mask_1 == ri.lo_mask_1 && lo_mask_2 == ri.lo_mask_2 && + bucket_select_mask_hi == ri.bucket_select_mask_hi && + bucket_select_mask_lo == ri.bucket_select_mask_lo && + neg_mask == ri.neg_mask && offset == ri.offset && + target == ri.target; + } + + size_t hash() const override { + return hash_all(opcode, hi_mask_1, hi_mask_2, lo_mask_1, lo_mask_2, + bucket_select_mask_hi, bucket_select_mask_lo, neg_mask, + offset); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckShufti64x16 &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return hi_mask_1 == ri.hi_mask_1 && hi_mask_2 == ri.hi_mask_2 && + lo_mask_1 == ri.lo_mask_1 && lo_mask_2 == ri.lo_mask_2 && + bucket_select_mask_hi == ri.bucket_select_mask_hi && + bucket_select_mask_lo == ri.bucket_select_mask_lo && + neg_mask == ri.neg_mask && offset == ri.offset && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + class RoseInstrCheckInfix : public RoseInstrBaseOneTarget &lits, hwlm_group_t expected_groups, DEBUG_PRINTF("lit: '%s', nocase=%d, groups=0x%llx\n", escapeString(lit.s).c_str(), lit.nocase ? 1 : 0, lit.groups); - filtered_lits->push_back(&lit); + filtered_lits->emplace_back(&lit); } } @@ -440,6 +442,17 @@ void findForwardAccelScheme(const vector &lits, } const CharReach &cr = reach[min_offset]; +#ifdef HAVE_SVE2 + if (min_count <= 16) { + vermicelli16Build(cr, (u8 *)&aux->verm16.mask); + DEBUG_PRINTF("built verm16 for %s (%zu chars, offset %u)\n", + describeClass(cr).c_str(), cr.count(), min_offset); + aux->verm16.accel_type = ACCEL_VERM16; + aux->verm16.offset = verify_u8(min_offset); + return; + } +#endif // HAVE_SVE2 + if (-1 != shuftiBuildMasks(cr, (u8 *)&aux->shufti.lo, (u8 *)&aux->shufti.hi)) { DEBUG_PRINTF("built shufti for %s (%zu chars, offset %u)\n", diff --git a/src/rose/rose_build_lookaround.cpp b/src/rose/rose_build_lookaround.cpp index 7cc1c584d..88e8d4748 100644 --- a/src/rose/rose_build_lookaround.cpp +++ b/src/rose/rose_build_lookaround.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -58,7 +58,7 @@ static const u32 MAX_FWD_LEN = 64; static const u32 MAX_BACK_LEN = 64; /** \brief Max lookaround entries for a role. */ -static const u32 MAX_LOOKAROUND_ENTRIES = 16; +static const u32 MAX_LOOKAROUND_ENTRIES = 32; /** \brief We would rather have lookarounds with smaller reach than this. 
*/ static const u32 LOOKAROUND_WIDE_REACH = 200; @@ -279,13 +279,13 @@ void findForwardReach(const RoseGraph &g, const RoseVertex v, DEBUG_PRINTF("successor %zu has no leftfix\n", g[t].index); return; } - rose_look.push_back(map()); + rose_look.emplace_back(map()); getRoseForwardReach(g[t].left, g[e].rose_top, rose_look.back()); } if (g[v].suffix) { DEBUG_PRINTF("suffix engine\n"); - rose_look.push_back(map()); + rose_look.emplace_back(map()); getSuffixForwardReach(g[v].suffix, g[v].suffix.top, rose_look.back()); } @@ -319,7 +319,7 @@ void normalise(map &look) { vector dead; for (const auto &m : look) { if (m.second.all()) { - dead.push_back(m.first); + dead.emplace_back(m.first); } } erase_all(&look, dead); @@ -464,7 +464,7 @@ void findFloodReach(const RoseBuildImpl &tbi, const RoseVertex v, namespace { struct LookProto { LookProto(s32 offset_in, CharReach reach_in) - : offset(offset_in), reach(move(reach_in)) {} + : offset(offset_in), reach(std::move(reach_in)) {} s32 offset; CharReach reach; }; @@ -569,7 +569,7 @@ void normaliseLeftfix(map &look) { vector dead; for (const auto &m : look) { if (m.second.all() && m.first != earliest) { - dead.push_back(m.first); + dead.emplace_back(m.first); } } erase_all(&look, dead); @@ -617,7 +617,7 @@ void transToLookaround(const vector> &looks, s8 offset = verify_s8(m.first); lookaround.emplace_back(offset, m.second); } - lookarounds.push_back(lookaround); + lookarounds.emplace_back(lookaround); } } @@ -711,7 +711,7 @@ bool getTransientPrefixReach(const NGHolder &g, ReportID report, u32 lag, return true; } if (contains(g[v].reports, report)) { - curr.push_back(v); + curr.emplace_back(v); } } @@ -765,8 +765,8 @@ bool getTransientPrefixReach(const NGHolder &g, ReportID report, u32 lag, looks[idx][0 - i] = g[u].char_reach; total_len++; } else { - curr.push_back(u); - looks.push_back(looks[idx]); + curr.emplace_back(u); + looks.emplace_back(looks[idx]); (looks.back())[0 - i] = g[u].char_reach; total_len += looks.back().size(); } diff --git a/src/rose/rose_build_matchers.cpp b/src/rose/rose_build_matchers.cpp index 4fde4c441..96cdfbe5c 100644 --- a/src/rose/rose_build_matchers.cpp +++ b/src/rose/rose_build_matchers.cpp @@ -46,7 +46,6 @@ #include "util/compile_context.h" #include "util/compile_error.h" #include "util/dump_charclass.h" -#include "util/make_unique.h" #include "util/report.h" #include "util/report_manager.h" #include "util/verify_types.h" @@ -353,7 +352,7 @@ void findMoreLiteralMasks(RoseBuildImpl &build) { continue; } - candidates.push_back(id); + candidates.emplace_back(id); } for (const u32 &id : candidates) { @@ -739,7 +738,7 @@ void addFragmentLiteral(const RoseBuildImpl &build, MatcherProto &mp, const auto &groups = f.groups; - mp.lits.emplace_back(move(s_final), nocase, noruns, f.fragment_id, + mp.lits.emplace_back(std::move(s_final), nocase, noruns, f.fragment_id, groups, msk, cmp); } @@ -827,7 +826,7 @@ MatcherProto makeMatcherProto(const RoseBuildImpl &build, } } - used_lit_ids.push_back(id); + used_lit_ids.emplace_back(id); } if (used_lit_ids.empty()) { @@ -937,7 +936,7 @@ buildFloatingMatcherProto(const RoseBuildImpl &build, throw CompileError("Unable to generate literal matcher proto."); } - return ue2::make_unique(move(proto), mp.accel_lits); + return std::make_unique(std::move(proto), mp.accel_lits); } unique_ptr @@ -965,7 +964,7 @@ buildDelayRebuildMatcherProto(const RoseBuildImpl &build, throw CompileError("Unable to generate literal matcher proto."); } - return ue2::make_unique(move(proto), mp.accel_lits); + return 
std::make_unique(std::move(proto), mp.accel_lits); } unique_ptr @@ -1022,7 +1021,7 @@ buildSmallBlockMatcherProto(const RoseBuildImpl &build, throw CompileError("Unable to generate literal matcher proto."); } - return ue2::make_unique(move(proto), mp.accel_lits); + return std::make_unique(std::move(proto), mp.accel_lits); } unique_ptr @@ -1047,7 +1046,7 @@ buildEodAnchoredMatcherProto(const RoseBuildImpl &build, throw CompileError("Unable to generate literal matcher proto."); } - return ue2::make_unique(move(proto), mp.accel_lits); + return std::make_unique(std::move(proto), mp.accel_lits); } } // namespace ue2 diff --git a/src/rose/rose_build_merge.cpp b/src/rose/rose_build_merge.cpp index 5066dbd57..cddbb760b 100644 --- a/src/rose/rose_build_merge.cpp +++ b/src/rose/rose_build_merge.cpp @@ -239,7 +239,7 @@ bool dedupeLeftfixes(RoseBuildImpl &tbi) { continue; } - roses[RoseGroup(tbi, v)].push_back(v); + roses[RoseGroup(tbi, v)].emplace_back(v); } DEBUG_PRINTF("collected %zu rose groups\n", roses.size()); @@ -338,7 +338,7 @@ void dedupeSuffixes(RoseBuildImpl &tbi) { set &verts = suffix_map[s]; if (verts.empty()) { - part[make_pair(suffix_size_key(s), all_reports(s))].push_back(s); + part[make_pair(suffix_size_key(s), all_reports(s))].emplace_back(s); } verts.insert(v); } @@ -393,17 +393,17 @@ class Bouquet { void insert(const EngineRef &h, RoseVertex v) { typename BouquetMap::iterator f = bouquet.find(h); if (f == bouquet.end()) { - ordering.push_back(h); - bouquet[h].push_back(v); + ordering.emplace_back(h); + bouquet[h].emplace_back(v); } else { - f->second.push_back(v); + f->second.emplace_back(v); } } void insert(const EngineRef &h, const deque &verts) { typename BouquetMap::iterator f = bouquet.find(h); if (f == bouquet.end()) { - ordering.push_back(h); + ordering.emplace_back(h); bouquet.insert(make_pair(h, verts)); } else { f->second.insert(f->second.end(), verts.begin(), verts.end()); @@ -472,14 +472,14 @@ static void chunkBouquets(const Bouquet &in, deque> &out, const size_t chunk_size) { if (in.size() <= chunk_size) { - out.push_back(in); + out.emplace_back(in); return; } - out.push_back(Bouquet()); + out.emplace_back(Bouquet()); for (const auto &engine : in) { if (out.back().size() >= chunk_size) { - out.push_back(Bouquet()); + out.emplace_back(Bouquet()); } out.back().insert(engine, in.vertices(engine)); } @@ -820,7 +820,7 @@ bool checkPredDelays(const RoseBuildImpl &build, const VertexCont &v1, vector pred_rose_lits; pred_rose_lits.reserve(pred_lits.size()); for (const auto &p : pred_lits) { - pred_rose_lits.push_back(&build.literals.at(p)); + pred_rose_lits.emplace_back(&build.literals.at(p)); } for (auto v : v2) { @@ -1322,18 +1322,18 @@ template static void chunk(vector in, vector> *out, size_t chunk_size) { if (in.size() <= chunk_size) { - out->push_back(std::move(in)); + out->emplace_back(std::move(in)); return; } - out->push_back(vector()); + out->emplace_back(vector()); out->back().reserve(chunk_size); for (const auto &t : in) { if (out->back().size() >= chunk_size) { - out->push_back(vector()); + out->emplace_back(vector()); out->back().reserve(chunk_size); } - out->back().push_back(std::move(t)); + out->back().emplace_back(std::move(t)); } } @@ -1346,7 +1346,7 @@ insertion_ordered_map> get_eng_verts(RoseGraph &g) { continue; } assert(contains(all_reports(left), left.leftfix_report)); - eng_verts[left].push_back(v); + eng_verts[left].emplace_back(v); } return eng_verts; @@ -1437,24 +1437,12 @@ void mergeLeftfixesVariableLag(RoseBuildImpl &build) { 
assert(!parents.empty()); -#ifndef _WIN32 - engine_groups[MergeKey(left, parents)].push_back(left); -#else - // On windows, when passing MergeKey object into map 'engine_groups', - // it will not be copied, but will be freed along with - // engine_groups.clear(). - // If we construct MergeKey object on the stack, it will be destructed - // on its life cycle ending, then on engine_groups.clear(), which - // will cause is_block_type_valid() assertion error in MergeKey - // destructor. - MergeKey *mk = new MergeKey(left, parents); - engine_groups[*mk].push_back(left); -#endif + engine_groups[MergeKey(left, parents)].emplace_back(left); } vector> chunks; for (auto &raw_group : engine_groups | map_values) { - chunk(move(raw_group), &chunks, MERGE_GROUP_SIZE_MAX); + chunk(std::move(raw_group), &chunks, MERGE_GROUP_SIZE_MAX); } engine_groups.clear(); @@ -1523,7 +1511,7 @@ namespace { struct DedupeLeftKey { DedupeLeftKey(const RoseBuildImpl &build, flat_set> preds_in, const left_id &left) - : left_hash(hashLeftfix(left)), preds(move(preds_in)), + : left_hash(hashLeftfix(left)), preds(std::move(preds_in)), transient(contains(build.transient, left)) { } @@ -1611,7 +1599,7 @@ void dedupeLeftfixesVariableLag(RoseBuildImpl &build) { continue; } } - engine_groups[DedupeLeftKey(build, move(preds), left)].push_back(left); + engine_groups[DedupeLeftKey(build, std::move(preds), left)].emplace_back(left); } /* We don't bother chunking as we expect deduping to be successful if the @@ -1871,7 +1859,7 @@ void mergeNfaLeftfixes(RoseBuildImpl &tbi, LeftfixBouquet &roses) { } roses.insert(r1, verts2); - merged.push_back(r2); + merged.emplace_back(r2); if (num_vertices(*winner) >= small_merge_max_vertices(tbi.cc)) { DEBUG_PRINTF("h1 now has %zu vertices, proceeding to next\n", @@ -2050,17 +2038,17 @@ void mergeCastleLeftfixes(RoseBuildImpl &build) { continue; } - eng_verts[g[v].left].push_back(v); + eng_verts[g[v].left].emplace_back(v); } map> by_reach; for (const auto &left : eng_verts | map_keys) { - by_reach[left.castle()->reach()].push_back(left); + by_reach[left.castle()->reach()].emplace_back(left); } vector> chunks; for (auto &raw_group : by_reach | map_values) { - chunk(move(raw_group), &chunks, MERGE_CASTLE_GROUP_SIZE_MAX); + chunk(std::move(raw_group), &chunks, MERGE_CASTLE_GROUP_SIZE_MAX); } by_reach.clear(); @@ -2151,7 +2139,7 @@ void mergeSuffixes(RoseBuildImpl &tbi, SuffixBouquet &suffixes, g[v].suffix.graph = winner; } suffixes.insert(s1, verts2); - merged.push_back(s2); + merged.emplace_back(s2); if (num_vertices(*s1.graph()) >= small_merge_max_vertices(tbi.cc)) { DEBUG_PRINTF("h1 now has %zu vertices, proceeding to next\n", @@ -2324,7 +2312,7 @@ map chunkedNfaMerge(RoseBuildImpl &build, vector batch; for (auto it = begin(nfas), ite = end(nfas); it != ite; ++it) { - batch.push_back(*it); + batch.emplace_back(*it); assert((*it)->kind == NFA_OUTFIX); if (batch.size() == MERGE_GROUP_SIZE_MAX || next(it) == ite) { auto batch_merged = mergeNfaCluster(batch, &build.rm, build.cc); @@ -2441,7 +2429,7 @@ void pairwiseDfaMerge(vector &dfas, RawDfa *dfa_ptr = rdfa.get(); dfa_mapping[dfa_ptr] = dfa_mapping[*it]; dfa_mapping.erase(*it); - winner.proto = move(rdfa); + winner.proto = std::move(rdfa); mergeOutfixInfo(winner, victim); @@ -2463,7 +2451,7 @@ void chunkedDfaMerge(vector &dfas, vector out_dfas; vector chunk; for (auto it = begin(dfas), ite = end(dfas); it != ite; ++it) { - chunk.push_back(*it); + chunk.emplace_back(*it); if (chunk.size() >= DFA_CHUNK_SIZE_MAX || next(it) == ite) { 
pairwiseDfaMerge(chunk, dfa_mapping, outfixes, merge_func); out_dfas.insert(end(out_dfas), begin(chunk), end(chunk)); @@ -2542,7 +2530,7 @@ void mergeOutfixCombo(RoseBuildImpl &tbi, const ReportManager &rm, if (outfix.rdfa()) { auto *rdfa = outfix.rdfa(); - dfas.push_back(rdfa); + dfas.emplace_back(rdfa); dfa_mapping[rdfa] = it - tbi.outfixes.begin(); continue; } @@ -2557,8 +2545,8 @@ void mergeOutfixCombo(RoseBuildImpl &tbi, const ReportManager &rm, if (rdfa) { // Transform this outfix into a DFA and add it to the merge set. dfa_mapping[rdfa.get()] = it - tbi.outfixes.begin(); - dfas.push_back(rdfa.get()); - outfix.proto = move(rdfa); + dfas.emplace_back(rdfa.get()); + outfix.proto = std::move(rdfa); new_dfas++; } } @@ -2615,11 +2603,11 @@ void mergeOutfixes(RoseBuildImpl &tbi) { for (auto &outfix : tbi.outfixes) { if (outfix.rdfa()) { - dfas.push_back(outfix.rdfa()); + dfas.emplace_back(outfix.rdfa()); } else if (outfix.holder()) { - nfas.push_back(outfix.holder()); + nfas.emplace_back(outfix.holder()); } else if (outfix.haig()) { - som_dfas.push_back(outfix.haig()); + som_dfas.emplace_back(outfix.haig()); } } @@ -2805,9 +2793,9 @@ void mergeCastleSuffixes(RoseBuildImpl &build) { } if (!contains(eng_verts, c)) { - by_reach[c->reach()].push_back(c); + by_reach[c->reach()].emplace_back(c); } - eng_verts[c].push_back(v); + eng_verts[c].emplace_back(v); } for (auto &chunk : by_reach | map_values) { diff --git a/src/rose/rose_build_misc.cpp b/src/rose/rose_build_misc.cpp index 0b0e689c9..d3ff236d2 100644 --- a/src/rose/rose_build_misc.cpp +++ b/src/rose/rose_build_misc.cpp @@ -50,7 +50,6 @@ #include "util/container.h" #include "util/graph.h" #include "util/graph_range.h" -#include "util/make_unique.h" #include "util/order_check.h" #include "util/report_manager.h" #include "util/ue2string.h" @@ -234,7 +233,7 @@ unique_ptr makeRoseBuilder(ReportManager &rm, SmallWriteBuild &smwr, const CompileContext &cc, const BoundaryReports &boundary) { - return ue2::make_unique(rm, ssm, smwr, cc, boundary); + return std::make_unique(rm, ssm, smwr, cc, boundary); } bool roseIsPureLiteral(const RoseEngine *t) { @@ -375,7 +374,7 @@ u32 RoseBuildImpl::getLiteralId(const ue2_literal &s, u32 delay, bool inserted = m.second; if (inserted) { - literal_info.push_back(rose_literal_info()); + literal_info.emplace_back(rose_literal_info()); assert(literal_info.size() == id + 1); if (delay) { @@ -465,7 +464,7 @@ u32 RoseBuildImpl::getLiteralId(const ue2_literal &s, const vector &msk, bool inserted = m.second; if (inserted) { - literal_info.push_back(rose_literal_info()); + literal_info.emplace_back(rose_literal_info()); assert(literal_info.size() == id + 1); if (delay) { @@ -488,7 +487,7 @@ u32 RoseBuildImpl::getNewLiteralId() { assert(m.second); u32 id = m.first; - literal_info.push_back(rose_literal_info()); + literal_info.emplace_back(rose_literal_info()); assert(literal_info.size() == id + 1); literal_info[id].undelayed_id = id; diff --git a/src/rose/rose_build_program.cpp b/src/rose/rose_build_program.cpp index 501932c5c..1e0fe24b6 100644 --- a/src/rose/rose_build_program.cpp +++ b/src/rose/rose_build_program.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -95,7 +95,7 @@ OffsetMap makeOffsetMap(const RoseProgram &program, u32 *total_len) { } RoseProgram::RoseProgram() { - 
prog.push_back(make_unique()); + prog.emplace_back(std::make_unique()); } RoseProgram::~RoseProgram() = default; @@ -135,7 +135,7 @@ RoseProgram::iterator RoseProgram::insert(RoseProgram::iterator it, assert(it != end()); assert(prog.back()->code() == ROSE_INSTR_END); - return prog.insert(it, move(ri)); + return prog.insert(it, std::move(ri)); } RoseProgram::iterator RoseProgram::insert(RoseProgram::iterator it, @@ -183,7 +183,7 @@ void RoseProgram::add_before_end(RoseProgram &&block) { return; } - insert(prev(prog.end()), move(block)); + insert(prev(prog.end()), std::move(block)); } void RoseProgram::add_block(RoseProgram &&block) { @@ -204,6 +204,15 @@ void RoseProgram::add_block(RoseProgram &&block) { make_move_iterator(block.prog.end())); } +template +void RoseProgram::replace(Iter it, std::unique_ptr ri) { + assert(!prog.empty()); + + const RoseInstruction *old_ptr = it->get(); + *it = std::move(ri); + update_targets(prog.begin(), prog.end(), old_ptr, it->get()); +} + bytecode_ptr writeProgram(RoseEngineBlob &blob, const RoseProgram &program) { u32 total_len = 0; @@ -297,28 +306,28 @@ void addEnginesEodProgram(u32 eodNfaIterOffset, RoseProgram &program) { } RoseProgram block; - block.add_before_end(make_unique(eodNfaIterOffset)); - program.add_block(move(block)); + block.add_before_end(std::make_unique(eodNfaIterOffset)); + program.add_block(std::move(block)); } void addSuffixesEodProgram(RoseProgram &program) { RoseProgram block; - block.add_before_end(make_unique()); - program.add_block(move(block)); + block.add_before_end(std::make_unique()); + program.add_block(std::move(block)); } void addMatcherEodProgram(RoseProgram &program) { RoseProgram block; - block.add_before_end(make_unique()); - program.add_block(move(block)); + block.add_before_end(std::make_unique()); + program.add_block(std::move(block)); } void addFlushCombinationProgram(RoseProgram &program) { - program.add_before_end(make_unique()); + program.add_before_end(std::make_unique()); } void addLastFlushCombinationProgram(RoseProgram &program) { - program.add_before_end(make_unique()); + program.add_before_end(std::make_unique()); } static @@ -342,15 +351,15 @@ void makeRoleCheckLeftfix(const RoseBuildImpl &build, unique_ptr ri; if (is_prefix) { - ri = make_unique(lni.queue, build.g[v].left.lag, + ri = std::make_unique(lni.queue, build.g[v].left.lag, build.g[v].left.leftfix_report, end_inst); } else { - ri = make_unique(lni.queue, build.g[v].left.lag, + ri = std::make_unique(lni.queue, build.g[v].left.lag, build.g[v].left.leftfix_report, end_inst); } - program.add_before_end(move(ri)); + program.add_before_end(std::move(ri)); } static @@ -384,8 +393,8 @@ void makeAnchoredLiteralDelay(const RoseBuildImpl &build, u32 anch_id = prog_build.anchored_programs.at(lit_id); const auto *end_inst = program.end_instruction(); - auto ri = make_unique(groups, anch_id, end_inst); - program.add_before_end(move(ri)); + auto ri = std::make_unique(groups, anch_id, end_inst); + program.add_before_end(std::move(ri)); } static @@ -393,19 +402,19 @@ void makeDedupe(const ReportManager &rm, const Report &report, RoseProgram &program) { const auto *end_inst = program.end_instruction(); auto ri = - make_unique(report.quashSom, rm.getDkey(report), + std::make_unique(report.quashSom, rm.getDkey(report), report.offsetAdjust, end_inst); - program.add_before_end(move(ri)); + program.add_before_end(std::move(ri)); } static void makeDedupeSom(const ReportManager &rm, const Report &report, RoseProgram &program) { const auto *end_inst = 
program.end_instruction(); - auto ri = make_unique(report.quashSom, + auto ri = std::make_unique(report.quashSom, rm.getDkey(report), report.offsetAdjust, end_inst); - program.add_before_end(move(ri)); + program.add_before_end(std::move(ri)); } static @@ -428,7 +437,7 @@ void makeCatchup(const ReportManager &rm, bool needs_catchup, return; } - program.add_before_end(make_unique()); + program.add_before_end(std::make_unique()); } static @@ -511,13 +520,13 @@ void addLogicalSetRequired(const Report &report, ReportManager &rm, return; } // set matching status of current lkey - auto risl = make_unique(report.lkey, + auto risl = std::make_unique(report.lkey, report.offsetAdjust); - program.add_before_end(move(risl)); + program.add_before_end(std::move(risl)); // set current lkey's corresponding ckeys active, pending to check for (auto ckey : rm.getRelateCKeys(report.lkey)) { - auto risc = make_unique(ckey); - program.add_before_end(move(risc)); + auto risc = std::make_unique(ckey); + program.add_before_end(std::move(risc)); } } @@ -532,37 +541,37 @@ void makeReport(const RoseBuildImpl &build, const ReportID id, // Handle min/max offset checks. if (report.minOffset > 0 || report.maxOffset < MAX_OFFSET) { - auto ri = make_unique(report.minOffset, + auto ri = std::make_unique(report.minOffset, report.maxOffset, end_inst); - report_block.add_before_end(move(ri)); + report_block.add_before_end(std::move(ri)); } // If this report has an exhaustion key, we can check it in the program // rather than waiting until we're in the callback adaptor. if (report.ekey != INVALID_EKEY) { - auto ri = make_unique(report.ekey, end_inst); - report_block.add_before_end(move(ri)); + auto ri = std::make_unique(report.ekey, end_inst); + report_block.add_before_end(std::move(ri)); } // External SOM reports that aren't passthrough need their SOM value // calculated. if (isExternalSomReport(report) && report.type != EXTERNAL_CALLBACK_SOM_PASS) { - auto ri = make_unique(); + auto ri = std::make_unique(); writeSomOperation(report, &ri->som); - report_block.add_before_end(move(ri)); + report_block.add_before_end(std::move(ri)); } // Min length constraint. 
if (report.minLength > 0) { assert(build.hasSom); - auto ri = make_unique( + auto ri = std::make_unique( report.offsetAdjust, report.minLength, end_inst); - report_block.add_before_end(move(ri)); + report_block.add_before_end(std::move(ri)); } if (report.quashSom) { - report_block.add_before_end(make_unique()); + report_block.add_before_end(std::make_unique()); } switch (report.type) { @@ -578,7 +587,7 @@ void makeReport(const RoseBuildImpl &build, const ReportID id, if (needs_dedupe) { if (!report.quiet) { report_block.add_before_end( - make_unique( + std::make_unique( report.quashSom, build.rm.getDkey(report), report.onmatch, report.offsetAdjust, end_inst)); } else { @@ -587,7 +596,7 @@ void makeReport(const RoseBuildImpl &build, const ReportID id, } else { if (!report.quiet) { report_block.add_before_end( - make_unique( + std::make_unique( report.onmatch, report.offsetAdjust)); } } @@ -597,28 +606,28 @@ void makeReport(const RoseBuildImpl &build, const ReportID id, } if (!report.quiet) { report_block.add_before_end( - make_unique( + std::make_unique( report.onmatch, report.offsetAdjust, report.ekey)); } else { report_block.add_before_end( - make_unique(report.ekey)); + std::make_unique(report.ekey)); } } } else { // has_som makeDedupeSom(build.rm, report, report_block); if (report.ekey == INVALID_EKEY) { if (!report.quiet) { - report_block.add_before_end(make_unique( + report_block.add_before_end(std::make_unique( report.onmatch, report.offsetAdjust)); } } else { if (!report.quiet) { report_block.add_before_end( - make_unique( + std::make_unique( report.onmatch, report.offsetAdjust, report.ekey)); } else { report_block.add_before_end( - make_unique(report.ekey)); + std::make_unique(report.ekey)); } } } @@ -639,17 +648,17 @@ void makeReport(const RoseBuildImpl &build, const ReportID id, addFlushCombinationProgram(report_block); } if (has_som) { - auto ri = make_unique(); + auto ri = std::make_unique(); writeSomOperation(report, &ri->som); - report_block.add_before_end(move(ri)); + report_block.add_before_end(std::move(ri)); } else { - auto ri = make_unique(); + auto ri = std::make_unique(); writeSomOperation(report, &ri->som); - report_block.add_before_end(move(ri)); + report_block.add_before_end(std::move(ri)); } break; case INTERNAL_ROSE_CHAIN: { - report_block.add_before_end(make_unique( + report_block.add_before_end(std::make_unique( report.onmatch, report.topSquashDistance)); break; } @@ -663,17 +672,17 @@ void makeReport(const RoseBuildImpl &build, const ReportID id, makeDedupeSom(build.rm, report, report_block); if (report.ekey == INVALID_EKEY) { if (!report.quiet) { - report_block.add_before_end(make_unique( + report_block.add_before_end(std::make_unique( report.onmatch, report.offsetAdjust)); } } else { if (!report.quiet) { report_block.add_before_end( - make_unique( + std::make_unique( report.onmatch, report.offsetAdjust, report.ekey)); } else { report_block.add_before_end( - make_unique(report.ekey)); + std::make_unique(report.ekey)); } } addLogicalSetRequired(report, build.rm, report_block); @@ -685,17 +694,17 @@ void makeReport(const RoseBuildImpl &build, const ReportID id, makeDedupeSom(build.rm, report, report_block); if (report.ekey == INVALID_EKEY) { if (!report.quiet) { - report_block.add_before_end(make_unique( + report_block.add_before_end(std::make_unique( report.onmatch, report.offsetAdjust)); } } else { if (!report.quiet) { report_block.add_before_end( - make_unique( + std::make_unique( report.onmatch, report.offsetAdjust, report.ekey)); } else { 
report_block.add_before_end( - make_unique(report.ekey)); + std::make_unique(report.ekey)); } } addLogicalSetRequired(report, build.rm, report_block); @@ -706,7 +715,7 @@ void makeReport(const RoseBuildImpl &build, const ReportID id, throw CompileError("Unable to generate bytecode."); } - program.add_block(move(report_block)); + program.add_block(std::move(report_block)); } static @@ -722,11 +731,11 @@ void makeRoleReports(const RoseBuildImpl &build, assert(contains(leftfix_info, v)); const left_build_info &lni = leftfix_info.at(v); program.add_before_end( - make_unique(lni.queue, g[v].left.lag)); + std::make_unique(lni.queue, g[v].left.lag)); report_som = true; } else if (g[v].som_adjust) { program.add_before_end( - make_unique(g[v].som_adjust)); + std::make_unique(g[v].som_adjust)); report_som = true; } @@ -736,7 +745,7 @@ void makeRoleReports(const RoseBuildImpl &build, for (ReportID id : g[v].reports) { makeReport(build, id, report_som, report_block); } - program.add_before_end(move(report_block)); + program.add_before_end(std::move(report_block)); } static @@ -748,7 +757,7 @@ void makeRoleSetState(const unordered_map &roleStateIndices, if (it == end(roleStateIndices)) { return; } - program.add_before_end(make_unique(it->second)); + program.add_before_end(std::make_unique(it->second)); } static @@ -772,7 +781,7 @@ void makePushDelayedInstructions(const RoseLiteralMap &literals, }); for (const auto &ri : delay_instructions) { - program.add_before_end(make_unique(ri)); + program.add_before_end(std::make_unique(ri)); } } @@ -801,13 +810,13 @@ void makeCheckLiteralInstruction(const rose_literal_id &lit, const auto *end_inst = program.end_instruction(); unique_ptr ri; if (lit.s.any_nocase()) { - ri = make_unique(lit.s.get_string(), + ri = std::make_unique(lit.s.get_string(), end_inst); } else { - ri = make_unique(lit.s.get_string(), + ri = std::make_unique(lit.s.get_string(), end_inst); } - program.add_before_end(move(ri)); + program.add_before_end(std::move(ri)); return; } @@ -820,12 +829,12 @@ void makeCheckLiteralInstruction(const rose_literal_id &lit, const auto *end_inst = program.end_instruction(); unique_ptr ri; if (lit.s.any_nocase()) { - ri = make_unique(lit.s.get_string(), + ri = std::make_unique(lit.s.get_string(), end_inst); } else { - ri = make_unique(lit.s.get_string(), end_inst); + ri = std::make_unique(lit.s.get_string(), end_inst); } - program.add_before_end(move(ri)); + program.add_before_end(std::move(ri)); } static @@ -840,8 +849,8 @@ void makeRoleCheckNotHandled(ProgramBuild &prog_build, RoseVertex v, } const auto *end_inst = program.end_instruction(); - auto ri = make_unique(handled_key, end_inst); - program.add_before_end(move(ri)); + auto ri = std::make_unique(handled_key, end_inst); + program.add_before_end(std::move(ri)); } static @@ -889,7 +898,7 @@ void makeRoleCheckBounds(const RoseBuildImpl &build, RoseVertex v, const auto *end_inst = program.end_instruction(); program.add_before_end( - make_unique(min_bound, max_bound, end_inst)); + std::make_unique(min_bound, max_bound, end_inst)); } static @@ -924,7 +933,7 @@ void makeRoleGroups(const RoseGraph &g, ProgramBuild &prog_build, return; } - program.add_before_end(make_unique(groups)); + program.add_before_end(std::make_unique(groups)); } static @@ -968,9 +977,9 @@ bool makeRoleByte(const vector &look, RoseProgram &program) { s32 checkbyte_offset = verify_s32(entry.offset); DEBUG_PRINTF("CHECK BYTE offset=%d\n", checkbyte_offset); const auto *end_inst = program.end_instruction(); - auto ri = 
make_unique(andmask_u8, cmpmask_u8, flip, + auto ri = std::make_unique(andmask_u8, cmpmask_u8, flip, checkbyte_offset, end_inst); - program.add_before_end(move(ri)); + program.add_before_end(std::move(ri)); return true; } return false; @@ -1000,9 +1009,9 @@ bool makeRoleMask(const vector &look, RoseProgram &program) { DEBUG_PRINTF("CHECK MASK and_mask=%llx cmp_mask=%llx\n", and_mask, cmp_mask); const auto *end_inst = program.end_instruction(); - auto ri = make_unique(and_mask, cmp_mask, neg_mask, + auto ri = std::make_unique(and_mask, cmp_mask, neg_mask, base_offset, end_inst); - program.add_before_end(move(ri)); + program.add_before_end(std::move(ri)); return true; } return false; @@ -1055,9 +1064,52 @@ bool makeRoleMask32(const vector &look, DEBUG_PRINTF("base_offset %d\n", base_offset); const auto *end_inst = program.end_instruction(); - auto ri = make_unique(and_mask, cmp_mask, neg_mask, + auto ri = std::make_unique(and_mask, cmp_mask, neg_mask, base_offset, end_inst); - program.add_before_end(move(ri)); + program.add_before_end(std::move(ri)); + return true; +} + +static +bool makeRoleMask64(const vector &look, + RoseProgram &program, const target_t &target) { + if (!target.has_avx512()) { + return false; + } + + if (look.back().offset >= look.front().offset + 64) { + return false; + } + s32 base_offset = verify_s32(look.front().offset); + array and_mask, cmp_mask; + and_mask.fill(0); + cmp_mask.fill(0); + u64a neg_mask = 0; + for (const auto &entry : look) { + u8 andmask_u8, cmpmask_u8, flip; + if (!checkReachWithFlip(entry.reach, andmask_u8, cmpmask_u8, flip)) { + return false; + } + u32 shift = entry.offset - base_offset; + assert(shift < 64); + and_mask[shift] = andmask_u8; + cmp_mask[shift] = cmpmask_u8; + if (flip) { + neg_mask |= 1ULL << shift; + } + } + + DEBUG_PRINTF("and_mask %s\n", + convertMaskstoString(and_mask.data(), 64).c_str()); + DEBUG_PRINTF("cmp_mask %s\n", + convertMaskstoString(cmp_mask.data(), 64).c_str()); + DEBUG_PRINTF("neg_mask %llx\n", neg_mask); + DEBUG_PRINTF("base_offset %d\n", base_offset); + + const auto *end_inst = program.end_instruction(); + auto ri = std::make_unique(and_mask, cmp_mask, neg_mask, + base_offset, end_inst); + program.add_before_end(std::move(ri)); return true; } @@ -1084,6 +1136,7 @@ void getAllBuckets(const vector &look, } else { neg_mask ^= 1ULL << (entry.offset - base_offset); } + map lo2hi; // We treat Ascii Table as a 16x16 grid. // Push every row in cr into lo2hi and mark the row number. 
@@ -1098,7 +1151,7 @@ void getAllBuckets(const vector &look, } for (const auto &it : lo2hi) { u32 hi_lo = (it.second << 16) | it.first; - buckets[hi_lo].push_back(entry.offset); + buckets[hi_lo].emplace_back(entry.offset); } } } @@ -1191,7 +1244,7 @@ makeCheckShufti16x8(u32 offset_range, u8 bucket_idx, copy(hi_mask.begin(), hi_mask.begin() + 16, nib_mask.begin() + 16); copy(bucket_select_mask.begin(), bucket_select_mask.begin() + 16, bucket_select_mask_16.begin()); - return make_unique + return std::make_unique (nib_mask, bucket_select_mask_16, neg_mask & 0xffff, base_offset, end_inst); } @@ -1211,7 +1264,7 @@ makeCheckShufti32x8(u32 offset_range, u8 bucket_idx, array lo_mask_16; copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_16.begin()); copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_16.begin()); - return make_unique + return std::make_unique (hi_mask_16, lo_mask_16, bucket_select_mask, neg_mask, base_offset, end_inst); } @@ -1233,10 +1286,11 @@ makeCheckShufti16x16(u32 offset_range, u8 bucket_idx, bucket_select_mask_32.begin()); copy(bucket_select_mask_hi.begin(), bucket_select_mask_hi.begin() + 16, bucket_select_mask_32.begin() + 16); - return make_unique + return std::make_unique (hi_mask, lo_mask, bucket_select_mask_32, neg_mask & 0xffff, base_offset, end_inst); } + static unique_ptr makeCheckShufti32x16(u32 offset_range, u8 bucket_idx, @@ -1249,16 +1303,89 @@ makeCheckShufti32x16(u32 offset_range, u8 bucket_idx, return nullptr; } - return make_unique + return std::make_unique (hi_mask, lo_mask, bucket_select_mask_hi, bucket_select_mask_lo, neg_mask, base_offset, end_inst); } static -bool makeRoleShufti(const vector &look, RoseProgram &program) { +unique_ptr +makeCheckShufti64x8(u32 offset_range, u8 bucket_idx, + const array &hi_mask, const array &lo_mask, + const array &bucket_select_mask, + u64a neg_mask, s32 base_offset, + const RoseInstruction *end_inst) { + if (offset_range > 64 || bucket_idx > 8) { + return nullptr; + } + + array hi_mask_64; + array lo_mask_64; + copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_64.begin()); + copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_64.begin() + 16); + copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_64.begin() + 32); + copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_64.begin() + 48); + copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_64.begin()); + copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_64.begin() + 16); + copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_64.begin() + 32); + copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_64.begin() + 48); + + return std::make_unique + (hi_mask_64, lo_mask_64, bucket_select_mask, + neg_mask, base_offset, end_inst); +} + +static +unique_ptr +makeCheckShufti64x16(u32 offset_range, u8 bucket_idx, + const array &hi_mask, const array &lo_mask, + const array &bucket_select_mask_lo, + const array &bucket_select_mask_hi, + u64a neg_mask, s32 base_offset, + const RoseInstruction *end_inst) { + if (offset_range > 64 || bucket_idx > 16) { + return nullptr; + } + + array hi_mask_1; + array hi_mask_2; + array lo_mask_1; + array lo_mask_2; + + copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_1.begin()); + copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_1.begin() + 16); + copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_1.begin() + 32); + copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_1.begin() + 48); + copy(hi_mask.begin() + 16, hi_mask.begin() + 32, hi_mask_2.begin()); + copy(hi_mask.begin() + 16, hi_mask.begin() + 32, hi_mask_2.begin() + 16); + 
copy(hi_mask.begin() + 16, hi_mask.begin() + 32, hi_mask_2.begin() + 32); + copy(hi_mask.begin() + 16, hi_mask.begin() + 32, hi_mask_2.begin() + 48); + + copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_1.begin()); + copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_1.begin() + 16); + copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_1.begin() + 32); + copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_1.begin() + 48); + copy(lo_mask.begin() + 16, lo_mask.begin() + 32, lo_mask_2.begin()); + copy(lo_mask.begin() + 16, lo_mask.begin() + 32, lo_mask_2.begin() + 16); + copy(lo_mask.begin() + 16, lo_mask.begin() + 32, lo_mask_2.begin() + 32); + copy(lo_mask.begin() + 16, lo_mask.begin() + 32, lo_mask_2.begin() + 48); + + return std::make_unique + (hi_mask_1, hi_mask_2, lo_mask_1, lo_mask_2, bucket_select_mask_hi, + bucket_select_mask_lo, neg_mask, base_offset, end_inst); +} +static +bool makeRoleShufti(const vector &look, RoseProgram &program, + const target_t &target) { + s32 offset_limit; + if (target.has_avx512()) { + offset_limit = 64; + } else { + offset_limit = 32; + } s32 base_offset = verify_s32(look.front().offset); - if (look.back().offset >= base_offset + 32) { + if (look.back().offset >= base_offset + offset_limit) { return false; } @@ -1266,17 +1393,40 @@ bool makeRoleShufti(const vector &look, RoseProgram &program) { u64a neg_mask_64; array hi_mask; array lo_mask; + array bucket_select_hi_64; // for AVX512 + array bucket_select_lo_64; // for AVX512 array bucket_select_hi; array bucket_select_lo; hi_mask.fill(0); lo_mask.fill(0); + bucket_select_hi_64.fill(0); + bucket_select_lo_64.fill(0); bucket_select_hi.fill(0); // will not be used in 16x8 and 32x8. bucket_select_lo.fill(0); - if (!getShuftiMasks(look, hi_mask, lo_mask, bucket_select_hi.data(), - bucket_select_lo.data(), neg_mask_64, bucket_idx, 32)) { - return false; + if (target.has_avx512()) { + if (!getShuftiMasks(look, hi_mask, lo_mask, bucket_select_hi_64.data(), + bucket_select_lo_64.data(), neg_mask_64, bucket_idx, + 32)) { + return false; + } + copy(bucket_select_hi_64.begin(), bucket_select_hi_64.begin() + 32, + bucket_select_hi.begin()); + copy(bucket_select_lo_64.begin(), bucket_select_lo_64.begin() + 32, + bucket_select_lo.begin()); + + DEBUG_PRINTF("bucket_select_hi_64 %s\n", + convertMaskstoString(bucket_select_hi_64.data(), 64).c_str()); + DEBUG_PRINTF("bucket_select_lo_64 %s\n", + convertMaskstoString(bucket_select_lo_64.data(), 64).c_str()); + } else { + if (!getShuftiMasks(look, hi_mask, lo_mask, bucket_select_hi.data(), + bucket_select_lo.data(), neg_mask_64, bucket_idx, + 32)) { + return false; + } } + u32 neg_mask = (u32)neg_mask_64; DEBUG_PRINTF("hi_mask %s\n", @@ -1299,6 +1449,13 @@ bool makeRoleShufti(const vector &look, RoseProgram &program) { bucket_select_lo, neg_mask, base_offset, end_inst); } + if (target.has_avx512()) { + if (!ri) { + ri = makeCheckShufti64x8(offset_range, bucket_idx, hi_mask, lo_mask, + bucket_select_lo_64, neg_mask_64, + base_offset, end_inst); + } + } if (!ri) { ri = makeCheckShufti16x16(offset_range, bucket_idx, hi_mask, lo_mask, bucket_select_lo, bucket_select_hi, @@ -1309,8 +1466,15 @@ bool makeRoleShufti(const vector &look, RoseProgram &program) { bucket_select_lo, bucket_select_hi, neg_mask, base_offset, end_inst); } + if (target.has_avx512()) { + if (!ri) { + ri = makeCheckShufti64x16(offset_range, bucket_idx, hi_mask, lo_mask, + bucket_select_lo_64, bucket_select_hi_64, + neg_mask_64, base_offset, end_inst); + } + } assert(ri); - program.add_before_end(move(ri)); 
+ program.add_before_end(std::move(ri)); return true; } @@ -1321,7 +1485,7 @@ bool makeRoleShufti(const vector &look, RoseProgram &program) { */ static void makeLookaroundInstruction(const vector &look, - RoseProgram &program) { + RoseProgram &program, const target_t &target) { assert(!look.empty()); if (makeRoleByte(look, program)) { @@ -1331,9 +1495,9 @@ void makeLookaroundInstruction(const vector &look, if (look.size() == 1) { s8 offset = look.begin()->offset; const CharReach &reach = look.begin()->reach; - auto ri = make_unique(offset, reach, + auto ri = std::make_unique(offset, reach, program.end_instruction()); - program.add_before_end(move(ri)); + program.add_before_end(std::move(ri)); return; } @@ -1345,13 +1509,17 @@ void makeLookaroundInstruction(const vector &look, return; } - if (makeRoleShufti(look, program)) { + if (makeRoleMask64(look, program, target)) { + return; + } + + if (makeRoleShufti(look, program, target)) { return; } - auto ri = make_unique(look, + auto ri = std::make_unique(look, program.end_instruction()); - program.add_before_end(move(ri)); + program.add_before_end(std::move(ri)); } static @@ -1386,7 +1554,7 @@ void makeCheckLitMaskInstruction(const RoseBuildImpl &build, u32 lit_id, return; // all caseful chars handled by HWLM mask. } - makeLookaroundInstruction(look, program); + makeLookaroundInstruction(look, program, build.cc.target_info); } static @@ -1425,7 +1593,7 @@ void makeCheckLitEarlyInstruction(const RoseBuildImpl &build, u32 lit_id, DEBUG_PRINTF("adding lit early check, min_offset=%u\n", min_offset); const auto *end = prog.end_instruction(); - prog.add_before_end(make_unique(min_offset, end)); + prog.add_before_end(std::make_unique(min_offset, end)); } static @@ -1436,7 +1604,7 @@ void makeGroupCheckInstruction(const RoseBuildImpl &build, u32 lit_id, if (!info.group_mask) { return; } - prog.add_before_end(make_unique(info.group_mask)); + prog.add_before_end(std::make_unique(info.group_mask)); } static @@ -1603,33 +1771,33 @@ bool makeRoleMultipathShufti(const vector> &multi_look, copy(begin(lo_mask), begin(lo_mask) + 16, nib_mask.begin()); copy(begin(hi_mask), begin(hi_mask) + 16, nib_mask.begin() + 16); - auto ri = make_unique + auto ri = std::make_unique (nib_mask, bucket_select_lo, data_select_mask, hi_bits_mask, lo_bits_mask, neg_mask, base_offset, last_start, end_inst); - program.add_before_end(move(ri)); + program.add_before_end(std::move(ri)); } else if (multi_len == 32) { neg_mask &= 0xffffffff; assert(!(hi_bits_mask & ~0xffffffffULL)); assert(!(lo_bits_mask & ~0xffffffffULL)); if (bit_index <= 8) { - auto ri = make_unique + auto ri = std::make_unique (hi_mask, lo_mask, bucket_select_lo, data_select_mask, hi_bits_mask, lo_bits_mask, neg_mask, base_offset, last_start, end_inst); - program.add_before_end(move(ri)); + program.add_before_end(std::move(ri)); } else { - auto ri = make_unique + auto ri = std::make_unique (hi_mask, lo_mask, bucket_select_hi, bucket_select_lo, data_select_mask, hi_bits_mask, lo_bits_mask, neg_mask, base_offset, last_start, end_inst); - program.add_before_end(move(ri)); + program.add_before_end(std::move(ri)); } } else { - auto ri = make_unique + auto ri = std::make_unique (hi_mask, lo_mask, bucket_select_lo, data_select_mask, hi_bits_mask, lo_bits_mask, neg_mask, base_offset, last_start, end_inst); - program.add_before_end(move(ri)); + program.add_before_end(std::move(ri)); } return true; } @@ -1697,10 +1865,10 @@ void makeRoleMultipathLookaround(const vector> &multi_look, ordered_look.emplace_back(multi_entry); } - 
auto ri = make_unique(move(ordered_look), + auto ri = std::make_unique(std::move(ordered_look), last_start, start_mask, program.end_instruction()); - program.add_before_end(move(ri)); + program.add_before_end(std::move(ri)); } static @@ -1725,12 +1893,12 @@ void makeRoleLookaround(const RoseBuildImpl &build, vector look; vector look_more; if (!looks.empty()) { - look = move(looks.front()); + look = std::move(looks.front()); } findLookaroundMasks(build, v, look_more); mergeLookaround(look, look_more); if (!look.empty()) { - makeLookaroundInstruction(look, program); + makeLookaroundInstruction(look, program, build.cc.target_info); } return; } @@ -1773,7 +1941,7 @@ void makeRoleSuffix(const RoseBuildImpl &build, event = MQE_TOP; } - prog.add_before_end(make_unique(queue, event)); + prog.add_before_end(std::make_unique(queue, event)); } static @@ -1786,7 +1954,7 @@ void addInfixTriggerInstructions(vector triggers, }); for (const auto &ti : triggers) { prog.add_before_end( - make_unique(ti.cancel, ti.queue, ti.event)); + std::make_unique(ti.cancel, ti.queue, ti.event)); } } @@ -1833,7 +2001,7 @@ void makeRoleInfixTriggers(const RoseBuildImpl &build, triggers.emplace_back(g[e].rose_cancel_prev_top, lbi.queue, top); } - addInfixTriggerInstructions(move(triggers), program); + addInfixTriggerInstructions(std::move(triggers), program); } @@ -1880,7 +2048,7 @@ static void addCheckOnlyEodInstruction(RoseProgram &prog) { DEBUG_PRINTF("only at eod\n"); const auto *end_inst = prog.end_instruction(); - prog.add_before_end(make_unique(end_inst)); + prog.add_before_end(std::make_unique(end_inst)); } static @@ -1895,7 +2063,7 @@ void makeRoleEagerEodReports(const RoseBuildImpl &build, RoseProgram block; makeRoleReports(build, leftfix_info, needs_catchup, target(e, build.g), block); - eod_program.add_block(move(block)); + eod_program.add_block(std::move(block)); } } @@ -1909,7 +2077,7 @@ void makeRoleEagerEodReports(const RoseBuildImpl &build, addCheckOnlyEodInstruction(program); } - program.add_before_end(move(eod_program)); + program.add_before_end(std::move(eod_program)); } /** Makes a program for a role/vertex given a specific pred/in_edge. */ @@ -1956,33 +2124,33 @@ RoseProgram makeRoleProgram(const RoseBuildImpl &build, RoseProgram reports_block; makeRoleReports(build, leftfix_info, prog_build.needs_catchup, v, reports_block); - effects_block.add_block(move(reports_block)); + effects_block.add_block(std::move(reports_block)); RoseProgram infix_block; makeRoleInfixTriggers(build, leftfix_info, engine_info_by_queue, v, infix_block); - effects_block.add_block(move(infix_block)); + effects_block.add_block(std::move(infix_block)); // Note: SET_GROUPS instruction must be after infix triggers, as an infix // going dead may switch off groups. RoseProgram groups_block; makeRoleGroups(build.g, prog_build, v, groups_block); - effects_block.add_block(move(groups_block)); + effects_block.add_block(std::move(groups_block)); RoseProgram suffix_block; makeRoleSuffix(build, suffixes, engine_info_by_queue, v, suffix_block); - effects_block.add_block(move(suffix_block)); + effects_block.add_block(std::move(suffix_block)); RoseProgram state_block; makeRoleSetState(roleStateIndices, v, state_block); - effects_block.add_block(move(state_block)); + effects_block.add_block(std::move(state_block)); // Note: EOD eager reports may generate a CHECK_ONLY_EOD instruction (if // the program doesn't have one already). 
RoseProgram eod_block; makeRoleEagerEodReports(build, leftfix_info, prog_build.needs_catchup, v, eod_block); - effects_block.add_block(move(eod_block)); + effects_block.add_block(std::move(eod_block)); /* a 'ghost role' may do nothing if we know that its groups are already set * - in this case we can avoid producing a program at all. */ @@ -1990,7 +2158,7 @@ RoseProgram makeRoleProgram(const RoseBuildImpl &build, return {}; } - program.add_before_end(move(effects_block)); + program.add_before_end(std::move(effects_block)); return program; } @@ -2005,7 +2173,7 @@ void makeGroupSquashInstruction(const RoseBuildImpl &build, u32 lit_id, DEBUG_PRINTF("squashes 0x%llx\n", info.group_mask); assert(info.group_mask); /* Note: group_mask is negated. */ - prog.add_before_end(make_unique(~info.group_mask)); + prog.add_before_end(std::make_unique(~info.group_mask)); } namespace { @@ -2036,7 +2204,7 @@ RoseProgram assembleProgramBlocks(vector &&blocks_in) { continue; } - blocks.push_back(move(block)); + blocks.emplace_back(std::move(block)); seen.emplace(blocks.back()); } @@ -2050,11 +2218,11 @@ RoseProgram assembleProgramBlocks(vector &&blocks_in) { * only set if a state has been. */ if (!prog.empty() && reads_work_done_flag(block)) { RoseProgram clear_block; - clear_block.add_before_end(make_unique()); - prog.add_block(move(clear_block)); + clear_block.add_before_end(std::make_unique()); + prog.add_block(std::move(clear_block)); } - prog.add_block(move(block)); + prog.add_block(std::move(block)); } return prog; @@ -2097,7 +2265,7 @@ RoseProgram makeLiteralProgram(const RoseBuildImpl &build, engine_info_by_queue, roleStateIndices, prog_build, e); if (!role_prog.empty()) { - pred_blocks[pred_state].add_block(move(role_prog)); + pred_blocks[pred_state].add_block(std::move(role_prog)); } } @@ -2116,7 +2284,7 @@ RoseProgram makeLiteralProgram(const RoseBuildImpl &build, auto role_prog = makeRoleProgram(build, leftfix_info, suffixes, engine_info_by_queue, roleStateIndices, prog_build, e); - role_programs.add_block(move(role_prog)); + role_programs.add_block(std::move(role_prog)); } if (lit_id == build.eod_event_literal_id) { @@ -2131,8 +2299,8 @@ RoseProgram makeLiteralProgram(const RoseBuildImpl &build, // Literal may squash groups. 
makeGroupSquashInstruction(build, lit_id, unconditional_block); - role_programs.add_block(move(unconditional_block)); - lit_program.add_before_end(move(role_programs)); + role_programs.add_block(std::move(unconditional_block)); + lit_program.add_before_end(std::move(role_programs)); return lit_program; } @@ -2163,10 +2331,10 @@ RoseProgram makeDelayRebuildProgram(const RoseBuildImpl &build, makePushDelayedInstructions(build.literals, prog_build, build.literal_info.at(lit_id).delayed_ids, prog); - blocks.push_back(move(prog)); + blocks.emplace_back(std::move(prog)); } - return assembleProgramBlocks(move(blocks)); + return assembleProgramBlocks(std::move(blocks)); } RoseProgram makeEodAnchorProgram(const RoseBuildImpl &build, @@ -2193,7 +2361,7 @@ RoseProgram makeEodAnchorProgram(const RoseBuildImpl &build, for (const auto &id : g[v].reports) { makeReport(build, id, has_som, report_block); } - program.add_before_end(move(report_block)); + program.add_before_end(std::move(report_block)); return program; } @@ -2210,7 +2378,7 @@ void makeCatchupMpv(const ReportManager &rm, bool needs_mpv_catchup, return; } - program.add_before_end(make_unique()); + program.add_before_end(std::make_unique()); } RoseProgram makeReportProgram(const RoseBuildImpl &build, @@ -2243,9 +2411,9 @@ RoseProgram makeBoundaryProgram(const RoseBuildImpl &build, void addIncludedJumpProgram(RoseProgram &program, u32 child_offset, u8 squash) { RoseProgram block; - block.add_before_end(make_unique(child_offset, + block.add_before_end(std::make_unique(child_offset, squash)); - program.add_block(move(block)); + program.add_block(std::move(block)); } static @@ -2254,8 +2422,8 @@ void addPredBlockSingle(u32 pred_state, RoseProgram &pred_block, // Prepend an instruction to check the pred state is on. const auto *end_inst = pred_block.end_instruction(); pred_block.insert(begin(pred_block), - make_unique(pred_state, end_inst)); - program.add_block(move(pred_block)); + std::make_unique(pred_state, end_inst)); + program.add_block(std::move(pred_block)); } static @@ -2265,12 +2433,12 @@ void addPredBlocksAny(map &pred_blocks, u32 num_states, vector keys; for (const u32 &key : pred_blocks | map_keys) { - keys.push_back(key); + keys.emplace_back(key); } const RoseInstruction *end_inst = sparse_program.end_instruction(); - auto ri = make_unique(num_states, keys, end_inst); - sparse_program.add_before_end(move(ri)); + auto ri = std::make_unique(num_states, keys, end_inst); + sparse_program.add_before_end(std::move(ri)); RoseProgram &block = pred_blocks.begin()->second; @@ -2278,8 +2446,8 @@ void addPredBlocksAny(map &pred_blocks, u32 num_states, * blocks are being collapsed together */ stripCheckHandledInstruction(block); - sparse_program.add_before_end(move(block)); - program.add_block(move(sparse_program)); + sparse_program.add_before_end(std::move(block)); + program.add_block(std::move(sparse_program)); } static @@ -2292,16 +2460,16 @@ void addPredBlocksMulti(map &pred_blocks, vector> jump_table; // BEGIN instruction. - auto ri_begin = make_unique(num_states, end_inst); + auto ri_begin = std::make_unique(num_states, end_inst); RoseInstrSparseIterBegin *begin_inst = ri_begin.get(); - sparse_program.add_before_end(move(ri_begin)); + sparse_program.add_before_end(std::move(ri_begin)); // NEXT instructions, one per pred program. 
u32 prev_key = pred_blocks.begin()->first; for (auto it = next(begin(pred_blocks)); it != end(pred_blocks); ++it) { - auto ri = make_unique(prev_key, begin_inst, + auto ri = std::make_unique(prev_key, begin_inst, end_inst); - sparse_program.add_before_end(move(ri)); + sparse_program.add_before_end(std::move(ri)); prev_key = it->first; } @@ -2315,7 +2483,7 @@ void addPredBlocksMulti(map &pred_blocks, assert(dynamic_cast(out_it->get()) || dynamic_cast(out_it->get())); - out_it = sparse_program.insert(++out_it, move(flat_prog)); + out_it = sparse_program.insert(++out_it, std::move(flat_prog)); // Jump table target for this key is the beginning of the block we just // spliced in. @@ -2327,9 +2495,9 @@ void addPredBlocksMulti(map &pred_blocks, } // Write the jump table back into the SPARSE_ITER_BEGIN instruction. - begin_inst->jump_table = move(jump_table); + begin_inst->jump_table = std::move(jump_table); - program.add_block(move(sparse_program)); + program.add_block(std::move(sparse_program)); } void addPredBlocks(map &pred_blocks, u32 num_states, @@ -2380,7 +2548,7 @@ void applyFinalSpecialisation(RoseProgram &program) { auto it = next(program.rbegin()); if (auto *ri = dynamic_cast(it->get())) { DEBUG_PRINTF("replacing REPORT with FINAL_REPORT\n"); - program.replace(it, make_unique( + program.replace(it, std::make_unique( ri->onmatch, ri->offset_adjust)); } } diff --git a/src/rose/rose_build_program.h b/src/rose/rose_build_program.h index 7d781f319..1882279dd 100644 --- a/src/rose/rose_build_program.h +++ b/src/rose/rose_build_program.h @@ -33,7 +33,6 @@ #include "rose_program.h" #include "util/bytecode_ptr.h" #include "util/hash.h" -#include "util/make_unique.h" #include #include @@ -125,13 +124,7 @@ class RoseProgram { * \brief Replace the instruction pointed to by the given iterator. 
*/ template - void replace(Iter it, std::unique_ptr ri) { - assert(!prog.empty()); - - const RoseInstruction *old_ptr = it->get(); - *it = move(ri); - update_targets(prog.begin(), prog.end(), old_ptr, it->get()); - } + void replace(Iter it, std::unique_ptr ri); }; bytecode_ptr writeProgram(RoseEngineBlob &blob, diff --git a/src/rose/rose_build_role_aliasing.cpp b/src/rose/rose_build_role_aliasing.cpp index 359550e11..2888b9a0f 100644 --- a/src/rose/rose_build_role_aliasing.cpp +++ b/src/rose/rose_build_role_aliasing.cpp @@ -846,7 +846,7 @@ void pruneUnusedTops(NGHolder &h, const RoseGraph &g, h[e].tops = std::move(pruned_tops); if (h[e].tops.empty()) { DEBUG_PRINTF("edge (start,%zu) has only unused tops\n", h[v].index); - dead.push_back(e); + dead.emplace_back(e); } } @@ -1457,7 +1457,7 @@ void splitAndFilterBuckets(vector> &buckets, out.emplace_back(); } auto out_bucket = p.first->second; - out[out_bucket].push_back(v); + out[out_bucket].emplace_back(v); } } @@ -1511,7 +1511,7 @@ void splitByNeighbour(const RoseGraph &g, vector> &buckets, for (RoseVertex v : adjacent_vertices_range(u, g)) { auto it = inv.find(v); if (it != end(inv)) { - neighbours_by_bucket[it->second].push_back(v); + neighbours_by_bucket[it->second].emplace_back(v); } } } else { @@ -1519,7 +1519,7 @@ void splitByNeighbour(const RoseGraph &g, vector> &buckets, for (RoseVertex v : inv_adjacent_vertices_range(u, g)) { auto it = inv.find(v); if (it != end(inv)) { - neighbours_by_bucket[it->second].push_back(v); + neighbours_by_bucket[it->second].emplace_back(v); } } } @@ -1540,14 +1540,14 @@ void splitByNeighbour(const RoseGraph &g, vector> &buckets, if (contains(picked, v)) { inv[v] = new_key; } else { - leftovers.push_back(v); + leftovers.emplace_back(v); } } assert(!leftovers.empty()); assert(e.second.size() + leftovers.size() == buckets[old_key].size()); - extras.push_back(e.second); + extras.emplace_back(e.second); buckets[old_key].swap(leftovers); } insert(&buckets, buckets.end(), extras); @@ -1650,7 +1650,7 @@ void diamondMergePass(CandidateSet &candidates, RoseBuildImpl &build, } mergeVerticesDiamond(a, b, build, rai); - dead->push_back(a); + dead->emplace_back(a); candidates.erase(a); break; // next a } @@ -1758,7 +1758,7 @@ void leftMergePass(CandidateSet &candidates, RoseBuildImpl &build, RoseVertex b = *jt; if (attemptRoseMerge(build, true, a, b, false, rai)) { mergeVerticesLeft(a, b, build, rai); - dead->push_back(a); + dead->emplace_back(a); candidates.erase(ait); break; // consider next a } @@ -1918,7 +1918,7 @@ void rightMergePass(CandidateSet &candidates, RoseBuildImpl &build, RoseVertex b = *jt; if (attemptRoseMerge(build, false, a, b, !mergeRoses, rai)) { mergeVerticesRight(a, b, build, rai); - dead->push_back(a); + dead->emplace_back(a); candidates.erase(a); break; // consider next a } @@ -1978,7 +1978,7 @@ void filterDiamondCandidates(RoseGraph &g, CandidateSet &candidates) { vector dead; for (const auto &v : candidates) { if (hasNoDiamondSiblings(g, v)) { - dead.push_back(v); + dead.emplace_back(v); } } @@ -2145,13 +2145,13 @@ void mergeDupeLeaves(RoseBuildImpl &build) { if (g[et].minBound <= g[e].minBound && g[et].maxBound >= g[e].maxBound) { DEBUG_PRINTF("remove more constrained edge\n"); - deadEdges.push_back(e); + deadEdges.emplace_back(e); } } else { DEBUG_PRINTF("rehome edge: add %zu->%zu\n", g[u].index, g[t].index); add_edge(u, t, g[e], g); - deadEdges.push_back(e); + deadEdges.emplace_back(e); } } @@ -2159,7 +2159,7 @@ void mergeDupeLeaves(RoseBuildImpl &build) { for (auto &e : deadEdges) { 
remove_edge(e, g); } - changed.push_back(v); + changed.emplace_back(v); g[t].min_offset = min(g[t].min_offset, g[v].min_offset); g[t].max_offset = max(g[t].max_offset, g[v].max_offset); } @@ -2212,7 +2212,7 @@ void mergeCluster(RoseGraph &g, const ReportManager &rm, NGHolder *h = g[v].suffix.graph.get(); assert(!g[v].suffix.haig); /* should not be here if haig */ rev[h] = v; - cluster.push_back(h); + cluster.emplace_back(h); } it = it2; @@ -2230,7 +2230,7 @@ void mergeCluster(RoseGraph &g, const ReportManager &rm, ENSURE_AT_LEAST(&g[winner].max_offset, g[victim].max_offset); insert(&g[winner].reports, g[victim].reports); - dead.push_back(victim); + dead.emplace_back(victim); } } } @@ -2263,7 +2263,7 @@ void findUncalcLeavesCandidates(RoseBuildImpl &build, continue; } - suffix_vertices.push_back(v); + suffix_vertices.emplace_back(v); } } @@ -2289,9 +2289,9 @@ void findUncalcLeavesCandidates(RoseBuildImpl &build, vector &vec = clusters[key]; if (vec.empty()) { - ordered.push_back(key); + ordered.emplace_back(key); } - vec.push_back(v); + vec.emplace_back(v); } DEBUG_PRINTF("find loop done\n"); diff --git a/src/rose/rose_build_width.cpp b/src/rose/rose_build_width.cpp index 182b62ee6..327911eac 100644 --- a/src/rose/rose_build_width.cpp +++ b/src/rose/rose_build_width.cpp @@ -67,7 +67,7 @@ u32 findMinWidth(const RoseBuildImpl &tbi, enum rose_literal_table table) { for (auto v : vertices_range(g)) { if (tbi.hasLiteralInTable(v, table)) { - table_verts.push_back(v); + table_verts.emplace_back(v); } } @@ -193,7 +193,7 @@ u32 findMaxBAWidth(const RoseBuildImpl &tbi, enum rose_literal_table table) { for (auto v : vertices_range(g)) { if ((table == ROSE_FLOATING && tbi.isFloating(v)) || (table == ROSE_ANCHORED && tbi.isAnchored(v))) { - table_verts.push_back(v); + table_verts.emplace_back(v); } } diff --git a/src/rose/rose_graph.h b/src/rose/rose_graph.h index 499d796ac..b5bf1985d 100644 --- a/src/rose/rose_graph.h +++ b/src/rose/rose_graph.h @@ -112,7 +112,7 @@ struct LeftEngInfo { } size_t hash() const; void reset(void); - operator bool() const; + explicit operator bool() const; bool tracksSom() const { return !!haig; } }; @@ -133,7 +133,7 @@ struct RoseSuffixInfo { bool operator<(const RoseSuffixInfo &b) const; size_t hash() const; void reset(void); - operator bool() const { return graph || castle || haig || rdfa || tamarama; } + explicit operator bool() const { return graph || castle || haig || rdfa || tamarama; } }; /** \brief Properties attached to each Rose graph vertex. 
*/ diff --git a/src/rose/rose_in_util.cpp b/src/rose/rose_in_util.cpp index 9fe47c276..c26280821 100644 --- a/src/rose/rose_in_util.cpp +++ b/src/rose/rose_in_util.cpp @@ -35,7 +35,6 @@ #include "nfagraph/ng_width.h" #include "util/container.h" #include "util/graph_range.h" -#include "util/make_unique.h" #include @@ -93,7 +92,7 @@ struct RoseEdgeCopier { unique_ptr cloneRoseGraph(const RoseInGraph &ig) { assert(hasCorrectlyNumberedVertices(ig)); - unique_ptr out = make_unique(); + unique_ptr out = std::make_unique(); unordered_map> graph_map; unordered_map> haig_map; diff --git a/src/rose/rose_program.h b/src/rose/rose_program.h index e5485476b..7e21303cb 100644 --- a/src/rose/rose_program.h +++ b/src/rose/rose_program.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2019, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -208,7 +208,11 @@ enum RoseInstructionCode { */ ROSE_INSTR_LAST_FLUSH_COMBINATION, - LAST_ROSE_INSTRUCTION = ROSE_INSTR_LAST_FLUSH_COMBINATION //!< Sentinel. + ROSE_INSTR_CHECK_SHUFTI_64x8, //!< Check 64-byte data by 8-bucket shufti. + ROSE_INSTR_CHECK_SHUFTI_64x16, //!< Check 64-byte data by 16-bucket shufti. + ROSE_INSTR_CHECK_MASK_64, //!< 64-bytes and/cmp/neg mask check. + + LAST_ROSE_INSTRUCTION = ROSE_INSTR_CHECK_MASK_64 //!< Sentinel. }; struct ROSE_STRUCT_END { @@ -285,6 +289,15 @@ struct ROSE_STRUCT_CHECK_MASK_32 { u32 fail_jump; //!< Jump forward this many bytes on failure. }; +struct ROSE_STRUCT_CHECK_MASK_64 { + u8 code; //!< From enum RoseInstructionCode. + u8 and_mask[64]; //!< 64-byte and mask. + u8 cmp_mask[64]; //!< 64-byte cmp mask. + u64a neg_mask; //!< negation mask with 32 bits. + s32 offset; //!< Relative offset of the first byte. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + struct ROSE_STRUCT_CHECK_BYTE { u8 code; //!< From enum RoseInstructionCode. u8 and_mask; //!< 8-bits and mask. @@ -336,6 +349,29 @@ struct ROSE_STRUCT_CHECK_SHUFTI_32x16 { u32 fail_jump; //!< Jump forward this many bytes on failure. }; +struct ROSE_STRUCT_CHECK_SHUFTI_64x8 { + u8 code; //!< From enum RoseInstructionCode. + u8 hi_mask[64]; //!< High nibble mask in shufti. + u8 lo_mask[64]; //!< Low nibble mask in shufti. + u8 bucket_select_mask[64]; //!< Mask for bucket assigning. + u64a neg_mask; //!< 64 bits negation mask. + s32 offset; //!< Relative offset of the first byte. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_SHUFTI_64x16 { + u8 code; //!< From enum RoseInstructionCode. + u8 hi_mask_1[64]; //!< 4 copies of 0-15 High nibble mask. + u8 hi_mask_2[64]; //!< 4 copies of 16-32 High nibble mask. + u8 lo_mask_1[64]; //!< 4 copies of 0-15 Low nibble mask. + u8 lo_mask_2[64]; //!< 4 copies of 16-32 Low nibble mask. + u8 bucket_select_mask_hi[64]; //!< Bucket mask for high 8 buckets. + u8 bucket_select_mask_lo[64]; //!< Bucket mask for low 8 buckets. + u64a neg_mask; //!< 64 bits negation mask. + s32 offset; //!< Relative offset of the first byte. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + struct ROSE_STRUCT_CHECK_INFIX { u8 code; //!< From enum RoseInstructionCode. u32 queue; //!< Queue of leftfix to check. 
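The ROSE_STRUCT_CHECK_SHUFTI_64x8 and _64x16 layouts above extend the existing 16/32-byte shufti checks to a full 64-byte window. Their 64-byte hi/lo mask fields are simply the 16-entry nibble tables replicated four times (as makeCheckShufti64x8() does earlier in this patch), because the vector byte shuffle operates per 16-byte lane. A scalar model of the 8-bucket variant follows; it is illustrative only, the function and parameter names are not part of the patch, and the vectorised equivalent is validateShuftiMask64x8() later in this diff. The 64x16 variant keeps two table pairs and ORs the two bucket-select results, mirroring validateShuftiMask64x16().

#include <stdint.h>

/* Scalar model of the 8-bucket 64-byte shufti check: hi_tbl/lo_tbl map each
 * nibble value to the set of buckets (one bit per bucket) that admit it; a
 * byte belongs to a bucket when both of its nibbles do.  bucket_select picks
 * the bucket(s) that must match at each offset, and neg_mask marks offsets
 * that are expected not to match. */
static int check_shufti_64x8_scalar(const uint8_t *data, uint64_t valid_data_mask,
                                    const uint8_t hi_tbl[16],
                                    const uint8_t lo_tbl[16],
                                    const uint8_t bucket_select[64],
                                    uint64_t neg_mask) {
    uint64_t nresult = 0;
    for (int i = 0; i < 64; i++) {
        uint8_t buckets = lo_tbl[data[i] & 0xf] & hi_tbl[data[i] >> 4];
        if ((buckets & bucket_select[i]) == 0) {
            nresult |= 1ULL << i; /* byte i failed its bucket check */
        }
    }
    return ((nresult ^ neg_mask) & valid_data_mask) == 0;
}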
diff --git a/src/rose/stream_long_lit.h b/src/rose/stream_long_lit.h index 348676087..df9b57f4e 100644 --- a/src/rose/stream_long_lit.h +++ b/src/rose/stream_long_lit.h @@ -201,12 +201,12 @@ const u8 *prepScanBuffer(const struct core_info *ci, } else { // Copy: first chunk from history buffer. assert(overhang <= ci->hlen); - copy_upto_32_bytes(tempbuf, ci->hbuf + ci->hlen - overhang, + copy_upto_64_bytes(tempbuf, ci->hbuf + ci->hlen - overhang, overhang); // Copy: second chunk from current buffer. size_t copy_buf_len = LONG_LIT_HASH_LEN - overhang; assert(copy_buf_len <= ci->len); - copy_upto_32_bytes(tempbuf + overhang, ci->buf, copy_buf_len); + copy_upto_64_bytes(tempbuf + overhang, ci->buf, copy_buf_len); // Read from our temporary buffer for the hash. base = tempbuf; } diff --git a/src/rose/validate_mask.h b/src/rose/validate_mask.h index ac8cc312e..8191db52f 100644 --- a/src/rose/validate_mask.h +++ b/src/rose/validate_mask.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -41,6 +41,17 @@ void validateMask32Print(const u8 *mask) { } printf("\n"); } + +#ifdef HAVE_AVX512 +static +void validateMask64Print(const u8 *mask) { + int i; + for (i = 0; i < 64; i++) { + printf("%02x ", mask[i]); + } + printf("\n"); +} +#endif #endif // check positive bytes in cmp_result. @@ -115,4 +126,29 @@ int validateMask32(const m256 data, const u32 valid_data_mask, } } +#ifdef HAVE_AVX512 +static really_inline +int validateMask64(const m512 data, const u64a valid_data_mask, + const m512 and_mask, const m512 cmp_mask, + const u64a neg_mask) { + u64a cmp_result = ~eq512mask(and512(data, and_mask), cmp_mask); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + validateMask64Print((const u8 *)&data); + DEBUG_PRINTF("cmp_result\n"); + validateMask64Print((const u8 *)&cmp_result); +#endif + DEBUG_PRINTF("cmp_result %016llx neg_mask %016llx\n", cmp_result, neg_mask); + DEBUG_PRINTF("valid_data_mask %016llx\n", valid_data_mask); + + if ((cmp_result & valid_data_mask) == (neg_mask & valid_data_mask)) { + DEBUG_PRINTF("checkCompareResult64 passed\n"); + return 1; + } else { + DEBUG_PRINTF("checkCompareResult64 failed\n"); + return 0; + } +} +#endif + #endif diff --git a/src/rose/validate_shufti.h b/src/rose/validate_shufti.h index 1dc855d99..1ee7fa0ab 100644 --- a/src/rose/validate_shufti.h +++ b/src/rose/validate_shufti.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -47,7 +47,7 @@ static really_inline int validateShuftiMask16x16(const m256 data, const m256 hi_mask, const m256 lo_mask, const m256 and_mask, const u32 neg_mask, const u32 valid_data_mask) { - m256 low4bits = set32x8(0xf); + m256 low4bits = set1_32x8(0xf); m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits)); m256 c_hi = pshufb_m256(hi_mask, rshift64_m256(andnot256(low4bits, data), 4)); @@ -78,7 +78,7 @@ int validateShuftiMask16x8(const m128 data, const m256 nib_mask, const m128 and_mask, const u32 neg_mask, const u32 valid_data_mask) { m256 data_m256 = combine2x128(rshift64_m128(data, 4), data); - m256 low4bits = set32x8(0xf); + m256 low4bits = set1_32x8(0xf); m256 c_nib = pshufb_m256(nib_mask, and256(data_m256, low4bits)); m128 t = 
and128(movdq_hi(c_nib), movdq_lo(c_nib)); m128 nresult = eq128(and128(t, and_mask), zeroes128()); @@ -101,7 +101,7 @@ static really_inline int validateShuftiMask32x8(const m256 data, const m256 hi_mask, const m256 lo_mask, const m256 and_mask, const u32 neg_mask, const u32 valid_data_mask) { - m256 low4bits = set32x8(0xf); + m256 low4bits = set1_32x8(0xf); m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits)); m256 c_hi = pshufb_m256(hi_mask, rshift64_m256(andnot256(low4bits, data), 4)); @@ -133,7 +133,7 @@ int validateShuftiMask32x16(const m256 data, const m256 bucket_mask_hi, const m256 bucket_mask_lo, const u32 neg_mask, const u32 valid_data_mask) { - m256 low4bits = set32x8(0xf); + m256 low4bits = set1_32x8(0xf); m256 data_lo = and256(data, low4bits); m256 data_hi = and256(rshift64_m256(data, 4), low4bits); m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo); @@ -175,6 +175,84 @@ int validateShuftiMask32x16(const m256 data, return !cmp_result; } +#ifdef HAVE_AVX512 +static really_inline +int validateShuftiMask64x8(const m512 data, const m512 hi_mask, + const m512 lo_mask, const m512 and_mask, + const u64a neg_mask, const u64a valid_data_mask) { + m512 low4bits = set1_64x8(0xf); + m512 c_lo = pshufb_m512(lo_mask, and512(data, low4bits)); + m512 c_hi = pshufb_m512(hi_mask, + rshift64_m512(andnot512(low4bits, data), 4)); + m512 t = and512(c_lo, c_hi); + u64a nresult = eq512mask(and512(t, and_mask), zeroes512()); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + dumpMask(&data, 64); + DEBUG_PRINTF("hi_mask\n"); + dumpMask(&hi_mask, 64); + DEBUG_PRINTF("lo_mask\n"); + dumpMask(&lo_mask, 64); + DEBUG_PRINTF("c_lo\n"); + dumpMask(&c_lo, 64); + DEBUG_PRINTF("c_hi\n"); + dumpMask(&c_hi, 64); + DEBUG_PRINTF("nresult %llx\n", nresult); + DEBUG_PRINTF("valid_data_mask %llx\n", valid_data_mask); +#endif + u64a cmp_result = (nresult ^ neg_mask) & valid_data_mask; + return !cmp_result; +} + +static really_inline +int validateShuftiMask64x16(const m512 data, + const m512 hi_mask_1, const m512 hi_mask_2, + const m512 lo_mask_1, const m512 lo_mask_2, + const m512 and_mask_hi, const m512 and_mask_lo, + const u64a neg_mask, const u64a valid_data_mask) { + m512 low4bits = set1_64x8(0xf); + m512 data_lo = and512(data, low4bits); + m512 data_hi = and512(rshift64_m512(data, 4), low4bits); + m512 c_lo_1 = pshufb_m512(lo_mask_1, data_lo); + m512 c_lo_2 = pshufb_m512(lo_mask_2, data_lo); + m512 c_hi_1 = pshufb_m512(hi_mask_1, data_hi); + m512 c_hi_2 = pshufb_m512(hi_mask_2, data_hi); + m512 t1 = and512(c_lo_1, c_hi_1); + m512 t2 = and512(c_lo_2, c_hi_2); + m512 result = or512(and512(t1, and_mask_lo), and512(t2, and_mask_hi)); + u64a nresult = eq512mask(result, zeroes512()); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + dumpMask(&data, 64); + DEBUG_PRINTF("data_lo\n"); + dumpMask(&data_lo, 64); + DEBUG_PRINTF("data_hi\n"); + dumpMask(&data_hi, 64); + DEBUG_PRINTF("hi_mask_1\n"); + dumpMask(&hi_mask_1, 64); + DEBUG_PRINTF("hi_mask_2\n"); + dumpMask(&hi_mask_2, 64); + DEBUG_PRINTF("lo_mask_1\n"); + dumpMask(&lo_mask_1, 64); + DEBUG_PRINTF("lo_mask_2\n"); + dumpMask(&lo_mask_2, 64); + DEBUG_PRINTF("c_lo_1\n"); + dumpMask(&c_lo_1, 64); + DEBUG_PRINTF("c_lo_2\n"); + dumpMask(&c_lo_2, 64); + DEBUG_PRINTF("c_hi_1\n"); + dumpMask(&c_hi_1, 64); + DEBUG_PRINTF("c_hi_2\n"); + dumpMask(&c_hi_2, 64); + DEBUG_PRINTF("result\n"); + dumpMask(&result, 64); + DEBUG_PRINTF("valid_data_mask %llx\n", valid_data_mask); +#endif + u64a cmp_result = (nresult ^ neg_mask) & valid_data_mask; + return !cmp_result; +} +#endif + static really_inline int 
checkMultipath32(u32 data, u32 hi_bits, u32 lo_bits) { u32 t = ~(data | hi_bits); @@ -201,7 +279,7 @@ int validateMultipathShuftiMask16x8(const m128 data, const u32 neg_mask, const u32 valid_path_mask) { m256 data_256 = combine2x128(rshift64_m128(data, 4), data); - m256 low4bits = set32x8(0xf); + m256 low4bits = set1_32x8(0xf); m256 c_nib = pshufb_m256(nib_mask, and256(data_256, low4bits)); m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib)); m128 result = and128(t, bucket_select_mask); @@ -220,7 +298,7 @@ int validateMultipathShuftiMask32x8(const m256 data, const u32 hi_bits, const u32 lo_bits, const u32 neg_mask, const u32 valid_path_mask) { - m256 low4bits = set32x8(0xf); + m256 low4bits = set1_32x8(0xf); m256 data_lo = and256(data, low4bits); m256 data_hi = and256(rshift64_m256(data, 4), low4bits); m256 c_lo = pshufb_m256(lo_mask, data_lo); @@ -244,7 +322,7 @@ int validateMultipathShuftiMask32x16(const m256 data, const u32 hi_bits, const u32 lo_bits, const u32 neg_mask, const u32 valid_path_mask) { - m256 low4bits = set32x8(0xf); + m256 low4bits = set1_32x8(0xf); m256 data_lo = and256(data, low4bits); m256 data_hi = and256(rshift64_m256(data, 4), low4bits); m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo); @@ -271,7 +349,7 @@ int validateMultipathShuftiMask64(const m256 data_1, const m256 data_2, const u64a hi_bits, const u64a lo_bits, const u64a neg_mask, const u64a valid_path_mask) { - m256 low4bits = set32x8(0xf); + m256 low4bits = set1_32x8(0xf); m256 c_lo_1 = pshufb_m256(lo_mask, and256(data_1, low4bits)); m256 c_lo_2 = pshufb_m256(lo_mask, and256(data_2, low4bits)); m256 c_hi_1 = pshufb_m256(hi_mask, diff --git a/src/runtime.c b/src/runtime.c index a3659348c..a055e5f4f 100644 --- a/src/runtime.c +++ b/src/runtime.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2019, Intel Corporation + * Copyright (c) 2015-2022, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -1013,6 +1013,7 @@ hs_error_t HS_CDECL hs_close_stream(hs_stream_t *id, hs_scratch_t *scratch, report_eod_matches(id, scratch, onEvent, context); if (unlikely(internal_matching_error(scratch))) { unmarkScratchInUse(scratch); + hs_stream_free(id); return HS_UNKNOWN_ERROR; } unmarkScratchInUse(scratch); diff --git a/src/scratch.c b/src/scratch.c index 25991e2bb..9f6d77cdc 100644 --- a/src/scratch.c +++ b/src/scratch.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2019, Intel Corporation + * Copyright (c) 2015-2023, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/src/scratch.h b/src/scratch.h index 1256f7aba..e3cd92452 100644 --- a/src/scratch.h +++ b/src/scratch.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2019, Intel Corporation + * Copyright (c) 2015-2023, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/src/smallwrite/smallwrite_build.cpp b/src/smallwrite/smallwrite_build.cpp index 345edfe95..e1d2f1f31 100644 --- a/src/smallwrite/smallwrite_build.cpp +++ b/src/smallwrite/smallwrite_build.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -56,7 
+56,6 @@ #include "util/compare.h" #include "util/compile_context.h" #include "util/container.h" -#include "util/make_unique.h" #include "util/ue2_graph.h" #include "util/ue2string.h" #include "util/verify_types.h" @@ -78,7 +77,7 @@ namespace ue2 { struct LitTrieVertexProps { LitTrieVertexProps() = default; explicit LitTrieVertexProps(u8 c_in) : c(c_in) {} - size_t index; // managed by ue2_graph + size_t index = 0; // managed by ue2_graph u8 c = 0; //!< character reached on this vertex flat_set reports; //!< managed reports fired on this vertex }; @@ -170,7 +169,7 @@ bool pruneOverlongReports(NFAVertex v, NGHolder &g, const depth &max_depth, for (ReportID id : g[v].reports) { const auto &report = rm.getReport(id); if (report.minOffset > max_depth) { - bad_reports.push_back(id); + bad_reports.emplace_back(id); } } @@ -242,7 +241,7 @@ bool mergeDfas(vector> &dfas, const ReportManager &rm, vector dfa_ptrs; dfa_ptrs.reserve(dfas.size()); for (auto &d : dfas) { - dfa_ptrs.push_back(d.get()); + dfa_ptrs.emplace_back(d.get()); } auto merged = mergeAllDfas(dfa_ptrs, DFA_MERGE_MAX_STATES, &rm, cc.grey); @@ -254,7 +253,7 @@ bool mergeDfas(vector> &dfas, const ReportManager &rm, DEBUG_PRINTF("merge succeeded, result has %zu states\n", merged->states.size()); dfas.clear(); - dfas.push_back(std::move(merged)); + dfas.emplace_back(std::move(merged)); return true; } @@ -315,7 +314,7 @@ void SmallWriteBuildImpl::add(const NGHolder &g, const ExpressionInfo &expr) { minimize_hopcroft(*r, cc.grey); } - dfas.push_back(std::move(r)); + dfas.emplace_back(std::move(r)); if (dfas.size() >= cc.grey.smallWriteMergeBatchSize) { if (!mergeDfas(dfas, rm, cc)) { @@ -426,7 +425,7 @@ struct ACVisitor : public boost::default_bfs_visitor { auto v = target(e, trie); DEBUG_PRINTF("bfs (%zu, %zu) on '%c'\n", trie[u].index, trie[v].index, trie[v].c); - ordering.push_back(v); + ordering.emplace_back(v); auto f = find_failure_target(u, v, trie); @@ -524,7 +523,7 @@ vector findDistToAccept(const LitTrie &trie) { deque q; for (auto v : vertices_range(trie)) { if (!trie[v].reports.empty()) { - q.push_back(v); + q.emplace_back(v); dist[trie[v].index] = 0; } } @@ -538,7 +537,7 @@ vector findDistToAccept(const LitTrie &trie) { for (auto u : inv_adjacent_vertices_range(v, trie)) { auto &u_dist = dist[trie[u].index]; if (u_dist == UINT32_MAX) { - q.push_back(u); + q.emplace_back(u); u_dist = d + 1; } } @@ -573,7 +572,7 @@ void pruneTrie(LitTrie &trie, u32 max_depth) { DEBUG_PRINTF("pruning vertex %zu (min path len %u)\n", trie[v].index, min_path_len); clear_vertex(v, trie); - dead.push_back(v); + dead.emplace_back(v); } } @@ -615,7 +614,7 @@ vector getAlphabet(const LitTrie &trie, bool nocase) { CharReach t = cr & esets[i]; if (t.any() && t != esets[i]) { esets[i] &= ~t; - esets.push_back(t); + esets.emplace_back(t); } } } @@ -793,6 +792,12 @@ bytecode_ptr getDfa(raw_dfa &rdfa, const CompileContext &cc, bytecode_ptr dfa = nullptr; if (cc.grey.allowSmallWriteSheng) { dfa = shengCompile(rdfa, cc, rm, only_accel_init, &accel_states); + if (!dfa) { + dfa = sheng32Compile(rdfa, cc, rm, only_accel_init, &accel_states); + } + if (!dfa) { + dfa = sheng64Compile(rdfa, cc, rm, only_accel_init, &accel_states); + } } if (!dfa) { dfa = mcclellanCompile(rdfa, cc, rm, only_accel_init, @@ -856,7 +861,7 @@ bytecode_ptr prepEngine(raw_dfa &rdfa, u32 roseQuality, unique_ptr makeSmallWriteBuilder(size_t num_patterns, const ReportManager &rm, const CompileContext &cc) { - return ue2::make_unique(num_patterns, rm, cc); + return std::make_unique(num_patterns, 
rm, cc); } bytecode_ptr SmallWriteBuildImpl::build(u32 roseQuality) { @@ -886,12 +891,12 @@ bytecode_ptr SmallWriteBuildImpl::build(u32 roseQuality) { } if (!is_empty(lit_trie)) { - dfas.push_back(buildDfa(lit_trie, false)); + dfas.emplace_back(buildDfa(lit_trie, false)); DEBUG_PRINTF("caseful literal dfa with %zu states\n", dfas.back()->states.size()); } if (!is_empty(lit_trie_nocase)) { - dfas.push_back(buildDfa(lit_trie_nocase, true)); + dfas.emplace_back(buildDfa(lit_trie_nocase, true)); DEBUG_PRINTF("nocase literal dfa with %zu states\n", dfas.back()->states.size()); } diff --git a/src/som/slot_manager.cpp b/src/som/slot_manager.cpp index d97e8fc1d..33b8d503d 100644 --- a/src/som/slot_manager.cpp +++ b/src/som/slot_manager.cpp @@ -41,7 +41,6 @@ #include "nfagraph/ng_region.h" #include "util/charreach.h" #include "util/hash.h" -#include "util/make_unique.h" #include "util/dump_charclass.h" #include "util/verify_types.h" @@ -105,7 +104,7 @@ const SlotCacheEntry *SlotCache::find(const NGHolder &prefix, } SomSlotManager::SomSlotManager(u8 p) - : nextSomSlot(0), cache(ue2::make_unique()), historyRequired(0), + : nextSomSlot(0), cache(std::make_unique()), historyRequired(0), precision(p) {} SomSlotManager::~SomSlotManager() { } @@ -243,7 +242,7 @@ u32 SomSlotManager::numSomSlots() const { u32 SomSlotManager::addRevNfa(bytecode_ptr nfa, u32 maxWidth) { u32 rv = verify_u32(rev_nfas.size()); - rev_nfas.push_back(move(nfa)); + rev_nfas.emplace_back(std::move(nfa)); // A rev nfa commits us to having enough history around to handle its // max width. diff --git a/src/state.h b/src/state.h index 9ade59db4..68600a910 100644 --- a/src/state.h +++ b/src/state.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2023, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/src/stream_compress_impl.h b/src/stream_compress_impl.h index d1ccf5e6d..f02543efa 100644 --- a/src/stream_compress_impl.h +++ b/src/stream_compress_impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018, Intel Corporation + * Copyright (c) 2017-2023, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/src/ue2common.h b/src/ue2common.h index 5705af7be..b8300dc75 100644 --- a/src/ue2common.h +++ b/src/ue2common.h @@ -45,12 +45,7 @@ // stdint.h for things like uintptr_t and friends #include -/* ick */ -#if defined(_WIN32) -#define ALIGN_ATTR(x) __declspec(align(x)) -#else #define ALIGN_ATTR(x) __attribute__((aligned((x)))) -#endif #define ALIGN_DIRECTIVE ALIGN_ATTR(16) #define ALIGN_AVX_DIRECTIVE ALIGN_ATTR(32) @@ -66,13 +61,8 @@ typedef signed int s32; /* We append the 'a' for aligned, since these aren't common, garden variety * 64 bit values. The alignment is necessary for structs on some platforms, * so we don't end up performing accidental unaligned accesses. */ -#if defined(_WIN32) && ! defined(_WIN64) -typedef unsigned long long ALIGN_ATTR(4) u64a; -typedef signed long long ALIGN_ATTR(4) s64a; -#else typedef unsigned long long ALIGN_ATTR(8) u64a; typedef signed long long ALIGN_ATTR(8) s64a; -#endif /* get the SIMD types */ #include "util/simd_types.h" @@ -83,24 +73,16 @@ typedef u32 ReportID; /* Shorthand for attribute to mark a function as part of our public API. * Functions without this attribute will be hidden. 
*/ -#if !defined(_WIN32) +#ifndef HS_PUBLIC_API #define HS_PUBLIC_API __attribute__((visibility("default"))) -#else -// TODO: dllexport defines for windows -#define HS_PUBLIC_API #endif #define ARRAY_LENGTH(a) (sizeof(a)/sizeof((a)[0])) /** \brief Shorthand for the attribute to shut gcc about unused parameters */ -#if !defined(_WIN32) #define UNUSED __attribute__ ((unused)) -#else -#define UNUSED -#endif /* really_inline forces inlining always */ -#if !defined(_WIN32) #if defined(HS_OPTIMIZE) #define really_inline inline __attribute__ ((always_inline, unused)) #else @@ -113,33 +95,9 @@ typedef u32 ReportID; #define alignof __alignof #define HAVE_TYPEOF 1 -#else // ms windows -#define really_inline __forceinline -#define really_really_inline __forceinline -#define never_inline -#define __builtin_prefetch(...) do {} while(0) -#if defined(__cplusplus) -#define __typeof__ decltype -#define HAVE_TYPEOF 1 -#else // C -/* msvc doesn't have decltype or typeof in C */ -#define inline __inline -#define alignof __alignof -#endif -#endif - // We use C99-style "restrict". -#ifdef _WIN32 -#ifdef __cplusplus -#define restrict -#else -#define restrict __restrict -#endif -#else #define restrict __restrict -#endif - // Align to 16-byte boundary #define ROUNDUP_16(a) (((a) + 0xf) & ~0xf) @@ -186,25 +144,16 @@ typedef u32 ReportID; #define LIMIT_TO_AT_MOST(a, b) (*(a) = MIN(*(a),(b))) #define ENSURE_AT_LEAST(a, b) (*(a) = MAX(*(a),(b))) -#ifndef _WIN32 #ifndef likely #define likely(x) __builtin_expect(!!(x), 1) #endif #ifndef unlikely #define unlikely(x) __builtin_expect(!!(x), 0) #endif -#else -#define likely(x) (x) -#define unlikely(x) (x) -#endif #if !defined(RELEASE_BUILD) || defined(DEBUG) -#ifdef _WIN32 -#define PATH_SEP '\\' -#else #define PATH_SEP '/' #endif -#endif #if defined(DEBUG) && !defined(DEBUG_PRINTF) #include diff --git a/src/util/alloc.cpp b/src/util/alloc.cpp index ace26ed5d..400049323 100644 --- a/src/util/alloc.cpp +++ b/src/util/alloc.cpp @@ -47,7 +47,15 @@ namespace ue2 { #endif /* get us a posix_memalign from somewhere */ -#if !defined(HAVE_POSIX_MEMALIGN) +#if defined(__MINGW32__) || defined(__MINGW64__) + #include + #include + #include + #include + + #define posix_memalign(A, B, C) ((*A = (void *)__mingw_aligned_malloc(C, B)) == nullptr) + +#elif !defined(HAVE_POSIX_MEMALIGN) # if defined(HAVE_MEMALIGN) #define posix_memalign(A, B, C) ((*A = (void *)memalign(B, C)) == nullptr) # elif defined(HAVE__ALIGNED_MALLOC) @@ -61,20 +69,12 @@ namespace ue2 { void *aligned_malloc_internal(size_t size, size_t align) { void *mem; -#if !defined(_WIN32) int rv = posix_memalign(&mem, align, size); if (rv != 0) { DEBUG_PRINTF("posix_memalign returned %d when asked for %zu bytes\n", rv, size); return nullptr; } -#else - if (nullptr == (mem = _aligned_malloc(size, align))) { - DEBUG_PRINTF("_aligned_malloc failed when asked for %zu bytes\n", - size); - return nullptr; - } -#endif assert(mem); return mem; @@ -85,8 +85,8 @@ void aligned_free_internal(void *ptr) { return; } -#if defined(_WIN32) - _aligned_free(ptr); +#if defined(__MINGW32__) || defined(__MINGW64__) + __mingw_aligned_free(ptr); #else free(ptr); #endif diff --git a/src/util/alloc.h b/src/util/alloc.h index de20c8d02..49b4a824d 100644 --- a/src/util/alloc.h +++ b/src/util/alloc.h @@ -76,7 +76,11 @@ class AlignedAllocator { T *allocate(std::size_t size) const { size_t alloc_size = size * sizeof(T); - return static_cast(aligned_malloc_internal(alloc_size, N)); + T *ptr = static_cast(aligned_malloc_internal(alloc_size, N)); + if (!ptr) { + 
throw std::bad_alloc(); + } + return ptr; } void deallocate(T *x, std::size_t) const noexcept { diff --git a/src/util/arch.h b/src/util/arch.h index 985fec6ac..1e8d2fbd4 100644 --- a/src/util/arch.h +++ b/src/util/arch.h @@ -33,58 +33,15 @@ #ifndef UTIL_ARCH_H_ #define UTIL_ARCH_H_ -#if defined(__SSE2__) || defined(_M_X64) || (_M_IX86_FP >= 2) -#define HAVE_SSE2 -#endif - -#if defined(__SSE4_1__) || (defined(_WIN32) && defined(__AVX__)) -#define HAVE_SSE41 -#endif - -#if defined(__SSE4_2__) || (defined(_WIN32) && defined(__AVX__)) -#define HAVE_SSE42 -#endif - -#if defined(__AVX__) -#define HAVE_AVX -#endif - -#if defined(__AVX2__) -#define HAVE_AVX2 -#endif - -#if defined(__AVX512BW__) -#define HAVE_AVX512 -#endif +#include "config.h" -#if defined(__AVX512VBMI__) -#define HAVE_AVX512VBMI +#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#include "util/arch/x86/x86.h" +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#include "util/arch/arm/arm.h" +#elif defined(ARCH_PPC64EL) +#include "util/arch/ppc64el/ppc64el.h" #endif -/* - * ICC and MSVC don't break out POPCNT or BMI/2 as separate pre-def macros - */ -#if defined(__POPCNT__) || \ - (defined(__INTEL_COMPILER) && defined(__SSE4_2__)) || \ - (defined(_WIN32) && defined(__AVX__)) -#define HAVE_POPCOUNT_INSTR -#endif - -#if defined(__BMI__) || (defined(_WIN32) && defined(__AVX2__)) || \ - (defined(__INTEL_COMPILER) && defined(__AVX2__)) -#define HAVE_BMI -#endif - -#if defined(__BMI2__) || (defined(_WIN32) && defined(__AVX2__)) || \ - (defined(__INTEL_COMPILER) && defined(__AVX2__)) -#define HAVE_BMI2 -#endif - -/* - * MSVC uses a different form of inline asm - */ -#if defined(_WIN32) && defined(_MSC_VER) -#define NO_ASM -#endif +#endif // UTIL_ARCH_X86_H_ -#endif // UTIL_ARCH_H_ diff --git a/src/util/arch/arm/arm.h b/src/util/arch/arm/arm.h new file mode 100644 index 000000000..c38ac697b --- /dev/null +++ b/src/util/arch/arm/arm.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2017-2020, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** \file + * \brief Per-platform architecture definitions + */ + +#ifndef UTIL_ARCH_ARM_H_ +#define UTIL_ARCH_ARM_H_ + +#if defined(__ARM_NEON) && (defined(ARCH_ARM32) || defined(ARCH_AARCH64)) +#define HAVE_NEON +#define HAVE_SIMD_128_BITS +#define CHUNKSIZE 128 +#define VECTORSIZE 16 +#endif + +#if defined(__ARM_FEATURE_SVE) +#define HAVE_SVE +#endif + +#if defined(__ARM_FEATURE_SVE2) +#define HAVE_SVE2 +#endif + +#if defined(__ARM_FEATURE_SVE2_BITPERM) +#define HAVE_SVE2_BITPERM +#endif + +#endif // UTIL_ARCH_ARM_H_ + diff --git a/src/util/arch/arm/bitutils.h b/src/util/arch/arm/bitutils.h new file mode 100644 index 000000000..04d001d30 --- /dev/null +++ b/src/util/arch/arm/bitutils.h @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * Copyright (c) 2021, Arm Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** \file + * \brief Bit-twiddling primitives (ctz, compress etc) + */ + +#ifndef BITUTILS_ARCH_ARM_H +#define BITUTILS_ARCH_ARM_H + +#include "ue2common.h" +#include "util/popcount.h" +#include "util/arch.h" +#include "util/intrinsics.h" + +#include "util/arch/common/bitutils.h" + +static really_inline +u32 clz32_impl(u32 x) { + return clz32_impl_c(x); +} + +static really_inline +u32 clz64_impl(u64a x) { + return clz64_impl_c(x); +} + +static really_inline +u32 ctz32_impl(u32 x) { + return ctz32_impl_c(x); +} + +static really_inline +u32 ctz64_impl(u64a x) { + return ctz64_impl_c(x); +} + +static really_inline +u32 lg2_impl(u32 x) { + return lg2_impl_c(x); +} + +static really_inline +u64a lg2_64_impl(u64a x) { + return lg2_64_impl_c(x); +} + +static really_inline +u32 findAndClearLSB_32_impl(u32 *v) { + return findAndClearLSB_32_impl_c(v); +} + +static really_inline +u32 findAndClearLSB_64_impl(u64a *v) { + return findAndClearLSB_64_impl_c(v); +} + +static really_inline +u32 findAndClearMSB_32_impl(u32 *v) { + return findAndClearMSB_32_impl_c(v); +} + +static really_inline +u32 findAndClearMSB_64_impl(u64a *v) { + return findAndClearMSB_64_impl_c(v); +} + +static really_inline +u32 compress32_impl(u32 x, u32 m) { + return compress32_impl_c(x, m); +} + +static really_inline +u64a compress64_impl(u64a x, u64a m) { + return compress64_impl_c(x, m); +} + +static really_inline +m128 compress128_impl(m128 x, m128 m) { + m128 one = set1_2x64(1); + m128 bb = one; + m128 res = zeroes128(); + while (isnonzero128(m)) { + m128 mm = sub_2x64(zeroes128(), m); + m128 xm = and128(x, m); + xm = and128(xm, mm); + + m128 mask = not128(eq64_m128(xm, zeroes128())); + res = or128(res, and128(bb, mask)); + m = and128(m, sub_2x64(m, one)); + bb = lshift64_m128(bb, 1); + } + return res; +} + + +#if defined(HAVE_SVE2_BITPERM) +#include "bitutils_sve.h" +#else + +static really_inline +u32 expand32_impl(u32 x, u32 m) { + return expand32_impl_c(x, m); +} + +static really_inline +u64a expand64_impl(u64a x, u64a m) { + return expand64_impl_c(x, m); +} + +#endif // HAVE_SVE2_BITPERM + +static really_inline +m128 expand128_impl(m128 x, m128 m) { + m128 one = set1_2x64(1); + m128 bb = one; + m128 res = zeroes128(); + while (isnonzero128(m)) { + m128 xm = and128(x, bb); + m128 mm = sub_2x64(zeroes128(), m); + m128 mask = not128(eq64_m128(xm, zeroes128())); + mask = and128(mask, and128(m, mm)); + res = or128(res, mask); + m = and128(m, sub_2x64(m, one)); + bb = lshift64_m128(bb, 1); + } + return res; +} + +/* returns the first set bit after begin (if not ~0U). If no bit is set after + * begin returns ~0U + */ +static really_inline +u32 bf64_iterate_impl(u64a bitfield, u32 begin) { + if (begin != ~0U) { + /* switch off all bits at or below begin. Note: not legal to shift by + * by size of the datatype or larger. 
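compress128_impl/expand128_impl above emulate a bit gather (PEXT) and bit scatter (PDEP) with a bit-serial loop, applied independently to each 64-bit lane of the vector; on SVE2_BITPERM targets the scalar expand variants use the BDEP instruction instead (see the gated include). A worked scalar example of the two operations, under the usual PEXT/PDEP semantics:

    /* compress gathers the bits of x selected by m down to the LSBs,
     * expand is the inverse scatter:
     *   compress64(x = 0xB0, m = 0xF0) == 0x0B   (bits 7..4 of x -> bits 3..0)
     *   expand64  (x = 0x0B, m = 0xF0) == 0xB0   (bits 3..0 of x -> bits 7..4)
     */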
*/ + assert(begin <= 63); + bitfield &= ~((2ULL << begin) - 1); + } + + if (!bitfield) { + return ~0U; + } + + return ctz64_impl(bitfield); +} + +static really_inline +char bf64_set_impl(u64a *bitfield, u32 i) { + return bf64_set_impl_c(bitfield, i); +} + +static really_inline +void bf64_unset_impl(u64a *bitfield, u32 i) { + return bf64_unset_impl_c(bitfield, i); +} + +static really_inline +u32 rank_in_mask32_impl(u32 mask, u32 bit) { + return rank_in_mask32_impl_c(mask, bit); +} + +static really_inline +u32 rank_in_mask64_impl(u64a mask, u32 bit) { + return rank_in_mask64_impl_c(mask, bit); +} + +static really_inline +u32 pext32_impl(u32 x, u32 mask) { + return pext32_impl_c(x, mask); +} + +static really_inline +u64a pext64_impl(u64a x, u64a mask) { + return pext64_impl_c(x, mask); +} + +static really_inline +u64a pdep64_impl(u64a x, u64a mask) { + return pdep64_impl_c(x, mask); +} + +/* compilers don't reliably synthesize the 32-bit ANDN instruction here, + * so we force its generation. + */ +static really_inline +u64a andn_impl(const u32 a, const u8 *b) { + return andn_impl_c(a, b); +} + +#endif // BITUTILS_ARCH_ARM_H diff --git a/src/util/arch/arm/bitutils_sve.h b/src/util/arch/arm/bitutils_sve.h new file mode 100644 index 000000000..1cd503d5e --- /dev/null +++ b/src/util/arch/arm/bitutils_sve.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2021, Arm Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
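bf64_iterate_impl is written to be chained: passing ~0U starts the walk at the lowest set bit, and passing the previous return value yields the next one. A small usage sketch (illustrative only, using the codebase's u64a/u32 types):

    u64a bits = 0x8012ULL;                            /* bits 1, 4 and 15 set */
    for (u32 i = bf64_iterate_impl(bits, ~0U); i != ~0U;
         i = bf64_iterate_impl(bits, i)) {
        /* visits i = 1, then 4, then 15, then the loop ends */
    }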
+ */ + +/** \file + * \brief Bit-twiddling primitives for SVE (ctz, compress etc) + */ + +static really_inline +u32 expand32_impl(u32 x, u32 m) { + return svlasta(svpfalse(), svbdep(svdup_u32(x), m)); +} + +static really_inline +u64a expand64_impl(u64a x, u64a m) { + return svlasta(svpfalse(), svbdep(svdup_u64(x), m)); +} + +static really_inline +void bdep64x2(u64a *d, const u64a *x, const m128 *m) { + svbool_t pg = svptrue_pat_b64(SV_VL2); + svst1(pg, (uint64_t *)d, svbdep(svld1_u64(pg, (const uint64_t *)x), + svld1_u64(pg, (const uint64_t *)m))); +} diff --git a/src/util/arch/arm/cpuid_flags.c b/src/util/arch/arm/cpuid_flags.c new file mode 100644 index 000000000..66040f83d --- /dev/null +++ b/src/util/arch/arm/cpuid_flags.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "util/arch/common/cpuid_flags.h" +#include "ue2common.h" +#include "hs_compile.h" // for HS_MODE_ flags +#include "util/arch.h" + +u64a cpuid_flags(void) { + return 0; +} + +u32 cpuid_tune(void) { + return HS_TUNE_FAMILY_GENERIC; +} diff --git a/src/util/arch/arm/cpuid_inline.h b/src/util/arch/arm/cpuid_inline.h new file mode 100644 index 000000000..f8a59af3e --- /dev/null +++ b/src/util/arch/arm/cpuid_inline.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2017-2020, Intel Corporation + * Copyright (c) 2023, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef AARCH64_CPUID_INLINE_H_ +#define AARCH64_CPUID_INLINE_H_ + +#if defined(__linux__) +#include +/* This is to help fix https://github.com/envoyproxy/envoy/pull/29881 + */ +#if !defined(HWCAP2_SVE2) +#include +#endif +#endif + +#include "ue2common.h" +#include "util/arch/common/cpuid_flags.h" + +static inline +int check_neon(void) { + return 1; +} + +#if defined(__linux__) +static inline +int check_sve(void) { + unsigned long hwcap = getauxval(AT_HWCAP); + if (hwcap & HWCAP_SVE) { + return 1; + } + return 0; +} + +static inline +int check_sve2(void) { + unsigned long hwcap2 = getauxval(AT_HWCAP2); + if (hwcap2 & HWCAP2_SVE2) { + return 1; + } + return 0; +} +#else +static inline +int check_sve(void) { + return 0; +} + +static inline +int check_sve2(void) { + return 0; +} +#endif + +#endif // AARCH64_CPUID_INLINE_H_ diff --git a/src/util/arch/arm/match.hpp b/src/util/arch/arm/match.hpp new file mode 100644 index 000000000..cea9c39c9 --- /dev/null +++ b/src/util/arch/arm/match.hpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
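check_sve()/check_sve2() above read the Linux auxiliary vector, so a fat-runtime resolver can pick an implementation once at startup. A hedged sketch of that pattern; the resolver and the scan_* function names are hypothetical, not part of this patch:

    typedef const u8 *(*scan_fn)(const u8 *buf, size_t len);

    /* hypothetical resolver: prefer SVE2, then SVE, then the NEON baseline */
    static scan_fn choose_scan(void) {
        if (check_sve2()) {
            return scan_sve2;   /* hypothetical SVE2 build of the kernel */
        }
        if (check_sve()) {
            return scan_sve;    /* hypothetical SVE build */
        }
        return scan_neon;       /* check_neon() is always 1 on this target */
    }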
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +template <> +really_really_inline +const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 const UNUSED len) { + uint32x4_t m = mask.u.u32x4[0]; + uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0); + if (vmax != 0) { + typename SuperVector<16>::comparemask_type z = mask.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + u32 pos = ctz64(z) / SuperVector<16>::mask_width(); + DEBUG_PRINTF("match @ pos %u\n", pos); + assert(pos < 16); + DEBUG_PRINTF("buf + pos %p\n", buf + (pos)); + return buf + pos; + } else { + return NULL; // no match + } +} + +template <> +really_really_inline +const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 const UNUSED len) { + uint32x4_t m = mask.u.u32x4[0]; + uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0); + if (vmax != 0) { + typename SuperVector<16>::comparemask_type z = mask.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + u32 pos = clz64(z) / SuperVector<16>::mask_width(); + DEBUG_PRINTF("match @ pos %u\n", pos); + return buf + (15 - pos); + } else { + return NULL; // no match + } +} + +template <> +really_really_inline +const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16 const UNUSED len) { + uint32x4_t m = mask.u.u32x4[0]; + uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0); + if (vmax != 0) { + typename SuperVector<16>::comparemask_type z = mask.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + u32 pos = ctz64(z) / SuperVector<16>::mask_width(); + DEBUG_PRINTF("match @ pos %u\n", pos); + assert(pos < 16); + DEBUG_PRINTF("buf + pos %p\n", buf + pos); + return buf + pos; + } else { + return NULL; // no match + } +} + +template <> +really_really_inline +const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16 const UNUSED len) { + uint32x4_t m = mask.u.u32x4[0]; + uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0); + if (vmax != 0) { + typename SuperVector<16>::comparemask_type z = mask.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + u32 pos = clz64(z) / SuperVector<16>::mask_width(); + DEBUG_PRINTF("match @ pos %u\n", pos); + return buf + (15 - pos); + } else { + return NULL; // no match + } +} + diff --git a/src/util/arch/arm/simd_types.h b/src/util/arch/arm/simd_types.h new file mode 100644 index 000000000..7dafcf586 --- /dev/null +++ b/src/util/arch/arm/simd_types.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
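The match helpers in match.hpp first do a cheap vpmaxq/vgetq_lane test for "any lane set" and only then pay for the full comparemask. Assuming, as the NEON shrn-based comparemask implies, that mask_width() is 4, the position arithmetic works out as follows:

    /* suppose the first matching byte is lane 3:
     *   z = 0x000000000000F000      (4 mask bits per byte lane)
     *   ctz64(z) = 12
     *   12 / mask_width() = 12 / 4 = 3   -> return buf + 3
     * last_non_zero_match mirrors this with clz64() and 15 - pos.
     */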
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SIMD_TYPES_ARM_H +#define SIMD_TYPES_ARM_H + +#if !defined(m128) && defined(HAVE_NEON) +typedef int32x4_t m128; +#endif + +#endif /* SIMD_TYPES_ARM_H */ + diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h new file mode 100644 index 000000000..7f8539b09 --- /dev/null +++ b/src/util/arch/arm/simd_utils.h @@ -0,0 +1,442 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * Copyright (c) 2021, Arm Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SIMD types and primitive operations. 
+ */ + +#ifndef ARCH_ARM_SIMD_UTILS_H +#define ARCH_ARM_SIMD_UTILS_H + +#include +#include + +#include "ue2common.h" +#include "util/simd_types.h" +#include "util/unaligned.h" +#include "util/intrinsics.h" + +#ifdef HAVE_SVE +#include "simd_utils_sve.h" +#endif + +#ifdef HAVE_SVE2 +#include "simd_utils_sve2.h" +#endif + +#include // for memcpy + +static really_inline m128 ones128(void) { + return (m128) vdupq_n_s8(0xFF); +} + +static really_inline m128 zeroes128(void) { + return (m128) vdupq_n_s32(0); +} + +/** \brief Bitwise not for m128*/ +static really_inline m128 not128(m128 a) { + return (m128) vmvnq_s32(a); +} + +/** \brief Return 1 if a and b are different otherwise 0 */ +static really_inline int diff128(m128 a, m128 b) { + uint64_t res = vget_lane_u64( + (uint64x1_t)vshrn_n_u16((uint16x8_t)vceqq_s32(a, b), 4), 0); + return (~0ull != res); +} + +static really_inline int isnonzero128(m128 a) { + return diff128(a, zeroes128()); +} + +/** + * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich128(m128 a, m128 b) { + static const uint32x4_t movemask = { 1, 2, 4, 8 }; + return vaddvq_u32(vandq_u32(vmvnq_u32(vceqq_u32((uint32x4_t)a, (uint32x4_t)b)), movemask)); +} + +/** + * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and + * returns a 4-bit mask indicating which 64-bit words contain differences. + */ +static really_inline u32 diffrich64_128(m128 a, m128 b) { + static const uint64x2_t movemask = { 1, 4 }; + return (u32) vaddvq_u64(vandq_u64((uint64x2_t)vmvnq_u32((uint32x4_t)vceqq_u64((uint64x2_t)a, (uint64x2_t)b)), movemask)); +} + +static really_really_inline +m128 add_2x64(m128 a, m128 b) { + return (m128) vaddq_u64((uint64x2_t)a, (uint64x2_t)b); +} + +static really_really_inline +m128 sub_2x64(m128 a, m128 b) { + return (m128) vsubq_u64((uint64x2_t)a, (uint64x2_t)b); +} + +static really_inline +m128 lshift_m128(m128 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return (m128) vshlq_n_u32((uint32x4_t)a, b); + } +#endif + int32x4_t shift_indices = vdupq_n_s32(b); + return (m128) vshlq_s32(a, shift_indices); +} + +static really_really_inline +m128 rshift_m128(m128 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return (m128) vshrq_n_u32((uint32x4_t)a, b); + } +#endif + int32x4_t shift_indices = vdupq_n_s32(-b); + return (m128) vshlq_s32(a, shift_indices); +} + +static really_really_inline +m128 lshift64_m128(m128 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return (m128) vshlq_n_u64((uint64x2_t)a, b); + } +#endif + int64x2_t shift_indices = vdupq_n_s64(b); + return (m128) vshlq_s64((int64x2_t) a, shift_indices); +} + +static really_really_inline +m128 rshift64_m128(m128 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return (m128) vshrq_n_u64((uint64x2_t)a, b); + } +#endif + int64x2_t shift_indices = vdupq_n_s64(-b); + return (m128) vshlq_s64((int64x2_t) a, shift_indices); +} + +static really_inline m128 eq128(m128 a, m128 b) { + return (m128) vceqq_u8((uint8x16_t)a, (uint8x16_t)b); +} + +static really_inline m128 eq64_m128(m128 a, m128 b) { + return (m128) vceqq_u64((uint64x2_t)a, (uint64x2_t)b); +} + +static really_inline u32 movemask128(m128 a) { + uint8x16_t input = vreinterpretq_u8_s32(a); + uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); + 
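diff128() above uses the common aarch64 trick of narrowing the comparison result with vshrn_n_u16(..., 4): each 16-bit lane of the all-ones/all-zeros vceqq output contributes one byte to a 64-bit scalar, so the scalar equals ~0ull exactly when every lane compared equal. Traced by hand:

    /* a == b in every lane:
     *   vceqq_s32(a, b)      -> four lanes of 0xFFFFFFFF
     *   vshrn_n_u16(..., 4)  -> 0xFFFFFFFFFFFFFFFF (each u16 lane narrows to 0xFF)
     *   res == ~0ull         -> diff128() returns 0
     * a differs from b in byte 5:
     *   32-bit lane 1 of vceqq is 0, so two narrowed bytes are 0x00,
     *   res != ~0ull         -> diff128() returns 1
     */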
uint32x4_t paired16 = + vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); + uint64x2_t paired32 = + vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); + uint8x16_t paired64 = + vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); + return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); +} + +static really_inline m128 set1_16x8(u8 c) { + return (m128) vdupq_n_u8(c); +} + +static really_inline m128 set1_4x32(u32 c) { + return (m128) vdupq_n_u32(c); +} + +static really_inline m128 set1_2x64(u64a c) { + return (m128) vdupq_n_u64(c); +} + +static really_inline u32 movd(const m128 in) { + return vgetq_lane_u32((uint32x4_t) in, 0); +} + +static really_inline u64a movq(const m128 in) { + return vgetq_lane_u64((uint64x2_t) in, 0); +} + +/* another form of movq */ +static really_inline +m128 load_m128_from_u64a(const u64a *p) { + return (m128) vsetq_lane_u64(*p, (uint64x2_t) zeroes128(), 0); +} + +static really_inline u32 extract32from128(const m128 in, unsigned imm) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(imm)) { + return vgetq_lane_u32((uint32x4_t) in, imm); + } +#endif + switch (imm) { + case 0: + return vgetq_lane_u32((uint32x4_t) in, 0); + break; + case 1: + return vgetq_lane_u32((uint32x4_t) in, 1); + break; + case 2: + return vgetq_lane_u32((uint32x4_t) in, 2); + break; + case 3: + return vgetq_lane_u32((uint32x4_t) in, 3); + break; + default: + return 0; + break; + } +} + +static really_inline u64a extract64from128(const m128 in, unsigned imm) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(imm)) { + return vgetq_lane_u64((uint64x2_t) in, imm); + } +#endif + switch (imm) { + case 0: + return vgetq_lane_u64((uint64x2_t) in, 0); + break; + case 1: + return vgetq_lane_u64((uint64x2_t) in, 1); + break; + default: + return 0; + break; + } +} + +static really_inline m128 low64from128(const m128 in) { + return (m128) vcombine_u64(vget_low_u64((uint64x2_t)in), vdup_n_u64(0)); +} + +static really_inline m128 high64from128(const m128 in) { + return (m128) vcombine_u64(vget_high_u64((uint64x2_t)in), vdup_n_u64(0)); +} + +static really_inline m128 add128(m128 a, m128 b) { + return (m128) vaddq_u64((uint64x2_t)a, (uint64x2_t)b); +} + +static really_inline m128 and128(m128 a, m128 b) { + return (m128) vandq_s8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline m128 xor128(m128 a, m128 b) { + return (m128) veorq_s8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline m128 or128(m128 a, m128 b) { + return (m128) vorrq_s8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline m128 andnot128(m128 a, m128 b) { + return (m128) vandq_s8( vmvnq_s8((int8x16_t) a), (int8x16_t) b); +} + +// aligned load +static really_inline m128 load128(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m128))); + return (m128) vld1q_s32((const int32_t *)ptr); +} + +// aligned store +static really_inline void store128(void *ptr, m128 a) { + assert(ISALIGNED_N(ptr, alignof(m128))); + vst1q_s32((int32_t *)ptr, a); +} + +// unaligned load +static really_inline m128 loadu128(const void *ptr) { + return (m128) vld1q_s32((const int32_t *)ptr); +} + +// unaligned store +static really_inline void storeu128(void *ptr, m128 a) { + vst1q_s32((int32_t *)ptr, a); +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes128(void *ptr, m128 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m128 
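movemask128() rebuilds SSE's _mm_movemask_epi8 on NEON: it shifts each byte's top bit down to bit 0, then folds neighbouring lanes with three shift-and-accumulate (vsraq) steps, so byte 0 of the folded vector ends up holding mask bits 0..7 and byte 8 holds bits 8..15. A small usage check:

    m128 v = set1_16x8(0x80);     /* every byte has its top bit set */
    u32 mask = movemask128(v);    /* expect 0xFFFF: one mask bit per byte */

    m128 w = zeroes128();
    u32 none = movemask128(w);    /* expect 0 */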
loadbytes128(const void *ptr, unsigned int n) { + m128 a = zeroes128(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vextq_s8((int8x16_t)(a), (int8x16_t)(b), (offset)); break; + +static really_really_inline +m128 palignr_imm(m128 r, m128 l, int offset) { + switch (offset) { + case 0: return l; break; + CASE_ALIGN_VECTORS(l, r, 1); + CASE_ALIGN_VECTORS(l, r, 2); + CASE_ALIGN_VECTORS(l, r, 3); + CASE_ALIGN_VECTORS(l, r, 4); + CASE_ALIGN_VECTORS(l, r, 5); + CASE_ALIGN_VECTORS(l, r, 6); + CASE_ALIGN_VECTORS(l, r, 7); + CASE_ALIGN_VECTORS(l, r, 8); + CASE_ALIGN_VECTORS(l, r, 9); + CASE_ALIGN_VECTORS(l, r, 10); + CASE_ALIGN_VECTORS(l, r, 11); + CASE_ALIGN_VECTORS(l, r, 12); + CASE_ALIGN_VECTORS(l, r, 13); + CASE_ALIGN_VECTORS(l, r, 14); + CASE_ALIGN_VECTORS(l, r, 15); + case 16: return r; break; + default: + return zeroes128(); + break; + } +} + +static really_really_inline +m128 palignr(m128 r, m128 l, int offset) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(offset)) { + return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset); + } +#endif + return palignr_imm(r, l, offset); +} +#undef CASE_ALIGN_VECTORS + +static really_really_inline +m128 rshiftbyte_m128(m128 a, unsigned b) { + if (b == 0) { + return a; + } + return palignr(zeroes128(), a, b); +} + +static really_really_inline +m128 lshiftbyte_m128(m128 a, unsigned b) { + if (b == 0) { + return a; + } + return palignr(a, zeroes128(), 16 - b); +} + +static really_inline +m128 variable_byte_shift_m128(m128 in, s32 amount) { + assert(amount >= -16 && amount <= 16); + if (amount < 0) { + return palignr_imm(zeroes128(), in, -amount); + } else { + return palignr_imm(in, zeroes128(), 16 - amount); + } +} + +static really_inline +m128 mask1bit128(unsigned int n) { + assert(n < sizeof(m128) * 8); + static m128 onebit = { 1, 0 }; + m128 mask = lshiftbyte_m128( onebit, n / 8 ); + return lshift64_m128( mask, n % 8 ); +} + +// switches on bit N in the given vector. +static really_inline +void setbit128(m128 *ptr, unsigned int n) { + *ptr = or128(mask1bit128(n), *ptr); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit128(m128 *ptr, unsigned int n) { + *ptr = andnot128(mask1bit128(n), *ptr); +} + +// tests bit N in the given vector. +static really_inline +char testbit128(m128 val, unsigned int n) { + const m128 mask = mask1bit128(n); + + return isnonzero128(and128(mask, val)); +} + +static really_inline +m128 pshufb_m128(m128 a, m128 b) { + /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf. + In NEON, if >=16, then the result is zero, otherwise it is that lane. + btranslated is the version that is converted from Intel to NEON. 
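palignr(r, l, n) mirrors SSSE3 PALIGNR via vextq_s8: it returns bytes n..15 of l followed by bytes 0..n-1 of r, and the byte-shift helpers are built on top of it. Worked with a small shift count:

    /* lshiftbyte_m128(a, 3) == palignr(a, zeroes128(), 13)
     *   -> bytes 0..2 of the result are zero, bytes 3..15 are a[0..12]
     * rshiftbyte_m128(a, 3) == palignr(zeroes128(), a, 3)
     *   -> bytes 0..12 of the result are a[3..15], bytes 13..15 are zero
     * variable_byte_shift_m128(a, +3) is the left shift, (a, -3) the right shift.
     */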
*/ + int8x16_t btranslated = vandq_s8((int8x16_t)b,vdupq_n_s8(0x8f)); + return (m128)vqtbl1q_s8((int8x16_t)a, (uint8x16_t)btranslated); +} + +static really_inline +m128 max_u8_m128(m128 a, m128 b) { + return (m128) vmaxq_u8((uint8x16_t)a, (uint8x16_t)b); +} + +static really_inline +m128 min_u8_m128(m128 a, m128 b) { + return (m128) vminq_u8((uint8x16_t)a, (uint8x16_t)b); +} + +static really_inline +m128 sadd_u8_m128(m128 a, m128 b) { + return (m128) vqaddq_u8((uint8x16_t)a, (uint8x16_t)b); +} + +static really_inline +m128 sub_u8_m128(m128 a, m128 b) { + return (m128) vsubq_u8((uint8x16_t)a, (uint8x16_t)b); +} + +static really_inline +m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) { + uint32_t ALIGN_ATTR(16) data[4] = { x0, x1, x2, x3 }; + return (m128) vld1q_u32((uint32_t *) data); +} + +static really_inline +m128 set2x64(u64a hi, u64a lo) { + uint64_t ALIGN_ATTR(16) data[2] = { lo, hi }; + return (m128) vld1q_u64((uint64_t *) data); +} + +#endif // ARCH_ARM_SIMD_UTILS_H diff --git a/src/util/arch/arm/simd_utils_sve.h b/src/util/arch/arm/simd_utils_sve.h new file mode 100644 index 000000000..48a4a9338 --- /dev/null +++ b/src/util/arch/arm/simd_utils_sve.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2021, Arm Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SVE primitive operations. 
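pshufb_m128() reconciles the two zeroing rules: SSSE3 zeroes a destination byte when bit 7 of the index is set, while NEON's vqtbl1q zeroes it when the index is 16 or larger. AND-ing the index with 0x8f keeps bit 7 (so any Intel "zero me" index stays out of range for TBL) and keeps the low nibble (the lane selector). Checked by hand:

    /* index byte 0x23: 0x23 & 0x8f = 0x03 -> TBL picks lane 3, same as Intel's & 0x0f
     * index byte 0x83: 0x83 & 0x8f = 0x83 -> 0x83 >= 16, TBL writes 0x00, same as Intel
     */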
+ */ + +really_really_inline +uint64_t accelSearchGetOffset(svbool_t matched) { + return svcntp_b8(svptrue_b8(), svbrkb_z(svptrue_b8(), matched)); +} + +really_really_inline +const u8 *accelSearchCheckMatched(const u8 *buf, svbool_t matched) { + if (unlikely(svptest_any(svptrue_b8(), matched))) { + const u8 *matchPos = buf + accelSearchGetOffset(matched); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; + } + return NULL; +} + +really_really_inline +const u8 *accelRevSearchCheckMatched(const u8 *buf, svbool_t matched) { + if (unlikely(svptest_any(svptrue_b8(), matched))) { + const u8 *matchPos = buf + (svcntb() - + svcntp_b8(svptrue_b8(), svbrka_z(svptrue_b8(), svrev_b8(matched)))); + DEBUG_PRINTF("match pos %p\n", matchPos); + return matchPos; + } + return NULL; +} + +static really_inline +svuint8_t getSVEMaskFrom128(m128 mask) { + return svld1_u8(svptrue_pat_b8(SV_VL16), (const uint8_t *)&mask); +} \ No newline at end of file diff --git a/src/util/arch/arm/simd_utils_sve2.h b/src/util/arch/arm/simd_utils_sve2.h new file mode 100644 index 000000000..188ef3fff --- /dev/null +++ b/src/util/arch/arm/simd_utils_sve2.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2021, Arm Limited + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SVE primitive operations. 
+ */ + +static really_inline +svuint8_t getCharMaskSingle(const u8 c, bool noCase) { + if (noCase) { + uint16_t chars_u16 = (c & 0xdf) | ((c | 0x20) << 8); + return svreinterpret_u8(svdup_u16(chars_u16)); + } else { + return svdup_u8(c); + } +} + +static really_inline +svuint16_t getCharMaskDouble(const u8 c0, const u8 c1, bool noCase) { + if (noCase) { + const uint64_t lowerFirst = c0 & 0xdf; + const uint64_t upperFirst = c0 | 0x20; + const uint64_t lowerSecond = c1 & 0xdf; + const uint64_t upperSecond = c1 | 0x20; + const uint64_t chars = lowerFirst | (lowerSecond << 8) + | (lowerFirst << 16) | (upperSecond) << 24 + | (upperFirst << 32) | (lowerSecond) << 40 + | (upperFirst << 48) | (upperSecond) << 56; + return svreinterpret_u16(svdup_u64(chars)); + } else { + uint16_t chars_u16 = c0 | (c1 << 8); + return svdup_u16(chars_u16); + } +} \ No newline at end of file diff --git a/src/util/arch/common/bitutils.h b/src/util/arch/common/bitutils.h new file mode 100644 index 000000000..e5ab0d058 --- /dev/null +++ b/src/util/arch/common/bitutils.h @@ -0,0 +1,441 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Bit-twiddling primitives (ctz, compress etc) + */ + +#ifndef BITUTILS_ARCH_COMMON_H +#define BITUTILS_ARCH_COMMON_H + +#include "util/popcount.h" +#include "util/unaligned.h" +#include "util/simd_utils.h" + +static really_inline +u32 clz32_impl_c(u32 x) { + return (u32)__builtin_clz(x); +} + +static really_inline +u32 clz64_impl_c(u64a x) { + return (u32)__builtin_clzll(x); +} + +// CTZ (count trailing zero) implementations. 
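getCharMaskSingle() above builds a caseless search pattern by packing the upper-case form (c & 0xdf) and the lower-case form (c | 0x20) of an ASCII letter into one repeated 16-bit unit; getCharMaskDouble does the same for a two-character pair, enumerating all four case combinations. Worked for 'a':

    /* c = 'a' = 0x61
     *   c & 0xdf = 0x41 ('A')  -> low byte of the u16
     *   c | 0x20 = 0x61 ('a')  -> high byte of the u16
     * svdup_u16(0x6141) viewed as bytes is 'A','a','A','a',... so a compare
     * at every byte offset can hit either case of the letter.
     */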
+static really_inline +u32 ctz32_impl_c(u32 x) { + return (u32)__builtin_ctz(x); +} + +static really_inline +u32 ctz64_impl_c(u64a x) { + return (u32)__builtin_ctzll(x); +} + +static really_inline +u32 lg2_impl_c(u32 x) { + if (!x) { + return 0; + } + return 31 - clz32_impl_c(x); +} + +static really_inline +u64a lg2_64_impl_c(u64a x) { + if (!x) { + return 0; + } + return 63 - clz64_impl_c(x); +} + +static really_inline +u32 findAndClearLSB_32_impl_c(u32 *v) { + u32 val = *v; + u32 offset = ctz32_impl_c(val); + *v = val & (val - 1); + + assert(offset < 32); + return offset; +} + +static really_inline +u32 findAndClearLSB_64_impl_c(u64a *v) { +#ifdef ARCH_64_BIT + // generic variant using gcc's builtin on 64-bit + u64a val = *v, offset; + offset = ctz64_impl_c(val); + *v = val & (val - 1); +#else + // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't + // inline calls to __builtin_ctzll + u32 v1 = (u32)*v; + u32 v2 = (u32)(*v >> 32); + u32 offset; + if (v1) { + offset = findAndClearLSB_32_impl_c(&v1); + *v = (u64a)v1 | ((u64a)v2 << 32); + } else { + offset = findAndClearLSB_32_impl_c(&v2) + 32; + *v = (u64a)v2 << 32; + } +#endif + + assert(offset < 64); + return (u32)offset; +} + +static really_inline +u32 findAndClearMSB_32_impl_c(u32 *v) { + u32 val = *v; + u32 offset = 31 - clz32_impl_c(val); + *v = val & ~(1 << offset); + + assert(offset < 32); + return offset; +} + +static really_inline +u32 findAndClearMSB_64_impl_c(u64a *v) { +#ifdef ARCH_64_BIT + // generic variant using gcc's builtin on 64-bit + u64a val = *v, offset; + offset = 63 - clz64_impl_c(val); + *v = val & ~(1ULL << offset); +#else + // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't + // inline calls to __builtin_ctzll + u32 v1 = (u32)*v; + u32 v2 = (*v >> 32); + u32 offset; + if (v2) { + offset = findAndClearMSB_32_impl_c(&v2) + 32; + *v = ((u64a)v2 << 32) | (u64a)v1; + } else { + offset = findAndClearMSB_32_impl_c(&v1); + *v = (u64a)v1; + } +#endif + + assert(offset < 64); + return (u32)offset; +} + +static really_inline +u32 compress32_impl_c(u32 x, u32 m) { + + // Return zero quickly on trivial cases + if ((x & m) == 0) { + return 0; + } + + u32 mk, mp, mv, t; + + x &= m; // clear irrelevant bits + + mk = ~m << 1; // we will count 0's to right + for (u32 i = 0; i < 5; i++) { + mp = mk ^ (mk << 1); + mp ^= mp << 2; + mp ^= mp << 4; + mp ^= mp << 8; + mp ^= mp << 16; + + mv = mp & m; // bits to move + m = (m ^ mv) | (mv >> (1 << i)); // compress m + t = x & mv; + x = (x ^ t) | (t >> (1 << i)); // compress x + mk = mk & ~mp; + } + + return x; +} + +static really_inline +u64a compress64_impl_c(u64a x, u64a m) { + u64a res = 0; + for (u64a bb = 1; m != 0; bb += bb) { + if (x & m & -m) { res |= bb; } + m &= (m - 1); + } + return res; +/* // Return zero quickly on trivial cases + if ((x & m) == 0) { + return 0; + } + + u64a mk, mp, mv, t; + + x &= m; // clear irrelevant bits + + mk = ~m << 1; // we will count 0's to right + for (u32 i = 0; i < 6; i++) { + mp = mk ^ (mk << 1); + mp ^= mp << 2; + mp ^= mp << 4; + mp ^= mp << 8; + mp ^= mp << 16; + mp ^= mp << 32; + + mv = mp & m; // bits to move + m = (m ^ mv) | (mv >> (1 << i)); // compress m + t = x & mv; + x = (x ^ t) | (t >> (1 << i)); // compress x + mk = mk & ~mp; + } + + return x;*/ +} + +static really_inline +m128 compress128_impl_c(m128 x, m128 m) { + m128 one = set1_2x64(1); + m128 bitset = one; + m128 vres = zeroes128(); + while (isnonzero128(m)) { + m128 mm = sub_2x64(zeroes128(), m); + m128 tv = and128(x, m); + tv = 
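compress32_impl_c() above is the classic Hacker's Delight bit gather (PEXT) done in five parallel-suffix rounds: it packs the bits of x selected by m down to the least-significant end. A concrete check:

    /* x = 0xB0 (1011 0000), m = 0xF0 (1111 0000)
     * the selected bits of x, read from bit 4 upwards, are 1,1,0,1
     * compress32_impl_c(0xB0, 0xF0) == 0x0B (0000 1011)
     */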
and128(tv, mm); + + m128 mask = not128(eq64_m128(tv, zeroes128())); + mask = and128(bitset, mask); + vres = or128(vres, mask); + m = and128(m, sub_2x64(m, one)); + bitset = lshift64_m128(bitset, 1); + } + return vres; +} + +static really_inline +u32 expand32_impl_c(u32 x, u32 m) { + // Return zero quickly on trivial cases + if (!x || !m) { + return 0; + } + + u32 m0, mk, mp, mv, t; + u32 array[5]; + + m0 = m; // save original mask + mk = ~m << 1; // we will count 0's to right + + for (int i = 0; i < 5; i++) { + mp = mk ^ (mk << 1); // parallel suffix + mp = mp ^ (mp << 2); + mp = mp ^ (mp << 4); + mp = mp ^ (mp << 8); + mp = mp ^ (mp << 16); + mv = mp & m; // bits to move + array[i] = mv; + m = (m ^ mv) | (mv >> (1 << i)); // compress m + mk = mk & ~mp; + } + + for (int i = 4; i >= 0; i--) { + mv = array[i]; + t = x << (1 << i); + x = (x & ~mv) | (t & mv); + } + + return x & m0; // clear out extraneous bits +} + +static really_inline +u64a expand64_impl_c(u64a x, u64a m) { + + u64a res = 0; + for (u64a bb = 1; m != 0; bb += bb) { + if (x & bb) { res |= m & (-m); } + m &= (m - 1); + } + return res; +/* // Return zero quickly on trivial cases + if (!x || !m) { + return 0; + } + + u64a m0, mk, mp, mv, t; + u64a array[6]; + + m0 = m; // save original mask + mk = ~m << 1; // we will count 0's to right + + for (int i = 0; i < 6; i++) { + mp = mk ^ (mk << 1); // parallel suffix + mp = mp ^ (mp << 2); + mp = mp ^ (mp << 4); + mp = mp ^ (mp << 8); + mp = mp ^ (mp << 16); + mp = mp ^ (mp << 32); + mv = mp & m; // bits to move + array[i] = mv; + m = (m ^ mv) | (mv >> (1 << i)); // compress m + mk = mk & ~mp; + } + + for (int i = 5; i >= 0; i--) { + mv = array[i]; + t = x << (1 << i); + x = (x & ~mv) | (t & mv); + } + + return x & m0; // clear out extraneous bits*/ +} + +static really_inline +m128 expand128_impl_c(m128 x, m128 m) { + m128 one = set1_2x64(1); + m128 bb = one; + m128 res = zeroes128(); + while (isnonzero128(m)) { + m128 xm = and128(x, bb); + m128 mm = sub_2x64(zeroes128(), m); + m128 mask = not128(eq64_m128(xm, zeroes128())); + mask = and128(mask, and128(m,mm)); + res = or128(res, mask); + m = and128(m, sub_2x64(m, one)); + bb = lshift64_m128(bb, 1); + } + return res; +} + +/* returns the first set bit after begin (if not ~0U). If no bit is set after + * begin returns ~0U + */ +static really_inline +u32 bf64_iterate_impl_c(u64a bitfield, u32 begin) { + if (begin != ~0U) { + /* switch off all bits at or below begin. Note: not legal to shift by + * by size of the datatype or larger. */ + assert(begin <= 63); + bitfield &= ~((2ULL << begin) - 1); + } + + if (!bitfield) { + return ~0U; + } + + return ctz64_impl_c(bitfield); +} + +static really_inline +char bf64_set_impl_c(u64a *bitfield, u32 i) { + u64a mask = 1ULL << i; + char was_set = !!(*bitfield & mask); + *bitfield |= mask; + + return was_set; +} + +static really_inline +void bf64_unset_impl_c(u64a *bitfield, u32 i) { + *bitfield &= ~(1ULL << i); +} + +static really_inline +u32 rank_in_mask32_impl_c(u32 mask, u32 bit) { + mask &= (u32)(1U << bit) - 1; + return popcount32(mask); +} + +static really_inline +u32 rank_in_mask64_impl_c(u64a mask, u32 bit) { + mask &= (u64a)(1ULL << bit) - 1; + return popcount64(mask); +} + +static really_inline +u32 pext32_impl_c(u32 x, u32 mask) { + + u32 result = 0, num = 1; + while (mask != 0) { + u32 bit = findAndClearLSB_32_impl_c(&mask); + if (x & (1U << bit)) { + assert(num != 0); // more than 32 bits! 
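rank_in_mask32_impl_c(mask, bit) returns how many set bits of mask lie strictly below position bit, i.e. the rank of that position within the mask. For example:

    /* mask = 0b10110, bit = 4:
     *   mask & ((1u << 4) - 1) = 0b00110
     *   popcount32(0b00110)    = 2
     * so bit position 4 is the third set bit of the mask (rank 2).
     */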
+ result |= num; + } + num <<= 1; + } + return result; +} + +static really_inline +u64a pext64_impl_c(u64a x, u64a mask) { + + u32 result = 0, num = 1; + while (mask != 0) { + u32 bit = findAndClearLSB_64_impl_c(&mask); + if (x & (1ULL << bit)) { + assert(num != 0); // more than 32 bits! + result |= num; + } + num <<= 1; + } + return result; +} + +static really_inline +u64a pdep64_impl_c(u64a x, u64a _m) { + /* Taken from: + * https://gcc.gnu.org/legacy-ml/gcc-patches/2017-06/msg01408.html + */ + + u64a result = 0x0UL; + const u64a mask = 0x8000000000000000UL; + u64a m = _m; + u64a c, t; + u64a p; + + /* The pop-count of the mask gives the number of the bits from + source to process. This is also needed to shift bits from the + source into the correct position for the result. */ + p = 64 - __builtin_popcountl (_m); + + /* The loop is for the number of '1' bits in the mask and clearing + each mask bit as it is processed. */ + while (m != 0) + { + c = __builtin_clzl (m); + t = x << (p - c); + m ^= (mask >> c); + result |= (t & (mask >> c)); + p++; + } + return (result); +} + +/* compilers don't reliably synthesize the 32-bit ANDN instruction here, + * so we force its generation. + */ +static really_inline +u64a andn_impl_c(const u32 a, const u8 *b) { + return unaligned_load_u32(b) & ~a; +} + +#endif // BITUTILS_ARCH_COMMON_H diff --git a/src/util/cpuid_flags.h b/src/util/arch/common/cpuid_flags.h similarity index 94% rename from src/util/cpuid_flags.h rename to src/util/arch/common/cpuid_flags.h index 527c6d52f..c1bbdc664 100644 --- a/src/util/cpuid_flags.h +++ b/src/util/arch/common/cpuid_flags.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -31,7 +32,7 @@ #include "ue2common.h" -#if !defined(_WIN32) && !defined(CPUID_H_) +#if (defined(ARCH_IA32) || defined(ARCH_X86_64)) && !defined(CPUID_H_) #include /* system header doesn't have a header guard */ #define CPUID_H_ diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h new file mode 100644 index 000000000..24331b103 --- /dev/null +++ b/src/util/arch/common/simd_utils.h @@ -0,0 +1,806 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2020-2023, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
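pdep64_impl_c() above scatters the low bits of x into the set-bit positions of the mask (the inverse of compress/PEXT), walking the mask from its most-significant set bit downwards. A small check of the intended result:

    /* x = 0b101, m = 0b11010 (mask bits at positions 1, 3 and 4)
     *   bit 0 of x -> position 1, bit 1 of x -> position 3, bit 2 of x -> position 4
     * pdep64_impl_c(0b101, 0b11010) == 0b10010
     */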
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SIMD types and primitive operations. + */ + +#ifndef ARCH_COMMON_SIMD_UTILS_H +#define ARCH_COMMON_SIMD_UTILS_H + +#include "ue2common.h" +#include "util/simd_types.h" +#include "util/unaligned.h" +#include "util/intrinsics.h" + +#include // for memcpy + +#if !defined(HAVE_SIMD_128_BITS) && !defined(VS_SIMDE_BACKEND) +#error "You need at least a 128-bit capable SIMD engine!" +#endif // HAVE_SIMD_128_BITS + +#ifdef DEBUG +static inline void print_m128_16x8(const char *label, m128 vec) { + uint8_t ALIGN_ATTR(16) data[16]; + store128(data, vec); + DEBUG_PRINTF("%12s: ", label); + for(int i=15; i >=0; i--) + printf("%02x ", data[i]); + printf("\n"); +} + +static inline void print_m128_8x16(const char *label, m128 vec) { + uint16_t ALIGN_ATTR(16) data[8]; + store128(data, vec); + DEBUG_PRINTF("%12s: ", label); + for(int i=7; i >= 0; i--) + printf("%04x ", data[i]); + printf("\n"); +} + +static inline void print_m128_4x32(const char *label, m128 vec) { + uint32_t ALIGN_ATTR(16) data[4]; + store128(data, vec); + DEBUG_PRINTF("%12s: ", label); + for(int i=3; i >= 0; i--) + printf("%08x ", data[i]); + printf("\n"); +} + +static inline void print_m128_2x64(const char *label, m128 vec) { + uint64_t ALIGN_ATTR(16) data[2]; + store128(data, vec); + DEBUG_PRINTF("%12s: ", label); + for(int i=1; i >= 0; i--) + printf("%016lx ", data[i]); + printf("\n"); +} +#else +#define print_m128_16x8(label, vec) ; +#define print_m128_8x16(label, vec) ; +#define print_m128_4x32(label, vec) ; +#define print_m128_2x64(label, vec) ; +#endif + +#if !defined(ARCH_IA32) && !defined(ARCH_X86_64) && !defined(VS_SIMDE_BACKEND) +#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 + +/** \brief LUT for the mask1bit functions. 
*/ +ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = { + ZEROES_32, ZEROES_32, + ZEROES_31, 0x01, ZEROES_32, + ZEROES_31, 0x02, ZEROES_32, + ZEROES_31, 0x04, ZEROES_32, + ZEROES_31, 0x08, ZEROES_32, + ZEROES_31, 0x10, ZEROES_32, + ZEROES_31, 0x20, ZEROES_32, + ZEROES_31, 0x40, ZEROES_32, + ZEROES_31, 0x80, ZEROES_32, + ZEROES_32, ZEROES_32, +}; +#endif // !defined(ARCH_IA32) && !defined(ARCH_X86_64) + +/**** + **** 256-bit Primitives + ****/ + +#if !defined(HAVE_SIMD_256_BITS) + +static really_really_inline +m256 lshift64_m256(m256 a, int b) { + m256 rv = a; + rv.lo = lshift64_m128(rv.lo, b); + rv.hi = lshift64_m128(rv.hi, b); + return rv; +} + +static really_inline +m256 rshift64_m256(m256 a, int b) { + m256 rv = a; + rv.lo = rshift64_m128(rv.lo, b); + rv.hi = rshift64_m128(rv.hi, b); + return rv; +} + +static really_inline +m256 eq256(m256 a, m256 b) { + m256 rv; + rv.lo = eq128(a.lo, b.lo); + rv.hi = eq128(a.hi, b.hi); + return rv; +} + +static really_inline +u32 movemask256(m256 a) { + u32 lo_mask = movemask128(a.lo); + u32 hi_mask = movemask128(a.hi); + return lo_mask | (hi_mask << 16); +} + +static really_inline m256 set1_4x64(u64a c) { + m128 a128 = set1_2x64(c); + m256 rv = {a128, a128}; + return rv; +} + +static really_inline +m256 set1_2x128(m128 a) { + m256 rv = {a, a}; + return rv; +} + +static really_inline m256 zeroes256(void) { + m256 rv = {zeroes128(), zeroes128()}; + return rv; +} + +static really_inline m256 ones256(void) { + m256 rv = {ones128(), ones128()}; + return rv; +} + +static really_inline m256 add256(m256 a, m256 b) { + m256 rv; + rv.lo = add128(a.lo, b.lo); + rv.hi = add128(a.hi, b.hi); + return rv; +} + +static really_inline m256 and256(m256 a, m256 b) { + m256 rv; + rv.lo = and128(a.lo, b.lo); + rv.hi = and128(a.hi, b.hi); + return rv; +} + +static really_inline m256 or256(m256 a, m256 b) { + m256 rv; + rv.lo = or128(a.lo, b.lo); + rv.hi = or128(a.hi, b.hi); + return rv; +} + +static really_inline m256 xor256(m256 a, m256 b) { + m256 rv; + rv.lo = xor128(a.lo, b.lo); + rv.hi = xor128(a.hi, b.hi); + return rv; +} + +static really_inline m256 not256(m256 a) { + m256 rv; + rv.lo = not128(a.lo); + rv.hi = not128(a.hi); + return rv; +} + +static really_inline m256 andnot256(m256 a, m256 b) { + m256 rv; + rv.lo = andnot128(a.lo, b.lo); + rv.hi = andnot128(a.hi, b.hi); + return rv; +} + +static really_inline int diff256(m256 a, m256 b) { + return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); +} + +static really_inline int isnonzero256(m256 a) { + return isnonzero128(or128(a.lo, a.hi)); +} + +/** + * "Rich" version of diff256(). Takes two vectors a and b and returns a 8-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline +u32 diffrich256(m256 a, m256 b) { + return diffrich128(a.lo, b.lo) | (diffrich128(a.hi, b.hi) << 4); +} + +/** + * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and + * returns an 8-bit mask indicating which 64-bit words contain differences. 
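When no native 256-bit engine is available, m256 is simply a {lo, hi} pair of m128 and every primitive is applied per half; movemask256, for instance, splices the two 16-bit lane masks together. A small usage sketch:

    m256 v = set1_32x8(0x80);     /* top bit set in all 32 bytes           */
    u32 mask = movemask256(v);    /* lo half -> bits 0..15, hi -> bits 16..31
                                   * expect 0xFFFFFFFF                     */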
+ */ +static really_inline u32 diffrich64_256(m256 a, m256 b) { + u32 d = diffrich256(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline m256 load256(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m256))); + m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; + return rv; +} + +// aligned load of 128-bit value to low and high part of 256-bit value +static really_inline m256 load2x128(const void *ptr) { + return set1_2x128(load128(ptr)); +} + +static really_inline m256 loadu2x128(const void *ptr) { + return set1_2x128(loadu128(ptr)); +} + +// aligned store +static really_inline void store256(void *ptr, m256 a) { + assert(ISALIGNED_N(ptr, alignof(m256))); + ptr = vectorscan_assume_aligned(ptr, 16); + *(m256 *)ptr = a; +} + +// unaligned load +static really_inline m256 loadu256(const void *ptr) { + m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) }; + return rv; +} + +// unaligned store +static really_inline void storeu256(void *ptr, m256 a) { + storeu128(ptr, a.lo); + storeu128((char *)ptr + 16, a.hi); +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes256(void *ptr, m256 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m256 loadbytes256(const void *ptr, unsigned int n) { + m256 a = zeroes256(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline +m256 mask1bit256(unsigned int n) { + assert(n < sizeof(m256) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu256(&simd_onebit_masks[mask_idx]); +} + +static really_inline +m256 set1_32x8(u32 in) { + m256 rv; + rv.hi = set1_16x8(in); + rv.lo = set1_16x8(in); + return rv; +} + +static really_inline +m256 set8x32(u32 hi_3, u32 hi_2, u32 hi_1, u32 hi_0, u32 lo_3, u32 lo_2, u32 lo_1, u32 lo_0) { + m256 rv; + rv.hi = set4x32(hi_3, hi_2, hi_1, hi_0); + rv.lo = set4x32(lo_3, lo_2, lo_1, lo_0); + return rv; +} + +static really_inline +m256 set4x64(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { + m256 rv; + rv.hi = set2x64(hi_1, hi_0); + rv.lo = set2x64(lo_1, lo_0); + return rv; +} + +// switches on bit N in the given vector. +static really_inline +void setbit256(m256 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 128; + } + setbit128(sub, n); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit256(m256 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 128; + } + clearbit128(sub, n); +} + +// tests bit N in the given vector. 
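mask1bit256() carves a one-bit vector out of the simd_onebit_masks table instead of shifting at run time: the table holds one 64-byte stride per bit-within-byte value, with the single non-zero byte at offset 95 + 64*k, and the index arithmetic subtracts the byte number so the unaligned 32-byte load lands that byte at the right lane. Worked for n = 9:

    /* n = 9 (byte 1, bit 1 of the vector)
     *   mask_idx = (9 % 8) * 64 + 95 - 9 / 8 = 64 + 95 - 1 = 158
     *   loadu256(&simd_onebit_masks[158]):
     *     result byte 0 = table[158] = 0x00
     *     result byte 1 = table[159] = 0x02   (bit 1 within byte 1)
     *   -> exactly bit 9 of the 256-bit vector is set
     */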
+static really_inline +char testbit256(m256 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; + if (n < 128) { + sub = val.lo; + } else { + sub = val.hi; + n -= 128; + } + return testbit128(sub, n); +} + +static really_really_inline +m128 movdq_hi(m256 x) { + return x.hi; +} + +static really_really_inline +m128 movdq_lo(m256 x) { + return x.lo; +} + +static really_inline +m256 combine2x128(m128 hi, m128 lo) { + m256 rv = {lo, hi}; + return rv; +} + +static really_inline +m256 pshufb_m256(m256 a, m256 b) { + m256 rv; + rv.lo = pshufb_m128(a.lo, b.lo); + rv.hi = pshufb_m128(a.hi, b.hi); + return rv; +} + +#endif // HAVE_SIMD_256_BITS + +/**** + **** 384-bit Primitives + ****/ + +static really_inline m384 and384(m384 a, m384 b) { + m384 rv; + rv.lo = and128(a.lo, b.lo); + rv.mid = and128(a.mid, b.mid); + rv.hi = and128(a.hi, b.hi); + return rv; +} + +static really_inline m384 or384(m384 a, m384 b) { + m384 rv; + rv.lo = or128(a.lo, b.lo); + rv.mid = or128(a.mid, b.mid); + rv.hi = or128(a.hi, b.hi); + return rv; +} + +static really_inline m384 xor384(m384 a, m384 b) { + m384 rv; + rv.lo = xor128(a.lo, b.lo); + rv.mid = xor128(a.mid, b.mid); + rv.hi = xor128(a.hi, b.hi); + return rv; +} +static really_inline m384 not384(m384 a) { + m384 rv; + rv.lo = not128(a.lo); + rv.mid = not128(a.mid); + rv.hi = not128(a.hi); + return rv; +} +static really_inline m384 andnot384(m384 a, m384 b) { + m384 rv; + rv.lo = andnot128(a.lo, b.lo); + rv.mid = andnot128(a.mid, b.mid); + rv.hi = andnot128(a.hi, b.hi); + return rv; +} + +static really_really_inline +m384 lshift64_m384(m384 a, unsigned b) { + m384 rv; + rv.lo = lshift64_m128(a.lo, b); + rv.mid = lshift64_m128(a.mid, b); + rv.hi = lshift64_m128(a.hi, b); + return rv; +} + +static really_inline m384 zeroes384(void) { + m384 rv = {zeroes128(), zeroes128(), zeroes128()}; + return rv; +} + +static really_inline m384 ones384(void) { + m384 rv = {ones128(), ones128(), ones128()}; + return rv; +} + +static really_inline int diff384(m384 a, m384 b) { + return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi); +} + +static really_inline int isnonzero384(m384 a) { + return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); +} + +/** + * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline +u32 diffrich384(m384 a, m384 b) { + return diffrich128(a.lo, b.lo) | (diffrich128(a.mid, b.mid) << 4) | (diffrich128(a.hi, b.hi) << 8); +} + +/** + * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and + * returns a 12-bit mask indicating which 64-bit words contain differences. 
+ */ +static really_inline u32 diffrich64_384(m384 a, m384 b) { + u32 d = diffrich384(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline m384 load384(const void *ptr) { + assert(ISALIGNED_16(ptr)); + m384 rv = { load128(ptr), load128((const char *)ptr + 16), + load128((const char *)ptr + 32) }; + return rv; +} + +// aligned store +static really_inline void store384(void *ptr, m384 a) { + assert(ISALIGNED_16(ptr)); + ptr = vectorscan_assume_aligned(ptr, 16); + *(m384 *)ptr = a; +} + +// unaligned load +static really_inline m384 loadu384(const void *ptr) { + m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16), + loadu128((const char *)ptr + 32)}; + return rv; +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes384(void *ptr, m384 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m384 loadbytes384(const void *ptr, unsigned int n) { + m384 a = zeroes384(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +// switches on bit N in the given vector. +static really_inline +void setbit384(m384 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else if (n < 256) { + sub = &ptr->mid; + } else { + sub = &ptr->hi; + } + setbit128(sub, n % 128); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit384(m384 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else if (n < 256) { + sub = &ptr->mid; + } else { + sub = &ptr->hi; + } + clearbit128(sub, n % 128); +} + +// tests bit N in the given vector. +static really_inline +char testbit384(m384 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; + if (n < 128) { + sub = val.lo; + } else if (n < 256) { + sub = val.mid; + } else { + sub = val.hi; + } + return testbit128(sub, n % 128); +} + + +/**** + **** 512-bit Primitives + ****/ + +#if !defined(HAVE_SIMD_512_BITS) + +static really_inline +m512 zeroes512(void) { + m512 rv = {zeroes256(), zeroes256()}; + return rv; +} + +static really_inline +m512 ones512(void) { + m512 rv = {ones256(), ones256()}; + return rv; +} + +static really_inline +m512 set1_64x8(u8 a) { + m256 a256 = set1_32x8(a); + m512 rv = {a256, a256}; + return rv; +} + +static really_inline +m512 set1_8x64(u64a a) { + m256 a256 = set1_4x64(a); + m512 rv = {a256, a256}; + return rv; +} + +static really_inline +m512 set8x64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, + u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { + m512 rv; + rv.lo = set4x64(lo_3, lo_2, lo_1, lo_0); + rv.hi = set4x64(hi_3, hi_2, hi_1, hi_0); + return rv; +} +/* +static really_inline +m512 swap256in512(m512 a) { + m512 idx = set8x64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); + return vpermq512(idx, a); +}*/ + +static really_inline +m512 set1_4x128(m128 a) { + m256 a256 = set1_2x128(a); + m512 rv = {a256, a256}; + return rv; +} + +static really_inline +m512 add512(m512 a, m512 b) { + m512 rv; + rv.lo = add256(a.lo, b.lo); + rv.hi = add256(a.hi, b.hi); + return rv; +} + +static really_inline +m512 and512(m512 a, m512 b) { + m512 rv; + rv.lo = and256(a.lo, b.lo); + rv.hi = and256(a.hi, b.hi); + return rv; +} + +static really_inline +m512 or512(m512 a, m512 b) { + m512 rv; + rv.lo = or256(a.lo, b.lo); + rv.hi = or256(a.hi, b.hi); + return rv; +} + +static really_inline +m512 xor512(m512 a, m512 b) { + m512 rv; + rv.lo = 
xor256(a.lo, b.lo); + rv.hi = xor256(a.hi, b.hi); + return rv; +} + +static really_inline +m512 not512(m512 a) { + m512 rv; + rv.lo = not256(a.lo); + rv.hi = not256(a.hi); + return rv; +} + +static really_inline +m512 andnot512(m512 a, m512 b) { + m512 rv; + rv.lo = andnot256(a.lo, b.lo); + rv.hi = andnot256(a.hi, b.hi); + return rv; +} + +static really_really_inline +m512 lshift64_m512(m512 a, unsigned b) { + m512 rv; + rv.lo = lshift64_m256(a.lo, b); + rv.hi = lshift64_m256(a.hi, b); + return rv; +} + +static really_inline +int diff512(m512 a, m512 b) { + return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); +} + +static really_inline +int isnonzero512(m512 a) { + m256 x = or256(a.lo, a.lo); + m256 y = or256(a.hi, a.hi); + return isnonzero256(or256(x, y)); +} + +/** + * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline +u32 diffrich512(m512 a, m512 b) { + return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8); +} + +/** + * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and + * returns a 16-bit mask indicating which 64-bit words contain differences. + */ +static really_inline +u32 diffrich64_512(m512 a, m512 b) { + //TODO: cmp_epi64? + u32 d = diffrich512(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline +m512 load512(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m256))); + m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; + return rv; +} + +// aligned store +static really_inline +void store512(void *ptr, m512 a) { + assert(ISALIGNED_N(ptr, alignof(m512))); + m512 *x = (m512 *)ptr; + store256(&x->lo, a.lo); + store256(&x->hi, a.hi); +} + +// unaligned load +static really_inline +m512 loadu512(const void *ptr) { + m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) }; + return rv; +} + +/*static really_inline +m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { +} + +static really_inline +m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) { +} + +static really_inline +m512 set_mask_m512(__mmask64 k) { +}*/ + +// packed unaligned store of first N bytes +static really_inline +void storebytes512(void *ptr, m512 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m512 loadbytes512(const void *ptr, unsigned int n) { + m512 a = zeroes512(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline +m512 mask1bit512(unsigned int n) { + assert(n < sizeof(m512) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu512(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. +static really_inline +void setbit512(m512 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m256 *sub; + if (n < 256) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 256; + } + setbit256(sub, n); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit512(m512 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m256 *sub; + if (n < 256) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 256; + } + clearbit256(sub, n); +} + +// tests bit N in the given vector. 
+static really_inline +char testbit512(m512 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m256 sub; + if (n < 256) { + sub = val.lo; + } else { + sub = val.hi; + n -= 256; + } + return testbit256(sub, n); +} + +#endif // HAVE_SIMD_512_BITS + +#endif // ARCH_COMMON_SIMD_UTILS_H diff --git a/src/util/arch/ppc64el/bitutils.h b/src/util/arch/ppc64el/bitutils.h new file mode 100644 index 000000000..1741b09db --- /dev/null +++ b/src/util/arch/ppc64el/bitutils.h @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** \file + * \brief Bit-twiddling primitives (ctz, compress etc) + */ + +#ifndef BITUTILS_ARCH_PPC64EL_H +#define BITUTILS_ARCH_PPC64EL_H + +#include "ue2common.h" +#include "util/popcount.h" +#include "util/arch.h" +#include "util/intrinsics.h" + +#include "util/arch/common/bitutils.h" + +static really_inline +u32 clz32_impl(u32 x) { + return clz32_impl_c(x); +} + +static really_inline +u32 clz64_impl(u64a x) { + return clz64_impl_c(x); +} + +static really_inline +u32 ctz32_impl(u32 x) { + return ctz32_impl_c(x); +} + +static really_inline +u32 ctz64_impl(u64a x) { + return ctz64_impl_c(x); +} + +static really_inline +u32 lg2_impl(u32 x) { + return lg2_impl_c(x); +} + +static really_inline +u64a lg2_64_impl(u64a x) { + return lg2_64_impl_c(x); +} + +static really_inline +u32 findAndClearLSB_32_impl(u32 *v) { + return findAndClearLSB_32_impl_c(v); +} + +static really_inline +u32 findAndClearLSB_64_impl(u64a *v) { + return findAndClearLSB_64_impl_c(v); +} + +static really_inline +u32 findAndClearMSB_32_impl(u32 *v) { + u32 val = *v; + u32 offset = 31 - clz32_impl(val); + *v = val & ~(1 << offset); + assert(offset < 32); + return offset; +} + +static really_inline +u32 findAndClearMSB_64_impl(u64a *v) { + return findAndClearMSB_64_impl_c(v); +} + +static really_inline +u32 compress32_impl(u32 x, u32 m) { + return compress32_impl_c(x, m); +} + +static really_inline +u64a compress64_impl(u64a x, u64a m) { + return compress64_impl_c(x, m); +} + +static really_inline +m128 compress128_impl(m128 x, m128 m) { + m128 one = set1_2x64(1); + m128 bitset = one; + m128 vres = zeroes128(); + while (isnonzero128(m)) { + m128 mm = sub_2x64(zeroes128(), m); + m128 tv = and128(x, m); + tv = and128(tv, mm); + + m128 mask = not128(eq64_m128(tv, zeroes128())); + mask = and128(bitset, mask); + vres = or128(vres, mask); + m = and128(m, sub_2x64(m, one)); + bitset = lshift64_m128(bitset, 1); + } + return vres; +} + +static really_inline +u32 expand32_impl(u32 x, u32 m) { + return expand32_impl_c(x, m); +} + +static really_inline +u64a expand64_impl(u64a x, u64a m) { + return expand64_impl_c(x, m); +} + +static really_inline +m128 expand128_impl(m128 x, m128 m) { + m128 one = set1_2x64(1); + m128 bb = one; + m128 res = zeroes128(); + while (isnonzero128(m)) { + m128 xm = and128(x, bb); + m128 mm = sub_2x64(zeroes128(), m); + m128 mask = not128(eq64_m128(xm, zeroes128())); + mask = and128(mask, and128(m,mm)); + res = or128(res, mask); + m = and128(m, sub_2x64(m, one)); + bb = lshift64_m128(bb, 1); + } + return res; +} + +/* returns the first set bit after begin (if not ~0U). If no bit is set after + * begin returns ~0U + */ +static really_inline +u32 bf64_iterate_impl(u64a bitfield, u32 begin) { + if (begin != ~0U) { + /* switch off all bits at or below begin. Note: not legal to shift by + * the size of the datatype or larger. 
*/ + assert(begin <= 63); + bitfield &= ~((2ULL << begin) - 1); + } + + if (!bitfield) { + return ~0U; + } + + return ctz64_impl(bitfield); +} + +static really_inline +char bf64_set_impl(u64a *bitfield, u32 i) { + return bf64_set_impl_c(bitfield, i); +} + +static really_inline +void bf64_unset_impl(u64a *bitfield, u32 i) { + return bf64_unset_impl_c(bitfield, i); +} + +static really_inline +u32 rank_in_mask32_impl(u32 mask, u32 bit) { + return rank_in_mask32_impl_c(mask, bit); +} + +static really_inline +u32 rank_in_mask64_impl(u64a mask, u32 bit) { + return rank_in_mask64_impl_c(mask, bit); +} + +static really_inline +u32 pext32_impl(u32 x, u32 mask) { + return pext32_impl_c(x, mask); +} + +static really_inline +u64a pext64_impl(u64a x, u64a mask) { + return pext64_impl_c(x, mask); +} + +static really_inline +u64a pdep64_impl(u64a x, u64a mask) { + return pdep64_impl_c(x, mask); +} + +/* compilers don't reliably synthesize the 32-bit ANDN instruction here, + * so we force its generation. + */ +static really_inline +u64a andn_impl(const u32 a, const u8 *b) { + return andn_impl_c(a, b); +} + +#endif // BITUTILS_ARCH_PPC64EL_H diff --git a/src/util/arch/ppc64el/cpuid_flags.c b/src/util/arch/ppc64el/cpuid_flags.c new file mode 100644 index 000000000..a2f3758c4 --- /dev/null +++ b/src/util/arch/ppc64el/cpuid_flags.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2023, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "util/arch/common/cpuid_flags.h" +#include "ue2common.h" +#include "hs_compile.h" // for HS_MODE_ flags +#include "util/arch.h" + +u64a cpuid_flags(void) { + return 0; +} + +u32 cpuid_tune(void) { + return HS_TUNE_FAMILY_GENERIC; +} diff --git a/src/util/arch/ppc64el/match.hpp b/src/util/arch/ppc64el/match.hpp new file mode 100644 index 000000000..700751abc --- /dev/null +++ b/src/util/arch/ppc64el/match.hpp @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +template <> +really_really_inline +const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { + SuperVector<16>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("buf %p z %08x \n", buf, z); + if (unlikely(z)) { + u32 pos = ctz32(z); + DEBUG_PRINTF("~z %08x\n", ~z); + DEBUG_PRINTF("match @ pos %u\n", pos); + assert(pos < 16); + return buf + pos; + } else { + return NULL; // no match + } +} + +template <> +really_really_inline +const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { + SuperVector<16>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("buf %p z %08x \n", buf, z); + if (unlikely(z)) { + u32 pos = clz32(z); + DEBUG_PRINTF("match @ pos %u\n", pos); + assert(pos >= 16 && pos < 32); + return buf + (31 - pos); + } else { + return NULL; // no match + } +} + +template <> +really_really_inline +const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { + SuperVector<16>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("buf %p z %08x \n", buf, z); + if (unlikely(z != 0xffff)) { + u32 pos = ctz32(~z & 0xffff); + DEBUG_PRINTF("~z %08x\n", ~z); + DEBUG_PRINTF("match @ pos %u\n", pos); + assert(pos < 16); + return buf + pos; + } else { + return NULL; // no match + } +} + + +template <> +really_really_inline +const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) { + SuperVector<16>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("buf %p z %08x \n", buf, z); + if (unlikely(z != 0xffff)) { + u32 pos = clz32(~z & 0xffff); + DEBUG_PRINTF("~z %08x\n", ~z); + DEBUG_PRINTF("match @ pos %u\n", pos); + assert(pos >= 16 && pos < 32); + return buf + (31 - pos); + } else { + return NULL; // no match + } +} + + diff --git a/src/util/arch/ppc64el/ppc64el.h b/src/util/arch/ppc64el/ppc64el.h new file mode 100644 index 000000000..dbb382973 --- /dev/null +++ b/src/util/arch/ppc64el/ppc64el.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2017-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Per-platform architecture definitions + */ + +#ifndef UTIL_ARCH_PPC64EL_H_ +#define UTIL_ARCH_PPC64EL_H_ + +#if defined(__VSX__) && defined(ARCH_PPC64EL) +#define HAVE_VSX +#define HAVE_SIMD_128_BITS +#define VECTORSIZE 16 +#endif + +#endif // UTIL_ARCH_PPC64EL_H_ + diff --git a/src/util/arch/ppc64el/simd_types.h b/src/util/arch/ppc64el/simd_types.h new file mode 100644 index 000000000..8a5b0e252 --- /dev/null +++ b/src/util/arch/ppc64el/simd_types.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ARCH_PPC64EL_SIMD_TYPES_H +#define ARCH_PPC64EL_SIMD_TYPES_H + +#if !defined(m128) && defined(HAVE_VSX) +typedef __vector int m128; +#endif + +#endif /* ARCH_PPC64EL_SIMD_TYPES_H */ + diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h new file mode 100644 index 000000000..15446e871 --- /dev/null +++ b/src/util/arch/ppc64el/simd_utils.h @@ -0,0 +1,432 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SIMD types and primitive operations. + */ + +#ifndef ARCH_PPC64EL_SIMD_UTILS_H +#define ARCH_PPC64EL_SIMD_UTILS_H + +#include + +#include "ue2common.h" +#include "util/simd_types.h" +#include "util/unaligned.h" +#include "util/intrinsics.h" + +#include <string.h> // for memcpy + +#if defined(__clang__) && (__clang_major__ == 15) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wdeprecate-lax-vec-conv-all" +#endif // defined(__clang__) && (__clang_major__ == 15) + +typedef __vector unsigned long long int uint64x2_t; +typedef __vector signed long long int int64x2_t; +typedef __vector unsigned int uint32x4_t; +typedef __vector signed int int32x4_t; +typedef __vector unsigned short int uint16x8_t; +typedef __vector signed short int int16x8_t; +typedef __vector unsigned char uint8x16_t; +typedef __vector signed char int8x16_t; + +typedef unsigned long long int ulong64_t; +typedef signed long long int long64_t; + +static really_inline m128 ones128(void) { + return (m128) vec_splat_u8(-1); +} + +static really_inline m128 zeroes128(void) { + return (m128) vec_splat_s32(0); +} + +/** \brief Bitwise not for m128*/ +static really_inline m128 not128(m128 a) { + //return (m128)vec_xor(a, a); + return (m128) vec_xor(a,ones128()); +} + +/** \brief Return 1 if a and b are different otherwise 0 */ +static really_inline int diff128(m128 a, m128 b) { + return vec_any_ne(a, b); +} + +static really_inline int isnonzero128(m128 a) { + return !!diff128(a, zeroes128()); +} + +/** + * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich128(m128 a, m128 b) { + static const m128 movemask = { 1, 2, 4, 8 }; + m128 mask = (m128) vec_cmpeq(a, b); // _mm_cmpeq_epi32 (a, b); + mask = vec_and(not128(mask), movemask); + m128 sum = vec_sums(mask, zeroes128()); + return sum[3]; +} + +/** + * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and + * returns a 4-bit mask indicating which 64-bit words contain differences. 
+ */ +static really_inline u32 diffrich64_128(m128 a, m128 b) { + static const uint64x2_t movemask = { 1, 4 }; + uint64x2_t mask = (uint64x2_t) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b); + mask = (uint64x2_t) vec_and((uint64x2_t)not128((m128)mask), movemask); + m128 sum = vec_sums((m128)mask, zeroes128()); + return sum[3]; +} + +static really_really_inline +m128 add_2x64(m128 a, m128 b) { + return (m128) vec_add((uint64x2_t)a, (uint64x2_t)b); +} + +static really_really_inline +m128 sub_2x64(m128 a, m128 b) { + return (m128) vec_sub((uint64x2_t)a, (uint64x2_t)b); +} + +static really_really_inline +m128 lshift_m128(m128 a, unsigned b) { + if (b == 0) return a; + m128 sl = (m128) vec_splats((uint8_t) b << 3); + m128 result = (m128) vec_slo((uint8x16_t) a, (uint8x16_t) sl); + return result; +} + +static really_really_inline +m128 rshift_m128(m128 a, unsigned b) { + if (b == 0) return a; + m128 sl = (m128) vec_splats((uint8_t) b << 3); + uint8x16_t result = vec_sro((uint8x16_t) a, (uint8x16_t) sl); + return (m128) result; +} + +static really_really_inline +m128 lshift64_m128(m128 a, unsigned b) { + uint64x2_t shift_indices = vec_splats((ulong64_t)b); + return (m128) vec_sl((int64x2_t)a, shift_indices); +} + +static really_really_inline +m128 rshift64_m128(m128 a, unsigned b) { + uint64x2_t shift_indices = vec_splats((ulong64_t)b); + return (m128) vec_sr((int64x2_t)a, shift_indices); +} + +static really_inline m128 eq128(m128 a, m128 b) { + return (m128) vec_cmpeq((uint8x16_t)a, (uint8x16_t)b); +} + +static really_inline m128 eq64_m128(m128 a, m128 b) { + return (m128) vec_cmpeq((uint64x2_t)a, (uint64x2_t)b); +} + +static really_inline u32 movemask128(m128 a) { + static uint8x16_t perm = { 16, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + uint8x16_t bitmask = vec_gb((uint8x16_t) a); + bitmask = (uint8x16_t) vec_perm(vec_splat_u8(0), bitmask, perm); + u32 ALIGN_ATTR(16) movemask; + vec_ste((uint32x4_t) bitmask, 0, &movemask); + return movemask; +} + +static really_inline m128 set1_16x8(u8 c) { + return (m128) vec_splats(c); +} + +static really_inline m128 set1_4x32(u32 c) { + return (m128) vec_splats(c); +} + +static really_inline m128 set1_2x64(u64a c) { + return (m128) vec_splats(c); +} + +static really_inline u32 movd(const m128 in) { + return (u32) vec_extract((uint32x4_t)in, 0); +} + +static really_inline u64a movq(const m128 in) { + u64a ALIGN_ATTR(16) a[2]; + vec_xst((uint64x2_t) in, 0, a); + return a[0]; +} + +/* another form of movq */ +static really_inline +m128 load_m128_from_u64a(const u64a *p) { + m128 vec =(m128) vec_splats(*p); + return rshift_m128(vec,8); +} + + +static really_inline u32 extract32from128(const m128 in, unsigned imm) { +u32 ALIGN_ATTR(16) a[4]; +vec_xst((uint32x4_t) in, 0, a); +switch (imm) { + case 0: + return a[0];break; + case 1: + return a[1];break; + case 2: + return a[2];break; + case 3: + return a[3];break; + default: + return 0;break; + } +} + +static really_inline u64a extract64from128(const m128 in, unsigned imm) { +u64a ALIGN_ATTR(16) a[2]; +vec_xst((uint64x2_t) in, 0, a); +switch (imm) { + case 0: + return a[0];break; + case 1: + return a[1];break; + default: + return 0; + break; + } +} + +static really_inline m128 low64from128(const m128 in) { + return rshift_m128(in,8); +} + +static really_inline m128 high64from128(const m128 in) { + return lshift_m128(in,8); +} + + +static really_inline m128 add128(m128 a, m128 b) { + return (m128) vec_add((uint64x2_t)a, (uint64x2_t)b); +} + +static really_inline m128 and128(m128 a, m128 b) { + return (m128) 
vec_and((int8x16_t)a, (int8x16_t)b); +} + +static really_inline m128 xor128(m128 a, m128 b) { + return (m128) vec_xor((int8x16_t)a, (int8x16_t)b); +} + +static really_inline m128 or128(m128 a, m128 b) { + return (m128) vec_or((int8x16_t)a, (int8x16_t)b); +} + +static really_inline m128 andnot128(m128 a, m128 b) { + return (m128) and128(not128(a),b); +} + +// aligned load +static really_inline m128 load128(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m128))); + return (m128) vec_xl(0, (const int32_t*)ptr); +} + +// aligned store +static really_inline void store128(void *ptr, m128 a) { + assert(ISALIGNED_N(ptr, alignof(m128))); + vec_st(a, 0, (int32_t*)ptr); +} + +// unaligned load +static really_inline m128 loadu128(const void *ptr) { + return (m128) vec_xl(0, (const int32_t*)ptr); +} + +// unaligned store +static really_inline void storeu128(void *ptr, m128 a) { + vec_xst(a, 0, (int32_t*)ptr); +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes128(void *ptr, m128 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m128 loadbytes128(const void *ptr, unsigned int n) { + m128 a = zeroes128(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vec_sld((int8x16_t)(a), (int8x16_t)(b), (16 - offset)); break; + +static really_really_inline +m128 palignr_imm(m128 r, m128 l, int offset) { + switch (offset) { + case 0: return l; break; + CASE_ALIGN_VECTORS(r, l, 1); + CASE_ALIGN_VECTORS(r, l, 2); + CASE_ALIGN_VECTORS(r, l, 3); + CASE_ALIGN_VECTORS(r, l, 4); + CASE_ALIGN_VECTORS(r, l, 5); + CASE_ALIGN_VECTORS(r, l, 6); + CASE_ALIGN_VECTORS(r, l, 7); + CASE_ALIGN_VECTORS(r, l, 8); + CASE_ALIGN_VECTORS(r, l, 9); + CASE_ALIGN_VECTORS(r, l, 10); + CASE_ALIGN_VECTORS(r, l, 11); + CASE_ALIGN_VECTORS(r, l, 12); + CASE_ALIGN_VECTORS(r, l, 13); + CASE_ALIGN_VECTORS(r, l, 14); + CASE_ALIGN_VECTORS(r, l, 15); + case 16: return r; break; + default: return zeroes128(); break; + } +} + +static really_really_inline +m128 palignr(m128 r, m128 l, int offset) { + if (offset == 0) return l; + if (offset == 16) return r; +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(offset)) { + return (m128)vec_sld((int8x16_t)(r), (int8x16_t)(l), 16 - offset); + } +#endif + m128 sl = (m128) vec_splats((uint8_t) (offset << 3)); + m128 sr = (m128) vec_splats((uint8_t) ((16 - offset) << 3)); + m128 rhs = (m128) vec_slo((uint8x16_t) r, (uint8x16_t) sr); + m128 lhs = (m128) vec_sro((uint8x16_t) l, (uint8x16_t) sl); + return or128(lhs, rhs); +} + +#undef CASE_ALIGN_VECTORS + +static really_really_inline +m128 rshiftbyte_m128(m128 a, unsigned b) { + return palignr_imm(zeroes128(), a, b); +} + +static really_really_inline +m128 lshiftbyte_m128(m128 a, unsigned b) { + return palignr_imm(a, zeroes128(), 16 - b); +} + +static really_inline +m128 variable_byte_shift_m128(m128 in, s32 amount) { + assert(amount >= -16 && amount <= 16); + if (amount < 0) { + return rshiftbyte_m128(in, -amount); + } else { + return lshiftbyte_m128(in, amount); + } +} + +static really_inline +m128 mask1bit128(unsigned int n) { + assert(n < sizeof(m128) * 8); + static uint64x2_t onebit = { 1, 0 }; + m128 octets = (m128) vec_splats((uint8_t) ((n / 8) << 3)); + m128 bits = (m128) vec_splats((uint8_t) ((n % 8))); + m128 mask = (m128) vec_slo((uint8x16_t) onebit, (uint8x16_t) octets); + return (m128) vec_sll((uint8x16_t) mask, 
(uint8x16_t) bits); +} + +// switches on bit N in the given vector. +static really_inline +void setbit128(m128 *ptr, unsigned int n) { + *ptr = or128(mask1bit128(n), *ptr); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit128(m128 *ptr, unsigned int n) { + *ptr = andnot128(mask1bit128(n), *ptr); +} + +// tests bit N in the given vector. +static really_inline +char testbit128(m128 val, unsigned int n) { + const m128 mask = mask1bit128(n); + return isnonzero128(and128(mask, val)); +} + +static really_inline +m128 pshufb_m128(m128 a, m128 b) { + /* On Intel, if bit 0x80 is set, then the result is zero, otherwise the selected lane is the index & 0xf. + On NEON or PPC, if the index is >= 16, then the result is zero, otherwise it is that lane. + Below is the version that is converted from Intel to PPC. */ + uint8x16_t mask =(uint8x16_t)vec_cmpge((uint8x16_t)b, (uint8x16_t)vec_splats((uint8_t)0x80)); + uint8x16_t res = vec_perm ((uint8x16_t)a, (uint8x16_t)a, (uint8x16_t)b); + return (m128) vec_sel((uint8x16_t)res, (uint8x16_t)zeroes128(), (uint8x16_t)mask); +} + +static really_inline +m128 max_u8_m128(m128 a, m128 b) { + return (m128) vec_max((uint8x16_t)a, (uint8x16_t)b); +} + +static really_inline +m128 min_u8_m128(m128 a, m128 b) { + return (m128) vec_min((uint8x16_t)a, (uint8x16_t)b); +} + +static really_inline +m128 sadd_u8_m128(m128 a, m128 b) { + return (m128) vec_adds((uint8x16_t)a, (uint8x16_t)b); +} + +static really_inline +m128 sub_u8_m128(m128 a, m128 b) { + return (m128) vec_sub((uint8x16_t)a, (uint8x16_t)b); +} + +static really_inline +m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) { + uint32x4_t v = { x0, x1, x2, x3 }; + return (m128) v; +} + +static really_inline +m128 set2x64(u64a hi, u64a lo) { + uint64x2_t v = { lo, hi }; + return (m128) v; +} + +#if defined(__clang__) && (__clang_major__ == 15) +#pragma clang diagnostic pop +#endif // defined(__clang__) && (__clang_major__ == 15) + +#endif // ARCH_PPC64EL_SIMD_UTILS_H diff --git a/src/util/arch/simde/cpuid_flags.c b/src/util/arch/simde/cpuid_flags.c new file mode 100644 index 000000000..a2f3758c4 --- /dev/null +++ b/src/util/arch/simde/cpuid_flags.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2023, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "util/arch/common/cpuid_flags.h" +#include "ue2common.h" +#include "hs_compile.h" // for HS_MODE_ flags +#include "util/arch.h" + +u64a cpuid_flags(void) { + return 0; +} + +u32 cpuid_tune(void) { + return HS_TUNE_FAMILY_GENERIC; +} diff --git a/src/util/arch/x86/bitutils.h b/src/util/arch/x86/bitutils.h new file mode 100644 index 000000000..485b65122 --- /dev/null +++ b/src/util/arch/x86/bitutils.h @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Bit-twiddling primitives (ctz, compress etc) + */ + +#ifndef BITUTILS_ARCH_X86_H +#define BITUTILS_ARCH_X86_H + +#include "ue2common.h" +#include "util/popcount.h" +#include "util/arch.h" +#include "util/intrinsics.h" + +#include "util/arch/common/bitutils.h" + +static really_inline +u32 clz32_impl(u32 x) { + return clz32_impl_c(x); +} + +static really_inline +u32 clz64_impl(u64a x) { + return clz64_impl_c(x); +} + +// CTZ (count trailing zero) implementations. 
+static really_inline +u32 ctz32_impl(u32 x) { + return ctz32_impl_c(x); +} + +static really_inline +u32 ctz64_impl(u64a x) { + return ctz64_impl_c(x); +} + +static really_inline +u32 lg2_impl(u32 x) { + return lg2_impl_c(x); +} + +static really_inline +u64a lg2_64_impl(u64a x) { + return lg2_64_impl_c(x); +} + +static really_inline +u32 findAndClearLSB_32_impl(u32 *v) { +#ifndef NO_ASM + u32 val = *v, offset; + __asm__ ("bsf %1, %0\n" + "btr %0, %1\n" + : "=r" (offset), "=r" (val) + : "1" (val)); + *v = val; + + assert(offset < 32); + return offset; +#else + return findAndClearLSB_32_impl_c(v); +#endif + +} + +static really_inline +u32 findAndClearLSB_64_impl(u64a *v) { +#ifdef ARCH_64_BIT +#if !defined(NO_ASM) + u64a val = *v, offset; + __asm__ ("bsfq %1, %0\n" + "btrq %0, %1\n" + : "=r" (offset), "=r" (val) + : "1" (val)); + *v = val; +#else + // generic variant using gcc's builtin on 64-bit + u64a val = *v, offset; + offset = ctz64(val); + *v = val & (val - 1); +#endif // ARCH_X86_64 + assert(offset < 64); + return (u32)offset; +#else + return findAndClearLSB_64_impl_c(v); +#endif +} + +static really_inline +u32 findAndClearMSB_32_impl(u32 *v) { +#if !defined(NO_ASM) + u32 val = *v, offset; + __asm__ ("bsr %1, %0\n" + "btr %0, %1\n" + : "=r" (offset), "=r" (val) + : "1" (val)); + *v = val; +#else + u32 val = *v; + u32 offset = 31 - clz32_impl(val); + *v = val & ~(1 << offset); +#endif + assert(offset < 32); + return offset; +} + +static really_inline +u32 findAndClearMSB_64_impl(u64a *v) { +#ifdef ARCH_64_BIT +#if !defined(NO_ASM) + u64a val = *v, offset; + __asm__ ("bsrq %1, %0\n" + "btrq %0, %1\n" + : "=r" (offset), "=r" (val) + : "1" (val)); + *v = val; +#else + // generic variant using gcc's builtin on 64-bit + u64a val = *v, offset; + offset = 63 - clz64_impl(val); + *v = val & ~(1ULL << offset); +#endif // ARCH_X86_64 + assert(offset < 64); + return (u32)offset; +#else + return findAndClearMSB_64_impl_c(v); +#endif +} + +static really_inline +u32 compress32_impl(u32 x, u32 m) { +#if defined(HAVE_BMI2) + // BMI2 has a single instruction for this operation. + return _pext_u32(x, m); +#else + return compress32_impl_c(x, m); +#endif +} + +static really_inline +u64a compress64_impl(u64a x, u64a m) { +#if defined(ARCH_X86_64) && defined(HAVE_BMI2) + // BMI2 has a single instruction for this operation. + return _pext_u64(x, m); +#else + return compress64_impl_c(x, m); +#endif +} + +static really_inline +m128 compress128_impl(m128 xvec, m128 mvec) { + u64a ALIGN_ATTR(16) x[2]; + u64a ALIGN_ATTR(16) m[2]; + store128(x, xvec); + store128(m, mvec); + + x[0] = compress64_impl(x[0], m[0]); + x[1] = compress64_impl(x[1], m[1]); + + return load128(x); +} + +static really_inline +u32 expand32_impl(u32 x, u32 m) { +#if defined(HAVE_BMI2) + // BMI2 has a single instruction for this operation. + return _pdep_u32(x, m); +#else + return expand32_impl_c(x, m); +#endif +} + +static really_inline +u64a expand64_impl(u64a x, u64a m) { +#if defined(ARCH_X86_64) && defined(HAVE_BMI2) + // BMI2 has a single instruction for this operation. + DEBUG_PRINTF("pdep_u64\n"); + return _pdep_u64(x, m); +#else + return expand64_impl_c(x, m); +#endif +} + +static really_inline +m128 expand128_impl(m128 xvec, m128 mvec) { + u64a ALIGN_ATTR(16) x[2]; + u64a ALIGN_ATTR(16) m[2]; + store128(x, xvec); + store128(m, mvec); + DEBUG_PRINTF("calling expand64_impl:\n"); + x[0] = expand64_impl(x[0], m[0]); + x[1] = expand64_impl(x[1], m[1]); + + return load128(x); +} + +/* returns the first set bit after begin (if not ~0U). 
If no bit is set after + * begin returns ~0U + */ +static really_inline +u32 bf64_iterate_impl(u64a bitfield, u32 begin) { + if (begin != ~0U) { + /* switch off all bits at or below begin. Note: not legal to shift by + * the size of the datatype or larger. */ + assert(begin <= 63); + bitfield &= ~((2ULL << begin) - 1); + } + + if (!bitfield) { + return ~0U; + } + + return ctz64_impl(bitfield); +} + +static really_inline +char bf64_set_impl(u64a *bitfield, u32 i) { + return bf64_set_impl_c(bitfield, i); +} + +static really_inline +void bf64_unset_impl(u64a *bitfield, u32 i) { + return bf64_unset_impl_c(bitfield, i); +} + +static really_inline +u32 rank_in_mask32_impl(u32 mask, u32 bit) { + return rank_in_mask32_impl_c(mask, bit); +} + +static really_inline +u32 rank_in_mask64_impl(u64a mask, u32 bit) { + return rank_in_mask64_impl_c(mask, bit); +} + +static really_inline +u32 pext32_impl(u32 x, u32 mask) { +#if defined(HAVE_BMI2) + // Intel BMI2 can do this operation in one instruction. + return _pext_u32(x, mask); +#else + return pext32_impl_c(x, mask); +#endif +} + +static really_inline +u64a pext64_impl(u64a x, u64a mask) { +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) + // Intel BMI2 can do this operation in one instruction. + return _pext_u64(x, mask); +#else + return pext64_impl_c(x, mask); +#endif +} + +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) +static really_inline +u64a pdep64_impl(u64a x, u64a mask) { + return _pdep_u64(x, mask); +} +#else +static really_inline +u64a pdep64_impl(u64a x, u64a mask) { + return pdep64_impl_c(x, mask); +} +#endif + +/* compilers don't reliably synthesize the 32-bit ANDN instruction here, + * so we force its generation. + */ +static really_inline +u64a andn_impl(const u32 a, const u8 *b) { +#if defined(HAVE_BMI) && !defined(NO_ASM) + u64a r; + __asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b)); + return r; +#else + return andn_impl_c(a, b); +#endif +} + +#endif // BITUTILS_ARCH_X86_H diff --git a/src/util/cpuid_flags.c b/src/util/arch/x86/cpuid_flags.c similarity index 87% rename from src/util/cpuid_flags.c rename to src/util/arch/x86/cpuid_flags.c index 0b529c0bf..92c297b82 100644 --- a/src/util/cpuid_flags.c +++ b/src/util/arch/x86/cpuid_flags.c @@ -1,5 +1,6 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,14 +27,14 @@ * POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "cpuid_flags.h" +#include "util/arch/common/cpuid_flags.h" #include "cpuid_inline.h" #include "ue2common.h" #include "hs_compile.h" // for HS_MODE_ flags #include "hs_internal.h" #include "util/arch.h" -#if !defined(_WIN32) && !defined(CPUID_H_) +#if !defined(CPUID_H_) #include <cpuid.h> #endif @@ -50,6 +51,11 @@ u64a cpuid_flags(void) { cap |= HS_CPU_FEATURES_AVX512; } + if (check_avx512vbmi()) { + DEBUG_PRINTF("AVX512VBMI enabled\n"); + cap |= HS_CPU_FEATURES_AVX512VBMI; + } + #if !defined(FAT_RUNTIME) && !defined(HAVE_AVX2) cap &= ~HS_CPU_FEATURES_AVX2; #endif @@ -59,6 +65,11 @@ u64a cpuid_flags(void) { cap &= ~HS_CPU_FEATURES_AVX512; #endif +#if (!defined(FAT_RUNTIME) && !defined(HAVE_AVX512VBMI)) || \ + (defined(FAT_RUNTIME) && !defined(BUILD_AVX512VBMI)) + cap &= ~HS_CPU_FEATURES_AVX512VBMI; +#endif + return cap; } @@ -105,6 +116,11 @@ static const struct family_id known_microarch[] = { { 0x6, 0x8E, HS_TUNE_FAMILY_SKL }, /* Kabylake Mobile */ { 0x6, 0x9E, HS_TUNE_FAMILY_SKL }, /* Kabylake desktop */ + { 0x6, 0x7D, HS_TUNE_FAMILY_ICL }, /* Icelake */ + { 0x6, 0x7E, HS_TUNE_FAMILY_ICL }, /* Icelake */ + { 0x6, 0x6A, HS_TUNE_FAMILY_ICX }, /* Icelake Xeon-D */ + { 0x6, 0x6C, HS_TUNE_FAMILY_ICX }, /* Icelake Xeon */ + }; #ifdef DUMP_SUPPORT @@ -120,6 +136,8 @@ const char *dumpTune(u32 tune) { T_CASE(HS_TUNE_FAMILY_BDW); T_CASE(HS_TUNE_FAMILY_SKL); T_CASE(HS_TUNE_FAMILY_SKX); + T_CASE(HS_TUNE_FAMILY_ICL); + T_CASE(HS_TUNE_FAMILY_ICX); } #undef T_CASE return "unknown"; diff --git a/src/util/cpuid_inline.h b/src/util/arch/x86/cpuid_inline.h similarity index 82% rename from src/util/cpuid_inline.h rename to src/util/arch/x86/cpuid_inline.h index b6768cc26..d5ff210cb 100644 --- a/src/util/cpuid_inline.h +++ b/src/util/arch/x86/cpuid_inline.h @@ -1,5 +1,6 @@ /* - * Copyright (c) 2017, Intel Corporation + * Copyright (c) 2017-2020, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,9 +31,9 @@ #define CPUID_INLINE_H_ #include "ue2common.h" -#include "cpuid_flags.h" +#include "util/arch/common/cpuid_flags.h" -#if !defined(_WIN32) && !defined(CPUID_H_) +#if !defined(CPUID_H_) #include <cpuid.h> /* system header doesn't have a header guard */ #define CPUID_H_ @@ -46,16 +47,7 @@ extern "C" static inline void cpuid(unsigned int op, unsigned int leaf, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { -#ifndef _WIN32 __cpuid_count(op, leaf, *eax, *ebx, *ecx, *edx); -#else - int a[4]; - __cpuidex(a, op, leaf); - *eax = a[0]; - *ebx = a[1]; - *ecx = a[2]; - *edx = a[3]; -#endif } // ECX @@ -74,11 +66,12 @@ void cpuid(unsigned int op, unsigned int leaf, unsigned int *eax, #define CPUID_HTT (1 << 28) // Structured Extended Feature Flags Enumeration Leaf ECX values +#define CPUID_AVX512VBMI (1 << 1) + +// Structured Extended Feature Flags Enumeration Leaf EBX values #define CPUID_BMI (1 << 3) #define CPUID_AVX2 (1 << 5) #define CPUID_BMI2 (1 << 8) - -// Structured Extended Feature Flags Enumeration Leaf EBX values #define CPUID_AVX512F (1 << 16) #define CPUID_AVX512BW (1 << 30) @@ -94,9 +87,6 @@ void cpuid(unsigned int op, unsigned int leaf, unsigned int *eax, static inline u64a xgetbv(u32 op) { -#if defined(_WIN32) || defined(__INTEL_COMPILER) - return _xgetbv(op); -#else u32 a, d; __asm__ volatile ( "xgetbv\n" @@ -104,14 +94,10 @@ u64a xgetbv(u32 op) { "=d"(d) : "c"(op)); return ((u64a)d << 32) + a; -#endif } static inline 
int check_avx2(void) { -#if defined(__INTEL_COMPILER) - return _may_i_use_cpu_feature(_FEATURE_AVX2); -#else unsigned int eax, ebx, ecx, edx; cpuid(1, 0, &eax, &ebx, &ecx, &edx); @@ -140,7 +126,6 @@ int check_avx2(void) { } return 0; -#endif } static inline @@ -148,9 +133,6 @@ int check_avx512(void) { /* * For our purposes, having avx512 really means "can we use AVX512BW?" */ -#if defined(__INTEL_COMPILER) - return _may_i_use_cpu_feature(_FEATURE_AVX512BW | _FEATURE_AVX512VL); -#else unsigned int eax, ebx, ecx, edx; cpuid(1, 0, &eax, &ebx, &ecx, &edx); @@ -183,7 +165,47 @@ int check_avx512(void) { } return 0; -#endif +} + +static inline +int check_avx512vbmi(void) { + unsigned int eax, ebx, ecx, edx; + + cpuid(1, 0, &eax, &ebx, &ecx, &edx); + + /* check XSAVE is enabled by OS */ + if (!(ecx & CPUID_XSAVE)) { + DEBUG_PRINTF("AVX and XSAVE not supported\n"); + return 0; + } + + /* check that AVX 512 registers are enabled by OS */ + u64a xcr0 = xgetbv(0); + if ((xcr0 & CPUID_XCR0_AVX512) != CPUID_XCR0_AVX512) { + DEBUG_PRINTF("AVX512 registers not enabled\n"); + return 0; + } + + /* ECX and EDX contain capability flags */ + ecx = 0; + cpuid(7, 0, &eax, &ebx, &ecx, &edx); + + if (!(ebx & CPUID_AVX512F)) { + DEBUG_PRINTF("AVX512F (AVX512 Foundation) instructions not enabled\n"); + return 0; + } + + if (!(ebx & CPUID_AVX512BW)) { + DEBUG_PRINTF("AVX512BW instructions not enabled\n"); + return 0; + } + + if (ecx & CPUID_AVX512VBMI) { + DEBUG_PRINTF("AVX512VBMI instructions enabled\n"); + return 1; + } + + return 0; } static inline diff --git a/src/util/arch/x86/crc32.h b/src/util/arch/x86/crc32.h new file mode 100644 index 000000000..61bdbf6ff --- /dev/null +++ b/src/util/arch/x86/crc32.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef UTIL_ARCH_X86_CRC32_H_ +#define UTIL_ARCH_X86_CRC32_H_ + +#include "util/arch/x86/x86.h" +#include "util/intrinsics.h" + +#ifdef ARCH_64_BIT +#define CRC_WORD 8 +#define CRC_TYPE u64a +#define CRC_FUNC _mm_crc32_u64 +#else +#define CRC_WORD 4 +#define CRC_TYPE u32 +#define CRC_FUNC _mm_crc32_u32 +#endif + +/* + * Use the crc32 instruction from SSE4.2 to compute our checksum - same + * polynomial as the above function. + */ +static really_inline +u32 crc32c_sse42(u32 running_crc, const unsigned char* p_buf, + const size_t length) { + u32 crc = running_crc; + + // Process byte-by-byte until p_buf is aligned + + const unsigned char *aligned_buf = ROUNDUP_PTR(p_buf, CRC_WORD); + size_t init_bytes = aligned_buf - p_buf; + size_t running_length = ((length - init_bytes)/CRC_WORD)*CRC_WORD; + size_t end_bytes = length - init_bytes - running_length; + + while (p_buf < aligned_buf) { + crc = _mm_crc32_u8(crc, *p_buf++); + } + + // Main aligned loop, processes a word at a time. + + for (size_t li = 0; li < running_length/CRC_WORD; li++) { + CRC_TYPE block = *(const CRC_TYPE *)p_buf; + crc = CRC_FUNC(crc, block); + p_buf += CRC_WORD; + } + + // Remaining bytes + + for(size_t li = 0; li < end_bytes; li++) { + crc = _mm_crc32_u8(crc, *p_buf++); + } + + return crc; +} + +#endif // UTIL_ARCH_X86_CRC32_H_ diff --git a/src/util/masked_move.c b/src/util/arch/x86/masked_move.c similarity index 99% rename from src/util/masked_move.c rename to src/util/arch/x86/masked_move.c index 001cd49f2..b6ddc51ed 100644 --- a/src/util/masked_move.c +++ b/src/util/arch/x86/masked_move.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/src/util/masked_move.h b/src/util/arch/x86/masked_move.h similarity index 96% rename from src/util/masked_move.h rename to src/util/arch/x86/masked_move.h index 4c877ca9e..4787ffa97 100644 --- a/src/util/masked_move.h +++ b/src/util/arch/x86/masked_move.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -29,12 +30,12 @@ #ifndef MASKED_MOVE_H #define MASKED_MOVE_H -#include "arch.h" +#include "x86.h" #if defined(HAVE_AVX2) -#include "unaligned.h" -#include "simd_utils.h" +#include "util/unaligned.h" +#include "util/simd_utils.h" #ifdef __cplusplus extern "C" { diff --git a/src/util/arch/x86/match.hpp b/src/util/arch/x86/match.hpp new file mode 100644 index 000000000..ccd2a5769 --- /dev/null +++ b/src/util/arch/x86/match.hpp @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +template <> +really_really_inline +const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { + assert(SuperVector<16>::mask_width() == 1); + SuperVector<16>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("buf %p z %08x\n", buf, z); + if (unlikely(z)) { + u32 pos = ctz32(z); + DEBUG_PRINTF("~z %08x\n", ~z); + DEBUG_PRINTF("match @ pos %u\n", pos); + assert(pos < 16); + return buf + pos; + } else { + return NULL; // no match + } +} + +template <> +really_really_inline +const u8 *first_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) { + assert(SuperVector<32>::mask_width() == 1); + SuperVector<32>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("z 0x%08llx\n", z); + if (unlikely(z)) { + u32 pos = ctz32(z); + assert(pos < 32); + DEBUG_PRINTF("match @ pos %u\n", pos); + return buf + pos; + } else { + return NULL; // no match + } +} +template <> +really_really_inline +const u8 *first_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len) { + assert(SuperVector<64>::mask_width() == 1); + SuperVector<64>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("z 0x%016llx\n", z); + u64a mask = (~0ULL) >> (64 - len); + DEBUG_PRINTF("mask %016llx\n", mask); + z &= mask; + DEBUG_PRINTF("z 0x%016llx\n", z); + if (unlikely(z)) { + u32 pos = ctz64(z); + DEBUG_PRINTF("match @ pos %u\n", pos); + assert(pos < 64); + return buf + pos; + } else { + return NULL; // no match + } +} + +template <> +really_really_inline +const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) { + assert(SuperVector<16>::mask_width() == 1); + SuperVector<16>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("buf %p z %08x \n", buf, z); + if (unlikely(z)) { + u32 pos = clz32(z); + DEBUG_PRINTF("match @ pos %u\n", pos); + assert(pos >= 16 && pos < 32); + return buf + (31 - pos); + } else { + return NULL; // no match + } +} + +template <> +really_really_inline +const u8 *last_non_zero_match<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) { + assert(SuperVector<32>::mask_width() == 1); + SuperVector<32>::comparemask_type z = v.comparemask(); + DEBUG_PRINTF("z 0x%08llx\n", z); + if (unlikely(z)) { + u32 pos = clz32(z); + assert(pos < 32); + DEBUG_PRINTF("match @ pos %u\n", pos); + return buf + (31 - pos); + } else { + return NULL; // no match + } +} +template <> +really_really_inline +const u8 *last_non_zero_match<64>(const u8 *buf, SuperVector<64>v, u16 const len) { + assert(SuperVector<64>::mask_width() == 1); 
+    SuperVector<64>::comparemask_type z = v.comparemask();
+    DEBUG_PRINTF("z 0x%016llx\n", z);
+    u64a mask = (~0ULL) >> (64 - len);
+    DEBUG_PRINTF("mask %016llx\n", mask);
+    z &= mask;
+    DEBUG_PRINTF("z 0x%016llx\n", z);
+    if (unlikely(z)) {
+        u32 pos = clz64(z);
+        DEBUG_PRINTF("match @ pos %u\n", pos);
+        assert(pos < 64);
+        return buf + (63 - pos);
+    } else {
+        return NULL; // no match
+    }
+}
+
+template <>
+really_really_inline
+const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
+    assert(SuperVector<16>::mask_width() == 1);
+    SuperVector<16>::comparemask_type z = v.comparemask();
+    DEBUG_PRINTF("buf %p z %08x \n", buf, z);
+    if (unlikely(z != 0xffff)) {
+        u32 pos = ctz32(~z & 0xffff);
+        DEBUG_PRINTF("~z %08x\n", ~z);
+        DEBUG_PRINTF("match @ pos %u\n", pos);
+        assert(pos < 16);
+        return buf + pos;
+    } else {
+        return NULL; // no match
+    }
+}
+
+template <>
+really_really_inline
+const u8 *first_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, u16 const UNUSED len) {
+    assert(SuperVector<32>::mask_width() == 1);
+    SuperVector<32>::comparemask_type z = v.comparemask();
+    DEBUG_PRINTF("z 0x%08llx\n", z);
+    if (unlikely(z != 0xffffffff)) {
+        u32 pos = ctz32(~z & 0xffffffffu);
+        assert(pos < 32);
+        DEBUG_PRINTF("match @ pos %u\n", pos);
+        return buf + pos;
+    } else {
+        return NULL; // no match
+    }
+}
+template <>
+really_really_inline
+const u8 *first_zero_match_inverted<64>(const u8 *buf, SuperVector<64>v, u16 const len) {
+    assert(SuperVector<64>::mask_width() == 1);
+    SuperVector<64>::comparemask_type z = v.comparemask();
+    DEBUG_PRINTF("z 0x%016llx\n", z);
+    u64a mask = (~0ULL) >> (64 - len);
+    DEBUG_PRINTF("mask %016llx\n", mask);
+    z = ~z & mask;
+    DEBUG_PRINTF("z 0x%016llx\n", (u64a) z);
+    if (unlikely(z)) {
+        u32 pos = ctz64(z);
+        DEBUG_PRINTF("match @ pos %u\n", pos);
+        assert(pos < 64);
+        return buf + pos;
+    } else {
+        return NULL; // no match
+    }
+}
+
+template <>
+really_really_inline
+const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len) {
+    assert(SuperVector<16>::mask_width() == 1);
+    SuperVector<16>::comparemask_type z = v.comparemask();
+    DEBUG_PRINTF("buf %p z %08x \n", buf, z);
+    if (unlikely(z != 0xffff)) {
+        u32 pos = clz32(~z & 0xffffu);
+        DEBUG_PRINTF("~z %08x\n", ~z);
+        DEBUG_PRINTF("match @ pos %u\n", pos);
+        assert(pos >= 16 && pos < 32);
+        return buf + (31 - pos);
+    } else {
+        return NULL; // no match
+    }
+}
+
+template<>
+really_really_inline
+const u8 *last_zero_match_inverted<32>(const u8 *buf, SuperVector<32> v, uint16_t UNUSED len) {
+    assert(SuperVector<32>::mask_width() == 1);
+    SuperVector<32>::comparemask_type z = v.comparemask();
+    if (unlikely(static_cast<u32>(z) != 0xffffffff)) {
+        u32 pos = clz32(~z & 0xffffffffu);
+        DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos);
+        assert(pos < 32);
+        return buf + (31 - pos);
+    } else {
+        return NULL; // no match
+    }
+}
+
+template <>
+really_really_inline
+const u8 *last_zero_match_inverted<64>(const u8 *buf, SuperVector<64> v, uint16_t len) {
+    assert(SuperVector<64>::mask_width() == 1);
+    v.print8("v");
+    SuperVector<64>::comparemask_type z = v.comparemask();
+    DEBUG_PRINTF("z 0x%016llx\n", z);
+    u64a mask = (~0ULL) >> (64 - len);
+    DEBUG_PRINTF("mask %016llx\n", mask);
+    z = ~z & mask;
+    DEBUG_PRINTF("z 0x%016llx\n", z);
+    if (unlikely(z)) {
+        u32 pos = clz64(z);
+        DEBUG_PRINTF("~z 0x%016llx\n", ~z);
+        DEBUG_PRINTF("match @ pos %u\n", pos);
+        assert(pos < 64);
+        return buf + (63 - pos);
+    } else {
+        return NULL; // no match
+    }
+}
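The helpers above all follow the same recipe: build a per-byte compare mask, then turn it into a buffer offset with ctz (first match, lowest set bit) or clz (last match, highest set bit, hence the 31 - pos and 63 - pos adjustments). Below is a minimal standalone sketch of the same idea, for illustration only and not part of this patch: it assumes SSE2 plus the GCC/Clang __builtin_ctz/__builtin_clz builtins, and first_match_16/last_match_16 are hypothetical names.

#include <emmintrin.h>

/* Offset of the first byte in buf[0..15] equal to c, or -1 if none. */
static int first_match_16(const unsigned char *buf, unsigned char c) {
    __m128i data = _mm_loadu_si128((const __m128i *)buf);
    __m128i cmp  = _mm_cmpeq_epi8(data, _mm_set1_epi8((char)c));
    unsigned z   = (unsigned)_mm_movemask_epi8(cmp); /* one mask bit per byte */
    return z ? (int)__builtin_ctz(z) : -1;           /* lowest set bit = first match */
}

/* Offset of the last matching byte, mirroring the clz32()-based paths above. */
static int last_match_16(const unsigned char *buf, unsigned char c) {
    __m128i data = _mm_loadu_si128((const __m128i *)buf);
    __m128i cmp  = _mm_cmpeq_epi8(data, _mm_set1_epi8((char)c));
    unsigned z   = (unsigned)_mm_movemask_epi8(cmp);
    return z ? 31 - (int)__builtin_clz(z) : -1;      /* same 31 - pos adjustment */
}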
+ diff --git a/src/util/make_unique.h b/src/util/arch/x86/simd_types.h similarity index 80% rename from src/util/make_unique.h rename to src/util/arch/x86/simd_types.h index 651e8c5cf..e16424041 100644 --- a/src/util/make_unique.h +++ b/src/util/arch/x86/simd_types.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,24 +27,20 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#ifndef UTIL_MAKE_UNIQUE_H -#define UTIL_MAKE_UNIQUE_H +#ifndef SIMD_TYPES_X86_H +#define SIMD_TYPES_X86_H -#if (defined(_WIN32) || defined(_WIN64)) && (_MSC_VER > 1700) -// VC++ 2013 onwards has make_unique in the STL -#define USE_STD -#include -#else -#include +#if !defined(m128) && defined(HAVE_SSE42) +typedef __m128i m128; #endif -namespace ue2 { -#if defined(USE_STD) -using std::make_unique; -#else -using boost::make_unique; +#if !defined(m256) && defined(HAVE_AVX2) +typedef __m256i m256; #endif -} -#undef USE_STD -#endif // UTIL_MAKE_UNIQUE_H +#if !defined(m512) && defined(HAVE_AVX512) +typedef __m512i m512; +#endif + +#endif /* SIMD_TYPES_X86_H */ + diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h new file mode 100644 index 000000000..49797abab --- /dev/null +++ b/src/util/arch/x86/simd_utils.h @@ -0,0 +1,986 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SIMD types and primitive operations. + */ + +#ifndef ARCH_X86_SIMD_UTILS_H +#define ARCH_X86_SIMD_UTILS_H + +#include "x86.h" +#include "ue2common.h" +#include "util/simd_types.h" +#include "util/unaligned.h" +#include "util/intrinsics.h" + +#include // for memcpy + +#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 + +/** \brief LUT for the mask1bit functions. 
*/ +ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = { + ZEROES_32, ZEROES_32, + ZEROES_31, 0x01, ZEROES_32, + ZEROES_31, 0x02, ZEROES_32, + ZEROES_31, 0x04, ZEROES_32, + ZEROES_31, 0x08, ZEROES_32, + ZEROES_31, 0x10, ZEROES_32, + ZEROES_31, 0x20, ZEROES_32, + ZEROES_31, 0x40, ZEROES_32, + ZEROES_31, 0x80, ZEROES_32, + ZEROES_32, ZEROES_32, +}; + +static really_inline m128 ones128(void) { +#if defined(__GNUC__) || defined(__INTEL_COMPILER) + /* gcc gets this right */ + return (m128) _mm_set1_epi8(0xFF); +#else + /* trick from Intel's optimization guide to generate all-ones. + * ICC converts this to the single cmpeq instruction */ + return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); +#endif +} + +static really_inline m128 zeroes128(void) { + return (m128) _mm_setzero_si128(); +} + +/** \brief Bitwise not for m128*/ +static really_inline m128 not128(m128 a) { + return (m128) _mm_xor_si128(a, ones128()); +} + +/** \brief Return 1 if a and b are different otherwise 0 */ +static really_inline int diff128(m128 a, m128 b) { + return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff); +} + +static really_inline int isnonzero128(m128 a) { + return !!diff128(a, zeroes128()); +} + +/** + * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich128(m128 a, m128 b) { + a = _mm_cmpeq_epi32(a, b); + return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf; +} + +/** + * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and + * returns a 4-bit mask indicating which 64-bit words contain differences. + */ +static really_inline u32 diffrich64_128(m128 a, m128 b) { +#if defined(HAVE_SSE41) + a = _mm_cmpeq_epi64(a, b); + return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5; +#else + u32 d = diffrich128(a, b); + return (d | (d >> 1)) & 0x5; +#endif +} + +static really_really_inline +m128 add_2x64(m128 a, m128 b) { + return (m128) _mm_add_epi64(a, b); +} + +static really_really_inline +m128 sub_2x64(m128 a, m128 b) { + return (m128) _mm_sub_epi64(a, b); +} + +static really_really_inline +m128 lshift64_m128(m128 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm_sll_epi64(a, x); +} + +#define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) +#define eq128(a, b) _mm_cmpeq_epi8((a), (b)) +#define eq64_m128(a, b) _mm_cmpeq_epi64((a), (b)) +#define movemask128(a) ((u32)_mm_movemask_epi8((a))) + +#if defined(HAVE_AVX512) +static really_inline m128 cast512to128(const m512 in) { + return _mm512_castsi512_si128(in); +} +#endif + +static really_inline m128 set1_16x8(u8 c) { + return _mm_set1_epi8(c); +} + +static really_inline m128 set1_4x32(u32 c) { + return _mm_set1_epi32(c); +} + +static really_inline m128 set1_2x64(u64a c) { + return _mm_set1_epi64x(c); +} + +static really_inline u32 movd(const m128 in) { + return _mm_cvtsi128_si32(in); +} + +static really_inline u64a movq(const m128 in) { +#if defined(ARCH_X86_64) + return _mm_cvtsi128_si64(in); +#else // 32-bit - this is horrific + u32 lo = movd(in); + u32 hi = movd(_mm_srli_epi64(in, 32)); + return (u64a)hi << 32 | lo; +#endif +} + +/* another form of movq */ +static really_inline +m128 load_m128_from_u64a(const u64a *p) { + return _mm_set_epi64x(0LL, *p); +} + +#define CASE_RSHIFT_VECTOR(a, count) case count: return _mm_srli_si128((m128)(a), (count)); break; + +static really_inline +m128 
rshiftbyte_m128(const m128 a, int count_immed) { +#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND) + if (__builtin_constant_p(count_immed)) { + return _mm_srli_si128(a, count_immed); + } +#endif + switch (count_immed) { + case 0: return a; break; + CASE_RSHIFT_VECTOR(a, 1); + CASE_RSHIFT_VECTOR(a, 2); + CASE_RSHIFT_VECTOR(a, 3); + CASE_RSHIFT_VECTOR(a, 4); + CASE_RSHIFT_VECTOR(a, 5); + CASE_RSHIFT_VECTOR(a, 6); + CASE_RSHIFT_VECTOR(a, 7); + CASE_RSHIFT_VECTOR(a, 8); + CASE_RSHIFT_VECTOR(a, 9); + CASE_RSHIFT_VECTOR(a, 10); + CASE_RSHIFT_VECTOR(a, 11); + CASE_RSHIFT_VECTOR(a, 12); + CASE_RSHIFT_VECTOR(a, 13); + CASE_RSHIFT_VECTOR(a, 14); + CASE_RSHIFT_VECTOR(a, 15); + default: return zeroes128(); break; + } +} +#undef CASE_RSHIFT_VECTOR + +#define CASE_LSHIFT_VECTOR(a, count) case count: return _mm_slli_si128((m128)(a), (count)); break; + +static really_inline +m128 lshiftbyte_m128(const m128 a, int count_immed) { +#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND) + if (__builtin_constant_p(count_immed)) { + return _mm_slli_si128(a, count_immed); + } +#endif + switch (count_immed) { + case 0: return a; break; + CASE_LSHIFT_VECTOR(a, 1); + CASE_LSHIFT_VECTOR(a, 2); + CASE_LSHIFT_VECTOR(a, 3); + CASE_LSHIFT_VECTOR(a, 4); + CASE_LSHIFT_VECTOR(a, 5); + CASE_LSHIFT_VECTOR(a, 6); + CASE_LSHIFT_VECTOR(a, 7); + CASE_LSHIFT_VECTOR(a, 8); + CASE_LSHIFT_VECTOR(a, 9); + CASE_LSHIFT_VECTOR(a, 10); + CASE_LSHIFT_VECTOR(a, 11); + CASE_LSHIFT_VECTOR(a, 12); + CASE_LSHIFT_VECTOR(a, 13); + CASE_LSHIFT_VECTOR(a, 14); + CASE_LSHIFT_VECTOR(a, 15); + default: return zeroes128(); break; + } +} +#undef CASE_LSHIFT_VECTOR + +#if defined(HAVE_SSE41) +#define extract32from128(a, imm) _mm_extract_epi32(a, imm) +#define extract64from128(a, imm) _mm_extract_epi64(a, imm) +#else +#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2)) +#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3)) +#endif + +#if !defined(HAVE_AVX2) +// TODO: this entire file needs restructuring - this carveout is awful +#define extractlow64from256(a) movq(a.lo) +#define extractlow32from256(a) movd(a.lo) +#if defined(HAVE_SSE41) +#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4) +#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2) +#else +#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4)) +#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? 
a.hi : a.lo, (imm % 2) * 8)) +#endif + +#endif // !AVX2 + +static really_inline m128 add128(m128 a, m128 b) { + return _mm_add_epi64(a, b); +} + +static really_inline m128 and128(m128 a, m128 b) { + return _mm_and_si128(a,b); +} + +static really_inline m128 xor128(m128 a, m128 b) { + return _mm_xor_si128(a,b); +} + +static really_inline m128 or128(m128 a, m128 b) { + return _mm_or_si128(a,b); +} + +#if defined(HAVE_AVX512VBMI) +static really_inline m512 broadcast128(m128 a) { + return _mm512_broadcast_i32x4(a); +} +#endif + +static really_inline m128 andnot128(m128 a, m128 b) { + return _mm_andnot_si128(a, b); +} + +// aligned load +static really_inline m128 load128(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m128))); + ptr = vectorscan_assume_aligned(ptr, 16); + return _mm_load_si128((const m128 *)ptr); +} + +// aligned store +static really_inline void store128(void *ptr, m128 a) { + assert(ISALIGNED_N(ptr, alignof(m128))); + ptr = vectorscan_assume_aligned(ptr, 16); + *(m128 *)ptr = a; +} + +// unaligned load +static really_inline m128 loadu128(const void *ptr) { + return _mm_loadu_si128((const m128 *)ptr); +} + +// unaligned store +static really_inline void storeu128(void *ptr, m128 a) { + _mm_storeu_si128 ((m128 *)ptr, a); +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes128(void *ptr, m128 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m128 loadbytes128(const void *ptr, unsigned int n) { + m128 a = zeroes128(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline +m128 mask1bit128(unsigned int n) { + assert(n < sizeof(m128) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu128(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. +static really_inline +void setbit128(m128 *ptr, unsigned int n) { + *ptr = or128(mask1bit128(n), *ptr); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit128(m128 *ptr, unsigned int n) { + *ptr = andnot128(mask1bit128(n), *ptr); +} + +// tests bit N in the given vector. 
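/*
 * Editorial note, not part of the original sources: the mask1bit*() helpers
 * above rely on the layout of simd_onebit_masks, which stores each one-bit
 * byte value (1 << b) at offset 64*b + 95, padded by zero bytes on both
 * sides. The index ((n % 8) * 64) + 95 - n / 8 therefore selects a window
 * whose byte (n / 8) is exactly (1 << (n % 8)) and whose remaining bytes are
 * zero, i.e. a vector with only bit n set. For example, n = 21 gives
 * mask_idx = 5*64 + 95 - 2 = 413, and byte 2 of the loaded vector is 0x20.
 */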
+static really_inline +char testbit128(m128 val, unsigned int n) { + const m128 mask = mask1bit128(n); +#if defined(HAVE_SSE41) + return !_mm_testz_si128(mask, val); +#else + return isnonzero128(and128(mask, val)); +#endif +} + +// offset must be an immediate +#define palignr_imm(r, l, offset) _mm_alignr_epi8(r, l, offset) + +static really_inline +m128 pshufb_m128(m128 a, m128 b) { + return _mm_shuffle_epi8(a, b); +} + +#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return palignr_imm((m128)(a), (m128)(b), (offset)); break; + +static really_really_inline +m128 palignr_sw(m128 r, m128 l, int offset) { + switch (offset) { + case 0: return l; break; + CASE_ALIGN_VECTORS(r, l, 1); + CASE_ALIGN_VECTORS(r, l, 2); + CASE_ALIGN_VECTORS(r, l, 3); + CASE_ALIGN_VECTORS(r, l, 4); + CASE_ALIGN_VECTORS(r, l, 5); + CASE_ALIGN_VECTORS(r, l, 6); + CASE_ALIGN_VECTORS(r, l, 7); + CASE_ALIGN_VECTORS(r, l, 8); + CASE_ALIGN_VECTORS(r, l, 9); + CASE_ALIGN_VECTORS(r, l, 10); + CASE_ALIGN_VECTORS(r, l, 11); + CASE_ALIGN_VECTORS(r, l, 12); + CASE_ALIGN_VECTORS(r, l, 13); + CASE_ALIGN_VECTORS(r, l, 14); + CASE_ALIGN_VECTORS(r, l, 15); + case 16: return r; break; + default: + return zeroes128(); + break; + } +} +#undef CASE_ALIGN_VECTORS + +static really_really_inline +m128 palignr(m128 r, m128 l, int offset) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(offset)) { + return palignr_imm(r, l, offset); + } +#endif + return palignr_sw(r, l, offset); +} + +static really_inline +m128 variable_byte_shift_m128(m128 in, s32 amount) { + assert(amount >= -16 && amount <= 16); + if (amount < 0) { + return palignr(zeroes128(), in, -amount); + } else { + return palignr(in, zeroes128(), 16 - amount); + } +} +/* +static really_inline +m128 variable_byte_shift_m128(m128 in, s32 amount) { + assert(amount >= -16 && amount <= 16); + m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); + return pshufb_m128(in, shift_mask); +}*/ + +static really_inline +m128 max_u8_m128(m128 a, m128 b) { + return _mm_max_epu8(a, b); +} + +static really_inline +m128 min_u8_m128(m128 a, m128 b) { + return _mm_min_epu8(a, b); +} + +static really_inline +m128 sadd_u8_m128(m128 a, m128 b) { + return _mm_adds_epu8(a, b); +} + +static really_inline +m128 sub_u8_m128(m128 a, m128 b) { + return _mm_sub_epi8(a, b); +} + +static really_inline +m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) { + return _mm_set_epi32(x3, x2, x1, x0); +} + +static really_inline +m128 set2x64(u64a hi, u64a lo) { + return _mm_set_epi64x(hi, lo); +} + +/**** + **** 256-bit Primitives + ****/ + +#if defined(HAVE_SIMD_256_BITS) && defined(HAVE_AVX2) + +static really_inline +m256 pshufb_m256(m256 a, m256 b) { + return _mm256_shuffle_epi8(a, b); +} + +static really_really_inline +m256 lshift64_m256(m256 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm256_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm256_sll_epi64(a, x); +} + +#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b)) + +static really_inline m256 set1_4x64(u64a c) { + return _mm256_set1_epi64x(c); +} + +#define eq256(a, b) _mm256_cmpeq_epi8((a), (b)) +#define movemask256(a) ((u32)_mm256_movemask_epi8((a))) + +static really_inline +m256 set1_2x128(m128 a) { + return _mm256_broadcastsi128_si256(a); +} + +static really_inline m256 zeroes256(void) { + return _mm256_setzero_si256(); +} + +static really_inline m256 ones256(void) { + m256 rv = _mm256_set1_epi8(0xFF); + return rv; +} + +static really_inline m256 add256(m256 a, 
m256 b) { + return _mm256_add_epi64(a, b); +} + +static really_inline m256 and256(m256 a, m256 b) { + return _mm256_and_si256(a, b); +} + +static really_inline m256 or256(m256 a, m256 b) { + return _mm256_or_si256(a, b); +} + +#if defined(HAVE_AVX512VBMI) +static really_inline m512 broadcast256(m256 a) { + return _mm512_broadcast_i64x4(a); +} +#endif + +static really_inline m256 xor256(m256 a, m256 b) { + return _mm256_xor_si256(a, b); +} + +static really_inline m256 not256(m256 a) { + return _mm256_xor_si256(a, ones256()); +} + +static really_inline m256 andnot256(m256 a, m256 b) { + return _mm256_andnot_si256(a, b); +} + +static really_inline int diff256(m256 a, m256 b) { + return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1); +} + +static really_inline int isnonzero256(m256 a) { + return !!diff256(a, zeroes256()); +} + +/** + * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich256(m256 a, m256 b) { + a = _mm256_cmpeq_epi32(a, b); + return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF; +} + +/** + * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and + * returns an 8-bit mask indicating which 64-bit words contain differences. + */ +static really_inline u32 diffrich64_256(m256 a, m256 b) { + u32 d = diffrich256(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline m256 load256(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m256))); + return _mm256_load_si256((const m256 *)ptr); +} + +// aligned load of 128-bit value to low and high part of 256-bit value +static really_inline m256 load2x128(const void *ptr) { + return set1_2x128(load128(ptr)); +} + +static really_inline m256 loadu2x128(const void *ptr) { + return set1_2x128(loadu128(ptr)); +} + +// aligned store +static really_inline void store256(void *ptr, m256 a) { + assert(ISALIGNED_N(ptr, alignof(m256))); + _mm256_store_si256((m256 *)ptr, a); +} + +// unaligned load +static really_inline m256 loadu256(const void *ptr) { + return _mm256_loadu_si256((const m256 *)ptr); +} + +static really_really_inline +m256 loadu_maskz_m256(__mmask32 k, const void *ptr) { + return _mm256_maskz_loadu_epi8(k, ptr); +} + +// unaligned store +static really_inline void storeu256(void *ptr, m256 a) { + _mm256_storeu_si256((m256 *)ptr, a); +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes256(void *ptr, m256 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m256 loadbytes256(const void *ptr, unsigned int n) { + m256 a = zeroes256(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline +m256 mask1bit256(unsigned int n) { + assert(n < sizeof(m256) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu256(&simd_onebit_masks[mask_idx]); +} + +static really_inline +m256 set1_32x8(u32 in) { + return _mm256_set1_epi8(in); +} + +static really_inline +m256 set8x32(u32 hi_3, u32 hi_2, u32 hi_1, u32 hi_0, u32 lo_3, u32 lo_2, u32 lo_1, u32 lo_0) { + return _mm256_set_epi32(hi_3, hi_2, hi_1, hi_0, lo_3, lo_2, lo_1, lo_0); +} + +static really_inline +m256 set4x64(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { + return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0); +} + +// switches on bit N in the given vector. 
+static really_inline +void setbit256(m256 *ptr, unsigned int n) { + *ptr = or256(mask1bit256(n), *ptr); +} + +static really_inline +void clearbit256(m256 *ptr, unsigned int n) { + *ptr = andnot256(mask1bit256(n), *ptr); +} + +// tests bit N in the given vector. +static really_inline +char testbit256(m256 val, unsigned int n) { + const m256 mask = mask1bit256(n); + return !_mm256_testz_si256(mask, val); +} + +static really_really_inline +m128 movdq_hi(m256 x) { + return _mm256_extracti128_si256(x, 1); +} + +static really_really_inline +m128 movdq_lo(m256 x) { + return _mm256_extracti128_si256(x, 0); +} + +#define cast256to128(a) _mm256_castsi256_si128(a) +#define cast128to256(a) _mm256_castsi128_si256(a) +#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) +#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) +#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) +#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) +#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) +#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) +#define extractlow64from256(a) movq(cast256to128(a)) +#define extractlow32from256(a) movd(cast256to128(a)) +#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b) +#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b) +#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) + +static really_inline +m256 combine2x128(m128 hi, m128 lo) { +#if defined(_mm256_set_m128i) + return _mm256_set_m128i(hi, lo); +#else + return insert128to256(cast128to256(lo), hi, 1); +#endif +} +#endif //AVX2 + +/**** + **** 512-bit Primitives + ****/ + +#if defined(HAVE_SIMD_512_BITS) + +#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) +#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) +#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) +#define set2x256(a) _mm512_broadcast_i64x4(a) +#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) +#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) + +static really_inline u32 movd512(const m512 in) { + // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in), + // so we use 2-step convertions to work around. + return _mm_cvtsi128_si32(_mm512_castsi512_si128(in)); +} + +static really_inline u64a movq512(const m512 in) { + // NOTE: seems AVX512 doesn't support _mm512_cvtsi512_si64(in), + // so we use 2-step convertions to work around. 
+ return movq(_mm512_castsi512_si128(in)); +} + +static really_inline +m512 pshufb_m512(m512 a, m512 b) { + return _mm512_shuffle_epi8(a, b); +} + +static really_inline +m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { + return _mm512_maskz_shuffle_epi8(k, a, b); +} + +#if defined(HAVE_AVX512VBMI) +#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a) +#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a) +#endif + +#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) +#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) + +static really_inline +m512 zeroes512(void) { +#if defined(HAVE_AVX512) + return _mm512_setzero_si512(); +#else + m512 rv = {zeroes256(), zeroes256()}; + return rv; +#endif +} + +static really_inline +m512 ones512(void) { + return _mm512_set1_epi8(0xFF); + //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512()); +} + +static really_inline +m512 set1_64x8(u8 a) { + return _mm512_set1_epi8(a); +} + +static really_inline +m512 set1_8x64(u64a a) { + return _mm512_set1_epi64(a); +} + +static really_inline +m512 set1_16x32(u32 a) { + return _mm512_set1_epi32(a); +} + +static really_inline +m512 set8x64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, + u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { + return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0, + lo_3, lo_2, lo_1, lo_0); +} + +static really_inline +m512 swap256in512(m512 a) { + m512 idx = set8x64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); + return vpermq512(idx, a); +} + +static really_inline +m512 set1_4x128(m128 a) { + return _mm512_broadcast_i32x4(a); +} + +static really_inline +m512 sadd_u8_m512(m512 a, m512 b) { + return _mm512_adds_epu8(a, b); +} + +static really_inline +m512 max_u8_m512(m512 a, m512 b) { + return _mm512_max_epu8(a, b); +} + +static really_inline +m512 min_u8_m512(m512 a, m512 b) { + return _mm512_min_epu8(a, b); +} + +static really_inline +m512 sub_u8_m512(m512 a, m512 b) { + return _mm512_sub_epi8(a, b); +} + +static really_inline m512 +add512(m512 a, m512 b) { + return _mm512_add_epi64(a, b); +} + +static really_inline +m512 and512(m512 a, m512 b) { + return _mm512_and_si512(a, b); +} + +static really_inline +m512 or512(m512 a, m512 b) { + return _mm512_or_si512(a, b); +} + +#if defined(HAVE_AVX512VBMI) +static really_inline m512 broadcast384(m384 a) { + u64a *lo = (u64a*)&a.lo; + u64a *mid = (u64a*)&a.mid; + u64a *hi = (u64a*)&a.hi; + return _mm512_set_epi64(0ULL, 0ULL, hi[1], hi[0], mid[1], mid[0], + lo[1], lo[0]); +} +#endif + +static really_inline +m512 xor512(m512 a, m512 b) { + return _mm512_xor_si512(a, b); +} + +static really_inline +m512 not512(m512 a) { + return _mm512_xor_si512(a, ones512()); +} + +static really_inline +m512 andnot512(m512 a, m512 b) { + return _mm512_andnot_si512(a, b); +} + +static really_really_inline +m512 lshift64_m512(m512 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm512_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm512_sll_epi64(a, x); +} + +#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) +#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) +#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) + +#if !defined(_MM_CMPINT_NE) +#define _MM_CMPINT_NE 0x4 +#endif + +static really_inline +int diff512(m512 a, m512 b) { + return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE); +} + +static really_inline +int isnonzero512(m512 a) { + return diff512(a, zeroes512()); 
+} + +/** + * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline +u32 diffrich512(m512 a, m512 b) { + return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE); +} + +/** + * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and + * returns a 16-bit mask indicating which 64-bit words contain differences. + */ +static really_inline +u32 diffrich64_512(m512 a, m512 b) { + //TODO: cmp_epi64? + u32 d = diffrich512(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline +m512 load512(const void *ptr) { + return _mm512_load_si512(ptr); +} + +// aligned store +static really_inline +void store512(void *ptr, m512 a) { + assert(ISALIGNED_N(ptr, alignof(m512))); + return _mm512_store_si512(ptr, a); +} + +// unaligned load +static really_inline +m512 loadu512(const void *ptr) { + return _mm512_loadu_si512(ptr); +} + +// unaligned store +static really_inline +void storeu512(void *ptr, m512 a) { +#if defined(HAVE_AVX512) + _mm512_storeu_si512((m512 *)ptr, a); +#elif defined(HAVE_AVX2) + storeu256(ptr, a.lo); + storeu256((char *)ptr + 32, a.hi); +#else + storeu128(ptr, a.lo.lo); + storeu128((char *)ptr + 16, a.lo.hi); + storeu128((char *)ptr + 32, a.hi.lo); + storeu128((char *)ptr + 48, a.hi.hi); +#endif +} + +static really_inline +m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { + return _mm512_maskz_loadu_epi8(k, ptr); +} + +static really_inline +m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) { + return _mm512_mask_loadu_epi8(src, k, ptr); +} + +static really_inline +void storeu_mask_m512(void *ptr, __mmask64 k, m512 a) { + _mm512_mask_storeu_epi8(ptr, k, a); +} + +static really_inline +m512 set_mask_m512(__mmask64 k) { + return _mm512_movm_epi8(k); +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes512(void *ptr, m512 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m512 loadbytes512(const void *ptr, unsigned int n) { + m512 a = zeroes512(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline +m512 mask1bit512(unsigned int n) { + assert(n < sizeof(m512) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu512(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. +static really_inline +void setbit512(m512 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + *ptr = or512(mask1bit512(n), *ptr); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit512(m512 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + *ptr = andnot512(mask1bit512(n), *ptr); +} + +// tests bit N in the given vector. 
+static really_inline +char testbit512(m512 val, unsigned int n) { + assert(n < sizeof(val) * 8); + const m512 mask = mask1bit512(n); + return !!_mm512_test_epi8_mask(mask, val); +} + +#endif // HAVE_SIMD_512_BITS + +#endif // ARCH_X86_SIMD_UTILS_H diff --git a/src/util/arch/x86/x86.h b/src/util/arch/x86/x86.h new file mode 100644 index 000000000..87a525584 --- /dev/null +++ b/src/util/arch/x86/x86.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2017-2020, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Per-platform architecture definitions + */ + +#ifndef UTIL_ARCH_X86_H_ +#define UTIL_ARCH_X86_H_ + +#if defined(__SSE2__) || defined(_M_X64) || (_M_IX86_FP >= 2) +#define HAVE_SSE2 +#define HAVE_SIMD_128_BITS +#endif + +#if defined(__SSE4_1__) || defined(__AVX__) +#define HAVE_SSE41 +#define HAVE_SIMD_128_BITS +#endif + +#if defined(__SSE4_2__) || defined(__AVX__) +#define HAVE_SSE42 +#define HAVE_SIMD_128_BITS +#endif + +#if defined(__AVX__) && defined(BUILD_AVX2) +#define HAVE_AVX +#define HAVE_SIMD_256_BITS +#endif + +#if defined(__AVX2__) && defined(BUILD_AVX2) +#define HAVE_AVX2 +#define HAVE_SIMD_256_BITS +#endif + +#if defined(__AVX512BW__) && defined(BUILD_AVX512) +#define HAVE_AVX512 +#define HAVE_MASKED_LOADS +#define HAVE_SIMD_512_BITS +#endif + +#if defined(__AVX512VBMI__) && defined(BUILD_AVX512) +#define HAVE_AVX512VBMI +#endif + +#if defined(HAVE_SIMD_512_BITS) +#define CHUNKSIZE 512 +#define VECTORSIZE 64 +#elif defined(HAVE_SIMD_256_BITS) +#define CHUNKSIZE 256 +#define VECTORSIZE 32 +#elif defined(HAVE_SIMD_128_BITS) +#define CHUNKSIZE 128 +#define VECTORSIZE 16 +#endif + +#if defined(__POPCNT__) +#define HAVE_POPCOUNT_INSTR +#endif + +#if defined(__BMI__) +#define HAVE_BMI +#endif + +#if defined(__BMI2__) +#define HAVE_BMI2 +#endif + +#endif // UTIL_ARCH_X86_H_ diff --git a/src/util/bitfield.h b/src/util/bitfield.h index a580da7b6..4a3fbd6ed 100644 --- a/src/util/bitfield.h +++ b/src/util/bitfield.h @@ -138,8 +138,8 @@ class bitfield { /// Flip all bits. 
void flip() { - for (auto &e : bits) { - e = ~e; + for (size_t i = 0; i < size(); i++) { + flip(i); } clear_trailer(); } @@ -189,10 +189,7 @@ class bitfield { size_t sum = 0; size_t i = 0; for (; i + 4 <= num_blocks; i += 4) { - sum += popcount64(bits[i]); - sum += popcount64(bits[i + 1]); - sum += popcount64(bits[i + 2]); - sum += popcount64(bits[i + 3]); + sum += popcount64x4(&bits[i]); } for (; i < num_blocks; i++) { sum += popcount64(bits[i]); diff --git a/src/util/bitutils.h b/src/util/bitutils.h index c545ee187..8e9aae9c2 100644 --- a/src/util/bitutils.h +++ b/src/util/bitutils.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2023, VectorCamp PC * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,6 +34,7 @@ #ifndef BITUTILS_H #define BITUTILS_H +#include "config.h" #include "ue2common.h" #include "popcount.h" #include "util/arch.h" @@ -43,450 +45,180 @@ #define DOUBLE_CASE_CLEAR 0xdfdf #define OCTO_CASE_CLEAR 0xdfdfdfdfdfdfdfdfULL +#if !defined(VS_SIMDE_BACKEND) +#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#include "util/arch/x86/bitutils.h" +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#include "util/arch/arm/bitutils.h" +#elif defined(ARCH_PPC64EL) +#include "util/arch/ppc64el/bitutils.h" +#endif +#else +#include "util/arch/common/bitutils.h" +#define clz32_impl clz32_impl_c +#define clz64_impl clz64_impl_c +#define ctz32_impl ctz32_impl_c +#define ctz64_impl ctz64_impl_c +#define lg2_impl lg2_impl_c +#define lg2_64_impl lg2_64_impl_c +#define findAndClearLSB_32_impl findAndClearLSB_32_impl_c +#define findAndClearLSB_64_impl findAndClearLSB_64_impl_c +#define findAndClearMSB_32_impl findAndClearMSB_32_impl_c +#define findAndClearMSB_64_impl findAndClearMSB_64_impl_c +#define compress32_impl compress32_impl_c +#define compress64_impl compress64_impl_c +#define compress128_impl compress128_impl_c +#define expand32_impl expand32_impl_c +#define expand64_impl expand64_impl_c +#define expand128_impl expand128_impl_c +#define bf64_iterate_impl bf64_iterate_impl_c +#define bf64_set_impl bf64_set_impl_c +#define bf64_unset_impl bf64_unset_impl_c +#define rank_in_mask32_impl rank_in_mask32_impl_c +#define rank_in_mask64_impl rank_in_mask64_impl_c +#define pext32_impl pext32_impl_c +#define pext64_impl pext64_impl_c +#define pdep64_impl pdep64_impl_c +#endif + static really_inline u32 clz32(u32 x) { assert(x); // behaviour not defined for x == 0 -#if defined(_WIN32) - unsigned long r; - _BitScanReverse(&r, x); - return 31 - r; -#else - return (u32)__builtin_clz(x); -#endif + + return clz32_impl(x); } static really_inline u32 clz64(u64a x) { assert(x); // behaviour not defined for x == 0 -#if defined(_WIN64) - unsigned long r; - _BitScanReverse64(&r, x); - return 63 - r; -#elif defined(_WIN32) - unsigned long x1 = (u32)x; - unsigned long x2 = (u32)(x >> 32); - unsigned long r; - if (x2) { - _BitScanReverse(&r, x2); - return (u32)(31 - r); - } - _BitScanReverse(&r, (u32)x1); - return (u32)(63 - r); -#else - return (u32)__builtin_clzll(x); -#endif + + return clz64_impl(x); } // CTZ (count trailing zero) implementations. 
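/*
 * Illustrative note, not part of the patch: these wrappers keep the original
 * semantics and only move the implementations behind per-architecture *_impl
 * dispatch (or the generic C fallbacks selected under VS_SIMDE_BACKEND). For
 * example, clz32(0x00f0) == 24 and ctz32(0x00f0) == 4, which is what the
 * match helpers rely on when converting a comparemask into a byte offset.
 */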
static really_inline u32 ctz32(u32 x) { assert(x); // behaviour not defined for x == 0 -#if defined(_WIN32) - unsigned long r; - _BitScanForward(&r, x); - return r; -#else - return (u32)__builtin_ctz(x); -#endif + + return ctz32_impl(x); } static really_inline u32 ctz64(u64a x) { assert(x); // behaviour not defined for x == 0 -#if defined(_WIN64) - unsigned long r; - _BitScanForward64(&r, x); - return r; -#elif defined(_WIN32) - unsigned long r; - if (_BitScanForward(&r, (u32)x)) { - return (u32)r; - } - _BitScanForward(&r, x >> 32); - return (u32)(r + 32); -#else - return (u32)__builtin_ctzll(x); -#endif + + return ctz64_impl(x); } static really_inline u32 lg2(u32 x) { - if (!x) { - return 0; - } - return 31 - clz32(x); + return lg2_impl(x); } static really_inline u64a lg2_64(u64a x) { - if (!x) { - return 0; - } - return 63 - clz64(x); + return lg2_64_impl(x); } static really_inline u32 findAndClearLSB_32(u32 *v) { - assert(*v != 0); // behaviour not defined in this case -#ifndef NO_ASM - u32 val = *v, offset; - __asm__ ("bsf %1, %0\n" - "btr %0, %1\n" - : "=r" (offset), "=r" (val) - : "1" (val)); - *v = val; -#else - u32 val = *v; - u32 offset = ctz32(val); - *v = val & (val - 1); -#endif - - assert(offset < 32); - return offset; + return findAndClearLSB_32_impl(v); } static really_inline u32 findAndClearLSB_64(u64a *v) { - assert(*v != 0); // behaviour not defined in this case - -#ifdef ARCH_64_BIT -#if defined(ARCH_X86_64) && !defined(NO_ASM) - u64a val = *v, offset; - __asm__ ("bsfq %1, %0\n" - "btrq %0, %1\n" - : "=r" (offset), "=r" (val) - : "1" (val)); - *v = val; -#else - // generic variant using gcc's builtin on 64-bit - u64a val = *v, offset; - offset = ctz64(val); - *v = val & (val - 1); -#endif // ARCH_X86_64 -#else - // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't - // inline calls to __builtin_ctzll - u32 v1 = (u32)*v; - u32 v2 = (u32)(*v >> 32); - u32 offset; - if (v1) { - offset = findAndClearLSB_32(&v1); - *v = (u64a)v1 | ((u64a)v2 << 32); - } else { - offset = findAndClearLSB_32(&v2) + 32; - *v = (u64a)v2 << 32; - } -#endif - - assert(offset < 64); - return (u32)offset; + return findAndClearLSB_64_impl(v); } static really_inline u32 findAndClearMSB_32(u32 *v) { - assert(*v != 0); // behaviour not defined in this case -#ifndef NO_ASM - u32 val = *v, offset; - __asm__ ("bsr %1, %0\n" - "btr %0, %1\n" - : "=r" (offset), "=r" (val) - : "1" (val)); - *v = val; -#else - u32 val = *v; - u32 offset = 31 - clz32(val); - *v = val & ~(1 << offset); -#endif - assert(offset < 32); - return offset; + return findAndClearMSB_32_impl(v); } static really_inline u32 findAndClearMSB_64(u64a *v) { - assert(*v != 0); // behaviour not defined in this case - -#ifdef ARCH_64_BIT -#if defined(ARCH_X86_64) && !defined(NO_ASM) - u64a val = *v, offset; - __asm__ ("bsrq %1, %0\n" - "btrq %0, %1\n" - : "=r" (offset), "=r" (val) - : "1" (val)); - *v = val; -#else - // generic variant using gcc's builtin on 64-bit - u64a val = *v, offset; - offset = 63 - clz64(val); - *v = val & ~(1ULL << offset); -#endif // ARCH_X86_64 -#else - // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't - // inline calls to __builtin_ctzll - u32 v1 = (u32)*v; - u32 v2 = (*v >> 32); - u32 offset; - if (v2) { - offset = findAndClearMSB_32(&v2) + 32; - *v = ((u64a)v2 << 32) | (u64a)v1; - } else { - offset = findAndClearMSB_32(&v1); - *v = (u64a)v1; - } -#endif - - assert(offset < 64); - return (u32)offset; + return findAndClearMSB_64_impl(v); } static really_inline u32 
compress32(u32 x, u32 m) { -#if defined(HAVE_BMI2) - // BMI2 has a single instruction for this operation. - return _pext_u32(x, m); -#else - - // Return zero quickly on trivial cases - if ((x & m) == 0) { - return 0; - } - - u32 mk, mp, mv, t; - - x &= m; // clear irrelevant bits - - mk = ~m << 1; // we will count 0's to right - for (u32 i = 0; i < 5; i++) { - mp = mk ^ (mk << 1); - mp ^= mp << 2; - mp ^= mp << 4; - mp ^= mp << 8; - mp ^= mp << 16; - - mv = mp & m; // bits to move - m = (m ^ mv) | (mv >> (1 << i)); // compress m - t = x & mv; - x = (x ^ t) | (t >> (1 << i)); // compress x - mk = mk & ~mp; - } - - return x; -#endif + return compress32_impl(x, m); } static really_inline u64a compress64(u64a x, u64a m) { -#if defined(ARCH_X86_64) && defined(HAVE_BMI2) - // BMI2 has a single instruction for this operation. - return _pext_u64(x, m); -#else - - // Return zero quickly on trivial cases - if ((x & m) == 0) { - return 0; - } - - u64a mk, mp, mv, t; - - x &= m; // clear irrelevant bits - - mk = ~m << 1; // we will count 0's to right - for (u32 i = 0; i < 6; i++) { - mp = mk ^ (mk << 1); - mp ^= mp << 2; - mp ^= mp << 4; - mp ^= mp << 8; - mp ^= mp << 16; - mp ^= mp << 32; - - mv = mp & m; // bits to move - m = (m ^ mv) | (mv >> (1 << i)); // compress m - t = x & mv; - x = (x ^ t) | (t >> (1 << i)); // compress x - mk = mk & ~mp; - } + return compress64_impl(x, m); +} - return x; -#endif +static really_inline +m128 compress128(m128 x, m128 m) { + return compress128_impl(x, m); } static really_inline u32 expand32(u32 x, u32 m) { -#if defined(HAVE_BMI2) - // BMI2 has a single instruction for this operation. - return _pdep_u32(x, m); -#else - - // Return zero quickly on trivial cases - if (!x || !m) { - return 0; - } - - u32 m0, mk, mp, mv, t; - u32 array[5]; - - m0 = m; // save original mask - mk = ~m << 1; // we will count 0's to right - - for (int i = 0; i < 5; i++) { - mp = mk ^ (mk << 1); // parallel suffix - mp = mp ^ (mp << 2); - mp = mp ^ (mp << 4); - mp = mp ^ (mp << 8); - mp = mp ^ (mp << 16); - mv = mp & m; // bits to move - array[i] = mv; - m = (m ^ mv) | (mv >> (1 << i)); // compress m - mk = mk & ~mp; - } - - for (int i = 4; i >= 0; i--) { - mv = array[i]; - t = x << (1 << i); - x = (x & ~mv) | (t & mv); - } - - return x & m0; // clear out extraneous bits -#endif + return expand32_impl(x, m); } static really_inline u64a expand64(u64a x, u64a m) { -#if defined(ARCH_X86_64) && defined(HAVE_BMI2) - // BMI2 has a single instruction for this operation. - return _pdep_u64(x, m); -#else - - // Return zero quickly on trivial cases - if (!x || !m) { - return 0; - } - - u64a m0, mk, mp, mv, t; - u64a array[6]; - - m0 = m; // save original mask - mk = ~m << 1; // we will count 0's to right - - for (int i = 0; i < 6; i++) { - mp = mk ^ (mk << 1); // parallel suffix - mp = mp ^ (mp << 2); - mp = mp ^ (mp << 4); - mp = mp ^ (mp << 8); - mp = mp ^ (mp << 16); - mp = mp ^ (mp << 32); - mv = mp & m; // bits to move - array[i] = mv; - m = (m ^ mv) | (mv >> (1 << i)); // compress m - mk = mk & ~mp; - } - - for (int i = 5; i >= 0; i--) { - mv = array[i]; - t = x << (1 << i); - x = (x & ~mv) | (t & mv); - } - - return x & m0; // clear out extraneous bits -#endif + return expand64_impl(x, m); } +static really_inline +m128 expand128(m128 x, m128 m) { + return expand128_impl(x, m); +} /* returns the first set bit after begin (if not ~0U). 
If no bit is set after * begin returns ~0U */ static really_inline u32 bf64_iterate(u64a bitfield, u32 begin) { - if (begin != ~0U) { - /* switch off all bits at or below begin. Note: not legal to shift by - * by size of the datatype or larger. */ - assert(begin <= 63); - bitfield &= ~((2ULL << begin) - 1); - } - - if (!bitfield) { - return ~0U; - } - - return ctz64(bitfield); + return bf64_iterate_impl(bitfield, begin); } static really_inline char bf64_set(u64a *bitfield, u32 i) { - assert(i < 64); - u64a mask = 1ULL << i; - char was_set = !!(*bitfield & mask); - *bitfield |= mask; - - return was_set; + return bf64_set_impl(bitfield, i); } static really_inline void bf64_unset(u64a *bitfield, u32 i) { - assert(i < 64); - *bitfield &= ~(1ULL << i); + return bf64_unset_impl(bitfield, i); } static really_inline u32 rank_in_mask32(u32 mask, u32 bit) { - assert(bit < sizeof(u32) * 8); - assert(mask & (u32)(1U << bit)); - mask &= (u32)(1U << bit) - 1; - return popcount32(mask); + return rank_in_mask32_impl(mask, bit); } static really_inline u32 rank_in_mask64(u64a mask, u32 bit) { - assert(bit < sizeof(u64a) * 8); - assert(mask & (u64a)(1ULL << bit)); - mask &= (u64a)(1ULL << bit) - 1; - return popcount64(mask); + return rank_in_mask64_impl(mask, bit); } static really_inline u32 pext32(u32 x, u32 mask) { -#if defined(HAVE_BMI2) - // Intel BMI2 can do this operation in one instruction. - return _pext_u32(x, mask); -#else - - u32 result = 0, num = 1; - while (mask != 0) { - u32 bit = findAndClearLSB_32(&mask); - if (x & (1U << bit)) { - assert(num != 0); // more than 32 bits! - result |= num; - } - num <<= 1; - } - return result; -#endif + return pext32_impl(x, mask); } static really_inline u64a pext64(u64a x, u64a mask) { -#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) - // Intel BMI2 can do this operation in one instruction. - return _pext_u64(x, mask); -#else - - u32 result = 0, num = 1; - while (mask != 0) { - u32 bit = findAndClearLSB_64(&mask); - if (x & (1ULL << bit)) { - assert(num != 0); // more than 32 bits! - result |= num; - } - num <<= 1; - } - return result; -#endif + return pext64_impl(x, mask); } -#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) static really_inline u64a pdep64(u64a x, u64a mask) { - return _pdep_u64(x, mask); + return pdep64_impl(x, mask); +} + +/* compilers don't reliably synthesize the 32-bit ANDN instruction here, + * so we force its generation. 
+ */ +static really_inline +u64a andn(const u32 a, const u8 *b) { + return andn_impl_c(a, b); } -#endif #endif // BITUTILS_H diff --git a/src/util/clique.cpp b/src/util/clique.cpp index c2befea49..19daed3cb 100644 --- a/src/util/clique.cpp +++ b/src/util/clique.cpp @@ -33,7 +33,6 @@ #include "clique.h" #include "container.h" #include "graph_range.h" -#include "make_unique.h" #include #include @@ -51,7 +50,7 @@ vector getNeighborInfo(const CliqueGraph &g, // find neighbors for cv for (const auto &v : adjacent_vertices_range(cv, g)) { if (g[v].stateId != id && contains(group, g[v].stateId)){ - neighbor.push_back(g[v].stateId); + neighbor.emplace_back(g[v].stateId); DEBUG_PRINTF("Neighbor:%u\n", g[v].stateId); } } @@ -68,20 +67,20 @@ vector findCliqueGroup(CliqueGraph &cg) { vector init; for (const auto &v : vertices_range(cg)) { vertexMap[cg[v].stateId] = v; - init.push_back(cg[v].stateId); + init.emplace_back(cg[v].stateId); } gStack.push(init); // Get the vertex to start from vector clique; while (!gStack.empty()) { - vector g = move(gStack.top()); + vector g = std::move(gStack.top()); gStack.pop(); // Choose a vertex from the graph u32 id = g[0]; CliqueVertex &n = vertexMap.at(id); - clique.push_back(id); + clique.emplace_back(id); // Corresponding vertex in the original graph set subgraphId(g.begin(), g.end()); auto neighbor = getNeighborInfo(cg, n, subgraphId); @@ -110,7 +109,7 @@ vector> removeClique(CliqueGraph &cg) { for (const auto &v : vertices_range(cg)) { u32 id = cg[v].stateId; if (find(c.begin(), c.end(), id) != c.end()) { - dead.push_back(v); + dead.emplace_back(v); } } for (const auto &v : dead) { @@ -121,7 +120,7 @@ vector> removeClique(CliqueGraph &cg) { break; } auto clique = findCliqueGroup(cg); - cliquesVec.push_back(clique); + cliquesVec.emplace_back(clique); } return cliquesVec; diff --git a/src/util/copybytes.h b/src/util/copybytes.h index 872b8d289..7f37d96bc 100644 --- a/src/util/copybytes.h +++ b/src/util/copybytes.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,7 +33,7 @@ #include "simd_utils.h" static really_inline -void copy_upto_32_bytes(u8 *dst, const u8 *src, unsigned int len) { +void copy_upto_64_bytes(u8 *dst, const u8 *src, unsigned int len) { switch (len) { case 0: break; @@ -72,14 +72,41 @@ void copy_upto_32_bytes(u8 *dst, const u8 *src, unsigned int len) { case 16: storeu128(dst, loadu128(src)); break; + case 17: + case 18: + case 19: + case 20: + case 21: + case 22: + case 23: + case 24: + case 25: + case 26: + case 27: + case 28: + case 29: + case 30: + case 31: + storeu128(dst + len - 16, loadu128(src + len - 16)); + storeu128(dst, loadu128(src)); + break; case 32: storeu256(dst, loadu256(src)); break; +#ifdef HAVE_AVX512 + case 64: + storebytes512(dst, loadu512(src), 64); + break; default: - assert(len < 32); - storeu128(dst + len - 16, loadu128(src + len - 16)); - storeu128(dst, loadu128(src)); + assert(len < 64); + u64a k = (1ULL << len) - 1; + storeu_mask_m512(dst, k, loadu_maskz_m512(k, src)); + break; +#else + default: + assert(0); break; +#endif } } diff --git a/src/util/determinise.h b/src/util/determinise.h index 102a19744..cfccd597f 100644 --- a/src/util/determinise.h +++ b/src/util/determinise.h @@ -88,7 +88,7 @@ bool determinise(Auto &n, std::vector &dstates, size_t state_limit, dstates.reserve(state_limit); 
dstate_ids.emplace(n.dead, DEAD_STATE); - dstates.push_back(ds(alphabet_size)); + dstates.emplace_back(ds(alphabet_size)); std::fill_n(dstates[0].next.begin(), alphabet_size, DEAD_STATE); std::queue> q; @@ -99,7 +99,7 @@ bool determinise(Auto &n, std::vector &dstates, size_t state_limit, q.emplace(init[i], dstates.size()); assert(!contains(dstate_ids, init[i])); dstate_ids.emplace(init[i], dstates.size()); - dstates.push_back(ds(alphabet_size)); + dstates.emplace_back(ds(alphabet_size)); } std::vector succs(alphabet_size, n.dead); @@ -149,7 +149,7 @@ bool determinise(Auto &n, std::vector &dstates, size_t state_limit, } else { succ_id = dstate_ids.size(); dstate_ids.emplace(succs[s], succ_id); - dstates.push_back(ds(alphabet_size)); + dstates.emplace_back(ds(alphabet_size)); dstates.back().daddy = n.unalpha[s] < N_CHARS ? curr_id : 0; q.emplace(succs[s], succ_id); } diff --git a/src/util/dump_charclass.cpp b/src/util/dump_charclass.cpp index d0659a8bd..df308dec0 100644 --- a/src/util/dump_charclass.cpp +++ b/src/util/dump_charclass.cpp @@ -56,11 +56,7 @@ void describeChar(ostream &os, char c, enum cc_output_t out_type) { const string backslash((out_type == CC_OUT_DOT ? 2 : 1), '\\'); -#ifdef _WIN32 - if (c >= 0x21 && c < 0x7F && c != '\\') { -#else if (isgraph(c) && c != '\\') { -#endif if (escaped.find(c) != string::npos) { os << backslash << c; } else if (out_type == CC_OUT_DOT diff --git a/src/util/graph.h b/src/util/graph.h index 660afd029..7f9f9342d 100644 --- a/src/util/graph.h +++ b/src/util/graph.h @@ -157,7 +157,7 @@ find_vertices_in_cycles(const Graph &g) { std::map> comps; for (const auto &e : comp_map) { - comps[e.second].push_back(e.first); + comps[e.second].emplace_back(e.first); } flat_set rv; @@ -170,6 +170,7 @@ find_vertices_in_cycles(const Graph &g) { assert(!comp.empty()); if (comp.size() > 1) { insert(&rv, comp); + continue; } vertex_descriptor v = *comp.begin(); if (hasSelfLoop(v, g)) { diff --git a/src/util/graph_undirected.h b/src/util/graph_undirected.h index 049964ab0..507172847 100644 --- a/src/util/graph_undirected.h +++ b/src/util/graph_undirected.h @@ -70,8 +70,8 @@ class undirected_graph_edge_descriptor using base_vertex_type = typename base_graph_traits::vertex_descriptor; base_edge_type underlying_edge; - const base_graph_type *g; - bool reverse; // if true, reverse vertices in source() and target() + const base_graph_type *g = nullptr; + bool reverse = false; // if true, reverse vertices in source() and target() inline std::pair canonical_edge() const { diff --git a/src/util/insertion_ordered.h b/src/util/insertion_ordered.h index 2067d3507..7121ab2be 100644 --- a/src/util/insertion_ordered.h +++ b/src/util/insertion_ordered.h @@ -163,7 +163,7 @@ class element_store { std::pair insert(const Key &key, const Element &element) { const auto idx = data.size(); if (map.emplace(key, idx).second) { - data.push_back(element); + data.emplace_back(element); return {begin() + idx, true}; } return {end(), false}; diff --git a/src/util/intrinsics.h b/src/util/intrinsics.h index edc4f6efb..64f9e9bad 100644 --- a/src/util/intrinsics.h +++ b/src/util/intrinsics.h @@ -45,6 +45,14 @@ # endif #endif +#if defined(HAVE_C_ARM_NEON_H) +# define USE_ARM_NEON_H +#endif + +#if defined(HAVE_C_PPC64EL_ALTIVEC_H) +# define USE_PPC64EL_ALTIVEC_H +#endif + #ifdef __cplusplus # if defined(HAVE_CXX_INTRIN_H) # define USE_INTRIN_H @@ -59,8 +67,13 @@ #include #elif defined(USE_INTRIN_H) #include -#else -#error no intrinsics file +#elif defined(USE_ARM_NEON_H) +#include +# if defined(HAVE_SVE) 
+# include +# endif +#elif defined(USE_PPC64EL_ALTIVEC_H) +#include #endif #endif // INTRINSICS_H diff --git a/src/util/match.hpp b/src/util/match.hpp new file mode 100644 index 000000000..6567b2129 --- /dev/null +++ b/src/util/match.hpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2023, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef MATCH_HPP +#define MATCH_HPP + +#include "ue2common.h" +#include "util/arch.h" +#include "util/bitutils.h" +#include "util/unaligned.h" + +#include "util/supervector/supervector.hpp" + +template +const u8 *first_non_zero_match(const u8 *buf, SuperVector v, u16 const len = S); + +template +const u8 *last_non_zero_match(const u8 *buf, SuperVector v, u16 const len = S); + +template +const u8 *first_zero_match_inverted(const u8 *buf, SuperVector v, u16 const len = S); + +template +const u8 *last_zero_match_inverted(const u8 *buf, SuperVector v, u16 len = S); + +#if defined(VS_SIMDE_BACKEND) +#include "util/arch/x86/match.hpp" +#else +#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#include "util/arch/x86/match.hpp" +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#include "util/arch/arm/match.hpp" +#elif defined(ARCH_PPC64EL) +#include "util/arch/ppc64el/match.hpp" +#endif +#endif + +#endif // MATCH_HPP diff --git a/src/util/multibit.h b/src/util/multibit.h index c3a4ba461..95261b371 100644 --- a/src/util/multibit.h +++ b/src/util/multibit.h @@ -1197,11 +1197,7 @@ u32 mmbit_sparse_iter_begin(const u8 *bits, u32 total_bits, u32 *idx, assert(ISALIGNED_N(it_root, alignof(struct mmbit_sparse_iter))); // Our state _may_ be on the stack -#ifndef _WIN32 assert(ISALIGNED_N(s, alignof(struct mmbit_sparse_state))); -#else - assert(ISALIGNED_N(s, 4)); -#endif MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits); // iterator should have _something_ at the root level @@ -1309,11 +1305,7 @@ u32 mmbit_sparse_iter_next(const u8 *bits, u32 total_bits, u32 last_key, assert(ISALIGNED_N(it_root, alignof(struct mmbit_sparse_iter))); // Our state _may_ be on the stack -#ifndef _WIN32 assert(ISALIGNED_N(s, alignof(struct mmbit_sparse_state))); -#else - assert(ISALIGNED_N(s, 4)); -#endif MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits); MDEBUG_PRINTF("NEXT (total_bits=%u, last_key=%u)\n", total_bits, last_key); @@ -1466,11 +1458,7 @@ void mmbit_sparse_iter_unset(u8 *bits, u32 total_bits, assert(ISALIGNED_N(it, alignof(struct mmbit_sparse_iter))); // Our state _may_ be on the stack -#ifndef _WIN32 assert(ISALIGNED_N(s, alignof(struct mmbit_sparse_state))); -#else - assert(ISALIGNED_N(s, 4)); -#endif MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits); diff --git a/src/util/multibit_build.cpp b/src/util/multibit_build.cpp index 67bb9ec70..442c528f7 100644 --- a/src/util/multibit_build.cpp +++ b/src/util/multibit_build.cpp @@ -112,13 +112,13 @@ void bfs(vector &out, const TreeNode &tree) { if (depth != t->depth) { depth = t->depth; - levels.push_back(out.size()); + levels.emplace_back(out.size()); } DEBUG_PRINTF("pop: mask=0x%08llx, depth=%u, children.size()=%zu\n", t->mask, t->depth, t->children.size()); - out.push_back(mmbit_sparse_iter()); + out.emplace_back(mmbit_sparse_iter()); memset(&out.back(), 0, sizeof(mmbit_sparse_iter)); mmbit_sparse_iter &record = out.back(); record.mask = t->mask; diff --git a/src/util/partitioned_set.h b/src/util/partitioned_set.h index 8a4d3dd9e..f69712639 100644 --- a/src/util/partitioned_set.h +++ b/src/util/partitioned_set.h @@ -139,9 +139,9 @@ class partitioned_set : noncopyable { } if (*sp_it > member) { - split_temp_diff.push_back(member); + split_temp_diff.emplace_back(member); } else { - split_temp_inter.push_back(member); + split_temp_inter.emplace_back(member); } } @@ -177,7 +177,7 @@ class partitioned_set : noncopyable { /* smaller subset is placed in the new subset */ size_t new_index = subsets.size(); - subsets.push_back(subset()); + 
subsets.emplace_back(subset()); insert(&subsets.back().members, subsets.back().members.end(), *small); for (const auto &e : *small) { @@ -203,7 +203,7 @@ class partitioned_set : noncopyable { for (size_t i = seen.find_first(); i != seen.npos; i = seen.find_next(i)) { - containing->push_back(i); + containing->emplace_back(i); } } @@ -240,7 +240,7 @@ class partitioned_set : noncopyable { assert(sub < subsets.size()); member_to_subset[i] = sub; - subsets[sub].members.push_back(i); + subsets[sub].members.emplace_back(i); } /* none of the subsets should be empty */ diff --git a/src/util/popcount.h b/src/util/popcount.h index eb08f6b1b..d90a0d50d 100644 --- a/src/util/popcount.h +++ b/src/util/popcount.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,36 +39,56 @@ static really_inline u32 popcount32(u32 x) { -#if defined(HAVE_POPCOUNT_INSTR) - // Single-instruction builtin. - return _mm_popcnt_u32(x); -#else - // Fast branch-free version from bit-twiddling hacks as older Intel - // processors do not have a POPCNT instruction. - x -= (x >> 1) & 0x55555555; - x = (x & 0x33333333) + ((x >> 2) & 0x33333333); - return (((x + (x >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24; -#endif + return __builtin_popcount(x); +// #if defined(HAVE_POPCOUNT_INSTR) +// // Single-instruction builtin. +// return _mm_popcnt_u32(x); +// #else +// // Fast branch-free version from bit-twiddling hacks as older Intel +// // processors do not have a POPCNT instruction. +// x -= (x >> 1) & 0x55555555; +// x = (x & 0x33333333) + ((x >> 2) & 0x33333333); +// return (((x + (x >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24; +// #endif +} + +static really_inline +u32 popcount32x4(u32 const *x) { + u32 sum = popcount32(x[0]); + sum += popcount32(x[1]); + sum += popcount32(x[2]); + sum += popcount32(x[3]); + return sum; } static really_inline u32 popcount64(u64a x) { -#if defined(ARCH_X86_64) -# if defined(HAVE_POPCOUNT_INSTR) - // Single-instruction builtin. - return (u32)_mm_popcnt_u64(x); -# else - // Fast branch-free version from bit-twiddling hacks as older Intel - // processors do not have a POPCNT instruction. - x -= (x >> 1) & 0x5555555555555555; - x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); - x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; - return (x * 0x0101010101010101) >> 56; -# endif -#else - // Synthesise from two 32-bit cases. - return popcount32(x >> 32) + popcount32(x); -#endif + return __builtin_popcountll(x); +// #if defined(ARCH_X86_64) +// # if defined(HAVE_POPCOUNT_INSTR) +// // Single-instruction builtin. +// return (u32)_mm_popcnt_u64(x); +// # else +// // Fast branch-free version from bit-twiddling hacks as older Intel +// // processors do not have a POPCNT instruction. +// x -= (x >> 1) & 0x5555555555555555; +// x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); +// x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; +// return (x * 0x0101010101010101) >> 56; +// # endif +// #else +// // Synthesise from two 32-bit cases. 
+// return popcount32(x >> 32) + popcount32(x); +// #endif +} + +static really_inline +u32 popcount64x4(u64a const *x) { + volatile u32 sum = popcount64(x[0]); + sum += popcount64(x[1]); + sum += popcount64(x[2]); + sum += popcount64(x[3]); + return sum; } #endif /* UTIL_POPCOUNT_H_ */ diff --git a/src/util/report_manager.cpp b/src/util/report_manager.cpp index 78b9b73df..3ea712170 100644 --- a/src/util/report_manager.cpp +++ b/src/util/report_manager.cpp @@ -66,7 +66,7 @@ u32 ReportManager::getInternalId(const Report &ir) { } u32 size = reportIds.size(); - reportIds.push_back(ir); + reportIds.emplace_back(ir); reportIdToInternalMap.emplace(ir, size); DEBUG_PRINTF("new report %u\n", size); return size; diff --git a/src/util/simd_types.h b/src/util/simd_types.h index 962cad6c9..e393d081a 100644 --- a/src/util/simd_types.h +++ b/src/util/simd_types.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2023, VectorCamp PC * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,22 +35,31 @@ #include "util/intrinsics.h" #include "ue2common.h" -#if defined(HAVE_SSE2) -typedef __m128i m128; -#else -typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128; +#if defined(VS_SIMDE_BACKEND) +#define VECTORSIZE 16 +#define SIMDE_ENABLE_NATIVE_ALIASES +#if !defined(VS_SIMDE_NATIVE) +#define SIMDE_NO_NATIVE #endif +#include +typedef simde__m128i m128; +#define HAVE_SIMD_128_BITS +#elif defined(ARCH_IA32) || defined(ARCH_X86_64) +#include "util/arch/x86/simd_types.h" +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#include "util/arch/arm/simd_types.h" +#elif defined(ARCH_PPC64EL) +#include "util/arch/ppc64el/simd_types.h" +#endif + -#if defined(HAVE_AVX2) -typedef __m256i m256; -#else +#if !defined(m256) && !defined(HAVE_SIMD_256_BITS) typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256; #endif typedef struct {m128 lo; m128 mid; m128 hi;} m384; -#if defined(HAVE_AVX512) -typedef __m512i m512; -#else + +#if !defined(m512) && !defined(HAVE_SIMD_512_BITS) typedef struct ALIGN_ATTR(64) {m256 lo; m256 hi;} m512; #endif diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 42223133d..01c309b1b 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2023, VectorCamp PC * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,37 +31,27 @@ * \brief SIMD types and primitive operations. */ -#ifndef SIMD_UTILS -#define SIMD_UTILS - -#if !defined(_WIN32) && !defined(__SSSE3__) -#error SSSE3 instructions must be enabled -#endif +#ifndef SIMD_UTILS_H +#define SIMD_UTILS_H #include "config.h" -#include "ue2common.h" -#include "simd_types.h" -#include "unaligned.h" #include "util/arch.h" -#include "util/intrinsics.h" - -#include // for memcpy // Define a common assume_aligned using an appropriate compiler built-in, if // it's available. Note that we need to handle C or C++ compilation. 
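The popcount32()/popcount64() helpers above now defer to the GCC/Clang builtins, which lower to POPCNT (or the NEON/VSX equivalent) when the target has one and to a software sequence otherwise. A standalone sanity check of the equivalence with the retired bit-twiddling fallback; this is illustration only, not part of the patch:

```cpp
#include <cassert>
#include <cstdint>
#include <initializer_list>

// The classic SWAR popcount the old fallback used (kept here only as a
// reference to check against).
static uint32_t popcount64_swar(uint64_t x) {
    x -= (x >> 1) & 0x5555555555555555ULL;
    x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
    x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
    return (uint32_t)((x * 0x0101010101010101ULL) >> 56);
}

int main() {
    for (uint64_t v : {0ULL, 1ULL, 0x80ULL, 0xdeadbeefULL, ~0ULL}) {
        // popcount64() now simply returns __builtin_popcountll(v).
        assert((uint32_t)__builtin_popcountll(v) == popcount64_swar(v));
    }
    return 0;
}
```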
#ifdef __cplusplus # ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED -# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) +# define vectorscan_assume_aligned(x, y) __builtin_assume_aligned((x), (y)) # endif #else # ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED -# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) +# define vectorscan_assume_aligned(x, y) __builtin_assume_aligned((x), (y)) # endif #endif // Fallback to identity case. -#ifndef assume_aligned -#define assume_aligned(x, y) (x) +#ifndef vectorscan_assume_aligned +#define vectorscan_assume_aligned(x, y) (x) #endif #ifdef __cplusplus @@ -71,1269 +62,18 @@ extern const char vbs_mask_data[]; } #endif -static really_inline m128 ones128(void) { -#if defined(__GNUC__) || defined(__INTEL_COMPILER) - /* gcc gets this right */ - return _mm_set1_epi8(0xFF); -#else - /* trick from Intel's optimization guide to generate all-ones. - * ICC converts this to the single cmpeq instruction */ - return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); -#endif -} - -static really_inline m128 zeroes128(void) { - return _mm_setzero_si128(); -} - -/** \brief Bitwise not for m128*/ -static really_inline m128 not128(m128 a) { - return _mm_xor_si128(a, ones128()); -} - -/** \brief Return 1 if a and b are different otherwise 0 */ -static really_inline int diff128(m128 a, m128 b) { - return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff); -} - -static really_inline int isnonzero128(m128 a) { - return !!diff128(a, zeroes128()); -} - -/** - * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit - * mask indicating which 32-bit words contain differences. - */ -static really_inline u32 diffrich128(m128 a, m128 b) { - a = _mm_cmpeq_epi32(a, b); - return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf; -} - -/** - * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and - * returns a 4-bit mask indicating which 64-bit words contain differences. - */ -static really_inline u32 diffrich64_128(m128 a, m128 b) { -#if defined(HAVE_SSE41) - a = _mm_cmpeq_epi64(a, b); - return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5; -#else - u32 d = diffrich128(a, b); - return (d | (d >> 1)) & 0x5; -#endif -} - -static really_really_inline -m128 lshift64_m128(m128 a, unsigned b) { -#if defined(HAVE__BUILTIN_CONSTANT_P) - if (__builtin_constant_p(b)) { - return _mm_slli_epi64(a, b); - } -#endif - m128 x = _mm_cvtsi32_si128(b); - return _mm_sll_epi64(a, x); -} - -#define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) -#define eq128(a, b) _mm_cmpeq_epi8((a), (b)) -#define movemask128(a) ((u32)_mm_movemask_epi8((a))) - -static really_inline m128 set16x8(u8 c) { - return _mm_set1_epi8(c); -} - -static really_inline m128 set4x32(u32 c) { - return _mm_set1_epi32(c); -} - -static really_inline u32 movd(const m128 in) { - return _mm_cvtsi128_si32(in); -} - -#if defined(HAVE_AVX512) -static really_inline u32 movd512(const m512 in) { - // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in), - // so we use 2-step convertions to work around. 
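Renaming assume_aligned to vectorscan_assume_aligned above is more than style: a function-like macro with the old, unprefixed name can get expanded inside standard-library headers that declare C++20 std::assume_aligned, breaking the build. A minimal sketch of how a call site uses the renamed hint (hypothetical helper, not code from the patch):

```cpp
#include <cstdint>

// Sketch of the renamed hint as call sites see it: __builtin_assume_aligned
// where available (GCC/Clang), otherwise the identity fallback.
#if defined(__GNUC__) || defined(__clang__)
# define vectorscan_assume_aligned(x, y) __builtin_assume_aligned((x), (y))
#else
# define vectorscan_assume_aligned(x, y) (x)
#endif

static inline uint64_t sum_two_lanes(const uint64_t *p) {
    // Promise the optimizer that p is 16-byte aligned; with the identity
    // fallback this is a no-op, so the code stays correct either way.
    const uint64_t *ap = (const uint64_t *)vectorscan_assume_aligned(p, 16);
    return ap[0] + ap[1];
}
```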
- return _mm_cvtsi128_si32(_mm512_castsi512_si128(in)); -} -#endif - -static really_inline u64a movq(const m128 in) { -#if defined(ARCH_X86_64) - return _mm_cvtsi128_si64(in); -#else // 32-bit - this is horrific - u32 lo = movd(in); - u32 hi = movd(_mm_srli_epi64(in, 32)); - return (u64a)hi << 32 | lo; -#endif -} - -/* another form of movq */ -static really_inline -m128 load_m128_from_u64a(const u64a *p) { - return _mm_set_epi64x(0LL, *p); -} - -#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) -#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) - -#if defined(HAVE_SSE41) -#define extract32from128(a, imm) _mm_extract_epi32(a, imm) -#define extract64from128(a, imm) _mm_extract_epi64(a, imm) -#else -#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2)) -#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3)) -#endif - -#if !defined(HAVE_AVX2) -// TODO: this entire file needs restructuring - this carveout is awful -#define extractlow64from256(a) movq(a.lo) -#define extractlow32from256(a) movd(a.lo) -#if defined(HAVE_SSE41) -#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4) -#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2) -#else -#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4)) -#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? a.hi : a.lo, (imm % 2) * 8)) -#endif - -#endif // !AVX2 - -static really_inline m128 and128(m128 a, m128 b) { - return _mm_and_si128(a,b); -} - -static really_inline m128 xor128(m128 a, m128 b) { - return _mm_xor_si128(a,b); -} - -static really_inline m128 or128(m128 a, m128 b) { - return _mm_or_si128(a,b); -} - -static really_inline m128 andnot128(m128 a, m128 b) { - return _mm_andnot_si128(a, b); -} - -// aligned load -static really_inline m128 load128(const void *ptr) { - assert(ISALIGNED_N(ptr, alignof(m128))); - ptr = assume_aligned(ptr, 16); - return _mm_load_si128((const m128 *)ptr); -} - -// aligned store -static really_inline void store128(void *ptr, m128 a) { - assert(ISALIGNED_N(ptr, alignof(m128))); - ptr = assume_aligned(ptr, 16); - *(m128 *)ptr = a; -} - -// unaligned load -static really_inline m128 loadu128(const void *ptr) { - return _mm_loadu_si128((const m128 *)ptr); -} - -// unaligned store -static really_inline void storeu128(void *ptr, m128 a) { - _mm_storeu_si128 ((m128 *)ptr, a); -} - -// packed unaligned store of first N bytes -static really_inline -void storebytes128(void *ptr, m128 a, unsigned int n) { - assert(n <= sizeof(a)); - memcpy(ptr, &a, n); -} - -// packed unaligned load of first N bytes, pad with zero -static really_inline -m128 loadbytes128(const void *ptr, unsigned int n) { - m128 a = zeroes128(); - assert(n <= sizeof(a)); - memcpy(&a, ptr, n); - return a; -} - -#ifdef __cplusplus -extern "C" { -#endif -extern const u8 simd_onebit_masks[]; -#ifdef __cplusplus -} -#endif - -static really_inline -m128 mask1bit128(unsigned int n) { - assert(n < sizeof(m128) * 8); - u32 mask_idx = ((n % 8) * 64) + 95; - mask_idx -= n / 8; - return loadu128(&simd_onebit_masks[mask_idx]); -} - -// switches on bit N in the given vector. -static really_inline -void setbit128(m128 *ptr, unsigned int n) { - *ptr = or128(mask1bit128(n), *ptr); -} - -// switches off bit N in the given vector. -static really_inline -void clearbit128(m128 *ptr, unsigned int n) { - *ptr = andnot128(mask1bit128(n), *ptr); -} - -// tests bit N in the given vector. 
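The storebytes128()/loadbytes128() helpers shown above rely on a simple but important idiom: partial vector loads and stores go through memcpy into a zeroed local, so the code never reads or writes past the n valid bytes. A scalar model of loadbytes128(), assuming nothing beyond standard C++:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

// Scalar model of loadbytes128(): read only n valid bytes, zero-pad the rest.
// memcpy keeps the access in-bounds, which matters at the end of a buffer.
struct bytes16 { uint8_t b[16]; };

static bytes16 loadbytes(const void *ptr, unsigned n) {
    bytes16 v;
    memset(&v, 0, sizeof(v));     // the zeroes128() step
    assert(n <= sizeof(v));
    memcpy(&v, ptr, n);           // compilers turn this into unaligned loads
    return v;
}

int main() {
    const char buf[5] = {1, 2, 3, 4, 5};
    bytes16 v = loadbytes(buf, sizeof(buf));
    assert(v.b[4] == 5 && v.b[5] == 0);   // valid bytes copied, tail stays zero
    return 0;
}
```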
-static really_inline -char testbit128(m128 val, unsigned int n) { - const m128 mask = mask1bit128(n); -#if defined(HAVE_SSE41) - return !_mm_testz_si128(mask, val); -#else - return isnonzero128(and128(mask, val)); -#endif -} - -// offset must be an immediate -#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) - -static really_inline -m128 pshufb_m128(m128 a, m128 b) { - m128 result; - result = _mm_shuffle_epi8(a, b); - return result; -} - -static really_inline -m256 pshufb_m256(m256 a, m256 b) { -#if defined(HAVE_AVX2) - return _mm256_shuffle_epi8(a, b); -#else - m256 rv; - rv.lo = pshufb_m128(a.lo, b.lo); - rv.hi = pshufb_m128(a.hi, b.hi); - return rv; -#endif -} - -#if defined(HAVE_AVX512) -static really_inline -m512 pshufb_m512(m512 a, m512 b) { - return _mm512_shuffle_epi8(a, b); -} - -static really_inline -m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { - return _mm512_maskz_shuffle_epi8(k, a, b); -} - -#if defined(HAVE_AVX512VBMI) -#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a) -#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a) -#endif - -#endif - -static really_inline -m128 variable_byte_shift_m128(m128 in, s32 amount) { - assert(amount >= -16 && amount <= 16); - m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); - return pshufb_m128(in, shift_mask); -} - -static really_inline -m128 max_u8_m128(m128 a, m128 b) { - return _mm_max_epu8(a, b); -} - -static really_inline -m128 min_u8_m128(m128 a, m128 b) { - return _mm_min_epu8(a, b); -} - -static really_inline -m128 sadd_u8_m128(m128 a, m128 b) { - return _mm_adds_epu8(a, b); -} - -static really_inline -m128 sub_u8_m128(m128 a, m128 b) { - return _mm_sub_epi8(a, b); -} - -static really_inline -m128 set64x2(u64a hi, u64a lo) { - return _mm_set_epi64x(hi, lo); -} - -/**** - **** 256-bit Primitives - ****/ - -#if defined(HAVE_AVX2) - -static really_really_inline -m256 lshift64_m256(m256 a, unsigned b) { -#if defined(HAVE__BUILTIN_CONSTANT_P) - if (__builtin_constant_p(b)) { - return _mm256_slli_epi64(a, b); - } -#endif - m128 x = _mm_cvtsi32_si128(b); - return _mm256_sll_epi64(a, x); -} - -#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b)) - -static really_inline -m256 set32x8(u32 in) { - return _mm256_set1_epi8(in); -} - -#define eq256(a, b) _mm256_cmpeq_epi8((a), (b)) -#define movemask256(a) ((u32)_mm256_movemask_epi8((a))) - -static really_inline -m256 set2x128(m128 a) { - return _mm256_broadcastsi128_si256(a); -} - -#else - -static really_really_inline -m256 lshift64_m256(m256 a, int b) { - m256 rv = a; - rv.lo = lshift64_m128(rv.lo, b); - rv.hi = lshift64_m128(rv.hi, b); - return rv; -} - -static really_inline -m256 rshift64_m256(m256 a, int b) { - m256 rv = a; - rv.lo = rshift64_m128(rv.lo, b); - rv.hi = rshift64_m128(rv.hi, b); - return rv; -} -static really_inline -m256 set32x8(u32 in) { - m256 rv; - rv.lo = set16x8((u8) in); - rv.hi = rv.lo; - return rv; -} - -static really_inline -m256 eq256(m256 a, m256 b) { - m256 rv; - rv.lo = eq128(a.lo, b.lo); - rv.hi = eq128(a.hi, b.hi); - return rv; -} - -static really_inline -u32 movemask256(m256 a) { - u32 lo_mask = movemask128(a.lo); - u32 hi_mask = movemask128(a.hi); - return lo_mask | (hi_mask << 16); -} - -static really_inline -m256 set2x128(m128 a) { - m256 rv = {a, a}; - return rv; -} -#endif - -static really_inline m256 zeroes256(void) { -#if defined(HAVE_AVX2) - return _mm256_setzero_si256(); -#else - m256 rv = {zeroes128(), zeroes128()}; - return rv; -#endif -} - -static really_inline m256 ones256(void) { -#if 
defined(HAVE_AVX2) - m256 rv = _mm256_set1_epi8(0xFF); -#else - m256 rv = {ones128(), ones128()}; -#endif - return rv; -} - -#if defined(HAVE_AVX2) -static really_inline m256 and256(m256 a, m256 b) { - return _mm256_and_si256(a, b); -} -#else -static really_inline m256 and256(m256 a, m256 b) { - m256 rv; - rv.lo = and128(a.lo, b.lo); - rv.hi = and128(a.hi, b.hi); - return rv; -} -#endif - -#if defined(HAVE_AVX2) -static really_inline m256 or256(m256 a, m256 b) { - return _mm256_or_si256(a, b); -} -#else -static really_inline m256 or256(m256 a, m256 b) { - m256 rv; - rv.lo = or128(a.lo, b.lo); - rv.hi = or128(a.hi, b.hi); - return rv; -} -#endif - -#if defined(HAVE_AVX2) -static really_inline m256 xor256(m256 a, m256 b) { - return _mm256_xor_si256(a, b); -} -#else -static really_inline m256 xor256(m256 a, m256 b) { - m256 rv; - rv.lo = xor128(a.lo, b.lo); - rv.hi = xor128(a.hi, b.hi); - return rv; -} -#endif - -#if defined(HAVE_AVX2) -static really_inline m256 not256(m256 a) { - return _mm256_xor_si256(a, ones256()); -} -#else -static really_inline m256 not256(m256 a) { - m256 rv; - rv.lo = not128(a.lo); - rv.hi = not128(a.hi); - return rv; -} -#endif - -#if defined(HAVE_AVX2) -static really_inline m256 andnot256(m256 a, m256 b) { - return _mm256_andnot_si256(a, b); -} -#else -static really_inline m256 andnot256(m256 a, m256 b) { - m256 rv; - rv.lo = andnot128(a.lo, b.lo); - rv.hi = andnot128(a.hi, b.hi); - return rv; -} -#endif - -static really_inline int diff256(m256 a, m256 b) { -#if defined(HAVE_AVX2) - return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1); -#else - return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); -#endif -} - -static really_inline int isnonzero256(m256 a) { -#if defined(HAVE_AVX2) - return !!diff256(a, zeroes256()); -#else - return isnonzero128(or128(a.lo, a.hi)); -#endif -} - -/** - * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit - * mask indicating which 32-bit words contain differences. - */ -static really_inline u32 diffrich256(m256 a, m256 b) { -#if defined(HAVE_AVX2) - a = _mm256_cmpeq_epi32(a, b); - return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF; -#else - m128 z = zeroes128(); - a.lo = _mm_cmpeq_epi32(a.lo, b.lo); - a.hi = _mm_cmpeq_epi32(a.hi, b.hi); - m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z); - return ~(_mm_movemask_epi8(packed)) & 0xff; -#endif -} - -/** - * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and - * returns an 8-bit mask indicating which 64-bit words contain differences. 
- */ -static really_inline u32 diffrich64_256(m256 a, m256 b) { - u32 d = diffrich256(a, b); - return (d | (d >> 1)) & 0x55555555; -} - -// aligned load -static really_inline m256 load256(const void *ptr) { - assert(ISALIGNED_N(ptr, alignof(m256))); -#if defined(HAVE_AVX2) - return _mm256_load_si256((const m256 *)ptr); -#else - m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; - return rv; -#endif -} - -// aligned load of 128-bit value to low and high part of 256-bit value -static really_inline m256 load2x128(const void *ptr) { -#if defined(HAVE_AVX2) - return set2x128(load128(ptr)); -#else - assert(ISALIGNED_N(ptr, alignof(m128))); - m256 rv; - rv.hi = rv.lo = load128(ptr); - return rv; -#endif -} - -static really_inline m256 loadu2x128(const void *ptr) { - return set2x128(loadu128(ptr)); -} - -// aligned store -static really_inline void store256(void *ptr, m256 a) { - assert(ISALIGNED_N(ptr, alignof(m256))); -#if defined(HAVE_AVX2) - _mm256_store_si256((m256 *)ptr, a); -#else - ptr = assume_aligned(ptr, 16); - *(m256 *)ptr = a; -#endif -} - -// unaligned load -static really_inline m256 loadu256(const void *ptr) { -#if defined(HAVE_AVX2) - return _mm256_loadu_si256((const m256 *)ptr); +#if defined(VS_SIMDE_BACKEND) +#include "util/arch/x86/simd_utils.h" #else - m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) }; - return rv; +#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#include "util/arch/x86/simd_utils.h" +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#include "util/arch/arm/simd_utils.h" +#elif defined(ARCH_PPC64EL) +#include "util/arch/ppc64el/simd_utils.h" #endif -} - -// unaligned store -static really_inline void storeu256(void *ptr, m256 a) { -#if defined(HAVE_AVX2) - _mm256_storeu_si256((m256 *)ptr, a); -#else - storeu128(ptr, a.lo); - storeu128((char *)ptr + 16, a.hi); -#endif -} - -// packed unaligned store of first N bytes -static really_inline -void storebytes256(void *ptr, m256 a, unsigned int n) { - assert(n <= sizeof(a)); - memcpy(ptr, &a, n); -} - -// packed unaligned load of first N bytes, pad with zero -static really_inline -m256 loadbytes256(const void *ptr, unsigned int n) { - m256 a = zeroes256(); - assert(n <= sizeof(a)); - memcpy(&a, ptr, n); - return a; -} - -static really_inline -m256 mask1bit256(unsigned int n) { - assert(n < sizeof(m256) * 8); - u32 mask_idx = ((n % 8) * 64) + 95; - mask_idx -= n / 8; - return loadu256(&simd_onebit_masks[mask_idx]); -} - -static really_inline -m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { -#if defined(HAVE_AVX2) - return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0); -#else - m256 rv; - rv.hi = set64x2(hi_1, hi_0); - rv.lo = set64x2(lo_1, lo_0); - return rv; -#endif -} - -#if !defined(HAVE_AVX2) -// switches on bit N in the given vector. -static really_inline -void setbit256(m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 128; - } - setbit128(sub, n); -} - -// switches off bit N in the given vector. -static really_inline -void clearbit256(m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 128; - } - clearbit128(sub, n); -} - -// tests bit N in the given vector. 
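The (d | (d >> 1)) & 0x5... folding used by the diffrich64_* fallbacks in this header is worth spelling out: diffrich*() reports one bit per differing 32-bit word, each 64-bit word owns two adjacent bits of that mask, and OR-ing the mask with itself shifted right by one collapses "either half differs" into the even bit of each pair. A self-contained check, for illustration only:

```cpp
#include <cassert>
#include <cstdint>

// Fold a per-32-bit-word difference mask into a per-64-bit-word mask, leaving
// the result in the even bit of each two-bit pair, as the fallbacks do.
static uint32_t fold_to_64bit_lanes(uint32_t d, uint32_t keep) {
    return (d | (d >> 1)) & keep;
}

int main() {
    // 128-bit vector = four 32-bit words = two 64-bit lanes; keep mask 0b0101.
    // Only word 3 (the high half of 64-bit lane 1) differs: d = 0b1000.
    assert(fold_to_64bit_lanes(0b1000, 0x5) == 0b0100);  // lane 1 flagged
    // Both halves of lane 0 differ: still just lane 0's bit.
    assert(fold_to_64bit_lanes(0b0011, 0x5) == 0b0001);
    // No differences anywhere.
    assert(fold_to_64bit_lanes(0, 0x5) == 0);
    return 0;
}
```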
-static really_inline -char testbit256(m256 val, unsigned int n) { - assert(n < sizeof(val) * 8); - m128 sub; - if (n < 128) { - sub = val.lo; - } else { - sub = val.hi; - n -= 128; - } - return testbit128(sub, n); -} - -static really_really_inline -m128 movdq_hi(m256 x) { - return x.hi; -} - -static really_really_inline -m128 movdq_lo(m256 x) { - return x.lo; -} - -static really_inline -m256 combine2x128(m128 hi, m128 lo) { - m256 rv = {lo, hi}; - return rv; -} - -#else // AVX2 - -// switches on bit N in the given vector. -static really_inline -void setbit256(m256 *ptr, unsigned int n) { - *ptr = or256(mask1bit256(n), *ptr); -} - -static really_inline -void clearbit256(m256 *ptr, unsigned int n) { - *ptr = andnot256(mask1bit256(n), *ptr); -} - -// tests bit N in the given vector. -static really_inline -char testbit256(m256 val, unsigned int n) { - const m256 mask = mask1bit256(n); - return !_mm256_testz_si256(mask, val); -} - -static really_really_inline -m128 movdq_hi(m256 x) { - return _mm256_extracti128_si256(x, 1); -} - -static really_really_inline -m128 movdq_lo(m256 x) { - return _mm256_extracti128_si256(x, 0); -} - -#define cast256to128(a) _mm256_castsi256_si128(a) -#define cast128to256(a) _mm256_castsi128_si256(a) -#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) -#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) -#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) -#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) -#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) -#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) -#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a)) -#define extractlow32from256(a) movd(cast256to128(a)) -#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b) -#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b) -#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) - -static really_inline -m256 combine2x128(m128 hi, m128 lo) { -#if defined(_mm256_set_m128i) - return _mm256_set_m128i(hi, lo); -#else - return insert128to256(cast128to256(lo), hi, 1); -#endif -} -#endif //AVX2 - -#if defined(HAVE_AVX512) -#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) -#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) -#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) -#define set2x256(a) _mm512_broadcast_i64x4(a) -#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) -#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) #endif -/**** - **** 384-bit Primitives - ****/ - -static really_inline m384 and384(m384 a, m384 b) { - m384 rv; - rv.lo = and128(a.lo, b.lo); - rv.mid = and128(a.mid, b.mid); - rv.hi = and128(a.hi, b.hi); - return rv; -} - -static really_inline m384 or384(m384 a, m384 b) { - m384 rv; - rv.lo = or128(a.lo, b.lo); - rv.mid = or128(a.mid, b.mid); - rv.hi = or128(a.hi, b.hi); - return rv; -} - -static really_inline m384 xor384(m384 a, m384 b) { - m384 rv; - rv.lo = xor128(a.lo, b.lo); - rv.mid = xor128(a.mid, b.mid); - rv.hi = xor128(a.hi, b.hi); - return rv; -} -static really_inline m384 not384(m384 a) { - m384 rv; - rv.lo = not128(a.lo); - rv.mid = not128(a.mid); - rv.hi = not128(a.hi); - return rv; -} -static really_inline m384 andnot384(m384 a, m384 b) { - m384 rv; - rv.lo = andnot128(a.lo, b.lo); - rv.mid = andnot128(a.mid, b.mid); - rv.hi = andnot128(a.hi, b.hi); - return rv; -} - -static really_really_inline 
-m384 lshift64_m384(m384 a, unsigned b) { - m384 rv; - rv.lo = lshift64_m128(a.lo, b); - rv.mid = lshift64_m128(a.mid, b); - rv.hi = lshift64_m128(a.hi, b); - return rv; -} - -static really_inline m384 zeroes384(void) { - m384 rv = {zeroes128(), zeroes128(), zeroes128()}; - return rv; -} - -static really_inline m384 ones384(void) { - m384 rv = {ones128(), ones128(), ones128()}; - return rv; -} - -static really_inline int diff384(m384 a, m384 b) { - return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi); -} - -static really_inline int isnonzero384(m384 a) { - return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); -} - -/** - * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit - * mask indicating which 32-bit words contain differences. - */ -static really_inline u32 diffrich384(m384 a, m384 b) { - m128 z = zeroes128(); - a.lo = _mm_cmpeq_epi32(a.lo, b.lo); - a.mid = _mm_cmpeq_epi32(a.mid, b.mid); - a.hi = _mm_cmpeq_epi32(a.hi, b.hi); - m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid), - _mm_packs_epi32(a.hi, z)); - return ~(_mm_movemask_epi8(packed)) & 0xfff; -} - -/** - * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and - * returns a 12-bit mask indicating which 64-bit words contain differences. - */ -static really_inline u32 diffrich64_384(m384 a, m384 b) { - u32 d = diffrich384(a, b); - return (d | (d >> 1)) & 0x55555555; -} +#include "util/arch/common/simd_utils.h" -// aligned load -static really_inline m384 load384(const void *ptr) { - assert(ISALIGNED_16(ptr)); - m384 rv = { load128(ptr), load128((const char *)ptr + 16), - load128((const char *)ptr + 32) }; - return rv; -} - -// aligned store -static really_inline void store384(void *ptr, m384 a) { - assert(ISALIGNED_16(ptr)); - ptr = assume_aligned(ptr, 16); - *(m384 *)ptr = a; -} - -// unaligned load -static really_inline m384 loadu384(const void *ptr) { - m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16), - loadu128((const char *)ptr + 32)}; - return rv; -} - -// packed unaligned store of first N bytes -static really_inline -void storebytes384(void *ptr, m384 a, unsigned int n) { - assert(n <= sizeof(a)); - memcpy(ptr, &a, n); -} - -// packed unaligned load of first N bytes, pad with zero -static really_inline -m384 loadbytes384(const void *ptr, unsigned int n) { - m384 a = zeroes384(); - assert(n <= sizeof(a)); - memcpy(&a, ptr, n); - return a; -} - -// switches on bit N in the given vector. -static really_inline -void setbit384(m384 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else if (n < 256) { - sub = &ptr->mid; - } else { - sub = &ptr->hi; - } - setbit128(sub, n % 128); -} - -// switches off bit N in the given vector. -static really_inline -void clearbit384(m384 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else if (n < 256) { - sub = &ptr->mid; - } else { - sub = &ptr->hi; - } - clearbit128(sub, n % 128); -} - -// tests bit N in the given vector. 
-static really_inline -char testbit384(m384 val, unsigned int n) { - assert(n < sizeof(val) * 8); - m128 sub; - if (n < 128) { - sub = val.lo; - } else if (n < 256) { - sub = val.mid; - } else { - sub = val.hi; - } - return testbit128(sub, n % 128); -} - -/**** - **** 512-bit Primitives - ****/ - -#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) -#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) - -static really_inline -m512 zeroes512(void) { -#if defined(HAVE_AVX512) - return _mm512_setzero_si512(); -#else - m512 rv = {zeroes256(), zeroes256()}; - return rv; -#endif -} - -static really_inline -m512 ones512(void) { -#if defined(HAVE_AVX512) - return _mm512_set1_epi8(0xFF); - //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512()); -#else - m512 rv = {ones256(), ones256()}; - return rv; -#endif -} - -#if defined(HAVE_AVX512) -static really_inline -m512 set64x8(u8 a) { - return _mm512_set1_epi8(a); -} - -static really_inline -m512 set8x64(u64a a) { - return _mm512_set1_epi64(a); -} - -static really_inline -m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, - u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { - return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0, - lo_3, lo_2, lo_1, lo_0); -} - -static really_inline -m512 swap256in512(m512 a) { - m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); - return vpermq512(idx, a); -} - -static really_inline -m512 set4x128(m128 a) { - return _mm512_broadcast_i32x4(a); -} -#endif - -static really_inline -m512 and512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return _mm512_and_si512(a, b); -#else - m512 rv; - rv.lo = and256(a.lo, b.lo); - rv.hi = and256(a.hi, b.hi); - return rv; -#endif -} - -static really_inline -m512 or512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return _mm512_or_si512(a, b); -#else - m512 rv; - rv.lo = or256(a.lo, b.lo); - rv.hi = or256(a.hi, b.hi); - return rv; -#endif -} - -static really_inline -m512 xor512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return _mm512_xor_si512(a, b); -#else - m512 rv; - rv.lo = xor256(a.lo, b.lo); - rv.hi = xor256(a.hi, b.hi); - return rv; -#endif -} - -static really_inline -m512 not512(m512 a) { -#if defined(HAVE_AVX512) - return _mm512_xor_si512(a, ones512()); -#else - m512 rv; - rv.lo = not256(a.lo); - rv.hi = not256(a.hi); - return rv; -#endif -} - -static really_inline -m512 andnot512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return _mm512_andnot_si512(a, b); -#else - m512 rv; - rv.lo = andnot256(a.lo, b.lo); - rv.hi = andnot256(a.hi, b.hi); - return rv; -#endif -} - -#if defined(HAVE_AVX512) -static really_really_inline -m512 lshift64_m512(m512 a, unsigned b) { -#if defined(HAVE__BUILTIN_CONSTANT_P) - if (__builtin_constant_p(b)) { - return _mm512_slli_epi64(a, b); - } -#endif - m128 x = _mm_cvtsi32_si128(b); - return _mm512_sll_epi64(a, x); -} -#else -static really_really_inline -m512 lshift64_m512(m512 a, unsigned b) { - m512 rv; - rv.lo = lshift64_m256(a.lo, b); - rv.hi = lshift64_m256(a.hi, b); - return rv; -} -#endif - -#if defined(HAVE_AVX512) -#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) -#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) -#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) -#endif - -#if !defined(_MM_CMPINT_NE) -#define _MM_CMPINT_NE 0x4 -#endif - -static really_inline -int diff512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE); -#else - return diff256(a.lo, b.lo) || diff256(a.hi, 
b.hi); -#endif -} - -static really_inline -int isnonzero512(m512 a) { -#if defined(HAVE_AVX512) - return diff512(a, zeroes512()); -#elif defined(HAVE_AVX2) - m256 x = or256(a.lo, a.hi); - return !!diff256(x, zeroes256()); -#else - m128 x = or128(a.lo.lo, a.lo.hi); - m128 y = or128(a.hi.lo, a.hi.hi); - return isnonzero128(or128(x, y)); -#endif -} - -/** - * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit - * mask indicating which 32-bit words contain differences. - */ -static really_inline -u32 diffrich512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE); -#elif defined(HAVE_AVX2) - return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8); -#else - a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo); - a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi); - a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo); - a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi); - m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi), - _mm_packs_epi32(a.hi.lo, a.hi.hi)); - return ~(_mm_movemask_epi8(packed)) & 0xffff; -#endif -} - -/** - * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and - * returns a 16-bit mask indicating which 64-bit words contain differences. - */ -static really_inline -u32 diffrich64_512(m512 a, m512 b) { - //TODO: cmp_epi64? - u32 d = diffrich512(a, b); - return (d | (d >> 1)) & 0x55555555; -} - -// aligned load -static really_inline -m512 load512(const void *ptr) { -#if defined(HAVE_AVX512) - return _mm512_load_si512(ptr); -#else - assert(ISALIGNED_N(ptr, alignof(m256))); - m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; - return rv; -#endif -} - -// aligned store -static really_inline -void store512(void *ptr, m512 a) { - assert(ISALIGNED_N(ptr, alignof(m512))); -#if defined(HAVE_AVX512) - return _mm512_store_si512(ptr, a); -#elif defined(HAVE_AVX2) - m512 *x = (m512 *)ptr; - store256(&x->lo, a.lo); - store256(&x->hi, a.hi); -#else - ptr = assume_aligned(ptr, 16); - *(m512 *)ptr = a; -#endif -} - -// unaligned load -static really_inline -m512 loadu512(const void *ptr) { -#if defined(HAVE_AVX512) - return _mm512_loadu_si512(ptr); -#else - m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) }; - return rv; -#endif -} - -#if defined(HAVE_AVX512) -static really_inline -m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { - return _mm512_maskz_loadu_epi8(k, ptr); -} - -static really_inline -m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) { - return _mm512_mask_loadu_epi8(src, k, ptr); -} - -static really_inline -m512 set_mask_m512(__mmask64 k) { - return _mm512_movm_epi8(k); -} -#endif - -// packed unaligned store of first N bytes -static really_inline -void storebytes512(void *ptr, m512 a, unsigned int n) { - assert(n <= sizeof(a)); - memcpy(ptr, &a, n); -} - -// packed unaligned load of first N bytes, pad with zero -static really_inline -m512 loadbytes512(const void *ptr, unsigned int n) { - m512 a = zeroes512(); - assert(n <= sizeof(a)); - memcpy(&a, ptr, n); - return a; -} - -static really_inline -m512 mask1bit512(unsigned int n) { - assert(n < sizeof(m512) * 8); - u32 mask_idx = ((n % 8) * 64) + 95; - mask_idx -= n / 8; - return loadu512(&simd_onebit_masks[mask_idx]); -} - -// switches on bit N in the given vector. 
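The setbit/clearbit/testbit fallbacks for the emulated wide types all follow the same recipe: pick the sub-vector that owns bit n, then operate on bit n modulo the sub-vector width (the SSE versions do the final step with a one-bit mask and OR/ANDNOT/AND). A scalar model of that indexing, using plain 64-bit words instead of 128-bit lanes:

```cpp
#include <cassert>
#include <cstdint>

// A 512-bit value as eight 64-bit words; bit n lives in word n/64 at offset
// n%64. The SIMD fallbacks above do the same thing one level up, with
// 128-bit sub-vectors selected by n/128 and a one-bit mask for n%128.
struct w512 { uint64_t w[8]; };

static void setbit(w512 *v, unsigned n)   { v->w[n / 64] |= 1ULL << (n % 64); }
static void clearbit(w512 *v, unsigned n) { v->w[n / 64] &= ~(1ULL << (n % 64)); }
static bool testbit(const w512 *v, unsigned n) {
    return (v->w[n / 64] >> (n % 64)) & 1;
}

int main() {
    w512 v = {};
    setbit(&v, 300);                        // 300 = 64*4 + 44 -> word 4, bit 44
    assert(testbit(&v, 300) && !testbit(&v, 299));
    clearbit(&v, 300);
    assert(!testbit(&v, 300));
    return 0;
}
```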
-static really_inline -void setbit512(m512 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); -#if !defined(HAVE_AVX2) - m128 *sub; - if (n < 128) { - sub = &ptr->lo.lo; - } else if (n < 256) { - sub = &ptr->lo.hi; - } else if (n < 384) { - sub = &ptr->hi.lo; - } else { - sub = &ptr->hi.hi; - } - setbit128(sub, n % 128); -#elif defined(HAVE_AVX512) - *ptr = or512(mask1bit512(n), *ptr); -#else - m256 *sub; - if (n < 256) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 256; - } - setbit256(sub, n); -#endif -} - -// switches off bit N in the given vector. -static really_inline -void clearbit512(m512 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); -#if !defined(HAVE_AVX2) - m128 *sub; - if (n < 128) { - sub = &ptr->lo.lo; - } else if (n < 256) { - sub = &ptr->lo.hi; - } else if (n < 384) { - sub = &ptr->hi.lo; - } else { - sub = &ptr->hi.hi; - } - clearbit128(sub, n % 128); -#elif defined(HAVE_AVX512) - *ptr = andnot512(mask1bit512(n), *ptr); -#else - m256 *sub; - if (n < 256) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 256; - } - clearbit256(sub, n); -#endif -} - -// tests bit N in the given vector. -static really_inline -char testbit512(m512 val, unsigned int n) { - assert(n < sizeof(val) * 8); -#if !defined(HAVE_AVX2) - m128 sub; - if (n < 128) { - sub = val.lo.lo; - } else if (n < 256) { - sub = val.lo.hi; - } else if (n < 384) { - sub = val.hi.lo; - } else { - sub = val.hi.hi; - } - return testbit128(sub, n % 128); -#elif defined(HAVE_AVX512) - const m512 mask = mask1bit512(n); - return !!_mm512_test_epi8_mask(mask, val); -#else - m256 sub; - if (n < 256) { - sub = val.lo; - } else { - sub = val.hi; - n -= 256; - } - return testbit256(sub, n); -#endif -} - -#endif +#endif // SIMD_UTILS_H diff --git a/src/util/small_vector.h b/src/util/small_vector.h index 0f54bbf6b..5bad7df9f 100644 --- a/src/util/small_vector.h +++ b/src/util/small_vector.h @@ -29,7 +29,11 @@ #ifndef UTIL_SMALL_VECTOR_H #define UTIL_SMALL_VECTOR_H -#include +#if defined(__has_feature) +# if __has_feature(memory_sanitizer) +#define BUILD_WITH_MSAN +# endif +#endif #include @@ -37,8 +41,16 @@ * We use the small_vector constructors introduced in Boost 1.61 (trac bug * #11866, github commit b436c91). If the Boost version is too old, we fall * back to using std::vector. + * + * Also with MSan boost::container::small_vector cannot be used because MSan + * reports some issues there, it looks similar to [1], but even adding + * __attribute__((no_sanitize_memory)) for ~small_vector_base() [2] is not + * enough since clang-16, so let's simply use std::vector under MSan. + * + * [1]: https://github.com/google/sanitizers/issues/854 + * [2]: https://github.com/ClickHouse/boost/commit/229354100 */ -#if BOOST_VERSION >= 106100 +#if !defined(BUILD_WITH_MSAN) && BOOST_VERSION >= 106100 # define HAVE_BOOST_CONTAINER_SMALL_VECTOR #endif @@ -56,6 +68,8 @@ using small_vector = boost::container::small_vector; #else +#include + // Boost version isn't new enough, fall back to just using std::vector. 
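The net effect of the small_vector.h changes above is easier to read in one piece. A condensed sketch of the selection logic, assuming Boost headers are available; includes are reconstructed and the allocator template parameter of the real alias is omitted:

```cpp
#include <cstddef>
#include <vector>
#include <boost/version.hpp>

#if defined(__has_feature)
# if __has_feature(memory_sanitizer)
#  define BUILD_WITH_MSAN
# endif
#endif

#if !defined(BUILD_WITH_MSAN) && BOOST_VERSION >= 106100
# include <boost/container/small_vector.hpp>
// Normal build with a recent Boost: keep the inline-buffer container.
template <class T, std::size_t N>
using small_vector = boost::container::small_vector<T, N>;
#else
// MSan build (or old Boost): plain std::vector, which the sanitizer
// understands, at the cost of the small-buffer optimization.
template <class T, std::size_t N>
using small_vector = std::vector<T>;
#endif

// Usage is unchanged either way:
//   small_vector<int, 4> v; v.push_back(42);
```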
template > using small_vector = std::vector; diff --git a/src/util/state_compress.c b/src/util/state_compress.c index 7238849e7..fda541126 100644 --- a/src/util/state_compress.c +++ b/src/util/state_compress.c @@ -1,5 +1,7 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -72,9 +74,14 @@ void storecompressed64(void *ptr, const u64a *x, const u64a *m, u32 bytes) { void loadcompressed64(u64a *x, const void *ptr, const u64a *m, u32 bytes) { assert(popcount64(*m) <= bytes * 8); - +#ifdef HAVE_SVE2_BITPERM + svbool_t pg = svwhilelt_b8(0U, bytes); + svuint64_t expanded = svbdep(svreinterpret_u64(svld1_u8(pg, ptr)), *m); + svst1(svptrue_pat_b64(SV_VL1), (uint64_t *)x, expanded); +#else u64a v = partial_load_u64a(ptr, bytes); *x = expand64(v, *m); +#endif } /* @@ -108,20 +115,21 @@ void storecompressed128_32bit(void *ptr, m128 xvec, m128 mvec) { static really_inline void storecompressed128_64bit(void *ptr, m128 xvec, m128 mvec) { // First, decompose our vectors into 64-bit chunks. - u64a x[2]; - memcpy(x, &xvec, sizeof(xvec)); - u64a m[2]; - memcpy(m, &mvec, sizeof(mvec)); + u64a ALIGN_ATTR(16) x[2]; + u64a ALIGN_ATTR(16) m[2]; + store128(m, mvec); + store128(x, xvec); // Count the number of bits of compressed state we're writing out per // chunk. - u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) }; + u32 ALIGN_ATTR(16) bits[2] = { popcount64(m[0]), popcount64(m[1]) }; // Compress each 64-bit chunk individually. - u64a v[2] = { compress64(x[0], m[0]), compress64(x[1], m[1]) }; + xvec = compress128(xvec, mvec); + store128(x, xvec); // Write packed data out. - pack_bits_64(ptr, v, bits, 2); + pack_bits_64(ptr, x, bits, 2); } #endif @@ -150,7 +158,7 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) { u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]), expand32(v[2], m[2]), expand32(v[3], m[3]) }; - return _mm_set_epi32(x[3], x[2], x[1], x[0]); + return set4x32(x[3], x[2], x[1], x[0]); } #endif @@ -158,16 +166,23 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) { static really_inline m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { // First, decompose our vectors into 64-bit chunks. - u64a m[2] = { movq(mvec), movq(_mm_srli_si128(mvec, 8)) }; + u64a ALIGN_ATTR(16) m[2]; + store128(m, mvec); - u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) }; - u64a v[2]; + // Count the number of bits of compressed state we're writing out per + // chunk. + u32 ALIGN_ATTR(16) bits[2] = { popcount64(m[0]), popcount64(m[1]) }; + u64a ALIGN_ATTR(16) v[2]; unpack_bits_64(v, (const u8 *)ptr, bits, 2); - u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) }; - - return _mm_set_epi64x(x[1], x[0]); +#ifdef HAVE_SVE2_BITPERM + u64a ALIGN_ATTR(16) xvec[2]; + bdep64x2(xvec, v, &mvec); + return load128(xvec); +#else + return expand128(load128(v), mvec); +#endif } #endif @@ -215,10 +230,10 @@ void storecompressed256_32bit(void *ptr, m256 xvec, m256 mvec) { static really_really_inline void storecompressed256_64bit(void *ptr, m256 xvec, m256 mvec) { // First, decompose our vectors into 64-bit chunks. 
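The compressed-state load/store routines above pack only the state bits selected by the mask, so each 64-bit chunk contributes exactly popcount64(m) bits; expand64() scatters them back (PDEP-style), which is also what the SVE2 svbdep path does in hardware. A reference model of the semantics with simple loops; this is assumed behaviour for illustration, not the library's optimized implementation:

```cpp
#include <cassert>
#include <cstdint>

// PEXT-like: gather the bits of x selected by mask m into the low bits.
static uint64_t ref_compress64(uint64_t x, uint64_t m) {
    uint64_t out = 0, bit = 1;
    for (uint64_t sel = 1; sel != 0; sel <<= 1) {
        if (m & sel) {
            if (x & sel) out |= bit;
            bit <<= 1;
        }
    }
    return out;
}

// PDEP-like: scatter the low bits of x to the positions selected by mask m.
static uint64_t ref_expand64(uint64_t x, uint64_t m) {
    uint64_t out = 0, bit = 1;
    for (uint64_t sel = 1; sel != 0; sel <<= 1) {
        if (m & sel) {
            if (x & bit) out |= sel;
            bit <<= 1;
        }
    }
    return out;
}

int main() {
    const uint64_t m = 0x00ff00f0000000ffULL;   // which state bits exist
    const uint64_t x = 0x123456789abcdef0ULL;   // uncompressed state
    uint64_t packed = ref_compress64(x, m);
    // Only popcount(m) low bits of 'packed' are meaningful - that is all the
    // store side has to write out.
    assert(packed < (1ULL << __builtin_popcountll(m)));
    // Round trip: expanding restores exactly the masked-in bits.
    assert(ref_expand64(packed, m) == (x & m));
    return 0;
}
```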
- u64a x[4]; - memcpy(x, &xvec, sizeof(xvec)); - u64a m[4]; - memcpy(m, &mvec, sizeof(mvec)); + u64a ALIGN_ATTR(32) x[4]; + u64a ALIGN_ATTR(32) m[4]; + store256(x, xvec); + store256(m, mvec); // Count the number of bits of compressed state we're writing out per // chunk. @@ -264,11 +279,11 @@ m256 loadcompressed256_32bit(const void *ptr, m256 mvec) { expand32(v[6], m[6]), expand32(v[7], m[7]) }; #if !defined(HAVE_AVX2) - m256 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]), - .hi = _mm_set_epi32(x[7], x[6], x[5], x[4]) }; + m256 xvec = { .lo = set4x32(x[3], x[2], x[1], x[0]), + .hi = set4x32(x[7], x[6], x[5], x[4]) }; #else - m256 xvec = _mm256_set_epi32(x[7], x[6], x[5], x[4], - x[3], x[2], x[1], x[0]); + m256 xvec = set8x32(x[7], x[6], x[5], x[4], + x[3], x[2], x[1], x[0]); #endif return xvec; } @@ -287,14 +302,20 @@ m256 loadcompressed256_64bit(const void *ptr, m256 mvec) { unpack_bits_64(v, (const u8 *)ptr, bits, 4); +#ifdef HAVE_SVE2_BITPERM + u64a ALIGN_ATTR(16) x[4]; + bdep64x2(x, v, &mvec.lo); + bdep64x2(&x[2], &v[2], &mvec.hi); +#else u64a x[4] = { expand64(v[0], m[0]), expand64(v[1], m[1]), expand64(v[2], m[2]), expand64(v[3], m[3]) }; +#endif #if !defined(HAVE_AVX2) - m256 xvec = { .lo = _mm_set_epi64x(x[1], x[0]), - .hi = _mm_set_epi64x(x[3], x[2]) }; + m256 xvec = { .lo = set2x64(x[1], x[0]), + .hi = set2x64(x[3], x[2]) }; #else - m256 xvec = _mm256_set_epi64x(x[3], x[2], x[1], x[0]); + m256 xvec = set4x64(x[3], x[2], x[1], x[0]); #endif return xvec; } @@ -402,9 +423,9 @@ m384 loadcompressed384_32bit(const void *ptr, m384 mvec) { expand32(v[8], m[8]), expand32(v[9], m[9]), expand32(v[10], m[10]), expand32(v[11], m[11]) }; - m384 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]), - .mid = _mm_set_epi32(x[7], x[6], x[5], x[4]), - .hi = _mm_set_epi32(x[11], x[10], x[9], x[8]) }; + m384 xvec = { .lo = set4x32(x[3], x[2], x[1], x[0]), + .mid = set4x32(x[7], x[6], x[5], x[4]), + .hi = set4x32(x[11], x[10], x[9], x[8]) }; return xvec; } #endif @@ -423,13 +444,20 @@ m384 loadcompressed384_64bit(const void *ptr, m384 mvec) { unpack_bits_64(v, (const u8 *)ptr, bits, 6); +#ifdef HAVE_SVE2_BITPERM + u64a ALIGN_ATTR(16) x[6]; + bdep64x2(x, v, &mvec.lo); + bdep64x2(&x[2], &v[2], &mvec.mid); + bdep64x2(&x[4], &v[4], &mvec.hi); +#else u64a x[6] = { expand64(v[0], m[0]), expand64(v[1], m[1]), expand64(v[2], m[2]), expand64(v[3], m[3]), expand64(v[4], m[4]), expand64(v[5], m[5]) }; +#endif - m384 xvec = { .lo = _mm_set_epi64x(x[1], x[0]), - .mid = _mm_set_epi64x(x[3], x[2]), - .hi = _mm_set_epi64x(x[5], x[4]) }; + m384 xvec = { .lo = set2x64(x[1], x[0]), + .mid = set2x64(x[3], x[2]), + .hi = set2x64(x[5], x[4]) }; return xvec; } #endif @@ -548,20 +576,20 @@ m512 loadcompressed512_32bit(const void *ptr, m512 mvec) { m512 xvec; #if defined(HAVE_AVX512) - xvec = _mm512_set_epi32(x[15], x[14], x[13], x[12], - x[11], x[10], x[9], x[8], - x[7], x[6], x[5], x[4], - x[3], x[2], x[1], x[0]); + xvec = set32x16(x[15], x[14], x[13], x[12], + x[11], x[10], x[9], x[8], + x[7], x[6], x[5], x[4], + x[3], x[2], x[1], x[0]); #elif defined(HAVE_AVX2) - xvec.lo = _mm256_set_epi32(x[7], x[6], x[5], x[4], - x[3], x[2], x[1], x[0]); - xvec.hi = _mm256_set_epi32(x[15], x[14], x[13], x[12], - x[11], x[10], x[9], x[8]); + xvec.lo = set8x32(x[7], x[6], x[5], x[4], + x[3], x[2], x[1], x[0]); + xvec.hi = set8x32(x[15], x[14], x[13], x[12], + x[11], x[10], x[9], x[8]); #else - xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]); - xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]); - xvec.hi.lo = _mm_set_epi32(x[11], 
x[10], x[9], x[8]); - xvec.hi.hi = _mm_set_epi32(x[15], x[14], x[13], x[12]); + xvec.lo.lo = set4x32(x[3], x[2], x[1], x[0]); + xvec.lo.hi = set4x32(x[7], x[6], x[5], x[4]); + xvec.hi.lo = set4x32(x[11], x[10], x[9], x[8]); + xvec.hi.hi = set4x32(x[15], x[14], x[13], x[12]); #endif return xvec; } @@ -582,22 +610,30 @@ m512 loadcompressed512_64bit(const void *ptr, m512 mvec) { unpack_bits_64(v, (const u8 *)ptr, bits, 8); +#ifdef HAVE_SVE2_BITPERM + u64a ALIGN_ATTR(16) x[8]; + bdep64x2(x, v, &mvec.lo.lo); + bdep64x2(&x[2], &v[2], &mvec.lo.hi); + bdep64x2(&x[4], &v[4], &mvec.hi.lo); + bdep64x2(&x[6], &v[6], &mvec.hi.hi); +#else u64a x[8] = { expand64(v[0], m[0]), expand64(v[1], m[1]), expand64(v[2], m[2]), expand64(v[3], m[3]), expand64(v[4], m[4]), expand64(v[5], m[5]), expand64(v[6], m[6]), expand64(v[7], m[7]) }; +#endif #if defined(HAVE_AVX512) - m512 xvec = _mm512_set_epi64(x[7], x[6], x[5], x[4], + m512 xvec = set8x64(x[7], x[6], x[5], x[4], x[3], x[2], x[1], x[0]); #elif defined(HAVE_AVX2) - m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]), - .hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])}; + m512 xvec = { .lo = set4x64(x[3], x[2], x[1], x[0]), + .hi = set4x64(x[7], x[6], x[5], x[4])}; #else - m512 xvec = { .lo = { _mm_set_epi64x(x[1], x[0]), - _mm_set_epi64x(x[3], x[2]) }, - .hi = { _mm_set_epi64x(x[5], x[4]), - _mm_set_epi64x(x[7], x[6]) } }; + m512 xvec = { .lo = { set2x64(x[1], x[0]), + set2x64(x[3], x[2]) }, + .hi = { set2x64(x[5], x[4]), + set2x64(x[7], x[6]) } }; #endif return xvec; } diff --git a/src/util/supervector/arch/arm/impl.cpp b/src/util/supervector/arch/arm/impl.cpp new file mode 100644 index 000000000..bd866223b --- /dev/null +++ b/src/util/supervector/arch/arm/impl.cpp @@ -0,0 +1,588 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SIMD_IMPL_HPP +#define SIMD_IMPL_HPP + +#include + +#include "ue2common.h" +#include "util/supervector/supervector.hpp" + +// 128-bit NEON implementation + +template<> +really_inline SuperVector<16>::SuperVector(typename base_type::type const v) +{ + u.v128[0] = v; +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int8x16_t other) +{ + u.s8x16[0] = other; +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint8x16_t other) +{ + u.u8x16[0] = other; +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int16x8_t other) +{ + u.s16x8[0] = other; +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint16x8_t other) +{ + u.u16x8[0] = other; +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int32x4_t other) +{ + u.s32x4[0] = other; +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint32x4_t other) +{ + u.u32x4[0] = other; +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int64x2_t other) +{ + u.s64x2[0] = other; +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint64x2_t other) +{ + u.u64x2[0] = other; +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int8_t const other) +{ + u.s8x16[0] = vdupq_n_s8(other); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint8_t const other) +{ + u.u8x16[0] = vdupq_n_u8(other); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int16_t const other) +{ + u.s16x8[0] = vdupq_n_s16(other); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint16_t const other) +{ + u.u16x8[0] = vdupq_n_u16(other); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int32_t const other) +{ + u.s32x4[0] = vdupq_n_s32(other); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint32_t const other) +{ + u.u32x4[0] = vdupq_n_u32(other); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int64_t const other) +{ + u.s64x2[0] = vdupq_n_s64(other); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint64_t const other) +{ + u.u64x2[0] = vdupq_n_u64(other); +} + +// Constants +template<> +really_inline SuperVector<16> SuperVector<16>::Ones(void) +{ + return {vdupq_n_u8(0xFF)}; +} + +template<> +really_inline SuperVector<16> SuperVector<16>::Zeroes(void) +{ + return {vdupq_n_u8(0)}; +} + +// Methods + +template <> +really_inline void SuperVector<16>::operator=(SuperVector<16> const &other) +{ + u.v128[0] = other.u.v128[0]; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const &b) const +{ + return {vandq_u8(u.u8x16[0], b.u.u8x16[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const &b) const +{ + return {vorrq_u8(u.u8x16[0], b.u.u8x16[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &b) const +{ + return {veorq_u8(u.u8x16[0], b.u.u8x16[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator!() const +{ + return {vmvnq_u8(u.u8x16[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const +{ + return {vandq_u8(vmvnq_u8(u.u8x16[0]), b.u.u8x16[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const +{ + return {vceqq_u8(u.u8x16[0], b.u.u8x16[0])}; 
+} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator!=(SuperVector<16> const &b) const +{ + return !(*this == b); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const +{ + return {vcgtq_s8(u.s8x16[0], b.u.s8x16[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const &b) const +{ + return {vcgeq_u8(u.u8x16[0], b.u.u8x16[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const +{ + return {vcltq_s8(u.s8x16[0], b.u.s8x16[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const &b) const +{ + return {vcgeq_s8(u.s8x16[0], b.u.s8x16[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const +{ + return (*this == b); +} + +template <> +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::comparemask(void) const { + return static_cast::comparemask_type>( + vget_lane_u64((uint64x1_t)vshrn_n_u16(u.u16x8[0], 4), 0)); +} + +template <> +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::eqmask(SuperVector<16> const b) const { + return eq(b).comparemask(); +} + +template <> really_inline u32 SuperVector<16>::mask_width() { return 4; } + +template <> +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::iteration_mask( + typename SuperVector<16>::comparemask_type mask) { + return mask & 0x1111111111111111ull; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const +{ + return {vshlq_n_u8(u.u8x16[0], N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const +{ + return {vshlq_n_u16(u.u16x8[0], N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const +{ + return {vshlq_n_u32(u.u32x4[0], N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const +{ + return {vshlq_n_u64(u.u64x2[0], N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const +{ + return {vextq_u8(vdupq_n_u8(0), u.u8x16[0], 16 - N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshl_imm() const +{ + return vshl_128_imm(); +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const +{ + return {vshrq_n_u8(u.u8x16[0], N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const +{ + return {vshrq_n_u16(u.u16x8[0], N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const +{ + return {vshrq_n_u32(u.u32x4[0], N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const +{ + return {vshrq_n_u64(u.u64x2[0], N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const +{ + return {vextq_u8(u.u8x16[0], vdupq_n_u8(0), N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_imm() const +{ + return vshr_128_imm(); +} + +#if !defined(HS_OPTIMIZE) +template SuperVector<16> SuperVector<16>::vshl_8_imm<4>() const; +template SuperVector<16> SuperVector<16>::vshl_16_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshl_64_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshl_64_imm<4>() const; +template SuperVector<16> 
SuperVector<16>::vshl_128_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshl_128_imm<4>() const; +template SuperVector<16> SuperVector<16>::vshr_8_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshr_8_imm<4>() const; +template SuperVector<16> SuperVector<16>::vshr_16_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshr_64_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshr_64_imm<4>() const; +template SuperVector<16> SuperVector<16>::vshr_128_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshr_128_imm<4>() const; +#endif + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 8) return Zeroes(); + int8x16_t shift_indices = vdupq_n_s8(N); + return { vshlq_s8(u.s8x16[0], shift_indices) }; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 16) return Zeroes(); + int16x8_t shift_indices = vdupq_n_s16(N); + return { vshlq_s16(u.s16x8[0], shift_indices) }; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 32) return Zeroes(); + int32x4_t shift_indices = vdupq_n_s32(N); + return { vshlq_s32(u.s32x4[0], shift_indices) }; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 64) return Zeroes(); + int64x2_t shift_indices = vdupq_n_s64(N); + return { vshlq_s64(u.s64x2[0], shift_indices) }; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 16) return Zeroes(); +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(N)) { + return {vextq_u8(vdupq_n_u8(0), u.u8x16[0], 16 - N)}; + } +#endif + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(vdupq_n_u8(0), v->u.u8x16[0], 16 - n)}; }); + return result; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl(uint8_t const N) const +{ + return vshl_128(N); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 8) return Zeroes(); + int8x16_t shift_indices = vdupq_n_s8(-N); + return { vshlq_s8(u.s8x16[0], shift_indices) }; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 16) return Zeroes(); + int16x8_t shift_indices = vdupq_n_s16(-N); + return { vshlq_s16(u.s16x8[0], shift_indices) }; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 32) return Zeroes(); + int32x4_t shift_indices = vdupq_n_s32(-N); + return { vshlq_s32(u.s32x4[0], shift_indices) }; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 64) return Zeroes(); + int64x2_t shift_indices = vdupq_n_s64(-N); + return { vshlq_s64(u.s64x2[0], shift_indices) }; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 16) return Zeroes(); +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(N)) { + return {vextq_u8(u.u8x16[0], vdupq_n_u8(0), N)}; + } +#endif + 
SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(v->u.u8x16[0], vdupq_n_u8(0), n)}; }); + return result; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const +{ + return vshr_128(N); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const +{ + return vshr_128(N); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const +{ + return vshl_128(N); +} + +template<> +really_inline SuperVector<16> SuperVector<16>::Ones_vshr(uint8_t const N) +{ + return Ones().vshr_128(N); +} + +template<> +really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N) +{ + return Ones().vshl_128(N); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr) +{ + return {vld1q_s32((const int32_t *)ptr)}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::load(void const *ptr) +{ + assert(ISALIGNED_N(ptr, alignof(SuperVector::size))); + ptr = vectorscan_assume_aligned(ptr, SuperVector::size); + return {vld1q_s32((const int32_t *)ptr)}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len) +{ + SuperVector mask = Ones_vshr(16 - len); + SuperVector v = loadu(ptr); + return mask & v; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, typename base_type::comparemask_type const mask) +{ + DEBUG_PRINTF("mask = %08llx\n", mask); + SuperVector v = loadu(ptr); + (void)mask; + return v; // FIXME: & mask +} + +template<> +really_inline typename SuperVector<16>::comparemask_type SuperVector<16>::findLSB(typename SuperVector<16>::comparemask_type &z) +{ + return findAndClearLSB_64(&z) >> 2; +} + +template<> +really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset) +{ + if (offset == 0) return other; + if (offset == 16) return *this; +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(offset)) { + return {vextq_u8(other.u.u8x16[0], u.u8x16[0], offset)}; + } +#endif + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (offset == n) result = {vextq_u8(other.u.u8x16[0], v->u.u8x16[0], n)}; }); + return result; +} + +template<> +template<> +really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) +{ + return {vqtbl1q_u8(u.u8x16[0], b.u.u8x16[0])}; +} + +template<> +template<> +really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) +{ + /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf. + In NEON, if >=16, then the result is zero, otherwise it is that lane. + btranslated is the version that is converted from Intel to NEON. 
*/ + SuperVector<16> btranslated = b & SuperVector<16>::dup_s8(0x8f); + return pshufb(btranslated); +} + +template<> +really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b, uint8_t const len) +{ + SuperVector mask = Ones_vshr(16 -len); + return mask & pshufb(b); +} + +#endif // SIMD_IMPL_HPP diff --git a/src/util/supervector/arch/arm/types.hpp b/src/util/supervector/arch/arm/types.hpp new file mode 100644 index 000000000..6e362e1c2 --- /dev/null +++ b/src/util/supervector/arch/arm/types.hpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#if !defined(m128) && defined(HAVE_NEON) +typedef int32x4_t m128; +#endif + diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp new file mode 100644 index 000000000..46e2a822b --- /dev/null +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -0,0 +1,612 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SIMD_IMPL_HPP +#define SIMD_IMPL_HPP + +#include +#include + +#include "ue2common.h" +#include "util/arch.h" +#include "util/unaligned.h" +#include "util/supervector/supervector.hpp" +#include + +// 128-bit IBM Power VSX implementation + +template<> +really_inline SuperVector<16>::SuperVector(SuperVector const &other) +{ + u.v128[0] = other.u.v128[0]; +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(__vector __bool char v) +{ + u.u8x16[0] = (uint8x16_t) v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int8x16_t const v) +{ + u.s8x16[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint8x16_t const v) +{ + u.u8x16[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int16x8_t const v) +{ + u.s16x8[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint16x8_t const v) +{ + u.u16x8[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int32x4_t const v) +{ + u.s32x4[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint32x4_t const v) +{ + u.u32x4[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int64x2_t const v) +{ + u.s64x2[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint64x2_t const v) +{ + u.u64x2[0] = v; +}; + +template<> +really_inline SuperVector<16>::SuperVector(typename base_type::type const v) +{ + u.v128[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int8_t const other) +{ + u.s8x16[0] = vec_splats(other); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint8_t const other) +{ + u.u8x16[0] = vec_splats(static_cast(other)); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int16_t const other) +{ + u.s16x8[0] = vec_splats(other); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint16_t const other) +{ + u.u16x8[0] = vec_splats(static_cast(other)); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int32_t const other) +{ + u.s32x4[0] = vec_splats(other); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint32_t const other) +{ + u.u32x4[0] = vec_splats(static_cast(other)); +} + +#if defined(__clang__) && (__clang_major__ >= 15) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wdeprecate-lax-vec-conv-all" +#endif // defined(__clang__) && (__clang_major__ == 15) +template<> +template<> +really_inline SuperVector<16>::SuperVector(int64_t const other) +{ + u.s64x2[0] = reinterpret_cast(vec_splats(static_cast(other))); +} +#if defined(__clang__) && (__clang_major__ >= 15) +#pragma clang diagnostic pop +#endif // defined(__clang__) && (__clang_major__ == 15) + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint64_t const other) +{ + u.u64x2[0] = 
reinterpret_cast(vec_splats(static_cast(other))); +} + +// Constants +template<> +really_inline SuperVector<16> SuperVector<16>::Ones(void) +{ + return { vec_splat_s8(-1)}; +} + +template<> +really_inline SuperVector<16> SuperVector<16>::Zeroes(void) +{ + return { vec_splat_s8(0) }; +} + +// Methods + +template <> +really_inline void SuperVector<16>::operator=(SuperVector<16> const &other) +{ + u.v128[0] = other.u.v128[0]; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const &b) const +{ + return { vec_and(u.v128[0], b.u.v128[0]) }; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const &b) const +{ + return { vec_or(u.v128[0], b.u.v128[0]) }; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &b) const +{ + return { vec_xor(u.v128[0], b.u.v128[0]) }; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator!() const +{ + return { vec_xor(u.v128[0], u.v128[0]) }; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const +{ + int8x16_t not_res = vec_xor(u.s8x16[0], vec_splat_s8(-1)); + return { vec_and(not_res, b.u.s8x16[0]) }; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const +{ + return { vec_cmpeq(u.s8x16[0], b.u.s8x16[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator!=(SuperVector<16> const &b) const +{ + return !(*this == b); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const +{ + return { vec_cmpgt(u.s8x16[0], b.u.s8x16[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const &b) const +{ + return { vec_cmpge(u.s8x16[0], b.u.s8x16[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const +{ + return { vec_cmpgt(b.u.s8x16[0], u.s8x16[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const &b) const +{ + return { vec_cmpge(b.u.s8x16[0], u.s8x16[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const +{ + return (*this == b); +} + +#if defined(__clang__) && (__clang_major__ >= 15) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wdeprecate-lax-vec-conv-all" +#endif // defined(__clang__) && (__clang_major__ == 15) +template <> +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::comparemask(void) const { + static uint8x16_t perm = { 16, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + uint8x16_t bitmask = vec_gb(u.u8x16[0]); + bitmask = (uint8x16_t) vec_perm(vec_splat_u8(0), bitmask, perm); + u32 ALIGN_ATTR(16) movemask; + vec_ste(reinterpret_cast(bitmask), 0, &movemask); + return movemask; +} +#if defined(__clang__) && (__clang_major__ >= 15) +#pragma clang diagnostic pop +#endif // defined(__clang__) && (__clang_major__ == 15) + +template <> +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::eqmask(SuperVector<16> const b) const { + return eq(b).comparemask(); +} + +template <> really_inline u32 SuperVector<16>::mask_width() { return 1; } + +template <> +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::iteration_mask( + typename SuperVector<16>::comparemask_type mask) { + return mask; +} + +template <> +template +really_inline SuperVector<16> 
SuperVector<16>::vshl_8_imm() const +{ + return { vec_sl(u.s8x16[0], vec_splat_u8(N)) }; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const +{ + return { vec_sl(u.s16x8[0], vec_splat_u16(N)) }; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const +{ + return { vec_sl(u.s32x4[0], vec_splat_u32(N)) }; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const +{ + return { vec_sl(u.s64x2[0], vec_splats((ulong64_t) N)) }; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const +{ + return { vec_sld(u.s8x16[0], vec_splat_s8(0), N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshl_imm() const +{ + return vshl_128_imm(); +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const +{ + return { vec_sr(u.s8x16[0], vec_splat_u8(N)) }; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const +{ + return { vec_sr(u.s16x8[0], vec_splat_u16(N)) }; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const +{ + return { vec_sr(u.s32x4[0], vec_splat_u32(N)) }; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const +{ + return { vec_sr(u.s64x2[0], vec_splats((ulong64_t)N)) }; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const +{ + return { vec_sld(vec_splat_s8(0), u.s8x16[0], 16 - N) }; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_imm() const +{ + return vshr_128_imm(); +} + +#if !defined(HS_OPTIMIZE) +template SuperVector<16> SuperVector<16>::vshl_8_imm<4>() const; +template SuperVector<16> SuperVector<16>::vshl_16_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshl_64_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshl_64_imm<4>() const; +template SuperVector<16> SuperVector<16>::vshl_128_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshl_128_imm<4>() const; +template SuperVector<16> SuperVector<16>::vshr_8_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshr_8_imm<4>() const; +template SuperVector<16> SuperVector<16>::vshr_16_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshr_64_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshr_64_imm<4>() const; +template SuperVector<16> SuperVector<16>::vshr_128_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshr_128_imm<4>() const; +#endif + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const +{ + if (N == 0) return *this; + uint8x16_t shift_indices = vec_splats((uint8_t) N); + return { vec_sl(u.u8x16[0], shift_indices) }; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const UNUSED N) const +{ + if (N == 0) return *this; + uint16x8_t shift_indices = vec_splats((uint16_t) N); + return { vec_sl(u.u16x8[0], shift_indices) }; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const +{ + if (N == 0) return *this; + uint32x4_t shift_indices = vec_splats((uint32_t) N); + return { vec_sl(u.u32x4[0], shift_indices) }; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const +{ + if (N == 0) return *this; + uint64x2_t shift_indices = vec_splats((ulong64_t) N); + return { vec_sl(u.u64x2[0], shift_indices) }; +} 
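The four variable-count element shifts above share one idiom: the scalar count N is broadcast into every lane with vec_splats and then applied lane-wise with vec_sl, with an early return for N == 0. As a sanity check of that lane-wise behaviour, here is a minimal scalar reference model in plain C++ (no VSX intrinsics; ref_vshl_64 is an illustrative name, not part of the library, and it only models counts below the 64-bit element width):

    #include <array>
    #include <cstdint>
    #include <cstdio>

    // Scalar model of the 64-bit lane shift performed by vshl_64(N) above:
    // both 64-bit lanes of the 128-bit vector are shifted left by n,
    // and n == 0 returns the input unchanged (the same fast path as above).
    static std::array<uint64_t, 2> ref_vshl_64(std::array<uint64_t, 2> v, uint8_t n) {
        if (n == 0) return v;                   // mirrors "if (N == 0) return *this;"
        for (auto &lane : v) {
            lane = (n < 64) ? (lane << n) : 0;  // keep the C++ shift well-defined
        }
        return v;
    }

    int main() {
        auto r = ref_vshl_64({1ULL, 0x4000000000000000ULL}, 1);
        std::printf("%016llx %016llx\n",
                    (unsigned long long)r[0], (unsigned long long)r[1]);
        return 0;
    }

The same broadcast-then-shift pattern is reused for the 8-, 16- and 32-bit lane widths, so a per-width reference model would differ only in lane type and count limit.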
+ +template <> +really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const +{ + if (N == 0) return *this; + SuperVector sl{N << 3}; + return { vec_slo(u.u8x16[0], sl.u.u8x16[0]) }; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl(uint8_t const N) const +{ + return vshl_128(N); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const +{ + if (N == 0) return *this; + uint8x16_t shift_indices = vec_splats((uint8_t) N); + return { vec_sr(u.u8x16[0], shift_indices) }; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const +{ + if (N == 0) return *this; + uint16x8_t shift_indices = vec_splats((uint16_t) N); + return { vec_sr(u.u16x8[0], shift_indices) }; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const +{ + if (N == 0) return *this; + uint32x4_t shift_indices = vec_splats((uint32_t) N); + return { vec_sr(u.u32x4[0], shift_indices) }; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const +{ + if (N == 0) return *this; + uint64x2_t shift_indices = vec_splats((ulong64_t) N); + return { vec_sr(u.u64x2[0], shift_indices) }; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const +{ + if (N == 0) return *this; + SuperVector sr{N << 3}; + return { vec_sro(u.u8x16[0], sr.u.u8x16[0]) }; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const +{ + return vshr_128(N); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const +{ +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (N == 0) return *this; + if (__builtin_constant_p(N)) { + return { vec_sld(vec_splat_s8(0), u.s8x16[0], 16 - N) }; + } +#endif + return vshr_128(N); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const +{ +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (N == 0) return *this; + if (__builtin_constant_p(N)) { + return { vec_sld(u.s8x16[0], vec_splat_s8(0), N)}; + } +#endif + return vshl_128(N); +} + +template<> +really_inline SuperVector<16> SuperVector<16>::Ones_vshr(uint8_t const N) +{ + return Ones().vshr_128(N); +} + +template<> +really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N) +{ + return Ones().vshl_128(N); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr) +{ + return { vec_xl(0, (const long64_t*)ptr) }; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::load(void const *ptr) +{ + assert(ISALIGNED_N(ptr, alignof(SuperVector::size))); + return { vec_xl(0, (const long64_t*)ptr) }; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len) +{ + SuperVector<16> mask = Ones_vshr(16 -len); + SuperVector<16> v = loadu(ptr); + return mask & v; +} + +template<> +really_inline typename SuperVector<16>::comparemask_type SuperVector<16>::findLSB(typename SuperVector<16>::comparemask_type &z) +{ + return findAndClearLSB_32(&z); +} + +template<> +really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset) +{ + if (offset == 0) return other; + if (offset == 16) return *this; +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(offset)) { + return { vec_sld(u.s8x16[0], other.u.s8x16[0], offset) }; + } +#endif + uint8x16_t sl = vec_splats((uint8_t) (offset << 3)); + uint8x16_t sr = 
vec_splats((uint8_t) ((16 - offset) << 3)); + uint8x16_t rhs = vec_slo(u.u8x16[0], sr); + uint8x16_t lhs = vec_sro(other.u.u8x16[0], sl); + return { vec_or(lhs, rhs) }; +} + +template<> +template<> +really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) +{ + /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf. + In NEON or PPC, if >=16, then the result is zero, otherwise it is that lane. + below is the version that is converted from Intel to PPC. */ + uint8x16_t mask =(uint8x16_t)vec_cmpge(b.u.u8x16[0], vec_splats((uint8_t)0x80)); + uint8x16_t res = vec_perm (u.u8x16[0], u.u8x16[0], b.u.u8x16[0]); + return { vec_sel(res, vec_splat_u8(0), mask) }; +} + +template<> +template<> +really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) +{ + /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf. + In NEON or PPC, if >=16, then the result is zero, otherwise it is that lane. + btranslated is the version that is converted from Intel to PPC. */ + SuperVector<16> btranslated = b & SuperVector<16>::dup_s8(0x8f); + return pshufb(btranslated); +} + + +template<> +really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b, uint8_t const len) +{ + SuperVector<16> mask = Ones_vshr(16 -len); + return mask & pshufb(b); +} + +#endif diff --git a/src/util/supervector/arch/ppc64el/types.hpp b/src/util/supervector/arch/ppc64el/types.hpp new file mode 100644 index 000000000..bdc6608e4 --- /dev/null +++ b/src/util/supervector/arch/ppc64el/types.hpp @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +typedef __vector unsigned long long int uint64x2_t; +typedef __vector signed long long int int64x2_t; +typedef __vector unsigned int uint32x4_t; +typedef __vector signed int int32x4_t; +typedef __vector unsigned short int uint16x8_t; +typedef __vector signed short int int16x8_t; +typedef __vector unsigned char uint8x16_t; +typedef __vector signed char int8x16_t; + +typedef unsigned long long int ulong64_t; +typedef signed long long int long64_t; + +#if !defined(m128) && defined(HAVE_VSX) +typedef __vector int m128; +#endif diff --git a/src/util/supervector/arch/x86/impl.cpp b/src/util/supervector/arch/x86/impl.cpp new file mode 100644 index 000000000..77ffc038c --- /dev/null +++ b/src/util/supervector/arch/x86/impl.cpp @@ -0,0 +1,1885 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SIMD_IMPL_HPP +#define SIMD_IMPL_HPP + +#include +#include + +#include "ue2common.h" +#include "util/arch.h" +#include "util/unaligned.h" +#include "util/supervector/supervector.hpp" + +// 128-bit SSE implementation +#if !(!defined(RELEASE_BUILD) && defined(FAT_RUNTIME) && (defined(HAVE_AVX2) || defined(HAVE_AVX512))) && defined(HAVE_SIMD_128_BITS) + +template<> +really_inline SuperVector<16>::SuperVector(SuperVector const &other) +{ + u.v128[0] = other.u.v128[0]; +} + +template<> +really_inline SuperVector<16>::SuperVector(typename base_type::type const v) +{ + u.v128[0] = v; +}; + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int8_t const other) +{ + u.v128[0] = _mm_set1_epi8(other); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint8_t const other) +{ + u.v128[0] = _mm_set1_epi8(static_cast(other)); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int16_t const other) +{ + u.v128[0] = _mm_set1_epi16(other); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint16_t const other) +{ + u.v128[0] = _mm_set1_epi16(static_cast(other)); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int32_t const other) +{ + u.v128[0] = _mm_set1_epi32(other); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint32_t const other) +{ + u.v128[0] = _mm_set1_epi32(static_cast(other)); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int64_t const other) +{ + u.v128[0] = _mm_set1_epi64x(other); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint64_t const other) +{ + u.v128[0] = _mm_set1_epi64x(static_cast(other)); +} + +// Constants +template<> +really_inline SuperVector<16> SuperVector<16>::Ones() +{ + return {_mm_set1_epi8(0xFF)}; +} + +template<> +really_inline SuperVector<16> SuperVector<16>::Zeroes(void) +{ + return {_mm_set1_epi8(0)}; +} + +// Methods + +template <> +really_inline void SuperVector<16>::operator=(SuperVector<16> const &other) +{ + u.v128[0] = other.u.v128[0]; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const &b) const +{ + return {_mm_and_si128(u.v128[0], b.u.v128[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const &b) const +{ + return {_mm_or_si128(u.v128[0], b.u.v128[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &b) const +{ + return {_mm_xor_si128(u.v128[0], b.u.v128[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator!() const +{ + return {_mm_xor_si128(u.v128[0], u.v128[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const +{ + return {_mm_andnot_si128(u.v128[0], b.u.v128[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const +{ + return {_mm_cmpeq_epi8(u.v128[0], b.u.v128[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator!=(SuperVector<16> const &b) const +{ + return !(*this == b); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const +{ + return {_mm_cmpgt_epi8(u.v128[0], b.u.v128[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const +{ + return {_mm_cmplt_epi8(u.v128[0], b.u.v128[0])}; +} + +template <> +really_inline SuperVector<16> 
SuperVector<16>::operator>=(SuperVector<16> const &b) const +{ + return !(*this < b); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const &b) const +{ + return !(*this > b); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const +{ + return (*this == b); +} + +template <> +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::comparemask(void) const { + return (u32)_mm_movemask_epi8(u.v128[0]); +} + +template <> +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::eqmask(SuperVector<16> const b) const { + return eq(b).comparemask(); +} + +template <> really_inline u32 SuperVector<16>::mask_width() { return 1; } + +template <> +really_inline typename SuperVector<16>::comparemask_type +SuperVector<16>::iteration_mask( + typename SuperVector<16>::comparemask_type mask) { + return mask; +} + +// template <> +// template +// really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const +// { +// const uint8_t i = N; +// return {_mm_slli_epi8(u.v128[0], i)}; +// } + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const +{ + return {_mm_slli_epi16(u.v128[0], N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const +{ + return {_mm_slli_epi32(u.v128[0], N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const +{ + return {_mm_slli_epi64(u.v128[0], N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const +{ + return {_mm_slli_si128(u.v128[0], N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshl_imm() const +{ + return vshl_128_imm(); +} + +// template <> +// template +// really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const +// { +// return {_mm_srli_epi8(u.v128[0], N)}; +// } + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const +{ + return {_mm_srli_epi16(u.v128[0], N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const +{ + return {_mm_srli_epi32(u.v128[0], N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const +{ + return {_mm_srli_epi64(u.v128[0], N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const +{ + return {_mm_srli_si128(u.v128[0], N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_imm() const +{ + return vshr_128_imm(); +} + +#if !defined(HS_OPTIMIZE) +template SuperVector<16> SuperVector<16>::vshl_16_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshl_64_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshl_64_imm<4>() const; +template SuperVector<16> SuperVector<16>::vshl_128_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshl_128_imm<4>() const; +template SuperVector<16> SuperVector<16>::vshr_16_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshr_64_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshr_64_imm<4>() const; +template SuperVector<16> SuperVector<16>::vshr_128_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshr_128_imm<4>() const; +#endif + +// template <> +// really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const +// { +// Unroller<0, 15>::iterator([&,v=this](int i) { if (N == i) return {_mm_slli_epi8(v->u.v128[0], i)}; 
}); +// if (N == 16) return Zeroes(); +// } + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const N) const +{ +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(N)) { + return {_mm_slli_epi16(u.v128[0], N)}; + } +#endif + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_slli_epi16(v->u.v128[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const +{ +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(N)) { + return {_mm_slli_epi32(u.v128[0], N)}; + } +#endif + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_slli_epi32(v->u.v128[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const +{ +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(N)) { + return {_mm_slli_epi64(u.v128[0], N)}; + } +#endif + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_slli_epi64(v->u.v128[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const +{ +#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND) + if (__builtin_constant_p(N)) { + return {_mm_slli_si128(u.v128[0], N)}; + } +#endif + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_slli_si128(v->u.v128[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl(uint8_t const N) const +{ + return vshl_128(N); +} + +// template <> +// really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const +// { +// SuperVector<16> result; +// Unroller<0, 15>::iterator([&,v=this](uint8_t const i) { if (N == i) result = {_mm_srli_epi8(v->u.v128[0], i)}; }); +// if (N == 16) result = Zeroes(); +// return result; +// } + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const +{ +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(N)) { + return {_mm_srli_epi16(u.v128[0], N)}; + } +#endif + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_srli_epi16(v->u.v128[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const +{ +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(N)) { + return {_mm_srli_epi32(u.v128[0], N)}; + } +#endif + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_srli_epi32(v->u.v128[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const +{ +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(N)) { + return {_mm_srli_epi64(u.v128[0], N)}; + } 
+#endif + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_srli_epi64(v->u.v128[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const +{ +#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND) + if (__builtin_constant_p(N)) { + return {_mm_srli_si128(u.v128[0], N)}; + } +#endif + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_srli_si128(v->u.v128[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const +{ + return vshr_128(N); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const +{ +#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND) + if (__builtin_constant_p(N)) { + return {_mm_srli_si128(u.v128[0], N)}; + } +#endif + return vshr_128(N); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const +{ +#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND) + if (__builtin_constant_p(N)) { + return {_mm_slli_si128(u.v128[0], N)}; + } +#endif + return vshl_128(N); +} + +template<> +really_inline SuperVector<16> SuperVector<16>::Ones_vshr(uint8_t const N) +{ + if (N == 0) return Ones(); + else return Ones().vshr_128(N); +} + +template<> +really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N) +{ + if (N == 0) return Ones(); + else return Ones().vshl_128(N); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr) +{ + return _mm_loadu_si128((const m128 *)ptr); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::load(void const *ptr) +{ + assert(ISALIGNED_N(ptr, alignof(SuperVector::size))); + ptr = vectorscan_assume_aligned(ptr, SuperVector::size); + return _mm_load_si128((const m128 *)ptr); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len) +{ + SuperVector mask = Ones_vshr(16 - len); + SuperVector v = _mm_loadu_si128((const m128 *)ptr); + return v & mask; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, typename base_type::comparemask_type const mask) +{ +#ifdef HAVE_AVX512 + SuperVector<16> v = _mm_maskz_loadu_epi8(mask, (const m128 *)ptr); + v.print8("v"); + return v; +#else + DEBUG_PRINTF("mask = %08x\n", mask); + SuperVector v = _mm_loadu_si128((const m128 *)ptr); + (void)mask; + return v; // FIXME: & mask +#endif +} + +template<> +really_inline typename SuperVector<16>::comparemask_type SuperVector<16>::findLSB(typename SuperVector<16>::comparemask_type &z) +{ + return findAndClearLSB_32(&z); +} + +template<> +really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset) +{ +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(offset)) { + if (offset == 16) { + return *this; + } else { + return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], offset)}; + } + } +#endif + switch(offset) { + case 0: return other; break; + case 1: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 1)}; break; + case 2: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 2)}; break; + case 3: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 3)};
break; + case 4: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 4)}; break; + case 5: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 5)}; break; + case 6: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 6)}; break; + case 7: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 7)}; break; + case 8: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 8)}; break; + case 9: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 9)}; break; + case 10: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 10)}; break; + case 11: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 11)}; break; + case 12: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 12)}; break; + case 13: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 13)}; break; + case 14: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 14)}; break; + case 15: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 15)}; break; + default: break; + } + return *this; +} + +template<> +template<> +really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) +{ + return {_mm_shuffle_epi8(u.v128[0], b.u.v128[0])}; +} + +template<> +really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b, uint8_t const len) +{ + SuperVector mask = Ones_vshr(16 -len); + return mask & pshufb(b); +} + +#endif // !defined(FAT_RUNTIME) && !defined(HAVE_AVX2) + +// 256-bit AVX2 implementation +#if !(!defined(RELEASE_BUILD) && defined(FAT_RUNTIME) && defined(HAVE_AVX512)) && defined(HAVE_AVX2) + +template<> +really_inline SuperVector<32>::SuperVector(SuperVector const &other) +{ + u.v256[0] = other.u.v256[0]; +} + +template<> +really_inline SuperVector<32>::SuperVector(typename base_type::type const v) +{ + u.v256[0] = v; +}; + +template<> +template<> +really_inline SuperVector<32>::SuperVector(m128 const v) +{ + u.v256[0] = _mm256_broadcastsi128_si256(v); +}; + +template<> +really_inline SuperVector<32>::SuperVector(m128 const lo, m128 const hi) +{ + u.v128[0] = lo; + u.v128[1] = hi; +}; + +template<> +really_inline SuperVector<32>::SuperVector(SuperVector<16> const lo, SuperVector<16> const hi) +{ + u.v128[0] = lo.u.v128[0]; + u.v128[1] = hi.u.v128[0]; +}; + +template<> +template<> +really_inline SuperVector<32>::SuperVector(int8_t const other) +{ + u.v256[0] = _mm256_set1_epi8(other); +} + +template<> +template<> +really_inline SuperVector<32>::SuperVector(uint8_t const other) +{ + u.v256[0] = _mm256_set1_epi8(static_cast(other)); +} + +template<> +template<> +really_inline SuperVector<32>::SuperVector(int16_t const other) +{ + u.v256[0] = _mm256_set1_epi16(other); +} + +template<> +template<> +really_inline SuperVector<32>::SuperVector(uint16_t const other) +{ + u.v256[0] = _mm256_set1_epi16(static_cast(other)); +} + +template<> +template<> +really_inline SuperVector<32>::SuperVector(int32_t const other) +{ + u.v256[0] = _mm256_set1_epi32(other); +} + +template<> +template<> +really_inline SuperVector<32>::SuperVector(uint32_t const other) +{ + u.v256[0] = _mm256_set1_epi32(static_cast(other)); +} + +template<> +template<> +really_inline SuperVector<32>::SuperVector(int64_t const other) +{ + u.v256[0] = _mm256_set1_epi64x(other); +} + +template<> +template<> +really_inline SuperVector<32>::SuperVector(uint64_t const other) +{ + u.v256[0] = _mm256_set1_epi64x(static_cast(other)); +} + +// Constants +template<> +really_inline SuperVector<32> SuperVector<32>::Ones(void) +{ + return {_mm256_set1_epi8(0xFF)}; +} + +template<> +really_inline SuperVector<32> SuperVector<32>::Zeroes(void) +{ + return {_mm256_set1_epi8(0)}; 
+} + +template <> +really_inline void SuperVector<32>::operator=(SuperVector<32> const &other) +{ + u.v256[0] = other.u.v256[0]; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::operator&(SuperVector<32> const &b) const +{ + return {_mm256_and_si256(u.v256[0], b.u.v256[0])}; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::operator|(SuperVector<32> const &b) const +{ + return {_mm256_or_si256(u.v256[0], b.u.v256[0])}; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::operator^(SuperVector<32> const &b) const +{ + return {_mm256_xor_si256(u.v256[0], b.u.v256[0])}; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::operator!() const +{ + return {_mm256_xor_si256(u.v256[0], u.v256[0])}; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::opandnot(SuperVector<32> const &b) const +{ + return {_mm256_andnot_si256(u.v256[0], b.u.v256[0])}; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::operator==(SuperVector<32> const &b) const +{ + return {_mm256_cmpeq_epi8(u.v256[0], b.u.v256[0])}; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::operator!=(SuperVector<32> const &b) const +{ + return !(*this == b); +} + +template <> +really_inline SuperVector<32> SuperVector<32>::operator>(SuperVector<32> const &b) const +{ + return {_mm256_cmpgt_epi8(u.v256[0], b.u.v256[0])}; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::operator<(SuperVector<32> const &b) const +{ + return (b > *this); +} + +template <> +really_inline SuperVector<32> SuperVector<32>::operator>=(SuperVector<32> const &b) const +{ + return !(*this < b); +} + +template <> +really_inline SuperVector<32> SuperVector<32>::operator<=(SuperVector<32> const &b) const +{ + return !(*this > b); +} + +template <> +really_inline SuperVector<32> SuperVector<32>::eq(SuperVector<32> const &b) const +{ + return (*this == b); +} + +template <> +really_inline typename SuperVector<32>::comparemask_type +SuperVector<32>::comparemask(void) const { + return (u32)_mm256_movemask_epi8(u.v256[0]); +} + +template <> +really_inline typename SuperVector<32>::comparemask_type +SuperVector<32>::eqmask(SuperVector<32> const b) const { + return eq(b).comparemask(); +} + +template <> really_inline u32 SuperVector<32>::mask_width() { return 1; } + +template <> +really_inline typename SuperVector<32>::comparemask_type +SuperVector<32>::iteration_mask( + typename SuperVector<32>::comparemask_type mask) { + return mask; +} + +// template <> +// template +// really_inline SuperVector<32> SuperVector<32>::vshl_8_imm() const +// { +// const uint8_t i = N; +// return {_mm256_slli_epi8(u.v256[0], i)}; +// } + +template <> +template +really_inline SuperVector<32> SuperVector<32>::vshl_16_imm() const +{ + return {_mm256_slli_epi16(u.v256[0], N)}; +} + +template <> +template +really_inline SuperVector<32> SuperVector<32>::vshl_32_imm() const +{ + return {_mm256_slli_epi32(u.v256[0], N)}; +} + +template <> +template +really_inline SuperVector<32> SuperVector<32>::vshl_64_imm() const +{ + return {_mm256_slli_epi64(u.v256[0], N)}; +} + +template <> +template +really_inline SuperVector<32> SuperVector<32>::vshl_128_imm() const +{ + return {_mm256_slli_si256(u.v256[0], N)}; +} + +template <> +template +really_inline SuperVector<32> SuperVector<32>::vshl_256_imm() const +{ + if (N == 0) return *this; + if (N == 16) return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0))}; + if (N == 32) return Zeroes(); + if (N < 16) { + return 
{_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - N)}; + } else { + return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), N - 16)}; + } +} + +template <> +template +really_inline SuperVector<32> SuperVector<32>::vshl_imm() const +{ + return vshl_256_imm(); +} + +// template <> +// template +// really_inline SuperVector<32> SuperVector<32>::vshr_8_imm() const +// { +// return {_mm256_srli_epi8(u.v256[0], N)}; +// } + +template <> +template +really_inline SuperVector<32> SuperVector<32>::vshr_16_imm() const +{ + return {_mm256_srli_epi16(u.v256[0], N)}; +} + +template <> +template +really_inline SuperVector<32> SuperVector<32>::vshr_32_imm() const +{ + return {_mm256_srli_epi32(u.v256[0], N)}; +} + +template <> +template +really_inline SuperVector<32> SuperVector<32>::vshr_64_imm() const +{ + return {_mm256_srli_epi64(u.v256[0], N)}; +} + +template <> +template +really_inline SuperVector<32> SuperVector<32>::vshr_128_imm() const +{ + return {_mm256_srli_si256(u.v256[0], N)}; +} + +template <> +template +really_inline SuperVector<32> SuperVector<32>::vshr_256_imm() const +{ + if (N == 0) return *this; + if (N == 16) return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1))}; + if (N == 32) return Zeroes(); + if (N < 16) { + return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - N)}; + } else { + return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), N - 16)}; + } +} + +template <> +template +really_inline SuperVector<32> SuperVector<32>::vshr_imm() const +{ + return vshr_256_imm(); +} + +#if !defined(HS_OPTIMIZE) +template SuperVector<32> SuperVector<32>::vshl_16_imm<1>() const; +template SuperVector<32> SuperVector<32>::vshl_64_imm<1>() const; +template SuperVector<32> SuperVector<32>::vshl_64_imm<4>() const; +template SuperVector<32> SuperVector<32>::vshl_128_imm<1>() const; +template SuperVector<32> SuperVector<32>::vshl_128_imm<4>() const; +template SuperVector<32> SuperVector<32>::vshr_16_imm<1>() const; +template SuperVector<32> SuperVector<32>::vshr_64_imm<1>() const; +template SuperVector<32> SuperVector<32>::vshr_64_imm<4>() const; +template SuperVector<32> SuperVector<32>::vshr_128_imm<1>() const; +template SuperVector<32> SuperVector<32>::vshr_128_imm<4>() const; +template SuperVector<32> SuperVector<32>::vshr_256_imm<1>() const; +template SuperVector<32> SuperVector<32>::vshr_imm<1>() const; +#endif + +// template <> +// really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const +// { +// Unroller<0, 15>::iterator([&,v=this](int i) { if (N == i) return {_mm256_slli_epi8(v->u.v256[0], i)}; }); +// if (N == 16) return Zeroes(); +// } + +template <> +really_inline SuperVector<32> SuperVector<32>::vshl_16 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 32) return Zeroes(); + SuperVector result; + Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_slli_epi16(v->u.v256[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::vshl_32 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 32) return Zeroes(); + SuperVector result; + Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_slli_epi32(v->u.v256[0], n)}; }); + return result; +} + +template <> 
+really_inline SuperVector<32> SuperVector<32>::vshl_64 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 32) return Zeroes(); + SuperVector result; + Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_slli_epi64(v->u.v256[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::vshl_128(uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 32) return Zeroes(); + SuperVector result; + Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_slli_si256(v->u.v256[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::vshl_256(uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 16) return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0))}; + if (N == 32) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { + constexpr uint8_t n = i.value; + if (N == n) result = {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - n)};; + }); + Unroller<17, 32>::iterator([&,v=this](auto const i) { + constexpr uint8_t n = i.value; + if (N == n) result = {_mm256_slli_si256(_mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), n - 16)}; + }); + return result; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::vshl(uint8_t const N) const +{ + return vshl_256(N); +} + +// template <> +// really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const +// { +// SuperVector<16> result; +// Unroller<0, 15>::iterator([&,v=this](uint8_t const i) { if (N == i) result = {_mm_srli_epi8(v->u.v128[0], i)}; }); +// if (N == 16) result = Zeroes(); +// return result; +// } + +template <> +really_inline SuperVector<32> SuperVector<32>::vshr_16 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 32) return Zeroes(); + SuperVector result; + Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_srli_epi16(v->u.v256[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::vshr_32 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 32) return Zeroes(); + SuperVector result; + Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_srli_epi32(v->u.v256[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::vshr_64 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 32) return Zeroes(); + SuperVector result; + Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_srli_epi64(v->u.v256[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::vshr_128(uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 32) return Zeroes(); + SuperVector result; + Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_srli_si256(v->u.v256[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::vshr_256(uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 16) return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1))}; + if (N == 32) return Zeroes(); + SuperVector result; 
+ Unroller<1, 16>::iterator([&,v=this](auto const i) { + constexpr uint8_t n = i.value; + if (N == n) result = {_mm256_alignr_epi8(_mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), v->u.v256[0], n)}; + }); + Unroller<17, 32>::iterator([&,v=this](auto const i) { + constexpr uint8_t n = i.value; + if (N == n) result = {_mm256_srli_si256(_mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), n - 16)}; + }); + return result; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::vshr(uint8_t const N) const +{ + return vshr_256(N); +} + +template <> +really_inline SuperVector<32> SuperVector<32>::operator>>(uint8_t const N) const +{ +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(N)) { + // As found here: https://stackoverflow.com/questions/25248766/emulating-shifts-on-32-bytes-with-avx + if (N < 16) { + return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], N)}; + } else if (N == 16) { + return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1))}; + } else { + return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), N - 16)}; + } + } +#endif + return vshr_256(N); +} + +template <> +really_inline SuperVector<32> SuperVector<32>::operator<<(uint8_t const N) const +{ +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(N)) { + // As found here: https://stackoverflow.com/questions/25248766/emulating-shifts-on-32-bytes-with-avx + if (N < 16) { + return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - N)}; + } else if (N == 16) { + return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0))}; + } else { + return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), N - 16)}; + } + } +#endif + return vshl_256(N); +} + +template<> +really_inline SuperVector<32> SuperVector<32>::Ones_vshr(uint8_t const N) +{ + if (N == 0) return Ones(); + if (N >= 16) + return {SuperVector<16>::Ones_vshr(N - 16), SuperVector<16>::Zeroes()}; + else + return {SuperVector<16>::Ones(), SuperVector<16>::Ones_vshr(N)}; +} + +template<> +really_inline SuperVector<32> SuperVector<32>::Ones_vshl(uint8_t const N) +{ + if (N == 0) return Ones(); + if (N >= 16) + return {SuperVector<16>::Zeroes(), SuperVector<16>::Ones_vshl(N - 16)}; + else + return {SuperVector<16>::Ones_vshl(N), SuperVector<16>::Ones()}; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::loadu(void const *ptr) +{ + return {_mm256_loadu_si256((const m256 *)ptr)}; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::load(void const *ptr) +{ + assert(ISALIGNED_N(ptr, alignof(SuperVector::size))); + ptr = vectorscan_assume_aligned(ptr, SuperVector::size); + return {_mm256_load_si256((const m256 *)ptr)}; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::loadu_maskz(void const *ptr, uint8_t const len) +{ + SuperVector mask = Ones_vshr(32 -len); + mask.print8("mask"); + SuperVector<32> v = _mm256_loadu_si256((const m256 *)ptr); + v.print8("v"); + return v & mask; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::loadu_maskz(void const *ptr, typename base_type::comparemask_type const mask) +{ + DEBUG_PRINTF("mask = %08llx\n", mask); +#ifdef HAVE_AVX512 + SuperVector<32> v = _mm256_maskz_loadu_epi8(mask, (const m256 *)ptr); + v.print8("v"); + return v; +#else + SuperVector<32> v 
= _mm256_loadu_si256((const m256 *)ptr); + v.print8("v"); + (void)mask; + return v; // FIXME: & mask +#endif +} + +template<> +really_inline typename SuperVector<32>::comparemask_type SuperVector<32>::findLSB(typename SuperVector<32>::comparemask_type &z) +{ + return findAndClearLSB_64(&z); +} + +template<> +really_inline SuperVector<32> SuperVector<32>::alignr(SuperVector<32> &other, int8_t offset) +{ +#if defined(HAVE__BUILTIN_CONSTANT_P) && !(defined(__GNUC__) && (__GNUC__ == 13)) + if (__builtin_constant_p(offset)) { + if (offset == 16) { + return *this; + } else { + return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], offset)}; + } + } +#endif + // As found here: https://stackoverflow.com/questions/8517970/mm-alignr-epi8-palignr-equivalent-in-avx2#8637458 + switch (offset){ + case 0 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 0), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 0)); break; + case 1 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 1), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 1)); break; + case 2 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 2), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 2)); break; + case 3 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 3), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 3)); break; + case 4 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 4), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 4)); break; + case 5 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 5), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 5)); break; + case 6 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 6), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 6)); break; + case 7 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 7), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 7)); break; + case 8 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 8), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 8)); break; + case 9 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 9), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 9)); break; + case 10 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 10), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 10)); break; + case 11 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 11), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 11)); break; + case 12 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 12), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 12)); break; + case 13 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 13), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 13)); break; + case 14 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 14), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 14)); break; + case 15 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 15), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 15)); break; + case 16 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 0), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 0)); break; + case 17 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 1), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 1)); break; + case 18 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 2), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 2)); break; + case 19 : return 
_mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 3), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 3)); break; + case 20 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 4), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 4)); break; + case 21 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 5), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 5)); break; + case 22 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 6), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 6)); break; + case 23 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 7), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 7)); break; + case 24 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 8), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 8)); break; + case 25 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 9), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 9)); break; + case 26 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 10), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 10)); break; + case 27 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 11), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 11)); break; + case 28 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 12), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 12)); break; + case 29 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 13), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 13)); break; + case 30 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 14), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 14)); break; + case 31 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 15), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 15)); break; + default: break; + } + return *this; +} + +template<> +template<> +really_inline SuperVector<32> SuperVector<32>::pshufb(SuperVector<32> b) +{ + return {_mm256_shuffle_epi8(u.v256[0], b.u.v256[0])}; +} + +template<> +really_inline SuperVector<32> SuperVector<32>::pshufb_maskz(SuperVector<32> b, uint8_t const len) +{ + SuperVector<32> mask = Ones_vshr(32 -len); + return mask & pshufb(b); +} + +#endif // HAVE_AVX2 + + +// 512-bit AVX512 implementation +#if defined(HAVE_AVX512) + +template<> +really_inline SuperVector<64>::SuperVector(SuperVector const &o) +{ + u.v512[0] = o.u.v512[0]; +} + +template<> +really_inline SuperVector<64>::SuperVector(typename base_type::type const v) +{ + u.v512[0] = v; +}; + +template<> +template<> +really_inline SuperVector<64>::SuperVector(m256 const v) +{ + u.v512[0] = _mm512_broadcast_i64x4(v); +}; + +template<> +really_inline SuperVector<64>::SuperVector(m256 const lo, m256 const hi) +{ + u.v256[0] = lo; + u.v256[1] = hi; +}; + +template<> +really_inline SuperVector<64>::SuperVector(SuperVector<32> const lo, SuperVector<32> const hi) +{ + u.v256[0] = lo.u.v256[0]; + u.v256[1] = hi.u.v256[0]; +}; + +template<> +template<> +really_inline SuperVector<64>::SuperVector(m128 const v) +{ + u.v512[0] = _mm512_broadcast_i32x4(v); +}; + +template<> +template<> +really_inline SuperVector<64>::SuperVector(int8_t const o) +{ + u.v512[0] = _mm512_set1_epi8(o); +} + +template<> +template<> +really_inline SuperVector<64>::SuperVector(uint8_t const o) +{ + u.v512[0] = _mm512_set1_epi8(static_cast(o)); +} + +template<> +template<> +really_inline SuperVector<64>::SuperVector(int16_t const o) +{ + u.v512[0] = _mm512_set1_epi16(o); +} + +template<> +template<> +really_inline SuperVector<64>::SuperVector(uint16_t const o) +{ + u.v512[0] = _mm512_set1_epi16(static_cast(o)); +} + 
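// The scalar constructors in this AVX-512 block (they continue below for the
// 32-bit and 64-bit types) broadcast a single value across the whole 512-bit
// register, with the unsigned overloads cast to the signed type that the
// corresponding _mm512_set1_epi* intrinsic expects. A minimal usage sketch of
// the matching dup_* wrappers declared in supervector.hpp; the values here are
// purely illustrative:
//   SuperVector<64> spaces = SuperVector<64>::dup_u8(0x20);  // 64 copies of ' '
//   SuperVector<64> zeroes = SuperVector<64>::dup_u16(0);    // 32 zero 16-bit lanes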
+template<> +template<> +really_inline SuperVector<64>::SuperVector(int32_t const o) +{ + u.v512[0] = _mm512_set1_epi32(o); +} + +template<> +template<> +really_inline SuperVector<64>::SuperVector(uint32_t const o) +{ + u.v512[0] = _mm512_set1_epi32(static_cast(o)); +} + +template<> +template<> +really_inline SuperVector<64>::SuperVector(int64_t const o) +{ + u.v512[0] = _mm512_set1_epi64(o); +} + +template<> +template<> +really_inline SuperVector<64>::SuperVector(uint64_t const o) +{ + u.v512[0] = _mm512_set1_epi64(static_cast(o)); +} + +// Constants +template<> +really_inline SuperVector<64> SuperVector<64>::Ones(void) +{ + return {_mm512_set1_epi8(0xFF)}; +} + +template<> +really_inline SuperVector<64> SuperVector<64>::Zeroes(void) +{ + return {_mm512_set1_epi8(0)}; +} + +// Methods +template <> +really_inline void SuperVector<64>::operator=(SuperVector<64> const &o) +{ + u.v512[0] = o.u.v512[0]; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::operator&(SuperVector<64> const &b) const +{ + return {_mm512_and_si512(u.v512[0], b.u.v512[0])}; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::operator|(SuperVector<64> const &b) const +{ + return {_mm512_or_si512(u.v512[0], b.u.v512[0])}; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::operator^(SuperVector<64> const &b) const +{ + return {_mm512_xor_si512(u.v512[0], b.u.v512[0])}; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::operator!() const +{ + return {_mm512_xor_si512(u.v512[0], u.v512[0])}; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::opandnot(SuperVector<64> const &b) const +{ + return {_mm512_andnot_si512(u.v512[0], b.u.v512[0])}; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::operator==(SuperVector<64> const &b) const +{ + SuperVector<64>::comparemask_type mask = + _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]); + return {_mm512_movm_epi8(mask)}; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::operator!=(SuperVector<64> const &b) const +{ + SuperVector<64>::comparemask_type mask = + _mm512_cmpneq_epi8_mask(u.v512[0], b.u.v512[0]); + return {_mm512_movm_epi8(mask)}; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::operator>(SuperVector<64> const &b) const +{ + SuperVector<64>::comparemask_type mask = + _mm512_cmpgt_epi8_mask(u.v512[0], b.u.v512[0]); + return {_mm512_movm_epi8(mask)}; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::operator<(SuperVector<64> const &b) const +{ + SuperVector<64>::comparemask_type mask = + _mm512_cmplt_epi8_mask(u.v512[0], b.u.v512[0]); + return {_mm512_movm_epi8(mask)}; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::operator>=(SuperVector<64> const &b) const +{ + SuperVector<64>::comparemask_type mask = + _mm512_cmpge_epi8_mask(u.v512[0], b.u.v512[0]); + return {_mm512_movm_epi8(mask)}; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::operator<=(SuperVector<64> const &b) const +{ + SuperVector<64>::comparemask_type mask = + _mm512_cmple_epi8_mask(u.v512[0], b.u.v512[0]); + return {_mm512_movm_epi8(mask)}; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::eq(SuperVector<64> const &b) const +{ + return (*this == b); +} + +template <> +really_inline typename SuperVector<64>::comparemask_type +SuperVector<64>::comparemask(void) const { + __m512i msb = _mm512_set1_epi8(0xFF); + __m512i mask = _mm512_and_si512(msb, u.v512[0]); + return _mm512_cmpeq_epi8_mask(mask, msb); +} + +template <> 
+really_inline typename SuperVector<64>::comparemask_type +SuperVector<64>::eqmask(SuperVector<64> const b) const { + return _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]); +} + +template <> really_inline u32 SuperVector<64>::mask_width() { return 1; } + +template <> +really_inline typename SuperVector<64>::comparemask_type +SuperVector<64>::iteration_mask( + typename SuperVector<64>::comparemask_type mask) { + return mask; +} + +// template <> +// template +// really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const +// { +// const uint8_t i = N; +// return {_mm_slli_epi8(u.v128[0], i)}; +// } + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshl_16_imm() const +{ + return {_mm512_slli_epi16(u.v512[0], N)}; +} + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshl_32_imm() const +{ + return {_mm512_slli_epi32(u.v512[0], N)}; +} + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshl_64_imm() const +{ + return {_mm512_slli_epi64(u.v512[0], N)}; +} + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshl_128_imm() const +{ + return {_mm512_bslli_epi128(u.v512[0], N)}; +} + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshl_256_imm() const +{ + return {}; +} + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshl_512_imm() const +{ + return {}; +} + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshl_imm() const +{ + return vshl_512_imm(); +} + +// template <> +// template +// really_inline SuperVector<64> SuperVector<64>::vshr_8_imm() const +// { +// return {_mm_srli_epi8(u.v128[0], N)}; +// } + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshr_16_imm() const +{ + return {_mm512_srli_epi16(u.v512[0], N)}; +} + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshr_32_imm() const +{ + return {_mm512_srli_epi32(u.v512[0], N)}; +} + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshr_64_imm() const +{ + return {_mm512_srli_epi64(u.v512[0], N)}; +} + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshr_128_imm() const +{ + return {_mm512_bsrli_epi128(u.v512[0], N)}; +} + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshr_256_imm() const +{ + return {}; +} + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshr_512_imm() const +{ + return {}; +} + +template <> +template +really_inline SuperVector<64> SuperVector<64>::vshr_imm() const +{ + return vshr_512_imm(); +} + +#if !defined(HS_OPTIMIZE) +template SuperVector<64> SuperVector<64>::vshl_16_imm<1>() const; +template SuperVector<64> SuperVector<64>::vshl_64_imm<1>() const; +template SuperVector<64> SuperVector<64>::vshl_64_imm<4>() const; +template SuperVector<64> SuperVector<64>::vshl_128_imm<1>() const; +template SuperVector<64> SuperVector<64>::vshl_128_imm<4>() const; +template SuperVector<64> SuperVector<64>::vshr_16_imm<1>() const; +template SuperVector<64> SuperVector<64>::vshr_64_imm<1>() const; +template SuperVector<64> SuperVector<64>::vshr_64_imm<4>() const; +template SuperVector<64> SuperVector<64>::vshr_128_imm<1>() const; +template SuperVector<64> SuperVector<64>::vshr_128_imm<4>() const; +#endif + +// template <> +// really_inline SuperVector<64> SuperVector<64>::vshl_8 (uint8_t const N) const +// { +// Unroller<0, 15>::iterator([&,v=this](int i) { if (N == i) return {_mm_slli_epi8(v->u.v128[0], i)}; }); +// if (N == 16) 
return Zeroes(); +// } +template <> +really_inline SuperVector<64> SuperVector<64>::vshl_16 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 64) return Zeroes(); + SuperVector result; + Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_slli_epi16(v->u.v512[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::vshl_32 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 64) return Zeroes(); + SuperVector result; + Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_slli_epi32(v->u.v512[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::vshl_64 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 64) return Zeroes(); + SuperVector result; + Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_slli_epi64(v->u.v512[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::vshl_128(uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 64) return Zeroes(); + SuperVector result; + Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_bslli_epi128(v->u.v512[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::vshl_256(uint8_t const N) const +{ + return vshl_128(N); +} + +template <> +really_inline SuperVector<64> SuperVector<64>::vshl_512(uint8_t const N) const +{ + return vshl_128(N); +} + +template <> +really_inline SuperVector<64> SuperVector<64>::vshl(uint8_t const N) const +{ + return vshl_512(N); +} + +// template <> +// really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const +// { +// SuperVector<16> result; +// Unroller<0, 15>::iterator([&,v=this](uint8_t const i) { if (N == i) result = {_mm_srli_epi8(v->u.v128[0], i)}; }); +// if (N == 16) result = Zeroes(); +// return result; +// } + +template <> +really_inline SuperVector<64> SuperVector<64>::vshr_16 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 64) return Zeroes(); + SuperVector result; + Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_srli_epi16(v->u.v512[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::vshr_32 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 64) return Zeroes(); + SuperVector result; + Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_srli_epi32(v->u.v512[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::vshr_64 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 64) return Zeroes(); + SuperVector result; + Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_srli_epi64(v->u.v512[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::vshr_128(uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 64) return Zeroes(); + SuperVector result; + Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm512_bsrli_epi128(v->u.v512[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<64> 
SuperVector<64>::vshr_256(uint8_t const N) const +{ + return vshr_128(N); +} + +template <> +really_inline SuperVector<64> SuperVector<64>::vshr_512(uint8_t const N) const +{ + return vshr_128(N); +} + +template <> +really_inline SuperVector<64> SuperVector<64>::vshr(uint8_t const N) const +{ + return vshr_512(N); +} + +template<> +really_inline SuperVector<64> SuperVector<64>::Ones_vshr(uint8_t const N) +{ + if (N == 0) return Ones(); + if (N >= 32) + return {SuperVector<32>::Ones_vshr(N - 32), SuperVector<32>::Zeroes()}; + else + return {SuperVector<32>::Ones(), SuperVector<32>::Ones_vshr(N)}; +} + +template<> +really_inline SuperVector<64> SuperVector<64>::Ones_vshl(uint8_t const N) +{ + if (N == 0) return Ones(); + if (N >= 32) + return {SuperVector<32>::Zeroes(), SuperVector<32>::Ones_vshl(N - 32)}; + else + return {SuperVector<32>::Ones_vshl(N), SuperVector<32>::Ones()}; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::operator>>(uint8_t const N) const +{ + if (N == 0) { + return *this; + } else if (N < 32) { + SuperVector<32> lo256 = u.v256[0]; + SuperVector<32> hi256 = u.v256[1]; + SuperVector<32> carry = hi256 << (32 - N); + hi256 = hi256 >> N; + lo256 = (lo256 >> N) | carry; + return SuperVector(lo256, hi256); + } else if (N == 32) { + SuperVector<32> hi256 = u.v256[1]; + return SuperVector(hi256, SuperVector<32>::Zeroes()); + } else if (N < 64) { + SuperVector<32> hi256 = u.v256[1]; + return SuperVector(hi256 >> (N - 32), SuperVector<32>::Zeroes()); + } else { + return Zeroes(); + } +} + +template <> +really_inline SuperVector<64> SuperVector<64>::operator<<(uint8_t const N) const +{ + if (N == 0) { + return *this; + } else if (N < 32) { + SuperVector<32> lo256 = u.v256[0]; + SuperVector<32> hi256 = u.v256[1]; + SuperVector<32> carry = lo256 >> (32 - N); + hi256 = (hi256 << N) | carry; + lo256 = lo256 << N; + return SuperVector(lo256, hi256); + } else if (N == 32) { + SuperVector<32> lo256 = u.v256[0]; + return SuperVector(SuperVector<32>::Zeroes(), lo256); + } else if (N < 64) { + SuperVector<32> lo256 = u.v256[0]; + return SuperVector(SuperVector<32>::Zeroes(), lo256 << (N - 32)); + } else { + return Zeroes(); + } +} + +template <> +really_inline SuperVector<64> SuperVector<64>::loadu(void const *ptr) +{ + return {_mm512_loadu_si512((const m512 *)ptr)}; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::load(void const *ptr) +{ + assert(ISALIGNED_N(ptr, alignof(SuperVector::size))); + ptr = vectorscan_assume_aligned(ptr, SuperVector::size); + return {_mm512_load_si512((const m512 *)ptr)}; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::loadu_maskz(void const *ptr, uint8_t const len) +{ + u64a mask = (~0ULL) >> (64 - len); + DEBUG_PRINTF("mask = %016llx\n", mask); + SuperVector<64> v = _mm512_maskz_loadu_epi8(mask, (const m512 *)ptr); + v.print8("v"); + return v; +} + +template <> +really_inline SuperVector<64> SuperVector<64>::loadu_maskz(void const *ptr, typename base_type::comparemask_type const mask) +{ + DEBUG_PRINTF("mask = %016llx\n", mask); + SuperVector<64> v = _mm512_maskz_loadu_epi8(mask, (const m512 *)ptr); + v.print8("v"); + return v; +} + +template<> +really_inline typename SuperVector<64>::comparemask_type SuperVector<64>::findLSB(typename SuperVector<64>::comparemask_type &z) +{ + return findAndClearLSB_64(&z); +} + +template<> +template<> +really_inline SuperVector<64> SuperVector<64>::pshufb(SuperVector<64> b) +{ + return {_mm512_shuffle_epi8(u.v512[0], b.u.v512[0])}; +} + +template<> +really_inline 
SuperVector<64> SuperVector<64>::pshufb_maskz(SuperVector<64> b, uint8_t const len) +{ + u64a mask = (~0ULL) >> (64 - len); + DEBUG_PRINTF("mask = %016llx\n", mask); + return {_mm512_maskz_shuffle_epi8(mask, u.v512[0], b.u.v512[0])}; +} + +template<> +really_inline SuperVector<64> SuperVector<64>::alignr(SuperVector<64> &l, int8_t offset) +{ +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(offset)) { + if (offset == 16) { + return *this; + } else { + return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], offset)}; + } + } +#endif + if(offset == 0) { + return *this; + } else if (offset < 32){ + SuperVector<32> lo256 = u.v256[0]; + SuperVector<32> hi256 = u.v256[1]; + SuperVector<32> o_lo256 = l.u.v256[0]; + SuperVector<32> carry1 = hi256.alignr(lo256,offset); + SuperVector<32> carry2 = o_lo256.alignr(hi256,offset); + return SuperVector(carry1, carry2); + } else if (offset <= 64){ + SuperVector<32> hi256 = u.v256[1]; + SuperVector<32> o_lo256 = l.u.v256[0]; + SuperVector<32> o_hi256 = l.u.v256[1]; + SuperVector<32> carry1 = o_lo256.alignr(hi256, offset - 32); + SuperVector<32> carry2 = o_hi256.alignr(o_lo256,offset -32); + return SuperVector(carry1, carry2); + } else { + return *this; + } +} + +#endif // HAVE_AVX512 + +#endif // SIMD_IMPL_HPP diff --git a/src/util/supervector/arch/x86/types.hpp b/src/util/supervector/arch/x86/types.hpp new file mode 100644 index 000000000..b63327819 --- /dev/null +++ b/src/util/supervector/arch/x86/types.hpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#if !defined(m128) && defined(HAVE_SSE2) +typedef __m128i m128; +#endif + +#if !defined(m256) && defined(HAVE_AVX2) +typedef __m256i m256; +#endif + +#if !defined(m512) && defined(HAVE_AVX512) +typedef __m512i m512; +#endif \ No newline at end of file diff --git a/src/util/supervector/casemask.hpp b/src/util/supervector/casemask.hpp new file mode 100644 index 000000000..10fa5f1a6 --- /dev/null +++ b/src/util/supervector/casemask.hpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef CASEMASK_HPP +#define CASEMASK_HPP + +#include "util/supervector/supervector.hpp" + +static u8 CASEMASK[] = { 0xff, 0xdf }; + +static really_inline +u8 caseClear8(u8 x, bool noCase) +{ + return static_cast(x & CASEMASK[(u8)noCase]); +} + +template +static really_inline SuperVector getMask(u8 c, bool noCase) { + u8 k = caseClear8(c, noCase); + return SuperVector(k); +} + +template +static really_inline SuperVector getCaseMask(void) { + return SuperVector(CASEMASK[1]); +} + +#endif // CASEMASK_HPP \ No newline at end of file diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp new file mode 100644 index 000000000..6d2bc8092 --- /dev/null +++ b/src/util/supervector/supervector.hpp @@ -0,0 +1,382 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SUPERVECTOR_HPP +#define SUPERVECTOR_HPP + +#include +#include +#include + +#if defined(VS_SIMDE_BACKEND) +#include "util/supervector/arch/x86/types.hpp" +#else +#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#include "util/supervector/arch/x86/types.hpp" +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#include "util/supervector/arch/arm/types.hpp" +#elif defined(ARCH_PPC64EL) +#include "util/supervector/arch/ppc64el/types.hpp" +#endif +#endif // VS_SIMDE_BACKEND + +#include + +// Define a common assume_aligned using an appropriate compiler built-in, if +// it's available. Note that we need to handle C or C++ compilation. +#ifdef __cplusplus +# ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED +# define vectorscan_assume_aligned(x, y) __builtin_assume_aligned((x), (y)) +# endif +#else +# ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED +# define vectorscan_assume_aligned(x, y) __builtin_assume_aligned((x), (y)) +# endif +#endif + +// Fallback to identity case. 
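// vectorscan_assume_aligned() only passes an alignment hint to the compiler;
// the aligned loaders apply it immediately before the aligned intrinsic, as in
// this sketch following SuperVector<64>::load() from earlier in this patch:
//   ptr = vectorscan_assume_aligned(ptr, SuperVector::size);
//   return {_mm512_load_si512((const m512 *)ptr)};
// The identity fallback below returns the pointer unchanged, so when neither
// builtin is available only the optimisation hint is lost.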
+#ifndef vectorscan_assume_aligned +#define vectorscan_assume_aligned(x, y) (x) +#endif + +template +class SuperVector; + +using m128_t = SuperVector<16>; +using m256_t = SuperVector<32>; +using m512_t = SuperVector<64>; +using m1024_t = SuperVector<128>; + +// struct for inferring what underlying types to use +template +struct BaseVector +{ + static constexpr bool is_valid = false; + static constexpr u16 size = 8; + using type = void; + using comparemask_type = void; + static constexpr bool has_previous = false; + using previous_type = void; + static constexpr u16 previous_size = 4; +}; + +template <> +struct BaseVector<128> +{ + static constexpr bool is_valid = true; + static constexpr u16 size = 128; + using type = void; + using comparemask_type = u64a; + static constexpr bool has_previous = true; + using previous_type = m512; + static constexpr u16 previous_size = 64; +}; + +template <> +struct BaseVector<64> +{ + static constexpr bool is_valid = true; + static constexpr u16 size = 64; + using type = m512; + using comparemask_type = u64a; + static constexpr bool has_previous = true; + using previous_type = m256; + static constexpr u16 previous_size = 32; +}; + +// 256 bit implementation +template <> +struct BaseVector<32> +{ + static constexpr bool is_valid = true; + static constexpr u16 size = 32; + using type = m256; + using comparemask_type = u64a; + static constexpr bool has_previous = true; + using previous_type = m128; + static constexpr u16 previous_size = 16; +}; + +// 128 bit implementation +template <> +struct BaseVector<16> +{ + static constexpr bool is_valid = true; + static constexpr u16 size = 16; + using type = m128; +#if defined(ARCH_ARM32) || defined(ARCH_AARCH64) + using comparemask_type = u64a; +#else + using comparemask_type = u32; +#endif + static constexpr bool has_previous = false; + using previous_type = u64a; + static constexpr u16 previous_size = 8; +}; + +template +class SuperVector : public BaseVector +{ + static_assert(BaseVector::is_valid, "invalid SuperVector size"); + +public: + + using base_type = BaseVector; + using previous_type = typename BaseVector::previous_type; + + union { + typename BaseVector<16>::type ALIGN_ATTR(BaseVector<16>::size) v128[SIZE / BaseVector<16>::size]; + typename BaseVector<32>::type ALIGN_ATTR(BaseVector<32>::size) v256[SIZE / BaseVector<32>::size]; + typename BaseVector<64>::type ALIGN_ATTR(BaseVector<64>::size) v512[SIZE / BaseVector<64>::size]; + +#if !defined(VS_SIMDE_BACKEND) && (defined(ARCH_ARM32) || defined(ARCH_AARCH64) || defined(ARCH_PPC64EL)) + uint64x2_t ALIGN_ATTR(BaseVector<16>::size) u64x2[SIZE / BaseVector<16>::size]; + int64x2_t ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size]; + uint32x4_t ALIGN_ATTR(BaseVector<16>::size) u32x4[SIZE / BaseVector<16>::size]; + int32x4_t ALIGN_ATTR(BaseVector<16>::size) s32x4[SIZE / BaseVector<16>::size]; + uint16x8_t ALIGN_ATTR(BaseVector<16>::size) u16x8[SIZE / BaseVector<16>::size]; + int16x8_t ALIGN_ATTR(BaseVector<16>::size) s16x8[SIZE / BaseVector<16>::size]; + uint8x16_t ALIGN_ATTR(BaseVector<16>::size) u8x16[SIZE / BaseVector<16>::size]; + int8x16_t ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size]; +#endif + + uint64_t u64[SIZE / sizeof(uint64_t)]; + int64_t s64[SIZE / sizeof(int64_t)]; + uint32_t u32[SIZE / sizeof(uint32_t)]; + int32_t s32[SIZE / sizeof(int32_t)]; + uint16_t u16[SIZE / sizeof(uint16_t)]; + int16_t s16[SIZE / sizeof(int16_t)]; + uint8_t u8[SIZE / sizeof(uint8_t)]; + int8_t s8[SIZE / sizeof(int8_t)]; + float 
f32[SIZE / sizeof(float)]; + double f64[SIZE / sizeof(double)]; + } u; + + constexpr SuperVector() {}; + SuperVector(SuperVector const &other) + :u(other.u) {}; + SuperVector(typename base_type::type const v); + + template + SuperVector(T const other); + + SuperVector(SuperVector const lo, SuperVector const hi); + SuperVector(previous_type const lo, previous_type const hi); + + static SuperVector dup_u8 (uint8_t other) { return {other}; }; + static SuperVector dup_s8 (int8_t other) { return {other}; }; + static SuperVector dup_u16(uint16_t other) { return {other}; }; + static SuperVector dup_s16(int16_t other) { return {other}; }; + static SuperVector dup_u32(uint32_t other) { return {other}; }; + static SuperVector dup_s32(int32_t other) { return {other}; }; + static SuperVector dup_u64(uint64_t other) { return {other}; }; + static SuperVector dup_s64(int64_t other) { return {other}; }; + + void operator=(SuperVector const &other); + + SuperVector operator&(SuperVector const &b) const; + SuperVector operator|(SuperVector const &b) const; + SuperVector operator^(SuperVector const &b) const; + SuperVector operator!() const; + + SuperVector operator==(SuperVector const &b) const; + SuperVector operator!=(SuperVector const &b) const; + SuperVector operator>(SuperVector const &b) const; + SuperVector operator>=(SuperVector const &b) const; + SuperVector operator<(SuperVector const &b) const; + SuperVector operator<=(SuperVector const &b) const; + + SuperVector opand(SuperVector const &b) const { return *this & b; } + SuperVector opor (SuperVector const &b) const { return *this | b; } + SuperVector opxor(SuperVector const &b) const { return *this ^ b; } + SuperVector opandnot(SuperVector const &b) const; + SuperVector opnot() const { return !(*this); } + + SuperVector eq(SuperVector const &b) const; + SuperVector operator<<(uint8_t const N) const; + SuperVector operator>>(uint8_t const N) const; + // Returns mask_width groups of zeros or ones. To get the mask which can be + // iterated, use iteration_mask method, it ensures only one bit is set per + // mask_width group. + // Precondition: all bytes must be 0 or 0xff. + typename base_type::comparemask_type comparemask(void) const; + typename base_type::comparemask_type eqmask(SuperVector const b) const; + static u32 mask_width(); + // Returns a mask with at most 1 bit set to 1. It can be used to iterate + // over bits through ctz/clz and lowest bit clear. 
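// A hedged sketch of the intended scan loop over these masks (the names data
// and pattern are illustrative, not from this patch): take eqmask() or
// comparemask(), reduce it with iteration_mask() so each matching byte
// contributes one bit, then walk the bits with findLSB(), which also clears
// the bit it returns:
//   auto z = SuperVector<64>::iteration_mask(data.eqmask(pattern));
//   while (z) {
//       auto bit = SuperVector<64>::findLSB(z);               // lowest set bit, then cleared
//       size_t offset = bit / SuperVector<64>::mask_width();  // byte offset within the vector
//       /* report a match at offset ... */
//   }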
+ static typename base_type::comparemask_type + iteration_mask(typename base_type::comparemask_type mask); + + static typename base_type::comparemask_type load_mask(uint8_t const len) { return (((1ULL) << (len)) - 1ULL); } + static typename base_type::comparemask_type findLSB(typename base_type::comparemask_type &z); + static SuperVector loadu(void const *ptr); + static SuperVector load(void const *ptr); + static SuperVector loadu_maskz(void const *ptr, uint8_t const len); + static SuperVector loadu_maskz(void const *ptr, typename base_type::comparemask_type const len); + SuperVector alignr(SuperVector &other, int8_t offset); + + template + SuperVector pshufb(SuperVector b); + SuperVector pshufb_maskz(SuperVector b, uint8_t const len); + + // Shift instructions + template + SuperVector vshl_8_imm() const; + template + SuperVector vshr_8_imm() const; + template + SuperVector vshl_16_imm() const; + template + SuperVector vshr_16_imm() const; + template + SuperVector vshl_32_imm() const; + template + SuperVector vshr_32_imm() const; + template + SuperVector vshl_64_imm() const; + template + SuperVector vshr_64_imm() const; + template + SuperVector vshl_128_imm() const; + template + SuperVector vshr_128_imm() const; + #if defined(HAVE_SIMD_256_BITS) + template + SuperVector vshl_256_imm() const; + template + SuperVector vshr_256_imm() const; + #endif + #if defined(HAVE_SIMD_512_BITS) + template + SuperVector vshl_512_imm() const; + template + SuperVector vshr_512_imm() const; + #endif + template + SuperVector vshl_imm() const; + template + SuperVector vshr_imm() const; + SuperVector vshl_8 (uint8_t const N) const; + SuperVector vshr_8 (uint8_t const N) const; + SuperVector vshl_16 (uint8_t const N) const; + SuperVector vshr_16 (uint8_t const N) const; + SuperVector vshl_32 (uint8_t const N) const; + SuperVector vshr_32 (uint8_t const N) const; + SuperVector vshl_64 (uint8_t const N) const; + SuperVector vshr_64 (uint8_t const N) const; + SuperVector vshl_128(uint8_t const N) const; + SuperVector vshr_128(uint8_t const N) const; + #if defined(HAVE_SIMD_256_BITS) + SuperVector vshl_256(uint8_t const N) const; + SuperVector vshr_256(uint8_t const N) const; + #endif + #if defined(HAVE_SIMD_512_BITS) + SuperVector vshl_512(uint8_t const N) const; + SuperVector vshr_512(uint8_t const N) const; + #endif + SuperVector vshl (uint8_t const N) const; + SuperVector vshr (uint8_t const N) const; + + // Constants + static SuperVector Ones(); + static SuperVector Ones_vshr(uint8_t const N); + static SuperVector Ones_vshl(uint8_t const N); + static SuperVector Zeroes(); + + #if defined(DEBUG) + void print8(const char *label) const { + printf("%12s: ", label); + for(s16 i=SIZE-1; i >= 0; i--) + printf("%02x ", u.u8[i]); + printf("\n"); + } + + void print16(const char *label) const { + printf("%12s: ", label); + for(s16 i=SIZE/sizeof(u16)-1; i >= 0; i--) + printf("%04x ", u.u16[i]); + printf("\n"); + } + + void print32(const char *label) const { + printf("%12s: ", label); + for(s16 i=SIZE/sizeof(u32)-1; i >= 0; i--) + printf("%08x ", u.u32[i]); + printf("\n"); + } + + void print64(const char *label) const { + printf("%12s: ", label); + for(s16 i=SIZE/sizeof(u64a)-1; i >= 0; i--) + printf("%016lx ", u.u64[i]); + printf("\n"); + } +#else + void print8(const char *label UNUSED) const {}; + void print16(const char *label UNUSED) const {}; + void print32(const char *label UNUSED) const {}; + void print64(const char *label UNUSED) const {}; +#endif +}; + +template +struct Unroller +{ + template + static void 
iterator(Action &&action) + { + action(std::integral_constant()); + Unroller::iterator(action); + } +}; + +template +struct Unroller +{ + template + static void iterator(Action &&action UNUSED) + {} +}; + +#if defined(HS_OPTIMIZE) +#if defined(VS_SIMDE_BACKEND) +#include "util/supervector/arch/x86/impl.cpp" +#else +#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#include "util/supervector/arch/x86/impl.cpp" +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#include "util/supervector/arch/arm/impl.cpp" +#elif defined(ARCH_PPC64EL) +#include "util/supervector/arch/ppc64el/impl.cpp" +#endif +#endif +#endif + +#endif /* SUPERVECTOR_H */ + diff --git a/src/util/target_info.cpp b/src/util/target_info.cpp index 3a41e0207..9bd343426 100644 --- a/src/util/target_info.cpp +++ b/src/util/target_info.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -29,7 +29,10 @@ #include "hs_compile.h" // for various hs_platform_info flags #include "target_info.h" -#include "util/cpuid_flags.h" +#include "util/arch/common/cpuid_flags.h" +#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#endif namespace ue2 { @@ -50,6 +53,10 @@ bool target_t::can_run_on_code_built_for(const target_t &code_target) const { return false; } + if (!has_avx512vbmi() && code_target.has_avx512vbmi()) { + return false; + } + return true; } @@ -64,6 +71,10 @@ bool target_t::has_avx512(void) const { return cpu_features & HS_CPU_FEATURES_AVX512; } +bool target_t::has_avx512vbmi(void) const { + return cpu_features & HS_CPU_FEATURES_AVX512VBMI; +} + bool target_t::is_atom_class(void) const { return tune == HS_TUNE_FAMILY_SLM || tune == HS_TUNE_FAMILY_GLM; } diff --git a/src/util/target_info.h b/src/util/target_info.h index 794b29855..f64573aed 100644 --- a/src/util/target_info.h +++ b/src/util/target_info.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -42,6 +42,8 @@ struct target_t { bool has_avx512(void) const; + bool has_avx512vbmi(void) const; + bool is_atom_class(void) const; // This asks: can this target (the object) run on code that was built for diff --git a/src/util/ue2_graph.h b/src/util/ue2_graph.h index b8e2e935d..aa9718d73 100644 --- a/src/util/ue2_graph.h +++ b/src/util/ue2_graph.h @@ -176,7 +176,7 @@ class vertex_descriptor : totally_ordered> { vertex_descriptor() : p(nullptr), serial(0) {} explicit vertex_descriptor(vertex_node *pp) : p(pp), serial(pp->serial) {} - operator bool() const { return p; } + explicit operator bool() const { return p; } bool operator<(const vertex_descriptor b) const { if (p && b.p) { /* no vertices in the same graph can have the same serial */ diff --git a/src/util/ue2string.h b/src/util/ue2string.h index 0aa846896..f436936d7 100644 --- a/src/util/ue2string.h +++ b/src/util/ue2string.h @@ -133,7 +133,7 @@ struct ue2_literal : totally_ordered { : lit(&lit_in), idx(idx_in) {} const ue2_literal *lit = nullptr; - size_t idx; + size_t idx = 0; }; using const_reverse_iterator = std::reverse_iterator; diff --git a/src/util/unaligned.h b/src/util/unaligned.h index 299e5677c..a8fba6b1c 100644 --- a/src/util/unaligned.h +++ 
b/src/util/unaligned.h @@ -35,12 +35,7 @@ #include "ue2common.h" -#if !defined(_WIN32) #define PACKED__MAY_ALIAS __attribute__((packed, may_alias)) -#else -#define PACKED__MAY_ALIAS -#pragma pack(push, 1) // pack everything until told otherwise -#endif /// Perform an unaligned 16-bit load static really_inline @@ -89,9 +84,6 @@ void unaligned_store_u64a(void *ptr, u64a val) { struct unaligned *uptr = (struct unaligned *)ptr; uptr->u = val; } -#if defined(_WIN32) -#pragma pack(pop) -#endif // win32 #undef PACKED__MAY_ALIAS diff --git a/src/util/uniform_ops.h b/src/util/uniform_ops.h index 3385e4418..1c39c936d 100644 --- a/src/util/uniform_ops.h +++ b/src/util/uniform_ops.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -101,6 +101,18 @@ #define or_m384(a, b) (or384(a, b)) #define or_m512(a, b) (or512(a, b)) +#if defined(HAVE_AVX512VBMI) +#define broadcast_m128(a) (broadcast128(a)) +#define broadcast_m256(a) (broadcast256(a)) +#define broadcast_m384(a) (broadcast384(a)) +#define broadcast_m512(a) (a) + +#define shuffle_byte_m128(a, b) (pshufb_m512(b, a)) +#define shuffle_byte_m256(a, b) (vpermb512(a, b)) +#define shuffle_byte_m384(a, b) (vpermb512(a, b)) +#define shuffle_byte_m512(a, b) (vpermb512(a, b)) +#endif + #define and_u8(a, b) ((a) & (b)) #define and_u32(a, b) ((a) & (b)) #define and_u64a(a, b) ((a) & (b)) diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 6ca3fd8a9..e35e65e00 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -9,18 +9,11 @@ include_directories(${PROJECT_SOURCE_DIR}) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) include_directories(${PROJECT_SOURCE_DIR}/util) -if (WIN32) - add_subdirectory(hscheck) - add_subdirectory(hsbench) - add_subdirectory(hsdump) - add_subdirectory(hscollider) -else() - # add any subdir with a cmake file - file(GLOB dirents RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *) - foreach(e ${dirents}) - if(IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/${e} AND - EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${e}/CMakeLists.txt) - add_subdirectory(${e}) - endif () - endforeach () -endif() +# add any subdir with a cmake file +file(GLOB dirents RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *) +foreach(e ${dirents}) + if(IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/${e} AND + EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${e}/CMakeLists.txt) + add_subdirectory(${e}) + endif () +endforeach () \ No newline at end of file diff --git a/tools/fuzz/aristocrats.py b/tools/fuzz/aristocrats.py index 7b6ff2bf3..96169582a 100755 --- a/tools/fuzz/aristocrats.py +++ b/tools/fuzz/aristocrats.py @@ -33,13 +33,13 @@ def generateRandomOptions(): parser.error("incorrect number of arguments") if (options.full): - crange = range(0,256) + crange = list(range(0,256)) crange.remove(ord('\n')) else: - crange = range(32, 127) + crange = list(range(32, 127)) -for i in xrange(0, options.count): +for i in range(0, options.count): len = randint(1, options.depth) - s = [ chr(choice(crange)) for x in xrange(len) ] + s = [ chr(choice(crange)) for x in range(len) ] line = str(i) + ":/" + "".join(s) + "/" + generateRandomOptions() - print line + print(line) diff --git a/tools/fuzz/completocrats.py b/tools/fuzz/completocrats.py index 60ac4d7ef..63ef0b91e 100755 --- a/tools/fuzz/completocrats.py +++ b/tools/fuzz/completocrats.py @@ -23,17 +23,17 @@ parser.error("incorrect number of arguments") 
if (options.full): - crange = range(0,256) + crange = list(range(0,256)) crange.remove(ord('\n')) elif (options.limited): crange = [ ord(c) for c in LIMITED_ALPHABET ] else: - crange = range(32, 127) + crange = list(range(32, 127)) srange = [ chr(c) for c in crange ] i = 0 for x in product(srange, repeat = options.depth): line = str(i) + ":/" + "".join(x) + "/" - print line + print(line) i += 1 diff --git a/tools/fuzz/heuristocrats.py b/tools/fuzz/heuristocrats.py index 49c7acb43..abd6f8ae9 100755 --- a/tools/fuzz/heuristocrats.py +++ b/tools/fuzz/heuristocrats.py @@ -9,7 +9,7 @@ def chooseLeafWidth(nChildren): width = randint(1, 5) width = min(width, nChildren-1) - s = sample(range(1, nChildren), width) + s = sample(list(range(1, nChildren)), width) s.sort() s = [0] + s + [nChildren] v = [ s[i+1] - s[i] for i in range(0, len(s)-1) if s[i+1] != s[i] ] @@ -73,7 +73,7 @@ def generateCharClass(nChildren, atTop = False): else: nChars = randint(2,4) - for i in xrange(nChars): + for i in range(nChars): s += generateChar(1) return "[" + s + "]" @@ -247,13 +247,13 @@ def generateRandomExtParam(depth, extparam): if len(args) != 0: parser.error("incorrect number of arguments") -alphabet = range(ord('a'), ord('a') + options.alphabet) +alphabet = list(range(ord('a'), ord('a') + options.alphabet)) if options.nocase: - alphabet += range(ord('A'), ord('A') + options.alphabet) + alphabet += list(range(ord('A'), ord('A') + options.alphabet)) -for i in xrange(0, options.count): - print "%08d:/%s/%s%s" % (i, generateRE(randint(1, options.depth), atTop = True), generateRandomOptions(), generateRandomExtParam(options.depth, options.extparam)) +for i in range(0, options.count): + print("%08d:/%s/%s%s" % (i, generateRE(randint(1, options.depth), atTop = True), generateRandomOptions(), generateRandomExtParam(options.depth, options.extparam))) if options.logical: - for i in xrange(options.count, options.count + 3000): - print "%08d:/%s/C" % (i, generateCombination(randint(1, options.depth), atTop = True)) + for i in range(options.count, options.count + 3000): + print("%08d:/%s/C" % (i, generateCombination(randint(1, options.depth), atTop = True))) diff --git a/tools/hsbench/CMakeLists.txt b/tools/hsbench/CMakeLists.txt index bbceda41c..42ab4ccbb 100644 --- a/tools/hsbench/CMakeLists.txt +++ b/tools/hsbench/CMakeLists.txt @@ -58,19 +58,10 @@ if (BUILD_CHIMERA) ) add_executable(hsbench ${hsbench_SOURCES}) include_directories(${PCRE_INCLUDE_DIRS}) - if(NOT WIN32) - target_link_libraries(hsbench hs chimera ${PCRE_LDFLAGS} databaseutil - expressionutil ${SQLITE3_LDFLAGS} ${CMAKE_THREAD_LIBS_INIT}) - else() - target_link_libraries(hsbench hs chimera pcre databaseutil - expressionutil ${SQLITE3_LDFLAGS} ${CMAKE_THREAD_LIBS_INIT}) - endif() + target_link_libraries(hsbench hs chimera ${PCRE_LDFLAGS} databaseutil + expressionutil ${SQLITE3_LDFLAGS} ${CMAKE_THREAD_LIBS_INIT}) else() - if(WIN32 AND (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)) - add_executable(hsbench ${hsbench_SOURCES} $ $) - else() - add_executable(hsbench ${hsbench_SOURCES}) - endif() + add_executable(hsbench ${hsbench_SOURCES}) target_link_libraries(hsbench hs databaseutil expressionutil ${SQLITE3_LDFLAGS} ${CMAKE_THREAD_LIBS_INIT}) endif() diff --git a/tools/hsbench/data_corpus.cpp b/tools/hsbench/data_corpus.cpp index 8e761ec34..b23da1fb3 100644 --- a/tools/hsbench/data_corpus.cpp +++ b/tools/hsbench/data_corpus.cpp @@ -58,7 +58,10 @@ void readRow(sqlite3_stmt *statement, vector &blocks, } auto internal_stream_index = stream_indices[stream_id]; - 
assert(blob || bytes > 0); + if (!(blob && bytes > 0)) { + assert(0); + throw std::domain_error("Invalid blob or bytes from sqlite3."); + } blocks.emplace_back(id, stream_id, internal_stream_index, string(blob, blob + bytes)); } diff --git a/tools/hsbench/engine.h b/tools/hsbench/engine.h index e41f9948c..aea1c8162 100644 --- a/tools/hsbench/engine.h +++ b/tools/hsbench/engine.h @@ -88,6 +88,8 @@ class Engine : boost::noncopyable { virtual void printStats() const = 0; + virtual void printCsvStats() const = 0; + virtual void sqlStats(SqlDB &db) const = 0; }; diff --git a/tools/hsbench/engine_chimera.cpp b/tools/hsbench/engine_chimera.cpp index 8a15c5bee..9bc5ab223 100644 --- a/tools/hsbench/engine_chimera.cpp +++ b/tools/hsbench/engine_chimera.cpp @@ -38,8 +38,6 @@ #include "chimera/ch_database.h" -#include "util/make_unique.h" - using namespace std; EngineCHContext::EngineCHContext(const ch_database_t *db) { @@ -105,7 +103,7 @@ EngineChimera::~EngineChimera() { } unique_ptr EngineChimera::makeContext() const { - return ue2::make_unique(db); + return std::make_unique(db); } void EngineChimera::scan(const char *data, unsigned int len, unsigned int id, @@ -168,23 +166,22 @@ void EngineChimera::printStats() const { } printf("Signatures: %s\n", compile_stats.signatures.c_str()); printf("Chimera info: %s\n", compile_stats.db_info.c_str()); -#ifndef _WIN32 printf("Expression count: %'zu\n", compile_stats.expressionCount); printf("Bytecode size: %'zu bytes\n", compile_stats.compiledSize); -#else - printf("Expression count: %zu\n", compile_stats.expressionCount); - printf("Bytecode size: %zu bytes\n", compile_stats.compiledSize); -#endif printf("Database CRC: 0x%x\n", compile_stats.crc32); -#ifndef _WIN32 printf("Scratch size: %'zu bytes\n", compile_stats.scratchSize); printf("Compile time: %'0.3Lf seconds\n", compile_stats.compileSecs); printf("Peak heap usage: %'u bytes\n", compile_stats.peakMemorySize); -#else - printf("Scratch size: %zu bytes\n", compile_stats.scratchSize); - printf("Compile time: %0.3Lf seconds\n", compile_stats.compileSecs); - printf("Peak heap usage: %u bytes\n", compile_stats.peakMemorySize); -#endif +} + +void EngineChimera::printCsvStats() const { + printf(",\"%s\"", compile_stats.signatures.c_str()); + printf(",\"%zu\"", compile_stats.expressionCount); + printf(",\"0x%x\"", compile_stats.crc32); + printf(",\"%zu\"", compile_stats.compiledSize); + printf(",\"%zu\"", compile_stats.scratchSize); + printf(",\"%0.3Lf\"", compile_stats.compileSecs); + printf(",\"%u\"", compile_stats.peakMemorySize); } void EngineChimera::sqlStats(SqlDB &sqldb) const { @@ -323,5 +320,5 @@ buildEngineChimera(const ExpressionMap &expressions, const string &name, cs.compileSecs = compileSecs; cs.peakMemorySize = peakMemorySize; - return ue2::make_unique(db, move(cs)); + return std::make_unique(db, move(cs)); } diff --git a/tools/hsbench/engine_chimera.h b/tools/hsbench/engine_chimera.h index 8e2cd0f6c..187dec8cb 100644 --- a/tools/hsbench/engine_chimera.h +++ b/tools/hsbench/engine_chimera.h @@ -89,6 +89,8 @@ class EngineChimera : public Engine { void printStats() const; + void printCsvStats() const; + void sqlStats(SqlDB &db) const; private: diff --git a/tools/hsbench/engine_hyperscan.cpp b/tools/hsbench/engine_hyperscan.cpp index 79c58f77d..0256dc973 100644 --- a/tools/hsbench/engine_hyperscan.cpp +++ b/tools/hsbench/engine_hyperscan.cpp @@ -42,7 +42,6 @@ #include "hs_internal.h" #include "hs_runtime.h" #include "util/database_util.h" -#include "util/make_unique.h" #include #include @@ -126,7 
+125,7 @@ EngineHyperscan::~EngineHyperscan() { } unique_ptr EngineHyperscan::makeContext() const { - return ue2::make_unique(db); + return std::make_unique(db); } void EngineHyperscan::scan(const char *data, unsigned int len, unsigned int id, @@ -166,7 +165,7 @@ void EngineHyperscan::scan_vectored(const char *const *data, unique_ptr EngineHyperscan::streamOpen(EngineContext &ectx, unsigned streamId) const { EngineHSContext &ctx = static_cast(ectx); - auto stream = ue2::make_unique(); + auto stream = std::make_unique(); stream->ctx = &ctx; hs_open_stream(db, 0, &stream->id); @@ -175,7 +174,7 @@ unique_ptr EngineHyperscan::streamOpen(EngineContext &ectx, return nullptr; } stream->sn = streamId; - return move(stream); + return std::move(stream); } void EngineHyperscan::streamClose(unique_ptr stream, @@ -250,30 +249,26 @@ void EngineHyperscan::printStats() const { } printf("Signatures: %s\n", compile_stats.signatures.c_str()); printf("Hyperscan info: %s\n", compile_stats.db_info.c_str()); -#ifndef _WIN32 printf("Expression count: %'zu\n", compile_stats.expressionCount); printf("Bytecode size: %'zu bytes\n", compile_stats.compiledSize); -#else - printf("Expression count: %zu\n", compile_stats.expressionCount); - printf("Bytecode size: %zu bytes\n", compile_stats.compiledSize); -#endif printf("Database CRC: 0x%x\n", compile_stats.crc32); if (compile_stats.streaming) { -#ifndef _WIN32 printf("Stream state size: %'zu bytes\n", compile_stats.streamSize); -#else - printf("Stream state size: %zu bytes\n", compile_stats.streamSize); -#endif } -#ifndef _WIN32 printf("Scratch size: %'zu bytes\n", compile_stats.scratchSize); printf("Compile time: %'0.3Lf seconds\n", compile_stats.compileSecs); printf("Peak heap usage: %'u bytes\n", compile_stats.peakMemorySize); -#else - printf("Scratch size: %zu bytes\n", compile_stats.scratchSize); - printf("Compile time: %0.3Lf seconds\n", compile_stats.compileSecs); - printf("Peak heap usage: %u bytes\n", compile_stats.peakMemorySize); -#endif +} + +void EngineHyperscan::printCsvStats() const { + printf(",\"%s\"", compile_stats.signatures.c_str()); + printf(",\"%zu\"", compile_stats.expressionCount); + printf(",\"0x%x\"", compile_stats.crc32); + printf(",\"%zu\"", compile_stats.compiledSize); + printf(",\"%zu\"", compile_stats.streamSize); + printf(",\"%zu\"", compile_stats.scratchSize); + printf(",\"%0.3Lf\"", compile_stats.compileSecs); + printf(",\"%u\"", compile_stats.peakMemorySize); } void EngineHyperscan::sqlStats(SqlDB &sqldb) const { @@ -538,5 +533,5 @@ buildEngineHyperscan(const ExpressionMap &expressions, ScanMode scan_mode, cs.compileSecs = compileSecs; cs.peakMemorySize = peakMemorySize; - return ue2::make_unique(db, std::move(cs)); + return std::make_unique(db, std::move(cs)); } diff --git a/tools/hsbench/engine_hyperscan.h b/tools/hsbench/engine_hyperscan.h index a8105d753..afbdf098d 100644 --- a/tools/hsbench/engine_hyperscan.h +++ b/tools/hsbench/engine_hyperscan.h @@ -65,8 +65,8 @@ class EngineHSContext : public EngineContext { class EngineHSStream : public EngineStream { public: ~EngineHSStream(); - hs_stream_t *id; - EngineHSContext *ctx; + hs_stream_t *id = nullptr; + EngineHSContext *ctx = nullptr; }; /** Hyperscan Engine for scanning data. 
*/ @@ -98,6 +98,8 @@ class EngineHyperscan : public Engine { void printStats() const; + void printCsvStats() const; + void sqlStats(SqlDB &db) const; private: diff --git a/tools/hsbench/engine_pcre.cpp b/tools/hsbench/engine_pcre.cpp index 85616e987..65fd6a2fb 100644 --- a/tools/hsbench/engine_pcre.cpp +++ b/tools/hsbench/engine_pcre.cpp @@ -26,9 +26,6 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#ifdef _WIN32 -#define PCRE_STATIC -#endif #include "config.h" #include "common.h" @@ -38,7 +35,6 @@ #include "sqldb.h" #include "timer.h" -#include "util/make_unique.h" #include "util/unicode_def.h" #include @@ -105,7 +101,7 @@ EnginePCRE::~EnginePCRE() { } unique_ptr EnginePCRE::makeContext() const { - return ue2::make_unique(capture_cnt); + return std::make_unique(capture_cnt); } void EnginePCRE::scan(const char *data, unsigned int len, unsigned int id, @@ -212,19 +208,20 @@ void EnginePCRE::printStats() const { } printf("Signatures: %s\n", compile_stats.signatures.c_str()); printf("PCRE info: %s\n", compile_stats.db_info.c_str()); -#ifndef _WIN32 printf("Expression count: %'zu\n", compile_stats.expressionCount); printf("Bytecode size: %'zu bytes\n", compile_stats.compiledSize); printf("Scratch size: %'zu bytes\n", compile_stats.scratchSize); printf("Compile time: %'0.3Lf seconds\n", compile_stats.compileSecs); printf("Peak heap usage: %'u bytes\n", compile_stats.peakMemorySize); -#else - printf("Expression count: %zu\n", compile_stats.expressionCount); - printf("Bytecode size: %zu bytes\n", compile_stats.compiledSize); - printf("Scratch size: %zu bytes\n", compile_stats.scratchSize); - printf("Compile time: %0.3Lf seconds\n", compile_stats.compileSecs); - printf("Peak heap usage: %u bytes\n", compile_stats.peakMemorySize); -#endif +} + +void EnginePCRE::printCsvStats() const { + printf(",\"%s\"", compile_stats.signatures.c_str()); + printf(",\"%zu\"", compile_stats.expressionCount); + printf(",\"%zu\"", compile_stats.compiledSize); + printf(",\"%zu\"", compile_stats.scratchSize); + printf(",\"%0.3Lf\"", compile_stats.compileSecs); + printf(",\"%u\"", compile_stats.peakMemorySize); } void EnginePCRE::sqlStats(SqlDB &sqldb) const { @@ -313,7 +310,7 @@ buildEnginePcre(const ExpressionMap &expressions, const string &name, for (const auto &m : expressions) { string expr(m.second); unsigned int flags = 0; - auto pcreDB = ue2::make_unique(); + auto pcreDB = std::make_unique(); if (!decodeExprPCRE(expr, &flags, *pcreDB)) { printf("Error parsing PCRE: %s (id %u)\n", m.second.c_str(), m.first); @@ -397,5 +394,5 @@ buildEnginePcre(const ExpressionMap &expressions, const string &name, cs.compileSecs = compileSecs; cs.peakMemorySize = peakMemorySize; - return ue2::make_unique(move(dbs), move(cs), capture_cnt); + return std::make_unique(move(dbs), move(cs), capture_cnt); } diff --git a/tools/hsbench/engine_pcre.h b/tools/hsbench/engine_pcre.h index 2e7dad9c5..9569bef48 100644 --- a/tools/hsbench/engine_pcre.h +++ b/tools/hsbench/engine_pcre.h @@ -62,7 +62,7 @@ class EnginePCREContext : public EngineContext{ struct PcreDB { bool highlander = false; bool utf8 = false; - u32 id; + u32 id = 0; pcre *db = nullptr; pcre_extra *extra = nullptr; }; @@ -97,6 +97,8 @@ class EnginePCRE : public Engine { void printStats() const; + void printCsvStats() const; + void sqlStats(SqlDB &db) const; private: diff --git a/tools/hsbench/huge.cpp b/tools/hsbench/huge.cpp index dbb453b29..2fa15ebf7 100644 --- a/tools/hsbench/huge.cpp +++ b/tools/hsbench/huge.cpp @@ -34,7 +34,6 @@ #include "common.h" #include "huge.h" -#ifndef _WIN32 
#include #include #include @@ -189,13 +188,3 @@ long gethugepagesize(void) { return hpage_size; } - -#else - -/* No huge page support on WIN32. */ - -hs_database_t *get_huge(hs_database_t *db) { return db; } - -void release_huge(hs_database_t *db) { hs_free_database(db); } - -#endif diff --git a/tools/hsbench/main.cpp b/tools/hsbench/main.cpp index 4e65c8e0b..1a19d510f 100644 --- a/tools/hsbench/main.cpp +++ b/tools/hsbench/main.cpp @@ -45,7 +45,6 @@ #include "grey.h" #include "hs.h" #include "ue2common.h" -#include "util/make_unique.h" #include #include @@ -58,18 +57,12 @@ #include #include -#ifndef _WIN32 #include -#else -#include "win_getopt.h" -#endif -#ifndef _WIN32 #include #if defined(HAVE_PTHREAD_NP_H) #include #endif #include -#endif #include #include @@ -98,6 +91,7 @@ bool display_per_scan = false; ScanMode scan_mode = ScanMode::STREAMING; bool useHybrid = false; bool usePcre = false; +bool dumpCsvOut = false; unsigned repeats = 20; string exprPath(""); string corpusFile(""); @@ -117,7 +111,7 @@ class ThreadContext : boost::noncopyable { thread_barrier &tb_in, thread_func_t function_in, vector corpus_data_in) : num(num_in), results(repeats), engine(db_in), - enginectx(db_in.makeContext()), corpus_data(move(corpus_data_in)), + enginectx(db_in.makeContext()), corpus_data(std::move(corpus_data_in)), tb(tb_in), function(function_in) {} // Start the thread. @@ -144,15 +138,6 @@ class ThreadContext : boost::noncopyable { // Apply processor affinity (if available) to this thread. bool affine(UNUSED int cpu) { -#if defined(_WIN32) - SYSTEM_INFO system_info; - GetSystemInfo(&system_info); - assert(cpu >= 0 && (DWORD)cpu < system_info.dwNumberOfProcessors); - DWORD_PTR mask = 1 << cpu; - DWORD_PTR rv = SetThreadAffinityMask(thr.native_handle(), mask); - return rv != 0; -#endif - #ifdef HAVE_DECL_PTHREAD_SETAFFINITY_NP #if defined(__FreeBSD__) cpuset_t cpuset; @@ -206,11 +191,12 @@ void usage(const char *error) { printf(" -H Benchmark using Chimera (if supported).\n"); printf(" -P Benchmark using PCRE (if supported).\n"); #endif -#if defined(HAVE_DECL_PTHREAD_SETAFFINITY_NP) || defined(_WIN32) +#if defined(HAVE_DECL_PTHREAD_SETAFFINITY_NP) printf(" -T CPU,CPU,... or -T CPU-CPU\n"); printf(" Benchmark with threads on specified CPUs or CPU" " range.\n"); #endif + printf(" -C Dump CSV output for tput matrix.\n"); printf(" -i DIR Don't compile, load from files in DIR" " instead.\n"); printf(" -w DIR After compiling, save to files in DIR.\n"); @@ -233,7 +219,7 @@ void usage(const char *error) { /** Wraps up a name and the set of signature IDs it refers to. 
*/ struct BenchmarkSigs { BenchmarkSigs(string name_in, SignatureSet sigs_in) - : name(move(name_in)), sigs(move(sigs_in)) {} + : name(std::move(name_in)), sigs(std::move(sigs_in)) {} string name; SignatureSet sigs; }; @@ -243,7 +229,7 @@ static void processArgs(int argc, char *argv[], vector &sigSets, UNUSED unique_ptr &grey) { const char options[] = "-b:c:Cd:e:E:G:hHi:n:No:p:PsS:Vw:z:" -#if defined(HAVE_DECL_PTHREAD_SETAFFINITY_NP) || defined(_WIN32) +#if defined(HAVE_DECL_PTHREAD_SETAFFINITY_NP) "T:" // add the thread flag #endif ; @@ -275,6 +261,9 @@ void processArgs(int argc, char *argv[], vector &sigSets, case 'c': corpusFile.assign(optarg); break; + case 'C': + dumpCsvOut = true; + break; case 'd': { unsigned dist; if (!fromString(optarg, dist)) { @@ -352,7 +341,7 @@ void processArgs(int argc, char *argv[], vector &sigSets, case 'S': sigName.assign(optarg); break; -#if defined(HAVE_DECL_PTHREAD_SETAFFINITY_NP) || defined(_WIN32) +#if defined(HAVE_DECL_PTHREAD_SETAFFINITY_NP) case 'T': if (!strToList(optarg, threadCores)) { usage("Couldn't parse argument to -T flag, should be" @@ -468,7 +457,7 @@ void processArgs(int argc, char *argv[], vector &sigSets, for (const auto &file : sigFiles) { SignatureSet sigs; loadSignatureList(file, sigs); - sigSets.emplace_back(file, move(sigs)); + sigSets.emplace_back(file, std::move(sigs)); } useLiteralApi = (bool)literalFlag; @@ -601,7 +590,7 @@ void benchStreamingInternal(ThreadContext *ctx, vector &streams, // if this was the last block in the stream, close the stream handle if (b.id == stream.last_block_id) { - e.streamClose(move(stream.eng_handle), r); + e.streamClose(std::move(stream.eng_handle), r); stream.eng_handle = nullptr; } } @@ -727,11 +716,7 @@ void displayPerScanResults(const vector> &threads, for (size_t j = 0; j != results.size(); j++) { const auto &r = results[j]; double mbps = calc_mbps(r.seconds, bytesPerRun); -#ifndef _WIN32 printf("T %2u Scan %2zu: %'0.2f Mbit/sec\n", t->num, j, mbps); -#else - printf("T %2u Scan %2zu: %0.2f Mbit/sec\n", t->num, j, mbps); -#endif } } printf("\n"); @@ -755,6 +740,11 @@ u64a byte_size(const vector &corpus_blocks) { total += block.payload.size(); } + if (total == 0) { + assert(0); + throw std::invalid_argument("Empty corpus."); + } + return total; } @@ -776,7 +766,6 @@ void displayResults(const vector> &threads, } } -#ifndef _WIN32 printf("Time spent scanning: %'0.3f seconds\n", totalSecs); printf("Corpus size: %'llu bytes ", bytesPerRun); switch (scan_mode) { @@ -792,56 +781,22 @@ void displayResults(const vector> &threads, printf("(%'zu blocks)\n", corpus_blocks.size()); break; } -#else - printf("Time spent scanning: %0.3f seconds\n", totalSecs); - printf("Corpus size: %llu bytes ", bytesPerRun); - switch (scan_mode) { - case ScanMode::STREAMING: - printf("(%zu blocks in %llu streams)\n", corpus_blocks.size(), - count_streams(corpus_blocks)); - break; - case ScanMode::VECTORED: - printf("(%zu blocks in %llu vectors)\n", corpus_blocks.size(), - count_streams(corpus_blocks)); - break; - case ScanMode::BLOCK: - printf("(%zu blocks)\n", corpus_blocks.size()); - break; - } -#endif u64a totalBytes = bytesPerRun * repeats * threads.size(); u64a totalBlocks = corpus_blocks.size() * repeats * threads.size(); double matchRate = ((double)matchesPerRun * 1024) / bytesPerRun; -#ifndef _WIN32 printf("Matches per iteration: %'llu (%'0.3f matches/kilobyte)\n", matchesPerRun, matchRate); -#else - printf("Matches per iteration: %llu (%0.3f matches/kilobyte)\n", - matchesPerRun, matchRate); -#endif double 
blockRate = (double)totalBlocks / (double)totalSecs; -#ifndef _WIN32 printf("Overall block rate: %'0.2f blocks/sec\n", blockRate); printf("Mean throughput (overall): %'0.2Lf Mbit/sec\n", calc_mbps(totalSecs, totalBytes)); -#else - printf("Overall block rate: %0.2f blocks/sec\n", blockRate); - printf("Mean throughput (overall): %0.2Lf Mbit/sec\n", - calc_mbps(totalSecs, totalBytes)); - -#endif double lowestScanTime = fastestResult(threads); -#ifndef _WIN32 printf("Max throughput (per core): %'0.2Lf Mbit/sec\n", calc_mbps(lowestScanTime, bytesPerRun)); -#else - printf("Max throughput (per core): %0.2Lf Mbit/sec\n", - calc_mbps(lowestScanTime, bytesPerRun)); -#endif printf("\n"); if (display_per_scan) { @@ -849,6 +804,40 @@ void displayResults(const vector> &threads, } } +/** Dump benchmark results to csv. */ +static +void displayCsvResults(const vector> &threads, + const vector &corpus_blocks) { + u64a bytesPerRun = byte_size(corpus_blocks); + u64a matchesPerRun = threads[0]->results[0].matches; + + // Sanity check: all of our results should have the same match count. + for (const auto &t : threads) { + if (!all_of(begin(t->results), end(t->results), + [&matchesPerRun](const ResultEntry &e) { + return e.matches == matchesPerRun; + })) { + printf("\nWARNING: PER-SCAN MATCH COUNTS ARE INCONSISTENT!\n\n"); + break; + } + } + + u64a totalBytes = bytesPerRun * repeats * threads.size(); + u64a totalBlocks = corpus_blocks.size() * repeats * threads.size(); + printf(",\"%0.3f\"", totalSecs); + printf(",\"%0.2Lf\"", calc_mbps(totalSecs, totalBytes)); + + assert(bytesPerRun); + double matchRate = ((double)matchesPerRun * 1024) / bytesPerRun; + printf(",\"%llu\"", matchesPerRun); + printf(",\"%0.3f\"", matchRate); + + double blockRate = (double)totalBlocks / (double)totalSecs; + printf(",\"%0.2f\"", blockRate); + printf("\n"); +} + + /** Dump per-scan throughput data to sql. */ static void sqlPerScanResults(const vector> &threads, @@ -940,7 +929,7 @@ unique_ptr makeThreadContext(const Engine &db, } assert(fn); - return ue2::make_unique(id, db, sync_barrier, fn, blocks); + return std::make_unique(id, db, sync_barrier, fn, blocks); } /** Run the given benchmark. */ @@ -954,7 +943,7 @@ void runBenchmark(const Engine &db, numThreads = 1; } else { numThreads = threadCores.size(); -#if defined(HAVE_DECL_PTHREAD_SETAFFINITY_NP) || defined(_WIN32) +#if defined(HAVE_DECL_PTHREAD_SETAFFINITY_NP) useAffinity = true; #else useAffinity = false; @@ -974,7 +963,7 @@ void runBenchmark(const Engine &db, printf("Unable to start processing thread %u\n", i); exit(1); } - threads.push_back(move(t)); + threads.push_back(std::move(t)); } // Reap threads. @@ -982,7 +971,9 @@ void runBenchmark(const Engine &db, t->join(); } - if (sqloutFile.empty()) { + if (dumpCsvOut) { + displayCsvResults(threads, corpus_blocks); + } else if (sqloutFile.empty()) { // Display global results. displayResults(threads, corpus_blocks); } else { @@ -1020,7 +1011,7 @@ int HS_CDECL main(int argc, char *argv[]) { for (auto i : exprMapTemplate | map_keys) { sigs.push_back(i); } - sigSets.emplace_back(exprPath, move(sigs)); + sigSets.emplace_back(exprPath, std::move(sigs)); } // read in and process our corpus @@ -1059,7 +1050,9 @@ int HS_CDECL main(int argc, char *argv[]) { exit(1); } - if (sqloutFile.empty()) { + if (dumpCsvOut) { + engine->printCsvStats(); + } else if (sqloutFile.empty()) { // Display global results. 
engine->printStats(); printf("\n"); diff --git a/tools/hsbench/scripts/gutenbergCorpus.py b/tools/hsbench/scripts/gutenbergCorpus.py index 62752a4d2..71a6d32d6 100755 --- a/tools/hsbench/scripts/gutenbergCorpus.py +++ b/tools/hsbench/scripts/gutenbergCorpus.py @@ -16,7 +16,7 @@ def addBlocks(builder, block_size, stream_size, text_id, text): global stream_id global stream_bytes - print "text", text_id, "len", len(text) + print("text", text_id, "len", len(text)) i = 0 while i < len(text): chunk = text[i:min(len(text), i + block_size)] @@ -26,11 +26,11 @@ def addBlocks(builder, block_size, stream_size, text_id, text): if stream_bytes >= stream_size: stream_id += 1 stream_bytes = 0 - print "Text", text_id, ": added", i/block_size, "blocks of", block_size, "bytes." + print("Text", text_id, ": added", i/block_size, "blocks of", block_size, "bytes.") def buildCorpus(outFN, block_size, stream_size, text_ids): if len(text_ids) == 0: - print >>sys.stderr, "Must provide at least one input ID" + print("Must provide at least one input ID", file=sys.stderr) sys.exit(0) builder = CorpusBuilder(outFN) @@ -48,12 +48,12 @@ def buildCorpus(outFN, block_size, stream_size, text_ids): builder.finish() - print "Total:", total_bytes, "bytes." + print("Total:", total_bytes, "bytes.") def usage(exeName): errmsg = "Usage: %s -o -b -s ..." errmsg = errmsg % exeName - print >> sys.stderr, errmsg + print(errmsg, file=sys.stderr) sys.exit(-1) if __name__ == '__main__': @@ -62,7 +62,7 @@ def usage(exeName): requiredKeys = [ '-o', '-b', '-s' ] for k in requiredKeys: - if not opts.has_key(k): + if k not in opts: usage(os.path.basename(sys.argv[0])) buildCorpus(opts['-o'], int(opts['-b']), int(opts['-s']), args) diff --git a/tools/hsbench/scripts/linebasedCorpus.py b/tools/hsbench/scripts/linebasedCorpus.py index b27f8674f..7af07d28c 100755 --- a/tools/hsbench/scripts/linebasedCorpus.py +++ b/tools/hsbench/scripts/linebasedCorpus.py @@ -15,13 +15,13 @@ def lineCorpus(inFN, outFN): ''' if not os.path.exists(inFN): - print >> sys.stderr, "Input file '%s' does not exist. Exiting." % outFN + print("Input file '%s' does not exist. Exiting." % outFN, file=sys.stderr) sys.exit(-1) lines = open(inFN).readlines() if len(lines) == 0: - print >> sys.stderr, "Input file contained no lines. Exiting." + print("Input file contained no lines. Exiting.", file=sys.stderr) sys.exit(0) builder = CorpusBuilder(outFN) @@ -37,7 +37,7 @@ def lineCorpus(inFN, outFN): def usage(exeName): errmsg = "Usage: %s -i -o " errmsg = errmsg % exeName - print >> sys.stderr, errmsg + print(errmsg, file=sys.stderr) sys.exit(-1) if __name__ == '__main__': @@ -46,7 +46,7 @@ def usage(exeName): requiredKeys = [ '-i', '-o' ] for k in requiredKeys: - if not args.has_key(k): + if k not in args: usage(os.path.basename(sys.argv[0])) fnArgs = tuple([args[k] for k in requiredKeys]) diff --git a/tools/hsbench/scripts/pcapCorpus.py b/tools/hsbench/scripts/pcapCorpus.py index 30d6192c6..3efba805d 100755 --- a/tools/hsbench/scripts/pcapCorpus.py +++ b/tools/hsbench/scripts/pcapCorpus.py @@ -35,7 +35,7 @@ def usage(exeName) : errmsg = "Usage: %s -i -o " errmsg = errmsg % exeName - print >> sys.stderr, errmsg + print(errmsg, file=sys.stderr) sys.exit(-1) class FiveTuple(object): @@ -208,7 +208,7 @@ def enchunk_pcap(pcapFN, sqliteFN): """ if not os.path.exists(pcapFN): - print >> sys.stderr, "Input file '%s' does not exist. Exiting." % pcapFN + print("Input file '%s' does not exist. Exiting." 
% pcapFN, file=sys.stderr) sys.exit(-1) builder = CorpusBuilder(sqliteFN) @@ -225,7 +225,7 @@ def enchunk_pcap(pcapFN, sqliteFN): while not done: try: - ts, packet = pcap_ref.next() + ts, packet = next(pcap_ref) except: break @@ -285,10 +285,10 @@ def enchunk_pcap(pcapFN, sqliteFN): # Having read the contents of the pcap, we fill the database with any # remaining TCP and UDP segments # - for tcp_stream in tcp_streams.itervalues(): + for tcp_stream in tcp_streams.values(): db_add_tcp_stream_segments(builder, tcp_stream) - for udp_stream in udp_streams.itervalues(): + for udp_stream in udp_streams.values(): db_add_udp_stream_segments(builder, udp_stream) # @@ -303,7 +303,7 @@ def enchunk_pcap(pcapFN, sqliteFN): requiredKeys = [ '-i', '-o'] for k in requiredKeys : - if not args.has_key(k) : + if k not in args : usage(os.path.basename(sys.argv[0])) fnArgs = tuple([ args[k] for k in requiredKeys ]) diff --git a/tools/hscheck/CMakeLists.txt b/tools/hscheck/CMakeLists.txt index 2ae06137c..0ac4bdd73 100644 --- a/tools/hscheck/CMakeLists.txt +++ b/tools/hscheck/CMakeLists.txt @@ -10,20 +10,8 @@ if (BUILD_CHIMERA) include_directories(${PCRE_INCLUDE_DIRS}) add_definitions(-DHS_HYBRID) add_executable(hscheck ${hscheck_SOURCES}) - if(NOT WIN32) - target_link_libraries(hscheck hs chimera ${PCRE_LDFLAGS} expressionutil pthread) - else() - target_link_libraries(hscheck hs chimera pcre expressionutil) - endif() + target_link_libraries(hscheck hs chimera ${PCRE_LDFLAGS} expressionutil pthread) else() - if(WIN32 AND (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)) - add_executable(hscheck ${hscheck_SOURCES} $ $) - else() - add_executable(hscheck ${hscheck_SOURCES}) - endif() - if(NOT WIN32) - target_link_libraries(hscheck hs expressionutil pthread) - else() - target_link_libraries(hscheck hs expressionutil) - endif() + add_executable(hscheck ${hscheck_SOURCES}) + target_link_libraries(hscheck hs expressionutil pthread) endif() diff --git a/tools/hscheck/main.cpp b/tools/hscheck/main.cpp index 197087bba..f3e9419ac 100644 --- a/tools/hscheck/main.cpp +++ b/tools/hscheck/main.cpp @@ -52,7 +52,6 @@ #include "expressions.h" #include "string_util.h" #include "util/expression_path.h" -#include "util/make_unique.h" #include "grey.h" #include "hs_compile.h" @@ -70,11 +69,8 @@ #include #include #include -#ifndef _WIN32 #include -#else -#include "win_getopt.h" -#endif + #include using namespace std; @@ -664,7 +660,7 @@ int HS_CDECL main(int argc, char **argv) { num_of_threads = max(1u, std::thread::hardware_concurrency()); #if !defined(RELEASE_BUILD) - g_grey = make_unique(); + g_grey = std::make_unique(); #endif processArgs(argc, argv, g_grey); diff --git a/tools/hscollider/CMakeLists.txt b/tools/hscollider/CMakeLists.txt index a4d71b2fd..f9e71404c 100644 --- a/tools/hscollider/CMakeLists.txt +++ b/tools/hscollider/CMakeLists.txt @@ -17,7 +17,7 @@ CHECK_FUNCTION_EXISTS(sigaction HAVE_SIGACTION) CHECK_FUNCTION_EXISTS(setrlimit HAVE_SETRLIMIT) set_source_files_properties( - ${CMAKE_CURRENT_BINARY_DIR}/ColliderCorporaParser.cpp + ColliderCorporaParser.cpp PROPERTIES COMPILE_FLAGS "${RAGEL_C_FLAGS} -I${CMAKE_CURRENT_SOURCE_DIR}") @@ -65,31 +65,20 @@ set_source_files_properties(${hscollider_SOURCES} PROPERTIES add_executable(hscollider ${hscollider_SOURCES}) add_dependencies(hscollider ragel_ColliderCorporaParser) -if(NOT WIN32) - if (BUILD_CHIMERA) - target_link_libraries(hscollider hs chimera ${PCRE_LDFLAGS} databaseutil - expressionutil corpusomatic crosscompileutil pthread - "${BACKTRACE_LDFLAGS}") - else() - 
target_link_libraries(hscollider hs ${PCRE_LDFLAGS} databaseutil - expressionutil corpusomatic crosscompileutil pthread - "${BACKTRACE_LDFLAGS}") - endif() +if (BUILD_CHIMERA) + target_link_libraries(hscollider hs chimera ${PCRE_LDFLAGS} databaseutil + expressionutil corpusomatic crosscompileutil pthread + "${BACKTRACE_LDFLAGS}") +else() + target_link_libraries(hscollider hs ${PCRE_LDFLAGS} databaseutil + expressionutil corpusomatic crosscompileutil pthread + "${BACKTRACE_LDFLAGS}") +endif() if(HAVE_BACKTRACE) set_source_files_properties(hscollider_SOURCES COMPILE_FLAGS "${BACKTRACE_CFLAGS}") endif() -else() # WIN32 - set_target_properties(hscollider PROPERTIES LINK_FLAGS "/STACK:8388608,8388608") - if (BUILD_CHIMERA) - target_link_libraries(hscollider hs chimera pcre databaseutil - expressionutil corpusomatic crosscompileutil) - else() - target_link_libraries(hscollider hs pcre databaseutil - expressionutil corpusomatic crosscompileutil) - endif() -endif() add_custom_target( collide_quick_test diff --git a/tools/hscollider/ColliderCorporaParser.rl b/tools/hscollider/ColliderCorporaParser.rl index ab40b2ba3..04e8f6feb 100644 --- a/tools/hscollider/ColliderCorporaParser.rl +++ b/tools/hscollider/ColliderCorporaParser.rl @@ -57,6 +57,7 @@ char unhex(const char *start, UNUSED const char *end) { %%{ machine FileCorporaParser; + alphtype unsigned char; action accumulateNum { num = (num * 10) + (fc - '0'); diff --git a/tools/hscollider/DatabaseProxy.h b/tools/hscollider/DatabaseProxy.h index 831ab1484..f6957d296 100644 --- a/tools/hscollider/DatabaseProxy.h +++ b/tools/hscollider/DatabaseProxy.h @@ -61,7 +61,7 @@ class DatabaseProxy : boost::noncopyable { std::lock_guard lock(mutex); if (failed) { // We have previously failed to compile this database. - return nullptr; + throw CompileFailed("Unable to compile db previously."); } if (db) { return db; diff --git a/tools/hscollider/GraphTruth.cpp b/tools/hscollider/GraphTruth.cpp index 0b67b11c5..bd18d655a 100644 --- a/tools/hscollider/GraphTruth.cpp +++ b/tools/hscollider/GraphTruth.cpp @@ -50,7 +50,6 @@ #include "parser/unsupported.h" #include "parser/logical_combination.h" #include "util/compile_context.h" -#include "util/make_unique.h" #include "util/report_manager.h" #include @@ -131,10 +130,10 @@ void CNGInfo::compile() { try { if (combination) { - auto pl = ue2::make_unique(); + auto pl = std::make_unique(); pl->parseLogicalCombination(id, re.c_str(), ~0U, 0, ~0ULL); pl->logicalKeyRenumber(); - cng = make_unique(move(pl)); + cng = make_unique(std::move(pl)); return; } @@ -148,7 +147,7 @@ void CNGInfo::compile() { // original expression starts with (*UTF8) utf8 |= pe.expr.utf8; - auto rm = ue2::make_unique(cc.grey); + auto rm = std::make_unique(cc.grey); // Expressions containing zero-width assertions and other extended pcre // types aren't supported yet. This call will throw a ParseError @@ -193,7 +192,7 @@ void CNGInfo::compile() { } } - cng = make_unique(move(g), move(rm)); + cng = make_unique(std::move(g), std::move(rm)); } catch (CompileError &e) { throw NGCompileFailure(e.reason); } catch (NGUnsupportedFailure &e) { diff --git a/tools/hscollider/GroundTruth.cpp b/tools/hscollider/GroundTruth.cpp index a2673063c..7b92c408a 100644 --- a/tools/hscollider/GroundTruth.cpp +++ b/tools/hscollider/GroundTruth.cpp @@ -26,9 +26,6 @@ * POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifdef _WIN32 -#define PCRE_STATIC -#endif #include "config.h" #include "common.h" @@ -42,7 +39,6 @@ #include "parser/control_verbs.h" #include "parser/Parser.h" #include "parser/parse_error.h" -#include "util/make_unique.h" #include "util/string_util.h" #include "util/unicode_def.h" #include "util/unordered.h" @@ -331,7 +327,7 @@ GroundTruth::compile(unsigned id, bool no_callouts) { int errloc = 0; int errcode = 0; - unique_ptr compiled = make_unique(); + unique_ptr compiled = std::make_unique(); compiled->utf8 = flags & PCRE_UTF8; compiled->highlander = highlander; compiled->prefilter = prefilter; diff --git a/tools/hscollider/NfaGeneratedCorpora.cpp b/tools/hscollider/NfaGeneratedCorpora.cpp index 66ae270be..072138899 100644 --- a/tools/hscollider/NfaGeneratedCorpora.cpp +++ b/tools/hscollider/NfaGeneratedCorpora.cpp @@ -101,13 +101,13 @@ void NfaGeneratedCorpora::generate(unsigned id, vector &data) { pl.logicalKeyRenumber(); const auto &m_lkey = pl.getLkeyMap(); assert(!m_lkey.empty()); - u32 a_subid; // arbitrary sub id + u32 a_subid = 0; // arbitrary sub id unordered_map> m_data; for (const auto &it : m_lkey) { a_subid = it.first; vector sub_data; generate(a_subid, sub_data); - m_data.emplace(a_subid, move(sub_data)); + m_data.emplace(a_subid, std::move(sub_data)); } assert(!m_data.empty()); size_t num_corpus = m_data[a_subid].size(); diff --git a/tools/hscollider/Thread.cpp b/tools/hscollider/Thread.cpp index 5fff82398..c63793d96 100644 --- a/tools/hscollider/Thread.cpp +++ b/tools/hscollider/Thread.cpp @@ -98,6 +98,6 @@ void *Thread::runThread(void *thr) { } -Thread::Thread(size_t num) : thread_id(num) {} +Thread::Thread(size_t num) : thread_id(num), thread() {} Thread::~Thread() {} diff --git a/tools/hscollider/UltimateTruth.cpp b/tools/hscollider/UltimateTruth.cpp index 038fbf777..93d432c30 100644 --- a/tools/hscollider/UltimateTruth.cpp +++ b/tools/hscollider/UltimateTruth.cpp @@ -39,7 +39,6 @@ #include "crc32.h" #include "hs.h" #include "hs_internal.h" -#include "util/make_unique.h" #include "scratch.h" #include "nfa/nfa_api_queue.h" @@ -948,7 +947,7 @@ compileHyperscan(vector &patterns, vector &flags, return nullptr; } - return ue2::make_unique(db, idsvec.begin(), idsvec.end()); + return std::make_unique(db, idsvec.begin(), idsvec.end()); } #ifdef HS_HYBRID @@ -970,7 +969,7 @@ compileHybrid(vector &patterns, return nullptr; } - return ue2::make_unique(db, idsvec.begin(), idsvec.end()); + return std::make_unique(db, idsvec.begin(), idsvec.end()); } #endif @@ -1080,7 +1079,7 @@ shared_ptr UltimateTruth::compile(const set &ids, } } - return move(db); + return std::move(db); } bool UltimateTruth::allocScratch(shared_ptr db) { diff --git a/tools/hscollider/args.cpp b/tools/hscollider/args.cpp index 2eb510e00..3fe48f933 100644 --- a/tools/hscollider/args.cpp +++ b/tools/hscollider/args.cpp @@ -46,11 +46,7 @@ #include #include #include -#ifndef _WIN32 #include -#else -#include "win_getopt.h" -#endif #define xstr(s) str(s) #define str(s) #s @@ -503,8 +499,8 @@ void processArgs(int argc, char *argv[], CorpusProperties &corpus_gen_prop, } else if (in_corpora) { corpora->push_back(optarg); in_corpora = 2; - break; } + break; case 0: break; default: diff --git a/tools/hscollider/main.cpp b/tools/hscollider/main.cpp index afa6ef5a9..dcc5c1b69 100644 --- a/tools/hscollider/main.cpp +++ b/tools/hscollider/main.cpp @@ -52,7 +52,6 @@ #include "parser/utf8_validate.h" #include "ue2common.h" #include "util/container.h" -#include "util/make_unique.h" #include #include @@ -1077,7 +1076,7 @@ 
void addCorporaToQueue(ostream &out, BoundedQueue &testq, unsigned id, size_t corpus_id = 0; for (const Corpus &corpus : c) { - tests.push_back(ue2::make_unique(id, corpus_id, corpus, cpcre, + tests.push_back(std::make_unique(id, corpus_id, corpus, cpcre, cngi, ue2, multi, utf8, highlander, prefilter, som)); corpus_id++; @@ -1189,7 +1188,7 @@ struct CorpusGenUnit { CorpusGenUnit(unique_ptr cngi_in, unique_ptr pcre_in, shared_ptr ue2_in, unsigned expr_id, bool multi_in, bool utf8_in) - : cngi(move(cngi_in)), pcre(move(pcre_in)), ue2(ue2_in), id(expr_id), + : cngi(std::move(cngi_in)), pcre(std::move(pcre_in)), ue2(ue2_in), id(expr_id), multi(multi_in), utf8(utf8_in) {} unique_ptr cngi; @@ -1221,7 +1220,7 @@ class CorpusGenThread : public OutputThread { } addCorporaToQueue(out, testq, c->id, *corpora, summary, - move(c->pcre), move(c->cngi), c->ue2, c->multi, + std::move(c->pcre), std::move(c->cngi), c->ue2, c->multi, c->utf8); count++; @@ -1435,7 +1434,7 @@ unique_ptr makeCorpusGenUnit(unsigned id, TestSummary &summary, // Caller may already have set the UTF-8 property (in multi cases) utf8 |= cpcre ? cpcre->utf8 : cngi->utf8; - return ue2::make_unique(move(cngi), move(cpcre), ue2, id, + return std::make_unique(std::move(cngi), std::move(cpcre), ue2, id, multi, utf8); } @@ -1490,7 +1489,7 @@ void buildSingle(BoundedQueue &corpq, TestSummary &summary, auto u = makeCorpusGenUnit(id, summary, ground, graph, ultimate, ue2, multi, utf8); if (u) { - corpq.push(move(u)); + corpq.push(std::move(u)); } } } @@ -1548,7 +1547,7 @@ void buildBanded(BoundedQueue &corpq, TestSummary &summary, auto u = makeCorpusGenUnit(id, summary, ground, graph, ultimate, ue2, multi, utf8); if (u) { - corpq.push(move(u)); + corpq.push(std::move(u)); } } } @@ -1588,7 +1587,7 @@ void buildMulti(BoundedQueue &corpq, TestSummary &summary, auto u = makeCorpusGenUnit(id, summary, ground, graph, ultimate, ue2, multi, utf8); if (u) { - corpq.push(move(u)); + corpq.push(std::move(u)); } } } @@ -1608,7 +1607,7 @@ void generateTests(CorporaSource &corpora_src, const ExpressionMap &exprMap, for (size_t i = 0; i < numGeneratorThreads; i++) { auto c = make_unique(i, testq, corpq, corpora_src); c->start(); - generators.push_back(move(c)); + generators.push_back(std::move(c)); } if (g_ue2CompileAll && multicompile_bands) { @@ -1824,18 +1823,18 @@ static unique_ptr buildCorpora(const vector &corporaFiles, const ExpressionMap &exprMap) { if (!corporaFiles.empty()) { - auto c = ue2::make_unique(); + auto c = std::make_unique(); for (const auto &file : corporaFiles) { if (!c->readFile(file)) { cout << "Error reading corpora from file: " << file << endl; exit_with_fail(); } } - return move(c); /* move allows unique_ptr conversion */ + return std::move(c); /* move allows unique_ptr conversion */ } else { - auto c = ue2::make_unique( + auto c = std::make_unique( exprMap, corpus_gen_prop, force_utf8, force_prefilter); - return move(c); + return std::move(c); } } @@ -1846,13 +1845,9 @@ bool needsQuotes(const char *s) { if (len == 0) { return true; } -#ifndef _WIN32 // don't confuse the correct isblank for the one in locale int (*blank)(int) = &std::isblank; if (find_if(s, s + len, blank) != s + len) { -#else - if (find_if(s, s + len, [](unsigned char c){ return std::isblank(c); }) != s + len) { -#endif return true; } @@ -1886,9 +1881,9 @@ bool runTests(CorporaSource &corpora_source, const ExpressionMap &exprMap, // Start scanning threads. 
vector> scanners; for (size_t i = 0; i < numScannerThreads; i++) { - auto s = ue2::make_unique(i, testq, exprMap, plat, grey); + auto s = std::make_unique(i, testq, exprMap, plat, grey); s->start(); - scanners.push_back(move(s)); + scanners.push_back(std::move(s)); } generateTests(corpora_source, exprMap, summary, plat, grey, testq); @@ -1989,7 +1984,7 @@ int HS_CDECL main(int argc, char *argv[]) { // If we're saving corpora out, truncate the output file. if (saveCorpora) { - corporaOut = ue2::make_unique(saveCorporaFile); + corporaOut = std::make_unique(saveCorporaFile); } GroundTruth::global_prep(); diff --git a/tools/hscollider/sig.cpp b/tools/hscollider/sig.cpp index 7d580e410..d2e221b53 100644 --- a/tools/hscollider/sig.cpp +++ b/tools/hscollider/sig.cpp @@ -36,8 +36,9 @@ #include #include -#if defined(HAVE_SIGACTION) || defined(_WIN32) +#if defined(HAVE_SIGACTION) #include +#define STACK_SIZE 8192 #endif #ifdef HAVE_BACKTRACE @@ -59,12 +60,8 @@ TLS_VARIABLE volatile size_t debug_corpus_len = 0; extern std::string g_cmdline; -#if defined(_WIN32) -static void __cdecl sighandler(int signum) { -#elif defined(HAVE_SIGACTION) +#if defined(HAVE_SIGACTION) static void sighandler(int signum) { -#endif -#if defined(HAVE_SIGACTION) || defined(_WIN32) /* NOTE: This signal handler is designed solely to provide more information * when a crash occurs in ue2collider -- it makes calls to signal-unsafe * functions like printf() and backtrace() by design, since we're already @@ -149,12 +146,7 @@ static void sighandler(int signum) { void installSignalHandler(void) { -#ifdef _WIN32 - signal(SIGABRT, sighandler); - signal(SIGFPE, sighandler); - signal(SIGILL, sighandler); - signal(SIGSEGV, sighandler); -#elif defined(HAVE_SIGACTION) +#if defined(HAVE_SIGACTION) struct sigaction act; memset(&act, 0, sizeof(act)); act.sa_handler = sighandler; @@ -175,7 +167,7 @@ void installSignalHandler(void) { } #ifdef HAVE_SIGALTSTACK -static TLS_VARIABLE char alt_stack_loc[SIGSTKSZ]; +static TLS_VARIABLE char alt_stack_loc[STACK_SIZE]; #endif void setSignalStack(void) { @@ -187,7 +179,7 @@ void setSignalStack(void) { stack_t alt_stack; memset(&alt_stack, 0, sizeof(alt_stack)); alt_stack.ss_flags = 0; - alt_stack.ss_size = SIGSTKSZ; + alt_stack.ss_size = STACK_SIZE; alt_stack.ss_sp = alt_stack_loc; if (!sigaltstack(&alt_stack, nullptr)) { act.sa_flags |= SA_ONSTACK; diff --git a/tools/hscollider/sig.h b/tools/hscollider/sig.h index 4b24e95f6..5be4c1461 100644 --- a/tools/hscollider/sig.h +++ b/tools/hscollider/sig.h @@ -40,11 +40,7 @@ #define STAGE_GRAPH_COMPILE 6 #define STAGE_GRAPH_RUN 7 -#ifndef WIN32 #define TLS_VARIABLE __thread -#else -#define TLS_VARIABLE __declspec(thread) -#endif extern TLS_VARIABLE volatile int debug_stage; extern TLS_VARIABLE volatile int debug_expr; diff --git a/tools/hsdump/CMakeLists.txt b/tools/hsdump/CMakeLists.txt index 0466d5720..4350b0f6d 100644 --- a/tools/hsdump/CMakeLists.txt +++ b/tools/hsdump/CMakeLists.txt @@ -10,10 +10,6 @@ include_directories(${PROJECT_SOURCE_DIR}/util) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") -if(WIN32 AND (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)) - add_executable(hsdump main.cpp $ $) -else() - add_executable(hsdump main.cpp) -endif() +add_executable(hsdump main.cpp) target_link_libraries(hsdump hs expressionutil crosscompileutil) diff --git a/tools/hsdump/main.cpp b/tools/hsdump/main.cpp index 75db1c4f3..6c8464b66 100644 --- a/tools/hsdump/main.cpp +++ b/tools/hsdump/main.cpp @@ 
-58,19 +58,9 @@ #include #include -#ifndef _WIN32 #include -#else -#include "win_getopt.h" -#endif #include - -#ifndef _WIN32 #include -#else -#include -#define stat _stat -#endif #include @@ -332,7 +322,6 @@ u32 buildDumpFlags(void) { return flags; } -#ifndef _WIN32 static void clearDir(const string &path) { DIR *dir = opendir(path.c_str()); @@ -356,46 +345,12 @@ void clearDir(const string &path) { } closedir(dir); } -#else // windows -static -void clearDir(const string &path) { - WIN32_FIND_DATA ffd; - HANDLE hFind = INVALID_HANDLE_VALUE; - string glob = path + "/*"; - hFind = FindFirstFile(glob.c_str(), &ffd); - if (hFind == INVALID_HANDLE_VALUE) { - printf("ERROR: couldn't open location %s\n", path.c_str()); - exit(1); - } - do { - string basename(ffd.cFileName); - string fname(path); - fname.push_back('/'); - fname.append(basename); - - // Ignore '.' and '..' - if (basename == "." || basename == "..") { - continue; - } - - if (!DeleteFile(fname.c_str())) { - printf("ERROR: couldn't remove file %s\n", fname.c_str()); - } - - } while (FindNextFile(hFind, &ffd) != 0); - FindClose(hFind); -} -#endif static int makeDirectory(const string &dirName) { -#ifndef _WIN32 mode_t mode = S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; return mkdir(dirName.c_str(), mode); -#else - return _mkdir(dirName.c_str()); -#endif } static diff --git a/unit/CMakeLists.txt b/unit/CMakeLists.txt index b0706fa8e..cbb122557 100644 --- a/unit/CMakeLists.txt +++ b/unit/CMakeLists.txt @@ -1,13 +1,6 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") -if(CMAKE_C_FLAGS MATCHES "/Gv" ) - string(REPLACE "/Gv" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") -endif() -if(CMAKE_CXX_FLAGS MATCHES "/Gv" ) - string(REPLACE "/Gv" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") -endif() - set(gtest_SOURCES gtest/gtest-all.cc gtest/gtest.h) include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR}) @@ -38,10 +31,6 @@ endif() add_definitions(-DGTEST_HAS_PTHREAD=0 -DSRCDIR=${PROJECT_SOURCE_DIR}) -if (WIN32) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4309 /wd4018") -endif() - set(unit_hyperscan_SOURCES ${gtest_SOURCES} hyperscan/allocators.cpp @@ -67,14 +56,9 @@ set(unit_hyperscan_SOURCES hyperscan/test_util.h ) add_executable(unit-hyperscan ${unit_hyperscan_SOURCES}) -if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) -target_link_libraries(unit-hyperscan hs_shared expressionutil) -else() target_link_libraries(unit-hyperscan hs expressionutil) -endif() - -if (NOT (RELEASE_BUILD OR FAT_RUNTIME)) +if (NOT FAT_RUNTIME AND BUILD_STATIC_LIBS) set(unit_internal_SOURCES ${gtest_SOURCES} internal/bitfield.cpp @@ -83,8 +67,6 @@ set(unit_internal_SOURCES internal/compare.cpp internal/database.cpp internal/depth.cpp - internal/fdr.cpp - internal/fdr_flood.cpp internal/fdr_loadval.cpp internal/flat_set.cpp internal/flat_map.cpp @@ -92,8 +74,6 @@ set(unit_internal_SOURCES internal/graph_undirected.cpp internal/insertion_ordered.cpp internal/lbr.cpp - internal/limex_nfa.cpp - internal/masked_move.cpp internal/multi_bit.cpp internal/multi_bit_compress.cpp internal/nfagraph_common.h @@ -116,6 +96,7 @@ set(unit_internal_SOURCES internal/rose_mask_32.cpp internal/rvermicelli.cpp internal/simd_utils.cpp + internal/supervector.cpp internal/shuffle.cpp internal/shufti.cpp internal/state_compress.cpp @@ -128,15 +109,26 @@ set(unit_internal_SOURCES internal/vermicelli.cpp internal/main.cpp ) +if (BUILD_AVX2) +set(unit_internal_SOURCES + 
${unit_internal_SOURCES} + internal/masked_move.cpp + ) +endif(BUILD_AVX2) + +if (NOT RELEASE_BUILD) +set(unit_internal_SOURCES + ${unit_internal_SOURCES} + internal/fdr.cpp + internal/fdr_flood.cpp + internal/limex_nfa.cpp + ) +endif(NOT RELEASE_BUILD) -if(WIN32 AND (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS)) -add_executable(unit-internal ${unit_internal_SOURCES} $ $) -else() add_executable(unit-internal ${unit_internal_SOURCES}) -endif() set_target_properties(unit-internal PROPERTIES COMPILE_FLAGS "${HS_CXX_FLAGS}") target_link_libraries(unit-internal hs corpusomatic) -endif(NOT (RELEASE_BUILD OR FAT_RUNTIME)) +endif (NOT FAT_RUNTIME AND BUILD_STATIC_LIBS) if (BUILD_CHIMERA) # enable Chimera unit tests @@ -187,9 +179,10 @@ else() else () add_custom_target( unit + COMMAND bin/unit-internal COMMAND bin/unit-hyperscan WORKING_DIRECTORY ${CMAKE_BINARY_DIR} - DEPENDS unit-hyperscan + DEPENDS unit-internal unit-hyperscan ) endif() endif() diff --git a/unit/hyperscan/allocators.cpp b/unit/hyperscan/allocators.cpp index 40c450720..a30a3702d 100644 --- a/unit/hyperscan/allocators.cpp +++ b/unit/hyperscan/allocators.cpp @@ -99,7 +99,7 @@ TEST(CustomAllocator, TwoAlignedCompileError) { ASSERT_NE(nullptr, compile_err); EXPECT_STREQ("Allocator returned misaligned memory.", compile_err->message); hs_free_compile_error(compile_err); - hs_set_database_allocator(nullptr, nullptr); + hs_set_misc_allocator(nullptr, nullptr); } TEST(CustomAllocator, TwoAlignedDatabaseInfo) { diff --git a/unit/hyperscan/bad_patterns.cpp b/unit/hyperscan/bad_patterns.cpp index 1756ba099..42e4772e7 100644 --- a/unit/hyperscan/bad_patterns.cpp +++ b/unit/hyperscan/bad_patterns.cpp @@ -280,7 +280,11 @@ TEST_P(BadPattern, Stream) { static vector getBadPatterns() { +#ifdef NDEBUG string filename = "unit/hyperscan/bad_patterns.txt"; +#else + string filename = "unit/hyperscan/bad_patterns_fast.txt"; +#endif ifstream f; f.open(filename.c_str(), ifstream::in); diff --git a/unit/hyperscan/bad_patterns_fast.txt b/unit/hyperscan/bad_patterns_fast.txt new file mode 100644 index 000000000..39a5c2e22 --- /dev/null +++ b/unit/hyperscan/bad_patterns_fast.txt @@ -0,0 +1,159 @@ +1:/\c空/ #\c must be followed by an ASCII character at index 0. +2:/\c/ #\c must be followed by an ASCII character at index 0. +3:/[\c空]/ #\c must be followed by an ASCII character at index 1. +4:/[\c]/ #Unterminated character class starting at index 0. +5:/\c空/8 #\c must be followed by an ASCII character at index 0. +6:/<([^>+i)>.*?/sP #Unterminated character class starting at index 2. +6:/[foo/ #Unterminated character class starting at index 0. +7:/[\p{X}]/8 #Unknown property at index 4. +8:/[\p{^X}]/8 #Unknown property at index 5. +9:/[\p{L]/8 #Malformed property at index 0. +10:/[\p{^L]/8 #Malformed property at index 0. +11:/[\P{L]/8 #Malformed property at index 0. +12:/[\P{^L]/8 #Malformed property at index 0. +13:/\p/8 #Malformed property at index 0. +14:/\P/8 #Malformed property at index 0. +15:/\p{/8 #Malformed property at index 0. +16:/\P{/8 #Malformed property at index 0. +17:/\p{^/8 #Malformed property at index 0. +18:/\P{^/8 #Malformed property at index 0. +19:/[\p/8 #Malformed property at index 1. +20:/[\P/8 #Malformed property at index 1. +21:/[\p{/8 #Malformed property at index 0. +22:/[\P{/8 #Malformed property at index 0. +23:/[\p{^/8 #Malformed property at index 0. +24:/[\P{^/8 #Malformed property at index 0. +25:/\pl/8 #Unknown property at index 2. +26:/\p{any}/8 #Unknown property at index 3. +27:/\p{greek}/8 #Unknown property at index 3. 
+28:/\b/8W #\b unsupported in UCP mode at index 0. +29:/(*UCP)\b/8 #\b unsupported in UCP mode at index 6. +30:/\B/8W #\B unsupported in UCP mode at index 0. +31:/\B/W #\B unsupported in UCP mode at index 0. +32:/foo(?{print "Hello world\n";})bar/ #Embedded code is not supported at index 3. +33:/the (\S+)(?{ $color = $^N }) (\S+)(?{ $animal = $^N })/i #Embedded code is not supported at index 9. +35:/\X/8 #\X unsupported at index 0. +36:/\B+/ #Invalid repeat at index 2. +37:/\B?/ #Invalid repeat at index 2. +38:/\B*/ #Invalid repeat at index 2. +39:/\B{0,6}/ #Invalid repeat at index 2. +40:/\b+/ #Invalid repeat at index 2. +41:/\b?/ #Invalid repeat at index 2. +42:/\b*/ #Invalid repeat at index 2. +43:/\b{0,6}/ #Invalid repeat at index 2. +44:/[.ch.]/ #Unsupported POSIX collating element at index 0. +45:/[=ch=]/ #Unsupported POSIX collating element at index 0. +46:/[:digit:]/ #POSIX named classes are only supported inside a class at index 0. +47:/[[.ch.]]/ #Unsupported POSIX collating element at index 1. +48:/[[=ch=]]/ #Unsupported POSIX collating element at index 1. +49:/foo(?m)?bar/ #Invalid repeat at index 7. +50:/.(?)+/ #Invalid repeat at index 4. +51:/(abc)\2/P #Invalid back reference to expression 2. +52:/\x{100000000}/ #Value in \x{...} sequence is too large at index 0. +53:/^foo/{min_offset=5} #Expression is anchored and cannot satisfy min_offset=5 as it can only produce matches of length 3 bytes at most. +54:/foobar/{min_length=20} #Expression has min_length=20 but can only produce matches of length 6 bytes at most. +55:/foobar/{max_offset=3} #Expression has max_offset=3 but requires 6 bytes to match. +56:/mkdzo(x|u)(\b)kd/{max_offset=29} #Pattern can never match. +57:/[^\x00-\xff]/ #Pattern can never match. +58:/[^\x00-\xff]foo/ #Pattern can never match. +59:/^\Bfoo/ #Pattern can never match. +60:/^\B\Bfoo/ #Pattern can never match. +61:/can't_match\b\B/ #Pattern can never match. +62:/\b\Bcan't_match/ #Pattern can never match. +63:/^\b$/m #Pattern can never match. +64:/^\b\Z/m #Pattern can never match. +65:/^\b\z/m #Pattern can never match. +66:/\A\b$/m #Pattern can never match. +67:/\A\b\Z/m #Pattern can never match. +68:/\A\b\z/m #Pattern can never match. +69:/^[^\x00-\xff]foo/ #Pattern can never match. +70:/foo[^\x00-\xff]/ #Pattern can never match. +71:/foo[^\x00-\xff]$/ #Pattern can never match. +72:/\Bd\B/i{min_length=2,min_offset=4,max_offset=54} #Expression has min_length=2 but can only produce matches of length 1 bytes at most. +74:/(((.|aaa)aaaaaa.aaa){14,19}a((a|a{5,6}|aa){3,11}|aa.|a){2}){40}\Z/smL #Pattern is too large. +75:/\B/s8{min_length=1} #Expression has min_length=1 but can only produce matches of length 0 bytes at most. +76:/(f|d|(\b)|i|a\Z)/mHV8{min_length=2,min_offset=9,max_offset=14} #Expression has min_length=2 but can only produce matches of length 1 bytes at most. +77:/(f|e|d{19,}|h\Z|^j|\Aa)/smi{min_length=7,min_offset=8,max_offset=18} #Extended parameter constraints can not be satisfied for any match from this expression. +78:/(i{13,}|i\Z)/s{min_length=3,max_offset=5} #Extended parameter constraints can not be satisfied for any match from this expression. +79:/(?Pfoo).*(?Pbar)/ #Two named subpatterns use the name 'dupename' at index 19. 
+80:/_W{0,3}bazr_W{0,3}(ac[_a-z]{22}a)?e_W{0,3}bazr[_a-z](ac[a-z]{4}c{14}[a-z]{5})?e_W{0,3}bazr[_a-z](e|ac[_a-z]{4}c{16}([_a-z]|[a-p]W|[o-z]WW){3}([_a-z]|WWW))_W{0,3}bazr([_a-z]|[a-p]WW?|[o-z]WWW)a(foobar|c([a-z]W{0,3})bc([a-z]W{0,3})c{14}([_a-z]W{0,3}){6})((fooaa|[_a-z]W{0,3})bazr[_a-z]W{0,5}a(foobar|c([_a-z]|[a-z]W{1,3})bc([_a-z]|[o-z]W{1,5})c{14}([_a-f]|[A-Z0]W|~WW|;WWW){6})){40}(fooaa|_)bazr[_a-z]/sL #Pattern is too large. +81:/[..]/ #Unsupported POSIX collating element at index 0. +82:/[==]/ #Unsupported POSIX collating element at index 0. +83:/[.\].]/ #Unsupported POSIX collating element at index 0. +84:/[=\]=]/ #Unsupported POSIX collating element at index 0. +85:/A(?!)+Z/ #Invalid repeat at index 5. +86:/\X/ #\X unsupported at index 0. +88:/[A-\d]/ #Invalid range in character class at index 3. +89:/[A-[:digit:]]/ #Invalid range in character class at index 3. +90:/B[--[:digit:]--]+/ #Invalid range in character class at index 4. +91:/a\owibble/ #Value in \o{...} sequence is non-octal or missing braces at index 1. +92:/a\o{wibble/ #Value in \o{...} sequence is non-octal or missing braces at index 1. +93:/a\o{777}/ #Value in \o{...} sequence is too large at index 1. +94:/(*UTF16)foo/ #Unsupported control verb (*UTF16) at index 0. +95:/(*BSR_UNICODE)abc/ #Unsupported control verb (*BSR_UNICODE) at index 0. +96:/a+(*SKIP)b/ #Unknown control verb (*SKIP) at index 2. +97:/foo(*/ #Invalid repeat at index 4. +98:/[:\]:]/ #POSIX named classes are only supported inside a class at index 0. +99:/[[:[:]/ #Invalid POSIX named class at index 1. +100:/abc(?(1)def|ghi)/P #Invalid conditional reference to expression 1. +101:/abc(?()def|ghi)/P #Invalid conditional reference to label 'blah'. +102:/(?(DEFINE)foo|bar)/P #DEFINE conditional group with more than one branch at index 17. +103:/(?<1name>group)/ #Group name cannot begin with a digit at index 0. +104:/abc((def)?(?(R)bar))+/P #Pattern recursion not supported at index 10. +105:/abc((def)?(?(R2)bar))+/P #Pattern recursion not supported at index 10. +106:/abc((def)(?(R&label)bar))+/P #Pattern recursion not supported at index 9. +107:/\o{4200000}/8 #Value in \o{...} sequence is too large at index 0. +108:/\o{19}/ #Value in \o{...} sequence is non-octal or missing braces at index 0. +109:/\o{/ #Value in \o{...} sequence is non-octal or missing braces at index 0. +110:/\o{1/ #Value in \o{...} sequence is non-octal or missing braces at index 0. +111:/\x{0x110000}/8 #Value in \x{...} sequence is non-hex or missing } at index 0. +112:/\cÀ/ #\c must be followed by an ASCII character at index 0. +113:/[\cÀ]/ #\c must be followed by an ASCII character at index 1. +114:/[\o{4200000}]/8 #Value in \o{...} sequence is too large at index 1. +115:/[\x{0x110000}]/8 #Value in \x{...} sequence is non-hex or missing } at index 1. +116:/[\o{70]/ #Value in \o{...} sequence is non-octal or missing braces at index 1. +117:/[\x{ff]/ #Value in \x{...} sequence is non-hex or missing } at index 1. +118:/foo/{min_offset=10,max_offset=9} #In hs_expr_ext, min_offset must be less than or equal to max_offset. +120:/foo/{min_length=10,max_offset=9} #In hs_expr_ext, min_length must be less than or equal to max_offset. +122://8 #Expression is not valid UTF-8. +123:/hello \6 world/P #Invalid back reference to expression 6. +124:/hello \6 world|dog/P #Invalid back reference to expression 6. +125:/[~-\V]/8 #Invalid range in character class at index 3. +126:/(*UTF8)/ #Expression is not valid UTF-8. 
+127:/^fo?ob{ro|nax_off\Qt=10omnax+8Wnah/0}l.{1,60}Car*k|npanomnax+8Wnah/8 #Expression is not valid UTF-8. +128:/(*UTF8)^fo?ob{ro|nax_off\Qt=10omnax+8Wnah/0}l.{1,60}Car*k|npanomnax+8Wnah/ #Expression is not valid UTF-8. +129:/bignum \1111111111111111111/ #Number is too big at index 7. +130:/foo|&{5555555,}/ #Bounded repeat is too large. +131:/[a[..]]/ #Unsupported POSIX collating element at index 2. +132:/[a[==]]/ #Unsupported POSIX collating element at index 2. +133:/[a[.\].]]/ #Unsupported POSIX collating element at index 2. +134:/[a[=\]=]]/ #Unsupported POSIX collating element at index 2. +135:/[^\D\d]/8W #Pattern can never match. +136:/(*LIMIT_MATCH=1000)foobar/ #Unsupported control verb (*LIMIT_MATCH=1000) at index 0. +137:/(*UTF32)foobar/ #Unsupported control verb (*UTF32) at index 0. +138:/(*UNKNOWNVERB)foobar/ #Unknown control verb (*UNKNOWNVERB) at index 0. +139:/foo(*UTF8)bar/ #(*UTF8) must be at start of expression, encountered at index 5. +140:/(?i)(*UTF8)foobar/ #(*UTF8) must be at start of expression, encountered at index 6. +141:/(*@&/ #Unknown control verb at index 2. +142:/abcd/si{edit_distance=4} #Approximate matching patterns that reduce to vacuous patterns are disallowed. +143:/foobar|hatstand/sL{edit_distance=6} #Approximate matching patterns that reduce to vacuous patterns are disallowed. +144:/abc\b/{edit_distance=1} #Zero-width assertions are disallowed for approximate matching. +145:/abc/8{edit_distance=1} #UTF-8 is disallowed for approximate matching. +146:/(*UTF8)abc/{edit_distance=1} #UTF-8 is disallowed for approximate matching. +147:/\b\BMYBt/s{edit_distance=1} #Pattern can never match. +148:/\Q\Eaaaa/8 #Expression is not valid UTF-8. +149:/[\Q\Eaaaa]/8 #Expression is not valid UTF-8. +150:/abcd/{edit_distance=1,hamming_distance=1} #In hs_expr_ext, cannot have both edit distance and Hamming distance. +151:/141 | abc/C #Unknown character at index 6. +152:/141 & | 142/C #Not enough operand at index 6. +153:/141 142 & 143/C #Not enough operator at index 13. +154:/141 !142/C #Not enough operator at index 8. +155:/141 & 142 |/C #Not enough operand at index 11. +156:/)141 & 142 /C #Not enough left parentheses at index 0. +157:/(141 & (142|!143) |144/C #Not enough right parentheses at index 22. +158:/141 & (142|!143) )| 144/C #Not enough left parentheses at index 17. +159:/1234567890 & (142|!143 )/C #Expression id too large at index 10. +160:/141 & (142|!143 )|/C #Not enough operand at index 18. +161:/141/C #No logical operation. +162:/119 & 121/C #Unknown sub-expression id. +163:/166 & 167/C #Unknown sub-expression id. 
diff --git a/unit/hyperscan/behaviour.cpp b/unit/hyperscan/behaviour.cpp index f15e71716..5947e61d1 100644 --- a/unit/hyperscan/behaviour.cpp +++ b/unit/hyperscan/behaviour.cpp @@ -157,7 +157,11 @@ TEST_P(HyperscanScanGigabytesMatch, StreamingMatch) { // gb is the number of gigabytes to scan between pre-block and post-block // run over 1,2,4,8 gb +#ifdef NDEBUG for (unsigned long long gb = 1; gb <= 8; gb *= 2) { +#else + for (unsigned long long gb = 1; gb <= 2; gb *= 2) { +#endif SCOPED_TRACE(gb); hs_stream_t *stream = nullptr; @@ -261,12 +265,12 @@ TEST_P(HyperscanScanGigabytesMatch, BlockMatch) { 1*1024, #ifdef BIG_BLOCKS 4*1024, 32*1024, 128*1024, 512*1024, +#ifdef NDEBUG // gigabytes 1024*1024, -#ifdef ARCH_X86_64 // big cases for big beefy machines 2048*1024, 3072*1024 -#endif // ARCH_X86_64 +#endif // NDEBUG #endif // BIG_BLOCKS }; @@ -1333,6 +1337,7 @@ TEST(regression, UE_2425) { hs_free_database(db); } +#ifdef NDEBUG TEST(regression, UE_2485) { const char regex[] = "(?:(.EeEa|((a{2}BD[bc]Bd[eae]|[DCd]|c|ebCa|d)){7,21})(E{5,}A{4,}[Cc].cc{3,6}|eCec|e+CaBEd|[Bb])){10}DB(a|[AAda])..A?DE?E"; unsigned flags = HS_FLAG_DOTALL | HS_FLAG_CASELESS | HS_FLAG_UTF8 | @@ -1348,6 +1353,7 @@ TEST(regression, UE_2485) { ASSERT_NE(nullptr, db); hs_free_database(db); } +#endif TEST(regression, UE_2452) { const char regex[] = "/ab.b[bca]{2,}ca((?:c|(abc(?sxmi-xm)){10,14}|c|b|[abcb])){4,23}acbcbb*ba((?:(a|.{4,}|.|[acba])){3,16}a)+"; @@ -1364,6 +1370,7 @@ TEST(regression, UE_2452) { hs_free_database(db); } +#ifdef NDEBUG TEST(regression, UE_2595) { const char regex[] = "(?:(?:acAa|c[EAA]aEb|((?:CC[bdd].cE((?x-msix)BE){32}(?:\\B)){16,19}CdD.E(E|E|B)){3,6}|E(a|d|.)(?:(?xs-isxm)|b|.|C))){17,}"; unsigned flags = HS_FLAG_MULTILINE | HS_FLAG_CASELESS | @@ -1378,6 +1385,7 @@ TEST(regression, UE_2595) { ASSERT_NE(nullptr, db); hs_free_database(db); } +#endif TEST(regression, UE_2762) { const vector patterns = { diff --git a/unit/hyperscan/literals.cpp b/unit/hyperscan/literals.cpp index 86bd317cd..6ff3aa434 100644 --- a/unit/hyperscan/literals.cpp +++ b/unit/hyperscan/literals.cpp @@ -235,7 +235,11 @@ static const unsigned test_modes[] = {HS_MODE_BLOCK, HS_MODE_STREAM, static const unsigned test_flags[] = {0, HS_FLAG_SINGLEMATCH, HS_FLAG_SOM_LEFTMOST}; +#ifdef NDEBUG static const unsigned test_sizes[] = {1, 10, 100, 500, 10000}; +#else +static const unsigned test_sizes[] = {1, 10, 100, 500}; +#endif static const pair test_bounds[] = {{3u, 10u}, {10u, 100u}}; diff --git a/unit/hyperscan/single.cpp b/unit/hyperscan/single.cpp index 01fbfeab5..278d28f7e 100644 --- a/unit/hyperscan/single.cpp +++ b/unit/hyperscan/single.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -363,8 +363,11 @@ static const unsigned validModes[] = { // Mode bits for switching off various architecture features static const unsigned long long featureMask[] = { ~0ULL, /* native */ - ~(HS_CPU_FEATURES_AVX2 | HS_CPU_FEATURES_AVX512), /* no avx2 */ - ~HS_CPU_FEATURES_AVX512, /* no avx512 */ +#if defined(ARCH_IA32) || defined(ARCH_X86_64) + ~(HS_CPU_FEATURES_AVX2 | HS_CPU_FEATURES_AVX512 | HS_CPU_FEATURES_AVX512VBMI), /* no avx2 */ + ~(HS_CPU_FEATURES_AVX512 | HS_CPU_FEATURES_AVX512VBMI), /* no avx512 */ + ~HS_CPU_FEATURES_AVX512VBMI, /* no avx512vbmi */ +#endif }; INSTANTIATE_TEST_CASE_P(Single, diff --git 
a/unit/hyperscan/test_util.h b/unit/hyperscan/test_util.h index efa0570c3..21862b6b4 100644 --- a/unit/hyperscan/test_util.h +++ b/unit/hyperscan/test_util.h @@ -37,9 +37,6 @@ #include #ifndef UNUSED -#if defined(_WIN32) || defined(_WIN64) -#define UNUSED -#else #define UNUSED __attribute__ ((unused)) #endif #endif @@ -124,5 +121,3 @@ void *count_malloc(size_t n); void *count_malloc_b(size_t n); void count_free(void *p); void count_free_b(void *p); - -#endif diff --git a/unit/internal/bitutils.cpp b/unit/internal/bitutils.cpp index 3f7885449..8af8f9a43 100644 --- a/unit/internal/bitutils.cpp +++ b/unit/internal/bitutils.cpp @@ -294,6 +294,39 @@ TEST(BitUtils, compress64) { } } +TEST(BitUtils, compress128) { + const m128 all_zeroes = zeroes128(); + const m128 all_ones = ones128(); + const m128 odd_bits = set1_2x64(0x5555555555555555ull); + const m128 even_bits = set1_2x64(0xaaaaaaaaaaaaaaaaull); + + EXPECT_EQ(0, diff128(all_zeroes, compress128(all_zeroes, all_zeroes))); + EXPECT_EQ(0, diff128(all_zeroes, compress128(all_zeroes, set1_4x32(1)))); + EXPECT_EQ(0, diff128(all_zeroes, compress128(all_zeroes, all_ones))); + EXPECT_EQ(0, diff128(all_ones, compress128(all_ones, all_ones))); + EXPECT_EQ(0, diff128(set1_2x64(0xffffffffull), compress128(odd_bits, odd_bits))); + EXPECT_EQ(0, diff128(set1_2x64(0xffffffffull), compress128(even_bits, even_bits))); + EXPECT_EQ(0, diff128(all_zeroes, compress128(odd_bits, even_bits))); + EXPECT_EQ(0, diff128(all_zeroes, compress128(even_bits, odd_bits))); + + // Some single-bit tests. + for (u32 i = 0; i < 64; i++) { + const m128 one_bit = set1_2x64(1ull << i); + + EXPECT_EQ(0, diff128(all_zeroes, compress128(all_zeroes, one_bit))); + EXPECT_EQ(0, diff128(set1_2x64(1ull), compress128(one_bit, one_bit))); + EXPECT_EQ(0, diff128(one_bit, compress128(one_bit, all_ones))); + + if (i % 2) { + EXPECT_EQ(0, diff128(set1_2x64(1ull << (i / 2)), compress128(one_bit, even_bits))); + EXPECT_EQ(0, diff128(all_zeroes, compress128(one_bit, odd_bits))); + } else { + EXPECT_EQ(0, diff128(set1_2x64(1ull << (i / 2)), compress128(one_bit, odd_bits))); + EXPECT_EQ(0, diff128(all_zeroes, compress128(one_bit, even_bits))); + } + } +} + TEST(BitUtils, expand32) { const u32 all_ones = 0xffffffffu; const u32 odd_bits = 0x55555555u; @@ -352,6 +385,35 @@ TEST(BitUtils, expand64) { } } +TEST(BitUtils, expand128) { + const m128 all_zeroes = zeroes128(); + const m128 all_ones = ones128(); + const m128 odd_bits = set1_2x64(0x5555555555555555ull); + const m128 even_bits = set1_2x64(0xaaaaaaaaaaaaaaaaull); + + EXPECT_EQ(0, diff128(all_zeroes, expand128(all_zeroes, all_zeroes))); + EXPECT_EQ(0, diff128(all_zeroes, expand128(all_zeroes, set1_2x64(1ull)))); + EXPECT_EQ(0, diff128(all_zeroes, expand128(all_zeroes, all_ones))); + EXPECT_EQ(0, diff128(all_ones, expand128(all_ones, all_ones))); + EXPECT_EQ(0, diff128(odd_bits, expand128(set1_2x64(0xffffffffull), odd_bits))); + EXPECT_EQ(0, diff128(even_bits, expand128(set1_2x64(0xffffffffull), even_bits))); + EXPECT_EQ(0, diff128(all_zeroes, expand128(set1_2x64(0xffffffff00000000ull), even_bits))); + EXPECT_EQ(0, diff128(all_zeroes, expand128(set1_2x64(0xffffffff00000000ull), odd_bits))); + EXPECT_EQ(0, diff128(set1_2x64(1u), expand128(set1_2x64(1u), odd_bits))); + EXPECT_EQ(0, diff128(set1_2x64(2u), expand128(set1_2x64(1u), even_bits))); + + // Some single-bit tests. 
+ for (u32 i = 0; i < 64; i++) { + const m128 one_bit = set1_2x64(1ull << i); + + EXPECT_EQ(0, diff128(all_zeroes, expand128(all_zeroes, one_bit))); + EXPECT_EQ(0, diff128(one_bit, expand128(set1_2x64(1ull), one_bit))); + EXPECT_EQ(0, diff128(one_bit, expand128(one_bit, all_ones))); + + EXPECT_EQ(0, diff128(one_bit, expand128(set1_2x64(1ull << (i / 2)), i % 2 ? even_bits : odd_bits))); + } +} + TEST(BitUtils, bf_op_1) { u64a a = 0; for (u32 i = 0; i < 64; i++) { diff --git a/unit/internal/database.cpp b/unit/internal/database.cpp index 8f0c1a695..0070fbc96 100644 --- a/unit/internal/database.cpp +++ b/unit/internal/database.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2021, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -56,6 +56,10 @@ TEST(DB, flagsToPlatform) { p.cpu_features |= HS_CPU_FEATURES_AVX512; #endif +#if defined(HAVE_AVX512VBMI) + p.cpu_features |= HS_CPU_FEATURES_AVX512VBMI; +#endif + platform_t pp = target_to_platform(target_t(p)); ASSERT_EQ(pp, hs_current_platform); } diff --git a/unit/internal/limex_nfa.cpp b/unit/internal/limex_nfa.cpp index c70ceeae1..28433c968 100644 --- a/unit/internal/limex_nfa.cpp +++ b/unit/internal/limex_nfa.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -83,9 +83,10 @@ class LimExModelTest : public TestWithParam { const map fixed_depth_tops; const map>> triggers; bool compress_state = false; + bool fast_nfa = false; nfa = constructNFA(*g, &rm, fixed_depth_tops, triggers, compress_state, - type, cc); + fast_nfa, type, cc); ASSERT_TRUE(nfa != nullptr); full_state = make_bytecode_ptr(nfa->scratchStateSize, 64); @@ -376,9 +377,10 @@ class LimExZombieTest : public TestWithParam { const map fixed_depth_tops; const map>> triggers; bool compress_state = false; + bool fast_nfa = false; nfa = constructNFA(*g, &rm, fixed_depth_tops, triggers, compress_state, - type, cc); + fast_nfa, type, cc); ASSERT_TRUE(nfa != nullptr); full_state = make_bytecode_ptr(nfa->scratchStateSize, 64); diff --git a/unit/internal/masked_move.cpp b/unit/internal/masked_move.cpp index 7bd78c504..8641a4685 100644 --- a/unit/internal/masked_move.cpp +++ b/unit/internal/masked_move.cpp @@ -32,7 +32,9 @@ #include "gtest/gtest.h" #include "util/arch.h" -#include "util/masked_move.h" +#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#include "util/arch/x86/masked_move.h" +#endif namespace { diff --git a/unit/internal/multi_bit.cpp b/unit/internal/multi_bit.cpp index 2b0c7c797..7bb4a1a8a 100644 --- a/unit/internal/multi_bit.cpp +++ b/unit/internal/multi_bit.cpp @@ -32,7 +32,6 @@ #include "ue2common.h" #include "rose/rose_build_scatter.h" #include "util/compile_error.h" -#include "util/make_unique.h" #include "util/multibit.h" #include "util/multibit_build.h" @@ -49,10 +48,10 @@ class mmbit_holder { public: mmbit_holder() {} explicit mmbit_holder(u32 num_bits, u32 excess = 0) - : data(ue2::make_unique(mmbit_size(num_bits) + 7 + excess)) {} + : data(std::make_unique(mmbit_size(num_bits) + 7 + excess)) {} void init(u32 num_bits) { assert(!data); - data = ue2::make_unique(mmbit_size(num_bits) + 7); + data = std::make_unique(mmbit_size(num_bits) + 7); } operator u8 *() { assert(data); @@ -727,7 +726,7 @@ 
TEST_P(MultiBitTest, InitRangeChunked) { } static -void apply(const scatter_plan_raw &sp, u8 *out) { +void applyMB(const scatter_plan_raw &sp, u8 *out) { for (const auto &e : sp.p_u64a) { memcpy(out + e.offset, &e.val, sizeof(e.val)); } @@ -761,7 +760,7 @@ TEST_P(MultiBitTest, InitRangePlanChunked) { scatter_plan_raw sp; mmbBuildInitRangePlan(test_size, chunk_begin, chunk_end, &sp); memset(ba, 0xaa, mmbit_size(test_size)); - apply(sp, ba); + applyMB(sp, ba); // First bit set should be chunk_begin. ASSERT_EQ(chunk_begin, mmbit_iterate(ba, test_size, MMB_INVALID)); @@ -1328,16 +1327,19 @@ static const MultiBitTestParam multibitTests[] = { { 1024, 1 }, { 1025, 1 }, { 2099, 1 }, +#ifdef NDEBUG { 10000, 1 }, { 32768, 1 }, { 32769, 1 }, { 200000, 1 }, +#endif // Larger cases, bigger strides. { 1U << 18, 3701 }, { 1U << 19, 3701 }, { 1U << 20, 3701 }, { 1U << 21, 3701 }, +#ifdef NDEBUG { 1U << 22, 3701 }, { 1U << 23, 3701 }, { 1U << 24, 3701 }, @@ -1348,6 +1350,7 @@ static const MultiBitTestParam multibitTests[] = { { 1U << 29, 24413 }, { 1U << 30, 50377 }, { 1U << 31, 104729 }, +#endif }; INSTANTIATE_TEST_CASE_P(MultiBit, MultiBitTest, ValuesIn(multibitTests)); diff --git a/unit/internal/multi_bit_compress.cpp b/unit/internal/multi_bit_compress.cpp index d7396b811..14c3f4804 100644 --- a/unit/internal/multi_bit_compress.cpp +++ b/unit/internal/multi_bit_compress.cpp @@ -28,10 +28,11 @@ #include "config.h" +#include + #include "gtest/gtest.h" #include "ue2common.h" #include "util/compile_error.h" -#include "util/make_unique.h" #include "util/multibit.h" #include "util/multibit_build.h" #include "util/multibit_compress.h" @@ -86,10 +87,10 @@ class mmbit_holder { public: mmbit_holder() {} explicit mmbit_holder(u32 num_bits, u32 excess = 0) - : data(ue2::make_unique(mmbit_size(num_bits) + 7 + excess)) {} + : data(std::make_unique(mmbit_size(num_bits) + 7 + excess)) {} void init(u32 num_bits) { assert(!data); - data = ue2::make_unique(mmbit_size(num_bits) + 7); + data = std::make_unique(mmbit_size(num_bits) + 7); } operator u8 *() { assert(data); @@ -108,10 +109,10 @@ class comp_holder { public: comp_holder() {} explicit comp_holder(u32 length) - : data(ue2::make_unique(length + 7)) {} + : data(std::make_unique(length + 7)) {} void init(u32 length) { assert(!data); - data = ue2::make_unique(length + 7); + data = std::make_unique(length + 7); } operator u8 *() { assert(data); @@ -164,10 +165,12 @@ TEST(MultiBitComp, CompCompsizeSparse) { 257, 4097, (1U << 18) + 1, +#ifdef NDEBUG (1U << 24) + 1, (1U << 30) + 1 +#endif }; - for (u32 i = 0; i < 5; i++) { + for (u32 i = 0; i < sizeof(test_set)/sizeof(u32); i++) { u32 test_size = test_set[i]; mmbit_holder ba(test_size); @@ -224,10 +227,12 @@ TEST(MultiBitComp, CompCompsizeDense) { 257, 4097, (1U << 18) + 1, +#ifdef NDEBUG (1U << 24) + 1, (1U << 30) + 1 +#endif }; - for (u32 i = 0; i < 5; i++) { + for (u32 i = 0; i < sizeof(test_set)/sizeof(u32); i++) { u32 test_size = test_set[i]; mmbit_holder ba(test_size); @@ -759,16 +764,19 @@ static const MultiBitCompTestParam multibitCompTests[] = { { 1025, 1 }, { 2099, 1 }, // 4097 = 64 ^ 2 + 1 { 4097, 1 }, +#ifdef NDEBUG { 10000, 1 }, { 32768, 1 }, { 32769, 1 }, { 200000, 1 }, { 262145, 1 }, // 262145 = 64 * 3 + 1 +#endif // Larger cases, bigger strides. 
{ 1U << 19, 3701 }, { 1U << 20, 3701 }, { 1U << 21, 3701 }, +#ifdef NDEBUG { 1U << 22, 3701 }, { 1U << 23, 3701 }, { 1U << 24, 3701 }, @@ -779,6 +787,7 @@ static const MultiBitCompTestParam multibitCompTests[] = { { 1U << 29, 24413 }, { 1U << 30, 50377 }, { 1U << 31, 104729 }, +#endif }; INSTANTIATE_TEST_CASE_P(MultiBitComp, MultiBitCompTest, diff --git a/unit/internal/noodle.cpp b/unit/internal/noodle.cpp index 7cf5744fe..16c257b89 100644 --- a/unit/internal/noodle.cpp +++ b/unit/internal/noodle.cpp @@ -123,7 +123,7 @@ TEST(Noodle, nood1) { TEST(Noodle, nood2) { const size_t data_len = 1024; unsigned int i, j; - u8 data[data_len]; + u8 ALIGN_ATTR(32) data[data_len]; memset(data, 'a', data_len); @@ -224,7 +224,7 @@ TEST(Noodle, noodLong) { TEST(Noodle, noodCutoverSingle) { const size_t max_data_len = 128; - u8 data[max_data_len + 15]; + u8 ALIGN_ATTR(32) data[max_data_len + 15]; memset(data, 'a', max_data_len + 15); diff --git a/unit/internal/pack_bits.cpp b/unit/internal/pack_bits.cpp index aa0a35eb7..453dccfd4 100644 --- a/unit/internal/pack_bits.cpp +++ b/unit/internal/pack_bits.cpp @@ -30,7 +30,6 @@ #include "gtest/gtest.h" #include "util/pack_bits.h" -#include "util/make_unique.h" #include "ue2common.h" #include @@ -92,7 +91,7 @@ void test_pack_and_unpack(const vector &v, const vector &bits) { // Temporary char array to pack into. const size_t mem_size = packed_size(bits); - unique_ptr mem = ue2::make_unique(mem_size); + unique_ptr mem = std::make_unique(mem_size); pack_bits(&mem[0], &v[0], &bits[0], elements); diff --git a/unit/internal/repeat.cpp b/unit/internal/repeat.cpp index 546d7d4f8..5665a0c3e 100644 --- a/unit/internal/repeat.cpp +++ b/unit/internal/repeat.cpp @@ -34,7 +34,6 @@ #include "nfa/repeat.h" #include "nfa/repeatcompile.h" #include "util/depth.h" -#include "util/make_unique.h" #include #include @@ -431,7 +430,7 @@ TEST_P(RepeatTest, Pack) { // We should be able to pack and then unpack the control block at any // offset up to repeatMin and get a match at both the min and max repeats. 
- unique_ptr packed = ue2::make_unique(info.packedCtrlSize); + unique_ptr packed = std::make_unique(info.packedCtrlSize); for (u32 i = 0; i < info.repeatMax; i++) { SCOPED_TRACE(testing::Message() << "i=" << i); diff --git a/unit/internal/rose_build_merge.cpp b/unit/internal/rose_build_merge.cpp index 5029f0a53..73abff4d1 100644 --- a/unit/internal/rose_build_merge.cpp +++ b/unit/internal/rose_build_merge.cpp @@ -39,7 +39,6 @@ #include "util/boundary_reports.h" #include "util/compile_context.h" #include "util/graph_range.h" -#include "util/make_unique.h" #include "smallwrite/smallwrite_build.h" #include "som/slot_manager.h" @@ -52,7 +51,7 @@ using namespace ue2; static std::unique_ptr makeSuffixGraph(ReportID report) { - auto h = ue2::make_unique(NFA_SUFFIX); + auto h = std::make_unique(NFA_SUFFIX); NGHolder &g = *h; NFAVertex v = add_vertex(g); diff --git a/unit/internal/rvermicelli.cpp b/unit/internal/rvermicelli.cpp index 22c238e91..5cd52e4d0 100644 --- a/unit/internal/rvermicelli.cpp +++ b/unit/internal/rvermicelli.cpp @@ -1,5 +1,6 @@ /* * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -29,7 +30,7 @@ #include "config.h" #include "gtest/gtest.h" -#include "nfa/vermicelli.h" +#include "nfa/vermicelli.hpp" #define BOUND (~(VERM_BOUNDARY - 1)) @@ -113,6 +114,92 @@ TEST(RVermicelli, Exec4) { } } +TEST(RNVermicelli, ExecNoMatch1) { + char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + for (size_t i = 0; i < 16; i++) { + SCOPED_TRACE(i); + for (size_t j = 0; j < 16; j++) { + SCOPED_TRACE(j); + const u8 *rv = rnvermicelliExec('b', 0, buf + i, + buf + strlen(t1) - j); + + ASSERT_EQ(buf + i - 1, rv); + + rv = rnvermicelliExec('B', 1, buf + i, buf + strlen(t1) - j); + + ASSERT_EQ(buf + i - 1, rv); + } + } +} + +TEST(RNVermicelli, Exec1) { + char t1[] = "bbbbbbbbbbbbbbbbbabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbabbbbbbbbbbbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + for (size_t i = 0; i < 16; i++) { + SCOPED_TRACE(i); + const u8 *rv = rnvermicelliExec('b', 0, buf, buf + strlen(t1) - i); + + ASSERT_EQ(buf + 48, rv); + + rv = rnvermicelliExec('B', 1, buf + i, buf + strlen(t1) - i); + + ASSERT_EQ(buf + 48, rv); + } +} + +TEST(RNVermicelli, Exec2) { + char t1[] = "bbbbbbbbbbbbbbbbbabbbbbbbbaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + for (size_t i = 0; i < 16; i++) { + SCOPED_TRACE(i); + const u8 *rv = rnvermicelliExec('b', 0, buf, buf + strlen(t1) - i); + + ASSERT_EQ(buf + 48, rv); + + rv = rnvermicelliExec('B', 1, buf + i, buf + strlen(t1) - i); + + ASSERT_EQ(buf + 48, rv); + } +} + +TEST(RNVermicelli, Exec3) { + char t1[] = "bbbbbbbbbbbbbbbbbabbbbbbbbaaaaaaaaaaaaaaaaaaaaaaAbbbbbbbbbbbbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + for (size_t i = 0; i < 16; i++) { + SCOPED_TRACE(i); + const u8 *rv = rnvermicelliExec('b', 0, buf + i, buf + strlen(t1)); + + ASSERT_EQ(buf + 48, rv); + + rv = rnvermicelliExec('B', 1, buf + i, buf + strlen(t1)); + + ASSERT_EQ(buf + 48, rv); + } +} + +TEST(RNVermicelli, Exec4) { + char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + for (size_t i = 0; i < 31; i++) { + SCOPED_TRACE(i); + t1[16 + i] = 'a'; + const u8 *rv = rnvermicelliExec('b', 0, buf, buf + strlen(t1)); + + ASSERT_EQ(buf + 16 + i, rv); + + rv = 
rnvermicelliExec('B', 1, buf, buf + strlen(t1)); + + ASSERT_EQ(buf + 16 + i, rv); + } +} + + TEST(RDoubleVermicelli, Exec1) { char t1[] = "bbbbbbbbbbbbbbbbbbabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbabbbbbbbbbbbbbbbbbbbbb"; @@ -218,3 +305,262 @@ TEST(RDoubleVermicelli, Exec5) { } } } + +#ifdef HAVE_SVE2 + +#include "nfa/vermicellicompile.h" +using namespace ue2; + +TEST(RVermicelli16, ExecNoMatch1) { + char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + + CharReach chars; + chars.set('a'); + chars.set('B'); + chars.set('A'); + m128 matches; + bool ret = vermicelli16Build(chars, (u8 *)&matches); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + for (size_t j = 0; j < 16; j++) { + const u8 *begin = (const u8 *)t1 + i; + const u8 *end = (const u8 *)t1 + strlen(t1) - j; + + const u8 *rv = rvermicelli16Exec(matches, begin, end); + ASSERT_EQ(begin - 1, rv); + } + } +} + +TEST(RVermicelli16, Exec1) { + char t1[] = "bbbbbbbbbbbbbbbbbabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbabbbbbbbbbbbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + CharReach chars; + chars.set('a'); + chars.set('A'); + m128 matches; + bool ret = vermicelli16Build(chars, (u8 *)&matches); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = rvermicelli16Exec(matches, buf, buf + strlen(t1) - i); + ASSERT_EQ(buf + 48, rv); + } +} + +TEST(RVermicelli16, Exec2) { + char t1[] = "bbbbbbbbbbbbbbbbbabbbbbbbbaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + CharReach chars; + chars.set('a'); + chars.set('A'); + m128 matches; + bool ret = vermicelli16Build(chars, (u8 *)&matches); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = rvermicelli16Exec(matches, buf + i, buf + strlen(t1)); + ASSERT_EQ(buf + 48, rv); + } +} + +TEST(RVermicelli16, Exec3) { + char t1[] = "bbbbbbbbbbbbbbbbbabbbbbbbbaaaaaaaaaaaaaaaaaaaaaaAbbbbbbbbbbbbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + CharReach chars; + chars.set('a'); + m128 matches_a; + bool ret = vermicelli16Build(chars, (u8 *)&matches_a); + ASSERT_TRUE(ret); + + chars.set('A'); + m128 matches_A; + ret = vermicelli16Build(chars, (u8 *)&matches_A); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = rvermicelli16Exec(matches_a, buf, buf + strlen(t1) - i); + ASSERT_EQ(buf + 47, rv); + + rv = rvermicelli16Exec(matches_A, buf, buf + strlen(t1) - i); + ASSERT_EQ(buf + 48, rv); + } +} + +TEST(RVermicelli16, Exec4) { + char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + CharReach chars; + chars.set('a'); + m128 matches_a; + bool ret = vermicelli16Build(chars, (u8 *)&matches_a); + ASSERT_TRUE(ret); + + chars.set('A'); + m128 matches_A; + ret = vermicelli16Build(chars, (u8 *)&matches_A); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 31; i++) { + t1[16 + i] = 'a'; + const u8 *rv = rvermicelli16Exec(matches_a, buf, buf + strlen(t1)); + ASSERT_EQ(buf + 16 + i, rv); + + rv = rvermicelli16Exec(matches_A, buf, buf + strlen(t1)); + ASSERT_EQ(buf + 16 + i, rv); + } +} + +TEST(RVermicelli16, Exec5) { + char t1[] = "qqqqqqqqqqqqqqqqqabcdefghijklmnopqqqqqqqqqqqqqqqqqqqqq"; + const u8 *buf = (const u8 *)t1; + + CharReach chars; + m128 matches[16]; + bool ret; + + for (int i = 0; i < 16; ++i) { + chars.set('a' + i); + ret = vermicelli16Build(chars, (u8 *)&matches[i]); + ASSERT_TRUE(ret); + } + + for (int j = 0; j < 16; ++j) { + for (size_t i = 0; i < 16; i++) { + const u8 *rv = rvermicelli16Exec(matches[j], buf, buf + 
strlen(t1) - i); + ASSERT_EQ(buf + j + 17, rv); + } + } +} + +TEST(RNVermicelli16, ExecNoMatch1) { + char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + CharReach chars; + chars.set('b'); + chars.set('B'); + chars.set('A'); + m128 matches; + bool ret = vermicelli16Build(chars, (u8 *)&matches); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + for (size_t j = 0; j < 16; j++) { + const u8 *rv = rnvermicelli16Exec(matches, buf + i, buf + strlen(t1) - j); + ASSERT_EQ(buf + i - 1, rv); + } + } +} + +TEST(RNVermicelli16, Exec1) { + char t1[] = "bbbbbbbbbbbbbbbbbabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbabbbbbbbbbbbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + CharReach chars; + chars.set('b'); + chars.set('A'); + m128 matches; + bool ret = vermicelli16Build(chars, (u8 *)&matches); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = rnvermicelli16Exec(matches, buf + i, buf + strlen(t1) - i); + ASSERT_EQ(buf + 48, rv); + } +} + +TEST(RNVermicelli16, Exec2) { + char t1[] = "bbbbbbbbbbbbbbbbbabbbbbbbbaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + CharReach chars; + chars.set('b'); + chars.set('A'); + m128 matches; + bool ret = vermicelli16Build(chars, (u8 *)&matches); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = rnvermicelli16Exec(matches, buf, buf + strlen(t1) - i); + ASSERT_EQ(buf + 48, rv); + } +} + +TEST(RNVermicelli16, Exec3) { + char t1[] = "bbbbbbbbbbbbbbbbbabbbbbbbbaaaaaaaaaaaaaaaaaaaaaaAbbbbbbbbbbbbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + CharReach chars; + chars.set('b'); + m128 matches_b; + bool ret = vermicelli16Build(chars, (u8 *)&matches_b); + ASSERT_TRUE(ret); + + chars.set('A'); + m128 matches_A; + ret = vermicelli16Build(chars, (u8 *)&matches_A); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = rnvermicelli16Exec(matches_b, buf + i, buf + strlen(t1)); + ASSERT_EQ(buf + 48, rv); + + rv = rnvermicelli16Exec(matches_A, buf + i, buf + strlen(t1)); + ASSERT_EQ(buf + 47, rv); + } +} + +TEST(RNVermicelli16, Exec4) { + char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + CharReach chars; + chars.set('b'); + m128 matches_b; + bool ret = vermicelli16Build(chars, (u8 *)&matches_b); + ASSERT_TRUE(ret); + + chars.set('A'); + m128 matches_A; + ret = vermicelli16Build(chars, (u8 *)&matches_A); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 31; i++) { + t1[16 + i] = 'a'; + const u8 *rv = rnvermicelli16Exec(matches_b, buf, buf + strlen(t1)); + ASSERT_EQ(buf + 16 + i, rv); + + rv = rnvermicelli16Exec(matches_A, buf, buf + strlen(t1)); + ASSERT_EQ(buf + 16 + i, rv); + } +} + +TEST(RNVermicelli16, Exec5) { + char t1[] = "aaaaaaaaaaaaaaaaaabcdefghijklmnopqqqqqqqqqqqqqqqqqqqqqqqq"; + const u8 *buf = (const u8 *)t1; + + CharReach chars; + m128 matches[16]; + bool ret; + + for (int i = 0; i < 16; ++i) { + chars.set('q' - i); + ret = vermicelli16Build(chars, (u8 *)&matches[i]); + ASSERT_TRUE(ret); + } + + for (int j = 0; j < 16; ++j) { + for (size_t i = 0; i < 16; i++) { + const u8 *rv = rnvermicelli16Exec(matches[j], buf, buf + strlen(t1) - i); + ASSERT_EQ(buf - j + 32, rv); + } + } +} + +#endif // HAVE_SVE2 diff --git a/unit/internal/shuffle.cpp b/unit/internal/shuffle.cpp index b2316babd..deb85e9f9 100644 --- a/unit/internal/shuffle.cpp +++ b/unit/internal/shuffle.cpp @@ -33,6 +33,12 @@ #include "util/arch.h" #include "util/simd_utils.h" 
#include "nfa/limex_shuffle.h" +#include"util/supervector/supervector.hpp" +#include "nfa/limex_shuffle.hpp" + +#ifdef setbit +#undef setbit +#endif namespace { @@ -180,11 +186,11 @@ void build_pshufb_masks_onebit(unsigned int bit, T *permute, T *compare) { TEST(Shuffle, PackedExtract128_1) { // Try all possible one-bit masks - for (unsigned int i = 0; i < 128; i++) { + for (unsigned int i = 0; i < 1; i++) { // shuffle a single 1 bit to the front m128 permute, compare; build_pshufb_masks_onebit(i, &permute, &compare); - EXPECT_EQ(1U, packedExtract128(setbit(i), permute, compare)); + EXPECT_EQ(1U, packedExtract128(setbit(i), permute, compare)); EXPECT_EQ(1U, packedExtract128(ones128(), permute, compare)); // we should get zero out of these cases EXPECT_EQ(0U, packedExtract128(zeroes128(), permute, compare)); @@ -196,6 +202,28 @@ TEST(Shuffle, PackedExtract128_1) { } } + +TEST(Shuffle, PackedExtract_templatized_128_1) { + // Try all possible one-bit masks + for (unsigned int i = 0; i < 128; i++) { + // shuffle a single 1 bit to the front + SuperVector<16> permute = SuperVector<16>::Zeroes(); + SuperVector<16> compare = SuperVector<16>::Zeroes(); + build_pshufb_masks_onebit(i, &permute.u.v128[0], &compare.u.v128[0]); + EXPECT_EQ(1U, packedExtract<16>(setbit(i), permute, compare)); + EXPECT_EQ(1U, packedExtract<16>(SuperVector<16>::Ones(), permute, compare)); + // we should get zero out of these cases + EXPECT_EQ(0U, packedExtract<16>(SuperVector<16>::Zeroes(), permute, compare)); + EXPECT_EQ(0U, packedExtract<16>(not128(setbit(i)), permute, compare)); + // we should get zero out of all the other bit positions + for (unsigned int j = 0; (j != i && j < 128); j++) { + EXPECT_EQ(0U, packedExtract<16>(setbit(j), permute, compare)); + } + } +} + + + #if defined(HAVE_AVX2) TEST(Shuffle, PackedExtract256_1) { // Try all possible one-bit masks @@ -214,6 +242,27 @@ TEST(Shuffle, PackedExtract256_1) { } } } + + +TEST(Shuffle, PackedExtract_templatized_256_1) { + // Try all possible one-bit masks + for (unsigned int i = 0; i < 256; i++) { + // shuffle a single 1 bit to the front + SuperVector<32> permute = SuperVector<32>::Zeroes(); + SuperVector<32> compare = SuperVector<32>::Zeroes(); + build_pshufb_masks_onebit(i, &permute.u.v256[0], &compare.u.v256[0]); + EXPECT_EQ(1U, packedExtract<32>(setbit(i), permute, compare)); + EXPECT_EQ(1U, packedExtract<32>(SuperVector<32>::Ones(), permute, compare)); + // we should get zero out of these cases + EXPECT_EQ(0U, packedExtract<32>(SuperVector<32>::Zeroes(), permute, compare)); + EXPECT_EQ(0U, packedExtract<32>(not256(setbit(i)), permute, compare)); + // we should get zero out of all the other bit positions + for (unsigned int j = 0; (j != i && j < 256); j++) { + EXPECT_EQ(0U, packedExtract<32>(setbit(j), permute, compare)); + } + } +} + #endif #if defined(HAVE_AVX512) @@ -234,5 +283,25 @@ TEST(Shuffle, PackedExtract512_1) { } } } + +TEST(Shuffle, PackedExtract_templatized_512_1) { + // Try all possible one-bit masks + for (unsigned int i = 0; i < 512; i++) { + // shuffle a single 1 bit to the front + SuperVector<64> permute = SuperVector<64>::Zeroes(); + SuperVector<64> compare = SuperVector<64>::Zeroes(); + build_pshufb_masks_onebit(i, &permute.u.v512[0], &compare.u.v512[0]); + EXPECT_EQ(1U, packedExtract<64>(setbit(i), permute, compare)); + EXPECT_EQ(1U, packedExtract<64>(SuperVector<64>::Ones(), permute, compare)); + // we should get zero out of these cases + EXPECT_EQ(0U, packedExtract<64>(SuperVector<64>::Zeroes(), permute, compare)); + EXPECT_EQ(0U, 
packedExtract<64>(not512(setbit(i)), permute, compare)); + // we should get zero out of all the other bit positions + for (unsigned int j = 0; (j != i && j < 512); j++) { + EXPECT_EQ(0U, packedExtract<64>(setbit(j), permute, compare)); + } + } +} + #endif } // namespace diff --git a/unit/internal/shufti.cpp b/unit/internal/shufti.cpp index 0c9d26071..fb8d58a84 100644 --- a/unit/internal/shufti.cpp +++ b/unit/internal/shufti.cpp @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -516,7 +517,7 @@ TEST(DoubleShufti, ExecNoMatch1b) { const u8 *rv = shuftiDoubleExec(lo1, hi1, lo2, hi2, (u8 *)t1 + i, (u8 *)t1 + strlen(t1)); - ASSERT_EQ((size_t)t1 + i + 15, (size_t)rv); + ASSERT_LE((size_t)t1 + i + 15, (size_t)rv); } } @@ -560,7 +561,7 @@ TEST(DoubleShufti, ExecNoMatch2b) { const u8 *rv = shuftiDoubleExec(lo1, hi1, lo2, hi2, (u8 *)t1 + i, (u8 *)t1 + strlen(t1)); - ASSERT_EQ((size_t)t1 + i + 15, (size_t)rv); + ASSERT_LE((size_t)t1 + i + 15, (size_t)rv); } } @@ -602,7 +603,7 @@ TEST(DoubleShufti, ExecNoMatch3b) { const u8 *rv = shuftiDoubleExec(lo1, hi1, lo2, hi2, (u8 *)t1 + i, (u8 *)t1 + strlen(t1)); - ASSERT_EQ((size_t)t1 + i + 15, (size_t)rv); + ASSERT_LE((size_t)t1 + i + 15, (size_t)rv); } } @@ -894,10 +895,11 @@ TEST(DoubleShufti, ExecMatchMixed3) { for (size_t i = 0; i < 400; i++) { t2[len - i] = 'x'; t2[len - i + 1] = 'y'; + DEBUG_PRINTF("i = %ld\n", i); const u8 *rv = shuftiDoubleExec(lo1, hi1, lo2, hi2, (u8 *)t2, (u8 *)t2 + len); - ASSERT_EQ((size_t)&t2[len - i], (size_t)rv); + ASSERT_EQ((const u8 *)&t2[len - i], rv); } } @@ -1106,6 +1108,7 @@ TEST(ReverseShufti, ExecMatch6) { for (size_t i = 0; i < len; i++) { t1[i] = 'a'; + DEBUG_PRINTF("i=%ld\n", i); const u8 *rv = rshuftiExec(lo, hi, (u8 *)t1, (u8 *)t1 + len); ASSERT_EQ((const u8 *)t1 + i, rv); diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 623c2c998..272d5456d 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2023, VectorCamp PC * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -31,9 +32,12 @@ #include "gtest/gtest.h" #include "util/arch.h" #include "util/bytecode_ptr.h" -#include "util/make_unique.h" #include "util/simd_utils.h" +#ifdef setbit +#undef setbit +#endif + using namespace std; using namespace ue2; @@ -522,7 +526,7 @@ TYPED_TEST(SimdUtilsTest, loadu) { const TypeParam ones = simd_ones(); const size_t mem_len = sizeof(ones) * 2; - unique_ptr mem_array = ue2::make_unique(mem_len); + unique_ptr mem_array = std::make_unique(mem_len); char *mem = mem_array.get(); for (size_t offset = 1; offset < sizeof(ones); offset++) { @@ -658,39 +662,56 @@ TEST(SimdUtilsTest, movq) { char cmp[sizeof(m128)]; memset(cmp, 0x80, sizeof(m128)); - simd = set16x8(0x80); + simd = set1_16x8(0x80); r = movq(simd); ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd))); ASSERT_EQ(0, memcmp(cmp, &r, sizeof(r))); +#if defined(HAVE_SIMD_128_BITS) +#if defined(ARCH_IA32) || defined(ARCH_X86_64) || defined(VS_SIMDE_BACKEND) simd = _mm_set_epi64x(~0LL, 0x123456789abcdef); +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) + int64x2_t a = { 0x123456789abcdefLL, ~0LL }; + simd = vreinterpretq_s32_s64(a); +#elif 
defined(ARCH_PPC64EL) +#if defined(__clang__) && (__clang_major__ >= 15) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wdeprecate-lax-vec-conv-all" +#endif // defined(__clang__) && (__clang_major__ == 15) + int64x2_t a = {0x123456789abcdefLL, ~0LL }; + simd = reinterpret_cast(a); +#if defined(__clang__) && (__clang_major__ >= 15) +#pragma clang diagnostic pop +#endif // defined(__clang__) && (__clang_major__ == 15) +#endif +#endif r = movq(simd); ASSERT_EQ(r, 0x123456789abcdef); } -TEST(SimdUtilsTest, set16x8) { +TEST(SimdUtilsTest, set1_16x8) { char cmp[sizeof(m128)]; for (unsigned i = 0; i < 256; i++) { - m128 simd = set16x8(i); + m128 simd = set1_16x8(i); memset(cmp, i, sizeof(simd)); ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd))); } } -TEST(SimdUtilsTest, set4x32) { +TEST(SimdUtilsTest, set1_4x32) { u32 cmp[4] = { 0x12345678, 0x12345678, 0x12345678, 0x12345678 }; - m128 simd = set4x32(cmp[0]); + m128 simd = set1_4x32(cmp[0]); ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd))); } -#if defined(HAVE_AVX2) +#if defined(HAVE_SIMD_256_BITS) TEST(SimdUtilsTest, set32x8) { char cmp[sizeof(m256)]; for (unsigned i = 0; i < 256; i++) { - m256 simd = set32x8(i); + m256 simd = set1_32x8(i); memset(cmp, i, sizeof(simd)); ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd))); } @@ -700,9 +721,9 @@ TEST(SimdUtilsTest, set2x128) { char cmp[sizeof(m256)]; for (unsigned i = 0; i < 256; i++) { - m128 x = set16x8(i); - m256 y = set32x8(i); - m256 z = set2x128(x); + m128 x = set1_16x8(i); + m256 y = set1_32x8(i); + m256 z = set1_2x128(x); memset(cmp, i, sizeof(z)); ASSERT_EQ(0, memcmp(cmp, &z, sizeof(z))); ASSERT_EQ(0, memcmp(&y, &z, sizeof(z))); @@ -710,10 +731,59 @@ TEST(SimdUtilsTest, set2x128) { } #endif +#define TEST_LSHIFTBYTE128(v1, buf, l) { \ + m128 v_shifted = lshiftbyte_m128(v1, l); \ + storeu128(res, v_shifted); \ + int i; \ + for (i=0; i < l; i++) { \ + assert(res[i] == 0); \ + } \ + for (; i < 16; i++) { \ + assert(res[i] == vec[i - l]); \ + } \ + } + +TEST(SimdUtilsTest, lshiftbyte128){ + u8 vec[16]; + u8 res[16]; + for (int i=0; i<16; i++) { + vec[i]=i; + } + m128 v1 = loadu128(vec); + for (int j = 0; j<16; j++){ + TEST_LSHIFTBYTE128(v1, vec, j); + } +} + +#define TEST_RSHIFTBYTE128(v1, buf, l) { \ + m128 v_shifted = rshiftbyte_m128(v1, l); \ + storeu128(res, v_shifted); \ + int i; \ + for (i=15; i >= 16 - l; i--) { \ + assert(res[i] == 0); \ + } \ + for (; i >= 0; i--) { \ + assert(res[i] == vec[i + l]); \ + } \ + } + +TEST(SimdUtilsTest, rshiftbyte128){ + u8 vec[16]; + u8 res[16]; + for (int i=0; i<16; i++) { + vec[i]=i; + } + m128 v1 = loadu128(vec); + for (int j = 0; j<16; j++){ + TEST_RSHIFTBYTE128(v1, vec, j); + } +} + TEST(SimdUtilsTest, variableByteShift128) { char base[] = "0123456789ABCDEF"; m128 in = loadu128(base); + EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 0), variable_byte_shift_m128(in, 0))); EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 1), @@ -760,7 +830,7 @@ TEST(SimdUtilsTest, variableByteShift128) { EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 10), variable_byte_shift_m128(in, 10))); - EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, 16))); + EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 15), variable_byte_shift_m128(in, 15))); EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, -16))); } @@ -810,4 +880,126 @@ TEST(SimdUtilsTest, sub_u8_m128) { EXPECT_TRUE(!diff128(result, loadu128(expec))); } +TEST(SimdUtilsTest, load_m128_from_u64a) { + srand (time(NULL)); + u64a tmp = rand(); + m128 res = load_m128_from_u64a(&tmp); + m128 cmp = set2x64(0LL, tmp); + 
//print_m128_16x8("res",res); + //print_m128_16x8("cmp",cmp); + EXPECT_TRUE(!diff128(res, cmp)); +} + + +TEST(SimdUtilsTest, movemask_128) { + srand (time(NULL)); + u8 vec[16] = {0}; + u8 vec2[16] = {0}; + u16 r = rand() % 100 + 1; + for(int i=0; i<16; i++) { + if (r & (1 << i)) { + vec[i] = 0xff; + } + } + m128 v = loadu128(vec); + u16 mask = movemask128(v); + for(int i=0; i<16; i++) { + if (mask & (1 << i)) { + vec2[i] = 0xff; + } + } + for (int i=0; i<16; i++) { + ASSERT_EQ(vec[i],vec2[i]); + } +} + +TEST(SimdUtilsTest, pshufb_m128) { + srand (time(NULL)); + u8 vec[16]; + for (int i=0; i<16; i++) { + vec[i] = rand() % 1000 + 1; + } + u8 vec2[16]; + for (int i=0; i<16; i++) { + vec2[i]=i + (rand() % 100 + 0); + } + + // On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf. + // In NEON or PPC, if >=16, then the result is zero, otherwise it is that lane. + // Thus bellow we have to check that case to NEON or PPC. + + //Insure that vec3 has at least 1 or more 0x80 elements + u8 vec3[16] = {0}; + vec3[15] = 0x80; + + for (int i=0; i<15; i++) { + int l = rand() % 1000 + 0; + if (l % 16 ==0){ + vec3[i]= 0x80; + } else{ + vec3[i]= vec2[i]; + } + } + /* + printf("vec3: "); + for(int i=15; i>=0; i--) { printf("%02x, ", vec3[i]); } + printf("\n"); + */ + + //Test Special Case + m128 v1 = loadu128(vec); + m128 v2 = loadu128(vec3); + m128 vres = pshufb_m128(v1, v2); + + u8 res[16]; + storeu128(res, vres); + + for (int i=0; i<16; i++) { + if(vec3[i] & 0x80){ + ASSERT_EQ(res[i], 0); + }else{ + ASSERT_EQ(vec[vec3[i] % 16 ], res[i]); + } + } + + //Test Other Cases + v1 = loadu128(vec); + v2 = loadu128(vec2); + vres = pshufb_m128(v1, v2); + storeu128(res, vres); + + for (int i=0; i<16; i++) { + if(vec2[i] & 0x80){ + ASSERT_EQ(res[i], 0); + }else{ + ASSERT_EQ(vec[vec2[i] % 16 ], res[i]); + } + } +} + +/*Define ALIGNR128 macro*/ +#define TEST_ALIGNR128(v1, v2, buf, l) { \ + m128 v_aligned = palignr(v2,v1, l); \ + storeu128(res, v_aligned); \ + for (size_t i=0; i<16; i++) { \ + ASSERT_EQ(res[i], vec[i + l]); \ + } \ + } + +TEST(SimdUtilsTest, Alignr128){ + u8 vec[32]; + u8 res[16]; + for (int i=0; i<32; i++) { + vec[i]=i; + } + m128 v1 = loadu128(vec); + m128 v2 = loadu128(vec+16); + for (int j = 0; j<16; j++){ + TEST_ALIGNR128(v1, v2, vec, j); + } +} + + + + } // namespace diff --git a/unit/internal/state_compress.cpp b/unit/internal/state_compress.cpp index 56be8aaea..004237021 100644 --- a/unit/internal/state_compress.cpp +++ b/unit/internal/state_compress.cpp @@ -98,8 +98,8 @@ TEST(state_compress, m128_1) { char buf[sizeof(m128)] = { 0 }; for (u32 i = 0; i < 16; i++) { - char mask_raw[16] = { 0 }; - char val_raw[16] = { 0 }; + char ALIGN_ATTR(16) mask_raw[16] = { 0 }; + char ALIGN_ATTR(16) val_raw[16] = { 0 }; memset(val_raw, (i << 4) + 3, 16); @@ -109,17 +109,32 @@ TEST(state_compress, m128_1) { mask_raw[15 - i] = 0xff; val_raw[15 - i] = i; - m128 val; - m128 mask; - - memcpy(&val, val_raw, sizeof(val)); - memcpy(&mask, mask_raw, sizeof(mask)); + m128 val = load128(val_raw); + m128 mask = load128(mask_raw); storecompressed128(&buf, &val, &mask, 0); m128 val_out; loadcompressed128(&val_out, &buf, &mask, 0); + int8_t ALIGN_ATTR(16) data[16]; + store128(data, val); + printf("val: "); + for (int j=0; j < 16; j++) printf("%02x ", data[j]); + printf("\n"); + store128(data, mask); + printf("mask: "); + for (int j=0; j < 16; j++) printf("%02x ", data[j]); + printf("\n"); + store128(data, and128(val, mask)); + printf("and128(val, mask): "); + for (int j=0; j < 16; j++) 
printf("%02x ", data[j]); + printf("\n"); + store128(data, val_out); + printf("val_out: "); + for (int j=0; j < 16; j++) printf("%02x ", data[j]); + printf("\n"); + EXPECT_TRUE(!diff128(and128(val, mask), val_out)); mask_raw[i] = 0x0f; diff --git a/unit/internal/supervector.cpp b/unit/internal/supervector.cpp new file mode 100644 index 000000000..2432e598b --- /dev/null +++ b/unit/internal/supervector.cpp @@ -0,0 +1,1069 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include"gtest/gtest.h" +#include"ue2common.h" +#include"util/supervector/supervector.hpp" + + +TEST(SuperVectorUtilsTest, Zero128c) { + auto zeroes = SuperVector<16>::Zeroes(); + u8 buf[16]{0}; + for(int i=0; i<16; i++) { + ASSERT_EQ(zeroes.u.u8[i],buf[i]); + } +} + +TEST(SuperVectorUtilsTest, Ones128c) { + auto ones = SuperVector<16>::Ones(); + u8 buf[16]; + for (int i=0; i<16; i++) { buf[i]=0xff; } + for(int i=0; i<16; i++) { + ASSERT_EQ(ones.u.u8[i],buf[i]); + } +} + +TEST(SuperVectorUtilsTest, Loadu128c) { + u8 vec[32]; + for(int i=0; i<32;i++) { vec[i]=i; } + for(int i=0; i<=16;i++) { + auto SP = SuperVector<16>::loadu(vec+i); + for(int j=0; j<16; j++) { + ASSERT_EQ(SP.u.u8[j],vec[j+i]); + } + } +} + +TEST(SuperVectorUtilsTest, Load128c) { + u8 ALIGN_ATTR(16) vec[32]; + for(int i=0; i<32;i++) { vec[i]=i; } + for(int i=0;i<=16;i+=16) { + auto SP = SuperVector<16>::load(vec+i); + for(int j=0; j<16; j++){ + ASSERT_EQ(SP.u.u8[j],vec[j+i]); + } + } +} + +TEST(SuperVectorUtilsTest,Equal128c){ + u8 vec[32]; + for (int i=0; i<32; i++) {vec[i]=i;}; + auto SP1 = SuperVector<16>::loadu(vec); + auto SP2 = SuperVector<16>::loadu(vec+16); + u8 buf[16]={0}; + /*check for equality byte by byte*/ + for (int s=0; s<16; s++){ + if(vec[s]==vec[s+16]){ + buf[s]=1; + } + } + auto SPResult = SP1.eq(SP2); + for (int i=0; i<16; i++) { + ASSERT_EQ(SPResult.u.s8[i],buf[i]); + } +} + +TEST(SuperVectorUtilsTest,And128c){ + auto SPResult = SuperVector<16>::Zeroes() & SuperVector<16>::Ones(); + for (int i=0; i<16; i++) { + ASSERT_EQ(SPResult.u.u8[i],0); + } +} + +TEST(SuperVectorUtilsTest,OPAnd128c){ + auto SP1 = SuperVector<16>::Zeroes(); + auto SP2 = SuperVector<16>::Ones(); + SP2 = SP2.opand(SP1); + for (int i=0; i<16; i++) { + ASSERT_EQ(SP2.u.u8[i],0); + } +} + +TEST(SuperVectorUtilsTest,OR128c){ + auto SPResult = SuperVector<16>::Zeroes() | SuperVector<16>::Ones(); + for (int i=0; i<16; i++) { + ASSERT_EQ(SPResult.u.u8[i],0xff); + } +} + +TEST(SuperVectorUtilsTest,XOR128c){ + srand (time(NULL)); + u8 vec[16]; + for (int i=0; i<16; i++) { + vec[i] = rand() % 100 + 1; + } + u8 vec2[16]; + for (int i=0; i<16; i++) { + vec2[i] = rand() % 100 + 1; + } + auto SP1 = SuperVector<16>::loadu(vec); + auto SP2 = SuperVector<16>::loadu(vec2); + auto SPResult = SP1 ^ SP2; + for (int i=0; i<16; i++) { + ASSERT_EQ(SPResult.u.u8[i],vec[i] ^ vec2[i]); + } +} + + +TEST(SuperVectorUtilsTest,OPXOR128c){ + srand (time(NULL)); + u8 vec[16]; + for (int i=0; i<16; i++) { + vec[i] = rand() % 100 + 1; + } + u8 vec2[16]; + for (int i=0; i<16; i++) { + vec2[i] = rand() % 100 + 1; + } + auto SP1 = SuperVector<16>::loadu(vec); + auto SP2 = SuperVector<16>::loadu(vec2); + auto SPResult = SP1.opxor(SP2); + for (int i=0; i<16; i++) { + ASSERT_EQ(SPResult.u.u8[i],vec[i] ^ vec2[i]); + } +} + +TEST(SuperVectorUtilsTest,OPANDNOT128c){ + auto SP1 = SuperVector<16>::Zeroes(); + auto SP2 = SuperVector<16>::Ones(); + SP1 = SP1.opandnot(SP2); + for (int i=0; i<16; i++) { + ASSERT_EQ(SP1.u.u8[i],0xff); + } + SP2 = SP2.opandnot(SP1); + for (int i=0; i<16; i++) { + ASSERT_EQ(SP2.u.u8[i],0); + } +} + +TEST(SuperVectorUtilsTest,Movemask128c){ + srand (time(NULL)); + u8 vec[16] = {0}; + u8 vec2[16] = {0}; + u16 r = rand() % 100 + 1; + for(int i=0; i<16; i++) { + if (r & (1 << i)) { + vec[i] = 0xff; + } + } + auto SP = SuperVector<16>::loadu(vec); + u64a mask = SP.comparemask(); + for (int i = 0; i < 16; i++) { + if (mask & (1ull << (i * SuperVector<16>::mask_width()))) { + vec2[i] = 0xff; + } + } + for (int i=0; i<16; 
i++) { + ASSERT_EQ(vec[i],vec2[i]); + } +} + +TEST(SuperVectorUtilsTest,Eqmask128c){ + srand (time(NULL)); + u8 vec[16]; + for (int i = 0; i<16; i++) { vec[i] = rand() % 64 + 0;} + u8 vec2[16]; + for (int i = 0; i<16; i++) { vec2[i]= rand() % 100 + 67;} + auto SP = SuperVector<16>::loadu(vec); + auto SP1 = SuperVector<16>::loadu(vec2); + u64a mask = SP.eqmask(SP); + for (u32 i = 0; i < 16; ++i) { + ASSERT_TRUE(mask & (1ull << (i * SuperVector<16>::mask_width()))); + } + mask = SP.eqmask(SP1); + ASSERT_EQ(mask,0); + vec2[0] = vec[0]; + vec2[1] = vec[1]; + auto SP2 = SuperVector<16>::loadu(vec2); + mask = SP.eqmask(SP2); + ASSERT_TRUE(mask & 1); + ASSERT_TRUE(mask & (1ull << SuperVector<16>::mask_width())); + for (u32 i = 2; i < 16; ++i) { + ASSERT_FALSE(mask & (1ull << (i * SuperVector<16>::mask_width()))); + } +} + +/*Define LSHIFT128 macro*/ +#define TEST_LSHIFT128(buf, vec, v, l) { \ + auto v_shifted = v << (l); \ + for (int i=15; i>= l; --i) { \ + buf[i] = vec[i-l]; \ + } \ + for (int i=0; i::loadu(vec); + u8 buf[16]; + for (int j = 0; j<16; j++) { + TEST_LSHIFT128(buf, vec, SP, j); + } +} + +TEST(SuperVectorUtilsTest,LShift64_128c){ + u64a vec[2] = {128, 512}; + auto SP = SuperVector<16>::loadu(vec); + for(int s = 0; s<16; s++) { + auto SP_after_shift = SP.vshl_64(s); + for (int i=0; i<2; i++) { + ASSERT_EQ(SP_after_shift.u.u64[i], vec[i] << s); + } + } +} + +TEST(SuperVectorUtilsTest,RShift64_128c){ + u64a vec[2] = {128, 512}; + auto SP = SuperVector<16>::loadu(vec); + for(int s = 0; s<16; s++) { + auto SP_after_shift = SP.vshr_64(s); + for (int i=0; i<2; i++) { + ASSERT_EQ(SP_after_shift.u.u64[i], vec[i] >> s); + } + } +} + +/*Define RSHIFT128 macro*/ +#define TEST_RSHIFT128(buf, vec, v, l) { \ + auto v_shifted = v >> (l); \ + for (int i=0; i<16-l; i++) { \ + buf[i] = vec[i+l]; \ + } \ + for (int i=16-l; i<16; i++) { \ + buf[i] = 0; \ + } \ + for(int i=0; i<16; i++) { \ + ASSERT_EQ(v_shifted.u.u8[i], buf[i]); \ + } \ + } + +TEST(SuperVectorUtilsTest,RShift128c){ + u8 vec[16]; + for (int i = 0; i<16; i++ ){ vec[i] = i+1; } + auto SP = SuperVector<16>::loadu(vec); + u8 buf[16]; + for (int j = 0; j<16; j++) { + TEST_RSHIFT128(buf, vec, SP, j); + } +} + +TEST(SuperVectorUtilsTest,pshufb128c) { + srand (time(NULL)); + u8 vec[16]; + for (int i=0; i<16; i++) { + vec[i] = rand() % 100 + 1; + } + u8 vec2[16]; + for (int i=0; i<16; i++) { + vec2[i]=i + (rand() % 15 + 0); + } + auto SP1 = SuperVector<16>::loadu(vec); + auto SP2 = SuperVector<16>::loadu(vec2); + auto SResult = SP1.template pshufb(SP2); + for (int i=0; i<16; i++) { + if(vec2[i] & 0x80){ + ASSERT_EQ(SResult.u.u8[i], 0); + }else{ + ASSERT_EQ(vec[vec2[i] % 16 ],SResult.u.u8[i]); + } + } +} + + +/*Define LSHIFT128_128 macro*/ +#define TEST_LSHIFT128_128(buf, vec, v, l) { \ + auto v_shifted = v.vshl_128(l); \ + for (int i=15; i>= l; --i) { \ + buf[i] = vec[i-l]; \ + } \ + for (int i=0; i::loadu(vec); + u8 buf[16]; + for (int j = 0; j<16; j++) { + TEST_LSHIFT128_128(buf, vec, SP, j); + } +} + +/*Define RSHIFT128_128 macro*/ +#define TEST_RSHIFT128_128(buf, vec, v, l) { \ + auto v_shifted = v.vshr_128(l); \ + for (int i=0; i<16-l; i++) { \ + buf[i] = vec[i+l]; \ + } \ + for (int i=16-l; i<16; i++) { \ + buf[i] = 0; \ + } \ + for(int i=0; i<16; i++) { \ + ASSERT_EQ(v_shifted.u.u8[i], buf[i]); \ + } \ + } + +TEST(SuperVectorUtilsTest,RShift128_128c){ + u8 vec[16]; + for (int i = 0; i<16; i++ ){ vec[i] = i+1; } + auto SP = SuperVector<16>::loadu(vec); + u8 buf[16]; + for (int j = 0; j<16; j++) { + TEST_RSHIFT128_128(buf, vec, SP, j); + } 
+} + +/*Define ALIGNR128 macro*/ +#define TEST_ALIGNR128(v1, v2, buf, l) { \ + auto v_aligned = v2.alignr(v1, l); \ + for (size_t i=0; i<16; i++) { \ + ASSERT_EQ(v_aligned.u.u8[i], vec[i + l]); \ + } \ + } + +TEST(SuperVectorUtilsTest,Alignr128c){ + u8 vec[32]; + for (int i=0; i<32; i++) { + vec[i]=i; + } + auto SP1 = SuperVector<16>::loadu(vec); + auto SP2 = SuperVector<16>::loadu(vec+16); + for (int j = 0; j<16; j++){ + TEST_ALIGNR128(SP1, SP2, vec, j); + } +} + + + +#if defined(HAVE_AVX2) +TEST(SuperVectorUtilsTest, Zero256c) { + auto zeroes = SuperVector<32>::Zeroes(); + u8 buf[32]{0}; + for(int i=0; i<32; i++) { + ASSERT_EQ(zeroes.u.u8[i],buf[i]); + } +} + +TEST(SuperVectorUtilsTest, Ones256c) { + auto ones = SuperVector<32>::Ones(); + u8 buf[32]; + for (int i=0; i<32; i++) { buf[i]=0xff; } + for(int i=0; i<32; i++) { + ASSERT_EQ(ones.u.u8[i],buf[i]); + } +} + +TEST(SuperVectorUtilsTest, Loadu256c) { + u8 vec[64]; + for(int i=0; i<64;i++) { vec[i]=i; } + for(int i=0; i<=32;i++) { + auto SP = SuperVector<32>::loadu(vec+i); + for(int j=0; j<32; j++) { + ASSERT_EQ(SP.u.u8[j],vec[j+i]); + } + } +} + +TEST(SuperVectorUtilsTest, Load256c) { + u8 ALIGN_ATTR(32) vec[64]; + for(int i=0; i<64;i++) { vec[i]=i; } + for(int i=0;i<=32;i+=32) { + auto SP = SuperVector<32>::load(vec+i); + for(int j=0; j<32; j++){ + ASSERT_EQ(SP.u.u8[j],vec[j+i]); + } + } +} + +TEST(SuperVectorUtilsTest,Equal256c){ + u8 vec[64]; + for (int i=0; i<64; i++) {vec[i]=i;}; + auto SP1 = SuperVector<32>::loadu(vec); + auto SP2 = SuperVector<32>::loadu(vec+32); + u8 buf[32]={0}; + /*check for equality byte by byte*/ + for (int s=0; s<32; s++){ + if(vec[s]==vec[s+32]){ + buf[s]=1; + } + } + auto SPResult = SP1.eq(SP2); + for (int i=0; i<32; i++) { + ASSERT_EQ(SPResult.u.s8[i],buf[i]); + } +} + +TEST(SuperVectorUtilsTest,And256c){ + auto SPResult = SuperVector<32>::Zeroes() & SuperVector<32>::Ones(); + for (int i=0; i<32; i++) { + ASSERT_EQ(SPResult.u.u8[i],0); + } +} + +TEST(SuperVectorUtilsTest,OPAnd256c){ + auto SP1 = SuperVector<32>::Zeroes(); + auto SP2 = SuperVector<32>::Ones(); + SP2 = SP2.opand(SP1); + for (int i=0; i<32; i++) { + ASSERT_EQ(SP2.u.u8[i],0); + } +} + +TEST(SuperVectorUtilsTest,OR256c){ + auto SPResult = SuperVector<32>::Zeroes() | SuperVector<32>::Ones(); + for (int i=0; i<32; i++) { + ASSERT_EQ(SPResult.u.u8[i],0xff); + } +} + +TEST(SuperVectorUtilsTest,XOR256c){ + srand (time(NULL)); + u8 vec[32]; + for (int i=0; i<32; i++) { + vec[i] = rand() % 100 + 1; + } + u8 vec2[32]; + for (int i=0; i<32; i++) { + vec2[i] = rand() % 100 + 1; + } + auto SP1 = SuperVector<32>::loadu(vec); + auto SP2 = SuperVector<32>::loadu(vec2); + auto SPResult = SP1 ^ SP2; + for (int i=0; i<32; i++) { + ASSERT_EQ(SPResult.u.u8[i],vec[i] ^ vec2[i]); + } +} + + +TEST(SuperVectorUtilsTest,OPXOR256c){ + srand (time(NULL)); + u8 vec[32]; + for (int i=0; i<32; i++) { + vec[i] = rand() % 100 + 1; + } + u8 vec2[32]; + for (int i=0; i<32; i++) { + vec2[i] = rand() % 100 + 1; + } + auto SP1 = SuperVector<32>::loadu(vec); + auto SP2 = SuperVector<32>::loadu(vec2); + auto SPResult = SP1.opxor(SP2); + for (int i=0; i<32; i++) { + ASSERT_EQ(SPResult.u.u8[i],vec[i] ^ vec2[i]); + } +} + +TEST(SuperVectorUtilsTest,OPANDNOT256c){ + auto SP1 = SuperVector<32>::Zeroes(); + auto SP2 = SuperVector<32>::Ones(); + SP2 = SP2.opandnot(SP1); + for (int i=0; i<32; i++) { + ASSERT_EQ(SP2.u.s8[i],0); + } +} + +TEST(SuperVectorUtilsTest,Movemask256c){ + srand (time(NULL)); + u8 vec[32] = {0}; + u8 vec2[32] = {0}; + u32 r = rand() % 100 + 1; + for(int i=0; 
i<32; i++) { + if (r & (1 << i)) { + vec[i] = 0xff; + } + } + auto SP = SuperVector<32>::loadu(vec); + u64a mask = SP.comparemask(); + for(int i=0; i<32; i++) { + if (mask & (1ull << (i * SuperVector<32>::mask_width()))) { + vec2[i] = 0xff; + } + } + for (int i=0; i<32; i++) { + ASSERT_EQ(vec[i],vec2[i]); + } +} + + +TEST(SuperVectorUtilsTest,Eqmask256c){ + srand (time(NULL)); + u8 vec[32]; + for (int i = 0; i<32; i++) { vec[i] = rand() % 64 + 0;} + u8 vec2[32]; + for (int i = 0; i<32; i++) { vec2[i]= rand() % 100 + 67;} + auto SP = SuperVector<32>::loadu(vec); + auto SP1 = SuperVector<32>::loadu(vec2); + u64a mask = SP.eqmask(SP); + for (u32 i = 0; i < 32; ++i) { + ASSERT_TRUE(mask & (1ull << (i * SuperVector<32>::mask_width()))); + } + mask = SP.eqmask(SP1); + ASSERT_EQ(mask,0); + vec2[0] = vec[0]; + vec2[1] = vec[1]; + auto SP2 = SuperVector<32>::loadu(vec2); + mask = SP.eqmask(SP2); + ASSERT_TRUE(mask & 1); + ASSERT_TRUE(mask & (1ull << SuperVector<32>::mask_width())); + for (u32 i = 2; i < 32; ++i) { + ASSERT_FALSE(mask & (1ull << (i * SuperVector<32>::mask_width()))); + } +} + +TEST(SuperVectorUtilsTest,pshufb256c) { + srand (time(NULL)); + u8 vec[32]; + for (int i=0; i<32; i++) { + vec[i] = rand() % 100 + 1; + } + u8 vec2[32]; + for (int i=0; i<32; i++) { + vec2[i]=i; + } + auto SP1 = SuperVector<32>::loadu(vec); + auto SP2 = SuperVector<32>::loadu(vec2); + auto SResult = SP1.pshufb(SP2); + for (int i=0; i<32; i++) { + ASSERT_EQ(vec[vec2[i]],SResult.u.u8[i]); + } +} + + +/*Define LSHIFT256 macro*/ +#define TEST_LSHIFT256(buf, vec, v, l) { \ + auto v_shifted = v << (l); \ + for (int i=31; i>= l; --i) { \ + buf[i] = vec[i-l]; \ + } \ + for (int i=0; i::loadu(vec); + u8 buf[32]; + for (int j = 0; j<32; j++) { + TEST_LSHIFT256(buf, vec, SP, j); + } +} + + +TEST(SuperVectorUtilsTest,LShift64_256c){ + u64a vec[4] = {128, 512, 256, 1024}; + auto SP = SuperVector<32>::loadu(vec); + for(int s = 0; s<32; s++) { + auto SP_after_shift = SP.vshl_64(s); + for (int i=0; i<4; i++) { + ASSERT_EQ(SP_after_shift.u.u64[i], vec[i] << s); + } + } +} + +TEST(SuperVectorUtilsTest,RShift64_256c){ + u64a vec[4] = {128, 512, 256, 1024}; + auto SP = SuperVector<32>::loadu(vec); + for(int s = 0; s<32; s++) { + auto SP_after_shift = SP.vshr_64(s); + for (int i=0; i<4; i++) { + ASSERT_EQ(SP_after_shift.u.u64[i], vec[i] >> s); + } + } +} + + +/*Define RSHIFT256 macro*/ +#define TEST_RSHIFT256(buf, vec, v, l) { \ + auto v_shifted = v >> (l); \ + for (int i=0; i<32-l; i++) { \ + buf[i] = vec[i+l]; \ + } \ + for (int i=32-l; i<32; i++) { \ + buf[i] = 0; \ + } \ + for(int i=0; i<32; i++) { \ + ASSERT_EQ(v_shifted.u.u8[i], buf[i]); \ + } \ + } + +TEST(SuperVectorUtilsTest,RShift256c){ + u8 vec[32]; + for (int i = 0; i<32; i++) { vec[i]= i+1;} + auto SP = SuperVector<32>::loadu(vec); + u8 buf[32]; + for (int j = 0; j<32; j++) { + TEST_RSHIFT256(buf, vec, SP, j); + } +} + + + + + +/*Define LSHIFT128_256 macro*/ +#define TEST_LSHIFT128_256(buf, vec, v, l) { \ + auto v_shifted = v.vshl_128(l); \ + for (int i=15; i>= l; --i) { \ + buf[i] = vec[i-l]; \ + buf[i+16] = vec[(16+i)-l]; \ + } \ + for (int i=0; i::loadu(vec); + u8 buf[32]; + for (int j=0; j<16; j++) { + TEST_LSHIFT128_256(buf, vec, SP, j); + } +} + +/*Define RSHIFT128_128 macro*/ +#define TEST_RSHIFT128_256(buf, vec, v, l) { \ + auto v_shifted = v.vshr_128(l); \ + for (int i=0; i<16-l; i++) { \ + buf[i] = vec[i+l]; \ + buf[i+16] = vec[(i+16)+l]; \ + } \ + for (int i=16-l; i<16; i++) { \ + buf[i] = 0; \ + buf[i+16] = 0; \ + } \ + for(int i=0; i<32; i++) { \ + 
ASSERT_EQ(v_shifted.u.u8[i], buf[i]); \ + } \ + } + +TEST(SuperVectorUtilsTest,RShift128_256c){ + u8 vec[32]; + for (int i = 0; i<32; i++ ){ vec[i] = i+1; } + auto SP = SuperVector<32>::loadu(vec); + u8 buf[32]; + for(int j=0; j<16; j++) { + TEST_RSHIFT128_256(buf, vec, SP, j); + } +} + + +/*Define ALIGNR256 macro*/ +#define TEST_ALIGNR256(v1, v2, buf, l) { \ + auto v_aligned = v2.alignr(v1, l); \ + for (size_t i=0; i<32; i++) { \ + ASSERT_EQ(v_aligned.u.u8[i], vec[i + l]); \ + } \ + } + +TEST(SuperVectorUtilsTest,Alignr256c){ + u8 vec[64]; + for (int i=0; i<64; i++) { + vec[i]=i; + } + auto SP1 = SuperVector<32>::loadu(vec); + auto SP2 = SuperVector<32>::loadu(vec+32); + for(int j=0; j<32; j++) { + TEST_ALIGNR256(SP1, SP2, vec, j); + } +} + +#endif // HAVE_AVX2 + + +#if defined(HAVE_AVX512) + +TEST(SuperVectorUtilsTest, Zero512c) { + auto zeroes = SuperVector<64>::Zeroes(); + u8 buf[64]{0}; + for(int i=0; i<64; i++) { + ASSERT_EQ(zeroes.u.u8[i],buf[i]); + } +} + +TEST(SuperVectorUtilsTest, Ones512c) { + auto ones = SuperVector<64>::Ones(); + u8 buf[64]; + for (int i=0; i<64; i++) { buf[i]=0xff; } + for(int i=0; i<64; i++) { + ASSERT_EQ(ones.u.u8[i],buf[i]); + } +} + +TEST(SuperVectorUtilsTest, Loadu512c) { + u8 vec[128]; + for(int i=0; i<128;i++) { vec[i]=i; } + for(int i=0; i<=64;i++) { + auto SP = SuperVector<64>::loadu(vec+i); + for(int j=0; j<64; j++) { + ASSERT_EQ(SP.u.u8[j],vec[j+i]); + } + } +} + +TEST(SuperVectorUtilsTest, Load512c) { + u8 ALIGN_ATTR(64) vec[128]; + for(int i=0; i<128;i++) { vec[i]=i; } + for(int i=0;i<=64;i+=64) { + auto SP = SuperVector<64>::load(vec+i); + for(int j=0; j<64; j++){ + ASSERT_EQ(SP.u.u8[j],vec[j+i]); + } + } +} + +TEST(SuperVectorUtilsTest,Equal512c){ + u8 vec[128]; + for (int i=0; i<128; i++) {vec[i]=i;}; + auto SP1 = SuperVector<64>::loadu(vec); + auto SP2 = SuperVector<64>::loadu(vec+64); + u8 buf[64]={0}; + /*check for equality byte by byte*/ + for (int s=0; s<64; s++){ + if(vec[s]==vec[s+64]){ + buf[s]=1; + } + } + auto SPResult = SP1.eq(SP2); + for (int i=0; i<64; i++) { + ASSERT_EQ(SPResult.u.s8[i],buf[i]); + } +} + +TEST(SuperVectorUtilsTest,And512c){ + auto SPResult = SuperVector<64>::Zeroes() & SuperVector<64>::Ones(); + for (int i=0; i<32; i++) { + ASSERT_EQ(SPResult.u.u8[i],0); + } +} + +TEST(SuperVectorUtilsTest,OPAnd512c){ + auto SP1 = SuperVector<64>::Zeroes(); + auto SP2 = SuperVector<64>::Ones(); + SP2 = SP2.opand(SP1); + for (int i=0; i<64; i++) { + ASSERT_EQ(SP2.u.u8[i],0); + } +} + +TEST(SuperVectorUtilsTest,OR512c){ + auto SPResult = SuperVector<64>::Zeroes() | SuperVector<64>::Ones(); + for (int i=0; i<64; i++) { + ASSERT_EQ(SPResult.u.u8[i],0xff); + } +} + +TEST(SuperVectorUtilsTest,XOR512c){ + srand (time(NULL)); + u8 vec[64]; + for (int i=0; i<64; i++) { + vec[i] = rand() % 100 + 1; + } + u8 vec2[64]; + for (int i=0; i<64; i++) { + vec2[i] = rand() % 100 + 1; + } + auto SP1 = SuperVector<64>::loadu(vec); + auto SP2 = SuperVector<64>::loadu(vec2); + auto SPResult = SP1 ^ SP2; + for (int i=0; i<64; i++) { + ASSERT_EQ(SPResult.u.u8[i],vec[i] ^ vec2[i]); + } +} + + +TEST(SuperVectorUtilsTest,OPXOR512c){ + srand (time(NULL)); + u8 vec[64]; + for (int i=0; i<64; i++) { + vec[i] = rand() % 100 + 1; + } + u8 vec2[64]; + for (int i=0; i<64; i++) { + vec2[i] = rand() % 100 + 1; + } + auto SP1 = SuperVector<64>::loadu(vec); + auto SP2 = SuperVector<64>::loadu(vec2); + auto SPResult = SP1.opxor(SP2); + for (int i=0; i<64; i++) { + ASSERT_EQ(SPResult.u.u8[i],vec[i] ^ vec2[i]); + } +} + +TEST(SuperVectorUtilsTest,OPANDNOT512c){ + auto 
SP1 = SuperVector<64>::Zeroes(); + auto SP2 = SuperVector<64>::Ones(); + SP2 = SP2.opandnot(SP1); + for (int i=0; i<64; i++) { + ASSERT_EQ(SP2.u.s8[i],0); + } +} + + +TEST(SuperVectorUtilsTest,Movemask512c){ + srand (time(NULL)); + u8 vec[64] = {0}; + u64a r = rand() % 100 + 1; + for(int i=0; i<64; i++) { + if (r & (1ULL << i)) { + vec[i] = 0xff; + } + } + auto SP = SuperVector<64>::loadu(vec); + u8 vec2[64] = {0}; + u64a mask = SP.comparemask(); + for(int i=0; i<64; i++) { + if (mask & (1ULL << i)) { + vec2[i] = 0xff; + } + } + for (int i=0; i<64; i++){ + //printf("%d) vec =%i , vec2 = %i \n",i,vec[i],vec2[i]); + ASSERT_EQ(vec[i],vec2[i]); + } +} + + +TEST(SuperVectorUtilsTest,Eqmask512c){ + srand (time(NULL)); + u8 vec[64]; + for (int i = 0; i<64; i++) { vec[i] = rand() % 64 + 0;} + u8 vec2[64]; + for (int i = 0; i<64; i++) { vec2[i]= rand() % 100 + 67;} + auto SP = SuperVector<64>::loadu(vec); + auto SP1 = SuperVector<64>::loadu(vec2); + u64a mask = SP.eqmask(SP); + // Mask width for 64 bit type cannot be more than 1. + ASSERT_EQ(SuperVector<64>::mask_width(), 1); + ASSERT_EQ(mask,0xFFFFFFFFFFFFFFFF); + mask = SP.eqmask(SP1); + ASSERT_EQ(mask,0); + vec2[0] = vec[0]; + vec2[1] = vec[1]; + auto SP2 = SuperVector<64>::loadu(vec2); + mask = SP.eqmask(SP2); + ASSERT_EQ(mask,3); +} + +TEST(SuperVectorUtilsTest,pshufb512c) { + srand (time(NULL)); + u8 vec[64]; + for (int i=0; i<64; i++) { + vec[i] = rand() % 100 + 1; + } + u8 vec2[64]; + for (int i=0; i<64; i++) { + vec2[i]=i; + } + auto SP1 = SuperVector<64>::loadu(vec); + auto SP2 = SuperVector<64>::loadu(vec2); + auto SResult = SP1.pshufb(SP2); + for (int i=0; i<64; i++) { + ASSERT_EQ(vec[vec2[i]],SResult.u.u8[i]); + } +} + +/*Define LSHIFT512 macro*/ +#define TEST_LSHIFT512(buf, vec, v, l) { \ + auto v_shifted = v << (l); \ + for (int i=63; i>= l; --i) { \ + buf[i] = vec[i-l]; \ + } \ + for (int i=0; i::loadu(vec); + u8 buf[64]; + for (int j = 0; j<64; j++) { + TEST_LSHIFT512(buf, vec, SP, j); + } +} + + +TEST(SuperVectorUtilsTest,LShift64_512c){ + u64a vec[8] = {32, 64, 128, 256, 512, 512, 256, 1024}; + auto SP = SuperVector<64>::loadu(vec); + for(int s = 0; s<64; s++) { + auto SP_after_shift = SP.vshl_64(s); + for (int i=0; i<8; i++) { + ASSERT_EQ(SP_after_shift.u.u64[i], vec[i] << s); + } + } +} + +TEST(SuperVectorUtilsTest,RShift64_512c){ + u64a vec[8] = {32, 64, 128, 256, 512, 512, 256, 1024}; + auto SP = SuperVector<64>::loadu(vec); + for(int s = 0; s<64; s++) { + auto SP_after_shift = SP.vshr_64(s); + for (int i=0; i<8; i++) { + ASSERT_EQ(SP_after_shift.u.u64[i], vec[i] >> s); + } + } +} + + +/*Define RSHIFT512 macro*/ +#define TEST_RSHIFT512(buf, vec, v, l) { \ + auto v_shifted = v >> (l); \ + for (int i=0; i<64-l; i++) { \ + buf[i] = vec[i+l]; \ + } \ + for (int i=64-l; i<64; i++) { \ + buf[i] = 0; \ + } \ + for(int i=0; i<64; i++) { \ + ASSERT_EQ(v_shifted.u.u8[i], buf[i]); \ + } \ + } + +TEST(SuperVectorUtilsTest,RShift512c){ + u8 vec[64]; + for (int i = 0; i<64; i++) { vec[i]= i+1;} + auto SP = SuperVector<64>::loadu(vec); + u8 buf[64]; + for (int j = 0; j<64; j++) { + TEST_RSHIFT512(buf, vec, SP, j); + } +} + + +/*Define RSHIFT128_512 macro*/ +#define TEST_RSHIFT128_512(buf, vec, v, l) { \ + auto v_shifted = v.vshr_128(l); \ + for (int i=0; i<16-l; i++) { \ + buf[i] = vec[i+l]; \ + buf[i+16] = vec[(i+16)+l]; \ + buf[i+32] = vec[(i+32)+l]; \ + buf[i+48] = vec[(i+48)+l]; \ + } \ + for (int i=16-l; i<16; i++) { \ + buf[i] = 0; \ + buf[i+16] = 0; \ + buf[i+32] = 0; \ + buf[i+48] = 0; \ + } \ + for(int i=0; i<64; i++) { \ + 
ASSERT_EQ(v_shifted.u.u8[i], buf[i]); \ + } \ + } +TEST(SuperVectorUtilsTest,RShift128_512c){ + u8 vec[64]; + for (int i = 0; i<64; i++ ){ vec[i] = i+1; } + auto SP = SuperVector<64>::loadu(vec); + u8 buf[64] = {1}; + for(int j=0; j<16; j++){ + TEST_RSHIFT128_512(buf, vec, SP, j) + } +} + +/*Define LSHIFT512 macro*/ +#define TEST_LSHIFT128_512(buf, vec, v, l) { \ + auto v_shifted = v.vshl_128(l); \ + for (int i=15; i>=l; --i) { \ + buf[i] = vec[i-l]; \ + buf[i+16] = vec[(i+16)-l]; \ + buf[i+32] = vec[(i+32)-l]; \ + buf[i+48] = vec[(i+48)-l]; \ + } \ + for (int i=0; i::loadu(vec); + u8 buf[64] = {1}; + for(int j=0; j<16;j++){ + TEST_LSHIFT128_512(buf, vec, SP, j); + } +} + + +/*Define ALIGNR512 macro*/ +#define TEST_ALIGNR512(v1, v2, buf, l) { \ + auto v_aligned = v1.alignr(v2, l); \ + for (size_t i=0; i<64; i++) { \ + ASSERT_EQ(v_aligned.u.u8[i], vec[i + l]); \ + } \ + } + +TEST(SuperVectorUtilsTest,Alignr512c){ + u8 vec[128]; + for (int i=0; i<128; i++) { + vec[i]=i; + } + auto SP1 = SuperVector<64>::loadu(vec); + auto SP2 = SuperVector<64>::loadu(vec+64); + for(int j=0; j<64; j++){ + TEST_ALIGNR512(SP1, SP2, vec, j); + } +} + +#endif // HAVE_AVX512 diff --git a/unit/internal/utf8_validate.cpp b/unit/internal/utf8_validate.cpp index 033579420..03f529036 100644 --- a/unit/internal/utf8_validate.cpp +++ b/unit/internal/utf8_validate.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2015-2022, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -64,8 +64,8 @@ static ValidUtf8TestInfo valid_utf8_tests[] = { {"공동경비구역", true}, {"জলসাঘর", true}, - // Invalid one-byte caseS. - {"\x7f", false}, + // Valid one-byte caseS. + {"\x7f", true}, // \x7f is valid // These bytes should never appear in a UTF-8 stream. 
{"\xc0", false}, diff --git a/unit/internal/vermicelli.cpp b/unit/internal/vermicelli.cpp index 5e4a82539..e6d976ade 100644 --- a/unit/internal/vermicelli.cpp +++ b/unit/internal/vermicelli.cpp @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -29,7 +30,7 @@ #include "config.h" #include "gtest/gtest.h" -#include "nfa/vermicelli.h" +#include "nfa/vermicelli.hpp" TEST(Vermicelli, ExecNoMatch1) { char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; @@ -522,3 +523,631 @@ TEST(DoubleVermicelliMasked, Exec4) { } } +#ifdef HAVE_SVE2 + +#include "nfa/vermicellicompile.h" +using namespace ue2; + +TEST(Vermicelli16, ExecNoMatch1) { + char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + CharReach chars; + chars.set('a'); + chars.set('B'); + chars.set('A'); + m128 matches; + bool ret = vermicelli16Build(chars, (u8 *)&matches); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + for (size_t j = 0; j < 16; j++) { + const u8 *rv = vermicelli16Exec(matches, buf + i, buf + strlen(t1) - j); + ASSERT_EQ(buf + strlen(t1) - j, rv); + } + } +} + +TEST(Vermicelli16, Exec1) { + char t1[] = "bbbbbbbbbbbbbbbbbabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbabbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + CharReach chars; + chars.set('a'); + chars.set('A'); + m128 matches; + bool ret = vermicelli16Build(chars, (u8 *)&matches); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = vermicelli16Exec(matches, buf + i, buf + strlen(t1)); + ASSERT_EQ(buf + 17, rv); + } +} + +TEST(Vermicelli16, Exec2) { + char t1[] = "bbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbabbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + CharReach chars; + chars.set('a'); + chars.set('A'); + m128 matches; + bool ret = vermicelli16Build(chars, (u8 *)&matches); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = vermicelli16Exec(matches, buf + i, buf + strlen(t1)); + ASSERT_EQ(buf + 17, rv); + } +} + +TEST(Vermicelli16, Exec3) { + char t1[] = "bbbbbbbbbbbbbbbbbAaaaaaaaaaaaaaaaaaaaaaabbbbbbbbabbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + CharReach chars; + chars.set('a'); + m128 matches_a; + bool ret = vermicelli16Build(chars, (u8 *)&matches_a); + ASSERT_TRUE(ret); + + chars.set('A'); + m128 matches_A; + ret = vermicelli16Build(chars, (u8 *)&matches_A); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = vermicelli16Exec(matches_a, buf + i, buf + strlen(t1)); + ASSERT_EQ(buf + 18, rv); + + rv = vermicelli16Exec(matches_A, buf + i, buf + strlen(t1)); + ASSERT_EQ(buf + 17, rv); + } +} + +TEST(Vermicelli16, Exec4) { + char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + CharReach chars; + chars.set('a'); + m128 matches_a; + bool ret = vermicelli16Build(chars, (u8 *)&matches_a); + ASSERT_TRUE(ret); + + chars.set('A'); + m128 matches_A; + ret = vermicelli16Build(chars, (u8 *)&matches_A); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 31; i++) { + t1[48 - i] = 'a'; + const u8 *rv = vermicelli16Exec(matches_a, buf, buf + strlen(t1)); + ASSERT_EQ(buf + 48 - i, rv); + + rv = vermicelli16Exec(matches_A, buf, buf + strlen(t1)); + ASSERT_EQ(buf + 48 - i, rv); + } +} + +TEST(Vermicelli16, Exec5) { + char t1[] = 
"qqqqqqqqqqqqqqqqqabcdefghijklmnopqqqqqqqqqqqqqqqqqqqqq"; + const u8 *buf = (const u8 *)t1; + + CharReach chars; + m128 matches[16]; + bool ret; + + for (int i = 0; i < 16; ++i) { + chars.set('p' - i); + ret = vermicelli16Build(chars, (u8 *)&matches[i]); + ASSERT_TRUE(ret); + } + + for (int j = 0; j < 16; ++j) { + for (size_t i = 0; i < 16; i++) { + const u8 *rv = vermicelli16Exec(matches[j], buf + i,buf + strlen(t1)); + ASSERT_EQ(buf - j + 32, rv); + } + } +} + +TEST(NVermicelli16, ExecNoMatch1) { + char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + CharReach chars; + chars.set('b'); + chars.set('B'); + chars.set('A'); + m128 matches; + bool ret = vermicelli16Build(chars, (u8 *)&matches); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + for (size_t j = 0; j < 16; j++) { + const u8 *rv = nvermicelli16Exec(matches, buf + i, buf + strlen(t1) - j); + ASSERT_EQ((buf + strlen(t1) - j), rv); + } + } +} + +TEST(NVermicelli16, Exec1) { + char t1[] = "bbbbbbbbbbbbbbbbbabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbabbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + CharReach chars; + chars.set('b'); + chars.set('A'); + m128 matches; + bool ret = vermicelli16Build(chars, (u8 *)&matches); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = nvermicelli16Exec(matches, buf + i, buf + strlen(t1)); + ASSERT_EQ(buf + 17, rv); + } +} + +TEST(NVermicelli16, Exec2) { + char t1[] = "bbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbabbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + CharReach chars; + chars.set('b'); + chars.set('A'); + m128 matches; + bool ret = vermicelli16Build(chars, (u8 *)&matches); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = nvermicelli16Exec(matches, buf + i, buf + strlen(t1)); + ASSERT_EQ(buf + 17, rv); + } +} + +TEST(NVermicelli16, Exec3) { + char t1[] = "bbbbbbbbbbbbbbbbbAaaaaaaaaaaaaaaaaaaaaaabbbbbbbbabbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + CharReach chars; + chars.set('b'); + m128 matches_b; + bool ret = vermicelli16Build(chars, (u8 *)&matches_b); + ASSERT_TRUE(ret); + + chars.set('A'); + m128 matches_A; + ret = vermicelli16Build(chars, (u8 *)&matches_A); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = nvermicelli16Exec(matches_b, buf + i, buf + strlen(t1)); + ASSERT_EQ(buf + 17, rv); + + rv = nvermicelli16Exec(matches_A, buf + i, buf + strlen(t1)); + ASSERT_EQ(buf + 18, rv); + } +} + +TEST(NVermicelli16, Exec4) { + char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + const u8 *buf = (const u8 *)t1; + + CharReach chars; + chars.set('b'); + m128 matches_b; + bool ret = vermicelli16Build(chars, (u8 *)&matches_b); + ASSERT_TRUE(ret); + + chars.set('A'); + m128 matches_A; + ret = vermicelli16Build(chars, (u8 *)&matches_A); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 31; i++) { + t1[48 - i] = 'a'; + const u8 *rv = nvermicelli16Exec(matches_b, buf, buf + strlen(t1)); + ASSERT_EQ(buf + 48 - i, rv); + + rv = nvermicelli16Exec(matches_A, buf, buf + strlen(t1)); + ASSERT_EQ(buf + 48 - i, rv); + } +} + +TEST(NVermicelli16, Exec5) { + char t1[] = "aaaaaaaaaaaaaaaaaabcdefghijklmnopqaaaaaaaaaaaaaaaaaaaaa"; + const u8 *buf = (const u8 *)t1; + + CharReach chars; + m128 matches[16]; + bool ret; + + for (int i = 0; i < 16; ++i) { + chars.set('a' + i); + ret = vermicelli16Build(chars, (u8 *)&matches[i]); + ASSERT_TRUE(ret); + } + + for (int j = 0; j < 16; ++j) { + for (size_t i = 0; i < 16; i++) { + const u8 
*rv = nvermicelli16Exec(matches[j], buf + i, buf + strlen(t1)); + ASSERT_EQ(buf + j + 18, rv); + } + } +} + +TEST(DoubleVermicelli16, ExecNoMatch1) { + std::string t1("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"); + const u8 *t1_raw = (const u8 *)t1.c_str(); + + m128 matches; + u64a firsts; + flat_set> pairs; + for (int i = 0; i < 16; i += 2) { + pairs.insert(std::make_pair('a' + i, 'a' + i + 1)); + } + bool ret = vermicelliDouble16Build(pairs, (u8 *)&matches, (u8 *)&firsts); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + for (size_t j = 0; j < 16; j++) { + const u8 *rv = vermicelliDouble16Exec(matches, firsts, + t1_raw + i, t1_raw + t1.length() - i - j); + ASSERT_EQ(t1_raw + t1.length() - i - j, rv); + } + } +} + +TEST(DoubleVermicelli16, ExecNoMatch2) { + std::string t1("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"); + const u8 *t1_raw = (const u8 *)t1.c_str(); + + m128 matches; + u64a firsts; + flat_set> pairs; + pairs.insert(std::make_pair('a', 'b')); + pairs.insert(std::make_pair('A', 'B')); + pairs.insert(std::make_pair('B', 'A')); + pairs.insert(std::make_pair('B', 'B')); + bool ret = vermicelliDouble16Build(pairs, (u8 *)&matches, (u8 *)&firsts); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + for (size_t j = 0; j < 16; j++) { + const u8 *rv = vermicelliDouble16Exec(matches, firsts, + t1_raw + i, t1_raw + t1.length() - i - j); + ASSERT_EQ(t1_raw + t1.length() - i - j, rv); + } + } +} + +TEST(DoubleVermicelli16, ExecNoMatch3) { + std::string t1("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"); + const u8 *t1_raw = (const u8 *)t1.c_str(); + + m128 matches; + u64a firsts; + flat_set> pairs; + pairs.insert(std::make_pair('a', 'b')); + pairs.insert(std::make_pair('B', 'B')); + pairs.insert(std::make_pair('A', 'B')); + pairs.insert(std::make_pair('b', 'a')); + bool ret = vermicelliDouble16Build(pairs, (u8 *)&matches, (u8 *)&firsts); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + for (size_t j = 0; j < 16; j++) { + /* partial match */ + const u8 *rv = vermicelliDouble16Exec(matches, firsts, + t1_raw + i, t1_raw + t1.length() - i - j); + ASSERT_EQ(t1_raw + t1.length() - i - j - 1, rv); + } + } +} + +TEST(DoubleVermicelli16, Exec1) { + std::string t1("bbbbbbbbbbbbbbbbbbabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbabbbbbbbbbbb"); + const u8 *t1_raw = (const u8 *)t1.c_str(); + + m128 matches; + u64a firsts; + flat_set> pairs; + pairs.insert(std::make_pair('a', 'b')); + bool ret = vermicelliDouble16Build(pairs, (u8 *)&matches, (u8 *)&firsts); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = vermicelliDouble16Exec(matches, firsts, + t1_raw + i, t1_raw + t1.length() - i); + ASSERT_EQ(t1_raw + 18, rv); + } + + pairs.insert(std::make_pair('b', 'a')); + ret = vermicelliDouble16Build(pairs, (u8 *)&matches, (u8 *)&firsts); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = vermicelliDouble16Exec(matches, firsts, + t1_raw + i, t1_raw + t1.length() - i); + ASSERT_EQ(t1_raw + 17, rv); + } +} + +TEST(DoubleVermicelli16, Exec2) { + std::string t1("bbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbaaaaabbbbbbbb"); + const u8 *t1_raw = (const u8 *)t1.c_str(); + + m128 matches; + u64a firsts; + flat_set> pairs; + pairs.insert(std::make_pair('a', 'a')); + bool ret = vermicelliDouble16Build(pairs, (u8 *)&matches, (u8 *)&firsts); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = vermicelliDouble16Exec(matches, firsts, + t1_raw + i, t1_raw + t1.length() - i); + 
ASSERT_EQ(t1_raw + 17, rv); + } +} + +TEST(DoubleVermicelliMasked16, ExecNoMatch1) { + std::string t1("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"); + const u8 *t1_raw = (const u8 *)t1.c_str(); + + m128 matches1; + bool ret = vermicelliDoubleMasked16Build('a', 'b', 0xff, 0xff, (u8 *)&matches1); + ASSERT_TRUE(ret); + m128 matches2; + ret = vermicelliDoubleMasked16Build('B', 'B', 0xff, CASE_CLEAR, (u8 *)&matches2); + ASSERT_TRUE(ret); + m128 matches3; + ret = vermicelliDoubleMasked16Build('A', 'B', CASE_CLEAR, CASE_CLEAR, (u8 *)&matches3); + ASSERT_TRUE(ret); + m128 matches4; + ret = vermicelliDoubleMasked16Build('B', 'B', CASE_CLEAR, 0xff, (u8 *)&matches4); + ASSERT_TRUE(ret); + m128 matches5; + ret = vermicelliDoubleMasked16Build('B', 'A', 0xff, 0xff, (u8 *)&matches5); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + for (size_t j = 0; j < 16; j++) { + const u8 *rv = vermicelliDoubleMasked16Exec(matches1, 'a', 0xff, + t1_raw + i, t1_raw + t1.length() - i - j); + ASSERT_EQ(t1_raw + t1.length() - i - j, rv); + + rv = vermicelliDoubleMasked16Exec(matches2, 'B', 0xff, t1_raw + i, + t1_raw + t1.length() - i - j); + ASSERT_EQ(t1_raw + t1.length() - i - j, rv); + + rv = vermicelliDoubleMasked16Exec(matches3, 'A', CASE_CLEAR, + t1_raw + i, t1_raw + t1.length() - i - j); + ASSERT_EQ(t1_raw + t1.length() - i - j, rv); + + /* partial match */ + rv = vermicelliDoubleMasked16Exec(matches4, 'B', CASE_CLEAR, + t1_raw + i, t1_raw + t1.length() - i - j); + ASSERT_EQ(t1_raw + t1.length() - i - j - 1, rv); + + rv = vermicelliDoubleMasked16Exec(matches5, 'B', 0xff, t1_raw + i, + t1_raw + t1.length() - i - j); + ASSERT_EQ(t1_raw + t1.length() - i - j, rv); + } + } +} + +TEST(DoubleVermicelliMasked16, Exec1) { + std::string t1("bbbbbbbbbbbbbbbbbbabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbabbbbbbbbbbb"); + const u8 *t1_raw = (const u8 *)t1.c_str(); + + m128 matches1; + bool ret = vermicelliDoubleMasked16Build('a', 'b', 0xff, 0xff, (u8 *)&matches1); + ASSERT_TRUE(ret); + m128 matches2; + ret = vermicelliDoubleMasked16Build('A', 'B', CASE_CLEAR, CASE_CLEAR, (u8 *)&matches2); + ASSERT_TRUE(ret); + m128 matches3; + ret = vermicelliDoubleMasked16Build('a', 'B', 0xff, CASE_CLEAR, (u8 *)&matches3); + ASSERT_TRUE(ret); + m128 matches4; + ret = vermicelliDoubleMasked16Build('A', 'b', CASE_CLEAR, 0xff, (u8 *)&matches4); + ASSERT_TRUE(ret); + m128 matches5; + ret = vermicelliDoubleMasked16Build('b', 'a', 0xff, 0xff, (u8 *)&matches5); + ASSERT_TRUE(ret); + m128 matches6; + ret = vermicelliDoubleMasked16Build('B', 'A', CASE_CLEAR, CASE_CLEAR, (u8 *)&matches6); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = vermicelliDoubleMasked16Exec(matches1, 'a', 0xff, + t1_raw + i, t1_raw + t1.length() - i); + ASSERT_EQ(t1_raw + 18, rv); + + rv = vermicelliDoubleMasked16Exec(matches2, 'A', CASE_CLEAR, + t1_raw + i, t1_raw + t1.length() - i); + ASSERT_EQ(t1_raw + 18, rv); + + rv = vermicelliDoubleMasked16Exec(matches3, 'a', 0xff, + t1_raw + i, t1_raw + t1.length() - i); + ASSERT_EQ(t1_raw + 18, rv); + + rv = vermicelliDoubleMasked16Exec(matches4, 'A', CASE_CLEAR, + t1_raw + i, t1_raw + t1.length() - i); + ASSERT_EQ(t1_raw + 18, rv); + + rv = vermicelliDoubleMasked16Exec(matches5, 'b', 0xff, + t1_raw + i, t1_raw + t1.length() - i); + ASSERT_EQ(t1_raw + 17, rv); + + rv = vermicelliDoubleMasked16Exec(matches6, 'B', CASE_CLEAR, + t1_raw + i, t1_raw + t1.length() - i); + ASSERT_EQ(t1_raw + 17, rv); + } +} + +TEST(DoubleVermicelliMasked16, Exec2) { + std::string 
t1("bbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbaaaaabbbbbbbb"); + const u8 *t1_raw = (const u8 *)t1.c_str(); + + m128 matches1; + bool ret = vermicelliDoubleMasked16Build('a', 'a', 0xff, 0xff, (u8 *)&matches1); + ASSERT_TRUE(ret); + m128 matches2; + ret = vermicelliDoubleMasked16Build('A', 'A', CASE_CLEAR, CASE_CLEAR, (u8 *)&matches2); + ASSERT_TRUE(ret); + m128 matches3; + ret = vermicelliDoubleMasked16Build('a', 'A', 0xff, CASE_CLEAR, (u8 *)&matches3); + ASSERT_TRUE(ret); + m128 matches4; + ret = vermicelliDoubleMasked16Build('A', 'a', CASE_CLEAR, 0xff, (u8 *)&matches4); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = vermicelliDoubleMasked16Exec(matches1, 'a', 0xff, + t1_raw + i, t1_raw + t1.length() - i); + ASSERT_EQ(t1_raw + 17, rv); + + rv = vermicelliDoubleMasked16Exec(matches2, 'A', CASE_CLEAR, + t1_raw + i, t1_raw + t1.length() - i); + ASSERT_EQ(t1_raw + 17, rv); + + rv = vermicelliDoubleMasked16Exec(matches3, 'a', 0xff, + t1_raw + i, t1_raw + t1.length() - i); + ASSERT_EQ(t1_raw + 17, rv); + + rv = vermicelliDoubleMasked16Exec(matches4, 'A', CASE_CLEAR, + t1_raw + i, t1_raw + t1.length() - i); + ASSERT_EQ(t1_raw + 17, rv); + } +} + +TEST(DoubleVermicelliMasked16, Exec3) { + /* 012345678901234567890123 */ + std::string t1("bbbbbbbbbbbbbbbbbaAaaAAaaaaaaaaaaaaaaaaaabbbbbbbaaaaabbbbbbbb"); + const u8 *t1_raw = (const u8 *)t1.c_str(); + + m128 matches1; + bool ret = vermicelliDoubleMasked16Build('A', 'a', 0xff, 0xff, (u8 *)&matches1); + ASSERT_TRUE(ret); + m128 matches2; + ret = vermicelliDoubleMasked16Build('A', 'A', CASE_CLEAR, CASE_CLEAR, (u8 *)&matches2); + ASSERT_TRUE(ret); + m128 matches3; + ret = vermicelliDoubleMasked16Build('A', 'A', 0xff, 0xff, (u8 *)&matches3); + ASSERT_TRUE(ret); + m128 matches4; + ret = vermicelliDoubleMasked16Build('a', 'A', 0xff, 0xff, (u8 *)&matches4); + ASSERT_TRUE(ret); + m128 matches5; + ret = vermicelliDoubleMasked16Build('a', 'A', 0xff, CASE_CLEAR, (u8 *)&matches5); + ASSERT_TRUE(ret); + m128 matches6; + ret = vermicelliDoubleMasked16Build('A', 'a', CASE_CLEAR, 0xff, (u8 *)&matches6); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = vermicelliDoubleMasked16Exec(matches1, 'A', 0xff, + t1_raw + i, t1_raw + t1.length() - i); + ASSERT_EQ(t1_raw + 18, rv); + + rv = vermicelliDoubleMasked16Exec(matches2, 'A', CASE_CLEAR, + t1_raw + i, t1_raw + t1.length() - i); + ASSERT_EQ(t1_raw + 17, rv); + + rv = vermicelliDoubleMasked16Exec(matches3, 'A', 0xff, + t1_raw + i, t1_raw + t1.length() - i); + ASSERT_EQ(t1_raw + 21, rv); + + rv = vermicelliDoubleMasked16Exec(matches4, 'a', 0xff, + t1_raw + i, t1_raw + t1.length() - i); + ASSERT_EQ(t1_raw + 17, rv); + + rv = vermicelliDoubleMasked16Exec(matches5, 'a', 0xff, + t1_raw + i, t1_raw + t1.length() - i); + ASSERT_EQ(t1_raw + 17, rv); + + rv = vermicelliDoubleMasked16Exec(matches6, 'A', CASE_CLEAR, + t1_raw + i, t1_raw + t1.length() - i); + ASSERT_EQ(t1_raw + 18, rv); + } +} + +TEST(DoubleVermicelliMasked16, Exec4) { + std::string t1("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"); + const u8 *t1_raw = (const u8 *)t1.c_str(); + + m128 matches1; + bool ret = vermicelliDoubleMasked16Build('a', 'a', 0xff, 0xff, (u8 *)&matches1); + ASSERT_TRUE(ret); + m128 matches2; + ret = vermicelliDoubleMasked16Build('A', 'A', CASE_CLEAR, CASE_CLEAR, (u8 *)&matches2); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 31; i++) { + t1[48 - i] = 'a'; + t1[48 - i + 1] = 'a'; + const u8 *rv = vermicelliDoubleMasked16Exec(matches1, 'a', 0xff, t1_raw, + 
t1_raw + t1.length()); + ASSERT_EQ(t1_raw + 48 - i, rv); + + rv = vermicelliDoubleMasked16Exec(matches2, 'A', CASE_CLEAR, t1_raw, + t1_raw + t1.length()); + ASSERT_EQ(t1_raw + 48 - i, rv); + } +} + +TEST(DoubleVermicelliMasked16, Exec5) { + std::string t1("bbbbbbbbbbbbbbbbbaCaGaOCaChBfcNgBFGiLbbbbbbbbbbbbbbbbbbbbbbbb"); + const u8 *t1_raw = (const u8 *)t1.c_str(); + + m128 matches1; + bool ret = vermicelliDoubleMasked16Build('a', 'B', 0xff, 0xde, (u8 *)&matches1); + ASSERT_TRUE(ret); + m128 matches2; + ret = vermicelliDoubleMasked16Build('a', 'D', 0xff, 0xdc, (u8 *)&matches2); + ASSERT_TRUE(ret); + m128 matches3; + ret = vermicelliDoubleMasked16Build('D', 'a', 0xdc, 0xff, (u8 *)&matches3); + ASSERT_TRUE(ret); + m128 matches4; + ret = vermicelliDoubleMasked16Build('A', 'B', 0xdf, 0xde, (u8 *)&matches4); + ASSERT_TRUE(ret); + m128 matches5; + ret = vermicelliDoubleMasked16Build('B', 'a', 0xde, 0xff, (u8 *)&matches5); + ASSERT_TRUE(ret); + m128 matches6; + ret = vermicelliDoubleMasked16Build('B', 'A', 0xde, 0xdf, (u8 *)&matches6); + ASSERT_TRUE(ret); + + for (size_t i = 0; i < 16; i++) { + const u8 *rv = vermicelliDoubleMasked16Exec(matches1, 'a', 0xff, + t1_raw + i, t1_raw + t1.length() - i); + ASSERT_EQ(t1_raw + 17, rv); + + rv = vermicelliDoubleMasked16Exec(matches2, 'a', 0xff, + t1_raw + i, t1_raw + t1.length() - i); + ASSERT_EQ(t1_raw + 19, rv); + + rv = vermicelliDoubleMasked16Exec(matches3, 'D', 0xdc, + t1_raw + i, t1_raw + t1.length() - i); + ASSERT_EQ(t1_raw + 20, rv); + + rv = vermicelliDoubleMasked16Exec(matches4, 'A', 0xdf, + t1_raw + i, t1_raw + t1.length() - i); + ASSERT_EQ(t1_raw + 17, rv); + + rv = vermicelliDoubleMasked16Exec(matches5, 'B', 0xde, + t1_raw + i, t1_raw + t1.length() - i); + ASSERT_EQ(t1_raw + 16, rv); + + rv = vermicelliDoubleMasked16Exec(matches6, 'B', 0xde, + t1_raw + i, t1_raw + t1.length() - i); + ASSERT_EQ(t1_raw + 16, rv); + } +} + +#endif // HAVE_SVE2 diff --git a/util/CMakeLists.txt b/util/CMakeLists.txt index ea942ef1a..97fd4c7d9 100644 --- a/util/CMakeLists.txt +++ b/util/CMakeLists.txt @@ -6,8 +6,10 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS} ${HS_CXX_FLAGS}") include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR} ${PROJECT_SOURCE_DIR}) +message("RAGEL_C_FLAGS" ${RAGEL_C_FLAGS}) + set_source_files_properties( - ${CMAKE_BINARY_DIR}/tools/ExpressionParser.cpp + ExpressionParser.cpp PROPERTIES COMPILE_FLAGS "${RAGEL_C_FLAGS}") diff --git a/util/ExpressionParser.rl b/util/ExpressionParser.rl index fec479229..b93f069d3 100644 --- a/util/ExpressionParser.rl +++ b/util/ExpressionParser.rl @@ -55,6 +55,7 @@ enum ParamKey { %%{ machine ExpressionParser; + alphtype unsigned char; action accumulateNum { num = (num * 10) + (fc - '0'); diff --git a/util/cross_compile.cpp b/util/cross_compile.cpp index 0d1369984..df2aff5a0 100644 --- a/util/cross_compile.cpp +++ b/util/cross_compile.cpp @@ -31,7 +31,6 @@ #include "cross_compile.h" #include "src/ue2common.h" #include "src/hs_compile.h" -#include "src/util/make_unique.h" #include #include @@ -74,7 +73,7 @@ unique_ptr xcompileReadMode(const char *s) { return nullptr; } else { DEBUG_PRINTF("cpu_features %llx\n", rv.cpu_features); - return ue2::make_unique(rv); + return std::make_unique(rv); } } diff --git a/util/expression_path.h b/util/expression_path.h index ac4ca97da..e667adbb2 100644 --- a/util/expression_path.h +++ b/util/expression_path.h @@ -38,10 +38,8 @@ #include #include -#if !defined(_WIN32) #include #include -#endif // // Utility functions @@ -52,7 +50,6 @@ */ static 
inline std::string inferExpressionPath(const std::string &sigFile) { -#ifndef _WIN32 // POSIX variant. // dirname() may modify its argument, so we must make a copy. @@ -60,25 +57,11 @@ std::string inferExpressionPath(const std::string &sigFile) { path.push_back(0); // ensure null termination. std::string rv = dirname(path.data()); -#else - // Windows variant. - if (sigFile.size() >= _MAX_DIR) { - return std::string(); - } - char path[_MAX_DIR]; - _splitpath(sigFile.c_str(), nullptr, path, nullptr, nullptr); - std::string rv(path); -#endif rv += "/../pcre"; return rv; } -#if defined(_WIN32) -#define stat _stat -#define S_IFREG _S_IFREG -#endif - static inline bool isDir(const std::string &filename) { struct stat s; diff --git a/util/expressions.cpp b/util/expressions.cpp index d6334bad9..74bf4ba21 100644 --- a/util/expressions.cpp +++ b/util/expressions.cpp @@ -40,14 +40,9 @@ #include #include -#if !defined(_WIN32) #include #include #include -#else -// Windows support is probably very fragile -#include -#endif #include @@ -98,11 +93,6 @@ void processLine(string &line, unsigned lineNum, } } -#if defined(_WIN32) -#define stat _stat -#define S_ISDIR(st_m) (_S_IFDIR & (st_m)) -#define S_ISREG(st_m) (_S_IFREG & (st_m)) -#endif void HS_CDECL loadExpressionsFromFile(const string &fname, ExpressionMap &exprMap) { struct stat st; if (stat(fname.c_str(), &st) != 0) { @@ -143,7 +133,6 @@ bool isIgnorable(const std::string &f) { return false; } -#ifndef _WIN32 void loadExpressions(const string &inPath, ExpressionMap &exprMap) { // Is our input path a file or a directory? struct stat st; @@ -197,62 +186,6 @@ void loadExpressions(const string &inPath, ExpressionMap &exprMap) { exit(1); } } -#else // windows TODO: improve -void HS_CDECL loadExpressions(const string &inPath, ExpressionMap &exprMap) { - // Is our input path a file or a directory? - struct stat st; - if (stat(inPath.c_str(), &st) != 0) { - cerr << "Can't stat path: '" << inPath << "'" << endl; - exit(1); - } - if (S_ISREG(st.st_mode)) { - // process file - try { - loadExpressionsFromFile(inPath, exprMap); - } catch (runtime_error &e) { - cerr << e.what() << ": '" << inPath << "'" << endl; - exit(1); - } - } else if (S_ISDIR(st.st_mode)) { - WIN32_FIND_DATA ffd; - HANDLE hFind = INVALID_HANDLE_VALUE; - string glob = inPath + "/*"; - hFind = FindFirstFile(glob.c_str(), &ffd); - if (hFind == INVALID_HANDLE_VALUE) { - cerr << "Can't open directory: '" << inPath << "'" << endl; - exit(1); - } - do { - string basename(ffd.cFileName); - string fname(inPath); - fname.push_back('/'); - fname.append(basename); - - // Ignore '.' and '..' - if (basename == "." || basename == "..") { - continue; - } - - // Skip emacs backup files, dotfiles (such as VIM swap). 
- if (isIgnorable(basename)) { - cerr << "Ignoring signature file " << fname << endl; - continue; - } - - try { - loadExpressionsFromFile(fname, exprMap); - } catch (runtime_error &e) { - cerr << e.what() << ": '" << fname << "'" << endl; - exit(1); - } - } while (FindNextFile(hFind, &ffd) != 0); - FindClose(hFind); - } else { - cerr << "Can't stat path: '" << inPath << "'" << endl; - exit(1); - } -} -#endif void HS_CDECL loadSignatureList(const string &inFile, SignatureSet &signatures) { diff --git a/util/ng_corpus_editor.cpp b/util/ng_corpus_editor.cpp index ac4f8b654..c1149216d 100644 --- a/util/ng_corpus_editor.cpp +++ b/util/ng_corpus_editor.cpp @@ -268,12 +268,12 @@ void CorpusEditorUtf8::flip_case(vector &corpus) { unichar CorpusEditorUtf8::chooseCodePoint(void) { /* We need to ensure that we don't pick a surrogate cp */ const u32 range = - MAX_UNICODE + 1 - (UNICODE_SURROGATE_MAX + UNICODE_SURROGATE_MIN + 1); + MAX_UNICODE + 1 - (UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1); unichar raw = props.rand(0, range - 1); if (raw < UNICODE_SURROGATE_MIN) { return raw; } else { - return raw + UNICODE_SURROGATE_MAX + 1; + return raw + UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1; } } diff --git a/util/ng_corpus_generator.cpp b/util/ng_corpus_generator.cpp index e5e8e06cd..6c3f613d2 100644 --- a/util/ng_corpus_generator.cpp +++ b/util/ng_corpus_generator.cpp @@ -41,7 +41,6 @@ #include "ue2common.h" #include "util/container.h" #include "util/graph_range.h" -#include "util/make_unique.h" #include "util/ue2string.h" #include "util/unicode_def.h" #include "util/unicode_set.h" @@ -141,7 +140,7 @@ void findPaths(const NGHolder &g, CorpusProperties &cProps, const size_t MAX_OPEN = min((size_t)1000, corpusLimit * 10); vector> open; - open.push_back(ue2::make_unique(1, g.start)); + open.push_back(std::make_unique(1, g.start)); unordered_set one_way_in; for (const auto &v : vertices_range(g)) { @@ -200,7 +199,7 @@ void findPaths(const NGHolder &g, CorpusProperties &cProps, if (boost::next(ai) == ae) { new_path = std::move(p); } else { - new_path = make_unique(*p); + new_path = std::make_unique(*p); } new_path->push_back(v); @@ -477,14 +476,14 @@ void CorpusGeneratorUtf8::generateCorpus(vector &data) { * that we've been asked for. 
*/ unichar CorpusGeneratorUtf8::getRandomChar() { u32 range = MAX_UNICODE + 1 - - (UNICODE_SURROGATE_MAX + UNICODE_SURROGATE_MIN + 1); + - (UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1); range = min(cProps.alphabetSize, range); assert(range); unichar c = 'a' + cProps.rand(0, range - 1); if (c >= UNICODE_SURROGATE_MIN) { - c =+ UNICODE_SURROGATE_MAX + 1; + c += UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1; } return c % (MAX_UNICODE + 1); @@ -714,8 +713,8 @@ unique_ptr makeCorpusGenerator(const NGHolder &graph, const ExpressionInfo &expr, CorpusProperties &props) { if (expr.utf8) { - return ue2::make_unique(graph, expr, props); + return std::make_unique(graph, expr, props); } else { - return ue2::make_unique(graph, expr, props); + return std::make_unique(graph, expr, props); } } diff --git a/util/ng_corpus_properties.cpp b/util/ng_corpus_properties.cpp index e784e0582..511ad60ac 100644 --- a/util/ng_corpus_properties.cpp +++ b/util/ng_corpus_properties.cpp @@ -42,7 +42,7 @@ CorpusProperties::CorpusProperties() : matchness(100), unmatchness(0), randomness(0), prefixRange(0, 0), suffixRange(0, 0), cycleMin(1), cycleMax(1), corpusLimit(DEFAULT_CORPUS_GENERATOR_LIMIT), editDistance(0), - alphabetSize(~0) { + alphabetSize(~0), rngSeed(0) { // empty } diff --git a/util/win_getopt.h b/util/win_getopt.h deleted file mode 100644 index 7ec9abfbc..000000000 --- a/util/win_getopt.h +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright (c) 2018, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef WIN_GETOPT_H -#define WIN_GETOPT_H - -#include -#define ILLEGAL (int)'?' 
-#define END -1 -#define SPECIAL_OPT 1 - -int optind = 0; -char *optarg; -static char EMPT[] = ""; -static char *ptr = EMPT; -static int no_argument = 0; -static int required_argument = 1; -static const char no_arg[] = "option doesn't take an argument --%.*s"; -static const char non_opt_string[] = "not an option : %s"; -static const char ill_shortopt_char[] = "unknown option -%c"; -static const char ill_longopt_string[] = "unknown option --%s"; -static const char req_arg_string[] = "option requires an argument --%s"; -static const char req_arg_char[] = "option requires an argument -%c"; - -struct option { - const char *name; - int has_arg; - int *flag; - int value; -}; - -static -void warn(const char *fmt, ...) { - va_list args; - va_start(args, fmt); - vfprintf(stdout, fmt, args); - fprintf(stdout, "\n"); - va_end(args); -} - -int getopt_long(int nargc, char *const *nargv, const char *options, - const struct option *long_options, int *idx) { - char *check, *equal; - size_t current_opt_len; - bool all_flag = false; - int match = -1; - // illegal - if (options == NULL) { - return ILLEGAL; - } - if (optind == 0) { - optind = 1; - } - if (optind >= nargc) { - return END; - } - if (*options == '-') { - all_flag = true; - ++options; - } - optarg = NULL; - // illegal - if (*(ptr = nargv[optind]) != '-') { - ptr = EMPT; - if (all_flag) { - optarg = nargv[optind++]; - return SPECIAL_OPT; - } else { - warn(non_opt_string, nargv[optind]); - return ILLEGAL; - } - } - // likely a short option ? - if (ptr[1] != '\0' && *++ptr != '-' && ptr[1] == '\0') { - char opt_char = *ptr; - ptr = EMPT; - // really short option ? - if ((check = (char *)strchr(options, opt_char)) != NULL) { - if (check[1] == ':') { - ++optind; - if (optind >= nargc) { - warn(req_arg_char, opt_char); - return ILLEGAL; - } else { - optarg = nargv[optind]; - } - } - ++optind; - return opt_char; - } else { // illegal - warn(ill_shortopt_char, opt_char); - return ILLEGAL; - } - } - // we meet '--' - if (*ptr == '-' && ptr[1] == '\0') { - ptr = EMPT; - return END; - } - // we meet '--foo' , long option - if (long_options != NULL && *ptr == '-' && ptr[1] != '\0') { - ++ptr; - if ((equal = strchr(ptr, '=')) != NULL) { - // found --option=arg - current_opt_len = equal - ptr; - ++equal; - } else { - current_opt_len = strlen(ptr); - } - for (int i = 0; long_options[i].name; i++) { - if (!strcmp(ptr, long_options[i].name )) { - match = i; - break; - } - } - if (match == -1) { // don't match - warn(ill_longopt_string, ptr); - ptr = EMPT; - return ILLEGAL; - } else { - ++optind; - if (long_options[match].has_arg == required_argument) { - if (equal) { - optarg = equal; - } else if (optind < nargc) { - optarg = nargv[optind++]; - } else { - warn(req_arg_string, ptr); - ptr = EMPT; - return ILLEGAL; - } - } - if (long_options[match].has_arg == no_argument && equal) { - warn(no_arg, (int)current_opt_len, ptr); - ptr = EMPT; - return ILLEGAL; - } - ptr = EMPT; - if (long_options[match].flag) { - *long_options[match].flag = long_options[match].value; - return 0; - } else { - return (long_options[match].value); - } - } - } - warn(non_opt_string, ptr); - ptr = EMPT; - return ILLEGAL; -} - -#endif
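
The AVX-512 SuperVector tests near the top of this patch (unit/internal/supervector.cpp) encode their expected results through macros such as TEST_ALIGNR512. As a reading aid, here is a small scalar model of the property that macro asserts for SuperVector<64>::alignr(other, l). This is an illustrative sketch in plain C++, not code from the patch, and the helper name alignr_model is invented for the example.

#include <array>
#include <cassert>
#include <cstdint>

// Models the property TEST_ALIGNR512 checks: after loading vec and vec+64,
// result[i] == concat(lo, hi)[i + l] == vec[i + l], for 0 <= l < 64.
static std::array<uint8_t, 64> alignr_model(const std::array<uint8_t, 64> &lo,
                                            const std::array<uint8_t, 64> &hi,
                                            unsigned l) {
    std::array<uint8_t, 64> out{};
    for (unsigned i = 0; i < 64; ++i) {
        unsigned idx = i + l;
        out[i] = idx < 64 ? lo[idx] : hi[idx - 64];
    }
    return out;
}

int main() {
    std::array<uint8_t, 64> lo, hi;
    for (unsigned i = 0; i < 64; ++i) { lo[i] = i; hi[i] = 64 + i; }
    auto r = alignr_model(lo, hi, 5);
    assert(r[0] == 5 && r[63] == 68); // 64 consecutive bytes taken at offset 5 of [lo|hi]
    return 0;
}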
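
The new SVE2 tests in unit/internal/vermicelli.cpp all follow the same build-then-scan pattern. A condensed sketch of that pattern is below. It only builds inside the vectorscan source tree with HAVE_SVE2 defined; the function names and the nfa/ headers are taken from the diff above, while the include path for CharReach and the exact failure condition of vermicelli16Build are assumptions on my part.

#include <cassert>
#include <string>

#include "nfa/vermicelli.hpp"        // vermicelli16Exec / nvermicelli16Exec (from this patch)
#include "nfa/vermicellicompile.h"   // vermicelli16Build (from this patch)
#include "util/charreach.h"          // ue2::CharReach (assumed include path)

static void vermicelli16_sketch(void) {
    std::string text(32, 'b');
    text[17] = 'a';
    const u8 *buf = (const u8 *)text.c_str();
    const u8 *end = buf + text.length();

    // Encode a small character class (reportedly up to 16 distinct bytes)
    // into an m128 table; the tests always check the return value.
    ue2::CharReach chars;
    chars.set('a');
    m128 matches;
    bool ok = vermicelli16Build(chars, (u8 *)&matches);
    assert(ok);

    // First byte that is in the class, or buf_end if there is none.
    const u8 *hit = vermicelli16Exec(matches, buf, end);
    assert(hit == buf + 17);

    // Negated form: first byte that is NOT in the class.
    const u8 *miss = nvermicelli16Exec(matches, buf, end);
    assert(miss == buf);
}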
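
The ng_corpus_editor.cpp and ng_corpus_generator.cpp hunks correct how random code points avoid the UTF-16 surrogate range: the size of the surrogate block is UNICODE_SURROGATE_MAX - UNICODE_SURROGATE_MIN + 1, not MAX + MIN + 1, and any raw value at or above the block start must be shifted past the whole block. A self-contained sketch of that arithmetic follows; the numeric constants are the standard Unicode values and are assumed here, since the macros themselves are defined elsewhere in the tree.

#include <cassert>
#include <cstdint>

// Stand-ins for UNICODE_SURROGATE_MIN/MAX and MAX_UNICODE (assumed values).
static const uint32_t SURR_MIN = 0xD800;
static const uint32_t SURR_MAX = 0xDFFF;
static const uint32_t MAX_CP   = 0x10FFFF;

// Mirrors CorpusEditorUtf8::chooseCodePoint after the fix: a uniform index in
// [0, range) is mapped onto a non-surrogate code point.
static uint32_t choose_codepoint(uint32_t raw) {
    if (raw < SURR_MIN) {
        return raw;
    }
    return raw + (SURR_MAX - SURR_MIN + 1); // skip the 0x800-wide surrogate block
}

int main() {
    const uint32_t range = MAX_CP + 1 - (SURR_MAX - SURR_MIN + 1);
    assert(range == 0x10F800);                           // count of non-surrogate scalar values
    assert(choose_codepoint(0) == 0);                    // low indices map directly
    assert(choose_codepoint(SURR_MIN) == SURR_MAX + 1);  // first shifted index lands just past D800..DFFF
    assert(choose_codepoint(range - 1) == MAX_CP);       // top index reaches U+10FFFF
    return 0;
}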