Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions lib/common/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,15 @@
*/
#define BMI2_TARGET_ATTRIBUTE TARGET_ATTRIBUTE("lzcnt,bmi,bmi2")

/* Target attributes for ARM SVE/SVE2 dynamic dispatch. */
#if defined(__aarch64__) || defined(_M_ARM64)
# define SVE_TARGET_ATTRIBUTE TARGET_ATTRIBUTE("+sve")
# define SVE2_TARGET_ATTRIBUTE TARGET_ATTRIBUTE("+sve+sve2")
#else
# define SVE_TARGET_ATTRIBUTE
# define SVE2_TARGET_ATTRIBUTE
#endif

/* prefetch
* can be disabled, by declaring NO_PREFETCH build macro */
#if defined(NO_PREFETCH)
Expand Down Expand Up @@ -218,10 +227,10 @@
# if defined(__ARM_NEON) || defined(_M_ARM64)
# define ZSTD_ARCH_ARM_NEON
# endif
# if defined(__ARM_FEATURE_SVE)
# if defined(__ARM_FEATURE_SVE) && !defined(__APPLE__)
# define ZSTD_ARCH_ARM_SVE
# endif
# if defined(__ARM_FEATURE_SVE2)
# if defined(__ARM_FEATURE_SVE2) && !defined(__APPLE__)
# define ZSTD_ARCH_ARM_SVE2
# endif
# if defined(__riscv) && defined(__riscv_vector)
Expand All @@ -239,7 +248,7 @@
# elif defined(ZSTD_ARCH_ARM_NEON)
# include <arm_neon.h>
# endif
# if defined(ZSTD_ARCH_ARM_SVE) || defined(ZSTD_ARCH_ARM_SVE2)
# if defined(ZSTD_ARCH_ARM_SVE) || defined(ZSTD_ARCH_ARM_SVE2) || DYNAMIC_SVE2
# include <arm_sve.h>
# endif
# if defined(ZSTD_ARCH_RISCV_RVV)
Expand Down
75 changes: 75 additions & 0 deletions lib/common/cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,16 @@
#include <intrin.h>
#endif

/* Include headers needed for ARM feature detection */
#if defined(__aarch64__) || defined(_M_ARM64)
# if defined(__linux__) || defined(__ANDROID__)
# include <sys/auxv.h>
# elif defined(__APPLE__)
# include <sys/types.h>
# include <sys/sysctl.h>
# endif
#endif

typedef struct {
U32 f1c;
U32 f1d;
Expand Down Expand Up @@ -246,4 +256,69 @@ MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) {

#undef X

/* ====================================================
* ARM CPU feature detection
* ==================================================== */
#if defined(__aarch64__) || defined(_M_ARM64)

typedef struct {
unsigned long hwcap;
unsigned long hwcap2;
} ZSTD_arm_cpuinfo_t;

#define ZSTD_ARM_HWCAP_SVE_BIT 22
#define ZSTD_ARM_HWCAP2_SVE2_BIT 1

MEM_STATIC ZSTD_arm_cpuinfo_t ZSTD_arm_cpuinfo(void) {
ZSTD_arm_cpuinfo_t info;
info.hwcap = 0;
info.hwcap2 = 0;

#if defined(__linux__) || defined(__ANDROID__)
/* Use getauxval() to read AT_HWCAP and AT_HWCAP2 */
# ifndef AT_HWCAP
# define AT_HWCAP 16
# endif
# ifndef AT_HWCAP2
# define AT_HWCAP2 26
# endif
info.hwcap = getauxval(AT_HWCAP);
info.hwcap2 = getauxval(AT_HWCAP2);

#elif defined(__APPLE__)
/* Apple Silicon (M1/M2/M3 etc.) does not implement SVE/SVE2.
* Apple uses a custom ARM ISA with different extensions (AMX, etc.).
* Leave hwcap as 0 to indicate no SVE/SVE2 support. */
(void)info;

#elif defined(_WIN32)
/* Windows on ARM - use IsProcessorFeaturePresent() */
# ifndef PF_ARM_SVE_INSTRUCTIONS_AVAILABLE
# define PF_ARM_SVE_INSTRUCTIONS_AVAILABLE 46
# endif
# ifndef PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE
# define PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE 47
# endif
/* Map Windows processor features to Linux-style hwcap bits for consistency */
if (IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)) {
info.hwcap |= (1UL << ZSTD_ARM_HWCAP_SVE_BIT);
}
if (IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)) {
info.hwcap2 |= (1UL << ZSTD_ARM_HWCAP2_SVE2_BIT);
}
#endif

return info;
}

MEM_STATIC int ZSTD_arm_cpuinfo_sve(ZSTD_arm_cpuinfo_t const cpuinfo) {
return ((cpuinfo.hwcap) & (1UL << ZSTD_ARM_HWCAP_SVE_BIT)) != 0;
}

MEM_STATIC int ZSTD_arm_cpuinfo_sve2(ZSTD_arm_cpuinfo_t const cpuinfo) {
return ((cpuinfo.hwcap2) & (1UL << ZSTD_ARM_HWCAP2_SVE2_BIT)) != 0;
}

#endif /* __aarch64__ || _M_ARM64 */

#endif /* ZSTD_COMMON_CPU_H */
36 changes: 36 additions & 0 deletions lib/common/portability_macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,42 @@
# endif
#endif

/* Enable runtime SVE dispatch based on the CPU.
* Enabled for gcc & clang on aarch64 when SVE isn't enabled by default.
* Disabled on Apple platforms as they don't support SVE.
*
* NOTE: Currently not used - no SVE-only (without SVE2) optimizations exist.
* CPUs with SVE but not SVE2 (e.g., Fujitsu A64FX) could benefit from
* SVE-only implementations in the future.
*/
#ifndef DYNAMIC_SVE
# if ((defined(__clang__) && __has_attribute(__target__)) \
|| defined(__GNUC__)) \
&& (defined(__aarch64__) || defined(_M_ARM64)) \
&& !defined(__ARM_FEATURE_SVE) \
&& !defined(__APPLE__)
# define DYNAMIC_SVE 1
# else
# define DYNAMIC_SVE 0
# endif
#endif

/* Enable runtime SVE2 dispatch based on the CPU.
* Enabled for gcc & clang on aarch64 when SVE2 isn't enabled by default.
* Disabled on Apple platforms as they don't support SVE2.
*/
#ifndef DYNAMIC_SVE2
# if ((defined(__clang__) && __has_attribute(__target__)) \
|| defined(__GNUC__)) \
&& (defined(__aarch64__) || defined(_M_ARM64)) \
&& !defined(__ARM_FEATURE_SVE2) \
&& !defined(__APPLE__)
# define DYNAMIC_SVE2 1
# else
# define DYNAMIC_SVE2 0
# endif
#endif

/**
* Only enable assembly for GNU C compatible compilers,
* because other platforms may not support GAS assembly syntax.
Expand Down
20 changes: 20 additions & 0 deletions lib/common/zstd_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -323,4 +323,24 @@ MEM_STATIC int ZSTD_cpuSupportsBmi2(void)
return ZSTD_cpuid_bmi1(cpuid) && ZSTD_cpuid_bmi2(cpuid);
}

#if defined(__aarch64__) || defined(_M_ARM64)
/**
* @returns true iff the CPU supports SVE.
*/
MEM_STATIC int ZSTD_cpuSupportsSve(void)
{
ZSTD_arm_cpuinfo_t cpuinfo = ZSTD_arm_cpuinfo();
return ZSTD_arm_cpuinfo_sve(cpuinfo);
}

/**
* @returns true iff the CPU supports SVE2.
*/
MEM_STATIC int ZSTD_cpuSupportsSve2(void)
{
ZSTD_arm_cpuinfo_t cpuinfo = ZSTD_arm_cpuinfo();
return ZSTD_arm_cpuinfo_sve2(cpuinfo);
}
#endif

#endif /* ZSTD_CCOMMON_H_MODULE */
46 changes: 41 additions & 5 deletions lib/compress/hist.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "../common/mem.h" /* U32, BYTE, etc. */
#include "../common/debug.h" /* assert, DEBUGLOG */
#include "../common/error_private.h" /* ERROR */
#include "../common/zstd_internal.h" /* ZSTD_cpuSupportsSve2 */
#include "hist.h"

#if defined(ZSTD_ARCH_ARM_SVE2)
Expand Down Expand Up @@ -71,9 +72,23 @@ unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,

typedef enum { trustInput, checkMaxSymbolValue } HIST_checkInput_e;

#if defined(ZSTD_ARCH_ARM_SVE2)
/* ========================================================================
* ARM SVE2 Histogram Implementation
* ========================================================================
* Uses SVE2-specific instructions (svhistseg_u8, svaddwb_u16, svaddwt_u16)
* which are not available in base SVE.
*
* NOTE: CPUs with SVE but not SVE2 (e.g., Fujitsu A64FX) could benefit from
* an SVE-only implementation using base SVE instructions. This would require
* rewriting the histogram algorithm without SVE2's specialized histogram and
* widening add instructions.
* ======================================================================== */
#if defined(ZSTD_ARCH_ARM_SVE2) || (DYNAMIC_SVE2 && (defined(__aarch64__) || defined(_M_ARM64)))
FORCE_INLINE_TEMPLATE size_t min_size(size_t a, size_t b) { return a < b ? a : b; }

#if DYNAMIC_SVE2
SVE2_TARGET_ATTRIBUTE
#endif
static
svuint16_t HIST_count_6_sve2(const BYTE* const src, size_t size, U32* const dst,
const svuint8_t c0, const svuint8_t c1,
Expand Down Expand Up @@ -174,6 +189,9 @@ svuint16_t HIST_count_6_sve2(const BYTE* const src, size_t size, U32* const dst,
return svmax_u16_x(vl128, hh0, hh8);
}

#if DYNAMIC_SVE2
SVE2_TARGET_ATTRIBUTE
#endif
static size_t HIST_count_sve2(unsigned* count, unsigned* maxSymbolValuePtr,
const void* source, size_t sourceSize,
HIST_checkInput_e check)
Expand Down Expand Up @@ -397,15 +415,25 @@ size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
{
if (sourceSize < HIST_FAST_THRESHOLD) /* heuristic threshold */
return HIST_count_simple(count, maxSymbolValuePtr, source, sourceSize);

#if defined(ZSTD_ARCH_ARM_SVE2)
/* Static SVE2: always use SVE2 path */
(void)workSpace;
(void)workSpaceSize;
return HIST_count_sve2(count, maxSymbolValuePtr, source, sourceSize, trustInput);
#else
#elif DYNAMIC_SVE2
/* Dynamic SVE2: check at runtime */
if (ZSTD_cpuSupportsSve2()) {
(void)workSpace;
(void)workSpaceSize;
return HIST_count_sve2(count, maxSymbolValuePtr, source, sourceSize, trustInput);
}
#endif

/* Default implementation */
if ((size_t)workSpace & 3) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */
if (workSpaceSize < HIST_WKSP_SIZE) return ERROR(workSpace_tooSmall);
return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, trustInput, (U32*)workSpace);
#endif
}

/* HIST_count_wksp() :
Expand All @@ -416,14 +444,22 @@ size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
void* workSpace, size_t workSpaceSize)
{
#if defined(ZSTD_ARCH_ARM_SVE2)
/* Static SVE2: always use SVE2 path */
if (*maxSymbolValuePtr < 255)
return HIST_count_sve2(count, maxSymbolValuePtr, source, sourceSize, checkMaxSymbolValue);
#else
#elif DYNAMIC_SVE2
/* Dynamic SVE2: check at runtime */
if (*maxSymbolValuePtr < 255 && ZSTD_cpuSupportsSve2()) {
return HIST_count_sve2(count, maxSymbolValuePtr, source, sourceSize, checkMaxSymbolValue);
}
#endif

/* Default implementation */
if ((size_t)workSpace & 3) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */
if (workSpaceSize < HIST_WKSP_SIZE) return ERROR(workSpace_tooSmall);
if (*maxSymbolValuePtr < 255)
return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, checkMaxSymbolValue, (U32*)workSpace);
#endif

*maxSymbolValuePtr = 255;
return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, workSpace, workSpaceSize);
}
Expand Down
6 changes: 6 additions & 0 deletions lib/compress/zstd_compress.c
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,9 @@ static void ZSTD_initCCtx(ZSTD_CCtx* cctx, ZSTD_customMem memManager)
ZSTD_memset(cctx, 0, sizeof(*cctx));
cctx->customMem = memManager;
cctx->bmi2 = ZSTD_cpuSupportsBmi2();
#if DYNAMIC_SVE2
cctx->sve2 = ZSTD_cpuSupportsSve2();
#endif
{ size_t const err = ZSTD_CCtx_reset(cctx, ZSTD_reset_parameters);
assert(!ZSTD_isError(err));
(void)err;
Expand Down Expand Up @@ -153,6 +156,9 @@ ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize)
cctx->tmpWorkspace = ZSTD_cwksp_reserve_object(&cctx->workspace, TMP_WORKSPACE_SIZE);
cctx->tmpWkspSize = TMP_WORKSPACE_SIZE;
cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid());
#if DYNAMIC_SVE2
cctx->sve2 = ZSTD_cpuSupportsSve2();
#endif
return cctx;
}

Expand Down
3 changes: 3 additions & 0 deletions lib/compress/zstd_compress_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,9 @@ struct ZSTD_CCtx_s {
ZSTD_compressionStage_e stage;
int cParamsChanged; /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */
int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
#if DYNAMIC_SVE2
int sve2; /* == 1 if the CPU supports SVE2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
#endif
ZSTD_CCtx_params requestedParams;
ZSTD_CCtx_params appliedParams;
ZSTD_CCtx_params simpleApiParams; /* Param storage used by the simple API - not sticky. Must only be used in top-level simple API functions for storage. */
Expand Down