diff --git a/lib/common/compiler.h b/lib/common/compiler.h
index 5e70570ec7a..5985c0019fb 100644
--- a/lib/common/compiler.h
+++ b/lib/common/compiler.h
@@ -131,6 +131,15 @@
  */
 #define BMI2_TARGET_ATTRIBUTE TARGET_ATTRIBUTE("lzcnt,bmi,bmi2")
 
+/* Target attributes for ARM SVE/SVE2 dynamic dispatch. */
+#if defined(__aarch64__) || defined(_M_ARM64)
+# define SVE_TARGET_ATTRIBUTE TARGET_ATTRIBUTE("+sve")
+# define SVE2_TARGET_ATTRIBUTE TARGET_ATTRIBUTE("+sve+sve2")
+#else
+# define SVE_TARGET_ATTRIBUTE
+# define SVE2_TARGET_ATTRIBUTE
+#endif
+
 /* prefetch
  * can be disabled, by declaring NO_PREFETCH build macro */
 #if defined(NO_PREFETCH)
@@ -218,10 +227,10 @@
 # if defined(__ARM_NEON) || defined(_M_ARM64)
 #  define ZSTD_ARCH_ARM_NEON
 # endif
-# if defined(__ARM_FEATURE_SVE)
+# if defined(__ARM_FEATURE_SVE) && !defined(__APPLE__)
 #  define ZSTD_ARCH_ARM_SVE
 # endif
-# if defined(__ARM_FEATURE_SVE2)
+# if defined(__ARM_FEATURE_SVE2) && !defined(__APPLE__)
 #  define ZSTD_ARCH_ARM_SVE2
 # endif
 # if defined(__riscv) && defined(__riscv_vector)
@@ -239,7 +248,7 @@
 # elif defined(ZSTD_ARCH_ARM_NEON)
 #  include <arm_neon.h>
 # endif
-# if defined(ZSTD_ARCH_ARM_SVE) || defined(ZSTD_ARCH_ARM_SVE2)
+# if defined(ZSTD_ARCH_ARM_SVE) || defined(ZSTD_ARCH_ARM_SVE2) || DYNAMIC_SVE2
 #  include <arm_sve.h>
 # endif
 # if defined(ZSTD_ARCH_RISCV_RVV)
diff --git a/lib/common/cpu.h b/lib/common/cpu.h
index 3f15d560f0c..ab00f123ac5 100644
--- a/lib/common/cpu.h
+++ b/lib/common/cpu.h
@@ -22,6 +22,16 @@
 #include <intrin.h>
 #endif
 
+/* Include headers needed for ARM feature detection */
+#if defined(__aarch64__) || defined(_M_ARM64)
+# if defined(__linux__) || defined(__ANDROID__)
+#  include <sys/auxv.h>
+# elif defined(__APPLE__)
+#  include <sys/types.h>
+#  include <sys/sysctl.h>
+# endif
+#endif
+
 typedef struct {
     U32 f1c;
     U32 f1d;
@@ -246,4 +256,69 @@ MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) {
 
 #undef X
 
+/* ====================================================
+ * ARM CPU feature detection
+ * ==================================================== */
+#if defined(__aarch64__) || defined(_M_ARM64)
+
+typedef struct {
+    unsigned long hwcap;
+    unsigned long hwcap2;
+} ZSTD_arm_cpuinfo_t;
+
+#define ZSTD_ARM_HWCAP_SVE_BIT 22
+#define ZSTD_ARM_HWCAP2_SVE2_BIT 1
+
+MEM_STATIC ZSTD_arm_cpuinfo_t ZSTD_arm_cpuinfo(void) {
+    ZSTD_arm_cpuinfo_t info;
+    info.hwcap = 0;
+    info.hwcap2 = 0;
+
+#if defined(__linux__) || defined(__ANDROID__)
+    /* Use getauxval() to read AT_HWCAP and AT_HWCAP2 */
+# ifndef AT_HWCAP
+#  define AT_HWCAP 16
+# endif
+# ifndef AT_HWCAP2
+#  define AT_HWCAP2 26
+# endif
+    info.hwcap = getauxval(AT_HWCAP);
+    info.hwcap2 = getauxval(AT_HWCAP2);
+
+#elif defined(__APPLE__)
+    /* Apple Silicon (M1/M2/M3 etc.) does not implement SVE/SVE2.
+     * Apple uses a custom ARM ISA with different extensions (AMX, etc.).
+     * Leave hwcap as 0 to indicate no SVE/SVE2 support. */
+    (void)info;
+
+#elif defined(_WIN32)
+    /* Windows on ARM - use IsProcessorFeaturePresent() */
+# ifndef PF_ARM_SVE_INSTRUCTIONS_AVAILABLE
+#  define PF_ARM_SVE_INSTRUCTIONS_AVAILABLE 46
+# endif
+# ifndef PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE
+#  define PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE 47
+# endif
+    /* Map Windows processor features to Linux-style hwcap bits for consistency */
+    if (IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)) {
+        info.hwcap |= (1UL << ZSTD_ARM_HWCAP_SVE_BIT);
+    }
+    if (IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)) {
+        info.hwcap2 |= (1UL << ZSTD_ARM_HWCAP2_SVE2_BIT);
+    }
+#endif
+
+    return info;
+}
+
+MEM_STATIC int ZSTD_arm_cpuinfo_sve(ZSTD_arm_cpuinfo_t const cpuinfo) {
+    return ((cpuinfo.hwcap) & (1UL << ZSTD_ARM_HWCAP_SVE_BIT)) != 0;
+}
+
+MEM_STATIC int ZSTD_arm_cpuinfo_sve2(ZSTD_arm_cpuinfo_t const cpuinfo) {
+    return ((cpuinfo.hwcap2) & (1UL << ZSTD_ARM_HWCAP2_SVE2_BIT)) != 0;
+}
+
+#endif /* __aarch64__ || _M_ARM64 */
+
 #endif /* ZSTD_COMMON_CPU_H */
diff --git a/lib/common/portability_macros.h b/lib/common/portability_macros.h
index bcca634e419..e523ebafc48 100644
--- a/lib/common/portability_macros.h
+++ b/lib/common/portability_macros.h
@@ -102,6 +102,42 @@
 # endif
 #endif
 
+/* Enable runtime SVE dispatch based on the CPU.
+ * Enabled for gcc & clang on aarch64 when SVE isn't enabled by default.
+ * Disabled on Apple platforms as they don't support SVE.
+ *
+ * NOTE: Currently not used - no SVE-only (without SVE2) optimizations exist.
+ * CPUs with SVE but not SVE2 (e.g., Fujitsu A64FX) could benefit from
+ * SVE-only implementations in the future.
+ */
+#ifndef DYNAMIC_SVE
+# if ((defined(__clang__) && __has_attribute(__target__)) \
+     || defined(__GNUC__)) \
+     && (defined(__aarch64__) || defined(_M_ARM64)) \
+     && !defined(__ARM_FEATURE_SVE) \
+     && !defined(__APPLE__)
+#  define DYNAMIC_SVE 1
+# else
+#  define DYNAMIC_SVE 0
+# endif
+#endif
+
+/* Enable runtime SVE2 dispatch based on the CPU.
+ * Enabled for gcc & clang on aarch64 when SVE2 isn't enabled by default.
+ * Disabled on Apple platforms as they don't support SVE2.
+ */
+#ifndef DYNAMIC_SVE2
+# if ((defined(__clang__) && __has_attribute(__target__)) \
+     || defined(__GNUC__)) \
+     && (defined(__aarch64__) || defined(_M_ARM64)) \
+     && !defined(__ARM_FEATURE_SVE2) \
+     && !defined(__APPLE__)
+#  define DYNAMIC_SVE2 1
+# else
+#  define DYNAMIC_SVE2 0
+# endif
+#endif
+
 /**
  * Only enable assembly for GNU C compatible compilers,
  * because other platforms may not support GAS assembly syntax.
diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h
index 86a0fc5c809..47894391513 100644
--- a/lib/common/zstd_internal.h
+++ b/lib/common/zstd_internal.h
@@ -323,4 +323,24 @@ MEM_STATIC int ZSTD_cpuSupportsBmi2(void)
     return ZSTD_cpuid_bmi1(cpuid) && ZSTD_cpuid_bmi2(cpuid);
 }
 
+#if defined(__aarch64__) || defined(_M_ARM64)
+/**
+ * @returns true iff the CPU supports SVE.
+ */
+MEM_STATIC int ZSTD_cpuSupportsSve(void)
+{
+    ZSTD_arm_cpuinfo_t cpuinfo = ZSTD_arm_cpuinfo();
+    return ZSTD_arm_cpuinfo_sve(cpuinfo);
+}
+
+/**
+ * @returns true iff the CPU supports SVE2.
+ */
+MEM_STATIC int ZSTD_cpuSupportsSve2(void)
+{
+    ZSTD_arm_cpuinfo_t cpuinfo = ZSTD_arm_cpuinfo();
+    return ZSTD_arm_cpuinfo_sve2(cpuinfo);
+}
+#endif
+
 #endif /* ZSTD_CCOMMON_H_MODULE */
diff --git a/lib/compress/hist.c b/lib/compress/hist.c
index 3692bc250ca..3df1af29a4b 100644
--- a/lib/compress/hist.c
+++ b/lib/compress/hist.c
@@ -17,6 +17,7 @@
 #include "../common/mem.h"             /* U32, BYTE, etc. */
 #include "../common/debug.h"           /* assert, DEBUGLOG */
 #include "../common/error_private.h"   /* ERROR */
+#include "../common/zstd_internal.h"   /* ZSTD_cpuSupportsSve2 */
 #include "hist.h"
 
 #if defined(ZSTD_ARCH_ARM_SVE2)
@@ -71,9 +72,23 @@ unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,
 
 typedef enum { trustInput, checkMaxSymbolValue } HIST_checkInput_e;
 
-#if defined(ZSTD_ARCH_ARM_SVE2)
+/* ========================================================================
+ * ARM SVE2 Histogram Implementation
+ * ========================================================================
+ * Uses SVE2-specific instructions (svhistseg_u8, svaddwb_u16, svaddwt_u16)
+ * which are not available in base SVE.
+ *
+ * NOTE: CPUs with SVE but not SVE2 (e.g., Fujitsu A64FX) could benefit from
+ * an SVE-only implementation using base SVE instructions. This would require
+ * rewriting the histogram algorithm without SVE2's specialized histogram and
+ * widening add instructions.
+ * ======================================================================== */
+#if defined(ZSTD_ARCH_ARM_SVE2) || (DYNAMIC_SVE2 && (defined(__aarch64__) || defined(_M_ARM64)))
 FORCE_INLINE_TEMPLATE size_t min_size(size_t a, size_t b) { return a < b ? a : b; }
 
 
+#if DYNAMIC_SVE2
+SVE2_TARGET_ATTRIBUTE
+#endif
 static svuint16_t HIST_count_6_sve2(const BYTE* const src, size_t size, U32* const dst,
                                     const svuint8_t c0, const svuint8_t c1,
@@ -174,6 +189,9 @@ svuint16_t HIST_count_6_sve2(const BYTE* const src, size_t size, U32* const dst,
     return svmax_u16_x(vl128, hh0, hh8);
 }
 
+#if DYNAMIC_SVE2
+SVE2_TARGET_ATTRIBUTE
+#endif
 static size_t HIST_count_sve2(unsigned* count, unsigned* maxSymbolValuePtr,
                               const void* source, size_t sourceSize,
                               HIST_checkInput_e check)
@@ -397,15 +415,25 @@ size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
 {
     if (sourceSize < HIST_FAST_THRESHOLD) /* heuristic threshold */
         return HIST_count_simple(count, maxSymbolValuePtr, source, sourceSize);
+
 #if defined(ZSTD_ARCH_ARM_SVE2)
+    /* Static SVE2: always use SVE2 path */
     (void)workSpace;
    (void)workSpaceSize;
     return HIST_count_sve2(count, maxSymbolValuePtr, source, sourceSize, trustInput);
 
-#else
+#elif DYNAMIC_SVE2
+    /* Dynamic SVE2: check at runtime */
+    if (ZSTD_cpuSupportsSve2()) {
+        (void)workSpace;
+        (void)workSpaceSize;
+        return HIST_count_sve2(count, maxSymbolValuePtr, source, sourceSize, trustInput);
+    }
+#endif
+
+    /* Default implementation */
     if ((size_t)workSpace & 3) return ERROR(GENERIC);  /* must be aligned on 4-bytes boundaries */
     if (workSpaceSize < HIST_WKSP_SIZE) return ERROR(workSpace_tooSmall);
     return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, trustInput, (U32*)workSpace);
 
-#endif
 }
@@ -416,14 +444,22 @@ size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
                        void* workSpace, size_t workSpaceSize)
 {
 #if defined(ZSTD_ARCH_ARM_SVE2)
+    /* Static SVE2: always use SVE2 path */
     if (*maxSymbolValuePtr < 255)
         return HIST_count_sve2(count, maxSymbolValuePtr, source, sourceSize, checkMaxSymbolValue);
-#else
+#elif DYNAMIC_SVE2
+    /* Dynamic SVE2: check at runtime */
+    if (*maxSymbolValuePtr < 255 && ZSTD_cpuSupportsSve2()) {
+        return HIST_count_sve2(count, maxSymbolValuePtr, source, sourceSize, checkMaxSymbolValue);
+    }
+#endif
+
+    /* Default implementation */
     if ((size_t)workSpace & 3) return ERROR(GENERIC);  /* must be aligned on 4-bytes boundaries */
     if (workSpaceSize < HIST_WKSP_SIZE) return ERROR(workSpace_tooSmall);
     if (*maxSymbolValuePtr < 255)
         return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, checkMaxSymbolValue, (U32*)workSpace);
-#endif
+
     *maxSymbolValuePtr = 255;
     return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, workSpace, workSpaceSize);
 }
diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c
index 1d6f0fcae0e..3740ca5a98c 100644
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@@ -113,6 +113,9 @@ static void ZSTD_initCCtx(ZSTD_CCtx* cctx, ZSTD_customMem memManager)
     ZSTD_memset(cctx, 0, sizeof(*cctx));
     cctx->customMem = memManager;
     cctx->bmi2 = ZSTD_cpuSupportsBmi2();
+#if DYNAMIC_SVE2
+    cctx->sve2 = ZSTD_cpuSupportsSve2();
+#endif
     {   size_t const err = ZSTD_CCtx_reset(cctx, ZSTD_reset_parameters);
         assert(!ZSTD_isError(err));
         (void)err;
@@ -153,6 +156,9 @@ ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize)
     cctx->tmpWorkspace = ZSTD_cwksp_reserve_object(&cctx->workspace, TMP_WORKSPACE_SIZE);
     cctx->tmpWkspSize = TMP_WORKSPACE_SIZE;
     cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid());
+#if DYNAMIC_SVE2
+    cctx->sve2 = ZSTD_cpuSupportsSve2();
+#endif
     return cctx;
 }
 
diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h
index 13a394b3816..85f43ff985c 100644
--- a/lib/compress/zstd_compress_internal.h
+++ b/lib/compress/zstd_compress_internal.h
@@ -473,6 +473,9 @@ struct ZSTD_CCtx_s {
     ZSTD_compressionStage_e stage;
    int cParamsChanged;                  /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */
    int bmi2;                            /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
+#if DYNAMIC_SVE2
+   int sve2;                            /* == 1 if the CPU supports SVE2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
+#endif
    ZSTD_CCtx_params requestedParams;
    ZSTD_CCtx_params appliedParams;
    ZSTD_CCtx_params simpleApiParams;    /* Param storage used by the simple API - not sticky. Must only be used in top-level simple API functions for storage. */