zilliztech · lyang24 · Dec 31, 2025
diff --git a/.github/workflows/ut.yaml b/.github/workflows/ut.yaml
@@ -48,6 +48,15 @@ jobs:
           && conan install .. --build=missing -s build_type=Release -o with_ut=True -o with_diskann=True -o with_asan=True -s compiler.libcxx=libstdc++11 \
           && conan build .. \
           && ./Release/tests/ut/knowhere_tests
+      - name: Run Sparse SIMD Benchmark
+        run: |
+          cd build
+          # Built with -mavx512f/-mavx512dq: skip on runners without AVX512 to avoid SIGILL.
+          if [ -f ./Release/benchmark/benchmark_sparse_simd ] && grep -q avx512f /proc/cpuinfo && grep -q avx512dq /proc/cpuinfo; then
+            echo "Running sparse SIMD benchmark..." && ./Release/benchmark/benchmark_sparse_simd
+          else
+            echo "Sparse SIMD benchmark not found or CPU lacks AVX512F/DQ, skipping..."
+          fi
       - name: Save Cache
         uses: ./.github/actions/cache-save
         with:

diff --git a/.gitignore b/.gitignore
@@ -16,6 +16,9 @@ docker-compose-devcontainer.yml.tmp
 
 *.code-workspace
 
+# Claude Code local settings
+.claude/settings.local.json
+
 # Docker generated cache file
 .docker/
 

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
@@ -55,3 +55,17 @@ benchmark_test(benchmark_simd_qps              hdf5/benchmark_simd_qps.cpp)
 
 benchmark_test(gen_hdf5_file hdf5/gen_hdf5_file.cpp)
 benchmark_test(gen_fbin_file hdf5/gen_fbin_file.cpp)
+
+# Sparse SIMD benchmark: standalone executable (no HDF5), built only for x86_64/AMD64.
+# Compiled with AVX512F/DQ enabled on GCC/Clang, so it may SIGILL on CPUs lacking AVX512.
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64|amd64|X86_64)$")
+    message(STATUS "Building sparse SIMD benchmark for ${CMAKE_SYSTEM_PROCESSOR}")
+    add_executable(benchmark_sparse_simd benchmark_sparse_simd.cpp)
+    target_link_libraries(benchmark_sparse_simd PRIVATE knowhere)
+    if (CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
+        target_compile_options(benchmark_sparse_simd PRIVATE -mavx512f -mavx512dq)
+    endif()
+    install(TARGETS benchmark_sparse_simd DESTINATION unittest)
+else()
+    message(STATUS "Skipping sparse SIMD benchmark on ${CMAKE_SYSTEM_PROCESSOR} (x86_64 only)")
+endif()
diff --git a/benchmark/Makefile.sparse_simd b/benchmark/Makefile.sparse_simd
@@ -0,0 +1,25 @@
+# Standalone Makefile for the sparse SIMD benchmark (quick testing outside CMake).
+# Usage: make -f Makefile.sparse_simd [all|run|clean]
+
+CXX ?= g++
+# -mavx512f/-mavx512dq enable AVX512 code generation; the resulting binary may
+# fail with "Illegal instruction" on CPUs without AVX512 support.
+CXXFLAGS = -std=c++17 -O3 -Wall -I../include -I.. -mavx512f -mavx512dq
+LDFLAGS = -pthread
+
+# Name of the output binary (created in the directory make is run from).
+BENCHMARK_BIN = benchmark_sparse_simd_standalone
+
+all: $(BENCHMARK_BIN)
+
+$(BENCHMARK_BIN): benchmark_sparse_simd.cpp
+	$(CXX) $(CXXFLAGS) -o $@ $< $(LDFLAGS)
+
+run: $(BENCHMARK_BIN)
+	@echo "Running sparse SIMD benchmark..."
+	@./$(BENCHMARK_BIN)
+
+clean:
+	rm -f $(BENCHMARK_BIN)
+
+.PHONY: all run clean
diff --git a/benchmark/README_sparse_simd.md b/benchmark/README_sparse_simd.md
@@ -0,0 +1,147 @@
+# Sparse Inverted Index SIMD Benchmark
+
+Comprehensive benchmark for the AVX512-optimized sparse inverted index implementation.
+
+## Features
+
+- **Multiple dataset sizes**: Small (10K docs), Medium (100K docs), Large (1M docs)
+- **Both metrics**: IP (Inner Product) and BM25
+- **Realistic data**: Power-law posting list distributions
+- **Correctness verification**: Validates AVX512 results against scalar baseline
+- **Performance metrics**: Reports speedup, absolute timings, and throughput
+- **CI-friendly output**: Clean, parseable output format
+
+## Building
+
+### Option 1: CMake (integrated with main build)
+
+```bash
+cd knowhere
+mkdir -p build && cd build
+cmake ..
+make benchmark_sparse_simd
+```
+
+The binary will be at: `build/benchmark/benchmark_sparse_simd`
+
+### Option 2: Standalone Makefile (quick testing)
+
+```bash
+cd knowhere/benchmark
+make -f Makefile.sparse_simd
+./benchmark_sparse_simd_standalone
+```
+
+**Note**: Building requires a compiler that accepts `-mavx512f -mavx512dq`; running the resulting binary requires a CPU that supports AVX512F and AVX512DQ.
+
+## Running
+
+### Run all benchmarks
+```bash
+./benchmark_sparse_simd
+```
+
+### Expected Output
+
+```
+╔══════════════════════════════════════════════════════════════════╗
+║  Sparse Inverted Index SIMD Benchmark                           ║
+╚══════════════════════════════════════════════════════════════════╝
+
+=== Small dataset (IP metric) ===
+Dataset: 10000 docs, 1000 vocab, query length: 10
+Avg posting list length: 50.0
+CPU Capabilities: AVX512F=1, AVX2=1
+
+[Scalar Fallback]
+  Time: 0.123 ms
+  Non-zero scores: 450 / 10000
+
+[AVX512 SIMD]
+  Time: 0.045 ms
+  Non-zero scores: 450 / 10000
+
+[Verification]
+  Max difference: 0.000001
+  Avg difference: 0.000000 (over 0 elements)
+  Correctness: PASS
+
+[Performance]
+  Speedup: 2.73x
+  Scalar:  0.123 ms (baseline)
+  AVX512:  0.045 ms (36.6% of baseline)
+==========================================
+```
+
+## Benchmark Details
+
+### Dataset Characteristics
+
+- **Posting lists**: Realistic power-law distribution (common terms have longer lists)
+- **Query terms**: Random selection with variable weights
+- **Document IDs**: Random distribution (tests random memory access performance)
+- **Doc lengths**: Normal distribution around average (for BM25)
+
+### What is Measured
+
+1. **Scalar Baseline**: Simple double-loop implementation matching original code
+2. **AVX512 SIMD**: Optimized implementation with:
+   - 16-wide vectorization
+   - 2x loop unrolling (32 elements/iteration)
+   - Hardware gather/scatter operations
+
+### Verification
+
+The benchmark validates correctness by:
+- Comparing AVX512 results against scalar baseline
+- Checking max absolute difference (should be < 0.001)
+- Counting non-zero scores (should match exactly)
+
+### Performance Metrics
+
+- **Time**: Average execution time over 50 runs (after 5 warmup runs)
+- **Speedup**: Ratio of scalar time to AVX512 time
+- **Throughput**: Queries per second (for multi-query benchmarks)
+
+## Expected Performance
+
+On AVX512-capable CPUs (Intel Skylake-X or newer), expect:
+
+- **IP metric**: 2-4x speedup
+- **BM25 metric**: 1.5-2.5x speedup (limited by scalar BM25 computation)
+- **Large posting lists**: Better speedup (amortizes gather latency)
+- **Short posting lists**: Lower speedup (tail loop overhead)
+
+## CI Integration
+
+The benchmark is designed for CI runs:
+
+1. **Exit code**: Returns 0 on success, 1 on verification failure
+2. **Output format**: Easy to parse for regression detection
+3. **Quick runtime**: ~1-2 seconds for all configurations
+4. **No external data**: Generates synthetic datasets on-the-fly
+
+## Troubleshooting
+
+### "Illegal instruction" error
+
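+The benchmark is compiled with AVX512 instructions, so this error usually means your CPU doesn't support AVX512. Check with the command below (no output means AVX512 is unavailable):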
+```bash
+grep avx512 /proc/cpuinfo
+```
+
+### Build fails with "unrecognized command line option '-mavx512f'"
+
+Your compiler is too old. Requires GCC 4.9+ or Clang 3.9+.
+
+### Verification fails
+
+This indicates a bug in the SIMD implementation. Please report with:
+- CPU model (`cat /proc/cpuinfo | grep "model name"`)
+- Compiler version (`g++ --version` or `clang++ --version`)
+- Full benchmark output
+
+## Implementation Details
+
+See `src/simd/sparse_simd.h` for the AVX512 implementation and
+`src/index/sparse/sparse_inverted_index.h` for the runtime dispatcher.