Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
187 commits
Select commit Hold shift + click to select a range
a4a2c7a
tier-first search order in get() for better cache locality
joshuaisaact Mar 23, 2026
25c5de2
Revert "tier-first search order in get() for better cache locality"
joshuaisaact Mar 23, 2026
330ddb6
cross-tier prefetching in get() to hide inter-tier latency
joshuaisaact Mar 23, 2026
d5d671a
Revert "cross-tier prefetching in get() to hide inter-tier latency"
joshuaisaact Mar 23, 2026
f9fc247
remove all prefetching from get() to test if it helps or hurts
joshuaisaact Mar 23, 2026
18b4ce8
add tier-0 probe-0 fast path in get()
joshuaisaact Mar 23, 2026
446e450
Revert "add tier-0 probe-0 fast path in get()"
joshuaisaact Mar 23, 2026
b48b0a3
reduce MAX_PROBES from 32 to 24
joshuaisaact Mar 23, 2026
6bbdd09
reduce MAX_PROBES from 24 to 20
joshuaisaact Mar 23, 2026
6ca9ddc
cap get() tier search to 8 tiers
joshuaisaact Mar 23, 2026
d42e9ee
reduce MAX_LOOKUP_TIERS from 8 to 6
joshuaisaact Mar 23, 2026
33c9feb
Revert "reduce MAX_LOOKUP_TIERS from 8 to 6"
joshuaisaact Mar 23, 2026
9530f6c
use fixed-size arrays for tier metadata instead of heap slices
joshuaisaact Mar 23, 2026
1c52805
Revert "use fixed-size arrays for tier metadata instead of heap slices"
joshuaisaact Mar 23, 2026
907f4f8
switch hash to Stafford variant 13 (splitmix64 finalizer)
joshuaisaact Mar 23, 2026
a8629ca
Revert "switch hash to Stafford variant 13 (splitmix64 finalizer)"
joshuaisaact Mar 23, 2026
bbf67f0
switch to linear probing for better cache locality
joshuaisaact Mar 23, 2026
6e5b3dc
add early exit on empty bucket slots in get()
joshuaisaact Mar 23, 2026
e580524
Revert "add early exit on empty bucket slots in get()"
joshuaisaact Mar 23, 2026
5f62379
combined fp+empty SIMD check with per-tier early exit
joshuaisaact Mar 23, 2026
ac8b602
Revert "combined fp+empty SIMD check with per-tier early exit"
joshuaisaact Mar 23, 2026
6f57a0a
double tier 0 size to concentrate elements for faster lookup
joshuaisaact Mar 23, 2026
f7328f9
quadruple tier 0 size (2x capacity / BUCKET_SIZE)
joshuaisaact Mar 23, 2026
be1c046
Revert "quadruple tier 0 size (2x capacity / BUCKET_SIZE)"
joshuaisaact Mar 23, 2026
710938e
reduce MAX_LOOKUP_TIERS from 8 to 4 with larger tier 0
joshuaisaact Mar 23, 2026
3d427e3
reduce MAX_LOOKUP_TIERS from 4 to 3
joshuaisaact Mar 23, 2026
6188a77
reduce MAX_LOOKUP_TIERS from 3 to 2
joshuaisaact Mar 23, 2026
c9ea0fe
reduce MAX_LOOKUP_TIERS to 1 (tier 0 only)
joshuaisaact Mar 23, 2026
8c9b749
simplify get() to single tier 0 loop, no tier iteration
joshuaisaact Mar 23, 2026
eaf5bbe
Revert "simplify get() to single tier 0 loop, no tier iteration"
joshuaisaact Mar 23, 2026
a4c6c90
use inline for to unroll probe loop in get()
joshuaisaact Mar 23, 2026
db48705
Revert "use inline for to unroll probe loop in get()"
joshuaisaact Mar 23, 2026
4841fba
reduce MAX_PROBES from 20 to 10
joshuaisaact Mar 23, 2026
4260bd4
reduce MAX_PROBES from 10 to 8
joshuaisaact Mar 23, 2026
004b0b7
reduce MAX_PROBES from 8 to 6
joshuaisaact Mar 23, 2026
03b078a
Revert "reduce MAX_PROBES from 8 to 6"
joshuaisaact Mar 23, 2026
2f6f0b8
increase BUCKET_SIZE to 32 for AVX2, generic mask types
joshuaisaact Mar 23, 2026
94cd8b0
Revert "increase BUCKET_SIZE to 32 for AVX2, generic mask types"
joshuaisaact Mar 23, 2026
784e8ca
try quadratic probing instead of linear
joshuaisaact Mar 23, 2026
eb3837d
Revert "try quadratic probing instead of linear"
joshuaisaact Mar 23, 2026
6f15cd3
inline for unroll 8-probe loop and eliminate tier loop
joshuaisaact Mar 23, 2026
ab64ffb
Revert "inline for unroll 8-probe loop and eliminate tier loop"
joshuaisaact Mar 23, 2026
dd36b35
simplify get() to tier-0-only with comptime MAX_PROBES loop bound
joshuaisaact Mar 23, 2026
e070ea9
remove insert prefetch
joshuaisaact Mar 23, 2026
9ab9cc0
delay batch transition to 90% fill (was 75%) to keep more in tier 0
joshuaisaact Mar 23, 2026
ace8f81
limit insertIntoTier probe depth to MAX_PROBES for lookup alignment
joshuaisaact Mar 23, 2026
4a423da
try 4x tier 0 capacity with probe-limited insert
joshuaisaact Mar 23, 2026
dd35c71
Revert "try 4x tier 0 capacity with probe-limited insert"
joshuaisaact Mar 23, 2026
bd6cd0b
tighten batch threshold to 0.08 (92% fill)
joshuaisaact Mar 23, 2026
c6a1a44
optimize remove() to tier-0-only with comptime loop bound
joshuaisaact Mar 23, 2026
5b88308
try faster 64-bit multiply hash
joshuaisaact Mar 23, 2026
56af0b9
try single-multiply hash
joshuaisaact Mar 23, 2026
97c8fa3
noinline findKeyInBucket to keep hot path tight
joshuaisaact Mar 23, 2026
49e1c9c
Revert "noinline findKeyInBucket to keep hot path tight"
joshuaisaact Mar 23, 2026
532520f
fast path for single FP match in findKeyInBucket
joshuaisaact Mar 23, 2026
2ae4de7
cache tier0 metadata in struct fields for faster get/remove
joshuaisaact Mar 23, 2026
022c478
hardcode tier0_start=0, eliminate addition in get/remove
joshuaisaact Mar 23, 2026
6f82228
remove unused tier0_start field
joshuaisaact Mar 23, 2026
7ab6486
try MAX_PROBES=6 again with all other optimizations
joshuaisaact Mar 23, 2026
2ce441c
Revert "try MAX_PROBES=6 again with all other optimizations"
joshuaisaact Mar 23, 2026
c9e700c
cache bucket mask directly, eliminate subtraction per probe
joshuaisaact Mar 23, 2026
740bc74
prefetch keys for probe 0 to hide key read latency
joshuaisaact Mar 23, 2026
1e118e7
Revert "prefetch keys for probe 0 to hide key read latency"
joshuaisaact Mar 23, 2026
bfd53c3
branchless fingerprint using 7 bits + 1 (range 1-128)
joshuaisaact Mar 23, 2026
2200af5
Revert "branchless fingerprint using 7 bits + 1 (range 1-128)"
joshuaisaact Mar 23, 2026
23f2a59
reorder struct fields: hot path first for cache line alignment
joshuaisaact Mar 23, 2026
37c7627
try inline for with precomputed probe array
joshuaisaact Mar 23, 2026
f6b581a
Revert "try inline for with precomputed probe array"
joshuaisaact Mar 23, 2026
90f051a
use bits 32-39 for fingerprint, less correlation with bucket index
joshuaisaact Mar 23, 2026
45de2ab
try bits 24-31 for fingerprint
joshuaisaact Mar 23, 2026
b3fdcef
ultra-fast path: check slot 0 with validity check
joshuaisaact Mar 23, 2026
e4a3802
separate probe 0 check to avoid redundant work in main loop
joshuaisaact Mar 23, 2026
bb13814
unroll probes 0 and 1 before the loop
joshuaisaact Mar 23, 2026
4718a34
Revert "unroll probes 0 and 1 before the loop"
joshuaisaact Mar 23, 2026
53ffd0a
prefetch values for probe 0 while SIMD runs
joshuaisaact Mar 23, 2026
ebb9f26
Revert "prefetch values for probe 0 while SIMD runs"
joshuaisaact Mar 23, 2026
e22f93b
branchless fingerprint using max/min clamp
joshuaisaact Mar 23, 2026
97389fa
Revert "branchless fingerprint using max/min clamp"
joshuaisaact Mar 23, 2026
6600b40
batch threshold 0.06 (94% fill)
joshuaisaact Mar 23, 2026
f0f51aa
Revert "batch threshold 0.06 (94% fill)"
joshuaisaact Mar 23, 2026
28eded4
batch threshold 0.12 (88% fill)
joshuaisaact Mar 23, 2026
4e2f019
batch threshold 0.15 (85% fill)
joshuaisaact Mar 23, 2026
f380498
Revert "batch threshold 0.15 (85% fill)"
joshuaisaact Mar 23, 2026
7aa8cd6
add experiment results, benchmark harness, and autoresearch artifacts
joshuaisaact Mar 23, 2026
1393871
add rigorous abseil comparison, correct stale README claims
joshuaisaact Mar 23, 2026
c886b94
add program-v2: optimization target is now abseil flat_hash_map
joshuaisaact Mar 23, 2026
34e40e3
add AGENTS.md with non-discoverable repo guidance
joshuaisaact Mar 23, 2026
7807ffe
interleave keys and values into entries array for cache locality
joshuaisaact Mar 23, 2026
236afac
reduce MAX_PROBES 8 to 7
joshuaisaact Mar 23, 2026
0675665
reduce MAX_PROBES 7 to 6
joshuaisaact Mar 23, 2026
984921d
Revert "reduce MAX_PROBES 7 to 6"
joshuaisaact Mar 23, 2026
7b36a0a
two-round multiply hash for better fingerprint distribution
joshuaisaact Mar 23, 2026
83d89ba
Revert "two-round multiply hash for better fingerprint distribution"
joshuaisaact Mar 23, 2026
fb76b19
use bits 56-63 for fingerprint (highest byte, most independent)
joshuaisaact Mar 23, 2026
fbfbbf2
Revert "use bits 56-63 for fingerprint (highest byte, most independent)"
joshuaisaact Mar 23, 2026
fce62d7
batch threshold 0.08 (92% fill before tier 1)
joshuaisaact Mar 23, 2026
4cb90c7
Revert "batch threshold 0.08 (92% fill before tier 1)"
joshuaisaact Mar 23, 2026
93e4a08
early termination in get() on empty fingerprint slots
joshuaisaact Mar 23, 2026
f21570b
Revert "early termination in get() on empty fingerprint slots"
joshuaisaact Mar 23, 2026
2cf2a29
return value directly from findValueInBucket to avoid re-indexing
joshuaisaact Mar 23, 2026
749a1e9
mark get() as inline
joshuaisaact Mar 23, 2026
6fa6f37
Revert "mark get() as inline"
joshuaisaact Mar 23, 2026
68cb2f5
golden ratio hash constant 0x9E3779B97F4A7C15
joshuaisaact Mar 23, 2026
cedfe75
Revert "golden ratio hash constant 0x9E3779B97F4A7C15"
joshuaisaact Mar 23, 2026
46f1ae9
minimal early termination via matchEmpty after findValueInBucket miss
joshuaisaact Mar 23, 2026
6768f0c
Revert "minimal early termination via matchEmpty after findValueInBuc…
joshuaisaact Mar 23, 2026
c5b0caa
batch threshold 0.14 (86% fill before tier 1)
joshuaisaact Mar 23, 2026
50b619b
Revert "batch threshold 0.14 (86% fill before tier 1)"
joshuaisaact Mar 23, 2026
f97dad2
always try tier 0 first in insert to maximize get() hit rate
joshuaisaact Mar 23, 2026
5566394
Revert "always try tier 0 first in insert to maximize get() hit rate"
joshuaisaact Mar 23, 2026
8d00aa7
prefetch entries for probe 0 before fingerprint check
joshuaisaact Mar 23, 2026
ad3bf7c
also prefetch entries for probe 1 while checking probe 0
joshuaisaact Mar 23, 2026
5b39875
Revert "also prefetch entries for probe 1 while checking probe 0"
joshuaisaact Mar 23, 2026
4cc0deb
prefetch with locality=1 (low temporal) for entries
joshuaisaact Mar 23, 2026
97803b7
Revert "prefetch with locality=1 (low temporal) for entries"
joshuaisaact Mar 23, 2026
a355018
remove xor-shift from hash (just multiply)
joshuaisaact Mar 23, 2026
758f93f
use upper hash bits for bucket index (better distribution without xor…
joshuaisaact Mar 23, 2026
dd25176
merge probe 0 back into loop (prefetch before loop)
joshuaisaact Mar 23, 2026
713b666
reduce MAX_PROBES 7 to 6 (upper-bit hash has better distribution)
joshuaisaact Mar 23, 2026
e5a018e
Revert "reduce MAX_PROBES 7 to 6 (upper-bit hash has better distribut…
joshuaisaact Mar 23, 2026
56102de
branchless fingerprint clamping with @max/@min
joshuaisaact Mar 23, 2026
2722e1e
Revert "branchless fingerprint clamping with @max/@min"
joshuaisaact Mar 23, 2026
eba4f60
prefetch both fingerprints and entries for probe 0
joshuaisaact Mar 23, 2026
e431166
Revert "prefetch both fingerprints and entries for probe 0"
joshuaisaact Mar 23, 2026
248089c
use for range instead of while in get() probe loop
joshuaisaact Mar 23, 2026
d1db70b
inline for in get() probe loop (comptime unroll)
joshuaisaact Mar 23, 2026
927dc4a
Revert "inline for in get() probe loop (comptime unroll)"
joshuaisaact Mar 23, 2026
c51c9e9
batch threshold 0.10 (90% fill)
joshuaisaact Mar 23, 2026
c102884
Revert "batch threshold 0.10 (90% fill)"
joshuaisaact Mar 23, 2026
269d9c4
stride-3 probing to reduce secondary clustering
joshuaisaact Mar 23, 2026
d8c6338
Revert "stride-3 probing to reduce secondary clustering"
joshuaisaact Mar 23, 2026
56755e5
prefetch two cache lines of entries for probe 0
joshuaisaact Mar 23, 2026
ca489d3
Revert "prefetch two cache lines of entries for probe 0"
joshuaisaact Mar 23, 2026
df2c24a
try Murmur3 mixing constant 0xbf58476d1ce4e5b9
joshuaisaact Mar 23, 2026
21ed7db
Revert "try Murmur3 mixing constant 0xbf58476d1ce4e5b9"
joshuaisaact Mar 23, 2026
fa8f1ec
rigorous benchmark: random keys, fair capacity, median of 10, miss test
joshuaisaact Mar 23, 2026
573d1cf
abseil-style control encoding: empty=0x80, 7-bit fps, cheap early ter…
joshuaisaact Mar 23, 2026
76f7903
Revert "abseil-style control encoding: empty=0x80, 7-bit fps, cheap e…
joshuaisaact Mar 23, 2026
4c2dd16
early termination via matchEmpty (retry with random keys benchmark)
joshuaisaact Mar 23, 2026
456a5f0
Revert "early termination via matchEmpty (retry with random keys benc…
joshuaisaact Mar 23, 2026
4fe8fdb
move prefetch inside probe loop (abseil pattern: one per iteration)
joshuaisaact Mar 23, 2026
7a37536
Revert "move prefetch inside probe loop (abseil pattern: one per iter…
joshuaisaact Mar 23, 2026
5f0ba61
restore xor-shift in hash (may help random key distribution)
joshuaisaact Mar 23, 2026
3b49a27
xor-shift >> 28 for more upper-bit mixing
joshuaisaact Mar 23, 2026
86c8d0d
Revert "xor-shift >> 28 for more upper-bit mixing"
joshuaisaact Mar 23, 2026
7d2318e
add v2 experiment log and insights
joshuaisaact Mar 23, 2026
5e8eaeb
gitignore benchmark binaries and logs
joshuaisaact Mar 23, 2026
edf731b
independent fingerprint hash (second multiply, parallel on superscalar)
joshuaisaact Mar 23, 2026
7e51903
Revert "independent fingerprint hash (second multiply, parallel on su…
joshuaisaact Mar 23, 2026
a950f10
test 7-bit fingerprint from top bits (abseil H2 style)
joshuaisaact Mar 23, 2026
112a5c2
Revert "test 7-bit fingerprint from top bits (abseil H2 style)"
joshuaisaact Mar 23, 2026
b426a0c
update insights with honest benchmark findings
joshuaisaact Mar 23, 2026
b08912e
use findEmptyInBucket in insert (skip tombstone check during bulk ins…
joshuaisaact Mar 23, 2026
ac3797f
Revert "use findEmptyInBucket in insert (skip tombstone check during …
joshuaisaact Mar 23, 2026
35aff5d
paper-faithful multi-tier get(): search all tiers, not just tier 0
joshuaisaact Mar 23, 2026
178c3ad
limit multi-tier search to tier 0 + tier 1 only
joshuaisaact Mar 23, 2026
c5ae13d
tier 1: probe 0 only (minimal code, finds ~95% of tier-1 elements)
joshuaisaact Mar 23, 2026
1129bce
Revert "tier 1: probe 0 only (minimal code, finds ~95% of tier-1 elem…
joshuaisaact Mar 23, 2026
9188eba
Reapply "tier 1: probe 0 only (minimal code, finds ~95% of tier-1 ele…
joshuaisaact Mar 23, 2026
818f9bf
Revert "Reapply "tier 1: probe 0 only (minimal code, finds ~95% of ti…
joshuaisaact Mar 23, 2026
0df48a8
restore tier-0-only get() (multi-tier causes compiler cascading regre…
joshuaisaact Mar 23, 2026
d511474
update README with honest benchmark results and paper divergence notes
joshuaisaact Mar 23, 2026
aaa9168
add program-v3: paper-faithful multi-tier lookup research
joshuaisaact Mar 24, 2026
83f9649
noinline getSlowPath for tier 1+ search (keep get() I-cache footprint…
joshuaisaact Mar 24, 2026
b725080
limit getSlowPath to tier 1 only (tiers 2+ are empty at 99% load)
joshuaisaact Mar 24, 2026
65aee5b
Revert "limit getSlowPath to tier 1 only (tiers 2+ are empty at 99% l…
joshuaisaact Mar 24, 2026
48345e4
Reapply "limit getSlowPath to tier 1 only (tiers 2+ are empty at 99% …
joshuaisaact Mar 24, 2026
0c05a9b
restore tier-0-only get() as clean base for insert-side experiments
joshuaisaact Mar 24, 2026
2696554
MAX_PROBES 7 -> 8 (more elements fit in tier 0)
joshuaisaact Mar 24, 2026
93a99b8
batch threshold 0.05 (95% fill) with MAX_PROBES=8
joshuaisaact Mar 24, 2026
e5abd48
Revert "batch threshold 0.05 (95% fill) with MAX_PROBES=8"
joshuaisaact Mar 24, 2026
3a03edf
Reapply "batch threshold 0.05 (95% fill) with MAX_PROBES=8"
joshuaisaact Mar 24, 2026
c6f6467
restore clean v3 baseline (MAX_PROBES=7, threshold=0.12)
joshuaisaact Mar 24, 2026
7736f8b
paper-faithful get(): noinline cold-hinted getOtherTiers for tier 1+
joshuaisaact Mar 24, 2026
b2448b0
use opaque function pointer for tier 1+ overflow (prevent LLVM codege…
joshuaisaact Mar 24, 2026
906ae65
limit overflow to tier 1 only (tiers 2+ empty at 99% load)
joshuaisaact Mar 24, 2026
776cd39
log v3 experiments
joshuaisaact Mar 24, 2026
6f7d750
early termination in overflow function (safe with 100% find rate)
joshuaisaact Mar 24, 2026
32dd07a
early termination in tier-0 loop (jump to overflow on empty slot)
joshuaisaact Mar 24, 2026
9837b5d
Revert "early termination in tier-0 loop (jump to overflow on empty s…
joshuaisaact Mar 24, 2026
1c47413
log v3 early termination experiments
joshuaisaact Mar 24, 2026
94c1d05
v3 insights: elastic hash 40-65% faster than abseil at normal loads
joshuaisaact Mar 24, 2026
ad1bfed
add verification program: are these results real?
joshuaisaact Mar 24, 2026
ab25178
verification checks 1-3: capacity, shuffled access, size independence
joshuaisaact Mar 24, 2026
71f0fb9
complete verification: realistic workloads, hash cost, shuffled access
joshuaisaact Mar 24, 2026
7e7a711
update README and PR with verified benchmark results
joshuaisaact Mar 24, 2026
65683f7
verify: gcc vs clang abseil - no unfair compiler advantage
joshuaisaact Mar 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
.zig-cache
zig-out
.claude
NOTES.md
bench.log
bench-abseil
abseil-v2.log
elastic-v2.log
17 changes: 17 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# AGENTS.md

## Frozen files

`src/simple.zig` and `src/bench.zig` are reference implementations. Do not modify them.

## Autoresearch programs

`program.md` and `program-v2.md` are autonomous agent programs (not documentation). When asked to "start" or "run" one, read it fully and execute its loop. Each defines its own set of frozen files, editable files, and keep/revert criteria -- read before editing anything.

## Zig skills

The skills `zig-perf`, `zig-quality`, `zig-safety`, `zig-style`, and `zig-testing` are available globally.

## Abseil comparison benchmarks

The abseil benchmark (`bench-abseil.cpp`, created by program-v2) requires system-installed `abseil-cpp` with pkg-config modules: `absl_hash`, `absl_raw_hash_set`, `absl_hashtablez_sampler`.
187 changes: 69 additions & 118 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,163 +1,114 @@
# elastic-hash-zig

> **Disclaimer:** I'm still learning Zig and there may be memory crimes.

Elastic hashing implementation in Zig. Based on [Elastic Hashing](https://arxiv.org/pdf/2501.02305).
SIMD hash table in Zig, inspired by [Optimal Bounds for Open Addressing Without Reordering](https://arxiv.org/abs/2501.02305) (Farach-Colton, Krapivin, Kuszmaul 2025). Uses the paper's tiered batch insertion and multi-tier lookup via opaque overflow.

Requires Zig 0.14+ (tested on 0.16.0-dev).

See my blog post for a walkthrough: [www.joshtuddenham.dev/blog/hashmaps](https://www.joshtuddenham.dev/blog/hashmaps)

## Results

### True 99% Load Factor (of actual capacity)

The hybrid implementation compared to Zig's `std.HashMap` at **true 99% of actual capacity**. std.HashMap is based on Google's [SwissTable](https://abseil.io/about/design/swisstables).

| Capacity | Insert | Lookup |
|----------|--------|--------|
| 16k | **4.34x** | **1.15x** |
| 65k | **7.92x** | **1.68x** |
| 262k | **4.77x** | 0.75x |
| 524k | **4.33x** | 0.78x |
| 1M | **4.40x** | 0.77x |
| 2M | **4.44x** | 0.72x |

**Insert is 4-8x faster** across all sizes at true 99% load.

**Lookup** wins at smaller sizes (16k-65k), loses ~25% at larger sizes due to φ-ordering cache effects.

### Delete Performance

Delete at 99% load factor (deleting 50% of elements):

| Capacity | Delete |
|----------|--------|
| 16k | **1.72x** |
| 65k | **2.63x** |
| 262k | **1.29x** |
| 1M | **1.14x** |

**Delete is faster** across all sizes at high load, with bigger wins at smaller sizes.
## vs Google's abseil `flat_hash_map`

### Comptime vs Runtime
Benchmarked against `absl::flat_hash_map` (the original SwissTable) with u64 keys. Both sides use `reserve(n)` / `init(n)` for the same target capacity. Random keys via splitmix64, median of 10 runs, 2 warmup discards. Full methodology and verification in `verify-results.md`.

When capacity is known at compile time, the comptime version significantly outperforms runtime:
### Hit lookup (shuffled random access, n=1,048,576)

| n | Insert | Lookup |
|---|--------|--------|
| 10k | **2.06x** | **4.63x** |
| 100k | **2.73x** | **6.71x** |
| 1M | **2.21x** | **2.36x** |
| Load | Gap (abseil/elastic) | Winner |
|------|---------------------|--------|
| 10% | **1.16** | Elastic 16% faster |
| 25% | **1.21** | Elastic 21% faster |
| 50% | **1.18** | Elastic 18% faster |
| 75% | **1.08** | Elastic 8% faster |
| 90% | 0.96 | Roughly tied |
| 99% | 0.86 | Abseil 14% faster |

## Key Findings
### Realistic workloads

### What Works
| Workload | 100K | 500K | 1M |
|----------|------|------|-----|
| Mixed r/w (80% hit, 10% miss, 5% ins, 5% del) | 0.77 | **1.49** | 0.98 |
| Hot-key / zipf-like lookup | 0.72 | **1.07** | **1.29** |
| Build-then-read (insert N, 10N random reads) | 0.77 | 0.98 | 0.81 |

1. **Insert-heavy workloads at high load**: 4-8x faster than std.HashMap at 99% load
2. **Delete operations**: 1.1-2.6x faster than std.HashMap at high load
3. **Known-capacity scenarios**: Comptime version is 2-7x faster
4. **Small-to-medium datasets**: Both insert and lookup win up to ~65k elements
5. **Worst-case guarantees**: O(log²(1/ε)) expected probes from the paper
### Delete performance

### What Doesn't Work
2-3x faster than abseil at all sizes and loads. O(1) tombstone marking vs abseil's find-then-erase.

1. **Lookup at large sizes**: φ-ordering causes cache misses when jumping between tiers
2. **General-purpose replacement**: std.HashMap wins for typical mixed workloads
3. **Memory locality**: Tiered structure hurts cache performance vs flat Swiss table
### Where elastic hash wins

### Why std.HashMap Still Wins on Lookup
**Hit lookups at 500K-2M elements, 10-75% load.** The tiered architecture keeps hot fingerprint metadata (1MB for tier 0) in L2 cache, while abseil's flat control byte array (2MB after reserve) spills to L3. This gives a ~15-20% advantage on random-access hit lookups in the sweet spot.

std.HashMap uses SIMD too (Swiss table design), plus:
- Flat memory layout (better cache locality)
- No tier jumping during probes
- Optimized for typical 80% load factor
**Mixed read/write workloads at 500K.** Up to 50% faster when the access pattern includes inserts and deletes alongside lookups.

The elastic hash pays a cache penalty for the φ-ordering that provides worst-case guarantees.
**Delete at all sizes.** 2-3x faster consistently.

### vs Google's Original SwissTable (abseil)
### Where abseil wins

Benchmarking against Google's `absl::flat_hash_map` (the original SwissTable) reveals both Zig implementations are significantly slower:
**Miss lookups: 2-3x faster.** Abseil's early termination on empty control byte groups stops miss probing after 1-2 groups. Our tiered structure scans 7 probes in tier 0 + 7 in tier 1 before concluding a miss.

| Operation | Google SwissTable | Zig std.HashMap | Elastic Hash |
|-----------|-------------------|-----------------|--------------|
| Insert 1M @ 99% | 57ms | 779ms | 217ms |
| Lookup 1M @ 99% | 43ms | 533ms | 1008ms |
**Small tables (<100K).** Everything fits in L1, our tier overhead costs more than it saves.

Google's implementation is **10-20x faster** than both Zig hashmaps. This is due to:
- Years of optimization by Google engineers
- Hand-tuned SIMD intrinsics for each platform
- Cache prefetching and memory layout optimizations
- 8-byte groups on ARM (vs 16-byte here)
**Large tables (>4M).** Neither side's metadata fits in L2; abseil's flat layout has slightly less overhead.

**The takeaway**: Within Zig, elastic hash wins on insert/delete. But abseil is in a different performance league entirely.
**High load (99%).** Tier 0 is nearly full, probe depths increase, and the metadata density advantage disappears.

### Why We Win on Insert
### Caveats

The batch insertion algorithm from the paper distributes elements efficiently:
- Fills tier 0 to 75%, then starts using tier 1
- Uses probe limits based on empty fraction (ε)
- Avoids long probe chains that hurt std.HashMap at high load
- Tested with u64 keys only. Abseil's hash is designed for strings and composite keys; our multiply hash is integer-specialized.
- Single machine (x86_64, ~512KB L2). CPUs with different L2 sizes would shift the sweet spot.
- Compiled with g++ (abseil) vs Zig/LLVM (elastic hash). Different compiler backends may generate different code quality.

## Architecture

### Real Elastic Hashing
### Relationship to the paper

The implementation uses `tier0 = capacity/2` so elements actually spread across tiers:
- Tier 0: ~50% of elements
- Tier 1: ~25% of elements
- Tier 2: ~12.5% of elements
- etc.
**Insertion** follows the paper: tiered arrays with geometrically decreasing sizes, batch insertion with three cases based on tier fullness, and probe limits from the f(epsilon) function.

This is "real" elastic hashing as described in the paper, not just a single-tier SIMD hash table.
**Lookup** searches tier 0 (fast inline path), then calls through an opaque function pointer to check tier 1 (cold overflow path). The function pointer boundary prevents LLVM from cascading optimizations that bloat the hot loop. At 99% load, `get()` finds 100% of elements (97.3% in tier 0, 2.7% in tier 1 via overflow). Early termination on empty slots in the overflow function reduces miss cost in tier 1.

### SIMD Fingerprint Scanning
### SIMD bucketed probing

- 16-byte buckets scanned with SIMD vector comparison
- 8-bit fingerprints (top byte of hash, 0=empty, 0xFF=tombstone)
- 16-element buckets scanned with SSE2 vector comparison
- 8-bit fingerprints (bits 32-39 of hash), 0=empty, 0xFF=tombstone
- `@ctz` on bitmask for fast slot finding
- Tombstone-based deletion (like std.HashMap)
- Linear probing across buckets with upper-bit hash indexing

### Separated Memory Layout
### Memory layout

Fingerprints, keys, and values stored in separate arrays:
- Fingerprint scanning doesn't pollute cache with keys/values
- 4 buckets' fingerprints fit in one 64-byte cache line
- Fingerprints: separate dense array (1MB at 1M elements, fits in L2)
- Entries: interleaved key-value pairs (value load is free after key check -- same cache line)
- Software prefetch for entries at probe 0 (hides L3/DRAM latency for random access)

## Files
### Key parameters

- `src/simple.zig` - Minimal implementation (~100 lines). Start here if you're learning.
- `src/main.zig` - Optimized version with fingerprinting, batch insertion, and the φ priority function from the paper.
- `src/hybrid.zig` - SIMD-accelerated version with:
- `HybridElasticHash` - Runtime version
- `ComptimeHybridElasticHash` - Compile-time version (faster when capacity is known)
- `src/bench.zig` - Benchmarks
| Parameter | Value | Why |
|-----------|-------|-----|
| BUCKET_SIZE | 16 | One SSE2 comparison per bucket |
| MAX_PROBES | 7 | Minimum for 99% load correctness |
| Batch threshold | 0.12 | 88% fill in tier 0 before tier 1 |
| Hash | `key * c ^ (key * c >> 32)` | Single multiply, upper bits for bucket index |

## Usage
## Files

### Test
- `src/simple.zig` - Minimal implementation (~100 lines). Start here.
- `src/main.zig` - Base implementation with fingerprinting and batch insertion.
- `src/hybrid.zig` - SIMD-accelerated version:
- `HybridElasticHash` - Runtime version (primary optimization target)
- `ComptimeHybridElasticHash` - Compile-time version
- `src/bench.zig` - Full benchmark suite
- `src/autobench.zig` - Focused benchmark for abseil comparison
- `bench-abseil.cpp` - Abseil benchmark (identical keys/capacity)
- `bench-realistic.cpp` - Realistic workload benchmarks
- `bench-v2.sh` - Runner that builds and compares both
- `verify-results.md` - Verification methodology and findings

```
zig build test
```

### Benchmark
## Usage

```
zig build bench
zig build test # run tests
zig build bench # full benchmark
bash bench-v2.sh # comparison vs abseil (requires abseil-cpp)
```

## Conclusion

**Is this useful?** Yes, for specific use cases:

| Use Case | Recommendation |
|----------|----------------|
| Write-heavy, high load (>95%) | **Use elastic hash** (4-8x insert win) |
| Delete-heavy, high load | **Use elastic hash** (1.1-2.6x delete win) |
| Known capacity at compile time | **Use ComptimeHybridElasticHash** (2-7x faster) |
| Small datasets (<65k) | **Use elastic hash** (wins both insert and lookup) |
| General purpose | Use std.HashMap |
| Read-heavy, large datasets | Use std.HashMap |
## Optimization log

The elastic hash is not a drop-in replacement for std.HashMap, but it's a genuine win for write-heavy workloads at high load factors - which is exactly what the paper claimed.
40+ experiments across three rounds. See `results-v2.tsv`, `results-v3.tsv` for logs and `insights-v2.md`, `insights-v3.md`, `verify-results.md` for analysis.
127 changes: 127 additions & 0 deletions bench-abseil.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <algorithm>
#include <chrono>
#include <vector>
#include "absl/container/flat_hash_map.h"

template <typename T>
inline void do_not_optimize(T const& val) {
asm volatile("" : : "r,m"(val) : "memory");
}

static uint64_t splitmix64(uint64_t& state) {
state += 0x9e3779b97f4a7c15;
uint64_t z = state;
z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9;
z = (z ^ (z >> 27)) * 0x94d049bb133111eb;
return z ^ (z >> 31);
}

constexpr uint64_t KEY_SEED = 0xDEADBEEF12345678;
constexpr uint64_t MISS_SEED = 0xCAFEBABE87654321;
constexpr int TOTAL_RUNS = 12;
constexpr int WARMUP = 2;
constexpr int MEASURED = TOTAL_RUNS - WARMUP;

struct BenchResult {
uint64_t insert_us;
uint64_t lookup_us;
uint64_t delete_us;
uint64_t miss_us;
};

static uint64_t median(uint64_t* arr, int n) {
std::sort(arr, arr + n);
return arr[n / 2];
}

static BenchResult bench(size_t n, size_t fill) {
// Pre-generate keys (same for every run)
std::vector<uint64_t> keys(fill), miss_keys(fill);
uint64_t ks = KEY_SEED, ms = MISS_SEED;
for (size_t i = 0; i < fill; i++) {
keys[i] = splitmix64(ks);
miss_keys[i] = splitmix64(ms);
}

uint64_t ins[MEASURED], lkp[MEASURED], del[MEASURED], mis[MEASURED];

for (int r = 0; r < TOTAL_RUNS; r++) {
absl::flat_hash_map<uint64_t, uint64_t> map;
map.reserve(n);

auto start = std::chrono::steady_clock::now();
for (size_t i = 0; i < fill; i++) {
map.emplace(keys[i], i);
}
auto end = std::chrono::steady_clock::now();
uint64_t insert_us = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();

start = std::chrono::steady_clock::now();
for (size_t i = 0; i < fill; i++) {
auto it = map.find(keys[i]);
do_not_optimize(it->second);
}
end = std::chrono::steady_clock::now();
uint64_t lookup_us = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();

start = std::chrono::steady_clock::now();
for (size_t i = 0; i < fill; i++) {
auto it = map.find(miss_keys[i]);
do_not_optimize(it == map.end());
}
end = std::chrono::steady_clock::now();
uint64_t miss_us = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();

start = std::chrono::steady_clock::now();
for (size_t i = 0; i < fill / 2; i++) {
auto it = map.find(keys[i]);
if (it != map.end()) map.erase(it);
}
end = std::chrono::steady_clock::now();
uint64_t delete_us = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();

if (r >= WARMUP) {
int idx = r - WARMUP;
ins[idx] = insert_us;
lkp[idx] = lookup_us;
del[idx] = delete_us;
mis[idx] = miss_us;
}
}

return {median(ins, MEASURED), median(lkp, MEASURED),
median(del, MEASURED), median(mis, MEASURED)};
}

int main() {
constexpr size_t sizes[] = {16384, 65536, 262144, 1048576, 2097152};
constexpr int load_pcts[] = {10, 25, 50, 75, 90};

fprintf(stderr, "\n=== Abseil flat_hash_map Benchmark (median of %d, %d warmup) ===\n\n",
MEASURED, WARMUP);

// All sizes at 99% load
for (size_t n : sizes) {
size_t fill = n * 99 / 100;
auto r = bench(n, fill);
printf("RESULT\tn=%zu\tload=99\tinsert_us=%lu\tlookup_us=%lu\tdelete_us=%lu\tmiss_us=%lu\n",
n, r.insert_us, r.lookup_us, r.delete_us, r.miss_us);
fflush(stdout);
}

// Multiple load factors at 1M
constexpr size_t n = 1048576;
for (int pct : load_pcts) {
size_t fill = n * pct / 100;
auto r = bench(n, fill);
printf("RESULT\tn=%zu\tload=%d\tinsert_us=%lu\tlookup_us=%lu\tdelete_us=%lu\tmiss_us=%lu\n",
n, pct, r.insert_us, r.lookup_us, r.delete_us, r.miss_us);
fflush(stdout);
}

fprintf(stderr, "\nDONE\n");
return 0;
}
Loading