chacha20 by l0rinc · Pull Request #88 · l0rinc/bitcoin

l0rinc · 2026-01-04T18:38:14Z

for compiler in gcc clang; do   if [ "$compiler" = "gcc" ]; then CC=gcc; CXX=g++; COMP_VER=$(gcc -dumpfullversion);   else CC=clang; CXX=clang++; COMP_VER=$(clang -dumpversion); fi &&   echo "> Compiler: $compiler $COMP_VER" &&   for commit in 781876e277373ba777458b9f74a4107edf777fe5 36604f9c310b773ee71268d037637cc8b7471f35; do     git fetch origin $commit >/dev/null 2>&1 && git checkout $commit >/dev/null 2>&1 && echo "" && git log -1 --pretty='%h %s' &&     rm -rf build >/dev/null 2>&1 && cmake -B build -DBUILD_BENCH=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=$CC -DCMAKE_CXX_COMPILER=$CXX >/dev/null 2>&1 &&     cmake --build build -j$(nproc) >/dev/null 2>&1 &&     for i in $(seq 0 1); do       build/bin/bench_bitcoin -filter='CHACHA20_.*' -min-time=10000;     done;   done; done                                                                                                                                                                
> Compiler: gcc 15.0.1

781876e277 chacha20: Add generic vectorized chacha20 implementation

|             ns/byte |              byte/s |    err% |        ins/byte |        cyc/byte |    IPC |       bra/byte |   miss% |     total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
|                4.83 |      206,958,878.47 |    0.0% |           23.35 |           11.56 |  2.020 |           0.00 |    0.2% |     11.00 | `CHACHA20_1MB`
|                4.80 |      208,137,730.48 |    0.0% |           20.59 |           11.50 |  1.790 |           0.07 |    0.0% |     11.00 | `CHACHA20_256BYTES`
|                2.69 |      372,003,472.99 |    0.1% |           17.84 |            6.44 |  2.773 |           0.19 |    0.0% |     11.00 | `CHACHA20_64BYTES`

|             ns/byte |              byte/s |    err% |        ins/byte |        cyc/byte |    IPC |       bra/byte |   miss% |     total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
|                4.83 |      207,069,975.88 |    0.0% |           23.35 |           11.56 |  2.021 |           0.00 |    0.1% |     11.00 | `CHACHA20_1MB`
|                4.81 |      208,067,368.76 |    0.0% |           20.59 |           11.51 |  1.789 |           0.07 |    0.0% |     11.00 | `CHACHA20_256BYTES`
|                2.67 |      374,590,193.11 |    0.1% |           17.84 |            6.39 |  2.791 |           0.19 |    0.0% |     11.00 | `CHACHA20_64BYTES`

36604f9c31 refactor: improve GCC handling for NEON/AArch64 in ChaCha20 vectorized paths
> Compiler: clang 22.0.0

781876e277 chacha20: Add generic vectorized chacha20 implementation

|             ns/byte |              byte/s |    err% |        ins/byte |        cyc/byte |    IPC |       bra/byte |   miss% |     total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
|                1.79 |      557,959,494.89 |    0.0% |            9.60 |            4.29 |  2.240 |           0.00 |    0.1% |     11.01 | `CHACHA20_1MB`
|                1.64 |      610,307,932.25 |    0.1% |            6.55 |            3.92 |  1.670 |           0.10 |    0.0% |     11.00 | `CHACHA20_256BYTES`
|                2.60 |      384,698,439.18 |    0.0% |           18.44 |            6.23 |  2.962 |           0.31 |    0.0% |     11.00 | `CHACHA20_64BYTES`

|             ns/byte |              byte/s |    err% |        ins/byte |        cyc/byte |    IPC |       bra/byte |   miss% |     total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
|                1.79 |      559,005,536.76 |    0.0% |            9.60 |            4.28 |  2.244 |           0.00 |    0.1% |     11.00 | `CHACHA20_1MB`
|                1.64 |      610,786,505.54 |    0.1% |            6.55 |            3.92 |  1.672 |           0.10 |    0.0% |     11.00 | `CHACHA20_256BYTES`
|                2.61 |      383,769,985.63 |    0.0% |           18.44 |            6.24 |  2.954 |           0.31 |    0.0% |     11.00 | `CHACHA20_64BYTES`

36604f9c31 refactor: improve GCC handling for NEON/AArch64 in ChaCha20 vectorized paths

|             ns/byte |              byte/s |    err% |        ins/byte |        cyc/byte |    IPC |       bra/byte |   miss% |     total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
|                1.85 |      539,703,724.67 |    0.1% |            9.69 |            4.43 |  2.187 |           0.00 |    0.1% |     11.00 | `CHACHA20_1MB`
|                1.73 |      577,268,413.16 |    0.3% |            6.76 |            4.15 |  1.629 |           0.11 |    0.0% |     10.98 | `CHACHA20_256BYTES`
|                2.61 |      383,738,420.85 |    0.0% |           18.44 |            6.24 |  2.954 |           0.31 |    0.0% |     11.00 | `CHACHA20_64BYTES`

|             ns/byte |              byte/s |    err% |        ins/byte |        cyc/byte |    IPC |       bra/byte |   miss% |     total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
|                1.84 |      542,736,861.01 |    0.0% |            9.69 |            4.41 |  2.198 |           0.00 |    0.1% |     11.00 | `CHACHA20_1MB`
|                1.73 |      577,511,857.60 |    0.1% |            6.76 |            4.15 |  1.629 |           0.11 |    0.0% |     11.00 | `CHACHA20_256BYTES`
|                2.58 |      388,189,914.99 |    0.0% |           18.44 |            6.17 |  2.988 |           0.31 |    0.0% |     11.00 | `CHACHA20_64BYTES`

lorinc@M4-Max bitcoin % for commit in ab23325 1cf4ca6 781876e e81ad4f 684c6b8; do
git fetch origin $commit >/dev/null 2>&1 && git checkout $commit >/dev/null 2>&1 && echo "" && git log -1 --pretty='%h %s' &&
rm -rf build >/dev/null 2>&1 && cmake -B build -DBUILD_BENCH=ON -DCMAKE_BUILD_TYPE=Release >/dev/null 2>&1 &&
cmake --build build -j$(nproc) >/dev/null 2>&1 &&
for _ in $(seq 5); do
sleep 5;
sudo taskpolicy -t 5 -l 5 nice -n -20 ./build/bin/bench_bitcoin -filter='CHACHA20_.*' -min-time=1000;
done;
done

ab23325 Merge bitcoin#33866: refactor: Let CCoinsViewCache::BatchWrite return void

ns/byte	byte/s	err%	total	benchmark
0.94	1,058,268,894.46	0.1%	1.10	`CHACHA20_1MB`
0.95	1,048,416,152.60	0.0%	1.10	`CHACHA20_256BYTES`
1.02	984,038,960.26	0.1%	1.06	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.94	1,060,040,366.28	0.1%	1.10	`CHACHA20_1MB`
0.96	1,047,073,717.58	0.1%	1.10	`CHACHA20_256BYTES`
1.01	988,687,480.14	0.1%	1.06	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.94	1,058,577,350.59	0.1%	1.10	`CHACHA20_1MB`
0.95	1,048,430,547.51	0.1%	1.10	`CHACHA20_256BYTES`
1.01	988,243,103.68	0.1%	1.06	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.94	1,062,574,399.41	0.1%	1.10	`CHACHA20_1MB`
0.95	1,049,985,619.12	0.1%	1.10	`CHACHA20_256BYTES`
1.01	994,965,993.01	0.9%	1.07	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.94	1,061,505,032.08	0.4%	1.10	`CHACHA20_1MB`
0.96	1,045,737,313.10	0.2%	1.10	`CHACHA20_256BYTES`
0.99	1,008,315,324.40	0.5%	1.06	`CHACHA20_64BYTES`

1cf4ca6 chacha20: move single-block crypt to inline helper function

ns/byte	byte/s	err%	total	benchmark
0.95	1,057,645,664.12	0.6%	1.11	`CHACHA20_1MB`
0.95	1,054,704,109.21	0.2%	1.10	`CHACHA20_256BYTES`
1.00	999,328,767.58	0.4%	1.06	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.94	1,067,467,363.20	0.6%	1.09	`CHACHA20_1MB`
0.95	1,049,898,455.45	0.3%	1.10	`CHACHA20_256BYTES`
1.00	998,002,850.63	0.3%	1.06	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.94	1,061,319,270.40	0.3%	1.10	`CHACHA20_1MB`
0.95	1,049,840,423.60	0.3%	1.11	`CHACHA20_256BYTES`
0.98	1,018,308,605.16	0.6%	1.06	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.92	1,089,061,018.31	0.7%	1.10	`CHACHA20_1MB`
0.94	1,069,084,780.31	0.8%	1.10	`CHACHA20_256BYTES`
0.97	1,026,499,631.32	0.2%	1.06	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.92	1,082,938,842.72	0.6%	1.10	`CHACHA20_1MB`
0.92	1,084,811,196.80	0.5%	1.10	`CHACHA20_256BYTES`
0.97	1,029,920,328.74	0.4%	1.06	`CHACHA20_64BYTES`

781876e chacha20: Add generic vectorized chacha20 implementation

ns/byte	byte/s	err%	total	benchmark
0.47	2,125,353,221.24	0.3%	1.10	`CHACHA20_1MB`
0.70	1,429,018,109.32	0.2%	1.10	`CHACHA20_256BYTES`
0.96	1,040,329,069.62	0.5%	1.05	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.47	2,119,953,176.39	0.1%	1.10	`CHACHA20_1MB`
0.69	1,458,624,203.47	0.8%	1.10	`CHACHA20_256BYTES`
0.96	1,045,066,539.72	0.3%	1.05	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.47	2,130,234,806.57	0.3%	1.10	`CHACHA20_1MB`
0.68	1,460,207,386.67	0.5%	1.10	`CHACHA20_256BYTES`
0.95	1,052,170,850.45	0.3%	1.05	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.47	2,136,197,849.41	0.3%	1.10	`CHACHA20_1MB`
0.68	1,472,577,603.43	0.5%	1.10	`CHACHA20_256BYTES`
0.95	1,049,933,878.00	0.4%	1.06	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.47	2,127,184,778.91	0.1%	1.10	`CHACHA20_1MB`
0.68	1,481,420,351.81	0.6%	1.09	`CHACHA20_256BYTES`
0.95	1,050,776,578.07	0.3%	1.05	`CHACHA20_64BYTES`

e81ad4f refactor: replace recursive templates in ChaCha20 implementation with static_for loops

ns/byte	byte/s	err%	total	benchmark
0.47	2,114,253,720.25	0.2%	1.10	`CHACHA20_1MB`
0.71	1,417,796,186.07	0.5%	1.09	`CHACHA20_256BYTES`
0.97	1,028,289,610.80	1.0%	1.05	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.47	2,117,216,904.18	0.2%	1.10	`CHACHA20_1MB`
0.69	1,445,997,595.12	0.4%	1.10	`CHACHA20_256BYTES`
0.97	1,032,485,796.21	0.2%	1.05	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.47	2,116,239,613.66	0.4%	1.10	`CHACHA20_1MB`
0.69	1,445,589,055.85	1.1%	1.10	`CHACHA20_256BYTES`
0.97	1,033,384,684.85	0.4%	1.06	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.47	2,123,535,080.30	0.2%	1.10	`CHACHA20_1MB`
0.68	1,461,407,516.49	0.4%	1.10	`CHACHA20_256BYTES`
0.96	1,041,673,508.08	0.3%	1.06	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.47	2,109,132,889.91	1.1%	1.11	`CHACHA20_1MB`
0.69	1,458,662,628.90	0.6%	1.10	`CHACHA20_256BYTES`
0.96	1,046,068,940.07	0.2%	1.10	`CHACHA20_64BYTES`

684c6b8 refactor: replace template-based static_for use in ChaCha20 with runtime iteration

ns/byte	byte/s	err%	total	benchmark
0.60	1,659,589,463.76	0.1%	1.06	`CHACHA20_1MB`
0.97	1,031,143,486.52	0.5%	1.10	`CHACHA20_256BYTES`
0.97	1,026,590,256.31	0.3%	1.06	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.60	1,656,108,069.32	0.2%	1.06	`CHACHA20_1MB`
0.96	1,042,447,290.91	0.6%	1.09	`CHACHA20_256BYTES`
0.97	1,025,706,259.48	0.5%	1.06	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.60	1,664,479,265.88	0.2%	1.06	`CHACHA20_1MB`
0.96	1,037,608,553.47	0.6%	1.10	`CHACHA20_256BYTES`
0.97	1,029,755,444.87	0.6%	1.05	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.60	1,667,859,284.70	0.4%	1.06	`CHACHA20_1MB`
0.95	1,048,295,704.45	0.2%	1.10	`CHACHA20_256BYTES`
0.96	1,040,519,808.92	0.3%	1.06	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.60	1,670,046,100.32	0.3%	1.06	`CHACHA20_1MB`
0.95	1,055,358,439.82	0.2%	1.10	`CHACHA20_256BYTES`
0.97	1,035,578,740.44	0.3%	1.06	`CHACHA20_64BYTES`

3b47fec refactor: unroll ChaCha20 vector operations for improved clarity and efficiency

ns/byte	byte/s	err%	total	benchmark
0.45	2,199,678,091.89	0.3%	1.11	`CHACHA20_1MB`
0.66	1,507,291,665.04	0.1%	1.10	`CHACHA20_256BYTES`
0.90	1,112,959,504.77	0.7%	1.10	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.46	2,195,684,136.65	0.4%	1.10	`CHACHA20_1MB`
0.66	1,506,660,295.78	0.4%	1.10	`CHACHA20_256BYTES`
0.92	1,087,933,892.78	2.5%	1.13	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.45	2,203,401,712.10	0.2%	1.10	`CHACHA20_1MB`
0.66	1,507,625,039.08	0.2%	1.10	`CHACHA20_256BYTES`
0.90	1,108,996,437.73	0.3%	1.10	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.46	2,183,490,417.79	0.5%	1.11	`CHACHA20_1MB`
0.66	1,507,655,302.52	0.1%	1.10	`CHACHA20_256BYTES`
0.90	1,115,916,915.48	0.2%	1.10	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.45	2,202,195,041.84	0.3%	1.10	`CHACHA20_1MB`
0.66	1,507,910,301.61	0.4%	1.10	`CHACHA20_256BYTES`
0.89	1,119,904,705.18	0.5%	1.10	`CHACHA20_64BYTES`

3e6fcae refactor: unify loop unrolling macros and refactor ChaCha20 vector operations for clarity

ns/byte	byte/s	err%	total	benchmark
0.46	2,159,748,881.74	0.1%	1.10	`CHACHA20_1MB`
0.65	1,549,391,853.78	0.0%	1.10	`CHACHA20_256BYTES`
0.91	1,097,296,385.46	0.4%	1.10	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.46	2,175,945,129.30	0.0%	1.10	`CHACHA20_1MB`
0.65	1,548,893,301.86	0.1%	1.10	`CHACHA20_256BYTES`
0.90	1,113,094,172.35	0.4%	1.10	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.46	2,192,142,331.94	0.3%	1.10	`CHACHA20_1MB`
0.65	1,532,186,313.62	0.5%	1.10	`CHACHA20_256BYTES`
0.88	1,132,883,209.14	0.3%	1.10	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.46	2,194,963,166.87	0.6%	1.11	`CHACHA20_1MB`
0.65	1,548,631,622.21	0.1%	1.11	`CHACHA20_256BYTES`
0.89	1,128,422,075.21	0.6%	1.07	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.45	2,220,489,171.03	0.3%	1.10	`CHACHA20_1MB`
0.65	1,548,464,997.06	0.1%	1.10	`CHACHA20_256BYTES`
0.88	1,135,806,636.22	0.2%	1.10	`CHACHA20_64BYTES`

64471e2a64 refactor: modularize ChaCha20 vector operations and consolidate common patterns

ns/byte	byte/s	err%	total	benchmark
0.48	2,081,892,000.27	0.6%	1.11	`CHACHA20_1MB`
0.68	1,473,786,338.04	0.4%	1.10	`CHACHA20_256BYTES`
0.96	1,042,868,052.36	0.3%	1.06	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.48	2,075,353,491.83	0.5%	1.10	`CHACHA20_1MB`
0.67	1,483,727,345.67	0.3%	1.10	`CHACHA20_256BYTES`
0.95	1,049,641,969.66	0.5%	1.06	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.48	2,078,360,609.28	0.5%	1.10	`CHACHA20_1MB`
0.68	1,475,035,491.76	0.4%	1.10	`CHACHA20_256BYTES`
0.96	1,046,741,829.89	0.2%	1.06	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.48	2,080,665,985.65	0.2%	1.10	`CHACHA20_1MB`
0.67	1,487,879,644.40	0.3%	1.10	`CHACHA20_256BYTES`
0.96	1,047,080,450.94	0.5%	1.06	`CHACHA20_64BYTES`

ns/byte	byte/s	err%	total	benchmark
0.48	2,073,967,519.81	0.6%	1.10	`CHACHA20_1MB`
0.68	1,475,708,659.69	0.4%	1.09	`CHACHA20_256BYTES`
0.95	1,048,601,229.29	0.2%	1.06	`CHACHA20_64BYTES`

Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 2.20 [2.20-2.21], 256B 2.35 [2.35-2.35], 64B 2.59 [2.57-2.62] - gcc 14: 1MB 2.45 [2.44-2.45], 256B 2.51 [2.51-2.51], 64B 2.69 [2.68-2.70] CHACHA20_64BYTES is the single-block path, so it's a good sanity-check for noise. Assembly (scalar path): both compilers lower `std::rotl` to rotates and keep the round math in scalar registers. Example (gcc, quarterround fragment): eor w3, w3, w7 ror w3, w3, #16 add w5, w5, w2 Delta vs base: no measurable change (this is a refactor to simplify later vector work).

Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.80 [1.79-1.80], 256B 1.63 [1.63-1.64], 64B 2.59 [2.57-2.60] - gcc 14: 1MB 5.37 [5.37-5.38], 256B 5.14 [5.13-5.15], 64B 2.70 [2.70-2.75] The speedup/slowdown only shows up once we hit the multi-block path (1MB/256B). Single-block (64B) remains scalar and stays ~unchanged. Assembly highlights (AArch64): - clang emits NEON-friendly rotates/shuffles (`shl`+`usra` and `ext`) with a small stack frame. - gcc emits a very large stack frame and scalar pack/unpack sequences around shuffles. Example prologue (gcc): mov x13, #0x9160 sub sp, sp, x13 Example inner-sequence (gcc): fmov x18, d18 bfxil x10, x18, #0, #32 Example inner-sequence (clang): usra v25.4s, v16.4s, #25 ext v22.16b, v10.16b, v10.16b, #4 Delta vs previous commit: - clang: ~18% faster at 1MB (2.20 -> 1.80 ns/B) - gcc: ~2.2x slower at 1MB (2.45 -> 5.37 ns/B) due to poor multi-state codegen.

… `static_for` loops Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.80 [1.79-1.80], 256B 1.63 [1.63-1.64], 64B 2.59 [2.58-2.60] - gcc 14: 1MB 6.66 [6.64-6.79], 256B 5.02 [5.02-5.03], 64B 2.70 [2.68-2.72] This refactor keeps clang flat, but makes gcc's 1MB case substantially worse. Assembly highlights (gcc): instruction count explodes (CHACHA20_1MB `ins/byte` ~43.7) with many vector loads/stores and branches (lambda clones / `ld1`/`st1` heavy). Example (from one of the inlined helper clones): st1 {v26.16b-v27.16b}, [x4] ldp q26, q27, [x2, #64] Delta vs previous commit: - gcc: 1MB 5.37 -> 6.66 ns/B (regression) - clang: essentially unchanged.

…ime iteration Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.85 [1.85-1.89], 256B 1.72 [1.72-1.73], 64B 2.59 [2.59-2.60] - gcc 14: 1MB 4.51 [4.50-4.51], 256B 4.59 [4.58-4.59], 64B 2.72 [2.70-2.72] This is the first refactor that materially helps gcc again: the multi-state path shrinks substantially (much less codegen bloat), reducing `ins/byte` (43.7 -> 25.5) for CHACHA20_1MB. Assembly highlight (gcc): far less scalar shuffling glue and reduced stack pressure (stack allocation drops from ~0x16c0 to ~0x1530, and objdump size shrinks sharply). Delta vs previous commit: - gcc: 1MB 6.66 -> 4.51 ns/B (still slower than scalar baseline, but improved) - clang: slight regression (1.80 -> 1.85 ns/B), consistent with less aggressive unrolling.

…efficiency Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.79 [1.79-1.80], 256B 1.63 [1.63-1.64], 64B 2.59 [2.58-2.60] - gcc 14: 1MB 5.36 [5.35-5.36], 256B 5.16 [5.15-5.16], 64B 2.72 [2.69-2.73] The additional unrolling helps clang but hurts gcc again. On gcc the multi-state function grows and spills more (large stack frame), pushing 1MB back near the original regression. Delta vs previous commit: - gcc: 1MB 4.51 -> 5.36 ns/B (regression) - clang: 1MB 1.85 -> 1.79 ns/B (improvement)

…0 vector implementation Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.86 [1.86-1.87], 256B 1.73 [1.72-1.73], 64B 2.60 [2.58-2.60] - gcc 14: 1MB 5.74 [5.73-5.74], 256B 5.29 [5.29-5.30], 64B 2.71 [2.69-2.73] This reshuffle/loop consolidation ends up worsening both compilers slightly, but the impact is far larger on gcc. The gcc variant again has a huge stack frame and many extra instructions in the multi-state path (`ins/byte` ~35.7 for CHACHA20_1MB). Assembly contrast (AArch64): - clang: still uses `ext` for lane shuffles and keeps stack relatively small. - gcc: spills and uses scalar pack/unpack sequences; stack allocation is ~0x60a0. Delta vs previous commit: - clang: 1MB 1.79 -> 1.86 ns/B - gcc: 1MB 5.36 -> 5.74 ns/B

…20 handling Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.86 [1.85-1.86], 256B 1.73 [1.72-1.73], 64B 2.59 [2.58-2.60] - gcc 14: 1MB 5.74 [5.73-5.75], 256B 5.29 [5.28-5.29], 64B 2.71 [2.69-2.73] On this Cortex-A76 benchmark, results are unchanged vs the prior commit (within measurement noise). The changes here primarily prepare/extend the generic logic for a broader set of targets.

Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.86 [1.85-1.86], 256B 1.72 [1.72-1.73], 64B 2.59 [2.59-2.60] - gcc 14: 1MB 5.79 [5.78-5.81], 256B 5.29 [5.28-5.29], 64B 2.71 [2.69-2.72] This change is mostly about refining GCC gating on other architectures (e.g. x86 with/without AVX2). On AArch64 it doesn't improve GCC's multi-state codegen yet; GCC still emits a very large vectorized function (stack allocation ~0x5920) and high instruction counts.

…d paths Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.86 [1.86-1.86], 256B 1.73 [1.72-1.73], 64B 2.59 [2.58-2.60] - gcc 14: 1MB 2.45 [2.44-2.45], 256B 2.53 [2.52-2.53], 64B 2.71 [2.69-2.72] Key point: gcc's multi-state vectorized path was a regression on AArch64 (5.7 ns/B class). This commit avoids that by disabling all multi-state variants for gcc on AArch64, effectively falling back to the scalar implementation for multi-block inputs (bringing gcc back near baseline). Also fix the build when all multi-state paths are disabled: avoid referencing `process_blocks<N>` from code that is preprocessor-disabled, so GCC can compile cleanly with a complete disable set.

On AArch64/NEON, GCC's codegen for 256-bit `__builtin_shufflevector` patterns was the root cause of the large perf gap (scalar spills + `fmov`/`bfi`/`bfxil` sequences). Keep Clang on the existing 256-bit vector path, but use a GCC-specific split-lane `vec256` representation (two 128-bit lanes) so GCC can use native NEON shuffles and keep the state in registers. This also enables a multi-state path for GCC again on AArch64 (use 8/4-state; keep 16/6 disabled to limit register pressure). Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median ns/byte): - GCC 14.2: 1MB 1.85, 256B 2.17, 64B 2.71 - Clang 22: 1MB 1.87, 256B 1.73, 64B 2.59

On AArch64/NEON there are 32 128-bit vector registers. The “16-state” variant (8 half-states) needs ~64 128-bit lanes worth of live state (because `vec256` lowers to two 128-bit lanes on NEON), so it spills heavily (notably on clang). Disable `STATES_16` on AArch64 to force the 8-state path, which fits in registers and is substantially faster. Also disable `STATES_6` on AArch64: it increases code size and hurts the common 8/4-state path on this target. Make the per-half-state helpers compile-time sized (no runtime `half_states` argument). This lets compilers fully specialize the inner loops; GCC in particular stops generating extra control-flow and spill glue around the multi-state path. Finally, on AArch64/NEON clang's codegen for the aligned I/O fast-path (`std::assume_aligned` + 32-byte memcpy) is slower than the plain unaligned variant. Prefer the unaligned path for clang. Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=10000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.47 [1.46-1.48], 256B 1.64 [1.64-1.65], 64B 2.59 [2.59-2.60] - gcc 14: 1MB 1.71 [1.71-1.71], 256B 1.95 [1.95-1.97], 64B 2.70 [2.69-2.72] Delta vs previous commit (CHACHA20_1MB, -min-time=10000): - clang: 1.86 -> 1.47 ns/B (avoid 16-state spills; avoid aligned fast-path) - gcc: 1.85 -> 1.71 ns/B (tightened half-state loops)

…per functions

l0rinc · 2026-02-17T15:32:59Z

src/crypto/chacha20.cpp

 }

-inline void ChaCha20Aligned::Crypt(std::span<const std::byte> in_bytes, std::span<std::byte> out_bytes) noexcept
+static inline void chacha20_crypt(std::span<const std::byte> in_bytes, std::span<std::byte> out_bytes, uint32_t input[12]) noexcept


l0rinc · 2026-02-17T15:33:15Z

src/crypto/chacha20.cpp

    std::byte* c = out_bytes.data();
-    size_t blocks = out_bytes.size() / BLOCKLEN;
-    assert(blocks * BLOCKLEN == out_bytes.size());
+    size_t blocks = out_bytes.size() / ChaCha20Aligned::BLOCKLEN;


direct again

l0rinc · 2026-02-17T15:33:17Z

src/crypto/chacha20.cpp

 }

-inline void ChaCha20Aligned::Crypt(std::span<const std::byte> in_bytes, std::span<std::byte> out_bytes) noexcept
+static inline void chacha20_crypt(std::span<const std::byte> in_bytes, std::span<std::byte> out_bytes, uint32_t input[12]) noexcept


l0rinc · 2026-02-17T16:05:54Z

src/crypto/chacha20.cpp

 }

-inline void ChaCha20Aligned::Crypt(std::span<const std::byte> in_bytes, std::span<std::byte> out_bytes) noexcept
+static inline void chacha20_crypt(std::span<const std::byte> in_bytes, std::span<std::byte> out_bytes, uint32_t input[12]) noexcept


a535ff1:
commit hash added automatically

l0rinc · 2026-02-17T16:06:13Z

src/crypto/chacha20.cpp

+#include <crypto/chacha20_vec.h>
 #include <support/cleanse.h>

 #include <algorithm>


3c4a209:
yup, works correctly

l0rinc · 2026-02-17T16:33:41Z

src/crypto/chacha20.cpp

 #include <crypto/common.h>
 #include <crypto/chacha20.h>
+#include <crypto/chacha20_vec.h>
 #include <support/cleanse.h>


another direct

l0rinc · 2026-02-17T16:41:05Z

src/crypto/chacha20.cpp

 #include <algorithm>
 #include <bit>
 #include <cassert>
+#include <limits>


direct comment from global

l0rinc · 2026-02-17T16:41:27Z

src/crypto/chacha20_vec.ipp

+}
+
+template <size_t N, typename Fn>
+ALWAYS_INLINE void static_for(Fn&& fn)


0d8d440:
direct comment from commits

l0rinc · 2026-02-17T16:49:50Z

src/crypto/chacha20_vec.ipp


 using vec256 = uint32_t __attribute__((__vector_size__(32)));

+// Like Bitcoin Core's `ALWAYS_INLINE` in other files, but kept local to avoid touching shared headers.


9d0a168:
another penging

l0rinc · 2026-02-17T16:50:05Z

src/crypto/chacha20_vec.ipp

 {
-    for (size_t i = 0; i < half_states; ++i) {
-        arr[i] = vec;
+    CHACHA20_VEC_UNROLL(8)


2b44fd0:
direct comment

l0rinc · 2026-02-17T16:50:08Z

src/crypto/chacha20_vec.ipp


 using vec256 = uint32_t __attribute__((__vector_size__(32)));

+// Like Bitcoin Core's `ALWAYS_INLINE` in other files, but kept local to avoid touching shared headers.


9d0a168:
another pending

l0rinc · 2026-02-17T16:58:33Z

src/crypto/chacha20.cpp

-inline void ChaCha20Aligned::Crypt(std::span<const std::byte> in_bytes, std::span<std::byte> out_bytes) noexcept
+static inline void chacha20_crypt(std::span<const std::byte> in_bytes, std::span<std::byte> out_bytes, uint32_t input[12]) noexcept
 {
    assert(in_bytes.size() == out_bytes.size());


a535ff1:
pending

l0rinc · 2026-02-17T16:58:47Z

src/crypto/chacha20.cpp

+
+static_assert(ChaCha20Aligned::BLOCKLEN == CHACHA20_VEC_BLOCKLEN);

 #define QUARTERROUND(a,b,c,d) \


3c4a209:
pending 2

l0rinc · 2026-02-17T16:59:03Z

src/crypto/chacha20.cpp

-inline void ChaCha20Aligned::Crypt(std::span<const std::byte> in_bytes, std::span<std::byte> out_bytes) noexcept
+static inline void chacha20_crypt(std::span<const std::byte> in_bytes, std::span<std::byte> out_bytes, uint32_t input[12]) noexcept
 {
    assert(in_bytes.size() == out_bytes.size());


a535ff1:
pending

l0rinc · 2026-02-17T16:59:04Z

src/crypto/chacha20.cpp

+
+static_assert(ChaCha20Aligned::BLOCKLEN == CHACHA20_VEC_BLOCKLEN);

 #define QUARTERROUND(a,b,c,d) \


3c4a209:
pending 2

l0rinc · 2026-02-17T18:32:46Z

src/crypto/chacha20.cpp


 #define QUARTERROUND(a,b,c,d) \
  a += b; d = std::rotl(d ^ a, 16); \
  c += d; b = std::rotl(b ^ c, 12); \


a535ff1:
works

l0rinc · 2026-02-17T18:33:02Z

src/crypto/chacha20.cpp

        blocks -= 1;
-        c += BLOCKLEN;
-        m += BLOCKLEN;
+        c += ChaCha20Aligned::BLOCKLEN;


l0rinc · 2026-02-17T18:33:21Z

src/crypto/chacha20.cpp

@@ -7,11 +7,15 @@

 #include <crypto/common.h>
 #include <crypto/chacha20.h>


3c4a209:
direct

l0rinc · 2026-02-17T18:33:23Z

src/crypto/chacha20.cpp


 #define QUARTERROUND(a,b,c,d) \
  a += b; d = std::rotl(d ^ a, 16); \
  c += d; b = std::rotl(b ^ c, 12); \


a535ff1:
works ;)

l0rinc · 2026-02-17T18:33:24Z

src/crypto/chacha20.cpp

        blocks -= 1;
-        c += BLOCKLEN;
-        m += BLOCKLEN;
+        c += ChaCha20Aligned::BLOCKLEN;


l0rinc · 2026-02-17T22:50:05Z

src/crypto/chacha20_vec.ipp

 }

+template <size_t N, typename Fn, size_t... Is>
+ALWAYS_INLINE void static_for_impl(Fn&& fn, std::index_sequence<Is...>)


0d8d440:
pending comment

l0rinc · 2026-02-17T22:50:17Z

src/crypto/chacha20_vec.ipp

+ALWAYS_INLINE void static_for_impl(Fn&& fn, std::index_sequence<Is...>)
+{
+    (fn(std::integral_constant<size_t, Is>{}), ...);
+}


0d8d440:
direct comment ;)

theuni and others added 11 commits January 3, 2026 23:33

l0rinc force-pushed the detached474 branch from 36604f9 to bac9377 Compare January 9, 2026 19:41

chacha20: refactor vector operations with utility-based loops and hel…

12b92cf

…per functions

l0rinc commented Feb 17, 2026

View reviewed changes


		using vec256 = uint32_t __attribute__((__vector_size__(32)));

		// Like Bitcoin Core's `ALWAYS_INLINE` in other files, but kept local to avoid touching shared headers.


		static_assert(ChaCha20Aligned::BLOCKLEN == CHACHA20_VEC_BLOCKLEN);

		#define QUARTERROUND(a,b,c,d) \

		@@ -7,11 +7,15 @@

		#include <crypto/common.h>
		#include <crypto/chacha20.h>

Conversation

l0rinc commented Jan 4, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

l0rinc Feb 17, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

l0rinc Feb 17, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

l0rinc Feb 17, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

2 participants

l0rinc commented Jan 4, 2026 •

edited

Loading

l0rinc Feb 17, 2026 •

edited

Loading

l0rinc Feb 17, 2026 •

edited

Loading

l0rinc Feb 17, 2026 •

edited

Loading