Skip to content

Commit 74a6ae6

Browse files
committed
Improve parser performance docs
1 parent 8e35dac commit 74a6ae6

File tree

5 files changed

+104
-80
lines changed

5 files changed

+104
-80
lines changed

DOCUMENTATION.md

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -193,33 +193,33 @@ Source: `bench/results/latest.json` (`stable` profile).
193193

194194
| Fixture | ours-fastest | ours-strictest | lol-html | lexbor |
195195
|---|---:|---:|---:|---:|
196-
| `rust-lang.html` | 1805.42 | 1878.49 | 1476.56 | 334.08 |
197-
| `wiki-html.html` | 1063.88 | 1102.18 | 1116.01 | 243.40 |
198-
| `mdn-html.html` | 2433.30 | 2019.48 | 1718.10 | 390.32 |
199-
| `w3-html52.html` | 786.03 | 823.12 | 627.76 | 174.28 |
200-
| `hn.html` | 1367.82 | 1274.17 | 858.10 | 224.01 |
196+
| `rust-lang.html` | 1757.85 | 1696.05 | 1496.39 | 334.42 |
197+
| `wiki-html.html` | 1629.79 | 1371.63 | 1217.86 | 271.77 |
198+
| `mdn-html.html` | 2582.15 | 2477.57 | 1859.72 | 408.71 |
199+
| `w3-html52.html` | 1041.44 | 1011.79 | 746.69 | 199.72 |
200+
| `hn.html` | 1249.80 | 1159.05 | 867.50 | 224.31 |
201201

202202
#### Query Match Throughput (ours)
203203

204204
| Case | strictest ops/s | strictest ns/op | fastest ops/s | fastest ns/op |
205205
|---|---:|---:|---:|---:|
206-
| `attr-heavy-button` | 145033053.03 | 6.89 | 146021853.63 | 6.85 |
207-
| `attr-heavy-nav` | 143537922.72 | 6.97 | 144591413.58 | 6.92 |
206+
| `attr-heavy-button` | 142538467.57 | 7.02 | 144381814.82 | 6.93 |
207+
| `attr-heavy-nav` | 144225430.12 | 6.93 | 145815319.07 | 6.86 |
208208

209209
#### Cached Query Throughput (ours)
210210

211211
| Case | strictest ops/s | strictest ns/op | fastest ops/s | fastest ns/op |
212212
|---|---:|---:|---:|---:|
213-
| `attr-heavy-button` | 216319113.96 | 4.62 | 208243528.31 | 4.80 |
214-
| `attr-heavy-nav` | 212898679.39 | 4.70 | 211159795.17 | 4.74 |
213+
| `attr-heavy-button` | 211326241.22 | 4.73 | 213762911.28 | 4.68 |
214+
| `attr-heavy-nav` | 216667352.78 | 4.62 | 218657139.05 | 4.57 |
215215

216216
#### Query Parse Throughput (ours)
217217

218218
| Selector case | Ops/s | ns/op |
219219
|---|---:|---:|
220-
| `simple` | 19437161.99 | 51.45 |
221-
| `complex` | 6082568.65 | 164.40 |
222-
| `grouped` | 7141994.24 | 140.02 |
220+
| `simple` | 18217710.38 | 54.89 |
221+
| `complex` | 6068387.66 | 164.79 |
222+
| `grouped` | 7021230.50 | 142.43 |
223223

224224
For full per-parser, per-fixture tables and gate output:
225225
- `bench/results/latest.md`

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,10 @@ Source: `bench/results/latest.json` (`stable` profile).
2323
### Parse Throughput (Average Across Fixtures)
2424

2525
```text
26-
ours-fastest │████████████████████│ 1491.29 MB/s (100.00%)
27-
ours-strictest │███████████████████░│ 1419.49 MB/s (95.19%)
28-
lol-html │███████████████░░░░│ 1159.31 MB/s (77.74%)
29-
lexbor │███░░░░░░░░░░░░░░░░│ 273.22 MB/s (18.32%)
26+
ours-fastest │████████████████████│ 1652.21 MB/s (100.00%)
27+
ours-strictest │███████████████████░│ 1543.22 MB/s (93.40%)
28+
lol-html │███████████████░░░░░│ 1237.63 MB/s (74.91%)
29+
lexbor │███░░░░░░░░░░░░░░░░░│ 287.78 MB/s (17.42%)
3030
```
3131

3232
### Conformance Snapshot

src/html/attr_inline.zig

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -315,8 +315,7 @@ fn parseRawValue(source: []u8, span_end: usize, eq_index: usize) RawValue {
315315
}
316316

317317
if (c == '\'' or c == '"') {
318-
const q = c;
319-
const j = scanner.findByte(source, i + 1, q) orelse span_end;
318+
const j = scanner.findByte(source, i + 1, c) orelse span_end;
320319
const next_start = if (j < span_end) j + 1 else span_end;
321320
return .{ .kind = .quoted, .start = i + 1, .end = j, .next_start = next_start };
322321
}

src/html/parser.zig

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -393,7 +393,7 @@ fn Parser(comptime Doc: type, comptime opts: anytype) type {
393393
while (self.i < self.input.len and tables.WhitespaceTable[self.input[self.i]]) : (self.i += 1) {}
394394
}
395395

396-
fn findRawTextClose(noalias self: *Self, tag_name: []const u8, start: usize) ?struct { content_end: usize, close_end: usize } {
396+
inline fn findRawTextClose(noalias self: *Self, tag_name: []const u8, start: usize) ?struct { content_end: usize, close_end: usize } {
397397
var j = scanner.findByte(self.input, start, '<') orelse return null;
398398
const tag_len = tag_name.len;
399399
if (tag_len == 0) return null;

src/html/scanner.zig

Lines changed: 86 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -11,28 +11,24 @@ pub const TagEnd = struct {
1111

1212
/// Finds `needle` byte in `hay` from `start`, using SIMD where available.
1313
pub inline fn findByte(hay: []const u8, start: usize, needle: u8) ?usize {
14-
return findByteDispatch(hay, start, needle);
14+
// return findByteDispatch(hay, start, needle);
15+
return @call(.always_inline, indexOfScalarPos, .{hay, start, needle});
1516
}
1617

1718
/// Scans from `start` to next `>` while skipping quoted `>` inside attributes.
18-
pub fn findTagEndRespectQuotes(hay: []const u8, start: usize) ?TagEnd {
19-
const first = findAny3Dispatch(hay, start, '>', '"', '\'') orelse return null;
20-
const first_ch = hay[first];
21-
if (first_ch == '>') return finalizeTagEnd(hay, start, first);
22-
23-
var quote = first_ch;
24-
var i = first + 1;
25-
while (i < hay.len) {
26-
const q_pos = findByteDispatch(hay, i, quote) orelse return null;
27-
i = q_pos + 1;
28-
29-
const pos = findAny3Dispatch(hay, i, '>', '"', '\'') orelse return null;
30-
const ch = hay[pos];
31-
if (ch == '>') return finalizeTagEnd(hay, start, pos);
32-
quote = ch;
33-
i = pos + 1;
19+
pub fn findTagEndRespectQuotes(hay: []const u8, _start: usize) ?TagEnd {
20+
var start = _start;
21+
var end = findAny3Dispatch(hay, start) orelse {@branchHint(.cold); return null;};
22+
blk: switch (hay[end]) {
23+
'>' => return finalizeTagEnd(hay, start, end),
24+
'\'', '"' => |q| {
25+
start = 1 + end;
26+
start = 1 + (findByte(hay, start, q) orelse {@branchHint(.cold); return null;});
27+
end = findAny3Dispatch(hay, start) orelse {@branchHint(.cold); return null;};
28+
continue :blk hay[end];
29+
},
30+
else => unreachable,
3431
}
35-
return null;
3632
}
3733

3834
inline fn finalizeTagEnd(hay: []const u8, start: usize, gt_index: usize) TagEnd {
@@ -74,44 +70,33 @@ inline fn finalizeTagEnd(hay: []const u8, start: usize, gt_index: usize) TagEnd
7470
};
7571
}
7672

77-
inline fn findByteDispatch(hay: []const u8, start: usize, needle: u8) ?usize {
78-
// Compile-time architecture dispatch keeps a single callsite shape while
79-
// selecting the fastest available vector width.
80-
if (comptime builtin.cpu.arch == .x86_64 and std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) {
81-
return findByteVec(32, hay, start, needle);
82-
}
83-
if (comptime builtin.cpu.arch == .x86_64 and std.Target.x86.featureSetHas(builtin.cpu.features, .sse2)) {
84-
return findByteVec(16, hay, start, needle);
85-
}
86-
if (comptime builtin.cpu.arch == .aarch64) {
87-
return findByteVec(16, hay, start, needle);
88-
}
89-
return std.mem.indexOfScalarPos(u8, hay, start, needle);
90-
}
91-
92-
inline fn findAny3Dispatch(hay: []const u8, start: usize, a: u8, b: u8, c: u8) ?usize {
73+
inline fn findAny3Dispatch(hay:[]const u8, start: usize) ?usize {
9374
if (comptime builtin.cpu.arch == .x86_64 and std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) {
94-
return findAny3Vec(32, hay, start, a, b, c);
75+
return findAny3Vec(32, hay, start);
9576
}
9677
if (comptime builtin.cpu.arch == .x86_64 and std.Target.x86.featureSetHas(builtin.cpu.features, .sse2)) {
97-
return findAny3Vec(16, hay, start, a, b, c);
78+
return findAny3Vec(16, hay, start);
9879
}
9980
if (comptime builtin.cpu.arch == .aarch64) {
100-
return findAny3Vec(16, hay, start, a, b, c);
81+
return findAny3Vec(16, hay, start);
10182
}
102-
return findAny3Scalar(hay, start, a, b, c);
83+
return findAny3Scalar(hay, start);
10384
}
10485

105-
inline fn findAny3Scalar(hay: []const u8, start: usize, a: u8, b: u8, c: u8) ?usize {
106-
var i = start;
107-
while (i < hay.len) : (i += 1) {
108-
const ch = hay[i];
86+
inline fn findAny3Scalar(hay:[]const u8, start: usize) ?usize {
87+
const a = '>';
88+
const b = '"';
89+
const c = '\'';
90+
for (hay[start..], start..) |ch, i| {
10991
if (ch == a or ch == b or ch == c) return i;
11092
}
11193
return null;
11294
}
11395

114-
inline fn findAny3Vec(comptime lanes: comptime_int, hay: []const u8, start: usize, a: u8, b: u8, c: u8) ?usize {
96+
inline fn findAny3Vec(comptime lanes: comptime_int, hay: []const u8, start: usize) ?usize {
97+
const a = '>';
98+
const b = '"';
99+
const c = '\'';
115100
const Vec = @Vector(lanes, u8);
116101
const a_vec: Vec = @splat(a);
117102
const b_vec: Vec = @splat(b);
@@ -123,34 +108,74 @@ inline fn findAny3Vec(comptime lanes: comptime_int, hay: []const u8, start: usiz
123108
const vec: Vec = chunk;
124109
const mask = (vec == a_vec) | (vec == b_vec) | (vec == c_vec);
125110
if (@reduce(.Or, mask)) {
126-
var j: usize = 0;
127-
while (j < lanes) : (j += 1) {
128-
const ch = chunk[j];
129-
if (ch == a or ch == b or ch == c) return i + j;
111+
for (hay[i..], i..) |ch, j| {
112+
if (ch == a or ch == b or ch == c) return j;
130113
}
114+
unreachable;
115+
} else {
116+
@branchHint(.likely);
131117
}
132118
}
133-
return findAny3Scalar(hay, i, a, b, c);
119+
return findAny3Scalar(hay, i);
134120
}
135121

136-
inline fn findByteVec(comptime lanes: comptime_int, hay: []const u8, start: usize, needle: u8) ?usize {
137-
const Vec = @Vector(lanes, u8);
138-
const needle_vec: Vec = @splat(needle);
139122

140-
var i = start;
141-
while (i + lanes <= hay.len) : (i += lanes) {
142-
const chunk: [lanes]u8 = hay[i..][0..lanes].*;
143-
const vec: Vec = chunk;
144-
const mask = vec == needle_vec;
145-
if (@reduce(.Or, mask)) {
146-
var j: usize = 0;
147-
while (j < lanes) : (j += 1) {
148-
if (chunk[j] == needle) return i + j;
123+
inline fn indexOfScalarPos(slice: []const u8, start_index: usize, value: u8) ?usize {
124+
if (start_index >= slice.len) return null;
125+
126+
var i: usize = start_index;
127+
if (!@inComptime()) {
128+
if (std.simd.suggestVectorLength(u8)) |block_len| {
129+
// For Intel Nehalem (2009) and AMD Bulldozer (2012) or later, unaligned loads on aligned data result
130+
// in the same execution as aligned loads. We ignore older architectures here and don't bother pre-aligning.
131+
//
132+
// Use `std.simd.suggestVectorLength(T)` to get the same alignment as used in this function
133+
// however, this usually isn't necessary unless your architecture has a performance penalty due to this.
134+
//
135+
// This may differ for other architectures. Arm, for example, costs a cycle when loading across a cache
136+
// line so explicit alignment prologues may be worth exploration.
137+
138+
// Unrolling here is a ~10% improvement. We can then do one bounds check every 2 blocks
139+
// instead of one per block, which adds up.
140+
const Block = @Vector(block_len, u8);
141+
if (i + 2 * block_len < slice.len) {
142+
const mask: Block = @splat(value);
143+
while (true) {
144+
inline for (0..2) |_| {
145+
const block: Block = slice[i..][0..block_len].*;
146+
const matches = block == mask;
147+
if (@reduce(.Or, matches)) {
148+
return i + std.simd.firstTrue(matches).?;
149+
}
150+
i += block_len;
151+
}
152+
if (i + 2 * block_len >= slice.len) break;
153+
}
154+
}
155+
156+
// {block_len, block_len / 2} check
157+
inline for (0..2) |j| {
158+
const block_x_len = block_len / (1 << j);
159+
comptime if (block_x_len < 4) break;
160+
161+
const BlockX = @Vector(block_x_len, u8);
162+
if (i + block_x_len < slice.len) {
163+
const mask: BlockX = @splat(value);
164+
const block: BlockX = slice[i..][0..block_x_len].*;
165+
const matches = block == mask;
166+
if (@reduce(.Or, matches)) {
167+
return i + std.simd.firstTrue(matches).?;
168+
}
169+
i += block_x_len;
170+
}
149171
}
150172
}
151173
}
152174

153-
return std.mem.indexOfScalarPos(u8, hay, i, needle);
175+
for (slice[i..], i..) |c, j| {
176+
if (c == value) return j;
177+
}
178+
return null;
154179
}
155180

156181
test "findByte helper matches scalar behavior" {

0 commit comments

Comments
 (0)