Skip to content

Commit 9f0ecd9

Browse files
committed
Add parser mode guidance doc
1 parent fc040b3 commit 9f0ecd9

9 files changed

Lines changed: 195 additions & 55 deletions

File tree

README.md

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -108,6 +108,18 @@ if (node == null) {
108108
}
109109
```
110110

111+
### Instrumentation hooks
112+
113+
```zig
114+
var hooks = Hooks{};
115+
try html.parseWithHooks(&doc, &input, .{}, &hooks);
116+
_ = try html.queryOneRuntimeWithHooks(&doc, "a.primary", &hooks);
117+
```
118+
119+
Reference examples:
120+
- `examples/debug_query_report.zig`
121+
- `examples/instrumentation_hooks.zig`
122+
111123
## Parse Option Recipes
112124

113125
Two bundles are used by the benchmark harness and conformance runner:

bench/README.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@ This directory benchmarks `htmlparser` against other high-performance HTML parse
88

99
It also benchmarks `htmlparser` query parsing throughput (runtime selector compile path).
1010
Query sections remain `htmlparser`-only; comparisons against external parsers cover parse throughput only.
11+
`query-parse` is mode-independent, so it is measured once (`ours`) rather than once per mode, which avoids duplicate compatibility rows.
1112

1213
`htmlparser` parse results are reported in two internal benchmark modes:
1314

docs/getting-started.md

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -49,6 +49,26 @@ _ = node;
4949

5050
Source of truth: `examples/runtime_selector.zig` and `examples/compiled_selector.zig`.
5151

52+
## Debug Diagnostics and Hooks
53+
54+
Selector mismatch diagnostics:
55+
56+
```zig
57+
var report: html.QueryDebugReport = .{};
58+
const node = try doc.queryOneRuntimeDebug("a[href^=https]", &report);
59+
_ = node;
60+
```
61+
62+
Instrumentation wrappers:
63+
64+
```zig
65+
var hooks = Hooks{};
66+
try html.parseWithHooks(&doc, &input, .{}, &hooks);
67+
_ = try html.queryOneRuntimeWithHooks(&doc, "a.primary", &hooks);
68+
```
69+
70+
Source of truth: `examples/debug_query_report.zig` and `examples/instrumentation_hooks.zig`.
71+
5272
## Parse Options
5373

5474
```zig

examples/debug_query_report.zig

Lines changed: 23 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,23 @@
1+
const std = @import("std");
2+
const html = @import("htmlparser");
3+
const default_options: html.ParseOptions = .{};
4+
const Document = default_options.GetDocument();
5+
6+
/// Parses a small fixed document, runs a runtime selector that is expected
/// to match nothing, and checks that the debug report was filled in.
/// Returns an error if parsing, querying, or any expectation fails.
fn run() !void {
7+
var doc = Document.init(std.testing.allocator);
8+
defer doc.deinit();
9+
10+
// `.*` copies the string literal into a mutable local array so that a
// pointer to writable bytes can be handed to `parse`.
// NOTE(review): presumably the parser needs mutable input — confirm.
var input = "<div><a id='one' class='nav'></a><a id='two'></a></div>".*;
11+
try doc.parse(&input, .{});
12+
13+
var report: html.QueryDebugReport = .{};
14+
// Neither <a> in the input carries an `href` attribute, so this selector
// is expected to return null while still populating `report`.
const node = try doc.queryOneRuntimeDebug("a[href^=https]", &report);
15+
try std.testing.expect(node == null);
16+
try std.testing.expect(report.visited_elements > 0);
17+
try std.testing.expect(report.near_miss_len > 0);
18+
// The first recorded near miss must carry a reason kind other than `.none`.
try std.testing.expect(report.near_misses[0].reason.kind != .none);
19+
}
20+
21+
// Test entry point: delegates to `run()` so the example body is exercised
// by the test runner.
test "query debug report for selector mismatch" {
22+
try run();
23+
}

examples/instrumentation_hooks.zig

Lines changed: 46 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,46 @@
1+
const std = @import("std");
2+
const html = @import("htmlparser");
3+
const default_options: html.ParseOptions = .{};
4+
const Document = default_options.GetDocument();
5+
6+
/// Example instrumentation sink: counts how many times each hook callback
/// is invoked. Every callback ignores its payload argument(s) and only
/// increments the matching counter.
const Hooks = struct {
7+
/// Number of `onParseStart` invocations.
parse_start_calls: usize = 0,
8+
/// Number of `onParseEnd` invocations.
parse_end_calls: usize = 0,
9+
/// Number of `onQueryStart` invocations.
query_start_calls: usize = 0,
10+
/// Number of `onQueryEnd` invocations.
query_end_calls: usize = 0,
11+
12+
/// Records one parse-start event; the `usize` argument is discarded.
pub fn onParseStart(self: *@This(), _: usize) void {
13+
self.parse_start_calls += 1;
14+
}
15+
16+
/// Records one parse-end event; the stats argument is discarded.
pub fn onParseEnd(self: *@This(), _: html.ParseInstrumentationStats) void {
17+
self.parse_end_calls += 1;
18+
}
19+
20+
/// Records one query-start event; the kind and `usize` arguments are discarded.
pub fn onQueryStart(self: *@This(), _: html.QueryInstrumentationKind, _: usize) void {
21+
self.query_start_calls += 1;
22+
}
23+
24+
/// Records one query-end event; the stats argument is discarded.
pub fn onQueryEnd(self: *@This(), _: html.QueryInstrumentationStats) void {
25+
self.query_end_calls += 1;
26+
}
27+
};
28+
29+
/// Parses a small fixed document and runs one runtime query through the
/// hook-aware wrappers, checking after each step that the corresponding
/// start/end counters on `Hooks` read exactly 1.
/// Returns an error if parsing, querying, or any expectation fails.
fn run() !void {
30+
var doc = Document.init(std.testing.allocator);
31+
defer doc.deinit();
32+
33+
var hooks: Hooks = .{};
34+
// `.*` copies the string literal into a mutable local array so that a
// pointer to writable bytes can be passed to the parser.
var input = "<div><span id='x'></span></div>".*;
35+
try html.parseWithHooks(&doc, &input, .{}, &hooks);
36+
// One parse call is expected to produce exactly one start and one end event.
try std.testing.expectEqual(@as(usize, 1), hooks.parse_start_calls);
37+
try std.testing.expectEqual(@as(usize, 1), hooks.parse_end_calls);
38+
39+
// The query result itself is discarded; only the hook counters are checked.
_ = try html.queryOneRuntimeWithHooks(&doc, "span#x", &hooks);
40+
try std.testing.expectEqual(@as(usize, 1), hooks.query_start_calls);
41+
try std.testing.expectEqual(@as(usize, 1), hooks.query_end_calls);
42+
}
43+
44+
// Test entry point: delegates to `run()` so the example body is exercised
// by the test runner.
test "instrumentation hook wrappers" {
45+
try run();
46+
}

src/bench/bench.zig

Lines changed: 7 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -14,20 +14,7 @@ fn parseMode(arg: []const u8) !BenchMode {
1414
return error.InvalidBenchMode;
1515
}
1616

17-
fn parseDocForParseBench(noalias doc: *Document, input: []u8, mode: BenchMode) !void {
18-
switch (mode) {
19-
.strictest => try doc.parse(input, .{
20-
.eager_child_views = true,
21-
.drop_whitespace_text_nodes = false,
22-
}),
23-
.fastest => try doc.parse(input, .{
24-
.eager_child_views = false,
25-
.drop_whitespace_text_nodes = true,
26-
}),
27-
}
28-
}
29-
30-
fn parseDocForQueryBench(noalias doc: *Document, input: []u8, mode: BenchMode) !void {
17+
fn parseDocForBench(noalias doc: *Document, input: []u8, mode: BenchMode) !void {
3118
switch (mode) {
3219
.strictest => try doc.parse(input, .{
3320
.eager_child_views = true,
@@ -92,9 +79,9 @@ pub fn runParseFile(path: []const u8, iterations: usize, mode: BenchMode) !u64 {
9279
defer doc.deinit();
9380
if (working_opt) |working| {
9481
@memcpy(working, input);
95-
try parseDocForParseBench(&doc, working, mode);
82+
try parseDocForBench(&doc, working, mode);
9683
} else {
97-
try parseDocForParseBench(&doc, input, mode);
84+
try parseDocForBench(&doc, input, mode);
9885
}
9986
}
10087
_ = parse_arena.reset(.retain_capacity);
@@ -136,7 +123,7 @@ pub fn runQueryMatch(path: []const u8, selector: []const u8, iterations: usize,
136123

137124
var doc = Document.init(alloc);
138125
defer doc.deinit();
139-
try parseDocForQueryBench(&doc, working, mode);
126+
try parseDocForBench(&doc, working, mode);
140127

141128
const start = std.time.nanoTimestamp();
142129
var i: usize = 0;
@@ -166,7 +153,7 @@ pub fn runQueryCompiled(path: []const u8, selector: []const u8, iterations: usiz
166153

167154
var doc = Document.init(alloc);
168155
defer doc.deinit();
169-
try parseDocForQueryBench(&doc, working, mode);
156+
try parseDocForBench(&doc, working, mode);
170157

171158
const start = std.time.nanoTimestamp();
172159
var i: usize = 0;
@@ -198,14 +185,6 @@ pub fn main() !void {
198185
return;
199186
}
200187

201-
if (args.len == 5 and std.mem.eql(u8, args[1], "query-parse")) {
202-
_ = try parseMode(args[2]);
203-
const iterations = try std.fmt.parseInt(usize, args[4], 10);
204-
const total_ns = try runQueryParse(args[3], iterations);
205-
std.debug.print("{d}\n", .{total_ns});
206-
return;
207-
}
208-
209188
if (args.len == 5 and std.mem.eql(u8, args[1], "query-match")) {
210189
const iterations = try std.fmt.parseInt(usize, args[4], 10);
211190
const total_ns = try runQueryMatch(args[2], args[3], iterations, .fastest);
@@ -246,8 +225,8 @@ pub fn main() !void {
246225

247226
if (args.len != 3) {
248227
std.debug.print(
249-
"usage:\n {s} <html-file> <iterations>\n {s} parse <strictest|fastest> <html-file> <iterations>\n {s} query-parse <selector> <iterations>\n {s} query-parse <strictest|fastest> <selector> <iterations>\n {s} query-match <html-file> <selector> <iterations>\n {s} query-match <strictest|fastest> <html-file> <selector> <iterations>\n {s} query-compiled <html-file> <selector> <iterations>\n {s} query-compiled <strictest|fastest> <html-file> <selector> <iterations>\n",
250-
.{ args[0], args[0], args[0], args[0], args[0], args[0], args[0], args[0] },
228+
"usage:\n {s} <html-file> <iterations>\n {s} parse <strictest|fastest> <html-file> <iterations>\n {s} query-parse <selector> <iterations>\n {s} query-match <html-file> <selector> <iterations>\n {s} query-match <strictest|fastest> <html-file> <selector> <iterations>\n {s} query-compiled <html-file> <selector> <iterations>\n {s} query-compiled <strictest|fastest> <html-file> <selector> <iterations>\n",
229+
.{ args[0], args[0], args[0], args[0], args[0], args[0], args[0] },
251230
);
252231
std.process.exit(2);
253232
}

src/examples_tests.zig

Lines changed: 50 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -122,3 +122,53 @@ test "example parity: strictest and fastest selectors agree" {
122122
try std.testing.expectEqual(@as(usize, 2), strictest_count);
123123
try std.testing.expectEqual(strictest_count, fastest_count);
124124
}
125+
126+
// Repeats the steps of `examples/debug_query_report.zig` inside the test
// suite: parse a fixed document, run a selector expected to match nothing,
// and check that the debug report was populated.
test "example parity: debug query report" {
127+
var doc = Document.init(std.testing.allocator);
128+
defer doc.deinit();
129+
130+
// `.*` copies the literal into a mutable local array for `parse`.
var input = "<div><a id='one' class='nav'></a><a id='two'></a></div>".*;
131+
try doc.parse(&input, .{});
132+
133+
var report: html.QueryDebugReport = .{};
134+
// Neither <a> in the input has an `href` attribute, so null is expected.
const node = try doc.queryOneRuntimeDebug("a[href^=https]", &report);
135+
try std.testing.expect(node == null);
136+
try std.testing.expect(report.visited_elements > 0);
137+
try std.testing.expect(report.near_miss_len > 0);
138+
// The first recorded near miss must carry a reason kind other than `.none`.
try std.testing.expect(report.near_misses[0].reason.kind != .none);
139+
}
140+
141+
// Repeats the steps of `examples/instrumentation_hooks.zig` inside the test
// suite: a local counting `Hooks` type is passed to the hook-aware parse and
// query wrappers, and each counter is expected to read exactly 1 afterwards.
test "example parity: instrumentation hooks" {
142+
// Local hook sink; each callback discards its payload and bumps a counter.
const Hooks = struct {
143+
parse_start_calls: usize = 0,
144+
parse_end_calls: usize = 0,
145+
query_start_calls: usize = 0,
146+
query_end_calls: usize = 0,
147+
148+
pub fn onParseStart(self: *@This(), _: usize) void {
149+
self.parse_start_calls += 1;
150+
}
151+
pub fn onParseEnd(self: *@This(), _: html.ParseInstrumentationStats) void {
152+
self.parse_end_calls += 1;
153+
}
154+
pub fn onQueryStart(self: *@This(), _: html.QueryInstrumentationKind, _: usize) void {
155+
self.query_start_calls += 1;
156+
}
157+
pub fn onQueryEnd(self: *@This(), _: html.QueryInstrumentationStats) void {
158+
self.query_end_calls += 1;
159+
}
160+
};
161+
162+
var doc = Document.init(std.testing.allocator);
163+
defer doc.deinit();
164+
var hooks: Hooks = .{};
165+
166+
// `.*` copies the literal into a mutable local array for the parser.
var input = "<div><span id='x'></span></div>".*;
167+
try html.parseWithHooks(&doc, &input, .{}, &hooks);
168+
// The query result is discarded; only the hook counters are checked below.
_ = try html.queryOneRuntimeWithHooks(&doc, "span#x", &hooks);
169+
170+
// One parse and one query should each yield exactly one start and one end event.
try std.testing.expectEqual(@as(usize, 1), hooks.parse_start_calls);
171+
try std.testing.expectEqual(@as(usize, 1), hooks.parse_end_calls);
172+
try std.testing.expectEqual(@as(usize, 1), hooks.query_start_calls);
173+
try std.testing.expectEqual(@as(usize, 1), hooks.query_end_calls);
174+
}

src/html/document.zig

Lines changed: 14 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1408,25 +1408,25 @@ test "leading child combinator works in node-scoped queries" {
14081408
try std.testing.expectEqual(@as(usize, 1), hsoob_count);
14091409
}
14101410

1411-
test "attribute parsing preserves selector/query behavior for representative input" {
1411+
test "parse option bundles preserve selector/query behavior for representative input" {
14121412
const alloc = std.testing.allocator;
14131413

1414-
var eager_doc = Document.init(alloc);
1415-
defer eager_doc.deinit();
1416-
var deferred_doc = Document.init(alloc);
1417-
defer deferred_doc.deinit();
1414+
var strict_doc = Document.init(alloc);
1415+
defer strict_doc.deinit();
1416+
var fast_doc = Document.init(alloc);
1417+
defer fast_doc.deinit();
14181418

1419-
var eager_html = ("<html><body>" ++
1419+
var strict_html = ("<html><body>" ++
14201420
"<div id='x' class='alpha beta' data-k='v' data-q='1>2'>x</div>" ++
14211421
"<img id='im' src='a.png' />" ++
14221422
"<a id='a1' href='https://example.com' class='nav button'>ok</a>" ++
14231423
"<p id='p1'>a<span id='s1'>b</span></p>" ++
14241424
"<div id='e' a= ></div>" ++
14251425
"</body></html>").*;
1426-
var deferred_html = eager_html;
1426+
var fast_html = strict_html;
14271427

1428-
try eager_doc.parse(&eager_html, .{});
1429-
try deferred_doc.parse(&deferred_html, .{
1428+
try strict_doc.parse(&strict_html, .{});
1429+
try fast_doc.parse(&fast_html, .{
14301430
.eager_child_views = false,
14311431
});
14321432

@@ -1439,14 +1439,14 @@ test "attribute parsing preserves selector/query behavior for representative inp
14391439
};
14401440

14411441
for (selectors) |sel| {
1442-
const a = try eager_doc.queryOneRuntime(sel);
1443-
const b = try deferred_doc.queryOneRuntime(sel);
1442+
const a = try strict_doc.queryOneRuntime(sel);
1443+
const b = try fast_doc.queryOneRuntime(sel);
14441444
try std.testing.expect((a == null) == (b == null));
14451445
}
14461446

1447-
const eager_empty = (eager_doc.queryOne("#e") orelse return error.TestUnexpectedResult).getAttributeValue("a") orelse return error.TestUnexpectedResult;
1448-
const deferred_empty = (deferred_doc.queryOne("#e") orelse return error.TestUnexpectedResult).getAttributeValue("a") orelse return error.TestUnexpectedResult;
1449-
try std.testing.expectEqualStrings(eager_empty, deferred_empty);
1447+
const strict_empty = (strict_doc.queryOne("#e") orelse return error.TestUnexpectedResult).getAttributeValue("a") orelse return error.TestUnexpectedResult;
1448+
const fast_empty = (fast_doc.queryOne("#e") orelse return error.TestUnexpectedResult).getAttributeValue("a") orelse return error.TestUnexpectedResult;
1449+
try std.testing.expectEqualStrings(strict_empty, fast_empty);
14501450
}
14511451

14521452
test "attribute scanner handles quoted > and self-closing tails" {

0 commit comments

Comments
 (0)