/// Finds `needle` byte in `hay` from `start`, using SIMD where available.
/// Returns the index of the first occurrence at or after `start`, or null.
pub inline fn findByte(hay: []const u8, start: usize, needle: u8) ?usize {
    // Force-inline the specialized scalar-pos scan so the dispatch cost
    // disappears at the callsite.
    return @call(.always_inline, indexOfScalarPos, .{ hay, start, needle });
}
1617
/// Scans from `start` to next `>` while skipping quoted `>` inside attributes.
/// Returns null when no unquoted `>` exists in `hay[start..]`.
pub fn findTagEndRespectQuotes(hay: []const u8, _start: usize) ?TagEnd {
    var start = _start;
    var end = findAny3Dispatch(hay, start) orelse {
        @branchHint(.cold);
        return null;
    };
    // Labeled switch: each quote match re-enters the switch on the next
    // interesting byte ('>', '"', '\''), avoiding a separate loop variable.
    blk: switch (hay[end]) {
        '>' => return finalizeTagEnd(hay, start, end),
        '\'', '"' => |q| {
            // Skip over the quoted attribute value: find the matching close
            // quote, then resume the 3-byte scan after it.
            start = 1 + end;
            start = 1 + (findByte(hay, start, q) orelse {
                @branchHint(.cold);
                return null;
            });
            end = findAny3Dispatch(hay, start) orelse {
                @branchHint(.cold);
                return null;
            };
            continue :blk hay[end];
        },
        // findAny3Dispatch only ever returns positions of '>', '"' or '\''.
        else => unreachable,
    }
    // NOTE(review): `finalizeTagEnd` now receives `start` advanced past the
    // last quoted run, whereas the pre-refactor code passed the caller's
    // original start — confirm finalizeTagEnd only inspects near gt_index.
}
3733
3834inline fn finalizeTagEnd (hay : []const u8 , start : usize , gt_index : usize ) TagEnd {
@@ -74,44 +70,33 @@ inline fn finalizeTagEnd(hay: []const u8, start: usize, gt_index: usize) TagEnd
7470 };
7571}
7672
/// Picks the widest available SIMD path for the 3-byte scan ('>', '"', '\'').
/// Compile-time architecture dispatch keeps a single callsite shape while
/// selecting the fastest available vector width; falls back to scalar.
inline fn findAny3Dispatch(hay: []const u8, start: usize) ?usize {
    if (comptime builtin.cpu.arch == .x86_64 and std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) {
        return findAny3Vec(32, hay, start);
    }
    if (comptime builtin.cpu.arch == .x86_64 and std.Target.x86.featureSetHas(builtin.cpu.features, .sse2)) {
        return findAny3Vec(16, hay, start);
    }
    if (comptime builtin.cpu.arch == .aarch64) {
        return findAny3Vec(16, hay, start);
    }
    return findAny3Scalar(hay, start);
}
10485
/// Scalar fallback: index of the first '>', '"' or '\'' at or after `start`,
/// or null when none exists.
inline fn findAny3Scalar(hay: []const u8, start: usize) ?usize {
    const a = '>';
    const b = '"';
    const c = '\'';
    for (hay[start..], start..) |ch, i| {
        if (ch == a or ch == b or ch == c) return i;
    }
    return null;
}
11395
/// SIMD scan for '>', '"' or '\'' using `lanes`-wide vector compares.
/// Processes full blocks, then hands the tail to the scalar fallback.
inline fn findAny3Vec(comptime lanes: comptime_int, hay: []const u8, start: usize) ?usize {
    const a = '>';
    const b = '"';
    const c = '\'';
    const Vec = @Vector(lanes, u8);
    const a_vec: Vec = @splat(a);
    const b_vec: Vec = @splat(b);
    const c_vec: Vec = @splat(c);

    var i = start;
    while (i + lanes <= hay.len) : (i += lanes) {
        const chunk: [lanes]u8 = hay[i..][0..lanes].*;
        const vec: Vec = chunk;
        const mask = (vec == a_vec) | (vec == b_vec) | (vec == c_vec);
        if (@reduce(.Or, mask)) {
            // A match exists somewhere in this block; locate it by rescanning
            // from the block start (the match is within `lanes` bytes).
            for (hay[i..], i..) |ch, j| {
                if (ch == a or ch == b or ch == c) return j;
            }
            unreachable;
        } else {
            // Most blocks contain no match; tell the optimizer so.
            @branchHint(.likely);
        }
    }
    return findAny3Scalar(hay, i);
}
135121
/// Finds the first occurrence of `value` in `slice[start_index..]`, or null.
/// Specialized u8 variant of `std.mem.indexOfScalarPos` with an unrolled
/// SIMD fast path; falls back to a plain scalar loop (and at comptime).
inline fn indexOfScalarPos(slice: []const u8, start_index: usize, value: u8) ?usize {
    if (start_index >= slice.len) return null;

    var i: usize = start_index;
    if (!@inComptime()) {
        if (std.simd.suggestVectorLength(u8)) |block_len| {
            // For Intel Nehalem (2009) and AMD Bulldozer (2012) or later,
            // unaligned loads on aligned data result in the same execution as
            // aligned loads. We ignore older archs here and don't bother
            // pre-aligning.
            //
            // This may differ for other archs. Arm for example costs a cycle
            // when loading across a cache line, so explicit alignment
            // prologues may be worth exploration.

            // Unrolling by 2 lets us do one bounds check every 2 blocks
            // instead of one per block, which adds up.
            const Block = @Vector(block_len, u8);
            if (i + 2 * block_len < slice.len) {
                const mask: Block = @splat(value);
                while (true) {
                    inline for (0..2) |_| {
                        const block: Block = slice[i..][0..block_len].*;
                        const matches = block == mask;
                        if (@reduce(.Or, matches)) {
                            return i + std.simd.firstTrue(matches).?;
                        }
                        i += block_len;
                    }
                    if (i + 2 * block_len >= slice.len) break;
                }
            }

            // Tail: try one {block_len, block_len / 2}-sized vector check
            // before dropping to the scalar loop.
            inline for (0..2) |j| {
                const block_x_len = block_len / (1 << j);
                comptime if (block_x_len < 4) break;

                const BlockX = @Vector(block_x_len, u8);
                if (i + block_x_len < slice.len) {
                    const mask: BlockX = @splat(value);
                    const block: BlockX = slice[i..][0..block_x_len].*;
                    const matches = block == mask;
                    if (@reduce(.Or, matches)) {
                        return i + std.simd.firstTrue(matches).?;
                    }
                    i += block_x_len;
                }
            }
        }
    }

    // Scalar remainder (and the comptime path).
    for (slice[i..], i..) |c, j| {
        if (c == value) return j;
    }
    return null;
}
155180
156181test "findByte helper matches scalar behavior" {
0 commit comments