From caa7637abb6ff4dd406c049144564e084d3a6c99 Mon Sep 17 00:00:00 2001 From: john xu Date: Sun, 28 Dec 2025 15:25:38 +0000 Subject: [PATCH 1/7] perf: SIMD scan ASCII runs in input loop --- bench/bench.zig | 96 +++++++++++++++++++++++++++++++++++++++++++++++++ src/Loop.zig | 55 ++++++++++++++++++++++++++++ 2 files changed, 151 insertions(+) diff --git a/bench/bench.zig b/bench/bench.zig index 83afc8b4..152e432c 100644 --- a/bench/bench.zig +++ b/bench/bench.zig @@ -20,6 +20,93 @@ fn printResults(writer: anytype, label: []const u8, iterations: usize, elapsed_n ); } +fn asciiPrintableRunLen(input: []const u8) usize { + const VecLenOpt = std.simd.suggestVectorLength(u8); + if (VecLenOpt) |VecLen| { + const Vec = @Vector(VecLen, u8); + const lo: Vec = @splat(0x20); + const hi: Vec = @splat(0x7E); + var i: usize = 0; + while (i + VecLen <= input.len) : (i += VecLen) { + const chunk = @as(*const [VecLen]u8, @ptrCast(input[i..].ptr)).*; + const vec: Vec = chunk; + const ok = (vec >= lo) & (vec <= hi); + if (!@reduce(.And, ok)) { + var j: usize = 0; + while (j < VecLen) : (j += 1) { + const b = input[i + j]; + if (b < 0x20 or b > 0x7E) return i + j; + } + } + } + while (i < input.len) : (i += 1) { + const b = input[i]; + if (b < 0x20 or b > 0x7E) return i; + } + return input.len; + } + + var i: usize = 0; + while (i < input.len) : (i += 1) { + const b = input[i]; + if (b < 0x20 or b > 0x7E) return i; + } + return input.len; +} + +fn benchParseStreamBaseline(writer: anytype, label: []const u8, parser: *vaxis.Parser, input: []const u8, iterations: usize) !void { + var timer = try std.time.Timer.start(); + var i: usize = 0; + while (i < iterations) : (i += 1) { + var idx: usize = 0; + while (idx < input.len) { + const result = try parser.parse(input[idx..], null); + if (result.n == 0) break; + idx += result.n; + std.mem.doNotOptimizeAway(result); + } + std.mem.doNotOptimizeAway(idx); + } + const elapsed_ns = timer.read(); + try printResults(writer, label, iterations, elapsed_ns, input.len * iterations); +} + +fn benchParseStreamSimd(writer: anytype, label: []const u8, parser: *vaxis.Parser, input: []const u8, iterations: usize) !void { + var timer = try std.time.Timer.start(); + var i: usize = 0; + while (i < iterations) : (i += 1) { + var idx: usize = 0; + while (idx < input.len) { + const slice = input[idx..]; + var ascii_len = asciiPrintableRunLen(slice); + if (ascii_len > 0 and ascii_len < slice.len and slice[ascii_len] >= 0x80) { + ascii_len -= 1; + } + if (ascii_len > 0) { + var j: usize = 0; + while (j < ascii_len) : (j += 1) { + const key: vaxis.Key = .{ + .codepoint = slice[j], + .text = slice[j .. j + 1], + }; + const event: vaxis.Event = .{ .key_press = key }; + std.mem.doNotOptimizeAway(event); + } + idx += ascii_len; + continue; + } + + const result = try parser.parse(slice, null); + if (result.n == 0) break; + idx += result.n; + std.mem.doNotOptimizeAway(result); + } + std.mem.doNotOptimizeAway(idx); + } + const elapsed_ns = timer.read(); + try printResults(writer, label, iterations, elapsed_ns, input.len * iterations); +} + pub fn main() !void { var gpa = std.heap.GeneralPurposeAllocator(.{}){}; defer _ = gpa.deinit(); @@ -59,4 +146,13 @@ pub fn main() !void { const dirty_ns = timer.read(); const dirty_bytes: usize = dirty_writer.writer.end; try printResults(stdout, "dirty", iterations, dirty_ns, dirty_bytes); + + var parser: vaxis.Parser = .{}; + const mixed_stream = "The quick brown fox jumps over the lazy dog " ++ + "1234567890 !@#$%^&*() " ++ + "\x1b[A" ++ + "世界 1️⃣ 👩‍🚀!" ++ + "\r"; + try benchParseStreamBaseline(stdout, "parse_stream_loop_baseline", &parser, mixed_stream, iterations); + try benchParseStreamSimd(stdout, "parse_stream_loop_simd", &parser, mixed_stream, iterations); } diff --git a/src/Loop.zig b/src/Loop.zig index a11e2f79..f4d3f8d6 100644 --- a/src/Loop.zig +++ b/src/Loop.zig @@ -10,6 +10,40 @@ const Vaxis = @import("Vaxis.zig"); const log = std.log.scoped(.vaxis); +fn asciiPrintableRunLen(input: []const u8) usize { + const VecLenOpt = std.simd.suggestVectorLength(u8); + if (VecLenOpt) |VecLen| { + const Vec = @Vector(VecLen, u8); + const lo: Vec = @splat(0x20); + const hi: Vec = @splat(0x7E); + var i: usize = 0; + while (i + VecLen <= input.len) : (i += VecLen) { + const chunk = @as(*const [VecLen]u8, @ptrCast(input[i..].ptr)).*; + const vec: Vec = chunk; + const ok = (vec >= lo) & (vec <= hi); + if (!@reduce(.And, ok)) { + var j: usize = 0; + while (j < VecLen) : (j += 1) { + const b = input[i + j]; + if (b < 0x20 or b > 0x7E) return i + j; + } + } + } + while (i < input.len) : (i += 1) { + const b = input[i]; + if (b < 0x20 or b > 0x7E) return i; + } + return input.len; + } + + var i: usize = 0; + while (i < input.len) : (i += 1) { + const b = input[i]; + if (b < 0x20 or b > 0x7E) return i; + } + return input.len; +} + pub fn Loop(comptime T: type) type { return struct { const Self = @This(); @@ -137,6 +171,27 @@ pub fn Loop(comptime T: type) type { const n = try self.tty.read(buf[read_start..]); var seq_start: usize = 0; while (seq_start < n) { + if (@hasField(Event, "key_press")) { + const input = buf[seq_start..n]; + var ascii_len = asciiPrintableRunLen(input); + if (ascii_len > 0 and ascii_len < input.len and input[ascii_len] >= 0x80) { + ascii_len -= 1; + } + if (ascii_len > 0) { + var i: usize = 0; + while (i < ascii_len) : (i += 1) { + const key: vaxis.Key = .{ + .codepoint = input[i], + .text = input[i .. i + 1], + }; + const event: Event = .{ .key_press = key }; + try handleEventGeneric(self, self.vaxis, &cache, Event, event, paste_allocator); + } + read_start = 0; + seq_start += ascii_len; + continue; + } + } const result = try parser.parse(buf[seq_start..n], paste_allocator); if (result.n == 0) { // copy the read to the beginning. We don't use memcpy because From 72ae4970ee7e79aebdf515a21e30dd9432f2eb50 Mon Sep 17 00:00:00 2001 From: john xu Date: Sun, 28 Dec 2025 15:33:23 +0000 Subject: [PATCH 2/7] refactor: share ASCII run scanner --- bench/bench.zig | 37 ++--------------------------------- src/Loop.zig | 37 ++--------------------------------- src/ascii.zig | 51 +++++++++++++++++++++++++++++++++++++++++++++++++ src/main.zig | 1 + 4 files changed, 56 insertions(+), 70 deletions(-) create mode 100644 src/ascii.zig diff --git a/bench/bench.zig b/bench/bench.zig index 152e432c..350483df 100644 --- a/bench/bench.zig +++ b/bench/bench.zig @@ -1,5 +1,6 @@ const std = @import("std"); const vaxis = @import("vaxis"); +const ascii = vaxis.ascii; fn parseIterations(allocator: std.mem.Allocator) !usize { var args = try std.process.argsWithAllocator(allocator); @@ -20,40 +21,6 @@ fn printResults(writer: anytype, label: []const u8, iterations: usize, elapsed_n ); } -fn asciiPrintableRunLen(input: []const u8) usize { - const VecLenOpt = std.simd.suggestVectorLength(u8); - if (VecLenOpt) |VecLen| { - const Vec = @Vector(VecLen, u8); - const lo: Vec = @splat(0x20); - const hi: Vec = @splat(0x7E); - var i: usize = 0; - while (i + VecLen <= input.len) : (i += VecLen) { - const chunk = @as(*const [VecLen]u8, @ptrCast(input[i..].ptr)).*; - const vec: Vec = chunk; - const ok = (vec >= lo) & (vec <= hi); - if (!@reduce(.And, ok)) { - var j: usize = 0; - while (j < VecLen) : (j += 1) { - const b = input[i + j]; - if (b < 0x20 or b > 0x7E) return i + j; - } - } - } - while (i < input.len) : (i += 1) { - const b = input[i]; - if (b < 0x20 or b > 0x7E) return i; - } - return input.len; - } - - var i: usize = 0; - while (i < input.len) : (i += 1) { - const b = input[i]; - if (b < 0x20 or b > 0x7E) return i; - } - return input.len; -} - fn benchParseStreamBaseline(writer: anytype, label: []const u8, parser: *vaxis.Parser, input: []const u8, iterations: usize) !void { var timer = try std.time.Timer.start(); var i: usize = 0; @@ -78,7 +45,7 @@ fn benchParseStreamSimd(writer: anytype, label: []const u8, parser: *vaxis.Parse var idx: usize = 0; while (idx < input.len) { const slice = input[idx..]; - var ascii_len = asciiPrintableRunLen(slice); + var ascii_len = ascii.printableRunLen(slice); if (ascii_len > 0 and ascii_len < slice.len and slice[ascii_len] >= 0x80) { ascii_len -= 1; } diff --git a/src/Loop.zig b/src/Loop.zig index f4d3f8d6..9118d369 100644 --- a/src/Loop.zig +++ b/src/Loop.zig @@ -5,45 +5,12 @@ const GraphemeCache = @import("GraphemeCache.zig"); const Parser = @import("Parser.zig"); const Queue = @import("queue.zig").Queue; const vaxis = @import("main.zig"); +const ascii = @import("ascii.zig"); const Tty = vaxis.Tty; const Vaxis = @import("Vaxis.zig"); const log = std.log.scoped(.vaxis); -fn asciiPrintableRunLen(input: []const u8) usize { - const VecLenOpt = std.simd.suggestVectorLength(u8); - if (VecLenOpt) |VecLen| { - const Vec = @Vector(VecLen, u8); - const lo: Vec = @splat(0x20); - const hi: Vec = @splat(0x7E); - var i: usize = 0; - while (i + VecLen <= input.len) : (i += VecLen) { - const chunk = @as(*const [VecLen]u8, @ptrCast(input[i..].ptr)).*; - const vec: Vec = chunk; - const ok = (vec >= lo) & (vec <= hi); - if (!@reduce(.And, ok)) { - var j: usize = 0; - while (j < VecLen) : (j += 1) { - const b = input[i + j]; - if (b < 0x20 or b > 0x7E) return i + j; - } - } - } - while (i < input.len) : (i += 1) { - const b = input[i]; - if (b < 0x20 or b > 0x7E) return i; - } - return input.len; - } - - var i: usize = 0; - while (i < input.len) : (i += 1) { - const b = input[i]; - if (b < 0x20 or b > 0x7E) return i; - } - return input.len; -} - pub fn Loop(comptime T: type) type { return struct { const Self = @This(); @@ -173,7 +140,7 @@ pub fn Loop(comptime T: type) type { while (seq_start < n) { if (@hasField(Event, "key_press")) { const input = buf[seq_start..n]; - var ascii_len = asciiPrintableRunLen(input); + var ascii_len = ascii.printableRunLen(input); if (ascii_len > 0 and ascii_len < input.len and input[ascii_len] >= 0x80) { ascii_len -= 1; } diff --git a/src/ascii.zig b/src/ascii.zig new file mode 100644 index 00000000..a8150698 --- /dev/null +++ b/src/ascii.zig @@ -0,0 +1,51 @@ +const std = @import("std"); + +pub fn printableRunLen(input: []const u8) usize { + const VecLenOpt = std.simd.suggestVectorLength(u8); + if (VecLenOpt) |VecLen| { + const Vec = @Vector(VecLen, u8); + const lo: Vec = @splat(0x20); + const hi: Vec = @splat(0x7E); + var i: usize = 0; + while (i + VecLen <= input.len) : (i += VecLen) { + const chunk = @as(*const [VecLen]u8, @ptrCast(input[i..].ptr)).*; + const vec: Vec = chunk; + const ok = (vec >= lo) & (vec <= hi); + if (!@reduce(.And, ok)) { + var j: usize = 0; + while (j < VecLen) : (j += 1) { + const b = input[i + j]; + if (b < 0x20 or b > 0x7E) return i + j; + } + } + } + while (i < input.len) : (i += 1) { + const b = input[i]; + if (b < 0x20 or b > 0x7E) return i; + } + return input.len; + } + + var i: usize = 0; + while (i < input.len) : (i += 1) { + const b = input[i]; + if (b < 0x20 or b > 0x7E) return i; + } + return input.len; +} + +test "printableRunLen: empty" { + try std.testing.expectEqual(@as(usize, 0), printableRunLen("")); +} + +test "printableRunLen: ascii run" { + try std.testing.expectEqual(@as(usize, 4), printableRunLen("abcd")); +} + +test "printableRunLen: stops at control" { + try std.testing.expectEqual(@as(usize, 1), printableRunLen("a\nb")); +} + +test "printableRunLen: stops at utf8" { + try std.testing.expectEqual(@as(usize, 5), printableRunLen("hello世界")); +} diff --git a/src/main.zig b/src/main.zig index 644626ee..df623331 100644 --- a/src/main.zig +++ b/src/main.zig @@ -29,6 +29,7 @@ pub const ctlseqs = @import("ctlseqs.zig"); pub const GraphemeCache = @import("GraphemeCache.zig"); pub const Event = @import("event.zig").Event; pub const unicode = @import("unicode.zig"); +pub const ascii = @import("ascii.zig"); pub const vxfw = @import("vxfw/vxfw.zig"); From 6da8289fcce45a1d5eee0f71935e7ca687eba383 Mon Sep 17 00:00:00 2001 From: john xu Date: Sun, 28 Dec 2025 15:37:43 +0000 Subject: [PATCH 3/7] fix: refine ASCII run boundary handling --- bench/bench.zig | 5 +---- src/Loop.zig | 5 +---- src/ascii.zig | 40 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 8 deletions(-) diff --git a/bench/bench.zig b/bench/bench.zig index 350483df..7176631b 100644 --- a/bench/bench.zig +++ b/bench/bench.zig @@ -45,10 +45,7 @@ fn benchParseStreamSimd(writer: anytype, label: []const u8, parser: *vaxis.Parse var idx: usize = 0; while (idx < input.len) { const slice = input[idx..]; - var ascii_len = ascii.printableRunLen(slice); - if (ascii_len > 0 and ascii_len < slice.len and slice[ascii_len] >= 0x80) { - ascii_len -= 1; - } + const ascii_len = ascii.fastPathLen(slice); if (ascii_len > 0) { var j: usize = 0; while (j < ascii_len) : (j += 1) { diff --git a/src/Loop.zig b/src/Loop.zig index 9118d369..5d0dd473 100644 --- a/src/Loop.zig +++ b/src/Loop.zig @@ -140,10 +140,7 @@ pub fn Loop(comptime T: type) type { while (seq_start < n) { if (@hasField(Event, "key_press")) { const input = buf[seq_start..n]; - var ascii_len = ascii.printableRunLen(input); - if (ascii_len > 0 and ascii_len < input.len and input[ascii_len] >= 0x80) { - ascii_len -= 1; - } + const ascii_len = ascii.fastPathLen(input); if (ascii_len > 0) { var i: usize = 0; while (i < ascii_len) : (i += 1) { diff --git a/src/ascii.zig b/src/ascii.zig index a8150698..253999db 100644 --- a/src/ascii.zig +++ b/src/ascii.zig @@ -1,4 +1,5 @@ const std = @import("std"); +const uucode = @import("uucode"); pub fn printableRunLen(input: []const u8) usize { const VecLenOpt = std.simd.suggestVectorLength(u8); @@ -34,6 +35,29 @@ pub fn printableRunLen(input: []const u8) usize { return input.len; } +pub fn fastPathLen(input: []const u8) usize { + const run = printableRunLen(input); + if (run == 0) return 0; + if (run < input.len) { + const next = input[run..]; + const first = next[0]; + if (first >= 0x80) { + const seq_len = std.unicode.utf8ByteSequenceLength(first) catch return run; + if (next.len < seq_len) return run - 1; + const cp = std.unicode.utf8Decode(next[0..seq_len]) catch return run; + const gc = uucode.get(.general_category, cp); + switch (gc) { + .mark_nonspacing, + .mark_spacing_combining, + .mark_enclosing, + => return run - 1, + else => {}, + } + } + } + return run; +} + test "printableRunLen: empty" { try std.testing.expectEqual(@as(usize, 0), printableRunLen("")); } @@ -49,3 +73,19 @@ test "printableRunLen: stops at control" { test "printableRunLen: stops at utf8" { try std.testing.expectEqual(@as(usize, 5), printableRunLen("hello世界")); } + +test "fastPathLen: keeps ascii before utf8" { + try std.testing.expectEqual(@as(usize, 5), fastPathLen("hello世界")); +} + +test "fastPathLen: holds for combining mark" { + try std.testing.expectEqual(@as(usize, 0), fastPathLen("a\u{0301}")); +} + +test "fastPathLen: holds for keycap" { + try std.testing.expectEqual(@as(usize, 0), fastPathLen("1\u{20E3}")); +} + +test "fastPathLen: holds for incomplete utf8" { + try std.testing.expectEqual(@as(usize, 0), fastPathLen("a\xE2")); +} From 109f2912a1e351757737a3ad3107eb13c2100e98 Mon Sep 17 00:00:00 2001 From: john xu Date: Sun, 28 Dec 2025 15:56:42 +0000 Subject: [PATCH 4/7] bench: isolate parser instances --- bench/bench.zig | 7 ++++--- src/ascii.zig | 7 +++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/bench/bench.zig b/bench/bench.zig index 7176631b..7433586f 100644 --- a/bench/bench.zig +++ b/bench/bench.zig @@ -111,12 +111,13 @@ pub fn main() !void { const dirty_bytes: usize = dirty_writer.writer.end; try printResults(stdout, "dirty", iterations, dirty_ns, dirty_bytes); - var parser: vaxis.Parser = .{}; + var parser_baseline: vaxis.Parser = .{}; + var parser_simd: vaxis.Parser = .{}; const mixed_stream = "The quick brown fox jumps over the lazy dog " ++ "1234567890 !@#$%^&*() " ++ "\x1b[A" ++ "世界 1️⃣ 👩‍🚀!" ++ "\r"; - try benchParseStreamBaseline(stdout, "parse_stream_loop_baseline", &parser, mixed_stream, iterations); - try benchParseStreamSimd(stdout, "parse_stream_loop_simd", &parser, mixed_stream, iterations); + try benchParseStreamBaseline(stdout, "parse_stream_loop_baseline", &parser_baseline, mixed_stream, iterations); + try benchParseStreamSimd(stdout, "parse_stream_loop_simd", &parser_simd, mixed_stream, iterations); } diff --git a/src/ascii.zig b/src/ascii.zig index 253999db..d7a930f9 100644 --- a/src/ascii.zig +++ b/src/ascii.zig @@ -1,6 +1,7 @@ const std = @import("std"); const uucode = @import("uucode"); +/// Returns the length of a contiguous run of printable ASCII bytes (0x20..0x7E). pub fn printableRunLen(input: []const u8) usize { const VecLenOpt = std.simd.suggestVectorLength(u8); if (VecLenOpt) |VecLen| { @@ -35,6 +36,12 @@ pub fn printableRunLen(input: []const u8) usize { return input.len; } +/// Returns the safe fast-path length for ASCII runs. +/// +/// This behaves like printableRunLen, but if the next codepoint is a combining +/// mark (Mn/Mc/Me, including keycaps/variation selectors), it leaves the last +/// ASCII byte for the parser to avoid breaking grapheme clusters. If the +/// following UTF-8 sequence is incomplete, it also leaves the last ASCII byte. pub fn fastPathLen(input: []const u8) usize { const run = printableRunLen(input); if (run == 0) return 0; From 4a06cd62dadbbcc3ea1ab8e77ffbe3a0a30e15de Mon Sep 17 00:00:00 2001 From: john xu Date: Sun, 28 Dec 2025 16:18:42 +0000 Subject: [PATCH 5/7] fix: include buffered bytes in read loop --- src/Loop.zig | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/Loop.zig b/src/Loop.zig index 5d0dd473..982631b1 100644 --- a/src/Loop.zig +++ b/src/Loop.zig @@ -136,10 +136,11 @@ pub fn Loop(comptime T: type) type { // read loop read_loop: while (!self.should_quit) { const n = try self.tty.read(buf[read_start..]); + const total = read_start + n; var seq_start: usize = 0; - while (seq_start < n) { + while (seq_start < total) { if (@hasField(Event, "key_press")) { - const input = buf[seq_start..n]; + const input = buf[seq_start..total]; const ascii_len = ascii.fastPathLen(input); if (ascii_len > 0) { var i: usize = 0; @@ -156,15 +157,15 @@ pub fn Loop(comptime T: type) type { continue; } } - const result = try parser.parse(buf[seq_start..n], paste_allocator); + const result = try parser.parse(buf[seq_start..total], paste_allocator); if (result.n == 0) { // copy the read to the beginning. We don't use memcpy because // this could be overlapping, and it's also rare const initial_start = seq_start; - while (seq_start < n) : (seq_start += 1) { + while (seq_start < total) : (seq_start += 1) { buf[seq_start - initial_start] = buf[seq_start]; } - read_start = seq_start - initial_start + 1; + read_start = total - initial_start; continue :read_loop; } read_start = 0; From c222cf5551a55035a8f7d6f624ed82b86a4a7e9e Mon Sep 17 00:00:00 2001 From: john xu Date: Sun, 28 Dec 2025 16:37:03 +0000 Subject: [PATCH 6/7] test: cover ASCII boundary and incomplete UTF-8 cases --- src/ascii.zig | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/ascii.zig b/src/ascii.zig index d7a930f9..ce04da1e 100644 --- a/src/ascii.zig +++ b/src/ascii.zig @@ -81,6 +81,10 @@ test "printableRunLen: stops at utf8" { try std.testing.expectEqual(@as(usize, 5), printableRunLen("hello世界")); } +test "printableRunLen: includes space and tilde" { + try std.testing.expectEqual(@as(usize, 2), printableRunLen(" ~")); +} + test "fastPathLen: keeps ascii before utf8" { try std.testing.expectEqual(@as(usize, 5), fastPathLen("hello世界")); } @@ -96,3 +100,7 @@ test "fastPathLen: holds for keycap" { test "fastPathLen: holds for incomplete utf8" { try std.testing.expectEqual(@as(usize, 0), fastPathLen("a\xE2")); } + +test "fastPathLen: leaves last ascii for incomplete utf8 after run" { + try std.testing.expectEqual(@as(usize, 2), fastPathLen("abc\xE2")); +} From d5d295bdfbc7657740006e35461eb87a560dd84d Mon Sep 17 00:00:00 2001 From: john xu Date: Sun, 28 Dec 2025 16:53:54 +0000 Subject: [PATCH 7/7] fix: treat variation selectors as combining --- src/ascii.zig | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/ascii.zig b/src/ascii.zig index ce04da1e..a583953e 100644 --- a/src/ascii.zig +++ b/src/ascii.zig @@ -39,9 +39,10 @@ pub fn printableRunLen(input: []const u8) usize { /// Returns the safe fast-path length for ASCII runs. /// /// This behaves like printableRunLen, but if the next codepoint is a combining -/// mark (Mn/Mc/Me, including keycaps/variation selectors), it leaves the last -/// ASCII byte for the parser to avoid breaking grapheme clusters. If the -/// following UTF-8 sequence is incomplete, it also leaves the last ASCII byte. +/// mark (Mn/Mc/Me) or a variation selector (U+FE00..U+FE0F, U+E0100..U+E01EF), +/// it leaves the last ASCII byte for the parser to avoid breaking grapheme +/// clusters. If the following UTF-8 sequence is incomplete, it also leaves the +/// last ASCII byte. pub fn fastPathLen(input: []const u8) usize { const run = printableRunLen(input); if (run == 0) return 0; @@ -52,6 +53,7 @@ pub fn fastPathLen(input: []const u8) usize { const seq_len = std.unicode.utf8ByteSequenceLength(first) catch return run; if (next.len < seq_len) return run - 1; const cp = std.unicode.utf8Decode(next[0..seq_len]) catch return run; + if (isVariationSelector(cp)) return run - 1; const gc = uucode.get(.general_category, cp); switch (gc) { .mark_nonspacing, @@ -65,6 +67,10 @@ pub fn fastPathLen(input: []const u8) usize { return run; } +fn isVariationSelector(cp: u21) bool { + return (cp >= 0xFE00 and cp <= 0xFE0F) or (cp >= 0xE0100 and cp <= 0xE01EF); +} + test "printableRunLen: empty" { try std.testing.expectEqual(@as(usize, 0), printableRunLen("")); } @@ -97,6 +103,10 @@ test "fastPathLen: holds for keycap" { try std.testing.expectEqual(@as(usize, 0), fastPathLen("1\u{20E3}")); } +test "fastPathLen: holds for variation selector" { + try std.testing.expectEqual(@as(usize, 0), fastPathLen("a\u{FE0F}")); +} + test "fastPathLen: holds for incomplete utf8" { try std.testing.expectEqual(@as(usize, 0), fastPathLen("a\xE2")); }