From 45878105be78c8394b2cbd428bbbe2049640029d Mon Sep 17 00:00:00 2001 From: Yorick Terweijden Date: Wed, 4 Dec 2024 21:56:37 +0100 Subject: [PATCH 1/2] pre-compile regexp2 with regexp2cg --- codec/cl100k_base.go | 5 +- codec/o200k_base.go | 18 +- codec/p50k_base.go | 5 +- codec/p50k_edit.go | 5 +- codec/r50k_base.go | 5 +- codec/regexp.gen.go | 1555 ++++++++++++++++++++++++++++++++++++++++++ go.mod | 11 +- go.sum | 6 + tokenizer_test.go | 285 +++----- 9 files changed, 1696 insertions(+), 199 deletions(-) create mode 100644 codec/regexp.gen.go diff --git a/codec/cl100k_base.go b/codec/cl100k_base.go index db573f7..592fb08 100644 --- a/codec/cl100k_base.go +++ b/codec/cl100k_base.go @@ -4,10 +4,13 @@ import "github.com/dlclark/regexp2" func NewCl100kBase() *Codec { cl100kBaseVocabOnce.Do(cl100kBaseVocabInit) + + splitRegexp := regexp2.MustCompile(`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`, regexp2.None) + return &Codec{ name: "cl100k_base", vocabulary: cl100kBaseVocab, - splitRegexp: regexp2.MustCompile(`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`, regexp2.None), + splitRegexp: splitRegexp, specialTokens: map[string]uint{ "<|endoftext|>": 100257, "<|fim_prefix|>": 100258, diff --git a/codec/o200k_base.go b/codec/o200k_base.go index 81568bf..825bba8 100644 --- a/codec/o200k_base.go +++ b/codec/o200k_base.go @@ -4,17 +4,15 @@ import "github.com/dlclark/regexp2" func NewO200kBase() *Codec { o200kBaseVocabOnce.Do(o200kBaseVocabInit) + + splitRegexp := regexp2.MustCompile( + `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`, + regexp2.None) + return &Codec{ - name: "o200k_base", - vocabulary: o200kBaseVocab, - splitRegexp: regexp2.MustCompile( - `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|`+ - `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|`+ - `\p{N}{1,3}|`+` ?[^\s\p{L}\p{N}]+[\r\n/]*|`+ - `\s*[\r\n]+|`+ - `\s+(?!\S)|`+ - `\s+`, - regexp2.None), + name: "o200k_base", + vocabulary: o200kBaseVocab, + splitRegexp: splitRegexp, specialTokens: map[string]uint{ "<|endoftext|>": 199999, "<|endofprompt|>": 200018, diff --git a/codec/p50k_base.go b/codec/p50k_base.go index 4b64a4b..453ef5a 100644 --- a/codec/p50k_base.go +++ b/codec/p50k_base.go @@ -4,10 +4,13 @@ import "github.com/dlclark/regexp2" func NewP50kBase() *Codec { p50kBaseVocabOnce.Do(p50kBaseVocabInit) + + splitRegexp := regexp2.MustCompile(`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`, regexp2.None) + return &Codec{ name: "p50k_base", vocabulary: p50kBaseVocab, - splitRegexp: regexp2.MustCompile(`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`, regexp2.None), + splitRegexp: splitRegexp, specialTokens: map[string]uint{ "<|endoftext|>": 50256, }, diff --git a/codec/p50k_edit.go b/codec/p50k_edit.go index 7a05aa8..1d1fd1c 100644 --- a/codec/p50k_edit.go +++ b/codec/p50k_edit.go @@ -4,10 +4,13 @@ import "github.com/dlclark/regexp2" func NewP50kEdit() *Codec { p50kBaseVocabOnce.Do(p50kBaseVocabInit) + + splitRegexp := regexp2.MustCompile(`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`, regexp2.None) + return &Codec{ name: "p50k_edit", vocabulary: p50kBaseVocab, - splitRegexp: regexp2.MustCompile(`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`, regexp2.None), + splitRegexp: splitRegexp, specialTokens: map[string]uint{ "<|endoftext|>": 50256, "<|fim_prefix|>": 50281, diff --git a/codec/r50k_base.go b/codec/r50k_base.go index 3c99721..4c792f8 100644 --- a/codec/r50k_base.go +++ b/codec/r50k_base.go @@ -4,10 +4,13 @@ import "github.com/dlclark/regexp2" func NewR50kBase() *Codec { r50kBaseVocabOnce.Do(r50kBaseVocabInit) + + splitRegexp := regexp2.MustCompile(`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`, regexp2.None) + return &Codec{ name: "r50k_base", vocabulary: r50kBaseVocab, - splitRegexp: regexp2.MustCompile(`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`, regexp2.None), + splitRegexp: splitRegexp, specialTokens: map[string]uint{ "<|endoftext|>": 50256, }, diff --git a/codec/regexp.gen.go b/codec/regexp.gen.go new file mode 100644 index 0000000..3b00a59 --- /dev/null +++ b/codec/regexp.gen.go @@ -0,0 +1,1555 @@ +package codec + +import ( + "github.com/dlclark/regexp2" + "github.com/dlclark/regexp2/helpers" + "github.com/dlclark/regexp2/syntax" + "unicode" +) + +/* +Capture(index = 0, unindex = -1) + Atomic + Alternate + Concatenate + One(Ch = ') + Atomic + Alternate + Set(Set = [r-t]) + Multi(String = "re") + Multi(String = "ve") + One(Ch = m) + Multi(String = "ll") + One(Ch = d) + Concatenate + OneloopAtomic(Ch = \ )(Min = 0, Max = 1) + SetloopAtomic(Set = [\p{L}])(Min = 1, Max = inf) + Concatenate + OneloopAtomic(Ch = \ )(Min = 0, Max = 1) + SetloopAtomic(Set = [\p{N}])(Min = 1, Max = inf) + Concatenate + OneloopAtomic(Ch = \ )(Min = 0, Max = 1) + SetloopAtomic(Set = [^\s\p{L}\p{N}])(Min = 1, Max = inf) + Concatenate + Setloop(Set = [\s])(Min = 1, Max = inf) + NegLook + Set(Set = [^\s]) + SetloopAtomic(Set = [\s])(Min = 1, Max = inf) +*/ +// From p50k_base.go:8:37 +// Pattern: "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+" +// Options: regexp2.None +type splitRegexp_Engine struct{} + +func (splitRegexp_Engine) Caps() map[int]int { return nil } +func (splitRegexp_Engine) CapNames() map[string]int { return nil } +func (splitRegexp_Engine) CapsList() []string { return nil } +func (splitRegexp_Engine) CapSize() int { return 1 } + +func (splitRegexp_Engine) FindFirstChar(r *regexp2.Runner) bool { + pos := r.Runtextpos + // Empty matches aren't possible + if pos < len(r.Runtext) { + return true + } + + // No match found + r.Runtextpos = len(r.Runtext) + return false +} + +func (splitRegexp_Engine) Execute(r *regexp2.Runner) error { + atomic_stackpos := 0 + alternation_starting_pos := 0 + alternation_starting_pos1 := 0 + iteration := 0 + iteration1 := 0 + iteration2 := 0 + var charloop_starting_pos, charloop_ending_pos = 0, 0 + iteration3 := 0 + negativelookahead_starting_pos := 0 + iteration4 := 0 + pos := r.Runtextpos + matchStart := pos + + var slice = r.Runtext[pos:] + + // Node: Atomic + // Atomic group. + atomic_stackpos = r.Runstackpos + + // Node: Alternate + // Match with 6 alternative expressions, atomically. + alternation_starting_pos = pos + + // Branch 0 + // Node: Concatenate + // Node: One(Ch = ') + // Match '\”. + if len(slice) == 0 || slice[0] != '\'' { + goto AlternationBranch + } + + // Node: Atomic + // Node: Alternate + // Match with 6 alternative expressions, atomically. + alternation_starting_pos1 = pos + + // Branch 0 + // Node: Set(Set = [r-t]) + // Match [r-t]. + if len(slice) < 2 || !helpers.IsBetween(slice[1], 'r', 't') { + goto AlternationBranch1 + } + + pos += 2 + slice = r.Runtext[pos:] + goto AlternationMatch1 + +AlternationBranch1: + pos = alternation_starting_pos1 + slice = r.Runtext[pos:] + + // Branch 1 + // Node: Multi(String = "re") + // Match the string "re". + if !helpers.StartsWith(slice[1:], []rune("re")) { + goto AlternationBranch2 + } + + pos += 3 + slice = r.Runtext[pos:] + goto AlternationMatch1 + +AlternationBranch2: + pos = alternation_starting_pos1 + slice = r.Runtext[pos:] + + // Branch 2 + // Node: Multi(String = "ve") + // Match the string "ve". + if !helpers.StartsWith(slice[1:], []rune("ve")) { + goto AlternationBranch3 + } + + pos += 3 + slice = r.Runtext[pos:] + goto AlternationMatch1 + +AlternationBranch3: + pos = alternation_starting_pos1 + slice = r.Runtext[pos:] + + // Branch 3 + // Node: One(Ch = m) + // Match 'm'. + if len(slice) < 2 || slice[1] != 'm' { + goto AlternationBranch4 + } + + pos += 2 + slice = r.Runtext[pos:] + goto AlternationMatch1 + +AlternationBranch4: + pos = alternation_starting_pos1 + slice = r.Runtext[pos:] + + // Branch 4 + // Node: Multi(String = "ll") + // Match the string "ll". + if !helpers.StartsWith(slice[1:], []rune("ll")) { + goto AlternationBranch5 + } + + pos += 3 + slice = r.Runtext[pos:] + goto AlternationMatch1 + +AlternationBranch5: + pos = alternation_starting_pos1 + slice = r.Runtext[pos:] + + // Branch 5 + // Node: One(Ch = d) + // Match 'd'. + if len(slice) < 2 || slice[1] != 'd' { + goto AlternationBranch + } + + pos += 2 + slice = r.Runtext[pos:] + +AlternationMatch1: + ; + + goto AlternationMatch + +AlternationBranch: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 1 + // Node: Concatenate + // Node: OneloopAtomic(Ch = \ )(Min = 0, Max = 1) + // Match ' ' atomically, optionally. + if len(slice) > 0 && slice[0] == ' ' { + slice = slice[1:] + pos++ + } + + // Node: SetloopAtomic(Set = [\p{L}])(Min = 1, Max = inf) + // Match [\p{L}] atomically at least once. + iteration = 0 + for iteration < len(slice) && unicode.In(slice[iteration], unicode.L) { + iteration++ + } + + if iteration == 0 { + goto AlternationBranch6 + } + + slice = slice[iteration:] + pos += iteration + + goto AlternationMatch + +AlternationBranch6: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 2 + // Node: Concatenate + // Node: OneloopAtomic(Ch = \ )(Min = 0, Max = 1) + // Match ' ' atomically, optionally. + if len(slice) > 0 && slice[0] == ' ' { + slice = slice[1:] + pos++ + } + + // Node: SetloopAtomic(Set = [\p{N}])(Min = 1, Max = inf) + // Match [\p{N}] atomically at least once. + iteration1 = 0 + for iteration1 < len(slice) && unicode.In(slice[iteration1], unicode.N) { + iteration1++ + } + + if iteration1 == 0 { + goto AlternationBranch7 + } + + slice = slice[iteration1:] + pos += iteration1 + + goto AlternationMatch + +AlternationBranch7: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 3 + // Node: Concatenate + // Node: OneloopAtomic(Ch = \ )(Min = 0, Max = 1) + // Match ' ' atomically, optionally. + if len(slice) > 0 && slice[0] == ' ' { + slice = slice[1:] + pos++ + } + + // Node: SetloopAtomic(Set = [^\s\p{L}\p{N}])(Min = 1, Max = inf) + // Match [^\s\p{L}\p{N}] atomically at least once. + iteration2 = 0 + for iteration2 < len(slice) && set_93b362dc942d41f83c0905ca6229c31c41f863d4ab4d2d9e17dae21b5b922113.CharIn(slice[iteration2]) { + iteration2++ + } + + if iteration2 == 0 { + goto AlternationBranch8 + } + + slice = slice[iteration2:] + pos += iteration2 + + goto AlternationMatch + +AlternationBranch8: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 4 + // Node: Concatenate + // Node: Setloop(Set = [\s])(Min = 1, Max = inf) + // Match [\s] greedily at least once. + charloop_starting_pos = pos + + iteration3 = 0 + for iteration3 < len(slice) && unicode.IsSpace(slice[iteration3]) { + iteration3++ + } + + if iteration3 == 0 { + goto AlternationBranch9 + } + + slice = slice[iteration3:] + pos += iteration3 + + charloop_ending_pos = pos + charloop_starting_pos++ + goto CharLoopEnd + +CharLoopBacktrack: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos >= charloop_ending_pos { + goto AlternationBranch9 + } + charloop_ending_pos-- + pos = charloop_ending_pos + slice = r.Runtext[pos:] + +CharLoopEnd: + + // Node: NegLook + // Zero-width negative lookahead + negativelookahead_starting_pos = pos + + if err := r.CheckTimeout(); err != nil { + return err + } + // Node: Set(Set = [^\s]) + // Match [^\s]. + if len(slice) == 0 || unicode.IsSpace(slice[0]) { + goto NegativeLookaroundMatch + } + + goto CharLoopBacktrack + +NegativeLookaroundMatch: + pos = negativelookahead_starting_pos + slice = r.Runtext[pos:] + + goto AlternationMatch + +AlternationBranch9: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 5 + // Node: SetloopAtomic(Set = [\s])(Min = 1, Max = inf) + // Match [\s] atomically at least once. + iteration4 = 0 + for iteration4 < len(slice) && unicode.IsSpace(slice[iteration4]) { + iteration4++ + } + + if iteration4 == 0 { + return nil // The input didn't match. + } + + slice = slice[iteration4:] + pos += iteration4 + +AlternationMatch: + ; + + r.Runstackpos = atomic_stackpos + + // The input matched. + r.Runtextpos = pos + r.Capture(0, matchStart, pos) + // just to prevent an unused var error in certain regex's + var _ = slice + return nil +} + +/* +Capture(index = 0, unindex = -1) + Atomic + Alternate + Concatenate + One(Ch = ') + Atomic + Alternate + Set(Set = [R-Tr-tſ]) + Concatenate + Set(Set = [Rr]) + Set(Set = [Ee]) + Concatenate + Set(Set = [Vv]) + Set(Set = [Ee]) + Set(Set = [Mm]) + SetloopAtomic(Set = [Ll])(Min = 2, Max = 2) + Set(Set = [Dd]) + Concatenate + Setloop(Set = [^\n\r\p{L}\p{N}])(Min = 0, Max = 1) + SetloopAtomic(Set = [\p{L}])(Min = 1, Max = inf) + SetloopAtomic(Set = [\p{N}])(Min = 1, Max = 3) + Concatenate + OneloopAtomic(Ch = \ )(Min = 0, Max = 1) + SetloopAtomic(Set = [^\s\p{L}\p{N}])(Min = 1, Max = inf) + SetloopAtomic(Set = [\n\r])(Min = 0, Max = inf) + Concatenate + Setloop(Set = [\s])(Min = 0, Max = inf) + SetloopAtomic(Set = [\n\r])(Min = 1, Max = inf) + Concatenate + Setloop(Set = [\s])(Min = 1, Max = inf) + NegLook + Set(Set = [^\s]) + SetloopAtomic(Set = [\s])(Min = 1, Max = inf) +*/ +// From cl100k_base.go:8:37 +// Pattern: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" +// Options: regexp2.None +type splitRegexp_2_Engine struct{} + +func (splitRegexp_2_Engine) Caps() map[int]int { return nil } +func (splitRegexp_2_Engine) CapNames() map[string]int { return nil } +func (splitRegexp_2_Engine) CapsList() []string { return nil } +func (splitRegexp_2_Engine) CapSize() int { return 1 } + +func (splitRegexp_2_Engine) FindFirstChar(r *regexp2.Runner) bool { + pos := r.Runtextpos + // Empty matches aren't possible + if pos < len(r.Runtext) { + return true + } + + // No match found + r.Runtextpos = len(r.Runtext) + return false +} + +func (splitRegexp_2_Engine) Execute(r *regexp2.Runner) error { + atomic_stackpos := 0 + alternation_starting_pos := 0 + alternation_starting_pos1 := 0 + var charloop_starting_pos, charloop_ending_pos = 0, 0 + iteration := 0 + iteration1 := 0 + iteration2 := 0 + iteration3 := 0 + var charloop_starting_pos1, charloop_ending_pos1 = 0, 0 + iteration4 := 0 + iteration5 := 0 + var charloop_starting_pos2, charloop_ending_pos2 = 0, 0 + iteration6 := 0 + negativelookahead_starting_pos := 0 + iteration7 := 0 + pos := r.Runtextpos + matchStart := pos + + var slice = r.Runtext[pos:] + + // Node: Atomic + // Atomic group. + atomic_stackpos = r.Runstackpos + + // Node: Alternate + // Match with 7 alternative expressions, atomically. + alternation_starting_pos = pos + + // Branch 0 + // Node: Concatenate + // Node: One(Ch = ') + // Match '\”. + if len(slice) == 0 || slice[0] != '\'' { + goto AlternationBranch + } + + // Node: Atomic + // Node: Alternate + // Match with 6 alternative expressions, atomically. + alternation_starting_pos1 = pos + + // Branch 0 + // Node: Set(Set = [R-Tr-tſ]) + // Match [R-Tr-tſ]. + if len(slice) < 2 || !set_4a1357005dba18ced5af0bde8202a98fab7a3500c675e3cb1c60597d7a36436a.CharIn(slice[1]) { + goto AlternationBranch1 + } + + pos += 2 + slice = r.Runtext[pos:] + goto AlternationMatch1 + +AlternationBranch1: + pos = alternation_starting_pos1 + slice = r.Runtext[pos:] + + // Branch 1 + // Node: Concatenate + if len(slice) < 3 || + !helpers.StartsWithIgnoreCase(slice[1:], []rune("re")) /* Match the string "re" (case-insensitive) */ { + goto AlternationBranch2 + } + + pos += 3 + slice = r.Runtext[pos:] + goto AlternationMatch1 + +AlternationBranch2: + pos = alternation_starting_pos1 + slice = r.Runtext[pos:] + + // Branch 2 + // Node: Concatenate + if len(slice) < 3 || + !helpers.StartsWithIgnoreCase(slice[1:], []rune("ve")) /* Match the string "ve" (case-insensitive) */ { + goto AlternationBranch3 + } + + pos += 3 + slice = r.Runtext[pos:] + goto AlternationMatch1 + +AlternationBranch3: + pos = alternation_starting_pos1 + slice = r.Runtext[pos:] + + // Branch 3 + // Node: Set(Set = [Mm]) + // Match [Mm]. + if len(slice) < 2 || (slice[1]|0x20 != 'm') { + goto AlternationBranch4 + } + + pos += 2 + slice = r.Runtext[pos:] + goto AlternationMatch1 + +AlternationBranch4: + pos = alternation_starting_pos1 + slice = r.Runtext[pos:] + + // Branch 4 + // Node: SetloopAtomic(Set = [Ll])(Min = 2, Max = 2) + // Match [Ll] exactly 2 times. + if len(slice) < 3 || + (slice[1]|0x20 != 'l') || + (slice[2]|0x20 != 'l') { + goto AlternationBranch5 + } + + pos += 3 + slice = r.Runtext[pos:] + goto AlternationMatch1 + +AlternationBranch5: + pos = alternation_starting_pos1 + slice = r.Runtext[pos:] + + // Branch 5 + // Node: Set(Set = [Dd]) + // Match [Dd]. + if len(slice) < 2 || (slice[1]|0x20 != 'd') { + goto AlternationBranch + } + + pos += 2 + slice = r.Runtext[pos:] + +AlternationMatch1: + ; + + goto AlternationMatch + +AlternationBranch: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 1 + // Node: Concatenate + // Node: Setloop(Set = [^\n\r\p{L}\p{N}])(Min = 0, Max = 1) + // Match [^\n\r\p{L}\p{N}] greedily, optionally. + charloop_starting_pos = pos + + if len(slice) > 0 && set_4a7765fc40e8e8c7561121585d1545f9a51b44bf010a50f8b1b086a02ec1f07f.CharIn(slice[0]) { + slice = slice[1:] + pos++ + } + + charloop_ending_pos = pos + goto CharLoopEnd + +CharLoopBacktrack: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos >= charloop_ending_pos { + goto AlternationBranch6 + } + charloop_ending_pos-- + pos = charloop_ending_pos + slice = r.Runtext[pos:] + +CharLoopEnd: + + // Node: SetloopAtomic(Set = [\p{L}])(Min = 1, Max = inf) + // Match [\p{L}] atomically at least once. + iteration = 0 + for iteration < len(slice) && unicode.In(slice[iteration], unicode.L) { + iteration++ + } + + if iteration == 0 { + goto CharLoopBacktrack + } + + slice = slice[iteration:] + pos += iteration + + goto AlternationMatch + +AlternationBranch6: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 2 + // Node: SetloopAtomic(Set = [\p{N}])(Min = 1, Max = 3) + // Match [\p{N}] atomically at least 1 and at most 3 times. + iteration1 = 0 + for iteration1 < 3 && iteration1 < len(slice) && unicode.In(slice[iteration1], unicode.N) { + iteration1++ + } + + if iteration1 == 0 { + goto AlternationBranch7 + } + + slice = slice[iteration1:] + pos += iteration1 + + goto AlternationMatch + +AlternationBranch7: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 3 + // Node: Concatenate + // Node: OneloopAtomic(Ch = \ )(Min = 0, Max = 1) + // Match ' ' atomically, optionally. + if len(slice) > 0 && slice[0] == ' ' { + slice = slice[1:] + pos++ + } + + // Node: SetloopAtomic(Set = [^\s\p{L}\p{N}])(Min = 1, Max = inf) + // Match [^\s\p{L}\p{N}] atomically at least once. + iteration2 = 0 + for iteration2 < len(slice) && set_93b362dc942d41f83c0905ca6229c31c41f863d4ab4d2d9e17dae21b5b922113.CharIn(slice[iteration2]) { + iteration2++ + } + + if iteration2 == 0 { + goto AlternationBranch8 + } + + slice = slice[iteration2:] + pos += iteration2 + + // Node: SetloopAtomic(Set = [\n\r])(Min = 0, Max = inf) + // Match [\n\r] atomically any number of times. + iteration3 = helpers.IndexOfAnyExcept2(slice, '\n', '\r') + if iteration3 < 0 { + iteration3 = len(slice) + } + + slice = slice[iteration3:] + pos += iteration3 + + goto AlternationMatch + +AlternationBranch8: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 4 + // Node: Concatenate + // Node: Setloop(Set = [\s])(Min = 0, Max = inf) + // Match [\s] greedily any number of times. + charloop_starting_pos1 = pos + + iteration4 = 0 + for iteration4 < len(slice) && unicode.IsSpace(slice[iteration4]) { + iteration4++ + } + + slice = slice[iteration4:] + pos += iteration4 + + charloop_ending_pos1 = pos + goto CharLoopEnd1 + +CharLoopBacktrack1: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos1 >= charloop_ending_pos1 { + goto AlternationBranch9 + } + charloop_ending_pos1 = helpers.IndexOfAny2(r.Runtext[charloop_starting_pos1:charloop_ending_pos1], '\n', '\r') + if charloop_ending_pos1 < 0 { // miss + goto AlternationBranch9 + } + charloop_ending_pos1 += charloop_starting_pos1 + pos = charloop_ending_pos1 + slice = r.Runtext[pos:] + +CharLoopEnd1: + + // Node: SetloopAtomic(Set = [\n\r])(Min = 1, Max = inf) + // Match [\n\r] atomically at least once. + iteration5 = helpers.IndexOfAnyExcept2(slice, '\n', '\r') + if iteration5 < 0 { + iteration5 = len(slice) + } + + if iteration5 == 0 { + goto CharLoopBacktrack1 + } + + slice = slice[iteration5:] + pos += iteration5 + + goto AlternationMatch + +AlternationBranch9: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 5 + // Node: Concatenate + // Node: Setloop(Set = [\s])(Min = 1, Max = inf) + // Match [\s] greedily at least once. + charloop_starting_pos2 = pos + + iteration6 = 0 + for iteration6 < len(slice) && unicode.IsSpace(slice[iteration6]) { + iteration6++ + } + + if iteration6 == 0 { + goto AlternationBranch10 + } + + slice = slice[iteration6:] + pos += iteration6 + + charloop_ending_pos2 = pos + charloop_starting_pos2++ + goto CharLoopEnd2 + +CharLoopBacktrack2: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos2 >= charloop_ending_pos2 { + goto AlternationBranch10 + } + charloop_ending_pos2-- + pos = charloop_ending_pos2 + slice = r.Runtext[pos:] + +CharLoopEnd2: + + // Node: NegLook + // Zero-width negative lookahead + negativelookahead_starting_pos = pos + + if err := r.CheckTimeout(); err != nil { + return err + } + // Node: Set(Set = [^\s]) + // Match [^\s]. + if len(slice) == 0 || unicode.IsSpace(slice[0]) { + goto NegativeLookaroundMatch + } + + goto CharLoopBacktrack2 + +NegativeLookaroundMatch: + pos = negativelookahead_starting_pos + slice = r.Runtext[pos:] + + goto AlternationMatch + +AlternationBranch10: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 6 + // Node: SetloopAtomic(Set = [\s])(Min = 1, Max = inf) + // Match [\s] atomically at least once. + iteration7 = 0 + for iteration7 < len(slice) && unicode.IsSpace(slice[iteration7]) { + iteration7++ + } + + if iteration7 == 0 { + return nil // The input didn't match. + } + + slice = slice[iteration7:] + pos += iteration7 + +AlternationMatch: + ; + + r.Runstackpos = atomic_stackpos + + // The input matched. + r.Runtextpos = pos + r.Capture(0, matchStart, pos) + // just to prevent an unused var error in certain regex's + var _ = slice + return nil +} + +/* +Capture(index = 0, unindex = -1) + Atomic + Alternate + Concatenate + Setloop(Set = [^\n\r\p{L}\p{N}])(Min = 0, Max = 1) + Setloop(Set = [\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}])(Min = 0, Max = inf) + Setloop(Set = [\p{Ll}\p{Lm}\p{Lo}\p{M}])(Min = 1, Max = inf) + Atomic + Loop(Min = 0, Max = 1) + Concatenate + One(Ch = ') + Atomic + Alternate + Set(Set = [STstſ]) + Concatenate + Set(Set = [Rr]) + Set(Set = [Ee]) + Concatenate + Set(Set = [Vv]) + Set(Set = [Ee]) + Set(Set = [Mm]) + SetloopAtomic(Set = [Ll])(Min = 2, Max = 2) + Set(Set = [Dd]) + Concatenate + Setloop(Set = [^\n\r\p{L}\p{N}])(Min = 0, Max = 1) + Setloop(Set = [\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}])(Min = 1, Max = inf) + Setloop(Set = [\p{Ll}\p{Lm}\p{Lo}\p{M}])(Min = 0, Max = inf) + Atomic + Loop(Min = 0, Max = 1) + Concatenate + One(Ch = ') + Atomic + Alternate + Set(Set = [STstſ]) + Concatenate + Set(Set = [Rr]) + Set(Set = [Ee]) + Concatenate + Set(Set = [Vv]) + Set(Set = [Ee]) + Set(Set = [Mm]) + SetloopAtomic(Set = [Ll])(Min = 2, Max = 2) + Set(Set = [Dd]) + SetloopAtomic(Set = [\p{N}])(Min = 1, Max = 3) + Concatenate + OneloopAtomic(Ch = \ )(Min = 0, Max = 1) + SetloopAtomic(Set = [^\s\p{L}\p{N}])(Min = 1, Max = inf) + SetloopAtomic(Set = [\n\r/])(Min = 0, Max = inf) + Concatenate + Setloop(Set = [\s])(Min = 0, Max = inf) + SetloopAtomic(Set = [\n\r])(Min = 1, Max = inf) + Concatenate + Setloop(Set = [\s])(Min = 1, Max = inf) + NegLook + Set(Set = [^\s]) + SetloopAtomic(Set = [\s])(Min = 1, Max = inf) +*/ +// From o200k_base.go:9:3 +// Pattern: "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" +// Options: regexp2.None +type splitRegexp_3_Engine struct{} + +func (splitRegexp_3_Engine) Caps() map[int]int { return nil } +func (splitRegexp_3_Engine) CapNames() map[string]int { return nil } +func (splitRegexp_3_Engine) CapsList() []string { return nil } +func (splitRegexp_3_Engine) CapSize() int { return 1 } + +func (splitRegexp_3_Engine) FindFirstChar(r *regexp2.Runner) bool { + pos := r.Runtextpos + // Empty matches aren't possible + if pos < len(r.Runtext) { + return true + } + + // No match found + r.Runtextpos = len(r.Runtext) + return false +} + +func (splitRegexp_3_Engine) Execute(r *regexp2.Runner) error { + atomic_stackpos := 0 + alternation_starting_pos := 0 + var charloop_starting_pos, charloop_ending_pos = 0, 0 + var charloop_starting_pos1, charloop_ending_pos1 = 0, 0 + iteration := 0 + var charloop_starting_pos2, charloop_ending_pos2 = 0, 0 + iteration1 := 0 + loop_iteration := 0 + startingStackpos := 0 + var charloop_starting_pos3, charloop_ending_pos3 = 0, 0 + var charloop_starting_pos4, charloop_ending_pos4 = 0, 0 + iteration2 := 0 + var charloop_starting_pos5, charloop_ending_pos5 = 0, 0 + iteration3 := 0 + loop_iteration1 := 0 + startingStackpos1 := 0 + iteration4 := 0 + iteration5 := 0 + iteration6 := 0 + var charloop_starting_pos6, charloop_ending_pos6 = 0, 0 + iteration7 := 0 + iteration8 := 0 + var charloop_starting_pos7, charloop_ending_pos7 = 0, 0 + iteration9 := 0 + negativelookahead_starting_pos := 0 + iteration10 := 0 + pos := r.Runtextpos + matchStart := pos + + var slice = r.Runtext[pos:] + + // Node: Atomic + // Atomic group. + atomic_stackpos = r.Runstackpos + + // Node: Alternate + // Match with 7 alternative expressions, atomically. + alternation_starting_pos = pos + + // Branch 0 + // Node: Concatenate + // Node: Setloop(Set = [^\n\r\p{L}\p{N}])(Min = 0, Max = 1) + // Match [^\n\r\p{L}\p{N}] greedily, optionally. + charloop_starting_pos = pos + + if len(slice) > 0 && set_4a7765fc40e8e8c7561121585d1545f9a51b44bf010a50f8b1b086a02ec1f07f.CharIn(slice[0]) { + slice = slice[1:] + pos++ + } + + charloop_ending_pos = pos + goto CharLoopEnd + +CharLoopBacktrack: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos >= charloop_ending_pos { + goto AlternationBranch + } + charloop_ending_pos-- + pos = charloop_ending_pos + slice = r.Runtext[pos:] + +CharLoopEnd: + + // Node: Setloop(Set = [\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}])(Min = 0, Max = inf) + // Match [\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}] greedily any number of times. + charloop_starting_pos1 = pos + + iteration = 0 + for iteration < len(slice) && unicode.In(slice[iteration], unicode.Lu, unicode.Lt, unicode.Lm, unicode.Lo, unicode.M) { + iteration++ + } + + slice = slice[iteration:] + pos += iteration + + charloop_ending_pos1 = pos + goto CharLoopEnd1 + +CharLoopBacktrack1: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos1 >= charloop_ending_pos1 { + goto CharLoopBacktrack + } + charloop_ending_pos1-- + pos = charloop_ending_pos1 + slice = r.Runtext[pos:] + +CharLoopEnd1: + + // Node: Setloop(Set = [\p{Ll}\p{Lm}\p{Lo}\p{M}])(Min = 1, Max = inf) + // Match [\p{Ll}\p{Lm}\p{Lo}\p{M}] greedily at least once. + charloop_starting_pos2 = pos + + iteration1 = 0 + for iteration1 < len(slice) && unicode.In(slice[iteration1], unicode.Ll, unicode.Lm, unicode.Lo, unicode.M) { + iteration1++ + } + + if iteration1 == 0 { + goto CharLoopBacktrack1 + } + + slice = slice[iteration1:] + pos += iteration1 + + charloop_ending_pos2 = pos + charloop_starting_pos2++ + goto CharLoopEnd2 + +CharLoopBacktrack2: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos2 >= charloop_ending_pos2 { + goto CharLoopBacktrack1 + } + charloop_ending_pos2 = helpers.LastIndexOfAny1(r.Runtext[charloop_starting_pos2:charloop_ending_pos2], '\'') + if charloop_ending_pos2 < 0 { // miss + goto CharLoopBacktrack1 + } + charloop_ending_pos2 += charloop_starting_pos2 + pos = charloop_ending_pos2 + slice = r.Runtext[pos:] + +CharLoopEnd2: + + // Node: Atomic + // Node: Loop(Min = 0, Max = 1) + // Optional (greedy). + startingStackpos = r.Runstackpos + loop_iteration = 0 + +LoopBody: + ; + r.StackPush(pos) + + loop_iteration++ + + // Node: Concatenate + // Node: One(Ch = ') + // Match '\”. + if len(slice) == 0 || slice[0] != '\'' { + goto LoopIterationNoMatch + } + + // Node: Atomic + // Node: Alternate + // Match with 6 alternative expressions, atomically. + if len(slice) < 2 { + goto LoopIterationNoMatch + } + + switch slice[1] { + case 'S', 'T', 's', 't', 'ſ': + pos += 2 + slice = r.Runtext[pos:] + + case 'R', 'r': + // Node: Concatenate + // Node: Empty + + // Node: Set(Set = [Ee]) + // Match [Ee]. + if len(slice) < 3 || (slice[2]|0x20 != 'e') { + goto LoopIterationNoMatch + } + + pos += 3 + slice = r.Runtext[pos:] + + case 'V', 'v': + // Node: Concatenate + // Node: Empty + + // Node: Set(Set = [Ee]) + // Match [Ee]. + if len(slice) < 3 || (slice[2]|0x20 != 'e') { + goto LoopIterationNoMatch + } + + pos += 3 + slice = r.Runtext[pos:] + + case 'M', 'm': + pos += 2 + slice = r.Runtext[pos:] + + case 'L', 'l': + // Node: SetloopAtomic(Set = [Ll])(Min = 2, Max = 2) + // Match [Ll] exactly 2 times. + if len(slice) < 3 || + (slice[1]|0x20 != 'l') || + (slice[2]|0x20 != 'l') { + goto LoopIterationNoMatch + } + + pos += 3 + slice = r.Runtext[pos:] + + case 'D', 'd': + pos += 2 + slice = r.Runtext[pos:] + + default: + goto LoopIterationNoMatch + } + + // The loop has an upper bound of 1. Continue iterating greedily if it hasn't yet been reached. + if loop_iteration == 0 { + goto LoopBody + } + goto LoopEnd + + // The loop iteration failed. Put state back to the way it was before the iteration. +LoopIterationNoMatch: + loop_iteration-- + if loop_iteration < 0 { + // Unable to match the remainder of the expression after exhausting the loop. + goto CharLoopBacktrack2 + } + pos = r.StackPop() + slice = r.Runtext[pos:] +LoopEnd: + r.Runstackpos = startingStackpos // Ensure any remaining backtracking state is removed. + + goto AlternationMatch + +AlternationBranch: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 1 + // Node: Concatenate + // Node: Setloop(Set = [^\n\r\p{L}\p{N}])(Min = 0, Max = 1) + // Match [^\n\r\p{L}\p{N}] greedily, optionally. + charloop_starting_pos3 = pos + + if len(slice) > 0 && set_4a7765fc40e8e8c7561121585d1545f9a51b44bf010a50f8b1b086a02ec1f07f.CharIn(slice[0]) { + slice = slice[1:] + pos++ + } + + charloop_ending_pos3 = pos + goto CharLoopEnd3 + +CharLoopBacktrack3: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos3 >= charloop_ending_pos3 { + goto AlternationBranch1 + } + charloop_ending_pos3-- + pos = charloop_ending_pos3 + slice = r.Runtext[pos:] + +CharLoopEnd3: + + // Node: Setloop(Set = [\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}])(Min = 1, Max = inf) + // Match [\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}] greedily at least once. + charloop_starting_pos4 = pos + + iteration2 = 0 + for iteration2 < len(slice) && unicode.In(slice[iteration2], unicode.Lu, unicode.Lt, unicode.Lm, unicode.Lo, unicode.M) { + iteration2++ + } + + if iteration2 == 0 { + goto CharLoopBacktrack3 + } + + slice = slice[iteration2:] + pos += iteration2 + + charloop_ending_pos4 = pos + charloop_starting_pos4++ + goto CharLoopEnd4 + +CharLoopBacktrack4: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos4 >= charloop_ending_pos4 { + goto CharLoopBacktrack3 + } + charloop_ending_pos4-- + pos = charloop_ending_pos4 + slice = r.Runtext[pos:] + +CharLoopEnd4: + + // Node: Setloop(Set = [\p{Ll}\p{Lm}\p{Lo}\p{M}])(Min = 0, Max = inf) + // Match [\p{Ll}\p{Lm}\p{Lo}\p{M}] greedily any number of times. + charloop_starting_pos5 = pos + + iteration3 = 0 + for iteration3 < len(slice) && unicode.In(slice[iteration3], unicode.Ll, unicode.Lm, unicode.Lo, unicode.M) { + iteration3++ + } + + slice = slice[iteration3:] + pos += iteration3 + + charloop_ending_pos5 = pos + goto CharLoopEnd5 + +CharLoopBacktrack5: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos5 >= charloop_ending_pos5 { + goto CharLoopBacktrack4 + } + charloop_ending_pos5 = helpers.LastIndexOfAny1(r.Runtext[charloop_starting_pos5:charloop_ending_pos5], '\'') + if charloop_ending_pos5 < 0 { // miss + goto CharLoopBacktrack4 + } + charloop_ending_pos5 += charloop_starting_pos5 + pos = charloop_ending_pos5 + slice = r.Runtext[pos:] + +CharLoopEnd5: + + // Node: Atomic + // Node: Loop(Min = 0, Max = 1) + // Optional (greedy). + startingStackpos1 = r.Runstackpos + loop_iteration1 = 0 + +LoopBody1: + ; + r.StackPush(pos) + + loop_iteration1++ + + // Node: Concatenate + // Node: One(Ch = ') + // Match '\”. + if len(slice) == 0 || slice[0] != '\'' { + goto LoopIterationNoMatch1 + } + + // Node: Atomic + // Node: Alternate + // Match with 6 alternative expressions, atomically. + if len(slice) < 2 { + goto LoopIterationNoMatch1 + } + + switch slice[1] { + case 'S', 'T', 's', 't', 'ſ': + pos += 2 + slice = r.Runtext[pos:] + + case 'R', 'r': + // Node: Concatenate + // Node: Empty + + // Node: Set(Set = [Ee]) + // Match [Ee]. + if len(slice) < 3 || (slice[2]|0x20 != 'e') { + goto LoopIterationNoMatch1 + } + + pos += 3 + slice = r.Runtext[pos:] + + case 'V', 'v': + // Node: Concatenate + // Node: Empty + + // Node: Set(Set = [Ee]) + // Match [Ee]. + if len(slice) < 3 || (slice[2]|0x20 != 'e') { + goto LoopIterationNoMatch1 + } + + pos += 3 + slice = r.Runtext[pos:] + + case 'M', 'm': + pos += 2 + slice = r.Runtext[pos:] + + case 'L', 'l': + // Node: SetloopAtomic(Set = [Ll])(Min = 2, Max = 2) + // Match [Ll] exactly 2 times. + if len(slice) < 3 || + (slice[1]|0x20 != 'l') || + (slice[2]|0x20 != 'l') { + goto LoopIterationNoMatch1 + } + + pos += 3 + slice = r.Runtext[pos:] + + case 'D', 'd': + pos += 2 + slice = r.Runtext[pos:] + + default: + goto LoopIterationNoMatch1 + } + + // The loop has an upper bound of 1. Continue iterating greedily if it hasn't yet been reached. + if loop_iteration1 == 0 { + goto LoopBody1 + } + goto LoopEnd1 + + // The loop iteration failed. Put state back to the way it was before the iteration. +LoopIterationNoMatch1: + loop_iteration1-- + if loop_iteration1 < 0 { + // Unable to match the remainder of the expression after exhausting the loop. + goto CharLoopBacktrack5 + } + pos = r.StackPop() + slice = r.Runtext[pos:] +LoopEnd1: + r.Runstackpos = startingStackpos1 // Ensure any remaining backtracking state is removed. + + goto AlternationMatch + +AlternationBranch1: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 2 + // Node: SetloopAtomic(Set = [\p{N}])(Min = 1, Max = 3) + // Match [\p{N}] atomically at least 1 and at most 3 times. + iteration4 = 0 + for iteration4 < 3 && iteration4 < len(slice) && unicode.In(slice[iteration4], unicode.N) { + iteration4++ + } + + if iteration4 == 0 { + goto AlternationBranch2 + } + + slice = slice[iteration4:] + pos += iteration4 + + goto AlternationMatch + +AlternationBranch2: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 3 + // Node: Concatenate + // Node: OneloopAtomic(Ch = \ )(Min = 0, Max = 1) + // Match ' ' atomically, optionally. + if len(slice) > 0 && slice[0] == ' ' { + slice = slice[1:] + pos++ + } + + // Node: SetloopAtomic(Set = [^\s\p{L}\p{N}])(Min = 1, Max = inf) + // Match [^\s\p{L}\p{N}] atomically at least once. + iteration5 = 0 + for iteration5 < len(slice) && set_93b362dc942d41f83c0905ca6229c31c41f863d4ab4d2d9e17dae21b5b922113.CharIn(slice[iteration5]) { + iteration5++ + } + + if iteration5 == 0 { + goto AlternationBranch3 + } + + slice = slice[iteration5:] + pos += iteration5 + + // Node: SetloopAtomic(Set = [\n\r/])(Min = 0, Max = inf) + // Match [\n\r/] atomically any number of times. + iteration6 = helpers.IndexOfAnyExcept3(slice, '\n', '\r', '/') + if iteration6 < 0 { + iteration6 = len(slice) + } + + slice = slice[iteration6:] + pos += iteration6 + + goto AlternationMatch + +AlternationBranch3: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 4 + // Node: Concatenate + // Node: Setloop(Set = [\s])(Min = 0, Max = inf) + // Match [\s] greedily any number of times. + charloop_starting_pos6 = pos + + iteration7 = 0 + for iteration7 < len(slice) && unicode.IsSpace(slice[iteration7]) { + iteration7++ + } + + slice = slice[iteration7:] + pos += iteration7 + + charloop_ending_pos6 = pos + goto CharLoopEnd6 + +CharLoopBacktrack6: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos6 >= charloop_ending_pos6 { + goto AlternationBranch4 + } + charloop_ending_pos6 = helpers.IndexOfAny2(r.Runtext[charloop_starting_pos6:charloop_ending_pos6], '\n', '\r') + if charloop_ending_pos6 < 0 { // miss + goto AlternationBranch4 + } + charloop_ending_pos6 += charloop_starting_pos6 + pos = charloop_ending_pos6 + slice = r.Runtext[pos:] + +CharLoopEnd6: + + // Node: SetloopAtomic(Set = [\n\r])(Min = 1, Max = inf) + // Match [\n\r] atomically at least once. + iteration8 = helpers.IndexOfAnyExcept2(slice, '\n', '\r') + if iteration8 < 0 { + iteration8 = len(slice) + } + + if iteration8 == 0 { + goto CharLoopBacktrack6 + } + + slice = slice[iteration8:] + pos += iteration8 + + goto AlternationMatch + +AlternationBranch4: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 5 + // Node: Concatenate + // Node: Setloop(Set = [\s])(Min = 1, Max = inf) + // Match [\s] greedily at least once. + charloop_starting_pos7 = pos + + iteration9 = 0 + for iteration9 < len(slice) && unicode.IsSpace(slice[iteration9]) { + iteration9++ + } + + if iteration9 == 0 { + goto AlternationBranch5 + } + + slice = slice[iteration9:] + pos += iteration9 + + charloop_ending_pos7 = pos + charloop_starting_pos7++ + goto CharLoopEnd7 + +CharLoopBacktrack7: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos7 >= charloop_ending_pos7 { + goto AlternationBranch5 + } + charloop_ending_pos7-- + pos = charloop_ending_pos7 + slice = r.Runtext[pos:] + +CharLoopEnd7: + + // Node: NegLook + // Zero-width negative lookahead + negativelookahead_starting_pos = pos + + if err := r.CheckTimeout(); err != nil { + return err + } + // Node: Set(Set = [^\s]) + // Match [^\s]. + if len(slice) == 0 || unicode.IsSpace(slice[0]) { + goto NegativeLookaroundMatch + } + + goto CharLoopBacktrack7 + +NegativeLookaroundMatch: + pos = negativelookahead_starting_pos + slice = r.Runtext[pos:] + + goto AlternationMatch + +AlternationBranch5: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 6 + // Node: SetloopAtomic(Set = [\s])(Min = 1, Max = inf) + // Match [\s] atomically at least once. + iteration10 = 0 + for iteration10 < len(slice) && unicode.IsSpace(slice[iteration10]) { + iteration10++ + } + + if iteration10 == 0 { + return nil // The input didn't match. + } + + slice = slice[iteration10:] + pos += iteration10 + +AlternationMatch: + ; + + r.Runstackpos = atomic_stackpos + + // The input matched. + r.Runtextpos = pos + r.Capture(0, matchStart, pos) + // just to prevent an unused var error in certain regex's + var _ = slice + return nil +} + +// The set [^\s\p{L}\p{N}] +var set_93b362dc942d41f83c0905ca6229c31c41f863d4ab4d2d9e17dae21b5b922113 = syntax.NewCharSetRuntime("\x01\x00\x00\x00\x00\x03\x00\x00\x00\x01 \x01L\x01N") + +// The set [R-Tr-tſ] +var set_4a1357005dba18ced5af0bde8202a98fab7a3500c675e3cb1c60597d7a36436a = syntax.NewCharSetRuntime("\x00\x03\x00\x00\x00\x00\x00\x00\x00RTrtſſ") + +// The set [^\n\r\p{L}\p{N}] +var set_4a7765fc40e8e8c7561121585d1545f9a51b44bf010a50f8b1b086a02ec1f07f = syntax.NewCharSetRuntime("\x01\x02\x00\x00\x00\x02\x00\x00\x00\n\n\r\r\x01L\x01N") + +func init() { + regexp2.RegisterEngine("'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+", regexp2.None, &splitRegexp_Engine{}) + regexp2.RegisterEngine("(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", regexp2.None, &splitRegexp_2_Engine{}) + regexp2.RegisterEngine("[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", regexp2.None, &splitRegexp_3_Engine{}) + var _ = helpers.Min + var _ = syntax.NewCharSetRuntime + var _ = unicode.IsDigit +} diff --git a/go.mod b/go.mod index 31e0ea4..d14997a 100644 --- a/go.mod +++ b/go.mod @@ -1,5 +1,12 @@ module github.com/tiktoken-go/tokenizer -go 1.20 +go 1.21.4 -require github.com/dlclark/regexp2 v1.9.0 +toolchain go1.22.0 + +require github.com/dlclark/regexp2 v1.11.5-0.20240806004527-5bbbed8ea10b + +require ( + github.com/dlclark/regexp2cg v0.1.0 // indirect + github.com/pkg/errors v0.9.1 // indirect +) diff --git a/go.sum b/go.sum index fbc0ebc..d15ad1e 100644 --- a/go.sum +++ b/go.sum @@ -1,2 +1,8 @@ github.com/dlclark/regexp2 v1.9.0 h1:pTK/l/3qYIKaRXuHnEnIf7Y5NxfRPfpb7dis6/gdlVI= github.com/dlclark/regexp2 v1.9.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= +github.com/dlclark/regexp2 v1.11.5-0.20240806004527-5bbbed8ea10b h1:AJKOdc+1fRSJ0/75Jty1npvxUUD0y7hQDg15LMAHhyU= +github.com/dlclark/regexp2 v1.11.5-0.20240806004527-5bbbed8ea10b/go.mod h1:YvCrhrh/qlds8EhFKPtJprdXn5fWBllSw1qo99dZyiQ= +github.com/dlclark/regexp2cg v0.1.0 h1:SdeGspyGihv995cqVSv6UPOHjm/2J1pvQbEQlFDEzNA= +github.com/dlclark/regexp2cg v0.1.0/go.mod h1:n3VoFWNg5bIAwl8j4N/K74dsot/olhCCaIWufs8mUJI= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= diff --git a/tokenizer_test.go b/tokenizer_test.go index 7d4d726..3e64d48 100644 --- a/tokenizer_test.go +++ b/tokenizer_test.go @@ -1,208 +1,127 @@ package tokenizer_test import ( + "fmt" "testing" "github.com/tiktoken-go/tokenizer" ) -func TestO200kBaseEncoding(t *testing.T) { - tokenizer, err := tokenizer.Get(tokenizer.O200kBase) - if err != nil { - t.Fatalf("can't create tokenizer") - } - - tests := []struct { - text string - ids []uint - }{ - {text: "hello world", ids: []uint{24912, 2375}}, - {text: "hello world", ids: []uint{24912, 220, 2375}}, - {text: "hello world", ids: []uint{24912, 256, 2375}}, - {text: "supercalifragilistic", ids: []uint{17789, 5842, 366, 17764, 311, 6207}}, - {text: "We know what we are, but know not what we may be.", ids: []uint{2167, 1761, 1412, 581, 553, 11, 889, 1761, 625, 1412, 581, 1340, 413, 13}}, - } - - for _, test := range tests { - t.Run(test.text, func(t *testing.T) { - ids, _, err := tokenizer.Encode(test.text) - if err != nil { - t.Fatalf("error encoding: %v", err) - } - - if !sliceEqual(ids, test.ids) { - t.Fatalf("input: %s want: %v got: %v", test.text, test.ids, ids) - } - - text, err := tokenizer.Decode(ids) - if err != nil { - t.Fatalf("error decoding: %v", err) - } - - if text != test.text { - t.Fatalf("input: %v want: %s got: %s", test.ids, test.text, text) - } - }) - } +type testTokenizer struct { + encoding tokenizer.Encoding + data []testTokenizerData } -func TestCl100kEncoding(t *testing.T) { - tokenizer, err := tokenizer.Get(tokenizer.Cl100kBase) - if err != nil { - t.Fatalf("can't create tokenizer") - } - - tests := []struct { - text string - ids []uint - }{ - {text: "hello world", ids: []uint{15339, 1917}}, - {text: "hello world", ids: []uint{15339, 220, 1917}}, - {text: "hello world", ids: []uint{15339, 256, 1917}}, - {text: "supercalifragilistic", ids: []uint{13066, 3035, 278, 333, 4193, 321, 4633}}, - {text: "We know what we are, but know not what we may be.", ids: []uint{1687, 1440, 1148, 584, 527, 11, 719, 1440, 539, 1148, 584, 1253, 387, 13}}, - } - - for _, test := range tests { - t.Run(test.text, func(t *testing.T) { - ids, _, err := tokenizer.Encode(test.text) - if err != nil { - t.Fatalf("error encoding: %v", err) - } - - if !sliceEqual(ids, test.ids) { - t.Fatalf("input: %s want: %v got: %v", test.text, test.ids, ids) - } - - text, err := tokenizer.Decode(ids) - if err != nil { - t.Fatalf("error decoding: %v", err) - } - - if text != test.text { - t.Fatalf("input: %v want: %s got: %s", test.ids, test.text, text) - } - }) - } +type testTokenizerData struct { + text string + ids []uint } -func TestR50kBaseEncoding(t *testing.T) { - tokenizer, err := tokenizer.Get(tokenizer.R50kBase) - if err != nil { - t.Fatalf("can't create tokenizer") +var ( + tokenizerTests = []testTokenizer{ + { + encoding: tokenizer.O200kBase, + data: []testTokenizerData{ + {text: "hello world", ids: []uint{24912, 2375}}, + {text: "hello world", ids: []uint{24912, 220, 2375}}, + {text: "hello world", ids: []uint{24912, 256, 2375}}, + {text: "supercalifragilistic", ids: []uint{17789, 5842, 366, 17764, 311, 6207}}, + {text: "We know what we are, but know not what we may be.", ids: []uint{2167, 1761, 1412, 581, 553, 11, 889, 1761, 625, 1412, 581, 1340, 413, 13}}, + }, + }, + { + encoding: tokenizer.Cl100kBase, + data: []testTokenizerData{ + {text: "hello world", ids: []uint{15339, 1917}}, + {text: "hello world", ids: []uint{15339, 220, 1917}}, + {text: "hello world", ids: []uint{15339, 256, 1917}}, + {text: "supercalifragilistic", ids: []uint{13066, 3035, 278, 333, 4193, 321, 4633}}, + {text: "We know what we are, but know not what we may be.", ids: []uint{1687, 1440, 1148, 584, 527, 11, 719, 1440, 539, 1148, 584, 1253, 387, 13}}, + }, + }, + { + encoding: tokenizer.R50kBase, + data: []testTokenizerData{ + {text: "hello world", ids: []uint{31373, 995}}, + {text: "hello world", ids: []uint{31373, 220, 995}}, + {text: "hello world", ids: []uint{31373, 220, 220, 995}}, + {text: "supercalifragilistic", ids: []uint{16668, 9948, 361, 22562, 346, 2569}}, + {text: "We know what we are, but know not what we may be.", ids: []uint{1135, 760, 644, 356, 389, 11, 475, 760, 407, 644, 356, 743, 307, 13}}, + }, + }, + { + encoding: tokenizer.P50kBase, + data: []testTokenizerData{ + {text: "hello world", ids: []uint{31373, 995}}, + {text: "hello world", ids: []uint{31373, 220, 995}}, + {text: "hello world", ids: []uint{31373, 50257, 995}}, + {text: "supercalifragilistic", ids: []uint{16668, 9948, 361, 22562, 346, 2569}}, + {text: "We know what we are, but know not what we may be.", ids: []uint{1135, 760, 644, 356, 389, 11, 475, 760, 407, 644, 356, 743, 307, 13}}, + }, + }, + { + encoding: tokenizer.P50kEdit, + data: []testTokenizerData{ + {text: "hello world", ids: []uint{31373, 995}}, + {text: "hello world", ids: []uint{31373, 220, 995}}, + {text: "hello world", ids: []uint{31373, 50257, 995}}, + {text: "supercalifragilistic", ids: []uint{16668, 9948, 361, 22562, 346, 2569}}, + {text: "We know what we are, but know not what we may be.", ids: []uint{1135, 760, 644, 356, 389, 11, 475, 760, 407, 644, 356, 743, 307, 13}}, + }, + }, } +) - tests := []struct { - text string - ids []uint - }{ - {text: "hello world", ids: []uint{31373, 995}}, - {text: "hello world", ids: []uint{31373, 220, 995}}, - {text: "hello world", ids: []uint{31373, 220, 220, 995}}, - {text: "supercalifragilistic", ids: []uint{16668, 9948, 361, 22562, 346, 2569}}, - {text: "We know what we are, but know not what we may be.", ids: []uint{1135, 760, 644, 356, 389, 11, 475, 760, 407, 644, 356, 743, 307, 13}}, - } +func TestTokenizer(t *testing.T) { + for _, test := range tokenizerTests { + tokenizer, err := tokenizer.Get(test.encoding) + if err != nil { + t.Fatalf("can't create tokenizer") + } - for _, test := range tests { - t.Run(test.text, func(t *testing.T) { - ids, _, err := tokenizer.Encode(test.text) - if err != nil { - t.Fatalf("error encoding: %v", err) - } - - if !sliceEqual(ids, test.ids) { - t.Fatalf("input: %s want: %v got: %v", test.text, test.ids, ids) - } - - text, err := tokenizer.Decode(ids) - if err != nil { - t.Fatalf("error decoding: %v", err) - } - - if text != test.text { - t.Fatalf("input: %v want: %s got: %s", test.ids, test.text, text) - } - }) + for _, data := range test.data { + t.Run(fmt.Sprintf("%s: %s", test.encoding, data.text), func(t *testing.T) { + ids, _, err := tokenizer.Encode(data.text) + if err != nil { + t.Fatalf("error encoding: %v", err) + } + + if !sliceEqual(ids, data.ids) { + t.Fatalf("input: %s want: %v got: %v", data.text, data.ids, ids) + } + + text, err := tokenizer.Decode(ids) + if err != nil { + t.Fatalf("error decoding: %v", err) + } + + if text != data.text { + t.Fatalf("input: %v want: %s got: %s", data.ids, data.text, text) + } + }) + } } } -func TestP50kBaseEncoding(t *testing.T) { - tokenizer, err := tokenizer.Get(tokenizer.P50kBase) - if err != nil { - t.Fatalf("can't create tokenizer") - } - - tests := []struct { - text string - ids []uint - }{ - {text: "hello world", ids: []uint{31373, 995}}, - {text: "hello world", ids: []uint{31373, 220, 995}}, - {text: "hello world", ids: []uint{31373, 50257, 995}}, - {text: "supercalifragilistic", ids: []uint{16668, 9948, 361, 22562, 346, 2569}}, - {text: "We know what we are, but know not what we may be.", ids: []uint{1135, 760, 644, 356, 389, 11, 475, 760, 407, 644, 356, 743, 307, 13}}, - } +var tokens []uint - for _, test := range tests { - t.Run(test.text, func(t *testing.T) { - ids, _, err := tokenizer.Encode(test.text) - if err != nil { - t.Fatalf("error encoding: %v", err) - } - - if !sliceEqual(ids, test.ids) { - t.Fatalf("input: %s want: %v got: %v", test.text, test.ids, ids) - } - - text, err := tokenizer.Decode(ids) - if err != nil { - t.Fatalf("error decoding: %v", err) - } - - if text != test.text { - t.Fatalf("input: %v want: %s got: %s", test.ids, test.text, text) - } - }) - } -} +func BenchmarkTokenizer(b *testing.B) { + for _, test := range tokenizerTests { + tokenizer, err := tokenizer.Get(test.encoding) + if err != nil { + b.Fatalf("can't create tokenizer") + } -func TestP50kEditEncoding(t *testing.T) { - tokenizer, err := tokenizer.Get(tokenizer.P50kEdit) - if err != nil { - t.Fatalf("can't create tokenizer") - } + for _, data := range test.data { + b.Run(fmt.Sprintf("%s: %s", test.encoding, data.text), func(b *testing.B) { + for i := 0; i < b.N; i++ { - tests := []struct { - text string - ids []uint - }{ - {text: "hello world", ids: []uint{31373, 995}}, - {text: "hello world", ids: []uint{31373, 220, 995}}, - {text: "hello world", ids: []uint{31373, 50257, 995}}, - {text: "supercalifragilistic", ids: []uint{16668, 9948, 361, 22562, 346, 2569}}, - {text: "We know what we are, but know not what we may be.", ids: []uint{1135, 760, 644, 356, 389, 11, 475, 760, 407, 644, 356, 743, 307, 13}}, - } + tokens, _, _ = tokenizer.Encode(data.text) + } - for _, test := range tests { - t.Run(test.text, func(t *testing.T) { - ids, _, err := tokenizer.Encode(test.text) - if err != nil { - t.Fatalf("error encoding: %v", err) - } - - if !sliceEqual(ids, test.ids) { - t.Fatalf("input: %s want: %v got: %v", test.text, test.ids, ids) - } - - text, err := tokenizer.Decode(ids) - if err != nil { - t.Fatalf("error decoding: %v", err) - } - - if text != test.text { - t.Fatalf("input: %v want: %s got: %s", test.ids, test.text, text) - } - }) + _ = tokens + }) + } } } From 2e19c1421ebbcdd3d959ebd04fc8e44b70e64696 Mon Sep 17 00:00:00 2001 From: Mariano Wahlmann Date: Wed, 12 Feb 2025 16:54:35 -0600 Subject: [PATCH 2/2] Added a go generate command for the compiled regular expression --- .github/workflows/go.yml | 2 +- codec/regexp.gen.go | 484 +++++++++++++++++++-------------------- codec/regexp.go | 2 + go.mod | 4 +- go.sum | 6 +- 5 files changed, 250 insertions(+), 248 deletions(-) create mode 100644 codec/regexp.go diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index cd89848..74b42e9 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -19,7 +19,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v3 with: - go-version: "1.20" + go-version: "1.24" - name: Build run: go build -v ./... diff --git a/codec/regexp.gen.go b/codec/regexp.gen.go index 3b00a59..3acf8fa 100644 --- a/codec/regexp.gen.go +++ b/codec/regexp.gen.go @@ -15,29 +15,35 @@ Capture(index = 0, unindex = -1) One(Ch = ') Atomic Alternate - Set(Set = [r-t]) - Multi(String = "re") - Multi(String = "ve") - One(Ch = m) - Multi(String = "ll") - One(Ch = d) + Set(Set = [R-Tr-tſ]) + Concatenate + Set(Set = [Rr]) + Set(Set = [Ee]) + Concatenate + Set(Set = [Vv]) + Set(Set = [Ee]) + Set(Set = [Mm]) + SetloopAtomic(Set = [Ll])(Min = 2, Max = 2) + Set(Set = [Dd]) Concatenate - OneloopAtomic(Ch = \ )(Min = 0, Max = 1) + Setloop(Set = [^\n\r\p{L}\p{N}])(Min = 0, Max = 1) SetloopAtomic(Set = [\p{L}])(Min = 1, Max = inf) - Concatenate - OneloopAtomic(Ch = \ )(Min = 0, Max = 1) - SetloopAtomic(Set = [\p{N}])(Min = 1, Max = inf) + SetloopAtomic(Set = [\p{N}])(Min = 1, Max = 3) Concatenate OneloopAtomic(Ch = \ )(Min = 0, Max = 1) SetloopAtomic(Set = [^\s\p{L}\p{N}])(Min = 1, Max = inf) + SetloopAtomic(Set = [\n\r])(Min = 0, Max = inf) + Concatenate + Setloop(Set = [\s])(Min = 0, Max = inf) + SetloopAtomic(Set = [\n\r])(Min = 1, Max = inf) Concatenate Setloop(Set = [\s])(Min = 1, Max = inf) NegLook Set(Set = [^\s]) SetloopAtomic(Set = [\s])(Min = 1, Max = inf) */ -// From p50k_base.go:8:37 -// Pattern: "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+" +// From cl100k_base.go:8:37 +// Pattern: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" // Options: regexp2.None type splitRegexp_Engine struct{} @@ -62,13 +68,18 @@ func (splitRegexp_Engine) Execute(r *regexp2.Runner) error { atomic_stackpos := 0 alternation_starting_pos := 0 alternation_starting_pos1 := 0 + var charloop_starting_pos, charloop_ending_pos = 0, 0 iteration := 0 iteration1 := 0 iteration2 := 0 - var charloop_starting_pos, charloop_ending_pos = 0, 0 iteration3 := 0 - negativelookahead_starting_pos := 0 + var charloop_starting_pos1, charloop_ending_pos1 = 0, 0 iteration4 := 0 + iteration5 := 0 + var charloop_starting_pos2, charloop_ending_pos2 = 0, 0 + iteration6 := 0 + negativelookahead_starting_pos := 0 + iteration7 := 0 pos := r.Runtextpos matchStart := pos @@ -79,7 +90,7 @@ func (splitRegexp_Engine) Execute(r *regexp2.Runner) error { atomic_stackpos = r.Runstackpos // Node: Alternate - // Match with 6 alternative expressions, atomically. + // Match with 7 alternative expressions, atomically. alternation_starting_pos = pos // Branch 0 @@ -96,9 +107,9 @@ func (splitRegexp_Engine) Execute(r *regexp2.Runner) error { alternation_starting_pos1 = pos // Branch 0 - // Node: Set(Set = [r-t]) - // Match [r-t]. - if len(slice) < 2 || !helpers.IsBetween(slice[1], 'r', 't') { + // Node: Set(Set = [R-Tr-tſ]) + // Match [R-Tr-tſ]. + if len(slice) < 2 || !set_4a1357005dba18ced5af0bde8202a98fab7a3500c675e3cb1c60597d7a36436a.CharIn(slice[1]) { goto AlternationBranch1 } @@ -111,9 +122,9 @@ AlternationBranch1: slice = r.Runtext[pos:] // Branch 1 - // Node: Multi(String = "re") - // Match the string "re". - if !helpers.StartsWith(slice[1:], []rune("re")) { + // Node: Concatenate + if len(slice) < 3 || + !helpers.StartsWithIgnoreCase(slice[1:], []rune("re")) /* Match the string "re" (case-insensitive) */ { goto AlternationBranch2 } @@ -126,9 +137,9 @@ AlternationBranch2: slice = r.Runtext[pos:] // Branch 2 - // Node: Multi(String = "ve") - // Match the string "ve". - if !helpers.StartsWith(slice[1:], []rune("ve")) { + // Node: Concatenate + if len(slice) < 3 || + !helpers.StartsWithIgnoreCase(slice[1:], []rune("ve")) /* Match the string "ve" (case-insensitive) */ { goto AlternationBranch3 } @@ -141,9 +152,9 @@ AlternationBranch3: slice = r.Runtext[pos:] // Branch 3 - // Node: One(Ch = m) - // Match 'm'. - if len(slice) < 2 || slice[1] != 'm' { + // Node: Set(Set = [Mm]) + // Match [Mm]. + if len(slice) < 2 || (slice[1]|0x20 != 'm') { goto AlternationBranch4 } @@ -156,9 +167,11 @@ AlternationBranch4: slice = r.Runtext[pos:] // Branch 4 - // Node: Multi(String = "ll") - // Match the string "ll". - if !helpers.StartsWith(slice[1:], []rune("ll")) { + // Node: SetloopAtomic(Set = [Ll])(Min = 2, Max = 2) + // Match [Ll] exactly 2 times. + if len(slice) < 3 || + (slice[1]|0x20 != 'l') || + (slice[2]|0x20 != 'l') { goto AlternationBranch5 } @@ -171,9 +184,9 @@ AlternationBranch5: slice = r.Runtext[pos:] // Branch 5 - // Node: One(Ch = d) - // Match 'd'. - if len(slice) < 2 || slice[1] != 'd' { + // Node: Set(Set = [Dd]) + // Match [Dd]. + if len(slice) < 2 || (slice[1]|0x20 != 'd') { goto AlternationBranch } @@ -191,13 +204,32 @@ AlternationBranch: // Branch 1 // Node: Concatenate - // Node: OneloopAtomic(Ch = \ )(Min = 0, Max = 1) - // Match ' ' atomically, optionally. - if len(slice) > 0 && slice[0] == ' ' { + // Node: Setloop(Set = [^\n\r\p{L}\p{N}])(Min = 0, Max = 1) + // Match [^\n\r\p{L}\p{N}] greedily, optionally. + charloop_starting_pos = pos + + if len(slice) > 0 && set_4a7765fc40e8e8c7561121585d1545f9a51b44bf010a50f8b1b086a02ec1f07f.CharIn(slice[0]) { slice = slice[1:] pos++ } + charloop_ending_pos = pos + goto CharLoopEnd + +CharLoopBacktrack: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos >= charloop_ending_pos { + goto AlternationBranch6 + } + charloop_ending_pos-- + pos = charloop_ending_pos + slice = r.Runtext[pos:] + +CharLoopEnd: + // Node: SetloopAtomic(Set = [\p{L}])(Min = 1, Max = inf) // Match [\p{L}] atomically at least once. iteration = 0 @@ -206,7 +238,7 @@ AlternationBranch: } if iteration == 0 { - goto AlternationBranch6 + goto CharLoopBacktrack } slice = slice[iteration:] @@ -219,18 +251,10 @@ AlternationBranch6: slice = r.Runtext[pos:] // Branch 2 - // Node: Concatenate - // Node: OneloopAtomic(Ch = \ )(Min = 0, Max = 1) - // Match ' ' atomically, optionally. - if len(slice) > 0 && slice[0] == ' ' { - slice = slice[1:] - pos++ - } - - // Node: SetloopAtomic(Set = [\p{N}])(Min = 1, Max = inf) - // Match [\p{N}] atomically at least once. + // Node: SetloopAtomic(Set = [\p{N}])(Min = 1, Max = 3) + // Match [\p{N}] atomically at least 1 and at most 3 times. iteration1 = 0 - for iteration1 < len(slice) && unicode.In(slice[iteration1], unicode.N) { + for iteration1 < 3 && iteration1 < len(slice) && unicode.In(slice[iteration1], unicode.N) { iteration1++ } @@ -270,6 +294,16 @@ AlternationBranch7: slice = slice[iteration2:] pos += iteration2 + // Node: SetloopAtomic(Set = [\n\r])(Min = 0, Max = inf) + // Match [\n\r] atomically any number of times. + iteration3 = helpers.IndexOfAnyExcept2(slice, '\n', '\r') + if iteration3 < 0 { + iteration3 = len(slice) + } + + slice = slice[iteration3:] + pos += iteration3 + goto AlternationMatch AlternationBranch8: @@ -278,39 +312,94 @@ AlternationBranch8: // Branch 4 // Node: Concatenate + // Node: Setloop(Set = [\s])(Min = 0, Max = inf) + // Match [\s] greedily any number of times. + charloop_starting_pos1 = pos + + iteration4 = 0 + for iteration4 < len(slice) && unicode.IsSpace(slice[iteration4]) { + iteration4++ + } + + slice = slice[iteration4:] + pos += iteration4 + + charloop_ending_pos1 = pos + goto CharLoopEnd1 + +CharLoopBacktrack1: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos1 >= charloop_ending_pos1 { + goto AlternationBranch9 + } + charloop_ending_pos1 = helpers.IndexOfAny2(r.Runtext[charloop_starting_pos1:charloop_ending_pos1], '\n', '\r') + if charloop_ending_pos1 < 0 { // miss + goto AlternationBranch9 + } + charloop_ending_pos1 += charloop_starting_pos1 + pos = charloop_ending_pos1 + slice = r.Runtext[pos:] + +CharLoopEnd1: + + // Node: SetloopAtomic(Set = [\n\r])(Min = 1, Max = inf) + // Match [\n\r] atomically at least once. + iteration5 = helpers.IndexOfAnyExcept2(slice, '\n', '\r') + if iteration5 < 0 { + iteration5 = len(slice) + } + + if iteration5 == 0 { + goto CharLoopBacktrack1 + } + + slice = slice[iteration5:] + pos += iteration5 + + goto AlternationMatch + +AlternationBranch9: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 5 + // Node: Concatenate // Node: Setloop(Set = [\s])(Min = 1, Max = inf) // Match [\s] greedily at least once. - charloop_starting_pos = pos + charloop_starting_pos2 = pos - iteration3 = 0 - for iteration3 < len(slice) && unicode.IsSpace(slice[iteration3]) { - iteration3++ + iteration6 = 0 + for iteration6 < len(slice) && unicode.IsSpace(slice[iteration6]) { + iteration6++ } - if iteration3 == 0 { - goto AlternationBranch9 + if iteration6 == 0 { + goto AlternationBranch10 } - slice = slice[iteration3:] - pos += iteration3 + slice = slice[iteration6:] + pos += iteration6 - charloop_ending_pos = pos - charloop_starting_pos++ - goto CharLoopEnd + charloop_ending_pos2 = pos + charloop_starting_pos2++ + goto CharLoopEnd2 -CharLoopBacktrack: +CharLoopBacktrack2: if err := r.CheckTimeout(); err != nil { return err } - if charloop_starting_pos >= charloop_ending_pos { - goto AlternationBranch9 + if charloop_starting_pos2 >= charloop_ending_pos2 { + goto AlternationBranch10 } - charloop_ending_pos-- - pos = charloop_ending_pos + charloop_ending_pos2-- + pos = charloop_ending_pos2 slice = r.Runtext[pos:] -CharLoopEnd: +CharLoopEnd2: // Node: NegLook // Zero-width negative lookahead @@ -325,7 +414,7 @@ CharLoopEnd: goto NegativeLookaroundMatch } - goto CharLoopBacktrack + goto CharLoopBacktrack2 NegativeLookaroundMatch: pos = negativelookahead_starting_pos @@ -333,24 +422,24 @@ NegativeLookaroundMatch: goto AlternationMatch -AlternationBranch9: +AlternationBranch10: pos = alternation_starting_pos slice = r.Runtext[pos:] - // Branch 5 + // Branch 6 // Node: SetloopAtomic(Set = [\s])(Min = 1, Max = inf) // Match [\s] atomically at least once. - iteration4 = 0 - for iteration4 < len(slice) && unicode.IsSpace(slice[iteration4]) { - iteration4++ + iteration7 = 0 + for iteration7 < len(slice) && unicode.IsSpace(slice[iteration7]) { + iteration7++ } - if iteration4 == 0 { + if iteration7 == 0 { return nil // The input didn't match. } - slice = slice[iteration4:] - pos += iteration4 + slice = slice[iteration7:] + pos += iteration7 AlternationMatch: ; @@ -373,35 +462,29 @@ Capture(index = 0, unindex = -1) One(Ch = ') Atomic Alternate - Set(Set = [R-Tr-tſ]) - Concatenate - Set(Set = [Rr]) - Set(Set = [Ee]) - Concatenate - Set(Set = [Vv]) - Set(Set = [Ee]) - Set(Set = [Mm]) - SetloopAtomic(Set = [Ll])(Min = 2, Max = 2) - Set(Set = [Dd]) + Set(Set = [r-t]) + Multi(String = "re") + Multi(String = "ve") + One(Ch = m) + Multi(String = "ll") + One(Ch = d) Concatenate - Setloop(Set = [^\n\r\p{L}\p{N}])(Min = 0, Max = 1) + OneloopAtomic(Ch = \ )(Min = 0, Max = 1) SetloopAtomic(Set = [\p{L}])(Min = 1, Max = inf) - SetloopAtomic(Set = [\p{N}])(Min = 1, Max = 3) Concatenate OneloopAtomic(Ch = \ )(Min = 0, Max = 1) - SetloopAtomic(Set = [^\s\p{L}\p{N}])(Min = 1, Max = inf) - SetloopAtomic(Set = [\n\r])(Min = 0, Max = inf) + SetloopAtomic(Set = [\p{N}])(Min = 1, Max = inf) Concatenate - Setloop(Set = [\s])(Min = 0, Max = inf) - SetloopAtomic(Set = [\n\r])(Min = 1, Max = inf) + OneloopAtomic(Ch = \ )(Min = 0, Max = 1) + SetloopAtomic(Set = [^\s\p{L}\p{N}])(Min = 1, Max = inf) Concatenate Setloop(Set = [\s])(Min = 1, Max = inf) NegLook Set(Set = [^\s]) SetloopAtomic(Set = [\s])(Min = 1, Max = inf) */ -// From cl100k_base.go:8:37 -// Pattern: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" +// From p50k_base.go:8:37 +// Pattern: "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+" // Options: regexp2.None type splitRegexp_2_Engine struct{} @@ -426,18 +509,13 @@ func (splitRegexp_2_Engine) Execute(r *regexp2.Runner) error { atomic_stackpos := 0 alternation_starting_pos := 0 alternation_starting_pos1 := 0 - var charloop_starting_pos, charloop_ending_pos = 0, 0 iteration := 0 iteration1 := 0 iteration2 := 0 + var charloop_starting_pos, charloop_ending_pos = 0, 0 iteration3 := 0 - var charloop_starting_pos1, charloop_ending_pos1 = 0, 0 - iteration4 := 0 - iteration5 := 0 - var charloop_starting_pos2, charloop_ending_pos2 = 0, 0 - iteration6 := 0 negativelookahead_starting_pos := 0 - iteration7 := 0 + iteration4 := 0 pos := r.Runtextpos matchStart := pos @@ -448,7 +526,7 @@ func (splitRegexp_2_Engine) Execute(r *regexp2.Runner) error { atomic_stackpos = r.Runstackpos // Node: Alternate - // Match with 7 alternative expressions, atomically. + // Match with 6 alternative expressions, atomically. alternation_starting_pos = pos // Branch 0 @@ -465,9 +543,9 @@ func (splitRegexp_2_Engine) Execute(r *regexp2.Runner) error { alternation_starting_pos1 = pos // Branch 0 - // Node: Set(Set = [R-Tr-tſ]) - // Match [R-Tr-tſ]. - if len(slice) < 2 || !set_4a1357005dba18ced5af0bde8202a98fab7a3500c675e3cb1c60597d7a36436a.CharIn(slice[1]) { + // Node: Set(Set = [r-t]) + // Match [r-t]. + if len(slice) < 2 || !helpers.IsBetween(slice[1], 'r', 't') { goto AlternationBranch1 } @@ -480,9 +558,9 @@ AlternationBranch1: slice = r.Runtext[pos:] // Branch 1 - // Node: Concatenate - if len(slice) < 3 || - !helpers.StartsWithIgnoreCase(slice[1:], []rune("re")) /* Match the string "re" (case-insensitive) */ { + // Node: Multi(String = "re") + // Match the string "re". + if !helpers.StartsWith(slice[1:], []rune("re")) { goto AlternationBranch2 } @@ -495,9 +573,9 @@ AlternationBranch2: slice = r.Runtext[pos:] // Branch 2 - // Node: Concatenate - if len(slice) < 3 || - !helpers.StartsWithIgnoreCase(slice[1:], []rune("ve")) /* Match the string "ve" (case-insensitive) */ { + // Node: Multi(String = "ve") + // Match the string "ve". + if !helpers.StartsWith(slice[1:], []rune("ve")) { goto AlternationBranch3 } @@ -510,9 +588,9 @@ AlternationBranch3: slice = r.Runtext[pos:] // Branch 3 - // Node: Set(Set = [Mm]) - // Match [Mm]. - if len(slice) < 2 || (slice[1]|0x20 != 'm') { + // Node: One(Ch = m) + // Match 'm'. + if len(slice) < 2 || slice[1] != 'm' { goto AlternationBranch4 } @@ -525,11 +603,9 @@ AlternationBranch4: slice = r.Runtext[pos:] // Branch 4 - // Node: SetloopAtomic(Set = [Ll])(Min = 2, Max = 2) - // Match [Ll] exactly 2 times. - if len(slice) < 3 || - (slice[1]|0x20 != 'l') || - (slice[2]|0x20 != 'l') { + // Node: Multi(String = "ll") + // Match the string "ll". + if !helpers.StartsWith(slice[1:], []rune("ll")) { goto AlternationBranch5 } @@ -542,9 +618,9 @@ AlternationBranch5: slice = r.Runtext[pos:] // Branch 5 - // Node: Set(Set = [Dd]) - // Match [Dd]. - if len(slice) < 2 || (slice[1]|0x20 != 'd') { + // Node: One(Ch = d) + // Match 'd'. + if len(slice) < 2 || slice[1] != 'd' { goto AlternationBranch } @@ -562,32 +638,13 @@ AlternationBranch: // Branch 1 // Node: Concatenate - // Node: Setloop(Set = [^\n\r\p{L}\p{N}])(Min = 0, Max = 1) - // Match [^\n\r\p{L}\p{N}] greedily, optionally. - charloop_starting_pos = pos - - if len(slice) > 0 && set_4a7765fc40e8e8c7561121585d1545f9a51b44bf010a50f8b1b086a02ec1f07f.CharIn(slice[0]) { + // Node: OneloopAtomic(Ch = \ )(Min = 0, Max = 1) + // Match ' ' atomically, optionally. + if len(slice) > 0 && slice[0] == ' ' { slice = slice[1:] pos++ } - charloop_ending_pos = pos - goto CharLoopEnd - -CharLoopBacktrack: - - if err := r.CheckTimeout(); err != nil { - return err - } - if charloop_starting_pos >= charloop_ending_pos { - goto AlternationBranch6 - } - charloop_ending_pos-- - pos = charloop_ending_pos - slice = r.Runtext[pos:] - -CharLoopEnd: - // Node: SetloopAtomic(Set = [\p{L}])(Min = 1, Max = inf) // Match [\p{L}] atomically at least once. iteration = 0 @@ -596,7 +653,7 @@ CharLoopEnd: } if iteration == 0 { - goto CharLoopBacktrack + goto AlternationBranch6 } slice = slice[iteration:] @@ -609,10 +666,18 @@ AlternationBranch6: slice = r.Runtext[pos:] // Branch 2 - // Node: SetloopAtomic(Set = [\p{N}])(Min = 1, Max = 3) - // Match [\p{N}] atomically at least 1 and at most 3 times. + // Node: Concatenate + // Node: OneloopAtomic(Ch = \ )(Min = 0, Max = 1) + // Match ' ' atomically, optionally. + if len(slice) > 0 && slice[0] == ' ' { + slice = slice[1:] + pos++ + } + + // Node: SetloopAtomic(Set = [\p{N}])(Min = 1, Max = inf) + // Match [\p{N}] atomically at least once. iteration1 = 0 - for iteration1 < 3 && iteration1 < len(slice) && unicode.In(slice[iteration1], unicode.N) { + for iteration1 < len(slice) && unicode.In(slice[iteration1], unicode.N) { iteration1++ } @@ -652,16 +717,6 @@ AlternationBranch7: slice = slice[iteration2:] pos += iteration2 - // Node: SetloopAtomic(Set = [\n\r])(Min = 0, Max = inf) - // Match [\n\r] atomically any number of times. - iteration3 = helpers.IndexOfAnyExcept2(slice, '\n', '\r') - if iteration3 < 0 { - iteration3 = len(slice) - } - - slice = slice[iteration3:] - pos += iteration3 - goto AlternationMatch AlternationBranch8: @@ -670,94 +725,39 @@ AlternationBranch8: // Branch 4 // Node: Concatenate - // Node: Setloop(Set = [\s])(Min = 0, Max = inf) - // Match [\s] greedily any number of times. - charloop_starting_pos1 = pos - - iteration4 = 0 - for iteration4 < len(slice) && unicode.IsSpace(slice[iteration4]) { - iteration4++ - } - - slice = slice[iteration4:] - pos += iteration4 - - charloop_ending_pos1 = pos - goto CharLoopEnd1 - -CharLoopBacktrack1: - - if err := r.CheckTimeout(); err != nil { - return err - } - if charloop_starting_pos1 >= charloop_ending_pos1 { - goto AlternationBranch9 - } - charloop_ending_pos1 = helpers.IndexOfAny2(r.Runtext[charloop_starting_pos1:charloop_ending_pos1], '\n', '\r') - if charloop_ending_pos1 < 0 { // miss - goto AlternationBranch9 - } - charloop_ending_pos1 += charloop_starting_pos1 - pos = charloop_ending_pos1 - slice = r.Runtext[pos:] - -CharLoopEnd1: - - // Node: SetloopAtomic(Set = [\n\r])(Min = 1, Max = inf) - // Match [\n\r] atomically at least once. - iteration5 = helpers.IndexOfAnyExcept2(slice, '\n', '\r') - if iteration5 < 0 { - iteration5 = len(slice) - } - - if iteration5 == 0 { - goto CharLoopBacktrack1 - } - - slice = slice[iteration5:] - pos += iteration5 - - goto AlternationMatch - -AlternationBranch9: - pos = alternation_starting_pos - slice = r.Runtext[pos:] - - // Branch 5 - // Node: Concatenate // Node: Setloop(Set = [\s])(Min = 1, Max = inf) // Match [\s] greedily at least once. - charloop_starting_pos2 = pos + charloop_starting_pos = pos - iteration6 = 0 - for iteration6 < len(slice) && unicode.IsSpace(slice[iteration6]) { - iteration6++ + iteration3 = 0 + for iteration3 < len(slice) && unicode.IsSpace(slice[iteration3]) { + iteration3++ } - if iteration6 == 0 { - goto AlternationBranch10 + if iteration3 == 0 { + goto AlternationBranch9 } - slice = slice[iteration6:] - pos += iteration6 + slice = slice[iteration3:] + pos += iteration3 - charloop_ending_pos2 = pos - charloop_starting_pos2++ - goto CharLoopEnd2 + charloop_ending_pos = pos + charloop_starting_pos++ + goto CharLoopEnd -CharLoopBacktrack2: +CharLoopBacktrack: if err := r.CheckTimeout(); err != nil { return err } - if charloop_starting_pos2 >= charloop_ending_pos2 { - goto AlternationBranch10 + if charloop_starting_pos >= charloop_ending_pos { + goto AlternationBranch9 } - charloop_ending_pos2-- - pos = charloop_ending_pos2 + charloop_ending_pos-- + pos = charloop_ending_pos slice = r.Runtext[pos:] -CharLoopEnd2: +CharLoopEnd: // Node: NegLook // Zero-width negative lookahead @@ -772,7 +772,7 @@ CharLoopEnd2: goto NegativeLookaroundMatch } - goto CharLoopBacktrack2 + goto CharLoopBacktrack NegativeLookaroundMatch: pos = negativelookahead_starting_pos @@ -780,24 +780,24 @@ NegativeLookaroundMatch: goto AlternationMatch -AlternationBranch10: +AlternationBranch9: pos = alternation_starting_pos slice = r.Runtext[pos:] - // Branch 6 + // Branch 5 // Node: SetloopAtomic(Set = [\s])(Min = 1, Max = inf) // Match [\s] atomically at least once. - iteration7 = 0 - for iteration7 < len(slice) && unicode.IsSpace(slice[iteration7]) { - iteration7++ + iteration4 = 0 + for iteration4 < len(slice) && unicode.IsSpace(slice[iteration4]) { + iteration4++ } - if iteration7 == 0 { + if iteration4 == 0 { return nil // The input didn't match. } - slice = slice[iteration7:] - pos += iteration7 + slice = slice[iteration4:] + pos += iteration4 AlternationMatch: ; @@ -1536,18 +1536,18 @@ AlternationMatch: return nil } -// The set [^\s\p{L}\p{N}] -var set_93b362dc942d41f83c0905ca6229c31c41f863d4ab4d2d9e17dae21b5b922113 = syntax.NewCharSetRuntime("\x01\x00\x00\x00\x00\x03\x00\x00\x00\x01 \x01L\x01N") - // The set [R-Tr-tſ] var set_4a1357005dba18ced5af0bde8202a98fab7a3500c675e3cb1c60597d7a36436a = syntax.NewCharSetRuntime("\x00\x03\x00\x00\x00\x00\x00\x00\x00RTrtſſ") // The set [^\n\r\p{L}\p{N}] var set_4a7765fc40e8e8c7561121585d1545f9a51b44bf010a50f8b1b086a02ec1f07f = syntax.NewCharSetRuntime("\x01\x02\x00\x00\x00\x02\x00\x00\x00\n\n\r\r\x01L\x01N") +// The set [^\s\p{L}\p{N}] +var set_93b362dc942d41f83c0905ca6229c31c41f863d4ab4d2d9e17dae21b5b922113 = syntax.NewCharSetRuntime("\x01\x00\x00\x00\x00\x03\x00\x00\x00\x01 \x01L\x01N") + func init() { - regexp2.RegisterEngine("'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+", regexp2.None, &splitRegexp_Engine{}) - regexp2.RegisterEngine("(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", regexp2.None, &splitRegexp_2_Engine{}) + regexp2.RegisterEngine("(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", regexp2.None, &splitRegexp_Engine{}) + regexp2.RegisterEngine("'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+", regexp2.None, &splitRegexp_2_Engine{}) regexp2.RegisterEngine("[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", regexp2.None, &splitRegexp_3_Engine{}) var _ = helpers.Min var _ = syntax.NewCharSetRuntime diff --git a/codec/regexp.go b/codec/regexp.go new file mode 100644 index 0000000..d501026 --- /dev/null +++ b/codec/regexp.go @@ -0,0 +1,2 @@ +//go:generate go tool github.com/dlclark/regexp2cg -package codec -o regexp.gen.go +package codec diff --git a/go.mod b/go.mod index d14997a..f6ba237 100644 --- a/go.mod +++ b/go.mod @@ -4,9 +4,11 @@ go 1.21.4 toolchain go1.22.0 +tool github.com/dlclark/regexp2cg + require github.com/dlclark/regexp2 v1.11.5-0.20240806004527-5bbbed8ea10b require ( - github.com/dlclark/regexp2cg v0.1.0 // indirect + github.com/dlclark/regexp2cg v0.2.0 // indirect github.com/pkg/errors v0.9.1 // indirect ) diff --git a/go.sum b/go.sum index d15ad1e..c6600a1 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,6 @@ -github.com/dlclark/regexp2 v1.9.0 h1:pTK/l/3qYIKaRXuHnEnIf7Y5NxfRPfpb7dis6/gdlVI= -github.com/dlclark/regexp2 v1.9.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= github.com/dlclark/regexp2 v1.11.5-0.20240806004527-5bbbed8ea10b h1:AJKOdc+1fRSJ0/75Jty1npvxUUD0y7hQDg15LMAHhyU= github.com/dlclark/regexp2 v1.11.5-0.20240806004527-5bbbed8ea10b/go.mod h1:YvCrhrh/qlds8EhFKPtJprdXn5fWBllSw1qo99dZyiQ= -github.com/dlclark/regexp2cg v0.1.0 h1:SdeGspyGihv995cqVSv6UPOHjm/2J1pvQbEQlFDEzNA= -github.com/dlclark/regexp2cg v0.1.0/go.mod h1:n3VoFWNg5bIAwl8j4N/K74dsot/olhCCaIWufs8mUJI= +github.com/dlclark/regexp2cg v0.2.0 h1:YTk+oP9dO74myroxiopnf/zlGOSuTGIuYhRx769YFk4= +github.com/dlclark/regexp2cg v0.2.0/go.mod h1:K2c4ctxtSQjzgeMKKgi1rEflZVVJWZWlUUdmtjOp/y8= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=