diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index cd89848..74b42e9 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -19,7 +19,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v3 with: - go-version: "1.20" + go-version: "1.24" - name: Build run: go build -v ./... diff --git a/codec/cl100k_base.go b/codec/cl100k_base.go index db573f7..592fb08 100644 --- a/codec/cl100k_base.go +++ b/codec/cl100k_base.go @@ -4,10 +4,13 @@ import "github.com/dlclark/regexp2" func NewCl100kBase() *Codec { cl100kBaseVocabOnce.Do(cl100kBaseVocabInit) + + splitRegexp := regexp2.MustCompile(`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`, regexp2.None) + return &Codec{ name: "cl100k_base", vocabulary: cl100kBaseVocab, - splitRegexp: regexp2.MustCompile(`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`, regexp2.None), + splitRegexp: splitRegexp, specialTokens: map[string]uint{ "<|endoftext|>": 100257, "<|fim_prefix|>": 100258, diff --git a/codec/o200k_base.go b/codec/o200k_base.go index 81568bf..825bba8 100644 --- a/codec/o200k_base.go +++ b/codec/o200k_base.go @@ -4,17 +4,15 @@ import "github.com/dlclark/regexp2" func NewO200kBase() *Codec { o200kBaseVocabOnce.Do(o200kBaseVocabInit) + + splitRegexp := regexp2.MustCompile( + `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`, + regexp2.None) + return &Codec{ - name: "o200k_base", - vocabulary: o200kBaseVocab, - splitRegexp: regexp2.MustCompile( - `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|`+ - 
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|`+ - `\p{N}{1,3}|`+` ?[^\s\p{L}\p{N}]+[\r\n/]*|`+ - `\s*[\r\n]+|`+ - `\s+(?!\S)|`+ - `\s+`, - regexp2.None), + name: "o200k_base", + vocabulary: o200kBaseVocab, + splitRegexp: splitRegexp, specialTokens: map[string]uint{ "<|endoftext|>": 199999, "<|endofprompt|>": 200018, diff --git a/codec/p50k_base.go b/codec/p50k_base.go index 4b64a4b..453ef5a 100644 --- a/codec/p50k_base.go +++ b/codec/p50k_base.go @@ -4,10 +4,13 @@ import "github.com/dlclark/regexp2" func NewP50kBase() *Codec { p50kBaseVocabOnce.Do(p50kBaseVocabInit) + + splitRegexp := regexp2.MustCompile(`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`, regexp2.None) + return &Codec{ name: "p50k_base", vocabulary: p50kBaseVocab, - splitRegexp: regexp2.MustCompile(`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`, regexp2.None), + splitRegexp: splitRegexp, specialTokens: map[string]uint{ "<|endoftext|>": 50256, }, diff --git a/codec/p50k_edit.go b/codec/p50k_edit.go index 7a05aa8..1d1fd1c 100644 --- a/codec/p50k_edit.go +++ b/codec/p50k_edit.go @@ -4,10 +4,13 @@ import "github.com/dlclark/regexp2" func NewP50kEdit() *Codec { p50kBaseVocabOnce.Do(p50kBaseVocabInit) + + splitRegexp := regexp2.MustCompile(`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`, regexp2.None) + return &Codec{ name: "p50k_edit", vocabulary: p50kBaseVocab, - splitRegexp: regexp2.MustCompile(`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`, regexp2.None), + splitRegexp: splitRegexp, specialTokens: map[string]uint{ "<|endoftext|>": 50256, "<|fim_prefix|>": 50281, diff --git a/codec/r50k_base.go b/codec/r50k_base.go index 3c99721..4c792f8 100644 --- a/codec/r50k_base.go +++ b/codec/r50k_base.go @@ -4,10 +4,13 @@ import "github.com/dlclark/regexp2" func NewR50kBase() *Codec { r50kBaseVocabOnce.Do(r50kBaseVocabInit) + + 
splitRegexp := regexp2.MustCompile(`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`, regexp2.None) + return &Codec{ name: "r50k_base", vocabulary: r50kBaseVocab, - splitRegexp: regexp2.MustCompile(`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`, regexp2.None), + splitRegexp: splitRegexp, specialTokens: map[string]uint{ "<|endoftext|>": 50256, }, diff --git a/codec/regexp.gen.go b/codec/regexp.gen.go new file mode 100644 index 0000000..3acf8fa --- /dev/null +++ b/codec/regexp.gen.go @@ -0,0 +1,1555 @@ +package codec + +import ( + "github.com/dlclark/regexp2" + "github.com/dlclark/regexp2/helpers" + "github.com/dlclark/regexp2/syntax" + "unicode" +) + +/* +Capture(index = 0, unindex = -1) + Atomic + Alternate + Concatenate + One(Ch = ') + Atomic + Alternate + Set(Set = [R-Tr-tſ]) + Concatenate + Set(Set = [Rr]) + Set(Set = [Ee]) + Concatenate + Set(Set = [Vv]) + Set(Set = [Ee]) + Set(Set = [Mm]) + SetloopAtomic(Set = [Ll])(Min = 2, Max = 2) + Set(Set = [Dd]) + Concatenate + Setloop(Set = [^\n\r\p{L}\p{N}])(Min = 0, Max = 1) + SetloopAtomic(Set = [\p{L}])(Min = 1, Max = inf) + SetloopAtomic(Set = [\p{N}])(Min = 1, Max = 3) + Concatenate + OneloopAtomic(Ch = \ )(Min = 0, Max = 1) + SetloopAtomic(Set = [^\s\p{L}\p{N}])(Min = 1, Max = inf) + SetloopAtomic(Set = [\n\r])(Min = 0, Max = inf) + Concatenate + Setloop(Set = [\s])(Min = 0, Max = inf) + SetloopAtomic(Set = [\n\r])(Min = 1, Max = inf) + Concatenate + Setloop(Set = [\s])(Min = 1, Max = inf) + NegLook + Set(Set = [^\s]) + SetloopAtomic(Set = [\s])(Min = 1, Max = inf) +*/ +// From cl100k_base.go:8:37 +// Pattern: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" +// Options: regexp2.None +type splitRegexp_Engine struct{} + +func (splitRegexp_Engine) Caps() map[int]int { return nil } +func (splitRegexp_Engine) CapNames() map[string]int { return nil } +func (splitRegexp_Engine) 
CapsList() []string { return nil } +func (splitRegexp_Engine) CapSize() int { return 1 } + +func (splitRegexp_Engine) FindFirstChar(r *regexp2.Runner) bool { + pos := r.Runtextpos + // Empty matches aren't possible + if pos < len(r.Runtext) { + return true + } + + // No match found + r.Runtextpos = len(r.Runtext) + return false +} + +func (splitRegexp_Engine) Execute(r *regexp2.Runner) error { + atomic_stackpos := 0 + alternation_starting_pos := 0 + alternation_starting_pos1 := 0 + var charloop_starting_pos, charloop_ending_pos = 0, 0 + iteration := 0 + iteration1 := 0 + iteration2 := 0 + iteration3 := 0 + var charloop_starting_pos1, charloop_ending_pos1 = 0, 0 + iteration4 := 0 + iteration5 := 0 + var charloop_starting_pos2, charloop_ending_pos2 = 0, 0 + iteration6 := 0 + negativelookahead_starting_pos := 0 + iteration7 := 0 + pos := r.Runtextpos + matchStart := pos + + var slice = r.Runtext[pos:] + + // Node: Atomic + // Atomic group. + atomic_stackpos = r.Runstackpos + + // Node: Alternate + // Match with 7 alternative expressions, atomically. + alternation_starting_pos = pos + + // Branch 0 + // Node: Concatenate + // Node: One(Ch = ') + // Match '\''. + if len(slice) == 0 || slice[0] != '\'' { + goto AlternationBranch + } + + // Node: Atomic + // Node: Alternate + // Match with 6 alternative expressions, atomically. + alternation_starting_pos1 = pos + + // Branch 0 + // Node: Set(Set = [R-Tr-tſ]) + // Match [R-Tr-tſ]. NOTE(review): the pattern's single-letter suffixes are only 's and 't (case-insensitive); the set as printed also covers R/r, which would be accepted here before the "re" branch is tried. Confirm against the set definition and the generator output.
+ if len(slice) < 2 || !set_4a1357005dba18ced5af0bde8202a98fab7a3500c675e3cb1c60597d7a36436a.CharIn(slice[1]) { + goto AlternationBranch1 + } + + pos += 2 + slice = r.Runtext[pos:] + goto AlternationMatch1 + +AlternationBranch1: + pos = alternation_starting_pos1 + slice = r.Runtext[pos:] + + // Branch 1 + // Node: Concatenate + if len(slice) < 3 || + !helpers.StartsWithIgnoreCase(slice[1:], []rune("re")) /* Match the string "re" (case-insensitive) */ { + goto AlternationBranch2 + } + + pos += 3 + slice = r.Runtext[pos:] + goto AlternationMatch1 + +AlternationBranch2: + pos = alternation_starting_pos1 + slice = r.Runtext[pos:] + + // Branch 2 + // Node: Concatenate + if len(slice) < 3 || + !helpers.StartsWithIgnoreCase(slice[1:], []rune("ve")) /* Match the string "ve" (case-insensitive) */ { + goto AlternationBranch3 + } + + pos += 3 + slice = r.Runtext[pos:] + goto AlternationMatch1 + +AlternationBranch3: + pos = alternation_starting_pos1 + slice = r.Runtext[pos:] + + // Branch 3 + // Node: Set(Set = [Mm]) + // Match [Mm]. + if len(slice) < 2 || (slice[1]|0x20 != 'm') { + goto AlternationBranch4 + } + + pos += 2 + slice = r.Runtext[pos:] + goto AlternationMatch1 + +AlternationBranch4: + pos = alternation_starting_pos1 + slice = r.Runtext[pos:] + + // Branch 4 + // Node: SetloopAtomic(Set = [Ll])(Min = 2, Max = 2) + // Match [Ll] exactly 2 times. + if len(slice) < 3 || + (slice[1]|0x20 != 'l') || + (slice[2]|0x20 != 'l') { + goto AlternationBranch5 + } + + pos += 3 + slice = r.Runtext[pos:] + goto AlternationMatch1 + +AlternationBranch5: + pos = alternation_starting_pos1 + slice = r.Runtext[pos:] + + // Branch 5 + // Node: Set(Set = [Dd]) + // Match [Dd]. 
+ if len(slice) < 2 || (slice[1]|0x20 != 'd') { + goto AlternationBranch + } + + pos += 2 + slice = r.Runtext[pos:] + +AlternationMatch1: + ; + + goto AlternationMatch + +AlternationBranch: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 1 + // Node: Concatenate + // Node: Setloop(Set = [^\n\r\p{L}\p{N}])(Min = 0, Max = 1) + // Match [^\n\r\p{L}\p{N}] greedily, optionally. + charloop_starting_pos = pos + + if len(slice) > 0 && set_4a7765fc40e8e8c7561121585d1545f9a51b44bf010a50f8b1b086a02ec1f07f.CharIn(slice[0]) { + slice = slice[1:] + pos++ + } + + charloop_ending_pos = pos + goto CharLoopEnd + +CharLoopBacktrack: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos >= charloop_ending_pos { + goto AlternationBranch6 + } + charloop_ending_pos-- + pos = charloop_ending_pos + slice = r.Runtext[pos:] + +CharLoopEnd: + + // Node: SetloopAtomic(Set = [\p{L}])(Min = 1, Max = inf) + // Match [\p{L}] atomically at least once. + iteration = 0 + for iteration < len(slice) && unicode.In(slice[iteration], unicode.L) { + iteration++ + } + + if iteration == 0 { + goto CharLoopBacktrack + } + + slice = slice[iteration:] + pos += iteration + + goto AlternationMatch + +AlternationBranch6: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 2 + // Node: SetloopAtomic(Set = [\p{N}])(Min = 1, Max = 3) + // Match [\p{N}] atomically at least 1 and at most 3 times. + iteration1 = 0 + for iteration1 < 3 && iteration1 < len(slice) && unicode.In(slice[iteration1], unicode.N) { + iteration1++ + } + + if iteration1 == 0 { + goto AlternationBranch7 + } + + slice = slice[iteration1:] + pos += iteration1 + + goto AlternationMatch + +AlternationBranch7: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 3 + // Node: Concatenate + // Node: OneloopAtomic(Ch = \ )(Min = 0, Max = 1) + // Match ' ' atomically, optionally. 
+ if len(slice) > 0 && slice[0] == ' ' { + slice = slice[1:] + pos++ + } + + // Node: SetloopAtomic(Set = [^\s\p{L}\p{N}])(Min = 1, Max = inf) + // Match [^\s\p{L}\p{N}] atomically at least once. + iteration2 = 0 + for iteration2 < len(slice) && set_93b362dc942d41f83c0905ca6229c31c41f863d4ab4d2d9e17dae21b5b922113.CharIn(slice[iteration2]) { + iteration2++ + } + + if iteration2 == 0 { + goto AlternationBranch8 + } + + slice = slice[iteration2:] + pos += iteration2 + + // Node: SetloopAtomic(Set = [\n\r])(Min = 0, Max = inf) + // Match [\n\r] atomically any number of times. + iteration3 = helpers.IndexOfAnyExcept2(slice, '\n', '\r') + if iteration3 < 0 { + iteration3 = len(slice) + } + + slice = slice[iteration3:] + pos += iteration3 + + goto AlternationMatch + +AlternationBranch8: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 4 + // Node: Concatenate + // Node: Setloop(Set = [\s])(Min = 0, Max = inf) + // Match [\s] greedily any number of times. + charloop_starting_pos1 = pos + + iteration4 = 0 + for iteration4 < len(slice) && unicode.IsSpace(slice[iteration4]) { + iteration4++ + } + + slice = slice[iteration4:] + pos += iteration4 + + charloop_ending_pos1 = pos + goto CharLoopEnd1 + +CharLoopBacktrack1: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos1 >= charloop_ending_pos1 { + goto AlternationBranch9 + } + charloop_ending_pos1 = helpers.IndexOfAny2(r.Runtext[charloop_starting_pos1:charloop_ending_pos1], '\n', '\r') + if charloop_ending_pos1 < 0 { // miss + goto AlternationBranch9 + } + charloop_ending_pos1 += charloop_starting_pos1 + pos = charloop_ending_pos1 + slice = r.Runtext[pos:] + +CharLoopEnd1: + + // Node: SetloopAtomic(Set = [\n\r])(Min = 1, Max = inf) + // Match [\n\r] atomically at least once. 
+ iteration5 = helpers.IndexOfAnyExcept2(slice, '\n', '\r') + if iteration5 < 0 { + iteration5 = len(slice) + } + + if iteration5 == 0 { + goto CharLoopBacktrack1 + } + + slice = slice[iteration5:] + pos += iteration5 + + goto AlternationMatch + +AlternationBranch9: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 5 + // Node: Concatenate + // Node: Setloop(Set = [\s])(Min = 1, Max = inf) + // Match [\s] greedily at least once. + charloop_starting_pos2 = pos + + iteration6 = 0 + for iteration6 < len(slice) && unicode.IsSpace(slice[iteration6]) { + iteration6++ + } + + if iteration6 == 0 { + goto AlternationBranch10 + } + + slice = slice[iteration6:] + pos += iteration6 + + charloop_ending_pos2 = pos + charloop_starting_pos2++ + goto CharLoopEnd2 + +CharLoopBacktrack2: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos2 >= charloop_ending_pos2 { + goto AlternationBranch10 + } + charloop_ending_pos2-- + pos = charloop_ending_pos2 + slice = r.Runtext[pos:] + +CharLoopEnd2: + + // Node: NegLook + // Zero-width negative lookahead + negativelookahead_starting_pos = pos + + if err := r.CheckTimeout(); err != nil { + return err + } + // Node: Set(Set = [^\s]) + // Match [^\s]. + if len(slice) == 0 || unicode.IsSpace(slice[0]) { + goto NegativeLookaroundMatch + } + + goto CharLoopBacktrack2 + +NegativeLookaroundMatch: + pos = negativelookahead_starting_pos + slice = r.Runtext[pos:] + + goto AlternationMatch + +AlternationBranch10: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 6 + // Node: SetloopAtomic(Set = [\s])(Min = 1, Max = inf) + // Match [\s] atomically at least once. + iteration7 = 0 + for iteration7 < len(slice) && unicode.IsSpace(slice[iteration7]) { + iteration7++ + } + + if iteration7 == 0 { + return nil // The input didn't match. + } + + slice = slice[iteration7:] + pos += iteration7 + +AlternationMatch: + ; + + r.Runstackpos = atomic_stackpos + + // The input matched. 
+ r.Runtextpos = pos + r.Capture(0, matchStart, pos) + // just to prevent an unused var error in certain regex's + var _ = slice + return nil +} + +/* +Capture(index = 0, unindex = -1) + Atomic + Alternate + Concatenate + One(Ch = ') + Atomic + Alternate + Set(Set = [r-t]) + Multi(String = "re") + Multi(String = "ve") + One(Ch = m) + Multi(String = "ll") + One(Ch = d) + Concatenate + OneloopAtomic(Ch = \ )(Min = 0, Max = 1) + SetloopAtomic(Set = [\p{L}])(Min = 1, Max = inf) + Concatenate + OneloopAtomic(Ch = \ )(Min = 0, Max = 1) + SetloopAtomic(Set = [\p{N}])(Min = 1, Max = inf) + Concatenate + OneloopAtomic(Ch = \ )(Min = 0, Max = 1) + SetloopAtomic(Set = [^\s\p{L}\p{N}])(Min = 1, Max = inf) + Concatenate + Setloop(Set = [\s])(Min = 1, Max = inf) + NegLook + Set(Set = [^\s]) + SetloopAtomic(Set = [\s])(Min = 1, Max = inf) +*/ +// From p50k_base.go:8:37 +// Pattern: "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+" +// Options: regexp2.None +type splitRegexp_2_Engine struct{} + +func (splitRegexp_2_Engine) Caps() map[int]int { return nil } +func (splitRegexp_2_Engine) CapNames() map[string]int { return nil } +func (splitRegexp_2_Engine) CapsList() []string { return nil } +func (splitRegexp_2_Engine) CapSize() int { return 1 } + +func (splitRegexp_2_Engine) FindFirstChar(r *regexp2.Runner) bool { + pos := r.Runtextpos + // Empty matches aren't possible + if pos < len(r.Runtext) { + return true + } + + // No match found + r.Runtextpos = len(r.Runtext) + return false +} + +func (splitRegexp_2_Engine) Execute(r *regexp2.Runner) error { + atomic_stackpos := 0 + alternation_starting_pos := 0 + alternation_starting_pos1 := 0 + iteration := 0 + iteration1 := 0 + iteration2 := 0 + var charloop_starting_pos, charloop_ending_pos = 0, 0 + iteration3 := 0 + negativelookahead_starting_pos := 0 + iteration4 := 0 + pos := r.Runtextpos + matchStart := pos + + var slice = r.Runtext[pos:] + + // Node: Atomic + // Atomic group. 
+ atomic_stackpos = r.Runstackpos + + // Node: Alternate + // Match with 6 alternative expressions, atomically. + alternation_starting_pos = pos + + // Branch 0 + // Node: Concatenate + // Node: One(Ch = ') + // Match '\''. + if len(slice) == 0 || slice[0] != '\'' { + goto AlternationBranch + } + + // Node: Atomic + // Node: Alternate + // Match with 6 alternative expressions, atomically. + alternation_starting_pos1 = pos + + // Branch 0 + // Node: Set(Set = [r-t]) + // Match [r-t]. NOTE(review): the source pattern lists only 's and 't as single-letter suffixes; if IsBetween is inclusive, [r-t] also accepts 'r' here before the "re" branch below is tried. Confirm the helper's bounds and the generator output. + if len(slice) < 2 || !helpers.IsBetween(slice[1], 'r', 't') { + goto AlternationBranch1 + } + + pos += 2 + slice = r.Runtext[pos:] + goto AlternationMatch1 + +AlternationBranch1: + pos = alternation_starting_pos1 + slice = r.Runtext[pos:] + + // Branch 1 + // Node: Multi(String = "re") + // Match the string "re". + if !helpers.StartsWith(slice[1:], []rune("re")) { + goto AlternationBranch2 + } + + pos += 3 + slice = r.Runtext[pos:] + goto AlternationMatch1 + +AlternationBranch2: + pos = alternation_starting_pos1 + slice = r.Runtext[pos:] + + // Branch 2 + // Node: Multi(String = "ve") + // Match the string "ve". + if !helpers.StartsWith(slice[1:], []rune("ve")) { + goto AlternationBranch3 + } + + pos += 3 + slice = r.Runtext[pos:] + goto AlternationMatch1 + +AlternationBranch3: + pos = alternation_starting_pos1 + slice = r.Runtext[pos:] + + // Branch 3 + // Node: One(Ch = m) + // Match 'm'. + if len(slice) < 2 || slice[1] != 'm' { + goto AlternationBranch4 + } + + pos += 2 + slice = r.Runtext[pos:] + goto AlternationMatch1 + +AlternationBranch4: + pos = alternation_starting_pos1 + slice = r.Runtext[pos:] + + // Branch 4 + // Node: Multi(String = "ll") + // Match the string "ll". + if !helpers.StartsWith(slice[1:], []rune("ll")) { + goto AlternationBranch5 + } + + pos += 3 + slice = r.Runtext[pos:] + goto AlternationMatch1 + +AlternationBranch5: + pos = alternation_starting_pos1 + slice = r.Runtext[pos:] + + // Branch 5 + // Node: One(Ch = d) + // Match 'd'. 
+ if len(slice) < 2 || slice[1] != 'd' { + goto AlternationBranch + } + + pos += 2 + slice = r.Runtext[pos:] + +AlternationMatch1: + ; + + goto AlternationMatch + +AlternationBranch: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 1 + // Node: Concatenate + // Node: OneloopAtomic(Ch = \ )(Min = 0, Max = 1) + // Match ' ' atomically, optionally. + if len(slice) > 0 && slice[0] == ' ' { + slice = slice[1:] + pos++ + } + + // Node: SetloopAtomic(Set = [\p{L}])(Min = 1, Max = inf) + // Match [\p{L}] atomically at least once. + iteration = 0 + for iteration < len(slice) && unicode.In(slice[iteration], unicode.L) { + iteration++ + } + + if iteration == 0 { + goto AlternationBranch6 + } + + slice = slice[iteration:] + pos += iteration + + goto AlternationMatch + +AlternationBranch6: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 2 + // Node: Concatenate + // Node: OneloopAtomic(Ch = \ )(Min = 0, Max = 1) + // Match ' ' atomically, optionally. + if len(slice) > 0 && slice[0] == ' ' { + slice = slice[1:] + pos++ + } + + // Node: SetloopAtomic(Set = [\p{N}])(Min = 1, Max = inf) + // Match [\p{N}] atomically at least once. + iteration1 = 0 + for iteration1 < len(slice) && unicode.In(slice[iteration1], unicode.N) { + iteration1++ + } + + if iteration1 == 0 { + goto AlternationBranch7 + } + + slice = slice[iteration1:] + pos += iteration1 + + goto AlternationMatch + +AlternationBranch7: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 3 + // Node: Concatenate + // Node: OneloopAtomic(Ch = \ )(Min = 0, Max = 1) + // Match ' ' atomically, optionally. + if len(slice) > 0 && slice[0] == ' ' { + slice = slice[1:] + pos++ + } + + // Node: SetloopAtomic(Set = [^\s\p{L}\p{N}])(Min = 1, Max = inf) + // Match [^\s\p{L}\p{N}] atomically at least once. 
+ iteration2 = 0 + for iteration2 < len(slice) && set_93b362dc942d41f83c0905ca6229c31c41f863d4ab4d2d9e17dae21b5b922113.CharIn(slice[iteration2]) { + iteration2++ + } + + if iteration2 == 0 { + goto AlternationBranch8 + } + + slice = slice[iteration2:] + pos += iteration2 + + goto AlternationMatch + +AlternationBranch8: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 4 + // Node: Concatenate + // Node: Setloop(Set = [\s])(Min = 1, Max = inf) + // Match [\s] greedily at least once. + charloop_starting_pos = pos + + iteration3 = 0 + for iteration3 < len(slice) && unicode.IsSpace(slice[iteration3]) { + iteration3++ + } + + if iteration3 == 0 { + goto AlternationBranch9 + } + + slice = slice[iteration3:] + pos += iteration3 + + charloop_ending_pos = pos + charloop_starting_pos++ + goto CharLoopEnd + +CharLoopBacktrack: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos >= charloop_ending_pos { + goto AlternationBranch9 + } + charloop_ending_pos-- + pos = charloop_ending_pos + slice = r.Runtext[pos:] + +CharLoopEnd: + + // Node: NegLook + // Zero-width negative lookahead + negativelookahead_starting_pos = pos + + if err := r.CheckTimeout(); err != nil { + return err + } + // Node: Set(Set = [^\s]) + // Match [^\s]. + if len(slice) == 0 || unicode.IsSpace(slice[0]) { + goto NegativeLookaroundMatch + } + + goto CharLoopBacktrack + +NegativeLookaroundMatch: + pos = negativelookahead_starting_pos + slice = r.Runtext[pos:] + + goto AlternationMatch + +AlternationBranch9: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 5 + // Node: SetloopAtomic(Set = [\s])(Min = 1, Max = inf) + // Match [\s] atomically at least once. + iteration4 = 0 + for iteration4 < len(slice) && unicode.IsSpace(slice[iteration4]) { + iteration4++ + } + + if iteration4 == 0 { + return nil // The input didn't match. 
+ } + + slice = slice[iteration4:] + pos += iteration4 + +AlternationMatch: + ; + + r.Runstackpos = atomic_stackpos + + // The input matched. + r.Runtextpos = pos + r.Capture(0, matchStart, pos) + // just to prevent an unused var error in certain regex's + var _ = slice + return nil +} + +/* +Capture(index = 0, unindex = -1) + Atomic + Alternate + Concatenate + Setloop(Set = [^\n\r\p{L}\p{N}])(Min = 0, Max = 1) + Setloop(Set = [\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}])(Min = 0, Max = inf) + Setloop(Set = [\p{Ll}\p{Lm}\p{Lo}\p{M}])(Min = 1, Max = inf) + Atomic + Loop(Min = 0, Max = 1) + Concatenate + One(Ch = ') + Atomic + Alternate + Set(Set = [STstſ]) + Concatenate + Set(Set = [Rr]) + Set(Set = [Ee]) + Concatenate + Set(Set = [Vv]) + Set(Set = [Ee]) + Set(Set = [Mm]) + SetloopAtomic(Set = [Ll])(Min = 2, Max = 2) + Set(Set = [Dd]) + Concatenate + Setloop(Set = [^\n\r\p{L}\p{N}])(Min = 0, Max = 1) + Setloop(Set = [\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}])(Min = 1, Max = inf) + Setloop(Set = [\p{Ll}\p{Lm}\p{Lo}\p{M}])(Min = 0, Max = inf) + Atomic + Loop(Min = 0, Max = 1) + Concatenate + One(Ch = ') + Atomic + Alternate + Set(Set = [STstſ]) + Concatenate + Set(Set = [Rr]) + Set(Set = [Ee]) + Concatenate + Set(Set = [Vv]) + Set(Set = [Ee]) + Set(Set = [Mm]) + SetloopAtomic(Set = [Ll])(Min = 2, Max = 2) + Set(Set = [Dd]) + SetloopAtomic(Set = [\p{N}])(Min = 1, Max = 3) + Concatenate + OneloopAtomic(Ch = \ )(Min = 0, Max = 1) + SetloopAtomic(Set = [^\s\p{L}\p{N}])(Min = 1, Max = inf) + SetloopAtomic(Set = [\n\r/])(Min = 0, Max = inf) + Concatenate + Setloop(Set = [\s])(Min = 0, Max = inf) + SetloopAtomic(Set = [\n\r])(Min = 1, Max = inf) + Concatenate + Setloop(Set = [\s])(Min = 1, Max = inf) + NegLook + Set(Set = [^\s]) + SetloopAtomic(Set = [\s])(Min = 1, Max = inf) +*/ +// From o200k_base.go:9:3 +// Pattern: 
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" +// Options: regexp2.None +type splitRegexp_3_Engine struct{} + +func (splitRegexp_3_Engine) Caps() map[int]int { return nil } +func (splitRegexp_3_Engine) CapNames() map[string]int { return nil } +func (splitRegexp_3_Engine) CapsList() []string { return nil } +func (splitRegexp_3_Engine) CapSize() int { return 1 } + +func (splitRegexp_3_Engine) FindFirstChar(r *regexp2.Runner) bool { + pos := r.Runtextpos + // Empty matches aren't possible + if pos < len(r.Runtext) { + return true + } + + // No match found + r.Runtextpos = len(r.Runtext) + return false +} + +func (splitRegexp_3_Engine) Execute(r *regexp2.Runner) error { + atomic_stackpos := 0 + alternation_starting_pos := 0 + var charloop_starting_pos, charloop_ending_pos = 0, 0 + var charloop_starting_pos1, charloop_ending_pos1 = 0, 0 + iteration := 0 + var charloop_starting_pos2, charloop_ending_pos2 = 0, 0 + iteration1 := 0 + loop_iteration := 0 + startingStackpos := 0 + var charloop_starting_pos3, charloop_ending_pos3 = 0, 0 + var charloop_starting_pos4, charloop_ending_pos4 = 0, 0 + iteration2 := 0 + var charloop_starting_pos5, charloop_ending_pos5 = 0, 0 + iteration3 := 0 + loop_iteration1 := 0 + startingStackpos1 := 0 + iteration4 := 0 + iteration5 := 0 + iteration6 := 0 + var charloop_starting_pos6, charloop_ending_pos6 = 0, 0 + iteration7 := 0 + iteration8 := 0 + var charloop_starting_pos7, charloop_ending_pos7 = 0, 0 + iteration9 := 0 + negativelookahead_starting_pos := 0 + iteration10 := 0 + pos := r.Runtextpos + matchStart := pos + + var slice = r.Runtext[pos:] + + // Node: Atomic + // Atomic group. 
+ atomic_stackpos = r.Runstackpos + + // Node: Alternate + // Match with 7 alternative expressions, atomically. + alternation_starting_pos = pos + + // Branch 0 + // Node: Concatenate + // Node: Setloop(Set = [^\n\r\p{L}\p{N}])(Min = 0, Max = 1) + // Match [^\n\r\p{L}\p{N}] greedily, optionally. + charloop_starting_pos = pos + + if len(slice) > 0 && set_4a7765fc40e8e8c7561121585d1545f9a51b44bf010a50f8b1b086a02ec1f07f.CharIn(slice[0]) { + slice = slice[1:] + pos++ + } + + charloop_ending_pos = pos + goto CharLoopEnd + +CharLoopBacktrack: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos >= charloop_ending_pos { + goto AlternationBranch + } + charloop_ending_pos-- + pos = charloop_ending_pos + slice = r.Runtext[pos:] + +CharLoopEnd: + + // Node: Setloop(Set = [\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}])(Min = 0, Max = inf) + // Match [\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}] greedily any number of times. + charloop_starting_pos1 = pos + + iteration = 0 + for iteration < len(slice) && unicode.In(slice[iteration], unicode.Lu, unicode.Lt, unicode.Lm, unicode.Lo, unicode.M) { + iteration++ + } + + slice = slice[iteration:] + pos += iteration + + charloop_ending_pos1 = pos + goto CharLoopEnd1 + +CharLoopBacktrack1: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos1 >= charloop_ending_pos1 { + goto CharLoopBacktrack + } + charloop_ending_pos1-- + pos = charloop_ending_pos1 + slice = r.Runtext[pos:] + +CharLoopEnd1: + + // Node: Setloop(Set = [\p{Ll}\p{Lm}\p{Lo}\p{M}])(Min = 1, Max = inf) + // Match [\p{Ll}\p{Lm}\p{Lo}\p{M}] greedily at least once. 
+ charloop_starting_pos2 = pos + + iteration1 = 0 + for iteration1 < len(slice) && unicode.In(slice[iteration1], unicode.Ll, unicode.Lm, unicode.Lo, unicode.M) { + iteration1++ + } + + if iteration1 == 0 { + goto CharLoopBacktrack1 + } + + slice = slice[iteration1:] + pos += iteration1 + + charloop_ending_pos2 = pos + charloop_starting_pos2++ + goto CharLoopEnd2 + +CharLoopBacktrack2: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos2 >= charloop_ending_pos2 { + goto CharLoopBacktrack1 + } + charloop_ending_pos2 = helpers.LastIndexOfAny1(r.Runtext[charloop_starting_pos2:charloop_ending_pos2], '\'') + if charloop_ending_pos2 < 0 { // miss + goto CharLoopBacktrack1 + } + charloop_ending_pos2 += charloop_starting_pos2 + pos = charloop_ending_pos2 + slice = r.Runtext[pos:] + +CharLoopEnd2: + + // Node: Atomic + // Node: Loop(Min = 0, Max = 1) + // Optional (greedy). + startingStackpos = r.Runstackpos + loop_iteration = 0 + +LoopBody: + ; + r.StackPush(pos) + + loop_iteration++ + + // Node: Concatenate + // Node: One(Ch = ') + // Match '\”. + if len(slice) == 0 || slice[0] != '\'' { + goto LoopIterationNoMatch + } + + // Node: Atomic + // Node: Alternate + // Match with 6 alternative expressions, atomically. + if len(slice) < 2 { + goto LoopIterationNoMatch + } + + switch slice[1] { + case 'S', 'T', 's', 't', 'ſ': + pos += 2 + slice = r.Runtext[pos:] + + case 'R', 'r': + // Node: Concatenate + // Node: Empty + + // Node: Set(Set = [Ee]) + // Match [Ee]. + if len(slice) < 3 || (slice[2]|0x20 != 'e') { + goto LoopIterationNoMatch + } + + pos += 3 + slice = r.Runtext[pos:] + + case 'V', 'v': + // Node: Concatenate + // Node: Empty + + // Node: Set(Set = [Ee]) + // Match [Ee]. 
+ if len(slice) < 3 || (slice[2]|0x20 != 'e') { + goto LoopIterationNoMatch + } + + pos += 3 + slice = r.Runtext[pos:] + + case 'M', 'm': + pos += 2 + slice = r.Runtext[pos:] + + case 'L', 'l': + // Node: SetloopAtomic(Set = [Ll])(Min = 2, Max = 2) + // Match [Ll] exactly 2 times. + if len(slice) < 3 || + (slice[1]|0x20 != 'l') || + (slice[2]|0x20 != 'l') { + goto LoopIterationNoMatch + } + + pos += 3 + slice = r.Runtext[pos:] + + case 'D', 'd': + pos += 2 + slice = r.Runtext[pos:] + + default: + goto LoopIterationNoMatch + } + + // The loop has an upper bound of 1. Continue iterating greedily if it hasn't yet been reached. + if loop_iteration == 0 { + goto LoopBody + } + goto LoopEnd + + // The loop iteration failed. Put state back to the way it was before the iteration. +LoopIterationNoMatch: + loop_iteration-- + if loop_iteration < 0 { + // Unable to match the remainder of the expression after exhausting the loop. + goto CharLoopBacktrack2 + } + pos = r.StackPop() + slice = r.Runtext[pos:] +LoopEnd: + r.Runstackpos = startingStackpos // Ensure any remaining backtracking state is removed. + + goto AlternationMatch + +AlternationBranch: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 1 + // Node: Concatenate + // Node: Setloop(Set = [^\n\r\p{L}\p{N}])(Min = 0, Max = 1) + // Match [^\n\r\p{L}\p{N}] greedily, optionally. 
+ charloop_starting_pos3 = pos + + if len(slice) > 0 && set_4a7765fc40e8e8c7561121585d1545f9a51b44bf010a50f8b1b086a02ec1f07f.CharIn(slice[0]) { + slice = slice[1:] + pos++ + } + + charloop_ending_pos3 = pos + goto CharLoopEnd3 + +CharLoopBacktrack3: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos3 >= charloop_ending_pos3 { + goto AlternationBranch1 + } + charloop_ending_pos3-- + pos = charloop_ending_pos3 + slice = r.Runtext[pos:] + +CharLoopEnd3: + + // Node: Setloop(Set = [\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}])(Min = 1, Max = inf) + // Match [\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}] greedily at least once. + charloop_starting_pos4 = pos + + iteration2 = 0 + for iteration2 < len(slice) && unicode.In(slice[iteration2], unicode.Lu, unicode.Lt, unicode.Lm, unicode.Lo, unicode.M) { + iteration2++ + } + + if iteration2 == 0 { + goto CharLoopBacktrack3 + } + + slice = slice[iteration2:] + pos += iteration2 + + charloop_ending_pos4 = pos + charloop_starting_pos4++ + goto CharLoopEnd4 + +CharLoopBacktrack4: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos4 >= charloop_ending_pos4 { + goto CharLoopBacktrack3 + } + charloop_ending_pos4-- + pos = charloop_ending_pos4 + slice = r.Runtext[pos:] + +CharLoopEnd4: + + // Node: Setloop(Set = [\p{Ll}\p{Lm}\p{Lo}\p{M}])(Min = 0, Max = inf) + // Match [\p{Ll}\p{Lm}\p{Lo}\p{M}] greedily any number of times. 
+ charloop_starting_pos5 = pos + + iteration3 = 0 + for iteration3 < len(slice) && unicode.In(slice[iteration3], unicode.Ll, unicode.Lm, unicode.Lo, unicode.M) { + iteration3++ + } + + slice = slice[iteration3:] + pos += iteration3 + + charloop_ending_pos5 = pos + goto CharLoopEnd5 + +CharLoopBacktrack5: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos5 >= charloop_ending_pos5 { + goto CharLoopBacktrack4 + } + charloop_ending_pos5 = helpers.LastIndexOfAny1(r.Runtext[charloop_starting_pos5:charloop_ending_pos5], '\'') + if charloop_ending_pos5 < 0 { // miss + goto CharLoopBacktrack4 + } + charloop_ending_pos5 += charloop_starting_pos5 + pos = charloop_ending_pos5 + slice = r.Runtext[pos:] + +CharLoopEnd5: + + // Node: Atomic + // Node: Loop(Min = 0, Max = 1) + // Optional (greedy). + startingStackpos1 = r.Runstackpos + loop_iteration1 = 0 + +LoopBody1: + ; + r.StackPush(pos) + + loop_iteration1++ + + // Node: Concatenate + // Node: One(Ch = ') + // Match '\”. + if len(slice) == 0 || slice[0] != '\'' { + goto LoopIterationNoMatch1 + } + + // Node: Atomic + // Node: Alternate + // Match with 6 alternative expressions, atomically. + if len(slice) < 2 { + goto LoopIterationNoMatch1 + } + + switch slice[1] { + case 'S', 'T', 's', 't', 'ſ': + pos += 2 + slice = r.Runtext[pos:] + + case 'R', 'r': + // Node: Concatenate + // Node: Empty + + // Node: Set(Set = [Ee]) + // Match [Ee]. + if len(slice) < 3 || (slice[2]|0x20 != 'e') { + goto LoopIterationNoMatch1 + } + + pos += 3 + slice = r.Runtext[pos:] + + case 'V', 'v': + // Node: Concatenate + // Node: Empty + + // Node: Set(Set = [Ee]) + // Match [Ee]. + if len(slice) < 3 || (slice[2]|0x20 != 'e') { + goto LoopIterationNoMatch1 + } + + pos += 3 + slice = r.Runtext[pos:] + + case 'M', 'm': + pos += 2 + slice = r.Runtext[pos:] + + case 'L', 'l': + // Node: SetloopAtomic(Set = [Ll])(Min = 2, Max = 2) + // Match [Ll] exactly 2 times. 
+ if len(slice) < 3 || + (slice[1]|0x20 != 'l') || + (slice[2]|0x20 != 'l') { + goto LoopIterationNoMatch1 + } + + pos += 3 + slice = r.Runtext[pos:] + + case 'D', 'd': + pos += 2 + slice = r.Runtext[pos:] + + default: + goto LoopIterationNoMatch1 + } + + // The loop has an upper bound of 1. Continue iterating greedily if it hasn't yet been reached. + if loop_iteration1 == 0 { + goto LoopBody1 + } + goto LoopEnd1 + + // The loop iteration failed. Put state back to the way it was before the iteration. +LoopIterationNoMatch1: + loop_iteration1-- + if loop_iteration1 < 0 { + // Unable to match the remainder of the expression after exhausting the loop. + goto CharLoopBacktrack5 + } + pos = r.StackPop() + slice = r.Runtext[pos:] +LoopEnd1: + r.Runstackpos = startingStackpos1 // Ensure any remaining backtracking state is removed. + + goto AlternationMatch + +AlternationBranch1: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 2 + // Node: SetloopAtomic(Set = [\p{N}])(Min = 1, Max = 3) + // Match [\p{N}] atomically at least 1 and at most 3 times. + iteration4 = 0 + for iteration4 < 3 && iteration4 < len(slice) && unicode.In(slice[iteration4], unicode.N) { + iteration4++ + } + + if iteration4 == 0 { + goto AlternationBranch2 + } + + slice = slice[iteration4:] + pos += iteration4 + + goto AlternationMatch + +AlternationBranch2: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 3 + // Node: Concatenate + // Node: OneloopAtomic(Ch = \ )(Min = 0, Max = 1) + // Match ' ' atomically, optionally. + if len(slice) > 0 && slice[0] == ' ' { + slice = slice[1:] + pos++ + } + + // Node: SetloopAtomic(Set = [^\s\p{L}\p{N}])(Min = 1, Max = inf) + // Match [^\s\p{L}\p{N}] atomically at least once. 
+ iteration5 = 0 + for iteration5 < len(slice) && set_93b362dc942d41f83c0905ca6229c31c41f863d4ab4d2d9e17dae21b5b922113.CharIn(slice[iteration5]) { + iteration5++ + } + + if iteration5 == 0 { + goto AlternationBranch3 + } + + slice = slice[iteration5:] + pos += iteration5 + + // Node: SetloopAtomic(Set = [\n\r/])(Min = 0, Max = inf) + // Match [\n\r/] atomically any number of times. + iteration6 = helpers.IndexOfAnyExcept3(slice, '\n', '\r', '/') + if iteration6 < 0 { + iteration6 = len(slice) + } + + slice = slice[iteration6:] + pos += iteration6 + + goto AlternationMatch + +AlternationBranch3: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 4 + // Node: Concatenate + // Node: Setloop(Set = [\s])(Min = 0, Max = inf) + // Match [\s] greedily any number of times. + charloop_starting_pos6 = pos + + iteration7 = 0 + for iteration7 < len(slice) && unicode.IsSpace(slice[iteration7]) { + iteration7++ + } + + slice = slice[iteration7:] + pos += iteration7 + + charloop_ending_pos6 = pos + goto CharLoopEnd6 + +CharLoopBacktrack6: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos6 >= charloop_ending_pos6 { + goto AlternationBranch4 + } + charloop_ending_pos6 = helpers.IndexOfAny2(r.Runtext[charloop_starting_pos6:charloop_ending_pos6], '\n', '\r') + if charloop_ending_pos6 < 0 { // miss + goto AlternationBranch4 + } + charloop_ending_pos6 += charloop_starting_pos6 + pos = charloop_ending_pos6 + slice = r.Runtext[pos:] + +CharLoopEnd6: + + // Node: SetloopAtomic(Set = [\n\r])(Min = 1, Max = inf) + // Match [\n\r] atomically at least once. 
+ iteration8 = helpers.IndexOfAnyExcept2(slice, '\n', '\r') + if iteration8 < 0 { + iteration8 = len(slice) + } + + if iteration8 == 0 { + goto CharLoopBacktrack6 + } + + slice = slice[iteration8:] + pos += iteration8 + + goto AlternationMatch + +AlternationBranch4: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 5 + // Node: Concatenate + // Node: Setloop(Set = [\s])(Min = 1, Max = inf) + // Match [\s] greedily at least once. + charloop_starting_pos7 = pos + + iteration9 = 0 + for iteration9 < len(slice) && unicode.IsSpace(slice[iteration9]) { + iteration9++ + } + + if iteration9 == 0 { + goto AlternationBranch5 + } + + slice = slice[iteration9:] + pos += iteration9 + + charloop_ending_pos7 = pos + charloop_starting_pos7++ + goto CharLoopEnd7 + +CharLoopBacktrack7: + + if err := r.CheckTimeout(); err != nil { + return err + } + if charloop_starting_pos7 >= charloop_ending_pos7 { + goto AlternationBranch5 + } + charloop_ending_pos7-- + pos = charloop_ending_pos7 + slice = r.Runtext[pos:] + +CharLoopEnd7: + + // Node: NegLook + // Zero-width negative lookahead + negativelookahead_starting_pos = pos + + if err := r.CheckTimeout(); err != nil { + return err + } + // Node: Set(Set = [^\s]) + // Match [^\s]. + if len(slice) == 0 || unicode.IsSpace(slice[0]) { + goto NegativeLookaroundMatch + } + + goto CharLoopBacktrack7 + +NegativeLookaroundMatch: + pos = negativelookahead_starting_pos + slice = r.Runtext[pos:] + + goto AlternationMatch + +AlternationBranch5: + pos = alternation_starting_pos + slice = r.Runtext[pos:] + + // Branch 6 + // Node: SetloopAtomic(Set = [\s])(Min = 1, Max = inf) + // Match [\s] atomically at least once. + iteration10 = 0 + for iteration10 < len(slice) && unicode.IsSpace(slice[iteration10]) { + iteration10++ + } + + if iteration10 == 0 { + return nil // The input didn't match. + } + + slice = slice[iteration10:] + pos += iteration10 + +AlternationMatch: + ; + + r.Runstackpos = atomic_stackpos + + // The input matched. 
+ r.Runtextpos = pos + r.Capture(0, matchStart, pos) + // just to prevent an unused var error in certain regex's + var _ = slice + return nil +} + +// The set [R-Tr-tſ] +var set_4a1357005dba18ced5af0bde8202a98fab7a3500c675e3cb1c60597d7a36436a = syntax.NewCharSetRuntime("\x00\x03\x00\x00\x00\x00\x00\x00\x00RTrtſſ") + +// The set [^\n\r\p{L}\p{N}] +var set_4a7765fc40e8e8c7561121585d1545f9a51b44bf010a50f8b1b086a02ec1f07f = syntax.NewCharSetRuntime("\x01\x02\x00\x00\x00\x02\x00\x00\x00\n\n\r\r\x01L\x01N") + +// The set [^\s\p{L}\p{N}] +var set_93b362dc942d41f83c0905ca6229c31c41f863d4ab4d2d9e17dae21b5b922113 = syntax.NewCharSetRuntime("\x01\x00\x00\x00\x00\x03\x00\x00\x00\x01 \x01L\x01N") + +func init() { + regexp2.RegisterEngine("(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", regexp2.None, &splitRegexp_Engine{}) + regexp2.RegisterEngine("'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+", regexp2.None, &splitRegexp_2_Engine{}) + regexp2.RegisterEngine("[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", regexp2.None, &splitRegexp_3_Engine{}) + var _ = helpers.Min + var _ = syntax.NewCharSetRuntime + var _ = unicode.IsDigit +} diff --git a/codec/regexp.go b/codec/regexp.go new file mode 100644 index 0000000..d501026 --- /dev/null +++ b/codec/regexp.go @@ -0,0 +1,2 @@ +//go:generate go tool github.com/dlclark/regexp2cg -package codec -o regexp.gen.go +package codec diff --git a/go.mod b/go.mod index 31e0ea4..f6ba237 100644 --- a/go.mod +++ b/go.mod @@ -1,5 +1,14 @@ module github.com/tiktoken-go/tokenizer -go 1.20 +go 1.21.4 -require github.com/dlclark/regexp2 v1.9.0 +toolchain go1.22.0 + +tool 
github.com/dlclark/regexp2cg + +require github.com/dlclark/regexp2 v1.11.5-0.20240806004527-5bbbed8ea10b + +require ( + github.com/dlclark/regexp2cg v0.2.0 // indirect + github.com/pkg/errors v0.9.1 // indirect +) diff --git a/go.sum b/go.sum index fbc0ebc..c6600a1 100644 --- a/go.sum +++ b/go.sum @@ -1,2 +1,6 @@ -github.com/dlclark/regexp2 v1.9.0 h1:pTK/l/3qYIKaRXuHnEnIf7Y5NxfRPfpb7dis6/gdlVI= -github.com/dlclark/regexp2 v1.9.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= +github.com/dlclark/regexp2 v1.11.5-0.20240806004527-5bbbed8ea10b h1:AJKOdc+1fRSJ0/75Jty1npvxUUD0y7hQDg15LMAHhyU= +github.com/dlclark/regexp2 v1.11.5-0.20240806004527-5bbbed8ea10b/go.mod h1:YvCrhrh/qlds8EhFKPtJprdXn5fWBllSw1qo99dZyiQ= +github.com/dlclark/regexp2cg v0.2.0 h1:YTk+oP9dO74myroxiopnf/zlGOSuTGIuYhRx769YFk4= +github.com/dlclark/regexp2cg v0.2.0/go.mod h1:K2c4ctxtSQjzgeMKKgi1rEflZVVJWZWlUUdmtjOp/y8= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= diff --git a/tokenizer_test.go b/tokenizer_test.go index 7d4d726..3e64d48 100644 --- a/tokenizer_test.go +++ b/tokenizer_test.go @@ -1,208 +1,127 @@ package tokenizer_test import ( + "fmt" "testing" "github.com/tiktoken-go/tokenizer" ) -func TestO200kBaseEncoding(t *testing.T) { - tokenizer, err := tokenizer.Get(tokenizer.O200kBase) - if err != nil { - t.Fatalf("can't create tokenizer") - } - - tests := []struct { - text string - ids []uint - }{ - {text: "hello world", ids: []uint{24912, 2375}}, - {text: "hello world", ids: []uint{24912, 220, 2375}}, - {text: "hello world", ids: []uint{24912, 256, 2375}}, - {text: "supercalifragilistic", ids: []uint{17789, 5842, 366, 17764, 311, 6207}}, - {text: "We know what we are, but know not what we may be.", ids: []uint{2167, 1761, 1412, 581, 553, 11, 889, 1761, 625, 1412, 581, 1340, 413, 13}}, - } - - for _, test := range tests { - t.Run(test.text, func(t *testing.T) { - ids, _, err 
:= tokenizer.Encode(test.text) - if err != nil { - t.Fatalf("error encoding: %v", err) - } - - if !sliceEqual(ids, test.ids) { - t.Fatalf("input: %s want: %v got: %v", test.text, test.ids, ids) - } - - text, err := tokenizer.Decode(ids) - if err != nil { - t.Fatalf("error decoding: %v", err) - } - - if text != test.text { - t.Fatalf("input: %v want: %s got: %s", test.ids, test.text, text) - } - }) - } +type testTokenizer struct { + encoding tokenizer.Encoding + data []testTokenizerData } -func TestCl100kEncoding(t *testing.T) { - tokenizer, err := tokenizer.Get(tokenizer.Cl100kBase) - if err != nil { - t.Fatalf("can't create tokenizer") - } - - tests := []struct { - text string - ids []uint - }{ - {text: "hello world", ids: []uint{15339, 1917}}, - {text: "hello world", ids: []uint{15339, 220, 1917}}, - {text: "hello world", ids: []uint{15339, 256, 1917}}, - {text: "supercalifragilistic", ids: []uint{13066, 3035, 278, 333, 4193, 321, 4633}}, - {text: "We know what we are, but know not what we may be.", ids: []uint{1687, 1440, 1148, 584, 527, 11, 719, 1440, 539, 1148, 584, 1253, 387, 13}}, - } - - for _, test := range tests { - t.Run(test.text, func(t *testing.T) { - ids, _, err := tokenizer.Encode(test.text) - if err != nil { - t.Fatalf("error encoding: %v", err) - } - - if !sliceEqual(ids, test.ids) { - t.Fatalf("input: %s want: %v got: %v", test.text, test.ids, ids) - } - - text, err := tokenizer.Decode(ids) - if err != nil { - t.Fatalf("error decoding: %v", err) - } - - if text != test.text { - t.Fatalf("input: %v want: %s got: %s", test.ids, test.text, text) - } - }) - } +type testTokenizerData struct { + text string + ids []uint } -func TestR50kBaseEncoding(t *testing.T) { - tokenizer, err := tokenizer.Get(tokenizer.R50kBase) - if err != nil { - t.Fatalf("can't create tokenizer") +var ( + tokenizerTests = []testTokenizer{ + { + encoding: tokenizer.O200kBase, + data: []testTokenizerData{ + {text: "hello world", ids: []uint{24912, 2375}}, + {text: "hello world", 
ids: []uint{24912, 220, 2375}}, + {text: "hello world", ids: []uint{24912, 256, 2375}}, + {text: "supercalifragilistic", ids: []uint{17789, 5842, 366, 17764, 311, 6207}}, + {text: "We know what we are, but know not what we may be.", ids: []uint{2167, 1761, 1412, 581, 553, 11, 889, 1761, 625, 1412, 581, 1340, 413, 13}}, + }, + }, + { + encoding: tokenizer.Cl100kBase, + data: []testTokenizerData{ + {text: "hello world", ids: []uint{15339, 1917}}, + {text: "hello world", ids: []uint{15339, 220, 1917}}, + {text: "hello world", ids: []uint{15339, 256, 1917}}, + {text: "supercalifragilistic", ids: []uint{13066, 3035, 278, 333, 4193, 321, 4633}}, + {text: "We know what we are, but know not what we may be.", ids: []uint{1687, 1440, 1148, 584, 527, 11, 719, 1440, 539, 1148, 584, 1253, 387, 13}}, + }, + }, + { + encoding: tokenizer.R50kBase, + data: []testTokenizerData{ + {text: "hello world", ids: []uint{31373, 995}}, + {text: "hello world", ids: []uint{31373, 220, 995}}, + {text: "hello world", ids: []uint{31373, 220, 220, 995}}, + {text: "supercalifragilistic", ids: []uint{16668, 9948, 361, 22562, 346, 2569}}, + {text: "We know what we are, but know not what we may be.", ids: []uint{1135, 760, 644, 356, 389, 11, 475, 760, 407, 644, 356, 743, 307, 13}}, + }, + }, + { + encoding: tokenizer.P50kBase, + data: []testTokenizerData{ + {text: "hello world", ids: []uint{31373, 995}}, + {text: "hello world", ids: []uint{31373, 220, 995}}, + {text: "hello world", ids: []uint{31373, 50257, 995}}, + {text: "supercalifragilistic", ids: []uint{16668, 9948, 361, 22562, 346, 2569}}, + {text: "We know what we are, but know not what we may be.", ids: []uint{1135, 760, 644, 356, 389, 11, 475, 760, 407, 644, 356, 743, 307, 13}}, + }, + }, + { + encoding: tokenizer.P50kEdit, + data: []testTokenizerData{ + {text: "hello world", ids: []uint{31373, 995}}, + {text: "hello world", ids: []uint{31373, 220, 995}}, + {text: "hello world", ids: []uint{31373, 50257, 995}}, + {text: 
"supercalifragilistic", ids: []uint{16668, 9948, 361, 22562, 346, 2569}}, + {text: "We know what we are, but know not what we may be.", ids: []uint{1135, 760, 644, 356, 389, 11, 475, 760, 407, 644, 356, 743, 307, 13}}, + }, + }, } +) - tests := []struct { - text string - ids []uint - }{ - {text: "hello world", ids: []uint{31373, 995}}, - {text: "hello world", ids: []uint{31373, 220, 995}}, - {text: "hello world", ids: []uint{31373, 220, 220, 995}}, - {text: "supercalifragilistic", ids: []uint{16668, 9948, 361, 22562, 346, 2569}}, - {text: "We know what we are, but know not what we may be.", ids: []uint{1135, 760, 644, 356, 389, 11, 475, 760, 407, 644, 356, 743, 307, 13}}, - } +func TestTokenizer(t *testing.T) { + for _, test := range tokenizerTests { + tokenizer, err := tokenizer.Get(test.encoding) + if err != nil { + t.Fatalf("can't create tokenizer") + } - for _, test := range tests { - t.Run(test.text, func(t *testing.T) { - ids, _, err := tokenizer.Encode(test.text) - if err != nil { - t.Fatalf("error encoding: %v", err) - } - - if !sliceEqual(ids, test.ids) { - t.Fatalf("input: %s want: %v got: %v", test.text, test.ids, ids) - } - - text, err := tokenizer.Decode(ids) - if err != nil { - t.Fatalf("error decoding: %v", err) - } - - if text != test.text { - t.Fatalf("input: %v want: %s got: %s", test.ids, test.text, text) - } - }) + for _, data := range test.data { + t.Run(fmt.Sprintf("%s: %s", test.encoding, data.text), func(t *testing.T) { + ids, _, err := tokenizer.Encode(data.text) + if err != nil { + t.Fatalf("error encoding: %v", err) + } + + if !sliceEqual(ids, data.ids) { + t.Fatalf("input: %s want: %v got: %v", data.text, data.ids, ids) + } + + text, err := tokenizer.Decode(ids) + if err != nil { + t.Fatalf("error decoding: %v", err) + } + + if text != data.text { + t.Fatalf("input: %v want: %s got: %s", data.ids, data.text, text) + } + }) + } } } -func TestP50kBaseEncoding(t *testing.T) { - tokenizer, err := tokenizer.Get(tokenizer.P50kBase) - if err != 
nil { - t.Fatalf("can't create tokenizer") - } - - tests := []struct { - text string - ids []uint - }{ - {text: "hello world", ids: []uint{31373, 995}}, - {text: "hello world", ids: []uint{31373, 220, 995}}, - {text: "hello world", ids: []uint{31373, 50257, 995}}, - {text: "supercalifragilistic", ids: []uint{16668, 9948, 361, 22562, 346, 2569}}, - {text: "We know what we are, but know not what we may be.", ids: []uint{1135, 760, 644, 356, 389, 11, 475, 760, 407, 644, 356, 743, 307, 13}}, - } +var tokens []uint - for _, test := range tests { - t.Run(test.text, func(t *testing.T) { - ids, _, err := tokenizer.Encode(test.text) - if err != nil { - t.Fatalf("error encoding: %v", err) - } - - if !sliceEqual(ids, test.ids) { - t.Fatalf("input: %s want: %v got: %v", test.text, test.ids, ids) - } - - text, err := tokenizer.Decode(ids) - if err != nil { - t.Fatalf("error decoding: %v", err) - } - - if text != test.text { - t.Fatalf("input: %v want: %s got: %s", test.ids, test.text, text) - } - }) - } -} +func BenchmarkTokenizer(b *testing.B) { + for _, test := range tokenizerTests { + tokenizer, err := tokenizer.Get(test.encoding) + if err != nil { + b.Fatalf("can't create tokenizer") + } -func TestP50kEditEncoding(t *testing.T) { - tokenizer, err := tokenizer.Get(tokenizer.P50kEdit) - if err != nil { - t.Fatalf("can't create tokenizer") - } + for _, data := range test.data { + b.Run(fmt.Sprintf("%s: %s", test.encoding, data.text), func(b *testing.B) { + for i := 0; i < b.N; i++ { - tests := []struct { - text string - ids []uint - }{ - {text: "hello world", ids: []uint{31373, 995}}, - {text: "hello world", ids: []uint{31373, 220, 995}}, - {text: "hello world", ids: []uint{31373, 50257, 995}}, - {text: "supercalifragilistic", ids: []uint{16668, 9948, 361, 22562, 346, 2569}}, - {text: "We know what we are, but know not what we may be.", ids: []uint{1135, 760, 644, 356, 389, 11, 475, 760, 407, 644, 356, 743, 307, 13}}, - } + tokens, _, _ = tokenizer.Encode(data.text) + } - for 
_, test := range tests { - t.Run(test.text, func(t *testing.T) { - ids, _, err := tokenizer.Encode(test.text) - if err != nil { - t.Fatalf("error encoding: %v", err) - } - - if !sliceEqual(ids, test.ids) { - t.Fatalf("input: %s want: %v got: %v", test.text, test.ids, ids) - } - - text, err := tokenizer.Decode(ids) - if err != nil { - t.Fatalf("error decoding: %v", err) - } - - if text != test.text { - t.Fatalf("input: %v want: %s got: %s", test.ids, test.text, text) - } - }) + _ = tokens + }) + } } }