From 5fcaf2444fd793f3f96edde839397ea40587aff2 Mon Sep 17 00:00:00 2001
From: Matt Sherman
Date: Sun, 14 Sep 2025 14:52:42 -0400
Subject: [PATCH] Adopt uax29 segmenter

Replaces blevesearch/segment with github.com/clipperhouse/uax29/v2,
for roughly a 2x tokenization performance improvement.
---
 analysis/tokenizer/unicode/unicode.go      | 116 ++++++++++++++-------
 analysis/tokenizer/unicode/unicode_test.go |  28 -----
 go.mod                                     |   2 +-
 go.sum                                     |   4 +-
 4 files changed, 81 insertions(+), 69 deletions(-)

diff --git a/analysis/tokenizer/unicode/unicode.go b/analysis/tokenizer/unicode/unicode.go
index b694a3ee4..e60df9b72 100644
--- a/analysis/tokenizer/unicode/unicode.go
+++ b/analysis/tokenizer/unicode/unicode.go
@@ -15,7 +15,10 @@
 package unicode
 
 import (
-	"github.com/blevesearch/segment"
+	"unicode"
+	"unicode/utf8"
+
+	"github.com/clipperhouse/uax29/v2/words"
 
 	"github.com/blevesearch/bleve/v2/analysis"
 	"github.com/blevesearch/bleve/v2/registry"
@@ -37,7 +40,7 @@ func (rt *UnicodeTokenizer) Tokenize(input []byte) analysis.TokenStream {
 	ta := []analysis.Token(nil)
 	taNext := 0
 
-	segmenter := segment.NewWordSegmenterDirect(input)
+	segmenter := words.FromBytes(input)
 	start := 0
 	pos := 1
 
@@ -52,46 +55,48 @@
 		return remainingLen / avgSegmentLen
 	}
 
-	for segmenter.Segment() {
-		segmentBytes := segmenter.Bytes()
+	for segmenter.Next() {
+		segmentBytes := segmenter.Value()
+		if !alphaNumericBackwardsCompat(segmentBytes) {
+			continue
+		}
 		end := start + len(segmentBytes)
-		if segmenter.Type() != segment.None {
-			if taNext >= len(ta) {
-				remainingSegments := guessRemaining(end)
-				if remainingSegments > 1000 {
-					remainingSegments = 1000
-				}
-				if remainingSegments < 1 {
-					remainingSegments = 1
-				}
-
-				ta = make([]analysis.Token, remainingSegments)
-				taNext = 0
+		if taNext >= len(ta) {
+			remainingSegments := guessRemaining(end)
+			if remainingSegments > 1000 {
+				remainingSegments = 1000
+			}
+			if remainingSegments < 1 {
+				remainingSegments = 1
 			}
 
-			token := &ta[taNext]
-			taNext++
+			ta = make([]analysis.Token, remainingSegments)
+			taNext = 0
+		}
 
-			token.Term = segmentBytes
-			token.Start = start
-			token.End = end
-			token.Position = pos
-			token.Type = convertType(segmenter.Type())
+		token := &ta[taNext]
+		taNext++
 
-			if len(rv) >= cap(rv) { // When rv is full, save it into rvx.
-				rvx = append(rvx, rv)
+		token.Term = segmentBytes
+		token.Start = segmenter.Start()
+		token.End = segmenter.End()
+		token.Position = pos
+		token.Type = getType(segmentBytes)
 
-				rvCap := cap(rv) * 2
-				if rvCap > 256 {
-					rvCap = 256
-				}
+		if len(rv) >= cap(rv) { // When rv is full, save it into rvx.
+			rvx = append(rvx, rv)
 
-				rv = make(analysis.TokenStream, 0, rvCap) // Next rv cap is bigger.
+			rvCap := cap(rv) * 2
+			if rvCap > 256 {
+				rvCap = 256
 			}
 
-			rv = append(rv, token)
-			pos++
+			rv = make(analysis.TokenStream, 0, rvCap) // Next rv cap is bigger.
 		}
+
+		rv = append(rv, token)
+		pos++
+
 		start = end
 	}
 
@@ -121,14 +126,49 @@ func init() {
 	}
 }
 
-func convertType(segmentWordType int) analysis.TokenType {
-	switch segmentWordType {
-	case segment.Ideo:
-		return analysis.Ideographic
-	case segment.Kana:
+func getType(segment []byte) analysis.TokenType {
+	switch {
+	case words.BleveIdeographic(segment):
 		return analysis.Ideographic
-	case segment.Number:
+	case words.BleveNumeric(segment):
 		return analysis.Numeric
 	}
 	return analysis.AlphaNumeric
}
+
+// alphaNumeric reports whether the token contains at least one
+// rune that is a Letter or Number, as defined by Unicode.
+func alphaNumeric(token []byte) bool {
+	pos := 0
+	for pos < len(token) {
+		r, w := utf8.DecodeRune(token[pos:])
+		if unicode.IsLetter(r) || unicode.IsNumber(r) {
+			// we use these methods instead of unicode.In for
+			// performance; these methods have ASCII fast paths
+			return true
+		}
+		pos += w
+	}
+
+	return false
+}
+
+// alphaNumericBackwardsCompat reports whether the token contains at
+// least one rune that is a Letter or Number, as defined by Unicode.
+// It filters out Thai characters as the old segmenter did.
+func alphaNumericBackwardsCompat(token []byte) bool {
+	for pos := 0; pos < len(token); {
+		r, w := utf8.DecodeRune(token[pos:])
+
+		if unicode.IsLetter(r) || unicode.IsNumber(r) {
+			// Filter out Thai characters (except numbers) to match old segmenter behavior
+			if unicode.Is(unicode.Thai, r) && !unicode.IsNumber(r) {
+				return false
+			}
+			return true
+		}
+		pos += w
+	}
+
+	return false
+}
diff --git a/analysis/tokenizer/unicode/unicode_test.go b/analysis/tokenizer/unicode/unicode_test.go
index 0e0cfd4cc..c990a3eb9 100644
--- a/analysis/tokenizer/unicode/unicode_test.go
+++ b/analysis/tokenizer/unicode/unicode_test.go
@@ -19,7 +19,6 @@ import (
 	"testing"
 
 	"github.com/blevesearch/bleve/v2/analysis"
-	"github.com/blevesearch/segment"
 )
 
 func TestUnicode(t *testing.T) {
@@ -173,30 +172,3 @@ func BenchmarkTokenizeEnglishText(b *testing.B) {
 	}
 
 }
-
-func TestConvertType(t *testing.T) {
-	tests := []struct {
-		in  int
-		out analysis.TokenType
-	}{
-		{
-			segment.Ideo, analysis.Ideographic,
-		},
-		{
-			segment.Kana, analysis.Ideographic,
-		},
-		{
-			segment.Number, analysis.Numeric,
-		},
-		{
-			segment.Letter, analysis.AlphaNumeric,
-		},
-	}
-
-	for _, test := range tests {
-		actual := convertType(test.in)
-		if actual != test.out {
-			t.Errorf("expected %d, got %d for %d", test.out, actual, test.in)
-		}
-	}
-}
diff --git a/go.mod b/go.mod
index ca435f288..ca5e41080 100644
--- a/go.mod
+++ b/go.mod
@@ -15,7 +15,6 @@ require (
 	github.com/blevesearch/goleveldb v1.0.1
 	github.com/blevesearch/gtreap v0.1.1
 	github.com/blevesearch/scorch_segment_api/v2 v2.3.10
-	github.com/blevesearch/segment v0.9.1
 	github.com/blevesearch/snowball v0.6.1
 	github.com/blevesearch/snowballstem v0.9.0
 	github.com/blevesearch/stempel v0.2.0
@@ -27,6 +26,7 @@
 	github.com/blevesearch/zapx/v14 v14.4.2
 	github.com/blevesearch/zapx/v15 v15.4.2
 	github.com/blevesearch/zapx/v16 v16.2.4
+	github.com/clipperhouse/uax29/v2 v2.2.0
 	github.com/couchbase/moss v0.2.0
 	github.com/spf13/cobra v1.8.1
 	go.etcd.io/bbolt v1.4.0
diff --git a/go.sum b/go.sum
index fec5104f7..74ff24778 100644
--- a/go.sum
+++ b/go.sum
@@ -22,8 +22,6 @@ github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCD
 github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs=
 github.com/blevesearch/scorch_segment_api/v2 v2.3.10 h1:Yqk0XD1mE0fDZAJXTjawJ8If/85JxnLd8v5vG/jWE/s=
 github.com/blevesearch/scorch_segment_api/v2 v2.3.10/go.mod h1:Z3e6ChN3qyN35yaQpl00MfI5s8AxUJbpTR/DL8QOQ+8=
-github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU=
-github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw=
 github.com/blevesearch/snowball v0.6.1 h1:cDYjn/NCH+wwt2UdehaLpr2e4BwLIjN4V/TdLsL+B5A=
 github.com/blevesearch/snowball v0.6.1/go.mod h1:ZF0IBg5vgpeoUhnMza2v0A/z8m1cWPlwhke08LpNusg=
 github.com/blevesearch/snowballstem v0.9.0 h1:lMQ189YspGP6sXvZQ4WZ+MLawfV8wOmPoD/iWeNXm8s=
@@ -46,6 +44,8 @@ github.com/blevesearch/zapx/v15 v15.4.2 h1:sWxpDE0QQOTjyxYbAVjt3+0ieu8NCE0fDRaFx
 github.com/blevesearch/zapx/v15 v15.4.2/go.mod h1:1pssev/59FsuWcgSnTa0OeEpOzmhtmr/0/11H0Z8+Nw=
 github.com/blevesearch/zapx/v16 v16.2.4 h1:tGgfvleXTAkwsD5mEzgM3zCS/7pgocTCnO1oyAUjlww=
 github.com/blevesearch/zapx/v16 v16.2.4/go.mod h1:Rti/REtuuMmzwsI8/C/qIzRaEoSK/wiFYw5e5ctUKKs=
+github.com/clipperhouse/uax29/v2 v2.2.0 h1:ChwIKnQN3kcZteTXMgb1wztSgaU+ZemkgWdohwgs8tY=
+github.com/clipperhouse/uax29/v2 v2.2.0/go.mod h1:EFJ2TJMRUaplDxHKj1qAEhCtQPW2tJSwu5BF98AuoVM=
 github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps=
 github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k=
 github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o=
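
Note for reviewers, since the iterator API differs from the old
segmenter: words.FromBytes yields every UAX #29 segment, whitespace and
punctuation included, which is why Tokenize now filters candidates with
alphaNumericBackwardsCompat instead of checking a segment type. Below is
a minimal, self-contained sketch of the iteration pattern adopted above.
It assumes only the calls exercised in this patch (words.FromBytes,
Next, Value, Start, End); the input string is illustrative.

	package main

	import (
		"fmt"

		"github.com/clipperhouse/uax29/v2/words"
	)

	func main() {
		input := []byte("Hello, 世界! bleve 2.x")

		// Iterate UAX #29 word segments. Value returns the current
		// segment's bytes; Start and End are its byte offsets in the
		// original input, which Tokenize copies into token.Start and
		// token.End directly.
		segmenter := words.FromBytes(input)
		for segmenter.Next() {
			fmt.Printf("%q [%d:%d]\n", segmenter.Value(), segmenter.Start(), segmenter.End())
		}
	}

This prints one line per segment, including the whitespace and
punctuation runs; in the tokenizer those are dropped by the filter, and
only letter, number, and ideographic segments become tokens.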