From 5fcaf2444fd793f3f96edde839397ea40587aff2 Mon Sep 17 00:00:00 2001
From: Matt Sherman
Date: Sun, 14 Sep 2025 14:52:42 -0400
Subject: [PATCH] Adopt uax29 segmenter

Replaces blevesearch/segment with github.com/clipperhouse/uax29/v2,
for roughly a 2x tokenization performance improvement.
---
 analysis/tokenizer/unicode/unicode.go      | 116 ++++++++++++++-------
 analysis/tokenizer/unicode/unicode_test.go |  28 -----
 go.mod                                     |   2 +-
 go.sum                                     |   4 +-
 4 files changed, 81 insertions(+), 69 deletions(-)

diff --git a/analysis/tokenizer/unicode/unicode.go b/analysis/tokenizer/unicode/unicode.go
index b694a3ee4..e60df9b72 100644
--- a/analysis/tokenizer/unicode/unicode.go
+++ b/analysis/tokenizer/unicode/unicode.go
@@ -15,7 +15,10 @@
 package unicode
 
 import (
-	"github.com/blevesearch/segment"
+	"unicode"
+	"unicode/utf8"
+
+	"github.com/clipperhouse/uax29/v2/words"
 
 	"github.com/blevesearch/bleve/v2/analysis"
 	"github.com/blevesearch/bleve/v2/registry"
@@ -37,7 +40,7 @@ func (rt *UnicodeTokenizer) Tokenize(input []byte) analysis.TokenStream {
 	ta := []analysis.Token(nil)
 	taNext := 0
 
-	segmenter := segment.NewWordSegmenterDirect(input)
+	segmenter := words.FromBytes(input)
 	start := 0
 	pos := 1
 
@@ -52,46 +55,48 @@
 		return remainingLen / avgSegmentLen
 	}
 
-	for segmenter.Segment() {
-		segmentBytes := segmenter.Bytes()
+	for segmenter.Next() {
+		segmentBytes := segmenter.Value()
+		if !alphaNumericBackwardsCompat(segmentBytes) {
+			continue
+		}
 		end := start + len(segmentBytes)
-		if segmenter.Type() != segment.None {
-			if taNext >= len(ta) {
-				remainingSegments := guessRemaining(end)
-				if remainingSegments > 1000 {
-					remainingSegments = 1000
-				}
-				if remainingSegments < 1 {
-					remainingSegments = 1
-				}
-
-				ta = make([]analysis.Token, remainingSegments)
-				taNext = 0
+		if taNext >= len(ta) {
+			remainingSegments := guessRemaining(end)
+			if remainingSegments > 1000 {
+				remainingSegments = 1000
+			}
+			if remainingSegments < 1 {
+				remainingSegments = 1
 			}
 
-			token := &ta[taNext]
-			taNext++
+			ta = make([]analysis.Token, remainingSegments)
+			taNext = 0
+		}
 
-			token.Term = segmentBytes
-			token.Start = start
-			token.End = end
-			token.Position = pos
-			token.Type = convertType(segmenter.Type())
+		token := &ta[taNext]
+		taNext++
 
-			if len(rv) >= cap(rv) { // When rv is full, save it into rvx.
-				rvx = append(rvx, rv)
+		token.Term = segmentBytes
+		token.Start = segmenter.Start()
+		token.End = segmenter.End()
+		token.Position = pos
+		token.Type = getType(segmentBytes)
 
-				rvCap := cap(rv) * 2
-				if rvCap > 256 {
-					rvCap = 256
-				}
+		if len(rv) >= cap(rv) { // When rv is full, save it into rvx.
+			rvx = append(rvx, rv)
 
-				rv = make(analysis.TokenStream, 0, rvCap) // Next rv cap is bigger.
+			rvCap := cap(rv) * 2
+			if rvCap > 256 {
+				rvCap = 256
 			}
 
-			rv = append(rv, token)
-			pos++
+			rv = make(analysis.TokenStream, 0, rvCap) // Next rv cap is bigger.
 		}
+
+		rv = append(rv, token)
+		pos++
+
 		start = end
 	}
 
@@ -121,14 +126,49 @@ func init() {
 	}
 }
 
-func convertType(segmentWordType int) analysis.TokenType {
-	switch segmentWordType {
-	case segment.Ideo:
-		return analysis.Ideographic
-	case segment.Kana:
+func getType(segment []byte) analysis.TokenType {
+	switch {
+	case words.BleveIdeographic(segment):
 		return analysis.Ideographic
-	case segment.Number:
+	case words.BleveNumeric(segment):
 		return analysis.Numeric
 	}
 	return analysis.AlphaNumeric
}
+
+// alphaNumeric reports whether the token contains at least one
+// rune that is a Letter or Number, as defined by Unicode.
+func alphaNumeric(token []byte) bool {
+	pos := 0
+	for pos < len(token) {
+		r, w := utf8.DecodeRune(token[pos:])
+		if unicode.IsLetter(r) || unicode.IsNumber(r) {
+			// we use these methods instead of unicode.In for
+			// performance; these methods have ASCII fast paths
+			return true
+		}
+		pos += w
+	}
+
+	return false
+}
+
+// alphaNumericBackwardsCompat reports whether the token contains at
+// least one rune that is a Letter or Number, as defined by Unicode.
+// It filters out Thai characters as the old segmenter did.
+func alphaNumericBackwardsCompat(token []byte) bool {
+	for pos := 0; pos < len(token); {
+		r, w := utf8.DecodeRune(token[pos:])
+
+		if unicode.IsLetter(r) || unicode.IsNumber(r) {
+			// Filter out Thai characters (except numbers) to match old segmenter behavior
+			if unicode.Is(unicode.Thai, r) && !unicode.IsNumber(r) {
+				return false
+			}
+			return true
+		}
+		pos += w
+	}
+
+	return false
+}
diff --git a/analysis/tokenizer/unicode/unicode_test.go b/analysis/tokenizer/unicode/unicode_test.go
index 0e0cfd4cc..c990a3eb9 100644
--- a/analysis/tokenizer/unicode/unicode_test.go
+++ b/analysis/tokenizer/unicode/unicode_test.go
@@ -19,7 +19,6 @@ import (
 	"testing"
 
 	"github.com/blevesearch/bleve/v2/analysis"
-	"github.com/blevesearch/segment"
 )
 
 func TestUnicode(t *testing.T) {
@@ -173,30 +172,3 @@ func BenchmarkTokenizeEnglishText(b *testing.B) {
 	}
 
 }
-
-func TestConvertType(t *testing.T) {
-	tests := []struct {
-		in  int
-		out analysis.TokenType
-	}{
-		{
-			segment.Ideo, analysis.Ideographic,
-		},
-		{
-			segment.Kana, analysis.Ideographic,
-		},
-		{
-			segment.Number, analysis.Numeric,
-		},
-		{
-			segment.Letter, analysis.AlphaNumeric,
-		},
-	}
-
-	for _, test := range tests {
-		actual := convertType(test.in)
-		if actual != test.out {
-			t.Errorf("expected %d, got %d for %d", test.out, actual, test.in)
-		}
-	}
-}
diff --git a/go.mod b/go.mod
index ca435f288..ca5e41080 100644
--- a/go.mod
+++ b/go.mod
@@ -15,7 +15,6 @@ require (
 	github.com/blevesearch/goleveldb v1.0.1
 	github.com/blevesearch/gtreap v0.1.1
 	github.com/blevesearch/scorch_segment_api/v2 v2.3.10
-	github.com/blevesearch/segment v0.9.1
 	github.com/blevesearch/snowball v0.6.1
 	github.com/blevesearch/snowballstem v0.9.0
 	github.com/blevesearch/stempel v0.2.0
@@ -27,6 +26,7 @@
 	github.com/blevesearch/zapx/v14 v14.4.2
 	github.com/blevesearch/zapx/v15 v15.4.2
 	github.com/blevesearch/zapx/v16 v16.2.4
+	github.com/clipperhouse/uax29/v2 v2.2.0
 	github.com/couchbase/moss v0.2.0
 	github.com/spf13/cobra v1.8.1
 	go.etcd.io/bbolt v1.4.0
diff --git a/go.sum b/go.sum
index fec5104f7..74ff24778 100644
--- a/go.sum
+++ b/go.sum
@@ -22,8 +22,6 @@ github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCD
 github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs=
 github.com/blevesearch/scorch_segment_api/v2 v2.3.10 h1:Yqk0XD1mE0fDZAJXTjawJ8If/85JxnLd8v5vG/jWE/s=
 github.com/blevesearch/scorch_segment_api/v2 v2.3.10/go.mod h1:Z3e6ChN3qyN35yaQpl00MfI5s8AxUJbpTR/DL8QOQ+8=
-github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU=
-github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw=
 github.com/blevesearch/snowball v0.6.1 h1:cDYjn/NCH+wwt2UdehaLpr2e4BwLIjN4V/TdLsL+B5A=
 github.com/blevesearch/snowball v0.6.1/go.mod h1:ZF0IBg5vgpeoUhnMza2v0A/z8m1cWPlwhke08LpNusg=
 github.com/blevesearch/snowballstem v0.9.0 h1:lMQ189YspGP6sXvZQ4WZ+MLawfV8wOmPoD/iWeNXm8s=
@@ -46,6 +44,8 @@ github.com/blevesearch/zapx/v15 v15.4.2 h1:sWxpDE0QQOTjyxYbAVjt3+0ieu8NCE0fDRaFx
 github.com/blevesearch/zapx/v15 v15.4.2/go.mod h1:1pssev/59FsuWcgSnTa0OeEpOzmhtmr/0/11H0Z8+Nw=
 github.com/blevesearch/zapx/v16 v16.2.4 h1:tGgfvleXTAkwsD5mEzgM3zCS/7pgocTCnO1oyAUjlww=
 github.com/blevesearch/zapx/v16 v16.2.4/go.mod h1:Rti/REtuuMmzwsI8/C/qIzRaEoSK/wiFYw5e5ctUKKs=
+github.com/clipperhouse/uax29/v2 v2.2.0 h1:ChwIKnQN3kcZteTXMgb1wztSgaU+ZemkgWdohwgs8tY=
+github.com/clipperhouse/uax29/v2 v2.2.0/go.mod h1:EFJ2TJMRUaplDxHKj1qAEhCtQPW2tJSwu5BF98AuoVM=
 github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps=
 github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k=
 github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o=
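
Note for reviewers, since the iterator API differs from the old
segmenter: words.FromBytes yields every UAX #29 segment, whitespace and
punctuation included, which is why Tokenize now filters candidates with
alphaNumericBackwardsCompat instead of checking a segment type. Below is
a minimal, self-contained sketch of the iteration pattern adopted above.
It assumes only the calls exercised in this patch (words.FromBytes,
Next, Value, Start, End); the input string is illustrative.

	package main

	import (
		"fmt"

		"github.com/clipperhouse/uax29/v2/words"
	)

	func main() {
		input := []byte("Hello, 世界! bleve 2.x")

		// Iterate UAX #29 word segments. Value returns the current
		// segment's bytes; Start and End are its byte offsets in the
		// original input, which Tokenize copies into token.Start and
		// token.End directly.
		segmenter := words.FromBytes(input)
		for segmenter.Next() {
			fmt.Printf("%q [%d:%d]\n", segmenter.Value(), segmenter.Start(), segmenter.End())
		}
	}

This prints one line per segment, including the whitespace and
punctuation runs; in the tokenizer those are dropped by the filter, and
only letter, number, and ideographic segments become tokens.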