116 changes: 78 additions & 38 deletions analysis/tokenizer/unicode/unicode.go
@@ -15,7 +15,10 @@
 package unicode
 
 import (
-	"github.com/blevesearch/segment"
+	"unicode"
+	"unicode/utf8"
+
+	"github.com/clipperhouse/uax29/v2/words"
 
 	"github.com/blevesearch/bleve/v2/analysis"
 	"github.com/blevesearch/bleve/v2/registry"
@@ -37,7 +40,7 @@ func (rt *UnicodeTokenizer) Tokenize(input []byte) analysis.TokenStream {
 	ta := []analysis.Token(nil)
 	taNext := 0
 
-	segmenter := segment.NewWordSegmenterDirect(input)
+	segmenter := words.FromBytes(input)
 	start := 0
 	pos := 1
 
@@ -52,46 +55,48 @@ func (rt *UnicodeTokenizer) Tokenize(input []byte) analysis.TokenStream {
 		return remainingLen / avgSegmentLen
 	}
 
-	for segmenter.Segment() {
-		segmentBytes := segmenter.Bytes()
+	for segmenter.Next() {
+		segmentBytes := segmenter.Value()
+		if !alphaNumericBackwardsCompat(segmentBytes) {
+			continue
+		}
 		end := start + len(segmentBytes)
-		if segmenter.Type() != segment.None {
-			if taNext >= len(ta) {
-				remainingSegments := guessRemaining(end)
-				if remainingSegments > 1000 {
-					remainingSegments = 1000
-				}
-				if remainingSegments < 1 {
-					remainingSegments = 1
-				}
-
-				ta = make([]analysis.Token, remainingSegments)
-				taNext = 0
-			}
-
-			token := &ta[taNext]
-			taNext++
-
-			token.Term = segmentBytes
-			token.Start = start
-			token.End = end
-			token.Position = pos
-			token.Type = convertType(segmenter.Type())
-
-			if len(rv) >= cap(rv) { // When rv is full, save it into rvx.
-				rvx = append(rvx, rv)
-
-				rvCap := cap(rv) * 2
-				if rvCap > 256 {
-					rvCap = 256
-				}
-
-				rv = make(analysis.TokenStream, 0, rvCap) // Next rv cap is bigger.
-			}
-
-			rv = append(rv, token)
-			pos++
-		}
+		if taNext >= len(ta) {
+			remainingSegments := guessRemaining(end)
+			if remainingSegments > 1000 {
+				remainingSegments = 1000
+			}
+			if remainingSegments < 1 {
+				remainingSegments = 1
+			}
+
+			ta = make([]analysis.Token, remainingSegments)
+			taNext = 0
+		}
+
+		token := &ta[taNext]
+		taNext++
+
+		token.Term = segmentBytes
+		token.Start = segmenter.Start()
+		token.End = segmenter.End()
+		token.Position = pos
+		token.Type = getType(segmentBytes)
+
+		if len(rv) >= cap(rv) { // When rv is full, save it into rvx.
+			rvx = append(rvx, rv)
+
+			rvCap := cap(rv) * 2
+			if rvCap > 256 {
+				rvCap = 256
+			}
+
+			rv = make(analysis.TokenStream, 0, rvCap) // Next rv cap is bigger.
+		}
+
+		rv = append(rv, token)
+		pos++
+
 		start = end
 	}
 
@@ -121,14 +126,49 @@ func init() {
 	}
 }
 
-func convertType(segmentWordType int) analysis.TokenType {
-	switch segmentWordType {
-	case segment.Ideo:
-		return analysis.Ideographic
-	case segment.Kana:
+func getType(segment []byte) analysis.TokenType {
+	switch {
+	case words.BleveIdeographic(segment):
 		return analysis.Ideographic
-	case segment.Number:
+	case words.BleveNumeric(segment):
 		return analysis.Numeric
 	}
 	return analysis.AlphaNumeric
 }
+
+// alphaNumeric is a filter which returns only tokens
+// that contain a Letter or Number, as defined by Unicode.
+func alphaNumeric(token []byte) bool {
+	pos := 0
+	for pos < len(token) {
+		r, w := utf8.DecodeRune(token[pos:])
+		if unicode.IsLetter(r) || unicode.IsNumber(r) {
+			// we use these methods instead of unicode.In for
+			// performance; these methods have ASCII fast paths
+			return true
+		}
+		pos += w
+	}
+
+	return false
+}
+
+// alphaNumericBackwardsCompat is a filter which returns only tokens
+// that contain a Letter or Number, as defined by Unicode.
+// It filters out Thai characters as the old segmenter did.
+func alphaNumericBackwardsCompat(token []byte) bool {
+	for pos := 0; pos < len(token); {
+		r, w := utf8.DecodeRune(token[pos:])
+
+		if unicode.IsLetter(r) || unicode.IsNumber(r) {
+			// Filter out Thai characters (except numbers) to match old segmenter behavior
+			if unicode.Is(unicode.Thai, r) && !unicode.IsNumber(r) {
+				return false
+			}
+			return true
+		}
+		pos += w
+	}
+
+	return false
+}
28 changes: 0 additions & 28 deletions analysis/tokenizer/unicode/unicode_test.go
@@ -19,7 +19,6 @@ import (
 	"testing"
 
 	"github.com/blevesearch/bleve/v2/analysis"
-	"github.com/blevesearch/segment"
 )
 
 func TestUnicode(t *testing.T) {
@@ -173,30 +172,3 @@ func BenchmarkTokenizeEnglishText(b *testing.B) {
 	}
 
 }
-
-func TestConvertType(t *testing.T) {
-	tests := []struct {
-		in  int
-		out analysis.TokenType
-	}{
-		{
-			segment.Ideo, analysis.Ideographic,
-		},
-		{
-			segment.Kana, analysis.Ideographic,
-		},
-		{
-			segment.Number, analysis.Numeric,
-		},
-		{
-			segment.Letter, analysis.AlphaNumeric,
-		},
-	}
-
-	for _, test := range tests {
-		actual := convertType(test.in)
-		if actual != test.out {
-			t.Errorf("expected %d, got %d for %d", test.out, actual, test.in)
-		}
-	}
-}
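TestConvertType is deleted along with convertType, leaving the new getType untested. If equivalent coverage is wanted, a byte-slice-driven version could look like the sketch below. This is not part of the diff, and the expected classifications for the sample inputs are assumptions about what BleveIdeographic and BleveNumeric match.

```go
func TestGetType(t *testing.T) {
	tests := []struct {
		in  []byte
		out analysis.TokenType
	}{
		{[]byte("世界"), analysis.Ideographic},   // CJK ideographs
		{[]byte("1234"), analysis.Numeric},       // ASCII digits
		{[]byte("hello"), analysis.AlphaNumeric}, // plain letters
	}

	for _, test := range tests {
		actual := getType(test.in)
		if actual != test.out {
			t.Errorf("expected %d, got %d for %q", test.out, actual, test.in)
		}
	}
}
```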
2 changes: 1 addition & 1 deletion go.mod
@@ -15,7 +15,6 @@ require (
 	github.com/blevesearch/goleveldb v1.0.1
 	github.com/blevesearch/gtreap v0.1.1
 	github.com/blevesearch/scorch_segment_api/v2 v2.3.10
-	github.com/blevesearch/segment v0.9.1
 	github.com/blevesearch/snowball v0.6.1
 	github.com/blevesearch/snowballstem v0.9.0
 	github.com/blevesearch/stempel v0.2.0
@@ -27,6 +26,7 @@
 	github.com/blevesearch/zapx/v14 v14.4.2
 	github.com/blevesearch/zapx/v15 v15.4.2
 	github.com/blevesearch/zapx/v16 v16.2.4
+	github.com/clipperhouse/uax29/v2 v2.2.0
 	github.com/couchbase/moss v0.2.0
 	github.com/spf13/cobra v1.8.1
 	go.etcd.io/bbolt v1.4.0
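For the record, the go.mod and go.sum churn above and below is what a routine dependency swap produces; assuming the standard Go toolchain, roughly:

```sh
go get github.com/clipperhouse/uax29/v2@v2.2.0
go mod tidy  # drops the now-unused github.com/blevesearch/segment requirement
```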
4 changes: 2 additions & 2 deletions go.sum
@@ -22,8 +22,6 @@ github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCD
 github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs=
 github.com/blevesearch/scorch_segment_api/v2 v2.3.10 h1:Yqk0XD1mE0fDZAJXTjawJ8If/85JxnLd8v5vG/jWE/s=
 github.com/blevesearch/scorch_segment_api/v2 v2.3.10/go.mod h1:Z3e6ChN3qyN35yaQpl00MfI5s8AxUJbpTR/DL8QOQ+8=
-github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU=
-github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw=
 github.com/blevesearch/snowball v0.6.1 h1:cDYjn/NCH+wwt2UdehaLpr2e4BwLIjN4V/TdLsL+B5A=
 github.com/blevesearch/snowball v0.6.1/go.mod h1:ZF0IBg5vgpeoUhnMza2v0A/z8m1cWPlwhke08LpNusg=
 github.com/blevesearch/snowballstem v0.9.0 h1:lMQ189YspGP6sXvZQ4WZ+MLawfV8wOmPoD/iWeNXm8s=
@@ -46,6 +44,8 @@ github.com/blevesearch/zapx/v15 v15.4.2 h1:sWxpDE0QQOTjyxYbAVjt3+0ieu8NCE0fDRaFx
 github.com/blevesearch/zapx/v15 v15.4.2/go.mod h1:1pssev/59FsuWcgSnTa0OeEpOzmhtmr/0/11H0Z8+Nw=
 github.com/blevesearch/zapx/v16 v16.2.4 h1:tGgfvleXTAkwsD5mEzgM3zCS/7pgocTCnO1oyAUjlww=
 github.com/blevesearch/zapx/v16 v16.2.4/go.mod h1:Rti/REtuuMmzwsI8/C/qIzRaEoSK/wiFYw5e5ctUKKs=
+github.com/clipperhouse/uax29/v2 v2.2.0 h1:ChwIKnQN3kcZteTXMgb1wztSgaU+ZemkgWdohwgs8tY=
+github.com/clipperhouse/uax29/v2 v2.2.0/go.mod h1:EFJ2TJMRUaplDxHKj1qAEhCtQPW2tJSwu5BF98AuoVM=
 github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps=
 github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k=
 github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o=