Skip to content

Commit 17a0e81

Browse files
committed
index: use ngram tokenizer
1 parent 24ea202 commit 17a0e81

2 files changed

Lines changed: 111 additions & 58 deletions

File tree

internal/indexer/indexer.go

Lines changed: 109 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -16,19 +16,25 @@ import (
1616
"github.com/AvengeMedia/danksearch/internal/errdefs"
1717
"github.com/AvengeMedia/danksearch/internal/log"
1818
bleve "github.com/blevesearch/bleve/v2"
19+
_ "github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
20+
_ "github.com/blevesearch/bleve/v2/analysis/token/edgengram"
21+
_ "github.com/blevesearch/bleve/v2/analysis/token/lowercase"
22+
_ "github.com/blevesearch/bleve/v2/analysis/token/ngram"
23+
_ "github.com/blevesearch/bleve/v2/analysis/tokenizer/single"
1924
"github.com/blevesearch/bleve/v2/mapping"
2025
query "github.com/blevesearch/bleve/v2/search/query"
2126
)
2227

2328
// Document is the per-file record that readDocument builds.
//
// This listing is a diff: field lines preceded by a "-" line number are the
// old field list, and field lines preceded by a "+" line number are the new
// one. The new list drops Title and adds FilenameSub and FilenamePrefix.
// readDocument sets both new fields to the same filename value; the index
// mapping analyzes "filename_sub" with the filename_ngram analyzer and
// "filename_prefix" with the filename_edge analyzer, while "filename" keeps
// the keyword analyzer.
type Document struct {
24-
Path string `json:"path"`
25-
Filename string `json:"filename"`
26-
Title string `json:"title"`
27-
Body string `json:"body"`
28-
ContentType string `json:"content_type"`
29-
ModTime time.Time `json:"mtime"`
30-
Size int64 `json:"size"`
31-
Hash string `json:"hash"`
29+
Path string `json:"path"`
30+
Filename string `json:"filename"`
31+
FilenameSub string `json:"filename_sub"`
32+
FilenamePrefix string `json:"filename_prefix"`
33+
Body string `json:"body"`
34+
ContentType string `json:"content_type"`
35+
ModTime time.Time `json:"mtime"`
36+
Size int64 `json:"size"`
37+
Hash string `json:"hash"`
3238
}
3339

3440
type Indexer struct {
@@ -111,6 +117,49 @@ func getStoreConfig() map[string]interface{} {
111117

112118
func buildIndexMapping() mapping.IndexMapping {
113119
m := bleve.NewIndexMapping()
120+
121+
err := m.AddCustomTokenFilter("ngram_2_15", map[string]interface{}{
122+
"type": "ngram",
123+
"min": float64(2),
124+
"max": float64(15),
125+
})
126+
if err != nil {
127+
panic(err)
128+
}
129+
130+
err = m.AddCustomTokenFilter("edge_ngram_2_30", map[string]interface{}{
131+
"type": "edge_ngram",
132+
"min": float64(2),
133+
"max": float64(30),
134+
})
135+
if err != nil {
136+
panic(err)
137+
}
138+
139+
err = m.AddCustomAnalyzer("filename_ngram", map[string]interface{}{
140+
"type": "custom",
141+
"tokenizer": "single",
142+
"token_filters": []string{
143+
"to_lower",
144+
"ngram_2_15",
145+
},
146+
})
147+
if err != nil {
148+
panic(err)
149+
}
150+
151+
err = m.AddCustomAnalyzer("filename_edge", map[string]interface{}{
152+
"type": "custom",
153+
"tokenizer": "single",
154+
"token_filters": []string{
155+
"to_lower",
156+
"edge_ngram_2_30",
157+
},
158+
})
159+
if err != nil {
160+
panic(err)
161+
}
162+
114163
docMapping := bleve.NewDocumentMapping()
115164

116165
pathField := bleve.NewTextFieldMapping()
@@ -120,14 +169,18 @@ func buildIndexMapping() mapping.IndexMapping {
120169

121170
filenameField := bleve.NewTextFieldMapping()
122171
filenameField.Store = true
123-
filenameField.IncludeTermVectors = true
124172
filenameField.Analyzer = "keyword"
125173
docMapping.AddFieldMappingsAt("filename", filenameField)
126174

127-
titleField := bleve.NewTextFieldMapping()
128-
titleField.Store = true
129-
titleField.IncludeTermVectors = true
130-
docMapping.AddFieldMappingsAt("title", titleField)
175+
filenameSubField := bleve.NewTextFieldMapping()
176+
filenameSubField.Store = false
177+
filenameSubField.Analyzer = "filename_ngram"
178+
docMapping.AddFieldMappingsAt("filename_sub", filenameSubField)
179+
180+
filenamePrefixField := bleve.NewTextFieldMapping()
181+
filenamePrefixField.Store = false
182+
filenamePrefixField.Analyzer = "filename_edge"
183+
docMapping.AddFieldMappingsAt("filename_prefix", filenamePrefixField)
131184

132185
bodyField := bleve.NewTextFieldMapping()
133186
bodyField.Store = false
@@ -203,12 +256,13 @@ func (i *Indexer) readDocument(path string, info os.FileInfo) (*Document, error)
203256
}
204257

205258
doc := &Document{
206-
Path: path,
207-
Filename: filename,
208-
Title: filename,
209-
ContentType: contentType,
210-
ModTime: info.ModTime(),
211-
Size: info.Size(),
259+
Path: path,
260+
Filename: filename,
261+
FilenameSub: filename,
262+
FilenamePrefix: filename,
263+
ContentType: contentType,
264+
ModTime: info.ModTime(),
265+
Size: info.Size(),
212266
}
213267

214268
if i.config.IsTextFile(path) {
@@ -266,37 +320,17 @@ func (i *Indexer) SearchWithOptions(opts *SearchOptions) (*bleve.SearchResult, e
266320
// Build the main query
267321
var mainQuery query.Query
268322

269-
// Special case: match all
270323
if opts.Query == "*" {
271324
mainQuery = bleve.NewMatchAllQuery()
272325
} else if opts.Field != "" {
273-
// Field-specific search
274326
mainQuery = i.buildFieldQuery(opts.Query, opts.Field, opts.Fuzzy)
275327
} else {
276-
// Search across all fields with boosting
277-
queryLower := strings.ToLower(opts.Query)
278-
filenamePattern := "*" + queryLower + "*"
279-
280-
filenameQuery := bleve.NewWildcardQuery(filenamePattern)
281-
filenameQuery.SetField("filename")
282-
filenameQuery.SetBoost(10.0)
283-
284-
titleQuery := bleve.NewWildcardQuery(filenamePattern)
285-
titleQuery.SetField("title")
286-
titleQuery.SetBoost(5.0)
287-
328+
filenameQuery := i.buildFilenameQuery(opts.Query, 20.0, 10.0)
288329
bodyQuery := bleve.NewMatchQuery(opts.Query)
289330
bodyQuery.SetField("body")
290331
bodyQuery.SetBoost(1.0)
291332

292-
if opts.Fuzzy {
293-
fuzzyBodyQuery := bleve.NewFuzzyQuery(opts.Query)
294-
fuzzyBodyQuery.SetField("body")
295-
fuzzyBodyQuery.SetBoost(0.5)
296-
mainQuery = bleve.NewDisjunctionQuery(filenameQuery, titleQuery, bodyQuery, fuzzyBodyQuery)
297-
} else {
298-
mainQuery = bleve.NewDisjunctionQuery(filenameQuery, titleQuery, bodyQuery)
299-
}
333+
mainQuery = bleve.NewDisjunctionQuery(filenameQuery, bodyQuery)
300334
}
301335

302336
// Build filters
@@ -393,32 +427,51 @@ func (i *Indexer) SearchWithOptions(opts *SearchOptions) (*bleve.SearchResult, e
393427
return result, nil
394428
}
395429

430+
func (i *Indexer) buildFilenameQuery(queryStr string, boostPrefix, boostContains float64) query.Query {
431+
q := strings.TrimSpace(queryStr)
432+
if q == "" {
433+
return bleve.NewMatchNoneQuery()
434+
}
435+
436+
disj := bleve.NewDisjunctionQuery()
437+
438+
prefixQuery := bleve.NewPrefixQuery(strings.ToLower(q))
439+
prefixQuery.SetField("filename_prefix")
440+
prefixQuery.SetBoost(boostPrefix)
441+
disj.AddQuery(prefixQuery)
442+
443+
if len(q) >= 2 {
444+
matchQuery := bleve.NewMatchQuery(q)
445+
matchQuery.SetField("filename_sub")
446+
matchQuery.SetBoost(boostContains)
447+
disj.AddQuery(matchQuery)
448+
}
449+
450+
if len(disj.Disjuncts) == 1 {
451+
return disj.Disjuncts[0]
452+
}
453+
return disj
454+
}
455+
396456
// buildFieldQuery builds the query for a search restricted to a single field.
//
// This listing is a diff: lines preceded by a "-" line number are the removed
// switch-based implementation (wildcard queries for "filename" and "title"),
// and lines preceded by a "+" line number are its replacement.
//
// In the new version:
//   - "filename" delegates to buildFilenameQuery with a prefix boost of 2.0
//     and a contains boost of 1.0;
//   - "body" uses a fuzzy query when fuzzy is true and a match query otherwise;
//   - any other field falls back to a match query on that field.
//
// The fuzzy flag is only consulted for the "body" field.
func (i *Indexer) buildFieldQuery(queryStr, field string, fuzzy bool) query.Query {
397-
queryLower := strings.ToLower(queryStr)
457+
if field == "filename" {
458+
return i.buildFilenameQuery(queryStr, 2.0, 1.0)
459+
}
398460

399-
switch field {
400-
case "filename", "title":
401-
pattern := "*" + queryLower + "*"
402-
q := bleve.NewWildcardQuery(pattern)
403-
q.SetField(field)
404-
return q
405-
case "body":
461+
if field == "body" {
406462
if fuzzy {
407463
q := bleve.NewFuzzyQuery(queryStr)
408464
q.SetField("body")
409465
return q
410466
}
411-
// Use match query - searches for all words in the query
412-
// Note: Special characters like //, !, etc. are normalized by the analyzer
413467
q := bleve.NewMatchQuery(queryStr)
414468
q.SetField("body")
415469
return q
416-
default:
417-
// Fallback to match query
418-
q := bleve.NewMatchQuery(queryStr)
419-
q.SetField(field)
420-
return q
421470
}
471+

472+
q := bleve.NewMatchQuery(queryStr)
473+
q.SetField(field)
474+
return q
422475
}

424477
func (i *Indexer) ReindexAll() error {

internal/indexer/indexer_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -229,8 +229,8 @@ func TestReadDocument(t *testing.T) {
229229
t.Errorf("Path = %v, want %v", doc.Path, testFile)
230230
}
231231

232-
if doc.Title != "test.go" {
233-
t.Errorf("Title = %v, want test.go", doc.Title)
232+
if doc.Filename != "test.go" {
233+
t.Errorf("Filename = %v, want test.go", doc.Filename)
234234
}
235235

236236
if doc.Body != content {

0 commit comments

Comments
 (0)