From 96e7fbd3ad69c76712736bfb900bd7d77d667f00 Mon Sep 17 00:00:00 2001
From: Rahul Rampure
Date: Tue, 13 Jan 2026 19:03:24 +0530
Subject: [PATCH 1/9] MB-69881: Improved APIs and perf optimizations for vector
 search (#2270)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Use a `bitset` to track eligible documents instead of a slice of
  `N uint64s`, reducing memory usage from `8N bytes` to `N/8 bytes` per
  segment (up to a `64×` reduction) and improving cache locality.
- Pass an iterator over eligible documents that walks the bitset directly,
  allowing direct translation into a bitset of eligible vector IDs in the
  storage layer and eliminating the need for a separate slice intermediary.
- Fix garbage creation in the `UnadornedPostingsIterator`, which previously
  allocated a temporary struct per `Next()` call to wrap a doc number and
  satisfy the `Postings` interface; the iterator now returns a single
  reusable struct (a one-time allocation), consistent with how the
  `PostingsIterator` in the storage layer works.
- Avoid unnecessary `BytesRead` statistics computation when executing
  searches in no-scoring mode, removing redundant work as a
  micro-optimization.
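For illustration, here is a minimal, self-contained sketch of the
bitset-plus-iterator pattern this commit adopts. It mirrors, but does not
reproduce, the new `eligibleDocumentIterator` in
`index/scorch/snapshot_vector_index.go`; the `bitsetIterator` name is
illustrative only, and the sizes in the comments restate the figures above.

```go
package main

import (
	"fmt"

	"github.com/bits-and-blooms/bitset"
)

// bitsetIterator walks the set bits of a bitset in ascending order,
// yielding one eligible doc number per call.
type bitsetIterator struct {
	bs      *bitset.BitSet
	current uint
}

// Next returns the next eligible doc number and whether one exists.
func (it *bitsetIterator) Next() (uint64, bool) {
	next, found := it.bs.NextSet(it.current)
	if !found {
		return 0, false
	}
	it.current = next + 1
	return uint64(next), true
}

func main() {
	// A slice of N uint64 doc numbers costs 8N bytes; a bitset sized to
	// the segment costs segmentSize/8 bytes, which is up to a 64x
	// reduction when most documents in the segment are eligible.
	const segmentSize = 1024
	eligible := bitset.New(segmentSize)
	eligible.Set(3)
	eligible.Set(42)
	eligible.Set(1000)

	it := &bitsetIterator{bs: eligible}
	for docNum, ok := it.Next(); ok; docNum, ok = it.Next() {
		fmt.Println("eligible doc:", docNum) // prints 3, 42, 1000
	}
}
```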
---------

Co-authored-by: Abhinav Dangeti
---
 analysis/analyzer/custom/custom.go    |  2 +-
 analysis/datetime/iso/iso.go          |  4 +-
 cmd/bleve/cmd/registry.go             |  4 +-
 go.mod                                | 13 ++--
 go.sum                                | 18 +++---
 index/scorch/optimize.go              |  2 +
 index/scorch/optimize_knn.go          | 10 +---
 index/scorch/segment_plugin.go        |  4 +-
 index/scorch/snapshot_index.go        |  2 +
 index/scorch/snapshot_index_tfr.go    | 29 ++++++---
 index/scorch/snapshot_vector_index.go | 85 ++++++++++++++++++++++++---
 index/scorch/unadorned.go             | 32 ++++++----
 index_test.go                         | 12 ++--
 search/query/query_string_parser.go   |  2 +-
 14 files changed, 156 insertions(+), 63 deletions(-)

diff --git a/analysis/analyzer/custom/custom.go b/analysis/analyzer/custom/custom.go
index 5df940e5e..9040e0283 100644
--- a/analysis/analyzer/custom/custom.go
+++ b/analysis/analyzer/custom/custom.go
@@ -140,7 +140,7 @@ func convertInterfaceSliceToStringSlice(interfaceSlice []interface{}, objType st
 		if ok {
 			stringSlice[i] = stringObj
 		} else {
-			return nil, fmt.Errorf(objType + " name must be a string")
+			return nil, fmt.Errorf("%s name must be a string", objType)
 		}
 	}
 
diff --git a/analysis/datetime/iso/iso.go b/analysis/datetime/iso/iso.go
index df947a6c0..7a8186999 100644
--- a/analysis/datetime/iso/iso.go
+++ b/analysis/datetime/iso/iso.go
@@ -118,7 +118,7 @@ func letterCounter(layout string, idx int) int {
 }
 
 func invalidFormatError(character byte, count int) error {
-	return fmt.Errorf("invalid format string, unknown format specifier: " + strings.Repeat(string(character), count))
+	return fmt.Errorf("invalid format string, unknown format specifier: %s", strings.Repeat(string(character), count))
 }
 
 func parseISOString(layout string) (string, error) {
@@ -146,7 +146,7 @@
 			// second text literal delimiter
 			if idx == len(layout) {
 				// text literal delimiter not found error
-				return "", fmt.Errorf("invalid format string, expected text literal delimiter: " + string(textLiteralDelimiter))
+				return "", fmt.Errorf("invalid format string, expected text literal delimiter: %s", string(textLiteralDelimiter))
 			}
 			// increment idx to skip the second text literal delimiter
 			idx++
diff --git a/cmd/bleve/cmd/registry.go b/cmd/bleve/cmd/registry.go
index 9d5fc3f4e..5c6566b12 100644
--- a/cmd/bleve/cmd/registry.go
+++ b/cmd/bleve/cmd/registry.go
@@ -71,12 +71,12 @@ var registryCmd = &cobra.Command{
 func printType(label string, types, instances []string) {
 	sort.Strings(types)
 	sort.Strings(instances)
-	fmt.Printf(label + " Types:\n")
+	fmt.Printf("%s Types:\n", label)
 	for _, name := range types {
 		fmt.Printf("\t%s\n", name)
 	}
 	fmt.Println()
-	fmt.Printf(label + " Instances:\n")
+	fmt.Printf("%s Instances:\n", label)
 	for _, name := range instances {
 		fmt.Printf("\t%s\n", name)
 	}
diff --git a/go.mod b/go.mod
index fa5a92752..d62d86018 100644
--- a/go.mod
+++ b/go.mod
@@ -1,20 +1,18 @@
 module github.com/blevesearch/bleve/v2
 
-go 1.23
-
-toolchain go1.23.9
+go 1.24
 
 require (
 	github.com/RoaringBitmap/roaring/v2 v2.4.5
 	github.com/bits-and-blooms/bitset v1.22.0
-	github.com/blevesearch/bleve_index_api v1.2.9-0.20250929185838-e1be6a8cc229
+	github.com/blevesearch/bleve_index_api v1.2.12-0.20260109154621-f19a6d6af728
 	github.com/blevesearch/geo v0.2.4
-	github.com/blevesearch/go-faiss v1.0.25
+	github.com/blevesearch/go-faiss v1.0.27
 	github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475
 	github.com/blevesearch/go-porterstemmer v1.0.3
 	github.com/blevesearch/goleveldb v1.0.1
 	github.com/blevesearch/gtreap v0.1.1
-	github.com/blevesearch/scorch_segment_api/v2 v2.3.10
+	github.com/blevesearch/scorch_segment_api/v2 v2.3.14-0.20260109154938-b56b54c737df
 	github.com/blevesearch/segment v0.9.1
 	github.com/blevesearch/snowball v0.6.1
 	github.com/blevesearch/snowballstem v0.9.0
@@ -26,7 +24,8 @@ require (
 	github.com/blevesearch/zapx/v13 v13.4.2
 	github.com/blevesearch/zapx/v14 v14.4.2
 	github.com/blevesearch/zapx/v15 v15.4.2
-	github.com/blevesearch/zapx/v16 v16.2.5-0.20251215174251-3f2bc83c91c0
+	github.com/blevesearch/zapx/v16 v16.2.8
+	github.com/blevesearch/zapx/v17 v17.0.0-20260112205515-7d8cac80436c
 	github.com/couchbase/moss v0.2.0
 	github.com/golang/protobuf v1.3.2
 	github.com/spf13/cobra v1.8.1
diff --git a/go.sum b/go.sum
index 63768ce54..4eca460aa 100644
--- a/go.sum
+++ b/go.sum
@@ -3,12 +3,12 @@ github.com/RoaringBitmap/roaring/v2 v2.4.5/go.mod h1:FiJcsfkGje/nZBZgCu0ZxCPOKD/
 github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
 github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCkcs2uw7w4=
 github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
-github.com/blevesearch/bleve_index_api v1.2.9-0.20250929185838-e1be6a8cc229 h1:q0hzsKqukWjLO5MEahNWP994XvbY1B2ZSzuM/Vfhx/A=
-github.com/blevesearch/bleve_index_api v1.2.9-0.20250929185838-e1be6a8cc229/go.mod h1:rKQDl4u51uwafZxFrPD1R7xFOwKnzZW7s/LSeK4lgo0=
+github.com/blevesearch/bleve_index_api v1.2.12-0.20260109154621-f19a6d6af728 h1:qFnvr+SqVOCbhMl5sVynhuwVkv1yrc7Vhrn8lVdw1nU=
+github.com/blevesearch/bleve_index_api v1.2.12-0.20260109154621-f19a6d6af728/go.mod h1:xvd48t5XMeeioWQ5/jZvgLrV98flT2rdvEJ3l/ki4Ko=
 github.com/blevesearch/geo v0.2.4 h1:ECIGQhw+QALCZaDcogRTNSJYQXRtC8/m8IKiA706cqk=
 github.com/blevesearch/geo v0.2.4/go.mod h1:K56Q33AzXt2YExVHGObtmRSFYZKYGv0JEN5mdacJJR8=
-github.com/blevesearch/go-faiss v1.0.25 h1:lel1rkOUGbT1CJ0YgzKwC7k+XH0XVBHnCVWahdCXk4U=
-github.com/blevesearch/go-faiss v1.0.25/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk=
+github.com/blevesearch/go-faiss v1.0.27 h1:7cBImYDDQ82WJd5RUZ1ie6zXztCsC73W94ZzwOjkatk=
+github.com/blevesearch/go-faiss v1.0.27/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk=
 github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475 h1:kDy+zgJFJJoJYBvdfBSiZYBbdsUL0XcjHYWezpQBGPA=
 github.com/blevesearch/go-metrics
v0.0.0-20201227073835-cf1acfcdf475/go.mod h1:9eJDeqxJ3E7WnLebQUlPD7ZjSce7AnDb9vjGmMCbD0A= github.com/blevesearch/go-porterstemmer v1.0.3 h1:GtmsqID0aZdCSNiY8SkuPJ12pD4jI+DdXTAn4YRcHCo= @@ -20,8 +20,8 @@ github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgY github.com/blevesearch/mmap-go v1.0.2/go.mod h1:ol2qBqYaOUsGdm7aRMRrYGgPvnwLe6Y+7LMvAB5IbSA= github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc= github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs= -github.com/blevesearch/scorch_segment_api/v2 v2.3.10 h1:Yqk0XD1mE0fDZAJXTjawJ8If/85JxnLd8v5vG/jWE/s= -github.com/blevesearch/scorch_segment_api/v2 v2.3.10/go.mod h1:Z3e6ChN3qyN35yaQpl00MfI5s8AxUJbpTR/DL8QOQ+8= +github.com/blevesearch/scorch_segment_api/v2 v2.3.14-0.20260109154938-b56b54c737df h1:gBuVkzZLUpGJGnCBRgY0ruZVjppD7WaQLeHZei7QQnU= +github.com/blevesearch/scorch_segment_api/v2 v2.3.14-0.20260109154938-b56b54c737df/go.mod h1:f8fXitmMpzgNziIMqUlpTrfPxVVDN8at9k7POEohvJU= github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU= github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw= github.com/blevesearch/snowball v0.6.1 h1:cDYjn/NCH+wwt2UdehaLpr2e4BwLIjN4V/TdLsL+B5A= @@ -44,8 +44,10 @@ github.com/blevesearch/zapx/v14 v14.4.2 h1:2SGHakVKd+TrtEqpfeq8X+So5PShQ5nW6GNxT github.com/blevesearch/zapx/v14 v14.4.2/go.mod h1:rz0XNb/OZSMjNorufDGSpFpjoFKhXmppH9Hi7a877D8= github.com/blevesearch/zapx/v15 v15.4.2 h1:sWxpDE0QQOTjyxYbAVjt3+0ieu8NCE0fDRaFxEsp31k= github.com/blevesearch/zapx/v15 v15.4.2/go.mod h1:1pssev/59FsuWcgSnTa0OeEpOzmhtmr/0/11H0Z8+Nw= -github.com/blevesearch/zapx/v16 v16.2.5-0.20251215174251-3f2bc83c91c0 h1:HZssgO3JqQFBTrrBTb5LWkfGlOhUdPzUjsPHQoKEjhg= -github.com/blevesearch/zapx/v16 v16.2.5-0.20251215174251-3f2bc83c91c0/go.mod h1:Rti/REtuuMmzwsI8/C/qIzRaEoSK/wiFYw5e5ctUKKs= +github.com/blevesearch/zapx/v16 v16.2.8 h1:SlnzF0YGtSlrsOE3oE7EgEX6BIepGpeqxs1IjMbHLQI= +github.com/blevesearch/zapx/v16 v16.2.8/go.mod h1:murSoCJPCk25MqURrcJaBQ1RekuqSCSfMjXH4rHyA14= +github.com/blevesearch/zapx/v17 v17.0.0-20260112205515-7d8cac80436c h1:OfYh0noLbJmt6k2tqYlnSU3zMZEJbFfbSClSGG59A/M= +github.com/blevesearch/zapx/v17 v17.0.0-20260112205515-7d8cac80436c/go.mod h1:ybWwo00MGrNJuFDnl9smEBVUCZmNANf0+E/QVBmfBTs= github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps= github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k= github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o= diff --git a/index/scorch/optimize.go b/index/scorch/optimize.go index 20a0706ef..658fb08dd 100644 --- a/index/scorch/optimize.go +++ b/index/scorch/optimize.go @@ -395,5 +395,7 @@ func (i *IndexSnapshot) unadornedTermFieldReader( recycle: false, // signal downstream that this is a special unadorned termFieldReader unadorned: true, + // unadorned TFRs do not require bytes read tracking + updateBytesRead: false, } } diff --git a/index/scorch/optimize_knn.go b/index/scorch/optimize_knn.go index 3b3bc3d19..a69cbce81 100644 --- a/index/scorch/optimize_knn.go +++ b/index/scorch/optimize_knn.go @@ -34,8 +34,6 @@ type OptimizeVR struct { totalCost uint64 // maps field to vector readers vrs map[string][]*IndexSnapshotVectorReader - // if at least one of the vector readers requires filtered kNN. - requiresFiltering bool } // This setting _MUST_ only be changed during init and not after. 
@@ -79,8 +77,7 @@ func (o *OptimizeVR) Finish() error { wg.Done() }() for field, vrs := range o.vrs { - vecIndex, err := segment.InterpretVectorIndex(field, - o.requiresFiltering, origSeg.deleted) + vecIndex, err := segment.InterpretVectorIndex(field, origSeg.deleted) if err != nil { errorsM.Lock() errors = append(errors, err) @@ -103,7 +100,7 @@ func (o *OptimizeVR) Finish() error { // kNN search. if vr.eligibleSelector != nil { pl, err = vecIndex.SearchWithFilter(vr.vector, vr.k, - vr.eligibleSelector.SegmentEligibleDocs(index), vr.searchParams) + vr.eligibleSelector.SegmentEligibleDocuments(index), vr.searchParams) } else { pl, err = vecIndex.Search(vr.vector, vr.k, vr.searchParams) } @@ -157,9 +154,6 @@ func (s *IndexSnapshotVectorReader) VectorOptimize(ctx context.Context, return octx, nil } o.ctx = ctx - if !o.requiresFiltering { - o.requiresFiltering = s.eligibleSelector != nil - } if o.snapshot != s.snapshot { o.invokeSearcherEndCallback() diff --git a/index/scorch/segment_plugin.go b/index/scorch/segment_plugin.go index 790a8008a..c44f9cf7b 100644 --- a/index/scorch/segment_plugin.go +++ b/index/scorch/segment_plugin.go @@ -28,6 +28,7 @@ import ( zapv14 "github.com/blevesearch/zapx/v14" zapv15 "github.com/blevesearch/zapx/v15" zapv16 "github.com/blevesearch/zapx/v16" + zapv17 "github.com/blevesearch/zapx/v17" ) // SegmentPlugin represents the essential functions required by a package to plug in @@ -73,7 +74,8 @@ var defaultSegmentPlugin SegmentPlugin func init() { ResetSegmentPlugins() - RegisterSegmentPlugin(&zapv16.ZapPlugin{}, true) + RegisterSegmentPlugin(&zapv17.ZapPlugin{}, true) + RegisterSegmentPlugin(&zapv16.ZapPlugin{}, false) RegisterSegmentPlugin(&zapv15.ZapPlugin{}, false) RegisterSegmentPlugin(&zapv14.ZapPlugin{}, false) RegisterSegmentPlugin(&zapv13.ZapPlugin{}, false) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 4f67a3c0b..358f14102 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -671,6 +671,8 @@ func (is *IndexSnapshot) TermFieldReader(ctx context.Context, term []byte, field rv.incrementBytesRead(bytesRead - prevBytesReadItr) } } + // ONLY update the bytes read value beyond this point for this TFR if scoring is enabled + rv.updateBytesRead = rv.includeFreq || rv.includeNorm || rv.includeTermVectors atomic.AddUint64(&is.parent.stats.TotTermSearchersStarted, uint64(1)) return rv, nil } diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index 315c5686c..be46290bd 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -51,6 +51,10 @@ type IndexSnapshotTermFieldReader struct { bytesRead uint64 ctx context.Context unadorned bool + // flag to indicate whether to increment our bytesRead + // value after creation of the TFR while iterating our postings + // lists + updateBytesRead bool } func (i *IndexSnapshotTermFieldReader) incrementBytesRead(val uint64) { @@ -83,10 +87,15 @@ func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*in if rv == nil { rv = &index.TermFieldDoc{} } + var prevBytesRead uint64 // find the next hit for i.segmentOffset < len(i.iterators) { - prevBytesRead := i.iterators[i.segmentOffset].BytesRead() - next, err := i.iterators[i.segmentOffset].Next() + // get our current postings iterator + curItr := i.iterators[i.segmentOffset] + if i.updateBytesRead { + prevBytesRead = curItr.BytesRead() + } + next, err := curItr.Next() if err != nil { return nil, err } @@ -99,13 +108,15 @@ func (i 
*IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*in i.currID = rv.ID i.currPosting = next - // postingsIterators is maintain the bytesRead stat in a cumulative fashion. - // this is because there are chances of having a series of loadChunk calls, - // and they have to be added together before sending the bytesRead at this point - // upstream. - bytesRead := i.iterators[i.segmentOffset].BytesRead() - if bytesRead > prevBytesRead { - i.incrementBytesRead(bytesRead - prevBytesRead) + if i.updateBytesRead { + // postingsIterators maintains the bytesRead stat in a cumulative fashion. + // this is because there are chances of having a series of loadChunk calls, + // and they have to be added together before sending the bytesRead at this point + // upstream. + bytesRead := curItr.BytesRead() + if bytesRead > prevBytesRead { + i.incrementBytesRead(bytesRead - prevBytesRead) + } } return rv, nil } diff --git a/index/scorch/snapshot_vector_index.go b/index/scorch/snapshot_vector_index.go index db5e06745..93bb94b49 100644 --- a/index/scorch/snapshot_vector_index.go +++ b/index/scorch/snapshot_vector_index.go @@ -22,6 +22,7 @@ import ( "encoding/json" "fmt" + "github.com/bits-and-blooms/bitset" index "github.com/blevesearch/bleve_index_api" segment_api "github.com/blevesearch/scorch_segment_api/v2" ) @@ -49,17 +50,82 @@ func (is *IndexSnapshot) VectorReader(ctx context.Context, vector []float32, return rv, nil } +// eligibleDocumentList represents the list of eligible documents within a segment. +type eligibleDocumentList struct { + bs *bitset.BitSet +} + +// Iterator returns an iterator for the eligible document IDs. +func (edl *eligibleDocumentList) Iterator() index.EligibleDocumentIterator { + if edl.bs == nil { + // no eligible documents + return emptyEligibleIterator + } + // return the iterator + return &eligibleDocumentIterator{ + bs: edl.bs, + } +} + +// Count returns the number of eligible document IDs. +func (edl *eligibleDocumentList) Count() uint64 { + if edl.bs == nil { + return 0 + } + return uint64(edl.bs.Count()) +} + +// emptyEligibleDocumentList is a reusable empty eligible document list. +var emptyEligibleDocumentList = &eligibleDocumentList{} + +// eligibleDocumentIterator iterates over eligible document IDs within a segment. +type eligibleDocumentIterator struct { + bs *bitset.BitSet + current uint +} + +// Next returns the next eligible document ID and whether it exists. +func (it *eligibleDocumentIterator) Next() (id uint64, ok bool) { + next, found := it.bs.NextSet(it.current) + if !found { + return 0, false + } + it.current = next + 1 + return uint64(next), true +} + +// emptyEligibleIterator is a reusable empty eligible document iterator. +var emptyEligibleIterator = &emptyEligibleDocumentIterator{} + +// emptyEligibleDocumentIterator is an iterator that always returns no documents. +type emptyEligibleDocumentIterator struct{} + +// Next always returns false for empty iterator. +func (it *emptyEligibleDocumentIterator) Next() (id uint64, ok bool) { + return 0, false +} + // eligibleDocumentSelector is used to filter out documents that are eligible for // the KNN search from a pre-filter query. type eligibleDocumentSelector struct { - // segment ID -> segment local doc nums - eligibleDocNums map[int][]uint64 + // segment ID -> segment local doc nums in a bitset + eligibleDocNums []*bitset.BitSet is *IndexSnapshot } -// SegmentEligibleDocs returns the list of eligible local doc numbers for the given segment. 
-func (eds *eligibleDocumentSelector) SegmentEligibleDocs(segmentID int) []uint64 { - return eds.eligibleDocNums[segmentID] +// SegmentEligibleDocuments returns an EligibleDocumentList for the specified segment ID. +func (eds *eligibleDocumentSelector) SegmentEligibleDocuments(segmentID int) index.EligibleDocumentList { + if eds.eligibleDocNums == nil || segmentID < 0 || segmentID >= len(eds.eligibleDocNums) { + return emptyEligibleDocumentList + } + bs := eds.eligibleDocNums[segmentID] + if bs == nil { + // no eligible documents for this segment + return emptyEligibleDocumentList + } + return &eligibleDocumentList{ + bs: bs, + } } // AddEligibleDocumentMatch adds a document match to the list of eligible documents. @@ -72,14 +138,19 @@ func (eds *eligibleDocumentSelector) AddEligibleDocumentMatch(id index.IndexInte if err != nil { return err } + // allocate a bitset for this segment if needed + if eds.eligibleDocNums[segIdx] == nil { + // the size of the bitset is the full size of the segment (which is the max local doc num + 1) + eds.eligibleDocNums[segIdx] = bitset.New(uint(eds.is.segment[segIdx].FullSize())) + } // Add the local doc number to the list of eligible doc numbers for this segment. - eds.eligibleDocNums[segIdx] = append(eds.eligibleDocNums[segIdx], docNum) + eds.eligibleDocNums[segIdx].Set(uint(docNum)) return nil } func (is *IndexSnapshot) NewEligibleDocumentSelector() index.EligibleDocumentSelector { return &eligibleDocumentSelector{ - eligibleDocNums: map[int][]uint64{}, + eligibleDocNums: make([]*bitset.BitSet, len(is.segment)), is: is, } } diff --git a/index/scorch/unadorned.go b/index/scorch/unadorned.go index 18ce1c582..a37fb37ff 100644 --- a/index/scorch/unadorned.go +++ b/index/scorch/unadorned.go @@ -38,6 +38,7 @@ func init() { type unadornedPostingsIteratorBitmap struct { actual roaring.IntPeekable actualBM *roaring.Bitmap + next UnadornedPosting // reused across Next() calls } func (i *unadornedPostingsIteratorBitmap) Next() (segment.Posting, error) { @@ -53,7 +54,10 @@ func (i *unadornedPostingsIteratorBitmap) nextAtOrAfter(atOrAfter uint64) (segme if !exists { return nil, nil } - return UnadornedPosting(docNum), nil + i.next = UnadornedPosting{} // clear the struct + rv := &i.next + rv.docNum = docNum + return rv, nil } func (i *unadornedPostingsIteratorBitmap) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool) { @@ -112,8 +116,9 @@ func newUnadornedPostingsIteratorFromBitmap(bm *roaring.Bitmap) segment.Postings const docNum1HitFinished = math.MaxUint64 type unadornedPostingsIterator1Hit struct { - docNumOrig uint64 // original 1-hit docNum used to create this iterator - docNum uint64 // current docNum + docNumOrig uint64 // original 1-hit docNum used to create this iterator + docNum uint64 // current docNum + next UnadornedPosting // reused across Next() calls } func (i *unadornedPostingsIterator1Hit) Next() (segment.Posting, error) { @@ -129,7 +134,10 @@ func (i *unadornedPostingsIterator1Hit) nextAtOrAfter(atOrAfter uint64) (segment if !exists { return nil, nil } - return UnadornedPosting(docNum), nil + i.next = UnadornedPosting{} // clear the struct + rv := &i.next + rv.docNum = docNum + return rv, nil } func (i *unadornedPostingsIterator1Hit) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool) { @@ -176,24 +184,26 @@ type ResetablePostingsIterator interface { ResetIterator() } -type UnadornedPosting uint64 +type UnadornedPosting struct { + docNum uint64 +} -func (p UnadornedPosting) Number() uint64 { - return uint64(p) +func (p *UnadornedPosting) 
Number() uint64 { + return p.docNum } -func (p UnadornedPosting) Frequency() uint64 { +func (p *UnadornedPosting) Frequency() uint64 { return 0 } -func (p UnadornedPosting) Norm() float64 { +func (p *UnadornedPosting) Norm() float64 { return 0 } -func (p UnadornedPosting) Locations() []segment.Location { +func (p *UnadornedPosting) Locations() []segment.Location { return nil } -func (p UnadornedPosting) Size() int { +func (p *UnadornedPosting) Size() int { return reflectStaticSizeUnadornedPosting } diff --git a/index_test.go b/index_test.go index b0d0d8d0c..ae93561a7 100644 --- a/index_test.go +++ b/index_test.go @@ -612,9 +612,9 @@ func TestBytesRead(t *testing.T) { stats, _ := idx.StatsMap()["index"].(map[string]interface{}) prevBytesRead, _ := stats["num_bytes_read_at_query_time"].(uint64) - expectedBytesRead := uint64(22049) + expectedBytesRead := uint64(21164) if supportForVectorSearch { - expectedBytesRead = 22459 + expectedBytesRead = 21574 } if prevBytesRead != expectedBytesRead && res.Cost == prevBytesRead { @@ -770,9 +770,9 @@ func TestBytesReadStored(t *testing.T) { stats, _ := idx.StatsMap()["index"].(map[string]interface{}) bytesRead, _ := stats["num_bytes_read_at_query_time"].(uint64) - expectedBytesRead := uint64(11911) + expectedBytesRead := uint64(11025) if supportForVectorSearch { - expectedBytesRead = 12321 + expectedBytesRead = 11435 } if bytesRead != expectedBytesRead && bytesRead == res.Cost { @@ -847,9 +847,9 @@ func TestBytesReadStored(t *testing.T) { stats, _ = idx1.StatsMap()["index"].(map[string]interface{}) bytesRead, _ = stats["num_bytes_read_at_query_time"].(uint64) - expectedBytesRead = uint64(4097) + expectedBytesRead = uint64(3212) if supportForVectorSearch { - expectedBytesRead = 4507 + expectedBytesRead = 3622 } if bytesRead != expectedBytesRead && bytesRead == res.Cost { diff --git a/search/query/query_string_parser.go b/search/query/query_string_parser.go index 3fb7731b8..8aebedd41 100644 --- a/search/query/query_string_parser.go +++ b/search/query/query_string_parser.go @@ -41,7 +41,7 @@ func parseQuerySyntax(query string) (rq Query, err error) { doParse(lex) if len(lex.errs) > 0 { - return nil, fmt.Errorf(strings.Join(lex.errs, "\n")) + return nil, fmt.Errorf("%s", strings.Join(lex.errs, "\n")) } return lex.query, nil } From 969e20f8ed2a331b266c0899993040517e746d4e Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Fri, 16 Jan 2026 00:51:10 +0530 Subject: [PATCH 2/9] MB-27666: Hierarchy Search (#2224) Add support for nested fields in indexing and querying - Parse and index nested JSON objects - Enable queries on nested fields - Preserve hierarchical relationships in index Requires: - https://github.com/blevesearch/bleve_index_api/pull/70 - https://github.com/blevesearch/bleve_index_api/releases/tag/v1.3.0 - https://github.com/blevesearch/scorch_segment_api/pull/63 - https://github.com/blevesearch/scorch_segment_api/releases/tag/v2.4.0 - https://github.com/blevesearch/zapx/pull/339, https://github.com/blevesearch/zapx/pull/365 - https://github.com/blevesearch/zapx/releases/tag/v17.0.0 - https://github.com/blevesearch/zapx/releases/tag/v16.3.0 Resolves: - https://github.com/blevesearch/bleve/issues/15 - https://github.com/blevesearch/bleve/issues/637 - https://github.com/blevesearch/bleve/issues/1297 --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Abhinav Dangeti --- README.md | 1 + docs/hierarchy.md | 376 ++++++ docs/vectors.md | 4 +- document/document.go | 37 +- go.mod | 8 +- go.sum | 16 +- 
index/scorch/introducer.go | 5 + index/scorch/scorch.go | 6 + index/scorch/snapshot_index.go | 147 ++- index/scorch/snapshot_index_doc.go | 5 +- index/scorch/snapshot_index_tfr.go | 9 +- index/scorch/snapshot_index_vr.go | 9 +- index/scorch/snapshot_segment.go | 21 + index_impl.go | 204 +++- mapping.go | 14 + mapping/document.go | 75 +- mapping/index.go | 87 +- mapping/mapping.go | 19 + mapping/mapping_vectors.go | 1 + registry/nested.go | 136 +++ registry/registry.go | 2 + search.go | 100 +- search/collector/nested.go | 103 ++ search/collector/topn.go | 144 ++- search/explanation.go | 49 + .../highlighter/simple/highlighter_simple.go | 6 +- search/query/conjunction.go | 46 +- search/query/query.go | 9 +- search/scorer/scorer_knn.go | 2 +- search/scorer/scorer_term.go | 2 +- search/search.go | 55 +- search/searcher/search_conjunction_nested.go | 499 ++++++++ search/searcher/search_disjunction_heap.go | 7 +- search/searcher/search_match_all.go | 33 + search/searcher/search_numeric_range.go | 2 +- search/util.go | 122 +- search_knn.go | 18 +- search_nested_test.go | 1046 +++++++++++++++++ search_no_knn.go | 2 +- 39 files changed, 3199 insertions(+), 228 deletions(-) create mode 100644 docs/hierarchy.md create mode 100644 registry/nested.go create mode 100644 search/collector/nested.go create mode 100644 search/searcher/search_conjunction_nested.go create mode 100644 search_nested_test.go diff --git a/README.md b/README.md index e40715f9f..d41c053fc 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ A modern indexing + search library in GO * [geo spatial search](https://github.com/blevesearch/bleve/blob/master/geo/README.md) * approximate k-nearest neighbors via [vector search](https://github.com/blevesearch/bleve/blob/master/docs/vectors.md) * [synonym search](https://github.com/blevesearch/bleve/blob/master/docs/synonyms.md) + * [hierarchical nested search](https://github.com/blevesearch/bleve/blob/master/docs/hierarchy.md) * [tf-idf](https://github.com/blevesearch/bleve/blob/master/docs/scoring.md#tf-idf) / [bm25](https://github.com/blevesearch/bleve/blob/master/docs/scoring.md#bm25) scoring models * Hybrid search: exact + semantic * Query time boosting diff --git a/docs/hierarchy.md b/docs/hierarchy.md new file mode 100644 index 000000000..bcbc259c4 --- /dev/null +++ b/docs/hierarchy.md @@ -0,0 +1,376 @@ +# Hierarchical nested search + +* *v2.6.0* (and after) will come with support for **Array indexing and hierarchy search**. +* We've achieved this by embedding nested documents within our bleve (scorch) indexes. +* Usage of zap file format: [v17](https://github.com/blevesearch/zapx/blob/master/zap.md). Here we preserve hierarchical document relationships within segments, continuing to conform to the segmented architecture of *scorch*. + +## Supported + +* Indexing `Arrays` allows specifying fields that contain arrays of objects. Each object in the array can have its own set of fields, enabling the representation of hierarchical data structures within a single document. + +```json +{ + "id": "1", + "name": "John Doe", + "addresses": [ + { + "type": "home", + "street": "123 Main St", + "city": "Hometown", + "zip": "12345" + }, + { + "type": "work", + "street": "456 Corporate Blvd", + "city": "Metropolis", + "zip": "67890" + } + ] +} +``` + +* Multi-level arrays: Arrays can contain objects that themselves have array fields, allowing for deeply nested structures, such as a list of projects, each with its own list of tasks. 
```json
{
    "id": "2",
    "name": "Jane Smith",
    "projects": [
        {
            "name": "Project Alpha",
            "tasks": [
                {"title": "Task 1", "status": "completed"},
                {"title": "Task 2", "status": "in-progress"}
            ]
        },
        {
            "name": "Project Beta",
            "tasks": [
                {"title": "Task A", "status": "not-started"},
                {"title": "Task B", "status": "completed"}
            ]
        }
    ]
}
```

* Multiple arrays: A document can have multiple fields that are arrays, each representing different hierarchical data, such as a list of phone numbers and a list of email addresses.

```json
{
    "id": "3",
    "name": "Alice Johnson",
    "phones": [
        {"type": "mobile", "number": "555-1234"},
        {"type": "home", "number": "555-5678"}
    ],
    "emails": [
        {"type": "personal", "address": "alice@example.com"},
        {"type": "work", "address": "alice@work.com"}
    ]
}
```

* Hybrid arrays: Multi-level and multiple arrays can be combined within the same document to represent complex hierarchical data structures, such as a company with multiple departments, each having its own list of employees and projects.

```json
{
    "id": "doc1",
    "company": {
        "id": "c1",
        "name": "TechCorp",
        "departments": [
            {
                "name": "Engineering",
                "budget": 2000000,
                "employees": [
                    {"name": "Alice", "role": "Engineer"},
                    {"name": "Bob", "role": "Manager"}
                ],
                "projects": [
                    {"title": "Project X", "status": "ongoing"},
                    {"title": "Project Y", "status": "completed"}
                ]
            },
            {
                "name": "Sales",
                "budget": 300000,
                "employees": [
                    {"name": "Eve", "role": "Salesperson"},
                    {"name": "Mallory", "role": "Manager"}
                ],
                "projects": [
                    {"title": "Project A", "status": "completed"},
                    {"title": "Project B", "status": "ongoing"}
                ]
            }
        ],
        "locations": [
            {"city": "Athens", "country": "Greece"},
            {"city": "Berlin", "country": "USA"}
        ]
    }
}
```

* Earlier versions of Bleve only supported flat arrays of primitive types (e.g., strings, numbers), and would flatten nested structures, losing the hierarchical relationships, so the above complex documents could not be accurately represented or queried. For example, the "employees" and "projects" fields within each department would be flattened, making it impossible to associate employees with their respective departments.

* From v2.6.0 onwards, Bleve allows for accurate representation and querying of complex nested structures, preserving the relationships between different levels of the hierarchy, across multi-level, multiple, and hybrid arrays.

* The addition of `nested` document mappings enables defining fields that contain arrays of objects, giving the option to preserve the hierarchical relationships within the array during indexing. Having `nested` as false (the default) will flatten the objects within the array, losing the hierarchy, which was the earlier behavior.

```json
{
    "departments": {
        "dynamic": false,
        "enabled": true,
        "nested": true,
        "properties": {
            "employees": {
                "dynamic": false,
                "enabled": true,
                "nested": true
            },
            "projects": {
                "dynamic": false,
                "enabled": true,
                "nested": true
            }
        }
    },
    "locations": {
        "dynamic": false,
        "enabled": true,
        "nested": true
    }
}
```

* Any Bleve query (e.g., match, phrase, term, fuzzy, numeric/date range, etc.) can be executed against fields within nested documents, with no special handling required. The query processor will automatically traverse the nested structures to find matches. Additional search constructs like vector search, synonym search, hybrid search, and pre-filtered vector search integrate seamlessly with hierarchy search.

* Conjunction Queries (AND queries) and other queries that depend on term co-occurrence within the same hierarchical context will respect the boundaries of nested documents. This means that terms must appear within the same nested object to be considered a match. For example, a conjunction query searching for an employee named "Alice" with the role "Engineer" within the "Engineering" department will only return results where both name and role terms are found within the same employee object, which is itself within an "Engineering" department object.

* Some other search constructs will have enhanced precision with hierarchy search.
  * Field-Level Highlighting: Only fields within the matched nested object are retrieved and highlighted, ensuring highlights appear in the correct hierarchical context. For example, a match in `departments[name=Engineering].employees` highlights only employees in that department.

  * Nested Faceting / Aggregations: Facets are computed within matched nested objects, producing context-aware buckets. E.g., a facet on `departments.projects.status` returns ongoing or completed only for projects in matched departments.

  * Sorting by Nested Fields: Sorting can use fields from the relevant nested object, e.g., ordering companies by `departments.budget` sorts based on the budget of the specific matched department, not unrelated departments.

* Vector Search (KNN / Multi-KNN): When an array of objects is marked as nested and contains vector fields, each vector is treated as belonging to its own nested document. Vector similarity is computed only within the same nested object, not across siblings. For example, if `departments.employees` is a nested array where each employee has a `skills_vector`, a KNN search using the embedding of `machine learning engineer` will match only employees whose own `skills_vector` is similar; other employees' vectors within the same department or document do not contribute to the score or match. This also means that a vector search query for `K = 3` will return the top 3 most similar employees across all departments and all companies, and may return multiple employees from the same department or company if they rank among the top 3 most similar overall.

* Pre-Filtered Vector Search: When vector search is combined with filters on fields inside a nested array, the filters are applied first to pick which nested items are eligible. The vector search then runs only on those filtered items. For example, if `departments.employees` is a `nested` array, a pre-filtered KNN query for employees with the role `Manager` in the `Sales` department will first narrow the candidate set to only employees who meet those field conditions, and then compute vector similarity on the `skills_vector` of that filtered subset. This ensures that vector search results come only from the employees that satisfy the filter, while still treating each employee as an independent vector candidate. A combined sketch of these constructs follows below.
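The sketch below is illustrative rather than one of Bleve's documented examples: it combines a context-aware facet, a nested-field sort, and a pre-filtered KNN request over the mapping used in this document. It assumes a vector-enabled build of Bleve (the `vectors` build tag) and that `company.departments.employees.skills_vector` has been mapped as a vector field, which the indexing example further below does not set up.

```go
// Filter: only employees with role "Manager" in the "Sales" department.
deptQ := bleve.NewMatchQuery("Sales")
deptQ.SetField("company.departments.name")
roleQ := bleve.NewMatchQuery("Manager")
roleQ.SetField("company.departments.employees.role")
filter := bleve.NewConjunctionQuery(deptQ, roleQ)

req := bleve.NewSearchRequest(filter)

// Facet computed within the matched nested objects: status buckets for
// projects belonging to matched departments only.
req.AddFacet("projectStatus",
	bleve.NewFacetRequest("company.departments.projects.status", 5))

// Sort by the budget of the specific matched department (descending).
req.SortBy([]string{"-company.departments.budget"})

// Pre-filtered KNN: vector similarity is computed only over the employee
// objects that satisfy the filter. The embedding values are placeholders;
// real embeddings have many more dimensions.
queryVec := []float32{0.12, 0.34, 0.56}
req.AddKNNWithFilter("company.departments.employees.skills_vector",
	queryVec, 3, 1.0, filter)

res, err := index.Search(req)
if err != nil {
	panic(err)
}
fmt.Println(res)
```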
## Indexing

Below is an example of using the Bleve API to index documents with hierarchical structures, using hybrid arrays and nested mappings.

```go
// Define a document to be indexed.
docJSON :=
	`{
      "company": {
        "id": "c3",
        "name": "WebSolutions",
        "departments": [
          {
            "name": "HR",
            "budget": 800000,
            "employees": [
              {"name": "Eve", "role": "Manager"},
              {"name": "Frank", "role": "HR"}
            ],
            "projects": [
              {"title": "Project Beta", "status": "completed"},
              {"title": "Project B", "status": "ongoing"}
            ]
          },
          {
            "name": "Engineering",
            "budget": 200000,
            "employees": [
              {"name": "Heidi", "role": "Support Engineer"},
              {"name": "Ivan", "role": "Manager"}
            ],
            "projects": [
              {"title": "Project Helpdesk", "status": "ongoing"},
              {"title": "Project FAQ", "status": "completed"}
            ]
          }
        ],
        "locations": [
          {"city": "Edinburgh", "country": "UK"},
          {"city": "London", "country": "Canada"}
        ]
      }
    }`

// Define departments as a nested document mapping (since it contains arrays of objects)
// and index name and budget fields
departmentsMapping := bleve.NewNestedDocumentMapping()
departmentsMapping.AddFieldMappingsAt("name", bleve.NewTextFieldMapping())
departmentsMapping.AddFieldMappingsAt("budget", bleve.NewNumericFieldMapping())

// Define employees as a nested document mapping within departments (since it contains arrays of objects)
// and index name and role fields
employeesMapping := bleve.NewNestedDocumentMapping()
employeesMapping.AddFieldMappingsAt("name", bleve.NewTextFieldMapping())
employeesMapping.AddFieldMappingsAt("role", bleve.NewTextFieldMapping())
departmentsMapping.AddSubDocumentMapping("employees", employeesMapping)

// Define projects as a nested document mapping within departments (since it contains arrays of objects)
// and index title and status fields
projectsMapping := bleve.NewNestedDocumentMapping()
projectsMapping.AddFieldMappingsAt("title", bleve.NewTextFieldMapping())
projectsMapping.AddFieldMappingsAt("status", bleve.NewTextFieldMapping())
departmentsMapping.AddSubDocumentMapping("projects", projectsMapping)

// Define locations as a nested document mapping (since it contains arrays of objects)
// and index city and country fields
locationsMapping := bleve.NewNestedDocumentMapping()
locationsMapping.AddFieldMappingsAt("city", bleve.NewTextFieldMapping())
locationsMapping.AddFieldMappingsAt("country", bleve.NewTextFieldMapping())

// Define company as a document mapping and index its name field and
// add departments and locations as sub-document mappings
companyMapping := bleve.NewDocumentMapping()
companyMapping.AddFieldMappingsAt("name", bleve.NewTextFieldMapping())
companyMapping.AddSubDocumentMapping("departments", departmentsMapping)
companyMapping.AddSubDocumentMapping("locations", locationsMapping)

// Define the final index mapping and add company as a sub-document mapping in the default mapping
indexMapping := bleve.NewIndexMapping()
indexMapping.DefaultMapping.AddSubDocumentMapping("company", companyMapping)

// Create the index with the defined mapping
index, err := bleve.New("hierarchy_example.bleve", indexMapping)
if err != nil {
	panic(err)
}

// Unmarshal the document JSON into a map, for indexing
var doc map[string]interface{}
err = json.Unmarshal([]byte(docJSON), &doc)
if err != nil {
	panic(err)
}

// Index the document
err = index.Index("doc1", doc)
if err != nil {
	panic(err)
}
```

## Querying

```go
// Open the index
index, err := bleve.Open("hierarchy_example.bleve")
if err != nil {
	panic(err)
}

var (
	req *bleve.SearchRequest
	res *bleve.SearchResult
)

// Example 1: Simple Match Query on a field within a nested
document, should work as if it were a flat field +q1 := bleve.NewMatchQuery("Engineer") +q1.SetField("company.departments.employees.role") +req = bleve.NewSearchRequest(q1) +res, err = index.Search(req) +if err != nil { + panic(err) +} +fmt.Println("Match Query Results:", res) + +// Example 2: Conjunction Query (AND) on fields within the same nested document +// like finding employees with name "Eve" and role "Manager". This will only match +// if both terms are in the same employee object. +q1 = bleve.NewMatchQuery("Eve") +q1.SetField("company.departments.employees.name") +q2 := bleve.NewMatchQuery("Manager") +q2.SetField("company.departments.employees.role") +conjQuery := bleve.NewConjunctionQuery( + q1, + q2, +) +req = bleve.NewSearchRequest(conjQuery) +res, err = index.Search(req) +if err != nil { + panic(err) +} +fmt.Println("Conjunction Query Results:", res) + +// Example 3: Multi-level Nested Query, finding projects with status "ongoing" +// within the "Engineering" department. This ensures both conditions are met +// within the correct hierarchy, i.e., the ongoing project must belong to the +// Engineering department. +q1 = bleve.NewMatchQuery("Engineering") +q1.SetField("company.departments.name") +q2 = bleve.NewMatchQuery("ongoing") +q2.SetField("company.departments.projects.status") +multiLevelQuery := bleve.NewConjunctionQuery( + q1, + q2, +) +req = bleve.NewSearchRequest(multiLevelQuery) +res, err = index.Search(req) +if err != nil { + panic(err) +} +fmt.Println("Multi-level Nested Query Results:", res) + +// Example 4: Multiple Arrays Query, finding documents with a location in "London" +// and an employee with the role "Manager". This checks conditions across different arrays. +q1 = bleve.NewMatchQuery("London") +q1.SetField("company.locations.city") +q2 = bleve.NewMatchQuery("Manager") +q2.SetField("company.departments.employees.role") +multiArrayQuery := bleve.NewConjunctionQuery( + q1, + q2, +) +req = bleve.NewSearchRequest(multiArrayQuery) +res, err = index.Search(req) +if err != nil { + panic(err) +} +fmt.Println("Multiple Arrays Query Results:", res) + +// Hybrid Arrays Query, combining multi-level and multiple arrays, +// finding documents with a Manager named Ivan working in Edinburgh, UK +q1 = bleve.NewMatchQuery("Ivan") +q1.SetField("company.departments.employees.name") +q2 = bleve.NewMatchQuery("Manager") +q2.SetField("company.departments.employees.role") +q3 := bleve.NewMatchQuery("Edinburgh") +q3.SetField("company.locations.city") +q4 := bleve.NewMatchQuery("UK") +q4.SetField("company.locations.country") +hybridArrayQuery := bleve.NewConjunctionQuery( + bleve.NewConjunctionQuery( + q1, + q2, + ), + bleve.NewConjunctionQuery( + q3, + q4, + ), +) +req = bleve.NewSearchRequest(hybridArrayQuery) +res, err = index.Search(req) +if err != nil { + panic(err) +} +fmt.Println("Hybrid Arrays Query Results:", res) + +// Close the index when done +err = index.Close() +if err != nil { + panic(err) +} +``` diff --git a/docs/vectors.md b/docs/vectors.md index 19ff0a6ad..c580b6f00 100644 --- a/docs/vectors.md +++ b/docs/vectors.md @@ -18,7 +18,9 @@ | `v2.4.1`, `v2.4.2` | [blevesearch/faiss@d9db66a](https://github.com/blevesearch/faiss/tree/d9db66a38518d99eb334218697e1df0732f3fdf8) (modified v1.7.4) | | `v2.4.3`, `v2.4.4` | [blevesearch/faiss@b747c55](https://github.com/blevesearch/faiss/tree/b747c55a93a9627039c34d44b081f375dca94e57) (modified v1.8.0) | | `v2.5.0`, `v2.5.1` | 
[blevesearch/faiss@352484e](https://github.com/blevesearch/faiss/tree/352484e0fc9d1f8f46737841efe5f26e0f383f71) (modified v1.10.0) | - | `v2.5.2`, `v2.5.3` | [blevesearch/faiss@b3d4e00](https://github.com/blevesearch/faiss/tree/b3d4e00a69425b95e0b283da7801efc9f66b580d) (modified v1.11.0) | + | `v2.5.2`, `v2.5.3`, `v2.5.4` | [blevesearch/faiss@b3d4e00](https://github.com/blevesearch/faiss/tree/b3d4e00a69425b95e0b283da7801efc9f66b580d) (modified v1.11.0) | + | `v2.5.5`, `v2.5.6`, `v2.5.7` | [blevesearch/faiss@8a59a0c](https://github.com/blevesearch/faiss/tree/8a59a0c552fa2d14fa871f6b6bc793de1d277f5e) (modified v1.12.0) | + | `v2.6.0` | [blevesearch/faiss@608356b](https://github.com/blevesearch/faiss/tree/608356b7c9630e891ff87cc49cc7bb460c3870d3) (modified v1.13.1) | ## Supported diff --git a/document/document.go b/document/document.go index 569d57bd6..7efea56da 100644 --- a/document/document.go +++ b/document/document.go @@ -18,6 +18,7 @@ import ( "fmt" "reflect" + "github.com/blevesearch/bleve/v2/search" "github.com/blevesearch/bleve/v2/size" index "github.com/blevesearch/bleve_index_api" ) @@ -30,8 +31,9 @@ func init() { } type Document struct { - id string `json:"id"` - Fields []Field `json:"fields"` + id string + Fields []Field `json:"fields"` + NestedDocuments []*Document `json:"nested_documents"` CompositeFields []*CompositeField StoredFieldsSize uint64 indexed bool @@ -157,3 +159,34 @@ func (d *Document) SetIndexed() { func (d *Document) Indexed() bool { return d.indexed } + +func (d *Document) AddNestedDocument(doc *Document) { + d.NestedDocuments = append(d.NestedDocuments, doc) +} + +func (d *Document) NestedFields() search.FieldSet { + if len(d.NestedDocuments) == 0 { + return nil + } + fieldSet := search.NewFieldSet() + var collectFields func(index.Document) + collectFields = func(doc index.Document) { + // Add all field names from this nested document + doc.VisitFields(func(field index.Field) { + fieldSet.AddField(field.Name()) + }) + // Recursively collect from this document's nested documents + if nd, ok := doc.(index.NestedDocument); ok { + nd.VisitNestedDocuments(collectFields) + } + } + // Start collection from nested documents only (not root document) + d.VisitNestedDocuments(collectFields) + return fieldSet +} + +func (d *Document) VisitNestedDocuments(visitor func(doc index.Document)) { + for _, doc := range d.NestedDocuments { + visitor(doc) + } +} diff --git a/go.mod b/go.mod index d62d86018..80164509a 100644 --- a/go.mod +++ b/go.mod @@ -5,14 +5,14 @@ go 1.24 require ( github.com/RoaringBitmap/roaring/v2 v2.4.5 github.com/bits-and-blooms/bitset v1.22.0 - github.com/blevesearch/bleve_index_api v1.2.12-0.20260109154621-f19a6d6af728 + github.com/blevesearch/bleve_index_api v1.3.0 github.com/blevesearch/geo v0.2.4 github.com/blevesearch/go-faiss v1.0.27 github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475 github.com/blevesearch/go-porterstemmer v1.0.3 github.com/blevesearch/goleveldb v1.0.1 github.com/blevesearch/gtreap v0.1.1 - github.com/blevesearch/scorch_segment_api/v2 v2.3.14-0.20260109154938-b56b54c737df + github.com/blevesearch/scorch_segment_api/v2 v2.4.0 github.com/blevesearch/segment v0.9.1 github.com/blevesearch/snowball v0.6.1 github.com/blevesearch/snowballstem v0.9.0 @@ -24,8 +24,8 @@ require ( github.com/blevesearch/zapx/v13 v13.4.2 github.com/blevesearch/zapx/v14 v14.4.2 github.com/blevesearch/zapx/v15 v15.4.2 - github.com/blevesearch/zapx/v16 v16.2.8 - github.com/blevesearch/zapx/v17 v17.0.0-20260112205515-7d8cac80436c + 
github.com/blevesearch/zapx/v16 v16.3.0 + github.com/blevesearch/zapx/v17 v17.0.0 github.com/couchbase/moss v0.2.0 github.com/golang/protobuf v1.3.2 github.com/spf13/cobra v1.8.1 diff --git a/go.sum b/go.sum index 4eca460aa..d7ecb6509 100644 --- a/go.sum +++ b/go.sum @@ -3,8 +3,8 @@ github.com/RoaringBitmap/roaring/v2 v2.4.5/go.mod h1:FiJcsfkGje/nZBZgCu0ZxCPOKD/ github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCkcs2uw7w4= github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= -github.com/blevesearch/bleve_index_api v1.2.12-0.20260109154621-f19a6d6af728 h1:qFnvr+SqVOCbhMl5sVynhuwVkv1yrc7Vhrn8lVdw1nU= -github.com/blevesearch/bleve_index_api v1.2.12-0.20260109154621-f19a6d6af728/go.mod h1:xvd48t5XMeeioWQ5/jZvgLrV98flT2rdvEJ3l/ki4Ko= +github.com/blevesearch/bleve_index_api v1.3.0 h1:DsMpWVjFNlBw9/6pyWf59XoqcAkhHj3H0UWiQsavb6E= +github.com/blevesearch/bleve_index_api v1.3.0/go.mod h1:xvd48t5XMeeioWQ5/jZvgLrV98flT2rdvEJ3l/ki4Ko= github.com/blevesearch/geo v0.2.4 h1:ECIGQhw+QALCZaDcogRTNSJYQXRtC8/m8IKiA706cqk= github.com/blevesearch/geo v0.2.4/go.mod h1:K56Q33AzXt2YExVHGObtmRSFYZKYGv0JEN5mdacJJR8= github.com/blevesearch/go-faiss v1.0.27 h1:7cBImYDDQ82WJd5RUZ1ie6zXztCsC73W94ZzwOjkatk= @@ -20,8 +20,8 @@ github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgY github.com/blevesearch/mmap-go v1.0.2/go.mod h1:ol2qBqYaOUsGdm7aRMRrYGgPvnwLe6Y+7LMvAB5IbSA= github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc= github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs= -github.com/blevesearch/scorch_segment_api/v2 v2.3.14-0.20260109154938-b56b54c737df h1:gBuVkzZLUpGJGnCBRgY0ruZVjppD7WaQLeHZei7QQnU= -github.com/blevesearch/scorch_segment_api/v2 v2.3.14-0.20260109154938-b56b54c737df/go.mod h1:f8fXitmMpzgNziIMqUlpTrfPxVVDN8at9k7POEohvJU= +github.com/blevesearch/scorch_segment_api/v2 v2.4.0 h1:OtipwURRzZv6UFmHQnbEqOY90eotINQ2TtSSpWfYuWU= +github.com/blevesearch/scorch_segment_api/v2 v2.4.0/go.mod h1:JalWE/eyEgISwhqtKXoaHMKf5t+F4kXiYrgg0ds3ylw= github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU= github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw= github.com/blevesearch/snowball v0.6.1 h1:cDYjn/NCH+wwt2UdehaLpr2e4BwLIjN4V/TdLsL+B5A= @@ -44,10 +44,10 @@ github.com/blevesearch/zapx/v14 v14.4.2 h1:2SGHakVKd+TrtEqpfeq8X+So5PShQ5nW6GNxT github.com/blevesearch/zapx/v14 v14.4.2/go.mod h1:rz0XNb/OZSMjNorufDGSpFpjoFKhXmppH9Hi7a877D8= github.com/blevesearch/zapx/v15 v15.4.2 h1:sWxpDE0QQOTjyxYbAVjt3+0ieu8NCE0fDRaFxEsp31k= github.com/blevesearch/zapx/v15 v15.4.2/go.mod h1:1pssev/59FsuWcgSnTa0OeEpOzmhtmr/0/11H0Z8+Nw= -github.com/blevesearch/zapx/v16 v16.2.8 h1:SlnzF0YGtSlrsOE3oE7EgEX6BIepGpeqxs1IjMbHLQI= -github.com/blevesearch/zapx/v16 v16.2.8/go.mod h1:murSoCJPCk25MqURrcJaBQ1RekuqSCSfMjXH4rHyA14= -github.com/blevesearch/zapx/v17 v17.0.0-20260112205515-7d8cac80436c h1:OfYh0noLbJmt6k2tqYlnSU3zMZEJbFfbSClSGG59A/M= -github.com/blevesearch/zapx/v17 v17.0.0-20260112205515-7d8cac80436c/go.mod h1:ybWwo00MGrNJuFDnl9smEBVUCZmNANf0+E/QVBmfBTs= +github.com/blevesearch/zapx/v16 v16.3.0 h1:hF6VlN15E9CB40RMPyqOIhlDw1OOo9RItumhKMQktxw= +github.com/blevesearch/zapx/v16 v16.3.0/go.mod h1:zCFjv7McXWm1C8rROL+3mUoD5WYe2RKsZP3ufqcYpLY= +github.com/blevesearch/zapx/v17 v17.0.0 
h1:srLJFkv5ghz1Z8iVz5uoOK89G2NvI4KdMG7aF3Cx7rE= +github.com/blevesearch/zapx/v17 v17.0.0/go.mod h1:/pi9Gq7byQcduhNB6Vk08+ZXGVGPjZoNc5QnQY8lkOo= github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps= github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k= github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o= diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 209da5b8d..58cf63b93 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -167,6 +167,11 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { newss.deleted = nil } + // update the deleted bitmap to include any nested/sub-documents as well + // if the segment supports that + if ns, ok := newss.segment.(segment.NestedSegment); ok { + newss.deleted = ns.AddNestedDocuments(newss.deleted) + } // check for live size before copying if newss.LiveSize() > 0 { newSnapshot.segment = append(newSnapshot.segment, newss) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 54dcb9274..db13a1a50 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -764,6 +764,12 @@ func analyze(d index.Document, fn customAnalyzerPluginInitFunc) { } } }) + if nd, ok := d.(index.NestedDocument); ok { + nd.VisitNestedDocuments(func(doc index.Document) { + doc.AddIDField() + analyze(doc, fn) + }) + } } func (s *Scorch) AddEligibleForRemoval(epoch uint64) { diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 358f14102..3c5170bbc 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -17,7 +17,6 @@ package scorch import ( "container/heap" "context" - "encoding/binary" "fmt" "os" "path/filepath" @@ -42,9 +41,8 @@ type asynchSegmentResult struct { dict segment.TermDictionary dictItr segment.DictionaryIterator - cardinality int - index int - docs *roaring.Bitmap + index int + docs *roaring.Bitmap thesItr segment.ThesaurusIterator @@ -59,11 +57,11 @@ func init() { var err error lb1, err = lev.NewLevenshteinAutomatonBuilder(1, true) if err != nil { - panic(fmt.Errorf("Levenshtein automaton ed1 builder err: %v", err)) + panic(fmt.Errorf("levenshtein automaton ed1 builder err: %v", err)) } lb2, err = lev.NewLevenshteinAutomatonBuilder(2, true) if err != nil { - panic(fmt.Errorf("Levenshtein automaton ed2 builder err: %v", err)) + panic(fmt.Errorf("levenshtein automaton ed2 builder err: %v", err)) } } @@ -464,7 +462,7 @@ func (is *IndexSnapshot) GetInternal(key []byte) ([]byte, error) { func (is *IndexSnapshot) DocCount() (uint64, error) { var rv uint64 for _, segment := range is.segment { - rv += segment.Count() + rv += segment.CountRoot() } return rv, nil } @@ -491,7 +489,7 @@ func (is *IndexSnapshot) Document(id string) (rv index.Document, err error) { return nil, nil } - docNum, err := docInternalToNumber(next.ID) + docNum, err := next.ID.Value() if err != nil { return nil, err } @@ -554,7 +552,7 @@ func (is *IndexSnapshot) segmentIndexAndLocalDocNumFromGlobal(docNum uint64) (in } func (is *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) { - docNum, err := docInternalToNumber(id) + docNum, err := id.Value() if err != nil { return "", err } @@ -572,7 +570,7 @@ func (is *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) { } func (is *IndexSnapshot) segmentIndexAndLocalDocNum(id index.IndexInternalID) (int, uint64, error) { - docNum, err := docInternalToNumber(id) + docNum, err := id.Value() if err 
!= nil { return 0, 0, err } @@ -749,25 +747,6 @@ func (is *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReade is.m2.Unlock() } -func docNumberToBytes(buf []byte, in uint64) []byte { - if len(buf) != 8 { - if cap(buf) >= 8 { - buf = buf[0:8] - } else { - buf = make([]byte, 8) - } - } - binary.BigEndian.PutUint64(buf, in) - return buf -} - -func docInternalToNumber(in index.IndexInternalID) (uint64, error) { - if len(in) != 8 { - return 0, fmt.Errorf("wrong len for IndexInternalID: %q", in) - } - return binary.BigEndian.Uint64(in), nil -} - func (is *IndexSnapshot) documentVisitFieldTermsOnSegment( segmentIndex int, localDocNum uint64, fields []string, cFields []string, visitor index.DocValueVisitor, dvs segment.DocVisitState) ( @@ -853,7 +832,7 @@ func (dvr *DocValueReader) BytesRead() uint64 { func (dvr *DocValueReader) VisitDocValues(id index.IndexInternalID, visitor index.DocValueVisitor, ) (err error) { - docNum, err := docInternalToNumber(id) + docNum, err := id.Value() if err != nil { return err } @@ -1163,3 +1142,111 @@ func (is *IndexSnapshot) ThesaurusKeysRegexp(name string, func (is *IndexSnapshot) UpdateSynonymSearchCount(delta uint64) { atomic.AddUint64(&is.parent.stats.TotSynonymSearches, delta) } + +// Update current snapshot updated field data as well as pass it on to all segments and segment bases +func (is *IndexSnapshot) UpdateFieldsInfo(updatedFields map[string]*index.UpdateFieldInfo) { + is.m.Lock() + defer is.m.Unlock() + + is.MergeUpdateFieldsInfo(updatedFields) + + for _, segmentSnapshot := range is.segment { + segmentSnapshot.UpdateFieldsInfo(is.updatedFields) + } +} + +// Merge given updated field information with existing updated field information +func (is *IndexSnapshot) MergeUpdateFieldsInfo(updatedFields map[string]*index.UpdateFieldInfo) { + if is.updatedFields == nil { + is.updatedFields = updatedFields + } else { + for fieldName, info := range updatedFields { + if val, ok := is.updatedFields[fieldName]; ok { + val.Deleted = val.Deleted || info.Deleted + val.Index = val.Index || info.Index + val.DocValues = val.DocValues || info.DocValues + val.Store = val.Store || info.Store + } else { + is.updatedFields[fieldName] = info + } + } + } +} + +// TermFrequencies returns the top N terms ordered by the frequencies +// for a given field across all segments in the index snapshot. 
+func (is *IndexSnapshot) TermFrequencies(field string, limit int, descending bool) ( + termFreqs []index.TermFreq, err error) { + if len(is.segment) == 0 { + return nil, nil + } + + if limit <= 0 { + return nil, fmt.Errorf("limit must be positive") + } + + // Use FieldDict which aggregates term frequencies across all segments + fieldDict, err := is.FieldDict(field) + if err != nil { + return nil, fmt.Errorf("failed to get field dictionary for field %s: %v", field, err) + } + defer fieldDict.Close() + + // Preallocate slice with capacity equal to the number of unique terms + // in the field dictionary + termFreqs = make([]index.TermFreq, 0, fieldDict.Cardinality()) + + // Iterate through all terms using FieldDict + for { + dictEntry, err := fieldDict.Next() + if err != nil { + return nil, fmt.Errorf("error iterating field dictionary: %v", err) + } + if dictEntry == nil { + break // End of terms + } + + termFreqs = append(termFreqs, index.TermFreq{ + Term: dictEntry.Term, + Frequency: dictEntry.Count, + }) + } + + // Sort by frequency (descending or ascending) + sort.Slice(termFreqs, func(i, j int) bool { + if termFreqs[i].Frequency == termFreqs[j].Frequency { + // If frequencies are equal, sort by term lexicographically + return termFreqs[i].Term < termFreqs[j].Term + } + if descending { + return termFreqs[i].Frequency > termFreqs[j].Frequency + } + return termFreqs[i].Frequency < termFreqs[j].Frequency + }) + + if limit >= len(termFreqs) { + return termFreqs, nil + } + + return termFreqs[:limit], nil +} + +// Ancestors returns the ancestor IDs for the given document ID. The prealloc +// slice can be provided to avoid allocations downstream, and MUST be empty. +func (i *IndexSnapshot) Ancestors(ID index.IndexInternalID, prealloc []index.AncestorID) ([]index.AncestorID, error) { + // get segment and local doc num for the ID + seg, ldoc, err := i.segmentIndexAndLocalDocNum(ID) + if err != nil { + return nil, err + } + // get ancestors from the segment + prealloc = i.segment[seg].Ancestors(ldoc, prealloc) + // get global offset for the segment (correcting factor for multi-segment indexes) + globalOffset := i.offsets[seg] + // adjust ancestors to global doc numbers, not local to segment + for idx := range prealloc { + prealloc[idx] = prealloc[idx].Add(globalOffset) + } + // return adjusted ancestors + return prealloc, nil +} diff --git a/index/scorch/snapshot_index_doc.go b/index/scorch/snapshot_index_doc.go index 0a979bfb5..4048a199b 100644 --- a/index/scorch/snapshot_index_doc.go +++ b/index/scorch/snapshot_index_doc.go @@ -15,7 +15,6 @@ package scorch import ( - "bytes" "reflect" "github.com/RoaringBitmap/roaring/v2" @@ -49,7 +48,7 @@ func (i *IndexSnapshotDocIDReader) Next() (index.IndexInternalID, error) { next := i.iterators[i.segmentOffset].Next() // make segment number into global number by adding offset globalOffset := i.snapshot.offsets[i.segmentOffset] - return docNumberToBytes(nil, uint64(next)+globalOffset), nil + return index.NewIndexInternalID(nil, uint64(next)+globalOffset), nil } return nil, nil } @@ -63,7 +62,7 @@ func (i *IndexSnapshotDocIDReader) Advance(ID index.IndexInternalID) (index.Inde if next == nil { return nil, nil } - for bytes.Compare(next, ID) < 0 { + for next.Compare(ID) < 0 { next, err = i.Next() if err != nil { return nil, err diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index be46290bd..c81f05338 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -15,7 +15,6 @@ package scorch 
import ( - "bytes" "context" "fmt" "reflect" @@ -103,7 +102,7 @@ func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*in // make segment number into global number by adding offset globalOffset := i.snapshot.offsets[i.segmentOffset] nnum := next.Number() - rv.ID = docNumberToBytes(rv.ID, nnum+globalOffset) + rv.ID = index.NewIndexInternalID(rv.ID, nnum+globalOffset) i.postingToTermFieldDoc(next, rv) i.currID = rv.ID @@ -157,7 +156,7 @@ func (i *IndexSnapshotTermFieldReader) postingToTermFieldDoc(next segment.Postin func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) { // FIXME do something better // for now, if we need to seek backwards, then restart from the beginning - if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 { + if i.currPosting != nil && i.currID.Compare(ID) >= 0 { // Check if the TFR is a special unadorned composite optimization. // Such a TFR will NOT have a valid `term` or `field` set, making it // impossible for the TFR to replace itself with a new one. @@ -182,7 +181,7 @@ func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAllo } } } - num, err := docInternalToNumber(ID) + num, err := ID.Value() if err != nil { return nil, fmt.Errorf("error converting to doc number % x - %v", ID, err) } @@ -207,7 +206,7 @@ func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAllo if preAlloced == nil { preAlloced = &index.TermFieldDoc{} } - preAlloced.ID = docNumberToBytes(preAlloced.ID, next.Number()+ + preAlloced.ID = index.NewIndexInternalID(preAlloced.ID, next.Number()+ i.snapshot.offsets[segIndex]) i.postingToTermFieldDoc(next, preAlloced) i.currID = preAlloced.ID diff --git a/index/scorch/snapshot_index_vr.go b/index/scorch/snapshot_index_vr.go index 7c6741125..78e4b151e 100644 --- a/index/scorch/snapshot_index_vr.go +++ b/index/scorch/snapshot_index_vr.go @@ -18,7 +18,6 @@ package scorch import ( - "bytes" "context" "encoding/json" "fmt" @@ -91,7 +90,7 @@ func (i *IndexSnapshotVectorReader) Next(preAlloced *index.VectorDoc) ( // make segment number into global number by adding offset globalOffset := i.snapshot.offsets[i.segmentOffset] nnum := next.Number() - rv.ID = docNumberToBytes(rv.ID, nnum+globalOffset) + rv.ID = index.NewIndexInternalID(rv.ID, nnum+globalOffset) rv.Score = float64(next.Score()) i.currID = rv.ID @@ -108,7 +107,7 @@ func (i *IndexSnapshotVectorReader) Next(preAlloced *index.VectorDoc) ( func (i *IndexSnapshotVectorReader) Advance(ID index.IndexInternalID, preAlloced *index.VectorDoc) (*index.VectorDoc, error) { - if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 { + if i.currPosting != nil && i.currID.Compare(ID) >= 0 { i2, err := i.snapshot.VectorReader(i.ctx, i.vector, i.field, i.k, i.searchParams, i.eligibleSelector) if err != nil { @@ -119,7 +118,7 @@ func (i *IndexSnapshotVectorReader) Advance(ID index.IndexInternalID, *i = *(i2.(*IndexSnapshotVectorReader)) } - num, err := docInternalToNumber(ID) + num, err := ID.Value() if err != nil { return nil, fmt.Errorf("error converting to doc number % x - %v", ID, err) } @@ -144,7 +143,7 @@ func (i *IndexSnapshotVectorReader) Advance(ID index.IndexInternalID, if preAlloced == nil { preAlloced = &index.VectorDoc{} } - preAlloced.ID = docNumberToBytes(preAlloced.ID, next.Number()+ + preAlloced.ID = index.NewIndexInternalID(preAlloced.ID, next.Number()+ i.snapshot.offsets[segIndex]) i.currID = preAlloced.ID i.currPosting = next diff --git 
a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index ec65bf800..f4a4a7ae8 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -112,6 +112,19 @@ func (s *SegmentSnapshot) Count() uint64 { return rv } +// CountRoot counts the live root documents in the segment. This differs from Count() in that +// Count() counts all live documents including nested children, whereas this method +// counts only live root documents +func (s *SegmentSnapshot) CountRoot() uint64 { + var rv uint64 + if nsb, ok := s.segment.(segment.NestedSegment); ok { + rv = nsb.CountRoot(s.deleted) + } else { + rv = s.Count() + } + return rv +} + func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) { rv, err := s.segment.DocNumbers(docIDs) if err != nil { @@ -338,3 +351,11 @@ func (c *cachedMeta) fetchMeta(field string) (rv interface{}) { c.m.RUnlock() return rv } + +func (s *SegmentSnapshot) Ancestors(docNum uint64, prealloc []index.AncestorID) []index.AncestorID { + nsb, ok := s.segment.(segment.NestedSegment) + if !ok { + return append(prealloc, index.NewAncestorID(docNum)) + } + return nsb.Ancestors(docNum, prealloc) +} diff --git a/index_impl.go b/index_impl.go index 5cc0c5899..1f75f190a 100644 --- a/index_impl.go +++ b/index_impl.go @@ -509,8 +509,7 @@ func (i *indexImpl) preSearch(ctx context.Context, req *SearchRequest, reader in return nil, err } - fs := make(query.FieldSet) - fs, err := query.ExtractFields(req.Query, i.m, fs) + fs, err := query.ExtractFields(req.Query, i.m, search.NewFieldSet()) if err != nil { return nil, err } @@ -561,9 +560,64 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr err = cerr } }() + // ------------------------------------------------------------------------------------------ + // set up additional contexts for any search operation that will proceed from + // here, such as presearch, knn collector, topn collector etc. + + // Scoring model callback to be used to get scoring model + scoringModelCallback := func() string { + if isBM25Enabled(i.m) { + return index.BM25Scoring + } + return index.DefaultScoringModel + } + ctx = context.WithValue(ctx, search.GetScoringModelCallbackKey, + search.GetScoringModelCallbackFn(scoringModelCallback)) + + // This callback and variable handle the tracking of bytes read + // 1. as part of creation of tfr and its Next() calls which is + // accounted by invoking this callback when the TFR is closed. + // 2.
the docvalues portion (accounted in collector) and the retrieval + // of stored fields bytes (by LoadAndHighlightFields) + var totalSearchCost uint64 + sendBytesRead := func(bytesRead uint64) { + totalSearchCost += bytesRead + } + // Ensure IO cost accounting and result cost assignment happen on all return paths + defer func() { + if sr != nil { + sr.Cost = totalSearchCost + } + if is, ok := indexReader.(*scorch.IndexSnapshot); ok { + is.UpdateIOStats(totalSearchCost) + } + search.RecordSearchCost(ctx, search.DoneM, 0) + }() + + ctx = context.WithValue(ctx, search.SearchIOStatsCallbackKey, search.SearchIOStatsCallbackFunc(sendBytesRead)) + + // Geo buffer pool callback to be used for getting geo buffer pool + var bufPool *s2.GeoBufferPool + getBufferPool := func() *s2.GeoBufferPool { + if bufPool == nil { + bufPool = s2.NewGeoBufferPool(search.MaxGeoBufPoolSize, search.MinGeoBufPoolSize) + } + + return bufPool + } + + ctx = context.WithValue(ctx, search.GeoBufferPoolCallbackKey, search.GeoBufferPoolCallbackFunc(getBufferPool)) + // check if the index mapping has any nested fields, which should force + // all collectors and searchers to be run in nested mode + if nm, ok := i.m.(mapping.NestedMapping); ok { + if nm.CountNested() > 0 { + ctx = context.WithValue(ctx, search.NestedSearchKey, true) + } + } + // ------------------------------------------------------------------------------------------ if _, ok := ctx.Value(search.PreSearchKey).(bool); ok { - preSearchResult, err := i.preSearch(ctx, req, indexReader) + sr, err = i.preSearch(ctx, req, indexReader) if err != nil { return nil, err } @@ -577,7 +631,8 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr // time stat searchDuration := time.Since(searchStart) atomic.AddUint64(&i.stats.searchTime, uint64(searchDuration)) - return preSearchResult, nil + + return sr, nil } var reverseQueryExecution bool @@ -588,11 +643,9 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr req.SearchBefore = nil } - var coll *collector.TopNCollector - if req.SearchAfter != nil { - coll = collector.NewTopNCollectorAfter(req.Size, req.Sort, req.SearchAfter) - } else { - coll = collector.NewTopNCollector(req.Size, req.From, req.Sort) + coll, err := i.buildTopNCollector(ctx, req, indexReader) + if err != nil { + return nil, err } var knnHits []*search.DocumentMatch @@ -650,7 +703,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr } } - setKnnHitsInCollector(knnHits, req, coll) + setKnnHitsInCollector(knnHits, coll) if fts != nil { if is, ok := indexReader.(*scorch.IndexSnapshot); ok { @@ -659,44 +712,12 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr ctx = context.WithValue(ctx, search.FieldTermSynonymMapKey, fts) } - scoringModelCallback := func() string { - if isBM25Enabled(i.m) { - return index.BM25Scoring - } - return index.DefaultScoringModel - } - ctx = context.WithValue(ctx, search.GetScoringModelCallbackKey, - search.GetScoringModelCallbackFn(scoringModelCallback)) - // set the bm25Stats (stats important for consistent scoring) in // the context object if bm25Stats != nil { ctx = context.WithValue(ctx, search.BM25StatsKey, bm25Stats) } - // This callback and variable handles the tracking of bytes read - // 1. as part of creation of tfr and its Next() calls which is - // accounted by invoking this callback when the TFR is closed. - // 2. 
the docvalues portion (accounted in collector) and the retrieval - // of stored fields bytes (by LoadAndHighlightFields) - var totalSearchCost uint64 - sendBytesRead := func(bytesRead uint64) { - totalSearchCost += bytesRead - } - - ctx = context.WithValue(ctx, search.SearchIOStatsCallbackKey, search.SearchIOStatsCallbackFunc(sendBytesRead)) - - var bufPool *s2.GeoBufferPool - getBufferPool := func() *s2.GeoBufferPool { - if bufPool == nil { - bufPool = s2.NewGeoBufferPool(search.MaxGeoBufPoolSize, search.MinGeoBufPoolSize) - } - - return bufPool - } - - ctx = context.WithValue(ctx, search.GeoBufferPoolCallbackKey, search.GeoBufferPoolCallbackFunc(getBufferPool)) - searcher, err := req.Query.Searcher(ctx, indexReader, i.m, search.SearcherOptions{ Explain: req.Explain, IncludeTermVectors: req.IncludeLocations || req.Highlight != nil, @@ -709,14 +730,6 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr if serr := searcher.Close(); err == nil && serr != nil { err = serr } - if sr != nil { - sr.Cost = totalSearchCost - } - if sr, ok := indexReader.(*scorch.IndexSnapshot); ok { - sr.UpdateIOStats(totalSearchCost) - } - - search.RecordSearchCost(ctx, search.DoneM, 0) }() if req.Facets != nil { @@ -811,7 +824,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr if i.name != "" && hit.Index == "" { hit.Index = i.name } - err, storedFieldsBytes := LoadAndHighlightFields(hit, req, i.name, indexReader, highlighter) + err, storedFieldsBytes := LoadAndHighlightAllFields(hit, req, i.name, indexReader, highlighter) if err != nil { return nil, err } @@ -972,6 +985,56 @@ func LoadAndHighlightFields(hit *search.DocumentMatch, req *SearchRequest, return nil, totalStoredFieldsBytes } +const NestedDocumentKey = "_$nested" + +// LoadAndHighlightAllFields loads stored fields + highlights for root and its descendants. +// All descendant documents are collected into a _$nested array in the root DocumentMatch. +func LoadAndHighlightAllFields( + root *search.DocumentMatch, + req *SearchRequest, + indexName string, + r index.IndexReader, + highlighter highlight.Highlighter, +) (error, uint64) { + var totalStoredFieldsBytes uint64 + // load root fields/highlights + err, bytes := LoadAndHighlightFields(root, req, indexName, r, highlighter) + totalStoredFieldsBytes += bytes + if err != nil { + return err, totalStoredFieldsBytes + } + // collect all descendant documents + nestedDocs := make([]*search.NestedDocumentMatch, 0, len(root.Descendants)) + // create a dummy desc DocumentMatch to reuse LoadAndHighlightFields + desc := &search.DocumentMatch{} + for _, descID := range root.Descendants { + extID, err := r.ExternalID(descID) + if err != nil { + return err, totalStoredFieldsBytes + } + // reset desc for reuse + desc.ID = extID + desc.IndexInternalID = descID + desc.Locations = root.Locations + err, bytes := LoadAndHighlightFields(desc, req, indexName, r, highlighter) + totalStoredFieldsBytes += bytes + if err != nil { + return err, totalStoredFieldsBytes + } + // copy fields to nested doc and append + if len(desc.Fields) != 0 || len(desc.Fragments) != 0 { + nestedDocs = append(nestedDocs, search.NewNestedDocumentMatch(desc.Fields, desc.Fragments)) + } + desc.Fields = nil + desc.Fragments = nil + } + // add nested documents to root under _$nested key + if len(nestedDocs) > 0 { + root.AddFieldValue(NestedDocumentKey, nestedDocs) + } + return nil, totalStoredFieldsBytes +} + // Fields returns the name of all the fields this // Index has operated on. 
func (i *indexImpl) Fields() (fields []string, err error) { @@ -1192,6 +1255,9 @@ func (f *indexImplFieldDict) Cardinality() int { // helper function to remove duplicate entries from slice of strings func deDuplicate(fields []string) []string { + if len(fields) == 0 { + return fields + } entries := make(map[string]struct{}) ret := []string{} for _, entry := range fields { @@ -1286,3 +1352,39 @@ func (i *indexImpl) FireIndexEvent() { internalEventIndex.FireIndexEvent() } } + +func (i *indexImpl) buildTopNCollector(ctx context.Context, req *SearchRequest, reader index.IndexReader) (*collector.TopNCollector, error) { + newCollector := func() *collector.TopNCollector { + if req.SearchAfter != nil { + return collector.NewTopNCollectorAfter(req.Size, req.Sort, req.SearchAfter) + } + return collector.NewTopNCollector(req.Size, req.From, req.Sort) + } + + newNestedCollector := func(nr index.NestedReader) *collector.TopNCollector { + if req.SearchAfter != nil { + return collector.NewNestedTopNCollectorAfter(req.Size, req.Sort, req.SearchAfter, nr) + } + return collector.NewNestedTopNCollector(req.Size, req.From, req.Sort, nr) + } + + // check if we are in nested mode + if nestedMode, ok := ctx.Value(search.NestedSearchKey).(bool); ok && nestedMode { + // get the nested reader from the index reader + if nr, ok := reader.(index.NestedReader); ok { + // check if the mapping has any nested fields that intersect + if nm, ok := i.m.(mapping.NestedMapping); ok { + var fs search.FieldSet + var err error + fs, err = query.ExtractFields(req.Query, i.m, fs) + if err != nil { + return nil, err + } + if nm.IntersectsPrefix(fs) { + return newNestedCollector(nr), nil + } + } + } + } + return newCollector(), nil +} diff --git a/mapping.go b/mapping.go index 723105a29..af02db386 100644 --- a/mapping.go +++ b/mapping.go @@ -34,6 +34,20 @@ func NewDocumentStaticMapping() *mapping.DocumentMapping { return mapping.NewDocumentStaticMapping() } +// NewNestedDocumentMapping returns a new document mapping +// that will treat all objects as nested documents. +func NewNestedDocumentMapping() *mapping.DocumentMapping { + return mapping.NewNestedDocumentMapping() +} + +// NewNestedDocumentStaticMapping returns a new document mapping +// that will treat all objects as nested documents and +// will not automatically index parts of a nested document +// without an explicit mapping. +func NewNestedDocumentStaticMapping() *mapping.DocumentMapping { + return mapping.NewNestedDocumentStaticMapping() +} + // NewDocumentDisabledMapping returns a new document // mapping that will not perform any indexing. func NewDocumentDisabledMapping() *mapping.DocumentMapping { diff --git a/mapping/document.go b/mapping/document.go index a78b27e11..3da925038 100644 --- a/mapping/document.go +++ b/mapping/document.go @@ -22,6 +22,7 @@ import ( "reflect" "time" + "github.com/blevesearch/bleve/v2/document" "github.com/blevesearch/bleve/v2/registry" "github.com/blevesearch/bleve/v2/util" ) @@ -44,6 +45,7 @@ type DocumentMapping struct { Dynamic bool `json:"dynamic"` Properties map[string]*DocumentMapping `json:"properties,omitempty"` Fields []*FieldMapping `json:"fields,omitempty"` + Nested bool `json:"nested,omitempty"` DefaultAnalyzer string `json:"default_analyzer,omitempty"` DefaultSynonymSource string `json:"default_synonym_source,omitempty"` @@ -230,6 +232,17 @@ func NewDocumentMapping() *DocumentMapping { } } +// NewNestedDocumentMapping returns a new document +// mapping that treats sub-documents as nested +// objects. 
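+// A minimal usage sketch (the "items" property name is an assumption for +// illustration): +// +//	im := NewIndexMapping() +//	im.DefaultMapping.AddSubDocumentMapping("items", NewNestedDocumentMapping()) +// +// With such a mapping, each element of a document's "items" array that is a +// JSON object is indexed as its own nested child document rather than being +// flattened into the parent.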
+func NewNestedDocumentMapping() *DocumentMapping { + return &DocumentMapping{ + Nested: true, + Enabled: true, + Dynamic: true, + } +} + // NewDocumentStaticMapping returns a new document // mapping that will not automatically index parts // of a document without an explicit mapping. @@ -239,6 +252,17 @@ func NewDocumentStaticMapping() *DocumentMapping { } } +// NewNestedDocumentStaticMapping returns a new document +// mapping that treats sub-documents as nested +// objects and will not automatically index parts +// of the nested document without an explicit mapping. +func NewNestedDocumentStaticMapping() *DocumentMapping { + return &DocumentMapping{ + Enabled: true, + Nested: true, + } +} + // NewDocumentDisabledMapping returns a new document // mapping that will not perform any indexing. func NewDocumentDisabledMapping() *DocumentMapping { @@ -312,6 +336,11 @@ func (dm *DocumentMapping) UnmarshalJSON(data []byte) error { if err != nil { return err } + case "nested": + err := util.UnmarshalJSON(v, &dm.Nested) + if err != nil { + return err + } case "default_analyzer": err := util.UnmarshalJSON(v, &dm.DefaultAnalyzer) if err != nil { @@ -381,6 +410,18 @@ func (dm *DocumentMapping) defaultSynonymSource(path []string) string { return rv } +// baseType returns the base type of v by dereferencing pointers +func baseType(v interface{}) reflect.Type { + if v == nil { + return nil + } + t := reflect.TypeOf(v) + for t.Kind() == reflect.Pointer { + t = t.Elem() + } + return t +} + func (dm *DocumentMapping) walkDocument(data interface{}, path []string, indexes []uint64, context *walkContext) { // allow default "json" tag to be overridden structTagKey := dm.StructTagKey @@ -434,11 +475,39 @@ func (dm *DocumentMapping) walkDocument(data interface{}, path []string, indexes } } case reflect.Slice, reflect.Array: + subDocMapping, _ := dm.documentMappingForPathElements(path) + allowNested := subDocMapping != nil && subDocMapping.Nested for i := 0; i < val.Len(); i++ { - if val.Index(i).CanInterface() { - fieldVal := val.Index(i).Interface() - dm.processProperty(fieldVal, path, append(indexes, uint64(i)), context) + // for each array element, check if it can be represented as an interface + idxVal := val.Index(i) + // skip invalid values + if !idxVal.CanInterface() { + continue + } + // get the actual value in interface form + actual := idxVal.Interface() + // if nested mapping, only create nested document for object elements + if allowNested && actual != nil { + // check the kind of the actual value, is it an object (struct or map)? 
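+ // baseType (defined above) strips any pointer indirection so that a + // *struct element is treated the same as a struct element; a nil + // interface yields a nil type and the element is skipped.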
+ typ := baseType(actual) + if typ == nil { + continue + } + kind := typ.Kind() + // only create nested docs for real JSON objects + if kind == reflect.Struct || kind == reflect.Map { + // create a nested document for this object element + nestedDocument := document.NewDocument( + fmt.Sprintf("%s_$%s_$%d", context.doc.ID(), encodePath(path), i)) + nestedContext := context.im.newWalkContext(nestedDocument, dm) + dm.processProperty(actual, path, append(indexes, uint64(i)), nestedContext) + context.doc.AddNestedDocument(nestedDocument) + continue + } } + // non-nested mapping, or non-object element in nested mapping + // process the element normally + dm.processProperty(actual, path, append(indexes, uint64(i)), context) } case reflect.Ptr: ptrElem := val.Elem() diff --git a/mapping/index.go b/mapping/index.go index 7878cce8b..bafb6ee89 100644 --- a/mapping/index.go +++ b/mapping/index.go @@ -17,12 +17,14 @@ package mapping import ( "encoding/json" "fmt" + "strings" "github.com/blevesearch/bleve/v2/analysis" "github.com/blevesearch/bleve/v2/analysis/analyzer/standard" "github.com/blevesearch/bleve/v2/analysis/datetime/optional" "github.com/blevesearch/bleve/v2/document" "github.com/blevesearch/bleve/v2/registry" + "github.com/blevesearch/bleve/v2/search" "github.com/blevesearch/bleve/v2/util" index "github.com/blevesearch/bleve_index_api" ) @@ -195,11 +197,19 @@ func (im *IndexMappingImpl) Validate() error { // the map will hold the fully qualified field name to FieldMapping, so we can // check for conflicts as we validate each DocumentMapping. fieldAliasCtx := make(map[string]*FieldMapping) + // ensure that the nested property is not set for top-level default mapping + if im.DefaultMapping.Nested { + return fmt.Errorf("default mapping cannot be nested") + } err = im.DefaultMapping.Validate(im.cache, []string{}, fieldAliasCtx) if err != nil { return err } - for _, docMapping := range im.TypeMapping { + for name, docMapping := range im.TypeMapping { + // ensure that the nested property is not set for top-level mappings + if docMapping.Nested { + return fmt.Errorf("type mapping named: %s cannot be nested", name) + } err = docMapping.Validate(im.cache, []string{}, fieldAliasCtx) if err != nil { return err @@ -366,7 +376,13 @@ func (im *IndexMappingImpl) MapDocument(doc *document.Document, data interface{} // see if the _all field was disabled allMapping, _ := docMapping.documentMappingForPath("_all") if allMapping == nil || allMapping.Enabled { - field := document.NewCompositeFieldWithIndexingOptions("_all", true, []string{}, walkContext.excludedFromAll, index.IndexField|index.IncludeTermVectors) + excludedFromAll := walkContext.excludedFromAll + nf := doc.NestedFields() + if nf != nil { + // if the document has any nested fields, exclude them from _all + excludedFromAll = append(excludedFromAll, nf.Slice()...)
+ } + field := document.NewCompositeFieldWithIndexingOptions("_all", true, []string{}, excludedFromAll, index.IndexField|index.IncludeTermVectors) doc.AddField(field) } doc.SetIndexed() @@ -574,3 +590,70 @@ func (im *IndexMappingImpl) SynonymSourceVisitor(visitor analysis.SynonymSourceV } return nil } + +func (im *IndexMappingImpl) buildNestedPrefixes() map[string]int { + prefixDepth := make(map[string]int) + var collectNestedFields func(dm *DocumentMapping, pathComponents []string, currentDepth int) + collectNestedFields = func(dm *DocumentMapping, pathComponents []string, currentDepth int) { + for name, docMapping := range dm.Properties { + newPathComponents := append(pathComponents, name) + if docMapping.Nested { + // This is a nested field boundary + newDepth := currentDepth + 1 + prefixDepth[strings.Join(newPathComponents, pathSeparator)] = newDepth + // Continue deeper with incremented depth + collectNestedFields(docMapping, newPathComponents, newDepth) + } else { + // Not nested, continue with same depth + collectNestedFields(docMapping, newPathComponents, currentDepth) + } + } + } + // Start from depth 0 (root) + if im.DefaultMapping != nil && im.DefaultMapping.Enabled { + collectNestedFields(im.DefaultMapping, []string{}, 0) + } + // Now do this for each type mapping + for _, docMapping := range im.TypeMapping { + if docMapping.Enabled { + collectNestedFields(docMapping, []string{}, 0) + } + } + return prefixDepth +} + +func (im *IndexMappingImpl) NestedDepth(fs search.FieldSet) (int, int) { + if im.cache == nil || im.cache.NestedPrefixes == nil { + return 0, 0 + } + + im.cache.NestedPrefixes.InitOnce(func() map[string]int { + return im.buildNestedPrefixes() + }) + + return im.cache.NestedPrefixes.NestedDepth(fs) +} + +func (im *IndexMappingImpl) CountNested() int { + if im.cache == nil || im.cache.NestedPrefixes == nil { + return 0 + } + + im.cache.NestedPrefixes.InitOnce(func() map[string]int { + return im.buildNestedPrefixes() + }) + + return im.cache.NestedPrefixes.CountNested() +} + +func (im *IndexMappingImpl) IntersectsPrefix(fs search.FieldSet) bool { + if im.cache == nil || im.cache.NestedPrefixes == nil { + return false + } + + im.cache.NestedPrefixes.InitOnce(func() map[string]int { + return im.buildNestedPrefixes() + }) + + return im.cache.NestedPrefixes.IntersectsPrefix(fs) +} diff --git a/mapping/mapping.go b/mapping/mapping.go index a6c1591b8..7ff2f9927 100644 --- a/mapping/mapping.go +++ b/mapping/mapping.go @@ -20,6 +20,7 @@ import ( "github.com/blevesearch/bleve/v2/analysis" "github.com/blevesearch/bleve/v2/document" + "github.com/blevesearch/bleve/v2/search" ) // A Classifier is an interface describing any object which knows how to @@ -74,3 +75,21 @@ type SynonymMapping interface { SynonymSourceVisitor(visitor analysis.SynonymSourceVisitor) error } + +// A NestedMapping extends the IndexMapping interface to provide +// additional methods for working with nested object mappings. 
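+// For example (purely illustrative), a mapping in which "orders" is a nested +// property and "orders.items" is a nested property inside it yields the cached +// prefix depths {"orders": 1, "orders.items": 2}; the methods below are +// answered from that map.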
+type NestedMapping interface { + // NestedDepth returns two values: + // - common: the highest nested level that is common to all given field paths, + // if 0 then there is no common nested level among the given field paths + // - max: the highest nested level that applies to at least one of the given field paths + // if 0 then none of the given field paths are nested + NestedDepth(fieldPaths search.FieldSet) (int, int) + + // IntersectsPrefix returns true if any of the given + // field paths intersect with a known nested prefix + IntersectsPrefix(fieldPaths search.FieldSet) bool + + // CountNested returns the number of nested object mappings + CountNested() int +} diff --git a/mapping/mapping_vectors.go b/mapping/mapping_vectors.go index 393262b35..7c7ff1b98 100644 --- a/mapping/mapping_vectors.go +++ b/mapping/mapping_vectors.go @@ -197,6 +197,7 @@ func (fm *FieldMapping) processVectorBase64(propertyMightBeVectorBase64 interfac fieldName := getFieldName(pathString, path, fm) options := fm.Options() + field := document.NewVectorFieldWithIndexingOptions(fieldName, indexes, decodedVector, fm.Dims, similarity, vectorIndexOptimizedFor, options) context.doc.AddField(field) diff --git a/registry/nested.go b/registry/nested.go new file mode 100644 index 000000000..fee7fda62 --- /dev/null +++ b/registry/nested.go @@ -0,0 +1,136 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package registry + +import ( + "strings" + "sync" + + "github.com/blevesearch/bleve/v2/search" +) + +// NestedFieldCache caches nested field prefixes and their corresponding nesting levels. +// A nested field prefix is a field path prefix that indicates the start of a nested document. +// The nesting level indicates how deep the nested document is in the overall document structure. +type NestedFieldCache struct { + // nested prefix -> nested level + prefixDepth map[string]int + once sync.Once + m sync.RWMutex +} + +func NewNestedFieldCache() *NestedFieldCache { + return &NestedFieldCache{} +} + +func (nfc *NestedFieldCache) InitOnce(buildFunc func() map[string]int) { + nfc.once.Do(func() { + nfc.m.Lock() + defer nfc.m.Unlock() + nfc.prefixDepth = buildFunc() + }) +} + +// NestedDepth returns two values: +// - common: The nesting level of the longest prefix that applies to every field path +// in the provided FieldSet. A value of 0 means no nested prefix is shared +// across all field paths. +// - max: The nesting level of the longest prefix that applies to at least one +// field path in the provided FieldSet. A value of 0 means none of the +// field paths match any nested prefix. 
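+// For example (hypothetical values), with cached prefixes {"a": 1, "a.b": 2} +// and field paths {"a.b.x", "a.y"}: both paths sit under the prefix "a", so +// common is 1, while only "a.b.x" sits under "a.b", so max is 2.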
+func (nfc *NestedFieldCache) NestedDepth(fieldPaths search.FieldSet) (common int, max int) { + // if no field paths, no nested depth + if len(fieldPaths) == 0 { + return + } + nfc.m.RLock() + defer nfc.m.RUnlock() + // if no cached prefixes, no nested depth + if len(nfc.prefixDepth) == 0 { + return + } + // for each prefix, check whether it is a common prefix or matches any path + // update common and max accordingly with the highest nesting level + // possible for each respective case + for prefix, level := range nfc.prefixDepth { + // only check prefixes that could increase one of the results + if level <= common && level <= max { + continue + } + // check prefix against field paths, getting whether it matches all paths (common) + // and whether it matches at least one path (any) + matchAll, matchAny := nfc.prefixMatch(prefix, fieldPaths) + // if it matches all paths, update common + if matchAll && level > common { + common = level + } + // if it matches any path, update max + if matchAny && level > max { + max = level + } + } + return common, max +} + +// CountNested returns the number of nested prefixes +func (nfc *NestedFieldCache) CountNested() int { + nfc.m.RLock() + defer nfc.m.RUnlock() + + return len(nfc.prefixDepth) +} + +// IntersectsPrefix returns true if any of the given +// field paths have a nested prefix +func (nfc *NestedFieldCache) IntersectsPrefix(fieldPaths search.FieldSet) bool { + // if no field paths, no intersection + if len(fieldPaths) == 0 { + return false + } + nfc.m.RLock() + defer nfc.m.RUnlock() + // if no cached prefixes, no intersection + if len(nfc.prefixDepth) == 0 { + return false + } + // Check each cached nested prefix to see if it intersects with any path + for prefix := range nfc.prefixDepth { + _, matchAny := nfc.prefixMatch(prefix, fieldPaths) + if matchAny { + return true + } + } + return false +} + +// prefixMatch checks whether the prefix matches all paths (common) and whether it matches at least one path (any) +// Caller must hold the read lock.
+func (nfc *NestedFieldCache) prefixMatch(prefix string, fieldPaths search.FieldSet) (common bool, any bool) { + common = true + any = false + for path := range fieldPaths { + has := strings.HasPrefix(path, prefix) + if has { + any = true + } else { + common = false + } + // early exit if we have determined both values + if any && !common { + break + } + } + return common, any +} diff --git a/registry/registry.go b/registry/registry.go index 69ee8dd86..36f209d4f 100644 --- a/registry/registry.go +++ b/registry/registry.go @@ -49,6 +49,7 @@ type Cache struct { Fragmenters *FragmenterCache Highlighters *HighlighterCache SynonymSources *SynonymSourceCache + NestedPrefixes *NestedFieldCache } func NewCache() *Cache { @@ -63,6 +64,7 @@ func NewCache() *Cache { Fragmenters: NewFragmenterCache(), Highlighters: NewHighlighterCache(), SynonymSources: NewSynonymSourceCache(), + NestedPrefixes: NewNestedFieldCache(), } } diff --git a/search.go b/search.go index 2c25e0551..3c6035bfd 100644 --- a/search.go +++ b/search.go @@ -18,6 +18,7 @@ import ( "fmt" "reflect" "sort" + "strings" "time" "github.com/blevesearch/bleve/v2/analysis" @@ -473,48 +474,97 @@ func (sr *SearchResult) Size() int { } func (sr *SearchResult) String() string { - rv := "" + rv := &strings.Builder{} if sr.Total > 0 { - if sr.Request != nil && sr.Request.Size > 0 { - rv = fmt.Sprintf("%d matches, showing %d through %d, took %s\n", sr.Total, sr.Request.From+1, sr.Request.From+len(sr.Hits), sr.Took) + switch { + case sr.Request != nil && sr.Request.Size > 0: + start := sr.Request.From + 1 + end := sr.Request.From + len(sr.Hits) + fmt.Fprintf(rv, "%d matches, showing %d through %d, took %s\n", sr.Total, start, end, sr.Took) for i, hit := range sr.Hits { - rv += fmt.Sprintf("%5d. %s (%f)\n", i+sr.Request.From+1, hit.ID, hit.Score) - for fragmentField, fragments := range hit.Fragments { - rv += fmt.Sprintf("\t%s\n", fragmentField) - for _, fragment := range fragments { - rv += fmt.Sprintf("\t\t%s\n", fragment) - } - } - for otherFieldName, otherFieldValue := range hit.Fields { - if _, ok := hit.Fragments[otherFieldName]; !ok { - rv += fmt.Sprintf("\t%s\n", otherFieldName) - rv += fmt.Sprintf("\t\t%v\n", otherFieldValue) - } - } + rv = formatHit(rv, hit, start+i) + } + case sr.Request == nil: + fmt.Fprintf(rv, "%d matches, took %s\n", sr.Total, sr.Took) + for i, hit := range sr.Hits { + rv = formatHit(rv, hit, i+1) } - } else { - rv = fmt.Sprintf("%d matches, took %s\n", sr.Total, sr.Took) + default: + fmt.Fprintf(rv, "%d matches, took %s\n", sr.Total, sr.Took) } } else { - rv = "No matches" + fmt.Fprintf(rv, "No matches\n") } if len(sr.Facets) > 0 { - rv += "Facets:\n" + fmt.Fprintf(rv, "Facets:\n") for fn, f := range sr.Facets { - rv += fmt.Sprintf("%s(%d)\n", fn, f.Total) + fmt.Fprintf(rv, "%s(%d)\n", fn, f.Total) for _, t := range f.Terms.Terms() { - rv += fmt.Sprintf("\t%s(%d)\n", t.Term, t.Count) + fmt.Fprintf(rv, "\t%s(%d)\n", t.Term, t.Count) } for _, n := range f.NumericRanges { - rv += fmt.Sprintf("\t%s(%d)\n", n.Name, n.Count) + fmt.Fprintf(rv, "\t%s(%d)\n", n.Name, n.Count) } for _, d := range f.DateRanges { - rv += fmt.Sprintf("\t%s(%d)\n", d.Name, d.Count) + fmt.Fprintf(rv, "\t%s(%d)\n", d.Name, d.Count) } if f.Other != 0 { - rv += fmt.Sprintf("\tOther(%d)\n", f.Other) + fmt.Fprintf(rv, "\tOther(%d)\n", f.Other) + } + } + } + return rv.String() +} + +// formatHit is a helper function to format a single hit in the search result for +// the String() method of SearchResult +func formatHit(rv *strings.Builder, hit 
*search.DocumentMatch, hitNumber int) *strings.Builder { + fmt.Fprintf(rv, "%5d. %s (%f)\n", hitNumber, hit.ID, hit.Score) + for fragmentField, fragments := range hit.Fragments { + fmt.Fprintf(rv, "\t%s\n", fragmentField) + for _, fragment := range fragments { + fmt.Fprintf(rv, "\t\t%s\n", fragment) + } + } + for otherFieldName, otherFieldValue := range hit.Fields { + if otherFieldName == NestedDocumentKey { + continue + } + if _, ok := hit.Fragments[otherFieldName]; !ok { + fmt.Fprintf(rv, "\t%s\n", otherFieldName) + fmt.Fprintf(rv, "\t\t%v\n", otherFieldValue) + } + } + // nested documents + if nested, ok := hit.Fields[NestedDocumentKey]; ok { + if list, ok := nested.([]*search.NestedDocumentMatch); ok { + fmt.Fprintf(rv, "\t%s (%d nested documents)\n", NestedDocumentKey, len(list)) + for ni, nd := range list { + fmt.Fprintf(rv, "\t\tNested #%d:\n", ni+1) + for f, frags := range nd.Fragments { + fmt.Fprintf(rv, "\t\t\t%s\n", f) + for _, frag := range frags { + fmt.Fprintf(rv, "\t\t\t\t%s\n", frag) + } + } + for f, v := range nd.Fields { + if _, ok := nd.Fragments[f]; !ok { + fmt.Fprintf(rv, "\t\t\t%s\n", f) + fmt.Fprintf(rv, "\t\t\t\t%v\n", v) + } + } + } + } + } + if len(hit.DecodedSort) > 0 { + fmt.Fprintf(rv, "\t_sort: [") + for k, v := range hit.DecodedSort { + if k > 0 { + fmt.Fprintf(rv, ", ") } + fmt.Fprintf(rv, "%v", v) } + fmt.Fprintf(rv, "]\n") } return rv } diff --git a/search/collector/nested.go b/search/collector/nested.go new file mode 100644 index 000000000..ce2f79090 --- /dev/null +++ b/search/collector/nested.go @@ -0,0 +1,103 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package collector + +import ( + "github.com/blevesearch/bleve/v2/search" + index "github.com/blevesearch/bleve_index_api" +) + +type collectStoreNested struct { + // descAdder is used to customize how descendants are merged into their parent + descAdder search.DescendantAdderCallbackFn + // nested reader to retrieve ancestor information + nr index.NestedReader + // the current root document match being built + currRoot *search.DocumentMatch + // the ancestor ID of the current root document being built + currRootAncestorID index.AncestorID + // prealloc slice for ancestor IDs + ancestors []index.AncestorID +} + +func newStoreNested(nr index.NestedReader, descAdder search.DescendantAdderCallbackFn) *collectStoreNested { + rv := &collectStoreNested{ + descAdder: descAdder, + nr: nr, + } + return rv +} + +// ProcessNestedDocument adds a document to the nested store, merging it into its root document +// as needed. If the returned DocumentMatch is nil, the incoming doc has been merged +// into its parent and should not be processed further. If the returned DocumentMatch +// is non-nil, it represents a complete root document that should be processed further. +// NOTE: This implementation assumes that documents are added in increasing order of their internal IDs +// which is guaranteed by all searchers in bleve. 
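+// For example (hypothetical doc numbers), for matches arriving as 5 and 6 +// (both children of root 7) followed by 7 itself: 5 creates an interim root +// for 7 and returns nil, 6 and 7 merge into that interim root and return nil, +// and the completed root for 7 is only returned once a match with a different +// root arrives (or via Current() when the stream is exhausted).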
+func (c *collectStoreNested) ProcessNestedDocument(ctx *search.SearchContext, doc *search.DocumentMatch) (*search.DocumentMatch, error) { + // find ancestors for the doc + var err error + c.ancestors, err = c.nr.Ancestors(doc.IndexInternalID, c.ancestors[:0]) + if err != nil { + return nil, err + } + if len(c.ancestors) == 0 { + // should not happen, every doc should have at least itself as ancestor + return nil, nil + } + // root docID is the last ancestor + rootID := c.ancestors[len(c.ancestors)-1] + // check if there is an interim root already and if the incoming doc belongs to it + if c.currRoot != nil && c.currRootAncestorID.Equals(rootID) { + // there is an interim root already, and the incoming doc belongs to it + if err := c.descAdder(c.currRoot, doc); err != nil { + return nil, err + } + // recycle the child document now that it's merged into the interim root + ctx.DocumentMatchPool.Put(doc) + return nil, nil + } + // completedRoot is the root document match to return, if any + var completedRoot *search.DocumentMatch + if c.currRoot != nil { + // we have an existing interim root, return it for processing + completedRoot = c.currRoot + } + // no interim root for now so either we have a root document incoming + // or we have a child doc and need to create an interim root + if len(c.ancestors) == 1 { + // incoming doc is the root itself + c.currRoot = doc + c.currRootAncestorID = rootID + return completedRoot, nil + } + // this is a child doc, create interim root + newDM := ctx.DocumentMatchPool.Get() + newDM.IndexInternalID = rootID.ToIndexInternalID(newDM.IndexInternalID) + // merge the incoming doc into the new interim root + c.currRoot = newDM + c.currRootAncestorID = rootID + if err := c.descAdder(c.currRoot, doc); err != nil { + return nil, err + } + // recycle the child document now that it's merged into the interim root + ctx.DocumentMatchPool.Put(doc) + return completedRoot, nil +} + +// Current returns the current interim root document match being built, if any +func (c *collectStoreNested) Current() *search.DocumentMatch { + return c.currRoot +} diff --git a/search/collector/topn.go b/search/collector/topn.go index fc338f54e..af708aaf7 100644 --- a/search/collector/topn.go +++ b/search/collector/topn.go @@ -77,7 +77,9 @@ type TopNCollector struct { searchAfter *search.DocumentMatch knnHits map[string]*search.DocumentMatch - computeNewScoreExpl search.ScoreExplCorrectionCallbackFunc + hybridMergeCallback search.HybridMergeCallbackFn + + nestedStore *collectStoreNested } // CheckDoneEvery controls how frequently we check the context deadline @@ -87,25 +89,74 @@ const CheckDoneEvery = uint64(1024) // skipping over the first 'skip' hits // ordering hits by the provided sort order func NewTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector { - return newTopNCollector(size, skip, sort) + return newTopNCollector(size, skip, sort, nil) } // NewTopNCollectorAfter builds a collector to find the top 'size' hits // skipping over the first 'skip' hits // ordering hits by the provided sort order +// starting after the provided 'after' sort values func NewTopNCollectorAfter(size int, sort search.SortOrder, after []string) *TopNCollector { - rv := newTopNCollector(size, 0, sort) + rv := newTopNCollector(size, 0, sort, nil) + rv.searchAfter = createSearchAfterDocument(sort, after) + return rv +} + +// NewNestedTopNCollector builds a collector to find the top 'size' hits +// skipping over the first 'skip' hits +// ordering hits by the provided sort order +// while 
ensuring the nested documents are handled correctly +// (i.e. parent document is returned instead of nested document) +func NewNestedTopNCollector(size int, skip int, sort search.SortOrder, nr index.NestedReader) *TopNCollector { + return newTopNCollector(size, skip, sort, nr) +} + +// NewNestedTopNCollectorAfter builds a collector to find the top 'size' hits +// skipping over the first 'skip' hits +// ordering hits by the provided sort order +// starting after the provided 'after' sort values +// while ensuring the nested documents are handled correctly +// (i.e. parent document is returned instead of nested document) +func NewNestedTopNCollectorAfter(size int, sort search.SortOrder, after []string, nr index.NestedReader) *TopNCollector { + rv := newTopNCollector(size, 0, sort, nr) rv.searchAfter = createSearchAfterDocument(sort, after) return rv } -func newTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector { +func newTopNCollector(size int, skip int, sort search.SortOrder, nr index.NestedReader) *TopNCollector { hc := &TopNCollector{size: size, skip: skip, sort: sort} hc.store = getOptimalCollectorStore(size, skip, func(i, j *search.DocumentMatch) int { return hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, i, j) }) + if nr != nil { + descAdder := func(parent, child *search.DocumentMatch) error { + // add descendant score to parent score + parent.Score += child.Score + // merge explanations + parent.Expl = parent.Expl.MergeWith(child.Expl) + // merge field term locations + parent.FieldTermLocations = search.MergeFieldTermLocationsFromMatch(parent.FieldTermLocations, child) + // add the child's ID to the parent's Descendants, but only if the + // child is not the same document as the parent + if !parent.IndexInternalID.Equals(child.IndexInternalID) { + // Add a copy of child.IndexInternalID to descendants, because + // child.IndexInternalID will be reset when 'child' is recycled.
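+ // The reuse below peeks past the slice's length: re-slicing Descendants + // to len+1 exposes an element that may already hold an allocated ID + // buffer from a previous use, which can then be overwritten in place + // instead of allocating a fresh buffer.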
+ var descendantID index.IndexInternalID + // first check if parent's descendants slice has capacity to reuse + if len(parent.Descendants) < cap(parent.Descendants) { + // reuse the buffer element at len(parent.Descendants) + descendantID = parent.Descendants[:len(parent.Descendants)+1][len(parent.Descendants)] + } + // copy the contents of id into descendantID, allocating if needed + parent.Descendants = append(parent.Descendants, index.NewIndexInternalIDFrom(descendantID, child.IndexInternalID)) + } + return nil + } + hc.nestedStore = newStoreNested(nr, search.DescendantAdderCallbackFn(descAdder)) + } + // these lookups traverse an interface, so do once up-front if sort.RequiresDocID() { hc.needDocIds = true @@ -236,8 +287,13 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, default: next, err = searcher.Next(searchContext) } + // use a local totalDocs for counting total docs seen + // for context deadline checking, as hc.total is only + // incremented for actual(root) collected documents, and + // we need to check deadline for every document seen (root or nested) + var totalDocs uint64 for err == nil && next != nil { - if hc.total%CheckDoneEvery == 0 { + if totalDocs%CheckDoneEvery == 0 { select { case <-ctx.Done(): search.RecordSearchCost(ctx, search.AbortM, 0) @@ -245,27 +301,60 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, default: } } - - err = hc.adjustDocumentMatch(searchContext, reader, next) - if err != nil { - break - } - - err = hc.prepareDocumentMatch(searchContext, reader, next, false) - if err != nil { - break + totalDocs++ + if hc.nestedStore != nil { + // This may be a nested document — add it to the nested store first. + // If the nested store returns nil, the document was merged into its parent + // and should not be processed further. + // If it returns a non-nil document, it represents a complete root document + // and should be processed further. + next, err = hc.nestedStore.ProcessNestedDocument(searchContext, next) + if err != nil { + break + } } - - err = dmHandler(next) - if err != nil { - break + if next != nil { + err = hc.adjustDocumentMatch(searchContext, reader, next) + if err != nil { + break + } + err = hc.prepareDocumentMatch(searchContext, reader, next, false) + if err != nil { + break + } + err = dmHandler(next) + if err != nil { + break + } } - next, err = searcher.Next(searchContext) } if err != nil { return err } + + // if we have a nested store, we may have an interim root + // that needs to be returned for processing + if hc.nestedStore != nil { + currRoot := hc.nestedStore.Current() + if currRoot != nil { + err = hc.adjustDocumentMatch(searchContext, reader, currRoot) + if err != nil { + return err + } + // no descendants at this point + err = hc.prepareDocumentMatch(searchContext, reader, currRoot, false) + if err != nil { + return err + } + + err = dmHandler(currRoot) + if err != nil { + return err + } + } + } + if hc.knnHits != nil { // we may have some knn hits left that did not match any of the top N tf-idf hits // we need to add them to the collector store to consider them as well. 
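The Collect() changes above follow a classic stream-coalescing shape: group consecutive matches that share a root, emit a group only when the next group begins, and remember to flush the final group after the loop. A minimal, self-contained sketch of that pattern (toy types and values, not bleve's API):

```go
package main

import "fmt"

type match struct {
	root  string // key identifying the root document
	score float64
}

// coalesce merges consecutive matches sharing a root and emits each
// completed group; the final group must be flushed after the loop.
func coalesce(stream []match, emit func(match)) {
	var curr *match
	for _, m := range stream {
		if curr != nil && curr.root == m.root {
			curr.score += m.score // merge into the interim root
			continue
		}
		if curr != nil {
			emit(*curr) // previous group is complete
		}
		c := m
		curr = &c
	}
	if curr != nil {
		emit(*curr) // tail flush, mirrors nestedStore.Current()
	}
}

func main() {
	stream := []match{{"7", 1}, {"7", 2}, {"9", 5}}
	coalesce(stream, func(m match) { fmt.Printf("%s -> %.0f\n", m.root, m.score) })
}
```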
@@ -319,7 +408,10 @@ func (hc *TopNCollector) adjustDocumentMatch(ctx *search.SearchContext, return err } if knnHit, ok := hc.knnHits[d.ID]; ok { - d.Score, d.Expl = hc.computeNewScoreExpl(d, knnHit) + // we have a knn hit corresponding to this document + hc.hybridMergeCallback(d, knnHit) + // remove this knn hit from the map as it's already + // been merged delete(hc.knnHits, d.ID) } } @@ -454,6 +546,14 @@ func (hc *TopNCollector) visitFieldTerms(reader index.IndexReader, d *search.Doc } } + // first visit descendants if any + for _, descID := range d.Descendants { + err := hc.dvReader.VisitDocValues(descID, v) + if err != nil { + return err + } + } + // now visit the doc values for this document err := hc.dvReader.VisitDocValues(d.IndexInternalID, v) if hc.facetsBuilder != nil { hc.facetsBuilder.EndDoc() } @@ -532,10 +632,10 @@ func (hc *TopNCollector) FacetResults() search.FacetResults { return nil } -func (hc *TopNCollector) SetKNNHits(knnHits search.DocumentMatchCollection, newScoreExplComputer search.ScoreExplCorrectionCallbackFunc) { +func (hc *TopNCollector) SetKNNHits(knnHits search.DocumentMatchCollection, hybridMergeCallback search.HybridMergeCallbackFn) { hc.knnHits = make(map[string]*search.DocumentMatch, len(knnHits)) for _, hit := range knnHits { hc.knnHits[hit.ID] = hit } - hc.computeNewScoreExpl = newScoreExplComputer + hc.hybridMergeCallback = hybridMergeCallback } diff --git a/search/explanation.go b/search/explanation.go index 924050016..98c5e099d 100644 --- a/search/explanation.go +++ b/search/explanation.go @@ -29,6 +29,8 @@ func init() { reflectStaticSizeExplanation = int(reflect.TypeOf(e).Size()) } +const MergedExplMessage = "sum of merged explanations:" + type Explanation struct { Value float64 `json:"value"` Message string `json:"message"` @@ -54,3 +56,50 @@ func (expl *Explanation) Size() int { return sizeInBytes } + +// MergeWith merges two explanations into one. +// If either explanation is nil, the other is returned. +// If either explanation is already a merged explanation, +// the other is appended to its children. +// Otherwise, a new merged explanation is created +// with the two explanations as its children. +func (expl *Explanation) MergeWith(other *Explanation) *Explanation { + if expl == nil { + return other + } + if other == nil || expl == other { + return expl + } + + newScore := expl.Value + other.Value + + // if both are merged explanations, combine children + if expl.Message == MergedExplMessage && other.Message == MergedExplMessage { + expl.Value = newScore + expl.Children = append(expl.Children, other.Children...)
+ return expl + } + + // at least one is not a merged explanation; see which one it is + // if expl is merged, append other + if expl.Message == MergedExplMessage { + // append other as a child of expl + expl.Value = newScore + expl.Children = append(expl.Children, other) + return expl + } + + // if other is merged, append expl + if other.Message == MergedExplMessage { + other.Value = newScore + other.Children = append(other.Children, expl) + return other + } + // create a new explanation to hold the merged one + rv := &Explanation{ + Value: expl.Value + other.Value, + Message: MergedExplMessage, + Children: []*Explanation{expl, other}, + } + return rv +} diff --git a/search/highlight/highlighter/simple/highlighter_simple.go b/search/highlight/highlighter/simple/highlighter_simple.go index e898a1e61..d0adfa81f 100644 --- a/search/highlight/highlighter/simple/highlighter_simple.go +++ b/search/highlight/highlighter/simple/highlighter_simple.go @@ -146,12 +146,8 @@ func (s *Highlighter) BestFragmentsInField(dm *search.DocumentMatch, doc index.D formattedFragments[i] += s.sep } } - - if dm.Fragments == nil { - dm.Fragments = make(search.FieldFragmentMap, 0) - } if len(formattedFragments) > 0 { - dm.Fragments[field] = formattedFragments + dm.AddFragments(field, formattedFragments) } return formattedFragments diff --git a/search/query/conjunction.go b/search/query/conjunction.go index a2043720a..6870b1ae2 100644 --- a/search/query/conjunction.go +++ b/search/query/conjunction.go @@ -54,14 +54,39 @@ func (q *ConjunctionQuery) AddQuery(aq ...Query) { func (q *ConjunctionQuery) Searcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) { ss := make([]search.Searcher, 0, len(q.Conjuncts)) + cleanup := func() { + for _, searcher := range ss { + if searcher != nil { + _ = searcher.Close() + } + } + } + nestedMode, _ := ctx.Value(search.NestedSearchKey).(bool) + var nm mapping.NestedMapping + if nestedMode { + var ok bool + // get the nested mapping + if nm, ok = m.(mapping.NestedMapping); !ok { + // shouldn't be in nested mode if no nested mapping + nestedMode = false + } + } + // set of fields used in this query + var qfs search.FieldSet + var err error + for _, conjunct := range q.Conjuncts { + // Gather fields when nested mode is enabled + if nestedMode { + qfs, err = ExtractFields(conjunct, m, qfs) + if err != nil { + cleanup() + return nil, err + } + } sr, err := conjunct.Searcher(ctx, i, m, options) if err != nil { - for _, searcher := range ss { - if searcher != nil { - _ = searcher.Close() - } - } + cleanup() return nil, err } if _, ok := sr.(*searcher.MatchNoneSearcher); ok && q.queryStringMode { @@ -75,6 +100,17 @@ return searcher.NewMatchNoneSearcher(i) } + if nestedMode { + // first determine the nested depth info for the query fields + commonDepth, maxDepth := nm.NestedDepth(qfs) + // if we have common depth == max depth then we can just use + // the normal conjunction searcher, as all fields share the same + // nested context; otherwise we need to use the nested conjunction searcher + if commonDepth < maxDepth { + return searcher.NewNestedConjunctionSearcher(ctx, i, ss, commonDepth, options) + } + } + return searcher.NewConjunctionSearcher(ctx, i, ss, options) } diff --git a/search/query/query.go b/search/query/query.go index 6df38da37..0b3d0a9c4 100644 --- a/search/query/query.go +++ b/search/query/query.go @@ -450,13 +450,10 @@ func
DumpQuery(m mapping.IndexMapping, query Query) (string, error) { return string(data), err } -// FieldSet represents a set of queried fields. -type FieldSet map[string]struct{} - // ExtractFields returns a set of fields referenced by the query. // The returned set may be nil if the query does not explicitly reference any field // and the DefaultSearchField is unset in the index mapping. -func ExtractFields(q Query, m mapping.IndexMapping, fs FieldSet) (FieldSet, error) { +func ExtractFields(q Query, m mapping.IndexMapping, fs search.FieldSet) (search.FieldSet, error) { if q == nil || m == nil { return fs, nil } @@ -469,9 +466,9 @@ func ExtractFields(q Query, m mapping.IndexMapping, fs FieldSet) (FieldSet, erro } if f != "" { if fs == nil { - fs = make(FieldSet) + fs = search.NewFieldSet() } - fs[f] = struct{}{} + fs.AddField(f) } case *QueryStringQuery: var expandedQuery Query diff --git a/search/scorer/scorer_knn.go b/search/scorer/scorer_knn.go index 8d9043427..06f50cd4a 100644 --- a/search/scorer/scorer_knn.go +++ b/search/scorer/scorer_knn.go @@ -123,7 +123,7 @@ func (sqs *KNNQueryScorer) Score(ctx *search.SearchContext, if sqs.options.Explain { rv.Expl = scoreExplanation } - rv.IndexInternalID = append(rv.IndexInternalID, knnMatch.ID...) + rv.IndexInternalID = index.NewIndexInternalIDFrom(rv.IndexInternalID, knnMatch.ID) return rv } diff --git a/search/scorer/scorer_term.go b/search/scorer/scorer_term.go index f5f8ec935..d7e77f977 100644 --- a/search/scorer/scorer_term.go +++ b/search/scorer/scorer_term.go @@ -243,7 +243,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term } } - rv.IndexInternalID = append(rv.IndexInternalID, termMatch.ID...) + rv.IndexInternalID = index.NewIndexInternalIDFrom(rv.IndexInternalID, termMatch.ID) if len(termMatch.Vectors) > 0 { if cap(rv.FieldTermLocations) < len(termMatch.Vectors) { diff --git a/search/search.go b/search/search.go index d199b79cb..f03ed13e0 100644 --- a/search/search.go +++ b/search/search.go @@ -172,9 +172,9 @@ type DocumentMatch struct { // used to indicate the sub-scores that combined to form the // final score for this document match. This is only populated - // when the search request's query is a DisjunctionQuery - // or a ConjunctionQuery. The map key is the index of the sub-query - // in the DisjunctionQuery or ConjunctionQuery. The map value is the + // when the search request's query is a DisjunctionQuery. + // The map key is the index of the sub-query + // in the DisjunctionQuery. The map value is the // sub-score for that sub-query. ScoreBreakdown map[int]float64 `json:"score_breakdown,omitempty"` @@ -185,6 +185,10 @@ type DocumentMatch struct { // of the index that this match came from // of the current alias view, used in alias of aliases scenario IndexNames []string `json:"index_names,omitempty"` + + // Descendants holds the IDs of any child/descendant document that contributed + // to this root DocumentMatch. 
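+ // These IDs are defensive copies (made via index.NewIndexInternalIDFrom), + // because a child's own ID buffer is recycled once its DocumentMatch is + // returned to the pool.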
+ Descendants []index.IndexInternalID `json:"-"` } func (dm *DocumentMatch) AddFieldValue(name string, value interface{}) { @@ -208,6 +212,21 @@ func (dm *DocumentMatch) AddFieldValue(name string, value interface{}) { dm.Fields[name] = valSlice } +func (dm *DocumentMatch) AddFragments(field string, fragments []string) { + if dm.Fragments == nil { + dm.Fragments = make(FieldFragmentMap) + } +OUTER: + for _, newFrag := range fragments { + for _, existingFrag := range dm.Fragments[field] { + if existingFrag == newFrag { + continue OUTER // no duplicates allowed + } + } + dm.Fragments[field] = append(dm.Fragments[field], newFrag) + } +} + // Reset allows an already allocated DocumentMatch to be reused func (dm *DocumentMatch) Reset() *DocumentMatch { // remember the []byte used for the IndexInternalID @@ -219,6 +238,15 @@ func (dm *DocumentMatch) Reset() *DocumentMatch { for i := range ftls { // recycle the ArrayPositions of each location ftls[i].Location.ArrayPositions = ftls[i].Location.ArrayPositions[:0] } + // remember the score breakdown map + scoreBreakdown := dm.ScoreBreakdown + // clear out the score breakdown map + clear(scoreBreakdown) + // remember the Descendants backing array + descendants := dm.Descendants + for i := range descendants { // recycle each IndexInternalID + descendants[i] = descendants[i][:0] + } // idiom to copy over from empty DocumentMatch (0 allocations) *dm = DocumentMatch{} // reuse the []byte already allocated (and reset len to 0) @@ -228,6 +256,10 @@ dm.DecodedSort = dm.DecodedSort[:0] // reuse the FieldTermLocations already allocated (and reset len to 0) dm.FieldTermLocations = ftls[:0] + // reuse the Descendants already allocated (and reset len to 0) + dm.Descendants = descendants[:0] + // reuse the score breakdown map already allocated (after clearing it) + dm.ScoreBreakdown = scoreBreakdown return dm } @@ -400,3 +432,20 @@ func (sc *SearchContext) Size() int { return sizeInBytes } + +// A NestedDocumentMatch is like a DocumentMatch, but is used for nested +// documents: it has no score or locations of its own and mainly serves to +// hold field values and fragments, to be embedded in the parent DocumentMatch +type NestedDocumentMatch struct { + Fields map[string]interface{} `json:"fields,omitempty"` + Fragments FieldFragmentMap `json:"fragments,omitempty"` +} + +// NewNestedDocumentMatch creates a new NestedDocumentMatch instance +// with the given fields and fragments +func NewNestedDocumentMatch(fields map[string]interface{}, fragments FieldFragmentMap) *NestedDocumentMatch { + return &NestedDocumentMatch{ + Fields: fields, + Fragments: fragments, + } +} diff --git a/search/searcher/search_conjunction_nested.go b/search/searcher/search_conjunction_nested.go new file mode 100644 index 000000000..3abf9490d --- /dev/null +++ b/search/searcher/search_conjunction_nested.go @@ -0,0 +1,499 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
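To make the NestedDocumentMatch shape above concrete: once LoadAndHighlightAllFields has run, descendants surface in a hit's fields under the "_$nested" key (NestedDocumentKey). A small, self-contained sketch (field names and values are invented; the local type merely mirrors search.NestedDocumentMatch so the snippet runs standalone):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// local mirror of search.NestedDocumentMatch, for a standalone example
type nestedDocumentMatch struct {
	Fields map[string]interface{} `json:"fields,omitempty"`
}

func main() {
	// a root hit's Fields map after its children were folded in
	hitFields := map[string]interface{}{
		"name": "order-123",
		"_$nested": []*nestedDocumentMatch{
			{Fields: map[string]interface{}{"items.sku": "A-1"}},
			{Fields: map[string]interface{}{"items.sku": "B-2"}},
		},
	}
	b, _ := json.MarshalIndent(hitFields, "", "  ")
	fmt.Println(string(b))
}
```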
+ +package searcher + +import ( + "context" + "fmt" + "math" + "reflect" + "slices" + + "github.com/blevesearch/bleve/v2/search" + "github.com/blevesearch/bleve/v2/size" + index "github.com/blevesearch/bleve_index_api" +) + +var reflectStaticSizeNestedConjunctionSearcher int + +func init() { + var ncs NestedConjunctionSearcher + reflectStaticSizeNestedConjunctionSearcher = int(reflect.TypeOf(ncs).Size()) +} + +type NestedConjunctionSearcher struct { + nestedReader index.NestedReader + searchers []search.Searcher + queryNorm float64 + currs []*search.DocumentMatch + currAncestors [][]index.AncestorID + currKeys []index.AncestorID + initialized bool + joinIdx int + options search.SearcherOptions + docQueue *CoalesceQueue + // reusable ID buffer for Advance() calls + advanceID index.IndexInternalID + // reusable buffer for Advance() calls + ancestors []index.AncestorID +} + +func NewNestedConjunctionSearcher(ctx context.Context, indexReader index.IndexReader, + searchers []search.Searcher, joinIdx int, options search.SearcherOptions) (search.Searcher, error) { + + var nr index.NestedReader + var ok bool + if nr, ok = indexReader.(index.NestedReader); !ok { + return nil, fmt.Errorf("indexReader does not support nested documents") + } + + // build our searcher + rv := NestedConjunctionSearcher{ + nestedReader: nr, + options: options, + searchers: searchers, + currs: make([]*search.DocumentMatch, len(searchers)), + currAncestors: make([][]index.AncestorID, len(searchers)), + currKeys: make([]index.AncestorID, len(searchers)), + joinIdx: joinIdx, + docQueue: NewCoalesceQueue(), + } + rv.computeQueryNorm() + + return &rv, nil +} + +func (s *NestedConjunctionSearcher) computeQueryNorm() { + // first calculate sum of squared weights + sumOfSquaredWeights := 0.0 + for _, searcher := range s.searchers { + sumOfSquaredWeights += searcher.Weight() + } + // now compute query norm from this + s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) + // finally tell all the downstream searchers the norm + for _, searcher := range s.searchers { + searcher.SetQueryNorm(s.queryNorm) + } +} + +func (s *NestedConjunctionSearcher) Size() int { + sizeInBytes := reflectStaticSizeNestedConjunctionSearcher + size.SizeOfPtr + + for _, entry := range s.searchers { + sizeInBytes += entry.Size() + } + + for _, entry := range s.currs { + if entry != nil { + sizeInBytes += entry.Size() + } + } + + return sizeInBytes +} + +func (s *NestedConjunctionSearcher) Weight() float64 { + var rv float64 + for _, searcher := range s.searchers { + rv += searcher.Weight() + } + return rv +} + +func (s *NestedConjunctionSearcher) SetQueryNorm(qnorm float64) { + for _, searcher := range s.searchers { + searcher.SetQueryNorm(qnorm) + } +} + +func (s *NestedConjunctionSearcher) Count() uint64 { + // for now return a worst case + var sum uint64 + for _, searcher := range s.searchers { + sum += searcher.Count() + } + return sum +} + +func (s *NestedConjunctionSearcher) Close() (rv error) { + for _, searcher := range s.searchers { + err := searcher.Close() + if err != nil && rv == nil { + rv = err + } + } + return rv +} + +func (s *NestedConjunctionSearcher) Min() int { + return 0 +} + +func (s *NestedConjunctionSearcher) DocumentMatchPoolSize() int { + rv := len(s.currs) + for _, s := range s.searchers { + rv += s.DocumentMatchPoolSize() + } + return rv +} + +func (s *NestedConjunctionSearcher) initialize(ctx *search.SearchContext) (bool, error) { + var err error + for i, searcher := range s.searchers { + if s.currs[i] != nil { + 
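+			// recycle any match left over from a previous initialization attempt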
ctx.DocumentMatchPool.Put(s.currs[i])
+		}
+		s.currs[i], err = searcher.Next(ctx)
+		if err != nil {
+			return false, err
+		}
+		if s.currs[i] == nil {
+			// one of the searchers is exhausted, so we are done
+			return true, nil
+		}
+		// get the ancestry chain for this match
+		s.currAncestors[i], err = s.nestedReader.Ancestors(s.currs[i].IndexInternalID, s.currAncestors[i][:0])
+		if err != nil {
+			return false, err
+		}
+		// check that the ancestry chain is longer than joinIdx; if not, reset
+		// joinIdx to the minimum possible value across all searchers. Ideally
+		// this is done at query construction time itself, using the covering
+		// depth across all sub-queries, but we do it here as a fallback
+		if s.joinIdx >= len(s.currAncestors[i]) {
+			s.joinIdx = len(s.currAncestors[i]) - 1
+		}
+	}
+	// build currKeys for each searcher, do it here as we may have adjusted joinIdx
+	for i := range s.searchers {
+		s.currKeys[i] = ancestorFromRoot(s.currAncestors[i], s.joinIdx)
+	}
+	s.initialized = true
+	return false, nil
+}
+
+func (s *NestedConjunctionSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) {
+	// initialize on first call to Next, by getting the first match
+	// from each searcher and their ancestry chains
+	if !s.initialized {
+		done, err := s.initialize(ctx)
+		if err != nil {
+			return nil, err
+		}
+		if done {
+			return nil, nil
+		}
+	}
+	// check if the docQueue has any buffered matches
+	if s.docQueue.Len() > 0 {
+		return s.docQueue.Dequeue()
+	}
+	// now enter the main alignment loop
+	n := len(s.searchers)
+OUTER:
+	for {
+		// pick the highest key (ancestor at the joinIdx level) across all searchers
+		if s.currs[0] == nil {
+			return nil, nil
+		}
+		maxKey := s.currKeys[0]
+		for i := 1; i < n; i++ {
+			// currs[i] being nil means one of the searchers is exhausted
+			if s.currs[i] == nil {
+				return nil, nil
+			}
+			currKey := s.currKeys[i]
+			if maxKey.Compare(currKey) < 0 {
+				maxKey = currKey
+			}
+		}
+		// maxKey is converted to an advanceID at most once, and only if needed
+		var advanceID index.IndexInternalID
+		// flag to track if all searchers are aligned
+		aligned := true
+		// now try to align all the searchers to maxKey:
+		// we check whether each searcher's key matches maxKey;
+		// if not, we advance that searcher to maxKey,
+		// else we do nothing and move on to the next searcher
+		for i := 0; i < n; i++ {
+			cmp := s.currKeys[i].Compare(maxKey)
+			if cmp < 0 {
+				// not aligned, so advance this searcher to maxKey
+				// convert maxKey to advanceID only once
+				if advanceID == nil {
+					advanceID = s.toAdvanceID(maxKey)
+				}
+				var err error
+				ctx.DocumentMatchPool.Put(s.currs[i])
+				s.currs[i], err = s.searchers[i].Advance(ctx, advanceID)
+				if err != nil {
+					return nil, err
+				}
+				if s.currs[i] == nil {
+					// one of the searchers is exhausted, so we are done
+					return nil, nil
+				}
+				// recalc ancestors
+				s.currAncestors[i], err = s.nestedReader.Ancestors(s.currs[i].IndexInternalID, s.currAncestors[i][:0])
+				if err != nil {
+					return nil, err
+				}
+				// recalc key
+				s.currKeys[i] = ancestorFromRoot(s.currAncestors[i], s.joinIdx)
+				// recalc cmp
+				cmp = s.currKeys[i].Compare(maxKey)
+			}
+			if cmp != 0 {
+				// not aligned
+				aligned = false
+			}
+		}
+		// now check if all the searchers are aligned at the same maxKey;
+		// if they are not, restart the loop that picks the highest key
+		if !aligned {
+			continue OUTER
+		}
+		// if we are here, all the searchers are aligned at maxKey
+		// now we need to buffer all the intermediate matches for every
+		// searcher at this key,
+		// until either the searcher's key changes
+		// or the searcher is exhausted
+		for i := 0; i < n; i++ {
+			for {
+				// buffer the current match
+				recycle, err := s.docQueue.Enqueue(s.currs[i])
+				if err != nil {
+					return nil, err
+				}
+				if recycle != nil {
+					// we got a match to recycle
+					ctx.DocumentMatchPool.Put(recycle)
+				}
+				// advance to the next match
+				s.currs[i], err = s.searchers[i].Next(ctx)
+				if err != nil {
+					return nil, err
+				}
+				if s.currs[i] == nil {
+					// searcher exhausted, break out
+					break
+				}
+				// recalc ancestors
+				s.currAncestors[i], err = s.nestedReader.Ancestors(s.currs[i].IndexInternalID, s.currAncestors[i][:0])
+				if err != nil {
+					return nil, err
+				}
+				// recalc key
+				s.currKeys[i] = ancestorFromRoot(s.currAncestors[i], s.joinIdx)
+				// check if the key has changed
+				if !s.currKeys[i].Equals(maxKey) {
+					// key changed, break out
+					break
+				}
+			}
+		}
+		// finalize the docQueue for dequeueing
+		s.docQueue.Finalize()
+		// finally return the first buffered match
+		return s.docQueue.Dequeue()
+	}
+}
+
+// ancestorFromRoot gets the AncestorID at the given position from the root:
+// if pos is 0, it returns the root AncestorID, and so on
+func ancestorFromRoot(ancestors []index.AncestorID, pos int) index.AncestorID {
+	return ancestors[len(ancestors)-pos-1]
+}
+
+// toAdvanceID converts an AncestorID to an IndexInternalID, reusing the advanceID buffer.
+// The returned ID is safe to pass to Advance() since Advance() never retains references.
+func (s *NestedConjunctionSearcher) toAdvanceID(key index.AncestorID) index.IndexInternalID {
+	// Reset length to 0 while preserving capacity for buffer reuse
+	s.advanceID = s.advanceID[:0]
+	// Convert key to IndexInternalID, reusing the underlying buffer
+	s.advanceID = key.ToIndexInternalID(s.advanceID)
+	return s.advanceID
+}
+
+func (s *NestedConjunctionSearcher) Advance(ctx *search.SearchContext, ID index.IndexInternalID) (*search.DocumentMatch, error) {
+	if !s.initialized {
+		done, err := s.initialize(ctx)
+		if err != nil {
+			return nil, err
+		}
+		if done {
+			return nil, nil
+		}
+	}
+	// first check if the docQueue has any buffered matches;
+	// if so, check whether any of them can satisfy the Advance(ID)
+	for s.docQueue.Len() > 0 {
+		dm, err := s.docQueue.Dequeue()
+		if err != nil {
+			return nil, err
+		}
+		if dm.IndexInternalID.Compare(ID) >= 0 {
+			return dm, nil
+		}
+		// otherwise recycle this match
+		ctx.DocumentMatchPool.Put(dm)
+	}
+	var err error
+	// now we first get the ancestry chain for the given ID
+	s.ancestors, err = s.nestedReader.Ancestors(ID, s.ancestors[:0])
+	if err != nil {
+		return nil, err
+	}
+	// we now apply the following logic for each searcher:
+	// let S be the length of the ancestry chain for the searcher
+	// let I be the length of the ancestry chain for the given ID
+	// 1. if S > I:
+	//    then we just Advance() the searcher to the given ID if required
+	// 2.
else if S <= I: + // then we get the AncestorID at position (S - 1) from the root of + // the given ID's ancestry chain, and Advance() the searcher to + // it if required + for i, searcher := range s.searchers { + if s.currs[i] == nil { + return nil, nil // already exhausted, nothing to do + } + var targetID index.IndexInternalID + S := len(s.currAncestors[i]) + I := len(s.ancestors) + if S > I { + // case 1: S > I + targetID = ID + } else { + // case 2: S <= I + targetID = s.toAdvanceID(ancestorFromRoot(s.ancestors, S-1)) + } + if s.currs[i].IndexInternalID.Compare(targetID) < 0 { + // need to advance this searcher + ctx.DocumentMatchPool.Put(s.currs[i]) + s.currs[i], err = searcher.Advance(ctx, targetID) + if err != nil { + return nil, err + } + if s.currs[i] == nil { + // one of the searchers is exhausted, so we are done + return nil, nil + } + // recalc ancestors + s.currAncestors[i], err = s.nestedReader.Ancestors(s.currs[i].IndexInternalID, s.currAncestors[i][:0]) + if err != nil { + return nil, err + } + // recalc key + s.currKeys[i] = ancestorFromRoot(s.currAncestors[i], s.joinIdx) + } + } + // we need to call Next() in a loop until we reach or exceed the given ID + // the Next() call basically gives us a match that is aligned correctly, but + // if joinIdx < I, we can have multiple matches for the same joinIdx ancestor + // and they may be < ID, so we need to loop + for { + next, err := s.Next(ctx) + if err != nil { + return nil, err + } + if next == nil { + return nil, nil + } + if next.IndexInternalID.Compare(ID) >= 0 { + return next, nil + } + ctx.DocumentMatchPool.Put(next) + } +} + +// ------------------------------------------------------------------------------------------ +type CoalesceQueue struct { + order []*search.DocumentMatch // queue of DocumentMatch + items map[uint64]*search.DocumentMatch // map of ID to DocumentMatch +} + +func NewCoalesceQueue() *CoalesceQueue { + cq := &CoalesceQueue{ + order: make([]*search.DocumentMatch, 0), + items: make(map[uint64]*search.DocumentMatch), + } + return cq +} + +// Enqueue adds the given DocumentMatch to the queue. If a DocumentMatch with the same +// IndexInternalID already exists in the queue, it merges the scores and explanations, +// and returns the given DocumentMatch for recycling. If it's a new entry, it adds it +// to the queue and returns nil. +func (cq *CoalesceQueue) Enqueue(it *search.DocumentMatch) (*search.DocumentMatch, error) { + val, err := it.IndexInternalID.Value() + if err != nil { + // cannot coalesce without a valid uint64 ID + return nil, err + } + + if existing, ok := cq.items[val]; ok { + // merge with current version + existing.Score += it.Score + existing.Expl = existing.Expl.MergeWith(it.Expl) + existing.FieldTermLocations = search.MergeFieldTermLocationsFromMatch( + existing.FieldTermLocations, it) + // return it to caller for recycling + return it, nil + } + + // first time we see this ID — enqueue + cq.items[val] = it + // append to order slice (this is a stack) + cq.order = append(cq.order, it) + // no recycling needed as we added a new item + return nil, nil +} + +// Finalize prepares the queue for dequeue operations by sorting the items based on +// their IndexInternalID values. This MUST be called before any Dequeue operations, +// and after all Enqueue operations are complete. The sort is done in descending order +// so that dequeueing will basically be popping from the end of the slice, allowing for +// slice reuse. 
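+// For example, if matches with internal IDs 2, 9 and 5 were enqueued, Finalize
+// reorders them to [9, 5, 2], so successive Dequeue calls return the matches
+// in ascending ID order: 2, then 5, then 9.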
+func (cq *CoalesceQueue) Finalize() { + slices.SortFunc(cq.order, func(a, b *search.DocumentMatch) int { + return b.IndexInternalID.Compare(a.IndexInternalID) + }) +} + +// Dequeue removes and returns the next DocumentMatch from the queue in sorted order. +// If the queue is empty, it returns nil. +func (cq *CoalesceQueue) Dequeue() (*search.DocumentMatch, error) { + if cq.Len() == 0 { + return nil, nil + } + + // pop from end of slice + rv := cq.order[len(cq.order)-1] + cq.order = cq.order[:len(cq.order)-1] + + val, err := rv.IndexInternalID.Value() + if err != nil { + return nil, err + } + + delete(cq.items, val) + return rv, nil +} + +// Len returns the number of DocumentMatch items currently in the queue. +func (cq *CoalesceQueue) Len() int { + return len(cq.order) +} diff --git a/search/searcher/search_disjunction_heap.go b/search/searcher/search_disjunction_heap.go index 3da876bd3..4c68e5691 100644 --- a/search/searcher/search_disjunction_heap.go +++ b/search/searcher/search_disjunction_heap.go @@ -15,7 +15,6 @@ package searcher import ( - "bytes" "container/heap" "context" "math" @@ -169,7 +168,7 @@ func (s *DisjunctionHeapSearcher) updateMatches() error { matchingIdxs = append(matchingIdxs, next.matchingIdx) // now as long as top of heap matches, keep popping - for len(s.heap) > 0 && bytes.Compare(next.curr.IndexInternalID, s.heap[0].curr.IndexInternalID) == 0 { + for len(s.heap) > 0 && next.curr.IndexInternalID.Equals(s.heap[0].curr.IndexInternalID) { next = heap.Pop(s).(*SearcherCurr) matching = append(matching, next.curr) matchingCurrs = append(matchingCurrs, next) @@ -264,7 +263,7 @@ func (s *DisjunctionHeapSearcher) Advance(ctx *search.SearchContext, // find all searchers that actually need to be advanced // advance them, using s.matchingCurrs as temp storage - for len(s.heap) > 0 && bytes.Compare(s.heap[0].curr.IndexInternalID, ID) < 0 { + for len(s.heap) > 0 && s.heap[0].curr.IndexInternalID.Compare(ID) < 0 { searcherCurr := heap.Pop(s).(*SearcherCurr) ctx.DocumentMatchPool.Put(searcherCurr.curr) curr, err := searcherCurr.searcher.Advance(ctx, ID) @@ -347,7 +346,7 @@ func (s *DisjunctionHeapSearcher) Less(i, j int) bool { } else if s.heap[j].curr == nil { return false } - return bytes.Compare(s.heap[i].curr.IndexInternalID, s.heap[j].curr.IndexInternalID) < 0 + return s.heap[i].curr.IndexInternalID.Compare(s.heap[j].curr.IndexInternalID) < 0 } func (s *DisjunctionHeapSearcher) Swap(i, j int) { diff --git a/search/searcher/search_match_all.go b/search/searcher/search_match_all.go index 57d8d0727..57966a924 100644 --- a/search/searcher/search_match_all.go +++ b/search/searcher/search_match_all.go @@ -36,6 +36,8 @@ type MatchAllSearcher struct { reader index.DocIDReader scorer *scorer.ConstantScorer count uint64 + nested bool + ancestors []index.AncestorID } func NewMatchAllSearcher(ctx context.Context, indexReader index.IndexReader, boost float64, options search.SearcherOptions) (*MatchAllSearcher, error) { @@ -50,11 +52,15 @@ func NewMatchAllSearcher(ctx context.Context, indexReader index.IndexReader, boo } scorer := scorer.NewConstantScorer(1.0, boost, options) + // check if we are in nested mode + nested, _ := ctx.Value(search.NestedSearchKey).(bool) + return &MatchAllSearcher{ indexReader: indexReader, reader: reader, scorer: scorer, count: count, + nested: nested, }, nil } @@ -76,6 +82,23 @@ func (s *MatchAllSearcher) SetQueryNorm(qnorm float64) { s.scorer.SetQueryNorm(qnorm) } +func (s *MatchAllSearcher) isNested(id index.IndexInternalID) bool { + // if not running in 
nested mode, always return false + if !s.nested { + return false + } + var err error + // check if this doc has ancestors, if so it is nested + if nr, ok := s.reader.(index.NestedReader); ok { + s.ancestors, err = nr.Ancestors(id, s.ancestors[:0]) + if err != nil { + return false + } + return len(s.ancestors) > 1 + } + return false +} + func (s *MatchAllSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) { id, err := s.reader.Next() if err != nil { @@ -86,6 +109,11 @@ func (s *MatchAllSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatc return nil, nil } + if s.isNested(id) { + // if nested then skip and get next + return s.Next(ctx) + } + // score match docMatch := s.scorer.Score(ctx, id) // return doc match @@ -103,6 +131,11 @@ func (s *MatchAllSearcher) Advance(ctx *search.SearchContext, ID index.IndexInte return nil, nil } + if s.isNested(id) { + // if nested then return next + return s.Next(ctx) + } + // score match docMatch := s.scorer.Score(ctx, id) diff --git a/search/searcher/search_numeric_range.go b/search/searcher/search_numeric_range.go index f086051c1..cd8f00719 100644 --- a/search/searcher/search_numeric_range.go +++ b/search/searcher/search_numeric_range.go @@ -132,7 +132,7 @@ func filterCandidateTerms(indexReader index.IndexReader, for err == nil && tfd != nil { termBytes := []byte(tfd.Term) i := sort.Search(len(terms), func(i int) bool { return bytes.Compare(terms[i], termBytes) >= 0 }) - if i < len(terms) && bytes.Compare(terms[i], termBytes) == 0 { + if i < len(terms) && bytes.Equal(terms[i], termBytes) { rv = append(rv, terms[i]) } terms = terms[i:] diff --git a/search/util.go b/search/util.go index 06f8f99d5..0c1ec9ab9 100644 --- a/search/util.go +++ b/search/util.go @@ -50,41 +50,54 @@ func MergeTermLocationMaps(rv, other TermLocationMap) TermLocationMap { func MergeFieldTermLocations(dest []FieldTermLocation, matches []*DocumentMatch) []FieldTermLocation { n := len(dest) for _, dm := range matches { - n += len(dm.FieldTermLocations) + if dm != nil { + n += len(dm.FieldTermLocations) + } } if cap(dest) < n { dest = append(make([]FieldTermLocation, 0, n), dest...) } for _, dm := range matches { - for _, ftl := range dm.FieldTermLocations { - dest = append(dest, FieldTermLocation{ - Field: ftl.Field, - Term: ftl.Term, - Location: Location{ - Pos: ftl.Location.Pos, - Start: ftl.Location.Start, - End: ftl.Location.End, - ArrayPositions: append(ArrayPositions(nil), ftl.Location.ArrayPositions...), - }, - }) + if dm != nil { + dest = mergeFieldTermLocationFromMatch(dest, dm) } } return dest } -type SearchIOStatsCallbackFunc func(uint64) +// MergeFieldTermLocationsFromMatch merges field term locations from a single DocumentMatch +// into dest, returning the updated slice. +func MergeFieldTermLocationsFromMatch(dest []FieldTermLocation, match *DocumentMatch) []FieldTermLocation { + if match == nil { + return dest + } + n := len(dest) + len(match.FieldTermLocations) + if cap(dest) < n { + dest = append(make([]FieldTermLocation, 0, n), dest...) + } + return mergeFieldTermLocationFromMatch(dest, match) +} + +// mergeFieldTermLocationFromMatch appends field term locations from a DocumentMatch into dest. +// Assumes dest has sufficient capacity. 
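+// Each Location's ArrayPositions slice is deep-copied, so dest does not alias
+// buffers owned by a DocumentMatch that may later be recycled.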
+func mergeFieldTermLocationFromMatch(dest []FieldTermLocation, dm *DocumentMatch) []FieldTermLocation { + for _, ftl := range dm.FieldTermLocations { + dest = append(dest, FieldTermLocation{ + Field: ftl.Field, + Term: ftl.Term, + Location: Location{ + Pos: ftl.Location.Pos, + Start: ftl.Location.Start, + End: ftl.Location.End, + ArrayPositions: append(ArrayPositions(nil), ftl.Location.ArrayPositions...), + }, + }) + } -// Implementation of SearchIncrementalCostCallbackFn should handle the following messages -// - add: increment the cost of a search operation -// (which can be specific to a query type as well) -// - abort: query was aborted due to a cancel of search's context (for eg), -// which can be handled differently as well -// - done: indicates that a search was complete and the tracked cost can be -// handled safely by the implementation. -type SearchIncrementalCostCallbackFn func(SearchIncrementalCostCallbackMsg, - SearchQueryType, uint64) + return dest +} type ( SearchIncrementalCostCallbackMsg uint @@ -152,6 +165,10 @@ const ( // BM25StatsKey is used to store and transport the BM25 Data // to the actual search phase which would use it to perform the search. BM25StatsKey ContextKey = "_bm25_stats_key" + + // NestedSearchKey is used to communicate whether the search is performed + // in an index with nested documents + NestedSearchKey ContextKey = "_nested_search_key" ) func RecordSearchCost(ctx context.Context, @@ -180,9 +197,7 @@ const ( MinGeoBufPoolSize = 24 ) -type GeoBufferPoolCallbackFunc func() *s2.GeoBufferPool - -// *PreSearchDataKey are used to store the data gathered during the presearch phase +// PreSearchDataKey are used to store the data gathered during the presearch phase // which would be use in the actual search phase. const ( KnnPreSearchDataKey = "_knn_pre_search_data_key" @@ -193,14 +208,39 @@ const ( const GlobalScoring = "_global_scoring" type ( + // SearcherStartCallbackFn is a callback function type used to signal the start of + // searcher creation phase. SearcherStartCallbackFn func(size uint64) error - SearcherEndCallbackFn func(size uint64) error + // SearcherEndCallbackFn is a callback function type used to signal the end of + // a searcher creation phase. + SearcherEndCallbackFn func(size uint64) error + // GetScoringModelCallbackFn is a callback function type used to get the scoring model + // to be used for scoring documents during search. + GetScoringModelCallbackFn func() string + // HybridMergeCallbackFn is a callback function type used to merge a KNN document match + // into a full text search document match, of the same docID as part of hybrid search. + HybridMergeCallbackFn func(ftsMatch *DocumentMatch, knnMatch *DocumentMatch) + // DescendantAdderCallback is a callback function type used to customize how a descendant + // DocumentMatch is merged into its parent. This allows different descendant addition strategies for + // different use cases (e.g., TopN vs KNN collection). + DescendantAdderCallbackFn func(parent *DocumentMatch, descendant *DocumentMatch) error + // GeoBufferPoolCallbackFunc is a callback function type used to get the geo buffer pool + // to be used during geo searches. + GeoBufferPoolCallbackFunc func() *s2.GeoBufferPool + // SearchIOStatsCallbackFunc is a callback function type used to report search IO stats + // during search. 
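+	// The uint64 argument carries the number of bytes read from the index on
+	// behalf of the search being executed.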
+ SearchIOStatsCallbackFunc func(uint64) + // Implementation of SearchIncrementalCostCallbackFn should handle the following messages + // - add: increment the cost of a search operation + // (which can be specific to a query type as well) + // - abort: query was aborted due to a cancel of search's context (for eg), + // which can be handled differently as well + // - done: indicates that a search was complete and the tracked cost can be + // handled safely by the implementation. + SearchIncrementalCostCallbackFn func(SearchIncrementalCostCallbackMsg, + SearchQueryType, uint64) ) -type GetScoringModelCallbackFn func() string - -type ScoreExplCorrectionCallbackFunc func(queryMatch *DocumentMatch, knnMatch *DocumentMatch) (float64, *Explanation) - // field -> term -> synonyms type FieldTermSynonymMap map[string]map[string][]string @@ -233,3 +273,25 @@ type BM25Stats struct { DocCount float64 `json:"doc_count"` FieldCardinality map[string]int `json:"field_cardinality"` } + +// FieldSet represents a set of queried fields. +type FieldSet map[string]struct{} + +// NewFieldSet creates a new FieldSet. +func NewFieldSet() FieldSet { + return make(map[string]struct{}) +} + +// Add adds a field to the set. +func (fs FieldSet) AddField(field string) { + fs[field] = struct{}{} +} + +// Slice returns the fields in this set as a slice of strings. +func (fs FieldSet) Slice() []string { + rv := make([]string, 0, len(fs)) + for field := range fs { + rv = append(rv, field) + } + return rv +} diff --git a/search_knn.go b/search_knn.go index ca3c5d113..271fad3b2 100644 --- a/search_knn.go +++ b/search_knn.go @@ -356,7 +356,7 @@ func addSortAndFieldsToKNNHits(req *SearchRequest, knnHits []*search.DocumentMat } } req.Sort.Value(hit) - err, _ = LoadAndHighlightFields(hit, req, "", reader, nil) + err, _ = LoadAndHighlightAllFields(hit, req, "", reader, nil) if err != nil { return err } @@ -436,17 +436,15 @@ func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, rea return knnHits, nil } -func setKnnHitsInCollector(knnHits []*search.DocumentMatch, req *SearchRequest, coll *collector.TopNCollector) { +func setKnnHitsInCollector(knnHits []*search.DocumentMatch, coll *collector.TopNCollector) { if len(knnHits) > 0 { - newScoreExplComputer := func(queryMatch *search.DocumentMatch, knnMatch *search.DocumentMatch) (float64, *search.Explanation) { - totalScore := queryMatch.Score + knnMatch.Score - if !req.Explain { - // exit early as we don't need to compute the explanation - return totalScore, nil - } - return totalScore, &search.Explanation{Value: totalScore, Message: "sum of:", Children: []*search.Explanation{queryMatch.Expl, knnMatch.Expl}} + mergeFn := func(ftsMatch *search.DocumentMatch, knnMatch *search.DocumentMatch) { + // Boost the FTS score using the KNN score + ftsMatch.Score += knnMatch.Score + // Combine the FTS explanation with the KNN explanation, if present + ftsMatch.Expl.MergeWith(knnMatch.Expl) } - coll.SetKNNHits(knnHits, search.ScoreExplCorrectionCallbackFunc(newScoreExplComputer)) + coll.SetKNNHits(knnHits, search.HybridMergeCallbackFn(mergeFn)) } } diff --git a/search_nested_test.go b/search_nested_test.go new file mode 100644 index 000000000..f22fd5bc0 --- /dev/null +++ b/search_nested_test.go @@ -0,0 +1,1046 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package bleve + +import ( + "encoding/json" + "fmt" + "testing" + + "github.com/blevesearch/bleve/v2/mapping" + "github.com/blevesearch/bleve/v2/search" + "github.com/blevesearch/bleve/v2/search/highlight/highlighter/ansi" + "github.com/blevesearch/bleve/v2/search/query" +) + +func createNestedIndexMapping() mapping.IndexMapping { + /* + company + ├── id + ├── name + ├── departments[] (nested) + │ ├── name + │ ├── budget + │ ├── employees[] (nested) + │ │ ├── name + │ │ ├── role + │ └── projects[] (nested) + │ ├── title + │ ├── status + └── locations[] (nested) + ├── city + ├── country + */ + + // Create the index mapping + imap := mapping.NewIndexMapping() + + // Create company mapping + companyMapping := mapping.NewDocumentMapping() + + // Company ID field + companyIDField := mapping.NewTextFieldMapping() + companyMapping.AddFieldMappingsAt("id", companyIDField) + + // Company name field + companyNameField := mapping.NewTextFieldMapping() + companyMapping.AddFieldMappingsAt("name", companyNameField) + + // Departments mapping + departmentsMapping := mapping.NewNestedDocumentMapping() + + // Department name field + deptNameField := mapping.NewTextFieldMapping() + departmentsMapping.AddFieldMappingsAt("name", deptNameField) + + // Department budget field + deptBudgetField := mapping.NewNumericFieldMapping() + departmentsMapping.AddFieldMappingsAt("budget", deptBudgetField) + + // Employees mapping + employeesMapping := mapping.NewNestedDocumentMapping() + + // Employee name field + empNameField := mapping.NewTextFieldMapping() + employeesMapping.AddFieldMappingsAt("name", empNameField) + + // Employee role field + empRoleField := mapping.NewTextFieldMapping() + employeesMapping.AddFieldMappingsAt("role", empRoleField) + + departmentsMapping.AddSubDocumentMapping("employees", employeesMapping) + + // Projects mapping + projectsMapping := mapping.NewNestedDocumentMapping() + + // Project title field + projTitleField := mapping.NewTextFieldMapping() + projectsMapping.AddFieldMappingsAt("title", projTitleField) + + // Project status field + projStatusField := mapping.NewTextFieldMapping() + projectsMapping.AddFieldMappingsAt("status", projStatusField) + + departmentsMapping.AddSubDocumentMapping("projects", projectsMapping) + + companyMapping.AddSubDocumentMapping("departments", departmentsMapping) + + // Locations mapping + locationsMapping := mapping.NewNestedDocumentMapping() + + // Location city field + cityField := mapping.NewTextFieldMapping() + locationsMapping.AddFieldMappingsAt("city", cityField) + + // Location country field + countryField := mapping.NewTextFieldMapping() + locationsMapping.AddFieldMappingsAt("country", countryField) + + companyMapping.AddSubDocumentMapping("locations", locationsMapping) + + // Add company to type mapping + imap.DefaultMapping.AddSubDocumentMapping("company", companyMapping) + + return imap +} + +func TestNestedPrefixes(t *testing.T) { + imap := createNestedIndexMapping() + + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + idx, err := New(tmpIndexPath, imap) + if err != nil { + t.Fatal(err) + 
} + defer func() { + if err := idx.Close(); err != nil { + t.Fatal(err) + } + }() + + nmap, ok := imap.(mapping.NestedMapping) + if !ok { + t.Fatal("index mapping is not a NestedMapping") + } + + // ---------------------------------------------------------------------- + // Test 1: Employee Role AND Employee Name + // ---------------------------------------------------------------------- + fs := search.NewFieldSet() + fs.AddField("company.departments.employees.role") + fs.AddField("company.departments.employees.name") + + expectedCommon := 2 + expectedMax := 2 + + common, max := nmap.NestedDepth(fs) + if common != expectedCommon || max != expectedMax { + t.Fatalf("Test1: expected (common=%d, max=%d), got (common=%d, max=%d)", + expectedCommon, expectedMax, common, max) + } + + // ---------------------------------------------------------------------- + // Test 2: Employee Role AND Employee Name AND Department Name + // ---------------------------------------------------------------------- + fs = search.NewFieldSet() + fs.AddField("company.departments.employees.role") + fs.AddField("company.departments.employees.name") + fs.AddField("company.departments.name") + + expectedCommon = 1 + expectedMax = 2 // employees nested deeper + + common, max = nmap.NestedDepth(fs) + if common != expectedCommon || max != expectedMax { + t.Fatalf("Test2: expected (common=%d, max=%d), got (common=%d, max=%d)", + expectedCommon, expectedMax, common, max) + } + + // ---------------------------------------------------------------------- + // Test 3: Employee Role AND Location City + // ---------------------------------------------------------------------- + fs = search.NewFieldSet() + fs.AddField("company.departments.employees.role") + fs.AddField("company.locations.city") + + expectedCommon = 0 + expectedMax = 2 // employees deeper than locations (1) + + common, max = nmap.NestedDepth(fs) + if common != expectedCommon || max != expectedMax { + t.Fatalf("Test3: expected (common=%d, max=%d), got (common=%d, max=%d)", + expectedCommon, expectedMax, common, max) + } + + // ---------------------------------------------------------------------- + // Test 4: Company Name AND Location Country + // ---------------------------------------------------------------------- + fs = search.NewFieldSet() + fs.AddField("company.name") + fs.AddField("company.locations.country") + fs.AddField("company.locations.city") + + expectedCommon = 0 + expectedMax = 1 // locations.country and locations.city share depth 1 + + common, max = nmap.NestedDepth(fs) + if common != expectedCommon || max != expectedMax { + t.Fatalf("Test4: expected (common=%d, max=%d), got (common=%d, max=%d)", + expectedCommon, expectedMax, common, max) + } + + // ---------------------------------------------------------------------- + // Test 5: Department Budget AND Project Status AND Employee Name + // ---------------------------------------------------------------------- + fs = search.NewFieldSet() + fs.AddField("company.departments.budget") + fs.AddField("company.departments.projects.status") + fs.AddField("company.departments.employees.name") + + expectedCommon = 1 + expectedMax = 2 // employees + projects go deeper + + common, max = nmap.NestedDepth(fs) + if common != expectedCommon || max != expectedMax { + t.Fatalf("Test5: expected (common=%d, max=%d), got (common=%d, max=%d)", + expectedCommon, expectedMax, common, max) + } + + // ---------------------------------------------------------------------- + // Test 6: Single Field + // 
---------------------------------------------------------------------- + fs = search.NewFieldSet() + fs.AddField("company.id") + + expectedCommon = 0 + expectedMax = 0 + + common, max = nmap.NestedDepth(fs) + if common != expectedCommon || max != expectedMax { + t.Fatalf("Test6: expected (common=%d, max=%d), got (common=%d, max=%d)", + expectedCommon, expectedMax, common, max) + } + + // ---------------------------------------------------------------------- + // Test 7: No Fields + // ---------------------------------------------------------------------- + fs = search.NewFieldSet() + + expectedCommon = 0 + expectedMax = 0 + + common, max = nmap.NestedDepth(fs) + if common != expectedCommon || max != expectedMax { + t.Fatalf("Test7: expected (common=%d, max=%d), got (common=%d, max=%d)", + expectedCommon, expectedMax, common, max) + } + + // ---------------------------------------------------------------------- + // Test 8: All Fields + // ---------------------------------------------------------------------- + fs = search.NewFieldSet() + fs.AddField("company.id") + fs.AddField("company.name") + fs.AddField("company.departments.name") + fs.AddField("company.departments.budget") + fs.AddField("company.departments.employees.name") + fs.AddField("company.departments.employees.role") + fs.AddField("company.departments.projects.title") + fs.AddField("company.departments.projects.status") + fs.AddField("company.locations.city") + fs.AddField("company.locations.country") + + expectedCommon = 0 // spans different contexts + expectedMax = 2 + + common, max = nmap.NestedDepth(fs) + if common != expectedCommon || max != expectedMax { + t.Fatalf("Test8: expected (common=%d, max=%d), got (common=%d, max=%d)", + expectedCommon, expectedMax, common, max) + } + + // ---------------------------------------------------------------------- + // Test 9: Project Title AND Project Status + // ---------------------------------------------------------------------- + fs = search.NewFieldSet() + fs.AddField("company.departments.projects.title") + fs.AddField("company.departments.projects.status") + + expectedCommon = 2 + expectedMax = 2 + + common, max = nmap.NestedDepth(fs) + if common != expectedCommon || max != expectedMax { + t.Fatalf("Test9: expected (common=%d, max=%d), got (common=%d, max=%d)", + expectedCommon, expectedMax, common, max) + } + + // ---------------------------------------------------------------------- + // Test 10: Department Name AND Location Country + // ---------------------------------------------------------------------- + fs = search.NewFieldSet() + fs.AddField("company.departments.name") + fs.AddField("company.locations.country") + fs.AddField("company.locations.city") + + expectedCommon = 0 + expectedMax = 1 // locations share depth 1 + + common, max = nmap.NestedDepth(fs) + if common != expectedCommon || max != expectedMax { + t.Fatalf("Test10: expected (common=%d, max=%d), got (common=%d, max=%d)", + expectedCommon, expectedMax, common, max) + } +} + +func TestNestedConjunctionQuery(t *testing.T) { + imap := createNestedIndexMapping() + err := imap.Validate() + if err != nil { + t.Fatalf("expected valid nested index mapping, got error: %v", err) + } + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + idx, err := New(tmpIndexPath, imap) + if err != nil { + t.Fatal(err) + } + defer func() { + err = idx.Close() + if err != nil { + t.Fatal(err) + } + }() + // Index 3 sample documents + docs := []struct { + id string + data string + }{ + { + id: "doc1", 
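+			// TechCorp: Engineering and Sales departments; offices in Athens (Greece) and Berlin (USA)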
+ data: `{ + "company": { + "id": "c1", + "name": "TechCorp", + "departments": [ + { + "name": "Engineering", + "budget": 2000000, + "employees": [ + {"name": "Alice", "role": "Engineer"}, + {"name": "Bob", "role": "Manager"} + ], + "projects": [ + {"title": "Project X", "status": "ongoing"}, + {"title": "Project Y", "status": "completed"} + ] + }, + { + "name": "Sales", + "budget": 300000, + "employees": [ + {"name": "Eve", "role": "Salesperson"}, + {"name": "Mallory", "role": "Manager"} + ], + "projects": [ + {"title": "Project A", "status": "completed"}, + {"title": "Project B", "status": "ongoing"} + ] + } + ], + "locations": [ + {"city": "Athens", "country": "Greece"}, + {"city": "Berlin", "country": "USA"} + ] + } + }`, + }, + { + id: "doc2", + data: `{ + "company" : { + "id": "c2", + "name": "BizInc", + "departments": [ + { + "name": "Marketing", + "budget": 800000, + "employees": [ + {"name": "Eve", "role": "Marketer"}, + {"name": "David", "role": "Manager"} + ], + "projects": [ + {"title": "Project Z", "status": "ongoing"}, + {"title": "Project W", "status": "planned"} + ] + }, + { + "name": "Engineering", + "budget": 800000, + "employees": [ + {"name": "Frank", "role": "Manager"}, + {"name": "Grace", "role": "Engineer"} + ], + "projects": [ + {"title": "Project Alpha", "status": "completed"}, + {"title": "Project Beta", "status": "ongoing"} + ] + } + ], + "locations": [ + {"city": "Athens", "country": "USA"}, + {"city": "London", "country": "UK"} + ] + } + }`, + }, + { + id: "doc3", + data: `{ + "company": { + "id": "c3", + "name": "WebSolutions", + "departments": [ + { + "name": "HR", + "budget": 800000, + "employees": [ + {"name": "Eve", "role": "Manager"}, + {"name": "Frank", "role": "HR"} + ], + "projects": [ + {"title": "Project Beta", "status": "completed"}, + {"title": "Project B", "status": "ongoing"} + ] + }, + { + "name": "Engineering", + "budget": 200000, + "employees": [ + {"name": "Heidi", "role": "Support Engineer"}, + {"name": "Ivan", "role": "Manager"} + ], + "projects": [ + {"title": "Project Helpdesk", "status": "ongoing"}, + {"title": "Project FAQ", "status": "completed"} + ] + } + ], + "locations": [ + {"city": "Edinburgh", "country": "UK"}, + {"city": "London", "country": "Canada"} + ] + } + }`, + }, + } + + for _, doc := range docs { + var dataMap map[string]interface{} + err := json.Unmarshal([]byte(doc.data), &dataMap) + if err != nil { + t.Fatalf("failed to unmarshal document %s: %v", doc.id, err) + } + err = idx.Index(doc.id, dataMap) + if err != nil { + t.Fatalf("failed to index document %s: %v", doc.id, err) + } + } + + var buildReq = func(subQueries []query.Query) *SearchRequest { + rv := NewSearchRequest(query.NewConjunctionQuery(subQueries)) + rv.SortBy([]string{"_id"}) + rv.Fields = []string{"*"} + rv.Highlight = NewHighlightWithStyle(ansi.Name) + return rv + } + + var ( + req *SearchRequest + res *SearchResult + deptNameQuery *query.MatchQuery + deptBudgetQuery *query.NumericRangeQuery + empNameQuery *query.MatchQuery + empRoleQuery *query.MatchQuery + projTitleQuery *query.MatchPhraseQuery + projStatusQuery *query.MatchQuery + countryQuery *query.MatchQuery + cityQuery *query.MatchQuery + ) + + // Test 1: Find companies with a department named "Engineering" AND budget over 900000 + deptNameQuery = query.NewMatchQuery("Engineering") + deptNameQuery.SetField("company.departments.name") + + min := float64(800000) + deptBudgetQuery = query.NewNumericRangeQuery(&min, nil) + deptBudgetQuery.SetField("company.departments.budget") + + req = 
buildReq([]query.Query{deptNameQuery, deptBudgetQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 2 { + t.Fatalf("expected 2 hit, got %d", len(res.Hits)) + } + if res.Hits[0].ID != "doc1" || res.Hits[1].ID != "doc2" { + t.Fatalf("unexpected hit IDs: %v, %v", res.Hits[0].ID, res.Hits[1].ID) + } + + // Test 2: Find companies with an employee named "Eve" AND project status "completed" + empNameQuery = query.NewMatchQuery("Eve") + empNameQuery.SetField("company.departments.employees.name") + + projStatusQuery = query.NewMatchQuery("completed") + projStatusQuery.SetField("company.departments.projects.status") + + req = buildReq([]query.Query{empNameQuery, projStatusQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 2 { + t.Fatalf("expected 2 hits, got %d", len(res.Hits)) + } + if res.Hits[0].ID != "doc1" || res.Hits[1].ID != "doc3" { + t.Fatalf("unexpected hit IDs: %v, %v", res.Hits[0].ID, res.Hits[1].ID) + } + + // Test 3: Find companies located in "Athens, USA" AND with an Engineering department + countryQuery = query.NewMatchQuery("USA") + countryQuery.SetField("company.locations.country") + + cityQuery = query.NewMatchQuery("Athens") + cityQuery.SetField("company.locations.city") + + locQuery := query.NewConjunctionQuery([]query.Query{countryQuery, cityQuery}) + + deptNameQuery = query.NewMatchQuery("Engineering") + deptNameQuery.SetField("company.departments.name") + + req = buildReq([]query.Query{locQuery, deptNameQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected 1 hit, got %d", len(res.Hits)) + } + if res.Hits[0].ID != "doc2" { + t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) + } + + // Test 4a: Find companies located in "Athens, USA" AND with an Engineering department with a budget over 1M + countryQuery = query.NewMatchQuery("USA") + countryQuery.SetField("company.locations.country") + + cityQuery = query.NewMatchQuery("Athens") + cityQuery.SetField("company.locations.city") + + locQuery = query.NewConjunctionQuery([]query.Query{countryQuery, cityQuery}) + + deptNameQuery = query.NewMatchQuery("Engineering") + deptNameQuery.SetField("company.departments.name") + + min = float64(1000000) + deptBudgetQuery = query.NewNumericRangeQuery(&min, nil) + deptBudgetQuery.SetField("company.departments.budget") + + deptQuery := query.NewConjunctionQuery([]query.Query{deptNameQuery, deptBudgetQuery}) + + req = buildReq([]query.Query{locQuery, deptQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 0 { + t.Fatalf("expected 0 hits, got %d", len(res.Hits)) + } + + // Test 4b: Find companies located in "Athens, Greece" AND with an Engineering department with a budget over 1M + countryQuery = query.NewMatchQuery("Greece") + countryQuery.SetField("company.locations.country") + + cityQuery = query.NewMatchQuery("Athens") + cityQuery.SetField("company.locations.city") + + locQuery = query.NewConjunctionQuery([]query.Query{countryQuery, cityQuery}) + + deptNameQuery = query.NewMatchQuery("Engineering") + deptNameQuery.SetField("company.departments.name") + + min = float64(1000000) + deptBudgetQuery = query.NewNumericRangeQuery(&min, nil) + deptBudgetQuery.SetField("company.departments.budget") + + deptQuery = query.NewConjunctionQuery([]query.Query{deptNameQuery, deptBudgetQuery}) + + req = 
buildReq([]query.Query{locQuery, deptQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected 1 hits, got %d", len(res.Hits)) + } + if res.Hits[0].ID != "doc1" { + t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) + } + + // Test 5a: Find companies with an employee named "Frank" AND role "Manager" whose department is + // handling a project titled "Project Beta" which is marked as "completed" + empNameQuery = query.NewMatchQuery("Frank") + empNameQuery.SetField("company.departments.employees.name") + + empRoleQuery = query.NewMatchQuery("Manager") + empRoleQuery.SetField("company.departments.employees.role") + + empQuery := query.NewConjunctionQuery([]query.Query{empNameQuery, empRoleQuery}) + + projTitleQuery = query.NewMatchPhraseQuery("Project Beta") + projTitleQuery.SetField("company.departments.projects.title") + + projStatusQuery = query.NewMatchQuery("completed") + projStatusQuery.SetField("company.departments.projects.status") + + projQuery := query.NewConjunctionQuery([]query.Query{projTitleQuery, projStatusQuery}) + + req = buildReq([]query.Query{empQuery, projQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 0 { + t.Fatalf("expected 0 hit, got %d", len(res.Hits)) + } + + // Test 5b: Find companies with an employee named "Frank" AND role "Manager" whose department is + // handling a project titled "Project Beta" which is marked as "ongoing" + empNameQuery = query.NewMatchQuery("Frank") + empNameQuery.SetField("company.departments.employees.name") + + empRoleQuery = query.NewMatchQuery("Manager") + empRoleQuery.SetField("company.departments.employees.role") + + empQuery = query.NewConjunctionQuery([]query.Query{empNameQuery, empRoleQuery}) + + projTitleQuery = query.NewMatchPhraseQuery("Project Beta") + projTitleQuery.SetField("company.departments.projects.title") + + projStatusQuery = query.NewMatchQuery("ongoing") + projStatusQuery.SetField("company.departments.projects.status") + + projQuery = query.NewConjunctionQuery([]query.Query{projTitleQuery, projStatusQuery}) + + req = buildReq([]query.Query{empQuery, projQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected 1 hit, got %d", len(res.Hits)) + } + if res.Hits[0].ID != "doc2" { + t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) + } + + // Test 6a: Find companies with an employee named "Eve" AND role "Manager" + // who is working in a department located in "London, UK" + empNameQuery = query.NewMatchQuery("Eve") + empNameQuery.SetField("company.departments.employees.name") + + empRoleQuery = query.NewMatchQuery("Manager") + empRoleQuery.SetField("company.departments.employees.role") + + empQuery = query.NewConjunctionQuery([]query.Query{empNameQuery, empRoleQuery}) + + countryQuery = query.NewMatchQuery("UK") + countryQuery.SetField("company.locations.country") + + cityQuery = query.NewMatchQuery("London") + cityQuery.SetField("company.locations.city") + + locQuery = query.NewConjunctionQuery([]query.Query{countryQuery, cityQuery}) + + req = buildReq([]query.Query{empQuery, locQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 0 { + t.Fatalf("expected 0 hit, got %d", len(res.Hits)) + } + + // Test 6b: Find companies with an employee named "Eve" AND role "Manager" + // who is working in a department located in "London, 
Canada" + empNameQuery = query.NewMatchQuery("Eve") + empNameQuery.SetField("company.departments.employees.name") + + empRoleQuery = query.NewMatchQuery("Manager") + empRoleQuery.SetField("company.departments.employees.role") + + empQuery = query.NewConjunctionQuery([]query.Query{empNameQuery, empRoleQuery}) + + countryQuery = query.NewMatchQuery("Canada") + countryQuery.SetField("company.locations.country") + + cityQuery = query.NewMatchQuery("London") + cityQuery.SetField("company.locations.city") + + locQuery = query.NewConjunctionQuery([]query.Query{countryQuery, cityQuery}) + + req = buildReq([]query.Query{empQuery, locQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected 1 hit, got %d", len(res.Hits)) + } + if res.Hits[0].ID != "doc3" { + t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) + } + + // Test 7a: Find companies where Ivan the Manager works London, UK + + empNameQuery = query.NewMatchQuery("Ivan") + empNameQuery.SetField("company.departments.employees.name") + + empRoleQuery = query.NewMatchQuery("Manager") + empRoleQuery.SetField("company.departments.employees.role") + + empQuery = query.NewConjunctionQuery([]query.Query{empNameQuery, empRoleQuery}) + + countryQuery = query.NewMatchQuery("UK") + countryQuery.SetField("company.locations.country") + + cityQuery = query.NewMatchQuery("London") + cityQuery.SetField("company.locations.city") + + locQuery = query.NewConjunctionQuery([]query.Query{countryQuery, cityQuery}) + + req = buildReq([]query.Query{empQuery, locQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 0 { + t.Fatalf("expected 0 hit, got %d", len(res.Hits)) + } + + // Test 7b: Find companies where Ivan the Manager works London, Canada + + empNameQuery = query.NewMatchQuery("Ivan") + empNameQuery.SetField("company.departments.employees.name") + + empRoleQuery = query.NewMatchQuery("Manager") + empRoleQuery.SetField("company.departments.employees.role") + + empQuery = query.NewConjunctionQuery([]query.Query{empNameQuery, empRoleQuery}) + + countryQuery = query.NewMatchQuery("Canada") + countryQuery.SetField("company.locations.country") + + cityQuery = query.NewMatchQuery("London") + cityQuery.SetField("company.locations.city") + + locQuery = query.NewConjunctionQuery([]query.Query{countryQuery, cityQuery}) + + req = buildReq([]query.Query{empQuery, locQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected 1 hit, got %d", len(res.Hits)) + } + if res.Hits[0].ID != "doc3" { + t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) + } + + // Test 8: Find companies where Frank the Manager works in Engineering department located in London, UK + empNameQuery = query.NewMatchQuery("Frank") + empNameQuery.SetField("company.departments.employees.name") + + empRoleQuery = query.NewMatchQuery("Manager") + empRoleQuery.SetField("company.departments.employees.role") + + empQuery = query.NewConjunctionQuery([]query.Query{empNameQuery, empRoleQuery}) + + deptNameQuery = query.NewMatchQuery("Engineering") + deptNameQuery.SetField("company.departments.name") + + deptQuery = query.NewConjunctionQuery([]query.Query{empQuery, deptNameQuery}) + + countryQuery = query.NewMatchQuery("UK") + countryQuery.SetField("company.locations.country") + + cityQuery = query.NewMatchQuery("London") + cityQuery.SetField("company.locations.city") + + locQuery = 
query.NewConjunctionQuery([]query.Query{countryQuery, cityQuery}) + + req = buildReq([]query.Query{deptQuery, locQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected 1 hit, got %d", len(res.Hits)) + } + if res.Hits[0].ID != "doc2" { + t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) + } +} + +func TestNestedArrayConjunctionQuery(t *testing.T) { + imap := NewIndexMapping() + groupsMapping := mapping.NewNestedDocumentMapping() + + nameField := mapping.NewTextFieldMapping() + groupsMapping.AddFieldMappingsAt("first_name", nameField) + groupsMapping.AddFieldMappingsAt("last_name", nameField) + + imap.DefaultMapping.AddSubDocumentMapping("groups", groupsMapping) + + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + idx, err := New(tmpIndexPath, imap) + if err != nil { + t.Fatal(err) + } + defer func() { + err = idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + docs := []string{ + `{ + "groups": [ + [ + { + "first_name": "Alice", + "last_name": "Smith" + }, + { + "first_name": "Bob", + "last_name": "Johnson" + } + ], + [ + { + "first_name": "Charlie", + "last_name": "Williams" + }, + { + "first_name": "Diana", + "last_name": "Brown" + } + ] + ] + }`, + `{ + "groups": [ + { + "first_name": "Alice", + "last_name": "Smith" + }, + { + "first_name": "Bob", + "last_name": "Johnson" + }, + { + "first_name": "Charlie", + "last_name": "Williams" + }, + { + "first_name": "Diana", + "last_name": "Brown" + } + ] + }`, + } + + for i, doc := range docs { + var dataMap map[string]interface{} + err := json.Unmarshal([]byte(doc), &dataMap) + if err != nil { + t.Fatalf("failed to unmarshal document %d: %v", i, err) + } + err = idx.Index(fmt.Sprintf("%d", i+1), dataMap) + if err != nil { + t.Fatalf("failed to index document %d: %v", i, err) + } + } + + var ( + firstNameQuery *query.MatchQuery + lastNameQuery *query.MatchQuery + conjQuery *query.ConjunctionQuery + searchReq *SearchRequest + res *SearchResult + ) + + // Search for documents where first_name is "Alice" AND last_name is "Johnson" + firstNameQuery = query.NewMatchQuery("Alice") + firstNameQuery.SetField("groups.first_name") + + lastNameQuery = query.NewMatchQuery("Johnson") + lastNameQuery.SetField("groups.last_name") + + conjQuery = query.NewConjunctionQuery([]query.Query{firstNameQuery, lastNameQuery}) + + searchReq = NewSearchRequest(conjQuery) + searchReq.SortBy([]string{"_id"}) + + res, err = idx.Search(searchReq) + if err != nil { + t.Fatalf("search failed: %v", err) + } + + if len(res.Hits) != 0 { + t.Fatalf("expected 0 hits, got %d", len(res.Hits)) + } + + // Search for documents where first_name is "Bob" AND last_name is "Johnson" + firstNameQuery = query.NewMatchQuery("Bob") + firstNameQuery.SetField("groups.first_name") + + lastNameQuery = query.NewMatchQuery("Johnson") + lastNameQuery.SetField("groups.last_name") + + conjQuery = query.NewConjunctionQuery([]query.Query{firstNameQuery, lastNameQuery}) + + searchReq = NewSearchRequest(conjQuery) + searchReq.SortBy([]string{"_id"}) + + res, err = idx.Search(searchReq) + if err != nil { + t.Fatalf("search failed: %v", err) + } + + if len(res.Hits) != 2 { + t.Fatalf("expected 2 hits, got %d", len(res.Hits)) + } + + if res.Hits[0].ID != "1" || res.Hits[1].ID != "2" { + t.Fatalf("unexpected hit IDs: %v, %v", res.Hits[0].ID, res.Hits[1].ID) + } + + // Search for documents where first_name is "Alice" AND last_name is "Williams" + firstNameQuery = 
query.NewMatchQuery("Alice") + firstNameQuery.SetField("groups.first_name") + + lastNameQuery = query.NewMatchQuery("Williams") + lastNameQuery.SetField("groups.last_name") + + conjQuery = query.NewConjunctionQuery([]query.Query{firstNameQuery, lastNameQuery}) + + searchReq = NewSearchRequest(conjQuery) + searchReq.SortBy([]string{"_id"}) + + res, err = idx.Search(searchReq) + if err != nil { + t.Fatalf("search failed: %v", err) + } + + if len(res.Hits) != 0 { + t.Fatalf("expected 0 hits, got %d", len(res.Hits)) + } + + // Search for documents where first_name is "Diana" AND last_name is "Brown" + firstNameQuery = query.NewMatchQuery("Diana") + firstNameQuery.SetField("groups.first_name") + + lastNameQuery = query.NewMatchQuery("Brown") + lastNameQuery.SetField("groups.last_name") + + conjQuery = query.NewConjunctionQuery([]query.Query{firstNameQuery, lastNameQuery}) + + searchReq = NewSearchRequest(conjQuery) + searchReq.SortBy([]string{"_id"}) + + res, err = idx.Search(searchReq) + if err != nil { + t.Fatalf("search failed: %v", err) + } + + if len(res.Hits) != 2 { + t.Fatalf("expected 2 hits, got %d", len(res.Hits)) + } + + if res.Hits[0].ID != "1" || res.Hits[1].ID != "2" { + t.Fatalf("unexpected hit IDs: %v, %v", res.Hits[0].ID, res.Hits[1].ID) + } +} + +func TestValidNestedMapping(t *testing.T) { + // ensure that top-level mappings - DefaultMapping and any type mappings - cannot be nested mappings + imap := mapping.NewIndexMapping() + nestedMapping := mapping.NewNestedDocumentMapping() + imap.DefaultMapping = nestedMapping + err := imap.Validate() + if err == nil { + t.Fatalf("expected error for nested DefaultMapping, got nil") + } + // invalid nested type mapping + imap = mapping.NewIndexMapping() + imap.AddDocumentMapping("type1", nestedMapping) + err = imap.Validate() + if err == nil { + t.Fatalf("expected error for nested type mapping, got nil") + } + // valid nested mappings within DefaultMapping + imap = mapping.NewIndexMapping() + docMapping := mapping.NewDocumentMapping() + nestedMapping = mapping.NewNestedDocumentMapping() + fieldMapping := mapping.NewTextFieldMapping() + nestedMapping.AddFieldMappingsAt("field1", fieldMapping) + docMapping.AddSubDocumentMapping("nestedField", nestedMapping) + imap.DefaultMapping = docMapping + err = imap.Validate() + if err != nil { + t.Fatalf("expected valid nested mapping, got error: %v", err) + } + // valid nested mappings within type mapping + imap = mapping.NewIndexMapping() + docMapping = mapping.NewDocumentMapping() + nestedMapping = mapping.NewNestedDocumentMapping() + fieldMapping = mapping.NewTextFieldMapping() + nestedMapping.AddFieldMappingsAt("field1", fieldMapping) + docMapping.AddSubDocumentMapping("nestedField", nestedMapping) + imap.AddDocumentMapping("type1", docMapping) + err = imap.Validate() + if err != nil { + t.Fatalf("expected valid nested mapping, got error: %v", err) + } + // some nested type mappings + imap = mapping.NewIndexMapping() + nestedMapping = mapping.NewNestedDocumentMapping() + regularMapping := mapping.NewDocumentMapping() + imap.AddDocumentMapping("non_nested1", regularMapping) + imap.AddDocumentMapping("non_nested2", regularMapping) + imap.AddDocumentMapping("nested1", nestedMapping) + imap.AddDocumentMapping("nested2", nestedMapping) + err = imap.Validate() + if err == nil { + t.Fatalf("expected error for nested type mappings, got nil") + } +} diff --git a/search_no_knn.go b/search_no_knn.go index c91980589..54eba4219 100644 --- a/search_no_knn.go +++ b/search_no_knn.go @@ -177,7 +177,7 @@ func (i 
*indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, rea
 	return nil, nil
 }
 
-func setKnnHitsInCollector(knnHits []*search.DocumentMatch, req *SearchRequest, coll *collector.TopNCollector) {
+func setKnnHitsInCollector(knnHits []*search.DocumentMatch, coll *collector.TopNCollector) {
 }
 
 func requestHasKNN(req *SearchRequest) bool {

From c20eae05c639a888c0c2ce81a06a931689a89f3b Mon Sep 17 00:00:00 2001
From: Rahul Rampure
Date: Tue, 20 Jan 2026 02:09:13 +0530
Subject: [PATCH 3/9] MB-27666: Fix nested mode handling for field-implicit queries (#2272)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Fix nested-mode search for field-implicit queries (`match_all`, `docID`):
  These queries previously filtered nested documents at the searcher level,
  but since `ExtractFields` returned no fields, the nested collector was
  never activated. In compound queries (e.g., Boolean filters), this caused
  a level mismatch (searcher-level filtering vs. nested-level execution),
  resulting in zero matches.
- Unify filtering at the collector level: introduced `HasID()` and
  `HasAll()` flags in `FieldSet` to track field-implicit queries. Removed
  searcher-level filtering from `MatchAllSearcher`.
- The nested collector now activates correctly, ensuring consistent
  behavior regardless of query composition.
- Fix an edge case where using a `match_all` query as a sub-clause in a
  `NestedConjunctionSearcher` could cause a panic.
- Fix a case where root document fields would get excluded from the `_all`
  field even when included.
- Fix outdated information in `hierarchy.md`.
- Add unit tests for hierarchical nested vector search.
---
 docs/hierarchy.md                   |  10 +-
 document/document.go                |  22 -
 index_impl.go                       |   2 +-
 mapping/index.go                    |   8 +-
 search/query/conjunction.go         |   6 +
 search/query/query.go               |   5 +
 search/query/query_test.go          |  12 +
 search/searcher/search_match_all.go |  33 --
 search/util.go                      |  17 +-
 search_knn_test.go                  | 614 ++++++++++++++++++++++++++++
 search_nested_test.go               | 170 ++++++++
 11 files changed, 824 insertions(+), 75 deletions(-)

diff --git a/docs/hierarchy.md b/docs/hierarchy.md
index bcbc259c4..39410864d 100644
--- a/docs/hierarchy.md
+++ b/docs/hierarchy.md
@@ -1,6 +1,6 @@
 # Hierarchical nested search
 
-* *v2.6.0* (and after) will come with support for **Array indexing and hierarchy search**.
+* *v2.6.0* (and after) will come with support for **Array indexing and hierarchical nested search**.
 * We've achieved this by embedding nested documents within our bleve (scorch) indexes.
 * Usage of zap file format: [v17](https://github.com/blevesearch/zapx/blob/master/zap.md). Here we preserve hierarchical document relationships within segments, continuing to conform to the segmented architecture of *scorch*.
@@ -146,7 +146,7 @@
 }
 ```

-* Any Bleve query (e.g., match, phrase, term, fuzzy, numeric/date range etc.) can be executed against fields within nested documents, with no special handling required. The query processor will automatically traverse the nested structures to find matches. Additional search constructs
+* Any Bleve query (e.g., `match`, `phrase`, `term`, `fuzzy`, `numeric/date range` etc.) can be executed against fields within nested documents, with no special handling required. The query processor will automatically traverse the nested structures to find matches. Additional search constructs
like vector search, synonym search, hybrid and pre-filtered vector search integrate seamlessly with hierarchy search.
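For instance, a minimal sketch of running an ordinary query against a nested field path (assuming an existing index whose mapping marks `departments.employees` as a nested document mapping; the path `example.bleve` is hypothetical):

```go
package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2"
	"github.com/blevesearch/bleve/v2/search/query"
)

func main() {
	idx, err := bleve.Open("example.bleve") // hypothetical existing index
	if err != nil {
		panic(err)
	}
	defer idx.Close()

	// an ordinary match query against a field inside a nested array; the
	// query processor traverses the nested structure, and hits come back
	// as top-level documents
	q := query.NewMatchQuery("Alice")
	q.SetField("departments.employees.name")

	res, err := idx.Search(bleve.NewSearchRequest(q))
	if err != nil {
		panic(err)
	}
	fmt.Println(res.Hits)
}
```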
* Conjunction Queries (AND queries) and other queries that depend on term co-occurrence within the same hierarchical context will respect the boundaries of nested documents. This means that terms must appear within the same nested object to be considered a match. For example, a conjunction query searching for an employee named "Alice" with the role "Engineer" within the "Engineering" department will only return results where both name and role terms are found within the same employee object, which is itself within an "Engineering" department object.
@@ -156,11 +156,11 @@ like vector search, synonym search, hybrid and pre-filtered vector search integr
 * Nested Faceting / Aggregations: Facets are computed within matched nested objects, producing context-aware buckets. E.g., a facet on `departments.projects.status` returns ongoing or completed only for projects in matched departments.
 
- * Sorting by Nested Fields: Sorting can use fields from the relevant nested object, e.g., ordering companies by `departments.budget sorts` based on the budget of the specific matched department, not unrelated departments.
+ * Sorting by Nested Fields: Sorting can use fields from the relevant nested object, e.g., ordering companies by `departments.budget` sorts based on the budget of the specific matched department, not unrelated departments.
 
-* Vector Search (KNN / Multi-KNN): When an array of objects is marked as nested and contains vector fields, each vector is treated as belonging to its own nested document. Vector similarity is computed only within the same nested object, not across siblings. For example, if `departments.employees` is a nested array where each employee has a `skills_vector`, a KNN search using the embedding of `machine learning engineer` will match only employees whose own `skills_vector` is similar; other employees vectors within the same department or document do not contribute to the score or match. This also means that a vector search query for `K = 3` will return the top 3 most similar employees across all departments and all companies, and may return multiple employees from the same department or company if they rank among the top 3 most similar overall.
+* Vector Search (KNN / Multi-KNN): When a document contains an array of objects with vector/multi-vector fields, the final document score and ranking are identical whether or not the array is marked as `nested`. In both cases, the highest-scoring vector is selected: either directly from the array (non-nested) or from the best-matching nested object with its score bubbled up to the parent document.
 
-* Pre-Filtered Vector Search: When vector search is combined with filters on fields inside a nested array, the filters are applied first to pick which nested items are eligible. The vector search then runs only on those filtered items. For example, if `departments.employees` is a `nested` array, a pre-filtered KNN query for employees with the role `Manager` in the `Sales` department will first narrow the candidate set to only employees who meet those field conditions, and then compute vector similarity on the `skills_vector` of that filtered subset. This ensures that vector search results come only from the employees that satisfy the filter, while still treating each employee as an independent vector candidate.
+* Pre-Filtered Vector Search: When vector search is combined with filters on fields inside a nested array, the filters are applied first to pick which nested items are eligible.
Vector similarity is then computed only on the vector fields of those filtered nested objects. For example, if `departments.employees` is a `nested` array, a pre-filtered KNN query for employees with a `skills_vector` matching `machine learning engineer`, a role of `Manager`, and belonging to the `Sales` department will first narrow the candidate set to only employees who meet the requirement, and then compute vector similarity on the `skills_vector` of that filtered subset. This ensures that vector search results come only from the employees that satisfy the filter, and not from unrelated employees in other departments. ## Indexing diff --git a/document/document.go b/document/document.go index 7efea56da..53033757b 100644 --- a/document/document.go +++ b/document/document.go @@ -18,7 +18,6 @@ import ( "fmt" "reflect" - "github.com/blevesearch/bleve/v2/search" "github.com/blevesearch/bleve/v2/size" index "github.com/blevesearch/bleve_index_api" ) @@ -164,27 +163,6 @@ func (d *Document) AddNestedDocument(doc *Document) { d.NestedDocuments = append(d.NestedDocuments, doc) } -func (d *Document) NestedFields() search.FieldSet { - if len(d.NestedDocuments) == 0 { - return nil - } - fieldSet := search.NewFieldSet() - var collectFields func(index.Document) - collectFields = func(doc index.Document) { - // Add all field names from this nested document - doc.VisitFields(func(field index.Field) { - fieldSet.AddField(field.Name()) - }) - // Recursively collect from this document's nested documents - if nd, ok := doc.(index.NestedDocument); ok { - nd.VisitNestedDocuments(collectFields) - } - } - // Start collection from nested documents only (not root document) - d.VisitNestedDocuments(collectFields) - return fieldSet -} - func (d *Document) VisitNestedDocuments(visitor func(doc index.Document)) { for _, doc := range d.NestedDocuments { visitor(doc) diff --git a/index_impl.go b/index_impl.go index 1f75f190a..a50347a1f 100644 --- a/index_impl.go +++ b/index_impl.go @@ -1380,7 +1380,7 @@ func (i *indexImpl) buildTopNCollector(ctx context.Context, req *SearchRequest, if err != nil { return nil, err } - if nm.IntersectsPrefix(fs) { + if fs.HasID() || nm.IntersectsPrefix(fs) { return newNestedCollector(nr), nil } } diff --git a/mapping/index.go b/mapping/index.go index bafb6ee89..143ff5a31 100644 --- a/mapping/index.go +++ b/mapping/index.go @@ -376,13 +376,7 @@ func (im *IndexMappingImpl) MapDocument(doc *document.Document, data interface{} // see if the _all field was disabled allMapping, _ := docMapping.documentMappingForPath("_all") if allMapping == nil || allMapping.Enabled { - excludedFromAll := walkContext.excludedFromAll - nf := doc.NestedFields() - if nf != nil { - // if the document has any nested fields, exclude them from _all - excludedFromAll = append(excludedFromAll, nf.Slice()...) 
- } - field := document.NewCompositeFieldWithIndexingOptions("_all", true, []string{}, excludedFromAll, index.IndexField|index.IncludeTermVectors) + field := document.NewCompositeFieldWithIndexingOptions("_all", true, []string{}, walkContext.excludedFromAll, index.IndexField|index.IncludeTermVectors) doc.AddField(field) } doc.SetIndexed() diff --git a/search/query/conjunction.go b/search/query/conjunction.go index 6870b1ae2..631956dca 100644 --- a/search/query/conjunction.go +++ b/search/query/conjunction.go @@ -106,6 +106,12 @@ func (q *ConjunctionQuery) Searcher(ctx context.Context, i index.IndexReader, m // if we have common depth == max depth then we can just use // the normal conjunction searcher, as all fields share the same // nested context, otherwise we need to use the nested conjunction searcher + // also, if we are querying the _all or _id fields, we need to use + // the nested conjunction searcher as well, with common depth 0 + // indicating matches happen only at the root level + if qfs.HasAll() || qfs.HasID() { + commonDepth = 0 + } if commonDepth < maxDepth { return searcher.NewNestedConjunctionSearcher(ctx, i, ss, commonDepth, options) } diff --git a/search/query/query.go b/search/query/query.go index 0b3d0a9c4..b3852a1c3 100644 --- a/search/query/query.go +++ b/search/query/query.go @@ -497,6 +497,11 @@ func ExtractFields(q Query, m mapping.IndexMapping, fs search.FieldSet) (search. break } } + case *DocIDQuery, *MatchAllQuery: + if fs == nil { + fs = search.NewFieldSet() + } + fs.AddField("_id") } return fs, err } diff --git a/search/query/query_test.go b/search/query/query_test.go index 60c1fa374..fa020d7d7 100644 --- a/search/query/query_test.go +++ b/search/query/query_test.go @@ -1017,6 +1017,18 @@ func TestExtractFields(t *testing.T) { }`, expFields: []string{"text"}, }, + { + query: `{ + "match_all": {} + }`, + expFields: []string{"_id"}, + }, + { + query: `{ + "ids": ["a", "b", "c"] + }`, + expFields: []string{"_id"}, + }, } m := mapping.NewIndexMapping() diff --git a/search/searcher/search_match_all.go b/search/searcher/search_match_all.go index 57966a924..57d8d0727 100644 --- a/search/searcher/search_match_all.go +++ b/search/searcher/search_match_all.go @@ -36,8 +36,6 @@ type MatchAllSearcher struct { reader index.DocIDReader scorer *scorer.ConstantScorer count uint64 - nested bool - ancestors []index.AncestorID } func NewMatchAllSearcher(ctx context.Context, indexReader index.IndexReader, boost float64, options search.SearcherOptions) (*MatchAllSearcher, error) { @@ -52,15 +50,11 @@ func NewMatchAllSearcher(ctx context.Context, indexReader index.IndexReader, boo } scorer := scorer.NewConstantScorer(1.0, boost, options) - // check if we are in nested mode - nested, _ := ctx.Value(search.NestedSearchKey).(bool) - return &MatchAllSearcher{ indexReader: indexReader, reader: reader, scorer: scorer, count: count, - nested: nested, }, nil } @@ -82,23 +76,6 @@ func (s *MatchAllSearcher) SetQueryNorm(qnorm float64) { s.scorer.SetQueryNorm(qnorm) } -func (s *MatchAllSearcher) isNested(id index.IndexInternalID) bool { - // if not running in nested mode, always return false - if !s.nested { - return false - } - var err error - // check if this doc has ancestors, if so it is nested - if nr, ok := s.reader.(index.NestedReader); ok { - s.ancestors, err = nr.Ancestors(id, s.ancestors[:0]) - if err != nil { - return false - } - return len(s.ancestors) > 1 - } - return false -} - func (s *MatchAllSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) { id, err 
:= s.reader.Next() if err != nil { @@ -109,11 +86,6 @@ func (s *MatchAllSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatc return nil, nil } - if s.isNested(id) { - // if nested then skip and get next - return s.Next(ctx) - } - // score match docMatch := s.scorer.Score(ctx, id) // return doc match @@ -131,11 +103,6 @@ func (s *MatchAllSearcher) Advance(ctx *search.SearchContext, ID index.IndexInte return nil, nil } - if s.isNested(id) { - // if nested then return next - return s.Next(ctx) - } - // score match docMatch := s.scorer.Score(ctx, id) diff --git a/search/util.go b/search/util.go index 0c1ec9ab9..22d2aae90 100644 --- a/search/util.go +++ b/search/util.go @@ -287,11 +287,14 @@ func (fs FieldSet) AddField(field string) { fs[field] = struct{}{} } -// Slice returns the fields in this set as a slice of strings. -func (fs FieldSet) Slice() []string { - rv := make([]string, 0, len(fs)) - for field := range fs { - rv = append(rv, field) - } - return rv +// HasID returns true if the field set contains the "_id" field. +func (fs FieldSet) HasID() bool { + _, ok := fs["_id"] + return ok +} + +// HasAll returns true if the field set contains the "_all" field. +func (fs FieldSet) HasAll() bool { + _, ok := fs["_all"] + return ok } diff --git a/search_knn_test.go b/search_knn_test.go index d87a24ebe..d053705ca 100644 --- a/search_knn_test.go +++ b/search_knn_test.go @@ -26,11 +26,14 @@ import ( "fmt" "math" "math/rand" + "reflect" "sort" "strconv" "sync" "testing" + "time" + "github.com/blevesearch/bleve/v2/analysis/analyzer/keyword" "github.com/blevesearch/bleve/v2/analysis/lang/en" "github.com/blevesearch/bleve/v2/index/scorch" "github.com/blevesearch/bleve/v2/mapping" @@ -2046,3 +2049,614 @@ func TestNumVecsStat(t *testing.T) { } } } + +func TestIndexUpdateVector(t *testing.T) { + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + indexMappingBefore := mapping.NewIndexMapping() + indexMappingBefore.TypeMapping = map[string]*mapping.DocumentMapping{} + indexMappingBefore.DefaultMapping = &mapping.DocumentMapping{ + Enabled: true, + Dynamic: false, + Properties: map[string]*mapping.DocumentMapping{ + "a": { + Enabled: true, + Dynamic: false, + Properties: map[string]*mapping.DocumentMapping{}, + Fields: []*mapping.FieldMapping{ + { + Type: "vector", + Index: true, + Dims: 4, + Similarity: "l2_norm", + VectorIndexOptimizedFor: "latency", + }, + }, + }, + "b": { + Enabled: true, + Dynamic: false, + Properties: map[string]*mapping.DocumentMapping{}, + Fields: []*mapping.FieldMapping{ + { + Type: "vector", + Index: true, + Dims: 4, + Similarity: "l2_norm", + VectorIndexOptimizedFor: "latency", + }, + }, + }, + "c": { + Enabled: true, + Dynamic: false, + Properties: map[string]*mapping.DocumentMapping{}, + Fields: []*mapping.FieldMapping{ + { + Type: "vector_base64", + Index: true, + Dims: 4, + Similarity: "l2_norm", + VectorIndexOptimizedFor: "latency", + }, + }, + }, + "d": { + Enabled: true, + Dynamic: false, + Properties: map[string]*mapping.DocumentMapping{}, + Fields: []*mapping.FieldMapping{ + { + Type: "vector_base64", + Index: true, + Dims: 4, + Similarity: "l2_norm", + VectorIndexOptimizedFor: "latency", + }, + }, + }, + }, + Fields: []*mapping.FieldMapping{}, + } + indexMappingBefore.IndexDynamic = false + indexMappingBefore.StoreDynamic = false + indexMappingBefore.DocValuesDynamic = false + + index, err := New(tmpIndexPath, indexMappingBefore) + if err != nil { + t.Fatal(err) + } + doc1 := map[string]interface{}{"a": 
[]float32{0.32894259691238403, 0.6973215341567993, 0.6835201978683472, 0.38296082615852356}, "b": []float32{0.32894259691238403, 0.6973215341567993, 0.6835201978683472, 0.38296082615852356}, "c": "L5MOPw7NID5SQMU9pHUoPw==", "d": "L5MOPw7NID5SQMU9pHUoPw=="} + doc2 := map[string]interface{}{"a": []float32{0.0018692062003538013, 0.41076546907424927, 0.5675257444381714, 0.45832985639572144}, "b": []float32{0.0018692062003538013, 0.41076546907424927, 0.5675257444381714, 0.45832985639572144}, "c": "czloP94ZCD71ldY+GbAOPw==", "d": "czloP94ZCD71ldY+GbAOPw=="} + doc3 := map[string]interface{}{"a": []float32{0.7853356599807739, 0.6904757618904114, 0.5643226504325867, 0.682637631893158}, "b": []float32{0.7853356599807739, 0.6904757618904114, 0.5643226504325867, 0.682637631893158}, "c": "Chh6P2lOqT47mjg/0odlPg==", "d": "Chh6P2lOqT47mjg/0odlPg=="} + batch := index.NewBatch() + err = batch.Index("001", doc1) + if err != nil { + t.Fatal(err) + } + err = batch.Index("002", doc2) + if err != nil { + t.Fatal(err) + } + err = batch.Index("003", doc3) + if err != nil { + t.Fatal(err) + } + err = index.Batch(batch) + if err != nil { + t.Fatal(err) + } + err = index.Close() + if err != nil { + t.Fatal(err) + } + + indexMappingAfter := mapping.NewIndexMapping() + indexMappingAfter.TypeMapping = map[string]*mapping.DocumentMapping{} + indexMappingAfter.DefaultMapping = &mapping.DocumentMapping{ + Enabled: true, + Dynamic: false, + Properties: map[string]*mapping.DocumentMapping{ + "a": { + Enabled: true, + Dynamic: false, + Properties: map[string]*mapping.DocumentMapping{}, + Fields: []*mapping.FieldMapping{ + { + Type: "vector", + Index: true, + Dims: 4, + Similarity: "l2_norm", + VectorIndexOptimizedFor: "latency", + }, + }, + }, + "c": { + Enabled: true, + Dynamic: false, + Properties: map[string]*mapping.DocumentMapping{}, + Fields: []*mapping.FieldMapping{ + { + Type: "vector_base64", + Index: true, + Dims: 4, + Similarity: "l2_norm", + VectorIndexOptimizedFor: "latency", + }, + }, + }, + "d": { + Enabled: true, + Dynamic: false, + Properties: map[string]*mapping.DocumentMapping{}, + Fields: []*mapping.FieldMapping{ + { + Type: "vector_base64", + Index: false, + Dims: 4, + Similarity: "l2_norm", + VectorIndexOptimizedFor: "latency", + }, + }, + }, + }, + Fields: []*mapping.FieldMapping{}, + } + indexMappingAfter.IndexDynamic = false + indexMappingAfter.StoreDynamic = false + indexMappingAfter.DocValuesDynamic = false + + mappingString, err := json.Marshal(indexMappingAfter) + if err != nil { + t.Fatal(err) + } + config := map[string]interface{}{ + "updated_mapping": string(mappingString), + } + + index, err = OpenUsing(tmpIndexPath, config) + if err != nil { + t.Fatal(err) + } + + q1 := NewSearchRequest(NewMatchNoneQuery()) + q1.AddKNN("a", []float32{1, 2, 3, 4}, 3, 1.0) + res1, err := index.Search(q1) + if err != nil { + t.Fatal(err) + } + if len(res1.Hits) != 3 { + t.Fatalf("Expected 3 hits, got %d", len(res1.Hits)) + } + q2 := NewSearchRequest(NewMatchNoneQuery()) + q2.AddKNN("b", []float32{1, 2, 3, 4}, 3, 1.0) + res2, err := index.Search(q2) + if err != nil { + t.Fatal(err) + } + if len(res2.Hits) != 0 { + t.Fatalf("Expected 0 hits, got %d", len(res2.Hits)) + } + q3 := NewSearchRequest(NewMatchNoneQuery()) + q3.AddKNN("c", []float32{1, 2, 3, 4}, 3, 1.0) + res3, err := index.Search(q3) + if err != nil { + t.Fatal(err) + } + if len(res3.Hits) != 3 { + t.Fatalf("Expected 3 hits, got %d", len(res3.Hits)) + } + q4 := NewSearchRequest(NewMatchNoneQuery()) + q4.AddKNN("d", []float32{1, 2, 3, 4}, 3, 1.0) + res4, 
err := index.Search(q4) + if err != nil { + t.Fatal(err) + } + if len(res4.Hits) != 0 { + t.Fatalf("Expected 0 hits, got %d", len(res4.Hits)) + } +} + +func TestIndexInsightsTermFrequencies(t *testing.T) { + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + mp := mapping.NewIndexMapping() + textMapping := mapping.NewTextFieldMapping() + textMapping.Analyzer = "en" + mp.DefaultMapping.AddFieldMappingsAt("text", textMapping) + + idx, err := New(tmpIndexPath, mp) + if err != nil { + t.Fatal(err) + } + defer func() { + err = idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + data := []map[string]string{ + { + "id": "one", + "text": "She sells sea shells by the sea shore", + }, + { + "id": "two", + "text": "The quick brown fox jumps over the lazy dog", + }, + { + "id": "three", + "text": "She sold sea shells to the person with the dog", + }, + { + "id": "four", + "text": "But there are a lot of dogs on the beach", + }, + { + "id": "five", + "text": "To hell with the foxes", + }, + { + "id": "six", + "text": "What about the dogs", + }, + { + "id": "seven", + "text": "Dogs are OK, foxes are not", + }, + } + + expectTermFreqs := []index.TermFreq{ + {Term: "dog", Frequency: 5}, + {Term: "fox", Frequency: 3}, + {Term: "sea", Frequency: 2}, + {Term: "shell", Frequency: 2}, + {Term: "beach", Frequency: 1}, + } + + for _, d := range data { + err = idx.Index(d["id"], d) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + } + + insightsIdx, ok := idx.(InsightsIndex) + if !ok { + t.Fatal("index does not support insights") + } + + termFreqs, err := insightsIdx.TermFrequencies("text", 5, true) + if err != nil { + t.Fatal(err) + } + + if !reflect.DeepEqual(termFreqs, expectTermFreqs) { + t.Fatalf("term freqs do not match: got: %v, expected: %v", termFreqs, expectTermFreqs) + } +} + +func TestIndexInsightsCentroidCardinalities(t *testing.T) { + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + vectorDims := 5 + + mp := mapping.NewIndexMapping() + vecFieldMapping := mapping.NewVectorFieldMapping() + vecFieldMapping.Dims = vectorDims + vecFieldMapping.Similarity = index.CosineSimilarity + mp.DefaultMapping.AddFieldMappingsAt("vec", vecFieldMapping) + + idx, err := New(tmpIndexPath, mp) + if err != nil { + t.Fatal(err) + } + defer func() { + err = idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + rand.Seed(time.Now().UnixNano()) + min, max := float32(-10.0), float32(10.0) + genRandomVector := func() []float32 { + vec := make([]float32, vectorDims) + for i := range vec { + vec[i] = min + rand.Float32()*(max-min) + } + return vec + } + + batch := idx.NewBatch() + for i := 1; i <= 50000; i++ { + if err = batch.Index(fmt.Sprintf("doc-%d", i), map[string]interface{}{ + "vec": genRandomVector(), + }); err != nil { + t.Fatalf("error indexing doc: %v", err) + } + + if i%200 == 0 { + err = idx.Batch(batch) + if err != nil { + t.Fatalf("Error adding batch to index: %v", err) + } + batch = idx.NewBatch() + } + } + + if batch.Size() > 0 { + // In case doc count is not a multiple of 200, we need to add the final batch + err = idx.Batch(batch) + if err != nil { + t.Errorf("Error adding final batch to index: %v", err) + } + } + + insightsIdx, ok := idx.(InsightsIndex) + if !ok { + t.Fatal("index does not support insights") + } + + centroids, err := insightsIdx.CentroidCardinalities("vec", 5, true) + if err != nil { + t.Fatal(err) + } + + if len(centroids) != 5 { + t.Fatalf("expected 5 centroids, got %d", len(centroids)) + 
} + + for _, entry := range centroids { + if len(entry.Index) == 0 { + t.Fatal("expected index name for each centroid") + } + } +} + +func TestHierarchicalNestedVectorSearch(t *testing.T) { + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + dataset := ` + [ + { + "id": "doc1", + "items": [ + { + "description": "I like trains", + "embedding_vector": [ + 1, + 0, + 0 + ], + "type": "transport" + }, + { + "description": "I love pizza", + "embedding_vector": [ + 0, + 1, + 0 + ], + "type": "food" + } + ] + }, + { + "id": "doc2", + "items": [ + { + "description": "I go to school by bus", + "embedding_vector": [ + 0.9, + 0.1, + 0 + ], + "type": "transport" + }, + { + "description": "Sushi is delicious", + "embedding_vector": [ + 0, + 1, + 0 + ], + "type": "food" + } + ] + }, + { + "id": "doc3", + "items": [ + { + "description": "Hamburgers are tasty", + "embedding_vector": [ + 0, + 0.8, + 0.2 + ], + "type": "food" + }, + { + "description": "I enjoy biking", + "embedding_vector": [ + 0.7, + 0, + 0.3 + ], + "type": "transport" + } + ] + } + ]` + var documents []map[string]interface{} + err := json.Unmarshal([]byte(dataset), &documents) + if err != nil { + t.Fatalf("failed to unmarshal dataset: %v", err) + } + indexMapping := NewIndexMapping() + vecFieldMapping := mapping.NewVectorFieldMapping() + vecFieldMapping.Dims = 3 + vecFieldMapping.Similarity = index.CosineSimilarity + + typeMapping := mapping.NewTextFieldMapping() + typeMapping.Analyzer = keyword.Name + + descMapping := mapping.NewTextFieldMapping() + descMapping.Analyzer = en.AnalyzerName + + // items is NOT nested + itemsMapping := mapping.NewDocumentMapping() + itemsMapping.AddFieldMappingsAt("embedding_vector", vecFieldMapping) + itemsMapping.AddFieldMappingsAt("type", typeMapping) + itemsMapping.AddFieldMappingsAt("description", descMapping) + + indexMapping.DefaultMapping.AddSubDocumentMapping("items", itemsMapping) + idx, err := New(tmpIndexPath, indexMapping) + if err != nil { + t.Fatalf("failed to create index: %v", err) + } + defer func() { + if err := idx.Close(); err != nil { + t.Fatalf("failed to close index: %v", err) + } + }() + + batch := idx.NewBatch() + for _, doc := range documents { + err := batch.Index(doc["id"].(string), doc) + if err != nil { + t.Fatalf("failed to index document %s: %v", doc["id"], err) + } + } + err = idx.Batch(batch) + if err != nil { + t.Fatalf("failed to batch index documents: %v", err) + } + + // Plain vector search + searchReq := NewSearchRequest(query.NewMatchNoneQuery()) + searchReq.AddKNN("items.embedding_vector", []float32{0, 1, 0}, 5, 1.0) + searchReq.SortBy([]string{"-_score", "_id"}) + + res, err := idx.Search(searchReq) + if err != nil { + t.Fatalf("failed to execute search: %v", err) + } + + expectedOrder := []string{"doc1", "doc2", "doc3"} + expectedScores := []float64{1.0, 1.0, 0.970} + if len(res.Hits) != len(expectedOrder) { + t.Fatalf("expected %d hits, got %d", len(expectedOrder), len(res.Hits)) + } + for i, expectedID := range expectedOrder { + if res.Hits[i].ID != expectedID { + t.Fatalf("at rank %d, expected docID %s, got %s", i+1, expectedID, res.Hits[i].ID) + } + if math.Abs(res.Hits[i].Score-expectedScores[i]) > 0.01 { + t.Fatalf("at rank %d, expected score %.3f, got %.3f", i+1, expectedScores[i], res.Hits[i].Score) + } + } + + // Filtered vector search - should match output of plain vector search in non-nested case + filterQuery := NewTermQuery("transport") + filterQuery.SetField("items.type") + searchReq = 
NewSearchRequest(query.NewMatchNoneQuery()) + searchReq.AddKNNWithFilter("items.embedding_vector", []float32{0, 1, 0}, 5, 1.0, filterQuery) + searchReq.SortBy([]string{"-_score", "_id"}) + res, err = idx.Search(searchReq) + if err != nil { + t.Fatalf("failed to execute filtered search: %v", err) + } + if len(res.Hits) != len(expectedOrder) { + t.Fatalf("expected %d hits, got %d", len(expectedOrder), len(res.Hits)) + } + for i, expectedID := range expectedOrder { + if res.Hits[i].ID != expectedID { + t.Fatalf("at rank %d, expected docID %s, got %s", i+1, expectedID, res.Hits[i].ID) + } + if math.Abs(res.Hits[i].Score-expectedScores[i]) > 0.01 { + t.Fatalf("at rank %d, expected score %.3f, got %.3f", i+1, expectedScores[i], res.Hits[i].Score) + } + } + + // items IS nested + nestedItemsMapping := mapping.NewNestedDocumentMapping() + nestedItemsMapping.AddFieldMappingsAt("embedding_vector", vecFieldMapping) + nestedItemsMapping.AddFieldMappingsAt("type", typeMapping) + nestedItemsMapping.AddFieldMappingsAt("description", descMapping) + + indexMappingNested := NewIndexMapping() + indexMappingNested.DefaultMapping.AddSubDocumentMapping("items", nestedItemsMapping) + idxNested, err := New(tmpIndexPath+"_nested", indexMappingNested) + if err != nil { + t.Fatalf("failed to create nested index: %v", err) + } + defer func() { + if err := idxNested.Close(); err != nil { + t.Fatalf("failed to close nested index: %v", err) + } + }() + + batch = idxNested.NewBatch() + for _, doc := range documents { + err := batch.Index(doc["id"].(string), doc) + if err != nil { + t.Fatalf("failed to index document %s in nested index: %v", doc["id"], err) + } + } + err = idxNested.Batch(batch) + if err != nil { + t.Fatalf("failed to batch index documents in nested index: %v", err) + } + // Plain vector search on nested index + searchReq = NewSearchRequest(query.NewMatchNoneQuery()) + searchReq.AddKNN("items.embedding_vector", []float32{0, 1, 0}, 5, 1.0) + searchReq.SortBy([]string{"-_score", "_id"}) + + res, err = idxNested.Search(searchReq) + if err != nil { + t.Fatalf("failed to execute search on nested index: %v", err) + } + // Exact same behavior as non-nested in this case + if len(res.Hits) != len(expectedOrder) { + t.Fatalf("expected %d hits, got %d", len(expectedOrder), len(res.Hits)) + } + for i, expectedID := range expectedOrder { + if res.Hits[i].ID != expectedID { + t.Fatalf("at rank %d, expected docID %s, got %s", i+1, expectedID, res.Hits[i].ID) + } + if math.Abs(res.Hits[i].Score-expectedScores[i]) > 0.01 { + t.Fatalf("at rank %d, expected score %.3f, got %.3f", i+1, expectedScores[i], res.Hits[i].Score) + } + } + + // Filtered vector search on nested index - should NOT match output of plain vector search in nested case + filterQuery = NewTermQuery("transport") + filterQuery.SetField("items.type") + searchReq = NewSearchRequest(query.NewMatchNoneQuery()) + searchReq.AddKNNWithFilter("items.embedding_vector", []float32{0, 1, 0}, 5, 1.0, filterQuery) + searchReq.SortBy([]string{"-_score", "_id"}) + res, err = idxNested.Search(searchReq) + if err != nil { + t.Fatalf("failed to execute filtered search on nested index: %v", err) + } + expectedNestedOrder := []string{"doc2", "doc1", "doc3"} + expectedNestedScores := []float64{0.110, 0, 0} + if len(res.Hits) != len(expectedNestedOrder) { + t.Fatalf("expected %d hits, got %d", len(expectedNestedOrder), len(res.Hits)) + } + for i, expectedID := range expectedNestedOrder { + if res.Hits[i].ID != expectedID { + t.Fatalf("at rank %d, expected docID %s, got %s", i+1, 
expectedID, res.Hits[i].ID) + } + if math.Abs(res.Hits[i].Score-expectedNestedScores[i]) > 0.01 { + t.Fatalf("at rank %d, expected score %.3f, got %.3f", i+1, expectedNestedScores[i], res.Hits[i].Score) + } + } +} diff --git a/search_nested_test.go b/search_nested_test.go index f22fd5bc0..3c55c839e 100644 --- a/search_nested_test.go +++ b/search_nested_test.go @@ -806,6 +806,176 @@ func TestNestedConjunctionQuery(t *testing.T) { if res.Hits[0].ID != "doc2" { t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) } + + // Test 9: Match_All query must return only top-level documents + matchAllQuery := query.NewMatchAllQuery() + req = buildReq([]query.Query{matchAllQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 3 { + t.Fatalf("expected 3 hits, got %d", len(res.Hits)) + } + + // Test 10: DocID query must return only top-level documents + docIDQuery := query.NewDocIDQuery([]string{"doc1", "doc2", "doc3", "doc2_$company.locations_$0", "doc3_$company.departments_$0_$company.departments.employees_$0"}) + req = buildReq([]query.Query{docIDQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 3 { + t.Fatalf("expected 3 hits, got %d", len(res.Hits)) + } + + // Test 11: Boolean query in Filter-only mode must return correct top-level documents + empNameQuery = query.NewMatchQuery("Frank") + empNameQuery.SetField("company.departments.employees.name") + + boolQuery := query.NewBooleanQuery(nil, nil, nil) + boolQuery.AddFilter(empNameQuery) + + req = buildReq([]query.Query{boolQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 2 { + t.Fatalf("expected 2 hits, got %d", len(res.Hits)) + } + if res.Hits[0].ID != "doc2" || res.Hits[1].ID != "doc3" { + t.Fatalf("unexpected hit IDs: %v, %v", res.Hits[0].ID, res.Hits[1].ID) + } + + // Test 12: Boolean query Must clause should work in nested context + empNameQuery = query.NewMatchQuery("Ivan") + empNameQuery.SetField("company.departments.employees.name") + + empRoleQuery = query.NewMatchQuery("Manager") + empRoleQuery.SetField("company.departments.employees.role") + + empQuery = query.NewConjunctionQuery([]query.Query{empNameQuery, empRoleQuery}) + + countryQuery = query.NewMatchQuery("Canada") + countryQuery.SetField("company.locations.country") + + cityQuery = query.NewMatchQuery("London") + cityQuery.SetField("company.locations.city") + + locQuery = query.NewConjunctionQuery([]query.Query{countryQuery, cityQuery}) + + boolQuery = query.NewBooleanQuery([]query.Query{empQuery, locQuery}, nil, nil) + req = buildReq([]query.Query{boolQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected 1 hit, got %d", len(res.Hits)) + } + if res.Hits[0].ID != "doc3" { + t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) + } + + // Test 13: Queries targetting _all field should: + // - match only top-level fields when no specific field is set + // - not match nested fields when no specific field is set + // - work correctly when combined with nested field queries, + // returning only top-level documents where both conditions are met + allRootFieldQuery := query.NewMatchQuery("TechCorp") + + req = buildReq([]query.Query{allRootFieldQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected 1 hit, got %d", len(res.Hits)) 
+ } + if res.Hits[0].ID != "doc1" { + t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) + } + + allNestedFieldQuery := query.NewMatchQuery("Alice") + req = buildReq([]query.Query{allNestedFieldQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 0 { + t.Fatalf("expected 0 hits, got %d", len(res.Hits)) + } + + allMixedQuery := buildReq([]query.Query{allRootFieldQuery, allNestedFieldQuery}) + res, err = idx.Search(allMixedQuery) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 0 { + t.Fatalf("expected 0 hits, got %d", len(res.Hits)) + } + + nestedFieldQuery := query.NewMatchQuery("Alice") + nestedFieldQuery.SetField("company.departments.employees.name") + + allMixedQueryWithNested := buildReq([]query.Query{allRootFieldQuery, nestedFieldQuery}) + res, err = idx.Search(allMixedQueryWithNested) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected 1 hit, got %d", len(res.Hits)) + } + if res.Hits[0].ID != "doc1" { + t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) + } + + empNameQuery = query.NewMatchQuery("Frank") + empNameQuery.SetField("company.departments.employees.name") + + empRoleQuery = query.NewMatchQuery("Manager") + empRoleQuery.SetField("company.departments.employees.role") + + empQuery = query.NewConjunctionQuery([]query.Query{empNameQuery, empRoleQuery}) + + deptNameQuery = query.NewMatchQuery("Engineering") + deptNameQuery.SetField("company.departments.name") + + deptQuery = query.NewConjunctionQuery([]query.Query{empQuery, deptNameQuery}) + + countryQuery = query.NewMatchQuery("UK") + countryQuery.SetField("company.locations.country") + + cityQuery = query.NewMatchQuery("London") + cityQuery.SetField("company.locations.city") + + locQuery = query.NewConjunctionQuery([]query.Query{countryQuery, cityQuery}) + + // mixed queries with _all field and _id field should match at root level always + companyNameAllQuery := query.NewMatchQuery("BizInc") + matchAllQuery = query.NewMatchAllQuery() + req = buildReq([]query.Query{deptQuery, locQuery, companyNameAllQuery, matchAllQuery}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected 1 hit, got %d", len(res.Hits)) + } + if res.Hits[0].ID != "doc2" { + t.Fatalf("unexpected hit ID: %v", res.Hits[0].ID) + } + + companyNameAllQueryNoMatch := query.NewMatchQuery("WebSolutions") + req = buildReq([]query.Query{deptQuery, locQuery, companyNameAllQueryNoMatch}) + res, err = idx.Search(req) + if err != nil { + t.Fatalf("search failed: %v", err) + } + if len(res.Hits) != 0 { + t.Fatalf("expected 0 hits, got %d", len(res.Hits)) + } } func TestNestedArrayConjunctionQuery(t *testing.T) { From 27507f2343eec3c854e2e9d7c521018c2c0f71f1 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Tue, 20 Jan 2026 19:08:32 +0530 Subject: [PATCH 4/9] fix merge conflicts --- index/scorch/snapshot_index.go | 88 ------------------ search_knn_test.go | 162 --------------------------------- search_nested_test.go | 21 +---- 3 files changed, 1 insertion(+), 270 deletions(-) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 3c5170bbc..78af89778 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -1143,94 +1143,6 @@ func (is *IndexSnapshot) UpdateSynonymSearchCount(delta uint64) { atomic.AddUint64(&is.parent.stats.TotSynonymSearches, delta) } -// Update current snapshot updated field 
data as well as pass it on to all segments and segment bases -func (is *IndexSnapshot) UpdateFieldsInfo(updatedFields map[string]*index.UpdateFieldInfo) { - is.m.Lock() - defer is.m.Unlock() - - is.MergeUpdateFieldsInfo(updatedFields) - - for _, segmentSnapshot := range is.segment { - segmentSnapshot.UpdateFieldsInfo(is.updatedFields) - } -} - -// Merge given updated field information with existing updated field information -func (is *IndexSnapshot) MergeUpdateFieldsInfo(updatedFields map[string]*index.UpdateFieldInfo) { - if is.updatedFields == nil { - is.updatedFields = updatedFields - } else { - for fieldName, info := range updatedFields { - if val, ok := is.updatedFields[fieldName]; ok { - val.Deleted = val.Deleted || info.Deleted - val.Index = val.Index || info.Index - val.DocValues = val.DocValues || info.DocValues - val.Store = val.Store || info.Store - } else { - is.updatedFields[fieldName] = info - } - } - } -} - -// TermFrequencies returns the top N terms ordered by the frequencies -// for a given field across all segments in the index snapshot. -func (is *IndexSnapshot) TermFrequencies(field string, limit int, descending bool) ( - termFreqs []index.TermFreq, err error) { - if len(is.segment) == 0 { - return nil, nil - } - - if limit <= 0 { - return nil, fmt.Errorf("limit must be positive") - } - - // Use FieldDict which aggregates term frequencies across all segments - fieldDict, err := is.FieldDict(field) - if err != nil { - return nil, fmt.Errorf("failed to get field dictionary for field %s: %v", field, err) - } - defer fieldDict.Close() - - // Preallocate slice with capacity equal to the number of unique terms - // in the field dictionary - termFreqs = make([]index.TermFreq, 0, fieldDict.Cardinality()) - - // Iterate through all terms using FieldDict - for { - dictEntry, err := fieldDict.Next() - if err != nil { - return nil, fmt.Errorf("error iterating field dictionary: %v", err) - } - if dictEntry == nil { - break // End of terms - } - - termFreqs = append(termFreqs, index.TermFreq{ - Term: dictEntry.Term, - Frequency: dictEntry.Count, - }) - } - - // Sort by frequency (descending or ascending) - sort.Slice(termFreqs, func(i, j int) bool { - if termFreqs[i].Frequency == termFreqs[j].Frequency { - // If frequencies are equal, sort by term lexicographically - return termFreqs[i].Term < termFreqs[j].Term - } - if descending { - return termFreqs[i].Frequency > termFreqs[j].Frequency - } - return termFreqs[i].Frequency < termFreqs[j].Frequency - }) - - if limit >= len(termFreqs) { - return termFreqs, nil - } - - return termFreqs[:limit], nil -} - // Ancestors returns the ancestor IDs for the given document ID. The prealloc // slice can be provided to avoid allocations downstream, and MUST be empty. 
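// A typical reuse pattern, sketched: keep the returned slice around and
// pass it back truncated to zero length on the next call, which satisfies
// the empty-prealloc contract while reusing its backing array, e.g.
//
//	anc, err = is.Ancestors(docID, anc[:0])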
func (i *IndexSnapshot) Ancestors(ID index.IndexInternalID, prealloc []index.AncestorID) ([]index.AncestorID, error) { diff --git a/search_knn_test.go b/search_knn_test.go index d053705ca..d13c43cf8 100644 --- a/search_knn_test.go +++ b/search_knn_test.go @@ -26,12 +26,10 @@ import ( "fmt" "math" "math/rand" - "reflect" "sort" "strconv" "sync" "testing" - "time" "github.com/blevesearch/bleve/v2/analysis/analyzer/keyword" "github.com/blevesearch/bleve/v2/analysis/lang/en" @@ -2258,166 +2256,6 @@ func TestIndexUpdateVector(t *testing.T) { } } -func TestIndexInsightsTermFrequencies(t *testing.T) { - tmpIndexPath := createTmpIndexPath(t) - defer cleanupTmpIndexPath(t, tmpIndexPath) - - mp := mapping.NewIndexMapping() - textMapping := mapping.NewTextFieldMapping() - textMapping.Analyzer = "en" - mp.DefaultMapping.AddFieldMappingsAt("text", textMapping) - - idx, err := New(tmpIndexPath, mp) - if err != nil { - t.Fatal(err) - } - defer func() { - err = idx.Close() - if err != nil { - t.Fatal(err) - } - }() - - data := []map[string]string{ - { - "id": "one", - "text": "She sells sea shells by the sea shore", - }, - { - "id": "two", - "text": "The quick brown fox jumps over the lazy dog", - }, - { - "id": "three", - "text": "She sold sea shells to the person with the dog", - }, - { - "id": "four", - "text": "But there are a lot of dogs on the beach", - }, - { - "id": "five", - "text": "To hell with the foxes", - }, - { - "id": "six", - "text": "What about the dogs", - }, - { - "id": "seven", - "text": "Dogs are OK, foxes are not", - }, - } - - expectTermFreqs := []index.TermFreq{ - {Term: "dog", Frequency: 5}, - {Term: "fox", Frequency: 3}, - {Term: "sea", Frequency: 2}, - {Term: "shell", Frequency: 2}, - {Term: "beach", Frequency: 1}, - } - - for _, d := range data { - err = idx.Index(d["id"], d) - if err != nil { - t.Errorf("Error updating index: %v", err) - } - } - - insightsIdx, ok := idx.(InsightsIndex) - if !ok { - t.Fatal("index does not support insights") - } - - termFreqs, err := insightsIdx.TermFrequencies("text", 5, true) - if err != nil { - t.Fatal(err) - } - - if !reflect.DeepEqual(termFreqs, expectTermFreqs) { - t.Fatalf("term freqs do not match: got: %v, expected: %v", termFreqs, expectTermFreqs) - } -} - -func TestIndexInsightsCentroidCardinalities(t *testing.T) { - tmpIndexPath := createTmpIndexPath(t) - defer cleanupTmpIndexPath(t, tmpIndexPath) - - vectorDims := 5 - - mp := mapping.NewIndexMapping() - vecFieldMapping := mapping.NewVectorFieldMapping() - vecFieldMapping.Dims = vectorDims - vecFieldMapping.Similarity = index.CosineSimilarity - mp.DefaultMapping.AddFieldMappingsAt("vec", vecFieldMapping) - - idx, err := New(tmpIndexPath, mp) - if err != nil { - t.Fatal(err) - } - defer func() { - err = idx.Close() - if err != nil { - t.Fatal(err) - } - }() - - rand.Seed(time.Now().UnixNano()) - min, max := float32(-10.0), float32(10.0) - genRandomVector := func() []float32 { - vec := make([]float32, vectorDims) - for i := range vec { - vec[i] = min + rand.Float32()*(max-min) - } - return vec - } - - batch := idx.NewBatch() - for i := 1; i <= 50000; i++ { - if err = batch.Index(fmt.Sprintf("doc-%d", i), map[string]interface{}{ - "vec": genRandomVector(), - }); err != nil { - t.Fatalf("error indexing doc: %v", err) - } - - if i%200 == 0 { - err = idx.Batch(batch) - if err != nil { - t.Fatalf("Error adding batch to index: %v", err) - } - batch = idx.NewBatch() - } - } - - if batch.Size() > 0 { - // In case doc count is not a multiple of 200, we need to add the final batch - err = 
idx.Batch(batch) - if err != nil { - t.Errorf("Error adding final batch to index: %v", err) - } - } - - insightsIdx, ok := idx.(InsightsIndex) - if !ok { - t.Fatal("index does not support insights") - } - - centroids, err := insightsIdx.CentroidCardinalities("vec", 5, true) - if err != nil { - t.Fatal(err) - } - - if len(centroids) != 5 { - t.Fatalf("expected 5 centroids, got %d", len(centroids)) - } - - for _, entry := range centroids { - if len(entry.Index) == 0 { - t.Fatal("expected index name for each centroid") - } - } -} - func TestHierarchicalNestedVectorSearch(t *testing.T) { tmpIndexPath := createTmpIndexPath(t) defer cleanupTmpIndexPath(t, tmpIndexPath) diff --git a/search_nested_test.go b/search_nested_test.go index 3c55c839e..1c61a573f 100644 --- a/search_nested_test.go +++ b/search_nested_test.go @@ -829,25 +829,6 @@ func TestNestedConjunctionQuery(t *testing.T) { t.Fatalf("expected 3 hits, got %d", len(res.Hits)) } - // Test 11: Boolean query in Filter-only mode must return correct top-level documents - empNameQuery = query.NewMatchQuery("Frank") - empNameQuery.SetField("company.departments.employees.name") - - boolQuery := query.NewBooleanQuery(nil, nil, nil) - boolQuery.AddFilter(empNameQuery) - - req = buildReq([]query.Query{boolQuery}) - res, err = idx.Search(req) - if err != nil { - t.Fatalf("search failed: %v", err) - } - if len(res.Hits) != 2 { - t.Fatalf("expected 2 hits, got %d", len(res.Hits)) - } - if res.Hits[0].ID != "doc2" || res.Hits[1].ID != "doc3" { - t.Fatalf("unexpected hit IDs: %v, %v", res.Hits[0].ID, res.Hits[1].ID) - } - // Test 12: Boolean query Must clause should work in nested context empNameQuery = query.NewMatchQuery("Ivan") empNameQuery.SetField("company.departments.employees.name") @@ -865,7 +846,7 @@ func TestNestedConjunctionQuery(t *testing.T) { locQuery = query.NewConjunctionQuery([]query.Query{countryQuery, cityQuery}) - boolQuery = query.NewBooleanQuery([]query.Query{empQuery, locQuery}, nil, nil) + boolQuery := query.NewBooleanQuery([]query.Query{empQuery, locQuery}, nil, nil) req = buildReq([]query.Query{boolQuery}) res, err = idx.Search(req) if err != nil { From acf1d6d49464434ae2a6761bc1fb92172a69e6a4 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Tue, 20 Jan 2026 19:14:34 +0530 Subject: [PATCH 5/9] fix conflict again --- search_knn_test.go | 208 --------------------------------------------- 1 file changed, 208 deletions(-) diff --git a/search_knn_test.go b/search_knn_test.go index d13c43cf8..a3d0c08c0 100644 --- a/search_knn_test.go +++ b/search_knn_test.go @@ -2048,214 +2048,6 @@ func TestNumVecsStat(t *testing.T) { } } -func TestIndexUpdateVector(t *testing.T) { - tmpIndexPath := createTmpIndexPath(t) - defer cleanupTmpIndexPath(t, tmpIndexPath) - - indexMappingBefore := mapping.NewIndexMapping() - indexMappingBefore.TypeMapping = map[string]*mapping.DocumentMapping{} - indexMappingBefore.DefaultMapping = &mapping.DocumentMapping{ - Enabled: true, - Dynamic: false, - Properties: map[string]*mapping.DocumentMapping{ - "a": { - Enabled: true, - Dynamic: false, - Properties: map[string]*mapping.DocumentMapping{}, - Fields: []*mapping.FieldMapping{ - { - Type: "vector", - Index: true, - Dims: 4, - Similarity: "l2_norm", - VectorIndexOptimizedFor: "latency", - }, - }, - }, - "b": { - Enabled: true, - Dynamic: false, - Properties: map[string]*mapping.DocumentMapping{}, - Fields: []*mapping.FieldMapping{ - { - Type: "vector", - Index: true, - Dims: 4, - Similarity: "l2_norm", - VectorIndexOptimizedFor: "latency", - }, - }, - }, - "c": { 
- Enabled: true, - Dynamic: false, - Properties: map[string]*mapping.DocumentMapping{}, - Fields: []*mapping.FieldMapping{ - { - Type: "vector_base64", - Index: true, - Dims: 4, - Similarity: "l2_norm", - VectorIndexOptimizedFor: "latency", - }, - }, - }, - "d": { - Enabled: true, - Dynamic: false, - Properties: map[string]*mapping.DocumentMapping{}, - Fields: []*mapping.FieldMapping{ - { - Type: "vector_base64", - Index: true, - Dims: 4, - Similarity: "l2_norm", - VectorIndexOptimizedFor: "latency", - }, - }, - }, - }, - Fields: []*mapping.FieldMapping{}, - } - indexMappingBefore.IndexDynamic = false - indexMappingBefore.StoreDynamic = false - indexMappingBefore.DocValuesDynamic = false - - index, err := New(tmpIndexPath, indexMappingBefore) - if err != nil { - t.Fatal(err) - } - doc1 := map[string]interface{}{"a": []float32{0.32894259691238403, 0.6973215341567993, 0.6835201978683472, 0.38296082615852356}, "b": []float32{0.32894259691238403, 0.6973215341567993, 0.6835201978683472, 0.38296082615852356}, "c": "L5MOPw7NID5SQMU9pHUoPw==", "d": "L5MOPw7NID5SQMU9pHUoPw=="} - doc2 := map[string]interface{}{"a": []float32{0.0018692062003538013, 0.41076546907424927, 0.5675257444381714, 0.45832985639572144}, "b": []float32{0.0018692062003538013, 0.41076546907424927, 0.5675257444381714, 0.45832985639572144}, "c": "czloP94ZCD71ldY+GbAOPw==", "d": "czloP94ZCD71ldY+GbAOPw=="} - doc3 := map[string]interface{}{"a": []float32{0.7853356599807739, 0.6904757618904114, 0.5643226504325867, 0.682637631893158}, "b": []float32{0.7853356599807739, 0.6904757618904114, 0.5643226504325867, 0.682637631893158}, "c": "Chh6P2lOqT47mjg/0odlPg==", "d": "Chh6P2lOqT47mjg/0odlPg=="} - batch := index.NewBatch() - err = batch.Index("001", doc1) - if err != nil { - t.Fatal(err) - } - err = batch.Index("002", doc2) - if err != nil { - t.Fatal(err) - } - err = batch.Index("003", doc3) - if err != nil { - t.Fatal(err) - } - err = index.Batch(batch) - if err != nil { - t.Fatal(err) - } - err = index.Close() - if err != nil { - t.Fatal(err) - } - - indexMappingAfter := mapping.NewIndexMapping() - indexMappingAfter.TypeMapping = map[string]*mapping.DocumentMapping{} - indexMappingAfter.DefaultMapping = &mapping.DocumentMapping{ - Enabled: true, - Dynamic: false, - Properties: map[string]*mapping.DocumentMapping{ - "a": { - Enabled: true, - Dynamic: false, - Properties: map[string]*mapping.DocumentMapping{}, - Fields: []*mapping.FieldMapping{ - { - Type: "vector", - Index: true, - Dims: 4, - Similarity: "l2_norm", - VectorIndexOptimizedFor: "latency", - }, - }, - }, - "c": { - Enabled: true, - Dynamic: false, - Properties: map[string]*mapping.DocumentMapping{}, - Fields: []*mapping.FieldMapping{ - { - Type: "vector_base64", - Index: true, - Dims: 4, - Similarity: "l2_norm", - VectorIndexOptimizedFor: "latency", - }, - }, - }, - "d": { - Enabled: true, - Dynamic: false, - Properties: map[string]*mapping.DocumentMapping{}, - Fields: []*mapping.FieldMapping{ - { - Type: "vector_base64", - Index: false, - Dims: 4, - Similarity: "l2_norm", - VectorIndexOptimizedFor: "latency", - }, - }, - }, - }, - Fields: []*mapping.FieldMapping{}, - } - indexMappingAfter.IndexDynamic = false - indexMappingAfter.StoreDynamic = false - indexMappingAfter.DocValuesDynamic = false - - mappingString, err := json.Marshal(indexMappingAfter) - if err != nil { - t.Fatal(err) - } - config := map[string]interface{}{ - "updated_mapping": string(mappingString), - } - - index, err = OpenUsing(tmpIndexPath, config) - if err != nil { - t.Fatal(err) - } - - q1 := 
NewSearchRequest(NewMatchNoneQuery()) - q1.AddKNN("a", []float32{1, 2, 3, 4}, 3, 1.0) - res1, err := index.Search(q1) - if err != nil { - t.Fatal(err) - } - if len(res1.Hits) != 3 { - t.Fatalf("Expected 3 hits, got %d", len(res1.Hits)) - } - q2 := NewSearchRequest(NewMatchNoneQuery()) - q2.AddKNN("b", []float32{1, 2, 3, 4}, 3, 1.0) - res2, err := index.Search(q2) - if err != nil { - t.Fatal(err) - } - if len(res2.Hits) != 0 { - t.Fatalf("Expected 0 hits, got %d", len(res2.Hits)) - } - q3 := NewSearchRequest(NewMatchNoneQuery()) - q3.AddKNN("c", []float32{1, 2, 3, 4}, 3, 1.0) - res3, err := index.Search(q3) - if err != nil { - t.Fatal(err) - } - if len(res3.Hits) != 3 { - t.Fatalf("Expected 3 hits, got %d", len(res3.Hits)) - } - q4 := NewSearchRequest(NewMatchNoneQuery()) - q4.AddKNN("d", []float32{1, 2, 3, 4}, 3, 1.0) - res4, err := index.Search(q4) - if err != nil { - t.Fatal(err) - } - if len(res4.Hits) != 0 { - t.Fatalf("Expected 0 hits, got %d", len(res4.Hits)) - } -} - func TestHierarchicalNestedVectorSearch(t *testing.T) { tmpIndexPath := createTmpIndexPath(t) defer cleanupTmpIndexPath(t, tmpIndexPath) From 1f88ce0547b65d1681bbaa4b9583bcc4909a48f3 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Tue, 20 Jan 2026 19:37:55 +0530 Subject: [PATCH 6/9] fix conflict --- search/search.go | 6 ------ 1 file changed, 6 deletions(-) diff --git a/search/search.go b/search/search.go index f03ed13e0..f47fe063d 100644 --- a/search/search.go +++ b/search/search.go @@ -238,10 +238,6 @@ func (dm *DocumentMatch) Reset() *DocumentMatch { for i := range ftls { // recycle the ArrayPositions of each location ftls[i].Location.ArrayPositions = ftls[i].Location.ArrayPositions[:0] } - // remember the score breakdown map - scoreBreakdown := dm.ScoreBreakdown - // clear out the score breakdown map - clear(scoreBreakdown) // remember the Descendants backing array descendants := dm.Descendants for i := range descendants { // recycle each IndexInternalID @@ -258,8 +254,6 @@ func (dm *DocumentMatch) Reset() *DocumentMatch { dm.FieldTermLocations = ftls[:0] // reuse the Descendants already allocated (and reset len to 0) dm.Descendants = descendants[:0] - // reuse the score breakdown map already allocated (after clearing it) - dm.ScoreBreakdown = scoreBreakdown return dm } From bd159fb2c12d7b8d70df69e1a66969de72322144 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Tue, 20 Jan 2026 19:38:51 +0530 Subject: [PATCH 7/9] fix 2 --- search/search.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/search/search.go b/search/search.go index f47fe063d..a5a75f345 100644 --- a/search/search.go +++ b/search/search.go @@ -233,6 +233,8 @@ func (dm *DocumentMatch) Reset() *DocumentMatch { indexInternalID := dm.IndexInternalID // remember the []interface{} used for sort sort := dm.Sort + // remember the []string used for decoded sort + decodedSort := dm.DecodedSort // remember the FieldTermLocations backing array ftls := dm.FieldTermLocations for i := range ftls { // recycle the ArrayPositions of each location @@ -249,7 +251,8 @@ func (dm *DocumentMatch) Reset() *DocumentMatch { dm.IndexInternalID = indexInternalID[:0] // reuse the []interface{} already allocated (and reset len to 0) dm.Sort = sort[:0] - dm.DecodedSort = dm.DecodedSort[:0] + // reuse the []string already allocated (and reset len to 0) + dm.DecodedSort = decodedSort[:0] // reuse the FieldTermLocations already allocated (and reset len to 0) dm.FieldTermLocations = ftls[:0] // reuse the Descendants already allocated (and reset len to 0) From 
ea9d3db2189b1afcda8d4bd8e98268a779d052e2 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Tue, 20 Jan 2026 19:40:46 +0530 Subject: [PATCH 8/9] few fixes --- search_knn.go | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/search_knn.go b/search_knn.go index 271fad3b2..a7c262b2c 100644 --- a/search_knn.go +++ b/search_knn.go @@ -385,21 +385,33 @@ func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, rea continue } // Applies to all supported types of queries. - filterSearcher, _ := filterQ.Searcher(ctx, reader, i.m, search.SearcherOptions{ + filterSearcher, err := filterQ.Searcher(ctx, reader, i.m, search.SearcherOptions{ Score: "none", // just want eligible hits --> don't compute scores if not needed }) + if err != nil { + return nil, err + } // Using the index doc count to determine collector size since we do not // have an estimate of the number of eligible docs in the index yet. indexDocCount, err := i.DocCount() if err != nil { + // close the searcher before returning + filterSearcher.Close() return nil, err } filterColl := collector.NewEligibleCollector(int(indexDocCount)) err = filterColl.Collect(ctx, filterSearcher, reader) if err != nil { + // close the searcher before returning + filterSearcher.Close() return nil, err } knnFilterResults[idx] = filterColl.EligibleSelector() + // Close the filter searcher, as we are done with it. + err = filterSearcher.Close() + if err != nil { + return nil, err + } } // Add the filter hits when creating the kNN query @@ -413,6 +425,11 @@ func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, rea if err != nil { return nil, err } + defer func() { + if serr := knnSearcher.Close(); err == nil && serr != nil { + err = serr + } + }() knnCollector := collector.NewKNNCollector(kArray, sumOfK) err = knnCollector.Collect(ctx, knnSearcher, reader) if err != nil { From d90565aa3ad9d4fdcba38cd40d64e2ea9593a3b9 Mon Sep 17 00:00:00 2001 From: Rahul Rampure Date: Tue, 20 Jan 2026 23:07:50 +0530 Subject: [PATCH 9/9] go mod tidy --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 80164509a..cb2e10e6a 100644 --- a/go.mod +++ b/go.mod @@ -24,7 +24,7 @@ require ( github.com/blevesearch/zapx/v13 v13.4.2 github.com/blevesearch/zapx/v14 v14.4.2 github.com/blevesearch/zapx/v15 v15.4.2 - github.com/blevesearch/zapx/v16 v16.3.0 + github.com/blevesearch/zapx/v16 v16.2.5-0.20260120173507-6b28cfe03ac2 github.com/blevesearch/zapx/v17 v17.0.0 github.com/couchbase/moss v0.2.0 github.com/golang/protobuf v1.3.2 diff --git a/go.sum b/go.sum index d7ecb6509..67b4d8f50 100644 --- a/go.sum +++ b/go.sum @@ -44,8 +44,8 @@ github.com/blevesearch/zapx/v14 v14.4.2 h1:2SGHakVKd+TrtEqpfeq8X+So5PShQ5nW6GNxT github.com/blevesearch/zapx/v14 v14.4.2/go.mod h1:rz0XNb/OZSMjNorufDGSpFpjoFKhXmppH9Hi7a877D8= github.com/blevesearch/zapx/v15 v15.4.2 h1:sWxpDE0QQOTjyxYbAVjt3+0ieu8NCE0fDRaFxEsp31k= github.com/blevesearch/zapx/v15 v15.4.2/go.mod h1:1pssev/59FsuWcgSnTa0OeEpOzmhtmr/0/11H0Z8+Nw= -github.com/blevesearch/zapx/v16 v16.3.0 h1:hF6VlN15E9CB40RMPyqOIhlDw1OOo9RItumhKMQktxw= -github.com/blevesearch/zapx/v16 v16.3.0/go.mod h1:zCFjv7McXWm1C8rROL+3mUoD5WYe2RKsZP3ufqcYpLY= +github.com/blevesearch/zapx/v16 v16.2.5-0.20260120173507-6b28cfe03ac2 h1:IV3iGtQ+Yt+3F5w8k0GBaEGFZgk5XQqYbmUftO/c/k4= +github.com/blevesearch/zapx/v16 v16.2.5-0.20260120173507-6b28cfe03ac2/go.mod h1:zCFjv7McXWm1C8rROL+3mUoD5WYe2RKsZP3ufqcYpLY= github.com/blevesearch/zapx/v17 v17.0.0 
h1:srLJFkv5ghz1Z8iVz5uoOK89G2NvI4KdMG7aF3Cx7rE= github.com/blevesearch/zapx/v17 v17.0.0/go.mod h1:/pi9Gq7byQcduhNB6Vk08+ZXGVGPjZoNc5QnQY8lkOo= github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps=
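
Taken together, the pre-filtered nested KNN flow these patches exercise looks roughly like the sketch below (requires a build with vector support; the `items`/`embedding_vector` field names follow the tests above, and `example_nested.bleve` is a hypothetical path):

```go
package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2"
	"github.com/blevesearch/bleve/v2/mapping"
	index "github.com/blevesearch/bleve_index_api"
)

func main() {
	// nested mapping: each element of "items" becomes its own sub-document
	items := mapping.NewNestedDocumentMapping()
	vec := mapping.NewVectorFieldMapping()
	vec.Dims = 3
	vec.Similarity = index.CosineSimilarity
	items.AddFieldMappingsAt("embedding_vector", vec)
	items.AddFieldMappingsAt("type", mapping.NewTextFieldMapping())

	im := bleve.NewIndexMapping()
	im.DefaultMapping.AddSubDocumentMapping("items", items)

	idx, err := bleve.New("example_nested.bleve", im)
	if err != nil {
		panic(err)
	}
	defer idx.Close()

	if err := idx.Index("doc1", map[string]interface{}{
		"items": []interface{}{
			map[string]interface{}{"type": "transport", "embedding_vector": []float32{1, 0, 0}},
			map[string]interface{}{"type": "food", "embedding_vector": []float32{0, 1, 0}},
		},
	}); err != nil {
		panic(err)
	}

	// the filter selects eligible nested items first; KNN then runs only
	// over the vectors of those filtered items
	filter := bleve.NewTermQuery("transport")
	filter.SetField("items.type")
	req := bleve.NewSearchRequest(bleve.NewMatchNoneQuery())
	req.AddKNNWithFilter("items.embedding_vector", []float32{0, 1, 0}, 5, 1.0, filter)

	res, err := idx.Search(req)
	if err != nil {
		panic(err)
	}
	fmt.Println(res.Hits)
}
```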