From bdd9010477145a2e33680dc299b4d9dc3bc89dc4 Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Thu, 12 Jun 2025 01:16:54 +0530 Subject: [PATCH 1/3] first draft --- document/field_nested.go | 93 ++++++++++++++++++++++++ index/scorch/snapshot_index.go | 15 +++- index/scorch/snapshot_index_tfr.go | 1 + mapping/document.go | 6 ++ mapping/index.go | 8 ++- search/query/nested.go | 101 ++++++++++++++++++++++++++ search/query/query.go | 11 ++- search/util.go | 7 ++ search_test.go | 111 +++++++++++++++++++++++++++++ 9 files changed, 349 insertions(+), 4 deletions(-) create mode 100644 document/field_nested.go create mode 100644 search/query/nested.go diff --git a/document/field_nested.go b/document/field_nested.go new file mode 100644 index 000000000..54dcf5efd --- /dev/null +++ b/document/field_nested.go @@ -0,0 +1,93 @@ +// Copyright (c) 2025 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package document + +import ( + "reflect" + + "github.com/blevesearch/bleve/v2/size" + index "github.com/blevesearch/bleve_index_api" +) + +var reflectStaticSizeNestedField int + +func init() { + var f NestedField + reflectStaticSizeNestedField = int(reflect.TypeOf(f).Size()) +} + +const DefaultNestedIndexingOptions = index.IndexField + +type NestedField struct { + name string + options index.FieldIndexingOptions + numPlainTextBytes uint64 + + nestedDocuments []index.Document + + docAnalyzer index.DocumentAnalyzer +} + +func (s *NestedField) Size() int { + return reflectStaticSizeNestedField + size.SizeOfPtr + + len(s.name) +} + +func (s *NestedField) Name() string { + return s.name +} + +func (s *NestedField) ArrayPositions() []uint64 { + return nil +} + +func (s *NestedField) Options() index.FieldIndexingOptions { + return s.options +} + +func (s *NestedField) NumPlainTextBytes() uint64 { + return s.numPlainTextBytes +} + +func (s *NestedField) AnalyzedLength() int { + return 0 +} + +func (s *NestedField) EncodedFieldType() byte { + return 'e' +} + +func (s *NestedField) AnalyzedTokenFrequencies() index.TokenFrequencies { + return nil +} + +func (s *NestedField) Analyze() { + for _, doc := range s.nestedDocuments { + s.docAnalyzer.Analyze(doc) + } +} + +func (s *NestedField) Value() []byte { + return nil +} + +func NewNestedField(name string, nestedDocuments []index.Document, docAnalyzer index.DocumentAnalyzer) *NestedField { + return &NestedField{ + name: name, + options: DefaultNestedIndexingOptions, + nestedDocuments: nestedDocuments, + docAnalyzer: docAnalyzer, + } +} diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 4f67a3c0b..3e8434764 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -28,6 +28,7 @@ import ( "github.com/RoaringBitmap/roaring/v2" "github.com/blevesearch/bleve/v2/document" + "github.com/blevesearch/bleve/v2/search" index "github.com/blevesearch/bleve_index_api" segment 
"github.com/blevesearch/scorch_segment_api/v2" "github.com/blevesearch/vellum" @@ -621,6 +622,12 @@ func (is *IndexSnapshot) TermFieldReader(ctx context.Context, term []byte, field rv.includeTermVectors = includeTermVectors rv.currPosting = nil rv.currID = rv.currID[:0] + rv.nestInfo = nil + if ctx != nil { + if nInfo, ok := ctx.Value(search.NestedInfoCallbackKey).(*search.NestedInfo); ok { + rv.nestInfo = nInfo + } + } if rv.dicts == nil { rv.dicts = make([]segment.TermDictionary, len(is.segment)) @@ -634,7 +641,13 @@ func (is *IndexSnapshot) TermFieldReader(ctx context.Context, term []byte, field segBytesRead := s.segment.BytesRead() rv.incrementBytesRead(segBytesRead) } - dict, err := s.segment.Dictionary(field) + var dict segment.TermDictionary + var err error + if nestedSegment, ok := s.segment.(segment.NestedSegment); ok && rv.nestInfo != nil { + dict, err = nestedSegment.NestedDictionary(field, rv.nestInfo.Path, rv.nestInfo.ArrayPosition) + } else { + dict, err = s.segment.Dictionary(field) + } if err != nil { return nil, err } diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index 315c5686c..843d9fbe7 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -51,6 +51,7 @@ type IndexSnapshotTermFieldReader struct { bytesRead uint64 ctx context.Context unadorned bool + nestInfo *search.NestedInfo } func (i *IndexSnapshotTermFieldReader) incrementBytesRead(val uint64) { diff --git a/mapping/document.go b/mapping/document.go index bf93896c9..102c7a641 100644 --- a/mapping/document.go +++ b/mapping/document.go @@ -42,6 +42,7 @@ import ( type DocumentMapping struct { Enabled bool `json:"enabled"` Dynamic bool `json:"dynamic"` + Nested bool `json:"nested,omitempty"` Properties map[string]*DocumentMapping `json:"properties,omitempty"` Fields []*FieldMapping `json:"fields,omitempty"` DefaultAnalyzer string `json:"default_analyzer,omitempty"` @@ -316,6 +317,11 @@ func (dm *DocumentMapping) 
UnmarshalJSON(data []byte) error { if err != nil { return err } + case "nested": + err := util.UnmarshalJSON(v, &dm.Nested) + if err != nil { + return err + } case "default_analyzer": err := util.UnmarshalJSON(v, &dm.DefaultAnalyzer) if err != nil { diff --git a/mapping/index.go b/mapping/index.go index a40feb470..dcb0fd8ed 100644 --- a/mapping/index.go +++ b/mapping/index.go @@ -196,11 +196,17 @@ func (im *IndexMappingImpl) Validate() error { if err != nil { return err } - for _, docMapping := range im.TypeMapping { + if im.DefaultMapping.Nested { + return fmt.Errorf("default mapping cannot be nested") + } + for typ, docMapping := range im.TypeMapping { err = docMapping.Validate(im.cache, "", fieldAliasCtx) if err != nil { return err } + if docMapping.Nested { + return fmt.Errorf("document mapping for type '%s' cannot be nested", typ) + } } if _, ok := index.SupportedScoringModels[im.ScoringModel]; !ok && im.ScoringModel != "" { diff --git a/search/query/nested.go b/search/query/nested.go new file mode 100644 index 000000000..6a770464c --- /dev/null +++ b/search/query/nested.go @@ -0,0 +1,101 @@ +// Copyright (c) 2025 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package query + +import ( + "context" + "encoding/json" + "fmt" + + "github.com/blevesearch/bleve/v2/mapping" + "github.com/blevesearch/bleve/v2/search" + "github.com/blevesearch/bleve/v2/search/searcher" + "github.com/blevesearch/bleve/v2/util" + index "github.com/blevesearch/bleve_index_api" +) + +type NestedQuery struct { + Path string `json:"path"` + InnerQuery Query `json:"query"` +} + +func NewNestedQuery(path string, innerQuery Query) *NestedQuery { + return &NestedQuery{ + Path: path, + InnerQuery: innerQuery, + } +} + +func (q *NestedQuery) Searcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) { + nr, ok := i.(index.NestedReader) + if !ok { + return nil, fmt.Errorf("nested searcher requires an index reader that supports nested documents") + } + childCount := nr.ChildCount(q.Path) + if childCount == 0 { + return nil, fmt.Errorf("nested searcher: path %q has no child documents", q.Path) + } + innerSearchers := make([]search.Searcher, 0, childCount) + for arrayPos := range childCount { + nctx := context.WithValue(ctx, search.NestedInfoCallbackKey, &search.NestedInfo{ + Path: q.Path, + ArrayPosition: arrayPos, + }) + innerSearcher, err := q.InnerQuery.Searcher(nctx, i, m, options) + if err != nil { + return nil, fmt.Errorf("nested searcher: failed to create inner searcher at pos %d: %w", arrayPos, err) + } + innerSearchers = append(innerSearchers, innerSearcher) + } + return searcher.NewDisjunctionSearcher(ctx, i, innerSearchers, 0, options) +} + +func (q *NestedQuery) Validate() error { + if q.Path == "" { + return fmt.Errorf("nested query must have a path") + } + if q.InnerQuery == nil { + return fmt.Errorf("nested query must have a query") + } + if vq, ok := q.InnerQuery.(ValidatableQuery); ok { + if err := vq.Validate(); err != nil { + return fmt.Errorf("nested query must have a valid query: %v", err) + } + } + return nil +} + +func (q *NestedQuery) UnmarshalJSON(data []byte) 
error { + tmp := struct { + Path string `json:"path"` + Query json.RawMessage `json:"query"` + }{} + err := util.UnmarshalJSON(data, &tmp) + if err != nil { + return err + } + if tmp.Path == "" { + return fmt.Errorf("nested query must have a path") + } + if tmp.Query == nil { + return fmt.Errorf("nested query must have a query") + } + q.Path = tmp.Path + q.InnerQuery, err = ParseQuery(tmp.Query) + if err != nil || q.InnerQuery == nil { + return fmt.Errorf("nested query must have a valid query: %v", err) + } + return nil +} diff --git a/search/query/query.go b/search/query/query.go index 6df38da37..17cdae8aa 100644 --- a/search/query/query.go +++ b/search/query/query.go @@ -353,7 +353,6 @@ func ParseQuery(input []byte) (Query, error) { } return &rv, nil } - _, hasGeo := tmp["geometry"] if hasGeo { var rv GeoShapeQuery @@ -363,7 +362,6 @@ func ParseQuery(input []byte) (Query, error) { } return &rv, nil } - _, hasCIDR := tmp["cidr"] if hasCIDR { var rv IPRangeQuery @@ -373,6 +371,15 @@ func ParseQuery(input []byte) (Query, error) { } return &rv, nil } + _, hasNested := tmp["nested"] + if hasNested { + var rv NestedQuery + err := util.UnmarshalJSON(input, &rv) + if err != nil { + return nil, err + } + return &rv, nil + } return nil, fmt.Errorf("unknown query type") } diff --git a/search/util.go b/search/util.go index 06f8f99d5..d91c764a1 100644 --- a/search/util.go +++ b/search/util.go @@ -152,6 +152,8 @@ const ( // BM25StatsKey is used to store and transport the BM25 Data // to the actual search phase which would use it to perform the search. 
BM25StatsKey ContextKey = "_bm25_stats_key" + + NestedInfoCallbackKey ContextKey = "_nested_info_callback_key" ) func RecordSearchCost(ctx context.Context, @@ -233,3 +235,8 @@ type BM25Stats struct { DocCount float64 `json:"doc_count"` FieldCardinality map[string]int `json:"field_cardinality"` } + +type NestedInfo struct { + Path string `json:"path"` + ArrayPosition int `json:"array_position"` +} diff --git a/search_test.go b/search_test.go index a144b605e..dbb4aed1e 100644 --- a/search_test.go +++ b/search_test.go @@ -4808,3 +4808,114 @@ func TestNumericSortAlias(t *testing.T) { } } } + +func TestNestedMapping(t *testing.T) { + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + doc := ` + { + "blog": { + "title": "Tech Insights", + "posts": [ + { + "title": "AI Trends", + "published_date": "2025-04-22", + "comments": [ + { + "author": "Jane", + "text": "Very informative!", + "likes": 3 + }, + { + "author": "Tom", + "text": "Needs more detail.", + "likes": 1 + } + ] + + }, + { + "title": "Quantum Computing", + "published_date": "2024-11-15", + "comments": [ + { + "author": "Jane", + "text": "Mind-blowing!", + "likes": 5 + } + ] + } + ] + } + } + ` + imap := mapping.NewIndexMapping() + keywordMapping := NewTextFieldMapping() + keywordMapping.Analyzer = keyword.Name + englishMapping := NewTextFieldMapping() + englishMapping.Analyzer = en.AnalyzerName + numericMapping := NewNumericFieldMapping() + dateTimeMapping := NewDateTimeFieldMapping() + commentMapping := NewDocumentMapping() + commentMapping.Nested = true + commentMapping.AddFieldMappingsAt("author", englishMapping) + commentMapping.AddFieldMappingsAt("text", englishMapping) + commentMapping.AddFieldMappingsAt("likes", numericMapping) + postsMapping := NewDocumentMapping() + postsMapping.Nested = true + postsMapping.AddFieldMappingsAt("title", englishMapping) + postsMapping.AddFieldMappingsAt("published_date", dateTimeMapping) + postsMapping.AddSubDocumentMapping("comments", 
commentMapping) + blogMapping := NewDocumentMapping() + blogMapping.AddFieldMappingsAt("title", englishMapping) + blogMapping.AddSubDocumentMapping("posts", postsMapping) + imap.DefaultMapping.AddSubDocumentMapping("blog", blogMapping) + idx, err := New(tmpIndexPath, imap) + if err != nil { + t.Fatal(err) + } + defer func() { + err = idx.Close() + if err != nil { + t.Fatal(err) + } + }() + var document map[string]interface{} + err = json.Unmarshal([]byte(doc), &document) + if err != nil { + t.Fatal(err) + } + err = idx.Index("1", document) + if err != nil { + t.Fatal(err) + } + // Return all blogs with titles containing the word "Tech". + mq := query.NewMatchQuery("Tech") + mq.SetField("blog.title") + req := NewSearchRequest(mq) + res, err := idx.Search(req) + if err != nil { + t.Fatal(err) + } + if len(res.Hits) != 1 { + t.Fatalf("expected 1 hit, got %d", len(res.Hits)) + } + // Conjunction: a comment authored by "Jane" AND a post published_date within 2024. + q1 := query.NewMatchQuery("Jane") + q1.SetField("blog.posts.comments.author") + q2 := query.NewDateRangeStringQuery("2024-01-01", "2024-12-31") + q2.SetField("blog.posts.published_date") + cq := query.NewConjunctionQuery([]query.Query{q1, q2}) + req = NewSearchRequest(cq) + req.Explain = true + req.Fields = []string{"*"} + req.Highlight = NewHighlightWithStyle(ansi.Name) + res, err = idx.Search(req) + if err != nil { + t.Fatal(err) + } + if len(res.Hits) != 0 { + t.Fatalf("expected 0 hits, got %d", len(res.Hits)) + } +} From ac6bc3d81ea4466d42093f6cd13cc5020b5f75e4 Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Tue, 8 Jul 2025 00:53:19 +0530 Subject: [PATCH 2/3] version2 --- index/scorch/snapshot_index.go | 10 ++-- index/scorch/snapshot_index_tfr.go | 2 +- search/query/nested.go | 19 +++++--- search/util.go | 78 ++++++++++++++++++++++++++++-- 4 files changed, 93 insertions(+), 16 deletions(-) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 3e8434764..2e2710b74 100644 --- 
a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -622,10 +622,10 @@ func (is *IndexSnapshot) TermFieldReader(ctx context.Context, term []byte, field rv.includeTermVectors = includeTermVectors rv.currPosting = nil rv.currID = rv.currID[:0] - rv.nestInfo = nil + rv.nestedState = nil if ctx != nil { - if nInfo, ok := ctx.Value(search.NestedInfoCallbackKey).(*search.NestedInfo); ok { - rv.nestInfo = nInfo + if nestedState, ok := ctx.Value(search.NestedStateKey).(index.NestedState); ok { + rv.nestedState = nestedState } } @@ -643,8 +643,8 @@ func (is *IndexSnapshot) TermFieldReader(ctx context.Context, term []byte, field } var dict segment.TermDictionary var err error - if nestedSegment, ok := s.segment.(segment.NestedSegment); ok && rv.nestInfo != nil { - dict, err = nestedSegment.NestedDictionary(field, rv.nestInfo.Path, rv.nestInfo.ArrayPosition) + if ns, ok := s.segment.(segment.NestedSegment); ok && rv.nestedState != nil { + dict, err = ns.NestedDictionary(rv.nestedState, field) } else { dict, err = s.segment.Dictionary(field) } diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index 843d9fbe7..3ce371a26 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -51,7 +51,7 @@ type IndexSnapshotTermFieldReader struct { bytesRead uint64 ctx context.Context unadorned bool - nestInfo *search.NestedInfo + nestedState index.NestedState } func (i *IndexSnapshotTermFieldReader) incrementBytesRead(val uint64) { diff --git a/search/query/nested.go b/search/query/nested.go index 6a770464c..e6016aa26 100644 --- a/search/query/nested.go +++ b/search/query/nested.go @@ -43,16 +43,23 @@ func (q *NestedQuery) Searcher(ctx context.Context, i index.IndexReader, m mappi if !ok { return nil, fmt.Errorf("nested searcher requires an index reader that supports nested documents") } - childCount := nr.ChildCount(q.Path) + if q.Path == "" || q.InnerQuery == nil { + return nil, fmt.Errorf("nested 
searcher requires a valid path and inner query") + } + var baseState index.NestedState + if existing, ok := ctx.Value(search.NestedStateKey).(index.NestedState); ok { + baseState = existing + } else { + baseState = search.NewNestedState() + } + childCount := nr.ChildCount(baseState, q.Path) if childCount == 0 { return nil, fmt.Errorf("nested searcher: path %q has no child documents", q.Path) } innerSearchers := make([]search.Searcher, 0, childCount) - for arrayPos := range childCount { - nctx := context.WithValue(ctx, search.NestedInfoCallbackKey, &search.NestedInfo{ - Path: q.Path, - ArrayPosition: arrayPos, - }) + for arrayPos := 0; arrayPos < childCount; arrayPos++ { + newState := baseState.Append(q.Path, arrayPos) + nctx := context.WithValue(ctx, search.NestedStateKey, newState) innerSearcher, err := q.InnerQuery.Searcher(nctx, i, m, options) if err != nil { return nil, fmt.Errorf("nested searcher: failed to create inner searcher at pos %d: %w", arrayPos, err) diff --git a/search/util.go b/search/util.go index d91c764a1..77fec4507 100644 --- a/search/util.go +++ b/search/util.go @@ -16,7 +16,9 @@ package search import ( "context" + "slices" + index "github.com/blevesearch/bleve_index_api" "github.com/blevesearch/geo/s2" ) @@ -153,7 +155,7 @@ const ( // to the actual search phase which would use it to perform the search. 
BM25StatsKey ContextKey = "_bm25_stats_key" - NestedInfoCallbackKey ContextKey = "_nested_info_callback_key" + NestedStateKey ContextKey = "_nested_state_key" ) func RecordSearchCost(ctx context.Context, @@ -236,7 +238,75 @@ type BM25Stats struct { FieldCardinality map[string]int `json:"field_cardinality"` } -type NestedInfo struct { - Path string `json:"path"` - ArrayPosition int `json:"array_position"` +type nestedState struct { + paths []string + arrayPositions []int +} + +func NewNestedState() index.NestedState { + return &nestedState{ + paths: make([]string, 0), + arrayPositions: make([]int, 0), + } +} + +// Append returns a new NestedState with the given path and array position added. +// It does NOT modify the original NestedState. +func (s *nestedState) Append(path string, pos int) index.NestedState { + return &nestedState{ + paths: append(slices.Clone(s.paths), path), + arrayPositions: append(slices.Clone(s.arrayPositions), pos), + } +} + +func (s *nestedState) Empty() bool { + return len(s.paths) == 0 && len(s.arrayPositions) == 0 +} + +func (s *nestedState) Clear() { + s.paths = s.paths[:0] + s.arrayPositions = s.arrayPositions[:0] +} + +func (s *nestedState) Root() string { + if len(s.paths) == 0 { + return "" + } + return s.paths[0] +} + +func (s *nestedState) Iterator() index.NestedIterator { + return &nestedIterator{ + paths: s.paths, + arrayPosition: s.arrayPositions, + index: 0, + } +} + +type nestedIterator struct { + paths []string + arrayPosition []int + index int +} + +func (ni *nestedIterator) HasNext() bool { + return ni.index < len(ni.paths) +} + +func (ni *nestedIterator) Next() (string, int, bool) { + if ni.index >= len(ni.paths) { + return "", 0, false + } + path := ni.paths[ni.index] + arrayPos := ni.arrayPosition[ni.index] + ni.index++ + return path, arrayPos, true +} + +func (ni *nestedIterator) Reset() { + ni.index = 0 +} + +func (ni *nestedIterator) Size() int { + return len(ni.paths) } From 0ec61816913513c6a7aef41b61022aae972eee0c 
Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Tue, 15 Jul 2025 18:42:58 +0530 Subject: [PATCH 3/3] small interface impl --- document/field_nested.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/document/field_nested.go b/document/field_nested.go index 54dcf5efd..2aafd6646 100644 --- a/document/field_nested.go +++ b/document/field_nested.go @@ -83,6 +83,16 @@ func (s *NestedField) Value() []byte { return nil } +func (s *NestedField) NumChildren() int { + return len(s.nestedDocuments) +} + +func (s *NestedField) VisitChildren(visitor func(arrayPosition int, document index.Document)) { + for i, doc := range s.nestedDocuments { + visitor(i, doc) + } +} + func NewNestedField(name string, nestedDocuments []index.Document, docAnalyzer index.DocumentAnalyzer) *NestedField { return &NestedField{ name: name,