From 0466c32b259c27097bc634ab0bab8631bea21ef4 Mon Sep 17 00:00:00 2001 From: Likith B Date: Mon, 2 Feb 2026 19:11:49 +0530 Subject: [PATCH 1/5] MB-62985: Forcing cosine similarity for binary optimized indexes --- document/field_vector.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/document/field_vector.go b/document/field_vector.go index 4c20013c7..fcd951a32 100644 --- a/document/field_vector.go +++ b/document/field_vector.go @@ -114,6 +114,10 @@ func NewVectorFieldWithIndexingOptions(name string, arrayPositions []uint64, // skip freq/norms for vector field options |= index.SkipFreqNorm + if vectorIndexOptimizedFor == index.IndexOptimizedForBinary { + similarity = index.CosineSimilarity + } + return &VectorField{ name: name, dims: dims, From 2712d88a8f0d96f68bd302472341eda553307ced Mon Sep 17 00:00:00 2001 From: Likith B Date: Fri, 13 Feb 2026 13:32:53 +0530 Subject: [PATCH 2/5] Addressing reviews and added tests --- document/field_vector.go | 2 +- mapping/mapping_vectors.go | 15 + search/query/knn.go | 3 + search_binary_test.go | 731 +++++++++++++++++++++++++++++++++++++ search_knn_test.go | 269 ++++++++++++++ 5 files changed, 1019 insertions(+), 1 deletion(-) create mode 100644 search_binary_test.go diff --git a/document/field_vector.go b/document/field_vector.go index fcd951a32..5157c557c 100644 --- a/document/field_vector.go +++ b/document/field_vector.go @@ -114,7 +114,7 @@ func NewVectorFieldWithIndexingOptions(name string, arrayPositions []uint64, // skip freq/norms for vector field options |= index.SkipFreqNorm - if vectorIndexOptimizedFor == index.IndexOptimizedForBinary { + if vectorIndexOptimizedFor == index.IndexOptimizedWithBivfFlat { similarity = index.CosineSimilarity } diff --git a/mapping/mapping_vectors.go b/mapping/mapping_vectors.go index 7c7ff1b98..c6c618043 100644 --- a/mapping/mapping_vectors.go +++ b/mapping/mapping_vectors.go @@ -151,6 +151,11 @@ func (fm *FieldMapping) processVector(propertyMightBeVector interface{}, if vectorIndexOptimizedFor == "" { vectorIndexOptimizedFor = index.DefaultIndexOptimization } + // BIVF-Flat index needs cosine similarity for correct scoring + // regardless of the similarity metric specified in the mapping. + if vectorIndexOptimizedFor == index.IndexOptimizedWithBivfFlat { + similarity = index.CosineSimilarity + } // normalize raw vector if similarity is cosine // Since the vector can be multi-vector (flattened array of multiple vectors), // we use NormalizeMultiVector to normalize each sub-vector independently. @@ -185,6 +190,11 @@ func (fm *FieldMapping) processVectorBase64(propertyMightBeVectorBase64 interfac if vectorIndexOptimizedFor == "" { vectorIndexOptimizedFor = index.DefaultIndexOptimization } + // BIVF-Flat index needs cosine similarity for correct scoring + // regardless of the similarity metric specified in the mapping. 
+ if vectorIndexOptimizedFor == index.IndexOptimizedWithBivfFlat { + similarity = index.CosineSimilarity + } decodedVector, err := document.DecodeVector(encodedString) if err != nil || len(decodedVector) != fm.Dims { return @@ -289,6 +299,11 @@ func validateVectorFieldAlias(field *FieldMapping, path []string, effectiveOptimizedFor, reflect.ValueOf(index.SupportedVectorIndexOptimizations).MapKeys()) } + // BIVF-Flat index optimization requires vector dimensions to be a multiple of 8 + if effectiveOptimizedFor == index.IndexOptimizedWithBivfFlat && field.Dims%8 != 0 { + return fmt.Errorf("field: '%s', invalid vector dimension: %d for optimization with BIVF-Flat,"+ + " dimension should be a multiple of 8", effectiveFieldName, field.Dims) + } if fieldAliasCtx != nil { // writing to a nil map is unsafe fieldAliasCtx[effectiveFieldName] = field diff --git a/search/query/knn.go b/search/query/knn.go index ea8780a41..e0389d43e 100644 --- a/search/query/knn.go +++ b/search/query/knn.go @@ -84,6 +84,9 @@ func (q *KNNQuery) Searcher(ctx context.Context, i index.IndexReader, if q.K <= 0 || len(q.Vector) == 0 { return nil, fmt.Errorf("k must be greater than 0 and vector must be non-empty") } + if fieldMapping.VectorIndexOptimizedFor == index.IndexOptimizedWithBivfFlat { + similarityMetric = index.CosineSimilarity + } if similarityMetric == index.CosineSimilarity { // normalize the vector q.Vector = mapping.NormalizeVector(q.Vector) diff --git a/search_binary_test.go b/search_binary_test.go new file mode 100644 index 000000000..4c34b8a64 --- /dev/null +++ b/search_binary_test.go @@ -0,0 +1,731 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build vectors +// +build vectors + +package bleve + +import ( + "bufio" + "encoding/json" + "fmt" + "math/rand" + "os" + "strings" + "testing" + "time" + + "github.com/blevesearch/bleve/v2/mapping" + index "github.com/blevesearch/bleve_index_api" +) + +// var datasetPath = "/Users/likithb/Desktop/Code/MB-62985/debug/dataset/cohere.jsonl" +// var parentPath = "/Users/likithb/Desktop/Code/MB-62985/indexes" +// var queryPath = "/Users/likithb/Desktop/Code/MB-62985/debug/queries1k.jsonl" + +// func TestBuild500k(t *testing.T) { +// f, err := os.Open(datasetPath) +// if err != nil { +// t.Fatalf("open dataset: %v", err) +// } +// defer f.Close() + +// scanner := bufio.NewScanner(f) +// buf := make([]byte, 0, 1024*1024) +// scanner.Buffer(buf, 128*1024*1024) +// dims := 768 + +// indexMapping := mapping.NewIndexMapping() + +// // Recall-optimized vector field mapping +// vecFieldFloatRecall := mapping.NewVectorFieldMapping() +// vecFieldFloatRecall.Similarity = index.CosineSimilarity +// vecFieldFloatRecall.Dims = dims +// vecFieldFloatRecall.VectorIndexOptimizedFor = index.IndexOptimizedForRecall + +// indexMapping.DefaultMapping.AddFieldMappingsAt("recall", vecFieldFloatRecall) + +// // Latency-optimized vector field mapping +// vecFieldFloatLatency := mapping.NewVectorFieldMapping() +// vecFieldFloatLatency.Similarity = index.CosineSimilarity +// vecFieldFloatLatency.Dims = dims +// vecFieldFloatLatency.VectorIndexOptimizedFor = index.IndexOptimizedForLatency + +// indexMapping.DefaultMapping.AddFieldMappingsAt("latency", vecFieldFloatLatency) + +// // Memory-efficient vector field mapping +// vecFieldFloatMemory := mapping.NewVectorFieldMapping() +// vecFieldFloatMemory.Similarity = index.CosineSimilarity +// vecFieldFloatMemory.Dims = dims +// vecFieldFloatMemory.VectorIndexOptimizedFor = index.IndexOptimizedForMemoryEfficient + +// indexMapping.DefaultMapping.AddFieldMappingsAt("memory", vecFieldFloatMemory) + +// // Binary-optimized vector field mapping +// vecFieldBinary := mapping.NewVectorFieldMapping() +// vecFieldBinary.Similarity = index.CosineSimilarity +// vecFieldBinary.Dims = dims +// vecFieldBinary.VectorIndexOptimizedFor = index.IndexOptimizedForBinary + +// indexMapping.DefaultMapping.AddFieldMappingsAt("binary", vecFieldBinary) + +// indexPath := parentPath + "/test6-50k" +// if err := os.RemoveAll(indexPath); err != nil { +// t.Fatalf("remove index path: %v", err) +// } + +// idx, err := New(indexPath, indexMapping) +// if err != nil { +// t.Fatalf("create index: %v", err) +// } + +// defer idx.Close() + +// batchSize := 500 +// batch := idx.NewBatch() +// numDocs := 50000 +// count := 0 + +// for scanner.Scan() { +// if count%batchSize == 0 && count > 0 { +// if err := idx.Batch(batch); err != nil { +// t.Fatalf("index batch at doc %d: %v", count, err) +// } +// batch = idx.NewBatch() +// fmt.Printf("indexed %d documents\n", count) +// } +// if count >= numDocs { +// break +// } +// line := strings.TrimSpace(scanner.Text()) +// if line == "" { +// continue +// } +// _, vec, ok := parseDoc(line) +// if !ok || len(vec) != dims { +// t.Fatalf("invalid doc at line %d", count) +// continue +// } +// if err := batch.Index( +// fmt.Sprintf("doc-%d", count), +// map[string]interface{}{ +// "recall": vec, +// "latency": vec, +// "memory": vec, +// "binary": vec, +// }, +// ); err != nil { +// t.Fatalf("index doc %d: %v", count, err) +// } +// count++ +// } +// if err := scanner.Err(); err != nil { +// t.Fatalf("read dataset: %v", err) +// } +// if err := idx.Batch(batch); 
err != nil { +// t.Fatalf("index final batch: %v", err) +// } + +// time.Sleep(15 * time.Second) +// } + +// func TestGenQueries500k(t *testing.T) { + +// f, err := os.Open(datasetPath) +// if err != nil { +// t.Fatalf("open dataset: %v", err) +// } +// defer f.Close() + +// scanner := bufio.NewScanner(f) +// buf := make([]byte, 0, 1024*1024) +// scanner.Buffer(buf, 128*1024*1024) + +// if err := os.RemoveAll(queryPath); err != nil { +// t.Fatalf("remove index path: %v", err) +// } +// queriesFile, err := os.Create(queryPath) +// if err != nil { +// t.Fatalf("create queries file: %v", err) +// } +// defer queriesFile.Close() +// writer := bufio.NewWriter(queriesFile) +// numQueries := 2000 + +// // Read 500 vectors and pick one query vector at random and repeat till numQueries or dataset ends +// batchSize := 500 +// var batch []map[string]interface{} +// count := 0 +// queriesGenerated := 0 + +// for scanner.Scan() && queriesGenerated < numQueries { +// line := strings.TrimSpace(scanner.Text()) +// if line == "" { +// continue +// } + +// var obj map[string]interface{} +// if err := json.Unmarshal([]byte(line), &obj); err != nil { +// continue +// } + +// batch = append(batch, obj) + +// // Process batch when it reaches batchSize +// if len(batch) == batchSize { +// if len(batch) > 0 { +// // Pick a random vector from the batch +// randomIdx := rand.Intn(len(batch)) +// queryObj := batch[randomIdx] + +// // Write the query to file +// queryBytes, err := json.Marshal(queryObj) +// if err != nil { +// batch = nil +// count++ +// continue +// } + +// if _, err := writer.WriteString(string(queryBytes) + "\n"); err != nil { +// t.Fatalf("write query: %v", err) +// } + +// queriesGenerated++ +// if queriesGenerated%100 == 0 { +// fmt.Printf("Generated query %d\tCount %d\n", queriesGenerated, count) +// } +// } + +// batch = nil +// } + +// count++ +// } + +// // Process remaining batch at end of file +// if len(batch) > 0 && queriesGenerated < numQueries { +// randomIdx := rand.Intn(len(batch)) +// queryObj := batch[randomIdx] + +// queryBytes, err := json.Marshal(queryObj) +// if err == nil { +// if _, err := writer.WriteString(string(queryBytes) + "\n"); err != nil { +// t.Fatalf("write query: %v", err) +// } +// fmt.Printf("Generated query %d\tCount %d\n", queriesGenerated+1, count) +// queriesGenerated++ +// } +// } + +// if err := scanner.Err(); err != nil { +// t.Fatalf("read dataset: %v", err) +// } + +// if err := writer.Flush(); err != nil { +// t.Fatalf("flush queries file: %v", err) +// } +// } + +// func TestRecall500k(t *testing.T) { + +// queryVecs := readQueryVecs(t) + +// indexPath := parentPath + "/test6-50k" + +// idx, err := Open(indexPath) +// if err != nil { +// t.Fatalf("open index: %v", err) +// } +// defer idx.Close() + +// totalRecall := 0.0 +// for i, qv := range queryVecs { +// srRec := NewSearchRequest(NewMatchNoneQuery()) +// srRec.AddKNN("recall", qv, 100, 1.0) +// srRec.Size = 100 + +// hitsRecall, err := idx.Search(srRec) +// if err != nil { +// t.Fatalf("search recall: %v", err) +// } + +// srBin := NewSearchRequest(NewMatchNoneQuery()) +// srBin.AddKNN("binary", qv, 100, 1.0) +// srBin.Size = 100 + +// hitsBin, err := idx.Search(srBin) +// if err != nil { +// t.Fatalf("search binary: %v", err) +// } + +// recallSet := make(map[string]struct{}) +// for _, hit := range hitsRecall.Hits { +// recallSet[hit.ID] = struct{}{} +// } + +// matching := 0 +// for _, hit := range hitsBin.Hits { +// if _, found := recallSet[hit.ID]; found { +// matching++ +// } +// } + +// 
recallRate := float64(matching) / float64(len(hitsRecall.Hits)) +// fmt.Printf("Query - %d\tRecall Rate - %f\n", i, recallRate) +// totalRecall += recallRate +// } + +// avgRecall := totalRecall / float64(len(queryVecs)) +// fmt.Printf("Average Recall Rate over %d queries: %f\n", len(queryVecs), avgRecall) +// } + +// func BenchmarkSearchBinary500k(b *testing.B) { +// queryVecs := readQueryVecsB(b) + +// indexPath := parentPath + "/test1-500k" +// idx, err := Open(indexPath) +// if err != nil { +// b.Fatalf("open index: %v", err) +// } +// defer idx.Close() + +// // Pre-build search requests +// reqs := make([]*SearchRequest, len(queryVecs)) +// for i, qv := range queryVecs { +// sr := NewSearchRequest(NewMatchNoneQuery()) +// sr.AddKNN("binary", qv, 100, 1.0) +// sr.Size = 100 +// reqs[i] = sr +// } + +// b.ResetTimer() + +// for i := 0; i < 1000; i++ { +// req := reqs[i] + +// _, err := idx.Search(req) +// if err != nil { +// b.Fatalf("search binary: %v", err) +// } +// } +// } + +// func readQueryVecs(t *testing.T) [][]float32 { +// f, err := os.Open(queryPath) +// if err != nil { +// t.Fatalf("open queries file: %v", err) +// } +// defer f.Close() + +// scanner := bufio.NewScanner(f) +// var queryVecs [][]float32 + +// for scanner.Scan() { +// line := strings.TrimSpace(scanner.Text()) +// if line == "" { +// continue +// } +// _, vec, ok := parseDoc(line) +// if !ok { +// t.Fatalf("invalid query line: %s", line) +// } +// queryVecs = append(queryVecs, vec) +// } +// if err := scanner.Err(); err != nil { +// t.Fatalf("read queries file: %v", err) +// } +// return queryVecs +// } + +// func readQueryVecsB(t *testing.B) [][]float32 { +// f, err := os.Open(queryPath) +// if err != nil { +// t.Fatalf("open queries file: %v", err) +// } +// defer f.Close() + +// scanner := bufio.NewScanner(f) +// var queryVecs [][]float32 + +// for scanner.Scan() { +// line := strings.TrimSpace(scanner.Text()) +// if line == "" { +// continue +// } +// _, vec, ok := parseDoc(line) +// if !ok { +// t.Fatalf("invalid query line: %s", line) +// } +// queryVecs = append(queryVecs, vec) +// } +// if err := scanner.Err(); err != nil { +// t.Fatalf("read queries file: %v", err) +// } +// return queryVecs +// } + +// func parseDoc(line string) (string, []float32, bool) { +// var obj map[string]interface{} +// if err := json.Unmarshal([]byte(line), &obj); err != nil { +// return "", nil, false +// } +// // txt, _ := obj["text"].(string) +// vec := anyToFloat32Slice(obj["emb"]) +// if len(vec) == 0 { +// return "", nil, false +// } +// return "", vec, true +// } + +// Helper function to convert various numeric types to []float32 +func anyToFloat32Slice(v interface{}) []float32 { + switch arr := v.(type) { + case []interface{}: + out := make([]float32, 0, len(arr)) + for _, it := range arr { + switch n := it.(type) { + case float64: + out = append(out, float32(n)) + case float32: + out = append(out, n) + case int: + out = append(out, float32(n)) + case int64: + out = append(out, float32(n)) + default: + return nil + } + } + return out + case []float64: + out := make([]float32, len(arr)) + for i := range arr { + out[i] = float32(arr[i]) + } + return out + case []float32: + return arr + default: + return nil + } +} + +var datasetPath = "/Users/likithb/Desktop/Code/MB-62985/debug/dataset/cohere.jsonl" +var queryPath = "/Users/likithb/Desktop/Code/MB-62985/debug/queries.json" +var groundTruthPath = "/Users/likithb/Desktop/Code/MB-62985/debug/groundTruth.json" +var indexDirectory = 
"/Users/likithb/Desktop/Code/MB-62985/indexes" + +func TestBinaryVectorSegment(t *testing.T) { + + numDocs := 1000 + dims := 768 + ver := 3 + vectors := loadVectors(t, datasetPath, numDocs) + // vectors := make([][]float32, numDocs) + // for i := 0; i < numDocs; i++ { + // vectors[i] = make([]float32, dims) + // for j := 0; j < dims; j++ { + // vectors[i][j] = randomFloat32(-1.0, 1.0) + // } + // } + // fmt.Printf("loaded %d vectors from %s\n\n", len(vectors), datasetPath) + + indexMapping := buildIndexMapping(dims) + + indexPath := fmt.Sprintf("%s/%d-%dk", indexDirectory, ver, numDocs/1000) + if err := os.RemoveAll(indexPath); err != nil { + t.Fatalf("remove index path: %v", err) + } + + idx, err := New(indexPath, indexMapping) + if err != nil { + t.Fatalf("create index: %v", err) + } + defer idx.Close() + fmt.Printf("created index\n\n") + + batchSize := 1000 + batch := idx.NewBatch() + count := 0 + + for _, vec := range vectors { + if count%batchSize == 0 && count > 0 { + if err := idx.Batch(batch); err != nil { + t.Fatalf("index batch at doc %d: %v", count, err) + } + batch = idx.NewBatch() + fmt.Printf("indexed %d documents\n", count) + } + + if err := batch.Index( + fmt.Sprintf("doc-%d", count), + map[string]interface{}{ + "binary": vec, + "recall": vec, + }, + ); err != nil { + t.Fatalf("index doc %d: %v", count, err) + } + count++ + } + if err := idx.Batch(batch); err != nil { + t.Fatalf("index final batch: %v", err) + } + fmt.Printf("indexed total %d documents\n\n", count) + vectors = nil + time.Sleep(5 * time.Second) + + // idx, err := Open(indexPath) + // if err != nil { + // t.Fatalf("open index: %v", err) + // } + // defer idx.Close() + // fmt.Printf("opened index from %s\n\n", indexPath) + + queries := loadQueries(t, queryPath) + // queries := make([][]float32, 10000) + // for i := 0; i < 10000; i++ { + // queries[i] = make([]float32, dims) + // for j := 0; j < dims; j++ { + // queries[i][j] = randomFloat32(-1.0, 1.0) + // } + // } + fmt.Printf("loaded queries from %s\n\n", queryPath) + + // groundTruths := loadGroundTruths(t, groundTruthPath) + // fmt.Printf("loaded ground truths from %s\n\n", groundTruthPath) + totalRecall := 0 + totalTimeBinary := 0 + totalTimeRecall := 0 + k := int64(3) + for i, qv := range queries { + start := time.Now() + sr := NewSearchRequest(NewMatchNoneQuery()) + sr.AddKNN("binary", qv, k, 1.0) + sr.Size = int(k) + + hits, err := idx.Search(sr) + if err != nil { + t.Fatalf("search binary: %v", err) + } + elapsed := time.Since(start) + totalTimeBinary += int(elapsed) + resultSet := make(map[int64]struct{}) + for _, hit := range hits.Hits { + var docNum int64 + _, err := fmt.Sscanf(hit.ID, "doc-%d", &docNum) + if err != nil { + t.Fatalf("parse doc ID %s: %v", hit.ID, err) + } + resultSet[docNum] = struct{}{} + } + + start = time.Now() + sr = NewSearchRequest(NewMatchNoneQuery()) + sr.AddKNN("recall", qv, k, 1.0) + sr.Size = int(k) + + hitsRec, err := idx.Search(sr) + if err != nil { + t.Fatalf("search recall: %v", err) + } + elapsed = time.Since(start) + totalTimeRecall += int(elapsed) + + groundTruths := make(map[int64]struct{}) + for _, hit := range hitsRec.Hits { + var docNum int64 + _, err := fmt.Sscanf(hit.ID, "doc-%d", &docNum) + if err != nil { + t.Fatalf("parse doc ID %s: %v", hit.ID, err) + } + groundTruths[docNum] = struct{}{} + } + matching := 0 + for docNum := range groundTruths { + if _, found := resultSet[docNum]; found { + matching++ + } + } + + // matching := 0 + // for docNum := range groundTruths[i] { + // if _, found := 
resultSet[docNum]; found { + // matching++ + // } + // } + + totalRecall += matching + + if i%1000 == 0 { + fmt.Printf("Processed %d queries\n", i+1) + fmt.Printf("Average Recall so far: %f\n", float64(totalRecall)/float64((i+1)*int(k))) + fmt.Printf("Average Time per Query (Binary) so far: %v\n", time.Duration(totalTimeBinary/(i+1))) + fmt.Printf("Average Time per Query (Recall) so far: %v\n", time.Duration(totalTimeRecall/(i+1))) + if i == 0 { + totalTimeBinary = 0 + totalTimeRecall = 0 + } + } + } + + avgRecall := float64(totalRecall) / float64(len(queries)*int(k)) + fmt.Printf("Average Recall over %d queries: %f\n", len(queries), avgRecall) + avgTime := time.Duration(totalTimeBinary / (len(queries) - 1)) + fmt.Printf("Average Time per Query (Binary): %v\n", avgTime) + avgTime = time.Duration(totalTimeRecall / (len(queries) - 1)) + fmt.Printf("Average Time per Query (Recall): %v\n", avgTime) +} + +func loadGroundTruths(t *testing.T, path string) []map[int64]struct{} { + var groundTruth []map[int64]struct{} + gtFile, err := os.Open(path) + if err != nil { + t.Fatalf("open ground truth file: %v", err) + } + defer gtFile.Close() + + decoder := json.NewDecoder(gtFile) + err = decoder.Decode(&groundTruth) + if err != nil { + t.Fatalf("decode ground truth from file: %v", err) + } + return groundTruth +} + +func randomFloat32(min, max float32) float32 { + return min + (max-min)*rand.Float32() +} + +func loadQueries(t *testing.T, path string) [][]float32 { + var queries [][]float32 + queryFile, err := os.Open(path) + if err != nil { + t.Fatalf("open query file: %v", err) + } + defer queryFile.Close() + + decoder := json.NewDecoder(queryFile) + err = decoder.Decode(&queries) + if err != nil { + t.Fatalf("decode queries from file: %v", err) + } + return queries +} + +func buildIndexMapping(dims int) *mapping.IndexMappingImpl { + indexMapping := mapping.NewIndexMapping() + + // Recall-optimized vector field mapping + vecFieldFloatRecall := mapping.NewVectorFieldMapping() + vecFieldFloatRecall.Similarity = index.CosineSimilarity + vecFieldFloatRecall.Dims = dims + vecFieldFloatRecall.VectorIndexOptimizedFor = index.IndexOptimizedForRecall + + indexMapping.DefaultMapping.AddFieldMappingsAt("recall", vecFieldFloatRecall) + + // Latency-optimized vector field mapping + vecFieldFloatLatency := mapping.NewVectorFieldMapping() + vecFieldFloatLatency.Similarity = index.CosineSimilarity + vecFieldFloatLatency.Dims = dims + vecFieldFloatLatency.VectorIndexOptimizedFor = index.IndexOptimizedForLatency + + indexMapping.DefaultMapping.AddFieldMappingsAt("latency", vecFieldFloatLatency) + + // Memory-efficient vector field mapping + vecFieldFloatMemory := mapping.NewVectorFieldMapping() + vecFieldFloatMemory.Similarity = index.CosineSimilarity + vecFieldFloatMemory.Dims = dims + vecFieldFloatMemory.VectorIndexOptimizedFor = index.IndexOptimizedForMemoryEfficient + + indexMapping.DefaultMapping.AddFieldMappingsAt("memory", vecFieldFloatMemory) + + // Binary-optimized vector field mapping + vecFieldBinary := mapping.NewVectorFieldMapping() + vecFieldBinary.Similarity = index.CosineSimilarity + vecFieldBinary.Dims = dims + vecFieldBinary.VectorIndexOptimizedFor = index.IndexOptimizedWithBivfFlat + + indexMapping.DefaultMapping.AddFieldMappingsAt("binary", vecFieldBinary) + + return indexMapping +} + +func loadVectors(t *testing.T, path string, maxDocs int) [][]float32 { + f, err := os.Open(path) + if err != nil { + t.Fatalf("open dataset: %v", err) + } + defer f.Close() + + scanner := bufio.NewScanner(f) + buf := 
make([]byte, 0, 1024*1024) + scanner.Buffer(buf, 128*1024*1024) + + var vectors [][]float32 + count := 0 + + for scanner.Scan() && count < maxDocs { + line := strings.TrimSpace(scanner.Text()) + if line == "" { + continue + } + + var obj map[string]interface{} + if err := json.Unmarshal([]byte(line), &obj); err != nil { + continue + } + + vec := anyToFloat32Slice(obj["emb"]) + vectors = append(vectors, vec) + count++ + } + if err := scanner.Err(); err != nil { + t.Fatalf("read dataset: %v", err) + } + + return vectors +} + +func TestCreateDistribution(t *testing.T) { + numVectors := 100000 + vectors := loadVectors(t, datasetPath, numVectors) + buckets := make([]int, 40) + + for i := 0; i < numVectors; i++ { + for j := 0; j < 768; j++ { + val := vectors[i][j] + bucketIdx := int((val + 2.0) / 0.1) + if bucketIdx < 0 { + bucketIdx = 0 + } else if bucketIdx >= len(buckets) { + bucketIdx = len(buckets) - 1 + } + buckets[bucketIdx]++ + } + } + + for i, count := range buckets { + lowerBound := -2.0 + float32(i)*0.1 + upperBound := lowerBound + 0.1 + fmt.Printf("Bucket %2d [%.1f, %.1f): %d\n", i, lowerBound, upperBound, count) + } + total := 0 + for _, count := range buckets { + total += count + } + fmt.Printf("Total values: %d\n", total) + fmt.Printf("Expected: %d\n", numVectors*768) +} diff --git a/search_knn_test.go b/search_knn_test.go index d053705ca..5df43326f 100644 --- a/search_knn_test.go +++ b/search_knn_test.go @@ -608,6 +608,275 @@ func TestVectorBase64Index(t *testing.T) { } } +// Test to verify that the BIVF-Flat index with vector base64 field mapping returns the +// same results as the non-optimized vector field mapping for L2, Dot Product and Cosine similarities. +// Also test to see no differences in results for any distance metric +func TestVectorBivfFlatIndex(t *testing.T) { + + dataset, searchRequests, err := readDatasetAndQueries(testInputCompressedFile) + if err != nil { + t.Fatal(err) + } + documents := makeDatasetIntoDocuments(dataset) + + _, searchRequestsCopy, err := readDatasetAndQueries(testInputCompressedFile) + if err != nil { + t.Fatal(err) + } + + for _, doc := range documents { + vec, ok := doc["vector"].([]float32) + if !ok { + t.Fatal("Typecasting vector to float array failed") + } + + buf := new(bytes.Buffer) + for _, v := range vec { + err := binary.Write(buf, binary.LittleEndian, v) + if err != nil { + t.Fatal(err) + } + } + + doc["vectorEncoded"] = base64.StdEncoding.EncodeToString(buf.Bytes()) + } + + for _, sr := range searchRequestsCopy { + for _, kr := range sr.KNN { + kr.Field = "vectorEncoded" + } + } + + contentFM := NewTextFieldMapping() + contentFM.Analyzer = en.AnalyzerName + + vecFML2 := mapping.NewVectorFieldMapping() + vecFML2.Dims = testDatasetDims + vecFML2.Similarity = index.EuclideanDistance + vecFML2.VectorIndexOptimizedFor = index.IndexOptimizedWithBivfFlat + + vecBFML2 := mapping.NewVectorBase64FieldMapping() + vecBFML2.Dims = testDatasetDims + vecBFML2.Similarity = index.EuclideanDistance + vecBFML2.VectorIndexOptimizedFor = index.IndexOptimizedWithBivfFlat + + vecFMDot := mapping.NewVectorFieldMapping() + vecFMDot.Dims = testDatasetDims + vecFMDot.Similarity = index.InnerProduct + vecFMDot.VectorIndexOptimizedFor = index.IndexOptimizedWithBivfFlat + + vecBFMDot := mapping.NewVectorBase64FieldMapping() + vecBFMDot.Dims = testDatasetDims + vecBFMDot.Similarity = index.InnerProduct + vecBFMDot.VectorIndexOptimizedFor = index.IndexOptimizedWithBivfFlat + + vecFMCosine := mapping.NewVectorFieldMapping() + vecFMCosine.Dims = testDatasetDims + 
vecFMCosine.Similarity = index.CosineSimilarity + + vecBFMCosine := mapping.NewVectorBase64FieldMapping() + vecBFMCosine.Dims = testDatasetDims + vecBFMCosine.Similarity = index.CosineSimilarity + vecBFMCosine.VectorIndexOptimizedFor = index.IndexOptimizedWithBivfFlat + + indexMappingL2 := NewIndexMapping() + indexMappingL2.DefaultMapping.AddFieldMappingsAt("content", contentFM) + indexMappingL2.DefaultMapping.AddFieldMappingsAt("vector", vecFML2) + indexMappingL2.DefaultMapping.AddFieldMappingsAt("vectorEncoded", vecBFML2) + + indexMappingDot := NewIndexMapping() + indexMappingDot.DefaultMapping.AddFieldMappingsAt("content", contentFM) + indexMappingDot.DefaultMapping.AddFieldMappingsAt("vector", vecFMDot) + indexMappingDot.DefaultMapping.AddFieldMappingsAt("vectorEncoded", vecBFMDot) + + indexMappingCosine := NewIndexMapping() + indexMappingCosine.DefaultMapping.AddFieldMappingsAt("content", contentFM) + indexMappingCosine.DefaultMapping.AddFieldMappingsAt("vector", vecFMCosine) + indexMappingCosine.DefaultMapping.AddFieldMappingsAt("vectorEncoded", vecBFMCosine) + + tmpIndexPathL2 := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPathL2) + + tmpIndexPathDot := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPathDot) + + tmpIndexPathCosine := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPathCosine) + + indexL2, err := New(tmpIndexPathL2, indexMappingL2) + if err != nil { + t.Fatal(err) + } + defer func() { + err := indexL2.Close() + if err != nil { + t.Fatal(err) + } + }() + + indexDot, err := New(tmpIndexPathDot, indexMappingDot) + if err != nil { + t.Fatal(err) + } + defer func() { + err := indexDot.Close() + if err != nil { + t.Fatal(err) + } + }() + + indexCosine, err := New(tmpIndexPathCosine, indexMappingCosine) + if err != nil { + t.Fatal(err) + } + defer func() { + err := indexCosine.Close() + if err != nil { + t.Fatal(err) + } + }() + + batchL2 := indexL2.NewBatch() + batchDot := indexDot.NewBatch() + batchCosine := indexCosine.NewBatch() + + for _, doc := range documents { + err = batchL2.Index(doc["id"].(string), doc) + if err != nil { + t.Fatal(err) + } + err = batchDot.Index(doc["id"].(string), doc) + if err != nil { + t.Fatal(err) + } + err = batchCosine.Index(doc["id"].(string), doc) + if err != nil { + t.Fatal(err) + } + } + + err = indexL2.Batch(batchL2) + if err != nil { + t.Fatal(err) + } + + err = indexDot.Batch(batchDot) + if err != nil { + t.Fatal(err) + } + + err = indexCosine.Batch(batchCosine) + if err != nil { + t.Fatal(err) + } + + for i := range searchRequests { + for _, operator := range knnOperators { + normQuery := searchRequests[i] + base64Query := searchRequestsCopy[i] + + normQuery.AddKNNOperator(operator) + base64Query.AddKNNOperator(operator) + + normResultL2, err := indexL2.Search(normQuery) + if err != nil { + t.Fatal(err) + } + base64ResultL2, err := indexL2.Search(base64Query) + if err != nil { + t.Fatal(err) + } + + if normResultL2 != nil && base64ResultL2 != nil { + if len(normResultL2.Hits) == len(base64ResultL2.Hits) { + for j := range normResultL2.Hits { + if normResultL2.Hits[j].ID != base64ResultL2.Hits[j].ID { + t.Fatalf("testcase %d failed: expected hit id %s, got hit id %s", i, normResultL2.Hits[j].ID, base64ResultL2.Hits[j].ID) + } + } + } + } else if (normResultL2 == nil && base64ResultL2 != nil) || + (normResultL2 != nil && base64ResultL2 == nil) { + t.Fatalf("testcase %d failed: expected result %s, got result %s", i, normResultL2, base64ResultL2) + } + + normResultDot, err := 
indexDot.Search(normQuery) + if err != nil { + t.Fatal(err) + } + base64ResultDot, err := indexDot.Search(base64Query) + if err != nil { + t.Fatal(err) + } + + if normResultDot != nil && base64ResultDot != nil { + if len(normResultDot.Hits) == len(base64ResultDot.Hits) { + for j := range normResultDot.Hits { + if normResultDot.Hits[j].ID != base64ResultDot.Hits[j].ID { + t.Fatalf("testcase %d failed: expected hit id %s, got hit id %s", i, normResultDot.Hits[j].ID, base64ResultDot.Hits[j].ID) + } + } + } + } else if (normResultDot == nil && base64ResultDot != nil) || + (normResultDot != nil && base64ResultDot == nil) { + t.Fatalf("testcase %d failed: expected result %s, got result %s", i, normResultDot, base64ResultDot) + } + + normResultCosine, err := indexCosine.Search(normQuery) + if err != nil { + t.Fatal(err) + } + base64ResultCosine, err := indexCosine.Search(base64Query) + if err != nil { + t.Fatal(err) + } + + if normResultCosine != nil && base64ResultCosine != nil { + if len(normResultCosine.Hits) == len(base64ResultCosine.Hits) { + for j := range normResultCosine.Hits { + if normResultCosine.Hits[j].ID != base64ResultCosine.Hits[j].ID { + t.Fatalf("testcase %d failed: expected hit id %s, got hit id %s", i, normResultCosine.Hits[j].ID, base64ResultCosine.Hits[j].ID) + } + } + } + } else if (normResultCosine == nil && base64ResultCosine != nil) || + (normResultCosine != nil && base64ResultCosine == nil) { + t.Fatalf("testcase %d failed: expected result %s, got result %s", i, normResultCosine, base64ResultCosine) + } + + if normResultCosine != nil && normResultL2 != nil { + if len(normResultCosine.Hits) == len(normResultL2.Hits) { + for j := range normResultCosine.Hits { + if normResultCosine.Hits[j].ID != normResultL2.Hits[j].ID { + if normResultCosine.Hits[j].Score != normResultL2.Hits[j].Score { + t.Fatalf("testcase %d failed: expected hit id %s, got hit id %s", i, normResultCosine.Hits[j].ID, normResultL2.Hits[j].ID) + } + } + } + } + } else if (normResultCosine == nil && normResultL2 != nil) || + (normResultCosine != nil && normResultL2 == nil) { + t.Fatalf("testcase %d failed: expected result %s, got result %s", i, normResultCosine, normResultL2) + } + + if normResultCosine != nil && normResultDot != nil { + if len(normResultCosine.Hits) == len(normResultDot.Hits) { + for j := range normResultCosine.Hits { + if normResultCosine.Hits[j].ID != normResultDot.Hits[j].ID { + if normResultCosine.Hits[j].Score != normResultDot.Hits[j].Score { + t.Fatalf("testcase %d failed: expected hit id %s, got hit id %s", i, normResultCosine.Hits[j].ID, normResultDot.Hits[j].ID) + } + } + } + } + } else if (normResultCosine == nil && normResultDot != nil) || + (normResultCosine != nil && normResultDot == nil) { + t.Fatalf("testcase %d failed: expected result %s, got result %s", i, normResultCosine, normResultDot) + } + } + } +} + type testDocument struct { ID string `json:"id"` Content string `json:"content"` From 764747e81a1596e20b3962a535fb311aace9b13d Mon Sep 17 00:00:00 2001 From: Likith B Date: Fri, 13 Feb 2026 13:34:10 +0530 Subject: [PATCH 3/5] Removed test file --- search_binary_test.go | 731 ------------------------------------------ 1 file changed, 731 deletions(-) delete mode 100644 search_binary_test.go diff --git a/search_binary_test.go b/search_binary_test.go deleted file mode 100644 index 4c34b8a64..000000000 --- a/search_binary_test.go +++ /dev/null @@ -1,731 +0,0 @@ -// Copyright (c) 2023 Couchbase, Inc. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//go:build vectors -// +build vectors - -package bleve - -import ( - "bufio" - "encoding/json" - "fmt" - "math/rand" - "os" - "strings" - "testing" - "time" - - "github.com/blevesearch/bleve/v2/mapping" - index "github.com/blevesearch/bleve_index_api" -) - -// var datasetPath = "/Users/likithb/Desktop/Code/MB-62985/debug/dataset/cohere.jsonl" -// var parentPath = "/Users/likithb/Desktop/Code/MB-62985/indexes" -// var queryPath = "/Users/likithb/Desktop/Code/MB-62985/debug/queries1k.jsonl" - -// func TestBuild500k(t *testing.T) { -// f, err := os.Open(datasetPath) -// if err != nil { -// t.Fatalf("open dataset: %v", err) -// } -// defer f.Close() - -// scanner := bufio.NewScanner(f) -// buf := make([]byte, 0, 1024*1024) -// scanner.Buffer(buf, 128*1024*1024) -// dims := 768 - -// indexMapping := mapping.NewIndexMapping() - -// // Recall-optimized vector field mapping -// vecFieldFloatRecall := mapping.NewVectorFieldMapping() -// vecFieldFloatRecall.Similarity = index.CosineSimilarity -// vecFieldFloatRecall.Dims = dims -// vecFieldFloatRecall.VectorIndexOptimizedFor = index.IndexOptimizedForRecall - -// indexMapping.DefaultMapping.AddFieldMappingsAt("recall", vecFieldFloatRecall) - -// // Latency-optimized vector field mapping -// vecFieldFloatLatency := mapping.NewVectorFieldMapping() -// vecFieldFloatLatency.Similarity = index.CosineSimilarity -// vecFieldFloatLatency.Dims = dims -// vecFieldFloatLatency.VectorIndexOptimizedFor = index.IndexOptimizedForLatency - -// indexMapping.DefaultMapping.AddFieldMappingsAt("latency", vecFieldFloatLatency) - -// // Memory-efficient vector field mapping -// vecFieldFloatMemory := mapping.NewVectorFieldMapping() -// vecFieldFloatMemory.Similarity = index.CosineSimilarity -// vecFieldFloatMemory.Dims = dims -// vecFieldFloatMemory.VectorIndexOptimizedFor = index.IndexOptimizedForMemoryEfficient - -// indexMapping.DefaultMapping.AddFieldMappingsAt("memory", vecFieldFloatMemory) - -// // Binary-optimized vector field mapping -// vecFieldBinary := mapping.NewVectorFieldMapping() -// vecFieldBinary.Similarity = index.CosineSimilarity -// vecFieldBinary.Dims = dims -// vecFieldBinary.VectorIndexOptimizedFor = index.IndexOptimizedForBinary - -// indexMapping.DefaultMapping.AddFieldMappingsAt("binary", vecFieldBinary) - -// indexPath := parentPath + "/test6-50k" -// if err := os.RemoveAll(indexPath); err != nil { -// t.Fatalf("remove index path: %v", err) -// } - -// idx, err := New(indexPath, indexMapping) -// if err != nil { -// t.Fatalf("create index: %v", err) -// } - -// defer idx.Close() - -// batchSize := 500 -// batch := idx.NewBatch() -// numDocs := 50000 -// count := 0 - -// for scanner.Scan() { -// if count%batchSize == 0 && count > 0 { -// if err := idx.Batch(batch); err != nil { -// t.Fatalf("index batch at doc %d: %v", count, err) -// } -// batch = idx.NewBatch() -// fmt.Printf("indexed %d documents\n", count) -// } -// if count >= numDocs { -// break -// } -// line := 
strings.TrimSpace(scanner.Text()) -// if line == "" { -// continue -// } -// _, vec, ok := parseDoc(line) -// if !ok || len(vec) != dims { -// t.Fatalf("invalid doc at line %d", count) -// continue -// } -// if err := batch.Index( -// fmt.Sprintf("doc-%d", count), -// map[string]interface{}{ -// "recall": vec, -// "latency": vec, -// "memory": vec, -// "binary": vec, -// }, -// ); err != nil { -// t.Fatalf("index doc %d: %v", count, err) -// } -// count++ -// } -// if err := scanner.Err(); err != nil { -// t.Fatalf("read dataset: %v", err) -// } -// if err := idx.Batch(batch); err != nil { -// t.Fatalf("index final batch: %v", err) -// } - -// time.Sleep(15 * time.Second) -// } - -// func TestGenQueries500k(t *testing.T) { - -// f, err := os.Open(datasetPath) -// if err != nil { -// t.Fatalf("open dataset: %v", err) -// } -// defer f.Close() - -// scanner := bufio.NewScanner(f) -// buf := make([]byte, 0, 1024*1024) -// scanner.Buffer(buf, 128*1024*1024) - -// if err := os.RemoveAll(queryPath); err != nil { -// t.Fatalf("remove index path: %v", err) -// } -// queriesFile, err := os.Create(queryPath) -// if err != nil { -// t.Fatalf("create queries file: %v", err) -// } -// defer queriesFile.Close() -// writer := bufio.NewWriter(queriesFile) -// numQueries := 2000 - -// // Read 500 vectors and pick one query vector at random and repeat till numQueries or dataset ends -// batchSize := 500 -// var batch []map[string]interface{} -// count := 0 -// queriesGenerated := 0 - -// for scanner.Scan() && queriesGenerated < numQueries { -// line := strings.TrimSpace(scanner.Text()) -// if line == "" { -// continue -// } - -// var obj map[string]interface{} -// if err := json.Unmarshal([]byte(line), &obj); err != nil { -// continue -// } - -// batch = append(batch, obj) - -// // Process batch when it reaches batchSize -// if len(batch) == batchSize { -// if len(batch) > 0 { -// // Pick a random vector from the batch -// randomIdx := rand.Intn(len(batch)) -// queryObj := batch[randomIdx] - -// // Write the query to file -// queryBytes, err := json.Marshal(queryObj) -// if err != nil { -// batch = nil -// count++ -// continue -// } - -// if _, err := writer.WriteString(string(queryBytes) + "\n"); err != nil { -// t.Fatalf("write query: %v", err) -// } - -// queriesGenerated++ -// if queriesGenerated%100 == 0 { -// fmt.Printf("Generated query %d\tCount %d\n", queriesGenerated, count) -// } -// } - -// batch = nil -// } - -// count++ -// } - -// // Process remaining batch at end of file -// if len(batch) > 0 && queriesGenerated < numQueries { -// randomIdx := rand.Intn(len(batch)) -// queryObj := batch[randomIdx] - -// queryBytes, err := json.Marshal(queryObj) -// if err == nil { -// if _, err := writer.WriteString(string(queryBytes) + "\n"); err != nil { -// t.Fatalf("write query: %v", err) -// } -// fmt.Printf("Generated query %d\tCount %d\n", queriesGenerated+1, count) -// queriesGenerated++ -// } -// } - -// if err := scanner.Err(); err != nil { -// t.Fatalf("read dataset: %v", err) -// } - -// if err := writer.Flush(); err != nil { -// t.Fatalf("flush queries file: %v", err) -// } -// } - -// func TestRecall500k(t *testing.T) { - -// queryVecs := readQueryVecs(t) - -// indexPath := parentPath + "/test6-50k" - -// idx, err := Open(indexPath) -// if err != nil { -// t.Fatalf("open index: %v", err) -// } -// defer idx.Close() - -// totalRecall := 0.0 -// for i, qv := range queryVecs { -// srRec := NewSearchRequest(NewMatchNoneQuery()) -// srRec.AddKNN("recall", qv, 100, 1.0) -// srRec.Size = 100 - -// 
hitsRecall, err := idx.Search(srRec) -// if err != nil { -// t.Fatalf("search recall: %v", err) -// } - -// srBin := NewSearchRequest(NewMatchNoneQuery()) -// srBin.AddKNN("binary", qv, 100, 1.0) -// srBin.Size = 100 - -// hitsBin, err := idx.Search(srBin) -// if err != nil { -// t.Fatalf("search binary: %v", err) -// } - -// recallSet := make(map[string]struct{}) -// for _, hit := range hitsRecall.Hits { -// recallSet[hit.ID] = struct{}{} -// } - -// matching := 0 -// for _, hit := range hitsBin.Hits { -// if _, found := recallSet[hit.ID]; found { -// matching++ -// } -// } - -// recallRate := float64(matching) / float64(len(hitsRecall.Hits)) -// fmt.Printf("Query - %d\tRecall Rate - %f\n", i, recallRate) -// totalRecall += recallRate -// } - -// avgRecall := totalRecall / float64(len(queryVecs)) -// fmt.Printf("Average Recall Rate over %d queries: %f\n", len(queryVecs), avgRecall) -// } - -// func BenchmarkSearchBinary500k(b *testing.B) { -// queryVecs := readQueryVecsB(b) - -// indexPath := parentPath + "/test1-500k" -// idx, err := Open(indexPath) -// if err != nil { -// b.Fatalf("open index: %v", err) -// } -// defer idx.Close() - -// // Pre-build search requests -// reqs := make([]*SearchRequest, len(queryVecs)) -// for i, qv := range queryVecs { -// sr := NewSearchRequest(NewMatchNoneQuery()) -// sr.AddKNN("binary", qv, 100, 1.0) -// sr.Size = 100 -// reqs[i] = sr -// } - -// b.ResetTimer() - -// for i := 0; i < 1000; i++ { -// req := reqs[i] - -// _, err := idx.Search(req) -// if err != nil { -// b.Fatalf("search binary: %v", err) -// } -// } -// } - -// func readQueryVecs(t *testing.T) [][]float32 { -// f, err := os.Open(queryPath) -// if err != nil { -// t.Fatalf("open queries file: %v", err) -// } -// defer f.Close() - -// scanner := bufio.NewScanner(f) -// var queryVecs [][]float32 - -// for scanner.Scan() { -// line := strings.TrimSpace(scanner.Text()) -// if line == "" { -// continue -// } -// _, vec, ok := parseDoc(line) -// if !ok { -// t.Fatalf("invalid query line: %s", line) -// } -// queryVecs = append(queryVecs, vec) -// } -// if err := scanner.Err(); err != nil { -// t.Fatalf("read queries file: %v", err) -// } -// return queryVecs -// } - -// func readQueryVecsB(t *testing.B) [][]float32 { -// f, err := os.Open(queryPath) -// if err != nil { -// t.Fatalf("open queries file: %v", err) -// } -// defer f.Close() - -// scanner := bufio.NewScanner(f) -// var queryVecs [][]float32 - -// for scanner.Scan() { -// line := strings.TrimSpace(scanner.Text()) -// if line == "" { -// continue -// } -// _, vec, ok := parseDoc(line) -// if !ok { -// t.Fatalf("invalid query line: %s", line) -// } -// queryVecs = append(queryVecs, vec) -// } -// if err := scanner.Err(); err != nil { -// t.Fatalf("read queries file: %v", err) -// } -// return queryVecs -// } - -// func parseDoc(line string) (string, []float32, bool) { -// var obj map[string]interface{} -// if err := json.Unmarshal([]byte(line), &obj); err != nil { -// return "", nil, false -// } -// // txt, _ := obj["text"].(string) -// vec := anyToFloat32Slice(obj["emb"]) -// if len(vec) == 0 { -// return "", nil, false -// } -// return "", vec, true -// } - -// Helper function to convert various numeric types to []float32 -func anyToFloat32Slice(v interface{}) []float32 { - switch arr := v.(type) { - case []interface{}: - out := make([]float32, 0, len(arr)) - for _, it := range arr { - switch n := it.(type) { - case float64: - out = append(out, float32(n)) - case float32: - out = append(out, n) - case int: - out = append(out, 
float32(n)) - case int64: - out = append(out, float32(n)) - default: - return nil - } - } - return out - case []float64: - out := make([]float32, len(arr)) - for i := range arr { - out[i] = float32(arr[i]) - } - return out - case []float32: - return arr - default: - return nil - } -} - -var datasetPath = "/Users/likithb/Desktop/Code/MB-62985/debug/dataset/cohere.jsonl" -var queryPath = "/Users/likithb/Desktop/Code/MB-62985/debug/queries.json" -var groundTruthPath = "/Users/likithb/Desktop/Code/MB-62985/debug/groundTruth.json" -var indexDirectory = "/Users/likithb/Desktop/Code/MB-62985/indexes" - -func TestBinaryVectorSegment(t *testing.T) { - - numDocs := 1000 - dims := 768 - ver := 3 - vectors := loadVectors(t, datasetPath, numDocs) - // vectors := make([][]float32, numDocs) - // for i := 0; i < numDocs; i++ { - // vectors[i] = make([]float32, dims) - // for j := 0; j < dims; j++ { - // vectors[i][j] = randomFloat32(-1.0, 1.0) - // } - // } - // fmt.Printf("loaded %d vectors from %s\n\n", len(vectors), datasetPath) - - indexMapping := buildIndexMapping(dims) - - indexPath := fmt.Sprintf("%s/%d-%dk", indexDirectory, ver, numDocs/1000) - if err := os.RemoveAll(indexPath); err != nil { - t.Fatalf("remove index path: %v", err) - } - - idx, err := New(indexPath, indexMapping) - if err != nil { - t.Fatalf("create index: %v", err) - } - defer idx.Close() - fmt.Printf("created index\n\n") - - batchSize := 1000 - batch := idx.NewBatch() - count := 0 - - for _, vec := range vectors { - if count%batchSize == 0 && count > 0 { - if err := idx.Batch(batch); err != nil { - t.Fatalf("index batch at doc %d: %v", count, err) - } - batch = idx.NewBatch() - fmt.Printf("indexed %d documents\n", count) - } - - if err := batch.Index( - fmt.Sprintf("doc-%d", count), - map[string]interface{}{ - "binary": vec, - "recall": vec, - }, - ); err != nil { - t.Fatalf("index doc %d: %v", count, err) - } - count++ - } - if err := idx.Batch(batch); err != nil { - t.Fatalf("index final batch: %v", err) - } - fmt.Printf("indexed total %d documents\n\n", count) - vectors = nil - time.Sleep(5 * time.Second) - - // idx, err := Open(indexPath) - // if err != nil { - // t.Fatalf("open index: %v", err) - // } - // defer idx.Close() - // fmt.Printf("opened index from %s\n\n", indexPath) - - queries := loadQueries(t, queryPath) - // queries := make([][]float32, 10000) - // for i := 0; i < 10000; i++ { - // queries[i] = make([]float32, dims) - // for j := 0; j < dims; j++ { - // queries[i][j] = randomFloat32(-1.0, 1.0) - // } - // } - fmt.Printf("loaded queries from %s\n\n", queryPath) - - // groundTruths := loadGroundTruths(t, groundTruthPath) - // fmt.Printf("loaded ground truths from %s\n\n", groundTruthPath) - totalRecall := 0 - totalTimeBinary := 0 - totalTimeRecall := 0 - k := int64(3) - for i, qv := range queries { - start := time.Now() - sr := NewSearchRequest(NewMatchNoneQuery()) - sr.AddKNN("binary", qv, k, 1.0) - sr.Size = int(k) - - hits, err := idx.Search(sr) - if err != nil { - t.Fatalf("search binary: %v", err) - } - elapsed := time.Since(start) - totalTimeBinary += int(elapsed) - resultSet := make(map[int64]struct{}) - for _, hit := range hits.Hits { - var docNum int64 - _, err := fmt.Sscanf(hit.ID, "doc-%d", &docNum) - if err != nil { - t.Fatalf("parse doc ID %s: %v", hit.ID, err) - } - resultSet[docNum] = struct{}{} - } - - start = time.Now() - sr = NewSearchRequest(NewMatchNoneQuery()) - sr.AddKNN("recall", qv, k, 1.0) - sr.Size = int(k) - - hitsRec, err := idx.Search(sr) - if err != nil { - t.Fatalf("search recall: 
%v", err) - } - elapsed = time.Since(start) - totalTimeRecall += int(elapsed) - - groundTruths := make(map[int64]struct{}) - for _, hit := range hitsRec.Hits { - var docNum int64 - _, err := fmt.Sscanf(hit.ID, "doc-%d", &docNum) - if err != nil { - t.Fatalf("parse doc ID %s: %v", hit.ID, err) - } - groundTruths[docNum] = struct{}{} - } - matching := 0 - for docNum := range groundTruths { - if _, found := resultSet[docNum]; found { - matching++ - } - } - - // matching := 0 - // for docNum := range groundTruths[i] { - // if _, found := resultSet[docNum]; found { - // matching++ - // } - // } - - totalRecall += matching - - if i%1000 == 0 { - fmt.Printf("Processed %d queries\n", i+1) - fmt.Printf("Average Recall so far: %f\n", float64(totalRecall)/float64((i+1)*int(k))) - fmt.Printf("Average Time per Query (Binary) so far: %v\n", time.Duration(totalTimeBinary/(i+1))) - fmt.Printf("Average Time per Query (Recall) so far: %v\n", time.Duration(totalTimeRecall/(i+1))) - if i == 0 { - totalTimeBinary = 0 - totalTimeRecall = 0 - } - } - } - - avgRecall := float64(totalRecall) / float64(len(queries)*int(k)) - fmt.Printf("Average Recall over %d queries: %f\n", len(queries), avgRecall) - avgTime := time.Duration(totalTimeBinary / (len(queries) - 1)) - fmt.Printf("Average Time per Query (Binary): %v\n", avgTime) - avgTime = time.Duration(totalTimeRecall / (len(queries) - 1)) - fmt.Printf("Average Time per Query (Recall): %v\n", avgTime) -} - -func loadGroundTruths(t *testing.T, path string) []map[int64]struct{} { - var groundTruth []map[int64]struct{} - gtFile, err := os.Open(path) - if err != nil { - t.Fatalf("open ground truth file: %v", err) - } - defer gtFile.Close() - - decoder := json.NewDecoder(gtFile) - err = decoder.Decode(&groundTruth) - if err != nil { - t.Fatalf("decode ground truth from file: %v", err) - } - return groundTruth -} - -func randomFloat32(min, max float32) float32 { - return min + (max-min)*rand.Float32() -} - -func loadQueries(t *testing.T, path string) [][]float32 { - var queries [][]float32 - queryFile, err := os.Open(path) - if err != nil { - t.Fatalf("open query file: %v", err) - } - defer queryFile.Close() - - decoder := json.NewDecoder(queryFile) - err = decoder.Decode(&queries) - if err != nil { - t.Fatalf("decode queries from file: %v", err) - } - return queries -} - -func buildIndexMapping(dims int) *mapping.IndexMappingImpl { - indexMapping := mapping.NewIndexMapping() - - // Recall-optimized vector field mapping - vecFieldFloatRecall := mapping.NewVectorFieldMapping() - vecFieldFloatRecall.Similarity = index.CosineSimilarity - vecFieldFloatRecall.Dims = dims - vecFieldFloatRecall.VectorIndexOptimizedFor = index.IndexOptimizedForRecall - - indexMapping.DefaultMapping.AddFieldMappingsAt("recall", vecFieldFloatRecall) - - // Latency-optimized vector field mapping - vecFieldFloatLatency := mapping.NewVectorFieldMapping() - vecFieldFloatLatency.Similarity = index.CosineSimilarity - vecFieldFloatLatency.Dims = dims - vecFieldFloatLatency.VectorIndexOptimizedFor = index.IndexOptimizedForLatency - - indexMapping.DefaultMapping.AddFieldMappingsAt("latency", vecFieldFloatLatency) - - // Memory-efficient vector field mapping - vecFieldFloatMemory := mapping.NewVectorFieldMapping() - vecFieldFloatMemory.Similarity = index.CosineSimilarity - vecFieldFloatMemory.Dims = dims - vecFieldFloatMemory.VectorIndexOptimizedFor = index.IndexOptimizedForMemoryEfficient - - indexMapping.DefaultMapping.AddFieldMappingsAt("memory", vecFieldFloatMemory) - - // Binary-optimized vector field 
mapping - vecFieldBinary := mapping.NewVectorFieldMapping() - vecFieldBinary.Similarity = index.CosineSimilarity - vecFieldBinary.Dims = dims - vecFieldBinary.VectorIndexOptimizedFor = index.IndexOptimizedWithBivfFlat - - indexMapping.DefaultMapping.AddFieldMappingsAt("binary", vecFieldBinary) - - return indexMapping -} - -func loadVectors(t *testing.T, path string, maxDocs int) [][]float32 { - f, err := os.Open(path) - if err != nil { - t.Fatalf("open dataset: %v", err) - } - defer f.Close() - - scanner := bufio.NewScanner(f) - buf := make([]byte, 0, 1024*1024) - scanner.Buffer(buf, 128*1024*1024) - - var vectors [][]float32 - count := 0 - - for scanner.Scan() && count < maxDocs { - line := strings.TrimSpace(scanner.Text()) - if line == "" { - continue - } - - var obj map[string]interface{} - if err := json.Unmarshal([]byte(line), &obj); err != nil { - continue - } - - vec := anyToFloat32Slice(obj["emb"]) - vectors = append(vectors, vec) - count++ - } - if err := scanner.Err(); err != nil { - t.Fatalf("read dataset: %v", err) - } - - return vectors -} - -func TestCreateDistribution(t *testing.T) { - numVectors := 100000 - vectors := loadVectors(t, datasetPath, numVectors) - buckets := make([]int, 40) - - for i := 0; i < numVectors; i++ { - for j := 0; j < 768; j++ { - val := vectors[i][j] - bucketIdx := int((val + 2.0) / 0.1) - if bucketIdx < 0 { - bucketIdx = 0 - } else if bucketIdx >= len(buckets) { - bucketIdx = len(buckets) - 1 - } - buckets[bucketIdx]++ - } - } - - for i, count := range buckets { - lowerBound := -2.0 + float32(i)*0.1 - upperBound := lowerBound + 0.1 - fmt.Printf("Bucket %2d [%.1f, %.1f): %d\n", i, lowerBound, upperBound, count) - } - total := 0 - for _, count := range buckets { - total += count - } - fmt.Printf("Total values: %d\n", total) - fmt.Printf("Expected: %d\n", numVectors*768) -} From 19f9a4dc66febe24a81963574953a82ce27ee2a7 Mon Sep 17 00:00:00 2001 From: Likith B Date: Fri, 13 Feb 2026 15:32:34 +0530 Subject: [PATCH 4/5] Added dimensionality check for binary fields --- mapping/mapping_vectors.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mapping/mapping_vectors.go b/mapping/mapping_vectors.go index c6c618043..c703bc643 100644 --- a/mapping/mapping_vectors.go +++ b/mapping/mapping_vectors.go @@ -244,6 +244,10 @@ func validateVectorFieldAlias(field *FieldMapping, path []string, if effectiveOptimizedFor == "" { effectiveOptimizedFor = index.DefaultIndexOptimization } + if effectiveOptimizedFor == index.IndexOptimizedWithBivfFlat && field.Dims%8 != 0 { + return fmt.Errorf("field: '%s', vector dimensions for bivf-flat must be a multiple"+ + " of 8", effectiveFieldName) + } // # If alias is present, validate the field options as per the alias. // note: reading from a nil map is safe From 467ba8d4ae880a2d9fad8f04b05a8c56861bed84 Mon Sep 17 00:00:00 2001 From: Likith B Date: Mon, 16 Feb 2026 16:11:20 +0530 Subject: [PATCH 5/5] Addressing review comments --- document/field_vector.go | 3 +++ mapping/mapping_vectors.go | 18 ++++++++---------- search/query/knn.go | 3 +++ 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/document/field_vector.go b/document/field_vector.go index 5157c557c..4e142b03b 100644 --- a/document/field_vector.go +++ b/document/field_vector.go @@ -114,6 +114,9 @@ func NewVectorFieldWithIndexingOptions(name string, arrayPositions []uint64, // skip freq/norms for vector field options |= index.SkipFreqNorm + // bivf-flat indexes only supports hamming distance for the primary + // binary index. 
Similarity here is used for the backing flat index, + // which is set to cosine similarity for recall reasons if vectorIndexOptimizedFor == index.IndexOptimizedWithBivfFlat { similarity = index.CosineSimilarity } diff --git a/mapping/mapping_vectors.go b/mapping/mapping_vectors.go index c703bc643..5acb90b98 100644 --- a/mapping/mapping_vectors.go +++ b/mapping/mapping_vectors.go @@ -151,8 +151,9 @@ func (fm *FieldMapping) processVector(propertyMightBeVector interface{}, if vectorIndexOptimizedFor == "" { vectorIndexOptimizedFor = index.DefaultIndexOptimization } - // BIVF-Flat index needs cosine similarity for correct scoring - // regardless of the similarity metric specified in the mapping. + // bivf-flat indexes only support hamming distance for the primary + // binary index. Similarity here is used for the backing flat index, + // which is set to cosine similarity for recall reasons if vectorIndexOptimizedFor == index.IndexOptimizedWithBivfFlat { similarity = index.CosineSimilarity } @@ -190,8 +191,9 @@ func (fm *FieldMapping) processVectorBase64(propertyMightBeVectorBase64 interfac if vectorIndexOptimizedFor == "" { vectorIndexOptimizedFor = index.DefaultIndexOptimization } - // BIVF-Flat index needs cosine similarity for correct scoring - // regardless of the similarity metric specified in the mapping. + // bivf-flat indexes only support hamming distance for the primary + // binary index. Similarity here is used for the backing flat index, + // which is set to cosine similarity for recall reasons if vectorIndexOptimizedFor == index.IndexOptimizedWithBivfFlat { similarity = index.CosineSimilarity } @@ -244,10 +246,6 @@ func validateVectorFieldAlias(field *FieldMapping, path []string, if effectiveOptimizedFor == "" { effectiveOptimizedFor = index.DefaultIndexOptimization } - if effectiveOptimizedFor == index.IndexOptimizedWithBivfFlat && field.Dims%8 != 0 { - return fmt.Errorf("field: '%s', vector dimensions for bivf-flat must be a multiple"+ - " of 8", effectiveFieldName) - } // # If alias is present, validate the field options as per the alias. // note: reading from a nil map is safe @@ -303,9 +301,9 @@ func validateVectorFieldAlias(field *FieldMapping, path []string, effectiveOptimizedFor, reflect.ValueOf(index.SupportedVectorIndexOptimizations).MapKeys()) } - // BIVF-Flat index optimization requires vector dimensions to be a multiple of 8 + // bivf-flat's primary index requires vector dimensionality to be a multiple of 8 if effectiveOptimizedFor == index.IndexOptimizedWithBivfFlat && field.Dims%8 != 0 { - return fmt.Errorf("field: '%s', invalid vector dimension: %d for optimization with BIVF-Flat,"+ + return fmt.Errorf("field: '%s', incompatible vector dimensionality for BIVF-FLAT: %d,"+ " dimension should be a multiple of 8", effectiveFieldName, field.Dims) } diff --git a/search/query/knn.go b/search/query/knn.go index e0389d43e..e026979af 100644 --- a/search/query/knn.go +++ b/search/query/knn.go @@ -84,6 +84,9 @@ func (q *KNNQuery) Searcher(ctx context.Context, i index.IndexReader, if q.K <= 0 || len(q.Vector) == 0 { return nil, fmt.Errorf("k must be greater than 0 and vector must be non-empty") } + // bivf-flat indexes only support hamming distance for the primary + // binary index. Similarity here is used for the backing flat index, + // which is set to cosine similarity for recall reasons if fieldMapping.VectorIndexOptimizedFor == index.IndexOptimizedWithBivfFlat { similarityMetric = index.CosineSimilarity }
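
Usage sketch (illustrative only, not part of the patches above): the snippet below exercises the BIVF-Flat optimization end to end using the bleve APIs that the tests in this series already rely on — a vector field mapping with VectorIndexOptimizedFor set to index.IndexOptimizedWithBivfFlat (dimensions must be a multiple of 8), indexing a document, and running a kNN search against that field. The field name "vec", the 128-dimensional vectors, and the index path are assumptions made for illustration; building it requires the "vectors" build tag, and the declared similarity is overridden to cosine for such fields by this patch series.

//go:build vectors

package main

import (
	"fmt"
	"log"

	bleve "github.com/blevesearch/bleve/v2"
	"github.com/blevesearch/bleve/v2/mapping"
	index "github.com/blevesearch/bleve_index_api"
)

func main() {
	dims := 128 // must be a multiple of 8 for IndexOptimizedWithBivfFlat

	// Vector field mapping optimized with BIVF-Flat; the similarity declared
	// here is forced to cosine for the backing flat index by this patch series.
	vecFM := mapping.NewVectorFieldMapping()
	vecFM.Dims = dims
	vecFM.Similarity = index.EuclideanDistance
	vecFM.VectorIndexOptimizedFor = index.IndexOptimizedWithBivfFlat

	indexMapping := bleve.NewIndexMapping()
	indexMapping.DefaultMapping.AddFieldMappingsAt("vec", vecFM)

	idx, err := bleve.New("example-bivf.bleve", indexMapping)
	if err != nil {
		log.Fatalf("create index: %v", err)
	}
	defer idx.Close()

	// Index a single document carrying an illustrative vector.
	vec := make([]float32, dims)
	for i := range vec {
		vec[i] = float32(i%7) - 3.0
	}
	if err := idx.Index("doc-1", map[string]interface{}{"vec": vec}); err != nil {
		log.Fatalf("index doc: %v", err)
	}

	// kNN search against the BIVF-Flat field; the query vector is normalized
	// internally because the effective similarity is cosine.
	sr := bleve.NewSearchRequest(bleve.NewMatchNoneQuery())
	sr.AddKNN("vec", vec, 3, 1.0)
	sr.Size = 3

	res, err := idx.Search(sr)
	if err != nil {
		log.Fatalf("search: %v", err)
	}
	for _, hit := range res.Hits {
		fmt.Printf("%s: %f\n", hit.ID, hit.Score)
	}
}

A mapping whose Dims is not divisible by 8 would instead be rejected by validateVectorFieldAlias with the dimensionality error introduced in PATCH 4/5 and reworded in PATCH 5/5.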