From f6170c317f14ecd6fe32e3d65105ce757deec07b Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Wed, 6 Nov 2024 15:52:49 +0530 Subject: [PATCH 01/35] real first draft --- index/scorch/snapshot_index.go | 91 +++++++++++++++++++++++++++++- index/scorch/snapshot_index_str.go | 82 +++++++++++++++++++++++++++ mapping/document.go | 11 ++++ mapping/field.go | 2 + mapping/index.go | 37 ++++++++++++ mapping/synonym.go | 56 ++++++++++++++++++ pre_search.go | 73 ++++++++++++++++++++++-- 7 files changed, 345 insertions(+), 7 deletions(-) create mode 100644 index/scorch/snapshot_index_str.go create mode 100644 mapping/synonym.go diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 79840a41f..685f1c921 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -60,6 +60,7 @@ var reflectStaticSizeIndexSnapshot int // exported variable, or at the index level by setting the FieldTFRCacheThreshold // in the kvConfig. var DefaultFieldTFRCacheThreshold uint64 = 10 +var DefaultSynonymTermReaderCacheThreshold uint64 = 10 func init() { var is interface{} = IndexSnapshot{} @@ -87,8 +88,9 @@ type IndexSnapshot struct { m sync.Mutex // Protects the fields that follow. refs int64 - m2 sync.Mutex // Protects the fields that follow. - fieldTFRs map[string][]*IndexSnapshotTermFieldReader // keyed by field, recycled TFR's + m2 sync.Mutex // Protects the fields that follow. + fieldTFRs map[string][]*IndexSnapshotTermFieldReader // keyed by field, recycled TFR's + synonymTermReaders map[string][]*IndexSnapshotSynonymTermReader // keyed by thesaurus name, recycled thesaurus readers } func (i *IndexSnapshot) Segments() []*SegmentSnapshot { @@ -649,6 +651,15 @@ func (is *IndexSnapshot) getFieldTFRCacheThreshold() uint64 { return DefaultFieldTFRCacheThreshold } +func (is *IndexSnapshot) getSynonymTermReaderCacheThreshold() uint64 { + if is.parent.config != nil { + if _, ok := is.parent.config["SynonymTermReaderCacheThreshold"]; ok { + return is.parent.config["SynonymTermReaderCacheThreshold"].(uint64) + } + } + return DefaultSynonymTermReaderCacheThreshold +} + func (is *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReader) { if !tfr.recycle { // Do not recycle an optimized unadorned term field reader (used for @@ -677,6 +688,25 @@ func (is *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReade is.m2.Unlock() } +func (is *IndexSnapshot) recycleSynonymTermReader(str *IndexSnapshotSynonymTermReader) { + is.parent.rootLock.RLock() + obsolete := is.parent.root != is + is.parent.rootLock.RUnlock() + if obsolete { + // if we're not the current root (mutations happened), don't bother recycling + return + } + + is.m2.Lock() + if is.synonymTermReaders == nil { + is.synonymTermReaders = map[string][]*IndexSnapshotSynonymTermReader{} + } + if uint64(len(is.synonymTermReaders[str.name])) < is.getSynonymTermReaderCacheThreshold() { + is.synonymTermReaders[str.name] = append(is.synonymTermReaders[str.name], str) + } + is.m2.Unlock() +} + func docNumberToBytes(buf []byte, in uint64) []byte { if len(buf) != 8 { if cap(buf) >= 8 { @@ -956,3 +986,60 @@ func (is *IndexSnapshot) CloseCopyReader() error { // close the index snapshot normally return is.Close() } + +func (is *IndexSnapshot) allocSynonymTermReader(name string) (str *IndexSnapshotSynonymTermReader) { + is.m2.Lock() + if is.synonymTermReaders != nil { + strs := is.synonymTermReaders[name] + last := len(strs) - 1 + if last >= 0 { + str = strs[last] + strs[last] = nil + 
is.synonymTermReaders[name] = strs[:last] + is.m2.Unlock() + return + } + } + is.m2.Unlock() + return &IndexSnapshotSynonymTermReader{} +} + +func (is *IndexSnapshot) SynonymTermReader(ctx context.Context, thesaurusName string, term []byte) (index.SynonymTermReader, error) { + rv := is.allocSynonymTermReader(thesaurusName) + + rv.name = thesaurusName + rv.snapshot = is + if rv.postings == nil { + rv.postings = make([]segment.SynonymsList, len(is.segment)) + } + if rv.iterators == nil { + rv.iterators = make([]segment.SynonymsIterator, len(is.segment)) + } + rv.segmentOffset = 0 + + if rv.thesauri == nil { + rv.thesauri = make([]segment.Thesaurus, len(is.segment)) + for i, s := range is.segment { + if synSeg, ok := s.segment.(segment.SynonymSegment); ok { + thes, err := synSeg.Thesaurus(thesaurusName) + if err != nil { + return nil, err + } + rv.thesauri[i] = thes + } + } + } + + for i, s := range is.segment { + if _, ok := s.segment.(segment.SynonymSegment); ok { + pl, err := rv.thesauri[i].SynonymsList(term, s.deleted, rv.postings[i]) + if err != nil { + return nil, err + } + rv.postings[i] = pl + + rv.iterators[i] = pl.Iterator(rv.iterators[i]) + } + } + return rv, nil +} diff --git a/index/scorch/snapshot_index_str.go b/index/scorch/snapshot_index_str.go new file mode 100644 index 000000000..5dee83770 --- /dev/null +++ b/index/scorch/snapshot_index_str.go @@ -0,0 +1,82 @@ +// Copyright (c) 2024 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package scorch + +import ( + "reflect" + + "github.com/blevesearch/bleve/v2/size" + segment "github.com/blevesearch/scorch_segment_api/v2" +) + +var reflectStaticSizeIndexSnapshotSynonymTermReader int + +func init() { + var istr IndexSnapshotSynonymTermReader + reflectStaticSizeIndexSnapshotSynonymTermReader = int(reflect.TypeOf(istr).Size()) +} + +type IndexSnapshotSynonymTermReader struct { + name string + snapshot *IndexSnapshot + thesauri []segment.Thesaurus + postings []segment.SynonymsList + iterators []segment.SynonymsIterator + segmentOffset int +} + +func (i *IndexSnapshotSynonymTermReader) Size() int { + sizeInBytes := reflectStaticSizeIndexSnapshotSynonymTermReader + size.SizeOfPtr + + len(i.name) + + for _, thesaurus := range i.thesauri { + sizeInBytes += thesaurus.Size() + } + + for _, postings := range i.postings { + sizeInBytes += postings.Size() + } + + for _, iterator := range i.iterators { + sizeInBytes += iterator.Size() + } + + return sizeInBytes +} + +func (i *IndexSnapshotSynonymTermReader) Next() (string, error) { + // find the next hit + for i.segmentOffset < len(i.iterators) { + if i.iterators[i.segmentOffset] != nil { + next, err := i.iterators[i.segmentOffset].Next() + if err != nil { + return "", err + } + if next != nil { + synTerm := next.Term() + return synTerm, nil + } + i.segmentOffset++ + } + } + return "", nil +} + +func (i *IndexSnapshotSynonymTermReader) Close() error { + if i.snapshot != nil { + i.snapshot.recycleSynonymTermReader(i) + } + return nil +} diff --git a/mapping/document.go b/mapping/document.go index 847326e41..5d70af912 100644 --- a/mapping/document.go +++ b/mapping/document.go @@ -112,6 +112,17 @@ func (dm *DocumentMapping) analyzerNameForPath(path string) string { return "" } +// synonymSourceForPath attempts to first find the field +// described by this path, then returns the analyzer +// configured for that field +func (dm *DocumentMapping) synonymSourceForPath(path string) string { + field := dm.fieldDescribedByPath(path) + if field != nil { + return field.SynonymSource + } + return "" +} + func (dm *DocumentMapping) fieldDescribedByPath(path string) *FieldMapping { pathElements := decodePath(path) if len(pathElements) > 1 { diff --git a/mapping/field.go b/mapping/field.go index 5c064fddd..ad5b4f424 100644 --- a/mapping/field.go +++ b/mapping/field.go @@ -80,6 +80,8 @@ type FieldMapping struct { // Applicable to vector fields only - optimization string VectorIndexOptimizedFor string `json:"vector_index_optimized_for,omitempty"` + + SynonymSource string `json:"synonym_source,omitempty"` } // NewTextFieldMapping returns a default field mapping for text diff --git a/mapping/index.go b/mapping/index.go index fe8c96713..94b2cdfa7 100644 --- a/mapping/index.go +++ b/mapping/index.go @@ -54,6 +54,7 @@ type IndexMappingImpl struct { IndexDynamic bool `json:"index_dynamic"` DocValuesDynamic bool `json:"docvalues_dynamic"` CustomAnalysis *customAnalysis `json:"analysis,omitempty"` + SynonymSources map[string]*SynonymSource `json:"synonym_sources,omitempty"` cache *registry.Cache } @@ -186,6 +187,12 @@ func (im *IndexMappingImpl) Validate() error { return err } } + for _, synSource := range im.SynonymSources { + err = synSource.Validate(im.cache) + if err != nil { + return err + } + } return nil } @@ -283,6 +290,14 @@ func (im *IndexMappingImpl) UnmarshalJSON(data []byte) error { if err != nil { return err } + case "synonym_sources": + if im.SynonymSources == nil { + im.SynonymSources = make(map[string]*SynonymSource) + } + err := 
util.UnmarshalJSON(v, &im.SynonymSources) + if err != nil { + return err + } default: invalidKeys = append(invalidKeys, k) } @@ -457,3 +472,25 @@ func (im *IndexMappingImpl) FieldMappingForPath(path string) FieldMapping { func (im *IndexMappingImpl) DefaultSearchField() string { return im.DefaultField } + +func (im *IndexMappingImpl) SynonymSourceForPath(path string) string { + // first we look for explicit mapping on the field + for _, docMapping := range im.TypeMapping { + synonymSource := docMapping.synonymSourceForPath(path) + if synonymSource != "" { + return synonymSource + } + } + + // now try the default mapping + pathMapping, _ := im.DefaultMapping.documentMappingForPath(path) + if pathMapping != nil { + if len(pathMapping.Fields) > 0 { + if pathMapping.Fields[0].SynonymSource != "" { + return pathMapping.Fields[0].SynonymSource + } + } + } + + return "" +} diff --git a/mapping/synonym.go b/mapping/synonym.go new file mode 100644 index 000000000..5065b3b2a --- /dev/null +++ b/mapping/synonym.go @@ -0,0 +1,56 @@ +// Copyright (c) 2024 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mapping + +import ( + "fmt" + + "github.com/blevesearch/bleve/v2/registry" +) + +type SynonymSource struct { + CollectionName string `json:"collection"` + AnalyzerName string `json:"analyzer"` +} + +func (s *SynonymSource) Collection() string { + return s.CollectionName +} + +func (s *SynonymSource) Analyzer() string { + return s.AnalyzerName +} + +func (s *SynonymSource) SetCollection(c string) { + s.CollectionName = c +} + +func (s *SynonymSource) SetAnalyzer(a string) { + s.AnalyzerName = a +} + +func (s *SynonymSource) Validate(c *registry.Cache) error { + if s.CollectionName == "" { + return fmt.Errorf("collection name is required") + } + if s.AnalyzerName == "" { + return fmt.Errorf("analyzer name is required") + } + _, err := c.AnalyzerNamed(s.AnalyzerName) + if err != nil { + return fmt.Errorf("analyzer named '%s' not found", s.AnalyzerName) + } + return nil +} diff --git a/pre_search.go b/pre_search.go index c8c55bfbc..646d25199 100644 --- a/pre_search.go +++ b/pre_search.go @@ -26,6 +26,8 @@ type preSearchResultProcessor interface { finalize(*SearchResult) } +// ----------------------------------------------------------------------------- +// KNN preSearchResultProcessor for handling KNN presearch results type knnPreSearchResultProcessor struct { addFn func(sr *SearchResult, indexName string) finalizeFn func(sr *SearchResult) @@ -44,16 +46,77 @@ func (k *knnPreSearchResultProcessor) finalize(sr *SearchResult) { } // ----------------------------------------------------------------------------- +// Synonym preSearchResultProcessor for handling Synonym presearch results +type synonymPreSearchResultProcessor struct { + addFn func(sr *SearchResult, indexName string) + finalizeFn func(sr *SearchResult) +} -func finalizePreSearchResult(req *SearchRequest, preSearchResult *SearchResult) { - if requestHasKNN(req) { - preSearchResult.Hits = finalizeKNNResults(req, 
preSearchResult.Hits) +func (s *synonymPreSearchResultProcessor) add(sr *SearchResult, indexName string) { + if s.addFn != nil { + s.addFn(sr, indexName) } } +func (s *synonymPreSearchResultProcessor) finalize(sr *SearchResult) { + if s.finalizeFn != nil { + s.finalizeFn(sr) + } +} + +// ----------------------------------------------------------------------------- +// Master struct that can hold any number of presearch result processors +type compositePreSearchResultProcessor struct { + presearchResultProcessors []preSearchResultProcessor +} + +// Implements the add method, which forwards to all the internal processors +func (m *compositePreSearchResultProcessor) add(sr *SearchResult, indexName string) { + for _, p := range m.presearchResultProcessors { + p.add(sr, indexName) + } +} + +// Implements the finalize method, which forwards to all the internal processors +func (m *compositePreSearchResultProcessor) finalize(sr *SearchResult) { + for _, p := range m.presearchResultProcessors { + p.finalize(sr) + } +} + +// ----------------------------------------------------------------------------- +// Function to create the appropriate preSearchResultProcessor(s) func createPreSearchResultProcessor(req *SearchRequest) preSearchResultProcessor { + var processors []preSearchResultProcessor + // Add KNN processor if the request has KNN + if requestHasKNN(req) { + if knnProcessor := newKnnPreSearchResultProcessor(req); knnProcessor != nil { + processors = append(processors, knnProcessor) + } + } + // Add Synonym processor if the request has Synonym + if requestHasSynonym(req) { + if synonymProcessor := newSynonymPreSearchResultProcessor(req); synonymProcessor != nil { + processors = append(processors, synonymProcessor) + } + } + // Return based on the number of processors, optimizing for the common case of 1 processor + // If there are no processors, return nil + switch len(processors) { + case 0: + return nil + case 1: + return processors[0] + default: + return &compositePreSearchResultProcessor{ + presearchResultProcessors: processors, + } + } +} + +// ----------------------------------------------------------------------------- +func finalizePreSearchResult(req *SearchRequest, preSearchResult *SearchResult) { if requestHasKNN(req) { - return newKnnPreSearchResultProcessor(req) + preSearchResult.Hits = finalizeKNNResults(req, preSearchResult.Hits) } - return &knnPreSearchResultProcessor{} // equivalent to nil } From 263e9904c05626267ea6f7f3f6f8cf5c54d2af1a Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Wed, 6 Nov 2024 16:19:29 +0530 Subject: [PATCH 02/35] fix bug --- index/scorch/snapshot_index_str.go | 6 +----- pre_search.go | 6 ------ 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/index/scorch/snapshot_index_str.go b/index/scorch/snapshot_index_str.go index 5dee83770..e1ba60272 100644 --- a/index/scorch/snapshot_index_str.go +++ b/index/scorch/snapshot_index_str.go @@ -39,11 +39,7 @@ type IndexSnapshotSynonymTermReader struct { func (i *IndexSnapshotSynonymTermReader) Size() int { sizeInBytes := reflectStaticSizeIndexSnapshotSynonymTermReader + size.SizeOfPtr + - len(i.name) - - for _, thesaurus := range i.thesauri { - sizeInBytes += thesaurus.Size() - } + len(i.name) + size.SizeOfString for _, postings := range i.postings { sizeInBytes += postings.Size() diff --git a/pre_search.go b/pre_search.go index 646d25199..a539afc35 100644 --- a/pre_search.go +++ b/pre_search.go @@ -94,12 +94,6 @@ func createPreSearchResultProcessor(req *SearchRequest) preSearchResultProcessor 
processors = append(processors, knnProcessor) } } - // Add Synonym processor if the request has Synonym - if requestHasSynonym(req) { - if synonymProcessor := newSynonymPreSearchResultProcessor(req); synonymProcessor != nil { - processors = append(processors, synonymProcessor) - } - } // Return based on the number of processors, optimizing for the common case of 1 processor // If there are no processors, return nil switch len(processors) { From c5fc54851ac77b2cdf741bb0106712b2e2eeede5 Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Wed, 6 Nov 2024 19:19:33 +0530 Subject: [PATCH 03/35] small fix the first draft --- mapping/field.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mapping/field.go b/mapping/field.go index ad5b4f424..08af2dfa7 100644 --- a/mapping/field.go +++ b/mapping/field.go @@ -476,6 +476,11 @@ func (fm *FieldMapping) UnmarshalJSON(data []byte) error { if err != nil { return err } + case "synonym_source": + err := json.Unmarshal(v, &fm.SynonymSource) + if err != nil { + return err + } default: invalidKeys = append(invalidKeys, k) } From 542d34a9dcd89f314ff0954feb934fb3cbd1a461 Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Fri, 15 Nov 2024 15:59:08 +0530 Subject: [PATCH 04/35] glue code for indexing path --- document/document.go | 15 ++++ document/field_synonym.go | 143 ++++++++++++++++++++++++++++++++++++++ error.go | 2 + index.go | 55 +++++++++++++++ mapping/index.go | 17 +++++ mapping/mapping.go | 5 ++ 6 files changed, 237 insertions(+) create mode 100644 document/field_synonym.go diff --git a/document/document.go b/document/document.go index 54fd6d442..0f9591c85 100644 --- a/document/document.go +++ b/document/document.go @@ -48,6 +48,13 @@ func NewDocument(id string) *Document { } } +func NewSynonymDocument(id string) *Document { + return &Document{ + id: id, + Fields: make([]Field, 0), + } +} + func (d *Document) Size() int { sizeInBytes := reflectStaticSizeDocument + size.SizeOfPtr + len(d.id) @@ -133,3 +140,11 @@ func (d *Document) VisitComposite(visitor index.CompositeFieldVisitor) { func (d *Document) HasComposite() bool { return len(d.CompositeFields) > 0 } + +func (d *Document) VisitSynonymFields(visitor index.SynonymFieldVisitor) { + for _, f := range d.Fields { + if sf, ok := f.(index.SynonymField); ok { + visitor(sf) + } + } +} diff --git a/document/field_synonym.go b/document/field_synonym.go new file mode 100644 index 000000000..0e4812690 --- /dev/null +++ b/document/field_synonym.go @@ -0,0 +1,143 @@ +// Copyright (c) 2024 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package document + +import ( + "reflect" + + "github.com/blevesearch/bleve/v2/analysis" + "github.com/blevesearch/bleve/v2/size" + index "github.com/blevesearch/bleve_index_api" +) + +var reflectStaticSizeSynonymField int + +func init() { + var f SynonymField + reflectStaticSizeSynonymField = int(reflect.TypeOf(f).Size()) +} + +const DefaultSynonymIndexingOptions = index.IndexField + +type SynonymField struct { + name string + analyzer analysis.Analyzer + options index.FieldIndexingOptions + input []string + synonyms []string + numPlainTextBytes uint64 + + // populated during analysis + synonymMap map[string][]string +} + +func (s *SynonymField) Size() int { + return reflectStaticSizeSynonymField + size.SizeOfPtr + + len(s.name) +} + +func (s *SynonymField) Name() string { + return s.name +} + +func (s *SynonymField) ArrayPositions() []uint64 { + return nil +} + +func (s *SynonymField) Options() index.FieldIndexingOptions { + return s.options +} + +func (s *SynonymField) NumPlainTextBytes() uint64 { + return s.numPlainTextBytes +} + +func (s *SynonymField) AnalyzedLength() int { + return 0 +} + +func (s *SynonymField) EncodedFieldType() byte { + return 'y' +} + +func (s *SynonymField) AnalyzedTokenFrequencies() index.TokenFrequencies { + return nil +} + +func (s *SynonymField) Analyze() { + var analyzedInput []string + if len(s.input) > 0 { + analyzedInput = make([]string, 0, len(s.input)) + for _, term := range s.input { + analyzedInput = append(analyzedInput, analyzeSynonymTerm(term, s.analyzer)) + } + } + analyzedSynonyms := make([]string, 0, len(s.synonyms)) + for _, syn := range s.synonyms { + analyzedSynonyms = append(analyzedSynonyms, analyzeSynonymTerm(syn, s.analyzer)) + } + s.synonymMap = processSynonymData(analyzedInput, analyzedSynonyms) +} + +func (s *SynonymField) Value() []byte { + return nil +} + +func (s *SynonymField) IterateSynonyms(visitor func(term string, synonyms []string)) { + for term, synonyms := range s.synonymMap { + visitor(term, synonyms) + } +} + +func NewSynonymField(name string, analyzer analysis.Analyzer, input []string, synonyms []string) *SynonymField { + return &SynonymField{ + name: name, + analyzer: analyzer, + options: DefaultSynonymIndexingOptions, + input: input, + synonyms: synonyms, + } +} + +func processSynonymData(input []string, synonyms []string) map[string][]string { + var synonymMap map[string][]string + if len(input) > 0 { + // Map each term to the same list of synonyms. + synonymMap = make(map[string][]string, len(input)) + for _, term := range input { + synonymMap[term] = append([]string(nil), synonyms...) // Avoid sharing slices. + } + } else { + synonymMap = make(map[string][]string, len(synonyms)) + // Precompute a map where each synonym points to all other synonyms. 
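+		// For example, synonyms = ["quick", "fast", "speedy"] yields
+		// {"quick": ["fast", "speedy"], "fast": ["quick", "speedy"],
+		// "speedy": ["quick", "fast"]}.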
+ for i, elem := range synonyms { + synonymMap[elem] = make([]string, 0, len(synonyms)-1) + for j, otherElem := range synonyms { + if i != j { + synonymMap[elem] = append(synonymMap[elem], otherElem) + } + } + } + } + return synonymMap +} + +func analyzeSynonymTerm(term string, analyzer analysis.Analyzer) string { + tokenStream := analyzer.Analyze([]byte(term)) + if len(tokenStream) == 0 { + return term + } + return string(tokenStream[0].Term) +} diff --git a/error.go b/error.go index 2d2751cd4..b57a61543 100644 --- a/error.go +++ b/error.go @@ -27,6 +27,7 @@ const ( ErrorEmptyID ErrorIndexReadInconsistency ErrorTwoPhaseSearchInconsistency + ErrorSynonymSearchNotSupported ) // Error represents a more strongly typed bleve error for detecting @@ -49,4 +50,5 @@ var errorMessages = map[Error]string{ ErrorEmptyID: "document ID cannot be empty", ErrorIndexReadInconsistency: "index read inconsistency detected", ErrorTwoPhaseSearchInconsistency: "2-phase search failed, likely due to an overlapping topology change", + ErrorSynonymSearchNotSupported: "synonym search not supported", } diff --git a/index.go b/index.go index acbefc695..d98f28558 100644 --- a/index.go +++ b/index.go @@ -16,6 +16,7 @@ package bleve import ( "context" + "fmt" "github.com/blevesearch/bleve/v2/index/upsidedown" @@ -63,6 +64,36 @@ func (b *Batch) Index(id string, data interface{}) error { return nil } +func (b *Batch) IndexSynonym(id string, collection string, definition *SynonymDefinition) error { + if id == "" { + return ErrorEmptyID + } + if eventIndex, ok := b.index.(index.EventIndex); ok { + eventIndex.FireIndexEvent() + } + synMap, ok := b.index.Mapping().(mapping.SynonymMapping) + if !ok { + return ErrorSynonymSearchNotSupported + } + + if err := definition.Validate(); err != nil { + return err + } + + doc := document.NewSynonymDocument(id) + err := synMap.MapSynonymDocument(doc, collection, definition.Input, definition.Synonyms) + if err != nil { + return err + } + b.internal.Update(doc) + + b.lastDocSize = uint64(doc.Size() + + len(id) + size.SizeOfString) // overhead from internal + b.totalSize += b.lastDocSize + + return nil +} + func (b *Batch) LastDocSize() uint64 { return b.lastDocSize } @@ -323,3 +354,27 @@ type IndexCopyable interface { // FileSystemDirectory is the default implementation for the // index.Directory interface. type FileSystemDirectory string + +// SynonymDefinition represents a synonym mapping in Bleve. +// Each instance associates one or more input terms with a list of synonyms, +// defining how terms are treated as equivalent in searches. +type SynonymDefinition struct { + // Input is an optional list of terms for unidirectional synonym mapping. + // When terms are specified in Input, they will map to the terms in Synonyms, + // making the relationship unidirectional (each Input maps to all Synonyms). + // If Input is omitted, the relationship is bidirectional among all Synonyms. + Input []string `json:"input"` + + // Synonyms is a list of terms that are considered equivalent. + // If Input is specified, each term in Input will map to each term in Synonyms. + // If Input is not specified, the Synonyms list will be treated bidirectionally, + // meaning each term in Synonyms is treated as synonymous with all others. 
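+	// For example, Input: ["color"] with Synonyms: ["red", "green"] maps
+	// "color" to "red" and "green" one way only, whereas Synonyms:
+	// ["quick", "fast"] with no Input makes the two interchangeable.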
+ Synonyms []string `json:"synonyms"` +} + +func (sd *SynonymDefinition) Validate() error { + if len(sd.Synonyms) == 0 { + return fmt.Errorf("synonym definition must have at least one synonym") + } + return nil +} diff --git a/mapping/index.go b/mapping/index.go index 94b2cdfa7..d6d355b11 100644 --- a/mapping/index.go +++ b/mapping/index.go @@ -354,6 +354,23 @@ func (im *IndexMappingImpl) MapDocument(doc *document.Document, data interface{} return nil } +func (im *IndexMappingImpl) MapSynonymDocument(doc *document.Document, collection string, input []string, synonyms []string) error { + // determine all the synonym sources with the given collection + // and create a synonym field for each + for name, synSource := range im.SynonymSources { + if synSource.Collection() == collection { + // create a new field with the name of the synonym source + analyzer := im.AnalyzerNamed(synSource.Analyzer()) + if analyzer == nil { + return fmt.Errorf("unknown analyzer named: %s", synSource.Analyzer()) + } + field := document.NewSynonymField(name, analyzer, input, synonyms) + doc.AddField(field) + } + } + return nil +} + type walkContext struct { doc *document.Document im *IndexMappingImpl diff --git a/mapping/mapping.go b/mapping/mapping.go index cbfc98faa..6100d6d09 100644 --- a/mapping/mapping.go +++ b/mapping/mapping.go @@ -58,3 +58,8 @@ type IndexMapping interface { FieldMappingForPath(path string) FieldMapping } + +type SynonymMapping interface { + IndexMapping + MapSynonymDocument(doc *document.Document, collection string, input []string, synonyms []string) error +} From 6627dcb2a5f5c7fd132d5a35b94576166788ea35 Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Fri, 15 Nov 2024 18:50:54 +0530 Subject: [PATCH 05/35] unit test --- mapping/index.go | 8 ++ mapping/synonym.go | 7 ++ search_test.go | 182 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 197 insertions(+) diff --git a/mapping/index.go b/mapping/index.go index d6d355b11..ae17a3da5 100644 --- a/mapping/index.go +++ b/mapping/index.go @@ -146,6 +146,14 @@ func (im *IndexMappingImpl) AddCustomDateTimeParser(name string, config map[stri return nil } +func (im *IndexMappingImpl) AddSynonymSource(name, collection, analyzer string) error { + if im.SynonymSources == nil { + im.SynonymSources = make(map[string]*SynonymSource) + } + im.SynonymSources[name] = NewSynonymSource(collection, analyzer) + return nil +} + // NewIndexMapping creates a new IndexMapping that will use all the default indexing rules func NewIndexMapping() *IndexMappingImpl { return &IndexMappingImpl{ diff --git a/mapping/synonym.go b/mapping/synonym.go index 5065b3b2a..597539bf0 100644 --- a/mapping/synonym.go +++ b/mapping/synonym.go @@ -25,6 +25,13 @@ type SynonymSource struct { AnalyzerName string `json:"analyzer"` } +func NewSynonymSource(collection, analyzer string) *SynonymSource { + return &SynonymSource{ + CollectionName: collection, + AnalyzerName: analyzer, + } +} + func (s *SynonymSource) Collection() string { return s.CollectionName } diff --git a/search_test.go b/search_test.go index c39a58558..2a30a4502 100644 --- a/search_test.go +++ b/search_test.go @@ -15,10 +15,12 @@ package bleve import ( + "context" "encoding/json" "fmt" "math" "reflect" + "sort" "strconv" "strings" "testing" @@ -3746,3 +3748,183 @@ func TestAutoFuzzy(t *testing.T) { } } } + +func TestSynonymSearch(t *testing.T) { + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + synonymCollection := "collection1" + + synonymSourceName := "english" + + 
synonymAnalyzer := "simple" + + imap := mapping.NewIndexMapping() + textField := mapping.NewTextFieldMapping() + textField.Analyzer = simple.Name + imap.DefaultMapping.AddFieldMappingsAt("text", textField) + imap.AddSynonymSource(synonymSourceName, synonymCollection, synonymAnalyzer) + + err := imap.Validate() + if err != nil { + t.Fatal(err) + } + + idx, err := New(tmpIndexPath, imap) + if err != nil { + t.Fatal(err) + } + defer func() { + err = idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + documents := map[string]map[string]interface{}{ + "doc1": { + "text": "quick brown fox eats", + }, + "doc2": { + "text": "fast red wolf jumps", + }, + "doc3": { + "text": "quick red cat runs", + }, + "doc4": { + "text": "speedy brown dog barks", + }, + "doc5": { + "text": "fast green rabbit hops", + }, + } + + batch := idx.NewBatch() + for docID, doc := range documents { + err := batch.Index(docID, doc) + if err != nil { + t.Fatal(err) + } + } + + synonymDocuments := map[string]*SynonymDefinition{ + "synDoc1": { + Synonyms: []string{"quick", "fast", "speedy"}, + }, + "synDoc2": { + Input: []string{"color", "colour"}, + Synonyms: []string{"red", "green", "blue", "yellow", "brown"}, + }, + "synDoc3": { + Input: []string{"animal", "creature"}, + Synonyms: []string{"fox", "wolf", "cat", "dog", "rabbit"}, + }, + "synDoc4": { + Synonyms: []string{"eats", "jumps", "runs", "barks", "hops"}, + }, + } + + for synName, synDef := range synonymDocuments { + err := batch.IndexSynonym(synName, "collection1", synDef) + if err != nil { + t.Fatal(err) + } + } + err = idx.Batch(batch) + if err != nil { + t.Fatal(err) + } + + sco, err := idx.Advanced() + if err != nil { + t.Fatal(err) + } + + reader, err := sco.Reader() + if err != nil { + t.Fatal(err) + } + defer func() { + err = reader.Close() + if err != nil { + t.Fatal(err) + } + }() + + synReader, ok := reader.(index.ThesaurusReader) + if !ok { + t.Fatal("expected thesaurus reader") + } + + type testStruct struct { + queryTerm string + expectedSynonyms []string + } + + testQueries := []testStruct{ + { + queryTerm: "quick", + expectedSynonyms: []string{"fast", "speedy"}, + }, + { + queryTerm: "red", + expectedSynonyms: []string{}, + }, + { + queryTerm: "color", + expectedSynonyms: []string{"red", "green", "blue", "yellow", "brown"}, + }, + { + queryTerm: "colour", + expectedSynonyms: []string{"red", "green", "blue", "yellow", "brown"}, + }, + { + queryTerm: "animal", + expectedSynonyms: []string{"fox", "wolf", "cat", "dog", "rabbit"}, + }, + { + queryTerm: "creature", + expectedSynonyms: []string{"fox", "wolf", "cat", "dog", "rabbit"}, + }, + { + queryTerm: "fox", + expectedSynonyms: []string{}, + }, + { + queryTerm: "eats", + expectedSynonyms: []string{"jumps", "runs", "barks", "hops"}, + }, + { + queryTerm: "jumps", + expectedSynonyms: []string{"eats", "runs", "barks", "hops"}, + }, + } + + for _, test := range testQueries { + str, err := synReader.SynonymTermReader(context.Background(), synonymSourceName, []byte(test.queryTerm)) + if err != nil { + t.Fatal(err) + } + var gotSynonyms []string + for { + synonym, err := str.Next() + if err != nil { + t.Fatal(err) + } + if synonym == "" { + break + } + gotSynonyms = append(gotSynonyms, string(synonym)) + } + if len(gotSynonyms) != len(test.expectedSynonyms) { + t.Fatalf("expected %d synonyms, got %d", len(test.expectedSynonyms), len(gotSynonyms)) + } + sort.Strings(gotSynonyms) + sort.Strings(test.expectedSynonyms) + for i, syn := range gotSynonyms { + if syn != test.expectedSynonyms[i] { + 
t.Fatalf("expected synonym %s, got %s", test.expectedSynonyms[i], syn) + } + } + } +} From 32c67af39313f4517f70e8f9bec6d7e743e0073d Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Mon, 18 Nov 2024 19:13:31 +0530 Subject: [PATCH 06/35] query path first draft --- index_impl.go | 28 +++- mapping/mapping.go | 5 + search/query/query.go | 296 ++++++++++++++++++++++++++++++++++++++++++ search/util.go | 6 + search_test.go | 4 +- 5 files changed, 334 insertions(+), 5 deletions(-) diff --git a/index_impl.go b/index_impl.go index e6debf17a..3cce0a12c 100644 --- a/index_impl.go +++ b/index_impl.go @@ -38,6 +38,7 @@ import ( "github.com/blevesearch/bleve/v2/search/collector" "github.com/blevesearch/bleve/v2/search/facet" "github.com/blevesearch/bleve/v2/search/highlight" + "github.com/blevesearch/bleve/v2/search/query" "github.com/blevesearch/bleve/v2/util" index "github.com/blevesearch/bleve_index_api" "github.com/blevesearch/geo/s2" @@ -505,8 +506,8 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr } var knnHits []*search.DocumentMatch + var fts search.FieldTermSynonymMap var ok bool - var skipKnnCollector bool if req.PreSearchData != nil { for k, v := range req.PreSearchData { switch k { @@ -517,19 +518,40 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr return nil, fmt.Errorf("knn preSearchData must be of type []*search.DocumentMatch") } } - skipKnnCollector = true + case search.SynonymPreSearchDataKey: + if v != nil { + fts, ok = v.(search.FieldTermSynonymMap) + if !ok { + return nil, fmt.Errorf("synonym preSearchData must be of type search.FieldTermSynonymMap") + } + } } } } - if !skipKnnCollector && requestHasKNN(req) { + if knnHits == nil && requestHasKNN(req) { knnHits, err = i.runKnnCollector(ctx, req, indexReader, false) if err != nil { return nil, err } } + if fts == nil { + if synMap, ok := i.m.(mapping.SynonymMapping); ok && synMap.SynonymCount() > 0 { + if synReader, ok := indexReader.(index.SynonymReader); ok { + fts, err = query.ExtractSynonyms(ctx, synMap, synReader, req.Query, fts) + if err != nil { + return nil, err + } + } + } + } + setKnnHitsInCollector(knnHits, req, coll) + if fts != nil { + ctx = context.WithValue(ctx, search.FieldTermSynonymMapKey, fts) + } + // This callback and variable handles the tracking of bytes read // 1. as part of creation of tfr and its Next() calls which is // accounted by invoking this callback when the TFR is closed. 
diff --git a/mapping/mapping.go b/mapping/mapping.go index 6100d6d09..6714c55aa 100644 --- a/mapping/mapping.go +++ b/mapping/mapping.go @@ -61,5 +61,10 @@ type IndexMapping interface { type SynonymMapping interface { IndexMapping + MapSynonymDocument(doc *document.Document, collection string, input []string, synonyms []string) error + + SynonymSourceForPath(path string) string + + SynonymCount() int } diff --git a/search/query/query.go b/search/query/query.go index d263a0e54..22f1293ee 100644 --- a/search/query/query.go +++ b/search/query/query.go @@ -21,8 +21,10 @@ import ( "io" "log" + "github.com/blevesearch/bleve/v2/analysis" "github.com/blevesearch/bleve/v2/mapping" "github.com/blevesearch/bleve/v2/search" + "github.com/blevesearch/bleve/v2/search/searcher" "github.com/blevesearch/bleve/v2/util" index "github.com/blevesearch/bleve_index_api" ) @@ -423,3 +425,297 @@ func DumpQuery(m mapping.IndexMapping, query Query) (string, error) { data, err := json.MarshalIndent(q, "", " ") return string(data), err } + +// ExtractSynonyms extracts synonyms from the query tree and returns a map of +// field-term pairs to their synonyms. The input query tree is traversed and +// for each term query, the synonyms are extracted from the synonym source +// associated with the field. The synonyms are then added to the provided map. +// The map is returned and may be nil if no synonyms were found. +func ExtractSynonyms(ctx context.Context, m mapping.SynonymMapping, r index.SynonymReader, + query Query, rv search.FieldTermSynonymMap) (search.FieldTermSynonymMap, error) { + + if r == nil || m == nil || query == nil { + return rv, nil + } + resolveFieldAndSource := func(field string) (string, string) { + if field == "" { + field = m.DefaultSearchField() + } + return field, m.SynonymSourceForPath(field) + } + handleAnalyzer := func(analyzerName, field string) (analysis.Analyzer, error) { + if analyzerName == "" { + analyzerName = m.AnalyzerNameForPath(field) + } + analyzer := m.AnalyzerNamed(analyzerName) + if analyzer == nil { + return nil, fmt.Errorf("no analyzer named '%s' registered", analyzerName) + } + return analyzer, nil + } + switch q := query.(type) { + case *BooleanQuery: + var err error + rv, err = ExtractSynonyms(ctx, m, r, q.Must, rv) + if err != nil { + return nil, err + } + rv, err = ExtractSynonyms(ctx, m, r, q.Should, rv) + if err != nil { + return nil, err + } + rv, err = ExtractSynonyms(ctx, m, r, q.MustNot, rv) + if err != nil { + return nil, err + } + return rv, nil + case *ConjunctionQuery: + for _, child := range q.Conjuncts { + var err error + rv, err = ExtractSynonyms(ctx, m, r, child, rv) + if err != nil { + return nil, err + } + } + case *DisjunctionQuery: + for _, child := range q.Disjuncts { + var err error + rv, err = ExtractSynonyms(ctx, m, r, child, rv) + if err != nil { + return nil, err + } + } + case *FuzzyQuery: + field, source := resolveFieldAndSource(q.FieldVal) + if source != "" { + return addFuzzySynonymsForTerm(ctx, source, field, q.Term, q.Fuzziness, q.Prefix, r, rv) + } + case *MatchQuery, *MatchPhraseQuery: + var analyzerName, matchString, fieldVal string + var fuzziness, prefix int + if mq, ok := q.(*MatchQuery); ok { + analyzerName, fieldVal, matchString, fuzziness, prefix = mq.Analyzer, mq.FieldVal, mq.Match, mq.Fuzziness, mq.Prefix + } else if mpq, ok := q.(*MatchPhraseQuery); ok { + analyzerName, fieldVal, matchString, fuzziness = mpq.Analyzer, mpq.FieldVal, mpq.MatchPhrase, mpq.Fuzziness + } + field, source := resolveFieldAndSource(fieldVal) + if source != 
"" { + analyzer, err := handleAnalyzer(analyzerName, field) + if err != nil { + return nil, err + } + tokens := analyzer.Analyze([]byte(matchString)) + for _, token := range tokens { + rv, err = addFuzzySynonymsForTerm(ctx, source, field, string(token.Term), fuzziness, prefix, r, rv) + if err != nil { + return nil, err + } + } + } + case *MultiPhraseQuery, *PhraseQuery: + var fieldVal string + if mpq, ok := q.(*MultiPhraseQuery); ok { + fieldVal = mpq.FieldVal + } else if pq, ok := q.(*PhraseQuery); ok { + fieldVal = pq.FieldVal + } + field, source := resolveFieldAndSource(fieldVal) + if source != "" { + var terms []string + if mpq, ok := q.(*MultiPhraseQuery); ok { + for _, termGroup := range mpq.Terms { + terms = append(terms, termGroup...) + } + } else if pq, ok := q.(*PhraseQuery); ok { + terms = pq.Terms + } + for _, term := range terms { + var err error + rv, err = addSynonymsForTerm(ctx, source, term, field, r, rv) + if err != nil { + return nil, err + } + } + return rv, nil + } + case *PrefixQuery: + field, source := resolveFieldAndSource(q.FieldVal) + if source != "" { + return addPrefixSynonymsForTerm(ctx, source, field, q.Prefix, r, rv) + } + case *QueryStringQuery: + expanded, err := expandQuery(m, q) + if err != nil { + return nil, err + } + return ExtractSynonyms(ctx, m, r, expanded, rv) + case *RegexpQuery: + field, source := resolveFieldAndSource(q.FieldVal) + if source != "" { + return addRegexpSynonymsForTerm(ctx, source, field, q.Regexp, r, rv) + } + case *TermQuery: + field, source := resolveFieldAndSource(q.FieldVal) + if source != "" { + return addSynonymsForTerm(ctx, source, q.Term, field, r, rv) + } + case *WildcardQuery: + field, source := resolveFieldAndSource(q.FieldVal) + if source != "" { + regexpString := wildcardRegexpReplacer.Replace(q.Wildcard) + return addRegexpSynonymsForTerm(ctx, source, field, regexpString, r, rv) + } + } + return rv, nil +} + +// addRegexpSynonymsForTerm finds all terms that match the given regexp and +// adds their synonyms to the provided map. +func addRegexpSynonymsForTerm(ctx context.Context, src, field, term string, + r index.SynonymReader, rv search.FieldTermSynonymMap) (search.FieldTermSynonymMap, error) { + + if ir, ok := r.(index.IndexReaderRegexp); ok { + fieldDict, err := ir.FieldDictRegexp(field, term) + if err != nil { + return nil, err + } + defer func() { + if cerr := fieldDict.Close(); cerr != nil && err == nil { + err = cerr + } + }() + regexpTerms := []string{term} + tfd, err := fieldDict.Next() + for err == nil && tfd != nil { + regexpTerms = append(regexpTerms, tfd.Term) + tfd, err = fieldDict.Next() + } + if err != nil { + return nil, err + } + for _, term := range regexpTerms { + rv, err = addSynonymsForTerm(ctx, src, term, field, r, rv) + if err != nil { + return nil, err + } + } + return rv, nil + } + return nil, nil +} + +// addPrefixSynonymsForTerm finds all terms that match the given prefix and +// adds their synonyms to the provided map. 
+func addPrefixSynonymsForTerm(ctx context.Context, src, field, term string, + r index.SynonymReader, rv search.FieldTermSynonymMap) (search.FieldTermSynonymMap, error) { + // find the terms with this prefix + fieldDict, err := r.FieldDictPrefix(field, []byte(term)) + if err != nil { + return nil, err + } + defer func() { + if cerr := fieldDict.Close(); cerr != nil && err == nil { + err = cerr + } + }() + prefixTerms := []string{term} + tfd, err := fieldDict.Next() + for err == nil && tfd != nil { + prefixTerms = append(prefixTerms, tfd.Term) + tfd, err = fieldDict.Next() + } + if err != nil { + return nil, err + } + for _, term := range prefixTerms { + rv, err = addSynonymsForTerm(ctx, src, term, field, r, rv) + if err != nil { + return nil, err + } + } + return rv, nil +} + +// addFuzzySynonymsForTerm finds all terms that match the given term with the +// given fuzziness and adds their synonyms to the provided map. +func addFuzzySynonymsForTerm(ctx context.Context, src, field, term string, fuzziness, prefix int, + r index.SynonymReader, rv search.FieldTermSynonymMap) (search.FieldTermSynonymMap, error) { + if fuzziness == 0 { + return addSynonymsForTerm(ctx, src, term, field, r, rv) + } + if ir, ok := r.(index.IndexReaderFuzzy); ok { + if fuzziness > searcher.MaxFuzziness { + return nil, fmt.Errorf("fuzziness exceeds max (%d)", searcher.MaxFuzziness) + } + if fuzziness < 0 { + return nil, fmt.Errorf("invalid fuzziness, negative") + } + prefixTerm := "" + for i, r := range term { + if i < prefix { + prefixTerm += string(r) + } else { + break + } + } + fieldDict, err := ir.FieldDictFuzzy(field, term, fuzziness, prefixTerm) + if err != nil { + return nil, err + } + defer func() { + if cerr := fieldDict.Close(); cerr != nil && err == nil { + err = cerr + } + }() + fuzzyTerms := []string{term} + tfd, err := fieldDict.Next() + for err == nil && tfd != nil { + fuzzyTerms = append(fuzzyTerms, tfd.Term) + tfd, err = fieldDict.Next() + } + if err != nil { + return nil, err + } + for _, term := range fuzzyTerms { + rv, err = addSynonymsForTerm(ctx, src, term, field, r, rv) + if err != nil { + return nil, err + } + } + return rv, nil + } + return nil, nil +} + +// addSynonymsForTerm finds synonyms for the given term and adds them to the +// provided map. 
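+// A nil map may be passed in; it is allocated lazily on the first synonym
+// found and returned unchanged when the term has none.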
+func addSynonymsForTerm(ctx context.Context, src, term, field string, r index.SynonymReader, + rv search.FieldTermSynonymMap) (search.FieldTermSynonymMap, error) { + + termBytes := []byte(term) + termReader, err := r.SynonymTermReader(ctx, src, termBytes) + if err != nil { + return nil, err + } + defer func() { + if cerr := termReader.Close(); cerr != nil && err == nil { + err = cerr + } + }() + var synonyms []string + synonym, err := termReader.Next() + for err == nil && synonym != "" { + synonyms = append(synonyms, synonym) + synonym, err = termReader.Next() + } + if len(synonyms) > 0 { + if rv == nil { + rv = make(search.FieldTermSynonymMap) + } + if _, exists := rv[field]; !exists { + rv[field] = make(map[string][]string) + } + rv[field][term] = synonyms + } + return rv, err +} diff --git a/search/util.go b/search/util.go index 6472803d1..0f16db19a 100644 --- a/search/util.go +++ b/search/util.go @@ -136,6 +136,7 @@ const MinGeoBufPoolSize = 24 type GeoBufferPoolCallbackFunc func() *s2.GeoBufferPool const KnnPreSearchDataKey = "_knn_pre_search_data_key" +const SynonymPreSearchDataKey = "_synonym_pre_search_data_key" const PreSearchKey = "_presearch_key" @@ -144,5 +145,10 @@ type ScoreExplCorrectionCallbackFunc func(queryMatch *DocumentMatch, knnMatch *D type SearcherStartCallbackFn func(size uint64) error type SearcherEndCallbackFn func(size uint64) error +// field -> term -> synonyms +type FieldTermSynonymMap map[string]map[string][]string + +const FieldTermSynonymMapKey = "_field_term_synonym_map_key" + const SearcherStartCallbackKey = "_searcher_start_callback_key" const SearcherEndCallbackKey = "_searcher_end_callback_key" diff --git a/search_test.go b/search_test.go index 2a30a4502..b929fa989 100644 --- a/search_test.go +++ b/search_test.go @@ -3851,9 +3851,9 @@ func TestSynonymSearch(t *testing.T) { } }() - synReader, ok := reader.(index.ThesaurusReader) + synReader, ok := reader.(index.SynonymReader) if !ok { - t.Fatal("expected thesaurus reader") + t.Fatal("expected synonym reader") } type testStruct struct { From 784f45bdfae5300a5b9f0813189068f831fa4b85 Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Fri, 22 Nov 2024 17:45:55 +0530 Subject: [PATCH 07/35] minor fixes --- mapping/index.go | 4 ++++ search_test.go | 7 ++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/mapping/index.go b/mapping/index.go index ae17a3da5..36b423108 100644 --- a/mapping/index.go +++ b/mapping/index.go @@ -519,3 +519,7 @@ func (im *IndexMappingImpl) SynonymSourceForPath(path string) string { return "" } + +func (im *IndexMappingImpl) SynonymCount() int { + return len(im.SynonymSources) +} diff --git a/search_test.go b/search_test.go index b929fa989..bdfb2fd42 100644 --- a/search_test.go +++ b/search_test.go @@ -3749,7 +3749,7 @@ func TestAutoFuzzy(t *testing.T) { } } -func TestSynonymSearch(t *testing.T) { +func TestSynonymTermReader(t *testing.T) { tmpIndexPath := createTmpIndexPath(t) defer cleanupTmpIndexPath(t, tmpIndexPath) @@ -3759,12 +3759,13 @@ func TestSynonymSearch(t *testing.T) { synonymAnalyzer := "simple" - imap := mapping.NewIndexMapping() textField := mapping.NewTextFieldMapping() textField.Analyzer = simple.Name + textField.SynonymSource = synonymSourceName + + imap := mapping.NewIndexMapping() imap.DefaultMapping.AddFieldMappingsAt("text", textField) imap.AddSynonymSource(synonymSourceName, synonymCollection, synonymAnalyzer) - err := imap.Validate() if err != nil { t.Fatal(err) From f19cedc6f703c4255938e71e7c9bbdb87b031213 Mon Sep 17 00:00:00 2001 From: 
CascadingRadium Date: Fri, 29 Nov 2024 12:53:33 +0530 Subject: [PATCH 08/35] bug fixes and unit tests for single index implementation --- search/query/query.go | 42 ++- search/searcher/search_fuzzy.go | 22 +- search/searcher/search_phrase.go | 37 +++ search/searcher/search_regexp.go | 4 + search/searcher/search_term.go | 57 +++- search/searcher/search_term_prefix.go | 4 + search_test.go | 372 ++++++++++++++++++++++++++ 7 files changed, 517 insertions(+), 21 deletions(-) diff --git a/search/query/query.go b/search/query/query.go index 22f1293ee..c908bbc54 100644 --- a/search/query/query.go +++ b/search/query/query.go @@ -20,6 +20,7 @@ import ( "fmt" "io" "log" + "strings" "github.com/blevesearch/bleve/v2/analysis" "github.com/blevesearch/bleve/v2/mapping" @@ -487,16 +488,21 @@ func ExtractSynonyms(ctx context.Context, m mapping.SynonymMapping, r index.Syno } case *FuzzyQuery: field, source := resolveFieldAndSource(q.FieldVal) + fuzziness := q.Fuzziness + if q.autoFuzzy { + fuzziness = searcher.GetAutoFuzziness(q.Term) + } if source != "" { - return addFuzzySynonymsForTerm(ctx, source, field, q.Term, q.Fuzziness, q.Prefix, r, rv) + return addFuzzySynonymsForTerm(ctx, source, field, q.Term, fuzziness, q.Prefix, r, rv) } case *MatchQuery, *MatchPhraseQuery: var analyzerName, matchString, fieldVal string var fuzziness, prefix int + var autoFuzzy bool if mq, ok := q.(*MatchQuery); ok { - analyzerName, fieldVal, matchString, fuzziness, prefix = mq.Analyzer, mq.FieldVal, mq.Match, mq.Fuzziness, mq.Prefix + analyzerName, fieldVal, matchString, fuzziness, prefix, autoFuzzy = mq.Analyzer, mq.FieldVal, mq.Match, mq.Fuzziness, mq.Prefix, mq.autoFuzzy } else if mpq, ok := q.(*MatchPhraseQuery); ok { - analyzerName, fieldVal, matchString, fuzziness = mpq.Analyzer, mpq.FieldVal, mpq.MatchPhrase, mpq.Fuzziness + analyzerName, fieldVal, matchString, fuzziness, autoFuzzy = mpq.Analyzer, mpq.FieldVal, mpq.MatchPhrase, mpq.Fuzziness, mpq.autoFuzzy } field, source := resolveFieldAndSource(fieldVal) if source != "" { @@ -506,6 +512,9 @@ func ExtractSynonyms(ctx context.Context, m mapping.SynonymMapping, r index.Syno } tokens := analyzer.Analyze([]byte(matchString)) for _, token := range tokens { + if autoFuzzy { + fuzziness = searcher.GetAutoFuzziness(string(token.Term)) + } rv, err = addFuzzySynonymsForTerm(ctx, source, field, string(token.Term), fuzziness, prefix, r, rv) if err != nil { return nil, err @@ -514,10 +523,12 @@ func ExtractSynonyms(ctx context.Context, m mapping.SynonymMapping, r index.Syno } case *MultiPhraseQuery, *PhraseQuery: var fieldVal string + var fuzziness int + var autoFuzzy bool if mpq, ok := q.(*MultiPhraseQuery); ok { - fieldVal = mpq.FieldVal + fieldVal, fuzziness, autoFuzzy = mpq.FieldVal, mpq.Fuzziness, mpq.autoFuzzy } else if pq, ok := q.(*PhraseQuery); ok { - fieldVal = pq.FieldVal + fieldVal, fuzziness, autoFuzzy = pq.FieldVal, pq.Fuzziness, pq.autoFuzzy } field, source := resolveFieldAndSource(fieldVal) if source != "" { @@ -531,7 +542,10 @@ func ExtractSynonyms(ctx context.Context, m mapping.SynonymMapping, r index.Syno } for _, term := range terms { var err error - rv, err = addSynonymsForTerm(ctx, source, term, field, r, rv) + if autoFuzzy { + fuzziness = searcher.GetAutoFuzziness(term) + } + rv, err = addFuzzySynonymsForTerm(ctx, source, field, term, fuzziness, 0, r, rv) if err != nil { return nil, err } @@ -552,12 +566,12 @@ func ExtractSynonyms(ctx context.Context, m mapping.SynonymMapping, r index.Syno case *RegexpQuery: field, source := resolveFieldAndSource(q.FieldVal) 
if source != "" { - return addRegexpSynonymsForTerm(ctx, source, field, q.Regexp, r, rv) + return addRegexpSynonymsForTerm(ctx, source, field, strings.TrimPrefix(q.Regexp, "^"), r, rv) } case *TermQuery: field, source := resolveFieldAndSource(q.FieldVal) if source != "" { - return addSynonymsForTerm(ctx, source, q.Term, field, r, rv) + return addSynonymsForTerm(ctx, source, field, q.Term, r, rv) } case *WildcardQuery: field, source := resolveFieldAndSource(q.FieldVal) @@ -594,7 +608,7 @@ func addRegexpSynonymsForTerm(ctx context.Context, src, field, term string, return nil, err } for _, term := range regexpTerms { - rv, err = addSynonymsForTerm(ctx, src, term, field, r, rv) + rv, err = addSynonymsForTerm(ctx, src, field, term, r, rv) if err != nil { return nil, err } @@ -628,7 +642,7 @@ func addPrefixSynonymsForTerm(ctx context.Context, src, field, term string, return nil, err } for _, term := range prefixTerms { - rv, err = addSynonymsForTerm(ctx, src, term, field, r, rv) + rv, err = addSynonymsForTerm(ctx, src, field, term, r, rv) if err != nil { return nil, err } @@ -641,7 +655,7 @@ func addPrefixSynonymsForTerm(ctx context.Context, src, field, term string, func addFuzzySynonymsForTerm(ctx context.Context, src, field, term string, fuzziness, prefix int, r index.SynonymReader, rv search.FieldTermSynonymMap) (search.FieldTermSynonymMap, error) { if fuzziness == 0 { - return addSynonymsForTerm(ctx, src, term, field, r, rv) + return addSynonymsForTerm(ctx, src, field, term, r, rv) } if ir, ok := r.(index.IndexReaderFuzzy); ok { if fuzziness > searcher.MaxFuzziness { @@ -677,7 +691,7 @@ func addFuzzySynonymsForTerm(ctx context.Context, src, field, term string, fuzzi return nil, err } for _, term := range fuzzyTerms { - rv, err = addSynonymsForTerm(ctx, src, term, field, r, rv) + rv, err = addSynonymsForTerm(ctx, src, field, term, r, rv) if err != nil { return nil, err } @@ -689,8 +703,8 @@ func addFuzzySynonymsForTerm(ctx context.Context, src, field, term string, fuzzi // addSynonymsForTerm finds synonyms for the given term and adds them to the // provided map. 
-func addSynonymsForTerm(ctx context.Context, src, term, field string, r index.SynonymReader, - rv search.FieldTermSynonymMap) (search.FieldTermSynonymMap, error) { +func addSynonymsForTerm(ctx context.Context, src, field, term string, + r index.SynonymReader, rv search.FieldTermSynonymMap) (search.FieldTermSynonymMap, error) { termBytes := []byte(term) termReader, err := r.SynonymTermReader(ctx, src, termBytes) diff --git a/search/searcher/search_fuzzy.go b/search/searcher/search_fuzzy.go index 6c29f845d..d0e5c1ec5 100644 --- a/search/searcher/search_fuzzy.go +++ b/search/searcher/search_fuzzy.go @@ -55,9 +55,11 @@ func NewFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term s // since the fuzzy candidate terms are not collected // for a term search, and the only candidate term is // the term itself - fuzzyTermMatches := ctx.Value(search.FuzzyMatchPhraseKey) - if fuzzyTermMatches != nil { - fuzzyTermMatches.(map[string][]string)[term] = []string{term} + if ctx != nil { + fuzzyTermMatches := ctx.Value(search.FuzzyMatchPhraseKey) + if fuzzyTermMatches != nil { + fuzzyTermMatches.(map[string][]string)[term] = []string{term} + } } return NewTermSearcher(ctx, indexReader, term, field, boost, options) } @@ -94,12 +96,22 @@ func NewFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term s fuzzyTermMatches.(map[string][]string)[term] = candidates } } + // check if the candidates are empty or have one term which is the term itself + if len(candidates) == 0 || (len(candidates) == 1 && candidates[0] == term) { + if ctx != nil { + fuzzyTermMatches := ctx.Value(search.FuzzyMatchPhraseKey) + if fuzzyTermMatches != nil { + fuzzyTermMatches.(map[string][]string)[term] = []string{term} + } + } + return NewTermSearcher(ctx, indexReader, term, field, boost, options) + } return NewMultiTermSearcherBoosted(ctx, indexReader, candidates, field, boost, editDistances, options, true) } -func getAutoFuzziness(term string) int { +func GetAutoFuzziness(term string) int { termLength := len(term) if termLength > AutoFuzzinessHighThreshold { return MaxFuzziness @@ -111,7 +123,7 @@ func getAutoFuzziness(term string) int { func NewAutoFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term string, prefix int, field string, boost float64, options search.SearcherOptions) (search.Searcher, error) { - return NewFuzzySearcher(ctx, indexReader, term, prefix, getAutoFuzziness(term), field, boost, options) + return NewFuzzySearcher(ctx, indexReader, term, prefix, GetAutoFuzziness(term), field, boost, options) } type fuzzyCandidates struct { diff --git a/search/searcher/search_phrase.go b/search/searcher/search_phrase.go index bf24b465a..9c2ff7d5f 100644 --- a/search/searcher/search_phrase.go +++ b/search/searcher/search_phrase.go @@ -164,6 +164,40 @@ func NewMultiPhraseSearcher(ctx context.Context, indexReader index.IndexReader, } } + if fts, ok := ctx.Value(search.FieldTermSynonymMapKey).(search.FieldTermSynonymMap); ok { + if ts, exists := fts[field]; exists { + if fuzzinessEnabled { + for term, fuzzyTerms := range fuzzyTermMatches { + fuzzySynonymTerms := make([]string, 0, len(fuzzyTerms)) + if s, found := ts[term]; found { + fuzzySynonymTerms = append(fuzzySynonymTerms, s...) + } + for _, fuzzyTerm := range fuzzyTerms { + if fuzzyTerm == term { + continue + } + if s, found := ts[fuzzyTerm]; found { + fuzzySynonymTerms = append(fuzzySynonymTerms, s...) + } + } + if len(fuzzySynonymTerms) > 0 { + fuzzyTermMatches[term] = append(fuzzyTermMatches[term], fuzzySynonymTerms...) 
+ } + } + } else { + for _, termPos := range terms { + for _, term := range termPos { + if s, found := ts[term]; found { + if fuzzyTermMatches == nil { + fuzzyTermMatches = make(map[string][]string) + } + fuzzyTermMatches[term] = s + } + } + } + } + } + } mustSearcher, err := NewConjunctionSearcher(ctx, indexReader, termPositionSearchers, options) if err != nil { // close any searchers already opened @@ -337,6 +371,9 @@ func (s *PhraseSearcher) expandFuzzyMatches(tlm search.TermLocationMap, expanded for term, fuzzyMatches := range s.fuzzyTermMatches { locations := tlm[term] for _, fuzzyMatch := range fuzzyMatches { + if fuzzyMatch == term { + continue + } locations = append(locations, tlm[fuzzyMatch]...) } expandedTlm[term] = locations diff --git a/search/searcher/search_regexp.go b/search/searcher/search_regexp.go index b88133e31..74caf0703 100644 --- a/search/searcher/search_regexp.go +++ b/search/searcher/search_regexp.go @@ -68,6 +68,10 @@ func NewRegexpStringSearcher(ctx context.Context, indexReader index.IndexReader, if err != nil { return nil, err } + // check if the candidateTerms are empty or have one term which is the term itself + if len(candidateTerms) == 0 || (len(candidateTerms) == 1 && candidateTerms[0] == pattern) { + return NewTermSearcher(ctx, indexReader, pattern, field, boost, options) + } return NewMultiTermSearcher(ctx, indexReader, candidateTerms, field, boost, options, true) diff --git a/search/searcher/search_term.go b/search/searcher/search_term.go index cd794ea32..c519d8d51 100644 --- a/search/searcher/search_term.go +++ b/search/searcher/search_term.go @@ -38,14 +38,23 @@ type TermSearcher struct { tfd index.TermFieldDoc } -func NewTermSearcher(ctx context.Context, indexReader index.IndexReader, term string, field string, boost float64, options search.SearcherOptions) (*TermSearcher, error) { +func NewTermSearcher(ctx context.Context, indexReader index.IndexReader, term string, field string, boost float64, options search.SearcherOptions) (search.Searcher, error) { if isTermQuery(ctx) { ctx = context.WithValue(ctx, search.QueryTypeKey, search.Term) } return NewTermSearcherBytes(ctx, indexReader, []byte(term), field, boost, options) } -func NewTermSearcherBytes(ctx context.Context, indexReader index.IndexReader, term []byte, field string, boost float64, options search.SearcherOptions) (*TermSearcher, error) { +func NewTermSearcherBytes(ctx context.Context, indexReader index.IndexReader, term []byte, field string, boost float64, options search.SearcherOptions) (search.Searcher, error) { + if ctx != nil { + if fts, ok := ctx.Value(search.FieldTermSynonymMapKey).(search.FieldTermSynonymMap); ok { + if ts, exists := fts[field]; exists { + if s, found := ts[string(term)]; found { + return NewSynonymSearcher(ctx, indexReader, term, s, field, boost, options) + } + } + } + } needFreqNorm := options.Score != "none" reader, err := indexReader.TermFieldReader(ctx, term, field, needFreqNorm, needFreqNorm, options.IncludeTermVectors) if err != nil { @@ -69,6 +78,50 @@ func newTermSearcherFromReader(indexReader index.IndexReader, reader index.TermF }, nil } +func NewSynonymSearcher(ctx context.Context, indexReader index.IndexReader, term []byte, synonyms []string, field string, boost float64, options search.SearcherOptions) (search.Searcher, error) { + createTermSearcher := func(term []byte, boostVal float64) (search.Searcher, error) { + needFreqNorm := options.Score != "none" + reader, err := indexReader.TermFieldReader(ctx, term, field, needFreqNorm, needFreqNorm, 
options.IncludeTermVectors) + if err != nil { + return nil, err + } + return newTermSearcherFromReader(indexReader, reader, term, field, boostVal, options) + } + // create a searcher for the term itself + termSearcher, err := createTermSearcher(term, boost) + if err != nil { + return nil, err + } + // constituent searchers of the disjunction + qsearchers := make([]search.Searcher, 0, len(synonyms)+1) + // helper method to close all the searchers we've created + // in case of an error + qsearchersClose := func() { + for _, searcher := range qsearchers { + if searcher != nil { + _ = searcher.Close() + } + } + } + qsearchers = append(qsearchers, termSearcher) + // create a searcher for each synonym + for _, synonym := range synonyms { + synonymSearcher, err := createTermSearcher([]byte(synonym), boost/2.0) + if err != nil { + qsearchersClose() + return nil, err + } + qsearchers = append(qsearchers, synonymSearcher) + } + // create a disjunction searcher + rv, err := NewDisjunctionSearcher(ctx, indexReader, qsearchers, 0, options) + if err != nil { + qsearchersClose() + return nil, err + } + return rv, nil +} + func (s *TermSearcher) Size() int { return reflectStaticSizeTermSearcher + size.SizeOfPtr + s.reader.Size() + diff --git a/search/searcher/search_term_prefix.go b/search/searcher/search_term_prefix.go index dc16e4864..3b05e5a8d 100644 --- a/search/searcher/search_term_prefix.go +++ b/search/searcher/search_term_prefix.go @@ -52,6 +52,10 @@ func NewTermPrefixSearcher(ctx context.Context, indexReader index.IndexReader, p reportIOStats(ctx, fieldDict.BytesRead()) search.RecordSearchCost(ctx, search.AddM, fieldDict.BytesRead()) } + // check if the terms are empty or have one term which is the prefix itself + if len(terms) == 0 || (len(terms) == 1 && terms[0] == prefix) { + return NewTermSearcher(ctx, indexReader, prefix, field, boost, options) + } return NewMultiTermSearcher(ctx, indexReader, terms, field, boost, options, true) } diff --git a/search_test.go b/search_test.go index bdfb2fd42..cd1346936 100644 --- a/search_test.go +++ b/search_test.go @@ -19,6 +19,7 @@ import ( "encoding/json" "fmt" "math" + "math/rand" "reflect" "sort" "strconv" @@ -41,6 +42,7 @@ import ( "github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/milliseconds" "github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/nanoseconds" "github.com/blevesearch/bleve/v2/analysis/datetime/timestamp/seconds" + "github.com/blevesearch/bleve/v2/analysis/lang/en" "github.com/blevesearch/bleve/v2/analysis/token/length" "github.com/blevesearch/bleve/v2/analysis/token/lowercase" "github.com/blevesearch/bleve/v2/analysis/token/shingle" @@ -3929,3 +3931,373 @@ func TestSynonymTermReader(t *testing.T) { } } } + +func TestSynonymSearchQueries(t *testing.T) { + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + synonymCollection := "collection1" + + synonymSourceName := "english" + + analyzer := en.AnalyzerName + + textField := mapping.NewTextFieldMapping() + textField.Analyzer = analyzer + textField.SynonymSource = synonymSourceName + + imap := mapping.NewIndexMapping() + imap.DefaultMapping.AddFieldMappingsAt("text", textField) + imap.AddSynonymSource(synonymSourceName, synonymCollection, analyzer) + err := imap.Validate() + if err != nil { + t.Fatal(err) + } + + idx, err := New(tmpIndexPath, imap) + if err != nil { + t.Fatal(err) + } + defer func() { + err = idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + documents := map[string]map[string]interface{}{ + "doc1": { + "text": 
`The hardworking employee consistently strives to exceed expectations. + His industrious nature makes him a valuable asset to any team. + His conscientious attention to detail ensures that projects are completed efficiently and accurately. + He remains persistent even in the face of challenges.`, + }, + "doc2": { + "text": `The tranquil surroundings of the retreat provide a perfect escape from the hustle and bustle of city life. + Guests enjoy the peaceful atmosphere, which is perfect for relaxation and rejuvenation. + The calm environment offers the ideal place to meditate and connect with nature. + Even the most stressed individuals find themselves feeling relaxed and at ease.`, + }, + "doc3": { + "text": `The house was burned down, leaving only a charred shell behind. + The intense heat of the flames caused the walls to warp and the roof to cave in. + The seared remains of the furniture told the story of the blaze. + The incinerated remains left little more than ashes to remember what once was.`, + }, + "doc4": { + "text": `The faithful dog followed its owner everywhere, always loyal and steadfast. + It was devoted to protecting its family, and its reliable nature meant it could always be trusted. + In the face of danger, the dog remained calm, knowing its role was to stay vigilant. + Its trustworthy companionship provided comfort and security.`, + }, + "doc5": { + "text": `The lively market is bustling with activity from morning to night. + The dynamic energy of the crowd fills the air as vendors sell their wares. + Shoppers wander from stall to stall, captivated by the vibrant colors and energetic atmosphere. + This place is alive with movement and life.`, + }, + "doc6": { + "text": `In moments of crisis, bravery shines through. + It takes valor to step forward when others are afraid to act. + Heroes are defined by their guts and nerve, taking risks to protect others. + Boldness in the face of danger is what sets them apart.`, + }, + "doc7": { + "text": `Innovation is the driving force behind progress in every industry. + The company fosters an environment of invention, encouraging creativity at every level. + The focus on novelty and improvement means that ideas are always evolving. + The development of new solutions is at the core of the company's mission.`, + }, + "doc8": { + "text": `The blazing sunset cast a radiant glow over the horizon, painting the sky with hues of red and orange. + The intense heat of the day gave way to a fiery display of color. + As the sun set, the glowing light illuminated the landscape, creating a breathtaking scene. + The fiery sky was a sight to behold.`, + }, + "doc9": { + "text": `The fertile soil of the valley makes it perfect for farming. + The productive land yields abundant crops year after year. + Farmers rely on the rich, fruitful ground to sustain their livelihoods. + The area is known for its plentiful harvests, supporting both local communities and export markets.`, + }, + "doc10": { + "text": `The arid desert is a vast, dry expanse with little water or vegetation. + The barren landscape stretches as far as the eye can see, offering little respite from the scorching sun. + The desolate environment is unforgiving to those who venture too far without preparation. + The parched earth cracks under the heat, creating a harsh, unyielding terrain.`, + }, + "doc11": { + "text": `The fox is known for its cunning and intelligence. + As a predator, it relies on its sharp instincts to outwit its prey. 
+ Its vulpine nature makes it both mysterious and fascinating. + The fox's ability to hunt with precision and stealth is what makes it such a formidable hunter.`, + }, + "doc12": { + "text": `The dog is often considered man's best friend due to its loyal nature. + As a companion, the hound provides both protection and affection. + The puppy quickly becomes a member of the family, always by your side. + Its playful energy and unshakable loyalty make it a beloved pet.`, + }, + "doc13": { + "text": `He worked tirelessly through the night, always persistent in his efforts. + His industrious approach to problem-solving kept the project moving forward. + No matter how difficult the task, he remained focused, always giving his best. + His dedication paid off when the project was completed ahead of schedule.`, + }, + "doc14": { + "text": `The river flowed calmly through the valley, its peaceful current offering a sense of tranquility. + Fishermen relaxed by the banks, enjoying the calm waters that reflected the sky above. + The tranquil nature of the river made it a perfect spot for meditation. + As the day ended, the river's quiet flow brought a sense of peace.`, + }, + "doc15": { + "text": `After the fire, all that was left was the charred remains of what once was. + The seared walls of the house told a tragic story. + The intensity of the blaze had burned everything in its path, leaving only the smoldering wreckage behind. + The incinerated objects could not be salvaged, and the damage was beyond repair.`, + }, + "doc16": { + "text": `The devoted employee always went above and beyond to complete his tasks. + His steadfast commitment to the company made him a valuable team member. + He was reliable, never failing to meet deadlines. + His trustworthiness earned him the respect of his colleagues, and was considered an + ingenious expert in his field.`, + }, + "doc17": { + "text": `The city is vibrant, full of life and energy. + The dynamic pace of the streets reflects the diverse culture of its inhabitants. + People from all walks of life contribute to the energetic atmosphere. + The city's lively spirit can be felt in every corner, from the bustling markets to the lively festivals.`, + }, + "doc18": { + "text": `In a moment of uncertainty, he made a bold decision that would change his life forever. + It took courage and nerve to take the leap, but his bravery paid off. + The guts to face the unknown allowed him to achieve something remarkable. + Being an bright scholar, the skill he demonstrated inspired those around him.`, + }, + "doc19": { + "text": `Innovation is often born from necessity, and the lightbulb is a prime example. + Thomas Edison's invention changed the world, offering a new way to see the night. + The creativity involved in developing such a groundbreaking product sparked a wave of + novelty in the scientific community. This improvement in technology continues to shape the modern world. + He was a clever academic and a smart researcher.`, + }, + "doc20": { + "text": `The fiery volcano erupted with a force that shook the earth. Its radiant lava flowed down the sides, + illuminating the night sky. The intense heat from the eruption could be felt miles away, as the + glowing lava burned everything in its path. 
The fiery display was both terrifying and mesmerizing.`, + }, + } + + synonymDocuments := map[string]*SynonymDefinition{ + "synDoc1": { + Synonyms: []string{"hardworking", "industrious", "conscientious", "persistent", "focused", "devoted"}, + }, + "synDoc2": { + Synonyms: []string{"tranquil", "peaceful", "calm", "relaxed", "unruffled"}, + }, + "synDoc3": { + Synonyms: []string{"burned", "charred", "seared", "incinerated", "singed"}, + }, + "synDoc4": { + Synonyms: []string{"faithful", "steadfast", "devoted", "reliable", "trustworthy"}, + }, + "synDoc5": { + Synonyms: []string{"lively", "dynamic", "energetic", "vivid", "vibrating"}, + }, + "synDoc6": { + Synonyms: []string{"bravery", "valor", "guts", "nerve", "boldness"}, + }, + "synDoc7": { + Input: []string{"innovation"}, + Synonyms: []string{"invention", "creativity", "novelty", "improvement", "development"}, + }, + "synDoc8": { + Input: []string{"blazing"}, + Synonyms: []string{"intense", "radiant", "burning", "fiery", "glowing"}, + }, + "synDoc9": { + Input: []string{"fertile"}, + Synonyms: []string{"productive", "fruitful", "rich", "abundant", "plentiful"}, + }, + "synDoc10": { + Input: []string{"arid"}, + Synonyms: []string{"dry", "barren", "desolate", "parched", "unfertile"}, + }, + "synDoc11": { + Input: []string{"fox"}, + Synonyms: []string{"vulpine", "canine", "predator", "hunter", "pursuer"}, + }, + "synDoc12": { + Input: []string{"dog"}, + Synonyms: []string{"canine", "hound", "puppy", "pup", "companion"}, + }, + "synDoc13": { + Synonyms: []string{"researcher", "scientist", "scholar", "academic", "expert"}, + }, + "synDoc14": { + Synonyms: []string{"bright", "clever", "ingenious", "sharp", "astute", "smart"}, + }, + } + + // Combine both maps into a slice of map entries (as they both have similar structure) + var combinedDocIDs []string + for id := range synonymDocuments { + combinedDocIDs = append(combinedDocIDs, id) + } + for id := range documents { + combinedDocIDs = append(combinedDocIDs, id) + } + rand.Shuffle(len(combinedDocIDs), func(i, j int) { + combinedDocIDs[i], combinedDocIDs[j] = combinedDocIDs[j], combinedDocIDs[i] + }) + + // Function to create batches of 5 + createDocBatches := func(docs []string, batchSize int) [][]string { + var batches [][]string + for i := 0; i < len(docs); i += batchSize { + end := i + batchSize + if end > len(docs) { + end = len(docs) + } + batches = append(batches, docs[i:end]) + } + return batches + } + // Create batches of 5 documents + var batchSize = 5 + docBatches := createDocBatches(combinedDocIDs, batchSize) + if len(docBatches) == 0 { + t.Fatal("expected batches") + } + totalDocs := 0 + for _, batch := range docBatches { + totalDocs += len(batch) + } + if totalDocs != len(combinedDocIDs) { + t.Fatalf("expected %d documents, got %d", len(combinedDocIDs), totalDocs) + } + + var batches []*Batch + for _, docBatch := range docBatches { + batch := idx.NewBatch() + for _, docID := range docBatch { + if synDef, ok := synonymDocuments[docID]; ok { + err := batch.IndexSynonym(docID, synonymCollection, synDef) + if err != nil { + t.Fatal(err) + } + } else { + err := batch.Index(docID, documents[docID]) + if err != nil { + t.Fatal(err) + } + } + } + batches = append(batches, batch) + } + for _, batch := range batches { + err = idx.Batch(batch) + if err != nil { + t.Fatal(err) + } + } + + type testStruct struct { + query string + expectHits []string + } + + testQueries := []testStruct{ + { + query: `{ + "match": "hardworking employee", + "field": "text" + }`, + expectHits: []string{"doc1", 
"doc13", "doc16", "doc4", "doc7"}, + }, + { + query: `{ + "match": "Hardwork and industrius efforts bring lovely and tranqual moments, with a glazing blow of valour.", + "field": "text", + "fuzziness": "auto" + }`, + expectHits: []string{ + "doc1", "doc13", "doc14", "doc15", "doc16", + "doc17", "doc18", "doc2", "doc20", "doc3", + "doc4", "doc5", "doc6", "doc7", "doc8", "doc9", + }, + }, + { + query: `{ + "prefix": "in", + "field": "text" + }`, + expectHits: []string{ + "doc1", "doc11", "doc13", "doc15", "doc16", + "doc17", "doc18", "doc19", "doc2", "doc20", + "doc3", "doc4", "doc7", "doc8", + }, + }, + { + query: `{ + "prefix": "vivid", + "field": "text" + }`, + expectHits: []string{ + "doc17", "doc5", + }, + }, + { + query: `{ + "match_phrase": "smart academic", + "field": "text" + }`, + expectHits: []string{"doc16", "doc18", "doc19"}, + }, + { + query: `{ + "match_phrase": "smrat acedemic", + "field": "text", + "fuzziness": "auto" + }`, + expectHits: []string{"doc16", "doc18", "doc19"}, + }, + { + query: `{ + "wildcard": "br*", + "field": "text" + }`, + expectHits: []string{"doc11", "doc14", "doc16", "doc18", "doc19", "doc6", "doc8"}, + }, + } + + for _, dtq := range testQueries { + q, err := query.ParseQuery([]byte(dtq.query)) + if err != nil { + t.Fatal(err) + } + sr := NewSearchRequest(q) + sr.Highlight = NewHighlightWithStyle(ansi.Name) + sr.SortBy([]string{"_id"}) + sr.Fields = []string{"*"} + sr.Size = 30 + sr.Explain = true + + res, err := idx.Search(sr) + if err != nil { + t.Fatal(err) + } + if len(res.Hits) != len(dtq.expectHits) { + t.Fatalf("expected %d hits, got %d", len(dtq.expectHits), len(res.Hits)) + } + // sort the expected hits to match the order of the search results + sort.Strings(dtq.expectHits) + for i, hit := range res.Hits { + if hit.ID != dtq.expectHits[i] { + t.Fatalf("expected docID %s, got %s", dtq.expectHits[i], hit.ID) + } + } + } + +} From 91be4720983269c6ca81955a65403c711c8df90c Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Fri, 29 Nov 2024 15:33:34 +0530 Subject: [PATCH 09/35] remove regex optimization --- search/searcher/search_regexp.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/search/searcher/search_regexp.go b/search/searcher/search_regexp.go index 74caf0703..b88133e31 100644 --- a/search/searcher/search_regexp.go +++ b/search/searcher/search_regexp.go @@ -68,10 +68,6 @@ func NewRegexpStringSearcher(ctx context.Context, indexReader index.IndexReader, if err != nil { return nil, err } - // check if the candidateTerms are empty or have one term which is the term itself - if len(candidateTerms) == 0 || (len(candidateTerms) == 1 && candidateTerms[0] == pattern) { - return NewTermSearcher(ctx, indexReader, pattern, field, boost, options) - } return NewMultiTermSearcher(ctx, indexReader, candidateTerms, field, boost, options, true) From 9baf914f1b8fd6f4f530b9df779f7f07a3bdcddd Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Sat, 30 Nov 2024 23:25:37 +0530 Subject: [PATCH 10/35] add default synonym sources --- index_alias_impl.go | 5 +++++ mapping/document.go | 32 +++++++++++++++++++++++++++----- mapping/index.go | 26 +++++++++++++++++++++++++- 3 files changed, 57 insertions(+), 6 deletions(-) diff --git a/index_alias_impl.go b/index_alias_impl.go index 3c7cdcd32..73cc637f7 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -30,6 +30,7 @@ type indexAliasImpl struct { name string indexes []Index mutex sync.RWMutex + mapping mapping.IndexMapping open bool } @@ -360,6 +361,10 @@ func (i *indexAliasImpl) Mapping() 
mapping.IndexMapping {
 		return nil
 	}
 
+	if i.mapping != nil {
+		return i.mapping
+	}
+
 	err := i.isAliasToSingleIndex()
 	if err != nil {
 		return nil
diff --git a/mapping/document.go b/mapping/document.go
index 5d70af912..b470afaa4 100644
--- a/mapping/document.go
+++ b/mapping/document.go
@@ -40,11 +40,12 @@ import (
 // are used. To disable this automatic handling, set
 // Dynamic to false.
 type DocumentMapping struct {
-	Enabled         bool                        `json:"enabled"`
-	Dynamic         bool                        `json:"dynamic"`
-	Properties      map[string]*DocumentMapping `json:"properties,omitempty"`
-	Fields          []*FieldMapping             `json:"fields,omitempty"`
-	DefaultAnalyzer string                      `json:"default_analyzer,omitempty"`
+	Enabled              bool                        `json:"enabled"`
+	Dynamic              bool                        `json:"dynamic"`
+	Properties           map[string]*DocumentMapping `json:"properties,omitempty"`
+	Fields               []*FieldMapping             `json:"fields,omitempty"`
+	DefaultAnalyzer      string                      `json:"default_analyzer,omitempty"`
+	DefaultSynonymSource string                      `json:"default_synonym_source,omitempty"`
 
 	// StructTagKey overrides "json" when looking for field names in struct tags
 	StructTagKey string `json:"struct_tag_key,omitempty"`
@@ -306,6 +307,11 @@ func (dm *DocumentMapping) UnmarshalJSON(data []byte) error {
 			if err != nil {
 				return err
 			}
+		case "default_synonym_source":
+			err := util.UnmarshalJSON(v, &dm.DefaultSynonymSource)
+			if err != nil {
+				return err
+			}
 		case "properties":
 			err := util.UnmarshalJSON(v, &dm.Properties)
 			if err != nil {
@@ -349,6 +355,22 @@ func (dm *DocumentMapping) defaultAnalyzerName(path []string) string {
 	return rv
 }
 
+func (dm *DocumentMapping) defaultSynonymSource(path []string) string {
+	current := dm
+	rv := current.DefaultSynonymSource
+	for _, pathElement := range path {
+		var ok bool
+		current, ok = current.Properties[pathElement]
+		if !ok {
+			break
+		}
+		if current.DefaultSynonymSource != "" {
+			rv = current.DefaultSynonymSource
+		}
+	}
+	return rv
+}
+
 func (dm *DocumentMapping) walkDocument(data interface{}, path []string, indexes []uint64, context *walkContext) {
 	// allow default "json" tag to be overridden
 	structTagKey := dm.StructTagKey
diff --git a/mapping/index.go b/mapping/index.go
index 36b423108..feb7634a9 100644
--- a/mapping/index.go
+++ b/mapping/index.go
@@ -49,6 +49,7 @@ type IndexMappingImpl struct {
 	DefaultType           string `json:"default_type"`
 	DefaultAnalyzer       string `json:"default_analyzer"`
 	DefaultDateTimeParser string `json:"default_datetime_parser"`
+	DefaultSynonymSource  string `json:"default_synonym_source"`
 	DefaultField          string `json:"default_field"`
 	StoreDynamic          bool   `json:"store_dynamic"`
 	IndexDynamic          bool   `json:"index_dynamic"`
@@ -268,6 +269,11 @@ func (im *IndexMappingImpl) UnmarshalJSON(data []byte) error {
 			if err != nil {
 				return err
 			}
+		case "default_synonym_source":
+			err := util.UnmarshalJSON(v, &im.DefaultSynonymSource)
+			if err != nil {
+				return err
+			}
 		case "default_field":
 			err := util.UnmarshalJSON(v, &im.DefaultField)
 			if err != nil {
@@ -517,7 +523,25 @@ func (im *IndexMappingImpl) SynonymSourceForPath(path string) string {
 		}
 	}
 
-	return ""
+	// next we will try default synonym sources for the path
+	pathDecoded := decodePath(path)
+	for _, docMapping := range im.TypeMapping {
+		if docMapping.Enabled {
+			rv := docMapping.defaultSynonymSource(pathDecoded)
+			if rv != "" {
+				return rv
+			}
+		}
+	}
+	// now the default synonym source for the default mapping
+	if im.DefaultMapping.Enabled {
+		rv := im.DefaultMapping.defaultSynonymSource(pathDecoded)
+		if rv != "" {
+			return rv
+		}
+	}
+
+	return im.DefaultSynonymSource
 }
 
 func (im *IndexMappingImpl) SynonymCount() int { 
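
[Note, not part of the patch: a minimal sketch of the synonym source resolution order introduced above, using only APIs visible in this series (NewIndexMapping, AddSynonymSource, SynonymSourceForPath, and the new DefaultSynonymSource fields); the "english" source and "collection1" collection names are illustrative.]

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis/lang/en"
	"github.com/blevesearch/bleve/v2/mapping"
)

func main() {
	imap := mapping.NewIndexMapping()

	// register a synonym source named "english", backed by the
	// "collection1" synonym collection and the en analyzer
	imap.AddSynonymSource("english", "collection1", en.AnalyzerName)

	// a text field with no explicit SynonymSource of its own
	textField := mapping.NewTextFieldMapping()
	imap.DefaultMapping.AddFieldMappingsAt("text", textField)

	// resolution order for the "text" path after this patch:
	//   1. the field-level SynonymSource (unset here)
	//   2. the enclosing DocumentMapping's DefaultSynonymSource (unset here)
	//   3. the index-level DefaultSynonymSource (set below)
	imap.DefaultSynonymSource = "english"

	// expected to print "english" given the fallback chain above
	fmt.Println(imap.SynonymSourceForPath("text"))
}
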
From dd692bfc116d4ccd84742a8dd12f2ed8bd231405 Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Tue, 3 Dec 2024 14:22:37 +0530 Subject: [PATCH 11/35] refactor code --- search.go | 8 +++ search/query/query.go | 156 ++++++++++++++++++++++++------------------ search/util.go | 13 ++++ search_knn.go | 4 +- 4 files changed, 111 insertions(+), 70 deletions(-) diff --git a/search.go b/search.go index 7861d24b8..8734867e5 100644 --- a/search.go +++ b/search.go @@ -444,6 +444,9 @@ type SearchResult struct { MaxScore float64 `json:"max_score"` Took time.Duration `json:"took"` Facets search.FacetResults `json:"facets"` + // special fields that are applicable only for search + // results that are obtained from a presearch + PreSearchResults map[string]interface{} `json:"presearch_results,omitempty"` } func (sr *SearchResult) Size() int { @@ -589,3 +592,8 @@ func (r *SearchRequest) SortFunc() func(data sort.Interface) { return sort.Sort } + +func isMatchNoneQuery(q query.Query) bool { + _, ok := q.(*query.MatchNoneQuery) + return ok +} diff --git a/search/query/query.go b/search/query/query.go index c908bbc54..5e3259c11 100644 --- a/search/query/query.go +++ b/search/query/query.go @@ -93,6 +93,18 @@ func ParsePreSearchData(input []byte) (map[string]interface{}, error) { rv = make(map[string]interface{}) } rv[search.KnnPreSearchDataKey] = value + case search.SynonymPreSearchDataKey: + var value search.FieldTermSynonymMap + if v != nil { + err := util.UnmarshalJSON(v, &value) + if err != nil { + return nil, err + } + } + if rv == nil { + rv = make(map[string]interface{}) + } + rv[search.SynonymPreSearchDataKey] = value } } return rv, nil @@ -488,11 +500,11 @@ func ExtractSynonyms(ctx context.Context, m mapping.SynonymMapping, r index.Syno } case *FuzzyQuery: field, source := resolveFieldAndSource(q.FieldVal) - fuzziness := q.Fuzziness - if q.autoFuzzy { - fuzziness = searcher.GetAutoFuzziness(q.Term) - } if source != "" { + fuzziness := q.Fuzziness + if q.autoFuzzy { + fuzziness = searcher.GetAutoFuzziness(q.Term) + } return addFuzzySynonymsForTerm(ctx, source, field, q.Term, fuzziness, q.Prefix, r, rv) } case *MatchQuery, *MatchPhraseQuery: @@ -587,43 +599,49 @@ func ExtractSynonyms(ctx context.Context, m mapping.SynonymMapping, r index.Syno // adds their synonyms to the provided map. 
func addRegexpSynonymsForTerm(ctx context.Context, src, field, term string, r index.SynonymReader, rv search.FieldTermSynonymMap) (search.FieldTermSynonymMap, error) { - - if ir, ok := r.(index.IndexReaderRegexp); ok { - fieldDict, err := ir.FieldDictRegexp(field, term) - if err != nil { - return nil, err - } - defer func() { - if cerr := fieldDict.Close(); cerr != nil && err == nil { - err = cerr - } - }() - regexpTerms := []string{term} - tfd, err := fieldDict.Next() - for err == nil && tfd != nil { - regexpTerms = append(regexpTerms, tfd.Term) - tfd, err = fieldDict.Next() + // find the terms with this regexp + var ok bool + var ir index.IndexReaderRegexp + if ir, ok = r.(index.IndexReaderRegexp); !ok { + return addSynonymsForTerm(ctx, src, field, term, r, rv) + } + fieldDict, err := ir.FieldDictRegexp(field, term) + if err != nil { + return nil, err + } + defer func() { + if cerr := fieldDict.Close(); cerr != nil && err == nil { + err = cerr } + }() + regexpTerms := []string{term} + tfd, err := fieldDict.Next() + for err == nil && tfd != nil { + regexpTerms = append(regexpTerms, tfd.Term) + tfd, err = fieldDict.Next() + } + if err != nil { + return nil, err + } + for _, term := range regexpTerms { + rv, err = addSynonymsForTerm(ctx, src, field, term, r, rv) if err != nil { return nil, err } - for _, term := range regexpTerms { - rv, err = addSynonymsForTerm(ctx, src, field, term, r, rv) - if err != nil { - return nil, err - } - } - return rv, nil } - return nil, nil + return rv, nil } // addPrefixSynonymsForTerm finds all terms that match the given prefix and // adds their synonyms to the provided map. func addPrefixSynonymsForTerm(ctx context.Context, src, field, term string, r index.SynonymReader, rv search.FieldTermSynonymMap) (search.FieldTermSynonymMap, error) { - // find the terms with this prefix - fieldDict, err := r.FieldDictPrefix(field, []byte(term)) + var ok bool + var ir index.IndexReaderPrefix + if ir, ok = r.(index.IndexReaderPrefix); !ok { + return addSynonymsForTerm(ctx, src, field, term, r, rv) + } + fieldDict, err := ir.FieldDictPrefix(field, []byte(term)) if err != nil { return nil, err } @@ -654,51 +672,50 @@ func addPrefixSynonymsForTerm(ctx context.Context, src, field, term string, // given fuzziness and adds their synonyms to the provided map. 
func addFuzzySynonymsForTerm(ctx context.Context, src, field, term string, fuzziness, prefix int, r index.SynonymReader, rv search.FieldTermSynonymMap) (search.FieldTermSynonymMap, error) { - if fuzziness == 0 { + if fuzziness > searcher.MaxFuzziness { + return nil, fmt.Errorf("fuzziness exceeds max (%d)", searcher.MaxFuzziness) + } + if fuzziness < 0 { + return nil, fmt.Errorf("invalid fuzziness, negative") + } + var ok bool + var ir index.IndexReaderFuzzy + if ir, ok = r.(index.IndexReaderFuzzy); !ok || fuzziness == 0 { return addSynonymsForTerm(ctx, src, field, term, r, rv) } - if ir, ok := r.(index.IndexReaderFuzzy); ok { - if fuzziness > searcher.MaxFuzziness { - return nil, fmt.Errorf("fuzziness exceeds max (%d)", searcher.MaxFuzziness) + prefixTerm := "" + for i, r := range term { + if i < prefix { + prefixTerm += string(r) + } else { + break } - if fuzziness < 0 { - return nil, fmt.Errorf("invalid fuzziness, negative") - } - prefixTerm := "" - for i, r := range term { - if i < prefix { - prefixTerm += string(r) - } else { - break - } - } - fieldDict, err := ir.FieldDictFuzzy(field, term, fuzziness, prefixTerm) - if err != nil { - return nil, err - } - defer func() { - if cerr := fieldDict.Close(); cerr != nil && err == nil { - err = cerr - } - }() - fuzzyTerms := []string{term} - tfd, err := fieldDict.Next() - for err == nil && tfd != nil { - fuzzyTerms = append(fuzzyTerms, tfd.Term) - tfd, err = fieldDict.Next() + } + fieldDict, err := ir.FieldDictFuzzy(field, term, fuzziness, prefixTerm) + if err != nil { + return nil, err + } + defer func() { + if cerr := fieldDict.Close(); cerr != nil && err == nil { + err = cerr } + }() + fuzzyTerms := []string{term} + tfd, err := fieldDict.Next() + for err == nil && tfd != nil { + fuzzyTerms = append(fuzzyTerms, tfd.Term) + tfd, err = fieldDict.Next() + } + if err != nil { + return nil, err + } + for _, term := range fuzzyTerms { + rv, err = addSynonymsForTerm(ctx, src, field, term, r, rv) if err != nil { return nil, err } - for _, term := range fuzzyTerms { - rv, err = addSynonymsForTerm(ctx, src, field, term, r, rv) - if err != nil { - return nil, err - } - } - return rv, nil } - return nil, nil + return rv, nil } // addSynonymsForTerm finds synonyms for the given term and adds them to the @@ -722,6 +739,9 @@ func addSynonymsForTerm(ctx context.Context, src, field, term string, synonyms = append(synonyms, synonym) synonym, err = termReader.Next() } + if err != nil { + return nil, err + } if len(synonyms) > 0 { if rv == nil { rv = make(search.FieldTermSynonymMap) @@ -731,5 +751,5 @@ func addSynonymsForTerm(ctx context.Context, src, field, term string, } rv[field][term] = synonyms } - return rv, err + return rv, nil } diff --git a/search/util.go b/search/util.go index 0f16db19a..9f5a15cac 100644 --- a/search/util.go +++ b/search/util.go @@ -148,6 +148,19 @@ type SearcherEndCallbackFn func(size uint64) error // field -> term -> synonyms type FieldTermSynonymMap map[string]map[string][]string +func (f *FieldTermSynonymMap) MergeWith(fts FieldTermSynonymMap) { + for field, termSynonymMap := range fts { + // Ensure the field exists in the receiver + if _, exists := (*f)[field]; !exists { + (*f)[field] = make(map[string][]string) + } + for term, synonyms := range termSynonymMap { + // Append synonyms + (*f)[field][term] = append((*f)[field][term], synonyms...) 
+ } + } +} + const FieldTermSynonymMapKey = "_field_term_synonym_map_key" const SearcherStartCallbackKey = "_searcher_start_callback_key" diff --git a/search_knn.go b/search_knn.go index 309b36593..498770a3a 100644 --- a/search_knn.go +++ b/search_knn.go @@ -381,7 +381,7 @@ func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, rea continue } - if _, ok := filterQ.(*query.MatchNoneQuery); ok { + if isMatchNoneQuery(filterQ) { // Filtering required since no hits are eligible. requiresFiltering[idx] = true // a match none query just means none the documents are eligible @@ -559,7 +559,7 @@ func requestHasKNN(req *SearchRequest) bool { func isKNNrequestSatisfiedByPreSearch(req *SearchRequest) bool { // if req.Query is not match_none => then we need to go to phase 2 // to perform the actual query. - if _, ok := req.Query.(*query.MatchNoneQuery); !ok { + if !isMatchNoneQuery(req.Query) { return false } // req.Query is a match_none query From 4d4440eb9fafb0132256ac7efca7b3e75e3cac41 Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Tue, 3 Dec 2024 17:43:41 +0530 Subject: [PATCH 12/35] alias path code --- index_alias_impl.go | 134 +++++++++++++++++++++++++++++++++++--------- index_impl.go | 15 ++++- pre_search.go | 44 +++++++++++---- search.go | 2 +- search_knn.go | 35 ------------ search_no_knn.go | 2 +- 6 files changed, 158 insertions(+), 74 deletions(-) diff --git a/index_alias_impl.go b/index_alias_impl.go index 73cc637f7..e4aad1eb7 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -169,7 +169,11 @@ func (i *indexAliasImpl) SearchInContext(ctx context.Context, req *SearchRequest // indicates that this index alias is set as an Index // in another alias, so we need to do a preSearch search // and NOT a real search - return preSearchDataSearch(ctx, req, i.indexes...) + flags := &preSearchFlags{ + knn: requestHasKNN(req), + synonyms: !isMatchNoneQuery(req.Query), + } + return preSearchDataSearch(ctx, req, flags, i.indexes...) } // at this point we know we are doing a real search @@ -183,12 +187,10 @@ func (i *indexAliasImpl) SearchInContext(ctx context.Context, req *SearchRequest // if necessary var preSearchData map[string]map[string]interface{} if req.PreSearchData != nil { - if requestHasKNN(req) { - var err error - preSearchData, err = redistributeKNNPreSearchData(req, i.indexes) - if err != nil { - return nil, err - } + var err error + preSearchData, err = redistributePreSearchData(req, i.indexes) + if err != nil { + return nil, err } } @@ -209,9 +211,10 @@ func (i *indexAliasImpl) SearchInContext(ctx context.Context, req *SearchRequest // - the request requires preSearch var preSearchDuration time.Duration var sr *SearchResult - if req.PreSearchData == nil && preSearchRequired(req) { + flags := preSearchRequired(req, i.mapping) + if req.PreSearchData == nil && flags != nil { searchStart := time.Now() - preSearchResult, err := preSearch(ctx, req, i.indexes...) + preSearchResult, err := preSearch(ctx, req, flags, i.indexes...) 
if err != nil {
 			return nil, err
 		}
@@ -222,17 +225,17 @@ func (i *indexAliasImpl) SearchInContext(ctx context.Context, req *SearchRequest
 			return preSearchResult, nil
 		}
 		// finalize the preSearch result now
-		finalizePreSearchResult(req, preSearchResult)
+		finalizePreSearchResult(req, flags, preSearchResult)
 
 		// if there are no errors, then merge the data in the preSearch result
 		// and construct the preSearchData to be used in the actual search
 		// if the request is satisfied by the preSearch result, then we can
 		// directly return the preSearch result as the final result
-		if requestSatisfiedByPreSearch(req) {
+		if requestSatisfiedByPreSearch(req, flags) {
 			sr = finalizeSearchResult(req, preSearchResult)
 			// no need to run the 2nd phase MultiSearch(..)
 		} else {
-			preSearchData, err = constructPreSearchData(req, preSearchResult, i.indexes)
+			preSearchData, err = constructPreSearchData(req, flags, preSearchResult, i.indexes)
 			if err != nil {
 				return nil, err
 			}
@@ -507,6 +510,13 @@ func (i *indexAliasImpl) Swap(in, out []Index) {
 	}
 }
 
+func (i *indexAliasImpl) SetIndexMapping(m mapping.IndexMapping) {
+	i.mutex.Lock()
+	defer i.mutex.Unlock()
+
+	i.mapping = m
+}
+
 // createChildSearchRequest creates a separate
 // request from the original
 // For now, avoid data race on req structure.
@@ -525,21 +535,48 @@ type asyncSearchResult struct {
 	Err error
 }
 
-func preSearchRequired(req *SearchRequest) bool {
-	return requestHasKNN(req)
+// preSearchFlags is a struct to hold flags indicating why preSearch is required
+type preSearchFlags struct {
+	knn      bool
+	synonyms bool
+}
+
+// preSearchRequired checks if preSearch is required, returning a *preSearchFlags
+// that records why (nil if not required); the struct is only allocated if necessary
+func preSearchRequired(req *SearchRequest, m mapping.IndexMapping) *preSearchFlags {
+	// Check for KNN query
+	knn := requestHasKNN(req)
+	var synonyms bool
+	if !isMatchNoneQuery(req.Query) {
+		// Check if synonyms are defined in the mapping
+		if sm, ok := m.(mapping.SynonymMapping); ok && sm.SynonymCount() > 0 {
+			synonyms = true
+		}
+	}
+	if knn || synonyms {
+		return &preSearchFlags{
+			knn:      knn,
+			synonyms: synonyms,
+		}
+	}
+	return nil
 }
 
-func preSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*SearchResult, error) {
-	// create a dummy request with a match none query
-	// since we only care about the preSearchData in PreSearch
+func preSearch(ctx context.Context, req *SearchRequest, flags *preSearchFlags, indexes ...Index) (*SearchResult, error) {
+	var dummyQuery = req.Query
+	if !flags.synonyms {
+		// create a dummy request with a match none query
+		// since we only care about the preSearchData in PreSearch
+		dummyQuery = query.NewMatchNoneQuery()
+	}
 	dummyRequest := &SearchRequest{
-		Query: query.NewMatchNoneQuery(),
+		Query: dummyQuery,
 	}
 	newCtx := context.WithValue(ctx, search.PreSearchKey, true)
-	if requestHasKNN(req) {
+	if flags.knn {
 		addKnnToDummyRequest(dummyRequest, req)
 	}
-	return preSearchDataSearch(newCtx, dummyRequest, indexes...)
+	return preSearchDataSearch(newCtx, dummyRequest, flags, indexes...)
}
 
 // if the request is satisfied by just the preSearch result,
@@ -590,29 +627,74 @@ func finalizeSearchResult(req *SearchRequest, preSearchResult *SearchResult) *Se
 	return preSearchResult
 }
 
-func requestSatisfiedByPreSearch(req *SearchRequest) bool {
-	if requestHasKNN(req) && isKNNrequestSatisfiedByPreSearch(req) {
+func requestSatisfiedByPreSearch(req *SearchRequest, flags *preSearchFlags) bool {
+	// if the synonyms preSearch flag is set, the request can never be satisfied by
+	// the preSearch result alone: its synonym data only feeds the second search phase
+	if flags.synonyms {
+		return false
+	}
+	if flags.knn && isKNNrequestSatisfiedByPreSearch(req) {
 		return true
 	}
 	return false
 }
 
-func constructPreSearchData(req *SearchRequest, preSearchResult *SearchResult, indexes []Index) (map[string]map[string]interface{}, error) {
+func constructSynonymPreSearchData(rv map[string]map[string]interface{}, sr *SearchResult, indexes []Index) map[string]map[string]interface{} {
+	for _, index := range indexes {
+		rv[index.Name()][search.SynonymPreSearchDataKey] = sr.SynonymResult
+	}
+	return rv
+}
+
+func constructPreSearchData(req *SearchRequest, flags *preSearchFlags,
+	preSearchResult *SearchResult, indexes []Index) (map[string]map[string]interface{}, error) {
 	mergedOut := make(map[string]map[string]interface{}, len(indexes))
 	for _, index := range indexes {
 		mergedOut[index.Name()] = make(map[string]interface{})
 	}
 	var err error
-	if requestHasKNN(req) {
+	if flags.knn {
 		mergedOut, err = constructKnnPreSearchData(mergedOut, preSearchResult, indexes)
 		if err != nil {
 			return nil, err
 		}
 	}
+	if flags.synonyms {
+		mergedOut = constructSynonymPreSearchData(mergedOut, preSearchResult, indexes)
+	}
 	return mergedOut, nil
 }
 
-func preSearchDataSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*SearchResult, error) {
+func redistributePreSearchData(req *SearchRequest, indexes []Index) (map[string]map[string]interface{}, error) {
+	rv := make(map[string]map[string]interface{})
+	for _, index := range indexes {
+		rv[index.Name()] = make(map[string]interface{})
+	}
+	if knnHits, ok := req.PreSearchData[search.KnnPreSearchDataKey].([]*search.DocumentMatch); ok {
+		// the preSearchData for KNN is a list of DocumentMatch objects
+		// that need to be redistributed to the right index.
+		// This is used only in the case of an alias tree, where the indexes
+		// are at the leaves of the tree, and the master alias is at the root.
+		// At each level of the tree, the preSearchData needs to be redistributed
+		// to the indexes/aliases at that level. Because the preSearchData is
+		// specific to each final index at the leaf. 
+		segregatedKnnHits, err := validateAndDistributeKNNHits(knnHits, indexes)
+		if err != nil {
+			return nil, err
+		}
+		for _, index := range indexes {
+			rv[index.Name()][search.KnnPreSearchDataKey] = segregatedKnnHits[index.Name()]
+		}
+	}
+	if fts, ok := req.PreSearchData[search.SynonymPreSearchDataKey].(search.FieldTermSynonymMap); ok {
+		for _, index := range indexes {
+			rv[index.Name()][search.SynonymPreSearchDataKey] = fts
+		}
+	}
+	return rv, nil
+}
+
+func preSearchDataSearch(ctx context.Context, req *SearchRequest, flags *preSearchFlags, indexes ...Index) (*SearchResult, error) {
 	asyncResults := make(chan *asyncSearchResult, len(indexes))
 	// run search on each index in separate go routine
 	var waitGroup sync.WaitGroup
@@ -643,7 +725,7 @@ func preSearchDataSearch(ctx context.Context, req *SearchRequest, indexes ...Ind
 		if prp == nil {
 			// first valid preSearch result
 			// create a new preSearch result processor
-			prp = createPreSearchResultProcessor(req)
+			prp = createPreSearchResultProcessor(req, flags)
 		}
 		prp.add(asr.Result, asr.Name)
 		if sr == nil {
diff --git a/index_impl.go b/index_impl.go
index 3cce0a12c..c085b796a 100644
--- a/index_impl.go
+++ b/index_impl.go
@@ -450,12 +450,25 @@ func (i *indexImpl) preSearch(ctx context.Context, req *SearchRequest, reader in
 		}
 	}
 
+	var fts search.FieldTermSynonymMap
+	if !isMatchNoneQuery(req.Query) {
+		if synMap, ok := i.m.(mapping.SynonymMapping); ok {
+			if synReader, ok := reader.(index.SynonymReader); ok {
+				fts, err = query.ExtractSynonyms(ctx, synMap, synReader, req.Query, fts)
+				if err != nil {
+					return nil, err
+				}
+			}
+		}
+	}
+
 	return &SearchResult{
 		Status: &SearchStatus{
 			Total:      1,
 			Successful: 1,
 		},
-		Hits: knnHits,
+		Hits:          knnHits,
+		SynonymResult: fts,
 	}, nil
 }
 
diff --git a/pre_search.go b/pre_search.go
index a539afc35..59a9dc2e3 100644
--- a/pre_search.go
+++ b/pre_search.go
@@ -14,6 +14,10 @@
 
 package bleve
 
+import (
+	"github.com/blevesearch/bleve/v2/search"
+)
+
 // A preSearchResultProcessor processes the data in
 // the preSearch result from multiple
 // indexes in an alias and merges them together to
@@ -48,19 +52,33 @@ func (k *knnPreSearchResultProcessor) finalize(sr *SearchResult) {
 // -----------------------------------------------------------------------------
 // Synonym preSearchResultProcessor for handling Synonym presearch results
 type synonymPreSearchResultProcessor struct {
-	addFn      func(sr *SearchResult, indexName string)
-	finalizeFn func(sr *SearchResult)
+	finalizedFts search.FieldTermSynonymMap
+}
+
+func newSynonymPreSearchResultProcessor() *synonymPreSearchResultProcessor {
+	return &synonymPreSearchResultProcessor{}
 }
 
 func (s *synonymPreSearchResultProcessor) add(sr *SearchResult, indexName string) {
-	if s.addFn != nil {
-		s.addFn(sr, indexName)
+	// Nothing to merge if this result carries no synonym data
+	if sr.SynonymResult == nil {
+		return
+	}
+
+	// sr.SynonymResult is the FieldTermSynonymMap built by that index's preSearch
+
+	// Merge with finalizedFts or initialize it if nil
+	if s.finalizedFts == nil {
+		s.finalizedFts = sr.SynonymResult
+	} else {
+		s.finalizedFts.MergeWith(sr.SynonymResult)
 	}
 }
 
 func (s *synonymPreSearchResultProcessor) finalize(sr *SearchResult) {
-	if s.finalizeFn != nil {
-		s.finalizeFn(sr)
+	// Set the merged synonym data on the final search result
+	if s.finalizedFts != nil {
+		sr.SynonymResult = s.finalizedFts
 	}
 }
 
@@ -86,14 +104,20 @@ func (m *compositePreSearchResultProcessor) finalize(sr *SearchResult) {
 // -----------------------------------------------------------------------------
 // Function to 
create the appropriate preSearchResultProcessor(s) -func createPreSearchResultProcessor(req *SearchRequest) preSearchResultProcessor { +func createPreSearchResultProcessor(req *SearchRequest, flags *preSearchFlags) preSearchResultProcessor { var processors []preSearchResultProcessor // Add KNN processor if the request has KNN - if requestHasKNN(req) { + if flags.knn { if knnProcessor := newKnnPreSearchResultProcessor(req); knnProcessor != nil { processors = append(processors, knnProcessor) } } + // Add Synonym processor if the request has Synonym + if flags.synonyms { + if synonymProcessor := newSynonymPreSearchResultProcessor(); synonymProcessor != nil { + processors = append(processors, synonymProcessor) + } + } // Return based on the number of processors, optimizing for the common case of 1 processor // If there are no processors, return nil switch len(processors) { @@ -109,8 +133,8 @@ func createPreSearchResultProcessor(req *SearchRequest) preSearchResultProcessor } // ----------------------------------------------------------------------------- -func finalizePreSearchResult(req *SearchRequest, preSearchResult *SearchResult) { - if requestHasKNN(req) { +func finalizePreSearchResult(req *SearchRequest, flags *preSearchFlags, preSearchResult *SearchResult) { + if flags.knn { preSearchResult.Hits = finalizeKNNResults(req, preSearchResult.Hits) } } diff --git a/search.go b/search.go index 8734867e5..72bfca5e2 100644 --- a/search.go +++ b/search.go @@ -446,7 +446,7 @@ type SearchResult struct { Facets search.FacetResults `json:"facets"` // special fields that are applicable only for search // results that are obtained from a presearch - PreSearchResults map[string]interface{} `json:"presearch_results,omitempty"` + SynonymResult search.FieldTermSynonymMap `json:"synonym_result,omitempty"` } func (sr *SearchResult) Size() int { diff --git a/search_knn.go b/search_knn.go index 498770a3a..e5fd595d4 100644 --- a/search_knn.go +++ b/search_knn.go @@ -598,41 +598,6 @@ func addKnnToDummyRequest(dummyReq *SearchRequest, realReq *SearchRequest) { dummyReq.Sort = realReq.Sort } -// the preSearchData for KNN is a list of DocumentMatch objects -// that need to be redistributed to the right index. -// This is used only in the case of an alias tree, where the indexes -// are at the leaves of the tree, and the master alias is at the root. -// At each level of the tree, the preSearchData needs to be redistributed -// to the indexes/aliases at that level. Because the preSearchData is -// specific to each final index at the leaf. 
-func redistributeKNNPreSearchData(req *SearchRequest, indexes []Index) (map[string]map[string]interface{}, error) {
-	knnHits, ok := req.PreSearchData[search.KnnPreSearchDataKey].([]*search.DocumentMatch)
-	if !ok {
-		return nil, fmt.Errorf("request does not have knn preSearchData for redistribution")
-	}
-	segregatedKnnHits, err := validateAndDistributeKNNHits(knnHits, indexes)
-	if err != nil {
-		return nil, err
-	}
-
-	rv := make(map[string]map[string]interface{})
-	for _, index := range indexes {
-		rv[index.Name()] = make(map[string]interface{})
-	}
-
-	for _, index := range indexes {
-		for k, v := range req.PreSearchData {
-			switch k {
-			case search.KnnPreSearchDataKey:
-				rv[index.Name()][k] = segregatedKnnHits[index.Name()]
-			default:
-				rv[index.Name()][k] = v
-			}
-		}
-	}
-	return rv, nil
-}
-
 func newKnnPreSearchResultProcessor(req *SearchRequest) *knnPreSearchResultProcessor {
 	kArray := make([]int64, len(req.KNN))
 	for i, knnReq := range req.KNN {
diff --git a/search_no_knn.go b/search_no_knn.go
index bb72e15a9..c91980589 100644
--- a/search_no_knn.go
+++ b/search_no_knn.go
@@ -187,7 +187,7 @@ func requestHasKNN(req *SearchRequest) bool {
 func addKnnToDummyRequest(dummyReq *SearchRequest, realReq *SearchRequest) {
 }
 
-func redistributeKNNPreSearchData(req *SearchRequest, indexes []Index) (map[string]map[string]interface{}, error) {
+func validateAndDistributeKNNHits(knnHits []*search.DocumentMatch, indexes []Index) (map[string][]*search.DocumentMatch, error) {
 	return nil, nil
 }
 
From 7b865338225d482415590ad6ad9cdd6f2638cb8d Mon Sep 17 00:00:00 2001
From: CascadingRadium
Date: Fri, 6 Dec 2024 11:55:55 +0530
Subject: [PATCH 13/35] Presearch Code Refactor

- Refactor the presearch code path to make it more generic and extensible.
---
 index_alias_impl.go | 114 +++++++++++++++++++++++++++++++++++-------
 pre_search.go       |  45 ++++++++++++++---
 search.go           |   5 ++
 search_knn.go       |  39 +--------------
 search_no_knn.go    |   2 +-
 5 files changed, 142 insertions(+), 63 deletions(-)

diff --git a/index_alias_impl.go b/index_alias_impl.go
index 3c7cdcd32..1daae819b 100644
--- a/index_alias_impl.go
+++ b/index_alias_impl.go
@@ -31,6 +31,10 @@ type indexAliasImpl struct {
 	indexes []Index
 	mutex   sync.RWMutex
 	open    bool
+	// if all the indexes in the alias have the same mapping
+	// then the user can set the mapping here to avoid
+	// checking the mapping of each index in the alias
+	mapping mapping.IndexMapping
 }
 
 // NewIndexAlias creates a new IndexAlias over the provided
@@ -168,7 +172,10 @@ func (i *indexAliasImpl) SearchInContext(ctx context.Context, req *SearchRequest
 	// indicates that this index alias is set as an Index
 	// in another alias, so we need to do a preSearch search
 	// and NOT a real search
-	return preSearchDataSearch(ctx, req, i.indexes...)
+	flags := &preSearchFlags{
+		knn: requestHasKNN(req), // set knn flag if the request has KNN
+	}
+	return preSearchDataSearch(ctx, req, flags, i.indexes...) 
} // at this point we know we are doing a real search @@ -184,7 +191,7 @@ func (i *indexAliasImpl) SearchInContext(ctx context.Context, req *SearchRequest if req.PreSearchData != nil { if requestHasKNN(req) { var err error - preSearchData, err = redistributeKNNPreSearchData(req, i.indexes) + preSearchData, err = redistributePreSearchData(req, i.indexes) if err != nil { return nil, err } @@ -208,9 +215,10 @@ func (i *indexAliasImpl) SearchInContext(ctx context.Context, req *SearchRequest // - the request requires preSearch var preSearchDuration time.Duration var sr *SearchResult - if req.PreSearchData == nil && preSearchRequired(req) { + flags := preSearchRequired(req, i.mapping) + if req.PreSearchData == nil && flags != nil { searchStart := time.Now() - preSearchResult, err := preSearch(ctx, req, i.indexes...) + preSearchResult, err := preSearch(ctx, req, flags, i.indexes...) if err != nil { return nil, err } @@ -221,17 +229,17 @@ func (i *indexAliasImpl) SearchInContext(ctx context.Context, req *SearchRequest return preSearchResult, nil } // finalize the preSearch result now - finalizePreSearchResult(req, preSearchResult) + finalizePreSearchResult(req, flags, preSearchResult) // if there are no errors, then merge the data in the preSearch result // and construct the preSearchData to be used in the actual search // if the request is satisfied by the preSearch result, then we can // directly return the preSearch result as the final result - if requestSatisfiedByPreSearch(req) { + if requestSatisfiedByPreSearch(req, flags) { sr = finalizeSearchResult(req, preSearchResult) // no need to run the 2nd phase MultiSearch(..) } else { - preSearchData, err = constructPreSearchData(req, preSearchResult, i.indexes) + preSearchData, err = constructPreSearchData(req, flags, preSearchResult, i.indexes) if err != nil { return nil, err } @@ -352,6 +360,20 @@ func (i *indexAliasImpl) Close() error { return nil } +// SetIndexMapping sets the mapping for the alias and must be used +// ONLY when all the indexes in the alias have the same mapping. +// This is to avoid checking the mapping of each index in the alias +// when executing a search request. 
+func (i *indexAliasImpl) SetIndexMapping(m mapping.IndexMapping) error { + i.mutex.Lock() + defer i.mutex.Unlock() + if !i.open { + return ErrorIndexClosed + } + i.mapping = m + return nil +} + func (i *indexAliasImpl) Mapping() mapping.IndexMapping { i.mutex.RLock() defer i.mutex.RUnlock() @@ -360,6 +382,11 @@ func (i *indexAliasImpl) Mapping() mapping.IndexMapping { return nil } + // if the mapping is already set, return it + if i.mapping != nil { + return i.mapping + } + err := i.isAliasToSingleIndex() if err != nil { return nil @@ -520,21 +547,35 @@ type asyncSearchResult struct { Err error } -func preSearchRequired(req *SearchRequest) bool { - return requestHasKNN(req) +// preSearchFlags is a struct to hold flags indicating why preSearch is required +type preSearchFlags struct { + knn bool +} + +// preSearchRequired checks if preSearch is required and returns the presearch flags struct +// indicating which preSearch is required +func preSearchRequired(req *SearchRequest, m mapping.IndexMapping) *preSearchFlags { + // Check for KNN query + knn := requestHasKNN(req) + if knn { + return &preSearchFlags{ + knn: knn, + } + } + return nil } -func preSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*SearchResult, error) { +func preSearch(ctx context.Context, req *SearchRequest, flags *preSearchFlags, indexes ...Index) (*SearchResult, error) { // create a dummy request with a match none query // since we only care about the preSearchData in PreSearch dummyRequest := &SearchRequest{ Query: query.NewMatchNoneQuery(), } newCtx := context.WithValue(ctx, search.PreSearchKey, true) - if requestHasKNN(req) { + if flags.knn { addKnnToDummyRequest(dummyRequest, req) } - return preSearchDataSearch(newCtx, dummyRequest, indexes...) + return preSearchDataSearch(newCtx, dummyRequest, flags, indexes...) 
} // if the request is satisfied by just the preSearch result, @@ -585,20 +626,20 @@ func finalizeSearchResult(req *SearchRequest, preSearchResult *SearchResult) *Se return preSearchResult } -func requestSatisfiedByPreSearch(req *SearchRequest) bool { - if requestHasKNN(req) && isKNNrequestSatisfiedByPreSearch(req) { +func requestSatisfiedByPreSearch(req *SearchRequest, flags *preSearchFlags) bool { + if flags.knn && isKNNrequestSatisfiedByPreSearch(req) { return true } return false } -func constructPreSearchData(req *SearchRequest, preSearchResult *SearchResult, indexes []Index) (map[string]map[string]interface{}, error) { +func constructPreSearchData(req *SearchRequest, flags *preSearchFlags, preSearchResult *SearchResult, indexes []Index) (map[string]map[string]interface{}, error) { mergedOut := make(map[string]map[string]interface{}, len(indexes)) for _, index := range indexes { mergedOut[index.Name()] = make(map[string]interface{}) } var err error - if requestHasKNN(req) { + if flags.knn { mergedOut, err = constructKnnPreSearchData(mergedOut, preSearchResult, indexes) if err != nil { return nil, err @@ -607,7 +648,7 @@ func constructPreSearchData(req *SearchRequest, preSearchResult *SearchResult, i return mergedOut, nil } -func preSearchDataSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*SearchResult, error) { +func preSearchDataSearch(ctx context.Context, req *SearchRequest, flags *preSearchFlags, indexes ...Index) (*SearchResult, error) { asyncResults := make(chan *asyncSearchResult, len(indexes)) // run search on each index in separate go routine var waitGroup sync.WaitGroup @@ -638,7 +679,7 @@ func preSearchDataSearch(ctx context.Context, req *SearchRequest, indexes ...Ind if prp == nil { // first valid preSearch result // create a new preSearch result processor - prp = createPreSearchResultProcessor(req) + prp = createPreSearchResultProcessor(req, flags) } prp.add(asr.Result, asr.Name) if sr == nil { @@ -684,6 +725,43 @@ func preSearchDataSearch(ctx context.Context, req *SearchRequest, indexes ...Ind return sr, nil } +// redistributePreSearchData redistributes the preSearchData coming from the individual constituent index aliases +// of an alias to the individual indexes in the alias. This is necessary when the preSearchData is specific to each +// index in the alias. This is used only in the case of an alias tree, where the indexes are at the leaves of the tree, +// and the master alias is at the root. At each level of the tree, the preSearchData needs to be redistributed to the +// indexes/aliases at that level. Because the preSearchData is specific to each final index at the leaf. +func redistributePreSearchData(req *SearchRequest, indexes []Index) (map[string]map[string]interface{}, error) { + rv := make(map[string]map[string]interface{}) + for _, index := range indexes { + rv[index.Name()] = make(map[string]interface{}) + } + if knnHits, ok := req.PreSearchData[search.KnnPreSearchDataKey].([]*search.DocumentMatch); ok { + // the preSearchData for KNN is a list of DocumentMatch objects + // that need to be redistributed to the right index. + // This is used only in the case of an alias tree, where the indexes + // are at the leaves of the tree, and the master alias is at the root. + // At each level of the tree, the preSearchData needs to be redistributed + // to the indexes/aliases at that level. Because the preSearchData is + // specific to each final index at the leaf. 
+ segregatedKnnHits, err := validateAndDistributeKNNHits(knnHits, indexes) + if err != nil { + return nil, err + } + for _, index := range indexes { + rv[index.Name()][search.KnnPreSearchDataKey] = segregatedKnnHits[index.Name()] + } + } + return rv, nil +} + +// finalizePreSearchResult finalizes the preSearch result by applying the finalization steps +// specific to the preSearch flags +func finalizePreSearchResult(req *SearchRequest, flags *preSearchFlags, preSearchResult *SearchResult) { + if flags.knn { + preSearchResult.Hits = finalizeKNNResults(req, preSearchResult.Hits) + } +} + // hitsInCurrentPage returns the hits in the current page // using the From and Size parameters in the request func hitsInCurrentPage(req *SearchRequest, hits []*search.DocumentMatch) []*search.DocumentMatch { diff --git a/pre_search.go b/pre_search.go index c8c55bfbc..0992124d9 100644 --- a/pre_search.go +++ b/pre_search.go @@ -26,6 +26,8 @@ type preSearchResultProcessor interface { finalize(*SearchResult) } +// ----------------------------------------------------------------------------- +// KNN preSearchResultProcessor for handling KNN presearch results type knnPreSearchResultProcessor struct { addFn func(sr *SearchResult, indexName string) finalizeFn func(sr *SearchResult) @@ -44,16 +46,45 @@ func (k *knnPreSearchResultProcessor) finalize(sr *SearchResult) { } // ----------------------------------------------------------------------------- +// Master struct that can hold any number of presearch result processors +type compositePreSearchResultProcessor struct { + presearchResultProcessors []preSearchResultProcessor +} -func finalizePreSearchResult(req *SearchRequest, preSearchResult *SearchResult) { - if requestHasKNN(req) { - preSearchResult.Hits = finalizeKNNResults(req, preSearchResult.Hits) +// Implements the add method, which forwards to all the internal processors +func (m *compositePreSearchResultProcessor) add(sr *SearchResult, indexName string) { + for _, p := range m.presearchResultProcessors { + p.add(sr, indexName) } } -func createPreSearchResultProcessor(req *SearchRequest) preSearchResultProcessor { - if requestHasKNN(req) { - return newKnnPreSearchResultProcessor(req) +// Implements the finalize method, which forwards to all the internal processors +func (m *compositePreSearchResultProcessor) finalize(sr *SearchResult) { + for _, p := range m.presearchResultProcessors { + p.finalize(sr) + } +} + +// ----------------------------------------------------------------------------- +// Function to create the appropriate preSearchResultProcessor(s) +func createPreSearchResultProcessor(req *SearchRequest, flags *preSearchFlags) preSearchResultProcessor { + var processors []preSearchResultProcessor + // Add KNN processor if the request has KNN + if flags.knn { + if knnProcessor := newKnnPreSearchResultProcessor(req); knnProcessor != nil { + processors = append(processors, knnProcessor) + } + } + // Return based on the number of processors, optimizing for the common case of 1 processor + // If there are no processors, return nil + switch len(processors) { + case 0: + return nil + case 1: + return processors[0] + default: + return &compositePreSearchResultProcessor{ + presearchResultProcessors: processors, + } } - return &knnPreSearchResultProcessor{} // equivalent to nil } diff --git a/search.go b/search.go index 7861d24b8..402109e05 100644 --- a/search.go +++ b/search.go @@ -589,3 +589,8 @@ func (r *SearchRequest) SortFunc() func(data sort.Interface) { return sort.Sort } + +func isMatchNoneQuery(q 
query.Query) bool { + _, ok := q.(*query.MatchNoneQuery) + return ok +} diff --git a/search_knn.go b/search_knn.go index 309b36593..e5fd595d4 100644 --- a/search_knn.go +++ b/search_knn.go @@ -381,7 +381,7 @@ func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, rea continue } - if _, ok := filterQ.(*query.MatchNoneQuery); ok { + if isMatchNoneQuery(filterQ) { // Filtering required since no hits are eligible. requiresFiltering[idx] = true // a match none query just means none the documents are eligible @@ -559,7 +559,7 @@ func requestHasKNN(req *SearchRequest) bool { func isKNNrequestSatisfiedByPreSearch(req *SearchRequest) bool { // if req.Query is not match_none => then we need to go to phase 2 // to perform the actual query. - if _, ok := req.Query.(*query.MatchNoneQuery); !ok { + if !isMatchNoneQuery(req.Query) { return false } // req.Query is a match_none query @@ -598,41 +598,6 @@ func addKnnToDummyRequest(dummyReq *SearchRequest, realReq *SearchRequest) { dummyReq.Sort = realReq.Sort } -// the preSearchData for KNN is a list of DocumentMatch objects -// that need to be redistributed to the right index. -// This is used only in the case of an alias tree, where the indexes -// are at the leaves of the tree, and the master alias is at the root. -// At each level of the tree, the preSearchData needs to be redistributed -// to the indexes/aliases at that level. Because the preSearchData is -// specific to each final index at the leaf. -func redistributeKNNPreSearchData(req *SearchRequest, indexes []Index) (map[string]map[string]interface{}, error) { - knnHits, ok := req.PreSearchData[search.KnnPreSearchDataKey].([]*search.DocumentMatch) - if !ok { - return nil, fmt.Errorf("request does not have knn preSearchData for redistribution") - } - segregatedKnnHits, err := validateAndDistributeKNNHits(knnHits, indexes) - if err != nil { - return nil, err - } - - rv := make(map[string]map[string]interface{}) - for _, index := range indexes { - rv[index.Name()] = make(map[string]interface{}) - } - - for _, index := range indexes { - for k, v := range req.PreSearchData { - switch k { - case search.KnnPreSearchDataKey: - rv[index.Name()][k] = segregatedKnnHits[index.Name()] - default: - rv[index.Name()][k] = v - } - } - } - return rv, nil -} - func newKnnPreSearchResultProcessor(req *SearchRequest) *knnPreSearchResultProcessor { kArray := make([]int64, len(req.KNN)) for i, knnReq := range req.KNN { diff --git a/search_no_knn.go b/search_no_knn.go index bb72e15a9..c91980589 100644 --- a/search_no_knn.go +++ b/search_no_knn.go @@ -187,7 +187,7 @@ func requestHasKNN(req *SearchRequest) bool { func addKnnToDummyRequest(dummyReq *SearchRequest, realReq *SearchRequest) { } -func redistributeKNNPreSearchData(req *SearchRequest, indexes []Index) (map[string]map[string]interface{}, error) { +func validateAndDistributeKNNHits(knnHits []*search.DocumentMatch, indexes []Index) (map[string][]*search.DocumentMatch, error) { return nil, nil } From d58474f7f95c6f83cd312bd2b2c9c43baa263974 Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Fri, 6 Dec 2024 12:01:44 +0530 Subject: [PATCH 14/35] fix comment --- index_alias_impl.go | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/index_alias_impl.go b/index_alias_impl.go index 1daae819b..e2948ff53 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -725,11 +725,9 @@ func preSearchDataSearch(ctx context.Context, req *SearchRequest, flags *preSear return sr, nil } -// redistributePreSearchData redistributes the 
preSearchData coming from the individual constituent index aliases
-// of an alias to the individual indexes in the alias. This is necessary when the preSearchData is specific to each
-// index in the alias. This is used only in the case of an alias tree, where the indexes are at the leaves of the tree,
-// and the master alias is at the root. At each level of the tree, the preSearchData needs to be redistributed to the
-// indexes/aliases at that level. Because the preSearchData is specific to each final index at the leaf.
+// redistributePreSearchData redistributes the preSearchData sent in a search request to an index alias.
+// This arises in an alias tree: at each level of the tree, the alias splits the preSearchData it
+// receives among the indexes/aliases at that level, since the data is specific to each leaf index.
 func redistributePreSearchData(req *SearchRequest, indexes []Index) (map[string]map[string]interface{}, error) {
 	rv := make(map[string]map[string]interface{})
 	for _, index := range indexes {

From 3908df35e385f3d60ea10f3e8fbae3cd8d382872 Mon Sep 17 00:00:00 2001
From: CascadingRadium
Date: Fri, 6 Dec 2024 13:52:09 +0530
Subject: [PATCH 15/35] Add ExtractFields API with unit test

---
 search/query/query.go      |  38 ++++++
 search/query/query_test.go | 234 +++++++++++++++++++++++++++++++++++++
 2 files changed, 272 insertions(+)

diff --git a/search/query/query.go b/search/query/query.go
index d263a0e54..1f1131f30 100644
--- a/search/query/query.go
+++ b/search/query/query.go
@@ -423,3 +423,41 @@ func DumpQuery(m mapping.IndexMapping, query Query) (string, error) {
 	data, err := json.MarshalIndent(q, "", " ")
 	return string(data), err
 }
+
+// FieldSet represents a set of queried fields.
+type FieldSet map[string]struct{}
+
+// ExtractFields returns a set of fields referenced by the query.
+// The returned set may be nil if the query does not explicitly reference any field
+// and the DefaultSearchField is unset in the index mapping. 
+func ExtractFields(q Query, m mapping.IndexMapping, fs FieldSet) FieldSet { + if q == nil { + return fs + } + switch q := q.(type) { + case FieldableQuery: + f := q.Field() + if f == "" { + f = m.DefaultSearchField() + } + if f != "" { + if fs == nil { + fs = make(FieldSet) + } + fs[f] = struct{}{} + } + case *BooleanQuery: + for _, subq := range []Query{q.Must, q.Should, q.MustNot} { + fs = ExtractFields(subq, m, fs) + } + case *ConjunctionQuery: + for _, subq := range q.Conjuncts { + fs = ExtractFields(subq, m, fs) + } + case *DisjunctionQuery: + for _, subq := range q.Disjuncts { + fs = ExtractFields(subq, m, fs) + } + } + return fs +} diff --git a/search/query/query_test.go b/search/query/query_test.go index 0028c956b..870510de2 100644 --- a/search/query/query_test.go +++ b/search/query/query_test.go @@ -16,6 +16,7 @@ package query import ( "reflect" + "sort" "strings" "testing" "time" @@ -785,3 +786,236 @@ func TestParseEmptyQuery(t *testing.T) { t.Errorf("[2] Expected %#v, got %#v", expect, rv) } } + +func TestExtractFields(t *testing.T) { + testQueries := []struct { + query string + expFields []string + }{ + { + query: `{"term":"water","field":"desc"}`, + expFields: []string{"desc"}, + }, + { + query: `{ + "must": { + "conjuncts": [ + { + "match": "water", + "prefix_length": 0, + "fuzziness": 0 + } + ] + }, + "should": { + "disjuncts": [ + { + "match": "beer", + "prefix_length": 0, + "fuzziness": 0 + } + ], + "min": 0 + }, + "must_not": { + "disjuncts": [ + { + "match": "light", + "prefix_length": 0, + "fuzziness": 0 + } + ], + "min": 0 + } + }`, + expFields: []string{"_all"}, + }, + { + query: `{ + "must": { + "conjuncts": [ + { + "match": "water", + "prefix_length": 0, + "field": "desc", + "fuzziness": 0 + } + ] + }, + "should": { + "disjuncts": [ + { + "match": "beer", + "prefix_length": 0, + "field": "desc", + "fuzziness": 0 + } + ], + "min": 0 + }, + "must_not": { + "disjuncts": [ + { + "match": "light", + "prefix_length": 0, + "field": "genre", + "fuzziness": 0 + } + ], + "min": 0 + } + }`, + expFields: []string{"desc", "genre"}, + }, + { + query: ` + { + "conjuncts": [ + { + "conjuncts": [ + { + "conjuncts": [ + { + "conjuncts": [ + { + "field": "date", + "start": "2002-09-05T08:09:00Z", + "end": "2007-03-01T03:52:00Z", + "inclusive_start": true, + "inclusive_end": true + }, + { + "field": "number", + "min": 1260295, + "max": 3917314, + "inclusive_min": true, + "inclusive_max": true + } + ] + }, + { + "conjuncts": [ + { + "field": "date2", + "start": "2004-08-21T18:30:00Z", + "end": "2006-03-24T08:08:00Z", + "inclusive_start": true, + "inclusive_end": true + }, + { + "field": "number", + "min": 165449, + "max": 3847517, + "inclusive_min": true, + "inclusive_max": true + } + ] + } + ] + }, + { + "conjuncts": [ + { + "conjuncts": [ + { + "field": "date", + "start": "2004-09-02T22:15:00Z", + "end": "2008-06-22T15:06:00Z", + "inclusive_start": true, + "inclusive_end": true + }, + { + "field": "number2", + "min": 876843, + "max": 3363351, + "inclusive_min": true, + "inclusive_max": true + } + ] + }, + { + "conjuncts": [ + { + "field": "date", + "start": "2000-12-03T21:35:00Z", + "end": "2008-02-07T05:00:00Z", + "inclusive_start": true, + "inclusive_end": true + }, + { + "field": "number", + "min": 2021479, + "max": 4763404, + "inclusive_min": true, + "inclusive_max": true + } + ] + } + ] + } + ] + }, + { + "conjuncts": [ + { + "conjuncts": [ + { + "field": "date3", + "start": "2000-03-13T07:13:00Z", + "end": "2005-09-19T09:33:00Z", + "inclusive_start": true, + "inclusive_end": 
true + }, + { + "field": "number", + "min": 883125, + "max": 4817433, + "inclusive_min": true, + "inclusive_max": true + } + ] + }, + { + "conjuncts": [ + { + "field": "date", + "start": "2002-08-10T22:42:00Z", + "end": "2008-02-10T23:19:00Z", + "inclusive_start": true, + "inclusive_end": true + }, + { + "field": "number", + "min": 896115, + "max": 3897074, + "inclusive_min": true, + "inclusive_max": true + } + ] + } + ] + } + ] + }`, + expFields: []string{"date", "number", "date2", "number2", "date3"}, + }, + } + + m := mapping.NewIndexMapping() + for i, test := range testQueries { + q, err := ParseQuery([]byte(test.query)) + if err != nil { + t.Fatal(err) + } + fields := ExtractFields(q, m, nil) + var fieldsSlice []string + for k := range fields { + fieldsSlice = append(fieldsSlice, k) + } + sort.Strings(test.expFields) + sort.Strings(fieldsSlice) + if !reflect.DeepEqual(fieldsSlice, test.expFields) { + t.Errorf("Test %d: expected %v, got %v", i, test.expFields, fieldsSlice) + } + } +} From 31973e08f0f28b456f0249164b6dbb968e4d41ea Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Fri, 6 Dec 2024 13:59:39 +0530 Subject: [PATCH 16/35] bug fix --- index_alias_impl.go | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/index_alias_impl.go b/index_alias_impl.go index e2948ff53..04270e89d 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -189,12 +189,10 @@ func (i *indexAliasImpl) SearchInContext(ctx context.Context, req *SearchRequest // if necessary var preSearchData map[string]map[string]interface{} if req.PreSearchData != nil { - if requestHasKNN(req) { - var err error - preSearchData, err = redistributePreSearchData(req, i.indexes) - if err != nil { - return nil, err - } + var err error + preSearchData, err = redistributePreSearchData(req, i.indexes) + if err != nil { + return nil, err } } From 2130f3c96107079ac2f5f279f9dbce91c4778caa Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Mon, 9 Dec 2024 12:51:48 +0530 Subject: [PATCH 17/35] final fixes to alias query path --- index/scorch/snapshot_index.go | 145 ++++++++++++++++-- index/scorch/snapshot_index_thes.go | 107 +++++++++++++ index_impl.go | 4 +- search/levenshtein.go | 4 + search/levenshtein_test.go | 13 +- search/query/query.go | 206 +++++++++++--------------- search/searcher/search_fuzzy.go | 53 +++++-- search/searcher/search_regexp.go | 26 +++- search/searcher/search_term_prefix.go | 33 ++++- search_test.go | 98 +++++++++--- 10 files changed, 517 insertions(+), 172 deletions(-) create mode 100644 index/scorch/snapshot_index_thes.go diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 685f1c921..2c3ea5167 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -45,7 +45,7 @@ type asynchSegmentResult struct { index int docs *roaring.Bitmap - postings segment.PostingsList + thesItr segment.ThesaurusIterator err error } @@ -271,17 +271,32 @@ func (is *IndexSnapshot) FieldDictPrefix(field string, func (is *IndexSnapshot) FieldDictRegexp(field string, termRegex string) (index.FieldDict, error) { + fd, _, err := is.FieldDictRegexpAutomaton(field, termRegex) + return fd, err +} + +func (is *IndexSnapshot) FieldDictRegexpAutomaton(field string, + termRegex string) (index.FieldDict, index.RegexAutomaton, error) { + return is.fieldDictRegexp(field, termRegex) +} + +func (is *IndexSnapshot) fieldDictRegexp(field string, + termRegex string) (index.FieldDict, index.RegexAutomaton, error) { // TODO: potential optimization where the literal 
prefix represents the, // entire regexp, allowing us to use PrefixIterator(prefixTerm)? a, prefixBeg, prefixEnd, err := parseRegexp(termRegex) if err != nil { - return nil, err + return nil, nil, err } - return is.newIndexSnapshotFieldDict(field, func(is segment.TermDictionary) segment.DictionaryIterator { + fd, err := is.newIndexSnapshotFieldDict(field, func(is segment.TermDictionary) segment.DictionaryIterator { return is.AutomatonIterator(a, prefixBeg, prefixEnd) }, false) + if err != nil { + return nil, nil, err + } + return fd, a, nil } func (is *IndexSnapshot) getLevAutomaton(term string, @@ -296,20 +311,37 @@ func (is *IndexSnapshot) getLevAutomaton(term string, func (is *IndexSnapshot) FieldDictFuzzy(field string, term string, fuzziness int, prefix string) (index.FieldDict, error) { + fd, _, err := is.FieldDictFuzzyAutomaton(field, term, fuzziness, prefix) + return fd, err +} + +func (is *IndexSnapshot) FieldDictFuzzyAutomaton(field string, + term string, fuzziness int, prefix string) (index.FieldDict, index.FuzzyAutomaton, error) { + return is.fieldDictFuzzy(field, term, fuzziness, prefix) +} + +func (is *IndexSnapshot) fieldDictFuzzy(field string, + term string, fuzziness int, prefix string) (index.FieldDict, index.FuzzyAutomaton, error) { a, err := is.getLevAutomaton(term, uint8(fuzziness)) if err != nil { - return nil, err + return nil, nil, err + } + var fa index.FuzzyAutomaton + if vfa, ok := a.(vellum.FuzzyAutomaton); ok { + fa = vfa } - var prefixBeg, prefixEnd []byte if prefix != "" { prefixBeg = []byte(prefix) prefixEnd = calculateExclusiveEndFromPrefix(prefixBeg) } - - return is.newIndexSnapshotFieldDict(field, func(is segment.TermDictionary) segment.DictionaryIterator { + fd, err := is.newIndexSnapshotFieldDict(field, func(is segment.TermDictionary) segment.DictionaryIterator { return is.AutomatonIterator(a, prefixBeg, prefixEnd) }, false) + if err != nil { + return nil, nil, err + } + return fd, fa, nil } func (is *IndexSnapshot) FieldDictContains(field string) (index.FieldDictContains, error) { @@ -1020,7 +1052,7 @@ func (is *IndexSnapshot) SynonymTermReader(ctx context.Context, thesaurusName st if rv.thesauri == nil { rv.thesauri = make([]segment.Thesaurus, len(is.segment)) for i, s := range is.segment { - if synSeg, ok := s.segment.(segment.SynonymSegment); ok { + if synSeg, ok := s.segment.(segment.ThesaurusSegment); ok { thes, err := synSeg.Thesaurus(thesaurusName) if err != nil { return nil, err @@ -1031,7 +1063,7 @@ func (is *IndexSnapshot) SynonymTermReader(ctx context.Context, thesaurusName st } for i, s := range is.segment { - if _, ok := s.segment.(segment.SynonymSegment); ok { + if _, ok := s.segment.(segment.ThesaurusSegment); ok { pl, err := rv.thesauri[i].SynonymsList(term, s.deleted, rv.postings[i]) if err != nil { return nil, err @@ -1043,3 +1075,98 @@ func (is *IndexSnapshot) SynonymTermReader(ctx context.Context, thesaurusName st } return rv, nil } + +func (is *IndexSnapshot) newIndexSnapshotThesaurusKeys(name string, + makeItr func(i segment.Thesaurus) segment.ThesaurusIterator) (*IndexSnapshotThesaurusKeys, error) { + + results := make(chan *asynchSegmentResult, len(is.segment)) + var wg sync.WaitGroup + wg.Add(len(is.segment)) + for _, s := range is.segment { + go func(s *SegmentSnapshot) { + defer wg.Done() + if synSeg, ok := s.segment.(segment.ThesaurusSegment); ok { + thes, err := synSeg.Thesaurus(name) + if err != nil { + results <- &asynchSegmentResult{err: err} + } else { + results <- &asynchSegmentResult{thesItr: makeItr(thes)} + } + } + 
}(s) + } + // Close the channel after all goroutines complete + go func() { + wg.Wait() + close(results) + }() + + var err error + rv := &IndexSnapshotThesaurusKeys{ + snapshot: is, + cursors: make([]*segmentThesCursor, 0, len(is.segment)), + } + for asr := range results { + if asr.err != nil && err == nil { + err = asr.err + } else { + next, err2 := asr.thesItr.Next() + if err2 != nil && err == nil { + err = err2 + } + if next != nil { + rv.cursors = append(rv.cursors, &segmentThesCursor{ + itr: asr.thesItr, + curr: *next, + }) + } + } + } + // after ensuring we've read all items on channel + if err != nil { + return nil, err + } + + return rv, nil +} + +func (is *IndexSnapshot) ThesaurusKeys(name string) (index.ThesaurusKeys, error) { + return is.newIndexSnapshotThesaurusKeys(name, func(is segment.Thesaurus) segment.ThesaurusIterator { + return is.AutomatonIterator(nil, nil, nil) + }) +} + +func (is *IndexSnapshot) ThesaurusKeysFuzzy(name string, + term string, fuzziness int, prefix string) (index.ThesaurusKeys, error) { + a, err := is.getLevAutomaton(term, uint8(fuzziness)) + if err != nil { + return nil, err + } + var prefixBeg, prefixEnd []byte + if prefix != "" { + prefixBeg = []byte(prefix) + prefixEnd = calculateExclusiveEndFromPrefix(prefixBeg) + } + return is.newIndexSnapshotThesaurusKeys(name, func(is segment.Thesaurus) segment.ThesaurusIterator { + return is.AutomatonIterator(a, prefixBeg, prefixEnd) + }) +} + +func (is *IndexSnapshot) ThesaurusKeysPrefix(name string, + termPrefix []byte) (index.ThesaurusKeys, error) { + termPrefixEnd := calculateExclusiveEndFromPrefix(termPrefix) + return is.newIndexSnapshotThesaurusKeys(name, func(is segment.Thesaurus) segment.ThesaurusIterator { + return is.AutomatonIterator(nil, termPrefix, termPrefixEnd) + }) +} + +func (is *IndexSnapshot) ThesaurusKeysRegexp(name string, + termRegex string) (index.ThesaurusKeys, error) { + a, prefixBeg, prefixEnd, err := parseRegexp(termRegex) + if err != nil { + return nil, err + } + return is.newIndexSnapshotThesaurusKeys(name, func(is segment.Thesaurus) segment.ThesaurusIterator { + return is.AutomatonIterator(a, prefixBeg, prefixEnd) + }) +} diff --git a/index/scorch/snapshot_index_thes.go b/index/scorch/snapshot_index_thes.go new file mode 100644 index 000000000..6f3aae818 --- /dev/null +++ b/index/scorch/snapshot_index_thes.go @@ -0,0 +1,107 @@ +// Copyright (c) 2024 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
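The `ThesaurusKeys*` methods above deliberately mirror the existing `FieldDict*` family: each builds a merged iterator over the per-segment thesauri using an automaton (nil for full enumeration, a Levenshtein automaton for fuzzy, a compiled regex for regexp, or a bounded byte range for prefix). A hedged sketch of how a caller might walk keys with the prefix variant; the thesaurus name "en-thesaurus" is an assumption and error handling is abbreviated:

```go
package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/index/scorch"
)

// dumpThesaurusKeys prints every thesaurus key starting with the given
// prefix. The iteration contract matches the implementation that follows:
// Next returns a nil entry (with a nil error) once the merged iterator
// across all segments is exhausted.
func dumpThesaurusKeys(snap *scorch.IndexSnapshot, prefix []byte) error {
	keys, err := snap.ThesaurusKeysPrefix("en-thesaurus", prefix)
	if err != nil {
		return err
	}
	defer keys.Close()
	for {
		entry, err := keys.Next()
		if err != nil {
			return err
		}
		if entry == nil {
			return nil // iterator exhausted
		}
		fmt.Println(entry.Term)
	}
}
```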
+ +package scorch + +import ( + "container/heap" + + index "github.com/blevesearch/bleve_index_api" + segment "github.com/blevesearch/scorch_segment_api/v2" +) + +type segmentThesCursor struct { + thes segment.Thesaurus + itr segment.ThesaurusIterator + curr index.ThesaurusEntry +} + +type IndexSnapshotThesaurusKeys struct { + snapshot *IndexSnapshot + cursors []*segmentThesCursor + entry index.ThesaurusEntry +} + +func (i *IndexSnapshotThesaurusKeys) Len() int { return len(i.cursors) } +func (i *IndexSnapshotThesaurusKeys) Less(a, b int) bool { + return i.cursors[a].curr.Term < i.cursors[b].curr.Term +} +func (i *IndexSnapshotThesaurusKeys) Swap(a, b int) { + i.cursors[a], i.cursors[b] = i.cursors[b], i.cursors[a] +} + +func (i *IndexSnapshotThesaurusKeys) Push(x interface{}) { + i.cursors = append(i.cursors, x.(*segmentThesCursor)) +} + +func (i *IndexSnapshotThesaurusKeys) Pop() interface{} { + n := len(i.cursors) + x := i.cursors[n-1] + i.cursors = i.cursors[0 : n-1] + return x +} + +func (i *IndexSnapshotThesaurusKeys) Next() (*index.ThesaurusEntry, error) { + if len(i.cursors) == 0 { + return nil, nil + } + i.entry = i.cursors[0].curr + next, err := i.cursors[0].itr.Next() + if err != nil { + return nil, err + } + if next == nil { + // at end of this cursor, remove it + heap.Pop(i) + } else { + // modified heap, fix it + i.cursors[0].curr = *next + heap.Fix(i, 0) + } + // look for any other entries with the exact same term + for len(i.cursors) > 0 && i.cursors[0].curr.Term == i.entry.Term { + next, err := i.cursors[0].itr.Next() + if err != nil { + return nil, err + } + if next == nil { + // at end of this cursor, remove it + heap.Pop(i) + } else { + // modified heap, fix it + i.cursors[0].curr = *next + heap.Fix(i, 0) + } + } + + return &i.entry, nil +} + +func (i *IndexSnapshotThesaurusKeys) Close() error { + return nil +} + +func (i *IndexSnapshotThesaurusKeys) Contains(key []byte) (bool, error) { + if len(i.cursors) == 0 { + return false, nil + } + + for _, cursor := range i.cursors { + if found, _ := cursor.thes.Contains(key); found { + return true, nil + } + } + + return false, nil +} diff --git a/index_impl.go b/index_impl.go index c085b796a..650f1d68c 100644 --- a/index_impl.go +++ b/index_impl.go @@ -453,7 +453,7 @@ func (i *indexImpl) preSearch(ctx context.Context, req *SearchRequest, reader in var fts search.FieldTermSynonymMap if !isMatchNoneQuery(req.Query) { if synMap, ok := i.m.(mapping.SynonymMapping); ok { - if synReader, ok := reader.(index.SynonymReader); ok { + if synReader, ok := reader.(index.ThesaurusReader); ok { fts, err = query.ExtractSynonyms(ctx, synMap, synReader, req.Query, fts) if err != nil { return nil, err @@ -550,7 +550,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr if fts == nil { if synMap, ok := i.m.(mapping.SynonymMapping); ok && synMap.SynonymCount() > 0 { - if synReader, ok := indexReader.(index.SynonymReader); ok { + if synReader, ok := indexReader.(index.ThesaurusReader); ok { fts, err = query.ExtractSynonyms(ctx, synMap, synReader, req.Query, fts) if err != nil { return nil, err diff --git a/search/levenshtein.go b/search/levenshtein.go index 687608d3f..dadab2521 100644 --- a/search/levenshtein.go +++ b/search/levenshtein.go @@ -68,6 +68,10 @@ func LevenshteinDistanceMaxReuseSlice(a, b string, max int, d []int) (int, bool, ld := int(math.Abs(float64(la - lb))) if ld > max { return max, true, d + } else if la == 0 || lb == 0 { + // if one string of the two strings is empty, then ld is + // the length of 
the other string and as such is <= max + return ld, false, d } if cap(d) < la+1 { diff --git a/search/levenshtein_test.go b/search/levenshtein_test.go index 651f7803c..ef23980ef 100644 --- a/search/levenshtein_test.go +++ b/search/levenshtein_test.go @@ -69,12 +69,19 @@ func TestLevenshteinDistanceMax(t *testing.T) { exceeded: true, }, { - a: "water", + a: "", b: "water", - max: 1, - dist: 0, + max: 10, + dist: 5, exceeded: false, }, + { + a: "water", + b: "", + max: 3, + dist: 3, + exceeded: true, + }, } for _, test := range tests { diff --git a/search/query/query.go b/search/query/query.go index 5e3259c11..dcb0e1583 100644 --- a/search/query/query.go +++ b/search/query/query.go @@ -439,17 +439,24 @@ func DumpQuery(m mapping.IndexMapping, query Query) (string, error) { return string(data), err } +const ( + FuzzyMatchType = iota + RegexpMatchType + PrefixMatchType +) + // ExtractSynonyms extracts synonyms from the query tree and returns a map of // field-term pairs to their synonyms. The input query tree is traversed and // for each term query, the synonyms are extracted from the synonym source // associated with the field. The synonyms are then added to the provided map. // The map is returned and may be nil if no synonyms were found. -func ExtractSynonyms(ctx context.Context, m mapping.SynonymMapping, r index.SynonymReader, +func ExtractSynonyms(ctx context.Context, m mapping.SynonymMapping, r index.ThesaurusReader, query Query, rv search.FieldTermSynonymMap) (search.FieldTermSynonymMap, error) { if r == nil || m == nil || query == nil { return rv, nil } + var err error resolveFieldAndSource := func(field string) (string, string) { if field == "" { field = m.DefaultSearchField() @@ -468,7 +475,6 @@ func ExtractSynonyms(ctx context.Context, m mapping.SynonymMapping, r index.Syno } switch q := query.(type) { case *BooleanQuery: - var err error rv, err = ExtractSynonyms(ctx, m, r, q.Must, rv) if err != nil { return nil, err @@ -481,10 +487,8 @@ func ExtractSynonyms(ctx context.Context, m mapping.SynonymMapping, r index.Syno if err != nil { return nil, err } - return rv, nil case *ConjunctionQuery: for _, child := range q.Conjuncts { - var err error rv, err = ExtractSynonyms(ctx, m, r, child, rv) if err != nil { return nil, err @@ -492,7 +496,6 @@ func ExtractSynonyms(ctx context.Context, m mapping.SynonymMapping, r index.Syno } case *DisjunctionQuery: for _, child := range q.Disjuncts { - var err error rv, err = ExtractSynonyms(ctx, m, r, child, rv) if err != nil { return nil, err @@ -505,7 +508,10 @@ func ExtractSynonyms(ctx context.Context, m mapping.SynonymMapping, r index.Syno if q.autoFuzzy { fuzziness = searcher.GetAutoFuzziness(q.Term) } - return addFuzzySynonymsForTerm(ctx, source, field, q.Term, fuzziness, q.Prefix, r, rv) + rv, err = addSynonymsForTermWithMatchType(ctx, FuzzyMatchType, source, field, q.Term, fuzziness, q.Prefix, r, rv) + if err != nil { + return nil, err + } } case *MatchQuery, *MatchPhraseQuery: var analyzerName, matchString, fieldVal string @@ -527,7 +533,7 @@ func ExtractSynonyms(ctx context.Context, m mapping.SynonymMapping, r index.Syno if autoFuzzy { fuzziness = searcher.GetAutoFuzziness(string(token.Term)) } - rv, err = addFuzzySynonymsForTerm(ctx, source, field, string(token.Term), fuzziness, prefix, r, rv) + rv, err = addSynonymsForTermWithMatchType(ctx, FuzzyMatchType, source, field, string(token.Term), fuzziness, prefix, r, rv) if err != nil { return nil, err } @@ -553,164 +559,123 @@ func ExtractSynonyms(ctx context.Context, m mapping.SynonymMapping, r 
index.Syno terms = pq.Terms } for _, term := range terms { - var err error if autoFuzzy { fuzziness = searcher.GetAutoFuzziness(term) } - rv, err = addFuzzySynonymsForTerm(ctx, source, field, term, fuzziness, 0, r, rv) + rv, err = addSynonymsForTermWithMatchType(ctx, FuzzyMatchType, source, field, term, fuzziness, 0, r, rv) if err != nil { return nil, err } } - return rv, nil } case *PrefixQuery: field, source := resolveFieldAndSource(q.FieldVal) if source != "" { - return addPrefixSynonymsForTerm(ctx, source, field, q.Prefix, r, rv) + rv, err = addSynonymsForTermWithMatchType(ctx, PrefixMatchType, source, field, q.Prefix, 0, 0, r, rv) + if err != nil { + return nil, err + } } case *QueryStringQuery: expanded, err := expandQuery(m, q) if err != nil { return nil, err } - return ExtractSynonyms(ctx, m, r, expanded, rv) - case *RegexpQuery: + rv, err = ExtractSynonyms(ctx, m, r, expanded, rv) + if err != nil { + return nil, err + } + case *TermQuery: field, source := resolveFieldAndSource(q.FieldVal) if source != "" { - return addRegexpSynonymsForTerm(ctx, source, field, strings.TrimPrefix(q.Regexp, "^"), r, rv) + rv, err = addSynonymsForTerm(ctx, source, field, q.Term, r, rv) + if err != nil { + return nil, err + } } - case *TermQuery: + case *RegexpQuery: field, source := resolveFieldAndSource(q.FieldVal) if source != "" { - return addSynonymsForTerm(ctx, source, field, q.Term, r, rv) + rv, err = addSynonymsForTermWithMatchType(ctx, RegexpMatchType, source, field, strings.TrimPrefix(q.Regexp, "^"), 0, 0, r, rv) + if err != nil { + return nil, err + } } case *WildcardQuery: field, source := resolveFieldAndSource(q.FieldVal) if source != "" { - regexpString := wildcardRegexpReplacer.Replace(q.Wildcard) - return addRegexpSynonymsForTerm(ctx, source, field, regexpString, r, rv) + rv, err = addSynonymsForTermWithMatchType(ctx, RegexpMatchType, source, field, wildcardRegexpReplacer.Replace(q.Wildcard), 0, 0, r, rv) + if err != nil { + return nil, err + } } } return rv, nil } -// addRegexpSynonymsForTerm finds all terms that match the given regexp and -// adds their synonyms to the provided map. -func addRegexpSynonymsForTerm(ctx context.Context, src, field, term string, - r index.SynonymReader, rv search.FieldTermSynonymMap) (search.FieldTermSynonymMap, error) { - // find the terms with this regexp - var ok bool - var ir index.IndexReaderRegexp - if ir, ok = r.(index.IndexReaderRegexp); !ok { - return addSynonymsForTerm(ctx, src, field, term, r, rv) - } - fieldDict, err := ir.FieldDictRegexp(field, term) - if err != nil { - return nil, err - } - defer func() { - if cerr := fieldDict.Close(); cerr != nil && err == nil { - err = cerr - } - }() - regexpTerms := []string{term} - tfd, err := fieldDict.Next() - for err == nil && tfd != nil { - regexpTerms = append(regexpTerms, tfd.Term) - tfd, err = fieldDict.Next() - } - if err != nil { - return nil, err - } - for _, term := range regexpTerms { - rv, err = addSynonymsForTerm(ctx, src, field, term, r, rv) - if err != nil { - return nil, err +// addFuzzySynonymsForTerm finds all terms that match the given term with the +// given fuzziness and adds their synonyms to the provided map. 
+func addSynonymsForTermWithMatchType(ctx context.Context, matchType int, src, field, term string, fuzziness, prefix int, + r index.ThesaurusReader, rv search.FieldTermSynonymMap) (search.FieldTermSynonymMap, error) { + // Determine the terms based on the match type (fuzzy, prefix, or regexp) + var thesKeys index.ThesaurusKeys + var err error + var terms []string + switch matchType { + case FuzzyMatchType: + // Ensure valid fuzziness + if fuzziness == 0 { + rv, err = addSynonymsForTerm(ctx, src, field, term, r, rv) + if err != nil { + return nil, err + } + return rv, nil } - } - return rv, nil -} - -// addPrefixSynonymsForTerm finds all terms that match the given prefix and -// adds their synonyms to the provided map. -func addPrefixSynonymsForTerm(ctx context.Context, src, field, term string, - r index.SynonymReader, rv search.FieldTermSynonymMap) (search.FieldTermSynonymMap, error) { - var ok bool - var ir index.IndexReaderPrefix - if ir, ok = r.(index.IndexReaderPrefix); !ok { - return addSynonymsForTerm(ctx, src, field, term, r, rv) - } - fieldDict, err := ir.FieldDictPrefix(field, []byte(term)) - if err != nil { - return nil, err - } - defer func() { - if cerr := fieldDict.Close(); cerr != nil && err == nil { - err = cerr + if fuzziness > searcher.MaxFuzziness { + return nil, fmt.Errorf("fuzziness exceeds max (%d)", searcher.MaxFuzziness) } - }() - prefixTerms := []string{term} - tfd, err := fieldDict.Next() - for err == nil && tfd != nil { - prefixTerms = append(prefixTerms, tfd.Term) - tfd, err = fieldDict.Next() - } - if err != nil { - return nil, err - } - for _, term := range prefixTerms { - rv, err = addSynonymsForTerm(ctx, src, field, term, r, rv) - if err != nil { - return nil, err + if fuzziness < 0 { + return nil, fmt.Errorf("invalid fuzziness, negative") } - } - return rv, nil -} - -// addFuzzySynonymsForTerm finds all terms that match the given term with the -// given fuzziness and adds their synonyms to the provided map. 
-func addFuzzySynonymsForTerm(ctx context.Context, src, field, term string, fuzziness, prefix int, - r index.SynonymReader, rv search.FieldTermSynonymMap) (search.FieldTermSynonymMap, error) { - if fuzziness > searcher.MaxFuzziness { - return nil, fmt.Errorf("fuzziness exceeds max (%d)", searcher.MaxFuzziness) - } - if fuzziness < 0 { - return nil, fmt.Errorf("invalid fuzziness, negative") - } - var ok bool - var ir index.IndexReaderFuzzy - if ir, ok = r.(index.IndexReaderFuzzy); !ok || fuzziness == 0 { - return addSynonymsForTerm(ctx, src, field, term, r, rv) - } - prefixTerm := "" - for i, r := range term { - if i < prefix { - prefixTerm += string(r) - } else { - break + // Handle fuzzy match + prefixTerm := "" + for i, r := range term { + if i < prefix { + prefixTerm += string(r) + } else { + break + } } + thesKeys, err = r.ThesaurusKeysFuzzy(src, term, fuzziness, prefixTerm) + case RegexpMatchType: + // Handle regexp match + thesKeys, err = r.ThesaurusKeysRegexp(src, term) + case PrefixMatchType: + // Handle prefix match + thesKeys, err = r.ThesaurusKeysPrefix(src, []byte(term)) + default: + return nil, fmt.Errorf("invalid match type: %d", matchType) } - fieldDict, err := ir.FieldDictFuzzy(field, term, fuzziness, prefixTerm) if err != nil { return nil, err } defer func() { - if cerr := fieldDict.Close(); cerr != nil && err == nil { + if cerr := thesKeys.Close(); cerr != nil && err == nil { err = cerr } }() - fuzzyTerms := []string{term} - tfd, err := fieldDict.Next() + // Collect the matching terms + terms = []string{} + tfd, err := thesKeys.Next() for err == nil && tfd != nil { - fuzzyTerms = append(fuzzyTerms, tfd.Term) - tfd, err = fieldDict.Next() + terms = append(terms, tfd.Term) + tfd, err = thesKeys.Next() } if err != nil { return nil, err } - for _, term := range fuzzyTerms { - rv, err = addSynonymsForTerm(ctx, src, field, term, r, rv) + for _, synTerm := range terms { + rv, err = addSynonymsForTerm(ctx, src, field, synTerm, r, rv) if err != nil { return nil, err } @@ -718,13 +683,10 @@ func addFuzzySynonymsForTerm(ctx context.Context, src, field, term string, fuzzi return rv, nil } -// addSynonymsForTerm finds synonyms for the given term and adds them to the -// provided map. 
func addSynonymsForTerm(ctx context.Context, src, field, term string, - r index.SynonymReader, rv search.FieldTermSynonymMap) (search.FieldTermSynonymMap, error) { + r index.ThesaurusReader, rv search.FieldTermSynonymMap) (search.FieldTermSynonymMap, error) { - termBytes := []byte(term) - termReader, err := r.SynonymTermReader(ctx, src, termBytes) + termReader, err := r.SynonymTermReader(ctx, src, []byte(term)) if err != nil { return nil, err } diff --git a/search/searcher/search_fuzzy.go b/search/searcher/search_fuzzy.go index d0e5c1ec5..187486efc 100644 --- a/search/searcher/search_fuzzy.go +++ b/search/searcher/search_fuzzy.go @@ -17,6 +17,7 @@ package searcher import ( "context" "fmt" + "strings" "github.com/blevesearch/bleve/v2/search" index "github.com/blevesearch/bleve_index_api" @@ -73,7 +74,7 @@ func NewFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term s break } } - fuzzyCandidates, err := findFuzzyCandidateTerms(indexReader, term, fuzziness, + fuzzyCandidates, err := findFuzzyCandidateTerms(ctx, indexReader, term, fuzziness, field, prefixTerm) if err != nil { return nil, err @@ -144,7 +145,7 @@ func reportIOStats(ctx context.Context, bytesRead uint64) { } } -func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, +func findFuzzyCandidateTerms(ctx context.Context, indexReader index.IndexReader, term string, fuzziness int, field, prefixTerm string) (rv *fuzzyCandidates, err error) { rv = &fuzzyCandidates{ candidates: make([]string, 0), @@ -155,7 +156,19 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, // the levenshtein automaton based iterator to collect the // candidate terms if ir, ok := indexReader.(index.IndexReaderFuzzy); ok { - fieldDict, err := ir.FieldDictFuzzy(field, term, fuzziness, prefixTerm) + termSet := make(map[string]struct{}) + addCandidateTerm := func(term string, editDistance uint8) error { + if _, exists := termSet[term]; !exists { + termSet[term] = struct{}{} + rv.candidates = append(rv.candidates, term) + rv.editDistances = append(rv.editDistances, editDistance) + if tooManyClauses(len(rv.candidates)) { + return tooManyClausesErr(field, len(rv.candidates)) + } + } + return nil + } + fieldDict, a, err := ir.FieldDictFuzzyAutomaton(field, term, fuzziness, prefixTerm) if err != nil { return nil, err } @@ -166,16 +179,38 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, }() tfd, err := fieldDict.Next() for err == nil && tfd != nil { - rv.candidates = append(rv.candidates, tfd.Term) - rv.editDistances = append(rv.editDistances, tfd.EditDistance) - if tooManyClauses(len(rv.candidates)) { - return nil, tooManyClausesErr(field, len(rv.candidates)) + err = addCandidateTerm(tfd.Term, tfd.EditDistance) + if err != nil { + return nil, err } tfd, err = fieldDict.Next() } - + if err != nil { + return nil, err + } + if ctx != nil { + if fts, ok := ctx.Value(search.FieldTermSynonymMapKey).(search.FieldTermSynonymMap); ok { + if ts, exists := fts[field]; exists { + for term := range ts { + if _, exists := termSet[term]; exists { + continue + } + if !strings.HasPrefix(term, prefixTerm) { + continue + } + match, editDistance := a.MatchAndDistance(term) + if match { + err = addCandidateTerm(term, editDistance) + if err != nil { + return nil, err + } + } + } + } + } + } rv.bytesRead = fieldDict.BytesRead() - return rv, err + return rv, nil } var fieldDict index.FieldDict diff --git a/search/searcher/search_regexp.go b/search/searcher/search_regexp.go index b88133e31..1afdaee02 100644 --- 
a/search/searcher/search_regexp.go
+++ b/search/searcher/search_regexp.go
@@ -48,7 +48,7 @@ func NewRegexpStringSearcher(ctx context.Context, indexReader index.IndexReader,
 		return NewRegexpSearcher(ctx, indexReader, r, field, boost, options)
 	}
 
-	fieldDict, err := ir.FieldDictRegexp(field, pattern)
+	fieldDict, a, err := ir.FieldDictRegexpAutomaton(field, pattern)
 	if err != nil {
 		return nil, err
 	}
@@ -58,17 +58,37 @@ func NewRegexpStringSearcher(ctx context.Context, indexReader index.IndexReader,
 		}
 	}()
 
+	var termSet = make(map[string]struct{})
 	var candidateTerms []string
 	tfd, err := fieldDict.Next()
 	for err == nil && tfd != nil {
-		candidateTerms = append(candidateTerms, tfd.Term)
+		if _, exists := termSet[tfd.Term]; !exists {
+			termSet[tfd.Term] = struct{}{}
+			candidateTerms = append(candidateTerms, tfd.Term)
+		}
 		tfd, err = fieldDict.Next()
 	}
 	if err != nil {
 		return nil, err
 	}
 
+	if ctx != nil {
+		if fts, ok := ctx.Value(search.FieldTermSynonymMapKey).(search.FieldTermSynonymMap); ok {
+			if ts, exists := fts[field]; exists {
+				for term := range ts {
+					if _, exists := termSet[term]; exists {
+						continue
+					}
+					if a.MatchesRegex(term) {
+						termSet[term] = struct{}{}
+						candidateTerms = append(candidateTerms, term)
+					}
+				}
+			}
+		}
+	}
+
 	return NewMultiTermSearcher(ctx, indexReader, candidateTerms, field, boost,
 		options, true)
 }
diff --git a/search/searcher/search_term_prefix.go b/search/searcher/search_term_prefix.go
index 3b05e5a8d..3d98cd28e 100644
--- a/search/searcher/search_term_prefix.go
+++ b/search/searcher/search_term_prefix.go
@@ -16,6 +16,7 @@ package searcher
 
 import (
 	"context"
+	"strings"
 
 	"github.com/blevesearch/bleve/v2/search"
 	index "github.com/blevesearch/bleve_index_api"
@@ -36,13 +37,17 @@ func NewTermPrefixSearcher(ctx context.Context, indexReader index.IndexReader, p
 	}()
 
 	var terms []string
+	var termSet = make(map[string]struct{})
 	tfd, err := fieldDict.Next()
 	for err == nil && tfd != nil {
-		terms = append(terms, tfd.Term)
-		if tooManyClauses(len(terms)) {
-			return nil, tooManyClausesErr(field, len(terms))
+		if _, exists := termSet[tfd.Term]; !exists {
+			termSet[tfd.Term] = struct{}{}
+			terms = append(terms, tfd.Term)
+			if tooManyClauses(len(terms)) {
+				return nil, tooManyClausesErr(field, len(terms))
+			}
 		}
 		tfd, err = fieldDict.Next()
 	}
 	if err != nil {
 		return nil, err
@@ -52,6 +57,26 @@ func NewTermPrefixSearcher(ctx context.Context, indexReader index.IndexReader, p
 		reportIOStats(ctx, fieldDict.BytesRead())
 		search.RecordSearchCost(ctx, search.AddM, fieldDict.BytesRead())
 	}
+
+	if ctx != nil {
+		if fts, ok := ctx.Value(search.FieldTermSynonymMapKey).(search.FieldTermSynonymMap); ok {
+			if ts, exists := fts[field]; exists {
+				for term := range ts {
+					if _, exists := termSet[term]; exists {
+						continue
+					}
+					if strings.HasPrefix(term, prefix) {
+						termSet[term] = struct{}{}
+						terms = append(terms, term)
+						if tooManyClauses(len(terms)) {
+							return nil, tooManyClausesErr(field, len(terms))
+						}
+					}
+				}
+			}
+		}
+	}
+
 	// check if the terms are empty or have one term which is the prefix itself
 	if len(terms) == 0 || (len(terms) == 1 && terms[0] == prefix) {
 		return NewTermSearcher(ctx, indexReader, prefix, field, boost, options)
diff --git a/search_test.go b/search_test.go
index cd1346936..9c5f0d7bb 100644
--- a/search_test.go
+++ b/search_test.go
@@ -3854,7 +3854,7 @@ func TestSynonymTermReader(t *testing.T) {
 		}
 	}()
 
-	synReader, ok := reader.(index.SynonymReader)
+	synReader, ok := reader.(index.ThesaurusReader)
 	if !ok { 
t.Fatal("expected synonym reader") } @@ -4272,32 +4272,90 @@ func TestSynonymSearchQueries(t *testing.T) { }, } - for _, dtq := range testQueries { - q, err := query.ParseQuery([]byte(dtq.query)) + runTestQueries := func(idx Index) error { + for _, dtq := range testQueries { + q, err := query.ParseQuery([]byte(dtq.query)) + if err != nil { + return err + } + sr := NewSearchRequest(q) + sr.Highlight = NewHighlightWithStyle(ansi.Name) + sr.SortBy([]string{"_id"}) + sr.Fields = []string{"*"} + sr.Size = 30 + sr.Explain = true + res, err := idx.Search(sr) + if err != nil { + return err + } + if len(res.Hits) != len(dtq.expectHits) { + return fmt.Errorf("expected %d hits, got %d", len(dtq.expectHits), len(res.Hits)) + } + // sort the expected hits to match the order of the search results + sort.Strings(dtq.expectHits) + for i, hit := range res.Hits { + if hit.ID != dtq.expectHits[i] { + return fmt.Errorf("expected docID %s, got %s", dtq.expectHits[i], hit.ID) + } + } + } + return nil + } + err = runTestQueries(idx) + if err != nil { + t.Fatal(err) + } + // test with index alias - with 1 batch per index + numIndexes := len(batches) + indexes := make([]Index, numIndexes) + indexesPath := make([]string, numIndexes) + for i := 0; i < numIndexes; i++ { + tmpIndexPath := createTmpIndexPath(t) + idx, err := New(tmpIndexPath, imap) if err != nil { t.Fatal(err) } - sr := NewSearchRequest(q) - sr.Highlight = NewHighlightWithStyle(ansi.Name) - sr.SortBy([]string{"_id"}) - sr.Fields = []string{"*"} - sr.Size = 30 - sr.Explain = true - - res, err := idx.Search(sr) + err = idx.Batch(batches[i]) if err != nil { t.Fatal(err) } - if len(res.Hits) != len(dtq.expectHits) { - t.Fatalf("expected %d hits, got %d", len(dtq.expectHits), len(res.Hits)) - } - // sort the expected hits to match the order of the search results - sort.Strings(dtq.expectHits) - for i, hit := range res.Hits { - if hit.ID != dtq.expectHits[i] { - t.Fatalf("expected docID %s, got %s", dtq.expectHits[i], hit.ID) + indexes[i] = idx + indexesPath[i] = tmpIndexPath + } + defer func() { + for i := 0; i < numIndexes; i++ { + err = indexes[i].Close() + if err != nil { + t.Fatal(err) } + cleanupTmpIndexPath(t, indexesPath[i]) } + }() + alias := NewIndexAlias(indexes...) 
+ alias.SetIndexMapping(imap) + err = runTestQueries(alias) + if err != nil { + t.Fatal(err) + } + // test with multi-level alias now with two index per alias + // and having any extra index being in the final alias + numAliases := numIndexes / 2 + extraIndex := numIndexes % 2 + aliases := make([]IndexAlias, numAliases) + for i := 0; i < numAliases; i++ { + alias := NewIndexAlias(indexes[i*2], indexes[i*2+1]) + aliases[i] = alias + } + if extraIndex > 0 { + aliases[numAliases-1].Add(indexes[numIndexes-1]) + } + alias = NewIndexAlias() + alias.SetIndexMapping(imap) + for i := 0; i < numAliases; i++ { + alias.Add(aliases[i]) + } + err = runTestQueries(alias) + if err != nil { + t.Fatal(err) } - } From 59c3193377138975562d5690f35c7e277ef070f6 Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Mon, 9 Dec 2024 14:20:00 +0530 Subject: [PATCH 18/35] rebase --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index b5b3a5507..31f8a34bc 100644 --- a/go.mod +++ b/go.mod @@ -24,7 +24,7 @@ require ( github.com/blevesearch/zapx/v13 v13.3.10 github.com/blevesearch/zapx/v14 v14.3.10 github.com/blevesearch/zapx/v15 v15.3.16 - github.com/blevesearch/zapx/v16 v16.1.9-0.20241120170816-85db80035af2 + github.com/blevesearch/zapx/v16 v16.1.9 github.com/couchbase/moss v0.2.0 github.com/golang/protobuf v1.3.2 github.com/spf13/cobra v1.7.0 diff --git a/go.sum b/go.sum index 565d9a56c..088c6aafb 100644 --- a/go.sum +++ b/go.sum @@ -43,8 +43,8 @@ github.com/blevesearch/zapx/v14 v14.3.10 h1:SG6xlsL+W6YjhX5N3aEiL/2tcWh3DO75Bnz7 github.com/blevesearch/zapx/v14 v14.3.10/go.mod h1:qqyuR0u230jN1yMmE4FIAuCxmahRQEOehF78m6oTgns= github.com/blevesearch/zapx/v15 v15.3.16 h1:Ct3rv7FUJPfPk99TI/OofdC+Kpb4IdyfdMH48sb+FmE= github.com/blevesearch/zapx/v15 v15.3.16/go.mod h1:Turk/TNRKj9es7ZpKK95PS7f6D44Y7fAFy8F4LXQtGg= -github.com/blevesearch/zapx/v16 v16.1.9-0.20241120170816-85db80035af2 h1:+RX9SM7KO7q91E7rFj4NARSsAhKj2EbvdWfzX+ihg/w= -github.com/blevesearch/zapx/v16 v16.1.9-0.20241120170816-85db80035af2/go.mod h1:zuxVgVaLZ0g4lZvrv06xDc24N6nLCOzXYHVkXI7LMHM= +github.com/blevesearch/zapx/v16 v16.1.9 h1:br5EgGntCF723cCecSpOACE0LnGAP4+HYrkcEfQkcBY= +github.com/blevesearch/zapx/v16 v16.1.9/go.mod h1:zuxVgVaLZ0g4lZvrv06xDc24N6nLCOzXYHVkXI7LMHM= github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps= github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k= github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o= From cad9e7950c4c2b4c36e111ba432ad5f48aa5da03 Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Mon, 9 Dec 2024 16:21:19 +0530 Subject: [PATCH 19/35] fix bug --- index_impl.go | 10 +++++-- search/searcher/search_phrase.go | 50 +++++++++++++++++--------------- 2 files changed, 34 insertions(+), 26 deletions(-) diff --git a/index_impl.go b/index_impl.go index 650f1d68c..d98463b73 100644 --- a/index_impl.go +++ b/index_impl.go @@ -519,7 +519,11 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr } var knnHits []*search.DocumentMatch + var skipKNNCollector bool + var fts search.FieldTermSynonymMap + var skipSynonymCollector bool + var ok bool if req.PreSearchData != nil { for k, v := range req.PreSearchData { @@ -530,6 +534,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr if !ok { return nil, fmt.Errorf("knn preSearchData must be of type []*search.DocumentMatch") } + skipKNNCollector = true } case 
search.SynonymPreSearchDataKey: if v != nil { @@ -537,18 +542,19 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr if !ok { return nil, fmt.Errorf("synonym preSearchData must be of type search.FieldTermSynonymMap") } + skipSynonymCollector = true } } } } - if knnHits == nil && requestHasKNN(req) { + if !skipKNNCollector && requestHasKNN(req) { knnHits, err = i.runKnnCollector(ctx, req, indexReader, false) if err != nil { return nil, err } } - if fts == nil { + if !skipSynonymCollector { if synMap, ok := i.m.(mapping.SynonymMapping); ok && synMap.SynonymCount() > 0 { if synReader, ok := indexReader.(index.ThesaurusReader); ok { fts, err = query.ExtractSynonyms(ctx, synMap, synReader, req.Query, fts) diff --git a/search/searcher/search_phrase.go b/search/searcher/search_phrase.go index 9c2ff7d5f..07675cfad 100644 --- a/search/searcher/search_phrase.go +++ b/search/searcher/search_phrase.go @@ -164,34 +164,36 @@ func NewMultiPhraseSearcher(ctx context.Context, indexReader index.IndexReader, } } - if fts, ok := ctx.Value(search.FieldTermSynonymMapKey).(search.FieldTermSynonymMap); ok { - if ts, exists := fts[field]; exists { - if fuzzinessEnabled { - for term, fuzzyTerms := range fuzzyTermMatches { - fuzzySynonymTerms := make([]string, 0, len(fuzzyTerms)) - if s, found := ts[term]; found { - fuzzySynonymTerms = append(fuzzySynonymTerms, s...) - } - for _, fuzzyTerm := range fuzzyTerms { - if fuzzyTerm == term { - continue - } - if s, found := ts[fuzzyTerm]; found { + if ctx != nil { + if fts, ok := ctx.Value(search.FieldTermSynonymMapKey).(search.FieldTermSynonymMap); ok { + if ts, exists := fts[field]; exists { + if fuzzinessEnabled { + for term, fuzzyTerms := range fuzzyTermMatches { + fuzzySynonymTerms := make([]string, 0, len(fuzzyTerms)) + if s, found := ts[term]; found { fuzzySynonymTerms = append(fuzzySynonymTerms, s...) } + for _, fuzzyTerm := range fuzzyTerms { + if fuzzyTerm == term { + continue + } + if s, found := ts[fuzzyTerm]; found { + fuzzySynonymTerms = append(fuzzySynonymTerms, s...) + } + } + if len(fuzzySynonymTerms) > 0 { + fuzzyTermMatches[term] = append(fuzzyTermMatches[term], fuzzySynonymTerms...) + } } - if len(fuzzySynonymTerms) > 0 { - fuzzyTermMatches[term] = append(fuzzyTermMatches[term], fuzzySynonymTerms...) 
- } - } - } else { - for _, termPos := range terms { - for _, term := range termPos { - if s, found := ts[term]; found { - if fuzzyTermMatches == nil { - fuzzyTermMatches = make(map[string][]string) + } else { + for _, termPos := range terms { + for _, term := range termPos { + if s, found := ts[term]; found { + if fuzzyTermMatches == nil { + fuzzyTermMatches = make(map[string][]string) + } + fuzzyTermMatches[term] = s } - fuzzyTermMatches[term] = s } } } From f3d0ac57c608b7abc9404fa662c7898a35f16abf Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Mon, 9 Dec 2024 16:27:22 +0530 Subject: [PATCH 20/35] optimization --- index_alias_impl.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/index_alias_impl.go b/index_alias_impl.go index dddb359bb..e9ed3a481 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -561,7 +561,14 @@ func preSearchRequired(req *SearchRequest, m mapping.IndexMapping) *preSearchFla if !isMatchNoneQuery(req.Query) { // Check if synonyms are defined in the mapping if sm, ok := m.(mapping.SynonymMapping); ok && sm.SynonymCount() > 0 { - synonyms = true + // check if any of the fields queried have a synonym source + // in the index mapping, to prevent unnecessary preSearch + for field := range query.ExtractFields(req.Query, m, nil) { + if sm.SynonymSourceForPath(field) != "" { + synonyms = true + break + } + } } } if knn || synonyms { From e3b1d5b0536d487578a61db5c8d735df21d03bc9 Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Mon, 9 Dec 2024 18:28:50 +0530 Subject: [PATCH 21/35] bleve APIs --- document/field_synonym.go | 2 +- index.go | 8 ++++++++ index_alias_impl.go | 19 +++++++++++++++++++ index_impl.go | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 1 deletion(-) diff --git a/document/field_synonym.go b/document/field_synonym.go index 0e4812690..4aa603fc4 100644 --- a/document/field_synonym.go +++ b/document/field_synonym.go @@ -117,7 +117,7 @@ func processSynonymData(input []string, synonyms []string) map[string][]string { // Map each term to the same list of synonyms. synonymMap = make(map[string][]string, len(input)) for _, term := range input { - synonymMap[term] = append([]string(nil), synonyms...) // Avoid sharing slices. + synonymMap[term] = synonyms } } else { synonymMap = make(map[string][]string, len(synonyms)) diff --git a/index.go b/index.go index d98f28558..5066c06af 100644 --- a/index.go +++ b/index.go @@ -378,3 +378,11 @@ func (sd *SynonymDefinition) Validate() error { } return nil } + +// SynonymIndex supports indexing synonym definitions alongside regular documents. +// Synonyms, grouped by collection name, define term relationships for query expansion in searches. +type SynonymIndex interface { + Index + // IndexSynonym indexes a synonym definition, with the specified id and belonging to the specified collection. 
+ IndexSynonym(id string, collection string, definition *SynonymDefinition) error +} diff --git a/index_alias_impl.go b/index_alias_impl.go index e9ed3a481..853665df4 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -82,6 +82,25 @@ func (i *indexAliasImpl) Index(id string, data interface{}) error { return i.indexes[0].Index(id, data) } +func (i *indexAliasImpl) IndexSynonym(id string, collection string, definition *SynonymDefinition) error { + i.mutex.RLock() + defer i.mutex.RUnlock() + + if !i.open { + return ErrorIndexClosed + } + + err := i.isAliasToSingleIndex() + if err != nil { + return err + } + + if si, ok := i.indexes[0].(SynonymIndex); ok { + return si.IndexSynonym(id, collection, definition) + } + return ErrorSynonymSearchNotSupported +} + func (i *indexAliasImpl) Delete(id string) error { i.mutex.RLock() defer i.mutex.RUnlock() diff --git a/index_impl.go b/index_impl.go index d98463b73..289014f6c 100644 --- a/index_impl.go +++ b/index_impl.go @@ -268,6 +268,40 @@ func (i *indexImpl) Index(id string, data interface{}) (err error) { return } +// IndexSynonym indexes a synonym definition, with the specified id and belonging to the specified collection. +// Synonym definition defines term relationships for query expansion in searches. +func (i *indexImpl) IndexSynonym(id string, collection string, definition *SynonymDefinition) error { + if id == "" { + return ErrorEmptyID + } + + i.mutex.RLock() + defer i.mutex.RUnlock() + + if !i.open { + return ErrorIndexClosed + } + + i.FireIndexEvent() + + synMap, ok := i.m.(mapping.SynonymMapping) + if !ok { + return ErrorSynonymSearchNotSupported + } + + if err := definition.Validate(); err != nil { + return err + } + + doc := document.NewSynonymDocument(id) + err := synMap.MapSynonymDocument(doc, collection, definition.Input, definition.Synonyms) + if err != nil { + return err + } + err = i.i.Update(doc) + return err +} + // IndexAdvanced takes a document.Document object // skips the mapping and indexes it. func (i *indexImpl) IndexAdvanced(doc *document.Document) (err error) { From b469373e8c1742bfa6903f4c9ba070bcb0321ea2 Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Mon, 9 Dec 2024 20:10:16 +0530 Subject: [PATCH 22/35] minor fix --- index.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index.go b/index.go index 5066c06af..3d2389884 100644 --- a/index.go +++ b/index.go @@ -363,7 +363,7 @@ type SynonymDefinition struct { // When terms are specified in Input, they will map to the terms in Synonyms, // making the relationship unidirectional (each Input maps to all Synonyms). // If Input is omitted, the relationship is bidirectional among all Synonyms. - Input []string `json:"input"` + Input []string `json:"input,omitempty"` // Synonyms is a list of terms that are considered equivalent. // If Input is specified, each term in Input will map to each term in Synonyms. 
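Putting the pieces of this patch together, the new API can be exercised roughly as below. A hedged usage sketch: the collection name is an assumption, and the index is expected to have been created with a mapping whose synonym source references that collection.

```go
package main

import (
	bleve "github.com/blevesearch/bleve/v2"
)

// addSynonyms indexes one bidirectional synonym definition. With Input left
// empty, every term in Synonyms expands to all of the others at query time.
func addSynonyms(idx bleve.Index) error {
	si, ok := idx.(bleve.SynonymIndex)
	if !ok {
		return bleve.ErrorSynonymSearchNotSupported
	}
	def := &bleve.SynonymDefinition{
		Synonyms: []string{"quick", "fast", "speedy"},
	}
	// "en-thesaurus" must match a collection referenced by a synonym
	// source in the index mapping (assumed name, for illustration).
	return si.IndexSynonym("syn-doc-1", "en-thesaurus", def)
}
```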
From 67815ad5451a94f2523819177c9f991f989edeba Mon Sep 17 00:00:00 2001 From: CascadingRadium Date: Mon, 9 Dec 2024 20:48:58 +0530 Subject: [PATCH 23/35] bug fix --- index_alias_impl.go | 17 ++++++++++++----- search/query/query.go | 30 +++++++++++++++++++++++------- search/query/query_test.go | 23 ++++++++++++++++++++++- 3 files changed, 57 insertions(+), 13 deletions(-) diff --git a/index_alias_impl.go b/index_alias_impl.go index 853665df4..eee8243fb 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -233,7 +233,10 @@ func (i *indexAliasImpl) SearchInContext(ctx context.Context, req *SearchRequest // - the request requires preSearch var preSearchDuration time.Duration var sr *SearchResult - flags := preSearchRequired(req, i.mapping) + flags, err := preSearchRequired(req, i.mapping) + if err != nil { + return nil, err + } if req.PreSearchData == nil && flags != nil { searchStart := time.Now() preSearchResult, err := preSearch(ctx, req, flags, i.indexes...) @@ -573,7 +576,7 @@ type preSearchFlags struct { // preSearchRequired checks if preSearch is required and returns a boolean flag // It only allocates the preSearchFlags struct if necessary -func preSearchRequired(req *SearchRequest, m mapping.IndexMapping) *preSearchFlags { +func preSearchRequired(req *SearchRequest, m mapping.IndexMapping) (*preSearchFlags, error) { // Check for KNN query knn := requestHasKNN(req) var synonyms bool @@ -582,7 +585,11 @@ func preSearchRequired(req *SearchRequest, m mapping.IndexMapping) *preSearchFla if sm, ok := m.(mapping.SynonymMapping); ok && sm.SynonymCount() > 0 { // check if any of the fields queried have a synonym source // in the index mapping, to prevent unnecessary preSearch - for field := range query.ExtractFields(req.Query, m, nil) { + fs, err := query.ExtractFields(req.Query, m, nil) + if err != nil { + return nil, err + } + for field := range fs { if sm.SynonymSourceForPath(field) != "" { synonyms = true break @@ -594,9 +601,9 @@ func preSearchRequired(req *SearchRequest, m mapping.IndexMapping) *preSearchFla return &preSearchFlags{ knn: knn, synonyms: synonyms, - } + }, nil } - return nil + return nil, nil } func preSearch(ctx context.Context, req *SearchRequest, flags *preSearchFlags, indexes ...Index) (*SearchResult, error) { diff --git a/search/query/query.go b/search/query/query.go index defdb04eb..f44a248c7 100644 --- a/search/query/query.go +++ b/search/query/query.go @@ -445,10 +445,11 @@ type FieldSet map[string]struct{} // ExtractFields returns a set of fields referenced by the query. // The returned set may be nil if the query does not explicitly reference any field // and the DefaultSearchField is unset in the index mapping. 
-func ExtractFields(q Query, m mapping.IndexMapping, fs FieldSet) FieldSet {
-    if q == nil {
-        return fs
+func ExtractFields(q Query, m mapping.IndexMapping, fs FieldSet) (FieldSet, error) {
+    if q == nil || m == nil {
+        return fs, nil
     }
+    var err error
     switch q := q.(type) {
     case FieldableQuery:
         f := q.Field()
@@ -461,20 +462,35 @@ func ExtractFields(q Query, m mapping.IndexMapping, fs FieldSet) FieldSet {
             }
             fs[f] = struct{}{}
         }
+    case *QueryStringQuery:
+        var expandedQuery Query
+        expandedQuery, err = expandQuery(m, q)
+        if err == nil {
+            fs, err = ExtractFields(expandedQuery, m, fs)
+        }
     case *BooleanQuery:
         for _, subq := range []Query{q.Must, q.Should, q.MustNot} {
-            fs = ExtractFields(subq, m, fs)
+            fs, err = ExtractFields(subq, m, fs)
+            if err != nil {
+                break
+            }
         }
     case *ConjunctionQuery:
         for _, subq := range q.Conjuncts {
-            fs = ExtractFields(subq, m, fs)
+            fs, err = ExtractFields(subq, m, fs)
+            if err != nil {
+                break
+            }
         }
     case *DisjunctionQuery:
         for _, subq := range q.Disjuncts {
-            fs = ExtractFields(subq, m, fs)
+            fs, err = ExtractFields(subq, m, fs)
+            if err != nil {
+                break
+            }
         }
     }
-    return fs
+    return fs, err
 }
 
 const (
diff --git a/search/query/query_test.go b/search/query/query_test.go
index 870510de2..60c1fa374 100644
--- a/search/query/query_test.go
+++ b/search/query/query_test.go
@@ -999,6 +999,24 @@ func TestExtractFields(t *testing.T) {
             }`,
             expFields: []string{"date", "number", "date2", "number2", "date3"},
         },
+        {
+            query: `{
+                "query" : "hardworking people"
+            }`,
+            expFields: []string{"_all"},
+        },
+        {
+            query: `{
+                "query" : "text:hardworking people"
+            }`,
+            expFields: []string{"text", "_all"},
+        },
+        {
+            query: `{
+                "query" : "text:\"hardworking people\""
+            }`,
+            expFields: []string{"text"},
+        },
     }
 
     m := mapping.NewIndexMapping()
@@ -1007,7 +1025,10 @@ func TestExtractFields(t *testing.T) {
         if err != nil {
             t.Fatal(err)
         }
-        fields := ExtractFields(q, m, nil)
+        fields, err := ExtractFields(q, m, nil)
+        if err != nil {
+            t.Fatal(err)
+        }
         var fieldsSlice []string
         for k := range fields {
             fieldsSlice = append(fieldsSlice, k)

From 55f6d4c182d28fde92d8fec408b6cece12d74634 Mon Sep 17 00:00:00 2001
From: CascadingRadium
Date: Tue, 10 Dec 2024 16:28:54 +0530
Subject: [PATCH 24/35] make default_synonym_source omitempty

---
 mapping/index.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mapping/index.go b/mapping/index.go
index feb7634a9..78485cfad 100644
--- a/mapping/index.go
+++ b/mapping/index.go
@@ -49,7 +49,7 @@ type IndexMappingImpl struct {
     DefaultType           string `json:"default_type"`
     DefaultAnalyzer       string `json:"default_analyzer"`
     DefaultDateTimeParser string `json:"default_datetime_parser"`
-    DefaultSynonymSource  string `json:"default_synonym_source"`
+    DefaultSynonymSource  string `json:"default_synonym_source,omitempty"`
     DefaultField          string `json:"default_field"`
     StoreDynamic          bool   `json:"store_dynamic"`
     IndexDynamic          bool   `json:"index_dynamic"`

From 1a6dd1e43ff01294c1e31c9b0fdc4b696ed7eb33 Mon Sep 17 00:00:00 2001
From: CascadingRadium
Date: Tue, 10 Dec 2024 16:35:46 +0530
Subject: [PATCH 25/35] fix bugs

---
 mapping/field.go | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mapping/field.go b/mapping/field.go
index 08af2dfa7..ce2878b18 100644
--- a/mapping/field.go
+++ b/mapping/field.go
@@ -462,22 +462,22 @@ func (fm *FieldMapping) UnmarshalJSON(data []byte) error {
             return err
         }
     case "dims":
-        err := json.Unmarshal(v, &fm.Dims)
+        err := util.UnmarshalJSON(v, &fm.Dims)
         if err != nil {
             return err
         }
     case "similarity":
-        err := json.Unmarshal(v, &fm.Similarity)
+        err := util.UnmarshalJSON(v, &fm.Similarity)
         if err != nil {
             return err
         }
     case "vector_index_optimized_for":
-        err := json.Unmarshal(v, &fm.VectorIndexOptimizedFor)
+        err := util.UnmarshalJSON(v, &fm.VectorIndexOptimizedFor)
         if err != nil {
             return err
         }
     case "synonym_source":
-        err := json.Unmarshal(v, &fm.SynonymSource)
+        err := util.UnmarshalJSON(v, &fm.SynonymSource)
         if err != nil {
             return err
         }

From ee71211399dac62677b1970ac1453b076adc1b5c Mon Sep 17 00:00:00 2001
From: CascadingRadium
Date: Tue, 10 Dec 2024 19:36:34 +0530
Subject: [PATCH 26/35] refactor bleve APIs

---
 analysis/type.go           |  9 ++++
 index_test.go              | 12 +++---
 mapping/analysis.go        |  8 ++++
 mapping/document.go        | 13 +++++-
 mapping/index.go           | 65 +++++++++++++++++------------
 mapping/synonym.go         | 25 ++++++-----
 registry/registry.go       | 11 +++++
 registry/synonym_source.go | 85 ++++++++++++++++++++++++++++++++++++++
 search_test.go             | 28 ++++++++++---
 9 files changed, 206 insertions(+), 50 deletions(-)
 create mode 100644 registry/synonym_source.go

diff --git a/analysis/type.go b/analysis/type.go
index e3a7f201b..f819984b5 100644
--- a/analysis/type.go
+++ b/analysis/type.go
@@ -106,6 +106,15 @@ type DateTimeParser interface {
     ParseDateTime(string) (time.Time, string, error)
 }
 
+const SynonymSourceType = "synonym"
+
+type SynonymSourceVisitor func(name string, item SynonymSource) error
+
+type SynonymSource interface {
+    Analyzer() string
+    Collection() string
+}
+
 type ByteArrayConverter interface {
     Convert([]byte) (interface{}, error)
 }
diff --git a/index_test.go b/index_test.go
index e75c5fd87..5d801b4e0 100644
--- a/index_test.go
+++ b/index_test.go
@@ -402,9 +402,9 @@ func TestBytesRead(t *testing.T) {
     stats, _ := idx.StatsMap()["index"].(map[string]interface{})
     prevBytesRead, _ := stats["num_bytes_read_at_query_time"].(uint64)
 
-    expectedBytesRead := uint64(21639)
+    expectedBytesRead := uint64(22049)
     if supportForVectorSearch {
-        expectedBytesRead = 22049
+        expectedBytesRead = 22459
     }
 
     if prevBytesRead != expectedBytesRead && res.Cost == prevBytesRead {
@@ -560,9 +560,9 @@ func TestBytesReadStored(t *testing.T) {
     stats, _ := idx.StatsMap()["index"].(map[string]interface{})
     bytesRead, _ := stats["num_bytes_read_at_query_time"].(uint64)
 
-    expectedBytesRead := uint64(11501)
+    expectedBytesRead := uint64(11911)
     if supportForVectorSearch {
-        expectedBytesRead = 11911
+        expectedBytesRead = 12321
     }
 
     if bytesRead != expectedBytesRead && bytesRead == res.Cost {
@@ -637,9 +637,9 @@ func TestBytesReadStored(t *testing.T) {
     stats, _ = idx1.StatsMap()["index"].(map[string]interface{})
     bytesRead, _ = stats["num_bytes_read_at_query_time"].(uint64)
 
-    expectedBytesRead = uint64(3687)
+    expectedBytesRead = uint64(4097)
     if supportForVectorSearch {
-        expectedBytesRead = 4097
+        expectedBytesRead = 4507
     }
 
     if bytesRead != expectedBytesRead && bytesRead == res.Cost {
diff --git a/mapping/analysis.go b/mapping/analysis.go
index 03e3cd01b..311e97232 100644
--- a/mapping/analysis.go
+++ b/mapping/analysis.go
@@ -21,6 +21,7 @@ type customAnalysis struct {
     TokenFilters    map[string]map[string]interface{} `json:"token_filters,omitempty"`
     Analyzers       map[string]map[string]interface{} `json:"analyzers,omitempty"`
     DateTimeParsers map[string]map[string]interface{} `json:"date_time_parsers,omitempty"`
+    SynonymSources  map[string]map[string]interface{} `json:"synonym_sources,omitempty"`
 }
 
 func (c *customAnalysis) registerAll(i *IndexMappingImpl) error {
@@ -83,6 +84,12 @@ func (c *customAnalysis) registerAll(i *IndexMappingImpl) error {
             return err
         }
     }
+    for name, config := range c.SynonymSources {
+        _, err := i.cache.DefineSynonymSource(name, config)
+        if err != nil {
+            return err
+        }
+    }
     return nil
 }
 
@@ -94,6 +101,7 @@ func newCustomAnalysis() *customAnalysis {
         TokenFilters:    make(map[string]map[string]interface{}),
         Analyzers:       make(map[string]map[string]interface{}),
         DateTimeParsers: make(map[string]map[string]interface{}),
+        SynonymSources:  make(map[string]map[string]interface{}),
     }
     return &rv
 }
diff --git a/mapping/document.go b/mapping/document.go
index b470afaa4..e89e66979 100644
--- a/mapping/document.go
+++ b/mapping/document.go
@@ -60,6 +60,12 @@ func (dm *DocumentMapping) Validate(cache *registry.Cache,
             return err
         }
     }
+    if dm.DefaultSynonymSource != "" {
+        _, err := cache.SynonymSourceNamed(dm.DefaultSynonymSource)
+        if err != nil {
+            return err
+        }
+    }
     for propertyName, property := range dm.Properties {
         newParent := propertyName
         if parentName != "" {
@@ -83,7 +89,13 @@ func (dm *DocumentMapping) Validate(cache *registry.Cache,
                 return err
             }
         }
-
+        if field.SynonymSource != "" {
+            _, err = cache.SynonymSourceNamed(field.SynonymSource)
+            if err != nil {
+                return err
+            }
+        }
         err := validateFieldMapping(field, parentName, fieldAliasCtx)
         if err != nil {
             return err
diff --git a/mapping/index.go b/mapping/index.go
index 78485cfad..8a0d5e34a 100644
--- a/mapping/index.go
+++ b/mapping/index.go
@@ -55,7 +55,6 @@ type IndexMappingImpl struct {
     IndexDynamic     bool                      `json:"index_dynamic"`
     DocValuesDynamic bool                      `json:"docvalues_dynamic"`
     CustomAnalysis   *customAnalysis           `json:"analysis,omitempty"`
-    SynonymSources   map[string]*SynonymSource `json:"synonym_sources,omitempty"`
     cache            *registry.Cache
 }
@@ -147,11 +146,12 @@ func (im *IndexMappingImpl) AddCustomDateTimeParser(name string, config map[stri
     return nil
 }
 
-func (im *IndexMappingImpl) AddSynonymSource(name, collection, analyzer string) error {
-    if im.SynonymSources == nil {
-        im.SynonymSources = make(map[string]*SynonymSource)
+func (im *IndexMappingImpl) AddSynonymSource(name string, config map[string]interface{}) error {
+    _, err := im.cache.DefineSynonymSource(name, config)
+    if err != nil {
+        return err
     }
-    im.SynonymSources[name] = NewSynonymSource(collection, analyzer)
+    im.CustomAnalysis.SynonymSources[name] = config
     return nil
 }
@@ -184,7 +184,12 @@ func (im *IndexMappingImpl) Validate() error {
     if err != nil {
         return err
     }
-
+    if im.DefaultSynonymSource != "" {
+        _, err = im.cache.SynonymSourceNamed(im.DefaultSynonymSource)
+        if err != nil {
+            return err
+        }
+    }
     fieldAliasCtx := make(map[string]*FieldMapping)
     err = im.DefaultMapping.Validate(im.cache, "", fieldAliasCtx)
     if err != nil {
@@ -196,12 +201,6 @@ func (im *IndexMappingImpl) Validate() error {
             return err
         }
     }
-    for _, synSource := range im.SynonymSources {
-        err = synSource.Validate(im.cache)
-        if err != nil {
-            return err
-        }
-    }
     return nil
 }
@@ -304,14 +303,6 @@ func (im *IndexMappingImpl) UnmarshalJSON(data []byte) error {
             if err != nil {
                 return err
             }
-        case "synonym_sources":
-            if im.SynonymSources == nil {
-                im.SynonymSources = make(map[string]*SynonymSource)
-            }
-            err := util.UnmarshalJSON(v, &im.SynonymSources)
-            if err != nil {
-                return err
-            }
         default:
             invalidKeys = append(invalidKeys, k)
         }
@@ -371,18 +362,19 @@ func (im *IndexMappingImpl) MapDocument(doc *document.Document, data interface{}
 func (im *IndexMappingImpl) MapSynonymDocument(doc *document.Document, collection string, input []string, synonyms []string) error {
     // determine all the synonym sources with the given collection
     // and create a synonym field for each
-    for name, synSource := range im.SynonymSources {
-        if synSource.Collection() == collection {
+    err := im.SynonymSourceVisitor(func(name string, item analysis.SynonymSource) error {
+        if item.Collection() == collection {
             // create a new field with the name of the synonym source
-            analyzer := im.AnalyzerNamed(synSource.Analyzer())
+            analyzer := im.AnalyzerNamed(item.Analyzer())
             if analyzer == nil {
-                return fmt.Errorf("unknown analyzer named: %s", synSource.Analyzer())
+                return fmt.Errorf("unknown analyzer named: %s", item.Analyzer())
             }
             field := document.NewSynonymField(name, analyzer, input, synonyms)
             doc.AddField(field)
         }
-    }
-    return nil
+        return nil
+    })
+    return err
 }
 
 type walkContext struct {
@@ -504,6 +496,15 @@ func (im *IndexMappingImpl) DefaultSearchField() string {
     return im.DefaultField
 }
 
+func (im *IndexMappingImpl) SynonymSourceNamed(name string) analysis.SynonymSource {
+    syn, err := im.cache.SynonymSourceNamed(name)
+    if err != nil {
+        logger.Printf("error using synonym source named: %s", name)
+        return nil
+    }
+    return syn
+}
+
 func (im *IndexMappingImpl) SynonymSourceForPath(path string) string {
     // first we look for explicit mapping on the field
     for _, docMapping := range im.TypeMapping {
@@ -544,6 +545,16 @@ func (im *IndexMappingImpl) SynonymSourceForPath(path string) string {
     return im.DefaultSynonymSource
 }
 
+// SynonymCount returns the number of synonym sources defined in the mapping
 func (im *IndexMappingImpl) SynonymCount() int {
-    return len(im.SynonymSources)
+    return len(im.CustomAnalysis.SynonymSources)
+}
+
+// SynonymSourceVisitor allows a visitor to iterate over all synonym sources
+func (im *IndexMappingImpl) SynonymSourceVisitor(visitor analysis.SynonymSourceVisitor) error {
+    err := im.cache.SynonymSources.VisitSynonymSources(visitor)
+    if err != nil {
+        return err
+    }
+    return nil
 }
diff --git a/mapping/synonym.go b/mapping/synonym.go
index 597539bf0..06dee754a 100644
--- a/mapping/synonym.go
+++ b/mapping/synonym.go
@@ -17,6 +17,7 @@ package mapping
 import (
     "fmt"
 
+    "github.com/blevesearch/bleve/v2/analysis"
     "github.com/blevesearch/bleve/v2/registry"
 )
@@ -47,17 +48,21 @@ func (s *SynonymSource) SetCollection(c string) {
 func (s *SynonymSource) SetAnalyzer(a string) {
     s.AnalyzerName = a
 }
-
-func (s *SynonymSource) Validate(c *registry.Cache) error {
-    if s.CollectionName == "" {
-        return fmt.Errorf("collection name is required")
+func SynonymSourceConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.SynonymSource, error) {
+    collection, ok := config["collection"].(string)
+    if !ok {
+        return nil, fmt.Errorf("must specify collection")
     }
-    if s.AnalyzerName == "" {
-        return fmt.Errorf("analyzer name is required")
+    analyzer, ok := config["analyzer"].(string)
+    if !ok {
+        return nil, fmt.Errorf("must specify analyzer")
     }
-    _, err := c.AnalyzerNamed(s.AnalyzerName)
-    if err != nil {
-        return fmt.Errorf("analyzer named '%s' not found", s.AnalyzerName)
+    if _, err := cache.AnalyzerNamed(analyzer); err != nil {
+        return nil, fmt.Errorf("analyzer named '%s' not found", analyzer)
     }
-    return nil
+    return NewSynonymSource(collection, analyzer), nil
+}
+
+func init() {
+    registry.RegisterSynonymSource(analysis.SynonymSourceType, SynonymSourceConstructor)
 }
diff --git a/registry/registry.go b/registry/registry.go
index 1954d0896..69ee8dd86 100644
--- a/registry/registry.go
+++ b/registry/registry.go
@@ -36,6 +36,7 @@ var tokenMaps = make(TokenMapRegistry, 0)
 var tokenFilters = make(TokenFilterRegistry, 0)
 var analyzers = make(AnalyzerRegistry, 0)
 var dateTimeParsers = make(DateTimeParserRegistry, 0)
+var synonymSources = make(SynonymSourceRegistry, 0)
 
 type Cache struct {
     CharFilters *CharFilterCache
@@ -47,6 +48,7 @@ type Cache struct {
     FragmentFormatters *FragmentFormatterCache
     Fragmenters        *FragmenterCache
     Highlighters       *HighlighterCache
+    SynonymSources     *SynonymSourceCache
 }
 
 func NewCache() *Cache {
@@ -60,6 +62,7 @@ func NewCache() *Cache {
         FragmentFormatters: NewFragmentFormatterCache(),
         Fragmenters:        NewFragmenterCache(),
         Highlighters:       NewHighlighterCache(),
+        SynonymSources:     NewSynonymSourceCache(),
     }
 }
@@ -147,6 +150,14 @@ func (c *Cache) DefineDateTimeParser(name string, config map[string]interface{})
     return c.DateTimeParsers.DefineDateTimeParser(name, typ, config, c)
 }
 
+func (c *Cache) SynonymSourceNamed(name string) (analysis.SynonymSource, error) {
+    return c.SynonymSources.SynonymSourceNamed(name, c)
+}
+
+func (c *Cache) DefineSynonymSource(name string, config map[string]interface{}) (analysis.SynonymSource, error) {
+    return c.SynonymSources.DefineSynonymSource(name, analysis.SynonymSourceType, config, c)
+}
+
 func (c *Cache) FragmentFormatterNamed(name string) (highlight.FragmentFormatter, error) {
     return c.FragmentFormatters.FragmentFormatterNamed(name, c)
 }
diff --git a/registry/synonym_source.go b/registry/synonym_source.go
new file mode 100644
index 000000000..cd26d8f01
--- /dev/null
+++ b/registry/synonym_source.go
@@ -0,0 +1,85 @@
+// Copyright (c) 2024 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package registry
+
+import (
+    "fmt"
+
+    "github.com/blevesearch/bleve/v2/analysis"
+)
+
+func RegisterSynonymSource(typ string, constructor SynonymSourceConstructor) {
+    _, exists := synonymSources[typ]
+    if exists {
+        panic(fmt.Errorf("attempted to register duplicate synonym source with type '%s'", typ))
+    }
+    synonymSources[typ] = constructor
+}
+
+type SynonymSourceCache struct {
+    *ConcurrentCache
+}
+
+func NewSynonymSourceCache() *SynonymSourceCache {
+    return &SynonymSourceCache{
+        NewConcurrentCache(),
+    }
+}
+
+type SynonymSourceConstructor func(config map[string]interface{}, cache *Cache) (analysis.SynonymSource, error)
+type SynonymSourceRegistry map[string]SynonymSourceConstructor
+
+func SynonymSourceBuild(name string, config map[string]interface{}, cache *Cache) (interface{}, error) {
+    cons, registered := synonymSources[name]
+    if !registered {
+        return nil, fmt.Errorf("no synonym source with name '%s' registered", name)
+    }
+    synonymSource, err := cons(config, cache)
+    if err != nil {
+        return nil, fmt.Errorf("error building synonym source: %v", err)
+    }
+    return synonymSource, nil
+}
+
+func (c *SynonymSourceCache) SynonymSourceNamed(name string, cache *Cache) (analysis.SynonymSource, error) {
+    item, err := c.ItemNamed(name, cache, SynonymSourceBuild)
+    if err != nil {
+        return nil, err
+    }
+    return item.(analysis.SynonymSource), nil
+}
+
+func (c *SynonymSourceCache) DefineSynonymSource(name string, typ string, config map[string]interface{}, cache *Cache) (analysis.SynonymSource, error) {
+    item, err := c.DefineItem(name, typ, config, cache, SynonymSourceBuild)
+    if err != nil {
+        if err == ErrAlreadyDefined {
+            return nil, fmt.Errorf("synonym source named '%s' already defined", name)
+        }
+        return nil, err
+    }
+    return item.(analysis.SynonymSource), nil
+}
+
+func (c *SynonymSourceCache) VisitSynonymSources(visitor analysis.SynonymSourceVisitor) error {
+    c.mutex.RLock()
+    defer c.mutex.RUnlock()
+    for k, v := range c.data {
+        err := visitor(k, v.(analysis.SynonymSource))
+        if err != nil {
+            return err
+        }
+    }
+    return nil
+}
diff --git a/search_test.go b/search_test.go
index 9c5f0d7bb..885595971 100644
--- a/search_test.go
+++ b/search_test.go
@@ -3759,16 +3759,24 @@ func TestSynonymTermReader(t *testing.T) {
 
     synonymSourceName := "english"
 
-    synonymAnalyzer := "simple"
+    analyzer := simple.Name
+
+    synonymSourceConfig := map[string]interface{}{
+        "collection": synonymCollection,
+        "analyzer":   analyzer,
+    }
 
     textField := mapping.NewTextFieldMapping()
-    textField.Analyzer = simple.Name
+    textField.Analyzer = analyzer
     textField.SynonymSource = synonymSourceName
 
     imap := mapping.NewIndexMapping()
     imap.DefaultMapping.AddFieldMappingsAt("text", textField)
-    imap.AddSynonymSource(synonymSourceName, synonymCollection, synonymAnalyzer)
-    err := imap.Validate()
+    err := imap.AddSynonymSource(synonymSourceName, synonymSourceConfig)
+    if err != nil {
+        t.Fatal(err)
+    }
+    err = imap.Validate()
     if err != nil {
         t.Fatal(err)
     }
@@ -3942,14 +3950,22 @@ func TestSynonymSearchQueries(t *testing.T) {
 
     analyzer := en.AnalyzerName
 
+    synonymSourceConfig := map[string]interface{}{
+        "collection": synonymCollection,
+        "analyzer":   analyzer,
+    }
+
     textField := mapping.NewTextFieldMapping()
     textField.Analyzer = analyzer
     textField.SynonymSource = synonymSourceName
 
     imap := mapping.NewIndexMapping()
     imap.DefaultMapping.AddFieldMappingsAt("text", textField)
-    imap.AddSynonymSource(synonymSourceName, synonymCollection, analyzer)
-    err := imap.Validate()
+    err := imap.AddSynonymSource(synonymSourceName, synonymSourceConfig)
+    if err != nil {
+        t.Fatal(err)
+    }
+    err = imap.Validate()
     if err != nil {
         t.Fatal(err)
     }

From a4d83ac3ff697c3f0bcb31207e4350e4c406abb6 Mon Sep 17 00:00:00 2001
From: CascadingRadium
Date: Tue, 10 Dec 2024 19:53:32 +0530
Subject: [PATCH 27/35] add additional methods to interface

---
 mapping/mapping.go | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/mapping/mapping.go b/mapping/mapping.go
index 6714c55aa..a6c1591b8 100644
--- a/mapping/mapping.go
+++ b/mapping/mapping.go
@@ -59,6 +59,8 @@ type IndexMapping interface {
     FieldMappingForPath(path string) FieldMapping
 }
 
+// A SynonymMapping extends the IndexMapping interface to provide
+// additional methods for working with synonyms.
 type SynonymMapping interface {
     IndexMapping
 
@@ -66,5 +68,9 @@ type SynonymMapping interface {
 
     SynonymSourceForPath(path string) string
 
+    SynonymSourceNamed(name string) analysis.SynonymSource
+
     SynonymCount() int
+
+    SynonymSourceVisitor(visitor analysis.SynonymSourceVisitor) error
 }

From 1cf2bfd4fe6b37f9f2e9bfccf26fda6eced9697f Mon Sep 17 00:00:00 2001
From: CascadingRadium
Date: Wed, 11 Dec 2024 15:37:18 +0530
Subject: [PATCH 28/35] update interface name

---
 index/scorch/snapshot_index.go     | 40 +++++++++++++++---------------
 index/scorch/snapshot_index_str.go | 18 +++++++-------
 search/query/query.go              |  2 +-
 search_test.go                     |  8 +++---
 4 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go
index 2c3ea5167..efb85b0f8 100644
--- a/index/scorch/snapshot_index.go
+++ b/index/scorch/snapshot_index.go
@@ -60,7 +60,7 @@ var reflectStaticSizeIndexSnapshot int
 // exported variable, or at the index level by setting the FieldTFRCacheThreshold
 // in the kvConfig.
 var DefaultFieldTFRCacheThreshold uint64 = 10
-var DefaultSynonymTermReaderCacheThreshold uint64 = 10
+var DefaultThesaurusTermReaderCacheThreshold uint64 = 10
 
 func init() {
     var is interface{} = IndexSnapshot{}
@@ -88,9 +88,9 @@ type IndexSnapshot struct {
     m    sync.Mutex // Protects the fields that follow.
     refs int64
 
-    m2                 sync.Mutex                                   // Protects the fields that follow.
-    fieldTFRs          map[string][]*IndexSnapshotTermFieldReader   // keyed by field, recycled TFR's
-    synonymTermReaders map[string][]*IndexSnapshotSynonymTermReader // keyed by thesaurus name, recycled thesaurus readers
+    m2                   sync.Mutex // Protects the fields that follow.
+    fieldTFRs            map[string][]*IndexSnapshotTermFieldReader     // keyed by field, recycled TFR's
+    thesaurusTermReaders map[string][]*IndexSnapshotThesaurusTermReader // keyed by thesaurus name, recycled thesaurus readers
 }
 
 func (i *IndexSnapshot) Segments() []*SegmentSnapshot {
@@ -683,13 +683,13 @@ func (is *IndexSnapshot) getFieldTFRCacheThreshold() uint64 {
     return DefaultFieldTFRCacheThreshold
 }
 
-func (is *IndexSnapshot) getSynonymTermReaderCacheThreshold() uint64 {
+func (is *IndexSnapshot) getThesaurusTermReaderCacheThreshold() uint64 {
     if is.parent.config != nil {
-        if _, ok := is.parent.config["SynonymTermReaderCacheThreshold"]; ok {
-            return is.parent.config["SynonymTermReaderCacheThreshold"].(uint64)
+        if _, ok := is.parent.config["ThesaurusTermReaderCacheThreshold"]; ok {
+            return is.parent.config["ThesaurusTermReaderCacheThreshold"].(uint64)
         }
     }
-    return DefaultSynonymTermReaderCacheThreshold
+    return DefaultThesaurusTermReaderCacheThreshold
 }
 
 func (is *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReader) {
@@ -720,7 +720,7 @@ func (is *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReade
     is.m2.Unlock()
 }
 
-func (is *IndexSnapshot) recycleSynonymTermReader(str *IndexSnapshotSynonymTermReader) {
+func (is *IndexSnapshot) recycleThesaurusTermReader(str *IndexSnapshotThesaurusTermReader) {
     is.parent.rootLock.RLock()
     obsolete := is.parent.root != is
     is.parent.rootLock.RUnlock()
@@ -730,11 +730,11 @@ func (is *IndexSnapshot) recycleSynonymTermReader(str *IndexSnapshotSynonymTermR
     }
 
     is.m2.Lock()
-    if is.synonymTermReaders == nil {
-        is.synonymTermReaders = map[string][]*IndexSnapshotSynonymTermReader{}
+    if is.thesaurusTermReaders == nil {
+        is.thesaurusTermReaders = map[string][]*IndexSnapshotThesaurusTermReader{}
     }
-    if uint64(len(is.synonymTermReaders[str.name])) < is.getSynonymTermReaderCacheThreshold() {
-        is.synonymTermReaders[str.name] = append(is.synonymTermReaders[str.name], str)
+    if uint64(len(is.thesaurusTermReaders[str.name])) < is.getThesaurusTermReaderCacheThreshold() {
+        is.thesaurusTermReaders[str.name] = append(is.thesaurusTermReaders[str.name], str)
     }
     is.m2.Unlock()
 }
@@ -1019,25 +1019,25 @@ func (is *IndexSnapshot) CloseCopyReader() error {
     return is.Close()
 }
 
-func (is *IndexSnapshot) allocSynonymTermReader(name string) (str *IndexSnapshotSynonymTermReader) {
+func (is *IndexSnapshot) allocThesaurusTermReader(name string) (str *IndexSnapshotThesaurusTermReader) {
     is.m2.Lock()
-    if is.synonymTermReaders != nil {
-        strs := is.synonymTermReaders[name]
+    if is.thesaurusTermReaders != nil {
+        strs := is.thesaurusTermReaders[name]
         last := len(strs) - 1
         if last >= 0 {
             str = strs[last]
             strs[last] = nil
-            is.synonymTermReaders[name] = strs[:last]
+            is.thesaurusTermReaders[name] = strs[:last]
             is.m2.Unlock()
             return
         }
     }
     is.m2.Unlock()
-    return &IndexSnapshotSynonymTermReader{}
+    return &IndexSnapshotThesaurusTermReader{}
 }
 
-func (is *IndexSnapshot) SynonymTermReader(ctx context.Context, thesaurusName string, term []byte) (index.SynonymTermReader, error) {
-    rv := is.allocSynonymTermReader(thesaurusName)
+func (is *IndexSnapshot) ThesaurusTermReader(ctx context.Context, thesaurusName string, term []byte) (index.ThesaurusTermReader, error) {
+    rv := is.allocThesaurusTermReader(thesaurusName)
 
     rv.name = thesaurusName
     rv.snapshot = is
diff --git a/index/scorch/snapshot_index_str.go b/index/scorch/snapshot_index_str.go
index e1ba60272..c66fbeec5 100644
--- a/index/scorch/snapshot_index_str.go
+++ b/index/scorch/snapshot_index_str.go
@@ -21,14 +21,14 @@
 import (
     segment "github.com/blevesearch/scorch_segment_api/v2"
 )
 
-var reflectStaticSizeIndexSnapshotSynonymTermReader int
+var reflectStaticSizeIndexSnapshotThesaurusTermReader int
 
 func init() {
-    var istr IndexSnapshotSynonymTermReader
-    reflectStaticSizeIndexSnapshotSynonymTermReader = int(reflect.TypeOf(istr).Size())
+    var istr IndexSnapshotThesaurusTermReader
+    reflectStaticSizeIndexSnapshotThesaurusTermReader = int(reflect.TypeOf(istr).Size())
 }
 
-type IndexSnapshotSynonymTermReader struct {
+type IndexSnapshotThesaurusTermReader struct {
     name     string
     snapshot *IndexSnapshot
     thesauri []segment.Thesaurus
@@ -37,8 +37,8 @@ type IndexSnapshotSynonymTermReader struct {
     segmentOffset int
 }
 
-func (i *IndexSnapshotSynonymTermReader) Size() int {
-    sizeInBytes := reflectStaticSizeIndexSnapshotSynonymTermReader + size.SizeOfPtr +
+func (i *IndexSnapshotThesaurusTermReader) Size() int {
+    sizeInBytes := reflectStaticSizeIndexSnapshotThesaurusTermReader + size.SizeOfPtr +
         len(i.name) + size.SizeOfString
 
     for _, postings := range i.postings {
@@ -52,7 +52,7 @@ func (i *IndexSnapshotSynonymTermReader) Size() int {
     return sizeInBytes
 }
 
-func (i *IndexSnapshotSynonymTermReader) Next() (string, error) {
+func (i *IndexSnapshotThesaurusTermReader) Next() (string, error) {
     // find the next hit
     for i.segmentOffset < len(i.iterators) {
         if i.iterators[i.segmentOffset] != nil {
@@ -70,9 +70,9 @@ func (i *IndexSnapshotSynonymTermReader) Next() (string, error) {
     return "", nil
 }
 
-func (i *IndexSnapshotSynonymTermReader) Close() error {
+func (i *IndexSnapshotThesaurusTermReader) Close() error {
     if i.snapshot != nil {
-        i.snapshot.recycleSynonymTermReader(i)
+        i.snapshot.recycleThesaurusTermReader(i)
     }
     return nil
 }
diff --git a/search/query/query.go b/search/query/query.go
index f44a248c7..86859ae5b 100644
--- a/search/query/query.go
+++ b/search/query/query.go
@@ -740,7 +740,7 @@ func addSynonymsForTermWithMatchType(ctx context.Context, matchType int, src, fi
 
 func addSynonymsForTerm(ctx context.Context, src, field, term string,
     r index.ThesaurusReader, rv search.FieldTermSynonymMap) (search.FieldTermSynonymMap, error) {
-    termReader, err := r.SynonymTermReader(ctx, src, []byte(term))
+    termReader, err := r.ThesaurusTermReader(ctx, src, []byte(term))
     if err != nil {
         return nil, err
     }
diff --git a/search_test.go b/search_test.go
index 885595971..043be4073 100644
--- a/search_test.go
+++ b/search_test.go
@@ -3751,7 +3751,7 @@ func TestAutoFuzzy(t *testing.T) {
     }
 }
 
-func TestSynonymTermReader(t *testing.T) {
+func TestThesaurusTermReader(t *testing.T) {
     tmpIndexPath := createTmpIndexPath(t)
     defer cleanupTmpIndexPath(t, tmpIndexPath)
 
@@ -3862,9 +3862,9 @@ func TestSynonymTermReader(t *testing.T) {
         }
     }()
 
-    synReader, ok := reader.(index.ThesaurusReader)
+    thesReader, ok := reader.(index.ThesaurusReader)
     if !ok {
-        t.Fatal("expected synonym reader")
+        t.Fatal("expected thesaurus reader")
     }
 
     type testStruct struct {
@@ -3912,7 +3912,7 @@ func TestSynonymTermReader(t *testing.T) {
     }
 
     for _, test := range testQueries {
-        str, err := synReader.SynonymTermReader(context.Background(), synonymSourceName, []byte(test.queryTerm))
+        str, err := thesReader.ThesaurusTermReader(context.Background(), synonymSourceName, []byte(test.queryTerm))
         if err != nil {
             t.Fatal(err)
         }

From 302147b21701f1f5615376ade489e765c77dc9c5 Mon Sep 17 00:00:00 2001
From: CascadingRadium
Date: Thu, 12 Dec 2024 13:21:51 +0530
Subject: [PATCH 29/35] go.mod update

---
 go.mod |  6 +++---
 go.sum | 12 ++++++------
 2 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/go.mod b/go.mod
index 31f8a34bc..9743d6bf0 100644
--- a/go.mod
+++ b/go.mod
@@ -5,20 +5,20 @@ go 1.21
 require (
     github.com/RoaringBitmap/roaring v1.9.3
     github.com/bits-and-blooms/bitset v1.12.0
-    github.com/blevesearch/bleve_index_api v1.1.13
+    github.com/blevesearch/bleve_index_api v1.2.0
     github.com/blevesearch/geo v0.1.20
     github.com/blevesearch/go-faiss v1.0.23
     github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475
     github.com/blevesearch/go-porterstemmer v1.0.3
     github.com/blevesearch/goleveldb v1.0.1
     github.com/blevesearch/gtreap v0.1.1
-    github.com/blevesearch/scorch_segment_api/v2 v2.2.16
+    github.com/blevesearch/scorch_segment_api/v2 v2.3.0
     github.com/blevesearch/segment v0.9.1
     github.com/blevesearch/snowball v0.6.1
     github.com/blevesearch/snowballstem v0.9.0
     github.com/blevesearch/stempel v0.2.0
     github.com/blevesearch/upsidedown_store_api v1.0.2
-    github.com/blevesearch/vellum v1.0.11
+    github.com/blevesearch/vellum v1.1.0
     github.com/blevesearch/zapx/v11 v11.3.10
     github.com/blevesearch/zapx/v12 v12.3.10
     github.com/blevesearch/zapx/v13 v13.3.10
diff --git a/go.sum b/go.sum
index 088c6aafb..28bc2b8d0 100644
--- a/go.sum
+++ b/go.sum
@@ -2,8 +2,8 @@ github.com/RoaringBitmap/roaring v1.9.3 h1:t4EbC5qQwnisr5PrP9nt0IRhRTb9gMUgQF4t4
 github.com/RoaringBitmap/roaring v1.9.3/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90=
 github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA=
 github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
-github.com/blevesearch/bleve_index_api v1.1.13 h1:+nrA6oRJr85aCPyqaeZtsruObwKojutfonHJin/BP48=
-github.com/blevesearch/bleve_index_api v1.1.13/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
+github.com/blevesearch/bleve_index_api v1.2.0 h1:/DXMMWBwx/UmGKM1xDhTwDoJI5yQrG6rqRWPFcOgUVo=
+github.com/blevesearch/bleve_index_api v1.2.0/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
 github.com/blevesearch/geo v0.1.20 h1:paaSpu2Ewh/tn5DKn/FB5SzvH0EWupxHEIwbCk/QPqM=
 github.com/blevesearch/geo v0.1.20/go.mod h1:DVG2QjwHNMFmjo+ZgzrIq2sfCh6rIHzy9d9d0B59I6w=
 github.com/blevesearch/go-faiss v1.0.23 h1:Wmc5AFwDLKGl2L6mjLX1Da3vCL0EKa2uHHSorcIS1Uc=
@@ -19,8 +19,8 @@ github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgY
 github.com/blevesearch/mmap-go v1.0.2/go.mod h1:ol2qBqYaOUsGdm7aRMRrYGgPvnwLe6Y+7LMvAB5IbSA=
 github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc=
 github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs=
-github.com/blevesearch/scorch_segment_api/v2 v2.2.16 h1:uGvKVvG7zvSxCwcm4/ehBa9cCEuZVE+/zvrSl57QUVY=
-github.com/blevesearch/scorch_segment_api/v2 v2.2.16/go.mod h1:VF5oHVbIFTu+znY1v30GjSpT5+9YFs9dV2hjvuh34F0=
+github.com/blevesearch/scorch_segment_api/v2 v2.3.0 h1:vxCjbXAkkEBSb4AB3Iqgr/EJcPyYRsiGxpcvsS8E1Dw=
+github.com/blevesearch/scorch_segment_api/v2 v2.3.0/go.mod h1:5y+TgXYSx+xJGaCwSlvy9G/UJBIY5wzvIkhvhBm2ATc=
 github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU=
 github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw=
 github.com/blevesearch/snowball v0.6.1 h1:cDYjn/NCH+wwt2UdehaLpr2e4BwLIjN4V/TdLsL+B5A=
@@ -31,8 +31,8 @@ github.com/blevesearch/stempel v0.2.0 h1:CYzVPaScODMvgE9o+kf6D4RJ/VRomyi9uHF+PtB
 github.com/blevesearch/stempel v0.2.0/go.mod h1:wjeTHqQv+nQdbPuJ/YcvOjTInA2EIc6Ks1FoSUzSLvc=
 github.com/blevesearch/upsidedown_store_api v1.0.2 h1:U53Q6YoWEARVLd1OYNc9kvhBMGZzVrdmaozG2MfoB+A=
 github.com/blevesearch/upsidedown_store_api v1.0.2/go.mod h1:M01mh3Gpfy56Ps/UXHjEO/knbqyQ1Oamg8If49gRwrQ=
-github.com/blevesearch/vellum v1.0.11 h1:SJI97toEFTtA9WsDZxkyGTaBWFdWl1n2LEDCXLCq/AU=
-github.com/blevesearch/vellum v1.0.11/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y=
+github.com/blevesearch/vellum v1.1.0 h1:CinkGyIsgVlYf8Y2LUQHvdelgXr6PYuvoDIajq6yR9w=
+github.com/blevesearch/vellum v1.1.0/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y=
 github.com/blevesearch/zapx/v11 v11.3.10 h1:hvjgj9tZ9DeIqBCxKhi70TtSZYMdcFn7gDb71Xo/fvk=
 github.com/blevesearch/zapx/v11 v11.3.10/go.mod h1:0+gW+FaE48fNxoVtMY5ugtNHHof/PxCqh7CnhYdnMzQ=
 github.com/blevesearch/zapx/v12 v12.3.10 h1:yHfj3vXLSYmmsBleJFROXuO08mS3L1qDCdDK81jDl8s=

From 2971072eff2e1c650fd1096d5893f8aaaa8bd6ce Mon Sep 17 00:00:00 2001
From: CascadingRadium
Date: Tue, 17 Dec 2024 14:50:50 +0530
Subject: [PATCH 30/35] merge master

---
 go.mod              |  6 +++---
 go.sum              | 12 ++++++------
 index_alias_impl.go | 11 +++++++++++
 pre_search.go       |  4 ++++
 4 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/go.mod b/go.mod
index 9743d6bf0..623904722 100644
--- a/go.mod
+++ b/go.mod
@@ -23,8 +23,8 @@ require (
     github.com/blevesearch/zapx/v12 v12.3.10
     github.com/blevesearch/zapx/v13 v13.3.10
     github.com/blevesearch/zapx/v14 v14.3.10
-    github.com/blevesearch/zapx/v15 v15.3.16
-    github.com/blevesearch/zapx/v16 v16.1.9
+    github.com/blevesearch/zapx/v15 v15.3.17
+    github.com/blevesearch/zapx/v16 v16.1.10
     github.com/couchbase/moss v0.2.0
     github.com/golang/protobuf v1.3.2
     github.com/spf13/cobra v1.7.0
@@ -36,7 +36,7 @@ require (
     github.com/blevesearch/mmap-go v1.0.4 // indirect
     github.com/couchbase/ghistogram v0.1.0 // indirect
     github.com/golang/geo v0.0.0-20210211234256-740aa86cb551 // indirect
-    github.com/golang/snappy v0.0.1 // indirect
+    github.com/golang/snappy v0.0.4 // indirect
     github.com/inconshreveable/mousetrap v1.1.0 // indirect
     github.com/json-iterator/go v0.0.0-20171115153421-f7279a603ede // indirect
     github.com/mschoch/smat v0.2.0 // indirect
diff --git a/go.sum b/go.sum
index 28bc2b8d0..0ac89af30 100644
--- a/go.sum
+++ b/go.sum
@@ -41,10 +41,10 @@ github.com/blevesearch/zapx/v13 v13.3.10 h1:0KY9tuxg06rXxOZHg3DwPJBjniSlqEgVpxIq
 github.com/blevesearch/zapx/v13 v13.3.10/go.mod h1:w2wjSDQ/WBVeEIvP0fvMJZAzDwqwIEzVPnCPrz93yAk=
 github.com/blevesearch/zapx/v14 v14.3.10 h1:SG6xlsL+W6YjhX5N3aEiL/2tcWh3DO75Bnz77pSwwKU=
 github.com/blevesearch/zapx/v14 v14.3.10/go.mod h1:qqyuR0u230jN1yMmE4FIAuCxmahRQEOehF78m6oTgns=
-github.com/blevesearch/zapx/v15 v15.3.16 h1:Ct3rv7FUJPfPk99TI/OofdC+Kpb4IdyfdMH48sb+FmE=
-github.com/blevesearch/zapx/v15 v15.3.16/go.mod h1:Turk/TNRKj9es7ZpKK95PS7f6D44Y7fAFy8F4LXQtGg=
+github.com/blevesearch/zapx/v15 v15.3.17 h1:NkkMI98pYLq/uHnB6YWcITrrLpCVyvZ9iP+AyfpW1Ys=
+github.com/blevesearch/zapx/v15 v15.3.17/go.mod h1:vXRQzJJvlGVCdmOD5hg7t7JdjUT5DmDPhsAfjvtzIq8=
-github.com/blevesearch/zapx/v16 v16.1.9 h1:br5EgGntCF723cCecSpOACE0LnGAP4+HYrkcEfQkcBY=
-github.com/blevesearch/zapx/v16 v16.1.9/go.mod h1:zuxVgVaLZ0g4lZvrv06xDc24N6nLCOzXYHVkXI7LMHM=
+github.com/blevesearch/zapx/v16 v16.1.10 h1:moxlODQYuwqsXWK7Yyj40Wr1G8A4QKB32+fz3OLlEDU=
+github.com/blevesearch/zapx/v16 v16.1.10/go.mod h1:Xtloe2uqSXH2j3yrQr9yGiHwz3Hz/fpHn5szTqpyizs=
 github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps=
 github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k=
 github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o=
@@ -60,8 +60,8 @@ github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5y
 github.com/golang/protobuf v1.3.2 h1:6nsPYzhq5kReh6QImI3k5qWzO4PEbvbIW2cwSfR/6xs=
 github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
 github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
-github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4=
-github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
+github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM=
+github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
 github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0=
 github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
 github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI=
diff --git a/index_alias_impl.go b/index_alias_impl.go
index eee8243fb..21f2daa64 100644
--- a/index_alias_impl.go
+++ b/index_alias_impl.go
@@ -16,6 +16,7 @@ package bleve
 
 import (
     "context"
+    "fmt"
     "sync"
     "time"
 
@@ -672,6 +673,9 @@ func finalizeSearchResult(req *SearchRequest, preSearchResult *SearchResult) *Se
 }
 
 func requestSatisfiedByPreSearch(req *SearchRequest, flags *preSearchFlags) bool {
+    if flags == nil {
+        return false
+    }
     // if the synonyms presearch flag is set the request can never be satisfied by
     // the preSearch result as synonyms are not part of the preSearch result
     if flags.synonyms {
@@ -692,6 +696,9 @@ func constructSynonymPreSearchData(rv map[string]map[string]interface{}, sr *Sea
 
 func constructPreSearchData(req *SearchRequest, flags *preSearchFlags,
     preSearchResult *SearchResult, indexes []Index) (map[string]map[string]interface{}, error) {
+    if flags == nil || preSearchResult == nil {
+        return nil, fmt.Errorf("invalid input, flags: %v, preSearchResult: %v", flags, preSearchResult)
+    }
     mergedOut := make(map[string]map[string]interface{}, len(indexes))
     for _, index := range indexes {
         mergedOut[index.Name()] = make(map[string]interface{})
@@ -821,6 +828,10 @@ func preSearchDataSearch(ctx context.Context, req *SearchRequest, flags *preSear
 // finalizePreSearchResult finalizes the preSearch result by applying the finalization steps
 // specific to the preSearch flags
 func finalizePreSearchResult(req *SearchRequest, flags *preSearchFlags, preSearchResult *SearchResult) {
+    // if flags is nil then return
+    if flags == nil {
+        return
+    }
     if flags.knn {
         preSearchResult.Hits = finalizeKNNResults(req, preSearchResult.Hits)
     }
diff --git a/pre_search.go b/pre_search.go
index 90d1293bf..5fd710d68 100644
--- a/pre_search.go
+++ b/pre_search.go
@@ -105,6 +105,10 @@ func (m *compositePreSearchResultProcessor) finalize(sr *SearchResult) {
 // -----------------------------------------------------------------------------
 // Function to create the appropriate preSearchResultProcessor(s)
 func createPreSearchResultProcessor(req *SearchRequest, flags *preSearchFlags) preSearchResultProcessor {
+    // return nil for invalid input
+    if flags == nil || req == nil {
+        return nil
+    }
     var processors []preSearchResultProcessor
     // Add KNN processor if the request has KNN
     if flags.knn {

From e062cd79cf66056e6a4f58473e8003f4bb6393f2 Mon Sep 17 00:00:00 2001
From: CascadingRadium
Date: Tue, 17 Dec 2024 15:00:00 +0530
Subject: [PATCH 31/35] reposition

---
 index_alias_impl.go | 64 ++++++++++++++++++++++++++++++++++++++++++++--------------------------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/index_alias_impl.go b/index_alias_impl.go
index 21f2daa64..766240b4a 100644
--- a/index_alias_impl.go
+++ b/index_alias_impl.go
@@ -716,38 +716,6 @@ func constructPreSearchData(req *SearchRequest, flags *preSearchFlags,
     return mergedOut, nil
 }
 
-// redistributePreSearchData redistributes the preSearchData sent in the search request to an index alias
-// which would happen in the case of an alias tree and depending on the level of the tree, the preSearchData
-// needs to be redistributed to the indexes at that level
-func redistributePreSearchData(req *SearchRequest, indexes []Index) (map[string]map[string]interface{}, error) {
-    rv := make(map[string]map[string]interface{})
-    for _, index := range indexes {
-        rv[index.Name()] = make(map[string]interface{})
-    }
-    if knnHits, ok := req.PreSearchData[search.KnnPreSearchDataKey].([]*search.DocumentMatch); ok {
-        // the preSearchData for KNN is a list of DocumentMatch objects
-        // that need to be redistributed to the right index.
-        // This is used only in the case of an alias tree, where the indexes
-        // are at the leaves of the tree, and the master alias is at the root.
-        // At each level of the tree, the preSearchData needs to be redistributed
-        // to the indexes/aliases at that level. Because the preSearchData is
-        // specific to each final index at the leaf.
-        segregatedKnnHits, err := validateAndDistributeKNNHits(knnHits, indexes)
-        if err != nil {
-            return nil, err
-        }
-        for _, index := range indexes {
-            rv[index.Name()][search.KnnPreSearchDataKey] = segregatedKnnHits[index.Name()]
-        }
-    }
-    if fts, ok := req.PreSearchData[search.SynonymPreSearchDataKey].(search.FieldTermSynonymMap); ok {
-        for _, index := range indexes {
-            rv[index.Name()][search.SynonymPreSearchDataKey] = fts
-        }
-    }
-    return rv, nil
-}
-
 func preSearchDataSearch(ctx context.Context, req *SearchRequest, flags *preSearchFlags, indexes ...Index) (*SearchResult, error) {
     asyncResults := make(chan *asyncSearchResult, len(indexes))
     // run search on each index in separate go routine
@@ -825,6 +793,38 @@ func preSearchDataSearch(ctx context.Context, req *SearchRequest, flags *preSear
     return sr, nil
 }
 
+// redistributePreSearchData redistributes the preSearchData sent in the search request to an index alias;
+// this happens in the case of an alias tree, where, depending on the level of the tree, the preSearchData
+// needs to be redistributed to the indexes at that level
+func redistributePreSearchData(req *SearchRequest, indexes []Index) (map[string]map[string]interface{}, error) {
+    rv := make(map[string]map[string]interface{})
+    for _, index := range indexes {
+        rv[index.Name()] = make(map[string]interface{})
+    }
+    if knnHits, ok := req.PreSearchData[search.KnnPreSearchDataKey].([]*search.DocumentMatch); ok {
+        // the preSearchData for KNN is a list of DocumentMatch objects
+        // that need to be redistributed to the right index.
+        // This is used only in the case of an alias tree, where the indexes
+        // are at the leaves of the tree, and the master alias is at the root.
+        // At each level of the tree, the preSearchData needs to be redistributed
+        // to the indexes/aliases at that level, because the preSearchData is
+        // specific to each final index at the leaf.
+        segregatedKnnHits, err := validateAndDistributeKNNHits(knnHits, indexes)
+        if err != nil {
+            return nil, err
+        }
+        for _, index := range indexes {
+            rv[index.Name()][search.KnnPreSearchDataKey] = segregatedKnnHits[index.Name()]
+        }
+    }
+    if fts, ok := req.PreSearchData[search.SynonymPreSearchDataKey].(search.FieldTermSynonymMap); ok {
+        for _, index := range indexes {
+            rv[index.Name()][search.SynonymPreSearchDataKey] = fts
+        }
+    }
+    return rv, nil
+}
+
 // finalizePreSearchResult finalizes the preSearch result by applying the finalization steps
 // specific to the preSearch flags
 func finalizePreSearchResult(req *SearchRequest, flags *preSearchFlags, preSearchResult *SearchResult) {

From 41fc99e8d6e73c906553921790d41f4a43d8341a Mon Sep 17 00:00:00 2001
From: CascadingRadium
Date: Thu, 19 Dec 2024 15:34:21 +0530
Subject: [PATCH 32/35] refactor

---
 search/util.go | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/search/util.go b/search/util.go
index 9f5a15cac..2e95f1180 100644
--- a/search/util.go
+++ b/search/util.go
@@ -148,15 +148,15 @@ type SearcherEndCallbackFn func(size uint64) error
 // field -> term -> synonyms
 type FieldTermSynonymMap map[string]map[string][]string
 
-func (f *FieldTermSynonymMap) MergeWith(fts FieldTermSynonymMap) {
+func (f FieldTermSynonymMap) MergeWith(fts FieldTermSynonymMap) {
     for field, termSynonymMap := range fts {
         // Ensure the field exists in the receiver
-        if _, exists := (*f)[field]; !exists {
-            (*f)[field] = make(map[string][]string)
+        if _, exists := f[field]; !exists {
+            f[field] = make(map[string][]string)
         }
         for term, synonyms := range termSynonymMap {
             // Append synonyms
-            (*f)[field][term] = append((*f)[field][term], synonyms...)
+            f[field][term] = append(f[field][term], synonyms...)
         }
     }
 }

From 0f11d73c10d7719a3eb01f7cd6dfb39937423af8 Mon Sep 17 00:00:00 2001
From: CascadingRadium
Date: Thu, 19 Dec 2024 20:53:54 +0530
Subject: [PATCH 33/35] minor fix

---
 document/field_synonym.go | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/document/field_synonym.go b/document/field_synonym.go
index 4aa603fc4..c34b481dd 100644
--- a/document/field_synonym.go
+++ b/document/field_synonym.go
@@ -81,12 +81,18 @@ func (s *SynonymField) Analyze() {
     if len(s.input) > 0 {
         analyzedInput = make([]string, 0, len(s.input))
         for _, term := range s.input {
-            analyzedInput = append(analyzedInput, analyzeSynonymTerm(term, s.analyzer))
+            analyzedTerm := analyzeSynonymTerm(term, s.analyzer)
+            if analyzedTerm != "" {
+                analyzedInput = append(analyzedInput, analyzedTerm)
+            }
         }
     }
     analyzedSynonyms := make([]string, 0, len(s.synonyms))
     for _, syn := range s.synonyms {
-        analyzedSynonyms = append(analyzedSynonyms, analyzeSynonymTerm(syn, s.analyzer))
+        analyzedTerm := analyzeSynonymTerm(syn, s.analyzer)
+        if analyzedTerm != "" {
+            analyzedSynonyms = append(analyzedSynonyms, analyzedTerm)
+        }
     }
     s.synonymMap = processSynonymData(analyzedInput, analyzedSynonyms)
 }
@@ -136,8 +142,8 @@ func processSynonymData(input []string, synonyms []string) map[string][]string {
 
 func analyzeSynonymTerm(term string, analyzer analysis.Analyzer) string {
     tokenStream := analyzer.Analyze([]byte(term))
-    if len(tokenStream) == 0 {
-        return term
+    if len(tokenStream) == 1 {
+        return string(tokenStream[0].Term)
     }
-    return string(tokenStream[0].Term)
+    return ""
 }

From c73184451c47509d9a943f8a6dc71a8637192854 Mon Sep 17 00:00:00 2001
From: CascadingRadium
Date: Thu, 19 Dec 2024 20:59:22 +0530
Subject: [PATCH 34/35] test fix

---
 search_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/search_test.go b/search_test.go
index 043be4073..fdfaa3efb 100644
--- a/search_test.go
+++ b/search_test.go
@@ -3836,7 +3836,7 @@ func TestThesaurusTermReader(t *testing.T) {
     }
 
     for synName, synDef := range synonymDocuments {
-        err := batch.IndexSynonym(synName, "collection1", synDef)
+        err := batch.IndexSynonym(synName, synonymCollection, synDef)
         if err != nil {
             t.Fatal(err)
         }

From 148d32aa6b71cffbe9d6a8c4f87173f3b690bf33 Mon Sep 17 00:00:00 2001
From: Abhinav Dangeti
Date: Thu, 19 Dec 2024 09:04:55 -0700
Subject: [PATCH 35/35] Bump up zap/v16

* 82553cd Rahul Rampure | Add Thesaurus API and Synonym Index Handling in Search

---
 go.mod | 2 +-
 go.sum | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/go.mod b/go.mod
index 7ea27dcde..cfee95607 100644
--- a/go.mod
+++ b/go.mod
@@ -24,7 +24,7 @@ require (
     github.com/blevesearch/zapx/v13 v13.3.10
     github.com/blevesearch/zapx/v14 v14.3.10
     github.com/blevesearch/zapx/v15 v15.3.17
-    github.com/blevesearch/zapx/v16 v16.1.11-0.20241217210710-e1dde3e9876d
+    github.com/blevesearch/zapx/v16 v16.1.11-0.20241219160422-82553cdd4b38
     github.com/couchbase/moss v0.2.0
     github.com/golang/protobuf v1.3.2
     github.com/spf13/cobra v1.7.0
diff --git a/go.sum b/go.sum
index 1aa0281d5..f21c89611 100644
--- a/go.sum
+++ b/go.sum
@@ -43,8 +43,8 @@ github.com/blevesearch/zapx/v14 v14.3.10 h1:SG6xlsL+W6YjhX5N3aEiL/2tcWh3DO75Bnz7
 github.com/blevesearch/zapx/v14 v14.3.10/go.mod h1:qqyuR0u230jN1yMmE4FIAuCxmahRQEOehF78m6oTgns=
 github.com/blevesearch/zapx/v15 v15.3.17 h1:NkkMI98pYLq/uHnB6YWcITrrLpCVyvZ9iP+AyfpW1Ys=
 github.com/blevesearch/zapx/v15 v15.3.17/go.mod h1:vXRQzJJvlGVCdmOD5hg7t7JdjUT5DmDPhsAfjvtzIq8=
-github.com/blevesearch/zapx/v16 v16.1.11-0.20241217210710-e1dde3e9876d h1:XUZzJwWrRqRJwigYWE7iB2nYBP6rjcU3x+InZtvQOGo=
-github.com/blevesearch/zapx/v16 v16.1.11-0.20241217210710-e1dde3e9876d/go.mod h1:wZc3SFjKlrqxkiUkT+HVBBBBTX8oqXxUb2gjE+CMgIE=
+github.com/blevesearch/zapx/v16 v16.1.11-0.20241219160422-82553cdd4b38 h1:iJ3Q3sbyo2d0bjfb720RmGjj7cqzh/EdP3528ggDIMY=
+github.com/blevesearch/zapx/v16 v16.1.11-0.20241219160422-82553cdd4b38/go.mod h1:JTZseJiEpogtkepKSubIKAmfgbQiOReJXfmjxB1qta4=
 github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps=
 github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k=
 github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o=
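Taken together, the series enables the following end-to-end flow — a rough sketch distilled from the tests in this series (TestThesaurusTermReader and TestSynonymSearchQueries); the index path, source name, and terms are hypothetical:

    imap := mapping.NewIndexMapping()
    textField := mapping.NewTextFieldMapping()
    textField.Analyzer = simple.Name
    textField.SynonymSource = "english"
    imap.DefaultMapping.AddFieldMappingsAt("text", textField)
    // A synonym source pairs a collection of synonym documents with an analyzer.
    err := imap.AddSynonymSource("english", map[string]interface{}{
        "collection": "collection1",
        "analyzer":   simple.Name,
    })
    if err != nil {
        log.Fatal(err)
    }
    if err = imap.Validate(); err != nil {
        log.Fatal(err)
    }
    idx, err := bleve.New("example.bleve", imap)
    if err != nil {
        log.Fatal(err)
    }
    if si, ok := idx.(bleve.SynonymIndex); ok {
        _ = si.IndexSynonym("syn1", "collection1", &bleve.SynonymDefinition{
            Synonyms: []string{"quick", "fast", "speedy"},
        })
    }
    // A match query for "fast" on the "text" field can now also surface
    // documents containing "quick" or "speedy", via the synonym preSearch phase.
    q := bleve.NewMatchQuery("fast")
    q.SetField("text")
    res, err := idx.Search(bleve.NewSearchRequest(q))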