// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package index

import (
	"bytes"
	"context"
	"reflect"
)

var reflectStaticSizeTermFieldDoc int
var reflectStaticSizeTermFieldVector int

func init() {
	var tfd TermFieldDoc
	reflectStaticSizeTermFieldDoc = int(reflect.TypeOf(tfd).Size())
	var tfv TermFieldVector
	reflectStaticSizeTermFieldVector = int(reflect.TypeOf(tfv).Size())
}

type Index interface {
	Open() error
	Close() error

	Update(doc Document) error
	Delete(id string) error
	Batch(batch *Batch) error

	SetInternal(key, val []byte) error
	DeleteInternal(key []byte) error

	// Reader returns a low-level accessor on the index data. Close it to
	// release associated resources.
	Reader() (IndexReader, error)

	StatsMap() map[string]interface{}
}

// CopyIndex is an extended index that supports copying to a new location online.
// Use the CopyReader method to obtain a reader for initiating the copy operation.
type CopyIndex interface {
	Index
	// CopyReader returns a copy reader for the online copy/backup operation,
	// to handle necessary bookkeeping, instead of using the regular IndexReader.
	CopyReader() CopyReader
}

// EventIndex is an optional interface for exposing the support for firing event
// callbacks for various events in the index.
type EventIndex interface {
	// FireIndexEvent is used to fire an event callback when Index() is called,
	// to notify the caller that a document has been added to the index.
	FireIndexEvent()
}

type IndexReader interface {
	TermFieldReader(ctx context.Context, term []byte, field string, includeFreq, includeNorm, includeTermVectors bool) (TermFieldReader, error)

	// DocIDReaderAll returns an iterator over all doc ids.
	// The caller must close the returned instance to release associated resources.
	DocIDReaderAll() (DocIDReader, error)

	DocIDReaderOnly(ids []string) (DocIDReader, error)

	FieldDict(field string) (FieldDict, error)

	// FieldDictRange is currently defined to include the start and end terms
	FieldDictRange(field string, startTerm []byte, endTerm []byte) (FieldDict, error)
	FieldDictPrefix(field string, termPrefix []byte) (FieldDict, error)

	Document(id string) (Document, error)

	DocValueReader(fields []string) (DocValueReader, error)

	Fields() ([]string, error)

	GetInternal(key []byte) ([]byte, error)

	DocCount() (uint64, error)

	ExternalID(id IndexInternalID) (string, error)
	InternalID(id string) (IndexInternalID, error)

	Close() error
}

type BM25Reader interface {
	IndexReader
	FieldCardinality(field string) (int, error)
}

// CopyReader is an extended index reader for backup or online copy operations,
// replacing the regular index reader.
type CopyReader interface {
	IndexReader

	// CopyTo performs an online copy or backup of the index to the specified directory.
	CopyTo(d Directory) error

	// CloseCopyReader must be used instead of Close() to close the copy reader.
	CloseCopyReader() error
}
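// The sketch below is illustrative only and not part of the interfaces above:
// it shows the reader lifecycle implied by the Index and IndexReader contracts,
// where the caller closes the reader to release its resources. The
// exampleDocCount name is an assumption made for this example.
func exampleDocCount(idx Index) (uint64, error) {
	reader, err := idx.Reader()
	if err != nil {
		return 0, err
	}
	// Close the reader when done, per the Reader() contract above.
	defer reader.Close()
	return reader.DocCount()
}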
// RegexAutomaton abstracts an automaton built using a regex pattern.
type RegexAutomaton interface {
	// MatchesRegex returns true if the given string matches the regex pattern
	// used to build the automaton.
	MatchesRegex(string) bool
}

// IndexReaderRegexp provides functionality to work with regex-based field dictionaries.
type IndexReaderRegexp interface {
	// FieldDictRegexp returns a FieldDict for terms matching the specified regex pattern
	// in the dictionary of the given field.
	FieldDictRegexp(field string, regex string) (FieldDict, error)

	// FieldDictRegexpAutomaton returns a FieldDict and a RegexAutomaton that can be used
	// to match strings against the regex pattern.
	FieldDictRegexpAutomaton(field string, regex string) (FieldDict, RegexAutomaton, error)
}

// FuzzyAutomaton abstracts a Levenshtein automaton built using a term and a fuzziness value.
type FuzzyAutomaton interface {
	// MatchAndDistance checks if the given string is within the fuzziness distance
	// of the term used to build the automaton. It also returns the edit (Levenshtein)
	// distance between the string and the term.
	MatchAndDistance(term string) (bool, uint8)
}

// IndexReaderFuzzy provides functionality to work with fuzzy matching in field dictionaries.
type IndexReaderFuzzy interface {
	// FieldDictFuzzy returns a FieldDict for terms that are within the specified fuzziness
	// distance of the given term and match the specified prefix in the given field.
	FieldDictFuzzy(field string, term string, fuzziness int, prefix string) (FieldDict, error)

	// FieldDictFuzzyAutomaton returns a FieldDict and a FuzzyAutomaton that can be used
	// to calculate the edit distance between the term and other strings.
	FieldDictFuzzyAutomaton(field string, term string, fuzziness int, prefix string) (FieldDict, FuzzyAutomaton, error)
}

type IndexReaderContains interface {
	FieldDictContains(field string) (FieldDictContains, error)
}
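// Illustrative sketch, not part of the original API: enumerating the terms of a
// fuzzy field dictionary through the optional IndexReaderFuzzy interface. The
// collectFuzzyTerms name and the empty prefix argument are assumptions for this
// example, as is the assumption that FieldDict.Next returns a nil entry once
// the enumeration is complete (matching the other iterators in this file).
func collectFuzzyTerms(r IndexReader, field, term string, fuzziness int) ([]string, error) {
	fr, ok := r.(IndexReaderFuzzy)
	if !ok {
		// the underlying implementation does not support fuzzy dictionaries
		return nil, nil
	}
	dict, err := fr.FieldDictFuzzy(field, term, fuzziness, "")
	if err != nil {
		return nil, err
	}
	defer dict.Close()

	var terms []string
	for {
		entry, err := dict.Next()
		if err != nil {
			return nil, err
		}
		if entry == nil {
			// enumeration complete
			break
		}
		terms = append(terms, entry.Term)
	}
	return terms, nil
}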
// SpatialIndexPlugin is an optional interface for exposing the
// support for any custom analyzer plugins that are capable of
// generating hierarchical spatial tokens for both indexing and
// query purposes from the geo location data.
type SpatialIndexPlugin interface {
	GetSpatialAnalyzerPlugin(typ string) (SpatialAnalyzerPlugin, error)
}

type TermFieldVector struct {
	Field          string
	ArrayPositions []uint64
	Pos            uint64
	Start          uint64
	End            uint64
}

func (tfv *TermFieldVector) Size() int {
	return reflectStaticSizeTermFieldVector + sizeOfPtr +
		len(tfv.Field) + len(tfv.ArrayPositions)*sizeOfUint64
}

// IndexInternalID is an opaque document identifier internal to the index implementation
type IndexInternalID []byte

func (id IndexInternalID) Equals(other IndexInternalID) bool {
	return id.Compare(other) == 0
}

func (id IndexInternalID) Compare(other IndexInternalID) int {
	return bytes.Compare(id, other)
}

type TermFieldDoc struct {
	Term    string
	ID      IndexInternalID
	Freq    uint64
	Norm    float64
	Vectors []*TermFieldVector
}

func (tfd *TermFieldDoc) Size() int {
	sizeInBytes := reflectStaticSizeTermFieldDoc + sizeOfPtr +
		len(tfd.Term) + len(tfd.ID)

	for _, entry := range tfd.Vectors {
		sizeInBytes += entry.Size()
	}

	return sizeInBytes
}

// Reset allows an already allocated TermFieldDoc to be reused
func (tfd *TermFieldDoc) Reset() *TermFieldDoc {
	// remember the []byte used for the ID
	id := tfd.ID
	vectors := tfd.Vectors
	// idiom to copy over from empty TermFieldDoc (0 allocations)
	*tfd = TermFieldDoc{}
	// reuse the []byte already allocated (and reset len to 0)
	tfd.ID = id[:0]
	tfd.Vectors = vectors[:0]
	return tfd
}

// TermFieldReader is the interface exposing the enumeration of documents
// containing a given term in a given field. Documents are returned in byte
// lexicographic order over their identifiers.
type TermFieldReader interface {
	// Next returns the next document containing the term in this field, or nil
	// when it reaches the end of the enumeration. The preAlloced TermFieldDoc
	// is optional, and when non-nil, will be used instead of allocating memory.
	Next(preAlloced *TermFieldDoc) (*TermFieldDoc, error)

	// Advance resets the enumeration to the specified document or its immediate
	// follower.
	Advance(ID IndexInternalID, preAlloced *TermFieldDoc) (*TermFieldDoc, error)

	// Count returns the number of documents containing the term in this field.
	Count() uint64
	Close() error

	Size() int
}

type DictEntry struct {
	Term         string
	Count        uint64
	EditDistance uint8
}

type FieldDict interface {
	Next() (*DictEntry, error)
	Close() error

	Cardinality() int
	BytesRead() uint64
}

type FieldDictContains interface {
	Contains(key []byte) (bool, error)

	BytesRead() uint64
}

// DocIDReader is the interface exposing enumeration of document identifiers.
// Close the reader to release associated resources.
type DocIDReader interface {
	// Next returns the next document internal identifier in the natural
	// index order, or nil when the end of the sequence is reached.
	Next() (IndexInternalID, error)

	// Advance resets the iteration to the first internal identifier greater than
	// or equal to ID. If ID is smaller than the start of the range, the iteration
	// will start there instead. If ID is greater than or equal to the end of
	// the range, the next Next() call will return io.EOF.
	Advance(ID IndexInternalID) (IndexInternalID, error)

	Size() int

	Close() error
}

type DocValueVisitor func(field string, term []byte)

type DocValueReader interface {
	VisitDocValues(id IndexInternalID, visitor DocValueVisitor) error

	BytesRead() uint64
}

// IndexBuilder is an interface supported by some index schemes
// to allow direct write-only index building
type IndexBuilder interface {
	Index(doc Document) error
	Close() error
}
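// Illustrative sketch, not part of the original API: consuming a
// TermFieldReader while reusing a single preallocated TermFieldDoc across
// Next calls, as the Reset method above is intended to support. The
// countTermDocs name is an assumption made for this example.
func countTermDocs(tfr TermFieldReader) (uint64, error) {
	var count uint64
	tfd := &TermFieldDoc{}
	for {
		// Reset clears the previous hit but keeps its allocations.
		next, err := tfr.Next(tfd.Reset())
		if err != nil {
			return count, err
		}
		if next == nil {
			// end of the enumeration, per the Next contract above
			break
		}
		count++
	}
	return count, nil
}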
// ThesaurusTermReader is an interface for enumerating synonyms of a term in a thesaurus.
type ThesaurusTermReader interface {
	// Next returns the next synonym of the term, or an error if something goes wrong.
	// Returns an empty string when the enumeration is complete.
	Next() (string, error)

	// Close releases any resources associated with the reader.
	Close() error

	Size() int
}

// ThesaurusEntry represents a term in the thesaurus for which synonyms are stored.
type ThesaurusEntry struct {
	Term string
}

// ThesaurusKeys is an interface for enumerating terms (keys) in a thesaurus.
type ThesaurusKeys interface {
	// Next returns the next key in the thesaurus, or an error if something goes wrong.
	// Returns nil when the enumeration is complete.
	Next() (*ThesaurusEntry, error)

	// Close releases any resources associated with the reader.
	Close() error
}

// ThesaurusReader is an interface for accessing a thesaurus in the index.
type ThesaurusReader interface {
	IndexReader

	// ThesaurusTermReader returns a reader for the synonyms of a given term in the
	// specified thesaurus.
	ThesaurusTermReader(ctx context.Context, name string, term []byte) (ThesaurusTermReader, error)

	// ThesaurusKeys returns a reader for all terms in the specified thesaurus.
	ThesaurusKeys(name string) (ThesaurusKeys, error)

	// ThesaurusKeysFuzzy returns a reader for terms in the specified thesaurus that
	// match the given prefix and are within the specified fuzziness distance from
	// the provided term.
	ThesaurusKeysFuzzy(name string, term string, fuzziness int, prefix string) (ThesaurusKeys, error)

	// ThesaurusKeysRegexp returns a reader for terms in the specified thesaurus that
	// match the given regular expression pattern.
	ThesaurusKeysRegexp(name string, regex string) (ThesaurusKeys, error)

	// ThesaurusKeysPrefix returns a reader for terms in the specified thesaurus that
	// start with the given prefix.
	ThesaurusKeysPrefix(name string, termPrefix []byte) (ThesaurusKeys, error)
}

// EligibleDocumentSelector filters documents based on specific eligibility criteria.
// It can be extended with additional methods for filtering and retrieval.
type EligibleDocumentSelector interface {
	// AddEligibleDocumentMatch marks a document as eligible for selection.
	// id is the internal identifier of the document to be added.
	AddEligibleDocumentMatch(id IndexInternalID) error

	// SegmentEligibleDocs returns a list of eligible document IDs within a given segment.
	// segmentID identifies the segment for which eligible documents are retrieved.
	// This must be called after all eligible documents have been added.
	SegmentEligibleDocs(segmentID int) []uint64
}
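// Illustrative sketch, not part of the original API: collecting all synonyms of
// a term from a ThesaurusReader. The synonymsOf name is an assumption for this
// example, as is the assumption that an empty string from Next signals the end
// of the enumeration, per the ThesaurusTermReader comment above.
func synonymsOf(ctx context.Context, tr ThesaurusReader, name string, term []byte) ([]string, error) {
	str, err := tr.ThesaurusTermReader(ctx, name, term)
	if err != nil {
		return nil, err
	}
	defer str.Close()

	var synonyms []string
	for {
		syn, err := str.Next()
		if err != nil {
			return nil, err
		}
		if syn == "" {
			// enumeration complete
			break
		}
		synonyms = append(synonyms, syn)
	}
	return synonyms, nil
}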