Skip to content

Commit e3cef4c

Browse files
committed
Implemented vector search / vector index [API]
1 parent 74e7a7e commit e3cef4c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

62 files changed

+11275
-214
lines changed

.gitmodules

+3
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,6 @@
2525
[submodule "tools"]
2626
path = tools
2727
url = https://github.com/couchbaselabs/litecore-download-script
28+
[submodule "vendor/mobile-vector-search"]
29+
path = vendor/mobile-vector-search
30+
url = https://github.com/couchbaselabs/mobile-vector-search.git

C/Cpp_include/c4Database.hh

+4
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@ struct C4Database
5050

5151
using Config = C4DatabaseConfig2;
5252

53+
/** Registers a directory path to load extension libraries from, such as Vector Search.
54+
Must be called before opening a database that will use an extension. */
55+
static void setExtensionPath(slice path);
56+
5357
static bool exists(slice name, slice inDirectory);
5458
static void copyNamed(slice sourcePath, slice destinationName, const Config&);
5559
static bool deleteNamed(slice name, slice inDirectory);

C/c4.exp

+2
Original file line numberDiff line numberDiff line change
@@ -400,6 +400,8 @@ _c4error_return
400400
_c4_dumpInstances
401401
_gC4ExpectExceptions
402402

403+
_c4_setExtensionPath
404+
403405
_FLDoc_FromJSON
404406
_FLDoc_Retain
405407
_FLDoc_GetAllocedData

C/c4CAPI.cc

+2
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,8 @@ C4SliceResult c4coll_getIndexesInfo(C4Collection* coll, C4Error* C4NULLABLE outE
361361

362362
#pragma mark - DATABASE:
363363

364+
/** C API entry point that registers the directory to load extension libraries from.
    Simply forwards `path` to C4Database::setExtensionPath. */
void c4_setExtensionPath(C4String path) noexcept {
    C4Database::setExtensionPath(path);
}
365+
364366
bool c4db_exists(C4String name, C4String inDirectory) noexcept { return C4Database::exists(name, inDirectory); }
365367

366368
bool c4key_setPassword(C4EncryptionKey* outKey, C4String password, C4EncryptionAlgorithm alg) noexcept {

C/c4Database.cc

+2
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ C4EncryptionKey C4EncryptionKeyFromPasswordSHA1(slice password, C4EncryptionAlgo
6060
return key;
6161
}
6262

63+
/// Registers the directory to load extension libraries from: converts `path` to a
/// string and hands it to SQLiteDataFile::setExtensionPath.
void C4Database::setExtensionPath(slice path) {
    SQLiteDataFile::setExtensionPath(string(path));
}
64+
6365
#pragma mark - STATIC LIFECYCLE METHODS:
6466

6567
static FilePath dbPath(slice name, slice parentDir) {

C/c4_ee.exp

+2
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,8 @@ _c4error_return
441441
_c4_dumpInstances
442442
_gC4ExpectExceptions
443443

444+
_c4_setExtensionPath
445+
444446
_FLDoc_FromJSON
445447
_FLDoc_Retain
446448
_FLDoc_GetAllocedData

C/include/c4Database.h

+4
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,10 @@ CBL_CORE_API bool c4key_setPassword(C4EncryptionKey* encryptionKey, C4String pas
4444
CBL_CORE_API bool c4key_setPasswordSHA1(C4EncryptionKey* encryptionKey, C4String password,
4545
C4EncryptionAlgorithm alg) C4API;
4646

47+
/** Registers a directory path to load extension libraries from, such as Vector Search.
48+
Must be called before opening a database that will use an extension. */
49+
CBL_CORE_API void c4_setExtensionPath(C4String path) C4API;
50+
4751
/** @} */
4852

4953
//////// DATABASE API:

C/include/c4DatabaseTypes.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ typedef C4_OPTIONS(uint32_t, C4DatabaseFlags){
3434
kC4DB_Create = 0x01, ///< Create the file if it doesn't exist
3535
kC4DB_ReadOnly = 0x02, ///< Open file read-only
3636
kC4DB_AutoCompact = 0x04, ///< Enable auto-compaction [UNIMPLEMENTED]
37-
kC4DB_VersionVectors = 0x08, ///< Upgrade DB to version vectors instead of rev trees [EXPERIMENTAL]
37+
kC4DB_VersionVectors = 0x08, ///< Upgrade DB to version vectors instead of rev trees
3838
kC4DB_NoUpgrade = 0x20, ///< Disable upgrading an older-version database
3939
kC4DB_NonObservable = 0x40, ///< Disable database/collection observers, for slightly faster writes
4040
kC4DB_FakeVectorClock = 0x80, ///< Use counters instead of timestamps in version vectors (TESTS ONLY)

C/include/c4IndexTypes.h

+18-1
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,24 @@ typedef C4_ENUM(uint32_t, C4IndexType){
2727
kC4FullTextIndex, ///< Full-text index
2828
kC4ArrayIndex, ///< Index of array values, for use with UNNEST
2929
kC4PredictiveIndex, ///< Index of prediction() results (Enterprise Edition only)
30+
kC4VectorIndex, ///< Index of ML vector similarity.
3031
};
3132

33+
/** Types of encoding (compression) to use in vector indexes. */
34+
typedef C4_ENUM(uint32_t, C4VectorEncoding){
        kC4VectorEncodingDefault = 0,  ///< Use default encoding, which is currently SQ
        kC4VectorEncodingNone    = 1,  ///< No encoding: 4 bytes per dimension, no data loss
        kC4VectorEncodingSQ      = 2,  ///< Scalar Quantizer: 1 byte per dimension (recommended)
};  // Values are pinned explicitly because they must match IndexSpec::VectorOptions::Encoding
39+
40+
/** Options for vector indexes. */
41+
typedef struct C4VectorIndexOptions {
42+
unsigned numCentroids; ///< Number of buckets to partition the vectors between
43+
C4VectorEncoding encoding; ///< Vector compression type
44+
} C4VectorIndexOptions;
45+
3246
/** Options for indexes; these each apply to specific types of indexes. */
33-
typedef struct {
47+
typedef struct C4IndexOptions {
3448
/** Dominant language of text to be indexed; setting this enables word stemming, i.e.
3549
matching different forms of the same word ("big" and "bigger", for instance.)
3650
Can be an ISO-639 language code or a lowercase (English) language name; supported
@@ -61,6 +75,9 @@ typedef struct {
6175
To provide a custom list of words, use a string containing the words in lowercase
6276
separated by spaces. */
6377
const char* C4NULLABLE stopWords;
78+
79+
/** Options for vector indexes. */
80+
C4VectorIndexOptions vector;
6481
} C4IndexOptions;
6582

6683
/** @} */

C/scripts/c4.txt

+2
Original file line numberDiff line numberDiff line change
@@ -408,6 +408,8 @@ c4error_return
408408
c4_dumpInstances
409409
gC4ExpectExceptions
410410

411+
c4_setExtensionPath
412+
411413
FLDoc_FromJSON
412414
FLDoc_Retain
413415
FLDoc_GetAllocedData

C/tests/c4Test.cc

+1-19
Original file line numberDiff line numberDiff line change
@@ -578,25 +578,7 @@ fleece::alloc_slice C4Test::readFile(const std::string& filepath) {
578578
}
579579

580580
bool C4Test::readFileByLines(const string& path, function_ref<bool(FLSlice)> callback, size_t maxLines) {
581-
INFO("Reading lines from " << path);
582-
fstream fd(path.c_str(), ios_base::in);
583-
REQUIRE(fd);
584-
vector<char> buf(1000000); // The Wikipedia dumps have verrry long lines
585-
size_t lineCount = 0;
586-
while ( fd.good() ) {
587-
if ( maxLines > 0 && lineCount == maxLines ) { break; }
588-
// Ensure that buf.capacity (size_t/uint64) will not exceed limit of std::streamsize (int64)
589-
DebugAssert(buf.capacity() <= std::numeric_limits<std::streamsize>::max());
590-
fd.getline(buf.data(), buf.capacity()); // NOLINT(cppcoreguidelines-narrowing-conversions)
591-
auto len = fd.gcount();
592-
if ( len <= 0 ) break;
593-
++lineCount;
594-
REQUIRE(buf[len - 1] == '\0');
595-
--len;
596-
if ( !callback({buf.data(), (size_t)len}) ) return false;
597-
}
598-
REQUIRE((fd.eof() || (maxLines > 0 && lineCount == maxLines)));
599-
return true;
581+
return ReadFileByLines(path, callback, maxLines);
600582
}
601583

602584
unsigned C4Test::importJSONFile(const string& path, const string& idPrefix, double timeout, bool verbose) const {

CMakeLists.txt

-1
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,6 @@ target_compile_definitions(
185185
-DHAVE_LOCALTIME_R # Use localtime_r instead of localtime
186186
-DHAVE_USLEEP # Allow millisecond precision sleep
187187
-DHAVE_UTIME # Use utime() instead of utimes()
188-
-DSQLITE_OMIT_LOAD_EXTENSION # Disable extensions (not needed for LiteCore)
189188
-DSQLITE_ENABLE_FTS4 # Build FTS versions 3 and 4
190189
-DSQLITE_ENABLE_FTS3_PARENTHESIS # Allow AND and NOT support in FTS parser
191190
-DSQLITE_ENABLE_FTS3_TOKENIZER # Allow LiteCore to define a tokenizer

LiteCore/Database/CollectionImpl.hh

+24-4
Original file line numberDiff line numberDiff line change
@@ -389,13 +389,33 @@ namespace litecore {
389389

390390
#pragma mark - INDEXES:
391391

392-
393-
static_assert(sizeof(C4IndexOptions) == sizeof(IndexSpec::Options));
394-
395392
void createIndex(slice indexName, slice indexSpec, C4QueryLanguage indexLanguage, C4IndexType indexType,
396393
const C4IndexOptions* indexOptions = nullptr) override {
394+
IndexSpec::Options options;
395+
switch ( indexType ) {
396+
case kC4FullTextIndex:
397+
if ( indexOptions ) {
398+
IndexSpec::FTSOptions ftsOpt;
399+
ftsOpt.language = indexOptions->language;
400+
ftsOpt.ignoreDiacritics = indexOptions->ignoreDiacritics;
401+
ftsOpt.disableStemming = indexOptions->disableStemming;
402+
ftsOpt.stopWords = indexOptions->stopWords;
403+
options = ftsOpt;
404+
break;
405+
}
406+
case kC4VectorIndex:
407+
if ( indexOptions ) {
408+
IndexSpec::VectorOptions vecOpt;
409+
vecOpt.numCentroids = indexOptions->vector.numCentroids;
410+
vecOpt.encoding = IndexSpec::VectorOptions::Encoding(indexOptions->vector.encoding);
411+
options = vecOpt;
412+
}
413+
break;
414+
default:
415+
break;
416+
}
397417
keyStore().createIndex(indexName, indexSpec, (QueryLanguage)indexLanguage, (IndexSpec::Type)indexType,
398-
(const IndexSpec::Options*)indexOptions);
418+
options);
399419
}
400420

401421
void deleteIndex(slice indexName) override { keyStore().deleteIndex(indexName); }

LiteCore/Query/IndexSpec.cc

+7-2
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,17 @@ namespace litecore {
2525
using namespace fleece::impl;
2626

2727
/// Constructs an index spec and validates that the options fit the index type.
/// Empty options (std::monostate) are always accepted. Non-empty options are rejected when
/// the type is kFullText but the options are not FTSOptions, or the type is kVector but the
/// options are not VectorOptions. Other index types are not checked.
/// @throws LiteCoreError::InvalidParameter on such a mismatch.
IndexSpec::IndexSpec(std::string name_, Type type_, alloc_slice expression_, QueryLanguage queryLanguage_,
                     Options opt)
    : name(std::move(name_))
    , type(type_)
    , expression(std::move(expression_))
    , queryLanguage(queryLanguage_)
    , options(std::move(opt)) {
    if ( std::holds_alternative<std::monostate>(options) ) return;  // nothing to validate

    // Test by alternative type rather than by position in the variant, so the check does not
    // depend on the order in which Options lists its alternatives.
    const bool ftsMismatch    = (type == kFullText) && !std::holds_alternative<FTSOptions>(options);
    const bool vectorMismatch = (type == kVector) && !std::holds_alternative<VectorOptions>(options);
    if ( ftsMismatch || vectorMismatch )
        error::_throw(error::LiteCoreError::InvalidParameter, "Invalid options type for index");
}
3439

3540
IndexSpec::IndexSpec(IndexSpec&&) = default;
3641
IndexSpec::~IndexSpec() = default;

LiteCore/Query/IndexSpec.hh

+41-14
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "Base.hh"
1515
#include <optional>
1616
#include <string>
17+
#include <variant>
1718

1819
namespace fleece::impl {
1920
class Array;
@@ -28,22 +29,46 @@ namespace litecore {
2829
};
2930

3031
struct IndexSpec {
32+
/// The types of indexes.
3133
enum Type {
3234
kValue, ///< Regular index of property value
33-
kFullText, ///< Full-text index, for MATCH queries
35+
kFullText, ///< Full-text index, for MATCH queries. Uses IndexSpec::FTSOptions.
3436
kArray, ///< Index of array values, for UNNEST queries
3537
kPredictive, ///< Index of prediction results
38+
kVector, ///< Index of ML vector similarity. Uses IndexSpec::VectorOptions.
3639
};
3740

38-
struct Options {
39-
const char* language; ///< NULL or an ISO language code ("en", etc)
40-
bool ignoreDiacritics; ///< True to strip diacritical marks/accents from letters
41-
bool disableStemming; ///< Disables stemming
42-
const char* stopWords; ///< NULL for default, or comma-delimited string, or empty
41+
/// Options for a full-text index.
42+
struct FTSOptions {
43+
const char* language{}; ///< NULL or an ISO language code ("en", etc)
44+
bool ignoreDiacritics{}; ///< True to strip diacritical marks/accents from letters
45+
bool disableStemming{}; ///< Disables stemming
46+
const char* stopWords{}; ///< NULL for default, or comma-delimited string, or empty
4347
};
4448

49+
/// Options for a vector index.
50+
struct VectorOptions {
    /// Vector encoding (compression) schemes.
    /// Values are pinned explicitly: they must match C4VectorEncoding in c4IndexTypes.h,
    /// because the C enum value is converted to this type with a plain cast.
    enum Encoding {
        DefaultEncoding = 0,  ///< Use default encoding, which is currently SQ
        NoEncoding      = 1,  ///< No encoding; 4 bytes per dimension, no data loss
        SQEncoding      = 2,  ///< Scalar Quantizer; 1 byte per dimension (recommended)
    };

    unsigned numCentroids{2048};         ///< Number of centroids/buckets to divide the index into
    Encoding encoding{DefaultEncoding};  ///< Vector encoding/compression
};
60+
61+
/// Index options. If not empty (the first state), must match the index type.
62+
using Options = std::variant<std::monostate, FTSOptions, VectorOptions>;
63+
64+
/// Constructs an index spec.
65+
/// @param name_ Name of the index (must be unique in its collection.)
66+
/// @param type_ Type of the index.
67+
/// @param expression_ The value(s) to be indexed.
68+
/// @param queryLanguage Language used for `expression_`; either JSON or N1QL.
69+
/// @param options_ Options; if given, its type must match the index type.
4570
IndexSpec(std::string name_, Type type_, alloc_slice expression_,
46-
QueryLanguage queryLanguage = QueryLanguage::kJSON, const Options* opt = nullptr);
71+
QueryLanguage queryLanguage = QueryLanguage::kJSON, Options options_ = {});
4772

4873
IndexSpec(const IndexSpec&) = delete;
4974
IndexSpec(IndexSpec&&);
@@ -53,23 +78,25 @@ namespace litecore {
5378
void validateName() const;
5479

5580
const char* typeName() const {
56-
static const char* kTypeName[] = {"value", "full-text", "array", "predictive"};
81+
static const char* kTypeName[] = {"value", "full-text", "array", "predictive", "vector"};
5782
return kTypeName[type];
5883
}
5984

60-
const Options* optionsPtr() const { return options ? &*options : nullptr; }
85+
/// Returns a pointer to the full-text options, or nullptr when `options` does not
/// currently hold an FTSOptions value.
const FTSOptions* ftsOptions() const {
    return std::get_if<FTSOptions>(&options);
}
86+
87+
/// Returns a pointer to the vector-index options, or nullptr when `options` does not
/// currently hold a VectorOptions value.
const VectorOptions* vectorOptions() const {
    return std::get_if<VectorOptions>(&options);
}
6188

6289
/** The required WHAT clause: the list of expressions to index */
6390
const fleece::impl::Array* NONNULL what() const;
6491

6592
/** The optional WHERE clause: the condition for a partial index */
6693
const fleece::impl::Array* where() const;
6794

68-
std::string const name;
69-
Type const type;
70-
alloc_slice const expression;
71-
QueryLanguage queryLanguage;
72-
std::optional<Options> const options;
95+
std::string const name; ///< Name of index
96+
Type const type; ///< Type of index
97+
alloc_slice const expression; ///< The query expression
98+
QueryLanguage queryLanguage; ///< Is expression JSON or N1QL?
99+
Options const options; ///< Options for FTS and vector indexes
73100

74101
private:
75102
fleece::impl::Doc* doc() const;

LiteCore/Query/PredictiveModel.hh

+11-1
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,24 @@ namespace fleece::impl {
2424

2525
namespace litecore {
2626

27+
/** Abstract superclass of predictive models. A model consists of a `prediction` function.
28+
Implemented by C4PredictiveModelInternal, which bridges to the public C4PredictiveModel. */
2729
class PredictiveModel : public fleece::RefCounted {
2830
public:
31+
/// Given a document body, matches it against the model and returns an (encoded) Dict
32+
/// containing predictive info like ratings, rankings, etc.
33+
/// This must be a pure function that, given the same input, always produces the
34+
/// same output; otherwise predictive indexes wouldn't work.
2935
virtual fleece::alloc_slice prediction(const fleece::impl::Dict* NONNULL, DataFile::Delegate* NONNULL,
3036
C4Error* NONNULL) noexcept = 0;
3137

32-
void registerAs(const std::string& name);
38+
/// Registers a model instance globally, with a unique name.
39+
void registerAs(const std::string& name);
40+
41+
/// Unregisters the model instance with the given name.
3342
static bool unregister(const std::string& name);
3443

44+
/// Returns the instance registered under the given name.
3545
static fleece::Retained<PredictiveModel> named(const std::string&);
3646
};
3747

LiteCore/Query/QueryParser+Private.hh

+6
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ namespace litecore::qp {
3838
constexpr slice kNestedValueFnName = "fl_nested_value"_sl;
3939
constexpr slice kUnnestedValueFnName = "fl_unnested_value"_sl;
4040
constexpr slice kFTSValueFnName = "fl_fts_value"_sl;
41+
constexpr slice kVectorValueFnName = "fl_vector_value"_sl;
42+
constexpr slice kEncodeVectorFnName = "encode_vector"_sl;
4143
constexpr slice kBlobFnName = "fl_blob"_sl;
4244
constexpr slice kRootFnName = "fl_root"_sl;
4345
constexpr slice kEachFnName = "fl_each"_sl;
@@ -60,6 +62,10 @@ namespace litecore::qp {
6062
constexpr slice kPredictionFnName = "prediction"_sl;
6163
constexpr slice kPredictionFnNameWithParens = "prediction()"_sl;
6264

65+
constexpr slice kVectorMatchFnName = "vector_match"_sl;
66+
constexpr slice kVectorMatchFnNameWithParens = "vector_match()"_sl;
67+
constexpr slice kVectorDistanceFnName = "vector_distance"_sl;
68+
6369
const char* const kDefaultTableAlias = "_doc";
6470

6571

0 commit comments

Comments
 (0)