Skip to content

Commit

Permalink
Vector search feature (#1928)
Browse files Browse the repository at this point in the history
* Implemented vector search / vector index [API]

* Now supports cosine distance metric [API]

Added type C4VectorMetric and field C4VectorIndexOptions::metric.
Default value should be kC4VectorMetricEuclidean.

* Add faiss to blackduck and correct SQLiteCpp commit

* Updated mobile-vector-search submodule

* Support vector SQ encoding bits and min/max training size [API]

---------

Co-authored-by: Jim Borden <[email protected]>
Co-authored-by: Jianmin Zhao <[email protected]>
  • Loading branch information
3 people authored Jan 16, 2024
1 parent d45b8b2 commit 1977bc3
Show file tree
Hide file tree
Showing 56 changed files with 11,135 additions and 173 deletions.
4 changes: 4 additions & 0 deletions C/Cpp_include/c4Database.hh
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@ struct C4Database

using Config = C4DatabaseConfig2;

/** Registers the directory from which extension libraries (such as Vector Search) are loaded.
    Must be called before opening a database that will use an extension.
    @param path  Path of the directory that contains the extension libraries. */
static void setExtensionPath(slice path);

static bool exists(slice name, slice inDirectory);
static void copyNamed(slice sourcePath, slice destinationName, const Config&);
static bool deleteNamed(slice name, slice inDirectory);
Expand Down
2 changes: 2 additions & 0 deletions C/c4.exp
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,8 @@ _c4error_return
_c4_dumpInstances
_gC4ExpectExceptions

_c4_setExtensionPath

_FLDoc_FromJSON
_FLDoc_Retain
_FLDoc_GetAllocedData
Expand Down
2 changes: 2 additions & 0 deletions C/c4CAPI.cc
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,8 @@ C4SliceResult c4coll_getIndexesInfo(C4Collection* coll, C4Error* C4NULLABLE outE

#pragma mark - DATABASE:

/** C API entry point: hands `path` straight to C4Database::setExtensionPath. */
void c4_setExtensionPath(C4String path) noexcept {
    C4Database::setExtensionPath(path);
}

/** C API entry point: forwards both arguments to C4Database::exists and returns its result. */
bool c4db_exists(C4String name, C4String inDirectory) noexcept {
    const bool found = C4Database::exists(name, inDirectory);
    return found;
}

bool c4key_setPassword(C4EncryptionKey* outKey, C4String password, C4EncryptionAlgorithm alg) noexcept {
Expand Down
2 changes: 2 additions & 0 deletions C/c4Database.cc
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ C4EncryptionKey C4EncryptionKeyFromPasswordSHA1(slice password, C4EncryptionAlgo
return key;
}

/** Converts `path` to a string and passes it on to SQLiteDataFile::setExtensionPath. */
void C4Database::setExtensionPath(slice path) {
    SQLiteDataFile::setExtensionPath(string(path));
}

#pragma mark - STATIC LIFECYCLE METHODS:

static FilePath dbPath(slice name, slice parentDir) {
Expand Down
2 changes: 2 additions & 0 deletions C/c4_ee.exp
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,8 @@ _c4error_return
_c4_dumpInstances
_gC4ExpectExceptions

_c4_setExtensionPath

_FLDoc_FromJSON
_FLDoc_Retain
_FLDoc_GetAllocedData
Expand Down
4 changes: 4 additions & 0 deletions C/include/c4Database.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ CBL_CORE_API bool c4key_setPassword(C4EncryptionKey* encryptionKey, C4String pas
CBL_CORE_API bool c4key_setPasswordSHA1(C4EncryptionKey* encryptionKey, C4String password,
C4EncryptionAlgorithm alg) C4API;

/** Registers the directory from which extension libraries (such as Vector Search) are loaded.
    Must be called before opening a database that will use an extension.
    @param path  Path of the directory that contains the extension libraries. */
CBL_CORE_API void c4_setExtensionPath(C4String path) C4API;

/** @} */

//////// DATABASE API:
Expand Down
2 changes: 1 addition & 1 deletion C/include/c4DatabaseTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ typedef C4_OPTIONS(uint32_t, C4DatabaseFlags){
kC4DB_Create = 0x01, ///< Create the file if it doesn't exist
kC4DB_ReadOnly = 0x02, ///< Open file read-only
kC4DB_AutoCompact = 0x04, ///< Enable auto-compaction [UNIMPLEMENTED]
kC4DB_VersionVectors = 0x08, ///< Upgrade DB to version vectors instead of rev trees [EXPERIMENTAL]
kC4DB_VersionVectors = 0x08, ///< Upgrade DB to version vectors instead of rev trees
kC4DB_NoUpgrade = 0x20, ///< Disable upgrading an older-version database
kC4DB_NonObservable = 0x40, ///< Disable database/collection observers, for slightly faster writes
kC4DB_FakeVectorClock = 0x80, ///< Use counters instead of timestamps in version vectors (TESTS ONLY)
Expand Down
31 changes: 30 additions & 1 deletion C/include/c4IndexTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,36 @@ typedef C4_ENUM(uint32_t, C4IndexType){
kC4FullTextIndex, ///< Full-text index
kC4ArrayIndex, ///< Index of array values, for use with UNNEST
kC4PredictiveIndex, ///< Index of prediction() results (Enterprise Edition only)
kC4VectorIndex, ///< Index of ML vector similarity (Enterprise Edition only)
};

/** Distance metric a vector index uses to compare vectors. */
typedef C4_ENUM(uint32_t, C4VectorMetric){
    kC4VectorMetricDefault   = 0,  ///< Unspecified: the default metric (Euclidean) is used
    kC4VectorMetricEuclidean = 1,  ///< Squared Euclidean distance
    kC4VectorMetricCosine    = 2,  ///< Cosine distance, i.e. 1.0 - cosine similarity
};  // Numeric values must stay identical to IndexSpec::VectorOptions::Metric

/** Encoding (compression) applied to the vectors stored in a vector index. */
typedef C4_ENUM(uint32_t, C4VectorEncoding){
    kC4VectorEncodingDefault = 0,  ///< Unspecified: the default encoding (currently SQ8) is used
    kC4VectorEncodingNone    = 1,  ///< Uncompressed: 32 bits per dimension, no data loss
    kC4VectorEncodingSQ8     = 2,  ///< Scalar Quantizer, 8 bits per dimension (the default)
    kC4VectorEncodingSQ6     = 3,  ///< Scalar Quantizer, 6 bits per dimension
    kC4VectorEncodingSQ4     = 4,  ///< Scalar Quantizer, 4 bits per dimension
};  // Numeric values must stay identical to IndexSpec::VectorOptions::Encoding

/** Options for vector indexes.
    When a vector index is created, each field is copied as-is into the internal
    IndexSpec::VectorOptions. The limits quoted below are the ones checked by
    IndexSpec::VectorOptions::validate().
    NOTE(review): because the values are copied verbatim, a zero-initialized struct yields
    numCentroids == 0, which validate() rejects -- confirm whether callers are expected to
    fill in every field explicitly. */
typedef struct C4VectorIndexOptions {
unsigned numCentroids; ///< Number of buckets (centroids) to partition the vectors between; 1...65535
C4VectorMetric metric; ///< Distance metric used to compare vectors
C4VectorEncoding encoding; ///< Vector compression type
unsigned minTrainingSize; ///< Minimum # of vectors to train the index on (>= 25*numCentroids)
unsigned maxTrainingSize; ///< Maximum # of vectors to train the index on (>= minTrainingSize, <= 256*numCentroids)
} C4VectorIndexOptions;

/** Options for indexes; these each apply to specific types of indexes. */
typedef struct {
typedef struct C4IndexOptions {
/** Dominant language of text to be indexed; setting this enables word stemming, i.e.
matching different cases of the same word ("big" and "bigger", for instance.)
Can be an ISO-639 language code or a lowercase (English) language name; supported
Expand Down Expand Up @@ -61,6 +87,9 @@ typedef struct {
To provide a custom list of words, use a string containing the words in lowercase
separated by spaces. */
const char* C4NULLABLE stopWords;

/** Options for vector indexes. */
C4VectorIndexOptions vector;
} C4IndexOptions;

/** @} */
Expand Down
2 changes: 2 additions & 0 deletions C/scripts/c4.txt
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,8 @@ c4error_return
c4_dumpInstances
gC4ExpectExceptions

c4_setExtensionPath

FLDoc_FromJSON
FLDoc_Retain
FLDoc_GetAllocedData
Expand Down
20 changes: 1 addition & 19 deletions C/tests/c4Test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -578,25 +578,7 @@ fleece::alloc_slice C4Test::readFile(const std::string& filepath) {
}

/** Reads the file at `path` line by line by delegating to the shared ReadFileByLines() helper.
    `path`, `callback` and `maxLines` are forwarded unchanged and the helper's result is returned.
    NOTE(review): the helper is presumably expected to keep the contract of the inline loop it
    replaces (callback invoked once per line, a false return from the callback aborts with a
    false result, maxLines == 0 meaning "no limit") -- confirm against ReadFileByLines. */
bool C4Test::readFileByLines(const string& path, function_ref<bool(FLSlice)> callback, size_t maxLines) {
    return ReadFileByLines(path, callback, maxLines);
}

unsigned C4Test::importJSONFile(const string& path, const string& idPrefix, double timeout, bool verbose) const {
Expand Down
1 change: 0 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,6 @@ target_compile_definitions(
-DHAVE_LOCALTIME_R # Use localtime_r instead of localtime
-DHAVE_USLEEP # Allow millisecond precision sleep
-DHAVE_UTIME # Use utime() instead of utimes()
-DSQLITE_OMIT_LOAD_EXTENSION # Disable extensions (not needed for LiteCore)
-DSQLITE_ENABLE_FTS4 # Build FTS versions 3 and 4
-DSQLITE_ENABLE_FTS3_PARENTHESIS # Allow AND and NOT support in FTS parser
-DSQLITE_ENABLE_FTS3_TOKENIZER # Allow LiteCore to define a tokenizer
Expand Down
31 changes: 27 additions & 4 deletions LiteCore/Database/CollectionImpl.hh
Original file line number Diff line number Diff line change
Expand Up @@ -389,13 +389,36 @@ namespace litecore {

#pragma mark - INDEXES:


static_assert(sizeof(C4IndexOptions) == sizeof(IndexSpec::Options));

/// Creates an index on this collection's key store.
/// The public `C4IndexOptions` (if any) are translated into the internal `IndexSpec::Options`
/// variant: full-text indexes get an `FTSOptions`, vector indexes a `VectorOptions`. For every
/// other index type, or when `indexOptions` is null, the variant is left empty.
/// @param indexName      Name of the index.
/// @param indexSpec      The expression(s) to index, written in `indexLanguage`.
/// @param indexLanguage  Query language of `indexSpec`.
/// @param indexType      Kind of index to create.
/// @param indexOptions   Optional type-specific options; may be null.
void createIndex(slice indexName, slice indexSpec, C4QueryLanguage indexLanguage, C4IndexType indexType,
                 const C4IndexOptions* indexOptions = nullptr) override {
    IndexSpec::Options options;  // default-constructed variant == std::monostate, i.e. "no options"
    if ( indexOptions ) {
        // Every case ends in its own `break`, so no case can fall through into the next one.
        switch ( indexType ) {
            case kC4FullTextIndex:
                {
                    IndexSpec::FTSOptions ftsOpt;
                    ftsOpt.language         = indexOptions->language;
                    ftsOpt.ignoreDiacritics = indexOptions->ignoreDiacritics;
                    ftsOpt.disableStemming  = indexOptions->disableStemming;
                    ftsOpt.stopWords        = indexOptions->stopWords;
                    options                 = ftsOpt;
                    break;
                }
            case kC4VectorIndex:
                {
                    // The C4 enums share their numeric values with the IndexSpec enums,
                    // so a plain cast converts between them.
                    IndexSpec::VectorOptions vecOpt;
                    vecOpt.metric = static_cast<IndexSpec::VectorOptions::Metric>(indexOptions->vector.metric);
                    vecOpt.numCentroids = indexOptions->vector.numCentroids;
                    vecOpt.encoding = static_cast<IndexSpec::VectorOptions::Encoding>(indexOptions->vector.encoding);
                    vecOpt.minTrainingSize = indexOptions->vector.minTrainingSize;
                    vecOpt.maxTrainingSize = indexOptions->vector.maxTrainingSize;
                    options                = vecOpt;
                    break;
                }
            default:
                break;
        }
    }
    keyStore().createIndex(indexName, indexSpec, static_cast<QueryLanguage>(indexLanguage),
                           static_cast<IndexSpec::Type>(indexType), options);
}

void deleteIndex(slice indexName) override { keyStore().deleteIndex(indexName); }
Expand Down
23 changes: 21 additions & 2 deletions LiteCore/Query/IndexSpec.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,32 @@ namespace litecore {
using namespace fleece;
using namespace fleece::impl;

void IndexSpec::VectorOptions::validate() {
const char* err = nullptr;
if ( numCentroids < 1 ) err = "numCentroids is too small";
else if ( numCentroids > 65535 )
err = "numCentroids is too large";
else if ( minTrainingSize < 25 * numCentroids )
err = "minTrainingSize is too small";
else if ( maxTrainingSize < minTrainingSize )
err = "maxTrainingSize is too small";
else if ( maxTrainingSize > 256 * numCentroids )
err = "maxTrainingSize is too large";
if ( err ) error::_throw(error::InvalidParameter, "Invalid VectorOptions: %s", err);
}

/// Constructs an index spec, taking ownership of the name, expression and options.
/// Non-empty options must match the index type: a full-text index requires FTSOptions and a
/// vector index requires VectorOptions; otherwise error::InvalidParameter is thrown.
/// Empty options (std::monostate) are accepted for every type, and index types other than
/// full-text and vector are not checked.
IndexSpec::IndexSpec(std::string name_, Type type_, alloc_slice expression_, QueryLanguage queryLanguage_,
                     Options opt)
    : name(std::move(name_))
    , type(type_)
    , expression(std::move(expression_))
    , queryLanguage(queryLanguage_)
    , options(std::move(opt)) {
    if ( std::holds_alternative<std::monostate>(options) ) return;

    // Test the held alternative by type rather than by position in the variant, so the check
    // stays correct if the order of alternatives in `Options` ever changes.
    const bool mismatch = (type == kFullText && !std::holds_alternative<FTSOptions>(options))
                          || (type == kVector && !std::holds_alternative<VectorOptions>(options));
    if ( mismatch ) error::_throw(error::LiteCoreError::InvalidParameter, "Invalid options type for index");
}

IndexSpec::IndexSpec(IndexSpec&&) = default;
IndexSpec::~IndexSpec() = default;
Expand Down
73 changes: 59 additions & 14 deletions LiteCore/Query/IndexSpec.hh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "Base.hh"
#include <optional>
#include <string>
#include <variant>

namespace fleece::impl {
class Array;
Expand All @@ -28,22 +29,64 @@ namespace litecore {
};

struct IndexSpec {
/// The types of indexes.
/// Each enumerator is declared exactly once. The order matters: the collection code casts
/// a C4IndexType directly to this type, so the two enums must stay in step.
enum Type {
    kValue,       ///< Regular index of property value
    kFullText,    ///< Full-text index, for MATCH queries. Uses IndexSpec::FTSOptions.
    kArray,       ///< Index of array values, for UNNEST queries
    kPredictive,  ///< Index of prediction results
    kVector,      ///< Index of ML vector similarity. Uses IndexSpec::VectorOptions.
};

/// Options for a full-text index. Every field is value-initialized, so a default-constructed
/// FTSOptions has null strings and both flags false.
struct FTSOptions {
    const char* language{};          ///< NULL or an ISO language code ("en", etc)
    bool        ignoreDiacritics{};  ///< True to strip diacritical marks/accents from letters
    bool        disableStemming{};   ///< Disables stemming
    /// NULL for default, or comma-delimited string, or empty.
    /// NOTE(review): the public C4IndexOptions documentation describes this list as
    /// space-separated -- confirm which delimiter is actually expected.
    const char* stopWords{};
};

/// Options for a vector index.
struct VectorOptions {
    /// How the distance between two vectors is measured.
    enum Metric {
        DefaultMetric = 0,  ///< Unspecified: the default metric (Euclidean) is used
        Euclidean     = 1,  ///< Squared Euclidean distance
        Cosine        = 2,  ///< Cosine distance, i.e. 1.0 - cosine similarity
    };  // Keep numeric values in sync with C4VectorMetric in c4IndexTypes.h

    /// How stored vectors are encoded (compressed).
    enum Encoding {
        DefaultEncoding = 0,  ///< Unspecified: the default encoding (currently SQ8Bit) is used
        NoEncoding      = 1,  ///< Uncompressed: 4 bytes per dimension, no data loss
        SQ8BitEncoding  = 2,  ///< Scalar Quantizer, 8 bits per dimension (recommended)
        SQ6BitEncoding  = 3,  ///< Scalar Quantizer, 6 bits per dimension
        SQ4BitEncoding  = 4,  ///< Scalar Quantizer, 4 bits per dimension
    };  // Keep numeric values in sync with C4VectorEncoding in c4IndexTypes.h

    unsigned numCentroids;               ///< Number of centroids/buckets the index is divided into
    Metric   metric{DefaultMetric};      ///< Distance metric
    Encoding encoding{DefaultEncoding};  ///< Vector encoding/compression
    unsigned minTrainingSize;            ///< Smallest number of vectors the index is trained on
    unsigned maxTrainingSize;            ///< Largest number of vectors the index is trained on

    /// Derives the training-size bounds from the centroid count:
    /// 25 vectors per centroid at minimum, 256 per centroid at maximum.
    VectorOptions(unsigned numCentroids_ = 2048)
        : numCentroids{numCentroids_}
        , minTrainingSize{25 * numCentroids_}
        , maxTrainingSize{256 * numCentroids_} {}

    void validate();
};

/// Index options. If not empty (the first state), must match the index type.
using Options = std::variant<std::monostate, FTSOptions, VectorOptions>;

/// Constructs an index spec.
/// @param name_ Name of the index (must be unique in its collection.)
/// @param type_ Type of the index.
/// @param expression_ The value(s) to be indexed.
/// @param queryLanguage Language used for `expression_`; either JSON or N1QL.
/// @param options_ Options; if given, its type must match the index type.
IndexSpec(std::string name_, Type type_, alloc_slice expression_,
QueryLanguage queryLanguage = QueryLanguage::kJSON, const Options* opt = nullptr);
QueryLanguage queryLanguage = QueryLanguage::kJSON, Options options_ = {});

IndexSpec(const IndexSpec&) = delete;
IndexSpec(IndexSpec&&);
Expand All @@ -53,23 +96,25 @@ namespace litecore {
void validateName() const;

const char* typeName() const {
static const char* kTypeName[] = {"value", "full-text", "array", "predictive"};
static const char* kTypeName[] = {"value", "full-text", "array", "predictive", "vector"};
return kTypeName[type];
}

const Options* optionsPtr() const { return options ? &*options : nullptr; }
/// Returns the full-text options, or nullptr when `options` does not hold an FTSOptions.
const FTSOptions* ftsOptions() const {
    return std::get_if<FTSOptions>(&options);
}

/// Returns the vector-index options, or nullptr when `options` does not hold a VectorOptions.
const VectorOptions* vectorOptions() const {
    return std::get_if<VectorOptions>(&options);
}

/** The required WHAT clause: the list of expressions to index */
const fleece::impl::Array* NONNULL what() const;

/** The optional WHERE clause: the condition for a partial index */
const fleece::impl::Array* where() const;

std::string const name;
Type const type;
alloc_slice const expression;
QueryLanguage queryLanguage;
std::optional<Options> const options;
std::string const name; ///< Name of index
Type const type; ///< Type of index
alloc_slice const expression; ///< The query expression
QueryLanguage queryLanguage; ///< Is expression JSON or N1QL?
Options const options; ///< Options for FTS and vector indexes

private:
fleece::impl::Doc* doc() const;
Expand Down
12 changes: 11 additions & 1 deletion LiteCore/Query/PredictiveModel.hh
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,24 @@ namespace fleece::impl {

namespace litecore {

/** Abstract superclass of predictive models. A model consists of a `prediction` function.
    Implemented by C4PredictiveModelInternal, which bridges to the public C4PredictiveModel. */
class PredictiveModel : public fleece::RefCounted {
  public:
    /// Given a document body, matches it against the model and returns an (encoded) Dict
    /// containing predictive info like ratings, rankings, etc.
    /// This must be a pure function that, given the same input, always produces the
    /// same output; otherwise predictive indexes wouldn't work.
    virtual fleece::alloc_slice prediction(const fleece::impl::Dict* NONNULL, DataFile::Delegate* NONNULL,
                                           C4Error* NONNULL) noexcept = 0;

    /// Registers this model instance globally, with a unique name.
    void registerAs(const std::string& name);

    /// Unregisters the model instance with the given name.
    /// NOTE(review): the result presumably reports whether a model was registered under
    /// `name` -- confirm against the implementation.
    static bool unregister(const std::string& name);

    /// Returns the instance registered under the given name.
    /// NOTE(review): presumably returns a null reference when no model has that name -- confirm.
    static fleece::Retained<PredictiveModel> named(const std::string&);
};

Expand Down
6 changes: 6 additions & 0 deletions LiteCore/Query/QueryParser+Private.hh
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ namespace litecore::qp {
constexpr slice kNestedValueFnName = "fl_nested_value"_sl;
constexpr slice kUnnestedValueFnName = "fl_unnested_value"_sl;
constexpr slice kFTSValueFnName = "fl_fts_value"_sl;
constexpr slice kVectorValueFnName = "fl_vector_value"_sl;
constexpr slice kEncodeVectorFnName = "encode_vector"_sl;
constexpr slice kBlobFnName = "fl_blob"_sl;
constexpr slice kRootFnName = "fl_root"_sl;
constexpr slice kEachFnName = "fl_each"_sl;
Expand All @@ -60,6 +62,10 @@ namespace litecore::qp {
constexpr slice kPredictionFnName = "prediction"_sl;
constexpr slice kPredictionFnNameWithParens = "prediction()"_sl;

constexpr slice kVectorMatchFnName = "vector_match"_sl;
constexpr slice kVectorMatchFnNameWithParens = "vector_match()"_sl;
constexpr slice kVectorDistanceFnName = "vector_distance"_sl;

const char* const kDefaultTableAlias = "_doc";


Expand Down
Loading

0 comments on commit 1977bc3

Please sign in to comment.