Merge pull request #465 from unum-cloud/main-dev

Default to `bf16`
unum-cloud · Aug 19, 2024 · 2e4bf82 · 2e4bf82
2 parents 9f800c1 + 25b90f1
commit 2e4bf82
Show file tree

Hide file tree

Showing 18 changed files with 195 additions and 50 deletions.
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -128,6 +128,7 @@
     "arange",
     "ashvardanian",
     "astype",
+    "autovec",
     "Availible",
     "bidict",
     "BLAS",

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -174,8 +174,8 @@ Following options are enabled:
 - The `-p no:warnings` option will suppress and allow warnings.
 
 ```sh
-pip install pytest pytest-repeat # for repeated fuzzy tests
-pytest # if you trust the default settings
+pip install pytest pytest-repeat            # for repeated fuzzy tests
+pytest                                      # if you trust the default settings
 pytest python/scripts/ -s -x -p no:warnings # to overwrite the default settings
 ```
 

diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 <h1 align="center">USearch</h1>
 <h3 align="center">
 Smaller & <a href="https://www.unum.cloud/blog/2023-11-07-scaling-vector-search-with-intel">Faster</a> Single-File<br/>
-Similarity Search Engine for <a href="https://github.com/ashvardanian/simsimd">Vectors</a> & 🔜 <a href="https://github.com/ashvardanian/stringzilla">Texts</a>
+Similarity Search & Clustering Engine for <a href="https://github.com/ashvardanian/simsimd">Vectors</a> & 🔜 <a href="https://github.com/ashvardanian/stringzilla">Texts</a>
 </h3>
 <br/>
 
@@ -71,13 +71,10 @@ Linux • MacOS • Windows • iOS • WebAssembly •
 
 __Technical Insights__ and related articles:
 
-- [Uses Horner's method for polynomial approximations, beating GCC 12 by 119x](https://ashvardanian.com/posts/gcc-12-vs-avx512fp16/).
 - [Uses Arm SVE and x86 AVX-512's masked loads to eliminate tail `for`-loops](https://ashvardanian.com/posts/simsimd-faster-scipy/#tails-of-the-past-the-significance-of-masked-loads).
-- [Uses AVX-512 FP16 for half-precision operations, that few compilers vectorize](https://ashvardanian.com/posts/simsimd-faster-scipy/#the-challenge-of-f16).
-- [Substitutes LibC's `sqrt` calls with bithacks using Jan Kadlec's constant](https://ashvardanian.com/posts/simsimd-faster-scipy/#bonus-section-bypassing-sqrt-and-libc-dependencies).
+- [Uses Horner's method for polynomial approximations, beating GCC 12 by 119x](https://ashvardanian.com/posts/gcc-12-vs-avx512fp16/).
 - [For every language implements a custom separate binding](https://ashvardanian.com/posts/porting-cpp-library-to-ten-languages/).
-- [For Python avoids slow PyBind11, and even `PyArg_ParseTuple` for speed](https://ashvardanian.com/posts/pybind11-cpython-tutorial/).
-- [For JavaScript uses typed arrays and NAPI for zero-copy calls](https://ashvardanian.com/posts/javascript-ai-vector-search/).
+
 
 ## Comparison with FAISS
 
@@ -119,31 +116,29 @@ USearch is compact and broadly compatible without sacrificing performance, prima
 Base functionality is identical to FAISS, and the interface must be familiar if you have ever investigated Approximate Nearest Neighbors search:
 
 ```py
-# pip install numpy usearch
+# pip install usearch
 
 import numpy as np
 from usearch.index import Index
 
-index = Index(ndim=3)
-
-vector = np.array([0.2, 0.6, 0.4])
-index.add(42, vector)
-
-matches = index.search(vector, 10)
+index = Index(ndim=3)               # Default settings for 3D vectors
+vector = np.array([0.2, 0.6, 0.4])  # Can be a matrix for batch operations
+index.add(42, vector)               # Add one or many vectors in parallel
+matches = index.search(vector, 10)  # Find 10 nearest neighbors
 
 assert matches[0].key == 42
 assert matches[0].distance <= 0.001
 assert np.allclose(index[42], vector, atol=0.1) # Ensure high tolerance in mixed-precision comparisons
 ```
 
 More settings are always available, and the API is designed to be as flexible as possible.
-The default storage/quantization level is hardware-dependant for efficiency, but `f16` is recommended for most modern CPUs.
+The default storage/quantization level is hardware-dependent for efficiency, but `bf16` is recommended for most modern CPUs.
 
 ```py
 index = Index(
     ndim=3, # Define the number of dimensions in input vectors
     metric='cos', # Choose 'l2sq', 'ip', 'haversine' or other metric, default = 'cos'
-    dtype='f16', # Store as 'f64', 'f32', 'f16', 'i8', 'b1'..., default = None
+    dtype='bf16', # Store as 'f64', 'f32', 'f16', 'i8', 'b1'..., default = None
     connectivity=16, # Optional: Limit number of neighbors per graph node
     expansion_add=128, # Optional: Control the recall of indexing
     expansion_search=64, # Optional: Control the quality of the search

diff --git a/golang/lib.go b/golang/lib.go
@@ -63,6 +63,7 @@ type Quantization uint8
 // Different quantization kinds supported by the USearch library.
 const (
 	F32 Quantization = iota
+	BF16
 	F16
 	F64
 	I8
@@ -72,6 +73,8 @@ const (
 // String returns the string representation of the Quantization.
 func (a Quantization) String() string {
 	switch a {
+	case BF16:
+		return "BF16"
 	case F16:
 		return "F16"
 	case F32:

diff --git a/include/usearch/index.hpp b/include/usearch/index.hpp
@@ -2870,16 +2870,16 @@ class index_gt {
         result.computed_distances = context.computed_distances;
         result.visited_members = context.iteration_cycles;
 
-        // If we are updating the entry node itself, it won't contain any neighbors,
-        // so we should traverse a level down to find the closest match.
-        if (updated_node_level == max_level_copy)
-            updated_node_level--;
-
         // Go down the level, tracking only the closest match;
         // It may even be equal to the `updated_slot`
-        compressed_slot_t closest_slot = search_for_one_( //
-            value, metric, prefetch,                      //
-            entry_slot_copy, max_level_copy, updated_node_level, context);
+        compressed_slot_t closest_slot =
+            // If the updated node sits on the top level (e.g. it is the entry node itself),
+            // there are no upper levels to descend through, so we start from the entry slot.
+            updated_node_level == max_level_copy //
+                ? entry_slot_copy
+                : search_for_one_(             //
+                      value, metric, prefetch, //
+                      entry_slot_copy, max_level_copy, updated_node_level, context);
 
         // From `updated_node_level` down - perform proper extensive search
         for (level_t level = (std::min)(updated_node_level, max_level_copy); level >= 0; --level) {
@@ -2943,7 +2943,7 @@ class index_gt {
             config.expansion = default_expansion_search();
 
         // Using references is cleaner, but would result in UBSan false positives
-        context_t* context_ptr = contexts_.data() + config.thread;
+        context_t* context_ptr = contexts_.data() ? contexts_.data() + config.thread : nullptr;
         top_candidates_t* top_ptr = context_ptr ? &context_ptr->top_candidates : nullptr;
         search_result_t result{*this, top_ptr};
         if (!nodes_count_.load(std::memory_order_relaxed))
@@ -3697,6 +3697,7 @@ class index_gt {
     inline neighbors_ref_t neighbors_base_(node_t node) const noexcept { return {node.neighbors_tape()}; }
 
     inline neighbors_ref_t neighbors_non_base_(node_t node, level_t level) const noexcept {
+        usearch_assert_m(level > 0 && level <= node.level(), "Linking to missing level");
         return {node.neighbors_tape() + pre_.neighbors_base_bytes + (level - 1) * pre_.neighbors_bytes};
     }
 

diff --git a/include/usearch/index_dense.hpp b/include/usearch/index_dense.hpp
@@ -2240,6 +2240,7 @@ class index_dense_gt {
         case scalar_kind_t::f64_k: return make_casts_<f64_t>();
         case scalar_kind_t::f32_k: return make_casts_<f32_t>();
         case scalar_kind_t::f16_k: return make_casts_<f16_t>();
+        case scalar_kind_t::bf16_k: return make_casts_<bf16_t>();
         case scalar_kind_t::i8_k: return make_casts_<i8_t>();
         case scalar_kind_t::b1x8_k: return make_casts_<b1x8_t>();
         default: return {};