remove fastembed (#141)
* remove fastembed

* update fastembed

* update python engine docs

* poetry lock
michaelfeil authored Mar 16, 2024
1 parent 3120f97 · commit 1e4f705
Showing 13 changed files with 79 additions and 202 deletions.
README.md (2 changes: 1 addition & 1 deletion)
@@ -23,7 +23,7 @@ Infinity is a high-throughput, low-latency REST API for serving vector embedding
 ## Why Infinity:
 Infinity provides the following features:
 * **Deploy any model from MTEB**: deploy the model you know from [SentenceTransformers](https://github.com/UKPLab/sentence-transformers/)
-* **Fast inference backends**: The inference server is built on top of [torch](https://github.com/pytorch/pytorch), [optimum(onnx/tensorrt)](https://github.com/qdrant/fastembed) and [CTranslate2](https://github.com/OpenNMT/CTranslate2), using FlashAttention to get the most out of **CUDA**, **ROCM**, **CPU** or **MPS** chips.
+* **Fast inference backends**: The inference server is built on top of [torch](https://github.com/pytorch/pytorch), [optimum(onnx/tensorrt)](https://huggingface.co/docs/optimum/index) and [CTranslate2](https://github.com/OpenNMT/CTranslate2), using FlashAttention to get the most out of **CUDA**, **ROCM**, **CPU** or **MPS** device.
 * **Dynamic batching**: New embedding requests are queued while GPU is busy with the previous ones. New requests are squeezed into your device as soon as ready. Similar max throughput on GPU as text-embeddings-inference.
 * **Correct and tested implementation**: Unit and end-to-end tested. Embeddings via infinity are identical to [SentenceTransformers](https://github.com/UKPLab/sentence-transformers/) (up to numerical precision). Lets API users create embeddings till infinity and beyond.
 * **Easy to use**: The API is built on top of [FastAPI](https://fastapi.tiangolo.com/), [Swagger](https://swagger.io/) makes it fully documented. APIs are aligned to [OpenAI's Embedding specs](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings). See below on how to get started.
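The "Easy to use" bullet above refers to Infinity's OpenAI-aligned embedding route. As a rough illustration, a client call might look like the sketch below; the port, route, and response shape are assumptions based on the OpenAI embedding spec and are not part of this diff:

```python
import requests  # assumes a locally running Infinity server

# Port 7997 and the /embeddings route are assumptions; the model name is
# taken from the docs example later in this commit.
resp = requests.post(
    "http://localhost:7997/embeddings",
    json={
        "model": "BAAI/bge-small-en-v1.5",
        "input": ["Embed this sentence via Infinity."],
    },
)
resp.raise_for_status()
# OpenAI-style response body: {"data": [{"embedding": [...], ...}], ...}
embedding = resp.json()["data"][0]["embedding"]
print(len(embedding))  # dimensionality of the returned vector
```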
docs/docs/index.md (2 changes: 1 addition & 1 deletion)
@@ -5,7 +5,7 @@ Infinity is a high-throughput, low-latency REST API for serving vector embedding
 Infinity provides the following features:
 
 * **Deploy any model from MTEB**: deploy the model you know from [SentenceTransformers](https://github.com/UKPLab/sentence-transformers/)
-* **Fast inference backends**: The inference server is built on top of [torch](https://github.com/pytorch/pytorch), [optimum(onnx/tensorrt)](https://github.com/qdrant/fastembed) and [CTranslate2](https://github.com/OpenNMT/CTranslate2), using FlashAttention to get the most out of **CUDA**, **ROCM**, **CPU** or **MPS** chips.
+* **Fast inference backends**: The inference server is built on top of [torch](https://github.com/pytorch/pytorch), [optimum(onnx/tensorrt)](https://huggingface.co/docs/optimum/index) and [CTranslate2](https://github.com/OpenNMT/CTranslate2), using FlashAttention to get the most out of **CUDA**, **ROCM**, **CPU** or **MPS** device.
 * **Dynamic batching**: New embedding requests are queued while GPU is busy with the previous ones. New requests are squeezed into your device as soon as ready. Similar max throughput on GPU as text-embeddings-inference.
 * **Correct and tested implementation**: Unit and end-to-end tested. Embeddings via infinity are identical to [SentenceTransformers](https://github.com/UKPLab/sentence-transformers/) (up to numerical precision). Lets API users create embeddings till infinity and beyond.
 * **Easy to use**: The API is built on top of [FastAPI](https://fastapi.tiangolo.com/), [Swagger](https://swagger.io/) makes it fully documented. APIs are aligned to [OpenAI's Embedding specs](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings). See below on how to get started.
docs/docs/python_engine.md (26 changes: 18 additions & 8 deletions)
@@ -1,30 +1,40 @@
-Enhancing the document involves improving clarity, structure, and adding helpful context where necessary. Here's an enhanced version:
-
 # Python Engine Integration
 
 ## Launching Embedding generation with Python
 
-Use asynchronous programming in Python using `asyncio` for flexible and efficient embedding processing with Infinity. This advanced method allows for concurrent execution, making it ideal for high-throughput embedding generation.
+Use asynchronous programming in Python using `asyncio` for flexible and efficient embedding processing with Infinity. This advanced method allows for concurrent execution of different requests, making it ideal for high-throughput embedding generation.
 
 ```python
 import asyncio
 from infinity_emb import AsyncEmbeddingEngine, EngineArgs
+from infinity_emb.log_handler import logger
+logger.setLevel(5) # Debug
 
+# Define sentences for embedding
 sentences = ["Embed this sentence via Infinity.", "Paris is in France."]
+# Initialize the embedding engine with model specifications
 engine = AsyncEmbeddingEngine.from_args(
-    EngineArgs(model_name_or_path="BAAI/bge-small-en-v1.5", engine="torch",
-    lengths_via_tokenize=True
+    EngineArgs(
+        model_name_or_path="BAAI/bge-small-en-v1.5",
+        engine="torch",
+        lengths_via_tokenize=True
     )
 )
 
 async def main():
     async with engine: # Context manager initializes and terminates the engine
 
-        # usage is total token count according to tokenizer.
-        embeddings, usage = await engine.embed(sentences=sentences)
-    # Embeddings are now available for use
-asyncio.run(main())
+        job1 = asyncio.create_task(engine.embed(sentences=sentences))
+        # submit a second job in parallel
+        job2 = asyncio.create_task(engine.embed(sentences=["Hello world"]))
+        embeddings, usage = await job1
+        embeddings2, usage2 = await job2
+        # Embeddings are now available for use - they ran in the same batch.
+        print(f"for {sentences}, generated embeddings {len(embeddings)} with tot_tokens={usage}")
+asyncio.run(
+    main()
+)
 ```
 
 ## Reranker
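Pieced together from the added lines above, the example in `docs/docs/python_engine.md` should read roughly as follows after this commit (indentation is inferred, since the rendered diff drops it):

```python
import asyncio
from infinity_emb import AsyncEmbeddingEngine, EngineArgs
from infinity_emb.log_handler import logger
logger.setLevel(5) # Debug

# Define sentences for embedding
sentences = ["Embed this sentence via Infinity.", "Paris is in France."]
# Initialize the embedding engine with model specifications
engine = AsyncEmbeddingEngine.from_args(
    EngineArgs(
        model_name_or_path="BAAI/bge-small-en-v1.5",
        engine="torch",
        lengths_via_tokenize=True
    )
)

async def main():
    async with engine: # Context manager initializes and terminates the engine
        job1 = asyncio.create_task(engine.embed(sentences=sentences))
        # submit a second job in parallel
        job2 = asyncio.create_task(engine.embed(sentences=["Hello world"]))
        embeddings, usage = await job1
        embeddings2, usage2 = await job2
        # Embeddings are now available for use - they ran in the same batch.
        print(f"for {sentences}, generated embeddings {len(embeddings)} with tot_tokens={usage}")
asyncio.run(
    main()
)
```

Because both `embed` calls are submitted with `asyncio.create_task` before either is awaited, the dynamic batcher can schedule them in the same batch, which is the point of the rewrite.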
libs/infinity_emb/infinity_emb/inference/select_model.py (2 changes: 1 addition & 1 deletion)
@@ -18,7 +18,7 @@
 def get_engine_type_from_config(
     engine_args: EngineArgs,
 ) -> Union[EmbedderEngine, RerankEngine]:
-    if engine_args.engine in [InferenceEngine.debugengine, InferenceEngine.fastembed]:
+    if engine_args.engine in [InferenceEngine.debugengine]:
         return EmbedderEngine.from_inference_engine(engine_args.engine)
 
     if Path(engine_args.model_name_or_path).is_dir():
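With `InferenceEngine.fastembed` gone, only the debug engine takes the early-return path here. A minimal sketch of the call; whether `EngineArgs` accepts a plain string for `engine` is an assumption based on the docs example above:

```python
from infinity_emb import EngineArgs
from infinity_emb.inference.select_model import get_engine_type_from_config

# "dummytransformer" is the enum value of InferenceEngine.debugengine
# (see the primitives.py diff below).
args = EngineArgs(model_name_or_path="any-model", engine="dummytransformer")
print(get_engine_type_from_config(args))  # EmbedderEngine.debugengine
```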
libs/infinity_emb/infinity_emb/primitives.py (1 change: 0 additions & 1 deletion)
@@ -23,7 +23,6 @@ class InferenceEngine(enum.Enum):
     torch = "torch"
     ctranslate2 = "ctranslate2"
     optimum = "optimum"
-    fastembed = "fastembed"
     debugengine = "dummytransformer"
 
 
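Since `InferenceEngine` is a plain `enum.Enum`, dropping the member means lookups by value now fail loudly instead of routing to a removed backend. A quick sketch:

```python
from infinity_emb.primitives import InferenceEngine

InferenceEngine("torch")      # still resolves to InferenceEngine.torch
InferenceEngine("fastembed")  # raises ValueError after this commit
```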
libs/infinity_emb/infinity_emb/transformer/embedder/fastembed.py (76 changes: 0 additions & 76 deletions)

This file was deleted.

libs/infinity_emb/infinity_emb/transformer/utils.py (5 changes: 0 additions & 5 deletions)
@@ -11,13 +11,11 @@
 )
 from infinity_emb.transformer.embedder.ct2 import CT2SentenceTransformer
 from infinity_emb.transformer.embedder.dummytransformer import DummyTransformer
-from infinity_emb.transformer.embedder.fastembed import Fastembed
 from infinity_emb.transformer.embedder.optimum import OptimumEmbedder
 from infinity_emb.transformer.embedder.sentence_transformer import (
     SentenceTransformerPatched,
 )
 
-# from infinity_emb.transformer.fastembed import FastEmbed
 __all__ = [
     "length_tokenizer",
     "get_lengths_with_tokenize",
@@ -28,7 +26,6 @@
 class EmbedderEngine(Enum):
     torch = SentenceTransformerPatched
     ctranslate2 = CT2SentenceTransformer
-    fastembed = Fastembed
     debugengine = DummyTransformer
     optimum = OptimumEmbedder
 
@@ -38,8 +35,6 @@ def from_inference_engine(engine: InferenceEngine):
             return EmbedderEngine.torch
         elif engine == InferenceEngine.ctranslate2:
             return EmbedderEngine.ctranslate2
-        elif engine == InferenceEngine.fastembed:
-            return EmbedderEngine.fastembed
         elif engine == InferenceEngine.debugengine:
             return EmbedderEngine.debugengine
         elif engine == InferenceEngine.optimum:
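The remaining `EmbedderEngine` members still resolve one-to-one from `InferenceEngine` through the dispatch above. A small sketch of the mapping, with module paths taken from the diff:

```python
from infinity_emb.primitives import InferenceEngine
from infinity_emb.transformer.utils import EmbedderEngine

# Each surviving engine maps to its embedder class; fastembed no longer does.
assert EmbedderEngine.from_inference_engine(InferenceEngine.torch) is EmbedderEngine.torch
assert EmbedderEngine.from_inference_engine(InferenceEngine.optimum) is EmbedderEngine.optimum
```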
libs/infinity_emb/poetry.lock (29 changes: 14 additions & 15 deletions)

Some generated files are not rendered by default.

libs/infinity_emb/pyproject.toml (5 changes: 2 additions & 3 deletions)
@@ -27,7 +27,6 @@ sentence-transformers = {version = "^2.4.0", optional=true}
 transformers = {version = ">4.8.0", optional=true}
 ctranslate2 = {version = "^4.0.0", optional=true}
 optimum = {version = ">=1.16.2", optional=true, extras=["onnxruntime"]}
-fastembed = {version = ">=0.2.1", optional=true}
 hf_transfer = {version=">=0.1.5", optional=true}
 # cache
 diskcache = {version = "*", optional=true}
@@ -47,6 +46,7 @@ anyio = "*"
 trio = "*"
 coverage = {extras = ["toml"], version = "^7.3.2"}
 mypy = "^1.5.1"
+fastembed = ">=0.2.1"
 
 [tool.poetry.group.codespell.dependencies]
 codespell = "^2.2.0"
@@ -68,12 +68,11 @@ mypy-protobuf = "^3.0.0"
 [tool.poetry.extras]
 ct2=["ctranslate2","sentence-transformers","torch","transformers"]
 optimum=["optimum"]
-fastembed=["fastembed"]
 torch=["sentence-transformers","torch","hf_transfer"]
 logging=["rich"]
 cache=["diskcache"]
 server=["fastapi", "pydantic", "orjson", "prometheus-fastapi-instrumentator", "uvicorn", "typer","rich"]
-all=["ctranslate2", "fastapi", "fastembed", "optimum", "orjson", "prometheus-fastapi-instrumentator", "pydantic", "rich", "sentence-transformers", "torch", "typer", "uvicorn","diskcache"]
+all=["ctranslate2", "fastapi", "optimum", "orjson", "prometheus-fastapi-instrumentator", "pydantic", "rich", "sentence-transformers", "torch", "typer", "uvicorn","diskcache"]
 # non-default gpu
 tensorrt=["tensorrt"]
 onnxruntime-gpu=["onnxruntime-gpu"]
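Net effect of the packaging change: `fastembed` is no longer reachable through any install extra (including `all`), but it stays pinned in a dev/test dependency group, as the second hunk above shows. A quick sanity check after installing the `all` extra might look like this; the PyPI package names are assumptions:

```python
import importlib.util

# After `pip install "infinity-emb[all]"`, fastembed should no longer be pulled in.
print(importlib.util.find_spec("fastembed"))  # expected: None in a fresh environment
```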