From e5c149810ee56e6e30fe7da313bab326c53b4014 Mon Sep 17 00:00:00 2001
From: wirthual
Date: Tue, 24 Sep 2024 06:37:45 +0200
Subject: [PATCH 01/14] update readme

---
 README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 53b1be47..e07f80e5 100644
--- a/README.md
+++ b/README.md
@@ -213,9 +213,8 @@ sentences = ["This is awesome.", "I am bored."]
 url = "https://bigsoundbank.com/UPLOAD/wav/2380.wav"
 raw_bytes = requests.get(url, stream=True).content
 
-data, samplerate = sf.read(io.BytesIO(raw_bytes))
-audios = [data]
+audios = [raw_bytes]
 
 engine_args = EngineArgs(
     model_name_or_path = "laion/clap-htsat-unfused",
     dtype="float32",
@@ -233,6 +232,8 @@ async def embed(engine: AsyncEmbeddingEngine):
 asyncio.run(embed(array["laion/clap-htsat-unfused"]))
 ```
 
+*Note: The sampling rate of the audio data needs to match the sampling rate expected by the model.*
+
 Example models:
 - [Clap Models from LAION](https://huggingface.co/collections/laion/clap-contrastive-language-audio-pretraining-65415c0b18373b607262a490)

From 727da50d0d643948d84a727909bec8166f286afc Mon Sep 17 00:00:00 2001
From: wirthual
Date: Tue, 24 Sep 2024 06:43:53 +0200
Subject: [PATCH 02/14] extract audio related code into audio utils

---
 .../infinity_emb/inference/batch_handler.py   |  3 +-
 .../infinity_emb/transformer/audio/utils.py   | 66 +++++++++++++++++++
 .../infinity_emb/transformer/vision/utils.py  | 58 +---------------
 3 files changed, 69 insertions(+), 58 deletions(-)
 create mode 100644 libs/infinity_emb/infinity_emb/transformer/audio/utils.py

diff --git a/libs/infinity_emb/infinity_emb/inference/batch_handler.py b/libs/infinity_emb/infinity_emb/inference/batch_handler.py
index 031bb19c..f69ba0b4 100644
--- a/libs/infinity_emb/infinity_emb/inference/batch_handler.py
+++ b/libs/infinity_emb/infinity_emb/inference/batch_handler.py
@@ -33,8 +33,9 @@
     get_inner_item,
 )
 from infinity_emb.transformer.abstract import BaseTransformer
+from infinity_emb.transformer.audio.utils import resolve_audios
 from infinity_emb.transformer.utils import get_lengths_with_tokenize
-from infinity_emb.transformer.vision.utils import resolve_audios, resolve_images
+from infinity_emb.transformer.vision.utils import resolve_images
 
 
 class ShutdownReadOnly:

diff --git a/libs/infinity_emb/infinity_emb/transformer/audio/utils.py b/libs/infinity_emb/infinity_emb/transformer/audio/utils.py
new file mode 100644
index 00000000..999fea45
--- /dev/null
+++ b/libs/infinity_emb/infinity_emb/transformer/audio/utils.py
@@ -0,0 +1,66 @@
+import asyncio
+import io
+from typing import Union
+
+from infinity_emb._optional_imports import CHECK_AIOHTTP, CHECK_SOUNDFILE
+from infinity_emb.primitives import (
+    AudioCorruption,
+    AudioSingle,
+)
+
+if CHECK_AIOHTTP.is_available:
+    import aiohttp
+
+if CHECK_SOUNDFILE.is_available:
+    import soundfile as sf  # type: ignore
+
+
+async def resolve_audio(
+    audio: Union[str, bytes],
+    allowed_sampling_rate: int,
+    session: "aiohttp.ClientSession",
+) -> AudioSingle:
+    if isinstance(audio, bytes):
+        try:
+            audio_bytes = io.BytesIO(audio)
+        except Exception as e:
+            raise AudioCorruption(f"Error opening audio: {e}")
+    else:
+        try:
+            downloaded = await (await session.get(audio)).read()
+            # downloaded = requests.get(audio, stream=True).content
+            audio_bytes = io.BytesIO(downloaded)
+        except Exception as e:
+            raise AudioCorruption(f"Error downloading audio.\nError msg: {str(e)}")
+
+    try:
+        data, rate = sf.read(audio_bytes)
+        if rate != allowed_sampling_rate:
+            raise AudioCorruption(
+                f"Audio sample rate is not {allowed_sampling_rate}Hz, it is {rate}Hz."
+            )
+        return AudioSingle(audio=data, sampling_rate=rate)
+    except Exception as e:
+        raise AudioCorruption(f"Error opening audio: {e}.\nError msg: {str(e)}")
+
+
+async def resolve_audios(
+    audio_urls: list[Union[str, bytes]], allowed_sampling_rate: int
+) -> list[AudioSingle]:
+    """Resolve audios from URLs."""
+    CHECK_AIOHTTP.mark_required()
+    CHECK_SOUNDFILE.mark_required()
+
+    resolved_audios: list[AudioSingle] = []
+    async with aiohttp.ClientSession(trust_env=True) as session:
+        try:
+            resolved_audios = await asyncio.gather(
+                *[
+                    resolve_audio(audio, allowed_sampling_rate, session)
+                    for audio in audio_urls
+                ]
+            )
+        except Exception as e:
+            raise AudioCorruption(f"Failed to resolve audio: {e}")
+
+    return resolved_audios

diff --git a/libs/infinity_emb/infinity_emb/transformer/vision/utils.py b/libs/infinity_emb/infinity_emb/transformer/vision/utils.py
index 08ff04ae..39cbfecb 100644
--- a/libs/infinity_emb/infinity_emb/transformer/vision/utils.py
+++ b/libs/infinity_emb/infinity_emb/transformer/vision/utils.py
@@ -5,10 +5,8 @@
 import io
 from typing import List, Union
 
-from infinity_emb._optional_imports import CHECK_AIOHTTP, CHECK_PIL, CHECK_SOUNDFILE
+from infinity_emb._optional_imports import CHECK_AIOHTTP, CHECK_PIL
 from infinity_emb.primitives import (
-    AudioCorruption,
-    AudioSingle,
     ImageClassType,
     ImageCorruption,
     ImageSingle,
@@ -20,9 +18,6 @@
 if CHECK_PIL.is_available:
     from PIL import Image  # type: ignore
 
-if CHECK_SOUNDFILE.is_available:
-    import soundfile as sf  # type: ignore
-
 
 def resolve_from_img_obj(img_obj: "ImageClassType") -> ImageSingle:
     """Resolve an image from a ImageClassType Object."""
@@ -90,54 +85,3 @@ async def resolve_images(
     )
 
     return resolved_imgs
-
-
-async def resolve_audio(
-    audio: Union[str, bytes],
-    allowed_sampling_rate: int,
-    session: "aiohttp.ClientSession",
-) -> AudioSingle:
-    if isinstance(audio, bytes):
-        try:
-            audio_bytes = io.BytesIO(audio)
-        except Exception as e:
-            raise AudioCorruption(f"Error opening audio: {e}")
-    else:
-        try:
-            downloaded = await (await session.get(audio)).read()
-            # downloaded = requests.get(audio, stream=True).content
-            audio_bytes = io.BytesIO(downloaded)
-        except Exception as e:
-            raise AudioCorruption(f"Error downloading audio.\nError msg: {str(e)}")
-
-    try:
-        data, rate = sf.read(audio_bytes)
-        if rate != allowed_sampling_rate:
-            raise AudioCorruption(
-                f"Audio sample rate is not {allowed_sampling_rate}Hz, it is {rate}Hz."
- ) - return AudioSingle(audio=data, sampling_rate=rate) - except Exception as e: - raise AudioCorruption(f"Error opening audio: {e}.\nError msg: {str(e)}") - - -async def resolve_audios( - audio_urls: list[Union[str, bytes]], allowed_sampling_rate: int -) -> list[AudioSingle]: - """Resolve audios from URLs.""" - CHECK_AIOHTTP.mark_required() - CHECK_SOUNDFILE.mark_required() - - resolved_audios: list[AudioSingle] = [] - async with aiohttp.ClientSession(trust_env=True) as session: - try: - resolved_audios = await asyncio.gather( - *[ - resolve_audio(audio, allowed_sampling_rate, session) - for audio in audio_urls - ] - ) - except Exception as e: - raise AudioCorruption(f"Failed to resolve audio: {e}") - - return resolved_audios From eb62d3e369b21f47421710a52c3f2145be41192d Mon Sep 17 00:00:00 2001 From: wirthual Date: Fri, 27 Sep 2024 04:41:06 +0200 Subject: [PATCH 03/14] add test cases for audio and vision --- docs/docs/cli_v2.md | 165 +++++++++++++----- libs/infinity_emb/tests/conftest.py | 4 +- .../tests/end_to_end/test_torch_audio.py | 94 ++++++++++ .../tests/end_to_end/test_torch_vision.py | 94 ++++++++++ 4 files changed, 314 insertions(+), 43 deletions(-) create mode 100644 libs/infinity_emb/tests/end_to_end/test_torch_audio.py create mode 100644 libs/infinity_emb/tests/end_to_end/test_torch_vision.py diff --git a/docs/docs/cli_v2.md b/docs/docs/cli_v2.md index 33fd3d3a..a02a2c5b 100644 --- a/docs/docs/cli_v2.md +++ b/docs/docs/cli_v2.md @@ -5,47 +5,128 @@ Note: The section below is auto-generated by the makefile. ```bash infinity_emb v2 --help - - Usage: infinity_emb v2 [OPTIONS] - - Infinity API ♾️ cli v2. MIT License. Copyright (c) 2023-now Michael Feil - Multiple Model CLI Playbook: - - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id/id2 --batch-size 8 --batch-size 4` - - 2. or adapt the defaults by setting ENV Variables separated by `;`: INFINITY_MODEL_ID="model/id1;model/id2;" && INFINITY_BATCH_SIZE="8;4;" - - 3. single items are broadcasted to `--model-id` length, making `v2 --model-id model/id1 --model-id/id2 --batch-size 8` both models have batch-size 8. - -╭─ Options ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ --model-id TEXT Huggingface model repo id. Subset of possible models: https://huggingface.co/models?other=text-embeddings-inference& [env var: `INFINITY_MODEL_ID`] │ -│ [default: michaelfeil/bge-small-en-v1.5] │ -│ --served-model-name TEXT the nickname for the API, under which the model_id can be selected [env var: `INFINITY_SERVED_MODEL_NAME`] │ -│ --batch-size INTEGER maximum batch size for inference [env var: `INFINITY_BATCH_SIZE`] [default: 32] │ -│ --revision TEXT huggingface model repo revision. [env var: `INFINITY_REVISION`] │ -│ --trust-remote-code --no-trust-remote-code if potential remote modeling code from huggingface repo is trusted. [env var: `INFINITY_TRUST_REMOTE_CODE`] [default: trust-remote-code] │ -│ --engine [torch|ctranslate2|optimum|debugengine] Which backend to use. `torch` uses Pytorch GPU/CPU, optimum uses ONNX on GPU/CPU/NVIDIA-TensorRT, `CTranslate2` uses torch+ctranslate2 on CPU/GPU. │ -│ [env var: `INFINITY_ENGINE`] │ -│ [default: torch] │ -│ --model-warmup --no-model-warmup if model should be warmed up after startup, and before ready. 
[env var: `INFINITY_MODEL_WARMUP`] [default: model-warmup] │ -│ --vector-disk-cache --no-vector-disk-cache If hash(request)/results should be cached to SQLite for latency improvement. [env var: `INFINITY_VECTOR_DISK_CACHE`] [default: vector-disk-cache] │ -│ --device [cpu|cuda|mps|tensorrt|auto] device to use for computing the model forward pass. [env var: `INFINITY_DEVICE`] [default: auto] │ -│ --lengths-via-tokenize --no-lengths-via-tokenize if True, returned tokens is based on actual tokenizer count. If false, uses len(input) as proxy. [env var: `INFINITY_LENGTHS_VIA_TOKENIZE`] │ -│ [default: lengths-via-tokenize] │ -│ --dtype [float32|float16|int8|fp8|auto] dtype for the model weights. [env var: `INFINITY_DTYPE`] [default: auto] │ -│ --embedding-dtype [float32|int8|uint8|binary|ubinary] dtype post-forward pass. If != `float32`, using Post-Forward Static quantization. [env var: `INFINITY_EMBEDDING_DTYPE`] [default: float32] │ -│ --pooling-method [mean|cls|auto] overwrite the pooling method if inferred incorrectly. [env var: `INFINITY_POOLING_METHOD`] [default: auto] │ -│ --compile --no-compile Enable usage of `torch.compile(dynamic=True)` if engine relies on it. [env var: `INFINITY_COMPILE`] [default: compile] │ -│ --bettertransformer --no-bettertransformer Enables varlen flash-attention-2 via the `BetterTransformer` implementation. If available for this model. [env var: `INFINITY_BETTERTRANSFORMER`] │ -│ [default: bettertransformer] │ -│ --preload-only --no-preload-only If true, only downloads models and verifies setup, then exit. Recommended for pre-caching the download in a Dockerfile. [env var: `INFINITY_PRELOAD_ONLY`] │ -│ [default: no-preload-only] │ -│ --host TEXT host for the FastAPI uvicorn server [env var: `INFINITY_HOST`] [default: 0.0.0.0] │ -│ --port INTEGER port for the FastAPI uvicorn server [env var: `INFINITY_PORT`] [default: 7997] │ -│ --url-prefix TEXT prefix for all routes of the FastAPI uvicorn server. Useful if you run behind a proxy / cascaded API. [env var: `INFINITY_URL_PREFIX`] │ -│ --redirect-slash TEXT where to redirect `/` requests to. [env var: `INFINITY_REDIRECT_SLASH`] [default: /docs] │ -│ --log-level [critical|error|warning|info|debug|trace] console log level. [env var: `INFINITY_LOG_LEVEL`] [default: info] │ -│ --permissive-cors --no-permissive-cors whether to allow permissive cors. [env var: `INFINITY_PERMISSIVE_CORS`] [default: no-permissive-cors] │ -│ --api-key TEXT api_key used for authentication headers. [env var: `INFINITY_API_KEY`] │ -│ --proxy-root-path TEXT Proxy prefix for the application. See: https://fastapi.tiangolo.com/advanced/behind-a-proxy/ [env var: `INFINITY_PROXY_ROOT_PATH`] │ -│ --help Show this message and exit. │ -╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ + + Usage: infinity_emb v2 [OPTIONS] + + Infinity API ♾️ cli v2. MIT License. Copyright (c) 2023-now Michael Feil + Multiple Model CLI Playbook: + - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id/id2 --batch-size 8 --batch-size 4` + - 2. or adapt the defaults by setting ENV Variables separated by `;`: INFINITY_MODEL_ID="model/id1;model/id2;" && + INFINITY_BATCH_SIZE="8;4;" + - 3. single items are broadcasted to `--model-id` length, making `v2 --model-id model/id1 --model-id/id2 --batch-size 8` both + models have batch-size 8. 
+ +╭─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ --model-id TEXT Huggingface model repo id. Subset │ +│ of possible models: │ +│ https://huggingface.co/models?othe… │ +│ [env var: `INFINITY_MODEL_ID`] │ +│ [default: │ +│ michaelfeil/bge-small-en-v1.5] │ +│ --served-model-name TEXT the nickname for the API, under │ +│ which the model_id can be selected │ +│ [env var: │ +│ `INFINITY_SERVED_MODEL_NAME`] │ +│ --batch-size INTEGER maximum batch size for inference │ +│ [env var: `INFINITY_BATCH_SIZE`] │ +│ [default: 32] │ +│ --revision TEXT huggingface model repo revision. │ +│ [env var: `INFINITY_REVISION`] │ +│ --trust-remote-code --no-trust-remote-code if potential remote modeling code │ +│ from huggingface repo is trusted. │ +│ [env var: │ +│ `INFINITY_TRUST_REMOTE_CODE`] │ +│ [default: trust-remote-code] │ +│ --engine [torch|ctranslate2|optimum|debuge Which backend to use. `torch` uses │ +│ ngine] Pytorch GPU/CPU, optimum uses ONNX │ +│ on GPU/CPU/NVIDIA-TensorRT, │ +│ `CTranslate2` uses │ +│ torch+ctranslate2 on CPU/GPU. │ +│ [env var: `INFINITY_ENGINE`] │ +│ [default: torch] │ +│ --model-warmup --no-model-warmup if model should be warmed up after │ +│ startup, and before ready. │ +│ [env var: `INFINITY_MODEL_WARMUP`] │ +│ [default: model-warmup] │ +│ --vector-disk-cache --no-vector-disk-cache If hash(request)/results should be │ +│ cached to SQLite for latency │ +│ improvement. │ +│ [env var: │ +│ `INFINITY_VECTOR_DISK_CACHE`] │ +│ [default: vector-disk-cache] │ +│ --device [cpu|cuda|mps|tensorrt|auto] device to use for computing the │ +│ model forward pass. │ +│ [env var: `INFINITY_DEVICE`] │ +│ [default: auto] │ +│ --lengths-via-tokenize --no-lengths-via-tokenize if True, returned tokens is based │ +│ on actual tokenizer count. If │ +│ false, uses len(input) as proxy. │ +│ [env var: │ +│ `INFINITY_LENGTHS_VIA_TOKENIZE`] │ +│ [default: lengths-via-tokenize] │ +│ --dtype [float32|float16|int8|fp8|auto] dtype for the model weights. │ +│ [env var: `INFINITY_DTYPE`] │ +│ [default: auto] │ +│ --embedding-dtype [float32|int8|uint8|binary|ubinar dtype post-forward pass. If != │ +│ y] `float32`, using Post-Forward │ +│ Static quantization. │ +│ [env var: │ +│ `INFINITY_EMBEDDING_DTYPE`] │ +│ [default: float32] │ +│ --pooling-method [mean|cls|auto] overwrite the pooling method if │ +│ inferred incorrectly. │ +│ [env var: │ +│ `INFINITY_POOLING_METHOD`] │ +│ [default: auto] │ +│ --compile --no-compile Enable usage of │ +│ `torch.compile(dynamic=True)` if │ +│ engine relies on it. │ +│ [env var: `INFINITY_COMPILE`] │ +│ [default: compile] │ +│ --bettertransformer --no-bettertransformer Enables varlen flash-attention-2 │ +│ via the `BetterTransformer` │ +│ implementation. If available for │ +│ this model. │ +│ [env var: │ +│ `INFINITY_BETTERTRANSFORMER`] │ +│ [default: bettertransformer] │ +│ --preload-only --no-preload-only If true, only downloads models and │ +│ verifies setup, then exit. │ +│ Recommended for pre-caching the │ +│ download in a Dockerfile. │ +│ [env var: `INFINITY_PRELOAD_ONLY`] │ +│ [default: no-preload-only] │ +│ --host TEXT host for the FastAPI uvicorn server │ +│ [env var: `INFINITY_HOST`] │ +│ [default: 0.0.0.0] │ +│ --port INTEGER port for the FastAPI uvicorn server │ +│ [env var: `INFINITY_PORT`] │ +│ [default: 7997] │ +│ --url-prefix TEXT prefix for all routes of the │ +│ FastAPI uvicorn server. Useful if │ +│ you run behind a proxy / cascaded │ +│ API. 
│ +│ [env var: `INFINITY_URL_PREFIX`] │ +│ --redirect-slash TEXT where to redirect `/` requests to. │ +│ [env var: │ +│ `INFINITY_REDIRECT_SLASH`] │ +│ [default: /docs] │ +│ --log-level [critical|error|warning|info|debu console log level. │ +│ g|trace] [env var: `INFINITY_LOG_LEVEL`] │ +│ [default: info] │ +│ --permissive-cors --no-permissive-cors whether to allow permissive cors. │ +│ [env var: │ +│ `INFINITY_PERMISSIVE_CORS`] │ +│ [default: no-permissive-cors] │ +│ --api-key TEXT api_key used for authentication │ +│ headers. │ +│ [env var: `INFINITY_API_KEY`] │ +│ --proxy-root-path TEXT Proxy prefix for the application. │ +│ See: │ +│ https://fastapi.tiangolo.com/advan… │ +│ [env var: │ +│ `INFINITY_PROXY_ROOT_PATH`] │ +│ --help Show this message and exit. │ +╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` diff --git a/libs/infinity_emb/tests/conftest.py b/libs/infinity_emb/tests/conftest.py index 4fec5bd5..df557695 100644 --- a/libs/infinity_emb/tests/conftest.py +++ b/libs/infinity_emb/tests/conftest.py @@ -8,8 +8,10 @@ pytest.DEFAULT_BERT_MODEL = "michaelfeil/bge-small-en-v1.5" pytest.DEFAULT_RERANKER_MODEL = "mixedbread-ai/mxbai-rerank-xsmall-v1" pytest.DEFAULT_CLASSIFIER_MODEL = "SamLowe/roberta-base-go_emotions" +pytest.DEFAULT_AUDIO_MODEL = "laion/clap-htsat-unfused" +pytest.DEFAULT_VISION_MODEL = "wkcn/TinyCLIP-ViT-8M-16-Text-3M-YFCC15M" -pytest.ENGINE_METHODS = ["embed", "image_embed", "classify", "rerank"] +pytest.ENGINE_METHODS = ["embed", "image_embed", "classify", "rerank", "audio_embed"] @pytest.fixture diff --git a/libs/infinity_emb/tests/end_to_end/test_torch_audio.py b/libs/infinity_emb/tests/end_to_end/test_torch_audio.py new file mode 100644 index 00000000..99672a5b --- /dev/null +++ b/libs/infinity_emb/tests/end_to_end/test_torch_audio.py @@ -0,0 +1,94 @@ +import pytest +import torch +from asgi_lifespan import LifespanManager +from fastapi import status +from httpx import AsyncClient + +from infinity_emb import create_server +from infinity_emb.args import EngineArgs +from infinity_emb.primitives import Device, InferenceEngine + +PREFIX = "/v1_ct2" +MODEL: str = pytest.DEFAULT_AUDIO_MODEL # type: ignore[assignment] +batch_size = 32 if torch.cuda.is_available() else 8 + +app = create_server( + url_prefix=PREFIX, + engine_args_list=[ + EngineArgs( + model_name_or_path=MODEL, + batch_size=batch_size, + engine=InferenceEngine.torch, + device=Device.auto if not torch.backends.mps.is_available() else Device.cpu, + ) + ], +) + + +@pytest.fixture() +async def client(): + async with AsyncClient( + app=app, base_url="http://test", timeout=20 + ) as client, LifespanManager(app): + yield client + + +@pytest.mark.anyio +async def test_model_route(client): + response = await client.get(f"{PREFIX}/models") + assert response.status_code == 200 + rdata = response.json() + assert "data" in rdata + assert rdata["data"][0].get("id", "") == MODEL + assert isinstance(rdata["data"][0].get("stats"), dict) + assert "audio_embed" in rdata["data"][0]["capabilities"] + + +@pytest.mark.anyio +async def test_audio_single(client): + audio_url = "https://github.com/michaelfeil/infinity/raw/3b72eb7c14bae06e68ddd07c1f23fe0bf403f220/libs/infinity_emb/tests/data/audio/beep.wav" + + response = await client.post( + f"{PREFIX}/embeddings_audio", + json={"model": MODEL, "input": audio_url}, + ) + assert response.status_code == 200 + rdata = response.json() + assert "model" in rdata + assert "usage" in rdata + rdata_results 
= rdata["data"] + assert rdata_results[0]["object"] == "embedding" + assert len(rdata_results[0]["embedding"]) > 0 + + +@pytest.mark.anyio +@pytest.mark.parametrize("no_of_audios", [0, 1, 5, 10]) +async def test_audio_multiple(client, no_of_audios): + audio_url = [ + "https://github.com/michaelfeil/infinity/raw/3b72eb7c14bae06e68ddd07c1f23fe0bf403f220/libs/infinity_emb/tests/data/audio/beep.wav" + ] * no_of_audios + + response = await client.post( + f"{PREFIX}/embeddings_image", + json={"model": MODEL, "input": audio_url}, + ) + assert response.status_code == 200 + rdata = response.json() + rdata_results = rdata["data"] + assert len(rdata_results) == no_of_audios + if no_of_audios: + assert "model" in rdata + assert "usage" in rdata + assert rdata_results[0]["object"] == "embedding" + assert len(rdata_results[0]["embedding"]) > 0 + + +@pytest.mark.anyio +async def test_audio_fail(client): + audio_url = "https://www.google.com/404" + + response = await client.post( + f"{PREFIX}/embeddings_image", + json={"model": MODEL, "input": audio_url}, + ) + assert response.status_code == status.HTTP_400_BAD_REQUEST diff --git a/libs/infinity_emb/tests/end_to_end/test_torch_vision.py b/libs/infinity_emb/tests/end_to_end/test_torch_vision.py new file mode 100644 index 00000000..2d41f10e --- /dev/null +++ b/libs/infinity_emb/tests/end_to_end/test_torch_vision.py @@ -0,0 +1,94 @@ +import pytest +import torch +from asgi_lifespan import LifespanManager +from fastapi import status +from httpx import AsyncClient + +from infinity_emb import create_server +from infinity_emb.args import EngineArgs +from infinity_emb.primitives import Device, InferenceEngine + +PREFIX = "/v1_ct2" +MODEL: str = pytest.DEFAULT_VISION_MODEL # type: ignore[assignment] +batch_size = 32 if torch.cuda.is_available() else 8 + +app = create_server( + url_prefix=PREFIX, + engine_args_list=[ + EngineArgs( + model_name_or_path=MODEL, + batch_size=batch_size, + engine=InferenceEngine.torch, + device=Device.auto if not torch.backends.mps.is_available() else Device.cpu, + ) + ], +) + + +@pytest.fixture() +async def client(): + async with AsyncClient( + app=app, base_url="http://test", timeout=20 + ) as client, LifespanManager(app): + yield client + + +@pytest.mark.anyio +async def test_model_route(client): + response = await client.get(f"{PREFIX}/models") + assert response.status_code == 200 + rdata = response.json() + assert "data" in rdata + assert rdata["data"][0].get("id", "") == MODEL + assert isinstance(rdata["data"][0].get("stats"), dict) + assert "image_embed" in rdata["data"][0]["capabilities"] + + +@pytest.mark.anyio +async def test_vision_single(client): + image_url = "http://images.cocodataset.org/val2017/000000039769.jpg" + + response = await client.post( + f"{PREFIX}/embeddings_image", + json={"model": MODEL, "input": image_url}, + ) + assert response.status_code == 200 + rdata = response.json() + assert "model" in rdata + assert "usage" in rdata + rdata_results = rdata["data"] + assert rdata_results[0]["object"] == "embedding" + assert len(rdata_results[0]["embedding"]) > 0 + + +@pytest.mark.anyio +@pytest.mark.parametrize("no_of_images", [0, 1, 5, 10]) +async def test_vision_multiple(client, no_of_images): + image_url = [ + "http://images.cocodataset.org/val2017/000000039769.jpg" + ] * no_of_images + + response = await client.post( + f"{PREFIX}/embeddings_image", + json={"model": MODEL, "input": image_url}, + ) + assert response.status_code == 200 + rdata = response.json() + rdata_results = rdata["data"] + assert 
len(rdata_results) == no_of_images + if no_of_images: + assert "model" in rdata + assert "usage" in rdata + assert rdata_results[0]["object"] == "embedding" + assert len(rdata_results[0]["embedding"]) > 0 + + +@pytest.mark.anyio +async def test_vision_fail(client): + image_url = "https://www.google.com/404" + + response = await client.post( + f"{PREFIX}/embeddings_image", + json={"model": MODEL, "input": image_url}, + ) + assert response.status_code == status.HTTP_400_BAD_REQUEST From 923b85e05939e6f360e169097e46bb17bff46c42 Mon Sep 17 00:00:00 2001 From: wirthual Date: Fri, 27 Sep 2024 04:46:12 +0200 Subject: [PATCH 04/14] revert docs v2 --- docs/docs/cli_v2.md | 170 +++++++++++--------------------------------- 1 file changed, 41 insertions(+), 129 deletions(-) diff --git a/docs/docs/cli_v2.md b/docs/docs/cli_v2.md index a02a2c5b..cc3b3317 100644 --- a/docs/docs/cli_v2.md +++ b/docs/docs/cli_v2.md @@ -1,132 +1,44 @@ -# CLI v2 Documentation - -The current version of Infinity uses the following arguments in its CLI: -Note: The section below is auto-generated by the makefile. - ```bash infinity_emb v2 --help - - Usage: infinity_emb v2 [OPTIONS] - - Infinity API ♾️ cli v2. MIT License. Copyright (c) 2023-now Michael Feil - Multiple Model CLI Playbook: - - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id/id2 --batch-size 8 --batch-size 4` - - 2. or adapt the defaults by setting ENV Variables separated by `;`: INFINITY_MODEL_ID="model/id1;model/id2;" && - INFINITY_BATCH_SIZE="8;4;" - - 3. single items are broadcasted to `--model-id` length, making `v2 --model-id model/id1 --model-id/id2 --batch-size 8` both - models have batch-size 8. - -╭─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ --model-id TEXT Huggingface model repo id. Subset │ -│ of possible models: │ -│ https://huggingface.co/models?othe… │ -│ [env var: `INFINITY_MODEL_ID`] │ -│ [default: │ -│ michaelfeil/bge-small-en-v1.5] │ -│ --served-model-name TEXT the nickname for the API, under │ -│ which the model_id can be selected │ -│ [env var: │ -│ `INFINITY_SERVED_MODEL_NAME`] │ -│ --batch-size INTEGER maximum batch size for inference │ -│ [env var: `INFINITY_BATCH_SIZE`] │ -│ [default: 32] │ -│ --revision TEXT huggingface model repo revision. │ -│ [env var: `INFINITY_REVISION`] │ -│ --trust-remote-code --no-trust-remote-code if potential remote modeling code │ -│ from huggingface repo is trusted. │ -│ [env var: │ -│ `INFINITY_TRUST_REMOTE_CODE`] │ -│ [default: trust-remote-code] │ -│ --engine [torch|ctranslate2|optimum|debuge Which backend to use. `torch` uses │ -│ ngine] Pytorch GPU/CPU, optimum uses ONNX │ -│ on GPU/CPU/NVIDIA-TensorRT, │ -│ `CTranslate2` uses │ -│ torch+ctranslate2 on CPU/GPU. │ -│ [env var: `INFINITY_ENGINE`] │ -│ [default: torch] │ -│ --model-warmup --no-model-warmup if model should be warmed up after │ -│ startup, and before ready. │ -│ [env var: `INFINITY_MODEL_WARMUP`] │ -│ [default: model-warmup] │ -│ --vector-disk-cache --no-vector-disk-cache If hash(request)/results should be │ -│ cached to SQLite for latency │ -│ improvement. │ -│ [env var: │ -│ `INFINITY_VECTOR_DISK_CACHE`] │ -│ [default: vector-disk-cache] │ -│ --device [cpu|cuda|mps|tensorrt|auto] device to use for computing the │ -│ model forward pass. │ -│ [env var: `INFINITY_DEVICE`] │ -│ [default: auto] │ -│ --lengths-via-tokenize --no-lengths-via-tokenize if True, returned tokens is based │ -│ on actual tokenizer count. 
If │ -│ false, uses len(input) as proxy. │ -│ [env var: │ -│ `INFINITY_LENGTHS_VIA_TOKENIZE`] │ -│ [default: lengths-via-tokenize] │ -│ --dtype [float32|float16|int8|fp8|auto] dtype for the model weights. │ -│ [env var: `INFINITY_DTYPE`] │ -│ [default: auto] │ -│ --embedding-dtype [float32|int8|uint8|binary|ubinar dtype post-forward pass. If != │ -│ y] `float32`, using Post-Forward │ -│ Static quantization. │ -│ [env var: │ -│ `INFINITY_EMBEDDING_DTYPE`] │ -│ [default: float32] │ -│ --pooling-method [mean|cls|auto] overwrite the pooling method if │ -│ inferred incorrectly. │ -│ [env var: │ -│ `INFINITY_POOLING_METHOD`] │ -│ [default: auto] │ -│ --compile --no-compile Enable usage of │ -│ `torch.compile(dynamic=True)` if │ -│ engine relies on it. │ -│ [env var: `INFINITY_COMPILE`] │ -│ [default: compile] │ -│ --bettertransformer --no-bettertransformer Enables varlen flash-attention-2 │ -│ via the `BetterTransformer` │ -│ implementation. If available for │ -│ this model. │ -│ [env var: │ -│ `INFINITY_BETTERTRANSFORMER`] │ -│ [default: bettertransformer] │ -│ --preload-only --no-preload-only If true, only downloads models and │ -│ verifies setup, then exit. │ -│ Recommended for pre-caching the │ -│ download in a Dockerfile. │ -│ [env var: `INFINITY_PRELOAD_ONLY`] │ -│ [default: no-preload-only] │ -│ --host TEXT host for the FastAPI uvicorn server │ -│ [env var: `INFINITY_HOST`] │ -│ [default: 0.0.0.0] │ -│ --port INTEGER port for the FastAPI uvicorn server │ -│ [env var: `INFINITY_PORT`] │ -│ [default: 7997] │ -│ --url-prefix TEXT prefix for all routes of the │ -│ FastAPI uvicorn server. Useful if │ -│ you run behind a proxy / cascaded │ -│ API. │ -│ [env var: `INFINITY_URL_PREFIX`] │ -│ --redirect-slash TEXT where to redirect `/` requests to. │ -│ [env var: │ -│ `INFINITY_REDIRECT_SLASH`] │ -│ [default: /docs] │ -│ --log-level [critical|error|warning|info|debu console log level. │ -│ g|trace] [env var: `INFINITY_LOG_LEVEL`] │ -│ [default: info] │ -│ --permissive-cors --no-permissive-cors whether to allow permissive cors. │ -│ [env var: │ -│ `INFINITY_PERMISSIVE_CORS`] │ -│ [default: no-permissive-cors] │ -│ --api-key TEXT api_key used for authentication │ -│ headers. │ -│ [env var: `INFINITY_API_KEY`] │ -│ --proxy-root-path TEXT Proxy prefix for the application. │ -│ See: │ -│ https://fastapi.tiangolo.com/advan… │ -│ [env var: │ -│ `INFINITY_PROXY_ROOT_PATH`] │ -│ --help Show this message and exit. │ -╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` + Usage: infinity_emb v2 [OPTIONS] + + Infinity API ♾️ cli v2. MIT License. Copyright (c) 2023-now Michael Feil + Multiple Model CLI Playbook: + - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id/id2 --batch-size 8 --batch-size 4` + - 2. or adapt the defaults by setting ENV Variables separated by `;`: INFINITY_MODEL_ID="model/id1;model/id2;" && INFINITY_BATCH_SIZE="8;4;" + - 3. single items are broadcasted to `--model-id` length, making `v2 --model-id model/id1 --model-id/id2 --batch-size 8` both models have batch-size 8. + +╭─ Options ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ --model-id TEXT Huggingface model repo id. 
Subset of possible models: https://huggingface.co/models?other=text-embeddings-inference& [env var: `INFINITY_MODEL_ID`] │ +│ [default: michaelfeil/bge-small-en-v1.5] │ +│ --served-model-name TEXT the nickname for the API, under which the model_id can be selected [env var: `INFINITY_SERVED_MODEL_NAME`] │ +│ --batch-size INTEGER maximum batch size for inference [env var: `INFINITY_BATCH_SIZE`] [default: 32] │ +│ --revision TEXT huggingface model repo revision. [env var: `INFINITY_REVISION`] │ +│ --trust-remote-code --no-trust-remote-code if potential remote modeling code from huggingface repo is trusted. [env var: `INFINITY_TRUST_REMOTE_CODE`] [default: trust-remote-code] │ +│ --engine [torch|ctranslate2|optimum|debugengine] Which backend to use. `torch` uses Pytorch GPU/CPU, optimum uses ONNX on GPU/CPU/NVIDIA-TensorRT, `CTranslate2` uses torch+ctranslate2 on CPU/GPU. │ +│ [env var: `INFINITY_ENGINE`] │ +│ [default: torch] │ +│ --model-warmup --no-model-warmup if model should be warmed up after startup, and before ready. [env var: `INFINITY_MODEL_WARMUP`] [default: model-warmup] │ +│ --vector-disk-cache --no-vector-disk-cache If hash(request)/results should be cached to SQLite for latency improvement. [env var: `INFINITY_VECTOR_DISK_CACHE`] [default: vector-disk-cache] │ +│ --device [cpu|cuda|mps|tensorrt|auto] device to use for computing the model forward pass. [env var: `INFINITY_DEVICE`] [default: auto] │ +│ --lengths-via-tokenize --no-lengths-via-tokenize if True, returned tokens is based on actual tokenizer count. If false, uses len(input) as proxy. [env var: `INFINITY_LENGTHS_VIA_TOKENIZE`] │ +│ [default: lengths-via-tokenize] │ +│ --dtype [float32|float16|int8|fp8|auto] dtype for the model weights. [env var: `INFINITY_DTYPE`] [default: auto] │ +│ --embedding-dtype [float32|int8|uint8|binary|ubinary] dtype post-forward pass. If != `float32`, using Post-Forward Static quantization. [env var: `INFINITY_EMBEDDING_DTYPE`] [default: float32] │ +│ --pooling-method [mean|cls|auto] overwrite the pooling method if inferred incorrectly. [env var: `INFINITY_POOLING_METHOD`] [default: auto] │ +│ --compile --no-compile Enable usage of `torch.compile(dynamic=True)` if engine relies on it. [env var: `INFINITY_COMPILE`] [default: compile] │ +│ --bettertransformer --no-bettertransformer Enables varlen flash-attention-2 via the `BetterTransformer` implementation. If available for this model. [env var: `INFINITY_BETTERTRANSFORMER`] │ +│ [default: bettertransformer] │ +│ --preload-only --no-preload-only If true, only downloads models and verifies setup, then exit. Recommended for pre-caching the download in a Dockerfile. [env var: `INFINITY_PRELOAD_ONLY`] │ +│ [default: no-preload-only] │ +│ --host TEXT host for the FastAPI uvicorn server [env var: `INFINITY_HOST`] [default: 0.0.0.0] │ +│ --port INTEGER port for the FastAPI uvicorn server [env var: `INFINITY_PORT`] [default: 7997] │ +│ --url-prefix TEXT prefix for all routes of the FastAPI uvicorn server. Useful if you run behind a proxy / cascaded API. [env var: `INFINITY_URL_PREFIX`] │ +│ --redirect-slash TEXT where to redirect `/` requests to. [env var: `INFINITY_REDIRECT_SLASH`] [default: /docs] │ +│ --log-level [critical|error|warning|info|debug|trace] console log level. [env var: `INFINITY_LOG_LEVEL`] [default: info] │ +│ --permissive-cors --no-permissive-cors whether to allow permissive cors. [env var: `INFINITY_PERMISSIVE_CORS`] [default: no-permissive-cors] │ +│ --api-key TEXT api_key used for authentication headers. 
[env var: `INFINITY_API_KEY`] │ +│ --proxy-root-path TEXT Proxy prefix for the application. See: https://fastapi.tiangolo.com/advanced/behind-a-proxy/ [env var: `INFINITY_PROXY_ROOT_PATH`] │ +│ --help Show this message and exit. │ +╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ \ No newline at end of file From b48e8089b1d5c79b4e63596a7885566c60e85860 Mon Sep 17 00:00:00 2001 From: wirthual Date: Fri, 27 Sep 2024 04:47:12 +0200 Subject: [PATCH 05/14] revert docs v2 --- docs/docs/cli_v2.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/docs/cli_v2.md b/docs/docs/cli_v2.md index cc3b3317..31e0c743 100644 --- a/docs/docs/cli_v2.md +++ b/docs/docs/cli_v2.md @@ -41,4 +41,6 @@ infinity_emb v2 --help │ --api-key TEXT api_key used for authentication headers. [env var: `INFINITY_API_KEY`] │ │ --proxy-root-path TEXT Proxy prefix for the application. See: https://fastapi.tiangolo.com/advanced/behind-a-proxy/ [env var: `INFINITY_PROXY_ROOT_PATH`] │ │ --help Show this message and exit. │ -╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ \ No newline at end of file +╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ + +``` \ No newline at end of file From ef245358610b544e4509c3d5d2f92f88065d00e5 Mon Sep 17 00:00:00 2001 From: wirthual Date: Fri, 27 Sep 2024 17:23:44 +0200 Subject: [PATCH 06/14] fix test cases --- .../tests/end_to_end/test_torch_audio.py | 21 ++++++++++++++----- .../tests/end_to_end/test_torch_vision.py | 15 ++++++++++--- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/libs/infinity_emb/tests/end_to_end/test_torch_audio.py b/libs/infinity_emb/tests/end_to_end/test_torch_audio.py index 99672a5b..ed264d2d 100644 --- a/libs/infinity_emb/tests/end_to_end/test_torch_audio.py +++ b/libs/infinity_emb/tests/end_to_end/test_torch_audio.py @@ -62,15 +62,15 @@ async def test_audio_single(client): @pytest.mark.anyio -@pytest.mark.parametrize("no_of_audios", [0, 1, 5, 10]) +@pytest.mark.parametrize("no_of_audios", [1, 5, 10]) async def test_audio_multiple(client, no_of_audios): - audio_url = [ + audio_urls = [ "https://github.com/michaelfeil/infinity/raw/3b72eb7c14bae06e68ddd07c1f23fe0bf403f220/libs/infinity_emb/tests/data/audio/beep.wav" ] * no_of_audios response = await client.post( - f"{PREFIX}/embeddings_image", - json={"model": MODEL, "input": audio_url}, + f"{PREFIX}/embeddings_audio", + json={"model": MODEL, "input": audio_urls}, ) assert response.status_code == 200 rdata = response.json() @@ -88,7 +88,18 @@ async def test_audio_fail(client): audio_url = "https://www.google.com/404" response = await client.post( - f"{PREFIX}/embeddings_image", + f"{PREFIX}/embeddings_audio", json={"model": MODEL, "input": audio_url}, ) assert response.status_code == status.HTTP_400_BAD_REQUEST + + +@pytest.mark.anyio +async def test_audio_empty(client): + audio_url_empty = [] + + response_empty = await client.post( + 
f"{PREFIX}/embeddings_audio", + json={"model": MODEL, "input": audio_url_empty}, + ) + assert response_empty.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY \ No newline at end of file diff --git a/libs/infinity_emb/tests/end_to_end/test_torch_vision.py b/libs/infinity_emb/tests/end_to_end/test_torch_vision.py index 2d41f10e..3db5c26d 100644 --- a/libs/infinity_emb/tests/end_to_end/test_torch_vision.py +++ b/libs/infinity_emb/tests/end_to_end/test_torch_vision.py @@ -62,15 +62,15 @@ async def test_vision_single(client): @pytest.mark.anyio -@pytest.mark.parametrize("no_of_images", [0, 1, 5, 10]) +@pytest.mark.parametrize("no_of_images", [1, 5, 10]) async def test_vision_multiple(client, no_of_images): - image_url = [ + image_urls = [ "http://images.cocodataset.org/val2017/000000039769.jpg" ] * no_of_images response = await client.post( f"{PREFIX}/embeddings_image", - json={"model": MODEL, "input": image_url}, + json={"model": MODEL, "input": image_urls}, ) assert response.status_code == 200 rdata = response.json() @@ -92,3 +92,12 @@ async def test_vision_fail(client): json={"model": MODEL, "input": image_url}, ) assert response.status_code == status.HTTP_400_BAD_REQUEST + +@pytest.mark.anyio +async def test_vision_empty(client): + image_url_empty = [] + response = await client.post( + f"{PREFIX}/embeddings_image", + json={"model": MODEL, "input": image_url_empty}, + ) + assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY \ No newline at end of file From a894760ffd901c6781f87d0d66e58fcd618911e3 Mon Sep 17 00:00:00 2001 From: wirthual Date: Sat, 28 Sep 2024 16:38:24 +0200 Subject: [PATCH 07/14] add test for text only vision case --- .../tests/end_to_end/test_torch_audio.py | 11 ++++---- .../tests/end_to_end/test_torch_vision.py | 28 +++++++++++++++---- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/libs/infinity_emb/tests/end_to_end/test_torch_audio.py b/libs/infinity_emb/tests/end_to_end/test_torch_audio.py index ed264d2d..81db3a18 100644 --- a/libs/infinity_emb/tests/end_to_end/test_torch_audio.py +++ b/libs/infinity_emb/tests/end_to_end/test_torch_audio.py @@ -76,11 +76,10 @@ async def test_audio_multiple(client, no_of_audios): rdata = response.json() rdata_results = rdata["data"] assert len(rdata_results) == no_of_audios - if no_of_audios: - assert "model" in rdata - assert "usage" in rdata - assert rdata_results[0]["object"] == "embedding" - assert len(rdata_results[0]["embedding"]) > 0 + assert "model" in rdata + assert "usage" in rdata + assert rdata_results[0]["object"] == "embedding" + assert len(rdata_results[0]["embedding"]) > 0 @pytest.mark.anyio @@ -102,4 +101,4 @@ async def test_audio_empty(client): f"{PREFIX}/embeddings_audio", json={"model": MODEL, "input": audio_url_empty}, ) - assert response_empty.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY \ No newline at end of file + assert response_empty.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY diff --git a/libs/infinity_emb/tests/end_to_end/test_torch_vision.py b/libs/infinity_emb/tests/end_to_end/test_torch_vision.py index 3db5c26d..9b0fb8d7 100644 --- a/libs/infinity_emb/tests/end_to_end/test_torch_vision.py +++ b/libs/infinity_emb/tests/end_to_end/test_torch_vision.py @@ -61,6 +61,22 @@ async def test_vision_single(client): assert len(rdata_results[0]["embedding"]) > 0 +@pytest.mark.anyio +async def test_vision_single_text_only(client): + text = "a image of a cat" + + response = await client.post( + f"{PREFIX}/embeddings_image", + json={"model": MODEL, "input": text}, + ) + 
assert response.status_code == 200 + rdata = response.json() + assert "model" in rdata + assert "usage" in rdata + rdata_results = rdata["data"] + assert rdata_results[0]["object"] == "embedding" + assert len(rdata_results[0]["embedding"]) > 0 + @pytest.mark.anyio @pytest.mark.parametrize("no_of_images", [1, 5, 10]) async def test_vision_multiple(client, no_of_images): @@ -76,11 +92,10 @@ async def test_vision_multiple(client, no_of_images): rdata = response.json() rdata_results = rdata["data"] assert len(rdata_results) == no_of_images - if no_of_images: - assert "model" in rdata - assert "usage" in rdata - assert rdata_results[0]["object"] == "embedding" - assert len(rdata_results[0]["embedding"]) > 0 + assert "model" in rdata + assert "usage" in rdata + assert rdata_results[0]["object"] == "embedding" + assert len(rdata_results[0]["embedding"]) > 0 @pytest.mark.anyio @@ -93,6 +108,7 @@ async def test_vision_fail(client): ) assert response.status_code == status.HTTP_400_BAD_REQUEST + @pytest.mark.anyio async def test_vision_empty(client): image_url_empty = [] @@ -100,4 +116,4 @@ async def test_vision_empty(client): f"{PREFIX}/embeddings_image", json={"model": MODEL, "input": image_url_empty}, ) - assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY \ No newline at end of file + assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY From a592419f81e79f3ae0ee2871f2d38fa843d6a753 Mon Sep 17 00:00:00 2001 From: wirthual Date: Sat, 28 Sep 2024 16:40:51 +0200 Subject: [PATCH 08/14] add text only case for audio --- .../tests/end_to_end/test_torch_audio.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/libs/infinity_emb/tests/end_to_end/test_torch_audio.py b/libs/infinity_emb/tests/end_to_end/test_torch_audio.py index 81db3a18..4795b74e 100644 --- a/libs/infinity_emb/tests/end_to_end/test_torch_audio.py +++ b/libs/infinity_emb/tests/end_to_end/test_torch_audio.py @@ -61,6 +61,22 @@ async def test_audio_single(client): assert len(rdata_results[0]["embedding"]) > 0 +@pytest.mark.anyio +async def test_audio_single_text_only(client): + text = "a sound of a at" + + response = await client.post( + f"{PREFIX}/embeddings_audio", + json={"model": MODEL, "input": text}, + ) + assert response.status_code == 200 + rdata = response.json() + assert "model" in rdata + assert "usage" in rdata + rdata_results = rdata["data"] + assert rdata_results[0]["object"] == "embedding" + assert len(rdata_results[0]["embedding"]) > 0 + @pytest.mark.anyio @pytest.mark.parametrize("no_of_audios", [1, 5, 10]) async def test_audio_multiple(client, no_of_audios): From ad2d262b99369f0289e7137840ffa2bddfda3436 Mon Sep 17 00:00:00 2001 From: wirthual Date: Sat, 28 Sep 2024 17:29:19 +0200 Subject: [PATCH 09/14] format code --- libs/infinity_emb/tests/end_to_end/test_torch_audio.py | 1 + libs/infinity_emb/tests/end_to_end/test_torch_vision.py | 1 + 2 files changed, 2 insertions(+) diff --git a/libs/infinity_emb/tests/end_to_end/test_torch_audio.py b/libs/infinity_emb/tests/end_to_end/test_torch_audio.py index 4795b74e..d58880c9 100644 --- a/libs/infinity_emb/tests/end_to_end/test_torch_audio.py +++ b/libs/infinity_emb/tests/end_to_end/test_torch_audio.py @@ -77,6 +77,7 @@ async def test_audio_single_text_only(client): assert rdata_results[0]["object"] == "embedding" assert len(rdata_results[0]["embedding"]) > 0 + @pytest.mark.anyio @pytest.mark.parametrize("no_of_audios", [1, 5, 10]) async def test_audio_multiple(client, no_of_audios): diff --git 
a/libs/infinity_emb/tests/end_to_end/test_torch_vision.py b/libs/infinity_emb/tests/end_to_end/test_torch_vision.py
index 9b0fb8d7..be40044f 100644
--- a/libs/infinity_emb/tests/end_to_end/test_torch_vision.py
+++ b/libs/infinity_emb/tests/end_to_end/test_torch_vision.py
@@ -77,6 +77,7 @@ async def test_vision_single_text_only(client):
     assert rdata_results[0]["object"] == "embedding"
     assert len(rdata_results[0]["embedding"]) > 0
 
+
 @pytest.mark.anyio
 @pytest.mark.parametrize("no_of_images", [1, 5, 10])
 async def test_vision_multiple(client, no_of_images):

From 074105db340f05ded0509bdc0cabfeb431f56587 Mon Sep 17 00:00:00 2001
From: wirthual
Date: Sat, 28 Sep 2024 18:03:48 +0200
Subject: [PATCH 10/14] skip text test for now to see updated coverage

---
 libs/infinity_emb/tests/end_to_end/test_torch_audio.py  | 1 +
 libs/infinity_emb/tests/end_to_end/test_torch_vision.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/libs/infinity_emb/tests/end_to_end/test_torch_audio.py b/libs/infinity_emb/tests/end_to_end/test_torch_audio.py
index d58880c9..80625b43 100644
--- a/libs/infinity_emb/tests/end_to_end/test_torch_audio.py
+++ b/libs/infinity_emb/tests/end_to_end/test_torch_audio.py
@@ -62,6 +62,7 @@ async def test_audio_single(client):
 
 
 @pytest.mark.anyio
+@pytest.mark.skip("text only")
 async def test_audio_single_text_only(client):
     text = "a sound of a at"
 
diff --git a/libs/infinity_emb/tests/end_to_end/test_torch_vision.py b/libs/infinity_emb/tests/end_to_end/test_torch_vision.py
index be40044f..f6e5b8a1 100644
--- a/libs/infinity_emb/tests/end_to_end/test_torch_vision.py
+++ b/libs/infinity_emb/tests/end_to_end/test_torch_vision.py
@@ -62,6 +62,7 @@ async def test_vision_single(client):
 
 
 @pytest.mark.anyio
+@pytest.mark.skip("text only")
 async def test_vision_single_text_only(client):
     text = "a image of a cat"
 

From f2925c87f15227843aa8f05d5435afc75d2babac Mon Sep 17 00:00:00 2001
From: wirthual
Date: Sun, 29 Sep 2024 04:22:09 +0200
Subject: [PATCH 11/14] revert cli doc from main branch

---
 docs/docs/cli_v2.md | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/docs/docs/cli_v2.md b/docs/docs/cli_v2.md
index 31e0c743..33fd3d3a 100644
--- a/docs/docs/cli_v2.md
+++ b/docs/docs/cli_v2.md
@@ -1,14 +1,19 @@
+# CLI v2 Documentation
+
+The current version of Infinity uses the following arguments in its CLI:
+Note: The section below is auto-generated by the makefile.
+
 ```bash
 infinity_emb v2 --help
-
+
  Usage: infinity_emb v2 [OPTIONS]
-
+
  Infinity API ♾️ cli v2. MIT License. Copyright (c) 2023-now Michael Feil
  Multiple Model CLI Playbook:
  - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id/id2 --batch-size 8 --batch-size 4`
  - 2. or adapt the defaults by setting ENV Variables separated by `;`: INFINITY_MODEL_ID="model/id1;model/id2;" && INFINITY_BATCH_SIZE="8;4;"
  - 3. single items are broadcasted to `--model-id` length, making `v2 --model-id model/id1 --model-id/id2 --batch-size 8` both models have batch-size 8.
-
+
 ╭─ Options ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
 │ --model-id                                            TEXT                                      Huggingface model repo id. 
Subset of possible models: https://huggingface.co/models?other=text-embeddings-inference& [env var: `INFINITY_MODEL_ID`] │ │ [default: michaelfeil/bge-small-en-v1.5] │ @@ -43,4 +48,4 @@ infinity_emb v2 --help │ --help Show this message and exit. │ ╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -``` \ No newline at end of file +``` From 7558e7a525519ad9b02eb847c5d59b577b17fe31 Mon Sep 17 00:00:00 2001 From: wirthual Date: Mon, 30 Sep 2024 02:49:11 +0200 Subject: [PATCH 12/14] add changes to support text and urls --- .../infinity_emb/fastapi_schemas/pymodels.py | 13 ++++- .../infinity_emb/infinity_server.py | 44 ++++++++++++--- .../tests/end_to_end/test_torch_audio.py | 56 ++++++++++++++++++- 3 files changed, 101 insertions(+), 12 deletions(-) diff --git a/libs/infinity_emb/infinity_emb/fastapi_schemas/pymodels.py b/libs/infinity_emb/infinity_emb/fastapi_schemas/pymodels.py index 3852b497..7bb93f41 100644 --- a/libs/infinity_emb/infinity_emb/fastapi_schemas/pymodels.py +++ b/libs/infinity_emb/infinity_emb/fastapi_schemas/pymodels.py @@ -93,8 +93,17 @@ class ImageEmbeddingInput(BaseModel): user: Optional[str] = None -class AudioEmbeddingInput(ImageEmbeddingInput): - pass +class AudioEmbeddingInput(BaseModel): + input: Union[ # type: ignore + conlist( # type: ignore + Union[Annotated[AnyUrl, HttpUrl], str], + **ITEMS_LIMIT_SMALL, + ), + Union[Annotated[AnyUrl, HttpUrl], str], + ] + model: str = "default/not-specified" + encoding_format: EmbeddingEncodingFormat = EmbeddingEncodingFormat.float + user: Optional[str] = None class _EmbeddingObject(BaseModel): diff --git a/libs/infinity_emb/infinity_emb/infinity_server.py b/libs/infinity_emb/infinity_emb/infinity_server.py index 73bb5996..efcd60a5 100644 --- a/libs/infinity_emb/infinity_emb/infinity_server.py +++ b/libs/infinity_emb/infinity_emb/infinity_server.py @@ -8,6 +8,7 @@ import time from contextlib import asynccontextmanager from typing import Any, Optional +from urllib.parse import urlparse import infinity_emb from infinity_emb._optional_imports import CHECK_TYPER, CHECK_UVICORN @@ -411,29 +412,54 @@ async def _embeddings_audio(data: AudioEmbeddingInput): json={"model":"laion/larger_clap_general","input":["https://github.com/michaelfeil/infinity/raw/3b72eb7c14bae06e68ddd07c1f23fe0bf403f220/libs/infinity_emb/tests/data/audio/beep.wav"]}) """ engine = _resolve_engine(data.model) - if hasattr(data.input, "host"): - # if it is a single url - audio_inputs = [str(data.input)] + input_list: list[str] = [] + if isinstance(data.input, str): + input_list.append(data.input) else: - audio_inputs = [str(d) for d in data.input] # type: ignore + input_list = data.input # type: ignore + audio_urls = [] + texts = [] + is_audios = [] + for input in input_list: + parsed_url = urlparse(input) + # Todo: Improve url check + if parsed_url.netloc and parsed_url.scheme: + # if it is a single url + audio_urls.append(str(input)) + is_audios.append(True) + else: + texts.append(input) # type: ignore + is_audios.append(False) try: - logger.debug("[📝] Received request with %s Urls ", len(audio_inputs)) + logger.debug( + f"[📝] Received request with {len(audio_urls)} Urls and {len(texts)} sentences" + ) start = time.perf_counter() - embedding, usage = await engine.audio_embed(audios=audio_inputs) # type: ignore + if audio_urls: + audio_embeddings, 
usage = await engine.audio_embed(audios=audio_urls) # type: ignore + if texts: + text_embeddings, usage = await engine.embed(sentences=texts) + + embeddings_with_restored_order = [] + for is_audio in is_audios: + if is_audio: + embeddings_with_restored_order.append(audio_embeddings.pop(0)) + else: + embeddings_with_restored_order.append(text_embeddings.pop(0)) duration = (time.perf_counter() - start) * 1000 - logger.debug("[✅] Done in %s ms", duration) + logger.debug(f"[✅] Done in {duration} ms") return OpenAIEmbeddingResult.to_embeddings_response( - embeddings=embedding, + embeddings=embeddings_with_restored_order, engine_args=engine.engine_args, encoding_format=data.encoding_format, usage=usage, ) except AudioCorruption as ex: raise errors.OpenAIException( - f"AudioCorruption, could not open {audio_inputs} -> {ex}", + f"AudioCorruption, could not open {audio_urls} -> {ex}", code=status.HTTP_400_BAD_REQUEST, ) except ModelNotDeployedError as ex: diff --git a/libs/infinity_emb/tests/end_to_end/test_torch_audio.py b/libs/infinity_emb/tests/end_to_end/test_torch_audio.py index 80625b43..415a1287 100644 --- a/libs/infinity_emb/tests/end_to_end/test_torch_audio.py +++ b/libs/infinity_emb/tests/end_to_end/test_torch_audio.py @@ -25,6 +25,13 @@ ) +def cosine_similarity(a, b): + from numpy import dot + from numpy.linalg import norm + + return dot(a, b) / (norm(a) * norm(b)) + + @pytest.fixture() async def client(): async with AsyncClient( @@ -62,7 +69,6 @@ async def test_audio_single(client): @pytest.mark.anyio -@pytest.mark.skip("text only") async def test_audio_single_text_only(client): text = "a sound of a at" @@ -79,6 +85,54 @@ async def test_audio_single_text_only(client): assert len(rdata_results[0]["embedding"]) > 0 +@pytest.mark.anyio +@pytest.mark.parametrize("no_of_input_pairs", [1, 5]) +async def test_audio_text_url_mixed(client, no_of_input_pairs): + text = "a sound of a at" + audio_url = "https://github.com/michaelfeil/infinity/raw/3b72eb7c14bae06e68ddd07c1f23fe0bf403f220/libs/infinity_emb/tests/data/audio/beep.wav" + + input = [text, audio_url] * no_of_input_pairs + + response = await client.post( + f"{PREFIX}/embeddings_audio", + json={"model": MODEL, "input": input}, + ) + assert response.status_code == 200 + rdata = response.json() + assert "model" in rdata + assert "usage" in rdata + rdata_results = rdata["data"] + assert rdata_results[0]["object"] == "embedding" + assert len(rdata_results[0]["embedding"]) > 0 + assert len(rdata_results) == len(input) + + +@pytest.mark.anyio +async def test_meta(client): + audio_url = "https://github.com/michaelfeil/infinity/raw/3b72eb7c14bae06e68ddd07c1f23fe0bf403f220/libs/infinity_emb/tests/data/audio/beep.wav" + + input = [audio_url, "a beep", "a horse", "a fish"] + + response = await client.post( + f"{PREFIX}/embeddings_audio", + json={"model": MODEL, "input": input}, + ) + assert response.status_code == 200 + rdata = response.json() + rdata_results = rdata["data"] + + embeddings_audio_beep = rdata_results[0]["embedding"] + embeddings_text_beep = rdata_results[1]["embedding"] + embeddings_text_horse = rdata_results[2]["embedding"] + embeddings_text_fish = rdata_results[3]["embedding"] + assert cosine_similarity( + embeddings_audio_beep, embeddings_text_beep + ) > cosine_similarity(embeddings_audio_beep, embeddings_text_fish) + assert cosine_similarity( + embeddings_audio_beep, embeddings_text_beep + ) > cosine_similarity(embeddings_audio_beep, embeddings_text_horse) + + @pytest.mark.anyio @pytest.mark.parametrize("no_of_audios", [1, 5, 10]) 
async def test_audio_multiple(client, no_of_audios): From 93ecc5dd8cec36db4e26e75a34ef412224d78c1b Mon Sep 17 00:00:00 2001 From: wirthual Date: Mon, 30 Sep 2024 16:55:52 +0200 Subject: [PATCH 13/14] address comments. Report correct usage --- .../infinity_emb/infinity_server.py | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/libs/infinity_emb/infinity_emb/infinity_server.py b/libs/infinity_emb/infinity_emb/infinity_server.py index efcd60a5..da98d720 100644 --- a/libs/infinity_emb/infinity_emb/infinity_server.py +++ b/libs/infinity_emb/infinity_emb/infinity_server.py @@ -33,6 +33,7 @@ Device, Dtype, EmbeddingDtype, + EmbeddingReturnType, ImageCorruption, InferenceEngine, ModelNotDeployedError, @@ -412,15 +413,15 @@ async def _embeddings_audio(data: AudioEmbeddingInput): json={"model":"laion/larger_clap_general","input":["https://github.com/michaelfeil/infinity/raw/3b72eb7c14bae06e68ddd07c1f23fe0bf403f220/libs/infinity_emb/tests/data/audio/beep.wav"]}) """ engine = _resolve_engine(data.model) - input_list: list[str] = [] + sentences_and_urls: list[str] = [] if isinstance(data.input, str): - input_list.append(data.input) + sentences_and_urls.append(data.input) else: - input_list = data.input # type: ignore + sentences_and_urls = data.input # type: ignore audio_urls = [] - texts = [] + sentences = [] is_audios = [] - for input in input_list: + for input in sentences_and_urls: parsed_url = urlparse(input) # Todo: Improve url check if parsed_url.netloc and parsed_url.scheme: @@ -428,18 +429,23 @@ async def _embeddings_audio(data: AudioEmbeddingInput): audio_urls.append(str(input)) is_audios.append(True) else: - texts.append(input) # type: ignore + sentences.append(input) # type: ignore is_audios.append(False) try: logger.debug( - f"[📝] Received request with {len(audio_urls)} Urls and {len(texts)} sentences" + f"[📝] Received request with {len(audio_urls)} Urls and {len(sentences)} sentences" ) start = time.perf_counter() + total_usage = 0 + audio_embeddings: list[EmbeddingReturnType] = [] + text_embeddings: list[EmbeddingReturnType] = [] if audio_urls: - audio_embeddings, usage = await engine.audio_embed(audios=audio_urls) # type: ignore - if texts: - text_embeddings, usage = await engine.embed(sentences=texts) + audio_embeddings, usage_audio = await engine.audio_embed(audios=audio_urls) # type: ignore + total_usage += usage_audio + if sentences: + text_embeddings, usage_text = await engine.embed(sentences=sentences) + total_usage += usage_text embeddings_with_restored_order = [] for is_audio in is_audios: @@ -455,7 +461,7 @@ async def _embeddings_audio(data: AudioEmbeddingInput): embeddings=embeddings_with_restored_order, engine_args=engine.engine_args, encoding_format=data.encoding_format, - usage=usage, + usage=total_usage, ) except AudioCorruption as ex: raise errors.OpenAIException( From a74a5dfe40997bb7f152cdb9c5c21b07bb8bdb81 Mon Sep 17 00:00:00 2001 From: wirthual Date: Tue, 1 Oct 2024 03:24:41 +0200 Subject: [PATCH 14/14] update endpoint usage for mixed case. 
From a74a5dfe40997bb7f152cdb9c5c21b07bb8bdb81 Mon Sep 17 00:00:00 2001
From: wirthual
Date: Tue, 1 Oct 2024 03:24:41 +0200
Subject: [PATCH 14/14] update endpoint usage for mixed case. Extend vision
 cases

---
 .../infinity_emb/fastapi_schemas/pymodels.py  | 13 +---
 .../infinity_emb/infinity_server.py           | 52 +++-----------
 .../infinity_emb/tests/end_to_end/conftest.py |  6 ++
 .../tests/end_to_end/test_torch_audio.py      | 75 ++++++++-----------
 .../tests/end_to_end/test_torch_vision.py     | 49 +++++++++++-
 5 files changed, 98 insertions(+), 97 deletions(-)

diff --git a/libs/infinity_emb/infinity_emb/fastapi_schemas/pymodels.py b/libs/infinity_emb/infinity_emb/fastapi_schemas/pymodels.py
index 7bb93f41..3852b497 100644
--- a/libs/infinity_emb/infinity_emb/fastapi_schemas/pymodels.py
+++ b/libs/infinity_emb/infinity_emb/fastapi_schemas/pymodels.py
@@ -93,17 +93,8 @@ class ImageEmbeddingInput(BaseModel):
     user: Optional[str] = None


-class AudioEmbeddingInput(BaseModel):
-    input: Union[  # type: ignore
-        conlist(  # type: ignore
-            Union[Annotated[AnyUrl, HttpUrl], str],
-            **ITEMS_LIMIT_SMALL,
-        ),
-        Union[Annotated[AnyUrl, HttpUrl], str],
-    ]
-    model: str = "default/not-specified"
-    encoding_format: EmbeddingEncodingFormat = EmbeddingEncodingFormat.float
-    user: Optional[str] = None
+class AudioEmbeddingInput(ImageEmbeddingInput):
+    pass


 class _EmbeddingObject(BaseModel):
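Note on the pymodels.py hunk above: AudioEmbeddingInput previously duplicated every field of ImageEmbeddingInput, so the two request schemas could drift apart; subclassing keeps them identical by construction. A minimal sketch of the pattern (fields abbreviated here; the real models additionally constrain list length with conlist and validate URL types):

    from typing import Optional, Union

    from pydantic import BaseModel

    class ImageEmbeddingInput(BaseModel):
        input: Union[list[str], str]
        model: str = "default/not-specified"
        user: Optional[str] = None

    class AudioEmbeddingInput(ImageEmbeddingInput):
        pass  # inherits all fields; add audio-specific ones here only if ever needed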
diff --git a/libs/infinity_emb/infinity_emb/infinity_server.py b/libs/infinity_emb/infinity_emb/infinity_server.py
index da98d720..73bb5996 100644
--- a/libs/infinity_emb/infinity_emb/infinity_server.py
+++ b/libs/infinity_emb/infinity_emb/infinity_server.py
@@ -8,7 +8,6 @@
 import time
 from contextlib import asynccontextmanager
 from typing import Any, Optional
-from urllib.parse import urlparse

 import infinity_emb
 from infinity_emb._optional_imports import CHECK_TYPER, CHECK_UVICORN
@@ -33,7 +32,6 @@
     Device,
     Dtype,
     EmbeddingDtype,
-    EmbeddingReturnType,
     ImageCorruption,
     InferenceEngine,
     ModelNotDeployedError,
@@ -413,59 +411,29 @@ async def _embeddings_audio(data: AudioEmbeddingInput):
             json={"model":"laion/larger_clap_general","input":["https://github.com/michaelfeil/infinity/raw/3b72eb7c14bae06e68ddd07c1f23fe0bf403f220/libs/infinity_emb/tests/data/audio/beep.wav"]})
         """
         engine = _resolve_engine(data.model)
-        sentences_and_urls: list[str] = []
-        if isinstance(data.input, str):
-            sentences_and_urls.append(data.input)
+        if hasattr(data.input, "host"):
+            # if it is a single url
+            audio_inputs = [str(data.input)]
         else:
-            sentences_and_urls = data.input  # type: ignore
-        audio_urls = []
-        sentences = []
-        is_audios = []
-        for input in sentences_and_urls:
-            parsed_url = urlparse(input)
-            # Todo: Improve url check
-            if parsed_url.netloc and parsed_url.scheme:
-                # if it is a single url
-                audio_urls.append(str(input))
-                is_audios.append(True)
-            else:
-                sentences.append(input)  # type: ignore
-                is_audios.append(False)
+            audio_inputs = [str(d) for d in data.input]  # type: ignore
         try:
-            logger.debug(
-                f"[📝] Received request with {len(audio_urls)} Urls and {len(sentences)} sentences"
-            )
+            logger.debug("[📝] Received request with %s Urls ", len(audio_inputs))
             start = time.perf_counter()
-            total_usage = 0
-            audio_embeddings: list[EmbeddingReturnType] = []
-            text_embeddings: list[EmbeddingReturnType] = []
-            if audio_urls:
-                audio_embeddings, usage_audio = await engine.audio_embed(audios=audio_urls)  # type: ignore
-                total_usage += usage_audio
-            if sentences:
-                text_embeddings, usage_text = await engine.embed(sentences=sentences)
-                total_usage += usage_text
-
-            embeddings_with_restored_order = []
-            for is_audio in is_audios:
-                if is_audio:
-                    embeddings_with_restored_order.append(audio_embeddings.pop(0))
-                else:
-                    embeddings_with_restored_order.append(text_embeddings.pop(0))
+            embedding, usage = await engine.audio_embed(audios=audio_inputs)  # type: ignore

             duration = (time.perf_counter() - start) * 1000
-            logger.debug(f"[✅] Done in {duration} ms")
+            logger.debug("[✅] Done in %s ms", duration)

             return OpenAIEmbeddingResult.to_embeddings_response(
-                embeddings=embeddings_with_restored_order,
+                embeddings=embedding,
                 engine_args=engine.engine_args,
                 encoding_format=data.encoding_format,
-                usage=total_usage,
+                usage=usage,
             )
         except AudioCorruption as ex:
             raise errors.OpenAIException(
-                f"AudioCorruption, could not open {audio_urls} -> {ex}",
+                f"AudioCorruption, could not open {audio_inputs} -> {ex}",
                 code=status.HTTP_400_BAD_REQUEST,
             )
         except ModelNotDeployedError as ex:
diff --git a/libs/infinity_emb/tests/end_to_end/conftest.py b/libs/infinity_emb/tests/end_to_end/conftest.py
index e443ee04..3ce61fc8 100644
--- a/libs/infinity_emb/tests/end_to_end/conftest.py
+++ b/libs/infinity_emb/tests/end_to_end/conftest.py
@@ -4,6 +4,8 @@

 import numpy as np
 import pytest
+from numpy import dot
+from numpy.linalg import norm


 class Helpers:
@@ -98,6 +100,10 @@ async def embedding_verify(client, model_base, prefix, model_name, decimal=3):
                 embedding["embedding"], st_embedding, decimal=decimal
             )

+    @staticmethod
+    def cosine_similarity(a, b):
+        return dot(a, b) / (norm(a) * norm(b))
+

 @pytest.fixture
 def helpers():
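Note on the conftest.py hunk above: the cosine-similarity helper moves onto the shared Helpers class so both the audio and the vision test modules can reach it through the `helpers` fixture instead of keeping private copies. As a quick sanity reference, the metric behaves like this (same numpy formula as the fixture):

    import numpy as np

    def cosine_similarity(a, b):
        # dot(a, b) / (|a| * |b|): 1.0 for parallel vectors, 0.0 for orthogonal ones
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

    assert abs(cosine_similarity([1.0, 0.0], [1.0, 0.0]) - 1.0) < 1e-9
    assert abs(cosine_similarity([1.0, 0.0], [0.0, 1.0])) < 1e-9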
"https://github.com/michaelfeil/infinity/raw/3b72eb7c14bae06e68ddd07c1f23fe0bf403f220/libs/infinity_emb/tests/data/audio/beep.wav" + rdata_text = response_text.json() + rdata_results_text = rdata_text["data"] - input = [audio_url, "a beep", "a horse", "a fish"] + rdata_audio = response_audio.json() + rdata_results_audio = rdata_audio["data"] - response = await client.post( - f"{PREFIX}/embeddings_audio", - json={"model": MODEL, "input": input}, - ) - assert response.status_code == 200 - rdata = response.json() - rdata_results = rdata["data"] + embeddings_audio_beep = rdata_results_audio[0]["embedding"] + embeddings_text_beep = rdata_results_text[0]["embedding"] + embeddings_text_horse = rdata_results_text[1]["embedding"] + embeddings_text_fish = rdata_results_text[2]["embedding"] - embeddings_audio_beep = rdata_results[0]["embedding"] - embeddings_text_beep = rdata_results[1]["embedding"] - embeddings_text_horse = rdata_results[2]["embedding"] - embeddings_text_fish = rdata_results[3]["embedding"] - assert cosine_similarity( + assert helpers.cosine_similarity( embeddings_audio_beep, embeddings_text_beep - ) > cosine_similarity(embeddings_audio_beep, embeddings_text_fish) - assert cosine_similarity( + ) > helpers.cosine_similarity(embeddings_audio_beep, embeddings_text_fish) + assert helpers.cosine_similarity( embeddings_audio_beep, embeddings_text_beep - ) > cosine_similarity(embeddings_audio_beep, embeddings_text_horse) + ) > helpers.cosine_similarity(embeddings_audio_beep, embeddings_text_horse) @pytest.mark.anyio @@ -174,3 +156,12 @@ async def test_audio_empty(client): json={"model": MODEL, "input": audio_url_empty}, ) assert response_empty.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY + + +@pytest.mark.anyio +async def test_unsupported_endpoints(client): + response_unsupported = await client.post( + f"{PREFIX}/classify", + json={"model": MODEL, "input": ["test"]}, + ) + assert response_unsupported.status_code == status.HTTP_400_BAD_REQUEST diff --git a/libs/infinity_emb/tests/end_to_end/test_torch_vision.py b/libs/infinity_emb/tests/end_to_end/test_torch_vision.py index f6e5b8a1..e9a7d68a 100644 --- a/libs/infinity_emb/tests/end_to_end/test_torch_vision.py +++ b/libs/infinity_emb/tests/end_to_end/test_torch_vision.py @@ -62,12 +62,11 @@ async def test_vision_single(client): @pytest.mark.anyio -@pytest.mark.skip("text only") async def test_vision_single_text_only(client): text = "a image of a cat" response = await client.post( - f"{PREFIX}/embeddings_image", + f"{PREFIX}/embeddings", json={"model": MODEL, "input": text}, ) assert response.status_code == 200 @@ -79,6 +78,43 @@ async def test_vision_single_text_only(client): assert len(rdata_results[0]["embedding"]) > 0 +@pytest.mark.anyio +async def test_meta(client, helpers): + image_url = "http://images.cocodataset.org/val2017/000000039769.jpg" + + text_input = ["a cat", "a car", "a fridge"] + image_input = [image_url] + response_text = await client.post( + f"{PREFIX}/embeddings", + json={"model": MODEL, "input": text_input}, + ) + response_image = await client.post( + f"{PREFIX}/embeddings_image", + json={"model": MODEL, "input": image_input}, + ) + + assert response_text.status_code == 200 + assert response_image.status_code == 200 + + rdata_text = response_text.json() + rdata_results_text = rdata_text["data"] + + rdata_image = response_image.json() + rdata_results_image = rdata_image["data"] + + embeddings_image_cat = rdata_results_image[0]["embedding"] + embeddings_text_cat = rdata_results_text[0]["embedding"] + 
diff --git a/libs/infinity_emb/tests/end_to_end/test_torch_vision.py b/libs/infinity_emb/tests/end_to_end/test_torch_vision.py
index f6e5b8a1..e9a7d68a 100644
--- a/libs/infinity_emb/tests/end_to_end/test_torch_vision.py
+++ b/libs/infinity_emb/tests/end_to_end/test_torch_vision.py
@@ -62,12 +62,11 @@ async def test_vision_single(client):


 @pytest.mark.anyio
-@pytest.mark.skip("text only")
 async def test_vision_single_text_only(client):
     text = "a image of a cat"

     response = await client.post(
-        f"{PREFIX}/embeddings_image",
+        f"{PREFIX}/embeddings",
         json={"model": MODEL, "input": text},
     )
     assert response.status_code == 200
@@ -79,6 +78,43 @@ async def test_vision_single_text_only(client):
     assert len(rdata_results[0]["embedding"]) > 0


+@pytest.mark.anyio
+async def test_meta(client, helpers):
+    image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+
+    text_input = ["a cat", "a car", "a fridge"]
+    image_input = [image_url]
+    response_text = await client.post(
+        f"{PREFIX}/embeddings",
+        json={"model": MODEL, "input": text_input},
+    )
+    response_image = await client.post(
+        f"{PREFIX}/embeddings_image",
+        json={"model": MODEL, "input": image_input},
+    )
+
+    assert response_text.status_code == 200
+    assert response_image.status_code == 200
+
+    rdata_text = response_text.json()
+    rdata_results_text = rdata_text["data"]
+
+    rdata_image = response_image.json()
+    rdata_results_image = rdata_image["data"]
+
+    embeddings_image_cat = rdata_results_image[0]["embedding"]
+    embeddings_text_cat = rdata_results_text[0]["embedding"]
+    embeddings_text_car = rdata_results_text[1]["embedding"]
+    embeddings_text_fridge = rdata_results_text[2]["embedding"]
+
+    assert helpers.cosine_similarity(
+        embeddings_image_cat, embeddings_text_cat
+    ) > helpers.cosine_similarity(embeddings_image_cat, embeddings_text_car)
+    assert helpers.cosine_similarity(
+        embeddings_image_cat, embeddings_text_cat
+    ) > helpers.cosine_similarity(embeddings_image_cat, embeddings_text_fridge)
+
+
 @pytest.mark.anyio
 @pytest.mark.parametrize("no_of_images", [1, 5, 10])
 async def test_vision_multiple(client, no_of_images):
@@ -119,3 +155,12 @@ async def test_vision_empty(client):
         json={"model": MODEL, "input": image_url_empty},
     )
     assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY
+
+
+@pytest.mark.anyio
+async def test_unsupported_endpoints(client):
+    response_unsupported = await client.post(
+        f"{PREFIX}/classify",
+        json={"model": MODEL, "input": ["test"]},
+    )
+    assert response_unsupported.status_code == status.HTTP_400_BAD_REQUEST
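Closing note on the series: both test modules now end with a byte-identical test_unsupported_endpoints, asserting that /classify returns HTTP 400 for an embedding-only model. If the duplication grows, a shared helper on the Helpers class would be a natural follow-up (a sketch only, not part of this series):

    # hypothetical addition to conftest.py's Helpers
    @staticmethod
    async def assert_route_unsupported(client, prefix: str, model: str, route: str = "classify"):
        response = await client.post(
            f"{prefix}/{route}",
            json={"model": model, "input": ["test"]},
        )
        assert response.status_code == 400  # status.HTTP_400_BAD_REQUEST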