Merge pull request #195 from michaelfeil/offline-model-setup
update infinity offline solution
michaelfeil authored Apr 8, 2024
2 parents 13e9564 + cdd7a21 commit eac5226
Showing 2 changed files with 64 additions and 31 deletions.
40 changes: 30 additions & 10 deletions libs/infinity_emb/Dockerfile
@@ -67,6 +67,19 @@ poetry run python -m pytest tests/end_to_end/test_api_with_dummymodel.py -x ; \
fi
RUN echo "all tests passed" > "test_results.txt"

# Use a multi-stage build -> tested base image shared by the production targets
FROM base AS tested-builder

COPY --from=builder /app /app
# force testing stage to run
COPY --from=testing /app/test_results.txt /app/test_results.txt

ENV HF_HOME=/app/.cache/huggingface
ENV PATH=/app/.venv/bin:$PATH

# do nothing
RUN echo "copied all files"

# Export with tensorrt, not recommended.
# docker buildx build --target=production-tensorrt -f Dockerfile .
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS production-tensorrt
@@ -77,21 +90,28 @@ RUN apt-get update && apt-get install python3-dev python3-pip $PYTHON build-esse
COPY --from=builder /app /app
# force testing stage to run
COPY --from=testing /app/test_results.txt /app/test_results.txt
ENV SENTENCE_TRANSFORMERS_HOME=/app/.cache/torch
ENV HF_HOME=/app/.cache/torch
ENV PATH=/app/.venv/bin:$PATH
RUN pip install --no-cache-dir "onnxruntime-gpu==1.17.0" "tensorrt==8.6.*"
ENV LD_LIBRARY_PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/$(PYTHON)/site-packages/tensorrt_libs:${LD_LIBRARY_PATH}
ENV PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt/bin:${PATH}
ENTRYPOINT ["infinity_emb"]

# Use a multi-stage build -> production version
FROM base AS production

COPY --from=builder /app /app
# force testing stage to run
COPY --from=testing /app/test_results.txt /app/test_results.txt

ENV SENTENCE_TRANSFORMERS_HOME=/app/.cache/torch
ENV PATH=/app/.venv/bin:$PATH
# Use a multi-stage build -> production version, with download
# docker buildx build --target=production-with-download \
# --build-arg MODEL_NAME=BAAI/bge-small-en-v1.5 --build-arg ENGINE=torch -f Dockerfile -t infinity-BAAI-small .
FROM tested-builder AS production-with-download
# collect model name and engine from build args
ARG MODEL_NAME
RUN if [ -z "${MODEL_NAME}" ]; then echo "Error: Build argument MODEL_NAME not set." && exit 1; fi
ARG ENGINE
RUN if [ -z "${ENGINE}" ]; then echo "Error: Build argument ENGINE not set." && exit 1; fi
ARG EXTRA_PACKAGES
RUN if [ -z "${EXTRA_PACKAGES}" ]; ; then python -m pip install --no-cache-dir $EXTRA_PACKAGES; fi

RUN infinity_emb --model-name-or-path $MODEL_NAME --engine $ENGINE --preload-only || true
ENTRYPOINT ["infinity_emb"]

# Use a multi-stage build -> production version
FROM tested-builder AS production
ENTRYPOINT ["infinity_emb"]
55 changes: 34 additions & 21 deletions libs/infinity_emb/infinity_emb/infinity_server.py
@@ -1,4 +1,5 @@
import time
from contextlib import asynccontextmanager
from typing import Optional

import infinity_emb
@@ -31,13 +32,41 @@ def create_server(
url_prefix: str = "",
doc_extra: dict = {},
redirect_slash: str = "/docs",
preload_only: bool = False,
):
"""
creates the FastAPI App
"""
from fastapi import FastAPI, responses, status
from prometheus_fastapi_instrumentator import Instrumentator

@asynccontextmanager
async def lifespan(app: FastAPI):
instrumentator.expose(app) # type: ignore
app.model = AsyncEmbeddingEngine.from_args(engine_args) # type: ignore
# start in a threadpool
await app.model.astart() # type: ignore

logger.info(
docs.startup_message(
host=doc_extra.pop("host", "localhost"),
port=doc_extra.pop("port", "PORT"),
prefix=url_prefix,
)
)

if preload_only:
logger.info(
f"Preloaded configuration successfully. {engine_args} "
" -> Non-graceful exit ."
)
# skip the blocking part
else:
# application is blocking here!
yield
await app.model.astop() # type: ignore
# shutdown!

app = FastAPI(
title=docs.FASTAPI_TITLE,
summary=docs.FASTAPI_SUMMARY,
@@ -49,30 +78,12 @@ def create_server(
"name": "MIT License",
"identifier": "MIT",
},
lifespan=lifespan,
)

instrumentator = Instrumentator().instrument(app)
app.add_exception_handler(errors.OpenAIException, errors.openai_exception_handler)

@app.on_event("startup")
async def _startup():
instrumentator.expose(app)

app.model = AsyncEmbeddingEngine.from_args(engine_args)
# start in a threadpool
await app.model.astart()

logger.info(
docs.startup_message(
host=doc_extra.pop("host", "localhost"),
port=doc_extra.pop("port", "PORT"),
prefix=url_prefix,
)
)

@app.on_event("shutdown")
async def _shutdown():
await app.model.astop()

@app.get("/health")
async def _health() -> dict[str, float]:
"""
@@ -232,6 +243,7 @@ def _start_uvicorn(
pooling_method: PoolingMethod.names_enum() = PoolingMethod.names_enum().auto.name, # type: ignore
compile: bool = False,
bettertransformer: bool = True,
preload_only: bool = False,
):
"""Infinity Embedding API ♾️ cli to start a uvicorn-server instance;
MIT License; Copyright (c) 2023-now Michael Feil
@@ -260,7 +272,7 @@ def _start_uvicorn(
pooling_method, PoolingMethod: pooling method to use. Defaults to PoolingMethod.auto or "auto"
compile, bool: compile model for faster inference. Defaults to False.
bettertransformer, bool: use bettertransformer. Defaults to True.
preload_only, bool: only preload the model and exit. Defaults to False.
"""
CHECK_UVICORN.mark_required()
import uvicorn
@@ -293,6 +305,7 @@ def _start_uvicorn(
url_prefix=url_prefix,
doc_extra=dict(host=host, port=port),
redirect_slash=redirect_slash,
preload_only=preload_only,
)
uvicorn.run(app, host=host, port=port, log_level=log_level.name)
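
The server-side change replaces the deprecated @app.on_event("startup")/@app.on_event("shutdown") hooks with a single lifespan context manager and threads the new preload_only flag through to it: when preload_only is set, the generator returns without ever yielding, so the ASGI lifespan startup fails and uvicorn exits right after the model has been loaded and cached, which is the "non-graceful exit" the log message refers to. Below is a minimal, self-contained sketch of the lifespan pattern itself; the model handling is a placeholder, not infinity_emb code.

from contextlib import asynccontextmanager

from fastapi import FastAPI


@asynccontextmanager
async def lifespan(app: FastAPI):
    app.state.model = "loaded"   # startup work runs before the yield
    yield                        # the application serves requests here
    app.state.model = None       # shutdown work runs after the yield


app = FastAPI(lifespan=lifespan)
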

