Merge pull request #195 from michaelfeil/offline-model-setup
update infinity offline solution
michaelfeil authored Apr 8, 2024
2 parents 13e9564 + cdd7a21 commit eac5226
Showing 2 changed files with 64 additions and 31 deletions.
40 changes: 30 additions & 10 deletions libs/infinity_emb/Dockerfile
@@ -67,6 +67,19 @@ poetry run python -m pytest tests/end_to_end/test_api_with_dummymodel.py -x ; \
fi
RUN echo "all tests passed" > "test_results.txt"

# Use a multi-stage build -> tested base image shared by the production targets
FROM base AS tested-builder

COPY --from=builder /app /app
# force testing stage to run
COPY --from=testing /app/test_results.txt /app/test_results.txt

ENV HF_HOME=/app/.cache/huggingface
ENV PATH=/app/.venv/bin:$PATH

# do nothing
RUN echo "copied all files"

# Export with tensorrt, not recommended.
# docker buildx build --target=production-tensorrt -f Dockerfile .
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS production-tensorrt
@@ -77,21 +90,28 @@ RUN apt-get update && apt-get install python3-dev python3-pip $PYTHON build-esse
COPY --from=builder /app /app
# force testing stage to run
COPY --from=testing /app/test_results.txt /app/test_results.txt
ENV SENTENCE_TRANSFORMERS_HOME=/app/.cache/torch
ENV HF_HOME=/app/.cache/torch
ENV PATH=/app/.venv/bin:$PATH
RUN pip install --no-cache-dir "onnxruntime-gpu==1.17.0" "tensorrt==8.6.*"
ENV LD_LIBRARY_PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/$(PYTHON)/site-packages/tensorrt_libs:${LD_LIBRARY_PATH}
ENV PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt/bin:${PATH}
ENTRYPOINT ["infinity_emb"]

# Use a multi-stage build -> production version
FROM base AS production

COPY --from=builder /app /app
# force testing stage to run
COPY --from=testing /app/test_results.txt /app/test_results.txt

ENV SENTENCE_TRANSFORMERS_HOME=/app/.cache/torch
ENV PATH=/app/.venv/bin:$PATH
# Use a multi-stage build -> production version, with download
# docker buildx build --target=production-with-download \
# --build-arg MODEL_NAME=BAAI/bge-small-en-v1.5 --build-arg ENGINE=torch -f Dockerfile -t infinity-BAAI-small .
FROM tested-builder AS production-with-download
# collect model name and engine from build args
ARG MODEL_NAME
RUN if [ -z "${MODEL_NAME}" ]; then echo "Error: Build argument MODEL_NAME not set." && exit 1; fi
ARG ENGINE
RUN if [ -z "${ENGINE}" ]; then echo "Error: Build argument ENGINE not set." && exit 1; fi
ARG EXTRA_PACKAGES
RUN if [ -z "${EXTRA_PACKAGES}" ]; ; then python -m pip install --no-cache-dir $EXTRA_PACKAGES; fi

RUN infinity_emb --model-name-or-path $MODEL_NAME --engine $ENGINE --preload-only || true
ENTRYPOINT ["infinity_emb"]

# Use a multi-stage build -> production version
FROM tested-builder AS production
ENTRYPOINT ["infinity_emb"]
55 changes: 34 additions & 21 deletions libs/infinity_emb/infinity_emb/infinity_server.py
@@ -1,4 +1,5 @@
import time
from contextlib import asynccontextmanager
from typing import Optional

import infinity_emb
@@ -31,13 +32,41 @@ def create_server(
url_prefix: str = "",
doc_extra: dict = {},
redirect_slash: str = "/docs",
preload_only: bool = False,
):
"""
creates the FastAPI App
"""
from fastapi import FastAPI, responses, status
from prometheus_fastapi_instrumentator import Instrumentator

@asynccontextmanager
async def lifespan(app: FastAPI):
instrumentator.expose(app) # type: ignore
app.model = AsyncEmbeddingEngine.from_args(engine_args) # type: ignore
# start in a threadpool
await app.model.astart() # type: ignore

logger.info(
docs.startup_message(
host=doc_extra.pop("host", "localhost"),
port=doc_extra.pop("port", "PORT"),
prefix=url_prefix,
)
)

if preload_only:
logger.info(
f"Preloaded configuration successfully. {engine_args} "
" -> Non-graceful exit ."
)
# skip the blocking part
else:
# application is blocking here!
yield
await app.model.astop() # type: ignore
# shutdown!

app = FastAPI(
title=docs.FASTAPI_TITLE,
summary=docs.FASTAPI_SUMMARY,
@@ -49,30 +78,12 @@ def create_server(
"name": "MIT License",
"identifier": "MIT",
},
lifespan=lifespan,
)

instrumentator = Instrumentator().instrument(app)
app.add_exception_handler(errors.OpenAIException, errors.openai_exception_handler)

@app.on_event("startup")
async def _startup():
instrumentator.expose(app)

app.model = AsyncEmbeddingEngine.from_args(engine_args)
# start in a threadpool
await app.model.astart()

logger.info(
docs.startup_message(
host=doc_extra.pop("host", "localhost"),
port=doc_extra.pop("port", "PORT"),
prefix=url_prefix,
)
)

@app.on_event("shutdown")
async def _shutdown():
await app.model.astop()

@app.get("/health")
async def _health() -> dict[str, float]:
"""
@@ -232,6 +243,7 @@ def _start_uvicorn(
pooling_method: PoolingMethod.names_enum() = PoolingMethod.names_enum().auto.name, # type: ignore
compile: bool = False,
bettertransformer: bool = True,
preload_only: bool = False,
):
"""Infinity Embedding API ♾️ cli to start a uvicorn-server instance;
MIT License; Copyright (c) 2023-now Michael Feil
@@ -260,7 +272,7 @@ def _start_uvicorn(
pooling_method, PoolingMethod: pooling method to use. Defaults to PoolingMethod.auto or "auto"
compile, bool: compile model for faster inference. Defaults to False.
bettertransformer, bool: use bettertransformer. Defaults to True.
preload_only, bool: only preload the model and exit. Defaults to False.
"""
CHECK_UVICORN.mark_required()
import uvicorn
@@ -293,6 +305,7 @@ def _start_uvicorn(
url_prefix=url_prefix,
doc_extra=dict(host=host, port=port),
redirect_slash=redirect_slash,
preload_only=preload_only,
)
uvicorn.run(app, host=host, port=port, log_level=log_level.name)
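
The server-side change replaces the deprecated @app.on_event("startup")/@app.on_event("shutdown") hooks with a single lifespan context manager and threads the new preload_only flag through to it: when preload_only is set, the generator returns without ever yielding, so the ASGI lifespan startup fails and uvicorn exits right after the model has been loaded and cached, which is the "non-graceful exit" the log message refers to. Below is a minimal, self-contained sketch of the lifespan pattern itself; the model handling is a placeholder, not infinity_emb code.

from contextlib import asynccontextmanager

from fastapi import FastAPI


@asynccontextmanager
async def lifespan(app: FastAPI):
    app.state.model = "loaded"   # startup work runs before the yield
    yield                        # the application serves requests here
    app.state.model = None       # shutdown work runs after the yield


app = FastAPI(lifespan=lifespan)
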

