add openvino, trt (#460)
* add openvino, trt

* remove .lock

* add sentencepiece
michaelfeil authored Nov 12, 2024
1 parent cdbe888 commit 9206840
Showing 12 changed files with 137 additions and 1,105 deletions.
15 changes: 10 additions & 5 deletions libs/infinity_emb/Docker.template.yaml
@@ -12,10 +12,14 @@ cpu:
# RUN sed -i 's|torch = "2.4.1"|torch = "2.5.0"|' pyproject.toml
# RUN sed -i 's|"pypi"|"pytorch_cpu"|' pyproject.toml
# RUN poetry lock --no-update
poetry_extras: "all openvino"
main_install: |
# "RUN poetry install --no-interaction --no-ansi --no-root --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all"
COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh
RUN ./requirements_install_from_poetry.sh --no-root --without lint,test "https://download.pytorch.org/whl/cpu"
extra_env_variables: |
# Sets default to onnx
ENV INFINITY_ENGINE="optimum"
amd:
# 2 . command: jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s amd > Dockerfile.amd_auto
@@ -29,19 +33,20 @@ amd:
# "RUN poetry install --no-interaction --no-ansi --no-root --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all"
COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh
RUN ./requirements_install_from_poetry.sh --no-root --without lint,test "https://download.pytorch.org/whl/rocm6.2"
poetry_extras: "all onnxruntime-gpu"
python_version: python3.10

trt:
base_image: nvidia/cuda:12.1.1-devel-ubuntu22.04
base_image: nvidia/cuda:12.3.2-cudnn9-devel-ubuntu22.04
poetry_extras: "all onnxruntime-gpu"
extra_installs_main: |
# Install utils for tensorrt
RUN apt-get install -y --no-install-recommends openmpi-bin libopenmpi-dev git git-lfs python3-pip
RUN poetry run $PYTHON -m pip install --no-cache-dir flash-attn --no-build-isolation
RUN poetry run $PYTHON -m pip install --no-cache-dir "tensorrt==10.0.1" "tensorrt_lean==10.0.1" "tensorrt_dispatch==10.0.1"
ENV LD_LIBRARY_PATH /app/.venv/lib/${PYTHON}/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/${PYTHON}/site-packages/tensorrt_libs:${LD_LIBRARY_PATH}
ENV PATH /app/.venv/lib/${PYTHON}/site-packages/tensorrt/bin:${PATH}
RUN poetry run $PYTHON -m pip install --no-cache-dir "tensorrt==10.3.0" "tensorrt_lean==10.3.0" "tensorrt_dispatch==10.3.0"
extra_env_variables: |
# Set default to tensorrt
ENV LD_LIBRARY_PATH=/app/.venv/lib/${PYTHON}/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/${PYTHON}/site-packages/tensorrt_libs:${LD_LIBRARY_PATH}
ENV PATH=/app/.venv/lib/${PYTHON}/site-packages/tensorrt/bin:${PATH}
python_version: python3.10
main_install: "RUN poetry install --no-interaction --no-ansi --no-root --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all"
21 changes: 1 addition & 20 deletions libs/infinity_emb/Dockerfile.amd_auto
@@ -91,26 +91,7 @@ COPY --from=testing /app/test_results.txt /app/test_results.txt
ENV HF_HOME=/app/.cache/huggingface
ENV PATH=/app/.venv/bin:$PATH
# do nothing
RUN echo "copied all files"


# Export with tensorrt, not recommended.
# docker buildx build --target=production-tensorrt -f Dockerfile .
# FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS production-tensorrt
# ENV PYTHONUNBUFFERED=1 \
# PIP_NO_CACHE_DIR=off \
# PYTHON="python3.11"
# RUN apt-get update && apt-get install python3-dev python3-pip $PYTHON build-essential curl -y
# COPY --from=builder /app /app
# # force testing stage to run
# COPY --from=testing /app/test_results.txt /app/test_results.txt
# ENV HF_HOME=/app/.cache/torch
# ENV PATH=/app/.venv/bin:$PATH
# RUN pip install --no-cache-dir "onnxruntime-gpu==1.17.0" "tensorrt==8.6.*"
# ENV LD_LIBRARY_PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/$(PYTHON)/site-packages/tensorrt_libs:${LD_LIBRARY_PATH}
# ENV PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt/bin:${PATH}
# ENTRYPOINT ["infinity_emb"]

#

# Use a multi-stage build -> production version, with download
# docker buildx build --target=production-with-download \
23 changes: 3 additions & 20 deletions libs/infinity_emb/Dockerfile.cpu_auto
@@ -17,7 +17,7 @@ ENV PYTHONUNBUFFERED=1 \
POETRY_VIRTUALENVS_IN_PROJECT="true" \
# do not ask any interactive question
POETRY_NO_INTERACTION=1 \
EXTRAS="all" \
EXTRAS="all openvino" \
PYTHON="python3.11"
RUN apt-get update && apt-get install --no-install-recommends -y build-essential python3-dev libsndfile1 $PYTHON-venv $PYTHON curl
WORKDIR /app
@@ -91,25 +91,8 @@ COPY --from=testing /app/test_results.txt /app/test_results.txt
ENV HF_HOME=/app/.cache/huggingface
ENV PATH=/app/.venv/bin:$PATH
# do nothing
RUN echo "copied all files"


# Export with tensorrt, not recommended.
# docker buildx build --target=production-tensorrt -f Dockerfile .
# FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS production-tensorrt
# ENV PYTHONUNBUFFERED=1 \
# PIP_NO_CACHE_DIR=off \
# PYTHON="python3.11"
# RUN apt-get update && apt-get install python3-dev python3-pip $PYTHON build-essential curl -y
# COPY --from=builder /app /app
# # force testing stage to run
# COPY --from=testing /app/test_results.txt /app/test_results.txt
# ENV HF_HOME=/app/.cache/torch
# ENV PATH=/app/.venv/bin:$PATH
# RUN pip install --no-cache-dir "onnxruntime-gpu==1.17.0" "tensorrt==8.6.*"
# ENV LD_LIBRARY_PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/$(PYTHON)/site-packages/tensorrt_libs:${LD_LIBRARY_PATH}
# ENV PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt/bin:${PATH}
# ENTRYPOINT ["infinity_emb"]
# Sets default to onnx
ENV INFINITY_ENGINE="optimum"


# Use a multi-stage build -> production version, with download
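The CPU image now installs the openvino extra and defaults INFINITY_ENGINE to "optimum". A quick smoke test, assuming the extra pulls in an onnxruntime build that ships the OpenVINO execution provider (run inside the built image):

import onnxruntime as ort

# Provider names are onnxruntime's standard identifiers; the assertion fails
# if the OpenVINO execution provider did not make it into the image.
providers = ort.get_available_providers()
print(providers)
assert "OpenVINOExecutionProvider" in providers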
21 changes: 1 addition & 20 deletions libs/infinity_emb/Dockerfile.jinja2
@@ -82,26 +82,7 @@ COPY --from=testing /app/test_results.txt /app/test_results.txt
ENV HF_HOME=/app/.cache/huggingface
ENV PATH=/app/.venv/bin:$PATH
# do nothing
RUN echo "copied all files"


# Export with tensorrt, not recommended.
# docker buildx build --target=production-tensorrt -f Dockerfile .
# FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS production-tensorrt
# ENV PYTHONUNBUFFERED=1 \
# PIP_NO_CACHE_DIR=off \
# PYTHON="python3.11"
# RUN apt-get update && apt-get install python3-dev python3-pip $PYTHON build-essential curl -y
# COPY --from=builder /app /app
# # force testing stage to run
# COPY --from=testing /app/test_results.txt /app/test_results.txt
# ENV HF_HOME=/app/.cache/torch
# ENV PATH=/app/.venv/bin:$PATH
# RUN pip install --no-cache-dir "onnxruntime-gpu==1.17.0" "tensorrt==8.6.*"
# ENV LD_LIBRARY_PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/$(PYTHON)/site-packages/tensorrt_libs:${LD_LIBRARY_PATH}
# ENV PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt/bin:${PATH}
# ENTRYPOINT ["infinity_emb"]

{{extra_env_variables | default('#')}}

# Use a multi-stage build -> production version, with download
# docker buildx build --target=production-with-download \
21 changes: 1 addition & 20 deletions libs/infinity_emb/Dockerfile.nvidia_auto
@@ -82,26 +82,7 @@ COPY --from=testing /app/test_results.txt /app/test_results.txt
ENV HF_HOME=/app/.cache/huggingface
ENV PATH=/app/.venv/bin:$PATH
# do nothing
RUN echo "copied all files"


# Export with tensorrt, not recommended.
# docker buildx build --target=production-tensorrt -f Dockerfile .
# FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS production-tensorrt
# ENV PYTHONUNBUFFERED=1 \
# PIP_NO_CACHE_DIR=off \
# PYTHON="python3.11"
# RUN apt-get update && apt-get install python3-dev python3-pip $PYTHON build-essential curl -y
# COPY --from=builder /app /app
# # force testing stage to run
# COPY --from=testing /app/test_results.txt /app/test_results.txt
# ENV HF_HOME=/app/.cache/torch
# ENV PATH=/app/.venv/bin:$PATH
# RUN pip install --no-cache-dir "onnxruntime-gpu==1.17.0" "tensorrt==8.6.*"
# ENV LD_LIBRARY_PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/$(PYTHON)/site-packages/tensorrt_libs:${LD_LIBRARY_PATH}
# ENV PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt/bin:${PATH}
# ENTRYPOINT ["infinity_emb"]

#

# Use a multi-stage build -> production version, with download
# docker buildx build --target=production-with-download \
28 changes: 5 additions & 23 deletions libs/infinity_emb/Dockerfile.trt_onnx_auto
@@ -2,7 +2,7 @@
# This file is generated from Dockerfile.jinja2. Do not edit the Dockerfile.cuda|cpu|amd file directly.
# Only contribute to the Dockerfile.jinja2 and dockerfile_template.yaml and regenerate the Dockerfile.cuda|cpu|amd

FROM nvidia/cuda:12.1.1-devel-ubuntu22.04 AS base
FROM nvidia/cuda:12.3.2-cudnn9-devel-ubuntu22.04 AS base

ENV PYTHONUNBUFFERED=1 \
\
@@ -44,9 +44,7 @@ RUN poetry install --no-interaction --no-ansi --extras "${EXTRAS}" --without li
# Install utils for tensorrt
RUN apt-get install -y --no-install-recommends openmpi-bin libopenmpi-dev git git-lfs python3-pip
RUN poetry run $PYTHON -m pip install --no-cache-dir flash-attn --no-build-isolation
RUN poetry run $PYTHON -m pip install --no-cache-dir "tensorrt==10.0.1" "tensorrt_lean==10.0.1" "tensorrt_dispatch==10.0.1"
ENV LD_LIBRARY_PATH /app/.venv/lib/${PYTHON}/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/${PYTHON}/site-packages/tensorrt_libs:${LD_LIBRARY_PATH}
ENV PATH /app/.venv/lib/${PYTHON}/site-packages/tensorrt/bin:${PATH}
RUN poetry run $PYTHON -m pip install --no-cache-dir "tensorrt==10.3.0" "tensorrt_lean==10.3.0" "tensorrt_dispatch==10.3.0"



@@ -88,25 +86,9 @@ COPY --from=testing /app/test_results.txt /app/test_results.txt
ENV HF_HOME=/app/.cache/huggingface
ENV PATH=/app/.venv/bin:$PATH
# do nothing
RUN echo "copied all files"


# Export with tensorrt, not recommended.
# docker buildx build --target=production-tensorrt -f Dockerfile .
# FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS production-tensorrt
# ENV PYTHONUNBUFFERED=1 \
# PIP_NO_CACHE_DIR=off \
# PYTHON="python3.11"
# RUN apt-get update && apt-get install python3-dev python3-pip $PYTHON build-essential curl -y
# COPY --from=builder /app /app
# # force testing stage to run
# COPY --from=testing /app/test_results.txt /app/test_results.txt
# ENV HF_HOME=/app/.cache/torch
# ENV PATH=/app/.venv/bin:$PATH
# RUN pip install --no-cache-dir "onnxruntime-gpu==1.17.0" "tensorrt==8.6.*"
# ENV LD_LIBRARY_PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/$(PYTHON)/site-packages/tensorrt_libs:${LD_LIBRARY_PATH}
# ENV PATH /app/.venv/lib/$(PYTHON)/site-packages/tensorrt/bin:${PATH}
# ENTRYPOINT ["infinity_emb"]
# Set default to tensorrt
ENV LD_LIBRARY_PATH=/app/.venv/lib/${PYTHON}/site-packages/tensorrt:/usr/lib/x86_64-linux-gnu:/app/.venv/lib/${PYTHON}/site-packages/tensorrt_libs:${LD_LIBRARY_PATH}
ENV PATH=/app/.venv/lib/${PYTHON}/site-packages/tensorrt/bin:${PATH}


# Use a multi-stage build -> production version, with download
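The TensorRT image moves to CUDA 12.3.2 with cuDNN 9 and pins TensorRT 10.3.0, with the library-path exports now set as image-level env variables. A smoke test for the built image, assuming the pinned wheels and onnxruntime-gpu are importable there:

import tensorrt
import onnxruntime as ort

# The wheel pin above is 10.3.0; onnxruntime-gpu should expose the TensorRT EP
# once LD_LIBRARY_PATH points at the tensorrt site-packages directories.
assert tensorrt.__version__.startswith("10.3"), tensorrt.__version__
assert "TensorrtExecutionProvider" in ort.get_available_providers()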
2 changes: 1 addition & 1 deletion libs/infinity_emb/infinity_emb/inference/select_model.py
@@ -82,7 +82,7 @@ def select_model(

if engine_args.model_warmup:
# size one, warm up warm start timings.
loaded_engine.warmup(batch_size=engine_args.batch_size, n_tokens=1)
# loaded_engine.warmup(batch_size=engine_args.batch_size, n_tokens=1)
# size one token
min_inference_t = min(
min(loaded_engine.warmup(batch_size=1, n_tokens=1)[1] for _ in range(10)),
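This change drops the full-batch warmup call and keeps only the single-token timing loop. The pattern, sketched under the assumption (inferred from the [1] indexing above) that warmup() returns a tuple whose second element is the measured inference time:

def time_single_token(engine, runs: int = 10) -> float:
    # Best-case latency: repeat a batch-of-one, single-token warmup and take
    # the minimum over several runs to damp scheduler noise.
    return min(engine.warmup(batch_size=1, n_tokens=1)[1] for _ in range(runs))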
@@ -34,7 +34,7 @@ def __init__(self, *, engine_args: EngineArgs):
model_name_or_path=engine_args.model_name_or_path,
revision=engine_args.revision,
use_auth_token=True,
prefer_quantized="cpu" in provider.lower(),
prefer_quantized=("cpu" in provider.lower() or "openvino" in provider.lower()),
)

self.model = optimize_model(
@@ -42,7 +42,7 @@ def __init__(self, *, engine_args: EngineArgs):
model_name_or_path=engine_args.model_name_or_path,
revision=engine_args.revision,
use_auth_token=True,
prefer_quantized="cpu" in provider.lower(),
prefer_quantized=("cpu" in provider.lower() or "openvino" in provider.lower()),
)

self.pooling = (
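Both call sites now prefer quantized weights when the execution provider is CPU-like, which after this commit includes OpenVINO (it runs on CPU hardware). The shared predicate, factored into a hypothetical helper for clarity:

def should_prefer_quantized(provider: str) -> bool:
    provider = provider.lower()
    return "cpu" in provider or "openvino" in provider

assert should_prefer_quantized("OpenVINOExecutionProvider")
assert should_prefer_quantized("CPUExecutionProvider")
assert not should_prefer_quantized("CUDAExecutionProvider")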
27 changes: 21 additions & 6 deletions libs/infinity_emb/infinity_emb/transformer/utils_optimum.py
@@ -8,12 +8,13 @@
from huggingface_hub import HfApi, HfFolder # type: ignore
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE # type: ignore

from infinity_emb._optional_imports import CHECK_ONNXRUNTIME, CHECK_TORCH
from infinity_emb._optional_imports import CHECK_ONNXRUNTIME
from infinity_emb.log_handler import logger
from infinity_emb.primitives import Device

if CHECK_ONNXRUNTIME.is_available:
try:
import onnxruntime as ort # type: ignore
from optimum.modeling_base import OptimizedModel # type: ignore
from optimum.onnxruntime import ( # type: ignore
ORTModel,
@@ -23,9 +24,6 @@
except (ImportError, RuntimeError, Exception) as ex:
CHECK_ONNXRUNTIME.mark_dirty(ex)

if CHECK_TORCH.is_available:
import torch


def mean_pooling(last_hidden_states: np.ndarray, attention_mask: np.ndarray):
input_mask_expanded = (np.expand_dims(attention_mask, axis=-1)).astype(float)
@@ -49,17 +47,32 @@ def normalize(input_array, p=2, dim=1, eps=1e-12):


def device_to_onnx(device: Device) -> str:
CHECK_ONNXRUNTIME.mark_required()
available = ort.get_available_providers()

if device == Device.cpu:
if "OpenVINOExecutionProvider" in available:
return "OpenVINOExecutionProvider"
return "CPUExecutionProvider"
elif device == Device.cuda:
if "ROCMExecutionProvider" in available:
return "ROCMExecutionProvider"
return "CUDAExecutionProvider"
elif device == Device.mps:
return "CoreMLExecutionProvider"
elif device == Device.tensorrt:
return "TensorrtExecutionProvider"
elif device is None or device == Device.auto:
if CHECK_TORCH.is_available and torch.cuda.is_available():
if "TensorrtExecutionProvider" in available:
return "TensorrtExecutionProvider"
elif "CUDAExecutionProvider" in available:
return "CUDAExecutionProvider"
elif "ROCMExecutionProvider" in available:
return "ROCMExecutionProvider"
elif "CoreMLExecutionProvider" in available:
return "CoreMLExecutionProvider"
elif "OpenVINOExecutionProvider" in available:
return "OpenVINOExecutionProvider"
else:
return "CPUExecutionProvider"
else:
@@ -135,7 +148,9 @@ def optimize_model(

optimizer = ORTOptimizer.from_pretrained(unoptimized_model)

is_gpu = "cpu" not in execution_provider.lower()
is_gpu = not (
"cpu" in execution_provider.lower() or "openvino" in execution_provider.lower()
)
optimization_config = OptimizationConfig(
optimization_level=99,
optimize_with_onnxruntime_only=False,
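device_to_onnx now selects providers from onnxruntime's own list instead of probing torch, with an explicit priority order for Device.auto, and optimize_model treats OpenVINO like CPU when deciding whether to optimize for GPU. A usage sketch mirroring the diff (the AUTO_PRIORITY and pick_provider names are hypothetical):

import onnxruntime as ort

AUTO_PRIORITY = [
    "TensorrtExecutionProvider",
    "CUDAExecutionProvider",
    "ROCMExecutionProvider",
    "CoreMLExecutionProvider",
    "OpenVINOExecutionProvider",
    "CPUExecutionProvider",
]

def pick_provider() -> str:
    # Same order as the Device.auto branch; CPUExecutionProvider is always
    # available, so the generator never runs dry.
    available = ort.get_available_providers()
    return next(p for p in AUTO_PRIORITY if p in available)

provider = pick_provider()
# OpenVINO executes on CPU hardware, so ORT graph optimization should not
# target GPU kernels -- the same predicate optimize_model now uses.
is_gpu = not ("cpu" in provider.lower() or "openvino" in provider.lower())
print(provider, is_gpu)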
