[AMD] [ROCm] [Optimum] Add optimum-amd support #443

Open: wants to merge 12 commits into base `main`
32 changes: 32 additions & 0 deletions docs/docs/deploy.md
@@ -21,6 +21,38 @@ docker run -it --gpus all \
```
The cache path inside the Docker container is set by the environment variable `HF_HOME`.


### AMD Docker: Deploy on AMD GPUs (MI200 and MI300 series)
#### Launch the CLI using a pre-built docker container (recommended)

```bash
port=7997
model1=michaelfeil/bge-small-en-v1.5
model2=mixedbread-ai/mxbai-rerank-xsmall-v1
volume=$PWD/data

docker run -it \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--device=/dev/kfd \
--device=/dev/dri \
--group-add video \
--network host \
-v $volume:/app/.cache \
-p $port:$port \
michaelf34/infinity:latest-rocm \
v2 \
--model-id $model1 \
--model-id $model2 \
--port $port \
--engine torch \
--compile \
--no-bettertransformer
```
The cache path inside the Docker container is set by the environment variable `HF_HOME`.
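
Once the container is up, a quick sanity check is to hit Infinity's OpenAI-compatible embeddings route. A minimal sketch in Python, assuming the server is reachable on the port chosen above and that `requests` is installed:

```python
# Smoke-test the deployment (assumes port 7997 and model1 as set above).
import requests

resp = requests.post(
    "http://localhost:7997/embeddings",
    json={
        "model": "michaelfeil/bge-small-en-v1.5",
        "input": ["Which books are bestsellers?"],
    },
    timeout=30,
)
resp.raise_for_status()
embedding = resp.json()["data"][0]["embedding"]
print(len(embedding))  # embedding dimensionality, e.g. 384 for bge-small
```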



## Modal Labs

A deployment example for Modal Labs is located in the repo, including a GitHub Actions pipeline.
39 changes: 37 additions & 2 deletions libs/infinity_emb/Docker.template.yaml
@@ -29,8 +29,43 @@ amd:
# "RUN poetry install --no-interaction --no-ansi --no-root --extras \"${EXTRAS}\" --without lint,test && poetry cache clear pypi --all"
COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh
RUN ./requirements_install_from_poetry.sh --no-root --without lint,test "https://download.pytorch.org/whl/rocm6.2"

poetry_extras: "all onnxruntime-gpu"
extra_installs_main: |
ARG GPU_ARCH
ENV GPU_ARCH=${GPU_ARCH}
# GPU architecture specific installations
RUN cd /opt/rocm/share/amd_smi \
&& python -m pip wheel . --wheel-dir=/install
RUN apt update -y && apt install migraphx -y
RUN if [ "$GPU_ARCH" = "gfx90a" ] || [ "$GPU_ARCH" = "gfx942" ]; then \
# OPTION1: Follow the steps here to install onnxruntime-rocm
# https://huggingface.co/docs/optimum/onnxruntime/usage_guides/amdgpu
. .venv/bin/activate && python -m pip uninstall onnxruntime -y \
&& python -m pip install /install/*.whl \
&& python -m pip install cmake onnx \
&& (curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y) \
&& (. $HOME/.cargo/env) \
&& git clone --single-branch --branch main --recursive https://github.com/Microsoft/onnxruntime onnxruntime \
&& cd onnxruntime \
&& (./build.sh --config Release --build_wheel --allow_running_as_root --update --build --parallel --cmake_extra_defines CMAKE_HIP_ARCHITECTURES=${GPU_ARCH} ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) --use_rocm --use_migraphx --rocm_home=/opt/rocm) \
&& python -m pip uninstall onnxruntime -y \
&& python -m pip install build/Linux/Release/dist/* \
&& cp -r /app/onnxruntime/build/Linux/Release/*.so /usr/local/lib/ \
&& cp -r /app/onnxruntime/build/Linux/Release/*.so.* /usr/local/lib/ \
&& git clone https://github.com/huggingface/optimum-amd.git \
&& cd optimum-amd \
&& python -m pip install -e .; \
elif [ "$GPU_ARCH" = "gfx1100" ]; then \
# OPTION2: Install onnxruntime-rocm from the wheel
. .venv/bin/activate && python -m pip uninstall onnxruntime onnxruntime-rocm -y && python -m pip install "numpy<2" https://repo.radeon.com/rocm/manylinux/rocm-rel-6.2.3/onnxruntime_rocm-1.18.0-cp310-cp310-linux_x86_64.whl \
&& python -m pip install /install/*.whl \
&& git clone https://github.com/huggingface/optimum-amd.git /tmp-optimum \
&& cd /tmp-optimum \
&& python -m pip install .; \
else \
echo "Unsupported GPU_ARCH: ${GPU_ARCH}"; \
exit 1; \
fi
poetry_extras: "all"
python_version: python3.10

trt:
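The template branches on the `GPU_ARCH` build argument: gfx90a and gfx942 (MI200/MI300) build onnxruntime from source with ROCm and MIGraphX enabled, gfx1100 installs AMD's prebuilt wheel, and anything else fails the build. A small host-side sketch for discovering the value to pass, assuming `rocminfo` from the ROCm stack is installed:

```python
# Detect the GPU architecture to pass as --build-arg GPU_ARCH (a sketch;
# assumes ROCm's `rocminfo` is installed and an AMD GPU is present).
import re
import subprocess

out = subprocess.run(["rocminfo"], capture_output=True, text=True, check=True).stdout
archs = sorted(set(re.findall(r"\bgfx[0-9a-f]+\b", out)))
print("Detected GPU architecture(s):", archs)
# Then, for example: docker build -f Dockerfile.amd_auto --build-arg GPU_ARCH=gfx942 .
```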
41 changes: 38 additions & 3 deletions libs/infinity_emb/Dockerfile.amd_auto
@@ -17,7 +17,7 @@ ENV PYTHONUNBUFFERED=1 \
POETRY_VIRTUALENVS_IN_PROJECT="true" \
# do not ask any interactive question
POETRY_NO_INTERACTION=1 \
EXTRAS="all onnxruntime-gpu" \
EXTRAS="all" \
PYTHON="python3.10"
RUN apt-get update && apt-get install --no-install-recommends -y build-essential python3-dev libsndfile1 $PYTHON-venv $PYTHON curl
WORKDIR /app
@@ -47,7 +47,42 @@ COPY infinity_emb infinity_emb
COPY requirements_install_from_poetry.sh requirements_install_from_poetry.sh
RUN ./requirements_install_from_poetry.sh --without lint,test "https://download.pytorch.org/whl/rocm6.2"

#
ARG GPU_ARCH
ENV GPU_ARCH=${GPU_ARCH}
# GPU architecture specific installations
RUN cd /opt/rocm/share/amd_smi \
&& python -m pip wheel . --wheel-dir=/install
RUN apt update -y && apt install migraphx -y
RUN if [ "$GPU_ARCH" = "gfx90a" ] || [ "$GPU_ARCH" = "gfx942" ]; then \
# OPTION1: Follow the steps here to install onnxruntime-rocm
# https://huggingface.co/docs/optimum/onnxruntime/usage_guides/amdgpu
. .venv/bin/activate && python -m pip uninstall onnxruntime -y \
&& python -m pip install /install/*.whl \
&& python -m pip install cmake onnx \
&& (curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y) \
&& (. $HOME/.cargo/env) \
&& git clone --single-branch --branch main --recursive https://github.com/Microsoft/onnxruntime onnxruntime \
&& cd onnxruntime \
&& (./build.sh --config Release --build_wheel --allow_running_as_root --update --build --parallel --cmake_extra_defines CMAKE_HIP_ARCHITECTURES=${GPU_ARCH} ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) --use_rocm --use_migraphx --rocm_home=/opt/rocm) \
&& python -m pip uninstall onnxruntime -y \
&& python -m pip install build/Linux/Release/dist/* \
&& cp -r /app/onnxruntime/build/Linux/Release/*.so /usr/local/lib/ \
&& cp -r /app/onnxruntime/build/Linux/Release/*.so.* /usr/local/lib/ \
&& git clone https://github.com/huggingface/optimum-amd.git \
&& cd optimum-amd \
&& python -m pip install -e .; \
elif [ "$GPU_ARCH" = "gfx1100" ]; then \
# OPTION2: Install onnxruntime-rocm from the wheel
. .venv/bin/activate && python -m pip uninstall onnxruntime onnxruntime-rocm -y && python -m pip install "numpy<2" https://repo.radeon.com/rocm/manylinux/rocm-rel-6.2.3/onnxruntime_rocm-1.18.0-cp310-cp310-linux_x86_64.whl \
&& python -m pip install /install/*.whl \
&& git clone https://github.com/huggingface/optimum-amd.git /tmp-optimum \
&& cd /tmp-optimum \
&& python -m pip install .; \
else \
echo "Unsupported GPU_ARCH: ${GPU_ARCH}"; \
exit 1; \
fi



FROM builder as testing
@@ -127,4 +162,4 @@ ENTRYPOINT ["infinity_emb"]

# Use a multi-stage build -> production version
FROM tested-builder AS production
ENTRYPOINT ["infinity_emb"]
ENTRYPOINT ["infinity_emb"]
8 changes: 4 additions & 4 deletions libs/infinity_emb/Makefile
@@ -41,10 +41,10 @@ format format_diff:
[ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES)

template_docker:
jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s amd > Dockerfile.amd_auto
jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s cpu > Dockerfile.cpu_auto
jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s nvidia > Dockerfile.nvidia_auto
jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s trt > Dockerfile.trt_onnx_auto
poetry run jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s amd > Dockerfile.amd_auto
poetry run jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s cpu > Dockerfile.cpu_auto
poetry run jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s nvidia > Dockerfile.nvidia_auto
poetry run jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s trt > Dockerfile.trt_onnx_auto

poetry_check:
poetry check
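The `template_docker` target renders each Dockerfile from `Docker.template.yaml`, now through `poetry run` so the pinned jinja2-cli is used. Roughly what the first command does, as a Python sketch (assuming `-s amd` selects the `amd` subsection of the YAML, and that PyYAML and Jinja2 are available):

```python
# Approximate `jinja2 Dockerfile.jinja2 Docker.template.yaml --format=yaml -s amd`.
import yaml
from jinja2 import Template

with open("Docker.template.yaml") as f:
    context = yaml.safe_load(f)["amd"]  # `-s amd` selects this subsection

with open("Dockerfile.jinja2") as f:
    template = Template(f.read())

with open("Dockerfile.amd_auto", "w") as f:
    f.write(template.render(**context))
```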
1 change: 1 addition & 0 deletions libs/infinity_emb/infinity_emb/_optional_imports.py
@@ -64,6 +64,7 @@ def _raise_error(self) -> None:
CHECK_FASTAPI = OptionalImports("fastapi", "server")
CHECK_ONNXRUNTIME = OptionalImports("optimum.onnxruntime", "optimum")
CHECK_OPTIMUM = OptionalImports("optimum", "optimum")
CHECK_OPTIMUM_AMD = OptionalImports("optimum.amd", "optimum")
CHECK_OPTIMUM_NEURON = OptionalImports(
"optimum.neuron",
"<neuronx not available as extra, only runs on AMI image, no pip install possible.>",
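The new guard follows the existing `OptionalImports` pattern: call `mark_required()` before touching `optimum.amd` and get a descriptive error naming the `optimum` extra instead of a bare `ImportError`. A sketch of the intended call site (mirrored in `utils_optimum.py` below):

```python
from infinity_emb._optional_imports import CHECK_OPTIMUM_AMD

# Raises with a hint to install the `optimum` extra if optimum.amd is missing;
# once the check passes, the import below is safe.
CHECK_OPTIMUM_AMD.mark_required()
import optimum.amd  # noqa: E402
```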
2 changes: 2 additions & 0 deletions libs/infinity_emb/infinity_emb/primitives.py
@@ -107,6 +107,8 @@ def default_value():
class Device(EnumType):
cpu = "cpu"
cuda = "cuda"
rocm = "rocm"
migraphx = "migraphx"
mps = "mps"
tensorrt = "tensorrt"
auto = "auto"
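The two new values feed straight into ONNX provider selection. A sketch of the expected mapping (mirroring `device_to_onnx` in the next file), assuming a ROCm build of torch, i.e. `torch.version.hip` is set:

```python
from infinity_emb.primitives import Device
from infinity_emb.transformer.utils_optimum import device_to_onnx

# On a ROCm torch build (torch.version.hip is not None):
assert device_to_onnx(Device.rocm) == "ROCMExecutionProvider"
assert device_to_onnx(Device.migraphx) == "MIGraphXExecutionProvider"
# On a CPU/CUDA torch build, both calls raise ValueError instead.
```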
42 changes: 34 additions & 8 deletions libs/infinity_emb/infinity_emb/transformer/utils_optimum.py
@@ -8,7 +8,7 @@
from huggingface_hub import HfApi, HfFolder # type: ignore
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE # type: ignore

from infinity_emb._optional_imports import CHECK_ONNXRUNTIME, CHECK_TORCH
from infinity_emb._optional_imports import CHECK_ONNXRUNTIME, CHECK_TORCH, CHECK_OPTIMUM_AMD
from infinity_emb.log_handler import logger
from infinity_emb.primitives import Device

@@ -53,6 +53,16 @@ def device_to_onnx(device: Device) -> str:
return "CPUExecutionProvider"
elif device == Device.cuda:
return "CUDAExecutionProvider"
elif device == Device.rocm:
if torch.version.hip is not None:
return "ROCMExecutionProvider"
else:
raise ValueError("The `torch` installed is not for ROCm.")
elif device == Device.migraphx:
if torch.version.hip is not None:
return "MIGraphXExecutionProvider"
else:
raise ValueError("The `torch` installed is not for ROCm.")
elif device == Device.mps:
return "CoreMLExecutionProvider"
elif device == Device.tensorrt:
@@ -87,12 +97,8 @@ def optimize_model(
revision (Optional[str], optional): The revision to use. Defaults to None.
trust_remote_code (bool, optional): Whether to trust the remote code. Defaults to True.
"""
CHECK_ONNXRUNTIME.mark_required()
path_folder = (
Path(HUGGINGFACE_HUB_CACHE) / "infinity_onnx" / execution_provider / model_name_or_path
)
OPTIMIZED_SUFFIX = "_optimized.onnx"
files_optimized = list(path_folder.glob(f"**/*{OPTIMIZED_SUFFIX}"))

## If there is no need for optimization
if execution_provider == "TensorrtExecutionProvider":
return model_class.from_pretrained(
model_name_or_path,
@@ -110,8 +116,28 @@
# "trt_int8_enable": "quantize" in file_name,
},
)

elif execution_provider in ["ROCMExecutionProvider", "MIGraphXExecutionProvider"]:
CHECK_OPTIMUM_AMD.mark_required()
return model_class.from_pretrained(
model_name_or_path,
revision=revision,
trust_remote_code=trust_remote_code,
provider=execution_provider,
file_name=file_name,
)

## path to find if model has been optimized
CHECK_ONNXRUNTIME.mark_required()
path_folder = (
Path(HUGGINGFACE_HUB_CACHE) / "infinity_onnx" / execution_provider / model_name_or_path
)
OPTIMIZED_SUFFIX = "_optimized.onnx"
files_optimized = list(path_folder.glob(f"**/*{OPTIMIZED_SUFFIX}"))

logger.info(f"files_optimized: {files_optimized}")
if files_optimized:
file_optimized = files_optimized[0]
file_optimized = files_optimized[-1]
logger.info(f"Optimized model found at {file_optimized}, skipping optimization")
return model_class.from_pretrained(
file_optimized.parent.as_posix(),
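
Net effect of the reordering: for ROCm and MIGraphX the model is handed to optimum-amd's loader directly, and the `infinity_onnx` optimization cache is never consulted. A hedged usage sketch; the argument names are inferred from the hunks above, and `ORTModelForFeatureExtraction` stands in for whichever `model_class` a caller passes:

```python
from optimum.onnxruntime import ORTModelForFeatureExtraction

from infinity_emb.transformer.utils_optimum import optimize_model

# With a ROCm/MIGraphX provider, optimize_model returns the model as-is,
# skipping the ONNX-optimization cache path entirely.
model = optimize_model(
    model_name_or_path="michaelfeil/bge-small-en-v1.5",  # hypothetical choice
    model_class=ORTModelForFeatureExtraction,
    execution_provider="ROCMExecutionProvider",
    file_name="model.onnx",  # hypothetical file name
)
```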