1.0.0preview update for Llama 3 support and more (vLLM 0.3.3 -> 0.4.2)
alpayariyak committed May 9, 2024
1 parent 0a5b5bc commit 874379a
Showing 10 changed files with 200 additions and 57 deletions.
14 changes: 8 additions & 6 deletions Dockerfile
@@ -1,5 +1,6 @@
ARG WORKER_CUDA_VERSION=11.8.0
FROM runpod/worker-vllm:base-0.3.2-cuda${WORKER_CUDA_VERSION} AS vllm-base
ARG BASE_IMAGE_VERSION=1.0.0
FROM runpod/worker-vllm:base-${BASE_IMAGE_VERSION}-cuda${WORKER_CUDA_VERSION} AS vllm-base

RUN apt-get update -y \
&& apt-get install -y python3-pip
@@ -19,19 +20,19 @@ ARG MODEL_REVISION=""
ARG TOKENIZER_REVISION=""

ENV MODEL_NAME=$MODEL_NAME \
MODEL_REVISION=$REVISION \
MODEL_REVISION=$MODEL_REVISION \
TOKENIZER_NAME=$TOKENIZER_NAME \
TOKENIZER_REVISION=$TOKENIZER_REVISION \
BASE_PATH=$BASE_PATH \
QUANTIZATION=$QUANTIZATION \
HF_DATASETS_CACHE="${BASE_PATH}/huggingface-cache/datasets" \
HUGGINGFACE_HUB_CACHE="${BASE_PATH}/huggingface-cache/hub" \
HF_HOME="${BASE_PATH}/huggingface-cache/hub" \
HF_TRANSFER=1
HF_HUB_ENABLE_HF_TRANSFER=1

ENV PYTHONPATH="/:/vllm-installation"
ENV PYTHONPATH="/:/vllm-workspace"

COPY builder/download_model.py /download_model.py
COPY src/download_model.py /download_model.py
RUN --mount=type=secret,id=HF_TOKEN,required=false \
if [ -f /run/secrets/HF_TOKEN ]; then \
export HF_TOKEN=$(cat /run/secrets/HF_TOKEN); \
@@ -42,7 +43,8 @@ RUN --mount=type=secret,id=HF_TOKEN,required=false \

# Add source files
COPY src /src

# Remove download_model.py
RUN rm /download_model.py

# Start the handler
CMD ["python3", "/src/handler.py"]
56 changes: 30 additions & 26 deletions README.md

Large diffs are not rendered by default.

65 changes: 65 additions & 0 deletions docker-bake.hcl
@@ -0,0 +1,65 @@
variable "PUSH" {
default = "true"
}

variable "REPOSITORY" {
default = "runpod"
}

variable "BASE_IMAGE_VERSION" {
default = "1.0.0preview"
}

group "all" {
targets = ["base", "main"]
}

group "base" {
targets = ["base-1180", "base-1210"]
}

group "main" {
targets = ["worker-1180", "worker-1210"]
}

target "base-1180" {
tags = ["${REPOSITORY}/worker-vllm:base-${BASE_IMAGE_VERSION}-cuda11.8.0"]
context = "vllm-base-image"
dockerfile = "Dockerfile"
args = {
WORKER_CUDA_VERSION = "11.8.0"
}
output = ["type=docker,push=${PUSH}"]
}

target "base-1210" {
tags = ["${REPOSITORY}/worker-vllm:base-${BASE_IMAGE_VERSION}-cuda12.1.0"]
context = "vllm-base-image"
dockerfile = "Dockerfile"
args = {
WORKER_CUDA_VERSION = "12.1.0"
}
output = ["type=docker,push=${PUSH}"]
}

target "worker-1180" {
tags = ["${REPOSITORY}/worker-vllm:worker-${BASE_IMAGE_VERSION}-cuda11.8.0"]
context = "."
dockerfile = "Dockerfile"
args = {
BASE_IMAGE_VERSION = "${BASE_IMAGE_VERSION}"
WORKER_CUDA_VERSION = "11.8.0"
}
output = ["type=docker,push=${PUSH}"]
}

target "worker-1210" {
tags = ["${REPOSITORY}/worker-vllm:worker-${BASE_IMAGE_VERSION}-cuda12.1.0"]
context = "."
dockerfile = "Dockerfile"
args = {
BASE_IMAGE_VERSION = "${BASE_IMAGE_VERSION}"
WORKER_CUDA_VERSION = "12.1.0"
}
output = ["type=docker,push=${PUSH}"]
}
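
Since bake variables can be overridden from the environment, the whole image matrix builds with one command; a usage sketch, with an illustrative repository name:

# Build and push every base and worker image:
REPOSITORY=myorg docker buildx bake -f docker-bake.hcl all

# Build only the two worker images locally, without pushing:
PUSH=false docker buildx bake -f docker-bake.hcl main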
Binary file added media/ui_demo.gif
5 changes: 5 additions & 0 deletions builder/download_model.py → src/download_model.py
@@ -1,5 +1,6 @@
import os
import shutil
from tensorize import serialize_model
from huggingface_hub import snapshot_download
from vllm.model_executor.weight_utils import prepare_hf_model_weights, Disabledtqdm

@@ -41,6 +42,10 @@ def move_files(src_dir, dest_dir):
model_folder, hf_weights_files, use_safetensors = prepare_hf_model_weights(model_name_or_path=model, revision=revisions["model"], cache_dir=download_dir)
model_extras_folder = download_extras_or_tokenizer(model, download_dir, revisions["model"], extras=True)
move_files(model_extras_folder, model_folder)

if os.environ.get("TENSORIZE_MODEL"):
    # Branch body not visible in this diff view; given the serialize_model
    # import added above, serializing the downloaded weights here is an
    # assumption, not the commit's verbatim code.
    serialize_model(model_folder)

with open("/local_model_path.txt", "w") as f:
    f.write(model_folder)
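
The downloader is driven by the environment variables the Dockerfile sets rather than CLI flags; run by hand inside the image, an invocation might look like this sketch (the model name is illustrative, and treating TENSORIZE_MODEL as a simple opt-in flag is an assumption based on the added import):

export MODEL_NAME="meta-llama/Meta-Llama-3-8B-Instruct"   # illustrative
export BASE_PATH="/runpod-volume"
export TENSORIZE_MODEL=1   # assumed opt-in for tensorizer serialization
python3 /download_model.py
cat /local_model_path.txt   # the script writes the resolved model folder here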
7 changes: 6 additions & 1 deletion src/engine.py
@@ -5,6 +5,7 @@
from dotenv import load_dotenv
from torch.cuda import device_count
from typing import AsyncGenerator
import time

from vllm import AsyncLLMEngine, AsyncEngineArgs
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
@@ -100,7 +101,11 @@ async def _generate_vllm(self, llm_input, validated_sampling_params, batch_size,

def _initialize_llm(self):
try:
return AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**self.config))
start = time.time()
engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**self.config))
end = time.time()
logging.info(f"Initialized vLLM engine in {end - start:.2f}s")
return engine
except Exception as e:
logging.error("Error initializing vLLM engine: %s", e)
raise e
1 change: 0 additions & 1 deletion src/utils.py
@@ -1,6 +1,5 @@
import logging
from http import HTTPStatus
from typing import Any, Dict
from vllm.utils import random_uuid
from vllm.entrypoints.openai.protocol import ErrorResponse
from vllm import SamplingParams
103 changes: 82 additions & 21 deletions vllm-base-image/Dockerfile
@@ -20,15 +20,21 @@ RUN apt-get update -y \
# Set working directory
WORKDIR /vllm-installation

RUN ldconfig /usr/local/cuda-$(echo "$WORKER_CUDA_VERSION" | sed 's/\.0$//')/compat/

# Install build and runtime dependencies
COPY vllm/requirements-${WORKER_CUDA_VERSION}.txt requirements.txt
COPY vllm/requirements-common.txt requirements-common.txt
COPY vllm/requirements-cuda${WORKER_CUDA_VERSION}.txt requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements.txt
pip install -r requirements-cuda.txt

# Install development dependencies
COPY vllm/requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt

ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}

FROM dev AS build

@@ -40,26 +46,69 @@ COPY vllm/requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-build.txt

# install compiler cache to speed up compilation leveraging local or remote caching
RUN apt-get update -y && apt-get install -y ccache

# Copy necessary files
COPY vllm/csrc csrc
COPY vllm/setup.py setup.py
COPY vllm/cmake cmake
COPY vllm/CMakeLists.txt CMakeLists.txt
COPY vllm/requirements-common.txt requirements-common.txt
COPY vllm/requirements-cuda${WORKER_CUDA_VERSION}.txt requirements-cuda.txt
COPY vllm/pyproject.toml pyproject.toml
COPY vllm/vllm/__init__.py vllm/__init__.py
COPY vllm/vllm vllm

# Set environment variables for building extensions
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
ARG max_jobs=48
ENV MAX_JOBS=${max_jobs}
ARG nvcc_threads=1024
ENV NVCC_THREADS=${nvcc_threads}
ENV WORKER_CUDA_VERSION=${WORKER_CUDA_VERSION}
ENV VLLM_INSTALL_PUNICA_KERNELS=0
# Build extensions
RUN ldconfig /usr/local/cuda-$(echo "$WORKER_CUDA_VERSION" | sed 's/\.0$//')/compat/
RUN python3 setup.py build_ext --inplace
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
python3 setup.py bdist_wheel --dist-dir=dist

RUN --mount=type=cache,target=/root/.cache/pip \
pip cache remove vllm_nccl*

FROM dev as flash-attn-builder
# max jobs used for build
# flash attention version
ARG flash_attn_version=v2.5.8
ENV FLASH_ATTN_VERSION=${flash_attn_version}

FROM nvidia/cuda:${WORKER_CUDA_VERSION}-runtime-ubuntu22.04 AS vllm-base
WORKDIR /usr/src/flash-attention-v2

# Download the wheel or build it if a pre-compiled release doesn't exist
RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
--no-build-isolation --no-deps --no-cache-dir

FROM dev as NCCL-installer

# Re-declare ARG after FROM
ARG WORKER_CUDA_VERSION

# Update and install necessary libraries
RUN apt-get update -y \
&& apt-get install -y wget

# Install NCCL library
RUN if [ "$WORKER_CUDA_VERSION" = "11.8.0" ]; then \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb \
&& dpkg -i cuda-keyring_1.0-1_all.deb \
&& apt-get update \
&& apt install -y libnccl2=2.15.5-1+cuda11.8 libnccl-dev=2.15.5-1+cuda11.8; \
elif [ "$WORKER_CUDA_VERSION" = "12.1.0" ]; then \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb \
&& dpkg -i cuda-keyring_1.0-1_all.deb \
&& apt-get update \
&& apt install -y libnccl2=2.17.1-1+cuda12.1 libnccl-dev=2.17.1-1+cuda12.1; \
else \
echo "Unsupported CUDA version: $WORKER_CUDA_VERSION"; \
exit 1; \
fi

FROM nvidia/cuda:${WORKER_CUDA_VERSION}-base-ubuntu22.04 AS vllm-base

# Re-declare ARG after FROM
ARG WORKER_CUDA_VERSION
@@ -69,20 +118,32 @@ RUN apt-get update -y \
&& apt-get install -y python3-pip

# Set working directory
WORKDIR /vllm-installation
WORKDIR /vllm-workspace

RUN ldconfig /usr/local/cuda-$(echo "$WORKER_CUDA_VERSION" | sed 's/\.0$//')/compat/

# Install runtime dependencies
COPY vllm/requirements-${WORKER_CUDA_VERSION}.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements.txt
RUN --mount=type=bind,from=build,src=/vllm-installation/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
pip install dist/*.whl --verbose

RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
--mount=type=cache,target=/root/.cache/pip \
pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir

FROM vllm-base AS runtime

# Copy built files from the build stage
COPY --from=build /vllm-installation/vllm/*.so /vllm-installation/vllm/
COPY vllm/vllm vllm
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer modelscope tensorizer

# Set PYTHONPATH environment variable
ENV PYTHONPATH="/"

# Copy NCCL library
COPY --from=NCCL-installer /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/x86_64-linux-gnu/libnccl.so.2
# Set the VLLM_NCCL_SO_PATH environment variable
ENV VLLM_NCCL_SO_PATH="/usr/lib/x86_64-linux-gnu/libnccl.so.2"


# Validate the installation
RUN python3 -c "import sys; print(sys.path); import vllm; print(vllm.__file__)"
RUN python3 -c "import vllm; print(vllm.__file__)"
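
Because the build stage now produces a wheel instead of in-place extensions, the base image can also be built on its own, with the parallelism knobs exposed as build args above; a sketch, with illustrative values:

docker build vllm-base-image \
  --build-arg WORKER_CUDA_VERSION=12.1.0 \
  --build-arg max_jobs=16 \
  --build-arg nvcc_threads=8 \
  -t worker-vllm:base-local-cuda12.1.0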
2 changes: 1 addition & 1 deletion vllm-base-image/vllm
Submodule vllm updated from c46d23 to ba8f5e
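
The pinned vllm source moved from c46d23 to ba8f5e (the 0.4.2 tree, per vllm-metadata.yml below); after fetching this commit, the checkout needs a submodule update, roughly:

git submodule update --init --recursive vllm-base-image/vllm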
4 changes: 3 additions & 1 deletion vllm-base-image/vllm-metadata.yml
@@ -1 +1,3 @@
version: '0.3.3'
version: '0.3.3'
dev_version: '0.4.2'
worker_dev_version: '1.0.0preview'
