Commit
0.5.3, any vllm arg as env var, refactor and fixes, moving away from building separate image from vLLM fork
alpayariyak committed Jul 25, 2024
1 parent a08d83f commit 5bd6f3a
Showing 11 changed files with 209 additions and 223 deletions.
20 changes: 10 additions & 10 deletions Dockerfile
@@ -1,16 +1,20 @@
ARG WORKER_CUDA_VERSION=11.8.0
ARG BASE_IMAGE_VERSION=1.0.0
FROM runpod/worker-vllm:base-${BASE_IMAGE_VERSION}-cuda${WORKER_CUDA_VERSION} AS vllm-base
FROM nvidia/cuda:12.1.0-base-ubuntu22.04

RUN apt-get update -y \
&& apt-get install -y python3-pip

RUN ldconfig /usr/local/cuda-12.1/compat/

# Install Python dependencies
COPY builder/requirements.txt /requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install --upgrade pip && \
python3 -m pip install --upgrade -r /requirements.txt

# Install vLLM and FlashInfer (switching back to pip installs: the issues that required building from a fork are fixed, and space optimization matters less now that caching is in place)
RUN python3 -m pip install vllm==0.5.1 && \
python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3

# Setup for Option 2: Building the Image with the Model included
ARG MODEL_NAME=""
ARG TOKENIZER_NAME=""
@@ -32,19 +36,15 @@ ENV MODEL_NAME=$MODEL_NAME \

ENV PYTHONPATH="/:/vllm-workspace"

COPY src/download_model.py /download_model.py

COPY src /src
RUN --mount=type=secret,id=HF_TOKEN,required=false \
if [ -f /run/secrets/HF_TOKEN ]; then \
export HF_TOKEN=$(cat /run/secrets/HF_TOKEN); \
fi && \
if [ -n "$MODEL_NAME" ]; then \
python3 /download_model.py; \
python3 /src/download_model.py; \
fi

# Add source files
COPY src /src
# Remove download_model.py
RUN rm /download_model.py

# Start the handler
CMD ["python3", "/src/handler.py"]
2 changes: 1 addition & 1 deletion docker-bake.hcl
@@ -7,7 +7,7 @@ variable "REPOSITORY" {
}

variable "BASE_IMAGE_VERSION" {
default = "1.0.0"
default = "1.1.0preview"
}

group "all" {
113 changes: 93 additions & 20 deletions src/download_model.py
@@ -1,27 +1,100 @@
import os
from huggingface_hub import snapshot_download
import json
import logging
import glob
from shutil import rmtree
from huggingface_hub import snapshot_download
from utils import timer_decorator

BASE_DIR = "/"
TOKENIZER_PATTERNS = [["*.json", "tokenizer*"]]
MODEL_PATTERNS = [["*.safetensors"], ["*.bin"], ["*.pt"]]

def setup_env():
if os.getenv("TESTING_DOWNLOAD") == "1":
BASE_DIR = "tmp"
os.makedirs(BASE_DIR, exist_ok=True)
os.environ.update({
"HF_HOME": f"{BASE_DIR}/hf_cache",
"MODEL_NAME": "openchat/openchat-3.5-0106",
"HF_HUB_ENABLE_HF_TRANSFER": "1",
"TENSORIZE": "1",
"TENSORIZER_NUM_GPUS": "1",
"DTYPE": "auto"
})

@timer_decorator
def download(name, revision, type, cache_dir):
if type == "model":
pattern_sets = [model_pattern + TOKENIZER_PATTERNS[0] for model_pattern in MODEL_PATTERNS]
elif type == "tokenizer":
pattern_sets = TOKENIZER_PATTERNS
else:
raise ValueError(f"Invalid type: {type}")
try:
for pattern_set in pattern_sets:
path = snapshot_download(name, revision=revision, cache_dir=cache_dir,
allow_patterns=pattern_set)
for pattern in pattern_set:
if glob.glob(os.path.join(path, pattern)):
logging.info(f"Successfully downloaded {pattern} model files.")
return path
except ValueError:
raise ValueError(f"No patterns matching {pattern_sets} found for download.")


# @timer_decorator
# def tensorize_model(model_path): TODO: Add back once tensorizer is ready
# from vllm.engine.arg_utils import EngineArgs
# from vllm.model_executor.model_loader.tensorizer import TensorizerConfig, tensorize_vllm_model
# from torch.cuda import device_count

# tensorizer_num_gpus = int(os.getenv("TENSORIZER_NUM_GPUS", "1"))
# if tensorizer_num_gpus > device_count():
# raise ValueError(f"TENSORIZER_NUM_GPUS ({tensorizer_num_gpus}) exceeds available GPUs ({device_count()})")

# dtype = os.getenv("DTYPE", "auto")
# serialized_dir = f"{BASE_DIR}/serialized_model"
# os.makedirs(serialized_dir, exist_ok=True)
# serialized_uri = f"{serialized_dir}/model{'-%03d' if tensorizer_num_gpus > 1 else ''}.tensors"

# tensorize_vllm_model(
# EngineArgs(model=model_path, tensor_parallel_size=tensorizer_num_gpus, dtype=dtype),
# TensorizerConfig(tensorizer_uri=serialized_uri)
# )
# logging.info("Successfully serialized model to %s", str(serialized_uri))
# logging.info("Removing HF Model files after serialization")
# rmtree("/".join(model_path.split("/")[:-2]))
# return serialized_uri, tensorizer_num_gpus, dtype

if __name__ == "__main__":
model_name = os.getenv("MODEL_NAME")
if not model_name:
raise ValueError("Must specify model name by adding --build-arg MODEL_NAME=<your model's repo>")
revision = os.getenv("MODEL_REVISION") or None
snapshot_download(model_name, revision=revision, cache_dir=os.getenv("HF_HOME"))
setup_env()
cache_dir = os.getenv("HF_HOME")
model_name, model_revision = os.getenv("MODEL_NAME"), os.getenv("MODEL_REVISION") or None
tokenizer_name, tokenizer_revision = os.getenv("TOKENIZER_NAME") or model_name, os.getenv("TOKENIZER_REVISION") or model_revision

model_path = download(model_name, model_revision, "model", cache_dir)

metadata = {
"MODEL_NAME": model_path,
"MODEL_REVISION": os.getenv("MODEL_REVISION"),
"QUANTIZATION": os.getenv("QUANTIZATION"),
}

tokenizer_name = os.getenv("TOKENIZER_NAME") or None
tokenizer_revision = os.getenv("TOKENIZER_REVISION") or None
if tokenizer_name:
snapshot_download(tokenizer_name, revision=tokenizer_revision, cache_dir=os.getenv("HF_HOME"))
# if os.getenv("TENSORIZE") == "1": TODO: Add back once tensorizer is ready
# serialized_uri, tensorizer_num_gpus, dtype = tensorize_model(model_path)
# metadata.update({
# "MODEL_NAME": serialized_uri,
# "TENSORIZER_URI": serialized_uri,
# "TENSOR_PARALLEL_SIZE": tensorizer_num_gpus,
# "DTYPE": dtype
# })

# Create file with metadata of baked in model and/or tokenizer
tokenizer_path = download(tokenizer_name, tokenizer_revision, "tokenizer", cache_dir)
metadata.update({
"TOKENIZER_NAME": tokenizer_path,
"TOKENIZER_REVISION": tokenizer_revision
})

with open("/local_metadata.json", "w") as f:
json.dump({
"model_name": model_name,
"revision": revision,
"tokenizer_name": tokenizer_name or model_name,
"tokenizer_revision": tokenizer_revision or revision,
"quantization": os.getenv("QUANTIZATION")
}, f)

with open(f"{BASE_DIR}/local_model_args.json", "w") as f:
json.dump({k: v for k, v in metadata.items() if v not in (None, "")}, f)
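
The weight-download fallback in download() is easiest to see by expanding the pattern sets it builds: each candidate set pairs one weight format with the tokenizer files, and the first set that actually matches files after snapshot_download wins. A minimal, standalone sketch (no download performed; constants copied from above):

# safetensors is preferred, then .bin, then .pt; tokenizer files ride along with each set.
TOKENIZER_PATTERNS = [["*.json", "tokenizer*"]]
MODEL_PATTERNS = [["*.safetensors"], ["*.bin"], ["*.pt"]]

pattern_sets = [model_pattern + TOKENIZER_PATTERNS[0] for model_pattern in MODEL_PATTERNS]
for pattern_set in pattern_sets:
    print(pattern_set)
# ['*.safetensors', '*.json', 'tokenizer*']
# ['*.bin', '*.json', 'tokenizer*']
# ['*.pt', '*.json', 'tokenizer*']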
29 changes: 21 additions & 8 deletions src/engine.py
@@ -1,9 +1,9 @@
import os
import logging
import json
import asyncio

from dotenv import load_dotenv
from torch.cuda import device_count
from typing import AsyncGenerator
import time

@@ -21,8 +21,11 @@ class vLLMEngine:
def __init__(self, engine = None):
load_dotenv() # For local development
self.engine_args = get_engine_args()
self.tokenizer = TokenizerWrapper(self.tokenizer, self.engine_args.tokenizer_revision, self.engine_args.trust_remote_code)
self.llm = self._initialize_llm() if engine is None else engine
logging.info(f"Engine args: {self.engine_args}")
self.tokenizer = TokenizerWrapper(self.engine_args.tokenizer or self.engine_args.model,
self.engine_args.tokenizer_revision,
self.engine_args.trust_remote_code)
self.llm = self._initialize_llm() if engine is None else engine.llm
self.max_concurrency = int(os.getenv("MAX_CONCURRENCY", DEFAULT_MAX_CONCURRENCY))
self.default_batch_size = int(os.getenv("DEFAULT_BATCH_SIZE", DEFAULT_BATCH_SIZE))
self.batch_size_growth_factor = int(os.getenv("BATCH_SIZE_GROWTH_FACTOR", DEFAULT_BATCH_SIZE_GROWTH_FACTOR))
@@ -114,17 +117,27 @@ def _initialize_llm(self):
class OpenAIvLLMEngine(vLLMEngine):
def __init__(self, vllm_engine):
super().__init__(vllm_engine)
self.served_model_name = os.getenv("OPENAI_SERVED_MODEL_NAME_OVERRIDE") or self.engine_args["model"]
self.served_model_name = os.getenv("OPENAI_SERVED_MODEL_NAME_OVERRIDE") or self.engine_args.model
self.response_role = os.getenv("OPENAI_RESPONSE_ROLE") or "assistant"
self._initialize_engines()
asyncio.run(self._initialize_engines())
self.raw_openai_output = bool(int(os.getenv("RAW_OPENAI_OUTPUT", 1)))

def _initialize_engines(self):
async def _initialize_engines(self):
self.model_config = await self.llm.get_model_config()

self.chat_engine = OpenAIServingChat(
self.llm, self.served_model_name, self.response_role,
engine=self.llm,
model_config=self.model_config,
served_model_names=[self.served_model_name],
response_role=self.response_role,
chat_template=self.tokenizer.tokenizer.chat_template
)
self.completion_engine = OpenAIServingCompletion(self.llm, self.served_model_name)
self.completion_engine = OpenAIServingCompletion(
engine=self.llm,
model_config=self.model_config,
served_model_names=[self.served_model_name],
lora_modules=[]
)

async def generate(self, openai_request: JobInput):
if openai_request.openai_route == "/v1/models":
86 changes: 62 additions & 24 deletions src/engine_args.py
@@ -3,42 +3,75 @@
import logging
from torch.cuda import device_count
from vllm import AsyncEngineArgs
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig

env_to_args_map = {
RENAME_ARGS_MAP = {
"MODEL_NAME": "model",
"MODEL_REVISION": "revision",
"TOKENIZER_NAME": "tokenizer",
"TOKENIZER_REVISION": "tokenizer_revision",
"QUANTIZATION": "quantization"
"MAX_CONTEXT_LEN_TO_CAPTURE": "max_seq_len_to_capture"
}


DEFAULT_ARGS = {
"disable_log_stats": True,
"disable_log_requests": True,
"gpu_memory_utilization": 0.9,
}

def match_vllm_args(args):
"""Rename args to match vllm by:
1. Renaming keys to lower case
2. Renaming keys to match vllm
3. Filtering args to match vllm's AsyncEngineArgs
Args:
args (dict): Dictionary of args
Returns:
dict: Dictionary of args with renamed keys
"""
renamed_args = {RENAME_ARGS_MAP.get(k, k): v for k, v in args.items()}
matched_args = {k: v for k, v in renamed_args.items() if k in AsyncEngineArgs.__dataclass_fields__}
return {k: v for k, v in matched_args.items() if v not in [None, ""]}
def get_local_args():
if os.path.exists("/local_metadata.json"):
with open("/local_metadata.json", "r") as f:
local_metadata = json.load(f)
if local_metadata.get("model_name") is None:
raise ValueError("Model name is not found in /local_metadata.json, there was a problem when baking the model in.")
else:
local_args = {env_to_args_map[k.upper()]: v for k, v in local_metadata.items() if k in env_to_args_map}
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"
return local_args
"""
Retrieve local arguments from a JSON file.
Returns:
dict: Local arguments.
"""
if not os.path.exists("/local_model_args.json"):
return {}

with open("/local_model_args.json", "r") as f:
local_args = json.load(f)

if local_args.get("MODEL_NAME") is None:
raise ValueError("Model name not found in /local_model_args.json. There was a problem when baking the model in.")

logging.info(f"Using baked in model with args: {local_args}")
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"

return local_args
def get_engine_args():
# Start with default args
args = {
"disable_log_stats": True,
"disable_log_requests": True,
"gpu_memory_utilization": float(os.getenv("GPU_MEMORY_UTILIZATION", 0.9)),
}
args = DEFAULT_ARGS

# Get env args that match keys in AsyncEngineArgs
env_args = {k.lower(): v for k, v in dict(os.environ).items() if k.lower() in AsyncEngineArgs.__dataclass_fields__}
args.update(env_args)
args.update(os.environ)

# Get local args if model is baked in and overwrite env args
local_args = get_local_args()
args.update(local_args)
args.update(get_local_args())

# if args.get("TENSORIZER_URI"): TODO: add back once tensorizer is ready
# args["load_format"] = "tensorizer"
# args["model_loader_extra_config"] = TensorizerConfig(tensorizer_uri=args["TENSORIZER_URI"], num_readers=None)
# logging.info(f"Using tensorized model from {args['TENSORIZER_URI']}")


# Rename and match to vllm args
args = match_vllm_args(args)

# Set tensor parallel size and max parallel loading workers if more than 1 GPU is available
num_gpus = device_count()
@@ -49,10 +82,15 @@ def get_engine_args():
logging.warning("Overriding MAX_PARALLEL_LOADING_WORKERS with None because more than 1 GPU is available.")

# Deprecated env args backwards compatibility
if args["kv_cache_dtype"] == "fp8_e5m2":
if args.get("kv_cache_dtype") == "fp8_e5m2":
args["kv_cache_dtype"] = "fp8"
logging.warning("Using fp8_e5m2 is deprecated. Please use fp8 instead.")
if os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE"):
args["max_seq_len_to_capture"] = int(os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE"))
logging.warning("Using MAX_CONTEXT_LEN_TO_CAPTURE is deprecated. Please use MAX_SEQ_LEN_TO_CAPTURE instead.")

if "gemma-2" in args.get("model", "").lower():
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
logging.info("Using FLASHINFER for gemma-2 model.")

return AsyncEngineArgs(**args)
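
The headline change here is that any vLLM engine argument can now be supplied as an environment variable: everything in os.environ is folded into the arg dict, renamed via RENAME_ARGS_MAP, and filtered against AsyncEngineArgs.__dataclass_fields__. A minimal sketch of that matching step, assuming vllm is installed; MAX_MODEL_LEN is just an example variable, and the lower-casing follows the match_vllm_args docstring rather than copying its exact code:

import os
from vllm import AsyncEngineArgs

RENAME_ARGS_MAP = {"MODEL_NAME": "model"}  # abridged copy of the map above

os.environ["MODEL_NAME"] = "openchat/openchat-3.5-0106"
os.environ["MAX_MODEL_LEN"] = "8192"  # any AsyncEngineArgs field name works as an env var

# Rename known keys, lower-case the rest, and keep only keys that are real engine args.
renamed = {RENAME_ARGS_MAP.get(k, k.lower()): v for k, v in os.environ.items()}
matched = {k: v for k, v in renamed.items() if k in AsyncEngineArgs.__dataclass_fields__}
print(matched)  # e.g. {'model': 'openchat/openchat-3.5-0106', 'max_model_len': '8192'}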
3 changes: 2 additions & 1 deletion src/tokenizer.py
@@ -4,7 +4,8 @@

class TokenizerWrapper:
def __init__(self, tokenizer_name_or_path, tokenizer_revision, trust_remote_code):
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, revision=tokenizer_revision, trust_remote_code=trust_remote_code)
print(f"tokenizer_name_or_path: {tokenizer_name_or_path}, tokenizer_revision: {tokenizer_revision}, trust_remote_code: {trust_remote_code}")
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, revision=tokenizer_revision or "main", trust_remote_code=trust_remote_code)
self.custom_chat_template = os.getenv("CUSTOM_CHAT_TEMPLATE")
self.has_chat_template = bool(self.tokenizer.chat_template) or bool(self.custom_chat_template)
if self.custom_chat_template and isinstance(self.custom_chat_template, str):
25 changes: 19 additions & 6 deletions src/utils.py
@@ -1,9 +1,16 @@
import os
import logging
from http import HTTPStatus
from vllm.utils import random_uuid
from vllm.entrypoints.openai.protocol import ErrorResponse
from vllm import SamplingParams
from functools import wraps
from time import time

try:
from vllm.utils import random_uuid
from vllm.entrypoints.openai.protocol import ErrorResponse
from vllm import SamplingParams
except ImportError:
logging.warning("Error importing vllm, skipping related imports. This is ONLY expected when baking model into docker image from a machine without GPUs")
pass

logging.basicConfig(level=logging.INFO)

@@ -68,6 +75,12 @@ def create_error_response(message: str, err_type: str = "BadRequestError", statu
def get_int_bool_env(env_var: str, default: bool) -> bool:
return int(os.getenv(env_var, int(default))) == 1




def timer_decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
start = time()
result = func(*args, **kwargs)
end = time()
logging.info(f"{func.__name__} completed in {end - start:.2f} seconds")
return result
return wrapper
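
timer_decorator wraps a function and logs its wall-clock runtime at INFO level; download_model.py applies it to download(). A minimal usage sketch, assuming it is imported from /src the same way download_model.py does (slow_step is a made-up stand-in):

from time import sleep
from utils import timer_decorator  # resolves when run from /src, like handler.py and download_model.py

@timer_decorator
def slow_step():
    sleep(1.5)  # stand-in for a download or serialization step

slow_step()  # logs roughly: "slow_step completed in 1.50 seconds"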