# /// script
# requires-python = ">=3.11"
# dependencies = [
#   "bentoml",
#   "vllm>=0.7.0",
# ]
# ///
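# The block above is PEP 723 inline script metadata; a compatible runner (e.g. `uv run`)
# can resolve these dependencies before starting the service.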
from __future__ import annotations
import uuid, logging
import bentoml, fastapi

from argparse import Namespace
from typing import AsyncGenerator
from annotated_types import Ge, Le
from typing_extensions import Annotated

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

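# Plain FastAPI app; the `bentoml.asgi_app` decorator below mounts it under /v1,
# where it serves vLLM's OpenAI-compatible routes.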
openai_api_app = fastapi.FastAPI()

MAX_TOKENS = 4096
MODEL_ID = 'meta-llama/Llama-3.1-8B-Instruct'

| 25 | +SYSTEM_PROMPT= """Your are a proficient writer. Your goal is to create note suggestions for any given text that share similar stylistic choices and tonality as Frank Kafka. YOU MUST RETURN VALID JSON, with schema '{{"suggestion": string, "relevance": float}}'. ONLY RETURN JSON and RETURN AT MOST {num_suggestion} SUGGESTIONS. Kept suggestion terse and authentic.""" |
| 26 | + |
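# Raw Llama 3 chat layout, kept for reference; `suggests` builds its prompt with
# `tokenizer.apply_chat_template` instead of this template.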
| 27 | +PROMPT_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|> |
| 28 | +
|
| 29 | +{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|> |
| 30 | +
|
| 31 | +{user_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|> |
| 32 | +
|
| 33 | +""" |
| 34 | + |
| 35 | + |
@bentoml.asgi_app(openai_api_app, path='/v1')
@bentoml.service(
  name='asteraceae-service',
  traffic={'timeout': 300, 'concurrency': 256},
  resources={'gpu': 1, 'gpu_type': 'nvidia-a100-80gb'},
)
class Engine:
  def __init__(self):
    from transformers import AutoTokenizer
    from vllm import AsyncEngineArgs, AsyncLLMEngine
    import vllm.entrypoints.openai.api_server as vllm_api_server

    ENGINE_ARGS = AsyncEngineArgs(model=MODEL_ID, max_model_len=MAX_TOKENS, enable_prefix_caching=True)
    self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS)
    self.tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

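    # Register vLLM's stock OpenAI-compatible handlers on the FastAPI app mounted at /v1.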
    OPENAI_ENDPOINTS = [
      ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]],
      ["/completions", vllm_api_server.create_completion, ["POST"]],
      ["/models", vllm_api_server.show_available_models, ["GET"]],
    ]
    for route, endpoint, methods in OPENAI_ENDPOINTS:
      openai_api_app.add_api_route(path=route, endpoint=endpoint, methods=methods)

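    # init_app_state normally receives the argparse Namespace parsed by vLLM's own CLI;
    # recreate the fields it needs by hand here.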
    model_config = self.engine.engine.get_model_config()
    args = Namespace()
    args.model = MODEL_ID
    args.disable_log_requests = True
    args.max_log_len = 1000
    args.response_role = "assistant"
    args.served_model_name = None
    args.chat_template = None
    args.lora_modules = None
    args.prompt_adapters = None
    args.request_logger = None
    args.disable_log_stats = True
    args.return_tokens_as_token_ids = False
    args.enable_tool_call_parser = True
    args.enable_auto_tool_choice = True
    args.tool_call_parser = "llama3_json"
    args.enable_prompt_tokens_details = False

    vllm_api_server.init_app_state(self.engine, model_config, openai_api_app.state, args)

  @bentoml.api
  async def suggests(
    self,
    essay: str,
    num_suggestions: Annotated[int, Le(10)] = 5,
    max_tokens: Annotated[int, Ge(128), Le(MAX_TOKENS)] = MAX_TOKENS,
  ) -> AsyncGenerator[str, None]:
    from vllm import SamplingParams

    SAMPLING_PARAM = SamplingParams(max_tokens=max_tokens, skip_special_tokens=True)
    messages = [
      {"role": "system", "content": SYSTEM_PROMPT.format(num_suggestion=num_suggestions)},
      {"role": "user", "content": essay},
    ]

    prompt = self.tokenizer.apply_chat_template(
      messages,
      tokenize=False,
      add_generation_prompt=True,
    )
    stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM)

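    # request_output.outputs[0].text holds the full completion generated so far,
    # so track a cursor and yield only the newly produced suffix on each iteration.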
    cursor = 0
    async for request_output in stream:
      text = request_output.outputs[0].text
      yield text[cursor:]
      cursor = len(text)

if __name__ == "__main__":
  Engine.serve_http(port=3000)
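
# Usage sketch (assumes the service is running locally on port 3000):
#
#   # Custom streaming endpoint exposed by `suggests`:
#   curl -X POST http://localhost:3000/suggests \
#     -H 'Content-Type: application/json' \
#     -d '{"essay": "The morning felt like a verdict.", "num_suggestions": 3}'
#
#   # OpenAI-compatible routes are mounted at /v1, e.g. with the `openai` client:
#   # client = OpenAI(base_url='http://localhost:3000/v1', api_key='n/a')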