from __future__ import annotations

import asyncio
import contextlib
import logging
import os
import traceback
import typing as t

import annotated_types as ae
import bentoml
import fastapi
import pydantic
import typing_extensions as te

logger = logging.getLogger(__name__)

# Shared FastAPI app; the Engine service mounts it under /v1 and attaches
# vLLM's OpenAI-compatible routes to it at startup.
openai_api_app = fastapi.FastAPI()

MODEL_ID = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-14B'
# remove any whitespace if it is not qwen.
STRUCTURED_OUTPUT_BACKEND = 'xgrammar:disable-any-whitespace'
# Context window and per-request generation cap, overridable via environment.
MAX_MODEL_LEN = int(os.environ.get('MAX_MODEL_LEN', 16 * 1024))
MAX_TOKENS = int(os.environ.get('MAX_TOKENS', 8 * 1024))
# System prompt for the suggestion endpoint; {num_suggestions} is filled in
# per-request via str.format() in API.suggests.
SYSTEM_PROMPT = """You are a professional writer heavily influenced by the styles of Raymond Carver, Franz Kafka, Albert Camus, Iain McGilchrist, and Ian McEwan. Your task is to provide suggestions by offering concise, meaningful additions that match the stylistic choices and tonality of the given essay excerpt.

Please follow these steps to generate a suggestion:

1. Analyze the excerpt, paying close attention to its style, tone, and central concept.
2. Consider how you might approach expanding or enhancing the excerpt.
3. Formulate a suggestion that builds upon the existing concept while maintaining a terse and authentic voice.
4. Ensure your suggestion adds depth to the writing without drastically changing its original intent.

Guidelines for your suggestion:
1. Keep it concise and authentic, typically one to two sentences.
2. Focus on enhancing emotional depth, vivid imagery, or character insight.
3. Maintain the overall tone and style of the original excerpt.
4. Build upon the central concept or theme present in the excerpt.
5. Make sure to provide minimum {num_suggestions} suggestions.
"""
# Base container image shared by both services. flashinfer wheels are not on
# PyPI, so a post-install step pulls them from the project's CUDA wheel index.
IMAGE = (
  bentoml.images.PythonImage(python_version='3.11', lock_python_packages=False)
  .requirements_file('requirements.txt')
  .run('uv pip install --compile-bytecode flashinfer-python --find-links https://flashinfer.ai/whl/cu124/torch2.6')
)
class Suggestion(pydantic.BaseModel):
  """One streamed writing suggestion plus the model's reasoning, when present."""

  # The suggested addition to the essay.
  suggestion: str
  # Reasoning-trace text from the reasoning parser; empty when unavailable.
  reasoning: str = pydantic.Field(default='')
class Suggestions(pydantic.BaseModel):
  """Structured-output schema the engine is constrained to via guided_json."""

  suggestions: list[Suggestion]
@bentoml.asgi_app(openai_api_app, path='/v1')
@bentoml.service(
  name='asteraceae-inference-engine',
  traffic={'timeout': 300, 'concurrency': 128},
  resources={'gpu': 1, 'gpu_type': 'nvidia-a100-80gb'},
  tracing={'sample_rate': 0.5},
  envs=[
    {'name': 'HF_TOKEN'},
    {'name': 'UV_NO_PROGRESS', 'value': '1'},
    {'name': 'HF_HUB_DISABLE_PROGRESS_BARS', 'value': '1'},
    {'name': 'VLLM_ATTENTION_BACKEND', 'value': 'FLASH_ATTN'},
    {'name': 'VLLM_USE_V1', 'value': '0'},
    {'name': 'VLLM_LOGGING_CONFIG_PATH', 'value': os.path.join(os.path.dirname(__file__), 'logging-config.json')},
  ],
  labels={'owner': 'aarnphm', 'type': 'engine'},
  image=IMAGE,
)
class Engine:
  """vLLM-backed inference engine exposing an OpenAI-compatible API under /v1."""

  model_id = MODEL_ID
  # Exclude torch checkpoints and originals; vLLM loads the safetensors weights.
  model = bentoml.models.HuggingFaceModel(model_id, exclude=['*.pth', '*.pt', 'original/**/*'])

  def __init__(self):
    # Owns the async engine client's lifetime; closed in teardown_engine.
    self.exit_stack = contextlib.AsyncExitStack()

  @bentoml.on_startup
  async def init_engine(self) -> None:
    """Build the vLLM async engine and mount the OpenAI routes on the shared app."""
    import vllm.entrypoints.openai.api_server as vllm_api_server
    from vllm.entrypoints.openai.cli_args import make_arg_parser
    from vllm.utils import FlexibleArgumentParser

    # Start from vLLM's own CLI defaults, then override what this service needs.
    args = make_arg_parser(FlexibleArgumentParser()).parse_args([])
    args.model = self.model
    args.disable_log_requests = True
    args.max_log_len = 1000
    args.served_model_name = [self.model_id]
    args.request_logger = None
    args.disable_log_stats = True
    args.use_tqdm_on_load = False
    args.max_model_len = MAX_MODEL_LEN
    args.enable_prefix_caching = True
    args.enable_reasoning = True
    args.reasoning_parser = 'deepseek_r1'
    args.enable_auto_tool_choice = True
    args.tool_call_parser = 'hermes'
    args.guided_decoding_backend = STRUCTURED_OUTPUT_BACKEND

    router = fastapi.APIRouter(lifespan=vllm_api_server.lifespan)
    # (path, handler, HTTP methods) for the OpenAI-compatible surface we expose.
    OPENAI_ENDPOINTS = [
      ('/chat/completions', vllm_api_server.create_chat_completion, ['POST']),
      ('/models', vllm_api_server.show_available_models, ['GET']),
      ('/embeddings', vllm_api_server.create_embedding, ['POST']),
    ]
    for route, endpoint, methods in OPENAI_ENDPOINTS:
      router.add_api_route(path=route, endpoint=endpoint, methods=methods, include_in_schema=True)
    openai_api_app.include_router(router)

    self.engine = await self.exit_stack.enter_async_context(vllm_api_server.build_async_engine_client(args))
    self.model_config = await self.engine.get_model_config()
    self.tokenizer = await self.engine.get_tokenizer()

    # Wire the engine into the app state so the mounted routes can reach it.
    await vllm_api_server.init_app_state(self.engine, self.model_config, openai_api_app.state, args)

  @bentoml.on_shutdown
  async def teardown_engine(self):
    """Release the engine client acquired in init_engine."""
    await self.exit_stack.aclose()
- @bentoml .asgi_app (openai_api_app , path = '/v1' )
83
107
@bentoml .service (
84
- name = 'asteraceae-inference-service ' ,
108
+ name = 'asteraceae-inference-api ' ,
85
109
traffic = {'timeout' : 300 , 'concurrency' : 128 },
86
- resources = {'gpu' : 1 , 'gpu_type' : 'nvidia-a100-80gb' },
87
110
http = {
88
111
'cors' : {
89
112
'enabled' : True ,
@@ -95,72 +118,53 @@ class ServerArgs(pydantic.BaseModel):
95
118
'access_control_expose_headers' : ['Content-Length' ],
96
119
}
97
120
},
98
- envs = [{'name' : 'HF_TOKEN' }, {'name' : 'UV_COMPILE_BYTECODE' , 'value' : "1" }],
99
- image = bentoml .images .PythonImage (python_version = '3.11' ).requirements_file ('requirements.txt' ),
121
+ tracing = {"sample_rate" : 0.4 },
122
+ labels = {'owner' : 'aarnphm' , 'type' : 'api' },
123
+ image = IMAGE ,
100
124
)
class API:
  """User-facing service that streams structured writing suggestions."""

  engine = bentoml.depends(Engine)

  def __init__(self):
    from openai import AsyncOpenAI

    # Talk to the co-located Engine through its OpenAI-compatible /v1 surface.
    self.client = AsyncOpenAI(base_url=f'{self.engine.client_url}/v1', api_key='dummy')

  @bentoml.api
  async def suggests(
    self,
    essay: str,
    num_suggestions: te.Annotated[int, ae.Ge(2)] = 3,
    temperature: te.Annotated[float, ae.Ge(0.5), ae.Le(0.7)] = 0.6,
    max_tokens: te.Annotated[int, ae.Ge(256), ae.Le(MAX_TOKENS)] = MAX_TOKENS,
  ) -> t.AsyncGenerator[str, None]:
    """Stream newline-delimited Suggestion JSON objects for *essay*.

    An empty chunk is yielded first so clients observe the stream opening
    immediately. On any failure the traceback is logged server-side and a
    generic error suggestion is yielded instead of leaking internals.
    """
    messages = [
      {'role': 'system', 'content': SYSTEM_PROMPT.format(num_suggestions=num_suggestions)},
      {'role': 'user', 'content': essay},
    ]

    prefill = False
    try:
      completions = await self.client.chat.completions.create(
        model=Engine.inner.model_id,
        temperature=temperature,
        max_tokens=max_tokens,
        messages=messages,
        stream=True,
        # Constrain decoding to the Suggestions schema via guided_json.
        extra_body=dict(guided_json=Suggestions.model_json_schema()),
      )
      async for chunk in completions:
        delta_choice = chunk.choices[0].delta
        # BUGFIX: both `content` and `reasoning_content` can be None on a
        # stream delta; coerce to '' so pydantic's str fields do not raise
        # ValidationError mid-stream. getattr keeps compatibility with
        # deltas that lack a reasoning_content attribute entirely.
        s = Suggestion(
          suggestion=delta_choice.content or '',
          reasoning=getattr(delta_choice, 'reasoning_content', None) or '',
        )
        if not prefill:
          prefill = True
          yield ''
        else:
          # An entirely empty delta after prefill marks end-of-stream.
          if not s.reasoning and not s.suggestion:
            break
          yield f'{s.model_dump_json()}\n'
    except Exception:
      logger.error(traceback.format_exc())
      yield f'{Suggestion(suggestion="Internal error found. Check server logs for more information").model_dump_json()}\n'
      return