# /// script
# requires-python = ">=3.11"
# dependencies = [
#   "bentoml",
#   "vllm>=0.7.0",
# ]
# ///
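# The block above is PEP 723 inline script metadata; a compatible runner (e.g. `uv run`)
# can resolve these dependencies before starting the service.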
from __future__ import annotations
import uuid, logging
import bentoml, fastapi

from argparse import Namespace
from typing import AsyncGenerator
from annotated_types import Ge, Le
from typing_extensions import Annotated

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

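# Plain FastAPI app; the `bentoml.asgi_app` decorator below mounts it under /v1,
# where it serves vLLM's OpenAI-compatible routes.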
openai_api_app = fastapi.FastAPI()

MAX_TOKENS = 4096
MODEL_ID = 'meta-llama/Llama-3.1-8B-Instruct'

| 25 | +SYSTEM_PROMPT= """Your are a proficient writer. Your goal is to create note suggestions for any given text that share similar stylistic choices and tonality as Frank Kafka. YOU MUST RETURN VALID JSON, with schema '{{"suggestion": string, "relevance": float}}'. ONLY RETURN JSON and RETURN AT MOST {num_suggestion} SUGGESTIONS. Kept suggestion terse and authentic.""" |
| 26 | + |
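# Raw Llama 3 chat layout, kept for reference; `suggests` builds its prompt with
# `tokenizer.apply_chat_template` instead of this template.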
| 27 | +PROMPT_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|> |
| 28 | +
|
| 29 | +{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|> |
| 30 | +
|
| 31 | +{user_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|> |
| 32 | +
|
| 33 | +""" |
| 34 | + |
| 35 | + |
@bentoml.asgi_app(openai_api_app, path='/v1')
@bentoml.service(
  name='asteraceae-service',
  traffic={'timeout': 300, 'concurrency': 256},
  resources={'gpu': 1, 'gpu_type': 'nvidia-a100-80gb'},
)
class Engine:
  def __init__(self):
    from transformers import AutoTokenizer
    from vllm import AsyncEngineArgs, AsyncLLMEngine
    import vllm.entrypoints.openai.api_server as vllm_api_server

    ENGINE_ARGS = AsyncEngineArgs(model=MODEL_ID, max_model_len=MAX_TOKENS, enable_prefix_caching=True)
    self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS)
    self.tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

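    # Register vLLM's stock OpenAI-compatible handlers on the FastAPI app mounted at /v1.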
    OPENAI_ENDPOINTS = [
      ["/chat/completions", vllm_api_server.create_chat_completion, ["POST"]],
      ["/completions", vllm_api_server.create_completion, ["POST"]],
      ["/models", vllm_api_server.show_available_models, ["GET"]],
    ]
    for route, endpoint, methods in OPENAI_ENDPOINTS:
      openai_api_app.add_api_route(path=route, endpoint=endpoint, methods=methods)

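    # init_app_state normally receives the argparse Namespace parsed by vLLM's own CLI;
    # recreate the fields it needs by hand here.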
    model_config = self.engine.engine.get_model_config()
    args = Namespace()
    args.model = MODEL_ID
    args.disable_log_requests = True
    args.max_log_len = 1000
    args.response_role = "assistant"
    args.served_model_name = None
    args.chat_template = None
    args.lora_modules = None
    args.prompt_adapters = None
    args.request_logger = None
    args.disable_log_stats = True
    args.return_tokens_as_token_ids = False
    args.enable_tool_call_parser = True
    args.enable_auto_tool_choice = True
    args.tool_call_parser = "llama3_json"
    args.enable_prompt_tokens_details = False

    vllm_api_server.init_app_state(self.engine, model_config, openai_api_app.state, args)

  @bentoml.api
  async def suggests(
    self,
    essay: str,
    num_suggestions: Annotated[int, Le(10)] = 5,
    max_tokens: Annotated[int, Ge(128), Le(MAX_TOKENS)] = MAX_TOKENS,
  ) -> AsyncGenerator[str, None]:
    from vllm import SamplingParams

    SAMPLING_PARAM = SamplingParams(max_tokens=max_tokens, skip_special_tokens=True)
    messages = [
      {"role": "system", "content": SYSTEM_PROMPT.format(num_suggestion=num_suggestions)},
      {"role": "user", "content": essay},
    ]

    prompt = self.tokenizer.apply_chat_template(
      messages,
      tokenize=False,
      add_generation_prompt=True,
    )
    stream = await self.engine.add_request(uuid.uuid4().hex, prompt, SAMPLING_PARAM)

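    # request_output.outputs[0].text holds the full completion generated so far,
    # so track a cursor and yield only the newly produced suffix on each iteration.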
    cursor = 0
    async for request_output in stream:
      text = request_output.outputs[0].text
      yield text[cursor:]
      cursor = len(text)

if __name__ == "__main__":
  Engine.serve_http(port=3000)
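
# Usage sketch (assumes the service is running locally on port 3000):
#
#   # Custom streaming endpoint exposed by `suggests`:
#   curl -X POST http://localhost:3000/suggests \
#     -H 'Content-Type: application/json' \
#     -d '{"essay": "The morning felt like a verdict.", "num_suggestions": 3}'
#
#   # OpenAI-compatible routes are mounted at /v1, e.g. with the `openai` client:
#   # client = OpenAI(base_url='http://localhost:3000/v1', api_key='n/a')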