[Kernel] Support Microsoft Runtime Kernel Lib for our Low Precision Computation - BitBLAS #6036

Open

wants to merge 80 commits into base: main

Changes from 59 commits (80 commits total)

Commits
2be6218
Support Repack from GPTQ.
LeiWang1999 Jul 1, 2024
b92de92
chore: Remove unused input_size and output_size variables in MarlinLi…
LeiWang1999 Jul 1, 2024
71ea469
Support BitNet Model for 1.58bits.
LeiWang1999 Jul 16, 2024
dfa6b2f
Lint Fix
LeiWang1999 Jul 16, 2024
8d2c635
lint fix
LeiWang1999 Jul 16, 2024
41bb18e
Lint Fix for line length
LeiWang1999 Jul 16, 2024
29ac34d
Support Loading 1.58B Model with BitBLAS Format
LeiWang1999 Jul 17, 2024
7f69aef
Improve performance for bitnet
LeiWang1999 Jul 19, 2024
01a789a
Merge branch 'main' of https://github.com/vllm-project/vllm into bitb…
LeiWang1999 Jul 19, 2024
a973123
fix lm_head for gptq model refactor
LeiWang1999 Jul 19, 2024
aea1f4c
linx fix
LeiWang1999 Jul 19, 2024
17128d5
handle compressed scale weight.
LeiWang1999 Aug 13, 2024
1741ed4
lint fix
LeiWang1999 Aug 13, 2024
726a1f7
remove partial weight load for sw
LeiWang1999 Aug 15, 2024
68c8052
apply torch compile for uncompressed weight.
LeiWang1999 Aug 15, 2024
6eb2870
Merge branch 'main' of https://github.com/vllm-project/vllm into bitb…
LeiWang1999 Aug 15, 2024
52418ef
merge bug fix
LeiWang1999 Aug 15, 2024
a15ba12
lint fix
LeiWang1999 Aug 15, 2024
53babae
fix torch compile issue
LeiWang1999 Aug 18, 2024
40a4e53
bug fix.
LeiWang1999 Aug 20, 2024
d316a87
BENCHMARK SCRIPTS
LeiWang1999 Aug 20, 2024
4d40275
Merge branch 'main' of https://github.com/vllm-project/vllm into bitb…
LeiWang1999 Aug 20, 2024
bffc05b
Implement Test
LeiWang1999 Aug 20, 2024
8b0972b
lint fix
LeiWang1999 Aug 20, 2024
8e1a7e8
install bitblas by default to pass the doc gen.
LeiWang1999 Aug 20, 2024
7fbbccf
hide the bitblas import
LeiWang1999 Aug 20, 2024
c487e69
import fix
LeiWang1999 Aug 20, 2024
0d364e7
remove all bitnet related items
LeiWang1999 Aug 21, 2024
d8e9448
remove bitnet
LeiWang1999 Aug 21, 2024
112ef48
format fix
LeiWang1999 Aug 21, 2024
7a88731
remove commen requirements
LeiWang1999 Aug 21, 2024
1fed839
remove dep
LeiWang1999 Aug 21, 2024
4c4cf59
lint fix
LeiWang1999 Aug 21, 2024
68d8ff1
remove red
LeiWang1999 Aug 21, 2024
c4e818c
optimize tuning message
LeiWang1999 Aug 21, 2024
546553f
update
LeiWang1999 Aug 21, 2024
e74368a
Update docs/source/quantization/bitblas.rst
LeiWang1999 Aug 22, 2024
051e11a
Update docs/source/quantization/bitblas.rst
LeiWang1999 Aug 22, 2024
7f6708a
Update docs/source/quantization/bitblas.rst
LeiWang1999 Aug 22, 2024
e3c8159
Update docs/source/quantization/bitblas.rst
LeiWang1999 Aug 22, 2024
b335ea8
Merge branch 'bitblas-intg' of https://github.com/LeiWang1999/vllm-bi…
LeiWang1999 Aug 22, 2024
5b8cc8c
typo fix
LeiWang1999 Aug 23, 2024
6308db5
support bfloat16
LeiWang1999 Aug 23, 2024
763c71e
update recommend installing command
LeiWang1999 Aug 23, 2024
2ad2a14
Merge branch 'main' of https://github.com/vllm-project/vllm into bitb…
LeiWang1999 Aug 23, 2024
31a7bbd
lint fix
LeiWang1999 Aug 23, 2024
4f7ab2f
Merge branch 'main' of https://github.com/vllm-project/vllm into bitb…
LeiWang1999 Oct 23, 2024
92e3a08
Add commit_id.py with commit hash
LeiWang1999 Oct 23, 2024
cd924dd
lint fix
LeiWang1999 Oct 23, 2024
b8bb550
Merge branch 'main' of https://github.com/vllm-project/vllm into bitb…
LeiWang1999 Nov 1, 2024
9d9e115
BitBLAS quantization updates
Nov 11, 2024
c190b2e
Merge branch 'main' of https://github.com/vllm-project/vllm into bitb…
Nov 11, 2024
ae7a169
revert marlin change
LeiWang1999 Nov 13, 2024
b2a82e6
also supports sm 70
LeiWang1999 Nov 13, 2024
757aed0
replace parameter
LeiWang1999 Nov 13, 2024
36ba50a
review handling
LeiWang1999 Nov 13, 2024
8359a98
remove commit id
LeiWang1999 Nov 13, 2024
742be3d
lint fix
LeiWang1999 Nov 13, 2024
fa1c932
remove debug print
LeiWang1999 Nov 13, 2024
df7a5f6
bug fix
LeiWang1999 Dec 19, 2024
9adb4d5
lint fix
LeiWang1999 Dec 19, 2024
f0a1ec3
Merge branch 'main' of https://github.com/vllm-project/vllm into bitb…
LeiWang1999 Dec 19, 2024
7d5dd06
merge upstream
LeiWang1999 Dec 19, 2024
8d7881b
modify commit id
LeiWang1999 Dec 19, 2024
217fa5a
add bitblas to index
LeiWang1999 Dec 19, 2024
d6586e7
Update docs/source/quantization/bitblas.rst
LeiWang1999 Dec 19, 2024
430ca44
Update docs/source/quantization/bitblas.rst
LeiWang1999 Dec 19, 2024
f2af59e
force use MINIMUM_BITBLAS_VERSION
LeiWang1999 Dec 19, 2024
d868cac
lint fix
LeiWang1999 Dec 19, 2024
6ec9800
lint fix
LeiWang1999 Dec 19, 2024
bcbad57
lint fix
LeiWang1999 Dec 19, 2024
6cc9022
Merge branch 'bitblas-intg' of https://github.com/LeiWang1999/vllm-bi…
LeiWang1999 Dec 19, 2024
3703449
conflict resolved
LeiWang1999 Dec 20, 2024
3343ace
Merge branch 'main' of https://github.com/vllm-project/vllm into bitb…
LeiWang1999 Feb 7, 2025
8ef6545
Add BitBLASLinearKernel for optimized mixed precision linear operations
LeiWang1999 Feb 7, 2025
f6e9a35
Enhance BitBLAS support by adding tile size handling and adjusting sh…
LeiWang1999 Feb 12, 2025
4160497
Add BitBLAS documentation and remove obsolete RST file
LeiWang1999 Feb 13, 2025
2bc4dec
fix comments
LeiWang1999 Feb 13, 2025
1050f92
Update supported hardware documentation to include BitBLAS compatibility
LeiWang1999 Feb 13, 2025
4a5ebb4
Implement additional optimizations for BitBLAS performance and stability
LeiWang1999 Feb 13, 2025
Files changed
686 changes: 686 additions & 0 deletions benchmarks/kernels/benchmark_bitblas.py

Large diffs are not rendered by default.

41 changes: 41 additions & 0 deletions docs/source/quantization/bitblas.rst
@@ -0,0 +1,41 @@
.. _bitblas:

BitBLAS
==================

vLLM now supports `BitBLAS <https://github.com/microsoft/BitBLAS>`_ for more efficient and flexible model inference.
Compared to other quantization frameworks, BitBLAS supports a wider range of precision combinations.

Below are the steps to utilize BitBLAS with vLLM.

.. code-block:: console

    $ pip install "bitblas>=0.0.1.dev15"

vLLM reads the model's config file and supports pre-quantized checkpoints.

You can find pre-quantized models at https://huggingface.co/models?other=bitblas, https://huggingface.co/models?other=bitnet, or https://huggingface.co/models?other=gptq.

These repositories usually contain a ``quantize_config.json`` file that includes a ``quantization_config`` section.
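
The snippet below is a minimal sketch of how you might inspect that configuration before loading the model. It assumes the repository actually ships a ``quantize_config.json``; the exact file name and fields vary between checkpoints.

.. code-block:: python

    import json

    from huggingface_hub import hf_hub_download

    # Assumption: the repository stores its quantization settings in
    # quantize_config.json; adjust the file name if your checkpoint differs.
    config_path = hf_hub_download(repo_id="hxbgsyxh/llama-13b-4bit-g-1-bitblas",
                                  filename="quantize_config.json")
    with open(config_path) as f:
        print(json.dumps(json.load(f), indent=2))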

Read a BitBLAS format checkpoint
----------------------------------

.. code-block:: python

    from vllm import LLM
    import torch

    # "hxbgsyxh/llama-13b-4bit-g-1-bitblas" is a pre-quantized BitBLAS-format checkpoint.
    model_id = "hxbgsyxh/llama-13b-4bit-g-1-bitblas"
    llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True,
              quantization="bitblas")
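
Once the ``LLM`` object is built, generation works the same as for any other vLLM model. The snippet below is an untested sketch; the prompt and sampling settings are arbitrary.

.. code-block:: python

    from vllm import SamplingParams

    prompts = ["What is the capital of France?"]
    sampling_params = SamplingParams(temperature=0.0, max_tokens=32)

    # Reuse the llm object created above with quantization="bitblas".
    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        print(output.outputs[0].text)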

Read a GPTQ format checkpoint
-------------------------------

.. code-block:: python

    from vllm import LLM
    import torch

    # "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized GPTQ checkpoint.
    model_id = "hxbgsyxh/llama-13b-4bit-g-1"
    llm = LLM(model=model_id, dtype=torch.float16, trust_remote_code=True,
              quantization="bitblas", max_model_len=1024)
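
When a GPTQ checkpoint is passed with ``quantization="bitblas"``, vLLM repacks the GPTQ weights into the BitBLAS layout as the model is loaded (see the ``Support Repack from GPTQ`` commit in this PR), so no separate offline conversion step should be needed.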

62 changes: 62 additions & 0 deletions tests/models/test_bitblas.py
@@ -0,0 +1,62 @@
"""Compare the outputs of a GPTQ model to a bitblas model.

Note: GPTQ and bitblas do not have bitwise correctness.
As a result, in this test, we just confirm that the top selected tokens of the
bitblas/GPTQ models are in the top 3 selections of each other.

Note: bitblas internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for bitblas. As a result, we re-run the
test up to 3 times to see if we pass.

Run `pytest tests/models/test_bitblas.py`.
"""
from dataclasses import dataclass

import pytest

from .utils import check_logprobs_close


@dataclass
class ModelPair:
model_bitblas: str
model_gptq: str


model_pairs = [
ModelPair(model_bitblas="hxbgsyxh/opt-125m-4bit-128g-bitblas",
model_gptq="hxbgsyxh/opt-125m-4bit-128g"),
]


@pytest.mark.flaky(reruns=2)
@pytest.mark.skipif(True, reason="BitBLAS takes too much time for tuning.")
@pytest.mark.parametrize("model_pair", model_pairs)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
vllm_runner,
example_prompts,
model_pair: ModelPair,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(model_pair.model_bitblas,
dtype=dtype,
quantization="bitblas") as bitblas_model:
bitblas_outputs = bitblas_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)

with vllm_runner(model_pair.model_gptq, dtype=dtype,
quantization="gptq") as gptq_model:
gptq_outputs = gptq_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)

check_logprobs_close(
outputs_0_lst=gptq_outputs,
outputs_1_lst=bitblas_outputs,
name_0="gptq",
name_1="bitblas",
)
60 changes: 60 additions & 0 deletions tests/models/test_gptq_bitblas.py
@@ -0,0 +1,60 @@
"""Compare the outputs of a GPTQ model to a bitblas model.

Note: GPTQ and bitblas do not have bitwise correctness.
As a result, in this test, we just confirm that the top selected tokens of the
bitblas/GPTQ models are in the top 3 selections of each other.

Note: bitblas internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for bitblas. As a result, we re-run the
test up to 3 times to see if we pass.

Run `pytest tests/models/test_bitblas.py`.
"""
from dataclasses import dataclass

import pytest

from .utils import check_logprobs_close


@dataclass
class ModelPair:
model_gptq: str


model_pairs = [
ModelPair(model_gptq="hxbgsyxh/opt-125m-4bit-128g"),
]


@pytest.mark.flaky(reruns=2)
@pytest.mark.skipif(True, reason="BitBLAS takes too much time for tuning.")
@pytest.mark.parametrize("model_pair", model_pairs)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
vllm_runner,
example_prompts,
model_pair: ModelPair,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(model_pair.model_gptq,
dtype=dtype,
quantization="bitblas") as bitblas_model:
bitblas_outputs = bitblas_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)

with vllm_runner(model_pair.model_gptq, dtype=dtype,
quantization="gptq") as gptq_model:
gptq_outputs = gptq_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)

check_logprobs_close(
outputs_0_lst=gptq_outputs,
outputs_1_lst=bitblas_outputs,
name_0="gptq",
name_1="gptq_bitblas",
)
6 changes: 5 additions & 1 deletion tests/quantization/test_lm_head.py
@@ -7,7 +7,10 @@
import pytest
import torch

from vllm.model_executor.layers.quantization.bitblas import BitBLASLinearMethod
from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
from vllm.model_executor.layers.quantization.gptq_bitblas import (
GPTQBitBLASLinearMethod)
from vllm.model_executor.layers.quantization.gptq_marlin import (
GPTQMarlinLinearMethod)
from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod
@@ -36,7 +39,8 @@ def test_lm_head(
if lm_head_quantized:
assert isinstance(
lm_head_layer.linear_method,
(GPTQLinearMethod, GPTQMarlinLinearMethod, MarlinLinearMethod))
(GPTQLinearMethod, GPTQMarlinLinearMethod, MarlinLinearMethod,
GPTQBitBLASLinearMethod, BitBLASLinearMethod))
else:
assert isinstance(lm_head_layer.linear_method,
UnquantizedEmbeddingMethod)
66 changes: 48 additions & 18 deletions vllm/config.py
@@ -388,8 +388,8 @@ def _verify_quantization(self) -> None:
]
optimized_quantization_methods = [
"fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
"awq_marlin", "fbgemm_fp8", "compressed_tensors",
"compressed-tensors", "experts_int8"
"gptq_bitblas", "bitblas", "awq_marlin", "fbgemm_fp8",
"compressed_tensors", "compressed-tensors", "experts_int8"
]
tpu_supported_quantization = ["tpu_int8"]
neuron_supported_quantization = ["neuron_quant"]
@@ -584,10 +584,27 @@ def get_vocab_size(self) -> int:
def get_hidden_size(self) -> int:
return self.hf_text_config.hidden_size

def find_flash_attn_supported_head_dims(self, head_dim: int) -> int:
"""
Find the closest head dimension to the given head dimension that
is supported by Flash Attention.
"""
from vllm.attention.backends.flash_attn import FlashAttentionBackend

FLASHATTN_SUPPORTED_HEAD_DIMS = (
FlashAttentionBackend.get_supported_head_sizes())

for supported_head_dim in FLASHATTN_SUPPORTED_HEAD_DIMS:
if head_dim <= supported_head_dim:
return supported_head_dim
raise ValueError(
f"Head dimension {head_dim} is not supported by Flash Attention."
f"Supported head dimensions are {FLASHATTN_SUPPORTED_HEAD_DIMS}.")

def get_head_size(self) -> int:
# TODO remove hard code
if hasattr(self.hf_text_config, "model_type"
) and self.hf_text_config.model_type == 'deepseek_v2':
if (hasattr(self.hf_text_config, "model_type")
and self.hf_text_config.model_type == "deepseek_v2"):
# FlashAttention supports only head_size 32, 64, 128, 256,
# we need to pad head_size 192 to 256
return 256
@@ -623,8 +640,11 @@ def get_total_num_kv_heads(self) -> int:
return self.hf_config.attn_config["kv_n_heads"]
return self.hf_config.num_attention_heads
if self.hf_config.model_type == "dbrx":
return getattr(self.hf_config.attn_config, "kv_n_heads",
self.hf_config.num_attention_heads)
return getattr(
self.hf_config.attn_config,
"kv_n_heads",
self.hf_config.num_attention_heads,
)

if self.is_attention_free:
return 0
@@ -814,6 +834,7 @@ class TokenizerPoolConfig:
The way the config will be used depends on the
pool type.
"""

pool_size: int
pool_type: Union[str, Type["BaseTokenizerGroup"]]
extra_config: dict
@@ -849,9 +870,11 @@ def create_config(
else:
tokenizer_pool_extra_config_parsed = (
tokenizer_pool_extra_config or {})
tokenizer_pool_config = cls(tokenizer_pool_size,
tokenizer_pool_type,
tokenizer_pool_extra_config_parsed)
tokenizer_pool_config = cls(
tokenizer_pool_size,
tokenizer_pool_type,
tokenizer_pool_extra_config_parsed,
)
else:
tokenizer_pool_config = None
return tokenizer_pool_config
@@ -1006,6 +1029,7 @@ def __init__(
# current node and we aren't in a ray placement group.

from vllm.executor import ray_utils

backend = "mp"
ray_found = ray_utils.ray_is_available()
if (current_platform.is_cuda()
@@ -1021,8 +1045,10 @@
backend = "ray"
else:
from ray import is_initialized as ray_is_initialized

if ray_is_initialized():
from ray.util import get_current_placement_group

if get_current_placement_group():
backend = "ray"
self.distributed_executor_backend = backend
@@ -1380,8 +1406,8 @@ def maybe_create_spec_config(

draft_hf_config = draft_model_config.hf_config

if (num_speculative_tokens is not None
and hasattr(draft_hf_config, "num_lookahead_tokens")):
if num_speculative_tokens is not None and hasattr(
draft_hf_config, "num_lookahead_tokens"):
draft_hf_config.num_lookahead_tokens = num_speculative_tokens

n_predict = getattr(draft_hf_config, "n_predict", None)
@@ -1713,7 +1739,8 @@ def verify_with_model_config(self, model_config: ModelConfig):
elif isinstance(self.lora_dtype, str):
self.lora_dtype = getattr(torch, self.lora_dtype)
if model_config.quantization and model_config.quantization not in [
"awq", "gptq"
"awq",
"gptq",
]:
# TODO support marlin
logger.warning("%s quantization is not tested with LoRA yet.",
@@ -1876,8 +1903,8 @@ def _get_and_verify_max_len(
for key in possible_keys:
max_len = getattr(hf_config, key, None)
if max_len is not None:
max_len_key = key if max_len < derived_max_model_len \
else max_len_key
max_len_key = (key
if max_len < derived_max_model_len else max_len_key)
derived_max_model_len = min(derived_max_model_len, max_len)

# If sliding window is manually disabled, max_length should be less
@@ -1906,8 +1933,10 @@
logger.warning(
"The model's config.json does not contain any of the following "
"keys to determine the original maximum length of the model: "
"%s. Assuming the model's maximum length is %d.", possible_keys,
default_max_len)
"%s. Assuming the model's maximum length is %d.",
possible_keys,
default_max_len,
)
derived_max_model_len = default_max_len

rope_scaling = getattr(hf_config, "rope_scaling", None)
@@ -2001,10 +2030,10 @@ class DecodingConfig:
"""Dataclass which contains the decoding strategy of the engine"""

# Which guided decoding algo to use. 'outlines' / 'lm-format-enforcer'
guided_decoding_backend: str = 'outlines'
guided_decoding_backend: str = "outlines"

def __post_init__(self):
valid_guided_backends = ['outlines', 'lm-format-enforcer']
valid_guided_backends = ["outlines", "lm-format-enforcer"]
backend = self.guided_decoding_backend
if backend not in valid_guided_backends:
raise ValueError(f"Invalid guided_decoding_backend '{backend},"
@@ -2014,6 +2043,7 @@ def __post_init__(self):
@dataclass
class ObservabilityConfig:
"""Configuration for observability."""

otlp_traces_endpoint: Optional[str] = None

# Collecting detailed timing information for each request can be expensive.