[Model] Support Mamba2 (Codestral Mamba) #9292

Open · wants to merge 17 commits into main
29 changes: 22 additions & 7 deletions tests/models/decoder_only/language/test_mamba.py
@@ -4,14 +4,22 @@
Run `pytest tests/models/test_mamba.py`.
"""
import pytest
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from vllm.engine.arg_utils import EngineArgs
from vllm.sampling_params import SamplingParams

from ...utils import check_outputs_equal

MODELS = ["state-spaces/mamba-130m-hf", "tiiuae/falcon-mamba-tiny-dev"]
MODELS = [
"state-spaces/mamba-130m-hf",
"tiiuae/falcon-mamba-tiny-dev",
# TODO: Compare to a Mamba2 model. The HF transformers implementation of
# Mamba2 is buggy for Codestral as it doesn't handle n_groups.
# See https://github.com/huggingface/transformers/pull/35943
# "mistralai/Mamba-Codestral-7B-v0.1",
]


# Use lower-level interfaces to create this greedy generator, as mamba will
@@ -21,6 +29,10 @@ def generate_greedy(model_name, example_prompts, max_tokens):
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Set the device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Generate texts from the prompts
outputs = []
for prompt in example_prompts:
@@ -29,7 +41,9 @@ def generate_greedy(model_name, example_prompts, max_tokens):
input_ids = inputs["input_ids"].to(model.device)

# Generate text using the model's generate method directly
generated_ids = model.generate(input_ids, max_new_tokens=max_tokens)
generated_ids = model.generate(input_ids,
max_new_tokens=max_tokens,
do_sample=False)
generated_text = tokenizer.decode(generated_ids[0],
skip_special_tokens=True)

@@ -50,7 +64,8 @@ def test_models(
) -> None:
hf_outputs = generate_greedy(model, example_prompts, max_tokens)

with vllm_runner(model, dtype=dtype) as vllm_model:
# Set max_num_seqs to keep Codestral from going OOM at fp32
with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

# This test is for verifying whether the model's extra_repr
@@ -81,7 +96,7 @@ def test_batching(
) -> None:
# To pass the small model tests, we need full precision.
for_loop_outputs = []
with vllm_runner(model, dtype=dtype) as vllm_model:
with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
for prompt in example_prompts:
for_loop_outputs.append(
vllm_model.generate_greedy([prompt], max_tokens)[0])
@@ -165,7 +180,7 @@ def test_parallel_sampling(
max_tokens: int,
) -> None:

with vllm_runner(model, dtype=dtype) as vllm_model:
with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
for_loop_outputs = []
for _ in range(10):
for_loop_outputs.append(
@@ -232,7 +247,7 @@ def test_models_preemption_recompute(
# Tests that outputs are identical with and without preemptions (recompute)
assert dtype == "float"

with vllm_runner(model, dtype=dtype) as vllm_model:
with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
vllm_model.model.llm_engine.scheduler[
0].ENABLE_ARTIFICIAL_PREEMPT = True
preempt_vllm_outputs = vllm_model.generate_greedy(
@@ -283,7 +298,7 @@ def test_state_cleanup(
# This test is for verifying that the Mamba state is cleaned up between
# steps. If it is not cleaned up, an error would be expected.
try:
with vllm_runner(model, dtype=dtype) as vllm_model:
with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
for _ in range(10):
vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
except ValueError:
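Note: the max_num_seqs=16 cap threaded through the hunks above bounds how many sequences the vLLM scheduler runs concurrently, which is what keeps the fp32 Codestral Mamba2 run from going OOM. A minimal, hypothetical usage sketch of the same setting outside the test harness (the model name and token budget are placeholders, assuming the public vllm.LLM entry point):

# Illustrative sketch only; not part of this PR's diff.
from vllm import LLM, SamplingParams

# max_num_seqs bounds the scheduler's concurrent batch; smaller values
# trade throughput for a lower peak memory footprint.
llm = LLM(model="state-spaces/mamba-130m-hf",
          dtype="float32",
          max_num_seqs=16)

params = SamplingParams(temperature=0.0, max_tokens=32)  # greedy decoding
outputs = llm.generate(["The capital of France is"], params)
print(outputs[0].outputs[0].text)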
8 changes: 6 additions & 2 deletions vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py
@@ -293,7 +293,8 @@ def _chunk_scan_fwd_kernel(
dA_cs_m_boundary = tl.load(
dA_cumsum_ptr +
(pid_m * BLOCK_SIZE_M + c_off - 1) * stride_dA_cs_csize,
mask=(pid_m * BLOCK_SIZE_M + c_off - 1) > -1,
mask=(((pid_m * BLOCK_SIZE_M + c_off - 1) > -1)
and ((pid_m * BLOCK_SIZE_M + c_off) < chunk_size)),
other=0.0).to(tl.float32)

if HAS_SEQ_IDX:
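The widened mask above guards the boundary load on both sides: the cumulative-sum entry one position before the block's first row is only read when that index lies inside [0, chunk_size), and otherwise the load falls back to other=0.0. A rough scalar Python sketch of the same guard, with hypothetical names, for readers less familiar with Triton masks:

# Illustrative sketch only; the real kernel expresses this via tl.load's mask.
def load_dA_cs_boundary(dA_cumsum, pid_m, block_size_m, c_off, chunk_size):
    idx = pid_m * block_size_m + c_off - 1
    # Lower bound: the row before the block must exist (idx > -1).
    # Upper bound: it must still lie inside the current chunk
    # (idx + 1 < chunk_size), which is the condition this PR adds.
    if idx > -1 and idx + 1 < chunk_size:
        return float(dA_cumsum[idx])
    return 0.0  # mirrors other=0.0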
@@ -463,7 +464,10 @@ def _seq_idx_to_chunk_indices_offsets(seq_idx, chunk_size: int):
p += (s % chunk_size > 0)

# get the dimensions
_s, _e = s // chunk_size + p, e // chunk_size + p + 1
# - the + 1 for _e is to shift the boundary by one chunk
# - this shifting is not needed if chunk_size divides e
_s, _e = s // chunk_size + p, e // chunk_size + p + (e % chunk_size
> 0)

# adjust indices and offsets
chunk_indices[_s:_e] -= p
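The change above only extends the upper chunk bound when the sequence end e spills into a partial chunk; the previous unconditional + 1 over-extended the range whenever chunk_size divides e. A small, self-contained Python illustration of the boundary arithmetic (a hypothetical helper, not the function in this file, which additionally loops over all sequences in seq_idx):

# Illustrative sketch only: map a token span [s, e) onto the half-open
# chunk range [_s, _e) after p boundary chunks have already been inserted.
def chunk_range(s, e, p, chunk_size=4):
    # A sequence that starts mid-chunk introduces one extra boundary chunk.
    p += (s % chunk_size > 0)
    _s = s // chunk_size + p
    # Only extend _e past the last full chunk when e is not a multiple of
    # chunk_size; this is the correction made in the diff above.
    _e = e // chunk_size + p + (e % chunk_size > 0)
    return _s, _e

print(chunk_range(s=2, e=8, p=0))  # (1, 3): e lands on a chunk boundary
print(chunk_range(s=2, e=9, p=0))  # (1, 4): e spills into a partial chunk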