
Commit ac3d53f

Authored Jun 4, 2024
LLM: Fix vLLM CPU version error (#11206)
Fix vLLM CPU version error
1 parent 3ef4aa9 commit ac3d53f

File tree

5 files changed: +94 −17 lines changed

  docker/llm/serving/cpu/docker/start-vllm-service.sh
  docker/llm/serving/cpu/docker/vllm_offline_inference.py
  python/llm/example/CPU/vLLM-Serving/offline_inference.py
  python/llm/src/ipex_llm/transformers/convert.py
  python/llm/src/ipex_llm/vllm/cpu/model_convert.py
 

docker/llm/serving/cpu/docker/start-vllm-service.sh

+1 −1

@@ -11,7 +11,7 @@ python -m ipex_llm.vllm.cpu.entrypoints.openai.api_server \
   --device cpu \
   --dtype bfloat16 \
   --enforce-eager \
-  --load-in-low-bit sym_int4 \
+  --load-in-low-bit bf16 \
   --max-model-len 4096 \
   --max-num-batched-tokens 10240 \
   --max-num-seqs 12 \
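Once the service launched by this script is up, it can be exercised through vLLM's OpenAI-compatible REST API. A minimal client sketch follows; the port (8000) and the served model name are assumptions, since neither appears in this hunk, and should match whatever the full script passes to the api_server.

import requests

# Query the OpenAI-compatible /v1/completions endpoint exposed by the api_server.
# Port 8000 and the model name below are placeholders/assumptions.
resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "YOUR_MODEL_PATH",   # assumed: the value passed to --model in the script
        "prompt": "San Francisco is a",
        "max_tokens": 64,
        "temperature": 0.0,
    },
)
print(resp.json()["choices"][0]["text"])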

docker/llm/serving/cpu/docker/vllm_offline_inference.py

+1 −1

@@ -49,7 +49,7 @@
           device="cpu",
           dtype="bfloat16",
           enforce_eager=True,
-          load_in_low_bit="sym_int4",
+          load_in_low_bit="bf16",
           tensor_parallel_size=1)
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
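For context, this is roughly how the surrounding constructor reads after the change. The prompt list, sampling parameters, and the LLM import live above this hunk and are reproduced here as assumptions, not as part of the diff.

# Sketch of the call site around this hunk (values outside the diff are assumed).
prompts = ["Hello, my name is", "The capital of France is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(model="YOUR_MODEL_PATH",     # placeholder model path
          device="cpu",
          dtype="bfloat16",
          enforce_eager=True,
          load_in_low_bit="bf16",      # was "sym_int4" before this commit
          tensor_parallel_size=1)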

python/llm/example/CPU/vLLM-Serving/offline_inference.py

+1 −1

@@ -46,7 +46,7 @@
 
 # Create an LLM.
 # llm = LLM(model="facebook/opt-125m")
-llm = LLM(model="YOUR_MODEL_PATH", device="cpu", load_in_low_bit="sym_int4")
+llm = LLM(model="YOUR_MODEL_PATH", device="cpu", load_in_low_bit="bf16")
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
 outputs = llm.generate(prompts, sampling_params)
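The RequestOutput objects mentioned in the comments can then be inspected with the usual vLLM pattern (standard usage, not part of this diff):

# Print the prompt and first generated completion of each RequestOutput.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")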

python/llm/src/ipex_llm/transformers/convert.py

+24 −5

@@ -54,6 +54,7 @@
 
 _IS_VLLM_AVAILABLE = None
 _USE_VLLM = False
+_VLLM_VERSION = None
 
 
 def is_auto_gptq_available():
@@ -77,6 +78,14 @@ def is_vllm_available():
     return _IS_VLLM_AVAILABLE
 
 
+def get_package_version(package_name):
+    result = subprocess.run(['pip', 'list'], capture_output=True, text=True)
+    for line in result.stdout.splitlines():
+        if line.startswith(package_name):
+            return line.split()[1]
+    return None
+
+
 def get_use_vllm():
     return _USE_VLLM
 
@@ -133,13 +142,24 @@ def is_linear_module(module):
     is_awq = is_auto_awq_available() and isinstance(module, WQLinear_GEMM)
     if is_vllm_available():
         # Only convert vllm modules
+        global _VLLM_VERSION
+        if _VLLM_VERSION is None:
+            _VLLM_VERSION = get_package_version('vllm')
+        if 'xpu' in _VLLM_VERSION:
+            # For vllm xpu
+            from vllm.model_executor.parallel_utils.parallel_state import (
+                get_tensor_model_parallel_group,
+                get_tensor_model_parallel_world_size
+            )
+            tp_size = get_tensor_model_parallel_world_size()
+        else:
+            # For vllm cpu
+            tp_size = 1
+
         from vllm.model_executor.layers.linear import (
             ColumnParallelLinear, RowParallelLinear, QKVParallelLinear, MergedColumnParallelLinear
         )
-        from vllm.model_executor.parallel_utils.parallel_state import (
-            get_tensor_model_parallel_group,
-            get_tensor_model_parallel_world_size
-        )
+
         VLLM_LINEAR_LIST = [
             ColumnParallelLinear, RowParallelLinear, QKVParallelLinear, MergedColumnParallelLinear
         ]
@@ -148,7 +168,6 @@ def is_linear_module(module):
             out_features = module.output_size
             result = True
             mp_group = None
-            tp_size = get_tensor_model_parallel_world_size()
             if isinstance(module, RowParallelLinear) and tp_size >= 2:
                 mp_group = get_tensor_model_parallel_group()
                 in_features = module.input_size_per_partition
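The new branch keys off the installed vLLM build: if the version string reported by pip contains 'xpu', the tensor-parallel helpers are imported and queried; otherwise (the CPU build) tp_size is pinned to 1, so the removed get_tensor_model_parallel_world_size() call is never reached. A standalone sketch of that probe, assuming only that pip is on PATH and that XPU builds report 'xpu' in their version string:

import subprocess

def get_package_version(package_name):
    # Scan `pip list` output and return the version column of the first match.
    result = subprocess.run(['pip', 'list'], capture_output=True, text=True)
    for line in result.stdout.splitlines():
        if line.startswith(package_name):
            return line.split()[1]
    return None

version = get_package_version('vllm')
if version is not None and 'xpu' in version:
    print("XPU build detected:", version)   # tensor-parallel size comes from vLLM
else:
    print("CPU build assumed:", version)    # tp_size is fixed to 1 in is_linear_module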

python/llm/src/ipex_llm/vllm/cpu/model_convert.py

+67 −9

@@ -24,8 +24,10 @@
 from vllm.attention import Attention, AttentionMetadata
 from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
 from vllm.config import DeviceConfig
-from typing import Tuple
+
+from vllm._C import ops
 from ipex_llm.utils.common import invalidInputError
+from typing import List, Optional, Tuple, Union
 
 
 def _MLP_forward(self, x):
@@ -42,7 +44,7 @@ def _Attention_forward(
     kv_cache: torch.Tensor,
     attn_metadata: AttentionMetadata,
 ) -> torch.Tensor:
-    qkv = self.qkv_proj(hidden_states)
+    qkv = self.qkv_proj(hidden_states).to(dtype=kv_cache.dtype)
     q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
     q, k = self.rotary_emb(positions, q, k)
     attn_output = self.attn(q, k, v, kv_cache, attn_metadata, self.kv_scale)
@@ -145,21 +147,77 @@ def _model_attention_convert():
 
 
 def _ipex_llm_convert(load_in_low_bit):
-    from vllm.worker.model_runner import ModelRunner
+    from vllm.worker.cpu_model_runner import CPUModelRunner
     import vllm.model_executor.model_loader as model_loader
-    setattr(ModelRunner, "load_model", get_load_function(load_in_low_bit))
+    setattr(CPUModelRunner, "load_model", get_load_function(load_in_low_bit))
+
+    from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
+    setattr(RotaryEmbedding, "forward", _ipex_llm_rotary_embedding_forward)
+    from vllm.model_executor.layers.layernorm import RMSNorm
+    setattr(RMSNorm, "forward", _ipex_llm_rmsnorm_forward)
+
+
+def _ipex_llm_rotary_embedding_forward(
+    self,
+    positions: torch.Tensor,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    offsets: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    self.cos_sin_cache = self.cos_sin_cache.to(positions.device, dtype=query.dtype)
+
+    # ops.rotary_embedding()/batched_rotary_embedding()
+    # are in-place operations that update the query and key tensors.
+    if offsets is not None:
+        ops.batched_rotary_embedding(positions, query, key, self.head_size,
+                                     self.cos_sin_cache,
+                                     self.is_neox_style, self.rotary_dim,
+                                     offsets)
+    else:
+        ops.rotary_embedding(positions, query, key, self.head_size,
+                             self.cos_sin_cache, self.is_neox_style)
+    return query, key
+
+
+def _ipex_llm_rmsnorm_forward(
+    self,
+    x: torch.Tensor,
+    residual: Optional[torch.Tensor] = None,
+) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+    x = x.to(dtype=self.weight.data.dtype)
+    if residual is not None:
+        residual = residual.to(dtype=self.weight.data.dtype)
+        ops.fused_add_rms_norm(
+            x,
+            residual,
+            self.weight.data,
+            self.variance_epsilon,
+        )
+        return x, residual
+    out = torch.empty_like(x)
+    ops.rms_norm(
+        out,
+        x,
+        self.weight.data,
+        self.variance_epsilon,
+    )
+    return out
 
 
 def get_load_function(low_bit):
     def _ipex_llm_load_model(self) -> None:
         _model_mlp_convert()
         _model_attention_convert()
 
-        self.model = get_model(self.model_config,
-                               self.device_config,
-                               lora_config=self.lora_config,
-                               parallel_config=self.parallel_config,
-                               scheduler_config=self.scheduler_config)
+        self.model = get_model(
+            model_config=self.model_config,
+            load_config=self.load_config,
+            device_config=self.device_config,
+            vision_language_config=self.vision_language_config,
+            lora_config=self.lora_config,
+            parallel_config=self.parallel_config,
+            scheduler_config=self.scheduler_config)
+
         from ipex_llm import optimize_model
         optimize_model(self.model, low_bit=low_bit, torch_dtype=self.model_config.dtype)
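Taken together, _ipex_llm_convert swaps in the CPU model runner's load path and the ops-backed RMSNorm/RotaryEmbedding forwards before any model is loaded. A heavily hedged sketch of how that patching is exercised; the explicit call below is for illustration only, since the actual wiring (driven by --load-in-low-bit in the entrypoints) is not shown in this diff:

# Illustration only: patch first, then build the engine, so CPUModelRunner.load_model
# (and the RMSNorm / RotaryEmbedding forwards) are already the ipex-llm versions when
# the model is loaded and optimize_model(..., low_bit="bf16") runs on its weights.
from ipex_llm.vllm.cpu.model_convert import _ipex_llm_convert

_ipex_llm_convert(load_in_low_bit="bf16")
# ...then construct the vLLM engine / LLM as in the offline-inference examples above...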
