Commit 18662dc

change 5 pytorch/huggingface models to fp16 (#11894)

1 parent 5c4ed00 · commit 18662dc

File tree: 7 files changed, +7 -7 lines

python/llm/example/GPU/HuggingFace/LLM/codellama/generate.py (+1 -1)

@@ -47,7 +47,7 @@
 optimize_model=False,
 trust_remote_code=True,
 use_cache=True)
-model = model.to('xpu')
+model = model.half().to('xpu')

 # Load tokenizer
 tokenizer = CodeLlamaTokenizer.from_pretrained(model_path,

python/llm/example/GPU/HuggingFace/LLM/internlm/generate.py (+1 -1)

@@ -47,7 +47,7 @@
 optimize_model=False,
 trust_remote_code=True,
 use_cache=True)
-model = model.to('xpu')
+model = model.half().to('xpu')

 # Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_path,

python/llm/example/GPU/HuggingFace/LLM/solar/generate.py (+1 -1)

@@ -47,7 +47,7 @@
 load_in_4bit=True,
 trust_remote_code=True,
 use_cache=True)
-model = model.to('xpu')
+model = model.half().to('xpu')

 # Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_path,
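The three HuggingFace/LLM diffs above make the same one-line change: cast the model to fp16 before moving it to the Intel GPU. A minimal sketch of that load path, assuming ipex-llm's AutoModelForCausalLM wrapper and a placeholder model_path (the real example scripts pass model-specific arguments and a generation loop not shown here):

# Minimal sketch, not the full example script; model_path is a placeholder.
from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_path = "path/to/model"  # placeholder, not taken from the diff

# Load with ipex-llm's low-bit optimizations, as the example scripts do.
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             trust_remote_code=True,
                                             use_cache=True)

# The change this commit applies in each file: cast the remaining fp32
# weights to fp16 before moving the model to the 'xpu' device.
model = model.half().to('xpu')

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)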

python/llm/example/GPU/PyTorch-Models/Model/codellama/generate.py (+1 -1)

@@ -50,7 +50,7 @@
 # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
 model = optimize_model(model)

-model = model.to('xpu')
+model = model.half().to('xpu')

 # Load tokenizer
 tokenizer = CodeLlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)

python/llm/example/GPU/PyTorch-Models/Model/internlm2/generate.py (+1 -1)

@@ -46,7 +46,7 @@
 use_cache=True)
 model = optimize_model(model)

-model = model.to('xpu')
+model = model.half().to('xpu')

 # Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_path,

python/llm/example/GPU/PyTorch-Models/Model/mistral/generate.py (+1 -1)

@@ -49,7 +49,7 @@
 # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
 model = optimize_model(model)

-model = model.to('xpu')
+model = model.half().to('xpu')

 # Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

python/llm/example/GPU/PyTorch-Models/Model/solar/generate.py (+1 -1)

@@ -49,7 +49,7 @@
 # When running LLMs on Intel iGPUs for Windows users, we recommend setting `cpu_embedding=True` in the optimize_model function.
 # This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.
 model = optimize_model(model)
-model = model.to('xpu')
+model = model.half().to('xpu')

 # Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
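The four PyTorch-Models diffs apply the same fp16 cast after ipex-llm's optimize_model step rather than after the ipex-llm AutoModel wrapper. A hedged sketch of that flow, again with a placeholder model path:

# Minimal sketch of the PyTorch-Models flow; model_path is a placeholder.
from transformers import AutoModelForCausalLM, AutoTokenizer
from ipex_llm import optimize_model

model_path = "path/to/model"  # placeholder, not taken from the diff

model = AutoModelForCausalLM.from_pretrained(model_path,
                                             trust_remote_code=True,
                                             use_cache=True)

# On Intel iGPUs under Windows, the examples recommend
# optimize_model(model, cpu_embedding=True) so the memory-intensive
# embedding layer runs on the CPU instead of the iGPU.
model = optimize_model(model)

# The change this commit applies: cast to fp16 before moving to 'xpu'.
model = model.half().to('xpu')

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)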
