Commit 10ee786

Replace with IPEX-LLM in example comments (#10671)

* Replace with IPEX-LLM in example comments
* More replacement
* revert some changes
1 parent 08018a1 commit 10ee786

159 files changed, +183 -183 lines changed


python/llm/example/CPU/Applications/streaming-llm/streaming_llm/utils.py

+1 -1

@@ -48,7 +48,7 @@
 import urllib.request
 import os
 import json
-# code change to import from bigdl-llm API instead of using transformers API
+# code change to import from IPEX-LLM API instead of using transformers API
 from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import LlamaTokenizer
 import intel_extension_for_pytorch as ipex
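
Note: the hunk above shows the core pattern of this migration — only the `AutoModelForCausalLM` import moves from `transformers` to `ipex_llm.transformers`, while the tokenizer and the rest of the script stay on the Hugging Face API. A minimal, hedged sketch of that drop-in usage (the model id and prompt below are illustrative placeholders, not taken from this commit):

# Minimal sketch of the IPEX-LLM drop-in import pattern; model id and prompt are placeholders.
from ipex_llm.transformers import AutoModelForCausalLM   # IPEX-LLM stand-in for the transformers class
from transformers import AutoTokenizer                   # tokenizer still comes from transformers

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
                                             load_in_4bit=True,       # INT4 optimization applied at load time
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", trust_remote_code=True)

input_ids = tokenizer.encode("What is AI?", return_tensors="pt")
output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))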

python/llm/example/CPU/Deepspeed-AutoTP/deepspeed_autotp.py

+2 -2

@@ -87,7 +87,7 @@
                          replace_method="auto"
 )

-# Apply BigDL-LLM INT4 optimizations on transformers
+# Apply IPEX-LLM INT4 optimizations on transformers
 model = optimize_model(model.module.to(f'cpu'), low_bit='sym_int4')

 model = model.to(f'cpu:{local_rank}')
@@ -111,7 +111,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
                         do_sample=False,
                         max_new_tokens=args.n_predict)
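
For reference, `optimize_model` is the PyTorch-level IPEX-LLM entry point used in the first hunk above to apply symmetric INT4 quantization after DeepSpeed AutoTP has sharded the model. A hedged sketch of the same call outside the DeepSpeed setup, assuming the top-level `from ipex_llm import optimize_model` import used by IPEX-LLM's PyTorch API examples (the model id is a placeholder):

# Sketch: apply IPEX-LLM INT4 optimization to an already-loaded transformers model.
# Assumes optimize_model is importable from the ipex_llm top-level package; model id is a placeholder.
from transformers import AutoModelForCausalLM
from ipex_llm import optimize_model

model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b")
model = optimize_model(model, low_bit='sym_int4')   # same low_bit setting as in the diff above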

python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/AWQ/generate.py

+1 -1

@@ -59,7 +59,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
                         max_new_tokens=args.n_predict)
 end = time.time()
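
The four comment lines above recur in almost every example touched by this commit: when a checkpoint ships with `"use_cache": false` in its config, the KV cache has to be requested explicitly, or each new token re-attends over the full sequence from scratch. A hedged sketch of the explicit form, assuming `model` and `input_ids` are prepared as in the file above:

# Pass use_cache=True explicitly so previous key/value attentions are reused during decoding,
# overriding a checkpoint whose config.json sets "use_cache": false.
output = model.generate(input_ids,
                        use_cache=True,          # explicit override for this generate call
                        max_new_tokens=32)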

python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF/generate.py

+1 -1

@@ -44,7 +44,7 @@

 model_path = args.model

-# Load gguf model and vocab, then convert them to bigdl-llm model and huggingface tokenizer
+# Load gguf model and vocab, then convert them to IPEX-LLM model and huggingface tokenizer
 model, tokenizer = AutoModelForCausalLM.from_gguf(model_path, low_bit = args.low_bit,)

 # Generate predicted tokens
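
`from_gguf` returns both the converted IPEX-LLM model and a matching Hugging Face tokenizer, so the rest of the script can run unchanged. A brief, hedged usage sketch (the GGUF file name and `low_bit` value are placeholders; the example script takes them from command-line arguments):

# Sketch: load a GGUF checkpoint and convert it to an IPEX-LLM model plus a HF tokenizer.
# File name and low_bit value are placeholders for illustration.
from ipex_llm.transformers import AutoModelForCausalLM

model, tokenizer = AutoModelForCausalLM.from_gguf("llama-2-7b-chat.Q4_0.gguf", low_bit="sym_int4")
input_ids = tokenizer.encode("What is AI?", return_tensors="pt")
output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))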

python/llm/example/CPU/HF-Transformers-AutoModels/Advanced-Quantizations/GPTQ/generate.py

+1 -1

@@ -60,7 +60,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
                         max_new_tokens=args.n_predict)
 end = time.time()

python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila/generate.py

+1 -1

@@ -56,7 +56,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
                         max_new_tokens=args.n_predict)
 end = time.time()

python/llm/example/CPU/HF-Transformers-AutoModels/Model/aquila2/generate.py

+1 -1

@@ -56,7 +56,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
                         max_new_tokens=args.n_predict)
 end = time.time()

python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan/generate.py

+1 -1

@@ -56,7 +56,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
                         max_new_tokens=args.n_predict)
 end = time.time()

python/llm/example/CPU/HF-Transformers-AutoModels/Model/baichuan2/generate.py

+1 -1

@@ -45,7 +45,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 model = AutoModelForCausalLM.from_pretrained(model_path,
                                              load_in_4bit=True,
                                              trust_remote_code=True,

python/llm/example/CPU/HF-Transformers-AutoModels/Model/bluelm/generate.py

+1 -1

@@ -56,7 +56,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
                         max_new_tokens=args.n_predict)
 end = time.time()

python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm/generate.py

+1 -1

@@ -57,7 +57,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
                         max_new_tokens=args.n_predict)
 end = time.time()

python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm2/generate.py

+1 -1

@@ -57,7 +57,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
                         max_new_tokens=args.n_predict)
 end = time.time()

python/llm/example/CPU/HF-Transformers-AutoModels/Model/chatglm3/generate.py

+1 -1

@@ -57,7 +57,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
                         max_new_tokens=args.n_predict)
 end = time.time()

python/llm/example/CPU/HF-Transformers-AutoModels/Model/codellama/generate.py

+1 -1

@@ -55,7 +55,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
                         max_new_tokens=args.n_predict)
 end = time.time()

python/llm/example/CPU/HF-Transformers-AutoModels/Model/codeshell/generate.py

+1 -1

@@ -56,7 +56,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations

 output = model.generate(input_ids, max_new_tokens=args.n_predict)

python/llm/example/CPU/HF-Transformers-AutoModels/Model/deepseek/generate.py

+1 -1

@@ -61,7 +61,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
                         max_new_tokens=args.n_predict)
 end = time.time()

python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v1/generate.py

+1 -1

@@ -61,7 +61,7 @@
 st = time.time()
 # enabling `use_cache=True` allows the model to utilize the previous
 # key/values attentions to speed up decoding;
-# to obtain optimal performance with BigDL-LLM INT4 optimizations,
+# to obtain optimal performance with IPEX-LLM INT4 optimizations,
 # it is important to set use_cache=True for Dolly v1 models
 output = model.generate(input_ids,
                         use_cache=True,

python/llm/example/CPU/HF-Transformers-AutoModels/Model/dolly_v2/generate.py

+1 -1

@@ -64,7 +64,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
                         max_new_tokens=args.n_predict,
                         pad_token_id=tokenizer.pad_token_id,

python/llm/example/CPU/HF-Transformers-AutoModels/Model/falcon/generate.py

+1 -1

@@ -57,7 +57,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
                         max_new_tokens=args.n_predict)
 end = time.time()

python/llm/example/CPU/HF-Transformers-AutoModels/Model/flan-t5/generate.py

+1 -1

@@ -60,7 +60,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
                         max_new_tokens=args.n_predict)
 end = time.time()

python/llm/example/CPU/HF-Transformers-AutoModels/Model/fuyu/generate.py

+1 -1

@@ -38,7 +38,7 @@
 image = Image.open(args.image_path)

 # Load model
-# For successful BigDL-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization
+# For successful IPEX-LLM optimization on Fuyu, skip the 'vision_embed_tokens' module during optimization
 model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cpu',
                                              load_in_4bit = True,
                                              trust_remote_code=True,
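
Several multimodal examples in this commit (Fuyu here, InternLM-XComposer and Qwen-VL further down) keep selected submodules unquantized via `modules_to_not_convert`; the rewritten comment only names which module to skip. A hedged sketch of the Fuyu-style call, mirroring the argument style visible in the InternLM-XComposer hunk below (the model id is a placeholder):

# Sketch: quantize Fuyu with IPEX-LLM while keeping the vision embedding module in full precision.
# "adept/fuyu-8b" is a placeholder model id.
from ipex_llm.transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("adept/fuyu-8b",
                                             device_map='cpu',
                                             load_in_4bit=True,
                                             trust_remote_code=True,
                                             modules_to_not_convert=['vision_embed_tokens'])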

python/llm/example/CPU/HF-Transformers-AutoModels/Model/gemma/generate.py

+1 -1

@@ -61,7 +61,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
                         max_new_tokens=args.n_predict)
 end = time.time()

python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm-xcomposer/chat.py

+1 -1

@@ -37,7 +37,7 @@
 image = args.image_path

 # Load model
-# For successful BigDL-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization
+# For successful IPEX-LLM optimization on InternLM-XComposer, skip the 'qkv' module during optimization
 model = AutoModelForCausalLM.from_pretrained(model_path, device='cpu', load_in_4bit=True,
                                              trust_remote_code=True, modules_to_not_convert=['qkv'])

python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm/generate.py

+1 -1

@@ -57,7 +57,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
                         max_new_tokens=args.n_predict)
 end = time.time()

python/llm/example/CPU/HF-Transformers-AutoModels/Model/internlm2/generate.py

+1 -1

@@ -56,7 +56,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
                         max_new_tokens=args.n_predict)
 end = time.time()

python/llm/example/CPU/HF-Transformers-AutoModels/Model/llama2/generate.py

+1 -1

@@ -60,7 +60,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
                         max_new_tokens=args.n_predict)
 end = time.time()

python/llm/example/CPU/HF-Transformers-AutoModels/Model/mistral/generate.py

+1 -1

@@ -55,7 +55,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
                         max_new_tokens=args.n_predict)
 end = time.time()

python/llm/example/CPU/HF-Transformers-AutoModels/Model/mixtral/generate.py

+1 -1

@@ -61,7 +61,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
                         max_new_tokens=args.n_predict)
 end = time.time()

python/llm/example/CPU/HF-Transformers-AutoModels/Model/moss/generate.py

+1 -1

@@ -56,7 +56,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
                         max_new_tokens=args.n_predict)
 end = time.time()

python/llm/example/CPU/HF-Transformers-AutoModels/Model/mpt/generate.py

+1 -1

@@ -55,7 +55,7 @@
 input_ids = tokenizer.encode(prompt, return_tensors="pt")
 # enabling `use_cache=True` allows the model to utilize the previous
 # key/values attentions to speed up decoding;
-# to obtain optimal performance with BigDL-LLM INT4 optimizations,
+# to obtain optimal performance with IPEX-LLM INT4 optimizations,
 # it is important to set use_cache=True for MPT models
 mpt_generation_config = GenerationConfig(
     max_new_tokens=args.n_predict,
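
MPT (like the phi-1_5, phi-2, and phixtral examples further down) enables the KV cache through a `GenerationConfig` object rather than a keyword argument to `generate`. A hedged sketch of that pattern, assuming `model`, `tokenizer`, and `input_ids` are prepared as in the file above:

# Sketch: enable the KV cache via GenerationConfig, as the MPT example's comment describes.
from transformers import GenerationConfig

mpt_generation_config = GenerationConfig(max_new_tokens=32,
                                         use_cache=True)   # needed for good INT4 decoding speed on MPT
output = model.generate(input_ids, generation_config=mpt_generation_config)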

python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-1_5/generate.py

+1 -1

@@ -58,7 +58,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations

 # Note that phi-1_5 uses GenerationConfig to enable 'use_cache'
 output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config)

python/llm/example/CPU/HF-Transformers-AutoModels/Model/phi-2/generate.py

+1 -1

@@ -58,7 +58,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations

 model.generation_config.pad_token_id = model.generation_config.eos_token_id
 # Note that phi-2 uses GenerationConfig to enable 'use_cache'

python/llm/example/CPU/HF-Transformers-AutoModels/Model/phixtral/generate.py

+1 -1

@@ -58,7 +58,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations

 # Note that phixtral uses GenerationConfig to enable 'use_cache'
 output = model.generate(input_ids, do_sample=False, max_new_tokens=args.n_predict, generation_config = generation_config)

python/llm/example/CPU/HF-Transformers-AutoModels/Model/phoenix/generate.py

+1 -1

@@ -55,7 +55,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
                         do_sample=False,
                         max_new_tokens=args.n_predict)

python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen-vl/chat.py

+1 -1

@@ -36,7 +36,7 @@
 model_path = args.repo_id_or_model_path

 # Load model
-# For successful BigDL-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization
+# For successful IPEX-LLM optimization on Qwen-VL-Chat, skip the 'c_fc' and 'out_proj' modules during optimization
 model = AutoModelForCausalLM.from_pretrained(model_path,
                                              load_in_4bit=True,
                                              device_map="cpu",

python/llm/example/CPU/HF-Transformers-AutoModels/Model/qwen/generate.py

+1 -1

@@ -64,7 +64,7 @@
 # if your selected model is capable of utilizing previous key/value attentions
 # to enhance decoding speed, but has `"use_cache": false` in its model config,
 # it is important to set `use_cache=True` explicitly in the `generate` function
-# to obtain optimal performance with BigDL-LLM INT4 optimizations
+# to obtain optimal performance with IPEX-LLM INT4 optimizations
 output = model.generate(input_ids,
                         max_new_tokens=args.n_predict)
 end = time.time()
