
Commit 9df70d9

Refactor bigdl.llm to ipex_llm (#24)
* Rename `bigdl/llm` to `ipex_llm`
* Remove `python/llm/src/bigdl`
* Change `from bigdl.llm` imports to `from ipex_llm`
1 parent cc5806f commit 9df70d9
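
In practice the rename is a drop-in import change for downstream code; a minimal before/after sketch (assuming the `ipex-llm` package is installed in place of `bigdl-llm`):

```python
# Before (bigdl-llm):
#   from bigdl.llm.transformers import AutoModelForCausalLM
# After (ipex-llm), same transformers-style API:
from ipex_llm.transformers import AutoModelForCausalLM

# '/path/to/model/' is the placeholder path used throughout the docs changed below.
model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
```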

464 files changed (+918, -940 lines)


README.md (+2, -2)

@@ -86,7 +86,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* models as fo

 ```python
 #load Hugging Face Transformers model with INT4 optimizations
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)

 #run the optimized model on CPU
@@ -113,7 +113,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* models as fo

 ```python
 #load Hugging Face Transformers model with INT4 optimizations
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)

 #run the optimized model on Intel GPU

docker/llm/README.md (+4, -4)

@@ -223,7 +223,7 @@ This controller manages the distributed workers.

 ##### Launch the model worker(s)
 ```bash
-python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu
+python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu
 ```
 Wait until the process finishes loading the model and you see "Uvicorn running on ...". The model worker will register itself to the controller.

@@ -252,7 +252,7 @@ python3 -m fastchat.serve.controller
 Then, launch the model worker(s):

 ```bash
-python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu
+python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device cpu
 ```

 Finally, launch the RESTful API server
@@ -319,7 +319,7 @@ This controller manages the distributed workers.

 ##### Launch the model worker(s)
 ```bash
-python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu
+python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu
 ```
 Wait until the process finishes loading the model and you see "Uvicorn running on ...". The model worker will register itself to the controller.

@@ -346,7 +346,7 @@ python3 -m fastchat.serve.controller
 Then, launch the model worker(s):

 ```bash
-python3 -m bigdl.llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu
+python3 -m ipex_llm.serving.model_worker --model-path lmsys/vicuna-7b-v1.3 --device xpu
 ```

 Finally, launch the RESTful API server

docker/llm/inference/xpu/docker/chat.py (+1, -1)

@@ -23,7 +23,7 @@
 from transformers.tools.agents import StopSequenceCriteria
 from transformers.generation.stopping_criteria import StoppingCriteriaList
 from colorama import Fore
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model
 SYSTEM_PROMPT = "A chat between a curious human <human> and an artificial intelligence assistant <bot>.\
 The assistant gives helpful, detailed, and polite answers to the human's questions."
 HUMAN_ID = "<human>"

docker/llm/serving/cpu/docker/entrypoint.sh (+4, -4)

@@ -135,9 +135,9 @@ else
 done

 if [ "$worker_type" == "model_worker" ]; then
-worker_type="bigdl.llm.serving.model_worker"
+worker_type="ipex_llm.serving.model_worker"
 elif [ "$worker_type" == "vllm_worker" ]; then
-worker_type="bigdl.llm.serving.vllm_worker"
+worker_type="ipex_llm.serving.vllm_worker"
 fi

 if [[ -n $CONTROLLER_HOST ]]; then
@@ -220,9 +220,9 @@ else
 echo "Worker type: $worker_type"
 echo "Worker address: $worker_address"
 echo "Controller address: $controller_address"
-if [ "$worker_type" == "bigdl.llm.serving.model_worker" ]; then
+if [ "$worker_type" == "ipex_llm.serving.model_worker" ]; then
 python3 -m "$worker_type" --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address --stream-interval $stream_interval
-elif [ "$worker_type" == "bigdl.llm.serving.vllm_worker" ]; then
+elif [ "$worker_type" == "ipex_llm.serving.vllm_worker" ]; then
 python3 -m "$worker_type" --model-path $model_path --device cpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address
 fi
 fi

docker/llm/serving/cpu/docker/model_adapter.py.patch (+1, -1)

@@ -9,7 +9,7 @@
 generation_config = GenerationConfig.from_pretrained(
 model_path, trust_remote_code=True
 )
-+ from bigdl.llm.transformers import AutoModelForCausalLM
++ from ipex_llm.transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained(
 model_path,
 config=config,

docker/llm/serving/xpu/docker/entrypoint.sh (+4, -4)

@@ -66,9 +66,9 @@ else
 done

 if [ "$worker_type" == "model_worker" ]; then
-worker_type="bigdl.llm.serving.model_worker"
+worker_type="ipex_llm.serving.model_worker"
 elif [ "$worker_type" == "vllm_worker" ]; then
-worker_type="bigdl.llm.serving.vllm_worker"
+worker_type="ipex_llm.serving.vllm_worker"
 fi

 if [[ -n $CONTROLLER_HOST ]]; then
@@ -127,9 +127,9 @@ else
 echo "Worker address: $worker_address"
 echo "Controller address: $controller_address"

-if [ "$worker_type" == "bigdl.llm.serving.model_worker" ]; then
+if [ "$worker_type" == "ipex_llm.serving.model_worker" ]; then
 python3 -m "$worker_type" --model-path $model_path --device xpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address --stream-interval $stream_interval
-elif [ "$worker_type" == "bigdl.llm.serving.vllm_worker" ]; then
+elif [ "$worker_type" == "ipex_llm.serving.vllm_worker" ]; then
 python3 -m "$worker_type" --model-path $model_path --device xpu --host $worker_host --port $worker_port --worker-address $worker_address --controller-address $controller_address
 fi
 fi

docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/finetune.md (+4, -4)

@@ -21,7 +21,7 @@ To help you better understand the finetuning process, here we use model [Llama-2
 First, load model using `transformers`-style API and **set it to `to('xpu')`**. We specify `load_in_low_bit="nf4"` here to apply 4-bit NormalFloat optimization. According to the [QLoRA paper](https://arxiv.org/pdf/2305.14314.pdf), using `"nf4"` could yield better model quality than `"int4"`.

 ```python
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM

 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf",
 load_in_low_bit="nf4",
@@ -33,14 +33,14 @@ model = model.to('xpu')

 Then, we have to apply some preprocessing to the model to prepare it for training.
 ```python
-from bigdl.llm.transformers.qlora import prepare_model_for_kbit_training
+from ipex_llm.transformers.qlora import prepare_model_for_kbit_training
 model.gradient_checkpointing_enable()
 model = prepare_model_for_kbit_training(model)
 ```

 Next, we can obtain a Peft model from the optimized model and a configuration object containing the parameters as follows:
 ```python
-from bigdl.llm.transformers.qlora import get_peft_model
+from ipex_llm.transformers.qlora import get_peft_model
 from peft import LoraConfig
 config = LoraConfig(r=8,
 lora_alpha=32,
@@ -54,7 +54,7 @@ model = get_peft_model(model, config)
 ```eval_rst
 .. important::

-Instead of ``from peft import prepare_model_for_kbit_training, get_peft_model`` as we did for regular QLoRA using bitandbytes and cuda, we import them from ``bigdl.llm.transformers.qlora`` here to get a BigDL-LLM compatible Peft model. And the rest is just the same as regular LoRA finetuning process using ``peft``.
+Instead of ``from peft import prepare_model_for_kbit_training, get_peft_model`` as we did for regular QLoRA using bitandbytes and cuda, we import them from ``ipex_llm.transformers.qlora`` here to get a BigDL-LLM compatible Peft model. And the rest is just the same as regular LoRA finetuning process using ``peft``.
 ```

 ```eval_rst
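
For readers following the finetuning doc, a consolidated sketch of the snippets above with the renamed imports; the argument lists are trimmed to what the hunks show (the original doc passes additional arguments), so treat this as illustrative only:

```python
# Minimal QLoRA setup with the renamed ipex_llm imports; assumes an Intel GPU
# ("xpu") environment with ipex-llm, transformers and peft installed.
from ipex_llm.transformers import AutoModelForCausalLM
from ipex_llm.transformers.qlora import prepare_model_for_kbit_training, get_peft_model
from peft import LoraConfig

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf",
                                             load_in_low_bit="nf4")
model = model.to('xpu')

# Preprocessing before training, as in the doc.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# Obtain a PEFT model; only the LoRA hyperparameters visible in the diff are kept.
config = LoraConfig(r=8, lora_alpha=32)
model = get_peft_model(model, config)
```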

docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/hugging_face_format.md (+1, -1)

@@ -5,7 +5,7 @@ You may apply INT4 optimizations to any Hugging Face *Transformers* models as fo

 ```python
 # load Hugging Face Transformers model with INT4 optimizations
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM

 model = AutoModelForCausalLM.from_pretrained('/path/to/model/', load_in_4bit=True)
 ```

docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/inference_on_gpu.md (+5, -5)

@@ -29,7 +29,7 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`-

 # Take Llama-2-7b-chat-hf as an example
 from transformers import LlamaForCausalLM
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model

 model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', torch_dtype='auto', low_cpu_mem_usage=True)
 model = optimize_model(model) # With only one line to enable BigDL-LLM INT4 optimization
@@ -40,14 +40,14 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`-

 When running LLMs on Intel iGPUs for Windows users, we recommend setting ``cpu_embedding=True`` in the ``optimize_model`` function. This will allow the memory-intensive embedding layer to utilize the CPU instead of iGPU.

-See the `API doc <../../../PythonAPI/LLM/optimize.html#bigdl.llm.optimize_model>`_ for ``optimize_model`` to find more information.
+See the `API doc <../../../PythonAPI/LLM/optimize.html#ipex_llm.optimize_model>`_ for ``optimize_model`` to find more information.

 Especially, if you have saved the optimized model following setps `here <./optimize_model.html#save>`_, the loading process on Intel GPUs maybe as follows:

 .. code-block:: python

 from transformers import LlamaForCausalLM
-from bigdl.llm.optimize import low_memory_init, load_low_bit
+from ipex_llm.optimize import low_memory_init, load_low_bit

 saved_dir='./llama-2-bigdl-llm-4-bit'
 with low_memory_init(): # Fast and low cost by loading model on meta device
@@ -65,7 +65,7 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`-
 .. code-block:: python

 # Take Llama-2-7b-chat-hf as an example
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM

 # Load model in 4 bit, which convert the relevant layers in the model into INT4 format
 model = AutoModelForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', load_in_4bit=True)
@@ -82,7 +82,7 @@ You could choose to use [PyTorch API](./optimize_model.html) or [`transformers`-

 .. code-block:: python

-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM

 saved_dir='./llama-2-bigdl-llm-4-bit'
 model = AutoModelForCausalLM.load_low_bit(saved_dir) # Load the optimized model

docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/langchain_api.md (+4, -4)

@@ -7,8 +7,8 @@ You may run the models using the LangChain API in `bigdl-llm`.
 You may run any Hugging Face *Transformers* model (with INT4 optimiztions applied) using the LangChain API as follows:

 ```python
-from bigdl.llm.langchain.llms import TransformersLLM
-from bigdl.llm.langchain.embeddings import TransformersEmbeddings
+from ipex_llm.langchain.llms import TransformersLLM
+from ipex_llm.langchain.embeddings import TransformersEmbeddings
 from langchain.chains.question_answering import load_qa_chain

 embeddings = TransformersEmbeddings.from_model_id(model_id=model_path)
@@ -37,8 +37,8 @@ You may also convert Hugging Face *Transformers* models into native INT4 format,
 ```

 ```python
-from bigdl.llm.langchain.llms import LlamaLLM
-from bigdl.llm.langchain.embeddings import LlamaEmbeddings
+from ipex_llm.langchain.llms import LlamaLLM
+from ipex_llm.langchain.embeddings import LlamaEmbeddings
 from langchain.chains.question_answering import load_qa_chain

 # switch to ChatGLMEmbeddings/GptneoxEmbeddings/BloomEmbeddings/StarcoderEmbeddings to load other models

docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/native_format.md (+2, -2)

@@ -10,13 +10,13 @@ You may also convert Hugging Face *Transformers* models into native INT4 format

 ```python
 # convert the model
-from bigdl.llm import llm_convert
+from ipex_llm import llm_convert
 bigdl_llm_path = llm_convert(model='/path/to/model/',
 outfile='/path/to/output/', outtype='int4', model_family="llama")

 # load the converted model
 # switch to ChatGLMForCausalLM/GptneoxForCausalLM/BloomForCausalLM/StarcoderForCausalLM to load other models
-from bigdl.llm.transformers import LlamaForCausalLM
+from ipex_llm.transformers import LlamaForCausalLM
 llm = LlamaForCausalLM.from_pretrained("/path/to/output/model.bin", native=True, ...)

 # run the converted model

docs/readthedocs/source/doc/LLM/Overview/KeyFeatures/optimize_model.md (+3, -3)

@@ -14,7 +14,7 @@ model = LlamaForCausalLM.from_pretrained('meta-llama/Llama-2-7b-chat-hf', torch_

 Then, just need to call `optimize_model` to optimize the loaded model and INT4 optimization is applied on model by default:
 ```python
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model

 # With only one line to enable BigDL-LLM INT4 optimization
 model = optimize_model(model)
@@ -31,7 +31,7 @@ Currently, ``low_bit`` supports options 'sym_int4', 'asym_int4', 'sym_int5', 'as
 You may apply symmetric INT8 optimization as follows:

 ```python
-from bigdl.llm import optimize_model
+from ipex_llm import optimize_model

 # Apply symmetric INT8 optimization
 model = optimize_model(model, low_bit="sym_int8")
@@ -51,7 +51,7 @@ model.save_low_bit(saved_dir)

 We recommend to use the context manager `low_memory_init` to quickly initiate a model instance with low cost, and then use `load_low_bit` to load the optimized low-bit model as follows:
 ```python
-from bigdl.llm.optimize import low_memory_init, load_low_bit
+from ipex_llm.optimize import low_memory_init, load_low_bit
 with low_memory_init(): # Fast and low cost by loading model on meta device
 model = LlamaForCausalLM.from_pretrained(saved_dir,
 torch_dtype="auto",

docs/readthedocs/source/doc/LLM/Overview/llm.md (+1, -1)

@@ -11,7 +11,7 @@ Here, let's take a relatively small LLM model, i.e [open_llama_3b_v2](https://hu
 Simply use one-line `transformers`-style API in `bigdl-llm` to load `open_llama_3b_v2` with INT4 optimization (by specifying `load_in_4bit=True`) as follows:

 ```python
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM

 model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path="openlm-research/open_llama_3b_v2",
 load_in_4bit=True)

docs/readthedocs/source/doc/LLM/Quickstart/install_linux_gpu.md (+2, -2)

@@ -112,7 +112,7 @@ Install the Miniconda as follows if you don't have conda installed on your machi

 python

-> from bigdl.llm.transformers import AutoModel, AutoModelForCausalLM
+> from ipex_llm.transformers import AutoModel, AutoModelForCausalLM
 ```

 > <img src="https://llm-assets.readthedocs.io/en/latest/_images/verify_bigdl_import.png" alt="image-20240221102252562" width=100%; />
@@ -170,7 +170,7 @@ Now let's play with a real LLM. We'll be using the [phi-1.5](https://huggingface
 ```python
 # Copy/Paste the contents to a new file demo.py
 import torch
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer, GenerationConfig
 generation_config = GenerationConfig(use_cache = True)

docs/readthedocs/source/doc/LLM/Quickstart/install_windows_gpu.md (+3, -3)

@@ -130,7 +130,7 @@ You can verify if `bigdl-llm` is successfully installed by simply running a few
 * Step 5: Copy following code to Anaconda prompt **line by line** and press Enter **after copying each line**.
 ```python
 import torch
-from bigdl.llm.transformers import AutoModel,AutoModelForCausalLM
+from ipex_llm.transformers import AutoModel,AutoModelForCausalLM
 tensor_1 = torch.randn(1, 1, 40, 128).to('xpu')
 tensor_2 = torch.randn(1, 1, 128, 40).to('xpu')
 print(torch.matmul(tensor_1, tensor_2).size())
@@ -200,7 +200,7 @@ Now let's play with a real LLM. We'll be using the [Qwen-1.8B-Chat](https://hugg

 # Copy/Paste the contents to a new file demo.py
 import torch
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import AutoTokenizer, GenerationConfig
 generation_config = GenerationConfig(use_cache=True)

@@ -260,7 +260,7 @@ Now let's play with a real LLM. We'll be using the [Qwen-1.8B-Chat](https://hugg

 # Copy/Paste the contents to a new file demo.py
 import torch
-from bigdl.llm.transformers import AutoModelForCausalLM
+from ipex_llm.transformers import AutoModelForCausalLM
 from transformers import GenerationConfig
 from modelscope import AutoTokenizer
 generation_config = GenerationConfig(use_cache=True)
