
Commit

Merge pull request #3004 from alibaba/feature/sync
MNN:Sync: Sync Internal 2.9.4
jxt1234 authored Aug 24, 2024
2 parents efd4a3c + ae6253f commit 5e93be1
Showing 298 changed files with 7,924 additions and 4,555 deletions.
4 changes: 1 addition & 3 deletions 3rd_party/OpenCLHeaders/CL/cl2.hpp
@@ -805,9 +805,9 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
#define __GET_GL_OBJECT_INFO_ERR CL_HPP_ERR_STR_(clGetGLObjectInfo)
#if CL_HPP_TARGET_OPENCL_VERSION >= 120
#define __CREATE_IMAGE_ERR CL_HPP_ERR_STR_(clCreateImage)
#define __CREATE_GL_TEXTURE_ERR CL_HPP_ERR_STR_(clCreateFromGLTexture)
#define __IMAGE_DIMENSION_ERR CL_HPP_ERR_STR_(Incorrect image dimensions)
#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
#define __CREATE_GL_TEXTURE_ERR CL_HPP_ERR_STR_(clCreateFromGLTexture)
#define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR CL_HPP_ERR_STR_(clSetMemObjectDestructorCallback)

#define __CREATE_USER_EVENT_ERR CL_HPP_ERR_STR_(clCreateUserEvent)
@@ -5229,7 +5229,6 @@ class Image3DGL : public Image3D
};
#endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS

#if CL_HPP_TARGET_OPENCL_VERSION >= 120
/*! \class ImageGL
* \brief general image interface for GL interop.
* We abstract the 2D and 3D GL images into a single instance here
@@ -5308,7 +5307,6 @@ class ImageGL : public Image
return *this;
}
};
#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120



21 changes: 11 additions & 10 deletions CMakeLists.txt
@@ -673,6 +673,15 @@ IF(MNN_TENSORRT)
list(APPEND MNN_EXTRA_DEPENDS ${MNN_TRT_LIBS})
ENDIF()

IF(MNN_BUILD_LLM)
# add_definitions(-DMNN_BUILD_LLM)
include(${CMAKE_CURRENT_LIST_DIR}/transformers/llm/engine/CMakeLists.txt)
IF(NOT MNN_SEP_BUILD)
list(APPEND MNN_TARGETS llm)
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:llm>)
ENDIF()
ENDIF()

IF(MNN_SEP_BUILD)
add_library(MNN SHARED ${CMAKE_CURRENT_LIST_DIR}/cmake/dummy.cpp ${MNN_OBJECTS_TO_LINK} ${MNN_PUB_HDRS} ${MNN_EXPR_PUB_HDRS} ${MNN_EXTRA_HEADERS})
target_link_libraries(MNN PUBLIC ${MNN_EXTRA_DEPENDS})
@@ -744,13 +753,7 @@ IF(MNN_BUILD_OPENCV AND NOT MNN_SEP_BUILD)
ENDIF()
target_sources(MNN PRIVATE $<TARGET_OBJECTS:MNNOpenCV>)
ENDIF()
IF(MNN_BUILD_LLM)
# add_definitions(-DMNN_BUILD_LLM)
include(${CMAKE_CURRENT_LIST_DIR}/transformers/llm/engine/CMakeLists.txt)
IF(NOT MNN_SEP_BUILD)
target_sources(MNN PRIVATE $<TARGET_OBJECTS:llm>)
ENDIF()
ENDIF()


if(CMAKE_SYSTEM_NAME MATCHES "^Linux")
# Using -pthread, needed by the thread-safe implementation of glibc, is better than only using -lpthread
@@ -761,9 +764,7 @@ elseif(CMAKE_SYSTEM_NAME MATCHES "^Android")
else()
endif()
if (NOT MNN_BUILD_SHARED_LIBS)
if(APPLE)
set(MNN_DEPS -Wl,-all_load ${MNN_DEPS} -Wl,-noall_load)
elseif (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
# Static-link will not replace thread-related weak symbol in glibc with strong symbol
# in the pthread library, so we need to use --whole-archive for pthread
# https://stackoverflow.com/questions/35116327/when-g-static-link-pthread-cause-segmentation-fault-why
1 change: 1 addition & 0 deletions docs/index.rst
@@ -82,6 +82,7 @@
tools/compress
tools/visual
tools/python
tools/script

.. toctree::
:maxdepth: 1
37 changes: 37 additions & 0 deletions docs/inference/module.md
@@ -200,6 +200,9 @@ MNN::Express::ExecutorScope scope(executor);
module_thread.reset();
```

## Multi-threading
A Module's creation and execution depend on the Executor it is bound to; if none is specified, the global Executor is used, which is not thread-safe. Creating Modules or running inference from multiple threads therefore contends for the global Executor's resources, so you must either add a lock or bind each thread to its own Executor (see the sketch below).
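The sketch below is not part of the original example set; it shows one way to follow this advice, with each worker thread creating its own Executor via `Executor::newExecutor` and binding it through `ExecutorScope` (the same API used in the LoRA example of the LLM docs). The model path and the input/output names `"x"`/`"y"` are placeholders.

```cpp
#include <thread>
#include <MNN/expr/Executor.hpp>
#include <MNN/expr/ExecutorScope.hpp>
#include <MNN/expr/Module.hpp>

using namespace MNN::Express;

// Each thread creates its own Executor and binds it with ExecutorScope,
// so Module creation and inference never touch the non-thread-safe global Executor.
static void worker(const char* modelPath) {
    MNN::BackendConfig config;
    auto executor = Executor::newExecutor(MNN_FORWARD_CPU, config, 1);
    ExecutorScope scope(executor);
    // "x" / "y" are placeholder input/output names
    std::shared_ptr<Module> net(Module::load({"x"}, {"y"}, modelPath), Module::destroy);
    // ... fill inputs and call net->onForward(...) within this scope ...
}

int main() {
    std::thread t1(worker, "model.mnn");
    std::thread t2(worker, "model.mnn");
    t1.join();
    t2.join();
    return 0;
}
```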

## Debugging

The Module API also supports debugging via callback functions, similar to [runSessionWithCallBack](session.html#id19). Example code:
@@ -232,6 +235,40 @@ Express::Executor::getGlobalExecutor()->setCallBack(std::move(beforeCallBack), s
std::vector<VARP> outputs = user_module->onForward(inputs);
```
## Pre-inference separation mode
For models that can run under the Interpreter-Session conditions, users who want to separate the pre-inference stage (shape computation, geometry computation, resource allocation, strategy search) from the inference stage (content computation) can enable the pre-inference separation mode. Example code:
```cpp
std::shared_ptr<Module> net(Module::load({"x"}, {"y"}, (const uint8_t*)buffer.data(), buffer.size()), Module::destroy);
// Enable pre-inference separation mode
auto code = net->traceOrOptimize(Interpreter::Module_Forward_Seperate);
if (0 != code) {
    // If the model does not support pre-inference separation, restore the setting
    net->traceOrOptimize(Interpreter::Module_Forward_Combine);
}
/* Pre-inference begins */
x = _Input({1, 3, 2, 2}, NCHW, halide_type_of<int>());
auto input = x->writeMap<int>();
y = net->onForward({x})[0];
auto output = y->readMap<int>();
/* Pre-inference ends; the input and output data pointers have been obtained */
/* Content computation */
/*
Fill input
*/
// Pass an empty input array to run inference only
net->onForward({});
/*
Use output
*/
```

## Example code
Complete example code can be found in the following source files in the `demo/exec/` folder:
- `pictureRecognition_module.cpp` uses `Module` for image classification, with `ImageProcess` for pre-processing and `Expr` for post-processing
70 changes: 70 additions & 0 deletions docs/tools/script.md
@@ -0,0 +1,70 @@
# Script Tools
A collection of utility scripts providing various functions.

## apply_gptq.py
Writes GPTQ weights into a quantized MNN weight file.

### Usage
```
usage: apply_gptq.py [-h] --mnn_graph MNN_GRAPH --mnn_weight MNN_WEIGHT --gptq_tensor GPTQ_TENSOR
apply_gptq
options:
-h, --help show this help message and exit
--mnn_graph MNN_GRAPH
mnn graph json path.
--mnn_weight MNN_WEIGHT
mnn weight file path.
--gptq_tensor GPTQ_TENSOR
gptq tensor path.
```

### Arguments
- MNN_GRAPH: the JSON file of the model's compute graph, obtained via `./MNNDump2Json model.mnn model.json`
- MNN_WEIGHT: the model's weight file, e.g. `gptq.mnn.weight`
- GPTQ_TENSOR: the GPTQ-quantized weight file, e.g. `model.safetensor`

### Example
Use this script to generate the GPTQ-quantized weights `gptq.mnn.weight`:
```sh
cd build
./MNNDump2Json model.mnn model.json
cp model.mnn.weight gptq.mnn.weight
python ../tools/script/apply_gptq.py --mnn_graph model.json --mnn_weight gptq.mnn.weight --gptq_tensor model.safetensor
```

## apply_lora.py

Merges the base model's compute graph with the LoRA model's weight file to generate a new compute graph.

### Usage
```sh
usage: apply_lora.py [-h] --base BASE --lora LORA [--scale SCALE] [--fuse FUSE] [--out OUT]

apply_lora

options:
-h, --help show this help message and exit
--base BASE base model json path.
--lora LORA lora dir path or *.safetensors path.
--scale SCALE lora scale: `alpha/r`.
--fuse FUSE fuse A and B.
--out OUT out file name.
```

### Arguments
- BASE: base.json, the JSON file of the base model's compute graph, obtained via `./MNNDump2Json base.mnn base.json`
- LORA: the LoRA weight directory or a LoRA `*.safetensors` file
- SCALE: the LoRA weight scale, `lora_alpha / lora_r`, typically 4.0
- FUSE: whether to fuse lora_A and lora_B into a single LoRA weight; the fused model is larger
- OUT: the file name of the generated compute graph, default `lora.json`; convert it to a model with `./MNNRevert2Buffer lora.json lora.mnn`

### Example
Use this script to generate the corresponding LoRA model `lora.mnn`; usage: [LoRA](../transformers/llm.html#lora)
```sh
cd build
./MNNDump2Json base.mnn base.json
python ../tools/script/apply_lora.py --base base.json --lora lora_dir
./MNNRevert2Buffer lora.json lora.mnn
```
24 changes: 16 additions & 8 deletions docs/transformers/diffusion.md
@@ -35,9 +35,10 @@ conda activate ldm
```
./MNNConvert -f ONNX --modelFile onnx_save_path/text_encoder/model.onnx --MNNModel mnn_save_path/text_encoder.mnn --weightQuantBits 8 --bizCode biz
```
2. Convert the denoiser from an ONNX model -> MNN model
2. Convert the denoiser unet from an ONNX model -> MNN model
```
./MNNConvert -f ONNX --modelFile onnx_save_path/unet/model.onnx --MNNModel mnn_save_path/unet.mnn --transformerFuse --weightQuantBits 8 --bizCode biz
Note: for inference on non-OpenCL backends, --transformerFuse must be removed.
```
3. Convert the decoder from an ONNX model -> MNN model
```
@@ -60,19 +61,26 @@ cd mnn_path/project/android/build
```
## Running the Diffusion Demo
```
./diffusion_demo <resource_path> <model_type> <output_image_name> <input_text>
./diffusion_demo <resource_path> <model_type> <output_image_name> <memory_mode> <backend_type> <input_text>
```
Here, resource_path is the path to the MNN model files. In addition to the MNN files, you also need the following:
1. Copy the file transformers/diffusion/scheduler/alphas.txt from the MNN directory into that folder.
2. For the stable-diffusion-v1-5 model, copy merges.txt and vocab.json from the huggingface tokenizer directory into that folder.
2. For the stable-diffusion-v1-5/chilloutmix models, copy merges.txt and vocab.json from the huggingface tokenizer directory into that folder.
3. For the Taiyi-Stable-Diffusion model, copy vocab.txt from the huggingface tokenizer directory into that folder.
4. model_type is the category of the two currently supported diffusion models: set it to 0 for stable-diffusion-v1-5, or 1 for Taiyi-Stable-Diffusion.
4. model_type is the category of the two currently supported diffusion models: set it to 0 for stable-diffusion-v1-5/chilloutmix, or 1 for Taiyi-Stable-Diffusion.
5. output_image_name is the name of the generated image; by default it is saved in the current working directory.
6. input_text is the text-to-image prompt; an English prompt is recommended for stable-diffusion-v1-5, and a Chinese prompt for Taiyi-Stable-Diffusion.
6. memory_mode indicates whether the device has enough memory: 0 is the memory-saving mode (in the demo each model is initialized right before use and released afterwards), 1 is the sufficient-memory mode (all models are fully initialized at startup, so there is no initialization wait at use time).
7. backend_type selects the runtime backend.
8. input_text is the text-to-image prompt; an English prompt is recommended for stable-diffusion-v1-5/chilloutmix, and a Chinese prompt for Taiyi-Stable-Diffusion.

Example run commands:
```
./diffusion_demo mnn_sd1.5_path 0 demo.jpg "a cute cat"
./diffusion_demo mnn_chilloutmix_path 0 demo.jpg "a pure girl"
./diffusion_demo mnn_taiyi_path 1 demo.jpg "一只可爱的猫"
./diffusion_demo mnn_sd1.5_path 0 demo.jpg 0 3 "a cute cat"
./diffusion_demo mnn_chilloutmix_path 0 demo.jpg 0 3 "a pure girl"
./diffusion_demo mnn_taiyi_path 1 demo.jpg 0 3 "一只可爱的猫"
```
## FAQ
1. The demo crashes or reports a segmentation fault. How can this be fixed?
   - The most common cause is insufficient device memory: devices that support OpenCL fp16 generally need more than 3GB of memory, while devices without fp16 support need more than 6GB of graphics memory.
2. Errors occur when running on other backends. Why?
   - Other backends do not yet support the fused transformer plugin operators; remove --transformerFuse during the onnx->mnn conversion step (see the example command below).
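For example, converting the unet for a non-OpenCL backend simply drops that flag from the command shown in step 2 above (the paths are the same placeholders):
```
./MNNConvert -f ONNX --modelFile onnx_save_path/unet/model.onnx --MNNModel mnn_save_path/unet.mnn --weightQuantBits 8 --bizCode biz
```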
71 changes: 68 additions & 3 deletions docs/transformers/llm.md
@@ -110,7 +110,7 @@ options:

### Compilation

[Build from source](../compile/tools.html#id4)
[Build from source](../compile/other.html#id4)

### Usage
#### Runtime configuration
@@ -151,7 +151,7 @@ options:
- 3: store the key with asymmetric 8-bit quantization, and store the value in fp8 format
- Hardware configuration (a sample snippet follows this list)
  - backend_type: the hardware backend used for inference, default: `"cpu"`
  - thread_num: the number of hardware threads used for inference, default: `4`
  - thread_num: the number of hardware threads used for CPU inference, default: `4`; for OpenCL inference, `68` is used
  - precision: the precision policy used for inference, default: `"low"`, which prefers `fp16`
  - memory: the memory policy used for inference, default: `"low"`, which enables runtime quantization

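For reference, a hardware section of the runtime configuration combining the options above might look like the sketch below; the exact file name and the OpenCL values are illustrative, following the defaults and the `68`-thread OpenCL note listed above:

```json
{
    "backend_type": "opencl",
    "thread_num": 68,
    "precision": "low",
    "memory": "low"
}
```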
@@ -201,4 +201,69 @@ options:
./llm_demo model_dir/llm.mnn
## Respond to each line in prompt.txt
./llm_demo model_dir/llm.mnn prompt.txt
```
```
#### Loading GPTQ weights
- Generate the GPTQ model weights with the script; see [apply_gptq.py](../tools/script.html#apply-gptq-py) for usage
- Create a `gptq.json` configuration file
```json
{
"llm_model": "model.mnn",
"llm_weight": "gptq.mnn.weight",
}
```


#### Loading LoRA weights
- Generate the LoRA model with the script; see [apply_lora.py](../tools/script.html#apply-lora-py) for usage
- Using the LoRA model
  - Load the LoRA model directly by creating a `lora.json` configuration file
```json
{
"llm_model": "lora.mnn",
"llm_weight": "base.mnn.weight",
}
```
  - Select and switch between LoRA models at runtime
```cpp
// Create and load the base model
std::unique_ptr<Llm> llm(Llm::createLLM(config_path));
llm->load();
// Use the same object to switch between multiple lora models; concurrent use is not allowed
{
    // Add the `lora_1` model on top of the base model; its index is `lora_1_idx`
    size_t lora_1_idx = llm->apply_lora("lora_1.mnn");
    llm->response("Hello lora1"); // run inference with the `lora_1` model
    // Add the `lora_2` model and use it
    size_t lora_2_idx = llm->apply_lora("lora_2.mnn");
    llm->response("Hello lora2"); // run inference with the `lora_2` model
    // Select `lora_1` by index as the model currently used by the llm object
    llm->select_module(lora_1_idx);
    llm->response("Hello lora1"); // run inference with the `lora_1` model
    // Release the loaded lora models
    llm->release_module(lora_1_idx);
    llm->release_module(lora_2_idx);
    // Select the base model again
    llm->select_module(0);
    llm->response("Hello base"); // run inference with the `base` model
}
// With multiple objects, several lora models can be loaded and used concurrently
{
std::mutex creat_mutex;
auto chat = [&](const std::string& lora_name) {
MNN::BackendConfig bnConfig;
auto newExe = Executor::newExecutor(MNN_FORWARD_CPU, bnConfig, 1);
ExecutorScope scope(newExe);
Llm* current_llm = nullptr;
{
std::lock_guard<std::mutex> guard(creat_mutex);
current_llm = llm->create_lora(lora_name);
}
current_llm->response("Hello");
};
std::thread thread1(chat, "lora_1.mnn");
std::thread thread2(chat, "lora_2.mnn");
thread1.join();
thread2.join();
}
```
18 changes: 10 additions & 8 deletions express/Executor.cpp
@@ -48,7 +48,7 @@ void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig&
std::shared_ptr<Runtime> bn(creator->onCreate(info));
mRuntimes[mAttr->firstType] = bn;
} else {
firstIter->second->onReset(numberThread, &config);
firstIter->second->onReset(numberThread, &config, true);
}
} else {
auto creator = MNNGetExtraRuntimeCreator(type);
@@ -69,7 +69,7 @@
std::shared_ptr<Runtime> bn(creator->onCreate(info));
mRuntimes[mAttr->firstType] = bn;
} else {
firstIter->second->onReset(numberThread, &config);
firstIter->second->onReset(numberThread, &config, true);
}
}
_refreshRuntime();
@@ -147,18 +147,16 @@ static std::shared_ptr<Executor>* gExecutor = nullptr;
std::shared_ptr<Executor> Executor::getGlobalExecutor() {
std::call_once(gInitFlag, [&]() {
auto creator = MNNGetExtraRuntimeCreator(MNN_FORWARD_CPU);
#ifdef MNN_BUILD_MINI
SizeComputerSuite::init();
GeometryComputer::init();
#endif
Backend::Info info;
info.type = MNN_FORWARD_CPU;
info.numThread = 1;
std::shared_ptr<Runtime> bn(creator->onCreate(info));
RuntimeHint hint;
hint.memoryAllocatorType = 0;// Defer
bn->setRuntimeHint(hint);
gExecutor = new std::shared_ptr<Executor>(new Executor(bn, MNN_FORWARD_CPU, 1));
static std::shared_ptr<Executor> executorStatic;
executorStatic.reset(new Executor(bn, MNN_FORWARD_CPU, 1));
gExecutor = &executorStatic;
});
return *gExecutor;
}
@@ -254,6 +252,10 @@ void Executor::RuntimeManager::setMode(Interpreter::SessionMode mode) {
void Executor::RuntimeManager::setHint(Interpreter::HintMode mode, int value) {
mInside->modes.setHint(mode, value);
}
void Executor::RuntimeManager::setExternalPath(std::string path, int type) {
mInside->modes.setExternalPath(path, type);
}

bool Executor::RuntimeManager::getInfo(Interpreter::SessionInfoCode code, void* ptr) {
// Only support get memory
switch (code) {
@@ -320,7 +322,7 @@ Executor::RuntimeManager* Executor::RuntimeManager::createRuntimeManager(const S
}
originRt.insert(std::make_pair(compute.type, std::shared_ptr<Runtime>(newBn)));
} else {
iter->second->onReset(compute.numThread, compute.user);
iter->second->onReset(compute.numThread, compute.user, false);
}
res->mInside->mRuntime.second = originRt[DEFAULT_BACKUP_RUNTIME_KEY];
res->mInside->mRuntime.first.insert(std::make_pair(compute.type, originRt[compute.type]));