
Commit

Merge pull request #3004 from alibaba/feature/sync
MNN:Sync: Sync Internal 2.9.4
jxt1234 authored Aug 24, 2024
2 parents efd4a3c + ae6253f commit 5e93be1
Showing 298 changed files with 7,924 additions and 4,555 deletions.
4 changes: 1 addition & 3 deletions 3rd_party/OpenCLHeaders/CL/cl2.hpp
@@ -805,9 +805,9 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
#define __GET_GL_OBJECT_INFO_ERR CL_HPP_ERR_STR_(clGetGLObjectInfo)
#if CL_HPP_TARGET_OPENCL_VERSION >= 120
#define __CREATE_IMAGE_ERR CL_HPP_ERR_STR_(clCreateImage)
#define __CREATE_GL_TEXTURE_ERR CL_HPP_ERR_STR_(clCreateFromGLTexture)
#define __IMAGE_DIMENSION_ERR CL_HPP_ERR_STR_(Incorrect image dimensions)
#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
#define __CREATE_GL_TEXTURE_ERR CL_HPP_ERR_STR_(clCreateFromGLTexture)
#define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR CL_HPP_ERR_STR_(clSetMemObjectDestructorCallback)

#define __CREATE_USER_EVENT_ERR CL_HPP_ERR_STR_(clCreateUserEvent)
@@ -5229,7 +5229,6 @@ class Image3DGL : public Image3D
};
#endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS

#if CL_HPP_TARGET_OPENCL_VERSION >= 120
/*! \class ImageGL
* \brief general image interface for GL interop.
* We abstract the 2D and 3D GL images into a single instance here
@@ -5308,7 +5307,6 @@ class ImageGL : public Image
return *this;
}
};
#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120



21 changes: 11 additions & 10 deletions CMakeLists.txt
@@ -673,6 +673,15 @@ IF(MNN_TENSORRT)
list(APPEND MNN_EXTRA_DEPENDS ${MNN_TRT_LIBS})
ENDIF()

IF(MNN_BUILD_LLM)
# add_definitions(-DMNN_BUILD_LLM)
include(${CMAKE_CURRENT_LIST_DIR}/transformers/llm/engine/CMakeLists.txt)
IF(NOT MNN_SEP_BUILD)
list(APPEND MNN_TARGETS llm)
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:llm>)
ENDIF()
ENDIF()

IF(MNN_SEP_BUILD)
add_library(MNN SHARED ${CMAKE_CURRENT_LIST_DIR}/cmake/dummy.cpp ${MNN_OBJECTS_TO_LINK} ${MNN_PUB_HDRS} ${MNN_EXPR_PUB_HDRS} ${MNN_EXTRA_HEADERS})
target_link_libraries(MNN PUBLIC ${MNN_EXTRA_DEPENDS})
@@ -744,13 +753,7 @@ IF(MNN_BUILD_OPENCV AND NOT MNN_SEP_BUILD)
ENDIF()
target_sources(MNN PRIVATE $<TARGET_OBJECTS:MNNOpenCV>)
ENDIF()
IF(MNN_BUILD_LLM)
# add_definitions(-DMNN_BUILD_LLM)
include(${CMAKE_CURRENT_LIST_DIR}/transformers/llm/engine/CMakeLists.txt)
IF(NOT MNN_SEP_BUILD)
target_sources(MNN PRIVATE $<TARGET_OBJECTS:llm>)
ENDIF()
ENDIF()


if(CMAKE_SYSTEM_NAME MATCHES "^Linux")
# Using -pthread, needed by the thread-safe implementation of glibc, is better than only using -lpthread
@@ -761,9 +764,7 @@ elseif(CMAKE_SYSTEM_NAME MATCHES "^Android")
else()
endif()
if (NOT MNN_BUILD_SHARED_LIBS)
if(APPLE)
set(MNN_DEPS -Wl,-all_load ${MNN_DEPS} -Wl,-noall_load)
elseif (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
# Static-link will not replace thread-related weak symbol in glibc with strong symbol
# in the pthread library, so we need to use --whole-archive for pthread
# https://stackoverflow.com/questions/35116327/when-g-static-link-pthread-cause-segmentation-fault-why
1 change: 1 addition & 0 deletions docs/index.rst
@@ -82,6 +82,7 @@
tools/compress
tools/visual
tools/python
tools/script

.. toctree::
:maxdepth: 1
37 changes: 37 additions & 0 deletions docs/inference/module.md
@@ -200,6 +200,9 @@ MNN::Express::ExecutorScope scope(executor);
module_thread.reset();
```

## Multi-threading
A Module's creation and execution depend on the Executor it is bound to; if none is specified, the global Executor is used, which is not thread-safe. Creating Modules or running inference from multiple threads therefore contends for the global Executor's resources, so you must either add a lock or bind each thread to its own Executor (see the sketch below).
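The sketch below is not part of the original example set; it shows one way to follow this advice, with each worker thread creating its own Executor via `Executor::newExecutor` and binding it through `ExecutorScope` (the same API used in the LoRA example of the LLM docs). The model path and the input/output names `"x"`/`"y"` are placeholders.

```cpp
#include <thread>
#include <MNN/expr/Executor.hpp>
#include <MNN/expr/ExecutorScope.hpp>
#include <MNN/expr/Module.hpp>

using namespace MNN::Express;

// Each thread creates its own Executor and binds it with ExecutorScope,
// so Module creation and inference never touch the non-thread-safe global Executor.
static void worker(const char* modelPath) {
    MNN::BackendConfig config;
    auto executor = Executor::newExecutor(MNN_FORWARD_CPU, config, 1);
    ExecutorScope scope(executor);
    // "x" / "y" are placeholder input/output names
    std::shared_ptr<Module> net(Module::load({"x"}, {"y"}, modelPath), Module::destroy);
    // ... fill inputs and call net->onForward(...) within this scope ...
}

int main() {
    std::thread t1(worker, "model.mnn");
    std::thread t2(worker, "model.mnn");
    t1.join();
    t2.join();
    return 0;
}
```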

## Debugging

The Module API also supports debugging via callback functions, similar to [runSessionWithCallBack](session.html#id19). Example code:
@@ -232,6 +235,40 @@ Express::Executor::getGlobalExecutor()->setCallBack(std::move(beforeCallBack), s
std::vector<VARP> outputs = user_module->onForward(inputs);
```
## Pre-inference separation mode
For models that can run under the Interpreter-Session conditions, users who want to separate the pre-inference stage (shape computation, geometry computation, resource allocation, strategy search) from the inference stage (content computation) can enable the pre-inference separation mode. Example code:
```cpp
std::shared_ptr<Module> net(Module::load({"x"}, {"y"}, (const uint8_t*)buffer.data(), buffer.size()), Module::destroy);
// Enable pre-inference separation mode
auto code = net->traceOrOptimize(Interpreter::Module_Forward_Seperate);
if (0 != code) {
    // If the model does not support pre-inference separation, restore the setting
    net->traceOrOptimize(Interpreter::Module_Forward_Combine);
}
/* Pre-inference begins */
x = _Input({1, 3, 2, 2}, NCHW, halide_type_of<int>());
auto input = x->writeMap<int>();
y = net->onForward({x})[0];
auto output = y->readMap<int>();
/* Pre-inference ends; the input and output data pointers have been obtained */
/* Content computation */
/*
Fill input
*/
// Pass an empty input array to run inference only
net->onForward({});
/*
Use output
*/
```

## Example code
Complete example code can be found in the following source files in the `demo/exec/` folder:
- `pictureRecognition_module.cpp` uses `Module` for image classification, with `ImageProcess` for pre-processing and `Expr` for post-processing
70 changes: 70 additions & 0 deletions docs/tools/script.md
@@ -0,0 +1,70 @@
# Script Tools
A collection of utility scripts providing various functions.

## apply_gptq.py
Writes GPTQ weights into a quantized MNN weight file.

### Usage
```
usage: apply_gptq.py [-h] --mnn_graph MNN_GRAPH --mnn_weight MNN_WEIGHT --gptq_tensor GPTQ_TENSOR
apply_gptq
options:
-h, --help show this help message and exit
--mnn_graph MNN_GRAPH
mnn graph json path.
--mnn_weight MNN_WEIGHT
mnn weight file path.
--gptq_tensor GPTQ_TENSOR
gptq tensor path.
```

### Arguments
- MNN_GRAPH: the JSON file of the model's compute graph, obtained via `./MNNDump2Json model.mnn model.json`
- MNN_WEIGHT: the model's weight file, e.g. `gptq.mnn.weight`
- GPTQ_TENSOR: the GPTQ-quantized weight file, e.g. `model.safetensor`

### Example
Use this script to generate the GPTQ-quantized weights `gptq.mnn.weight`:
```sh
cd build
./MNNDump2Json model.mnn model.json
cp model.mnn.weight gptq.mnn.weight
python ../tools/script/apply_gptq.py --mnn_graph model.json --mnn_weight gptq.mnn.weight --gptq_tensor model.safetensor
```

## apply_lora.py

Merges the base model's compute graph with the LoRA model's weight file to generate a new compute graph.

### Usage
```sh
usage: apply_lora.py [-h] --base BASE --lora LORA [--scale SCALE] [--fuse FUSE] [--out OUT]

apply_lora

options:
-h, --help show this help message and exit
--base BASE base model json path.
--lora LORA lora dir path or *.safetensors path.
--scale SCALE lora scale: `alpha/r`.
--fuse FUSE fuse A and B.
--out OUT out file name.
```

### Arguments
- BASE: base.json, the JSON file of the base model's compute graph, obtained via `./MNNDump2Json base.mnn base.json`
- LORA: the LoRA weight directory or a LoRA `*.safetensors` file
- SCALE: the LoRA weight scale, `lora_alpha / lora_r`, typically 4.0
- FUSE: whether to fuse lora_A and lora_B into a single LoRA weight; the fused model is larger
- OUT: the file name of the generated compute graph, default `lora.json`; convert it to a model with `./MNNRevert2Buffer lora.json lora.mnn`

### Example
Use this script to generate the corresponding LoRA model `lora.mnn`; usage: [LoRA](../transformers/llm.html#lora)
```sh
cd build
./MNNDump2Json base.mnn base.json
python ../tools/script/apply_lora.py --base base.json --lora lora_dir
./MNNRevert2Buffer lora.json lora.mnn
```
24 changes: 16 additions & 8 deletions docs/transformers/diffusion.md
@@ -35,9 +35,10 @@ conda activate ldm
```
./MNNConvert -f ONNX --modelFile onnx_save_path/text_encoder/model.onnx --MNNModel mnn_save_path/text_encoder.mnn --weightQuantBits 8 --bizCode biz
```
2. Convert the denoiser from an ONNX model -> MNN model
2. Convert the denoiser unet from an ONNX model -> MNN model
```
./MNNConvert -f ONNX --modelFile onnx_save_path/unet/model.onnx --MNNModel mnn_save_path/unet.mnn --transformerFuse --weightQuantBits 8 --bizCode biz
Note: for inference on non-OpenCL backends, --transformerFuse must be removed.
```
3. Convert the decoder from an ONNX model -> MNN model
```
@@ -60,19 +61,26 @@ cd mnn_path/project/android/build
```
## Running the Diffusion Demo
```
./diffusion_demo <resource_path> <model_type> <output_image_name> <input_text>
./diffusion_demo <resource_path> <model_type> <output_image_name> <memory_mode> <backend_type> <input_text>
```
Here, resource_path is the path to the MNN model files. In addition to the MNN files, you also need the following:
1. Copy the file transformers/diffusion/scheduler/alphas.txt from the MNN directory into that folder.
2. For the stable-diffusion-v1-5 model, copy merges.txt and vocab.json from the huggingface tokenizer directory into that folder.
2. For the stable-diffusion-v1-5/chilloutmix models, copy merges.txt and vocab.json from the huggingface tokenizer directory into that folder.
3. For the Taiyi-Stable-Diffusion model, copy vocab.txt from the huggingface tokenizer directory into that folder.
4. model_type is the category of the two currently supported diffusion models: set it to 0 for stable-diffusion-v1-5, or 1 for Taiyi-Stable-Diffusion.
4. model_type is the category of the two currently supported diffusion models: set it to 0 for stable-diffusion-v1-5/chilloutmix, or 1 for Taiyi-Stable-Diffusion.
5. output_image_name is the name of the generated image; by default it is saved in the current working directory.
6. input_text is the text-to-image prompt; an English prompt is recommended for stable-diffusion-v1-5, and a Chinese prompt for Taiyi-Stable-Diffusion.
6. memory_mode indicates whether the device has enough memory: 0 is the memory-saving mode (in the demo each model is initialized right before use and released afterwards), 1 is the sufficient-memory mode (all models are fully initialized at startup, so there is no initialization wait at use time).
7. backend_type selects the runtime backend.
8. input_text is the text-to-image prompt; an English prompt is recommended for stable-diffusion-v1-5/chilloutmix, and a Chinese prompt for Taiyi-Stable-Diffusion.

Example run commands:
```
./diffusion_demo mnn_sd1.5_path 0 demo.jpg "a cute cat"
./diffusion_demo mnn_chilloutmix_path 0 demo.jpg "a pure girl"
./diffusion_demo mnn_taiyi_path 1 demo.jpg "一只可爱的猫"
./diffusion_demo mnn_sd1.5_path 0 demo.jpg 0 3 "a cute cat"
./diffusion_demo mnn_chilloutmix_path 0 demo.jpg 0 3 "a pure girl"
./diffusion_demo mnn_taiyi_path 1 demo.jpg 0 3 "一只可爱的猫"
```
## FAQ
1. The demo crashes or reports a segmentation fault. How can this be fixed?
   - The most common cause is insufficient device memory: devices that support OpenCL fp16 generally need more than 3GB of memory, while devices without fp16 support need more than 6GB of graphics memory.
2. Errors occur when running on other backends. Why?
   - Other backends do not yet support the fused transformer plugin operators; remove --transformerFuse during the onnx->mnn conversion step (see the example command below).
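For example, converting the unet for a non-OpenCL backend simply drops that flag from the command shown in step 2 above (the paths are the same placeholders):
```
./MNNConvert -f ONNX --modelFile onnx_save_path/unet/model.onnx --MNNModel mnn_save_path/unet.mnn --weightQuantBits 8 --bizCode biz
```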
71 changes: 68 additions & 3 deletions docs/transformers/llm.md
@@ -110,7 +110,7 @@ options:

### Compilation

[Build from source](../compile/tools.html#id4)
[Build from source](../compile/other.html#id4)

### Usage
#### Runtime configuration
@@ -151,7 +151,7 @@ options:
- 3: store the key with asymmetric 8-bit quantization, and store the value in fp8 format
- Hardware configuration (a sample snippet follows this list)
  - backend_type: the hardware backend used for inference, default: `"cpu"`
  - thread_num: the number of hardware threads used for inference, default: `4`
  - thread_num: the number of hardware threads used for CPU inference, default: `4`; for OpenCL inference, `68` is used
  - precision: the precision policy used for inference, default: `"low"`, which prefers `fp16`
  - memory: the memory policy used for inference, default: `"low"`, which enables runtime quantization

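For reference, a hardware section of the runtime configuration combining the options above might look like the sketch below; the exact file name and the OpenCL values are illustrative, following the defaults and the `68`-thread OpenCL note listed above:

```json
{
    "backend_type": "opencl",
    "thread_num": 68,
    "precision": "low",
    "memory": "low"
}
```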
@@ -201,4 +201,69 @@ options:
./llm_demo model_dir/llm.mnn
## Respond to each line in prompt.txt
./llm_demo model_dir/llm.mnn prompt.txt
```
```
#### Loading GPTQ weights
- Generate the GPTQ model weights with the script; see [apply_gptq.py](../tools/script.html#apply-gptq-py) for usage
- Create a `gptq.json` configuration file
```json
{
"llm_model": "model.mnn",
"llm_weight": "gptq.mnn.weight",
}
```


#### Loading LoRA weights
- Generate the LoRA model with the script; see [apply_lora.py](../tools/script.html#apply-lora-py) for usage
- Using the LoRA model
  - Load the LoRA model directly by creating a `lora.json` configuration file
```json
{
"llm_model": "lora.mnn",
"llm_weight": "base.mnn.weight",
}
```
  - Select and switch between LoRA models at runtime
```cpp
// Create and load the base model
std::unique_ptr<Llm> llm(Llm::createLLM(config_path));
llm->load();
// Use the same object to switch between multiple lora models; concurrent use is not allowed
{
    // Add the `lora_1` model on top of the base model; its index is `lora_1_idx`
    size_t lora_1_idx = llm->apply_lora("lora_1.mnn");
    llm->response("Hello lora1"); // run inference with the `lora_1` model
    // Add the `lora_2` model and use it
    size_t lora_2_idx = llm->apply_lora("lora_2.mnn");
    llm->response("Hello lora2"); // run inference with the `lora_2` model
    // Select `lora_1` by index as the model currently used by the llm object
    llm->select_module(lora_1_idx);
    llm->response("Hello lora1"); // run inference with the `lora_1` model
    // Release the loaded lora models
    llm->release_module(lora_1_idx);
    llm->release_module(lora_2_idx);
    // Select the base model again
    llm->select_module(0);
    llm->response("Hello base"); // run inference with the `base` model
}
// With multiple objects, several lora models can be loaded and used concurrently
{
std::mutex creat_mutex;
auto chat = [&](const std::string& lora_name) {
MNN::BackendConfig bnConfig;
auto newExe = Executor::newExecutor(MNN_FORWARD_CPU, bnConfig, 1);
ExecutorScope scope(newExe);
Llm* current_llm = nullptr;
{
std::lock_guard<std::mutex> guard(creat_mutex);
current_llm = llm->create_lora(lora_name);
}
current_llm->response("Hello");
};
std::thread thread1(chat, "lora_1.mnn");
std::thread thread2(chat, "lora_2.mnn");
thread1.join();
thread2.join();
}
```
18 changes: 10 additions & 8 deletions express/Executor.cpp
@@ -48,7 +48,7 @@ void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig&
std::shared_ptr<Runtime> bn(creator->onCreate(info));
mRuntimes[mAttr->firstType] = bn;
} else {
firstIter->second->onReset(numberThread, &config);
firstIter->second->onReset(numberThread, &config, true);
}
} else {
auto creator = MNNGetExtraRuntimeCreator(type);
@@ -69,7 +69,7 @@
std::shared_ptr<Runtime> bn(creator->onCreate(info));
mRuntimes[mAttr->firstType] = bn;
} else {
firstIter->second->onReset(numberThread, &config);
firstIter->second->onReset(numberThread, &config, true);
}
}
_refreshRuntime();
@@ -147,18 +147,16 @@ static std::shared_ptr<Executor>* gExecutor = nullptr;
std::shared_ptr<Executor> Executor::getGlobalExecutor() {
std::call_once(gInitFlag, [&]() {
auto creator = MNNGetExtraRuntimeCreator(MNN_FORWARD_CPU);
#ifdef MNN_BUILD_MINI
SizeComputerSuite::init();
GeometryComputer::init();
#endif
Backend::Info info;
info.type = MNN_FORWARD_CPU;
info.numThread = 1;
std::shared_ptr<Runtime> bn(creator->onCreate(info));
RuntimeHint hint;
hint.memoryAllocatorType = 0;// Defer
bn->setRuntimeHint(hint);
gExecutor = new std::shared_ptr<Executor>(new Executor(bn, MNN_FORWARD_CPU, 1));
static std::shared_ptr<Executor> executorStatic;
executorStatic.reset(new Executor(bn, MNN_FORWARD_CPU, 1));
gExecutor = &executorStatic;
});
return *gExecutor;
}
@@ -254,6 +252,10 @@ void Executor::RuntimeManager::setMode(Interpreter::SessionMode mode) {
void Executor::RuntimeManager::setHint(Interpreter::HintMode mode, int value) {
mInside->modes.setHint(mode, value);
}
void Executor::RuntimeManager::setExternalPath(std::string path, int type) {
mInside->modes.setExternalPath(path, type);
}

bool Executor::RuntimeManager::getInfo(Interpreter::SessionInfoCode code, void* ptr) {
// Only support get memory
switch (code) {
@@ -320,7 +322,7 @@ Executor::RuntimeManager* Executor::RuntimeManager::createRuntimeManager(const S
}
originRt.insert(std::make_pair(compute.type, std::shared_ptr<Runtime>(newBn)));
} else {
iter->second->onReset(compute.numThread, compute.user);
iter->second->onReset(compute.numThread, compute.user, false);
}
res->mInside->mRuntime.second = originRt[DEFAULT_BACKUP_RUNTIME_KEY];
res->mInside->mRuntime.first.insert(std::make_pair(compute.type, originRt[compute.type]));