Skip to content

Commit

Permalink
Merge pull request #2963 from alibaba/feature/sync
Browse files Browse the repository at this point in the history
MNN:Sync: Sync Internal 2.9.3
  • Loading branch information
jxt1234 authored Jul 23, 2024
2 parents e101116 + 6ba9f9f commit 11a65d7
Show file tree
Hide file tree
Showing 260 changed files with 28,087 additions and 11,836 deletions.
43 changes: 39 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,8 @@ include(${CMAKE_CURRENT_LIST_DIR}/source/backend/cpu/CMakeLists.txt)

SET(MNN_PUB_HDRS "")
SET(MNN_EXPR_PUB_HDRS "")
set(MNN_EXTRA_HEADERS "")

list(APPEND MNN_PUB_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/include/MNN/MNNDefine.h")
list(APPEND MNN_PUB_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/include/MNN/Interpreter.hpp")
list(APPEND MNN_PUB_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/include/MNN/HalideRuntime.h")
Expand All @@ -430,6 +432,19 @@ list(APPEND MNN_EXPR_PUB_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/include/MNN/expr/Neur
list(APPEND MNN_EXPR_PUB_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/include/MNN/expr/ExecutorScope.hpp")
list(APPEND MNN_EXPR_PUB_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/include/MNN/expr/Scope.hpp")

# Add Extra Header
IF(MNN_BUILD_OPENCV)
file(GLOB MNN_CV_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/tools/cv/include/cv/*.hpp PARENT_SCOPE)
file(GLOB MNN_CV_IMGHDRS ${CMAKE_CURRENT_SOURCE_DIR}/tools/cv/include/cv/imgproc/*.hpp PARENT_SCOPE)
list(APPEND MNN_EXTRA_HEADERS ${MNN_CV_HDRS})
list(APPEND MNN_EXTRA_HEADERS ${MNN_CV_IMGHDRS})
ENDIF()
IF(MNN_BUILD_LLM)
file(GLOB MNN_LLM_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/transformers/llm/engine/include/llm/*)
list(APPEND MNN_EXTRA_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/transformers/llm/engine/include/llm/llm.hpp)
ENDIF()


set(MNN_DEPS "")
set(MNN_EXTRA_DEPENDS "")

Expand Down Expand Up @@ -659,11 +674,11 @@ IF(MNN_TENSORRT)
ENDIF()

IF(MNN_SEP_BUILD)
add_library(MNN SHARED ${CMAKE_CURRENT_LIST_DIR}/cmake/dummy.cpp ${MNN_OBJECTS_TO_LINK} ${MNN_PUB_HDRS} ${MNN_EXPR_PUB_HDRS})
add_library(MNN SHARED ${CMAKE_CURRENT_LIST_DIR}/cmake/dummy.cpp ${MNN_OBJECTS_TO_LINK} ${MNN_PUB_HDRS} ${MNN_EXPR_PUB_HDRS} ${MNN_EXTRA_HEADERS})
target_link_libraries(MNN PUBLIC ${MNN_EXTRA_DEPENDS})
ELSE()
IF(MNN_BUILD_SHARED_LIBS)
add_library(MNN SHARED ${CMAKE_CURRENT_LIST_DIR}/cmake/dummy.cpp ${MNN_OBJECTS_TO_LINK} ${MNN_PUB_HDRS} ${MNN_EXPR_PUB_HDRS})
add_library(MNN SHARED ${CMAKE_CURRENT_LIST_DIR}/cmake/dummy.cpp ${MNN_OBJECTS_TO_LINK} ${MNN_PUB_HDRS} ${MNN_EXPR_PUB_HDRS} ${MNN_EXTRA_HEADERS})
if (WIN32)
foreach(TARGET ${MNN_TARGETS})
target_compile_definitions(${TARGET} PRIVATE "-DBUILDING_MNN_DLL")
Expand All @@ -673,7 +688,7 @@ ELSE()
target_compile_definitions(MNN INTERFACE "-DUSING_MNN_DLL")
endif()
ELSE()
add_library(MNN STATIC ${CMAKE_CURRENT_LIST_DIR}/cmake/dummy.cpp ${MNN_OBJECTS_TO_LINK} ${MNN_PUB_HDRS} ${MNN_EXPR_PUB_HDRS})
add_library(MNN STATIC ${CMAKE_CURRENT_LIST_DIR}/cmake/dummy.cpp ${MNN_OBJECTS_TO_LINK} ${MNN_PUB_HDRS} ${MNN_EXPR_PUB_HDRS} ${MNN_EXTRA_HEADERS})
ENDIF()
target_link_libraries(MNN PUBLIC ${MNN_EXTRA_DEPENDS})
ENDIF()
Expand Down Expand Up @@ -729,7 +744,6 @@ IF(MNN_BUILD_OPENCV AND NOT MNN_SEP_BUILD)
ENDIF()
target_sources(MNN PRIVATE $<TARGET_OBJECTS:MNNOpenCV>)
ENDIF()

IF(MNN_BUILD_LLM)
# add_definitions(-DMNN_BUILD_LLM)
include(${CMAKE_CURRENT_LIST_DIR}/transformers/llm/engine/CMakeLists.txt)
Expand Down Expand Up @@ -831,6 +845,27 @@ ELSE()
ARCHIVE DESTINATION lib
FRAMEWORK DESTINATION /Library/Frameworks/
)
IF(MNN_BUILD_OPENCV)
if (NOT MNN_AAPL_FMWK)
INSTALL(FILES ${MNN_CV_HDRS} DESTINATION include/MNN/cv)
INSTALL(FILES ${MNN_CV_IMGHDRS} DESTINATION include/MNN/cv/imgproc)
endif()
FOREACH(HDR ${MNN_CV_HDRS})
SET_SOURCE_FILES_PROPERTIES(${HDR} PROPERTIES MACOSX_PACKAGE_LOCATION Headers/cv/ )
ENDFOREACH()
FOREACH(HDR ${MNN_CV_IMGHDRS})
SET_SOURCE_FILES_PROPERTIES(${HDR} PROPERTIES MACOSX_PACKAGE_LOCATION Headers/cv/imgproc )
ENDFOREACH()
ENDIF()
IF(MNN_BUILD_LLM)
if (NOT MNN_AAPL_FMWK)
INSTALL(FILES ${MNN_LLM_HDRS} DESTINATION include/MNN/llm)
endif()
FOREACH(HDR ${MNN_LLM_HDRS})
SET_SOURCE_FILES_PROPERTIES(${HDR} PROPERTIES MACOSX_PACKAGE_LOCATION Headers/llm )
ENDFOREACH()
ENDIF()

if (NOT MNN_AAPL_FMWK)
INSTALL(FILES ${MNN_PUB_HDRS} DESTINATION include/MNN/)
INSTALL(FILES ${MNN_EXPR_PUB_HDRS} DESTINATION include/MNN/expr/)
Expand Down
208 changes: 208 additions & 0 deletions backupcode/cpubackend/arm/arm32/bf16/MNNPackC4ForMatMul_A_BF16.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
//
// NEON_MNNPackC4ForMatMul_A_BF16.S
// MNN
//
// Created by MNN on 2021/02/21.
// Copyright © 2018-2021 Alibaba Group Holding Limited
//
#ifdef __arm__
#ifndef __aarch64__

#include "MNNAsmGlobal.h"

.text
.align 5
asm_function NEON_MNNPackC4ForMatMul_A_BF16
// treate float pointer as int16_t*
//void NEON_MNNPackC4ForMatMul_A_BF16(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el)
//Auto: r0: dest, r1:sourceGroup, r2: info, r3:el
push {r4-r8, r10, r11, lr} // avoid to touch platform-register r-9
ldr r10, [r2, #0] // number
ldr r4, [r2, #4] // eReal
ldr r11, [r2, #8] // eDest
ldr r6, [r2, #12] // xOffset
// xOffset -> xOffset * 4 * sizeof(float)
// eReal -> eReal * 4 * sizeof(float)
// eDest -> eDest * sizeof(float)
mov r12, #2 // sizeof(int16_t)
mov lr, #8 // sizeof(int16_t) * 4
mul r4, lr, r4
mul r11, r12, r11
mul r6, lr, r6

LoopNumber:
ldr r5, [r3, #4] // l
ldr r8, [r3, #8] // eOffset
ldr r7, [r3, #12] // lOffset

push {r0, r1}
ldr r1, [r1, #0]

// Compute dest ptr: r0 = r0 + eOffset * sizeof(float) + lOffset * eDest * sizeof(float)
; mov lr, #2 //sizeof(int16_t)
mul r7, r11, r7
mul r8, r12, r8
add r0, r0, r7
add r0, r0, r8

ldr r2, [r3, #0] // e

Body:
cmp r2, #12
bne Right
cmp r5, #4
blt LoopEL3
LoopL4:
mov r2, r1
.macro MAIN_TRANSPOSE
vld1.16 {d16}, [r1], r6 // load size: 4 * sizeof(int16_t)
vld1.16 {d19}, [r1], r6
vld1.16 {d22}, [r1], r6
vld1.16 {d25}, [r1], r6
vld1.16 {d17}, [r1], r6
vld1.16 {d20}, [r1], r6
vld1.16 {d23}, [r1], r6
vld1.16 {d26}, [r1], r6
vld1.16 {d18}, [r1], r6
vld1.16 {d21}, [r1], r6
vld1.16 {d24}, [r1], r6
vld1.16 {d27}, [r1], r6

// transpose each 4 16-bit elements in 2 d_n vectors, by transpose 16-bit and scale up transpose 32-bit.
vtrn.16 d16, d19
vtrn.16 d22, d25
// vswp d0[2-3], d2[0-1]
// vswp d1[2-3], d3[0-1]
// swap half of 64-bit is equal to transpose in 32-bit unit.
vtrn.32 d16, d22
vtrn.32 d19, d25

vtrn.16 d17, d20
vtrn.16 d23, d26
vtrn.32 d17, d23
vtrn.32 d20, d26

vtrn.16 d18, d21
vtrn.16 d24, d27
vtrn.32 d18, d24
vtrn.32 d21, d27
// after transpose from 12x4 to 4x12, memory layout is
// +-------+------+------+
// | d16...|d17...|d18...|
// +-------+------+------+
// | d19...|d20...|d21...|
// +-------+------+------+
// | d22...|d23...|d24...|
// +-------+------+------+
// | d25...|d26...|d27...|
// +-------+------+------+
.endm
MAIN_TRANSPOSE

vstm r0!, {d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27} // store at one time: 12 * 4 * sizeof(int16_t)

add r1, r2, r4
sub r5, r5, #4
cmp r5, #4
bge LoopL4

LoopEL3:
cmp r5, #3
blt LoopEL2
MAIN_TRANSPOSE

vstm r0!, {d16, d17, d18, d19, d20, d21, d22, d23, d24}

b LoopEEnd

LoopEL2:
cmp r5, #2
blt LoopEL1
MAIN_TRANSPOSE

vstm r0!, {d16, d17, d18, d19, d20, d21}

b LoopEEnd

LoopEL1:
cmp r5, #0
beq LoopEEnd
MAIN_TRANSPOSE

vstm r0!, {d16, d17, d18}

LoopEEnd:

b End


Right:

LoopE1:
mov lr, r5
mov r7, r1
mov r8, r0
cmp r5, #4
blt LoopE1L3
LoopE1L4:
vld1.16 {d0}, [r1], r4
vst1.16 {d0[0]}, [r0], r11
vst1.16 {d0[1]}, [r0], r11
vst1.16 {d0[2]}, [r0], r11
vst1.16 {d0[3]}, [r0], r11
sub r5, r5, #4
cmp r5, #4
bge LoopE1L4

LoopE1L3:
cmp r5, #3
blt LoopE1L2
vld1.16 {d0}, [r1], r4
vst1.16 {d0[0]}, [r0], r11
vst1.16 {d0[1]}, [r0], r11
vst1.16 {d0[2]}, [r0], r11

sub r5, r5, #3

LoopE1L2:
cmp r5, #2
blt LoopE1L1
vld1.16 {d0}, [r1], r4
vst1.16 {d0[0]}, [r0], r11
vst1.16 {d0[1]}, [r0], r11
sub r5, r5, #2

LoopE1L1:
cmp r5, #1
blt LoopE1End
vld1.16 {d0[0]}, [r1], r4
vst1.16 {d0[0]}, [r0], r11

LoopE1End:

subs r2, r2, #1
add r0, r8, r12 // !!!! caution : sizeof(int16_t)
add r1, r7, r6
mov r5, lr
bne LoopE1

End:

pop {r0, r1}
subs r10, r10, #1

// x3 is (const int32_t* el), this array size of 4. as a result for next struct element,
// address added by 4 * sizeof(int32_t)
add r3, r3, #16

// x1 is (const int16_t** sourceGroup), even though data content is int16_t,
// the element in sourceGroup in 'int16_t*', as a result for next struct element,
// value added by sizeof(void*)
add r1, r1, #4

bne LoopNumber

pop {r4-r8, r10, r11, pc}

#endif
#endif
Loading

0 comments on commit 11a65d7

Please sign in to comment.