Skip to content

Commit

Permalink
add ascend kernel compile struct
Browse files Browse the repository at this point in the history
  • Loading branch information
hipudding committed Apr 25, 2024
1 parent 3528399 commit f1bde5d
Show file tree
Hide file tree
Showing 11 changed files with 384 additions and 31 deletions.
33 changes: 25 additions & 8 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -822,20 +822,37 @@ if (LLAMA_CANN)
endif()
endif()

# Set headers
set(CANN_INCLUDE_DIRS "${CANN_INSTALL_DIR}/include" "${CANN_INSTALL_DIR}/include/aclnn")
# Set header and libs
if(LLAMA_CANN)
set(CANN_INCLUDE_DIRS
${CANN_INSTALL_DIR}/include
${CANN_INSTALL_DIR}/include/aclnn
${CANN_INSTALL_DIR}/acllib/include
)

# TODO: find libs
link_directories(
${CANN_INSTALL_DIR}/lib64
${CANN_INSTALL_DIR}/aarch64-linux/simulator/Ascend910B1/lib
${CANN_INSTALL_DIR}/../8.0.RC1.alpha003/tools/tikicpulib/lib/Ascend910B1)

# Set libs
if (LLAMA_CANN)
# Build Ascendc kernels.
add_subdirectory(ggml-cann/kernels)
list(APPEND CANN_LIBRARIES ascendcl nnopbase opapi acl_op_compiler ascendc_kernels)
LINK_DIRECTORIES(${LINK_DIRECTORIES} ${CANN_INSTALL_DIR}/lib64)
list(APPEND CANN_LIBRARIES
ascendcl
nnopbase
opapi
acl_op_compiler
cann_kernels
)

set(GGML_HEADERS_CANN ggml-cann.h)
file(GLOB GGML_SOURCES_CUDA "ggml-cann/*.cpp")
list(APPEND GGML_SOURCES_CANN "ggml-cann.cpp")
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${CANN_LIBRARIES})

message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")

set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${CANN_LIBRARIES} )
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS})
add_compile_definitions(GGML_USE_CANN)
endif()
Expand Down
7 changes: 4 additions & 3 deletions ggml-cann.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
ggml_cann_clamp(ctx, dst);
break;
case GGML_OP_CPY:
return false;
ggml_cann_cpy(ctx, dst);
break;
case GGML_OP_CONT:
ggml_cann_dup(ctx, dst);
break;
Expand Down Expand Up @@ -664,8 +665,8 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
case GGML_OP_MUL_MAT_ID:
// embedding
case GGML_OP_GET_ROWS:
case GGML_OP_CPY:
return false;
case GGML_OP_CPY:
case GGML_OP_DUP:
case GGML_OP_REPEAT:
case GGML_OP_CONCAT:
Expand Down Expand Up @@ -847,7 +848,7 @@ extern "C" GGML_CALL int ggml_backend_cann_reg_devices();

GGML_CALL int ggml_backend_cann_reg_devices() {
ACL_CHECK(aclInit(nullptr));
uint32_t device_count = ggml_backend_cann_get_device_count();
uint32_t device_count = 1;//= ggml_backend_cann_get_device_count();
// initialization
for (uint32_t i = 0; i < device_count; i++) {
char name[128];
Expand Down
36 changes: 30 additions & 6 deletions ggml-cann/aclnn_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -766,12 +766,7 @@ void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ACL_CHECK(aclDestroyIntArray(dilations));
}

void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_tensor* src = dst->src[0];

aclTensor* acl_src = create_acl_tensor(src);
aclTensor* acl_dst = create_acl_tensor(dst);

void cann_copy(ggml_backend_cann_context& ctx, ggml_tensor* dst, aclTensor* acl_src, aclTensor* acl_dst) {
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;
Expand All @@ -786,6 +781,16 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
aclrtStream stream = ctx.stream();
ACL_CHECK(aclnnInplaceCopy(workspaceAddr, workspaceSize, executor, stream));

}

void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_tensor* src = dst->src[0];

aclTensor* acl_src = create_acl_tensor(src);
aclTensor* acl_dst = create_acl_tensor(dst);

cann_copy(ctx, dst, acl_src, acl_dst);

ACL_CHECK(aclDestroyTensor(acl_src));
ACL_CHECK(aclDestroyTensor(acl_dst));
}
Expand Down Expand Up @@ -1583,4 +1588,23 @@ void ggml_cann_alibi(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ACL_CHECK(aclDestroyTensor(tmp_mk_tensor));
ACL_CHECK(aclDestroyTensor(tmp_arange3_tensor));
ACL_CHECK(aclDestroyTensor(tmp_output_tensor));
}

void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_tensor* src = dst->src[0];

aclTensor* acl_src = create_acl_tensor(src);
aclTensor* acl_dst = create_acl_tensor(dst);

if(!ggml_is_quantized(dst->type)) {
cann_copy(ctx, dst, acl_src, acl_dst);
} else {
uint8_t* size = (uint8_t*)ctx.alloc_buffer(dst, sizeof(size_t));
size_t ne = ggml_nelements(src);
aclrtMemcpy(size, sizeof(size_t), &ne, sizeof(size_t), ACL_MEMCPY_HOST_TO_DEVICE);
size_t ne1;
aclrtMemcpy(&ne1, sizeof(size_t), size, sizeof(size_t), ACL_MEMCPY_DEVICE_TO_HOST);

cann_quantize_q4_0(1, nullptr, (uint8_t*)src->data, (uint8_t*)dst->data, size);
}
}
2 changes: 2 additions & 0 deletions ggml-cann/aclnn_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* d

void ggml_cann_alibi(ggml_backend_cann_context& ctx, ggml_tensor* dst);

void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst);

template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
aclTensor*, uint64_t*, aclOpExecutor**),
aclnnStatus execute(void*, uint64_t, aclOpExecutor*, aclrtStream)>
Expand Down
55 changes: 41 additions & 14 deletions ggml-cann/kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,19 +1,46 @@
if(NOT SOC_VERSION)
set(SOC_VERSION "ascend910b3")
if (NOT SOC_TYPE)
set (SOC_TYPE "Ascend910B3")
endif()
set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim/cpu")

if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
file(GLOB SRC_FILES
dequantize_q4_0.cpp
quantize_q4_0.cpp
)

string(TOLOWER "${CMAKE_BUILD_TYPE}" lowercase_CMAKE_BUILD_TYPE)
if(${lowercase_CMAKE_BUILD_TYPE} STREQUAL "debug")
if (NOT DEFINED ENV{CMAKE_PREFIX_PATH})
set(CMAKE_PREFIX_PATH ${CANN_INSTALL_DIR}/tools/tikicpulib/lib/cmake)
endif()

find_package(tikicpulib REQUIRED)
add_library(cann_kernels ${SRC_FILES} ascendc_kernels.cpp)
target_link_libraries(cann_kernels PRIVATE
ascendcl
tikicpulib::ascend910B1
)

target_compile_features(cann_kernels PRIVATE cxx_std_17)

else()
message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
endif()
string(TOLOWER ${SOC_TYPE} SOC_VERSION)
set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")

include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
else()
message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
endif()
include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)

ascendc_library(ascendc_kernels STATIC
threshold_opencv_kernel.cpp
)
ascendc_library(ascendc_kernels STATIC
${SRC_FILES}
)
#ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)

add_library(cann_kernels STATIC ascendc_kernels.cpp)
target_link_libraries(cann_kernels PUBLIC ascendc_kernels)
endif()
56 changes: 56 additions & 0 deletions ggml-cann/kernels/ascendc_kernels.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#include "ascendc_kernels.h"

#ifdef __CCE_KT_TEST__
#include "tikicpulib.h"
#else
#include "aclrtlaunch_ascendc_dequantize_q4_0.h"
#include "aclrtlaunch_ascendc_quantize_q4_0.h"
#endif


#ifdef __CCE_KT_TEST__
#include <acl/acl.h>

uint8_t* to_gm(uint8_t* ptr, size_t size) {
uint8_t* gm = (uint8_t*)AscendC::GmAlloc(size);
aclrtMemcpy(gm, size, ptr, size, ACL_MEMCPY_DEVICE_TO_HOST);
return gm;
}

void free_gm(uint8_t* ptr) {
aclrtFree(ptr);
}

extern "C" __global__ __aicore__ void ascendc_dequantize_q4_0(GM_ADDR x, GM_ADDR y, GM_ADDR size);
extern "C" __global__ __aicore__ void ascendc_quantize_q4_0(GM_ADDR x, GM_ADDR y, GM_ADDR size);
#endif

void cann_dequantize_q4_0(uint32_t block_dim, void* stream, uint8_t* x, uint8_t* y, uint8_t* size) {
#ifdef __CCE_KT_TEST__
uint8_t* size_host = to_gm(size, sizeof(size_t));
uint8_t* x_host = to_gm(x, *((size_t*)size_host));
uint8_t* y_host = to_gm(y, *((size_t*)size_host));
AscendC::SetKernelMode(KernelMode::AIV_MODE);
ICPU_RUN_KF(ascendc_dequantize_q4_0, 1, x_host, y_host, size_host);
free_gm(size_host);
free_gm(x_host);
free_gm(y_host);
#else
aclrtlaunch_ascendc_dequantize_q4_0(block_dim, stream, x, y, size);
#endif
}

void cann_quantize_q4_0(uint32_t block_dim, void* stream, uint8_t* x, uint8_t* y, uint8_t* size) {
#ifdef __CCE_KT_TEST__
uint8_t* size_host = to_gm(size, sizeof(size_t));
uint8_t* x_host = to_gm(x, *((size_t*)size_host));
uint8_t* y_host = to_gm(y, *((size_t*)size_host));
AscendC::SetKernelMode(KernelMode::AIV_MODE);
ICPU_RUN_KF(ascendc_quantize_q4_0, 1, x_host, y_host, size_host);
free_gm(size_host);
free_gm(x_host);
free_gm(y_host);
#else
aclrtlaunch_ascendc_quantize_q4_0(block_dim, stream, x, y, size);
#endif
}
6 changes: 6 additions & 0 deletions ggml-cann/kernels/ascendc_kernels.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
#ifndef ASCENDC_KERNELS_H
#define ASCENDC_KERNELS_H


#include <stdint.h>

void cann_dequantize_q4_0(uint32_t block_dim, void* stream, uint8_t* x, uint8_t* y, uint8_t* size);
void cann_quantize_q4_0(uint32_t block_dim, void* stream, uint8_t* x, uint8_t* y, uint8_t* size);

#endif //ASCENDC_KERNELS_H
108 changes: 108 additions & 0 deletions ggml-cann/kernels/dequantize_q4_0.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#include "dequantize_q4_0.h"

using namespace AscendC;

#define BUFFER_NUM 2

__aicore__ inline int32_t align_ceil(int32_t n, int32_t align) { return ((n + align) & ~(align-1)); }

__aicore__ inline int32_t align_floor(int32_t n, int32_t align) { return (n & ~(align-1)); }


#define QK4_0 32
typedef struct {
uint16_t d; // delta
uint8_t qs[QK4_0 / 2]; // nibbles / quants
} block_q4_0;

class KernelDequantizeQ4_0
{
public:
__aicore__ inline KernelDequantizeQ4_0() {}
__aicore__ inline void init(GM_ADDR x, GM_ADDR y, size_t size) {
uint64_t src_block_size =
align_ceil(size / GetBlockNum(), sizeof(block_q4_0));
uint64_t src_offset = GetBlockIdx() * src_block_size;
src_block_size =
(src_offset + src_block_size > (size / 32 * sizeof(block_q4_0)))
? (size / 32 * sizeof(block_q4_0) - src_offset)
: src_block_size;
uint64_t dst_block_size =
align_ceil(size / GetBlockNum(), QK4_0 * sizeof(float));
uint64_t dst_offset = GetBlockIdx() * dst_block_size;
dst_block_size =
(dst_offset + dst_block_size > size * sizeof(float))
? (size * sizeof(float) - dst_offset)
: dst_block_size;

xGM.SetGlobalBuffer((__gm__ int4b_t*)x + src_offset, src_block_size);
yGM.SetGlobalBuffer((__gm__ float*)y + dst_offset, dst_block_size);

pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t));
// Ascendc do not support cast int4b_t -> float, but support int4b_t ->
// half -> float.
pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half));
pipe.InitBuffer(copy_queue, BUFFER_NUM, QK4_0 * sizeof(float));
pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float));
}

__aicore__ inline void copy_in(uint32_t offset) {
LocalTensor<int4b_t> x_local = input_queue.AllocTensor<int4b_t>();
// offset + 2 to skip scale.
DataCopy(x_local, xGM[offset + 2], QK4_0);
input_queue.EnQue(x_local);
}

__aicore__ inline void copy_out(uint32_t offset) {
LocalTensor<float> y_local = output_queue.DeQue<float>();
DataCopy(yGM[offset], y_local, QK4_0);
output_queue.FreeTensor(y_local);
}

__aicore__ inline void calculate(uint32_t offset, uint32_t len) {
copy_in(offset);

LocalTensor<int4b_t> x_local = input_queue.DeQue<int4b_t>();
LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
LocalTensor<float> copy_local = copy_queue.AllocTensor<float>();
LocalTensor<float> y_local = output_queue.AllocTensor<float>();

Cast(x_local, cast_local, RoundMode::CAST_NONE, QK4_0);
Cast(cast_local, copy_local, RoundMode::CAST_NONE, QK4_0);


}

__aicore__ inline void run() {
calculate(0, 10);
}

private:
uint64_t block_size;
uint64_t offset;

TPipe pipe;
GlobalTensor<int4b_t> xGM;
GlobalTensor<float> yGM;
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
TQue<QuePosition::VECIN, BUFFER_NUM> copy_queue;
};

extern "C" __global__ __aicore__ void ascendc_dequantize_q4_0(GM_ADDR x, GM_ADDR y, GM_ADDR size)
{
size_t size_ub;
auto size_gm_ptr = (__gm__ uint8_t*)size;
auto size_ub_ptr = (uint8_t*)&size_ub;

for (int32_t i = 0; i < sizeof(size_t) / sizeof(uint8_t);
++i, ++size_gm_ptr, ++size_ub_ptr)
{
*size_ub_ptr = *size_gm_ptr;
}

KernelDequantizeQ4_0 dequantize_q4_0;
dequantize_q4_0.init(x, y, size_ub);
dequantize_q4_0.run();
}
7 changes: 7 additions & 0 deletions ggml-cann/kernels/dequantize_q4_0.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#ifndef DEQUANTIZE_Q4_0_H
#define DEQUANTIZE_Q4_0_H

#include "ascendc_kernels.h"
#include "kernel_operator.h"

#endif //DEQUANTIZE_Q4_0_H
Loading

0 comments on commit f1bde5d

Please sign in to comment.