add ascend kernel compile struct

hipudding · Apr 25, 2024 · f1bde5d · f1bde5d
1 parent 3528399
commit f1bde5d
Show file tree

Hide file tree

Showing 11 changed files with 384 additions and 31 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -822,20 +822,37 @@ if (LLAMA_CANN)
             endif()
         endif()
 
-        # Set headers
-        set(CANN_INCLUDE_DIRS "${CANN_INSTALL_DIR}/include" "${CANN_INSTALL_DIR}/include/aclnn")
+        # Set header and libs
+        if(LLAMA_CANN)
+            set(CANN_INCLUDE_DIRS 
+                ${CANN_INSTALL_DIR}/include
+                ${CANN_INSTALL_DIR}/include/aclnn
+                ${CANN_INSTALL_DIR}/acllib/include
+            )
+
+            # TODO: find libs
+            link_directories(
+                ${CANN_INSTALL_DIR}/lib64 
+                ${CANN_INSTALL_DIR}/aarch64-linux/simulator/Ascend910B1/lib 
+                ${CANN_INSTALL_DIR}/../8.0.RC1.alpha003/tools/tikicpulib/lib/Ascend910B1)
 
-        # Set libs
-        if (LLAMA_CANN)
-            # Build Ascendc kernels.
             add_subdirectory(ggml-cann/kernels)
-            list(APPEND CANN_LIBRARIES ascendcl nnopbase opapi acl_op_compiler ascendc_kernels)
-            LINK_DIRECTORIES(${LINK_DIRECTORIES} ${CANN_INSTALL_DIR}/lib64)
+            list(APPEND CANN_LIBRARIES
+                ascendcl
+                nnopbase
+                opapi
+                acl_op_compiler
+                cann_kernels
+            )
 
             set(GGML_HEADERS_CANN ggml-cann.h)
             file(GLOB GGML_SOURCES_CUDA "ggml-cann/*.cpp")
             list(APPEND GGML_SOURCES_CANN "ggml-cann.cpp")
-            set(LLAMA_EXTRA_LIBS     ${LLAMA_EXTRA_LIBS}     ${CANN_LIBRARIES})
+
+            message(STATUS "CANN: CANN_INCLUDE_DIRS =  ${CANN_INCLUDE_DIRS}")
+            message(STATUS "CANN: CANN_LIBRARIES =  ${CANN_LIBRARIES}")
+
+            set(LLAMA_EXTRA_LIBS     ${LLAMA_EXTRA_LIBS}     ${CANN_LIBRARIES} )
             set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS})
             add_compile_definitions(GGML_USE_CANN)
         endif()

diff --git a/ggml-cann.cpp b/ggml-cann.cpp
@@ -431,7 +431,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
             ggml_cann_clamp(ctx, dst);
             break;
         case GGML_OP_CPY:
-            return false;
+            ggml_cann_cpy(ctx, dst);
+            break;
         case GGML_OP_CONT:
             ggml_cann_dup(ctx, dst);
             break;
@@ -664,8 +665,8 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
         case GGML_OP_MUL_MAT_ID:
         // embedding
         case GGML_OP_GET_ROWS:
-        case GGML_OP_CPY:
             return false;
+        case GGML_OP_CPY:
         case GGML_OP_DUP:
         case GGML_OP_REPEAT:
         case GGML_OP_CONCAT:
@@ -847,7 +848,7 @@ extern "C" GGML_CALL int ggml_backend_cann_reg_devices();
 
 GGML_CALL int ggml_backend_cann_reg_devices() {
     ACL_CHECK(aclInit(nullptr));
-    uint32_t device_count = ggml_backend_cann_get_device_count();
+    uint32_t device_count = 1;//= ggml_backend_cann_get_device_count();
     // initialization
     for (uint32_t i = 0; i < device_count; i++) {
         char name[128];

diff --git a/ggml-cann/aclnn_ops.cpp b/ggml-cann/aclnn_ops.cpp
@@ -766,12 +766,7 @@ void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ACL_CHECK(aclDestroyIntArray(dilations));
 }
 
-void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src = dst->src[0];
-
-    aclTensor* acl_src = create_acl_tensor(src);
-    aclTensor* acl_dst = create_acl_tensor(dst);
-
+void cann_copy(ggml_backend_cann_context& ctx, ggml_tensor* dst, aclTensor* acl_src, aclTensor* acl_dst) {
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
     void* workspaceAddr = nullptr;
@@ -786,6 +781,16 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     aclrtStream stream = ctx.stream();
     ACL_CHECK(aclnnInplaceCopy(workspaceAddr, workspaceSize, executor, stream));
 
+}
+
+void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+
+    aclTensor* acl_src = create_acl_tensor(src);
+    aclTensor* acl_dst = create_acl_tensor(dst);
+
+    cann_copy(ctx, dst, acl_src, acl_dst);
+
     ACL_CHECK(aclDestroyTensor(acl_src));
     ACL_CHECK(aclDestroyTensor(acl_dst));
 }
@@ -1583,4 +1588,23 @@ void ggml_cann_alibi(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ACL_CHECK(aclDestroyTensor(tmp_mk_tensor));
     ACL_CHECK(aclDestroyTensor(tmp_arange3_tensor));
     ACL_CHECK(aclDestroyTensor(tmp_output_tensor));
+}
+
+void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+
+    aclTensor* acl_src = create_acl_tensor(src);
+    aclTensor* acl_dst = create_acl_tensor(dst);
+
+    if(!ggml_is_quantized(dst->type)) {
+        cann_copy(ctx, dst, acl_src, acl_dst);
+    } else {
+        uint8_t* size = (uint8_t*)ctx.alloc_buffer(dst, sizeof(size_t));
+        size_t ne = ggml_nelements(src);
+        aclrtMemcpy(size, sizeof(size_t), &ne, sizeof(size_t), ACL_MEMCPY_HOST_TO_DEVICE);
+        size_t ne1;
+        aclrtMemcpy(&ne1, sizeof(size_t), size, sizeof(size_t), ACL_MEMCPY_DEVICE_TO_HOST);
+
+        cann_quantize_q4_0(1, nullptr, (uint8_t*)src->data, (uint8_t*)dst->data, size);
+    }
 }
diff --git a/ggml-cann/aclnn_ops.h b/ggml-cann/aclnn_ops.h
@@ -66,6 +66,8 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* d
 
 void ggml_cann_alibi(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 
+void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
 template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
                                        aclTensor*, uint64_t*, aclOpExecutor**),
           aclnnStatus execute(void*, uint64_t, aclOpExecutor*, aclrtStream)>

diff --git a/ggml-cann/kernels/CMakeLists.txt b/ggml-cann/kernels/CMakeLists.txt
@@ -1,19 +1,46 @@
-if(NOT SOC_VERSION)
-    set(SOC_VERSION "ascend910b3")
+if (NOT SOC_TYPE)
+    set (SOC_TYPE "Ascend910B3")
 endif()
-set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
-set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim/cpu")
 
-if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
-    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
-elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
-    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
+file(GLOB SRC_FILES
+    dequantize_q4_0.cpp
+    quantize_q4_0.cpp
+)
+
+string(TOLOWER "${CMAKE_BUILD_TYPE}" lowercase_CMAKE_BUILD_TYPE)
+if(${lowercase_CMAKE_BUILD_TYPE} STREQUAL "debug")
+    if (NOT DEFINED ENV{CMAKE_PREFIX_PATH})
+    set(CMAKE_PREFIX_PATH ${CANN_INSTALL_DIR}/tools/tikicpulib/lib/cmake)
+    endif()
+
+    find_package(tikicpulib REQUIRED)
+    add_library(cann_kernels ${SRC_FILES} ascendc_kernels.cpp)
+    target_link_libraries(cann_kernels PRIVATE
+        ascendcl
+        tikicpulib::ascend910B1
+    )
+
+    target_compile_features(cann_kernels PRIVATE cxx_std_17)
+
 else()
-    message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
-endif()
+    string(TOLOWER ${SOC_TYPE} SOC_VERSION)
+    set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
+    set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
 
-include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
+    if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+        set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+    elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
+        set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
+    else()
+        message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
+    endif()
+    include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
 
-ascendc_library(ascendc_kernels STATIC
-    threshold_opencv_kernel.cpp
-)
+    ascendc_library(ascendc_kernels STATIC
+        ${SRC_FILES}
+    )
+    #ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
+
+    add_library(cann_kernels STATIC ascendc_kernels.cpp)
+    target_link_libraries(cann_kernels PUBLIC ascendc_kernels)
+endif()
diff --git a/ggml-cann/kernels/ascendc_kernels.cpp b/ggml-cann/kernels/ascendc_kernels.cpp
@@ -0,0 +1,56 @@
+#include "ascendc_kernels.h"
+
+#ifdef __CCE_KT_TEST__
+#include "tikicpulib.h"
+#else
+#include "aclrtlaunch_ascendc_dequantize_q4_0.h"
+#include "aclrtlaunch_ascendc_quantize_q4_0.h"
+#endif
+
+
+#ifdef __CCE_KT_TEST__
+#include <acl/acl.h>
+
+uint8_t* to_gm(uint8_t* ptr, size_t size) {
+    uint8_t* gm = (uint8_t*)AscendC::GmAlloc(size);
+    aclrtMemcpy(gm, size, ptr, size, ACL_MEMCPY_DEVICE_TO_HOST);
+    return gm;
+}
+
+void free_gm(uint8_t* ptr) {
+    aclrtFree(ptr);
+}
+
+extern "C" __global__ __aicore__ void ascendc_dequantize_q4_0(GM_ADDR x, GM_ADDR y, GM_ADDR size);
+extern "C" __global__ __aicore__ void ascendc_quantize_q4_0(GM_ADDR x, GM_ADDR y, GM_ADDR size);
+#endif
+
+void cann_dequantize_q4_0(uint32_t block_dim, void* stream, uint8_t* x, uint8_t* y, uint8_t* size) {
+#ifdef __CCE_KT_TEST__
+    uint8_t* size_host = to_gm(size, sizeof(size_t));
+    uint8_t* x_host = to_gm(x, *((size_t*)size_host));
+    uint8_t* y_host = to_gm(y, *((size_t*)size_host));
+    AscendC::SetKernelMode(KernelMode::AIV_MODE);
+    ICPU_RUN_KF(ascendc_dequantize_q4_0, 1, x_host, y_host, size_host);
+    free_gm(size_host);
+    free_gm(x_host);
+    free_gm(y_host);
+#else
+    aclrtlaunch_ascendc_dequantize_q4_0(block_dim, stream, x, y, size);
+#endif
+}
+
+void cann_quantize_q4_0(uint32_t block_dim, void* stream, uint8_t* x, uint8_t* y, uint8_t* size) {
+#ifdef __CCE_KT_TEST__
+    uint8_t* size_host = to_gm(size, sizeof(size_t));
+    uint8_t* x_host = to_gm(x, *((size_t*)size_host));
+    uint8_t* y_host = to_gm(y, *((size_t*)size_host));
+    AscendC::SetKernelMode(KernelMode::AIV_MODE);
+    ICPU_RUN_KF(ascendc_quantize_q4_0, 1, x_host, y_host, size_host);
+    free_gm(size_host);
+    free_gm(x_host);
+    free_gm(y_host);
+#else
+    aclrtlaunch_ascendc_quantize_q4_0(block_dim, stream, x, y, size);
+#endif
+}
diff --git a/ggml-cann/kernels/ascendc_kernels.h b/ggml-cann/kernels/ascendc_kernels.h
@@ -1,4 +1,10 @@
 #ifndef ASCENDC_KERNELS_H
 #define ASCENDC_KERNELS_H
 
+
+#include <stdint.h>
+
+void cann_dequantize_q4_0(uint32_t block_dim, void* stream, uint8_t* x, uint8_t* y, uint8_t* size);
+void cann_quantize_q4_0(uint32_t block_dim, void* stream, uint8_t* x, uint8_t* y, uint8_t* size);
+
 #endif //ASCENDC_KERNELS_H
diff --git a/ggml-cann/kernels/dequantize_q4_0.cpp b/ggml-cann/kernels/dequantize_q4_0.cpp
@@ -0,0 +1,108 @@
+#include "dequantize_q4_0.h"
+
+using namespace AscendC;
+
+#define BUFFER_NUM 2
+
+__aicore__ inline int32_t align_ceil(int32_t n, int32_t align) { return ((n + align) & ~(align-1)); }
+
+__aicore__ inline int32_t align_floor(int32_t n, int32_t align) { return (n & ~(align-1)); }
+
+
+#define QK4_0 32
+typedef struct {
+    uint16_t d;           // delta
+    uint8_t qs[QK4_0 / 2]; // nibbles / quants
+} block_q4_0;
+
+class KernelDequantizeQ4_0
+{
+public:
+    __aicore__ inline KernelDequantizeQ4_0() {}
+    __aicore__ inline void init(GM_ADDR x, GM_ADDR y, size_t size) {
+        uint64_t src_block_size =
+            align_ceil(size / GetBlockNum(), sizeof(block_q4_0));
+        uint64_t src_offset = GetBlockIdx() * src_block_size;
+        src_block_size =
+            (src_offset + src_block_size > (size / 32 * sizeof(block_q4_0)))
+                ? (size / 32 * sizeof(block_q4_0) - src_offset)
+                : src_block_size;
+        uint64_t dst_block_size =
+            align_ceil(size / GetBlockNum(), QK4_0 * sizeof(float));
+        uint64_t dst_offset = GetBlockIdx() * dst_block_size;
+        dst_block_size =
+            (dst_offset + dst_block_size > size * sizeof(float))
+                ? (size * sizeof(float) - dst_offset)
+                : dst_block_size;
+
+        xGM.SetGlobalBuffer((__gm__ int4b_t*)x + src_offset, src_block_size);
+        yGM.SetGlobalBuffer((__gm__ float*)y + dst_offset, dst_block_size);
+
+        pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t));
+        // Ascendc do not support cast int4b_t -> float, but support int4b_t ->
+        // half -> float.
+        pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half));
+        pipe.InitBuffer(copy_queue, BUFFER_NUM, QK4_0 * sizeof(float));
+        pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float));
+    }
+
+    __aicore__ inline void copy_in(uint32_t offset) {
+        LocalTensor<int4b_t> x_local = input_queue.AllocTensor<int4b_t>();
+        // offset + 2 to skip scale.
+        DataCopy(x_local, xGM[offset + 2], QK4_0);
+        input_queue.EnQue(x_local);
+    }
+
+    __aicore__ inline void copy_out(uint32_t offset) {
+        LocalTensor<float> y_local = output_queue.DeQue<float>();
+        DataCopy(yGM[offset], y_local, QK4_0);
+        output_queue.FreeTensor(y_local);
+    }
+
+    __aicore__ inline void calculate(uint32_t offset, uint32_t len) {
+        copy_in(offset);
+
+        LocalTensor<int4b_t> x_local = input_queue.DeQue<int4b_t>();
+        LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
+        LocalTensor<float> copy_local = copy_queue.AllocTensor<float>();
+        LocalTensor<float> y_local = output_queue.AllocTensor<float>();
+
+        Cast(x_local, cast_local, RoundMode::CAST_NONE, QK4_0);
+        Cast(cast_local, copy_local, RoundMode::CAST_NONE, QK4_0);
+
+
+    }
+
+    __aicore__ inline void run() {
+        calculate(0, 10);
+    }
+
+private:
+    uint64_t block_size;
+    uint64_t offset;
+
+    TPipe pipe;
+    GlobalTensor<int4b_t> xGM;
+    GlobalTensor<float> yGM;
+    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
+    TQue<QuePosition::VECIN, BUFFER_NUM> copy_queue;
+};
+
+extern "C" __global__ __aicore__ void ascendc_dequantize_q4_0(GM_ADDR x, GM_ADDR y, GM_ADDR size)
+{
+    size_t size_ub;
+    auto size_gm_ptr = (__gm__ uint8_t*)size;
+    auto size_ub_ptr = (uint8_t*)&size_ub;
+
+    for (int32_t i = 0; i < sizeof(size_t) / sizeof(uint8_t);
+         ++i, ++size_gm_ptr, ++size_ub_ptr)
+    {
+        *size_ub_ptr = *size_gm_ptr;
+    }
+
+    KernelDequantizeQ4_0 dequantize_q4_0;
+    dequantize_q4_0.init(x, y, size_ub);
+    dequantize_q4_0.run();
+}
diff --git a/ggml-cann/kernels/dequantize_q4_0.h b/ggml-cann/kernels/dequantize_q4_0.h
@@ -0,0 +1,7 @@
+#ifndef DEQUANTIZE_Q4_0_H
+#define DEQUANTIZE_Q4_0_H
+
+#include "ascendc_kernels.h"
+#include "kernel_operator.h"
+
+#endif //DEQUANTIZE_Q4_0_H