Skip to content

Commit

Permalink
add ascend kernel compile struct
Browse files Browse the repository at this point in the history
  • Loading branch information
hipudding committed Apr 25, 2024
1 parent 652b0f7 commit 5b01aa6
Show file tree
Hide file tree
Showing 11 changed files with 384 additions and 44 deletions.
45 changes: 25 additions & 20 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -822,20 +822,37 @@ if (LLAMA_CANN)
endif()
endif()

# Set headers
set(CANN_INCLUDE_DIRS "${CANN_INSTALL_DIR}/include" "${CANN_INSTALL_DIR}/include/aclnn")
# Set header and libs
if(LLAMA_CANN)
set(CANN_INCLUDE_DIRS
${CANN_INSTALL_DIR}/include
${CANN_INSTALL_DIR}/include/aclnn
${CANN_INSTALL_DIR}/acllib/include
)

# TODO: find libs
link_directories(
${CANN_INSTALL_DIR}/lib64
${CANN_INSTALL_DIR}/aarch64-linux/simulator/Ascend910B1/lib
${CANN_INSTALL_DIR}/../8.0.RC1.alpha003/tools/tikicpulib/lib/Ascend910B1)

# Set libs
if (LLAMA_CANN)
# Build Ascendc kernels.
add_subdirectory(ggml-cann/kernels)
list(APPEND CANN_LIBRARIES ascendcl nnopbase opapi acl_op_compiler ascendc_kernels)
LINK_DIRECTORIES(${LINK_DIRECTORIES} ${CANN_INSTALL_DIR}/lib64)
list(APPEND CANN_LIBRARIES
ascendcl
nnopbase
opapi
acl_op_compiler
cann_kernels
)

set(GGML_HEADERS_CANN ggml-cann.h)
file(GLOB GGML_SOURCES_CUDA "ggml-cann/*.cpp")
list(APPEND GGML_SOURCES_CANN "ggml-cann.cpp")
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${CANN_LIBRARIES})

message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")

set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${CANN_LIBRARIES} )
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS})
add_compile_definitions(GGML_USE_CANN)
endif()
Expand Down Expand Up @@ -1218,7 +1235,6 @@ add_library(ggml OBJECT
ggml-backend.h
ggml-quants.c
ggml-quants.h
<<<<<<< HEAD
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
Expand All @@ -1229,17 +1245,6 @@ add_library(ggml OBJECT
${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
=======
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
>>>>>>> a21434a136a034b6c64bd50b17442e36f3d7e3c8
${GGML_SOURCES_CANN} ${GGML_HEADERS_CANN}
)

Expand Down
7 changes: 4 additions & 3 deletions ggml-cann.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
ggml_cann_clamp(ctx, dst);
break;
case GGML_OP_CPY:
return false;
ggml_cann_cpy(ctx, dst);
break;
case GGML_OP_CONT:
ggml_cann_dup(ctx, dst);
break;
Expand Down Expand Up @@ -664,8 +665,8 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
case GGML_OP_MUL_MAT_ID:
// embedding
case GGML_OP_GET_ROWS:
case GGML_OP_CPY:
return false;
case GGML_OP_CPY:
case GGML_OP_DUP:
case GGML_OP_REPEAT:
case GGML_OP_CONCAT:
Expand Down Expand Up @@ -846,7 +847,7 @@ extern "C" GGML_CALL int ggml_backend_cann_reg_devices();

GGML_CALL int ggml_backend_cann_reg_devices() {
ACL_CHECK(aclInit(nullptr));
uint32_t device_count = ggml_backend_cann_get_device_count();
uint32_t device_count = 1;//= ggml_backend_cann_get_device_count();
// initialization
for (uint32_t i = 0; i < device_count; i++) {
char name[128];
Expand Down
37 changes: 30 additions & 7 deletions ggml-cann/aclnn_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -766,12 +766,7 @@ void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ACL_CHECK(aclDestroyIntArray(dilations));
}

void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_tensor* src = dst->src[0];

aclTensor* acl_src = create_acl_tensor(src);
aclTensor* acl_dst = create_acl_tensor(dst);

void cann_copy(ggml_backend_cann_context& ctx, ggml_tensor* dst, aclTensor* acl_src, aclTensor* acl_dst) {
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;
Expand All @@ -786,6 +781,16 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
aclrtStream stream = ctx.stream();
ACL_CHECK(aclnnInplaceCopy(workspaceAddr, workspaceSize, executor, stream));

}

void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_tensor* src = dst->src[0];

aclTensor* acl_src = create_acl_tensor(src);
aclTensor* acl_dst = create_acl_tensor(dst);

cann_copy(ctx, dst, acl_src, acl_dst);

ACL_CHECK(aclDestroyTensor(acl_src));
ACL_CHECK(aclDestroyTensor(acl_dst));
}
Expand Down Expand Up @@ -1580,5 +1585,23 @@ void ggml_cann_alibi(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ACL_CHECK(aclDestroyTensor(tmp_mk_tensor));
ACL_CHECK(aclDestroyTensor(tmp_arange3_tensor));
ACL_CHECK(aclDestroyTensor(tmp_output_tensor));
>>>>>>> a21434a136a034b6c64bd50b17442e36f3d7e3c8
}

void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_tensor* src = dst->src[0];

aclTensor* acl_src = create_acl_tensor(src);
aclTensor* acl_dst = create_acl_tensor(dst);

if(!ggml_is_quantized(dst->type)) {
cann_copy(ctx, dst, acl_src, acl_dst);
} else {
uint8_t* size = (uint8_t*)ctx.alloc_buffer(dst, sizeof(size_t));
size_t ne = ggml_nelements(src);
aclrtMemcpy(size, sizeof(size_t), &ne, sizeof(size_t), ACL_MEMCPY_HOST_TO_DEVICE);
size_t ne1;
aclrtMemcpy(&ne1, sizeof(size_t), size, sizeof(size_t), ACL_MEMCPY_DEVICE_TO_HOST);

cann_quantize_q4_0(1, nullptr, (uint8_t*)src->data, (uint8_t*)dst->data, size);
}
}
2 changes: 2 additions & 0 deletions ggml-cann/aclnn_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* d

void ggml_cann_alibi(ggml_backend_cann_context& ctx, ggml_tensor* dst);

void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst);

template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
aclTensor*, uint64_t*, aclOpExecutor**),
aclnnStatus execute(void*, uint64_t, aclOpExecutor*, aclrtStream)>
Expand Down
55 changes: 41 additions & 14 deletions ggml-cann/kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,19 +1,46 @@
if(NOT SOC_VERSION)
set(SOC_VERSION "ascend910b3")
if (NOT SOC_TYPE)
set (SOC_TYPE "Ascend910B3")
endif()
set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim/cpu")

if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
file(GLOB SRC_FILES
dequantize_q4_0.cpp
quantize_q4_0.cpp
)

string(TOLOWER "${CMAKE_BUILD_TYPE}" lowercase_CMAKE_BUILD_TYPE)
if(${lowercase_CMAKE_BUILD_TYPE} STREQUAL "debug")
if (NOT DEFINED ENV{CMAKE_PREFIX_PATH})
set(CMAKE_PREFIX_PATH ${CANN_INSTALL_DIR}/tools/tikicpulib/lib/cmake)
endif()

find_package(tikicpulib REQUIRED)
add_library(cann_kernels ${SRC_FILES} ascendc_kernels.cpp)
target_link_libraries(cann_kernels PRIVATE
ascendcl
tikicpulib::ascend910B1
)

target_compile_features(cann_kernels PRIVATE cxx_std_17)

else()
message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
endif()
string(TOLOWER ${SOC_TYPE} SOC_VERSION)
set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")

include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
else()
message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
endif()
include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)

ascendc_library(ascendc_kernels STATIC
threshold_opencv_kernel.cpp
)
ascendc_library(ascendc_kernels STATIC
${SRC_FILES}
)
#ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)

add_library(cann_kernels STATIC ascendc_kernels.cpp)
target_link_libraries(cann_kernels PUBLIC ascendc_kernels)
endif()
56 changes: 56 additions & 0 deletions ggml-cann/kernels/ascendc_kernels.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#include "ascendc_kernels.h"

#ifdef __CCE_KT_TEST__
#include "tikicpulib.h"
#else
#include "aclrtlaunch_ascendc_dequantize_q4_0.h"
#include "aclrtlaunch_ascendc_quantize_q4_0.h"
#endif


#ifdef __CCE_KT_TEST__
#include <acl/acl.h>

uint8_t* to_gm(uint8_t* ptr, size_t size) {
uint8_t* gm = (uint8_t*)AscendC::GmAlloc(size);
aclrtMemcpy(gm, size, ptr, size, ACL_MEMCPY_DEVICE_TO_HOST);
return gm;
}

void free_gm(uint8_t* ptr) {
aclrtFree(ptr);
}

extern "C" __global__ __aicore__ void ascendc_dequantize_q4_0(GM_ADDR x, GM_ADDR y, GM_ADDR size);
extern "C" __global__ __aicore__ void ascendc_quantize_q4_0(GM_ADDR x, GM_ADDR y, GM_ADDR size);
#endif

void cann_dequantize_q4_0(uint32_t block_dim, void* stream, uint8_t* x, uint8_t* y, uint8_t* size) {
#ifdef __CCE_KT_TEST__
uint8_t* size_host = to_gm(size, sizeof(size_t));
uint8_t* x_host = to_gm(x, *((size_t*)size_host));
uint8_t* y_host = to_gm(y, *((size_t*)size_host));
AscendC::SetKernelMode(KernelMode::AIV_MODE);
ICPU_RUN_KF(ascendc_dequantize_q4_0, 1, x_host, y_host, size_host);
free_gm(size_host);
free_gm(x_host);
free_gm(y_host);
#else
aclrtlaunch_ascendc_dequantize_q4_0(block_dim, stream, x, y, size);
#endif
}

void cann_quantize_q4_0(uint32_t block_dim, void* stream, uint8_t* x, uint8_t* y, uint8_t* size) {
#ifdef __CCE_KT_TEST__
uint8_t* size_host = to_gm(size, sizeof(size_t));
uint8_t* x_host = to_gm(x, *((size_t*)size_host));
uint8_t* y_host = to_gm(y, *((size_t*)size_host));
AscendC::SetKernelMode(KernelMode::AIV_MODE);
ICPU_RUN_KF(ascendc_quantize_q4_0, 1, x_host, y_host, size_host);
free_gm(size_host);
free_gm(x_host);
free_gm(y_host);
#else
aclrtlaunch_ascendc_quantize_q4_0(block_dim, stream, x, y, size);
#endif
}
6 changes: 6 additions & 0 deletions ggml-cann/kernels/ascendc_kernels.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
#ifndef ASCENDC_KERNELS_H
#define ASCENDC_KERNELS_H


#include <stdint.h>

void cann_dequantize_q4_0(uint32_t block_dim, void* stream, uint8_t* x, uint8_t* y, uint8_t* size);
void cann_quantize_q4_0(uint32_t block_dim, void* stream, uint8_t* x, uint8_t* y, uint8_t* size);

#endif //ASCENDC_KERNELS_H
Loading

0 comments on commit 5b01aa6

Please sign in to comment.