From 2deb90007b4ebdf807e5826b5cca57e52b18602b Mon Sep 17 00:00:00 2001 From: huafengchun Date: Sun, 7 Apr 2024 06:17:43 +0000 Subject: [PATCH] rename bcast to acl_tensor --- ggml-cann.cpp | 19 +++++------ ggml-cann/acl_ops.cpp | 22 +++++------- ggml-cann/acl_ops.h | 9 +++-- ggml-cann/{bcast.cpp => acl_tensor.cpp} | 30 ++++++++--------- ggml-cann/{bcast.h => acl_tensor.h} | 13 ++++--- ggml-cann/aclnn_ops.cpp | 45 +++++++++++++------------ ggml-cann/aclnn_ops.h | 2 +- ggml-cann/common.h | 8 ++--- 8 files changed, 69 insertions(+), 79 deletions(-) rename ggml-cann/{bcast.cpp => acl_tensor.cpp} (85%) rename ggml-cann/{bcast.h => acl_tensor.h} (88%) diff --git a/ggml-cann.cpp b/ggml-cann.cpp index 3d618181dfc02..7b95d677fe580 100644 --- a/ggml-cann.cpp +++ b/ggml-cann.cpp @@ -6,17 +6,9 @@ #include #include "ggml-backend-impl.h" +#include "ggml-cann/acl_ops.h" #include "ggml-cann/aclnn_ops.h" #include "ggml-cann/common.h" -#include "ggml-cann/acl_ops.h" - -struct AclLifeCycle { - AclLifeCycle() { ACL_CHECK(aclInit(nullptr)); } - - ~AclLifeCycle() { ACL_CHECK(aclFinalize()); } -}; - -AclLifeCycle acl_life_cycle; [[noreturn]] void ggml_cann_error(const char* stmt, const char* func, const char* file, int line, const char* msg) { @@ -477,9 +469,15 @@ GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) { ggml_backend_cann_context* cann_ctx = (ggml_backend_cann_context*)backend->context; ACL_CHECK(aclrtSynchronizeDevice()); + cann_ctx->free_buffers(); ACL_CHECK(aclrtResetDevice(cann_ctx->device)); delete cann_ctx; delete backend; + + // Finalize when last device freed. + if (cann_ctx->device == ggml_backend_cann_get_device_count() - 1) { + ACL_CHECK(aclFinalize()); + } } GGML_CALL static ggml_backend_buffer_type_t @@ -678,7 +676,7 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend, case GGML_OP_DIAG_MASK_INF: return false; case GGML_OP_SOFT_MAX: - return true; + return true; case GGML_OP_ROPE: case GGML_OP_ALIBI: case GGML_OP_IM2COL: @@ -844,6 +842,7 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cann_init(const char* params, extern "C" GGML_CALL int ggml_backend_cann_reg_devices(); GGML_CALL int ggml_backend_cann_reg_devices() { + ACL_CHECK(aclInit(nullptr)); uint32_t device_count = ggml_backend_cann_get_device_count(); // initialization for (uint32_t i = 0; i < device_count; i++) { diff --git a/ggml-cann/acl_ops.cpp b/ggml-cann/acl_ops.cpp index 8fe4dc6a05d0f..eb78057d79670 100644 --- a/ggml-cann/acl_ops.cpp +++ b/ggml-cann/acl_ops.cpp @@ -15,10 +15,6 @@ OpCaller::~OpCaller() { for (aclDataBuffer* buffer : output_buffers) { aclDestroyDataBuffer(buffer); } - // TODO: may free before use. - for (void* ptr : ptrs) { - aclrtFree(ptr); - } aclopDestroyAttr(attrs); } @@ -100,20 +96,21 @@ void ggml_cann_cont(ggml_backend_cann_context& ctx, ggml_tensor* dst) { OpCaller op; op.name("ViewCopy") .input_no_contiguous(dst, "dst") - .input(dst->ne, ACL_INT64, 1, size_stride_dim, "dst_size", ctx.stream()) - .input(dst_stride, ACL_INT64, 1, size_stride_dim, "dst_stride", + .input(ctx, dst->ne, ACL_INT64, 1, size_stride_dim, "dst_size", + ctx.stream()) + .input(ctx, dst_stride, ACL_INT64, 1, size_stride_dim, "dst_stride", ctx.stream()) - .input(storage_offset, ACL_INT64, 1, storage_offset_dim, + .input(ctx, storage_offset, ACL_INT64, 1, storage_offset_dim, "dst_storage_offset", ctx.stream()) .input_no_contiguous(src, "src") - .input(src->ne, ACL_INT64, 1, size_stride_dim, "src_size", ctx.stream()) - .input(src_stride, ACL_INT64, 1, size_stride_dim, "src_stride", + .input(ctx, src->ne, ACL_INT64, 1, size_stride_dim, "src_size", + ctx.stream()) + .input(ctx, src_stride, ACL_INT64, 1, size_stride_dim, "src_stride", ctx.stream()) - .input(storage_offset, ACL_INT64, 1, storage_offset_dim, + .input(ctx, storage_offset, ACL_INT64, 1, storage_offset_dim, "src_storage_offset", ctx.stream()) .output(dst, "dst") .run(ctx.stream()); - //aclrtSynchronizeStream(ctx.stream()); } void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) { @@ -125,8 +122,7 @@ void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) { OpCaller op; op.name("Pad") .input(src, "x") - .input(paddings, ACL_INT64, 2, dim, "paddings", ctx.stream()) + .input(ctx, paddings, ACL_INT64, 2, dim, "paddings", ctx.stream()) .output(dst, "y") .run(ctx.stream()); - //aclrtSynchronizeStream(ctx.stream()); } diff --git a/ggml-cann/acl_ops.h b/ggml-cann/acl_ops.h index 654da7c16568a..77c64df4fae75 100644 --- a/ggml-cann/acl_ops.h +++ b/ggml-cann/acl_ops.h @@ -7,7 +7,7 @@ #include #include -#include "bcast.h" +#include "acl_tensor.h" #include "common.h" struct OpCaller { @@ -38,17 +38,16 @@ struct OpCaller { OpCaller& attr(float value, const char* name); template - OpCaller& input(T* values, aclDataType dtype, size_t dims, int64_t* dim, + OpCaller& input(ggml_backend_cann_context& ctx, T* values, + aclDataType dtype, size_t dims, int64_t* dim, const char* name, aclrtStream stream = nullptr) { - void* device_ptr = nullptr; size_t n_elem = 1; for (size_t i = 0; i < dims; i++) { n_elem *= dim[i]; } size_t n_bytes = n_elem * sizeof(T); - ACL_CHECK(aclrtMalloc(&device_ptr, n_bytes, ACL_MEM_MALLOC_HUGE_FIRST)); - ptrs.push_back(device_ptr); + void* device_ptr = ctx.alloc_buffer(n_bytes); if (stream == nullptr) { ACL_CHECK(aclrtMemcpy(device_ptr, n_bytes, values, n_bytes, ACL_MEMCPY_HOST_TO_DEVICE)); diff --git a/ggml-cann/bcast.cpp b/ggml-cann/acl_tensor.cpp similarity index 85% rename from ggml-cann/bcast.cpp rename to ggml-cann/acl_tensor.cpp index 8b8030f775d0c..612dda470d993 100644 --- a/ggml-cann/bcast.cpp +++ b/ggml-cann/acl_tensor.cpp @@ -1,4 +1,5 @@ -#include "bcast.h" +#include "acl_tensor.h" + #include #include @@ -32,7 +33,8 @@ aclDataType type_mapping(ggml_type type) { * changed to satisfy the broadcast. @sa: get_bcast_shape. */ aclTensor* create_acl_tensor(const ggml_tensor* tensor, int64_t* bcast_ne, - size_t* bcast_nb, int64_t bcast_dims, aclFormat format) { + size_t* bcast_nb, int64_t bcast_dims, + aclFormat format) { size_t size = ggml_nbytes(tensor); void* deviceAddr = nullptr; @@ -74,9 +76,9 @@ aclTensor* create_acl_tensor(const ggml_tensor* tensor, int64_t* bcast_ne, return acl_tensor; } -aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype, size_t type_size, int64_t* ne, - size_t* nb, int64_t dims, aclFormat format) { - +aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype, + size_t type_size, int64_t* ne, size_t* nb, + int64_t dims, aclFormat format) { int64_t tmp_ne[GGML_MAX_DIMS * 2]; int64_t tmp_stride[GGML_MAX_DIMS * 2]; @@ -88,9 +90,8 @@ aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype, size_t type_size std::reverse(tmp_ne, tmp_ne + dims); std::reverse(tmp_stride, tmp_stride + dims); - aclTensor* acl_tensor = - aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, 0, - format, tmp_ne, dims, data_ptr); + aclTensor* acl_tensor = aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, 0, + format, tmp_ne, dims, data_ptr); return acl_tensor; } @@ -132,8 +133,7 @@ aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype, size_t type_size */ int64_t get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1, int64_t* bcast_ne_src0, int64_t* bcast_ne_src1, - size_t* bcast_nb_src0, - size_t* bcast_nb_src1) { + size_t* bcast_nb_src0, size_t* bcast_nb_src1) { GGML_ASSERT(ggml_can_repeat(src1, src0)); int bcast_dim_cnt = 0; for (int i = 0; i < GGML_MAX_DIMS; i++) { @@ -147,12 +147,10 @@ int64_t get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1, // Need to add an extra dim. bcast_ne_src0[bcast_dim_cnt] = nr; bcast_ne_src1[bcast_dim_cnt] = 1; - bcast_nb_src0[bcast_dim_cnt] = - bcast_nb_src0[bcast_dim_cnt - 1] * - bcast_ne_src0[bcast_dim_cnt - 1]; - bcast_nb_src1[bcast_dim_cnt] = - bcast_nb_src1[bcast_dim_cnt - 1] * - bcast_ne_src1[bcast_dim_cnt - 1]; + bcast_nb_src0[bcast_dim_cnt] = bcast_nb_src0[bcast_dim_cnt - 1] * + bcast_ne_src0[bcast_dim_cnt - 1]; + bcast_nb_src1[bcast_dim_cnt] = bcast_nb_src1[bcast_dim_cnt - 1] * + bcast_ne_src1[bcast_dim_cnt - 1]; bcast_dim_cnt++; } } diff --git a/ggml-cann/bcast.h b/ggml-cann/acl_tensor.h similarity index 88% rename from ggml-cann/bcast.h rename to ggml-cann/acl_tensor.h index db72821c12cbd..e6ec4ff0259b5 100644 --- a/ggml-cann/bcast.h +++ b/ggml-cann/acl_tensor.h @@ -1,5 +1,5 @@ -#ifndef CANN_BCAST_H -#define CANN_BCAST_H +#ifndef CANN_ACL_TENSOR_H +#define CANN_ACL_TENSOR_H #include @@ -10,8 +10,7 @@ aclDataType type_mapping(ggml_type type); aclTensor* create_acl_tensor(const ggml_tensor* tensor, int64_t* bcast_ne = nullptr, - size_t* bcast_nb = nullptr, - int64_t bcast_dims = 0, + size_t* bcast_nb = nullptr, int64_t bcast_dims = 0, aclFormat format = ACL_FORMAT_ND); aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype, @@ -28,12 +27,12 @@ int64_t get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1, #define BCAST_SHAPE(src0, src1) \ int64_t bcast_ne_##src0[GGML_MAX_DIMS * 2]; \ int64_t bcast_ne_##src1[GGML_MAX_DIMS * 2]; \ - size_t bcast_nb_##src0[GGML_MAX_DIMS * 2]; \ - size_t bcast_nb_##src1[GGML_MAX_DIMS * 2]; \ + size_t bcast_nb_##src0[GGML_MAX_DIMS * 2]; \ + size_t bcast_nb_##src1[GGML_MAX_DIMS * 2]; \ int64_t bcast_dims = \ get_bcast_shape(src0, src1, bcast_ne_##src0, bcast_ne_##src1, \ bcast_nb_##src0, bcast_nb_##src1); #define BCAST_PARAM(src) bcast_ne_##src, bcast_nb_##src, bcast_dims -#endif // CANN_BCAST_H \ No newline at end of file +#endif // CANN_ACL_TENSOR_H \ No newline at end of file diff --git a/ggml-cann/aclnn_ops.cpp b/ggml-cann/aclnn_ops.cpp index 04dfe51eeda5f..39f421cafcc6b 100644 --- a/ggml-cann/aclnn_ops.cpp +++ b/ggml-cann/aclnn_ops.cpp @@ -1,10 +1,10 @@ #include "aclnn_ops.h" -#include #include #include -#include +#include #include +#include #include #include @@ -25,13 +25,14 @@ void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { int64_t repeatsArray[] = {dst->ne[3] / src->ne[3], dst->ne[2] / src->ne[2], dst->ne[1] / src->ne[1], dst->ne[0] / src->ne[0]}; - aclIntArray *repeats = aclCreateIntArray(repeatsArray, GGML_MAX_DIMS); + aclIntArray* repeats = aclCreateIntArray(repeatsArray, GGML_MAX_DIMS); uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; - ACL_CHECK(aclnnRepeatGetWorkspaceSize(acl_src, repeats, acl_dst, &workspaceSize, &executor)); + ACL_CHECK(aclnnRepeatGetWorkspaceSize(acl_src, repeats, acl_dst, + &workspaceSize, &executor)); if (workspaceSize > 0) { workspaceAddr = ctx.alloc_buffer(workspaceSize); @@ -42,7 +43,6 @@ void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclDestroyIntArray(repeats)); ACL_CHECK(aclDestroyTensor(acl_src)); ACL_CHECK(aclDestroyTensor(acl_dst)); - } void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst) { @@ -140,7 +140,7 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclnnCatGetWorkspaceSize(tensorList, 1, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { - workspaceAddr = ctx.alloc_buffer(workspaceSize); + workspaceAddr = ctx.alloc_buffer(workspaceSize); } aclrtStream main_stream = ctx.stream(); @@ -262,7 +262,8 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) { aclTensor* acl_src = create_acl_tensor(src); aclTensor* acl_dst = create_acl_tensor(dst); - void* buffer = ctx.alloc_buffer(ggml_nbytes(dst) / ggml_type_size(dst->type) * sizeof(int64_t)); + void* buffer = ctx.alloc_buffer( + ggml_nbytes(dst) / ggml_type_size(dst->type) * sizeof(int64_t)); aclTensor* tmp_tensor = create_acl_tensor(buffer, ACL_INT64, ggml_type_size(dst->type), dst->ne, dst->nb, GGML_MAX_DIMS); @@ -311,8 +312,8 @@ void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { std::vector normData = {dst->ne[0]}; aclIntArray* norm = aclCreateIntArray(normData.data(), normData.size()); - ACL_CHECK(aclnnLayerNormGetWorkspaceSize(acl_src, norm, nullptr, nullptr, eps, - acl_dst, nullptr, nullptr, + ACL_CHECK(aclnnLayerNormGetWorkspaceSize(acl_src, norm, nullptr, nullptr, + eps, acl_dst, nullptr, nullptr, &workspaceSize, &executor)); if (workspaceSize > 0) { @@ -381,24 +382,28 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) { aclTensor* acl_src0 = create_acl_tensor(src0); aclTensor* acl_dst = create_acl_tensor(dst); - float scale = 1.0f; + float scale = 1.0f; float max_bias = 0.0f; - memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); - memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float)); + memcpy(&scale, (float*)dst->op_params + 0, sizeof(float)); + memcpy(&max_bias, (float*)dst->op_params + 1, sizeof(float)); aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT); - aclScalar* acl_max_bias = aclCreateScalar(&max_bias, aclDataType::ACL_FLOAT); + aclScalar* acl_max_bias = + aclCreateScalar(&max_bias, aclDataType::ACL_FLOAT); size_t n_bytes = ggml_nbytes(src0); - void *buffer = ctx.alloc_buffer(n_bytes); - aclTensor* temp_tensor = create_acl_tensor(buffer, ACL_FLOAT, ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS); + void* buffer = ctx.alloc_buffer(n_bytes); + aclTensor* temp_tensor = + create_acl_tensor(buffer, ACL_FLOAT, ggml_type_size(src0->type), + src0->ne, src0->nb, GGML_MAX_DIMS); uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; - aclnnMulsGetWorkspaceSize(acl_src0, acl_scale, temp_tensor, &workspaceSize, &executor); + aclnnMulsGetWorkspaceSize(acl_src0, acl_scale, temp_tensor, &workspaceSize, + &executor); if (workspaceSize > 0) { workspaceAddr = ctx.alloc_buffer(workspaceSize); } @@ -406,8 +411,8 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) { aclrtStream stream = ctx.stream(); aclnnMuls(workspaceAddr, workspaceSize, executor, stream); - ACL_CHECK(aclnnSoftmaxGetWorkspaceSize( - temp_tensor, 3, acl_dst, &workspaceSize, &executor)); + ACL_CHECK(aclnnSoftmaxGetWorkspaceSize(temp_tensor, 3, acl_dst, + &workspaceSize, &executor)); if (workspaceSize > 0) { workspaceAddr = ctx.alloc_buffer(workspaceSize); @@ -419,6 +424,4 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclDestroyTensor(acl_dst)); } -void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - -} \ No newline at end of file +void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) {} \ No newline at end of file diff --git a/ggml-cann/aclnn_ops.h b/ggml-cann/aclnn_ops.h index 6aae070ce480f..ac59a9f2f63a8 100644 --- a/ggml-cann/aclnn_ops.h +++ b/ggml-cann/aclnn_ops.h @@ -16,7 +16,7 @@ #include #include -#include "bcast.h" +#include "acl_tensor.h" #include "common.h" void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst); diff --git a/ggml-cann/common.h b/ggml-cann/common.h index 744204be918e5..f115b528b3315 100644 --- a/ggml-cann/common.h +++ b/ggml-cann/common.h @@ -81,9 +81,7 @@ struct ggml_backend_cann_context { return buffer; } - void* alloc_buffer(size_t size) { - return alloc_buffer(size, 0); - } + void* alloc_buffer(size_t size) { return alloc_buffer(size, 0); } void free_buffers() { for (int i = 0; i < GGML_CANN_MAX_STREAMS; i++) { @@ -107,6 +105,4 @@ struct ggml_backend_cann_context { aclrtStream stream() { return stream(0); } }; - - -#endif //CANN_COMMON_H \ No newline at end of file +#endif // CANN_COMMON_H \ No newline at end of file