Commit c4740d6: fix norm

hipudding committed Apr 3, 2024
1 parent c330b78
Showing 3 changed files with 41 additions and 84 deletions.
ggml-cann/aclnn_ops.cpp (13 additions, 58 deletions)
@@ -1,6 +1,6 @@
 #include "aclnn_ops.h"
 
-#include <aclnnop/aclnn_batch_norm.h>
+#include <aclnnop/aclnn_layer_norm.h>
 #include <aclnnop/aclnn_cast.h>
 
 #include <cmath>
@@ -368,77 +368,32 @@ void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {

     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
-    float *weight_host, *bias_host;
-    int64_t channel = dst->ne[2];
-
-    weight_host = new float[channel];
-    bias_host = new float[channel];
-
-    for (int i = 0; i < channel; i++) {
-        weight_host[i] = 1;
-        bias_host[i] = 0;
-    }
-
-    aclrtStream stream = ctx.stream();
-
-    // Input tensors.
-    void *buffer, *acl_weight, *acl_bias, *acl_mean, *acl_invstd;
-    ACL_CHECK(aclrtMalloc(&buffer, 4 * channel * sizeof(float),
-                          ACL_MEM_MALLOC_HUGE_FIRST));
-    acl_weight = buffer;
-    acl_bias = acl_weight + sizeof(float) * channel;
-    acl_mean = acl_bias + sizeof(float) * channel;
-    acl_invstd = acl_mean + sizeof(float) * channel;
-
-    // Set input params.
-    ACL_CHECK(aclrtMemcpyAsync(acl_weight, channel, weight_host, channel,
-                               ACL_MEMCPY_HOST_TO_DEVICE, stream));
-    ACL_CHECK(aclrtMemcpyAsync(acl_bias, channel, bias_host, channel,
-                               ACL_MEMCPY_HOST_TO_DEVICE, stream));
-    delete[] weight_host;
-    delete[] bias_host;
-
-    // Create input tensors.
-    int64_t input_tensor_shape[] = {channel};
-    size_t input_tensor_stride[] = {1};
-    aclTensor* weight =
-        create_acl_tensor(acl_weight, ACL_FLOAT, sizeof(float),
-                          input_tensor_shape, input_tensor_stride, 1);
-    aclTensor* bias =
-        create_acl_tensor(acl_bias, ACL_FLOAT, sizeof(float),
-                          input_tensor_shape, input_tensor_stride, 1);
-    aclTensor* mean =
-        create_acl_tensor(acl_mean, ACL_FLOAT, sizeof(float),
-                          input_tensor_shape, input_tensor_stride, 1);
-    aclTensor* invstd =
-        create_acl_tensor(acl_invstd, ACL_FLOAT, sizeof(float),
-                          input_tensor_shape, input_tensor_stride, 1);
-
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
     void* workspaceAddr = nullptr;
 
-    ACL_CHECK(aclnnBatchNormGetWorkspaceSize(
-        acl_src, weight, bias, nullptr, nullptr, false, 0, eps, acl_dst, mean,
-        invstd, &workspaceSize, &executor));
+    std::vector<int64_t> normData = {dst->ne[0]};
+    aclIntArray* norm = aclCreateIntArray(normData.data(), normData.size());
+    ACL_CHECK(aclnnLayerNormGetWorkspaceSize(acl_src, norm, nullptr, nullptr, eps,
+                                             acl_dst, nullptr, nullptr,
+                                             &workspaceSize, &executor));
 
     if (workspaceSize > 0) {
         ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
                               ACL_MEM_MALLOC_HUGE_FIRST));
     }
 
-    ACL_CHECK(aclnnBatchNorm(workspaceAddr, workspaceSize, executor, stream));
+    aclrtStream stream = ctx.stream();
 
-    ACL_CHECK(aclDestroyTensor(weight));
-    ACL_CHECK(aclDestroyTensor(bias));
-    ACL_CHECK(aclDestroyTensor(mean));
-    ACL_CHECK(aclDestroyTensor(invstd));
+    ACL_CHECK(aclnnLayerNorm(workspaceAddr, workspaceSize, executor, stream));
 
     // TODO: optimize argsort kernel or free tmp buffers after stream sync.
     ACL_CHECK(aclrtSynchronizeStream(stream));
-    ACL_CHECK(aclrtFree(buffer));
+    ACL_CHECK(aclDestroyIntArray(norm));
     ACL_CHECK(aclDestroyTensor(acl_src));
     ACL_CHECK(aclDestroyTensor(acl_dst));
 
     if (workspaceSize > 0) {
         ACL_CHECK(aclrtFree(workspaceAddr));
     }
 }
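For context on the fix itself: ggml's GGML_OP_NORM normalizes each row of length ne[0] by that row's own mean and variance. aclnnBatchNorm normalizes across the channel dimension (ne[2] here, hence the per-channel weight/bias buffers in the deleted code), which is the wrong axis for this op; aclnnLayerNorm with normalizedShape = {dst->ne[0]} matches the intended semantics, and passing nullptr for weight and bias skips the affine step. A minimal CPU sketch of the expected result, assuming contiguous float32 data; norm_ref is a hypothetical reference helper, not part of this commit:

#include <cmath>
#include <cstdint>

// Reference for GGML_OP_NORM semantics: each ne[0]-length row is normalized
// by its own mean and variance, with no affine scale/shift (weight and bias
// are nullptr in the aclnnLayerNorm call above).
// rows = ne[1] * ne[2] * ne[3], cols = ne[0]; assumes contiguous f32 data.
static void norm_ref(const float* src, float* dst, int64_t rows, int64_t cols,
                     float eps) {
    for (int64_t r = 0; r < rows; r++) {
        const float* x = src + r * cols;
        float* y = dst + r * cols;
        // Mean over the innermost dimension.
        float mean = 0.0f;
        for (int64_t c = 0; c < cols; c++) {
            mean += x[c];
        }
        mean /= cols;
        // Variance, then normalize.
        float var = 0.0f;
        for (int64_t c = 0; c < cols; c++) {
            float d = x[c] - mean;
            var += d * d;
        }
        var /= cols;
        const float inv_std = 1.0f / sqrtf(var + eps);
        for (int64_t c = 0; c < cols; c++) {
            y[c] = (x[c] - mean) * inv_std;
        }
    }
}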

ggml-cann/bcast.cpp (16 additions, 16 deletions)
@@ -27,12 +27,12 @@ aclDataType type_mapping(ggml_type type) {
  * Transform ggml_tensor to acl_tensor. Note that ggml_tensor dimension order
  * is reversed compared to acl_tensor.
  *
- * If bcast_ne and bcast_stride is nullptr, use ggml_tensor's ne and nb.
- * otherwise, use bcast_ne bcast_stride, which means tensor dims should be
+ * If bcast_ne and bcast_nb is nullptr, use ggml_tensor's ne and nb.
+ * otherwise, use bcast_ne bcast_nb, which means tensor dims should be
  * changed to satisfy the broadcast. @sa: get_bcast_shape.
  */
 aclTensor* create_acl_tensor(const ggml_tensor* tensor, int64_t* bcast_ne,
-                             int64_t* bcast_stride, int64_t bcast_dims) {
+                             size_t* bcast_nb, int64_t bcast_dims, aclFormat format) {
     size_t size = ggml_nbytes(tensor);
     void* deviceAddr = nullptr;
 
@@ -53,13 +53,13 @@ aclTensor* create_acl_tensor(const ggml_tensor* tensor, int64_t* bcast_ne,
         for (int i = 0; i < GGML_MAX_DIMS; i++) {
             acl_ne[i] = tensor->ne[i];
             // The step size of acl is in elements.
-            acl_stride[i] = tensor->nb[i] / tensor->nb[0];
+            acl_stride[i] = tensor->nb[i] / ggml_type_size(tensor->type);
         }
     } else {
         // With bcast
         for (int i = 0; i < bcast_dims; i++) {
             acl_ne[i] = bcast_ne[i];
-            acl_stride[i] = bcast_stride[i] / tensor->nb[0];
+            acl_stride[i] = bcast_nb[i] / ggml_type_size(tensor->type);
         }
     }
 
@@ -69,13 +69,13 @@ aclTensor* create_acl_tensor(const ggml_tensor* tensor, int64_t* bcast_ne,
 
     aclTensor* acl_tensor =
         aclCreateTensor(acl_ne, dims, type_mapping(tensor->type), acl_stride, 0,
-                        aclFormat::ACL_FORMAT_ND, acl_ne, dims, deviceAddr);
+                        format, acl_ne, dims, deviceAddr);
 
     return acl_tensor;
 }
 
 aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype, size_t type_size, int64_t* ne,
-                             size_t* nb, int64_t dims) {
+                             size_t* nb, int64_t dims, aclFormat format) {
 
     int64_t tmp_ne[GGML_MAX_DIMS * 2];
     int64_t tmp_stride[GGML_MAX_DIMS * 2];
@@ -90,7 +90,7 @@ aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype, size_t type_size
 
     aclTensor* acl_tensor =
         aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, 0,
-                        aclFormat::ACL_FORMAT_ND, tmp_ne, dims, data_ptr);
+                        format, tmp_ne, dims, data_ptr);
 
     return acl_tensor;
 }
@@ -132,26 +132,26 @@ aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype, size_t type_size
  */
 int64_t get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1,
                         int64_t* bcast_ne_src0, int64_t* bcast_ne_src1,
-                        int64_t* bcast_stride_src0,
-                        int64_t* bcast_stride_src1) {
+                        size_t* bcast_nb_src0,
+                        size_t* bcast_nb_src1) {
     GGML_ASSERT(ggml_can_repeat(src1, src0));
     int bcast_dim_cnt = 0;
     for (int i = 0; i < GGML_MAX_DIMS; i++) {
         int64_t nr = src0->ne[i] / src1->ne[i];
         bcast_ne_src0[bcast_dim_cnt] = src0->ne[i] / nr;
         bcast_ne_src1[bcast_dim_cnt] = src1->ne[i];
-        bcast_stride_src0[bcast_dim_cnt] = src0->nb[i];
-        bcast_stride_src1[bcast_dim_cnt] = src1->nb[i];
+        bcast_nb_src0[bcast_dim_cnt] = src0->nb[i];
+        bcast_nb_src1[bcast_dim_cnt] = src1->nb[i];
         bcast_dim_cnt++;
         if (nr != 1) {
             // Need to add an extra dim.
             bcast_ne_src0[bcast_dim_cnt] = nr;
             bcast_ne_src1[bcast_dim_cnt] = 1;
-            bcast_stride_src0[bcast_dim_cnt] =
-                bcast_stride_src0[bcast_dim_cnt - 1] *
-                bcast_ne_src0[bcast_dim_cnt - 1];
-            bcast_stride_src1[bcast_dim_cnt] =
-                bcast_stride_src1[bcast_dim_cnt - 1] *
-                bcast_ne_src1[bcast_dim_cnt - 1];
+            bcast_nb_src0[bcast_dim_cnt] =
+                bcast_nb_src0[bcast_dim_cnt - 1] *
+                bcast_ne_src0[bcast_dim_cnt - 1];
+            bcast_nb_src1[bcast_dim_cnt] =
+                bcast_nb_src1[bcast_dim_cnt - 1] *
+                bcast_ne_src1[bcast_dim_cnt - 1];
            bcast_dim_cnt++;
         }
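Two details in bcast.cpp are worth spelling out. First, ACL strides are counted in elements while ggml's nb[] is in bytes, so the stride is now derived by dividing by ggml_type_size(tensor->type) rather than by nb[0]; the two only agree when nb[0] is the element size, which does not hold for non-contiguous views such as transposed tensors. Second, when src0 repeats src1 along a dimension, get_bcast_shape splits that dimension so both tensors expose the same rank to the aclnn kernel. A hand-traced sketch with assumed float32 shapes (src0 and src1 here are hypothetical tensors, not from this commit):

// Assumed inputs, 4-byte f32 elements, ggml's reversed dim order, nb in bytes:
//   src0: ne = {4, 3, 2, 1}, nb = {4, 16, 48, 96}
//   src1: ne = {4, 3, 1, 1}, nb = {4, 16, 48, 48}   (repeated along dim 2)
int64_t bcast_ne_src0[GGML_MAX_DIMS * 2], bcast_ne_src1[GGML_MAX_DIMS * 2];
size_t  bcast_nb_src0[GGML_MAX_DIMS * 2], bcast_nb_src1[GGML_MAX_DIMS * 2];
int64_t dims = get_bcast_shape(src0, src1, bcast_ne_src0, bcast_ne_src1,
                               bcast_nb_src0, bcast_nb_src1);
// dims == 5: dim 2 is split into a (1, 2) pair for src0 and (1, 1) for src1,
// so both sides have equal rank and the kernel broadcasts over the new axis:
//   bcast_ne_src0 = {4, 3, 1, 2, 1},  bcast_nb_src0 = {4, 16, 48, 48, 96}
//   bcast_ne_src1 = {4, 3, 1, 1, 1},  bcast_nb_src1 = {4, 16, 48, 48, 48}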
ggml-cann/bcast.h (12 additions, 10 deletions)
@@ -10,28 +10,30 @@ aclDataType type_mapping(ggml_type type);
 
 aclTensor* create_acl_tensor(const ggml_tensor* tensor,
                              int64_t* bcast_ne = nullptr,
-                             int64_t* bcast_stride = nullptr,
-                             int64_t bcast_dims = 0);
+                             size_t* bcast_nb = nullptr,
+                             int64_t bcast_dims = 0,
+                             aclFormat format = ACL_FORMAT_ND);
 
-aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype, size_t type_size, int64_t* ne,
-                             size_t* nb, int64_t dims);
+aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype,
+                             size_t type_size, int64_t* ne, size_t* nb,
+                             int64_t dims, aclFormat format = ACL_FORMAT_ND);
 
 bool need_bcast(const ggml_tensor* t0, const ggml_tensor* t1);
 
 int64_t get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1,
                         int64_t* bcast_ne_src0, int64_t* bcast_ne_src1,
-                        int64_t* bcast_stride_src0, int64_t* bcast_stride_src1);
+                        size_t* bcast_nb_src0, size_t* bcast_nb_src1);
 
 // Bcast macro to avoid duplicate code.
 #define BCAST_SHAPE(src0, src1)                                               \
     int64_t bcast_ne_##src0[GGML_MAX_DIMS * 2];                               \
     int64_t bcast_ne_##src1[GGML_MAX_DIMS * 2];                               \
-    int64_t bcast_stride_##src0[GGML_MAX_DIMS * 2];                           \
-    int64_t bcast_stride_##src1[GGML_MAX_DIMS * 2];                           \
+    size_t bcast_nb_##src0[GGML_MAX_DIMS * 2];                                \
+    size_t bcast_nb_##src1[GGML_MAX_DIMS * 2];                                \
     int64_t bcast_dims =                                                      \
         get_bcast_shape(src0, src1, bcast_ne_##src0, bcast_ne_##src1,         \
-                        bcast_stride_##src0, bcast_stride_##src1);
+                        bcast_nb_##src0, bcast_nb_##src1);
 
-#define BCAST_PARAM(src) bcast_ne_##src, bcast_stride_##src, bcast_dims
+#define BCAST_PARAM(src) bcast_ne_##src, bcast_nb_##src, bcast_dims
 
-#endif //CANN_BCAST_H
+#endif  // CANN_BCAST_H
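A minimal sketch of how these helpers are meant to compose inside an op implementation; example_binary_op and its control flow are assumptions modeled on the patterns visible in this diff, not code from the commit:

#include "bcast.h"

// Hypothetical element-wise binary op: dst = op(src0, src1), where src1 may
// need to be repeated to match src0 (the ggml_can_repeat precondition).
static void example_binary_op(ggml_tensor* dst) {
    ggml_tensor* src0 = dst->src[0];
    ggml_tensor* src1 = dst->src[1];

    aclTensor* acl_src0;
    aclTensor* acl_src1;
    if (need_bcast(src0, src1)) {
        // Declares bcast_ne_{src0,src1}, bcast_nb_{src0,src1} and bcast_dims
        // in the current scope; BCAST_PARAM then forwards them.
        BCAST_SHAPE(src0, src1)
        acl_src0 = create_acl_tensor(src0, BCAST_PARAM(src0));
        acl_src1 = create_acl_tensor(src1, BCAST_PARAM(src1));
    } else {
        acl_src0 = create_acl_tensor(src0);
        acl_src1 = create_acl_tensor(src1);
    }

    // ... launch the aclnn kernel with acl_src0/acl_src1, then release both
    // with aclDestroyTensor once the stream has been synchronized ...
}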
