Commit c4740d6: fix norm

hipudding committed Apr 3, 2024
1 parent c330b78
Showing 3 changed files with 41 additions and 84 deletions.
ggml-cann/aclnn_ops.cpp (13 additions, 58 deletions)
@@ -1,6 +1,6 @@
 #include "aclnn_ops.h"
 
-#include <aclnnop/aclnn_batch_norm.h>
+#include <aclnnop/aclnn_layer_norm.h>
 #include <aclnnop/aclnn_cast.h>
 
 #include <cmath>
@@ -368,77 +368,32 @@ void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {

     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
-    float *weight_host, *bias_host;
-    int64_t channel = dst->ne[2];
-
-    weight_host = new float[channel];
-    bias_host = new float[channel];
-
-    for (int i = 0; i < channel; i++) {
-        weight_host[i] = 1;
-        bias_host[i] = 0;
-    }
-
-    aclrtStream stream = ctx.stream();
-
-    // Input tensors.
-    void *buffer, *acl_weight, *acl_bias, *acl_mean, *acl_invstd;
-    ACL_CHECK(aclrtMalloc(&buffer, 4 * channel * sizeof(float),
-                          ACL_MEM_MALLOC_HUGE_FIRST));
-    acl_weight = buffer;
-    acl_bias = acl_weight + sizeof(float) * channel;
-    acl_mean = acl_bias + sizeof(float) * channel;
-    acl_invstd = acl_mean + sizeof(float) * channel;
-
-    // Set input params.
-    ACL_CHECK(aclrtMemcpyAsync(acl_weight, channel, weight_host, channel,
-                               ACL_MEMCPY_HOST_TO_DEVICE, stream));
-    ACL_CHECK(aclrtMemcpyAsync(acl_bias, channel, bias_host, channel,
-                               ACL_MEMCPY_HOST_TO_DEVICE, stream));
-    delete[] weight_host;
-    delete[] bias_host;
-
-    // Create input tensors.
-    int64_t input_tensor_shape[] = {channel};
-    size_t input_tensor_stride[] = {1};
-    aclTensor* weight =
-        create_acl_tensor(acl_weight, ACL_FLOAT, sizeof(float),
-                          input_tensor_shape, input_tensor_stride, 1);
-    aclTensor* bias =
-        create_acl_tensor(acl_bias, ACL_FLOAT, sizeof(float),
-                          input_tensor_shape, input_tensor_stride, 1);
-    aclTensor* mean =
-        create_acl_tensor(acl_mean, ACL_FLOAT, sizeof(float),
-                          input_tensor_shape, input_tensor_stride, 1);
-    aclTensor* invstd =
-        create_acl_tensor(acl_invstd, ACL_FLOAT, sizeof(float),
-                          input_tensor_shape, input_tensor_stride, 1);
-
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
     void* workspaceAddr = nullptr;
 
-    ACL_CHECK(aclnnBatchNormGetWorkspaceSize(
-        acl_src, weight, bias, nullptr, nullptr, false, 0, eps, acl_dst, mean,
-        invstd, &workspaceSize, &executor));
+    std::vector<int64_t> normData = {dst->ne[0]};
+    aclIntArray* norm = aclCreateIntArray(normData.data(), normData.size());
+    ACL_CHECK(aclnnLayerNormGetWorkspaceSize(acl_src, norm, nullptr, nullptr, eps,
+                                             acl_dst, nullptr, nullptr,
+                                             &workspaceSize, &executor));
 
     if (workspaceSize > 0) {
         ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
                               ACL_MEM_MALLOC_HUGE_FIRST));
     }
 
-    ACL_CHECK(aclnnBatchNorm(workspaceAddr, workspaceSize, executor, stream));
+    aclrtStream stream = ctx.stream();
 
-    ACL_CHECK(aclDestroyTensor(weight));
-    ACL_CHECK(aclDestroyTensor(bias));
-    ACL_CHECK(aclDestroyTensor(mean));
-    ACL_CHECK(aclDestroyTensor(invstd));
+    ACL_CHECK(aclnnLayerNorm(workspaceAddr, workspaceSize, executor, stream));
 
     // TODO: optimize argsort kernel or free tmp buffers after stream sync.
     ACL_CHECK(aclrtSynchronizeStream(stream));
-    ACL_CHECK(aclrtFree(buffer));
+    ACL_CHECK(aclDestroyIntArray(norm));
     ACL_CHECK(aclDestroyTensor(acl_src));
     ACL_CHECK(aclDestroyTensor(acl_dst));
 
     if (workspaceSize > 0) {
         ACL_CHECK(aclrtFree(workspaceAddr));
     }
 }
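For context on the fix itself: ggml's GGML_OP_NORM normalizes each row of length ne[0] by that row's own mean and variance. aclnnBatchNorm normalizes across the channel dimension (ne[2] here, hence the per-channel weight/bias buffers in the deleted code), which is the wrong axis for this op; aclnnLayerNorm with normalizedShape = {dst->ne[0]} matches the intended semantics, and passing nullptr for weight and bias skips the affine step. A minimal CPU sketch of the expected result, assuming contiguous float32 data; norm_ref is a hypothetical reference helper, not part of this commit:

#include <cmath>
#include <cstdint>

// Reference for GGML_OP_NORM semantics: each ne[0]-length row is normalized
// by its own mean and variance, with no affine scale/shift (weight and bias
// are nullptr in the aclnnLayerNorm call above).
// rows = ne[1] * ne[2] * ne[3], cols = ne[0]; assumes contiguous f32 data.
static void norm_ref(const float* src, float* dst, int64_t rows, int64_t cols,
                     float eps) {
    for (int64_t r = 0; r < rows; r++) {
        const float* x = src + r * cols;
        float* y = dst + r * cols;
        // Mean over the innermost dimension.
        float mean = 0.0f;
        for (int64_t c = 0; c < cols; c++) {
            mean += x[c];
        }
        mean /= cols;
        // Variance, then normalize.
        float var = 0.0f;
        for (int64_t c = 0; c < cols; c++) {
            float d = x[c] - mean;
            var += d * d;
        }
        var /= cols;
        const float inv_std = 1.0f / sqrtf(var + eps);
        for (int64_t c = 0; c < cols; c++) {
            y[c] = (x[c] - mean) * inv_std;
        }
    }
}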

ggml-cann/bcast.cpp (16 additions, 16 deletions)
@@ -27,12 +27,12 @@ aclDataType type_mapping(ggml_type type) {
  * Transform ggml_tensor to acl_tensor. Note that ggml_tensor dimension order
  * is reversed compared to acl_tensor.
  *
- * If bcast_ne and bcast_stride is nullptr, use ggml_tensor's ne and nb.
- * otherwise, use bcast_ne bcast_stride, which means tensor dims should be
+ * If bcast_ne and bcast_nb is nullptr, use ggml_tensor's ne and nb.
+ * otherwise, use bcast_ne bcast_nb, which means tensor dims should be
  * changed to satisfy the broadcast. @sa: get_bcast_shape.
  */
 aclTensor* create_acl_tensor(const ggml_tensor* tensor, int64_t* bcast_ne,
-                             int64_t* bcast_stride, int64_t bcast_dims) {
+                             size_t* bcast_nb, int64_t bcast_dims, aclFormat format) {
     size_t size = ggml_nbytes(tensor);
     void* deviceAddr = nullptr;
 
@@ -53,13 +53,13 @@ aclTensor* create_acl_tensor(const ggml_tensor* tensor, int64_t* bcast_ne,
         for (int i = 0; i < GGML_MAX_DIMS; i++) {
             acl_ne[i] = tensor->ne[i];
             // The step size of acl is in elements.
-            acl_stride[i] = tensor->nb[i] / tensor->nb[0];
+            acl_stride[i] = tensor->nb[i] / ggml_type_size(tensor->type);
         }
     } else {
         // With bcast
         for (int i = 0; i < bcast_dims; i++) {
             acl_ne[i] = bcast_ne[i];
-            acl_stride[i] = bcast_stride[i] / tensor->nb[0];
+            acl_stride[i] = bcast_nb[i] / ggml_type_size(tensor->type);
         }
     }
 
@@ -69,13 +69,13 @@ aclTensor* create_acl_tensor(const ggml_tensor* tensor, int64_t* bcast_ne,
 
     aclTensor* acl_tensor =
         aclCreateTensor(acl_ne, dims, type_mapping(tensor->type), acl_stride, 0,
-                        aclFormat::ACL_FORMAT_ND, acl_ne, dims, deviceAddr);
+                        format, acl_ne, dims, deviceAddr);
 
     return acl_tensor;
 }
 
 aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype, size_t type_size, int64_t* ne,
-                             size_t* nb, int64_t dims) {
+                             size_t* nb, int64_t dims, aclFormat format) {
 
     int64_t tmp_ne[GGML_MAX_DIMS * 2];
     int64_t tmp_stride[GGML_MAX_DIMS * 2];
@@ -90,7 +90,7 @@ aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype, size_t type_size
 
     aclTensor* acl_tensor =
         aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, 0,
-                        aclFormat::ACL_FORMAT_ND, tmp_ne, dims, data_ptr);
+                        format, tmp_ne, dims, data_ptr);
 
     return acl_tensor;
 }
@@ -132,26 +132,26 @@ aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype, size_t type_size
  */
 int64_t get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1,
                         int64_t* bcast_ne_src0, int64_t* bcast_ne_src1,
-                        int64_t* bcast_stride_src0,
-                        int64_t* bcast_stride_src1) {
+                        size_t* bcast_nb_src0,
+                        size_t* bcast_nb_src1) {
     GGML_ASSERT(ggml_can_repeat(src1, src0));
     int bcast_dim_cnt = 0;
     for (int i = 0; i < GGML_MAX_DIMS; i++) {
         int64_t nr = src0->ne[i] / src1->ne[i];
         bcast_ne_src0[bcast_dim_cnt] = src0->ne[i] / nr;
         bcast_ne_src1[bcast_dim_cnt] = src1->ne[i];
-        bcast_stride_src0[bcast_dim_cnt] = src0->nb[i];
-        bcast_stride_src1[bcast_dim_cnt] = src1->nb[i];
+        bcast_nb_src0[bcast_dim_cnt] = src0->nb[i];
+        bcast_nb_src1[bcast_dim_cnt] = src1->nb[i];
         bcast_dim_cnt++;
         if (nr != 1) {
             // Need to add an extra dim.
             bcast_ne_src0[bcast_dim_cnt] = nr;
             bcast_ne_src1[bcast_dim_cnt] = 1;
-            bcast_stride_src0[bcast_dim_cnt] =
-                bcast_stride_src0[bcast_dim_cnt - 1] *
-                bcast_ne_src0[bcast_dim_cnt - 1];
-            bcast_stride_src1[bcast_dim_cnt] =
-                bcast_stride_src1[bcast_dim_cnt - 1] *
-                bcast_ne_src1[bcast_dim_cnt - 1];
+            bcast_nb_src0[bcast_dim_cnt] =
+                bcast_nb_src0[bcast_dim_cnt - 1] *
+                bcast_ne_src0[bcast_dim_cnt - 1];
+            bcast_nb_src1[bcast_dim_cnt] =
+                bcast_nb_src1[bcast_dim_cnt - 1] *
+                bcast_ne_src1[bcast_dim_cnt - 1];
            bcast_dim_cnt++;
         }
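Two details in bcast.cpp are worth spelling out. First, ACL strides are counted in elements while ggml's nb[] is in bytes, so the stride is now derived by dividing by ggml_type_size(tensor->type) rather than by nb[0]; the two only agree when nb[0] is the element size, which does not hold for non-contiguous views such as transposed tensors. Second, when src0 repeats src1 along a dimension, get_bcast_shape splits that dimension so both tensors expose the same rank to the aclnn kernel. A hand-traced sketch with assumed float32 shapes (src0 and src1 here are hypothetical tensors, not from this commit):

// Assumed inputs, 4-byte f32 elements, ggml's reversed dim order, nb in bytes:
//   src0: ne = {4, 3, 2, 1}, nb = {4, 16, 48, 96}
//   src1: ne = {4, 3, 1, 1}, nb = {4, 16, 48, 48}   (repeated along dim 2)
int64_t bcast_ne_src0[GGML_MAX_DIMS * 2], bcast_ne_src1[GGML_MAX_DIMS * 2];
size_t  bcast_nb_src0[GGML_MAX_DIMS * 2], bcast_nb_src1[GGML_MAX_DIMS * 2];
int64_t dims = get_bcast_shape(src0, src1, bcast_ne_src0, bcast_ne_src1,
                               bcast_nb_src0, bcast_nb_src1);
// dims == 5: dim 2 is split into a (1, 2) pair for src0 and (1, 1) for src1,
// so both sides have equal rank and the kernel broadcasts over the new axis:
//   bcast_ne_src0 = {4, 3, 1, 2, 1},  bcast_nb_src0 = {4, 16, 48, 48, 96}
//   bcast_ne_src1 = {4, 3, 1, 1, 1},  bcast_nb_src1 = {4, 16, 48, 48, 48}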
ggml-cann/bcast.h (12 additions, 10 deletions)
@@ -10,28 +10,30 @@ aclDataType type_mapping(ggml_type type);
 
 aclTensor* create_acl_tensor(const ggml_tensor* tensor,
                              int64_t* bcast_ne = nullptr,
-                             int64_t* bcast_stride = nullptr,
-                             int64_t bcast_dims = 0);
+                             size_t* bcast_nb = nullptr,
+                             int64_t bcast_dims = 0,
+                             aclFormat format = ACL_FORMAT_ND);
 
-aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype, size_t type_size, int64_t* ne,
-                             size_t* nb, int64_t dims);
+aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype,
+                             size_t type_size, int64_t* ne, size_t* nb,
+                             int64_t dims, aclFormat format = ACL_FORMAT_ND);
 
 bool need_bcast(const ggml_tensor* t0, const ggml_tensor* t1);
 
 int64_t get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1,
                         int64_t* bcast_ne_src0, int64_t* bcast_ne_src1,
-                        int64_t* bcast_stride_src0, int64_t* bcast_stride_src1);
+                        size_t* bcast_nb_src0, size_t* bcast_nb_src1);
 
 // Bcast macro to avoid duplicate code.
 #define BCAST_SHAPE(src0, src1)                                               \
     int64_t bcast_ne_##src0[GGML_MAX_DIMS * 2];                               \
     int64_t bcast_ne_##src1[GGML_MAX_DIMS * 2];                               \
-    int64_t bcast_stride_##src0[GGML_MAX_DIMS * 2];                           \
-    int64_t bcast_stride_##src1[GGML_MAX_DIMS * 2];                           \
+    size_t bcast_nb_##src0[GGML_MAX_DIMS * 2];                                \
+    size_t bcast_nb_##src1[GGML_MAX_DIMS * 2];                                \
     int64_t bcast_dims =                                                      \
         get_bcast_shape(src0, src1, bcast_ne_##src0, bcast_ne_##src1,         \
-                        bcast_stride_##src0, bcast_stride_##src1);
+                        bcast_nb_##src0, bcast_nb_##src1);
 
-#define BCAST_PARAM(src) bcast_ne_##src, bcast_stride_##src, bcast_dims
+#define BCAST_PARAM(src) bcast_ne_##src, bcast_nb_##src, bcast_dims
 
-#endif //CANN_BCAST_H
+#endif  // CANN_BCAST_H
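A minimal sketch of how these helpers are meant to compose inside an op implementation; example_binary_op and its control flow are assumptions modeled on the patterns visible in this diff, not code from the commit:

#include "bcast.h"

// Hypothetical element-wise binary op: dst = op(src0, src1), where src1 may
// need to be repeated to match src0 (the ggml_can_repeat precondition).
static void example_binary_op(ggml_tensor* dst) {
    ggml_tensor* src0 = dst->src[0];
    ggml_tensor* src1 = dst->src[1];

    aclTensor* acl_src0;
    aclTensor* acl_src1;
    if (need_bcast(src0, src1)) {
        // Declares bcast_ne_{src0,src1}, bcast_nb_{src0,src1} and bcast_dims
        // in the current scope; BCAST_PARAM then forwards them.
        BCAST_SHAPE(src0, src1)
        acl_src0 = create_acl_tensor(src0, BCAST_PARAM(src0));
        acl_src1 = create_acl_tensor(src1, BCAST_PARAM(src1));
    } else {
        acl_src0 = create_acl_tensor(src0);
        acl_src1 = create_acl_tensor(src1);
    }

    // ... launch the aclnn kernel with acl_src0/acl_src1, then release both
    // with aclDestroyTensor once the stream has been synchronized ...
}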
