Commit c330b78: Add aclop

1 parent: 5fec9cb

File tree: 9 files changed, +458 -21 lines

CMakeLists.txt

Lines changed: 14 additions & 0 deletions
```diff
@@ -841,6 +841,20 @@ if (LLAMA_CANN)
     endif()
 endif()
 
+# * libacl_op_compiler.so
+if (LLAMA_CANN)
+    set(lib_dir "${CANN_INSTALL_DIR}/lib64")
+    find_library(found_lib_acl_op_compiler NAMES acl_op_compiler PATHS ${lib_dir} NO_DEFAULT_PATH)
+    if (found_lib_acl_op_compiler)
+        set(lib_acl_op_compiler ${found_lib_acl_op_compiler})
+        list(APPEND CANN_LIBRARIES ${lib_acl_op_compiler})
+        message(STATUS "CANN: libacl_op_compiler.so is found at ${lib_dir}")
+    else()
+        set(LLAMA_CANN OFF)
+        message(WARNING "CANN: Missing libacl_op_compiler.so. Turning off LLAMA_CANN")
+    endif()
+endif()
+
 # Set headers and libs
 if (LLAMA_CANN)
     message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
```

ggml-cann.cpp

Lines changed: 34 additions & 15 deletions
```diff
@@ -8,6 +8,7 @@
 #include "ggml-backend-impl.h"
 #include "ggml-cann/aclnn_ops.h"
 #include "ggml-cann/common.h"
+#include "ggml-cann/acl_ops.h"
 
 struct AclLifeCycle {
     AclLifeCycle() { ACL_CHECK(aclInit(nullptr)); }
@@ -346,8 +347,10 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
             ggml_cann_repeat(ctx, dst);
             break;
         case GGML_OP_GET_ROWS:
-        case GGML_OP_DUP:
             return false;
+        case GGML_OP_DUP:
+            ggml_cann_cont(ctx, dst);
+            break;
         case GGML_OP_ADD:
             ggml_cann_add(ctx, dst);
             break;
@@ -394,14 +397,19 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
             }
             break;
         case GGML_OP_NORM:
+            ggml_cann_norm(ctx, dst);
+            break;
         case GGML_OP_GROUP_NORM:
             return false;
         case GGML_OP_CONCAT:
             ggml_cann_concat(ctx, dst);
             break;
+        // TODO: Format need NC1HWC0.
         case GGML_OP_UPSCALE:
-        case GGML_OP_PAD:
             return false;
+        case GGML_OP_PAD:
+            ggml_cann_pad(ctx, dst);
+            break;
         case GGML_OP_ARANGE:
             ggml_cann_arange(ctx, dst);
             break;
@@ -413,21 +421,27 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
         case GGML_OP_RMS_NORM:
         case GGML_OP_MUL_MAT:
         case GGML_OP_MUL_MAT_ID:
-        case GGML_OP_SCALE:
             return false;
+        case GGML_OP_SCALE:
+            ggml_cann_scale(ctx, dst);
+            break;
         case GGML_OP_SQR:
             ggml_cann_sqr(ctx, dst);
             break;
         case GGML_OP_CLAMP:
             ggml_cann_clamp(ctx, dst);
             break;
         case GGML_OP_CPY:
+            return false;
         case GGML_OP_CONT:
+            ggml_cann_cont(ctx, dst);
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
+            // Do nothing with these ops.
+            break;
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ROPE:
@@ -437,8 +451,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
         case GGML_OP_SUM_ROWS:
             return false;
         case GGML_OP_ARGSORT:
-            // ggml_cann_argsort(ctx, dst);
-            // break;
+            ggml_cann_argsort(ctx, dst);
+            break;
             return false;
         default:
             return false;
@@ -458,7 +472,8 @@ GGML_CALL static const char* ggml_backend_cann_name(ggml_backend_t backend) {
 GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) {
     ggml_backend_cann_context* cann_ctx =
         (ggml_backend_cann_context*)backend->context;
-
+    ACL_CHECK(aclrtSynchronizeDevice());
+    ACL_CHECK(aclrtResetDevice(cann_ctx->device));
     delete cann_ctx;
     delete backend;
 }
@@ -591,8 +606,9 @@ GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor* node = cgraph->nodes[i];
 
-        if (ggml_is_empty(node) || node->op == GGML_OP_VIEW ||
-            node->op == GGML_OP_NONE) {
+        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE ||
+            node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW ||
+            node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
             continue;
         }
 
@@ -627,29 +643,31 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
         case GGML_OP_MUL_MAT_ID:
         case GGML_OP_GET_ROWS:
         case GGML_OP_CPY:
-        case GGML_OP_DUP:
             return false;
+        case GGML_OP_DUP:
+            return true;
         case GGML_OP_REPEAT:
         case GGML_OP_CONCAT:
        case GGML_OP_NONE:
-            return true;
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
+            return true;
         case GGML_OP_NORM:
-            return false;
+            return true;
         case GGML_OP_ADD:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
             return true;
         case GGML_OP_RMS_NORM:
-        case GGML_OP_SCALE:
             return false;
+        case GGML_OP_SCALE:
+            return true;
         case GGML_OP_SQR:
         case GGML_OP_CLAMP:
-            return true;
         case GGML_OP_CONT:
+            return true;
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ROPE:
@@ -659,12 +677,13 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
         case GGML_OP_SUM_ROWS:
             return false;
         case GGML_OP_ARGSORT:
-            return false;
+            return true;
         case GGML_OP_ACC:
         case GGML_OP_GROUP_NORM:
         case GGML_OP_UPSCALE:
-        case GGML_OP_PAD:
             return false;
+        case GGML_OP_PAD:
+            return true;
         case GGML_OP_ARANGE:
             return true;
         case GGML_OP_TIMESTEP_EMBEDDING:
```
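The recurring pattern in these hunks: an op moves off the shared `return false` fall-through, gains a `ggml_cann_*` dispatch plus `break` in `ggml_cann_compute_forward`, and its entry in `ggml_backend_cann_supports_op` flips to `return true`. A minimal, self-contained sketch of that dispatch shape; the enum and kernel stubs are illustrative stand-ins, not the backend's real types:

```cpp
#include <cstdio>

// Stand-ins for a few ggml op tags handled by the hunks above.
enum Op { OP_SCALE, OP_ARGSORT, OP_MUL_MAT, OP_RESHAPE };

// Mirrors ggml_cann_compute_forward's contract: dispatch supported ops and
// return true; return false for ops the CANN backend cannot run yet.
static bool compute_forward(Op op) {
    switch (op) {
        case OP_SCALE:
            std::puts("scale kernel");    // stands in for ggml_cann_scale
            break;
        case OP_ARGSORT:
            std::puts("argsort kernel");  // stands in for ggml_cann_argsort
            break;
        case OP_RESHAPE:
            break;                        // view-like op: nothing to do
        default:
            return false;                 // e.g. OP_MUL_MAT still runs elsewhere
    }
    return true;
}

int main() { return compute_forward(OP_SCALE) ? 0 : 1; }
```

Note the matching change in `ggml_backend_cann_graph_compute`: view-like nodes (`RESHAPE`, `TRANSPOSE`, `VIEW`, `PERMUTE`) are now skipped before dispatch, since they only adjust tensor metadata.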

ggml-cann/acl_ops.cpp

Lines changed: 132 additions & 0 deletions
```cpp
#include "acl_ops.h"

OpCaller::OpCaller() { attrs = aclopCreateAttr(); }

OpCaller::~OpCaller() {
    for (aclTensorDesc* desc : input_descs) {
        aclDestroyTensorDesc(desc);
    }
    for (aclDataBuffer* buffer : input_buffers) {
        aclDestroyDataBuffer(buffer);
    }
    for (aclTensorDesc* desc : output_descs) {
        aclDestroyTensorDesc(desc);
    }
    for (aclDataBuffer* buffer : output_buffers) {
        aclDestroyDataBuffer(buffer);
    }
    // TODO: may free before use.
    for (void* ptr : ptrs) {
        aclrtFree(ptr);
    }
    aclopDestroyAttr(attrs);
}

OpCaller& OpCaller::name(std::string _op_name) {
    op_name = _op_name;
    return *this;
}

OpCaller& OpCaller::input_no_contiguous(ggml_tensor* tensor, const char* name) {
    aclDataType dtype = type_mapping(tensor->type);
    // TODO
    int64_t ne[] = {tensor->ne[3], tensor->ne[2], tensor->ne[1], tensor->ne[0]};
    aclTensorDesc* tensor_desc =
        aclCreateTensorDesc(dtype, GGML_MAX_DIMS, ne, ACL_FORMAT_ND);
    aclSetTensorDescName(tensor_desc, name);
    input_descs.push_back(tensor_desc);
    aclDataBuffer* data_buffer =
        aclCreateDataBuffer(tensor->data, ggml_nbytes(tensor));
    input_buffers.push_back(data_buffer);
    return *this;
}

OpCaller& OpCaller::input(ggml_tensor* tensor, const char* name) {
    GGML_ASSERT(ggml_is_contiguous(tensor));
    return input_no_contiguous(tensor, name);
}

OpCaller& OpCaller::output(ggml_tensor* tensor, const char* name) {
    aclDataType dtype = type_mapping(tensor->type);
    aclTensorDesc* tensor_desc =
        aclCreateTensorDesc(dtype, GGML_MAX_DIMS, tensor->ne, ACL_FORMAT_ND);
    aclSetTensorDescName(tensor_desc, name);
    output_descs.push_back(tensor_desc);
    aclDataBuffer* data_buffer =
        aclCreateDataBuffer(tensor->data, ggml_nbytes(tensor));
    output_buffers.push_back(data_buffer);
    return *this;
}

OpCaller& OpCaller::attr(int64_t value, const char* name) {
    ACL_CHECK(aclopSetAttrInt(attrs, name, value));
    return *this;
}

OpCaller& OpCaller::attr(bool value, const char* name) {
    ACL_CHECK(aclopSetAttrBool(attrs, name, value));
    return *this;
}

OpCaller& OpCaller::attr(float value, const char* name) {
    ACL_CHECK(aclopSetAttrFloat(attrs, name, value));
    return *this;
}

OpCaller& OpCaller::run(aclrtStream stream) {
    ACL_CHECK(aclSetCompileopt(ACL_OP_JIT_COMPILE, "disable"));
    ACL_CHECK(aclopCompileAndExecute(
        op_name.c_str(), input_descs.size(), input_descs.data(),
        input_buffers.data(), output_buffers.size(), output_descs.data(),
        output_buffers.data(), attrs, ACL_ENGINE_SYS, ACL_COMPILE_SYS, nullptr,
        stream));
    return *this;
}

void ggml_cann_cont(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src = dst->src[0];
    int64_t src_stride[GGML_MAX_DIMS];
    int64_t dst_stride[GGML_MAX_DIMS];

    for (int i = 0; i < GGML_MAX_DIMS; i++) {
        src_stride[i] = src->nb[i] / ggml_type_size(src->type);
        dst_stride[i] = dst->nb[i] / ggml_type_size(src->type);
    }

    int64_t storage_offset[] = {0};
    int64_t storage_offset_dim[] = {1};
    int64_t size_stride_dim[] = {GGML_MAX_DIMS};

    OpCaller op;
    op.name("ViewCopy")
        .input_no_contiguous(dst, "dst")
        .input(dst->ne, ACL_INT64, 1, size_stride_dim, "dst_size", ctx.stream())
        .input(dst_stride, ACL_INT64, 1, size_stride_dim, "dst_stride",
               ctx.stream())
        .input(storage_offset, ACL_INT64, 1, storage_offset_dim,
               "dst_storage_offset", ctx.stream())
        .input_no_contiguous(src, "src")
        .input(src->ne, ACL_INT64, 1, size_stride_dim, "src_size", ctx.stream())
        .input(src_stride, ACL_INT64, 1, size_stride_dim, "src_stride",
               ctx.stream())
        .input(storage_offset, ACL_INT64, 1, storage_offset_dim,
               "src_storage_offset", ctx.stream())
        .output(dst, "dst")
        .run(ctx.stream());
    //aclrtSynchronizeStream(ctx.stream());
}

void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src = dst->src[0];
    int64_t paddings[] = {
        0, dst->ne[3] - src->ne[3], 0, dst->ne[2] - src->ne[2],
        0, dst->ne[1] - src->ne[1], 0, dst->ne[0] - src->ne[0]};
    int64_t dim[] = {GGML_MAX_DIMS, 2};
    OpCaller op;
    op.name("Pad")
        .input(src, "x")
        .input(paddings, ACL_INT64, 2, dim, "paddings", ctx.stream())
        .output(dst, "y")
        .run(ctx.stream());
    //aclrtSynchronizeStream(ctx.stream());
}
```
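In `ggml_cann_pad`, the `paddings` array holds one `(before, after)` pair per dimension, starting from ggml's outermost dimension `ne[3]`, with all padding appended after the existing data. A small worked example of that layout, using hypothetical shapes (ggml's `ne[0]` is the innermost dimension):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical shapes: pad a 3x2 source (ne = {3, 2, 1, 1}) up to a
    // 5x4 destination (ne = {5, 4, 1, 1}).
    int64_t src_ne[4] = {3, 2, 1, 1};
    int64_t dst_ne[4] = {5, 4, 1, 1};

    // Same layout ggml_cann_pad builds: one (before, after) pair per
    // dimension, outermost (ne[3]) first, padding only after the data.
    int64_t paddings[8];
    for (int i = 0; i < 4; i++) {
        paddings[2 * i]     = 0;                              // pad before
        paddings[2 * i + 1] = dst_ne[3 - i] - src_ne[3 - i];  // pad after
    }
    for (int i = 0; i < 4; i++) {
        std::printf("ne[%d]: (%lld, %lld)\n", 3 - i,
                    (long long)paddings[2 * i], (long long)paddings[2 * i + 1]);
    }
    // Prints (0,0) for ne[3] and ne[2], (0,2) for ne[1] and ne[0]:
    // two trailing rows and two trailing columns of padding.
    return 0;
}
```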

ggml-cann/acl_ops.h

Lines changed: 79 additions & 0 deletions
```cpp
#ifndef CANN_ACL_OPS
#define CANN_ACL_OPS

#include <acl/acl_op.h>
#include <acl/acl_op_compiler.h>

#include <string>
#include <vector>

#include "bcast.h"
#include "common.h"

struct OpCaller {
    std::string op_name;
    std::vector<aclTensorDesc*> input_descs;
    std::vector<aclDataBuffer*> input_buffers;
    std::vector<aclTensorDesc*> output_descs;
    std::vector<aclDataBuffer*> output_buffers;
    aclopAttr* attrs;
    std::vector<void*> ptrs;

    OpCaller();

    virtual ~OpCaller();

    OpCaller& name(std::string _op_name);

    OpCaller& input_no_contiguous(ggml_tensor* tensor, const char* name);

    OpCaller& input(ggml_tensor* tensor, const char* name);

    OpCaller& output(ggml_tensor* tensor, const char* name);

    OpCaller& attr(int64_t value, const char* name);

    OpCaller& attr(bool value, const char* name);

    OpCaller& attr(float value, const char* name);

    template <typename T>
    OpCaller& input(T* values, aclDataType dtype, size_t dims, int64_t* dim,
                    const char* name, aclrtStream stream = nullptr) {
        void* device_ptr = nullptr;
        size_t n_elem = 1;
        for (size_t i = 0; i < dims; i++) {
            n_elem *= dim[i];
        }

        size_t n_bytes = n_elem * sizeof(T);
        ACL_CHECK(aclrtMalloc(&device_ptr, n_bytes, ACL_MEM_MALLOC_HUGE_FIRST));
        ptrs.push_back(device_ptr);
        if (stream == nullptr) {
            ACL_CHECK(aclrtMemcpy(device_ptr, n_bytes, values, n_bytes,
                                  ACL_MEMCPY_HOST_TO_DEVICE));
        } else {
            ACL_CHECK(aclrtMemcpyAsync(device_ptr, n_bytes, values, n_bytes,
                                       ACL_MEMCPY_HOST_TO_DEVICE, stream));
        }

        aclTensorDesc* tensor_desc =
            aclCreateTensorDesc(dtype, dims, dim, ACL_FORMAT_ND);
        aclSetTensorDescName(tensor_desc, name);
        input_descs.push_back(tensor_desc);
        aclDataBuffer* data_buffer = aclCreateDataBuffer(device_ptr, n_bytes);
        input_buffers.push_back(data_buffer);

        return *this;
    }

    OpCaller& run(aclrtStream stream = nullptr);
};

void ggml_cann_cont(ggml_backend_cann_context& ctx, ggml_tensor* dst);

void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst);

void ggml_cann_upscale(ggml_backend_cann_context& ctx, ggml_tensor* dst);

#endif  // CANN_ACL_OPS
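As a usage sketch of the builder, the chained calls below mirror how `ggml_cann_pad` drives `OpCaller`. The operator name "Muls" and its "value" attribute are assumptions about the CANN operator catalog, not names taken from this commit; the final synchronize guards against the destructor's "may free before use" TODO, since `run()` only enqueues work on the stream:

```cpp
#include "acl_ops.h"

// Hedged sketch: scale src by 2.0f into dst with a single CANN operator.
// "Muls" and its "value" attribute are assumed operator-catalog names.
void ggml_cann_scale_sketch(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src = dst->src[0];

    OpCaller op;
    op.name("Muls")             // elementwise multiply-by-scalar (assumed)
        .input(src, "x")        // contiguous device tensor
        .attr(2.0f, "value")    // scalar attribute, set via aclopSetAttrFloat
        .output(dst, "y")
        .run(ctx.stream());     // enqueues compile-and-execute on the stream

    // ~OpCaller frees any device buffers staged by the templated input();
    // synchronize first so nothing is freed while a kernel may still run.
    ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
}
```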
