Skip to content

Commit 81c999f

Browse files
authored
cann : add Ascend NPU support (ggml-org#2336)
* enable Ascend NPU in src/whisper.cpp * sync test-backend-ops with llama.cpp
1 parent 4b7de08 commit 81c999f

File tree

3 files changed

+128
-21
lines changed

3 files changed

+128
-21
lines changed

ggml/src/ggml-cann/Doxyfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ DOXYFILE_ENCODING = UTF-8
3232
# title of most generated pages and in a few other places.
3333
# The default value is: My Project.
3434

35-
PROJECT_NAME = "llama.cpp"
35+
PROJECT_NAME = "whisper.cpp"
3636

3737
# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
3838
# could be handy for archiving the generated documentation or if some version
@@ -44,7 +44,7 @@ PROJECT_NUMBER =
4444
# for a project that appears at the top of each page and should give viewer a
4545
# quick idea about the purpose of the project. Keep the description short.
4646

47-
PROJECT_BRIEF = "llama inference engine"
47+
PROJECT_BRIEF = "Port of OpenAI's Whisper model in C/C++"
4848

4949
# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
5050
# in the documentation. The maximum height of the logo should not exceed 55

src/whisper.cpp

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,10 @@
2929
#include "openvino/whisper-openvino-encoder.h"
3030
#endif
3131

32+
#ifdef GGML_USE_CANN
33+
#include "ggml-cann.h"
34+
#endif
35+
3236
#include "ggml.h"
3337
#include "ggml-alloc.h"
3438
#include "ggml-backend.h"
@@ -1283,6 +1287,16 @@ static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & pa
12831287
}
12841288
#endif
12851289

1290+
#ifdef GGML_USE_CANN
1291+
if (params.use_gpu) {
1292+
WHISPER_LOG_INFO("%s: using CANN backend\n", __func__);
1293+
result = ggml_backend_cann_init(params.gpu_device);
1294+
if (!result) {
1295+
WHISPER_LOG_ERROR("%s: ggml_backend_cann_init() failed\n", __func__);
1296+
}
1297+
}
1298+
#endif
1299+
12861300
return result;
12871301
}
12881302

@@ -1335,6 +1349,10 @@ static ggml_backend_buffer_type_t whisper_default_buffer_type(const whisper_cont
13351349
result || (result = ggml_backend_vk_buffer_type(params.gpu_device));
13361350
#endif
13371351

1352+
#ifdef GGML_USE_CANN
1353+
result || (result = ggml_backend_cann_buffer_type(params.gpu_device));
1354+
#endif
1355+
13381356
result || (result = ggml_backend_cpu_buffer_type());
13391357

13401358
return result;
@@ -4337,8 +4355,8 @@ const char * whisper_print_system_info(void) {
43374355
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
43384356
s += "CUDA = " + std::to_string(ggml_cpu_has_cuda()) + " | ";
43394357
s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
4340-
s += "OPENVINO = " + std::to_string(whisper_has_openvino()) ;
4341-
4358+
s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";
4359+
s += "CANN = " + std::to_string(ggml_cpu_has_cann()) ;
43424360
return s.c_str();
43434361
}
43444362

tests/test-backend-ops.cpp

Lines changed: 106 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
#include <ggml.h>
22
#include <ggml-alloc.h>
33
#include <ggml-backend.h>
4-
#include <ggml-backend-impl.h>
54

65
#include <algorithm>
76
#include <array>
@@ -80,14 +79,22 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
8079
im = nullptr;
8180
}
8281
}
82+
8383
ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
8484
GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
85+
// TODO: other cases
86+
//#pragma omp parallel for
87+
//for (int i = 0; i < tensor->ne[1]; i++) {
88+
// ggml_quantize_chunk(tensor->type, data.data(), dataq.data(),
89+
// i * tensor->ne[0], 1, tensor->ne[0], im);
90+
//}
91+
8592
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
8693
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
8794
// This is going to create some weird integers though.
8895
ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
8996
} else {
90-
GGML_ASSERT(false);
97+
GGML_ABORT("fatal error");
9198
}
9299
}
93100

@@ -125,7 +132,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
125132
tt.to_float(&buf[i], vq.data(), bs);
126133
tv.insert(tv.end(), vq.begin(), vq.end());
127134
} else {
128-
GGML_ASSERT(false);
135+
GGML_ABORT("fatal error");
129136
}
130137
}
131138
}
@@ -760,7 +767,7 @@ struct test_dup : public test_case {
760767
}
761768

762769
test_dup(ggml_type type = GGML_TYPE_F32,
763-
std::array<int64_t, 4> ne = {10, 10, 10, 1},
770+
std::array<int64_t, 4> ne = {10, 10, 20, 1},
764771
std::array<int64_t, 4> permute = {0, 0, 0, 0})
765772
: type(type), ne(ne), permute(permute),
766773
_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
@@ -780,9 +787,11 @@ struct test_cpy : public test_case {
780787
const ggml_type type_src;
781788
const ggml_type type_dst;
782789
const std::array<int64_t, 4> ne;
790+
const std::array<int64_t, 4> permute;
791+
bool _src_use_permute;
783792

784793
std::string vars() override {
785-
return VARS_TO_STR3(type_src, type_dst, ne);
794+
return VARS_TO_STR4(type_src, type_dst, ne, permute);
786795
}
787796

788797
double max_nmse_err() override {
@@ -794,12 +803,17 @@ struct test_cpy : public test_case {
794803
}
795804

796805
test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
797-
std::array<int64_t, 4> ne = {10, 10, 10, 1})
798-
: type_src(type_src), type_dst(type_dst), ne(ne) {}
806+
std::array<int64_t, 4> ne = {10, 10, 10, 1},
807+
std::array<int64_t, 4> permute = {0, 0, 0, 0})
808+
: type_src(type_src), type_dst(type_dst), ne(ne), permute(permute),
809+
_src_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
799810

800811
ggml_tensor * build_graph(ggml_context * ctx) override {
801812
ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
802-
ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, ne.data());
813+
if (_src_use_permute) {
814+
src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
815+
}
816+
ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, src->ne);
803817
ggml_tensor * out = ggml_cpy(ctx, src, dst);
804818
return out;
805819
}
@@ -1175,6 +1189,7 @@ struct test_soft_max : public test_case {
11751189
}
11761190
};
11771191

1192+
11781193
// GGML_OP_ROPE
11791194
struct test_rope : public test_case {
11801195
const ggml_type type;
@@ -1267,6 +1282,32 @@ struct test_pool2d : public test_case {
12671282
}
12681283
};
12691284

1285+
// GGML_OP_CONV_TRANSPOSE_1D
1286+
struct test_conv_transpose_1d : public test_case {
1287+
const std::array<int64_t, 4> ne_input;
1288+
const std::array<int64_t, 4> ne_kernel;
1289+
1290+
const int s0; // stride
1291+
const int p0; // padding
1292+
const int d0; // dilation
1293+
1294+
std::string vars() override {
1295+
return VARS_TO_STR5(ne_input, ne_kernel, s0, p0, d0);
1296+
}
1297+
1298+
test_conv_transpose_1d(std::array<int64_t, 4> ne_input = {197, 32, 1, 1}, // [input_width, input_height, input_channels, 1]
1299+
std::array<int64_t, 4> ne_kernel = {16, 32, 32, 1}, // [kernel_width, kernel_height, input_channels, 1]
1300+
int s0 = 1, int p0 = 0, int d0 = 1)
1301+
: ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), p0(p0), d0(d0) {}
1302+
1303+
ggml_tensor * build_graph(ggml_context * ctx) override {
1304+
ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
1305+
ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data());
1306+
ggml_tensor * out = ggml_conv_transpose_1d(ctx, kernel, input, s0, p0, d0);
1307+
return out;
1308+
}
1309+
};
1310+
12701311
// GGML_OP_IM2COL
12711312
struct test_im2col : public test_case {
12721313
const ggml_type type_input;
@@ -1280,7 +1321,7 @@ struct test_im2col : public test_case {
12801321
// padding
12811322
const int p0;
12821323
const int p1;
1283-
// dilatation
1324+
// dilation
12841325
const int d0;
12851326
const int d1;
12861327
// mode
@@ -1393,7 +1434,7 @@ struct test_argsort : public test_case {
13931434
ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
13941435
}
13951436
} else {
1396-
GGML_ASSERT(false);
1437+
GGML_ABORT("fatal error");
13971438
}
13981439
}
13991440
}
@@ -1470,19 +1511,21 @@ struct test_group_norm : public test_case {
14701511
const ggml_type type;
14711512
const std::array<int64_t, 4> ne;
14721513
const int32_t num_groups;
1514+
const float eps;
14731515

14741516
std::string vars() override {
14751517
return VARS_TO_STR3(type, ne, num_groups);
14761518
}
14771519

14781520
test_group_norm(ggml_type type = GGML_TYPE_F32,
14791521
std::array<int64_t, 4> ne = {64, 64, 320, 1},
1480-
int32_t num_groups = 32)
1481-
: type(type), ne(ne), num_groups(num_groups) {}
1522+
int32_t num_groups = 32,
1523+
float eps = 1e-6f)
1524+
: type(type), ne(ne), num_groups(num_groups), eps(eps) {}
14821525

14831526
ggml_tensor * build_graph(ggml_context * ctx) override {
14841527
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1485-
ggml_tensor * out = ggml_group_norm(ctx, a, num_groups);
1528+
ggml_tensor * out = ggml_group_norm(ctx, a, num_groups, eps);
14861529
return out;
14871530
}
14881531
};
@@ -2053,6 +2096,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
20532096
GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
20542097
GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
20552098
GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
2099+
GGML_TYPE_BF16,
20562100
};
20572101

20582102
// unary ops
@@ -2097,6 +2141,19 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
20972141

20982142
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
20992143
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));
2144+
// test cases for 1D im2col
2145+
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
2146+
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
2147+
2148+
test_cases.emplace_back(new test_conv_transpose_1d());
2149+
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1));
2150+
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 2, 0, 1));
2151+
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 1, 0, 1));
2152+
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 2, 0, 1));
2153+
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 1, 0, 1));
2154+
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
2155+
test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
2156+
21002157

21012158
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
21022159
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1}));
@@ -2110,12 +2167,22 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
21102167
test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
21112168
test_cases.emplace_back(new test_dup(GGML_TYPE_I32));
21122169
test_cases.emplace_back(new test_dup(GGML_TYPE_I16));
2170+
test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {0, 2, 1, 3}));
2171+
test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows
2172+
test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3}));
2173+
test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst not-contiguous
21132174
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
21142175
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
21152176

21162177
for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
21172178
for (ggml_type type_dst : all_types) {
21182179
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
2180+
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
2181+
}
2182+
}
2183+
for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
2184+
for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) {
2185+
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // cpy not-contiguous
21192186
}
21202187
}
21212188

@@ -2165,6 +2232,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
21652232
test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
21662233
}
21672234

2235+
#if 1
21682236
for (ggml_type type_a : base_types) {
21692237
for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
21702238
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
@@ -2184,10 +2252,31 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
21842252
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
21852253
}
21862254
}
2255+
#else
2256+
// m = a rows
2257+
// n = b rows
2258+
// k = cols
2259+
std::uniform_int_distribution<> dist_m(1, 128);
2260+
std::uniform_int_distribution<> dist_n(16, 128);
2261+
std::uniform_int_distribution<> dist_k(1, 16);
2262+
for (int i = 0; i < 1000; i++) {
2263+
for (ggml_type type_a : all_types) {
2264+
for (ggml_type type_b : {GGML_TYPE_F32}) {
2265+
int m = dist_m(rng);
2266+
int n = dist_n(rng);
2267+
int k = dist_k(rng) * ggml_blck_size(type_a);
2268+
test_cases.emplace_back(new test_mul_mat(type_a, type_b, m, n, k, { 1, 1}, {1, 1}));
2269+
}
2270+
}
2271+
}
2272+
#endif
21872273

21882274
for (ggml_type type_a : other_types) {
21892275
for (ggml_type type_b : {GGML_TYPE_F32}) {
2190-
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
2276+
if (ggml_blck_size(type_a) != 256) {
2277+
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, ggml_blck_size(type_a), {1, 1}, {1, 1}));
2278+
}
2279+
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1, 1}, {1, 1}));
21912280
}
21922281
}
21932282

@@ -2247,7 +2336,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
22472336
for (int n = 0; n < 10; ++n) {
22482337
int64_t ne0 = dist_ne0(rng);
22492338
int64_t ne1 = dist_ne1(rng);
2250-
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f));
2339+
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f));
22512340
}
22522341

22532342
exponent <<= 1;
@@ -2266,7 +2355,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
22662355
}
22672356
}
22682357
}
2269-
2358+
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, 0.1f, 0.0f));
22702359
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f));
22712360
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f));
22722361
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));
@@ -2380,7 +2469,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
23802469
return true;
23812470
}
23822471

2383-
GGML_ASSERT(false);
2472+
GGML_ABORT("fatal error");
23842473
return false;
23852474
}
23862475

0 commit comments

Comments
 (0)