Skip to content

Commit dca1d4b

Browse files
authored
ggml : fix BLAS with unsupported types (ggml-org#9775)
* ggml : do not use BLAS with types without to_float
* ggml : return pointer from ggml_internal_get_type_traits to avoid unnecessary copies
* ggml : rename ggml_internal_get_type_traits -> ggml_get_type_traits
  (it's not really internal if everybody uses it)
1 parent 458367a commit dca1d4b

File tree

13 files changed

+75
-74
lines changed

13 files changed

+75
-74
lines changed

examples/export-lora/export-lora.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -314,9 +314,9 @@ struct lora_merge_ctx {
314314
// optionally dequantize it
315315
printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
316316
auto nels = ggml_nelements(inp_base);
317-
ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
317+
const auto * qtype = ggml_get_type_traits(base->type);
318318
std::vector<uint8_t> dequant_buf(nels * sizeof(float));
319-
qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
319+
qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
320320
ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
321321
} else {
322322
ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));

examples/quantize-stats/quantize-stats.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
142142
}
143143

144144
static void test_roundtrip_on_chunk(
145-
const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference,
145+
const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, bool use_reference,
146146
float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
147147
) {
148148
if (layer->type == GGML_TYPE_F16) {
@@ -166,7 +166,7 @@ static void test_roundtrip_on_chunk(
166166

167167
// Run quantization function for a single layer and update error stats
168168
static void test_roundtrip_on_layer(
169-
std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference,
169+
std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, bool use_reference,
170170
const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
171171
std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
172172
) {
@@ -371,8 +371,8 @@ int main(int argc, char ** argv) {
371371
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
372372
continue;
373373
}
374-
ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
375-
if (qfns.from_float && qfns.to_float) {
374+
const auto * qfns = ggml_get_type_traits(type);
375+
if (qfns->from_float && qfns->to_float) {
376376
if (params.verbose) {
377377
printf("testing %s ...\n", ggml_type_name(type));
378378
}
@@ -393,7 +393,7 @@ int main(int argc, char ** argv) {
393393
test_roundtrip_on_layer(
394394
layer_name,
395395
params.per_layer_stats,
396-
qfns,
396+
*qfns,
397397
params.reference,
398398
kv_tensor.second,
399399
input_scratch,

ggml/include/ggml.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2535,7 +2535,7 @@ extern "C" {
25352535
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
25362536
const void * GGML_RESTRICT y, int nr, int nc);
25372537

2538-
typedef struct {
2538+
struct ggml_type_traits {
25392539
const char * type_name;
25402540
int64_t blck_size;
25412541
int64_t blck_size_interleave; // interleave elements in blocks
@@ -2551,9 +2551,9 @@ extern "C" {
25512551
int64_t ncols; // number of columns to process simultaneously
25522552
ggml_gemv_t gemv;
25532553
ggml_gemm_t gemm;
2554-
} ggml_type_traits_t;
2554+
};
25552555

2556-
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
2556+
GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
25572557

25582558
#ifdef __cplusplus
25592559
}

ggml/src/ggml-backend.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1177,7 +1177,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
11771177
op->type != GGML_TYPE_IQ1_S &&
11781178
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
11791179
case GGML_OP_MUL_MAT:
1180-
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
1180+
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
11811181
case GGML_OP_ROPE_BACK:
11821182
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
11831183
case GGML_OP_IM2COL_BACK:

ggml/src/ggml-blas.cpp

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,8 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
6565

6666
// convert src0 to float
6767
if (type != GGML_TYPE_F32) {
68-
ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type);
69-
ggml_to_float_t const to_float = type_traits.to_float;
68+
const auto * type_traits = ggml_get_type_traits(type);
69+
ggml_to_float_t const to_float = type_traits->to_float;
7070

7171
for (int64_t i03 = 0; i03 < ne03; i03++) {
7272
for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -420,19 +420,21 @@ static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const s
420420
// TODO: find the optimal value
421421
const int64_t min_batch = 32;
422422

423-
return (ggml_is_contiguous(src0) &&
424-
ggml_is_contiguous(src1) &&
425-
src1->type == GGML_TYPE_F32 &&
426-
(ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch));
423+
return ggml_is_contiguous(src0) &&
424+
ggml_is_contiguous(src1) &&
425+
src1->type == GGML_TYPE_F32 &&
426+
(ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch) &&
427+
(src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL);
427428
}
428429

429430
case GGML_OP_OUT_PROD:
430-
return (op->src[0]->type == GGML_TYPE_F32 &&
431-
op->src[1]->type == GGML_TYPE_F32 &&
432-
ggml_is_matrix(src0) &&
433-
ggml_is_matrix(src1) &&
434-
ggml_is_contiguous(src0) &&
435-
(ggml_is_contiguous(src1) || ggml_is_transposed(src1)));
431+
return op->src[0]->type == GGML_TYPE_F32 &&
432+
op->src[1]->type == GGML_TYPE_F32 &&
433+
ggml_is_matrix(src0) &&
434+
ggml_is_matrix(src1) &&
435+
ggml_is_contiguous(src0) &&
436+
(ggml_is_contiguous(src1) || ggml_is_transposed(src1)) &&
437+
(src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL);
436438

437439
default:
438440
return false;

ggml/src/ggml-vulkan.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5287,9 +5287,9 @@ static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, gg
52875287
return;
52885288
}
52895289

5290-
ggml_type_traits_t tt = ggml_internal_get_type_traits(quant);
5290+
const auto * tt = ggml_get_type_traits(quant);
52915291

5292-
ggml_to_float_t dequant_fn = tt.to_float;
5292+
ggml_to_float_t dequant_fn = tt->to_float;
52935293

52945294
dequant_fn(from, to, ne);
52955295
}

ggml/src/ggml.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -729,7 +729,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float *
729729
static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
730730
static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
731731

732-
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
732+
static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
733733
[GGML_TYPE_I8] = {
734734
.type_name = "i8",
735735
.blck_size = 1,
@@ -1151,9 +1151,9 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
11511151
};
11521152

11531153
// For internal test use
1154-
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
1154+
const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
11551155
GGML_ASSERT(type < GGML_TYPE_COUNT);
1156-
return type_traits[type];
1156+
return &type_traits[type];
11571157
}
11581158

11591159
//

pocs/vdot/q8dot.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ int main(int argc, char** argv) {
136136

137137
auto ggml_type = type == 0 ? GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1;
138138

139-
auto funcs = ggml_internal_get_type_traits(ggml_type);
139+
const auto * funcs = ggml_get_type_traits(ggml_type);
140140

141141
Stat simple, ggml;
142142

@@ -156,8 +156,8 @@ int main(int argc, char** argv) {
156156

157157
t1 = std::chrono::high_resolution_clock::now();
158158
float fs;
159-
if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
160-
else funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
159+
if (type == 0) funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
160+
else funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
161161
t2 = std::chrono::high_resolution_clock::now();
162162
t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
163163
if (iloop > 3) ggml.addResult(fs, t);

pocs/vdot/vdot.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@ int main(int argc, char** argv) {
236236
int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
237237
int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);
238238

239-
auto funcs = useQ4_1 ? ggml_internal_get_type_traits(GGML_TYPE_Q4_1) : ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
239+
const auto * funcs = useQ4_1 ? ggml_get_type_traits(GGML_TYPE_Q4_1) : ggml_get_type_traits(GGML_TYPE_Q4_0);
240240

241241
std::vector<block_q4_0> q40;
242242
std::vector<block_q4_1> q41;
@@ -261,9 +261,9 @@ int main(int argc, char** argv) {
261261
// Note, we do not include this in the timing as in practical application
262262
// we already have the quantized model weights.
263263
if (useQ4_1) {
264-
funcs.from_float(x1.data(), q41.data(), kVecSize);
264+
funcs->from_float(x1.data(), q41.data(), kVecSize);
265265
} else {
266-
funcs.from_float(x1.data(), q40.data(), kVecSize);
266+
funcs->from_float(x1.data(), q40.data(), kVecSize);
267267
}
268268

269269
// Now measure time the dot product needs using the "scalar" version above
@@ -282,10 +282,10 @@ int main(int argc, char** argv) {
282282
dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
283283
}
284284
else {
285-
auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type);
286-
vdot.from_float(y1.data(), q8.data(), kVecSize);
287-
if (useQ4_1) funcs.vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
288-
else funcs.vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
285+
const auto * vdot = ggml_get_type_traits(funcs->vec_dot_type);
286+
vdot->from_float(y1.data(), q8.data(), kVecSize);
287+
if (useQ4_1) funcs->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
288+
else funcs->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
289289
}
290290
sumq += result;
291291
t2 = std::chrono::high_resolution_clock::now();

src/llama.cpp

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17872,10 +17872,9 @@ static void llama_tensor_dequantize_internal(
1787217872
}
1787317873
float * f32_output = (float *) output.data();
1787417874

17875-
ggml_type_traits_t qtype;
17875+
const ggml_type_traits * qtype = ggml_get_type_traits(tensor->type);
1787617876
if (ggml_is_quantized(tensor->type)) {
17877-
qtype = ggml_internal_get_type_traits(tensor->type);
17878-
if (qtype.to_float == NULL) {
17877+
if (qtype->to_float == NULL) {
1787917878
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
1788017879
}
1788117880
} else if (tensor->type != GGML_TYPE_F16 &&
@@ -17889,7 +17888,7 @@ static void llama_tensor_dequantize_internal(
1788917888
} else if (tensor->type == GGML_TYPE_BF16) {
1789017889
ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
1789117890
} else if (ggml_is_quantized(tensor->type)) {
17892-
qtype.to_float(tensor->data, f32_output, nelements);
17891+
qtype->to_float(tensor->data, f32_output, nelements);
1789317892
} else {
1789417893
GGML_ABORT("fatal error"); // unreachable
1789517894
}
@@ -17925,7 +17924,7 @@ static void llama_tensor_dequantize_internal(
1792517924
} else if (typ == GGML_TYPE_BF16) {
1792617925
ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
1792717926
} else {
17928-
qtype.to_float(inbuf, outbuf, nels);
17927+
qtype->to_float(inbuf, outbuf, nels);
1792917928
}
1793017929
};
1793117930
workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);

tests/test-backend-ops.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
133133
std::vector<uint8_t> buf(ggml_nbytes(t));
134134
ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
135135

136-
ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
136+
const auto * tt = ggml_get_type_traits(t->type);
137137
size_t bs = ggml_blck_size(t->type);
138138
std::vector<float> vq(ggml_blck_size(t->type));
139139
bool quantized = ggml_is_quantized(t->type);
@@ -159,7 +159,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
159159
} else if (t->type == GGML_TYPE_I8) {
160160
tv.push_back((float)*(int8_t *) &buf[i]);
161161
} else if (quantized) {
162-
tt.to_float(&buf[i], vq.data(), bs);
162+
tt->to_float(&buf[i], vq.data(), bs);
163163
tv.insert(tv.end(), vq.begin(), vq.end());
164164
} else {
165165
GGML_ABORT("fatal error");

tests/test-quantize-fns.cpp

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -44,26 +44,26 @@ static float array_rmse(const float * a1, const float * a2, size_t n) {
4444
}
4545

4646
// Total quantization error on test data
47-
static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
47+
static float total_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
4848
std::vector<uint8_t> tmp_q(2*test_size);
4949
std::vector<float> tmp_out(test_size);
5050

51-
qfns.from_float(test_data, tmp_q.data(), test_size);
52-
qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
51+
qfns->from_float(test_data, tmp_q.data(), test_size);
52+
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
5353
return array_rmse(test_data, tmp_out.data(), test_size);
5454
}
5555

5656
// Total quantization error on test data
57-
static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
57+
static float reference_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
5858
std::vector<uint8_t> tmp_q(2*test_size);
5959
std::vector<float> tmp_out(test_size);
6060
std::vector<float> tmp_out_ref(test_size);
6161

62-
qfns.from_float(test_data, tmp_q.data(), test_size);
63-
qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
62+
qfns->from_float(test_data, tmp_q.data(), test_size);
63+
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
6464

65-
qfns.from_float_ref(test_data, tmp_q.data(), test_size);
66-
qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
65+
qfns->from_float_ref(test_data, tmp_q.data(), test_size);
66+
qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
6767

6868
return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
6969
}
@@ -78,18 +78,18 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) {
7878

7979
// Total dot product error
8080
static float dot_product_error(
81-
ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2
81+
const ggml_type_traits * qfns, size_t test_size, const float * test_data1, const float *test_data2
8282
) {
8383
std::vector<uint8_t> tmp_q1(2*test_size);
8484
std::vector<uint8_t> tmp_q2(2*test_size);
8585

86-
auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
86+
const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type);
8787

88-
qfns.from_float(test_data1, tmp_q1.data(), test_size);
89-
vdot.from_float(test_data2, tmp_q2.data(), test_size);
88+
qfns->from_float(test_data1, tmp_q1.data(), test_size);
89+
vdot->from_float(test_data2, tmp_q2.data(), test_size);
9090

9191
float result = INFINITY;
92-
qfns.vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
92+
qfns->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
9393

9494
const float dot_ref = dot_product(test_data1, test_data2, test_size);
9595

@@ -131,10 +131,10 @@ int main(int argc, char * argv[]) {
131131

132132
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
133133
ggml_type type = (ggml_type) i;
134-
ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
134+
const auto * qfns = ggml_get_type_traits(type);
135135

136136
// deprecated - skip
137-
if (qfns.blck_size == 0) {
137+
if (qfns->blck_size == 0) {
138138
continue;
139139
}
140140

@@ -143,7 +143,7 @@ int main(int argc, char * argv[]) {
143143
printf("Testing %s\n", ggml_type_name((ggml_type) i));
144144
ggml_quantize_init(ei);
145145

146-
if (qfns.from_float && qfns.to_float) {
146+
if (qfns->from_float && qfns->to_float) {
147147
const float total_error = total_quantization_error(qfns, test_size, test_data.data());
148148
const float max_quantization_error =
149149
type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :

0 commit comments

Comments
 (0)