@@ -1023,6 +1023,9 @@ struct llama_kv_cache {
     uint32_t head = 0;
     uint32_t size = 0;

+    // largest index of an occupied cell (used for a basic optimization heuristic)
+    uint32_t cell_max = 0;
+
     std::vector<llama_kv_cell> cells;

     struct ggml_tensor * k = NULL;
@@ -1226,6 +1229,8 @@ static bool llama_kv_cache_init(
     cache.head = 0;
     cache.size = n_ctx;

+    cache.cell_max = 0;
+
     cache.cells.clear();
     cache.cells.resize(n_ctx);

@@ -1311,6 +1316,16 @@ static bool llama_kv_cache_find_slot(
     return true;
 }

+void llama_kv_cache_update_cell_max(struct llama_kv_cache & cache) {
+    cache.cell_max = 0;
+
+    for (uint32_t i = 0; i < cache.size; i++) {
+        if (cache.cells[i].pos >= 0) {
+            cache.cell_max = i + 1;
+        }
+    }
+}
+
 void llama_kv_cache_clear(struct llama_kv_cache & cache, int32_t p0, int32_t p1) {
     cache.head = p0;

@@ -1321,6 +1336,8 @@ void llama_kv_cache_clear(struct llama_kv_cache & cache, int32_t p0, int32_t p1)
         cache.cells[i].pos = -1;
         cache.cells[i].seq_id.clear();
     }
+
+    llama_kv_cache_update_cell_max(cache);
 }

 //
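
To make the heuristic concrete, the following is a minimal standalone sketch (toy types such as toy_kv_cache rather than the real llama_kv_cache/llama_kv_cell) of what the cell_max scan above computes: one past the highest cell whose pos is non-negative, so gaps below that index do not shrink it.

// Standalone sketch (simplified toy types, not the real llama.cpp structs)
// of the cell_max scan: it ends up as "last occupied cell index + 1".
#include <cstdint>
#include <cstdio>
#include <vector>

struct toy_kv_cell  { int32_t pos = -1; };   // pos < 0 means "empty"
struct toy_kv_cache {
    uint32_t cell_max = 0;
    std::vector<toy_kv_cell> cells;
};

static void toy_update_cell_max(toy_kv_cache & cache) {
    cache.cell_max = 0;
    for (uint32_t i = 0; i < cache.cells.size(); i++) {
        if (cache.cells[i].pos >= 0) {
            cache.cell_max = i + 1;          // highest occupied index + 1
        }
    }
}

int main() {
    toy_kv_cache cache;
    cache.cells.resize(8);
    cache.cells[0].pos = 0;
    cache.cells[1].pos = 1;
    cache.cells[5].pos = 2;                  // holes at 2..4 do not matter

    toy_update_cell_max(cache);
    printf("cell_max = %u\n", cache.cell_max);   // prints 6
    return 0;
}
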
@@ -2547,6 +2564,7 @@ static struct ggml_cgraph * llm_build_llama(
     const int n_gpu_layers = model.n_gpu_layers;

     const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = kv_self.cell_max + n_tokens;

     auto & buf_compute = lctx.buf_compute;

@@ -2621,17 +2639,27 @@ static struct ggml_cgraph * llm_build_llama(
     ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");

     // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_ctx, n_tokens, 1);
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
     ggml_allocr_alloc(lctx.alloc, KQ_mask);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
         float * data = (float *) KQ_mask->data;
         memset(data, 0, ggml_nbytes(KQ_mask));

         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
-                for (int i = 0; i < n_ctx; ++i) {
-                    if (!kv_self.cells[i].has_seq_id(batch.seq_id[j]) || kv_self.cells[i].pos > batch.pos[j]) {
-                        data[h*(n_ctx*n_tokens) + j*n_ctx + i] = -INFINITY;
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+
+                // TODO: temporary heuristic verification - if this fails then there is a bug with cell_max computation
+                for (int i = n_kv; i < n_ctx; ++i) {
+                    if (kv_self.cells[i].has_seq_id(seq_id) && kv_self.cells[i].pos >= 0) {
+                        GGML_ASSERT(false && "cell_max is too small - this might indicate a bug");
                     }
                 }
             }
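
A rough standalone illustration of the mask indexing above (toy values, a single sequence, and the seq_id check and the h loop dropped for brevity): each query token j owns one row of n_kv entries in a row-major [n_tokens x n_kv] block, and an entry becomes -INFINITY when its cell holds a position later than the token's own.

// Sketch of the KQ_mask layout: one row of n_kv scores per query token,
// with -INFINITY marking cells the token must not attend to.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int n_kv     = 6;   // plays the role of cell_max + n_tokens
    const int n_tokens = 2;

    // toy cache positions: cells 0..3 hold positions 0..3, cells 4..5 are the
    // slots the new batch occupies (positions 4 and 5)
    std::vector<int>   cell_pos = { 0, 1, 2, 3, 4, 5 };
    std::vector<float> mask(n_kv * n_tokens, 0.0f);

    for (int j = 0; j < n_tokens; ++j) {
        const int pos = 4 + j;                       // position of query token j
        for (int i = 0; i < n_kv; ++i) {
            if (cell_pos[i] > pos) {
                mask[j*n_kv + i] = -INFINITY;        // mask future cells
            }
        }
    }

    for (int j = 0; j < n_tokens; ++j) {
        for (int i = 0; i < n_kv; ++i) {
            printf("%s ", std::isinf(mask[j*n_kv + i]) ? "-inf" : "0");
        }
        printf("\n");
    }
    return 0;
}
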
@@ -2725,7 +2753,7 @@ static struct ggml_cgraph * llm_build_llama(

             struct ggml_tensor * K =
                 ggml_view_3d(ctx0, kv_self.k,
-                        n_embd_head, n_ctx, n_head_kv,
+                        n_embd_head, n_kv, n_head_kv,
                         ggml_element_size(kv_self.k)*n_embd_gqa,
                         ggml_element_size(kv_self.k)*n_embd_head,
                         ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -2738,7 +2766,7 @@ static struct ggml_cgraph * llm_build_llama(
             ggml_set_name(KQ, "KQ");

             // KQ_scaled = KQ / sqrt(n_embd_head)
-            // KQ_scaled shape [n_ctx, n_tokens, n_head, 1]
+            // KQ_scaled shape [n_kv, n_tokens, n_head, 1]
             struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
             offload_func_kq(KQ_scaled);
             ggml_set_name(KQ_scaled, "KQ_scaled");
@@ -2756,7 +2784,7 @@ static struct ggml_cgraph * llm_build_llama(
             // split cached V into n_head heads
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
-                        n_ctx, n_embd_head, n_head_kv,
+                        n_kv, n_embd_head, n_head_kv,
                         ggml_element_size(kv_self.v)*n_ctx,
                         ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
                         ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
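
One reading of the K/V views above, illustrated below with a plain float buffer (a simplified sketch, not the real cache element type): only the extents passed to ggml_view_3d shrink from n_ctx to n_kv, while the byte strides and per-layer offsets still use n_ctx, because the underlying buffers remain laid out for the full context and the narrower view just walks a shorter prefix of each row.

// Sketch: shrinking the extent to n_kv does not move any element; the view
// reads the first n_kv entries of each row using the original n_ctx stride.
#include <cstdio>
#include <vector>

int main() {
    const int n_ctx       = 8;
    const int n_kv        = 5;   // plays the role of cell_max + n_tokens
    const int n_embd_head = 3;

    // toy V slice laid out like the cache: one row of n_ctx values per
    // embedding component
    std::vector<float> v(n_embd_head * n_ctx);
    for (int e = 0; e < n_embd_head; ++e) {
        for (int i = 0; i < n_ctx; ++i) {
            v[e*n_ctx + i] = 100.0f*e + i;
        }
    }

    // "view" of extent [n_kv x n_embd_head]: row stride stays n_ctx
    for (int e = 0; e < n_embd_head; ++e) {
        for (int i = 0; i < n_kv; ++i) {
            printf("%6.1f ", v[e*n_ctx + i]);   // same addresses as the full view
        }
        printf("\n");
    }
    return 0;
}
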
@@ -2901,6 +2929,7 @@ static struct ggml_cgraph * llm_build_baichaun(
     const int n_gpu_layers = model.n_gpu_layers;

     const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = kv_self.cell_max + n_tokens;

     auto & buf_compute = lctx.buf_compute;

@@ -2975,17 +3004,27 @@ static struct ggml_cgraph * llm_build_baichaun(
     ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");

     // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_ctx, n_tokens, 1);
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
     ggml_allocr_alloc(lctx.alloc, KQ_mask);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
         float * data = (float *) KQ_mask->data;
         memset(data, 0, ggml_nbytes(KQ_mask));

         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
-                for (int i = 0; i < n_ctx; ++i) {
-                    if (!kv_self.cells[i].has_seq_id(batch.seq_id[j]) || kv_self.cells[i].pos > batch.pos[j]) {
-                        data[h*(n_ctx*n_tokens) + j*n_ctx + i] = -INFINITY;
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+
+                // TODO: temporary heuristic verification - if this fails then there is a bug with cell_max computation
+                for (int i = n_kv; i < n_ctx; ++i) {
+                    if (kv_self.cells[i].has_seq_id(seq_id) && kv_self.cells[i].pos >= 0) {
+                        GGML_ASSERT(false && "cell_max is too small - this might indicate a bug");
                     }
                 }
             }
@@ -3092,7 +3131,7 @@ static struct ggml_cgraph * llm_build_baichaun(

             struct ggml_tensor * K =
                 ggml_view_3d(ctx0, kv_self.k,
-                        n_embd_head, n_ctx, n_head_kv,
+                        n_embd_head, n_kv, n_head_kv,
                         ggml_element_size(kv_self.k)*n_embd_gqa,
                         ggml_element_size(kv_self.k)*n_embd_head,
                         ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -3135,7 +3174,7 @@ static struct ggml_cgraph * llm_build_baichaun(
             // split cached V into n_head heads
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
-                        n_ctx, n_embd_head, n_head_kv,
+                        n_kv, n_embd_head, n_head_kv,
                         ggml_element_size(kv_self.v)*n_ctx,
                         ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
                         ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
@@ -3272,6 +3311,7 @@ static struct ggml_cgraph * llm_build_falcon(
     const int n_gpu_layers = model.n_gpu_layers;

     const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = kv_self.cell_max + n_tokens;

     auto & buf_compute = lctx.buf_compute;

@@ -3346,17 +3386,27 @@ static struct ggml_cgraph * llm_build_falcon(
     ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");

     // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_ctx, n_tokens, 1);
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
     ggml_allocr_alloc(lctx.alloc, KQ_mask);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
         float * data = (float *) KQ_mask->data;
         memset(data, 0, ggml_nbytes(KQ_mask));

         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
-                for (int i = 0; i < n_ctx; ++i) {
-                    if (!kv_self.cells[i].has_seq_id(batch.seq_id[j]) || kv_self.cells[i].pos > batch.pos[j]) {
-                        data[h*(n_ctx*n_tokens) + j*n_ctx + i] = -INFINITY;
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+
+                // TODO: temporary heuristic verification - if this fails then there is a bug with cell_max computation
+                for (int i = n_kv; i < n_ctx; ++i) {
+                    if (kv_self.cells[i].has_seq_id(seq_id) && kv_self.cells[i].pos >= 0) {
+                        GGML_ASSERT(false && "cell_max is too small - this might indicate a bug");
                     }
                 }
             }
@@ -3479,7 +3529,7 @@ static struct ggml_cgraph * llm_build_falcon(

             struct ggml_tensor * K =
                 ggml_view_3d(ctx0, kv_self.k,
-                        n_embd_head, n_ctx, n_head_kv,
+                        n_embd_head, n_kv, n_head_kv,
                         ggml_element_size(kv_self.k)*n_embd_gqa,
                         ggml_element_size(kv_self.k)*n_embd_head,
                         ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -3504,7 +3554,7 @@ static struct ggml_cgraph * llm_build_falcon(

             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
-                        n_ctx, n_embd_head, n_head_kv,
+                        n_kv, n_embd_head, n_head_kv,
                         ggml_element_size(kv_self.v)*n_ctx,
                         ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
                         ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
@@ -3598,6 +3648,7 @@ static struct ggml_cgraph * llm_build_starcoder(
     const float norm_eps = hparams.f_norm_eps;

     const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = kv_self.cell_max + n_tokens;

     auto & buf_compute = lctx.buf_compute;

@@ -3664,17 +3715,27 @@ static struct ggml_cgraph * llm_build_starcoder(
     ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");

     // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_ctx, n_tokens, 1);
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
     ggml_allocr_alloc(lctx.alloc, KQ_mask);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
         float * data = (float *) KQ_mask->data;
         memset(data, 0, ggml_nbytes(KQ_mask));

         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
-                for (int i = 0; i < n_ctx; ++i) {
-                    if (!kv_self.cells[i].has_seq_id(batch.seq_id[j]) || kv_self.cells[i].pos > batch.pos[j]) {
-                        data[h*(n_ctx*n_tokens) + j*n_ctx + i] = -INFINITY;
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+
+                // TODO: temporary heuristic verification - if this fails then there is a bug with cell_max computation
+                for (int i = n_kv; i < n_ctx; ++i) {
+                    if (kv_self.cells[i].has_seq_id(seq_id) && kv_self.cells[i].pos >= 0) {
+                        GGML_ASSERT(false && "cell_max is too small - this might indicate a bug");
                     }
                 }
             }
@@ -3727,7 +3788,7 @@ static struct ggml_cgraph * llm_build_starcoder(

             struct ggml_tensor * K =
                 ggml_view_3d(ctx0, kv_self.k,
-                        n_embd_head, n_ctx, n_head_kv,
+                        n_embd_head, n_kv, n_head_kv,
                         ggml_element_size(kv_self.k)*n_embd_gqa,
                         ggml_element_size(kv_self.k)*n_embd_head,
                         ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
@@ -3753,7 +3814,7 @@ static struct ggml_cgraph * llm_build_starcoder(
             // split cached V into n_head heads
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
-                        n_ctx, n_embd_head, n_head_kv,
+                        n_kv, n_embd_head, n_head_kv,
                         ggml_element_size(kv_self.v)*n_ctx,
                         ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
                         ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
@@ -3974,8 +4035,9 @@ static bool llama_eval_internal(
     ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
 #endif

-    // update the kv ring buffer head
-    lctx.kv_self.head += n_tokens;
+    // update the kv ring buffer
+    lctx.kv_self.head     += n_tokens;
+    lctx.kv_self.cell_max = std::max(lctx.kv_self.cell_max, lctx.kv_self.head);

 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
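
With toy numbers, the bookkeeping above behaves as in the sketch below (assuming the batch was written starting at the current head, as in plain sequential decoding):

// Sketch of the post-eval update: head advances past the batch and cell_max
// grows so the next graph's n_kv bound covers the newly written cells.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    uint32_t head     = 32;   // next free slot before this eval
    uint32_t cell_max = 32;   // cells 0..31 already hold tokens
    const uint32_t n_tokens = 4;

    head     += n_tokens;                     // 36: batch occupied cells 32..35
    cell_max  = std::max(cell_max, head);     // 36: mask/view bound grows too

    printf("head = %u, cell_max = %u\n", head, cell_max);
    return 0;
}
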
@@ -7040,6 +7102,9 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         }

         ctx->kv_self.head = kv_ntok;
+        ctx->kv_self.size = kv_size;
+
+        ctx->kv_self.cell_max = kv_ntok;
     }

     const size_t nread = inp - src;