@@ -373,18 +373,22 @@ def _try_schedule_encoder_inputs(
             if self.encoder_cache_manager.has_cache(request, i):
                 # The encoder input is already computed and cached.
                 continue
-            if not self.encoder_cache_manager.can_allocate(request, i):
-                # The encoder cache is full. We can only schedule the decoder
-                # tokens just before the encoder input.
-                num_new_tokens = start_pos - num_computed_tokens
-                break
-            if num_encoder_tokens > encoder_budget:
-                # The encoder budget is exhausted. We can only schedule the
-                # decoder tokens up until the encoder input.
-                # NOTE(woosuk): We assume that the encoder tokens should be
-                # processed altogether, as the encoder usually uses
+            if (not self.encoder_cache_manager.can_allocate(request, i)
+                    or num_encoder_tokens > encoder_budget):
+                # The encoder cache is full or the encoder budget is exhausted.
+                # NOTE(woosuk): We assume that the encoder input tokens should
+                # be processed altogether, as the encoder usually uses
                 # bidirectional attention.
-                num_new_tokens = start_pos - num_computed_tokens
+                if num_computed_tokens < start_pos:
+                    # We only schedule the decoder tokens just before the
+                    # encoder input.
+                    num_new_tokens = start_pos - num_computed_tokens
+                else:
+                    # Because of prefix caching, num_computed_tokens is greater
+                    # than start_pos even though its encoder input is not
+                    # available. In this case, we can't schedule any token for
+                    # the request in this step.
+                    num_new_tokens = 0
                 break
 
             encoder_budget -= num_encoder_tokens
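For reference, a minimal standalone sketch of the decision the new branch makes when the encoder input at `start_pos` cannot be processed (the helper `tokens_schedulable_without_encoder` is hypothetical and only mirrors the logic in the diff, not vLLM's actual API):

```python
def tokens_schedulable_without_encoder(start_pos: int,
                                       num_computed_tokens: int) -> int:
    # Hypothetical helper illustrating the branch added in the diff above.
    if num_computed_tokens < start_pos:
        # Only the decoder tokens preceding the encoder input can be scheduled.
        return start_pos - num_computed_tokens
    # With prefix caching, computation may already have advanced to or past the
    # encoder input's position even though its encoder output is unavailable;
    # in that case nothing can be scheduled for this request in this step.
    return 0

# Example: the encoder input starts at token position 100.
assert tokens_schedulable_without_encoder(100, 40) == 60   # schedule 60 decoder tokens
assert tokens_schedulable_without_encoder(100, 120) == 0   # prefix-cached past start_pos
```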