[BugFix] Fix assert batch_descriptor.num_tokens == num_tokens_padded (#30173)
```diff
@@ -40,16 +40,18 @@ def _run_ar(
     should_dp_pad: bool,
     orig_num_tokens_per_ubatch: int,
     padded_num_tokens_per_ubatch: int,
+    cudagraph_mode: int,
     parallel_config: ParallelConfig,
 ) -> torch.Tensor:
     dp_size = parallel_config.data_parallel_size
     dp_rank = parallel_config.data_parallel_rank
     device, group = _get_device_and_group(parallel_config)
-    tensor = torch.zeros(4, dp_size, device=device, dtype=torch.int32)
+    tensor = torch.zeros(5, dp_size, device=device, dtype=torch.int32)
     tensor[0][dp_rank] = orig_num_tokens_per_ubatch
     tensor[1][dp_rank] = padded_num_tokens_per_ubatch
     tensor[2][dp_rank] = 1 if should_ubatch else 0
     tensor[3][dp_rank] = 1 if should_dp_pad else 0
+    tensor[4][dp_rank] = cudagraph_mode
     dist.all_reduce(tensor, group=group)
     return tensor
```
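The hunk above grows the per-rank sync tensor from 4 to 5 rows so that `cudagraph_mode` rides along with the existing all-reduce. A minimal sketch of the gather semantics, using plain torch with the distributed sum all-reduce simulated by adding the per-rank tensors together (the rank count and values below are made up for illustration):

```python
import torch

dp_size = 2
per_rank_values = [
    # (orig_tokens, padded_tokens, should_ubatch, should_dp_pad, cudagraph_mode)
    (8, 16, 1, 1, 2),  # rank 0: FULL cudagraph
    (8, 16, 1, 1, 0),  # rank 1: NONE (e.g. batch fell outside capture sizes)
]

tensors = []
for dp_rank, vals in enumerate(per_rank_values):
    # Each rank zero-fills the (5, dp_size) tensor and writes only its column.
    t = torch.zeros(5, dp_size, dtype=torch.int32)
    for row, v in enumerate(vals):
        t[row][dp_rank] = v
    tensors.append(t)

# dist.all_reduce with the default SUM op leaves this on every rank:
reduced = torch.stack(tensors).sum(dim=0)
print(reduced[4])  # row 4 holds per-rank cudagraph_mode: tensor([2, 0])
```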
```diff
@@ -89,13 +91,23 @@ def _post_process_dp_padding(tensor: torch.Tensor, should_dp_pad: bool) -> torch
     return num_tokens_across_dp.cpu()
 
 
+def _post_process_cudagraph_mode(tensor: torch.Tensor) -> int:
+    """
+    Synchronize cudagraph_mode across DP ranks by taking the minimum.
+    If any rank has NONE (0), all ranks use NONE.
+    This ensures all ranks send consistent values (all padded or all unpadded).
+    """
+    return int(tensor[4, :].min().item())
+
+
 def _synchronize_dp_ranks(
     num_tokens_unpadded: int,
     num_tokens_padded: int,
     should_attempt_ubatching: bool,
     should_attempt_dp_padding: bool,
+    cudagraph_mode: int,
     parallel_config: ParallelConfig,
-) -> tuple[bool, torch.Tensor | None]:
+) -> tuple[bool, torch.Tensor | None, int]:
     """
     1. Decides if each DP rank is going to microbatch. Either all ranks
     run with microbatching or none of them do.
```
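The new `_post_process_cudagraph_mode` helper reduces row 4 of the all-reduced tensor with a min. A tiny sketch of those semantics, where the 0/1/2 encoding follows the docstring added later in this diff and the example row is made up:

```python
import torch

NONE, PIECEWISE, FULL = 0, 1, 2  # cudagraph_mode encoding per the docstring

# One entry per DP rank, as produced by the all-reduce above.
row4 = torch.tensor([FULL, NONE, FULL], dtype=torch.int32)

# If any rank reports NONE, every rank falls back to NONE, so all ranks
# make the same padded/unpadded decision.
synced = int(row4.min().item())
assert synced == NONE
```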
```diff
@@ -104,10 +116,13 @@ def _synchronize_dp_ranks(
     When running microbatched or if should_attempt_dp_padding is True, all
     ranks will be padded out so that the run with the same number of tokens
+    3. Synchronizes cudagraph_mode across ranks by taking the minimum.
     Returns: tuple[
         should_ubatch: Are all DP ranks going to microbatch
         num_tokens_after_padding: A tensor containing the total number of
         tokens per-microbatch for each DP rank including any DP padding.
+        synced_cudagraph_mode: The synchronized cudagraph mode (min across ranks)
     ]
     """
```
```diff
@@ -121,6 +136,7 @@ def _synchronize_dp_ranks(
         should_dp_pad=should_attempt_dp_padding,
         orig_num_tokens_per_ubatch=num_tokens_unpadded,
         padded_num_tokens_per_ubatch=num_tokens_padded,
+        cudagraph_mode=cudagraph_mode,
         parallel_config=parallel_config,
     )
```
```diff
@@ -148,7 +164,10 @@ def _synchronize_dp_ranks(
         should_dp_pad,
     )
 
-    return should_ubatch, num_tokens_after_padding
+    # Synchronize cudagraph_mode across ranks (take min)
+    synced_cudagraph_mode = _post_process_cudagraph_mode(tensor)
+
+    return should_ubatch, num_tokens_after_padding, synced_cudagraph_mode
 
 
 def coordinate_batch_across_dp(
```
```diff
@@ -159,7 +178,8 @@ def coordinate_batch_across_dp(
     num_tokens_padded: int | None = None,
     uniform_decode: bool | None = None,
     num_scheduled_tokens_per_request: np.ndarray | None = None,
-) -> tuple[bool, torch.Tensor | None]:
+    cudagraph_mode: int = 0,
+) -> tuple[bool, torch.Tensor | None, int]:
     """
     Coordinates amongst all DP ranks to determine if and how the full batch
     should be split into microbatches.
```
```diff
@@ -175,6 +195,7 @@ def coordinate_batch_across_dp(
         only contains single token decodes
     num_scheduled_tokens_per_request: Only used if allow_microbatching is True. The
         number of tokens per request.
+    cudagraph_mode: The cudagraph mode for this rank (0=NONE, 1=PIECEWISE, 2=FULL)
     Returns: tuple[
         ubatch_slices: if this is set then all DP ranks have agreed to
```
```diff
@@ -183,12 +204,13 @@ def coordinate_batch_across_dp(
         tokens per-microbatch for each DP rank including padding. Will be
         padded up to the max value across all DP ranks when allow_dp_padding
         is True.
+    synced_cudagraph_mode: The synchronized cudagraph mode (min across ranks)
     ]
     """
     if parallel_config.data_parallel_size == 1:
         # Early exit.
-        return False, None
+        return False, None, cudagraph_mode
 
     # If the caller has explicitly enabled microbatching.
     should_attempt_ubatching = False
```
```diff
@@ -204,12 +226,15 @@ def coordinate_batch_across_dp(
     if num_tokens_padded is None:
         num_tokens_padded = num_tokens_unpadded
 
-    (should_ubatch, num_tokens_after_padding) = _synchronize_dp_ranks(
-        num_tokens_unpadded,
-        num_tokens_padded,
-        should_attempt_ubatching,
-        allow_dp_padding,
-        parallel_config,
+    (should_ubatch, num_tokens_after_padding, synced_cudagraph_mode) = (
+        _synchronize_dp_ranks(
+            num_tokens_unpadded,
+            num_tokens_padded,
+            should_attempt_ubatching,
+            allow_dp_padding,
+            cudagraph_mode,
+            parallel_config,
+        )
     )
 
-    return (should_ubatch, num_tokens_after_padding)
+    return (should_ubatch, num_tokens_after_padding, synced_cudagraph_mode)
```
`coordinate_batch_across_dp` now returns three values including the synchronized cudagraph mode (return at line 240), but callers such as `set_forward_context` in forward_context.py (around lines 295–300) and `eagle._pad_batch_across_dp` in v1/spec_decode/eagle.py (around lines 1261–1269) still unpack only two items. In multi-DP runs where these paths invoke `coordinate_batch_across_dp`, Python will raise a `ValueError` (too many values to unpack).
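A self-contained sketch of the failure mode this comment describes; `coordinate_batch_across_dp` is stubbed here, and the stub's return values are illustrative only:

```python
def coordinate_batch_across_dp_stub():
    # New behavior: three return values instead of two.
    should_ubatch, num_tokens_after_padding, synced_cudagraph_mode = False, None, 0
    return should_ubatch, num_tokens_after_padding, synced_cudagraph_mode

# Old-style two-value unpacking of the new three-tuple fails:
try:
    should_ubatch, num_tokens_after_padding = coordinate_batch_across_dp_stub()
except ValueError as e:
    print(e)  # too many values to unpack (expected 2)

# Updated callers would unpack all three values:
should_ubatch, num_tokens_after_padding, synced_cudagraph_mode = (
    coordinate_batch_across_dp_stub()
)
```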
The use of magic numbers `0`, `1`, `2`, `3`, `4` for indexing into the `tensor` makes the code hard to read and maintain. It's not immediately clear what each index represents without looking at the surrounding code or comments. This pattern is also present in `_post_process_cudagraph_mode` with `tensor[4, :]`. This can lead to bugs if the order or size of the tensor changes.

I recommend defining these indices as constants at the module level, for example, using an `Enum`. This would make the code self-documenting and less error-prone across all functions that use this tensor (`_run_ar`, `_post_process_ubatch`, `_post_process_dp_padding`, `_post_process_cudagraph_mode`). For example:
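The snippet originally attached to this comment did not survive extraction; a plausible reconstruction using `enum.IntEnum` follows (the `DPSync` name and `CUDAGRAPH_MODE` member are taken from the comment's closing sentence, the other member names are assumptions):

```python
from enum import IntEnum

class DPSync(IntEnum):
    """Row indices of the (5, dp_size) DP synchronization tensor."""
    ORIG_NUM_TOKENS_PER_UBATCH = 0
    PADDED_NUM_TOKENS_PER_UBATCH = 1
    SHOULD_UBATCH = 2
    SHOULD_DP_PAD = 3
    CUDAGRAPH_MODE = 4  # IntEnum members index tensors like plain ints
```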
Then you could use `tensor[DPSync.CUDAGRAPH_MODE]` instead of `tensor[4]`.