Merged

110 commits
82eed2b
TP mamba
jlamypoirier Jul 21, 2025
4e310c7
TP mamba
jlamypoirier Jul 22, 2025
3cc4118
fix
jlamypoirier Jul 22, 2025
9f7f75c
fix
jlamypoirier Jul 22, 2025
4054e04
fixes
jlamypoirier Jul 23, 2025
0014cc6
fix
jlamypoirier Jul 23, 2025
47ad548
fixes
jlamypoirier Jul 23, 2025
6a074fa
fixes
jlamypoirier Jul 23, 2025
d66651f
Update external
jlamypoirier Jul 23, 2025
50083ba
SSM debugging
jlamypoirier Jul 24, 2025
5006328
Merge branch 'main' into tp_mamba
jlamypoirier Jul 24, 2025
13176bd
Merge branch 'debug_mamba' into tp_mamba
jlamypoirier Jul 24, 2025
7b32699
stuff
jlamypoirier Jul 24, 2025
73f591f
Merge branch 'debug_mamba' into tp_mamba
jlamypoirier Jul 24, 2025
1feccc8
stuff
jlamypoirier Jul 24, 2025
e528b50
misc
jlamypoirier Jul 24, 2025
b49c42f
misc
jlamypoirier Jul 24, 2025
bb4dcd9
Merge branch 'debug_mamba' into tp_mamba
jlamypoirier Jul 24, 2025
c1b7f44
misc
jlamypoirier Jul 24, 2025
31f5d41
misc
jlamypoirier Jul 24, 2025
051bb07
Merge branch 'debug_mamba' into tp_mamba
jlamypoirier Jul 24, 2025
0a9ff25
misc
jlamypoirier Jul 24, 2025
e7d9636
Parallel discrete mamba 2
jlamypoirier Jul 24, 2025
c14b764
Mamba 2, misc
jlamypoirier Jul 25, 2025
b605bd2
doc
jlamypoirier Jul 25, 2025
5eea938
fix
jlamypoirier Jul 28, 2025
0a3e2a7
Merge branch 'debug_mamba' into tp_mamba
jlamypoirier Jul 28, 2025
2e6d082
fixes
jlamypoirier Jul 28, 2025
b6c8613
misc
jlamypoirier Jul 28, 2025
f0c04cf
Merge remote-tracking branch 'origin/main' into debug_mamba
jlamypoirier Jul 28, 2025
acdfab1
Merge branch 'debug_mamba' into tp_mamba
jlamypoirier Jul 28, 2025
e536af9
Concatenated dim
jlamypoirier Jul 28, 2025
017f5cc
fixes
jlamypoirier Jul 28, 2025
93e4c94
Merge branch 'concatenated_dim' into tp_mamba
jlamypoirier Jul 28, 2025
c41efc2
doc
jlamypoirier Jul 28, 2025
0b8bd5d
cleanup
jlamypoirier Jul 28, 2025
02f8af5
Block interface
jlamypoirier Jul 29, 2025
6bf06d6
fix
jlamypoirier Jul 29, 2025
2ddc3a7
fix
jlamypoirier Jul 29, 2025
c0f1597
Merge branch 'concatenated_dim' into tp_mamba
jlamypoirier Jul 29, 2025
b2f4476
Merge branch 'tp_mamba' into block_interface
jlamypoirier Jul 29, 2025
ce70b16
fixes
jlamypoirier Jul 29, 2025
a9f733d
fix
jlamypoirier Jul 29, 2025
cef7c15
fix
jlamypoirier Jul 30, 2025
a5eb076
stuff
jlamypoirier Jul 31, 2025
ab484ac
Revert "stuff"
jlamypoirier Jul 31, 2025
b68d360
stuff
jlamypoirier Jul 31, 2025
82c9dbd
misc
jlamypoirier Jul 31, 2025
9fbb9ff
misc
jlamypoirier Jul 31, 2025
44df195
misc
jlamypoirier Jul 31, 2025
3bb03cb
misc
jlamypoirier Jul 31, 2025
98bae95
misc
jlamypoirier Jul 31, 2025
fd731ef
fixes
jlamypoirier Aug 1, 2025
f483321
fixes
jlamypoirier Aug 1, 2025
5a0eabc
Merge remote-tracking branch 'origin/main' into debug_mamba
jlamypoirier Aug 8, 2025
dd288df
Merge branch 'debug_mamba' into concatenated_dim
jlamypoirier Aug 8, 2025
defd6e0
Merge branch 'concatenated_dim' into tp_mamba
jlamypoirier Aug 8, 2025
8abf258
fixes
jlamypoirier Aug 8, 2025
c16c00f
Merge branch 'tp_mamba' into block_interface
jlamypoirier Aug 8, 2025
07c9211
stuff
jlamypoirier Aug 8, 2025
be99372
Merge branch 'main' into debug_mamba
jlamypoirier Aug 12, 2025
a505f3a
Merge branch 'debug_mamba' into concatenated_dim
jlamypoirier Aug 12, 2025
0cc859a
Merge remote-tracking branch 'origin/main' into concatenated_dim
jlamypoirier Aug 12, 2025
bd4ff0d
doc
jlamypoirier Aug 12, 2025
fd3307d
Merge branch 'concatenated_dim' into tp_mamba
jlamypoirier Aug 12, 2025
0e2e124
stuff
jlamypoirier Aug 12, 2025
0a5e458
Remove tensor space, fixes
jlamypoirier Aug 14, 2025
797bd73
stuff
jlamypoirier Aug 14, 2025
c0a3782
stuff
jlamypoirier Aug 15, 2025
e60ded4
stuff
jlamypoirier Aug 15, 2025
1483bcc
stuff
jlamypoirier Aug 15, 2025
4deb501
misc
jlamypoirier Aug 15, 2025
fc809e0
Misc, tests pass
jlamypoirier Aug 15, 2025
cdb6710
misc
jlamypoirier Aug 20, 2025
9ce72e0
Move files
jlamypoirier Aug 20, 2025
065b34f
misc
jlamypoirier Aug 20, 2025
4510b7b
misc
jlamypoirier Aug 20, 2025
9a2a7a2
Pr comments
jlamypoirier Aug 21, 2025
8c382a9
Cleanup
jlamypoirier Aug 21, 2025
019e43d
Cleanup
jlamypoirier Aug 21, 2025
3e0f3e5
Cleanup
jlamypoirier Aug 21, 2025
90a3c98
Merge branch 'tp_mamba' into block_interface
jlamypoirier Aug 21, 2025
39960ce
Cleanup
jlamypoirier Aug 21, 2025
1abdd19
fixes
jlamypoirier Aug 21, 2025
7c24292
fixes
jlamypoirier Aug 21, 2025
af2964b
fixes
jlamypoirier Aug 21, 2025
0e62f7d
Merge branch 'tp_mamba' into block_interface
jlamypoirier Aug 21, 2025
654aeeb
Fix merge
jlamypoirier Aug 21, 2025
3f4a8ba
fix
jlamypoirier Aug 27, 2025
9741ba0
stuff
jlamypoirier Aug 27, 2025
be69677
fixes
jlamypoirier Aug 27, 2025
82a70aa
Simplify bias options
jlamypoirier Aug 27, 2025
680980a
stuff
jlamypoirier Aug 29, 2025
3ef7860
Dynamic mlp and block layer creation
jlamypoirier Aug 29, 2025
ecad96b
stuff
jlamypoirier Sep 3, 2025
3fd092c
fix
jlamypoirier Sep 3, 2025
1a3497c
stuff
jlamypoirier Sep 3, 2025
b6e7fce
stuff
jlamypoirier Sep 4, 2025
4dfe2a4
stuff
jlamypoirier Sep 9, 2025
4185741
misc
jlamypoirier Sep 9, 2025
188587e
Merge branch 'main' into concatenated_dim
jlamypoirier Sep 17, 2025
e111509
Merge branch 'concatenated_dim' into tp_mamba
jlamypoirier Sep 17, 2025
95e0231
Merge branch 'tp_mamba' into block_interface
jlamypoirier Sep 17, 2025
e076c7a
Merge remote-tracking branch 'origin/main' into block_interface
jlamypoirier Sep 18, 2025
2315ac4
Merge branch 'block_interface' into block_interface_weight
jlamypoirier Sep 18, 2025
79356f7
Merge remote-tracking branch 'origin/main' into block_interface_weight
jlamypoirier Sep 18, 2025
e4198a6
Merge branch 'block_interface_weight' into block_interface_mixer_mlp_…
jlamypoirier Sep 18, 2025
7abf263
Merge branch 'block_interface_mixer_mlp_config' into block_interface_…
jlamypoirier Sep 18, 2025
bfc9f84
Merge branch 'block_interface_fine_grained' into block_interface_tflops
jlamypoirier Sep 18, 2025
4db4ccd
Merge remote-tracking branch 'origin/main' into block_interface_tflops
jlamypoirier Sep 18, 2025
2 changes: 1 addition & 1 deletion Megatron-LM
20 changes: 12 additions & 8 deletions examples/mistral.yaml
@@ -33,27 +33,31 @@ model:
rotary:
type: default
theta: 10000
num_attention_heads: 32
heads: 32
head_groups: 8
kv_channels: 128
head_size: 128
add_linear_biases: false
window_size: 4096
attention_dropout: 0.0
dropout: 0.0
mlp:
ffn_hidden_size: 14336
intermediate_size: 14336
add_linear_biases: false
gated: true
activation_type: silu
activation: silu
normalization:
type: rms_norm
epsilon: 1.0e-05
num_layers: 32
hidden_size: 4096
add_linear_biases: false
init_method_std: 0.009021
hidden_dropout: 0.0
dropout: 0.0
embeddings_layer:
vocab_size: 32000
dropout: 0.0
output_layer:
tied_weight: false
normalization:
type: rms_norm
epsilon: 1.0e-05
multi_stage:
zero_stage: 2
distributed:
8 changes: 6 additions & 2 deletions fast_llm/engine/base_model/base_model.py
@@ -6,7 +6,7 @@
import torch.nn

from fast_llm.config import Configurable
from fast_llm.engine.base_model.config import BaseModelConfig
from fast_llm.engine.base_model.config import BaseModelConfig, ResourceUsageConfig
from fast_llm.engine.distributed.config import DistributedConfig, PhaseType
from fast_llm.engine.distributed.distributed import Distributed
from fast_llm.tensor import ParameterMeta, TensorMeta
@@ -43,6 +43,9 @@ def forward(
) -> torch.Tensor:
pass

def get_compute_usage(self, input_: TensorMeta, kwargs: dict[str, typing.Any], config: ResourceUsageConfig) -> int:
raise NotImplementedError()


class Sequential(Layer):
def __init__(self, distributed_config: DistributedConfig):
@@ -94,7 +97,8 @@ def __init__(
distributed_config: DistributedConfig,
):
super().__init__(config, distributed_config)

for key, value in self.named_modules():
value.module_name = key
for key, value in self.named_parameters():
Assert.custom(isinstance, value, ParameterMeta)
# Rename to the parameter full name
12 changes: 12 additions & 0 deletions fast_llm/engine/base_model/config.py
@@ -63,3 +63,15 @@ def preprocess_meta(self, kwargs: dict[str, typing.Any]) -> None:
@abc.abstractmethod
def preprocess(self, batch: "torch.Tensor", kwargs: dict[str, typing.Any]) -> None:
pass


@dataclasses.dataclass
class ResourceUsageConfig:
# Disable to get usage for current GPU only
global_: bool = True
# Enable to get hardware compute, i.e. include redundant computations.
hardware: bool = False
# Number of forward passes. Typically 1, may be 2 with full activation recomputation.
forward: int = 1
# Number of backward passes. Typically 1 for training, 0 for inference.
backward: int = 1
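
For reference, here is a minimal sketch of how a concrete layer might implement the `get_compute_usage` hook introduced above against `ResourceUsageConfig`. The layer, its dimensions, and the FLOP formula are illustrative assumptions, not code from this PR; only the imports and the hook signature come from the diff.

import typing

from fast_llm.engine.base_model.config import ResourceUsageConfig
from fast_llm.tensor import TensorMeta


class HypotheticalLinearLayer:
    # Illustrative dense layer with an (input_size x output_size) weight matrix.
    def __init__(self, input_size: int, output_size: int):
        self._input_size = input_size
        self._output_size = output_size

    def get_compute_usage(self, input_: TensorMeta, kwargs: dict[str, typing.Any], config: ResourceUsageConfig) -> int:
        # Tokens seen by this layer, assuming TensorMeta exposes torch's numel().
        tokens = input_.numel() // self._input_size
        # 2 FLOPs per multiply-accumulate in the forward matmul.
        forward_flops = 2 * tokens * self._input_size * self._output_size
        # Backward needs roughly two matmuls (gradients w.r.t. input and weight).
        backward_flops = 2 * forward_flops
        # This toy example ignores config.global_ and config.hardware; a real layer would
        # use them to choose between per-GPU and global counts and to include redundant
        # hardware compute such as recomputation.
        return config.forward * forward_flops + config.backward * backward_flops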
16 changes: 14 additions & 2 deletions fast_llm/engine/config_utils/run.py
@@ -5,11 +5,11 @@

import yaml

from fast_llm.config import Config, Field, FieldHint, FieldVerboseLevel, config_class
from fast_llm.config import Config, Field, FieldHint, FieldVerboseLevel, check_field, config_class
from fast_llm.engine.config_utils.logging import TensorLogs, TensorLogsConfig, configure_logging
from fast_llm.engine.config_utils.runnable import RunnableConfig
from fast_llm.engine.distributed.config import DistributedConfig
from fast_llm.utils import log, set_global_variables
from fast_llm.utils import Assert, log, set_global_variables

if typing.TYPE_CHECKING:
from fast_llm.engine.distributed.distributed import Distributed
@@ -58,6 +58,12 @@ class RunConfig(Config):
desc="Global switch to use triton kernels for linear layers. These may be slightly slower than the defaults.",
hint=FieldHint.performance,
)
model_debug_level: int = Field(
default=0,
desc="Debugging level for the model, ex. for printing intermediate model states.",
hint=FieldHint.logging,
valid=check_field(Assert.geq, 0),
)

def _validate(self):
if self.experiment_dir is None:
@@ -204,15 +210,21 @@ def open_artifact(self, name: str, mode: str | None = "w", verbose=True) -> path
return path if mode is None else path.open(mode)

def __enter__(self):
from fast_llm.logging import set_model_debug_level

assert not self._is_running
global _run
_run = self
TensorLogs.reset(self._config.tensor_logs)
set_model_debug_level(self._config.model_debug_level)

def __exit__(self, exc_type, exc_val: OSError, exc_tb):
from fast_llm.logging import set_model_debug_level

assert self._is_running
global _run
self.save_logged_tensors("none")
set_model_debug_level(0)
_run = None


18 changes: 11 additions & 7 deletions fast_llm/engine/evaluation/evaluator.py
@@ -1,6 +1,8 @@
import abc
import dataclasses
import functools
import logging
import math
import time
import typing

@@ -203,12 +205,10 @@ def _evaluate_loss(
)
end_time = time.perf_counter()
time_per_iteration = (end_time - begin_time) / num_iters
model_tflops, hardware_tflops = self._multi_stage.get_tflops(
phase,
time_per_iteration,
self._batch_config.batch_size,
self._batch_config.sequence_length,
)

model_compute, hardware_compute = self._schedule.compute_usage
model_tflops = math.nan if model_compute is None else model_compute / time_per_iteration
hardware_tflops = math.nan if hardware_compute is None else hardware_compute / time_per_iteration
# TODO add other relevant eval metrics
metrics = {
"batch_size": self._batch_config.batch_size,
@@ -218,7 +218,7 @@ def _evaluate_loss(
"hardware_tflops": hardware_tflops,
"tokens_per_sec_per_gpu": (
(self._batch_config.sequence_length * self._batch_config.batch_size)
/ self._schedule._distributed.world_size
/ self._schedule._distributed_config.world_size
/ time_per_iteration
),
**get_and_reset_memory_usage_mib(),
@@ -240,6 +240,10 @@ def _get_data_iterator(
prefetch_factor=prefetch_factor,
)

@functools.cached_property
def compute_usage(self) -> tuple[int | None, int | None]:
return self._schedule.get_compute_usage(hardware=False), self._schedule.get_compute_usage(hardware=True)


# NOTE: This is not a standalone runnable; it's a submodule of Trainer used for code encapsulation.
class EvaluatorRunner:
8 changes: 1 addition & 7 deletions fast_llm/engine/multi_stage/multi_stage.py
@@ -1,4 +1,3 @@
import abc
import dataclasses
import logging
import typing
@@ -13,7 +12,7 @@
from fast_llm.engine.config_utils.data_type import DataType
from fast_llm.engine.config_utils.run import log_main_rank, log_model_parallel_main_rank
from fast_llm.engine.config_utils.tensor_dim import TensorDim
from fast_llm.engine.distributed.config import DistributedDim, DistributedDimNames, PhaseType
from fast_llm.engine.distributed.config import DistributedDim, DistributedDimNames
from fast_llm.engine.distributed.distributed import Distributed
from fast_llm.engine.multi_stage.config import FastLLMModelConfig, ShardName, StageMode
from fast_llm.engine.multi_stage.fsdp import FSDP
@@ -252,11 +251,6 @@ def setup(self, distributed: Distributed | None = None, mode: StageMode = StageM

self.train(self._mode.support_backward)

@abc.abstractmethod
def get_tflops(self, phase: PhaseType, elapsed_time_per_iteration, batch_size, sequence_length) -> tuple[int, int]:
# TODO: Do in model, automate/generalize, get other stats
pass

def _allocate_buffers(
self, buffer_meta: TensorMeta, sizes: list[int], name: str
) -> tuple[tuple[torch.Tensor, ...], int]:
13 changes: 13 additions & 0 deletions fast_llm/engine/multi_stage/stage.py
@@ -5,6 +5,7 @@
import torch

from fast_llm.core.distributed import check_parallel_match
from fast_llm.engine.base_model.config import ResourceUsageConfig
from fast_llm.engine.config_utils.run import log_pipeline_parallel_main_rank
from fast_llm.engine.distributed.distributed import Distributed
from fast_llm.engine.multi_stage.config import StageConfig, StageMode
@@ -81,6 +82,7 @@ def setup( # noqa

def forward_meta(self, input_: TensorMeta, kwargs: dict) -> TensorMeta:
# Store the meta inputs and outputs, for debugging only.
# TODO: Varies if there are multiple schedules.
self._meta_inputs, self._meta_outputs = [], []
# TODO: use layer.forward_meta
for layer in self._layers:
@@ -93,6 +95,17 @@ def forward_meta(self, input_: TensorMeta, kwargs: dict) -> TensorMeta:
self._meta_outputs.append(input_)
return input_

def get_compute_usage(self, input_: TensorMeta, kwargs: dict[str, typing.Any], config: ResourceUsageConfig) -> int:
total = 0
for layer in self._layers:
total += layer.get_compute_usage(input_, kwargs, config)
input_ = layer(
input_,
kwargs,
losses={},
)
return total

def forward(
self,
input_: torch.Tensor,
57 changes: 43 additions & 14 deletions fast_llm/engine/schedule/schedule.py
@@ -1,5 +1,6 @@
import abc
import dataclasses
import functools
import logging
import typing
import warnings
@@ -9,6 +10,7 @@
import torch.utils
import torch.utils.data

from fast_llm.engine.base_model.config import ResourceUsageConfig
from fast_llm.engine.distributed.config import DistributedConfig, PhaseType
from fast_llm.engine.multi_stage.multi_stage import MultiStageModel
from fast_llm.engine.schedule.config import BatchConfig, ScheduleConfig, StepType
@@ -127,12 +129,12 @@ def __init__(
self._multi_stage = multi_stage
self._batch_config = batch_config
self._schedule_config = schedule_config
self._distributed = distributed_config
self._distributed_config = distributed_config
self._num_stages = len(self._multi_stage.stages)
self._phase = phase
self._is_training = self._phase.is_training

if self._batch_config.num_inputs < self._distributed.pipeline_parallel:
if self._batch_config.num_inputs < self._distributed_config.pipeline_parallel:
warnings.warn("Not enough input to achieve true pipeline parallelism.")

# Setup the activation metas.
@@ -172,7 +174,7 @@ def iterate(self, pipeline_rank: int | None = None) -> typing.Iterator[Step]:
return iter(self._steps if pipeline_rank is None else self._device_steps[pipeline_rank])

def __iter__(self) -> typing.Iterator[Step]:
return self.iterate(self._distributed.pipeline_rank)
return self.iterate(self._distributed_config.pipeline_rank)

def __repr__(self) -> str:
return "Schedule with steps:\n" + "\n".join(
@@ -191,7 +193,7 @@ def get_step(
return self._step_map[(type_, stage, data_index)]

def _create_index(self) -> None:
self._device_steps: list[list[Step]] = [[] for _ in range(self._distributed.pipeline_parallel)]
self._device_steps: list[list[Step]] = [[] for _ in range(self._distributed_config.pipeline_parallel)]
self._step_map = {}
for i, step in enumerate(self._steps):
Assert.in_range(step.stage, 0, self._num_stages)
@@ -204,7 +206,7 @@ def _create_index(self) -> None:
step.global_index = i
# TODO: More configurable placement?

step.pipeline_rank = step.stage % self._distributed.pipeline_parallel
step.pipeline_rank = step.stage % self._distributed_config.pipeline_parallel
step.local_index = len(self._device_steps[step.pipeline_rank])
self._device_steps[step.pipeline_rank].append(step)
Assert.not_incl(map_index := step.map_index, self._step_map)
@@ -272,7 +274,7 @@ def _create_index(self) -> None:

def _setup_restore_steps(self, weight_buffer_indices: dict[int, int]) -> None:
for rank, device_steps in enumerate(self._device_steps):
if rank != self._distributed.pipeline_rank:
if rank != self._distributed_config.pipeline_rank:
# TODO: Make restore schedule for all ranks (need all buffer indices)
continue
buffer_contents, buffer_last_used = {}, {}
@@ -292,7 +294,7 @@ def _setup_reduce_steps(self, grad_buffer_indices: dict[int, int]) -> None:
if not self._is_training:
return
for rank, device_steps in enumerate(self._device_steps):
if rank != self._distributed.pipeline_rank:
if rank != self._distributed_config.pipeline_rank:
# TODO: Make restore schedule for all ranks (need all buffer indices)
continue
buffer_last_steps = {}
@@ -314,12 +316,12 @@ def _setup_reduce_steps(self, grad_buffer_indices: dict[int, int]) -> None:
for stage, count in enumerate(reduction_count):
assert (count > 0) == (
stage >= self._first_grad_stage
and (stage % self._distributed.pipeline_parallel == self._distributed.pipeline_rank)
and (stage % self._distributed_config.pipeline_parallel == self._distributed_config.pipeline_rank)
)

def _setup_timeline(self) -> None:
# TODO: Include network time
idx = [0] * self._distributed.pipeline_parallel
idx = [0] * self._distributed_config.pipeline_parallel
done = False
while not done:
done = True
@@ -380,11 +382,11 @@ def _setup_send_recv_steps(self) -> None:
recv_step.recv_event = torch.cuda.Event()

def _validate_send_recv_steps(self) -> None:
times = [0.0] * self._distributed.pipeline_parallel
idx = [0] * self._distributed.pipeline_parallel
recv_idx = [0] * self._distributed.pipeline_parallel
statuses = ["Ok"] * self._distributed.pipeline_parallel
recv_queues: list[list[Step | None]] = [[] for _ in range(self._distributed.pipeline_parallel)]
times = [0.0] * self._distributed_config.pipeline_parallel
idx = [0] * self._distributed_config.pipeline_parallel
recv_idx = [0] * self._distributed_config.pipeline_parallel
statuses = ["Ok"] * self._distributed_config.pipeline_parallel
recv_queues: list[list[Step | None]] = [[] for _ in range(self._distributed_config.pipeline_parallel)]
done = False
while not done:
done = True
@@ -519,3 +521,30 @@ def _create_steps(self) -> tuple[list[Step], int]:
)
)
return steps, first_grad_stage

def get_compute_usage(
self,
global_: bool = True,
hardware: bool = False,
) -> int | None:
total = 0
try:
for step in self._steps if global_ else self._device_steps[self._distributed_config.pipeline_rank]:
if step.type_ == StepType.forward:
total += self._multi_stage.stages[step.stage].get_compute_usage(
step.meta_input,
step.meta_kwargs,
ResourceUsageConfig(
global_=global_,
hardware=hardware,
forward=1,
backward=int(self._is_training),
),
)
return total
except NotImplementedError:
return None

@functools.cached_property
def compute_usage(self) -> tuple[int | None, int | None]:
return self.get_compute_usage(True, False), self.get_compute_usage(True, True)
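
As a usage note, a minimal sketch of how a caller might query the new schedule-level API, mirroring the evaluator change earlier in this diff; the `schedule` object and `time_per_iteration` value are assumed to exist and are illustrative, not part of this PR:

import math

# Model compute counts the useful work once; hardware compute also includes redundant
# work such as recomputation (see ResourceUsageConfig above).
model_compute = schedule.get_compute_usage(global_=True, hardware=False)
hardware_compute = schedule.get_compute_usage(global_=True, hardware=True)

# None means some layer does not implement get_compute_usage; the evaluator then reports NaN.
model_tflops = math.nan if model_compute is None else model_compute / time_per_iteration
hardware_tflops = math.nan if hardware_compute is None else hardware_compute / time_per_iteration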