Commit e3e1be8

added example for bidirectional checkpoint testing
1 parent 36ec547 commit e3e1be8

File tree

2 files changed, +205 -0 lines changed
Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
# Testing Checkpoint Conversion for Correctness

When converting checkpoints between file formats or model definitions, we need to ensure that the converted checkpoints are correct, i.e. that the model definition is preserved: the converted checkpoint's weights must produce the same outputs when loaded in the new, intended program context.

This guide provides a general framework for testing your conversion script for correctness. The example used here is bidirectional conversion between HuggingFace and `torchtitan`.

## Methods

### Sanity Check (Greedy Decode)

A quick way to sanity-check your conversion is to run greedy decoding inference on both the original and converted checkpoints and confirm that the outputs match. This method doesn't guarantee correctness, but it will very likely produce a fast **true negative** if the model definitions are not the same. For greedy decoding, the `generation/test_generate.py` script can be used.

Note that the model definition can be influenced by factors other than the correctness of the weight conversion. For example, using our verified `convert_to_hf.py` script and then running greedy decoding with HF `transformers` without a correct `config.json` will result in a **false negative**: our weights are correct, but the model definition is wrong because of `config.json`.
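
As an illustrative sketch (not one of the scripts in this repo), the snippet below compares greedy continuations directly with HF `transformers`, assuming the converted checkpoint directory is loadable by `from_pretrained` (i.e. has a valid `config.json`); the prompt, decode length, and paths are placeholders:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

original = "meta-llama/Meta-Llama-3-8B"       # reference model
converted = "outputs/checkpoint/step-0-tohf"  # converted checkpoint (placeholder path)

tokenizer = AutoTokenizer.from_pretrained(original)
prompt_ids = tokenizer("The capital of France is", return_tensors="pt").input_ids


def greedy_continuation(path):
    model = AutoModelForCausalLM.from_pretrained(path)
    with torch.no_grad():
        # do_sample=False -> greedy decoding: the argmax token is taken at every step
        output_ids = model.generate(prompt_ids, max_new_tokens=32, do_sample=False)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


# Matching continuations are necessary, but not sufficient, evidence of a correct conversion.
print(greedy_continuation(original))
print(greedy_continuation(converted))
```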

### Comprehensive Check (KL Divergence)

To ensure comprehensive end-to-end correctness, we recommend using a KL divergence loss to compare the logits from forward passes of the original and converted model definitions. KL divergence quantifies the "difference" between two probability distributions: a result of zero or a very low KL divergence indicates that the model definitions are equivalent. This check is crucial because it evaluates the entire probability distribution, not just the highest-probability token at each step.
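
For reference, the quantity being measured between two next-token distributions $P$ and $Q$ (the softmax of each model's logits over the vocabulary $V$) is the standard KL divergence (notation ours):

$$
D_{\mathrm{KL}}(P \,\|\, Q) = \sum_{v \in V} P(v)\,\log \frac{P(v)}{Q(v)}
$$

In the example script this is computed with `F.kl_div`, which takes log-probabilities as the input and probabilities as the target.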

In `./scripts/checkpoint_conversion/example.py`, this means running forward passes on DCP checkpoints loaded in `torchtitan` and on safetensors checkpoints loaded in HuggingFace `AutoModelForCausalLM`. We additionally compare conversions done without the permutation, to double-check that applying our permutation results in a lower KL divergence loss.

```
$ python ./scripts/checkpoint_conversion/example.py
Average loss for test from_hf is -4.951488641303202e-14
Average loss for test to_hf is -4.951488641303202e-14
Average loss for test from_hf_no_perm is 6.310602202574955e-06
Average loss for test to_hf_no_perm is 2.0396773834363557e-05
```
Lines changed: 180 additions & 0 deletions
@@ -0,0 +1,180 @@
import json
import os
import sys
from pathlib import Path
from typing import Optional

import torch

import torch.distributed.checkpoint as dcp
import torch.nn.functional as F
from torch.distributed.checkpoint import HuggingFaceStorageReader
from torchtitan.components.checkpoint import excluded_parameters_for_model_only
from torchtitan.config import ConfigManager
from torchtitan.protocols.train_spec import get_train_spec
from torchtitan.tools.logging import logger
from transformers import AutoModelForCausalLM

device_type = "cuda" if torch.cuda.is_available() else "cpu"


def loss_fn(logits1, logits2):
    # Convert logits to probabilities: kl_div expects log-probabilities as the
    # input and probabilities as the target
    probs1 = F.log_softmax(logits1, dim=-1)
    probs2 = F.softmax(logits2, dim=-1)

    # Calculate KL Divergence
    kl_loss = F.kl_div(probs1, probs2, reduction="mean")
    return kl_loss


@torch.no_grad
def forward_hf(model_name, model_path: Optional[str], input_ids):
    # Load the model from model_path if given, otherwise from model_name
    model_path = model_path if model_path else model_name
    model = AutoModelForCausalLM.from_pretrained(model_path)

    device = torch.device(device_type)
    model.to(device)

    # List to store outputs
    outputs_list = []

    for inputs in input_ids:
        inputs = inputs.to(device)
        # Greedy-decode a single new token; prompt_len is defined in __main__
        outputs = model.generate(
            inputs=inputs,
            max_length=prompt_len + 1,
            do_sample=False,
            output_logits=True,
            return_dict_in_generate=True,
        )

        # Stack the per-step logits into shape (num_new_tokens, batch, vocab)
        outputs = torch.stack(outputs.logits)
        outputs_list.append(outputs)

    del model
    torch.cuda.empty_cache()

    return outputs_list


@torch.no_grad
def forward_tt(config_path, checkpoint_path, test_set):

    config_manager = ConfigManager()
    config = config_manager.parse_args([f"--job.config_file={config_path}"])

    train_spec = get_train_spec(config.model.name)

    model_args = train_spec.model_args[config.model.flavor]
    model_args.update_from_config(config)

    model = train_spec.model_cls(model_args)

    # materialize model
    device = torch.device(device_type)
    model.to_empty(device=device)
    with torch.no_grad():
        model.init_weights()
    model.eval()

    state_dict = model.state_dict()
    for k in excluded_parameters_for_model_only:
        state_dict.pop(k, None)

    # Checkpoint Loading
    logger.info(f"Loading checkpoint at: {checkpoint_path}")
    load_from_hf = False
    for filename in os.listdir(checkpoint_path):
        if filename == "model.safetensors.index.json":
            load_from_hf = True
    if load_from_hf:
        # Convert keys to the HF layout, load the safetensors checkpoint, then
        # convert back. dcp.load fills hf_state_dict rather than the model's own
        # tensors, so the converted weights are copied into the model explicitly
        # (strict=False because the keys popped above are not present).
        sd_adapter = train_spec.state_dict_adapter
        hf_state_dict = sd_adapter.to_hf(state_dict)
        dcp.load(hf_state_dict, storage_reader=HuggingFaceStorageReader(path=checkpoint_path))
        state_dict = sd_adapter.from_hf(hf_state_dict)
        model.load_state_dict(state_dict, strict=False)
    else:
        # dcp.load writes in place into state_dict, whose tensors alias the model's parameters
        dcp.load(state_dict, checkpoint_id=checkpoint_path)

    output_list = []
    for prompt in test_set:
        input_ids = prompt.to(device_type)
        # ensure batch dimension (T,) --> (B, T)
        if input_ids.ndim == 1:
            input_ids = input_ids.unsqueeze(0)

        # obtains the logits of only the last token in the predictions
        predictions = model(input_ids)[:, -1, :].unsqueeze(1)
        output_list.append(predictions)

    del model
    torch.cuda.empty_cache()

    return output_list


if __name__ == "__main__":
    # hf params
    hf_model_name = "meta-llama/Meta-Llama-3-8B"
    hf_model_path = "outputs/checkpoint/step-0-tohf"
    hf_model_path_no_perm = "outputs/checkpoint/step-0-tohfnoperm"

    # tt params
    config_path = "torchtitan/models/llama3/train_configs/llama3_8b.toml"
    baseline_checkpoint_path = "outputs/checkpoint/step-0-fromllama"
    checkpoint_path = "outputs/checkpoint/step-0-fromhf"
    checkpoint_path_no_perm = "outputs/checkpoint/step-0-fromhfnoperm"

    # test params
    prompt_len = 8
    test_size = 100

    config_manager = ConfigManager()
    config = config_manager.parse_args([f"--job.config_file={config_path}"])
    train_spec = get_train_spec(config.model.name)
    tokenizer = train_spec.build_tokenizer_fn(config)

    # Build test set of randomly generated token ids
    test_set = [
        torch.randint(
            0,
            tokenizer.get_vocab_size(),
            (
                1,  # batch size
                prompt_len,
            ),
        )
        for _ in range(test_size)
    ]

    # baseline logits
    baseline_hf_outputs = forward_hf(hf_model_name, None, test_set)
    baseline_tt_outputs = forward_tt(config_path, baseline_checkpoint_path, test_set)

    # testing from hf script
    from_hf_outputs = forward_tt(config_path, checkpoint_path, test_set)
    from_hf_outputs_no_perm = forward_tt(config_path, checkpoint_path_no_perm, test_set)

    # testing to hf script
    to_hf_outputs = forward_hf(hf_model_name, hf_model_path, test_set)
    to_hf_outputs_no_perm = forward_hf(hf_model_name, hf_model_path_no_perm, test_set)

    # Define the set of outputs to test loss for; each entry is [hf logits, tt logits]
    test_configs = {
        "from_hf": [baseline_hf_outputs, from_hf_outputs],
        "to_hf": [to_hf_outputs, baseline_tt_outputs],
        "from_hf_no_perm": [baseline_hf_outputs, from_hf_outputs_no_perm],
        "to_hf_no_perm": [to_hf_outputs_no_perm, baseline_tt_outputs],
    }
    avg_losses = {}

    for test_name, (hf_outputs, tt_outputs) in test_configs.items():
        total_loss = 0
        for hf_logits, tt_logits in zip(hf_outputs, tt_outputs):
            total_loss += loss_fn(hf_logits, tt_logits)
        avg_loss = total_loss / len(test_set)
        avg_losses[test_name] = avg_loss.item()

    for test_name, avg_loss in avg_losses.items():
        print(f"Average loss for test {test_name} is {avg_loss}")
