24 | 24 | ) |
25 | 25 | import json |
26 | 26 | from aiu_fms_testing_utils.utils.aiu_setup import dprint, aiu_dist_setup |
27 | | - |
| 27 | +import shutil |
28 | 28 | import os |
29 | 29 |
30 | 30 | try: |
148 | 148 | os.environ["VLLM_DT_MAX_CONTEXT_LEN"] = str((((max(common_seq_lengths) + max(common_max_new_tokens)) // 64) + 1) * 64) |
149 | 149 | os.environ["VLLM_DT_MAX_BATCH_SIZE"] = str(max(common_batch_sizes)) |
150 | 150 |
151 | | -cache_params = list(itertools.product([common_model_paths[0]], [common_batch_sizes[0]], [common_seq_lengths[0]], [common_max_new_tokens[0]], ["miss", "hit"])) |
152 | 151 |
153 | 152 | # thresholds are chosen based on 1024 tokens per sequence |
154 | 153 | # 1% error threshold rate between cpu fp32 and cuda fp16 |
182 | 181 | USE_MICRO_MODELS = False |
183 | 182 | common_model_paths = [] |
184 | 183 | frequency = int(model_configuration_frequency) |
185 | 184 | with open(model_configuration_path, 'r') as f: |
186 | 185 | for line in f: |
187 | 185 | try: |
188 | 186 | model_config = json.loads(line) |
189 | 187 | if model_config["frequency"] <= frequency: |
@@ -426,7 +424,7 @@ def test_common_shapes(model_path, batch_size, seq_length, max_new_tokens, persi |
426 | 424 |
427 | 425 | # prepare the AIU model |
428 | 426 | model = persistent_model.get_or_create(is_gptq, **gptq_kwargs_aiu, **get_model_kwargs) |
429 | | - |
| 427 | + |
430 | 428 | # prepare the cpu model |
431 | 429 | validation_model = get_model( |
432 | 430 | device_type="cpu", |
@@ -555,6 +553,7 @@ def _metric_calculator(r: torch.Tensor, t: torch.Tensor): |
555 | 553 | model, |
556 | 554 | input_ids, |
557 | 555 | max_new_tokens, |
558 | 557 | GoldenTokenHook(cpu_static_tokens), |
559 | 558 | only_last_token=ATTN_TYPE != "paged", |
560 | 559 | **extra_kwargs, |
@@ -622,56 +621,272 @@ def _metric_calculator(r: torch.Tensor, t: torch.Tensor): |
622 | 621 | else: |
623 | 622 | print("passed validation level 0") |
624 | 623 |
625 | | -@pytest.mark.parametrize("model_path,batch_size,seq_length,max_new_tokens,cache_status", cache_params) |
626 | | -def test_cache(model_path, batch_size, seq_length, max_new_tokens, cache_status): |
| 624 | +@pytest.mark.parametrize("cache_status", ["miss", "hit"]) |
| 625 | +def test_cache(cache_status): |
627 | 626 | torch.manual_seed(42) |
| 627 | + torch.set_grad_enabled(False) |
628 | 628 | os.environ["TORCH_SENDNN_CACHE_ENABLE"] = "1" |
| 629 | + os.environ["TORCH_SENDNN_CACHE_DIR"] = os.getcwd()+"/.cache" |
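| | + # the cache lives in ./.cache under the current working directory; the "miss" case below starts from an empty directory, while the "hit" case is expected to reuse what the miss run wrote |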
629 | 630 | os.environ["COMPILATION_MODE"] = "offline_decoder" |
630 | 631 |
| 632 | + if cache_status == "miss" and os.path.isdir(os.getcwd()+"/.cache"): |
| 633 | + # Remove cache from previous runs |
| 634 | + shutil.rmtree(os.getcwd()+"/.cache") |
| 635 | + |
| 636 | + model_path = "ibm-granite/granite-3.3-8b-instruct" |
| 637 | + batch_size = common_batch_sizes[0] |
| 638 | + seq_length = common_seq_lengths[0] |
| 639 | + max_new_tokens = common_max_new_tokens[0] |
| 640 | + |
631 | 641 | dprint(f"testing with cache: model={model_path}, batch_size={batch_size}, seq_length={seq_length}, max_new_tokens={max_new_tokens}, micro_model={USE_MICRO_MODELS}, cache={cache_status}") |
632 | 642 |
633 | | - if USE_MICRO_MODELS: |
| 643 | + # we don't currently support inferring gptq from get_model, so we must use an adapter with hf_configured |
| 644 | + gptq_kwargs_aiu, gptq_kwargs_cpu = __maybe_get_gptq_kwargs(model_path) |
| 645 | + is_gptq = len(gptq_kwargs_aiu) != 0 |
| 646 | + |
| 647 | + micro_model_path = micro_model_mapping.get(model_path, None) |
| 648 | + if USE_MICRO_MODELS and micro_model_path is None: |
| 649 | + dprint("using randomly initialized model") |
634 | 650 | micro_model_kwargs = {"architecture": "hf_configured", "nlayers": 3} |
635 | 651 | else: |
636 | | - micro_model_kwargs = {"architecture": "hf_pretrained"} |
637 | | - |
| 652 | + dprint("using trained model") |
| 653 | + micro_model_kwargs = {"architecture": "hf_pretrained"} |
| 654 | + |
638 | 655 | if not USE_MICRO_MODELS and os.path.exists(model_path): |
639 | 656 | model_path_kwargs = {"model_path": model_path} |
| 657 | + elif USE_MICRO_MODELS and micro_model_path is not None: |
| 658 | + model_path_kwargs = {"model_path": micro_model_path} |
640 | 659 | else: |
641 | 660 | model_path_kwargs = {"variant": model_path} |
642 | | - |
| 661 | + |
643 | 662 | distributed_kwargs = {} |
644 | 663 | if USE_DISTRIBUTED: |
645 | | - distributed_kwargs["distr_param"] = "tp" |
| 664 | + distributed_kwargs["distributed_strategy"] = "tp" |
646 | 665 | distributed_kwargs["group"] = dist.group.WORLD |
647 | | - get_model_kwargs = {**model_path_kwargs, **micro_model_kwargs, **distributed_kwargs} |
| 666 | + |
| 667 | + get_model_kwargs = {} |
| 668 | + if not is_gptq: |
| 669 | + get_model_kwargs = { |
| 670 | + **model_path_kwargs, |
| 671 | + **micro_model_kwargs, |
| 672 | + **distributed_kwargs, |
| 673 | + } |
648 | 674 |
649 | 675 | tokenizer = tokenizers.get_tokenizer(model_path) |
650 | 676 |
651 | 677 | # prepare the AIU model |
652 | 678 | model = get_model( |
| 679 | + device_type="cpu", |
| 680 | + data_type=None if is_gptq else torch.float16, |
| 681 | + fused_weights=False, |
| 682 | + **get_model_kwargs, |
| 683 | + ) |
| 684 | + |
| 685 | + model.eval() |
| 686 | + model.compile(backend="sendnn") |
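| | + # note: compilation is lazy, so the actual sendnn compile is expected to happen on first use during warmup below |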
| 687 | + |
| 688 | + # prepare the cpu model |
| 689 | + validation_model = get_model( |
653 | 690 | device_type="cpu", |
| 691 | + data_type=None if is_gptq else torch.float32, |
654 | 692 | fused_weights=False, |
655 | | - **get_model_kwargs |
| 693 | + **gptq_kwargs_cpu, |
| 694 | + **get_model_kwargs, |
656 | 695 | ) |
657 | 696 |
658 | | - model.eval() |
659 | | - torch.set_grad_enabled(False) |
660 | | - model.compile(backend="sendnn_decoder") |
661 | | - |
| 697 | + if USE_MICRO_MODELS: |
| 698 | + serialization.load_state_dict_into_model( |
| 699 | + validation_model, model.state_dict(), **__custom_adapter |
| 700 | + ) |
662 | 701 |
663 | 702 | # prepare input_ids |
664 | | - input_ids, padding_kwargs = __prepare_inputs(batch_size, seq_length, tokenizer) |
| 703 | + input_ids, extra_kwargs = __prepare_inputs(batch_size, seq_length, tokenizer) |
| 704 | + extra_kwargs["attn_name"] = ATTN_NAME |
665 | 705 |
666 | 706 | # warmup aiu model |
667 | | - warmup_model(model, input_ids, max_new_tokens, **padding_kwargs) |
| 707 | + warmup_model(model, input_ids, max_new_tokens, compile_dynamic_sendnn, **extra_kwargs) |
| 708 | + |
| 709 | + # generate cpu validation info |
| 710 | + cpu_validation_info = __load_validation_info( |
| 711 | + model_path, batch_size, seq_length, max_new_tokens, tokenizer, 0 |
| 712 | + ) |
| 713 | + if cpu_validation_info is None: |
| 714 | + cpu_validation_info = extract_validation_information( |
| 715 | + validation_model, |
| 716 | + input_ids, |
| 717 | + max_new_tokens, |
| 718 | + LogitsExtractorHook(), |
| 719 | + attn_algorithm="math", |
| 720 | + **extra_kwargs, |
| 721 | + ) |
668 | 722 |
669 | | - # aiu validatation |
| 723 | + if save_validation_info_outputs: |
| 724 | + cpu_validation_info.save( |
| 725 | + __get_validation_info_full_path( |
| 726 | + model_path, batch_size, seq_length, max_new_tokens, 0 |
| 727 | + ) |
| 728 | + ) |
| 729 | + cpu_static_tokens = cpu_validation_info.get_info("tokens") |
| 730 | + eos_indexes = __find_eos_index( |
| 731 | + cpu_static_tokens, tokenizer.eos_token_id, seq_length, max_new_tokens |
| 732 | + ) |
| 733 | + dprint( |
| 734 | + "cpu validation info extracted for validation level 0 and validation level 1 (iter=0)" |
| 735 | + ) |
| 736 | + |
| 737 | + # first test validation level 0 |
670 | 738 | aiu_validation_info = extract_validation_information( |
671 | | - model, |
672 | | - input_ids, |
673 | | - max_new_tokens, |
674 | | - None, |
675 | | - only_last_token=True, |
676 | | - **padding_kwargs |
677 | | -) |
| 739 | + model, input_ids, max_new_tokens, None, only_last_token="paged" not in ATTN_NAME, **extra_kwargs |
| 740 | + ) |
| 741 | + dprint("aiu validation info extracted for validation level 0") |
| 742 | + |
| 743 | + # check cache status before validating cached results |
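| | + # in both cases the directory should hold exactly max_new_tokens entries: freshly written on a miss, already present on a hit |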
| 744 | + updated_cache_len = len(os.listdir(os.getcwd()+"/.cache")) if os.path.isdir(os.getcwd()+"/.cache") else 0 |
| 745 | + if cache_status == "miss": |
| 746 | + assert updated_cache_len == max_new_tokens, ( |
| 747 | + "cache directory not populated on cache miss" |
| 748 | + ) |
| 749 | + return |
| 750 | + else: |
| 751 | + assert updated_cache_len == max_new_tokens, ( |
| 752 | + "cache miss occurred when hit was expected" |
| 753 | + ) |
| 754 | + |
| 755 | + # validate level 0 |
| 756 | + failed_responses = validate_level_0( |
| 757 | + aiu_validation_info.get_info("tokens"), cpu_static_tokens |
| 758 | + ) |
| 759 | + |
| 760 | + failed_validation_level_0 = len(failed_responses) != 0 |
| 761 | + |
| 762 | + # if level 0 fails validation, validate level 1 |
| 763 | + if FORCE_VALIDATION_LEVEL_1 or failed_validation_level_0: |
| 764 | + |
| 765 | + if failed_validation_level_0: |
| 766 | + dprint("failed validation level 0, testing validation level 1") |
| 767 | + else: |
| 768 | + dprint("passed validation level 0, testing validation level 1") |
| 769 | + |
| 770 | + # metric calculator based on the cross-entropy and mean diff for each decode step |
| 771 | + def _metric_calculator(r: torch.Tensor, t: torch.Tensor): |
| 772 | + cross_entropy = torch.nn.CrossEntropyLoss()( |
| 773 | + r, t.softmax(dim=1).to(dtype=torch.float32) |
| 774 | + ) |
| 775 | + diff = torch.mean( |
| 776 | + torch.abs( |
| 777 | + r.softmax(dim=1).to(dtype=torch.float32) |
| 778 | + - t.softmax(dim=1).to(dtype=torch.float32) |
| 779 | + ) |
| 780 | + ) |
| 781 | + return (cross_entropy, diff) |
| 782 | + |
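| | + # the fail thresholds above are based on 1024 tokens per sequence, so run enough iterations to cover roughly 1024 decode steps |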
| 783 | + iters = 1024 // max_new_tokens |
| 784 | + ce_fail_responses_list = [] |
| 785 | + diff_fail_responses_list = [] |
| 786 | + total_tokens = 0 |
| 787 | + for i in range(iters): |
| 788 | + # for iteration 0, we have computed the cpu validation info in the prior step for seed=0, so skip |
| 789 | + if i != 0: |
| 790 | + input_ids, extra_kwargs = __prepare_inputs( |
| 791 | + batch_size, seq_length, tokenizer, seed=i |
| 792 | + ) |
| 793 | + extra_kwargs["attn_name"] = ATTN_NAME |
| 794 | + cpu_validation_info = __load_validation_info( |
| 795 | + model_path, batch_size, seq_length, max_new_tokens, tokenizer, i |
| 796 | + ) |
| 797 | + if cpu_validation_info is None: |
| 798 | + cpu_validation_info = extract_validation_information( |
| 799 | + validation_model, |
| 800 | + input_ids, |
| 801 | + max_new_tokens, |
| 802 | + LogitsExtractorHook(), |
| 803 | + attn_algorithm="math", |
| 804 | + **extra_kwargs, |
| 805 | + ) |
| 806 | + dprint( |
| 807 | + f"cpu validation info extracted for validation level 1 - iter={i}" |
| 808 | + ) |
| 809 | + if save_validation_info_outputs: |
| 810 | + cpu_validation_info.save( |
| 811 | + __get_validation_info_full_path( |
| 812 | + model_path, batch_size, seq_length, max_new_tokens, i |
| 813 | + ) |
| 814 | + ) |
| 815 | + cpu_static_tokens = cpu_validation_info.get_info("tokens") |
| 816 | + eos_indexes = __find_eos_index( |
| 817 | + cpu_static_tokens, |
| 818 | + tokenizer.eos_token_id, |
| 819 | + seq_length, |
| 820 | + max_new_tokens, |
| 821 | + ) |
| 822 | + |
| 823 | + # generate aiu validation info |
| 824 | + aiu_validation_info = extract_validation_information( |
| 825 | + model, |
| 826 | + input_ids, |
| 827 | + max_new_tokens, |
| 828 | + GoldenTokenHook(cpu_static_tokens), |
| 829 | + only_last_token=ATTN_TYPE != "paged", |
| 830 | + **extra_kwargs, |
| 831 | + ) |
| 832 | + dprint(f"aiu validation info extracted for validation level 1 - iter={i}") |
| 833 | + if save_validation_info_outputs: |
| 834 | + aiu_validation_info.save( |
| 835 | + __get_validation_info_full_path( |
| 836 | + model_path, batch_size, seq_length, max_new_tokens, i, "aiu" |
| 837 | + ) |
| 838 | + ) |
| 839 | + |
| 840 | + # capture all level 1 metrics |
| 841 | + level_1_metrics = capture_level_1_metrics( |
| 842 | + cpu_validation_info.get_info("logits"), |
| 843 | + aiu_validation_info.get_info("logits"), |
| 844 | + top_k_loss_calculator(20, _metric_calculator), |
| 845 | + ) |
| 846 | + # only consider those metrics captured prior to the eos |
| 847 | + level_1_metrics = __filter_before_eos(level_1_metrics, eos_indexes) |
| 848 | + |
| 849 | + # if we do not have real model weights, use a default_metrics_threshold |
| 850 | + if USE_MICRO_MODELS and micro_model_path is None: |
| 851 | + ce_threshold, diff_threshold = default_metrics_threshold |
| 852 | + # if we have real weights, try and get the proper validation metrics threshold |
| 853 | + else: |
| 854 | + # if we have a micro model with real weights, but no real thresholds, default to the full model thresholds |
| 855 | + if USE_MICRO_MODELS: |
| 856 | + ce_threshold, diff_threshold = fail_thresholds.get( |
| 857 | + (model_path, True), fail_thresholds.get((model_path, False), default_metrics_threshold) |
| 858 | + ) |
| 859 | + else: |
| 860 | + ce_threshold, diff_threshold = fail_thresholds.get( |
| 861 | + (model_path, False), default_metrics_threshold |
| 862 | + ) |
| 863 | + |
| 864 | + # get all failed responses for each metric |
| 865 | + ce_fail_responses = filter_failed_level_1_cases( |
| 866 | + level_1_metrics, lambda m: m[0] >= ce_threshold |
| 867 | + ) |
| 868 | + diff_fail_responses = filter_failed_level_1_cases( |
| 869 | + level_1_metrics, |
| 870 | + lambda m: m[1] >= diff_threshold, |
| 871 | + ) |
| 872 | + |
| 873 | + ce_fail_responses_list.extend(ce_fail_responses) |
| 874 | + diff_fail_responses_list.extend(diff_fail_responses) |
| 875 | + total_tokens += len(level_1_metrics) |
| 876 | + |
| 877 | + # test the failure rates across all tokens |
| 878 | + diff_failure_rate = len(diff_fail_responses_list) / total_tokens |
| 879 | + ce_failure_rate = len(ce_fail_responses_list) / total_tokens |
| 880 | + dprint(f"mean diff failure rate: {diff_failure_rate}") |
| 881 | + dprint(f"cross entropy loss failure rate: {ce_failure_rate}") |
| 882 | + if "mean_diff" not in skip_assertions: |
| 883 | + assert diff_failure_rate < failure_rate_threshold, ( |
| 884 | + f"failure rate for mean diff was too high: {diff_failure_rate}" |
| 885 | + ) |
| 886 | + if "ce" not in skip_assertions: |
| 887 | + assert ce_failure_rate < failure_rate_threshold, ( |
| 888 | + f"failure rate for cross entropy loss was too high: {ce_failure_rate}" |
| 889 | + ) |
| 890 | + print("passed validation level 1") |
| 891 | + else: |
| 892 | + print("passed validation level 0") |