
Commit ebe35a3

Revert "fix:Remove use_cache and update ReadMe. (#531)" (#532)

This reverts commit 77dd8af.

1 parent: 77dd8af

File tree: 2 files changed (+8, -15 lines)


README.md (6 additions, 6 deletions)

@@ -916,12 +916,12 @@ For information on supported dataset formats and how to tune a vision-language model

 ? May be supported, but not tested

-Model Name & Size | Model Architecture | LoRA Tuning | Full Finetuning |
--------------------- | ---------------- | --------------- | --------------- |
-Llama 3.2-11B Vision | MllamaForConditionalGeneration | ✅* | ✅* |
-Llava 1.5-7B | LlavaForConditionalGeneration | ✅* | 🚫 |
-Granite 3.1-2B Vision | LlavaNextForConditionalGeneration | ✅* | 🚫 |
-Llava Mistral 1.6-7B | LlavaNextForConditionalGeneration | ✅* | 🚫 |
+Model Name & Size | Model Architecture | Full Finetuning |
+-------------------- | ---------------- | --------------- |
+Llama 3.2-11B Vision | MllamaForConditionalGeneration | ✅* |
+Llava 1.5-7B | LlavaForConditionalGeneration | ✅* |
+Granite 3.1-2B Vision | LlavaNextForConditionalGeneration | ✅* |
+Llava Mistral 1.6-7B | LlavaNextForConditionalGeneration | ✅* |

 (*) - Supported with `fms-hf-tuning` v2.8.0 or later.

tuning/sft_trainer.py (2 additions, 9 deletions)

@@ -237,16 +237,9 @@ def train(
         attn_implementation="flash_attention_2"
         if model_args.use_flash_attn
         else None,
+        # avoid warning that use_cache is incompatible with gradient checkpointing
+        use_cache=(not train_args.gradient_checkpointing),
     )
-    try:
-        if "use_cache" in model.language_model.config:
-            # avoid warning that use_cache is incompatible with gradient checkpointing
-            model.language_model.config.use_cache = (
-                not train_args.gradient_checkpointing
-            )
-    except AttributeError as e:
-        # When the model doesn't have the use_cache attribute
-        logger.warning("Couldn't update use_cache for vision model: %s", e)

     processor = AutoProcessor.from_pretrained(model_args.model_name_or_path)
     tokenizer = processor.tokenizer
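The revert restores the pattern of setting `use_cache` as a load-time keyword argument rather than mutating `model.language_model.config` after the model is built. A minimal sketch of the two patterns follows; `Config` and `Model` here are hypothetical stand-ins for illustration, not the real `transformers` classes:

```python
class Config:
    """Stand-in for a model config carrying a use_cache flag."""

    def __init__(self, use_cache=True):
        self.use_cache = use_cache


class Model:
    """Stand-in for a pretrained model; config overrides land on the config."""

    def __init__(self, **config_overrides):
        self.config = Config(**config_overrides)


gradient_checkpointing = True

# Pattern restored by the revert: disable the KV cache at load time,
# since caching conflicts with gradient checkpointing.
model = Model(use_cache=(not gradient_checkpointing))
print(model.config.use_cache)  # False

# Pattern removed by the revert: mutate the config after loading,
# guarded in case this model type lacks the attribute.
try:
    model.config.use_cache = not gradient_checkpointing
except AttributeError:
    pass
```

The load-time form is shorter and avoids the guarded post-hoc mutation, at the cost of assuming the loader forwards the flag to the relevant sub-config.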
