
Commit 2b5d81d

danielhanchen, everythingisc00l, SethHWeidman, NinoRisteski, and Erland366 authored
Bug fixes (#1951)
* Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update llama.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update llama.py * Update _utils.py * Update llama.py * Update _utils.py * Update rl_replacements.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update rl_replacements.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * GRPO optimized * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Selective Log softmax * Fix GRPO bsz * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Fix TRL * Metrics GRPO * Update rl_replacements.py * Update rl_replacements.py * No compile * Update rl.py * Remove docs * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl_replacements.py * Update rl.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * llama-quantize on WINDOWS WSL error fix - edit save.py (gguf saving breaks) (#1649) * edit save.py to fix gguf saving breaks. * add check for .exe or not exe file extension for linux and windows * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update llama.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update llama.py * Update llama.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl.py * Update rl.py * Update rl_replacements.py * Update rl.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * unsloth_num_chunks * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py (#1754) Fix typo in comment: know -> now. This was printed when running the Llama3.1_(8B)-GRPO.ipynb example notebook, so I'd expect others to run into it as well. 
* Optional logits * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl_replacements.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * fix an import error (#1767) * fix an import error * Delete .gitignore * Update loader.py * Update save.py --------- Co-authored-by: Daniel Han <[email protected]> * SamplingParams * Convert mask to float (#1762) * [Windows Support] Add latest `xformers` wheels to pyproject.toml (#1753) * Add latest xformers * Add a couple of lines to docs * vLLMSamplingParams * Update __init__.py * default num_chunks == -1 * Versioning * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update rl_replacements.py * Update rl_replacements.py * Update pyproject.toml * Update pyproject.toml * Export Model to ollama.com (#1648) * Ollama Export Model to ollama.com Signed-off-by: Jyotin Goel <[email protected]> * Check for model_name Signed-off-by: Jyotin Goel <[email protected]> * subprocess use instead of requests | added check for ollama server Signed-off-by: Jyotin Goel <[email protected]> * create_ollama_model Signed-off-by: Jyotin Goel <[email protected]> * create_ollama_model | fix Signed-off-by: Jyotin Goel <[email protected]> * Push to Ollama Signed-off-by: Jyotin Goel <[email protected]> --------- Signed-off-by: Jyotin Goel <[email protected]> * Update cross_entropy_loss.py * torch_cuda_device * Update utils.py * Update utils.py * Update utils.py * device * device * Update loader.py * Update llama.py * Update README.md * Update llama.py * Update llama.py * Update _utils.py * Update utils.py * Update utils.py * Update utils.py * Update utils.py * Update utils.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update utils.py * Update utils.py * Update utils.py * Update utils.py * __version__ * Update rl.py * Bug fixes * Bug fixes * Update llama.py * Update _utils.py * _wrap_fast_inference * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * SFT dataset prepare * Update pyproject.toml * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl.py * Update llama.py * Update llama.py * Update utils.py * bug fix * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update __init__.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update rl.py * Update rl.py * Update rl.py * Update _utils.py * Update __init__.py * Update _utils.py * Version * versioning * Update _utils.py * Update llama.py * Update llama.py * Bug fixes * FastModel * __doc__ * Update vision.py * Update loader.py * Update loader.py * Update loader.py * version --------- Signed-off-by: Jyotin Goel <[email protected]> Co-authored-by: Gennadii Manzhos <[email protected]> Co-authored-by: Seth Weidman <[email protected]> Co-authored-by: Nino Risteski <[email protected]> Co-authored-by: Edd <[email protected]> Co-authored-by: Ben <[email protected]> Co-authored-by: Jyotin Goel <[email protected]>
1 parent 8d7662e commit 2b5d81d

File tree

8 files changed: +196 -134 lines changed


pyproject.toml

+2-2
@@ -40,7 +40,7 @@ triton = [
 ]

 huggingface = [
-    "unsloth_zoo>=2025.3.7",
+    "unsloth_zoo>=2025.3.8",
     "packaging",
     "tyro",
     "transformers>=4.46.1,!=4.47.0",
@@ -354,7 +354,7 @@ colab-ampere-torch220 = [
     "flash-attn>=2.6.3",
 ]
 colab-new = [
-    "unsloth_zoo>=2025.3.7",
+    "unsloth_zoo>=2025.3.8",
     "packaging",
     "tyro",
     "transformers>=4.46.1,!=4.47.0",

unsloth/__init__.py

+1-1
@@ -198,7 +198,7 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16
 # Check for unsloth_zoo
 try:
     unsloth_zoo_version = importlib_version("unsloth_zoo")
-    if Version(unsloth_zoo_version) < Version("2025.3.7"):
+    if Version(unsloth_zoo_version) < Version("2025.3.8"):
         try:
             os.system("pip install --upgrade --no-cache-dir --no-deps unsloth_zoo")
         except:
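The check above relies on packaging's Version ordering to decide whether to auto-upgrade unsloth_zoo. A minimal sketch of that comparison (the version strings below are illustrative):

from packaging.version import Version

installed = Version("2025.3.7")   # e.g. an older unsloth_zoo install
required  = Version("2025.3.8")   # new minimum pinned by this commit

# Calendar-style versions compare component by component, so this prints True
# and the `pip install --upgrade ... unsloth_zoo` branch would run.
print(installed < required)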

unsloth/models/__init__.py

+1-1
@@ -13,7 +13,7 @@
 # limitations under the License.

 from .llama import FastLlamaModel
-from .loader import FastLanguageModel, FastVisionModel
+from .loader import FastLanguageModel, FastVisionModel, FastTextModel, FastModel
 from .mistral import FastMistralModel
 from .qwen2 import FastQwen2Model
 from .granite import FastGraniteModel
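With the widened export list, the new entry points can be imported alongside the existing ones; a minimal sketch assuming this version of unsloth is installed:

# FastModel is the new generic loader; FastTextModel and FastVisionModel
# are re-exported from the same module (see the loader.py diff below).
from unsloth.models import FastLanguageModel, FastModel, FastTextModel, FastVisionModel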

unsloth/models/_utils.py

+1-1
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-__version__ = "2025.3.8"
+__version__ = "2025.3.9"

 __all__ = [
     "SUPPORTS_BFLOAT16",

unsloth/models/llama.py

+15-3
@@ -91,7 +91,7 @@ def original_apply_o(self, X):
 pass

 from math import sqrt as math_sqrt
-KV_CACHE_INCREMENT = 256 # KV Cache update size
+KV_CACHE_INCREMENT = 512 # KV Cache update size
 torch_nn_functional_softmax = torch.nn.functional.softmax
 # SDPA has GQA internally
 SDPA_HAS_GQA = "enable_gqa" in scaled_dot_product_attention.__doc__
@@ -1656,6 +1656,13 @@ def from_pretrained(
                 "Are you certain you want to do remote code execution?"
             )
         pass
+        if fast_inference:
+            import platform
+            if platform.system().lower() == 'windows':
+                print("Unsloth: vLLM does not work in Windows! Will use Unsloth inference!")
+                fast_inference = False
+            pass
+
         if token is None: token = get_token()
         if model_patcher is None: model_patcher = FastLlamaModel
         SUPPORTS_BFLOAT16 = is_bfloat16_supported()
@@ -1966,12 +1973,17 @@ def from_pretrained(
         for layer in model.model.layers:
             layer.self_attn.rotary_emb = rotary_emb
         pass
-
+
+        # Add for_inference and for_training
+        model.for_training = functools.partial(FastLlamaModel.for_training, model)
+        model.for_inference = functools.partial(FastLlamaModel.for_inference, model)
+
         # Patch generate
         if model.generate.__name__ != "unsloth_fast_generate":
             model._old_generate = model.generate
             unsloth_fast_generate.__doc__ = model._old_generate.__doc__
             model.generate = types.MethodType(unsloth_fast_generate, model)
+        pass
         return model, tokenizer
     pass

@@ -2404,7 +2416,7 @@ def get_peft_model(
         # Add for_inference and for_training
         model.for_training = functools.partial(FastLlamaModel.for_training, model)
         model.for_inference = functools.partial(FastLlamaModel.for_inference, model)
-
+
         # Patch generate
         if model.generate.__name__ != "unsloth_fast_generate":
             model._old_generate = model.generate
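The hunks above attach for_inference/for_training to the model returned by from_pretrained (previously only get_peft_model added them) and disable fast_inference on Windows, where vLLM is unsupported. A hedged usage sketch; the checkpoint name and settings are illustrative:

from unsloth import FastLanguageModel

# On Windows, fast_inference = True now falls back to Unsloth's own
# inference path instead of trying (and failing) to use vLLM.
model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/llama-3-8b-bnb-4bit",  # illustrative checkpoint
    max_seq_length = 2048,
    load_in_4bit   = True,
    fast_inference = True,
)

model.for_inference()  # now attached directly in from_pretrained
model.for_training()   # switch back before fine-tuning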

unsloth/models/loader.py

+28-11
@@ -383,10 +383,13 @@ def from_pretrained(
     patch_loss_functions,
     post_patch_loss_function,
 )
-from .vision import FastBaseVisionModel
-
+from .vision import FastBaseModel
+from transformers import (
+    AutoModelForVision2Seq,
+    AutoModelForCausalLM,
+)

-class FastVisionModel(FastBaseVisionModel):
+class FastModel(FastBaseModel):
     @staticmethod
     def from_pretrained(
         model_name = "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",
@@ -413,7 +416,7 @@ def from_pretrained(
         patch_compiling_bitsandbytes()
         if use_gradient_checkpointing == "unsloth":
             patch_unsloth_smart_gradient_checkpointing(dtype = dtype)
-
+
         old_model_name = model_name
         if not use_exact_model_name:
             model_name = get_model_name(model_name, load_in_4bit)
@@ -427,7 +430,7 @@ def from_pretrained(
         from huggingface_hub.utils import disable_progress_bars, enable_progress_bars, are_progress_bars_disabled
         was_disabled = are_progress_bars_disabled()
         disable_progress_bars()
-
+
         autoconfig_error = None
         peft_error = None
         try:
@@ -458,7 +461,7 @@ def from_pretrained(

         # Old transformers versions check
         both_exist = (is_model and is_peft) and not SUPPORTS_LLAMA32
-
+
         # New transformers need to check manually.
         if SUPPORTS_LLAMA32:
             # Check if folder exists locally
@@ -515,9 +518,12 @@ def from_pretrained(
         if not was_disabled: enable_progress_bars()

         do_logging = os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") == "1"
-        redirector = sys.stdout if do_logging else open(os.devnull, "w")
+        if do_logging:
+            redirector = contextlib.nullcontext()
+        else:
+            redirector = contextlib.redirect_stdout(open(os.devnull, "w"))

-        with contextlib.redirect_stdout(redirector):
+        with redirector:
             patch_loss_functions(torch_compile = False)
             model_types = unsloth_compile_transformers(
                 model_name = model_name,
@@ -547,7 +553,6 @@ def from_pretrained(
                 return_logits = return_logits,
             )
         pass
-        if do_logging: redirector.close()

         # Check if this is local model since the tokenizer gets overwritten
         if os.path.exists(os.path.join(old_model_name, "tokenizer_config.json")) and \
@@ -559,7 +564,12 @@ def from_pretrained(
             tokenizer_name = None
         pass

-        model, tokenizer = FastBaseVisionModel.from_pretrained(
+        # Check if VLM
+        is_vlm = (x.endswith("ForConditionalGeneration") for x in model_config.architectures)
+        is_vlm = is_vlm or hasattr(model_config, "vision_config")
+        auto_model = AutoModelForVision2Seq if is_vlm else AutoModelForCausalLM
+
+        model, tokenizer = FastBaseModel.from_pretrained(
             model_name = model_name,
             max_seq_length = max_seq_length,
             dtype = _get_dtype(dtype),
@@ -570,6 +580,7 @@ def from_pretrained(
             revision = revision if not is_peft else None,
             model_types = model_types,
             tokenizer_name = tokenizer_name,
+            auto_model = auto_model,
             *args, **kwargs,
         )

@@ -617,8 +628,14 @@ def from_pretrained(
                 trust_remote_code = trust_remote_code,
             )
             # Patch it as well!
-            model = FastBaseVisionModel.patch_peft_model(model, use_gradient_checkpointing)
+            model = FastBaseModel.patch_peft_model(model, use_gradient_checkpointing)
         pass
         return model, tokenizer
     pass
 pass
+
+class FastVisionModel(FastModel):
+    pass
+
+class FastTextModel(FastModel):
+    pass
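FastModel now inspects the checkpoint config to choose between AutoModelForVision2Seq and AutoModelForCausalLM, and FastVisionModel / FastTextModel become thin aliases of it. A hedged sketch of the intended call pattern; the arguments are illustrative:

from unsloth.models import FastModel

# Vision-capable checkpoints route to AutoModelForVision2Seq; text-only
# checkpoints route to AutoModelForCausalLM, all through the same API.
model, tokenizer = FastModel.from_pretrained(
    "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",  # default from the diff above
    load_in_4bit = True,
)

Existing FastVisionModel callers keep working unchanged, since FastVisionModel simply subclasses FastModel.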

unsloth/models/mapper.py

+15
@@ -611,6 +611,21 @@
         "open-thoughts/OpenThinker-7B",
         "unsloth/OpenThinker-7B-bnb-4bit",
     ),
+    "unsloth/granite-3.2-2b-instruct-unsloth-bnb-4bit" : (
+        "unsloth/granite-3.2-2b-instruct",
+        "ibm-granite/granite-3.2-2b-instruct",
+        "unsloth/granite-3.2-2b-instruct-bnb-4bit",
+    ),
+    "unsloth/granite-3.2-8b-instruct-unsloth-bnb-4bit" : (
+        "unsloth/granite-3.2-8b-instruct",
+        "ibm-granite/granite-3.2-8b-instruct",
+        "unsloth/granite-3.2-8b-instruct-bnb-4bit",
+    ),
+    "unsloth/QwQ-32B-unsloth-bnb-4bit" : (
+        "unsloth/QwQ-32B",
+        "Qwen/QwQ-32B",
+        "unsloth/QwQ-32B-bnb-4bit",
+    ),
 }

 INT_TO_FLOAT_MAPPER = {}
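Each new mapper entry groups an Unsloth dynamic 4-bit repo with its original checkpoint and the plain bnb-4bit variant, so the newly added Granite 3.2 and QwQ-32B names resolve at load time. A hedged sketch; the sequence length and options are illustrative:

from unsloth import FastLanguageModel

# "unsloth/QwQ-32B-unsloth-bnb-4bit" is one of the entries added above;
# the granite-3.2 instruct models resolve the same way.
model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/QwQ-32B-unsloth-bnb-4bit",
    max_seq_length = 4096,
    load_in_4bit   = True,
)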
