
Commit 029461a

Authored by danielhanchen, NinoRisteski, Erland366, versipellis, and gjyotin305
Gemma 3, bug fixes (#2014)
* Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl_replacements.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * fix an import error (#1767) * fix an import error * Delete .gitignore * Update loader.py * Update save.py --------- Co-authored-by: Daniel Han <[email protected]> * SamplingParams * Convert mask to float (#1762) * [Windows Support] Add latest `xformers` wheels to pyproject.toml (#1753) * Add latest xformers * Add a couple of lines to docs * vLLMSamplingParams * Update __init__.py * default num_chunks == -1 * Versioning * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update rl_replacements.py * Update rl_replacements.py * Update pyproject.toml * Update pyproject.toml * Export Model to ollama.com (#1648) * Ollama Export Model to ollama.com Signed-off-by: Jyotin Goel <[email protected]> * Check for model_name Signed-off-by: Jyotin Goel <[email protected]> * subprocess use instead of requests | added check for ollama server Signed-off-by: Jyotin Goel <[email protected]> * create_ollama_model Signed-off-by: Jyotin Goel <[email protected]> * create_ollama_model | fix Signed-off-by: Jyotin Goel <[email protected]> * Push to Ollama Signed-off-by: Jyotin Goel <[email protected]> --------- Signed-off-by: Jyotin Goel <[email protected]> * Update cross_entropy_loss.py * torch_cuda_device * Update utils.py * Update utils.py * Update utils.py * device * device * Update loader.py * Update llama.py * Update README.md * Update llama.py * Update llama.py * Update _utils.py * Update utils.py * Update utils.py * Update utils.py * Update utils.py * Update utils.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update utils.py * Update utils.py * Update utils.py * Update utils.py * __version__ * Update rl.py * Bug fixes * Bug fixes * Update llama.py * Update _utils.py * _wrap_fast_inference * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * SFT dataset prepare * Update pyproject.toml * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl.py * Update llama.py * Update llama.py * Update utils.py * bug fix * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update __init__.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update rl.py * Update rl.py * Update rl.py * Update _utils.py * Update __init__.py * Update _utils.py * Version * versioning * Update _utils.py * Update llama.py * Update llama.py * Bug fixes * FastModel * __doc__ * Update vision.py * Update loader.py * Update loader.py * Update loader.py * version * move use_modelscope to _utils (#1938) * move use_modelscope to _utils * Update _utils.py * Update loader.py --------- Co-authored-by: Daniel Han <[email protected]> * Don't use revision when loading model_config and is_peft=True (#1949) * More syntax warnings (#1944) * move use_modelscope to _utils * fix * Update _utils.py * Update loader.py --------- Co-authored-by: Daniel Han <[email protected]> * Update loader.py * Full finetuning and other fixes * UNSLOTH_ENABLE_FULL_FINETUNING * Update loader.py * Update loader.py * Update loader.py * Update vision.py * Update vision.py * full finetuning * Update loader.py * Update loader.py * Update 
loader.py * Update _utils.py * max_seq_length * Update rl.py * Update rl.py * Update rl.py * Update pyproject.toml * AutoModelForImageTextToText * Update mapper.py * Update pyproject.toml * Update _utils.py * Update _utils.py * Update _utils.py * Batch samples * Update loader.py * Update loader.py * Update loader.py * Update loader.py * Update _utils.py * Update loader.py * Update vision.py * Update loader.py * Update vision.py * Update vision.py * Update vision.py * Update mapper.py * Update vision.py * Temporary patches * Update loader.py * model names * Gemma 3 chat template * Bug fixes * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update llama.py * Update llama.py * Update rl.py * Update chat_templates.py * Update chat_templates.py * Update vision.py * Update vision.py * Update vision.py * Update loader.py * Update vision.py * Update vision.py * Revert * Update _utils.py * forced precision * Autocast * Update vision.py * Update vision.py * Update rl.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update rl.py * vLLM fixes * constexpr * Update vision.py * Update vision.py * Update vision.py * Update rl.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update save.py * New models * Triton windows update (#1976) * Update pyproject.toml * Update README.md * Update RMS LayerNorm implementation, and list compr. change in chat templates (#1974) * Update RMS LayerNorm implementation with optimizations and testing suite * perf: optimize list comprehension in get_ollama_eos_tokens * Update Zoo * Update llama.py * Update llama.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update vision.py * Update rl_replacements.py * Update vision.py * grpo fix * Update rl_replacements.py * Update vision.py * Update rl_replacements.py * Update vision.py * Update mapper.py * Update vision.py * Update vision.py * Update loader.py --------- Signed-off-by: Jyotin Goel <[email protected]> Co-authored-by: Nino Risteski <[email protected]> Co-authored-by: Edd <[email protected]> Co-authored-by: Ben <[email protected]> Co-authored-by: Jyotin Goel <[email protected]> Co-authored-by: Kareem <[email protected]> Co-authored-by: Wilson Wu <[email protected]> Co-authored-by: Akshay Behl <[email protected]>
1 parent fe04c01 commit 029461a

15 files changed: +243 −65 lines

README.md  (+1 −1)

@@ -115,7 +115,7 @@ See [here](https://github.com/unslothai/unsloth/edit/main/README.md#advanced-pip
 7. **Install Unsloth:**

 ```python
-pip install "unsloth[windows] @ git+https://github.com/unslothai/unsloth.git"
+pip install unsloth
 ```

 #### Notes

pyproject.toml  (+2 −5)

@@ -33,14 +33,11 @@ exclude = ["images*"]

 [project.optional-dependencies]
 triton = [
-    "triton @ https://github.com/woct0rdho/triton-windows/releases/download/v3.2.0-windows.post10/triton-3.2.0-cp39-cp39-win_amd64.whl ; python_version=='3.9' and platform_system == 'Windows'",
-    "triton @ https://github.com/woct0rdho/triton-windows/releases/download/v3.2.0-windows.post10/triton-3.2.0-cp310-cp310-win_amd64.whl ; python_version=='3.10' and platform_system == 'Windows'",
-    "triton @ https://github.com/woct0rdho/triton-windows/releases/download/v3.2.0-windows.post10/triton-3.2.0-cp311-cp311-win_amd64.whl ; python_version=='3.11' and platform_system == 'Windows'",
-    "triton @ https://github.com/woct0rdho/triton-windows/releases/download/v3.2.0-windows.post10/triton-3.2.0-cp312-cp312-win_amd64.whl ; python_version=='3.12' and platform_system == 'Windows'"
+    "triton-windows ; platform_system == 'Windows'",
 ]

 huggingface = [
-    "unsloth_zoo>=2025.3.9",
+    "unsloth_zoo>=2025.3.11",
     "packaging",
     "tyro",
     "transformers>=4.46.1,!=4.47.0",

unsloth/__init__.py  (+1 −1)

@@ -198,7 +198,7 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16
 # Check for unsloth_zoo
 try:
     unsloth_zoo_version = importlib_version("unsloth_zoo")
-    if Version(unsloth_zoo_version) < Version("2025.3.9"):
+    if Version(unsloth_zoo_version) < Version("2025.3.11"):
         print(
             "Unsloth: Updating Unsloth-Zoo utilies to the latest version.\n"\
             "To disable this, set os.environ['UNSLOTH_DISABLE_AUTO_UPDATES'] = '1'"

unsloth/chat_templates.py  (+1 −4)

@@ -1512,10 +1512,7 @@ def get_ollama_eos_tokens(tokenizer, extra_eos_tokens = []):

     # Remove duplicates
     splitted = joined_text.split("\x01\x00")
-    final_eos_tokens = []
-    for old, new in zip(added_tokens_decoder, splitted):
-        if old == new: final_eos_tokens.append(old)
-    pass
+    final_eos_tokens = [old for old, new in zip(added_tokens_decoder, splitted) if old == new]
     final_eos_tokens += extra_eos_tokens
     final_eos_tokens += repeatted_tokens
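
The loop-to-comprehension change is behaviour-preserving: a token is kept only when its round-tripped text is unchanged. A tiny standalone illustration with made-up token lists:

```python
# Hypothetical data: tokens survive only if they decode back to themselves.
added_tokens_decoder = ["<eos>", "<pad>", "<unk>"]
splitted             = ["<eos>", "<PAD>", "<unk>"]

final_eos_tokens = [old for old, new in zip(added_tokens_decoder, splitted) if old == new]
print(final_eos_tokens)  # ['<eos>', '<unk>']
```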

unsloth/kernels/cross_entropy_loss.py  (+16 −16)

@@ -37,12 +37,12 @@ def _cross_entropy_forward(
     loss_ptr,
     logsumexp_ptr,
     labels_ptr,
-    VOCAB_SIZE,
+    VOCAB_SIZE       : tl.constexpr,
     BLOCK_SIZE       : tl.constexpr,
-    DO_SOFTCAPPING,
-    SOFTCAP,
-    DO_LOGIT_SCALING,
-    LOGIT_SCALE,
+    DO_SOFTCAPPING   : tl.constexpr,
+    SOFTCAP          : tl.constexpr,
+    DO_LOGIT_SCALING : tl.constexpr,
+    LOGIT_SCALE      : tl.constexpr,
 ):
     """
         Cross Entropy Loss = 1/n sum [ -yi log(Pi) ]
@@ -111,13 +111,13 @@ def _chunked_cross_entropy_forward(
     loss_ptr,
     logsumexp_ptr,
     labels_ptr,
-    VOCAB_SIZE,
-    N_CHUNKS,
+    VOCAB_SIZE       : tl.constexpr,
+    N_CHUNKS         : tl.constexpr,
     BLOCK_SIZE       : tl.constexpr,
-    DO_SOFTCAPPING,
-    SOFTCAP,
-    DO_LOGIT_SCALING,
-    LOGIT_SCALE,
+    DO_SOFTCAPPING   : tl.constexpr,
+    SOFTCAP          : tl.constexpr,
+    DO_LOGIT_SCALING : tl.constexpr,
+    LOGIT_SCALE      : tl.constexpr,
 ):
     """
         256K vocab divided in 4 chunks
@@ -196,12 +196,12 @@ def _cross_entropy_backward(
     dloss_row_stride,
     logsumexp_ptr,
     labels_ptr,
-    VOCAB_SIZE,
+    VOCAB_SIZE       : tl.constexpr,
     BLOCK_SIZE       : tl.constexpr,
-    DO_SOFTCAPPING,
-    SOFTCAP,
-    DO_LOGIT_SCALING,
-    LOGIT_SCALE,
+    DO_SOFTCAPPING   : tl.constexpr,
+    SOFTCAP          : tl.constexpr,
+    DO_LOGIT_SCALING : tl.constexpr,
+    LOGIT_SCALE      : tl.constexpr,
 ):
     """
         CE_i = -y log(P) = y * (log[sum(exp(x))] - x)
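
For context, arguments declared `tl.constexpr` are treated by Triton as compile-time constants, so flags such as `DO_SOFTCAPPING` select a specialized kernel variant instead of being branched on at run time. A minimal, self-contained sketch of the same pattern (this is not Unsloth's kernel; names and shapes are illustrative):

```python
import torch
import triton
import triton.language as tl

@triton.jit
def scale_kernel(
    x_ptr, y_ptr,
    n_elements,
    SCALE      : tl.constexpr,  # baked in at compile time
    DO_SCALING : tl.constexpr,  # dead branch eliminated during specialization
    BLOCK_SIZE : tl.constexpr,
):
    pid = tl.program_id(0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask = mask)
    if DO_SCALING:
        x = x * SCALE
    tl.store(y_ptr + offsets, x, mask = mask)

# Requires a CUDA device.
x = torch.randn(1024, device = "cuda")
y = torch.empty_like(x)
grid = (triton.cdiv(x.numel(), 256),)
scale_kernel[grid](x, y, x.numel(), SCALE = 2.0, DO_SCALING = True, BLOCK_SIZE = 256)
```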

unsloth/kernels/layernorm.py  (+4 −2)

@@ -30,7 +30,8 @@ def layernorm_forward(
     b,
     r,
     mu,
-    n_cols, eps,
+    n_cols : tl.constexpr,
+    eps    : tl.constexpr,
     BLOCK_SIZE : tl.constexpr
 ):
     row_idx = tl.program_id(0)
@@ -68,7 +69,8 @@ def layernorm_backward(
     b,
     r,
     mu,
-    n_cols, eps,
+    n_cols : tl.constexpr,
+    eps    : tl.constexpr,
     BLOCK_SIZE : tl.constexpr
 ):
     # Approximately follows https://github.com/karpathy/llm.c/blob/master/doc/layernorm/layernorm.md

unsloth/kernels/rms_layernorm.py  (+10 −8)

@@ -22,9 +22,10 @@ def _rms_layernorm_forward(
     Y, Y_row_stride,
     X, X_row_stride,
     W, W_row_stride,
-    r, r_row_stride,
-    n_cols, eps,
-    BLOCK_SIZE : tl.constexpr
+    r, r_row_stride : tl.constexpr,
+    n_cols : tl.constexpr,
+    eps    : tl.constexpr,
+    BLOCK_SIZE : tl.constexpr,
 ):
     """
         Fast RMS Layernorm kernel
@@ -57,9 +58,10 @@ def _rms_layernorm_backward(
     dX, dX_row_stride,
     X, X_row_stride,
     W, W_row_stride,
-    r, r_row_stride,
+    r, r_row_stride : tl.constexpr,
     # dW, dW_row_stride,
-    n_cols, eps,
+    n_cols : tl.constexpr,
+    eps    : tl.constexpr,
     GEMMA : tl.constexpr,
     BLOCK_SIZE : tl.constexpr,
 ):
@@ -107,8 +109,9 @@ def _gemma_rms_layernorm_forward(
     Y, Y_row_stride,
     X, X_row_stride,
     W, W_row_stride,
-    r, r_row_stride,
-    n_cols, eps,
+    r, r_row_stride : tl.constexpr,
+    n_cols : tl.constexpr,
+    eps    : tl.constexpr,
     BLOCK_SIZE : tl.constexpr,
 ):
     # Copies https://github.com/google-deepmind/gemma/blob/main/gemma/layers.py#L31
@@ -253,7 +256,6 @@ def unpatch_rms_layernorm():
     except:
         pass
     return
-    return
 pass


unsloth/models/_utils.py  (+30 −1)

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-__version__ = "2025.3.10"
+__version__ = "2025.3.11"

 __all__ = [
     "SUPPORTS_BFLOAT16",
@@ -72,6 +72,7 @@
 platform_system = platform_system()
 import numpy as np
 import contextlib
+import re
 import warnings, subprocess, re, inspect, psutil, os, math
 from unsloth_zoo.utils import Version

@@ -181,6 +182,34 @@ def filter(self, x): return not (self.text in x.getMessage())
 except:
     pass

+# Patch get_model_param_count to record correct 4bit / 8bit
+from transformers.trainer_pt_utils import is_deepspeed_zero3_enabled
+def get_model_param_count(model, trainable_only = False):
+    """
+    Calculate model's total param count. If trainable_only is True then count only those requiring grads
+    """
+    if is_deepspeed_zero3_enabled():
+        def numel(p):
+            return p.ds_numel if hasattr(p, "ds_numel") else p.numel()
+    else:
+        def numel(p):
+            return p.numel()
+    s = sum(numel(p) for p in model.parameters() if not trainable_only or p.requires_grad)
+    if (not trainable_only) and \
+        hasattr(model, "config") and \
+        hasattr(model.config, "quantization_config"):
+
+        billions = re.findall(r"([0-9]{1,})(?:b|B)", model.config.name_or_path)
+        if len(billions) != 0:
+            billions = int(billions[0])
+            s = 1_000_000_000 * billions
+    pass
+    return s
+pass
+import transformers.trainer_pt_utils
+transformers.trainer_pt_utils.get_model_param_count = get_model_param_count
+import transformers.trainer
+transformers.trainer.get_model_param_count = get_model_param_count
 # =============================================

 # =============================================
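
The patched `get_model_param_count` falls back to parsing a parameter count out of the checkpoint name when a `quantization_config` is present, since `p.numel()` under-reports packed 4-bit weights. A quick illustration of that regex (the model names are examples only):

```python
import re

# "27b" -> approximately 27 billion parameters; only applied to quantized configs.
for name in ["unsloth/gemma-3-27b-pt-bnb-4bit", "unsloth/Meta-Llama-3.1-8B"]:
    billions = re.findall(r"([0-9]{1,})(?:b|B)", name)
    approx = 1_000_000_000 * int(billions[0]) if billions else None
    print(f"{name}: ~{approx:,} params" if approx else f"{name}: no match")
```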

unsloth/models/llama.py  (+20)

@@ -1663,6 +1663,10 @@ def from_pretrained(
         if platform.system().lower() == 'windows':
             print("Unsloth: vLLM does not work in Windows! Will use Unsloth inference!")
             fast_inference = False
+        major_version, minor_version = torch.cuda.get_device_capability()
+        if major_version < 7:
+            print("Unsloth: vLLM does not work on older GPUs - will switch to Unsloth inference!")
+            fast_inference = False
     pass

     if token is None: token = get_token()
@@ -1786,6 +1790,8 @@
             attn_implementation = "eager",
             **kwargs,
         )
+        model.fast_generate = model.generate
+        model.fast_generate_batches = None
     else:
         from unsloth_zoo.vllm_utils import (
             load_vllm,
@@ -1804,6 +1810,7 @@
             enable_lora = True,
             max_lora_rank = max_lora_rank,
             disable_log_stats = disable_log_stats,
+            use_bitsandbytes = load_in_4bit,
         )
         for allowed_arg in allowed_args:
             if allowed_arg not in load_vllm_kwargs and allowed_arg in kwargs:
@@ -2651,6 +2658,19 @@ def patch_peft_model(
             torch.cuda.empty_cache()
         pass

+        # Patch for fast inference
+        vllm_engine = getattr(model.model, "vllm_engine", None)
+        if vllm_engine is not None:
+            model.vllm_engine = model.model.vllm_engine
+            model.fast_generate = model.model.fast_generate
+            model.fast_generate_batches = model.model.fast_generate_batches
+
+            # Also saving and loading LoRA
+            from unsloth_zoo.vllm_utils import save_lora, load_lora
+            model.save_lora = functools.partial(save_lora, model)
+            model.load_lora = functools.partial(load_lora, model)
+        pass
+
         # Add for_inference and for_training
         model.for_training  = functools.partial(FastLlamaModel.for_training,  model)
         model.for_inference = functools.partial(FastLlamaModel.for_inference, model)
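
Taken together, these hunks route fast inference through either a vLLM engine or plain `generate` (older GPUs and Windows fall back, with `fast_generate` aliasing `model.generate`), and attach LoRA save/load helpers when a vLLM engine exists. A hedged usage sketch; the model name and LoRA hyperparameters are illustrative, not prescribed by this commit:

```python
from unsloth import FastLanguageModel
from vllm import SamplingParams

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = "unsloth/Meta-Llama-3.1-8B-Instruct",
    max_seq_length = 2048,
    load_in_4bit   = True,   # now forwarded to vLLM as use_bitsandbytes
    fast_inference = True,   # loads a vLLM engine when the GPU supports it
)
model = FastLanguageModel.get_peft_model(
    model, r = 16, lora_alpha = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"],
)

outputs = model.fast_generate(
    ["Why is the sky blue?"],
    sampling_params = SamplingParams(max_tokens = 64),
)
model.save_lora("my_lora")  # provided via unsloth_zoo.vllm_utils.save_lora
```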

unsloth/models/loader.py  (+15 −5)

@@ -405,7 +405,6 @@ def from_pretrained(
         if is_peft:
             # From https://github.com/huggingface/peft/issues/184
             # Now add PEFT adapters
-            model.enable_input_require_grads()
             model = PeftModel.from_pretrained(
                 model,
                 old_model_name,
@@ -498,10 +497,22 @@
             raise RuntimeError("Unsloth: Pixtral only works on transformers >= 4.49.0." + LATEST)
         elif "qwen2.5" in model_name.lower() and transformers_version < Version("4.49.0"):
             raise RuntimeError("Unsloth: Qwen 2.5 only works on transformers >= 4.49.0." + LATEST)
-        elif "aya-vision" in model_name.lower() and transformers_version < Version("4.50.0.dev0"):
-            raise RuntimeError("Unsloth: Aya Vision only works on transformers >= 4.50.0." + NIGHTLY)
+        elif "aya-vision" in model_name.lower():
+            # Disable compiling for now - errors out!
+            os.environ["UNSLOTH_COMPILE_DISABLE"] = "1"
+            if transformers_version < Version("4.50.0.dev0"):
+                raise RuntimeError("Unsloth: Aya Vision only works on transformers >= 4.50.0." + NIGHTLY)
         elif "gemma-3" in model_name.lower() and transformers_version < Version("4.50.0.dev0"):
             raise RuntimeError("Unsloth: Gemma 3 only works on transformers >= 4.50.0." + NIGHTLY)
+        elif "c4ai-command-a-03-2025" in model_name.lower() and transformers_version < Version("4.50.0.dev0"):
+            raise RuntimeError("Unsloth: Cohere's Command model only works on transformers >= 4.50.0." + NIGHTLY)
+        elif "granite-vision" in model_name.lower():
+            # Disable compiling for now - errors out!
+            os.environ["UNSLOTH_COMPILE_DISABLE"] = "1"
+            if transformers_version < Version("4.50.0.dev0"):
+                raise RuntimeError("Unsloth: Granite Vision only works on transformers >= 4.50.0." + NIGHTLY)
+        elif "olmo-2" in model_name.lower() and transformers_version < Version("4.50.0.dev0"):
+            raise RuntimeError("Unsloth: OLMo-2 only works on transformers >= 4.50.0." + NIGHTLY)
         pass

         if USE_MODELSCOPE and not os.path.exists(model_name):
@@ -668,7 +679,7 @@
             use_gradient_checkpointing = use_gradient_checkpointing,
             *args, **kwargs,
         )
-
+
         if resize_model_vocab is not None:
             model.resize_token_embeddings(resize_model_vocab)
         pass
@@ -703,7 +714,6 @@
         if is_peft:
             # From https://github.com/huggingface/peft/issues/184
             # Now add PEFT adapters
-            model.enable_input_require_grads()
             model = PeftModel.from_pretrained(
                 model,
                 old_model_name,
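
Note that the Aya Vision and Granite Vision branches above switch Unsloth's compiler off via an environment variable before the version check runs. The same switch can be set manually before loading other troublesome models; a minimal sketch (the model choice is illustrative):

```python
import os
# Must be set before the model is loaded so Unsloth skips its compiler patches,
# mirroring what loader.py now does for aya-vision and granite-vision.
os.environ["UNSLOTH_COMPILE_DISABLE"] = "1"

from unsloth import FastModel
model, tokenizer = FastModel.from_pretrained(
    model_name   = "unsloth/granite-vision-3.2-2b",
    load_in_4bit = True,
)
```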

unsloth/models/mapper.py  (+40)

@@ -62,6 +62,16 @@
         "unsloth/llama-2-7b-chat",
         "meta-llama/Llama-2-7b-chat-hf",
     ),
+    "unsloth/Mixtral-8x7B-v0.1-unsloth-bnb-4bit" : (
+        "unsloth/Mixtral-8x7B-v0.1",
+        "mistralai/Mixtral-8x7B-v0.1",
+        "unsloth/Mixtral-8x7B-v0.1-bnb-4bit",
+    ),
+    "unsloth/Mixtral-8x7B-Instruct-v0.1-unsloth-bnb-4bit" : (
+        "unsloth/Mixtral-8x7B-Instruct-v0.1",
+        "mistralai/Mixtral-8x7B-Instruct-v0.1",
+        "unsloth/Mixtral-8x7B-Instruct-v0.1-bnb-4bit",
+    ),
     "unsloth/codellama-7b-bnb-4bit" : (
         "unsloth/codellama-7b",
         "codellama/CodeLlama-7b-hf",
@@ -678,6 +688,36 @@
         "google/gemma-3-27b-pt",
         "unsloth/gemma-3-27b-pt-bnb-4bit",
     ),
+    "unsloth/reka-flash-3-unsloth-bnb-4bit" : (
+        "unsloth/reka-flash-3",
+        "RekaAI/reka-flash-3",
+        "unsloth/reka-flash-3-bnb-4bit",
+    ),
+    "unsloth/c4ai-command-a-03-2025-unsloth-bnb-4bit" : (
+        "unsloth/c4ai-command-a-03-2025",
+        "CohereForAI/c4ai-command-a-03-2025",
+        "unsloth/c4ai-command-a-03-2025-bnb-4bit",
+    ),
+    "unsloth/aya-vision-32b-unsloth-bnb-4bit" : (
+        "unsloth/aya-vision-32b",
+        "CohereForAI/aya-vision-32b",
+        "unsloth/aya-vision-32b-bnb-4bit",
+    ),
+    "unsloth/aya-vision-8b-unsloth-bnb-4bit" : (
+        "unsloth/aya-vision-8b",
+        "CohereForAI/aya-vision-8b",
+        "unsloth/aya-vision-8b-bnb-4bit",
+    ),
+    "unsloth/granite-vision-3.2-2b-unsloth-bnb-4bit" : (
+        "unsloth/granite-vision-3.2-2b",
+        "ibm-granite/granite-vision-3.2-2b",
+        "unsloth/granite-vision-3.2-2b-bnb-4bit",
+    ),
+    "unsloth/OLMo-2-0325-32B-Instruct-unsloth-bnb-4bit" : (
+        "unsloth/OLMo-2-0325-32B-Instruct",
+        "allenai/OLMo-2-0325-32B-Instruct",
+        "unsloth/OLMo-2-0325-32B-Instruct-bnb-4bit",
+    ),
 }

 INT_TO_FLOAT_MAPPER = {}
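
Each added entry keeps the existing mapper shape: the dynamically quantized `-unsloth-bnb-4bit` repo is keyed to what appears to be its 16-bit Unsloth mirror, the upstream repo, and the plain `-bnb-4bit` variant. A toy lookup mirroring one of the new entries (the helper is hypothetical, not the library's own resolution logic):

```python
# Mirrors the structure of the entries added above.
MODEL_SOURCES = {
    "unsloth/reka-flash-3-unsloth-bnb-4bit": (
        "unsloth/reka-flash-3",          # 16-bit Unsloth mirror (assumed ordering)
        "RekaAI/reka-flash-3",           # original upstream repo
        "unsloth/reka-flash-3-bnb-4bit", # plain bnb 4-bit variant
    ),
}

def full_precision_name(quantized_name: str) -> str:
    """Return the first (16-bit) source listed for a quantized repo name."""
    return MODEL_SOURCES[quantized_name][0]

print(full_precision_name("unsloth/reka-flash-3-unsloth-bnb-4bit"))
```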
