@@ -209,8 +209,9 @@ def LlamaAttention_fast_forward_inference(
 
     # Attention
     if bsz == 1:
+        Qn *= self.scalar # See https://github.com/ggerganov/llama.cpp/issues/7805#issuecomment-2153349963
+        # It seems like doing (Q * scalar) @ K is better than (Q @ K) * scalar to stop overflows
         A = torch.matmul(Qn, Knn.transpose(2, 3), out = self.attention[:,:,:,:cached_len])
-        A *= self.scalar
         # if attention_mask is not None: A += attention_mask # Must add attention_mask for batched
         A[:] = torch_nn_functional_softmax(A, dim = -1, dtype = torch.float32)#.to(A.dtype)
         A = torch.matmul(A, Vnn, out = Qn)
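The reordering in this hunk matters in float16: with head_dim = d, an unscaled Q @ K^T dot product can exceed fp16's maximum (65504) and overflow to inf before the 1/sqrt(d) scalar is ever applied, whereas pre-scaling Q keeps every partial product small. A minimal sketch of the effect, with made-up shapes and values (the dot product is emulated with an elementwise product and sum so it runs on any device):

import torch

head_dim = 128
scalar = head_dim ** -0.5                      # 1/sqrt(d), standard attention scaling
q = torch.full((head_dim,), 30.0, dtype = torch.float16)
k = torch.full((head_dim,), 30.0, dtype = torch.float16)

scale_after  = (q * k).sum() * scalar          # raw dot product = 115200 > 65504 -> inf in fp16
scale_before = ((q * scalar) * k).sum()        # each term ~80, sum ~10184 -> finite

print(scale_after.item(), scale_before.item())  # inf vs ~10184.0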
@@ -791,7 +792,7 @@ def _CausalLM_fast_forward(
         *args, **kwargs,
     ) -> Union[Tuple, CausalLMOutputWithPast]:
 
-        if past_key_values is not None and self.config.model_type != "qwen2":
+        if past_key_values is not None:
             outputs = fast_forward_inference(
                 self,
                 input_ids,
@@ -1195,7 +1196,13 @@ def from_pretrained(
         f"\\        /    Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\
         f' "-____-"     Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}'
         logger.warning(debug_info)
-        import gc
+        import subprocess, re, gc
+        output = subprocess.check_output(
+            'nvidia-smi --query-gpu=memory.used --format=csv', shell = True)
+        output = re.findall(rb'([\\d]{1,})[\\s]{1,}M', output)
+        output = sum(int(x.decode('utf-8'))/1024 > 4 for x in output)
+        if output > 1: raise RuntimeError(
+            'Error: More than 1 GPUs have a lot of VRAM usage. Please obtain a commercial license.')
         for _ in range(3):
             gc.collect()
             torch.cuda.empty_cache()"""
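The added check shells out to nvidia-smi, regex-parses the per-GPU used-memory column, and counts how many GPUs are using more than 4 GiB. A standalone sketch of the same parsing with fabricated nvidia-smi output, so it runs without a GPU:

import re

# Fabricated `nvidia-smi --query-gpu=memory.used --format=csv` output for 3 hypothetical GPUs
sample = b"memory.used [MiB]\n6144 MiB\n2048 MiB\n8192 MiB\n"

used_mib = re.findall(rb'([\d]{1,})[\s]{1,}M', sample)           # -> [b'6144', b'2048', b'8192']
busy = sum(int(x.decode('utf-8')) / 1024 > 4 for x in used_mib)  # GPUs using more than 4 GiB
print(busy)  # 2 -> more than 1 busy GPU, so the patched code would raise RuntimeError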
@@ -1206,12 +1213,12 @@ def from_pretrained(
 
         debug_info = """n_total_devices = total_train_batch_size // \\
             args.gradient_accumulation_steps // self._train_batch_size
-        if n_total_devices > 2:
+        if n_total_devices > 1:
             logger.warning_once(
-                "Our OSS was designed for people with few GPU resources to level the playing field.\\n"
-                "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n"
-                "We're a 2 person team, so we still have to fund our development costs - thanks!\\n"
-                "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!",
+                "* Our OSS was designed for people with few GPU resources to level the playing field.\\n"
+                "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n"
+                "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n"
+                "* If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!",
             )
         debug_info ="""
         debug_info = debug_info.split('\n')
@@ -1236,17 +1243,17 @@ def from_pretrained(
         bsz = self._train_batch_size
         total_batches = bsz * ga * args.world_size
         n_total_devices = total_batches // ga // bsz
-        if n_total_devices > 2:
+        if n_total_devices > 1:
             logger.warning_once(
-                "Our OSS was designed for people with few GPU resources to level the playing field.\\n"
-                "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n"
-                "We're a 2 person team, so we still have to fund our development costs - thanks!\\n"
-                "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!",
+                "* Our OSS was designed for people with few GPU resources to level the playing field.\\n"
+                "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n"
+                "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n"
+                "* If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!",
             )
-            divisor = n_total_devices / 2
+            divisor = n_total_devices / 1
             bsz = self._train_batch_size = max(int(bsz / divisor), 1)
-            if total_batches // ga // bsz > 2:
-                divisor = n_total_devices / 2
+            if total_batches // ga // bsz > 1:
+                divisor = n_total_devices / 1
                 ga = args.gradient_accumulation_steps = max(int(ga / divisor), 1)"""
         check_batches = check_batches.split('\n')
         check_batches = "\n".join([check_batches[0]] + [front_spaces + x[8:] for x in check_batches[1:]])
@@ -1830,10 +1837,10 @@ def patch_peft_model(
 
     @staticmethod
     def for_inference(model):
-        if model.config.model_type == "qwen2":
-            FastLlamaModel.for_training(model)
-            return
-        pass
+        # if model.config.model_type == "qwen2":
+        #     FastLlamaModel.for_training(model)
+        #     return
+        # pass
 
         internal_model = model
         internal_model.gradient_checkpointing = False