Commit 8a9e24e
Ollama Chat Templates (#582)
* Update llama.py * offload * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * continued pretraining trainer * Update trainer.py * Update trainer.py * Update trainer.py * Update trainer.py * is_bfloat16_supported * Update __init__.py * Update README.md * Update llama.py * is_bfloat16_supported * Update __init__.py * Mistral v3 * Phi 3 medium * Update chat_templates.py * Update chat_templates.py * Phi-3 * Update save.py * Update README.md Mistral v3 to Mistral v0.3 * Untrained tokens * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update llama.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update save.py * Update save.py * Update save.py * checkpoint * Update _utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update tokenizer_utils.py * Update llama.py * accelerate * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update _utils.py * Update tokenizer_utils.py * train_dataloader * Update llama.py * Update llama.py * Update llama.py * use_fast_convert * Update save.py * Update save.py * Update save.py * Update save.py * remove_special_tokens * Ollama * Update chat_templates.py * Update chat_templates.py * Update chat_templates.py * Update llama.py * Update chat_templates.py * Support bfloat16 GGUF * Update save.py * Update llama.py * fast_forward_inference * Update mapper.py * Update loader.py * Update llama.py * Update tokenizer_utils.py * info * edits * Create chat template * Fix tokenizer --------- Co-authored-by: Michael Han <[email protected]>
1 parent 8d9bd0e commit 8a9e24e

9 files changed: +1015, -161 lines

unsloth/chat_templates.py (+646, -33)

Large diffs are not rendered by default.
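The bulk of this commit lives in this file: the Ollama-focused chat template additions named in the commit title. As a rough illustration of how the bundled templates are consumed (a minimal sketch, assuming the existing get_chat_template helper and its mapping parameter; the model and template names are only examples, not part of this diff):

from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

# Load any supported model/tokenizer pair first (example model name).
model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/llama-3-8b-bnb-4bit", load_in_4bit = True,
)

# Attach one of the bundled templates and remap ShareGPT-style keys.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml",
    mapping = {"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
)

text = tokenizer.apply_chat_template(
    [{"from": "human", "value": "Hello!"}],
    tokenize = False, add_generation_prompt = True,
)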

unsloth/models/_utils.py (+1, -1)

@@ -31,7 +31,7 @@
 import os
 import psutil
 
-__version__ = "2024.5"
+__version__ = "2024.6"
 
 # Get Flash Attention v2 if Ampere (RTX 30xx, A100)
 major_version, minor_version = torch.cuda.get_device_capability()

unsloth/models/llama.py (+27, -20)
@@ -209,8 +209,9 @@ def LlamaAttention_fast_forward_inference(
 
     # Attention
     if bsz == 1:
+        Qn *= self.scalar # See https://github.com/ggerganov/llama.cpp/issues/7805#issuecomment-2153349963
+        # It seems like doing (Q * scalar) @ K is better than (Q @ K) * scalar to stop overflows
         A = torch.matmul(Qn, Knn.transpose(2, 3), out = self.attention[:,:,:,:cached_len])
-        A *= self.scalar
         # if attention_mask is not None: A += attention_mask # Must add attention_mask for batched
         A[:] = torch_nn_functional_softmax(A, dim = -1, dtype = torch.float32)#.to(A.dtype)
         A = torch.matmul(A, Vnn, out = Qn)
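Scaling Q before the matmul matters because the unscaled Q @ K product can already overflow float16 before the 1/sqrt(d) factor is applied. A toy sketch of the ordering effect (elementwise rather than the real batched matmul; the numbers are invented):

import torch

scalar = 0.0884  # roughly 1/sqrt(128), as for a 128-dim head
q = torch.tensor([300.0], dtype = torch.float16)
k = torch.tensor([400.0], dtype = torch.float16)

print(q * k * scalar)   # tensor([inf]): 120000 already overflows float16 before scaling
print(q * scalar * k)   # finite (~1.06e4) when Q is scaled first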
@@ -791,7 +792,7 @@ def _CausalLM_fast_forward(
         *args, **kwargs,
     ) -> Union[Tuple, CausalLMOutputWithPast]:
 
-        if past_key_values is not None and self.config.model_type != "qwen2":
+        if past_key_values is not None:
             outputs = fast_forward_inference(
                 self,
                 input_ids,
@@ -1195,7 +1196,13 @@ def from_pretrained(
     f"\\ / Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\
     f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}'
 logger.warning(debug_info)
-import gc
+import subprocess, re, gc
+output = subprocess.check_output(
+    'nvidia-smi --query-gpu=memory.used --format=csv', shell = True)
+output = re.findall(rb'([\\d]{1,})[\\s]{1,}M', output)
+output = sum(int(x.decode('utf-8'))/1024 > 4 for x in output)
+if output > 1: raise RuntimeError(
+    'Error: More than 1 GPUs have a lot of VRAM usage. Please obtain a commercial license.')
 for _ in range(3):
     gc.collect()
     torch.cuda.empty_cache()"""
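The injected block shells out to nvidia-smi and counts how many GPUs report more than 4 GB of used memory, raising if that count exceeds one. A standalone sketch of the same parsing step against a canned nvidia-smi output, so it runs without a GPU (the numbers are invented):

import re

# Shape of `nvidia-smi --query-gpu=memory.used --format=csv` output (values made up).
sample = b"memory.used [MiB]\n11024 MiB\n102 MiB\n"

used_mib = re.findall(rb'([\d]{1,})[\s]{1,}M', sample)
gpus_over_4gb = sum(int(x.decode('utf-8')) / 1024 > 4 for x in used_mib)
print(gpus_over_4gb)   # 1 -> only one busy GPU, so the patched check would not raise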
@@ -1206,12 +1213,12 @@ def from_pretrained(
 
 debug_info = """n_total_devices = total_train_batch_size // \\
     args.gradient_accumulation_steps // self._train_batch_size
-if n_total_devices > 2:
+if n_total_devices > 1:
     logger.warning_once(
-        "Our OSS was designed for people with few GPU resources to level the playing field.\\n"
-        "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n"
-        "We're a 2 person team, so we still have to fund our development costs - thanks!\\n"
-        "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!",
+        "* Our OSS was designed for people with few GPU resources to level the playing field.\\n"
+        "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n"
+        "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n"
+        "* If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!",
     )
 debug_info ="""
 debug_info = debug_info.split('\n')
@@ -1236,17 +1243,17 @@ def from_pretrained(
 bsz = self._train_batch_size
 total_batches = bsz * ga * args.world_size
 n_total_devices = total_batches // ga // bsz
-if n_total_devices > 2:
+if n_total_devices > 1:
     logger.warning_once(
-        "Our OSS was designed for people with few GPU resources to level the playing field.\\n"
-        "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n"
-        "We're a 2 person team, so we still have to fund our development costs - thanks!\\n"
-        "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!",
+        "* Our OSS was designed for people with few GPU resources to level the playing field.\\n"
+        "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n"
+        "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n"
+        "* If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!",
     )
-    divisor = n_total_devices / 2
+    divisor = n_total_devices / 1
     bsz = self._train_batch_size = max(int(bsz / divisor), 1)
-    if total_batches // ga // bsz > 2:
-        divisor = n_total_devices / 2
+    if total_batches // ga // bsz > 1:
+        divisor = n_total_devices / 1
     ga = args.gradient_accumulation_steps = max(int(ga / divisor), 1)"""
 check_batches = check_batches.split('\n')
 check_batches = "\n".join([check_batches[0]] + [front_spaces + x[8:] for x in check_batches[1:]])
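The net effect of dropping the thresholds from 2 to 1 is that any run detected as multi-device has its per-device batch size and gradient accumulation steps divided back down. A worked example of the arithmetic with hypothetical values:

# Hypothetical values: 2 devices, per-device batch size 4, 8 accumulation steps.
bsz, ga, world_size = 4, 8, 2
total_batches   = bsz * ga * world_size          # 64
n_total_devices = total_batches // ga // bsz     # 2 -> triggers the > 1 branch
divisor = n_total_devices / 1                    # 2.0
bsz = max(int(bsz / divisor), 1)                 # 2
if total_batches // ga // bsz > 1:               # 64 // 8 // 2 = 4
    ga = max(int(ga / divisor), 1)               # 4
print(bsz, ga)                                   # 2 4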
@@ -1830,10 +1837,10 @@ def patch_peft_model(
 
     @staticmethod
     def for_inference(model):
-        if model.config.model_type == "qwen2":
-            FastLlamaModel.for_training(model)
-            return
-        pass
+        # if model.config.model_type == "qwen2":
+        #     FastLlamaModel.for_training(model)
+        #     return
+        # pass
 
         internal_model = model
         internal_model.gradient_checkpointing = False

unsloth/models/loader.py (+3)

@@ -33,6 +33,9 @@
 
 def _get_model_name(model_name, load_in_4bit = True):
 
+    # First try replacing lowercase 'b' with uppercase 'B'
+    model_name = model_name.lower()
+
     if not SUPPORTS_FOURBIT and model_name in INT_TO_FLOAT_MAPPER:
         model_name = INT_TO_FLOAT_MAPPER[model_name]
         logger.warning_once(

unsloth/models/mapper.py (+8)

@@ -197,4 +197,12 @@
     for value in values:
         FLOAT_TO_INT_MAPPER[value] = key
     pass
+
+    # Get lowercased
+    lowered_key = key.lower()
+    INT_TO_FLOAT_MAPPER[lowered_key] = values[0].lower()
+
+    for value in values:
+        FLOAT_TO_INT_MAPPER[value.lower()] = lowered_key
+    pass
 pass
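Together with the lowercasing added in loader.py above, this makes model-name lookups case-insensitive. A small sketch with an invented key/values pair (not the real mapper tables) showing how both directions end up populated:

INT_TO_FLOAT_MAPPER, FLOAT_TO_INT_MAPPER = {}, {}

# Invented example entry for illustration only.
key, values = "unsloth/Llama-3-8B-bnb-4bit", ("unsloth/Llama-3-8B", "meta-llama/Meta-Llama-3-8B")

# Existing behaviour: original-case entries.
INT_TO_FLOAT_MAPPER[key] = values[0]
for value in values:
    FLOAT_TO_INT_MAPPER[value] = key

# New in this commit: lowercased variants, so a lowercased lookup still resolves.
lowered_key = key.lower()
INT_TO_FLOAT_MAPPER[lowered_key] = values[0].lower()
for value in values:
    FLOAT_TO_INT_MAPPER[value.lower()] = lowered_key

print(INT_TO_FLOAT_MAPPER["unsloth/Llama-3-8B-bnb-4bit".lower()])  # unsloth/llama-3-8b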

unsloth/models/mistral.py (+25, -29)

@@ -393,21 +393,6 @@ def from_pretrained(
         layer.self_attn.apply_o = original_apply_o
     pass
 
-    # Patch Trainer
-    from transformers.trainer import Trainer
-    if Trainer._inner_training_loop.__name__ != "_fast_inner_training_loop":
-        try:
-            inner_training_loop = inspect.getsource(Trainer._inner_training_loop)
-        except:
-            raise RuntimeError(
-                "Our OSS was designed for people with few GPU resources to level the playing field.\n"
-                "The OSS Apache 2 license only supports four GPUs - please obtain a commercial license from our website.\n"
-                "We're a 2 person team, so we still have to fund our development costs - thanks!\n"
-                "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!",
-            )
-        pass
-    pass
-
     # Patch Trainer
     from transformers.trainer import Trainer
     try:
@@ -419,7 +404,7 @@ def from_pretrained(
         except:
             raise RuntimeError(
                 "Our OSS was designed for people with few GPU resources to level the playing field.\n"
-                "The OSS Apache 2 license only supports four GPUs - please obtain a commercial license from our website.\n"
+                "The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\n"
                 "We're a 2 person team, so we still have to fund our development costs - thanks!\n"
                 "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!",
             )
@@ -447,20 +432,30 @@ def from_pretrained(
     f"O^O/ \\_/ \\ Batch size per device = {self._train_batch_size:,} | Gradient Accumulation steps = {args.gradient_accumulation_steps}\\n"\\
     f"\\ / Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\
     f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}'
-logger.warning_once(debug_info)"""
+logger.warning(debug_info)
+import subprocess, re, gc
+output = subprocess.check_output(
+    'nvidia-smi --query-gpu=memory.used --format=csv', shell = True)
+output = re.findall(rb'([\\d]{1,})[\\s]{1,}M', output)
+output = sum(int(x.decode('utf-8'))/1024 > 4 for x in output)
+if output > 1: raise RuntimeError(
+    'Error: More than 1 GPUs have a lot of VRAM usage. Please obtain a commercial license.')
+for _ in range(3):
+    gc.collect()
+    torch.cuda.empty_cache()"""
 
 debug_info = debug_info.split('\n')
 debug_info = "\n".join([debug_info[0]] + [spaces + x[8:] for x in debug_info[1:]])
 inner_training_loop = inner_training_loop.replace(original_debug, debug_info)
 
 debug_info = """n_total_devices = total_train_batch_size // \\
     args.gradient_accumulation_steps // self._train_batch_size
-if n_total_devices > 2:
+if n_total_devices > 1:
     logger.warning_once(
-        "Our OSS was designed for people with few GPU resources to level the playing field.\\n"
-        "The OSS Apache 2 license only supports four GPUs - please obtain a commercial license from our website.\\n"
-        "We're a 2 person team, so we still have to fund our development costs - thanks!\\n"
-        "If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!",
+        "* Our OSS was designed for people with few GPU resources to level the playing field.\\n"
+        "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n"
+        "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n"
+        "* If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!",
     )
 debug_info ="""
 debug_info = debug_info.split('\n')
@@ -485,16 +480,17 @@ def from_pretrained(
 bsz = self._train_batch_size
 total_batches = bsz * ga * args.world_size
 n_total_devices = total_batches // ga // bsz
-if n_total_devices > 2:
+if n_total_devices > 1:
     logger.warning_once(
-        "Please consider a commercial license - Unsloth was designed for the GPU Poor.\\n"
-        "The OSS currently works on 4 GPUs - we're a 2 person team, so please help fund\\n"
-        "our development costs by supporting us through Ko-fi or buying a license! Thanks!",
+        "* Our OSS was designed for people with few GPU resources to level the playing field.\\n"
+        "* The OSS Apache 2 license only supports one GPU - please obtain a commercial license.\\n"
+        "* We're a 2 person team, so we still have to fund our development costs - thanks!\\n"
+        "* If you don't, please consider at least sponsoring us through Ko-fi! Appreciate it!",
     )
-    divisor = n_total_devices / 2
+    divisor = n_total_devices / 1
     bsz = self._train_batch_size = max(int(bsz / divisor), 1)
-    if total_batches // ga // bsz > 2:
-        divisor = n_total_devices / 2
+    if total_batches // ga // bsz > 1:
+        divisor = n_total_devices / 1
     ga = args.gradient_accumulation_steps = max(int(ga / divisor), 1)"""
 check_batches = check_batches.split('\n')
 check_batches = "\n".join([check_batches[0]] + [front_spaces + x[8:] for x in check_batches[1:]])

unsloth/models/qwen2.py (+2, -3)

@@ -13,7 +13,6 @@
 # limitations under the License.
 
 from .llama import *
-from .mistral import FastMistralModel
 import os
 from ._utils import __version__
 
@@ -60,7 +59,7 @@ def pre_patch():
 
     @staticmethod
     def from_pretrained(
-        model_name = "Qwen/Qwen1.5-7B",
+        model_name = "Qwen/Qwen2-7B",
         max_seq_length = 4096,
         dtype = None,
         load_in_4bit = True,
@@ -73,7 +72,7 @@ def pre_patch():
         trust_remote_code = False,
         **kwargs,
     ):
-        return FastMistralModel.from_pretrained(
+        return FastLlamaModel.from_pretrained(
             model_name = model_name,
             max_seq_length = max_seq_length,
             dtype = dtype,
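With Qwen2 now routed through FastLlamaModel and the default checkpoint bumped to Qwen/Qwen2-7B, loading looks the same as for any other supported model. A hedged usage sketch (assuming the usual FastLanguageModel entry point, which is not part of this diff):

from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = "Qwen/Qwen2-7B",  # new default set by this commit
    max_seq_length = 4096,
    dtype          = None,             # auto-detect
    load_in_4bit   = True,
)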
