modelscope · Jintao-Huang · Aug 21, 2025 · Aug 21, 2025 · Aug 21, 2025 · Aug 21, 2025
diff --git a/examples/models/ovis2/train.sh b/examples/models/ovis2/train.sh
@@ -1,23 +1,25 @@
-# 28GiB
+# 17GiB
 
 pip install "transformers==4.51.*"
 
 CUDA_VISIBLE_DEVICES=0 \
 swift sft \
-    --model AIDC-AI/Ovis2-8B \
-    --dataset 'modelscope/coco_2014_caption:validation#20000' \
+    --model AIDC-AI/Ovis2.5-2B \
+    --dataset 'AI-ModelScope/LaTeX_OCR:human_handwrite#20000' \
     --split_dataset_ratio 0.01 \
     --train_type lora \
     --torch_dtype bfloat16 \
     --num_train_epochs 1 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
+    --per_device_train_batch_size 16 \
+    --per_device_eval_batch_size 16 \
+    --attn_impl flash_attn \
+    --padding_free true \
     --learning_rate 1e-4 \
     --lora_rank 8 \
     --lora_alpha 32 \
     --target_modules all-linear \
     --freeze_vit true \
-    --gradient_accumulation_steps 16 \
+    --gradient_accumulation_steps 1 \
     --eval_steps 50 \
     --save_steps 50 \
     --save_total_limit 2 \

diff --git a/swift/llm/argument/train_args.py b/swift/llm/argument/train_args.py
@@ -140,25 +140,6 @@ def _check_padding_free(self):
                 raise ValueError(f'The "{feature}" feature requires a flash attention implementation. '
                                  'Please use one of: "flash_attn", "flash_attention_2", "flash_attention_3".')
 
-            if self.model_meta.is_multimodal:
-                supported_model_type = [
-                    'qwen2_vl',
-                    'qwen2_5_vl',
-                    'qwen2_5_omni',
-                    'qvq',
-                    'mimo_vl',
-                    'internvl',
-                    'internvl_phi3',
-                    'internvl2',
-                    'internvl2_phi3',
-                    'internvl2_5',
-                    'internvl3',
-                ]
-                if self.model_type not in supported_model_type:
-                    raise ValueError(
-                        f'Packing/padding_free is not supported for model_type `{self.model_type}`. '
-                        f'model_type of multimodal models that support packing/padding_free: {supported_model_type}.')
-
     def __post_init__(self) -> None:
         if self.resume_from_checkpoint:
             self.resume_from_checkpoint = to_abspath(self.resume_from_checkpoint, True)

diff --git a/swift/llm/template/base.py b/swift/llm/template/base.py
@@ -49,6 +49,7 @@ class Template(ProcessorMixin):
     skip_prompt = True
     use_model = False
     norm_bbox = 'norm1000'
+    support_padding_free = False  # Whether padding_free/packing is supported; only enforced for multimodal models.
 
     is_encoder_decoder = False
 

diff --git a/swift/llm/template/template/internvl.py b/swift/llm/template/template/internvl.py
@@ -20,6 +20,7 @@ class InternvlTemplate(Template):
     skip_prompt = False
     num_image_token = None
     placeholder_tokens = ['<IMG_CONTEXT>']
+    support_padding_free = True
 
     def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                     inputs: StdTemplateInputs) -> List[Context]:

diff --git a/swift/llm/template/template/qwen.py b/swift/llm/template/template/qwen.py
@@ -227,6 +227,7 @@ class Qwen2VLTemplate(Template):
     placeholder_tokens = ['<|image_pad|>', '<|video_pad|>']
     version = 'v2'
     use_model = True
+    support_padding_free = True
 
     def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                     inputs: StdTemplateInputs) -> List[Context]:
@@ -737,6 +738,7 @@ class Ovis2_5Template(ThinkingTemplate):
     num_frames = 8
     use_model = True
     skip_prompt = False
+    support_padding_free = True
 
     def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                     inputs: StdTemplateInputs) -> List[Context]:

diff --git a/swift/llm/train/sft.py b/swift/llm/train/sft.py
@@ -58,10 +58,13 @@ def _prepare_model_tokenizer(self, load_model=True):
         self._prepare_generation_config()
 
     def _prepare_template(self) -> None:
-        template = self.args.get_template(self.processor)
+        args = self.args
+        template = args.get_template(self.processor)
         template.set_mode('train')
         if template.use_model:
             template.model = self.model
+        if args.model_meta.is_multimodal and (args.padding_free or args.packing) and not template.support_padding_free:
+            raise ValueError(f'Template `{args.template}` does not support padding free or packing.')
         self.template = template
 
     def _get_dataset(self):