7 | 7 |
8 | 8 | import os
9 | 9 | import shutil
| 10 | +from pathlib import Path
10 | 11 |
11 | 12 | import numpy as np
12 | 13 | import pytest
| 14 | +import requests
13 | 15 | import torch.optim as optim
14 | 16 | from torch.utils.data import DataLoader
15 | 17 |
16 | 18 | import QEfficient
17 | 19 | import QEfficient.cloud.finetune
18 | 20 | from QEfficient.cloud.finetune import main as finetune
19 | 21 |
| 22 | +alpaca_json_path = Path.cwd() / "alpaca_data.json"
| 23 | +
20 | 24 |
21 | 25 | def clean_up(path):
22 | | -    if os.path.exists(path):
| 26 | +    if os.path.isdir(path):
23 | 27 |         shutil.rmtree(path)
| 28 | +    if os.path.isfile(path):
| 29 | +        os.remove(path)
| 30 | +
| 31 | +
| 32 | +def download_alpaca():
| 33 | +    alpaca_url = "https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/refs/heads/main/alpaca_data.json"
| 34 | +    response = requests.get(alpaca_url)
| 35 | +
| 36 | +    with open(alpaca_json_path, "wb") as f:
| 37 | +        f.write(response.content)
24 | 38 |
25 | 39 |
26 | 40 | configs = [
27 | 41 |     pytest.param(
28 | 42 |         "meta-llama/Llama-3.2-1B",  # model_name
| 43 | +        "generation",  # task_type
29 | 44 |         10,  # max_eval_step
30 | 45 |         20,  # max_train_step
| 46 | +        "gsm8k_dataset",  # dataset_name
| 47 | +        None,  # data_path
31 | 48 |         1,  # intermediate_step_save
32 | 49 |         None,  # context_length
33 | 50 |         True,  # run_validation
34 | 51 |         True,  # use_peft
35 | 52 |         "qaic",  # device
36 | | -        id="llama_config",  # config name
37 | | -    )
| 53 | +        0.0043353,  # expected_train_loss
| 54 | +        1.0043447,  # expected_train_metric
| 55 | +        0.0117334,  # expected_eval_loss
| 56 | +        1.0118025,  # expected_eval_metric
| 57 | +        id="llama_config_gsm8k",  # config name
| 58 | +    ),
| 59 | +    pytest.param(
| 60 | +        "meta-llama/Llama-3.2-1B",  # model_name
| 61 | +        "generation",  # task_type
| 62 | +        10,  # max_eval_step
| 63 | +        20,  # max_train_step
| 64 | +        "alpaca_dataset",  # dataset_name
| 65 | +        alpaca_json_path,  # data_path
| 66 | +        1,  # intermediate_step_save
| 67 | +        None,  # context_length
| 68 | +        True,  # run_validation
| 69 | +        True,  # use_peft
| 70 | +        "qaic",  # device
| 71 | +        0.0006099,  # expected_train_loss
| 72 | +        1.0006101,  # expected_train_metric
| 73 | +        0.0065296,  # expected_eval_loss
| 74 | +        1.0065510,  # expected_eval_metric
| 75 | +        id="llama_config_alpaca",  # config name
| 76 | +    ),
| 77 | +    pytest.param(
| 78 | +        "google-bert/bert-base-uncased",  # model_name
| 79 | +        "seq_classification",  # task_type
| 80 | +        10,  # max_eval_step
| 81 | +        20,  # max_train_step
| 82 | +        "imdb_dataset",  # dataset_name
| 83 | +        None,  # data_path
| 84 | +        1,  # intermediate_step_save
| 85 | +        None,  # context_length
| 86 | +        True,  # run_validation
| 87 | +        False,  # use_peft
| 88 | +        "qaic",  # device
| 89 | +        0.00052981,  # expected_train_loss
| 90 | +        0.55554199,  # expected_train_metric
| 91 | +        0.00738618,  # expected_eval_loss
| 92 | +        0.70825195,  # expected_eval_metric
| 93 | +        id="bert_config_imdb",  # config name
| 94 | +    ),
38 | 95 | ]
39 | 96 |
40 | 97 |
41 | | -@pytest.mark.skip(reason="Currently CI is broken. Once it is fixed we will enable this test.")
42 | 98 | @pytest.mark.cli
43 | 99 | @pytest.mark.on_qaic
44 | 100 | @pytest.mark.finetune
45 | 101 | @pytest.mark.parametrize(
46 | | -    "model_name,max_eval_step,max_train_step,intermediate_step_save,context_length,run_validation,use_peft,device",
| 102 | +    "model_name,task_type,max_eval_step,max_train_step,dataset_name,data_path,intermediate_step_save,context_length,run_validation,use_peft,device,expected_train_loss,expected_train_metric,expected_eval_loss,expected_eval_metric",
47 | 103 |     configs,
48 | 104 | )
49 | | -def test_finetune(
| 105 | +def test_finetune_llama(
50 | 106 |     model_name,
| 107 | +    task_type,
51 | 108 |     max_eval_step,
52 | 109 |     max_train_step,
| 110 | +    dataset_name,
| 111 | +    data_path,
53 | 112 |     intermediate_step_save,
54 | 113 |     context_length,
55 | 114 |     run_validation,
56 | 115 |     use_peft,
57 | 116 |     device,
| 117 | +    expected_train_loss,
| 118 | +    expected_train_metric,
| 119 | +    expected_eval_loss,
| 120 | +    expected_eval_metric,
58 | 121 |     mocker,
59 | 122 | ):
60 | 123 |     train_config_spy = mocker.spy(QEfficient.cloud.finetune, "TrainConfig")
61 | 124 |     generate_dataset_config_spy = mocker.spy(QEfficient.cloud.finetune, "generate_dataset_config")
62 | 125 |     generate_peft_config_spy = mocker.spy(QEfficient.cloud.finetune, "generate_peft_config")
63 | | -    get_dataloader_kwargs_spy = mocker.spy(QEfficient.cloud.finetune, "get_dataloader_kwargs")
| 126 | +    get_dataloader_kwargs_spy = mocker.spy(QEfficient.finetune.utils.dataset_utils, "get_dataloader_kwargs")
64 | 127 |     update_config_spy = mocker.spy(QEfficient.cloud.finetune, "update_config")
65 | | -    get_custom_data_collator_spy = mocker.spy(QEfficient.cloud.finetune, "get_custom_data_collator")
66 | | -    get_preprocessed_dataset_spy = mocker.spy(QEfficient.cloud.finetune, "get_preprocessed_dataset")
| 128 | +    get_custom_data_collator_spy = mocker.spy(QEfficient.finetune.utils.dataset_utils, "get_custom_data_collator")
| 129 | +    get_preprocessed_dataset_spy = mocker.spy(QEfficient.finetune.utils.dataset_utils, "get_preprocessed_dataset")
67 | 130 |     get_longest_seq_length_spy = mocker.spy(QEfficient.cloud.finetune, "get_longest_seq_length")
68 | 131 |     print_model_size_spy = mocker.spy(QEfficient.cloud.finetune, "print_model_size")
69 | 132 |     train_spy = mocker.spy(QEfficient.cloud.finetune, "train")
|
71 | 134 | kwargs = {
|
72 | 135 | "model_name": model_name,
|
| 136 | + "task_type": task_type, |
73 | 137 | "max_eval_step": max_eval_step,
|
74 | 138 | "max_train_step": max_train_step,
|
| 139 | + "dataset": dataset_name, |
| 140 | + "data_path": data_path, |
75 | 141 | "intermediate_step_save": intermediate_step_save,
|
76 | 142 | "context_length": context_length,
|
77 | 143 | "run_validation": run_validation,
|
78 | 144 | "use_peft": use_peft,
|
79 | 145 | "device": device,
|
80 | 146 | }
|
81 | 147 |
|
| 148 | + if dataset_name == "alpaca_dataset": |
| 149 | + download_alpaca() |
| 150 | + |
82 | 151 | results = finetune(**kwargs)
|
83 | | -    assert np.allclose(results["avg_train_loss"], 0.00232327, atol=1e-5), "Train loss is not matching."
84 | | -    assert np.allclose(results["avg_train_metric"], 1.002326, atol=1e-5), "Train metric is not matching."
85 | | -    assert np.allclose(results["avg_eval_loss"], 0.0206124, atol=1e-5), "Eval loss is not matching."
86 | | -    assert np.allclose(results["avg_eval_metric"], 1.020826, atol=1e-5), "Eval metric is not matching."
| 152 | +    assert np.allclose(results["avg_train_loss"], expected_train_loss, atol=1e-3), "Train loss does not match."
| 153 | +    assert np.allclose(results["avg_train_metric"], expected_train_metric, atol=1e-3), "Train metric does not match."
| 154 | +    assert np.allclose(results["avg_eval_loss"], expected_eval_loss, atol=1e-3), "Eval loss does not match."
| 155 | +    assert np.allclose(results["avg_eval_metric"], expected_eval_metric, atol=1e-3), "Eval metric does not match."
87 | 156 |     assert results["avg_epoch_time"] < 60, "Training should complete within 60 seconds."
88 | 157 |
89 | 158 |     train_config_spy.assert_called_once()
90 | 159 |     generate_dataset_config_spy.assert_called_once()
91 | | -    generate_peft_config_spy.assert_called_once()
92 | | -    get_custom_data_collator_spy.assert_called_once()
| 160 | +    if task_type == "generation":
| 161 | +        generate_peft_config_spy.assert_called_once()
93 | 162 |     get_longest_seq_length_spy.assert_called_once()
94 | 163 |     print_model_size_spy.assert_called_once()
95 | 164 |     train_spy.assert_called_once()
96 | 165 |
97 | 166 |     assert update_config_spy.call_count == 2
| 167 | +    assert get_custom_data_collator_spy.call_count == 2
98 | 168 |     assert get_dataloader_kwargs_spy.call_count == 2
99 | 169 |     assert get_preprocessed_dataset_spy.call_count == 2
100 | 170 |
@@ -123,12 +193,19 @@ def test_finetune(
123 | 193 |         f"{train_config.gradient_accumulation_steps} which is gradient accumulation steps."
124 | 194 |     )
125 | 195 |
126 | | -    saved_file = os.path.join(train_config.output_dir, "complete_epoch_1/adapter_model.safetensors")
| 196 | +    if use_peft:
| 197 | +        saved_file = os.path.join(train_config.output_dir, "complete_epoch_1/adapter_model.safetensors")
| 198 | +    else:
| 199 | +        saved_file = os.path.join(train_config.output_dir, "complete_epoch_1/model.safetensors")
127 | 200 |     assert os.path.isfile(saved_file)
128 | 201 |
129 | 202 |     clean_up(train_config.output_dir)
130 | 203 |     clean_up("runs")
| 204 | +    clean_up("qaic-dumps")
131 | 205 |     clean_up(train_config.dump_root_dir)
132 | 206 |
| 207 | +    if dataset_name == "alpaca_dataset":
| 208 | +        clean_up(alpaca_json_path)
| 209 | +
133 | 210 |
134 | 211 | # TODO (Meet): Add separate tests for BERT FT and Llama FT
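
A hardening note on the new download helper: requests.get is called without a timeout or a status check, so a network hiccup can hang the suite, and a 4xx/5xx response body would be written to alpaca_data.json as-is. A minimal, more defensive sketch (the cache check and the 60-second timeout are assumptions, not part of this change):

    # Hypothetical hardened variant of download_alpaca; not part of this diff.
    def download_alpaca():
        alpaca_url = "https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/refs/heads/main/alpaca_data.json"
        if alpaca_json_path.exists():
            return  # reuse a copy left by a previous run (assumed acceptable for CI)
        response = requests.get(alpaca_url, timeout=60)  # 60 s budget is an assumption
        response.raise_for_status()  # fail fast on HTTP errors instead of saving an error page
        alpaca_json_path.write_bytes(response.content)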
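
For reviewers who want to exercise a single parametrization locally, the markers and ids declared above combine with pytest's standard selection flags; a sketch, assuming a qaic device is available:

    # Run only the finetune-marked tests, narrowed to one pytest.param id from this file.
    import pytest
    pytest.main(["-m", "finetune", "-k", "llama_config_gsm8k"])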