Update scripts #69

Merged: 8 commits, Nov 11, 2024

2 changes: 1 addition & 1 deletion 3rdparty/llama.cpp
4 changes: 2 additions & 2 deletions README.md
@@ -313,14 +313,14 @@ We have provided an **all-in-one script**. Invoke it with:
```bash
pip install 3rdparty/llama.cpp/gguf-py
huggingface-cli download 1bitLLM/bitnet_b1_58-3B --local-dir ${model_dir}
-python tools/run_pipeline.py -o ${model_dir}
+python tools/run_pipeline.py -o ${model_dir} -q int_n
```

We also support models in GPTQ format from [GPTQModel](https://github.com/ModelCloud/GPTQModel)/[EfficientQAT](https://github.com/OpenGVLab/EfficientQAT). Try it out with the officially released EfficientQAT (in GPTQ format) [Llama-3-8b-instruct-w2-g128](https://huggingface.co/ChenMnZ/Llama-3-8b-instruct-EfficientQAT-w2g128-GPTQ):

```bash
huggingface-cli download ChenMnZ/Llama-3-8b-instruct-EfficientQAT-w2g128-GPTQ --local-dir ${model_dir}
-python tools/run_pipeline.py -o ${model_dir} -m llama-3-8b-2bit
+python tools/run_pipeline.py -o ${model_dir} -m llama-3-8b-2bit -q int_n
```

> - Use `-p` or `-s` argument to select the steps you want to run.
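The `-s` flag mentioned in the note above takes a comma-separated list of step indices. As an illustrative example (not from the README), assuming the step numbering used by `tools/all_in_one.sh` in this PR (3 = convert the model to GGUF, 6 = run inference):

```bash
# Hypothetical invocation: rerun only the conversion and inference steps
python tools/run_pipeline.py -o ${model_dir} -q int_n -s 3,6
```
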
12 changes: 6 additions & 6 deletions python/t_mac/intrins/tbl.py
@@ -41,18 +41,18 @@ def tbl(

if m_groups == -1:
if zero_point:
-scales_shape = (1, m // bits * 2)
+scales_shape = (kfactor * g // act_group_size, m // bits * 2)
def _get_scale(m, k):
-return Scales[0, m // bits * 2] - Scales[0, m // bits * 2 + 1]
+return Scales[k * g // act_group_size, m // bits * 2] - Scales[k * g // act_group_size, m // bits * 2 + 1]
else:
-scales_shape = (1, m // bits)
+scales_shape = (kfactor * g // act_group_size, m // bits)
def _get_scale(m, k):
-return Scales[0, m // bits]
+return Scales[k * g // act_group_size, m // bits]
scale_buffer_strides = [te.var("ss"), 1]
else:
-scales_shape = (1,)
+scales_shape = (kfactor * g // act_group_size,)
def _get_scale(m, k):
-return Scales[0]
+return Scales[k * g // act_group_size]
scale_buffer_strides = [1]

alpha = te.const(get_bits_alphas(bits)[0], dtype=out_dtype)
103 changes: 103 additions & 0 deletions tools/all_in_one.sh
@@ -0,0 +1,103 @@
set -e

if [ "$#" -lt 3 ]; then
echo "Usage: $0 <model_path> <kernel_name> <model_type> [--rechunk] [--convert-model] [--run-only] [--disable-t-mac]"
echo " model_path: path to the model directory"
echo " kernel_name: name of the kernel for compiler, e.g., llama-2-7b-4bit, hf-bitnet-3b, hf-bitnet-large-intn, hf-bitnet-large-tq, trilm-3.9b"
echo " model_type: type of the model, e.g., f16, int_n, tq1_0, tq2_0, q4_0"
echo " --rechunk: optional. Rechunk the model if set."
echo " --convert-model: optional. Convert the model to gguf format if set."
echo " --run-only: optional. Skip the compilation and only run the inference and benchmark if set."
echo " --disable-t-mac: optional. Disable T-MAC if set."
exit 1
fi


if [[ "$3" == "q4_0" ]]; then
export EXTRA_COMPILE_ARGS=("-gs=32" "-ags=32")
elif [[ "$3" == "tq1_0" || "$3" == "tq2_0" ]]; then
export EXTRA_COMPILE_ARGS=("-gs=256" "-ags=64")
else
export EXTRA_COMPILE_ARGS=()
fi


RECHUNK=false
for arg in "$@"; do
case $arg in
--rechunk)
RECHUNK=true
;;
*)
;;
esac
done


CONVERT_MODEL=false
for arg in "$@"; do
case $arg in
--convert-model)
CONVERT_MODEL=true
;;
*)
;;
esac
done

RUN_ONLY=false
for arg in "$@"; do
case $arg in
--run-only)
RUN_ONLY=true
;;
*)
;;
esac
done

DISABLE_T_MAC=false
for arg in "$@"; do
case $arg in
--disable-t-mac)
DISABLE_T_MAC=true
;;
*)
;;
esac
done

export MODEL_DIR=$(readlink -f "$1")
export KERNEL_NAME=$2
export MODEL_DTYPE=$3

echo "MODEL_DIR: $MODEL_DIR"
echo "KERNEL_NAME: $KERNEL_NAME"
echo "MODEL_DTYPE: $MODEL_DTYPE"
echo "RECHUNK: $RECHUNK"
echo "CONVERT_MODEL: $CONVERT_MODEL"
echo "RUN_ONLY: $RUN_ONLY"
echo "DISABLE_T_MAC: $DISABLE_T_MAC"


if [ "$RUN_ONLY" != true ]; then
if [ "$DISABLE_T_MAC" == true ]; then
echo "=== python tools/run_pipeline.py -o $MODEL_DIR -m $KERNEL_NAME -nt 4 -s 4,5 "${EXTRA_COMPILE_ARGS[@]}" --disable-t-mac ==="
python tools/run_pipeline.py -o $MODEL_DIR -m $KERNEL_NAME -nt 4 -s 4,5 ${EXTRA_COMPILE_ARGS[@]} --disable-t-mac
else
echo "=== python tools/run_pipeline.py -o $MODEL_DIR -m $KERNEL_NAME -nt 4 -s 0,1,2,4,5 "${EXTRA_COMPILE_ARGS[@]}" -q $MODEL_DTYPE ==="
python tools/run_pipeline.py -o $MODEL_DIR -m $KERNEL_NAME -nt 4 -s 0,1,2,4,5 ${EXTRA_COMPILE_ARGS[@]} -q $MODEL_DTYPE
if $CONVERT_MODEL; then
echo "=== python tools/run_pipeline.py -o $MODEL_DIR -m $KERNEL_NAME -nt 4 -s 3 "${EXTRA_COMPILE_ARGS[@]}" -q $MODEL_DTYPE ==="
python tools/run_pipeline.py -o $MODEL_DIR -m $KERNEL_NAME -nt 4 -s 3 ${EXTRA_COMPILE_ARGS[@]} -q $MODEL_DTYPE
fi
fi
fi

echo "=== python tools/run_pipeline.py -o "$MODEL_DIR" -it "$MODEL_DTYPE" -s 6 ==="
python tools/run_pipeline.py -o "$MODEL_DIR" -it $MODEL_DTYPE -s 6
for threads in $(seq 1 4); do
echo "=== Running with $threads threads, 1 batch ==="
python tools/run_pipeline.py -o "$MODEL_DIR" -it $MODEL_DTYPE -nt $threads -s 7
done
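
A minimal usage sketch for the new all-in-one script. The model path is a placeholder, and the kernel name and model type are illustrative values taken from the script's own usage message; run it from the repository root, since it calls `tools/run_pipeline.py` by relative path:

```bash
# Hypothetical model path; hf-bitnet-3b and tq2_0 are examples from the usage message above
bash tools/all_in_one.sh /path/to/bitnet_b1_58-3B hf-bitnet-3b tq2_0 --convert-model
```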

126 changes: 115 additions & 11 deletions tools/run_pipeline.py
@@ -16,10 +16,14 @@ def run_command(command, pwd, ignore_errors=False):
print(f" Running command in {pwd}:")
print(f" {' '.join(command)}")
os.makedirs(FLAGS.logs_dir, exist_ok=True)
-log_file = os.path.join(FLAGS.logs_dir, datetime.now().strftime("%Y-%m-%d-%H-%M-%S.log"))
+command_name = command[0].split(os.path.sep)[-1]
+log_file = os.path.join(FLAGS.logs_dir, f"{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}_{command_name}.log")
with open(log_file, "w") as fp:
try:
-subprocess.check_call(command, cwd=pwd, stdout=fp, stderr=fp)
+if "llama-bench" in command_name:
+subprocess.check_call(command, cwd=pwd)
+else:
+subprocess.check_call(command, cwd=pwd, stdout=fp, stderr=fp)
except subprocess.CalledProcessError as err:
if not ignore_errors:
print(RED + f"Please check {log_file} for what's wrong" + RESET)
@@ -48,6 +52,7 @@ def get_llamacpp_build_dir():


def compile_kernels():
model_name = f"{FLAGS.model}_{str(FLAGS.quant_type).upper()}"
deploy_dir = os.path.join(ROOT_DIR, "deploy")
tuned_dir = os.path.join(deploy_dir, "tuned")
prebuilt_dir = os.path.join(tuned_dir, f"{get_arch(FLAGS.device)}-{FLAGS.model}")
@@ -56,10 +61,18 @@ def compile_kernels():
shutil.copytree(prebuilt_dir, tuned_dir, dirs_exist_ok=True)
return

# Clear previous tune.log
command = [
'rm',
os.path.join("tuned", "preprocessor", "tune.log"),
os.path.join("tuned", "qgemm_lut", "tune.log"),
]
run_command(command, deploy_dir, ignore_errors=True)

qargs = get_quant_args()
command = [
'python', 'compile.py',
-'-o', 'tuned',
+'-o', f'{os.path.join("tuned", model_name)}',
'-da',
'-nt', f'{FLAGS.num_threads}',
'-tb',
@@ -82,6 +95,11 @@ def compile_kernels():
command.append('-v')
run_command(command, deploy_dir)

# Move to pre-install directory
kernel_dir = os.path.join(tuned_dir, model_name)
print(f" Copy built kernels from {kernel_dir} to {tuned_dir}")
shutil.copytree(kernel_dir, tuned_dir, dirs_exist_ok=True)


def _clean_cmake(build_dir):
command = ['cmake', '--build', '.', '--target', 'clean']
@@ -123,31 +141,51 @@ def convert_models():
model_dir = FLAGS.model_dir
if not os.path.exists(model_dir):
raise FileNotFoundError(model_dir)
-out_path = os.path.join(model_dir, f"ggml-model.{FLAGS.quant_type}.gguf")

out_type = FLAGS.quant_type
if FLAGS.quant_type == "q4_0":
out_type = "f16"

model_name = f"{os.path.split(model_dir)[-1]}.{str(out_type).upper()}.gguf"
out_path = os.path.join(model_dir, model_name)
kcfg_path = os.path.join(ROOT_DIR, "install", "lib", "kcfg.ini")
llamacpp_dir = os.path.join(ROOT_DIR, "3rdparty", "llama.cpp")
command = [
'python',
'convert_hf_to_gguf.py',
f'{model_dir}',
-'--outtype', f'{FLAGS.quant_type}',
+'--outtype', f'{out_type}',
'--outfile', f'{out_path}',
'--kcfg', f'{kcfg_path}',
'--enable-t-mac',
'--verbose',
]
run_command(command, llamacpp_dir)

if FLAGS.quant_type == "q4_0":
quantized_model_name = f"{os.path.split(model_dir)[-1]}.Q4_0.gguf"
quantized_out_path = os.path.join(model_dir, quantized_model_name)
command = [
'./build/bin/llama-quantize',
'--token-embedding-type', 'f16',
'--output-tensor-type', 'f16',
f'{out_path}',
f'{quantized_out_path}',
'q4_0',
]
run_command(command, llamacpp_dir)
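
For the new `q4_0` path above, the pipeline first converts the HF model to an f16 GGUF and then quantizes it with `llama-quantize`. A rough shell equivalent, shown only for orientation (file names are placeholders and the relative paths assume the defaults used in this function), run inside `3rdparty/llama.cpp`:

```bash
# Sketch of what convert_models() does for -q q4_0; ${model_dir} and output names are placeholders
python convert_hf_to_gguf.py ${model_dir} --outtype f16 --outfile ${model_dir}/model.F16.gguf \
    --kcfg ../../install/lib/kcfg.ini --enable-t-mac --verbose
./build/bin/llama-quantize --token-embedding-type f16 --output-tensor-type f16 \
    ${model_dir}/model.F16.gguf ${model_dir}/model.Q4_0.gguf q4_0
```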


def cmake_llamacpp():
build_dir = get_llamacpp_build_dir()
cmake_prefix_path = os.path.join(ROOT_DIR, "install", "lib", "cmake", "t-mac")
command = [
'cmake', '..',
-'-DGGML_TMAC=ON',
+f'-DGGML_TMAC={"OFF" if FLAGS.disable_t_mac else "ON"}',
f'-DCMAKE_PREFIX_PATH={cmake_prefix_path}',
'-DCMAKE_BUILD_TYPE=Release',
'-DGGML_OPENMP=OFF',
f'-DGGML_TMAC_RECHUNK={"ON" if FLAGS.rechunk else "OFF"}',
]
if FLAGS.device == "android":
try:
@@ -178,13 +216,14 @@ def cmake_llamacpp():

def build_llamacpp():
build_dir = get_llamacpp_build_dir()
-command = ['cmake', '--build', '.', '--target', 'llama-cli', 'llama-bench', 'llama-quantize', '--config', 'Release']
+command = ['cmake', '--build', '.', '--target', 'llama-cli', 'llama-bench', 'llama-quantize', 'llama-perplexity', '--config', 'Release']
run_command(command, build_dir)


def run_inference():
build_dir = get_llamacpp_build_dir()
-out_path = os.path.join(FLAGS.model_dir, f"ggml-model.{FLAGS.quant_type}.gguf")
+model_name = f"{os.path.split(FLAGS.model_dir)[-1]}.{str(FLAGS.inference_type).upper()}.gguf"
+out_path = os.path.join(FLAGS.model_dir, model_name)
if is_win():
main_path = os.path.join(build_dir, "bin", "Release", "llama-cli.exe")
if not os.path.exists(main_path):
@@ -229,14 +268,67 @@ def run_inference():
'-m', f'{out_path}',
'-n', '128',
'-t', f'{FLAGS.num_threads}',
-'-p', prompt,
+'-p', f'{prompt}',
'-ngl', '0',
'-c', '2048'
]
log_file = run_command(command, build_dir)
print(GREEN + f"Check {log_file} for inference output" + RESET)


def run_llama_bench():
build_dir = get_llamacpp_build_dir()
model_name = f"{os.path.split(FLAGS.model_dir)[-1]}.{str(FLAGS.inference_type).upper()}.gguf"
out_path = os.path.join(FLAGS.model_dir, model_name)
if is_win():
main_path = os.path.join(build_dir, "bin", "Release", "llama-bench.exe")
if not os.path.exists(main_path):
main_path = os.path.join(build_dir, "bin", "llama-bench")
else:
main_path = os.path.join(build_dir, "bin", "llama-bench")
prompt = 256
# TODO: verify in Android
if FLAGS.device == "android":
remote_bin_path = os.path.join(FLAGS.remote_dir, "bin")
command = ['push', os.path.join(build_dir, "bin"), FLAGS.remote_dir]
run_adb_command(command, build_dir)
remote_main_path = os.path.join(remote_bin_path, "llama-bench")
command = ['shell', 'chmod', '-R', '+x', remote_bin_path]
run_adb_command(command, build_dir)
remote_out_path = os.path.join(
FLAGS.remote_dir,
f"{os.path.basename(FLAGS.model_dir)}-{os.path.basename(out_path)}",
)
if not FLAGS.skip_push_model:
command = ['push', out_path, remote_out_path]
run_adb_command(command, build_dir)
kcfg_path = os.path.join(ROOT_DIR, "install", "lib", "kcfg.ini")
remote_kcfg_path = os.path.join(FLAGS.remote_dir, "kcfg.ini")
command = ['push', kcfg_path, remote_kcfg_path]
run_adb_command(command, build_dir)
command = [
'shell',
f'TMAC_KCFG_FILE={remote_kcfg_path}',
f'{remote_main_path}',
'-m', f'{remote_out_path}',
'-n', '128',
'-t', f'{FLAGS.num_threads}',
'-p', f'{prompt}',
'-ngl', '0',
]
log_file = run_adb_command(command, build_dir)
else:
command = [
f'{main_path}',
'-m', f'{out_path}',
'-n', '128',
'-t', f'{FLAGS.num_threads}',
'-p', f'{prompt}',
'-ngl', '0',
]
log_file = run_command(command, build_dir)
print(GREEN + f"Check {log_file} for llama-bench output" + RESET)

STEPS = [
("Compile kernels", compile_kernels),
("Build T-MAC C++ CMakeFiles", cmake_t_mac),
Expand All @@ -245,6 +337,7 @@ def run_inference():
("Build llama.cpp CMakeFiles", cmake_llamacpp),
("Build llama.cpp", build_llamacpp),
("Run inference", run_inference),
("Run llama-bench", run_llama_bench)
]


@@ -278,7 +371,10 @@ def parse_args():
parser.add_argument("-gs", "--group_size", type=int, default=None, help="Don't set this argument if you don't know its meaning.")
parser.add_argument("-ags", "--act_group_size", type=int, default=None, help="Don't set this argument if you don't know its meaning.")
parser.add_argument("-ld", "--logs_dir", type=str, default="logs")
parser.add_argument("-q", "--quant_type", type=str, choices=["int_n", "f16", "f32"], default="int_n")
parser.add_argument("-q", "--quant_type", type=str, choices=["int_n", "f16", "f32", "tq1_0", "tq2_0", "q4_0"], default=None,
help="Quantization model type. This will override inference_type.")
parser.add_argument("-it", "--inference_type", type=str, default="int_n",
help="Inference model type. This will be overridden by quant_type if quant_type is set.")
parser.add_argument("-zp", "--zero_point", action="store_true", help="Enforce enable zero_point. Required by EfficientQAT models.")
parser.add_argument("-nzp", "--no_zero_point", action="store_false", help="Enforce disable zero_point. Don't set this argument if you don't know its meaning.")

@@ -293,8 +389,16 @@ def parse_args():
parser.add_argument("-ndk", "--ndk_home", type=str, default="", help="NDK home")
parser.add_argument("-spm", "--skip_push_model", action="store_true", help="Suppose the model is unchanged to skip pushing the model file")

parser.add_argument("-rc", "--rechunk", action="store_true", help="Set this argument if you want to use rechunk in computation.")
parser.add_argument("--disable-t-mac", action="store_true", help="Set this argument if you want to disable T-MAC.")

parser.set_defaults(zero_point=None)
-return parser.parse_args()
+args = parser.parse_args()
+
+if args.quant_type is not None:
+args.inference_type = args.quant_type
+
+return args


def get_quant_args():
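
To illustrate the new `-q` / `-it` split introduced in `parse_args()`: `-q` selects the quantization type for compilation and conversion and, when set, also overrides the inference type, while `-it` alone tells the inference and llama-bench steps which already-converted GGUF to load. A hedged example (model directory and kernel name are placeholders; the step numbering follows the STEPS list above):

```bash
# Full pipeline with tq2_0 quantization; -q also sets the inference type
python tools/run_pipeline.py -o /path/to/model -m hf-bitnet-3b -q tq2_0
# Later, rerun only the new llama-bench step (7) against the converted model
python tools/run_pipeline.py -o /path/to/model -it tq2_0 -nt 4 -s 7
```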