Update scripts #69

Merged: 8 commits, Nov 11, 2024

2 changes: 1 addition & 1 deletion 3rdparty/llama.cpp
4 changes: 2 additions & 2 deletions README.md
@@ -313,14 +313,14 @@ We have provided an **all-in-one script**. Invoke it with:
```bash
pip install 3rdparty/llama.cpp/gguf-py
huggingface-cli download 1bitLLM/bitnet_b1_58-3B --local-dir ${model_dir}
-python tools/run_pipeline.py -o ${model_dir}
+python tools/run_pipeline.py -o ${model_dir} -q int_n
```

We also support models in GPTQ format from [GPTQModel](https://github.com/ModelCloud/GPTQModel)/[EfficientQAT](https://github.com/OpenGVLab/EfficientQAT). Try it out with the officially released EfficientQAT (in GPTQ format) [Llama-3-8b-instruct-w2-g128](https://huggingface.co/ChenMnZ/Llama-3-8b-instruct-EfficientQAT-w2g128-GPTQ):

```bash
huggingface-cli download ChenMnZ/Llama-3-8b-instruct-EfficientQAT-w2g128-GPTQ --local-dir ${model_dir}
-python tools/run_pipeline.py -o ${model_dir} -m llama-3-8b-2bit
+python tools/run_pipeline.py -o ${model_dir} -m llama-3-8b-2bit -q int_n
```

> - Use `-p` or `-s` argument to select the steps you want to run.
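The `-s` flag mentioned in the note above takes a comma-separated list of step indices. As an illustrative example (not from the README), assuming the step numbering used by `tools/all_in_one.sh` in this PR (3 = convert the model to GGUF, 6 = run inference):

```bash
# Hypothetical invocation: rerun only the conversion and inference steps
python tools/run_pipeline.py -o ${model_dir} -q int_n -s 3,6
```
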
12 changes: 6 additions & 6 deletions python/t_mac/intrins/tbl.py
@@ -41,18 +41,18 @@ def tbl(

if m_groups == -1:
if zero_point:
-scales_shape = (1, m // bits * 2)
+scales_shape = (kfactor * g // act_group_size, m // bits * 2)
def _get_scale(m, k):
-return Scales[0, m // bits * 2] - Scales[0, m // bits * 2 + 1]
+return Scales[k * g // act_group_size, m // bits * 2] - Scales[k * g // act_group_size, m // bits * 2 + 1]
else:
-scales_shape = (1, m // bits)
+scales_shape = (kfactor * g // act_group_size, m // bits)
def _get_scale(m, k):
-return Scales[0, m // bits]
+return Scales[k * g // act_group_size, m // bits]
scale_buffer_strides = [te.var("ss"), 1]
else:
-scales_shape = (1,)
+scales_shape = (kfactor * g // act_group_size,)
def _get_scale(m, k):
-return Scales[0]
+return Scales[k * g // act_group_size]
scale_buffer_strides = [1]

alpha = te.const(get_bits_alphas(bits)[0], dtype=out_dtype)
103 changes: 103 additions & 0 deletions tools/all_in_one.sh
@@ -0,0 +1,103 @@
set -e

if [ "$#" -lt 3 ]; then
echo "Usage: $0 <model_path> <kernel_name> <model_type> [--rechunk] [--convert-model] [--run-only] [--disable-t-mac]"
echo " model_path: path to the model directory"
echo " kernel_name: name of the kernel for compiler, e.g., llama-2-7b-4bit, hf-bitnet-3b, hf-bitnet-large-intn, hf-bitnet-large-tq, trilm-3.9b"
echo " model_type: type of the model, e.g., f16, int_n, tq1_0, tq2_0, q4_0"
echo " --rechunk: optional. Rechunk the model if set."
echo " --convert-model: optional. Convert the model to gguf format if set."
echo " --run-only: optional. Skip the compilation and only run the inference and benchmark if set."
echo " --disable-t-mac: optional. Disable T-MAC if set."
exit 1
fi


if [[ "$3" == "q4_0" ]]; then
export EXTRA_COMPILE_ARGS=("-gs=32" "-ags=32")
elif [[ "$3" == "tq1_0" || "$3" == "tq2_0" ]]; then
export EXTRA_COMPILE_ARGS=("-gs=256" "-ags=64")
else
export EXTRA_COMPILE_ARGS=()
fi


RECHUNK=false
for arg in "$@"; do
case $arg in
--rechunk)
RECHUNK=true
;;
*)
;;
esac
done


CONVERT_MODEL=false
for arg in "$@"; do
case $arg in
--convert-model)
CONVERT_MODEL=true
;;
*)
;;
esac
done

RUN_ONLY=false
for arg in "$@"; do
case $arg in
--run-only)
RUN_ONLY=true
;;
*)
;;
esac
done

DISABLE_T_MAC=false
for arg in "$@"; do
case $arg in
--disable-t-mac)
DISABLE_T_MAC=true
;;
*)
;;
esac
done

export MODEL_DIR=$(readlink -f "$1")
export KERNEL_NAME=$2
export MODEL_DTYPE=$3

echo "MODEL_DIR: $MODEL_DIR"
echo "KERNEL_NAME: $KERNEL_NAME"
echo "MODEL_DTYPE: $MODEL_DTYPE"
echo "RECHUNK: $RECHUNK"
echo "CONVERT_MODEL: $CONVERT_MODEL"
echo "RUN_ONLY: $RUN_ONLY"
echo "DISABLE_T_MAC: $DISABLE_T_MAC"


if [ "$RUN_ONLY" != true ]; then
if [ "$DISABLE_T_MAC" == true ]; then
echo "=== python tools/run_pipeline.py -o $MODEL_DIR -m $KERNEL_NAME -nt 4 -s 4,5 "${EXTRA_COMPILE_ARGS[@]}" --disable-t-mac ==="
python tools/run_pipeline.py -o $MODEL_DIR -m $KERNEL_NAME -nt 4 -s 4,5 ${EXTRA_COMPILE_ARGS[@]} --disable-t-mac
else
echo "=== python tools/run_pipeline.py -o $MODEL_DIR -m $KERNEL_NAME -nt 4 -s 0,1,2,4,5 "${EXTRA_COMPILE_ARGS[@]}" -q $MODEL_DTYPE ==="
python tools/run_pipeline.py -o $MODEL_DIR -m $KERNEL_NAME -nt 4 -s 0,1,2,4,5 ${EXTRA_COMPILE_ARGS[@]} -q $MODEL_DTYPE
if $CONVERT_MODEL; then
echo "=== python tools/run_pipeline.py -o $MODEL_DIR -m $KERNEL_NAME -nt 4 -s 3 "${EXTRA_COMPILE_ARGS[@]}" -q $MODEL_DTYPE ==="
python tools/run_pipeline.py -o $MODEL_DIR -m $KERNEL_NAME -nt 4 -s 3 ${EXTRA_COMPILE_ARGS[@]} -q $MODEL_DTYPE
fi
fi
fi

echo "=== python tools/run_pipeline.py -o "$MODEL_DIR" -it "$MODEL_DTYPE" -s 6 ==="
python tools/run_pipeline.py -o "$MODEL_DIR" -it $MODEL_DTYPE -s 6
for threads in $(seq 1 4); do
echo "=== Running with $threads threads, 1 batch ==="
python tools/run_pipeline.py -o "$MODEL_DIR" -it $MODEL_DTYPE -nt $threads -s 7
done
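
A minimal usage sketch for the new all-in-one script. The model path is a placeholder, and the kernel name and model type are illustrative values taken from the script's own usage message; run it from the repository root, since it calls `tools/run_pipeline.py` by relative path:

```bash
# Hypothetical model path; hf-bitnet-3b and tq2_0 are examples from the usage message above
bash tools/all_in_one.sh /path/to/bitnet_b1_58-3B hf-bitnet-3b tq2_0 --convert-model
```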

126 changes: 115 additions & 11 deletions tools/run_pipeline.py
@@ -16,10 +16,14 @@ def run_command(command, pwd, ignore_errors=False):
print(f" Running command in {pwd}:")
print(f" {' '.join(command)}")
os.makedirs(FLAGS.logs_dir, exist_ok=True)
-log_file = os.path.join(FLAGS.logs_dir, datetime.now().strftime("%Y-%m-%d-%H-%M-%S.log"))
+command_name = command[0].split(os.path.sep)[-1]
+log_file = os.path.join(FLAGS.logs_dir, f"{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}_{command_name}.log")
with open(log_file, "w") as fp:
try:
-subprocess.check_call(command, cwd=pwd, stdout=fp, stderr=fp)
+if "llama-bench" in command_name:
+subprocess.check_call(command, cwd=pwd)
+else:
+subprocess.check_call(command, cwd=pwd, stdout=fp, stderr=fp)
except subprocess.CalledProcessError as err:
if not ignore_errors:
print(RED + f"Please check {log_file} for what's wrong" + RESET)
@@ -48,6 +52,7 @@ def get_llamacpp_build_dir():


def compile_kernels():
model_name = f"{FLAGS.model}_{str(FLAGS.quant_type).upper()}"
deploy_dir = os.path.join(ROOT_DIR, "deploy")
tuned_dir = os.path.join(deploy_dir, "tuned")
prebuilt_dir = os.path.join(tuned_dir, f"{get_arch(FLAGS.device)}-{FLAGS.model}")
@@ -56,10 +61,18 @@ def compile_kernels():
shutil.copytree(prebuilt_dir, tuned_dir, dirs_exist_ok=True)
return

# Clear previous tune.log
command = [
'rm',
os.path.join("tuned", "preprocessor", "tune.log"),
os.path.join("tuned", "qgemm_lut", "tune.log"),
]
run_command(command, deploy_dir, ignore_errors=True)

qargs = get_quant_args()
command = [
'python', 'compile.py',
-'-o', 'tuned',
+'-o', f'{os.path.join("tuned", model_name)}',
'-da',
'-nt', f'{FLAGS.num_threads}',
'-tb',
@@ -82,6 +95,11 @@ def compile_kernels():
command.append('-v')
run_command(command, deploy_dir)

# Move to pre-install directory
kernel_dir = os.path.join(tuned_dir, model_name)
print(f" Copy built kernels from {kernel_dir} to {tuned_dir}")
shutil.copytree(kernel_dir, tuned_dir, dirs_exist_ok=True)


def _clean_cmake(build_dir):
command = ['cmake', '--build', '.', '--target', 'clean']
@@ -123,31 +141,51 @@ def convert_models():
model_dir = FLAGS.model_dir
if not os.path.exists(model_dir):
raise FileNotFoundError(model_dir)
-out_path = os.path.join(model_dir, f"ggml-model.{FLAGS.quant_type}.gguf")

out_type = FLAGS.quant_type
if FLAGS.quant_type == "q4_0":
out_type = "f16"

model_name = f"{os.path.split(model_dir)[-1]}.{str(out_type).upper()}.gguf"
out_path = os.path.join(model_dir, model_name)
kcfg_path = os.path.join(ROOT_DIR, "install", "lib", "kcfg.ini")
llamacpp_dir = os.path.join(ROOT_DIR, "3rdparty", "llama.cpp")
command = [
'python',
'convert_hf_to_gguf.py',
f'{model_dir}',
-'--outtype', f'{FLAGS.quant_type}',
+'--outtype', f'{out_type}',
'--outfile', f'{out_path}',
'--kcfg', f'{kcfg_path}',
'--enable-t-mac',
'--verbose',
]
run_command(command, llamacpp_dir)

if FLAGS.quant_type == "q4_0":
quantized_model_name = f"{os.path.split(model_dir)[-1]}.Q4_0.gguf"
quantized_out_path = os.path.join(model_dir, quantized_model_name)
command = [
'./build/bin/llama-quantize',
'--token-embedding-type', 'f16',
'--output-tensor-type', 'f16',
f'{out_path}',
f'{quantized_out_path}',
'q4_0',
]
run_command(command, llamacpp_dir)
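
For the new `q4_0` path above, the pipeline first converts the HF model to an f16 GGUF and then quantizes it with `llama-quantize`. A rough shell equivalent, shown only for orientation (file names are placeholders and the relative paths assume the defaults used in this function), run inside `3rdparty/llama.cpp`:

```bash
# Sketch of what convert_models() does for -q q4_0; ${model_dir} and output names are placeholders
python convert_hf_to_gguf.py ${model_dir} --outtype f16 --outfile ${model_dir}/model.F16.gguf \
    --kcfg ../../install/lib/kcfg.ini --enable-t-mac --verbose
./build/bin/llama-quantize --token-embedding-type f16 --output-tensor-type f16 \
    ${model_dir}/model.F16.gguf ${model_dir}/model.Q4_0.gguf q4_0
```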


def cmake_llamacpp():
build_dir = get_llamacpp_build_dir()
cmake_prefix_path = os.path.join(ROOT_DIR, "install", "lib", "cmake", "t-mac")
command = [
'cmake', '..',
-'-DGGML_TMAC=ON',
+f'-DGGML_TMAC={"OFF" if FLAGS.disable_t_mac else "ON"}',
f'-DCMAKE_PREFIX_PATH={cmake_prefix_path}',
'-DCMAKE_BUILD_TYPE=Release',
'-DGGML_OPENMP=OFF',
f'-DGGML_TMAC_RECHUNK={"ON" if FLAGS.rechunk else "OFF"}',
]
if FLAGS.device == "android":
try:
@@ -178,13 +216,14 @@ def cmake_llamacpp():

def build_llamacpp():
build_dir = get_llamacpp_build_dir()
-command = ['cmake', '--build', '.', '--target', 'llama-cli', 'llama-bench', 'llama-quantize', '--config', 'Release']
+command = ['cmake', '--build', '.', '--target', 'llama-cli', 'llama-bench', 'llama-quantize', 'llama-perplexity', '--config', 'Release']
run_command(command, build_dir)


def run_inference():
build_dir = get_llamacpp_build_dir()
-out_path = os.path.join(FLAGS.model_dir, f"ggml-model.{FLAGS.quant_type}.gguf")
+model_name = f"{os.path.split(FLAGS.model_dir)[-1]}.{str(FLAGS.inference_type).upper()}.gguf"
+out_path = os.path.join(FLAGS.model_dir, model_name)
if is_win():
main_path = os.path.join(build_dir, "bin", "Release", "llama-cli.exe")
if not os.path.exists(main_path):
@@ -229,14 +268,67 @@ def run_inference():
'-m', f'{out_path}',
'-n', '128',
'-t', f'{FLAGS.num_threads}',
-'-p', prompt,
+'-p', f'{prompt}',
'-ngl', '0',
'-c', '2048'
]
log_file = run_command(command, build_dir)
print(GREEN + f"Check {log_file} for inference output" + RESET)


def run_llama_bench():
build_dir = get_llamacpp_build_dir()
model_name = f"{os.path.split(FLAGS.model_dir)[-1]}.{str(FLAGS.inference_type).upper()}.gguf"
out_path = os.path.join(FLAGS.model_dir, model_name)
if is_win():
main_path = os.path.join(build_dir, "bin", "Release", "llama-bench.exe")
if not os.path.exists(main_path):
main_path = os.path.join(build_dir, "bin", "llama-bench")
else:
main_path = os.path.join(build_dir, "bin", "llama-bench")
prompt = 256
# TODO: verify in Android
if FLAGS.device == "android":
remote_bin_path = os.path.join(FLAGS.remote_dir, "bin")
command = ['push', os.path.join(build_dir, "bin"), FLAGS.remote_dir]
run_adb_command(command, build_dir)
remote_main_path = os.path.join(remote_bin_path, "llama-bench")
command = ['shell', 'chmod', '-R', '+x', remote_bin_path]
run_adb_command(command, build_dir)
remote_out_path = os.path.join(
FLAGS.remote_dir,
f"{os.path.basename(FLAGS.model_dir)}-{os.path.basename(out_path)}",
)
if not FLAGS.skip_push_model:
command = ['push', out_path, remote_out_path]
run_adb_command(command, build_dir)
kcfg_path = os.path.join(ROOT_DIR, "install", "lib", "kcfg.ini")
remote_kcfg_path = os.path.join(FLAGS.remote_dir, "kcfg.ini")
command = ['push', kcfg_path, remote_kcfg_path]
run_adb_command(command, build_dir)
command = [
'shell',
f'TMAC_KCFG_FILE={remote_kcfg_path}',
f'{remote_main_path}',
'-m', f'{remote_out_path}',
'-n', '128',
'-t', f'{FLAGS.num_threads}',
'-p', f'{prompt}',
'-ngl', '0',
]
log_file = run_adb_command(command, build_dir)
else:
command = [
f'{main_path}',
'-m', f'{out_path}',
'-n', '128',
'-t', f'{FLAGS.num_threads}',
'-p', f'{prompt}',
'-ngl', '0',
]
log_file = run_command(command, build_dir)
print(GREEN + f"Check {log_file} for llama-bench output" + RESET)

STEPS = [
("Compile kernels", compile_kernels),
("Build T-MAC C++ CMakeFiles", cmake_t_mac),
Expand All @@ -245,6 +337,7 @@ def run_inference():
("Build llama.cpp CMakeFiles", cmake_llamacpp),
("Build llama.cpp", build_llamacpp),
("Run inference", run_inference),
("Run llama-bench", run_llama_bench)
]


@@ -278,7 +371,10 @@ def parse_args():
parser.add_argument("-gs", "--group_size", type=int, default=None, help="Don't set this argument if you don't know its meaning.")
parser.add_argument("-ags", "--act_group_size", type=int, default=None, help="Don't set this argument if you don't know its meaning.")
parser.add_argument("-ld", "--logs_dir", type=str, default="logs")
parser.add_argument("-q", "--quant_type", type=str, choices=["int_n", "f16", "f32"], default="int_n")
parser.add_argument("-q", "--quant_type", type=str, choices=["int_n", "f16", "f32", "tq1_0", "tq2_0", "q4_0"], default=None,
help="Quantization model type. This will override inference_type.")
parser.add_argument("-it", "--inference_type", type=str, default="int_n",
help="Inference model type. This will be overridden by quant_type if quant_type is set.")
parser.add_argument("-zp", "--zero_point", action="store_true", help="Enforce enable zero_point. Required by EfficientQAT models.")
parser.add_argument("-nzp", "--no_zero_point", action="store_false", help="Enforce disable zero_point. Don't set this argument if you don't know its meaning.")

@@ -293,8 +389,16 @@ def parse_args():
parser.add_argument("-ndk", "--ndk_home", type=str, default="", help="NDK home")
parser.add_argument("-spm", "--skip_push_model", action="store_true", help="Suppose the model is unchanged to skip pushing the model file")

parser.add_argument("-rc", "--rechunk", action="store_true", help="Set this argument if you want to use rechunk in computation.")
parser.add_argument("--disable-t-mac", action="store_true", help="Set this argument if you want to disable T-MAC.")

parser.set_defaults(zero_point=None)
-return parser.parse_args()
+args = parser.parse_args()
+
+if args.quant_type is not None:
+args.inference_type = args.quant_type
+
+return args


def get_quant_args():
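
To illustrate the new `-q` / `-it` split introduced in `parse_args()`: `-q` selects the quantization type for compilation and conversion and, when set, also overrides the inference type, while `-it` alone tells the inference and llama-bench steps which already-converted GGUF to load. A hedged example (model directory and kernel name are placeholders; the step numbering follows the STEPS list above):

```bash
# Full pipeline with tq2_0 quantization; -q also sets the inference type
python tools/run_pipeline.py -o /path/to/model -m hf-bitnet-3b -q tq2_0
# Later, rerun only the new llama-bench step (7) against the converted model
python tools/run_pipeline.py -o /path/to/model -it tq2_0 -nt 4 -s 7
```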