Skip to content

Commit 58bb511

Browse files
committed
Merge branch 'master' into custom-attention-mask
2 parents d29e769 + 7ddf185 commit 58bb511

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

44 files changed

+1180
-413
lines changed

.github/workflows/build.yml

Lines changed: 13 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -265,22 +265,24 @@ jobs:
265265
matrix:
266266
include:
267267
- build: 'noavx'
268-
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF'
268+
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
269269
- build: 'avx2'
270-
defines: '-DLLAMA_BUILD_SERVER=ON'
270+
defines: '-DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
271271
- build: 'avx'
272-
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
272+
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
273273
- build: 'avx512'
274274
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
275275
- build: 'clblast'
276-
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
276+
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
277277
- build: 'openblas'
278-
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
278+
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
279279

280280
steps:
281281
- name: Clone
282282
id: checkout
283283
uses: actions/checkout@v3
284+
with:
285+
fetch-depth: 0
284286

285287
- name: Download OpenCL SDK
286288
id: get_opencl
@@ -397,19 +399,22 @@ jobs:
397399
- name: Clone
398400
id: checkout
399401
uses: actions/checkout@v3
402+
with:
403+
fetch-depth: 0
400404

401405
- uses: Jimver/[email protected]
402406
id: cuda-toolkit
403407
with:
404408
cuda: ${{ matrix.cuda }}
409+
method: 'network'
405410
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
406411

407412
- name: Build
408413
id: cmake_build
409414
run: |
410415
mkdir build
411416
cd build
412-
cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
417+
cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
413418
cmake --build . --config Release
414419
415420
- name: Determine tag name
@@ -485,6 +490,8 @@ jobs:
485490
- name: Clone
486491
id: checkout
487492
uses: actions/checkout@v3
493+
with:
494+
fetch-depth: 0
488495

489496
- name: Determine tag name
490497
id: tag

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -80,6 +80,8 @@ set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kern
8080
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
8181
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
8282
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
83+
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
84+
"llama: max. batch size for using peer access")
8385
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
8486
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
8587
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
@@ -304,6 +306,7 @@ if (LLAMA_CUBLAS)
304306
add_compile_definitions(GGML_CUDA_F16)
305307
endif()
306308
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
309+
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
307310

308311
if (LLAMA_STATIC)
309312
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@@ -427,6 +430,7 @@ if (LLAMA_ALL_WARNINGS)
427430
-Wextra
428431
-Wpedantic
429432
-Wcast-qual
433+
-Wmissing-declarations
430434
-Wno-unused-function
431435
-Wno-multichar
432436
)

Makefile

Lines changed: 52 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -95,16 +95,19 @@ CXXV := $(shell $(CXX) --version | head -n 1)
9595
#
9696

9797
# keep standard at C11 and C++11
98+
MK_CPPFLAGS = -I. -Icommon
99+
MK_CFLAGS = -std=c11 -fPIC
100+
MK_CXXFLAGS = -std=c++11 -fPIC
101+
98102
# -Ofast tends to produce faster code, but may not be available for some compilers.
99103
ifdef LLAMA_FAST
100-
OPT = -Ofast
104+
MK_CFLAGS += -Ofast
105+
MK_HOST_CXXFLAGS += -Ofast
106+
MK_CUDA_CXXFLAGS += -O3
101107
else
102-
OPT = -O3
108+
MK_CFLAGS += -O3
109+
MK_CXXFLAGS += -O3
103110
endif
104-
MK_CPPFLAGS = -I. -Icommon
105-
MK_CFLAGS = $(OPT) -std=c11 -fPIC
106-
MK_CXXFLAGS = $(OPT) -std=c++11 -fPIC
107-
MK_LDFLAGS =
108111

109112
# clock_gettime came in POSIX.1b (1993)
110113
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
@@ -172,9 +175,16 @@ endif # LLAMA_DISABLE_LOGS
172175
# warnings
173176
MK_CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
174177
-Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
175-
MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
178+
MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wmissing-declarations -Wno-unused-function -Wno-multichar
179+
180+
# TODO(cebtenzzre): remove this once PR #2632 gets merged
181+
TTFS_CXXFLAGS = $(CXXFLAGS) -Wno-missing-declarations
176182

177-
ifeq '' '$(findstring clang,$(shell $(CXX) --version))'
183+
ifneq '' '$(findstring clang,$(shell $(CXX) --version))'
184+
# clang++ only
185+
MK_CXXFLAGS += -Wmissing-prototypes
186+
TTFS_CXXFLAGS += -Wno-missing-prototypes
187+
else
178188
# g++ only
179189
MK_CXXFLAGS += -Wno-format-truncation -Wno-array-bounds
180190
endif
@@ -225,7 +235,7 @@ ifndef RISCV
225235
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
226236
# Use all CPU extensions that are available:
227237
MK_CFLAGS += -march=native -mtune=native
228-
MK_CXXFLAGS += -march=native -mtune=native
238+
MK_HOST_CXXFLAGS += -march=native -mtune=native
229239

230240
# Usage AVX-only
231241
#MK_CFLAGS += -mfma -mf16c -mavx
@@ -358,14 +368,19 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
358368
else
359369
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
360370
endif
371+
ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
372+
NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
373+
else
374+
NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
375+
endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
361376
#ifdef LLAMA_CUDA_CUBLAS
362377
# NVCCFLAGS += -DGGML_CUDA_CUBLAS
363378
#endif # LLAMA_CUDA_CUBLAS
364379
ifdef LLAMA_CUDA_CCBIN
365380
NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
366381
endif
367382
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
368-
$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) -Wno-pedantic -c $< -o $@
383+
$(NVCC) $(NVCCFLAGS) -Wno-pedantic -c $< -o $@
369384
endif # LLAMA_CUBLAS
370385

371386
ifdef LLAMA_CLBLAST
@@ -433,23 +448,30 @@ k_quants.o: k_quants.c k_quants.h
433448
endif # LLAMA_NO_K_QUANTS
434449

435450
# combine build flags with cmdline overrides
436-
override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
437-
override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
438-
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
451+
override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
452+
override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
453+
override CUDA_CXXFLAGS := $(MK_CUDA_CXXFLAGS) $(CUDA_CXXFLAGS)
454+
override HOST_CXXFLAGS := $(MK_HOST_CXXFLAGS) $(HOST_CXXFLAGS)
455+
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
456+
457+
# save CXXFLAGS before we add host-only options
458+
NVCCFLAGS := $(NVCCFLAGS) $(CXXFLAGS) $(CUDA_CXXFLAGS) -Wno-pedantic -Xcompiler "$(HOST_CXXFLAGS)"
459+
override CXXFLAGS += $(HOST_CXXFLAGS)
439460

440461
#
441462
# Print build information
442463
#
443464

444465
$(info I llama.cpp build info: )
445-
$(info I UNAME_S: $(UNAME_S))
446-
$(info I UNAME_P: $(UNAME_P))
447-
$(info I UNAME_M: $(UNAME_M))
448-
$(info I CFLAGS: $(CFLAGS))
449-
$(info I CXXFLAGS: $(CXXFLAGS))
450-
$(info I LDFLAGS: $(LDFLAGS))
451-
$(info I CC: $(CCV))
452-
$(info I CXX: $(CXXV))
466+
$(info I UNAME_S: $(UNAME_S))
467+
$(info I UNAME_P: $(UNAME_P))
468+
$(info I UNAME_M: $(UNAME_M))
469+
$(info I CFLAGS: $(CFLAGS))
470+
$(info I CXXFLAGS: $(CXXFLAGS))
471+
$(info I NVCCFLAGS: $(NVCCFLAGS))
472+
$(info I LDFLAGS: $(LDFLAGS))
473+
$(info I CC: $(CCV))
474+
$(info I CXX: $(CXXV))
453475
$(info )
454476

455477
#
@@ -492,22 +514,22 @@ main: examples/main/main.cpp build-info.h ggml.
492514
@echo '==== Run ./main -h for help. ===='
493515
@echo
494516

495-
simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS)
517+
simple: examples/simple/simple.cpp ggml.o llama.o common.o $(OBJS)
496518
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
497519

498-
quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
520+
quantize: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS)
499521
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
500522

501-
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
523+
quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS)
502524
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
503525

504-
perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS)
526+
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS)
505527
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
506528

507-
embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
529+
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS)
508530
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
509531

510-
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
532+
save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o common.o $(OBJS)
511533
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
512534

513535
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
@@ -524,7 +546,7 @@ gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
524546
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
525547

526548
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o $(OBJS)
527-
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
549+
$(CXX) $(TTFS_CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
528550

529551
convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
530552
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@@ -547,7 +569,7 @@ metal: examples/metal/metal.cpp ggml.o $(OBJS)
547569
endif
548570

549571
build-info.h: $(wildcard .git/index) scripts/build-info.sh
550-
@sh scripts/build-info.sh > $@.tmp
572+
@sh scripts/build-info.sh $(CC) > $@.tmp
551573
@if ! cmp -s $@.tmp $@; then \
552574
mv $@.tmp $@; \
553575
else \
@@ -560,7 +582,7 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
560582

561583
tests: $(TEST_TARGETS)
562584

563-
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
585+
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp ggml.o $(OBJS)
564586
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
565587
./$@
566588

README.md

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -391,13 +391,14 @@ Building the program with BLAS support may lead to some performance improvements
391391
<!---
392392
| LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
393393
--->
394-
| Option | Legal values | Default | Description |
395-
|-------------------------|------------------------|---------|-------------|
396-
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
397-
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
398-
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
399-
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
400-
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
394+
| Option | Legal values | Default | Description |
395+
|--------------------------------|------------------------|---------|-------------|
396+
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
397+
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
398+
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
399+
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
400+
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
401+
| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
401402

402403
- #### hipBLAS
403404

common/common.cpp

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -78,7 +78,7 @@ int32_t get_num_physical_cores() {
7878
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
7979
}
8080

81-
void process_escapes(std::string& input) {
81+
static void process_escapes(std::string& input) {
8282
std::size_t input_len = input.length();
8383
std::size_t output_idx = 0;
8484

@@ -798,10 +798,10 @@ std::vector<llama_token> llama_tokenize(
798798
// upper limit for the number of tokens
799799
int n_tokens = text.length() + add_bos;
800800
std::vector<llama_token> result(n_tokens);
801-
n_tokens = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
801+
n_tokens = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
802802
if (n_tokens < 0) {
803803
result.resize(-n_tokens);
804-
int check = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
804+
int check = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
805805
GGML_ASSERT(check == -n_tokens);
806806
} else {
807807
result.resize(n_tokens);

common/common.h

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
#pragma once
44

55
#include "llama.h"
6+
#include "build-info.h"
67

78
#define LOG_NO_FILE_LINE_FUNCTION
89
#include "log.h"
@@ -23,6 +24,11 @@
2324
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
2425
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
2526

27+
#define print_build_info() do { \
28+
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \
29+
fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \
30+
} while(0)
31+
2632
//
2733
// CLI argument parsing
2834
//

0 commit comments

Comments (0)