DRAFT: Add embedfile for all-in-one embeddings CLI tool #644

Draft · wants to merge 20 commits into main
1 change: 1 addition & 0 deletions Makefile
@@ -16,6 +16,7 @@ include llamafile/BUILD.mk
include llama.cpp/BUILD.mk
include stable-diffusion.cpp/BUILD.mk
include whisper.cpp/BUILD.mk
include embedfile/BUILD.mk

# the root package is `o//` by default
# building a package also builds its sub-packages
13 changes: 13 additions & 0 deletions TODO
@@ -0,0 +1,13 @@
```
./make -j8
./make o//embedfile/embedfile
./o/embedfile/embedfile --version
./o/embedfile/embedfile
```


```
EMBEDFILE_MODEL_PATH=$PWD/models/mxbai-embed-xsmall-v1-f16.gguf ./o/embedfile/embedfile backfill tmp.smol.db nyt_headlines headline

./o/embedfile/embedfile backfill tmp.smol.db nyt_headlines headline
```
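
In the two invocations above, EMBEDFILE_MODEL_PATH appears to select the GGUF used for embedding, overriding whatever model the binary would otherwise use. Not part of the PR: a quick sanity check of the input table with a stock sqlite3 shell, using the database and table names from the commands above.

```sh
# Hedged sketch: confirm the source rows exist before running backfill.
sqlite3 tmp.smol.db 'SELECT count(*) FROM nyt_headlines;'
sqlite3 tmp.smol.db 'SELECT headline FROM nyt_headlines LIMIT 3;'
```
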
71 changes: 71 additions & 0 deletions embedfile.mk
@@ -0,0 +1,71 @@
prefix=dist

$(prefix):
mkdir -p $@
echo "*" > $(prefix)/.gitignore

MODELS_DIR=$(prefix)/.models

$(MODELS_DIR): $(prefix)
mkdir -p $@

.PHONY: models all

EMBEDFILE=./o/embedfile/embedfile

dist/embedfile: $(EMBEDFILE)
cp $< $@

MODEL_MXBAI=mxbai-embed-xsmall-v1-f16
MODEL_SNOWFLAKE=snowflake-arctic-embed-m-v1.5-f16
MODEL_NOMIC=nomic-embed-text-v1.5.f16
MODEL_ALLMINI=all-MiniLM-L6-v2.f16

$(MODELS_DIR)/$(MODEL_MXBAI).gguf: $(MODELS_DIR)
curl -L -o $@ 'https://huggingface.co/mixedbread-ai/mxbai-embed-xsmall-v1/resolve/main/gguf/mxbai-embed-xsmall-v1-f16.gguf'

$(MODELS_DIR)/$(MODEL_SNOWFLAKE).gguf: $(MODELS_DIR)
curl -L -o $@ 'https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5/resolve/main/gguf/snowflake-arctic-embed-m-v1.5-f16.gguf'

$(MODELS_DIR)/$(MODEL_NOMIC).gguf: $(MODELS_DIR)
curl -L -o $@ 'https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF/resolve/main/nomic-embed-text-v1.5.f16.gguf'

$(MODELS_DIR)/$(MODEL_ALLMINI).gguf: $(MODELS_DIR)
curl -L -o $@ 'https://huggingface.co/asg017/sqlite-lembed-model-examples/resolve/main/all-MiniLM-L6-v2/all-MiniLM-L6-v2.e4ce9877.f16.gguf'

models: \
$(MODELS_DIR)/$(MODEL_MXBAI).gguf \
$(MODELS_DIR)/$(MODEL_SNOWFLAKE).gguf \
$(MODELS_DIR)/$(MODEL_NOMIC).gguf \
$(MODELS_DIR)/$(MODEL_ALLMINI).gguf

dist/$(MODEL_MXBAI).embedfile: $(MODELS_DIR)/$(MODEL_MXBAI).gguf $(EMBEDFILE) embedfile.mk
cp $(EMBEDFILE) $@
echo "-m\n$(MODEL_MXBAI).gguf\n..." > .args
./o/llamafile/zipalign -j0 $@ $< .args
rm .args

dist/$(MODEL_SNOWFLAKE).embedfile: $(MODELS_DIR)/$(MODEL_SNOWFLAKE).gguf $(EMBEDFILE) embedfile.mk
cp $(EMBEDFILE) $@
echo "-m\n$(MODEL_SNOWFLAKE).gguf\n..." > .args
./o/llamafile/zipalign -j0 $@ $< .args
rm .args

dist/$(MODEL_NOMIC).embedfile: $(MODELS_DIR)/$(MODEL_NOMIC).gguf $(EMBEDFILE) embedfile.mk
cp $(EMBEDFILE) $@
echo "-m\n$(MODEL_NOMIC).gguf\n..." > .args
./o/llamafile/zipalign -j0 $@ $< .args
rm .args

dist/$(MODEL_ALLMINI).embedfile: $(MODELS_DIR)/$(MODEL_ALLMINI).gguf $(EMBEDFILE) embedfile.mk
cp $(EMBEDFILE) $@
echo "-m\n$(MODEL_ALLMINI).gguf\n..." > .args
./o/llamafile/zipalign -j0 $@ $< .args
rm .args

all: \
dist/$(MODEL_MXBAI).embedfile \
dist/$(MODEL_SNOWFLAKE).embedfile \
dist/$(MODEL_NOMIC).embedfile \
dist/$(MODEL_ALLMINI).embedfile \
dist/embedfile
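
The .embedfile rules above follow llamafile's documented zipalign convention: `zipalign -j0` stores the GGUF and a small .args file inside the executable's ZIP section, the `-m\n<model>\n...` contents make that model the default `-m` argument, and the trailing `...` placeholder splices in any arguments the user passes at run time. A hedged sketch of using the packaged artifact:

```sh
# The packaged binary should load its bundled model without an explicit
# -m flag; extra arguments land where the "..." placeholder sits in .args.
./dist/mxbai-embed-xsmall-v1-f16.embedfile --version
./dist/mxbai-embed-xsmall-v1-f16.embedfile backfill tmp.smol.db nyt_headlines headline
```
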
12 changes: 12 additions & 0 deletions embedfile/.clang-format
@@ -0,0 +1,12 @@
---
BasedOnStyle: LLVM
IndentWidth: 4
ColumnLimit: 100
---
Language: Cpp
AllowShortFunctionsOnASingleLine: false
AlignTrailingComments: false
AlignEscapedNewlines: DontAlign
AlwaysBreakTemplateDeclarations: true
ConstructorInitializerAllOnOneLineOrOnePerLine: true
---
56 changes: 56 additions & 0 deletions embedfile/BUILD.mk
@@ -0,0 +1,56 @@
#-*-mode:makefile-gmake;indent-tabs-mode:t;tab-width:8;coding:utf-8-*-┐
#── vi: set noet ft=make ts=8 sw=8 fenc=utf-8 :vi ────────────────────┘

PKGS += LLAMA_CPP_EMBEDFILE

LLAMA_CPP_EMBEDFILE_FILES := $(wildcard embedfile/*)
LLAMA_CPP_EMBEDFILE_HDRS = $(filter %.h,$(LLAMA_CPP_EMBEDFILE_FILES))
LLAMA_CPP_EMBEDFILE_SRCS_C = $(filter %.c,$(LLAMA_CPP_EMBEDFILE_FILES))
LLAMA_CPP_EMBEDFILE_SRCS_CPP = $(filter %.cpp,$(LLAMA_CPP_EMBEDFILE_FILES))
LLAMA_CPP_EMBEDFILE_SRCS = $(LLAMA_CPP_EMBEDFILE_SRCS_C) $(LLAMA_CPP_EMBEDFILE_SRCS_CPP)

LLAMA_CPP_EMBEDFILE_OBJS = \
$(LLAMA_CPP_EMBEDFILE_SRCS_C:%.c=o/$(MODE)/%.o) \
$(LLAMA_CPP_EMBEDFILE_SRCS_CPP:%.cpp=o/$(MODE)/%.o)


o/$(MODE)/embedfile/embedfile.a: $(LLAMA_CPP_EMBEDFILE_SRCS_C)

o/$(MODE)/embedfile/sqlite-vec.o: embedfile/sqlite-vec.c
o/$(MODE)/embedfile/sqlite-vec.a: o/$(MODE)/embedfile/sqlite-vec.o

o/$(MODE)/embedfile/sqlite-csv.o: embedfile/sqlite-csv.c
o/$(MODE)/embedfile/sqlite-csv.a: o/$(MODE)/embedfile/sqlite-csv.o

o/$(MODE)/embedfile/sqlite-lines.o: embedfile/sqlite-lines.c
o/$(MODE)/embedfile/sqlite-lines.a: o/$(MODE)/embedfile/sqlite-lines.o

o/$(MODE)/embedfile/sqlite-lembed.o: embedfile/sqlite-lembed.c
o/$(MODE)/embedfile/sqlite-lembed.a: o/$(MODE)/embedfile/sqlite-lembed.o o/$(MODE)/llama.cpp/llama.cpp.a

o/$(MODE)/embedfile/shell.o: embedfile/shell.c
o/$(MODE)/embedfile/shell.a: o/$(MODE)/embedfile/shell.o

#o/$(MODE)/embedfile/embedfile.a: $(LLAMA_CPP_EMBEDFILE_OBJS)

o/$(MODE)/embedfile/shell.o: private CFLAGS += \
-DSQLITE_ENABLE_STMT_SCANSTATUS

o/$(MODE)/embedfile/embedfile: \
o/$(MODE)/embedfile/shell.a \
o/$(MODE)/embedfile/embedfile.o \
o/$(MODE)/embedfile/embedfile.1.asc.zip.o \
o/$(MODE)/llama.cpp/llama.cpp.a \
o/$(MODE)/third_party/sqlite/sqlite3.a \
o/$(MODE)/embedfile/sqlite-csv.a \
o/$(MODE)/embedfile/sqlite-vec.a \
o/$(MODE)/embedfile/sqlite-lines.a \
o/$(MODE)/embedfile/sqlite-lembed.a

$(LLAMA_CPP_EMBEDFILE_OBJS): private CCFLAGS += -DSQLITE_CORE

.PHONY: o/$(MODE)/embedfile
o/$(MODE)/embedfile: \
o/$(MODE)/embedfile/embedfile

$(LLAMA_CPP_EMBEDFILE_OBJS): llama.cpp/BUILD.mk embedfile/BUILD.mk
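
The net effect of this file: each bundled SQLite extension (sqlite-vec, sqlite-csv, sqlite-lines, sqlite-lembed) and the SQLite shell are compiled into their own archives with -DSQLITE_CORE (static linking rather than loadable extensions), then linked with llama.cpp and sqlite3 into a single binary. A sketch of exercising this build graph directly, following the TODO file above (`o//` is the default mode prefix per the root Makefile):

```sh
# Build only the embedfile package, then run the linked result.
make -j8 o//embedfile/embedfile
./o/embedfile/embedfile --version
```
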
98 changes: 98 additions & 0 deletions embedfile/embedfile.1
@@ -0,0 +1,98 @@
.Dd December 5, 2023
.Dt EMBEDFILE 1
.Os Llamafile Manual
.Sh NAME
.Nm embedfile
.Nd large language model quantizer
.Sh SYNOPSIS
.Nm
.Op flags...
.Ar model-f32.gguf
.Op Ar model-quant.gguf
.Ar type
.Op Ar nthreads
.Sh DESCRIPTION
.Nm
converts large language model weights from the float32 or float16
formats into smaller data types from 2 to 8 bits in size.
.Sh OPTIONS
The following flags are available:
.Bl -tag -width indent
.It Fl Fl allow-requantize
Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit
.It Fl Fl leave-output-tensor
Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing
.It Fl Fl pure
Disable k-quant mixtures and quantize all tensors to the same type
.El
.Sh ARGUMENTS
The following positional arguments are accepted:
.Bl -tag -width indent
.It Ev Ar model-f32.gguf
Is the input file, which contains the unquantized model weights in either the float32 or float16 format.
.It Ev Ar model-quant.gguf
Is the output file, which will contain quantized weights in the desired format. If this path isn't specified, it'll default to [inp path]/ggml-model-[ftype].gguf.
.It Ev Ar type
Is the desired quantization format, which may be the integer id of a supported quantization type, or its name. See the quantization types section below for acceptable formats.
.It Ev Ar nthreads
Number of threads to use during computation (default: nproc/2)
.El
.Sh QUANTIZATION TYPES
The following quantization types are available. This table shows the ID
of the quantization format, its name, the file size of 7B model weights
that use it, and finally the amount of quality badness it introduces as
measured by the llamafile-perplexity tool averaged over 128 chunks with
the TinyLLaMA 1.1B v1.0 Chat model. Rows are ordered in accordance with
how recommended the quantization format is for general usage.
.Pp
.Bl -dash -compact
.It
  18 Q6_K 5.6gb +0.0446 ppl (q6 kawrakow)
.It
   7 Q8_0 7.2gb +0.0022 ppl (q8 gerganov)
.It
   1 F16 14gb +0.0000 ppl (best but biggest)
.It
   8 Q5_0 4.7gb +0.0817 ppl (q5 gerganov zero)
.It
  17 Q5_K_M 4.8gb +0.0836 ppl (q5 kawrakow medium)
.It
  16 Q5_K_S 4.7gb +0.1049 ppl (q5 kawrakow small)
.It
  15 Q4_K_M 4.1gb +0.3132 ppl (q4 kawrakow medium)
.It
  14 Q4_K_S 3.9gb +0.3408 ppl (q4 kawrakow small)
.It
  13 Q3_K_L 3.6gb +0.5736 ppl (q3 kawrakow large)
.It
  12 Q3_K_M 3.3gb +0.7612 ppl (q3 kawrakow medium)
.It
  11 Q3_K_S 3.0gb +1.3834 ppl (q3 kawrakow small)
.It
  10 Q2_K 2.6gb +4.2359 ppl (tiniest hallucinates most)
.It
  32 BF16 14gb +0.0000 ppl (canonical but cpu/cuda only)
.It
   0 F32 27gb 9.0952 ppl (reference point)
.It
   2 Q4_0 3.9gb +0.3339 ppl (legacy)
.It
   3 Q4_1 4.3gb +0.4163 ppl (legacy)
.It
   9 Q5_1 5.1gb +0.1091 ppl (legacy)
.It
  12 Q3_K alias for Q3_K_M
.It
  15 Q4_K alias for Q4_K_M
.It
  17 Q5_K alias for Q5_K_M
.It
COPY Only copy tensors, no quantizing.
.El
.Sh SEE ALSO
.Xr llamafile 1 ,
.Xr llamafile-imatrix 1 ,
.Xr llamafile-perplexity 1 ,
.Xr llava-quantize 1 ,
.Xr zipalign 1 ,
.Xr unzip 1
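
The page body mirrors the llamafile quantizer documentation, presumably as a placeholder while this PR is a draft. An invocation matching its SYNOPSIS, with illustrative file names:

```sh
# Quantize a float32/float16 model to Q6_K with 8 threads, per the
# SYNOPSIS and QUANTIZATION TYPES sections above.
embedfile --leave-output-tensor model-f32.gguf model-q6k.gguf Q6_K 8
```
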
80 changes: 80 additions & 0 deletions embedfile/embedfile.1.asc
@@ -0,0 +1,80 @@
EMBEDFILE(1)              General Commands Manual              EMBEDFILE(1)

NAME
     embedfile - large language model quantizer

SYNOPSIS
     embedfile [flags...] model-f32.gguf [model-quant.gguf] type [nthreads]

DESCRIPTION
     embedfile converts large language model weights from the float32 or
     float16 formats into smaller data types from 2 to 8 bits in size.

OPTIONS
     The following flags are available:

     --allow-requantize
             Allows requantizing tensors that have already been quantized.
             Warning: This can severely reduce quality compared to
             quantizing from 16bit or 32bit

     --leave-output-tensor
             Will leave output.weight un(re)quantized. Increases model size
             but may also increase quality, especially when requantizing

     --pure  Disable k-quant mixtures and quantize all tensors to the same
             type

ARGUMENTS
     The following positional arguments are accepted:

     model-f32.gguf
             Is the input file, which contains the unquantized model
             weights in either the float32 or float16 format.

     model-quant.gguf
             Is the output file, which will contain quantized weights in
             the desired format. If this path isn't specified, it'll
             default to [inp path]/ggml-model-[ftype].gguf.

     type    Is the desired quantization format, which may be the integer
             id of a supported quantization type, or its name. See the
             quantization types section below for acceptable formats.

     nthreads
             Number of threads to use during computation (default: nproc/2)

QUANTIZATION TYPES
     The following quantization types are available. This table shows the
     ID of the quantization format, its name, the file size of 7B model
     weights that use it, and finally the amount of quality badness it
     introduces as measured by the llamafile-perplexity tool averaged over
     128 chunks with the TinyLLaMA 1.1B v1.0 Chat model. Rows are ordered
     in accordance with how recommended the quantization format is for
     general usage.

     -   18 Q6_K   5.6gb +0.0446 ppl (q6 kawrakow)
     -    7 Q8_0   7.2gb +0.0022 ppl (q8 gerganov)
     -    1 F16     14gb +0.0000 ppl (best but biggest)
     -    8 Q5_0   4.7gb +0.0817 ppl (q5 gerganov zero)
     -   17 Q5_K_M 4.8gb +0.0836 ppl (q5 kawrakow medium)
     -   16 Q5_K_S 4.7gb +0.1049 ppl (q5 kawrakow small)
     -   15 Q4_K_M 4.1gb +0.3132 ppl (q4 kawrakow medium)
     -   14 Q4_K_S 3.9gb +0.3408 ppl (q4 kawrakow small)
     -   13 Q3_K_L 3.6gb +0.5736 ppl (q3 kawrakow large)
     -   12 Q3_K_M 3.3gb +0.7612 ppl (q3 kawrakow medium)
     -   11 Q3_K_S 3.0gb +1.3834 ppl (q3 kawrakow small)
     -   10 Q2_K   2.6gb +4.2359 ppl (tiniest hallucinates most)
     -   32 BF16    14gb +0.0000 ppl (canonical but cpu/cuda only)
     -    0 F32     27gb  9.0952 ppl (reference point)
     -    2 Q4_0   3.9gb +0.3339 ppl (legacy)
     -    3 Q4_1   4.3gb +0.4163 ppl (legacy)
     -    9 Q5_1   5.1gb +0.1091 ppl (legacy)
     -   12 Q3_K   alias for Q3_K_M
     -   15 Q4_K   alias for Q4_K_M
     -   17 Q5_K   alias for Q5_K_M
     -      COPY   Only copy tensors, no quantizing.

SEE ALSO
     llamafile(1), llamafile-imatrix(1), llamafile-perplexity(1),
     llava-quantize(1), zipalign(1), unzip(1)

Llamafile Manual              December 5, 2023              Llamafile Manual