server : do not normalize embeddings when there is no pooling

ggerganov · ggerganov · commit b3ae9783cba0 · 2024-12-17T13:36:32.000+02:00
diff --git a/common/common.cpp b/common/common.cpp
@@ -1780,7 +1780,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
             break;
         case 0: // max absolute
             for (int i = 0; i < n; i++) {
-                if (sum < std::abs(inp[i])) sum = std::abs(inp[i]);
+                if (sum < std::abs(inp[i])) {
+                    sum = std::abs(inp[i]);
+                }
             }
             sum /= 32760.0; // make an int16 range
             break;
diff --git a/common/common.h b/common/common.h
@@ -603,7 +603,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
 // Embedding utils
 //
 
-void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+// TODO: replace embd_norm with an enum
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
 
 float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
 
diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
@@ -75,7 +75,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
         }
 
         std::vector<float> emb_norm(emb_unorm.size());
-        common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd);
+        common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd, 2);
         result.push_back(emb_norm);
 
 #ifdef GRIT_DEBUG
diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp
@@ -107,7 +107,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
         }
 
         float * out = output + batch.seq_id[i][0] * n_embd;
-        common_embd_normalize(embd, out, n_embd);
+        common_embd_normalize(embd, out, n_embd, 2);
     }
 }
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
@@ -2042,8 +2042,14 @@ struct server_context {
                 continue;
             }
 
-            common_embd_normalize(embd, embd_res.data(), n_embd);
-            res->embedding.push_back(embd_res);
+            // normalize only when there is pooling
+            // TODO: configurable
+            if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) {
+                common_embd_normalize(embd, embd_res.data(), n_embd, 2);
+                res->embedding.push_back(embd_res);
+            } else {
+                res->embedding.push_back({ embd, embd + n_embd });
+            }
         }
 
         SLT_DBG(slot, "%s", "sending embeddings\n");

Original file line number	Diff line number	Diff line change
`@@ -75,7 +75,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve`
`75`	`75`	`}`
`76`	`76`
`77`	`77`	`std::vector<float> emb_norm(emb_unorm.size());`
`78`		`- common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd);`
	`78`	`+ common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd, 2);`
`79`	`79`	`result.push_back(emb_norm);`
`80`	`80`
`81`	`81`	`#ifdef GRIT_DEBUG`
Original file line number	Diff line number	Diff line change
`@@ -107,7 +107,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu`
`107`	`107`	`}`
`108`	`108`
`109`	`109`	`float * out = output + batch.seq_id[i][0] * n_embd;`
`110`		`- common_embd_normalize(embd, out, n_embd);`
	`110`	`+ common_embd_normalize(embd, out, n_embd, 2);`
`111`	`111`	`}`
`112`	`112`	`}`
`113`	`113`