Store sparse vector and decoded token weights

ghukill · ghukill · commit d4e6e5450914 · 2025-11-03T09:21:04.000-05:00
Why these changes are being introduced: Formerly, our 'Embedding' class only had an 'embedding' property for the output. However, for our first model in the pipeline, opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte, it produces two representations of the embedding that are useful to store: a sparse vector and decoded token weights. How this addresses that need: Updates the 'Embedding' class to explicitly store both representations of the embedding. We may decide that we don't store both, or some future models may not produce decoded token weights of any kind, but this matches our first proposed model and pipeline. Better to be explicit and opinionated in these early days, then adjust later if needed. Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/USE-112
diff --git a/embeddings/cli.py b/embeddings/cli.py
@@ -258,7 +258,8 @@ def create_embeddings(
             run_record_offset=input_record.run_record_offset,
             embedding_strategy=input_record.embedding_strategy,
             model_uri=model.model_uri,
-            embedding={"coffee": 0.9, "seattle": 0.5},
+            embedding_vector=[0.1, 0.2, 0.3],
+            embedding_token_weights={"coffee": 0.9, "seattle": 0.5},
         )
         for input_record in input_records
     )
diff --git a/embeddings/embedding.py b/embeddings/embedding.py
@@ -32,15 +32,18 @@ class Embedding:
         (timdex_record_id, run_id, run_record_offset): composite key for TIMDEX record
         model_uri: model URI used to create the embedding
         embedding_strategy: strategy used to create text for embedding
-        embedding: model embedding created from text
+        embedding_vector: vector representation of embedding
+        embedding_token_weights: decoded token:weight pairs from sparse vector
+            - only applicable to models that produce this output
     """
 
     timdex_record_id: str
     run_id: str
     run_record_offset: int
     model_uri: str
     embedding_strategy: str
-    embedding: dict | list[float]
+    embedding_vector: list[float]
+    embedding_token_weights: dict
 
     timestamp: datetime.datetime = field(
         default_factory=lambda: datetime.datetime.now(datetime.UTC)
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -52,7 +52,8 @@ def create_embedding(self, input_record: EmbeddingInput) -> Embedding:
             run_record_offset=input_record.run_record_offset,
             embedding_strategy=input_record.embedding_strategy,
             model_uri=self.model_uri,
-            embedding={"coffee": 0.9, "seattle": 0.5},
+            embedding_vector=[0.1, 0.2, 0.3],
+            embedding_token_weights={"coffee": 0.9, "seattle": 0.5},
         )
 
 
diff --git a/tests/test_models.py b/tests/test_models.py
@@ -49,7 +49,8 @@ def test_mock_model_create_embedding(mock_model):
     assert embedding.run_record_offset == 42
     assert embedding.embedding_strategy == "full_record"
     assert embedding.model_uri == "test/mock-model"
-    assert embedding.embedding == {"coffee": 0.9, "seattle": 0.5}
+    assert embedding.embedding_vector == [0.1, 0.2, 0.3]
+    assert embedding.embedding_token_weights == {"coffee": 0.9, "seattle": 0.5}
 
 
 def test_registry_contains_opensearch_model():

Original file line number	Diff line number	Diff line change
`@@ -258,7 +258,8 @@ def create_embeddings(`
`258`	`258`	`run_record_offset=input_record.run_record_offset,`
`259`	`259`	`embedding_strategy=input_record.embedding_strategy,`
`260`	`260`	`model_uri=model.model_uri,`
`261`		`- embedding={"coffee": 0.9, "seattle": 0.5},`
	`261`	`+ embedding_vector=[0.1, 0.2, 0.3],`
	`262`	`+ embedding_token_weights={"coffee": 0.9, "seattle": 0.5},`
`262`	`263`	`)`
`263`	`264`	`for input_record in input_records`
`264`	`265`	`)`
Original file line number	Diff line number	Diff line change
`@@ -52,7 +52,8 @@ def create_embedding(self, input_record: EmbeddingInput) -> Embedding:`
`52`	`52`	`run_record_offset=input_record.run_record_offset,`
`53`	`53`	`embedding_strategy=input_record.embedding_strategy,`
`54`	`54`	`model_uri=self.model_uri,`
`55`		`- embedding={"coffee": 0.9, "seattle": 0.5},`
	`55`	`+ embedding_vector=[0.1, 0.2, 0.3],`
	`56`	`+ embedding_token_weights={"coffee": 0.9, "seattle": 0.5},`
`56`	`57`	`)`
`57`	`58`
`58`	`59`