Stub CLI command methods to create embeddings

ghukill · ghukill · commit ce181dfeb2aa · 2025-10-30T11:50:26.000-04:00
How this addresses that need: * CLI command create-embeddings created * args and some functionality in place * WIP comments and DEBUG code temporarily added to demonstrate how it will work * class RecordText added to encapsulate text that is ready for an embedding * this will support future functionality of pre-embedding "strategies" applied to records * class Embedding created to encapsulate the embedding result * this captures the TIMDEX record the embedding was associated with, and the model + strategy used to prepare the text Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/USE-112
diff --git a/.gitignore b/.gitignore
@@ -155,3 +155,5 @@ cython_debug/
 .DS_Store
 output/
 .vscode/
+
+CLAUDE.md
diff --git a/embeddings/cli.py b/embeddings/cli.py
@@ -7,6 +7,8 @@
 from typing import TYPE_CHECKING
 
 import click
+import jsonlines
+from timdex_dataset_api import TIMDEXDataset
 
 from embeddings.config import configure_logger, configure_sentry
 from embeddings.models.registry import get_model_class
@@ -150,8 +152,139 @@ def test_model_load(ctx: click.Context) -> None:
 @main.command()
 @click.pass_context
 @model_required
-def create_embedding(ctx: click.Context) -> None:
-    """Create a single embedding for a single input text."""
+@click.option(
+    "-d",
+    "--dataset-location",
+    required=True,
+    type=click.Path(),
+    help="TIMDEX dataset location, e.g. 's3://timdex/dataset', to read records from.",
+)
+@click.option(
+    "--run-id",
+    required=True,
+    type=str,
+    help="TIMDEX ETL run id.",
+)
+@click.option(
+    "--run-record-offset",
+    required=True,
+    type=int,
+    default=0,
+    help="TIMDEX ETL run record offset to start from, default = 0.",
+)
+@click.option(
+    "--record-limit",
+    required=True,
+    type=int,
+    default=None,
+    help="Limit number of records after --run-record-offset, default = None (unlimited).",
+)
+@click.option(
+    "--strategy",
+    type=str,  # WIP: establish an enum of supported strategies
+    required=True,
+    multiple=True,
+    help="Pre-embedding record transformation strategy to use.  Repeatable.",
+)
+@click.option(
+    "--output-jsonl",
+    required=False,
+    type=str,
+    default=None,
+    help="Optionally write embeddings to local JSONLines file (primarily for testing).",
+)
+def create_embeddings(
+    ctx: click.Context,
+    dataset_location: str,
+    run_id: str,
+    run_record_offset: int,
+    record_limit: int,
+    strategy: list[str],
+    output_jsonl: str,
+) -> None:
+    """Create embeddings for TIMDEX records."""
+    model: BaseEmbeddingModel = ctx.obj["model"]
+
+    # init TIMDEXDataset
+    timdex_dataset = TIMDEXDataset(dataset_location)
+
+    # query TIMDEX dataset for an iterator of records
+    timdex_records = timdex_dataset.read_dicts_iter(
+        columns=[
+            "timdex_record_id",
+            "run_id",
+            "run_record_offset",
+            "transformed_record",
+        ],
+        run_id=run_id,
+        where=f"""run_record_offset >= {run_record_offset}""",
+        limit=record_limit,
+        action="index",
+    )
+
+    # create an iterator of RecordTexts applying all requested strategies to all records
+    # WIP NOTE: this will leverage some kind of pre-embedding transformer class(es) that
+    #   create texts based on the requested strategies (e.g. "full record"), which are
+    #   captured in --strategy CLI args
+    # WIP NOTE: the following simulates that...
+    # DEBUG ------------------------------------------------------------------------------
+    import json  # noqa: PLC0415
+
+    from embeddings.embedding import RecordText  # noqa: PLC0415
+
+    input_records = (
+        RecordText(
+            timdex_record_id=timdex_record["timdex_record_id"],
+            run_id=timdex_record["run_id"],
+            run_record_offset=timdex_record["run_record_offset"],
+            embedding_strategy=_strategy,
+            text=json.dumps(timdex_record["transformed_record"].decode()),
+        )
+        for timdex_record in timdex_records
+        for _strategy in strategy
+    )
+    # DEBUG ------------------------------------------------------------------------------
+
+    # create an iterator of Embeddings via the embedding model
+    # WIP NOTE: this will use the embedding class .create_embeddings() bulk method
+    # WIP NOTE: the following simulates that...
+    # DEBUG ------------------------------------------------------------------------------
+    from embeddings.embedding import Embedding  # noqa: PLC0415
+
+    embeddings = (
+        Embedding(
+            timdex_record_id=input_record.timdex_record_id,
+            run_id=input_record.run_id,
+            run_record_offset=input_record.run_record_offset,
+            embedding_strategy=input_record.embedding_strategy,
+            model_uri=model.model_uri,
+            embedding={"coffee": 0.9, "seattle": 0.5},
+        )
+        for input_record in input_records
+    )
+    # DEBUG ------------------------------------------------------------------------------
+
+    # if requested, write embeddings to a local JSONLines file
+    if output_jsonl:
+        with jsonlines.open(
+            output_jsonl,
+            mode="w",
+            dumps=lambda obj: json.dumps(
+                obj,
+                default=str,
+            ),
+        ) as writer:
+            for embedding in embeddings:
+                writer.write(embedding.to_dict())
+
+    # else, default writing embeddings back to TIMDEX dataset
+    else:
+        # WIP NOTE: write via anticipated timdex_dataset.embeddings.write(...)
+        # NOTE: will likely use an imported TIMDEXEmbedding class from TDA, which the
+        #   Embedding instance will nearly 1:1 map to.
+        raise NotImplementedError
+
+    logger.info("Embeddings creation complete.")
 
 
 if __name__ == "__main__":  # pragma: no cover
diff --git a/embeddings/config.py b/embeddings/config.py
@@ -11,8 +11,6 @@ def configure_logger(logger: logging.Logger, *, verbose: bool) -> str:
             "%(message)s"
         )
         logger.setLevel(logging.DEBUG)
-        for handler in logging.root.handlers:
-            handler.addFilter(logging.Filter("embeddings"))
     else:
         logging.basicConfig(
             format="%(asctime)s %(levelname)s %(name)s.%(funcName)s(): %(message)s"
diff --git a/embeddings/embedding.py b/embeddings/embedding.py
@@ -0,0 +1,51 @@
+import datetime
+import json
+from dataclasses import asdict, dataclass, field
+
+
+@dataclass
+class RecordText:
+    """Text prepared from a TIMDEX record, ready to be embedded.
+
+    Args:
+        (timdex_record_id, run_id, run_record_offset): composite key for TIMDEX record
+        embedding_strategy: strategy used to create text for embedding
+        text: text to embed, created from the TIMDEX record via the embedding_strategy
+    """
+
+    timdex_record_id: str
+    run_id: str
+    run_record_offset: int
+    embedding_strategy: str
+    text: str
+
+
+@dataclass
+class Embedding:
+    """Encapsulates a single embedding.
+
+    Args:
+        (timdex_record_id, run_id, run_record_offset): composite key for TIMDEX record
+        model_uri: model URI used to create the embedding
+        embedding_strategy: strategy used to create text for embedding
+        embedding: model embedding created from text
+    """
+
+    timdex_record_id: str
+    run_id: str
+    run_record_offset: int
+    model_uri: str
+    embedding_strategy: str
+    embedding: dict | list[float]
+
+    timestamp: datetime.datetime = field(
+        default_factory=lambda: datetime.datetime.now(datetime.UTC)
+    )
+
+    def to_dict(self) -> dict:
+        """Marshal to dictionary."""
+        return asdict(self)
+
+    def to_json(self) -> str:
+        """Serialize to JSON."""
+        return json.dumps(self.to_dict(), default=str)
diff --git a/embeddings/models/base.py b/embeddings/models/base.py
@@ -1,8 +1,11 @@
 """Base class for embedding models."""
 
 from abc import ABC, abstractmethod
+from collections.abc import Iterator
 from pathlib import Path
 
+from embeddings.embedding import Embedding, RecordText
+
 
 class BaseEmbeddingModel(ABC):
     """Abstract base class for embedding models.
@@ -46,3 +49,22 @@ def download(self) -> Path:
     @abstractmethod
     def load(self) -> None:
         """Load model from self.model_path."""
+
+    @abstractmethod
+    def create_embedding(self, input_record: RecordText) -> Embedding:
+        """Create an Embedding for a RecordText.
+
+        Args:
+            input_record: RecordText instance
+        """
+
+    def create_embeddings(
+        self, input_records: Iterator[RecordText]
+    ) -> Iterator[Embedding]:
+        """Yield Embeddings for an iterator of RecordText instances.
+
+        Args:
+            input_records: iterator of RecordText instances
+        """
+        for input_text in input_records:
+            yield self.create_embedding(input_text)
diff --git a/embeddings/models/os_neural_sparse_doc_v3_gte.py b/embeddings/models/os_neural_sparse_doc_v3_gte.py
@@ -11,6 +11,7 @@
 from huggingface_hub import snapshot_download
 from transformers import AutoModelForMaskedLM, AutoTokenizer
 
+from embeddings.embedding import Embedding, RecordText
 from embeddings.models.base import BaseEmbeddingModel
 
 if TYPE_CHECKING:
@@ -161,3 +162,6 @@ def load(self) -> None:
             self._id_to_token[token_id] = token
 
         logger.info(f"Model loaded successfully, {time.perf_counter()-start_time}s")
+
+    def create_embedding(self, input_record: RecordText) -> Embedding:
+        raise NotImplementedError
diff --git a/pyproject.toml b/pyproject.toml
@@ -10,6 +10,7 @@ requires-python = ">=3.12"
 dependencies = [
     "click>=8.2.1",
     "huggingface-hub>=0.26.0",
+    "jsonlines>=4.0.0",
     "sentry-sdk>=2.34.1",
     "timdex-dataset-api",
     "torch>=2.9.0",
@@ -39,6 +40,11 @@ exclude = [
     "output/"
 ]
 
+[[tool.mypy.overrides]]
+module = ["timdex_dataset_api.*"]
+follow_untyped_imports = true
+
+
 [tool.pytest.ini_options]
 log_level = "INFO"
 
@@ -88,6 +94,7 @@ fixture-parentheses = false
 "tests/**/*" = [
     "ANN",
     "ARG001",
+    "PLR2004",
     "S101",
 ]
 
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -6,6 +6,8 @@
 import pytest
 from click.testing import CliRunner
 
+from embeddings.embedding import Embedding, RecordText
+from embeddings.models import registry
 from embeddings.models.base import BaseEmbeddingModel
 
 logger = logging.getLogger(__name__)
@@ -43,13 +45,30 @@ def download(self) -> Path:
     def load(self) -> None:
         logger.info("Model loaded successfully, 1.5s")
 
+    def create_embedding(self, input_record: RecordText) -> Embedding:
+        return Embedding(
+            timdex_record_id=input_record.timdex_record_id,
+            run_id=input_record.run_id,
+            run_record_offset=input_record.run_record_offset,
+            embedding_strategy=input_record.embedding_strategy,
+            model_uri=self.model_uri,
+            embedding={"coffee": 0.9, "seattle": 0.5},
+        )
+
 
 @pytest.fixture
 def mock_model(tmp_path):
     """Fixture providing a MockEmbeddingModel instance."""
     return MockEmbeddingModel(tmp_path / "model")
 
 
+@pytest.fixture
+def register_mock_model(monkeypatch):
+    """Register MockEmbeddingModel in the model registry."""
+    monkeypatch.setitem(registry.MODEL_REGISTRY, "test/mock-model", MockEmbeddingModel)
+    monkeypatch.setenv("TE_MODEL_PATH", "/fake/path")
+
+
 @pytest.fixture
 def neural_sparse_doc_v3_gte_fake_model_directory(tmp_path):
     """Create a fake downloaded model directory with required files."""
diff --git a/tests/test_cli.py b/tests/test_cli.py
diff --git a/tests/test_models.py b/tests/test_models.py
diff --git a/tests/test_os_neural_sparse_doc_v3_gte.py b/tests/test_os_neural_sparse_doc_v3_gte.py
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -11,8 +11,6 @@ def configure_logger(logger: logging.Logger, *, verbose: bool) -> str:`
`11`	`11`	`"%(message)s"`
`12`	`12`	`)`
`13`	`13`	`logger.setLevel(logging.DEBUG)`
`14`		`- for handler in logging.root.handlers:`
`15`		`- handler.addFilter(logging.Filter("embeddings"))`
`16`	`14`	`else:`
`17`	`15`	`logging.basicConfig(`
`18`	`16`	`format="%(asctime)s %(levelname)s %(name)s.%(funcName)s(): %(message)s"`