Commit f429b2c

Merge pull request #150 from MITLibraries/TIMX-506-dataset-metadata-class-client
TIMX 506 - new dataset metadata client
2 parents a479435 + ef8a8b8 commit f429b2c

File tree

6 files changed: +645 additions, −141 deletions

Pipfile.lock

Lines changed: 137 additions & 137 deletions
Some generated files are not rendered by default.

README.md

Lines changed: 116 additions & 2 deletions
````diff
@@ -1,5 +1,7 @@
+from timdex_dataset_api import TIMDEXDataset
+
 # timdex-dataset-api
-Python library for interacting with a TIMDEX parquet dataset located remotely or in S3.
+Python library for interacting with a TIMDEX parquet dataset located remotely or in S3. This library is often abbreviated as "TDA".
 
 ## Development
 
@@ -9,6 +11,13 @@ Python library for interacting with a TIMDEX parquet dataset located remotely or
 - To run unit tests: `make test`
 - To lint the repo: `make lint`
 
+The library version number is set in [`timdex_dataset_api/__init__.py`](timdex_dataset_api/__init__.py), e.g.:
+```python
+__version__ = "2.1.0"
+```
+
+Updating the version number when making changes to the library prompts applications that install it to pick up the new version the next time _their_ dependencies are updated.
+
 ## Installation
 
 This library is designed to be utilized by other projects, and can therefore be added as a dependency directly from the Github repository.
@@ -30,11 +39,116 @@ timdex_dataset_api = {git = "https://github.com/MITLibraries/timdex-dataset-api.
 
 ### Required
 
+None at this time.
+
 ### Optional
 ```shell
 TDA_LOG_LEVEL=# log level for timdex-dataset-api, accepts [DEBUG, INFO, WARNING, ERROR], default INFO
+WARNING_ONLY_LOGGERS=# comma-separated list of logger names to set as WARNING only, e.g. 'botocore,charset_normalizer,smart_open'
 ```
 
 ## Usage
 
-_TODO..._
+Currently, the most common use cases are:
+* **Transmogrifier**: uses TDA to **write** to the parquet dataset
+* **TIMDEX-Index-Manager (TIM)**: uses TDA to **read** from the parquet dataset
+
+Beyond those two ETL run use cases, others are emerging where this library proves helpful:
+
+* yielding only the current version of all records in the dataset, useful for quickly re-indexing to Opensearch
+* high throughput (time) + memory safe (space) access to the dataset for analysis
+
+For both reading and writing, the following env vars are recommended:
+```shell
+TDA_LOG_LEVEL=INFO
+WARNING_ONLY_LOGGERS=asyncio,botocore,urllib3,s3transfer,boto3
+```
+
+### Reading Data
+
+First, import the library:
+```python
+from timdex_dataset_api import TIMDEXDataset
+```
+
+Load a dataset instance:
+```python
+# dataset in S3
+timdex_dataset = TIMDEXDataset("s3://my-bucket/path/to/dataset")
+
+# or, local dataset (e.g. testing or development)
+timdex_dataset = TIMDEXDataset("/path/to/dataset")
+
+# load the dataset, which discovers all parquet files
+timdex_dataset.load()
+
+# or, load the dataset but ensure that only current records are ever yielded
+timdex_dataset.load(current_records=True)
+```
+
+All read methods for `TIMDEXDataset` accept the same group of filters, defined in `timdex_dataset_api.dataset.DatasetFilters`. Examples are shown below.
+
+```python
+# read a single row, no filtering
+single_record_dict = next(timdex_dataset.read_dicts_iter())
+
+
+# get batches of records, filtering to a particular run
+for batch in timdex_dataset.read_batches_iter(
+    source="alma",
+    run_date="2025-06-01",
+    run_id="abc123",
+):
+    ...  # do something with pyarrow batch
+
+
+# use convenience method to yield only transformed records
+# NOTE: this is what TIM uses for indexing to Opensearch for a given ETL run
+for transformed_record in timdex_dataset.read_transformed_records_iter(
+    source="aspace",
+    run_date="2025-06-01",
+    run_id="ghi789",
+):
+    ...  # do something with transformed record dictionary
+
+
+# load all records for a given run into a pandas dataframe
+# NOTE: this can be potentially expensive memory-wise if the run is large
+run_df = timdex_dataset.read_dataframe(
+    source="dspace",
+    run_date="2025-06-01",
+    run_id="def456",
+)
+```
+
+### Writing Data
+
+At this time, the only application that writes to the ETL parquet dataset is Transmogrifier.
+
+To write records to the dataset, you must prepare an iterator of `timdex_dataset_api.record.DatasetRecord`. Here is some pseudocode for how a dataset write can work:
+
+```python
+from collections.abc import Iterator
+
+from timdex_dataset_api import DatasetRecord, TIMDEXDataset
+
+# different ways to achieve this; you just need some kind of iterator
+# (e.g. list, generator, etc.) of DatasetRecords for writing
+def records_to_write_iter() -> Iterator[DatasetRecord]:
+    records = [...]
+    for record in records:
+        yield DatasetRecord(
+            timdex_record_id=...,
+            source_record=...,
+            transformed_record=...,
+            source=...,
+            run_date=...,
+            run_type=...,
+            run_timestamp=...,
+            action=...,
+            run_record_offset=...,
+        )
+records_iter = records_to_write_iter()
+
+# finally, perform the write, relying on the library to handle efficient batching
+timdex_dataset = TIMDEXDataset("/path/to/dataset")
+timdex_dataset.write(records_iter=records_iter)
+```
````

tests/conftest.py

Lines changed: 6 additions & 1 deletion
```diff
@@ -10,7 +10,7 @@
     generate_sample_records,
     generate_sample_records_with_simulated_partitions,
 )
-from timdex_dataset_api import TIMDEXDataset
+from timdex_dataset_api import TIMDEXDataset, TIMDEXDatasetMetadata
 from timdex_dataset_api.dataset import TIMDEXDatasetConfig
 
 
@@ -208,3 +208,8 @@ def dataset_with_same_day_runs(tmp_path) -> TIMDEXDataset:
     timdex_dataset.load()
 
     return timdex_dataset
+
+
+@pytest.fixture
+def timdex_dataset_metadata(dataset_with_same_day_runs):
+    return TIMDEXDatasetMetadata(timdex_dataset=dataset_with_same_day_runs)
```

tests/test_metadata.py

Lines changed: 89 additions & 0 deletions
```diff
@@ -0,0 +1,89 @@
+# ruff: noqa: PLR2004
+
+import duckdb
+
+from timdex_dataset_api import TIMDEXDataset, TIMDEXDatasetMetadata
+
+
+def test_tdm_init_from_timdex_dataset_instance_success(dataset_with_same_day_runs):
+    tdm = TIMDEXDatasetMetadata(timdex_dataset=dataset_with_same_day_runs)
+    assert isinstance(tdm.timdex_dataset, TIMDEXDataset)
+
+
+def test_tdm_init_from_timdex_dataset_path_success(dataset_with_runs_location):
+    tdm = TIMDEXDatasetMetadata.from_dataset_location(dataset_with_runs_location)
+    assert isinstance(tdm.timdex_dataset, TIMDEXDataset)
+
+
+def test_tdm_default_database_location_in_memory(timdex_dataset_metadata):
+    assert timdex_dataset_metadata.db_path == ":memory:"
+    result = timdex_dataset_metadata.conn.query("PRAGMA database_list;").fetchone()
+    assert result[1] == "memory"  # name of database
+    assert result[2] is None  # file associated with database, where None is memory
+
+
+def test_tdm_explicit_database_in_file(tmp_path, dataset_with_runs_location):
+    db_path = str(tmp_path / "tda.duckdb")
+    tdm = TIMDEXDatasetMetadata.from_dataset_location(
+        dataset_with_runs_location,
+        db_path=db_path,
+    )
+    assert tdm.db_path == db_path
+    result = tdm.conn.query("PRAGMA database_list;").fetchone()
+    assert result[1] == "tda"  # name of database
+    assert result[2] == db_path  # filepath passed during init
+
+
+def test_tdm_get_duckdb_connection(timdex_dataset_metadata):
+    conn = timdex_dataset_metadata.get_connection()
+    assert isinstance(conn, duckdb.DuckDBPyConnection)
+
+
+def test_tdm_set_threads(timdex_dataset_metadata):
+    # set to 64
+    timdex_dataset_metadata.set_database_thread_usage(64)
+    thread_count = timdex_dataset_metadata.conn.query(
+        """SELECT current_setting('threads');"""
+    ).fetchone()[0]
+    assert thread_count == 64
+
+    # set to 12
+    timdex_dataset_metadata.set_database_thread_usage(12)
+    thread_count = timdex_dataset_metadata.conn.query(
+        """SELECT current_setting('threads');"""
+    ).fetchone()[0]
+    assert thread_count == 12
+
+
+def test_tdm_init_sets_up_database(timdex_dataset_metadata):
+    df = timdex_dataset_metadata.conn.query("show tables;").to_df()
+    assert set(df.name) == {"current_records", "records"}
+
+
+def test_tdm_get_current_parquet_files(timdex_dataset_metadata):
+    parquet_files = timdex_dataset_metadata.get_current_parquet_files()
+    # assert 5 total parquet files in dataset
+    # but only 3 contain current records
+    assert len(timdex_dataset_metadata.timdex_dataset.dataset.files) == 5
+    assert len(parquet_files) == 3
+
+
+def test_tdm_get_record_to_run_mapping(timdex_dataset_metadata):
+    record_map = timdex_dataset_metadata.get_current_record_to_run_map()
+
+    assert len(record_map) == 75
+    assert record_map["alma:0"] == "run-5"
+    assert record_map["alma:5"] == "run-4"
+    assert record_map["alma:19"] == "run-4"
+    assert "run-3" not in record_map.values()
+    assert record_map["alma:20"] == "run-2"
+
+
+def test_tdm_current_records_subset_of_all_records(timdex_dataset_metadata):
+    records_df = timdex_dataset_metadata.conn.query("select * from records;").to_df()
+    current_records_df = timdex_dataset_metadata.conn.query(
+        "select * from current_records;"
+    ).to_df()
+    assert set(current_records_df.timdex_record_id).issubset(
+        set(records_df.timdex_record_id)
+    )
```
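The `current_records` behavior exercised by the tests above amounts to keeping, for each `timdex_record_id`, only the row from the most recent run. A stdlib sketch of that reduction (the rows here are invented for illustration; the library itself performs this in DuckDB, not in Python):

```python
from operator import itemgetter

# hypothetical sample rows; field names mirror the dataset columns,
# values are invented for illustration
rows = [
    {"timdex_record_id": "alma:0", "run_id": "run-4", "run_timestamp": "2025-06-01T01:00:00"},
    {"timdex_record_id": "alma:0", "run_id": "run-5", "run_timestamp": "2025-06-01T02:00:00"},
    {"timdex_record_id": "alma:5", "run_id": "run-4", "run_timestamp": "2025-06-01T01:00:00"},
]

# sort oldest-to-newest so later runs overwrite earlier ones per record id
current: dict[str, dict] = {}
for row in sorted(rows, key=itemgetter("run_timestamp")):
    current[row["timdex_record_id"]] = row

record_to_run_map = {rid: row["run_id"] for rid, row in current.items()}
# record_to_run_map == {"alma:0": "run-5", "alma:5": "run-4"}
```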

timdex_dataset_api/__init__.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -1,11 +1,13 @@
 """timdex_dataset_api/__init__.py"""
 
 from timdex_dataset_api.dataset import TIMDEXDataset
+from timdex_dataset_api.metadata import TIMDEXDatasetMetadata
 from timdex_dataset_api.record import DatasetRecord
 
-__version__ = "2.1.0"
+__version__ = "2.2.0"
 
 __all__ = [
     "DatasetRecord",
     "TIMDEXDataset",
+    "TIMDEXDatasetMetadata",
 ]
```
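The version bump above (2.1.0 → 2.2.0) signals a backwards-compatible addition (the new `TIMDEXDatasetMetadata` export) under semantic versioning. For a consumer that needs to compare such version strings, a minimal tuple-parse sketch (assumes plain `MAJOR.MINOR.PATCH` strings with no pre-release tags; `parse_version` is an illustrative helper, not part of this library):

```python
def parse_version(version: str) -> tuple[int, ...]:
    # "2.2.0" -> (2, 2, 0); tuples compare element-wise, so ordering works
    return tuple(int(part) for part in version.split("."))

assert parse_version("2.2.0") > parse_version("2.1.0")
assert parse_version("2.10.0") > parse_version("2.2.0")  # numeric, not lexicographic
```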
