Skip to content

Commit f594059

Browse files
authored
Merge pull request #112 from MITLibraries/timx-128-update-config
Timx 128 update config
2 parents 20945f7 + 4246e4a commit f594059

File tree

6 files changed, with 307 additions and 265 deletions.

.python-version

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
3.10.6
1+
3.10.8

Pipfile.lock

Lines changed: 264 additions & 244 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@ TIMDEX! Index Manager (TIM) is a Python cli application for managing TIMDEX inde
99
## Optional ENV
1010

1111
- `AWS_REGION` = Only needed if AWS region changes from the default of us-east-1.
12-
- `OPENSEARCH_REQUEST_TIMEOUT` = Only used for OpenSearch requests that tend to take longer than the default timeout of 10 seconds, such as bulk or index refresh requests. Defaults to 30 seconds if not set.
12+
- `OPENSEARCH_BULK_MAX_CHUNK_BYTES` = Chunk size limit for sending requests to the bulk indexing endpoint, in bytes. Defaults to 100 MB (the opensearchpy default) if not set.
13+
- `OPENSEARCH_BULK_MAX_RETRIES` = Maximum number of retries when sending requests to the bulk indexing endpoint. Defaults to 8 if not set.
14+
- `OPENSEARCH_REQUEST_TIMEOUT` = Only used for OpenSearch requests that tend to take longer than the default timeout of 10 seconds, such as bulk or index refresh requests. Defaults to 120 seconds if not set.
1315
- `SENTRY_DSN` = If set to a valid Sentry DSN, enables Sentry exception monitoring. This is not needed for local development.
1416
- `STATUS_UPDATE_INTERVAL` = The ingest process logs the # of records indexed every nth record (1000 by default). Set this env variable to any integer to change the frequency of logging status updates. Can be useful for development/debugging.
1517
- `TIMDEX_OPENSEARCH_ENDPOINT` = If using a local Docker OpenSearch instance, this isn't needed. Otherwise set to OpenSearch instance endpoint _without_ the http scheme, e.g. `search-timdex-env-1234567890.us-east-1.es.amazonaws.com`. Can also be passed directly to the CLI via the `--url` option.

tests/test_config.py

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
import logging
22

33
from tim.config import (
4+
OPENSEARCH_BULK_CONFIG_DEFAULTS,
45
configure_index_settings,
56
configure_logger,
7+
configure_opensearch_bulk_settings,
68
configure_sentry,
7-
opensearch_request_timeout,
89
)
910

1011

@@ -28,6 +29,24 @@ def test_configure_logger_verbose():
2829
assert result == "Logger 'tests.test_config' configured with level=DEBUG"
2930

3031

32+
def test_configure_opensearch_bulk_settings_from_env(monkeypatch):
33+
monkeypatch.setenv("OPENSEARCH_BULK_MAX_CHUNK_BYTES", "10")
34+
monkeypatch.setenv("OPENSEARCH_BULK_MAX_RETRIES", "2")
35+
monkeypatch.setenv("OPENSEARCH_REQUEST_TIMEOUT", "20")
36+
assert configure_opensearch_bulk_settings() == {
37+
"OPENSEARCH_BULK_MAX_CHUNK_BYTES": 10,
38+
"OPENSEARCH_BULK_MAX_RETRIES": 2,
39+
"OPENSEARCH_REQUEST_TIMEOUT": 20,
40+
}
41+
42+
43+
def test_configure_opensearch_bulk_settings_uses_defaults(monkeypatch):
44+
monkeypatch.delenv("OPENSEARCH_BULK_MAX_CHUNK_BYTES", raising=False)
45+
monkeypatch.delenv("OPENSEARCH_BULK_MAX_RETRIES", raising=False)
46+
monkeypatch.delenv("OPENSEARCH_REQUEST_TIMEOUT", raising=False)
47+
assert configure_opensearch_bulk_settings() == OPENSEARCH_BULK_CONFIG_DEFAULTS
48+
49+
3150
def test_configure_sentry_no_env_variable(monkeypatch):
3251
monkeypatch.delenv("SENTRY_DSN", raising=False)
3352
result = configure_sentry()
@@ -44,13 +63,3 @@ def test_configure_sentry_env_variable_is_dsn(monkeypatch):
4463
monkeypatch.setenv("SENTRY_DSN", "https://1234567890@00000.ingest.sentry.io/123456")
4564
result = configure_sentry()
4665
assert result == "Sentry DSN found, exceptions will be sent to Sentry with env=test"
47-
48-
49-
def test_opensearch_request_timeout_default(monkeypatch):
50-
monkeypatch.delenv("OPENSEARCH_REQUEST_TIMEOUT", raising=False)
51-
assert opensearch_request_timeout() == 120
52-
53-
54-
def test_opensearch_request_timeout_from_env(monkeypatch):
55-
monkeypatch.setenv("OPENSEARCH_REQUEST_TIMEOUT", "5")
56-
assert opensearch_request_timeout() == 5

tim/config.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@
44

55
import sentry_sdk
66

7+
OPENSEARCH_BULK_CONFIG_DEFAULTS = {
8+
"OPENSEARCH_BULK_MAX_CHUNK_BYTES": 100 * 1024 * 1024,
9+
"OPENSEARCH_BULK_MAX_RETRIES": 8,
10+
"OPENSEARCH_REQUEST_TIMEOUT": 120,
11+
}
712
PRIMARY_ALIAS = "all-current"
813
VALID_BULK_OPERATIONS = ["create", "delete", "index", "update"]
914
VALID_SOURCES = ["alma", "aspace", "dspace", "jpal", "whoas", "zenodo"]
@@ -35,14 +40,17 @@ def configure_logger(logger: logging.Logger, verbose: bool) -> str:
3540
)
3641

3742

43+
def configure_opensearch_bulk_settings() -> dict[str, int]:
44+
result = {}
45+
for key, value in OPENSEARCH_BULK_CONFIG_DEFAULTS.items():
46+
result[key] = int(os.getenv(key) or value)
47+
return result
48+
49+
3850
def configure_sentry() -> str:
3951
env = os.getenv("WORKSPACE")
4052
sentry_dsn = os.getenv("SENTRY_DSN")
4153
if sentry_dsn and sentry_dsn.lower() != "none":
4254
sentry_sdk.init(sentry_dsn, environment=env)
4355
return f"Sentry DSN found, exceptions will be sent to Sentry with env={env}"
4456
return "No Sentry DSN found, exceptions will not be sent to Sentry"
45-
46-
47-
def opensearch_request_timeout() -> int:
48-
return int(os.getenv("OPENSEARCH_REQUEST_TIMEOUT", "120"))

tim/opensearch.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from tim.config import (
1313
PRIMARY_ALIAS,
1414
configure_index_settings,
15-
opensearch_request_timeout,
15+
configure_opensearch_bulk_settings,
1616
)
1717
from tim.errors import AliasNotFoundError, IndexExistsError, IndexNotFoundError
1818

@@ -315,14 +315,16 @@ def bulk_index(
315315
Returns total sums of: records created, records updated, errors, and total records
316316
processed.
317317
"""
318+
bulk_config = configure_opensearch_bulk_settings()
318319
result = {"created": 0, "updated": 0, "errors": 0, "total": 0}
319320
actions = helpers.generate_bulk_actions(index, records, "index")
320321
responses = streaming_bulk(
321322
client,
322323
actions,
323-
max_retries=3,
324+
max_chunk_bytes=bulk_config["OPENSEARCH_BULK_MAX_CHUNK_BYTES"],
325+
max_retries=bulk_config["OPENSEARCH_BULK_MAX_RETRIES"],
324326
raise_on_error=False,
325-
request_timeout=opensearch_request_timeout(),
327+
request_timeout=bulk_config["OPENSEARCH_REQUEST_TIMEOUT"],
326328
)
327329
for response in responses:
328330
if response[0] is False:
@@ -347,7 +349,8 @@ def bulk_index(
347349
logger.info("Status update: %s records indexed so far!", result["total"])
348350
logger.info("All records ingested, refreshing index.")
349351
response = client.indices.refresh(
350-
index=index, request_timeout=opensearch_request_timeout()
352+
index=index,
353+
request_timeout=bulk_config["OPENSEARCH_REQUEST_TIMEOUT"],
351354
)
352355
logger.debug(response)
353356
return result

0 commit comments

Comments
 (0)