Skip to content

Extending MongoDB auto-reconnect to all inserts (SCP-2629, SCP-5904) #379

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jan 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 4 additions & 15 deletions ingest/ingest_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@
from opencensus.ext.stackdriver.trace_exporter import StackdriverExporter
from opencensus.trace.samplers import AlwaysOnSampler
from opencensus.trace.tracer import Tracer
from pymongo import MongoClient
from mongo_connection import MongoConnection, graceful_auto_reconnect
from subsample import SubSample
from validation.validate_metadata import (
report_issues,
Expand Down Expand Up @@ -210,19 +210,7 @@ def __init__(

# Will be replaced by MongoConnection as defined in SCP-2629
def get_mongo_db(self):
    """Return the MongoDB client handle used for all ingest writes.

    Connection details (host, credentials, auth source/mechanism) are
    owned by MongoConnection; this method simply exposes its client so
    callers do not build their own pymongo MongoClient.
    """
    return MongoConnection()._client

def initialize_file_connection(self, file_type, file_path):
"""Initializes connection to file.
Expand Down Expand Up @@ -258,6 +246,7 @@ def initialize_file_connection(self, file_type, file_path):
self.report_validation("failure")
raise ValueError(v)

@graceful_auto_reconnect
def insert_many(self, collection_name, documents):
    """Bulk-insert *documents* into the named Mongo collection.

    Decorated with graceful_auto_reconnect so transient failures
    (AutoReconnect, retryable BulkWriteError) are retried with
    back-off. When the config flag says to bypass Mongo writes,
    this is a no-op.
    """
    if config.bypass_mongo_writes():
        return
    self.db[collection_name].insert_many(documents)
Expand Down Expand Up @@ -333,7 +322,7 @@ def load_subsample(
},
):
documents.append(model)
self.db["data_arrays"].insert_many(documents)
self.insert_many("data_arrays", documents)

except Exception as e:
log_exception(IngestPipeline.dev_logger, IngestPipeline.user_logger, e)
Expand Down
13 changes: 10 additions & 3 deletions ingest/mongo_connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,13 @@ def retry(attempt_num):
@functools.wraps(mongo_op_func)
def wrapper(*args, **kwargs):
args = list(args)
# detect which argument is the list of inserted documents
# when called from IngestPipeline, first arg is ingest instance, and 3rd is the list
# when called from GeneExpression (static implementation), first arg is list
if args[0].__class__.__name__ == 'IngestPipeline':
docs_idx = 2
else:
docs_idx = 0
Comment on lines +70 to +76
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Clever, nice solution.

for attempt in range(MongoConnection.MAX_AUTO_RECONNECT_ATTEMPTS):
try:
return mongo_op_func(*args, **kwargs)
Expand All @@ -84,12 +91,12 @@ def wrapper(*args, **kwargs):
error = bwe.details["writeErrors"]
# Check error code to see if any failures are due to violating a unique index (error code 11000)
# and discard those documents before retrying
filtered_docs = discard_inserted_documents(error, args[0])
filtered_docs = discard_inserted_documents(error, args[docs_idx])
if len(filtered_docs) > 0:
args[0] = filtered_docs
args[docs_idx] = filtered_docs
retry(attempt)
else:
return args[0]
return args[docs_idx]
else:
log_error_without_values(bwe)
raise bwe
Expand Down
Binary file modified ingest/validation/ontologies/cl.min.tsv.gz
Binary file not shown.
Binary file modified ingest/validation/ontologies/efo.min.tsv.gz
Binary file not shown.
Binary file modified ingest/validation/ontologies/mondo.min.tsv.gz
Binary file not shown.
Binary file modified ingest/validation/ontologies/uberon.min.tsv.gz
Binary file not shown.
2 changes: 1 addition & 1 deletion ingest/validation/ontologies/version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1734020562 # validation cache key
1737653567 # validation cache key
84 changes: 83 additions & 1 deletion tests/test_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
from test_dense import mock_load_r_files
import os

from pymongo.errors import AutoReconnect
from pymongo.errors import AutoReconnect, BulkWriteError
from test_expression_files import mock_expression_load
from mock_gcp import mock_storage_client, mock_storage_blob

Expand Down Expand Up @@ -813,6 +813,88 @@ def test_get_action_from_args(self):
bad_args = ["foo", "bar", "bing"]
self.assertEqual("", get_action_from_args(bad_args))

@patch("mongo_connection.MongoConnection.MAX_AUTO_RECONNECT_ATTEMPTS", 3)
def test_insert_reconnect(self):
    """IngestPipeline.insert_many retries transient Mongo failures.

    MAX_AUTO_RECONNECT_ATTEMPTS is patched to 3 so the retry loop
    stays fast; each retryable failure mode should be attempted
    exactly 3 times before the error propagates.
    """
    args = [
        "--study-id",
        "5d276a50421aa9117c982845",
        "--study-file-id",
        "5dd5ae25421aa910a723a337",
        "ingest_subsample",
        "--cluster-file",
        "../tests/data/good_subsample_cluster.csv",
        "--name",
        "cluster1",
        "--cell-metadata-file",
        "../tests/data/test_cell_metadata.csv",
        "--subsample",
    ]
    parsed_args = create_parser().parse_args(args)
    validate_arguments(parsed_args)
    arguments = vars(parsed_args)
    ingest = IngestPipeline(**arguments)
    # Replace the real database handle with a mock so no Mongo
    # connection is needed and call counts can be asserted.
    client_mock = MagicMock()
    ingest.db = client_mock
    docs = [
        {
            'id': 1,
            'name': 'foo',
            'study_id': 1,
            'study_file_id': 1,
            'array_index': 0,
            'linear_data_type': 'Cluster',
        },
        {
            'id': 2,
            'name': 'bar',
            'study_id': 1,
            'study_file_id': 1,
            'array_index': 0,
            'linear_data_type': 'Cluster',
        },
    ]
    # Happy path: documents are passed straight through to pymongo.
    ingest.insert_many("data_arrays", docs)
    client_mock["data_arrays"].insert_many.assert_called_with(docs)

    # Non-retryable errors (here ValueError) must propagate, not retry.
    client_mock["data_arrays"].insert_many.side_effect = ValueError("Foo")
    self.assertRaises(
        Exception, ingest.insert_many, "data_arrays", docs
    )
    client_mock.reset_mock()

    # Test exponential back off for auto reconnect
    client_mock["data_arrays"].insert_many.side_effect = AutoReconnect
    self.assertRaises(
        AutoReconnect, ingest.insert_many, "data_arrays", docs
    )
    # Patched attempt cap: insert should have been tried 3 times.
    self.assertEqual(client_mock["data_arrays"].insert_many.call_count, 3)
    client_mock.reset_mock()

    def raiseError(*args, **kwargs):
        # Simulate a duplicate-key write error (code 11000) so the
        # reconnect wrapper discards the already-inserted doc and
        # retries with the remainder.
        details = {
            "writeErrors": [
                {
                    "code": 11000,
                    "op": {
                        'id': 1,
                        'name': 'foo',
                        'study_id': 1,
                        'study_file_id': 1,
                        'array_index': 0,
                        'linear_data_type': 'Cluster',
                    },
                }
            ]
        }
        raise BulkWriteError(details)

    # Test exponential back off for BulkWriteError
    client_mock["data_arrays"].insert_many.side_effect = raiseError
    self.assertRaises(
        BulkWriteError, ingest.insert_many, "data_arrays", docs
    )
    self.assertEqual(client_mock["data_arrays"].insert_many.call_count, 3)


# Allow running this test module directly (outside a pytest runner).
if __name__ == "__main__":
    unittest.main()
Loading