Skip to content

Commit 3182485

Browse files
authored
bugfix: bulk insert serialization of data (#136)
## Description

Data can be inserted when linking or seeding. In the former case, we were inserting the data as a dictionary into the Patient.data column; in the latter, we were inserting it as a JSON string. This inconsistency causes issues on subsequent reads of the data. The conversion of a PIIRecord into a value for the database has been standardized in PIIRecord.to_dict(), and the bulk insert method (which is used in the seeding process) has been updated to use this method.
1 parent 7bd7b09 commit 3182485

File tree

5 files changed

+42
-10
lines changed

5 files changed

+42
-10
lines changed

src/recordlinker/database/mpi_service.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ def bulk_insert_patients(
132132
pat_data = [
133133
{
134134
"person_id": person and person.id,
135-
"_data": record.to_json(prune_empty=True),
135+
"_data": record.to_dict(prune_empty=True),
136136
"external_patient_id": record.external_id,
137137
"external_person_id": external_person_id,
138138
"external_person_source": "IRIS" if external_person_id else None,

src/recordlinker/models/mpi.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import enum
2-
import json
32
import uuid
43

54
from sqlalchemy import orm
@@ -96,12 +95,10 @@ def record(self, value):
9695
from recordlinker.schemas import pii
9796

9897
assert isinstance(value, pii.PIIRecord), "Expected a PIIRecord object"
99-
# convert the data to a JSON string, then load it back as a dictionary
100-
# this is necessary to ensure all data elements are JSON serializable
10198
# recursively remove all None and unset values from the data
10299
# this is an optimization to reduce the amount of data stored in the
103100
# database, if a value is empty, no need to store it
104-
self._data = json.loads(value.to_json(prune_empty=True))
101+
self._data = value.to_dict(prune_empty=True)
105102
if hasattr(self, "_record"):
106103
# if the record property is cached, delete it
107104
del self._record

src/recordlinker/schemas/pii.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import datetime
22
import enum
3+
import json
34
import re
45
import typing
56

@@ -288,6 +289,15 @@ def to_json(self, prune_empty: bool = False) -> str:
288289
"""
289290
return self.model_dump_json(exclude_unset=prune_empty, exclude_none=prune_empty)
290291

292+
def to_dict(self, prune_empty: bool = False) -> dict:
293+
"""
294+
Convert the PIIRecord object to a dictionary.
295+
"""
296+
# convert the data to a JSON string, then load it back as a dictionary
297+
# this is necessary to ensure all data elements are JSON serializable
298+
data = self.to_json(prune_empty=prune_empty)
299+
return json.loads(data)
300+
291301
def feature_iter(self, feature: Feature) -> typing.Iterator[str]:
292302
"""
293303
Given a field name, return an iterator of all string values for that field.

tests/unit/database/test_mpi_service.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
This module contains the unit tests for the recordlinker.database.mpi_service module.
66
"""
77

8-
import json
98
import uuid
109

1110
import pytest
@@ -253,7 +252,7 @@ def test_no_person(self, session):
253252
patients = mpi_service.bulk_insert_patients(session, [rec], external_person_id="123456")
254253
assert len(patients) == 1
255254
assert patients[0].person_id is None
256-
assert json.loads(patients[0].data) == {
255+
assert patients[0].data == {
257256
"name": [{"given": ["Johnathon"], "family": "Smith"}]
258257
}
259258
assert patients[0].external_person_id == "123456"
@@ -280,11 +279,11 @@ def test_with_person(self, session):
280279
assert len(patients) == 2
281280
assert patients[0].person_id == person.id
282281
assert patients[1].person_id == person.id
283-
assert json.loads(patients[0].data) == {
282+
assert patients[0].data == {
284283
"birth_date": "1950-01-01",
285284
"name": [{"given": ["George"], "family": "Harrison"}],
286285
}
287-
assert json.loads(patients[1].data) == {
286+
assert patients[1].data == {
288287
"birth_date": "1950-01-01",
289288
"name": [{"given": ["George", "Harold"], "family": "Harrison"}],
290289
}

tests/unit/routes/test_seed_router.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
This module contains the unit tests for the recordlinker.routes.seed_router module.
66
"""
77

8+
import unittest.mock as mock
9+
810
from conftest import load_test_json_asset
911

1012
from recordlinker import models
@@ -20,7 +22,10 @@ def test_too_many_clusters(self, client):
2022
data = {"clusters": [{"records": []} for _ in range(101)]}
2123
response = client.post("/seed", json=data)
2224
assert response.status_code == 422
23-
assert response.json()["detail"][0]["msg"] == "Value error, Clusters must not exceed 100 records"
25+
assert (
26+
response.json()["detail"][0]["msg"]
27+
== "Value error, Clusters must not exceed 100 records"
28+
)
2429

2530
def test_large_batch(self, client):
2631
data = load_test_json_asset("seed_test.json.gz")
@@ -35,6 +40,27 @@ def test_large_batch(self, client):
3540
assert client.session.query(models.Patient).count() == 1285
3641
assert client.session.query(models.BlockingValue).count() == 8995
3742

43+
@mock.patch("recordlinker.database.algorithm_service.default_algorithm")
44+
def test_seed_and_link(self, mock_algorithm, basic_algorithm, client):
45+
mock_algorithm.return_value = basic_algorithm
46+
record = {
47+
"birth_date": "1956-09-06",
48+
"sex": "F",
49+
"address": [{"line": ["581 Baker Club"], "postal_code": "80373"}],
50+
"name": [
51+
{
52+
"family": "Cervantes",
53+
"given": ["Jason"],
54+
}
55+
],
56+
}
57+
seed_resp = client.post("/seed", json={"clusters": [{"records": [record]}]})
58+
assert seed_resp.status_code == 201
59+
persons = seed_resp.json()["persons"]
60+
assert len(persons) == 1
61+
response = client.post("/link", json={"record": record})
62+
assert response.status_code == 200
63+
3864

3965
class TestReset:
4066
def test_reset(self, client):

0 commit comments

Comments
 (0)