Skip to content

Commit 990c736

Browse files
authored
INTPYTHON-498 Missing field handling in nested data (#266)
1 parent b720226 commit 990c736

File tree

2 files changed

+49
-0
lines changed

2 files changed

+49
-0
lines changed

bindings/python/pymongoarrow/lib.pyx

+6
Original file line number | Diff line number | Diff line change
@@ -254,6 +254,12 @@ cdef class BuilderManager:
254254
# For list children, the nulls are stored in the parent.
255255
key = field.encode('utf-8')
256256
parent_type = self.parent_types.get(key, None)
257+
# Check if the item was in our schema but never seen, and should have a parent.
258+
if parent_type is None and "." in field:
259+
parent_key, _, _ = field.rpartition('.')
260+
self.parent_names[key] = parent_key.encode('utf-8')
261+
parent_type = BSON_TYPE_DOCUMENT
262+
# Add nulls according to parent type.
257263
if parent_type == BSON_TYPE_ARRAY:
258264
continue
259265
if parent_type == BSON_TYPE_DOCUMENT:

bindings/python/test/test_arrow.py

+43
Original file line number | Diff line number | Diff line change
@@ -533,6 +533,49 @@ def test_schema_arrays_of_documents_with_nulls(self):
533533
expected = json.load(fid)
534534
assert df.to_pylist() == expected
535535

536+
def test_schema_arrays_of_documents_orphaned_null(self):
537+
# From https://github.com/mongodb-labs/mongo-arrow/issues/265.
538+
col = self.coll
539+
col.delete_many({})
540+
schema = Schema(
541+
{
542+
"_id": ObjectId,
543+
"test_list_struct": [
544+
{
545+
"field1": {
546+
"sub_field1": pa.string(),
547+
"sub_field2": pa.string(),
548+
}
549+
}
550+
],
551+
}
552+
)
553+
554+
col.insert_one(
555+
{
556+
"_id": ObjectId("000000000000000000000001"),
557+
"test_list_struct": [
558+
{
559+
"field1": {
560+
"sub_field1": "test_data",
561+
}
562+
},
563+
{
564+
"field1": "test_data",
565+
},
566+
],
567+
}
568+
)
569+
df = aggregate_arrow_all(col, schema=schema, pipeline=[])
570+
doc = df.to_pylist()[0]
571+
del doc["_id"]
572+
assert doc == {
573+
"test_list_struct": [
574+
{"field1": {"sub_field1": "test_data", "sub_field2": None}},
575+
{"field1": {"sub_field1": None, "sub_field2": None}},
576+
]
577+
}
578+
536579
def test_auto_schema_nested(self):
537580
# Create table with random data of various types.
538581
_, data = self._create_nested_data()

0 commit comments

Comments
 (0)