From 3d971264747bbabaf33601d641c232614396d9c2 Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Wed, 22 Jan 2025 22:13:06 -0800 Subject: [PATCH] [r] Assign distinct replica type for DUOS (#6139) --- src/azul/plugins/metadata/anvil/__init__.py | 23 +-- .../metadata/anvil/indexer/transform.py | 13 +- src/azul/service/avro_pfb.py | 7 +- src/azul/service/manifest_service.py | 2 +- .../manifest/verbatim/jsonl/anvil/linked.json | 2 +- .../verbatim/pfb/anvil/pfb_entities.json | 21 +-- .../verbatim/pfb/anvil/pfb_schema.json | 171 ++++++++---------- 7 files changed, 106 insertions(+), 133 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/__init__.py b/src/azul/plugins/metadata/anvil/__init__.py index 292a40b05..e1a462145 100644 --- a/src/azul/plugins/metadata/anvil/__init__.py +++ b/src/azul/plugins/metadata/anvil/__init__.py @@ -349,26 +349,13 @@ def verbatim_pfb_schema(self, entity_schemas = super().verbatim_pfb_schema(non_schema_replicas) # For the rest, use the AnVIL schema as the basis of the PFB schema for table_name, table_schema in table_schemas_by_name.items(): - # FIXME: Improve handling of DUOS replicas - # https://github.com/DataBiosphere/azul/issues/6139 - is_duos_type = table_name == 'anvil_dataset' field_schemas = [ self._pfb_schema_from_anvil_column(table_name=table_name, column_name='datarepo_row_id', anvil_datatype='string', - is_optional=False, - is_polymorphic=is_duos_type) + is_optional=False) ] - if is_duos_type: - field_schemas.append(self._pfb_schema_from_anvil_column(table_name=table_name, - column_name='duos_id', - anvil_datatype='string', - is_polymorphic=True)) - field_schemas.append(self._pfb_schema_from_anvil_column(table_name=table_name, - column_name='description', - anvil_datatype='string', - is_polymorphic=True)) - elif table_name == 'anvil_file': + if table_name == 'anvil_file': field_schemas.append(self._pfb_schema_from_anvil_column(table_name=table_name, column_name='drs_uri', anvil_datatype='string')) @@ -378,8 +365,7 @@ def verbatim_pfb_schema(self, column_name=column_schema['name'], anvil_datatype=column_schema['datatype'], is_array=column_schema['array_of'], - is_optional=not column_schema['required'], - is_polymorphic=is_duos_type) + is_optional=not column_schema['required']) ) field_schemas.sort(key=itemgetter('name')) @@ -397,7 +383,6 @@ def _pfb_schema_from_anvil_column(self, anvil_datatype: str, is_array: bool = False, is_optional: bool = True, - is_polymorphic: bool = False ) -> AnyMutableJSON: _anvil_to_pfb_types = { 'boolean': 'boolean', @@ -414,8 +399,6 @@ def _pfb_schema_from_anvil_column(self, 'type': 'array', 'items': type_ } - if is_polymorphic and (is_array or not is_optional): - type_ = ['null', type_] return { 'name': column_name, 'namespace': table_name, diff --git a/src/azul/plugins/metadata/anvil/indexer/transform.py b/src/azul/plugins/metadata/anvil/indexer/transform.py index 5ff292ffa..bbe8398eb 100644 --- a/src/azul/plugins/metadata/anvil/indexer/transform.py +++ b/src/azul/plugins/metadata/anvil/indexer/transform.py @@ -506,7 +506,12 @@ def _duos(self, dataset: EntityReference) -> MutableJSON: return self._entity(dataset, self._duos_types()) def _is_duos(self, dataset: EntityReference) -> bool: - return 'duos_id' in self.bundle.entities[dataset] + try: + contents = self.bundle.entities[dataset] + except KeyError: + return False + else: + return 'duos_id' in contents def _dataset(self, dataset: EntityReference) -> MutableJSON: if self._is_duos(dataset): @@ -514,6 +519,12 @@ def _dataset(self, dataset: EntityReference) -> MutableJSON: else: return super()._dataset(dataset) + def _replica_type(self, entity: EntityReference) -> str: + if entity.entity_type == 'anvil_dataset' and self._is_duos(entity): + return 'duos_dataset_registration' + else: + return super()._replica_type(entity) + def _list_entities(self) -> Iterable[EntityReference]: # Suppress contributions for bundles that only contain orphans if self.bundle.entities: diff --git a/src/azul/service/avro_pfb.py b/src/azul/service/avro_pfb.py index cab2f6142..ba4ab31cf 100644 --- a/src/azul/service/avro_pfb.py +++ b/src/azul/service/avro_pfb.py @@ -193,14 +193,13 @@ def for_aggregate(cls, return cls(id=id_, name=name, object=object_) @classmethod - def for_replica(cls, replica: MutableJSON, schema: JSON) -> Self: + def for_replica(cls, replica: MutableJSON) -> Self: name, object_ = replica['replica_type'], replica['contents'] - cls._add_missing_fields(name, object_, schema) # Note that it is possible for two distinct replicas to have the same # entity ID. For example, replicas representing the DUOS registration # of AnVIL datasets have the same ID as the replica for the dataset - # itself. Terra appears to combine PFB entities with the same ID - # into a single row. + # itself. Terra appears to combine PFB entities with the same ID and + # name into a single row. # FIXME: Improve handling of DUOS replicas # https://github.com/DataBiosphere/azul/issues/6139 return cls(id=replica['entity_id'], name=name, object=object_) diff --git a/src/azul/service/manifest_service.py b/src/azul/service/manifest_service.py index f444326c5..de38a5493 100644 --- a/src/azul/service/manifest_service.py +++ b/src/azul/service/manifest_service.py @@ -2149,7 +2149,7 @@ def create_file(self) -> tuple[str, str | None]: def pfb_entities(): yield pfb_metadata_entity for replica in replicas: - yield avro_pfb.PFBEntity.for_replica(dict(replica), pfb_schema).to_json(()) + yield avro_pfb.PFBEntity.for_replica(dict(replica)).to_json(()) fd, path = mkstemp(suffix=f'.{self.file_name_extension()}') os.close(fd) diff --git a/test/service/data/manifest/verbatim/jsonl/anvil/linked.json b/test/service/data/manifest/verbatim/jsonl/anvil/linked.json index 673b15da5..02ba7a46c 100644 --- a/test/service/data/manifest/verbatim/jsonl/anvil/linked.json +++ b/test/service/data/manifest/verbatim/jsonl/anvil/linked.json @@ -224,7 +224,7 @@ } }, { - "type": "anvil_dataset", + "type": "duos_dataset_registration", "value": { "dataset_id": "52ee7665-7033-63f2-a8d9-ce8e32666739", "description": "Study description from DUOS", diff --git a/test/service/data/manifest/verbatim/pfb/anvil/pfb_entities.json b/test/service/data/manifest/verbatim/pfb/anvil/pfb_entities.json index d03b19e87..75fd60369 100644 --- a/test/service/data/manifest/verbatim/pfb/anvil/pfb_entities.json +++ b/test/service/data/manifest/verbatim/pfb/anvil/pfb_entities.json @@ -89,6 +89,13 @@ "properties": [], "values": {} }, + { + "links": [], + "name": "duos_dataset_registration", + "ontology_reference": "", + "properties": [], + "values": {} + }, { "links": [], "name": "non_schema_orphan_table", @@ -102,20 +109,12 @@ }, { "id": "2370f948-2783-4eb6-afea-e022897f4dcf", - "name": "anvil_dataset", + "name": "duos_dataset_registration", "object": { - "consent_group": null, - "data_modality": null, - "data_use_permission": null, - "datarepo_row_id": null, "dataset_id": "52ee7665-7033-63f2-a8d9-ce8e32666739", "description": "Study description from DUOS", "duos_id": "DUOS-000000", - "owner": null, - "principal_investigator": null, - "registered_identifier": null, - "source_datarepo_row_ids": null, - "title": null + "version": "2022-06-01T00:00:00.000000Z" }, "relations": [] }, @@ -282,8 +281,6 @@ ], "datarepo_row_id": "2370f948-2783-4eb6-afea-e022897f4dcf", "dataset_id": "52ee7665-7033-63f2-a8d9-ce8e32666739", - "description": null, - "duos_id": null, "owner": [ "Debbie Nickerson" ], diff --git a/test/service/data/manifest/verbatim/pfb/anvil/pfb_schema.json b/test/service/data/manifest/verbatim/pfb/anvil/pfb_schema.json index 1f0d38f6f..145d3d9f2 100644 --- a/test/service/data/manifest/verbatim/pfb/anvil/pfb_schema.json +++ b/test/service/data/manifest/verbatim/pfb/anvil/pfb_schema.json @@ -497,132 +497,89 @@ { "name": "consent_group", "namespace": "anvil_dataset", - "type": [ - "null", - { - "items": [ - "null", - "string" - ], - "type": "array" - } - ] + "type": { + "items": [ + "null", + "string" + ], + "type": "array" + } }, { "name": "data_modality", "namespace": "anvil_dataset", - "type": [ - "null", - { - "items": [ - "null", - "string" - ], - "type": "array" - } - ] + "type": { + "items": [ + "null", + "string" + ], + "type": "array" + } }, { "name": "data_use_permission", "namespace": "anvil_dataset", - "type": [ - "null", - { - "items": [ - "null", - "string" - ], - "type": "array" - } - ] + "type": { + "items": [ + "null", + "string" + ], + "type": "array" + } }, { "name": "datarepo_row_id", "namespace": "anvil_dataset", - "type": [ - "null", - "string" - ] + "type": "string" }, { "name": "dataset_id", "namespace": "anvil_dataset", - "type": [ - "null", - "string" - ] - }, - { - "name": "description", - "namespace": "anvil_dataset", - "type": [ - "null", - "string" - ] - }, - { - "name": "duos_id", - "namespace": "anvil_dataset", - "type": [ - "null", - "string" - ] + "type": "string" }, { "name": "owner", "namespace": "anvil_dataset", - "type": [ - "null", - { - "items": [ - "null", - "string" - ], - "type": "array" - } - ] + "type": { + "items": [ + "null", + "string" + ], + "type": "array" + } }, { "name": "principal_investigator", "namespace": "anvil_dataset", - "type": [ - "null", - { - "items": [ - "null", - "string" - ], - "type": "array" - } - ] + "type": { + "items": [ + "null", + "string" + ], + "type": "array" + } }, { "name": "registered_identifier", "namespace": "anvil_dataset", - "type": [ - "null", - { - "items": [ - "null", - "string" - ], - "type": "array" - } - ] + "type": { + "items": [ + "null", + "string" + ], + "type": "array" + } }, { "name": "source_datarepo_row_ids", "namespace": "anvil_dataset", - "type": [ - "null", - { - "items": [ - "null", - "string" - ], - "type": "array" - } - ] + "type": { + "items": [ + "null", + "string" + ], + "type": "array" + } }, { "name": "title", @@ -1167,6 +1124,32 @@ "name": "anvil_variantcallingactivity", "type": "record" }, + { + "fields": [ + { + "name": "dataset_id", + "namespace": "duos_dataset_registration", + "type": "string" + }, + { + "name": "description", + "namespace": "duos_dataset_registration", + "type": "string" + }, + { + "name": "duos_id", + "namespace": "duos_dataset_registration", + "type": "string" + }, + { + "name": "version", + "namespace": "duos_dataset_registration", + "type": "string" + } + ], + "name": "duos_dataset_registration", + "type": "record" + }, { "fields": [ {