Skip to content

Commit

Permalink
[r] Assign distinct replica type for DUOS (#6139)
Browse files (browse the repository at this point in the history)
  • Loading branch information
nadove-ucsc committed Jan 23, 2025
1 parent 6bac0f5 commit b677599
Show file tree
Hide file tree
Showing 7 changed files with 98 additions and 133 deletions.
23 changes: 3 additions & 20 deletions src/azul/plugins/metadata/anvil/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,26 +349,13 @@ def verbatim_pfb_schema(self,
entity_schemas = super().verbatim_pfb_schema(non_schema_replicas)
# For the rest, use the AnVIL schema as the basis of the PFB schema
for table_name, table_schema in table_schemas_by_name.items():
# FIXME: Improve handling of DUOS replicas
# https://github.com/DataBiosphere/azul/issues/6139
is_duos_type = table_name == 'anvil_dataset'
field_schemas = [
self._pfb_schema_from_anvil_column(table_name=table_name,
column_name='datarepo_row_id',
anvil_datatype='string',
is_optional=False,
is_polymorphic=is_duos_type)
is_optional=False)
]
if is_duos_type:
field_schemas.append(self._pfb_schema_from_anvil_column(table_name=table_name,
column_name='duos_id',
anvil_datatype='string',
is_polymorphic=True))
field_schemas.append(self._pfb_schema_from_anvil_column(table_name=table_name,
column_name='description',
anvil_datatype='string',
is_polymorphic=True))
elif table_name == 'anvil_file':
if table_name == 'anvil_file':
field_schemas.append(self._pfb_schema_from_anvil_column(table_name=table_name,
column_name='drs_uri',
anvil_datatype='string'))
Expand All @@ -378,8 +365,7 @@ def verbatim_pfb_schema(self,
column_name=column_schema['name'],
anvil_datatype=column_schema['datatype'],
is_array=column_schema['array_of'],
is_optional=not column_schema['required'],
is_polymorphic=is_duos_type)
is_optional=not column_schema['required'])
)

field_schemas.sort(key=itemgetter('name'))
Expand All @@ -397,7 +383,6 @@ def _pfb_schema_from_anvil_column(self,
anvil_datatype: str,
is_array: bool = False,
is_optional: bool = True,
is_polymorphic: bool = False
) -> AnyMutableJSON:
_anvil_to_pfb_types = {
'boolean': 'boolean',
Expand All @@ -414,8 +399,6 @@ def _pfb_schema_from_anvil_column(self,
'type': 'array',
'items': type_
}
if is_polymorphic and (is_array or not is_optional):
type_ = ['null', type_]
return {
'name': column_name,
'namespace': table_name,
Expand Down
5 changes: 4 additions & 1 deletion src/azul/plugins/metadata/anvil/indexer/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,10 @@ def _transform(self,

def _replicate(self, entity: EntityReference) -> tuple[str, JSON]:
content = ChainMap(self.bundle.entities, self.bundle.orphans)[entity]
return entity.entity_type, content
entity_type = ('duos_dataset_registration'
if self._is_duos(content)
else entity.entity_type)
return entity_type, content

def _convert_entity_type(self, entity_type: str) -> str:
assert entity_type == 'bundle' or entity_type.startswith('anvil_'), entity_type
Expand Down
7 changes: 3 additions & 4 deletions src/azul/service/avro_pfb.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,14 +193,13 @@ def for_aggregate(cls,
return cls(id=id_, name=name, object=object_)

@classmethod
def for_replica(cls, replica: MutableJSON, schema: JSON) -> Self:
def for_replica(cls, replica: MutableJSON) -> Self:
name, object_ = replica['replica_type'], replica['contents']
cls._add_missing_fields(name, object_, schema)
# Note that it is possible for two distinct replicas to have the same
# entity ID. For example, replicas representing the DUOS registration
# of AnVIL datasets have the same ID as the replica for the dataset
# itself. Terra appears to combine PFB entities with the same ID
# into a single row.
# itself. Terra appears to combine PFB entities with the same ID and
# name into a single row.
# FIXME: Improve handling of DUOS replicas
# https://github.com/DataBiosphere/azul/issues/6139
return cls(id=replica['entity_id'], name=name, object=object_)
Expand Down
2 changes: 1 addition & 1 deletion src/azul/service/manifest_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -2149,7 +2149,7 @@ def create_file(self) -> tuple[str, str | None]:
def pfb_entities():
yield pfb_metadata_entity
for replica in replicas:
yield avro_pfb.PFBEntity.for_replica(dict(replica), pfb_schema).to_json(())
yield avro_pfb.PFBEntity.for_replica(dict(replica)).to_json(())

fd, path = mkstemp(suffix=f'.{self.file_name_extension()}')
os.close(fd)
Expand Down
21 changes: 9 additions & 12 deletions test/service/data/manifest/verbatim/pfb/anvil/pfb_entities.json
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,13 @@
"properties": [],
"values": {}
},
{
"links": [],
"name": "duos_dataset_registration",
"ontology_reference": "",
"properties": [],
"values": {}
},
{
"links": [],
"name": "non_schema_orphan_table",
Expand All @@ -102,20 +109,12 @@
},
{
"id": "2370f948-2783-4eb6-afea-e022897f4dcf",
"name": "anvil_dataset",
"name": "duos_dataset_registration",
"object": {
"consent_group": null,
"data_modality": null,
"data_use_permission": null,
"datarepo_row_id": null,
"dataset_id": "52ee7665-7033-63f2-a8d9-ce8e32666739",
"description": "Study description from DUOS",
"duos_id": "DUOS-000000",
"owner": null,
"principal_investigator": null,
"registered_identifier": null,
"source_datarepo_row_ids": null,
"title": null
"version": "2022-06-01T00:00:00.000000Z"
},
"relations": []
},
Expand Down Expand Up @@ -282,8 +281,6 @@
],
"datarepo_row_id": "2370f948-2783-4eb6-afea-e022897f4dcf",
"dataset_id": "52ee7665-7033-63f2-a8d9-ce8e32666739",
"description": null,
"duos_id": null,
"owner": [
"Debbie Nickerson"
],
Expand Down
171 changes: 77 additions & 94 deletions test/service/data/manifest/verbatim/pfb/anvil/pfb_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -497,132 +497,89 @@
{
"name": "consent_group",
"namespace": "anvil_dataset",
"type": [
"null",
{
"items": [
"null",
"string"
],
"type": "array"
}
]
"type": {
"items": [
"null",
"string"
],
"type": "array"
}
},
{
"name": "data_modality",
"namespace": "anvil_dataset",
"type": [
"null",
{
"items": [
"null",
"string"
],
"type": "array"
}
]
"type": {
"items": [
"null",
"string"
],
"type": "array"
}
},
{
"name": "data_use_permission",
"namespace": "anvil_dataset",
"type": [
"null",
{
"items": [
"null",
"string"
],
"type": "array"
}
]
"type": {
"items": [
"null",
"string"
],
"type": "array"
}
},
{
"name": "datarepo_row_id",
"namespace": "anvil_dataset",
"type": [
"null",
"string"
]
"type": "string"
},
{
"name": "dataset_id",
"namespace": "anvil_dataset",
"type": [
"null",
"string"
]
},
{
"name": "description",
"namespace": "anvil_dataset",
"type": [
"null",
"string"
]
},
{
"name": "duos_id",
"namespace": "anvil_dataset",
"type": [
"null",
"string"
]
"type": "string"
},
{
"name": "owner",
"namespace": "anvil_dataset",
"type": [
"null",
{
"items": [
"null",
"string"
],
"type": "array"
}
]
"type": {
"items": [
"null",
"string"
],
"type": "array"
}
},
{
"name": "principal_investigator",
"namespace": "anvil_dataset",
"type": [
"null",
{
"items": [
"null",
"string"
],
"type": "array"
}
]
"type": {
"items": [
"null",
"string"
],
"type": "array"
}
},
{
"name": "registered_identifier",
"namespace": "anvil_dataset",
"type": [
"null",
{
"items": [
"null",
"string"
],
"type": "array"
}
]
"type": {
"items": [
"null",
"string"
],
"type": "array"
}
},
{
"name": "source_datarepo_row_ids",
"namespace": "anvil_dataset",
"type": [
"null",
{
"items": [
"null",
"string"
],
"type": "array"
}
]
"type": {
"items": [
"null",
"string"
],
"type": "array"
}
},
{
"name": "title",
Expand Down Expand Up @@ -1167,6 +1124,32 @@
"name": "anvil_variantcallingactivity",
"type": "record"
},
{
"fields": [
{
"name": "dataset_id",
"namespace": "duos_dataset_registration",
"type": "string"
},
{
"name": "description",
"namespace": "duos_dataset_registration",
"type": "string"
},
{
"name": "duos_id",
"namespace": "duos_dataset_registration",
"type": "string"
},
{
"name": "version",
"namespace": "duos_dataset_registration",
"type": "string"
}
],
"name": "duos_dataset_registration",
"type": "record"
},
{
"fields": [
{
Expand Down
2 changes: 1 addition & 1 deletion test/service/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2167,7 +2167,7 @@ def _canned_entities(self):
def hash_entities(entities: dict[EntityReference, JSON]) -> dict[str, JSON]:
return {
json_hash(contents).digest(): {
'type': ref.entity_type,
'type': 'duos_dataset_registration' if 'duos_id' in contents else ref.entity_type,
'value': contents
}
for ref, contents in entities.items()
Expand Down

0 comments on commit b677599

Please sign in to comment.