Skip to content

Commit

Permalink
Refactor and pluginify verbatim PFB links
Browse files Browse the repository at this point in the history
  • Loading branch information
nadove-ucsc committed Jan 30, 2025
1 parent 1a88afa commit b308aa7
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 12 deletions.
8 changes: 8 additions & 0 deletions src/azul/plugins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
from azul.types import (
JSON,
MutableJSON,
MutableJSONs,
get_generic_type_params,
)

Expand Down Expand Up @@ -503,6 +504,13 @@ def verbatim_pfb_relations(self,
"""
return []

def verbatim_pfb_links(self, replica_type: str) -> MutableJSONs:
    """
    Express the relationships of the given replica type as PFB links
    (https://uc-cdis.github.io/pypfb/#link).

    :param replica_type: the name of the replica type whose links are
                         requested

    :return: a list of PFB link objects for that replica type. This
             implementation returns an empty list, i.e., it declares no
             links for any replica type.
    """
    # NOTE(review): presumably plugins that model relationships between
    # replica types override this method -- confirm against the plugins.
    return []

@abstractmethod
def document_slice(self, entity_type: str) -> DocumentSlice | None:
raise NotImplementedError
Expand Down
24 changes: 16 additions & 8 deletions src/azul/service/avro_pfb.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
)
from typing import (
ClassVar,
Mapping,
MutableSet,
Self,
)
Expand Down Expand Up @@ -63,6 +64,7 @@
AnyMutableJSON,
JSON,
MutableJSON,
MutableJSONs,
)

log = logging.getLogger(__name__)
Expand Down Expand Up @@ -261,8 +263,18 @@ def to_entity(cls, entity: PFBEntity) -> Self:
return cls(dst_id=entity.id, dst_name=entity.name)


def pfb_metadata_entity(entity_types: Iterable[str],
links: bool = True
def pfb_links_from_field_types(field_types: FieldTypes
                               ) -> dict[str, MutableJSONs]:
    """
    Derive the PFB links (https://uc-cdis.github.io/pypfb/#link) for every
    entity type in the given field types.

    Each entity type other than `files` gets exactly one many-to-many link
    to `files`. The `files` entity type itself gets no links.

    :param field_types: a mapping keyed by entity type name. Only the keys
                        are consulted.

    :return: a dictionary that maps each entity type to its list of links,
             in the iteration order of `field_types`. The result can be
             passed directly to pfb_metadata_entity(). Every entity type
             gets its own, freshly created list so that callers may mutate
             one entry without affecting the others.
    """
    return {
        entity_type: [] if entity_type == 'files' else [{
            'multiplicity': 'MANY_TO_MANY',
            'dst': 'files',
            'name': 'files'
        }]
        for entity_type in field_types
    }


def pfb_metadata_entity(links_by_entity_type: Mapping[str, MutableJSONs],
) -> MutableJSON:
"""
The Metadata entity encodes the possible relationships between tables.
Expand All @@ -278,13 +290,9 @@ def pfb_metadata_entity(entity_types: Iterable[str],
'name': entity_type,
'ontology_reference': '',
'values': {},
'links': [] if not links or entity_type == 'files' else [{
'multiplicity': 'MANY_TO_MANY',
'dst': 'files',
'name': 'files'
}],
'links': links,
'properties': []
} for entity_type in entity_types
} for entity_type, links in links_by_entity_type.items()
],
'misc': {}
}
Expand Down
10 changes: 7 additions & 3 deletions src/azul/service/manifest_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -1744,7 +1744,8 @@ def create_file(self) -> tuple[str, str | None]:
for doc in self._all_docs_sorted():
converter.add_doc(doc)

entity = avro_pfb.pfb_metadata_entity(field_types)
links = avro_pfb.pfb_links_from_field_types(field_types)
entity = avro_pfb.pfb_metadata_entity(links)
entities = itertools.chain([entity], converter.entities())

fd, path = mkstemp(suffix='.avro')
Expand Down Expand Up @@ -2146,9 +2147,12 @@ def create_file(self) -> tuple[str, str | None]:
replica_schemas = plugin.verbatim_pfb_schema(replicas)
# Ensure field order is consistent for unit tests
replica_schemas.sort(key=itemgetter('name'))
replica_types = [s['name'] for s in replica_schemas]
links = {
replica_type: plugin.verbatim_pfb_links(replica_type)
for replica_type in ([s['name'] for s in replica_schemas])
}
pfb_metadata_entity = avro_pfb.pfb_metadata_entity(links)
pfb_schema = avro_pfb.avro_pfb_schema(replica_schemas)
pfb_metadata_entity = avro_pfb.pfb_metadata_entity(replica_types, links=False)

def pfb_entities():
yield pfb_metadata_entity
Expand Down
3 changes: 2 additions & 1 deletion test/service/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2212,7 +2212,8 @@ def test_terra_pfb_schema(self):
self._assert_pfb_schema(schema)

def test_pfb_metadata_object(self):
metadata_entity = avro_pfb.pfb_metadata_entity(FileTransformer.field_types())
links = avro_pfb.pfb_links_from_field_types(FileTransformer.field_types())
metadata_entity = avro_pfb.pfb_metadata_entity(links)
field_types = FileTransformer.field_types()
schema = avro_pfb.pfb_schema_from_field_types(field_types)
parsed_schema = fastavro.parse_schema(cast(dict, schema))
Expand Down

0 comments on commit b308aa7

Please sign in to comment.