102
102
from azul .deployment import (
103
103
aws ,
104
104
)
105
+ from azul .indexer import (
106
+ SourceSpec ,
107
+ )
105
108
from azul .indexer .document import (
106
109
DocumentType ,
107
110
FieldPath ,
@@ -2051,6 +2054,9 @@ class ReplicaKeys:
2051
2054
"""
2052
2055
hub_id : str
2053
2056
replica_ids : list [str ]
2057
+ # We need to track whether sources use a common prefix to avoid dangling
2058
+ # references in PFB relations
2059
+ source_spec : str
2054
2060
2055
2061
def _replica_keys (self ) -> Iterable [ReplicaKeys ]:
2056
2062
request = self ._create_request ()
@@ -2065,14 +2071,16 @@ def _replica_keys(self) -> Iterable[ReplicaKeys]:
2065
2071
for document_id in always_iterable (inner_entity ['document_id' ])
2066
2072
]
2067
2073
yield self .ReplicaKeys (hub_id = hit ['entity_id' ],
2068
- replica_ids = document_ids )
2074
+ replica_ids = document_ids ,
2075
+ source_spec = one (hit ['sources' ])['spec' ])
2069
2076
2070
- def _all_replicas (self ) -> Iterable [JSON ]:
2077
+ def _all_replicas (self ) -> Iterable [tuple [ JSON , set [ str ]] ]:
2071
2078
emitted_replica_ids = set ()
2072
2079
page_size = 100
2073
2080
for page in chunked (self ._replica_keys (), page_size ):
2074
2081
num_replicas = 0
2075
2082
num_new_replicas = 0
2083
+ sources = {keys .source_spec for keys in page }
2076
2084
for replica in self ._join_replicas (page ):
2077
2085
num_replicas += 1
2078
2086
# A single replica may have many hubs. To prevent replicas from
@@ -2081,7 +2089,7 @@ def _all_replicas(self) -> Iterable[JSON]:
2081
2089
replica_id = replica .meta .id
2082
2090
if replica_id not in emitted_replica_ids :
2083
2091
num_new_replicas += 1
2084
- yield replica .to_dict ()
2092
+ yield replica .to_dict (), sources
2085
2093
emitted_replica_ids .add (replica_id )
2086
2094
log .info ('Found %d replicas (%d already emitted) from page of %d hubs' ,
2087
2095
num_replicas , num_replicas - num_new_replicas , len (page ))
@@ -2119,7 +2127,7 @@ def create_file(self) -> tuple[str, str | None]:
2119
2127
fd , path = mkstemp (suffix = f'.{ self .file_name_extension ()} ' )
2120
2128
os .close (fd )
2121
2129
with open (path , 'w' ) as f :
2122
- for replica in self ._all_replicas ():
2130
+ for replica , _ in self ._all_replicas ():
2123
2131
entry = {
2124
2132
'value' : replica ['contents' ],
2125
2133
'type' : replica ['replica_type' ]
@@ -2143,7 +2151,14 @@ def file_name_extension(cls):
2143
2151
def format (cls ) -> ManifestFormat :
2144
2152
return ManifestFormat .verbatim_pfb
2145
2153
2146
- def _include_relations (self ) -> bool :
2154
+ @property
2155
+ def included_fields (self ) -> list [FieldPath ]:
2156
+ return [
2157
+ * super ().included_fields ,
2158
+ ('sources' ,)
2159
+ ]
2160
+
2161
+ def _include_relations (self , sources : Iterable [str ]) -> bool :
2147
2162
# Terra will reject the handover if the manifest includes
2148
2163
# dangling relations, i.e., if any entity references another
2149
2164
# entity that isn't included in the manifest. There are three
@@ -2171,14 +2186,17 @@ def _include_relations(self) -> bool:
2171
2186
# See https://github.com/DataBiosphere/azul/issues/4440
2172
2187
#
2173
2188
# (1) can only occur when orphans are included, and (2) and (3)
2174
- # can only occur when orphans are *not* included. Because (1)
2175
- # can only occur on lower deployments, we make the inclusion of
2176
- # relations conditional on avoiding (2) and (3).
2189
+ # can only occur when orphans are *not* included.
2177
2190
#
2178
- return config .enable_verbatim_relations and self .include_orphans
2191
+ complete = not any (
2192
+ SourceSpec .parse_prefix (source )[1 ].common
2193
+ for source in sources
2194
+ )
2195
+ return config .enable_verbatim_relations and self .include_orphans and complete
2179
2196
2180
2197
def create_file (self ) -> tuple [str , str | None ]:
2181
- replicas = list (self ._all_replicas ())
2198
+ replicas , sources = zip (* self ._all_replicas ())
2199
+ sources = set ().union (* sources )
2182
2200
plugin = self .metadata_plugin
2183
2201
replica_schemas = plugin .verbatim_pfb_schema (replicas )
2184
2202
# Ensure field order is consistent for unit tests
@@ -2192,10 +2210,11 @@ def create_file(self) -> tuple[str, str | None]:
2192
2210
2193
2211
def pfb_entities ():
2194
2212
yield pfb_metadata_entity
2213
+ include_relations = self ._include_relations (sources )
2195
2214
for replica in replicas :
2196
2215
id = plugin .verbatim_pfb_entity_id (replica )
2197
2216
entity = avro_pfb .PFBEntity .for_replica (id , dict (replica ))
2198
- if self . _include_relations () :
2217
+ if include_relations :
2199
2218
relations = plugin .verbatim_pfb_relations (replica )
2200
2219
entity_relations = [
2201
2220
PFBRelation (dst_name = replica_type , dst_id = entity_id )
0 commit comments