Skip to content

Commit 75d1241

Browse files
committed
First pass at extending provenance report history
#152
1 parent 9d4352e commit 75d1241

File tree

2 files changed

+166
-75
lines changed

2 files changed

+166
-75
lines changed

data_management/prov.py

+153-74
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,11 @@ def _add_author_agents(authors, doc, entity, reg_uri_prefix, vocab_namespaces):
114114
entity,
115115
author_agent,
116116
None,
117-
{PROV_ROLE: QualifiedName(
118-
vocab_namespaces[DCTERMS_VOCAB_PREFIX], 'creator')},
117+
{
118+
PROV_ROLE: QualifiedName(
119+
vocab_namespaces[DCTERMS_VOCAB_PREFIX], 'creator'
120+
)
121+
},
119122
)
120123

121124

@@ -148,7 +151,7 @@ def _add_code_repo_release(
148151
(
149152
(
150153
PROV_TYPE,
151-
QualifiedName(vocab_namespaces[DCMITYPE_VOCAB_PREFIX], 'Software')
154+
QualifiedName(vocab_namespaces[DCMITYPE_VOCAB_PREFIX], 'Software'),
152155
),
153156
*_generate_object_meta(code_repo, vocab_namespaces),
154157
(
@@ -352,7 +355,6 @@ def _add_linked_files(
352355
cr_activity,
353356
doc,
354357
dp_entity,
355-
dp_id,
356358
input_objects,
357359
object_components,
358360
reg_uri_prefix,
@@ -364,24 +366,29 @@ def _add_linked_files(
364366
@param cr_activity: a prov.activity representing the code run
365367
@param doc: a ProvDocument that the entities will belong to
366368
@param dp_entity: a prov.entity representing the data_product
367-
@param dp_id: the data_product id
368369
@param input_objects: boolean, 'True' if the object_components represent input
369370
objects
370371
@param object_components: a list of object_components from the ObjectComponent table
371372
@param reg_uri_prefix: a str containing the name of the prefix
372373
@param vocab_namespaces: a dict containing the Namespaces for the vocab
373374
375+
@return a list of data products that were added
376+
374377
"""
375378
for component in object_components:
376379
obj = component.object
377380
data_products = obj.data_products.all()
378381

379382
for data_product in data_products:
380-
if not input_objects and data_product.id == dp_id:
381-
# we have already added the original data product
383+
file_id = f'{reg_uri_prefix}:api/data_product/{data_product.id}'
384+
385+
entity = doc.get_record(file_id)
386+
# check to see if we have already created an entity for this data product
387+
if len(entity) > 0:
388+
# The prov documentation says a ProvRecord is returned, but actually a
389+
# list of ProvRecord is returned
382390
continue
383391

384-
file_id = f'{reg_uri_prefix}:api/data_product/{data_product.id}'
385392
file_entity = doc.entity(
386393
file_id,
387394
(
@@ -421,6 +428,8 @@ def _add_linked_files(
421428
# add the link to the code run
422429
doc.wasGeneratedBy(file_entity, cr_activity)
423430

431+
return data_products
432+
424433

425434
def _add_model_config(cr_activity, doc, model_config, reg_uri_prefix, vocab_namespaces):
426435
"""
@@ -458,6 +467,48 @@ def _add_model_config(cr_activity, doc, model_config, reg_uri_prefix, vocab_name
458467
)
459468

460469

470+
def _add_prime_data_product(doc, data_product, reg_uri_prefix, vocab_namespaces):
471+
"""
472+
Add the prime data product for this level of the provenance report.
473+
474+
@param doc: a ProvDocument that the entities will belong to
475+
@param data_product: The DataProduct to generate the PROV document for
476+
@param reg_uri_prefix: a str containing the name of the prefix
477+
@param vocab_namespaces: a dict containing the Namespaces for the vocab
478+
479+
@return the data product entity
480+
481+
"""
482+
data_product_id = f'{reg_uri_prefix}:api/data_product/{data_product.id}'
483+
entity = doc.get_record(data_product_id)
484+
# check to see if we have already created an entity for this data product
485+
if len(entity) > 0:
486+
# The prov documentation says a ProvRecord is returned, but actually a
487+
# list of ProvRecord is returned
488+
return entity[0]
489+
490+
# add the data product
491+
dp_entity = doc.entity(
492+
f'{reg_uri_prefix}:api/data_product/{data_product.id}',
493+
(
494+
(PROV_TYPE, QualifiedName(vocab_namespaces[DCAT_VOCAB_PREFIX], 'Dataset')),
495+
*_generate_object_meta(data_product.object, vocab_namespaces),
496+
),
497+
)
498+
499+
_add_author_agents(
500+
data_product.object.authors.all(),
501+
doc,
502+
dp_entity,
503+
reg_uri_prefix,
504+
vocab_namespaces,
505+
)
506+
507+
_add_external_object(doc, data_product, dp_entity, reg_uri_prefix, vocab_namespaces)
508+
509+
return dp_entity
510+
511+
461512
def _add_submission_script(
462513
cr_activity, doc, submission_script, reg_uri_prefix, vocab_namespaces
463514
):
@@ -476,9 +527,10 @@ def _add_submission_script(
476527
(
477528
(
478529
PROV_TYPE,
479-
QualifiedName(vocab_namespaces[DCMITYPE_VOCAB_PREFIX], 'Software')
530+
QualifiedName(vocab_namespaces[DCMITYPE_VOCAB_PREFIX], 'Software'),
480531
),
481-
*_generate_object_meta(submission_script, vocab_namespaces),),
532+
*_generate_object_meta(submission_script, vocab_namespaces),
533+
),
482534
)
483535

484536
_add_author_agents(
@@ -501,69 +553,26 @@ def _add_submission_script(
501553
)
502554

503555

504-
def get_whole_object_component(components):
505-
for component in components:
506-
if component.whole_object:
507-
return component
508-
509-
510-
def generate_prov_document(data_product, request):
556+
def _generate_prov_document(doc, data_product, reg_uri_prefix, vocab_namespaces):
511557
"""
512-
Generate a PROV document for a DataProduct detailing all the input and outputs and
513-
how they were generated.
558+
Add the next level to the provenance doc.
514559
515-
This uses the W3C PROV ontology (https://www.w3.org/TR/prov-o/).
560+
This takes a data product and generates its provenance. A list of input files
561+
are returned so they can be used to create the next level of the provenance if
562+
needed.
516563
517-
:param data_product: The DataProduct to generate the PROV document for
564+
@param doc: a ProvDocument that the entities will belong to
565+
@param data_product: The DataProduct to generate the PROV document for
566+
@param reg_uri_prefix: a str containing the name of the prefix
567+
@param vocab_namespaces: a dict containing the Namespaces for the vocab
518568
519-
:return: A PROV-O document
569+
@return a list of files that were used as input files, may be empty
520570
521571
"""
522-
url = request.build_absolute_uri('/')
523-
cenral_registry_url = settings.CENTRAL_REGISTRY_URL
524-
if not cenral_registry_url.endswith('/'):
525-
cenral_registry_url = f'{cenral_registry_url}/'
526-
527-
doc = prov.model.ProvDocument()
528-
529-
if url == cenral_registry_url:
530-
# we are using the main registry
531-
reg_uri_prefix = 'reg'
532-
doc.add_namespace(reg_uri_prefix, cenral_registry_url)
533-
else:
534-
# we are using a local registry
535-
reg_uri_prefix = 'lreg'
536-
doc.add_namespace(reg_uri_prefix, url)
537-
538-
# the vocab namespace is always the main registry
539-
doc.add_namespace(FAIR_VOCAB_PREFIX, f'{cenral_registry_url}vocab/#')
540-
# we need to tell SONAR to ignore 'http' in the vocab URLs
541-
doc.add_namespace(DCAT_VOCAB_PREFIX, 'http://www.w3.org/ns/dcat#') # NOSONAR
542-
doc.add_namespace(DCMITYPE_VOCAB_PREFIX, 'http://purl.org/dc/dcmitype/') # NOSONAR
543-
doc.add_namespace(DCTERMS_VOCAB_PREFIX, 'http://purl.org/dc/terms/') # NOSONAR
544-
doc.add_namespace(FOAF_VOCAB_PREFIX, 'http://xmlns.com/foaf/spec/#') # NOSONAR
545-
546-
vocab_namespaces = {}
547-
for namespace in doc.get_registered_namespaces():
548-
vocab_namespaces[namespace.prefix] = namespace
549-
550-
# add the data product
551-
dp_entity = doc.entity(
552-
f'{reg_uri_prefix}:api/data_product/{data_product.id}',
553-
(
554-
(PROV_TYPE, QualifiedName(vocab_namespaces[DCAT_VOCAB_PREFIX], 'Dataset')),
555-
*_generate_object_meta(data_product.object, vocab_namespaces),
556-
),
557-
)
558-
559-
_add_author_agents(
560-
data_product.object.authors.all(),
561-
doc,
562-
dp_entity,
563-
reg_uri_prefix,
564-
vocab_namespaces,
572+
# add the root data product
573+
dp_entity = _add_prime_data_product(
574+
doc, data_product, reg_uri_prefix, vocab_namespaces
565575
)
566-
_add_external_object(doc, data_product, dp_entity, reg_uri_prefix, vocab_namespaces)
567576

568577
# add the activity, i.e. the code run
569578
components = data_product.object.components.all()
@@ -572,11 +581,12 @@ def generate_prov_document(data_product, request):
572581
code_run = whole_object.outputs_of.all()[0]
573582
except IndexError:
574583
# there is no code run so we cannot add any more provenance data
575-
return doc
584+
return []
576585

577586
# add the code run, this is the central activity
578587
cr_activity = _add_code_run(
579-
dp_entity, doc, code_run, reg_uri_prefix, vocab_namespaces)
588+
dp_entity, doc, code_run, reg_uri_prefix, vocab_namespaces
589+
)
580590

581591
# add the code repo release
582592
if code_run.code_repo is not None:
@@ -596,11 +606,10 @@ def generate_prov_document(data_product, request):
596606
)
597607

598608
# add input files
599-
_add_linked_files(
609+
input_files = _add_linked_files(
600610
cr_activity,
601611
doc,
602612
dp_entity,
603-
None,
604613
True,
605614
code_run.inputs.all(),
606615
reg_uri_prefix,
@@ -612,19 +621,89 @@ def generate_prov_document(data_product, request):
612621
cr_activity,
613622
doc,
614623
dp_entity,
615-
data_product.id,
616624
False,
617625
code_run.outputs.all(),
618626
reg_uri_prefix,
619627
vocab_namespaces,
620628
)
621629

630+
return input_files
631+
632+
633+
def get_whole_object_component(components):
    """
    Find the component that represents the whole object.

    :param components: an iterable of components, each exposing a `whole_object`
        attribute

    :return: the first component whose `whole_object` is truthy, or None if there is
        no such component

    """
    return next(
        (component for component in components if component.whole_object),
        None,
    )
637+
638+
639+
def generate_prov_document(data_product, depth, request):
    """
    Generate a PROV document for a DataProduct detailing all the input and outputs and
    how they were generated.

    This uses the W3C PROV ontology (https://www.w3.org/TR/prov-o/).

    The first level (the data product and whatever produced it) is always added. Each
    further level expands the input files returned by the previous level. Expansion
    stops as soon as a level yields no input files, and a data product is only ever
    expanded once.

    :param data_product: The DataProduct to generate the PROV document for
    :param depth: The depth for the document. How many levels of code runs to include.
        A value of 1 or less gives a single level.
    :param request: A request object

    :return: A PROV-O document

    """
    url = request.build_absolute_uri('/')
    central_registry_url = settings.CENTRAL_REGISTRY_URL
    if not central_registry_url.endswith('/'):
        central_registry_url = f'{central_registry_url}/'

    doc = prov.model.ProvDocument()

    if url == central_registry_url:
        # we are using the main registry
        reg_uri_prefix = 'reg'
        doc.add_namespace(reg_uri_prefix, central_registry_url)
    else:
        # we are using a local registry
        reg_uri_prefix = 'lreg'
        doc.add_namespace(reg_uri_prefix, url)

    # the vocab namespace is always the main registry
    doc.add_namespace(FAIR_VOCAB_PREFIX, f'{central_registry_url}vocab/#')
    # we need to tell SONAR to ignore 'http' in the vocab URLs
    doc.add_namespace(DCAT_VOCAB_PREFIX, 'http://www.w3.org/ns/dcat#')  # NOSONAR
    doc.add_namespace(DCMITYPE_VOCAB_PREFIX, 'http://purl.org/dc/dcmitype/')  # NOSONAR
    doc.add_namespace(DCTERMS_VOCAB_PREFIX, 'http://purl.org/dc/terms/')  # NOSONAR
    doc.add_namespace(FOAF_VOCAB_PREFIX, 'http://xmlns.com/foaf/spec/#')  # NOSONAR

    vocab_namespaces = {
        namespace.prefix: namespace for namespace in doc.get_registered_namespaces()
    }

    # get the initial set of input files
    input_files = _generate_prov_document(
        doc, data_product, reg_uri_prefix, vocab_namespaces
    )

    # ids of the data products whose provenance has already been expanded; a file that
    # feeds several code runs (or a cyclic link) must not be expanded again
    expanded_ids = {data_product.id}

    # add extra layers to the report if requested by the user; `depth` comes from the
    # request, so stop as soon as there is nothing left to expand rather than counting
    # down an arbitrarily large number
    while depth > 1 and input_files:
        next_level_input_files = []

        for input_file in input_files:
            if input_file.id in expanded_ids:
                continue
            expanded_ids.add(input_file.id)

            next_level_input_files.extend(
                _generate_prov_document(
                    doc, input_file, reg_uri_prefix, vocab_namespaces
                )
            )

        # reset the input files for the next level
        input_files = next_level_input_files
        depth -= 1

    return doc
623704

624705

625-
def serialize_prov_document(
626-
doc, format_, aspect_ratio, dpi=None, show_attributes=True
627-
):
706+
def serialize_prov_document(doc, format_, aspect_ratio, dpi=None, show_attributes=True):
628707
"""
629708
Serialise a PROV document as either a JPEG or SVG image or an XML or PROV-N report.
630709

data_management/rest/views.py

+13-1
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,8 @@ class ProvReportView(views.APIView):
123123
124124
`dpi` (optional): A float used to define the dpi for the `JPEG` and `SVG` images
125125
126+
`depth` (optional): An integer used to determine how many code runs to include,
127+
the default is 1
126128
"""
127129
try:
128130
Dot(prog='dot').create()
@@ -137,7 +139,6 @@ class ProvReportView(views.APIView):
137139

138140
def get(self, request, pk):
139141
data_product = get_object_or_404(models.DataProduct, pk=pk)
140-
doc = generate_prov_document(data_product, request)
141142

142143
show_attributes = request.query_params.get('attributes', True)
143144
if show_attributes == "False":
@@ -150,12 +151,23 @@ def get(self, request, pk):
150151
except ValueError:
151152
aspect_ratio = default_aspect_ratio
152153

154+
default_depth = 1
155+
depth = request.query_params.get('depth', default_depth)
156+
try:
157+
depth = int(depth)
158+
except ValueError:
159+
depth = default_depth
160+
if depth < 1:
161+
depth = 1
162+
153163
dpi = request.query_params.get('dpi', None)
154164
try:
155165
dpi = float(dpi)
156166
except (TypeError, ValueError):
157167
dpi = None
158168

169+
doc = generate_prov_document(data_product, depth, request)
170+
159171
value = serialize_prov_document(
160172
doc,
161173
request.accepted_renderer.format,

0 commit comments

Comments
 (0)