From 3254b250b4dcfd53d5fea15c7a07a2b07c6b5b42 Mon Sep 17 00:00:00 2001
From: Pingu Carsti
Date: Fri, 12 Apr 2024 13:58:10 +0200
Subject: [PATCH 1/8] added dashboard process

---
 parrot/processes/__init__.py      |   2 +
 parrot/processes/wps_dashboard.py | 102 ++++++++++++++++++++++++++++++
 parrot/query.py                   |  76 ++++++++++++++++++++++
 tests/test_wps_caps.py            |  12 ++--
 4 files changed, 186 insertions(+), 6 deletions(-)
 create mode 100644 parrot/processes/wps_dashboard.py
 create mode 100644 parrot/query.py

diff --git a/parrot/processes/__init__.py b/parrot/processes/__init__.py
index 437441e..7a2dd77 100644
--- a/parrot/processes/__init__.py
+++ b/parrot/processes/__init__.py
 from .wps_say_hello import SayHello
+from .wps_dashboard import Dashboard

 processes = [
     SayHello(),
+    Dashboard(),
 ]

diff --git a/parrot/processes/wps_dashboard.py b/parrot/processes/wps_dashboard.py
new file mode 100644
index 0000000..c2ea766
--- /dev/null
+++ b/parrot/processes/wps_dashboard.py
+from pathlib import Path
+
+from pywps import Process, LiteralInput, ComplexOutput, Format
+
+from parrot import query
+
+
+class Dashboard(Process):
+    def __init__(self):
+        inputs = [
+            LiteralInput(
+                "time",
+                "Time Period",
+                abstract="The time period for the report separated by /. "
+                "Example: 2023-09-01/2023-09-30",
+                data_type="string",
+                default="2023-09-01/2023-09-30",
+                min_occurs=0,
+                max_occurs=1,
+            ),
+        ]
+        outputs = [
+            ComplexOutput(
+                "report",
+                "Generated HTML Report",
+                as_reference=True,
+                supported_formats=[Format("text/html")],
+            ),
+        ]
+
+        super(Dashboard, self).__init__(
+            self._handler,
+            identifier="dashboard",
+            title="Generate HTML Report",
+            version="1.0",
+            abstract="Generate an HTML report from a provenance database.",
+            inputs=inputs,
+            outputs=outputs,
+            status_supported=True,
+            store_supported=True,
+        )
+
+    def _handler(self, request, response):
+        workdir = Path(self.workdir)
+        # input_csv = request.inputs['input_csv'][0].file
+
+        # Query the provenance database ... result is a Pandas DataFrame
+        df = query.query()
+
+        # Generate an HTML report from the DataFrame
+        html_report = self.write_html(df, workdir)
+
+        print(f"report: {html_report}")
+        response.outputs["report"].file = html_report
+        # response.outputs["report"].output_format = Format("text/html")
+
+        return response
+
+    def write_html(self, df, workdir):
+        # Convert the DataFrame to an HTML table
+        html_table = df.to_html(escape=False, index=False)
+
+        # Define the HTML template
+        html_template = f"""
+        <!DOCTYPE html>
+        <html>
+        <head>
+            <title>Provenance Report</title>
+        </head>
+        <body>
+            <h1>Provenance Report</h1>
+            {html_table}
+        </body>
+        </html>
+        """
+
+        # Write the HTML template to a file
+        outfile = workdir / "provenance_report.html"
+        with outfile.open(mode="w") as file:
+            file.write(html_template)
+
+        return outfile

diff --git a/parrot/query.py b/parrot/query.py
new file mode 100644
index 0000000..4876937
--- /dev/null
+++ b/parrot/query.py
+from duck.db import GraphDB
+import pandas as pd
+import json
+import yaml
+
+
+def display_image(base64_image):
+    # img_data = base64.b64decode(base64_image)
+    # img = Image.open(io.BytesIO(img_data))
+    return '<img src="data:image/png;base64,{}"/>'.format(base64_image)
+
+
+def display_json(data):
+    content = yaml.dump(data, default_flow_style=True, indent=2)
+    return f"<pre>{content}</pre>"
+
+
+def query():
+    query_str = """
+    SELECT ?process ?dataset ?variable ?startTime ?endTime ?input ?output ?info ?histogram
+    WHERE {
+        ?exec rdf:type provone:Execution ;
+              rdfs:label ?process ;
+              clint:dataset_name ?dataset ;
+              clint:variable_name ?variable ;
+              prov:startedAtTime ?startTime ;
+              prov:endedAtTime ?endTime ;
+              clint:info ?info ;
+              clint:histogram ?histogram .
+
+        ?input rdf:type prov:Entity .
+
+        ?output rdf:type prov:Entity ;
+                prov:qualifiedDerivation [ prov:entity ?input; prov:hadActivity ?exec ] .
+    }
+    """  # noqa
+    graph_db = GraphDB()
+    results = graph_db.query(query_str)
+
+    data = []
+    for row in results:
+        # print(row)
+        process = row.process.split("/")[-1]
+        dataset = row.dataset.value
+        variable = row.variable.value
+        start_time = row.startTime.value
+        end_time = row.endTime.value
+        input = row.input.split("/")[-1]
+        input = input.split("urn:clint:")[-1]
+        output = row.output.split("/")[-1]
+        output = output.split("urn:clint:")[-1]
+        # min = row.min.value
+        # max = row.max.value
+        # mean = row.mean.value
+        # stddev = row.stddev.value
+        info = json.loads(row.info.value)
+        histogram = row.histogram.value
+        entry = {
+            "Process": process,
+            "Dataset": dataset,
+            "Variable": variable,
+            "Start Time": start_time,
+            "End Time": end_time,
+            "Input": input,
+            "Output": output,
+            # "Min": min,
+            # "Max": max,
+            # "Mean": mean,
+            # "StdDev": stddev,
+            "Histogram": display_image(histogram),
+        }
+        for key in info:
+            entry[key] = display_json(info[key])
+        data.append(entry)
+    df = pd.DataFrame(data)
+    return df

diff --git a/tests/test_wps_caps.py b/tests/test_wps_caps.py
index 693bdbf..e5fdac7 100644
--- a/tests/test_wps_caps.py
+++ b/tests/test_wps_caps.py

 def test_wps_caps():
     client = client_for(Service(processes=processes))
-    resp = client.get(service='wps', request='getcapabilities', version='1.0.0')
-    names = resp.xpath_text('/wps:Capabilities'
-                            '/wps:ProcessOfferings'
-                            '/wps:Process'
-                            '/ows:Identifier')
+    resp = client.get(service="wps", request="getcapabilities", version="1.0.0")
+    names = resp.xpath_text(
+        "/wps:Capabilities" "/wps:ProcessOfferings" "/wps:Process" "/ows:Identifier"
+    )
     assert sorted(names.split()) == [
-        'hello',
+        "dashboard",
+        "hello",
     ]
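Note on PATCH 1: with this patch applied, the "dashboard" process is registered next to "hello" and can be triggered like any other PyWPS process. Below is a minimal smoke-test sketch, not part of the patch itself; the endpoint URL is an assumption (the usual PyWPS development default) and must be adjusted to the actual deployment:

    import requests

    # Hypothetical local endpoint; adjust host/port for your deployment.
    url = "http://localhost:5000/wps"
    params = {
        "service": "WPS",
        "version": "1.0.0",
        "request": "Execute",
        "identifier": "dashboard",
        # Optional input declared by the process: time period separated by "/".
        "datainputs": "time=2023-09-01/2023-09-30",
    }
    resp = requests.get(url, params=params)
    print(resp.status_code)
    print(resp.text)

Because the "report" output is declared with as_reference=True, the ExecuteResponse XML carries a link to the generated provenance_report.html rather than embedding the document itself.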
From 7f47f0f61135a52746b2ec2cfb162ea4e95d6e96 Mon Sep 17 00:00:00 2001
From: Pingu Carsti
Date: Fri, 12 Apr 2024 14:12:40 +0200
Subject: [PATCH 2/8] added missing dashboard code

---
 parrot/data_stats.py | 96 ++++++++++++++++++++++++++++++++++++++++++++
 parrot/db.py         | 41 +++++++++++++++++++
 parrot/query.py      |  2 +-
 3 files changed, 138 insertions(+), 1 deletion(-)
 create mode 100644 parrot/data_stats.py
 create mode 100644 parrot/db.py

diff --git a/parrot/data_stats.py b/parrot/data_stats.py
new file mode 100644
index 0000000..1530a25
--- /dev/null
+++ b/parrot/data_stats.py
+import xarray as xr
+import matplotlib.pyplot as plt
+import numpy as np
+import yaml
+import pathlib
+import io
+import base64
+
+
+def get_stats(data):
+    return {
+        "min": float(np.nanmin(data)),
+        "max": float(np.nanmax(data)),
+        "mean": float(np.nanmean(data)),
+        "std": float(np.nanstd(data)),
+    }
+
+
+class DataStats(object):
+    def __init__(self, output_dir):
+        if isinstance(output_dir, pathlib.Path):
+            self.output_dir = output_dir
+        else:
+            self.output_dir = pathlib.Path(output_dir)
+        self.info = None
+        self.histogram = None
+
+    def gen_data_stats(self, filename, var, nbins=100):
+        ds = xr.open_dataset(filename)
+
+        vstats = get_stats(ds[var].values)
+        bins = np.linspace(vstats["min"], vstats["max"], num=nbins + 1)
+
+        ntime, nlon, nlat = ds[var].shape
+        mratio = np.zeros(ntime)
+        hist = np.zeros((ntime, nbins))
+        for i in range(ntime):
+            a = ds[var].values[i]
+            idx = ~np.isnan(a)
+            mratio[i] = idx.sum()
+            a = np.histogram(a[idx], bins=bins)[0]
+            hist[i] = a / max(a)
+
+        mratio = 1 - mratio / (nlon * nlat)
+
+        # TODO: It would be great to store the distribution graph in a database
+        if True:
+            plt.close()
+            plt.imshow(
+                hist,
+                aspect="auto",
+                origin="lower",
+                extent=[vstats["min"], vstats["max"], 0, ntime],
+                cmap="gist_ncar",
+            )
+            ax = plt.gca()
+            ax.grid(color="gray", linestyle="-.", linewidth=1)
+            plt.xlabel(var)
+            plt.ylabel("Timesteps")
+            # outfile = self.output_dir / "histogram.png"
+            # print(f"histogram: {outfile}")
+            # plt.savefig(outfile.as_posix(), dpi=50)
+            # store as base64
+            # Save the plot to a BytesIO object
+            buffer = io.BytesIO()
+            plt.savefig(buffer, format="png")
+            buffer.seek(0)
+
+            # Encode the BytesIO object as base64
+            base64_encoded_plot = base64.b64encode(buffer.read()).decode("utf-8")
+            # print(f"{base64_encoded_plot}")
+            self.histogram = base64_encoded_plot
+            # close plot
+            plt.close()
+
+        # The following information should be stored in a database
+        attrs = {}
+        orig_attrs = dict(ds.attrs)
+        for key in orig_attrs:
+            value = orig_attrs[key]
+            if isinstance(value, str):
+                attrs[key] = value
+
+        self.info = {}
+        self.info["Attrs"] = attrs
+        self.info["Dims"] = dict(ds.dims)
+        self.info["Vars"] = list(dict(ds.variables).keys())
+        self.info["Vstats"] = vstats
+        self.info["Mstats"] = get_stats(mratio)
+        # print(self.info)
+
+    def write_json(self):
+        outfile = self.output_dir / "info.txt"
+        with open(outfile.as_posix(), "w") as f:
+            yaml.dump(self.info, f)
+        return outfile

diff --git a/parrot/db.py b/parrot/db.py
new file mode 100644
index 0000000..ad3ae5f
--- /dev/null
+++ b/parrot/db.py
+from rdflib import Graph, URIRef
+from rdflib.plugins.sparql import prepareQuery
+# from pywps import configuration
+
+# Provide the path to the SQLite database in the local folder
+DB_URL = "sqlite:////var/lib/pywps/db/provenance.sqlite"
+# DB_URL = "sqlite:////tmp/provenance.sqlite"
+# DB_URL = configuration.get_config_value('provenance', 'db_url')
+
+
+class GraphDB(object):
+    def __init__(self):
+        # Create a graph with a specific backend store
+        self.graph = Graph(
+            store="SQLAlchemy", identifier=URIRef("http://example.org/graph")
+        )
+        self.graph.open(DB_URL, create=True)
+
+    def add(self, data):
+        new_graph = Graph()
+        new_graph.parse(data=data, format="turtle")
+
+        # add rdf to existing graph
+        for triple in new_graph:
+            self.graph.add(triple)
+        # Commit changes to the store
+        self.graph.commit()
+
+    def query(self, query_str):
+        namespaces = {
+            "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
+            "foaf": "http://xmlns.com/foaf/0.1/",
+            "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
+            "prov": "http://www.w3.org/ns/prov#",
+            "provone": "http://purl.dataone.org/provone/2015/01/15/ontology#",
+            "dcterms": "http://purl.org/dc/terms/",
+            "clint": "urn:clint:",
+        }
+        query = prepareQuery(query_str, initNs=namespaces)
+        results = self.graph.query(query)
+        return results

diff --git a/parrot/query.py b/parrot/query.py
index 4876937..457637e 100644
--- a/parrot/query.py
+++ b/parrot/query.py
-from duck.db import GraphDB
+from parrot.db import GraphDB
 import pandas as pd
 import json
 import yaml
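Note on PATCH 2: the GraphDB wrapper is only consumed through query.query(). For interactive testing it could be driven directly, as sketched below; the Turtle record is made up for illustration, and the sketch assumes the SQLite store behind DB_URL exists and is writable:

    from parrot.db import GraphDB

    # Illustrative provenance snippet; the prefixes match the namespaces
    # bound in GraphDB.query().
    record = """
    @prefix prov: <http://www.w3.org/ns/prov#> .
    @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

    <urn:clint:exec1> a prov:Activity ;
        rdfs:label "demo-execution" .
    """

    db = GraphDB()
    db.add(record)  # parses the Turtle and commits the triples to the store

    # GraphDB.query() injects the namespace bindings, so no PREFIX
    # declarations are needed in the query string itself.
    for row in db.query("SELECT ?s ?label WHERE { ?s rdfs:label ?label }"):
        print(row.s, row.label)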
From 2ca8b815561e0a44b9fee3a37c00a6e42fcbf26c Mon Sep 17 00:00:00 2001
From: Pingu Carsti
Date: Fri, 12 Apr 2024 14:15:44 +0200
Subject: [PATCH 3/8] update reqs

---
 environment.yml  | 11 ++++++++++-
 requirements.txt | 10 +++++++++-
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/environment.yml b/environment.yml
index d43ebef..10d38c4 100644
--- a/environment.yml
+++ b/environment.yml
 channels:
 dependencies:
 - pip
 - python>=3.8,<3.12
-- pywps>=4.5.1,<4.6
+- pywps>=4.5.2,<4.7
 - jinja2
 - click
 - psutil
 # tests
 - pytest
+# provenance
+- prov>=2.0.0
+- pydot
+- graphviz
+- rdflib
+- rdflib-sqlalchemy
+- sqlalchemy<2
+- pandas
+- pyyaml

diff --git a/requirements.txt b/requirements.txt
index 01cdf08..ca3fd9b 100644
--- a/requirements.txt
+++ b/requirements.txt
 click
 jinja2
 psutil
-pywps>=4.5.1,<4.6
+pywps>=4.5.2,<4.7
+# provenance
+prov>=2.0.0
+pydot
+rdflib
+rdflib-sqlalchemy
+sqlalchemy<2
+pandas
+pyyaml

From c6f6dd081a62a236a06e9580b605ad1ad7ee73a5 Mon Sep 17 00:00:00 2001
From: Pingu Carsti
Date: Fri, 12 Apr 2024 14:33:47 +0200
Subject: [PATCH 4/8] skip python 3.7 and 3.8

---
 .github/workflows/main.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 9a93fda..998047f 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
+        python-version: ["3.9", "3.10", "3.11"]
     steps:
     - name: Checkout repository and submodules
       uses: actions/checkout@v4
       run: make test
     - name: Lint with flake8 ⚙️
       run: make lint
-      if: matrix.python-version == 3.7
+      if: matrix.python-version == 3.9
    - name: Build docs 🏗️
      run: make docs
-      if: matrix.python-version == 3.7
+      if: matrix.python-version == 3.9

From 6007145ab458c557b5bedeca729f5123ccb95220 Mon Sep 17 00:00:00 2001
From: Pingu Carsti
Date: Fri, 12 Apr 2024 14:38:46 +0200
Subject: [PATCH 5/8] removed unused module

---
 parrot/data_stats.py | 96 --------------------------------------------
 1 file changed, 96 deletions(-)
 delete mode 100644 parrot/data_stats.py

diff --git a/parrot/data_stats.py b/parrot/data_stats.py
deleted file mode 100644
index 1530a25..0000000
--- a/parrot/data_stats.py
+++ /dev/null
-import xarray as xr
-import matplotlib.pyplot as plt
-import numpy as np
-import yaml
-import pathlib
-import io
-import base64
-
-
-def get_stats(data):
-    return {
-        "min": float(np.nanmin(data)),
-        "max": float(np.nanmax(data)),
-        "mean": float(np.nanmean(data)),
-        "std": float(np.nanstd(data)),
-    }
-
-
-class DataStats(object):
-    def __init__(self, output_dir):
-        if isinstance(output_dir, pathlib.Path):
-            self.output_dir = output_dir
-        else:
-            self.output_dir = pathlib.Path(output_dir)
-        self.info = None
-        self.histogram = None
-
-    def gen_data_stats(self, filename, var, nbins=100):
-        ds = xr.open_dataset(filename)
-
-        vstats = get_stats(ds[var].values)
-        bins = np.linspace(vstats["min"], vstats["max"], num=nbins + 1)
-
-        ntime, nlon, nlat = ds[var].shape
-        mratio = np.zeros(ntime)
-        hist = np.zeros((ntime, nbins))
-        for i in range(ntime):
-            a = ds[var].values[i]
-            idx = ~np.isnan(a)
-            mratio[i] = idx.sum()
-            a = np.histogram(a[idx], bins=bins)[0]
-            hist[i] = a / max(a)
-
-        mratio = 1 - mratio / (nlon * nlat)
-
-        # TODO: It would be great to store the distribution graph in a database
-        if True:
-            plt.close()
-            plt.imshow(
-                hist,
-                aspect="auto",
-                origin="lower",
-                extent=[vstats["min"], vstats["max"], 0, ntime],
-                cmap="gist_ncar",
-            )
-            ax = plt.gca()
-            ax.grid(color="gray", linestyle="-.", linewidth=1)
-            plt.xlabel(var)
-            plt.ylabel("Timesteps")
-            # outfile = self.output_dir / "histogram.png"
-            # print(f"histogram: {outfile}")
-            # plt.savefig(outfile.as_posix(), dpi=50)
-            # store as base64
-            # Save the plot to a BytesIO object
-            buffer = io.BytesIO()
-            plt.savefig(buffer, format="png")
-            buffer.seek(0)
-
-            # Encode the BytesIO object as base64
-            base64_encoded_plot = base64.b64encode(buffer.read()).decode("utf-8")
-            # print(f"{base64_encoded_plot}")
-            self.histogram = base64_encoded_plot
-            # close plot
-            plt.close()
-
-        # The following information should be stored in a database
-        attrs = {}
-        orig_attrs = dict(ds.attrs)
-        for key in orig_attrs:
-            value = orig_attrs[key]
-            if isinstance(value, str):
-                attrs[key] = value
-
-        self.info = {}
-        self.info["Attrs"] = attrs
-        self.info["Dims"] = dict(ds.dims)
-        self.info["Vars"] = list(dict(ds.variables).keys())
-        self.info["Vstats"] = vstats
-        self.info["Mstats"] = get_stats(mratio)
-        # print(self.info)
-
-    def write_json(self):
-        outfile = self.output_dir / "info.txt"
-        with open(outfile.as_posix(), "w") as f:
-            yaml.dump(self.info, f)
-        return outfile

From 89ede0dabeee90e4e8cdf2afba19129d556a908a Mon Sep 17 00:00:00 2001
From: Pingu Carsti
Date: Fri, 12 Apr 2024 14:45:31 +0200
Subject: [PATCH 6/8] update db

---
 parrot/db.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/parrot/db.py b/parrot/db.py
index ad3ae5f..3868c28 100644
--- a/parrot/db.py
+++ b/parrot/db.py
 from rdflib import Graph, URIRef
 from rdflib.plugins.sparql import prepareQuery
-# from pywps import configuration
+
+from pywps import configuration

 # Provide the path to the SQLite database in the local folder
-DB_URL = "sqlite:////var/lib/pywps/db/provenance.sqlite"
-# DB_URL = "sqlite:////tmp/provenance.sqlite"
-# DB_URL = configuration.get_config_value('provenance', 'db_url')
+# DB_URL = configuration.get_config_value("provenance", "db_url")
+DB_URL = configuration.get_config_value("logging", "database")
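Note on PATCH 6: GraphDB now reuses the SQLAlchemy URL that PyWPS already keeps for its log store in the [logging] section of the service configuration. A sketch of how to inspect that value; the file name pywps.cfg and the URL in the comment are placeholders, not part of this series:

    from pywps import configuration

    # Load the service configuration; the provenance graph and the PyWPS
    # log store now share one SQLAlchemy URL, e.g. in pywps.cfg:
    #
    #   [logging]
    #   database = sqlite:////var/lib/pywps/db/provenance.sqlite
    configuration.load_configuration("pywps.cfg")
    print(configuration.get_config_value("logging", "database"))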
From 064d013efb0c22462990082f9500e091114032d3 Mon Sep 17 00:00:00 2001
From: Pingu Carsti
Date: Fri, 12 Apr 2024 14:47:20 +0200
Subject: [PATCH 7/8] update docs

---
 docs/source/conf.py | 54 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 41 insertions(+), 13 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 1f36816..e866f47 100755
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
     "sphinx.ext.todo",
     "sphinx.ext.viewcode",
     "IPython.sphinxext.ipython_console_highlighting",
-    "nbsphinx",
+    # "nbsphinx",
     "pywps.ext_autodoc",
 ]

 # List of all tested working mock imports from all birds so new birds can
 # inherit without having to test which work which do not.
 autodoc_mock_imports = [
-    "numpy", "xarray", "fiona", "rasterio", "shapely",
-    "osgeo", "geopandas", "pandas", "statsmodels",
-    "affine", "rasterstats", "spotpy", "matplotlib",
-    "scipy", "unidecode", "gdal", "sentry_sdk", "dask",
-    "numba", "parse", "siphon", "sklearn", "cftime",
-    "netCDF4", "bottleneck", "ocgis", "geotiff", "geos",
-    "hdf4", "hdf5", "zlib", "pyproj", "proj", "cartopy",
-    "scikit-learn", "cairo"
+    "numpy",
+    "xarray",
+    "fiona",
+    "rasterio",
+    "shapely",
+    "osgeo",
+    "geopandas",
+    "pandas",
+    "statsmodels",
+    "affine",
+    "rasterstats",
+    "spotpy",
+    "matplotlib",
+    "scipy",
+    "unidecode",
+    "gdal",
+    "sentry_sdk",
+    "dask",
+    "numba",
+    "parse",
+    "siphon",
+    "sklearn",
+    "cftime",
+    "netCDF4",
+    "bottleneck",
+    "ocgis",
+    "geotiff",
+    "geos",
+    "hdf4",
+    "hdf5",
+    "zlib",
+    "pyproj",
+    "proj",
+    "cartopy",
+    "scikit-learn",
+    "cairo",
 ]

 # Monkeypatch constant because the following are mock imports.
 # Only works if numpy is actually installed and at the same time being mocked.
-#import numpy
-#numpy.pi = 3.1416
+# import numpy
+# numpy.pi = 3.1416

 # We are using mock imports in readthedocs, so probably safer to not run the notebooks
-nbsphinx_execute = 'never'
+nbsphinx_execute = "never"

 # Add any paths that contain templates here, relative to this directory.
 templates_path = ["_templates"]

 todo_include_todos = False

 # Suppress "WARNING: unknown mimetype for ..." when building EPUB.
-suppress_warnings = ['epub.unknown_project_files']
+suppress_warnings = ["epub.unknown_project_files"]

 # Avoid "configuration.rst:4:duplicate label configuration, other instance in configuration.rst"
 autosectionlabel_prefix_document = True

From 5f345fa637e4f938c1bad3880b39383a94caef85 Mon Sep 17 00:00:00 2001
From: Pingu Carsti
Date: Fri, 12 Apr 2024 14:50:01 +0200
Subject: [PATCH 8/8] fix docs

---
 docs/source/index.rst               |  1 -
 docs/source/notebooks/example.ipynb | 48 -----------------------------
 docs/source/notebooks/index.rst     |  8 -----
 3 files changed, 57 deletions(-)
 delete mode 100644 docs/source/notebooks/example.ipynb
 delete mode 100644 docs/source/notebooks/index.rst

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 318de0c..aed6750 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
    installation
    configuration
-   notebooks/index
    dev_guide
    processes
    authors

diff --git a/docs/source/notebooks/example.ipynb b/docs/source/notebooks/example.ipynb
deleted file mode 100644
index a68d809..0000000
--- a/docs/source/notebooks/example.ipynb
+++ /dev/null
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Usage Example"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import parrot"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.2"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}

diff --git a/docs/source/notebooks/index.rst b/docs/source/notebooks/index.rst
deleted file mode 100644
index cdbc518..0000000
--- a/docs/source/notebooks/index.rst
+++ /dev/null
-========
-Examples
-========
-
-.. toctree::
-   :maxdepth: 1
-
-   example