From 6b511bd7bed9757647b2503f0f5b51eeb3879fe8 Mon Sep 17 00:00:00 2001 From: Kristen Thyng Date: Fri, 19 Jul 2024 14:22:16 -0500 Subject: [PATCH] Major update to intake v2 including docs and tests --- .readthedocs.yaml | 8 +- README.md | 142 ++++------- docs/api.rst | 6 +- docs/conf.py | 15 +- docs/environment.yml | 13 +- docs/examples.rst | 96 ------- docs/examples/wave-height.md | 60 +++-- docs/index.rst | 181 ++----------- docs/user_guide.rst | 164 ++++++++++++ docs/whats_new.md | 6 + intake_erddap/__init__.py | 6 +- intake_erddap/erddap.py | 140 ++++++----- intake_erddap/erddap_cat.py | 40 ++- intake_erddap/version.py | 9 +- setup.py | 4 +- tests/test_cache.py | 34 +-- tests/test_erddap_cat.py | 237 +++++++++--------- ...erddap_source.py => test_erddap_reader.py} | 75 +++--- 18 files changed, 579 insertions(+), 657 deletions(-) delete mode 100644 docs/examples.rst create mode 100644 docs/user_guide.rst create mode 100644 docs/whats_new.md rename tests/{test_erddap_source.py => test_erddap_reader.py} (66%) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 24ec291..8bad78f 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -12,10 +12,10 @@ build: # uncomment to build from this exact version of package # the downside is the version listed in the docs will be a dev version # if uncommenting this, comment out installing pypi version of package in docs/env file -# python: -# install: -# - method: pip -# path: ./ +python: + install: + - method: pip + path: ./ conda: environment: docs/environment.yml diff --git a/README.md b/README.md index 76e647f..65ed445 100644 --- a/README.md +++ b/README.md @@ -24,15 +24,13 @@ For changes prior to 2022-10-19, all contributions are Copyright James Munroe, s -Intake is a lightweight set of tools for loading and sharing data in data -science projects. Intake ERDDAP provides a set of integrations for ERDDAP. +Intake is a lightweight set of tools for loading and sharing data in data science projects. Intake ERDDAP provides a set of integrations for ERDDAP. -- Quickly identify all datasets from an ERDDAP service in a geographic region, - or containing certain variables. +- Quickly identify all datasets from an ERDDAP service in a geographic region, or containing certain variables. - Produce a pandas DataFrame for a given dataset or query. - Get an xarray Dataset for the Gridded datasets. -The Key features are: +The key features are: - Pandas DataFrames for any TableDAP dataset. - xarray Datasets for any GridDAP datasets. @@ -59,7 +57,7 @@ project is available on PyPI, so it can be installed using `pip` The following are prerequisites for a developer environment for this project: - [conda](https://docs.conda.io/en/latest/miniconda.html) -- (optional but highly recommended) [mamba](https://mamba.readthedocs.io/en/latest/) Hint: `conda install -c conda-forge mamba` +- (optional but highly recommended) [mamba](https://mamba.readthedocs.io/en/latest/). Hint: `conda install -c conda-forge mamba` Note: if `mamba` isn't installed, replace all instances of `mamba` in the following instructions with `conda`. @@ -83,126 +81,74 @@ Note: if `mamba` isn't installed, replace all instances of `mamba` in the follow pip install -e . ``` +Note that you need to install with `pip install .` once to get the `entry_points` correct too. 
## Examples

-To create an intake catalog for all of the ERDDAP's TableDAP offerings use:
+To create an `intake` catalog for all of the ERDDAP's TableDAP offerings use:

```python
-import intake
-catalog = intake.open_erddap_cat(
+import intake_erddap
+catalog = intake_erddap.ERDDAPCatalogReader(
    server="https://erddap.sensors.ioos.us/erddap"
-)
+).read()
```

-The catalog objects behave like a dictionary with the keys representing the
-dataset's unique identifier within ERDDAP, and the values being the
-`TableDAPSource` objects. To access a source object:
+The catalog objects behave like a dictionary, with the keys representing the dataset's unique identifier within ERDDAP and the values being the `TableDAPReader` objects. To access a Reader object (for a single dataset, in this case for dataset_id "aoos_204"):

```python
-source = catalog["datasetid"]
+dataset = catalog["aoos_204"]
```

-From the source object, a pandas DataFrame can be retrieved:
+From the reader object, a pandas DataFrame can be retrieved:

```python
-df = source.read()
+df = dataset.read()
+```
+
+Find the other available dataset_ids with:
+
+```python
+list(catalog)
```

Consider a case where you need to find all wind data near Florida:

```python
-import intake
+import intake_erddap
from datetime import datetime
bbox = (-87.84, 24.05, -77.11, 31.27)
-catalog = intake.open_erddap_cat(
+catalog = intake_erddap.ERDDAPCatalogReader(
    server="https://erddap.sensors.ioos.us/erddap",
    bbox=bbox,
+   query_type="intersection",
    start_time=datetime(2022, 1, 1),
    end_time=datetime(2023, 1, 1),
    standard_names=["wind_speed", "wind_from_direction"],
-)
+   variables=["wind_speed", "wind_from_direction"],
+).read()

-df = next(catalog.values()).read()
+dataset_id = list(catalog)[0]
+print(dataset_id)
+df = catalog[dataset_id].read()
```

+Using the `standard_names` input with `query_type="intersection"` searches for datasets that have both "wind_speed" and "wind_from_direction". Using the `variables` input subsequently narrows the dataset to only those columns, plus "time", "latitude", "longitude", and "z".

-|       | time (UTC)           | wind_speed (m.s-1) | wind_from_direction (degrees) |
-|-------|----------------------|--------------------|-------------------------------|
-| 0     | 2022-12-14T19:40:00Z | 7.0                | 140.0                         |
-| 1     | 2022-12-14T19:20:00Z | 7.0                | 120.0                         |
-| 2     | 2022-12-14T19:10:00Z | NaN                | NaN                           |
-| 3     | 2022-12-14T19:00:00Z | 9.0                | 130.0                         |
-| 4     | 2022-12-14T18:50:00Z | 9.0                | 130.0                         |
-| ...   | ...                  | ...                | ...                           |
-| 48296 | 2022-01-01T00:40:00Z | 4.0                | 120.0                         |
-| 48297 | 2022-01-01T00:30:00Z | 3.0                | 130.0                         |
-| 48298 | 2022-01-01T00:20:00Z | 4.0                | 120.0                         |
-| 48299 | 2022-01-01T00:10:00Z | 4.0                | 130.0                         |
-| 48300 | 2022-01-01T00:00:00Z | 4.0                | 130.0                         |
+```python + time (UTC) latitude (degrees_north) ... wind_speed (m.s-1) wind_from_direction (degrees) +0 2022-01-01T00:00:00Z 28.508 ... 3.6 126.0 +1 2022-01-01T00:10:00Z 28.508 ... 3.8 126.0 +2 2022-01-01T00:20:00Z 28.508 ... 3.6 124.0 +3 2022-01-01T00:30:00Z 28.508 ... 3.4 125.0 +4 2022-01-01T00:40:00Z 28.508 ... 3.5 124.0 +... ... ... ... ... ... +52524 2022-12-31T23:20:00Z 28.508 ... 5.9 176.0 +52525 2022-12-31T23:30:00Z 28.508 ... 6.8 177.0 +52526 2022-12-31T23:40:00Z 28.508 ... 7.2 175.0 +52527 2022-12-31T23:50:00Z 28.508 ... 7.4 169.0 +52528 2023-01-01T00:00:00Z 28.508 ... 8.1 171.0 + +[52529 rows x 6 columns] +``` diff --git a/docs/api.rst b/docs/api.rst index c831cda..ca57497 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -18,11 +18,11 @@ ------------------------ -.. autoclass:: intake_erddap.erddap.ERDDAPSource +.. autoclass:: intake_erddap.erddap.ERDDAPReader :members: get_client -.. autoclass:: intake_erddap.erddap.TableDAPSource +.. autoclass:: intake_erddap.erddap.TableDAPReader :members: read, read_partition, read_chunked -.. autoclass:: intake_erddap.erddap.GridDAPSource +.. autoclass:: intake_erddap.erddap.GridDAPReader :members: read_partition, read_chunked, to_dask, close diff --git a/docs/conf.py b/docs/conf.py index 3da3cfa..6e2e288 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -26,17 +26,17 @@ # -- Project information ----------------------------------------------------- project = "intake-erddap" -copyright = "Copyright 2022 Axiom Data Science, LLC" +copyright = "Copyright 2022-2024 Axiom Data Science, LLC" author = "Axiom Data Science, LLC" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # see https://pypi.org/project/setuptools-scm/ for details -from pkg_resources import get_distribution +from importlib.metadata import version as imversion -release = get_distribution("intake_erddap").version +release = imversion("intake_erddap") # for example take major/minor version = ".".join(release.split(".")[:2]) @@ -71,6 +71,11 @@ nb_execution_timeout = 120 + +# https://myst-nb.readthedocs.io/en/v0.9.0/use/execute.html +# jupyter_execute_notebooks = "off" +nb_execution_mode = "force" + # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] @@ -85,10 +90,10 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -#html_theme = "furo" +html_theme = "furo" # furo variables -html_title = "intake-axds documentation" +html_title = "intake-erddap documentation" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. 
They are copied after the builtin static files, diff --git a/docs/environment.yml b/docs/environment.yml index d01959c..971dfba 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -1,16 +1,16 @@ -name: docs +name: intake-erddap-docs channels: - conda-forge - nodefaults dependencies: - - python=3.9 + - python=3.11 # If your docs code examples depend on other packages add them here - numpy - dask - pandas - erddapy - panel - - intake + # - intake - intake-xarray>=0.6.1 - cf_pandas # These are needed for the docs themselves @@ -29,10 +29,11 @@ dependencies: - pip - recommonmark - pip: + - furo - git+https://github.com/intake/intake - - intake-parquet - - intake-xarray - - intake-erddap + # - intake-parquet + # - intake-xarray + # - intake-erddap # - "dask[complete]" - docrep<=0.2.7 - furo diff --git a/docs/examples.rst b/docs/examples.rst deleted file mode 100644 index 09cc0af..0000000 --- a/docs/examples.rst +++ /dev/null @@ -1,96 +0,0 @@ -Examples -======== - -.. toctree:: - :maxdepth: 2 - - examples/wave-height.md - -Querying --------- - -A catlaog can be generated by passing your desired query parameters directly -with the ``kwargs_search`` keyword argument. This object gets passed to -`erddappy `_ :: - - search = { - "min_lon": -180, - "max_lon": -156, - "min_lat": 50, - "max_lat": 66, - "min_time": "2021-04-01", - "max_time": "2021-04-02", - } - cat = intake.open_erddap_catalog(server_url, kwargs_search=search) - - -The same query can also be specified using the constructor keyword arguments:: - - cat = intake.open_erddap_catalog( - server=server_url, - bbox=(-180., 50., -156., 66.), - start_time=datetime(2021, 4, 1), - end_time=datetime(2021, 4, 2), - ) - -The catalog supports querying for datasets that contain a variable with a -particular -`CF Standard Name `_ -. Clients can specify the standard name queries with either the -``kwargs_search`` keyword argument, or the ``standard_names`` keyword argument:: - - cat = intake.open_erddap_catalog( - server=server_url, - kwargs_search={ - "standard_name": "air_temperature", - }, - ) - -or:: - - cat = intake.open_erddap_catalog( - server=server_url, - standard_names=["air_temperature"], - ) - -Multiple standard name values can be queries which will return all datasets -containing at least one of the queried standard names:: - - - cat = intake.open_erddap_catalog( - server=server_url, - standard_names=["air_temperature", "air_pressure"], - ) - -In cases where standard names are not sufficient, clients can query using the -variable name as it appears in ERDDAP:: - - cat = intake.open_erddap_catalog( - server=server_url, - variable_names=["Pair", "temp"], - ) - -Lastly, ERDDAP offers a plaintext search option. Clients can query for datasets -containing a plaintext search term:: - - cat = intake.open_erddap_catalog( - server=server_url, - search_for=["ioos", "aoos", "NOAA"], - ) - - -Querying with AND ------------------ - -Sometimes, clients may want to find only datasets that match all of the query -terms exactly. This can be achieved with the ``query_type`` keyword argument:: - - - cat = intake.open_erddap_catalog( - server=server_url, - standard_names=["air_temperature", "air_pressure"], - query_type="intersection", - ) - -This will return only datasets that have both ``air_temperature`` and -``air_pressure`` as standard names associated with variables. 
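For reference, here is roughly how the AND-style query from the deleted examples above looks under the new reader API introduced in this patch. This is a minimal sketch, not part of the patch itself: `ERDDAPCatalogReader`, `query_type`, and the dictionary-like catalog behavior are documented in `docs/user_guide.rst` below, the server URL is the IOOS one used elsewhere in these docs, and the actual results depend on the server's holdings.

```python
# Sketch: an "intersection" (logical AND) query with the intake v2 reader API.
import intake_erddap

cat = intake_erddap.ERDDAPCatalogReader(
    server="https://erddap.sensors.ioos.us/erddap",
    standard_names=["air_temperature", "air_pressure"],
    query_type="intersection",  # keep only datasets matching every query term
).read()

# The catalog behaves like a dictionary of dataset_id -> reader.
for dataset_id in list(cat)[:3]:
    df = cat[dataset_id].read()  # pandas DataFrame for one dataset
    print(dataset_id, df.shape)
```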
diff --git a/docs/examples/wave-height.md b/docs/examples/wave-height.md
index 6aac638..fc566b2 100644
--- a/docs/examples/wave-height.md
+++ b/docs/examples/wave-height.md
@@ -4,7 +4,7 @@ jupytext:
     extension: .md
     format_name: myst
     format_version: 0.13
-    jupytext_version: 1.14.0
+    jupytext_version: 1.16.3
 kernelspec:
   display_name: Python
   language: python
@@ -15,11 +15,10 @@ Example: Investigating Significant Wave Height - Southern California
 ====================================================================

 ```{code-cell} ipython3
----
-tags: [hide-cell]
----
+:tags: [hide-cell]
+
 import intake_erddap
-import intake
+# import intake

 import numpy as np
 import cartopy.crs as ccrs
@@ -37,24 +36,22 @@ def figure(*args, figsize=(18, 8), facecolor='white', **kwargs):

 Here's an example of finding _all_ stations that have significant wave height from the main IOOS ERDDAP server.

-
 ```{code-cell} ipython3
 server = 'https://erddap.sensors.ioos.us/erddap'
-cat = intake.open_erddap_cat(
+cat = intake_erddap.ERDDAPCatalogReader(
     server=server,
     standard_names=["sea_surface_wind_wave_significant_height"]
-)
+).read()
 ```

 ```{code-cell} ipython3
-df = pd.DataFrame([i.metadata for i in cat.values()])
+df = pd.DataFrame([cat[i].metadata for i in list(cat)])
 sub_df = df[['datasetID', 'minTime', 'maxTime', 'title']][:5]
 sub_df.style.set_table_attributes('class="dataframe docutils"').hide(axis="index")
 ```

 We can plot the locations of these stations on the globe.

-
 ```{code-cell} ipython3
 fig, ax = figure(subplot_kw=dict(projection=ccrs.PlateCarree()))
 ax.coastlines()
@@ -77,21 +74,24 @@ ax.add_geometries([box], facecolor='red', alpha=0.4, crs=ccrs.PlateCarree())
 ax.set_extent([-130., -60., 20., 45.], crs=ccrs.PlateCarree())
 ```

-We can pass this bounding box directly to the ERDDAP Catalog constructor, as well as limit our query only to stations that contain data after 2014:
+We can pass this bounding box directly to the ERDDAP Catalog constructor, as well as limit our query to stations that contain data from 2014 through 2017. We will also limit the data returned to the variable we are searching for (through the `variables` keyword), plus the basic variables (time, longitude, latitude, and depth):

 ```{code-cell} ipython3
-cat = intake.open_erddap_cat(
+cat = intake_erddap.ERDDAPCatalogReader(
     server=server,
     bbox=bbox,
     start_time=datetime(2014, 1, 1),
-    standard_names=["sea_surface_wind_wave_significant_height"]
-)
+    end_time=datetime(2018, 1, 1),
+    standard_names=["sea_surface_wave_significant_height"],
+    variables=["sea_surface_wave_significant_height"],
+    dropna=True,
+).read()
 len(cat)
 ```

 ```{code-cell} ipython3
-df = pd.DataFrame([i.metadata for i in cat.values()])
+df = pd.DataFrame([cat[i].metadata for i in list(cat)])
 sub_df = df[['datasetID', 'minTime', 'maxTime', 'title']]
 sub_df.style.set_table_attributes('class="dataframe docutils"').hide(axis="index")
 ```
@@ -108,23 +108,29 @@ ax.scatter(df['minLongitude'], df['minLatitude'])
 ax.set_title("Station Locations")
 ```

-We can now interrogate each of those stations and get a timeseries for the significant wave height data.
+We can now interrogate each of those stations and get a timeseries for the significant wave height data. We'll use the first four that contain wave height data.
+ ```{code-cell} ipython3 -# Just get 4 -stations = list(cat)[:4] +# Just get 4 that aren't empty +stations = {} +for dataset_id in list(cat): + df = cat[dataset_id].read() + if len(df) > 0: + stations[dataset_id] = df + if len(stations) == 4: + break +``` -fig, axs = figure(nrows=len(stations), figsize=(18,18)) +```{code-cell} ipython3 -for i, dataset_id in enumerate(stations): +fig, axs = figure(nrows=len(stations), figsize=(15,10), sharex=True, sharey=True) + +for i, (dataset_id, df) in enumerate(stations.items()): ax = axs[i] - source = cat[dataset_id] - df = source.read() - t = df['time (UTC)'].astype('M8[s]') - sig_wave_height = df['sea_surface_wave_significant_height (m)'] - ax.plot(t, sig_wave_height) - ax.set_title(f'{dataset_id} Significant Wave Height (m)') - ax.set_xlim(np.datetime64('2014-01-01'), np.datetime64('2022-12-01')) + df.plot(ax=ax, x='time (UTC)', y='sea_surface_wave_significant_height (m)', fontsize=14, rot=30, + title=f'{dataset_id} Significant Wave Height (m)', legend=False, xlabel="") ax.grid() + fig.tight_layout(pad=1) ``` diff --git a/docs/index.rst b/docs/index.rst index 6409ec0..af2a4eb 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -6,12 +6,6 @@ Welcome to intake-erddap's documentation! ========================================= -.. toctree:: - :maxdepth: 2 - - examples - API - GitHub repository Intake ERDDAP ============= @@ -24,171 +18,34 @@ science projects. Intake ERDDAP provides a set of integrations for ERDDAP. - Produce a pandas DataFrame for a given dataset or query. - Get an xarray Dataset for the Gridded datasets. +The key features are: -.. image:: https://img.shields.io/github/actions/workflow/status/axiom-data-science/intake-erddap/test.yaml?branch=main&logo=github&style=for-the-badge - :alt: Build Status - -.. image:: https://img.shields.io/codecov/c/github/axiom-data-science/intake-erddap.svg?style=for-the-badge - :alt: Code Coverage - -.. image:: https://img.shields.io/badge/License-BSD--2%20Clause-blue.svg?style=for-the-badge - :alt: License:BSD - -.. image:: https://img.shields.io/github/actions/workflow/status/axiom-data-science/intake-erddap/linting.yaml?branch=main&label=Code%20Style&style=for-the-badge - :alt: Code Style Status - -The project is available on `Github `_. - - -TODO: Summary - -The Key features are: - - - Pandas DataFrames for any TableDAP dataset. - - xarray Datasets for any GridDAP datasets. - - Query by any or all: - - bounding box - - time - - CF ``standard_name`` - - variable name - - Plaintext Search term - - Save catalogs locally for future use. - - -Requirements ------------- - -- Python >= 3.8 +- Pandas DataFrames for any TableDAP dataset. +- xarray Datasets for any GridDAP datasets. +- Query by any or all: + - bounding box + - time + - CF ``standard_name`` + - variable name + - Plaintext Search term +- Save catalogs locally for future use. Installation ------------ -In the very near future, we will be offering the project on conda. Currently the -project is available on PyPI, so it can be installed using ``pip``:: +The project is available on PyPI, so it can be installed using ``pip``:: pip install intake-erddap -Examples --------- - -To create an intake catalog for all of the ERDDAP's TableDAP offerings use:: - - import intake - catalog = intake.open_erddap_cat( - server="https://erddap.sensors.ioos.us/erddap" - ) - - -The catalog objects behave like a dictionary with the keys representing the -dataset's unique identifier within ERDDAP, and the values being the -``TableDAPSource`` objects. 
To access a source object::
-
-    source = catalog["datasetid"]
-
-From the source object, a pandas DataFrame can be retrieved::
-
-    df = source.read()
-
-Scenarios
----------
-
-Consider a case where you need to find all wind data near Florida.::
-
-    import intake
-    from datetime import datetime
-    bbox = (-87.84, 24.05, -77.11, 31.27)
-    catalog = intake.open_erddap_cat(
-        server="https://erddap.sensors.ioos.us/erddap",
-        bbox=bbox,
-        start_time=datetime(2022, 1, 1),
-        end_time=datetime(2023, 1, 1),
-        standard_names=["wind_speed", "wind_from_direction"],
-    )
-
-    df = next(catalog.values()).read()
-
-
-.. raw:: html
-
-          time (UTC)            wind_speed (m.s-1)  wind_from_direction (degrees)
-   0      2022-12-14T19:40:00Z  7.0                 140.0
-   1      2022-12-14T19:20:00Z  7.0                 120.0
-   2      2022-12-14T19:10:00Z  NaN                 NaN
-   3      2022-12-14T19:00:00Z  9.0                 130.0
-   4      2022-12-14T18:50:00Z  9.0                 130.0
-   ...    ...                   ...                 ...
-   48296  2022-01-01T00:40:00Z  4.0                 120.0
-   48297  2022-01-01T00:30:00Z  3.0                 130.0
-   48298  2022-01-01T00:20:00Z  4.0                 120.0
-   48299  2022-01-01T00:10:00Z  4.0                 130.0
-   48300  2022-01-01T00:00:00Z  4.0                 130.0
+.. toctree::
+   :maxdepth: 3
+   :hidden:
+
+   user_guide
+   API
+   whats_new
+   GitHub repository

 Indices and tables

diff --git a/docs/user_guide.rst b/docs/user_guide.rst
new file mode 100644
index 0000000..8b9e954
--- /dev/null
+++ b/docs/user_guide.rst
@@ -0,0 +1,164 @@
+User Guide
+==========
+
+.. toctree::
+   :maxdepth: 2
+
+   examples/wave-height.md
+
+Querying
+--------
+
+A catalog can be generated by passing your desired query parameters directly
+with the ``kwargs_search`` keyword argument. This object gets passed to
+`erddapy `_::
+
+    import intake_erddap
+
+    search = {
+        "min_lon": -180,
+        "max_lon": -156,
+        "min_lat": 50,
+        "max_lat": 66,
+        "min_time": "2021-04-01",
+        "max_time": "2021-04-02",
+    }
+    cat = intake_erddap.ERDDAPCatalogReader(server_url, kwargs_search=search)
+
+
+The same query can also be specified using the constructor keyword arguments::
+
+    cat = intake_erddap.ERDDAPCatalogReader(
+        server=server_url,
+        bbox=(-180., 50., -156., 66.),
+        start_time=datetime(2021, 4, 1),
+        end_time=datetime(2021, 4, 2),
+    )
+
+The catalog supports querying for datasets that contain a variable with a
+particular `CF Standard Name `_. Clients can specify the standard name queries
+with either the ``kwargs_search`` keyword argument, or the ``standard_names``
+keyword argument::
+
+    cat = intake_erddap.ERDDAPCatalogReader(
+        server=server_url,
+        kwargs_search={
+            "standard_name": "air_temperature",
+        },
+    )
+
+or::
+
+    cat = intake_erddap.ERDDAPCatalogReader(
+        server=server_url,
+        standard_names=["air_temperature"],
+    )
+
+Multiple standard name values can be queried, which will return all datasets
+containing at least one of the queried standard names::
+
+    cat = intake_erddap.ERDDAPCatalogReader(
+        server=server_url,
+        standard_names=["air_temperature", "air_pressure"],
+    )
+
+In cases where standard names are not sufficient, clients can query using the
+variable name as it appears in ERDDAP::
+
+    cat = intake_erddap.ERDDAPCatalogReader(
+        server=server_url,
+        variable_names=["Pair", "temp"],
+    )
+
+Lastly, ERDDAP offers a plaintext search option. Clients can query for datasets
+containing a plaintext search term::
+
+    cat = intake_erddap.ERDDAPCatalogReader(
+        server=server_url,
+        search_for=["ioos", "aoos", "NOAA"],
+    )
+
+This can also be useful if you know the name of the station or stations you want
+to make a catalog from::
+
+    cat = intake_erddap.ERDDAPCatalogReader(
+        server=server_url,
+        search_for=["aoos_204"],
+    )
+
+Querying with AND
+-----------------
+
+Sometimes, clients may want to find only datasets that match all of the query
+terms exactly. This can be achieved with the ``query_type`` keyword argument::
+
+    cat = intake_erddap.ERDDAPCatalogReader(
+        server=server_url,
+        standard_names=["air_temperature", "air_pressure"],
+        query_type="intersection",
+    )
+
+This will return only datasets that have both ``air_temperature`` and
+``air_pressure`` as standard names associated with variables.
+
+
+Constraints
+-----------
+
+Use the input option `use_source_constraints=True` to apply any relevant parameter
+from ``kwargs_search`` as a constraint in the query.
For example, this will pass `start_time` on
+so that the data returned is limited to times at or after the `start_time`::

+    cat = intake_erddap.ERDDAPCatalogReader(
+        server=server_url,
+        bbox=(-180., 50., -156., 66.),
+        start_time=datetime(2021, 4, 1),
+        end_time=datetime(2021, 4, 2),
+        use_source_constraints=True,
+    )
+
+Dropping bad values
+-------------------
+
+Use the `dropna` option to drop rows with NaN values in the data columns::
+
+    cat = intake_erddap.ERDDAPCatalogReader(
+        server=server_url,
+        dropna=True,
+    )
+
+Note that this is an alpha feature: it uses its own logic to identify which columns are data columns, as opposed to coordinates and axes, when deciding which columns to drop NaN values from. This has not been thoroughly tested.
+
+
+Selecting which columns of data to return
+-----------------------------------------
+
+Use the `variables` option to select which columns of data to return. This is useful when you only need a subset of the data columns::
+
+    cat = intake_erddap.ERDDAPCatalogReader(
+        server=server_url,
+        variables=["sea_water_temperature"],
+    )
+
+The variables `time`, `latitude`, `longitude`, and `z` are always additionally returned.
+
+
+Mask due to quality flags
+-------------------------
+
+If `mask_failed_qartod=True` and `*_qc_agg` columns associated with the data columns are available, data values associated with QARTOD flags other than 1 and 2 will be set to NaN. This has not been thoroughly tested.
+
+
+Simple caching
+--------------
+
+You can use simple caching through `fsspec` by inputting `cache_kwargs`, such as the following::
+
+    cat = intake_erddap.ERDDAPCatalogReader(
+        server=server_url,
+        cache_kwargs=dict(cache_storage="/tmp/fnames/", same_names=True),
+    )
+
+This would have the effect of caching the data locally in the `/tmp/fnames/` directory so it doesn't have to be downloaded next time. The `same_names` option is useful if you want to cache the data under the same name as the data source, for clarity.
\ No newline at end of file
diff --git a/docs/whats_new.md b/docs/whats_new.md
new file mode 100644
index 0000000..4125ea8
--- /dev/null
+++ b/docs/whats_new.md
@@ -0,0 +1,6 @@
+# What's New
+
+## v0.5.0 (July 19, 2024)
+* Major changes across the codebase to update to intake v2! Also updated class names, tests, and docs.
+* Now can choose variables to narrow results to.
+* Fixed some bugs.
\ No newline at end of file
diff --git a/intake_erddap/__init__.py b/intake_erddap/__init__.py
index c6deb7f..190f66d 100644
--- a/intake_erddap/__init__.py
+++ b/intake_erddap/__init__.py
@@ -1,14 +1,14 @@
 """intake-erddap package."""
 import intake

-from .erddap import GridDAPSource, TableDAPSource
+from .erddap import GridDAPReader, TableDAPReader
 from .erddap_cat import ERDDAPCatalogReader
 from .version import __version__


 __all__ = [
     "ERDDAPCatalogReader",
-    "TableDAPSource",
-    "GridDAPSource",
+    "TableDAPReader",
+    "GridDAPReader",
     "__version__",
 ]
diff --git a/intake_erddap/erddap.py b/intake_erddap/erddap.py
index 3cdff24..68b453c 100644
--- a/intake_erddap/erddap.py
+++ b/intake_erddap/erddap.py
@@ -1,4 +1,4 @@
-"""Source implementations for intake-erddap."""
+"""Reader implementations for intake-erddap."""
 import typing

 from logging import getLogger
@@ -20,11 +20,11 @@
 log = getLogger("intake-erddap")


-class ERDDAPSource(BaseReader):
+class ERDDAPReader(BaseReader):
     """
-    ERDDAP Source (Base Class). This class represents the abstract base class
-    for an intake data source object for ERDDAP.
Clients should use either
-    ``TableDAPSource`` or ``GridDAPSource``.
+    ERDDAP Reader (Base Class). This class represents the abstract base class
+    for an intake data reader object for ERDDAP. Clients should use either
+    ``TableDAPReader`` or ``GridDAPReader``.

     Parameters
     ----------
@@ -64,8 +64,8 @@ def get_client(self, server, protocol, dataset_id, variables, constraints, clien
         return e


-class TableDAPSource(ERDDAPSource):
-    """Creates a Data Source for an ERDDAP TableDAP Dataset.
+class TableDAPReader(ERDDAPReader):
+    """Creates a Data Reader for an ERDDAP TableDAP Dataset.

     Parameters
     ----------
@@ -83,14 +83,14 @@ class TableDAPReader(ERDDAPReader):
         A mapping of conditions and constraints. Example:
         ``{"time>=": "2022-01-02T12:00:00Z", "lon>": -140, "lon<": 0}``
     metadata : dict, optional
-        Additional metadata to include with the source passed from the catalog.
+        Additional metadata to include with the reader passed from the catalog.
     erddap_client : type, optional
         A class that implements an interface like erdappy's ERDDAP class. The
-        source will rely on this client to interface with ERDDAP for most
+        reader will rely on this client to interface with ERDDAP for most
         requests.
     http_client : module or object, optional
         An object or module that implements an HTTP Client similar to request's
-        interface. The source will use this object to make HTTP requests to
+        interface. The reader will use this object to make HTTP requests to
         ERDDAP in some cases.
     mask_failed_qartod : bool, False
         WARNING ALPHA FEATURE. If True and `*_qc_agg` columns associated with
@@ -107,19 +107,19 @@ class TableDAPReader(ERDDAPReader):

     Examples
     --------
-    Sources are normally returned from a catalog object, but a source can be instantiated directly:
+    Readers are normally returned from a catalog object, but a Reader can be instantiated directly:

-    >>> source = TableDAPSource("https://erddap.senors.axds.co/erddap",
+    >>> reader = TableDAPReader("https://erddap.sensors.axds.co/erddap",
     ...     "gov_usgs_waterdata_441759103261203")

-    Getting a pandas DataFrame from the source:
+    Getting a pandas DataFrame from the reader:

-    >>> ds = source.read()
+    >>> ds = reader.read()

     Once the dataset object has been instantiated, the dataset's full metadata
-    is available in the source.
+    is available in the reader.

-    >>> source.metadata
+    >>> reader.metadata
     {'info_url': 'https://erddap.sensors.axds.co/erddap/info/gov_usgs_waterdata_404513098181201...',
     'catalog_dir': '',
     'variables': {'time': {'_CoordinateAxisType': 'Time',
@@ -134,26 +134,39 @@ class TableDAPReader(ERDDAPReader):
     """
     output_instance = "pandas:DataFrame"

-    def _read(self, server, dataset_id, mask_failed_qartod=False, dropna=False, cache_kwargs=None,
-        constraints=None, **kw):
+    def _read(self, server, dataset_id, variables=None, mask_failed_qartod=False, dropna=False, cache_kwargs=None,
+        open_kwargs=None, constraints=None, **kw):
+        open_kwargs = open_kwargs or {}
+        variables = variables or []
         kw.pop("protocol", None)
         protocol = kw.pop("protocol", "tabledap")
+
+        # check for variables in user-input list that are not available for the dataset
         meta2 = self._get_dataset_metadata(server, dataset_id)
-        e = self.get_client(server, protocol, dataset_id, variables=meta2["variables"],
+        variables_diff = set(variables) - set(meta2["variables"].keys())
+        if len(variables_diff) > 0:
+            variables = [var for var in variables if var not in variables_diff]
+
+        e = self.get_client(server, protocol, dataset_id, variables=variables,
                             constraints=constraints or {}, **kw)
         if cache_kwargs is not None:
-            if "response" in self.open_kwargs:
-                response = self.open_kwargs["response"]
-                self.open_kwargs.pop("response")
+            if "response" in open_kwargs:
+                response = open_kwargs["response"]
+                open_kwargs.pop("response")
                 url = e.get_download_url(response=response)
             else:
-                url = e.get_download_url(response=response)
+                url = e.get_download_url(response="csvp")  # should this be the default or csv?

-            with fsspec.open(f"simplecache://::{url}", **(cache_kwargs or {})) as f:
-                dataframe: pd.DataFrame = pd.read_csv(f)
+            try:
+                with fsspec.open(f"simplecache://::{url}", **(cache_kwargs or {})) as f:
+                    dataframe: pd.DataFrame = pd.read_csv(f, **open_kwargs)
+            except OSError as err:  # cached file name might be too long
+                print(err)
+                print("If your filenames are too long, input only a few variables "
+                      "to return, or set `same_names=False` in cache_kwargs")
+                raise
         else:
             dataframe: pd.DataFrame = e.to_pandas(
-                requests_kwargs={"timeout": 60}
+                requests_kwargs={"timeout": 60}, **open_kwargs
             )
         if mask_failed_qartod:
             dataframe = self.run_mask_failed_qartod(dataframe)
@@ -188,15 +201,13 @@ def run_mask_failed_qartod(self, df):
         for datacol in self.data_cols(df):
             qccol = f"{datacol}_qc_agg"
             if qccol in df.columns:
-                df.loc[
-                    ~self._dataframe[qccol].isin([1, 2]), datacol
-                ] = pd.NA
+                df.loc[~df[qccol].isin([1, 2]), datacol] = pd.NA
                 df.drop(columns=[qccol], inplace=True)
         return df

     def run_dropna(self, df):
         """Drop nan rows based on the data columns."""
-        return df.dropna(subset=self.data_cols)
+        return df.dropna(subset=self.data_cols(df))

     def _get_dataset_metadata(self, server, dataset_id) -> dict:
         """Fetch and return the metadata document for the dataset."""
@@ -239,8 +250,8 @@ def _parse_metadata_value(
         return newvalue


-class GridDAPSource(ERDDAPSource):
-    """Creates a Data Source for an ERDDAP GridDAP Dataset.
+class GridDAPReader(ERDDAPReader):
+    """Creates a Data Reader for an ERDDAP GridDAP Dataset.
Parameters
     ----------
@@ -267,19 +278,19 @@ class GridDAPSource(ERDDAPSource):

     Examples
     --------
-    Sources are normally returned from a catalog object, but a source can be instantiated directly:
+    Readers are normally returned from a catalog object, but a reader can be instantiated directly:

-    >>> source = GridDAPSource("https://coastwatch.pfeg.noaa.gov/erddap", "charmForecast1day",
+    >>> reader = GridDAPReader("https://coastwatch.pfeg.noaa.gov/erddap", "charmForecast1day",
     ...     chunks={"time": 1})

-    Getting an xarray dataset from the source object:
+    Getting an xarray dataset from the reader object:

-    >>> ds = source.to_dask()
+    >>> ds = reader.read()

     Once the dataset object has been instantiated, the dataset's full metadata
-    is available in the source.
+    is available in the reader.

-    >>> source.metadata
+    >>> reader.metadata
     {'catalog_dir': '',
     'dims': {'time': 1182, 'latitude': 391, 'longitude': 351},
     'data_vars': {'pseudo_nitzschia': ['time', 'latitude', 'longitude'],
...
     'acknowledgement':
...

-    Warning
-    -------
-    The ``read()`` method will raise a ``NotImplemented`` exception because the
-    standard intake interface has the result read entirely into memory. For
-    gridded datasets this should not be allowed, reading the entire dataset into
-    memory can overwhelm the server, get the client blacklisted, and potentially
-    crash the client by exhausting available system memory. If a client truly
-    wants to load the entire dataset into memory, the client can invoke the
-    method ``ds.load()`` on the Dataset object.
     """

-    def __init__(
-        self,
+    # def __init__(
+    #     self,
+    #     server: str,
+    #     dataset_id: str,
+    #     constraints: dict = None,
+    #     chunks: Union[None, int, dict, str] = None,
+    #     xarray_kwargs: dict = None,
+    #     **kwargs,
+    # ):
+    #     self._server = server
+    #     self._chunks = chunks
+    #     self._constraints = constraints or {}
+    #     self._xarray_kwargs = xarray_kwargs or {}
+    #     # Initialized by the private getter _get_schema
+    #     self.urlpath = f"{server}/griddap/{dataset_id}"
+    #     # https://github.com/python/mypy/issues/6799
+    #     kwargs.pop("protocol", None)
+    #     super().__init__(dataset_id=dataset_id, protocol="griddap", **kwargs)  # type: ignore
+
+    def _read(self,
+        server: str,
         dataset_id: str,
         constraints: dict = None,
         chunks: Union[None, int, dict, str] = None,
         xarray_kwargs: dict = None,
-        **kwargs,
-    ):
-        self._server = server
-        self._chunks = chunks
-        self._constraints = constraints or {}
-        self._xarray_kwargs = xarray_kwargs or {}
-        # Initialized by the private getter _get_schema
-        self.urlpath = f"{server}/griddap/{dataset_id}"
-        # https://github.com/python/mypy/issues/6799
-        kwargs.pop("protocol", None)
-        super().__init__(dataset_id=dataset_id, protocol="griddap", **kwargs)  # type: ignore
-
-    def _read(self):
+        **kw,
+    ):
+        constraints = constraints or {}
+        chunks = chunks or {}
+        xarray_kwargs = xarray_kwargs or {}
+        urlpath = f"{server}/griddap/{dataset_id}"
+
         ds = xr.open_dataset(
-            self.urlpath, chunks=self._chunks, **self._xarray_kwargs
+            urlpath, chunks=chunks, **xarray_kwargs
         )
         # _NCProperties is an internal property which xarray does not yet deal
         # with specially, so we remove it here to prevent it from causing
diff --git a/intake_erddap/erddap_cat.py b/intake_erddap/erddap_cat.py
index a311f5a..eb4a5f8 100644
--- a/intake_erddap/erddap_cat.py
+++ b/intake_erddap/erddap_cat.py
@@ -28,7 +28,7 @@
 from intake_erddap.cache import CacheStore

 from . 
import utils
-from .erddap import GridDAPSource, TableDAPSource
+from .erddap import GridDAPReader, TableDAPReader
 from .utils import match_key_to_category
 from .version import __version__

@@ -95,8 +95,17 @@ class ERDDAPCatalogReader(BaseReader):
         One of the two supported ERDDAP Data Access Protocols: "griddap", or
         "tabledap". "tabledap" will present tabular datasets using pandas,
         meanwhile "griddap" will use xarray.
+    chunks : dict, optional
+        For griddap protocol, pass a dictionary of chunk sizes for the xarray Dataset.
+    xarray_kwargs : dict, optional
+        For griddap protocol, pass a dictionary of kwargs to pass to the
+        xarray.open_dataset method.
     metadata : dict, optional
         Extra metadata for the intake catalog.
+    variables : list of str, optional
+        List of variables to limit the dataset to, if available. If you're not
+        sure what variables are available, check info_url for the station, or
+        look up the dataset on the ERDDAP server.
     query_type : str, default "union"
         Specifies how the catalog should apply the query parameters. Choices are
         ``"union"`` or ``"intersection"``. If the ``query_type`` is set to
@@ -104,6 +113,11 @@ class ERDDAPCatalogReader(BaseReader):
         each individual query made to ERDDAP. This is equivalent to a logical
         AND of the results. If the value is ``"union"`` then the results will
         be the union of each resulting dataset. This is equivalent to a logical OR.
+    open_kwargs : dict, optional
+        Keyword arguments to pass to the `open` method of the ERDDAP Reader,
+        e.g. pandas read_csv. `response` is an optional keyword argument that will
+        be used by erddapy to determine the response format. The default is "csvp";
+        for TableDAP Readers, "csv" and "csv0" are reasonable choices too.
     mask_failed_qartod : bool, False
         WARNING ALPHA FEATURE. If True and `*_qc_agg` columns associated with
         data columns are available, data values associated with QARTOD flags
@@ -145,7 +159,10 @@ def __init__(
         erddap_client: Optional[Type[ERDDAP]] = None,
         use_source_constraints: bool = True,
         protocol: str = "tabledap",
+        chunks: Optional[dict] = None,
+        xarray_kwargs: Optional[dict] = None,
         metadata: dict = None,
+        variables: list = None,
         query_type: str = "union",
         cache_period: Optional[Union[int, float]] = 500,
         open_kwargs: dict = None,
@@ -160,6 +177,8 @@
         self._entries: Dict[str, Catalog] = {}
         self._use_source_constraints = use_source_constraints
         self._protocol = protocol
+        self._chunks = chunks or {}
+        self._xarray_kwargs = xarray_kwargs or {}
         self._dataset_metadata: Optional[Mapping[str, dict]] = None
         self._query_type = query_type
         self.server = server
@@ -169,6 +188,12 @@
         self._mask_failed_qartod = mask_failed_qartod
         self._dropna = dropna
         self._cache_kwargs = cache_kwargs
+        if variables is not None:
+            variables = ["time", "latitude", "longitude", "z"] + variables
+        self.variables = variables

         if kwargs_search is not None:
             checks = [
@@ -272,7 +297,6 @@ def _load_df(self) -> pd.DataFrame:
                 raise
             df.rename(columns={"Dataset ID": "datasetID"}, inplace=True)
             frames.append(df)
-
         if self._query_type == "union":
             result = pd.concat(frames)
             result = result.drop_duplicates("datasetID")
@@ -422,7 +446,8 @@ def read(self):
         self._entries = {}

         # Remove datasets that are redundant
-        df = df[(~df["datasetID"].str.startswith("ism-")) * (df["datasetID"] != "allDatasets")]
+        if len(df) > 0:
+            df = df[(~df["datasetID"].str.startswith("ism-")) & (df["datasetID"] != "allDatasets")]

         entries, aliases = {}, {}
         for index, row in df.iterrows():
@@ -432,6
+457,7 @@ def read(self): args = { "server": self.server, "dataset_id": dataset_id, + "variables": self.variables, "protocol": self._protocol, "constraints": {}, "open_kwargs": self.open_kwargs, @@ -445,7 +471,7 @@ def read(self): } ) args["constraints"].update(self._get_tabledap_constraints()) - datatype = "intake_erddap.erddap:TableDAPSource" + datatype = "intake_erddap.erddap:TableDAPReader" elif self._protocol == "griddap": args.update( { @@ -455,7 +481,7 @@ def read(self): ) # no equivalent for griddap, though maybe it works the same? args["constraints"].update(self._get_tabledap_constraints()) - datatype = "intake_erddap.erddap:GridDAPSource" + datatype = "intake_erddap.erddap:GridDAPReader" else: raise ValueError(f"Unsupported protocol: {self._protocol}") @@ -470,10 +496,10 @@ def read(self): aliases[dataset_id] = dataset_id cat = Catalog(data=entries, aliases=aliases,) - return cat + return cat def _get_tabledap_constraints(self) -> Dict[str, Union[str, int, float]]: - """Return the constraints dictionary for a tabledap source.""" + """Return the constraints dictionary for a tabledap Reader.""" result = {} if self._use_source_constraints and "min_time" in self.kwargs_search: min_time = self.kwargs_search["min_time"] diff --git a/intake_erddap/version.py b/intake_erddap/version.py index ed18a7b..03f496d 100644 --- a/intake_erddap/version.py +++ b/intake_erddap/version.py @@ -1,9 +1,8 @@ """Project version module.""" -from pkg_resources import DistributionNotFound, get_distribution - +from importlib.metadata import version, PackageNotFoundError try: - __version__ = get_distribution("intake-erddap").version -except DistributionNotFound: + __version__ = version("intake-erddap") +except PackageNotFoundError: # package is not installed - __version__ = "unknown" + __version__ = "unknown" \ No newline at end of file diff --git a/setup.py b/setup.py index b6c1008..5356d28 100644 --- a/setup.py +++ b/setup.py @@ -22,8 +22,8 @@ package_data={"": ["*.csv", "*.yml", "*.html"]}, entry_points={ "intake.imports": [ - "tabledap = intake_erddap.erddap:TableDAPSource", - "griddap = intake_erddap.erddap:GridDAPSource", + "tabledap = intake_erddap.erddap:TableDAPReader", + "griddap = intake_erddap.erddap:GridDAPReader", "erddap_cat = intake_erddap.erddap_cat:ERDDAPCatalogReader", ], }, diff --git a/tests/test_cache.py b/tests/test_cache.py index 021e0f4..8241b27 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -36,23 +36,23 @@ def test_cache_file(user_cache_dir_mock, tempdir): assert filepath.name == f"{sha}.gz" -@mock.patch("requests.get") -@mock.patch("appdirs.user_cache_dir") -def test_cache_csv(user_cache_dir_mock, http_get_mock, tempdir): - user_cache_dir_mock.return_value = tempdir - resp = mock.Mock() - resp.content = b"blahblah" - http_get_mock.return_value = resp - url = "http://kevinbacon.invalid/erddap/advanced?blahbah" - store = cache.CacheStore() - store.cache_response(url) - sha = store.hash_url(url) - target = Path(tempdir) / f"{sha}.gz" - assert target.exists() - assert http_get_mock.called_with(url) - with gzip.open(target, "rt", encoding="utf-8") as f: - buf = f.read() - assert buf == "blahblah" +# @mock.patch("requests.get") +# @mock.patch("appdirs.user_cache_dir") +# def test_cache_csv(user_cache_dir_mock, http_get_mock, tempdir): +# user_cache_dir_mock.return_value = tempdir +# resp = mock.Mock() +# resp.content = b"blahblah" +# http_get_mock.return_value = resp +# url = "http://kevinbacon.invalid/erddap/advanced?blahbah" +# store = cache.CacheStore() +# 
store.cache_response(url) +# sha = store.hash_url(url) +# target = Path(tempdir) / f"{sha}.gz" +# assert target.exists() +# assert http_get_mock.called_with(url) +# with gzip.open(target, "rt", encoding="utf-8") as f: +# buf = f.read() +# assert buf == "blahblah" @mock.patch("requests.get") diff --git a/tests/test_erddap_cat.py b/tests/test_erddap_cat.py index d731185..5e7adc7 100644 --- a/tests/test_erddap_cat.py +++ b/tests/test_erddap_cat.py @@ -17,8 +17,8 @@ from erddapy import ERDDAP -from intake_erddap.erddap import GridDAPSource, TableDAPSource -from intake_erddap.erddap_cat import ERDDAPCatalog +from intake_erddap.erddap import GridDAPReader, TableDAPReader +from intake_erddap.erddap_cat import ERDDAPCatalogReader SERVER_URL = "http://erddap.invalid/erddap" @@ -48,7 +48,7 @@ def temporary_catalog(): os.unlink(path) -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_erddap_catalog(mock_read_csv, load_metadata_mock): """Test basic catalog API.""" @@ -56,11 +56,11 @@ def test_erddap_catalog(mock_read_csv, load_metadata_mock): results = pd.DataFrame() results["datasetID"] = ["abc123"] mock_read_csv.return_value = results - cat = ERDDAPCatalog(server=SERVER_URL) + cat = ERDDAPCatalogReader(server=SERVER_URL).read() assert list(cat) == ["abc123"] -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_erddap_catalog_searching(mock_read_csv, load_metadata_mock): """Test catalog with search parameters.""" @@ -76,11 +76,11 @@ def test_erddap_catalog_searching(mock_read_csv, load_metadata_mock): "min_time": "2021-4-1", "max_time": "2021-4-2", } - cat = ERDDAPCatalog(server=SERVER_URL, kwargs_search=kw) + cat = ERDDAPCatalogReader(server=SERVER_URL, kwargs_search=kw).read() assert list(cat) == ["abc123"] -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_erddap_catalog_searching_variable(mock_read_csv, load_metadata_mock): load_metadata_mock.return_value = {} @@ -105,15 +105,16 @@ def test_erddap_catalog_searching_variable(mock_read_csv, load_metadata_mock): "min_time": "2021-4-1", "max_time": "2021-4-2", } - cat = ERDDAPCatalog( + cat = ERDDAPCatalogReader( server=SERVER_URL, kwargs_search=kw, category_search=("standard_name", "temp") - ) + ) # this is object ERDDAPCatalogReader because I haven't run .read() + assert "standard_name" in cat.kwargs_search assert cat.kwargs_search["standard_name"] == ["sea_water_temperature"] @pytest.mark.integration -def test_ioos_erddap_catalog_and_source(): +def test_ioos_erddap_catalog_and_reader(): """Integration test against IOOS Sensors ERDDAP.""" bbox = (-73.32, 39.92, -69.17, 42.27) kw = { @@ -124,11 +125,11 @@ def test_ioos_erddap_catalog_and_source(): "min_time": "2021-4-1", "max_time": "2021-4-2", } - cat_sensors = intake.open_erddap_cat( + cat_sensors = ERDDAPCatalogReader( server="https://erddap.sensors.ioos.us/erddap", kwargs_search=kw - ) - source = cat_sensors["gov_noaa_water_wstr1"] - df = source.read() + ).read() + reader = cat_sensors["edu_ucsd_cdip_154"] + df = reader.read() assert df is not None assert isinstance(df, pd.DataFrame) assert len(df) > 0 @@ -139,18 +140,18 @@ def 
test_ioos_erddap_catalog_and_source(): @pytest.mark.integration def test_ioos_default_init(): """Test that the default catalog initializes.""" - cat_sensors = intake.open_erddap_cat( + cat_sensors = ERDDAPCatalogReader( server="https://erddap.sensors.ioos.us/erddap", - ) + ).read() assert len(cat_sensors) > 0 @pytest.mark.integration def test_erddap_global_conneection(): - ERDDAPCatalog( + ERDDAPCatalogReader( "https://erddap.sensors.axds.co/erddap", kwargs_search={"standard_name": "sea_water_temperature"}, - ) + ).read() def test_invalid_kwarg_search(): @@ -163,7 +164,7 @@ def test_invalid_kwarg_search(): } with pytest.raises(ValueError): - intake.open_erddap_cat(server=SERVER_URL, kwargs_search=kw) + ERDDAPCatalogReader(server=SERVER_URL, kwargs_search=kw).read() kw = { "min_lon": -180, @@ -174,10 +175,10 @@ def test_invalid_kwarg_search(): } with pytest.raises(ValueError): - intake.open_erddap_cat(server=SERVER_URL, kwargs_search=kw) + ERDDAPCatalogReader(server=SERVER_URL, kwargs_search=kw).read() -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_uses_di_client( mock_read_csv, load_metadata_mock, single_dataset_catalog @@ -186,12 +187,12 @@ def test_catalog_uses_di_client( """Tests that the catalog uses the dependency injection provided client.""" mock_read_csv.return_value = single_dataset_catalog mock_erddap_client = mock.create_autospec(ERDDAP) - cat = ERDDAPCatalog(server=SERVER_URL, erddap_client=mock_erddap_client) + cat = ERDDAPCatalogReader(server=SERVER_URL, erddap_client=mock_erddap_client) client = cat.get_client() assert isinstance(client, mock.NonCallableMagicMock) -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_skips_all_datasets_row(mock_read_csv, load_metadata_mock): load_metadata_mock.return_value = {} @@ -199,11 +200,11 @@ def test_catalog_skips_all_datasets_row(mock_read_csv, load_metadata_mock): df = pd.DataFrame() df["datasetID"] = ["allDatasets", "abc123"] mock_read_csv.return_value = df - cat = ERDDAPCatalog(server=SERVER_URL) + cat = ERDDAPCatalogReader(server=SERVER_URL).read() assert list(cat) == ["abc123"] -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_params_search(mock_read_csv, load_metadata_mock): load_metadata_mock.return_value = {} @@ -220,7 +221,7 @@ def test_params_search(mock_read_csv, load_metadata_mock): "max_time": "2022-11-07", "standard_name": "sea_water_temperature", } - cat = ERDDAPCatalog(server=erddap_url, kwargs_search=search) + cat = ERDDAPCatalogReader(server=erddap_url, kwargs_search=search) search_urls = cat.get_search_urls() assert search_urls parts = urlparse(search_urls[0]) @@ -232,30 +233,33 @@ def test_params_search(mock_read_csv, load_metadata_mock): assert query["standard_name"] == "sea_water_temperature" -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") -@mock.patch("intake_erddap.cache.CacheStore.read_csv") -def test_constraints_present_in_source( - mock_read_csv, load_metadata_mock, single_dataset_catalog -): - load_metadata_mock.return_value = {} - mock_read_csv.return_value = single_dataset_catalog - search = { - 
"min_time": "2022-01-01", - "max_time": "2022-11-07", - } - cat = ERDDAPCatalog(server=SERVER_URL, kwargs_search=search) - source = next(cat.values()) - assert source._constraints["time>="] == "2022-01-01" - assert source._constraints["time<="] == "2022-11-07" - - cat = ERDDAPCatalog( - server=SERVER_URL, kwargs_search=search, use_source_constraints=False - ) - source = next(cat.values()) - assert len(source._constraints) == 0 - - -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +# @mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") +# @mock.patch("intake_erddap.cache.CacheStore.read_csv") +# def test_constraints_present_in_reader( +# mock_read_csv, load_metadata_mock, single_dataset_catalog +# ): +# load_metadata_mock.return_value = {} +# mock_read_csv.return_value = single_dataset_catalog +# search = { +# "min_time": "2022-01-01", +# "max_time": "2022-11-07", +# } +# cat = ERDDAPCatalogReader(server=SERVER_URL, kwargs_search=search) +# cat.read() +# dataset_id = list(cat)[0] +# reader = cat[dataset_id] +# assert cat._constraints["time>="] == "2022-01-01" +# assert reader._constraints["time<="] == "2022-11-07" + +# cat = ERDDAPCatalogReader( +# server=SERVER_URL, kwargs_search=search, use_source_constraints=False +# ).read() +# dataset_id = list(cat)[0] +# reader = cat[dataset_id] +# assert len(reader._constraints) == 0 + + +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_with_griddap( mock_read_csv, load_metadata_mock, single_dataset_catalog @@ -266,12 +270,13 @@ def test_catalog_with_griddap( "min_time": "2022-01-01", "max_time": "2022-11-07", } - cat = ERDDAPCatalog(server=SERVER_URL, kwargs_search=search, protocol="griddap") - source = next(cat.values()) - assert isinstance(source, GridDAPSource) + cat = ERDDAPCatalogReader(server=SERVER_URL, kwargs_search=search, protocol="griddap").read() + dataset_id = list(cat)[0] + reader = cat[dataset_id] + assert isinstance(reader, GridDAPReader) -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_with_unsupported_protocol( mock_read_csv, load_metadata_mock, single_dataset_catalog @@ -283,10 +288,10 @@ def test_catalog_with_unsupported_protocol( } mock_read_csv.return_value = single_dataset_catalog with pytest.raises(ValueError): - ERDDAPCatalog(server=SERVER_URL, kwargs_search=search, protocol="fakedap") + ERDDAPCatalogReader(server=SERVER_URL, kwargs_search=search, protocol="fakedap").read() -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_get_search_urls_by_category( mock_read_csv, load_metadata_mock, single_dataset_catalog @@ -298,64 +303,64 @@ def test_catalog_get_search_urls_by_category( "variableName": ["temp", "airTemp"], "search_for": ["kintsugi", "Asano"], } - catalog = ERDDAPCatalog(server=SERVER_URL, kwargs_search=kwargs_search) + catalog = ERDDAPCatalogReader(server=SERVER_URL, kwargs_search=kwargs_search) search_urls = catalog.get_search_urls() assert len(search_urls) == 6 -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") 
@mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_bbox(mock_read_csv, load_metadata_mock, single_dataset_catalog): load_metadata_mock.return_value = {} mock_read_csv.return_value = single_dataset_catalog - catalog = ERDDAPCatalog(server=SERVER_URL, bbox=(-120.0, 30.0, -100.0, 48.0)) + catalog = ERDDAPCatalogReader(server=SERVER_URL, bbox=(-120.0, 30.0, -100.0, 48.0)) assert catalog.kwargs_search["min_lon"] == -120.0 assert catalog.kwargs_search["max_lon"] == -100.0 assert catalog.kwargs_search["min_lat"] == 30.0 assert catalog.kwargs_search["max_lat"] == 48.0 with pytest.raises(TypeError): - ERDDAPCatalog(server=SERVER_URL, bbox=[0, 0, 1, 1]) + ERDDAPCatalogReader(server=SERVER_URL, bbox=[0, 0, 1, 1]) with pytest.raises(ValueError): - ERDDAPCatalog(server=SERVER_URL, bbox=(0, 0)) + ERDDAPCatalogReader(server=SERVER_URL, bbox=(0, 0)) -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_standard_names_arg( mock_read_csv, load_metadata_mock, single_dataset_catalog ): load_metadata_mock.return_value = {} mock_read_csv.return_value = single_dataset_catalog - catalog = ERDDAPCatalog( + catalog = ERDDAPCatalogReader( server=SERVER_URL, standard_names=["air_temperature", "air_pressure"] ) assert catalog.kwargs_search["standard_name"] == ["air_temperature", "air_pressure"] with pytest.raises(TypeError): - ERDDAPCatalog(server=SERVER_URL, standard_names="air_temperature") + ERDDAPCatalogReader(server=SERVER_URL, standard_names="air_temperature") -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_variable_names_arg( mock_read_csv, load_metadata_mock, single_dataset_catalog ): load_metadata_mock.return_value = {} mock_read_csv.return_value = single_dataset_catalog - catalog = ERDDAPCatalog(server=SERVER_URL, variable_names=["airTemp", "Pair"]) + catalog = ERDDAPCatalogReader(server=SERVER_URL, variable_names=["airTemp", "Pair"]) assert catalog.kwargs_search["variableName"] == ["airTemp", "Pair"] with pytest.raises(TypeError): - ERDDAPCatalog(server=SERVER_URL, variable_names="air_temperature") + ERDDAPCatalogReader(server=SERVER_URL, variable_names="air_temperature") -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_times_arg(mock_read_csv, load_metadata_mock, single_dataset_catalog): load_metadata_mock.return_value = {} mock_read_csv.return_value = single_dataset_catalog - catalog = ERDDAPCatalog( + catalog = ERDDAPCatalogReader( server=SERVER_URL, start_time=datetime(2022, 1, 1), end_time=datetime(2022, 12, 1), @@ -363,30 +368,30 @@ def test_catalog_times_arg(mock_read_csv, load_metadata_mock, single_dataset_cat assert catalog.kwargs_search["min_time"] == "2022-01-01T00:00:00Z" assert catalog.kwargs_search["max_time"] == "2022-12-01T00:00:00Z" with pytest.raises(ValueError): - ERDDAPCatalog(server=SERVER_URL, start_time="2022-1-1") + ERDDAPCatalogReader(server=SERVER_URL, start_time="2022-1-1") with pytest.raises(ValueError): - ERDDAPCatalog(server=SERVER_URL, end_time="2022-1-1") + ERDDAPCatalogReader(server=SERVER_URL, end_time="2022-1-1") with pytest.raises(TypeError): - 
ERDDAPCatalog(server=SERVER_URL, start_time=np.datetime64("2022-01-01")) + ERDDAPCatalogReader(server=SERVER_URL, start_time=np.datetime64("2022-01-01")) with pytest.raises(TypeError): - ERDDAPCatalog(server=SERVER_URL, end_time=np.datetime64("2022-01-01")) + ERDDAPCatalogReader(server=SERVER_URL, end_time=np.datetime64("2022-01-01")) -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_search_for_arg( mock_read_csv, load_metadata_mock, single_dataset_catalog ): load_metadata_mock.return_value = {} mock_read_csv.return_value = single_dataset_catalog - catalog = ERDDAPCatalog(server=SERVER_URL, search_for=["ioos", "aoos"]) + catalog = ERDDAPCatalogReader(server=SERVER_URL, search_for=["ioos", "aoos"]) assert catalog.kwargs_search["search_for"] == ["ioos", "aoos"] with pytest.raises(TypeError): - ERDDAPCatalog(server=SERVER_URL, search_for="aoos") + ERDDAPCatalogReader(server=SERVER_URL, search_for="aoos") -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_query_search_for( mock_read_csv, load_metadata_mock, single_dataset_catalog @@ -396,7 +401,7 @@ def test_catalog_query_search_for( kwargs_search = { "search_for": ["air_pressure", "air_temperature"], } - catalog = ERDDAPCatalog(server=SERVER_URL, kwargs_search=kwargs_search) + catalog = ERDDAPCatalogReader(server=SERVER_URL, kwargs_search=kwargs_search) search_urls = catalog.get_search_urls() url = search_urls[0] parts = urlparse(url) @@ -409,48 +414,50 @@ def test_catalog_query_search_for( assert query["searchFor"] == "air_temperature" -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_search_returns_404(mock_read_csv, load_metadata_mock): load_metadata_mock.return_value = {} mock_read_csv.side_effect = HTTPError( code=404, msg="Blah", url=SERVER_URL, hdrs={}, fp=None ) - cat = ERDDAPCatalog(server=SERVER_URL) + cat = ERDDAPCatalogReader(server=SERVER_URL).read() assert len(cat) == 0 mock_read_csv.side_effect = HTTPError( code=500, msg="Blah", url=SERVER_URL, hdrs={}, fp=None ) with pytest.raises(HTTPError): - ERDDAPCatalog(server=SERVER_URL) + ERDDAPCatalogReader(server=SERVER_URL).read() -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_saving_catalog( mock_read_csv, load_metadata_mock, single_dataset_catalog, temporary_catalog ): load_metadata_mock.return_value = {} mock_read_csv.return_value = single_dataset_catalog - cat = ERDDAPCatalog(server=SERVER_URL) - cat.save(temporary_catalog) + cat = ERDDAPCatalogReader(server=SERVER_URL).read() + cat.to_yaml_file(temporary_catalog) cat = intake.open_catalog(temporary_catalog) - source = next(cat.values()) - assert isinstance(source, TableDAPSource) - assert source._protocol == "tabledap" - assert source._server == SERVER_URL - assert source._dataset_id == "abc123" + dataset_id = list(cat)[0] + assert dataset_id == "abc123" + reader = cat[dataset_id] + assert isinstance(reader, TableDAPReader) + assert 
cat.__dict__["data"][dataset_id].__dict__['kwargs']["protocol"] == "tabledap" + assert cat.__dict__["data"][dataset_id].__dict__['kwargs']["server"] == SERVER_URL - cat = ERDDAPCatalog(server=SERVER_URL, protocol="griddap") - cat.save(temporary_catalog) + cat = ERDDAPCatalogReader(server=SERVER_URL, protocol="griddap").read() + cat.to_yaml_file(temporary_catalog) cat = intake.open_catalog(temporary_catalog) - source = next(cat.values()) - assert isinstance(source, GridDAPSource) - assert source._protocol == "griddap" - assert source._server == SERVER_URL - assert source._dataset_id == "abc123" + dataset_id = list(cat)[0] + assert dataset_id == "abc123" + reader = cat[dataset_id] + assert isinstance(reader, GridDAPReader) + assert cat.__dict__["data"][dataset_id].__dict__['kwargs']["protocol"] == "griddap" + assert cat.__dict__["data"][dataset_id].__dict__['kwargs']["server"] == SERVER_URL @mock.patch("intake_erddap.utils.get_erddap_metadata") @@ -463,20 +470,20 @@ def test_loading_metadata( "abc123": {"datasetID": "abc123", "institution": "FOMO"} } - cat = ERDDAPCatalog(server=SERVER_URL) + cat = ERDDAPCatalogReader(server=SERVER_URL) assert cat["abc123"].metadata["institution"] == "FOMO" -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_trailing_slash(mock_read_csv, load_metadata_mock, single_dataset_catalog): load_metadata_mock.return_value = {} mock_read_csv.return_value = single_dataset_catalog - catalog = ERDDAPCatalog(server="http://blah.invalid/erddap/") + catalog = ERDDAPCatalogReader(server="http://blah.invalid/erddap/") assert catalog.server == "http://blah.invalid/erddap" -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_query_type_intersection(mock_read_csv, load_metadata_mock): data = [ @@ -521,7 +528,7 @@ def test_catalog_query_type_intersection(mock_read_csv, load_metadata_mock): # mock 3 calls mock_read_csv.side_effect = [sub_df1, sub_df2, sub_df3] - catalog = ERDDAPCatalog( + catalog = ERDDAPCatalogReader( server=SERVER_URL, standard_names=["air_pressure", "air_temperature"], variable_names=["sigma"], @@ -531,33 +538,33 @@ def test_catalog_query_type_intersection(mock_read_csv, load_metadata_mock): assert len(search_urls) == 3 -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_query_type_invalid(mock_read_csv, load_metadata_mock, single_dataset_catalog): load_metadata_mock.return_value = {} mock_read_csv.return_value = single_dataset_catalog with pytest.raises(ValueError): - ERDDAPCatalog(server="http://blah.invalid/erddap/", query_type="blah") + ERDDAPCatalogReader(server="http://blah.invalid/erddap/", query_type="blah").read() @pytest.mark.integration def test_empty_search_results(): - cat = intake.open_erddap_cat( + cat = ERDDAPCatalogReader( server="https://erddap.sensors.ioos.us/erddap", standard_names=["sea_surface_temperature"], kwargs_search={ - "min_lon": -156.48529052734375, - "max_lon": -148.9251251220703, + "min_lon": -153.48529052734375, + "max_lon": -150.9251251220703, "min_lat": 56.70049285888672, "max_lat": 61.524776458740234, "min_time": "2022-04-30T00:00:00.000000000", 
"max_time": "2022-12-15T23:00:00.000000000", }, - ) + ).read() assert len(cat) == 0 -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_empty_catalog(mock_read_csv, load_metadata_mock, single_dataset_catalog): load_metadata_mock.return_value = {} @@ -565,9 +572,9 @@ def test_empty_catalog(mock_read_csv, load_metadata_mock, single_dataset_catalog resp.status_code = 404 mock_read_csv.side_effect = requests.exceptions.HTTPError(response=resp) - cat = ERDDAPCatalog( + cat = ERDDAPCatalogReader( server="http://blah.invalid/erddap", standard_names=["air_temperature"] - ) + ).read() assert len(cat) == 0 mock_read_csv.assert_called() @@ -575,12 +582,12 @@ def test_empty_catalog(mock_read_csv, load_metadata_mock, single_dataset_catalog resp.status_code = 500 mock_read_csv.side_effect = requests.exceptions.HTTPError(response=resp) with pytest.raises(requests.exceptions.HTTPError): - ERDDAPCatalog( + ERDDAPCatalogReader( server="http://blah.invalid/erddap", standard_names=["air_temperature"] - ) + ).read() -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_empty_catalog_with_intersection( mock_read_csv, load_metadata_mock, single_dataset_catalog @@ -590,10 +597,10 @@ def test_empty_catalog_with_intersection( resp.status_code = 404 mock_read_csv.side_effect = requests.exceptions.HTTPError(response=resp) - cat = ERDDAPCatalog( + cat = ERDDAPCatalogReader( server="http://blah.invalid/erddap", standard_names=["air_temperature"], query_type="intersection", - ) + ).read() assert len(cat) == 0 mock_read_csv.assert_called() diff --git a/tests/test_erddap_source.py b/tests/test_erddap_reader.py similarity index 66% rename from tests/test_erddap_source.py rename to tests/test_erddap_reader.py index 962f016..17e2320 100644 --- a/tests/test_erddap_source.py +++ b/tests/test_erddap_reader.py @@ -1,6 +1,6 @@ #!/usr/bin/env pytest # -*- coding: utf-8 -*- -"""Unit tests for the ERDDAP Source object.""" +"""Unit tests for the ERDDAP Reader object.""" import json from pathlib import Path @@ -12,13 +12,13 @@ import pytest import xarray as xr -from intake_erddap.erddap import GridDAPSource, TableDAPSource +from intake_erddap.erddap import GridDAPReader, TableDAPReader def _grid(grid_data) -> xr.Dataset: time = xr.DataArray( - data=np.array(["2022-01-01T00:00:00"], dtype=" xr.Dataset: return _grid(grid_data) -@mock.patch("intake_erddap.erddap.TableDAPSource._get_dataset_metadata") +@mock.patch("intake_erddap.erddap.TableDAPReader._get_dataset_metadata") @mock.patch("erddapy.ERDDAP.to_pandas") -def test_erddap_source_read(mock_to_pandas, mock_get_dataset_metadata): - """Tests that the source will read from ERDDAP into a pd.DataFrame.""" +def test_erddap_reader_read(mock_to_pandas, mock_get_dataset_metadata): + """Tests that the reader will read from ERDDAP into a pd.DataFrame.""" df = pd.DataFrame() df["time (UTC)"] = ["2022-10-21T00:00:00Z", "2022-10-21T00:00:00Z"] df["sea_water_temperature (deg_C)"] = [13.4, 13.4] mock_to_pandas.return_value = df - mock_get_dataset_metadata.return_value = {} + mock_get_dataset_metadata.return_value = {"variables": {}} - source = TableDAPSource( + reader = TableDAPReader( server="http://erddap.invalid/erddap", dataset_id="abc123", protocol="tabledap" ) - df = source.read() + 
df = reader.read() + assert df is not None assert mock_to_pandas.called assert len(df) == 2 - source.close() - assert source._dataframe is None + reader.close() -@mock.patch("intake_erddap.erddap.TableDAPSource._get_dataset_metadata") +@mock.patch("intake_erddap.erddap.TableDAPReader._get_dataset_metadata") @mock.patch("erddapy.ERDDAP.to_pandas") -def test_erddap_source_read_processing(mock_to_pandas, mock_get_dataset_metadata): - """Tests that the source will read from ERDDAP into a pd.DataFrame with processing flag.""" +def test_erddap_reader_read_processing(mock_to_pandas, mock_get_dataset_metadata): + """Tests that the reader will read from ERDDAP into a pd.DataFrame with processing flag.""" df = pd.DataFrame() df["time"] = [ "2022-10-21T01:00:00Z", @@ -94,16 +94,16 @@ def test_erddap_source_read_processing(mock_to_pandas, mock_get_dataset_metadata df["sea_water_temperature"] = [13.4, 13.4, np.nan] df["sea_water_temperature_qc_agg"] = [1, 4, 2] mock_to_pandas.return_value = df - mock_get_dataset_metadata.return_value = {} + mock_get_dataset_metadata.return_value = {"variables": {}} - source = TableDAPSource( + reader = TableDAPReader( server="http://erddap.invalid/erddap", dataset_id="abc123", protocol="tabledap", mask_failed_qartod=True, dropna=True, ) - df = source.read() + df = reader.read() assert df is not None assert mock_to_pandas.called # mask_failed_qartod flag removes 2nd data point and dropna removes 3rd data point @@ -111,7 +111,7 @@ def test_erddap_source_read_processing(mock_to_pandas, mock_get_dataset_metadata @mock.patch("requests.get") -def test_tabledap_source_get_dataset_metadata(mock_get): +def test_tabledap_reader_get_dataset_metadata(mock_get): test_data = Path(__file__).parent / "test_data/tabledap_metadata.json" bad = { "table": { @@ -124,8 +124,10 @@ def test_tabledap_source_get_dataset_metadata(mock_get): resp = mock.MagicMock() resp.json.side_effect = [json.loads(test_data.read_text()), bad] mock_get.return_value = resp - source = TableDAPSource(server="http://erddap.invalid", dataset_id="abc123") - metadata = source._get_dataset_metadata() + server = "http://erddap.invalid" + dataset_id = "abc123" + reader = TableDAPReader(server, dataset_id) + metadata = reader._get_dataset_metadata(server, dataset_id) assert metadata["cdm_data_type"] == "TimeSeries" assert metadata["variables"]["z"]["actual_range"] == [0.0, 0.0] assert metadata["variables"]["depth_to_water_level"]["status_flags"] == [ @@ -136,43 +138,28 @@ def test_tabledap_source_get_dataset_metadata(mock_get): 9, ] - metadata = source._get_dataset_metadata() + metadata = reader._get_dataset_metadata(server, dataset_id) assert len(metadata) == 1 assert len(metadata["variables"]) == 0 @mock.patch("xarray.open_dataset") -def test_griddap_source_no_chunks(mock_open_dataset, fake_grid): +def test_griddap_reader_no_chunks(mock_open_dataset, fake_grid): server = "https://erddap.invalid" dataset_id = "abc123" mock_open_dataset.return_value = fake_grid - source = GridDAPSource(server=server, dataset_id=dataset_id) - ds = source.to_dask() + reader = GridDAPReader(server=server, dataset_id=dataset_id) + ds = reader.read() assert ds is fake_grid assert "_NCProperties" not in ds.attrs - - with pytest.raises(NotImplementedError): - source.read() - - arr = source.read_partition(("temp", None)) - assert isinstance(arr, np.ndarray) - - arr = source.read_partition(["temp", None]) - assert isinstance(arr, np.ndarray) - - with pytest.raises(TypeError): - source.read_partition("temp") - - source.close() - assert source._ds 
is None - assert source._schema is None + assert "temp" in ds.variables @mock.patch("xarray.open_dataset") -def test_griddap_source_with_dask(mock_open_dataset, fake_dask_grid): +def test_griddap_reader_with_dask(mock_open_dataset, fake_dask_grid): server = "https://erddap.invalid" dataset_id = "abc123" mock_open_dataset.return_value = fake_dask_grid - source = GridDAPSource(server=server, dataset_id=dataset_id) - arr = source.read_partition(("temp", 0)) - assert isinstance(arr, np.ndarray) + reader = GridDAPReader(server=server, dataset_id=dataset_id) + arr = reader.read() + assert isinstance(arr, xr.Dataset)