Update to xarray datatree
ghiggi committed Jan 7, 2025
1 parent a69dca5 commit 8d8b535
Showing 16 changed files with 73 additions and 47 deletions.
3 changes: 1 addition & 2 deletions ci/environment.yaml
@@ -31,7 +31,6 @@ dependencies:
- trame-vtk
- trame-vuetify
- trollsift
- xarray-datatree
- xarray
- xarray>=2025.01.0
- xoak
- ximage
1 change: 0 additions & 1 deletion ci/environment_latest.yaml
@@ -31,6 +31,5 @@ dependencies:
- trame-vuetify
- trollsift
- xarray
- xarray-datatree
- xoak
- ximage
14 changes: 11 additions & 3 deletions docs/source/00_introduction.rst
@@ -586,6 +586,7 @@ IMERG Precipitation
The Integrated Multi-satellite Retrievals for GPM (IMERG) is an advanced algorithm designed
to generate a high-resolution precipitation product every 30 minutes with a spatial resolution of 0.1° x 0.1°,
covering the full globe (up to version 6, a quasi-global area from 60°S to 60°N).
The latest IMERG product covers the time period from January 1998 to present.

By leveraging measurements from Geostationary (GEO) IR imagers and the :ref:`GPM PMW sensors constellation <gpm_constellation>`,
IMERG provides a "best-estimate" of 30-minute average precipitation rates.
@@ -606,8 +607,11 @@ While IMERG Early uses only forward propagation in time (extrapolation in time),

When PMW data are too sparse, IMERG incorporates precipitation estimates derived from GEO IR imagery through a weighted Kalman filter.

IR-based precipitation estimates are derived from the
`Climate Prediction Center (CPC) Merged 4-km Global IR data product <https://disc.gsfc.nasa.gov/datasets/GPM_MERGIR_1/summary>`_.
IR-based precipitation estimates are derived from the NOAA Climate Data Record (CDR) of Gridded Satellite Data from the ISCCP B1 (GridSat-B1) IR dataset for
all timesteps between January 1998 and February 7, 2002, 20:00:00 UTC.
Starting from February 7, 2002, 20:00:00 UTC, IR-based precipitation estimates
are obtained from the `NOAA Climate Prediction Center (CPC) Merged 4-km Global IR data product <https://disc.gsfc.nasa.gov/datasets/GPM_MERGIR_1/summary>`_.

This dataset composites infrared (IR) brightness temperature measurements from numerous geostationary sensors over their operational periods,
including European (METEOSAT-5/7/8/9/10/11), Japanese (GMS-5, MTSat-1R/2, Himawari-8/9), and U.S. (GOES-8/9/10/11/12/13/14/15/16/17/18) satellites,
every 30 minutes between 60°N/S.
@@ -931,10 +935,14 @@ Please also note that Google Earth Engine hosts version 6 of `GSMaP
and `IMERG <https://developers.google.com/earth-engine/datasets/catalog/NASA_GPM_L3_IMERG_V06>`_.

GSMaP can be visualized on the `JAXA Global Rainfall Watch <https://sharaku.eorc.jaxa.jp/GSMaP/index.htm>`_,
while IMERG on the `GPM IMERG Global Viewer <https://gpm.nasa.gov/data/visualization/global-viewer>`_ and the `EOSDIS WorldView Portal <https://worldview.earthdata.nasa.gov/?v=-235.13866988428558,-76.35016978404038,104.5800850894752,96.99821113230026&l=Reference_Labels_15m(hidden),Reference_Features_15m(hidden),Coastlines_15m,IMERG_Precipitation_Rate,VIIRS_NOAA20_CorrectedReflectance_TrueColor(hidden),VIIRS_SNPP_CorrectedReflectance_TrueColor(hidden),MODIS_Aqua_CorrectedReflectance_TrueColor(hidden),MODIS_Terra_CorrectedReflectance_TrueColor&lg=true&t=2024-02-08-T03%3A43%3A10Z>`_.
while IMERG can be visualized on
the `RAIN-Global Viewer (Regional Animations of IMERG in Near-realtime - Global Edition) <https://storm.pps.eosdis.nasa.gov/storm/outreach/RAIN-Global.html>`_,
the `GPM IMERG Global Viewer <https://gpm.nasa.gov/data/visualization/global-viewer>`_ and
the `EOSDIS WorldView Portal <https://worldview.earthdata.nasa.gov/?v=-235.13866988428558,-76.35016978404038,104.5800850894752,96.99821113230026&l=Reference_Labels_15m(hidden),Reference_Features_15m(hidden),Coastlines_15m,IMERG_Precipitation_Rate,VIIRS_NOAA20_CorrectedReflectance_TrueColor(hidden),VIIRS_SNPP_CorrectedReflectance_TrueColor(hidden),MODIS_Aqua_CorrectedReflectance_TrueColor(hidden),MODIS_Terra_CorrectedReflectance_TrueColor&lg=true&t=2024-02-08-T03%3A43%3A10Z>`_.

The `GES DISC Interactive Online Visualization ANd aNalysis Infrastructure (Giovanni) <https://giovanni.gsfc.nasa.gov/giovanni/>`_ also provides quick access to analysis of IMERG products.


.. _useful_resources:

Useful Resources
6 changes: 6 additions & 0 deletions gpm/dataset/dataset.py
@@ -139,6 +139,7 @@ def _open_valid_granules(
prefix_group,
chunks,
parallel=False,
**kwargs,
):
"""Open a list of HDF granules.
@@ -165,6 +166,7 @@
prefix_group=prefix_group,
chunks=chunks,
parallel=parallel,
**kwargs,

)

if len(list_ds) == 0:
@@ -202,6 +204,7 @@ def open_dataset(
parallel=False,
prefix_group=False,
verbose=False,
**kwargs,
):
"""Lazily map HDF5 data into xarray.Dataset with relevant GPM data and attributes.
@@ -273,6 +276,8 @@
If ``parallel=True``, ``'chunks'`` cannot be ``None``.
The underlying data must be :py:class:`dask.array.Array`.
The default is ``False``.
**kwargs : dict
Additional keyword arguments passed to :py:func:`~xarray.open_dataset` for each group.
Returns
-------
@@ -318,6 +323,7 @@
prefix_group=prefix_group,
parallel=parallel,
chunks=chunks,
**kwargs,

)

##-------------------------------------------------------------------------.
15 changes: 12 additions & 3 deletions gpm/dataset/datatree.py
@@ -27,7 +27,6 @@
"""This module contains functions to read a GPM granule into a DataTree object."""
import os

import datatree
import xarray as xr

import gpm
@@ -41,17 +40,27 @@
# --> gpm.open_dataset(datatree=False) # or if multiple scan_modes provided


def open_datatree(filepath, chunks={}, decode_cf=False, use_api_defaults=True):
def open_datatree(filepath, chunks={}, decode_cf=False, use_api_defaults=True, **kwargs):
"""Open HDF5 in datatree object.
- chunks={} --> Lazy map to dask.array
--> Wait for https://github.com/pydata/xarray/pull/7948
--> Maybe need to implement the "auto" option manually so that it defaults to the full shape
- chunks="auto" --> datatree fails. Cannot estimate the size of object dtype!
- chunks=None --> Lazy map to numpy.array
**kwargs : dict
Additional keyword arguments passed to :py:func:`~xarray.open_dataset` for each group.
"""
try:
dt = datatree.open_datatree(filepath, engine="netcdf4", chunks=chunks, decode_cf=decode_cf)
dt = xr.open_datatree(
filepath,
engine="netcdf4",
chunks=chunks,
decode_cf=decode_cf,
decode_times=False,
**kwargs,
)

check_non_empty_granule(dt, filepath)
except Exception as e:
check_valid_granule(filepath)
8 changes: 7 additions & 1 deletion gpm/dataset/decoding/cf.py
@@ -40,12 +40,18 @@ def apply_cf_decoding(ds):
"""
# Take care of numpy 2.0 FillValue CF Decoding issue
# - https://github.com/pydata/xarray/issues/9381
vars_and_coords = list(ds.data_vars) + list(ds.coords)
if version.parse(np.__version__) >= version.parse("2.0.0"):
vars_and_coords = list(ds.data_vars) + list(ds.coords)
for var in vars_and_coords:
if "_FillValue" in ds[var].attrs:
ds[var].attrs["_FillValue"] = ds[var].data.dtype.type(ds[var].attrs["_FillValue"])

# At some point (xarray > 2023.08), this became required
# --> TODO: update code in decoding attrs ...
for var in vars_and_coords:
if "_FillValue" in ds[var].attrs:
ds[var].encoding["_FillValue"] = ds[var].attrs.pop("_FillValue")

# Decode with xr.decode_cf
with warnings.catch_warnings():
warnings.simplefilter(action="ignore", category=FutureWarning)
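A condensed sketch of the `_FillValue` handling this hunk introduces: the attribute is cast through the array dtype (the numpy >= 2.0 issue referenced above) and then moved from `.attrs` into `.encoding` before CF decoding. The dataset and variable below are illustrative, not the real GPM variables; the actual function also version-gates the cast on numpy 2.0.

```python
import numpy as np
import xarray as xr

# Toy dataset standing in for a GPM granule variable with a CF _FillValue attribute
ds = xr.Dataset({"precip": ("x", np.array([0.5, -9999.0], dtype="float32"))})
ds["precip"].attrs["_FillValue"] = -9999.0

vars_and_coords = list(ds.data_vars) + list(ds.coords)
for var in vars_and_coords:
    if "_FillValue" in ds[var].attrs:
        # Cast the fill value to the array's own dtype, then move it to encoding
        fill = ds[var].data.dtype.type(ds[var].attrs.pop("_FillValue"))
        ds[var].encoding["_FillValue"] = fill

assert "_FillValue" not in ds["precip"].attrs
assert ds["precip"].encoding["_FillValue"] == np.float32(-9999.0)
```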
1 change: 1 addition & 0 deletions gpm/dataset/dimensions.py
@@ -94,6 +94,7 @@
["lon", "lat"],
["longitude", "latitude"],
["x", "y"], # compatibility with e.g. satpy/gpm_geo
["nx", "ny"], # compatibility with TC-PRIMED
["transect"],
["trajectory"],
["beam"], # when stacking 2D spatial dims
7 changes: 6 additions & 1 deletion gpm/dataset/granule.py
@@ -192,12 +192,13 @@ def _open_granule(
decode_cf,
chunks,
prefix_group,
**kwargs,
):
"""Open granule file into xarray Dataset."""
from gpm.dataset.datatree import open_datatree

# Open datatree
dt = open_datatree(filepath=filepath, chunks=chunks, decode_cf=decode_cf, use_api_defaults=True)
dt = open_datatree(filepath=filepath, chunks=chunks, decode_cf=decode_cf, use_api_defaults=True, **kwargs)


# Retrieve the granule dataset (without cf decoding)
ds = _get_scan_mode_dataset(
@@ -230,6 +231,7 @@ def open_granule(
decode_cf=True,
chunks={},
prefix_group=False,
**kwargs,
):
"""Create a lazy xarray.Dataset with relevant GPM data and attributes for a specific granule.
@@ -271,6 +273,8 @@
prefix_group: bool, optional
Whether to add the group as a prefix to the variable names.
The default is ``False``.
**kwargs : dict
Additional keyword arguments passed to :py:func:`~xarray.open_dataset` for each group.
Returns
-------
@@ -297,6 +301,7 @@
decode_cf=False,
chunks=chunks,
prefix_group=prefix_group,
**kwargs,

)

# Finalize granule
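The recurring `**kwargs` additions across `_open_valid_granules`, `open_dataset`, `open_granule`, and `open_datatree` form a single pass-through chain: keyword arguments given to the public entry point reach the backend opener untouched. A minimal pure-Python sketch of the pattern (function bodies are illustrative stand-ins, not the real gpm-api implementations):

```python
def backend_open(filepath, chunks=None, **kwargs):
    # Stand-in for xr.open_datatree(...): just records what it receives
    return {"filepath": filepath, "chunks": chunks, **kwargs}

def open_granule(filepath, chunks={}, **kwargs):
    # Forwards everything it does not handle itself
    return backend_open(filepath, chunks=chunks, **kwargs)

def open_dataset(filepath, chunks={}, **kwargs):
    # e.g. open_dataset(..., mask_and_scale=False) reaches the backend untouched
    return open_granule(filepath, chunks=chunks, **kwargs)

result = open_dataset("granule.HDF5", mask_and_scale=False)
assert result["mask_and_scale"] is False
```

The trade-off flagged by the (removed) check-run noise is real: each forwarded `**kwargs` widens the effective signature of every function in the chain, which is why such chains are usually kept short.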
5 changes: 2 additions & 3 deletions gpm/io/data_integrity.py
@@ -43,9 +43,8 @@ def get_corrupted_filepaths(filepaths):
for filepath in filepaths:
try:
# Try open the HDF file

# DataTree.close() does not work yet!
# dt = datatree.open_datatree(filepath, engine="netcdf4")
# TODO: use this within a context manager

# dt = xr.open_datatree(filepath, engine="netcdf4")
# dt.close()

# h5py is a heavy dependency!
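The TODO in this hunk can be sketched as below. `open_file` is a hypothetical stand-in for `xr.open_datatree(filepath, engine="netcdf4")`, injected as a parameter so the loop is testable without real HDF5 files; the actual gpm-api function takes only `filepaths`.

```python
import contextlib

def get_corrupted_filepaths(filepaths, open_file):
    """Return the subset of filepaths that fail to open."""
    corrupted = []
    for filepath in filepaths:
        try:
            # Context manager guarantees the handle is closed even when opening succeeds
            with open_file(filepath):
                pass
        except OSError:
            corrupted.append(filepath)
    return corrupted

# Demo with a fake opener: paths containing "bad" raise, as a corrupted granule would
@contextlib.contextmanager
def fake_open(path):
    if "bad" in path:
        raise OSError("corrupted granule")
    yield None

corrupted = get_corrupted_filepaths(["ok.HDF5", "bad.HDF5"], fake_open)
assert corrupted == ["bad.HDF5"]
```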
3 changes: 1 addition & 2 deletions gpm/tests/test_dataset/test_attrs.py
@@ -27,7 +27,6 @@
"""This module test the GPM-API Dataset attributes."""

import xarray as xr
from datatree import DataTree

from gpm.dataset import attrs

@@ -118,7 +117,7 @@ def test_get_granule_attrs(monkeypatch):
)

# Test with non-nested dictionary
dt = DataTree()
dt = xr.DataTree()
dt.attrs = {
"key_1": "value_1",
"invalid_key": "value_2",
9 changes: 4 additions & 5 deletions gpm/tests/test_dataset/test_coords.py
@@ -31,7 +31,6 @@
import numpy as np
import pandas as pd
import xarray as xr
from datatree import DataTree
from deepdiff import DeepDiff

from gpm.dataset import coords
@@ -76,7 +75,7 @@ def test_get_orbit_coords():
lat = xr.DataArray(rng.random(shape), dims=["along_track", "cross_track"])
time_array, time_ds = get_random_datetime_array_and_dataset(shape[0])

dt = DataTree.from_dict({scan_mode: DataTree.from_dict({"ScanTime": time_ds})})
dt = xr.DataTree.from_dict({scan_mode: xr.DataTree.from_dict({"ScanTime": time_ds})})
dt[scan_mode]["Longitude"] = lon
dt[scan_mode]["Latitude"] = lat
dt.attrs["FileHeader"] = f"GranuleNumber={granule_id}"
@@ -124,7 +123,7 @@ def test_get_grid_coords():
ds.coords["lon"] = ("lon", lon)
ds.coords["lat"] = ("lat", lat)

dt = DataTree.from_dict({scan_mode: ds})
dt = xr.DataTree.from_dict({scan_mode: ds})
dt.attrs["FileHeader"] = f"StartGranuleDateTime={time_formated};\nTimeInterval=HALF_HOUR;"

# Test get_grid_coords
@@ -151,12 +150,12 @@ def test_get_coords(monkeypatch):

# Test get_coords
scan_mode = "S1"
dt = DataTree()
dt = xr.DataTree()
returned_coords = coords.get_coords(dt, scan_mode)
assert returned_coords == "return from get_orbit_coords"

scan_mode = "Grid"
dt = DataTree()
dt = xr.DataTree()
returned_coords = coords.get_coords(dt, scan_mode)
assert returned_coords == "return from get_grid_coords"

2 changes: 1 addition & 1 deletion gpm/tests/test_dataset/test_datatree.py
@@ -24,4 +24,4 @@
# SOFTWARE.

# -----------------------------------------------------------------------------.
"""This module test the GPM-API DataTree."""
"""This module tests the GPM-API xr.DataTree."""
5 changes: 2 additions & 3 deletions gpm/tests/test_dataset/test_dimensions.py
@@ -28,7 +28,6 @@

import numpy as np
import xarray as xr
from datatree import DataTree

from gpm.dataset import dimensions

@@ -87,7 +86,7 @@ def test_get_datatree_dim_dict():
dataarray_2.attrs["DimensionNames"] = "replaced_dim_2"
dataset_1 = xr.Dataset(data_vars={"var_1": dataarray_1})
dataset_2 = xr.Dataset(data_vars={"var_2": dataarray_2})
datatree = DataTree.from_dict({"dataset_1": dataset_1, "dataset_2": dataset_2})
datatree = xr.DataTree.from_dict({"dataset_1": dataset_1, "dataset_2": dataset_2})

expected_dict = {
"phony_dim_1": "replaced_dim_1",
@@ -180,7 +179,7 @@ def test_rename_datatree_dimensions(monkeypatch):
dataarray_2.attrs["DimensionNames"] = "intermediate_2"
dataset_1 = xr.Dataset(data_vars={"var_1": dataarray_1})
dataset_2 = xr.Dataset(data_vars={"var_2": dataarray_2})
datatree = DataTree.from_dict({"dataset_1": dataset_1, "dataset_2": dataset_2})
datatree = xr.DataTree.from_dict({"dataset_1": dataset_1, "dataset_2": dataset_2})

# With use_api_defaults=True, which replaces intermediate_2 with final_2
returned_datatree = dimensions._rename_datatree_dimensions(datatree)
11 changes: 5 additions & 6 deletions gpm/tests/test_dataset/test_granule.py
@@ -32,7 +32,6 @@
import pandas as pd
import pytest
import xarray as xr
from datatree import DataTree

from gpm.dataset import conventions, datatree, granule
from gpm.dataset.conventions import finalize_dataset
@@ -94,7 +93,7 @@ def test_open_granule(monkeypatch):
scan_mode = "FS"

ds = xr.Dataset()
dt = DataTree.from_dict({scan_mode: ds})
dt = xr.DataTree.from_dict({scan_mode: ds})

# Mock units tested elsewhere
monkeypatch.setattr(
@@ -223,12 +222,12 @@ def test_get_flattened_scan_mode_dataset():

# Build source datatree
scan_mode = "scan_mode"
dt = DataTree.from_dict(
dt = xr.DataTree.from_dict(
{
scan_mode: DataTree.from_dict(
scan_mode: xr.DataTree.from_dict(
{
"group_1": DataTree(),
"group_2": DataTree(),
"group_1": xr.DataTree(),
"group_2": xr.DataTree(),
},
),
},
