-
| Hi,
import h5py
from xarray.backends import BackendEntrypoint, BackendArray
from xarray import Dataset, DataArray, Variable
from xarray.core import indexing
import numpy as np
# Path of the throwaway HDF5 file that this example writes and then reads back.
filename = "/tmp/testsgli.h5"
# Name of the single dataset stored in that file.
varname = "Lt_VN01"
def create_h5_data(filename, varname, shape):
    """Create an HDF5 file holding one dataset of random values.

    Parameters
    ----------
    filename : str
        Path of the file to write; it is opened with ``mode="w"``.
    varname : str
        Name of the dataset created inside the file.
    shape : tuple of int
        Shape of the generated array, unpacked into ``np.random.rand``.
    """
    # The context manager closes the file even when the write raises, where
    # a trailing explicit close() call would have been skipped.
    with h5py.File(filename, mode="w") as h5f:
        h5f[varname] = np.random.rand(*shape)
# Write a 2000 x 2000 array to the example file so the backend has data to open.
create_h5_data(filename, varname, (2000, 2000))
class H5Array(BackendArray):
    """Adapter exposing an array-like object to xarray's lazy indexing.

    In this example the wrapped object is an h5py dataset. ``shape`` and
    ``dtype`` are copied from it at construction time; the data itself is
    only read when the instance is indexed.
    """

    def __init__(self, array):
        """Remember ``array`` together with its ``shape`` and ``dtype``."""
        self.shape, self.dtype = array.shape, array.dtype
        self.array = array

    def _getitem(self, key):
        """Index the wrapped array with the key prepared by the adapter."""
        wrapped = self.array
        return wrapped[key]

    def __getitem__(self, key):
        """Resolve an xarray indexer through ``explicit_indexing_adapter``.

        The wrapped array is declared as supporting only
        ``IndexingSupport.BASIC``; the adapter calls ``_getitem`` to do the
        actual read.
        """
        support = indexing.IndexingSupport.BASIC
        result = indexing.explicit_indexing_adapter(
            key, self.shape, support, self._getitem
        )
        return result
class SGLIBackend(BackendEntrypoint):
    """Example xarray backend entrypoint reading one dataset from an HDF5 file."""

    def open_dataset(self, filename, *, drop_variables=None, **kwargs):
        """Return a Dataset whose ``Lt_VN01`` variable is lazily indexed.

        ``drop_variables`` and any extra keyword arguments are accepted but
        not used. The HDF5 file is left open so that the lazily indexed
        variable can still read from it after this method returns.
        """
        handle = h5py.File(filename)
        source = handle["Lt_VN01"]
        lazy_data = indexing.LazilyIndexedArray(H5Array(source))
        # NOTE(review): ``source.chunks`` is forwarded unchanged. When it is
        # None, asking xarray for dask chunks (``chunks=...``) fails with
        # "AttributeError: 'NoneType' object has no attribute 'get'".
        variable = Variable(
            ["y", "x"],
            lazy_data,
            encoding={"preferred_chunks": source.chunks},
        )
        ds = Dataset()
        ds["Lt_VN01"] = variable
        return ds
print(SGLIBackend().open_dataset(filename)[varname].data)
As you can see, the result is not a dask array. Annex question: I see that the source of existing engines uses a datastore; where can I find documentation about how to use it in the context of a backend? | 
Beta Was this translation helpful? Give feedback.
Replies: 2 comments 3 replies
-
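| as far as I can tell, you're not supposed to handle `dask` in the backend; this will be taken care of by `open_dataset`.
As such, I think you should change the last line to `xr.open_dataset(filename, engine=SGLIBackend, chunks={})`, but that raises an `AttributeError`.
Traceback
---------------------------------------------------------------------------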
AttributeError                            Traceback (most recent call last)
Input In [1], in <cell line: 49>()
     41         ds["Lt_VN01"] = Variable(
     42             ["y", "x"],
     43             indexing.LazilyIndexedArray(H5Array(h5_arr)),
     44             encoding={"preferred_chunks": h5_arr.chunks},
     45         )
     46         return ds
---> 49 xr.open_dataset(filename, engine=SGLIBackend, chunks={})
File .../xarray/backends/api.py:545, in open_dataset(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, inline_array, backend_kwargs, **kwargs)
    538 overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None)
    539 backend_ds = backend.open_dataset(
    540     filename_or_obj,
    541     drop_variables=drop_variables,
    542     **decoders,
    543     **kwargs,
    544 )
--> 545 ds = _dataset_from_backend_dataset(
    546     backend_ds,
    547     filename_or_obj,
    548     engine,
    549     chunks,
    550     cache,
    551     overwrite_encoded_chunks,
    552     inline_array,
    553     drop_variables=drop_variables,
    554     **decoders,
    555     **kwargs,
    556 )
    557 return ds
File .../xarray/backends/api.py:357, in _dataset_from_backend_dataset(backend_ds, filename_or_obj, engine, chunks, cache, overwrite_encoded_chunks, inline_array, **extra_tokens)
    355     ds = backend_ds
    356 else:
--> 357     ds = _chunk_ds(
    358         backend_ds,
    359         filename_or_obj,
    360         engine,
    361         chunks,
    362         overwrite_encoded_chunks,
    363         inline_array,
    364         **extra_tokens,
    365     )
    367 ds.set_close(backend_ds._close)
    369 # Ensure source filename always stored in dataset object
File .../xarray/backends/api.py:325, in _chunk_ds(backend_ds, filename_or_obj, engine, chunks, overwrite_encoded_chunks, inline_array, **extra_tokens)
    323 variables = {}
    324 for name, var in backend_ds.variables.items():
--> 325     var_chunks = _get_chunk(var, chunks)
    326     variables[name] = _maybe_chunk(
    327         name,
    328         var,
   (...)
    333         inline_array=inline_array,
    334     )
    335 return backend_ds._replace(variables)
File .../xarray/core/dataset.py:211, in _get_chunk(var, chunks)
    209 # Determine the explicit requested chunks.
    210 preferred_chunks = var.encoding.get("preferred_chunks", {})
--> 211 preferred_chunk_shape = tuple(
    212     preferred_chunks.get(dim, size) for dim, size in zip(dims, shape)
    213 )
    214 if isinstance(chunks, Number) or (chunks == "auto"):
    215     chunks = dict.fromkeys(dims, chunks)
File .../xarray/core/dataset.py:212, in <genexpr>(.0)
    209 # Determine the explicit requested chunks.
    210 preferred_chunks = var.encoding.get("preferred_chunks", {})
    211 preferred_chunk_shape = tuple(
--> 212     preferred_chunks.get(dim, size) for dim, size in zip(dims, shape)
    213 )
    214 if isinstance(chunks, Number) or (chunks == "auto"):
    215     chunks = dict.fromkeys(dims, chunks)
AttributeError: 'NoneType' object has no attribute 'get'
which means that the `preferred_chunks` encoding must be a dict rather than `None`.
Working example
In [1]: import h5py
   ...: from xarray.backends import BackendEntrypoint, BackendArray
   ...: from xarray import Dataset, DataArray, Variable
   ...: from xarray.core import indexing
   ...: import xarray as xr
   ...: import numpy as np
   ...: 
   ...: filename = "/tmp/testsgli.h5"
   ...: varname = "Lt_VN01"
   ...: 
   ...: 
   ...: def create_h5_data(filename, varname, shape):
   ...:     h5f = h5py.File(filename, mode="w")
   ...:     h5f[varname] = np.random.rand(*shape)
   ...:     h5f.close()
   ...: 
   ...: 
   ...: create_h5_data(filename, varname, (2000, 2000))
   ...: 
   ...: 
   ...: class H5Array(BackendArray):
   ...:     def __init__(self, array):
   ...:         self.shape = array.shape
   ...:         self.dtype = array.dtype
   ...:         self.array = array
   ...: 
   ...:     def __getitem__(self, key):
   ...:         return indexing.explicit_indexing_adapter(
   ...:             key, self.shape, indexing.IndexingSupport.BASIC, self._getitem
   ...:         )
   ...: 
   ...:     def _getitem(self, key):
   ...:         return self.array[key]
   ...: 
   ...: 
   ...: class SGLIBackend(BackendEntrypoint):
   ...:     def open_dataset(self, filename, *, drop_variables=None, **kwargs):
   ...:         ds = Dataset()
   ...:         h5f = h5py.File(filename)
   ...:         h5_arr = h5f["Lt_VN01"]
   ...:         ds["Lt_VN01"] = Variable(
   ...:             ["y", "x"],
   ...:             indexing.LazilyIndexedArray(H5Array(h5_arr)),
   ...:             encoding={"preferred_chunks": {}},
   ...:         )
   ...:         return ds
   ...: 
   ...: 
   ...: xr.open_dataset(filename, engine=SGLIBackend, chunks={})
Out[1]: 
<xarray.Dataset>
Dimensions:  (y: 2000, x: 2000)
Dimensions without coordinates: y, x
Data variables:
    Lt_VN01  (y, x) float64 dask.array<chunksize=(2000, 2000), meta=np.ndarray>
 As mentioned in one of the PRs that introduced the custom backends, the datastore is an implementation detail and can be removed at any moment (hence the lack of documentation). | 
Beta Was this translation helpful? Give feedback.
-
| Does https://tutorial.xarray.dev/advanced/backends/2.Backend_with_Lazy_Loading.html help? (suggestions and PRs to improve docs and tutorials are always welcome) | 
Beta Was this translation helpful? Give feedback.
as far as I can tell, you're not supposed to handle
`dask` in the backend; this will be taken care of by `open_dataset`. As such, I think you should change the last line to
`xr.open_dataset(filename, engine=SGLIBackend, chunks={})`, but that raises an `AttributeError`.
Traceback
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Input In [1], in <cell line: 49>() 41 ds["Lt_VN01"] = Variable( 42 ["y", "x"], 43 indexing.LazilyIndexedArray(H5A…