From f5bd2f04e8083f080af6c1b7bbbca1676701da16 Mon Sep 17 00:00:00 2001 From: nicolasK Date: Wed, 12 Feb 2025 18:49:21 +0100 Subject: [PATCH 1/6] feat(warnings): add new known warnings --- earthdaily/earthdatastore/cube_utils/__init__.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/earthdaily/earthdatastore/cube_utils/__init__.py b/earthdaily/earthdatastore/cube_utils/__init__.py index 4cade93d..96a065d3 100644 --- a/earthdaily/earthdatastore/cube_utils/__init__.py +++ b/earthdaily/earthdatastore/cube_utils/__init__.py @@ -263,7 +263,14 @@ def _disable_known_datacube_warning(): message="invalid value encountered in cast", module="dask.array.chunk", ) - + warnings.filterwarnings( + "ignore", + category=RuntimeWarning, + message="All-NaN slice encountered") + warnings.filterwarnings( + "ignore", + category=RuntimeWarning, + message="Mean of empty slice") def datacube( items_collection=None, From 2009fb8bcfedbf47c40b13b319ac27846ebf861a Mon Sep 17 00:00:00 2001 From: nkarasiak <12277252+nkarasiak@users.noreply.github.com> Date: Wed, 12 Feb 2025 17:49:41 +0000 Subject: [PATCH 2/6] style(ruff) : automatic lint/format --- earthdaily/earthdatastore/cube_utils/__init__.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/earthdaily/earthdatastore/cube_utils/__init__.py b/earthdaily/earthdatastore/cube_utils/__init__.py index 96a065d3..3c31ac9e 100644 --- a/earthdaily/earthdatastore/cube_utils/__init__.py +++ b/earthdaily/earthdatastore/cube_utils/__init__.py @@ -264,13 +264,12 @@ def _disable_known_datacube_warning(): module="dask.array.chunk", ) warnings.filterwarnings( - "ignore", - category=RuntimeWarning, - message="All-NaN slice encountered") + "ignore", category=RuntimeWarning, message="All-NaN slice encountered" + ) warnings.filterwarnings( - "ignore", - category=RuntimeWarning, - message="Mean of empty slice") + "ignore", category=RuntimeWarning, message="Mean of empty slice" + ) + def datacube( 
items_collection=None, From 41a7f52e4c71b93cde00c348e449051ea3cd135e Mon Sep 17 00:00:00 2001 From: nicolasK Date: Thu, 13 Feb 2025 11:57:29 +0100 Subject: [PATCH 3/6] release(v0.5.2) - new duplicates item detection method - fix issue with rescale and same datetimes --- docs/CHANGELOG.md | 12 +++++ earthdaily/__init__.py | 2 +- earthdaily/earthdatastore/__init__.py | 4 +- .../earthdatastore/_filter_duplicate.py | 53 ++++++++++++------- .../earthdatastore/cube_utils/__init__.py | 7 ++- examples/field_evolution.py | 14 ++--- 6 files changed, 60 insertions(+), 32 deletions(-) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 481df269..b581f95a 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -4,6 +4,18 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.5.2] - 2025-02-13 + +### Added + +- New default method for duplicates detection using `proj:transform` instead +of bbox (huge gap). Change can be done using +`filter_duplicate_items(...,method='bbox')`. + +### Fixed + +- Issues with rescale when same datetime between products. 
+ ## [0.5.1] - 2025-02-12 ### Added diff --git a/earthdaily/__init__.py b/earthdaily/__init__.py index d0776b0c..05964c10 100644 --- a/earthdaily/__init__.py +++ b/earthdaily/__init__.py @@ -6,7 +6,7 @@ __all__ = ["options"] -__version__ = "0.5.1" +__version__ = "0.5.2" def EarthDataStore( diff --git a/earthdaily/earthdatastore/__init__.py b/earthdaily/earthdatastore/__init__.py index a8516e23..3624ca66 100644 --- a/earthdaily/earthdatastore/__init__.py +++ b/earthdaily/earthdatastore/__init__.py @@ -1166,7 +1166,8 @@ def datacube( ) ds["time"] = ds.time.astype("M8[ns]") if ds.time.size != ds_mask.time.size: - ds = _select_last_common_occurrences(ds, ds_mask) + raise NotImplementedError("Sensor and cloudmask don't havethe same time.\ + {ds.time.size} for sensor and {ds_mask.time.size} for cloudmask.") ds_mask["time"] = ds["time"].time ds_mask = cube_utils._match_xy_dims(ds_mask, ds) ds = xr.merge((ds, ds_mask), compat="override") @@ -1460,6 +1461,7 @@ def ag_cloud_mask_from_items(items): collections=collections, ids=ids_[items_start_idx : items_start_idx + step], limit=step, + deduplicate_items=False, **kwargs, ) items_list.extend(list(items)) diff --git a/earthdaily/earthdatastore/_filter_duplicate.py b/earthdaily/earthdatastore/_filter_duplicate.py index 624460f4..939a51a7 100644 --- a/earthdaily/earthdatastore/_filter_duplicate.py +++ b/earthdaily/earthdatastore/_filter_duplicate.py @@ -15,28 +15,28 @@ class StacGroup: ---------- datetime : datetime Reference datetime for the group - bbox : tuple[float, ...] | None + footprint : tuple[float, ...] | None Bounding box coordinates as (west, south, east, north) or None """ datetime: datetime - bbox: Optional[Tuple[float, ...]] + footprint: Optional[Tuple[float, ...]] def matches( self, other_dt: datetime, - other_bbox: Optional[Tuple[float, ...]], + other_footprint: Optional[Tuple[float, ...]], threshold: timedelta, ) -> bool: """ - Check if another datetime and bbox match this group within threshold. 
+ Check if another datetime and footprint match this group within threshold. Parameters ---------- other_dt : datetime Datetime to compare against - other_bbox : tuple[float, ...] | None - Bounding box to compare against + other_footprint : tuple[float, ...] | None + Proj:transform or Bounding box to compare against threshold : timedelta Maximum allowed time difference @@ -45,7 +45,7 @@ bool True if the other datetime and bbox match within threshold """ - return self.bbox == other_bbox and abs(self.datetime - other_dt) <= threshold + return self.footprint == other_footprint and abs(self.datetime - other_dt) <= threshold @lru_cache(maxsize=1024) @@ -66,9 +66,9 @@ def _parse_datetime(dt_str: str) -> datetime: return datetime.fromisoformat(dt_str.replace("Z", "+00:00")) -def _extract_item_metadata(item: Dict) -> Tuple[datetime, Optional[Tuple[float, ...]]]: +def _extract_item_metadata(item: Dict, method='proj:transform') -> Tuple[datetime, Optional[Tuple[float, ...]]]: """ - Extract datetime and bbox from a STAC item. + Extract datetime and footprint from a STAC item. Parameters ---------- item : Dict STAC item dictionary Returns ------- tuple[datetime, tuple[float, ...]
| None] - Tuple of (datetime, bbox) + Tuple of (datetime, footprint) """ dt = _parse_datetime(item["properties"]["datetime"]) - bbox = tuple(item["bbox"]) if "bbox" in item else None - return dt, bbox + _first_item = list(item['assets'].keys())[0] + if method=='proj:transform': + footprint = tuple(item['assets'][_first_item].get('proj:transform',item['bbox'])) + elif method=='bbox': + footprint = tuple(round(bbox,6) for bbox in item.bbox) + return dt, footprint def _group_items( @@ -106,12 +110,12 @@ def _group_items( groups: Dict[StacGroup, List[Dict]] = {} for item in items: - dt, bbox = _extract_item_metadata(item) + dt, footprint = _extract_item_metadata(item) # Find matching group or create new one matching_group = next( - (group for group in groups if group.matches(dt, bbox, time_threshold)), - StacGroup(datetime=dt, bbox=bbox), + (group for group in groups if group.matches(dt, footprint, time_threshold)), + StacGroup(datetime=dt, footprint=footprint), ) groups.setdefault(matching_group, []).append(item) @@ -148,7 +152,10 @@ def _select_latest_items(items: List[Dict]) -> List[Dict]: def filter_duplicate_items( - items: ItemCollection, time_threshold: timedelta = timedelta(minutes=5) + items: ItemCollection, + time_threshold: timedelta = timedelta(minutes=5), + method='proj:transform' + ) -> ItemCollection: """ Deduplicate STAC items based on spatial and temporal proximity. @@ -164,11 +171,14 @@ def filter_duplicate_items( ISO format acquisition timestamp - properties.updated : str ISO format update timestamp - - bbox : list[float], optional - Bounding box coordinates [west, south, east, north] + - footprint : list[float], optional + Bounding box coordinates [west, south, east, north] or proj:transform. 
time_threshold : timedelta, optional Maximum time difference to consider items as duplicates, by default 5 minutes + + method : str + 'proj:transform' or 'bbox' to compare against Returns ------- @@ -216,5 +226,10 @@ def filter_duplicate_items( for group_items in grouped_items.values() for item in _select_latest_items(group_items) ] - logging.info(f"Deduplication removes {len(items)-len(deduplicated)} items.") + duplicates = (len(items)-len(deduplicated)) + if duplicates: + logging.info(f"Deduplication removes {duplicates} items.") + + deduplicated = sorted(deduplicated, key=lambda x: x['properties']['datetime']) + return ItemCollection(deduplicated) diff --git a/earthdaily/earthdatastore/cube_utils/__init__.py b/earthdaily/earthdatastore/cube_utils/__init__.py index 96a065d3..cd9d4ecf 100644 --- a/earthdaily/earthdatastore/cube_utils/__init__.py +++ b/earthdaily/earthdatastore/cube_utils/__init__.py @@ -392,7 +392,7 @@ def datacube( json.dumps(ds.coords[coord].values[idx]) for idx in range(ds.coords[coord].size) ] - + ds = ds.sortby(ds.time) return ds @@ -510,7 +510,7 @@ def rescale_assets_with_items( # Track scaling parameters scales.setdefault(ds_asset, {}).setdefault(scale, {}).setdefault( offset, [] - ).append(time) + ).append(idx) # Apply rescaling if scales: @@ -519,8 +519,7 @@ def rescale_assets_with_items( asset_scaled = [] for scale, offset_data in scale_data.items(): for offset, times in offset_data.items(): - mask = np.isin(ds.time, times) - asset_scaled.append(ds[[asset]].isel(time=mask) * scale + offset) + asset_scaled.append(ds[[asset]].isel(time=times) * scale + offset) scaled_assets.append(xr.concat(asset_scaled, dim="time")) # Merge scaled assets diff --git a/examples/field_evolution.py b/examples/field_evolution.py index 2cd4d515..19d7fdf2 100644 --- a/examples/field_evolution.py +++ b/examples/field_evolution.py @@ -28,27 +28,27 @@ # Search for collection items for June 2022. 
# where at least 50% of the field is clear according to the native cloudmask. -pivot_cube = eds.datacube( +datacube = eds.datacube( "sentinel-2-l2a", intersects=pivot, - datetime=["2022-06"], + datetime=["2022-06","2022-07"], assets=["red", "green", "blue", "nir"], mask_with="native", clear_cover=50 ) -pivot_cube.clear_percent.plot.scatter(x="time") +datacube.clear_percent.plot.scatter(x="time") ############################################################################## # Add spectral indices using spyndex from earthdaily accessor # ------------------------------------------------------------ -pivot_cube = pivot_cube.ed.add_indices(['NDVI']) +datacube = datacube.ed.add_indices(['NDVI']) ############################################################################## # Plots cube with SCL with at least 50% of clear data # ---------------------------------------------------- -pivot_cube = pivot_cube.load() -pivot_cube.ed.plot_rgb(col_wrap=4, vmin=0, vmax=.3) +datacube = datacube.load() +datacube.ed.plot_rgb(col_wrap=4, vmin=0, vmax=.3) plt.title("Pivot evolution masked with native cloudmasks") plt.show() @@ -57,7 +57,7 @@ # Compute zonal stats for the pivot # ---------------------------------------------------- -zonal_stats = pivot_cube.ed.zonal_stats(pivot, ['mean','max','min']) +zonal_stats = datacube.ed.zonal_stats(pivot, ['mean','max','min']) zonal_stats.isel(feature=0).to_array(dim="band").plot.line( x="time", col="band", hue="zonal_statistics", col_wrap=3 From 88aa42a256e113701cc6ec201b9c72e344cae9c0 Mon Sep 17 00:00:00 2001 From: nkarasiak <12277252+nkarasiak@users.noreply.github.com> Date: Thu, 13 Feb 2025 10:59:34 +0000 Subject: [PATCH 4/6] style(ruff) : automatic lint/format --- earthdaily/earthdatastore/__init__.py | 6 ++-- .../earthdatastore/_filter_duplicate.py | 30 +++++++++++-------- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/earthdaily/earthdatastore/__init__.py b/earthdaily/earthdatastore/__init__.py index 3624ca66..ba77f62b 
100644 --- a/earthdaily/earthdatastore/__init__.py +++ b/earthdaily/earthdatastore/__init__.py @@ -1166,8 +1166,10 @@ def datacube( ) ds["time"] = ds.time.astype("M8[ns]") if ds.time.size != ds_mask.time.size: - raise NotImplementedError("Sensor and cloudmask don't havethe same time.\ - {ds.time.size} for sensor and {ds_mask.time.size} for cloudmask.") + raise NotImplementedError( + "Sensor and cloudmask don't have the same time.\ + {ds.time.size} for sensor and {ds_mask.time.size} for cloudmask." + ) ds_mask["time"] = ds["time"].time ds_mask = cube_utils._match_xy_dims(ds_mask, ds) ds = xr.merge((ds, ds_mask), compat="override") diff --git a/earthdaily/earthdatastore/_filter_duplicate.py b/earthdaily/earthdatastore/_filter_duplicate.py index 939a51a7..ff1897e3 100644 --- a/earthdaily/earthdatastore/_filter_duplicate.py +++ b/earthdaily/earthdatastore/_filter_duplicate.py @@ -45,7 +45,10 @@ def matches( bool True if the other datetime and bbox match within threshold """ - return self.footprint == other_footprint and abs(self.datetime - other_dt) <= threshold + return ( + self.footprint == other_footprint + and abs(self.datetime - other_dt) <= threshold + ) @lru_cache(maxsize=1024) @@ -66,7 +69,9 @@ def _parse_datetime(dt_str: str) -> datetime: return datetime.fromisoformat(dt_str.replace("Z", "+00:00")) -def _extract_item_metadata(item: Dict, method='proj:transform') -> Tuple[datetime, Optional[Tuple[float, ...]]]: +def _extract_item_metadata( + item: Dict, method="proj:transform" +) -> Tuple[datetime, Optional[Tuple[float, ...]]]: """ Extract datetime and footprint from a STAC item. 
@@ -81,11 +86,13 @@ def _extract_item_metadata(item: Dict, method='proj:transform') -> Tuple[datetim Tuple of (datetime, footprint) """ dt = _parse_datetime(item["properties"]["datetime"]) - _first_item = list(item['assets'].keys())[0] - if method=='proj:transform': - footprint = tuple(item['assets'][_first_item].get('proj:transform',item['bbox'])) - elif method=='bbox': - footprint = tuple(round(bbox,6) for bbox in item.bbox) + _first_item = list(item["assets"].keys())[0] + if method == "proj:transform": + footprint = tuple( + item["assets"][_first_item].get("proj:transform", item["bbox"]) + ) + elif method == "bbox": + footprint = tuple(round(bbox, 6) for bbox in item.bbox) return dt, footprint @@ -154,8 +161,7 @@ def _select_latest_items(items: List[Dict]) -> List[Dict]: def filter_duplicate_items( items: ItemCollection, time_threshold: timedelta = timedelta(minutes=5), - method='proj:transform' - + method="proj:transform", ) -> ItemCollection: """ Deduplicate STAC items based on spatial and temporal proximity. 
@@ -176,7 +182,7 @@ def filter_duplicate_items( time_threshold : timedelta, optional Maximum time difference to consider items as duplicates, by default 5 minutes - + method : str 'proj:transform' or 'bbox' to compare against @@ -226,10 +232,10 @@ def filter_duplicate_items( for group_items in grouped_items.values() for item in _select_latest_items(group_items) ] - duplicates = (len(items)-len(deduplicated)) + duplicates = len(items) - len(deduplicated) if duplicates: logging.info(f"Deduplication removes {duplicates} items.") - deduplicated = sorted(deduplicated, key=lambda x: x['properties']['datetime']) + deduplicated = sorted(deduplicated, key=lambda x: x["properties"]["datetime"]) return ItemCollection(deduplicated) From b6c14479b600826e484843fadba4b09246540099 Mon Sep 17 00:00:00 2001 From: nicolasK Date: Thu, 13 Feb 2025 12:27:44 +0100 Subject: [PATCH 5/6] fix(deduplicates) : when no assets --- earthdaily/earthdatastore/__init__.py | 1 + earthdaily/earthdatastore/_filter_duplicate.py | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/earthdaily/earthdatastore/__init__.py b/earthdaily/earthdatastore/__init__.py index ba77f62b..8ffe3dc2 100644 --- a/earthdaily/earthdatastore/__init__.py +++ b/earthdaily/earthdatastore/__init__.py @@ -1118,6 +1118,7 @@ def datacube( "eq": cross_calibration_collection }, }, + deduplicate_items=False ) except Warning: raise Warning( diff --git a/earthdaily/earthdatastore/_filter_duplicate.py b/earthdaily/earthdatastore/_filter_duplicate.py index ff1897e3..213e6eec 100644 --- a/earthdaily/earthdatastore/_filter_duplicate.py +++ b/earthdaily/earthdatastore/_filter_duplicate.py @@ -86,13 +86,14 @@ def _extract_item_metadata( Tuple of (datetime, footprint) """ dt = _parse_datetime(item["properties"]["datetime"]) - _first_item = list(item["assets"].keys())[0] - if method == "proj:transform": + any_asset = item.get('assets',{}) != {} + if any_asset and method == 'proj:transform': + _first_item = 
list(item["assets"].keys())[0] footprint = tuple( item["assets"][_first_item].get("proj:transform", item["bbox"]) ) - elif method == "bbox": - footprint = tuple(round(bbox, 6) for bbox in item.bbox) + else: + footprint = tuple(round(bbox, 6) for bbox in item['bbox']) return dt, footprint From f2b7d52d848c2fc26555b66f0c6bad588c53df60 Mon Sep 17 00:00:00 2001 From: nkarasiak <12277252+nkarasiak@users.noreply.github.com> Date: Thu, 13 Feb 2025 11:28:10 +0000 Subject: [PATCH 6/6] style(ruff) : automatic lint/format --- earthdaily/earthdatastore/__init__.py | 2 +- earthdaily/earthdatastore/_filter_duplicate.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/earthdaily/earthdatastore/__init__.py b/earthdaily/earthdatastore/__init__.py index 8ffe3dc2..fe4822bd 100644 --- a/earthdaily/earthdatastore/__init__.py +++ b/earthdaily/earthdatastore/__init__.py @@ -1118,7 +1118,7 @@ def datacube( "eq": cross_calibration_collection }, }, - deduplicate_items=False + deduplicate_items=False, ) except Warning: raise Warning( diff --git a/earthdaily/earthdatastore/_filter_duplicate.py b/earthdaily/earthdatastore/_filter_duplicate.py index 213e6eec..3189ec0d 100644 --- a/earthdaily/earthdatastore/_filter_duplicate.py +++ b/earthdaily/earthdatastore/_filter_duplicate.py @@ -86,14 +86,14 @@ def _extract_item_metadata( Tuple of (datetime, footprint) """ dt = _parse_datetime(item["properties"]["datetime"]) - any_asset = item.get('assets',{}) != {} - if any_asset and method == 'proj:transform': - _first_item = list(item["assets"].keys())[0] + any_asset = item.get("assets", {}) != {} + if any_asset and method == "proj:transform": + _first_item = list(item["assets"].keys())[0] footprint = tuple( item["assets"][_first_item].get("proj:transform", item["bbox"]) ) else: - footprint = tuple(round(bbox, 6) for bbox in item['bbox']) + footprint = tuple(round(bbox, 6) for bbox in item["bbox"]) return dt, footprint