Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/load local content #24

Merged
merged 18 commits into from
Jan 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
license = { file = "LICENSE" }
name = "goes-dl"
readme = { file = "README.md", content-type = "text/markdown" }
requires-python = ">=3.8"
requires-python = ">=3.9"

[project.urls]
# Homepage = "https://example.com"
Expand All @@ -70,6 +70,9 @@
version = { attr = "GOES_DL.__version__" }

[tool.setuptools.package-data]
GOES_DL = [
"py.typed",
]
config = [
"settings.json",
]
Expand All @@ -84,12 +87,16 @@
description = "Satellite imagery downloader for GOES and GridSat datasets"
license = "MIT"
name = "goes-dl"
packages = [
{ include = "GOES_DL" },
{ include = "GOES_DL/py.typed" },
]
readme = "README.md"
version = "0.1.0"

[tool.poetry.dependencies]
boto3 = "~1.35.42"
python = "^3.8"
python = "^3.9"
requests = "~2.32.3"

[tool.black]
Expand Down
2 changes: 2 additions & 0 deletions src/GOES_DL/datasource/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,14 @@
from .datasource_aws import DatasourceAWS as DatasourceAWS
from .datasource_cache import DatasourceCache as DatasourceCache
from .datasource_http import DatasourceHTTP as DatasourceHTTP
from .datasource_local import DatasourceLocal as DatasourceLocal
from .datasource_repository import DatasourceRepository as DatasourceRepository

__all__ = [
"Datasource",
"DatasourceAWS",
"DatasourceCache",
"DatasourceHTTP",
"DatasourceLocal",
"DatasourceRepository",
]
151 changes: 151 additions & 0 deletions src/GOES_DL/datasource/datasource_local.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
"""
Provide the DatasourceLocal class for handling local-based data sources.

Classes:
DatasourceLocal: Handle local-based data sources.
"""

import contextlib
import os
from pathlib import Path

from ..utils import FileRepository
from .constants import DownloadStatus
from .datasource_base import DatasourceBase
from .datasource_cache import DatasourceCache
from .datasource_repository import DatasourceRepository


class DatasourceLocal(DatasourceBase):
    """
    Datasource backed by a directory tree on a local drive.

    Files are read from a root folder through a `FileRepository` and
    copied into the repository managed by the base class.

    Attributes
    ----------
    source : FileRepository
        The file repository rooted at the datasource root path. All
        file reads and directory listings go through it.

    Methods
    -------
    download_file(file_path: str)
        Copy a file from the datasource into the local repository.
    listdir(dir_path: str)
        List the files found in a directory of the datasource.
    """

    source: FileRepository

    def __init__(
        self,
        root_path: str | Path,
        repository: str | Path | DatasourceRepository | None = None,
        cache: float | DatasourceCache | None = None,
    ) -> None:
        """
        Set up a datasource rooted at an existing local directory.

        Parameters
        ----------
        root_path : str | Path
            The root directory of the local data source.
        repository : str | Path | DatasourceRepository | None, optional
            The directory (or repository object) where retrieved files
            are stored, by default None. Forwarded unchanged to the
            base class initialiser.
        cache : float | DatasourceCache | None, optional
            The cache expiration time in seconds (or a cache object),
            by default None. Forwarded unchanged to the base class
            initialiser.

        Raises
        ------
        ValueError
            If `root_path` is not an existing directory or it cannot
            be accessed.
        """
        root_is_usable = os.path.isdir(root_path)

        if not root_is_usable:
            raise ValueError(
                f"Path '{root_path}' does not exist or you have no access."
            )

        # The source is created before the base class is initialised,
        # mirroring the original statement order.
        self.source = FileRepository(root_path)

        super().__init__(str(root_path), repository, cache)

    def download_file(self, file_path: str) -> DownloadStatus:
        """
        Copy a file from the datasource into the local repository.

        The actual work is delegated to the inherited `_download_file`
        helper; this method only translates file system errors into
        `RuntimeError`. The same relative path is used to read the file
        from the source and to store it in the local repository.

        Parameters
        ----------
        file_path : str
            The path to the file to be retrieved, expected to be
            relative to the datasource root path.

        Returns
        -------
        DownloadStatus
            The status reported by `_download_file`, e.g.
            `DownloadStatus.SUCCESS` if the file was retrieved or
            `DownloadStatus.ALREADY` if the file is already in the
            local repository.

        Raises
        ------
        RuntimeError
            If the file does not exist or cannot be retrieved.
        """
        try:
            status = self._download_file(file_path)
        except FileNotFoundError as exc:
            # Must be handled before OSError, of which it is a subclass.
            raise RuntimeError(
                f"The file '{file_path}' does not exist: {exc}"
            ) from exc
        except OSError as exc:
            raise RuntimeError(
                f"Unable to retrieve the file '{file_path}': {exc}"
            ) from exc

        return status

    def listdir(self, dir_path: str) -> list[str]:
        """
        List the files found in a directory of the datasource.

        The result is served from the cache when an entry for
        `dir_path` exists; otherwise the directory is listed through
        the file repository and the result is cached. A directory that
        does not exist yields an empty list, which is cached as well.

        Parameters
        ----------
        dir_path : str
            The path to the directory, relative to the root path.

        Returns
        -------
        list[str]
            The file names reported for the directory, each one
            prefixed with `dir_path`.
        """
        listing = self.cache.get_item(dir_path)

        if listing is None:
            names: list[str] = []

            # A missing directory is not an error: it has no files.
            with contextlib.suppress(FileNotFoundError):
                names = self.source.list_files(dir_path)

            listing = [os.path.join(dir_path, name) for name in names]

            self.cache.add_item(dir_path, listing)

        return listing

    def _retrieve_file(self, file_path: str) -> bytes:
        """
        Read a file from the source and store a copy in the repository.

        Presumably invoked by the inherited `_download_file` helper;
        confirm against `DatasourceBase`.

        Parameters
        ----------
        file_path : str
            The path of the file, used both to read it from the source
            and as its path in the local repository.

        Returns
        -------
        bytes
            The content of the file.
        """
        data = self.source.read_file(file_path)

        self.repository.add_item(file_path, data)

        return data
28 changes: 7 additions & 21 deletions src/GOES_DL/downloader/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,13 +139,13 @@ def download_files(self, *, start: str, end: str = "") -> list[str]:
retrieved, e.g. if the file does not exist in the datasource or
an internal error occurred.
"""
files_in_range: list[str] = self._get_file_list(start, end)
files_in_range = self._get_file_list(start, end)

self._retrieve_files(files_in_range)

return files_in_range

def get_files(self, *, file_paths: list[str]) -> list[str]:
def get_files(self, *, file_paths: list[str]) -> None:
"""
Load a list of files from the datasource or local repository.

Expand All @@ -164,12 +164,6 @@ def get_files(self, *, file_paths: list[str]) -> list[str]:
file_paths : list[str]
A list with the file paths.

Returns
-------
list[str]
The list of file path and names with respect to the local
repository root directory.

Notes
-----
`ValueError` is raised if the start_time is not provided. The
Expand All @@ -183,8 +177,6 @@ def get_files(self, *, file_paths: list[str]) -> list[str]:
"""
self._retrieve_files(file_paths)

return file_paths

def list_files(self, *, start: str, end: str = "") -> list[str]:
"""
List the files that can be retrieved from the datasource.
Expand Down Expand Up @@ -262,7 +254,7 @@ def _filter_directory_content(
if not self.locator.match(basename):
continue

ct: datetime = self.locator.get_datetime(file)
ct = self.locator.get_datetime(file)

if datetime_ini <= ct <= datetime_fin:
files_in_range.append(file)
Expand Down Expand Up @@ -307,14 +299,10 @@ def _get_datetimes(
if not start_time:
raise ValueError("start_time must be provided")

datetime_ini: datetime = datetime.strptime(
start_time, self.date_format
)
datetime_ini = datetime.strptime(start_time, self.date_format)

if end_time:
datetime_fin: datetime = datetime.strptime(
end_time, self.date_format
)
datetime_fin = datetime.strptime(end_time, self.date_format)
else:
datetime_fin = datetime_ini

Expand Down Expand Up @@ -367,13 +355,11 @@ def _get_file_list(self, start_time: str, end_time: str = "") -> list[str]:
if self.show_progress:
print("Retrieving available file list")

datetime_ini: datetime
datetime_fin: datetime
datetime_ini, datetime_fin = self._get_datetimes(start_time, end_time)

paths: list[str] = self.locator.get_paths(datetime_ini, datetime_fin)
paths = self.locator.get_paths(datetime_ini, datetime_fin)

files: list[str] = self._retrieve_directory_content(paths)
files = self._retrieve_directory_content(paths)

return self._filter_directory_content(
datetime_ini, datetime_fin, files
Expand Down
Empty file added src/GOES_DL/py.typed
Empty file.
16 changes: 10 additions & 6 deletions src/GOES_DL/utils/file_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,14 +104,14 @@ def create_directory(self, directory: str | Path) -> None:
Raises
------
FileExistsError
If the directory already exists.
If an object with the same path name already exists.
"""
dir_path: Path = self.base_directory / directory
if not dir_path.exists():
dir_path.mkdir(parents=True)
else:
raise FileExistsError(
f"The directory '{dir_path}' already exists."
f"An object with the path name '{dir_path}' already exists."
)

def delete_directory(self, directory: str | Path) -> None:
Expand Down Expand Up @@ -260,14 +260,18 @@ def list_files(self, directory: str | Path = "") -> list[str]:
Raises
------
NotADirectoryError
If the given directory does not exist or is not a directory.
If the given path is not a directory.
FileNotFoundError
If the directory does not exist.
"""
dir_path: Path = self.base_directory / directory
if dir_path.is_dir():
return [item.name for item in dir_path.iterdir() if item.is_file()]
raise NotADirectoryError(
f"The directory '{dir_path}' does not exist or is not a directory."
)
if dir_path.exists():
raise NotADirectoryError(
f"The path '{dir_path}' is not a directory."
)
raise FileNotFoundError(f"The directory '{dir_path}' does not exist.")

def move_file(
self,
Expand Down
39 changes: 32 additions & 7 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
from GOES_DL.dataset.goes import GOESProductLocatorABIPP as ProductLocatorGOES
from GOES_DL.dataset.gridsat import GridSatProductLocatorB1 as ProductLocatorB1
from GOES_DL.dataset.gridsat import GridSatProductLocatorGC as ProductLocatorGC
from GOES_DL.datasource import DatasourceAWS, DatasourceHTTP
from GOES_DL.datasource import DatasourceAWS, DatasourceHTTP, DatasourceLocal
from GOES_DL.downloader import Downloader

DATE_FORMAT = "%Y-%m-%dT%H:%M%z"
Expand All @@ -54,8 +54,8 @@ def test(dl: Downloader, start: str, end: str = "") -> list[str]:
The downloader object to test.
start : str
The start date of the download range.
end : str, optional
The end date of the download range.
end : str
The end date of the download range, if any. The default is "".

Returns
-------
Expand Down Expand Up @@ -107,8 +107,7 @@ def test_gridsat_http() -> tuple[list[str], list[str]]:

def test_gridsat_goes() -> list[str]:
"""
Test the downloader object with GridSat-GOES/CONUS data and HTTP
datasource.
Test the downloader with GridSat-GOES/CONUS and HTTP datasource.

Returns
-------
Expand Down Expand Up @@ -167,10 +166,36 @@ def test_goes2() -> tuple[list[str], list[str]]:
return files1, files2


def main() -> None:
def test_goes3() -> tuple[list[str], list[str]]:
    """
    Test the downloader object with GOES-16 data and local datasource.

    The same local folder is used both as the datasource root and as
    the repository.

    Returns
    -------
    tuple[list[str], list[str]]
        The results of `test` for a single timestamp and for a date
        range, in that order.
    """
    locator = ProductLocatorGOES("CMIP", "F", "C13", "G16")

    # Folder with previously retrieved GOES-16 files; the path is
    # relative to the current working directory.
    local_root = "../../TFG_Tools/repository/20201114T20"

    # The third argument is the cache setting, given as 0 here
    # (presumably this disables caching; confirm in DatasourceCache).
    datasource = DatasourceLocal(local_root, local_root, 0)
    downloader = Downloader(
        datasource=datasource, locator=locator, date_format=DATE_FORMAT
    )

    single_time_files = test(downloader, "2020-11-14T20:00Z")
    date_range_files = test(
        downloader, "2020-11-14T20:00Z", "2020-11-15T19:00Z"
    )

    return single_time_files, date_range_files


def main() -> None:
"""Run all test functions."""
files2 = test_goes3()
print(files2)

files2 = test_goes2()
print(files2)

Expand Down