style: apply pre-commit hooks to all files
martibosch committed Apr 24, 2024
1 parent 4282e01 commit 3a32604
Showing 22 changed files with 50 additions and 20 deletions.
1 change: 1 addition & 0 deletions tstore/archive/attributes/pandas.py
@@ -4,6 +4,7 @@
@author: ghiggi
"""

import pandas as pd

from tstore.archive.checks import check_is_tstore, check_tstore_ids
5 changes: 4 additions & 1 deletion tstore/archive/attributes/pyarrow.py
@@ -4,13 +4,16 @@
@author: ghiggi
"""

import pyarrow.parquet as pq

from tstore.archive.checks import check_is_tstore, check_tstore_ids
from tstore.archive.io import define_attributes_filepath


def get_tstore_ids_filters(tstore_ids=None):
def get_tstore_ids_filters(
    tstore_ids=None,  # noqa: ARG001
):
    """Define filters for Parquet Dataset subsetting at read-time."""
    # TODO implement logic
    filters = None
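Note: the hunk above still leaves `filters = None` as a TODO. For orientation only, a minimal sketch of what the id filter could eventually return, using pyarrow's DNF filter-tuple format; the `tstore_id` column name is an assumption, not something this commit confirms:

```python
# Hypothetical sketch only -- not the implementation in this commit.
def get_tstore_ids_filters(tstore_ids=None):
    """Define filters for Parquet Dataset subsetting at read-time."""
    if tstore_ids is None:
        return None  # no subsetting: read all ids
    # pyarrow.parquet accepts DNF filter tuples of the form (column, op, value);
    # the "tstore_id" column name is assumed here.
    return [("tstore_id", "in", list(tstore_ids))]
```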
6 changes: 5 additions & 1 deletion tstore/archive/checks.py
@@ -4,6 +4,7 @@
@author: ghiggi
"""

import datetime

import numpy as np
@@ -45,7 +46,10 @@ def check_ts_variables(ts_variables, base_dir):
    return ts_variables


def check_tstore_ids(tstore_ids, base_dir):
def check_tstore_ids(
    tstore_ids,
    base_dir,  # noqa: ARG001
):
"""Check valid tstore_ids.
If tstore_ids=None, return None.
1 change: 1 addition & 0 deletions tstore/archive/metadata/readers.py
@@ -4,6 +4,7 @@
@author: ghiggi
"""

import yaml

from tstore.archive.io import define_metadata_filepath
1 change: 1 addition & 0 deletions tstore/archive/metadata/writers.py
@@ -4,6 +4,7 @@
@author: ghiggi
"""

import yaml

from tstore.archive.io import define_metadata_filepath
1 change: 1 addition & 0 deletions tstore/archive/partitions.py
@@ -7,6 +7,7 @@


def get_partitioning_mapping_dict(time_var, backend="pandas"):
"""Get partitioning mapping dict."""
# Mapping of partitioning components to corresponding pandas attributes
if backend == "pandas":
partitioning_mapping = {
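As a rough illustration of what the partitioning mapping is for (the exact dict in the repository may differ), partition components for the `pandas` backend map onto `Series.dt` attributes:

```python
import pandas as pd

# Illustrative sketch: derive partition columns from a datetime column via
# the pandas attributes that a partitioning mapping would point to.
df = pd.DataFrame({"time": pd.date_range("2024-01-01", periods=4, freq="D")})
df["year"] = df["time"].dt.year    # "year" partition component
df["month"] = df["time"].dt.month  # "month" partition component
print(df)
```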
1 change: 1 addition & 0 deletions tstore/archive/ts/readers/dask.py
@@ -4,6 +4,7 @@
@author: ghiggi
"""

import dask.dataframe as dd
import pandas as pd

5 changes: 3 additions & 2 deletions tstore/archive/ts/readers/polars.py
@@ -4,14 +4,15 @@
@author: ghiggi
"""

import polars as pl


def open_ts(
    fpath,
    partitions,
    start_time=None,
    end_time=None,
    start_time=None,  # noqa: ARG001
    end_time=None,  # noqa: ARG001
    # Options
    rechunk=True,
    use_statistics=True,
3 changes: 2 additions & 1 deletion tstore/archive/ts/readers/pyarrow.py
@@ -4,6 +4,7 @@
@author: ghiggi
"""

import pyarrow.parquet as pq

from tstore.archive.ts.utility import get_time_filters
@@ -15,7 +16,7 @@ def open_ts(
    start_time=None,
    end_time=None,
    columns=None,
    split_row_groups=False,
    split_row_groups=False,  # noqa: ARG001
    # pyarrow-specific
    filesystem=None,
    use_threads=True,
5 changes: 4 additions & 1 deletion tstore/archive/ts/utility.py
@@ -6,7 +6,10 @@
"""


def get_time_filters(start_time=None, end_time=None):
def get_time_filters(
    start_time=None,  # noqa: ARG001
    end_time=None,  # noqa: ARG001
):
"""Define filters for Parquet Dataset subsetting at read-time."""
# TODO implement logic
filters = None
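Here too the logic is still a TODO. A hedged sketch of what the time filters might look like in pyarrow's filter-tuple format (the `time` column name is an assumption):

```python
# Hypothetical sketch only -- the commit itself returns None.
def get_time_filters(start_time=None, end_time=None):
    """Define filters for Parquet Dataset subsetting at read-time."""
    filters = []
    if start_time is not None:
        filters.append(("time", ">=", start_time))
    if end_time is not None:
        filters.append(("time", "<=", end_time))
    return filters or None  # None disables row-group filtering
```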
2 changes: 2 additions & 0 deletions tstore/archive/ts/writers/pyarrow.py
@@ -4,6 +4,7 @@
@author: ghiggi
"""

import math
import os

@@ -102,6 +103,7 @@ def file_visitor(written_file):


def convert_size_to_bytes(size):
"""Convert size to bytes."""
if not isinstance(size, (str, int)):
raise TypeError("Expecting a string (i.e. 200MB) or the integer number of bytes.")
if isinstance(size, int):
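The diff only shows the head of `convert_size_to_bytes`. A minimal sketch of how such a helper can behave, assuming decimal units; the repository's actual parsing rules may differ:

```python
import math

# Sketch under assumed semantics: accept an int (bytes) or strings like "200MB".
_UNITS = {"KB": 10**3, "MB": 10**6, "GB": 10**9, "B": 1}  # longer suffixes first

def convert_size_to_bytes(size):
    """Convert size to bytes."""
    if not isinstance(size, (str, int)):
        raise TypeError("Expecting a string (i.e. 200MB) or the integer number of bytes.")
    if isinstance(size, int):
        return size
    text = size.strip().upper()
    for unit, factor in _UNITS.items():
        if text.endswith(unit):
            return math.ceil(float(text[: -len(unit)]) * factor)
    raise ValueError(f"Unrecognized size string: {size!r}")
```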
2 changes: 1 addition & 1 deletion tstore/tests/test_dummy.py
@@ -7,4 +7,4 @@


def test_dummy():
    assert 1 == 1
    assert 1 == 1  # noqa: PLR0133
2 changes: 1 addition & 1 deletion tstore/tsdf/extensions/array.py
@@ -154,7 +154,7 @@ def ts_class(self):
    # Required for all ExtensionArray subclasses
    def isna(self):
        """A 1-D array indicating if the TS is missing."""
        return pd.isnull(self._data)
        return pd.isna(self._data)


    # Required for all ExtensionArray subclasses
    def copy(self):
1 change: 1 addition & 0 deletions tstore/tsdf/extensions/ts_dtype.py
@@ -4,6 +4,7 @@
@author: ghiggi
"""

import re
from typing import Any

1 change: 1 addition & 0 deletions tstore/tsdf/reader.py
@@ -4,6 +4,7 @@
@author: ghiggi
"""

import pandas as pd

from tstore.archive.io import get_ts_info
1 change: 1 addition & 0 deletions tstore/tsdf/writer.py
@@ -4,6 +4,7 @@
@author: ghiggi
"""

import numpy as np

from tstore.archive.io import (
2 changes: 1 addition & 1 deletion tstore/tslong/pandas.py
@@ -4,6 +4,7 @@
@author: ghiggi
"""

import pandas as pd
import pyarrow as pa

@@ -120,7 +121,6 @@ def to_tstore(
    # Write to disk per identifier
    for tstore_id, df_group in df.groupby(id_var):
        for ts_variable, columns in ts_variables.items():

            # Retrieve columns of the TS object
            if columns is None:
                columns = [ts_variable]
1 change: 0 additions & 1 deletion tstore/tslong/polars.py
@@ -114,7 +114,6 @@ def to_tstore(
    # Write to disk per identifier
    for tstore_id, df_group in df.groupby(id_var):
        for ts_variable, columns in ts_variables.items():

            # Retrieve columns of the TS object
            if columns is None:
                columns = [ts_variable]
1 change: 1 addition & 0 deletions tstore/tslong/pyarrow.py
@@ -4,6 +4,7 @@
@author: ghiggi
"""

from functools import reduce

import numpy as np
1 change: 1 addition & 0 deletions tutorials/00_example_pandas.py
@@ -4,6 +4,7 @@
@author: ghiggi
"""
# ruff: noqa: E402

import dask.datasets
import numpy as np
7 changes: 5 additions & 2 deletions tutorials/01_example_ts.py
@@ -4,6 +4,7 @@
@author: ghiggi
"""

import dask.datasets
import pandas as pd
import polars as pl
@@ -52,7 +53,8 @@
# - Should be based on tstore.archive.ts.writers
# --> All code should likely exploit the pyarrow write_partitioned_dataset() function
# --> Maybe we should call the method TS.to_parquet()
# --> Also when writing TSLONG to TStore, we could create a TS to then write to disk (see comment in tstore.tslong.pandas/polars)
# --> Also when writing TSLONG to TStore, we could create a TS to then write to disk (see comment in
# tstore.tslong.pandas/polars)

# Write to disk
fpath = "/tmp/ts_dask.parquet"
@@ -69,7 +71,8 @@
#### Read TS from disk
# --> Pandas, Polars and pyarrow <read_parquet> functions loads data into memory !
# --> Should we exploit only the pyarrow read function and then convert to the backend of choice?
# --> Should we wrap the reading with dask.delayed to enable lazy reading (not actually read data into memory till needed ?)
# --> Should we wrap the reading with dask.delayed to enable lazy reading (not actually read data into memory till
# needed ?)
# --> Dask (dd.read_parquet) and LazyPolars (scan_parquet) functions allow direct lazy reads

# Dask code
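To make the lazy-read comparison in the comments above concrete, a small sketch of the two readers that support direct lazy reads (reusing the `fpath` from this tutorial):

```python
import dask.dataframe as dd
import polars as pl

fpath = "/tmp/ts_dask.parquet"

# Dask: dd.read_parquet builds a task graph; data is read only on .compute()
ddf = dd.read_parquet(fpath)
df_pandas = ddf.compute()  # materializes a pandas DataFrame

# Polars: scan_parquet returns a LazyFrame; .collect() triggers the read
ldf = pl.scan_parquet(fpath)
df_polars = ldf.collect()
```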
20 changes: 12 additions & 8 deletions tutorials/02_example_tsdf.py
@@ -4,6 +4,7 @@
@author: ghiggi
"""

import dask.datasets
import numpy as np
import pandas as pd
@@ -97,7 +98,8 @@

# If class inherit pandas.DataFrame or geopandas.DataFrame
# --> Need to redefine methods to returns TSDF (similar concept applies to TSLONG, ...)
# Or we avoid inheritance from *.DataFrame and we re-implement the relevant methods calling internally the *.DataFrame.<methods>
# Or we avoid inheritance from *.DataFrame and we re-implement the relevant methods calling internally the
# *.DataFrame.<methods>


#### TSDF - Write TStore
@@ -167,21 +169,23 @@
####--------------------------------------------------------------------------.
#### IMPLEMENTATIONS / THOUGHTS
# - Arrow/Pyarrow allows for zero-copy conversion between pandas/(dask)/polars. We should exploit that !
# - Pyarrow.Table object is difficult to manipulate (I would use it only for conversion across backend and reading/writing to/from disk)
# - Pyarrow.Table object is difficult to manipulate (I would use it only for conversion across backend and
# reading/writing to/from disk)
# - ID should be always stored as string (pyarrow.large_string), so to have some dtype when doing join/merge operations
# - I would suggest to first start implement the I/O and conversion with pandas and polars (with data in memory) (and eventually dask dataframe)
# - I would suggest to first start implement the I/O and conversion with pandas and polars (with data in memory) (and
# eventually dask dataframe)
# - When this is set (and eventually tested), we can implement the lazy reading of TSDF
# - dask.delayed for pandas and polars backends
# - dask.dataframe for dask backend
# - lazy polars for lazy_polars backend ...
# - I would expect that we enable to define a series of lazy operations (which can change the backend of the TS during computations),
# we must have something to control / enforce the final backend of the data to be written to disk. Because the dask delayed object does not
# know what will be the dtype of the TS.data
# - I would expect that we enable to define a series of lazy operations (which can change the backend of the TS during
# computations), we must have something to control / enforce the final backend of the data to be written to disk.
# Because the dask delayed object does not know what will be the dtype of the TS.data
# - A tool to switch TS object from an backend to another should be implemented
# - A method to .compute() data in memory should be also implemented

# - Ideally one should be able to apply functions to many TSArray etc, and the actual execution
# only taking place when writing to disk the timeseries (id per id), loading data into memory only at that moment and freeing the memory
# when the id is processed.
# only taking place when writing to disk the timeseries (id per id), loading data into memory only at that moment and
# freeing the memory when the id is processed.

####--------------------------------------------------------------------------.
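A short sketch of the Arrow-mediated conversions the first thought above refers to; column names are illustrative, and zero-copy applies only where dtypes allow it:

```python
import pandas as pd
import polars as pl
import pyarrow as pa

pdf = pd.DataFrame({"tstore_id": ["1", "2"], "value": [0.1, 0.2]})

# pandas -> Arrow -> polars (Arrow buffers are shared where possible)
table = pa.Table.from_pandas(pdf)
pldf = pl.from_arrow(table)

# polars -> Arrow -> pandas
pdf_back = pldf.to_arrow().to_pandas()
```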
