Skip to content

Commit 0b8fec2

Browse files
committed
Merge branch 'release/v0.9.2'
2 parents 123d01f + 36ffd53 commit 0b8fec2

18 files changed

+571
-123
lines changed

.bumpversion.cfg

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 0.9.1
2+
current_version = 0.9.2
33
commit = False
44
tag = False
55
allow_dirty = False

CHANGELOG.md

+38-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,43 @@
11
# Changelog
22

3-
## Unreleased
3+
## 0.9.2 - 🏗 Bug fixes, logging improvement
4+
5+
### Added
6+
7+
- Add progress bars to the computation of `LazyChunkSequence` and
8+
`NestedLazyChunkSequence`
9+
[PR #567](https://github.com/aai-institute/pyDVL/pull/567)
10+
- Add a device fixture for `pytest`, which depending on the availability and
11+
user input (`pytest --with-cuda`) resolves to cuda device
12+
[PR #574](https://github.com/aai-institute/pyDVL/pull/574)
13+
14+
### Fixed
15+
16+
- Fixed logging issue in decorator `log_duration`
17+
[PR #567](https://github.com/aai-institute/pyDVL/pull/567)
18+
- Fixed missing move of tensors to model device in `EkfacInfluence`
19+
implementation [PR #570](https://github.com/aai-institute/pyDVL/pull/570)
20+
- Missing move to device of `preconditioner` in `CgInfluence` implementation
21+
[PR #572](https://github.com/aai-institute/pyDVL/pull/572)
22+
- Raise a more specific error message when a `RuntimeError` occurs in
23+
`torch.linalg.eigh`, so the user can check if it is related to a known
24+
issue
25+
[PR #578](https://github.com/aai-institute/pyDVL/pull/578)
26+
- Fix an edge case (empty train data) in the test
27+
`test_classwise_scorer_accuracies_manual_derivation`, which resulted
28+
in undefined behavior (`np.nan` to `int` conversion with different results
29+
depending on OS)
30+
[PR #579](https://github.com/aai-institute/pyDVL/pull/579)
31+
32+
### Changed
33+
34+
- Changed logging behavior of iterative methods `LissaInfluence` and
35+
`CgInfluence` to warn on not achieving desired tolerance within `maxiter`,
36+
and add parameter `warn_on_max_iteration` to set the level for this information
37+
to `logging.DEBUG`
38+
[PR #567](https://github.com/aai-institute/pyDVL/pull/567)
39+
40+
## 0.9.1 - Bug fixes, logging improvement
441

542
### Fixed
643

CONTRIBUTING.md

+7
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,13 @@ There are a few important arguments:
131131
- `--slow-tests` enables running slow tests. See below for a description
132132
of slow tests.
133133

134+
- `--with-cuda` sets the device fixture in [tests/influence/torch/conftest.py](
135+
tests/influence/torch/conftest.py) to `cuda` if it is available.
136+
Using this fixture within tests, you can run parts of your tests on a `cuda`
137+
device. Be aware that you still have to take care of the usage of the device
138+
manually in a specific test. Setting this flag does not result in
139+
running all tests on a GPU.
140+
134141
### Markers
135142

136143
We use a few different markers to differentiate between tests and runs

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
package_data={"pydvl": ["py.typed"]},
1313
packages=find_packages(where="src"),
1414
include_package_data=True,
15-
version="0.9.1",
15+
version="0.9.2",
1616
description="The Python Data Valuation Library",
1717
install_requires=[
1818
line

src/pydvl/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,4 @@
77
The two main modules you will want to look at are [value][pydvl.value] and
88
[influence][pydvl.influence].
99
"""
10-
__version__ = "0.9.1"
10+
__version__ = "0.9.2"

src/pydvl/influence/array.py

+86-27
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,25 @@
66
(chunked in one resp. two dimensions), with support for efficient storage and retrieval
77
using the Zarr library.
88
"""
9+
from __future__ import annotations
910

1011
import logging
1112
from abc import ABC, abstractmethod
12-
from typing import Callable, Generator, Generic, List, Optional, Tuple, Union
13+
from typing import (
14+
Callable,
15+
Generator,
16+
Generic,
17+
Iterator,
18+
List,
19+
Optional,
20+
Tuple,
21+
Union,
22+
cast,
23+
)
1324

1425
import zarr
1526
from numpy.typing import NDArray
27+
from tqdm import tqdm
1628
from zarr.storage import StoreLike
1729

1830
from ..utils import log_duration
@@ -35,9 +47,12 @@ def from_numpy(self, x: NDArray) -> TensorType:
3547

3648
class SequenceAggregator(Generic[TensorType], ABC):
3749
@abstractmethod
38-
def __call__(self, tensor_generator: Generator[TensorType, None, None]):
50+
def __call__(
51+
self,
52+
tensor_sequence: LazyChunkSequence,
53+
):
3954
"""
40-
Aggregates tensors from a generator.
55+
Aggregates tensors from a sequence.
4156
4257
Implement this method to define how a sequence of tensors, provided by a
4358
generator, should be combined.
@@ -46,31 +61,37 @@ def __call__(self, tensor_generator: Generator[TensorType, None, None]):
4661

4762
class ListAggregator(SequenceAggregator):
4863
def __call__(
49-
self, tensor_generator: Generator[TensorType, None, None]
64+
self,
65+
tensor_sequence: LazyChunkSequence,
5066
) -> List[TensorType]:
5167
"""
5268
Aggregates tensors from a single-level generator into a list. This method simply
5369
collects each tensor emitted by the generator into a single list.
5470
5571
Args:
56-
tensor_generator: A generator that yields TensorType objects.
72+
tensor_sequence: Object wrapping a generator that yields `TensorType`
73+
objects.
5774
5875
Returns:
5976
A list containing all the tensors provided by the tensor_sequence.
6077
"""
61-
return [t for t in tensor_generator]
78+
79+
gen = cast(Iterator[TensorType], tensor_sequence.generator_factory())
80+
81+
if tensor_sequence.len_generator is not None:
82+
gen = cast(
83+
Iterator[TensorType],
84+
tqdm(gen, total=tensor_sequence.len_generator, desc="Blocks"),
85+
)
86+
87+
return [t for t in gen]
6288

6389

6490
class NestedSequenceAggregator(Generic[TensorType], ABC):
6591
@abstractmethod
66-
def __call__(
67-
self,
68-
nested_generators_of_tensors: Generator[
69-
Generator[TensorType, None, None], None, None
70-
],
71-
):
92+
def __call__(self, nested_sequence_of_tensors: NestedLazyChunkSequence):
7293
"""
73-
Aggregates tensors from a generator of generators.
94+
Aggregates tensors from a nested sequence of tensors.
7495
7596
Implement this method to specify how tensors, nested in two layers of
7697
generators, should be combined. Useful for complex data structures where tensors
@@ -81,27 +102,36 @@ def __call__(
81102
class NestedListAggregator(NestedSequenceAggregator):
82103
def __call__(
83104
self,
84-
nested_generators_of_tensors: Generator[
85-
Generator[TensorType, None, None], None, None
86-
],
105+
nested_sequence_of_tensors: NestedLazyChunkSequence,
87106
) -> List[List[TensorType]]:
88107
"""
89108
Aggregates tensors from a nested generator structure into a list of lists.
90109
Each inner generator is converted into a list of tensors, resulting in a nested
91110
list structure.
92111
93112
Args:
94-
nested_generators_of_tensors: A generator of generators, where each inner
95-
generator yields TensorType objects.
113+
nested_sequence_of_tensors: Object wrapping a generator of generators,
114+
where each inner generator yields TensorType objects.
96115
97116
Returns:
98117
A list of lists, where each inner list contains tensors returned from one
99118
of the inner generators.
100119
"""
101-
return [list(tensor_gen) for tensor_gen in nested_generators_of_tensors]
120+
outer_gen = cast(
121+
Iterator[Iterator[TensorType]],
122+
nested_sequence_of_tensors.generator_factory(),
123+
)
124+
len_outer_gen = nested_sequence_of_tensors.len_outer_generator
125+
if len_outer_gen is not None:
126+
outer_gen = cast(
127+
Iterator[Iterator[TensorType]],
128+
tqdm(outer_gen, total=len_outer_gen, desc="Row blocks"),
129+
)
102130

131+
return [list(tensor_gen) for tensor_gen in outer_gen]
103132

104-
class LazyChunkSequence:
133+
134+
class LazyChunkSequence(Generic[TensorType]):
105135
"""
106136
A class representing a chunked, and lazily evaluated array,
107137
where the chunking is restricted to the first dimension
@@ -114,12 +144,18 @@ class LazyChunkSequence:
114144
Attributes:
115145
generator_factory: A factory function that returns
116146
a generator. This generator yields chunks of the large array when called.
147+
len_generator: if the number of elements from the generator is
148+
known from the context, this optional parameter can be used to improve
149+
logging by adding a progressbar.
117150
"""
118151

119152
def __init__(
120-
self, generator_factory: Callable[[], Generator[TensorType, None, None]]
153+
self,
154+
generator_factory: Callable[[], Generator[TensorType, None, None]],
155+
len_generator: Optional[int] = None,
121156
):
122157
self.generator_factory = generator_factory
158+
self.len_generator = len_generator
123159

124160
@log_duration(log_level=logging.INFO)
125161
def compute(self, aggregator: Optional[SequenceAggregator] = None):
@@ -140,7 +176,7 @@ def compute(self, aggregator: Optional[SequenceAggregator] = None):
140176
"""
141177
if aggregator is None:
142178
aggregator = ListAggregator()
143-
return aggregator(self.generator_factory())
179+
return aggregator(self)
144180

145181
@log_duration(log_level=logging.INFO)
146182
def to_zarr(
@@ -171,7 +207,15 @@ def to_zarr(
171207
"""
172208
row_idx = 0
173209
z = None
174-
for block in self.generator_factory():
210+
211+
gen = cast(Iterator[TensorType], self.generator_factory())
212+
213+
if self.len_generator is not None:
214+
gen = cast(
215+
Iterator[TensorType], tqdm(gen, total=self.len_generator, desc="Blocks")
216+
)
217+
218+
for block in gen:
175219
numpy_block = converter.to_numpy(block)
176220

177221
if z is None:
@@ -204,7 +248,7 @@ def _initialize_zarr_array(block: NDArray, path_or_url: str, overwrite: bool):
204248
)
205249

206250

207-
class NestedLazyChunkSequence:
251+
class NestedLazyChunkSequence(Generic[TensorType]):
208252
"""
209253
A class representing chunked, and lazily evaluated array, where the chunking is
210254
restricted to the first two dimensions.
@@ -216,16 +260,21 @@ class NestedLazyChunkSequence:
216260
217261
Attributes:
218262
generator_factory: A factory function that returns a generator of generators.
219-
Each inner generator yields chunks.
263+
Each inner generator yields chunks
264+
len_outer_generator: if the number of elements from the outer generator is
265+
known from the context, this optional parameter can be used to improve
266+
logging by adding a progressbar.
220267
"""
221268

222269
def __init__(
223270
self,
224271
generator_factory: Callable[
225272
[], Generator[Generator[TensorType, None, None], None, None]
226273
],
274+
len_outer_generator: Optional[int] = None,
227275
):
228276
self.generator_factory = generator_factory
277+
self.len_outer_generator = len_outer_generator
229278

230279
@log_duration(log_level=logging.INFO)
231280
def compute(self, aggregator: Optional[NestedSequenceAggregator] = None):
@@ -247,7 +296,7 @@ def compute(self, aggregator: Optional[NestedSequenceAggregator] = None):
247296
"""
248297
if aggregator is None:
249298
aggregator = NestedListAggregator()
250-
return aggregator(self.generator_factory())
299+
return aggregator(self)
251300

252301
@log_duration(log_level=logging.INFO)
253302
def to_zarr(
@@ -280,7 +329,17 @@ def to_zarr(
280329
row_idx = 0
281330
z = None
282331
numpy_block = None
283-
for row_blocks in self.generator_factory():
332+
block_generator = cast(Iterator[Iterator[TensorType]], self.generator_factory())
333+
334+
if self.len_outer_generator is not None:
335+
block_generator = cast(
336+
Iterator[Iterator[TensorType]],
337+
tqdm(
338+
block_generator, total=self.len_outer_generator, desc="Row blocks"
339+
),
340+
)
341+
342+
for row_blocks in block_generator:
284343
col_idx = 0
285344
for block in row_blocks:
286345
numpy_block = converter.to_numpy(block)

src/pydvl/influence/base_influence_function_model.py

+2
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
from enum import Enum
55
from typing import Collection, Generic, Iterable, Optional, Type, TypeVar
66

7+
__all__ = ["InfluenceMode"]
8+
79

810
class InfluenceMode(str, Enum):
911
"""

src/pydvl/influence/influence_calculator.py

+27-4
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
import logging
99
from functools import partial
10-
from typing import Generator, Iterable, Optional, Tuple, Type, Union
10+
from typing import Generator, Iterable, Optional, Sized, Tuple, Type, Union, cast
1111

1212
import distributed
1313
from dask import array as da
@@ -619,8 +619,14 @@ def influence_factors(
619619
Returns:
620620
A lazy data structure representing the chunks of the resulting tensor
621621
"""
622+
try:
623+
len_iterable = len(cast(Sized, data_iterable))
624+
except Exception as e:
625+
logger.debug(f"Failed to retrieve len of data iterable: {e}")
626+
len_iterable = None
627+
622628
tensors_gen_factory = partial(self._influence_factors_gen, data_iterable)
623-
return LazyChunkSequence(tensors_gen_factory)
629+
return LazyChunkSequence(tensors_gen_factory, len_generator=len_iterable)
624630

625631
def _influences_gen(
626632
self,
@@ -677,7 +683,15 @@ def influences(
677683
mode,
678684
)
679685

680-
return NestedLazyChunkSequence(nested_tensor_gen_factory)
686+
try:
687+
len_iterable = len(cast(Sized, test_data_iterable))
688+
except Exception as e:
689+
logger.debug(f"Failed to retrieve len of test data iterable: {e}")
690+
len_iterable = None
691+
692+
return NestedLazyChunkSequence(
693+
nested_tensor_gen_factory, len_outer_generator=len_iterable
694+
)
681695

682696
def _influences_from_factors_gen(
683697
self,
@@ -735,4 +749,13 @@ def influences_from_factors(
735749
train_data_iterable,
736750
mode,
737751
)
738-
return NestedLazyChunkSequence(nested_tensor_gen)
752+
753+
try:
754+
len_iterable = len(cast(Sized, z_test_factors))
755+
except Exception as e:
756+
logger.debug(f"Failed to retrieve len of factors iterable: {e}")
757+
len_iterable = None
758+
759+
return NestedLazyChunkSequence(
760+
nested_tensor_gen, len_outer_generator=len_iterable
761+
)

0 commit comments

Comments
 (0)