diff --git a/CHANGELOG.md b/CHANGELOG.md index 7abfc15d8..0769baed4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -59,6 +59,9 @@ ### Changed +- Changed the way semi-value coefficients are composed with sampler weights in + order to avoid `OverflowError` for very small or large values + [PR #639](https://github.com/aai-institute/pyDVL/pull/639) - Uniformly distribute test points across processes for KNNShapley. Fail for `GroupedDataset` [PR #632](https://github.com/aai-institute/pyDVL/pull/632) - Introduced the concept of logical vs data indices for `Dataset`, and diff --git a/src/pydvl/valuation/methods/beta_shapley.py b/src/pydvl/valuation/methods/beta_shapley.py index 97ab0bc74..8ce1f9c19 100644 --- a/src/pydvl/valuation/methods/beta_shapley.py +++ b/src/pydvl/valuation/methods/beta_shapley.py @@ -29,8 +29,8 @@ def __init__( self.beta = beta self.const = sp.special.beta(alpha, beta) - def coefficient(self, n: int, k: int) -> float: + def coefficient(self, n: int, k: int, other: float) -> float: j = k + 1 w = sp.special.beta(j + self.beta - 1, n - j + self.alpha) / self.const - # return math.comb(n - 1, j - 1) * w * n - return float(w) + # return math.comb(n - 1, j - 1) * w * n * other + return float(w) * other diff --git a/src/pydvl/valuation/methods/data_banzhaf.py b/src/pydvl/valuation/methods/data_banzhaf.py index c9b237575..02dd09d80 100644 --- a/src/pydvl/valuation/methods/data_banzhaf.py +++ b/src/pydvl/valuation/methods/data_banzhaf.py @@ -35,5 +35,5 @@ class DataBanzhafValuation(SemivalueValuation): algorithm_name = "Data-Banzhaf" - def coefficient(self, n: int, k: int) -> float: - return float(1 / 2 ** (n - 1)) + def coefficient(self, n: int, k: int, other: float) -> float: + return float(other / 2 ** (n - 1)) diff --git a/src/pydvl/valuation/methods/data_shapley.py b/src/pydvl/valuation/methods/data_shapley.py index ebbacd921..2337dc275 100644 --- a/src/pydvl/valuation/methods/data_shapley.py +++ b/src/pydvl/valuation/methods/data_shapley.py @@ -10,5 
+10,5 @@ class DataShapleyValuation(SemivalueValuation): algorithm_name = "Data-Shapley" - def coefficient(self, n: int, k: int) -> float: - return float(1 / math.comb(n - 1, k) / n) + def coefficient(self, n: int, k: int, other: float) -> float: + return other / math.comb(n - 1, k) / n diff --git a/src/pydvl/valuation/methods/delta_shapley.py b/src/pydvl/valuation/methods/delta_shapley.py index 4b921ac07..4852f6415 100644 --- a/src/pydvl/valuation/methods/delta_shapley.py +++ b/src/pydvl/valuation/methods/delta_shapley.py @@ -38,5 +38,5 @@ def __init__( ) super().__init__(utility, sampler, is_done, progress=progress) - def coefficient(self, n: int, k: int) -> float: - return float(1 / math.comb(n, k)) + def coefficient(self, n: int, k: int, other: float) -> float: + return other / math.comb(n, k) diff --git a/src/pydvl/valuation/methods/gt_shapley.py b/src/pydvl/valuation/methods/gt_shapley.py index c9aab6356..2f39cc8bb 100644 --- a/src/pydvl/valuation/methods/gt_shapley.py +++ b/src/pydvl/valuation/methods/gt_shapley.py @@ -222,7 +222,7 @@ def weight(n: int, subset_len: int) -> float: def make_strategy( self, utility: UtilityBase, - coefficient: Callable[[int, int], float] | None = None, + coefficient: Callable[[int, int, float], float] | None = None, ) -> EvaluationStrategy: raise NotImplementedError("This is not a semi-value sampler.") diff --git a/src/pydvl/valuation/methods/loo.py b/src/pydvl/valuation/methods/loo.py index 7267371e6..a785a2f06 100644 --- a/src/pydvl/valuation/methods/loo.py +++ b/src/pydvl/valuation/methods/loo.py @@ -43,9 +43,9 @@ def __init__(self, utility: UtilityBase, progress: bool = False): progress=progress, ) - def coefficient(self, n: int, k: int) -> float: + def coefficient(self, n: int, k: int, other: float) -> float: """ This is never actually used to filter out sets, because the LOOSampler returns only complements of {idx}, but it is required by the abstract class. 
""" - return 1 if k == n - 1 else 0 + return other if k == n - 1 else 0 diff --git a/src/pydvl/valuation/methods/msr_banzhaf.py b/src/pydvl/valuation/methods/msr_banzhaf.py index 5d4484996..29473916b 100644 --- a/src/pydvl/valuation/methods/msr_banzhaf.py +++ b/src/pydvl/valuation/methods/msr_banzhaf.py @@ -66,8 +66,9 @@ def __init__( progress=progress, ) - def coefficient(self, n: int, k: int) -> float: - return 1.0 + def coefficient(self, n: int, k: int, other: float) -> float: + # Coefficient is 1.0 for all n and k + return other def fit(self, data: Dataset) -> Self: """Calculate the MSR Banzhaf valuation on a dataset. diff --git a/src/pydvl/valuation/methods/owen_shapley.py b/src/pydvl/valuation/methods/owen_shapley.py index 94c29d52a..c1df2e655 100644 --- a/src/pydvl/valuation/methods/owen_shapley.py +++ b/src/pydvl/valuation/methods/owen_shapley.py @@ -76,5 +76,6 @@ def fit(self, dataset: Dataset) -> Self: self.result._status = Status.Converged return self - def coefficient(self, n: int, k: int) -> float: - return 1 + def coefficient(self, n: int, k: int, other: float) -> float: + # Coefficient is 1.0 for all n and k + return other diff --git a/src/pydvl/valuation/methods/semivalue.py b/src/pydvl/valuation/methods/semivalue.py index 76ed383ad..e35c9eed1 100644 --- a/src/pydvl/valuation/methods/semivalue.py +++ b/src/pydvl/valuation/methods/semivalue.py @@ -84,12 +84,21 @@ def __init__( self.tqdm_args.update(progress if isinstance(progress, dict) else {}) @abstractmethod - def coefficient(self, n: int, k: int) -> float: - """Computes the coefficient for a given subset size. + def coefficient(self, n: int, k: int, other: float) -> float: + """Returns the function computing the final coefficient to be used in the + semi-value valuation. + + The semi-value coefficient is a function of the number of elements in the set, + and the size of the subset for which the coefficient is being computed. 
+ Coefficients can be very large or very small, so that simply multiplying them + with the rest of the factors in a semi-value computation can lead to overflow or + underflow. To avoid this, we pass the other factors to this method, and delegate + the choice of whether to multiply or divide to the implementation. Args: n: Total number of elements in the set. k: Size of the subset for which the coefficient is being computed + other: Product of the remaining factors, e.g. the sampler weight. """ ... diff --git a/src/pydvl/valuation/samplers/base.py b/src/pydvl/valuation/samplers/base.py index 292b0d443..4222189fb 100644 --- a/src/pydvl/valuation/samplers/base.py +++ b/src/pydvl/valuation/samplers/base.py @@ -185,7 +185,7 @@ def weight(n: int, subset_len: int) -> float: def make_strategy( self, utility: UtilityBase, - coefficient: Callable[[int, int], float] | None = None, + coefficient: Callable[[int, int, float], float] | None = None, ) -> EvaluationStrategy: """Returns the strategy for this sampler.""" ... # return SomeEvaluationStrategy(self) @@ -242,7 +242,7 @@ def __init__( self, sampler: SamplerT, utility: UtilityBase, - coefficient: Callable[[int, int], float] | None = None, + coefficient: Callable[[int, int, float], float] | None = None, ): self.utility = utility self.n_indices = ( @@ -256,7 +256,7 @@ def __init__( if coefficient is not None: def coefficient_fun(n: int, subset_len: int) -> float: - return sampler.weight(n, subset_len) * coefficient(n, subset_len) + return coefficient(n, subset_len, sampler.weight(n, subset_len)) self.coefficient = coefficient_fun else: diff --git a/src/pydvl/valuation/samplers/classwise.py b/src/pydvl/valuation/samplers/classwise.py index 84cc7f20e..b1644c783 100644 --- a/src/pydvl/valuation/samplers/classwise.py +++ b/src/pydvl/valuation/samplers/classwise.py @@ -179,6 +179,6 @@ def sample_limit(self, indices: IndexSetT) -> int: def make_strategy( self, utility: UtilityBase, - coefficient: Callable[[int, int], float] | None = None, +
coefficient: Callable[[int, int, float], float] | None = None, ) -> EvaluationStrategy[IndexSampler, ValueUpdate]: return self.in_class.make_strategy(utility, coefficient) diff --git a/src/pydvl/valuation/samplers/msr.py b/src/pydvl/valuation/samplers/msr.py index 1afff71b0..b3226f68e 100644 --- a/src/pydvl/valuation/samplers/msr.py +++ b/src/pydvl/valuation/samplers/msr.py @@ -54,7 +54,7 @@ def weight(n: int, subset_len: int) -> float: def make_strategy( self, utility: UtilityBase, - coefficient: Callable[[int, int], float] | None = None, + coefficient: Callable[[int, int, float], float] | None = None, ) -> MSREvaluationStrategy: return MSREvaluationStrategy(self, utility, coefficient) @@ -64,10 +64,9 @@ class MSREvaluationStrategy(EvaluationStrategy[SamplerT, MSRValueUpdate]): The MSR evaluation strategy makes one utility evaluation per sample but generates `n_indices` many updates from it. The updates will be used to update two running - means that will later be combined into on final value. We send the - `ValueUpdate.kind` field to `ValueUpdateKind.POSITVE` or `ValueUpdateKind.NEGATIVE` + means that will later be combined into a final value. We set the + `ValueUpdate.kind` field to `ValueUpdateKind.POSITIVE` or `ValueUpdateKind.NEGATIVE` to decide which of the two running means is going to be updated.
- """ def process( diff --git a/src/pydvl/valuation/samplers/permutation.py b/src/pydvl/valuation/samplers/permutation.py index 79b398c8d..2000d2e96 100644 --- a/src/pydvl/valuation/samplers/permutation.py +++ b/src/pydvl/valuation/samplers/permutation.py @@ -22,7 +22,7 @@ import math from copy import copy from itertools import permutations -from typing import Callable, cast +from typing import Callable import numpy as np @@ -32,7 +32,6 @@ from pydvl.valuation.samplers.utils import StochasticSamplerMixin from pydvl.valuation.types import ( IndexSetT, - IndexT, NullaryPredicate, Sample, SampleBatch, @@ -98,7 +97,7 @@ def weight(n: int, subset_len: int) -> float: def make_strategy( self, utility: UtilityBase, - coefficient: Callable[[int, int], float] | None = None, + coefficient: Callable[[int, int, float], float] | None = None, ) -> PermutationEvaluationStrategy: return PermutationEvaluationStrategy(self, utility, coefficient) @@ -152,7 +151,7 @@ def __init__( self, sampler: PermutationSampler, utility: UtilityBase, - coefficient: Callable[[int, int], float] | None = None, + coefficient: Callable[[int, int, float], float] | None = None, ): super().__init__(sampler, utility, coefficient) self.truncation = copy(sampler.truncation) @@ -168,8 +167,6 @@ def process( curr = prev = self.utility(None) permutation = sample.subset for i, idx in enumerate(permutation): - # FIXME: type checker claims this could be Any (?) 
- idx = cast(IndexT, idx) if not truncated: new_sample = sample.with_idx(idx).with_subset(permutation[: i + 1]) curr = self.utility(new_sample) diff --git a/src/pydvl/valuation/samplers/powerset.py b/src/pydvl/valuation/samplers/powerset.py index 3821a8557..60258ab56 100644 --- a/src/pydvl/valuation/samplers/powerset.py +++ b/src/pydvl/valuation/samplers/powerset.py @@ -179,7 +179,7 @@ def index_iterator( def make_strategy( self, utility: UtilityBase, - coefficient: Callable[[int, int], float] | None = None, + coefficient: Callable[[int, int, float], float] | None = None, ) -> PowersetEvaluationStrategy: return PowersetEvaluationStrategy(self, utility, coefficient) @@ -236,7 +236,7 @@ def weight(n: int, subset_len: int) -> float: def make_strategy( self, utility: UtilityBase, - coefficient: Callable[[int, int], float] | None = None, + coefficient: Callable[[int, int, float], float] | None = None, ) -> EvaluationStrategy: return LOOEvaluationStrategy(self, utility, coefficient) @@ -251,7 +251,7 @@ def __init__( self, sampler: LOOSampler, utility: UtilityBase, - coefficient: Callable[[int, int], float] | None = None, + coefficient: Callable[[int, int, float], float] | None = None, ): super().__init__(sampler, utility, coefficient) assert utility.training_data is not None diff --git a/tests/valuation/methods/test_semivalues.py b/tests/valuation/methods/test_semivalues.py index 93c989cbc..c0191fbb1 100644 --- a/tests/valuation/methods/test_semivalues.py +++ b/tests/valuation/methods/test_semivalues.py @@ -85,7 +85,7 @@ def test_coefficients(n, valuation_class, kwargs): ) s = [ - math.comb(n - 1, j - 1) * valuation.coefficient(n, j - 1) + valuation.coefficient(n, j - 1, math.comb(n - 1, j - 1)) for j in range(1, n + 1) ] assert np.isclose(1, np.sum(s))