tslearn-team
diff --git a/‎tslearn/preprocessing/__init__.py‎
Lines changed: 4 additions & 2 deletions b/‎tslearn/preprocessing/__init__.py‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎tslearn/preprocessing/preprocessing.py‎
Lines changed: 227 additions & 15 deletions b/‎tslearn/preprocessing/preprocessing.py‎
Lines changed: 227 additions & 15 deletions
@@ -6,11 +6,13 @@
 from .preprocessing import (
     TimeSeriesScalerMeanVariance,
     TimeSeriesScalerMinMax,
-    TimeSeriesResampler
+    TimeSeriesResampler,
+    TimeSeriesImputer
 )
 
 __all__ = [
     "TimeSeriesResampler",
     "TimeSeriesScalerMinMax",
-    "TimeSeriesScalerMeanVariance"
+    "TimeSeriesScalerMeanVariance",
+    "TimeSeriesImputer"
 ]
@@ -1,13 +1,17 @@
-import numpy
+from math import nan
+from typing import Callable, Optional, Union
 
-from scipy.interpolate import interp1d
+import numpy
 
 from sklearn.base import TransformerMixin
 from sklearn.utils.validation import check_is_fitted
 
 from tslearn.bases import TimeSeriesBaseEstimator
+from tslearn.bases.bases import ALLOW_VARIABLE_LENGTH
 from tslearn.utils import (
+    check_variable_length_input,
     to_time_series_dataset,
+    to_time_series,
     check_equal_size,
     ts_size,
     check_array,
@@ -17,14 +21,15 @@
 __author__ = 'Romain Tavenard romain.tavenard[at]univ-rennes2.fr'
 
 
-class TimeSeriesResampler(TransformerMixin):
+class TimeSeriesResampler(TransformerMixin, TimeSeriesBaseEstimator):
     """Resampler for time series. Resample time series so that they reach the
     target size.
 
     Parameters
     ----------
-    sz : int
-        Size of the output time series.
+    sz : int (default: -1)
+        Size of the output time series. If not strictly positive, the size of
+        the longuest timeseries in the dataset is used.
 
     Examples
     --------
@@ -35,8 +40,11 @@ class TimeSeriesResampler(TransformerMixin):
             [4.5],
             [6. ]]])
     """
-    def __init__(self, sz):
-        self.sz_ = sz
+    def __init__(self, sz: int=-1):
+        self.sz = sz
+
+    def _get_resampling_size(self, X):
+        return self.sz if self.sz > 0 else X.shape[1]
 
     def fit(self, X, y=None, **kwargs):
         """A dummy method such that it complies to the sklearn requirements.
@@ -51,11 +59,14 @@ def fit(self, X, y=None, **kwargs):
         -------
         self
         """
+        X_ = check_variable_length_input(X)
+        self._X_fit_dims = X_.shape
+
         return self
 
     def _transform_unit_sz(self, X):
         n_ts, sz, d = X.shape
-        X_out = numpy.empty((n_ts, self.sz_, d))
+        X_out = numpy.empty((n_ts, 1, d))
         for i in range(X.shape[0]):
             X_out[i] = numpy.nanmean(X[i], axis=0, keepdims=True)
         return X_out
@@ -88,22 +99,34 @@ def transform(self, X, y=None, **kwargs):
         numpy.ndarray
             Resampled time series dataset.
         """
-        X_ = to_time_series_dataset(X)
-        if self.sz_ == 1:
+        check_is_fitted(self, '_X_fit_dims')
+
+        X_ = check_variable_length_input(X)
+        X_ = check_dims(X_, X_fit_dims=self._X_fit_dims, extend=False)
+
+        target_sz = self._get_resampling_size(X_)
+        if target_sz == 1:
             return self._transform_unit_sz(X_)
+
         n_ts, sz, d = X_.shape
         equal_size = check_equal_size(X_)
-        X_out = numpy.empty((n_ts, self.sz_, d))
+        X_out = numpy.empty((n_ts, target_sz, d))
         for i in range(X_.shape[0]):
-            xnew = numpy.linspace(0, 1, self.sz_)
             if not equal_size:
                 sz = ts_size(X_[i])
             for di in range(d):
-                f = interp1d(numpy.linspace(0, 1, sz), X_[i, :sz, di],
-                             kind="slinear")
-                X_out[i, :, di] = f(xnew)
+                X_out[i, :, di] = numpy.interp(
+                    numpy.linspace(0, 1, target_sz),
+                    numpy.linspace(0, 1, sz),
+                    X_[i, :sz, di]
+                    )
         return X_out
 
+    def _more_tags(self):
+        more_tags = super()._more_tags()
+        more_tags.update({'allow_nan': True, ALLOW_VARIABLE_LENGTH: True})
+        return more_tags
+
 
 class TimeSeriesScalerMinMax(TransformerMixin, TimeSeriesBaseEstimator):
     """Scaler for time series datasets. Scales features values so that their span in given dimensions
@@ -347,3 +370,192 @@ def _more_tags(self):
         more_tags = super()._more_tags()
         more_tags.update({'allow_nan': True})
         return more_tags
+
+
+class TimeSeriesImputer(TransformerMixin, TimeSeriesBaseEstimator):
+    """Missing value imputer for time series.
+
+    Missing values are replaced according to the choosen imputation method.
+    There might be cases where the computation of missing values is impossible,
+    in which case nans are left unchanged
+    (ex: mean of all nans, ffill for the first value... ).
+
+    Parameters
+    ----------
+    method : {'mean', 'median', 'ffill', 'bfill', 'constant'} or
+             Callable (default: 'mean')
+        The method used to compute missing values.
+        When using a Callable, the function should take an array-like
+         representing a timeseries with missing values as input parameter and
+         should return the transformed timeseries.
+    value: float (default: nan)
+        The value to replace missing values with. Only used when method is
+        "constant".
+    keep_trailing_nans: bool (default: True)
+        Whether the trailing nans should be considered as padding for variable
+        length time series and kept unprocessed. When set to false, trailing nans
+         will be imputed, which can be usefull when feeding the imputer with
+        ref:`to_time_series_dataset <fun-tslearn.utils.to_time_series_dataset>`
+         results.
+
+    Notes
+    -----
+        This method allows datasets of variable lenght time series.
+        While most missing values should be replaced, there might still be nan
+        values in the resulting dataset representing padding when used with
+        variable length time series.
+
+    Examples
+    --------
+    >>> import math
+    >>> TimeSeriesImputer().fit_transform([[0, math.nan, 6]])
+    array([[[0.],
+            [3.],
+            [6.]]])
+    >>> TimeSeriesImputer().fit_transform([[numpy.nan, 3, 6], [numpy.nan, 3]])
+    array([[[4.5],
+            [3. ],
+            [6. ]],
+    <BLANKLINE>
+           [[3. ],
+            [3. ],
+            [nan]]])
+    >>> TimeSeriesImputer('ffill').fit_transform([[[1, math.nan], [2, 3]], [[3, 4], [4, math.nan]]])
+    array([[[ 1., nan],
+            [ 2.,  3.]],
+    <BLANKLINE>
+           [[ 3.,  4.],
+            [ 4.,  4.]]])
+    """
+    def __init__(self,
+                 method: Union[str, Callable]="mean",
+                 value:  Optional[float]=nan,
+                 keep_trailing_nans: bool = False):
+        self.method = method
+        self.value = value
+        self.keep_trailing_nans = keep_trailing_nans
+        super().__init__()
+
+    @property
+    def _imputer(self):
+        if callable(self.method):
+            return self.method
+
+        if hasattr(self, "_{}_impute".format(self.method)):
+            return getattr(self, "_{}_impute".format(self.method))
+        return None
+
+    def _constant_impute(self, ts):
+        return numpy.where(numpy.isnan(ts), self.value, ts)
+
+    @staticmethod
+    def _mean_impute(ts):
+        return numpy.where(numpy.isnan(ts), numpy.nanmean(ts, axis=0, keepdims=True), ts)
+
+    @staticmethod
+    def _median_impute(ts):
+        return numpy.where(numpy.isnan(ts), numpy.nanmedian(ts, axis=0, keepdims=True), ts)
+
+    @staticmethod
+    def _ffill_impute(ts):
+        # Forward fill
+        mask = numpy.isnan(ts)
+        idx = numpy.where(
+            ~mask,
+            numpy.arange(ts.shape[0]).reshape(ts.shape[0], 1),
+            0
+        )
+        numpy.maximum.accumulate(idx, axis=0, out=idx)
+        if ts.shape[-1] > 1:
+            # Multivariate
+            ts[mask] = ts[idx[mask], numpy.nonzero(mask)[1]]
+        else:
+            # Univariate
+            ts[mask] = ts[idx[mask]].flatten()
+        return ts
+
+    @staticmethod
+    def _bfill_impute(ts):
+        # Backward fill
+        mask = numpy.isnan(ts)
+        idx = numpy.where(
+            ~mask,
+            numpy.arange(ts.shape[0]).reshape(ts.shape[0], 1),
+            ts.shape[0] -1
+        )
+        numpy.minimum.accumulate(numpy.flip(idx, axis=0), axis=0, out=idx)
+        idx = numpy.flip(idx, axis=0)
+        if ts.shape[-1] > 1:
+            # Multivariate
+            ts[mask] = ts[idx[mask], numpy.nonzero(mask)[1]]
+        else:
+            # Univariate
+            ts[mask] = ts[idx[mask]].flatten()
+        return ts
+
+    def fit(self, X, y=None, **kwargs):
+        """A dummy method such that it complies to the sklearn requirements.
+        Since this method is completely stateless, it just returns itself.
+
+        Parameters
+        ----------
+        X
+            Ignored
+
+        Returns
+        -------
+        self
+        """
+        X_ = check_variable_length_input(X)
+        self._X_fit_dims = X_.shape
+        return self
+
+    def fit_transform(self, X, y=None, **kwargs):
+        """Fit to data, then transform it.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_ts, sz, d)
+            Time series dataset to be imputed.
+
+        Returns
+        -------
+        numpy.ndarray
+            Imputed time series dataset.
+        """
+        return self.fit(X).transform(X, kwargs)
+
+    def transform(self, X, y=None, **kwargs):
+        """Fit to data, then transform it.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_ts, sz, d)
+            Time series dataset to be imputed
+
+        Returns
+        -------
+        numpy.ndarray
+            Imputed time series dataset
+        """
+        check_is_fitted(self, '_X_fit_dims')
+
+        X_ = check_variable_length_input(X)
+        X_ = check_dims(X_, X_fit_dims=self._X_fit_dims, extend=False)
+
+        imputer = self._imputer
+        if imputer is None:
+            raise ValueError("Imputer {} not implemented.".format(self.method))
+
+        for ts_index in range(X_.shape[0]):
+            ts = to_time_series(X[ts_index])
+            stop_index = ts.shape[0]
+            if self.keep_trailing_nans:
+                stop_index = ts_size(ts)
+            X_[ts_index, :stop_index] = imputer(ts[:stop_index])
+        return to_time_series_dataset(X_)
+
+    def _more_tags(self):
+        more_tags = super()._more_tags()
+        more_tags.update({'allow_nan': True, ALLOW_VARIABLE_LENGTH: True})
+        return more_tags