Skip to content

Commit 0d92455

Browse files
authored
Add imputation capability (#577)
Add a TimeSeriesImputer class to handle basic imputation. Imputation is done per time series, using one of the available methods (mean, median, forward fill, backward fill). When used with variable-length time series, the keep_trailing_nans parameter controls whether the NaN padding is processed by the imputer.
1 parent 49dbbc4 commit 0d92455

File tree

5 files changed

+422
-24
lines changed

5 files changed

+422
-24
lines changed

tslearn/preprocessing/__init__.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,13 @@
66
from .preprocessing import (
77
TimeSeriesScalerMeanVariance,
88
TimeSeriesScalerMinMax,
9-
TimeSeriesResampler
9+
TimeSeriesResampler,
10+
TimeSeriesImputer
1011
)
1112

1213
__all__ = [
1314
"TimeSeriesResampler",
1415
"TimeSeriesScalerMinMax",
15-
"TimeSeriesScalerMeanVariance"
16+
"TimeSeriesScalerMeanVariance",
17+
"TimeSeriesImputer"
1618
]

tslearn/preprocessing/preprocessing.py

Lines changed: 227 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,17 @@
1-
import numpy
1+
from math import nan
2+
from typing import Callable, Optional, Union
23

3-
from scipy.interpolate import interp1d
4+
import numpy
45

56
from sklearn.base import TransformerMixin
67
from sklearn.utils.validation import check_is_fitted
78

89
from tslearn.bases import TimeSeriesBaseEstimator
10+
from tslearn.bases.bases import ALLOW_VARIABLE_LENGTH
911
from tslearn.utils import (
12+
check_variable_length_input,
1013
to_time_series_dataset,
14+
to_time_series,
1115
check_equal_size,
1216
ts_size,
1317
check_array,
@@ -17,14 +21,15 @@
1721
__author__ = 'Romain Tavenard romain.tavenard[at]univ-rennes2.fr'
1822

1923

20-
class TimeSeriesResampler(TransformerMixin):
24+
class TimeSeriesResampler(TransformerMixin, TimeSeriesBaseEstimator):
2125
"""Resampler for time series. Resample time series so that they reach the
2226
target size.
2327
2428
Parameters
2529
----------
26-
sz : int
27-
Size of the output time series.
30+
sz : int (default: -1)
31+
Size of the output time series. If not strictly positive, the size of
32+
the longest time series in the dataset is used.
2833
2934
Examples
3035
--------
@@ -35,8 +40,11 @@ class TimeSeriesResampler(TransformerMixin):
3540
[4.5],
3641
[6. ]]])
3742
"""
38-
def __init__(self, sz):
39-
self.sz_ = sz
43+
def __init__(self, sz: int=-1):
44+
self.sz = sz
45+
46+
def _get_resampling_size(self, X):
47+
return self.sz if self.sz > 0 else X.shape[1]
4048

4149
def fit(self, X, y=None, **kwargs):
4250
"""A dummy method such that it complies to the sklearn requirements.
@@ -51,11 +59,14 @@ def fit(self, X, y=None, **kwargs):
5159
-------
5260
self
5361
"""
62+
X_ = check_variable_length_input(X)
63+
self._X_fit_dims = X_.shape
64+
5465
return self
5566

5667
def _transform_unit_sz(self, X):
5768
n_ts, sz, d = X.shape
58-
X_out = numpy.empty((n_ts, self.sz_, d))
69+
X_out = numpy.empty((n_ts, 1, d))
5970
for i in range(X.shape[0]):
6071
X_out[i] = numpy.nanmean(X[i], axis=0, keepdims=True)
6172
return X_out
@@ -88,22 +99,34 @@ def transform(self, X, y=None, **kwargs):
8899
numpy.ndarray
89100
Resampled time series dataset.
90101
"""
91-
X_ = to_time_series_dataset(X)
92-
if self.sz_ == 1:
102+
check_is_fitted(self, '_X_fit_dims')
103+
104+
X_ = check_variable_length_input(X)
105+
X_ = check_dims(X_, X_fit_dims=self._X_fit_dims, extend=False)
106+
107+
target_sz = self._get_resampling_size(X_)
108+
if target_sz == 1:
93109
return self._transform_unit_sz(X_)
110+
94111
n_ts, sz, d = X_.shape
95112
equal_size = check_equal_size(X_)
96-
X_out = numpy.empty((n_ts, self.sz_, d))
113+
X_out = numpy.empty((n_ts, target_sz, d))
97114
for i in range(X_.shape[0]):
98-
xnew = numpy.linspace(0, 1, self.sz_)
99115
if not equal_size:
100116
sz = ts_size(X_[i])
101117
for di in range(d):
102-
f = interp1d(numpy.linspace(0, 1, sz), X_[i, :sz, di],
103-
kind="slinear")
104-
X_out[i, :, di] = f(xnew)
118+
X_out[i, :, di] = numpy.interp(
119+
numpy.linspace(0, 1, target_sz),
120+
numpy.linspace(0, 1, sz),
121+
X_[i, :sz, di]
122+
)
105123
return X_out
106124

125+
def _more_tags(self):
126+
more_tags = super()._more_tags()
127+
more_tags.update({'allow_nan': True, ALLOW_VARIABLE_LENGTH: True})
128+
return more_tags
129+
107130

108131
class TimeSeriesScalerMinMax(TransformerMixin, TimeSeriesBaseEstimator):
109132
"""Scaler for time series datasets. Scales features values so that their span in given dimensions
@@ -347,3 +370,192 @@ def _more_tags(self):
347370
more_tags = super()._more_tags()
348371
more_tags.update({'allow_nan': True})
349372
return more_tags
373+
374+
375+
class TimeSeriesImputer(TransformerMixin, TimeSeriesBaseEstimator):
    """Missing value imputer for time series.

    Missing values are replaced according to the chosen imputation method.
    Imputation is performed independently for each time series of the
    dataset. There might be cases where the computation of missing values is
    impossible, in which case nans are left unchanged
    (ex: mean of all nans, ffill for the first value... ).

    Parameters
    ----------
    method : {'mean', 'median', 'ffill', 'bfill', 'constant'} or
             Callable (default: 'mean')
        The method used to compute missing values.
        When using a Callable, the function should take an array-like
        representing a timeseries with missing values as input parameter and
        should return the transformed timeseries.
    value: float (default: nan)
        The value to replace missing values with. Only used when method is
        "constant".
    keep_trailing_nans: bool (default: False)
        Whether the trailing nans should be considered as padding for variable
        length time series and kept unprocessed. When set to false, trailing
        nans will be imputed, which can be useful when feeding the imputer
        with
        :ref:`to_time_series_dataset <fun-tslearn.utils.to_time_series_dataset>`
        results.

    Notes
    -----
    This method allows datasets of variable length time series.
    While most missing values should be replaced, there might still be nan
    values in the resulting dataset representing padding when used with
    variable length time series.

    Examples
    --------
    >>> import math
    >>> TimeSeriesImputer().fit_transform([[0, math.nan, 6]])
    array([[[0.],
            [3.],
            [6.]]])
    >>> TimeSeriesImputer().fit_transform([[numpy.nan, 3, 6], [numpy.nan, 3]])
    array([[[4.5],
            [3. ],
            [6. ]],
    <BLANKLINE>
           [[3. ],
            [3. ],
            [nan]]])
    >>> TimeSeriesImputer('ffill').fit_transform([[[1, math.nan], [2, 3]], [[3, 4], [4, math.nan]]])
    array([[[ 1., nan],
            [ 2.,  3.]],
    <BLANKLINE>
           [[ 3.,  4.],
            [ 4.,  4.]]])
    """
    def __init__(self,
                 method: Union[str, Callable] = "mean",
                 value: Optional[float] = nan,
                 keep_trailing_nans: bool = False):
        self.method = method
        self.value = value
        self.keep_trailing_nans = keep_trailing_nans
        super().__init__()

    @property
    def _imputer(self):
        """Return the callable implementing ``self.method``.

        A user-supplied callable is returned as is. Otherwise the method name
        is resolved to the ``_<method>_impute`` attribute of this object.
        ``None`` is returned when no such attribute exists.
        """
        if callable(self.method):
            return self.method
        return getattr(self, "_{}_impute".format(self.method), None)

    def _constant_impute(self, ts):
        """Replace every nan of ``ts`` by ``self.value``."""
        return numpy.where(numpy.isnan(ts), self.value, ts)

    @staticmethod
    def _mean_impute(ts):
        """Replace nans by the nan-ignoring mean taken along axis 0."""
        return numpy.where(
            numpy.isnan(ts),
            numpy.nanmean(ts, axis=0, keepdims=True),
            ts
        )

    @staticmethod
    def _median_impute(ts):
        """Replace nans by the nan-ignoring median taken along axis 0."""
        return numpy.where(
            numpy.isnan(ts),
            numpy.nanmedian(ts, axis=0, keepdims=True),
            ts
        )

    @staticmethod
    def _ffill_impute(ts):
        """Forward fill: replace each nan by the last valid value before it.

        ``ts`` is indexed as a 2-dimensional array (rows are timestamps,
        columns are features). Leading nans have no previous valid value and
        are left unchanged. The input array is not modified; a filled copy is
        returned.
        """
        mask = numpy.isnan(ts)
        n_steps = ts.shape[0]
        # For each cell, the row index of the most recent valid value in the
        # same column (0 when there is none yet, which points at a nan).
        idx = numpy.where(
            mask,
            0,
            numpy.arange(n_steps).reshape(n_steps, 1)
        )
        numpy.maximum.accumulate(idx, axis=0, out=idx)
        rows, cols = numpy.nonzero(mask)
        # Work on a copy so that the caller's array is never overwritten.
        filled = numpy.array(ts, copy=True)
        filled[rows, cols] = ts[idx[rows, cols], cols]
        return filled

    @staticmethod
    def _bfill_impute(ts):
        """Backward fill: replace each nan by the next valid value after it.

        ``ts`` is indexed as a 2-dimensional array (rows are timestamps,
        columns are features). Trailing nans have no following valid value
        and are left unchanged. The input array is not modified; a filled
        copy is returned.
        """
        mask = numpy.isnan(ts)
        n_steps = ts.shape[0]
        # For each cell, the row index of the next valid value in the same
        # column (last row when there is none, which points at a nan).
        idx = numpy.where(
            mask,
            n_steps - 1,
            numpy.arange(n_steps).reshape(n_steps, 1)
        )
        # Running minimum computed from the end of the series backwards.
        idx = numpy.flip(
            numpy.minimum.accumulate(numpy.flip(idx, axis=0), axis=0),
            axis=0
        )
        rows, cols = numpy.nonzero(mask)
        # Work on a copy so that the caller's array is never overwritten.
        filled = numpy.array(ts, copy=True)
        filled[rows, cols] = ts[idx[rows, cols], cols]
        return filled

    def fit(self, X, y=None, **kwargs):
        """Record the shape of the training dataset.

        No imputation statistic is learnt here (imputation is computed per
        time series at transform time); only the shape of ``X`` is stored so
        that ``transform`` can check the data it receives against it.

        Parameters
        ----------
        X : array-like of shape (n_ts, sz, d)
            Time series dataset.
        y
            Ignored

        Returns
        -------
        self
        """
        X_ = check_variable_length_input(X)
        self._X_fit_dims = X_.shape
        return self

    def fit_transform(self, X, y=None, **kwargs):
        """Fit to data, then transform it.

        Parameters
        ----------
        X : array-like of shape (n_ts, sz, d)
            Time series dataset to be imputed.
        y
            Ignored

        Returns
        -------
        numpy.ndarray
            Imputed time series dataset.
        """
        # Forward keyword arguments as keywords (they used to be passed
        # positionally in place of ``y``).
        return self.fit(X).transform(X, **kwargs)

    def transform(self, X, y=None, **kwargs):
        """Impute the missing values of each time series of the dataset.

        Parameters
        ----------
        X : array-like of shape (n_ts, sz, d)
            Time series dataset to be imputed
        y
            Ignored

        Returns
        -------
        numpy.ndarray
            Imputed time series dataset

        Raises
        ------
        ValueError
            If ``method`` is neither a callable nor the name of an available
            imputation method.
        """
        check_is_fitted(self, '_X_fit_dims')

        X_ = check_variable_length_input(X)
        X_ = check_dims(X_, X_fit_dims=self._X_fit_dims, extend=False)

        imputer = self._imputer
        if imputer is None:
            raise ValueError("Imputer {} not implemented.".format(self.method))

        for ts_index in range(X_.shape[0]):
            # NOTE(review): each series is read from the original ``X`` and
            # not from ``X_``, presumably so that the padding introduced by
            # the dataset conversion is not mistaken for missing data --
            # confirm before changing.
            ts = to_time_series(X[ts_index])
            if self.keep_trailing_nans:
                stop_index = ts_size(ts)
            else:
                stop_index = ts.shape[0]
            X_[ts_index, :stop_index] = imputer(ts[:stop_index])
        return to_time_series_dataset(X_)

    def _more_tags(self):
        """Declare that nans and variable length inputs are supported."""
        more_tags = super()._more_tags()
        more_tags.update({'allow_nan': True, ALLOW_VARIABLE_LENGTH: True})
        return more_tags

0 commit comments

Comments
 (0)