1- import numpy
1+ from math import nan
2+ from typing import Callable , Optional , Union
23
3- from scipy . interpolate import interp1d
4+ import numpy
45
56from sklearn .base import TransformerMixin
67from sklearn .utils .validation import check_is_fitted
78
89from tslearn .bases import TimeSeriesBaseEstimator
10+ from tslearn .bases .bases import ALLOW_VARIABLE_LENGTH
911from tslearn .utils import (
12+ check_variable_length_input ,
1013 to_time_series_dataset ,
14+ to_time_series ,
1115 check_equal_size ,
1216 ts_size ,
1317 check_array ,
1721__author__ = 'Romain Tavenard romain.tavenard[at]univ-rennes2.fr'
1822
1923
20- class TimeSeriesResampler (TransformerMixin ):
24+ class TimeSeriesResampler (TransformerMixin , TimeSeriesBaseEstimator ):
2125 """Resampler for time series. Resample time series so that they reach the
2226 target size.
2327
2428 Parameters
2529 ----------
26- sz : int
27- Size of the output time series.
30+ sz : int (default: -1)
31+ Size of the output time series. If not strictly positive, the size of
32+ the longest time series in the dataset is used.
2833
2934 Examples
3035 --------
@@ -35,8 +40,11 @@ class TimeSeriesResampler(TransformerMixin):
3540 [4.5],
3641 [6. ]]])
3742 """
38- def __init__ (self , sz ):
39- self .sz_ = sz
43+ def __init__ (self , sz : int = - 1 ):
44+ self .sz = sz
45+
46+ def _get_resampling_size (self , X ):
47+ return self .sz if self .sz > 0 else X .shape [1 ]
4048
4149 def fit (self , X , y = None , ** kwargs ):
4250 """A dummy method such that it complies to the sklearn requirements.
@@ -51,11 +59,14 @@ def fit(self, X, y=None, **kwargs):
5159 -------
5260 self
5361 """
62+ X_ = check_variable_length_input (X )
63+ self ._X_fit_dims = X_ .shape
64+
5465 return self
5566
5667 def _transform_unit_sz (self , X ):
5768 n_ts , sz , d = X .shape
58- X_out = numpy .empty ((n_ts , self . sz_ , d ))
69+ X_out = numpy .empty ((n_ts , 1 , d ))
5970 for i in range (X .shape [0 ]):
6071 X_out [i ] = numpy .nanmean (X [i ], axis = 0 , keepdims = True )
6172 return X_out
@@ -88,22 +99,34 @@ def transform(self, X, y=None, **kwargs):
8899 numpy.ndarray
89100 Resampled time series dataset.
90101 """
91- X_ = to_time_series_dataset (X )
92- if self .sz_ == 1 :
102+ check_is_fitted (self , '_X_fit_dims' )
103+
104+ X_ = check_variable_length_input (X )
105+ X_ = check_dims (X_ , X_fit_dims = self ._X_fit_dims , extend = False )
106+
107+ target_sz = self ._get_resampling_size (X_ )
108+ if target_sz == 1 :
93109 return self ._transform_unit_sz (X_ )
110+
94111 n_ts , sz , d = X_ .shape
95112 equal_size = check_equal_size (X_ )
96- X_out = numpy .empty ((n_ts , self . sz_ , d ))
113+ X_out = numpy .empty ((n_ts , target_sz , d ))
97114 for i in range (X_ .shape [0 ]):
98- xnew = numpy .linspace (0 , 1 , self .sz_ )
99115 if not equal_size :
100116 sz = ts_size (X_ [i ])
101117 for di in range (d ):
102- f = interp1d (numpy .linspace (0 , 1 , sz ), X_ [i , :sz , di ],
103- kind = "slinear" )
104- X_out [i , :, di ] = f (xnew )
118+ X_out [i , :, di ] = numpy .interp (
119+ numpy .linspace (0 , 1 , target_sz ),
120+ numpy .linspace (0 , 1 , sz ),
121+ X_ [i , :sz , di ]
122+ )
105123 return X_out
106124
def _more_tags(self):
    """Return the parent's estimator tags extended for this transformer.

    The tags inherited through ``super()`` are updated so that
    ``'allow_nan'`` and the ``ALLOW_VARIABLE_LENGTH`` tag are both True.
    """
    tags = super()._more_tags()
    overrides = {'allow_nan': True, ALLOW_VARIABLE_LENGTH: True}
    tags.update(overrides)
    return tags
129+
107130
108131class TimeSeriesScalerMinMax (TransformerMixin , TimeSeriesBaseEstimator ):
109132 """Scaler for time series datasets. Scales features values so that their span in given dimensions
@@ -347,3 +370,192 @@ def _more_tags(self):
347370 more_tags = super ()._more_tags ()
348371 more_tags .update ({'allow_nan' : True })
349372 return more_tags
373+
374+
class TimeSeriesImputer(TransformerMixin, TimeSeriesBaseEstimator):
    """Missing value imputer for time series.

    Missing values are replaced according to the chosen imputation method.
    There might be cases where the computation of missing values is
    impossible, in which case nans are left unchanged
    (ex: mean of all nans, ffill for the first value... ).

    Parameters
    ----------
    method : {'mean', 'median', 'ffill', 'bfill', 'constant'} or
             Callable (default: 'mean')
        The method used to compute missing values.
        When using a Callable, the function should take an array-like
        representing a time series with missing values as input parameter
        and should return the transformed time series.
    value : float (default: nan)
        The value to replace missing values with. Only used when method is
        "constant".
    keep_trailing_nans : bool (default: False)
        Whether the trailing nans should be considered as padding for
        variable length time series and kept unprocessed. When set to
        False, trailing nans will be imputed, which can be useful when
        feeding the imputer with
        :ref:`to_time_series_dataset <fun-tslearn.utils.to_time_series_dataset>`
        results.

    Notes
    -----
    This method allows datasets of variable length time series.
    While most missing values should be replaced, there might still be nan
    values in the resulting dataset representing padding when used with
    variable length time series.

    Examples
    --------
    >>> import math
    >>> TimeSeriesImputer().fit_transform([[0, math.nan, 6]])
    array([[[0.],
            [3.],
            [6.]]])
    >>> TimeSeriesImputer().fit_transform([[numpy.nan, 3, 6], [numpy.nan, 3]])
    array([[[4.5],
            [3. ],
            [6. ]],
    <BLANKLINE>
           [[3. ],
            [3. ],
            [nan]]])
    >>> TimeSeriesImputer('ffill').fit_transform([[[1, math.nan], [2, 3]], [[3, 4], [4, math.nan]]])
    array([[[ 1., nan],
            [ 2.,  3.]],
    <BLANKLINE>
           [[ 3.,  4.],
            [ 4.,  4.]]])
    """
    def __init__(self,
                 method: Union[str, Callable] = "mean",
                 value: Optional[float] = nan,
                 keep_trailing_nans: bool = False):
        self.method = method
        self.value = value
        self.keep_trailing_nans = keep_trailing_nans
        super().__init__()

    @property
    def _imputer(self):
        """Callable performing the imputation of one time series.

        Returns ``self.method`` itself when it is callable, otherwise the
        attribute named ``_<method>_impute`` of this object, or None when no
        such attribute exists.
        """
        if callable(self.method):
            return self.method
        return getattr(self, "_{}_impute".format(self.method), None)

    def _constant_impute(self, ts):
        """Replace every nan of ``ts`` with ``self.value``."""
        return numpy.where(numpy.isnan(ts), self.value, ts)

    @staticmethod
    def _mean_impute(ts):
        """Replace nans with the nan-ignoring mean taken along axis 0."""
        column_means = numpy.nanmean(ts, axis=0, keepdims=True)
        return numpy.where(numpy.isnan(ts), column_means, ts)

    @staticmethod
    def _median_impute(ts):
        """Replace nans with the nan-ignoring median taken along axis 0."""
        column_medians = numpy.nanmedian(ts, axis=0, keepdims=True)
        return numpy.where(numpy.isnan(ts), column_medians, ts)

    @staticmethod
    def _fill_along_time(ts, backward):
        """Propagate the nearest valid value along axis 0 into nan entries.

        Parameters
        ----------
        ts : numpy.ndarray of shape (sz, d)
            Time series with missing values. It is not modified.
        backward : bool
            When False, each nan takes the last valid value found before it
            in the same column (forward fill). When True, it takes the next
            valid value found after it (backward fill).

        Returns
        -------
        numpy.ndarray of shape (sz, d)
            A filled copy of ``ts``. Nans with no valid value on the
            relevant side are left unchanged.
        """
        # Work on a copy so the caller's array is never altered, like the
        # mean / median / constant imputers which also return new arrays.
        filled = numpy.array(ts, copy=True)
        mask = numpy.isnan(filled)
        n_rows = filled.shape[0]
        row_ids = numpy.arange(n_rows)[:, numpy.newaxis]
        if backward:
            # Missing entries start at the last row index, then a reversed
            # running minimum yields the next valid row for every position.
            source = numpy.where(~mask, row_ids, n_rows - 1)
            source = numpy.minimum.accumulate(source[::-1], axis=0)[::-1]
        else:
            # Missing entries start at row 0, then a running maximum yields
            # the previous valid row for every position.
            source = numpy.where(~mask, row_ids, 0)
            source = numpy.maximum.accumulate(source, axis=0)
        # Pair each missing cell with its own column index; this single
        # expression covers both the univariate and the multivariate case.
        filled[mask] = filled[source[mask], numpy.nonzero(mask)[1]]
        return filled

    @staticmethod
    def _ffill_impute(ts):
        """Forward fill: replace each nan with the previous valid value."""
        return TimeSeriesImputer._fill_along_time(ts, backward=False)

    @staticmethod
    def _bfill_impute(ts):
        """Backward fill: replace each nan with the next valid value."""
        return TimeSeriesImputer._fill_along_time(ts, backward=True)

    def fit(self, X, y=None, **kwargs):
        """Record the dimensions of the training dataset.

        No imputation statistic is learnt here; only the shape of ``X`` is
        stored so that ``transform`` can check its input against it.

        Parameters
        ----------
        X : array-like of shape (n_ts, sz, d)
            Time series dataset.
        y
            Ignored

        Returns
        -------
        self
        """
        X_ = check_variable_length_input(X)
        self._X_fit_dims = X_.shape
        return self

    def fit_transform(self, X, y=None, **kwargs):
        """Fit to data, then transform it.

        Parameters
        ----------
        X : array-like of shape (n_ts, sz, d)
            Time series dataset to be imputed.
        y
            Ignored

        Returns
        -------
        numpy.ndarray
            Imputed time series dataset.
        """
        # Forward keyword arguments as keywords: passing the kwargs dict
        # positionally would bind it to the ``y`` parameter of transform.
        return self.fit(X, y, **kwargs).transform(X, **kwargs)

    def transform(self, X, y=None, **kwargs):
        """Impute the missing values of a dataset.

        Parameters
        ----------
        X : array-like of shape (n_ts, sz, d)
            Time series dataset to be imputed
        y
            Ignored

        Returns
        -------
        numpy.ndarray
            Imputed time series dataset

        Raises
        ------
        ValueError
            If ``method`` is neither a callable nor the name of an
            implemented imputation method.
        """
        check_is_fitted(self, '_X_fit_dims')

        X_ = check_variable_length_input(X)
        X_ = check_dims(X_, X_fit_dims=self._X_fit_dims, extend=False)

        imputer = self._imputer
        if imputer is None:
            raise ValueError(
                "Imputer {} not implemented.".format(self.method)
            )

        for ts_index in range(X_.shape[0]):
            # Each series is re-read from the raw input ``X`` (not ``X_``).
            # NOTE(review): presumably so that a ragged input keeps its own
            # length and padding is not imputed -- confirm.
            ts = to_time_series(X[ts_index])
            stop_index = ts.shape[0]
            if self.keep_trailing_nans:
                # Restrict imputation to the part before trailing nans.
                stop_index = ts_size(ts)
            X_[ts_index, :stop_index] = imputer(ts[:stop_index])
        return to_time_series_dataset(X_)

    def _more_tags(self):
        """Return parent tags with nan and variable length support set."""
        more_tags = super()._more_tags()
        more_tags.update({'allow_nan': True, ALLOW_VARIABLE_LENGTH: True})
        return more_tags
0 commit comments