diff --git a/dataprep_ml/cleaners.py b/dataprep_ml/cleaners.py index 821bd11..b8553a5 100644 --- a/dataprep_ml/cleaners.py +++ b/dataprep_ml/cleaners.py @@ -507,6 +507,10 @@ def clean_timeseries(df: pd.DataFrame, tss: dict) -> pd.DataFrame: # save original order of columns orig_cols = deepcopy(df.columns.to_list()) + + # cast order_by as numerical + df[tss['order_by']] = pd.to_numeric(df[tss['order_by']], errors='raise') + # fix duplicates by group if tss.get('group_by', False): correct_dfs = [] diff --git a/dataprep_ml/splitters.py b/dataprep_ml/splitters.py index d7eed04..c33297c 100644 --- a/dataprep_ml/splitters.py +++ b/dataprep_ml/splitters.py @@ -57,6 +57,13 @@ def splitter( else: train, dev, test = simple_split(data, pct_train, pct_dev, pct_test) + # Final assertions for time series + if min(len(train), len(dev)) < tss.get('window'): + raise Exception(f"Dataset size is too small for the specified window size ({tss.get('window')})") + + if min(len(train), len(dev), len(test)) < tss.get('horizon'): + raise Exception(f"Dataset size is too small for the specified horizon size ({tss.get('horizon')})") + return {"train": train, "test": test, "dev": dev, "stratified_on": stratify_on}