Skip to content

Add UnivariateTimeTypeToContinous - take II #295

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Aug 25, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions src/MLJModels.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import MLJBase: @load
import MLJBase: Table, Continuous, Count, Finite, OrderedFactor, Multiclass

using Requires, Pkg, Pkg.TOML, OrderedCollections, Parameters
using Tables, CategoricalArrays, StatsBase, Statistics
using Tables, CategoricalArrays, StatsBase, Statistics, Dates
import Distributions

# for administrators to update Metadata.toml:
Expand All @@ -28,7 +28,8 @@ export ConstantRegressor, ConstantClassifier,
# from model/Transformers
export FeatureSelector, StaticTransformer, UnivariateDiscretizer,
UnivariateStandardizer, Standardizer, UnivariateBoxCoxTransformer,
OneHotEncoder, ContinuousEncoder, FillImputer, UnivariateFillImputer
OneHotEncoder, ContinuousEncoder, FillImputer, UnivariateFillImputer,
UnivariateTimeTypeToContinuous

const srcdir = dirname(@__FILE__) # the directory containing this file

Expand Down
160 changes: 156 additions & 4 deletions src/builtins/Transformers.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## CONSTANTS
## CONSTANTS

const N_VALUES_THRESH = 16 # for BoxCoxTransformation

Expand All @@ -23,6 +23,10 @@ const CONTINUOUS_ENCODER_DESCR = "Convert all `Finite` (categorical) and "*
"`Count` features (columns) of a table to `Continuous` and drop all "*
" remaining non-`Continuous` features. "
"features. "
# Description string passed as `descr` in the model metadata of
# `UnivariateTimeTypeToContinuous`. The declared input scitype is
# `ScientificTimeType` (covering `Date`, `DateTime` and `Time` data), so the
# description names that scitype rather than only `ScientificDateTime`.
const UNIVARIATE_TIME_TYPE_TO_CONTINUOUS = "Transform univariate "*
    "data with element scitype `ScientificTimeType` so that it has "*
    "`Continuous` element scitype, according to a learned scale. "


##
## IMPUTER
Expand Down Expand Up @@ -411,10 +415,152 @@ MLJBase.inverse_transform(transformer::UnivariateStandardizer, fitresult, w) =
[inverse_transform(transformer, fitresult, y) for y in w]


## CONTINUOUS TRANSFORM OF TIME TYPE FEATURES

"""
    UnivariateTimeTypeToContinuous(; zero_time=nothing, step=Hour(24))

Unsupervised model converting a vector of `Date`, `DateTime` or `Time`
values to `Float64`, by assuming `0.0` corresponds to the `zero_time`
parameter and that the time increment needed to reach `1.0` is given
by the `step` parameter.

If `zero_time` is provided, its type should match the element type of
the vector being transformed. If it is not provided (`nothing`), then
`zero_time` is inferred as the minimum time found in the data when
`fit` is called.

"""
mutable struct UnivariateTimeTypeToContinuous <: Unsupervised
# Time mapped to `0.0`; `nothing` means "infer it from the data in `fit`".
zero_time::Union{Nothing, TimeType}
# Time increment corresponding to a difference of `1.0` in the output.
step::Period
end

# Keyword constructor. Builds the model from `zero_time` and `step`, runs
# `MLJBase.clean!` on it and emits any non-empty message that returns as a warning.
function UnivariateTimeTypeToContinuous(; zero_time=nothing, step=Dates.Hour(24))
    transformer = UnivariateTimeTypeToContinuous(zero_time, step)
    warning = MLJBase.clean!(transformer)
    if !isempty(warning)
        @warn warning
    end
    return transformer
end

# Check that `model.step` can be added to `model.zero_time` (when one is given),
# repairing the pair with `_fix_zero_time_step` where possible.
#
# Returns a message describing any change made to `model`, or `""` if nothing was
# changed. If the mismatch cannot be repaired the original `MethodError` is
# rethrown; any other kind of error is rethrown untouched.
function MLJBase.clean!(model::UnivariateTimeTypeToContinuous)
    msg = ""
    # Nothing to check when no `zero_time` is given.
    model.zero_time === nothing && return msg
    try
        # Only the success of the addition matters; the sum itself is discarded.
        model.zero_time + model.step
    catch err
        # `rethrow()` (rather than `throw(err)`) keeps the original backtrace.
        err isa MethodError || rethrow()
        zero_time, step, status, msg = _fix_zero_time_step(
            model.zero_time, model.step)
        # Unable to resolve, rethrow original error.
        status === :error && rethrow()
        model.zero_time, model.step = zero_time, step
    end
    return msg
end

# Try to repair a `zero_time`/`step` pair that cannot be added together.
#
# Time parts cannot be added to dates, nor date parts to times. When such a
# mismatch is encountered, date parts can be converted to time parts, but not
# the reverse, because fractional date parts cannot be represented.
#
# Returns `(zero_time, step, status, msg)`. `status` is `:success` when a
# converted pair is returned (`msg` then says what was converted), or `:error`
# when no repair is known and the arguments are returned untouched with an
# empty `msg`.

# `Date` zero with a time-based step.
function _fix_zero_time_step(zero_time::Dates.Date, step::Dates.TimePeriod)
    prefix = "Cannot add `TimePeriod` `step` to `Date` `zero_time`. "
    # Express 24 hours in the units of `step`, so that whole-day steps are
    # recognised whatever their period type (`Hour(24)`, `Minute(1440)`, ...).
    whole_day = convert(typeof(step), Hour(24))
    if iszero(Dates.value(step % whole_day))
        # We can convert step to Day safely.
        return zero_time, convert(Day, step), :success,
            prefix * "Converting `step` to `Day`."
    end
    # We need a datetime to be compatible with the step.
    return convert(DateTime, zero_time), step, :success,
        prefix * "Converting `zero_time` to `DateTime`."
end

# `Time` zero with a date-based step.
function _fix_zero_time_step(zero_time::Dates.Time, step::Dates.DatePeriod)
    msg = "Cannot add `DatePeriod` `step` to `Time` `zero_time`. " *
        "Converting `step` to `Hour`."
    # Convert step to Hour if possible. This will fail for isa(step, Month).
    return zero_time, convert(Hour, step), :success, msg
end

# Any other combination: no repair known.
_fix_zero_time_step(zero_time, step) = (zero_time, step, :error, "")

# Determine the zero point and step used by `transform`.
#
# If `model.zero_time` is given it is used as the zero point, after being
# converted to `eltype(X)` (with a warning) when it cannot be subtracted from
# `X`. Otherwise the zero point is `minimum(X)`, and an incompatible
# zero/step pair is repaired with `_fix_zero_time_step` (with a warning) or
# the original error is rethrown.
#
# Returns `(fitresult, cache, report)` where `fitresult == (min_dt, step)` and
# both `cache` and `report` are `nothing`.
function MLJBase.fit(model::UnivariateTimeTypeToContinuous, verbosity::Int, X)
    step = model.step
    if model.zero_time !== nothing
        min_dt = model.zero_time
        # Check zero_time is compatible with X by attempting the subtraction
        # itself. No element of `X` is inspected, so an empty `X` is accepted.
        try
            X - min_dt
        catch err
            # `rethrow()` keeps the original backtrace.
            err isa MethodError || rethrow()
            @warn "`$(typeof(min_dt))` `zero_time` is not compatible with " *
                "`$(eltype(X))` vector. Attempting to convert `zero_time`."
            min_dt = convert(eltype(X), min_dt)
        end
    else
        min_dt = minimum(X)
        message = ""
        try
            min_dt + step
        catch err
            err isa MethodError || rethrow()
            fixed_dt, fixed_step, status, message = _fix_zero_time_step(
                min_dt, step)
            # Unable to resolve, rethrow original error.
            status === :error && rethrow()
            min_dt, step = fixed_dt, fixed_step
        end
        isempty(message) || @warn message
    end
    cache = nothing
    report = nothing
    fitresult = (min_dt, step)
    return fitresult, cache, report
end

# Size of one `step`, expressed in the units of `Dates.value(x - min_dt)`.
#
# `Time` arithmetic wraps around midnight, so `min_dt + step` cannot be used to
# measure the step (e.g. `Time(23) + Hour(2)` is `Time(1)`, giving a negative
# difference, and `Time(13) + Hour(24)` is `Time(13)` again). Differences of
# `Time` values are `Nanosecond`s, so the step is converted to that unit directly.
_time_step_size(min_dt::Dates.Time, step) =
    Float64(Dates.value(convert(Dates.Nanosecond, step)))

# For other time types, measure the step by applying it to `min_dt`. This also
# covers steps that cannot be converted to a fixed unit.
_time_step_size(min_dt, step) =
    Float64(Dates.value(min_dt + step) - Dates.value(min_dt))

# Convert the time vector `X` to `Float64`s: `min_dt` maps to `0.0` and each
# `step` beyond it adds `1.0`, where `(min_dt, step) = fitresult` as returned
# by `fit`.
#
# Throws an `ArgumentError` if `eltype(X)` differs from the type of `min_dt`.
function MLJBase.transform(model::UnivariateTimeTypeToContinuous, fitresult, X)
    min_dt, step = fitresult
    if typeof(min_dt) ≠ eltype(X)
        # Cannot run if eltype in transform differs from zero_time from fit.
        throw(ArgumentError(
            "Different `TimeType` encountered during `transform` than expected " *
            "from `fit`. Found `$(eltype(X))`, expected `$(typeof(min_dt))`"))
    end
    # Set the size of a single step.
    delta = _time_step_size(min_dt, step)
    return @. Float64(Dates.value(X - min_dt)) / delta
end


## STANDARDIZATION OF ORDINAL FEATURES OF TABULAR DATA

"""
Standardizer(; features=Symbol[], ignore=false, ordered_factor=false, count=false)
Standardizer(; features=Symbol[],
ignore=false,
ordered_factor=false,
count=false)

Unsupervised model for standardizing (whitening) the columns of
tabular data. If `features` is unspecified then all columns
Expand Down Expand Up @@ -1096,7 +1242,8 @@ metadata_pkg.(
(FeatureSelector, UnivariateStandardizer,
UnivariateDiscretizer, Standardizer,
UnivariateBoxCoxTransformer, UnivariateFillImputer,
OneHotEncoder, FillImputer, ContinuousEncoder),
OneHotEncoder, FillImputer, ContinuousEncoder,
UnivariateTimeTypeToContinuous),
name = "MLJModels",
uuid = "d491faf4-2d78-11e9-2867-c94bc002c0b7",
url = "https://github.com/alan-turing-institute/MLJModels.jl",
Expand Down Expand Up @@ -1170,4 +1317,9 @@ metadata_model(ContinuousEncoder,
descr = CONTINUOUS_ENCODER_DESCR,
path = "MLJModels.ContinuousEncoder")


# Model metadata for `UnivariateTimeTypeToContinuous`: input is a vector whose
# element scitype is a `ScientificTimeType`, output is a vector of `Continuous`,
# weights are declared unsupported, and `path` is the model's load path.
metadata_model(UnivariateTimeTypeToContinuous,
input = AbstractVector{<:ScientificTypes.ScientificTimeType},
output = AbstractVector{Continuous},
weights = false,
descr = UNIVARIATE_TIME_TYPE_TO_CONTINUOUS,
path = "MLJModels.UnivariateTimeTypeToContinuous")
58 changes: 40 additions & 18 deletions src/registry/Metadata.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
[NearestNeighbors.KNNClassifier]
":input_scitype" = "`ScientificTypes.Table{_s23} where _s23<:(AbstractArray{_s25,1} where _s25<:ScientificTypes.Continuous)`"
":output_scitype" = "`ScientificTypes.Unknown`"
":target_scitype" = "`AbstractArray{_s105,1} where _s105<:ScientificTypes.Finite`"
":target_scitype" = "`AbstractArray{_s99,1} where _s99<:ScientificTypes.Finite`"
":is_pure_julia" = "`true`"
":package_name" = "NearestNeighbors"
":package_license" = "MIT"
Expand Down Expand Up @@ -1850,7 +1850,7 @@
[NaiveBayes.GaussianNBClassifier]
":input_scitype" = "`ScientificTypes.Table{_s23} where _s23<:(AbstractArray{_s25,1} where _s25<:ScientificTypes.Continuous)`"
":output_scitype" = "`ScientificTypes.Unknown`"
":target_scitype" = "`AbstractArray{_s104,1} where _s104<:ScientificTypes.Finite`"
":target_scitype" = "`AbstractArray{_s173,1} where _s173<:ScientificTypes.Finite`"
":is_pure_julia" = "`true`"
":package_name" = "NaiveBayes"
":package_license" = "unknown"
Expand All @@ -1872,7 +1872,7 @@
[NaiveBayes.MultinomialNBClassifier]
":input_scitype" = "`ScientificTypes.Table{_s23} where _s23<:(AbstractArray{_s25,1} where _s25<:ScientificTypes.Count)`"
":output_scitype" = "`ScientificTypes.Unknown`"
":target_scitype" = "`AbstractArray{_s104,1} where _s104<:ScientificTypes.Finite`"
":target_scitype" = "`AbstractArray{_s173,1} where _s173<:ScientificTypes.Finite`"
":is_pure_julia" = "`true`"
":package_name" = "NaiveBayes"
":package_license" = "unknown"
Expand Down Expand Up @@ -2064,13 +2064,13 @@
":prediction_type" = ":unknown"
":implemented_methods" = [":clean!", ":fit", ":fitted_params", ":transform"]
":hyperparameters" = "`(:k, :alg, :fun, :do_whiten, :maxiter, :tol, :winit, :mean)`"
":hyperparameter_types" = "`(\"Int64\", \"Symbol\", \"Symbol\", \"Bool\", \"Int64\", \"Real\", \"Union{Nothing, Array{_s197,2} where _s197<:Real}\", \"Union{Nothing, Array{Float64,1}, Real}\")`"
":hyperparameter_types" = "`(\"Int64\", \"Symbol\", \"Symbol\", \"Bool\", \"Int64\", \"Real\", \"Union{Nothing, Array{_s148,2} where _s148<:Real}\", \"Union{Nothing, Array{Float64,1}, Real}\")`"
":hyperparameter_ranges" = "`(nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing)`"

[MultivariateStats.BayesianLDA]
":input_scitype" = "`ScientificTypes.Table{_s23} where _s23<:(AbstractArray{_s25,1} where _s25<:ScientificTypes.Continuous)`"
":output_scitype" = "`ScientificTypes.Table{_s23} where _s23<:(AbstractArray{_s25,1} where _s25<:ScientificTypes.Continuous)`"
":target_scitype" = "`AbstractArray{_s226,1} where _s226<:ScientificTypes.Finite`"
":target_scitype" = "`AbstractArray{_s174,1} where _s174<:ScientificTypes.Finite`"
":is_pure_julia" = "`true`"
":package_name" = "MultivariateStats"
":package_license" = "MIT"
Expand All @@ -2092,7 +2092,7 @@
[MultivariateStats.BayesianSubspaceLDA]
":input_scitype" = "`ScientificTypes.Table{_s23} where _s23<:(AbstractArray{_s25,1} where _s25<:ScientificTypes.Continuous)`"
":output_scitype" = "`ScientificTypes.Table{_s23} where _s23<:(AbstractArray{_s25,1} where _s25<:ScientificTypes.Continuous)`"
":target_scitype" = "`AbstractArray{_s226,1} where _s226<:ScientificTypes.Finite`"
":target_scitype" = "`AbstractArray{_s174,1} where _s174<:ScientificTypes.Finite`"
":is_pure_julia" = "`true`"
":package_name" = "MultivariateStats"
":package_license" = "MIT"
Expand Down Expand Up @@ -2158,7 +2158,7 @@
[MultivariateStats.LDA]
":input_scitype" = "`ScientificTypes.Table{_s23} where _s23<:(AbstractArray{_s25,1} where _s25<:ScientificTypes.Continuous)`"
":output_scitype" = "`ScientificTypes.Table{_s23} where _s23<:(AbstractArray{_s25,1} where _s25<:ScientificTypes.Continuous)`"
":target_scitype" = "`AbstractArray{_s226,1} where _s226<:ScientificTypes.Finite`"
":target_scitype" = "`AbstractArray{_s174,1} where _s174<:ScientificTypes.Finite`"
":is_pure_julia" = "`true`"
":package_name" = "MultivariateStats"
":package_license" = "MIT"
Expand Down Expand Up @@ -2202,7 +2202,7 @@
[MultivariateStats.SubspaceLDA]
":input_scitype" = "`ScientificTypes.Table{_s23} where _s23<:(AbstractArray{_s25,1} where _s25<:ScientificTypes.Continuous)`"
":output_scitype" = "`ScientificTypes.Table{_s23} where _s23<:(AbstractArray{_s25,1} where _s25<:ScientificTypes.Continuous)`"
":target_scitype" = "`AbstractArray{_s226,1} where _s226<:ScientificTypes.Finite`"
":target_scitype" = "`AbstractArray{_s174,1} where _s174<:ScientificTypes.Finite`"
":is_pure_julia" = "`true`"
":package_name" = "MultivariateStats"
":package_license" = "MIT"
Expand All @@ -2224,7 +2224,7 @@
[DecisionTree.AdaBoostStumpClassifier]
":input_scitype" = "`ScientificTypes.Table{_s23} where _s23<:Union{AbstractArray{_s25,1} where _s25<:ScientificTypes.Continuous, AbstractArray{_s25,1} where _s25<:ScientificTypes.Count, AbstractArray{_s25,1} where _s25<:ScientificTypes.OrderedFactor}`"
":output_scitype" = "`ScientificTypes.Unknown`"
":target_scitype" = "`AbstractArray{_s226,1} where _s226<:ScientificTypes.Finite`"
":target_scitype" = "`AbstractArray{_s174,1} where _s174<:ScientificTypes.Finite`"
":is_pure_julia" = "`true`"
":package_name" = "DecisionTree"
":package_license" = "MIT"
Expand Down Expand Up @@ -2268,7 +2268,7 @@
[DecisionTree.DecisionTreeClassifier]
":input_scitype" = "`ScientificTypes.Table{_s23} where _s23<:Union{AbstractArray{_s25,1} where _s25<:ScientificTypes.Continuous, AbstractArray{_s25,1} where _s25<:ScientificTypes.Count, AbstractArray{_s25,1} where _s25<:ScientificTypes.OrderedFactor}`"
":output_scitype" = "`ScientificTypes.Unknown`"
":target_scitype" = "`AbstractArray{_s226,1} where _s226<:ScientificTypes.Finite`"
":target_scitype" = "`AbstractArray{_s174,1} where _s174<:ScientificTypes.Finite`"
":is_pure_julia" = "`true`"
":package_name" = "DecisionTree"
":package_license" = "MIT"
Expand Down Expand Up @@ -2312,7 +2312,7 @@
[DecisionTree.RandomForestClassifier]
":input_scitype" = "`ScientificTypes.Table{_s23} where _s23<:Union{AbstractArray{_s25,1} where _s25<:ScientificTypes.Continuous, AbstractArray{_s25,1} where _s25<:ScientificTypes.Count, AbstractArray{_s25,1} where _s25<:ScientificTypes.OrderedFactor}`"
":output_scitype" = "`ScientificTypes.Unknown`"
":target_scitype" = "`AbstractArray{_s226,1} where _s226<:ScientificTypes.Finite`"
":target_scitype" = "`AbstractArray{_s174,1} where _s174<:ScientificTypes.Finite`"
":is_pure_julia" = "`true`"
":package_name" = "DecisionTree"
":package_license" = "MIT"
Expand Down Expand Up @@ -2422,7 +2422,7 @@
[XGBoost.XGBoostClassifier]
":input_scitype" = "`ScientificTypes.Table{_s23} where _s23<:(AbstractArray{_s25,1} where _s25<:ScientificTypes.Continuous)`"
":output_scitype" = "`ScientificTypes.Unknown`"
":target_scitype" = "`AbstractArray{_s492,1} where _s492<:ScientificTypes.Finite`"
":target_scitype" = "`AbstractArray{_s472,1} where _s472<:ScientificTypes.Finite`"
":is_pure_julia" = "`false`"
":package_name" = "XGBoost"
":package_license" = "unknown"
Expand All @@ -2444,7 +2444,7 @@
[LightGBM.LGBMClassifier]
":input_scitype" = "`ScientificTypes.Table{_s23} where _s23<:(AbstractArray{_s25,1} where _s25<:ScientificTypes.Continuous)`"
":output_scitype" = "`ScientificTypes.Unknown`"
":target_scitype" = "`AbstractArray{_s576,1} where _s576<:ScientificTypes.Finite`"
":target_scitype" = "`AbstractArray{_s565,1} where _s565<:ScientificTypes.Finite`"
":is_pure_julia" = "`false`"
":package_name" = "LightGBM"
":package_license" = "MIT Expat"
Expand Down Expand Up @@ -2639,6 +2639,28 @@
":hyperparameter_types" = "``"
":hyperparameter_ranges" = "``"

[MLJModels.UnivariateTimeTypeToContinuous]
":input_scitype" = "`AbstractArray{_s55,1} where _s55<:ScientificTypes.ScientificTimeType`"
":output_scitype" = "`AbstractArray{ScientificTypes.Continuous,1}`"
":target_scitype" = "`ScientificTypes.Unknown`"
":is_pure_julia" = "`true`"
":package_name" = "MLJModels"
":package_license" = "MIT"
":load_path" = "MLJModels.UnivariateTimeTypeToContinuous"
":package_uuid" = "d491faf4-2d78-11e9-2867-c94bc002c0b7"
":package_url" = "https://github.com/alan-turing-institute/MLJModels.jl"
":is_wrapper" = "`false`"
":supports_weights" = "`false`"
":supports_online" = "`false`"
":docstring" = "Transform univariate data with element scitype `ScientificDateTime` so that it has `Continuous` element scitype, according to a learned scale. \n→ based on [MLJModels](https://github.com/alan-turing-institute/MLJModels.jl).\n→ do `@load UnivariateTimeTypeToContinuous pkg=\"MLJModels\"` to use the model.\n→ do `?UnivariateTimeTypeToContinuous` for documentation."
":name" = "UnivariateTimeTypeToContinuous"
":is_supervised" = "`false`"
":prediction_type" = ":unknown"
":implemented_methods" = [":clean!", ":fit", ":transform"]
":hyperparameters" = "`(:zero_time, :step)`"
":hyperparameter_types" = "`(\"Union{Nothing, Dates.TimeType}\", \"Dates.Period\")`"
":hyperparameter_ranges" = "`(nothing, nothing)`"

[MLJModels.OneHotEncoder]
":input_scitype" = "`ScientificTypes.Table`"
":output_scitype" = "`ScientificTypes.Table`"
Expand Down Expand Up @@ -2884,7 +2906,7 @@
[LIBSVM.LinearSVC]
":input_scitype" = "`ScientificTypes.Table{_s23} where _s23<:(AbstractArray{_s25,1} where _s25<:ScientificTypes.Continuous)`"
":output_scitype" = "`ScientificTypes.Unknown`"
":target_scitype" = "`AbstractArray{_s575,1} where _s575<:ScientificTypes.Finite`"
":target_scitype" = "`AbstractArray{_s564,1} where _s564<:ScientificTypes.Finite`"
":is_pure_julia" = "`false`"
":package_name" = "LIBSVM"
":package_license" = "unknown"
Expand Down Expand Up @@ -2928,7 +2950,7 @@
[LIBSVM.NuSVC]
":input_scitype" = "`ScientificTypes.Table{_s23} where _s23<:(AbstractArray{_s25,1} where _s25<:ScientificTypes.Continuous)`"
":output_scitype" = "`ScientificTypes.Unknown`"
":target_scitype" = "`AbstractArray{_s575,1} where _s575<:ScientificTypes.Finite`"
":target_scitype" = "`AbstractArray{_s564,1} where _s564<:ScientificTypes.Finite`"
":is_pure_julia" = "`false`"
":package_name" = "LIBSVM"
":package_license" = "unknown"
Expand All @@ -2950,7 +2972,7 @@
[LIBSVM.SVC]
":input_scitype" = "`ScientificTypes.Table{_s23} where _s23<:(AbstractArray{_s25,1} where _s25<:ScientificTypes.Continuous)`"
":output_scitype" = "`ScientificTypes.Unknown`"
":target_scitype" = "`AbstractArray{_s575,1} where _s575<:ScientificTypes.Finite`"
":target_scitype" = "`AbstractArray{_s564,1} where _s564<:ScientificTypes.Finite`"
":is_pure_julia" = "`false`"
":package_name" = "LIBSVM"
":package_license" = "unknown"
Expand All @@ -2971,7 +2993,7 @@

[LIBSVM.OneClassSVM]
":input_scitype" = "`ScientificTypes.Table{_s23} where _s23<:(AbstractArray{_s25,1} where _s25<:ScientificTypes.Continuous)`"
":output_scitype" = "`AbstractArray{_s575,1} where _s575<:ScientificTypes.Finite{2}`"
":output_scitype" = "`AbstractArray{_s564,1} where _s564<:ScientificTypes.Finite{2}`"
":target_scitype" = "`ScientificTypes.Unknown`"
":is_pure_julia" = "`false`"
":package_name" = "LIBSVM"
Expand All @@ -2994,7 +3016,7 @@
[GLM.LinearBinaryClassifier]
":input_scitype" = "`ScientificTypes.Table{_s23} where _s23<:(AbstractArray{_s25,1} where _s25<:ScientificTypes.Continuous)`"
":output_scitype" = "`ScientificTypes.Unknown`"
":target_scitype" = "`AbstractArray{_s576,1} where _s576<:ScientificTypes.Finite{2}`"
":target_scitype" = "`AbstractArray{_s565,1} where _s565<:ScientificTypes.Finite{2}`"
":is_pure_julia" = "`true`"
":package_name" = "GLM"
":package_license" = "MIT"
Expand Down
Loading