forked from JuliaStats/StatsModels.jl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodelframe.jl
260 lines (212 loc) · 8.92 KB
/
modelframe.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
"""
Wrapper which combines Formula (Terms) and an Table
This wrapper encapsulates all the information that's required to transform data
of the same structure as the wrapped data frame into a model matrix. This goes
above and beyond what's expressed in the `Formula` itself, for instance
including information on how each categorical variable should be coded.
Creating a `ModelFrame` first parses the `Formula` into `Terms`, checks which
variables are categorical and determines the appropriate contrasts to use, and
then creates the necessary contrasts matrices and stores the results.
# Constructors
```julia
ModelFrame(f::Formula, df::Table; contrasts::Dict = Dict())
ModelFrame(ex::Expr, d::Table; contrasts::Dict = Dict())
ModelFrame(terms::Terms, df::Table; contrasts::Dict = Dict())
# Inner constructors:
ModelFrame(df::Table, terms::Terms, missing::BitArray)
ModelFrame(df::Table, terms::Terms, missing::BitArray, contrasts::Dict{Symbol, ContrastsMatrix})
```
# Arguments
* `f::Formula`: Formula whose left hand side is the *response* and right hand
side are the *predictors*.
* `df::Table`: The data being modeled. This is used at this stage
to determine which variables are categorical, and otherwise held for
[`ModelMatrix`](@ref).
* `contrasts::Dict`: An optional Dict of contrast codings for each categorical
variable. Any unspecified variables will have [`DummyCoding`](@ref). As a
keyword argument, these can be either instances of a subtype of
[`AbstractContrasts`](@ref), or a [`ContrastsMatrix`](@ref). For the inner
constructor, they must be [`ContrastsMatrix`](@ref)es.
* `ex::Expr`: An expression which will be converted into a `Formula`.
* `terms::Terms`: For inner constructor, the parsed `Terms` from the `Formula`.
* `missing::BitArray`: For inner constructor, indicates whether each row of `df`
contains any missing data.
# Examples
```julia
julia> df = Table(x = 1:4, y = 5:9)
julia> mf = ModelFrame(y ~ 1 + x, df)
```
"""
type ModelFrame
df::Table
terms::Terms
msng::BitArray
## mapping from df keys to contrasts matrices
contrasts::Dict{Symbol, ContrastsMatrix}
end
is_categorical{T<:Real}(::AbstractArray{T}) = false
typealias NullableReal{T<:Real} Nullable{T}
is_categorical{T<:NullableReal}(::AbstractArray{T}) = false
is_categorical(::AbstractArray) = true
## Check for non-redundancy of columns. For instance, if x is a factor with two
## levels, it should be expanded into two columns in y~0+x but only one column
## in y~1+x. The default is the rank-reduced form (contrasts for n levels only
## produce n-1 columns). In general, an evaluation term x within a term
## x&y&... needs to be "promoted" to full rank if y&... hasn't already been
## included (either explicitly in the Terms or implicitly by promoting another
## term like z in z&y&...).
##
## This modifies the Terms, setting `trms.is_non_redundant = true` for all non-
## redundant evaluation terms.
function check_non_redundancy!(trms::Terms, df::Table)
(n_eterms, n_terms) = size(trms.factors)
encountered_columns = Vector{eltype(trms.factors)}[]
if trms.intercept
push!(encountered_columns, zeros(eltype(trms.factors), n_eterms))
end
for i_term in 1:n_terms
for i_eterm in 1:n_eterms
## only need to check eterms that are included and can be promoted
## (e.g., categorical variables that expand to multiple mm columns)
if Bool(trms.factors[i_eterm, i_term]) && is_categorical(df[trms.eterms[i_eterm]])
dropped = trms.factors[:,i_term]
dropped[i_eterm] = 0
if findfirst(encountered_columns, dropped) == 0
trms.is_non_redundant[i_eterm, i_term] = true
push!(encountered_columns, dropped)
end
end
end
## once we've checked all the eterms in this term, add it to the list
## of encountered terms/columns
push!(encountered_columns, Compat.view(trms.factors, :, i_term))
end
return trms.is_non_redundant
end
const DEFAULT_CONTRASTS = DummyCoding
# _unique(x::CategoricalArray) = unique(x)
# _unique(x::NullableCategoricalArray) = [get(l) for l in unique(x) if !isnull(l)]
# function _unique{T<:Nullable}(x::AbstractArray{T})
# levs = [get(l) for l in unique(x) if !isnull(l)]
# try; sort!(levs); end
# return levs
# end
# function _unique(x::AbstractArray)
# levs = unique(x)
# try; sort!(levs); end
# return levs
# end
## Set up contrasts:
## Combine actual DF columns and contrast types if necessary to compute the
## actual contrasts matrices, levels, and term names (using DummyCoding
## as the default)
function evalcontrasts(df::Table, contrasts::Dict = Dict())
evaledContrasts = Dict()
for (term, col) in eachcol(df)
is_categorical(col) || continue
evaledContrasts[term] = ContrastsMatrix(haskey(contrasts, term) ?
contrasts[term] :
DEFAULT_CONTRASTS(),
unique(col))
end
return evaledContrasts
end
## Default NULL handler. Others can be added as keyword arguments
function null_omit(df::Table)
cc = completecases(df)
df[cc,:], cc
end
function ModelFrame(trms::Terms, d::Table;
contrasts::Dict = Dict())
df, msng = null_omit(typeof(d)(map(x -> d[x], trms.eterms)))
names!(df, convert(Vector{Symbol}, map(string, trms.eterms)))
evaledContrasts = evalcontrasts(df, contrasts)
## Check for non-redundant terms, modifying terms in place
check_non_redundancy!(trms, df)
ModelFrame(df, trms, msng, evaledContrasts)
end
ModelFrame(df::Table, term::Terms, msng::BitArray) = ModelFrame(df, term, msng, evalcontrasts(df))
ModelFrame(f::Formula, d::Table; kwargs...) = ModelFrame(Terms(f), d; kwargs...)
ModelFrame(ex::Expr, d::Table; kwargs...) = ModelFrame(Formula(ex), d; kwargs...)
"""
setcontrasts!(mf::ModelFrame, new_contrasts::Dict)
Modify the contrast coding system of a ModelFrame in place.
"""
function setcontrasts!(mf::ModelFrame, new_contrasts::Dict)
new_contrasts = Dict([ Pair(col, ContrastsMatrix(contr, unique(mf.df[col])))
for (col, contr) in filter((k,v)->haskey(mf.df, k), new_contrasts) ])
mf.contrasts = merge(mf.contrasts, new_contrasts)
return mf
end
setcontrasts!(mf::ModelFrame; kwargs...) = setcontrasts!(mf, Dict(kwargs))
"""
StatsBase.model_response(mf::ModelFrame)
Extract the response column, if present. `DataVector` or
`PooledDataVector` columns are converted to `Array`s
"""
function StatsBase.model_response(mf::ModelFrame)
if mf.terms.response
convert(Array, mf.df[mf.terms.eterms[1]])
else
error("Model formula one-sided")
end
end
"""
termnames(term::Symbol, col)
Returns a vector of strings with the names of the coefficients
associated with a term. If the column corresponding to the term
is not categorical, a one-element vector is returned.
"""
termnames(term::Symbol, col) = [string(term)]
function termnames(term::Symbol, mf::ModelFrame; non_redundant::Bool = false)
if haskey(mf.contrasts, term)
termnames(term, mf.df[term],
non_redundant ?
ContrastsMatrix{FullDummyCoding}(mf.contrasts[term]) :
mf.contrasts[term])
else
termnames(term, mf.df[term])
end
end
termnames(term::Symbol, col::Any, contrast::ContrastsMatrix) =
["$term: $name" for name in contrast.termnames]
function expandtermnames(term::Vector)
if length(term) == 1
return term[1]
else
return foldr((a,b) -> vec([string(lev1, " & ", lev2) for
lev1 in a,
lev2 in b]),
term)
end
end
"""
coefnames(mf::ModelFrame)
Returns a vector of coefficient names constructed from the Terms
member and the types of the evaluation columns.
"""
function coefnames(mf::ModelFrame)
terms = droprandomeffects(dropresponse!(mf.terms))
## strategy mirrors ModelMatrx constructor:
eterm_names = @compat Dict{Tuple{Symbol,Bool}, Vector{Compat.UTF8String}}()
term_names = Vector{Compat.UTF8String}[]
if terms.intercept
push!(term_names, Compat.UTF8String["(Intercept)"])
end
factors = terms.factors
for (i_term, term) in enumerate(terms.terms)
## names for columns for eval terms
names = Vector{Compat.UTF8String}[]
ff = Compat.view(factors, :, i_term)
eterms = Compat.view(terms.eterms, ff)
non_redundants = Compat.view(terms.is_non_redundant, ff, i_term)
for (et, nr) in zip(eterms, non_redundants)
if !haskey(eterm_names, (et, nr))
eterm_names[(et, nr)] = termnames(et, mf, non_redundant=nr)
end
push!(names, eterm_names[(et, nr)])
end
push!(term_names, expandtermnames(names))
end
reduce(vcat, Vector{Compat.UTF8String}(), term_names)
end