Skip to content
This repository was archived by the owner on May 4, 2019. It is now read-only.

Change unique() to return values in the same ordering as levels for PDAs #237

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 16 additions & 42 deletions src/pooleddataarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -271,9 +271,9 @@ end
#' @description
#'
#' Return a DataVector containing the unique values of a `PooledDataArray`,
#' in the order they appear in the data, including `NA` if any missing entries
#' in the order of `levels`, including `NA` if any missing entries
#' are encountered. For `PooledDataArray`s, this function is much less efficient
#' than `levels`, which does not return the values in the same order.
#' than `levels`.
#'
#' @param pda::PooledDataArray{T} `PooledDataArray` whose unique values are desired.
#'
Expand All @@ -286,50 +286,24 @@ end
#' pdv = @pdata [1, -2, 1, NA, 4]
#' distinct_values = unique(pdv)
function Base.unique{T}(pda::PooledDataArray{T})
n = length(pda)
nlevels = length(pda.pool)
unique_values = Vector{T}(0)
sizehint!(unique_values, nlevels)
seen = Set{eltype(pda.refs)}()

firstna = 0
for i in 1:n
if isna(pda, i)
if firstna == 0
firstna = length(unique_values) + 1
end
elseif !in(pda.refs[i], seen)
push!(seen, pda.refs[i])
push!(unique_values, pda.pool[pda.refs[i]])
else
continue
end

if firstna > 0 && length(unique_values) == nlevels
break
seen = fill(false, nlevels + 1)
batch = 0
@inbounds for i in pda.refs
seen[i + 1] = true
# Only do a costly short-circuit check periodically
batch += 1
if batch > 1000
all(seen) && break
batch = 0
end
end

if firstna > 0
res = DataArray(Vector{T}(nlevels + 1))
i = 0
for val in unique_values
i += 1
if i == firstna
res.na[i] = true
i += 1
end
res.data[i] = val
end

if firstna == nlevels + 1
res.na[nlevels + 1] = true
end

return res
else
return DataArray(unique_values)
seenna = shift!(seen)
res = DataArray(levels(pda)[seen])
if seenna
push!(res, NA)
end
res
end

#' @description
Expand Down
20 changes: 12 additions & 8 deletions test/pooleddataarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,16 +31,20 @@ module TestPDA
@assert levels(setlevels!(@pdata([1.0, 2.0]), [3,4])) == [3.0, 4.0]

y = @pdata [1, NA, -2, 1, NA, 4, NA]
@assert isequal(unique(y), @pdata [1, NA, -2, 4])
@assert isequal(unique(reverse(y)), @data [NA, 4, 1, -2])
@assert isequal(unique(dropna(y)), @data [1, -2, 4])
@assert isequal(unique(reverse(dropna(y))), @data [4, 1, -2])
@assert isequal(unique(y), @pdata [-2, 1, 4, NA])
@assert isequal(unique(reverse(y)), @data [-2, 1, 4, NA])
@assert isequal(unique(dropna(y)), @data levels(dropna(y)))
@assert isequal(unique(reverse(dropna(y))), @data levels(reverse(dropna(y))))

z = @pdata ["frank", NA, "gertrude", "frank", NA, "herbert", NA]
@assert isequal(unique(z), @pdata ["frank", NA, "gertrude", "herbert"])
@assert isequal(unique(reverse(z)), @pdata [NA, "herbert", "frank", "gertrude"])
@assert isequal(unique(dropna(z)), @pdata ["frank", "gertrude", "herbert"])
@assert isequal(unique(reverse(dropna(z))), @pdata ["herbert", "frank", "gertrude"])
@assert isequal(unique(z), @pdata ["frank", "gertrude", "herbert", NA])
@assert isequal(unique(reverse(z)), @pdata ["frank", "gertrude", "herbert", NA])
@assert isequal(unique(dropna(z)), @data levels(dropna(z)))
@assert isequal(unique(reverse(dropna(z))), @data levels(reverse(dropna(z))))

# check case where some levels are not present in data
z[3] = "frank"
@assert isequal(unique(z), @pdata ["frank", "herbert", NA])

# check case where only NA occurs in final position
@assert isequal(unique(@pdata [1, 2, 1, NA]), @pdata [1, 2, NA])
Expand Down