Skip to content

Commit

Permalink
drop most suggests, simplify
Browse files Browse the repository at this point in the history
  • Loading branch information
jangorecki committed Mar 18, 2018
1 parent 8be9081 commit d90f911
Show file tree
Hide file tree
Showing 17 changed files with 144 additions and 557 deletions.
2 changes: 2 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
^\.emacs\.desktop
^\.emacs\.desktop\.lock
^.*\.Rproj$
^\.Rproj\.user$
^\..+
25 changes: 23 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,25 @@
.Rproj.user
.Rhistory
# History files
.RData
.Rhistory
.Rapp.history

# Package build process
*-Ex.R
data.cube_*.tar.gz
data.cube.Rcheck

# Emacs IDE files
.emacs.desktop
.emacs.desktop.lock

# RStudio IDE files
.Rproj.user
data.cube.Rproj

# produced vignettes
vignettes/*.html
vignettes/*.pdf

# object and shared objects
*.o
*.so
9 changes: 4 additions & 5 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
Package: data.cube
Type: Package
Title: OLAP cube data type
Version: 0.3.0
Date: 2016-04-27
Version: 0.4.0
Date: 2018-03-17
Author: Jan Gorecki
Maintainer: Jan Gorecki <[email protected]>
Description: Extends array for OLAP operations on multidimensional hierarchical data powered by data.table.
Depends: R (>= 3.1.0)
Imports: data.table (>= 1.9.7), R6
Suggests: big.data.table (>= 0.3.4), RSclient, Rserve, logR (>= 2.1.4), knitr, rmarkdown
Imports: data.table (>= 1.9.8), R6
Suggests: knitr
License: GPL-3
Additional_repositories: https://jangorecki.gitlab.io/big.data.table, https://jangorecki.gitlab.io/logR, https://Rdatatable.github.io/data.table
VignetteBuilder: knitr
2 changes: 0 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,6 @@ S3method(as.data.table, dimension)
export(as.fact)
S3method(as.fact, default)
S3method(as.fact, data.table)
S3method(as.fact, list)
S3method(as.fact, big.data.table)

# *.fact
S3method(length, fact)
Expand Down
1 change: 0 additions & 1 deletion R/as.data.cube.R
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,6 @@ as.data.table.data.cube = function(x, na.fill = FALSE, dcast = FALSE, ...) {
}

as.array.data.cube = function(x, measure, na.fill = NA, ...) {
if (!x$fact$local) stop("Only local data.cube, not distributed ones, can be converted to array")
if (missing(measure)) measure = x$fact$measure.vars[1L]
if (length(measure) > 1L) stop("Your cube seems to have multiple measures, you must provide scalar column name as 'measure' argument to as.array.")
dimcols = lapply(x$dimensions, function(x) x$id.vars)
Expand Down
20 changes: 10 additions & 10 deletions R/as.dimension.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,20 +54,20 @@ as.dimension.environment = function(x, ...){
}

null.dimension = function(...){
env = new.env()
env$data = data.table(NULL)
env$id.vars = character()
env$hierarchies = list()
env$levels = list()
env$fields = character()
as.dimension.environment(env)
ans = new.env()
ans$data = data.table(NULL)
ans$id.vars = character()
ans$hierarchies = list()
ans$levels = list()
ans$fields = character()
as.dimension.environment(ans)
}

# export

as.data.table.dimension = function(x, lvls = names(x$levels), ...) {
stopifnot(is.dimension(x))
r = copy(x$data)
lookupv(dims = lapply(x$levels[lvls], as.data.table.level), r)
r[]
ans = copy(x$data)
lookupv(dims = lapply(x$levels[lvls], as.data.table.level), ans)
ans[]
}
29 changes: 5 additions & 24 deletions R/as.fact.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,31 +35,8 @@ as.fact.data.table = function(x, id.vars = as.character(key(x)), measure.vars =
list(.fun.aggregate = sub.fun)))
}

#' @rdname as.fact
#' @method as.fact list
as.fact.list = function(x, id.vars = as.character(key(x)), measure.vars = setdiff(names(x), id.vars), fun.aggregate = sum, ..., measures = NULL) {
sub.fun = substitute(fun.aggregate)
stopifnot(requireNamespace("big.data.table", quietly = TRUE), big.data.table::is.rscl(x))
eval(substitute(fact$new(x, id.vars = id.vars, measure.vars = measure.vars, fun.aggregate = .fun.aggregate, ... = ..., measures = measures),
list(.fun.aggregate = sub.fun)))
}

#' @rdname as.fact
#' @method as.fact big.data.table
as.fact.big.data.table = function(x, id.vars, measure.vars = setdiff(names(x), id.vars), fun.aggregate = sum, ..., measures = NULL) {
sub.fun = substitute(fun.aggregate)
stopifnot(requireNamespace("big.data.table", quietly = TRUE), big.data.table::is.big.data.table(x), is.character(id.vars))
eval(substitute(fact$new(x, id.vars = id.vars, measure.vars = measure.vars, fun.aggregate = .fun.aggregate, ... = ..., measures = measures),
list(.fun.aggregate = sub.fun)))
}

as.fact.environment = function(x, ...) {
fact$new(.env = x)
}

null.fact = function(...) {
env = new.env()
env$local = TRUE
env$id.vars = character()
env$measure.vars = character()
env$measures = list()
Expand All @@ -72,4 +49,8 @@ null.fact = function(...) {
as.data.table.fact = function(x, ...) {
stopifnot(is.fact(x))
copy(x$data)
}
}

as.fact.environment = function(x, ...) {
fact$new(.env = x)
}
98 changes: 47 additions & 51 deletions R/data.cube.R
Original file line number Diff line number Diff line change
Expand Up @@ -81,32 +81,32 @@ data.cube = R6Class(
dict = self$schema()
prnt = character()
prnt["header"] = "<data.cube>"
#prnt["distributed"] =
n.measures = length(self$fact$measure.vars)
prnt["fact"] = dict[type=="fact",
sprintf("fact%s:\n %s rows x %s dimensions x %s measures (%.2f MB)",
if(!self$fact$local) sprintf(" (distributed on %s nodes)", length(attr(self$fact$data, "rscl"))) else "",
sprintf("fact:\n %s rows x %s dimensions x %s measures (%.2f MB)",
nrow, ncol - n.measures, n.measures, mb)]
if (length(self$dimensions)) {
dt = dict[type=="dimension", .(nrow = nrow[is.na(entity)], ncol = ncol[is.na(entity)], mb = sum(mb, na.rm = TRUE)), .(name)]
size = dict[type=="dimension", .(mb = sum(mb, na.rm = TRUE)), .(name)]
nrnc = dict[type=="dimension" & is.na(entity), .(nrow, ncol), .(name)]
dt = nrnc[size, on="name"]
prnt["dims"] = paste0("dimensions:\n", paste(dt[, sprintf(" %s : %s entities x %s levels (%.2f MB)", name, nrow, ncol, mb)], collapse="\n"))
}
prnt["size"] = sprintf("total size: %.2f MB", dict[,sum(mb)])
cat(prnt, sep = "\n")
invisible(self)
},
denormalize = function(na.fill = FALSE, dims = names(self$dimensions)) {
r = as.data.table(self$fact)
ans = as.data.table.fact(self$fact)
if (isTRUE(na.fill)) {
# `nomatch` to be extended to raise error if fact has values not in dims, after data.table#857 resolved
dn = dimnames(self)
r = r[i=do.call(CJ, c(dn, list(sorted=TRUE, unique=TRUE))),
nomatch=NA,
on=setNames(names(dn), nm=self$fact$id.vars)]
ans = ans[i=do.call(CJ, c(dn, list(sorted=TRUE, unique=TRUE))),
nomatch=NA,
on=setNames(names(dn), nm=self$fact$id.vars)]
}
# lookup
lookupv(dims = lapply(self$dimensions[dims], as.data.table.dimension), r)
if (length(self$fact$id.vars)) setkeyv(r, self$fact$id.vars)[] else r[]
lookupv(dims = lapply(self$dimensions[dims], as.data.table.dimension), ans)
if (length(self$fact$id.vars)) setkeyv(ans, self$fact$id.vars)
ans[]
},
schema = function() {
rbindlist(list(
Expand Down Expand Up @@ -246,32 +246,32 @@ data.cube = R6Class(
} else self
)
# returned object
r = new.env()
ans = new.env()
# - [x] filter dimensions and levels while quering them to new environment
r$dimensions = sapply(names(self$dimensions), function(dim) {
if (dim %chin% names(i.sub)) self$dimensions[[dim]]$subset(i.sub = i.sub[[dim]])
else if (dim %chin% names(i.grp)) self$dimensions[[dim]]$rollup(i.grp[[dim]])
ans$dimensions = sapply(names(self$dimensions), function(dim) {
if (dim %chin% names(i.sub)) self$dimensions[[dim]]$subset(i.sub = i.sub[[dim]])
else if (dim %chin% names(i.grp)) self$dimensions[[dim]]$rollup(i.grp[[dim]])
}, simplify=FALSE)
r$id.vars = self$id.vars
ans$id.vars = self$id.vars
# - [x] filter fact - prepare index for subset fact
filter.dims = sapply(i.sub, function(x) length(x) || is.null(x)) # NULL is valid empty subset notation, as in base R
filter.dims = names(filter.dims)[as.logical(filter.dims)]
# primary keys of dimensions after filtering
dimkeys = sapply(names(r$dimensions)[names(r$dimensions) %chin% filter.dims], function(dim) {
r$dimensions[[dim]]$data[[r$dimensions[[dim]]$id.vars]]
dimkeys = sapply(names(ans$dimensions)[names(ans$dimensions) %chin% filter.dims], function(dim) {
ans$dimensions[[dim]]$data[[ans$dimensions[[dim]]$id.vars]]
}, simplify=FALSE)
stopifnot(names(dimkeys) %chin% names(r$dimensions)) # all names must match, before drop dims
stopifnot(names(dimkeys) %chin% names(ans$dimensions)) # all names must match, before drop dims
# - [x] drop sliced dimensions
if (drop) {
len1.dims = names(dimkeys)[sapply(dimkeys, length)==1L]
# if user provides multiple values to dimension filter key, it should not drop that dim even when only 1L was matched, base::array raises error on nomatch
filter.multkey = len1.dims[sapply(len1.dims, function(dim) length(i.sub[[dim]][[r$dimensions[[dim]]$id.vars]])) > 1L]
if.drop = names(r$dimensions) %chin% setdiff(len1.dims, filter.multkey)
r$dimensions[if.drop] = NULL
r$id.vars = r$id.vars[!if.drop]
filter.multkey = len1.dims[sapply(len1.dims, function(dim) length(i.sub[[dim]][[ans$dimensions[[dim]]$id.vars]])) > 1L]
if.drop = names(ans$dimensions) %chin% setdiff(len1.dims, filter.multkey)
ans$dimensions[if.drop] = NULL
ans$id.vars = ans$id.vars[!if.drop]
}
# - [x] subset fact
# - [ ] support for: filter `.`, collapse dim `-`, rollup `+`, cube `^`
# - [ ] support for:filter `.`, collapse dim `-`, rollup `+`, cube `^`
dimcols = self$id.vars[names(self$dimensions) %chin% names(dimkeys)]
stopifnot(length(dimcols) == length(dimkeys))
setattr(dimkeys, "names", dimcols)
Expand Down Expand Up @@ -312,28 +312,23 @@ data.cube = R6Class(
x = sapply(names(self$dimensions), function(x) self$dimensions[[x]]$rollup(i.ops = i.ops[[x]]), simplify=FALSE)
# all fields used in grouping for each dimension
new.fact = self$fact$rollup(x, collapse=collapse.cols, grouping.sets=groupingsets.cols, ops=i.ops, drop=drop)
# r$fact = new.fact
# ans$fact = new.fact
} else {
r$fact = self$fact$subset(dimkeys, collapse=collapse.cols, drop=drop)
ans$fact = self$fact$subset(dimkeys, collapse=collapse.cols, drop=drop)
}
stopifnot(ncol(r$fact$data) > 0L, length(collapse.cols)==length(collapse.dims))
stopifnot(ncol(ans$fact$data) > 0L, length(collapse.cols)==length(collapse.dims))
if (length(collapse.dims)) {
r$dimensions[collapse.dims] = NULL
r$id.vars = setdiff(r$id.vars, collapse.cols)
ans$dimensions[collapse.dims] = NULL
ans$id.vars = setdiff(ans$id.vars, collapse.cols)
}

# - [x] return cube with all dimensions filtered and fact filtered
as.data.cube.environment(r)
as.data.cube.environment(ans)
},
# setindex
setindex = function(drop = FALSE) {
optional.logR = function(x, .log = getOption("datacube.log")) {
if(isTRUE(.log)) eval.parent(substitute(logR(x), list(x = substitute(x)))) else x
}
r = list(
fact = optional.logR(self$fact$setindex(drop=drop)),
dimensions = lapply(self$dimensions, function(x) optional.logR(x$setindex(drop=drop)))
) # r - not used further but evaluated on lower classes
self$fact$setindex(drop=drop)
lapply(self$dimensions, function(x) x$setindex(drop=drop))
invisible(self)
},
rollup = function(...) {
Expand Down Expand Up @@ -390,6 +385,7 @@ is.data.cube = function(x) inherits(x, "data.cube")
#' @param x data.cube object
#' @param ... values to subset on corresponding dimensions, when wrapping in list it will refer to dimension hierarchies
#' @param drop logical, default TRUE, drop redundant dimensions, same as \emph{drop} argument in \code{[.array}.
#' @details The following syntax has been propose to subset data.cube: TODO #6
#' @return data.cube class object
"[.data.cube" = function(x, ..., drop = TRUE) {
if (!is.logical(drop)) stop("`drop` argument to data.cube subset must be logical. If argument name conflicts with your dimension name then provide it without name, elements in ... are matched by positions - as in array method - not names.")
Expand All @@ -408,8 +404,8 @@ is.data.cube = function(x) inherits(x, "data.cube")
return(x)
}
# proceed subset, also proceed empty subset `dc[,]` or `dc[, drop=.]`
r = x$subset(.dots = .dots, drop = drop)
r
ans = x$subset(.dots = .dots, drop = drop)
ans
}

# @title Extract data.cube
Expand All @@ -419,14 +415,14 @@ is.data.cube = function(x) inherits(x, "data.cube")
# @param by expression/character vector to aggregate measures accroding to \emph{j} argument.
# @return data.cube?? class object
# "[[.data.cube" = function(x, i, j, by) {
# r = x$extract(by = by, .call = match.call())
# r
# ans = x$extract(by = by, .call = match.call())
# ans
# }

dimnames.data.cube = function(x) {
r = sapply(x$dimensions, dimnames, simplify=FALSE)
if (!length(r)) return(NULL)
r
ans = sapply(x$dimensions, dimnames, simplify=FALSE)
if (!length(ans)) return(NULL)
ans
}

str.data.cube = function(object, ...) {
Expand All @@ -443,25 +439,25 @@ format.data.cube = function(x, na.fill = FALSE, measure.format = list(), dots.fo
length(names(measure.format))==length(measure.format),
names(measure.format) %in% measure.vars
)
r = x$denormalize(dims = character(0), na.fill = na.fill)
if (length(id.vars)) r = setorderv(r, cols = id.vars, order=1L, na.last=TRUE)
ans = x$denormalize(dims = character(0), na.fill = na.fill)
if (length(id.vars)) ans = setorderv(ans, cols = id.vars, order=1L, na.last=TRUE)
if (!is.null(measure.format)) { # measure.format=NULL will stop any formatting
for (mv in measure.vars) {
if (mv %chin% names(measure.format)) {
FUN = measure.format[[mv]]
set(r, i = NULL, j = mv, value = FUN(r[[mv]], ... = dots.format[[mv]]))
set(ans, i = NULL, j = mv, value = FUN(ans[[mv]], ... = dots.format[[mv]]))
} else {
if (!is.null(FUN <- x$fact$measures[[mv]]$fun.format)) {
set(r, i = NULL, j = mv, value = FUN(r[[mv]], ... = dots.format[[mv]]))
set(ans, i = NULL, j = mv, value = FUN(ans[[mv]], ... = dots.format[[mv]]))
}
}
}
}
if (isTRUE(dcast)) r = dcast.data.table(r, ...)
r[]
if (isTRUE(dcast)) ans = dcast.data.table(ans, ...)
ans[]
}

head.data.cube = function(x, n = 6L, ...) x$head(n)
head.data.cube = function(x, n = 6L, ...) x$head(n = n)

length.data.cube = function(x) as.integer(nrow(x$fact))
names.data.cube = function(x) as.character(names(x$fact))
Expand Down
4 changes: 4 additions & 0 deletions R/data.table.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@

## data.table helper to be not dependend on data.cube pkg

#' @title Convert array to data.table
#' @param x array
#' @param keep.rownames ignored
Expand Down Expand Up @@ -105,3 +108,4 @@ lookupv = function(dims, fact) {
lookup(fact, dim, setdiff(nd, nf))
})
}

3 changes: 2 additions & 1 deletion R/dimension.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ dimension = R6Class(
rbindlist(list(dimension_data_schema, levels_schema))
},
head = function(n = 6L) {
list(base = head(self$data, n), levels = lapply(self$levels, function(x) x$head(n = n)))
list(base = head(self$data, n = n), levels = lapply(self$levels, function(x) x$head(n = n)))
},
# subset
subset = function(i.sub) {
Expand Down Expand Up @@ -97,6 +97,7 @@ dimension = R6Class(
invisible(self)
},
rollup = function(x, i.ops) {
# TO DO reuse data.table
browser()
stopifnot(is.character(x), is.character(i.ops))
r = new.env()
Expand Down
Loading

0 comments on commit d90f911

Please sign in to comment.