Commit

minor progress on rollup
jangorecki committed May 9, 2016
1 parent cdf529d commit 8be9081
Showing 4 changed files with 40 additions and 17 deletions.
6 changes: 3 additions & 3 deletions R/data.cube.R
@@ -219,7 +219,7 @@ data.cube = R6Class(
# - [ ] handle grouping detection for `+` and `^`
grp.dims = names(dots.ops)[dots.ops %chin% c("+","^")]
grp = dots[grp.dims]
-dots[grp.dims] = sapply(grp.dims, function(x) list())
+dots[grp.dims] = sapply(grp.dims, function(x) list(), simplify=FALSE)

# return
list(ops = dots.ops,
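The `simplify=FALSE` change above guards against `sapply()`'s auto-simplification. A minimal base R sketch of the pitfall (not part of the commit; variable content illustrative):

```r
# sapply() with the default simplify=TRUE may collapse results into a
# vector or matrix; simplify=FALSE guarantees a named list, like lapply().
grp.dims = c("time", "geog")
a = sapply(grp.dims, function(x) x)                   # simplified to a character vector
b = sapply(grp.dims, function(x) x, simplify = FALSE) # stays a named list
stopifnot(is.character(a), is.list(b), identical(names(b), grp.dims))
```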
@@ -230,8 +230,8 @@
# - [x] catch dots, preprocess, evaluate
if (missing(.dots)) .dots = match.call(expand.dots = FALSE)$`...`
i.meta = self$parse.dots(.dots)
-i.ops = i.meta$ops
-i.sub = i.meta$sub
+i.ops = i.meta$ops # operation type: ., -, +, ^
+i.sub = i.meta$sub # subset filtering conditions
i.grp = i.meta$grp # aggregation sets - rollup, cube
# exit on `dc[.(),.(),.()]` considering drop, exit from `dc[]` wont use drop and is handled in "[.data.cube" function
if (all(sapply(i.sub, identical, list())) && all(i.ops==".")) return(
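For context, the `match.call(expand.dots = FALSE)$`...`` idiom used above captures the subsetting arguments unevaluated, which is what lets operators like `` `-` `` pass through as symbols. A standalone sketch (argument names hypothetical, not from the package):

```r
# capture dots as a named list of unevaluated expressions;
# `.("2015")` and `-` are never evaluated, only recorded
f = function(...) match.call(expand.dots = FALSE)$`...`
dots = f(time = .("2015"), geog = `-`)
stopifnot(is.list(dots), identical(names(dots), c("time", "geog")))
```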
13 changes: 5 additions & 8 deletions R/dimension.R
@@ -97,16 +97,13 @@ dimension = R6Class(
invisible(self)
},
rollup = function(x, i.ops) {
-stopifnot(is.character(i.ops))
-browser()
+stopifnot(is.character(x), is.character(i.ops))
+r = new.env()
+r$hierarchies = sapply(self$hierarchies, function(h) h$rollup(x), simplify=FALSE)
+
+browser()
-if (is.list(x)) {
-stopifnot(names(x) %chin% names(self$hierarchies))
-r$hierarchies = sapply(self$hierarchies, function(h) h$rollup(x), simplify=FALSE)
-
-} else {
-
-}
as.data.cube.environment(r)
}
)
)
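The `new.env()` pattern above accumulates components in an environment before conversion by the package-internal `as.data.cube.environment`. A toy standalone sketch of the pattern (not part of the commit; hierarchy values fabricated):

```r
# accumulate per-hierarchy results in a fresh environment;
# simplify=FALSE keeps the named-list shape
r = new.env()
hierarchies = list(time = c("year", "month"), geog = c("region", "country"))
r$hierarchies = sapply(hierarchies, function(h) rev(h), simplify = FALSE)
stopifnot(is.environment(r), identical(names(r$hierarchies), c("time", "geog")))
```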
13 changes: 12 additions & 1 deletion R/hierarchy.R
@@ -18,11 +18,22 @@ hierarchy = R6Class(
invisible(self)
},
rollup = function(x) {
stopifnot(is.character(x))
# all higher attributes in hierarchy are taken
# if not used, it won't be calculated in facts
browser()
# time.calendar = as.hierarchy(list(
# year = character(),
# quarter = character(),
# month = character(),
# date = c("year","quarter","month")
# ))
# define grain
lvl.match = chmatch(x, names(self$levels))

grain.lvl = names(time.calendar)[max(lvl.match)]

browser()

}
)
)
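The grain detection sketched above (still work in progress behind `browser()` calls) picks the deepest hierarchy level actually used. An illustration with base `match()` standing in for `data.table::chmatch()`, using the level ordering of the commented-out `time.calendar` hierarchy (coarse to fine):

```r
# levels ordered coarse to fine, as in the time.calendar sketch
levels = c("year", "quarter", "month", "date")
x = c("quarter", "year")               # attributes actually requested
lvl.match = match(x, levels)           # positions in the hierarchy
grain.lvl = levels[max(lvl.match)]     # deepest level used defines the grain
stopifnot(identical(grain.lvl, "quarter"))
```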
25 changes: 20 additions & 5 deletions vignettes/sub-.data.cube.Rmd
@@ -34,14 +34,17 @@ R *data.cube* class defined in [data.cube](https://gitlab.com/jangorecki/data.cu

## Start session

If not installed, use `install.packages("data.cube", repos = paste0("https://", c("jangorecki.gitlab.io/data.cube","Rdatatable.github.io/data.table","cran.rstudio.com")))`.

```{r run}
# install.packages("data.cube", repos = paste0("https://", c("jangorecki.gitlab.io/data.cube","Rdatatable.github.io/data.table","cran.rstudio.com")))
library(data.table)
library(data.cube)
```

## Tiny example

2x2x2 array vs data.cube.

```{r tiny}
set.seed(1)
# array
Expand Down Expand Up @@ -74,13 +77,15 @@ format(dc[,"2015", c("UK","IN")],

## Hierarchies example

Filtering on attributes in the hierarchies of a dimension.

```{r hierarchies}
# as.data.cube.list - investigate X to see structure
# populate fact, dimensions and hierarchies
X = populate_star(N=1e5)
lapply(X[1:2], sapply, ncol)
lapply(X[1:2], sapply, nrow)
str(X[3L], max.level=3L) # hierarchy defined as list of levels and attributes on each level
-dc = as.data.cube(X)
+dc = as.data.cube(X) # as.data.cube.list
print(dc)
# slice
@@ -114,8 +119,10 @@ dc[,,, .(geog_region_name = "North Central")

## Aggregates

Collapsing a dimension into a sub-aggregate cube can be done with the quoted `` `-` `` symbol. It is also possible to filter on that dimension before collapsing it, passing filter conditions the same way as with `.(...)`, using `` `-`(...) ``.
The example below filters on attributes and groups by customer and currency.

```{r aggregate_collapse}
# aggregate by collapse dimension with `-` symbol, group by customer and currency
dc[product = `-`,
customer = .(),
currency = .(curr_type = "crypto"),
@@ -125,7 +132,9 @@ dc[product = `-`,

## Scalability

-```{r scalability}
+Memory footprint of data.cube vs. array.
+
+```{r memory_efficiency}
# ~1e5 facts for 5 dims of cardinalities: 32, 32, 49, 50, 1826
dc = as.data.cube(populate_star(N=1e5))
## estimated size of memory required to store an base R `array` for single numeric measure
@@ -148,3 +157,9 @@ sprintf("array: %.2f GB", (prod(dim(dc)) * 8)/(1024^3))
## fact table of *cube* object having multiple measures
sprintf("data.cube: %.2f GB", as.numeric(object.size(dc$fact$data))/(1024^3))
```
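As a sanity check on the estimate above (cardinalities taken from the chunk comment, 8 bytes per numeric cell), the dense-array size works out to roughly 34 GB:

```r
# dimension cardinalities from the populate_star(N=1e5) comment above
card = c(32, 32, 49, 50, 1826)
gb = prod(card) * 8 / 1024^3   # dense numeric array, 8 bytes per cell
stopifnot(prod(card) == 4581068800, abs(gb - 34.13) < 0.01)
```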

The memory cap is the most common scalability issue. It is addressed by sharding the fact table, using the [`big.data.table`](https://github.com/jangorecki/big.data.table) class instead of `data.table` for the `fact` class object. The CI process includes basic testing of that feature on the old `cube` class; it hasn't yet been implemented for `data.cube`.

R's single-CPU limitation is being addressed by parallelizing some data.table calls. This is being developed at quite a low level, in a well-scalable way.
Sharding can also offload CPU computation to the nodes.
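A toy illustration of the sharding idea, splitting a fact table's rows across nodes by a modulo hash of the key (plain `data.frame` here; `big.data.table` distributes real `data.table` shards over remote R nodes):

```r
# assign each fact row to one of n.nodes shards by key hash
n.nodes = 4L
fact = data.frame(cust_id = 1:10, amount = (1:10) * 1.5)
node = (fact$cust_id %% n.nodes) + 1L   # node index per row
shards = split(fact, node)
# no rows lost, and no more shards than nodes
stopifnot(sum(sapply(shards, nrow)) == nrow(fact), length(shards) <= n.nodes)
```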
