mlr-org · sebffischer · Oct 18, 2024 · Jan 10, 2025 · Jan 10, 2025 · Jan 10, 2025
diff --git a/.github/workflows/r-cmd-check.yml b/.github/workflows/r-cmd-check.yml
@@ -82,7 +82,7 @@ jobs:
         if: runner.os == 'Windows'
         id: get_package_version_windows
         run: |
-          $version = Rscript -e 'cat(as.character(packageVersion("torchvision")))' 
+          $version = Rscript -e 'cat(as.character(packageVersion("torchvision")))'
           echo "TORCHVISION_PACKAGE_VERSION=$version" >> $env:GITHUB_ENV
 
       - name: Get torch cache path (Linux/macOS)

diff --git a/.gitignore b/.gitignore
@@ -9,12 +9,18 @@ mlr3torch*.tgz
 *~
 docs
 inst/doc
-*.html
 **/.DS_Store
 /doc/
 /Meta/
 CRAN-SUBMISSION
 paper/data
 .idea/
 .vsc/
-paper/data
+paper/data/
+paper/benchmark/registry
+.vscode/
+paper/benchmark/registry-linux-cpu/
+paper/benchmark/registry-macos/
+paper/benchmark/registry-linux-gpu/
+paper/benchmark/registry-linux-gpu-optimizer/
+paper/benchmark/registry-linux-gpu-old/
diff --git a/R/learner_torch_methods.R b/R/learner_torch_methods.R
@@ -27,7 +27,8 @@ learner_torch_train = function(self, private, super, task, param_vals) {
     stopf("Training Dataloader of Learner '%s' has length 0", self$id)
   }
 
-  network = private$.network(task, param_vals)$to(device = param_vals$device)
+  network = private$.network(task, param_vals)
+  network$to(device = param_vals$device)
   if (isTRUE(param_vals$jit_trace) && !inherits(network, "script_module")) {
     example = get_example_batch(loader_train)$x
     example = lapply(example, function(x) x$to(device = param_vals$device))
@@ -134,6 +135,8 @@ train_loop = function(ctx, cbs) {
 
   ctx$network$train()
 
+  forward = get_forward(ctx$network)
+
   # if we increment epoch at the end of the loop it has the wrong value
   # during the final two callback stages
   ctx$epoch = 0L
@@ -145,6 +148,7 @@ train_loop = function(ctx, cbs) {
     indices = list()
     train_iterator = dataloader_make_iter(ctx$loader_train)
     ctx$step = 0L
+    eval_train = eval_train_in_epoch(ctx)
     while (ctx$step < length(ctx$loader_train)) {
       ctx$step = ctx$step + 1
       ctx$batch = dataloader_next(train_iterator)
@@ -155,9 +159,9 @@ train_loop = function(ctx, cbs) {
       call("on_batch_begin")
 
       if (length(ctx$batch$x) == 1L) {
-        ctx$y_hat = ctx$network(ctx$batch$x[[1L]])
+        ctx$y_hat = forward(ctx$batch$x[[1L]])
       } else {
-        ctx$y_hat = do.call(ctx$network, ctx$batch$x)
+        ctx$y_hat = do.call(forward, ctx$batch$x)
       }
 
       loss = ctx$loss_fn(ctx$y_hat, ctx$batch$y)
@@ -167,14 +171,16 @@ train_loop = function(ctx, cbs) {
       call("on_after_backward")
 
       ctx$last_loss = loss$item()
-      predictions[[length(predictions) + 1]] = ctx$y_hat$detach()
-      indices[[length(indices) + 1]] = as.integer(ctx$batch$.index$to(device = "cpu"))
+      if (eval_train) {
+        predictions[[length(predictions) + 1]] = ctx$y_hat$detach()
+        indices[[length(indices) + 1]] = as.integer(ctx$batch$.index$to(device = "cpu"))
+      }
       ctx$optimizer$step()
 
       call("on_batch_end")
     }
 
-    ctx$last_scores_train = if (eval_train_in_epoch(ctx)) {
+    ctx$last_scores_train = if (eval_train) {
       measure_prediction(
         pred_tensor = torch_cat(predictions, dim = 1L),
         measures = ctx$measures_train,

diff --git a/R/nn.R b/R/nn.R
@@ -11,9 +11,7 @@
 #' # is the same as:
 #' po2 = nn("linear")
 nn = function(.key, ...) {
-  args = list(...)
-  if (is.null(args$id)) {
-    args$id = .key
-  }
-  invoke(po, .obj = paste0("nn_", .key), .args = args)
+  # Default the id to `.key`, but let an `id` supplied via `...` take precedence;
+  # forwarding both `id = .key` and `...` would pass `id` to `po()` twice.
+  invoke(po, .obj = paste0("nn_", .key), .args = insert_named(list(id = .key), list(...)))
 }
diff --git a/R/utils.R b/R/utils.R
@@ -275,6 +275,23 @@ order_named_args = function(f, l) {
   l2
 }
 
+get_forward = function(net) {
+  if (inherits(net, "script_module")) {
+    is_training = net$is_training
+    trainforward = net$trainforward
+    evalforward = net$evalforward
+    function(...) {
+      if (is_training()) {
+        trainforward(...)
+      } else {
+        evalforward(...)
+      }
+    }
+  } else {
+    net$forward
+  }
+}
+
 
 #' @title Network Output Dimension
 #' @description
@@ -314,7 +331,7 @@ all_or_none_ = function(...) {
 single_lazy_tensor = function(task) {
   identical(task$feature_types[, "type"][[1L]], "lazy_tensor")
 }
-                              
+
 n_num_features = function(task) {
   sum(task$feature_types$type %in% c("numeric", "integer"))
 }
@@ -325,4 +342,4 @@ n_categ_features = function(task) {
 
 n_ltnsr_features = function(task) {
   sum(task$feature_types$type == "lazy_tensor")
-}
+}
diff --git a/man-roxygen/paramset_torchlearner.R b/man-roxygen/paramset_torchlearner.R
@@ -64,7 +64,7 @@
 #'   The batch size (required).
 #' * `shuffle` :: `logical(1)`\cr
 #'   Whether to shuffle the instances in the dataset. This is initialized to `TRUE`,
-#'   which differs from the default (`FALSE`).
+#'   which differs from the default of [`torch::dataloader`], which is `FALSE`.
 #' * `sampler` :: [`torch::sampler`]\cr
 #'   Object that defines how the dataloader draw samples.
 #' * `batch_sampler` :: [`torch::sampler`]\cr
@@ -91,4 +91,4 @@
 #' * `worker_packages` :: `character()`\cr
 #'   Which packages to load on the workers.
 #'
-#' Also see `torch::dataloder` for more information.
+#' Also see [`torch::dataloader`] for more information.
diff --git a/mlr3torch-benchmark-5274609.out b/mlr3torch-benchmark-5274609.out
@@ -0,0 +1,75 @@
+[INFO] Extracting squashfs filesystem...
+Parallel unsquashfs: Using 92 processors
+57832 inodes (209999 blocks) to write
+
+
+created 55975 files
+created 6137 directories
+created 1735 symlinks
+created 0 devices
+created 0 fifos
+created 0 sockets
+
+==========
+== CUDA ==
+==========
+
+CUDA Version 12.4.1
+
+Container image Copyright (c) 2016-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+This container image and its contents are governed by the NVIDIA Deep Learning Container License.
+By pulling and using the container, you accept the terms and conditions of this license:
+https://developer.nvidia.com/ngc/nvidia-deep-learning-container-license
+
+A copy of this license is made available in this container at /NGC-DL-CONTAINER-LICENSE for your convenience.
+
+WARNING: The NVIDIA Driver was not detected.  GPU functionality will not be available.
+   Use the NVIDIA Container Toolkit to start this container with GPU support; see
+   https://docs.nvidia.com/datacenter/cloud-native/ .
+
+R version 4.5.0 (2025-04-11)
+Platform: x86_64-pc-linux-gnu
+Running under: Ubuntu 22.04.4 LTS
+
+Matrix products: default
+BLAS:   /usr/local/lib/R/lib/libRblas.so 
+LAPACK: /usr/local/lib/R/lib/libRlapack.so;  LAPACK version 3.12.1
+
+locale:
+ [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
+ [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
+ [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
+ [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
+ [9] LC_ADDRESS=C               LC_TELEPHONE=C            
+[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
+
+time zone: Etc/UTC
+tzcode source: system (glibc)
+
+attached base packages:
+[1] stats     graphics  grDevices utils     datasets  methods   base     
+
+loaded via a namespace (and not attached):
+[1] compiler_4.5.0
+
+Attaching package: ‘mlr3misc’
+
+The following object is masked from ‘package:batchtools’:
+
+    chunk
+
+Sourcing configuration file '/mnt/data/mlr3torch/paper/batchtools.conf.R' ...
+Loading required package: checkmate
+Created registry in '/mnt/data/mlr3torch/paper/benchmark/registry' using cluster functions 'Interactive'
+Exporting new objects: 'time_rtorch' ...
+Adding problem 'runtime_train'
+Adding algorithm 'pytorch'
+Adding algorithm 'rtorch'
+Adding algorithm 'mlr3torch'
+Adding 180 experiments ('runtime_train'[30] x 'rtorch'[2] x repls[3]) ...
+Adding 180 experiments ('runtime_train'[30] x 'mlr3torch'[2] x repls[3]) ...
+Adding 180 experiments ('runtime_train'[30] x 'pytorch'[2] x repls[3]) ...
+Adding 180 experiments ('runtime_train'[30] x 'rtorch'[2] x repls[3]) ...
+Adding 180 experiments ('runtime_train'[30] x 'mlr3torch'[2] x repls[3]) ...
+Adding 180 experiments ('runtime_train'[30] x 'pytorch'[2] x repls[3]) ...