Skip to content

Commit a70e832

Browse files
authored
[R-package] [python-package] deprecate Dataset arguments to cv() and train() (#6446)
1 parent ae55f32 commit a70e832

File tree

15 files changed

+185
-61
lines changed

15 files changed

+185
-61
lines changed

R-package/R/lgb.cv.R

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@ CVBooster <- R6::R6Class(
2525
#' @description Cross validation logic used by LightGBM
2626
#' @inheritParams lgb_shared_params
2727
#' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples.
28-
#' @param label Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}
29-
#' @param weight vector of response values. If not NULL, will set to dataset
28+
#' @param label Deprecated. See "Deprecated Arguments" section below.
29+
#' @param weight Deprecated. See "Deprecated Arguments" section below.
3030
#' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
3131
#' @param showsd \code{boolean}, whether to show standard deviation of cross validation.
3232
#' This parameter defaults to \code{TRUE}. Setting it to \code{FALSE} can lead to a
@@ -36,10 +36,8 @@ CVBooster <- R6::R6Class(
3636
#' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds
3737
#' (each element must be a vector of test fold's indices). When folds are supplied,
3838
#' the \code{nfold} and \code{stratified} parameters are ignored.
39-
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset
40-
#' @param categorical_feature categorical features. This can either be a character vector of feature
41-
#' names or an integer vector with the indices of the features (e.g.
42-
#' \code{c(1L, 10L)} to say "the first and tenth columns").
39+
#' @param colnames Deprecated. See "Deprecated Arguments" section below.
40+
#' @param categorical_feature Deprecated. See "Deprecated Arguments" section below.
4341
#' @param callbacks List of callback functions that are applied at each iteration.
4442
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model
4543
#' into a predictor model which frees up memory and the original datasets
@@ -70,6 +68,13 @@ CVBooster <- R6::R6Class(
7068
#' , nfold = 3L
7169
#' )
7270
#' }
71+
#'
72+
#' @section Deprecated Arguments:
73+
#'
74+
#' A future release of \code{lightgbm} will require passing an \code{lgb.Dataset}
75+
#' to argument \code{'data'}. It will also remove support for passing arguments
76+
#' \code{'categorical_feature'}, \code{'colnames'}, \code{'label'}, and \code{'weight'}.
77+
#'
7378
#' @importFrom data.table data.table setorderv
7479
#' @export
7580
lgb.cv <- function(params = list()
@@ -102,12 +107,32 @@ lgb.cv <- function(params = list()
102107

103108
# If 'data' is not an lgb.Dataset, try to construct one using 'label'
104109
if (!.is_Dataset(x = data)) {
110+
warning(paste0(
111+
"Passing anything other than an lgb.Dataset object to lgb.cv() is deprecated. "
112+
, "Either pass an lgb.Dataset object, or use lightgbm()."
113+
))
105114
if (is.null(label)) {
106115
stop("'label' must be provided for lgb.cv if 'data' is not an 'lgb.Dataset'")
107116
}
108117
data <- lgb.Dataset(data = data, label = label)
109118
}
110119

120+
# raise deprecation warnings if necessary
121+
# ref: https://github.com/microsoft/LightGBM/issues/6435
122+
args <- names(match.call())
123+
if ("categorical_feature" %in% args) {
124+
.emit_dataset_kwarg_warning("categorical_feature", "lgb.cv")
125+
}
126+
if ("colnames" %in% args) {
127+
.emit_dataset_kwarg_warning("colnames", "lgb.cv")
128+
}
129+
if ("label" %in% args) {
130+
.emit_dataset_kwarg_warning("label", "lgb.cv")
131+
}
132+
if ("weight" %in% args) {
133+
.emit_dataset_kwarg_warning("weight", "lgb.cv")
134+
}
135+
111136
# set some parameters, resolving the way they were passed in with other parameters
112137
# in `params`.
113138
# this ensures that the model stored with Booster$save() correctly represents

R-package/R/lgb.train.R

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,8 @@
66
#' @inheritParams lgb_shared_params
77
#' @param valids a list of \code{lgb.Dataset} objects, used for validation
88
#' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
9-
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset
10-
#' @param categorical_feature categorical features. This can either be a character vector of feature
11-
#' names or an integer vector with the indices of the features (e.g.
12-
#' \code{c(1L, 10L)} to say "the first and tenth columns").
9+
#' @param colnames Deprecated. See "Deprecated Arguments" section below.
10+
#' @param categorical_feature Deprecated. See "Deprecated Arguments" section below.
1311
#' @param callbacks List of callback functions that are applied at each iteration.
1412
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the
1513
#' booster model into a predictor model which frees up memory and the
@@ -43,6 +41,13 @@
4341
#' , early_stopping_rounds = 3L
4442
#' )
4543
#' }
44+
#'
45+
#' @section Deprecated Arguments:
46+
#'
47+
#' A future release of \code{lightgbm} will remove support for passing arguments
48+
#' \code{'categorical_feature'} and \code{'colnames'}. Pass those arguments to
49+
#' \code{lgb.Dataset} instead.
50+
#'
4651
#' @export
4752
lgb.train <- function(params = list(),
4853
data,
@@ -78,6 +83,16 @@ lgb.train <- function(params = list(),
7883
}
7984
}
8085

86+
# raise deprecation warnings if necessary
87+
# ref: https://github.com/microsoft/LightGBM/issues/6435
88+
args <- names(match.call())
89+
if ("categorical_feature" %in% args) {
90+
.emit_dataset_kwarg_warning("categorical_feature", "lgb.train")
91+
}
92+
if ("colnames" %in% args) {
93+
.emit_dataset_kwarg_warning("colnames", "lgb.train")
94+
}
95+
8196
# set some parameters, resolving the way they were passed in with other parameters
8297
# in `params`.
8398
# this ensures that the model stored with Booster$save() correctly represents

R-package/R/lightgbm.R

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,12 @@ NULL
144144
#'
145145
#' \emph{New in version 4.0.0}
146146
#'
147+
#' @param colnames Character vector of feature names. Only used if \code{data} is not an \code{\link{lgb.Dataset}}.
148+
#' @param categorical_feature categorical features. This can either be a character vector of feature
149+
#' names or an integer vector with the indices of the features (e.g.
150+
#' \code{c(1L, 10L)} to say "the first and tenth columns").
151+
#' Only used if \code{data} is not an \code{\link{lgb.Dataset}}.
152+
#'
147153
#' @param ... Additional arguments passed to \code{\link{lgb.train}}. For example
148154
#' \itemize{
149155
#' \item{\code{valids}: a list of \code{lgb.Dataset} objects, used for validation}
@@ -152,10 +158,6 @@ NULL
152158
#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}}
153159
#' \item{\code{eval}: evaluation function, can be (a list of) character or custom eval function}
154160
#' \item{\code{record}: Boolean, TRUE will record iteration message to \code{booster$record_evals}}
155-
#' \item{\code{colnames}: feature names, if not null, will use this to overwrite the names in dataset}
156-
#' \item{\code{categorical_feature}: categorical features. This can either be a character vector of feature
157-
#' names or an integer vector with the indices of the features (e.g. \code{c(1L, 10L)} to
158-
#' say "the first and tenth columns").}
159161
#' \item{\code{reset_data}: Boolean, setting it to TRUE (not the default value) will transform the booster model
160162
#' into a predictor model which frees up memory and the original datasets}
161163
#' }
@@ -176,6 +178,8 @@ lightgbm <- function(data,
176178
objective = "auto",
177179
init_score = NULL,
178180
num_threads = NULL,
181+
colnames = NULL,
182+
categorical_feature = NULL,
179183
...) {
180184

181185
# validate inputs early to avoid unnecessary computation
@@ -221,7 +225,14 @@ lightgbm <- function(data,
221225

222226
# Check whether data is lgb.Dataset, if not then create lgb.Dataset manually
223227
if (!.is_Dataset(x = dtrain)) {
224-
dtrain <- lgb.Dataset(data = data, label = label, weight = weights, init_score = init_score)
228+
dtrain <- lgb.Dataset(
229+
data = data
230+
, label = label
231+
, weight = weights
232+
, init_score = init_score
233+
, categorical_feature = categorical_feature
234+
, colnames = colnames
235+
)
225236
}
226237

227238
train_args <- list(

R-package/R/utils.R

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,3 +260,19 @@
260260
return(a == b)
261261
}
262262
}
263+
264+
# ref: https://github.com/microsoft/LightGBM/issues/6435
265+
# Emit a deprecation warning for a Dataset-related keyword argument.
#
# Builds a message of the form
#   "Argument '<argname>' to <calling_function>() is deprecated ..."
# and raises it with warning().
#
# The formals are ordered (argname, calling_function) because the call sites
# in lgb.cv() and lgb.train() pass the argument name first and the function
# name second, e.g. .emit_dataset_kwarg_warning("colnames", "lgb.cv").
# With the previous (calling_function, argname) order those positional calls
# produced "Argument 'lgb.cv' to colnames() is deprecated ...".
#
# @param argname character(1), name of the deprecated argument
#                (e.g. "categorical_feature").
# @param calling_function character(1), name of the function the argument was
#                         passed to, without parentheses (e.g. "lgb.train").
# @return NULL, invisibly. Called for its side effect of raising a warning.
.emit_dataset_kwarg_warning <- function(argname, calling_function) {
    msg <- sprintf(
        paste0(
            "Argument '%s' to %s() is deprecated and will be removed in a future release. "
            , "Set '%s' with lgb.Dataset() instead. "
            , "See https://github.com/microsoft/LightGBM/issues/6435."
        )
        # 'argname' fills the first and third '%s', 'calling_function' the second
        , argname
        , calling_function
        , argname
    )
    warning(msg)
    return(invisible(NULL))
}

R-package/man/lgb.cv.Rd

Lines changed: 13 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

R-package/man/lgb.train.Rd

Lines changed: 11 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

R-package/man/lightgbm.Rd

Lines changed: 9 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

R-package/tests/testthat/test_basic.R

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -433,7 +433,7 @@ test_that("lgb.cv() rejects negative or 0 value passed to nrounds", {
433433
}
434434
})
435435

436-
test_that("lgb.cv() throws an informative error is 'data' is not an lgb.Dataset and labels are not given", {
436+
test_that("lgb.cv() throws an informative error if 'data' is not an lgb.Dataset and labels are not given", {
437437
bad_values <- list(
438438
4L
439439
, "hello"
@@ -1788,11 +1788,6 @@ test_that("lgb.train() works with early stopping for regression with a metric th
17881788

17891789

17901790
test_that("lgb.train() supports non-ASCII feature names", {
1791-
dtrain <- lgb.Dataset(
1792-
data = matrix(rnorm(400L), ncol = 4L)
1793-
, label = rnorm(100L)
1794-
, params = list(num_threads = .LGB_MAX_THREADS)
1795-
)
17961791
# content below is equivalent to
17971792
#
17981793
# feature_names <- c("F_零", "F_一", "F_二", "F_三")
@@ -1805,6 +1800,12 @@ test_that("lgb.train() supports non-ASCII feature names", {
18051800
, rawToChar(as.raw(c(0x46, 0x5f, 0xe4, 0xba, 0x8c)))
18061801
, rawToChar(as.raw(c(0x46, 0x5f, 0xe4, 0xb8, 0x89)))
18071802
)
1803+
dtrain <- lgb.Dataset(
1804+
data = matrix(rnorm(400L), ncol = 4L)
1805+
, label = rnorm(100L)
1806+
, params = list(num_threads = .LGB_MAX_THREADS)
1807+
, colnames = feature_names
1808+
)
18081809
bst <- lgb.train(
18091810
data = dtrain
18101811
, nrounds = 5L
@@ -1814,7 +1815,6 @@ test_that("lgb.train() supports non-ASCII feature names", {
18141815
, verbose = .LGB_VERBOSITY
18151816
, num_threads = .LGB_MAX_THREADS
18161817
)
1817-
, colnames = feature_names
18181818
)
18191819
expect_true(.is_Booster(bst))
18201820
dumped_model <- jsonlite::fromJSON(bst$dump_model())
@@ -2838,7 +2838,11 @@ test_that(paste0("lgb.train() gives same result when interaction_constraints is
28382838

28392839
test_that(paste0("lgb.train() gives same results when using interaction_constraints and specifying colnames"), {
28402840
set.seed(1L)
2841-
dtrain <- lgb.Dataset(train$data, label = train$label, params = list(num_threads = .LGB_MAX_THREADS))
2841+
dtrain <- lgb.Dataset(
2842+
train$data
2843+
, label = train$label
2844+
, params = list(num_threads = .LGB_MAX_THREADS)
2845+
)
28422846

28432847
params <- list(
28442848
objective = "regression"
@@ -2854,6 +2858,7 @@ test_that(paste0("lgb.train() gives same results when using interaction_constrai
28542858
pred1 <- bst$predict(test$data)
28552859

28562860
new_colnames <- paste0(colnames(train$data), "_x")
2861+
dtrain$set_colnames(new_colnames)
28572862
params <- list(
28582863
objective = "regression"
28592864
, interaction_constraints = list(c(new_colnames[1L], new_colnames[2L]), new_colnames[3L])
@@ -2864,7 +2869,6 @@ test_that(paste0("lgb.train() gives same results when using interaction_constrai
28642869
data = dtrain
28652870
, params = params
28662871
, nrounds = 2L
2867-
, colnames = new_colnames
28682872
)
28692873
pred2 <- bst$predict(test$data)
28702874

0 commit comments

Comments
 (0)