From a5d68f303ba3498a0c528a25f275f96dbef2c94e Mon Sep 17 00:00:00 2001 From: venom1204 Date: Sat, 16 Aug 2025 11:36:24 +0000 Subject: [PATCH 1/8] added a warning --- NEWS.md | 15 +++++++++++++++ R/data.table.R | 10 ++++++++++ inst/tests/tests.Rraw | 9 +++++++++ man/data.table.Rd | 4 ++-- 4 files changed, 36 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index fc69449438..b14c50454d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -107,6 +107,21 @@ 18. `fwrite` now allows `dec` to be the same as `sep` for edge cases where only one will be written, e.g. 0-row or 1-column tables. [#7227](https://github.com/Rdatatable/data.table/issues/7227). Thanks @MichaelChirico for the report and @venom1204 for the fix. +22. Using `by=` or `keyby=` with a simple numeric or character vector in `j` (e.g. `DT[, 1:2, by=grp]`) used to silently ignore the grouping argument. This now issues a warning to alert the user that grouping is not applied in this syntax and guides them to use the `.SD` idiom instead. [#5397](https://github.com/Rdatatable/data.table/issues/5397). Thanks to @mcol for the report and @venom1204 for the fix. + + ```r + DT = data.table(a=1:4, grp=c(1,1,2,2)) + DT[, 1, by = grp] + # a + # + # 1: 1 + # 2: 2 + # 3: 3 + # 4: 4 + # Warning message: + # `by` or `keyby` is ignored when `j` is a numeric vector... + ``` + ### NOTES 1. The following in-progress deprecations have proceeded: diff --git a/R/data.table.R b/R/data.table.R index c64c391b33..b7654b4204 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -739,6 +739,12 @@ replace_dot_alias = function(e) { if (!length(j) && !notj) return( null.data.table() ) if (is.factor(j)) j = as.character(j) # fix for FR: #358 if (is.character(j)) { + if (!missingby && (missing(with) || isTRUE(with))) { + warning( + "`by` or `keyby` is ignored when `j` is a character vector used for column selection. ", + "Perhaps you intended to use `.SD`? For example: DT[, .SD[, ", deparse(jsub), "], by = ...]" + ) + } if (notj) { if (anyNA(idx <- chmatch(j, names_x))) warningf(ngettext(sum(is.na(idx)), "column not removed because not found: %s", "columns not removed because not found: %s"), @@ -762,6 +768,10 @@ replace_dot_alias = function(e) { # else the NA in ansvals are for join inherited scope (test 1973), and NA could be in irows from join and data in i should be returned (test 1977) # in both cases leave to the R-level subsetting of i and x together further below } else if (is.numeric(j)) { + if (!missingby) { + warning( + "`by` or `keyby` is ignored when `j` is a numeric vector used for column selection. ", "Perhaps you intended to use `.SD`? For example: DT[, .SD[, ", deparse(jsub), "], by = ...]") + } j = as.integer(j) if (any(w <- (j>ncol(x)))) stopf("Item %d of j is %d which is outside the column number range [1,ncol=%d]", idx <- which.first(w), j[idx], ncol(x)) j = j[j!=0L] diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d7ffc476bb..ac365e07f8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21620,3 +21620,12 @@ local({ test(2338.9, {fwrite(dd, f, forceDecimal=FALSE); fread(f)}, di) }) +# 5397 - keyby/key ignored if numeric indices used in j +DT = data.table(a=1:4, b=5:8, g=c(1,1,2,2)) +test(2339.1, DT[, 1:2, by=g], DT[, 1:2], warning="`by` or `keyby` is ignored") +test(2339.2, DT[, 2:1, keyby=g], DT[, 2:1], warning="`by` or `keyby` is ignored") +test(2339.3, DT[, c("b", "a"), by=g, with=FALSE], DT[, c("b", "a")]) +expected_sd = data.table(g=c(1,1,2,2), a=1:4, b=5:8) +test(2339.4, DT[, .SD[, 1:2], by=g], expected_sd) +expected_single_int = data.table(g=c(1,2), V1=c(1,1)) +test(2339.5, DT[, 1, by=g], expected_single_int) \ No newline at end of file diff --git a/man/data.table.Rd b/man/data.table.Rd index 658c57234f..8f15f5ce79 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -97,8 +97,8 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac See \href{../doc/datatable-intro.html}{\code{vignette("datatable-intro")}} and \code{example(data.table)}.} - \item{by}{ Column names are seen as if they are variables (as in \code{j} when \code{with=TRUE}). The \code{data.table} is then grouped by the \code{by} and \code{j} is evaluated within each group. The order of the rows within each group is preserved, as is the order of the groups. \code{by} accepts: - + \item{by}{ Column names are seen as if they are variables (as in \code{j} when \code{with=TRUE}). \emph{Note that `by` and `keyby` are ignored when `j` is a character or numeric vector used for selecting columns (i.e., when the internal `with=FALSE` is triggered).} The \code{data.table} is then grouped by the \code{by} and \code{j} is evaluated within each group. The order of the rows within each group is preserved, as is the order of the groups. \code{by} accepts: + \itemize{ \item A single unquoted column name: e.g., \code{DT[, .(sa=sum(a)), by=x]} From ea5458b35fd1d523ede80c2b978d554841e4823d Mon Sep 17 00:00:00 2001 From: venom1204 Date: Sat, 16 Aug 2025 11:39:44 +0000 Subject: [PATCH 2/8] .. --- R/data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index b7654b4204..1615519fa9 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -768,7 +768,7 @@ replace_dot_alias = function(e) { # else the NA in ansvals are for join inherited scope (test 1973), and NA could be in irows from join and data in i should be returned (test 1977) # in both cases leave to the R-level subsetting of i and x together further below } else if (is.numeric(j)) { - if (!missingby) { + if (!missingby && (missing(with) || isTRUE(with))) { warning( "`by` or `keyby` is ignored when `j` is a numeric vector used for column selection. ", "Perhaps you intended to use `.SD`? For example: DT[, .SD[, ", deparse(jsub), "], by = ...]") } From 87b375f327d0f290818b9af17a84a1c2f5ea9a1d Mon Sep 17 00:00:00 2001 From: venom1204 Date: Sat, 16 Aug 2025 17:32:42 +0000 Subject: [PATCH 3/8] corrected --- NEWS.md | 2 +- R/data.table.R | 18 ++++++++++-------- inst/tests/tests.Rraw | 2 +- man/data.table.Rd | 2 +- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/NEWS.md b/NEWS.md index b14c50454d..d3be30ec1b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -107,7 +107,7 @@ 18. `fwrite` now allows `dec` to be the same as `sep` for edge cases where only one will be written, e.g. 0-row or 1-column tables. [#7227](https://github.com/Rdatatable/data.table/issues/7227). Thanks @MichaelChirico for the report and @venom1204 for the fix. -22. Using `by=` or `keyby=` with a simple numeric or character vector in `j` (e.g. `DT[, 1:2, by=grp]`) used to silently ignore the grouping argument. This now issues a warning to alert the user that grouping is not applied in this syntax and guides them to use the `.SD` idiom instead. [#5397](https://github.com/Rdatatable/data.table/issues/5397). Thanks to @mcol for the report and @venom1204 for the fix. +19. Using `by=` or `keyby=` with a simple numeric or character vector in `j` (e.g. `DT[, 1:2, by=grp]`) used to silently ignore the grouping argument. This now issues a warning to alert the user that grouping is not applied in this syntax and guides them to use the `.SD` idiom instead. [#5397](https://github.com/Rdatatable/data.table/issues/5397). Thanks to @mcol for the report and @venom1204 for the fix. ```r DT = data.table(a=1:4, grp=c(1,1,2,2)) diff --git a/R/data.table.R b/R/data.table.R index 1615519fa9..7f78ae6b8b 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -739,11 +739,11 @@ replace_dot_alias = function(e) { if (!length(j) && !notj) return( null.data.table() ) if (is.factor(j)) j = as.character(j) # fix for FR: #358 if (is.character(j)) { - if (!missingby && (missing(with) || isTRUE(with))) { - warning( - "`by` or `keyby` is ignored when `j` is a character vector used for column selection. ", - "Perhaps you intended to use `.SD`? For example: DT[, .SD[, ", deparse(jsub), "], by = ...]" - ) + if (!missingby) { + warning( + "`by` or `keyby` is ignored when `j` is a character vector used for column selection. ", + "Perhaps you intended to use `.SD`? For example: DT[, .SD[, ", deparse(jsub), "], by = ...]" + ) } if (notj) { if (anyNA(idx <- chmatch(j, names_x))) @@ -768,9 +768,11 @@ replace_dot_alias = function(e) { # else the NA in ansvals are for join inherited scope (test 1973), and NA could be in irows from join and data in i should be returned (test 1977) # in both cases leave to the R-level subsetting of i and x together further below } else if (is.numeric(j)) { - if (!missingby && (missing(with) || isTRUE(with))) { - warning( - "`by` or `keyby` is ignored when `j` is a numeric vector used for column selection. ", "Perhaps you intended to use `.SD`? For example: DT[, .SD[, ", deparse(jsub), "], by = ...]") + if (!missingby) { + warning( + "`by` or `keyby` is ignored when `j` is a numeric vector used for column selection. ", + "Perhaps you intended to use `.SD`? For example: DT[, .SD[, ", deparse(jsub), "], by = ...]" + ) } j = as.integer(j) if (any(w <- (j>ncol(x)))) stopf("Item %d of j is %d which is outside the column number range [1,ncol=%d]", idx <- which.first(w), j[idx], ncol(x)) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ac365e07f8..f3a8f60095 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21624,7 +21624,7 @@ local({ DT = data.table(a=1:4, b=5:8, g=c(1,1,2,2)) test(2339.1, DT[, 1:2, by=g], DT[, 1:2], warning="`by` or `keyby` is ignored") test(2339.2, DT[, 2:1, keyby=g], DT[, 2:1], warning="`by` or `keyby` is ignored") -test(2339.3, DT[, c("b", "a"), by=g, with=FALSE], DT[, c("b", "a")]) +test(2339.3, DT[, c("b", "a"), by=g, with=FALSE], DT[, c("b", "a")], warning="`by` or `keyby` is ignored") expected_sd = data.table(g=c(1,1,2,2), a=1:4, b=5:8) test(2339.4, DT[, .SD[, 1:2], by=g], expected_sd) expected_single_int = data.table(g=c(1,2), V1=c(1,1)) diff --git a/man/data.table.Rd b/man/data.table.Rd index 8f15f5ce79..f583b5439e 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -97,7 +97,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac See \href{../doc/datatable-intro.html}{\code{vignette("datatable-intro")}} and \code{example(data.table)}.} - \item{by}{ Column names are seen as if they are variables (as in \code{j} when \code{with=TRUE}). \emph{Note that `by` and `keyby` are ignored when `j` is a character or numeric vector used for selecting columns (i.e., when the internal `with=FALSE` is triggered).} The \code{data.table} is then grouped by the \code{by} and \code{j} is evaluated within each group. The order of the rows within each group is preserved, as is the order of the groups. \code{by} accepts: + \item{by}{ Column names are seen as if they are variables (as in \code{j} when \code{with=TRUE}). \emph{Note that \code{by} and \code{keyby} are ignored when \code{j} is a character or numeric vector used for selecting columns (i.e., when the internal \code{with=FALSE} is triggered).} The \code{data.table} is then grouped by the \code{by} and \code{j} is evaluated within each group. ...} \itemize{ \item A single unquoted column name: e.g., \code{DT[, .(sa=sum(a)), by=x]} From 7b845330badde941a32038b151ec396afcc6c5cc Mon Sep 17 00:00:00 2001 From: venom1204 Date: Sat, 16 Aug 2025 18:23:46 +0000 Subject: [PATCH 4/8] lintr --- R/data.table.R | 9 +++++---- man/data.table.Rd | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 7f78ae6b8b..2f789b7adf 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -741,8 +741,9 @@ replace_dot_alias = function(e) { if (is.character(j)) { if (!missingby) { warning( - "`by` or `keyby` is ignored when `j` is a character vector used for column selection. ", - "Perhaps you intended to use `.SD`? For example: DT[, .SD[, ", deparse(jsub), "], by = ...]" + "`by` or `keyby` is ignored when `j` is a character vector used for column selection. ", + "Perhaps you intended to use `.SD`? For example: DT[, .SD[, ", deparse(jsub), "], by = ...]", + call. = FALSE ) } if (notj) { @@ -770,8 +771,8 @@ replace_dot_alias = function(e) { } else if (is.numeric(j)) { if (!missingby) { warning( - "`by` or `keyby` is ignored when `j` is a numeric vector used for column selection. ", - "Perhaps you intended to use `.SD`? For example: DT[, .SD[, ", deparse(jsub), "], by = ...]" + "`by` or `keyby` is ignored when `j` is a numeric vector used for column selection. ", "Perhaps you intended to use `.SD`? For example: DT[, .SD[, ", deparse(jsub), "], by = ...]", + call. = FALSE ) } j = as.integer(j) diff --git a/man/data.table.Rd b/man/data.table.Rd index f583b5439e..d2dccead3e 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -97,8 +97,8 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac See \href{../doc/datatable-intro.html}{\code{vignette("datatable-intro")}} and \code{example(data.table)}.} - \item{by}{ Column names are seen as if they are variables (as in \code{j} when \code{with=TRUE}). \emph{Note that \code{by} and \code{keyby} are ignored when \code{j} is a character or numeric vector used for selecting columns (i.e., when the internal \code{with=FALSE} is triggered).} The \code{data.table} is then grouped by the \code{by} and \code{j} is evaluated within each group. ...} - + \item{by}{ Column names are seen as if they are variables (as in \code{j} when \code{with=TRUE}). \emph{Note that \code{by} and \code{keyby} are ignored when \code{j} is a character or numeric vector used for selecting columns (i.e., when the internal \code{with=FALSE} is triggered).} The \code{data.table} is then grouped by the \code{by} and \code{j} is evaluated within each group. The order of the rows within each group is preserved, as is the order of the groups. \code{by} accepts: + \itemize{ \item A single unquoted column name: e.g., \code{DT[, .(sa=sum(a)), by=x]} From 242847aa96971d23d9ed14de0aa49742d0a07536 Mon Sep 17 00:00:00 2001 From: venom1204 Date: Wed, 20 Aug 2025 11:06:07 +0000 Subject: [PATCH 5/8] restructure --- R/data.table.R | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 2f789b7adf..bbc4042ffe 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -735,17 +735,22 @@ replace_dot_alias = function(e) { names(..syms) = ..syms j = eval(jsub, lapply(substr(..syms, 3L, nchar(..syms)), get, pos=parent.frame()), parent.frame()) } - if (is.logical(j)) j = which(j) if (!length(j) && !notj) return( null.data.table() ) if (is.factor(j)) j = as.character(j) # fix for FR: #358 + if (is.character(j) || (is.numeric(j) && !is.logical(j))) { + if (!missingby) { + j_type = if (is.character(j)) "a character" else "a numeric" + warning( + "`by` or `keyby` is ignored when `j` is ", j_type, " vector used for column selection. ", + "Perhaps you intended to use `.SD`? For example: DT[, .SD[, ", deparse(jsub), "], by = ...]", + call. = FALSE + ) + } + } + + if (is.logical(j)) j = which(j) + if (is.character(j)) { - if (!missingby) { - warning( - "`by` or `keyby` is ignored when `j` is a character vector used for column selection. ", - "Perhaps you intended to use `.SD`? For example: DT[, .SD[, ", deparse(jsub), "], by = ...]", - call. = FALSE - ) - } if (notj) { if (anyNA(idx <- chmatch(j, names_x))) warningf(ngettext(sum(is.na(idx)), "column not removed because not found: %s", "columns not removed because not found: %s"), @@ -769,12 +774,6 @@ replace_dot_alias = function(e) { # else the NA in ansvals are for join inherited scope (test 1973), and NA could be in irows from join and data in i should be returned (test 1977) # in both cases leave to the R-level subsetting of i and x together further below } else if (is.numeric(j)) { - if (!missingby) { - warning( - "`by` or `keyby` is ignored when `j` is a numeric vector used for column selection. ", "Perhaps you intended to use `.SD`? For example: DT[, .SD[, ", deparse(jsub), "], by = ...]", - call. = FALSE - ) - } j = as.integer(j) if (any(w <- (j>ncol(x)))) stopf("Item %d of j is %d which is outside the column number range [1,ncol=%d]", idx <- which.first(w), j[idx], ncol(x)) j = j[j!=0L] From 418a74e11a8011b5ad2b8a9ace249bf6f912c77e Mon Sep 17 00:00:00 2001 From: venom1204 Date: Wed, 20 Aug 2025 20:35:29 +0000 Subject: [PATCH 6/8] added backticks --- R/data.table.R | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index bbc4042ffe..9b1cbcf1bd 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -740,11 +740,8 @@ replace_dot_alias = function(e) { if (is.character(j) || (is.numeric(j) && !is.logical(j))) { if (!missingby) { j_type = if (is.character(j)) "a character" else "a numeric" - warning( - "`by` or `keyby` is ignored when `j` is ", j_type, " vector used for column selection. ", - "Perhaps you intended to use `.SD`? For example: DT[, .SD[, ", deparse(jsub), "], by = ...]", - call. = FALSE - ) + fmt = "`by` or `keyby` is ignored when `j` is %s vector used for column selection. Perhaps you intended to use `.SD`? For example: DT[, .SD[, %s], by = ...]" + warning(sprintf(fmt, j_type, gsub("%", "%%", deparse(jsub))), call. = FALSE) } } From c2d84edb3c3bb3c135fe75f0b9efb134d3a2ec3e Mon Sep 17 00:00:00 2001 From: venom1204 Date: Wed, 20 Aug 2025 20:36:45 +0000 Subject: [PATCH 7/8] .. --- R/data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 9b1cbcf1bd..0832f21a01 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -741,7 +741,7 @@ replace_dot_alias = function(e) { if (!missingby) { j_type = if (is.character(j)) "a character" else "a numeric" fmt = "`by` or `keyby` is ignored when `j` is %s vector used for column selection. Perhaps you intended to use `.SD`? For example: DT[, .SD[, %s], by = ...]" - warning(sprintf(fmt, j_type, gsub("%", "%%", deparse(jsub))), call. = FALSE) + warningf(sprintf(fmt, j_type, gsub("%", "%%", deparse(jsub))), call. = FALSE) } } From b150b0a5ffc6865fd2f29a954ef8aec023d38808 Mon Sep 17 00:00:00 2001 From: venom1204 Date: Wed, 20 Aug 2025 20:45:32 +0000 Subject: [PATCH 8/8] warningf --- R/data.table.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 0832f21a01..f60ade1ca0 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -740,8 +740,10 @@ replace_dot_alias = function(e) { if (is.character(j) || (is.numeric(j) && !is.logical(j))) { if (!missingby) { j_type = if (is.character(j)) "a character" else "a numeric" - fmt = "`by` or `keyby` is ignored when `j` is %s vector used for column selection. Perhaps you intended to use `.SD`? For example: DT[, .SD[, %s], by = ...]" - warningf(sprintf(fmt, j_type, gsub("%", "%%", deparse(jsub))), call. = FALSE) + warningf( + "`by` or `keyby` is ignored when `j` is %s vector used for column selection. Perhaps you intended to use `.SD`? For example: DT[, .SD[, %s], by = ...]",j_type, + gsub("%", "%%", deparse(jsub)) + ) } }