Skip to content

Commit 5a084b0

Browse files
authored
slice_max() and slice_min() speed (#217)
And ensure it works with character columns. Fixes #216. Fixes #218.
1 parent 8ece213 commit 5a084b0

File tree

3 files changed

+50
-27
lines changed

3 files changed

+50
-27
lines changed

NEWS.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
# dtplyr (development version)
22

3+
* speed up `slice_min()` and `slice_max()` after `group_by()` (@mgirlich, #216).
4+
5+
* `slice_max()` now works when ordering by a character column (@mgirlich, #218).
6+
37
* `pivot_wider()` now names the columns correctly when `names_from` is a
48
numeric column (@mgirlich, #214).
59

R/step-subset-slice.R

Lines changed: 40 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -108,22 +108,16 @@ slice_min.dtplyr_step <- function(.data, order_by, ..., n, prop, with_ties = TRU
108108
if (missing(order_by)) {
109109
abort("argument `order_by` is missing, with no default.")
110110
}
111-
order_by <- enexpr(order_by)
112111

113-
ellipsis::check_dots_empty()
114-
size <- check_slice_size(n, prop)
115-
if (with_ties) {
116-
j <- switch(size$type,
117-
n = expr(.SD[order(!!order_by)][!!smaller_ranks(!!order_by, !!size$n)]),
118-
prop = expr(.SD[order(!!order_by)][!!smaller_ranks(!!order_by, !!size$prop * .N)])
119-
)
120-
} else {
121-
j <- switch(size$type,
122-
n = expr(head(.SD[order(!!order_by)], !!size$n)),
123-
prop = expr(head(.SD[order(!!order_by)], !!size$prop * .N))
124-
)
125-
}
126-
step_subset_j(.data, j = j)
112+
slice_min_max(
113+
.data,
114+
order_by = enexpr(order_by),
115+
decreasing = FALSE,
116+
...,
117+
n = n,
118+
prop = prop,
119+
with_ties = with_ties
120+
)
127121
}
128122

129123
#' @rdname slice.dtplyr_step
@@ -133,30 +127,49 @@ slice_max.dtplyr_step <- function(.data, order_by, ..., n, prop, with_ties = TRU
133127
if (missing(order_by)) {
134128
abort("argument `order_by` is missing, with no default.")
135129
}
136-
order_by <- enexpr(order_by)
137130

131+
slice_min_max(
132+
.data,
133+
order_by = enexpr(order_by),
134+
decreasing = TRUE,
135+
...,
136+
n = n,
137+
prop = prop,
138+
with_ties = with_ties
139+
)
140+
}
141+
142+
slice_min_max <- function(.data, order_by, decreasing, ..., n, prop, with_ties = TRUE) {
138143
ellipsis::check_dots_empty()
139144
size <- check_slice_size(n, prop)
145+
146+
if (decreasing) {
147+
order_by <- expr(desc(!!order_by))
148+
}
149+
140150
if (with_ties) {
141-
j <- switch(size$type,
142-
n = expr(.SD[order(!!order_by, decreasing = TRUE)][!!smaller_ranks(-!!order_by, !!size$n)]),
143-
prop = expr(.SD[order(!!order_by, decreasing = TRUE)][!!smaller_ranks(-!!order_by, !!size$prop * .N)])
144-
)
151+
ties.method <- "min"
145152
} else {
146-
j <- switch(size$type,
147-
n = expr(head(.SD[order(!!order_by, decreasing = TRUE)], !!size$n)),
148-
prop = expr(head(.SD[order(!!order_by, decreasing = TRUE)], !!size$prop * .N))
149-
)
153+
ties.method <- "first"
150154
}
151155

152-
step_subset_j(.data, j = j)
156+
i <- switch(size$type,
157+
n = expr(!!smaller_ranks(!!order_by, !!size$n, ties.method = ties.method)),
158+
prop = expr(!!smaller_ranks(!!order_by, !!size$prop * .N, ties.method = ties.method))
159+
)
160+
161+
step_subset_i(.data, i) %>%
162+
arrange(!!order_by, .by_group = TRUE)
153163
}
154164

155-
smaller_ranks <- function(x, y) {
165+
smaller_ranks <- function(x, y, ties.method = "min") {
156166
x <- enexpr(x)
157167
y <- enexpr(y)
158168

159-
expr(frankv(!!x, ties.method = "min", na.last = "keep") <= !!y)
169+
# `frank()` by group is much slower than rank
170+
# https://github.com/Rdatatable/data.table/issues/3988
171+
# also https://github.com/Rdatatable/data.table/issues/4284
172+
expr(rank(!!x, ties.method = !!ties.method, na.last = "keep") <= !!y)
160173
}
161174

162175
#' @importFrom dplyr slice_sample

tests/testthat/test-step-subset-slice.R

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,12 @@ test_that("min and max return ties by default", {
6868
expect_equal(dt %>% slice_max(x, with_ties = FALSE) %>% collect() %>% nrow(), 1)
6969
})
7070

71+
test_that("min and max work with character", {
72+
dt <- lazy_dt(data.table(x = c("b", "a", "d", "c")))
73+
expect_equal(dt %>% slice_min(x) %>% pull(x), "a")
74+
expect_equal(dt %>% slice_max(x) %>% pull(x), "d")
75+
})
76+
7177
test_that("min and max reorder results", {
7278
dt <- lazy_dt(data.frame(id = 1:4, x = c(2, 3, 1, 2)))
7379

0 commit comments

Comments
 (0)