slice_max() and slice_min() speed (#217)

mgirlich · web-flow · commit 5a084b0c7ba1 · 2021-03-05T07:18:27.000-06:00
Also ensure `slice_min()` and `slice_max()` work with character columns. Fixes #216. Fixes #218.
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,9 @@
 # dtplyr (development version)
 
+* `slice_min()` and `slice_max()` are now faster after `group_by()` (@mgirlich, #216).
+
+* `slice_max()` now works when ordering by a character column (@mgirlich, #218).
+
 * `pivot_wider()` now names the columns correctly when `names_from` is a
   numeric column (@mgirlich, #214).
 
diff --git a/R/step-subset-slice.R b/R/step-subset-slice.R
@@ -108,22 +108,16 @@ slice_min.dtplyr_step <- function(.data, order_by, ..., n, prop, with_ties = TRU
   if (missing(order_by)) {
     abort("argument `order_by` is missing, with no default.")
   }
-  order_by <- enexpr(order_by)
 
-  ellipsis::check_dots_empty()
-  size <- check_slice_size(n, prop)
-  if (with_ties) {
-    j <- switch(size$type,
-      n =    expr(.SD[order(!!order_by)][!!smaller_ranks(!!order_by, !!size$n)]),
-      prop = expr(.SD[order(!!order_by)][!!smaller_ranks(!!order_by, !!size$prop * .N)])
-    )
-  } else {
-    j <- switch(size$type,
-      n =    expr(head(.SD[order(!!order_by)], !!size$n)),
-      prop = expr(head(.SD[order(!!order_by)], !!size$prop * .N))
-    )
-  }
-  step_subset_j(.data, j = j)
+  slice_min_max(
+    .data,
+    order_by = enexpr(order_by),
+    decreasing = FALSE,
+    ...,
+    n =  n,
+    prop = prop,
+    with_ties = with_ties
+  )
 }
 
 #' @rdname slice.dtplyr_step
@@ -133,30 +127,49 @@ slice_max.dtplyr_step <- function(.data, order_by, ..., n, prop, with_ties = TRU
   if (missing(order_by)) {
     abort("argument `order_by` is missing, with no default.")
   }
-  order_by <- enexpr(order_by)
 
+  slice_min_max(
+    .data,
+    order_by = enexpr(order_by),
+    decreasing = TRUE,
+    ...,
+    n =  n,
+    prop = prop,
+    with_ties = with_ties
+  )
+}
+
+slice_min_max <- function(.data, order_by, decreasing, ..., n, prop, with_ties = TRUE) {
   ellipsis::check_dots_empty()
   size <- check_slice_size(n, prop)
+
+  if (decreasing) {
+    order_by <- expr(desc(!!order_by))
+  }
+
   if (with_ties) {
-    j <- switch(size$type,
-      n = expr(.SD[order(!!order_by, decreasing = TRUE)][!!smaller_ranks(-!!order_by, !!size$n)]),
-      prop = expr(.SD[order(!!order_by, decreasing = TRUE)][!!smaller_ranks(-!!order_by, !!size$prop * .N)])
-    )
+    ties.method <- "min"
   } else {
-    j <- switch(size$type,
-      n =    expr(head(.SD[order(!!order_by, decreasing = TRUE)], !!size$n)),
-      prop = expr(head(.SD[order(!!order_by, decreasing = TRUE)], !!size$prop * .N))
-    )
+    ties.method <- "first"
   }
 
-  step_subset_j(.data, j = j)
+  i <- switch(size$type,
+    n = expr(!!smaller_ranks(!!order_by, !!size$n, ties.method = ties.method)),
+    prop = expr(!!smaller_ranks(!!order_by, !!size$prop * .N, ties.method = ties.method))
+  )
+
+  step_subset_i(.data, i) %>%
+    arrange(!!order_by, .by_group = TRUE)
 }
 
-smaller_ranks <- function(x, y) {
+smaller_ranks <- function(x, y, ties.method = "min") {
   x <- enexpr(x)
   y <- enexpr(y)
 
-  expr(frankv(!!x, ties.method = "min", na.last = "keep") <= !!y)
+  # `frank()` by group is much slower than rank
+  # https://github.com/Rdatatable/data.table/issues/3988
+  # also https://github.com/Rdatatable/data.table/issues/4284
+  expr(rank(!!x, ties.method = !!ties.method, na.last = "keep") <= !!y)
 }
 
 #' @importFrom dplyr slice_sample
diff --git a/tests/testthat/test-step-subset-slice.R b/tests/testthat/test-step-subset-slice.R
@@ -68,6 +68,12 @@ test_that("min and max return ties by default", {
   expect_equal(dt %>% slice_max(x, with_ties = FALSE) %>% collect() %>% nrow(), 1)
 })
 
+test_that("min and max work with character", {
+  dt <- lazy_dt(data.table(x = c("b", "a", "d", "c")))
+  expect_equal(dt %>% slice_min(x) %>% pull(x), "a")
+  expect_equal(dt %>% slice_max(x) %>% pull(x), "d")
+})
+
 test_that("min and max reorder results", {
   dt <- lazy_dt(data.frame(id = 1:4, x = c(2, 3, 1, 2)))