Skip to content

Commit daea473

Browse files
Implement pivot_longer() (#204)
* Implement pivot_longer() * Use hack to attach tidyr when documenting
1 parent c887a26 commit daea473

File tree

9 files changed

+699
-3
lines changed

9 files changed

+699
-3
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,6 @@ VignetteBuilder:
3939
knitr
4040
Encoding: UTF-8
4141
LazyData: true
42-
Roxygen: list(markdown = TRUE)
42+
Roxygen: {library(tidyr); list(markdown = TRUE)}
4343
RoxygenNote: 7.1.1
4444
Config/testthat/edition: 3

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424

2525
* `fill()` (@markfairbanks, #197)
2626

27+
* `pivot_longer()` (@markfairbanks, #204)
28+
2729
* `replace_na()` (@markfairbanks, #202)
2830

2931
# dtplyr 1.1.0

R/step-call-pivot_longer.R

Lines changed: 366 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,366 @@
1+
#' Pivot data from wide to long
2+
#'
3+
#' @description
4+
#' This is a method for the tidyr `pivot_longer()` generic. It is translated to
5+
#' [data.table::melt()]
6+
#'
7+
#' @param data A [lazy_dt()].
8+
#' @inheritParams tidyr::pivot_longer
9+
#' @param names_ptypes,names_transform,values_ptypes,values_transform
10+
#' Not currently supported by dtplyr.
11+
#' @examples
12+
#' library(tidyr)
13+
#'
14+
#' # Simplest case where column names are character data
15+
#' relig_income_dt <- lazy_dt(relig_income)
16+
#' relig_income_dt %>%
17+
#' pivot_longer(!religion, names_to = "income", values_to = "count")
18+
#'
19+
#' # Slightly more complex case where columns have common prefix,
20+
#' # and missing missings are structural so should be dropped.
21+
#' billboard_dt <- lazy_dt(billboard)
22+
#' billboard %>%
23+
#' pivot_longer(
24+
#' cols = starts_with("wk"),
25+
#' names_to = "week",
26+
#' names_prefix = "wk",
27+
#' values_to = "rank",
28+
#' values_drop_na = TRUE
29+
#' )
30+
#'
31+
#' # Multiple variables stored in column names
32+
#' lazy_dt(who) %>%
33+
#' pivot_longer(
34+
#' cols = new_sp_m014:newrel_f65,
35+
#' names_to = c("diagnosis", "gender", "age"),
36+
#' names_pattern = "new_?(.*)_(.)(.*)",
37+
#' values_to = "count"
38+
#' )
39+
#'
40+
#' # Multiple observations per row
41+
#' anscombe_dt <- lazy_dt(anscombe)
42+
#' anscombe_dt %>%
43+
#' pivot_longer(
44+
#' everything(),
45+
#' names_to = c(".value", "set"),
46+
#' names_pattern = "(.)(.)"
47+
#' )
48+
# exported onLoad
49+
pivot_longer.dtplyr_step <- function(data,
50+
cols,
51+
names_to = "name",
52+
names_prefix = NULL,
53+
names_sep = NULL,
54+
names_pattern = NULL,
55+
names_ptypes = NULL,
56+
names_transform = NULL,
57+
names_repair = "check_unique",
58+
values_to = "value",
59+
values_drop_na = FALSE,
60+
values_ptypes = NULL,
61+
values_transform = NULL,
62+
...) {
63+
64+
if (!is.null(names_ptypes)) {
65+
abort("`names_ptypes` is not supported by dtplyr")
66+
}
67+
68+
if (!is.null(names_transform)) {
69+
abort("`names_transform` is not supported by dtplyr")
70+
}
71+
72+
if (!is.null(values_ptypes)) {
73+
abort("`values_ptypes` is not supported by dtplyr")
74+
}
75+
76+
if (!is.null(values_transform)) {
77+
abort("`values_transform` is not supported by dtplyr")
78+
}
79+
80+
sim_data <- simulate_vars(data)
81+
measure_vars <- names(tidyselect::eval_select(enquo(cols), sim_data))
82+
if (length(measure_vars) == 0) {
83+
abort("`cols` must select at least one column.")
84+
}
85+
86+
multiple_names_to <- length(names_to) > 1
87+
uses_dot_value <- ".value" %in% names_to
88+
89+
variable_name <- "variable"
90+
91+
if (uses_dot_value) {
92+
if (!is.null(names_sep)) {
93+
.value <- str_separate(measure_vars, into = names_to, sep = names_sep)$.value
94+
} else if (!is.null(names_pattern)) {
95+
.value <- str_extract(measure_vars, into = names_to, names_pattern)$.value
96+
} else {
97+
abort("If you use '.value' in `names_to` you must also supply
98+
`names_sep' or `names_pattern")
99+
}
100+
101+
v_fct <- factor(.value, levels = unique(.value))
102+
measure_vars <- split(measure_vars, v_fct)
103+
values_to <- names(measure_vars)
104+
names(measure_vars) <- NULL
105+
106+
if (multiple_names_to) {
107+
variable_name <- names_to[!names_to == ".value"]
108+
}
109+
} else if (multiple_names_to) {
110+
if (is.null(names_sep) && is.null(names_pattern)) {
111+
abort("If you supply multiple names in `names_to` you must also
112+
supply `names_sep` or `names_pattern`")
113+
} else if (!is.null(names_sep) && !is.null(names_pattern)) {
114+
abort("only one of names_sep or names_pattern should be provided")
115+
}
116+
} else {
117+
variable_name <- names_to
118+
}
119+
120+
args <- list(
121+
measure.vars = measure_vars,
122+
variable.name = variable_name,
123+
value.name = values_to,
124+
na.rm = values_drop_na,
125+
variable.factor = FALSE
126+
)
127+
128+
# Clean up call args if defaults are used
129+
if (variable_name == "variable") {
130+
args$variable.name <- NULL
131+
}
132+
133+
if (identical(values_to, "value")) {
134+
args$value.name <- NULL
135+
}
136+
137+
if (is_false(values_drop_na)) {
138+
args$na.rm <- NULL
139+
}
140+
141+
sim_vars <- names(sim_data)
142+
id_vars <- sim_vars[!sim_vars %in% unlist(measure_vars)]
143+
144+
out <- step_call(
145+
data,
146+
"melt",
147+
args = args,
148+
vars = c(id_vars, variable_name, values_to)
149+
)
150+
151+
if (!is.null(names_prefix)) {
152+
out <- mutate(out, !!variable_name := gsub(paste0("^", names_prefix), "", !!sym(variable_name)))
153+
}
154+
155+
if (multiple_names_to && !uses_dot_value) {
156+
if (!is.null(names_sep)) {
157+
into_cols <- str_separate(pull(out, !!sym(variable_name)), names_to, sep = names_sep)
158+
} else {
159+
into_cols <- str_extract(pull(out, !!sym(variable_name)), into = names_to, regex = names_pattern)
160+
}
161+
out <- mutate(out, !!!into_cols)
162+
163+
# Need to drop variable_name and move names_to vars to correct position
164+
# Recreates relocate logic so only select is necessary, not relocate + select
165+
out_vars <- out$vars
166+
var_idx <- which(out_vars == variable_name)
167+
before_vars <- out_vars[seq_along(out_vars) < var_idx]
168+
after_vars <- out_vars[seq_along(out_vars) > var_idx]
169+
170+
out <- select(out, !!!syms(before_vars), !!!syms(names_to), !!!syms(after_vars))
171+
} else if (!multiple_names_to && uses_dot_value) {
172+
out <- mutate(out, variable = NULL)
173+
}
174+
175+
step_repair(out, repair = names_repair)
176+
}
177+
178+
# exported onLoad
179+
pivot_longer.data.table <- function(data,
180+
cols,
181+
names_to = "name",
182+
names_prefix = NULL,
183+
names_sep = NULL,
184+
names_pattern = NULL,
185+
names_ptypes = NULL,
186+
names_transform = NULL,
187+
names_repair = "check_unique",
188+
values_to = "value",
189+
values_drop_na = FALSE,
190+
values_ptypes = NULL,
191+
values_transform = NULL,
192+
...) {
193+
data <- lazy_dt(data)
194+
tidyr::pivot_longer(
195+
data = data,
196+
cols = {{ cols }},
197+
names_to = names_to,
198+
names_prefix = names_prefix,
199+
names_sep = names_sep,
200+
names_pattern = names_pattern,
201+
names_ptypes = names_ptypes,
202+
names_transform = names_transform,
203+
names_repair = names_repair,
204+
values_to = values_to,
205+
values_drop_na = values_drop_na,
206+
values_ptypes = values_ptypes,
207+
values_transform = values_transform,
208+
...
209+
)
210+
}
211+
212+
# ==============================================================================
213+
# inlined from tidyr
214+
# https://github.com/tidyverse/tidyr/issues/1103
215+
# ==============================================================================
216+
# nocov start
217+
218+
# str_extract() -----------------------------------------------------------------
219+
str_extract <- function(x, into, regex, convert = FALSE) {
220+
stopifnot(
221+
is_string(regex),
222+
is_character(into)
223+
)
224+
225+
out <- str_match_first(x, regex)
226+
if (length(out) != length(into)) {
227+
stop(
228+
"`regex` should define ", length(into), " groups; ", ncol(out), " found.",
229+
call. = FALSE
230+
)
231+
}
232+
233+
# Handle duplicated names
234+
if (anyDuplicated(into)) {
235+
pieces <- split(out, into)
236+
into <- names(pieces)
237+
out <- lapply(pieces, pmap_chr, paste0, sep = "")
238+
}
239+
240+
into <- as_utf8_character(into)
241+
242+
non_na_into <- !is.na(into)
243+
out <- out[non_na_into]
244+
names(out) <- into[non_na_into]
245+
246+
if (convert) {
247+
out[] <- lapply(out, utils::type.convert, as.is = TRUE)
248+
}
249+
250+
out
251+
}
252+
253+
str_match_first <- function(string, regex) {
254+
loc <- regexpr(regex, string, perl = TRUE)
255+
loc <- group_loc(loc)
256+
257+
out <- lapply(
258+
seq_len(loc$matches),
259+
function(i) substr(string, loc$start[, i], loc$end[, i])
260+
)
261+
out[-1]
262+
}
263+
264+
group_loc <- function(x) {
265+
start <- cbind(as.vector(x), attr(x, "capture.start"))
266+
end <- start + cbind(attr(x, "match.length"), attr(x, "capture.length")) - 1L
267+
268+
no_match <- start == -1L
269+
start[no_match] <- NA
270+
end[no_match] <- NA
271+
272+
list(matches = ncol(start), start = start, end = end)
273+
}
274+
275+
# str_separate() -----------------------------------------------------------------
276+
277+
str_separate <- function(x, into, sep, convert = FALSE, extra = "warn", fill = "warn") {
278+
if (!is.character(into)) {
279+
abort("`into` must be a character vector")
280+
}
281+
282+
if (is.numeric(sep)) {
283+
out <- strsep(x, sep)
284+
} else if (is_character(sep)) {
285+
out <- data.table::tstrsplit(x, sep, fixed = TRUE, names = TRUE)
286+
out <- as_tibble(out)
287+
} else {
288+
abort("`sep` must be either numeric or character")
289+
}
290+
291+
names(out) <- as_utf8_character(into)
292+
out <- out[!is.na(names(out))]
293+
if (convert) {
294+
out[] <- lapply(out, utils::type.convert, as.is = TRUE)
295+
}
296+
out
297+
}
298+
299+
strsep <- function(x, sep) {
300+
nchar <- nchar(x)
301+
pos <- lapply(sep, function(i) {
302+
if (i >= 0) return(i)
303+
pmax(0, nchar + i)
304+
})
305+
pos <- c(list(0), pos, list(nchar))
306+
307+
lapply(1:(length(pos) - 1), function(i) {
308+
substr(x, pos[[i]] + 1, pos[[i + 1]])
309+
})
310+
}
311+
312+
str_split_n <- function(x, pattern, n_max = -1) {
313+
if (is.factor(x)) {
314+
x <- as.character(x)
315+
}
316+
m <- gregexpr(pattern, x, perl = TRUE)
317+
if (n_max > 0) {
318+
m <- lapply(m, function(x) slice_match(x, seq_along(x) < n_max))
319+
}
320+
regmatches(x, m, invert = TRUE)
321+
}
322+
323+
slice_match <- function(x, i) {
324+
structure(
325+
x[i],
326+
match.length = attr(x, "match.length")[i],
327+
index.type = attr(x, "index.type"),
328+
useBytes = attr(x, "useBytes")
329+
)
330+
}
331+
332+
list_indices <- function(x, max = 20) {
333+
if (length(x) > max) {
334+
x <- c(x[seq_len(max)], "...")
335+
}
336+
337+
paste(x, collapse = ", ")
338+
}
339+
340+
# pmap()/pmap_chr() -----------------------------------------------------------------
341+
342+
args_recycle <- function(args) {
343+
lengths <- vapply(args, length, integer(1))
344+
n <- max(lengths)
345+
346+
stopifnot(all(lengths == 1L | lengths == n))
347+
to_recycle <- lengths == 1L
348+
args[to_recycle] <- lapply(args[to_recycle], function(x) rep.int(x, n))
349+
350+
args
351+
}
352+
353+
pmap <- function(.l, .f, ...) {
354+
args <- args_recycle(.l)
355+
do.call("mapply", c(
356+
FUN = list(quote(.f)),
357+
args, MoreArgs = quote(list(...)),
358+
SIMPLIFY = FALSE, USE.NAMES = FALSE
359+
))
360+
}
361+
362+
pmap_chr <- function(.l, .f, ...) {
363+
as.character(pmap(.l, .f, ...))
364+
}
365+
366+
# nocov end

R/tidyeval.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ dt_eval <- function(x) {
1414
# Make sure data.table functions are available so dtplyr still works
1515
# even when data.table isn't attached
1616
dt_funs <- c(
17-
"copy", "dcast", "nafill",
17+
"copy", "dcast", "melt", "nafill",
1818
"fcase", "fcoalesce", "fintersect", "frank", "frankv", "fsetdiff", "funion",
1919
"setcolorder", "setnames"
2020
)

0 commit comments

Comments
 (0)