Skip to content

Commit 880bed5

Browse files
authored
Import R data frame attributes as metadata (#93)
Use the `metadata` function recently added to DataAPI and DataFrames to import R `data.frame` attributes and set them as `DataFrame` metadata. R stores per-column attributes in vector objects, while DataFrames.jl stores them in the `DataFrame` object, as there is no generic mechanism to attach metadata to an `AbstractVector` object. The `row.names` attribute is currently imported like any other attribute, although storing it as global metadata is not ideal given that it will get out of sync after subsetting rows. We could provide a way to turn row names into a column instead. Also add methods to check equality between two `DictoVec` objects as these are useful for tests (haven commonly sets named numeric vectors to store value labels so this case deserves testing).
1 parent 5df74cf commit 880bed5

File tree

10 files changed

+176
-17
lines changed

10 files changed

+176
-17
lines changed

.github/workflows/ci.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ jobs:
1616
fail-fast: false
1717
matrix:
1818
version:
19-
- '1.0'
19+
- '1.6'
2020
- '1' # automatically expands to the latest stable 1.x release of Julia
2121
- 'nightly'
2222
os:
@@ -27,7 +27,7 @@ jobs:
2727
- x86
2828
include: # macos doesn't support x86
2929
- os: macos-latest
30-
version: '1.0'
30+
version: '1.6'
3131
arch: x64
3232
- os: macos-latest
3333
version: '1'

Project.toml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
name = "RData"
22
uuid = "df47a6cb-8c03-5eed-afd8-b6050d6c41da"
3-
version = "0.8.3"
3+
version = "1.0.0"
44

55
[deps]
66
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
77
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
8+
DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
89
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
910
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
1011
FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
@@ -15,11 +16,12 @@ Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
1516
[compat]
1617
CategoricalArrays = "0.8, 0.9, 0.10"
1718
CodecZlib = "0.4, 0.5, 0.6, 0.7"
18-
DataFrames = "0.21, 0.22, 1.0"
19+
DataAPI = "1.12.0"
20+
DataFrames = "1.4.0"
1921
FileIO = "1.6.5"
2022
Requires = "1.0.0"
2123
TimeZones = "0.7, 0.8, 0.9, 0.10, 1.0"
22-
julia = "1"
24+
julia = "1.6"
2325

2426
[extras]
2527
CodecBzip2 = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd"

src/DictoVec.jl

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,15 @@ struct DictoVec{T}
2626
end
2727
end
2828

29+
# Equality between two `DictoVec` objects takes both the name-to-index mapping
# and the stored data into account. `==` applies `==` to each field, so it
# inherits the usual `NaN`/`missing` semantics of the data comparison;
# `isequal` applies `isequal` to each field instead.
function Base.:(==)(dict1::DictoVec, dict2::DictoVec)
    return dict1.name2index == dict2.name2index && dict1.data == dict2.data
end

function Base.isequal(dict1::DictoVec, dict2::DictoVec)
    return isequal(dict1.name2index, dict2.name2index) &&
           isequal(dict1.data, dict2.data)
end

# Type-specific constant mixed into the hash; its width follows that of `UInt`.
const hash_dictovec_seed = UInt === UInt64 ? 0xe00ac4bbcfc2fa07 : 0x57f3f900

# Hash the same two fields that `isequal` compares: the data first (seeded),
# then the name-to-index mapping on top of it.
function Base.hash(dict::DictoVec, h::UInt)
    datahash = hash(dict.data, h + hash_dictovec_seed)
    return hash(dict.name2index, datahash)
end
37+
2938
Base.eltype(::Type{DictoVec{T}}) where T = T
3039
Base.eltype(dict::DictoVec) = eltype(typeof(dict))
3140
Base.length(dict::DictoVec) = length(dict.data)

src/RData.jl

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
module RData
22

3-
using DataFrames, CategoricalArrays, FileIO, TimeZones, Unicode
3+
using DataAPI, DataFrames, CategoricalArrays, FileIO, TimeZones, Unicode
44

55
export
66
sexp2julia,
@@ -44,6 +44,8 @@ end
4444
## supported `kwoptions`:
4545
## convert::Bool (true by default) for converting R objects into Julia equivalents,
4646
## otherwise load() returns R internal representation (ROBJ-derived objects)
47+
## metadata::Bool (true by default) for importing R attributes into metadata
48+
## (only has an effect for data frames currently)
4749
## TODO option for disabling names checking (e.g. column names)
4850
##
4951
##############################################################################
@@ -57,6 +59,7 @@ function fileio_load(s::Stream{format"RData"}; kwoptions...)
5759
@debug "minimal R version: $(ctx.Rmin)"
5860

5961
convert2julia = get(ctx.kwdict, :convert, true)
62+
metadata = get(ctx.kwdict, :metadata, true)
6063

6164
# top level read -- must be a paired list of objects
6265
# we read it here to be able to convert to julia objects inplace
@@ -70,7 +73,7 @@ function fileio_load(s::Stream{format"RData"}; kwoptions...)
7073
tag = readitem(ctx)
7174
obj_name = convert(RString, isa(tag, RSymbol) ? tag.displayname : "\0")
7275
obj = readitem(ctx)
73-
setindex!(res, (convert2julia ? sexp2julia(obj) : obj), obj_name)
76+
setindex!(res, (convert2julia ? sexp2julia(obj, metadata=metadata) : obj), obj_name)
7477
fl = readuint32(ctx.io)
7578
readattrs(ctx, fl)
7679
end
@@ -84,7 +87,8 @@ function fileio_load(s::Stream{format"RDataSingle"}; kwoptions...)
8487
ctx = RDAContext(rdaio(io, chomp(readline(io))); kwoptions...)
8588
@assert ctx.fmtver == 2 || ctx.fmtver == 3 # supported format versions
8689
convert2julia = get(ctx.kwdict, :convert, true)
87-
return convert2julia ? sexp2julia(readitem(ctx)) : readitem(ctx)
90+
metadata = get(ctx.kwdict, :metadata, true)
91+
return convert2julia ? sexp2julia(readitem(ctx), metadata=metadata) : readitem(ctx)
8892
end
8993

9094
function fileio_load(f::Union{File{format"RData"}, File{format"RDataSingle"}};

src/convert.jl

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -194,12 +194,12 @@ function jlvec(::Type{T}, rv::RVEC, force_missing::Bool=true) where T
194194
end
195195
end
196196

197-
function sexp2julia(rex::RSEXPREC)
197+
"""
    sexp2julia(rex::RSEXPREC; metadata::Bool=true)

Conversion method for `RSEXPREC` arguments: no conversion is performed.
A warning naming the type of `rex` is logged (with `maxlog=1`) and `nothing`
is returned.

The `metadata` keyword is accepted so that the method can be called with the
same keywords as the other `sexp2julia` methods; it is not used here.
"""
function sexp2julia(rex::RSEXPREC; metadata::Bool=true)
    @warn "Conversion of $(typeof(rex)) to Julia is not implemented" maxlog=1
    return nothing
end
201201

202-
function sexp2julia(rv::RVEC)
202+
function sexp2julia(rv::RVEC; metadata::Bool=true)
203203
# TODO dimnames?
204204
# FIXME add force_missing option to control whether always convert to Union{T, Missing}
205205
jv = jlvec(rv, false)
@@ -222,22 +222,48 @@ function sexp2julia(rv::RVEC)
222222
end
223223
end
224224

225-
function sexp2julia(rl::RList)
225+
"""
    sexp2julia(rl::RList; metadata::Bool=true)

Convert the R list `rl` to a Julia object.

- If `isdataframe(rl)`, build a `DataFrame` from the elements of `rl.data`
  (`RAltRep` columns go through `sexp2julia`, the others through `jlvec`),
  named `identifier.(names(rl))` with `makeunique=true`.
- Otherwise, if `hasnames(rl)`, return a `DictoVec` keyed by `names(rl)`.
- Otherwise, return `jlvec(Any, rl)`.

# Keywords
- `metadata`: only has an effect for data frames. When `true`, the attributes
  of `rl` are stored as table-level metadata and the attributes of each column
  as column-level metadata. `"names"` and `"class"` (plus `"levels"` for
  columns) are skipped. `"comment"` and `"label"` (plus `"units"` for columns)
  use the `:note` style; every other attribute uses `:default`.
"""
function sexp2julia(rl::RList; metadata::Bool=true)
    if isdataframe(rl)
        # FIXME add force_missing option to control whether always convert to Union{T, Missing}
        cols = Any[isa(col, RAltRep) ? sexp2julia(col) : jlvec(col, false) for col in rl.data]
        obj = DataFrame(cols, identifier.(names(rl)), makeunique=true)
        if metadata
            for (key, val) in pairs(rl.attr)
                # skip already processed system attributes
                key in ("names", "class") && continue
                style = key in ("comment", "label") ? :note : :default
                metadata!(obj, key, sexp2julia(val), style=style)
            end
            # Address columns by position rather than by the requested name:
            # `makeunique=true` may have renamed duplicated names, in which case
            # a lookup by name would attach the attributes to the wrong column.
            for (i, col) in enumerate(rl.data)
                for (key, val) in pairs(col.attr)
                    # skip already processed system attributes
                    key in ("names", "class", "levels") && continue
                    style = key in ("comment", "label", "units") ? :note : :default
                    colmetadata!(obj, i, key, sexp2julia(val), style=style)
                end
            end
        end
    elseif hasnames(rl)
        obj = DictoVec(jlvec(Any, rl), names(rl))
    else
        # FIXME return DictoVec if forceDictoVec is on
        obj = jlvec(Any, rl)
    end
    return obj
end
237263

238-
function sexp2julia(ar::RAltRep)
264+
function sexp2julia(ar::RAltRep; metadata::Bool=true)
239265
if iswrapped(ar)
240-
return sexp2julia(unwrap(ar))
266+
return sexp2julia(unwrap(ar), metadata=metadata)
241267
elseif iscompactseq(ar)
242268
return jlrange(ar)
243269
else

test/DictoVec.jl

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,13 @@ end
2828
@test_throws KeyError dv["a"]
2929
@test_throws KeyError dv[:a]
3030

31+
@test dv == DictoVec(Symbol[]) == DictoVec(Int[])
32+
@test isequal(dv, DictoVec(Symbol[]))
33+
@test isequal(dv, DictoVec(Int[]))
34+
@test dv != DictoVec([:a], ["a"])
35+
@test !isequal(dv, DictoVec([:a], ["a"]))
36+
@test hash(dv) == hash(DictoVec(Symbol[])) == hash(DictoVec(Int[]))
37+
3138
@test get(dv, 1, :x) == :x
3239
@test get(() -> :y, dv, 1) == :y
3340
@test get(dv, "a", :x) == :x
@@ -88,6 +95,17 @@ end
8895
@test collect(keys(dv)) == RData.RString[]
8996
@test values(dv) == [2.0, 5.0, 4.0]
9097

98+
@test dv == DictoVec([2.0, 5.0, 4.0])
99+
@test dv == DictoVec([2, 5, 4])
100+
@test isequal(dv, DictoVec([2.0, 5.0, 4.0]))
101+
@test dv != DictoVec([3.0, 5.0, 4.0])
102+
@test !isequal(dv, DictoVec([3.0, 5.0, 4.0]))
103+
@test dv != DictoVec([2.0, 5.0, 4.0], ["b", "c", "a"])
104+
@test !isequal(dv, DictoVec([2.0, 5.0, 4.0], ["b", "c", "a"]))
105+
@test hash(dv) ==
106+
hash(DictoVec([2.0, 5.0, 4.0])) ==
107+
hash(DictoVec([2, 5, 4]))
108+
91109
@test_throws BoundsError dv[0]
92110
@test_throws BoundsError dv[4]
93111
@test dv[1] == 2.0
@@ -121,6 +139,17 @@ end
121139
@test values(dv) == [2.0, 5.0, 4.0]
122140
@test show2string(dv) == "DictoVec{Float64}(\"a\"=>2.0,\"b\"=>5.0,\"c\"=>4.0)"
123141

142+
@test dv == DictoVec([2.0, 5.0, 4.0], ["a", "b", "c"])
143+
@test dv == DictoVec([2, 5, 4], ["a", "b", "c"])
144+
@test isequal(dv, DictoVec([2.0, 5.0, 4.0], ["a", "b", "c"]))
145+
@test dv != DictoVec([3.0, 5.0, 4.0], ["a", "b", "c"])
146+
@test !isequal(dv, DictoVec([3.0, 5.0, 4.0], ["a", "b", "c"]))
147+
@test dv != DictoVec([2.0, 5.0, 4.0], ["b", "c", "a"])
148+
@test !isequal(dv, DictoVec([2.0, 5.0, 4.0], ["b", "c", "a"]))
149+
@test hash(dv) ==
150+
hash(DictoVec([2.0, 5.0, 4.0], ["a", "b", "c"])) ==
151+
hash(DictoVec([2, 5, 4], ["a", "b", "c"]))
152+
124153
@test dv[1] === 2.0
125154
@test dv["a"] === 2.0
126155
@test dv[[1, 3]] == [2.0, 4.0]
@@ -142,6 +171,23 @@ end
142171
@test show2string(dv) == "DictoVec{Float64}(\"a\"=>6.0,\"c\"=>4.0)"
143172
end
144173

174+
@testset "== and isequal with -0.0, NaN and missing" begin
    # Build a fresh named vector whose first value is `x`; the remaining values
    # and the names are the same in every comparison below.
    sample_dictovec(x) = DictoVec([x, 5.0, 4.0], ["b", "c", "a"])

    # 0.0 and -0.0 are `==` but not `isequal`
    @test sample_dictovec(0.0) == sample_dictovec(-0.0)
    @test !isequal(sample_dictovec(0.0), sample_dictovec(-0.0))

    # NaN is `isequal` to itself but not `==`
    @test sample_dictovec(NaN) != sample_dictovec(NaN)
    @test isequal(sample_dictovec(NaN), sample_dictovec(NaN))

    # `missing` propagates through `!=` but is `isequal` to itself
    @test ismissing(sample_dictovec(missing) != sample_dictovec(missing))
    @test isequal(sample_dictovec(missing), sample_dictovec(missing))
end
190+
145191
end
146192

147193
end # TestDictoVec

test/RDA.jl

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ using Test
33
using DataFrames
44
using CategoricalArrays
55
using RData
6+
using TimeZones
67

78
@testset "Loading RData files (version=$ver)" for ver in (2, 3)
89
rdata_path = joinpath(dirname(@__FILE__), "data_v$ver")
@@ -142,6 +143,40 @@ using RData
142143
@test testdf[!, "listascol2"] isa Vector{Any}
143144
@test isequal(testdf[!, "listascol2"], [[1., 2.], [3, 4], [5., 6., 7.]])
144145
end # list of vectors
146+
147+
@testset "Data frames attributes to metadata (version=$ver)" begin
    # Read the fixture of the format version under test (`rdata_path` follows
    # `ver`) instead of always loading the version 3 file through a path
    # relative to the current working directory.
    dfpath = joinpath(rdata_path, "dfattributes.rda")
    df = load(dfpath)["df"]

    # Collect metadata as `key => (value, style)` dictionaries
    tablemeta(d) = Dict(k => metadata(d, k, style=true) for k in metadatakeys(d))
    colmeta(d, col) = Dict(k => colmetadata(d, col, k, style=true)
                           for k in colmetadatakeys(d, col))

    value_labels = DictoVec([1.0, 2.0, 3.0], ["a", "b", "c"])

    # `isequal` is needed here because the row names contain `missing`
    @test isequal(tablemeta(df),
                  Dict("collectiontimes" => ([ZonedDateTime(2022, 05, 25, 22, 5, tz"UTC"),
                                              ZonedDateTime(2022, 05, 26, 22, 5, tz"UTC")],
                                             :default),
                       "comment" => ("This is a data frame", :note),
                       "row.names" => ([missing, -6], :default)))
    @test colmeta(df, :v1) ==
        Dict("label" => ("V1", :note),
             "labels" => (value_labels, :default))
    @test colmeta(df, :v2) ==
        Dict("label" => ("V2", :note),
             "labels" => (value_labels, :default),
             "na_values" => (3.0, :default))
    @test colmeta(df, :v3) ==
        Dict("label" => ("V3", :note),
             "labels" => (value_labels, :default),
             "na_range" => ([3.0, Inf], :default))
    @test colmeta(df, :v4) ==
        Dict("label" => ("V4", :note),
             "comment" => ("A comment", :note),
             "units" => ("m/s^2", :note),
             "custom" => (1, :default))

    # With `metadata=false` no attribute is imported at all
    df_nometa = load(dfpath, metadata=false)["df"]
    @test isempty(metadatakeys(df_nometa))
    for col in (:v1, :v2, :v3, :v4)
        @test isempty(colmetadatakeys(df_nometa, col))
    end
end
145180
end # for ver in ...
146181

147182
@testset "Loading AltRep-containing RData files (version=3)" begin

test/data_v2/dfattributes.rda

406 Bytes
Binary file not shown.

test/data_v3/dfattributes.rda

406 Bytes
Binary file not shown.

test/generate_rda.R

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,43 @@ saveRDS(list(as.POSIXct("2017-01-01 13:23"),
100100
file=file.path(rdata_path, "datetimes_tz.rds"), version=ver)
101101
Sys.setenv(TZ = sys_tz) # restore timezone
102102

103+
# Importing data frame attributes as defined by common packages to metadata

# Column-level attributes used by packages haven, labelled and sjlabelled
# Generating code:
# library(haven)
# v1 <- labelled(c(1, 2, 2, 3, NA, 1), label="V1", labels=c(a=1, b=2, c=3))
# v2 <- labelled_spss(c(1, 2, 2, 3, NA, 1), label="V2", labels=c(a=1, b=2, c=3),
#                     na_values=3)
# v3 <- labelled_spss(c(1, 2, 2, 3, NA, 1), label="V3", labels=c(a=1, b=2, c=3),
#                     na_range=c(3, Inf))
v1 <- structure(c(1, 2, 2, 3, NA, 1), labels=c(a=1, b=2, c=3), label="V1",
                class="numeric")
v2 <- structure(c(1, 2, 2, 3, NA, 1), labels=c(a=1, b=2, c=3), label="V2",
                na_values=3, class="numeric")
v3 <- structure(c(1, 2, 2, 3, NA, 1), labels=c(a=1, b=2, c=3), label="V3",
                na_range=c(3, Inf), class="numeric")

# Column-level attributes used by packages Hmisc, units and labelVector
# (plus `comment` from base R and some custom attributes)
# Generating code:
# library(Hmisc)
# v4 <- c(1, 2, 2, 3, NA, 1)
# label(v4) <- "V4"
# comment(v4) <- "A comment"
# units(v4) <- "m/s^2"
# attr(v4, "custom") <- 1
v4 <- structure(c(1, 2, 2, 3, NA, 1), label="V4", class="numeric",
                comment="A comment", units="m/s^2", custom=1)

# Data frame-level attributes
df <- data.frame(v1, v2, v3, v4)
comment(df) <- "This is a data frame"
attr(df, "collectiontimes") <- c(as.POSIXct("2022-05-25 22:05:00", tz="UTC"),
                                 as.POSIXct("2022-05-26 22:05:00", tz="UTC"))

# Pass `version=ver` so that each output directory gets a file in the
# serialization format it is named after, rather than R's default format.
save(df, file=file.path(rdata_path, "dfattributes.rda"), version=ver)
139+
103140
} # for (ver in ...)
104141

105142
# generate V3 format AltRep objects

0 commit comments

Comments
 (0)