Import R data frame attributes as metadata (#93)

nalimilan · web-flow · commit 880bed591a09 · 2022-10-10T08:38:42.000+02:00
Use the `metadata` function recently added to DataAPI and DataFrames
to import R `data.frame` attributes and set them as `DataFrame` metadata.
R stores per-column attributes in vector objects, while DataFrames.jl
stores them in the `DataFrame` object, as there is no generic mechanism
to attach metadata to an `AbstractVector` object.

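The `row.names` attribute is currently imported like any other attribute
(the new test expects it in the global metadata), although it is not really
appropriate to store it as global metadata given that it will get out of sync
after subsetting rows. We could skip it or provide a way to turn row names
into a column instead.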

Also add methods to check equality between two `DictoVec` objects
as these are useful for tests (haven commonly sets named numeric vectors
to store value labels so this case deserves testing).
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -16,7 +16,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.0'
+          - '1.6'
           - '1' # automatically expands to the latest stable 1.x release of Julia
           - 'nightly'
         os:
@@ -27,7 +27,7 @@ jobs:
           - x86
         include: # macos doesn't support x86
           - os: macos-latest
-            version: '1.0'
+            version: '1.6'
             arch: x64
           - os: macos-latest
             version: '1'
diff --git a/Project.toml b/Project.toml
@@ -1,10 +1,11 @@
 name = "RData"
 uuid = "df47a6cb-8c03-5eed-afd8-b6050d6c41da"
-version = "0.8.3"
+version = "1.0.0"
 
 [deps]
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
 CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
+DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
@@ -15,11 +16,12 @@ Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
 [compat]
 CategoricalArrays = "0.8, 0.9, 0.10"
 CodecZlib = "0.4, 0.5, 0.6, 0.7"
-DataFrames = "0.21, 0.22, 1.0"
+DataAPI = "1.12.0"
+DataFrames = "1.4.0"
 FileIO = "1.6.5"
 Requires = "1.0.0"
 TimeZones = "0.7, 0.8, 0.9, 0.10, 1.0"
-julia = "1"
+julia = "1.6"
 
 [extras]
 CodecBzip2 = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd"
diff --git a/src/DictoVec.jl b/src/DictoVec.jl
@@ -26,6 +26,15 @@ struct DictoVec{T}
     end
 end
 
+# `==`/`isequal` compare the name index and the data; `hash` combines the same fields
+Base.:(==)(a::DictoVec, b::DictoVec) = a.name2index == b.name2index && a.data == b.data
+Base.isequal(a::DictoVec, b::DictoVec) =
+    isequal(a.name2index, b.name2index) && isequal(a.data, b.data)
+
+const hash_dictovec_seed = UInt === UInt64 ? 0xe00ac4bbcfc2fa07 : 0x57f3f900
+Base.hash(dv::DictoVec, h::UInt) =
+    hash(dv.name2index, hash(dv.data, h + hash_dictovec_seed))
+
 Base.eltype(::Type{DictoVec{T}}) where T = T
 Base.eltype(dict::DictoVec) = eltype(typeof(dict))
 Base.length(dict::DictoVec) = length(dict.data)
diff --git a/src/RData.jl b/src/RData.jl
@@ -1,6 +1,6 @@
 module RData
 
-using DataFrames, CategoricalArrays, FileIO, TimeZones, Unicode
+using DataAPI, DataFrames, CategoricalArrays, FileIO, TimeZones, Unicode
 
 export
     sexp2julia,
@@ -44,6 +44,8 @@ end
 ## supported `kwoptions`:
 ## convert::Bool (true by default) for converting R objects into Julia equivalents,
 ##               otherwise load() returns R internal representation (ROBJ-derived objects)
+## metadata::Bool (true by default) for importing R attributes into metadata
+##                (only has an effect for data frames currently)
 ## TODO option for disabling names checking (e.g. column names)
 ##
 ##############################################################################
@@ -57,6 +59,7 @@ function fileio_load(s::Stream{format"RData"}; kwoptions...)
     @debug "minimal R version: $(ctx.Rmin)"
 
     convert2julia = get(ctx.kwdict, :convert, true)
+    metadata = get(ctx.kwdict, :metadata, true)
 
     # top level read -- must be a paired list of objects
     # we read it here to be able to convert to julia objects inplace
@@ -70,7 +73,7 @@ function fileio_load(s::Stream{format"RData"}; kwoptions...)
         tag = readitem(ctx)
         obj_name = convert(RString, isa(tag, RSymbol) ? tag.displayname : "\0")
         obj = readitem(ctx)
-        setindex!(res, (convert2julia ? sexp2julia(obj) : obj), obj_name)
+        setindex!(res, (convert2julia ? sexp2julia(obj, metadata=metadata) : obj), obj_name)
         fl = readuint32(ctx.io)
         readattrs(ctx, fl)
     end
@@ -84,7 +87,8 @@ function fileio_load(s::Stream{format"RDataSingle"}; kwoptions...)
     ctx = RDAContext(rdaio(io, chomp(readline(io))); kwoptions...)
     @assert ctx.fmtver == 2 || ctx.fmtver == 3  # supported format versions
     convert2julia = get(ctx.kwdict, :convert, true)
-    return convert2julia ? sexp2julia(readitem(ctx)) : readitem(ctx)
+    metadata = get(ctx.kwdict, :metadata, true)
+    return convert2julia ? sexp2julia(readitem(ctx), metadata=metadata) : readitem(ctx)
 end
 
 function fileio_load(f::Union{File{format"RData"}, File{format"RDataSingle"}};
diff --git a/src/convert.jl b/src/convert.jl
@@ -194,12 +194,12 @@ function jlvec(::Type{T}, rv::RVEC, force_missing::Bool=true) where T
     end
 end
 
-function sexp2julia(rex::RSEXPREC)
+function sexp2julia(rex::RSEXPREC; metadata::Bool=true)
     @warn "Conversion of $(typeof(rex)) to Julia is not implemented" maxlog=1
     return nothing
 end
 
-function sexp2julia(rv::RVEC)
+function sexp2julia(rv::RVEC; metadata::Bool=true)
     # TODO dimnames?
     # FIXME add force_missing option to control whether always convert to Union{T, Missing}
     jv = jlvec(rv, false)
@@ -222,22 +222,48 @@ function sexp2julia(rv::RVEC)
     end
 end
 
-function sexp2julia(rl::RList)
+function sexp2julia(rl::RList; metadata::Bool=true)
     if isdataframe(rl)
         # FIXME add force_missing option to control whether always convert to Union{T, Missing}
-        DataFrame(Any[isa(col, RAltRep) ? sexp2julia(col) : jlvec(col, false) for col in rl.data],
-                  identifier.(names(rl)), makeunique=true)
+        cols = Any[isa(col, RAltRep) ? sexp2julia(col) : jlvec(col, false) for col in rl.data]
+        nms = identifier.(names(rl))
+        obj = DataFrame(cols, nms, makeunique=true)
+        if metadata  # import R attributes as table-level, then column-level, metadata
+            for (key, val) in pairs(rl.attr)
+                # skip already processed system attributes
+                if key in ("names", "class")
+                    continue
+                elseif key in ("comment", "label")
+                    metadata!(obj, key, sexp2julia(val), style=:note)
+                else
+                    metadata!(obj, key, sexp2julia(val), style=:default)
+                end
+            end
+            for (i, col) in enumerate(rl.data)  # by position: makeunique may rename columns
+                for (key, val) in pairs(col.attr)
+                    # skip already processed system attributes
+                    if key in ("names", "class", "levels")
+                        continue
+                    elseif key in ("comment", "label", "units")
+                        colmetadata!(obj, i, key, sexp2julia(val), style=:note)
+                    else
+                        colmetadata!(obj, i, key, sexp2julia(val), style=:default)
+                    end
+                end
+            end
+        end
     elseif hasnames(rl)
-        DictoVec(jlvec(Any, rl), names(rl))
+        obj = DictoVec(jlvec(Any, rl), names(rl))
     else
         # FIXME return DictoVec if forceDictoVec is on
-        jlvec(Any, rl)
+        obj = jlvec(Any, rl)
     end
+    return obj
 end
 
-function sexp2julia(ar::RAltRep)
+function sexp2julia(ar::RAltRep; metadata::Bool=true)
     if iswrapped(ar)
-        return sexp2julia(unwrap(ar))
+        return sexp2julia(unwrap(ar), metadata=metadata)
     elseif iscompactseq(ar)
         return jlrange(ar)
     else
diff --git a/test/DictoVec.jl b/test/DictoVec.jl
@@ -28,6 +28,13 @@ end
     @test_throws KeyError dv["a"]
     @test_throws KeyError dv[:a]
 
+    @test dv == DictoVec(Symbol[]) == DictoVec(Int[])
+    @test isequal(dv, DictoVec(Symbol[]))
+    @test isequal(dv, DictoVec(Int[]))
+    @test dv != DictoVec([:a], ["a"])
+    @test !isequal(dv, DictoVec([:a], ["a"]))
+    @test hash(dv) == hash(DictoVec(Symbol[])) == hash(DictoVec(Int[]))
+
     @test get(dv, 1, :x) == :x
     @test get(() -> :y, dv, 1) == :y
     @test get(dv, "a", :x) == :x
@@ -88,6 +95,17 @@ end
     @test collect(keys(dv)) == RData.RString[]
     @test values(dv) == [2.0, 5.0, 4.0]
 
+    @test dv == DictoVec([2.0, 5.0, 4.0])
+    @test dv == DictoVec([2, 5, 4])
+    @test isequal(dv, DictoVec([2.0, 5.0, 4.0]))
+    @test dv != DictoVec([3.0, 5.0, 4.0])
+    @test !isequal(dv, DictoVec([3.0, 5.0, 4.0]))
+    @test dv != DictoVec([2.0, 5.0, 4.0], ["b", "c", "a"])
+    @test !isequal(dv, DictoVec([2.0, 5.0, 4.0], ["b", "c", "a"]))
+    @test hash(dv) ==
+        hash(DictoVec([2.0, 5.0, 4.0])) ==
+        hash(DictoVec([2, 5, 4]))
+
     @test_throws BoundsError dv[0]
     @test_throws BoundsError dv[4]
     @test dv[1] == 2.0
@@ -121,6 +139,17 @@ end
     @test values(dv) == [2.0, 5.0, 4.0]
     @test show2string(dv) == "DictoVec{Float64}(\"a\"=>2.0,\"b\"=>5.0,\"c\"=>4.0)"
 
+    @test dv == DictoVec([2.0, 5.0, 4.0], ["a", "b", "c"])
+    @test dv == DictoVec([2, 5, 4], ["a", "b", "c"])
+    @test isequal(dv, DictoVec([2.0, 5.0, 4.0], ["a", "b", "c"]))
+    @test dv != DictoVec([3.0, 5.0, 4.0], ["a", "b", "c"])
+    @test !isequal(dv, DictoVec([3.0, 5.0, 4.0], ["a", "b", "c"]))
+    @test dv != DictoVec([2.0, 5.0, 4.0], ["b", "c", "a"])
+    @test !isequal(dv, DictoVec([2.0, 5.0, 4.0], ["b", "c", "a"]))
+    @test hash(dv) ==
+        hash(DictoVec([2.0, 5.0, 4.0], ["a", "b", "c"])) ==
+        hash(DictoVec([2, 5, 4], ["a", "b", "c"]))
+
     @test dv[1] === 2.0
     @test dv["a"] === 2.0
     @test dv[[1, 3]] == [2.0, 4.0]
@@ -142,6 +171,23 @@ end
     @test show2string(dv) == "DictoVec{Float64}(\"a\"=>6.0,\"c\"=>4.0)"
 end
 
+@testset "== and isequal with -0.0, NaN and missing" begin
+    nms = ["b", "c", "a"]
+    dvwith(x) = DictoVec([x, 5.0, 4.0], nms)
+
+    # 0.0 and -0.0 compare equal with == but are distinguished by isequal
+    @test dvwith(0.0) == dvwith(-0.0)
+    @test !isequal(dvwith(0.0), dvwith(-0.0))
+
+    # NaN differs from itself under == but isequal treats it as equal
+    @test dvwith(NaN) != dvwith(NaN)
+    @test isequal(dvwith(NaN), dvwith(NaN))
+
+    # != propagates missing, whereas isequal treats missing values as equal
+    @test ismissing(dvwith(missing) != dvwith(missing))
+    @test isequal(dvwith(missing), dvwith(missing))
+end
+
 end
 
 end # TestDictoVec
diff --git a/test/RDA.jl b/test/RDA.jl
@@ -3,6 +3,7 @@ using Test
 using DataFrames
 using CategoricalArrays
 using RData
+using TimeZones
 
 @testset "Loading RData files (version=$ver)" for ver in (2, 3)
     rdata_path = joinpath(dirname(@__FILE__), "data_v$ver")
@@ -142,6 +143,40 @@ using RData
         @test testdf[!, "listascol2"] isa Vector{Any}
         @test isequal(testdf[!, "listascol2"], [[1., 2.], [3, 4], [5., 6., 7.]])
     end # list of vectors
+
+    @testset "Data frames attributes to metadata (version=3)" begin
+        dfattr_path = joinpath(dirname(@__FILE__), "data_v3", "dfattributes.rda")
+        df = load(dfattr_path)["df"]  # path is anchored to this file, not the cwd
+        @test isequal(Dict(k => metadata(df, k, style=true) for k in metadatakeys(df)),
+                    Dict("collectiontimes" => ([ZonedDateTime(2022, 05, 25, 22, 5, tz"UTC"),
+                                                ZonedDateTime(2022, 05, 26, 22, 5, tz"UTC")],
+                                                :default),
+                        "comment" => ("This is a data frame", :note),
+                        "row.names" => ([missing, -6], :default)))
+        @test Dict(k => colmetadata(df, :v1, k, style=true) for k in colmetadatakeys(df, :v1)) ==
+            Dict("label" => ("V1", :note),
+                 "labels" => (DictoVec([1.0, 2.0, 3.0], ["a", "b", "c"]), :default))
+        @test Dict(k => colmetadata(df, :v2, k, style=true) for k in colmetadatakeys(df, :v2)) ==
+            Dict("label" => ("V2", :note),
+                 "labels" => (DictoVec([1.0, 2.0, 3.0], ["a", "b", "c"]), :default),
+                 "na_values" => (3.0, :default))
+        @test Dict(k => colmetadata(df, :v3, k, style=true) for k in colmetadatakeys(df, :v3)) ==
+            Dict("label" => ("V3", :note),
+                 "labels" => (DictoVec([1.0, 2.0, 3.0], ["a", "b", "c"]), :default),
+                 "na_range" => ([3.0, Inf], :default))
+        @test Dict(k => colmetadata(df, :v4, k, style=true) for k in colmetadatakeys(df, :v4)) ==
+            Dict("label" => ("V4", :note),
+                 "comment" => ("A comment", :note),
+                 "units" => ("m/s^2", :note),
+                 "custom" => (1, :default))
+        # with metadata=false no attribute may end up in the metadata
+        df = load(dfattr_path, metadata=false)["df"]
+        @test isempty(metadatakeys(df))
+        @test isempty(colmetadatakeys(df, :v1))
+        @test isempty(colmetadatakeys(df, :v2))
+        @test isempty(colmetadatakeys(df, :v3))
+        @test isempty(colmetadatakeys(df, :v4))
+    end
 end # for ver in ...
 
 @testset "Loading AltRep-containing RData files (version=3)" begin
diff --git a/test/data_v2/dfattributes.rda b/test/data_v2/dfattributes.rda
diff --git a/test/data_v3/dfattributes.rda b/test/data_v3/dfattributes.rda
diff --git a/test/generate_rda.R b/test/generate_rda.R
@@ -100,6 +100,43 @@ saveRDS(list(as.POSIXct("2017-01-01 13:23"),
         file=file.path(rdata_path, "datetimes_tz.rds"), version=ver)
 Sys.setenv(TZ = sys_tz) # restore timezone
 
+# Importing data frame attributes as defined by common packages to metadata
+
+# Column-level attributes used by packages haven, labelled and sjlabelled
+# Generating code:
+# library(haven)
+# v1 <- labelled(c(1, 2, 2, 3, NA, 1), label="V1", labels=c(a=1, b=2, c=3))
+# v2 <- labelled_spss(c(1, 2, 2, 3, NA, 1), label="V2", labels=c(a=1, b=2, c=3),
+#                     na_values=3)
+# v3 <- labelled_spss(c(1, 2, 2, 3, NA, 1), label="V3", labels=c(a=1, b=2, c=3),
+#                     na_range=c(3, Inf))
+v1 <- structure(c(1, 2, 2, 3, NA, 1), labels=c(a=1, b=2, c=3), label="V1",
+                class="numeric")
+v2 <- structure(c(1, 2, 2, 3, NA, 1), labels=c(a=1, b=2, c=3), label="V2",
+                na_values=3, class="numeric")
+v3 <- structure(c(1, 2, 2, 3, NA, 1), labels=c(a=1, b=2, c=3), label="V3",
+                na_range=c(3, Inf), class="numeric")
+
+# Column-level attributes used by packages Hmisc, units and labelVector
+# (plus `comment` from base R and some custom attributes)
+# Generating code:
+# library(Hmisc)
+# v4 <- c(1, 2, 2, 3, NA, 1)
+# label(v4) <- "V4"
+# comment(v4) <- "A comment"
+# units(v4) <- "m/s^2"
+# attr(v4, "custom") <- 1
+v4 <- structure(c(1, 2, 2, 3, NA, 1), label="V4", class="numeric",
+                comment="A comment", units="m/s^2", custom=1)
+
+# Data frame-level attributes
+df <- data.frame(v1, v2, v3, v4)
+comment(df) <- "This is a data frame"
+attr(df, "collectiontimes") <- c(as.POSIXct("2022-05-25 22:05:00", tz="UTC"),
+                                 as.POSIXct("2022-05-26 22:05:00", tz="UTC"))
+# Save with version=ver, like the saveRDS() call above, instead of R's default format
+save(df, file=file.path(rdata_path, "dfattributes.rda"), version=ver)
+
 } # for (ver in ...)
 
 # generate V3 format AltRep objects