17
17
18
18
//! [`VariantArray`] implementation
19
19
20
- use arrow:: array:: { Array , ArrayData , ArrayRef , AsArray , StructArray } ;
20
+ use arrow:: array:: { Array , ArrayData , ArrayRef , AsArray , BinaryViewArray , StructArray } ;
21
21
use arrow:: buffer:: NullBuffer ;
22
+ use arrow:: datatypes:: Int32Type ;
22
23
use arrow_schema:: { ArrowError , DataType } ;
23
24
use parquet_variant:: Variant ;
24
25
use std:: any:: Any ;
@@ -44,27 +45,90 @@ use std::sync::Arc;
44
45
/// [document]: https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?usp=sharing
45
46
#[ derive( Debug ) ]
46
47
pub struct VariantArray {
47
- /// StructArray of up to three fields:
48
- ///
49
- /// 1. A required field named `metadata` which is binary, large_binary, or
50
- /// binary_view
51
- ///
52
- /// 2. An optional field named `value` that is binary, large_binary, or
53
- /// binary_view
54
- ///
55
- /// 3. An optional field named `typed_value` which can be any primitive type
56
- /// or be a list, large_list, list_view or struct
57
- ///
58
- /// NOTE: It is also permissible for the metadata field to be
59
- /// Dictionary-Encoded, preferably (but not required) with an index type of
60
- /// int8.
48
+ /// Reference to the underlying StructArray
61
49
inner : StructArray ,
62
50
63
- /// Reference to the metadata column of inner
64
- metadata_ref : ArrayRef ,
51
+ /// The shredding state of this variant array
52
+ shredding_state : ShreddingState ,
53
+ }
54
+
55
+ /// Variant arrays can be shredded in one of three states, encoded here
56
+ #[ derive( Debug ) ]
57
+ pub enum ShreddingState {
58
+ /// This variant has no typed_value field
59
+ Unshredded {
60
+ metadata : BinaryViewArray ,
61
+ value : BinaryViewArray ,
62
+ } ,
63
+ /// This variant has a typed_value field and no value field
64
+ /// meaning it is fully shredded (aka the value is stored in typed_value)
65
+ FullyShredded {
66
+ metadata : BinaryViewArray ,
67
+ typed_value : ArrayRef ,
68
+ } ,
69
+ /// This variant has both a value field and a typed_value field
70
+ /// meaning it is partially shredded: first the typed_value is used, and
71
+ /// if that is null, the value field is used.
72
+ PartiallyShredded {
73
+ metadata : BinaryViewArray ,
74
+ value : BinaryViewArray ,
75
+ typed_value : ArrayRef ,
76
+ } ,
77
+ }
78
+
79
+ impl ShreddingState {
80
+ /// Return a reference to the metadata field
81
+ pub fn metadata_field ( & self ) -> & BinaryViewArray {
82
+ match self {
83
+ ShreddingState :: Unshredded { metadata, .. } => metadata,
84
+ ShreddingState :: FullyShredded { metadata, .. } => metadata,
85
+ ShreddingState :: PartiallyShredded { metadata, .. } => metadata,
86
+ }
87
+ }
88
+
89
+ /// Return a reference to the value field, if present
90
+ pub fn value_field ( & self ) -> Option < & BinaryViewArray > {
91
+ match self {
92
+ ShreddingState :: Unshredded { value, .. } => Some ( value) ,
93
+ ShreddingState :: FullyShredded { .. } => None ,
94
+ ShreddingState :: PartiallyShredded { value, .. } => Some ( value) ,
95
+ }
96
+ }
97
+
98
+ /// Return a reference to the typed_value field, if present
99
+ pub fn typed_value_field ( & self ) -> Option < & ArrayRef > {
100
+ match self {
101
+ ShreddingState :: Unshredded { .. } => None ,
102
+ ShreddingState :: FullyShredded { typed_value, .. } => Some ( typed_value) ,
103
+ ShreddingState :: PartiallyShredded { typed_value, .. } => Some ( typed_value) ,
104
+ }
105
+ }
65
106
66
- /// Reference to the value column of inner
67
- value_ref : ArrayRef ,
107
+ /// Slice all the underlying arrays
108
+ pub fn slice ( & self , offset : usize , length : usize ) -> Self {
109
+ match self {
110
+ ShreddingState :: Unshredded { metadata, value } => ShreddingState :: Unshredded {
111
+ metadata : metadata. slice ( offset, length) ,
112
+ value : value. slice ( offset, length) ,
113
+ } ,
114
+ ShreddingState :: FullyShredded {
115
+ metadata,
116
+ typed_value,
117
+ } => ShreddingState :: FullyShredded {
118
+ metadata : metadata. slice ( offset, length) ,
119
+ typed_value : typed_value. slice ( offset, length) ,
120
+ } ,
121
+ ShreddingState :: PartiallyShredded {
122
+ metadata,
123
+ value,
124
+ typed_value,
125
+ } => ShreddingState :: PartiallyShredded {
126
+ metadata : metadata. slice ( offset, length) ,
127
+ value : value. slice ( offset, length) ,
128
+ typed_value : typed_value. slice ( offset, length) ,
129
+ } ,
130
+ }
131
+ }
68
132
}
69
133
70
134
impl VariantArray {
@@ -79,12 +143,22 @@ impl VariantArray {
79
143
/// # Errors:
80
144
/// - If the `StructArray` does not contain the required fields
81
145
///
82
- /// # Current support
83
- /// This structure does not (yet) support the full Arrow Variant Array specification.
146
+ /// # Requirements of the `StructArray`
147
+ ///
148
+ /// 1. A required field named `metadata` which is binary, large_binary, or
149
+ /// binary_view
84
150
///
85
- /// Only `StructArrays` with `metadata` and `value` fields that are
86
- /// [`BinaryViewArray`] are supported. Shredded values are not currently supported
87
- /// nor are using types other than `BinaryViewArray`
151
+ /// 2. An optional field named `value` that is binary, large_binary, or
152
+ /// binary_view
153
+ ///
154
+ /// 3. An optional field named `typed_value` which can be any primitive type
155
+ /// or be a list, large_list, list_view or struct
156
+ ///
157
+ /// NOTE: It is also permissible for the metadata field to be
158
+ /// Dictionary-Encoded, preferably (but not required) with an index type of
159
+ /// int8.
160
+ ///
161
+ /// Currently, the `metadata` and `value` fields must be [`BinaryViewArray`]s.
88
162
///
89
163
/// [`BinaryViewArray`]: arrow::array::BinaryViewArray
90
164
pub fn try_new ( inner : ArrayRef ) -> Result < Self , ArrowError > {
@@ -93,35 +167,64 @@ impl VariantArray {
93
167
"Invalid VariantArray: requires StructArray as input" . to_string ( ) ,
94
168
) ) ;
95
169
} ;
96
- // Ensure the StructArray has a metadata field of BinaryView
97
170
98
- let Some ( metadata_field) = VariantArray :: find_metadata_field ( inner) else {
171
+ // Note the specification allows for any order so we must search by name
172
+
173
+ // Ensure the StructArray has a metadata field of BinaryView
174
+ let Some ( metadata_field) = inner. column_by_name ( "metadata" ) else {
99
175
return Err ( ArrowError :: InvalidArgumentError (
100
176
"Invalid VariantArray: StructArray must contain a 'metadata' field" . to_string ( ) ,
101
177
) ) ;
102
178
} ;
103
- if metadata_field . data_type ( ) != & DataType :: BinaryView {
179
+ let Some ( metadata ) = metadata_field . as_binary_view_opt ( ) else {
104
180
return Err ( ArrowError :: NotYetImplemented ( format ! (
105
181
"VariantArray 'metadata' field must be BinaryView, got {}" ,
106
182
metadata_field. data_type( )
107
183
) ) ) ;
108
- }
109
- let Some ( value_field) = VariantArray :: find_value_field ( inner) else {
110
- return Err ( ArrowError :: InvalidArgumentError (
111
- "Invalid VariantArray: StructArray must contain a 'value' field" . to_string ( ) ,
112
- ) ) ;
113
184
} ;
114
- if value_field. data_type ( ) != & DataType :: BinaryView {
115
- return Err ( ArrowError :: NotYetImplemented ( format ! (
116
- "VariantArray 'value' field must be BinaryView, got {}" ,
117
- value_field. data_type( )
118
- ) ) ) ;
119
- }
185
+
186
+ // Find the value field, if present
187
+ let value_field = inner. column_by_name ( "value" ) ;
188
+ let value = value_field
189
+ . map ( |v| match v. as_binary_view_opt ( ) {
190
+ Some ( bv) => Ok ( bv) ,
191
+ None => Err ( ArrowError :: NotYetImplemented ( format ! (
192
+ "VariantArray 'value' field must be BinaryView, got {}" ,
193
+ v. data_type( )
194
+ ) ) ) ,
195
+ } )
196
+ . transpose ( ) ?;
197
+
198
+ // Find the typed_value field, if present
199
+ let typed_value = inner. column_by_name ( "typed_value" ) ;
200
+
201
+ // Note these clones are cheap, they just bump the ref count
202
+ let inner = inner. clone ( ) ;
203
+ let metadata = metadata. clone ( ) ;
204
+ let value = value. cloned ( ) ;
205
+ let typed_value = typed_value. cloned ( ) ;
206
+
207
+ let shredding_state = match ( metadata, value, typed_value) {
208
+ ( metadata, Some ( value) , Some ( typed_value) ) => ShreddingState :: PartiallyShredded {
209
+ metadata,
210
+ value,
211
+ typed_value,
212
+ } ,
213
+ ( metadata, Some ( value) , None ) => ShreddingState :: Unshredded { metadata, value } ,
214
+ ( metadata, None , Some ( typed_value) ) => ShreddingState :: FullyShredded {
215
+ metadata,
216
+ typed_value,
217
+ } ,
218
+ ( _metadata_field, None , None ) => {
219
+ return Err ( ArrowError :: InvalidArgumentError ( String :: from (
220
+ "VariantArray has neither value nor typed_value field" ,
221
+ ) ) ) ;
222
+ }
223
+ } ;
120
224
121
225
Ok ( Self {
122
- inner : inner. clone ( ) ,
123
- metadata_ref : metadata_field,
124
- value_ref : value_field,
226
+ inner,
227
+ shredding_state,
125
228
} )
126
229
}
127
230
@@ -135,36 +238,87 @@ impl VariantArray {
135
238
self . inner
136
239
}
137
240
241
+ /// Return the shredding state of this `VariantArray`
242
+ pub fn shredding_state ( & self ) -> & ShreddingState {
243
+ & self . shredding_state
244
+ }
245
+
138
246
/// Return the [`Variant`] instance stored at the given row
139
247
///
140
- /// Panics if the index is out of bounds.
248
+ /// Consistent with other Arrow array types, this API requires you to
249
+ /// check for nulls first using [`Self::is_valid`].
250
+ ///
251
+ /// # Panics
252
+ /// * if the index is out of bounds
253
+ /// * if the array value is null
254
+ ///
255
+ /// If this is a fully shredded variant but has no value at the shredded location, it
256
+ /// will return [`Variant::Null`].
257
+ ///
258
+ ///
259
+ /// # Performance Note
260
+ ///
261
+ /// This is certainly not the most efficient way to access values in a
262
+ /// `VariantArray`, but it is useful for testing and debugging.
141
263
///
142
264
/// Note: Does not do deep validation of the [`Variant`], so it is up to the
143
265
/// caller to ensure that the metadata and value were constructed correctly.
144
266
pub fn value ( & self , index : usize ) -> Variant {
145
- let metadata = self . metadata_field ( ) . as_binary_view ( ) . value ( index) ;
146
- let value = self . value_field ( ) . as_binary_view ( ) . value ( index) ;
147
- Variant :: new ( metadata, value)
267
+ match & self . shredding_state {
268
+ ShreddingState :: Unshredded { metadata, value } => {
269
+ Variant :: new ( metadata. value ( index) , value. value ( index) )
270
+ }
271
+ ShreddingState :: FullyShredded {
272
+ metadata : _,
273
+ typed_value,
274
+ } => {
275
+ if typed_value. is_null ( index) {
276
+ Variant :: Null
277
+ } else {
278
+ typed_value_to_variant ( typed_value, index)
279
+ }
280
+ }
281
+ ShreddingState :: PartiallyShredded {
282
+ metadata,
283
+ value,
284
+ typed_value,
285
+ } => {
286
+ if typed_value. is_null ( index) {
287
+ Variant :: new ( metadata. value ( index) , value. value ( index) )
288
+ } else {
289
+ typed_value_to_variant ( typed_value, index)
290
+ }
291
+ }
292
+ }
148
293
}
149
294
150
- fn find_metadata_field ( array : & StructArray ) -> Option < ArrayRef > {
151
- array. column_by_name ( "metadata" ) . cloned ( )
295
+ /// Return a reference to the metadata field of the [`StructArray`]
296
+ pub fn metadata_field ( & self ) -> & BinaryViewArray {
297
+ self . shredding_state . metadata_field ( )
152
298
}
153
299
154
- fn find_value_field ( array : & StructArray ) -> Option < ArrayRef > {
155
- array. column_by_name ( "value" ) . cloned ( )
300
+ /// Return a reference to the value field of the `StructArray`
301
+ pub fn value_field ( & self ) -> Option < & BinaryViewArray > {
302
+ self . shredding_state . value_field ( )
156
303
}
157
304
158
- /// Return a reference to the metadata field of the [`StructArray`]
159
- pub fn metadata_field ( & self ) -> & ArrayRef {
160
- // spec says fields order is not guaranteed, so we search by name
161
- & self . metadata_ref
305
+ /// Return a reference to the typed_value field of the `StructArray`, if present
306
+ pub fn typed_value_field ( & self ) -> Option < & ArrayRef > {
307
+ self . shredding_state . typed_value_field ( )
162
308
}
309
+ }
163
310
164
- /// Return a reference to the value field of the `StructArray`
165
- pub fn value_field ( & self ) -> & ArrayRef {
166
- // spec says fields order is not guaranteed, so we search by name
167
- & self . value_ref
311
+ /// Returns the non-null `typed_value` element at `index` as a [`Variant`] (only `Int32` is handled so far)
312
+ fn typed_value_to_variant ( typed_value : & ArrayRef , index : usize ) -> Variant {
313
+ match typed_value. data_type ( ) {
314
+ DataType :: Int32 => {
315
+ let typed_value = typed_value. as_primitive :: < Int32Type > ( ) ;
316
+ Variant :: from ( typed_value. value ( index) )
317
+ }
318
+ // todo other types here
319
+ _ => {
320
+ todo ! ( ) ; // Unsupported typed_value type
321
+ }
168
322
}
169
323
}
170
324
@@ -186,13 +340,11 @@ impl Array for VariantArray {
186
340
}
187
341
188
342
fn slice ( & self , offset : usize , length : usize ) -> ArrayRef {
189
- let slice = self . inner . slice ( offset, length) ;
190
- let met = self . metadata_ref . slice ( offset, length) ;
191
- let val = self . value_ref . slice ( offset, length) ;
343
+ let inner = self . inner . slice ( offset, length) ;
344
+ let shredding_state = self . shredding_state . slice ( offset, length) ;
192
345
Arc :: new ( Self {
193
- inner : slice,
194
- metadata_ref : met,
195
- value_ref : val,
346
+ inner,
347
+ shredding_state,
196
348
} )
197
349
}
198
350
@@ -258,7 +410,7 @@ mod test {
258
410
let err = VariantArray :: try_new ( Arc :: new ( array) ) ;
259
411
assert_eq ! (
260
412
err. unwrap_err( ) . to_string( ) ,
261
- "Invalid argument error: Invalid VariantArray: StructArray must contain a 'value' field"
413
+ "Invalid argument error: VariantArray has neither value nor typed_value field"
262
414
) ;
263
415
}
264
416
0 commit comments