Skip to content

Commit 3ba9fe0

Browse files
committed
[Variant] Strawman / infrastructure for variant_get of shredded values
1 parent a535d3b commit 3ba9fe0

File tree

9 files changed

+1051
-249
lines changed

9 files changed

+1051
-249
lines changed

parquet-variant-compute/src/from_json.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result<VariantArray, Ar
5252
#[cfg(test)]
5353
mod test {
5454
use crate::batch_json_string_to_variant;
55-
use arrow::array::{Array, ArrayRef, AsArray, StringArray};
55+
use arrow::array::{Array, ArrayRef, StringArray};
5656
use arrow_schema::ArrowError;
5757
use parquet_variant::{Variant, VariantBuilder};
5858
use std::sync::Arc;
@@ -69,8 +69,8 @@ mod test {
6969
let array_ref: ArrayRef = Arc::new(input);
7070
let variant_array = batch_json_string_to_variant(&array_ref).unwrap();
7171

72-
let metadata_array = variant_array.metadata_field().as_binary_view();
73-
let value_array = variant_array.value_field().as_binary_view();
72+
let metadata_array = variant_array.metadata_field();
73+
let value_array = variant_array.value_field().expect("value field");
7474

7575
// Compare row 0
7676
assert!(!variant_array.is_null(0));

parquet-variant-compute/src/variant_array.rs

Lines changed: 216 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,9 @@
1717

1818
//! [`VariantArray`] implementation
1919
20-
use arrow::array::{Array, ArrayData, ArrayRef, AsArray, StructArray};
20+
use arrow::array::{Array, ArrayData, ArrayRef, AsArray, BinaryViewArray, StructArray};
2121
use arrow::buffer::NullBuffer;
22+
use arrow::datatypes::Int32Type;
2223
use arrow_schema::{ArrowError, DataType};
2324
use parquet_variant::Variant;
2425
use std::any::Any;
@@ -44,27 +45,90 @@ use std::sync::Arc;
4445
/// [document]: https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?usp=sharing
4546
#[derive(Debug)]
4647
pub struct VariantArray {
47-
/// StructArray of up to three fields:
48-
///
49-
/// 1. A required field named `metadata` which is binary, large_binary, or
50-
/// binary_view
51-
///
52-
/// 2. An optional field named `value` that is binary, large_binary, or
53-
/// binary_view
54-
///
55-
/// 3. An optional field named `typed_value` which can be any primitive type
56-
/// or be a list, large_list, list_view or struct
57-
///
58-
/// NOTE: It is also permissible for the metadata field to be
59-
/// Dictionary-Encoded, preferably (but not required) with an index type of
60-
/// int8.
48+
/// Reference to the underlying StructArray
6149
inner: StructArray,
6250

63-
/// Reference to the metadata column of inner
64-
metadata_ref: ArrayRef,
51+
/// How this array is shredded, i.e. which of the `value` / `typed_value` fields are present
52+
shredding_state: ShreddingState,
53+
}
54+
55+
/// Variant arrays can be shredded in one of three states, encoded here
56+
#[derive(Debug)]
57+
pub enum ShreddingState {
58+
/// This variant has no typed_value field
59+
Unshredded {
60+
metadata: BinaryViewArray,
61+
value: BinaryViewArray,
62+
},
63+
/// This variant has a typed_value field and no value field
64+
/// meaning it is fully shredded (aka the value is stored in typed_value)
65+
FullyShredded {
66+
metadata: BinaryViewArray,
67+
typed_value: ArrayRef,
68+
},
69+
/// This variant has both a value field and a typed_value field
70+
/// meaning it is partially shredded: first the typed_value is used, and
71+
/// if that is null, the value field is used.
72+
PartiallyShredded {
73+
metadata: BinaryViewArray,
74+
value: BinaryViewArray,
75+
typed_value: ArrayRef,
76+
},
77+
}
78+
79+
impl ShreddingState {
80+
/// Return a reference to the metadata field
81+
pub fn metadata_field(&self) -> &BinaryViewArray {
82+
match self {
83+
ShreddingState::Unshredded { metadata, .. } => metadata,
84+
ShreddingState::FullyShredded { metadata, .. } => metadata,
85+
ShreddingState::PartiallyShredded { metadata, .. } => metadata,
86+
}
87+
}
88+
89+
/// Return a reference to the value field, if present
90+
pub fn value_field(&self) -> Option<&BinaryViewArray> {
91+
match self {
92+
ShreddingState::Unshredded { value, .. } => Some(value),
93+
ShreddingState::FullyShredded { .. } => None,
94+
ShreddingState::PartiallyShredded { value, .. } => Some(value),
95+
}
96+
}
97+
98+
/// Return a reference to the typed_value field, if present
99+
pub fn typed_value_field(&self) -> Option<&ArrayRef> {
100+
match self {
101+
ShreddingState::Unshredded { .. } => None,
102+
ShreddingState::FullyShredded { typed_value, .. } => Some(typed_value),
103+
ShreddingState::PartiallyShredded { typed_value, .. } => Some(typed_value),
104+
}
105+
}
65106

66-
/// Reference to the value column of inner
67-
value_ref: ArrayRef,
107+
/// Slice all the underlying arrays
108+
pub fn slice(&self, offset: usize, length: usize) -> Self {
109+
match self {
110+
ShreddingState::Unshredded { metadata, value } => ShreddingState::Unshredded {
111+
metadata: metadata.slice(offset, length),
112+
value: value.slice(offset, length),
113+
},
114+
ShreddingState::FullyShredded {
115+
metadata,
116+
typed_value,
117+
} => ShreddingState::FullyShredded {
118+
metadata: metadata.slice(offset, length),
119+
typed_value: typed_value.slice(offset, length),
120+
},
121+
ShreddingState::PartiallyShredded {
122+
metadata,
123+
value,
124+
typed_value,
125+
} => ShreddingState::PartiallyShredded {
126+
metadata: metadata.slice(offset, length),
127+
value: value.slice(offset, length),
128+
typed_value: typed_value.slice(offset, length),
129+
},
130+
}
131+
}
68132
}
69133

70134
impl VariantArray {
@@ -79,12 +143,22 @@ impl VariantArray {
79143
/// # Errors:
80144
/// - If the `StructArray` does not contain the required fields
81145
///
82-
/// # Current support
83-
/// This structure does not (yet) support the full Arrow Variant Array specification.
146+
/// # Requirements of the `StructArray`
147+
///
148+
/// 1. A required field named `metadata` which is binary, large_binary, or
149+
/// binary_view
84150
///
85-
/// Only `StructArrays` with `metadata` and `value` fields that are
86-
/// [`BinaryViewArray`] are supported. Shredded values are not currently supported
87-
/// nor are using types other than `BinaryViewArray`
151+
/// 2. An optional field named `value` that is binary, large_binary, or
152+
/// binary_view
153+
///
154+
/// 3. An optional field named `typed_value` which can be any primitive type
155+
/// or be a list, large_list, list_view or struct
156+
///
157+
/// NOTE: It is also permissible for the metadata field to be
158+
/// Dictionary-Encoded, preferably (but not required) with an index type of
159+
/// int8.
160+
///
161+
/// Currently, only [`BinaryViewArray`] is supported for the `metadata` and `value` fields.
88162
///
89163
/// [`BinaryViewArray`]: arrow::array::BinaryViewArray
90164
pub fn try_new(inner: ArrayRef) -> Result<Self, ArrowError> {
@@ -93,35 +167,64 @@ impl VariantArray {
93167
"Invalid VariantArray: requires StructArray as input".to_string(),
94168
));
95169
};
96-
// Ensure the StructArray has a metadata field of BinaryView
97170

98-
let Some(metadata_field) = VariantArray::find_metadata_field(inner) else {
171+
// Note the specification allows for any order so we must search by name
172+
173+
// Ensure the StructArray has a metadata field of BinaryView
174+
let Some(metadata_field) = inner.column_by_name("metadata") else {
99175
return Err(ArrowError::InvalidArgumentError(
100176
"Invalid VariantArray: StructArray must contain a 'metadata' field".to_string(),
101177
));
102178
};
103-
if metadata_field.data_type() != &DataType::BinaryView {
179+
let Some(metadata) = metadata_field.as_binary_view_opt() else {
104180
return Err(ArrowError::NotYetImplemented(format!(
105181
"VariantArray 'metadata' field must be BinaryView, got {}",
106182
metadata_field.data_type()
107183
)));
108-
}
109-
let Some(value_field) = VariantArray::find_value_field(inner) else {
110-
return Err(ArrowError::InvalidArgumentError(
111-
"Invalid VariantArray: StructArray must contain a 'value' field".to_string(),
112-
));
113184
};
114-
if value_field.data_type() != &DataType::BinaryView {
115-
return Err(ArrowError::NotYetImplemented(format!(
116-
"VariantArray 'value' field must be BinaryView, got {}",
117-
value_field.data_type()
118-
)));
119-
}
185+
186+
// Find the value field, if present
187+
let value_field = inner.column_by_name("value");
188+
let value = value_field
189+
.map(|v| match v.as_binary_view_opt() {
190+
Some(bv) => Ok(bv),
191+
None => Err(ArrowError::NotYetImplemented(format!(
192+
"VariantArray 'value' field must be BinaryView, got {}",
193+
v.data_type()
194+
))),
195+
})
196+
.transpose()?;
197+
198+
// Find the typed_value field, if present
199+
let typed_value = inner.column_by_name("typed_value");
200+
201+
// Note these clones are cheap, they just bump the ref count
202+
let inner = inner.clone();
203+
let metadata = metadata.clone();
204+
let value = value.cloned();
205+
let typed_value = typed_value.cloned();
206+
207+
let shredding_state = match (metadata, value, typed_value) {
208+
(metadata, Some(value), Some(typed_value)) => ShreddingState::PartiallyShredded {
209+
metadata,
210+
value,
211+
typed_value,
212+
},
213+
(metadata, Some(value), None) => ShreddingState::Unshredded { metadata, value },
214+
(metadata, None, Some(typed_value)) => ShreddingState::FullyShredded {
215+
metadata,
216+
typed_value,
217+
},
218+
(_metadata_field, None, None) => {
219+
return Err(ArrowError::InvalidArgumentError(String::from(
220+
"VariantArray has neither value nor typed_value field",
221+
)));
222+
}
223+
};
120224

121225
Ok(Self {
122-
inner: inner.clone(),
123-
metadata_ref: metadata_field,
124-
value_ref: value_field,
226+
inner,
227+
shredding_state,
125228
})
126229
}
127230

@@ -135,36 +238,87 @@ impl VariantArray {
135238
self.inner
136239
}
137240

241+
/// Return the shredding state of this `VariantArray`
242+
pub fn shredding_state(&self) -> &ShreddingState {
243+
&self.shredding_state
244+
}
245+
138246
/// Return the [`Variant`] instance stored at the given row
139247
///
140-
/// Panics if the index is out of bounds.
248+
/// Consistent with other Arrow array types, this API requires you to
249+
/// check for nulls first using [`Self::is_valid`].
250+
///
251+
/// # Panics
252+
/// * if the index is out of bounds
253+
/// * if the array value is null
254+
///
255+
/// For a fully shredded array, a null `typed_value` at `index` yields
256+
/// [`Variant::Null`]; a partially shredded array falls back to `value` instead.
257+
///
258+
///
259+
/// # Performance Note
260+
///
261+
/// This is certainly not the most efficient way to access values in a
262+
/// `VariantArray`, but it is useful for testing and debugging.
141263
///
142264
/// Note: Does not do deep validation of the [`Variant`], so it is up to the
143265
/// caller to ensure that the metadata and value were constructed correctly.
144266
pub fn value(&self, index: usize) -> Variant {
145-
let metadata = self.metadata_field().as_binary_view().value(index);
146-
let value = self.value_field().as_binary_view().value(index);
147-
Variant::new(metadata, value)
267+
match &self.shredding_state {
268+
ShreddingState::Unshredded { metadata, value } => {
269+
Variant::new(metadata.value(index), value.value(index))
270+
}
271+
ShreddingState::FullyShredded {
272+
metadata: _,
273+
typed_value,
274+
} => {
275+
if typed_value.is_null(index) {
276+
Variant::Null
277+
} else {
278+
typed_value_to_variant(typed_value, index)
279+
}
280+
}
281+
ShreddingState::PartiallyShredded {
282+
metadata,
283+
value,
284+
typed_value,
285+
} => {
286+
if typed_value.is_null(index) {
287+
Variant::new(metadata.value(index), value.value(index))
288+
} else {
289+
typed_value_to_variant(typed_value, index)
290+
}
291+
}
292+
}
148293
}
149294

150-
fn find_metadata_field(array: &StructArray) -> Option<ArrayRef> {
151-
array.column_by_name("metadata").cloned()
295+
/// Return a reference to the metadata field of the [`StructArray`]
296+
pub fn metadata_field(&self) -> &BinaryViewArray {
297+
self.shredding_state.metadata_field()
152298
}
153299

154-
fn find_value_field(array: &StructArray) -> Option<ArrayRef> {
155-
array.column_by_name("value").cloned()
300+
/// Return a reference to the value field of the `StructArray`, if present
301+
pub fn value_field(&self) -> Option<&BinaryViewArray> {
302+
self.shredding_state.value_field()
156303
}
157304

158-
/// Return a reference to the metadata field of the [`StructArray`]
159-
pub fn metadata_field(&self) -> &ArrayRef {
160-
// spec says fields order is not guaranteed, so we search by name
161-
&self.metadata_ref
305+
/// Return a reference to the typed_value field of the `StructArray`, if present
306+
pub fn typed_value_field(&self) -> Option<&ArrayRef> {
307+
self.shredding_state.typed_value_field()
162308
}
309+
}
163310

164-
/// Return a reference to the value field of the `StructArray`
165-
pub fn value_field(&self) -> &ArrayRef {
166-
// spec says fields order is not guaranteed, so we search by name
167-
&self.value_ref
311+
/// Returns the non-null element at `index` as a [`Variant`] (only `Int32` so far; other types panic)
312+
fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant {
313+
match typed_value.data_type() {
314+
DataType::Int32 => {
315+
let typed_value = typed_value.as_primitive::<Int32Type>();
316+
Variant::from(typed_value.value(index))
317+
}
318+
// todo other types here
319+
_ => {
320+
todo!(); // Unsupported typed_value type
321+
}
168322
}
169323
}
170324

@@ -186,13 +340,11 @@ impl Array for VariantArray {
186340
}
187341

188342
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
189-
let slice = self.inner.slice(offset, length);
190-
let met = self.metadata_ref.slice(offset, length);
191-
let val = self.value_ref.slice(offset, length);
343+
let inner = self.inner.slice(offset, length);
344+
let shredding_state = self.shredding_state.slice(offset, length);
192345
Arc::new(Self {
193-
inner: slice,
194-
metadata_ref: met,
195-
value_ref: val,
346+
inner,
347+
shredding_state,
196348
})
197349
}
198350

@@ -258,7 +410,7 @@ mod test {
258410
let err = VariantArray::try_new(Arc::new(array));
259411
assert_eq!(
260412
err.unwrap_err().to_string(),
261-
"Invalid argument error: Invalid VariantArray: StructArray must contain a 'value' field"
413+
"Invalid argument error: VariantArray has neither value nor typed_value field"
262414
);
263415
}
264416

parquet-variant-compute/src/variant_array_builder.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -375,7 +375,7 @@ mod test {
375375

376376
// the metadata and value fields of non-shredded variants should not be null
377377
assert!(variant_array.metadata_field().nulls().is_none());
378-
assert!(variant_array.value_field().nulls().is_none());
378+
assert!(variant_array.value_field().unwrap().nulls().is_none());
379379
let DataType::Struct(fields) = variant_array.data_type() else {
380380
panic!("Expected VariantArray to have Struct data type");
381381
};

0 commit comments

Comments
 (0)