Skip to content

Commit 5d79a9c

Browse files
committed
[Variant] Support variant's typed_value for DataType::{Binary/LargeBinary/BinaryView}
1 parent 40300ca commit 5d79a9c

File tree

5 files changed

+157
-13
lines changed

5 files changed

+157
-13
lines changed

arrow-array/src/builder/generic_bytes_builder.rs

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,7 @@ impl<O: OffsetSizeTrait> std::fmt::Write for GenericStringBuilder<O> {
357357
/// We will use the `AVERAGE_STRING_LENGTH` * row_count for `data_capacity`. \
358358
///
359359
/// These capacities are preallocation hints used to improve performance,
360-
/// but consuquences of passing a hint too large or too small should be negligible.
360+
/// but consequences of passing a hint too large or too small should be negligible.
361361
const AVERAGE_STRING_LENGTH: usize = 16;
362362
/// Trait for string-like array builders
363363
///
@@ -392,6 +392,50 @@ impl<O: OffsetSizeTrait> StringLikeArrayBuilder for GenericStringBuilder<O> {
392392
}
393393
}
394394

395+
/// A byte size value representing the number of bytes to allocate per binary in [`GenericBinaryBuilder`]
396+
///
397+
/// To create a [`GenericBinaryBuilder`] using `.with_capacity` we are required to provide: \
398+
/// - `item_capacity` - the row count \
399+
/// - `data_capacity` - total binary byte count \
400+
///
401+
/// We will use the `AVERAGE_BINARY_LENGTH` * row_count for `data_capacity`. \
402+
///
403+
/// These capacities are preallocation hints used to improve performance,
404+
/// but consequences of passing a hint too large or too small should be negligible.
405+
const AVERAGE_BINARY_LENGTH: usize = 128;
406+
/// Trait for binary-like array builders
407+
///
408+
/// This trait provides a unified interface for builders that append binary-like data
409+
/// such as [`GenericBinaryBuilder<O>`] and [`crate::builder::BinaryViewBuilder`]
410+
pub trait BinaryLikeArrayBuilder: ArrayBuilder {
411+
/// Returns a human-readable type name for the builder.
412+
fn type_name() -> &'static str;
413+
414+
/// Creates a new builder with the given row capacity.
415+
fn with_capacity(capacity: usize) -> Self;
416+
417+
/// Appends a non-null binary value to the builder.
418+
fn append_value(&mut self, value: &[u8]);
419+
420+
/// Appends a null value to the builder.
421+
fn append_null(&mut self);
422+
}
423+
424+
impl<O: OffsetSizeTrait> BinaryLikeArrayBuilder for GenericBinaryBuilder<O> {
425+
fn type_name() -> &'static str {
426+
std::any::type_name::<Self>()
427+
}
428+
fn with_capacity(capacity: usize) -> Self {
429+
Self::with_capacity(capacity, capacity * AVERAGE_BINARY_LENGTH)
430+
}
431+
fn append_value(&mut self, value: &[u8]) {
432+
Self::append_value(self, value);
433+
}
434+
fn append_null(&mut self) {
435+
Self::append_null(self);
436+
}
437+
}
438+
395439
/// Array builder for [`GenericBinaryArray`][crate::GenericBinaryArray]
396440
///
397441
/// Values can be appended using [`GenericByteBuilder::append_value`], and nulls with

arrow-array/src/builder/generic_bytes_view_builder.rs

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ use arrow_schema::ArrowError;
2525
use hashbrown::HashTable;
2626
use hashbrown::hash_table::Entry;
2727

28-
use crate::builder::{ArrayBuilder, StringLikeArrayBuilder};
28+
use crate::builder::{ArrayBuilder, BinaryLikeArrayBuilder, StringLikeArrayBuilder};
2929
use crate::types::bytes::ByteArrayNativeType;
3030
use crate::types::{BinaryViewType, ByteViewType, StringViewType};
3131
use crate::{Array, ArrayRef, GenericByteViewArray};
@@ -570,6 +570,21 @@ impl StringLikeArrayBuilder for StringViewBuilder {
570570
///
571571
pub type BinaryViewBuilder = GenericByteViewBuilder<BinaryViewType>;
572572

573+
impl BinaryLikeArrayBuilder for BinaryViewBuilder {
574+
fn type_name() -> &'static str {
575+
std::any::type_name::<BinaryViewBuilder>()
576+
}
577+
fn with_capacity(capacity: usize) -> Self {
578+
Self::with_capacity(capacity)
579+
}
580+
fn append_value(&mut self, value: &[u8]) {
581+
Self::append_value(self, value);
582+
}
583+
fn append_null(&mut self) {
584+
Self::append_null(self);
585+
}
586+
}
587+
573588
/// Creates a view from a fixed length input (the compiler can generate
574589
/// specialized code for this)
575590
fn make_inlined_view<const LEN: usize>(data: &[u8]) -> u128 {

parquet-variant-compute/src/variant_array.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1172,17 +1172,17 @@ fn canonicalize_and_verify_data_type(data_type: &DataType) -> Result<Cow<'_, Dat
11721172
Date32 | Time64(TimeUnit::Microsecond) => borrow!(),
11731173
Date64 | Time32(_) | Time64(_) | Duration(_) | Interval(_) => fail!(),
11741174

1175-
// Binary and string are allowed. Force Binary to BinaryView because that's what the parquet
1175+
// Binary and string are allowed. Force Binary/LargeBinary to BinaryView because that's what the parquet
11761176
// reader returns and what the rest of the variant code expects.
1177-
Binary => Cow::Owned(DataType::BinaryView),
1177+
Binary | LargeBinary => Cow::Owned(BinaryView),
11781178
BinaryView | Utf8 | LargeUtf8 | Utf8View => borrow!(),
11791179

11801180
// UUID maps to 16-byte fixed-size binary; no other width is allowed
11811181
FixedSizeBinary(16) => borrow!(),
11821182
FixedSizeBinary(_) | FixedSizeList(..) => fail!(),
11831183

11841184
// We can _possibly_ allow (some of) these some day?
1185-
LargeBinary | ListView(_) | LargeList(_) | LargeListView(_) => {
1185+
ListView(_) | LargeList(_) | LargeListView(_) => {
11861186
fail!()
11871187
}
11881188

parquet-variant-compute/src/variant_get.rs

Lines changed: 63 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -306,13 +306,15 @@ mod test {
306306
use std::sync::Arc;
307307

308308
use super::{GetOptions, variant_get};
309+
use crate::arrow_to_variant::ArrowToVariantRowBuilder::LargeBinary;
309310
use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder};
310311
use crate::{VariantArray, VariantArrayBuilder, json_to_variant};
311312
use arrow::array::{
312-
Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, Decimal32Array,
313-
Decimal64Array, Decimal128Array, Decimal256Array, Float32Array, Float64Array, Int8Array,
314-
Int16Array, Int32Array, Int64Array, LargeStringArray, NullBuilder, StringArray,
315-
StringViewArray, StructArray, Time64MicrosecondArray,
313+
Array, ArrayRef, AsArray, BinaryArray, BinaryViewArray, BooleanArray, Date32Array,
314+
Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array, Float32Array,
315+
Float64Array, Int8Array, Int16Array, Int32Array, Int64Array, LargeBinaryArray,
316+
LargeStringArray, NullBuilder, StringArray, StringViewArray, StructArray,
317+
Time64MicrosecondArray,
316318
};
317319
use arrow::buffer::NullBuffer;
318320
use arrow::compute::CastOptions;
@@ -1316,6 +1318,63 @@ mod test {
13161318
)
13171319
}
13181320

1321+
perfectly_shredded_variant_array_fn!(perfectly_shredded_binary_variant_array, || {
1322+
BinaryArray::from(vec![
1323+
Some(b"Apache" as &[u8]),
1324+
Some(b"Arrow-rs" as &[u8]),
1325+
Some(b"Parquet-variant" as &[u8]),
1326+
])
1327+
});
1328+
1329+
perfectly_shredded_to_arrow_primitive_test!(
1330+
get_variant_perfectly_shredded_binary_as_binary,
1331+
DataType::Binary,
1332+
perfectly_shredded_binary_variant_array,
1333+
BinaryArray::from(vec![
1334+
Some(b"Apache" as &[u8]),
1335+
Some(b"Arrow-rs" as &[u8]),
1336+
Some(b"Parquet-variant" as &[u8]),
1337+
])
1338+
);
1339+
1340+
perfectly_shredded_variant_array_fn!(perfectly_shredded_large_binary_variant_array, || {
1341+
LargeBinaryArray::from(vec![
1342+
Some(b"Apache" as &[u8]),
1343+
Some(b"Arrow-rs" as &[u8]),
1344+
Some(b"Parquet-variant" as &[u8]),
1345+
])
1346+
});
1347+
1348+
perfectly_shredded_to_arrow_primitive_test!(
1349+
get_variant_perfectly_shredded_large_binary_as_large_binary,
1350+
DataType::LargeBinary,
1351+
perfectly_shredded_large_binary_variant_array,
1352+
LargeBinaryArray::from(vec![
1353+
Some(b"Apache" as &[u8]),
1354+
Some(b"Arrow-rs" as &[u8]),
1355+
Some(b"Parquet-variant" as &[u8]),
1356+
])
1357+
);
1358+
1359+
perfectly_shredded_variant_array_fn!(perfectly_shredded_binary_view_variant_array, || {
1360+
BinaryViewArray::from(vec![
1361+
Some(b"Apache" as &[u8]),
1362+
Some(b"Arrow-rs" as &[u8]),
1363+
Some(b"Parquet-variant" as &[u8]),
1364+
])
1365+
});
1366+
1367+
perfectly_shredded_to_arrow_primitive_test!(
1368+
get_variant_perfectly_shredded_binary_view_as_binary_view,
1369+
DataType::BinaryView,
1370+
perfectly_shredded_binary_view_variant_array,
1371+
BinaryViewArray::from(vec![
1372+
Some(b"Apache" as &[u8]),
1373+
Some(b"Arrow-rs" as &[u8]),
1374+
Some(b"Parquet-variant" as &[u8]),
1375+
])
1376+
);
1377+
13191378
/// Return a VariantArray that represents a normal "shredded" variant
13201379
/// for the following example
13211380
///

parquet-variant-compute/src/variant_to_arrow.rs

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,12 @@
1616
// under the License.
1717

1818
use arrow::array::{
19-
ArrayRef, BinaryViewArray, BooleanBuilder, FixedSizeBinaryBuilder, LargeStringBuilder,
20-
NullArray, NullBufferBuilder, PrimitiveBuilder, StringBuilder, StringLikeArrayBuilder,
21-
StringViewBuilder,
19+
ArrayRef, BinaryLikeArrayBuilder, BinaryViewArray, BinaryViewBuilder, BooleanBuilder,
20+
FixedSizeBinaryBuilder, GenericByteBuilder, LargeStringBuilder, NullArray, NullBufferBuilder,
21+
PrimitiveBuilder, StringBuilder, StringLikeArrayBuilder, StringViewBuilder,
2222
};
2323
use arrow::compute::{CastOptions, DecimalCast};
24-
use arrow::datatypes::{self, DataType, DecimalType};
24+
use arrow::datatypes::{self, BinaryType, DataType, DecimalType, LargeBinaryType};
2525
use arrow::error::{ArrowError, Result};
2626
use parquet_variant::{Variant, VariantPath};
2727

@@ -66,6 +66,9 @@ pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> {
6666
String(VariantToStringArrowBuilder<'a, StringBuilder>),
6767
LargeString(VariantToStringArrowBuilder<'a, LargeStringBuilder>),
6868
StringView(VariantToStringArrowBuilder<'a, StringViewBuilder>),
69+
Binary(VariantToBinaryArrowRowBuilder<'a, GenericByteBuilder<BinaryType>>),
70+
LargeBinary(VariantToBinaryArrowRowBuilder<'a, GenericByteBuilder<LargeBinaryType>>),
71+
BinaryView(VariantToBinaryArrowRowBuilder<'a, BinaryViewBuilder>),
6972
}
7073

7174
/// Builder for converting variant values into strongly typed Arrow arrays.
@@ -111,6 +114,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
111114
String(b) => b.append_null(),
112115
LargeString(b) => b.append_null(),
113116
StringView(b) => b.append_null(),
117+
Binary(b) => b.append_null(),
118+
LargeBinary(b) => b.append_null(),
119+
BinaryView(b) => b.append_null(),
114120
}
115121
}
116122

@@ -144,6 +150,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
144150
String(b) => b.append_value(value),
145151
LargeString(b) => b.append_value(value),
146152
StringView(b) => b.append_value(value),
153+
Binary(b) => b.append_value(value),
154+
LargeBinary(b) => b.append_value(value),
155+
BinaryView(b) => b.append_value(value),
147156
}
148157
}
149158

@@ -177,6 +186,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
177186
String(b) => b.finish(),
178187
LargeString(b) => b.finish(),
179188
StringView(b) => b.finish(),
189+
Binary(b) => b.finish(),
190+
LargeBinary(b) => b.finish(),
191+
BinaryView(b) => b.finish(),
180192
}
181193
}
182194
}
@@ -322,6 +334,13 @@ pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>(
322334
LargeString(VariantToStringArrowBuilder::new(cast_options, capacity))
323335
}
324336
DataType::Utf8View => StringView(VariantToStringArrowBuilder::new(cast_options, capacity)),
337+
DataType::Binary => Binary(VariantToBinaryArrowRowBuilder::new(cast_options, capacity)),
338+
DataType::LargeBinary => {
339+
LargeBinary(VariantToBinaryArrowRowBuilder::new(cast_options, capacity))
340+
}
341+
DataType::BinaryView => {
342+
BinaryView(VariantToBinaryArrowRowBuilder::new(cast_options, capacity))
343+
}
325344
_ if data_type.is_primitive() => {
326345
return Err(ArrowError::NotYetImplemented(format!(
327346
"Primitive data_type {data_type:?} not yet implemented"
@@ -506,6 +525,13 @@ define_variant_to_primitive_builder!(
506525
type_name: T::DATA_TYPE
507526
);
508527

528+
define_variant_to_primitive_builder!(
529+
struct VariantToBinaryArrowRowBuilder<'a, B: BinaryLikeArrayBuilder>
530+
|capacity| -> B { B::with_capacity(capacity) },
531+
|value| value.as_u8_slice(),
532+
type_name: B::type_name()
533+
);
534+
509535
/// Builder for converting variant values to arrow Decimal values
510536
pub(crate) struct VariantToDecimalArrowRowBuilder<'a, T>
511537
where

0 commit comments

Comments
 (0)