Skip to content

Commit 81867eb

Browse files
authored
[Variant] Implement VariantArray::value for shredded variants (#8105)
# Which issue does this PR close? We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. - Closes #8091. # Rationale for this change Implement `VariantArray::value` for some more shredded variants (e.g. `primitive_conversion`/`generic_conversion`/`non_generic_conversion`). # What changes are included in this PR? - Extract all `macro_rules!` macros to a separate module, `type_conversion.rs` - Add a macro for `variant value` # Are these changes tested? Covered by the existing tests. # Are there any user-facing changes? No.
1 parent 4009514 commit 81867eb

File tree

8 files changed

+231
-170
lines changed

8 files changed

+231
-170
lines changed

parquet-variant-compute/src/cast_to_variant.rs

Lines changed: 34 additions & 140 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@
1717

1818
use std::sync::Arc;
1919

20+
use crate::type_conversion::{
21+
decimal_to_variant_decimal, generic_conversion_array, non_generic_conversion_array,
22+
primitive_conversion_array,
23+
};
2024
use crate::{VariantArray, VariantArrayBuilder};
2125
use arrow::array::{
2226
Array, AsArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
@@ -37,60 +41,10 @@ use arrow::temporal_conversions::{
3741
};
3842
use arrow_schema::{ArrowError, DataType, TimeUnit};
3943
use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, TimeZone, Utc};
40-
use half::f16;
4144
use parquet_variant::{
4245
Variant, VariantBuilder, VariantDecimal16, VariantDecimal4, VariantDecimal8,
4346
};
4447

45-
/// Convert the input array of a specific primitive type to a `VariantArray`
46-
/// row by row
47-
macro_rules! primitive_conversion {
48-
($t:ty, $input:expr, $builder:expr) => {{
49-
let array = $input.as_primitive::<$t>();
50-
for i in 0..array.len() {
51-
if array.is_null(i) {
52-
$builder.append_null();
53-
continue;
54-
}
55-
$builder.append_variant(Variant::from(array.value(i)));
56-
}
57-
}};
58-
}
59-
60-
/// Convert the input array to a `VariantArray` row by row, using `method`
61-
/// requiring a generic type to downcast the generic array to a specific
62-
/// array type and `cast_fn` to transform each element to a type compatible with Variant
63-
macro_rules! generic_conversion {
64-
($t:ty, $method:ident, $cast_fn:expr, $input:expr, $builder:expr) => {{
65-
let array = $input.$method::<$t>();
66-
for i in 0..array.len() {
67-
if array.is_null(i) {
68-
$builder.append_null();
69-
continue;
70-
}
71-
let cast_value = $cast_fn(array.value(i));
72-
$builder.append_variant(Variant::from(cast_value));
73-
}
74-
}};
75-
}
76-
77-
/// Convert the input array to a `VariantArray` row by row, using `method`
78-
/// not requiring a generic type to downcast the generic array to a specific
79-
/// array type and `cast_fn` to transform each element to a type compatible with Variant
80-
macro_rules! non_generic_conversion {
81-
($method:ident, $cast_fn:expr, $input:expr, $builder:expr) => {{
82-
let array = $input.$method();
83-
for i in 0..array.len() {
84-
if array.is_null(i) {
85-
$builder.append_null();
86-
continue;
87-
}
88-
let cast_value = $cast_fn(array.value(i));
89-
$builder.append_variant(Variant::from(cast_value));
90-
}
91-
}};
92-
}
93-
9448
fn convert_timestamp(
9549
time_unit: &TimeUnit,
9650
time_zone: &Option<Arc<str>>,
@@ -159,61 +113,6 @@ fn convert_timestamp(
159113
}
160114
}
161115

162-
/// Convert a decimal value to a `VariantDecimal`
163-
macro_rules! decimal_to_variant_decimal {
164-
($v:ident, $scale:expr, $value_type:ty, $variant_type:ty) => {
165-
if *$scale < 0 {
166-
// For negative scale, we need to multiply the value by 10^|scale|
167-
// For example: 123 with scale -2 becomes 12300
168-
let multiplier = (10 as $value_type).pow((-*$scale) as u32);
169-
// Check for overflow
170-
if $v > 0 && $v > <$value_type>::MAX / multiplier {
171-
return Variant::Null;
172-
}
173-
if $v < 0 && $v < <$value_type>::MIN / multiplier {
174-
return Variant::Null;
175-
}
176-
<$variant_type>::try_new($v * multiplier, 0)
177-
.map(|v| v.into())
178-
.unwrap_or(Variant::Null)
179-
} else {
180-
<$variant_type>::try_new($v, *$scale as u8)
181-
.map(|v| v.into())
182-
.unwrap_or(Variant::Null)
183-
}
184-
};
185-
}
186-
187-
/// Convert arrays that don't need generic type parameters
188-
macro_rules! cast_conversion_nongeneric {
189-
($method:ident, $cast_fn:expr, $input:expr, $builder:expr) => {{
190-
let array = $input.$method();
191-
for i in 0..array.len() {
192-
if array.is_null(i) {
193-
$builder.append_null();
194-
continue;
195-
}
196-
let cast_value = $cast_fn(array.value(i));
197-
$builder.append_variant(Variant::from(cast_value));
198-
}
199-
}};
200-
}
201-
202-
/// Convert string arrays using the offset size as the type parameter
203-
macro_rules! cast_conversion_string {
204-
($offset_type:ty, $method:ident, $cast_fn:expr, $input:expr, $builder:expr) => {{
205-
let array = $input.$method::<$offset_type>();
206-
for i in 0..array.len() {
207-
if array.is_null(i) {
208-
$builder.append_null();
209-
continue;
210-
}
211-
let cast_value = $cast_fn(array.value(i));
212-
$builder.append_variant(Variant::from(cast_value));
213-
}
214-
}};
215-
}
216-
217116
/// Casts a typed arrow [`Array`] to a [`VariantArray`]. This is useful when you
218117
/// need to convert a specific data type
219118
///
@@ -250,58 +149,52 @@ pub fn cast_to_variant(input: &dyn Array) -> Result<VariantArray, ArrowError> {
250149
// todo: handle other types like Boolean, Date, Timestamp, etc.
251150
match input_type {
252151
DataType::Boolean => {
253-
non_generic_conversion!(as_boolean, |v| v, input, builder);
152+
non_generic_conversion_array!(input.as_boolean(), |v| v, builder);
254153
}
255154
DataType::Binary => {
256-
generic_conversion!(BinaryType, as_bytes, |v| v, input, builder);
155+
generic_conversion_array!(BinaryType, as_bytes, |v| v, input, builder);
257156
}
258157
DataType::LargeBinary => {
259-
generic_conversion!(LargeBinaryType, as_bytes, |v| v, input, builder);
158+
generic_conversion_array!(LargeBinaryType, as_bytes, |v| v, input, builder);
260159
}
261160
DataType::BinaryView => {
262-
generic_conversion!(BinaryViewType, as_byte_view, |v| v, input, builder);
161+
generic_conversion_array!(BinaryViewType, as_byte_view, |v| v, input, builder);
263162
}
264163
DataType::Int8 => {
265-
primitive_conversion!(Int8Type, input, builder);
164+
primitive_conversion_array!(Int8Type, input, builder);
266165
}
267166
DataType::Int16 => {
268-
primitive_conversion!(Int16Type, input, builder);
167+
primitive_conversion_array!(Int16Type, input, builder);
269168
}
270169
DataType::Int32 => {
271-
primitive_conversion!(Int32Type, input, builder);
170+
primitive_conversion_array!(Int32Type, input, builder);
272171
}
273172
DataType::Int64 => {
274-
primitive_conversion!(Int64Type, input, builder);
173+
primitive_conversion_array!(Int64Type, input, builder);
275174
}
276175
DataType::UInt8 => {
277-
primitive_conversion!(UInt8Type, input, builder);
176+
primitive_conversion_array!(UInt8Type, input, builder);
278177
}
279178
DataType::UInt16 => {
280-
primitive_conversion!(UInt16Type, input, builder);
179+
primitive_conversion_array!(UInt16Type, input, builder);
281180
}
282181
DataType::UInt32 => {
283-
primitive_conversion!(UInt32Type, input, builder);
182+
primitive_conversion_array!(UInt32Type, input, builder);
284183
}
285184
DataType::UInt64 => {
286-
primitive_conversion!(UInt64Type, input, builder);
185+
primitive_conversion_array!(UInt64Type, input, builder);
287186
}
288187
DataType::Float16 => {
289-
generic_conversion!(
290-
Float16Type,
291-
as_primitive,
292-
|v: f16| -> f32 { v.into() },
293-
input,
294-
builder
295-
);
188+
generic_conversion_array!(Float16Type, as_primitive, f32::from, input, builder);
296189
}
297190
DataType::Float32 => {
298-
primitive_conversion!(Float32Type, input, builder);
191+
primitive_conversion_array!(Float32Type, input, builder);
299192
}
300193
DataType::Float64 => {
301-
primitive_conversion!(Float64Type, input, builder);
194+
primitive_conversion_array!(Float64Type, input, builder);
302195
}
303196
DataType::Decimal32(_, scale) => {
304-
generic_conversion!(
197+
generic_conversion_array!(
305198
Decimal32Type,
306199
as_primitive,
307200
|v| decimal_to_variant_decimal!(v, scale, i32, VariantDecimal4),
@@ -310,7 +203,7 @@ pub fn cast_to_variant(input: &dyn Array) -> Result<VariantArray, ArrowError> {
310203
);
311204
}
312205
DataType::Decimal64(_, scale) => {
313-
generic_conversion!(
206+
generic_conversion_array!(
314207
Decimal64Type,
315208
as_primitive,
316209
|v| decimal_to_variant_decimal!(v, scale, i64, VariantDecimal8),
@@ -319,7 +212,7 @@ pub fn cast_to_variant(input: &dyn Array) -> Result<VariantArray, ArrowError> {
319212
);
320213
}
321214
DataType::Decimal128(_, scale) => {
322-
generic_conversion!(
215+
generic_conversion_array!(
323216
Decimal128Type,
324217
as_primitive,
325218
|v| decimal_to_variant_decimal!(v, scale, i128, VariantDecimal16),
@@ -328,7 +221,7 @@ pub fn cast_to_variant(input: &dyn Array) -> Result<VariantArray, ArrowError> {
328221
);
329222
}
330223
DataType::Decimal256(_, scale) => {
331-
generic_conversion!(
224+
generic_conversion_array!(
332225
Decimal256Type,
333226
as_primitive,
334227
|v: i256| {
@@ -346,7 +239,7 @@ pub fn cast_to_variant(input: &dyn Array) -> Result<VariantArray, ArrowError> {
346239
);
347240
}
348241
DataType::FixedSizeBinary(_) => {
349-
non_generic_conversion!(as_fixed_size_binary, |v| v, input, builder);
242+
non_generic_conversion_array!(input.as_fixed_size_binary(), |v| v, builder);
350243
}
351244
DataType::Null => {
352245
for _ in 0..input.len() {
@@ -359,7 +252,7 @@ pub fn cast_to_variant(input: &dyn Array) -> Result<VariantArray, ArrowError> {
359252
DataType::Time32(unit) => {
360253
match *unit {
361254
TimeUnit::Second => {
362-
generic_conversion!(
255+
generic_conversion_array!(
363256
Time32SecondType,
364257
as_primitive,
365258
// nanoseconds are always 0
@@ -369,7 +262,7 @@ pub fn cast_to_variant(input: &dyn Array) -> Result<VariantArray, ArrowError> {
369262
);
370263
}
371264
TimeUnit::Millisecond => {
372-
generic_conversion!(
265+
generic_conversion_array!(
373266
Time32MillisecondType,
374267
as_primitive,
375268
|v| NaiveTime::from_num_seconds_from_midnight_opt(
@@ -392,7 +285,7 @@ pub fn cast_to_variant(input: &dyn Array) -> Result<VariantArray, ArrowError> {
392285
DataType::Time64(unit) => {
393286
match *unit {
394287
TimeUnit::Microsecond => {
395-
generic_conversion!(
288+
generic_conversion_array!(
396289
Time64MicrosecondType,
397290
as_primitive,
398291
|v| NaiveTime::from_num_seconds_from_midnight_opt(
@@ -405,7 +298,7 @@ pub fn cast_to_variant(input: &dyn Array) -> Result<VariantArray, ArrowError> {
405298
);
406299
}
407300
TimeUnit::Nanosecond => {
408-
generic_conversion!(
301+
generic_conversion_array!(
409302
Time64NanosecondType,
410303
as_primitive,
411304
|v| NaiveTime::from_num_seconds_from_midnight_opt(
@@ -433,13 +326,13 @@ pub fn cast_to_variant(input: &dyn Array) -> Result<VariantArray, ArrowError> {
433326
));
434327
}
435328
DataType::Utf8 => {
436-
cast_conversion_string!(i32, as_string, |v| v, input, builder);
329+
generic_conversion_array!(i32, as_string, |v| v, input, builder);
437330
}
438331
DataType::LargeUtf8 => {
439-
cast_conversion_string!(i64, as_string, |v| v, input, builder);
332+
generic_conversion_array!(i64, as_string, |v| v, input, builder);
440333
}
441334
DataType::Utf8View => {
442-
cast_conversion_nongeneric!(as_string_view, |v| v, input, builder);
335+
non_generic_conversion_array!(input.as_string_view(), |v| v, builder);
443336
}
444337
DataType::Struct(_) => {
445338
let struct_array = input.as_struct();
@@ -487,7 +380,7 @@ pub fn cast_to_variant(input: &dyn Array) -> Result<VariantArray, ArrowError> {
487380
}
488381
}
489382
DataType::Date32 => {
490-
generic_conversion!(
383+
generic_conversion_array!(
491384
Date32Type,
492385
as_primitive,
493386
|v: i32| -> NaiveDate { Date32Type::to_naive_date(v) },
@@ -496,7 +389,7 @@ pub fn cast_to_variant(input: &dyn Array) -> Result<VariantArray, ArrowError> {
496389
);
497390
}
498391
DataType::Date64 => {
499-
generic_conversion!(
392+
generic_conversion_array!(
500393
Date64Type,
501394
as_primitive,
502395
|v: i64| { Date64Type::to_naive_date_opt(v).unwrap() },
@@ -723,6 +616,7 @@ mod tests {
723616
use arrow_schema::{
724617
DECIMAL128_MAX_PRECISION, DECIMAL32_MAX_PRECISION, DECIMAL64_MAX_PRECISION,
725618
};
619+
use half::f16;
726620
use parquet_variant::{Variant, VariantDecimal16};
727621
use std::{sync::Arc, vec};
728622

parquet-variant-compute/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
pub mod cast_to_variant;
3939
mod from_json;
4040
mod to_json;
41+
mod type_conversion;
4142
mod variant_array;
4243
mod variant_array_builder;
4344
pub mod variant_get;

0 commit comments

Comments
 (0)