From 2134dcc6b2f1635d4a6ac64a695c9522c9a29c2b Mon Sep 17 00:00:00 2001 From: klion26 Date: Sat, 27 Sep 2025 11:36:51 +0800 Subject: [PATCH 01/11] [Variant] Support primitive variant to arrow row for boolean --- parquet-variant-compute/src/variant_get.rs | 77 ++++++++++++++----- .../src/variant_to_arrow.rs | 57 +++++++++++++- 2 files changed, 115 insertions(+), 19 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 4859abe8aaed..d34d10ed34df 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -306,6 +306,7 @@ mod test { use arrow::buffer::NullBuffer; use arrow::compute::CastOptions; use arrow::datatypes::DataType::{Int16, Int32, Int64}; + use arrow_schema::DataType::{Boolean, Float32, Float64, Int8}; use arrow_schema::{DataType, Field, FieldRef, Fields}; use chrono::DateTime; use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, Variant, VariantPath}; @@ -713,6 +714,13 @@ mod test { }; } + perfectly_shredded_to_arrow_primitive_test!( + get_variant_perfectly_shredded_int18_as_int8, + Int8, + perfectly_shredded_int8_variant_array, + Int8Array::from(vec![Some(1), Some(2), Some(3)]) + ); + perfectly_shredded_to_arrow_primitive_test!( get_variant_perfectly_shredded_int16_as_int16, Int16, @@ -734,19 +742,29 @@ mod test { Int64Array::from(vec![Some(1), Some(2), Some(3)]) ); - /// Return a VariantArray that represents a perfectly "shredded" variant - /// for the given typed value. - /// - /// The schema of the corresponding `StructArray` would look like this: - /// - /// ```text - /// StructArray { - /// metadata: BinaryViewArray, - /// typed_value: Int32Array, - /// } - /// ``` - macro_rules! numeric_perfectly_shredded_variant_array_fn { - ($func:ident, $array_type:ident, $primitive_type:ty) => { + perfectly_shredded_to_arrow_primitive_test!( + get_variant_perfectly_shredded_float32_as_float32, + Float32, + perfectly_shredded_float32_variant_array, + Float32Array::from(vec![Some(1.0), Some(2.0), Some(3.0)]) + ); + + perfectly_shredded_to_arrow_primitive_test!( + get_variant_perfectly_shredded_float64_as_float64, + Float64, + perfectly_shredded_float64_variant_array, + Float64Array::from(vec![Some(1.0), Some(2.0), Some(3.0)]) + ); + + perfectly_shredded_to_arrow_primitive_test!( + get_variant_perfectly_shredded_boolean_as_boolean, + Boolean, + perfectly_shredded_bool_variant_array, + BooleanArray::from(vec![Some(true), Some(false), Some(true)]) + ); + + macro_rules! perfectly_shredded_variant_array_fn { + ($func:ident, $typed_value_gen:expr) => { fn $func() -> ArrayRef { // At the time of writing, the `VariantArrayBuilder` does not support shredding. // so we must construct the array manually. see https://github.com/apache/arrow-rs/issues/7895 @@ -754,11 +772,7 @@ mod test { EMPTY_VARIANT_METADATA_BYTES, 3, )); - let typed_value = $array_type::from(vec![ - Some(<$primitive_type>::try_from(1u8).unwrap()), - Some(<$primitive_type>::try_from(2u8).unwrap()), - Some(<$primitive_type>::try_from(3u8).unwrap()), - ]); + let typed_value = $typed_value_gen(); let struct_array = StructArrayBuilder::new() .with_field("metadata", Arc::new(metadata), false) @@ -772,6 +786,33 @@ mod test { }; } + perfectly_shredded_variant_array_fn!(perfectly_shredded_bool_variant_array, || { + BooleanArray::from(vec![Some(true), Some(false), Some(true)]) + }); + + /// Return a VariantArray that represents a perfectly "shredded" variant + /// for the given typed value. + /// + /// The schema of the corresponding `StructArray` would look like this: + /// + /// ```text + /// StructArray { + /// metadata: BinaryViewArray, + /// typed_value: Int32Array, + /// } + /// ``` + macro_rules! numeric_perfectly_shredded_variant_array_fn { + ($func:ident, $array_type:ident, $primitive_type:ty) => { + perfectly_shredded_variant_array_fn!($func, || { + $array_type::from(vec![ + Some(<$primitive_type>::try_from(1u8).unwrap()), + Some(<$primitive_type>::try_from(2u8).unwrap()), + Some(<$primitive_type>::try_from(3u8).unwrap()), + ]) + }); + }; + } + numeric_perfectly_shredded_variant_array_fn!( perfectly_shredded_int8_variant_array, Int8Array, diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index 50249aa63d20..7e2220474d46 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -15,7 +15,9 @@ // specific language governing permissions and limitations // under the License. -use arrow::array::{ArrayRef, BinaryViewArray, NullBufferBuilder, PrimitiveBuilder}; +use arrow::array::{ + Array, ArrayRef, BinaryViewArray, NullBufferBuilder, PrimitiveArray, PrimitiveBuilder, +}; use arrow::compute::CastOptions; use arrow::datatypes::{self, ArrowPrimitiveType, DataType}; use arrow::error::{ArrowError, Result}; @@ -30,6 +32,7 @@ use std::sync::Arc; /// `VariantToArrowRowBuilder` (below) and `VariantToShreddedPrimitiveVariantRowBuilder` (in /// `shred_variant.rs`). pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> { + Boolean(VariantToBooleanArrowRowBuilder<'a>), Int8(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Int8Type>), Int16(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Int16Type>), Int32(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Int32Type>), @@ -41,6 +44,7 @@ pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> { Float16(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float16Type>), Float32(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float32Type>), Float64(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float64Type>), + Boolean(VariantToBooleanArrowRowBuilder<'a>), } /// Builder for converting variant values into strongly typed Arrow arrays. @@ -59,6 +63,7 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { pub fn append_null(&mut self) -> Result<()> { use PrimitiveVariantToArrowRowBuilder::*; match self { + Boolean(b) => b.append_null(), Int8(b) => b.append_null(), Int16(b) => b.append_null(), Int32(b) => b.append_null(), @@ -70,6 +75,8 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { Float16(b) => b.append_null(), Float32(b) => b.append_null(), Float64(b) => b.append_null(), + TimestampMicro(b) => b.append_null(), + TimestampNano(b) => b.append_null(), } } @@ -87,12 +94,14 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { Float16(b) => b.append_value(value), Float32(b) => b.append_value(value), Float64(b) => b.append_value(value), + Boolean(b) => b.append_value(value), } } pub fn finish(self) -> Result { use PrimitiveVariantToArrowRowBuilder::*; match self { + Boolean(b) => b.finish(), Int8(b) => b.finish(), Int16(b) => b.finish(), Int32(b) => b.finish(), @@ -104,6 +113,7 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { Float16(b) => b.finish(), Float32(b) => b.finish(), Float64(b) => b.finish(), + Boolean(b) => b.finish(), } } } @@ -146,6 +156,7 @@ pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>( use PrimitiveVariantToArrowRowBuilder::*; let builder = match data_type { + DataType::Boolean => Boolean(VariantToBooleanArrowRowBuilder::new(cast_options, capacity)), DataType::Int8 => Int8(VariantToPrimitiveArrowRowBuilder::new( cast_options, capacity, @@ -190,6 +201,7 @@ pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>( cast_options, capacity, )), + DataType::Boolean => Boolean(VariantToBooleanArrowRowBuilder::new(cast_options, capacity)), _ if data_type.is_primitive() => { return Err(ArrowError::NotYetImplemented(format!( "Primitive data_type {data_type:?} not yet implemented" @@ -297,6 +309,49 @@ fn get_type_name() -> &'static str { } } +/// Builder for converting variant values to boolean values +/// Boolean is not primitive types in Arrow, so we need a separate builder +pub(crate) struct VariantToBooleanArrowRowBuilder<'a> { + builder: arrow::array::BooleanBuilder, + cast_options: &'a CastOptions<'a>, +} + +impl<'a> VariantToBooleanArrowRowBuilder<'a> { + fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self { + Self { + builder: arrow::array::BooleanBuilder::with_capacity(capacity), + cast_options, + } + } + + fn append_null(&mut self) -> Result<()> { + self.builder.append_null(); + Ok(()) + } + + fn append_value(&mut self, value: &Variant<'_, '_>) -> Result { + if let Some(v) = value.as_boolean() { + self.builder.append_value(v); + Ok(true) + } else { + if !self.cast_options.safe { + // Unsafe casting: return error on conversion failure + return Err(ArrowError::CastError(format!( + "Failed to extract boolean from variant {:?} at path VariantPath([])", + value + ))); + } + // Safe casting: append null on conversion failure + self.builder.append_null(); + Ok(false) + } + } + + fn finish(mut self) -> Result { + Ok(Arc::new(self.builder.finish())) + } +} + /// Builder for converting variant values to primitive values pub(crate) struct VariantToPrimitiveArrowRowBuilder<'a, T: PrimitiveFromVariant> { builder: arrow::array::PrimitiveBuilder, From 76f51ad21bf46301aba688e8a37ba05115e0a0d7 Mon Sep 17 00:00:00 2001 From: klion26 Date: Sun, 28 Sep 2025 16:57:19 +0800 Subject: [PATCH 02/11] [Variant] Support primitive variant to arrow row for timestamp(micro&nano) and time --- .../src/type_conversion.rs | 29 +++++ parquet-variant-compute/src/variant_get.rs | 105 +++++++++++++++++- .../src/variant_to_arrow.rs | 62 ++++++++++- 3 files changed, 188 insertions(+), 8 deletions(-) diff --git a/parquet-variant-compute/src/type_conversion.rs b/parquet-variant-compute/src/type_conversion.rs index 5dda1855297a..a985542e5360 100644 --- a/parquet-variant-compute/src/type_conversion.rs +++ b/parquet-variant-compute/src/type_conversion.rs @@ -18,6 +18,7 @@ //! Module for transforming a typed arrow `Array` to `VariantArray`. use arrow::datatypes::{self, ArrowPrimitiveType}; +use chrono::Datelike; use parquet_variant::Variant; /// Options for controlling the behavior of `cast_to_variant_with_options`. @@ -61,6 +62,34 @@ impl_primitive_from_variant!(datatypes::Float16Type, as_f16); impl_primitive_from_variant!(datatypes::Float32Type, as_f32); impl_primitive_from_variant!(datatypes::Float64Type, as_f64); +impl VariantAsPrimitive for Variant<'_, '_> { + fn as_primitive(&self) -> Option { + match self { + Variant::TimestampMicros(dt) => Some(dt.timestamp_micros()), + Variant::TimestampNtzMicros(ndt) => Some(ndt.and_utc().timestamp_micros()), + _ => None, + } + } +} + +impl VariantAsPrimitive for Variant<'_, '_> { + fn as_primitive(&self) -> Option { + match self { + Variant::TimestampNanos(dt) => dt.timestamp_nanos_opt(), + Variant::TimestampNtzNanos(ndt) => ndt.and_utc().timestamp_nanos_opt(), + _ => None, + } + } +} + +impl VariantAsPrimitive for Variant<'_, '_> { + fn as_primitive(&self) -> Option { + // The number of days from 0001-01-01 to 1970-01-01. + const DAYS_FROM_CE_TO_UNIX_EPOCH: i32 = 719163; + self.as_naive_date() + .map(|d| d.num_days_from_ce() - DAYS_FROM_CE_TO_UNIX_EPOCH) + } +} /// Convert the value at a specific index in the given array into a `Variant`. macro_rules! non_generic_conversion_single_value { ($array:expr, $cast_fn:expr, $index:expr) => {{ diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index d34d10ed34df..fc34493c9e7e 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -300,14 +300,14 @@ mod test { use crate::json_to_variant; use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder}; use arrow::array::{ - Array, ArrayRef, AsArray, BinaryViewArray, Date32Array, Float32Array, Float64Array, - Int8Array, Int16Array, Int32Array, Int64Array, StringArray, StructArray, + Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, Float32Array, + Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, StringArray, StructArray, }; use arrow::buffer::NullBuffer; use arrow::compute::CastOptions; use arrow::datatypes::DataType::{Int16, Int32, Int64}; use arrow_schema::DataType::{Boolean, Float32, Float64, Int8}; - use arrow_schema::{DataType, Field, FieldRef, Fields}; + use arrow_schema::{DataType, Field, FieldRef, Fields, TimeUnit}; use chrono::DateTime; use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, Variant, VariantPath}; @@ -701,7 +701,7 @@ mod test { } macro_rules! perfectly_shredded_to_arrow_primitive_test { - ($name:ident, $primitive_type:ident, $perfectly_shredded_array_gen_fun:ident, $expected_array:expr) => { + ($name:ident, $primitive_type:expr, $perfectly_shredded_array_gen_fun:ident, $expected_array:expr) => { #[test] fn $name() { let array = $perfectly_shredded_array_gen_fun(); @@ -844,6 +844,103 @@ mod test { f64 ); + perfectly_shredded_variant_array_fn!( + perfectly_shredded_timestamp_micro_ntz_variant_array, + || { + arrow::array::TimestampMicrosecondArray::from(vec![ + Some(-456000), + Some(1758602096000001), + Some(1758602096000002), + ]) + } + ); + + perfectly_shredded_to_arrow_primitive_test!( + get_variant_perfectly_shredded_timestamp_micro_ntz_as_timestamp_micro_ntz, + DataType::Timestamp(TimeUnit::Microsecond, None), + perfectly_shredded_timestamp_micro_ntz_variant_array, + arrow::array::TimestampMicrosecondArray::from(vec![ + Some(-456000), + Some(1758602096000001), + Some(1758602096000002), + ]) + ); + + perfectly_shredded_variant_array_fn!(perfectly_shredded_timestamp_micro_variant_array, || { + arrow::array::TimestampMicrosecondArray::from(vec![ + Some(-456000), + Some(1758602096000001), + Some(1758602096000002), + ]) + .with_timezone("+00:00") + }); + + perfectly_shredded_to_arrow_primitive_test!( + get_variant_perfectly_shredded_timestamp_micro_as_timestamp_micro, + DataType::Timestamp(TimeUnit::Microsecond, Some(Arc::from("+00:00"))), + perfectly_shredded_timestamp_micro_variant_array, + arrow::array::TimestampMicrosecondArray::from(vec![ + Some(-456000), + Some(1758602096000001), + Some(1758602096000002), + ]) + .with_timezone("+00:00") + ); + + perfectly_shredded_variant_array_fn!( + perfectly_shredded_timestamp_nano_ntz_variant_array, + || { + arrow::array::TimestampNanosecondArray::from(vec![ + Some(-4999999561), + Some(1758602096000000001), + Some(1758602096000000002), + ]) + } + ); + + perfectly_shredded_to_arrow_primitive_test!( + get_variant_perfectly_shredded_timestamp_nano_ntz_as_timestamp_nano_ntz, + DataType::Timestamp(TimeUnit::Nanosecond, None), + perfectly_shredded_timestamp_nano_ntz_variant_array, + arrow::array::TimestampNanosecondArray::from(vec![ + Some(-4999999561), + Some(1758602096000000001), + Some(1758602096000000002), + ]) + ); + + perfectly_shredded_variant_array_fn!(perfectly_shredded_timestamp_nano_variant_array, || { + arrow::array::TimestampNanosecondArray::from(vec![ + Some(-4999999561), + Some(1758602096000000001), + Some(1758602096000000002), + ]) + .with_timezone("+00:00") + }); + + perfectly_shredded_to_arrow_primitive_test!( + get_variant_perfectly_shredded_timestamp_nano_as_timestamp_nano, + DataType::Timestamp(TimeUnit::Nanosecond, Some(Arc::from("+00:00"))), + perfectly_shredded_timestamp_nano_variant_array, + arrow::array::TimestampNanosecondArray::from(vec![ + Some(-4999999561), + Some(1758602096000000001), + Some(1758602096000000002), + ]) + .with_timezone("+00:00") + ); + + perfectly_shredded_variant_array_fn!(perfectly_shredded_date_variant_array, || { + Date32Array::from(vec![Some(-12345), Some(17586), Some(20000)]) + }); + + perfectly_shredded_to_arrow_primitive_test!( + get_variant_perfectly_shredded_date_as_date, + DataType::Date32, + perfectly_shredded_date_variant_array, + Date32Array::from(vec![Some(-12345), Some(17586), Some(20000)]) + ); + macro_rules! assert_variant_get_as_variant_array_with_default_option { ($variant_array: expr, $array_expected: expr) => {{ let options = GetOptions::new(); diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index 7e2220474d46..6e3f9220b589 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -26,6 +26,8 @@ use parquet_variant::{Variant, VariantPath}; use crate::type_conversion::PrimitiveFromVariant; use crate::{VariantArray, VariantValueArrayBuilder}; +use arrow_schema::DataType::Date32; +use arrow_schema::TimeUnit; use std::sync::Arc; /// Builder for converting variant values to primitive Arrow arrays. It is used by both @@ -44,7 +46,9 @@ pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> { Float16(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float16Type>), Float32(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float32Type>), Float64(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float64Type>), - Boolean(VariantToBooleanArrowRowBuilder<'a>), + TimestampMicro(VariantToPrimitiveArrowRowBuilder<'a, datatypes::TimestampMicrosecondType>), + TimestampNano(VariantToPrimitiveArrowRowBuilder<'a, datatypes::TimestampNanosecondType>), + Date(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Date32Type>), } /// Builder for converting variant values into strongly typed Arrow arrays. @@ -77,6 +81,7 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { Float64(b) => b.append_null(), TimestampMicro(b) => b.append_null(), TimestampNano(b) => b.append_null(), + Date(b) => b.append_null(), } } @@ -95,6 +100,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { Float32(b) => b.append_value(value), Float64(b) => b.append_value(value), Boolean(b) => b.append_value(value), + TimestampMicro(b) => b.append_value(value), + TimestampNano(b) => b.append_value(value), + Date(b) => b.append_value(value), } } @@ -113,7 +121,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { Float16(b) => b.finish(), Float32(b) => b.finish(), Float64(b) => b.finish(), - Boolean(b) => b.finish(), + TimestampMicro(b) => b.finish(), + TimestampNano(b) => b.finish(), + Date(b) => b.finish(), } } } @@ -201,7 +211,29 @@ pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>( cast_options, capacity, )), - DataType::Boolean => Boolean(VariantToBooleanArrowRowBuilder::new(cast_options, capacity)), + DataType::Timestamp(TimeUnit::Microsecond, tz) => { + let target_type = DataType::Timestamp(TimeUnit::Microsecond, tz.clone()); + + TimestampMicro(VariantToPrimitiveArrowRowBuilder::new_with_target_type( + cast_options, + capacity, + Some(target_type), + )) + } + DataType::Timestamp(TimeUnit::Nanosecond, tz) => { + let target_type = DataType::Timestamp(TimeUnit::Nanosecond, tz.clone()); + + TimestampNano(VariantToPrimitiveArrowRowBuilder::new_with_target_type( + cast_options, + capacity, + Some(target_type), + )) + } + DataType::Date32 => Date(VariantToPrimitiveArrowRowBuilder::new_with_target_type( + cast_options, + capacity, + Some(Date32), + )), _ if data_type.is_primitive() => { return Err(ArrowError::NotYetImplemented(format!( "Primitive data_type {data_type:?} not yet implemented" @@ -305,6 +337,8 @@ fn get_type_name() -> &'static str { "arrow_array::types::Float32Type" => "Float32", "arrow_array::types::Float64Type" => "Float64", "arrow_array::types::Float16Type" => "Float16", + "arrow_array::types::TimestampMicrosecondType" => "Timestamp(Microsecond)", + "arrow_array::types::TimestampNanosecondType" => "Timestamp(Nanosecond)", _ => "Unknown", } } @@ -356,13 +390,24 @@ impl<'a> VariantToBooleanArrowRowBuilder<'a> { pub(crate) struct VariantToPrimitiveArrowRowBuilder<'a, T: PrimitiveFromVariant> { builder: arrow::array::PrimitiveBuilder, cast_options: &'a CastOptions<'a>, + // this used to change the data type of the resulting array, e.g. to add timezone info + target_data_type: Option, } impl<'a, T: PrimitiveFromVariant> VariantToPrimitiveArrowRowBuilder<'a, T> { fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self { + Self::new_with_target_type(cast_options, capacity, None) + } + + fn new_with_target_type( + cast_options: &'a CastOptions<'a>, + capacity: usize, + target_data_type: Option, + ) -> Self { Self { builder: PrimitiveBuilder::::with_capacity(capacity), cast_options, + target_data_type, } } } @@ -393,7 +438,16 @@ impl<'a, T: PrimitiveFromVariant> VariantToPrimitiveArrowRowBuilder<'a, T> { } fn finish(mut self) -> Result { - Ok(Arc::new(self.builder.finish())) + let array: PrimitiveArray = self.builder.finish(); + + if let Some(target_type) = self.target_data_type { + let data = array.into_data(); + let new_data = data.into_builder().data_type(target_type).build()?; + let array_with_new_type = PrimitiveArray::::from(new_data); + return Ok(Arc::new(array_with_new_type)); + } + + Ok(Arc::new(array)) } } From fdfb93c3898ca9c5365cc21be9ba5f5b3afc4d58 Mon Sep 17 00:00:00 2001 From: klion26 Date: Wed, 1 Oct 2025 20:26:09 +0800 Subject: [PATCH 03/11] address comment 1 extract builder constructing logic to macro_rules 2 add micro -> nano test --- .../src/type_conversion.rs | 68 +++--- parquet-variant-compute/src/variant_get.rs | 25 +++ .../src/variant_to_arrow.rs | 203 +++++++----------- 3 files changed, 146 insertions(+), 150 deletions(-) diff --git a/parquet-variant-compute/src/type_conversion.rs b/parquet-variant-compute/src/type_conversion.rs index a985542e5360..f56d69164369 100644 --- a/parquet-variant-compute/src/type_conversion.rs +++ b/parquet-variant-compute/src/type_conversion.rs @@ -17,8 +17,10 @@ //! Module for transforming a typed arrow `Array` to `VariantArray`. -use arrow::datatypes::{self, ArrowPrimitiveType}; -use chrono::Datelike; +use arrow::datatypes::{ + self, ArrowPrimitiveType, ArrowTimestampType, Date32Type, TimestampMicrosecondType, + TimestampNanosecondType, +}; use parquet_variant::Variant; /// Options for controlling the behavior of `cast_to_variant_with_options`. @@ -41,10 +43,12 @@ pub(crate) trait PrimitiveFromVariant: ArrowPrimitiveType { /// Macro to generate PrimitiveFromVariant implementations for Arrow primitive types macro_rules! impl_primitive_from_variant { - ($arrow_type:ty, $variant_method:ident) => { + ($arrow_type:ty, $variant_method:ident $(, $cast_fn:expr)?) => { impl PrimitiveFromVariant for $arrow_type { fn from_variant(variant: &Variant<'_, '_>) -> Option { - variant.$variant_method() + let value = variant.$variant_method(); + $( let value = value.map($cast_fn); )? + value } } }; @@ -61,35 +65,45 @@ impl_primitive_from_variant!(datatypes::UInt64Type, as_u64); impl_primitive_from_variant!(datatypes::Float16Type, as_f16); impl_primitive_from_variant!(datatypes::Float32Type, as_f32); impl_primitive_from_variant!(datatypes::Float64Type, as_f64); +impl_primitive_from_variant!( + datatypes::Date32Type, + as_naive_date, + Date32Type::from_naive_date +); -impl VariantAsPrimitive for Variant<'_, '_> { - fn as_primitive(&self) -> Option { - match self { - Variant::TimestampMicros(dt) => Some(dt.timestamp_micros()), - Variant::TimestampNtzMicros(ndt) => Some(ndt.and_utc().timestamp_micros()), - _ => None, - } - } +pub(crate) trait TimestampFromVariant: ArrowTimestampType { + fn from_variant(variant: &Variant<'_, '_>) -> Option; } -impl VariantAsPrimitive for Variant<'_, '_> { - fn as_primitive(&self) -> Option { - match self { - Variant::TimestampNanos(dt) => dt.timestamp_nanos_opt(), - Variant::TimestampNtzNanos(ndt) => ndt.and_utc().timestamp_nanos_opt(), - _ => None, +macro_rules! impl_timestamp_from_variant { + ($timestamp_type:ty, { + $(($variant_pattern:pat, $conversion:expr)),+ $(,)? + }) => { + impl TimestampFromVariant for $timestamp_type { + fn from_variant(variant: &Variant<'_, '_>) -> Option { + match variant { + $( + $variant_pattern => $conversion, + )+ + _ => None, + } + } } - } + }; } -impl VariantAsPrimitive for Variant<'_, '_> { - fn as_primitive(&self) -> Option { - // The number of days from 0001-01-01 to 1970-01-01. - const DAYS_FROM_CE_TO_UNIX_EPOCH: i32 = 719163; - self.as_naive_date() - .map(|d| d.num_days_from_ce() - DAYS_FROM_CE_TO_UNIX_EPOCH) - } -} +impl_timestamp_from_variant!(TimestampMicrosecondType, { + (Variant::TimestampMicros(t), Some(t.timestamp_micros())), + (Variant::TimestampNtzMicros(t), Some(t.and_utc().timestamp_micros())), +}); + +impl_timestamp_from_variant!(TimestampNanosecondType, { + (Variant::TimestampMicros(t), Some(t.timestamp_micros()).map(|t| t * 1000)), + (Variant::TimestampNtzMicros(t), Some(t.and_utc().timestamp_micros()).map(|t| t * 1000)), + (Variant::TimestampNanos(t), t.timestamp_nanos_opt()), + (Variant::TimestampNtzNanos(t), t.and_utc().timestamp_nanos_opt()), +}); + /// Convert the value at a specific index in the given array into a `Variant`. macro_rules! non_generic_conversion_single_value { ($array:expr, $cast_fn:expr, $index:expr) => {{ diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index fc34493c9e7e..b8f98619bc70 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -866,6 +866,18 @@ mod test { ]) ); + // test converting micro to nano + perfectly_shredded_to_arrow_primitive_test!( + get_variant_perfectly_shredded_timestamp_micro_ntz_as_nano_ntz, + DataType::Timestamp(TimeUnit::Nanosecond, None), + perfectly_shredded_timestamp_micro_ntz_variant_array, + arrow::array::TimestampNanosecondArray::from(vec![ + Some(-456000000), + Some(1758602096000001000), + Some(1758602096000002000) + ]) + ); + perfectly_shredded_variant_array_fn!(perfectly_shredded_timestamp_micro_variant_array, || { arrow::array::TimestampMicrosecondArray::from(vec![ Some(-456000), @@ -887,6 +899,19 @@ mod test { .with_timezone("+00:00") ); + // test converting micro to nano + perfectly_shredded_to_arrow_primitive_test!( + get_variant_perfectly_shredded_timestamp_micro_as_nano, + DataType::Timestamp(TimeUnit::Nanosecond, Some(Arc::from("+00:00"))), + perfectly_shredded_timestamp_micro_variant_array, + arrow::array::TimestampNanosecondArray::from(vec![ + Some(-456000000), + Some(1758602096000001000), + Some(1758602096000002000) + ]) + .with_timezone("+00:00") + ); + perfectly_shredded_variant_array_fn!( perfectly_shredded_timestamp_nano_ntz_variant_array, || { diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index 6e3f9220b589..ab72acad9ca6 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -16,17 +16,16 @@ // under the License. use arrow::array::{ - Array, ArrayRef, BinaryViewArray, NullBufferBuilder, PrimitiveArray, PrimitiveBuilder, + builder::BooleanBuilder, ArrayRef, BinaryViewArray, NullBufferBuilder, PrimitiveBuilder, }; use arrow::compute::CastOptions; use arrow::datatypes::{self, ArrowPrimitiveType, DataType}; use arrow::error::{ArrowError, Result}; use parquet_variant::{Variant, VariantPath}; -use crate::type_conversion::PrimitiveFromVariant; +use crate::type_conversion::{PrimitiveFromVariant, TimestampFromVariant}; use crate::{VariantArray, VariantValueArrayBuilder}; -use arrow_schema::DataType::Date32; use arrow_schema::TimeUnit; use std::sync::Arc; @@ -46,8 +45,8 @@ pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> { Float16(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float16Type>), Float32(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float32Type>), Float64(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float64Type>), - TimestampMicro(VariantToPrimitiveArrowRowBuilder<'a, datatypes::TimestampMicrosecondType>), - TimestampNano(VariantToPrimitiveArrowRowBuilder<'a, datatypes::TimestampNanosecondType>), + TimestampMicro(VariantToTimestampArrowRowBuilder<'a, datatypes::TimestampMicrosecondType>), + TimestampNano(VariantToTimestampArrowRowBuilder<'a, datatypes::TimestampNanosecondType>), Date(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Date32Type>), } @@ -88,6 +87,7 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { pub fn append_value(&mut self, value: &Variant<'_, '_>) -> Result { use PrimitiveVariantToArrowRowBuilder::*; match self { + Boolean(b) => b.append_value(value), Int8(b) => b.append_value(value), Int16(b) => b.append_value(value), Int32(b) => b.append_value(value), @@ -99,7 +99,6 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { Float16(b) => b.append_value(value), Float32(b) => b.append_value(value), Float64(b) => b.append_value(value), - Boolean(b) => b.append_value(value), TimestampMicro(b) => b.append_value(value), TimestampNano(b) => b.append_value(value), Date(b) => b.append_value(value), @@ -211,28 +210,16 @@ pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>( cast_options, capacity, )), - DataType::Timestamp(TimeUnit::Microsecond, tz) => { - let target_type = DataType::Timestamp(TimeUnit::Microsecond, tz.clone()); - - TimestampMicro(VariantToPrimitiveArrowRowBuilder::new_with_target_type( - cast_options, - capacity, - Some(target_type), - )) - } - DataType::Timestamp(TimeUnit::Nanosecond, tz) => { - let target_type = DataType::Timestamp(TimeUnit::Nanosecond, tz.clone()); - - TimestampNano(VariantToPrimitiveArrowRowBuilder::new_with_target_type( - cast_options, - capacity, - Some(target_type), - )) - } - DataType::Date32 => Date(VariantToPrimitiveArrowRowBuilder::new_with_target_type( + DataType::Timestamp(TimeUnit::Microsecond, tz) => TimestampMicro( + VariantToTimestampArrowRowBuilder::new(cast_options, capacity, tz.clone()), + ), + + DataType::Timestamp(TimeUnit::Nanosecond, tz) => TimestampNano( + VariantToTimestampArrowRowBuilder::new(cast_options, capacity, tz.clone()), + ), + DataType::Date32 => Date(VariantToPrimitiveArrowRowBuilder::new( cast_options, capacity, - Some(Date32), )), _ if data_type.is_primitive() => { return Err(ArrowError::NotYetImplemented(format!( @@ -339,118 +326,88 @@ fn get_type_name() -> &'static str { "arrow_array::types::Float16Type" => "Float16", "arrow_array::types::TimestampMicrosecondType" => "Timestamp(Microsecond)", "arrow_array::types::TimestampNanosecondType" => "Timestamp(Nanosecond)", + "arrow_array::types::Date32Type" => "Date32", _ => "Unknown", } } -/// Builder for converting variant values to boolean values -/// Boolean is not primitive types in Arrow, so we need a separate builder -pub(crate) struct VariantToBooleanArrowRowBuilder<'a> { - builder: arrow::array::BooleanBuilder, - cast_options: &'a CastOptions<'a>, -} - -impl<'a> VariantToBooleanArrowRowBuilder<'a> { - fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self { - Self { - builder: arrow::array::BooleanBuilder::with_capacity(capacity), - cast_options, +macro_rules! define_variant_to_primitive_builder { + (struct $name:ident<$lifetime:lifetime $(, $generic:ident: $bound:path )?> + |$array_param:ident $(, $field:ident: $field_type:ty)?| -> $builder_name:ident $(< $array_type:ty >)? { $init_expr: expr }, + |$value: ident| $value_transform:expr, + type_name: $type_name:expr) => { + pub(crate) struct $name<$lifetime $(, $generic : $bound )?> + { + builder: $builder_name $(<$array_type>)?, + cast_options: &$lifetime CastOptions<$lifetime>, } - } - fn append_null(&mut self) -> Result<()> { - self.builder.append_null(); - Ok(()) - } - - fn append_value(&mut self, value: &Variant<'_, '_>) -> Result { - if let Some(v) = value.as_boolean() { - self.builder.append_value(v); - Ok(true) - } else { - if !self.cast_options.safe { - // Unsafe casting: return error on conversion failure - return Err(ArrowError::CastError(format!( - "Failed to extract boolean from variant {:?} at path VariantPath([])", - value - ))); + impl<$lifetime $(, $generic: $bound+ )?> $name<$lifetime $(, $generic )?> { + fn new( + cast_options: &$lifetime CastOptions<$lifetime>, + $array_param: usize + // add this so that $init_expr can use it + $(, $field: $field_type)?) -> Self { + Self { + builder: $init_expr, + cast_options, + } } - // Safe casting: append null on conversion failure - self.builder.append_null(); - Ok(false) - } - } - - fn finish(mut self) -> Result { - Ok(Arc::new(self.builder.finish())) - } -} - -/// Builder for converting variant values to primitive values -pub(crate) struct VariantToPrimitiveArrowRowBuilder<'a, T: PrimitiveFromVariant> { - builder: arrow::array::PrimitiveBuilder, - cast_options: &'a CastOptions<'a>, - // this used to change the data type of the resulting array, e.g. to add timezone info - target_data_type: Option, -} - -impl<'a, T: PrimitiveFromVariant> VariantToPrimitiveArrowRowBuilder<'a, T> { - fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self { - Self::new_with_target_type(cast_options, capacity, None) - } - - fn new_with_target_type( - cast_options: &'a CastOptions<'a>, - capacity: usize, - target_data_type: Option, - ) -> Self { - Self { - builder: PrimitiveBuilder::::with_capacity(capacity), - cast_options, - target_data_type, - } - } -} -impl<'a, T: PrimitiveFromVariant> VariantToPrimitiveArrowRowBuilder<'a, T> { - fn append_null(&mut self) -> Result<()> { - self.builder.append_null(); - Ok(()) - } - - fn append_value(&mut self, value: &Variant<'_, '_>) -> Result { - if let Some(v) = T::from_variant(value) { - self.builder.append_value(v); - Ok(true) - } else { - if !self.cast_options.safe { - // Unsafe casting: return error on conversion failure - return Err(ArrowError::CastError(format!( - "Failed to extract primitive of type {} from variant {:?} at path VariantPath([])", - get_type_name::(), - value - ))); + fn append_null(&mut self) -> Result<()> { + self.builder.append_null(); + Ok(()) } - // Safe casting: append null on conversion failure - self.builder.append_null(); - Ok(false) - } - } - fn finish(mut self) -> Result { - let array: PrimitiveArray = self.builder.finish(); + fn append_value(&mut self, $value: &Variant<'_, '_>) -> Result { + if let Some(v) = $value_transform { + self.builder.append_value(v); + Ok(true) + } else { + if !self.cast_options.safe { + // Unsafe casting: return error on conversion failure + return Err(ArrowError::CastError(format!( + "Failed to extract primitive of type {} from variant {:?} at path VariantPath([])", + $type_name, + $value + ))); + } + // Safe casting: append null on conversion failure + self.builder.append_null(); + Ok(false) + } + } - if let Some(target_type) = self.target_data_type { - let data = array.into_data(); - let new_data = data.into_builder().data_type(target_type).build()?; - let array_with_new_type = PrimitiveArray::::from(new_data); - return Ok(Arc::new(array_with_new_type)); + fn finish(mut self) -> Result { + Ok(Arc::new(self.builder.finish())) + } } - - Ok(Arc::new(array)) } } +define_variant_to_primitive_builder!( + struct VariantToBooleanArrowRowBuilder<'a> + |capacity| -> BooleanBuilder { BooleanBuilder::with_capacity(capacity) }, + |value| value.as_boolean(), + type_name: "Boolean" +); + +define_variant_to_primitive_builder!( + struct VariantToPrimitiveArrowRowBuilder<'a, T:PrimitiveFromVariant> + |capacity| -> PrimitiveBuilder { PrimitiveBuilder::::with_capacity(capacity) }, + |value| T::from_variant(value), + type_name: get_type_name::() +); + +define_variant_to_primitive_builder!( + struct VariantToTimestampArrowRowBuilder<'a, T:TimestampFromVariant> + |capacity, tz: Option> | -> PrimitiveBuilder { + PrimitiveBuilder::::with_capacity(capacity).with_timezone_opt(tz) + }, + |value| T::from_variant(value), + type_name: get_type_name::() +); + /// Builder for creating VariantArray output (for path extraction without type conversion) pub(crate) struct VariantToBinaryVariantArrowRowBuilder { metadata: BinaryViewArray, From 48047fe77db6a42a57c3546661b217225d55ef60 Mon Sep 17 00:00:00 2001 From: klion26 Date: Fri, 3 Oct 2025 12:51:16 +0800 Subject: [PATCH 04/11] address comment --- .../src/type_conversion.rs | 30 ++++++++++--------- .../src/variant_to_arrow.rs | 5 ++-- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/parquet-variant-compute/src/type_conversion.rs b/parquet-variant-compute/src/type_conversion.rs index f56d69164369..9b3142515c8e 100644 --- a/parquet-variant-compute/src/type_conversion.rs +++ b/parquet-variant-compute/src/type_conversion.rs @@ -76,9 +76,9 @@ pub(crate) trait TimestampFromVariant: ArrowTimestampType { } macro_rules! impl_timestamp_from_variant { - ($timestamp_type:ty, { - $(($variant_pattern:pat, $conversion:expr)),+ $(,)? - }) => { + ($timestamp_type:ty, + $( $variant_pattern:pat => $conversion:expr ),+ $(,)? + ) => { impl TimestampFromVariant for $timestamp_type { fn from_variant(variant: &Variant<'_, '_>) -> Option { match variant { @@ -92,17 +92,19 @@ macro_rules! impl_timestamp_from_variant { }; } -impl_timestamp_from_variant!(TimestampMicrosecondType, { - (Variant::TimestampMicros(t), Some(t.timestamp_micros())), - (Variant::TimestampNtzMicros(t), Some(t.and_utc().timestamp_micros())), -}); - -impl_timestamp_from_variant!(TimestampNanosecondType, { - (Variant::TimestampMicros(t), Some(t.timestamp_micros()).map(|t| t * 1000)), - (Variant::TimestampNtzMicros(t), Some(t.and_utc().timestamp_micros()).map(|t| t * 1000)), - (Variant::TimestampNanos(t), t.timestamp_nanos_opt()), - (Variant::TimestampNtzNanos(t), t.and_utc().timestamp_nanos_opt()), -}); +impl_timestamp_from_variant!( + TimestampMicrosecondType, + Variant::TimestampMicros(t) => Some(t.timestamp_micros()), + Variant::TimestampNtzMicros(t) => Some(t.and_utc().timestamp_micros()), +); + +impl_timestamp_from_variant!( + TimestampNanosecondType, + Variant::TimestampMicros(t) => Some(t.timestamp_micros()).map(|t| t * 1000), + Variant::TimestampNtzMicros(t) => Some(t.and_utc().timestamp_micros()).map(|t| t * 1000), + Variant::TimestampNanos(t) => t.timestamp_nanos_opt(), + Variant::TimestampNtzNanos(t) => t.and_utc().timestamp_nanos_opt(), +); /// Convert the value at a specific index in the given array into a `Variant`. macro_rules! non_generic_conversion_single_value { diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index ab72acad9ca6..3c22e70c160b 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -345,9 +345,10 @@ macro_rules! define_variant_to_primitive_builder { impl<$lifetime $(, $generic: $bound+ )?> $name<$lifetime $(, $generic )?> { fn new( cast_options: &$lifetime CastOptions<$lifetime>, - $array_param: usize + $array_param: usize, // add this so that $init_expr can use it - $(, $field: $field_type)?) -> Self { + $( $field: $field_type, )? + ) -> Self { Self { builder: $init_expr, cast_options, From b0747282edde6ee4d0c79bbc7583aa4e416cea81 Mon Sep 17 00:00:00 2001 From: klion26 Date: Fri, 3 Oct 2025 14:01:39 +0800 Subject: [PATCH 05/11] add Variant::{as_timestamp_micros/as_timestamp_nanos} and simplify the impl of for timestamp types --- .../src/type_conversion.rs | 66 ++++++++----------- parquet-variant/src/variant.rs | 66 +++++++++++++++++++ 2 files changed, 93 insertions(+), 39 deletions(-) diff --git a/parquet-variant-compute/src/type_conversion.rs b/parquet-variant-compute/src/type_conversion.rs index 9b3142515c8e..04988d20cfcd 100644 --- a/parquet-variant-compute/src/type_conversion.rs +++ b/parquet-variant-compute/src/type_conversion.rs @@ -17,10 +17,7 @@ //! Module for transforming a typed arrow `Array` to `VariantArray`. -use arrow::datatypes::{ - self, ArrowPrimitiveType, ArrowTimestampType, Date32Type, TimestampMicrosecondType, - TimestampNanosecondType, -}; +use arrow::datatypes::{self, ArrowPrimitiveType, ArrowTimestampType, Date32Type}; use parquet_variant::Variant; /// Options for controlling the behavior of `cast_to_variant_with_options`. @@ -41,6 +38,13 @@ pub(crate) trait PrimitiveFromVariant: ArrowPrimitiveType { fn from_variant(variant: &Variant<'_, '_>) -> Option; } +/// Extension trait for Arrow timestamp types that can extract their native value from a Variant +/// We can't use [`PrimitiveFromVariant`] directly because we might need to use methods that +/// are only available on [`ArrowTimestampType`] (such as with_timezone_opt) +pub(crate) trait TimestampFromVariant: ArrowTimestampType { + fn from_variant(variant: &Variant<'_, '_>) -> Option; +} + /// Macro to generate PrimitiveFromVariant implementations for Arrow primitive types macro_rules! impl_primitive_from_variant { ($arrow_type:ty, $variant_method:ident $(, $cast_fn:expr)?) => { @@ -52,6 +56,18 @@ macro_rules! impl_primitive_from_variant { } } }; + ($arrow_type:ty $(, $variant_method:ident => $cast_fn:expr )+ ) => { + impl TimestampFromVariant for $arrow_type { + fn from_variant(variant: &Variant<'_, '_>) -> Option { + $( + if let Some(value) = variant.$variant_method() { + return Some($cast_fn(value)); + } + )+ + None + } + } + }; } impl_primitive_from_variant!(datatypes::Int32Type, as_int32); @@ -70,41 +86,13 @@ impl_primitive_from_variant!( as_naive_date, Date32Type::from_naive_date ); - -pub(crate) trait TimestampFromVariant: ArrowTimestampType { - fn from_variant(variant: &Variant<'_, '_>) -> Option; -} - -macro_rules! impl_timestamp_from_variant { - ($timestamp_type:ty, - $( $variant_pattern:pat => $conversion:expr ),+ $(,)? - ) => { - impl TimestampFromVariant for $timestamp_type { - fn from_variant(variant: &Variant<'_, '_>) -> Option { - match variant { - $( - $variant_pattern => $conversion, - )+ - _ => None, - } - } - } - }; -} - -impl_timestamp_from_variant!( - TimestampMicrosecondType, - Variant::TimestampMicros(t) => Some(t.timestamp_micros()), - Variant::TimestampNtzMicros(t) => Some(t.and_utc().timestamp_micros()), -); - -impl_timestamp_from_variant!( - TimestampNanosecondType, - Variant::TimestampMicros(t) => Some(t.timestamp_micros()).map(|t| t * 1000), - Variant::TimestampNtzMicros(t) => Some(t.and_utc().timestamp_micros()).map(|t| t * 1000), - Variant::TimestampNanos(t) => t.timestamp_nanos_opt(), - Variant::TimestampNtzNanos(t) => t.and_utc().timestamp_nanos_opt(), -); +impl_primitive_from_variant!( + datatypes::TimestampMicrosecondType, + as_timestamp_micros => |t| t); +impl_primitive_from_variant!( + datatypes::TimestampNanosecondType, + as_timestamp_micros => |t| 1000 * t, + as_timestamp_nanos => |t| t); /// Convert the value at a specific index in the given array into a `Variant`. macro_rules! non_generic_conversion_single_value { diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 849947675b13..0ca7871dff34 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -561,6 +561,72 @@ impl<'m, 'v> Variant<'m, 'v> { } } + /// Converts this variant to a `i64` representing microseconds since the Unix epoch if possible. + /// This is useful when convert the variant to arrow types. + /// + /// Returns Some(i64) for [`Variant::TimestampMicros`] and [`Variant::TimestampNtzMicros`], + /// None for the other variant types. + /// + /// ``` + /// use parquet_variant::Variant; + /// use chrono::NaiveDate; + /// + /// // you can extract an i64 from Variant::TimestampMicros + /// let datetime = NaiveDate::from_ymd_opt(2025, 10, 03).unwrap().and_hms_milli_opt(12, 34, 56, 789).unwrap().and_utc(); + /// let v1 = Variant::from(datetime); + /// assert_eq!(v1.as_timestamp_micros(), Some(1759494896789000)); + /// + /// // or Variant::TimestampNtzMicros + /// let datetime_ntz = NaiveDate::from_ymd_opt(2025, 10, 03).unwrap().and_hms_milli_opt(12, 34, 56, 789).unwrap(); + /// let v2 = Variant::from(datetime_ntz); + /// assert_eq!(v1.as_timestamp_micros(), Some(1759494896789000)); + /// + /// // but not from other variants + /// let datetime_nanos = NaiveDate::from_ymd_opt(2025, 10, 03).unwrap().and_hms_nano_opt(12, 34, 56, 789123456).unwrap().and_utc(); + /// let v3 = Variant::from(datetime_nanos); + /// assert_eq!(v3.as_timestamp_micros(), None); + /// ``` + pub fn as_timestamp_micros(&self) -> Option { + match *self { + Variant::TimestampMicros(d) => Some(d.timestamp_micros()), + Variant::TimestampNtzMicros(d) => Some(d.and_utc().timestamp_micros()), + _ => None, + } + } + + /// Converts this variant to a `i64` representing nanoseconds since the Unix epoch if possible. + /// This is useful when convert the variant to arrow types. + /// + /// Returns Some(i64) for [`Variant::TimestampNanos`] and [`Variant::TimestampNtzNanos`], + /// None for the other variant types. + /// + /// ``` + /// use parquet_variant::Variant; + /// use chrono::NaiveDate; + /// + /// // you can extract an i64 from Variant::TimestampNanos + /// let datetime = NaiveDate::from_ymd_opt(2025, 10, 03).unwrap().and_hms_nano_opt(12, 34, 56, 789123456).unwrap().and_utc(); + /// let v1 = Variant::from(datetime); + /// assert_eq!(v1.as_timestamp_nanos(), Some(1759494896789123456)); + /// + /// // or Variant::TimestampNtzNanos + /// let datetime_ntz = NaiveDate::from_ymd_opt(2025, 10, 03).unwrap().and_hms_nano_opt(12, 34, 56, 789123456).unwrap(); + /// let v2 = Variant::from(datetime_ntz); + /// assert_eq!(v1.as_timestamp_nanos(), Some(1759494896789123456)); + /// + /// // but not from other variants + /// let datetime_micros = NaiveDate::from_ymd_opt(2025, 10, 03).unwrap().and_hms_micro_opt(12, 34, 56, 789).unwrap().and_utc(); + /// let v3 = Variant::from(datetime_micros); + /// assert_eq!(v3.as_timestamp_nanos(), None); + /// ``` + pub fn as_timestamp_nanos(&self) -> Option { + match *self { + Variant::TimestampNanos(d) => d.timestamp_nanos_opt(), + Variant::TimestampNtzNanos(d) => d.and_utc().timestamp_nanos_opt(), + _ => None, + } + } + /// Converts this variant to a `NaiveDateTime` if possible. /// /// Returns `Some(NaiveDateTime)` for timestamp variants, From b62e15bc9e38a1d7cf22fb3da111d1d7934a1059 Mon Sep 17 00:00:00 2001 From: klion26 Date: Fri, 3 Oct 2025 16:03:15 +0800 Subject: [PATCH 06/11] improve the macro rules for impl_primitive_form_variant for timestamp --- .../src/type_conversion.rs | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/parquet-variant-compute/src/type_conversion.rs b/parquet-variant-compute/src/type_conversion.rs index 04988d20cfcd..943dbcb89ba8 100644 --- a/parquet-variant-compute/src/type_conversion.rs +++ b/parquet-variant-compute/src/type_conversion.rs @@ -56,15 +56,15 @@ macro_rules! impl_primitive_from_variant { } } }; - ($arrow_type:ty $(, $variant_method:ident => $cast_fn:expr )+ ) => { + ($arrow_type:ty, $( $variant_type:pat => $variant_method:ident, $cast_fn:expr ),+ $(,)?) => { impl TimestampFromVariant for $arrow_type { fn from_variant(variant: &Variant<'_, '_>) -> Option { - $( - if let Some(value) = variant.$variant_method() { - return Some($cast_fn(value)); - } - )+ - None + match variant { + $( + $variant_type => variant.$variant_method().map($cast_fn), + )+ + _ => None + } } } }; @@ -88,11 +88,11 @@ impl_primitive_from_variant!( ); impl_primitive_from_variant!( datatypes::TimestampMicrosecondType, - as_timestamp_micros => |t| t); + Variant::TimestampNtzMicros(_) | Variant::TimestampMicros(_) => as_timestamp_micros, |t| t); impl_primitive_from_variant!( datatypes::TimestampNanosecondType, - as_timestamp_micros => |t| 1000 * t, - as_timestamp_nanos => |t| t); + Variant::TimestampNtzMicros(_) | Variant::TimestampMicros(_) => as_timestamp_micros, |t| 1000 * t, + Variant::TimestampNtzNanos(_) | Variant::TimestampNanos(_) => as_timestamp_nanos, |t| t); /// Convert the value at a specific index in the given array into a `Variant`. macro_rules! non_generic_conversion_single_value { From d1a7d065e0093f036471b1ba5e8280d73285b1f1 Mon Sep 17 00:00:00 2001 From: klion26 Date: Sun, 5 Oct 2025 20:08:01 +0800 Subject: [PATCH 07/11] address comments --- .../src/type_conversion.rs | 53 ++++++--- parquet-variant-compute/src/variant_get.rs | 25 ---- .../src/variant_to_arrow.rs | 26 ++++- parquet-variant/src/variant.rs | 110 ++++++++---------- 4 files changed, 111 insertions(+), 103 deletions(-) diff --git a/parquet-variant-compute/src/type_conversion.rs b/parquet-variant-compute/src/type_conversion.rs index 943dbcb89ba8..0ad8edb1e849 100644 --- a/parquet-variant-compute/src/type_conversion.rs +++ b/parquet-variant-compute/src/type_conversion.rs @@ -18,6 +18,7 @@ //! Module for transforming a typed arrow `Array` to `VariantArray`. use arrow::datatypes::{self, ArrowPrimitiveType, ArrowTimestampType, Date32Type}; +use chrono::{DateTime, Utc}; use parquet_variant::Variant; /// Options for controlling the behavior of `cast_to_variant_with_options`. @@ -41,10 +42,19 @@ pub(crate) trait PrimitiveFromVariant: ArrowPrimitiveType { /// Extension trait for Arrow timestamp types that can extract their native value from a Variant /// We can't use [`PrimitiveFromVariant`] directly because we might need to use methods that /// are only available on [`ArrowTimestampType`] (such as with_timezone_opt) -pub(crate) trait TimestampFromVariant: ArrowTimestampType { +pub(crate) trait TimestampFromVariant: ArrowTimestampType { fn from_variant(variant: &Variant<'_, '_>) -> Option; } +/// Extension trait that [`ArrowTimestampType`] handle [`DateTime`] like [`NaiveDateTime`] +trait MakeValueTz: ArrowTimestampType { + fn make_value_tz(timestamp: DateTime) -> Option { + Self::make_value(timestamp.naive_utc()) + } +} + +impl MakeValueTz for T {} + /// Macro to generate PrimitiveFromVariant implementations for Arrow primitive types macro_rules! impl_primitive_from_variant { ($arrow_type:ty, $variant_method:ident $(, $cast_fn:expr)?) => { @@ -56,15 +66,13 @@ macro_rules! impl_primitive_from_variant { } } }; - ($arrow_type:ty, $( $variant_type:pat => $variant_method:ident, $cast_fn:expr ),+ $(,)?) => { - impl TimestampFromVariant for $arrow_type { +} + +macro_rules! impl_timestamp_from_variant { + ($timestamp_type:ty, $variant_method:ident, ntz=$ntz:ident, $cast_fn:expr $(,)?) => { + impl TimestampFromVariant<{ $ntz }> for $timestamp_type { fn from_variant(variant: &Variant<'_, '_>) -> Option { - match variant { - $( - $variant_type => variant.$variant_method().map($cast_fn), - )+ - _ => None - } + variant.$variant_method().and_then($cast_fn) } } }; @@ -86,13 +94,30 @@ impl_primitive_from_variant!( as_naive_date, Date32Type::from_naive_date ); -impl_primitive_from_variant!( +impl_timestamp_from_variant!( datatypes::TimestampMicrosecondType, - Variant::TimestampNtzMicros(_) | Variant::TimestampMicros(_) => as_timestamp_micros, |t| t); -impl_primitive_from_variant!( + as_timestamp_ntz_micros, + ntz = true, + Self::make_value, +); +impl_timestamp_from_variant!( + datatypes::TimestampMicrosecondType, + as_timestamp_micros, + ntz = false, + Self::make_value_tz +); +impl_timestamp_from_variant!( datatypes::TimestampNanosecondType, - Variant::TimestampNtzMicros(_) | Variant::TimestampMicros(_) => as_timestamp_micros, |t| 1000 * t, - Variant::TimestampNtzNanos(_) | Variant::TimestampNanos(_) => as_timestamp_nanos, |t| t); + as_timestamp_ntz_nanos, + ntz = true, + Self::make_value +); +impl_timestamp_from_variant!( + datatypes::TimestampNanosecondType, + as_timestamp_nanos, + ntz = false, + Self::make_value_tz +); /// Convert the value at a specific index in the given array into a `Variant`. macro_rules! non_generic_conversion_single_value { diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index b8f98619bc70..fc34493c9e7e 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -866,18 +866,6 @@ mod test { ]) ); - // test converting micro to nano - perfectly_shredded_to_arrow_primitive_test!( - get_variant_perfectly_shredded_timestamp_micro_ntz_as_nano_ntz, - DataType::Timestamp(TimeUnit::Nanosecond, None), - perfectly_shredded_timestamp_micro_ntz_variant_array, - arrow::array::TimestampNanosecondArray::from(vec![ - Some(-456000000), - Some(1758602096000001000), - Some(1758602096000002000) - ]) - ); - perfectly_shredded_variant_array_fn!(perfectly_shredded_timestamp_micro_variant_array, || { arrow::array::TimestampMicrosecondArray::from(vec![ Some(-456000), @@ -899,19 +887,6 @@ mod test { .with_timezone("+00:00") ); - // test converting micro to nano - perfectly_shredded_to_arrow_primitive_test!( - get_variant_perfectly_shredded_timestamp_micro_as_nano, - DataType::Timestamp(TimeUnit::Nanosecond, Some(Arc::from("+00:00"))), - perfectly_shredded_timestamp_micro_variant_array, - arrow::array::TimestampNanosecondArray::from(vec![ - Some(-456000000), - Some(1758602096000001000), - Some(1758602096000002000) - ]) - .with_timezone("+00:00") - ); - perfectly_shredded_variant_array_fn!( perfectly_shredded_timestamp_nano_ntz_variant_array, || { diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index 3c22e70c160b..4489358903ab 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -46,7 +46,11 @@ pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> { Float32(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float32Type>), Float64(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float64Type>), TimestampMicro(VariantToTimestampArrowRowBuilder<'a, datatypes::TimestampMicrosecondType>), + TimestampMicroNtz( + VariantToTimestampNtzArrowRowBuilder<'a, datatypes::TimestampMicrosecondType>, + ), TimestampNano(VariantToTimestampArrowRowBuilder<'a, datatypes::TimestampNanosecondType>), + TimestampNanoNtz(VariantToTimestampNtzArrowRowBuilder<'a, datatypes::TimestampNanosecondType>), Date(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Date32Type>), } @@ -79,7 +83,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { Float32(b) => b.append_null(), Float64(b) => b.append_null(), TimestampMicro(b) => b.append_null(), + TimestampMicroNtz(b) => b.append_null(), TimestampNano(b) => b.append_null(), + TimestampNanoNtz(b) => b.append_null(), Date(b) => b.append_null(), } } @@ -100,7 +106,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { Float32(b) => b.append_value(value), Float64(b) => b.append_value(value), TimestampMicro(b) => b.append_value(value), + TimestampMicroNtz(b) => b.append_value(value), TimestampNano(b) => b.append_value(value), + TimestampNanoNtz(b) => b.append_value(value), Date(b) => b.append_value(value), } } @@ -121,7 +129,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { Float32(b) => b.finish(), Float64(b) => b.finish(), TimestampMicro(b) => b.finish(), + TimestampMicroNtz(b) => b.finish(), TimestampNano(b) => b.finish(), + TimestampNanoNtz(b) => b.finish(), Date(b) => b.finish(), } } @@ -210,10 +220,15 @@ pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>( cast_options, capacity, )), + DataType::Timestamp(TimeUnit::Microsecond, None) => TimestampMicroNtz( + VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity), + ), DataType::Timestamp(TimeUnit::Microsecond, tz) => TimestampMicro( VariantToTimestampArrowRowBuilder::new(cast_options, capacity, tz.clone()), ), - + DataType::Timestamp(TimeUnit::Nanosecond, None) => TimestampNanoNtz( + VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity), + ), DataType::Timestamp(TimeUnit::Nanosecond, tz) => TimestampNano( VariantToTimestampArrowRowBuilder::new(cast_options, capacity, tz.clone()), ), @@ -401,7 +416,14 @@ define_variant_to_primitive_builder!( ); define_variant_to_primitive_builder!( - struct VariantToTimestampArrowRowBuilder<'a, T:TimestampFromVariant> + struct VariantToTimestampNtzArrowRowBuilder<'a, T:TimestampFromVariant> + |capacity| -> PrimitiveBuilder { PrimitiveBuilder::::with_capacity(capacity) }, + |value| T::from_variant(value), + type_name: get_type_name::() +); + +define_variant_to_primitive_builder!( + struct VariantToTimestampArrowRowBuilder<'a, T:TimestampFromVariant> |capacity, tz: Option> | -> PrimitiveBuilder { PrimitiveBuilder::::with_capacity(capacity).with_timezone_opt(tz) }, diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 0ca7871dff34..f206e62f64f4 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -533,8 +533,8 @@ impl<'m, 'v> Variant<'m, 'v> { /// Converts this variant to a `DateTime` if possible. /// - /// Returns `Some(DateTime)` for timestamp variants, - /// `None` for non-timestamp variants. + /// Returns `Some(DateTime)` for [`Variant::TimestampMicros`] variants, + /// `None` for other variants. /// /// # Examples /// @@ -545,92 +545,81 @@ impl<'m, 'v> Variant<'m, 'v> { /// // you can extract a DateTime from a UTC-adjusted variant /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap().and_utc(); /// let v1 = Variant::from(datetime); - /// assert_eq!(v1.as_datetime_utc(), Some(datetime)); + /// assert_eq!(v1.as_timestamp_micros(), Some(datetime)); + /// + /// // but not for other variants. /// let datetime_nanos = NaiveDate::from_ymd_opt(2025, 8, 14).unwrap().and_hms_nano_opt(12, 33, 54, 123456789).unwrap().and_utc(); /// let v2 = Variant::from(datetime_nanos); - /// assert_eq!(v2.as_datetime_utc(), Some(datetime_nanos)); - /// - /// // but not from other variants - /// let v3 = Variant::from("hello!"); - /// assert_eq!(v3.as_datetime_utc(), None); + /// assert_eq!(v2.as_timestamp_micros(), None); /// ``` - pub fn as_datetime_utc(&self) -> Option> { + pub fn as_timestamp_micros(&self) -> Option> { match *self { - Variant::TimestampMicros(d) | Variant::TimestampNanos(d) => Some(d), + Variant::TimestampMicros(d) => Some(d), _ => None, } } - /// Converts this variant to a `i64` representing microseconds since the Unix epoch if possible. - /// This is useful when convert the variant to arrow types. + /// Converts this variant to a `NaiveDateTime` if possible. /// - /// Returns Some(i64) for [`Variant::TimestampMicros`] and [`Variant::TimestampNtzMicros`], - /// None for the other variant types. + /// Returns `Some(NaiveDateTime)` for [`Variant::TimestampNtzMicros`] variants, + /// `None` for other variants. + /// + /// # Examples /// /// ``` /// use parquet_variant::Variant; /// use chrono::NaiveDate; /// - /// // you can extract an i64 from Variant::TimestampMicros - /// let datetime = NaiveDate::from_ymd_opt(2025, 10, 03).unwrap().and_hms_milli_opt(12, 34, 56, 789).unwrap().and_utc(); + /// // you can extract a NaiveDateTime from a non-UTC-adjusted variant + /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap(); /// let v1 = Variant::from(datetime); - /// assert_eq!(v1.as_timestamp_micros(), Some(1759494896789000)); + /// assert_eq!(v1.as_timestamp_ntz_micros(), Some(datetime)); /// - /// // or Variant::TimestampNtzMicros - /// let datetime_ntz = NaiveDate::from_ymd_opt(2025, 10, 03).unwrap().and_hms_milli_opt(12, 34, 56, 789).unwrap(); - /// let v2 = Variant::from(datetime_ntz); - /// assert_eq!(v1.as_timestamp_micros(), Some(1759494896789000)); - /// - /// // but not from other variants - /// let datetime_nanos = NaiveDate::from_ymd_opt(2025, 10, 03).unwrap().and_hms_nano_opt(12, 34, 56, 789123456).unwrap().and_utc(); - /// let v3 = Variant::from(datetime_nanos); - /// assert_eq!(v3.as_timestamp_micros(), None); + /// // but not for other variants. + /// let datetime_nanos = NaiveDate::from_ymd_opt(2025, 8, 14).unwrap().and_hms_nano_opt(12, 33, 54, 123456789).unwrap(); + /// let v2 = Variant::from(datetime_nanos); + /// assert_eq!(v2.as_timestamp_micros(), None); /// ``` - pub fn as_timestamp_micros(&self) -> Option { + pub fn as_timestamp_ntz_micros(&self) -> Option { match *self { - Variant::TimestampMicros(d) => Some(d.timestamp_micros()), - Variant::TimestampNtzMicros(d) => Some(d.and_utc().timestamp_micros()), + Variant::TimestampNtzMicros(d) => Some(d), _ => None, } } - /// Converts this variant to a `i64` representing nanoseconds since the Unix epoch if possible. - /// This is useful when convert the variant to arrow types. + /// Converts this variant to a `DateTime` if possible. + /// + /// Returns `Some(DateTime)` for [`Variant::TimestampNanos`] variants, + /// `None` for other variants. /// - /// Returns Some(i64) for [`Variant::TimestampNanos`] and [`Variant::TimestampNtzNanos`], - /// None for the other variant types. + /// # Examples /// /// ``` /// use parquet_variant::Variant; /// use chrono::NaiveDate; /// - /// // you can extract an i64 from Variant::TimestampNanos - /// let datetime = NaiveDate::from_ymd_opt(2025, 10, 03).unwrap().and_hms_nano_opt(12, 34, 56, 789123456).unwrap().and_utc(); + /// // you can extract a DateTime from a UTC-adjusted variant + /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_nano_opt(12, 34, 56, 789123456).unwrap().and_utc(); /// let v1 = Variant::from(datetime); - /// assert_eq!(v1.as_timestamp_nanos(), Some(1759494896789123456)); - /// - /// // or Variant::TimestampNtzNanos - /// let datetime_ntz = NaiveDate::from_ymd_opt(2025, 10, 03).unwrap().and_hms_nano_opt(12, 34, 56, 789123456).unwrap(); - /// let v2 = Variant::from(datetime_ntz); - /// assert_eq!(v1.as_timestamp_nanos(), Some(1759494896789123456)); + /// assert_eq!(v1.as_timestamp_nanos(), Some(datetime)); /// - /// // but not from other variants - /// let datetime_micros = NaiveDate::from_ymd_opt(2025, 10, 03).unwrap().and_hms_micro_opt(12, 34, 56, 789).unwrap().and_utc(); - /// let v3 = Variant::from(datetime_micros); - /// assert_eq!(v3.as_timestamp_nanos(), None); + /// // but not for other variants. + /// let datetime_micros = NaiveDate::from_ymd_opt(2025, 8, 14).unwrap().and_hms_milli_opt(12, 33, 54, 123).unwrap().and_utc(); + /// // this will convert to `Variant::TimestampMicros`. + /// let v2 = Variant::from(datetime_micros); + /// assert_eq!(v2.as_timestamp_nanos(), None); /// ``` - pub fn as_timestamp_nanos(&self) -> Option { + pub fn as_timestamp_nanos(&self) -> Option> { match *self { - Variant::TimestampNanos(d) => d.timestamp_nanos_opt(), - Variant::TimestampNtzNanos(d) => d.and_utc().timestamp_nanos_opt(), + Variant::TimestampNanos(d) => Some(d), _ => None, } } /// Converts this variant to a `NaiveDateTime` if possible. /// - /// Returns `Some(NaiveDateTime)` for timestamp variants, - /// `None` for non-timestamp variants. + /// Returns `Some(NaiveDateTime)` for [`Variant::TimestampNtzNanos`] variants, + /// `None` for other variants. /// /// # Examples /// @@ -639,22 +628,19 @@ impl<'m, 'v> Variant<'m, 'v> { /// use chrono::NaiveDate; /// /// // you can extract a NaiveDateTime from a non-UTC-adjusted variant - /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap(); + /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_nano_opt(12, 34, 56, 789123456).unwrap(); /// let v1 = Variant::from(datetime); - /// assert_eq!(v1.as_naive_datetime(), Some(datetime)); - /// - /// // or a UTC-adjusted variant - /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_nano_opt(12, 34, 56, 123456789).unwrap(); - /// let v2 = Variant::from(datetime); - /// assert_eq!(v2.as_naive_datetime(), Some(datetime)); + /// assert_eq!(v1.as_timestamp_ntz_nanos(), Some(datetime)); /// - /// // but not from other variants - /// let v3 = Variant::from("hello!"); - /// assert_eq!(v3.as_naive_datetime(), None); + /// // but not for other variants. + /// let datetime_micros = NaiveDate::from_ymd_opt(2025, 8, 14).unwrap().and_hms_milli_opt(12, 33, 54, 123).unwrap(); + /// // this will convert to `Variant::TimestampMicros`. + /// let v2 = Variant::from(datetime_micros); + /// assert_eq!(v2.as_timestamp_ntz_nanos(), None); /// ``` - pub fn as_naive_datetime(&self) -> Option { + pub fn as_timestamp_ntz_nanos(&self) -> Option { match *self { - Variant::TimestampNtzMicros(d) | Variant::TimestampNtzNanos(d) => Some(d), + Variant::TimestampNtzNanos(d) => Some(d), _ => None, } } From 6762eff269a85b59374b2fa14522d0e923aea77f Mon Sep 17 00:00:00 2001 From: klion26 Date: Sun, 5 Oct 2025 20:48:26 +0800 Subject: [PATCH 08/11] fix style --- parquet-variant-compute/src/variant_get.rs | 2 +- parquet-variant-compute/src/variant_to_arrow.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index fc34493c9e7e..955e6e1cd2ff 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -301,7 +301,7 @@ mod test { use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder}; use arrow::array::{ Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, Float32Array, - Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, StringArray, StructArray, + Float64Array, Int8Array, Int16Array, Int32Array, Int64Array, StringArray, StructArray, }; use arrow::buffer::NullBuffer; use arrow::compute::CastOptions; diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index 4489358903ab..d60a4eea05c0 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -16,7 +16,7 @@ // under the License. use arrow::array::{ - builder::BooleanBuilder, ArrayRef, BinaryViewArray, NullBufferBuilder, PrimitiveBuilder, + ArrayRef, BinaryViewArray, NullBufferBuilder, PrimitiveBuilder, builder::BooleanBuilder, }; use arrow::compute::CastOptions; use arrow::datatypes::{self, ArrowPrimitiveType, DataType}; From 2505dfa65e354fc82819a70894f7bba84edcfae8 Mon Sep 17 00:00:00 2001 From: klion26 Date: Sun, 5 Oct 2025 20:55:12 +0800 Subject: [PATCH 09/11] make doc clean --- parquet-variant-compute/src/type_conversion.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-variant-compute/src/type_conversion.rs b/parquet-variant-compute/src/type_conversion.rs index 0ad8edb1e849..48482bbec04b 100644 --- a/parquet-variant-compute/src/type_conversion.rs +++ b/parquet-variant-compute/src/type_conversion.rs @@ -46,7 +46,7 @@ pub(crate) trait TimestampFromVariant: ArrowTimestampType { fn from_variant(variant: &Variant<'_, '_>) -> Option; } -/// Extension trait that [`ArrowTimestampType`] handle [`DateTime`] like [`NaiveDateTime`] +/// Extension trait that `ArrowTimestampType` handle `DateTime` like `NaiveDateTime` trait MakeValueTz: ArrowTimestampType { fn make_value_tz(timestamp: DateTime) -> Option { Self::make_value(timestamp.naive_utc()) From f32574645c5399e7fc0476ed18229197b949b400 Mon Sep 17 00:00:00 2001 From: klion26 Date: Tue, 7 Oct 2025 19:59:05 +0800 Subject: [PATCH 10/11] address comments --- .../src/type_conversion.rs | 4 +- parquet-variant/src/variant.rs | 70 ++++++++++++++----- 2 files changed, 55 insertions(+), 19 deletions(-) diff --git a/parquet-variant-compute/src/type_conversion.rs b/parquet-variant-compute/src/type_conversion.rs index 48482bbec04b..43efaff4e011 100644 --- a/parquet-variant-compute/src/type_conversion.rs +++ b/parquet-variant-compute/src/type_conversion.rs @@ -40,8 +40,8 @@ pub(crate) trait PrimitiveFromVariant: ArrowPrimitiveType { } /// Extension trait for Arrow timestamp types that can extract their native value from a Variant -/// We can't use [`PrimitiveFromVariant`] directly because we might need to use methods that -/// are only available on [`ArrowTimestampType`] (such as with_timezone_opt) +/// We can't use [`PrimitiveFromVariant`] directly because we need _two_ implementations for each +/// timestamp type -- the `NTZ` param here. pub(crate) trait TimestampFromVariant: ArrowTimestampType { fn from_variant(variant: &Variant<'_, '_>) -> Option; } diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index f206e62f64f4..f754e3cc2ab0 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -543,12 +543,20 @@ impl<'m, 'v> Variant<'m, 'v> { /// use chrono::NaiveDate; /// /// // you can extract a DateTime from a UTC-adjusted variant - /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap().and_utc(); + /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16) + /// .unwrap() + /// .and_hms_milli_opt(12, 34, 56, 780) + /// .unwrap() + /// .and_utc(); /// let v1 = Variant::from(datetime); /// assert_eq!(v1.as_timestamp_micros(), Some(datetime)); /// /// // but not for other variants. - /// let datetime_nanos = NaiveDate::from_ymd_opt(2025, 8, 14).unwrap().and_hms_nano_opt(12, 33, 54, 123456789).unwrap().and_utc(); + /// let datetime_nanos = NaiveDate::from_ymd_opt(2025, 8, 14) + /// .unwrap() + /// .and_hms_nano_opt(12, 33, 54, 123456789) + /// .unwrap() + /// .and_utc(); /// let v2 = Variant::from(datetime_nanos); /// assert_eq!(v2.as_timestamp_micros(), None); /// ``` @@ -571,12 +579,18 @@ impl<'m, 'v> Variant<'m, 'v> { /// use chrono::NaiveDate; /// /// // you can extract a NaiveDateTime from a non-UTC-adjusted variant - /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_milli_opt(12, 34, 56, 780).unwrap(); + /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16) + /// .unwrap() + /// .and_hms_milli_opt(12, 34, 56, 780) + /// .unwrap(); /// let v1 = Variant::from(datetime); /// assert_eq!(v1.as_timestamp_ntz_micros(), Some(datetime)); /// /// // but not for other variants. - /// let datetime_nanos = NaiveDate::from_ymd_opt(2025, 8, 14).unwrap().and_hms_nano_opt(12, 33, 54, 123456789).unwrap(); + /// let datetime_nanos = NaiveDate::from_ymd_opt(2025, 8, 14) + /// .unwrap() + /// .and_hms_nano_opt(12, 33, 54, 123456789) + /// .unwrap(); /// let v2 = Variant::from(datetime_nanos); /// assert_eq!(v2.as_timestamp_micros(), None); /// ``` @@ -589,7 +603,7 @@ impl<'m, 'v> Variant<'m, 'v> { /// Converts this variant to a `DateTime` if possible. /// - /// Returns `Some(DateTime)` for [`Variant::TimestampNanos`] variants, + /// Returns `Some(DateTime)` for timestamp variants, /// `None` for other variants. /// /// # Examples @@ -598,27 +612,39 @@ impl<'m, 'v> Variant<'m, 'v> { /// use parquet_variant::Variant; /// use chrono::NaiveDate; /// - /// // you can extract a DateTime from a UTC-adjusted variant - /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_nano_opt(12, 34, 56, 789123456).unwrap().and_utc(); + /// // you can extract a DateTime from a UTC-adjusted nanosecond-precision variant + /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16) + /// .unwrap() + /// .and_hms_nano_opt(12, 34, 56, 789123456) + /// .unwrap() + /// .and_utc(); /// let v1 = Variant::from(datetime); /// assert_eq!(v1.as_timestamp_nanos(), Some(datetime)); /// - /// // but not for other variants. - /// let datetime_micros = NaiveDate::from_ymd_opt(2025, 8, 14).unwrap().and_hms_milli_opt(12, 33, 54, 123).unwrap().and_utc(); + /// // or from UTC-adjusted microsecond-precision variant + /// let datetime_micros = NaiveDate::from_ymd_opt(2025, 8, 14) + /// .unwrap() + /// .and_hms_milli_opt(12, 33, 54, 123) + /// .unwrap() + /// .and_utc(); /// // this will convert to `Variant::TimestampMicros`. /// let v2 = Variant::from(datetime_micros); - /// assert_eq!(v2.as_timestamp_nanos(), None); + /// assert_eq!(v2.as_timestamp_nanos(), Some(datetime_micros)); + /// + /// // but not for other variants. + /// let v3 = Variant::from("hello!"); + /// assert_eq!(v3.as_timestamp_nanos(), None); /// ``` pub fn as_timestamp_nanos(&self) -> Option> { match *self { - Variant::TimestampNanos(d) => Some(d), + Variant::TimestampNanos(d) | Variant::TimestampMicros(d) => Some(d), _ => None, } } /// Converts this variant to a `NaiveDateTime` if possible. /// - /// Returns `Some(NaiveDateTime)` for [`Variant::TimestampNtzNanos`] variants, + /// Returns `Some(NaiveDateTime)` for timestamp variants, /// `None` for other variants. /// /// # Examples @@ -628,19 +654,29 @@ impl<'m, 'v> Variant<'m, 'v> { /// use chrono::NaiveDate; /// /// // you can extract a NaiveDateTime from a non-UTC-adjusted variant - /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16).unwrap().and_hms_nano_opt(12, 34, 56, 789123456).unwrap(); + /// let datetime = NaiveDate::from_ymd_opt(2025, 4, 16) + /// .unwrap() + /// .and_hms_nano_opt(12, 34, 56, 789123456) + /// .unwrap(); /// let v1 = Variant::from(datetime); /// assert_eq!(v1.as_timestamp_ntz_nanos(), Some(datetime)); /// - /// // but not for other variants. - /// let datetime_micros = NaiveDate::from_ymd_opt(2025, 8, 14).unwrap().and_hms_milli_opt(12, 33, 54, 123).unwrap(); + /// // or from a microsecond-precision non-UTC-adjusted variant + /// let datetime_micros = NaiveDate::from_ymd_opt(2025, 8, 14) + /// .unwrap() + /// .and_hms_milli_opt(12, 33, 54, 123) + /// .unwrap(); /// // this will convert to `Variant::TimestampMicros`. /// let v2 = Variant::from(datetime_micros); - /// assert_eq!(v2.as_timestamp_ntz_nanos(), None); + /// assert_eq!(v2.as_timestamp_ntz_nanos(), Some(datetime_micros)); + /// + /// // but not for other variants. + /// let v3 = Variant::from("hello!"); + /// assert_eq!(v3.as_timestamp_ntz_nanos(), None); /// ``` pub fn as_timestamp_ntz_nanos(&self) -> Option { match *self { - Variant::TimestampNtzNanos(d) => Some(d), + Variant::TimestampNtzNanos(d) | Variant::TimestampNtzMicros(d) => Some(d), _ => None, } } From 0da1287224e50aa28b55b03a61bff48fca0712de Mon Sep 17 00:00:00 2001 From: klion26 Date: Wed, 8 Oct 2025 19:28:16 +0800 Subject: [PATCH 11/11] simplify logic for timestamp tz, add tests micro->nano back --- .../src/type_conversion.rs | 14 ++--------- parquet-variant-compute/src/variant_get.rs | 25 +++++++++++++++++++ 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/parquet-variant-compute/src/type_conversion.rs b/parquet-variant-compute/src/type_conversion.rs index 43efaff4e011..5afebb1bfa6b 100644 --- a/parquet-variant-compute/src/type_conversion.rs +++ b/parquet-variant-compute/src/type_conversion.rs @@ -18,7 +18,6 @@ //! Module for transforming a typed arrow `Array` to `VariantArray`. use arrow::datatypes::{self, ArrowPrimitiveType, ArrowTimestampType, Date32Type}; -use chrono::{DateTime, Utc}; use parquet_variant::Variant; /// Options for controlling the behavior of `cast_to_variant_with_options`. @@ -46,15 +45,6 @@ pub(crate) trait TimestampFromVariant: ArrowTimestampType { fn from_variant(variant: &Variant<'_, '_>) -> Option; } -/// Extension trait that `ArrowTimestampType` handle `DateTime` like `NaiveDateTime` -trait MakeValueTz: ArrowTimestampType { - fn make_value_tz(timestamp: DateTime) -> Option { - Self::make_value(timestamp.naive_utc()) - } -} - -impl MakeValueTz for T {} - /// Macro to generate PrimitiveFromVariant implementations for Arrow primitive types macro_rules! impl_primitive_from_variant { ($arrow_type:ty, $variant_method:ident $(, $cast_fn:expr)?) => { @@ -104,7 +94,7 @@ impl_timestamp_from_variant!( datatypes::TimestampMicrosecondType, as_timestamp_micros, ntz = false, - Self::make_value_tz + |timestamp| Self::make_value(timestamp.naive_utc()) ); impl_timestamp_from_variant!( datatypes::TimestampNanosecondType, @@ -116,7 +106,7 @@ impl_timestamp_from_variant!( datatypes::TimestampNanosecondType, as_timestamp_nanos, ntz = false, - Self::make_value_tz + |timestamp| Self::make_value(timestamp.naive_utc()) ); /// Convert the value at a specific index in the given array into a `Variant`. diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 955e6e1cd2ff..8ee489cfe583 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -866,6 +866,18 @@ mod test { ]) ); + // test converting micro to nano + perfectly_shredded_to_arrow_primitive_test!( + get_variant_perfectly_shredded_timestamp_micro_ntz_as_nano_ntz, + DataType::Timestamp(TimeUnit::Nanosecond, None), + perfectly_shredded_timestamp_micro_ntz_variant_array, + arrow::array::TimestampNanosecondArray::from(vec![ + Some(-456000000), + Some(1758602096000001000), + Some(1758602096000002000) + ]) + ); + perfectly_shredded_variant_array_fn!(perfectly_shredded_timestamp_micro_variant_array, || { arrow::array::TimestampMicrosecondArray::from(vec![ Some(-456000), @@ -887,6 +899,19 @@ mod test { .with_timezone("+00:00") ); + // test converting micro to nano + perfectly_shredded_to_arrow_primitive_test!( + get_variant_perfectly_shredded_timestamp_micro_as_nano, + DataType::Timestamp(TimeUnit::Nanosecond, Some(Arc::from("+00:00"))), + perfectly_shredded_timestamp_micro_variant_array, + arrow::array::TimestampNanosecondArray::from(vec![ + Some(-456000000), + Some(1758602096000001000), + Some(1758602096000002000) + ]) + .with_timezone("+00:00") + ); + perfectly_shredded_variant_array_fn!( perfectly_shredded_timestamp_nano_ntz_variant_array, || {