Skip to content

Commit d6ee3ba

Browse files
committed
[Variant] Support primitive variant to arrow row for timestamp(micro&nano) and time
1 parent 56649bf commit d6ee3ba

File tree

3 files changed

+188
-8
lines changed

3 files changed

+188
-8
lines changed

parquet-variant-compute/src/type_conversion.rs

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
//! Module for transforming a typed arrow `Array` to `VariantArray`.
1919
2020
use arrow::datatypes::{self, ArrowPrimitiveType};
21+
use chrono::Datelike;
2122
use parquet_variant::Variant;
2223

2324
/// Options for controlling the behavior of `cast_to_variant_with_options`.
@@ -98,6 +99,34 @@ impl VariantAsPrimitive<datatypes::UInt64Type> for Variant<'_, '_> {
9899
}
99100
}
100101

102+
impl VariantAsPrimitive<datatypes::TimestampMicrosecondType> for Variant<'_, '_> {
103+
fn as_primitive(&self) -> Option<i64> {
104+
match self {
105+
Variant::TimestampMicros(dt) => Some(dt.timestamp_micros()),
106+
Variant::TimestampNtzMicros(ndt) => Some(ndt.and_utc().timestamp_micros()),
107+
_ => None,
108+
}
109+
}
110+
}
111+
112+
impl VariantAsPrimitive<datatypes::TimestampNanosecondType> for Variant<'_, '_> {
113+
fn as_primitive(&self) -> Option<i64> {
114+
match self {
115+
Variant::TimestampNanos(dt) => dt.timestamp_nanos_opt(),
116+
Variant::TimestampNtzNanos(ndt) => ndt.and_utc().timestamp_nanos_opt(),
117+
_ => None,
118+
}
119+
}
120+
}
121+
122+
impl VariantAsPrimitive<datatypes::Date32Type> for Variant<'_, '_> {
123+
fn as_primitive(&self) -> Option<i32> {
124+
// The number of days from 0001-01-01 to 1970-01-01.
125+
const DAYS_FROM_CE_TO_UNIX_EPOCH: i32 = 719163;
126+
self.as_naive_date()
127+
.map(|d| d.num_days_from_ce() - DAYS_FROM_CE_TO_UNIX_EPOCH)
128+
}
129+
}
101130
/// Convert the value at a specific index in the given array into a `Variant`.
102131
macro_rules! non_generic_conversion_single_value {
103132
($array:expr, $cast_fn:expr, $index:expr) => {{

parquet-variant-compute/src/variant_get.rs

Lines changed: 101 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -301,14 +301,14 @@ mod test {
301301
use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder};
302302
use crate::VariantArray;
303303
use arrow::array::{
304-
Array, ArrayRef, AsArray, BinaryViewArray, Date32Array, Float32Array, Float64Array,
305-
Int16Array, Int32Array, Int64Array, Int8Array, StringArray, StructArray,
304+
Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, Float32Array,
305+
Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, StringArray, StructArray,
306306
};
307307
use arrow::buffer::NullBuffer;
308308
use arrow::compute::CastOptions;
309309
use arrow::datatypes::DataType::{Int16, Int32, Int64};
310310
use arrow_schema::DataType::{Boolean, Float32, Float64, Int8};
311-
use arrow_schema::{DataType, Field, FieldRef, Fields};
311+
use arrow_schema::{DataType, Field, FieldRef, Fields, TimeUnit};
312312
use chrono::DateTime;
313313
use parquet_variant::{Variant, VariantPath, EMPTY_VARIANT_METADATA_BYTES};
314314

@@ -699,7 +699,7 @@ mod test {
699699
}
700700

701701
macro_rules! perfectly_shredded_to_arrow_primitive_test {
702-
($name:ident, $primitive_type:ident, $perfectly_shredded_array_gen_fun:ident, $expected_array:expr) => {
702+
($name:ident, $primitive_type:expr, $perfectly_shredded_array_gen_fun:ident, $expected_array:expr) => {
703703
#[test]
704704
fn $name() {
705705
let array = $perfectly_shredded_array_gen_fun();
@@ -842,6 +842,103 @@ mod test {
842842
f64
843843
);
844844

845+
perfectly_shredded_variant_array_fn!(
846+
perfectly_shredded_timestamp_micro_ntz_variant_array,
847+
|| {
848+
arrow::array::TimestampMicrosecondArray::from(vec![
849+
Some(-456000),
850+
Some(1758602096000001),
851+
Some(1758602096000002),
852+
])
853+
}
854+
);
855+
856+
perfectly_shredded_to_arrow_primitive_test!(
857+
get_variant_perfectly_shredded_timestamp_micro_ntz_as_timestamp_micro_ntz,
858+
DataType::Timestamp(TimeUnit::Microsecond, None),
859+
perfectly_shredded_timestamp_micro_ntz_variant_array,
860+
arrow::array::TimestampMicrosecondArray::from(vec![
861+
Some(-456000),
862+
Some(1758602096000001),
863+
Some(1758602096000002),
864+
])
865+
);
866+
867+
perfectly_shredded_variant_array_fn!(perfectly_shredded_timestamp_micro_variant_array, || {
868+
arrow::array::TimestampMicrosecondArray::from(vec![
869+
Some(-456000),
870+
Some(1758602096000001),
871+
Some(1758602096000002),
872+
])
873+
.with_timezone("+00:00")
874+
});
875+
876+
perfectly_shredded_to_arrow_primitive_test!(
877+
get_variant_perfectly_shredded_timestamp_micro_as_timestamp_micro,
878+
DataType::Timestamp(TimeUnit::Microsecond, Some(Arc::from("+00:00"))),
879+
perfectly_shredded_timestamp_micro_variant_array,
880+
arrow::array::TimestampMicrosecondArray::from(vec![
881+
Some(-456000),
882+
Some(1758602096000001),
883+
Some(1758602096000002),
884+
])
885+
.with_timezone("+00:00")
886+
);
887+
888+
perfectly_shredded_variant_array_fn!(
889+
perfectly_shredded_timestamp_nano_ntz_variant_array,
890+
|| {
891+
arrow::array::TimestampNanosecondArray::from(vec![
892+
Some(-4999999561),
893+
Some(1758602096000000001),
894+
Some(1758602096000000002),
895+
])
896+
}
897+
);
898+
899+
perfectly_shredded_to_arrow_primitive_test!(
900+
get_variant_perfectly_shredded_timestamp_nano_ntz_as_timestamp_nano_ntz,
901+
DataType::Timestamp(TimeUnit::Nanosecond, None),
902+
perfectly_shredded_timestamp_nano_ntz_variant_array,
903+
arrow::array::TimestampNanosecondArray::from(vec![
904+
Some(-4999999561),
905+
Some(1758602096000000001),
906+
Some(1758602096000000002),
907+
])
908+
);
909+
910+
perfectly_shredded_variant_array_fn!(perfectly_shredded_timestamp_nano_variant_array, || {
911+
arrow::array::TimestampNanosecondArray::from(vec![
912+
Some(-4999999561),
913+
Some(1758602096000000001),
914+
Some(1758602096000000002),
915+
])
916+
.with_timezone("+00:00")
917+
});
918+
919+
perfectly_shredded_to_arrow_primitive_test!(
920+
get_variant_perfectly_shredded_timestamp_nano_as_timestamp_nano,
921+
DataType::Timestamp(TimeUnit::Nanosecond, Some(Arc::from("+00:00"))),
922+
perfectly_shredded_timestamp_nano_variant_array,
923+
arrow::array::TimestampNanosecondArray::from(vec![
924+
Some(-4999999561),
925+
Some(1758602096000000001),
926+
Some(1758602096000000002),
927+
])
928+
.with_timezone("+00:00")
929+
);
930+
931+
perfectly_shredded_variant_array_fn!(perfectly_shredded_date_variant_array, || {
932+
Date32Array::from(vec![Some(-12345), Some(17586), Some(20000)])
933+
});
934+
935+
perfectly_shredded_to_arrow_primitive_test!(
936+
get_variant_perfectly_shredded_date_as_date,
937+
DataType::Date32,
938+
perfectly_shredded_date_variant_array,
939+
Date32Array::from(vec![Some(-12345), Some(17586), Some(20000)])
940+
);
941+
845942
macro_rules! assert_variant_get_as_variant_array_with_default_option {
846943
($variant_array: expr, $array_expected: expr) => {{
847944
let options = GetOptions::new();

parquet-variant-compute/src/variant_to_arrow.rs

Lines changed: 58 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ use parquet_variant::{Variant, VariantPath};
2626
use crate::type_conversion::VariantAsPrimitive;
2727
use crate::{VariantArray, VariantValueArrayBuilder};
2828

29+
use arrow_schema::DataType::Date32;
30+
use arrow_schema::TimeUnit;
2931
use std::sync::Arc;
3032

3133
/// Builder for converting variant values to primitive Arrow arrays. It is used by both
@@ -44,7 +46,9 @@ pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> {
4446
Float16(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float16Type>),
4547
Float32(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float32Type>),
4648
Float64(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Float64Type>),
47-
Boolean(VariantToBooleanArrowRowBuilder<'a>),
49+
TimestampMicro(VariantToPrimitiveArrowRowBuilder<'a, datatypes::TimestampMicrosecondType>),
50+
TimestampNano(VariantToPrimitiveArrowRowBuilder<'a, datatypes::TimestampNanosecondType>),
51+
Date(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Date32Type>),
4852
}
4953

5054
/// Builder for converting variant values into strongly typed Arrow arrays.
@@ -77,6 +81,7 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
7781
Float64(b) => b.append_null(),
7882
TimestampMicro(b) => b.append_null(),
7983
TimestampNano(b) => b.append_null(),
84+
Date(b) => b.append_null(),
8085
}
8186
}
8287

@@ -95,6 +100,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
95100
Float32(b) => b.append_value(value),
96101
Float64(b) => b.append_value(value),
97102
Boolean(b) => b.append_value(value),
103+
TimestampMicro(b) => b.append_value(value),
104+
TimestampNano(b) => b.append_value(value),
105+
Date(b) => b.append_value(value),
98106
}
99107
}
100108

@@ -113,7 +121,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
113121
Float16(b) => b.finish(),
114122
Float32(b) => b.finish(),
115123
Float64(b) => b.finish(),
116-
Boolean(b) => b.finish(),
124+
TimestampMicro(b) => b.finish(),
125+
TimestampNano(b) => b.finish(),
126+
Date(b) => b.finish(),
117127
}
118128
}
119129
}
@@ -201,7 +211,29 @@ pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>(
201211
cast_options,
202212
capacity,
203213
)),
204-
DataType::Boolean => Boolean(VariantToBooleanArrowRowBuilder::new(cast_options, capacity)),
214+
DataType::Timestamp(TimeUnit::Microsecond, tz) => {
215+
let target_type = DataType::Timestamp(TimeUnit::Microsecond, tz.clone());
216+
217+
TimestampMicro(VariantToPrimitiveArrowRowBuilder::new_with_target_type(
218+
cast_options,
219+
capacity,
220+
Some(target_type),
221+
))
222+
}
223+
DataType::Timestamp(TimeUnit::Nanosecond, tz) => {
224+
let target_type = DataType::Timestamp(TimeUnit::Nanosecond, tz.clone());
225+
226+
TimestampNano(VariantToPrimitiveArrowRowBuilder::new_with_target_type(
227+
cast_options,
228+
capacity,
229+
Some(target_type),
230+
))
231+
}
232+
DataType::Date32 => Date(VariantToPrimitiveArrowRowBuilder::new_with_target_type(
233+
cast_options,
234+
capacity,
235+
Some(Date32),
236+
)),
205237
_ if data_type.is_primitive() => {
206238
return Err(ArrowError::NotYetImplemented(format!(
207239
"Primitive data_type {data_type:?} not yet implemented"
@@ -305,6 +337,8 @@ fn get_type_name<T: ArrowPrimitiveType>() -> &'static str {
305337
"arrow_array::types::Float32Type" => "Float32",
306338
"arrow_array::types::Float64Type" => "Float64",
307339
"arrow_array::types::Float16Type" => "Float16",
340+
"arrow_array::types::TimestampMicrosecondType" => "Timestamp(Microsecond)",
341+
"arrow_array::types::TimestampNanosecondType" => "Timestamp(Nanosecond)",
308342
_ => "Unknown",
309343
}
310344
}
@@ -356,13 +390,24 @@ impl<'a> VariantToBooleanArrowRowBuilder<'a> {
356390
pub(crate) struct VariantToPrimitiveArrowRowBuilder<'a, T: ArrowPrimitiveType> {
357391
builder: arrow::array::PrimitiveBuilder<T>,
358392
cast_options: &'a CastOptions<'a>,
393+
// this used to change the data type of the resulting array, e.g. to add timezone info
394+
target_data_type: Option<DataType>,
359395
}
360396

361397
impl<'a, T: ArrowPrimitiveType> VariantToPrimitiveArrowRowBuilder<'a, T> {
362398
fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self {
399+
Self::new_with_target_type(cast_options, capacity, None)
400+
}
401+
402+
fn new_with_target_type(
403+
cast_options: &'a CastOptions<'a>,
404+
capacity: usize,
405+
target_data_type: Option<DataType>,
406+
) -> Self {
363407
Self {
364408
builder: PrimitiveBuilder::<T>::with_capacity(capacity),
365409
cast_options,
410+
target_data_type,
366411
}
367412
}
368413
}
@@ -397,7 +442,16 @@ where
397442
}
398443

399444
fn finish(mut self) -> Result<ArrayRef> {
400-
Ok(Arc::new(self.builder.finish()))
445+
let array: PrimitiveArray<T> = self.builder.finish();
446+
447+
if let Some(target_type) = self.target_data_type {
448+
let data = array.into_data();
449+
let new_data = data.into_builder().data_type(target_type).build()?;
450+
let array_with_new_type = PrimitiveArray::<T>::from(new_data);
451+
return Ok(Arc::new(array_with_new_type));
452+
}
453+
454+
Ok(Arc::new(array))
401455
}
402456
}
403457

0 commit comments

Comments
 (0)