Skip to content

Commit ebb6ede

Browse files
liamzwbao and alamb authored
[Variant]: Implement DataType::RunEndEncoded support for cast_to_variant kernel (#8174)
# Which issue does this PR close? - Closes #8064. # Rationale for this change # What changes are included in this PR? Implement `DataType::RunEndEncoded` for `cast_to_variant` # Are these changes tested? Yes # Are there any user-facing changes? New cast type supported --------- Co-authored-by: Andrew Lamb <[email protected]>
1 parent 62770b6 commit ebb6ede

File tree

1 file changed

+106
-7
lines changed

1 file changed

+106
-7
lines changed

parquet-variant-compute/src/cast_to_variant.rs

Lines changed: 106 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,11 @@ use arrow::array::{
2323
TimestampSecondArray,
2424
};
2525
use arrow::datatypes::{
26-
i256, BinaryType, BinaryViewType, Date32Type, Date64Type, Decimal128Type, Decimal256Type,
27-
Decimal32Type, Decimal64Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type,
28-
Int64Type, Int8Type, LargeBinaryType, Time32MillisecondType, Time32SecondType,
29-
Time64MicrosecondType, Time64NanosecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type,
26+
i256, ArrowNativeType, BinaryType, BinaryViewType, Date32Type, Date64Type, Decimal128Type,
27+
Decimal256Type, Decimal32Type, Decimal64Type, Float16Type, Float32Type, Float64Type, Int16Type,
28+
Int32Type, Int64Type, Int8Type, LargeBinaryType, RunEndIndexType, Time32MillisecondType,
29+
Time32SecondType, Time64MicrosecondType, Time64NanosecondType, UInt16Type, UInt32Type,
30+
UInt64Type, UInt8Type,
3031
};
3132
use arrow::temporal_conversions::{
3233
timestamp_ms_to_datetime, timestamp_ns_to_datetime, timestamp_s_to_datetime,
@@ -502,6 +503,17 @@ pub fn cast_to_variant(input: &dyn Array) -> Result<VariantArray, ArrowError> {
502503
builder
503504
);
504505
}
506+
DataType::RunEndEncoded(run_ends, _) => match run_ends.data_type() {
507+
DataType::Int16 => process_run_end_encoded::<Int16Type>(input, &mut builder)?,
508+
DataType::Int32 => process_run_end_encoded::<Int32Type>(input, &mut builder)?,
509+
DataType::Int64 => process_run_end_encoded::<Int64Type>(input, &mut builder)?,
510+
_ => {
511+
return Err(ArrowError::CastError(format!(
512+
"Unsupported run ends type: {:?}",
513+
run_ends.data_type()
514+
)));
515+
}
516+
},
505517
DataType::Dictionary(_, _) => {
506518
let dict_array = input.as_any_dictionary();
507519
let values_variant_array = cast_to_variant(dict_array.values().as_ref())?;
@@ -532,6 +544,41 @@ pub fn cast_to_variant(input: &dyn Array) -> Result<VariantArray, ArrowError> {
532544
Ok(builder.build())
533545
}
534546

547+
/// Generic function to process run-end encoded arrays
548+
fn process_run_end_encoded<R: RunEndIndexType>(
549+
input: &dyn Array,
550+
builder: &mut VariantArrayBuilder,
551+
) -> Result<(), ArrowError> {
552+
let run_array = input.as_run::<R>();
553+
let values_variant_array = cast_to_variant(run_array.values().as_ref())?;
554+
555+
// Process runs in batches for better performance
556+
let run_ends = run_array.run_ends().values();
557+
let mut logical_start = 0;
558+
559+
for (physical_idx, &run_end) in run_ends.iter().enumerate() {
560+
let logical_end = run_end.as_usize();
561+
let run_length = logical_end - logical_start;
562+
563+
if values_variant_array.is_null(physical_idx) {
564+
// Append nulls for the entire run
565+
for _ in 0..run_length {
566+
builder.append_null();
567+
}
568+
} else {
569+
// Get the value once and append it for the entire run
570+
let value = values_variant_array.value(physical_idx);
571+
for _ in 0..run_length {
572+
builder.append_variant(value.clone());
573+
}
574+
}
575+
576+
logical_start = logical_end;
577+
}
578+
579+
Ok(())
580+
}
581+
535582
// TODO do we need a cast_with_options to allow specifying conversion behavior,
536583
// e.g. how to handle overflows, whether to convert to Variant::Null or return
537584
// an error, etc. ?
@@ -544,9 +591,9 @@ mod tests {
544591
Decimal256Array, Decimal32Array, Decimal64Array, DictionaryArray, FixedSizeBinaryBuilder,
545592
Float16Array, Float32Array, Float64Array, GenericByteBuilder, GenericByteViewBuilder,
546593
Int16Array, Int32Array, Int64Array, Int8Array, IntervalYearMonthArray, LargeStringArray,
547-
NullArray, StringArray, StringViewArray, StructArray, Time32MillisecondArray,
548-
Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, UInt16Array, UInt32Array,
549-
UInt64Array, UInt8Array,
594+
NullArray, StringArray, StringRunBuilder, StringViewArray, StructArray,
595+
Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray,
596+
UInt16Array, UInt32Array, UInt64Array, UInt8Array,
550597
};
551598
use arrow::buffer::NullBuffer;
552599
use arrow_schema::{Field, Fields};
@@ -1847,6 +1894,58 @@ mod tests {
18471894
);
18481895
}
18491896

1897+
#[test]
1898+
fn test_cast_to_variant_run_end_encoded() {
1899+
let mut builder = StringRunBuilder::<Int32Type>::new();
1900+
builder.append_value("apple");
1901+
builder.append_value("apple");
1902+
builder.append_value("banana");
1903+
builder.append_value("banana");
1904+
builder.append_value("banana");
1905+
builder.append_value("cherry");
1906+
let run_array = builder.finish();
1907+
1908+
run_test(
1909+
Arc::new(run_array),
1910+
vec![
1911+
Some(Variant::from("apple")),
1912+
Some(Variant::from("apple")),
1913+
Some(Variant::from("banana")),
1914+
Some(Variant::from("banana")),
1915+
Some(Variant::from("banana")),
1916+
Some(Variant::from("cherry")),
1917+
],
1918+
);
1919+
}
1920+
1921+
#[test]
1922+
fn test_cast_to_variant_run_end_encoded_with_nulls() {
1923+
use arrow::array::StringRunBuilder;
1924+
use arrow::datatypes::Int32Type;
1925+
1926+
// Test run-end encoded array with nulls
1927+
let mut builder = StringRunBuilder::<Int32Type>::new();
1928+
builder.append_value("apple");
1929+
builder.append_null();
1930+
builder.append_value("banana");
1931+
builder.append_value("banana");
1932+
builder.append_null();
1933+
builder.append_null();
1934+
let run_array = builder.finish();
1935+
1936+
run_test(
1937+
Arc::new(run_array),
1938+
vec![
1939+
Some(Variant::from("apple")),
1940+
None,
1941+
Some(Variant::from("banana")),
1942+
Some(Variant::from("banana")),
1943+
None,
1944+
None,
1945+
],
1946+
);
1947+
}
1948+
18501949
#[test]
18511950
fn test_cast_to_variant_dictionary() {
18521951
let values = StringArray::from(vec!["apple", "banana", "cherry", "date"]);

0 commit comments

Comments
 (0)