diff --git a/arrow-array/src/ffi.rs b/arrow-array/src/ffi.rs index 542b61d892f1..f50dd3420baa 100644 --- a/arrow-array/src/ffi.rs +++ b/arrow-array/src/ffi.rs @@ -140,8 +140,8 @@ pub unsafe fn export_array_into_raw( Ok(()) } -// returns the number of bits that buffer `i` (in the C data interface) is expected to have. -// This is set by the Arrow specification +/// returns the number of bits that buffer `i` (in the C data interface) is expected to have. +/// This is set by the Arrow specification fn bit_width(data_type: &DataType, i: usize) -> Result { if let Some(primitive) = data_type.primitive_width() { return match i { @@ -180,6 +180,10 @@ fn bit_width(data_type: &DataType, i: usize) -> Result { | (DataType::List(_), 1) | (DataType::Map(_, _), 1) => i32::BITS as _, (DataType::Utf8, 2) | (DataType::Binary, 2) => u8::BITS as _, + // List views have two i32 buffers, offsets and sizes + (DataType::ListView(_), 1) | (DataType::ListView(_), 2) => i32::BITS as _, + // Large list views have two i64 buffers, offsets and sizes + (DataType::LargeListView(_), 1) | (DataType::LargeListView(_), 2) => i64::BITS as _, (DataType::List(_), _) | (DataType::Map(_, _), _) => { return Err(ArrowError::CDataInterface(format!( "The datatype \"{data_type}\" expects 2 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." @@ -351,6 +355,8 @@ impl ImportedArrowArray<'_> { DataType::List(field) | DataType::FixedSizeList(field, _) | DataType::LargeList(field) + | DataType::ListView(field) + | DataType::LargeListView(field) | DataType::Map(field, _) => Ok([self.consume_child(0, field.data_type())?].to_vec()), DataType::Struct(fields) => { assert!(fields.len() == self.array.num_children()); @@ -471,6 +477,14 @@ impl ImportedArrowArray<'_> { debug_assert_eq!(bits % 8, 0); (length + 1) * (bits / 8) } + (DataType::ListView(_), 1) + | (DataType::ListView(_), 2) + | (DataType::LargeListView(_), 1) + | (DataType::LargeListView(_), 2) => { + let bits = bit_width(data_type, i)?; + debug_assert_eq!(bits % 8, 0); + length * (bits / 8) + } (DataType::Utf8, 2) | (DataType::Binary, 2) => { if self.array.is_empty() { return Ok(0); @@ -553,7 +567,7 @@ mod tests_to_then_from_ffi { use std::collections::HashMap; use std::mem::ManuallyDrop; - use arrow_buffer::NullBuffer; + use arrow_buffer::{ArrowNativeType, NullBuffer}; use arrow_schema::Field; use crate::builder::UnionBuilder; @@ -783,6 +797,71 @@ mod tests_to_then_from_ffi { test_generic_list::() } + fn test_generic_list_view() -> Result<()> { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int16) + .len(8) + .add_buffer(Buffer::from_slice_ref([0_i16, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1, 2], [3, 4, 5], [6, 7]] + let value_offsets = [0_usize, 3, 6] + .iter() + .map(|i| Offset::from_usize(*i).unwrap()) + .collect::(); + + let sizes_buffer = [3_usize, 3, 2] + .iter() + .map(|i| Offset::from_usize(*i).unwrap()) + .collect::(); + + // Construct a list array from the above two + let list_view_dt = GenericListViewArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( + Field::new_list_field(DataType::Int16, false), + )); + + let list_data = ArrayData::builder(list_view_dt) + .len(3) + .add_buffer(value_offsets) + .add_buffer(sizes_buffer) + .add_child_data(value_data) + .build() + .unwrap(); + + let original = GenericListViewArray::::from(list_data.clone()); + + // export it + let (array, schema) = to_ffi(&original.to_data())?; + + // (simulate consumer) import it + let data = unsafe { from_ffi(array, &schema) }?; + let array = make_array(data); + + // downcast + let array = array + .as_any() + .downcast_ref::>() + .unwrap(); + + assert_eq!(&array.value(0), &original.value(0)); + assert_eq!(&array.value(1), &original.value(1)); + assert_eq!(&array.value(2), &original.value(2)); + + Ok(()) + } + + #[test] + fn test_list_view() -> Result<()> { + test_generic_list_view::() + } + + #[test] + fn test_large_list_view() -> Result<()> { + test_generic_list_view::() + } + fn test_generic_binary() -> Result<()> { // create an array natively let array: Vec> = vec![Some(b"a"), None, Some(b"aaa")]; @@ -1315,6 +1394,7 @@ mod tests_from_ffi { use std::ptr::NonNull; use std::sync::Arc; + use arrow_buffer::NullBuffer; #[cfg(not(feature = "force_validate"))] use arrow_buffer::{ScalarBuffer, bit_util, buffer::Buffer}; #[cfg(feature = "force_validate")] @@ -1325,6 +1405,7 @@ mod tests_from_ffi { use arrow_schema::{DataType, Field}; use super::Result; + use crate::builder::GenericByteViewBuilder; use crate::types::{BinaryViewType, ByteViewType, Int32Type, StringViewType}; use crate::{ @@ -1528,6 +1609,65 @@ mod tests_from_ffi { test_round_trip(&data) } + #[test] + fn test_list_view() -> Result<()> { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int16) + .len(8) + .add_buffer(Buffer::from_slice_ref([0_i16, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1, 2], [3, 4, 5], [6, 7]] + let value_offsets = Buffer::from(vec![0_i32, 3, 6]); + let sizes_buffer = Buffer::from(vec![3_i32, 3, 2]); + + // Construct a list array from the above two + let list_view_dt = + DataType::ListView(Arc::new(Field::new_list_field(DataType::Int16, false))); + + let list_view_data = ArrayData::builder(list_view_dt) + .len(3) + .add_buffer(value_offsets) + .add_buffer(sizes_buffer) + .add_child_data(value_data) + .build() + .unwrap(); + + test_round_trip(&list_view_data) + } + + #[test] + fn test_list_view_with_nulls() -> Result<()> { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int16) + .len(8) + .add_buffer(Buffer::from_slice_ref([0_i16, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1, 2], [3, 4, 5], [6, 7], null] + let value_offsets = Buffer::from(vec![0_i32, 3, 6, 8]); + let sizes_buffer = Buffer::from(vec![3_i32, 3, 2, 0]); + + // Construct a list array from the above two + let list_view_dt = + DataType::ListView(Arc::new(Field::new_list_field(DataType::Int16, true))); + + let list_view_data = ArrayData::builder(list_view_dt) + .len(4) + .add_buffer(value_offsets) + .add_buffer(sizes_buffer) + .add_child_data(value_data) + .nulls(Some(NullBuffer::from(vec![true, true, true, false]))) + .build() + .unwrap(); + + test_round_trip(&list_view_data) + } + #[test] #[cfg(not(feature = "force_validate"))] fn test_empty_string_with_non_zero_offset() -> Result<()> { diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 91957e14f332..e9988b2ccf99 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -620,6 +620,16 @@ impl ArrayData { vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)], true, ), + DataType::ListView(f) => ( + vec![zeroed(len * 4), zeroed(len * 4)], + vec![ArrayData::new_empty(f.data_type())], + true, + ), + DataType::LargeListView(f) => ( + vec![zeroed(len * 8), zeroed(len * 8)], + vec![ArrayData::new_empty(f.data_type())], + true, + ), DataType::Struct(fields) => ( vec![], fields @@ -1783,7 +1793,7 @@ impl DataTypeLayout { }, ], can_contain_null_mask: true, - variadic: true, + variadic: false, } } } diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py index 3b46d5729a1f..f5d53155fe78 100644 --- a/arrow-pyarrow-integration-testing/tests/test_sql.py +++ b/arrow-pyarrow-integration-testing/tests/test_sql.py @@ -27,7 +27,9 @@ import arrow_pyarrow_integration_testing as rust -PYARROW_PRE_14 = int(pa.__version__.split('.')[0]) < 14 +PYARROW_MAJOR_VER = int(pa.__version__.split(".")[0]) +PYARROW_PRE_14 = PYARROW_MAJOR_VER < 14 +PYARROW_PRE_16 = PYARROW_MAJOR_VER < 16 @contextlib.contextmanager @@ -112,8 +114,16 @@ def assert_pyarrow_leak(): ), ] -_unsupported_pyarrow_types = [ -] +if PYARROW_MAJOR_VER >= 16: + _supported_pyarrow_types.extend( + [ + pa.list_view(pa.uint64()), + pa.large_list_view(pa.uint64()), + pa.list_view(pa.string()), + pa.large_list_view(pa.string()), + ] + ) + # As of pyarrow 14, pyarrow implements the Arrow PyCapsule interface # (https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). @@ -158,12 +168,6 @@ def test_type_roundtrip_pycapsule(pyarrow_type): assert restored == pyarrow_type assert restored is not pyarrow_type - -@pytest.mark.parametrize("pyarrow_type", _unsupported_pyarrow_types, ids=str) -def test_type_roundtrip_raises(pyarrow_type): - with pytest.raises(pa.ArrowException): - rust.round_trip_type(pyarrow_type) - @pytest.mark.parametrize('pyarrow_type', _supported_pyarrow_types, ids=str) def test_field_roundtrip(pyarrow_type): pyarrow_field = pa.field("test", pyarrow_type, nullable=True) @@ -337,6 +341,21 @@ def test_list_array(): del a del b + +@pytest.mark.skipif(PYARROW_PRE_16, reason="requires pyarrow 16") +def test_list_view_array(): + """ + Python -> Rust -> Python + """ + a = pa.array([[], None, [1, 2], [4, 5, 6]], pa.list_view(pa.int64())) + b = rust.round_trip_array(a) + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + del a + del b + + def test_map_array(): """ Python -> Rust -> Python diff --git a/arrow-schema/src/ffi.rs b/arrow-schema/src/ffi.rs index 2f890508e081..a1a32224a4f6 100644 --- a/arrow-schema/src/ffi.rs +++ b/arrow-schema/src/ffi.rs @@ -456,6 +456,14 @@ impl TryFrom<&FFI_ArrowSchema> for DataType { let c_child = c_schema.child(0); DataType::LargeList(Arc::new(Field::try_from(c_child)?)) } + "+vl" => { + let c_child = c_schema.child(0); + DataType::ListView(Arc::new(Field::try_from(c_child)?)) + } + "+vL" => { + let c_child = c_schema.child(0); + DataType::LargeListView(Arc::new(Field::try_from(c_child)?)) + } "+s" => { let fields = c_schema.children().map(Field::try_from); DataType::Struct(fields.collect::>()?) @@ -657,6 +665,8 @@ impl TryFrom<&DataType> for FFI_ArrowSchema { let children = match dtype { DataType::List(child) | DataType::LargeList(child) + | DataType::ListView(child) + | DataType::LargeListView(child) | DataType::FixedSizeList(child, _) | DataType::Map(child, _) => { vec![FFI_ArrowSchema::try_from(child.as_ref())?] @@ -746,6 +756,8 @@ fn get_format_string(dtype: &DataType) -> Result, ArrowError> DataType::Interval(IntervalUnit::MonthDayNano) => Ok("tin".into()), DataType::List(_) => Ok("+l".into()), DataType::LargeList(_) => Ok("+L".into()), + DataType::ListView(_) => Ok("+vl".into()), + DataType::LargeListView(_) => Ok("+vL".into()), DataType::Struct(_) => Ok("+s".into()), DataType::Map(_, _) => Ok("+m".into()), DataType::RunEndEncoded(_, _) => Ok("+r".into()), @@ -874,6 +886,16 @@ mod tests { DataType::Int16, false, )))); + round_trip_type(DataType::ListView(Arc::new(Field::new( + "a", + DataType::Int16, + false, + )))); + round_trip_type(DataType::LargeListView(Arc::new(Field::new( + "a", + DataType::Int16, + false, + )))); round_trip_type(DataType::Struct(Fields::from(vec![Field::new( "a", DataType::Utf8,