Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 143 additions & 3 deletions arrow-array/src/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,8 @@ pub unsafe fn export_array_into_raw(
Ok(())
}

// returns the number of bits that buffer `i` (in the C data interface) is expected to have.
// This is set by the Arrow specification
/// returns the number of bits that buffer `i` (in the C data interface) is expected to have.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just nicer when working with IDEs.

/// This is set by the Arrow specification
fn bit_width(data_type: &DataType, i: usize) -> Result<usize> {
if let Some(primitive) = data_type.primitive_width() {
return match i {
Expand Down Expand Up @@ -180,6 +180,10 @@ fn bit_width(data_type: &DataType, i: usize) -> Result<usize> {
| (DataType::List(_), 1)
| (DataType::Map(_, _), 1) => i32::BITS as _,
(DataType::Utf8, 2) | (DataType::Binary, 2) => u8::BITS as _,
// List views have two i32 buffers, offsets and sizes
(DataType::ListView(_), 1) | (DataType::ListView(_), 2) => i32::BITS as _,
// Large list views have two i64 buffers, offsets and sizes
(DataType::LargeListView(_), 1) | (DataType::LargeListView(_), 2) => i64::BITS as _,
(DataType::List(_), _) | (DataType::Map(_, _), _) => {
return Err(ArrowError::CDataInterface(format!(
"The datatype \"{data_type}\" expects 2 buffers, but requested {i}. Please verify that the C data interface is correctly implemented."
Expand Down Expand Up @@ -351,6 +355,8 @@ impl ImportedArrowArray<'_> {
DataType::List(field)
| DataType::FixedSizeList(field, _)
| DataType::LargeList(field)
| DataType::ListView(field)
| DataType::LargeListView(field)
| DataType::Map(field, _) => Ok([self.consume_child(0, field.data_type())?].to_vec()),
DataType::Struct(fields) => {
assert!(fields.len() == self.array.num_children());
Expand Down Expand Up @@ -471,6 +477,14 @@ impl ImportedArrowArray<'_> {
debug_assert_eq!(bits % 8, 0);
(length + 1) * (bits / 8)
}
(DataType::ListView(_), 1)
| (DataType::ListView(_), 2)
| (DataType::LargeListView(_), 1)
| (DataType::LargeListView(_), 2) => {
let bits = bit_width(data_type, i)?;
debug_assert_eq!(bits % 8, 0);
length * (bits / 8)
}
(DataType::Utf8, 2) | (DataType::Binary, 2) => {
if self.array.is_empty() {
return Ok(0);
Expand Down Expand Up @@ -553,7 +567,7 @@ mod tests_to_then_from_ffi {
use std::collections::HashMap;
use std::mem::ManuallyDrop;

use arrow_buffer::NullBuffer;
use arrow_buffer::{ArrowNativeType, NullBuffer};
use arrow_schema::Field;

use crate::builder::UnionBuilder;
Expand Down Expand Up @@ -783,6 +797,71 @@ mod tests_to_then_from_ffi {
test_generic_list::<i64>()
}

fn test_generic_list_view<Offset: OffsetSizeTrait + ArrowNativeType>() -> Result<()> {
// Construct a value array
let value_data = ArrayData::builder(DataType::Int16)
.len(8)
.add_buffer(Buffer::from_slice_ref([0_i16, 1, 2, 3, 4, 5, 6, 7]))
.build()
.unwrap();

// Construct a buffer for value offsets, for the nested array:
// [[0, 1, 2], [3, 4, 5], [6, 7]]
let value_offsets = [0_usize, 3, 6]
.iter()
.map(|i| Offset::from_usize(*i).unwrap())
.collect::<Buffer>();

let sizes_buffer = [3_usize, 3, 2]
.iter()
.map(|i| Offset::from_usize(*i).unwrap())
.collect::<Buffer>();

// Construct a list array from the above two
let list_view_dt = GenericListViewArray::<Offset>::DATA_TYPE_CONSTRUCTOR(Arc::new(
Field::new_list_field(DataType::Int16, false),
));

let list_data = ArrayData::builder(list_view_dt)
.len(3)
.add_buffer(value_offsets)
.add_buffer(sizes_buffer)
.add_child_data(value_data)
.build()
.unwrap();

let original = GenericListViewArray::<Offset>::from(list_data.clone());

// export it
let (array, schema) = to_ffi(&original.to_data())?;

// (simulate consumer) import it
let data = unsafe { from_ffi(array, &schema) }?;
let array = make_array(data);

// downcast
let array = array
.as_any()
.downcast_ref::<GenericListViewArray<Offset>>()
.unwrap();

assert_eq!(&array.value(0), &original.value(0));
assert_eq!(&array.value(1), &original.value(1));
assert_eq!(&array.value(2), &original.value(2));

Ok(())
}

#[test]
fn test_list_view() -> Result<()> {
test_generic_list_view::<i32>()
}

#[test]
fn test_large_list_view() -> Result<()> {
test_generic_list_view::<i64>()
}

fn test_generic_binary<Offset: OffsetSizeTrait>() -> Result<()> {
// create an array natively
let array: Vec<Option<&[u8]>> = vec![Some(b"a"), None, Some(b"aaa")];
Expand Down Expand Up @@ -1315,6 +1394,7 @@ mod tests_from_ffi {
use std::ptr::NonNull;
use std::sync::Arc;

use arrow_buffer::NullBuffer;
#[cfg(not(feature = "force_validate"))]
use arrow_buffer::{ScalarBuffer, bit_util, buffer::Buffer};
#[cfg(feature = "force_validate")]
Expand All @@ -1325,6 +1405,7 @@ mod tests_from_ffi {
use arrow_schema::{DataType, Field};

use super::Result;

use crate::builder::GenericByteViewBuilder;
use crate::types::{BinaryViewType, ByteViewType, Int32Type, StringViewType};
use crate::{
Expand Down Expand Up @@ -1528,6 +1609,65 @@ mod tests_from_ffi {
test_round_trip(&data)
}

#[test]
fn test_list_view() -> Result<()> {
// Construct a value array
let value_data = ArrayData::builder(DataType::Int16)
.len(8)
.add_buffer(Buffer::from_slice_ref([0_i16, 1, 2, 3, 4, 5, 6, 7]))
.build()
.unwrap();

// Construct a buffer for value offsets, for the nested array:
// [[0, 1, 2], [3, 4, 5], [6, 7]]
let value_offsets = Buffer::from(vec![0_i32, 3, 6]);
let sizes_buffer = Buffer::from(vec![3_i32, 3, 2]);

// Construct a list array from the above two
let list_view_dt =
DataType::ListView(Arc::new(Field::new_list_field(DataType::Int16, false)));

let list_view_data = ArrayData::builder(list_view_dt)
.len(3)
.add_buffer(value_offsets)
.add_buffer(sizes_buffer)
.add_child_data(value_data)
.build()
.unwrap();

test_round_trip(&list_view_data)
}

#[test]
fn test_list_view_with_nulls() -> Result<()> {
// Construct a value array
let value_data = ArrayData::builder(DataType::Int16)
.len(8)
.add_buffer(Buffer::from_slice_ref([0_i16, 1, 2, 3, 4, 5, 6, 7]))
.build()
.unwrap();

// Construct a buffer for value offsets, for the nested array:
// [[0, 1, 2], [3, 4, 5], [6, 7], null]
let value_offsets = Buffer::from(vec![0_i32, 3, 6, 8]);
let sizes_buffer = Buffer::from(vec![3_i32, 3, 2, 0]);

// Construct a list array from the above two
let list_view_dt =
DataType::ListView(Arc::new(Field::new_list_field(DataType::Int16, true)));

let list_view_data = ArrayData::builder(list_view_dt)
.len(4)
.add_buffer(value_offsets)
.add_buffer(sizes_buffer)
.add_child_data(value_data)
.nulls(Some(NullBuffer::from(vec![true, true, true, false])))
.build()
.unwrap();

test_round_trip(&list_view_data)
}

#[test]
#[cfg(not(feature = "force_validate"))]
fn test_empty_string_with_non_zero_offset() -> Result<()> {
Expand Down
12 changes: 11 additions & 1 deletion arrow-data/src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -620,6 +620,16 @@ impl ArrayData {
vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
true,
),
DataType::ListView(f) => (
vec![zeroed(len * 4), zeroed(len * 4)],
vec![ArrayData::new_empty(f.data_type())],
true,
),
DataType::LargeListView(f) => (
vec![zeroed(len * 8), zeroed(len * 8)],
vec![ArrayData::new_empty(f.data_type())],
true,
),
DataType::Struct(fields) => (
vec![],
fields
Expand Down Expand Up @@ -1783,7 +1793,7 @@ impl DataTypeLayout {
},
],
can_contain_null_mask: true,
variadic: true,
variadic: false,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a bug, see the spec, list views aren't represented by a variable number of buffers.

}
}
}
Expand Down
37 changes: 28 additions & 9 deletions arrow-pyarrow-integration-testing/tests/test_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@

import arrow_pyarrow_integration_testing as rust

PYARROW_PRE_14 = int(pa.__version__.split('.')[0]) < 14
PYARROW_MAJOR_VER = int(pa.__version__.split(".")[0])
PYARROW_PRE_14 = PYARROW_MAJOR_VER < 14
PYARROW_PRE_16 = PYARROW_MAJOR_VER < 16


@contextlib.contextmanager
Expand Down Expand Up @@ -112,8 +114,16 @@ def assert_pyarrow_leak():
),
]

_unsupported_pyarrow_types = [
]
if PYARROW_MAJOR_VER >= 16:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did this to get everything green, but I think that it might be a good opportunity to only test with pyarrow 16 and above, which supports all relevant features here - stringview, bianryview and listview and the pycapsule API.
By the time this change (if accepted) will be released, pyarrow 15 will be almost two years old, which seems like enough time to drop dedicated testing for it. This will allow the CI to focus on more relevant releases that also include all the APIs arrow-rs supports.

_supported_pyarrow_types.extend(
[
pa.list_view(pa.uint64()),
pa.large_list_view(pa.uint64()),
pa.list_view(pa.string()),
pa.large_list_view(pa.string()),
]
)


# As of pyarrow 14, pyarrow implements the Arrow PyCapsule interface
# (https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
Expand Down Expand Up @@ -158,12 +168,6 @@ def test_type_roundtrip_pycapsule(pyarrow_type):
assert restored == pyarrow_type
assert restored is not pyarrow_type


@pytest.mark.parametrize("pyarrow_type", _unsupported_pyarrow_types, ids=str)
def test_type_roundtrip_raises(pyarrow_type):
with pytest.raises(pa.ArrowException):
rust.round_trip_type(pyarrow_type)

@pytest.mark.parametrize('pyarrow_type', _supported_pyarrow_types, ids=str)
def test_field_roundtrip(pyarrow_type):
pyarrow_field = pa.field("test", pyarrow_type, nullable=True)
Expand Down Expand Up @@ -337,6 +341,21 @@ def test_list_array():
del a
del b


@pytest.mark.skipif(PYARROW_PRE_16, reason="requires pyarrow 16")
def test_list_view_array():
"""
Python -> Rust -> Python
"""
a = pa.array([[], None, [1, 2], [4, 5, 6]], pa.list_view(pa.int64()))
b = rust.round_trip_array(a)
b.validate(full=True)
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type
del a
del b


def test_map_array():
"""
Python -> Rust -> Python
Expand Down
22 changes: 22 additions & 0 deletions arrow-schema/src/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,14 @@ impl TryFrom<&FFI_ArrowSchema> for DataType {
let c_child = c_schema.child(0);
DataType::LargeList(Arc::new(Field::try_from(c_child)?))
}
"+vl" => {
let c_child = c_schema.child(0);
DataType::ListView(Arc::new(Field::try_from(c_child)?))
}
"+vL" => {
let c_child = c_schema.child(0);
DataType::LargeListView(Arc::new(Field::try_from(c_child)?))
}
"+s" => {
let fields = c_schema.children().map(Field::try_from);
DataType::Struct(fields.collect::<Result<_, ArrowError>>()?)
Expand Down Expand Up @@ -657,6 +665,8 @@ impl TryFrom<&DataType> for FFI_ArrowSchema {
let children = match dtype {
DataType::List(child)
| DataType::LargeList(child)
| DataType::ListView(child)
| DataType::LargeListView(child)
| DataType::FixedSizeList(child, _)
| DataType::Map(child, _) => {
vec![FFI_ArrowSchema::try_from(child.as_ref())?]
Expand Down Expand Up @@ -746,6 +756,8 @@ fn get_format_string(dtype: &DataType) -> Result<Cow<'static, str>, ArrowError>
DataType::Interval(IntervalUnit::MonthDayNano) => Ok("tin".into()),
DataType::List(_) => Ok("+l".into()),
DataType::LargeList(_) => Ok("+L".into()),
DataType::ListView(_) => Ok("+vl".into()),
DataType::LargeListView(_) => Ok("+vL".into()),
DataType::Struct(_) => Ok("+s".into()),
DataType::Map(_, _) => Ok("+m".into()),
DataType::RunEndEncoded(_, _) => Ok("+r".into()),
Expand Down Expand Up @@ -874,6 +886,16 @@ mod tests {
DataType::Int16,
false,
))));
round_trip_type(DataType::ListView(Arc::new(Field::new(
"a",
DataType::Int16,
false,
))));
round_trip_type(DataType::LargeListView(Arc::new(Field::new(
"a",
DataType::Int16,
false,
))));
round_trip_type(DataType::Struct(Fields::from(vec![Field::new(
"a",
DataType::Utf8,
Expand Down
Loading