Skip to content

Commit d8b5ef7

Browse files
[Variant] Avoid superflous validation checks (#7906)
# Which issue does this PR close? - Closes #7900 # Rational for this change We can avoid certain checks in the validation code since other checks already guarantee these invariants for us
1 parent 7b219f9 commit d8b5ef7

File tree

4 files changed

+38
-41
lines changed

4 files changed

+38
-41
lines changed

parquet-variant/src/variant.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1199,8 +1199,20 @@ impl TryFrom<(i128, u8)> for Variant<'_, '_> {
11991199

12001200
#[cfg(test)]
12011201
mod tests {
1202+
12021203
use super::*;
12031204

1205+
#[test]
1206+
fn test_empty_variant_will_fail() {
1207+
let metadata = VariantMetadata::try_new(&[1, 0, 0]).unwrap();
1208+
1209+
let err = Variant::try_new_with_metadata(metadata, &[]).unwrap_err();
1210+
1211+
assert!(matches!(
1212+
err,
1213+
ArrowError::InvalidArgumentError(ref msg) if msg == "Received empty bytes"));
1214+
}
1215+
12041216
#[test]
12051217
fn test_construct_short_string() {
12061218
let short_string = ShortString::try_new("norm").expect("should fit in short string");

parquet-variant/src/variant/list.rs

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -216,28 +216,19 @@ impl<'m, 'v> VariantList<'m, 'v> {
216216
self.header.first_offset_byte() as _..self.first_value_byte as _,
217217
)?;
218218

219-
let offsets =
220-
map_bytes_to_offsets(offset_buffer, self.header.offset_size).collect::<Vec<_>>();
221-
222-
// Validate offsets are in-bounds and monotonically increasing.
223-
// Since shallow verification checks whether the first and last offsets are in-bounds,
224-
// we can also verify all offsets are in-bounds by checking if offsets are monotonically increasing.
225-
let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
226-
if !are_offsets_monotonic {
227-
return Err(ArrowError::InvalidArgumentError(
228-
"offsets are not monotonically increasing".to_string(),
229-
));
230-
}
231-
232219
let value_buffer = slice_from_slice(self.value, self.first_value_byte as _..)?;
233220

234221
// Validate whether values are valid variant objects
235-
for i in 1..offsets.len() {
236-
let start_offset = offsets[i - 1];
237-
let end_offset = offsets[i];
238-
239-
let value_bytes = slice_from_slice(value_buffer, start_offset..end_offset)?;
222+
//
223+
// Since we use offsets to slice into the value buffer, this also verifies all offsets are in-bounds
224+
// and monotonically increasing
225+
let mut offset_iter = map_bytes_to_offsets(offset_buffer, self.header.offset_size);
226+
let mut current_offset = offset_iter.next().unwrap_or(0);
227+
228+
for next_offset in offset_iter {
229+
let value_bytes = slice_from_slice(value_buffer, current_offset..next_offset)?;
240230
Variant::try_new_with_metadata(self.metadata.clone(), value_bytes)?;
231+
current_offset = next_offset;
241232
}
242233

243234
self.validated = true;

parquet-variant/src/variant/metadata.rs

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -237,22 +237,15 @@ impl<'m> VariantMetadata<'m> {
237237
let offsets =
238238
map_bytes_to_offsets(offset_bytes, self.header.offset_size).collect::<Vec<_>>();
239239

240-
// Validate offsets are in-bounds and monotonically increasing.
241-
// Since shallow validation ensures the first and last offsets are in bounds, we can also verify all offsets
242-
// are in-bounds by checking if offsets are monotonically increasing.
243-
let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
244-
if !are_offsets_monotonic {
245-
return Err(ArrowError::InvalidArgumentError(
246-
"offsets not monotonically increasing".to_string(),
247-
));
248-
}
249-
250240
// Verify the string values in the dictionary are UTF-8 encoded strings.
251241
let value_buffer =
252242
string_from_slice(self.bytes, 0, self.first_value_byte as _..self.bytes.len())?;
253243

254244
if self.header.is_sorted {
255245
// Validate the dictionary values are unique and lexicographically sorted
246+
//
247+
// Since we use the offsets to access dictionary values, this also validates
248+
// offsets are in-bounds and monotonically increasing
256249
let are_dictionary_values_unique_and_sorted = (1..offsets.len())
257250
.map(|i| {
258251
let field_range = offsets[i - 1]..offsets[i];
@@ -268,6 +261,18 @@ impl<'m> VariantMetadata<'m> {
268261
"dictionary values are not unique and ordered".to_string(),
269262
));
270263
}
264+
} else {
265+
// Validate offsets are in-bounds and monotonically increasing
266+
//
267+
// Since shallow validation ensures the first and last offsets are in bounds,
268+
// we can also verify all offsets are in-bounds by checking if
269+
// offsets are monotonically increasing
270+
let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
271+
if !are_offsets_monotonic {
272+
return Err(ArrowError::InvalidArgumentError(
273+
"offsets not monotonically increasing".to_string(),
274+
));
275+
}
271276
}
272277

273278
self.validated = true;

parquet-variant/src/variant/object.rs

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,8 @@ impl<'m, 'v> VariantObject<'m, 'v> {
242242
} else {
243243
// The metadata dictionary can't guarantee uniqueness or sortedness, so we have to parse out the corresponding field names
244244
// to check lexicographical order
245+
//
246+
// Since we are probing the metadata dictionary by field id, this also verifies field ids are in-bounds
245247
let are_field_names_sorted = field_ids
246248
.iter()
247249
.map(|&i| self.metadata.get(i))
@@ -253,19 +255,6 @@ impl<'m, 'v> VariantObject<'m, 'v> {
253255
"field names not sorted".to_string(),
254256
));
255257
}
256-
257-
// Since field ids are not guaranteed to be sorted, scan over all field ids
258-
// and check that field ids are less than dictionary size
259-
260-
let are_field_ids_in_bounds = field_ids
261-
.iter()
262-
.all(|&id| id < self.metadata.dictionary_size());
263-
264-
if !are_field_ids_in_bounds {
265-
return Err(ArrowError::InvalidArgumentError(
266-
"field id is not valid".to_string(),
267-
));
268-
}
269258
}
270259

271260
// Validate whether values are valid variant objects

0 commit comments

Comments
 (0)