Skip to content

Commit 079d4f2

Browse files
ding-youngalamb
andauthored
Improve memory usage for arrow-row -> String/BinaryView when utf8 validation disabled (#7917)
# Which issue does this PR close? - Related to #6057 . # Rationale for this change As described in above issue, when constructing a `StringViewArray` from rows, we currently store inline strings twice: once through `make_view`, and again in the `values buffer` so that we can validate utf8 in one go. However, this is suboptimal in terms of memory consumption, so ideally, we should avoid placing inline strings into the values buffer when UTF-8 validation is disabled. # What changes are included in this PR? When UTF-8 validation is disabled, this PR modifies the string/bytes view array construction from rows as follows: 1. The capacity of the values buffer is set to accommodate only long strings plus 12 bytes for a single inline string placeholder. 2. All decoded strings are initially appended to the values buffer. 3. If a string turns out to be an inline string, it is included via `make_view`, and then the corresponding inline portion is truncated from the values buffer, ensuring the inline string does not appear twice in the resulting array. # Are these changes tested? 1. copied & modified existing `fuzz_test` to set disable utf8 validation. 2. Run bench & add bench case when array consists of both inline string & long strings # Are there any user-facing changes? No. # Considered alternatives One idea was to support separate buffers for inline strings even when UTF-8 validation is enabled. However, since we need to call `decoded_len()` first to determine the target buffer, this approach can be tricky or inefficient: - For example, precomputing a boolean flag per string to determine which buffer to use would increase temporary memory usage. - Alternatively, appending to the values buffer first and then moving inline strings to a separate buffer would lead to frequent memcpy overhead. Given that datafusion disables UTF-8 validation when using RowConverter, this PR focuses on improving memory efficiency specifically when validation is turned off. --------- Co-authored-by: Andrew Lamb <[email protected]>
1 parent d634ac8 commit 079d4f2

File tree

2 files changed

+95
-6
lines changed

2 files changed

+95
-6
lines changed

arrow-row/src/lib.rs

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3101,13 +3101,16 @@ mod tests {
31013101
}
31023102
}
31033103

3104+
// Convert rows produced from convert_columns().
3105+
// Note: validate_utf8 is set to false since Row is initialized through empty_rows()
31043106
let back = converter.convert_rows(&rows).unwrap();
31053107
for (actual, expected) in back.iter().zip(&arrays) {
31063108
actual.to_data().validate_full().unwrap();
31073109
dictionary_eq(actual, expected)
31083110
}
31093111

3110-
// Check that we can convert
3112+
// Check that we can convert rows into ByteArray and then parse, convert it back to array
3113+
// Note: validate_utf8 is set to true since Row is initialized through RowParser
31113114
let rows = rows.try_into_binary().expect("reasonable size");
31123115
let parser = converter.parser();
31133116
let back = converter
@@ -3238,4 +3241,64 @@ mod tests {
32383241
Ok(_) => panic!("Expected NotYetImplemented error for map data type"),
32393242
}
32403243
}
3244+
3245+
#[test]
3246+
fn test_values_buffer_smaller_when_utf8_validation_disabled() {
3247+
fn get_values_buffer_len(col: ArrayRef) -> (usize, usize) {
3248+
// 1. Convert cols into rows
3249+
let converter = RowConverter::new(vec![SortField::new(DataType::Utf8View)]).unwrap();
3250+
3251+
// 2a. Convert rows into colsa (validate_utf8 = false)
3252+
let rows = converter.convert_columns(&[col]).unwrap();
3253+
let converted = converter.convert_rows(&rows).unwrap();
3254+
let unchecked_values_len = converted[0].as_string_view().data_buffers()[0].len();
3255+
3256+
// 2b. Convert rows into cols (validate_utf8 = true since Row is initialized through RowParser)
3257+
let rows = rows.try_into_binary().expect("reasonable size");
3258+
let parser = converter.parser();
3259+
let converted = converter
3260+
.convert_rows(rows.iter().map(|b| parser.parse(b.expect("valid bytes"))))
3261+
.unwrap();
3262+
let checked_values_len = converted[0].as_string_view().data_buffers()[0].len();
3263+
(unchecked_values_len, checked_values_len)
3264+
}
3265+
3266+
// Case1. StringViewArray with inline strings
3267+
let col = Arc::new(StringViewArray::from_iter([
3268+
Some("hello"), // short(5)
3269+
None, // null
3270+
Some("short"), // short(5)
3271+
Some("tiny"), // short(4)
3272+
])) as ArrayRef;
3273+
3274+
let (unchecked_values_len, checked_values_len) = get_values_buffer_len(col);
3275+
// Since there are no long (>12) strings, len of values buffer is 0
3276+
assert_eq!(unchecked_values_len, 0);
3277+
// When utf8 validation enabled, values buffer includes inline strings (5+5+4)
3278+
assert_eq!(checked_values_len, 14);
3279+
3280+
// Case2. StringViewArray with long(>12) strings
3281+
let col = Arc::new(StringViewArray::from_iter([
3282+
Some("this is a very long string over 12 bytes"),
3283+
Some("another long string to test the buffer"),
3284+
])) as ArrayRef;
3285+
3286+
let (unchecked_values_len, checked_values_len) = get_values_buffer_len(col);
3287+
// Since there are no inline strings, expected length of values buffer is the same
3288+
assert!(unchecked_values_len > 0);
3289+
assert_eq!(unchecked_values_len, checked_values_len);
3290+
3291+
// Case3. StringViewArray with both short and long strings
3292+
let col = Arc::new(StringViewArray::from_iter([
3293+
Some("tiny"), // 4 (short)
3294+
Some("thisisexact13"), // 13 (long)
3295+
None,
3296+
Some("short"), // 5 (short)
3297+
])) as ArrayRef;
3298+
3299+
let (unchecked_values_len, checked_values_len) = get_values_buffer_len(col);
3300+
// Since there is single long string, len of values buffer is 13
3301+
assert_eq!(unchecked_values_len, 13);
3302+
assert!(checked_values_len > unchecked_values_len);
3303+
}
32413304
}

arrow-row/src/variable.rs

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ use arrow_array::builder::BufferBuilder;
2020
use arrow_array::*;
2121
use arrow_buffer::bit_util::ceil;
2222
use arrow_buffer::MutableBuffer;
23-
use arrow_data::ArrayDataBuilder;
23+
use arrow_data::{ArrayDataBuilder, MAX_INLINE_VIEW_LEN};
2424
use arrow_schema::{DataType, SortOptions};
2525
use builder::make_view;
2626

@@ -249,9 +249,10 @@ pub fn decode_binary<I: OffsetSizeTrait>(
249249
fn decode_binary_view_inner(
250250
rows: &mut [&[u8]],
251251
options: SortOptions,
252-
check_utf8: bool,
252+
validate_utf8: bool,
253253
) -> BinaryViewArray {
254254
let len = rows.len();
255+
let inline_str_max_len = MAX_INLINE_VIEW_LEN as usize;
255256

256257
let mut null_count = 0;
257258

@@ -261,13 +262,33 @@ fn decode_binary_view_inner(
261262
valid
262263
});
263264

264-
let values_capacity: usize = rows.iter().map(|row| decoded_len(row, options)).sum();
265+
// If we are validating UTF-8, decode all string values (including short strings)
266+
// into the values buffer and validate UTF-8 once. If not validating,
267+
// we save memory by only copying long strings to the values buffer, as short strings
268+
// will be inlined into the view and do not need to be stored redundantly.
269+
let values_capacity = if validate_utf8 {
270+
// Capacity for all long and short strings
271+
rows.iter().map(|row| decoded_len(row, options)).sum()
272+
} else {
273+
// Capacity for all long strings plus room for one short string
274+
rows.iter().fold(0, |acc, row| {
275+
let len = decoded_len(row, options);
276+
if len > inline_str_max_len {
277+
acc + len
278+
} else {
279+
acc
280+
}
281+
}) + inline_str_max_len
282+
};
265283
let mut values = MutableBuffer::new(values_capacity);
266-
let mut views = BufferBuilder::<u128>::new(len);
267284

285+
let mut views = BufferBuilder::<u128>::new(len);
268286
for row in rows {
269287
let start_offset = values.len();
270288
let offset = decode_blocks(row, options, |b| values.extend_from_slice(b));
289+
// Measure string length via change in values buffer.
290+
// Used to check if decoded value should be truncated (short string) when validate_utf8 is false
291+
let decoded_len = values.len() - start_offset;
271292
if row[0] == null_sentinel(options) {
272293
debug_assert_eq!(offset, 1);
273294
debug_assert_eq!(start_offset, values.len());
@@ -282,11 +303,16 @@ fn decode_binary_view_inner(
282303

283304
let view = make_view(val, 0, start_offset as u32);
284305
views.append(view);
306+
307+
// truncate inline string in values buffer if validate_utf8 is false
308+
if !validate_utf8 && decoded_len <= inline_str_max_len {
309+
values.truncate(start_offset);
310+
}
285311
}
286312
*row = &row[offset..];
287313
}
288314

289-
if check_utf8 {
315+
if validate_utf8 {
290316
// the values contains all data, no matter if it is short or long
291317
// we can validate utf8 in one go.
292318
std::str::from_utf8(values.as_slice()).unwrap();

0 commit comments

Comments
 (0)