Improve memory usage for arrow-row -> String/BinaryView when utf8 validation disabled (#7917)

ding-young · alamb · web-flow · commit 079d4f2db87c · 2025-07-29T11:52:51.000-04:00
# Which issue does this PR close? - Related to #6057 . # Rationale for this change As described in above issue, when constructing a `StringViewArray` from rows, we currently store inline strings twice: once through `make_view`, and again in the `values buffer` so that we can validate utf8 in one go. However, this is suboptimal in terms of memory consumption, so ideally, we should avoid placing inline strings into the values buffer when UTF-8 validation is disabled. # What changes are included in this PR? When UTF-8 validation is disabled, this PR modifies the string/bytes view array construction from rows as follows: 1. The capacity of the values buffer is set to accommodate only long strings plus 12 bytes for a single inline string placeholder. 2. All decoded strings are initially appended to the values buffer. 3. If a string turns out to be an inline string, it is included via `make_view`, and then the corresponding inline portion is truncated from the values buffer, ensuring the inline string does not appear twice in the resulting array. # Are these changes tested? 1. copied & modified existing `fuzz_test` to set disable utf8 validation. 2. Run bench & add bench case when array consists of both inline string & long strings # Are there any user-facing changes? No. # Considered alternatives One idea was to support separate buffers for inline strings even when UTF-8 validation is enabled. However, since we need to call `decoded_len()` first to determine the target buffer, this approach can be tricky or inefficient: - For example, precomputing a boolean flag per string to determine which buffer to use would increase temporary memory usage. - Alternatively, appending to the values buffer first and then moving inline strings to a separate buffer would lead to frequent memcpy overhead. Given that datafusion disables UTF-8 validation when using RowConverter, this PR focuses on improving memory efficiency specifically when validation is turned off. --------- Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs
@@ -3101,13 +3101,16 @@ mod tests {
                 }
             }
 
+            // Convert rows produced from convert_columns().
+            // Note: validate_utf8 is set to false since Row is initialized through empty_rows()
             let back = converter.convert_rows(&rows).unwrap();
             for (actual, expected) in back.iter().zip(&arrays) {
                 actual.to_data().validate_full().unwrap();
                 dictionary_eq(actual, expected)
             }
 
-            // Check that we can convert
+            // Check that we can convert rows into ByteArray and then parse, convert it back to array
+            // Note: validate_utf8 is set to true since Row is initialized through RowParser
             let rows = rows.try_into_binary().expect("reasonable size");
             let parser = converter.parser();
             let back = converter
@@ -3238,4 +3241,64 @@ mod tests {
             Ok(_) => panic!("Expected NotYetImplemented error for map data type"),
         }
     }
+
+    #[test]
+    fn test_values_buffer_smaller_when_utf8_validation_disabled() {
+        fn get_values_buffer_len(col: ArrayRef) -> (usize, usize) {
+            // 1. Convert cols into rows
+            let converter = RowConverter::new(vec![SortField::new(DataType::Utf8View)]).unwrap();
+
+            // 2a. Convert rows into colsa (validate_utf8 = false)
+            let rows = converter.convert_columns(&[col]).unwrap();
+            let converted = converter.convert_rows(&rows).unwrap();
+            let unchecked_values_len = converted[0].as_string_view().data_buffers()[0].len();
+
+            // 2b. Convert rows into cols (validate_utf8 = true since Row is initialized through RowParser)
+            let rows = rows.try_into_binary().expect("reasonable size");
+            let parser = converter.parser();
+            let converted = converter
+                .convert_rows(rows.iter().map(|b| parser.parse(b.expect("valid bytes"))))
+                .unwrap();
+            let checked_values_len = converted[0].as_string_view().data_buffers()[0].len();
+            (unchecked_values_len, checked_values_len)
+        }
+
+        // Case1. StringViewArray with inline strings
+        let col = Arc::new(StringViewArray::from_iter([
+            Some("hello"), // short(5)
+            None,          // null
+            Some("short"), // short(5)
+            Some("tiny"),  // short(4)
+        ])) as ArrayRef;
+
+        let (unchecked_values_len, checked_values_len) = get_values_buffer_len(col);
+        // Since there are no long (>12) strings, len of values buffer is 0
+        assert_eq!(unchecked_values_len, 0);
+        // When utf8 validation enabled, values buffer includes inline strings (5+5+4)
+        assert_eq!(checked_values_len, 14);
+
+        // Case2. StringViewArray with long(>12) strings
+        let col = Arc::new(StringViewArray::from_iter([
+            Some("this is a very long string over 12 bytes"),
+            Some("another long string to test the buffer"),
+        ])) as ArrayRef;
+
+        let (unchecked_values_len, checked_values_len) = get_values_buffer_len(col);
+        // Since there are no inline strings, expected length of values buffer is the same
+        assert!(unchecked_values_len > 0);
+        assert_eq!(unchecked_values_len, checked_values_len);
+
+        // Case3. StringViewArray with both short and long strings
+        let col = Arc::new(StringViewArray::from_iter([
+            Some("tiny"),          // 4 (short)
+            Some("thisisexact13"), // 13 (long)
+            None,
+            Some("short"), // 5 (short)
+        ])) as ArrayRef;
+
+        let (unchecked_values_len, checked_values_len) = get_values_buffer_len(col);
+        // Since there is single long string, len of values buffer is 13
+        assert_eq!(unchecked_values_len, 13);
+        assert!(checked_values_len > unchecked_values_len);
+    }
 }
diff --git a/arrow-row/src/variable.rs b/arrow-row/src/variable.rs
@@ -20,7 +20,7 @@ use arrow_array::builder::BufferBuilder;
 use arrow_array::*;
 use arrow_buffer::bit_util::ceil;
 use arrow_buffer::MutableBuffer;
-use arrow_data::ArrayDataBuilder;
+use arrow_data::{ArrayDataBuilder, MAX_INLINE_VIEW_LEN};
 use arrow_schema::{DataType, SortOptions};
 use builder::make_view;
 
@@ -249,9 +249,10 @@ pub fn decode_binary<I: OffsetSizeTrait>(
 fn decode_binary_view_inner(
     rows: &mut [&[u8]],
     options: SortOptions,
-    check_utf8: bool,
+    validate_utf8: bool,
 ) -> BinaryViewArray {
     let len = rows.len();
+    let inline_str_max_len = MAX_INLINE_VIEW_LEN as usize;
 
     let mut null_count = 0;
 
@@ -261,13 +262,33 @@ fn decode_binary_view_inner(
         valid
     });
 
-    let values_capacity: usize = rows.iter().map(|row| decoded_len(row, options)).sum();
+    // If we are validating UTF-8, decode all string values (including short strings)
+    // into the values buffer and validate UTF-8 once. If not validating,
+    // we save memory by only copying long strings to the values buffer, as short strings
+    // will be inlined into the view and do not need to be stored redundantly.
+    let values_capacity = if validate_utf8 {
+        // Capacity for all long and short strings
+        rows.iter().map(|row| decoded_len(row, options)).sum()
+    } else {
+        // Capacity for all long strings plus room for one short string
+        rows.iter().fold(0, |acc, row| {
+            let len = decoded_len(row, options);
+            if len > inline_str_max_len {
+                acc + len
+            } else {
+                acc
+            }
+        }) + inline_str_max_len
+    };
     let mut values = MutableBuffer::new(values_capacity);
-    let mut views = BufferBuilder::<u128>::new(len);
 
+    let mut views = BufferBuilder::<u128>::new(len);
     for row in rows {
         let start_offset = values.len();
         let offset = decode_blocks(row, options, |b| values.extend_from_slice(b));
+        // Measure string length via change in values buffer.
+        // Used to check if decoded value should be truncated (short string) when validate_utf8 is false
+        let decoded_len = values.len() - start_offset;
         if row[0] == null_sentinel(options) {
             debug_assert_eq!(offset, 1);
             debug_assert_eq!(start_offset, values.len());
@@ -282,11 +303,16 @@ fn decode_binary_view_inner(
 
             let view = make_view(val, 0, start_offset as u32);
             views.append(view);
+
+            // truncate inline string in values buffer if validate_utf8 is false
+            if !validate_utf8 && decoded_len <= inline_str_max_len {
+                values.truncate(start_offset);
+            }
         }
         *row = &row[offset..];
     }
 
-    if check_utf8 {
+    if validate_utf8 {
         // the values contains all data, no matter if it is short or long
         // we can validate utf8 in one go.
         std::str::from_utf8(values.as_slice()).unwrap();