Skip to content

Commit 94d51f4

Browse files
authored
perf: add repeat_slice_n_times to MutableBuffer (#8658)
# Which issue does this PR close? N/A # Rationale for this change I want to repeat the same value multiple times in a very fast way which will be used in: - #8653 After this and the pr below is merged will improve the datafusion scalar to array to use this and make it really really fast: - #8656 # What changes are included in this PR? Created a function in `MutableBuffer` to repeat a slice a number of times in a logarithmic way to reduce memcopy calls # Are these changes tested? Yes # Are there any user-facing changes? Yes, and added docs ------- Extracted from: - #8653 Benchmark results on local machine | Slice Length | Repetitions (n) | repeat_slice_n_times | extend_from_slice loop | Speedup | |--------------|-----------------|----------------------|------------------------|---------| | 3 | 3 | 47.092 ns | 41.910 ns | 0.89x | | 3 | 64 | 63.548 ns | 222.29 ns | 3.50x | | 3 | 1024 | 105.57 ns | 3.031 µs | 28.7x | | 3 | 8192 | 405.71 ns | 24.170 µs | 59.6x | | 20 | 3 | 48.437 ns | 46.437 ns | 0.96x | | 20 | 64 | 74.993 ns | 319.04 ns | 4.25x | | 20 | 1024 | 350.94 ns | 4.437 µs | 12.6x | | 20 | 8192 | 2.440 µs | 35.524 µs | 14.6x | | 100 | 3 | 50.369 ns | 47.568 ns | 0.94x | | 100 | 64 | 119.70 ns | 165.37 ns | 1.38x | | 100 | 1024 | 1.734 µs | 2.623 µs | 1.51x | | 100 | 8192 | 10.615 µs | 19.750 µs | 1.86x | these are the results: <details> <summary>Result</summary> ``` MutableBuffer repeat slice/repeat_slice_n_times/slice_len=3 n=3 time: [46.719 ns 47.092 ns 47.453 ns] Found 2 outliers among 100 measurements (2.00%) 1 (1.00%) low mild 1 (1.00%) high mild MutableBuffer repeat slice/extend_from_slice loop/slice_len=3 n=3 time: [41.833 ns 41.910 ns 41.996 ns] Found 11 outliers among 100 measurements (11.00%) 9 (9.00%) high mild 2 (2.00%) high severe MutableBuffer repeat slice/repeat_slice_n_times/slice_len=3 n=64 time: [62.935 ns 63.548 ns 64.183 ns] Found 5 outliers among 100 measurements (5.00%) 5 (5.00%) high mild MutableBuffer repeat slice/extend_from_slice 
loop/slice_len=3 n=64 time: [221.75 ns 222.29 ns 222.86 ns] Found 5 outliers among 100 measurements (5.00%) 3 (3.00%) high mild 2 (2.00%) high severe MutableBuffer repeat slice/repeat_slice_n_times/slice_len=3 n=1024 time: [105.15 ns 105.57 ns 106.01 ns] Found 1 outliers among 100 measurements (1.00%) 1 (1.00%) high severe MutableBuffer repeat slice/extend_from_slice loop/slice_len=3 n=1024 time: [3.0240 µs 3.0308 µs 3.0395 µs] Found 11 outliers among 100 measurements (11.00%) 2 (2.00%) low mild 5 (5.00%) high mild 4 (4.00%) high severe MutableBuffer repeat slice/repeat_slice_n_times/slice_len=3 n=8192 time: [401.57 ns 405.71 ns 409.94 ns] Found 6 outliers among 100 measurements (6.00%) 6 (6.00%) high mild MutableBuffer repeat slice/extend_from_slice loop/slice_len=3 n=8192 time: [24.124 µs 24.170 µs 24.222 µs] Found 5 outliers among 100 measurements (5.00%) 3 (3.00%) high mild 2 (2.00%) high severe MutableBuffer repeat slice/repeat_slice_n_times/slice_len=20 n=3 time: [48.287 ns 48.437 ns 48.606 ns] Found 8 outliers among 100 measurements (8.00%) 5 (5.00%) high mild 3 (3.00%) high severe MutableBuffer repeat slice/extend_from_slice loop/slice_len=20 n=3 time: [46.289 ns 46.437 ns 46.611 ns] Found 6 outliers among 100 measurements (6.00%) 3 (3.00%) high mild 3 (3.00%) high severe MutableBuffer repeat slice/repeat_slice_n_times/slice_len=20 n=64 time: [74.625 ns 74.993 ns 75.395 ns] Found 3 outliers among 100 measurements (3.00%) 3 (3.00%) high mild MutableBuffer repeat slice/extend_from_slice loop/slice_len=20 n=64 time: [318.20 ns 319.04 ns 319.98 ns] Found 8 outliers among 100 measurements (8.00%) 3 (3.00%) high mild 5 (5.00%) high severe MutableBuffer repeat slice/repeat_slice_n_times/slice_len=20 n=1024 time: [346.66 ns 350.94 ns 355.17 ns] Found 3 outliers among 100 measurements (3.00%) 1 (1.00%) low mild 2 (2.00%) high severe MutableBuffer repeat slice/extend_from_slice loop/slice_len=20 n=1024 time: [4.4251 µs 4.4369 µs 4.4506 µs] Found 8 outliers among 100 
measurements (8.00%) 1 (1.00%) low mild 2 (2.00%) high mild 5 (5.00%) high severe MutableBuffer repeat slice/repeat_slice_n_times/slice_len=20 n=8192 time: [2.4336 µs 2.4401 µs 2.4465 µs] Found 2 outliers among 100 measurements (2.00%) 1 (1.00%) high mild 1 (1.00%) high severe MutableBuffer repeat slice/extend_from_slice loop/slice_len=20 n=8192 time: [35.466 µs 35.524 µs 35.589 µs] Found 4 outliers among 100 measurements (4.00%) 1 (1.00%) low mild 2 (2.00%) high mild 1 (1.00%) high severe MutableBuffer repeat slice/repeat_slice_n_times/slice_len=100 n=3 time: [50.209 ns 50.369 ns 50.530 ns] Found 5 outliers among 100 measurements (5.00%) 5 (5.00%) high mild MutableBuffer repeat slice/extend_from_slice loop/slice_len=100 n=3 time: [47.439 ns 47.568 ns 47.701 ns] Found 2 outliers among 100 measurements (2.00%) 2 (2.00%) high mild MutableBuffer repeat slice/repeat_slice_n_times/slice_len=100 n=64 time: [117.77 ns 119.70 ns 122.00 ns] Found 12 outliers among 100 measurements (12.00%) 7 (7.00%) high mild 5 (5.00%) high severe MutableBuffer repeat slice/extend_from_slice loop/slice_len=100 n=64 time: [164.88 ns 165.37 ns 166.07 ns] Found 6 outliers among 100 measurements (6.00%) 5 (5.00%) high mild 1 (1.00%) high severe MutableBuffer repeat slice/repeat_slice_n_times/slice_len=100 n=1024 time: [1.7278 µs 1.7335 µs 1.7398 µs] Found 7 outliers among 100 measurements (7.00%) 1 (1.00%) low mild 5 (5.00%) high mild 1 (1.00%) high severe MutableBuffer repeat slice/extend_from_slice loop/slice_len=100 n=1024 time: [2.6176 µs 2.6232 µs 2.6305 µs] Found 5 outliers among 100 measurements (5.00%) 1 (1.00%) high mild 4 (4.00%) high severe MutableBuffer repeat slice/repeat_slice_n_times/slice_len=100 n=8192 time: [10.583 µs 10.615 µs 10.649 µs] Found 3 outliers among 100 measurements (3.00%) 3 (3.00%) high mild MutableBuffer repeat slice/extend_from_slice loop/slice_len=100 n=8192 time: [19.471 µs 19.750 µs 20.185 µs] Found 9 outliers among 100 measurements (9.00%) 2 (2.00%) high 
mild 7 (7.00%) high severe ``` </details>
1 parent f0f6128 commit 94d51f4

File tree

3 files changed

+275
-0
lines changed

3 files changed

+275
-0
lines changed

arrow-buffer/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,3 +59,8 @@ harness = false
5959
[[bench]]
6060
name = "offset"
6161
harness = false
62+
63+
[[bench]]
64+
name = "mutable_buffer_repeat_slice"
65+
harness = false
66+
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use arrow_buffer::Buffer;
19+
use criterion::*;
20+
use rand::distr::Alphanumeric;
21+
use rand::rngs::StdRng;
22+
use rand::{Rng, SeedableRng};
23+
use std::hint;
24+
25+
fn criterion_benchmark(c: &mut Criterion) {
26+
let mut group = c.benchmark_group("MutableBuffer repeat slice");
27+
let mut rng = StdRng::seed_from_u64(42);
28+
29+
for slice_length in [3, 20, 100] {
30+
let slice_to_repeat: Vec<u8> = hint::black_box(
31+
(&mut rng)
32+
.sample_iter(&Alphanumeric)
33+
.take(slice_length)
34+
.collect(),
35+
);
36+
let slice_to_repeat: &[u8] = slice_to_repeat.as_ref();
37+
38+
for repeat_count in [3, 64, 1024, 8192] {
39+
let parameter_string = format!("slice_len={slice_length} n={repeat_count}");
40+
41+
group.bench_with_input(
42+
BenchmarkId::new("repeat_slice_n_times", &parameter_string),
43+
&(repeat_count),
44+
|b, &repeat_count| {
45+
b.iter(|| {
46+
let mut mutable_buffer = arrow_buffer::MutableBuffer::with_capacity(0);
47+
48+
mutable_buffer.repeat_slice_n_times(slice_to_repeat, repeat_count);
49+
50+
let buffer: Buffer = mutable_buffer.into();
51+
52+
buffer
53+
})
54+
},
55+
);
56+
group.bench_with_input(
57+
BenchmarkId::new("extend_from_slice loop", &parameter_string),
58+
&(repeat_count),
59+
|b, &repeat_count| {
60+
b.iter(|| {
61+
let mut mutable_buffer = arrow_buffer::MutableBuffer::with_capacity(
62+
size_of_val(slice_to_repeat) * repeat_count,
63+
);
64+
65+
for _ in 0..repeat_count {
66+
mutable_buffer.extend_from_slice(slice_to_repeat);
67+
}
68+
69+
let buffer: Buffer = mutable_buffer.into();
70+
71+
buffer
72+
})
73+
},
74+
);
75+
}
76+
}
77+
}
78+
79+
criterion_group!(benches, criterion_benchmark);
80+
criterion_main!(benches);

arrow-buffer/src/buffer/mutable.rs

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,75 @@ impl MutableBuffer {
222222
}
223223
}
224224

225+
/// Adding to this mutable buffer `slice_to_repeat` repeated `repeat_count` times.
226+
///
227+
/// # Example
228+
///
229+
/// ## Repeat the same string bytes multiple times
230+
/// ```
231+
/// # use arrow_buffer::buffer::MutableBuffer;
232+
/// let mut buffer = MutableBuffer::new(0);
233+
/// let bytes_to_repeat = b"ab";
234+
/// buffer.repeat_slice_n_times(bytes_to_repeat, 3);
235+
/// assert_eq!(buffer.as_slice(), b"ababab");
236+
/// ```
237+
pub fn repeat_slice_n_times<T: ArrowNativeType>(
238+
&mut self,
239+
slice_to_repeat: &[T],
240+
repeat_count: usize,
241+
) {
242+
if repeat_count == 0 || slice_to_repeat.is_empty() {
243+
return;
244+
}
245+
246+
let bytes_to_repeat = size_of_val(slice_to_repeat);
247+
248+
// Ensure capacity
249+
self.reserve(repeat_count * bytes_to_repeat);
250+
251+
// Save the length before we do all the copies to know where to start from
252+
let length_before = self.len;
253+
254+
// Copy the initial slice once so we can use doubling strategy on it
255+
self.extend_from_slice(slice_to_repeat);
256+
257+
// This tracks how much bytes we have added by repeating so far
258+
let added_repeats_length = bytes_to_repeat;
259+
assert_eq!(
260+
self.len - length_before,
261+
added_repeats_length,
262+
"should copy exactly the same number of bytes"
263+
);
264+
265+
// Number of times the slice was repeated
266+
let mut already_repeated_times = 1;
267+
268+
// We will use doubling strategy to fill the buffer in log(repeat_count) steps
269+
while already_repeated_times < repeat_count {
270+
// How many slices can we copy in this iteration
271+
// (either double what we have, or just the remaining ones)
272+
let number_of_slices_to_copy =
273+
already_repeated_times.min(repeat_count - already_repeated_times);
274+
let number_of_bytes_to_copy = number_of_slices_to_copy * bytes_to_repeat;
275+
276+
unsafe {
277+
// Get to the start of the data before we started copying anything
278+
let src = self.data.as_ptr().add(length_before) as *const u8;
279+
280+
// Go to the current location to copy to (end of current data)
281+
let dst = self.data.as_ptr().add(self.len);
282+
283+
// SAFETY: the pointers are not overlapping as there is `number_of_bytes_to_copy` or less between them
284+
std::ptr::copy_nonoverlapping(src, dst, number_of_bytes_to_copy)
285+
}
286+
287+
// Advance the length by the amount of data we just copied (doubled)
288+
self.len += number_of_bytes_to_copy;
289+
290+
already_repeated_times += number_of_slices_to_copy;
291+
}
292+
}
293+
225294
#[cold]
226295
fn reallocate(&mut self, capacity: usize) {
227296
let new_layout = Layout::from_size_align(capacity, self.layout.align()).unwrap();
@@ -1184,4 +1253,125 @@ mod tests {
11841253
assert_eq!(pool.used(), 0);
11851254
}
11861255
}
1256+
1257+
/// Builds the reference result for the repeat tests: `slice_to_repeat`
/// appended `repeat_count` times, one `extend_from_slice` call per repetition.
fn create_expected_repeated_slice<T: ArrowNativeType>(
    slice_to_repeat: &[T],
    repeat_count: usize,
) -> Buffer {
    let capacity = size_of_val(slice_to_repeat) * repeat_count;
    let mut reference = MutableBuffer::new(capacity);

    // Deliberately avoids `repeat_slice_n_times`, since that is the function under test
    (0..repeat_count).for_each(|_| reference.extend_from_slice(slice_to_repeat));

    reference.into()
}
1268+
1269+
// Helper to test a specific repeat count with various slice sizes
1270+
fn test_repeat_count<T: ArrowNativeType + PartialEq + std::fmt::Debug>(
1271+
repeat_count: usize,
1272+
test_data: &[T],
1273+
) {
1274+
let mut buffer = MutableBuffer::new(0);
1275+
buffer.repeat_slice_n_times(test_data, repeat_count);
1276+
1277+
let expected = create_expected_repeated_slice(test_data, repeat_count);
1278+
let result: Buffer = buffer.into();
1279+
1280+
assert_eq!(
1281+
result,
1282+
expected,
1283+
"Failed for repeat_count={}, slice_len={}",
1284+
repeat_count,
1285+
test_data.len()
1286+
);
1287+
}
1288+
1289+
#[test]
fn test_repeat_slice_count_edge_cases() {
    // Repeating an empty slice any number of times
    let empty: &[i32] = &[];
    test_repeat_count(100, empty);

    // Repeating a non-empty slice zero times
    let non_empty = [1i32, 2, 3];
    test_repeat_count(0, &non_empty);
}
1297+
1298+
#[test]
fn test_small_repeats_counts() {
    // Cover every repeat count from 1 through 10 so that any special handling
    // of small counts (including the single-copy case) is exercised
    let data = &[1u8, 2, 3, 4, 5];

    for repeat_count in 1..=10 {
        test_repeat_count(repeat_count, data);
    }
}
1307+
1308+
#[test]
fn test_different_size_of_i32_repeat_slice() {
    let multiple_items: &[i32] = &[1, 2, 3];
    let single_item: &[i32] = &[42];

    for slice in [multiple_items, single_item] {
        for exponent in 1..=9 {
            // Probe just below, exactly at, and just above each power of two
            let power_of_two = 2_usize.pow(exponent);
            for repeat_count in [power_of_two - 1, power_of_two, power_of_two + 1] {
                test_repeat_count(repeat_count, slice);
            }
        }
    }
}
1322+
1323+
#[test]
fn test_different_size_of_u8_repeat_slice() {
    let multiple_items: &[u8] = &[1, 2, 3];
    let single_item: &[u8] = &[10];

    for slice in [multiple_items, single_item] {
        for exponent in 1..=9 {
            // Probe just below, exactly at, and just above each power of two
            let power_of_two = 2_usize.pow(exponent);
            for repeat_count in [power_of_two - 1, power_of_two, power_of_two + 1] {
                test_repeat_count(repeat_count, slice);
            }
        }
    }
}
1337+
1338+
#[test]
fn test_different_size_of_u16_repeat_slice() {
    let multiple_items: &[u16] = &[1, 2, 3];
    let single_item: &[u16] = &[10];

    for slice in [multiple_items, single_item] {
        for exponent in 1..=9 {
            // Probe just below, exactly at, and just above each power of two
            let power_of_two = 2_usize.pow(exponent);
            for repeat_count in [power_of_two - 1, power_of_two, power_of_two + 1] {
                test_repeat_count(repeat_count, slice);
            }
        }
    }
}
1352+
1353+
#[test]
fn test_various_slice_lengths() {
    // One fixed repeat count, deliberately not a power of two, applied to
    // slices of many different lengths
    const REPEAT_COUNT: usize = 37;

    // Single element
    test_repeat_count(REPEAT_COUNT, &[42i32]);

    // Small slices: [1, 2] up to [1, 2, 3, 4, 5]
    for last_value in 2..=5 {
        let small: Vec<i32> = (1..=last_value).collect();
        test_repeat_count(REPEAT_COUNT, &small);
    }

    // Larger slices holding 0..len
    for len in [10, 100, 1000] {
        let large: Vec<i32> = (0..len).collect();
        test_repeat_count(REPEAT_COUNT, &large);
    }
}
11871377
}

0 commit comments

Comments
 (0)