diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
index 271ba6ddcff5..296970e44f86 100644
--- a/datafusion/common/src/config.rs
+++ b/datafusion/common/src/config.rs
@@ -517,6 +517,22 @@ config_namespace! {
         /// batches and merged.
         pub sort_in_place_threshold_bytes: usize, default = 1024 * 1024
 
+        /// Maximum size in bytes for individual spill files before rotating to a new file.
+        ///
+        /// When operators spill data to disk (e.g., RepartitionExec), they write
+        /// multiple batches to the same file until this size limit is reached, then rotate
+        /// to a new file. This reduces syscall overhead compared to one-file-per-batch
+        /// while preventing files from growing too large.
+        ///
+        /// A larger value reduces file creation overhead but may hold more disk space.
+        /// A smaller value creates more files but allows finer-grained space reclamation
+        /// as files can be deleted once fully consumed.
+        ///
+        /// Not all operators support this feature; some may create spill files larger than the limit.
+        ///
+        /// Default: 128 MB
+        pub max_spill_file_size_bytes: usize, default = 128 * 1024 * 1024
+
         /// Number of files to read in parallel when inferring schema and statistics
         pub meta_fetch_concurrency: usize, default = 32
 
diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs
index 8174f71c31af..17040d2ac4a8 100644
--- a/datafusion/physical-plan/src/repartition/mod.rs
+++ b/datafusion/physical-plan/src/repartition/mod.rs
@@ -39,6 +39,7 @@ use crate::repartition::distributor_channels::{
 };
 use crate::sorts::streaming_merge::StreamingMergeBuilder;
 use crate::spill::spill_manager::SpillManager;
+use crate::spill::spill_pool::SpillPool;
 use crate::stream::RecordBatchStreamAdapter;
 use crate::{DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, Statistics};
 
@@ -51,7 +52,6 @@ use datafusion_common::utils::transpose;
 use datafusion_common::{internal_err, ColumnStatistics, HashMap};
 use datafusion_common::{not_impl_err, DataFusionError, Result};
 use datafusion_common_runtime::SpawnedTask;
-use datafusion_execution::disk_manager::RefCountedTempFile;
 use datafusion_execution::memory_pool::MemoryConsumer;
 use datafusion_execution::TaskContext;
 use datafusion_physical_expr::{EquivalenceProperties, PhysicalExpr};
@@ -73,21 +73,24 @@ mod distributor_channels;
 enum RepartitionBatch {
     /// Batch held in memory (counts against memory reservation)
     Memory(RecordBatch),
-    /// Batch spilled to disk (one file per batch for queue semantics)
-    /// File automatically deleted when dropped via reference counting
-    /// The size field stores the original batch size for validation when reading back
-    Spilled {
-        spill_file: RefCountedTempFile,
-        size: usize,
-    },
+    /// Marker indicating a batch was spilled to the partition's SpillPool
+    /// The actual batch can be retrieved by reading from the SpillPoolStream
+    Spilled,
 }
 
 type MaybeBatch = Option<Result<RepartitionBatch>>;
 type InputPartitionsToCurrentPartitionSender = Vec<DistributionSender<MaybeBatch>>;
 type InputPartitionsToCurrentPartitionReceiver = Vec<DistributionReceiver<MaybeBatch>>;
 
+/// Output channel with its associated memory reservation and spill pool
+#[derive(Clone)]
+struct OutputChannel {
+    sender: DistributionSender<MaybeBatch>,
+    reservation: SharedMemoryReservation,
+    spill_pool: Arc<Mutex<SpillPool>>,
+}
+
 /// Channels and resources for a single output partition
-#[derive(Debug)]
 struct PartitionChannels {
     /// Senders for each input partition to send data to this output partition
     tx: InputPartitionsToCurrentPartitionSender,
@@ -95,20 +98,29 @@ struct PartitionChannels {
     rx: InputPartitionsToCurrentPartitionReceiver,
     /// Memory reservation for this output partition
     reservation: SharedMemoryReservation,
-    /// Spill manager for handling disk spills for this output partition
-    spill_manager: Arc<SpillManager>,
+    /// SpillPools for batched spilling - one per input partition (FIFO semantics)
+    /// Each (input, output) pair gets its own SpillPool to maintain proper ordering
+    spill_pools: Vec<Arc<Mutex<SpillPool>>>,
 }
 
-#[derive(Debug)]
 struct ConsumingInputStreamsState {
     /// Channels for sending batches from input partitions to output partitions.
     /// Key is the partition number.
     channels: HashMap<usize, PartitionChannels>,
-    /// Helper that ensures that that background job is killed once it is no longer needed.
+    /// Helper that ensures that background jobs are killed once they are no longer needed.
     abort_helper: Arc<Vec<SpawnedTask<()>>>,
 }
 
+impl Debug for ConsumingInputStreamsState {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("ConsumingInputStreamsState")
+            .field("num_channels", &self.channels.len())
+            .field("abort_helper", &self.abort_helper)
+            .finish()
+    }
+}
+
 /// Inner state of [`RepartitionExec`].
 enum RepartitionExecState {
     /// Not initialized yet. This is the default state stored in the RepartitionExec node
@@ -171,6 +183,7 @@ impl RepartitionExecState {
         Ok(())
     }
 
+    #[expect(clippy::too_many_arguments)]
     fn consume_input_streams(
         &mut self,
         input: Arc<dyn ExecutionPlan>,
@@ -179,6 +192,7 @@
         preserve_order: bool,
         name: String,
         context: Arc<TaskContext>,
+        spill_manager: SpillManager,
     ) -> Result<&mut ConsumingInputStreamsState> {
         let streams_and_metrics = match self {
             RepartitionExecState::NotInitialized => {
@@ -202,6 +216,8 @@
         let num_input_partitions = streams_and_metrics.len();
         let num_output_partitions = partitioning.partition_count();
 
+        let spill_manager = Arc::new(spill_manager);
+
         let (txs, rxs) = if preserve_order {
             let (txs, rxs) =
                 partition_aware_channels(num_input_partitions, num_output_partitions);
@@ -230,19 +246,29 @@
                     .with_can_spill(true)
                     .register(context.memory_pool()),
             ));
-            let spill_metrics = SpillMetrics::new(&metrics, partition);
-            let spill_manager = Arc::new(SpillManager::new(
-                Arc::clone(&context.runtime_env()),
-                spill_metrics,
-                input.schema(),
-            ));
+
+            // Create one SpillPool per input partition for this output partition
+            // This ensures proper FIFO ordering within each (input, output) pair
+            let max_file_size = context
+                .session_config()
+                .options()
+                .execution
+                .max_spill_file_size_bytes;
+            let spill_pools: Vec<_> = (0..num_input_partitions)
+                .map(|_| {
+                    let spill_pool =
+                        SpillPool::new(max_file_size, Arc::clone(&spill_manager));
+                    Arc::new(Mutex::new(spill_pool))
+                })
+                .collect();
+
             channels.insert(
                 partition,
                 PartitionChannels {
                     tx,
                     rx,
                     reservation,
-                    spill_manager,
+                    spill_pools,
                 },
             );
         }
@@ -257,11 +283,11 @@
                 .map(|(partition, channels)| {
                     (
                         *partition,
-                        (
-                            channels.tx[i].clone(),
-                            Arc::clone(&channels.reservation),
-                            Arc::clone(&channels.spill_manager),
-                        ),
+                        OutputChannel {
+                            sender: channels.tx[i].clone(),
+                            reservation: Arc::clone(&channels.reservation),
+                            spill_pool: Arc::clone(&channels.spill_pools[i]),
+                        },
                     )
                 })
                 .collect();
@@ -278,9 +304,7 @@
             let wait_for_task = SpawnedTask::spawn(RepartitionExec::wait_for_task(
                 input_task,
                 txs.into_iter()
-                    .map(|(partition, (tx, _reservation, _spill_manager))| {
-                        (partition, tx)
-                    })
+                    .map(|(partition, channel)| (partition, channel.sender))
                    .collect(),
             ));
             spawned_tasks.push(wait_for_task);
@@ -590,7 +614,7 @@ impl RepartitionExec {
         &self.cache.partitioning
     }
 
-    /// Get preserve_order flag of the RepartitionExecutor
+    /// Get preserve_order flag of the RepartitionExec
     /// `true` means `SortPreservingRepartitionExec`, `false` means `RepartitionExec`
     pub fn preserve_order(&self) -> bool {
         self.preserve_order
@@ -696,6 +720,8 @@ impl ExecutionPlan for RepartitionExec {
             partition
         );
 
+        let spill_metrics = SpillMetrics::new(&self.metrics, partition);
+
         let input = Arc::clone(&self.input);
         let partitioning = self.partitioning().clone();
         let metrics = self.metrics.clone();
@@ -704,6 +730,12 @@
         let schema = self.schema();
         let schema_captured = Arc::clone(&schema);
 
+        let spill_manager = SpillManager::new(
+            Arc::clone(&context.runtime_env()),
+            spill_metrics,
+            input.schema(),
+        );
+
         // Get existing ordering to use for merging
         let sort_exprs = self.sort_exprs().cloned();
 
@@ -721,7 +753,7 @@
             let num_input_partitions = input.output_partitioning().partition_count();
 
             // lock scope
-            let (mut rx, reservation, spill_manager, abort_helper) = {
+            let (mut rx, reservation, spill_pools, abort_helper) = {
                 // lock mutexes
                 let mut state = state.lock();
                 let state = state.consume_input_streams(
@@ -731,6 +763,7 @@
                     preserve_order,
                     name.clone(),
                     Arc::clone(&context),
+                    spill_manager.clone(),
                 )?;
 
                 // now return stream for the specified *output* partition which will
@@ -738,7 +771,7 @@
                 let PartitionChannels {
                     rx,
                     reservation,
-                    spill_manager,
+                    spill_pools,
                     ..
                 } = state
                     .channels
@@ -748,7 +781,7 @@
                 (
                     rx,
                     reservation,
-                    spill_manager,
+                    spill_pools,
                     Arc::clone(&state.abort_helper),
                 )
             };
@@ -759,16 +792,22 @@
             if preserve_order {
                 // Store streams from all the input partitions:
+                // Each input partition gets its own SpillPool to maintain proper FIFO ordering
                 let input_streams = rx
                     .into_iter()
-                    .map(|receiver| {
+                    .enumerate()
+                    .map(|(idx, receiver)| {
+                        let spill_pool = Arc::clone(&spill_pools[idx]);
+                        let spill_stream = SpillPool::reader(Arc::clone(&spill_pool));
+
                         Box::pin(PerPartitionStream {
                             schema: Arc::clone(&schema_captured),
                             receiver,
                             _drop_helper: Arc::clone(&abort_helper),
                             reservation: Arc::clone(&reservation),
-                            spill_manager: Arc::clone(&spill_manager),
-                            state: RepartitionStreamState::ReceivingFromChannel,
+                            spill_pool,
+                            spill_stream,
+                            input_finished: false,
                         }) as SendableRecordBatchStream
                     })
                    .collect::<Vec<_>>();
@@ -788,8 +827,13 @@
                     .with_batch_size(context.session_config().batch_size())
                     .with_fetch(fetch)
                     .with_reservation(merge_reservation)
+                    .with_spill_manager(spill_manager)
                     .build()
             } else {
+                // Non-preserve-order case: single input stream, so use the first SpillPool
+                let spill_pool = Arc::clone(&spill_pools[0]);
+                let spill_stream = SpillPool::reader(Arc::clone(&spill_pool));
+
                 Ok(Box::pin(RepartitionStream {
                     num_input_partitions,
                     num_input_partitions_processed: 0,
@@ -797,8 +841,9 @@
                     input: rx.swap_remove(0),
                     _drop_helper: abort_helper,
                     reservation,
-                    spill_manager,
-                    state: RepartitionStreamState::ReceivingFromChannel,
+                    spill_pool,
+                    spill_stream,
+                    all_inputs_finished: false,
                 }) as SendableRecordBatchStream)
             }
         })
@@ -1034,17 +1079,10 @@ impl RepartitionExec {
 
     /// Pulls data from the specified input plan, feeding it to the
     /// output partitions based on the desired partitioning
     ///
-    /// txs hold the output sending channels for each output partition
+    /// `output_channels` holds the output sending channels for each output partition
     async fn pull_from_input(
         mut stream: SendableRecordBatchStream,
-        mut output_channels: HashMap<
-            usize,
-            (
-                DistributionSender<MaybeBatch>,
-                SharedMemoryReservation,
-                Arc<SpillManager>,
-            ),
-        >,
+        mut output_channels: HashMap<usize, OutputChannel>,
         partitioning: Partitioning,
         metrics: RepartitionMetrics,
     ) -> Result<()> {
@@ -1076,37 +1114,35 @@
                 let timer = metrics.send_time[partition].timer();
                 // if there is still a receiver, send to it
-                if let Some((tx, reservation, spill_manager)) =
-                    output_channels.get_mut(&partition)
-                {
+                if let Some(channel) = output_channels.get_mut(&partition) {
                     let (batch_to_send, is_memory_batch) =
-                        match reservation.lock().try_grow(size) {
+                        match channel.reservation.lock().try_grow(size) {
                             Ok(_) => {
                                 // Memory available - send in-memory batch
                                 (RepartitionBatch::Memory(batch), true)
                             }
                             Err(_) => {
-                                // We're memory limited - spill this single batch to its own file
-                                let spill_file = spill_manager
-                                    .spill_record_batch_and_finish(
-                                        &[batch],
-                                        &format!(
-                                            "RepartitionExec spill partition {partition}"
-                                        ),
-                                    )?
-                                    // Note that we handled empty batch above, so this is safe
-                                    .expect("non-empty batch should produce spill file");
-
-                                // Store size for validation when reading back
-                                (RepartitionBatch::Spilled { spill_file, size }, false)
+                                // We're memory limited - spill to SpillPool
+                                // SpillPool handles file handle reuse and rotation
+                                {
+                                    let mut pool = channel.spill_pool.lock();
+                                    pool.push_batch(&batch)?;
+                                    // Flush immediately to make the batch available for reading
+                                    // This is necessary for order-preserving repartition where
+                                    // the reader needs immediate access to spilled data
+                                    pool.flush()?;
+                                }
+
+                                // Send marker indicating batch was spilled
+                                (RepartitionBatch::Spilled, false)
                             }
                         };
 
-                    if tx.send(Some(Ok(batch_to_send))).await.is_err() {
+                    if channel.sender.send(Some(Ok(batch_to_send))).await.is_err() {
                        // If the other end has hung up, it was an early shutdown (e.g. LIMIT)
                        // Only shrink memory if it was a memory batch
                        if is_memory_batch {
-                            reservation.lock().shrink(size);
+                            channel.reservation.lock().shrink(size);
                        }
                        output_channels.remove(&partition);
                    }
@@ -1188,13 +1224,6 @@
         }
     }
 
-enum RepartitionStreamState {
-    /// Waiting for next item from channel
-    ReceivingFromChannel,
-    /// Reading a spilled batch from disk (stream reads via tokio::fs)
-    ReadingSpilledBatch(SendableRecordBatchStream),
-}
-
 struct RepartitionStream {
     /// Number of input partitions that will be sending batches to this output channel
     num_input_partitions: usize,
@@ -1214,11 +1243,14 @@ struct RepartitionStream {
     /// Memory reservation.
     reservation: SharedMemoryReservation,
 
-    /// Spill manager for reading spilled batches
-    spill_manager: Arc<SpillManager>,
+    /// SpillPool for batched spilling with FIFO semantics (shared for writing)
+    spill_pool: Arc<Mutex<SpillPool>>,
 
-    /// Current state of the stream
-    state: RepartitionStreamState,
+    /// Infinite stream for reading from the spill pool
+    spill_stream: SendableRecordBatchStream,
+
+    /// Flag indicating all inputs have finished
+    all_inputs_finished: bool,
 }
 
 impl Stream for RepartitionStream {
@@ -1228,68 +1260,96 @@
         mut self: Pin<&mut Self>,
         cx: &mut Context<'_>,
     ) -> Poll<Option<Self::Item>> {
+        use futures::StreamExt;
+
         loop {
-            match &mut self.state {
-                RepartitionStreamState::ReceivingFromChannel => {
-                    let value = futures::ready!(self.input.recv().poll_unpin(cx));
-                    match value {
-                        Some(Some(v)) => match v {
-                            Ok(RepartitionBatch::Memory(batch)) => {
-                                // Release memory and return
-                                self.reservation
-                                    .lock()
-                                    .shrink(batch.get_array_memory_size());
-                                return Poll::Ready(Some(Ok(batch)));
-                            }
-                            Ok(RepartitionBatch::Spilled { spill_file, size }) => {
-                                // Read from disk - SpillReaderStream uses tokio::fs internally
-                                // Pass the original size for validation
-                                let stream = self
-                                    .spill_manager
-                                    .read_spill_as_stream(spill_file, Some(size))?;
-                                self.state =
-                                    RepartitionStreamState::ReadingSpilledBatch(stream);
-                                // Continue loop to poll the stream immediately
-                            }
-                            Err(e) => {
-                                return Poll::Ready(Some(Err(e)));
-                            }
-                        },
-                        Some(None) => {
-                            self.num_input_partitions_processed += 1;
-
-                            if self.num_input_partitions
-                                == self.num_input_partitions_processed
-                            {
-                                // all input partitions have finished sending batches
-                                return Poll::Ready(None);
-                            } else {
-                                // other partitions still have data to send
-                                continue;
-                            }
-                        }
-                        None => {
-                            return Poll::Ready(None);
+            // First, check if there's a spilled batch available
+            match self.spill_stream.poll_next_unpin(cx) {
+                Poll::Ready(Some(Ok(batch))) => {
+                    // Got a spilled batch
+                    return Poll::Ready(Some(Ok(batch)));
+                }
+                Poll::Ready(Some(Err(e))) => {
+                    return Poll::Ready(Some(Err(e)));
+                }
+                Poll::Ready(None) => {
+                    // Spill stream ended - all spilled data has been read
+                    return Poll::Ready(None);
+                }
+                Poll::Pending => {
+                    // No spilled data available right now
+                    if self.all_inputs_finished {
+                        // All inputs finished, verify pool is finalized before waiting
+                        // If not finalized, we may hang indefinitely
+                        if !self.spill_pool.lock().is_finalized() {
+                            return Poll::Ready(Some(Err(DataFusionError::Internal(
+                                "Spill pool not finalized despite all inputs finishing"
+                                    .to_string(),
+                            ))));
                         }
+                        // Pool is finalized, wait for spill stream to have more data or finish
+                        return Poll::Pending;
                    }
+                    // Otherwise check the channel
                }
-                RepartitionStreamState::ReadingSpilledBatch(stream) => {
-                    match futures::ready!(stream.poll_next_unpin(cx)) {
-                        Some(Ok(batch)) => {
-                            // Return batch and stay in ReadingSpilledBatch state to read more batches
-                            return Poll::Ready(Some(Ok(batch)));
-                        }
-                        Some(Err(e)) => {
-                            self.state = RepartitionStreamState::ReceivingFromChannel;
-                            return Poll::Ready(Some(Err(e)));
-                        }
-                        None => {
-                            // Spill stream ended - go back to receiving from channel
-                            self.state = RepartitionStreamState::ReceivingFromChannel;
-                            continue;
-                        }
+            }
+
+            // If all inputs are finished, don't poll channel anymore, just wait for spill_stream
+            if self.all_inputs_finished {
+                return Poll::Pending;
+            }
+
+            // Try to get next item from channel
+            let value = match self.input.recv().poll_unpin(cx) {
+                Poll::Ready(v) => v,
+                Poll::Pending => {
+                    // Nothing from channel either, wait
+                    return Poll::Pending;
+                }
+            };
+
+            match value {
+                Some(Some(v)) => match v {
+                    Ok(RepartitionBatch::Memory(batch)) => {
+                        // Release memory and return
+                        self.reservation
+                            .lock()
+                            .shrink(batch.get_array_memory_size());
+                        return Poll::Ready(Some(Ok(batch)));
+                    }
+                    Ok(RepartitionBatch::Spilled) => {
+                        // Batch was spilled, it's available in spill_stream
+                        // Loop back to poll spill_stream again
+                        continue;
+                    }
+                    Err(e) => {
+                        return Poll::Ready(Some(Err(e)));
+                    }
+                },
+                Some(None) => {
+                    self.num_input_partitions_processed += 1;
+
+                    if self.num_input_partitions == self.num_input_partitions_processed {
+                        // All input partitions have finished sending batches
+                        // Flush and finalize the SpillPool
+                        {
+                            let mut pool = self.spill_pool.lock();
+                            if let Err(e) = pool.flush() {
+                                return Poll::Ready(Some(Err(e)));
+                            }
+                            pool.finalize();
+                        } // Drop the lock before continuing
+                        self.all_inputs_finished = true;
+                        // Continue to drain any remaining spilled batches
+                        continue;
+                    } else {
+                        // other partitions still have data to send
+                        continue;
                     }
                 }
+                None => {
+                    return Poll::Ready(None);
+                }
             }
         }
     }
@@ -1317,11 +1377,14 @@ struct PerPartitionStream {
     /// Memory reservation.
     reservation: SharedMemoryReservation,
 
-    /// Spill manager for reading spilled batches
-    spill_manager: Arc<SpillManager>,
+    /// SpillPool for batched spilling with FIFO semantics (shared for writing)
+    spill_pool: Arc<Mutex<SpillPool>>,
+
+    /// Infinite stream for reading from the spill pool
+    spill_stream: SendableRecordBatchStream,
 
-    /// Current state of the stream
-    state: RepartitionStreamState,
+    /// Flag indicating input partition has finished
+    input_finished: bool,
 }
 
 impl Stream for PerPartitionStream {
@@ -1331,58 +1394,85 @@
         mut self: Pin<&mut Self>,
         cx: &mut Context<'_>,
     ) -> Poll<Option<Self::Item>> {
+        use futures::StreamExt;
+
         loop {
-            match &mut self.state {
-                RepartitionStreamState::ReceivingFromChannel => {
-                    let value = futures::ready!(self.receiver.recv().poll_unpin(cx));
-                    match value {
-                        Some(Some(v)) => match v {
-                            Ok(RepartitionBatch::Memory(batch)) => {
-                                // Release memory and return
-                                self.reservation
-                                    .lock()
-                                    .shrink(batch.get_array_memory_size());
-                                return Poll::Ready(Some(Ok(batch)));
-                            }
-                            Ok(RepartitionBatch::Spilled { spill_file, size }) => {
-                                // Read from disk - SpillReaderStream uses tokio::fs internally
-                                // Pass the original size for validation
-                                let stream = self
-                                    .spill_manager
-                                    .read_spill_as_stream(spill_file, Some(size))?;
-                                self.state =
-                                    RepartitionStreamState::ReadingSpilledBatch(stream);
-                                // Continue loop to poll the stream immediately
-                            }
-                            Err(e) => {
-                                return Poll::Ready(Some(Err(e)));
-                            }
-                        },
-                        Some(None) => {
-                            // Input partition has finished sending batches
-                            return Poll::Ready(None);
-                        }
-                        None => return Poll::Ready(None),
+            // First, check if there's a spilled batch available
+            match self.spill_stream.poll_next_unpin(cx) {
+                Poll::Ready(Some(Ok(batch))) => {
+                    // Got a spilled batch
+                    return Poll::Ready(Some(Ok(batch)));
+                }
+                Poll::Ready(Some(Err(e))) => {
+                    return Poll::Ready(Some(Err(e)));
+                }
+                Poll::Ready(None) => {
+                    // Spill stream ended - all spilled data has been read
+                    // Only end the stream if input is also finished
+                    if self.input_finished {
+                        return Poll::Ready(None);
                    }
+                    // Otherwise, continue to check the channel for new data
                }
+                Poll::Pending => {
+                    // No spilled data available yet (async I/O in progress)
+                    if self.input_finished {
+                        // Input finished, but spill stream might have data being read
+                        // Wait for the async I/O to complete
+                        return
Poll::Pending; + } + // Spill stream is pending but not finished + // Fall through to check the channel + } + } - RepartitionStreamState::ReadingSpilledBatch(stream) => { - match futures::ready!(stream.poll_next_unpin(cx)) { - Some(Ok(batch)) => { - // Return batch and stay in ReadingSpilledBatch state to read more batches - return Poll::Ready(Some(Ok(batch))); - } - Some(Err(e)) => { - self.state = RepartitionStreamState::ReceivingFromChannel; + // If input is finished, don't poll channel anymore - just wait for spill stream + if self.input_finished { + return Poll::Pending; + } + + // Try to get next item from channel + let value = match self.receiver.recv().poll_unpin(cx) { + Poll::Ready(v) => v, + Poll::Pending => { + // Nothing from channel either, wait + return Poll::Pending; + } + }; + + match value { + Some(Some(v)) => match v { + Ok(RepartitionBatch::Memory(batch)) => { + // Release memory and return + self.reservation + .lock() + .shrink(batch.get_array_memory_size()); + return Poll::Ready(Some(Ok(batch))); + } + Ok(RepartitionBatch::Spilled) => { + // Batch was spilled, it's available in spill_stream + // Loop back to poll spill_stream again + continue; + } + Err(e) => { + return Poll::Ready(Some(Err(e))); + } + }, + Some(None) => { + // Input partition has finished sending batches + // Flush and finalize the SpillPool + { + let mut pool = self.spill_pool.lock(); + if let Err(e) = pool.flush() { return Poll::Ready(Some(Err(e))); } - None => { - // Spill stream ended - go back to receiving from channel - self.state = RepartitionStreamState::ReceivingFromChannel; - continue; - } - } + pool.finalize(); + } // Drop the lock before continuing + self.input_finished = true; + // Continue to drain any remaining spilled batches + continue; } + None => return Poll::Ready(None), } } } @@ -2144,8 +2234,10 @@ mod tests { #[cfg(test)] mod test { + use arrow::array::record_batch; use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, Schema}; + use datafusion_common::assert_batches_eq; use super::*; use crate::test::TestMemoryExec; @@ -2229,6 +2321,132 @@ mod test { Ok(()) } + #[tokio::test] + async fn test_preserve_order_with_spilling() -> Result<()> { + use datafusion_execution::runtime_env::RuntimeEnvBuilder; + use datafusion_execution::TaskContext; + + // Create sorted input data across multiple partitions + // Partition1: [1,3], [5,7], [9,11] + // Partition2: [2,4], [6,8], [10,12] + let batch1 = record_batch!(("c0", UInt32, [1, 3])).unwrap(); + let batch2 = record_batch!(("c0", UInt32, [2, 4])).unwrap(); + let batch3 = record_batch!(("c0", UInt32, [5, 7])).unwrap(); + let batch4 = record_batch!(("c0", UInt32, [6, 8])).unwrap(); + let batch5 = record_batch!(("c0", UInt32, [9, 11])).unwrap(); + let batch6 = record_batch!(("c0", UInt32, [10, 12])).unwrap(); + let schema = batch1.schema(); + let sort_exprs = LexOrdering::new([PhysicalSortExpr { + expr: col("c0", &schema).unwrap(), + options: SortOptions::default().asc(), + }]) + .unwrap(); + let partition1 = vec![batch1.clone(), batch3.clone(), batch5.clone()]; + let partition2 = vec![batch2.clone(), batch4.clone(), batch6.clone()]; + let input_partitions = vec![partition1, partition2]; + + // Set up context with tight memory limit to force spilling + // Sorting needs some non-spillable memory, so 64 bytes should force spilling while still allowing the query to complete + let runtime = RuntimeEnvBuilder::default() + .with_memory_limit(64, 1.0) + .build_arc()?; + + let task_ctx = TaskContext::default().with_runtime(runtime); 
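+        // With only 64 bytes of budget, each `try_grow` call in `pull_from_input`
+        // should fail for every non-trivial batch, forcing the
+        // `RepartitionBatch::Spilled` path through the per-partition SpillPool.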
+        let task_ctx = Arc::new(task_ctx);
+
+        // Create physical plan with order preservation
+        let exec = TestMemoryExec::try_new(&input_partitions, Arc::clone(&schema), None)?
+            .try_with_sort_information(vec![sort_exprs.clone(), sort_exprs])?;
+        let exec = Arc::new(TestMemoryExec::update_cache(Arc::new(exec)));
+        // Repartition into 3 partitions with order preservation
+        // We expect 1 batch per output partition after repartitioning
+        let exec = RepartitionExec::try_new(exec, Partitioning::RoundRobinBatch(3))?
+            .with_preserve_order();
+
+        let mut batches = vec![];
+
+        // Collect all partitions - should succeed by spilling to disk
+        for i in 0..exec.partitioning().partition_count() {
+            let mut stream = exec.execute(i, Arc::clone(&task_ctx))?;
+            while let Some(result) = stream.next().await {
+                let batch = result?;
+                batches.push(batch);
+            }
+        }
+
+        #[rustfmt::skip]
+        let expected = [
+            [
+                "+----+",
+                "| c0 |",
+                "+----+",
+                "| 1 |",
+                "| 2 |",
+                "| 3 |",
+                "| 4 |",
+                "+----+",
+            ],
+            [
+                "+----+",
+                "| c0 |",
+                "+----+",
+                "| 5 |",
+                "| 6 |",
+                "| 7 |",
+                "| 8 |",
+                "+----+",
+            ],
+            [
+                "+----+",
+                "| c0 |",
+                "+----+",
+                "| 9 |",
+                "| 10 |",
+                "| 11 |",
+                "| 12 |",
+                "+----+",
+            ],
+        ];
+
+        for (batch, expected) in batches.iter().zip(expected.iter()) {
+            assert_batches_eq!(expected, &[batch.clone()]);
+        }
+
+        // We should have spilled ~ all of the data.
+        // - We spill data during the repartitioning phase
+        // - We may also spill during the final merge sort
+        let all_batches = [batch1, batch2, batch3, batch4, batch5, batch6];
+        let metrics = exec.metrics().unwrap();
+        assert!(
+            metrics.spill_count().unwrap() > input_partitions.len(),
+            "Expected spill_count > {} for order-preserving repartition, but got {:?}",
+            input_partitions.len(),
+            metrics.spill_count()
+        );
+        assert!(
+            metrics.spilled_bytes().unwrap()
+                > all_batches
+                    .iter()
+                    .map(|b| b.get_array_memory_size())
+                    .sum::<usize>(),
+            "Expected spilled_bytes > {} for order-preserving repartition, got {}",
+            all_batches
+                .iter()
+                .map(|b| b.get_array_memory_size())
+                .sum::<usize>(),
+            metrics.spilled_bytes().unwrap()
+        );
+        assert!(
+            metrics.spilled_rows().unwrap()
+                >= all_batches.iter().map(|b| b.num_rows()).sum::<usize>(),
+            "Expected spilled_rows >= {} for order-preserving repartition, got {}",
+            all_batches.iter().map(|b| b.num_rows()).sum::<usize>(),
+            metrics.spilled_rows().unwrap()
+        );
+
+        Ok(())
+    }
+
     #[tokio::test]
     async fn test_repartition() -> Result<()> {
         let schema = test_schema();
diff --git a/datafusion/physical-plan/src/spill/in_progress_spill_file.rs b/datafusion/physical-plan/src/spill/in_progress_spill_file.rs
index 14917e23b792..3149c829c726 100644
--- a/datafusion/physical-plan/src/spill/in_progress_spill_file.rs
+++ b/datafusion/physical-plan/src/spill/in_progress_spill_file.rs
@@ -35,6 +35,10 @@ pub struct InProgressSpillFile {
     writer: Option<IPCStreamWriter>,
     /// Lazily initialized in-progress file, it will be moved out when the `finish` method is invoked
     in_progress_file: Option<RefCountedTempFile>,
+    /// Number of batches written to this file
+    batch_count: usize,
+    /// Estimated size of data written to this file in bytes
+    estimated_size: usize,
 }
 
 impl InProgressSpillFile {
@@ -46,6 +50,8 @@
             spill_writer,
             in_progress_file: Some(in_progress_file),
             writer: None,
+            batch_count: 0,
+            estimated_size: 0,
         }
     }
 
@@ -84,6 +90,10 @@
             // Update metrics
             self.spill_writer.metrics.spilled_rows.add(spilled_rows);
+
+            // Update stats
+            self.batch_count += 1;
+            self.estimated_size += batch.get_array_memory_size();
         }
         Ok(())
     }
 
@@
-107,4 +117,14 @@ impl InProgressSpillFile { Ok(self.in_progress_file.take()) } + + /// Returns the number of batches written to this file + pub fn batch_count(&self) -> usize { + self.batch_count + } + + /// Returns the estimated size of data written to this file in bytes + pub fn estimated_size(&self) -> usize { + self.estimated_size + } } diff --git a/datafusion/physical-plan/src/spill/mod.rs b/datafusion/physical-plan/src/spill/mod.rs index 5b9a91e781b1..395c98ca5f8d 100644 --- a/datafusion/physical-plan/src/spill/mod.rs +++ b/datafusion/physical-plan/src/spill/mod.rs @@ -19,6 +19,7 @@ pub(crate) mod in_progress_spill_file; pub(crate) mod spill_manager; +pub(crate) mod spill_pool; use std::fs::File; use std::io::BufReader; diff --git a/datafusion/physical-plan/src/spill/spill_manager.rs b/datafusion/physical-plan/src/spill/spill_manager.rs index cc39102d8981..b43323e62399 100644 --- a/datafusion/physical-plan/src/spill/spill_manager.rs +++ b/datafusion/physical-plan/src/spill/spill_manager.rs @@ -72,6 +72,11 @@ impl SpillManager { self } + /// Returns the schema for batches managed by this SpillManager + pub fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + /// Creates a temporary file for in-progress operations, returning an error /// message if file creation fails. The file can be used to append batches /// incrementally and then finish the file when done. diff --git a/datafusion/physical-plan/src/spill/spill_pool.rs b/datafusion/physical-plan/src/spill/spill_pool.rs new file mode 100644 index 000000000000..7636f2a51c9e --- /dev/null +++ b/datafusion/physical-plan/src/spill/spill_pool.rs @@ -0,0 +1,1043 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! SpillPool: A reusable abstraction for managing spill files with FIFO semantics. +//! +//! # Overview +//! +//! The `SpillPool` provides a centralized mechanism for spilling record batches to disk +//! when memory is constrained. It manages a collection of spill files, each containing +//! multiple batches, with configurable maximum file sizes. +//! +//! # Design +//! +//! - **FIFO (Queue) semantics**: Batches are read in the order they were spilled +//! - **File handle reuse**: Multiple batches are written to the same file to minimize syscalls +//! - **Automatic file rotation**: When a file exceeds `max_file_size_bytes`, rotate to a new file +//! - **Sequential reading**: Uses IPC Stream format's natural sequential access pattern +//! - **Automatic cleanup**: Files are deleted once fully consumed +//! +//! # Usage Example +//! +//! ```ignore +//! use std::sync::Arc; +//! use parking_lot::Mutex; +//! +//! let pool = SpillPool::new( +//! 100 * 1024 * 1024, // 100MB max per file +//! spill_manager, +//! ); +//! let pool = Arc::new(Mutex::new(pool)); +//! +//! 
// Spill batches - automatically rotates files when size limit reached
+//! {
+//!     let mut pool = pool.lock();
+//!     pool.push_batch(batch1)?;
+//!     pool.push_batch(batch2)?;
+//!     pool.flush()?;   // Finalize current file
+//!     pool.finalize(); // Signal no more writes
+//! }
+//!
+//! // Read back in FIFO order using a stream
+//! let mut stream = SpillPool::reader(pool);
+//! let batch1 = stream.next().await.unwrap()?; // Returns batch1
+//! let batch2 = stream.next().await.unwrap()?; // Returns batch2
+//! // stream.next() returns None after finalize
+//! ```
+
+use std::collections::VecDeque;
+use std::sync::Arc;
+use std::task::Waker;
+
+use parking_lot::Mutex;
+
+use arrow::datatypes::SchemaRef;
+use arrow::record_batch::RecordBatch;
+use datafusion_common::Result;
+use datafusion_execution::disk_manager::RefCountedTempFile;
+use datafusion_execution::{RecordBatchStream, SendableRecordBatchStream};
+
+use super::in_progress_spill_file::InProgressSpillFile;
+use super::spill_manager::SpillManager;
+
+/// A pool of spill files that manages batch-level spilling with FIFO semantics.
+///
+/// Batches are written sequentially to files, with automatic rotation when the
+/// configured size limit is reached. Reading is done via an infinite stream
+/// that can read concurrently while writes continue.
+///
+/// # Thread Safety
+///
+/// `SpillPool` is not thread-safe and should be used from a single thread or
+/// protected with appropriate synchronization (e.g., `Arc<Mutex<SpillPool>>`).
+pub struct SpillPool {
+    /// Maximum size in bytes before rotating to a new file
+    max_file_size_bytes: usize,
+    /// Queue of spill files (front = oldest, back = newest)
+    files: VecDeque<RefCountedTempFile>,
+    /// Current file being written to (if any)
+    current_write_file: Option<InProgressSpillFile>,
+    /// SpillManager for creating files and tracking metrics
+    spill_manager: Arc<SpillManager>,
+    /// Wakers to notify when new data is available for readers
+    wakers: Vec<Waker>,
+    /// Flag indicating no more writes will occur
+    finalized: bool,
+}
+
+impl SpillPool {
+    /// Creates a new SpillPool with FIFO semantics.
+    ///
+    /// # Arguments
+    ///
+    /// * `max_file_size_bytes` - Maximum size per file before rotation (e.g., 100MB)
+    /// * `spill_manager` - Manager for file creation and metrics
+    pub fn new(max_file_size_bytes: usize, spill_manager: Arc<SpillManager>) -> Self {
+        Self {
+            max_file_size_bytes,
+            files: VecDeque::new(),
+            current_write_file: None,
+            spill_manager,
+            wakers: Vec::new(),
+            finalized: false,
+        }
+    }
+
+    /// Marks the pool as finalized, indicating no more writes will occur.
+    /// This allows readers to know when to stop waiting for more data.
+    pub fn finalize(&mut self) {
+        self.finalized = true;
+        self.wake(); // Wake readers to check finalized status
+    }
+
+    /// Returns true if the pool has been finalized
+    pub fn is_finalized(&self) -> bool {
+        self.finalized
+    }
+
+    /// Creates a stream reader for this pool.
+    ///
+    /// The stream automatically handles file rotation and can read concurrently
+    /// while writes continue to the pool. When the stream catches up to the writer,
+    /// it will return `Poll::Pending` and wait for more data.
+    ///
+    /// # Arguments
+    ///
+    /// * `pool` - Shared reference to the SpillPool
+    ///
+    /// # Returns
+    ///
+    /// A `SpillPoolStream` that returns batches in FIFO order and ends when the pool
+    /// is finalized and all data has been read
+    pub fn reader(pool: Arc<Mutex<SpillPool>>) -> SendableRecordBatchStream {
+        Box::pin(SpillPoolStream::new(pool))
+    }
+
+    /// Spills a batch to the pool, rotating files when necessary.
+    ///
+    /// If the current file would exceed `max_file_size_bytes` after adding
+    /// this batch, the file is finalized and a new one is started.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if disk I/O fails or disk quota is exceeded.
+    pub fn push_batch(&mut self, batch: &RecordBatch) -> Result<()> {
+        if batch.num_rows() == 0 {
+            // Skip empty batches
+            return Ok(());
+        }
+
+        let batch_size = batch.get_array_memory_size();
+
+        // Check if we need to rotate to a new file
+        let needs_rotation = if let Some(ref file) = self.current_write_file {
+            // Rotate if adding this batch would exceed the max file size
+            file.estimated_size() + batch_size > self.max_file_size_bytes
+        } else {
+            // No current file, need to create one
+            true
+        };
+
+        if needs_rotation && self.current_write_file.is_some() {
+            // Finish current file and add to queue
+            self.finish_current_file()?;
+        }
+
+        // Create new file if needed
+        if self.current_write_file.is_none() {
+            self.current_write_file =
+                Some(self.spill_manager.create_in_progress_file("SpillPool")?);
+        }
+
+        // Append batch to current file
+        if let Some(ref mut file) = self.current_write_file {
+            file.append_batch(batch)?;
+        }
+
+        // Wake any waiting readers
+        self.wake();
+
+        Ok(())
+    }
+
+    /// Registers a waker to be notified when new data is available
+    fn register_waker(&mut self, waker: Waker) {
+        // Only register if not already present (avoid duplicates)
+        if !self.wakers.iter().any(|w| w.will_wake(&waker)) {
+            self.wakers.push(waker);
+        }
+    }
+
+    /// Wakes all registered readers
+    fn wake(&mut self) {
+        for waker in self.wakers.drain(..) {
+            waker.wake();
+        }
+    }
+
+    /// Finalizes the current write file and adds it to the files queue.
+    ///
+    /// This is called automatically when files reach the size limit, but can
+    /// also be called explicitly to ensure all pending data is available for reading.
+    pub fn flush(&mut self) -> Result<()> {
+        if self.current_write_file.is_some() {
+            self.finish_current_file()?;
+        }
+        Ok(())
+    }
+
+    // Private helper methods
+
+    /// Finishes the current write file and moves it to the files queue.
+    fn finish_current_file(&mut self) -> Result<()> {
+        if let Some(mut file) = self.current_write_file.take() {
+            // Finish writing to get the final file
+            let finished_file = file.finish()?;
+
+            if let Some(temp_file) = finished_file {
+                // Add to queue
+                self.files.push_back(temp_file);
+            }
+
+            // Wake any waiting readers since a new complete file is available
+            self.wake();
+        }
+
+        Ok(())
+    }
+}
+
+impl Drop for SpillPool {
+    fn drop(&mut self) {
+        // Flush any pending writes to ensure metrics are accurate
+        // We ignore errors here since Drop doesn't allow returning errors
+        let _ = self.flush();
+    }
+}
+
+/// A stream that reads from a SpillPool in FIFO order.
+///
+/// The stream automatically handles file rotation and reads from completed files.
+/// When no completed files are available, it returns `Poll::Pending` and waits
+/// for the writer to complete more files.
+///
+/// The stream ends (`Poll::Ready(None)`) when the pool is finalized and all data has been read.
+struct SpillPoolStream {
+    /// Shared reference to the spill pool
+    spill_pool: Arc<Mutex<SpillPool>>,
+    /// Current stream being read from
+    current_stream: Option<SendableRecordBatchStream>,
+    /// Schema for the batches
+    schema: SchemaRef,
+}
+
+impl SpillPoolStream {
+    /// Creates a new infinite stream from a SpillPool.
+    ///
+    /// # Arguments
+    ///
+    /// * `spill_pool` - Shared reference to the pool to read from
+    pub fn new(spill_pool: Arc<Mutex<SpillPool>>) -> Self {
+        let schema = {
+            let pool = spill_pool.lock();
+            pool.spill_manager.schema()
+        };
+
+        Self {
+            spill_pool,
+            current_stream: None,
+            schema,
+        }
+    }
+}
+
+impl futures::Stream for SpillPoolStream {
+    type Item = Result<RecordBatch>;
+
+    fn poll_next(
+        mut self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> std::task::Poll<Option<Self::Item>> {
+        use futures::StreamExt;
+        use std::task::Poll;
+
+        loop {
+            // If we have a current stream, try to read from it
+            if let Some(stream) = &mut self.current_stream {
+                match stream.poll_next_unpin(cx) {
+                    Poll::Ready(Some(Ok(batch))) => {
+                        return Poll::Ready(Some(Ok(batch)));
+                    }
+                    Poll::Ready(Some(Err(e))) => {
+                        return Poll::Ready(Some(Err(e)));
+                    }
+                    Poll::Ready(None) => {
+                        // Current stream exhausted (finished reading a completed file)
+                        self.current_stream = None;
+                        // Continue loop to try getting next file
+                    }
+                    Poll::Pending => {
+                        // Stream not ready yet
+                        return Poll::Pending;
+                    }
+                }
+            }
+
+            // No current stream, try to get the next file to read
+            // Only read from completed files in the queue
+            let mut pool = self.spill_pool.lock();
+
+            if let Some(file) = pool.files.pop_front() {
+                // We have a completed file to read
+                let spill_manager = Arc::clone(&pool.spill_manager);
+                drop(pool); // Release lock before creating stream
+
+                match spill_manager.read_spill_as_stream(file, None) {
+                    Ok(stream) => {
+                        self.current_stream = Some(stream);
+                        // Continue loop to poll the new stream
+                    }
+                    Err(e) => {
+                        return Poll::Ready(Some(Err(e)));
+                    }
+                }
+            } else {
+                // No completed files available
+                let is_finalized = pool.is_finalized();
+                if is_finalized {
+                    // Pool is finalized and no more files - we're done
+                    return Poll::Ready(None);
+                }
+                // Register waker and wait for more files
+                pool.register_waker(cx.waker().clone());
+                return Poll::Pending;
+            }
+        }
+    }
+}
+
+impl RecordBatchStream for SpillPoolStream {
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.schema)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metrics::{ExecutionPlanMetricsSet, SpillMetrics};
+    use arrow::array::{ArrayRef, Int32Array};
+    use arrow::datatypes::{DataType, Field, Schema};
+    use datafusion_common_runtime::SpawnedTask;
+    use datafusion_execution::runtime_env::RuntimeEnv;
+    use futures::StreamExt;
+    use std::task::Poll;
+    use tokio;
+
+    // ============================================================================
+    // Test Utilities
+    // ============================================================================
+
+    fn create_test_schema() -> SchemaRef {
+        Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+        ]))
+    }
+
+    fn create_test_batch(start: i32, count: usize) -> RecordBatch {
+        let schema = create_test_schema();
+        let a: ArrayRef = Arc::new(Int32Array::from(
+            (start..start + count as i32).collect::<Vec<_>>(),
+        ));
+        let b: ArrayRef = Arc::new(Int32Array::from(
+            (start * 10..start * 10 + count as i32 * 10)
+                .step_by(10)
+                .collect::<Vec<_>>(),
+        ));
+        RecordBatch::try_new(schema, vec![a, b]).unwrap()
+    }
+
+    fn create_spill_pool(max_file_size: usize) -> SpillPool {
+        let env = Arc::new(RuntimeEnv::default());
+        let metrics = SpillMetrics::new(&ExecutionPlanMetricsSet::new(), 0);
+        let schema = create_test_schema();
+        let spill_manager =
+            Arc::new(SpillManager::new(env, metrics, Arc::clone(&schema)));
+
+        SpillPool::new(max_file_size, spill_manager)
+    }
+
+    /// Helper to collect all batches from a stream
+    async fn collect_batches(
+        mut stream: SendableRecordBatchStream,
+    ) -> Result<Vec<RecordBatch>> {
+        let mut batches = Vec::new();
+        while let Some(result) = stream.next().await {
+            batches.push(result?);
+        }
+        Ok(batches)
+    }
+
+    // ============================================================================
+    // Basic Functionality Tests
+    // ============================================================================
+
+    #[tokio::test]
+    async fn test_empty_pool_stream() -> Result<()> {
+        let mut pool = create_spill_pool(1024 * 1024);
+        pool.finalize(); // Mark as done with no data
+
+        let pool = Arc::new(Mutex::new(pool));
+        let stream = SpillPool::reader(pool);
+
+        let batches = collect_batches(stream).await?;
+        assert_eq!(batches.len(), 0);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_single_batch() -> Result<()> {
+        let mut pool = create_spill_pool(1024 * 1024);
+
+        // Push one batch
+        let batch1 = create_test_batch(0, 10);
+        pool.push_batch(&batch1)?;
+        pool.flush()?;
+        pool.finalize();
+
+        let pool = Arc::new(Mutex::new(pool));
+        let stream = SpillPool::reader(pool);
+
+        let batches = collect_batches(stream).await?;
+        assert_eq!(batches.len(), 1);
+        assert_eq!(batches[0].num_rows(), 10);
+        assert_eq!(batches[0].num_columns(), 2);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_multiple_batches_single_file() -> Result<()> {
+        let mut pool = create_spill_pool(10 * 1024 * 1024); // Large file size
+
+        // Push multiple batches
+        for i in 0..5 {
+            let batch = create_test_batch(i * 10, 10);
+            pool.push_batch(&batch)?;
+        }
+        pool.flush()?;
+        pool.finalize();
+
+        let pool = Arc::new(Mutex::new(pool));
+        let stream = SpillPool::reader(pool);
+
+        let batches = collect_batches(stream).await?;
+        assert_eq!(batches.len(), 5);
+
+        // Verify FIFO order
+        for (i, batch) in batches.iter().enumerate() {
+            assert_eq!(batch.num_rows(), 10);
+            let col_a = batch
+                .column(0)
+                .as_any()
+                .downcast_ref::<Int32Array>()
+                .unwrap();
+            assert_eq!(col_a.value(0), (i as i32) * 10);
+        }
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_file_rotation_on_size_limit() -> Result<()> {
+        // Small file size to force rotation
+        let mut pool = create_spill_pool(500); // ~500 bytes
+
+        // Push multiple batches - should create multiple files
+        for i in 0..10 {
+            let batch = create_test_batch(i * 5, 5);
+            pool.push_batch(&batch)?;
+        }
+        pool.flush()?;
+        pool.finalize();
+
+        let pool = Arc::new(Mutex::new(pool));
+        let stream = SpillPool::reader(pool);
+
+        let batches = collect_batches(stream).await?;
+        assert_eq!(batches.len(), 10);
+
+        // Verify all batches in FIFO order
+        for (i, batch) in batches.iter().enumerate() {
+            let col_a = batch
+                .column(0)
+                .as_any()
+                .downcast_ref::<Int32Array>()
+                .unwrap();
+            assert_eq!(col_a.value(0), (i as i32) * 5);
+        }
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_empty_batches_skipped() -> Result<()> {
+        let mut pool = create_spill_pool(1024 * 1024);
+
+        let batch1 = create_test_batch(0, 10);
+        let empty_batch = RecordBatch::new_empty(create_test_schema());
+        let batch2 = create_test_batch(10, 10);
+
+        pool.push_batch(&batch1)?;
+        pool.push_batch(&empty_batch)?; // Should be skipped
+        pool.push_batch(&batch2)?;
+        pool.flush()?;
+        pool.finalize();
+
+        let pool = Arc::new(Mutex::new(pool));
+        let stream = SpillPool::reader(pool);
+
+        let batches = collect_batches(stream).await?;
+        // Should only have 2 batches (empty one skipped)
+        assert_eq!(batches.len(), 2);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_large_batches() -> Result<()> {
+        let mut pool =
create_spill_pool(1024 * 1024); + + // Create larger batches + let batch1 = create_test_batch(0, 1000); + let batch2 = create_test_batch(1000, 1000); + + pool.push_batch(&batch1)?; + pool.push_batch(&batch2)?; + pool.flush()?; + pool.finalize(); + + let pool = Arc::new(Mutex::new(pool)); + let stream = SpillPool::reader(pool); + + let batches = collect_batches(stream).await?; + assert_eq!(batches.len(), 2); + assert_eq!(batches[0].num_rows(), 1000); + assert_eq!(batches[1].num_rows(), 1000); + + Ok(()) + } + + // ============================================================================ + // Stream API Tests + // ============================================================================ + + #[tokio::test] + async fn test_stream_blocks_when_no_data() -> Result<()> { + let pool = create_spill_pool(1024 * 1024); + + let pool = Arc::new(Mutex::new(pool)); + let mut stream = SpillPool::reader(Arc::clone(&pool)); + + // Poll should return Pending since no data and not finalized + let poll_result = futures::poll!(stream.next()); + assert!(matches!(poll_result, Poll::Pending)); + + Ok(()) + } + + #[tokio::test] + async fn test_stream_wakes_on_push() -> Result<()> { + let pool = create_spill_pool(1024 * 1024); + let pool_arc = Arc::new(Mutex::new(pool)); + + let pool_clone = Arc::clone(&pool_arc); + let stream = SpillPool::reader(pool_clone); + + // Spawn a task that will push data after a delay + let writer_pool = Arc::clone(&pool_arc); + let _writer = SpawnedTask::spawn(async move { + tokio::time::sleep(tokio::time::Duration::from_millis(50)).await; + let mut pool = writer_pool.lock(); + pool.push_batch(&create_test_batch(0, 10)).unwrap(); + pool.flush().unwrap(); + pool.finalize(); + }); + + // This should wait for data and then return it + let batches = collect_batches(stream).await?; + assert_eq!(batches.len(), 1); + + Ok(()) + } + + #[tokio::test] + async fn test_stream_wakes_on_flush() -> Result<()> { + let pool = create_spill_pool(1024 * 1024); + let pool_arc = Arc::new(Mutex::new(pool)); + + let pool_clone = Arc::clone(&pool_arc); + let stream = SpillPool::reader(pool_clone); + + // Push without flush first + { + let mut pool = pool_arc.lock(); + pool.push_batch(&create_test_batch(0, 10)).unwrap(); + // Don't flush yet - data is in current_write_file + } + + // Spawn task to flush after delay + let writer_pool = Arc::clone(&pool_arc); + let _writer = SpawnedTask::spawn(async move { + tokio::time::sleep(tokio::time::Duration::from_millis(50)).await; + let mut pool = writer_pool.lock(); + pool.flush().unwrap(); + pool.finalize(); + }); + + let batches = collect_batches(stream).await?; + assert_eq!(batches.len(), 1); + + Ok(()) + } + + #[tokio::test] + async fn test_stream_wakes_on_finalize() -> Result<()> { + let pool = create_spill_pool(1024 * 1024); + let pool_arc = Arc::new(Mutex::new(pool)); + + let pool_clone = Arc::clone(&pool_arc); + let mut stream = SpillPool::reader(pool_clone); + + // First poll should be pending + let poll_result = futures::poll!(stream.next()); + assert!(matches!(poll_result, Poll::Pending)); + + // Finalize after delay + let writer_pool = Arc::clone(&pool_arc); + let _writer = SpawnedTask::spawn(async move { + tokio::time::sleep(tokio::time::Duration::from_millis(50)).await; + writer_pool.lock().finalize(); + }); + + // Stream should eventually return None + let result = stream.next().await; + assert!(result.is_none()); + + Ok(()) + } + + #[tokio::test] + async fn test_finalize_before_flush() -> Result<()> { + let mut pool = create_spill_pool(1024 * 1024); + 
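+        // 1 MiB per file is far larger than a single 10-row batch, so no
+        // rotation occurs; the batch pushed below stays in `current_write_file`
+        // until an explicit flush.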
+        // Push data but DON'T flush
+        pool.push_batch(&create_test_batch(0, 10))?;
+        pool.finalize(); // Finalize without flush
+
+        let pool = Arc::new(Mutex::new(pool));
+        let stream = SpillPool::reader(pool);
+
+        // Data left in current_write_file should be lost since it was never flushed
+        let batches = collect_batches(stream).await?;
+        assert_eq!(batches.len(), 0);
+
+        Ok(())
+    }
+
+    // ============================================================================
+    // Concurrent Reader/Writer Tests
+    // ============================================================================
+
+    #[tokio::test]
+    async fn test_concurrent_push_and_read() -> Result<()> {
+        let pool = create_spill_pool(1024 * 1024);
+        let pool_arc = Arc::new(Mutex::new(pool));
+
+        let writer_pool = Arc::clone(&pool_arc);
+        let writer = SpawnedTask::spawn(async move {
+            for i in 0..10 {
+                tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
+                let mut pool = writer_pool.lock();
+                pool.push_batch(&create_test_batch(i * 10, 10)).unwrap();
+                pool.flush().unwrap();
+            }
+            writer_pool.lock().finalize();
+        });
+
+        let reader_pool = Arc::clone(&pool_arc);
+        let stream = SpillPool::reader(reader_pool);
+        let reader = SpawnedTask::spawn(async move { collect_batches(stream).await });
+
+        // Wait for both tasks
+        writer.await.unwrap();
+        let batches = reader.await.unwrap()?;
+
+        assert_eq!(batches.len(), 10);
+        for (i, batch) in batches.iter().enumerate() {
+            let col_a = batch
+                .column(0)
+                .as_any()
+                .downcast_ref::<Int32Array>()
+                .unwrap();
+            assert_eq!(col_a.value(0), (i as i32) * 10);
+        }
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_reader_catches_up_to_writer() -> Result<()> {
+        let pool = create_spill_pool(1024 * 1024);
+        let pool_arc = Arc::new(Mutex::new(pool));
+
+        // Start reader before any data is written
+        let reader_pool = Arc::clone(&pool_arc);
+        let mut stream = SpillPool::reader(reader_pool);
+
+        // Should return pending
+        let poll_result = futures::poll!(stream.next());
+        assert!(matches!(poll_result, Poll::Pending));
+
+        // Now add data
+        {
+            let mut pool = pool_arc.lock();
+            pool.push_batch(&create_test_batch(0, 10))?;
+            pool.flush()?;
+            pool.finalize();
+        }
+
+        // Now stream should have data
+        let batches = collect_batches(stream).await?;
+        assert_eq!(batches.len(), 1);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_multiple_readers_same_pool() -> Result<()> {
+        let mut pool = create_spill_pool(1024 * 1024);
+
+        // Push some batches
+        for i in 0..5 {
+            pool.push_batch(&create_test_batch(i * 10, 10))?;
+        }
+        pool.flush()?;
+        pool.finalize();
+
+        let pool_arc = Arc::new(Mutex::new(pool));
+
+        // Create two readers
+        let stream1 = SpillPool::reader(Arc::clone(&pool_arc));
+        let stream2 = SpillPool::reader(Arc::clone(&pool_arc));
+
+        // Read from both concurrently
+        let reader1 = SpawnedTask::spawn(async move { collect_batches(stream1).await });
+        let reader2 = SpawnedTask::spawn(async move { collect_batches(stream2).await });
+
+        let batches1 = reader1.await.unwrap()?;
+        let batches2 = reader2.await.unwrap()?;
+
+        // Each reader should consume different batches (pop_front removes from queue)
+        // The total number should be 5, but distributed between readers
+        let total = batches1.len() + batches2.len();
+        assert_eq!(total, 5);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_file_cutover_during_read() -> Result<()> {
+        let pool = create_spill_pool(500); // Small size for rotation
+        let pool_arc = Arc::new(Mutex::new(pool));
+
+        let writer_pool = Arc::clone(&pool_arc);
+        let writer = SpawnedTask::spawn(async move {
+
// Write multiple batches that will cause rotation + for i in 0..8 { + { + let mut pool = writer_pool.lock(); + pool.push_batch(&create_test_batch(i * 5, 5)).unwrap(); + pool.flush().unwrap(); + } // Drop lock before sleep + tokio::time::sleep(tokio::time::Duration::from_millis(10)).await; + } + writer_pool.lock().finalize(); + }); + + // Read concurrently + let reader_pool = Arc::clone(&pool_arc); + let stream = SpillPool::reader(reader_pool); + let reader = SpawnedTask::spawn(async move { collect_batches(stream).await }); + + writer.await.unwrap(); + let batches = reader.await.unwrap()?; + + // Should get all 8 batches despite file rotation + assert_eq!(batches.len(), 8); + + Ok(()) + } + + #[tokio::test] + async fn test_file_cutover_during_write() -> Result<()> { + let mut pool = create_spill_pool(300); // Very small to force frequent rotation + + // Push batches that will definitely cause rotation + for i in 0..5 { + let batch = create_test_batch(i * 10, 10); + pool.push_batch(&batch)?; + // Don't flush after each - let rotation happen naturally + } + pool.flush()?; + pool.finalize(); + + let pool = Arc::new(Mutex::new(pool)); + let stream = SpillPool::reader(pool); + + let batches = collect_batches(stream).await?; + assert_eq!(batches.len(), 5); + + Ok(()) + } + + // ============================================================================ + // Garbage Collection Tests + // ============================================================================ + + #[tokio::test] + async fn test_file_cleanup_after_read() -> Result<()> { + let mut pool = create_spill_pool(500); + + // Create multiple files + for i in 0..5 { + pool.push_batch(&create_test_batch(i * 10, 10))?; + pool.flush()?; // Each batch in its own file + } + + // Verify files exist before reading + let initial_file_count = pool.files.len(); + assert_eq!(initial_file_count, 5); + + pool.finalize(); + + let pool_arc = Arc::new(Mutex::new(pool)); + let stream = SpillPool::reader(Arc::clone(&pool_arc)); + + // Read all batches + let batches = collect_batches(stream).await?; + assert_eq!(batches.len(), 5); + + // All files should be consumed (dropped from queue) + let final_file_count = pool_arc.lock().files.len(); + assert_eq!(final_file_count, 0); + + Ok(()) + } + + #[tokio::test] + async fn test_cleanup_with_rotation() -> Result<()> { + let pool = create_spill_pool(400); + let pool_arc = Arc::new(Mutex::new(pool)); + + // Write and read concurrently + let writer_pool = Arc::clone(&pool_arc); + let writer = SpawnedTask::spawn(async move { + for i in 0..10 { + { + let mut pool = writer_pool.lock(); + pool.push_batch(&create_test_batch(i * 10, 10)).unwrap(); + pool.flush().unwrap(); + } // Drop lock before sleep + tokio::time::sleep(tokio::time::Duration::from_millis(20)).await; + } + writer_pool.lock().finalize(); + }); + + let reader_pool = Arc::clone(&pool_arc); + let stream = SpillPool::reader(reader_pool); + let reader = SpawnedTask::spawn(async move { + let mut batches = Vec::new(); + let mut stream = stream; + while let Some(result) = stream.next().await { + batches.push(result.unwrap()); + // Small delay to let writer create more files + tokio::time::sleep(tokio::time::Duration::from_millis(15)).await; + } + batches + }); + + writer.await.unwrap(); + let batches = reader.await.unwrap(); + + assert_eq!(batches.len(), 10); + + // All files should be cleaned up + let final_file_count = pool_arc.lock().files.len(); + assert_eq!(final_file_count, 0); + + Ok(()) + } + + #[tokio::test] + async fn 
test_cleanup_with_unflushed_file() -> Result<()> { + let mut pool = create_spill_pool(1024 * 1024); + + // Create some flushed files + for i in 0..3 { + pool.push_batch(&create_test_batch(i * 10, 10))?; + pool.flush()?; + } + + // Add unflushed data + pool.push_batch(&create_test_batch(30, 10))?; + // Don't flush! + + // current_write_file should have data + assert!(pool.current_write_file.is_some()); + assert_eq!(pool.files.len(), 3); + + pool.finalize(); + + let pool_arc = Arc::new(Mutex::new(pool)); + let stream = SpillPool::reader(pool_arc); + + // Should only get the 3 flushed batches + let batches = collect_batches(stream).await?; + assert_eq!(batches.len(), 3); + + Ok(()) + } + + // ============================================================================ + // Edge Cases & Error Handling Tests + // ============================================================================ + + #[tokio::test] + async fn test_interleaved_flush() -> Result<()> { + let pool = create_spill_pool(1024 * 1024); + let pool_arc = Arc::new(Mutex::new(pool)); + + // Push → flush + { + let mut pool = pool_arc.lock(); + pool.push_batch(&create_test_batch(0, 10))?; + pool.flush()?; + } + + // Read one batch + let stream = SpillPool::reader(Arc::clone(&pool_arc)); + let mut stream = stream; + let batch1 = stream.next().await.unwrap()?; + assert_eq!(batch1.num_rows(), 10); + + // Push → flush again + { + let mut pool = pool_arc.lock(); + pool.push_batch(&create_test_batch(10, 10))?; + pool.flush()?; + } + + // Read second batch from same stream + let batch2 = stream.next().await.unwrap()?; + assert_eq!(batch2.num_rows(), 10); + + // Finalize and verify stream ends + pool_arc.lock().finalize(); + let batch3 = stream.next().await; + assert!(batch3.is_none()); + + Ok(()) + } + + #[tokio::test] + async fn test_flush_empty_pool() -> Result<()> { + let mut pool = create_spill_pool(1024 * 1024); + + // Flush with no data should be no-op + pool.flush()?; + pool.flush()?; // Multiple flushes + + assert_eq!(pool.files.len(), 0); + assert!(pool.current_write_file.is_none()); + + Ok(()) + } + + #[tokio::test] + async fn test_finalize_idempotent() -> Result<()> { + let mut pool = create_spill_pool(1024 * 1024); + + pool.push_batch(&create_test_batch(0, 10))?; + pool.flush()?; + + // Multiple finalize calls should be safe + pool.finalize(); + assert!(pool.is_finalized()); + pool.finalize(); + assert!(pool.is_finalized()); + pool.finalize(); + assert!(pool.is_finalized()); + + Ok(()) + } + + #[tokio::test] + async fn test_drop_flushes_current_file() -> Result<()> { + let pool = create_spill_pool(1024 * 1024); + let pool_arc = Arc::new(Mutex::new(pool)); + + // Push without flush + { + let mut pool = pool_arc.lock(); + pool.push_batch(&create_test_batch(0, 10)).unwrap(); + pool.flush().unwrap(); + pool.finalize(); + } + + // Drop should trigger flush in Drop impl + // (though in this case we already flushed) + + let stream = SpillPool::reader(pool_arc); + let batches = collect_batches(stream).await?; + assert_eq!(batches.len(), 1); + + Ok(()) + } +} diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index b15ec026372d..7c64f97c6965 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -223,6 +223,7 @@ datafusion.execution.keep_partition_by_columns false datafusion.execution.listing_table_factory_infer_partitions true 
datafusion.execution.listing_table_ignore_subdirectory true datafusion.execution.max_buffered_batches_per_output_file 2 +datafusion.execution.max_spill_file_size_bytes 134217728 datafusion.execution.meta_fetch_concurrency 32 datafusion.execution.minimum_parallel_output_files 4 datafusion.execution.objectstore_writer_buffer_size 10485760 @@ -343,6 +344,7 @@ datafusion.execution.keep_partition_by_columns false Should DataFusion keep the datafusion.execution.listing_table_factory_infer_partitions true Should a `ListingTable` created through the `ListingTableFactory` infer table partitions from Hive compliant directories. Defaults to true (partition columns are inferred and will be represented in the table schema). datafusion.execution.listing_table_ignore_subdirectory true Should sub directories be ignored when scanning directories for data files. Defaults to true (ignores subdirectories), consistent with Hive. Note that this setting does not affect reading partitioned tables (e.g. `/table/year=2021/month=01/data.parquet`). datafusion.execution.max_buffered_batches_per_output_file 2 This is the maximum number of RecordBatches buffered for each output file being worked. Higher values can potentially give faster write performance at the cost of higher peak memory consumption +datafusion.execution.max_spill_file_size_bytes 134217728 Maximum size in bytes for individual spill files before rotating to a new file. When operators spill data to disk (e.g., RepartitionExec), they write multiple batches to the same file until this size limit is reached, then rotate to a new file. This reduces syscall overhead compared to one-file-per-batch while preventing files from growing too large. A larger value reduces file creation overhead but may hold more disk space. A smaller value creates more files but allows finer-grained space reclamation as files can be deleted once fully consumed. Not all operators support this feature, some may create spill files larger than the limit. Default: 128 MB datafusion.execution.meta_fetch_concurrency 32 Number of files to read in parallel when inferring schema and statistics datafusion.execution.minimum_parallel_output_files 4 Guarantees a minimum level of output files running in parallel. RecordBatches will be distributed in round robin fashion to each parallel writer. Each writer is closed and a new file opened once soft_max_rows_per_output_file is reached. datafusion.execution.objectstore_writer_buffer_size 10485760 Size (bytes) of data buffer DataFusion uses when writing output files. This affects the size of the data chunks that are uploaded to remote object stores (e.g. AWS S3). If very large (>= 100 GiB) output files are being written, it may be necessary to increase this size to avoid errors from the remote end point. diff --git a/datafusion/sqllogictest/test_files/repartition.slt b/datafusion/sqllogictest/test_files/repartition.slt index 29d20d10b671..67b3f92a6d99 100644 --- a/datafusion/sqllogictest/test_files/repartition.slt +++ b/datafusion/sqllogictest/test_files/repartition.slt @@ -146,3 +146,135 @@ statement ok DROP TABLE t1; # End repartition on empty columns test + +# Start spilling tests +# Spilling is hard to reproduce with real data / queries. +# The memory limit was tuned by hand to this specific dataset (which was already used in `window.slt`) +# to trigger spilling without being so low that the query would fail. +# Before we introduced spilling to `RepartitionExec`, this query could not complete.
+ +statement ok +CREATE UNBOUNDED EXTERNAL TABLE annotated_data_infinite2 ( + a0 INTEGER, + a INTEGER, + b INTEGER, + c INTEGER, + d INTEGER +) +STORED AS CSV +WITH ORDER (a ASC, b ASC, c ASC) +LOCATION '../../datafusion/core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); + +statement ok +SET datafusion.runtime.memory_limit = '12K'; + +query IIII +SELECT SUM(a) OVER(partition by a, b order by c) as sum1, + SUM(a) OVER(partition by b, a order by c) as sum2, + SUM(a) OVER(partition by a, d order by b) as sum3, + SUM(a) OVER(partition by d order by a) as sum4 +FROM annotated_data_infinite2; +---- +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +2 2 2 4 +14 14 4 4 +12 12 2 4 +25 25 4 4 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +1 1 3 8 +2 2 8 8 +16 16 3 8 +7 7 8 8 +21 21 3 8 +18 18 8 8 +19 19 8 8 +21 21 8 8 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +0 0 0 0 +3 3 10 15 +1 1 11 11 +4 4 10 15 +3 3 11 11 +5 5 6 12 +4 4 15 15 +6 6 4 11 +5 5 12 12 +7 7 10 15 +6 6 11 11 +8 8 10 15 +8 8 12 12 +9 9 10 15 +9 9 11 11 +10 10 4 11 +10 10 15 15 +11 11 6 12 +11 11 15 15 +13 13 10 15 +12 12 15 15 +14 14 6 12 +13 13 12 12 +15 15 6 12 +15 15 12 12 +17 17 4 11 +16 16 15 15 +18 18 6 12 +17 17 11 11 +19 19 10 15 +20 20 11 11 +20 20 10 15 +22 22 12 12 +22 22 4 11 +23 23 11 11 +23 23 10 15 +24 24 12 12 +24 24 10 15 +25 25 6 12 + +# End spilling tests diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index c0e4ccd850d9..43d084da5212 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -114,6 +114,7 @@ The following configuration settings are available: | datafusion.execution.spill_compression | uncompressed | Sets the compression codec used when spilling data to disk. Since datafusion writes spill files using the Arrow IPC Stream format, only codecs supported by the Arrow IPC Stream Writer are allowed. Valid values are: uncompressed, lz4_frame, zstd. Note: lz4_frame offers faster (de)compression, but typically results in larger spill files. In contrast, zstd achieves higher compression ratios at the cost of slower (de)compression speed. | | datafusion.execution.sort_spill_reservation_bytes | 10485760 | Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured). | | datafusion.execution.sort_in_place_threshold_bytes | 1048576 | When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged. | +| datafusion.execution.max_spill_file_size_bytes | 134217728 | Maximum size in bytes for individual spill files before rotating to a new file. When operators spill data to disk (e.g., RepartitionExec), they write multiple batches to the same file until this size limit is reached, then rotate to a new file. This reduces syscall overhead compared to one-file-per-batch while preventing files from growing too large. 
A larger value reduces file creation overhead but may hold more disk space. A smaller value creates more files but allows finer-grained space reclamation as files can be deleted once fully consumed. Not all operators support this feature, some may create spill files larger than the limit. Default: 128 MB | | datafusion.execution.meta_fetch_concurrency | 32 | Number of files to read in parallel when inferring schema and statistics | | datafusion.execution.minimum_parallel_output_files | 4 | Guarantees a minimum level of output files running in parallel. RecordBatches will be distributed in round robin fashion to each parallel writer. Each writer is closed and a new file opened once soft_max_rows_per_output_file is reached. | | datafusion.execution.soft_max_rows_per_output_file | 50000000 | Target number of rows in output files when writing multiple. This is a soft max, so it can be exceeded slightly. There also will be one file smaller than the limit if the total number of rows written is not roughly divisible by the soft max |
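For reviewers who want to try the new option: it can be set per session like any other execution option, either via SQL (`SET datafusion.execution.max_spill_file_size_bytes = ...`, in the same way the repartition.slt test above sets the memory limit) or programmatically. The following is a minimal sketch, assuming the usual `datafusion` prelude re-exports; the function name is made up for illustration.

```rust
use datafusion::prelude::{SessionConfig, SessionContext};

/// Hypothetical helper: build a context that rotates spill files
/// after ~32 MB instead of the 128 MB default.
fn session_with_small_spill_files() -> SessionContext {
    let config = SessionConfig::new()
        .set_usize("datafusion.execution.max_spill_file_size_bytes", 32 * 1024 * 1024);
    SessionContext::new_with_config(config)
}
```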
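And a minimal sketch of the writer/reader cycle that the `SpillPool` unit tests above exercise. This is not a public-API example: the paths are crate-internal to `datafusion-physical-plan` as introduced in this diff, the `Mutex` is assumed to be `parking_lot::Mutex` (the tests call `.lock()` without `.unwrap()`), and the caller is assumed to already have a `SpillManager` and a `RecordBatch`.

```rust
use std::sync::Arc;

use arrow::record_batch::RecordBatch;
use datafusion_common::Result;
use futures::StreamExt;
use parking_lot::Mutex;

use crate::spill::spill_manager::SpillManager;
use crate::spill::spill_pool::SpillPool;

/// Sketch: spill one batch through a SpillPool and read it back in FIFO order.
async fn spill_roundtrip(
    spill_manager: Arc<SpillManager>,
    batch: RecordBatch,
) -> Result<()> {
    // RepartitionExec creates one pool per (input, output) partition pair;
    // the limit here mirrors the default of
    // `datafusion.execution.max_spill_file_size_bytes`.
    let pool = Arc::new(Mutex::new(SpillPool::new(
        128 * 1024 * 1024,
        spill_manager,
    )));

    {
        let mut guard = pool.lock();
        guard.push_batch(&batch)?; // appended to the current spill file
        guard.flush()?;            // made visible to concurrent readers
        guard.finalize();          // no more writes; the stream below will end
    } // drop the lock before awaiting, as the tests above do

    // Batches come back in insertion order; fully consumed files are
    // dropped from the queue, reclaiming disk space as reading proceeds.
    let mut stream = SpillPool::reader(Arc::clone(&pool));
    while let Some(batch) = stream.next().await {
        let _batch = batch?;
    }
    Ok(())
}
```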