Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions datafusion/common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,22 @@ config_namespace! {
/// batches and merged.
pub sort_in_place_threshold_bytes: usize, default = 1024 * 1024

/// Maximum size in bytes for individual spill files before rotating to a new file.
///
/// When operators spill data to disk (e.g., RepartitionExec), they write
/// multiple batches to the same file until this size limit is reached, then rotate
/// to a new file. This reduces syscall overhead compared to one-file-per-batch
/// while preventing files from growing too large.
///
/// A larger value reduces file creation overhead but may hold more disk space.
/// A smaller value creates more files but allows finer-grained space reclamation
/// as files can be deleted once fully consumed.
///
/// Not all operators support this feature; those that do not may create spill files larger than the limit.
///
/// Default: 128 MiB (128 * 1024 * 1024 bytes)
pub max_spill_file_size_bytes: usize, default = 128 * 1024 * 1024

/// Number of files to read in parallel when inferring schema and statistics
pub meta_fetch_concurrency: usize, default = 32

Expand Down
580 changes: 399 additions & 181 deletions datafusion/physical-plan/src/repartition/mod.rs

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions datafusion/physical-plan/src/spill/in_progress_spill_file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ pub struct InProgressSpillFile {
writer: Option<IPCStreamWriter>,
/// Lazily initialized in-progress file; it is moved out when the `finish` method is invoked
in_progress_file: Option<RefCountedTempFile>,
/// Number of batches written to this file
batch_count: usize,
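/// Estimated size of data written to this file in bytes, accumulated from each
/// appended batch's in-memory array size (not measured from the file on disk)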
estimated_size: usize,
}

impl InProgressSpillFile {
Expand All @@ -46,6 +50,8 @@ impl InProgressSpillFile {
spill_writer,
in_progress_file: Some(in_progress_file),
writer: None,
batch_count: 0,
estimated_size: 0,
}
}

Expand Down Expand Up @@ -84,6 +90,10 @@ impl InProgressSpillFile {

// Update metrics
self.spill_writer.metrics.spilled_rows.add(spilled_rows);

// Update stats
self.batch_count += 1;
self.estimated_size += batch.get_array_memory_size();
}
Ok(())
}
Expand All @@ -107,4 +117,14 @@ impl InProgressSpillFile {

Ok(self.in_progress_file.take())
}

/// Returns the number of batches written to this file.
///
/// The counter starts at zero when the file is created and is incremented by
/// one each time the append path records a batch, so callers can use it to
/// tell whether anything has been written yet.
pub fn batch_count(&self) -> usize {
self.batch_count
}

/// Returns the estimated size of data written to this file in bytes.
///
/// This is the running sum of `get_array_memory_size()` over every batch
/// recorded by the append path. It reflects the in-memory footprint of the
/// batches rather than the bytes actually written to disk, so treat it as an
/// approximation when comparing against a file-size limit.
pub fn estimated_size(&self) -> usize {
self.estimated_size
}
}
1 change: 1 addition & 0 deletions datafusion/physical-plan/src/spill/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

pub(crate) mod in_progress_spill_file;
pub(crate) mod spill_manager;
pub(crate) mod spill_pool;

use std::fs::File;
use std::io::BufReader;
Expand Down
5 changes: 5 additions & 0 deletions datafusion/physical-plan/src/spill/spill_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,11 @@ impl SpillManager {
self
}

/// Returns the schema for batches managed by this SpillManager.
///
/// The returned value is a new handle produced by `Arc::clone`, so it shares
/// the underlying schema with this manager rather than deep-copying it.
pub fn schema(&self) -> SchemaRef {
Arc::clone(&self.schema)
}

/// Creates a temporary file for in-progress operations, returning an error
/// message if file creation fails. The file can be used to append batches
/// incrementally and then finish the file when done.
Expand Down
Loading
Loading