
Commit 7b6d5d0

Initial commit
1 parent b0c8dd6 commit 7b6d5d0

14 files changed: +496 -218 lines changed

Cargo.lock

Lines changed: 397 additions & 151 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 8 additions & 7 deletions
@@ -89,19 +89,20 @@ ahash = { version = "0.8", default-features = false, features = [
     "runtime-rng",
 ] }
 apache-avro = { version = "0.17", default-features = false }
-arrow = { version = "56.0.0", features = [
+arrow = { git = "https://github.com/rok/arrow-rs.git", branch = "multi-threaded_encrypted_writing", features = [
     "prettyprint",
     "chrono-tz",
 ] }
-arrow-buffer = { version = "56.0.0", default-features = false }
-arrow-flight = { version = "56.0.0", features = [
+
+arrow-buffer = { git = "https://github.com/rok/arrow-rs.git", branch = "multi-threaded_encrypted_writing", default-features = false }
+arrow-flight = { git = "https://github.com/rok/arrow-rs.git", features = [
     "flight-sql-experimental",
 ] }
-arrow-ipc = { version = "56.0.0", default-features = false, features = [
+arrow-ipc = { git = "https://github.com/rok/arrow-rs.git", branch = "multi-threaded_encrypted_writing", default-features = false, features = [
     "lz4",
 ] }
-arrow-ord = { version = "56.0.0", default-features = false }
-arrow-schema = { version = "56.0.0", default-features = false }
+arrow-ord = { git = "https://github.com/rok/arrow-rs.git", branch = "multi-threaded_encrypted_writing", default-features = false }
+arrow-schema = { git = "https://github.com/rok/arrow-rs.git", branch = "multi-threaded_encrypted_writing", default-features = false }
 async-trait = "0.1.88"
 bigdecimal = "0.4.8"
 bytes = "1.10"
@@ -155,7 +156,7 @@ itertools = "0.14"
 log = "^0.4"
 object_store = { version = "0.12.3", default-features = false }
 parking_lot = "0.12"
-parquet = { version = "56.0.0", default-features = false, features = [
+parquet = { git = "https://github.com/rok/arrow-rs.git", branch = "multi-threaded_encrypted_writing", default-features = false, features = [
     "arrow",
     "async",
     "object_store",

datafusion-examples/Cargo.toml

Lines changed: 0 additions & 12 deletions
@@ -32,18 +32,6 @@ rust-version = { workspace = true }
 [lints]
 workspace = true

-[[example]]
-name = "flight_sql_server"
-path = "examples/flight/flight_sql_server.rs"
-
-[[example]]
-name = "flight_server"
-path = "examples/flight/flight_server.rs"
-
-[[example]]
-name = "flight_client"
-path = "examples/flight/flight_client.rs"
-
 [[example]]
 name = "dataframe_to_s3"
 path = "examples/external_dependency/dataframe-to-s3.rs"

datafusion/common/Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -71,7 +71,7 @@ log = { workspace = true }
 object_store = { workspace = true, optional = true }
 parquet = { workspace = true, optional = true, default-features = true }
 paste = "1.0.15"
-pyo3 = { version = "0.25", optional = true }
+pyo3 = { version = "0.25.1", optional = true }
 recursive = { workspace = true, optional = true }
 sqlparser = { workspace = true }
 tokio = { workspace = true }

datafusion/common/src/file_options/parquet_writer.rs

Lines changed: 33 additions & 0 deletions
@@ -25,6 +25,8 @@ use crate::{
     DataFusionError, Result, _internal_datafusion_err,
 };

+pub const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096;
+
 use arrow::datatypes::Schema;
 // TODO: handle once deprecated
 use crate::encryption::add_crypto_to_writer_properties;
@@ -161,6 +163,19 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder {
                 builder =
                     builder.set_column_bloom_filter_ndv(path.clone(), bloom_filter_ndv);
             }
+<<<<<<< HEAD
+=======
+
+            // max_statistics_size is deprecated, currently it is not being used
+            // TODO: remove once deprecated
+            // #[allow(deprecated)]
+            // if let Some(max_statistics_size) = options.max_statistics_size {
+            //     builder = {
+            //         #[allow(deprecated)]
+            //         builder.set_column_max_statistics_size(path, max_statistics_size)
+            //     }
+            // }
+>>>>>>> f1f6d637c (Initial commit)
         }

         Ok(builder)
@@ -209,6 +224,10 @@ impl ParquetOptions {
             dictionary_enabled,
             dictionary_page_size_limit,
             statistics_enabled,
+<<<<<<< HEAD
+=======
+            max_statistics_size: _max_statistics_size,
+>>>>>>> f1f6d637c (Initial commit)
             max_row_group_size,
             created_by,
             column_index_truncate_length,
@@ -255,6 +274,16 @@ impl ParquetOptions {
             .set_data_page_row_count_limit(*data_page_row_count_limit)
             .set_bloom_filter_enabled(*bloom_filter_on_write);

+<<<<<<< HEAD
+=======
+        // builder = {
+        //     #[allow(deprecated)]
+        //     builder.set_max_statistics_size(
+        //         max_statistics_size.unwrap_or(DEFAULT_MAX_STATISTICS_SIZE),
+        //     )
+        // };
+
+>>>>>>> f1f6d637c (Initial commit)
         if let Some(bloom_filter_fpp) = bloom_filter_fpp {
             builder = builder.set_bloom_filter_fpp(*bloom_filter_fpp);
         };
@@ -533,6 +562,10 @@ mod tests {
             ),
             bloom_filter_fpp: bloom_filter_default_props.map(|p| p.fpp),
             bloom_filter_ndv: bloom_filter_default_props.map(|p| p.ndv),
+<<<<<<< HEAD
+=======
+            max_statistics_size: Some(DEFAULT_MAX_STATISTICS_SIZE),
+>>>>>>> f1f6d637c (Initial commit)
         }
     }


datafusion/common/src/scalar/mod.rs

Lines changed: 3 additions & 1 deletion
@@ -2386,7 +2386,9 @@ impl ScalarValue {
             | DataType::Time64(TimeUnit::Millisecond)
             | DataType::RunEndEncoded(_, _)
             | DataType::ListView(_)
-            | DataType::LargeListView(_) => {
+            | DataType::LargeListView(_)
+            | DataType::Decimal32(_, _)
+            | DataType::Decimal64(_, _) => {
                 return _not_impl_err!(
                     "Unsupported creation of {:?} array from ScalarValue {:?}",
                     data_type,

datafusion/core/src/dataframe/parquet.rs

Lines changed: 1 addition & 0 deletions
@@ -278,6 +278,7 @@ mod tests {
         // Write encrypted parquet using write_parquet
         let mut options = TableParquetOptions::default();
         options.crypto.file_encryption = Some((&encrypt).into());
+        options.global.allow_single_file_parallelism = true;

         df.write_parquet(
             tempfile_str.as_str(),
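
The test change above exercises the encrypted, parallel write path end to end. A minimal sketch of the same pattern outside the test suite, assuming DataFusion's public `DataFrame::write_parquet` API and the parquet crate's `FileEncryptionProperties` builder with encryption support enabled (the CSV path, output path, and 16-byte key are placeholders):

```rust
use datafusion::config::TableParquetOptions;
use datafusion::dataframe::DataFrameWriteOptions;
use datafusion::prelude::*;
use parquet::encryption::encrypt::FileEncryptionProperties;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    // Placeholder 16-byte AES key, for illustration only.
    let footer_key = b"0123456789012345".to_vec();
    let encrypt = FileEncryptionProperties::builder(footer_key).build()?;

    let ctx = SessionContext::new();
    let df = ctx.read_csv("example.csv", CsvReadOptions::new()).await?;

    let mut options = TableParquetOptions::default();
    options.crypto.file_encryption = Some((&encrypt).into());
    // With the arrow-rs branch above, an encrypted write can take the
    // single-file parallel path instead of being forced onto the serial writer.
    options.global.allow_single_file_parallelism = true;

    df.write_parquet("encrypted.parquet", DataFrameWriteOptions::new(), Some(options))
        .await?;
    Ok(())
}
```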

datafusion/core/tests/fuzz_cases/pruning.rs

Lines changed: 1 addition & 1 deletion
@@ -314,7 +314,7 @@ async fn execute_with_predicate(
 }

 async fn write_parquet_file(
-    truncation_length: Option<usize>,
+    _truncation_length: Option<usize>,
     schema: Arc<Schema>,
     row_groups: Vec<Vec<String>>,
 ) -> Bytes {

datafusion/datasource-avro/src/avro_to_arrow/schema.rs

Lines changed: 2 additions & 0 deletions
@@ -239,6 +239,8 @@ fn default_field_name(dt: &DataType) -> &str {
         DataType::Decimal64(_, _) => "decimal",
         DataType::Decimal128(_, _) => "decimal",
         DataType::Decimal256(_, _) => "decimal",
+        DataType::Decimal32(_, _) => "decimal",
+        DataType::Decimal64(_, _) => "decimal",
     }
 }


datafusion/datasource-parquet/src/file_format.rs

Lines changed: 40 additions & 36 deletions
@@ -76,8 +76,8 @@ use object_store::path::Path;
 use object_store::{ObjectMeta, ObjectStore};
 use parquet::arrow::arrow_reader::statistics::StatisticsConverter;
 use parquet::arrow::arrow_writer::{
-    compute_leaves, get_column_writers, ArrowColumnChunk, ArrowColumnWriter,
-    ArrowLeafColumn, ArrowWriterOptions,
+    compute_leaves, ArrowColumnChunk, ArrowColumnWriter, ArrowLeafColumn,
+    ArrowRowGroupWriterFactory, ArrowWriterOptions,
 };
 use parquet::arrow::async_reader::MetadataFetch;
 use parquet::arrow::{parquet_to_arrow_schema, ArrowSchemaConverter, AsyncArrowWriter};
@@ -1319,14 +1319,6 @@ impl FileSink for ParquetSink {
         object_store: Arc<dyn ObjectStore>,
     ) -> Result<u64> {
         let parquet_opts = &self.parquet_options;
-        let mut allow_single_file_parallelism =
-            parquet_opts.global.allow_single_file_parallelism;
-
-        if parquet_opts.crypto.file_encryption.is_some() {
-            // For now, arrow-rs does not support parallel writes with encryption
-            // See https://github.com/apache/arrow-rs/issues/7359
-            allow_single_file_parallelism = false;
-        }

         let mut file_write_tasks: JoinSet<
             std::result::Result<(Path, FileMetaData), DataFusionError>,
@@ -1343,7 +1335,7 @@
         };

         while let Some((path, mut rx)) = file_stream_rx.recv().await {
-            if !allow_single_file_parallelism {
+            if !parquet_opts.global.allow_single_file_parallelism {
                 let mut writer = self
                     .create_async_arrow_writer(
                         &path,
@@ -1471,13 +1463,13 @@ type ColSender = Sender<ArrowLeafColumn>;
 /// Returns join handles for each columns serialization task along with a send channel
 /// to send arrow arrays to each serialization task.
 fn spawn_column_parallel_row_group_writer(
-    schema: Arc<Schema>,
-    parquet_props: Arc<WriterProperties>,
+    arrow_row_group_writer_factory: Arc<ArrowRowGroupWriterFactory>,
     max_buffer_size: usize,
     pool: &Arc<dyn MemoryPool>,
 ) -> Result<(Vec<ColumnWriterTask>, Vec<ColSender>)> {
-    let schema_desc = ArrowSchemaConverter::new().convert(&schema)?;
-    let col_writers = get_column_writers(&schema_desc, &parquet_props, &schema)?;
+    let arrow_row_group_writer =
+        arrow_row_group_writer_factory.create_row_group_writer(0)?;
+    let col_writers = arrow_row_group_writer.into_column_writers();
     let num_columns = col_writers.len();

     let mut col_writer_tasks = Vec::with_capacity(num_columns);
@@ -1572,6 +1564,7 @@ fn spawn_rg_join_and_finalize_task(
 /// across both columns and row_groups, with a theoretical max number of parallel tasks
 /// given by n_columns * num_row_groups.
 fn spawn_parquet_parallel_serialization_task(
+    arrow_row_group_writer_factory: Arc<ArrowRowGroupWriterFactory>,
     mut data: Receiver<RecordBatch>,
     serialize_tx: Sender<SpawnedTask<RBStreamSerializeResult>>,
     schema: Arc<Schema>,
@@ -1584,12 +1577,14 @@
     let max_row_group_rows = writer_props.max_row_group_size();
     let (mut column_writer_handles, mut col_array_channels) =
         spawn_column_parallel_row_group_writer(
-            Arc::clone(&schema),
-            Arc::clone(&writer_props),
+            Arc::clone(&arrow_row_group_writer_factory),
            max_buffer_rb,
             &pool,
         )?;
     let mut current_rg_rows = 0;
+    // TODO: row_group_writer should use the correct row group index. Currently this would fail if
+    // multiple row groups were written.
+    // let mut rg_index = 0;

     while let Some(mut rb) = data.recv().await {
         // This loop allows the "else" block to repeatedly split the RecordBatch to handle the case
@@ -1636,8 +1631,7 @@

                 (column_writer_handles, col_array_channels) =
                     spawn_column_parallel_row_group_writer(
-                        Arc::clone(&schema),
-                        Arc::clone(&writer_props),
+                        Arc::clone(&arrow_row_group_writer_factory),
                         max_buffer_rb,
                         &pool,
                     )?;
@@ -1668,24 +1662,15 @@
 /// Consume RowGroups serialized by other parallel tasks and concatenate them in
 /// to the final parquet file, while flushing finalized bytes to an [ObjectStore]
 async fn concatenate_parallel_row_groups(
+    mut parquet_writer: SerializedFileWriter<SharedBuffer>,
+    merged_buff: SharedBuffer,
     mut serialize_rx: Receiver<SpawnedTask<RBStreamSerializeResult>>,
-    schema: Arc<Schema>,
-    writer_props: Arc<WriterProperties>,
     mut object_store_writer: Box<dyn AsyncWrite + Send + Unpin>,
     pool: Arc<dyn MemoryPool>,
 ) -> Result<FileMetaData> {
-    let merged_buff = SharedBuffer::new(INITIAL_BUFFER_BYTES);
-
     let mut file_reservation =
         MemoryConsumer::new("ParquetSink(SerializedFileWriter)").register(&pool);

-    let schema_desc = ArrowSchemaConverter::new().convert(schema.as_ref())?;
-    let mut parquet_writer = SerializedFileWriter::new(
-        merged_buff.clone(),
-        schema_desc.root_schema_ptr(),
-        writer_props,
-    )?;
-
     while let Some(task) = serialize_rx.recv().await {
         let result = task.join_unwind().await;
         let mut rg_out = parquet_writer.next_row_group()?;
@@ -1736,28 +1721,47 @@ async fn output_single_parquet_file_parallelized(
     let (serialize_tx, serialize_rx) =
         mpsc::channel::<SpawnedTask<RBStreamSerializeResult>>(max_rowgroups);

+    let parquet_schema = ArrowSchemaConverter::new()
+        .with_coerce_types(parquet_props.coerce_types())
+        .convert(&output_schema)?;
+    let merged_buff = SharedBuffer::new(INITIAL_BUFFER_BYTES);
+    let parquet_writer = SerializedFileWriter::new(
+        merged_buff.clone(),
+        parquet_schema.root_schema_ptr(),
+        parquet_props.clone().into(),
+    )?;
+    let arrow_row_group_writer_factory = ArrowRowGroupWriterFactory::new(
+        &parquet_writer,
+        parquet_schema,
+        Arc::clone(&output_schema),
+        parquet_props.clone().into(),
+    );
+
     let arc_props = Arc::new(parquet_props.clone());
     let launch_serialization_task = spawn_parquet_parallel_serialization_task(
+        Arc::new(arrow_row_group_writer_factory),
         data,
         serialize_tx,
         Arc::clone(&output_schema),
         Arc::clone(&arc_props),
         parallel_options,
         Arc::clone(&pool),
     );
+
+    launch_serialization_task
+        .join_unwind()
+        .await
+        .map_err(|e| DataFusionError::ExecutionJoin(Box::new(e)))??;
+
     let file_metadata = concatenate_parallel_row_groups(
+        parquet_writer,
+        merged_buff,
         serialize_rx,
-        Arc::clone(&output_schema),
-        Arc::clone(&arc_props),
         object_store_writer,
         pool,
     )
     .await?;

-    launch_serialization_task
-        .join_unwind()
-        .await
-        .map_err(|e| DataFusionError::ExecutionJoin(Box::new(e)))??;
     Ok(file_metadata)
 }

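
Taken together, this file now builds the `SerializedFileWriter` and an `ArrowRowGroupWriterFactory` up front and threads them through the parallel tasks, instead of each stage deriving its own writers from the schema and writer properties. A rough sketch of how the per-column writers are obtained in the new scheme, assuming the `ArrowRowGroupWriterFactory` API from the rok/arrow-rs `multi-threaded_encrypted_writing` branch used above (a `Vec<u8>` stands in for DataFusion's internal `SharedBuffer`, and the row-group index is fixed at 0 as in the diff):

```rust
use std::sync::Arc;

use arrow::datatypes::Schema;
use parquet::arrow::arrow_writer::{ArrowColumnWriter, ArrowRowGroupWriterFactory};
use parquet::arrow::ArrowSchemaConverter;
use parquet::errors::Result;
use parquet::file::properties::WriterProperties;
use parquet::file::writer::SerializedFileWriter;

/// Sketch: create one file writer plus a row-group writer factory, then ask the
/// factory for the column writers of row group 0 (previously obtained through
/// `get_column_writers`).
fn column_writers_for_first_row_group(
    output_schema: Arc<Schema>,
    props: Arc<WriterProperties>,
) -> Result<Vec<ArrowColumnWriter>> {
    let parquet_schema = ArrowSchemaConverter::new()
        .with_coerce_types(props.coerce_types())
        .convert(&output_schema)?;
    // Vec<u8> implements `Write`; DataFusion uses its SharedBuffer here instead.
    let file_writer = SerializedFileWriter::new(
        Vec::new(),
        parquet_schema.root_schema_ptr(),
        Arc::clone(&props),
    )?;
    let factory = ArrowRowGroupWriterFactory::new(
        &file_writer,
        parquet_schema,
        Arc::clone(&output_schema),
        props,
    );
    // Row group index 0; the TODO in the diff notes that later row groups
    // would need their own index.
    let row_group_writer = factory.create_row_group_writer(0)?;
    Ok(row_group_writer.into_column_writers())
}
```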
