@@ -76,8 +76,8 @@ use object_store::path::Path;
 use object_store::{ObjectMeta, ObjectStore};
 use parquet::arrow::arrow_reader::statistics::StatisticsConverter;
 use parquet::arrow::arrow_writer::{
-    compute_leaves, get_column_writers, ArrowColumnChunk, ArrowColumnWriter,
-    ArrowLeafColumn, ArrowWriterOptions,
+    compute_leaves, ArrowColumnChunk, ArrowColumnWriter, ArrowLeafColumn,
+    ArrowRowGroupWriterFactory, ArrowWriterOptions,
 };
 use parquet::arrow::async_reader::MetadataFetch;
 use parquet::arrow::{parquet_to_arrow_schema, ArrowSchemaConverter, AsyncArrowWriter};
@@ -1319,14 +1319,6 @@ impl FileSink for ParquetSink {
         object_store: Arc<dyn ObjectStore>,
     ) -> Result<u64> {
         let parquet_opts = &self.parquet_options;
-        let mut allow_single_file_parallelism =
-            parquet_opts.global.allow_single_file_parallelism;
-
-        if parquet_opts.crypto.file_encryption.is_some() {
-            // For now, arrow-rs does not support parallel writes with encryption
-            // See https://github.com/apache/arrow-rs/issues/7359
-            allow_single_file_parallelism = false;
-        }

         let mut file_write_tasks: JoinSet<
             std::result::Result<(Path, FileMetaData), DataFusionError>,
@@ -1343,7 +1335,7 @@ impl FileSink for ParquetSink {
         };

         while let Some((path, mut rx)) = file_stream_rx.recv().await {
-            if !allow_single_file_parallelism {
+            if !parquet_opts.global.allow_single_file_parallelism {
                 let mut writer = self
                     .create_async_arrow_writer(
                         &path,
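With the encryption override removed above, the parallel path is now gated directly by `parquet_opts.global.allow_single_file_parallelism`. For context, a minimal sketch of turning that option off from a session config; this uses the standard `datafusion::config::ParquetOptions` field and is not part of this diff.

// Minimal sketch: disabling single-file write parallelism via DataFusion's session config.
use datafusion::prelude::{SessionConfig, SessionContext};

fn make_context() -> SessionContext {
    let mut config = SessionConfig::new();
    // Equivalent to `SET datafusion.execution.parquet.allow_single_file_parallelism = false;`
    config.options_mut().execution.parquet.allow_single_file_parallelism = false;
    SessionContext::new_with_config(config)
}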
@@ -1471,13 +1463,13 @@ type ColSender = Sender<ArrowLeafColumn>;
 /// Returns join handles for each columns serialization task along with a send channel
 /// to send arrow arrays to each serialization task.
 fn spawn_column_parallel_row_group_writer(
-    schema: Arc<Schema>,
-    parquet_props: Arc<WriterProperties>,
+    arrow_row_group_writer_factory: Arc<ArrowRowGroupWriterFactory>,
     max_buffer_size: usize,
     pool: &Arc<dyn MemoryPool>,
 ) -> Result<(Vec<ColumnWriterTask>, Vec<ColSender>)> {
-    let schema_desc = ArrowSchemaConverter::new().convert(&schema)?;
-    let col_writers = get_column_writers(&schema_desc, &parquet_props, &schema)?;
+    let arrow_row_group_writer =
+        arrow_row_group_writer_factory.create_row_group_writer(0)?;
+    let col_writers = arrow_row_group_writer.into_column_writers();
     let num_columns = col_writers.len();

     let mut col_writer_tasks = Vec::with_capacity(num_columns);
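The per-row-group column writers now come from the factory's row-group writer instead of `get_column_writers`. For context, a minimal sketch of the per-column serialization task that these writers feed; the names and task shape are illustrative, and the DataFusion task additionally reserves memory from the passed `MemoryPool`, omitted here.

// Sketch of one column serialization task: drain ArrowLeafColumn values from a
// channel, encode them with the ArrowColumnWriter, and close it into a chunk.
use parquet::arrow::arrow_writer::{ArrowColumnChunk, ArrowColumnWriter, ArrowLeafColumn};
use parquet::errors::Result;
use tokio::sync::mpsc::Receiver;

async fn column_task(
    mut writer: ArrowColumnWriter,
    mut rx: Receiver<ArrowLeafColumn>,
) -> Result<ArrowColumnChunk> {
    while let Some(leaf) = rx.recv().await {
        // Encode this slice of the column into the writer's in-memory buffers
        writer.write(&leaf)?;
    }
    // Closing yields the finished column chunk, later appended to a row group
    writer.close()
}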
@@ -1572,6 +1564,7 @@ fn spawn_rg_join_and_finalize_task(
 /// across both columns and row_groups, with a theoretical max number of parallel tasks
 /// given by n_columns * num_row_groups.
 fn spawn_parquet_parallel_serialization_task(
+    arrow_row_group_writer_factory: Arc<ArrowRowGroupWriterFactory>,
     mut data: Receiver<RecordBatch>,
     serialize_tx: Sender<SpawnedTask<RBStreamSerializeResult>>,
     schema: Arc<Schema>,
@@ -1584,12 +1577,14 @@ fn spawn_parquet_parallel_serialization_task(
         let max_row_group_rows = writer_props.max_row_group_size();
         let (mut column_writer_handles, mut col_array_channels) =
             spawn_column_parallel_row_group_writer(
-                Arc::clone(&schema),
-                Arc::clone(&writer_props),
+                Arc::clone(&arrow_row_group_writer_factory),
                 max_buffer_rb,
                 &pool,
             )?;
         let mut current_rg_rows = 0;
+        // TODO: row_group_writer should use the correct row group index. Currently this would fail if
+        // multiple row groups were written.
+        // let mut rg_index = 0;

         while let Some(mut rb) = data.recv().await {
             // This loop allows the "else" block to repeatedly split the RecordBatch to handle the case
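The loop referenced in the comment above splits incoming batches so no row group exceeds `max_row_group_rows`. A simplified, illustrative sketch of that splitting step follows; the real loop also finalizes the filled row group and respawns column writers, as the surrounding hunks show.

// Illustrative sketch (not the exact DataFusion loop body): when a RecordBatch would
// overflow the configured row-group size, split it with RecordBatch::slice and carry
// the remainder into the next row group.
use arrow::record_batch::RecordBatch;

fn split_at_row_group_boundary(
    rb: RecordBatch,
    current_rg_rows: usize,
    max_row_group_rows: usize,
) -> (RecordBatch, Option<RecordBatch>) {
    if current_rg_rows + rb.num_rows() <= max_row_group_rows {
        // Fits in the current row group
        (rb, None)
    } else {
        // Take only the rows that still fit; the rest starts the next row group
        let rows_left = max_row_group_rows - current_rg_rows;
        let head = rb.slice(0, rows_left);
        let tail = rb.slice(rows_left, rb.num_rows() - rows_left);
        (head, Some(tail))
    }
}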
@@ -1636,8 +1631,7 @@ fn spawn_parquet_parallel_serialization_task(

                     (column_writer_handles, col_array_channels) =
                         spawn_column_parallel_row_group_writer(
-                            Arc::clone(&schema),
-                            Arc::clone(&writer_props),
+                            Arc::clone(&arrow_row_group_writer_factory),
                             max_buffer_rb,
                             &pool,
                         )?;
@@ -1668,24 +1662,15 @@ fn spawn_parquet_parallel_serialization_task(
 /// Consume RowGroups serialized by other parallel tasks and concatenate them in
 /// to the final parquet file, while flushing finalized bytes to an [ObjectStore]
 async fn concatenate_parallel_row_groups(
+    mut parquet_writer: SerializedFileWriter<SharedBuffer>,
+    merged_buff: SharedBuffer,
     mut serialize_rx: Receiver<SpawnedTask<RBStreamSerializeResult>>,
-    schema: Arc<Schema>,
-    writer_props: Arc<WriterProperties>,
     mut object_store_writer: Box<dyn AsyncWrite + Send + Unpin>,
     pool: Arc<dyn MemoryPool>,
 ) -> Result<FileMetaData> {
-    let merged_buff = SharedBuffer::new(INITIAL_BUFFER_BYTES);
-
     let mut file_reservation =
         MemoryConsumer::new("ParquetSink(SerializedFileWriter)").register(&pool);

-    let schema_desc = ArrowSchemaConverter::new().convert(schema.as_ref())?;
-    let mut parquet_writer = SerializedFileWriter::new(
-        merged_buff.clone(),
-        schema_desc.root_schema_ptr(),
-        writer_props,
-    )?;
-
     while let Some(task) = serialize_rx.recv().await {
         let result = task.join_unwind().await;
         let mut rg_out = parquet_writer.next_row_group()?;
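The loop above drains one finished row group at a time from the serialization tasks. A simplified sketch of the append step it performs, with memory accounting and buffer flushing omitted; `append_row_group` is an illustrative helper, not a function in this file.

// Sketch: take the finished column chunks from one parallel row-group task and
// append them to the shared SerializedFileWriter as a new row group.
use parquet::arrow::arrow_writer::ArrowColumnChunk;
use parquet::errors::Result;
use parquet::file::writer::SerializedFileWriter;

fn append_row_group<W: std::io::Write + Send>(
    parquet_writer: &mut SerializedFileWriter<W>,
    chunks: Vec<ArrowColumnChunk>,
) -> Result<()> {
    let mut rg_out = parquet_writer.next_row_group()?;
    for chunk in chunks {
        // Each chunk is one fully-encoded column for this row group
        chunk.append_to_row_group(&mut rg_out)?;
    }
    // Close the row group; its bytes become available in the underlying buffer
    rg_out.close()?;
    Ok(())
}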
@@ -1736,28 +1721,47 @@ async fn output_single_parquet_file_parallelized(
     let (serialize_tx, serialize_rx) =
         mpsc::channel::<SpawnedTask<RBStreamSerializeResult>>(max_rowgroups);

+    let parquet_schema = ArrowSchemaConverter::new()
+        .with_coerce_types(parquet_props.coerce_types())
+        .convert(&output_schema)?;
+    let merged_buff = SharedBuffer::new(INITIAL_BUFFER_BYTES);
+    let parquet_writer = SerializedFileWriter::new(
+        merged_buff.clone(),
+        parquet_schema.root_schema_ptr(),
+        parquet_props.clone().into(),
+    )?;
+    let arrow_row_group_writer_factory = ArrowRowGroupWriterFactory::new(
+        &parquet_writer,
+        parquet_schema,
+        Arc::clone(&output_schema),
+        parquet_props.clone().into(),
+    );
+
     let arc_props = Arc::new(parquet_props.clone());
     let launch_serialization_task = spawn_parquet_parallel_serialization_task(
+        Arc::new(arrow_row_group_writer_factory),
         data,
         serialize_tx,
         Arc::clone(&output_schema),
         Arc::clone(&arc_props),
         parallel_options,
         Arc::clone(&pool),
     );
+
+    launch_serialization_task
+        .join_unwind()
+        .await
+        .map_err(|e| DataFusionError::ExecutionJoin(Box::new(e)))??;
+
     let file_metadata = concatenate_parallel_row_groups(
+        parquet_writer,
+        merged_buff,
         serialize_rx,
-        Arc::clone(&output_schema),
-        Arc::clone(&arc_props),
         object_store_writer,
         pool,
     )
     .await?;

-    launch_serialization_task
-        .join_unwind()
-        .await
-        .map_err(|e| DataFusionError::ExecutionJoin(Box::new(e)))??;
     Ok(file_metadata)
 }
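For orientation, a condensed sketch of how the writer pieces introduced in this hunk fit together. The converter, file writer, and factory calls mirror the diff above; the in-memory `Vec<u8>` sink and one-column schema are purely illustrative.

use std::sync::Arc;
use arrow::datatypes::{DataType, Field, Schema};
use parquet::arrow::arrow_writer::ArrowRowGroupWriterFactory;
use parquet::arrow::ArrowSchemaConverter;
use parquet::errors::Result;
use parquet::file::properties::WriterProperties;
use parquet::file::writer::SerializedFileWriter;

fn build_writer_parts() -> Result<()> {
    let output_schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, false)]));
    let props = WriterProperties::builder().build();

    // Arrow schema -> Parquet schema descriptor, honoring the coerce_types setting
    let parquet_schema = ArrowSchemaConverter::new()
        .with_coerce_types(props.coerce_types())
        .convert(&output_schema)?;

    // The low-level file writer owns the output sink; any `Write + Send` works here
    let parquet_writer = SerializedFileWriter::new(
        Vec::<u8>::new(),
        parquet_schema.root_schema_ptr(),
        props.clone().into(),
    )?;

    // The factory hands out per-row-group writers for the parallel column tasks
    let factory = ArrowRowGroupWriterFactory::new(
        &parquet_writer,
        parquet_schema,
        Arc::clone(&output_schema),
        props.into(),
    );

    // One set of column writers per row group; index 0 is the first row group
    let col_writers = factory.create_row_group_writer(0)?.into_column_writers();
    assert_eq!(col_writers.len(), 1);
    Ok(())
}

Building the `SerializedFileWriter` and the factory up front lets the same writer be handed to `concatenate_parallel_row_groups` while the factory is shared with the serialization task.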