
Commit 689d67d

Merge branch 'main' into support_parquet_for_h2o
2 parents: da1a31d + 63dd4e2

23 files changed: +1396, -222 lines


datafusion-examples/examples/default_column_values.rs

Lines changed: 1 addition & 1 deletion

@@ -263,7 +263,7 @@ impl TableProvider for DefaultValueTableProvider {
             .with_projection(projection.cloned())
             .with_limit(limit)
             .with_file_group(file_group)
-            .with_expr_adapter(Arc::new(DefaultValuePhysicalExprAdapterFactory) as _);
+            .with_expr_adapter(Some(Arc::new(DefaultValuePhysicalExprAdapterFactory) as _));

         Ok(Arc::new(DataSourceExec::new(Arc::new(
             file_scan_config.build(),
datafusion-examples/examples/json_shredding.rs

Lines changed: 1 addition & 1 deletion

@@ -273,7 +273,7 @@ impl TableProvider for ExampleTableProvider {
             .with_limit(limit)
             .with_file_group(file_group)
             // if the rewriter needs a reference to the table schema you can bind self.schema() here
-            .with_expr_adapter(Arc::new(ShreddedJsonRewriterFactory) as _);
+            .with_expr_adapter(Some(Arc::new(ShreddedJsonRewriterFactory) as _));

         Ok(Arc::new(DataSourceExec::new(Arc::new(
             file_scan_config.build(),
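Both example diffs reflect the same API change: `FileScanConfigBuilder::with_expr_adapter` now takes an `Option<Arc<dyn PhysicalExprAdapterFactory>>` rather than the factory itself, so callers wrap the factory in `Some(...)` or pass `None` to keep the default adapter. A minimal sketch of the new call shape; the helper name and import paths here are assumptions, not part of this commit:

use std::sync::Arc;
use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
use datafusion_physical_expr::schema_rewriter::PhysicalExprAdapterFactory;

// Sketch only: attach a caller-supplied factory to a scan being built.
fn attach_expr_adapter(
    builder: FileScanConfigBuilder,
    factory: Arc<dyn PhysicalExprAdapterFactory>,
) -> FileScanConfigBuilder {
    // The setter now takes Option<...>; passing None keeps the default behavior.
    builder.with_expr_adapter(Some(factory))
}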

datafusion/core/src/datasource/listing/table.rs

Lines changed: 32 additions & 2 deletions

@@ -48,6 +48,7 @@ use datafusion_execution::{
 use datafusion_expr::{
     dml::InsertOp, Expr, SortExpr, TableProviderFilterPushDown, TableType,
 };
+use datafusion_physical_expr::schema_rewriter::PhysicalExprAdapterFactory;
 use datafusion_physical_expr_common::sort_expr::LexOrdering;
 use datafusion_physical_plan::{empty::EmptyExec, ExecutionPlan, Statistics};
 use futures::{future, stream, Stream, StreamExt, TryStreamExt};
@@ -99,6 +100,8 @@ pub struct ListingTableConfig {
     schema_source: SchemaSource,
     /// Optional [`SchemaAdapterFactory`] for creating schema adapters
     schema_adapter_factory: Option<Arc<dyn SchemaAdapterFactory>>,
+    /// Optional [`PhysicalExprAdapterFactory`] for creating physical expression adapters
+    physical_expr_adapter_factory: Option<Arc<dyn PhysicalExprAdapterFactory>>,
 }

 impl ListingTableConfig {
@@ -281,6 +284,7 @@ impl ListingTableConfig {
             options: Some(listing_options),
             schema_source: self.schema_source,
             schema_adapter_factory: self.schema_adapter_factory,
+            physical_expr_adapter_factory: self.physical_expr_adapter_factory,
         })
     }

@@ -300,6 +304,7 @@
             options: _,
             schema_source,
             schema_adapter_factory,
+            physical_expr_adapter_factory,
         } = self;

         let (schema, new_schema_source) = match file_schema {
@@ -322,6 +327,7 @@
                     options: Some(options),
                     schema_source: new_schema_source,
                     schema_adapter_factory,
+                    physical_expr_adapter_factory,
                 })
             }
             None => internal_err!("No `ListingOptions` set for inferring schema"),
@@ -364,6 +370,7 @@
                     options: Some(options),
                     schema_source: self.schema_source,
                     schema_adapter_factory: self.schema_adapter_factory,
+                    physical_expr_adapter_factory: self.physical_expr_adapter_factory,
                 })
             }
             None => config_err!("No `ListingOptions` set for inferring schema"),
@@ -415,6 +422,26 @@
     pub fn schema_adapter_factory(&self) -> Option<&Arc<dyn SchemaAdapterFactory>> {
         self.schema_adapter_factory.as_ref()
     }
+
+    /// Set the [`PhysicalExprAdapterFactory`] for the [`ListingTable`]
+    ///
+    /// The expression adapter factory is used to create physical expression adapters that can
+    /// handle schema evolution and type conversions when evaluating expressions
+    /// with different schemas than the table schema.
+    ///
+    /// If not provided, a default physical expression adapter factory will be used unless a custom
+    /// `SchemaAdapterFactory` is set, in which case only the `SchemaAdapterFactory` will be used.
+    ///
+    /// See <https://github.com/apache/datafusion/issues/16800> for details on this transition.
+    pub fn with_physical_expr_adapter_factory(
+        self,
+        physical_expr_adapter_factory: Arc<dyn PhysicalExprAdapterFactory>,
+    ) -> Self {
+        Self {
+            physical_expr_adapter_factory: Some(physical_expr_adapter_factory),
+            ..self
+        }
+    }
 }

 /// Options for creating a [`ListingTable`]
@@ -911,6 +938,8 @@ pub struct ListingTable {
     column_defaults: HashMap<String, Expr>,
     /// Optional [`SchemaAdapterFactory`] for creating schema adapters
     schema_adapter_factory: Option<Arc<dyn SchemaAdapterFactory>>,
+    /// Optional [`PhysicalExprAdapterFactory`] for creating physical expression adapters
+    expr_adapter_factory: Option<Arc<dyn PhysicalExprAdapterFactory>>,
 }

 impl ListingTable {
@@ -952,6 +981,7 @@
             constraints: Constraints::default(),
             column_defaults: HashMap::new(),
             schema_adapter_factory: config.schema_adapter_factory,
+            expr_adapter_factory: config.physical_expr_adapter_factory,
         };

         Ok(table)
@@ -1196,6 +1226,7 @@
                 .with_limit(limit)
                 .with_output_ordering(output_ordering)
                 .with_table_partition_cols(table_partition_cols)
+                .with_expr_adapter(self.expr_adapter_factory.clone())
                 .build(),
         )
         .await
@@ -1995,7 +2026,6 @@
     #[tokio::test]
     async fn test_insert_into_append_new_parquet_files_session_overrides() -> Result<()> {
         let mut config_map: HashMap<String, String> = HashMap::new();
-        config_map.insert("datafusion.execution.batch_size".into(), "10".into());
         config_map.insert(
             "datafusion.execution.soft_max_rows_per_output_file".into(),
             "10".into(),
@@ -2060,7 +2090,7 @@
            "datafusion.execution.parquet.write_batch_size".into(),
            "5".into(),
        );
-        config_map.insert("datafusion.execution.batch_size".into(), "1".into());
+        config_map.insert("datafusion.execution.batch_size".into(), "10".into());
        helper_test_append_new_files_to_table(
            ParquetFormat::default().get_ext(),
            FileCompressionType::UNCOMPRESSED,
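For `ListingTable`, the factory is threaded through `ListingTableConfig` via the new `with_physical_expr_adapter_factory` method shown above. A hedged sketch of opting in; import paths are assumptions, and the listing options and schema still have to be configured before `try_new` as usual:

use std::sync::Arc;
use datafusion::datasource::listing::{ListingTable, ListingTableConfig, ListingTableUrl};
use datafusion_physical_expr::schema_rewriter::PhysicalExprAdapterFactory;

// Sketch under the API added in this commit; ListingOptions setup and
// schema inference are elided.
fn table_with_expr_adapter(
    factory: Arc<dyn PhysicalExprAdapterFactory>,
) -> datafusion::error::Result<Arc<ListingTable>> {
    let url = ListingTableUrl::parse("file:///data/my_table/")?;
    let config = ListingTableConfig::new(url)
        // ... .with_listing_options(...) and a schema would normally go here ...
        .with_physical_expr_adapter_factory(factory);
    Ok(Arc::new(ListingTable::try_new(config)?))
}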
datafusion/core/tests/execution/datasource_split.rs (new file)

Lines changed: 123 additions & 0 deletions

@@ -0,0 +1,123 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::{
+    array::{ArrayRef, Int32Array},
+    datatypes::{DataType, Field, Schema},
+    record_batch::RecordBatch,
+};
+use datafusion_datasource::memory::MemorySourceConfig;
+use datafusion_execution::TaskContext;
+use datafusion_physical_plan::{common::collect, ExecutionPlan};
+use std::sync::Arc;
+
+/// Helper function to create a memory source with the given batch size and collect all batches
+async fn create_and_collect_batches(
+    batch_size: usize,
+) -> datafusion_common::Result<Vec<RecordBatch>> {
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+    let array = Int32Array::from_iter_values(0..batch_size as i32);
+    let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array) as ArrayRef])?;
+    let exec = MemorySourceConfig::try_new_exec(&[vec![batch]], schema, None)?;
+    let ctx = Arc::new(TaskContext::default());
+    let stream = exec.execute(0, ctx)?;
+    collect(stream).await
+}
+
+/// Helper function to create a memory source with multiple batches and collect all results
+async fn create_and_collect_multiple_batches(
+    input_batches: Vec<RecordBatch>,
+) -> datafusion_common::Result<Vec<RecordBatch>> {
+    let schema = input_batches[0].schema();
+    let exec = MemorySourceConfig::try_new_exec(&[input_batches], schema, None)?;
+    let ctx = Arc::new(TaskContext::default());
+    let stream = exec.execute(0, ctx)?;
+    collect(stream).await
+}
+
+#[tokio::test]
+async fn datasource_splits_large_batches() -> datafusion_common::Result<()> {
+    let batch_size = 20000;
+    let batches = create_and_collect_batches(batch_size).await?;
+
+    assert!(batches.len() > 1);
+    let max = batches.iter().map(|b| b.num_rows()).max().unwrap();
+    assert!(
+        max <= datafusion_execution::config::SessionConfig::new()
+            .options()
+            .execution
+            .batch_size
+    );
+    let total: usize = batches.iter().map(|b| b.num_rows()).sum();
+    assert_eq!(total, batch_size);
+    Ok(())
+}
+
+#[tokio::test]
+async fn datasource_exact_batch_size_no_split() -> datafusion_common::Result<()> {
+    let session_config = datafusion_execution::config::SessionConfig::new();
+    let configured_batch_size = session_config.options().execution.batch_size;
+
+    let batches = create_and_collect_batches(configured_batch_size).await?;
+
+    // Should not split when exactly equal to batch_size
+    assert_eq!(batches.len(), 1);
+    assert_eq!(batches[0].num_rows(), configured_batch_size);
+    Ok(())
+}
+
+#[tokio::test]
+async fn datasource_small_batch_no_split() -> datafusion_common::Result<()> {
+    // Test with batch smaller than the batch size (8192)
+    let small_batch_size = 512; // Less than 8192
+
+    let batches = create_and_collect_batches(small_batch_size).await?;
+
+    // Should not split small batches below the batch size
+    assert_eq!(batches.len(), 1);
+    assert_eq!(batches[0].num_rows(), small_batch_size);
+    Ok(())
+}
+
+#[tokio::test]
+async fn datasource_empty_batch_clean_termination() -> datafusion_common::Result<()> {
+    let batches = create_and_collect_batches(0).await?;
+
+    // Empty batch should result in one empty batch
+    assert_eq!(batches.len(), 1);
+    assert_eq!(batches[0].num_rows(), 0);
+    Ok(())
+}
+
+#[tokio::test]
+async fn datasource_multiple_empty_batches() -> datafusion_common::Result<()> {
+    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
+    let empty_array = Int32Array::from_iter_values(std::iter::empty::<i32>());
+    let empty_batch =
+        RecordBatch::try_new(schema.clone(), vec![Arc::new(empty_array) as ArrayRef])?;
+
+    // Create multiple empty batches
+    let input_batches = vec![empty_batch.clone(), empty_batch.clone(), empty_batch];
+    let batches = create_and_collect_multiple_batches(input_batches).await?;
+
+    // Should preserve empty batches without issues
+    assert_eq!(batches.len(), 3);
+    for batch in &batches {
+        assert_eq!(batch.num_rows(), 0);
+    }
+    Ok(())
+}
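The threshold these tests assert against is the session-level `datafusion.execution.batch_size` option (8192 by default), which the tests read back via `options().execution.batch_size`. A small sketch of adjusting it through the standard `SessionConfig` API, not something introduced by this commit:

use datafusion_execution::config::SessionConfig;

// Lower the execution batch size; sources that honor the setting will then
// split any batch larger than 1024 rows into smaller chunks.
fn small_batch_config() -> SessionConfig {
    SessionConfig::new().with_batch_size(1024)
}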

datafusion/core/tests/execution/mod.rs

Lines changed: 1 addition & 0 deletions

@@ -16,4 +16,5 @@
 // under the License.

 mod coop;
+mod datasource_split;
 mod logical_plan;

datafusion/core/tests/integration_tests/schema_adapter_integration_tests.rs

Lines changed: 64 additions & 0 deletions

@@ -148,6 +148,70 @@ async fn test_parquet_integration_with_schema_adapter() -> Result<()> {
     Ok(())
 }

+#[cfg(feature = "parquet")]
+#[tokio::test]
+async fn test_parquet_integration_with_schema_adapter_and_expression_rewriter() -> Result<()> {
+    // Create a temporary directory for our test file
+    let tmp_dir = TempDir::new()?;
+    let file_path = tmp_dir.path().join("test.parquet");
+    let file_path_str = file_path.to_str().unwrap();
+
+    // Create test data
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("id", DataType::Int32, false),
+        Field::new("name", DataType::Utf8, true),
+    ]));
+
+    let batch = RecordBatch::try_new(
+        schema.clone(),
+        vec![
+            Arc::new(arrow::array::Int32Array::from(vec![1, 2, 3])),
+            Arc::new(arrow::array::StringArray::from(vec!["a", "b", "c"])),
+        ],
+    )?;
+
+    // Write test parquet file
+    let file = std::fs::File::create(file_path_str)?;
+    let props = WriterProperties::builder().build();
+    let mut writer = ArrowWriter::try_new(file, schema.clone(), Some(props))?;
+    writer.write(&batch)?;
+    writer.close()?;
+
+    // Create a session context
+    let ctx = SessionContext::new();
+
+    // Create a ParquetSource with the adapter factory
+    let source = ParquetSource::default()
+        .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}));
+
+    // Create a scan config
+    let config = FileScanConfigBuilder::new(
+        ObjectStoreUrl::parse(&format!("file://{}", file_path_str))?,
+        schema.clone(),
+    )
+    .with_source(source)
+    .build();
+
+    // Create a data source executor
+    let exec = DataSourceExec::from_data_source(config);
+
+    // Collect results
+    let task_ctx = ctx.task_ctx();
+    let stream = exec.execute(0, task_ctx)?;
+    let batches = datafusion::physical_plan::common::collect(stream).await?;
+
+    // There should be one batch
+    assert_eq!(batches.len(), 1);
+
+    // Verify the schema has uppercase column names
+    let result_schema = batches[0].schema();
+    assert_eq!(result_schema.field(0).name(), "ID");
+    assert_eq!(result_schema.field(1).name(), "NAME");
+
+    Ok(())
+}
+
 #[tokio::test]
 async fn test_multi_source_schema_adapter_reuse() -> Result<()> {
     // This test verifies that the same schema adapter factory can be reused
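Combined with the builder change earlier in this commit, a scan can carry both a `SchemaAdapterFactory` (on the source) and a `PhysicalExprAdapterFactory` (on the scan config). A hedged fragment reusing the test's names, where `MyExprAdapterFactory` is a hypothetical placeholder and `store_url`/`file_schema` are elided inputs:

// Sketch: schema adapter on the source, expression adapter on the config.
let source = ParquetSource::default()
    .with_schema_adapter_factory(Arc::new(UppercaseAdapterFactory {}));
let config = FileScanConfigBuilder::new(store_url, file_schema)
    .with_source(source)
    .with_expr_adapter(Some(Arc::new(MyExprAdapterFactory) as _))
    .build();
let exec = DataSourceExec::from_data_source(config);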

datafusion/core/tests/parquet/mod.rs

Lines changed: 1 addition & 0 deletions

@@ -50,6 +50,7 @@ mod filter_pushdown;
 mod page_pruning;
 mod row_group_pruning;
 mod schema;
+mod schema_adapter;
 mod schema_coercion;
 mod utils;