Skip to content

feat: change Expr OuterReferenceColumn and Alias to Box type for reducing expr struct size #16771

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion datafusion/catalog-listing/src/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ pub fn expr_applicable_for_cols(col_names: &[&str], expr: &Expr) -> bool {
}
Expr::Literal(_, _)
| Expr::Alias(_)
| Expr::OuterReferenceColumn(_, _)
| Expr::OuterReferenceColumn(_)
| Expr::ScalarVariable(_, _)
| Expr::Not(_)
| Expr::IsNotNull(_)
Expand Down
4 changes: 2 additions & 2 deletions datafusion/core/src/dataframe/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2291,15 +2291,15 @@ impl DataFrame {
if cols.contains(field) {
// Try to cast fill value to column type. If the cast fails, fallback to the original column.
match value.clone().cast_to(field.data_type()) {
Ok(fill_value) => Expr::Alias(Alias {
Ok(fill_value) => Expr::Alias(Box::new(Alias {
expr: Box::new(Expr::ScalarFunction(ScalarFunction {
func: coalesce(),
args: vec![col(field.name()), lit(fill_value)],
})),
relation: None,
name: field.name().to_string(),
metadata: None,
}),
})),
Err(_) => col(field.name()),
}
} else {
Expand Down
16 changes: 9 additions & 7 deletions datafusion/core/src/physical_planner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ use datafusion_datasource::file_groups::FileGroup;
use datafusion_datasource::memory::MemorySourceConfig;
use datafusion_expr::dml::{CopyTo, InsertOp};
use datafusion_expr::expr::{
physical_name, AggregateFunction, AggregateFunctionParams, Alias, GroupingSet,
physical_name, AggregateFunction, AggregateFunctionParams, GroupingSet,
WindowFunction, WindowFunctionParams,
};
use datafusion_expr::expr_rewriter::unnormalize_cols;
Expand Down Expand Up @@ -602,9 +602,9 @@ impl DefaultPhysicalPlanner {
} = &window_fun.as_ref().params;
generate_sort_key(partition_by, order_by)
}
Expr::Alias(Alias { expr, .. }) => {
Expr::Alias(boxed_alias) => {
// Convert &Box<T> to &T
match &**expr {
match boxed_alias.expr.as_ref() {
Expr::WindowFunction(window_fun) => {
let WindowFunctionParams {
ref partition_by,
Expand Down Expand Up @@ -1693,7 +1693,7 @@ pub fn create_window_expr(
) -> Result<Arc<dyn WindowExpr>> {
// unpack aliased logical expressions, e.g. "sum(col) over () as total"
let (name, e) = match e {
Expr::Alias(Alias { expr, name, .. }) => (name.clone(), expr.as_ref()),
Expr::Alias(boxed_alias) => (boxed_alias.name.clone(), boxed_alias.expr.as_ref()),
_ => (e.schema_name().to_string(), e),
};
create_window_expr_with_name(e, name, logical_schema, execution_props)
Expand Down Expand Up @@ -1784,9 +1784,11 @@ pub fn create_aggregate_expr_and_maybe_filter(
) -> Result<AggregateExprWithOptionalArgs> {
// unpack (nested) aliased logical expressions, e.g. "sum(col) as total"
let (name, human_display, e) = match e {
Expr::Alias(Alias { expr, name, .. }) => {
(Some(name.clone()), String::default(), expr.as_ref())
}
Expr::Alias(boxed_alias) => (
Some(boxed_alias.name.clone()),
String::default(),
boxed_alias.expr.as_ref(),
),
Expr::AggregateFunction(_) => (
Some(e.schema_name().to_string()),
e.human_display().to_string(),
Expand Down
8 changes: 4 additions & 4 deletions datafusion/core/tests/dataframe/dataframe_functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -415,11 +415,11 @@ async fn test_fn_approx_percentile_cont() -> Result<()> {
");

// the arg2 parameter is a complex expr, but it can be evaluated to the literal value
let alias_expr = Expr::Alias(Alias::new(
let alias_expr = Expr::Alias(Box::new(Alias::new(
cast(lit(0.5), DataType::Float32),
None::<&str>,
"arg_2".to_string(),
));
)));
let expr = approx_percentile_cont(col("b").sort(true, false), alias_expr, None);
let df = create_test_table().await?;
let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?;
Expand All @@ -435,11 +435,11 @@ async fn test_fn_approx_percentile_cont() -> Result<()> {
"
);

let alias_expr = Expr::Alias(Alias::new(
let alias_expr = Expr::Alias(Box::new(Alias::new(
cast(lit(0.1), DataType::Float32),
None::<&str>,
"arg_2".to_string(),
));
)));
let expr = approx_percentile_cont(col("b").sort(false, false), alias_expr, None);
let df = create_test_table().await?;
let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?;
Expand Down
4 changes: 2 additions & 2 deletions datafusion/core/tests/user_defined/expr_planner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,11 @@ impl ExprPlanner for MyCustomPlanner {
})))
}
BinaryOperator::Question => {
Ok(PlannerResult::Planned(Expr::Alias(Alias::new(
Ok(PlannerResult::Planned(Expr::Alias(Box::new(Alias::new(
Expr::Literal(ScalarValue::Boolean(Some(true)), None),
None::<&str>,
format!("{} ? {}", expr.left, expr.right),
))))
)))))
}
_ => Ok(PlannerResult::Original(expr)),
}
Expand Down
88 changes: 43 additions & 45 deletions datafusion/expr/src/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ use sqlparser::ast::{
#[derive(Clone, PartialEq, PartialOrd, Eq, Debug, Hash)]
pub enum Expr {
/// An expression with a specific name.
Alias(Alias),
Alias(Box<Alias>),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another change that might be a bit less impactful would be to box the fields of Alias instead

So Alias {
expr: Box
..
}


It may be just as bad / worse though

/// A named reference to a qualified field in a schema.
Column(Column),
/// A named reference to a variable in a registry.
Expand Down Expand Up @@ -362,7 +362,7 @@ pub enum Expr {
Placeholder(Placeholder),
/// A placeholder which holds a reference to a qualified field
/// in the outer query, used for correlated sub queries.
OuterReferenceColumn(DataType, Column),
OuterReferenceColumn(Box<(DataType, Column)>),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we are going to make this change anyway, can we also pull this into a named struct rather than an unnamed tuple?

like

struct OuterReference {
 // fields here
}

enum Expr {
... 
    OuterReferenceColumn(OuterReference),
...
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you @alamb for this good suggestion; it is addressed in the latest PR.

/// Unnest expression
Unnest(Unnest),
}
Expand Down Expand Up @@ -1424,7 +1424,9 @@ impl Expr {
name,
spans: _,
}) => (relation.clone(), name.clone()),
Expr::Alias(Alias { relation, name, .. }) => (relation.clone(), name.clone()),
Expr::Alias(boxed_alias) => {
(boxed_alias.relation.clone(), boxed_alias.name.clone())
}
_ => (None, self.schema_name().to_string()),
}
}
Expand All @@ -1446,7 +1448,7 @@ impl Expr {
Expr::Case { .. } => "Case",
Expr::Cast { .. } => "Cast",
Expr::Column(..) => "Column",
Expr::OuterReferenceColumn(_, _) => "Outer",
Expr::OuterReferenceColumn(_) => "Outer",
Expr::Exists { .. } => "Exists",
Expr::GroupingSet(..) => "GroupingSet",
Expr::InList { .. } => "InList",
Expand Down Expand Up @@ -1572,7 +1574,7 @@ impl Expr {

/// Return `self AS name` alias expression
pub fn alias(self, name: impl Into<String>) -> Expr {
Expr::Alias(Alias::new(self, None::<&str>, name.into()))
Expr::Alias(Box::new(Alias::new(self, None::<&str>, name.into())))
}

/// Return `self AS name` alias expression with metadata
Expand All @@ -1595,7 +1597,9 @@ impl Expr {
name: impl Into<String>,
metadata: Option<FieldMetadata>,
) -> Expr {
Expr::Alias(Alias::new(self, None::<&str>, name.into()).with_metadata(metadata))
Expr::Alias(Box::new(
Alias::new(self, None::<&str>, name.into()).with_metadata(metadata),
))
}

/// Return `self AS name` alias expression with a specific qualifier
Expand All @@ -1604,7 +1608,7 @@ impl Expr {
relation: Option<impl Into<TableReference>>,
name: impl Into<String>,
) -> Expr {
Expr::Alias(Alias::new(self, relation, name.into()))
Expr::Alias(Box::new(Alias::new(self, relation, name.into())))
}

/// Return `self AS name` alias expression with a specific qualifier and metadata
Expand All @@ -1628,7 +1632,9 @@ impl Expr {
name: impl Into<String>,
metadata: Option<FieldMetadata>,
) -> Expr {
Expr::Alias(Alias::new(self, relation, name.into()).with_metadata(metadata))
Expr::Alias(Box::new(
Alias::new(self, relation, name.into()).with_metadata(metadata),
))
}

/// Remove an alias from an expression if one exists.
Expand Down Expand Up @@ -2021,7 +2027,7 @@ impl Expr {
| Expr::SimilarTo(..)
| Expr::Not(..)
| Expr::Negative(..)
| Expr::OuterReferenceColumn(_, _)
| Expr::OuterReferenceColumn(_)
| Expr::TryCast(..)
| Expr::Unnest(..)
| Expr::Wildcard { .. }
Expand Down Expand Up @@ -2109,23 +2115,10 @@ impl NormalizeEq for Expr {
&& self_right.normalize_eq(other_right)
}
}
(
Expr::Alias(Alias {
expr: self_expr,
relation: self_relation,
name: self_name,
..
}),
Expr::Alias(Alias {
expr: other_expr,
relation: other_relation,
name: other_name,
..
}),
) => {
self_name == other_name
&& self_relation == other_relation
&& self_expr.normalize_eq(other_expr)
(Expr::Alias(boxed_alias), Expr::Alias(boxed_other_alias)) => {
boxed_alias.name == boxed_other_alias.name
&& boxed_alias.relation == boxed_other_alias.relation
&& boxed_alias.expr.normalize_eq(&*boxed_other_alias.expr)
}
(
Expr::Like(Like {
Expand Down Expand Up @@ -2459,14 +2452,9 @@ impl HashNode for Expr {
fn hash_node<H: Hasher>(&self, state: &mut H) {
mem::discriminant(self).hash(state);
match self {
Expr::Alias(Alias {
expr: _expr,
relation,
name,
..
}) => {
relation.hash(state);
name.hash(state);
Expr::Alias(boxed_alias) => {
boxed_alias.relation.hash(state);
boxed_alias.name.hash(state);
}
Expr::Column(column) => {
column.hash(state);
Expand Down Expand Up @@ -2609,7 +2597,8 @@ impl HashNode for Expr {
Expr::Placeholder(place_holder) => {
place_holder.hash(state);
}
Expr::OuterReferenceColumn(data_type, column) => {
Expr::OuterReferenceColumn(boxed_orc) => {
let (data_type, column) = boxed_orc.as_ref();
data_type.hash(state);
column.hash(state);
}
Expand Down Expand Up @@ -2673,12 +2662,14 @@ impl Display for SchemaDisplay<'_> {
}
}
// Expr is not shown since it is aliased
Expr::Alias(Alias {
name,
relation: Some(relation),
..
}) => write!(f, "{relation}.{name}"),
Expr::Alias(Alias { name, .. }) => write!(f, "{name}"),
Expr::Alias(boxed_alias) => {
let alias = boxed_alias.as_ref();
if let Some(relation) = &alias.relation {
write!(f, "{relation}.{}", alias.name)
} else {
write!(f, "{}", alias.name)
}
}
Expr::Between(Between {
expr,
negated,
Expand Down Expand Up @@ -2929,7 +2920,10 @@ impl Display for SqlDisplay<'_> {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
match self.0 {
Expr::Literal(scalar, _) => scalar.fmt(f),
Expr::Alias(Alias { name, .. }) => write!(f, "{name}"),
Expr::Alias(boxed_alias) => {
let name = &boxed_alias.as_ref().name;
write!(f, "{name}")
}
Expr::Between(Between {
expr,
negated,
Expand Down Expand Up @@ -3189,9 +3183,13 @@ pub const UNNEST_COLUMN_PREFIX: &str = "UNNEST";
impl Display for Expr {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
match self {
Expr::Alias(Alias { expr, name, .. }) => write!(f, "{expr} AS {name}"),
Expr::Alias(boxed_alias) => {
let Alias { expr, name, .. } = boxed_alias.as_ref();
write!(f, "{expr} AS {name}")
}
Expr::Column(c) => write!(f, "{c}"),
Expr::OuterReferenceColumn(_, c) => {
Expr::OuterReferenceColumn(boxed_orc) => {
let (_, c) = boxed_orc.as_ref();
write!(f, "{OUTER_REFERENCE_COLUMN_PREFIX}({c})")
}
Expr::ScalarVariable(_, var_names) => write!(f, "{}", var_names.join(".")),
Expand Down Expand Up @@ -3839,7 +3837,7 @@ mod test {
// If this test fails when you change `Expr`, please try
// `Box`ing the fields to make `Expr` smaller
// See https://github.com/apache/datafusion/issues/16199 for details
assert_eq!(size_of::<Expr>(), 128);
assert_eq!(size_of::<Expr>(), 112);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change saves 16 bytes per Expr. Nice.

I will run some planning benchmarks and see if we can see any difference

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you @alamb, I am doing an even more amazing experiment: trying to reduce the size from 128 to 80, so we can save 48 bytes per Expr!

assert_eq!(size_of::<ScalarValue>(), 64);
assert_eq!(size_of::<DataType>(), 24); // 3 ptrs
assert_eq!(size_of::<Vec<Expr>>(), 24);
Expand Down
2 changes: 1 addition & 1 deletion datafusion/expr/src/expr_fn.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ pub fn col(ident: impl Into<Column>) -> Expr {
/// Create an out reference column which hold a reference that has been resolved to a field
/// outside of the current plan.
pub fn out_ref_col(dt: DataType, ident: impl Into<Column>) -> Expr {
Expr::OuterReferenceColumn(dt, ident.into())
Expr::OuterReferenceColumn(Box::new((dt, ident.into())))
}

/// Create an unqualified column expression from the provided name, without normalizing
Expand Down
27 changes: 18 additions & 9 deletions datafusion/expr/src/expr_rewriter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -174,10 +174,14 @@ pub fn create_col_from_scalar_expr(
subqry_alias: String,
) -> Result<Column> {
match scalar_expr {
Expr::Alias(Alias { name, .. }) => Ok(Column::new(
Some::<TableReference>(subqry_alias.into()),
name,
)),
Expr::Alias(boxed_alias) => {
// boxed_alias: &Box<Alias> (if you’re matching on &expr) or Box<Alias>
let name = &boxed_alias.name;
Ok(Column::new(
Some::<TableReference>(subqry_alias.into()),
name,
))
}
Expr::Column(col) => Ok(col.with_relation(subqry_alias.into())),
_ => {
let scalar_column = scalar_expr.schema_name().to_string();
Expand All @@ -200,7 +204,9 @@ pub fn unnormalize_cols(exprs: impl IntoIterator<Item = Expr>) -> Vec<Expr> {
pub fn strip_outer_reference(expr: Expr) -> Expr {
expr.transform(|expr| {
Ok({
if let Expr::OuterReferenceColumn(_, col) = expr {
// Match the boxed (DataType, Column) tuple and extract the Column
if let Expr::OuterReferenceColumn(boxed_pair) = expr {
let (_data_type, col) = *boxed_pair;
Transformed::yes(Expr::Column(col))
} else {
Transformed::no(expr)
Expand Down Expand Up @@ -250,7 +256,9 @@ fn coerce_exprs_for_schema(
let new_type = dst_schema.field(idx).data_type();
if new_type != &expr.get_type(src_schema)? {
match expr {
Expr::Alias(Alias { expr, name, .. }) => {
Expr::Alias(boxed_alias) => {
// boxed_alias: Box<Alias>
let Alias { expr, name, .. } = *boxed_alias;
Ok(expr.cast_to(new_type, src_schema)?.alias(name))
}
#[expect(deprecated)]
Expand All @@ -264,12 +272,13 @@ fn coerce_exprs_for_schema(
.collect::<Result<_>>()
}

/// Recursively un-alias an expressions
/// Recursively un-alias an expression
#[inline]
pub fn unalias(expr: Expr) -> Expr {
match expr {
Expr::Alias(Alias { expr, .. }) => unalias(*expr),
_ => expr,
// Unbox the Alias, then recurse on the inner Expr
Expr::Alias(boxed_alias) => unalias(*boxed_alias.expr),
other => other,
}
}

Expand Down
7 changes: 5 additions & 2 deletions datafusion/expr/src/expr_rewriter/order_by.rs
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,11 @@ fn rewrite_in_terms_of_projection(
/// so avg(c) as average will match avgc
fn expr_match(needle: &Expr, expr: &Expr) -> bool {
// check inside aliases
if let Expr::Alias(Alias { expr, .. }) = &expr {
expr.as_ref() == needle
if let Expr::Alias(boxed_alias) = expr {
// boxed_alias: &Box<Alias>, so boxed_alias.as_ref() is &Alias
let alias: &Alias = boxed_alias.as_ref();
// alias.expr: Box<Expr>, so alias.expr.as_ref() is &Expr
alias.expr.as_ref() == needle
} else {
expr == needle
}
Expand Down
Loading