Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion crates/polars-plan/dsl-schema-hashes.json
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@
"Schema_for_DataType_and_null": "6f5ccfa2d9f3beca900c1b4ded6dade8173e200287e286f187e69c66bb1bedca",
"Schema_for_Field_and_Map_of_string": "2f9fffc68f13a663609cf6184b816a65b4971f37684ee4f7fe8642c53358a20c",
"SearchSortedSide": "3976138cd5159a641e118a57aedcb079ef8b131b84fdcb102413eb8802a27403",
"Selector": "fcdb32f1c6ccb9f71a57afd77ba233a38888e736dc7b3e4562c54a20e9a0a290",
"Selector": "678b9968c5c46b913a5ee380c7e220f001b853a3136ee41bfd45e589d92355c6",
"SequenceKind": "bff6b860f53c19db2c7d1a02b57b2c27f683a69ddd1668c44eb23c24b436a138",
"SerializeOptions": "c16fee7c896396e39b9cc6ff4da53315e79edc3846738f8bafeea104e4bc0dc4",
"Series": "5bbddd4f899afa592c318b20bb8d0bdfe2877fa5bf1a63d9cd0da908ac3aec0e",
Expand Down
12 changes: 10 additions & 2 deletions crates/polars-plan/src/dsl/selector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,11 @@ pub enum Selector {

Wildcard,
Empty,

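/// Select all columns whose names are not output names of the other expressions
/// in the same projection, regardless of whether those expressions come before
/// or after this selector.
/// This is handled specially in `rewrite_projections` - the output names of all
/// expressions that do not contain `Remaining` are added to `ignored_columns`
/// before expansion.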
Remaining,
}

fn dtype_selector(
Expand Down Expand Up @@ -265,7 +270,9 @@ impl Selector {
.cloned(),
)
},
Self::Wildcard => PlIndexSet::from_iter(
// Remaining behaves like Wildcard - the difference is that
// rewrite_projections adds the output columns of the other expressions
// (preceding and following) to ignored_columns
Self::Wildcard | Self::Remaining => PlIndexSet::from_iter(
schema
.iter_names()
.filter(|name| !ignored_columns.contains(*name))
Expand Down Expand Up @@ -304,7 +311,7 @@ impl Selector {

Self::ByDType(dts) => Some(dts.clone()),

Self::ByName { .. } | Self::ByIndex { .. } | Self::Matches(_) => None,
Self::ByName { .. } | Self::ByIndex { .. } | Self::Matches(_) | Self::Remaining => None,
}
}

Expand Down Expand Up @@ -671,6 +678,7 @@ impl fmt::Display for Selector {
},
Self::Matches(s) => write!(f, "cs.matches(\"{s}\")"),
Self::Wildcard => f.write_str("cs.all()"),
Self::Remaining => f.write_str("cs.remaining()"),
Self::Empty => f.write_str("cs.empty()"),
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use super::*;
use crate::constants::{
POLARS_ELEMENT, POLARS_STRUCTFIELDS, get_pl_element_name, get_pl_structfields_name,
};
use crate::utils::expr_output_name;

pub fn prepare_projection(
exprs: Vec<Expr>,
Expand Down Expand Up @@ -45,20 +46,96 @@ pub fn rewrite_projections(
schema: &Schema,
opt_flags: &mut OptFlags,
) -> PolarsResult<Vec<Expr>> {
// Check if any expression contains `Selector::Remaining`.
// If so, we need to track output columns as we expand.
let has_remaining = exprs.iter().any(expr_contains_remaining);

if !has_remaining {
// Fast path: no Remaining selector, expand normally
let mut result = Vec::with_capacity(exprs.len() + schema.len());
for expr in &exprs {
expand_expression(
expr,
ignored_selector_columns,
schema,
&mut result,
opt_flags,
)?;
}
return Ok(result);
}

// When Remaining is present, expansion takes two passes.
// Each Remaining selector excludes the output columns of ALL other expressions,
// whether they come before or after it. To achieve this, we first collect the
// output names of every expression that does not contain Remaining, then expand
// each Remaining expression with those names added to ignored_columns.
let mut all_output_names = PlHashSet::default();

// First, collect output names from all non-Remaining expressions
for expr in &exprs {
if !expr_contains_remaining(expr) {
let mut expanded = Vec::new();
expand_expression(
expr,
ignored_selector_columns,
schema,
&mut expanded,
opt_flags,
)?;
for e in &expanded {
if let Ok(name) = expr_output_name(e) {
all_output_names.insert(name);
}
}
}
}

// Now expand all expressions, adding collected names to ignored_columns for Remaining
let mut ignored_with_outputs = ignored_selector_columns.clone();
ignored_with_outputs.extend(all_output_names);

let mut result = Vec::with_capacity(exprs.len() + schema.len());
for expr in &exprs {
expand_expression(
expr,
ignored_selector_columns,
schema,
&mut result,
opt_flags,
)?;
if expr_contains_remaining(expr) {
// Expand with all output names excluded
expand_expression(expr, &ignored_with_outputs, schema, &mut result, opt_flags)?;
} else {
// Expand normally
expand_expression(
expr,
ignored_selector_columns,
schema,
&mut result,
opt_flags,
)?;
}
}

Ok(result)
}

/// Returns `true` if any expression yielded by iterating over `expr` is an
/// `Expr::Selector` whose selector contains `Selector::Remaining`, either
/// directly or nested inside a selector combination.
fn expr_contains_remaining(expr: &Expr) -> bool {
    expr.into_iter()
        .any(|node| matches!(node, Expr::Selector(sel) if selector_contains_remaining(sel)))
}

/// Returns `true` if `selector` is `Selector::Remaining` or is a binary
/// combination (union, difference, exclusive-or, intersection) with
/// `Remaining` somewhere in either operand.
fn selector_contains_remaining(selector: &Selector) -> bool {
    // Leaf selectors are decided immediately; only binary combinations recurse.
    let (left, right) = match selector {
        Selector::Remaining => return true,
        Selector::Union(l, r)
        | Selector::Difference(l, r)
        | Selector::ExclusiveOr(l, r)
        | Selector::Intersect(l, r) => (l, r),
        _ => return false,
    };

    // Left operand is inspected first; the right one only if the left has no match.
    selector_contains_remaining(left) || selector_contains_remaining(right)
}

fn toggle_cse_for_structs(opt_flags: &mut OptFlags) {
if opt_flags.contains(OptFlags::EAGER) && !opt_flags.contains(OptFlags::NEW_STREAMING) {
use polars_core::config::verbose;
Expand Down
5 changes: 5 additions & 0 deletions crates/polars-python/src/expr/selector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,11 @@ impl PySelector {
dsl::functions::all().into()
}

/// Build a selector from the `Selector::Remaining` variant.
///
/// Takes no arguments; the variant is converted into `Self` via `Into`.
#[staticmethod]
fn remaining() -> Self {
    Selector::Remaining.into()
}

fn hash(&self) -> u64 {
let mut hasher = std::hash::DefaultHasher::default();
self.inner.hash(&mut hasher);
Expand Down
2 changes: 2 additions & 0 deletions py-polars/src/polars/_plr.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -2047,6 +2047,8 @@ class PySelector:
def empty() -> PySelector: ...
@staticmethod
def all() -> PySelector: ...
@staticmethod
def remaining() -> PySelector: ...
def hash(self) -> int: ...

class PyOptFlags:
Expand Down
64 changes: 64 additions & 0 deletions py-polars/src/polars/selectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@
"matches",
"nested",
"numeric",
"remaining",
"signed_integer",
"starts_with",
"string",
Expand Down Expand Up @@ -2808,6 +2809,69 @@ def numeric() -> Selector:
return Selector._from_pyselector(PySelector.numeric())


@unstable()
def remaining() -> Selector:
    """
    Select all columns whose names do not appear as output names of other expressions.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.

    This selector is useful in `select` and `with_columns` operations when you want
    to perform an operation on specific columns while also keeping all other columns
    that are not already produced by other expressions.

    See Also
    --------
    all : Select all columns.
    exclude : Select all columns except those matching the given columns, datatypes,
        or selectors.

    Examples
    --------
    >>> import polars as pl
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "b": [4, 5, 6],
    ...         "a": [1, 2, 3],
    ...         "c": [7, 8, 9],
    ...     }
    ... )

    Transform column "a" and reorder columns:

    >>> df.select(pl.col("a") * 10, cs.remaining())
    shape: (3, 3)
    ┌─────┬─────┬─────┐
    │ a   ┆ b   ┆ c   │
    │ --- ┆ --- ┆ --- │
    │ i64 ┆ i64 ┆ i64 │
    ╞═════╪═════╪═════╡
    │ 10  ┆ 4   ┆ 7   │
    │ 20  ┆ 5   ┆ 8   │
    │ 30  ┆ 6   ┆ 9   │
    └─────┴─────┴─────┘

    Apply an operation to the remaining columns only. The output names of both the
    preceding and the following expressions ("a" and "d") are excluded:

    >>> df.select(pl.col("a"), cs.remaining().cast(pl.Float64), pl.col("a").alias("d"))
    shape: (3, 4)
    ┌─────┬─────┬─────┬─────┐
    │ a   ┆ b   ┆ c   ┆ d   │
    │ --- ┆ --- ┆ --- ┆ --- │
    │ i64 ┆ f64 ┆ f64 ┆ i64 │
    ╞═════╪═════╪═════╪═════╡
    │ 1   ┆ 4.0 ┆ 7.0 ┆ 1   │
    │ 2   ┆ 5.0 ┆ 8.0 ┆ 2   │
    │ 3   ┆ 6.0 ┆ 9.0 ┆ 3   │
    └─────┴─────┴─────┴─────┘
    """
    return Selector._from_pyselector(PySelector.remaining())


def object() -> Selector:
"""
Select all object columns.
Expand Down
125 changes: 125 additions & 0 deletions py-polars/tests/unit/operations/test_selectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1158,3 +1158,128 @@ def test_multiline_colname_matches() -> None:
cs.contains(prefix).alias("contains"),
)
assert res.columns == ["starts_with", "ends_with", "contains"]


def test_selector_remaining_12067() -> None:
    """Test that cs.remaining() selects columns not output by other expressions."""
    df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})

    # Basic case: one explicit column, remaining selects the rest
    result = df.select(pl.col("a") * 10, cs.remaining())
    assert result.columns == ["a", "b", "c"]
    assert result["a"].to_list() == [10, 20, 30]
    assert result["b"].to_list() == [4, 5, 6]
    assert result["c"].to_list() == [7, 8, 9]

    # Expression using multiple columns - only output name is excluded
    # ("b" is read by the first expression but is still picked up by remaining)
    result = df.select(pl.col("a") + pl.col("b"), cs.remaining())
    assert result.columns == ["a", "b", "c"]
    assert result["a"].to_list() == [5, 7, 9]  # a + b
    assert result["b"].to_list() == [4, 5, 6]
    assert result["c"].to_list() == [7, 8, 9]

    # with_columns: the aliased column "d" is appended after the input columns
    # NOTE(review): the only other output name here is "d", which is not an input
    # column, so cs.remaining() presumably matches "a" too and casts it as well;
    # the dtype of "a" is not asserted below - confirm whether it should be.
    result = df.with_columns(pl.col("a").alias("d"), cs.remaining().cast(pl.Float64))
    assert result.columns == ["a", "b", "c", "d"]
    assert result["b"].dtype == pl.Float64
    assert result["c"].dtype == pl.Float64
    assert result["d"].to_list() == [1, 2, 3]

    # No explicit columns: remaining selects all
    result = df.select(cs.remaining())
    assert result.columns == ["a", "b", "c"]


def test_selector_remaining_with_selector_combination() -> None:
    """Test cs.remaining() next to an expression that is itself built from a selector."""
    df = pl.DataFrame(
        {
            "int_a": [1, 2],
            "int_b": [3, 4],
            "str_c": ["x", "y"],
            "float_d": [1.0, 2.0],
        },
        schema_overrides={"int_a": pl.Int32, "int_b": pl.Int32, "float_d": pl.Float32},
    )

    # Use selector with remaining: the integer columns are produced (and cast) by
    # the first expression, so remaining must only yield the non-integer columns
    result = df.select(cs.integer().cast(pl.Int64), cs.remaining())
    assert result.columns == ["int_a", "int_b", "str_c", "float_d"]
    # Cast applied by the first expression
    assert result["int_a"].dtype == pl.Int64
    assert result["int_b"].dtype == pl.Int64
    # Columns coming from remaining keep their original dtypes
    assert result["str_c"].dtype == pl.String
    assert result["float_d"].dtype == pl.Float32


def test_selector_remaining_with_contains_no_duplicate() -> None:
    """Test cs.remaining() correctly excludes columns selected by cs.contains()."""
    df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})

    # This should not raise a DuplicateError - remaining should exclude
    # columns selected by cs.contains("a") and explicitly named "b"
    # ("b" comes AFTER remaining in the argument list and is still excluded)
    result = df.select(cs.contains("a") * 10, cs.remaining(), "b")
    # Output order follows the order of the expressions: contains, remaining, "b"
    assert result.columns == ["a", "c", "b"]
    assert result["a"].to_list() == [10, 20, 30]
    assert result["c"].to_list() == [7, 8, 9]
    assert result["b"].to_list() == [4, 5, 6]


def test_selector_remaining_first_excludes_output_names() -> None:
    """Test cs.remaining() excludes output names, not referenced columns."""
    df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})

    # remaining() comes first, followed by expression using a and b
    # The expression outputs column "a", so remaining should select "b" and "c"
    # ("b" is only an input of the expression, so it is not excluded)
    result = df.select(cs.remaining(), pl.col("a") + pl.col("b"))
    assert result.columns == ["b", "c", "a"]
    assert result["b"].to_list() == [4, 5, 6]
    assert result["c"].to_list() == [7, 8, 9]
    assert result["a"].to_list() == [5, 7, 9]  # a + b


def test_selector_remaining_with_selector_arithmetic() -> None:
    """Test cs.remaining() works with selector set operators (`-`, `&`, `|`)."""
    df = pl.DataFrame(
        {
            "a": [1, 2],
            "b": [3, 4],
            "c": [5, 6],
            "group": ["x", "y"],
        }
    )

    # remaining() - cs.string() should exclude 'a' (explicit) and string columns
    result = df.select(pl.col("a"), cs.remaining() - cs.string())
    assert result.columns == ["a", "b", "c"]

    # remaining() & cs.numeric() should exclude 'a' and keep only numeric remaining
    result = df.select(pl.col("a"), cs.remaining() & cs.numeric())
    assert result.columns == ["a", "b", "c"]

    # remaining() | cs.string() should exclude 'a', include remaining + string
    # ("group" is the only string column and is already part of remaining)
    result = df.select(pl.col("a"), cs.remaining() | cs.string())
    assert result.columns == ["a", "b", "c", "group"]


def test_selector_remaining_in_group_by_agg() -> None:
    """Test cs.remaining() works correctly in group_by().agg() context."""
    df = pl.DataFrame(
        {
            "a": [1, 2, 3, 4],
            "b": [4, 5, 6, 7],
            "c": [7, 8, 9, 10],
            "group": ["x", "x", "y", "y"],
        }
    )

    # remaining() in agg should exclude group key AND explicitly aggregated columns
    result = df.group_by("group").agg(pl.col("a").sum(), cs.remaining().sum())

    # Should have group + a + remaining (b, c)
    # Compared as a set: only membership is checked here, not column order
    assert set(result.columns) == {"group", "a", "b", "c"}

    # Verify values are correct sums
    # Sort by the key first so the row order is fixed before comparing
    # (presumably because group_by output order is not guaranteed - confirm)
    result_sorted = result.sort("group")
    assert result_sorted["a"].to_list() == [3, 7]  # x: 1+2, y: 3+4
    assert result_sorted["b"].to_list() == [9, 13]  # x: 4+5, y: 6+7
    assert result_sorted["c"].to_list() == [15, 19]  # x: 7+8, y: 9+10
Loading