Skip to content

Commit e102cdb

Browse files
feat: Add bigframes.pandas.col with basic operators
1 parent a2d13e7 commit e102cdb

File tree

7 files changed

+183
-19
lines changed

7 files changed

+183
-19
lines changed

bigframes/core/agg_expressions.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
import functools
2020
import itertools
2121
import typing
22-
from typing import Callable, Mapping, Tuple, TypeVar
22+
from typing import Callable, Hashable, Mapping, Tuple, TypeVar
2323

2424
from bigframes import dtypes
2525
from bigframes.core import expression, window_spec
@@ -68,7 +68,7 @@ def children(self) -> Tuple[expression.Expression, ...]:
6868
return self.inputs
6969

7070
@property
71-
def free_variables(self) -> typing.Tuple[str, ...]:
71+
def free_variables(self) -> typing.Tuple[Hashable, ...]:
7272
return tuple(
7373
itertools.chain.from_iterable(map(lambda x: x.free_variables, self.inputs))
7474
)
@@ -92,7 +92,7 @@ def transform_children(
9292

9393
def bind_variables(
9494
self: TExpression,
95-
bindings: Mapping[str, expression.Expression],
95+
bindings: Mapping[Hashable, expression.Expression],
9696
allow_partial_bindings: bool = False,
9797
) -> TExpression:
9898
return self.transform_children(
@@ -192,7 +192,7 @@ def children(self) -> Tuple[expression.Expression, ...]:
192192
return self.inputs
193193

194194
@property
195-
def free_variables(self) -> typing.Tuple[str, ...]:
195+
def free_variables(self) -> typing.Tuple[Hashable, ...]:
196196
return tuple(
197197
itertools.chain.from_iterable(map(lambda x: x.free_variables, self.inputs))
198198
)
@@ -216,7 +216,7 @@ def transform_children(
216216

217217
def bind_variables(
218218
self: WindowExpression,
219-
bindings: Mapping[str, expression.Expression],
219+
bindings: Mapping[Hashable, expression.Expression],
220220
allow_partial_bindings: bool = False,
221221
) -> WindowExpression:
222222
return self.transform_children(

bigframes/core/col.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
from __future__ import annotations
15+
16+
import dataclasses
17+
from typing import Any, Hashable
18+
19+
import bigframes.core.expression as bf_expression
20+
import bigframes.operations as bf_ops
21+
22+
23+
# Not to be confused with internal Expressions class
24+
# Name collision unintended
25+
@dataclasses.dataclass(frozen=True)
26+
class Expression:
27+
_value: bf_expression.Expression
28+
29+
def _apply_unary(self, op: bf_ops.UnaryOp) -> Expression:
30+
return Expression(op.as_expr(self._value))
31+
32+
def _apply_binary(self, other: Any, op: bf_ops.BinaryOp, reverse: bool = False):
33+
if isinstance(other, Expression):
34+
other_value = other._value
35+
else:
36+
other_value = bf_expression.const(other)
37+
if reverse:
38+
return Expression(op.as_expr(other_value, self._value))
39+
else:
40+
return Expression(op.as_expr(self._value, other_value))
41+
42+
def __add__(self, other: Any) -> Expression:
43+
return self._apply_binary(other, bf_ops.add_op)
44+
45+
def __radd__(self, other: Any) -> Expression:
46+
return self._apply_binary(other, bf_ops.add_op, reverse=True)
47+
48+
def __sub__(self, other: Any) -> Expression:
49+
return self._apply_binary(other, bf_ops.sub_op)
50+
51+
def __rsub__(self, other: Any) -> Expression:
52+
return self._apply_binary(other, bf_ops.sub_op, reverse=True)
53+
54+
def __mul__(self, other: Any) -> Expression:
55+
return self._apply_binary(other, bf_ops.mul_op)
56+
57+
def __rmul__(self, other: Any) -> Expression:
58+
return self._apply_binary(other, bf_ops.mul_op, reverse=True)
59+
60+
def __truediv__(self, other: Any) -> Expression:
61+
return self._apply_binary(other, bf_ops.div_op)
62+
63+
def __rtruediv__(self, other: Any) -> Expression:
64+
return self._apply_binary(other, bf_ops.div_op, reverse=True)
65+
66+
def __floordiv__(self, other: Any) -> Expression:
67+
return self._apply_binary(other, bf_ops.floordiv_op)
68+
69+
def __rfloordiv__(self, other: Any) -> Expression:
70+
return self._apply_binary(other, bf_ops.floordiv_op, reverse=True)
71+
72+
def __ge__(self, other: Any) -> Expression:
73+
return self._apply_binary(other, bf_ops.ge_op)
74+
75+
def __gt__(self, other: Any) -> Expression:
76+
return self._apply_binary(other, bf_ops.gt_op)
77+
78+
def __le__(self, other: Any) -> Expression:
79+
return self._apply_binary(other, bf_ops.le_op)
80+
81+
def __lt__(self, other: Any) -> Expression:
82+
return self._apply_binary(other, bf_ops.lt_op)
83+
84+
def __eq__(self, other: object) -> Expression: # type: ignore
85+
return self._apply_binary(other, bf_ops.eq_op)
86+
87+
def __ne__(self, other: object) -> Expression: # type: ignore
88+
return self._apply_binary(other, bf_ops.ne_op)
89+
90+
def __mod__(self, other: Any) -> Expression:
91+
return self._apply_binary(other, bf_ops.mod_op)
92+
93+
def __rmod__(self, other: Any) -> Expression:
94+
return self._apply_binary(other, bf_ops.mod_op, reverse=True)
95+
96+
def __and__(self, other: Any) -> Expression:
97+
return self._apply_binary(other, bf_ops.and_op)
98+
99+
def __rand__(self, other: Any) -> Expression:
100+
return self._apply_binary(other, bf_ops.and_op, reverse=True)
101+
102+
def __or__(self, other: Any) -> Expression:
103+
return self._apply_binary(other, bf_ops.or_op)
104+
105+
def __ror__(self, other: Any) -> Expression:
106+
return self._apply_binary(other, bf_ops.or_op, reverse=True)
107+
108+
def __xor__(self, other: Any) -> Expression:
109+
return self._apply_binary(other, bf_ops.xor_op)
110+
111+
def __rxor__(self, other: Any) -> Expression:
112+
return self._apply_binary(other, bf_ops.xor_op, reverse=True)
113+
114+
def __invert__(self) -> Expression:
115+
return self._apply_unary(bf_ops.invert_op)
116+
117+
118+
def col(col_name: Hashable) -> Expression:
119+
return Expression(bf_expression.free_var(col_name))

bigframes/core/expression.py

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
import functools
2020
import itertools
2121
import typing
22-
from typing import Callable, Generator, Mapping, TypeVar, Union
22+
from typing import Callable, Generator, Hashable, Mapping, TypeVar, Union
2323

2424
import pandas as pd
2525

@@ -39,7 +39,7 @@ def deref(name: str) -> DerefOp:
3939
return DerefOp(ids.ColumnId(name))
4040

4141

42-
def free_var(id: str) -> UnboundVariableExpression:
42+
def free_var(id: Hashable) -> UnboundVariableExpression:
4343
return UnboundVariableExpression(id)
4444

4545

@@ -52,7 +52,7 @@ class Expression(abc.ABC):
5252
"""An expression represents a computation taking N scalar inputs and producing a single output scalar."""
5353

5454
@property
55-
def free_variables(self) -> typing.Tuple[str, ...]:
55+
def free_variables(self) -> typing.Tuple[Hashable, ...]:
5656
return ()
5757

5858
@property
@@ -116,7 +116,9 @@ def bind_refs(
116116

117117
@abc.abstractmethod
118118
def bind_variables(
119-
self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False
119+
self,
120+
bindings: Mapping[Hashable, Expression],
121+
allow_partial_bindings: bool = False,
120122
) -> Expression:
121123
"""Replace variables with expression given in `bindings`.
122124
@@ -191,7 +193,9 @@ def output_type(self) -> dtypes.ExpressionType:
191193
return self.dtype
192194

193195
def bind_variables(
194-
self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False
196+
self,
197+
bindings: Mapping[Hashable, Expression],
198+
allow_partial_bindings: bool = False,
195199
) -> Expression:
196200
return self
197201

@@ -226,10 +230,10 @@ def transform_children(self, t: Callable[[Expression], Expression]) -> Expressio
226230
class UnboundVariableExpression(Expression):
227231
"""A variable expression representing an unbound variable."""
228232

229-
id: str
233+
id: Hashable
230234

231235
@property
232-
def free_variables(self) -> typing.Tuple[str, ...]:
236+
def free_variables(self) -> typing.Tuple[Hashable, ...]:
233237
return (self.id,)
234238

235239
@property
@@ -256,7 +260,9 @@ def bind_refs(
256260
return self
257261

258262
def bind_variables(
259-
self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False
263+
self,
264+
bindings: Mapping[Hashable, Expression],
265+
allow_partial_bindings: bool = False,
260266
) -> Expression:
261267
if self.id in bindings.keys():
262268
return bindings[self.id]
@@ -304,7 +310,9 @@ def output_type(self) -> dtypes.ExpressionType:
304310
raise ValueError(f"Type of variable {self.id} has not been fixed.")
305311

306312
def bind_variables(
307-
self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False
313+
self,
314+
bindings: Mapping[Hashable, Expression],
315+
allow_partial_bindings: bool = False,
308316
) -> Expression:
309317
return self
310318

@@ -373,7 +381,7 @@ def column_references(
373381
)
374382

375383
@property
376-
def free_variables(self) -> typing.Tuple[str, ...]:
384+
def free_variables(self) -> typing.Tuple[Hashable, ...]:
377385
return tuple(
378386
itertools.chain.from_iterable(map(lambda x: x.free_variables, self.inputs))
379387
)
@@ -408,7 +416,9 @@ def output_type(self) -> dtypes.ExpressionType:
408416
return self.op.output_type(*input_types)
409417

410418
def bind_variables(
411-
self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False
419+
self,
420+
bindings: Mapping[Hashable, Expression],
421+
allow_partial_bindings: bool = False,
412422
) -> OpExpression:
413423
return OpExpression(
414424
self.op,

bigframes/dataframe.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
from bigframes.core import agg_expressions
5959
import bigframes.core.block_transforms as block_ops
6060
import bigframes.core.blocks as blocks
61+
import bigframes.core.col
6162
import bigframes.core.convert
6263
import bigframes.core.explode
6364
import bigframes.core.expression as ex
@@ -94,7 +95,13 @@
9495
import bigframes.session
9596

9697
SingleItemValue = Union[
97-
bigframes.series.Series, int, float, str, pandas.Timedelta, Callable
98+
bigframes.series.Series,
99+
int,
100+
float,
101+
str,
102+
pandas.Timedelta,
103+
Callable,
104+
bigframes.core.col.Expression,
98105
]
99106
MultiItemValue = Union[
100107
"DataFrame", Sequence[int | float | str | pandas.Timedelta | Callable]
@@ -2236,6 +2243,13 @@ def _assign_single_item(
22362243
) -> DataFrame:
22372244
if isinstance(v, bigframes.series.Series):
22382245
return self._assign_series_join_on_index(k, v)
2246+
elif isinstance(v, bigframes.core.col.Expression):
2247+
label_to_col_ref = {
2248+
label: ex.deref(id) for id, label in self._block.col_id_to_label.items()
2249+
}
2250+
resolved_expr = v._value.bind_variables(label_to_col_ref)
2251+
block = self._block.project_block_exprs([resolved_expr], labels=[k])
2252+
return DataFrame(block)
22392253
elif isinstance(v, bigframes.dataframe.DataFrame):
22402254
v_df_col_count = len(v._block.value_columns)
22412255
if v_df_col_count != 1:

bigframes/operations/aggregations.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
205205
return dtypes.TIMEDELTA_DTYPE
206206

207207
if dtypes.is_numeric(input_types[0]):
208-
if pd.api.types.is_bool_dtype(input_types[0]):
208+
if pd.api.types.is_bool_dtype(input_types[0]): # type: ignore
209209
return dtypes.INT_DTYPE
210210
return input_types[0]
211211

@@ -224,7 +224,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
224224
# These will change if median is changed to exact implementation.
225225
if not dtypes.is_orderable(input_types[0]):
226226
raise TypeError(f"Type {input_types[0]} is not orderable")
227-
if pd.api.types.is_bool_dtype(input_types[0]):
227+
if pd.api.types.is_bool_dtype(input_types[0]): # type: ignore
228228
return dtypes.INT_DTYPE
229229
else:
230230
return input_types[0]

bigframes/pandas/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import pandas
2828

2929
import bigframes._config as config
30+
from bigframes.core.col import col
3031
import bigframes.core.global_session as global_session
3132
import bigframes.core.indexes
3233
from bigframes.core.logging import log_adapter
@@ -415,6 +416,7 @@ def reset_session():
415416
"clean_up_by_session_id",
416417
"concat",
417418
"crosstab",
419+
"col",
418420
"cut",
419421
"deploy_remote_function",
420422
"deploy_udf",

tests/unit/test_dataframe_polars.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -828,6 +828,25 @@ def test_assign_new_column(scalars_dfs):
828828
assert_frame_equal(bf_result, pd_result)
829829

830830

831+
def test_assign_using_pd_col(scalars_dfs):
832+
if pd.__version__.startswith("1.") or pd.__version__.startswith("2."):
833+
pytest.skip("col expression interface only supported for pandas 3+")
834+
scalars_df, scalars_pandas_df = scalars_dfs
835+
bf_kwargs = {
836+
"new_col_1": 4 - bpd.col("int64_col"),
837+
"new_col_2": bpd.col("int64_col") / (bpd.col("float64_col") * 0.5),
838+
}
839+
pd_kwargs = {
840+
"new_col_1": 4 - pd.col("int64_col"), # type: ignore
841+
"new_col_2": pd.col("int64_col") / (pd.col("float64_col") * 0.5), # type: ignore
842+
}
843+
df = scalars_df.assign(**bf_kwargs)
844+
bf_result = df.to_pandas()
845+
pd_result = scalars_pandas_df.assign(**pd_kwargs)
846+
847+
assert_frame_equal(bf_result, pd_result)
848+
849+
831850
def test_assign_new_column_w_loc(scalars_dfs):
832851
scalars_df, scalars_pandas_df = scalars_dfs
833852
bf_df = scalars_df.copy()

0 commit comments

Comments
 (0)