Skip to content

Commit 56b7243

Browse files
chore: deprecate select_columns (#911)
* chore: deprecate select_columns * chore: lint * Update user document to use select instead of select_columns * Update all tpch examples to use select instead of select_columns --------- Co-authored-by: Tim Saucer <[email protected]>
1 parent f59dd08 commit 56b7243

26 files changed

+98
-109
lines changed

docs/source/user-guide/common-operations/select-and-filter.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ DataFusion can work with several file types, to start simple we can use a subset
3333
3434
ctx = SessionContext()
3535
df = ctx.read_parquet("yellow_trip_data.parquet")
36-
df.select_columns("trip_distance", "passenger_count")
36+
df.select("trip_distance", "passenger_count")
3737
3838
For mathematical or logical operations use :py:func:`~datafusion.col` to select columns, and give meaningful names to the resulting
3939
operations using :py:func:`~datafusion.expr.Expr.alias`
@@ -48,7 +48,7 @@ operations using :py:func:`~datafusion.expr.Expr.alias`
4848

4949
Please be aware that all identifiers are effectively made lower-case in SQL, so if your file has capital letters
5050
(ex: Name) you must put your column name in double quotes or the selection won’t work. As an alternative for simple
51-
column selection use :py:func:`~datafusion.dataframe.DataFrame.select_columns` without double quotes
51+
column selection use :py:func:`~datafusion.dataframe.DataFrame.select` without double quotes
5252

5353
For selecting columns with capital letters use ``'"VendorID"'``
5454

examples/import.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
# The dictionary keys represent column names and the dictionary values
2929
# represent column values
3030
df = ctx.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]})
31-
assert type(df) == datafusion.DataFrame
31+
assert type(df) is datafusion.DataFrame
3232
# Dataframe:
3333
# +---+---+
3434
# | a | b |
@@ -40,19 +40,19 @@
4040

4141
# Create a datafusion DataFrame from a Python list of rows
4242
df = ctx.from_pylist([{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": 6}])
43-
assert type(df) == datafusion.DataFrame
43+
assert type(df) is datafusion.DataFrame
4444

4545
# Convert pandas DataFrame to datafusion DataFrame
4646
pandas_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
4747
df = ctx.from_pandas(pandas_df)
48-
assert type(df) == datafusion.DataFrame
48+
assert type(df) is datafusion.DataFrame
4949

5050
# Convert polars DataFrame to datafusion DataFrame
5151
polars_df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
5252
df = ctx.from_polars(polars_df)
53-
assert type(df) == datafusion.DataFrame
53+
assert type(df) is datafusion.DataFrame
5454

5555
# Convert Arrow Table to datafusion DataFrame
5656
arrow_table = pa.Table.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]})
5757
df = ctx.from_arrow(arrow_table)
58-
assert type(df) == datafusion.DataFrame
58+
assert type(df) is datafusion.DataFrame

examples/tpch/convert_data_to_parquet.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,6 @@
138138

139139
df = ctx.read_csv(source_file, schema=schema, has_header=False, delimiter="|")
140140

141-
df = df.select_columns(*output_cols)
141+
df = df.select(*output_cols)
142142

143143
df.write_parquet(dest_file, compression="snappy")

examples/tpch/q02_minimum_cost_supplier.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,10 @@
4343

4444
ctx = SessionContext()
4545

46-
df_part = ctx.read_parquet(get_data_path("part.parquet")).select_columns(
46+
df_part = ctx.read_parquet(get_data_path("part.parquet")).select(
4747
"p_partkey", "p_mfgr", "p_type", "p_size"
4848
)
49-
df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select_columns(
49+
df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select(
5050
"s_acctbal",
5151
"s_name",
5252
"s_address",
@@ -55,13 +55,13 @@
5555
"s_nationkey",
5656
"s_suppkey",
5757
)
58-
df_partsupp = ctx.read_parquet(get_data_path("partsupp.parquet")).select_columns(
58+
df_partsupp = ctx.read_parquet(get_data_path("partsupp.parquet")).select(
5959
"ps_partkey", "ps_suppkey", "ps_supplycost"
6060
)
61-
df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns(
61+
df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select(
6262
"n_nationkey", "n_regionkey", "n_name"
6363
)
64-
df_region = ctx.read_parquet(get_data_path("region.parquet")).select_columns(
64+
df_region = ctx.read_parquet(get_data_path("region.parquet")).select(
6565
"r_regionkey", "r_name"
6666
)
6767

@@ -115,7 +115,7 @@
115115

116116
# From the problem statement, these are the values we wish to output
117117

118-
df = df.select_columns(
118+
df = df.select(
119119
"s_acctbal",
120120
"s_name",
121121
"n_name",

examples/tpch/q03_shipping_priority.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -37,13 +37,13 @@
3737

3838
ctx = SessionContext()
3939

40-
df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select_columns(
40+
df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select(
4141
"c_mktsegment", "c_custkey"
4242
)
43-
df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns(
43+
df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select(
4444
"o_orderdate", "o_shippriority", "o_custkey", "o_orderkey"
4545
)
46-
df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
46+
df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
4747
"l_orderkey", "l_extendedprice", "l_discount", "l_shipdate"
4848
)
4949

@@ -80,7 +80,7 @@
8080

8181
# Change the order that the columns are reported in just to match the spec
8282

83-
df = df.select_columns("l_orderkey", "revenue", "o_orderdate", "o_shippriority")
83+
df = df.select("l_orderkey", "revenue", "o_orderdate", "o_shippriority")
8484

8585
# Show result
8686

examples/tpch/q04_order_priority_checking.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,10 @@
3939

4040
ctx = SessionContext()
4141

42-
df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns(
42+
df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select(
4343
"o_orderdate", "o_orderpriority", "o_orderkey"
4444
)
45-
df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
45+
df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
4646
"l_orderkey", "l_commitdate", "l_receiptdate"
4747
)
4848

@@ -54,7 +54,7 @@
5454
# Limit results to cases where commitment date before receipt date
5555
# Aggregate the results so we only get one row to join with the order table.
5656
# Alternately, and likely more idiomatic is instead of `.aggregate` you could
57-
# do `.select_columns("l_orderkey").distinct()`. The goal here is to show
57+
# do `.select("l_orderkey").distinct()`. The goal here is to show
5858
# multiple examples of how to use Data Fusion.
5959
df_lineitem = df_lineitem.filter(col("l_commitdate") < col("l_receiptdate")).aggregate(
6060
[col("l_orderkey")], []

examples/tpch/q05_local_supplier_volume.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -47,22 +47,22 @@
4747

4848
ctx = SessionContext()
4949

50-
df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select_columns(
50+
df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select(
5151
"c_custkey", "c_nationkey"
5252
)
53-
df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns(
53+
df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select(
5454
"o_custkey", "o_orderkey", "o_orderdate"
5555
)
56-
df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
56+
df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
5757
"l_orderkey", "l_suppkey", "l_extendedprice", "l_discount"
5858
)
59-
df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select_columns(
59+
df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select(
6060
"s_suppkey", "s_nationkey"
6161
)
62-
df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns(
62+
df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select(
6363
"n_nationkey", "n_regionkey", "n_name"
6464
)
65-
df_region = ctx.read_parquet(get_data_path("region.parquet")).select_columns(
65+
df_region = ctx.read_parquet(get_data_path("region.parquet")).select(
6666
"r_regionkey", "r_name"
6767
)
6868

examples/tpch/q06_forecasting_revenue_change.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151

5252
ctx = SessionContext()
5353

54-
df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
54+
df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
5555
"l_shipdate", "l_quantity", "l_extendedprice", "l_discount"
5656
)
5757

examples/tpch/q07_volume_shipping.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -49,19 +49,19 @@
4949

5050
ctx = SessionContext()
5151

52-
df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select_columns(
52+
df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select(
5353
"s_suppkey", "s_nationkey"
5454
)
55-
df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
55+
df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
5656
"l_shipdate", "l_extendedprice", "l_discount", "l_suppkey", "l_orderkey"
5757
)
58-
df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns(
58+
df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select(
5959
"o_orderkey", "o_custkey"
6060
)
61-
df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select_columns(
61+
df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select(
6262
"c_custkey", "c_nationkey"
6363
)
64-
df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns(
64+
df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select(
6565
"n_nationkey", "n_name"
6666
)
6767

examples/tpch/q08_market_share.py

+8-10
Original file line numberDiff line numberDiff line change
@@ -47,25 +47,23 @@
4747

4848
ctx = SessionContext()
4949

50-
df_part = ctx.read_parquet(get_data_path("part.parquet")).select_columns(
51-
"p_partkey", "p_type"
52-
)
53-
df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select_columns(
50+
df_part = ctx.read_parquet(get_data_path("part.parquet")).select("p_partkey", "p_type")
51+
df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select(
5452
"s_suppkey", "s_nationkey"
5553
)
56-
df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
54+
df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
5755
"l_partkey", "l_extendedprice", "l_discount", "l_suppkey", "l_orderkey"
5856
)
59-
df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns(
57+
df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select(
6058
"o_orderkey", "o_custkey", "o_orderdate"
6159
)
62-
df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select_columns(
60+
df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select(
6361
"c_custkey", "c_nationkey"
6462
)
65-
df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns(
63+
df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select(
6664
"n_nationkey", "n_name", "n_regionkey"
6765
)
68-
df_region = ctx.read_parquet(get_data_path("region.parquet")).select_columns(
66+
df_region = ctx.read_parquet(get_data_path("region.parquet")).select(
6967
"r_regionkey", "r_name"
7068
)
7169

@@ -133,7 +131,7 @@
133131

134132
# When we join to the customer dataframe, we don't want to confuse other columns, so only
135133
# select the supplier key that we need
136-
df_national_suppliers = df_national_suppliers.select_columns("s_suppkey")
134+
df_national_suppliers = df_national_suppliers.select("s_suppkey")
137135

138136

139137
# Part 3: Combine suppliers and customers and compute the market share

examples/tpch/q09_product_type_profit_measure.py

+6-8
Original file line numberDiff line numberDiff line change
@@ -39,27 +39,25 @@
3939

4040
ctx = SessionContext()
4141

42-
df_part = ctx.read_parquet(get_data_path("part.parquet")).select_columns(
43-
"p_partkey", "p_name"
44-
)
45-
df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select_columns(
42+
df_part = ctx.read_parquet(get_data_path("part.parquet")).select("p_partkey", "p_name")
43+
df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select(
4644
"s_suppkey", "s_nationkey"
4745
)
48-
df_partsupp = ctx.read_parquet(get_data_path("partsupp.parquet")).select_columns(
46+
df_partsupp = ctx.read_parquet(get_data_path("partsupp.parquet")).select(
4947
"ps_suppkey", "ps_partkey", "ps_supplycost"
5048
)
51-
df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
49+
df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
5250
"l_partkey",
5351
"l_extendedprice",
5452
"l_discount",
5553
"l_suppkey",
5654
"l_orderkey",
5755
"l_quantity",
5856
)
59-
df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns(
57+
df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select(
6058
"o_orderkey", "o_custkey", "o_orderdate"
6159
)
62-
df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns(
60+
df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select(
6361
"n_nationkey", "n_name", "n_regionkey"
6462
)
6563

examples/tpch/q10_returned_item_reporting.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444

4545
ctx = SessionContext()
4646

47-
df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select_columns(
47+
df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select(
4848
"c_custkey",
4949
"c_nationkey",
5050
"c_name",
@@ -53,13 +53,13 @@
5353
"c_phone",
5454
"c_comment",
5555
)
56-
df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
56+
df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
5757
"l_extendedprice", "l_discount", "l_orderkey", "l_returnflag"
5858
)
59-
df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns(
59+
df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select(
6060
"o_orderkey", "o_custkey", "o_orderdate"
6161
)
62-
df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns(
62+
df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select(
6363
"n_nationkey", "n_name", "n_regionkey"
6464
)
6565

@@ -87,7 +87,7 @@
8787
df = df.join(df_nation, (["c_nationkey"], ["n_nationkey"]), how="inner")
8888

8989
# These are the columns the problem statement requires
90-
df = df.select_columns(
90+
df = df.select(
9191
"c_custkey",
9292
"c_name",
9393
"revenue",

examples/tpch/q11_important_stock_identification.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -37,13 +37,13 @@
3737

3838
ctx = SessionContext()
3939

40-
df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select_columns(
40+
df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select(
4141
"s_suppkey", "s_nationkey"
4242
)
43-
df_partsupp = ctx.read_parquet(get_data_path("partsupp.parquet")).select_columns(
43+
df_partsupp = ctx.read_parquet(get_data_path("partsupp.parquet")).select(
4444
"ps_supplycost", "ps_availqty", "ps_suppkey", "ps_partkey"
4545
)
46-
df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns(
46+
df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select(
4747
"n_nationkey", "n_name"
4848
)
4949

@@ -75,7 +75,7 @@
7575
df = df.filter(col("value") / col("total_value") >= lit(FRACTION))
7676

7777
# We only need to report on these two columns
78-
df = df.select_columns("ps_partkey", "value")
78+
df = df.select("ps_partkey", "value")
7979

8080
# Sort in descending order of value
8181
df = df.sort(col("value").sort(ascending=False))

examples/tpch/q12_ship_mode_order_priority.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,10 @@
4242

4343
ctx = SessionContext()
4444

45-
df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns(
45+
df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select(
4646
"o_orderkey", "o_orderpriority"
4747
)
48-
df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
48+
df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
4949
"l_orderkey", "l_shipmode", "l_commitdate", "l_shipdate", "l_receiptdate"
5050
)
5151

examples/tpch/q13_customer_distribution.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -38,12 +38,10 @@
3838

3939
ctx = SessionContext()
4040

41-
df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns(
41+
df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select(
4242
"o_custkey", "o_comment"
4343
)
44-
df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select_columns(
45-
"c_custkey"
46-
)
44+
df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select("c_custkey")
4745

4846
# Use a regex to remove special cases
4947
df_orders = df_orders.filter(

examples/tpch/q14_promotion_effect.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,10 @@
4141

4242
ctx = SessionContext()
4343

44-
df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
44+
df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select(
4545
"l_partkey", "l_shipdate", "l_extendedprice", "l_discount"
4646
)
47-
df_part = ctx.read_parquet(get_data_path("part.parquet")).select_columns(
48-
"p_partkey", "p_type"
49-
)
47+
df_part = ctx.read_parquet(get_data_path("part.parquet")).select("p_partkey", "p_type")
5048

5149

5250
# Check part type begins with PROMO

0 commit comments

Comments
 (0)