Skip to content

Commit f9e37b2

Browse files
committed
Add percent rank and dense rank
1 parent edb1ada commit f9e37b2

File tree

4 files changed

+72
-17
lines changed

4 files changed

+72
-17
lines changed

python/datafusion/dataframe.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -123,11 +123,10 @@ def select(self, *exprs: Expr | str) -> DataFrame:
123123
df = df.select("a", col("b"), col("a").alias("alternate_a"))
124124
125125
"""
126-
exprs = [
127-
arg.expr if isinstance(arg, Expr) else Expr.column(arg).expr
128-
for arg in exprs
126+
exprs_internal = [
127+
Expr.column(arg).expr if isinstance(arg, str) else arg.expr for arg in exprs
129128
]
130-
return DataFrame(self.df.select(*exprs))
129+
return DataFrame(self.df.select(*exprs_internal))
131130

132131
def filter(self, *predicates: Expr) -> DataFrame:
133132
"""Return a DataFrame for which ``predicate`` evaluates to ``True``.

python/datafusion/functions.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,9 +250,14 @@
250250
"var",
251251
"var_pop",
252252
"var_samp",
253+
# Window Functions
253254
"window",
254255
"lead",
255256
"lag",
257+
"row_number",
258+
"rank",
259+
"dense_rank",
260+
"percent_rank",
256261
]
257262

258263

@@ -1810,13 +1815,64 @@ def rank() -> Expr:
18101815
is an example of a dataframe with a window ordered by descending ``points`` and the
18111816
associated rank.
18121817
1818+
You should set ``order_by`` to produce meaningful results.
1819+
18131820
```
18141821
+--------+------+
18151822
| points | rank |
18161823
+--------+------+
18171824
| 100 | 1 |
18181825
| 100 | 1 |
18191826
| 50 | 3 |
1827+
| 25 | 4 |
1828+
+--------+------+
1829+
```
1830+
1831+
To set window function parameters use the window builder approach described in the
1832+
ref:`_window_functions` online documentation.
1833+
"""
1834+
return Expr(f.rank())
1835+
1836+
1837+
def dense_rank() -> Expr:
1838+
"""Create a dense_rank window function.
1839+
1840+
This window function is similar to :py:func:`rank` except that the returned values
1841+
will be consecutive. Here is an example of a dataframe with a window ordered by
1842+
descending ``points`` and the associated dense rank.
1843+
1844+
```
1845+
+--------+------------+
1846+
| points | dense_rank |
1847+
+--------+------------+
1848+
| 100 | 1 |
1849+
| 100 | 1 |
1850+
| 50 | 2 |
1851+
| 25 | 3 |
1852+
+--------+------------+
1853+
```
1854+
1855+
To set window function parameters use the window builder approach described in the
1856+
ref:`_window_functions` online documentation.
1857+
"""
1858+
return Expr(f.rank())
1859+
1860+
1861+
def percent_rank() -> Expr:
1862+
"""Create a percent_rank window function.
1863+
1864+
This window function is similar to :py:func:`rank` except that the returned values
1865+
will be consecutive. Here is an example of a dataframe with a window ordered by
1866+
descending ``points`` and the associated dense rank.
1867+
1868+
```
1869+
+--------+------+
1870+
| points | rank |
1871+
+--------+------+
1872+
| 100 | 1 |
1873+
| 100 | 1 |
1874+
| 50 | 2 |
1875+
| 25 | 3 |
18201876
+--------+------+
18211877
```
18221878

python/datafusion/tests/test_dataframe.py

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,7 @@ def test_distinct():
283283
("rank", f.rank().order_by(column("c").sort()).build(), [2, 1, 2]),
284284
(
285285
"dense_rank",
286-
f.window("dense_rank", [], order_by=[f.order_by(column("c"))]),
286+
f.dense_rank().order_by((column("c").sort())).build(),
287287
[2, 1, 2],
288288
),
289289
(
@@ -839,15 +839,3 @@ def test_write_compressed_parquet_missing_compression_level(df, tmp_path, compre
839839

840840
with pytest.raises(ValueError):
841841
df.write_parquet(str(path), compression=compression)
842-
843-
844-
# ctx = SessionContext()
845-
846-
# # create a RecordBatch and a new DataFrame from it
847-
# batch = pa.RecordBatch.from_arrays(
848-
# [pa.array([1, 2, 3]), pa.array([4, 5, 6]), pa.array([8, 5, 8])],
849-
# names=["a", "b", "c"],
850-
# )
851-
852-
# df = ctx.create_dataframe([[batch]])
853-
# test_execute_stream(df)

src/functions.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -879,6 +879,16 @@ pub fn rank() -> PyExpr {
879879
window_function::rank().into()
880880
}
881881

882+
#[pyfunction]
883+
pub fn dense_rank() -> PyExpr {
884+
window_function::dense_rank().into()
885+
}
886+
887+
#[pyfunction]
888+
pub fn percent_rank() -> PyExpr {
889+
window_function::percent_rank().into()
890+
}
891+
882892
pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> {
883893
m.add_wrapped(wrap_pyfunction!(abs))?;
884894
m.add_wrapped(wrap_pyfunction!(acos))?;
@@ -1067,6 +1077,8 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> {
10671077
m.add_wrapped(wrap_pyfunction!(lag))?;
10681078
m.add_wrapped(wrap_pyfunction!(row_number))?;
10691079
m.add_wrapped(wrap_pyfunction!(rank))?;
1080+
m.add_wrapped(wrap_pyfunction!(dense_rank))?;
1081+
m.add_wrapped(wrap_pyfunction!(percent_rank))?;
10701082

10711083
Ok(())
10721084
}

0 commit comments

Comments
 (0)