Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix polars groupby script to work with polars==latest #244

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 16 additions & 21 deletions polars/groupby-polars.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -6,7 +6,7 @@
import gc
import timeit
import polars as pl
from polars.lazy import col
from polars import col

exec(open("./_helpers/helpers.py").read())

Expand All @@ -23,13 +23,8 @@
print("loading dataset %s" % data_name, flush=True)

with pl.StringCache():
x = pl.read_csv(src_grp, dtype={"id4":pl.Int32, "id5":pl.Int32, "id6":pl.Int32, "v1":pl.Int32, "v2":pl.Int32, "v3":pl.Float64}, low_memory=True)
x["id1"] = x["id1"].cast(pl.Categorical)
x["id1"].shrink_to_fit(in_place=True)
x["id2"] = x["id2"].cast(pl.Categorical)
x["id2"].shrink_to_fit(in_place=True)
x["id3"] = x["id3"].cast(pl.Categorical)
x["id3"].shrink_to_fit(in_place=True)
x = (pl.read_csv(src_grp, dtype={"id4":pl.Int32, "id5":pl.Int32, "id6":pl.Int32, "v1":pl.Int32, "v2":pl.Int32, "v3":pl.Float64}, low_memory=True)
.with_columns(pl.col(["id1", "id2", "id3"]).cast(pl.Categorical)))

in_rows = x.shape[0]
x = x.lazy()
Expand All @@ -42,7 +37,7 @@
question = "sum v1 by id1" # q1
gc.collect()
t_start = timeit.default_timer()
ans = x.groupby("id1").agg(pl.sum("v1")).collect()
ans = x.groupby("id1").agg(pl.sum("v1").alias("v1_sum")).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
Expand All @@ -53,7 +48,7 @@
del ans
gc.collect()
t_start = timeit.default_timer()
ans = x.groupby("id1").agg(pl.sum("v1")).collect()
ans = x.groupby("id1").agg(pl.sum("v1").alias("v1_sum")).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
Expand All @@ -68,7 +63,7 @@
question = "sum v1 by id1:id2" # q2
gc.collect()
t_start = timeit.default_timer()
ans = x.groupby(["id1","id2"]).agg(pl.sum("v1")).collect()
ans = x.groupby(["id1","id2"]).agg(pl.sum("v1").alias("v1_sum")).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
Expand All @@ -79,7 +74,7 @@
del ans
gc.collect()
t_start = timeit.default_timer()
ans = x.groupby(["id1","id2"]).agg(pl.sum("v1")).collect()
ans = x.groupby(["id1","id2"]).agg(pl.sum("v1").alias("v1_sum")).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
Expand All @@ -94,7 +89,7 @@
question = "sum v1 mean v3 by id3" # q3
gc.collect()
t_start = timeit.default_timer()
ans = x.groupby("id3").agg([pl.sum("v1"), pl.mean("v3")]).collect()
ans = x.groupby("id3").agg([pl.sum("v1").alias("v1_sum"), pl.mean("v3").alias("v3_mean")]).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
Expand All @@ -105,7 +100,7 @@
del ans
gc.collect()
t_start = timeit.default_timer()
ans = x.groupby("id3").agg([pl.sum("v1"), pl.mean("v3")]).collect()
ans = x.groupby("id3").agg([pl.sum("v1").alias("v1_sum"), pl.mean("v3").alias("v3_mean")]).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
Expand All @@ -120,7 +115,7 @@
question = "mean v1:v3 by id4" # q4
gc.collect()
t_start = timeit.default_timer()
ans = x.groupby("id4").agg([pl.mean("v1"), pl.mean("v2"), pl.mean("v3")]).collect()
ans = x.groupby("id4").agg([pl.mean("v1").alias("v1_mean"), pl.mean("v2").alias("v2_mean"), pl.mean("v3").alias("v3_mean")]).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
Expand All @@ -131,7 +126,7 @@
del ans
gc.collect()
t_start = timeit.default_timer()
ans = x.groupby("id4").agg([pl.mean("v1"), pl.mean("v2"), pl.mean("v3")]).collect()
ans = x.groupby("id4").agg([pl.mean("v1").alias("v1_mean"), pl.mean("v2").alias("v2_mean"), pl.mean("v3").alias("v3_mean")]).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
Expand All @@ -146,7 +141,7 @@
question = "sum v1:v3 by id6" # q5
gc.collect()
t_start = timeit.default_timer()
ans = x.groupby("id6").agg([pl.sum("v1"), pl.sum("v2"), pl.sum("v3")]).collect()
ans = x.groupby("id6").agg([pl.sum("v1").alias("v1_sum"), pl.sum("v2").alias("v2_sum"), pl.sum("v3").alias("v3_sum")]).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
Expand All @@ -157,7 +152,7 @@
del ans
gc.collect()
t_start = timeit.default_timer()
ans = x.groupby("id6").agg([pl.sum("v1"), pl.sum("v2"), pl.sum("v3")]).collect()
ans = x.groupby("id6").agg([pl.sum("v1").alias("v1_sum"), pl.sum("v2").alias("v2_sum"), pl.sum("v3").alias("v3_sum")]).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
Expand Down Expand Up @@ -224,7 +219,7 @@
question = "largest two v3 by id6" # q8
gc.collect()
t_start = timeit.default_timer()
ans = x.drop_nulls("v3").sort("v3", reverse=True).groupby("id6").agg(col("v3").head(2).alias("largest2_v3")).explode("largest2_v3").collect()
ans = x.drop_nulls("v3").groupby("id6").agg(col("v3").top_k(2).alias("largest2_v3")).explode("largest2_v3").collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
Expand All @@ -235,7 +230,7 @@
del ans
gc.collect()
t_start = timeit.default_timer()
ans = x.drop_nulls("v3").sort("v3", reverse=True).groupby("id6").agg(col("v3").head(2).alias("largest2_v3")).explode("largest2_v3").collect()
ans = x.drop_nulls("v3").groupby("id6").agg(col("v3").top_k(2).alias("largest2_v3")).explode("largest2_v3").collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
Expand Down Expand Up @@ -299,6 +294,6 @@
print(ans.tail(3), flush=True)
del ans

print("grouping finished, took %0.fs" % (timeit.default_timer() - task_init), flush=True)
print("grouping finished, took %0.3fs" % (timeit.default_timer() - task_init), flush=True)

exit(0)