library(polars)
= pl$scan_parquet("nyc-taxi/**/*.parquet", hive_partitioning = TRUE)
nyc
nyc
= (
q1
nyc$group_by("passenger_count")
$agg(
$mean("tip_amount")#$alias("mean_tip") ## alias is optional
pl
)$sort("passenger_count")
)
q1
= Sys.time()
tic = q1$collect()
dat1 = Sys.time()
toc
dat1
= (
q2
nyc$filter(pl$col("month") <= 3)
$group_by("month", "passenger_count")
$agg(pl$mean("tip_amount")$alias("mean_tip"))
$sort("passenger_count")
)
# q2 # naive
cat(q2$explain()) # optimized
= Sys.time()
tic = q2$collect()
dat2 = Sys.time()
toc
dat2
= (
q3
nyc$group_by("passenger_count", "trip_distance")
$agg(
$mean("tip_amount")$alias("mean_tip"),
pl$mean("fare_amount")$alias("mean_fare")
pl
)$sort("passenger_count", "trip_distance")
)
= Sys.time()
tic = q3$collect()
dat3 = Sys.time()
toc
dat3
$unpivot(index = c("passenger_count", "trip_distance"))
dat3
= nyc$group_by("month")$agg(pl$col("tip_amount")$mean())
mean_tips = nyc$group_by("month")$agg(pl$col("fare_amount")$mean())
mean_fares
(
mean_tips$join(
mean_fares,on = "month",
how = "left" # default is inner join
)$collect()
)
# You can also try tidypolars
library(polars) ## Already loaded
library(tidypolars)
library(dplyr, warn.conflicts = FALSE)
library(tidyr, warn.conflicts = FALSE)
= scan_parquet_polars("nyc-taxi/**/*.parquet")
nyc
|>
nyc summarise(mean_tip = mean(tip_amount), .by = passenger_count) |>
compute()
# Aside: Use collect() instead of compute() at the end if you would prefer to return a standard R data.frame instead of a Polars DataFrame.
Polars from Python and R
Templates for using Polars in Python and R
This is example templates that use Polars for R
and Python
. Note that these are short examples. If you want to know more about what they can do, check out this site.
import polars as pl
import time
import matplotlib
= pl.scan_parquet("nyc-taxi/**/*.parquet", hive_partitioning=True)
nyc
nyc
= (
q1
nyc"passenger_count"])
.group_by([
.agg(["tip_amount")#.alias("mean_tip") ## alias is optional
pl.mean(
])"passenger_count")
.sort(
)
q1
= time.time()
tic = q1.collect()
dat1 = time.time()
toc
dat1
= (
q2
nycfilter(pl.col("month") <= 3)
."month", "passenger_count"])
.group_by(["tip_amount").alias("mean_tip")])
.agg([pl.mean("passenger_count")
.sort(
)
# q2 # naive
# optimized
q2.show_graph()
= time.time()
tic = q2.collect()
dat2 = time.time()
toc
dat2
= (
q3
nyc"passenger_count", "trip_distance"])
.group_by([
.agg(["tip_amount").alias("mean_tip"),
pl.mean("fare_amount").alias("mean_fare"),
pl.mean(
])"passenger_count", "trip_distance"])
.sort([
)
= time.time()
tic = q3.collect()
dat3 = time.time()
toc
dat3
= ["passenger_count", "trip_distance"])
dat3.unpivot(index
= nyc.group_by("month").agg(pl.col("tip_amount").mean())
mean_tips = nyc.group_by("month").agg(pl.col("fare_amount").mean())
mean_fares
(
mean_tips
.join(
mean_fares,= "month",
on = "left" # default is inner join
how
)
.collect() )