library(duckdb)
= dbConnect(duckdb(), shutdown = TRUE)
con
# uncomment and run the next line if you'd like to create a persistent, disk-based database instead. It is good for computation for bigger than RAM data.
# con = dbConnect(duckdb(), dbdir = "nyc.duck")
# SELECT
# passenger_count,
# AVG(tip_amount) AS mean_tip
# FROM 'nyc-taxi/**/*.parquet'
# GROUP BY passenger_count
# ORDER BY passenger_count
= Sys.time()
tic = dbGetQuery(
dat1
con,"
FROM 'nyc-taxi/**/*.parquet'
SELECT
passenger_count,
AVG(tip_amount) AS mean_tip
GROUP BY ALL
ORDER BY ALL
"
)= Sys.time()
toc
dat1- tic
toc
= Sys.time()
tic = dbGetQuery(
dat2
con,"
FROM 'nyc-taxi/**/*.parquet'
SELECT
month,
passenger_count,
AVG(tip_amount) AS mean_tip
WHERE month <= 3
GROUP BY ALL
"
)= Sys.time()
toc
head(dat2)
dbDisconnect(con)
DuckDB SQL
Templates for DuckDB SQL
This is example templates that use DuckDB with SQL for R
and Python
. Note that these are short examples. If you want to know more about what they can do, check out this site.
import duckdb
import time
= duckdb.connect(database=':memory:', read_only=False)
con
# uncomment and run the next line if you'd like to create a persistent, disk-based database instead. It is good for computation for bigger than RAM data.
= duckdb.connect(database='nyc.duck', read_only=False)
con
= time.time()
tic = (
dat1
con.
query('''
FROM 'nyc-taxi/**/*.parquet'
SELECT
passenger_count,
AVG(tip_amount) AS mean_tip
GROUP BY ALL
ORDER BY ALL
'''
)
)= time.time()
toc
dat1
= time.time()
tic = (
dat2
con.
query('''
FROM 'nyc-taxi/**/*.parquet'
SELECT
month,
passenger_count,
AVG(tip_amount) AS mean_tip
WHERE month <= 3
GROUP BY ALL
'''
)
)= time.time()
toc
dat2
con.close()