Skip to content

Commit 3e115cd

Browse files
authored
docs: Add example for working with remote data (#411)
* Infrastructure for reading from remote data * docs: Add example for working with remote data
1 parent eb89b14 commit 3e115cd

9 files changed

+526
-38
lines changed

.Rbuildignore

+1
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,4 @@
2626
^gh-analysis$
2727
^index\.md$
2828
^touchstone$
29+
^README_cache$

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,4 @@
22
/.ccache/
33
/docs
44
inst/doc
5+
README_cache/

NAMESPACE

+1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ S3method(as.data.frame,duckplyr_df)
1111
S3method(as.data.frame,lazy_duckplyr_df)
1212
S3method(as_duck_tbl,data.frame)
1313
S3method(as_duck_tbl,default)
14+
S3method(as_duck_tbl,duckplyr_df)
1415
S3method(as_duck_tbl,grouped_df)
1516
S3method(as_duck_tbl,rowwise_df)
1617
S3method(as_duck_tbl,tbl_duckdb_connection)

R/ducktbl.R

+12-2
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,10 @@ duck_tbl <- function(..., .lazy = FALSE) {
5858
as_duck_tbl <- function(x, ..., .lazy = FALSE) {
5959
out <- as_duck_tbl_dispatch(x, ...)
6060

61-
if (.lazy) {
61+
if (isTRUE(.lazy)) {
6262
out <- as_lazy_duckplyr_df(out)
63+
} else {
64+
out <- as_eager_duckplyr_df(out)
6365
}
6466

6567
return(out)
@@ -79,12 +81,20 @@ as_duck_tbl.tbl_duckdb_connection <- function(x, ...) {
7981
duck_sql(sql, lazy = FALSE, con = con)
8082
}
8183

84+
#' @export
85+
as_duck_tbl.duckplyr_df <- function(x, ...) {
86+
check_dots_empty()
87+
x
88+
}
89+
8290
#' @export
8391
as_duck_tbl.data.frame <- function(x, ...) {
8492
check_dots_empty()
8593

94+
tbl <- as_tibble(x)
95+
8696
# - as_tibble() to remove row names
87-
new_duck_tbl(as_tibble(x))
97+
new_duck_tbl(tbl)
8898
}
8999

90100
#' @export

R/io2.R

+9-2
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,13 @@ duckfun <- function(table_function, args, ..., lazy = TRUE) {
151151

152152
meta_rel_register_file(rel, table_function, path, options)
153153

154-
out <- duckdb$rel_to_altrep(rel)
155-
as_duck_tbl(out, .lazy = lazy)
154+
# Start with lazy, to avoid unwanted materialization
155+
df <- duckdb$rel_to_altrep(rel, allow_materialization = FALSE)
156+
out <- new_duck_tbl(df, lazy = TRUE)
157+
158+
if (!lazy) {
159+
out <- as_duck_tbl(out, .lazy = lazy)
160+
}
161+
162+
out
156163
}

R/lazy.R

+17
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
as_lazy_duckplyr_df <- function(x) {
2+
if (inherits(x, "lazy_duckplyr_df")) {
3+
return(x)
4+
}
5+
26
rel <- duckdb_rel_from_df(x)
37

48
out <- rel_to_df(rel, allow_materialization = FALSE)
@@ -12,6 +16,19 @@ add_lazy_duckplyr_df_class <- function(x) {
1216
x
1317
}
1418

19+
as_eager_duckplyr_df <- function(x) {
20+
if (!inherits(x, "lazy_duckplyr_df")) {
21+
return(x)
22+
}
23+
24+
rel <- duckdb_rel_from_df(x)
25+
26+
out <- rel_to_df(rel, allow_materialization = TRUE)
27+
28+
out <- dplyr_reconstruct(out, x)
29+
remove_lazy_duckplyr_df_class(out)
30+
}
31+
1532
remove_lazy_duckplyr_df_class <- function(x) {
1633
class(x) <- setdiff(class(x), "lazy_duckplyr_df")
1734
x

README.Rmd

+73-6
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ local({
4343
hook_source <- knitr::knit_hooks$get("document")
4444
knitr::knit_hooks$set(document = clean_output)
4545
})
46+
47+
Sys.setenv(DUCKPLYR_META_SKIP = TRUE)
4648
```
4749

4850
# duckplyr <a href="https://duckplyr.tidyverse.org"><img src="man/figures/logo.png" align="right" height="138" /></a>
@@ -56,7 +58,7 @@ local({
5658
5759
[dplyr](https://dplyr.tidyverse.org/) is the grammar of data manipulation in the tidyverse.
5860
The duckplyr package will run all of your existing dplyr code with identical results, using [DuckDB](https://duckdb.org/) where possible to compute the results faster.
59-
In addition, you can analyze larger-than-memory datasets straight from files on your disk or from S3 storage.
61+
In addition, you can analyze larger-than-memory datasets straight from files on your disk or from the web.
6062
If you are new to dplyr, the best place to start is the [data transformation chapter](https://r4ds.hadley.nz/data-transform) in R for Data Science.
6163

6264

@@ -81,10 +83,9 @@ Or from [GitHub](https://github.com/) with:
8183
pak::pak("tidyverse/duckplyr")
8284
```
8385

84-
## Example
86+
## Drop-in replacement for dplyr
8587

86-
Calling `library(duckplyr)` overwrites dplyr methods,
87-
enabling duckplyr instead for the entire session.
88+
Calling `library(duckplyr)` overwrites dplyr methods, enabling duckplyr for the entire session.
8889

8990
```{r attach}
9091
library(conflicted)
@@ -103,7 +104,7 @@ conflict_prefer("filter", "dplyr", quiet = TRUE)
103104
```
104105

105106
The following code aggregates the inflight delay by year and month for the first half of the year.
106-
We use a variant of the `nycflights13::flights` dataset that removes an incompatibility with duckplyr.
107+
We use a variant of the `nycflights13::flights` dataset that works around an incompatibility with duckplyr.
107108

108109
```{r}
109110
flights_df()
@@ -130,7 +131,7 @@ Nothing has been computed yet.
130131
Querying the number of rows, or a column, starts the computation:
131132

132133
```{r}
133-
system.time(print(out$month))
134+
out$month
134135
```
135136

136137
Note that, unlike dplyr, the results are not ordered; see `?config` for details.
@@ -146,6 +147,72 @@ Restart R, or call `duckplyr::methods_restore()` to revert to the default dplyr
146147
duckplyr::methods_restore()
147148
```
148149

150+
## Analyzing larger-than-memory data
151+
152+
An extended variant of this dataset is also available for download as Parquet files.
153+
154+
```{r}
155+
year <- 2022:2024
156+
base_url <- "https://blobs.duckdb.org/flight-data-partitioned/"
157+
files <- paste0("Year=", year, "/data_0.parquet")
158+
urls <- paste0(base_url, files)
159+
urls
160+
```
161+
162+
Using the httpfs DuckDB extension, we can query these files directly from R, without even downloading them first.
163+
164+
```{r}
165+
duck_exec("INSTALL httpfs")
166+
duck_exec("LOAD httpfs")
167+
168+
flights <- duck_parquet(urls)
169+
```
170+
171+
Unlike with local data frames, the default is to disallow automatic materialization of the results on access.
172+
173+
```{r error = TRUE}
174+
nrow(flights)
175+
```
176+
177+
Queries on the remote data are executed lazily, and the results are not materialized until explicitly requested.
178+
For printing, only the first few rows of the result are fetched.
179+
180+
```{r cache = TRUE}
181+
flights
182+
```
183+
184+
```{r cache = TRUE}
185+
flights |>
186+
count(Year)
187+
```
188+
189+
Complex queries can be executed on the remote data.
190+
Note how only the relevant columns are fetched and the 2024 data isn't even touched, as it's not needed for the result.
191+
192+
```{r cache = TRUE}
193+
out <-
194+
flights |>
195+
filter(!is.na(DepDelay), !is.na(ArrDelay)) |>
196+
mutate(InFlightDelay = ArrDelay - DepDelay) |>
197+
summarize(
198+
.by = c(Year, Month),
199+
MeanInFlightDelay = mean(InFlightDelay),
200+
MedianInFlightDelay = median(InFlightDelay),
201+
) |>
202+
filter(Year < 2024)
203+
204+
out |>
205+
explain()
206+
207+
out |>
208+
print() |>
209+
system.time()
210+
```
211+
212+
Over 10M rows analyzed in about 10 seconds over the internet: that's not bad.
213+
Of course, working with Parquet, CSV, or JSON files downloaded locally is possible as well.
214+
215+
149216
## Using duckplyr in other packages
150217

151218
Refer to `vignette("developers", package = "duckplyr")`.

0 commit comments

Comments
 (0)