docs: Add example for working with remote data (#411)

krlmlr · web-flow · commit 3e115cda529a · 2024-12-20T18:59:23.000Z
* Infrastructure for reading from remote data

* docs: Add example for working with remote data
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -26,3 +26,4 @@
 ^gh-analysis$
 ^index\.md$
 ^touchstone$
+^README_cache$
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,4 @@
 /.ccache/
 /docs
 inst/doc
+README_cache/
diff --git a/NAMESPACE b/NAMESPACE
@@ -11,6 +11,7 @@ S3method(as.data.frame,duckplyr_df)
 S3method(as.data.frame,lazy_duckplyr_df)
 S3method(as_duck_tbl,data.frame)
 S3method(as_duck_tbl,default)
+S3method(as_duck_tbl,duckplyr_df)
 S3method(as_duck_tbl,grouped_df)
 S3method(as_duck_tbl,rowwise_df)
 S3method(as_duck_tbl,tbl_duckdb_connection)
diff --git a/R/ducktbl.R b/R/ducktbl.R
@@ -58,8 +58,10 @@ duck_tbl <- function(..., .lazy = FALSE) {
 as_duck_tbl <- function(x, ..., .lazy = FALSE) {
   out <- as_duck_tbl_dispatch(x, ...)
 
-  if (.lazy) {
+  if (isTRUE(.lazy)) {
     out <- as_lazy_duckplyr_df(out)
+  } else {
+    out <- as_eager_duckplyr_df(out)
   }
 
   return(out)
@@ -79,12 +81,20 @@ as_duck_tbl.tbl_duckdb_connection <- function(x, ...) {
   duck_sql(sql, lazy = FALSE, con = con)
 }
 
+#' @export
+as_duck_tbl.duckplyr_df <- function(x, ...) {
+  check_dots_empty()
+  x
+}
+
 #' @export
 as_duck_tbl.data.frame <- function(x, ...) {
   check_dots_empty()
 
+  tbl <- as_tibble(x)
+
   # - as_tibble() to remove row names
-  new_duck_tbl(as_tibble(x))
+  new_duck_tbl(tbl)
 }
 
 #' @export
diff --git a/R/io2.R b/R/io2.R
@@ -151,6 +151,13 @@ duckfun <- function(table_function, args, ..., lazy = TRUE) {
 
   meta_rel_register_file(rel, table_function, path, options)
 
-  out <- duckdb$rel_to_altrep(rel)
-  as_duck_tbl(out, .lazy = lazy)
+  # Start with lazy, to avoid unwanted materialization
+  df <- duckdb$rel_to_altrep(rel, allow_materialization = FALSE)
+  out <- new_duck_tbl(df, lazy = TRUE)
+
+  if (!lazy) {
+    out <- as_duck_tbl(out, .lazy = lazy)
+  }
+
+  out
 }
diff --git a/R/lazy.R b/R/lazy.R
@@ -1,4 +1,8 @@
 as_lazy_duckplyr_df <- function(x) {
+  if (inherits(x, "lazy_duckplyr_df")) {
+    return(x)
+  }
+
   rel <- duckdb_rel_from_df(x)
 
   out <- rel_to_df(rel, allow_materialization = FALSE)
@@ -12,6 +16,19 @@ add_lazy_duckplyr_df_class <- function(x) {
   x
 }
 
+as_eager_duckplyr_df <- function(x) {
+  if (!inherits(x, "lazy_duckplyr_df")) {
+    return(x)
+  }
+
+  rel <- duckdb_rel_from_df(x)
+
+  out <- rel_to_df(rel, allow_materialization = TRUE)
+
+  out <- dplyr_reconstruct(out, x)
+  remove_lazy_duckplyr_df_class(out)
+}
+
 remove_lazy_duckplyr_df_class <- function(x) {
   class(x) <- setdiff(class(x), "lazy_duckplyr_df")
   x
diff --git a/README.Rmd b/README.Rmd
@@ -43,6 +43,8 @@ local({
   hook_source <- knitr::knit_hooks$get("document")
   knitr::knit_hooks$set(document = clean_output)
 })
+
+Sys.setenv(DUCKPLYR_META_SKIP = TRUE)
 ```
 
 # duckplyr <a href="https://duckplyr.tidyverse.org"><img src="man/figures/logo.png" align="right" height="138" /></a>
@@ -56,7 +58,7 @@ local({
 
 [dplyr](https://dplyr.tidyverse.org/) is the grammar of data manipulation in the tidyverse.
 The duckplyr package will run all of your existing dplyr code with identical results, using [DuckDB](https://duckdb.org/) where possible to compute the results faster.
-In addition, you can analyze larger-than-memory datasets straight from files on your disk or from S3 storage.
+In addition, you can analyze larger-than-memory datasets straight from files on your disk or from the web.
 If you are new to dplyr, the best place to start is the [data transformation chapter](https://r4ds.hadley.nz/data-transform) in R for Data Science.
 
 
@@ -81,10 +83,9 @@ Or from [GitHub](https://github.com/) with:
 pak::pak("tidyverse/duckplyr")
 ```
 
-## Example
+## Drop-in replacement for dplyr
 
-Calling `library(duckplyr)` overwrites dplyr methods,
-enabling duckplyr instead for the entire session.
+Calling `library(duckplyr)` overwrites dplyr methods, enabling duckplyr for the entire session.
 
 ```{r attach}
 library(conflicted)
@@ -103,7 +104,7 @@ conflict_prefer("filter", "dplyr", quiet = TRUE)
 ```
 
 The following code aggregates the inflight delay by year and month for the first half of the year.
-We use a variant of the `nycflights13::flights` dataset that removes an incompatibility with duckplyr.
+We use a variant of the `nycflights13::flights` dataset that works around an incompatibility with duckplyr.
 
 ```{r}
 flights_df()
@@ -130,7 +131,7 @@ Nothing has been computed yet.
 Querying the number of rows, or a column, starts the computation:
 
 ```{r}
-system.time(print(out$month))
+out$month
 ```
 
 Note that, unlike dplyr, the results are not ordered, see `?config` for details.
@@ -146,6 +147,72 @@ Restart R, or call `duckplyr::methods_restore()` to revert to the default dplyr
 duckplyr::methods_restore()
 ```
 
+## Analyzing larger-than-memory data
+
+An extended variant of this dataset is also available for download as Parquet files.
+
+```{r}
+year <- 2022:2024
+base_url <- "https://blobs.duckdb.org/flight-data-partitioned/"
+files <- paste0("Year=", year, "/data_0.parquet")
+urls <- paste0(base_url, files)
+urls
+```
+
+Using the httpfs DuckDB extension, we can query these files directly from R, without even downloading them first.
+
+```{r}
+duck_exec("INSTALL httpfs")
+duck_exec("LOAD httpfs")
+
+flights <- duck_parquet(urls)
+```
+
+Unlike with local data frames, the default is to disallow automatic materialization of the results on access.
+
+```{r error = TRUE}
+nrow(flights)
+```
+
+Queries on the remote data are executed lazily, and the results are not materialized until explicitly requested.
+For printing, only the first few rows of the result are fetched.
+
+```{r cache = TRUE}
+flights
+```
+
+```{r cache = TRUE}
+flights |>
+  count(Year)
+```
+
+Complex queries can be executed on the remote data.
+Note how only the relevant columns are fetched and the 2024 data isn't even touched, as it's not needed for the result.
+
+```{r cache = TRUE}
+out <-
+  flights |>
+  filter(!is.na(DepDelay), !is.na(ArrDelay)) |>
+  mutate(InFlightDelay = ArrDelay - DepDelay) |>
+  summarize(
+    .by = c(Year, Month),
+    MeanInFlightDelay = mean(InFlightDelay),
+    MedianInFlightDelay = median(InFlightDelay),
+  ) |>
+  filter(Year < 2024)
+
+out |>
+  explain()
+
+out |>
+  print() |>
+  system.time()
+```
+
+Over 10M rows were analyzed in about 10 seconds over the internet; that's not bad.
+Of course, working with Parquet, CSV, or JSON files downloaded locally is possible as well.
+
+
 ## Using duckplyr in other packages
 
 Refer to `vignette("developers", package = "duckplyr")`.
diff --git a/README.md b/README.md
diff --git a/index.md b/index.md