Move function name to assert_count(); add assertions vignette

sfirke · Dec 18, 2024 · 5b4c1fe · 5b4c1fe
1 parent 030a03d
commit 5b4c1fe
Show file tree

Hide file tree

Showing 10 changed files with 163 additions and 23 deletions.
diff --git a/.gitignore b/.gitignore
@@ -10,3 +10,4 @@ docs
 Meta
 docs/
 janitor.Rproj
+inst/doc
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -52,4 +52,4 @@ Config/testthat/edition: 3
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
diff --git a/NAMESPACE b/NAMESPACE
@@ -29,7 +29,7 @@ export(adorn_rounding)
 export(adorn_title)
 export(adorn_totals)
 export(as_tabyl)
-export(assert_count_true)
+export(assert_count)
 export(chisq.test)
 export(clean_names)
 export(compare_df_cols)

diff --git a/NEWS.md b/NEWS.md
@@ -14,7 +14,7 @@ These are all minor breaking changes resulting from enhancements and are not exp
 
 * The new function `excel_time_to_numeric()` converts times from Excel that do not have accompanying dates into a number of seconds.  (#245, thanks to **@billdenney** for the feature.)
 
-* The new function `assert_count_true()` verifies that an expected number of values are `TRUE` for quality checks in data pipelines
+* The new function `assert_count()` verifies that an expected number of values are `TRUE` for quality checks in data pipelines
 
 ## Bug fixes
 

diff --git a/R/assert_count_true.R → R/assertions.R b/R/assert_count_true.R → R/assertions.R
@@ -1,25 +1,25 @@
 #' Verify that a vector of values has the expected number of `TRUE` values
 #'
-#' @param x A logical vecotor without `NA` values
+#' @param x A logical vector without `NA` values
 #' @param n The expected number of `TRUE` values
 #' @returns `x` if `sum(x) == n` or an informative error message otherwise
 #' @examples
 #' data.frame(A = 1:5) %>%
 #'   dplyr::mutate(
-#'     big_values = assert_count_true(A > 2, n = 3)
+#'     big_values = assert_count(A > 2, n = 3)
 #'   )
 #'
 #' my_data <- data.frame(name = c("Bill", "Sam"), birthdate = c("2024-05-22", "2024-05-22"))
 #' my_data |>
 #'   dplyr::mutate(
 #'     birthdate =
 #'       dplyr::case_when(
-#'         assert_count_true(name == "Bill" & birthdate == "2024-05-22") ~ "2024-05-23",
+#'         assert_count(name == "Bill" & birthdate == "2024-05-22") ~ "2024-05-23",
 #'         TRUE ~ birthdate
 #'       )
 #'   )
 #' @export
-assert_count_true <- function(x, n = 1) {
+assert_count <- function(x, n = 1) {
   stopifnot(is.logical(x))
   if (any(is.na(x))) {
     stop(deparse(substitute(x)), " has NA values")

diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -30,7 +30,7 @@ reference:
     Compare data frames columns
   contents:
   - starts_with("compare_df_cols")
-  - assert_count_true
+  - assert_count
 
 - title: Removing unnecessary columns / rows
   contents:

diff --git a/man/assert_count_true.Rd → man/assert_count.Rd b/man/assert_count_true.Rd → man/assert_count.Rd
diff --git a/tests/testthat/test-assert_count_true.R b/tests/testthat/test-assert_count_true.R
@@ -1,36 +1,36 @@
-test_that("assert_count_true", {
+test_that("assert_count", {
   expect_equal(
-    assert_count_true(TRUE, 1),
+    assert_count(TRUE, 1),
     TRUE
   )
   expect_equal(
-    assert_count_true(rep(TRUE, 3), 3),
+    assert_count(rep(TRUE, 3), 3),
     rep(TRUE, 3)
   )
   my_vector <- c(rep(TRUE, 3), FALSE)
   expect_equal(
-    assert_count_true(my_vector, 3),
+    assert_count(my_vector, 3),
     my_vector
   )
   expect_error(
-    assert_count_true(NA),
+    assert_count(NA),
     regexp = "NA has NA values"
   )
   # more informative errors
   my_vector <- c(NA, TRUE)
   expect_error(
-    assert_count_true(my_vector),
+    assert_count(my_vector),
     regexp = "my_vector has NA values"
   )
   my_vector <- c(FALSE, TRUE)
   expect_error(
-    assert_count_true(my_vector, n = 2),
+    assert_count(my_vector, n = 2),
     regexp = "`my_vector` expected 2 `TRUE` values but 1 was found."
   )
   # Check grammar of error message
   my_vector <- c(TRUE, TRUE)
   expect_error(
-    assert_count_true(my_vector, n = 1),
+    assert_count(my_vector, n = 1),
     regexp = "`my_vector` expected 1 `TRUE` value but 2 were found."
   )
 })
diff --git a/vignettes/.gitignore b/vignettes/.gitignore
@@ -0,0 +1,2 @@
+*.html
+*.R
diff --git a/vignettes/assertions.Rmd b/vignettes/assertions.Rmd
@@ -0,0 +1,137 @@
+---
+title: "Assertions for cleaning data"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Assertions for cleaning data}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>"
+)
+```
+
+# Assertions for cleaning data
+
+Part of cleaning data is asserting that the data are as expected before
+changing any values. `janitor` provides an assertion to enable data
+verification before making changes; more assertions may be added in the future.
+
+```{r setup}
+library(janitor)
+library(dplyr)
+```
+
+## `assert_count()` - Verify the number of `TRUE` values
+
+`assert_count()` will verify that the number of `TRUE` values is the expected
+number. It is useful when data may change over time and you want to be sure that
+you are changing only data that you intend to change.
+
+For example, you are given a data set with test scores for several students.
+Some of the scores are missing.
+
+```{r raw-v1}
+raw <-
+  data.frame(
+    student_id = c(123, 124, 125, 126),
+    test_score = c(NA, 93, NA, 82)
+  )
+```
+
+When you first receive the data, you're told separately that student 123 has a
+score of 84 and 125 has a score of 91. You want to verify that you are finding
+the right rows to replace and that you replace them.
+
+```{r clean-v1-mistake}
+clean_mistake <-
+  raw %>%
+  mutate(
+    test_score =
+      case_when(
+        student_id == 124 & is.na(test_score) ~ 84,
+        student_id == 125 & is.na(test_score) ~ 91,
+        TRUE ~ test_score
+      )
+  )
+```
+
+Because of a bug in the code, the missing score for `student_id == 123` was not
+replaced.
+
+```{r clean-v1-mistake-table}
+clean_mistake
+```
+
+Using `assert_count()`, you would catch this bug because `assert_count()` raises
+an error in the pipeline.
+
+```{r clean_assert}
+try({
+clean_assert <-
+  raw %>%
+  mutate(
+    test_score =
+      case_when(
+        assert_count(student_id == 124 & is.na(test_score)) ~ 84,
+        assert_count(student_id == 125 & is.na(test_score)) ~ 91,
+        TRUE ~ test_score
+      )
+  )
+})
+```
+
+After fixing the code bug so that the first condition is `student_id == 123`
+instead of `student_id == 124`, you now get the expected result.
+
+```{r clean_assert_fixed}
+clean_assert <-
+  raw %>%
+  mutate(
+    test_score =
+      case_when(
+        assert_count(student_id == 123 & is.na(test_score)) ~ 84,
+        assert_count(student_id == 125 & is.na(test_score)) ~ 91,
+        TRUE ~ test_score
+      )
+  )
+
+# New result
+clean_assert
+
+# Original data
+raw
+```
+
+### Changing data
+
+Another way that `assert_count()` can help is verifying that your code notifies
+you if your data changes in an important way. Using the example before, you may
+get a new raw data set (`raw_v2`) that has some of the `test_score` values
+added. They may be different than what you were told before.
+
+Running the same code on the new data will give you an informative error telling
+you what to look into.
+
+```{r raw_v2}
+raw_v2 <-
+  data.frame(
+    student_id = c(123, 124, 125, 126),
+    test_score = c(90, 93, NA, 82)
+  )
+
+try({
+clean_assert <-
+  raw_v2 %>%
+  mutate(
+    test_score =
+      case_when(
+        assert_count(student_id == 123 & is.na(test_score)) ~ 84,
+        assert_count(student_id == 125 & is.na(test_score)) ~ 91,
+        TRUE ~ test_score
+      )
+  )
+})
+```
-Original file line number
+Diff line change
@@ Expand Up / @@ -10,3 +10,4 @@ docs @@
     Meta
     docs/
     janitor.Rproj
+    inst/doc