| Title: | A Lightweight, Flexible, and Fast Data Validation Package that Can Handle All Sizes of Data |
|---|---|
| Description: | Allows you to define rules which can be used to verify a given dataset. The package acts as a thin wrapper around more powerful data packages such as 'dplyr', 'data.table', 'arrow', and 'DBI' ('SQL'), which do the heavy lifting. |
| Authors: | David Zimmermann-Kollenda [aut, cre], Beniamino Green [ctb] |
| Maintainer: | David Zimmermann-Kollenda <[email protected]> |
| License: | MIT + file LICENSE |
| Version: | 0.1.11 |
| Built: | 2026-05-10 09:18:04 UTC |
| Source: | https://github.com/davzim/dataverifyr |
Programmatically Combine a List of Rules and Rulesets into a Single Ruleset
bind_rules(rule_ruleset_list)bind_rules(rule_ruleset_list)
rule_ruleset_list |
a list of rules and rulesets you wish to combine into a single list |
a ruleset which consolidates all the inputs
Checks if a dataset conforms to a given set of rules
check_data( x, rules, xname = deparse(substitute(x)), stop_on_fail = FALSE, stop_on_warn = FALSE, stop_on_error = FALSE, stop_on_schema_fail = FALSE, extra_columns = c("ignore", "warn", "fail") )check_data( x, rules, xname = deparse(substitute(x)), stop_on_fail = FALSE, stop_on_warn = FALSE, stop_on_error = FALSE, stop_on_schema_fail = FALSE, extra_columns = c("ignore", "warn", "fail") )
x |
a dataset, either a |
rules |
a list of |
xname |
optional, a name for the x variable (only used for errors) |
stop_on_fail |
when any of the rules fail, throw an error with stop |
stop_on_warn |
when a warning is found in the code execution, throw an error with stop |
stop_on_error |
when an error is found in the code execution, throw an error with stop |
stop_on_schema_fail |
when any schema checks fail, throw an error with stop |
extra_columns |
how to treat columns in |
a data.frame-like object with one row for each rule and its results
rs <- ruleset( rule(mpg > 10), rule(cyl %in% c(4, 6)), # missing 8 rule(qsec >= 14.5 & qsec <= 22.9) ) rs check_data(mtcars, rs) # schema + relation checks in one output orders <- data.frame(order_id = 1:3, customer_id = c(10, 99, NA), amount = c(10, -5, 20)) customers <- data.frame(customer_id = c(10, 11)) rs2 <- ruleset( rule(amount >= 0, name = "amount non-negative"), reference_rule( local_col = "customer_id", ref_dataset = "customers", ref_col = "customer_id", allow_na = TRUE ), data_columns = list( data_column("order_id", type = "int", optional = FALSE), data_column("customer_id", type = "double", optional = FALSE), data_column("amount", type = "double", optional = FALSE) ), data_name = "orders" ) check_data(list(orders = orders, customers = customers), rs2)rs <- ruleset( rule(mpg > 10), rule(cyl %in% c(4, 6)), # missing 8 rule(qsec >= 14.5 & qsec <= 22.9) ) rs check_data(mtcars, rs) # schema + relation checks in one output orders <- data.frame(order_id = 1:3, customer_id = c(10, 99, NA), amount = c(10, -5, 20)) customers <- data.frame(customer_id = c(10, 11)) rs2 <- ruleset( rule(amount >= 0, name = "amount non-negative"), reference_rule( local_col = "customer_id", ref_dataset = "customers", ref_col = "customer_id", allow_na = TRUE ), data_columns = list( data_column("order_id", type = "int", optional = FALSE), data_column("customer_id", type = "double", optional = FALSE), data_column("amount", type = "double", optional = FALSE) ), data_name = "orders" ) check_data(list(orders = orders, customers = customers), rs2)
Creates a single column declaration used in ruleset(..., data_columns = ...).
Column declarations are schema checks (column existence, optionality, and
declared type), whereas rule() is for row-wise value checks.
data_column( col, type = NA_character_, optional = FALSE, description = NA_character_ )data_column( col, type = NA_character_, optional = FALSE, description = NA_character_ )
col |
column name. |
type |
optional declared type (for example |
optional |
logical; if |
description |
optional free-text description. |
A data_column object (list) that can be passed in
ruleset(..., data_columns = list(...)).
rs <- ruleset( rule(price >= 0), data_columns = list( data_column("price", type = "double", optional = FALSE), data_column("note", type = "str", optional = TRUE) ) ) rs # combined with row rules and strict schema stopping order_rules <- ruleset( rule(price >= 0, allow_na = FALSE), data_columns = list( data_column("order_id", type = "int", optional = FALSE), data_column("price", type = "double", optional = FALSE), data_column("note", type = "str", optional = TRUE) ) ) check_data( data.frame(order_id = 1:3, price = c(10, 20, 30), note = c("ok", NA, "ok")), order_rules, stop_on_schema_fail = TRUE )rs <- ruleset( rule(price >= 0), data_columns = list( data_column("price", type = "double", optional = FALSE), data_column("note", type = "str", optional = TRUE) ) ) rs # combined with row rules and strict schema stopping order_rules <- ruleset( rule(price >= 0, allow_na = FALSE), data_columns = list( data_column("order_id", type = "int", optional = FALSE), data_column("price", type = "double", optional = FALSE), data_column("note", type = "str", optional = TRUE) ) ) check_data( data.frame(order_id = 1:3, price = c(10, 20, 30), note = c("ok", NA, "ok")), order_rules, stop_on_schema_fail = TRUE )
Allows you to add rules and rulesets into larger rulesets. This can be useful if you want to create a ruleset for a dataset out of checks for other datasets.
datavarifyr_plus(a, b) ## S3 method for class 'ruleset' a + b ## S3 method for class 'rule' a + bdatavarifyr_plus(a, b) ## S3 method for class 'ruleset' a + b ## S3 method for class 'rule' a + b
a |
the first ruleset you wish to add |
b |
the second ruleset you wish to add |
Note that the current version is in the beta stage at best, which means the R-native formats (data.frame, dplyr/tibble, or data.table) are a lot faster than arrow or SQL-based datasets.
describe(x, skip_ones = TRUE, digits = 4, top_n = 3, fast = FALSE)describe(x, skip_ones = TRUE, digits = 4, top_n = 3, fast = FALSE)
x |
a dataset, either a |
skip_ones |
logical, whether values that occur exactly once should be omitted
from |
digits |
integer, number of digits to round numeric values in |
top_n |
integer, number of most frequent values to include in |
fast |
logical, when |
Numeric values in most_frequent are rounded to digits (default: 4).
If a variable has at most 1 distinct value, most_frequent is left empty.
By default, values with count 1 are omitted from most_frequent.
a data.frame, dplyr::tibble, or data.table::data.table containing
a summary of the dataset given
Similar to skimr::skim(), summarytools::dfSummary(), and gtExtras::gt_plt_summary()
describe(mtcars)describe(mtcars)
The detection will be made based on the class of the object as well as the packages installed.
For example, if a data.frame is used, it will look if data.table or dplyr are installed
on the system, as they provide more speed.
Note the main functions will revert the
detect_backend(x)detect_backend(x)
x |
The data object, i.e., a data.frame, tibble, data.table, arrow, or DBI object |
a single character element with the name of the backend to use.
One of base-r, data.table, dplyr, collectibles (for arrow or DBI objects)
data <- mtcars detect_backend(data)data <- mtcars detect_backend(data)
Filters a result dataset for the values that failed the verification
filter_fails(res, x, per_rule = FALSE)filter_fails(res, x, per_rule = FALSE)
res |
a result data.frame as outputted from |
x |
a dataset that was used in |
per_rule |
if set to TRUE, a list of filtered data is returned, one for each failed verification rule. If set to FALSE, a data.frame is returned of the values that fail any rule. |
the dataset with the entries that did not match the given rules
rules <- ruleset( rule(mpg > 10 & mpg < 30), # mpg goes up to 34 rule(cyl %in% c(4, 8)), # missing 6 cyl rule(vs %in% c(0, 1), allow_na = TRUE) ) res <- check_data(mtcars, rules) filter_fails(res, mtcars) filter_fails(res, mtcars, per_rule = TRUE) # alternatively, the first argument can also be a ruleset filter_fails(rules, mtcars) filter_fails(rules, mtcars, per_rule = TRUE)rules <- ruleset( rule(mpg > 10 & mpg < 30), # mpg goes up to 34 rule(cyl %in% c(4, 8)), # missing 6 cyl rule(vs %in% c(0, 1), allow_na = TRUE) ) res <- check_data(mtcars, rules) filter_fails(res, mtcars) filter_fails(res, mtcars, per_rule = TRUE) # alternatively, the first argument can also be a ruleset filter_fails(rules, mtcars) filter_fails(rules, mtcars, per_rule = TRUE)
Visualize the results of a data validation
plot_res( res, main = "Verification Results per Rule", colors = c(pass = "#308344", fail = "#E66820"), labels = TRUE, table = TRUE )plot_res( res, main = "Verification Results per Rule", colors = c(pass = "#308344", fail = "#E66820"), labels = TRUE, table = TRUE )
res |
a data.frame as returned by |
main |
the title of the plot |
colors |
a named list of colors, with the names pass and fail |
labels |
whether the values should be displayed on the barplot |
table |
show a table in the legend with the values |
a base R plot
rs <- ruleset( rule(Ozone > 0 & Ozone < 120, allow_na = TRUE), # some mising values and > 120 rule(Solar.R > 0, allow_na = TRUE), rule(Solar.R < 200, allow_na = TRUE), rule(Wind > 10), rule(Temp < 100) ) res <- check_data(airquality, rs) plot_res(res)rs <- ruleset( rule(Ozone > 0 & Ozone < 120, allow_na = TRUE), # some mising values and > 120 rule(Solar.R > 0, allow_na = TRUE), rule(Solar.R < 200, allow_na = TRUE), rule(Wind > 10), rule(Temp < 100) ) res <- check_data(airquality, rs) plot_res(res)
Creates a rule that checks whether values in a local column exist in a
column of a referenced dataset. Use with check_data() by supplying x as
a named list of datasets and setting data_name in ruleset() (or by
ordering the list so the first entry is the primary dataset).
reference_rule( local_col, ref_dataset, ref_col, name = NA, allow_na = FALSE, negate = FALSE, ... )reference_rule( local_col, ref_dataset, ref_col, name = NA, allow_na = FALSE, negate = FALSE, ... )
local_col |
column name in the primary dataset. |
ref_dataset |
name of the referenced dataset in the |
ref_col |
column name in the referenced dataset. |
name |
optional display name for the rule. |
allow_na |
logical; if |
negate |
logical; if |
... |
additional fields attached to the rule object. |
A reference_rule object that can be included in ruleset().
flights <- data.frame(carrier = c("AA", "BB", NA_character_)) carriers <- data.frame(carrier_id = c("AA")) rs <- ruleset( reference_rule( local_col = "carrier", ref_dataset = "carriers", ref_col = "carrier_id", allow_na = TRUE ), data_name = "flights" ) check_data(list(flights = flights, carriers = carriers), rs) # negated relation: value must NOT exist in blacklist blacklist <- data.frame(carrier_id = c("XX", "YY")) rs_neg <- ruleset( reference_rule( local_col = "carrier", ref_dataset = "blacklist", ref_col = "carrier_id", negate = TRUE, allow_na = TRUE ), data_name = "flights" ) check_data(list(flights = flights, blacklist = blacklist), rs_neg)flights <- data.frame(carrier = c("AA", "BB", NA_character_)) carriers <- data.frame(carrier_id = c("AA")) rs <- ruleset( reference_rule( local_col = "carrier", ref_dataset = "carriers", ref_col = "carrier_id", allow_na = TRUE ), data_name = "flights" ) check_data(list(flights = flights, carriers = carriers), rs) # negated relation: value must NOT exist in blacklist blacklist <- data.frame(carrier_id = c("XX", "YY")) rs_neg <- ruleset( reference_rule( local_col = "carrier", ref_dataset = "blacklist", ref_col = "carrier_id", negate = TRUE, allow_na = TRUE ), data_name = "flights" ) check_data(list(flights = flights, blacklist = blacklist), rs_neg)
Creates a single data rule
rule(expr, name = NA, allow_na = FALSE, negate = FALSE, ...) ## S3 method for class 'rule' print(x, ...)rule(expr, name = NA, allow_na = FALSE, negate = FALSE, ...) ## S3 method for class 'rule' print(x, ...)
expr |
an expression which determines when a rule is good.
Note that the expression is evaluated in |
name |
an optional name for the rule for reference |
allow_na |
does the rule allow for NA values in the data? default value is FALSE.
Note that when NAs are introduced in the expression, |
negate |
is the rule negated, only applies to the expression not allow_na,
that is, if |
... |
additional arguments that are carried along for your documentation, but are not used. Could be for example date, person, contact, comment, etc |
x |
a rule to print |
The rule values as a list
print(rule): Prints a rule
r <- rule(mpg > 10) r r2 <- rule(mpg > 10, name = "check that mpg is reasonable", allow_na = TRUE, negate = FALSE, author = "me", date = Sys.Date()) r2 check_data(mtcars, r) rs <- ruleset( rule(mpg > 10), rule(cyl %in% c(4, 6)), # missing 8 rule(qsec >= 14.5 & qsec <= 22.9) ) rs check_data(mtcars, rs)r <- rule(mpg > 10) r r2 <- rule(mpg > 10, name = "check that mpg is reasonable", allow_na = TRUE, negate = FALSE, author = "me", date = Sys.Date()) r2 check_data(mtcars, r) rs <- ruleset( rule(mpg > 10), rule(cyl %in% c(4, 6)), # missing 8 rule(qsec >= 14.5 & qsec <= 22.9) ) rs check_data(mtcars, rs)
Creates a set of rules
ruleset(..., data_columns = NULL, meta = NULL, data_name = NULL) ## S3 method for class 'ruleset' print(x, n = 3, ...)ruleset(..., data_columns = NULL, meta = NULL, data_name = NULL) ## S3 method for class 'ruleset' print(x, n = 3, ...)
... |
a list of rules |
data_columns |
optional list of schema declarations created with
internal |
meta |
optional metadata list for v1 YAML workflows. |
data_name |
optional name of the primary dataset when |
x |
a ruleset to print |
n |
a maximum number of rules to print |
the list of rules as a ruleset
print(ruleset): Prints a ruleset
r1 <- rule(mpg > 10) r2 <- rule(mpg < 20) rs <- ruleset(r1, r2) rs rs <- ruleset( rule(cyl %in% c(4, 6, 8)), rule(is.numeric(disp)) ) rs # combine row, schema, and relational checks orders <- data.frame(order_id = 1:4, customer_id = c(10, 11, 99, NA), amount = c(10, 20, -5, 30)) customers <- data.frame(customer_id = c(10, 11, 12)) rs2 <- ruleset( rule(amount >= 0, name = "amount must be non-negative"), reference_rule( local_col = "customer_id", ref_dataset = "customers", ref_col = "customer_id", allow_na = TRUE ), data_columns = list( data_column("order_id", type = "int", optional = FALSE), data_column("customer_id", type = "int", optional = FALSE), data_column("amount", type = "double", optional = FALSE) ), data_name = "orders" ) check_data(list(orders = orders, customers = customers), rs2)r1 <- rule(mpg > 10) r2 <- rule(mpg < 20) rs <- ruleset(r1, r2) rs rs <- ruleset( rule(cyl %in% c(4, 6, 8)), rule(is.numeric(disp)) ) rs # combine row, schema, and relational checks orders <- data.frame(order_id = 1:4, customer_id = c(10, 11, 99, NA), amount = c(10, 20, -5, 30)) customers <- data.frame(customer_id = c(10, 11, 12)) rs2 <- ruleset( rule(amount >= 0, name = "amount must be non-negative"), reference_rule( local_col = "customer_id", ref_dataset = "customers", ref_col = "customer_id", allow_na = TRUE ), data_columns = list( data_column("order_id", type = "int", optional = FALSE), data_column("customer_id", type = "int", optional = FALSE), data_column("amount", type = "double", optional = FALSE) ), data_name = "orders" ) check_data(list(orders = orders, customers = customers), rs2)
A small, human-readable dataset with mixed column types, missing values, and one datetime column. It is designed for documentation examples and unit tests.
sample_datasample_data
A data frame with 8 rows and 6 variables:
Integer order identifier.
Character tier ("bronze", "silver", "gold", etc),
includes one NA.
Numeric order amount, includes one negative value and one NA.
Logical payment flag, includes one NA.
Character payment method, includes one NA.
POSIXct order timestamp in UTC, includes one NA.
sample_datasample_data
Read and write rules to a yaml file
write_rules(x, file, format = c("v1", "pre_v1")) read_rules(file)write_rules(x, file, format = c("v1", "pre_v1")) read_rules(file)
x |
a list of rules |
file |
a filename |
format |
output format. |
the filename invisibly
read_rules(): reads a ruleset back in
rr <- ruleset( rule(mpg > 10), rule(cyl %in% c(4, 6, 8)) ) file <- tempfile(fileext = ".yml") write_rules(rr, file)rr <- ruleset( rule(mpg > 10), rule(cyl %in% c(4, 6, 8)) ) file <- tempfile(fileext = ".yml") write_rules(rr, file)