In cheapr, ‘cheap’ means fast and memory-efficient, and that’s exactly the philosophy that cheapr aims to follow.
You can install cheapr like so:
install.packages("cheapr")
or you can install the development version of cheapr:
remotes::install_github("NicChr/cheapr")
Some common operations that cheapr can do much faster and more efficiently include:
Counting, finding, removing and replacing `NA` and scalar values
Creating factors
Creating multiple sequences in a vectorised way
Sub-setting vectors and data frames efficiently
Safe, flexible and fast greatest common divisor and lowest common multiple
Lags/leads
Lightweight `integer64` support
In-memory Math (no copies, vectors updated by reference)
Summary statistics of data frame variables
Binning of continuous data
Let’s first load the required packages
library(cheapr)
library(bench)
`num_na()` is a useful function to efficiently return the number of `NA` values and can be used in a variety of problems.
Almost all the `NA` handling functions in cheapr can make use of multiple cores on your machine through OpenMP.
x <- rep(NA, 10^6)
# 1 core by default
mark(num_na(x), sum(is.na(x)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 num_na(x) 996µs 1.05ms 953. 2.41KB 0
#> 2 sum(is.na(x)) 802µs 1.84ms 543. 3.81MB 44.6
# 4 cores
options(cheapr.cores = 4)
mark(num_na(x), sum(is.na(x)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 num_na(x) 249µs 319.3µs 2948. 0B 0
#> 2 sum(is.na(x)) 777µs 1.72ms 568. 3.81MB 47.1
options(cheapr.cores = 1)
m <- matrix(x, ncol = 10^3)
# Number of NA values by row
mark(row_na_counts(m),
rowSums(is.na(m)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 row_na_counts(m) 2.09ms 2.24ms 439. 9.14KB 0
#> 2 rowSums(is.na(m)) 2.74ms 3.55ms 276. 3.82MB 26.6
# Number of NA values by col
mark(col_na_counts(m),
colSums(is.na(m)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 col_na_counts(m) 2.25ms 2.36ms 419. 9.14KB 0
#> 2 colSums(is.na(m)) 2.57ms 2.72ms 360. 3.82MB 33.6
`is_na` is a multi-threaded alternative to `is.na`
x <- rnorm(10^6)
x[sample.int(10^6, 10^5)] <- NA
mark(is.na(x), is_na(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 is.na(x) 732.1µs 1.82ms 557. 3.81MB 90.6
#> 2 is_na(x) 1.31ms 2.47ms 414. 3.82MB 40.6
### posixlt method is much faster
hours <- as.POSIXlt(seq.int(0, length.out = 10^6, by = 3600),
                    tz = "UTC")
hours[sample.int(10^6, 10^5)] <- NA

mark(is.na(hours), is_na(hours))
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 is.na(hours) 1.22s 1.22s 0.823 61MB 0.823
#> 2 is_na(hours) 13.34ms 15.62ms 62.8 13.9MB 5.89
It differs in 2 regards:
- List elements are regarded as `NA` when either that element is an `NA` value or it is a list containing only `NA` values.
- For data frames, `is_na` returns a logical vector where `TRUE` defines an empty row of only `NA` values.
# List example
is.na(list(NA, list(NA, NA), 10))
#> [1] TRUE FALSE FALSE
is_na(list(NA, list(NA, NA), 10))
#> [1] TRUE TRUE FALSE
# Data frame example
df <- data.frame(x = c(1, NA, 3),
                 y = c(NA, NA, NA))
df
#> x y
#> 1 1 NA
#> 2 NA NA
#> 3 3 NA
is_na(df)
#> [1] FALSE TRUE FALSE
is_na(df)
#> [1] FALSE TRUE FALSE
# The below identity should hold
identical(is_na(df), row_na_counts(df) == ncol(df))
#> [1] TRUE
`is_na` and all the `NA` handling functions fall back on calling `is.na()` if no suitable method is found. This means that custom objects like vctrs rcrds and more are supported.
overview
Inspired by the excellent skimr package, `overview()` is a cheaper alternative designed for larger data.
set.seed(42)
df <- data.frame(
  x = sample.int(100, 10^7, TRUE),
  y = factor_(sample(LETTERS, 10^7, TRUE)),
  z = rnorm(10^7)
)
overview(df)
#> obs: 10000000
#> cols: 3
#>
#> ----- Numeric -----
#> col class n_missng p_complt n_unique mean p0 p25 p50 p75
#> 1 x integr 0 1 100 50.51 1 26 51 76
#> 2 z numerc 0 1 10000000 -0.000089 -5.16 -0.68 -0.00014 0.67
#> p100 iqr sd hist
#> 1 100 50 28.86 ▇▇▇▇▇
#> 2 5.26 1.35 1 ▁▂▇▂▁
#>
#> ----- Categorical -----
#> col class n_missng p_complt n_unique n_levels min max
#> 1 y factor 0 1 26 26 A Z
mark(overview(df, hist = FALSE))
#> # A tibble: 1 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 overview(df, hist = FALSE) 1.04s 1.04s 0.958 2.09KB 0
sset
sset(iris, 1:5)
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 5.1 3.5 1.4 0.2 setosa
#> 2 4.9 3.0 1.4 0.2 setosa
#> 3 4.7 3.2 1.3 0.2 setosa
#> 4 4.6 3.1 1.5 0.2 setosa
#> 5 5.0 3.6 1.4 0.2 setosa
sset(iris, 1:5, j = "Species")
#> Species
#> 1 setosa
#> 2 setosa
#> 3 setosa
#> 4 setosa
#> 5 setosa
# sset always returns a data frame when input is a data frame
sset(iris, 1, 1) # data frame
#> Sepal.Length
#> 1 5.1
iris[1, 1] # not a data frame
#> [1] 5.1
x <- sample.int(10^6, 10^4, TRUE)
y <- sample.int(10^6, 10^4, TRUE)
mark(sset(x, x %in_% y), sset(x, x %in% y), x[x %in% y])
#> # A tibble: 3 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 sset(x, x %in_% y) 62.9µs 103µs 9727. 83.2KB 6.53
#> 2 sset(x, x %in% y) 136.1µs 215µs 4593. 285.4KB 11.3
#> 3 x[x %in% y] 126.9µs 209µs 4836. 324.5KB 13.6
`sset` uses an internal range-based subset when `i` is an ALTREP integer sequence of the form `m:n`.
mark(sset(df, 0:10^5), df[0:10^5, , drop = FALSE])
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 sset(df, 0:10^5) 145.5µs 498.2µs 1962. 1.53MB 16.6
#> 2 df[0:10^5, , drop = FALSE] 6.12ms 7.5ms 110. 4.83MB 2.12
It also accepts negative indexes
mark(sset(df, -10^4:0),
     df[-10^4:0, , drop = FALSE],
     check = FALSE) # The only difference is the row names
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 sset(df, -10^4:0) 50.5ms 63.5ms 13.6 152MB 9.72
#> 2 df[-10^4:0, , drop = FALSE] 809.5ms 809.5ms 1.24 776MB 3.71
The biggest difference between `sset` and `[` is the way logical vectors are handled. The two main differences when `i` is a logical vector are:
- `NA` values are ignored, only the locations of `TRUE` values are used.
- `i` must be the same length as `x` and is not recycled.
# Examples with NAs
x <- c(1, 5, NA, NA, -5)
x[x > 0]
#> [1] 1 5 NA NA
sset(x, x > 0)
#> [1] 1 5
# Example with length(i) < length(x)
sset(x, TRUE)
#> Error in check_length(i, length(x)): i must have length 5
# This is equivalent
x[TRUE]
#> [1] 1 5 NA NA -5
# to..
sset(x)
#> [1] 1 5 NA NA -5
lag_()
set.seed(37)
lag_(1:10, 3) # Lag(3)
#> [1] NA NA NA 1 2 3 4 5 6 7
lag_(1:10, -3) # Lead(3)
#> [1] 4 5 6 7 8 9 10 NA NA NA
# Using an example from data.table
library(data.table)
#> Warning: package 'data.table' was built under R version 4.4.1
dt <- data.table(year=2010:2014, v1=runif(5), v2=1:5, v3=letters[1:5])

# Similar to data.table::shift()
lag_(dt, 1) # Lag
#> year v1 v2 v3
#> <int> <num> <int> <char>
#> 1: NA NA NA <NA>
#> 2: 2010 0.54964085 1 a
#> 3: 2011 0.07883715 2 b
#> 4: 2012 0.64879698 3 c
#> 5: 2013 0.49685336 4 d
lag_(dt, -1) # Lead
#> year v1 v2 v3
#> <int> <num> <int> <char>
#> 1: 2011 0.07883715 2 b
#> 2: 2012 0.64879698 3 c
#> 3: 2013 0.49685336 4 d
#> 4: 2014 0.71878731 5 e
#> 5: NA NA NA <NA>
With `lag_` we can update variables by reference, including entire data frames
# At the moment, shift() cannot do this
lag_(dt, set = TRUE)
#> year v1 v2 v3
#> <int> <num> <int> <char>
#> 1: NA NA NA <NA>
#> 2: 2010 0.54964085 1 a
#> 3: 2011 0.07883715 2 b
#> 4: 2012 0.64879698 3 c
#> 5: 2013 0.49685336 4 d
# Was updated by reference
dt
#> year v1 v2 v3
#> <int> <num> <int> <char>
#> 1: NA NA NA <NA>
#> 2: 2010 0.54964085 1 a
#> 3: 2011 0.07883715 2 b
#> 4: 2012 0.64879698 3 c
#> 5: 2013 0.49685336 4 d
`lag2_` is a more generalised variant that supports vectors of lags, custom ordering and run lengths.
lag2_(dt, order = 5:1) # Reverse order lag (same as lead)
#> year v1 v2 v3
#> <int> <num> <int> <char>
#> 1: 2010 0.54964085 1 a
#> 2: 2011 0.07883715 2 b
#> 3: 2012 0.64879698 3 c
#> 4: 2013 0.49685336 4 d
#> 5: NA NA NA <NA>
lag2_(dt, -1) # Same as above
#> year v1 v2 v3
#> <int> <num> <int> <char>
#> 1: 2010 0.54964085 1 a
#> 2: 2011 0.07883715 2 b
#> 3: 2012 0.64879698 3 c
#> 4: 2013 0.49685336 4 d
#> 5: NA NA NA <NA>
lag2_(dt, c(1, -1)) # Alternating lead/lag
#> year v1 v2 v3
#> <int> <num> <int> <char>
#> 1: NA NA NA <NA>
#> 2: 2011 0.07883715 2 b
#> 3: 2010 0.54964085 1 a
#> 4: 2013 0.49685336 4 d
#> 5: 2012 0.64879698 3 c
lag2_(dt, c(-1, 0, 0, 0, 0)) # Lead e.g. only first row
#> year v1 v2 v3
#> <int> <num> <int> <char>
#> 1: 2010 0.54964085 1 a
#> 2: 2010 0.54964085 1 a
#> 3: 2011 0.07883715 2 b
#> 4: 2012 0.64879698 3 c
#> 5: 2013 0.49685336 4 d
gcd2(5, 25)
#> [1] 5
scm2(5, 6)
#> [1] 30
gcd(seq(5, 25, by = 5))
#> [1] 5
scm(seq(5, 25, by = 5))
#> [1] 300
x <- seq(1L, 1000000L, 1L)
mark(gcd(x))
#> # A tibble: 1 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 gcd(x) 1.3µs 1.4µs 599445. 0B 0
x <- seq(0, 10^6, 0.5)
mark(gcd(x))
#> # A tibble: 1 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 gcd(x) 36.2ms 36.7ms 27.2 0B 0
As an example, to create 3 sequences with different increments,
the usual approach might be to use lapply to loop through the increment
values together with seq()
# Base R
increments <- c(1, 0.5, 0.1)
start <- 1
end <- 5
unlist(lapply(increments, \(x) seq(start, end, x)))
#> [1] 1.0 2.0 3.0 4.0 5.0 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 1.0 1.1 1.2 1.3 1.4
#> [20] 1.5 1.6 1.7 1.8 1.9 2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3.0 3.1 3.2 3.3
#> [39] 3.4 3.5 3.6 3.7 3.8 3.9 4.0 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5.0
In cheapr you can use `seq_()` which accepts vector arguments.
seq_(start, end, increments)
#> [1] 1.0 2.0 3.0 4.0 5.0 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 1.0 1.1 1.2 1.3 1.4
#> [20] 1.5 1.6 1.7 1.8 1.9 2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3.0 3.1 3.2 3.3
#> [39] 3.4 3.5 3.6 3.7 3.8 3.9 4.0 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5.0
Use `add_id = TRUE` to label the individual sequences.
seq_(start, end, increments, add_id = TRUE)
#> 1 1 1 1 1 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3
#> 1.0 2.0 3.0 4.0 5.0 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 1.0 1.1 1.2 1.3 1.4 1.5
#> 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#> 1.6 1.7 1.8 1.9 2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3.0 3.1 3.2 3.3 3.4 3.5
#> 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#> 3.6 3.7 3.8 3.9 4.0 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5.0
If you know the sizes of your sequences beforehand, use `sequence_()`
seq_sizes <- c(3, 5, 10)
sequence_(seq_sizes, from = 0, by = 1/3, add_id = TRUE) |>
  enframe_()
#> # A tibble: 18 × 2
#> name value
#> <chr> <dbl>
#> 1 1 0
#> 2 1 0.333
#> 3 1 0.667
#> 4 2 0
#> 5 2 0.333
#> 6 2 0.667
#> 7 2 1
#> 8 2 1.33
#> 9 3 0
#> 10 3 0.333
#> 11 3 0.667
#> 12 3 1
#> 13 3 1.33
#> 14 3 1.67
#> 15 3 2
#> 16 3 2.33
#> 17 3 2.67
#> 18 3 3
You can also calculate the sequence sizes using `seq_size()`
seq_size(start, end, increments)
#> [1] 5 9 41
x <- rep(TRUE, 10^6)
mark(cheapr_which = which_(x),
     base_which = which(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_which 619µs 1.86ms 521. 3.81MB 6.68
#> 2 base_which 724µs 2.87ms 346. 7.63MB 9.67
x <- rep(FALSE, 10^6)
mark(cheapr_which = which_(x),
     base_which = which(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_which 119µs 138µs 7070. 0B 0
#> 2 base_which 454µs 473µs 2039. 3.81MB 26.4
x <- c(rep(TRUE, 5e05), rep(FALSE, 1e06))
mark(cheapr_which = which_(x),
     base_which = which(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_which 448µs 1.08ms 836. 1.91MB 4.32
#> 2 base_which 785µs 1.78ms 561. 7.63MB 17.0
x <- c(rep(FALSE, 5e05), rep(TRUE, 1e06))
mark(cheapr_which = which_(x),
     base_which = which(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_which 904µs 1.99ms 508. 3.81MB 6.80
#> 2 base_which 949µs 3.14ms 285. 9.54MB 9.34
x <- sample(c(TRUE, FALSE), 10^6, TRUE)
x[sample.int(10^6, 10^4)] <- NA
mark(cheapr_which = which_(x),
     base_which = which(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_which 592.6µs 1.1ms 906. 1.89MB 6.66
#> 2 base_which 3.19ms 4.12ms 245. 5.7MB 4.26
x <- sample(seq(-10^3, 10^3, 0.01))
y <- do.call(paste0, expand.grid(letters, letters, letters, letters))
mark(cheapr_factor = factor_(x),
     base_factor = factor(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_factor 9.15ms 9.72ms 103. 4.59MB 0
#> 2 base_factor 606.98ms 606.98ms 1.65 27.84MB 0
mark(cheapr_factor = factor_(x, order = FALSE),
base_factor = factor(x, levels = unique(x)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_factor 4.48ms 5.08ms 194. 1.53MB 2.13
#> 2 base_factor 1.03s 1.03s 0.974 22.79MB 0
mark(cheapr_factor = factor_(y),
base_factor = factor(y))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_factor 210.47ms 214.54ms 4.65 5.23MB 0
#> 2 base_factor 2.91s 2.91s 0.344 54.35MB 0
mark(cheapr_factor = factor_(y, order = FALSE),
base_factor = factor(y, levels = unique(y)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_factor 5.37ms 6.5ms 152. 3.49MB 2.23
#> 2 base_factor 53.38ms 58.2ms 16.9 39.89MB 0
x <- sample.int(10^6, 10^5, TRUE)
y <- sample.int(10^6, 10^5, TRUE)
mark(cheapr_intersect = intersect_(x, y, dups = FALSE),
     base_intersect = intersect(x, y))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_intersect 2.64ms 3.31ms 305. 1.18MB 2.16
#> 2 base_intersect 4.26ms 5.35ms 178. 5.16MB 2.17
mark(cheapr_setdiff = setdiff_(x, y, dups = FALSE),
base_setdiff = setdiff(x, y))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_setdiff 2.71ms 2.96ms 334. 1.76MB 0
#> 2 base_setdiff 4.92ms 5.49ms 181. 5.71MB 2.21
`%in_%` and `%!in_%`
mark(cheapr = x %in_% y,
base = x %in% y)
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr 1.68ms 1.88ms 521. 781.34KB 2.17
#> 2 base 2.51ms 3.07ms 328. 2.53MB 2.17
mark(cheapr = x %!in_% y,
base = !x %in% y)
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr 1.73ms 1.86ms 516. 787.84KB 0
#> 2 base 2.73ms 3.22ms 308. 2.91MB 2.20
as_discrete
`as_discrete` is a cheaper alternative to `cut`
x <- rnorm(10^7)
b <- seq(0, max(x), 0.2)
mark(cheapr_cut = as_discrete(x, b, left = FALSE),
     base_cut = cut(x, b))
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_cut 145ms 146ms 6.55 38.2MB 1.64
#> 2 base_cut 514ms 514ms 1.95 267.1MB 1.95
cheapr_if_else
A cheap alternative to ifelse
mark(
  cheapr_if_else(x >= 0, "pos", "neg"),
  ifelse(x >= 0, "pos", "neg"),
  data.table::fifelse(x >= 0, "pos", "neg")
)
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 3 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 "cheapr_if_else(x >= 0, \"pos\… 125.11ms 137.7ms 7.06 114MB 3.53
#> 2 "ifelse(x >= 0, \"pos\", \"neg… 1.77s 1.77s 0.566 534MB 1.13
#> 3 "data.table::fifelse(x >= 0, \… 112.4ms 112.79ms 7.31 114MB 1.83
case
cheapr’s version of a case-when statement, with mostly the same arguments as `dplyr::case_when` but similar efficiency as `data.table::fcase`
mark(case(
    x >= 0 ~ "pos",
    x < 0 ~ "neg",
    .default = "Unknown"
),
data.table::fcase(
  x >= 0, "pos",
  x < 0, "neg",
  rep_len(TRUE, length(x)), "Unknown"
))
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch> <bch:> <dbl> <bch:byt> <dbl>
#> 1 "case(x >= 0 ~ \"pos\", x < 0 ~ \"n… 275ms 328ms 3.05 286MB 3.05
#> 2 "data.table::fcase(x >= 0, \"pos\",… 213ms 215ms 4.40 267MB 1.47
`val_match` is an even cheaper special variant of `case` when all LHS expressions are length-1 vectors, i.e. scalars
x <- round(rnorm(10^7))

mark(
  val_match(x, 1 ~ Inf, 2 ~ -Inf, .default = NaN),
  case(x == 1 ~ Inf,
       x == 2 ~ -Inf,
       .default = NaN),
  data.table::fcase(x == 1, Inf,
                    x == 2, -Inf,
                    rep_len(TRUE, length(x)), NaN)
)
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 3 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:t> <bch:> <dbl> <bch:byt> <dbl>
#> 1 val_match(x, 1 ~ Inf, 2 ~ -Inf, .… 65.3ms 71ms 13.4 87.9MB 1.91
#> 2 case(x == 1 ~ Inf, x == 2 ~ -Inf,… 212ms 252ms 4.05 276.2MB 2.70
#> 3 data.table::fcase(x == 1, Inf, x … 207.1ms 224ms 4.49 305.2MB 5.98