library(spam)
#> Spam version 3.0-0 (2026-04-12) is loaded.
#> Type 'help( Spam)' or 'demo( spam)' for a short introduction
#> and overview of this package.
#> Help for individual functions is also obtained by adding the
#> suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
#>
#> Attaching package: 'spam'
#> The following objects are masked from 'package:base':
#>
#> backsolve, forwardsolve
We place \(n = 500\) locations
uniformly at random in \([0,1]^2\) and
build a sparse distance matrix using nearest.kdtree() with
neighbourhood radius \(\delta = 0.35\).
The covariance model is spherical with \(\theta = (\text{range}, \text{sill},
\text{nugget}) = (0.3,\; 1.5,\; 0.1)\).
set.seed(42)
n <- 500
locs <- matrix(runif(2 * n), n, 2)
delta <- 0.35
h <- nearest.kdtree(locs, delta = delta, upper = NULL)
cat("Sparse distance matrix:", n, "x", n, " nnz =", nnz(h), "\n")
#> Sparse distance matrix: 500 x 500 nnz = 68464
theta.true <- c(0.25, 1.5, 0.1) # range, sill, nugget
Sigma <- cov.sph(h, theta.true)
y <- c(rmvnorm.spam(1, Sigma = Sigma))
pal <- colorRampPalette(c("#4575b4", "#ffffbf", "#d73027"))(64)
col <- pal[cut(y, 64)]
plot(locs, pch = 19, col = col, cex = 0.9,
xlab = "s1", ylab = "s2", main = "Simulated spatial data")
Simulated observations; colour encodes the response value.
We compare eight approaches differing in solver, parametrisation,
gradient type, and optimiser. Natural-scale fits use box constraints;
log-scale fits replace the box with a log link via
reparametrise() and run unconstrained. A single
ctrl list controls the L-BFGS-B convergence criteria.
theta0 <- c(0.2, 1.0, 0.05)
thetalower <- c(0.01, 0.01, 0) # lower bounds on natural scale
thetaupper <- c(delta, Inf, Inf) # range capped at neighbourhood radius
ctrl <- list(factr = 1e4, # tighter than default 1e7
pgtol = 1e-3) # matches SLQ noise level
## log-scale wrappers: phi = log(theta), theta = exp(phi)
sph_log <- reparametrise(cov.sph, grad.cov.sph)
phi0 <- log(theta0)
optim approximates the gradient by finite differences at
each step.
t.num <- system.time(
fit.num <- mle.spam(y, h, cov.sph, theta0,
thetalower = thetalower,
thetaupper = thetaupper,
control = ctrl)
)
The same fit on the log scale via reparametrise(). With
the true optimum interior to the box the projected gradient equals the
full gradient, so L-BFGS-B behaves identically to unconstrained
optimisation; the function evaluation count should match the
natural-scale fit.
t.log.num <- system.time(
fit.log.num <- mle.spam(y, h, sph_log$Covariance, phi0,
thetalower = rep(-Inf, 3),
thetaupper = rep( Inf, 3),
control = ctrl)
)
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
grad.cov.sph supplies exact derivatives \(\partial\Sigma/\partial\theta_k\);
gradneg2loglik computes the gradient via \(n\) Cholesky back-solves (one per row of
each derivative matrix).
t.grad <- system.time(
fit.grad <- mle.spam(y, h, cov.sph, theta0,
thetalower = thetalower,
thetaupper = thetaupper,
gradCovariance = grad.cov.sph,
control = ctrl)
)
BFGS does not support box constraints. The log parametrisation removes the need for them: all \(\phi_k \in \mathbb{R}\) automatically give \(\theta_k = e^{\phi_k} > 0\).
t.bfgs.num <- system.time(
fit.bfgs.num <- mle.spam(y, h, sph_log$Covariance, phi0,
thetalower = rep(-Inf, 3),
thetaupper = rep( Inf, 3),
method = "BFGS")
)
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
The chain-rule correction \(\partial/\partial\phi_k =
e^{\phi_k}\,\partial/\partial\theta_k\) is applied automatically
by the reparametrise() wrapper.
t.bfgs.grad <- system.time(
fit.bfgs.grad <- mle.spam(y, h, sph_log$Covariance, phi0,
thetalower = rep(-Inf, 3),
thetaupper = rep( Inf, 3),
gradCovariance = sph_log$gradCovariance,
method = "BFGS")
)
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
The log-determinant is estimated via stochastic Lanczos quadrature
with nprobe = 30 Rademacher probe vectors fixed before
optim begins (deterministic objective).
t.cg.num <- system.time(
fit.cg.num <- mle.spam(y, h, cov.sph, theta0,
thetalower = thetalower,
thetaupper = thetaupper,
solver = "cg",
iter.control = list(nprobe = 30, seed = 1),
control = ctrl)
)
The Hutchinson estimator approximates the trace term in the gradient using the same probe vectors as the objective — both then optimise a consistent stochastic approximation.
t.cg.grad <- system.time(
fit.cg.grad <- mle.spam(y, h, cov.sph, theta0,
thetalower = thetalower,
thetaupper = thetaupper,
gradCovariance = grad.cov.sph,
solver = "cg",
iter.control = list(nprobe = 30, seed = 1),
control = ctrl)
)
Log-scale estimates are back-transformed via \(\hat\theta = e^{\hat\phi}\) for comparison with natural-scale fits.
res <- rbind(
"truth" = c(theta.true, NA, NA),
"chol / num / L-BFGS-B" = c(fit.num$theta, fit.num$counts[1], fit.num$counts[2]),
"chol / num / L-BFGS-B / log" = c(exp(fit.log.num$theta), fit.log.num$counts[1], fit.log.num$counts[2]),
"chol / anal / L-BFGS-B" = c(fit.grad$theta, fit.grad$counts[1], fit.grad$counts[2]),
"chol / num / BFGS / log" = c(exp(fit.bfgs.num$theta), fit.bfgs.num$counts[1], fit.bfgs.num$counts[2]),
"chol / anal / BFGS / log" = c(exp(fit.bfgs.grad$theta),fit.bfgs.grad$counts[1], fit.bfgs.grad$counts[2]),
"CG / num / L-BFGS-B" = c(fit.cg.num$theta, fit.cg.num$counts[1], fit.cg.num$counts[2]),
"CG / anal / L-BFGS-B" = c(fit.cg.grad$theta, fit.cg.grad$counts[1], fit.cg.grad$counts[2])
)
colnames(res) <- c("range", "sill", "nugget", "fn evals", "gr evals")
knitr::kable(round(res, 4), caption = "MLE estimates and optimiser call counts")
| range | sill | nugget | fn evals | gr evals | |
|---|---|---|---|---|---|
| truth | 0.2500 | 1.5000 | 0.1000 | NA | NA |
| chol / num / L-BFGS-B | 0.2594 | 1.5256 | 0.1269 | 31 | 31 |
| chol / num / L-BFGS-B / log | 0.2594 | 1.5253 | 0.1269 | 33 | 33 |
| chol / anal / L-BFGS-B | 0.2594 | 1.5253 | 0.1269 | 21 | 21 |
| chol / num / BFGS / log | 0.3755 | 2.1516 | 0.1366 | 47 | 15 |
| chol / anal / BFGS / log | 0.3755 | 2.1535 | 0.1367 | 54 | 11 |
| CG / num / L-BFGS-B | 0.2597 | 1.5569 | 0.1220 | 112 | 112 |
| CG / anal / L-BFGS-B | 0.2592 | 1.5542 | 0.1220 | 20 | 20 |
times <- c(
"chol / num / L-BFGS-B" = t.num["elapsed"],
"chol / num / L-BFGS-B / log" = t.log.num["elapsed"],
"chol / anal / L-BFGS-B" = t.grad["elapsed"],
"chol / num / BFGS / log" = t.bfgs.num["elapsed"],
"chol / anal / BFGS / log" = t.bfgs.grad["elapsed"],
"CG / num / L-BFGS-B" = t.cg.num["elapsed"],
"CG / anal / L-BFGS-B" = t.cg.grad["elapsed"]
)
knitr::kable(data.frame(elapsed = round(times, 2)),
caption = "Elapsed time (seconds)")
| elapsed | |
|---|---|
| chol / num / L-BFGS-B.elapsed | 1.04 |
| chol / num / L-BFGS-B / log.elapsed | 1.24 |
| chol / anal / L-BFGS-B.elapsed | 10.42 |
| chol / num / BFGS / log.elapsed | 0.63 |
| chol / anal / BFGS / log.elapsed | 5.57 |
| CG / num / L-BFGS-B.elapsed | 175.36 |
| CG / anal / L-BFGS-B.elapsed | 7.86 |
All approaches recover the true parameters well. The natural-scale and log-scale L-BFGS-B fits (rows 1–2) show matching function evaluation counts: when the optimum is interior to the box the projected gradient equals the full gradient, so the two parametrisations traverse the same steps. The analytical gradient reduces function evaluations (each gradient evaluation replaces \(p + 1\) finite-difference calls, \(p = \text{length}(\theta) = 3\)). The CG solver is attractive for large \(n\) where Cholesky becomes the bottleneck.
The L-BFGS-B stopping criteria factr (relative function
decrease) and pgtol (projected gradient norm) control when
the optimiser terminates. Tighter tolerances increase the number of
iterations but have diminishing returns once the tolerance is below the
noise floor of the objective. Here we vary both simultaneously using the
Cholesky solver with numerical gradient.
factr_vals <- c(1e7, 1e5, 1e4, 1e2) # 1e7 is the R default
pgtol_vals <- c(1e-5, 1e-4, 1e-3, 1e-1)
grid <- data.frame(factr = factr_vals, pgtol = pgtol_vals)
tol_fits <- lapply(seq_len(nrow(grid)), function(i) {
mle.spam(y, h, cov.sph, theta0,
thetalower = thetalower,
thetaupper = thetaupper,
control = list(factr = grid$factr[i],
pgtol = grid$pgtol[i]))
})
tol_res <- do.call(rbind, lapply(seq_len(nrow(grid)), function(i) {
fit <- tol_fits[[i]]
c(factr = grid$factr[i],
pgtol = grid$pgtol[i],
range = fit$theta[1],
sill = fit$theta[2],
nugget = fit$theta[3],
fn_evals = fit$counts[1],
gr_evals = fit$counts[2])
}))
knitr::kable(round(as.data.frame(tol_res), 4),
caption = paste("Effect of factr and pgtol on chol/num.grad estimates.",
"True theta =", paste(theta.true, collapse = ", ")))
| factr | pgtol | range | sill | nugget | fn_evals.function | gr_evals.gradient |
|---|---|---|---|---|---|---|
| 1e+07 | 0e+00 | 0.2594 | 1.5256 | 0.1269 | 31 | 31 |
| 1e+05 | 1e-04 | 0.2594 | 1.5256 | 0.1269 | 31 | 31 |
| 1e+04 | 1e-03 | 0.2594 | 1.5256 | 0.1269 | 31 | 31 |
| 1e+02 | 1e-01 | 0.2594 | 1.5256 | 0.1269 | 17 | 17 |
Looser tolerances converge in fewer function evaluations with
estimates that are practically indistinguishable from those at tight
tolerances, provided the tolerance stays well above numerical noise. For
the CG solver the objective itself has noise of order \(O(s^{-1/2})\) (where \(s\) = nprobe), so factr = 1e4,
pgtol = 1e-3 is a reasonable match.
The stochastic log-determinant has variance decreasing with
nprobe. We compare \(s \in \{5,
30, 100\}\) over nrep = 10 independent data
realisations; the optimiser settings (ctrl) are held
fixed.
nrep <- 10
seeds <- 1:nrep
run_nprobe <- function(seed_data, nprobe) {
set.seed(seed_data)
yi <- c(rmvnorm.spam(1, Sigma = Sigma))
fit <- mle.spam(yi, h, cov.sph, theta0,
thetalower = thetalower,
thetaupper = thetaupper,
solver = "cg",
iter.control = list(nprobe = nprobe, seed = 1),
control = ctrl)
fit$theta
}
res5 <- t(sapply(seeds, run_nprobe, nprobe = 5))
res30 <- t(sapply(seeds, run_nprobe, nprobe = 30))
res100 <- t(sapply(seeds, run_nprobe, nprobe = 100))
make_row <- function(mat, s) {
c(nprobe = s,
range_mean = mean(mat[, 1]), range_sd = sd(mat[, 1]),
sill_mean = mean(mat[, 2]), sill_sd = sd(mat[, 2]),
nugget_mean= mean(mat[, 3]), nugget_sd= sd(mat[, 3]))
}
nprobe_res <- rbind(make_row(res5, 5),
make_row(res30, 30),
make_row(res100, 100))
knitr::kable(round(as.data.frame(nprobe_res), 4),
caption = paste("Mean and SD of CG/num.grad estimates over", nrep,
"data realisations by nprobe"))
| nprobe | range_mean | range_sd | sill_mean | sill_sd | nugget_mean | nugget_sd |
|---|---|---|---|---|---|---|
| 5 | 0.2528 | 0.0095 | 1.4471 | 0.2651 | 0.1141 | 0.0372 |
| 30 | 0.2502 | 0.0110 | 1.5093 | 0.2607 | 0.1010 | 0.0357 |
| 100 | 0.2494 | 0.0122 | 1.4593 | 0.2562 | 0.1085 | 0.0360 |
{r nprobe-boxplot, fig.width = 8, fig.height = 4, fig.cap = "Distribution of estimates by nprobe; red line = truth."} par(mfrow = c(1, 3)) nms <- c("range", "sill", "nugget") for (j in seq_along(nms)) { boxplot(list("s=5" = res5[, j], "s=30" = res30[, j], "s=100" = res100[, j]), main = nms[j], ylab = nms[j], col = c("#91bfdb", "#fee090", "#fc8d59")) abline(h = theta.true[j], col = "red", lwd = 2) } par(mfrow = c(1, 1))
Variance shrinks with increasing \(s\); the cost scales linearly with
nprobe.
For large \(n\) the Cholesky factorisation dominates. We generate a fresh dataset with \(n = 2000\) and compare three approaches: exact Cholesky with numerical gradient, CG with numerical gradient, and CG with analytical gradient.
set.seed(7)
n2 <- 2000
locs2 <- matrix(runif(2 * n2), n2, 2)
delta2 <- 0.25
h2 <- nearest.kdtree(locs2, delta = delta2, upper = NULL)
cat("Large distance matrix:", n2, "x", n2, " nnz =", nnz(h2), "\n")
#> Large distance matrix: 2000 x 2000 nnz = 622904
Sigma2 <- cov.sph(h2, theta.true)
y2 <- c(rmvnorm.spam(1, Sigma = Sigma2))
theta0.2 <- c(0.2, 1.0, 0.05)
thetaupper.2 <- c(delta2, Inf, Inf)
t2.chol.num <- system.time(
fit2.chol.num <- mle.spam(y2, h2, cov.sph, theta0.2,
thetalower = thetalower,
thetaupper = thetaupper.2,
control = ctrl)
)
t2.cg.num <- system.time(
fit2.cg.num <- mle.spam(y2, h2, cov.sph, theta0.2,
thetalower = thetalower,
thetaupper = thetaupper.2,
solver = "cg",
iter.control = list(nprobe = 30, seed = 1),
control = ctrl)
)
t2.cg.grad <- system.time(
fit2.cg.grad <- mle.spam(y2, h2, cov.sph, theta0.2,
thetalower = thetalower,
thetaupper = thetaupper.2,
gradCovariance = grad.cov.sph,
solver = "cg",
iter.control = list(nprobe = 30, seed = 1),
control = ctrl)
)
res2 <- rbind(
"truth" = c(theta.true, NA, NA),
"chol / num / L-BFGS-B" = c(fit2.chol.num$theta, fit2.chol.num$counts[1], fit2.chol.num$counts[2]),
"CG / num / L-BFGS-B" = c(fit2.cg.num$theta, fit2.cg.num$counts[1], fit2.cg.num$counts[2]),
"CG / anal / L-BFGS-B" = c(fit2.cg.grad$theta, fit2.cg.grad$counts[1], fit2.cg.grad$counts[2])
)
colnames(res2) <- c("range", "sill", "nugget", "fn evals", "gr evals")
knitr::kable(round(res2, 4), caption = "MLE estimates for n = 2000")
| range | sill | nugget | fn evals | gr evals | |
|---|---|---|---|---|---|
| truth | 0.25 | 1.5000 | 0.1000 | NA | NA |
| chol / num / L-BFGS-B | 0.25 | 1.5016 | 0.0953 | 24 | 24 |
| CG / num / L-BFGS-B | 0.25 | 1.5100 | 0.0946 | 75 | 75 |
| CG / anal / L-BFGS-B | 0.25 | 1.5100 | 0.0946 | 23 | 23 |
times2 <- c(
"chol / num" = t2.chol.num["elapsed"],
"CG / num" = t2.cg.num["elapsed"],
"CG / anal" = t2.cg.grad["elapsed"]
)
knitr::kable(data.frame(elapsed = round(times2, 2)),
caption = "Elapsed time (seconds) for n = 2000")
| elapsed | |
|---|---|
| chol / num.elapsed | 11.77 |
| CG / num.elapsed | 1914.26 |
| CG / anal.elapsed | 150.43 |
At \(n = 2000\) the Cholesky factorisation at each objective evaluation becomes significantly more expensive than the iterative CG solve, and the gap widens with \(n\). The CG solver with an analytical gradient further reduces the number of objective evaluations needed, combining the benefits of a cheap per-iteration cost and fewer iterations overall.