library(spam)
#> Spam version 3.0-0 (2026-04-12) is loaded.
#> Type 'help( Spam)' or 'demo( spam)' for a short introduction 
#> and overview of this package.
#> Help for individual functions is also obtained by adding the
#> suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
#> 
#> Attaching package: 'spam'
#> The following objects are masked from 'package:base':
#> 
#>     backsolve, forwardsolve

1 Spatial dataset

We place \(n = 500\) locations uniformly at random in \([0,1]^2\) and build a sparse distance matrix using nearest.kdtree() with neighbourhood radius \(\delta = 0.35\). The covariance model is spherical with \(\theta = (\text{range}, \text{sill}, \text{nugget}) = (0.3,\; 1.5,\; 0.1)\).

set.seed(42)
n     <- 500
locs  <- matrix(runif(2 * n), n, 2)
delta <- 0.35

h <- nearest.kdtree(locs, delta = delta, upper = NULL)
cat("Sparse distance matrix:", n, "x", n, " nnz =", nnz(h), "\n")
#> Sparse distance matrix: 500 x 500  nnz = 68464

theta.true <- c(0.25, 1.5, 0.1)       # range, sill, nugget
Sigma      <- cov.sph(h, theta.true)
y          <- c(rmvnorm.spam(1, Sigma = Sigma))
pal <- colorRampPalette(c("#4575b4", "#ffffbf", "#d73027"))(64)
col <- pal[cut(y, 64)]
plot(locs, pch = 19, col = col, cex = 0.9,
     xlab = "s1", ylab = "s2", main = "Simulated spatial data")
Simulated observations; colour encodes the response value.

Simulated observations; colour encodes the response value.

2 Estimation

We compare eight approaches differing in solver, parametrisation, gradient type, and optimiser. Natural-scale fits use box constraints; log-scale fits replace the box with a log link via reparametrise() and run unconstrained. A single ctrl list controls the L-BFGS-B convergence criteria.

theta0     <- c(0.2, 1.0, 0.05)
thetalower <- c(0.01, 0.01, 0)     # lower bounds on natural scale
thetaupper <- c(delta, Inf, Inf)   # range capped at neighbourhood radius

ctrl <- list(factr = 1e4,   # tighter than default 1e7
             pgtol = 1e-3)  # matches SLQ noise level

## log-scale wrappers: phi = log(theta), theta = exp(phi)
sph_log <- reparametrise(cov.sph, grad.cov.sph)
phi0    <- log(theta0)

2.1 Cholesky — numerical gradient (L-BFGS-B, natural scale)

optim approximates the gradient by finite differences at each step.

t.num <- system.time(
  fit.num <- mle.spam(y, h, cov.sph, theta0,
                      thetalower = thetalower,
                      thetaupper = thetaupper,
                      control    = ctrl)
)

2.2 Cholesky — numerical gradient (L-BFGS-B, log scale)

The same fit on the log scale via reparametrise(). With the true optimum interior to the box the projected gradient equals the full gradient, so L-BFGS-B behaves identically to unconstrained optimisation; the function evaluation count should match the natural-scale fit.

t.log.num <- system.time(
  fit.log.num <- mle.spam(y, h, sph_log$Covariance, phi0,
                           thetalower = rep(-Inf, 3),
                           thetaupper = rep( Inf, 3),
                           control    = ctrl)
)
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.

2.3 Cholesky — analytical gradient (L-BFGS-B, natural scale)

grad.cov.sph supplies exact derivatives \(\partial\Sigma/\partial\theta_k\); gradneg2loglik computes the gradient via \(n\) Cholesky back-solves (one per row of each derivative matrix).

t.grad <- system.time(
  fit.grad <- mle.spam(y, h, cov.sph, theta0,
                       thetalower     = thetalower,
                       thetaupper     = thetaupper,
                       gradCovariance = grad.cov.sph,
                       control        = ctrl)
)

2.4 Cholesky — numerical gradient (BFGS, log scale)

BFGS does not support box constraints. The log parametrisation removes the need for them: all \(\phi_k \in \mathbb{R}\) automatically give \(\theta_k = e^{\phi_k} > 0\).

t.bfgs.num <- system.time(
  fit.bfgs.num <- mle.spam(y, h, sph_log$Covariance, phi0,
                            thetalower = rep(-Inf, 3),
                            thetaupper = rep( Inf, 3),
                            method     = "BFGS")
)
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.

2.5 Cholesky — analytical gradient (BFGS, log scale)

The chain-rule correction \(\partial/\partial\phi_k = e^{\phi_k}\,\partial/\partial\theta_k\) is applied automatically by the reparametrise() wrapper.

t.bfgs.grad <- system.time(
  fit.bfgs.grad <- mle.spam(y, h, sph_log$Covariance, phi0,
                             thetalower     = rep(-Inf, 3),
                             thetaupper     = rep( Inf, 3),
                             gradCovariance = sph_log$gradCovariance,
                             method         = "BFGS")
)
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.
#> Warning in (function (object, x, ...) : Singularity problem when updating a Cholesky Factor.
#> 'object' not updated.

2.6 CG — stochastic log-determinant, numerical gradient

The log-determinant is estimated via stochastic Lanczos quadrature with nprobe = 30 Rademacher probe vectors fixed before optim begins (deterministic objective).

t.cg.num <- system.time(
  fit.cg.num <- mle.spam(y, h, cov.sph, theta0,
                          thetalower   = thetalower,
                          thetaupper   = thetaupper,
                          solver       = "cg",
                          iter.control = list(nprobe = 30, seed = 1),
                          control      = ctrl)
)

2.7 CG — stochastic log-determinant, analytical gradient

The Hutchinson estimator approximates the trace term in the gradient using the same probe vectors as the objective — both then optimise a consistent stochastic approximation.

t.cg.grad <- system.time(
  fit.cg.grad <- mle.spam(y, h, cov.sph, theta0,
                           thetalower     = thetalower,
                           thetaupper     = thetaupper,
                           gradCovariance = grad.cov.sph,
                           solver         = "cg",
                           iter.control   = list(nprobe = 30, seed = 1),
                           control        = ctrl)
)

3 Comparison

Log-scale estimates are back-transformed via \(\hat\theta = e^{\hat\phi}\) for comparison with natural-scale fits.

res <- rbind(
  "truth"                        = c(theta.true,                   NA,                       NA),
  "chol / num  / L-BFGS-B"      = c(fit.num$theta,            fit.num$counts[1],         fit.num$counts[2]),
  "chol / num  / L-BFGS-B / log" = c(exp(fit.log.num$theta),  fit.log.num$counts[1],     fit.log.num$counts[2]),
  "chol / anal / L-BFGS-B"      = c(fit.grad$theta,           fit.grad$counts[1],        fit.grad$counts[2]),
  "chol / num  / BFGS / log"     = c(exp(fit.bfgs.num$theta), fit.bfgs.num$counts[1],    fit.bfgs.num$counts[2]),
  "chol / anal / BFGS / log"     = c(exp(fit.bfgs.grad$theta),fit.bfgs.grad$counts[1],   fit.bfgs.grad$counts[2]),
  "CG   / num  / L-BFGS-B"      = c(fit.cg.num$theta,        fit.cg.num$counts[1],      fit.cg.num$counts[2]),
  "CG   / anal / L-BFGS-B"      = c(fit.cg.grad$theta,       fit.cg.grad$counts[1],     fit.cg.grad$counts[2])
)
colnames(res) <- c("range", "sill", "nugget", "fn evals", "gr evals")
knitr::kable(round(res, 4), caption = "MLE estimates and optimiser call counts")
MLE estimates and optimiser call counts
range sill nugget fn evals gr evals
truth 0.2500 1.5000 0.1000 NA NA
chol / num / L-BFGS-B 0.2594 1.5256 0.1269 31 31
chol / num / L-BFGS-B / log 0.2594 1.5253 0.1269 33 33
chol / anal / L-BFGS-B 0.2594 1.5253 0.1269 21 21
chol / num / BFGS / log 0.3755 2.1516 0.1366 47 15
chol / anal / BFGS / log 0.3755 2.1535 0.1367 54 11
CG / num / L-BFGS-B 0.2597 1.5569 0.1220 112 112
CG / anal / L-BFGS-B 0.2592 1.5542 0.1220 20 20
times <- c(
  "chol / num  / L-BFGS-B"       = t.num["elapsed"],
  "chol / num  / L-BFGS-B / log" = t.log.num["elapsed"],
  "chol / anal / L-BFGS-B"       = t.grad["elapsed"],
  "chol / num  / BFGS / log"     = t.bfgs.num["elapsed"],
  "chol / anal / BFGS / log"     = t.bfgs.grad["elapsed"],
  "CG   / num  / L-BFGS-B"       = t.cg.num["elapsed"],
  "CG   / anal / L-BFGS-B"       = t.cg.grad["elapsed"]
)
knitr::kable(data.frame(elapsed = round(times, 2)),
             caption = "Elapsed time (seconds)")
Elapsed time (seconds)
elapsed
chol / num / L-BFGS-B.elapsed 1.04
chol / num / L-BFGS-B / log.elapsed 1.24
chol / anal / L-BFGS-B.elapsed 10.42
chol / num / BFGS / log.elapsed 0.63
chol / anal / BFGS / log.elapsed 5.57
CG / num / L-BFGS-B.elapsed 175.36
CG / anal / L-BFGS-B.elapsed 7.86

All approaches recover the true parameters well. The natural-scale and log-scale L-BFGS-B fits (rows 1–2) show matching function evaluation counts: when the optimum is interior to the box the projected gradient equals the full gradient, so the two parametrisations traverse the same steps. The analytical gradient reduces function evaluations (each gradient evaluation replaces \(p + 1\) finite-difference calls, \(p = \text{length}(\theta) = 3\)). The CG solver is attractive for large \(n\) where Cholesky becomes the bottleneck.

4 Effect of convergence tolerances

The L-BFGS-B stopping criteria factr (relative function decrease) and pgtol (projected gradient norm) control when the optimiser terminates. Tighter tolerances increase the number of iterations but have diminishing returns once the tolerance is below the noise floor of the objective. Here we vary both simultaneously using the Cholesky solver with numerical gradient.

factr_vals <- c(1e7, 1e5, 1e4, 1e2)   # 1e7 is the R default
pgtol_vals <- c(1e-5, 1e-4, 1e-3, 1e-1)

grid <- data.frame(factr = factr_vals, pgtol = pgtol_vals)

tol_fits <- lapply(seq_len(nrow(grid)), function(i) {
  mle.spam(y, h, cov.sph, theta0,
           thetalower = thetalower,
           thetaupper = thetaupper,
           control    = list(factr = grid$factr[i],
                             pgtol = grid$pgtol[i]))
})

tol_res <- do.call(rbind, lapply(seq_len(nrow(grid)), function(i) {
  fit <- tol_fits[[i]]
  c(factr     = grid$factr[i],
    pgtol     = grid$pgtol[i],
    range     = fit$theta[1],
    sill      = fit$theta[2],
    nugget    = fit$theta[3],
    fn_evals  = fit$counts[1],
    gr_evals  = fit$counts[2])
}))

knitr::kable(round(as.data.frame(tol_res), 4),
             caption = paste("Effect of factr and pgtol on chol/num.grad estimates.",
                             "True theta =", paste(theta.true, collapse = ", ")))
Effect of factr and pgtol on chol/num.grad estimates. True theta = 0.25, 1.5, 0.1
factr pgtol range sill nugget fn_evals.function gr_evals.gradient
1e+07 0e+00 0.2594 1.5256 0.1269 31 31
1e+05 1e-04 0.2594 1.5256 0.1269 31 31
1e+04 1e-03 0.2594 1.5256 0.1269 31 31
1e+02 1e-01 0.2594 1.5256 0.1269 17 17

Looser tolerances converge in fewer function evaluations with estimates that are practically indistinguishable from those at tight tolerances, provided the tolerance stays well above numerical noise. For the CG solver the objective itself has noise of order \(O(s^{-1/2})\) (where \(s\) = nprobe), so factr = 1e4, pgtol = 1e-3 is a reasonable match.

5 Effect of the number of probe vectors

The stochastic log-determinant has variance decreasing with nprobe. We compare \(s \in \{5, 30, 100\}\) over nrep = 10 independent data realisations; the optimiser settings (ctrl) are held fixed.

nrep  <- 10
seeds <- 1:nrep

run_nprobe <- function(seed_data, nprobe) {
  set.seed(seed_data)
  yi  <- c(rmvnorm.spam(1, Sigma = Sigma))
  fit <- mle.spam(yi, h, cov.sph, theta0,
                  thetalower   = thetalower,
                  thetaupper   = thetaupper,
                  solver       = "cg",
                  iter.control = list(nprobe = nprobe, seed = 1),
                  control      = ctrl)
  fit$theta
}
res5   <- t(sapply(seeds, run_nprobe, nprobe =   5))
res30  <- t(sapply(seeds, run_nprobe, nprobe =  30))
res100 <- t(sapply(seeds, run_nprobe, nprobe = 100))
make_row <- function(mat, s) {
  c(nprobe = s,
    range_mean = mean(mat[, 1]), range_sd = sd(mat[, 1]),
    sill_mean  = mean(mat[, 2]), sill_sd  = sd(mat[, 2]),
    nugget_mean= mean(mat[, 3]), nugget_sd= sd(mat[, 3]))
}
nprobe_res <- rbind(make_row(res5,   5),
                    make_row(res30,  30),
                    make_row(res100, 100))
knitr::kable(round(as.data.frame(nprobe_res), 4),
             caption = paste("Mean and SD of CG/num.grad estimates over", nrep,
                             "data realisations by nprobe"))
Mean and SD of CG/num.grad estimates over 10 data realisations by nprobe
nprobe range_mean range_sd sill_mean sill_sd nugget_mean nugget_sd
5 0.2528 0.0095 1.4471 0.2651 0.1141 0.0372
30 0.2502 0.0110 1.5093 0.2607 0.1010 0.0357
100 0.2494 0.0122 1.4593 0.2562 0.1085 0.0360

{r nprobe-boxplot, fig.width = 8, fig.height = 4, fig.cap = "Distribution of estimates by nprobe; red line = truth."} par(mfrow = c(1, 3)) nms <- c("range", "sill", "nugget") for (j in seq_along(nms)) { boxplot(list("s=5" = res5[, j], "s=30" = res30[, j], "s=100" = res100[, j]), main = nms[j], ylab = nms[j], col = c("#91bfdb", "#fee090", "#fc8d59")) abline(h = theta.true[j], col = "red", lwd = 2) } par(mfrow = c(1, 1))

Variance shrinks with increasing \(s\); the cost scales linearly with nprobe.

6 Scaling to n = 2000

For large \(n\) the Cholesky factorisation dominates. We generate a fresh dataset with \(n = 2000\) and compare three approaches: exact Cholesky with numerical gradient, CG with numerical gradient, and CG with analytical gradient.

set.seed(7)
n2     <- 2000
locs2  <- matrix(runif(2 * n2), n2, 2)
delta2 <- 0.25

h2     <- nearest.kdtree(locs2, delta = delta2, upper = NULL)
cat("Large distance matrix:", n2, "x", n2, " nnz =", nnz(h2), "\n")
#> Large distance matrix: 2000 x 2000  nnz = 622904

Sigma2 <- cov.sph(h2, theta.true)
y2     <- c(rmvnorm.spam(1, Sigma = Sigma2))
theta0.2     <- c(0.2, 1.0, 0.05)
thetaupper.2 <- c(delta2, Inf, Inf)

t2.chol.num <- system.time(
  fit2.chol.num <- mle.spam(y2, h2, cov.sph, theta0.2,
                             thetalower = thetalower,
                             thetaupper = thetaupper.2,
                             control    = ctrl)
)

t2.cg.num <- system.time(
  fit2.cg.num <- mle.spam(y2, h2, cov.sph, theta0.2,
                           thetalower   = thetalower,
                           thetaupper   = thetaupper.2,
                           solver       = "cg",
                           iter.control = list(nprobe = 30, seed = 1),
                           control      = ctrl)
)

t2.cg.grad <- system.time(
  fit2.cg.grad <- mle.spam(y2, h2, cov.sph, theta0.2,
                            thetalower     = thetalower,
                            thetaupper     = thetaupper.2,
                            gradCovariance = grad.cov.sph,
                            solver         = "cg",
                            iter.control   = list(nprobe = 30, seed = 1),
                            control        = ctrl)
)
res2 <- rbind(
  "truth"                 = c(theta.true,              NA,                     NA),
  "chol / num  / L-BFGS-B" = c(fit2.chol.num$theta,  fit2.chol.num$counts[1], fit2.chol.num$counts[2]),
  "CG   / num  / L-BFGS-B" = c(fit2.cg.num$theta,   fit2.cg.num$counts[1],   fit2.cg.num$counts[2]),
  "CG   / anal / L-BFGS-B" = c(fit2.cg.grad$theta,  fit2.cg.grad$counts[1],  fit2.cg.grad$counts[2])
)
colnames(res2) <- c("range", "sill", "nugget", "fn evals", "gr evals")
knitr::kable(round(res2, 4), caption = "MLE estimates for n = 2000")
MLE estimates for n = 2000
range sill nugget fn evals gr evals
truth 0.25 1.5000 0.1000 NA NA
chol / num / L-BFGS-B 0.25 1.5016 0.0953 24 24
CG / num / L-BFGS-B 0.25 1.5100 0.0946 75 75
CG / anal / L-BFGS-B 0.25 1.5100 0.0946 23 23
times2 <- c(
  "chol / num"  = t2.chol.num["elapsed"],
  "CG   / num"  = t2.cg.num["elapsed"],
  "CG   / anal" = t2.cg.grad["elapsed"]
)
knitr::kable(data.frame(elapsed = round(times2, 2)),
             caption = "Elapsed time (seconds) for n = 2000")
Elapsed time (seconds) for n = 2000
elapsed
chol / num.elapsed 11.77
CG / num.elapsed 1914.26
CG / anal.elapsed 150.43

At \(n = 2000\) the Cholesky factorisation at each objective evaluation becomes significantly more expensive than the iterative CG solve, and the gap widens with \(n\). The CG solver with an analytical gradient further reduces the number of objective evaluations needed, combining the benefits of a cheap per-iteration cost and fewer iterations overall.