Firth’s (1993) penalized likelihood removes the leading \(O(1/n)\) bias from maximum likelihood estimates and produces finite estimates even under separation. fastglm implements the general mean-bias reduction of Kosmidis and Firth (2009, 2021), which extends Firth’s original logistic penalty to arbitrary GLM families. The method is activated with a single flag:
The penalty adds \(\frac{1}{2} \partial \log |X^\top W X| / \partial \beta\) to the score, which is equivalent to modifying the IRLS working response by
\[ \xi_i = \frac{\phi \, h_i \, \mu''(\eta_i) \, V(\mu_i)}{2 \, w_i \, [\mu'(\eta_i)]^3} \]
where \(h_i\) is the \(i\)-th diagonal of the hat matrix \(H = W^{1/2} X (X^\top W X)^{-1} X^\top
W^{1/2}\), \(\mu''(\eta)\) is the second
derivative of the inverse link, \(V(\mu)\) is the variance function, \(w_i\) is the prior weight, and \(\phi\) is the dispersion (1 for binomial
and Poisson families; estimated iteratively for Gaussian, Gamma, and
inverse Gaussian). This is the AS_mean adjustment of Kosmidis and Firth
(2009), the same method used by
brglm2::brglmFit(type = "AS_mean").
The adjustment is computed once per IRLS iteration using the leverages from the weighted least-squares decomposition that fastglm already performs, so the overhead relative to an unpenalized fit is modest.
The canonical application is logistic regression with (quasi-)
separation, where the standard MLE diverges. The sex2
dataset from logistf (Heinze and Schemper, 2002)
demonstrates:
data(sex2, package = "logistf")
X <- model.matrix(~ age + oc + vic + vicl + vis + dia, data = sex2)
y <- sex2$case
fit_f <- fastglm(X, y, family = binomial(), firth = TRUE)
fit_l <- logistf::logistf(case ~ age + oc + vic + vicl + vis + dia,
data = sex2, pl = FALSE, plconf = NULL,
control = logistf::logistf.control(
lconv = 1e-12, gconv = 1e-12, xconv = 1e-12,
maxit = 200L))
cbind(fastglm = unname(coef(fit_f)), logistf = unname(coef(fit_l)))
#> fastglm logistf
#> [1,] 0.12025405 0.12025405
#> [2,] -1.10598133 -1.10598133
#> [3,] -0.06881673 -0.06881673
#> [4,] 2.26887465 2.26887465
#> [5,] -2.11140819 -2.11140819
#> [6,] -0.78831695 -0.78831695
#> [7,] 3.09601183 3.09601183
max(abs(unname(coef(fit_f)) - unname(coef(fit_l))))
#> [1] 1.478671e-09Coefficients agree to roughly 1e-7. The runtime advantage is substantial because fastglm’s Firth solver reuses the C++ IRLS infrastructure:
mb <- microbenchmark::microbenchmark(
fastglm = fastglm(X, y, family = binomial(), firth = TRUE),
logistf = logistf::logistf(case ~ age + oc + vic + vicl + vis + dia,
data = sex2, pl = FALSE, plconf = NULL),
times = 10L
)
print(mb, unit = "ms")
#> Unit: milliseconds
#> expr min lq mean median uq max neval cld
#> fastglm 0.255717 0.265229 0.2795995 0.273798 0.283351 0.349320 10 a
#> logistf 1.263456 1.279077 1.3051735 1.287584 1.313230 1.411876 10 bfirth = TRUE works with all standard R families and link
functions. For each family below, we verify that fastglm
reproduces the bias-reduced coefficients from
brglm2::brglmFit(type = "AS_mean") — both implement the
same AS_mean adjustment, so the coefficients should agree to near
machine precision.
library(brglm2)
set.seed(123)
n <- 500
x1 <- rnorm(n); x2 <- rnorm(n)
X <- cbind(1, x1, x2)
eta_true <- 0.5 + 0.3 * x1 - 0.2 * x2y_bin <- rbinom(n, 1, plogis(eta_true))
df <- data.frame(y = y_bin, x1 = x1, x2 = x2)
for (lnk in c("logit", "probit", "cloglog")) {
fam <- binomial(lnk)
f <- fastglm(X, y_bin, family = fam, firth = TRUE, tol = 1e-10)
b <- glm(y ~ x1 + x2, data = df, family = fam, method = "brglmFit",
type = "AS_mean", control = list(epsilon = 1e-10, maxit = 200))
cat(sprintf("binomial %-8s max|diff| = %.2e\n", lnk,
max(abs(unname(coef(f)) - unname(coef(b))))))
}
#> binomial logit max|diff| = 2.29e-12
#> binomial probit max|diff| = 1.10e-11
#> binomial cloglog max|diff| = 3.09e-11y_pois <- rpois(n, exp(eta_true))
for (lnk in c("log", "sqrt")) {
fam <- poisson(lnk)
f <- fastglm(X, y_pois, family = fam, firth = TRUE, tol = 1e-10)
b <- glm(y ~ x1 + x2, data = data.frame(y = y_pois, x1 = x1, x2 = x2),
family = fam, method = "brglmFit", type = "AS_mean",
control = list(epsilon = 1e-10, maxit = 200))
cat(sprintf("poisson %-8s max|diff| = %.2e\n", lnk,
max(abs(unname(coef(f)) - unname(coef(b))))))
}
#> poisson log max|diff| = 2.13e-12
#> poisson sqrt max|diff| = 6.30e-11y_gam <- rgamma(n, shape = 5, rate = 5 / exp(eta_true))
for (lnk in c("log", "inverse")) {
fam <- Gamma(lnk)
f <- fastglm(X, y_gam, family = fam, firth = TRUE, tol = 1e-10)
b <- glm(y ~ x1 + x2, data = data.frame(y = y_gam, x1 = x1, x2 = x2),
family = fam, method = "brglmFit", type = "AS_mean",
control = list(epsilon = 1e-10, maxit = 200))
cat(sprintf("Gamma %-8s max|diff| = %.2e\n", lnk,
max(abs(unname(coef(f)) - unname(coef(b))))))
}
#> Gamma log max|diff| = 2.10e-05
#> Gamma inverse max|diff| = 1.67e-05y_gauss <- eta_true + rnorm(n, 0, 0.5)
for (lnk in c("identity", "log")) {
fam <- gaussian(lnk)
y0 <- if (lnk == "log") pmax(y_gauss, 0.01) else y_gauss
f <- fastglm(X, y0, family = fam, firth = TRUE, tol = 1e-10)
b <- glm(y ~ x1 + x2, data = data.frame(y = y0, x1 = x1, x2 = x2),
family = fam, method = "brglmFit", type = "AS_mean",
control = list(epsilon = 1e-10, maxit = 200))
cat(sprintf("gaussian %-8s max|diff| = %.2e\n", lnk,
max(abs(unname(coef(f)) - unname(coef(b))))))
}
#> gaussian identity max|diff| = 1.67e-16
#> gaussian log max|diff| = 1.78e-06mu_ig <- exp(eta_true)
y_ig <- statmod::rinvgauss(n, mean = mu_ig, shape = 5)
y_ig[y_ig <= 0] <- 0.01
f <- fastglm(X, y_ig, family = inverse.gaussian("log"), firth = TRUE, tol = 1e-10)
b <- glm(y ~ x1 + x2, data = data.frame(y = y_ig, x1 = x1, x2 = x2),
family = inverse.gaussian("log"), method = "brglmFit", type = "AS_mean",
control = list(epsilon = 1e-10, maxit = 200))
cat(sprintf("inv.gauss log max|diff| = %.2e\n",
max(abs(unname(coef(f)) - unname(coef(b))))))
#> inv.gauss log max|diff| = 1.16e-06The standard errors reported by fastglm with
firth = TRUE come from the unscaled \((X^\top W X)^{-1}\) at convergence, the
same quantity that brglm2 uses. For unit-dispersion
families (binomial, Poisson), these are the Firth-penalized standard
errors directly; for other families, the dispersion is estimated as
\(\hat\phi = D / (n - p)\) and applied
to the variance-covariance matrix.
f <- fastglm(X, y_bin, family = binomial(), firth = TRUE, tol = 1e-10)
b <- glm(y ~ x1 + x2, data = df, family = binomial(), method = "brglmFit",
type = "AS_mean", control = list(epsilon = 1e-10, maxit = 200))
cbind(fastglm_SE = f$se, brglm2_SE = summary(b)$coefficients[, "Std. Error"])
#> fastglm_SE brglm2_SE
#> (Intercept) 0.09423309 0.09423309
#> x1 0.09910290 0.09910290
#> x2 0.09527990 0.09527990
max(abs(f$se - summary(b)$coefficients[, "Std. Error"]))
#> [1] 8.0283e-14The Firth-penalized deviance is \(D^* = D - \log |X^\top W X|\), where \(D\) is the standard deviance and the log-determinant comes from the information matrix at the penalized MLE. Both the standard and penalized deviances are reported:
A timing comparison of fastglm(firth = TRUE) against
brglm2::brglmFit(type = "AS_mean") across families with
n = 10000 observations and p = 5
covariates:
set.seed(123)
n <- 10000; p <- 5
x1 <- rnorm(n); x2 <- rnorm(n); x3 <- rnorm(n)
x4 <- rnorm(n); x5 <- rnorm(n)
X <- cbind(1, x1, x2, x3, x4, x5)
eta <- 0.5 + 0.3 * x1 - 0.2 * x2 + 0.1 * x3
families <- list(
`binomial logit` = list(fam = binomial("logit"),
y = rbinom(n, 1, plogis(eta))),
`binomial probit` = list(fam = binomial("probit"),
y = rbinom(n, 1, plogis(eta))),
`binomial cloglog` = list(fam = binomial("cloglog"),
y = rbinom(n, 1, plogis(eta))),
`poisson log` = list(fam = poisson("log"),
y = rpois(n, exp(eta))),
`Gamma log` = list(fam = Gamma("log"),
y = rgamma(n, shape = 5, rate = 5 / exp(eta))),
`gaussian identity`= list(fam = gaussian("identity"),
y = eta + rnorm(n, 0, 0.5))
)
df <- data.frame(y = 0, x1 = x1, x2 = x2, x3 = x3, x4 = x4, x5 = x5)
results <- list()
for (nm in names(families)) {
fi <- families[[nm]]
df$y <- fi$y
mb <- microbenchmark::microbenchmark(
fastglm = fastglm(X, fi$y, family = fi$fam, firth = TRUE),
brglm2 = glm(y ~ x1 + x2 + x3 + x4 + x5, data = df,
family = fi$fam, method = "brglmFit", type = "AS_mean"),
times = 10L
)
s <- summary(mb, unit = "ms")
cat(sprintf("%-20s fastglm=%7.2f ms brglm2=%7.2f ms speedup=%.1fx\n",
nm, s$median[1], s$median[2], s$median[2] / s$median[1]))
}
#> binomial logit fastglm= 3.22 ms brglm2= 13.31 ms speedup=4.1x
#> binomial probit fastglm= 4.22 ms brglm2= 15.77 ms speedup=3.7x
#> binomial cloglog fastglm= 5.58 ms brglm2= 16.75 ms speedup=3.0x
#> poisson log fastglm= 3.66 ms brglm2= 121.21 ms speedup=33.1x
#> Gamma log fastglm= 3.64 ms brglm2= 100.23 ms speedup=27.6x
#> gaussian identity fastglm= 1.02 ms brglm2= 21.17 ms speedup=20.7xFirth, D. (1993). Bias reduction of maximum likelihood estimates. Biometrika, 80(1), 27–38. doi:10.1093/biomet/80.1.27
Heinze, G. and Schemper, M. (2002). A solution to the problem of separation in logistic regression. Statistics in Medicine, 21(16), 2409–2419. doi:10.1002/sim.1047
Kosmidis, I. and Firth, D. (2009). Bias reduction in exponential family nonlinear models. Biometrika, 96(4), 793–804. doi:10.1093/biomet/asp055
Kosmidis, I. and Firth, D. (2021). Jeffreys-prior penalty, finiteness and shrinkage in binomial-response generalized linear models. Biometrika, 108(1), 71–82. doi:10.1093/biomet/asaa052
Kosmidis, I. (2014). Bias in parametric estimation: reduction and useful side-effects. WIREs Computational Statistics, 6(3), 185–196. doi:10.1002/wics.1296