Stata Parity: Replicating Stata fuzzydid in R

Introduction

This vignette is a parity check between Rfuzzydid and Stata’s fuzzydid. Each section runs an R example and, where possible, compares against frozen Stata outputs used in the test suite.

The Stata snippets assume the Stata fuzzydid command is already installed locally; Rfuzzydid no longer bundles the .ado sources.

The shared estimators come from de Chaisemartin and D’Haultfoeuille (2018): local average treatment effects (LATE) and local quantile treatment effects under fuzzy DID assumptions.

Simulated Data Example

We start with a simple two-period, two-group design:

set.seed(50321)

n_cell <- 120  # observations per cell

# Helper function to create each (G, T) cell
make_cell <- function(g, t, prob_d) {
  d <- rbinom(n_cell, size = 1, prob = prob_d)
  # Outcome: base + group effect + time effect + treatment effect + noise
  y <- 1 + 0.5 * g + 0.4 * t + 2.0 * d + sin(seq_len(n_cell) / 7)
  data.frame(y = y, g = g, t = t, d = d)
}

# Build the four cells
df <- rbind(
  make_cell(g = 0, t = 0, prob_d = 0.20),  # Control group, baseline
  make_cell(g = 0, t = 1, prob_d = 0.40),  # Control group, follow-up  
  make_cell(g = 1, t = 0, prob_d = 0.30),  # Treatment group, baseline
  make_cell(g = 1, t = 1, prob_d = 0.80)   # Treatment group, follow-up
)

# Verify cell sizes and treatment rates
aggregate(cbind(n = d) ~ g + t, data = df, FUN = function(x) c(n = length(x), mean = mean(x)))
#>   g t         n.n      n.mean
#> 1 0 0 120.0000000   0.1583333
#> 2 1 0 120.0000000   0.2916667
#> 3 0 1 120.0000000   0.4333333
#> 4 1 1 120.0000000   0.8083333

# Save fixture used by most examples for direct Stata comparison
if (requireNamespace("haven", quietly = TRUE)) {
  haven::write_dta(df, "stata-parity-sim-fixture.dta")
}

The treatment rate jumps from 30% to 80% in the treated group and from 20% to 40% in controls. That gap in treatment uptake changes is the first stage that identifies fuzzy DID effects.

Basic Estimation

Wald-DID, Wald-TC, and Wald-CIC

We request the three main LATE estimators:

fit <- fuzzydid(
  data = df,
  formula = y ~ d,
  group = "g",
  time = "t",
  did = TRUE,
  tc = TRUE,
  cic = TRUE,
  breps = 50
)

summary(fit)
#> LATE estimators
#> 
#> 
#> |estimator | estimate| std.error| conf.low| conf.high|
#> |:---------|--------:|---------:|--------:|---------:|
#> |W_DID     | 2.000000| 0.4679188| 1.088768|  2.960728|
#> |W_TC      | 1.973204| 0.2333661| 1.526637|  2.309330|
#> |W_CIC     | 2.222098| 0.2028711| 1.820405|  2.545051|

stata_basic <- c(
  W_DID = 2.00000000000000,
  W_TC = 1.97320415000280,
  W_CIC = 2.22209836585117
)
r_basic <- setNames(fit$late$estimate, fit$late$estimator)[names(stata_basic)]
basic_cmp <- data.frame(
  estimator = names(stata_basic),
  R = unname(r_basic),
  Stata = unname(stata_basic),
  abs_diff = abs(unname(r_basic) - unname(stata_basic))
)
knitr::kable(basic_cmp, digits = 8)
estimator R Stata abs_diff
W_DID 2.000000 2.000000 0e+00
W_TC 1.973204 1.973204 8e-08
W_CIC 2.222098 2.222098 5e-08

The table reports point estimates, bootstrap standard errors, and percentile confidence intervals.

Equivalent Stata code:

fuzzydid y g t d, did tc cic breps(50)
matrix list e(b_LATE)
matrix list e(se_LATE)
matrix list e(ci_LATE)

Extracting Results Programmatically

modelsummary gives a compact table that is easy to pass into reports:

library(modelsummary)

modelsummary::modelsummary(fit)
(1)
W_DID 2.000
(0.468)
W_TC 1.973
(0.233)
W_CIC 2.222
(0.203)
Num.Obs. 480
backend native
N.11 120
N.10 120
N.01 120
N.00 120
N.reps 50
N.misreps 0
Share.failures 0

Equivalent Stata code:

* Tidy-like access in Stata comes from stored matrices in e()
matrix list e(b_LATE)
matrix list e(se_LATE)
matrix list e(ci_LATE)

Core Stata Parity Check

This chunk reproduces the exact fixture used by the parity tests and checks that R and Stata point estimates agree to tolerance:

n_cell <- 120L

make_parity_cell <- function(g, t, ones) {
  data.frame(
    g = rep.int(g, n_cell),
    t = rep.int(t, n_cell),
    d = c(rep.int(1L, ones), rep.int(0L, n_cell - ones))
  )
}

out <- rbind(
  make_parity_cell(0L, 0L, 24L),
  make_parity_cell(0L, 1L, 48L),
  make_parity_cell(1L, 0L, 36L),
  make_parity_cell(1L, 1L, 96L)
)
out$id <- seq_len(nrow(out))
out$y <- 1 + 0.5 * out$g + 0.4 * out$t + 2 * out$d + sin(out$id / 7)
parity_fixture <- out[, c("y", "g", "t", "d")]

fit_parity <- fuzzydid(
  data = parity_fixture,
  formula = y ~ d,
  group = "g",
  time = "t",
  did = TRUE,
  tc = TRUE,
  cic = TRUE,
  breps = 5,
  seed = 1
)

stata_core <- c(
  W_DID = 2.17430877504668,
  W_TC = 2.18553887285118,
  W_CIC = 2.31982319056988
)

r_core <- setNames(fit_parity$late$estimate, fit_parity$late$estimator)[c("W_DID", "W_TC", "W_CIC")]

core_cmp <- data.frame(
  estimator = names(stata_core),
  R = unname(r_core),
  Stata = unname(stata_core),
  abs_diff = abs(unname(r_core) - unname(stata_core))
)

knitr::kable(core_cmp, digits = 8)
estimator R Stata abs_diff
W_DID 2.174309 2.174309 7e-08
W_TC 2.185539 2.185539 4e-08
W_CIC 2.319823 2.319823 3e-08

# Save exact parity fixture for Stata users
if (requireNamespace("haven", quietly = TRUE)) {
  haven::write_dta(parity_fixture, "stata-parity-core-fixture.dta")
}

Point estimates are strict parity targets here. Standard errors and intervals depend on bootstrap simulation, so tiny platform-level differences are normal.

Equivalent Stata code:

* Load the same dataset exported in the parity check chunk
use "stata-parity-core-fixture.dta", clear

* Generate the group variable (already in dataset as 'g')
* The Stata command expects G to identify treatment groups

* Run fuzzydid
fuzzydid y g t d, did tc cic breps(50)

* Results are stored in e()
matrix list e(b_LATE)
matrix list e(se_LATE)
matrix list e(ci_LATE)

Key Parity Points

The next sections cover behavior that should match Stata and often causes confusion in practice.

1. User-Controlled Bootstrap Seed

Bootstrap inference is reproducible when you pass the same seed value. If you change seed, the resamples change too:

# These produce identical SE/CI because the same bootstrap seed is supplied
fit1 <- fuzzydid(df, y ~ d, group = "g", time = "t", did = TRUE, breps = 50, seed = 1)
fit2 <- fuzzydid(df, y ~ d, group = "g", time = "t", did = TRUE, breps = 50, seed = 1)

all.equal(fit1$late$std.error, fit2$late$std.error)
#> [1] TRUE

stata_seed_se <- 0.66310988000000
seed_cmp <- data.frame(
  estimator = "W_DID",
  R_seed_1_run_1 = fit1$late$std.error[fit1$late$estimator == "W_DID"],
  R_seed_1_run_2 = fit2$late$std.error[fit2$late$estimator == "W_DID"],
  Stata = stata_seed_se,
  stringsAsFactors = FALSE
)
seed_cmp$abs_diff_run_1_vs_Stata <- abs(seed_cmp$R_seed_1_run_1 - seed_cmp$Stata)
seed_cmp$abs_diff_run_2_vs_Stata <- abs(seed_cmp$R_seed_1_run_2 - seed_cmp$Stata)
knitr::kable(seed_cmp, digits = 8)
estimator R_seed_1_run_1 R_seed_1_run_2 Stata abs_diff_run_1_vs_Stata abs_diff_run_2_vs_Stata
W_DID 0.7859547 0.7859547 0.6631099 0.1228449 0.1228449

Equivalent Stata code:

use "stata-parity-sim-fixture.dta", clear
set seed 1
fuzzydid y g t d, did breps(50)
matrix se1 = e(se_LATE)

set seed 1
fuzzydid y g t d, did breps(50)
matrix se2 = e(se_LATE)

matrix list se1
matrix list se2

2. Design Restrictions

Rfuzzydid enforces the same design restrictions as Stata:

df_restrict <- transform(df, x = rnorm(nrow(df)))

# CIC requires no covariates
try({
  fuzzydid(df_restrict, y ~ d + x, group = "g", time = "t", cic = TRUE)
})
#> Error : continuous and qualitative cannot be used in combination with cic or lqte.

# LQTE requires binary G, T, and D, plus no covariates
try({
  fuzzydid(df_restrict, y ~ d + x, group = "g", time = "t", lqte = TRUE)
})
#> Error : continuous and qualitative cannot be used in combination with cic or lqte.

Equivalent Stata code:

use "stata-parity-sim-fixture.dta", clear
gen x = rnormal()

* CIC with covariates should error
capture noisily fuzzydid y g t d x, cic

* LQTE with covariates should error
capture noisily fuzzydid y g t d x, lqte

3. Partial Identification Bounds

When Wald-TC point identification is unavailable, both implementations can report partial-identification bounds:

fit_partial <- fuzzydid(
  data = df,
  formula = y ~ d,
  group = "g",
  time = "t",
  tc = TRUE,
  partial = TRUE,
  breps = 50
)

knitr::kable(fit_partial$late)
estimator estimate std.error conf.low conf.high
TC_inf 0.012206 0.4554927 -0.8004909 0.9406186
TC_sup 3.081771 0.3345019 2.4384801 3.6728553

r_partial <- setNames(fit_partial$late$estimate, fit_partial$late$estimator)[names(stata_partial)]
partial_cmp <- data.frame(
  estimator = names(stata_partial),
  R = unname(r_partial),
  Stata = unname(stata_partial),
  abs_diff = abs(unname(r_partial) - unname(stata_partial))
)
knitr::kable(partial_cmp, digits = 8)
estimator R Stata abs_diff
TC_inf 0.01220596 0.01220596 0e+00
TC_sup 3.08177133 3.08177130 3e-08

Equivalent Stata code:

use "stata-parity-sim-fixture.dta", clear
fuzzydid y g t d, tc partial breps(50)
matrix list e(b_LATE)

4. Equality Tests

When multiple estimators are requested, you can also test pairwise equality:

fit_eq <- fuzzydid(
  data = parity_fixture,
  formula = y ~ d,
  group = "g",
  time = "t",
  did = TRUE,
  tc = TRUE,
  cic = TRUE,
  eqtest = TRUE,
  breps = 5,
  seed = 1
)

eq_cmp <- data.frame(
  contrast = stata_eqtest$contrast,
  R = fit_eq$eqtest$estimate,
  Stata = stata_eqtest$estimate,
  abs_diff = abs(fit_eq$eqtest$estimate - stata_eqtest$estimate)
)
knitr::kable(eq_cmp, digits = 8)
contrast R Stata abs_diff
W_DID_W_TC -0.01123007 -0.01123007 0
W_DID_W_CIC -0.14551432 -0.14551432 0
W_TC_W_CIC -0.13428425 -0.13428425 0

Equivalent Stata code:

use "stata-parity-core-fixture.dta", clear
fuzzydid y g t d, did tc cic eqtest breps(5)
matrix list e(b_LATE_eqtest)

Covariate Adjustment

Parametric Adjustment with modelx

With covariates, modelx controls how conditional expectations are modeled (for example OLS, logit, or probit):

# Add covariates
set.seed(123)
df$x1 <- rnorm(nrow(df))
df$x2 <- factor(ifelse(runif(nrow(df)) > 0.5, "a", "b"))

fit_modelx <- fuzzydid(
  data = df,
  formula = y ~ d + x1 + x2,
  group = "g",
  time = "t",
  did = TRUE,
  tc = TRUE,
  modelx = c("ols", "ols"),
  nose = TRUE
)

knitr::kable(fit_modelx$late)
estimator estimate std.error conf.low conf.high
W_DID 2.032228 NA NA NA
W_TC 1.930624 NA NA NA

stata_modelx <- c(
  W_DID = 2.03222780000000,
  W_TC = 1.93062450000000
)
r_modelx <- setNames(fit_modelx$late$estimate, fit_modelx$late$estimator)[names(stata_modelx)]
modelx_cmp <- data.frame(
  estimator = names(stata_modelx),
  R = unname(r_modelx),
  Stata = unname(stata_modelx),
  abs_diff = abs(unname(r_modelx) - unname(stata_modelx))
)
knitr::kable(modelx_cmp, digits = 8)
estimator R Stata abs_diff
W_DID 2.032228 2.032228 1e-08
W_TC 1.930625 1.930624 1e-08

if (requireNamespace("haven", quietly = TRUE)) {
  haven::write_dta(df[, c("y", "g", "t", "d", "x1", "x2")], "stata-parity-covariates-fixture.dta")
}

For binary treatment, modelx takes two entries:

  1. Method for E(Y_{gt}|X) and E(Y_{dgt}|X)
  2. Method for E(D_{gt}|X)

For multi-valued treatment, a third entry specifies the model for P(D_{gt}=d | X).

Equivalent Stata code:

use "stata-parity-covariates-fixture.dta", clear
fuzzydid y g t d, did tc continuous(x1) qualitative(x2) modelx(ols ols)
matrix list e(b_LATE)

Nonparametric Adjustment with Sieves

Continuous controls can also be handled nonparametrically:

fit_sieves <- fuzzydid(
  data = df,
  formula = y ~ d + x1,
  group = "g",
  time = "t",
  did = TRUE,
  tc = TRUE,
  sieves = TRUE,
  nose = TRUE
)

knitr::kable(fit_sieves$late)
estimator estimate std.error conf.low conf.high
W_DID 2.013885 NA NA NA
W_TC 1.894181 NA NA NA

stata_sieves <- c(
  W_DID = 2.00081280000000,
  W_TC = 1.93678270000000
)
r_sieves <- setNames(fit_sieves$late$estimate, fit_sieves$late$estimator)[names(stata_sieves)]
sieves_cmp <- data.frame(
  estimator = names(stata_sieves),
  R = unname(r_sieves),
  Stata = unname(stata_sieves),
  abs_diff = abs(unname(r_sieves) - unname(stata_sieves))
)
knitr::kable(sieves_cmp, digits = 8)
estimator R Stata abs_diff
W_DID 2.013885 2.000813 0.01307262
W_TC 1.894181 1.936783 0.04260123

When sieves = TRUE, sieveorder = NULL triggers deterministic 5-fold CV (the Stata-parity default). You can set sieveorder manually if you want a fixed basis order.

Equivalent Stata code:

use "stata-parity-covariates-fixture.dta", clear
fuzzydid y g t d, did tc continuous(x1) sieves
matrix list e(b_LATE)

LQTE Estimation

LQTE estimates are available from the 5th through 95th percentiles:

fit_lqte <- fuzzydid(
  data = parity_fixture,
  formula = y ~ d,
  group = "g",
  time = "t",
  lqte = TRUE,
  nose = TRUE,
  seed = 1
)

lqte_cmp <- data.frame(
  quantile = stata_lqte$quantile,
  R = fit_lqte$lqte$estimate,
  Stata = stata_lqte$estimate,
  abs_diff = abs(fit_lqte$lqte$estimate - stata_lqte$estimate)
)
knitr::kable(head(lqte_cmp, 10), digits = 8)
quantile R Stata abs_diff
0.05 1.670645 1.672740 0.00209542
0.10 1.745951 1.794227 0.04827636
0.15 1.816027 1.816027 0.00000008
0.20 1.946114 1.946114 0.00000003
0.25 1.955441 1.955441 0.00000004
0.30 1.947278 1.947278 0.00000007
0.35 1.974047 1.974047 0.00000005
0.40 2.002776 2.094133 0.09135670
0.45 2.043456 2.131043 0.08758671
0.50 2.032202 2.032202 0.00000023

The native LQTE path follows the same rearranged counterfactual-CDF construction as the upstream Stata command. Small differences can occur because the original Stata implementation stores intermediate empirical CDFs as Stata floats.

LQTE requires binary G, T, and D, with no covariates.

Equivalent Stata code:

use "stata-parity-core-fixture.dta", clear
fuzzydid y g t d, lqte
matrix list e(b_LQTE)

Multi-Period Designs

For more than two periods, use group_forward:

set.seed(7)
n_mp <- 70

make_mp_cell <- function(gb, gf, t, prob, shift = 0) {
  d <- rbinom(n_mp, 1, prob)
  y <- 1 + 0.35 * t + 1.6 * d + shift + rnorm(n_mp, sd = 0.25)
  data.frame(y = y, d = d, gb = gb, gf = gf, t = t)
}

df_mp <- rbind(
  make_mp_cell(gb = 0, gf = 0, t = 0, prob = 0.20),
  make_mp_cell(gb = 0, gf = 0, t = 1, prob = 0.30),
  make_mp_cell(gb = 0, gf = 0, t = 2, prob = 0.35),
  make_mp_cell(gb = 0, gf = 1, t = 0, prob = 0.30, shift = 0.2),
  make_mp_cell(gb = 1, gf = 1, t = 1, prob = 0.60, shift = 0.8),
  make_mp_cell(gb = 1, gf = 0, t = 2, prob = 0.75, shift = 1.1)
)

fit_mp <- fuzzydid(
  data = df_mp,
  formula = y ~ d,
  group = "gb",
  group_forward = "gf",
  time = "t",
  did = TRUE,
  tc = TRUE,
  nose = TRUE
)

knitr::kable(fit_mp$late)
estimator estimate std.error conf.low conf.high
W_DID 4.769522 NA NA NA
W_TC 4.294036 NA NA NA

if (requireNamespace("haven", quietly = TRUE)) {
  haven::write_dta(df_mp, "stata-parity-multiperiod-fixture.dta")
}

Equivalent Stata code:

use "stata-parity-multiperiod-fixture.dta", clear
fuzzydid y gb t d, groupf(gf) did tc
matrix list e(b_LATE)

Bootstrap Diagnostics

The object also tracks bootstrap failures (degenerate resamples):

df_diag <- parity_fixture
df_diag$cl <- rep(seq_len(24L), each = 20L)

fit_diag <- fuzzydid(
  data = df_diag,
  formula = y ~ d,
  group = "g",
  time = "t",
  did = TRUE,
  tc = TRUE,
  cic = TRUE,
  cluster = "cl",
  breps = 20,
  seed = 1
)

diag_cmp <- data.frame(
  metric = c("N.reps", "N.misreps", "Share.failures"),
  R = c(fit_diag$n_reps, fit_diag$n_misreps, fit_diag$share_failures),
  Reference = c(
    native_cluster[["n_reps"]],
    native_cluster[["n_misreps"]],
    native_cluster[["share_failures"]]
  ),
  stringsAsFactors = FALSE
)

share_from_counts <- if (diag_cmp$R[1] > 0) diag_cmp$R[2] / diag_cmp$R[1] else NA_real_

# These diagnostics are deterministic in R when the same bootstrap seed is used.
diag_cmp$Delta <- diag_cmp$R - diag_cmp$Reference
knitr::kable(diag_cmp, digits = 8)
metric R Reference Delta
N.reps 20.00 20.00 0
N.misreps 3.00 3.00 0
Share.failures 0.15 0.15 0

if (requireNamespace("haven", quietly = TRUE)) {
  haven::write_dta(df_diag, "stata-parity-cluster-fixture.dta")
}

Stata can run the same clustered bootstrap command:

use "stata-parity-cluster-fixture.dta", clear
fuzzydid y g t d, did tc cic cluster(cl) breps(20)

The Stata command does not expose N_misreps or share_failures directly, so the table above is tracked as a deterministic native regression check rather than a stored-scalar parity assertion.

References

de Chaisemartin, C. and D’Haultfoeuille, X. (2018). Fuzzy Differences-in-Differences. Review of Economic Studies, 85(2): 999-1028. doi:10.1093/restud/rdx049.