Replicating Duflo (2001) with Rfuzzydid

Introduction

This vignette walks through a concrete replication exercise: re-estimating the Duflo (2001) INPRES specification with Rfuzzydid.

The key idea is to recover the wage return to schooling from cohort-by-district variation in treatment intensity, using the fuzzy DID estimators in de Chaisemartin and D’Haultfoeuille (2018).

The dataset is the bundled inpresdata.dta file in inst/extdata. The file originates from David Roodman’s replication materials (droodman/Duflo-2001). In the companion archive README (droodman/Duflo-2001), he notes that the main 1995 SUPAS file is “not contained in this repository” and links to that downloadable location.

Data Preparation

Loading the Data

library(Rfuzzydid)

# Resolve package root when rendering inside source tree
find_local_pkg_dir <- function() {
  roots <- c(".", "..", "../..")

  for (root in roots) {
    pkg_desc <- file.path(root, "pkg", "DESCRIPTION")
    root_desc <- file.path(root, "DESCRIPTION")

    if (file.exists(pkg_desc)) {
      return(file.path(root, "pkg"))
    }

    if (file.exists(root_desc)) {
      desc <- tryCatch(read.dcf(root_desc), error = function(e) NULL)
      if (!is.null(desc) && "Package" %in% colnames(desc) && identical(desc[1, "Package"], "Rfuzzydid")) {
        return(root)
      }
    }
  }

  NULL
}

# First try installed package location, then source-tree fallback
data_path <- system.file("extdata", "inpresdata.dta", package = "Rfuzzydid")
if (identical(data_path, "")) {
  local_pkg_dir <- find_local_pkg_dir()
  local_path <- if (!is.null(local_pkg_dir)) {
    file.path(local_pkg_dir, "inst", "extdata", "inpresdata.dta")
  } else {
    ""
  }

  if (file.exists(local_path)) {
    data_path <- local_path
  } else {
    stop("Bundled dataset 'inpresdata.dta' was not found in installed package or source-tree extdata.")
  }
}
raw <- foreign::read.dta(data_path)

Constructing the Analysis Sample

Following the original design, we keep two cohort windows:

The t indicator equals 1 for the later cohort.

# Define cohort groups
cohort0 <- raw$p504thn >= 57 & raw$p504thn <= 62
cohort1 <- raw$p504thn >= 68 & raw$p504thn <= 72

# Keep observations with non-missing outcome and treatment
keep <- !is.na(raw$lhwage) & !is.na(raw$yeduc) & (cohort0 | cohort1)

dat <- raw[keep, c("birthpl", "lhwage", "yeduc", "p504thn")]

# Time indicator: t = 1 for later cohort
dat$t <- as.integer(dat$p504thn >= 68 & dat$p504thn <= 72)

Constructing Group Variables

The original Stata workflow builds “super-groups” based on how schooling changes within district between the two cohort windows. We follow the same logic with a chi-square screen and the sign of the change in mean schooling:

pval_threshold <- 0.5
districts <- sort(unique(dat$birthpl))
gstar_map <- setNames(integer(length(districts)), as.character(districts))

for (g in districts) {
  i <- dat$birthpl == g
  sub <- dat[i, ]
  tab <- table(sub$yeduc, sub$t)
  pval <- suppressWarnings(chisq.test(tab)$p.value)
  evol <- mean(sub$yeduc[sub$t == 1]) - mean(sub$yeduc[sub$t == 0])
  if (!is.na(pval) && pval < pval_threshold) {
    gstar_map[[as.character(g)]] <- if (evol > 0) 1L else if (evol < 0) -1L else 0L
  } else {
    gstar_map[[as.character(g)]] <- 0L
  }
}

dat$gstar <- unname(gstar_map[as.character(dat$birthpl)])

This produces three groups:

Two-Arm Estimation

As in the paper’s setup, we estimate two arms separately and combine them after:

Arm 1: Districts with Increasing Education (g* = 1 vs g* = 0)

arm_inc_data <- dat[dat$gstar >= 0, ]
arm_inc_data$G <- as.integer(arm_inc_data$gstar == 1)

fit_inc <- fuzzydid(
  data = arm_inc_data,
  formula = lhwage ~ yeduc,
  group = "G",
  time = "t",
  did = TRUE,
  tc = TRUE,
  cic = TRUE,
  newcateg = c(5, 8, 11, 14, 1000),
  nose = TRUE,
  backend = "native"
)

knitr::kable(fit_inc$late)
estimator estimate std.error conf.low conf.high
W_DID 0.1236792 NA NA NA
W_TC 0.0798682 NA NA NA
W_CIC 0.0743160 NA NA NA

Arm 2: Districts with Decreasing Education (g* = -1 vs g* = 0)

arm_dec_data <- dat[dat$gstar <= 0, ]
arm_dec_data$G <- as.integer(arm_dec_data$gstar == -1)

fit_dec <- fuzzydid(
  data = arm_dec_data,
  formula = lhwage ~ yeduc,
  group = "G",
  time = "t",
  did = TRUE,
  tc = TRUE,
  cic = TRUE,
  newcateg = c(5, 8, 11, 14, 1000),
  nose = TRUE,
  backend = "native"
)

knitr::kable(fit_dec$late)
estimator estimate std.error conf.low conf.high
W_DID 0.1170241 NA NA NA
W_TC 0.1095677 NA NA NA
W_CIC 0.1143019 NA NA NA

Aggregating Results

To aggregate, we weight each arm by:

The decreasing arm uses -dD_dec so both weights are on a comparable “increase in treatment” scale.

# Treatment intensity weights
dD_inc <- mean(arm_inc_data$yeduc[arm_inc_data$G == 1 & arm_inc_data$t == 1]) -
          mean(arm_inc_data$yeduc[arm_inc_data$G == 1 & arm_inc_data$t == 0]) -
          mean(arm_inc_data$yeduc[arm_inc_data$G == 0 & arm_inc_data$t == 1]) +
          mean(arm_inc_data$yeduc[arm_inc_data$G == 0 & arm_inc_data$t == 0])

dD_dec <- mean(arm_dec_data$yeduc[arm_dec_data$G == 1 & arm_dec_data$t == 1]) -
          mean(arm_dec_data$yeduc[arm_dec_data$G == 1 & arm_dec_data$t == 0]) -
          mean(arm_dec_data$yeduc[arm_dec_data$G == 0 & arm_dec_data$t == 1]) +
          mean(arm_dec_data$yeduc[arm_dec_data$G == 0 & arm_dec_data$t == 0])

p_inc <- mean(dat$gstar == 1)
p_dec <- mean(dat$gstar == -1)

w_inc <- p_inc * dD_inc
w_dec <- p_dec * (-dD_dec)

# Extract point estimates
did_inc <- fit_inc$late$estimate[fit_inc$late$estimator == "W_DID"]
tc_inc  <- fit_inc$late$estimate[fit_inc$late$estimator == "W_TC"]
cic_inc <- fit_inc$late$estimate[fit_inc$late$estimator == "W_CIC"]

did_dec <- fit_dec$late$estimate[fit_dec$late$estimator == "W_DID"]
tc_dec  <- fit_dec$late$estimate[fit_dec$late$estimator == "W_TC"]
cic_dec <- fit_dec$late$estimate[fit_dec$late$estimator == "W_CIC"]

# Weighted aggregation
did_agg <- (did_inc * w_inc + did_dec * w_dec) / (w_inc + w_dec)
tc_agg  <- (tc_inc  * w_inc + tc_dec  * w_dec) / (w_inc + w_dec)
cic_agg <- (cic_inc * w_inc + cic_dec * w_dec) / (w_inc + w_dec)

cat("Aggregate Estimates:\n")
#> Aggregate Estimates:
cat(sprintf("DID = %.6f\n", did_agg))
#> DID = 0.122244
cat(sprintf("TC  = %.6f\n", tc_agg))
#> TC  = 0.086271
cat(sprintf("CIC = %.6f\n", cic_agg))
#> CIC = 0.082936

Comparison with Stata

The following one-shot Stata code starts from inpresdata.dta and reproduces the arm-level and aggregate point estimates:

The Stata snippet assumes the Stata fuzzydid command is already installed locally; Rfuzzydid no longer ships the .ado sources.

clear all
set more off

local pval_threshold 0.5

use inpresdata.dta, clear
keep if lhwage < . & yeduc < . & (inrange(p504thn,57,62) | inrange(p504thn,68,72))

gen byte t = inrange(p504thn,68,72)
gen gstar = .
levelsof birthpl, local(districts)

local pval_threshold = 0.05 
quietly foreach g of local districts {
    tab yeduc t if birthpl == `g', chi2
    scalar p = r(p)
    summarize yeduc if t == 1 & birthpl == `g', meanonly
    scalar m1 = r(mean)
    summarize yeduc if t == 0 & birthpl == `g', meanonly
    scalar m0 = r(mean)
    scalar evol = m1 - m0
    replace gstar = (scalar(evol) > 0 & scalar(p) < `pval_threshold') - (scalar(evol) < 0 & scalar(p) < `pval_threshold') if birthpl == `g'
}

gen byte inc = (gstar == 1)
gen byte dec = (gstar == -1)
summarize inc, meanonly
scalar p_inc = r(mean)
summarize dec, meanonly
scalar p_dec = r(mean)

* Arm g*=1 vs g*=0
preserve
keep if gstar >= 0
gen byte G = (gstar == 1)
fuzzydid lhwage G t yeduc, did tc cic newcateg(5 8 11 14 1000) nose
matrix M = e(b_LATE)
scalar did_inc = M[1,1]
scalar tc_inc  = M[2,1]
scalar cic_inc = M[3,1]

summarize yeduc if G == 1 & t == 1, meanonly
scalar D11 = r(mean)
summarize yeduc if G == 1 & t == 0, meanonly
scalar D10 = r(mean)
summarize yeduc if G == 0 & t == 1, meanonly
scalar D01 = r(mean)
summarize yeduc if G == 0 & t == 0, meanonly
scalar D00 = r(mean)
scalar dD_inc = (D11 - D10) - (D01 - D00)
restore

* Arm g*=-1 vs g*=0
preserve
keep if gstar <= 0
gen byte G = (gstar == -1)
fuzzydid lhwage G t yeduc, did tc cic newcateg(5 8 11 14 1000) nose
matrix M = e(b_LATE)
scalar did_dec = M[1,1]
scalar tc_dec  = M[2,1]
scalar cic_dec = M[3,1]

summarize yeduc if G == 1 & t == 1, meanonly
scalar D11m = r(mean)
summarize yeduc if G == 1 & t == 0, meanonly
scalar D10m = r(mean)
summarize yeduc if G == 0 & t == 1, meanonly
scalar D01m = r(mean)
summarize yeduc if G == 0 & t == 0, meanonly
scalar D00m = r(mean)
scalar dD_dec_pos = -((D11m - D10m) - (D01m - D00m))
restore

scalar w_inc = p_inc * dD_inc
scalar w_dec = p_dec * dD_dec_pos

scalar did_agg = (did_inc * w_inc + did_dec * w_dec) / (w_inc + w_dec)
scalar tc_agg  = (tc_inc  * w_inc + tc_dec  * w_dec) / (w_inc + w_dec)
scalar cic_agg = (cic_inc * w_inc + cic_dec * w_dec) / (w_inc + w_dec)

display "Arm g*=1: DID = " %9.6f did_inc " ; TC = " %9.6f tc_inc " ; CIC = " %9.6f cic_inc
display "Arm g*=-1: DID = " %9.6f did_dec " ; TC = " %9.6f tc_dec " ; CIC = " %9.6f cic_dec
display "AGG DID = " %9.6f did_agg
display "AGG TC  = " %9.6f tc_agg
display "AGG CIC = " %9.6f cic_agg

Running that code in Stata reports:

The next chunk checks parity directly in R:

stata_vals <- c(
  did_inc = 0.123679191,
  tc_inc = 0.079868219,
  cic_inc = 0.074316049,
  did_dec = 0.117024113,
  tc_dec = 0.109567707,
  cic_dec = 0.114301855,
  did_agg = 0.122244473,
  tc_agg = 0.086270906,
  cic_agg = 0.082936285
)

r_vals <- c(
  did_inc = did_inc,
  tc_inc = tc_inc,
  cic_inc = cic_inc,
  did_dec = did_dec,
  tc_dec = tc_dec,
  cic_dec = cic_dec,
  did_agg = did_agg,
  tc_agg = tc_agg,
  cic_agg = cic_agg
)

cmp <- data.frame(
  estimate = names(stata_vals),
  R = unname(r_vals),
  Stata = unname(stata_vals),
  abs_diff = abs(unname(r_vals) - unname(stata_vals))
)

knitr::kable(cmp, digits = 6)
estimate R Stata abs_diff
did_inc 0.123679 0.123679 0
tc_inc 0.079868 0.079868 0
cic_inc 0.074316 0.074316 0
did_dec 0.117024 0.117024 0
tc_dec 0.109568 0.109568 0
cic_dec 0.114302 0.114302 0
did_agg 0.122244 0.122244 0
tc_agg 0.086271 0.086271 0
cic_agg 0.082936 0.082936 0

Point estimates match to numerical tolerance. Confidence intervals can still differ when bootstrap is enabled, because draws and RNG streams differ across software.

References

Duflo, E. (2001). Schooling and Labor Market Consequences of School Construction in Indonesia. American Economic Review, 91(4): 795-813.

de Chaisemartin, C. and D’Haultfoeuille, X. (2018). Fuzzy Differences-in-Differences. Review of Economic Studies, 85(2): 999-1028. doi:10.1093/restud/rdx049.

de Chaisemartin, C., D’Haultfoeuille, X., and Guyonvarch, Y. (2018). Fuzzy Differences-in-Differences with Stata. Stata Journal. doi:10.1177/1536867X19854019.