Instrumental Variables (IV) estimation using Two-Stage Least Squares (2SLS) is a powerful technique for estimating causal effects when your treatment variable is endogenous (correlated with unobserved factors). By using an instrument that affects treatment but not the outcome directly, IV methods can recover unbiased causal estimates.
💡 Key Assumptions: For IV to work, your instrument must satisfy: (1) Relevance - strongly predicts treatment; (2) Exclusion - affects outcome only through treatment; (3) Exogeneity - uncorrelated with unobserved confounders.
Ready to estimate causal effects with IV? Load the example dataset to see how IV/2SLS works, or upload your own data with an outcome, treatment, and instrument variable.
Instrumental Variables (IV) is a method for estimating causal effects when the treatment variable is endogenous (correlated with unobserved factors). An instrument is a variable that affects the treatment but has no direct effect on the outcome. Two-Stage Least Squares (2SLS) is the most common IV estimator.
First Stage: Predict Treatment with Instrument
$$X_i = \pi_0 + \pi_1 Z_i + v_i$$
Where X = treatment, Z = instrument
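Second Stage: Estimate Effect Using Predicted Treatment
$$Y_i = \beta_0 + \beta_1 \hat{X}_i + \varepsilon_i$$
Where Y = outcome, $\hat{X}$ = treatment predicted by the first stage, and $\hat{\beta}_1$ is the IV estimate of the causal effect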
library(tidyverse)
library(AER) # For ivreg and IV diagnostics
library(lmtest) # For robust standard errors
# Worked example: returns to education, instrumented by quarter of birth
# (the Angrist & Krueger 1991 design).
# The 30 rows are hard-coded so every run reproduces the same estimates.

# Instrument: 1 if born in the first quarter, 0 otherwise
born_q1 <- c(
  1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
  0, 1, 0, 0, 1, 0, 1, 0, 0, 1,
  0, 1, 0, 0, 1, 0, 0, 1, 0, 1
)

# Endogenous treatment: years of education
years_school <- c(
  12.5, 14.2, 13.8, 11.9, 15.1, 13.5, 12.2, 14.8, 12.8, 13.9,
  14.5, 11.8, 15.2, 13.1, 12.4, 14.3, 12.1, 15.5, 13.7, 12.6,
  14.1, 11.5, 13.3, 15.8, 12.9, 14.6, 13.2, 11.7, 14.9, 12.3
)

# Outcome: earnings
annual_pay <- c(
  42500, 58200, 51800, 38900, 62100, 49500, 41200, 57800, 43800, 52900,
  55200, 37800, 63200, 47100, 42400, 53300, 40100, 64500, 50700, 43600,
  54100, 36500, 48300, 65800, 44900, 56600, 48200, 37100, 57900, 42100
)

data <- tibble(
  id = seq_len(30),
  qob1 = born_q1,
  education = years_school,
  earnings = annual_pay
)
# Method 1: one-call 2SLS with AER::ivreg.
# The formula reads outcome ~ regressors | instruments, so education is
# instrumented by qob1.
iv_model <- ivreg(
  formula = earnings ~ education | qob1,
  data = data
)
# diagnostics = TRUE asks the summary to include the IV diagnostic tests
summary(iv_model, diagnostics = TRUE)
# Method 2: 2SLS by hand, as two ordinary lm() fits.

# Stage 1: endogenous regressor on the instrument
first_stage <- lm(education ~ qob1, data = data)
first_stage_summary <- summary(first_stage)
print(first_stage_summary)

# Instrument strength: first-stage F (rule of thumb: F > 10 is "strong")
first_stage_f <- first_stage_summary$fstatistic["value"]
cat("First Stage F-statistic:", first_stage_f, "\n")

# Stage 1 predictions replace the endogenous regressor in stage 2
education_hat <- fitted(first_stage)

# Stage 2: outcome on the predicted treatment
second_stage <- lm(earnings ~ education_hat, data = data)
print(summary(second_stage))
# Compare with OLS (biased due to endogeneity)
ols_model <- lm(earnings ~ education, data = data)

# Print the OLS and IV slope on education.
# cat() does not append a newline, so each call ends with "\n"; without it
# the three messages are emitted as one run-together line.
cat("Coefficient Comparison:\n")
cat("OLS estimate:", coef(ols_model)["education"], "\n")
cat("IV estimate:", coef(iv_model)["education"], "\n")
# First-stage picture: education against the quarter-of-birth instrument,
# with jittered points and a fitted linear trend (red) plus its confidence band.
first_stage_plot <- ggplot(data, aes(x = qob1, y = education)) +
  geom_jitter(width = 0.1, alpha = 0.3) +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  ggtitle("First Stage: Instrument vs Endogenous Variable") +
  xlab("Born in Quarter 1") +
  ylab("Years of Education") +
  theme_minimal()
print(first_stage_plot)
# Test for endogeneity (Wu-Hausman)
# The IV - OLS coefficient gap below is descriptive only: it has no standard
# error, so it cannot by itself show whether IV is needed.
cf_test <- coef(iv_model)["education"] - coef(ols_model)["education"]
cat("IV - OLS coefficient difference:", cf_test, "\n")

# The formal test comes from the ivreg diagnostics table; a small p-value
# rejects exogeneity of education, i.e. OLS is inconsistent and IV is needed.
wu_hausman <- summary(iv_model, diagnostics = TRUE)$diagnostics["Wu-Hausman", ]
cat("Wu-Hausman statistic:", wu_hausman["statistic"],
    "p-value:", wu_hausman["p-value"], "\n")
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
from statsmodels.sandbox.regression.gmm import IV2SLS
import matplotlib.pyplot as plt
import seaborn as sns
# Worked example: returns to education, instrumented by quarter of birth
# (the Angrist & Krueger 1991 design).
# The 30 rows are hard-coded (same values as the R version) so results are reproducible.

# Instrument: 1 if born in the first quarter, 0 otherwise
quarter1_flags = [
    1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
    0, 1, 0, 0, 1, 0, 1, 0, 0, 1,
    0, 1, 0, 0, 1, 0, 0, 1, 0, 1,
]

# Endogenous treatment: years of education
schooling_years = [
    12.5, 14.2, 13.8, 11.9, 15.1, 13.5, 12.2, 14.8, 12.8, 13.9,
    14.5, 11.8, 15.2, 13.1, 12.4, 14.3, 12.1, 15.5, 13.7, 12.6,
    14.1, 11.5, 13.3, 15.8, 12.9, 14.6, 13.2, 11.7, 14.9, 12.3,
]

# Outcome: earnings
annual_earnings = [
    42500, 58200, 51800, 38900, 62100, 49500, 41200, 57800, 43800, 52900,
    55200, 37800, 63200, 47100, 42400, 53300, 40100, 64500, 50700, 43600,
    54100, 36500, 48300, 65800, 44900, 56600, 48200, 37100, 57900, 42100,
]

data = pd.DataFrame({
    'id': range(1, 31),
    'qob1': quarter1_flags,
    'education': schooling_years,
    'earnings': annual_earnings,
})
# Method 1: Two-Stage Least Squares using IV2SLS
# statsmodels' sandbox IV2SLS takes (endog, exog, instrument):
#   endog      - the outcome,
#   exog       - the structural regressors (constant + endogenous treatment),
#   instrument - the instrument matrix (constant + excluded instrument).
# The previous four-positional call does not match this signature and placed
# the instrument in the regressor slot.
exog = sm.add_constant(data['qob1'])   # instrument matrix: const + qob1
endog = data['education']              # endogenous treatment
outcome = data['earnings']
iv_model = IV2SLS(outcome, sm.add_constant(endog), instrument=exog).fit()
print("2SLS Results:")
print(iv_model.summary())
# Method 2: Manual 2SLS
# First stage: Regress endogenous variable on instrument
X_first = sm.add_constant(data['qob1'])
first_stage = sm.OLS(data['education'], X_first).fit()
print("First Stage Results:")
print(first_stage.summary())

# Check first stage F-statistic (should be > 10 for strong instrument)
first_stage_f = first_stage.fvalue
print(f"First Stage F-statistic: {first_stage_f:.2f}")
if first_stage_f < 10:
    print("Warning: Weak instrument (F < 10)")
else:
    print("Instrument appears strong (F >= 10)")

# Get fitted values from first stage.
# The fitted-values Series is unnamed; name it so the second-stage coefficient
# is labelled 'education_hat' and can be looked up by that key.
education_hat = first_stage.fittedvalues.rename('education_hat')

# Second stage: Regress outcome on fitted values.
# NOTE: the point estimate matches 2SLS, but the standard errors reported by
# this plain OLS fit ignore first-stage estimation and are not valid 2SLS
# standard errors.
X_second = sm.add_constant(education_hat)
second_stage = sm.OLS(data['earnings'], X_second).fit()
print("Second Stage Results:")
print(second_stage.summary())
# Compare with OLS (biased due to endogeneity)
X_ols = sm.add_constant(data['education'])
ols_model = sm.OLS(data['earnings'], X_ols).fit()
print("OLS Results (Biased):")
print(ols_model.summary())

# Compare coefficients
ols_estimate = ols_model.params['education']
# The second stage has exactly two terms (constant, fitted education), so take
# the slope by position; this does not depend on how the fitted-values column
# happens to be labelled.
iv_estimate = second_stage.params.iloc[-1]
print("Coefficient Comparison:")
print(f"OLS estimate: {ols_estimate:.2f}")
print(f"IV estimate: {iv_estimate:.2f}")
print(f"Difference: {iv_estimate - ols_estimate:.2f}")
# First-stage picture: education against the quarter-of-birth instrument,
# with the raw points and a fitted regression line (red).
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data, x='qob1', y='education', alpha=0.5, ax=ax)
sns.regplot(data=data, x='qob1', y='education', scatter=False, color='red', ax=ax)
ax.set_xlabel('Born in Quarter 1')
ax.set_ylabel('Years of Education')
ax.set_title('First Stage: Instrument vs Endogenous Variable')
fig.tight_layout()
plt.show()
# Reduced-form picture: earnings against the instrument directly,
# with the raw points and a fitted regression line (green).
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data, x='qob1', y='earnings', alpha=0.5, ax=ax)
sns.regplot(data=data, x='qob1', y='earnings', scatter=False, color='green', ax=ax)
ax.set_xlabel('Born in Quarter 1')
ax.set_ylabel('Earnings')
ax.set_title('Reduced Form: Instrument vs Outcome')
fig.tight_layout()
plt.show()