Executed Examples

This page runs small versions of the examples from the repository notebooks. The original notebooks use larger data to demonstrate scale; these examples use compact synthetic tables so the documentation renders quickly while still showing real estimator output.

Setup

Show code

import os
import tempfile

import duckdb
import numpy as np
import pandas as pd

import duckreg.estimators as estimators_module
from duckreg.estimators import (
    DuckDML,
    DuckDoubleDemeaning,
    DuckLogisticRegression,
    DuckMultinomialLogisticRegression,
    DuckMundlak,
    DuckMundlakEventStudy,
    DuckPoissonMultinomialRegression,
    DuckPoissonRegression,
    DuckRegression,
)
from duckreg.regularized import DuckRidge

pd.set_option("display.precision", 4)
rng = np.random.default_rng(2026)
tmpdir = tempfile.mkdtemp(prefix="duckreg_docs_")

# Keep bootstrap examples readable in rendered docs.
estimators_module.tqdm = lambda iterable, *args, **kwargs: iterable


def write_table(db_path, table_name, df):
    conn = duckdb.connect(db_path)
    conn.register("df", df)
    conn.execute(f"CREATE OR REPLACE TABLE {table_name} AS SELECT * FROM df")
    conn.close()

Synthetic Tables

Show code

n = 6_000
linear = pd.DataFrame(
    {
        "rowid": np.arange(n),
        "D": rng.integers(0, 2, n),
        "f1": rng.integers(0, 18, n),
        "f2": rng.integers(0, 8, n),
        "f3": rng.integers(0, 5, n),
    }
)
linear["Y"] = (
    1.0
    + 2.0 * linear["D"]
    + 0.12 * linear["f1"]
    - 0.08 * linear["f2"]
    + rng.normal(0, 1, n)
)
linear["Y2"] = (
    -0.5
    + 1.4 * linear["D"]
    - 0.04 * linear["f1"]
    + 0.03 * linear["f2"]
    + rng.normal(0, 1, n)
)
linear_db = os.path.join(tmpdir, "linear.db")
write_table(linear_db, "data", linear)

units, periods = 120, 6
unit_ids = np.repeat(np.arange(units), periods)
time_ids = np.tile(np.arange(periods), units)
unit_fe = rng.normal(0, 1, units)
time_fe = rng.normal(0, 0.4, periods)
W = (
    rng.normal(0, 1, units * periods)
    + 0.4 * unit_fe[unit_ids]
    + 0.2 * time_fe[time_ids]
)
Y = 0.8 * W + unit_fe[unit_ids] + time_fe[time_ids] + rng.normal(
    0, 1, units * periods
)
panel = pd.DataFrame({"unit": unit_ids, "time": time_ids, "W": W, "Y": Y})
panel_db = os.path.join(tmpdir, "panel.db")
write_table(panel_db, "panel_data", panel)

cohorts = rng.choice([2, 3, 4, 99], size=units, p=[0.25, 0.25, 0.25, 0.25])
D_it = np.array(
    [
        1
        if time_ids[i] >= cohorts[unit_ids[i]] and cohorts[unit_ids[i]] < 99
        else 0
        for i in range(len(unit_ids))
    ]
)
Y_it = unit_fe[unit_ids] + time_fe[time_ids] + 1.2 * D_it + rng.normal(
    0, 1, len(D_it)
)
event = pd.DataFrame(
    {"unit_id": unit_ids, "time_id": time_ids, "W_it": D_it, "Y_it": Y_it}
)
event_db = os.path.join(tmpdir, "event.db")
write_table(event_db, "panel_data", event)

data_inventory = pd.DataFrame(
    {
        "table": ["linear data", "panel data", "event-study panel"],
        "rows": [len(linear), len(panel), len(event)],
        "database": [os.path.basename(linear_db), os.path.basename(panel_db), os.path.basename(event_db)],
    }
)
data_inventory

	table	rows	database
0	linear data	6000	linear.db
1	panel data	720	panel.db
2	event-study panel	720	event.db

Compressed OLS

This is the core DuckRegression workflow from notebooks/introduction.ipynb: fit on compressed covariate cells, then compute analytic HC1-style standard errors.

Show code

m = DuckRegression(
    db_name=linear_db,
    table_name="data",
    formula="Y ~ D + f1 + f2",
    cluster_col="",
    n_bootstraps=0,
    seed=42,
)
m.fit()
m.fit_vcov()

results = m.summary()
pd.DataFrame(
    {
        "point_estimate": results["point_estimate"],
        "standard_error": results["standard_error"],
    },
    index=["Intercept", "D", "f1", "f2"],
)

	point_estimate	standard_error
Intercept	0.9488	0.0340
D	2.0143	0.0257
f1	0.1240	0.0025
f2	-0.0765	0.0056

The compressed table is much smaller than the raw table because the regressors are discrete.

Show code

pd.DataFrame(
    {
        "raw_rows": [len(linear)],
        "compressed_rows": [len(m.df_compressed)],
        "compression_ratio": [len(linear) / len(m.df_compressed)],
    }
)

	raw_rows	compressed_rows	compression_ratio
0	6000	288	20.8333

Cluster Bootstrap

When cluster_col is set and n_bootstraps > 0, DuckRegression resamples clusters and recompresses the bootstrap draw.

Show code

m_boot = DuckRegression(
    db_name=linear_db,
    table_name="data",
    formula="Y ~ D + f1 + f2",
    cluster_col="f1",
    n_bootstraps=20,
    seed=42,
)
m_boot.fit()
boot_results = m_boot.summary()
pd.DataFrame(
    {
        "point_estimate": boot_results["point_estimate"],
        "bootstrap_se": boot_results["standard_error"],
    },
    index=["Intercept", "D", "f1", "f2"],
)

	point_estimate	bootstrap_se
Intercept	0.9488	0.0195
D	2.0143	0.0327
f1	0.1240	0.0013
f2	-0.0765	0.0048

Multiple Outcomes

DuckRegression can compress several outcomes in one pass.

Show code

m_multi = DuckRegression(
    db_name=linear_db,
    table_name="data",
    formula="Y + Y2 ~ D + f1 + f2",
    cluster_col="",
    n_bootstraps=0,
    seed=232,
)
m_multi.fit()

coef_index = pd.MultiIndex.from_product(
    [["Intercept", "D", "f1", "f2"], ["Y", "Y2"]],
    names=["term", "outcome"],
)
pd.DataFrame(
    {"point_estimate": m_multi.summary()["point_estimate"]},
    index=coef_index,
)

		point_estimate
term	outcome
Intercept	Y	0.9488
Intercept	Y2	-0.5056
D	Y	2.0143
D	Y2	1.4497
f1	Y	0.1240
f1	Y2	-0.0380
f2	Y	-0.0765
f2	Y2	0.0239

Mundlak Panel Regression

DuckMundlak augments the design with unit and time averages of the covariates.

Show code

mundlak = DuckMundlak(
    db_name=panel_db,
    table_name="panel_data",
    outcome_var="Y",
    covariates=["W"],
    unit_col="unit",
    time_col="time",
    cluster_col="unit",
    n_bootstraps=0,
    seed=929,
)
mundlak.fit()

pd.DataFrame(
    {"point_estimate": mundlak.summary()["point_estimate"].flatten()},
    index=["Intercept", "W", "avg_W_unit", "avg_W_time"],
)

	point_estimate
Intercept	-0.0839
W	0.8496
avg_W_unit	1.4997
avg_W_time	3.3764

Double Demeaning

DuckDoubleDemeaning constructs the residualized treatment

\[ \ddot{W}_{it} = W_{it} - \bar{W}_{i \cdot} - \bar{W}_{\cdot t} + \bar{W}_{\cdot \cdot}. \]

Show code

double_demean = DuckDoubleDemeaning(
    db_name=panel_db,
    table_name="panel_data",
    outcome_var="Y",
    treatment_var="W",
    unit_col="unit",
    time_col="time",
    cluster_col="unit",
    n_bootstraps=0,
    seed=828,
)
double_demean.fit()

pd.DataFrame(
    {"point_estimate": double_demean.summary()["point_estimate"].flatten()},
    index=["Intercept", "ddot_W"],
)

	point_estimate
Intercept	-0.3937
ddot_W	0.8496

Event Study

DuckMundlakEventStudy creates cohort-by-time interactions and returns one coefficient path per cohort.

Show code

event_model = DuckMundlakEventStudy(
    db_name=event_db,
    table_name="panel_data",
    outcome_var="Y_it",
    treatment_col="W_it",
    unit_col="unit_id",
    time_col="time_id",
    cluster_col="unit_id",
    n_bootstraps=0,
    seed=42,
    pre_treat_interactions=False,
)
event_model.fit()
event_results = event_model.summary()["point_estimate"]

first_cohort = sorted(event_results.keys())[0]
event_results[first_cohort].rename(columns={"est": f"cohort_{first_cohort}_estimate"})

	cohort_2_estimate
treatment_time_2_0	0.4327
treatment_time_2_1	0.4327
treatment_time_2_2	1.6324
treatment_time_2_3	1.4136
treatment_time_2_4	1.5538
treatment_time_2_5	1.3262

Compressed DML

This mirrors notebooks/duckdml.ipynb: residualize a partial linear model using leave-one-out means inside discrete covariate cells.

Show code

nd = 6_000
dml_df = pd.DataFrame(
    {
        "town_id": rng.integers(0, 40, nd),
        "day_id": rng.integers(0, 12, nd),
    }
)
g = 0.25 * dml_df["town_id"] + np.sin(dml_df["day_id"])
dml_df["X1"] = (
    0.15 * dml_df["town_id"] + 0.08 * dml_df["day_id"] + rng.normal(0, 1, nd)
)
dml_df["X2"] = -0.08 * dml_df["town_id"] + 0.4 * dml_df["X1"] + rng.normal(
    0, 1, nd
)
dml_df["Y"] = 1.5 * dml_df["X1"] - 0.8 * dml_df["X2"] + g + rng.normal(0, 1.5, nd)
dml_db = os.path.join(tmpdir, "dml.db")
write_table(dml_db, "data", dml_df)

dml_model = DuckDML(
    db_name=dml_db,
    table_name="data",
    outcome_var="Y",
    treatment_var=["X1", "X2"],
    discrete_covars=["town_id", "day_id"],
    seed=42,
    n_bootstraps=20,
)
dml_model.fit()
dml_results = dml_model.summary()
pd.DataFrame(
    {
        "point_estimate": dml_results["point_estimate"],
        "bootstrap_se": dml_results["standard_error"],
        "truth": [1.5, -0.8],
    },
    index=["X1", "X2"],
)

	point_estimate	bootstrap_se	truth
X1	1.4991	0.0161	1.5
X2	-0.8053	0.0153	-0.8

Ridge Cross-Validation

DuckRidge compresses the table once, then evaluates ridge penalties over compressed folds.

Show code

duck_ridge_cv = DuckRidge(
    db_name=linear_db,
    table_name="data",
    formula="Y ~ D + f1 + f2 + f3",
    lambda_grid=np.logspace(-3, 0, 6),
    cv_folds=3,
    seed=42,
)
duck_ridge_cv.fit(lambda_selection="cv")

pd.DataFrame(
    {
        "point_estimate": duck_ridge_cv.point_estimate,
    },
    index=["Intercept", "D", "f1", "f2", "f3"],
).assign(best_lambda=duck_ridge_cv.best_lambda)

	point_estimate	best_lambda
Intercept	0.9686	1.0
D	2.0130	1.0
f1	0.1240	1.0
f2	-0.0764	1.0
f3	-0.0097	1.0

Logistic Regression

The GLM examples are smaller versions of notebooks/glm.ipynb.

Show code

ng = 7_000
x1 = rng.integers(0, 6, ng)
x2 = rng.integers(0, 4, ng)
glm_db = os.path.join(tmpdir, "glm.db")

p = 1 / (1 + np.exp(-(-0.8 + 0.35 * x1 - 0.2 * x2)))
y_binary = rng.binomial(1, p)
write_table(
    glm_db,
    "binary_data",
    pd.DataFrame({"y": y_binary, "x1": x1, "x2": x2}),
)

logit = DuckLogisticRegression(
    glm_db,
    "binary_data",
    "y ~ x1 + x2",
    seed=1,
    method="irls",
    n_bootstraps=0,
)
logit.fit()
logit.fit_vcov()

pd.DataFrame(
    {
        "point_estimate": logit.summary()["point_estimate"],
        "standard_error": logit.summary()["standard_error"],
        "truth": [-0.8, 0.35, -0.2],
    },
    index=["Intercept", "x1", "x2"],
)

	point_estimate	standard_error	truth
Intercept	-0.8019	0.0564	-0.80
x1	0.3412	0.0152	0.35
x2	-0.2117	0.0226	-0.20

Poisson Regression

Show code

mu = np.exp(0.3 + 0.12 * x1 - 0.08 * x2)
y_count = rng.poisson(mu)
write_table(
    glm_db,
    "count_data",
    pd.DataFrame({"y": y_count, "x1": x1, "x2": x2}),
)

pois = DuckPoissonRegression(
    glm_db,
    "count_data",
    "y ~ x1 + x2",
    seed=1,
    method="irls",
    n_bootstraps=0,
)
pois.fit()
pois.fit_vcov()

pd.DataFrame(
    {
        "point_estimate": pois.summary()["point_estimate"],
        "standard_error": pois.summary()["standard_error"],
        "truth": [0.3, 0.12, -0.08],
    },
    index=["Intercept", "x1", "x2"],
)

	point_estimate	standard_error	truth
Intercept	0.2985	0.0217	0.30
x1	0.1260	0.0055	0.12
x2	-0.0893	0.0083	-0.08

Multinomial Logit

Show code

X = np.c_[np.ones(ng), x1, x2]
beta = np.array([[0.2, 0.18, -0.1], [-0.3, -0.08, 0.2]])
eta = np.c_[X @ beta.T, np.zeros(ng)]
prob = np.exp(eta - eta.max(axis=1, keepdims=True))
prob = prob / prob.sum(axis=1, keepdims=True)
y_class = np.array([rng.choice(["a", "b", "c"], p=row) for row in prob])
write_table(
    glm_db,
    "class_data",
    pd.DataFrame({"y": y_class, "x1": x1, "x2": x2}),
)

mn = DuckMultinomialLogisticRegression(
    glm_db,
    "class_data",
    "y ~ x1 + x2",
    seed=1,
    labels=["a", "b", "c"],
    baseline="c",
    n_bootstraps=0,
)
mn.fit()
mn.fit_vcov()

pd.DataFrame(
    mn.summary()["point_estimate"],
    index=mn.summary()["labels"],
    columns=["Intercept", "x1", "x2"],
)

	Intercept	x1	x2
a	0.2913	0.1616	-0.1252
b	-0.2591	-0.0889	0.1984

Many-Label Poisson Decomposition

Show code

labels = [f"token_{j:02d}" for j in range(6)]
rows = []
for doc_id in range(250):
    xi1 = rng.integers(0, 5)
    xi2 = rng.integers(0, 3)
    for j, label in enumerate(labels):
        mu = np.exp(-1.2 + 0.05 * j + 0.1 * xi1 - 0.06 * xi2)
        rows.append((doc_id, label, rng.poisson(mu), xi1, xi2))

token_counts = pd.DataFrame(rows, columns=["doc_id", "label", "count", "x1", "x2"])
write_table(glm_db, "token_counts", token_counts)

pm = DuckPoissonMultinomialRegression(
    glm_db,
    "token_counts",
    count_col="count",
    label_col="label",
    covars=["x1", "x2"],
    seed=1,
    n_bootstraps=0,
)
pm.fit()
pm.summary()["point_estimate"].head()

	Intercept	x1	x2
token_00	-0.6929	0.0129	-0.3181
token_01	-1.2683	0.0784	-0.0147
token_02	-0.8115	-0.0181	-0.2068
token_03	-0.8701	0.0055	0.0744
token_04	-0.6878	-0.0720	-0.0455