duckreg
  • Home
  • Compression
  • Linear Models
  • Panel
  • DML
  • GLMs
  • Ridge
  • Inference
  • Examples
  1. Executed Examples
  • duckreg Documentation
  • Compression and Estimator Lifecycle
  • Linear Regression API
  • Panel Estimators
  • Compressed Double Machine Learning
  • Generalized Linear Models
  • Compressed Ridge Regression
  • Inference and Variance Estimation
  • Executed Examples

On this page

  • Setup
    • Synthetic Tables
  • Compressed OLS
  • Cluster Bootstrap
  • Multiple Outcomes
  • Mundlak Panel Regression
  • Double Demeaning
  • Event Study
  • Compressed DML
  • Ridge Cross-Validation
  • Logistic Regression
  • Poisson Regression
  • Multinomial Logit
  • Many-Label Poisson Decomposition

Executed Examples

This page runs small versions of the examples from the repository notebooks. The original notebooks use larger data to demonstrate scale; these examples use compact synthetic tables so the documentation renders quickly while still showing real estimator output.

Setup

Show code
import os
import tempfile

import duckdb
import numpy as np
import pandas as pd

import duckreg.estimators as estimators_module
from duckreg.estimators import (
    DuckDML,
    DuckDoubleDemeaning,
    DuckLogisticRegression,
    DuckMultinomialLogisticRegression,
    DuckMundlak,
    DuckMundlakEventStudy,
    DuckPoissonMultinomialRegression,
    DuckPoissonRegression,
    DuckRegression,
)
from duckreg.regularized import DuckRidge

pd.set_option("display.precision", 4)
rng = np.random.default_rng(2026)
tmpdir = tempfile.mkdtemp(prefix="duckreg_docs_")

# Keep bootstrap examples readable in rendered docs.
estimators_module.tqdm = lambda iterable, *args, **kwargs: iterable


def write_table(db_path, table_name, df):
    conn = duckdb.connect(db_path)
    conn.register("df", df)
    conn.execute(f"CREATE OR REPLACE TABLE {table_name} AS SELECT * FROM df")
    conn.close()

Synthetic Tables

Show code
n = 6_000
linear = pd.DataFrame(
    {
        "rowid": np.arange(n),
        "D": rng.integers(0, 2, n),
        "f1": rng.integers(0, 18, n),
        "f2": rng.integers(0, 8, n),
        "f3": rng.integers(0, 5, n),
    }
)
linear["Y"] = (
    1.0
    + 2.0 * linear["D"]
    + 0.12 * linear["f1"]
    - 0.08 * linear["f2"]
    + rng.normal(0, 1, n)
)
linear["Y2"] = (
    -0.5
    + 1.4 * linear["D"]
    - 0.04 * linear["f1"]
    + 0.03 * linear["f2"]
    + rng.normal(0, 1, n)
)
linear_db = os.path.join(tmpdir, "linear.db")
write_table(linear_db, "data", linear)

units, periods = 120, 6
unit_ids = np.repeat(np.arange(units), periods)
time_ids = np.tile(np.arange(periods), units)
unit_fe = rng.normal(0, 1, units)
time_fe = rng.normal(0, 0.4, periods)
W = (
    rng.normal(0, 1, units * periods)
    + 0.4 * unit_fe[unit_ids]
    + 0.2 * time_fe[time_ids]
)
Y = 0.8 * W + unit_fe[unit_ids] + time_fe[time_ids] + rng.normal(
    0, 1, units * periods
)
panel = pd.DataFrame({"unit": unit_ids, "time": time_ids, "W": W, "Y": Y})
panel_db = os.path.join(tmpdir, "panel.db")
write_table(panel_db, "panel_data", panel)

cohorts = rng.choice([2, 3, 4, 99], size=units, p=[0.25, 0.25, 0.25, 0.25])
D_it = np.array(
    [
        1
        if time_ids[i] >= cohorts[unit_ids[i]] and cohorts[unit_ids[i]] < 99
        else 0
        for i in range(len(unit_ids))
    ]
)
Y_it = unit_fe[unit_ids] + time_fe[time_ids] + 1.2 * D_it + rng.normal(
    0, 1, len(D_it)
)
event = pd.DataFrame(
    {"unit_id": unit_ids, "time_id": time_ids, "W_it": D_it, "Y_it": Y_it}
)
event_db = os.path.join(tmpdir, "event.db")
write_table(event_db, "panel_data", event)

data_inventory = pd.DataFrame(
    {
        "table": ["linear data", "panel data", "event-study panel"],
        "rows": [len(linear), len(panel), len(event)],
        "database": [os.path.basename(linear_db), os.path.basename(panel_db), os.path.basename(event_db)],
    }
)
data_inventory
table rows database
0 linear data 6000 linear.db
1 panel data 720 panel.db
2 event-study panel 720 event.db

Compressed OLS

This is the core DuckRegression workflow from notebooks/introduction.ipynb: fit on compressed covariate cells, then compute analytic HC1-style standard errors.

Show code
m = DuckRegression(
    db_name=linear_db,
    table_name="data",
    formula="Y ~ D + f1 + f2",
    cluster_col="",
    n_bootstraps=0,
    seed=42,
)
m.fit()
m.fit_vcov()

results = m.summary()
pd.DataFrame(
    {
        "point_estimate": results["point_estimate"],
        "standard_error": results["standard_error"],
    },
    index=["Intercept", "D", "f1", "f2"],
)
point_estimate standard_error
Intercept 0.9488 0.0340
D 2.0143 0.0257
f1 0.1240 0.0025
f2 -0.0765 0.0056

The compressed table is much smaller than the raw table because the regressors are discrete.

Show code
pd.DataFrame(
    {
        "raw_rows": [len(linear)],
        "compressed_rows": [len(m.df_compressed)],
        "compression_ratio": [len(linear) / len(m.df_compressed)],
    }
)
raw_rows compressed_rows compression_ratio
0 6000 288 20.8333

Cluster Bootstrap

When cluster_col is set and n_bootstraps > 0, DuckRegression resamples clusters and recompresses the bootstrap draw.

Show code
m_boot = DuckRegression(
    db_name=linear_db,
    table_name="data",
    formula="Y ~ D + f1 + f2",
    cluster_col="f1",
    n_bootstraps=20,
    seed=42,
)
m_boot.fit()
boot_results = m_boot.summary()
pd.DataFrame(
    {
        "point_estimate": boot_results["point_estimate"],
        "bootstrap_se": boot_results["standard_error"],
    },
    index=["Intercept", "D", "f1", "f2"],
)
point_estimate bootstrap_se
Intercept 0.9488 0.0195
D 2.0143 0.0327
f1 0.1240 0.0013
f2 -0.0765 0.0048

Multiple Outcomes

DuckRegression can compress several outcomes in one pass.

Show code
m_multi = DuckRegression(
    db_name=linear_db,
    table_name="data",
    formula="Y + Y2 ~ D + f1 + f2",
    cluster_col="",
    n_bootstraps=0,
    seed=232,
)
m_multi.fit()

coef_index = pd.MultiIndex.from_product(
    [["Intercept", "D", "f1", "f2"], ["Y", "Y2"]],
    names=["term", "outcome"],
)
pd.DataFrame(
    {"point_estimate": m_multi.summary()["point_estimate"]},
    index=coef_index,
)
point_estimate
term outcome
Intercept Y 0.9488
Y2 -0.5056
D Y 2.0143
Y2 1.4497
f1 Y 0.1240
Y2 -0.0380
f2 Y -0.0765
Y2 0.0239

Mundlak Panel Regression

DuckMundlak augments the design with unit and time averages of the covariates.

Show code
mundlak = DuckMundlak(
    db_name=panel_db,
    table_name="panel_data",
    outcome_var="Y",
    covariates=["W"],
    unit_col="unit",
    time_col="time",
    cluster_col="unit",
    n_bootstraps=0,
    seed=929,
)
mundlak.fit()

pd.DataFrame(
    {"point_estimate": mundlak.summary()["point_estimate"].flatten()},
    index=["Intercept", "W", "avg_W_unit", "avg_W_time"],
)
point_estimate
Intercept -0.0839
W 0.8496
avg_W_unit 1.4997
avg_W_time 3.3764

Double Demeaning

DuckDoubleDemeaning constructs the residualized treatment

\[ \ddot{W}_{it} = W_{it} - \bar{W}_{i \cdot} - \bar{W}_{\cdot t} + \bar{W}_{\cdot \cdot}. \]

Show code
double_demean = DuckDoubleDemeaning(
    db_name=panel_db,
    table_name="panel_data",
    outcome_var="Y",
    treatment_var="W",
    unit_col="unit",
    time_col="time",
    cluster_col="unit",
    n_bootstraps=0,
    seed=828,
)
double_demean.fit()

pd.DataFrame(
    {"point_estimate": double_demean.summary()["point_estimate"].flatten()},
    index=["Intercept", "ddot_W"],
)
point_estimate
Intercept -0.3937
ddot_W 0.8496

Event Study

DuckMundlakEventStudy creates cohort-by-time interactions and returns one coefficient path per cohort.

Show code
event_model = DuckMundlakEventStudy(
    db_name=event_db,
    table_name="panel_data",
    outcome_var="Y_it",
    treatment_col="W_it",
    unit_col="unit_id",
    time_col="time_id",
    cluster_col="unit_id",
    n_bootstraps=0,
    seed=42,
    pre_treat_interactions=False,
)
event_model.fit()
event_results = event_model.summary()["point_estimate"]

first_cohort = sorted(event_results.keys())[0]
event_results[first_cohort].rename(columns={"est": f"cohort_{first_cohort}_estimate"})
cohort_2_estimate
treatment_time_2_0 0.4327
treatment_time_2_1 0.4327
treatment_time_2_2 1.6324
treatment_time_2_3 1.4136
treatment_time_2_4 1.5538
treatment_time_2_5 1.3262

Compressed DML

This mirrors notebooks/duckdml.ipynb: residualize a partial linear model using leave-one-out means inside discrete covariate cells.

Show code
nd = 6_000
dml_df = pd.DataFrame(
    {
        "town_id": rng.integers(0, 40, nd),
        "day_id": rng.integers(0, 12, nd),
    }
)
g = 0.25 * dml_df["town_id"] + np.sin(dml_df["day_id"])
dml_df["X1"] = (
    0.15 * dml_df["town_id"] + 0.08 * dml_df["day_id"] + rng.normal(0, 1, nd)
)
dml_df["X2"] = -0.08 * dml_df["town_id"] + 0.4 * dml_df["X1"] + rng.normal(
    0, 1, nd
)
dml_df["Y"] = 1.5 * dml_df["X1"] - 0.8 * dml_df["X2"] + g + rng.normal(0, 1.5, nd)
dml_db = os.path.join(tmpdir, "dml.db")
write_table(dml_db, "data", dml_df)

dml_model = DuckDML(
    db_name=dml_db,
    table_name="data",
    outcome_var="Y",
    treatment_var=["X1", "X2"],
    discrete_covars=["town_id", "day_id"],
    seed=42,
    n_bootstraps=20,
)
dml_model.fit()
dml_results = dml_model.summary()
pd.DataFrame(
    {
        "point_estimate": dml_results["point_estimate"],
        "bootstrap_se": dml_results["standard_error"],
        "truth": [1.5, -0.8],
    },
    index=["X1", "X2"],
)
point_estimate bootstrap_se truth
X1 1.4991 0.0161 1.5
X2 -0.8053 0.0153 -0.8

Ridge Cross-Validation

DuckRidge compresses the table once, then evaluates ridge penalties over compressed folds.

Show code
duck_ridge_cv = DuckRidge(
    db_name=linear_db,
    table_name="data",
    formula="Y ~ D + f1 + f2 + f3",
    lambda_grid=np.logspace(-3, 0, 6),
    cv_folds=3,
    seed=42,
)
duck_ridge_cv.fit(lambda_selection="cv")

pd.DataFrame(
    {
        "point_estimate": duck_ridge_cv.point_estimate,
    },
    index=["Intercept", "D", "f1", "f2", "f3"],
).assign(best_lambda=duck_ridge_cv.best_lambda)
point_estimate best_lambda
Intercept 0.9686 1.0
D 2.0130 1.0
f1 0.1240 1.0
f2 -0.0764 1.0
f3 -0.0097 1.0

Logistic Regression

The GLM examples are smaller versions of notebooks/glm.ipynb.

Show code
ng = 7_000
x1 = rng.integers(0, 6, ng)
x2 = rng.integers(0, 4, ng)
glm_db = os.path.join(tmpdir, "glm.db")

p = 1 / (1 + np.exp(-(-0.8 + 0.35 * x1 - 0.2 * x2)))
y_binary = rng.binomial(1, p)
write_table(
    glm_db,
    "binary_data",
    pd.DataFrame({"y": y_binary, "x1": x1, "x2": x2}),
)

logit = DuckLogisticRegression(
    glm_db,
    "binary_data",
    "y ~ x1 + x2",
    seed=1,
    method="irls",
    n_bootstraps=0,
)
logit.fit()
logit.fit_vcov()

pd.DataFrame(
    {
        "point_estimate": logit.summary()["point_estimate"],
        "standard_error": logit.summary()["standard_error"],
        "truth": [-0.8, 0.35, -0.2],
    },
    index=["Intercept", "x1", "x2"],
)
point_estimate standard_error truth
Intercept -0.8019 0.0564 -0.80
x1 0.3412 0.0152 0.35
x2 -0.2117 0.0226 -0.20

Poisson Regression

Show code
mu = np.exp(0.3 + 0.12 * x1 - 0.08 * x2)
y_count = rng.poisson(mu)
write_table(
    glm_db,
    "count_data",
    pd.DataFrame({"y": y_count, "x1": x1, "x2": x2}),
)

pois = DuckPoissonRegression(
    glm_db,
    "count_data",
    "y ~ x1 + x2",
    seed=1,
    method="irls",
    n_bootstraps=0,
)
pois.fit()
pois.fit_vcov()

pd.DataFrame(
    {
        "point_estimate": pois.summary()["point_estimate"],
        "standard_error": pois.summary()["standard_error"],
        "truth": [0.3, 0.12, -0.08],
    },
    index=["Intercept", "x1", "x2"],
)
point_estimate standard_error truth
Intercept 0.2985 0.0217 0.30
x1 0.1260 0.0055 0.12
x2 -0.0893 0.0083 -0.08

Multinomial Logit

Show code
X = np.c_[np.ones(ng), x1, x2]
beta = np.array([[0.2, 0.18, -0.1], [-0.3, -0.08, 0.2]])
eta = np.c_[X @ beta.T, np.zeros(ng)]
prob = np.exp(eta - eta.max(axis=1, keepdims=True))
prob = prob / prob.sum(axis=1, keepdims=True)
y_class = np.array([rng.choice(["a", "b", "c"], p=row) for row in prob])
write_table(
    glm_db,
    "class_data",
    pd.DataFrame({"y": y_class, "x1": x1, "x2": x2}),
)

mn = DuckMultinomialLogisticRegression(
    glm_db,
    "class_data",
    "y ~ x1 + x2",
    seed=1,
    labels=["a", "b", "c"],
    baseline="c",
    n_bootstraps=0,
)
mn.fit()
mn.fit_vcov()

pd.DataFrame(
    mn.summary()["point_estimate"],
    index=mn.summary()["labels"],
    columns=["Intercept", "x1", "x2"],
)
Intercept x1 x2
a 0.2913 0.1616 -0.1252
b -0.2591 -0.0889 0.1984

Many-Label Poisson Decomposition

Show code
labels = [f"token_{j:02d}" for j in range(6)]
rows = []
for doc_id in range(250):
    xi1 = rng.integers(0, 5)
    xi2 = rng.integers(0, 3)
    for j, label in enumerate(labels):
        mu = np.exp(-1.2 + 0.05 * j + 0.1 * xi1 - 0.06 * xi2)
        rows.append((doc_id, label, rng.poisson(mu), xi1, xi2))

token_counts = pd.DataFrame(rows, columns=["doc_id", "label", "count", "x1", "x2"])
write_table(glm_db, "token_counts", token_counts)

pm = DuckPoissonMultinomialRegression(
    glm_db,
    "token_counts",
    count_col="count",
    label_col="label",
    covars=["x1", "x2"],
    seed=1,
    n_bootstraps=0,
)
pm.fit()
pm.summary()["point_estimate"].head()
Intercept x1 x2
token_00 -0.6929 0.0129 -0.3181
token_01 -1.2683 0.0784 -0.0147
token_02 -0.8115 -0.0181 -0.2068
token_03 -0.8701 0.0055 0.0744
token_04 -0.6878 -0.0720 -0.0455