def make_data(n=200_000, k=1_000, p=4, seed=123):
rng = np.random.default_rng(seed)
# Unequal cell probabilities make the compression problem look realistic.
weights = rng.gamma(shape=1.5, scale=1.0, size=k)
probs = weights / weights.sum()
g = rng.choice(k, size=n, p=probs)
cell_effect = rng.normal(scale=2.0, size=k)
cell_x_shift = rng.normal(scale=0.8, size=(k, p))
x = cell_x_shift[g] + rng.normal(size=(n, p))
beta = np.array([1.0, -0.7, 0.35, 0.2])
y = x @ beta + cell_effect[g] + rng.normal(scale=1.0, size=n)
return x, y, g, beta
def residualize_by_group(x, y, g, k):
n, p = x.shape
counts = np.bincount(g, minlength=k).astype(float)
sx = np.zeros((k, p))
sy = np.bincount(g, weights=y, minlength=k)
np.add.at(sx, g, x)
xbar = sx / counts[:, None]
ybar = sy / counts
return x - xbar[g], y - ybar[g]
def full_residualized_ols(x, y, g, k):
xr, yr = residualize_by_group(x, y, g, k)
beta = np.linalg.solve(xr.T @ xr, xr.T @ yr)
return beta, xr, yr
def exact_compressed_beta(x, y, g, k):
n, p = x.shape
counts = np.bincount(g, minlength=k).astype(float)
sx = np.zeros((k, p))
sxy = np.zeros((k, p))
sxx = np.zeros((k, p, p))
sy = np.bincount(g, weights=y, minlength=k)
np.add.at(sx, g, x)
np.add.at(sxy, g, x * y[:, None])
for j in range(p):
for l in range(j, p):
vals = x[:, j] * x[:, l]
accum = np.bincount(g, weights=vals, minlength=k)
sxx[:, j, l] = accum
sxx[:, l, j] = accum
xtx = sxx.sum(axis=0) - np.einsum("gp,gq,g->pq", sx, sx, 1.0 / counts)
xty = sxy.sum(axis=0) - (sx * (sy / counts)[:, None]).sum(axis=0)
return np.linalg.solve(xtx, xty), xtx, xty
def countsketch_beta(xr, yr, s, seed):
rng = np.random.default_rng(seed)
n, p = xr.shape
buckets = rng.integers(0, s, size=n)
signs = rng.choice(np.array([-1.0, 1.0]), size=n)
sx = np.zeros((s, p))
sy = np.zeros(s)
np.add.at(sx, buckets, xr * signs[:, None])
np.add.at(sy, buckets, yr * signs)
return np.linalg.solve(sx.T @ sx, sx.T @ sy)
def subsample_beta(xr, yr, s, seed):
rng = np.random.default_rng(seed)
n = xr.shape[0]
idx = rng.choice(n, size=s, replace=False)
# Uniform rescaling cancels in OLS, so ordinary OLS on the sampled rows is enough.
xs = xr[idx]
ys = yr[idx]
return np.linalg.solve(xs.T @ xs, xs.T @ ys)
x, y, g, beta_true = make_data()
k = int(g.max() + 1)
beta_full, xr, yr = full_residualized_ols(x, y, g, k)
beta_exact, xtx_exact, xty_exact = exact_compressed_beta(x, y, g, k)
print("n rows:", len(y))
print("occupied cells:", k)
print("continuous regressors:", x.shape[1])
print("true beta:", beta_true)
print("full residualized OLS:", beta_full)
print("exact compressed beta:", beta_exact)
print("max absolute difference, full vs exact:", np.max(np.abs(beta_full - beta_exact)))