feat: add model comparisons and sanitize session files

- Rename gamma to glm5 and model to minimax-m2.7
- Add model_comparison/ directory with head-to-head analyses
- Sanitize all session.jsonl files: remove absolute paths and usernames
- Remove __pycache__ artifacts
- Add .gitignore
commit 8e72eef09c
Date: 2026-04-23 11:16:01 +02:00
62 changed files with 18469 additions and 0 deletions
@@ -0,0 +1,227 @@
"""
Benchmark and numerical stability comparison for layer_norm_backward.py
"""
import numpy as np
import time
from layer_norm_backward import layer_norm_forward, layer_norm_backward
# ---------------------------------------------------------------------------
# 1. Numerical stability: two-pass vs naive variance
# ---------------------------------------------------------------------------
def naive_variance(x, axis=-1):
    """Naive one-pass variance E[x²] - E[x]²; prone to cancellation."""
    return np.mean(x ** 2, axis=axis) - np.mean(x, axis=axis) ** 2

def two_pass_variance(x, axis=-1):
    """Two-pass variance (center first, then average); numerically stable."""
    mu = np.mean(x, axis=axis, keepdims=True)
    return np.mean((x - mu) ** 2, axis=axis)
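
# Added sketch (not in the original file): Welford's online update, the standard
# numerically stable *single-pass* alternative for when a second pass over the
# data is not possible. 1-D helper for illustration; compare against np.var
# manually if desired.
def welford_variance_1d(values):
    """Stable one-pass population variance via Welford's algorithm."""
    mean, m2, n = 0.0, 0.0, 0
    for v in values:
        n += 1
        delta = v - mean
        mean += delta / n
        m2 += delta * (v - mean)  # uses the updated mean: key to stability
    return m2 / n
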
def demo_variance_stability():
    print("=" * 70)
    print("NUMERICAL STABILITY: TWO-PASS vs NAIVE VARIANCE")
    print("=" * 70)
    print()
    print("When mean² ≫ var, the naive formula E[x²] - E[x]² suffers from")
    print("catastrophic cancellation. The two-pass algorithm avoids this.")
    print()
    # Construct a pathological case: large offset, tiny variance.
    # With offset 1e8, E[x²] ≈ 1e16; float64 carries ~16 significant digits,
    # so the rounding error in E[x²] is of order 1, comparable to the true
    # variance of 2.0.
    offset = 1e8
    true_values = np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=np.float64)
    true_var = np.var(true_values)  # 2.0
    x_shifted = true_values + offset
    naive_var = naive_variance(x_shifted[np.newaxis, np.newaxis, :])
    stable_var = two_pass_variance(x_shifted[np.newaxis, np.newaxis, :])
    print(f"  True values:    {true_values}")
    print(f"  True variance:  {true_var:.15f}")
    print(f"  Offset:         {offset:.0e}")
    print(f"  Shifted values: {x_shifted}")
    print()
    print(f"  Naive (E[x²]-E[x]²): {naive_var[0,0]:.15f} (error: {abs(naive_var[0,0] - true_var):.2e})")
    print(f"  Two-pass (centered): {stable_var[0,0]:.15f} (error: {abs(stable_var[0,0] - true_var):.2e})")
    print()
    # Show how the cancellation worsens with larger offsets.
    print("  Worsening with larger offsets:")
    for exp in range(4, 16, 2):
        offset = 10 ** exp
        x = true_values + offset
        nv = naive_variance(x[np.newaxis, np.newaxis, :])[0, 0]
        sv = two_pass_variance(x[np.newaxis, np.newaxis, :])[0, 0]
        print(f"    offset=1e{exp:2d}:  naive={nv:15.6f}  stable={sv:15.6f}  true=2.000000")
    print()
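# Added check (not in the original file): np.var itself centers the data before
# squaring, i.e. it is effectively the two-pass scheme, so it stays exact on the
# pathological input above. Optional; call manually.
def check_numpy_var_is_stable():
    x = np.array([0.0, 1.0, 2.0, 3.0, 4.0]) + 1e8
    assert abs(np.var(x) - 2.0) < 1e-9
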
# ---------------------------------------------------------------------------
# 2. Performance benchmark
# ---------------------------------------------------------------------------
def benchmark(B, T, D, n_warmup=5, n_iter=50):
    """Benchmark forward + backward throughput."""
    x = np.random.randn(B, T, D).astype(np.float32)
    gamma = np.random.randn(D).astype(np.float32)
    beta = np.random.randn(D).astype(np.float32)
    dy = np.random.randn(B, T, D).astype(np.float32)
    # Warmup
    for _ in range(n_warmup):
        y, cache = layer_norm_forward(x, gamma, beta)
        dx, dgamma, dbeta = layer_norm_backward(dy, cache)
    # Benchmark forward
    times_fwd = []
    for _ in range(n_iter):
        t0 = time.perf_counter()
        y, cache = layer_norm_forward(x, gamma, beta)
        times_fwd.append(time.perf_counter() - t0)
    # Benchmark backward
    times_bwd = []
    for _ in range(n_iter):
        t0 = time.perf_counter()
        dx, dgamma, dbeta = layer_norm_backward(dy, cache)
        times_bwd.append(time.perf_counter() - t0)
    N = B * T * D
    fwd_ms = np.median(times_fwd) * 1000
    bwd_ms = np.median(times_bwd) * 1000
    # Rough FLOP estimates (~6 flops/element forward, ~9 backward); coarse
    # counts, useful only for relative comparison across shapes.
    fwd_tflops = (6 * N) / (fwd_ms * 1e-3) / 1e12
    bwd_tflops = (9 * N) / (bwd_ms * 1e-3) / 1e12
    return {
        "shape": f"({B}, {T}, {D})",
        "N": N,
        "fwd_ms": fwd_ms,
        "bwd_ms": bwd_ms,
        "fwd_tflops": fwd_tflops,
        "bwd_tflops": bwd_tflops,
    }
def run_benchmarks():
    print("=" * 70)
    print("PERFORMANCE BENCHMARK (NumPy, single CPU core)")
    print("=" * 70)
    print()
    print(f"{'Shape':<20} {'Elements':>10} {'Fwd (ms)':>10} {'Bwd (ms)':>10} {'Fwd TF/s':>10} {'Bwd TF/s':>10}")
    print("-" * 75)
    configs = [
        (1, 1, 64),
        (1, 1, 1024),
        (1, 1, 4096),
        (2, 128, 64),
        (2, 128, 1024),
        (2, 128, 4096),
        (4, 512, 1024),
        (4, 512, 4096),
    ]
    for B, T, D in configs:
        result = benchmark(B, T, D)
        print(
            f"{result['shape']:<20} {result['N']:>10,} "
            f"{result['fwd_ms']:>10.4f} {result['bwd_ms']:>10.4f} "
            f"{result['fwd_tflops']:>10.4f} {result['bwd_tflops']:>10.4f}"
        )
    print()
    print("  Note: these are elementwise/reduction kernels (no BLAS), so NumPy")
    print("  runs them on one core; they are memory-bandwidth bound, not compute bound.")
# ---------------------------------------------------------------------------
# 3. Backward formula verification: alternative derivation
# ---------------------------------------------------------------------------
def verify_backward_alternative():
    """
    Verify the backward formula using an alternative derivation path.

    Alternative: compute dx by explicitly differentiating through each step
    (mean → centered → normalized → affine) rather than using the compact
    projection formula. This serves as a cross-check.
    """
    print("=" * 70)
    print("BACKWARD CROSS-CHECK: ALTERNATIVE DERIVATION")
    print("=" * 70)
    print()
    B, T, D = 3, 5, 8
    x = np.random.randn(B, T, D).astype(np.float64)
    gamma = np.random.randn(D).astype(np.float64)
    beta = np.random.randn(D).astype(np.float64)
    dy = np.random.randn(B, T, D).astype(np.float64)
    # Forward
    mu = x.mean(axis=-1, keepdims=True)              # (B, T, 1)
    x_c = x - mu                                     # (B, T, D)
    var = np.mean(x_c ** 2, axis=-1, keepdims=True)  # (B, T, 1)
    std = np.sqrt(var + 1e-5)                        # (B, T, 1)
    x_hat = x_c / std                                # (B, T, D)
    y = gamma * x_hat + beta                         # forward output (for reference)
    # --- Alternative backward: step-by-step chain rule ---
    # Step 4: y = γ·x_hat + β  →  ∂L/∂x_hat = γ·dy
    dx_hat = gamma[np.newaxis, np.newaxis, :] * dy   # (B, T, D)
    # Step 3: x_hat = x_c / σ where σ = sqrt(mean(x_c²) + ε). Since σ itself
    # depends on x_c, the full derivative is needed:
    #   ∂σ/∂x_c_j       = x_c_j / (D·σ)
    #   ∂x_hat_i/∂x_c_j = (δ_ij·σ - x_c_i·∂σ/∂x_c_j) / σ²
    #                   = (δ_ij·σ - x_c_i·x_c_j/(D·σ)) / σ²
    #                   = δ_ij/σ - x_hat_i·x_hat_j/(D·σ)
    #                   = (1/σ) · (δ_ij - x_hat_i·x_hat_j/D)
    #
    # So: ∂L/∂x_c_j = Σ_i dx_hat_i · (1/σ) · (δ_ij - x_hat_i·x_hat_j/D)
    #               = (1/σ) · [dx_hat_j - (1/D)·Σ_i(dx_hat_i·x_hat_i)·x_hat_j]
    std_inv = 1.0 / std[..., 0]                                       # (B, T)
    dx_hat_sum_xhat = np.sum(dx_hat * x_hat, axis=-1, keepdims=True)  # (B, T, 1)
    dx_c = std_inv[..., np.newaxis] * (dx_hat - dx_hat_sum_xhat * x_hat / D)
    # Step 2: x_c = x - μ
    #   ∂x_c_i/∂x_j = δ_ij - 1/D
    #   ∂L/∂x_j = Σ_i ∂L/∂x_c_i · (δ_ij - 1/D)
    #           = dx_c_j - (1/D)·Σ_i dx_c_i
    dx_c_sum = np.sum(dx_c, axis=-1, keepdims=True)  # (B, T, 1)
    dx_alt = dx_c - dx_c_sum / D
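    # Added cross-check (not in the original file): the compact projection form
    # the step-by-step result should collapse to, assuming the standard LayerNorm
    # backward derivation with g = γ·dy and using mean(x_hat) = 0:
    #   dx = (1/σ) · (g - mean(g) - x_hat · mean(g·x_hat))
    g = gamma * dy
    dx_compact = (g - g.mean(axis=-1, keepdims=True)
                  - x_hat * (g * x_hat).mean(axis=-1, keepdims=True)) / std
    assert np.allclose(dx_compact, dx_alt), "compact form disagrees with chain rule"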
    # --- Our compact backward ---
    y2, cache = layer_norm_forward(x, gamma, beta)
    dx_ours, _, _ = layer_norm_backward(dy, cache)
    # Compare
    rel_err = np.max(np.abs(dx_alt - dx_ours)) / np.max(np.abs(dx_alt) + np.abs(dx_ours) + 1e-30)
    print("  Alternative derivation (step-by-step chain rule)")
    print("  Compact derivation (projection formula)")
    print(f"  Relative error: {rel_err:.2e}")
    print(f"  {'✓ MATCH' if rel_err < 1e-10 else '✗ MISMATCH'}")
    print()
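
# Added sketch (not in the original file): a derivation-free third check via
# central finite differences on a single input coordinate. Uses only the
# layer_norm_forward/layer_norm_backward signatures already relied on above.
def finite_difference_spot_check(eps_fd=1e-6):
    x = np.random.randn(2, 3, 8)
    gamma = np.random.randn(8)
    beta = np.random.randn(8)
    dy = np.random.randn(2, 3, 8)
    _, cache = layer_norm_forward(x, gamma, beta)
    dx, _, _ = layer_norm_backward(dy, cache)
    i = (1, 2, 3)  # arbitrary coordinate to probe
    xp, xm = x.copy(), x.copy()
    xp[i] += eps_fd
    xm[i] -= eps_fd
    yp, _ = layer_norm_forward(xp, gamma, beta)
    ym, _ = layer_norm_forward(xm, gamma, beta)
    # dL/dx_i ≈ [L(x+εe_i) - L(x-εe_i)] / (2ε) with L = Σ y·dy
    dx_fd = np.sum((yp - ym) * dy) / (2 * eps_fd)
    assert abs(dx_fd - dx[i]) < 1e-5 * max(1.0, abs(dx[i]))
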
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    np.random.seed(42)
    demo_variance_stability()
    run_benchmarks()
    verify_backward_alternative()