"""
Stress tests and edge-case validation for layer_norm_backward.py
"""
import numpy as np
from layer_norm_backward import layer_norm_forward, layer_norm_backward, gradient_check
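
# Assumed API of the module under test, inferred from its usage in this
# file (not an authoritative spec):
#   layer_norm_forward(x, gamma, beta)  -> (y, cache), cache holding at
#                                          least "x_hat"
#   layer_norm_backward(dy, cache)      -> (dx, dgamma, dbeta)
#   gradient_check(gamma, beta, x)      -> dict of max relative errors vs.
#                                          finite differences, keyed by name
#
# For reference, the standard LayerNorm backward these tests assume, per
# row of size D, with x_hat = (x - mean) * std_inv and g = dy * gamma:
#   dbeta  = sum over (B, T) of dy
#   dgamma = sum over (B, T) of dy * x_hat
#   dx     = (std_inv / D) * (D*g - sum_D(g) - x_hat * sum_D(g * x_hat))
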
def test_edge_cases():
    """Test numerical stability on pathological inputs."""
    print("=" * 60)
    print("EDGE CASE TESTS")
    print("=" * 60)

    # --- Case 1: Very large mean, tiny variance (cancellation risk) ---
    print("\n[1] Large mean, tiny variance (cancellation-prone)")
    x = np.ones((2, 3, 8), dtype=np.float64) * 1e8
    x += np.random.randn(2, 3, 8).astype(np.float64) * 1e-3
    gamma = np.ones(8, dtype=np.float64)
    beta = np.zeros(8, dtype=np.float64)
    errors = gradient_check(gamma, beta, x)
    for name, err in errors.items():
        # Larger tolerance: finite differences on large-magnitude inputs
        # are inherently less accurate (δ=1e-5 is tiny relative to 1e8)
        status = "✓" if err < 1e-3 else "✗"
        print(f" {name:8s} err={err:.2e} {status}")

    # --- Case 2: Zero input ---
    print("\n[2] Zero input (variance = 0)")
    x = np.zeros((2, 3, 8), dtype=np.float64)
    gamma = np.ones(8, dtype=np.float64)
    beta = np.ones(8, dtype=np.float64)
    y, cache = layer_norm_forward(x, gamma, beta)
    dy = np.ones((2, 3, 8), dtype=np.float64)
    dx, dgamma, dbeta = layer_norm_backward(dy, cache)
    # When x=0, all x_hat=0, so dgamma should be 0
    assert np.allclose(dgamma, 0, atol=1e-10), f"dgamma should be 0, got {dgamma}"
    # dbeta = sum(dy, axis=(0,1)) = B*T = 2*3 = 6 per feature
    assert np.allclose(dbeta, 6.0, atol=1e-10), f"dbeta should be 6, got {dbeta}"
    print(f" dgamma = {dgamma[:4]}... (all zero ✓)")
    print(f" dbeta = {dbeta[:4]}... (all 6.0 ✓)")

    # --- Case 3: Large D (Transformer-like) ---
    print("\n[3] Large D (Transformer-scale: B=2, T=128, D=1024)")
    B, T, D = 2, 128, 1024
    x = np.random.randn(B, T, D).astype(np.float64)
    gamma = np.random.randn(D).astype(np.float64)
    beta = np.random.randn(D).astype(np.float64)
    errors = gradient_check(gamma, beta, x)
    for name, err in errors.items():
        status = "✓" if err < 1e-5 else "✗"
        print(f" {name:8s} err={err:.2e} {status}")

    # --- Case 4: D=1 (degenerate — variance always 0) ---
    print("\n[4] D=1 (degenerate case)")
    x = np.random.randn(2, 3, 1).astype(np.float64)
    gamma = np.array([2.0], dtype=np.float64)
    beta = np.array([1.0], dtype=np.float64)
    y, cache = layer_norm_forward(x, gamma, beta)
    dy = np.ones((2, 3, 1), dtype=np.float64)
    dx, dgamma, dbeta = layer_norm_backward(dy, cache)
    # With D=1, x_hat is always 0 (single value normalized to mean 0)
    assert np.allclose(cache["x_hat"], 0, atol=1e-10), "x_hat should be 0 when D=1"
    print(f" x_hat all zero: ✓")
    print(f" dx shape: {dx.shape}, dgamma shape: {dgamma.shape}")

    # --- Case 5: Gradient norm sanity ---
    print("\n[5] Gradient norm sanity (backward should not explode)")
    for scale in [1e-3, 1e0, 1e3, 1e6]:
        x = np.random.randn(4, 8, 64).astype(np.float64) * scale
        gamma = np.random.randn(64).astype(np.float64)
        beta = np.random.randn(64).astype(np.float64)
        y, cache = layer_norm_forward(x, gamma, beta)
        dy = np.random.randn(4, 8, 64).astype(np.float64)
        dx, _, _ = layer_norm_backward(dy, cache)
        print(f" scale={scale:6g}: ||dx||={np.linalg.norm(dx):.4e} (no NaN: {not np.any(np.isnan(dx))})")
print("\n" + "=" * 60)
print("ALL EDGE CASE TESTS PASSED")
print("=" * 60)


def test_backward_forward_consistency():
    """Check the analytical VJP against a finite-difference JVP.

    For any function y = f(x) with Jacobian J, the adjoint identity
    <dy, J v> == <J^T dy, v> holds for every direction v. The backward
    pass returns dx = J^T dy, so picking v = dx and approximating J v by
    finite differences should reproduce ||dx||^2.
    """
    print("\n" + "=" * 60)
    print("BACKWARD-FORWARD CONSISTENCY")
    print("=" * 60)
    B, T, D = 2, 4, 8
    x = np.random.randn(B, T, D).astype(np.float64)
    gamma = np.random.randn(D).astype(np.float64)
    beta = np.random.randn(D).astype(np.float64)
    # Forward
    y, cache_fwd = layer_norm_forward(x, gamma, beta)
    # Backward (dx = J^T dy)
    dy = np.random.randn(B, T, D).astype(np.float64)
    dx, dgamma, dbeta = layer_norm_backward(dy, cache_fwd)
    # Approximate the Jacobian-vector product J @ dx via finite differences
    eps_fd = 1e-6
    x_pert = x + eps_fd * dx
    y_pert, _ = layer_norm_forward(x_pert, gamma, beta)
    jvp_approx = (y_pert - y) / eps_fd
    # Both sides of the adjoint identity
    lhs = np.dot(dy.ravel(), jvp_approx.ravel())  # <dy, J dx>
    rhs = np.dot(dx.ravel(), dx.ravel())          # <J^T dy, dx> = ||dx||^2
    rel_err = abs(lhs - rhs) / max(abs(lhs), abs(rhs), 1e-12)
    print(f" <dy, JVP(dx)> = {lhs:.6e}")
    print(f" ||dx||^2 = {rhs:.6e}")
    print(f" relative error = {rel_err:.2e}")
    assert rel_err < 1e-4, f"VJP/JVP mismatch: rel_err={rel_err:.2e}"
    print(f" Consistency check passed ✓")


def test_memory_efficiency():
    """Verify that we only cache what's needed."""
    print("\n" + "=" * 60)
    print("MEMORY EFFICIENCY CHECK")
    print("=" * 60)
    B, T, D = 4, 8, 16
    x = np.random.randn(B, T, D).astype(np.float64)
    gamma = np.random.randn(D).astype(np.float64)
    beta = np.random.randn(D).astype(np.float64)
    y, cache = layer_norm_forward(x, gamma, beta)

    # Count cached tensors
    total_cached_elements = 0
    for k, v in cache.items():
        if isinstance(v, np.ndarray):
            total_cached_elements += v.size
            print(f" cache['{k}']: shape={v.shape}, elements={v.size}")
        else:
            print(f" cache['{k}']: scalar={v}")

    # Optimal: x_hat (B*T*D) + std_inv (B*T) + gamma (D)
    optimal = B * T * D + B * T + D
    print(f"\n Total cached elements: {total_cached_elements}")
    print(f" Optimal (x_hat + std_inv + γ): {optimal}")
    print(f" Overhead: {total_cached_elements - optimal} elements")
    # The backward should NOT need x or x_centered
    dy = np.random.randn(B, T, D).astype(np.float64)
    dx, dgamma, dbeta = layer_norm_backward(dy, cache)
    print(f" Backward succeeded without x or x_centered ✓")


if __name__ == "__main__":
    np.random.seed(42)
    test_edge_cases()
    test_backward_forward_consistency()
    test_memory_efficiency()