"""
|
|
Batched Beam Search Decoder for Autoregressive Generation
|
|
Implemented in pure NumPy
|
|
"""
|
|
|
|
import numpy as np
|
|
from typing import List, Tuple
|
|
|
|
|
|
class MinimalLanguageModel:
    """
    A minimal language model with random embeddings + 1 transformer block.
    Used to test beam search correctness - quality doesn't matter.
    """

    def __init__(self, vocab_size: int = 1000, d_model: int = 64, num_heads: int = 4):
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.num_heads = num_heads

        np.random.seed(42)
        self.embedding = np.random.randn(vocab_size, d_model).astype(np.float32) * 0.02
        self.embedding_norm = np.random.randn(d_model, d_model).astype(np.float32) * 0.02

        self.query_projection = np.random.randn(d_model, d_model).astype(np.float32) * 0.02
        self.key_projection = np.random.randn(d_model, d_model).astype(np.float32) * 0.02
        self.value_projection = np.random.randn(d_model, d_model).astype(np.float32) * 0.02

        self.output_projection = np.random.randn(d_model, d_model).astype(np.float32) * 0.02
        self.ffn_inner = np.random.randn(d_model, d_model * 4).astype(np.float32) * 0.02
        self.ffn_outer = np.random.randn(d_model * 4, d_model).astype(np.float32) * 0.02

        self.layer_norm_scale = np.ones(d_model).astype(np.float32)
        self.layer_norm_bias = np.zeros(d_model).astype(np.float32)

        self.ffn_ln_scale = np.ones(d_model).astype(np.float32)
        self.ffn_ln_bias = np.zeros(d_model).astype(np.float32)

    def _layer_norm(self, x: np.ndarray) -> np.ndarray:
        mean = np.mean(x, axis=-1, keepdims=True)
        std = np.std(x, axis=-1, keepdims=True) + 1e-6
        return self.layer_norm_scale * (x - mean) / std + self.layer_norm_bias

    def _ffn_layer_norm(self, x: np.ndarray) -> np.ndarray:
        mean = np.mean(x, axis=-1, keepdims=True)
        std = np.std(x, axis=-1, keepdims=True) + 1e-6
        return self.ffn_ln_scale * (x - mean) / std + self.ffn_ln_bias

    def _multi_head_attention(self, x: np.ndarray) -> np.ndarray:
        batch_size, seq_len, d_model = x.shape

        Q = np.dot(x, self.query_projection)
        K = np.dot(x, self.key_projection)
        V = np.dot(x, self.value_projection)

        # Split into heads: (batch, num_heads, seq_len, head_dim)
        head_dim = d_model // self.num_heads
        Q = Q.reshape(batch_size, seq_len, self.num_heads, head_dim).transpose(0, 2, 1, 3)
        K = K.reshape(batch_size, seq_len, self.num_heads, head_dim).transpose(0, 2, 1, 3)
        V = V.reshape(batch_size, seq_len, self.num_heads, head_dim).transpose(0, 2, 1, 3)

        # Note: no causal mask is applied; the toy model attends to the full
        # sequence, which is acceptable here because output quality is irrelevant.
        attention_scores = np.matmul(Q, K.transpose(0, 1, 3, 2)) / np.sqrt(head_dim)
        attention_probs = self._softmax(attention_scores)

        attention_output = np.matmul(attention_probs, V)
        attention_output = attention_output.transpose(0, 2, 1, 3).reshape(batch_size, seq_len, d_model)

        return np.dot(attention_output, self.output_projection)

    def _softmax(self, x: np.ndarray) -> np.ndarray:
        x_max = np.max(x, axis=-1, keepdims=True)
        e_x = np.exp(x - x_max)
        return e_x / np.sum(e_x, axis=-1, keepdims=True)

    def _feed_forward(self, x: np.ndarray) -> np.ndarray:
        inner = np.dot(x, self.ffn_inner)
        inner = np.maximum(inner, 0)  # ReLU
        return np.dot(inner, self.ffn_outer)

    def forward(self, token_ids: np.ndarray) -> np.ndarray:
        batch_size, seq_len = token_ids.shape

        x = self.embedding[token_ids]
        x = np.dot(x, self.embedding_norm)

        # Attention block with residual connection
        x_normed = self._layer_norm(x)
        attn_out = self._multi_head_attention(x_normed)
        x = x + attn_out

        # Feed-forward block with residual connection
        x_normed = self._ffn_layer_norm(x)
        ffn_out = self._feed_forward(x_normed)
        x = x + ffn_out

        # Output projection tied to the input embedding matrix
        logits = np.matmul(x, self.embedding.T)

        return logits


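# Illustrative shape check (not part of the test suite below): for a
# (batch, seq_len) array of token IDs, forward() returns logits of shape
# (batch, seq_len, vocab_size).
#
#     model = MinimalLanguageModel(vocab_size=1000, d_model=64)
#     logits = model.forward(np.array([[1, 2, 3]], dtype=np.int32))
#     assert logits.shape == (1, 3, 1000)
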
def batched_beam_search(
    prompts: List[List[int]],
    beam_width: int,
    max_new_tokens: int,
    alpha: float = 0.6,
    eos_token_id: int = 0,
    model: Optional[MinimalLanguageModel] = None
) -> List[List[Tuple[List[int], float]]]:
    """
    Batched beam search decoder for autoregressive generation.

    Args:
        prompts: List of prompt token ID lists, one per batch item
        beam_width: Number of beams per batch item (K)
        max_new_tokens: Maximum number of new tokens to generate
        alpha: Length penalty hyperparameter (default 0.6)
        eos_token_id: End-of-sequence token ID
        model: The language model to use (a default MinimalLanguageModel is created if None)

    Returns:
        List of lists of (sequence, score) tuples per batch item,
        sorted by length-penalized score descending (best first)

    IMPORTANT: Finished beams are NOT removed from the pool. They compete
    with unfinished beams using length-penalized scores. This ensures that
    a short, high-confidence sequence that hits EOS early is not wrongly
    discarded in favor of a longer, lower-confidence sequence.
    """
    if model is None:
        model = MinimalLanguageModel()

    batch_size = len(prompts)
    vocab_size = model.vocab_size

    # One list of beam dicts per batch item; each starts from its own prompt.
    active_beams = []
    for batch_idx in range(batch_size):
        prompt_tokens = np.array(prompts[batch_idx], dtype=np.int32)

        beams = [{
            'seq': list(prompt_tokens),
            'logprob': 0.0,
            'generated_length': 0,
            'finished': False,
            'batch_idx': batch_idx
        }]
        active_beams.append(beams)

    finished_results = [[] for _ in range(batch_size)]

    for step in range(max_new_tokens):
        all_candidates = []

        # Stop early if every batch item has either no active beams left or
        # only finished beams.
        all_done = True
        for batch_idx in range(batch_size):
            beams = active_beams[batch_idx]
            if beams and not all(beam['finished'] for beam in beams):
                all_done = False
                break
        if all_done:
            break

        for batch_idx in range(batch_size):
            beams = active_beams[batch_idx]

            if not beams:
                continue

            # If every beam for this item is already finished, flush them to
            # the results and stop expanding this batch item.
            if all(beam['finished'] for beam in beams):
                for beam in beams:
                    finished_results[batch_idx].append({
                        'seq': beam['seq'][len(prompts[batch_idx]):],
                        'logprob': beam['logprob'],
                        'generated_length': beam['generated_length']
                    })
                active_beams[batch_idx] = []
                continue

            seqs = [beam['seq'] for beam in beams]
            max_seq_len = max(len(seq) for seq in seqs)

            # Right-pad to a common length (in practice all active beams of one
            # batch item have the same length, so this is a safeguard).
            padded_seqs = []
            for seq in seqs:
                if len(seq) < max_seq_len:
                    padded_seqs.append(seq + [0] * (max_seq_len - len(seq)))
                else:
                    padded_seqs.append(seq)

            input_ids = np.array(padded_seqs, dtype=np.int32)

            logits = model.forward(input_ids)

            last_logits = logits[:, -1, :]

            # Softmax over the vocabulary for the next-token distribution.
            probs = np.exp(last_logits - np.max(last_logits, axis=-1, keepdims=True))
            probs = probs / np.sum(probs, axis=-1, keepdims=True)

            for beam_idx, beam in enumerate(beams):
                if beam['finished']:
                    continue

                beam_logprob = beam['logprob']
                beam_gen_len = beam['generated_length']

                token_probs = probs[beam_idx]

                # Consider the top 2*K next tokens per beam (an over-generation
                # margin so that EOS candidates do not crowd out continuations).
                top_k_indices = np.argpartition(token_probs, -2 * beam_width)[-2 * beam_width:]
                top_k_indices = top_k_indices[np.argsort(token_probs[top_k_indices])[::-1]]

                for token_id in top_k_indices:
                    token_prob = token_probs[token_id]

                    if token_prob <= 0:
                        continue

                    new_logprob = beam_logprob + np.log(token_prob)
                    new_gen_len = beam_gen_len + 1
                    new_seq = beam['seq'] + [int(token_id)]

                    is_finished = (token_id == eos_token_id)

                    # For a finished candidate this implementation keeps the
                    # pre-EOS score and length: the EOS token's own
                    # log-probability is not added to the beam's score.
                    if is_finished:
                        cand_logprob = beam_logprob
                        cand_gen_len = beam_gen_len
                    else:
                        cand_logprob = new_logprob
                        cand_gen_len = new_gen_len

                    all_candidates.append({
                        'batch_idx': batch_idx,
                        'seq': new_seq,
                        'logprob': cand_logprob,
                        'generated_length': cand_gen_len,
                        'finished': is_finished,
                        'beam_idx': beam_idx
                    })

        if not all_candidates:
            break

        # Select the top K candidates per batch item by length-penalized score.
        for batch_idx in range(batch_size):
            batch_candidates = [c for c in all_candidates if c['batch_idx'] == batch_idx]

            if not batch_candidates:
                continue

            adjusted_scores = []
            for c in batch_candidates:
                gen_len = c['generated_length']
                if gen_len == 0:
                    adj_score = c['logprob']
                else:
                    adj_score = c['logprob'] / (gen_len ** alpha)
                adjusted_scores.append(adj_score)

            adjusted_scores = np.array(adjusted_scores)

            select_k = min(beam_width, len(adjusted_scores))
            if select_k <= 0:
                continue

            if len(adjusted_scores) <= beam_width:
                top_k_indices = np.arange(len(adjusted_scores))
            else:
                top_k_indices = np.argpartition(adjusted_scores, -select_k)[-select_k:]

            top_k_indices = top_k_indices[np.argsort(adjusted_scores[top_k_indices])[::-1]]

            selected = [batch_candidates[i] for i in top_k_indices]

            # Finished candidates go straight to the results; the rest stay active.
            new_active_beams = []
            for c in selected:
                if c['finished']:
                    finished_results[c['batch_idx']].append({
                        'seq': c['seq'][len(prompts[c['batch_idx']]):],
                        'logprob': c['logprob'],
                        'generated_length': c['generated_length']
                    })
                else:
                    new_active_beams.append({
                        'seq': c['seq'],
                        'logprob': c['logprob'],
                        'generated_length': c['generated_length'],
                        'finished': False,
                        'batch_idx': c['batch_idx']
                    })

            active_beams[batch_idx] = new_active_beams

    # Flush any beams still active when max_new_tokens is exhausted.
    for batch_idx in range(batch_size):
        remaining_beams = active_beams[batch_idx]
        for beam in remaining_beams:
            finished_results[batch_idx].append({
                'seq': beam['seq'][len(prompts[batch_idx]):],
                'logprob': beam['logprob'],
                'generated_length': beam['generated_length']
            })

    # Re-score everything with the length penalty and keep the top K per item.
    results = []
    for batch_idx in range(batch_size):
        batch_results = finished_results[batch_idx]

        scored_results = []
        for item in batch_results:
            seq = item['seq']
            logprob = item['logprob']
            gen_len = item['generated_length']
            if gen_len == 0:
                adj_score = logprob
            else:
                adj_score = logprob / (gen_len ** alpha)
            scored_results.append((seq, adj_score))

        scored_results.sort(key=lambda x: x[1], reverse=True)

        results.append(scored_results[:beam_width])

    return results


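# Minimal usage sketch (illustrative; the token IDs and hyperparameters are
# arbitrary, not taken from the tests below):
#
#     model = MinimalLanguageModel()
#     hyps = batched_beam_search([[1, 2, 3]], beam_width=4, max_new_tokens=8, model=model)
#     # hyps[0] is a list of (generated_token_ids, length_penalized_score)
#     # tuples for the first prompt, best first.
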
def test_greedy_equivalence():
    """Test 1: Single batch item, K=1, short prompt, alpha=0.

    Verify this behaves identically to greedy decoding (always pick argmax).
    """
    print("=" * 60)
    print("TEST 1: Greedy Equivalence Test")
    print("=" * 60)

    model = MinimalLanguageModel(vocab_size=1000, d_model=64)
    prompt = [[1, 2, 3]]
    beam_width = 1
    max_new_tokens = 5
    alpha = 0.0
    eos_token_id = 0

    results = batched_beam_search(prompt, beam_width, max_new_tokens, alpha, eos_token_id, model)

    print(f"Prompt: {prompt}")
    print(f"Beam width: {beam_width}, Alpha: {alpha}")
    print(f"Generated sequences: {results}")

    # Reference: plain greedy decoding with the same model.
    input_ids = np.array(prompt, dtype=np.int32)
    greedy_seq = list(prompt[0])

    for _ in range(max_new_tokens):
        logits = model.forward(input_ids)
        probs = np.exp(logits[0, -1] - np.max(logits[0, -1]))
        probs = probs / np.sum(probs)
        next_token = int(np.argmax(probs))
        greedy_seq.append(next_token)
        if next_token == eos_token_id:
            break
        input_ids = np.array([greedy_seq], dtype=np.int32)

    print(f"Greedy sequence (expected): {greedy_seq[len(prompt[0]):]}")

    if results[0]:
        result_seq = results[0][0][0]
        print(f"Beam search sequence: {result_seq}")
        match = result_seq == greedy_seq[len(prompt[0]):]
        print(f"Match with greedy: {match}")
    print()


def test_per_batch_independence():
    """Test 2: batch=2, beam_width=3, different prompt lengths [3, 5], alpha=0.6.

    Verify per-batch independence: beams from prompt 0 never interact with
    beams from prompt 1.
    """
    print("=" * 60)
    print("TEST 2: Per-Batch Independence Test")
    print("=" * 60)

    model = MinimalLanguageModel(vocab_size=1000, d_model=64)
    prompts = [[1, 2, 3], [4, 5, 6, 7, 8]]
    beam_width = 3
    max_new_tokens = 4
    alpha = 0.6
    eos_token_id = 0

    results = batched_beam_search(prompts, beam_width, max_new_tokens, alpha, eos_token_id, model)

    print(f"Prompts: {prompts}")
    print(f"Prompt lengths: {[len(p) for p in prompts]}")
    print(f"Beam width: {beam_width}, Alpha: {alpha}")
    print(f"Results for batch 0 (should have {beam_width} beams): {len(results[0])} beams")
    print(f"Results for batch 1 (should have {beam_width} beams): {len(results[1])} beams")

    for batch_idx, batch_results in enumerate(results):
        print(f"\nBatch {batch_idx} results:")
        for seq, score in batch_results:
            print(f"  Seq: {seq[:10]}..., Score: {score:.4f}")

    # Heuristic contamination check: generated sequences for one batch item
    # should not contain tokens copied from the other item's prompt.
    prompt_0_tokens = set(prompts[0])
    prompt_1_tokens = set(prompts[1])

    cross_contamination = False
    for seq, _ in results[0]:
        overlap = set(seq) & prompt_1_tokens
        if overlap:
            print(f"WARNING: Batch 0 seq contains tokens from batch 1 prompt: {overlap}")
            cross_contamination = True

    for seq, _ in results[1]:
        overlap = set(seq) & prompt_0_tokens
        if overlap:
            print(f"WARNING: Batch 1 seq contains tokens from batch 0 prompt: {overlap}")
            cross_contamination = True

    print(f"\nPer-batch independence verified: {len(results) == 2 and not cross_contamination}")
    print()


def test_eos_retention():
    """Test 3: THE EOS RETENTION TEST.

    Verify that EOS beams compete correctly with unfinished beams.
    A beam that hits EOS early with logprob=-3.0 should beat
    an unfinished beam with logprob=-5.0 (both length-penalized).
    """
    print("=" * 60)
    print("TEST 3: EOS Retention Test")
    print("=" * 60)

    model = MinimalLanguageModel(vocab_size=1000, d_model=64)
    prompt = [[1, 2, 3, 4, 5]]
    beam_width = 3
    max_new_tokens = 10
    alpha = 0.6
    eos_token_id = 42

    class MockedModel:
        """Wraps the real model but forces EOS to be the top token at step 1."""

        def __init__(self, real_model):
            self.vocab_size = real_model.vocab_size
            self.real_model = real_model
            self.step_count = 0
            self.eos_logprob = -3.0
            self.cont_logprob = -4.0

        def forward(self, token_ids):
            self.step_count += 1
            batch_size, seq_len = token_ids.shape

            if self.step_count == 1:
                # First step: give EOS a much higher logit than any other
                # token so one beam finishes immediately.
                logits = np.full((batch_size, seq_len, self.vocab_size), -20.0, dtype=np.float32)

                logits[0, -1, eos_token_id] = 5.0
                logits[0, -1, 99] = 3.0

                return logits
            else:
                logits = self.real_model.forward(token_ids)
                return logits

    mocked_model = MockedModel(model)

    results = batched_beam_search(
        prompt, beam_width, max_new_tokens, alpha, eos_token_id, mocked_model
    )

    print(f"Prompt: {prompt}")
    print(f"Beam width: {beam_width}, Alpha: {alpha}, EOS token: {eos_token_id}")
    print(f"Step 1 mock: EOS token will have high logit (pre-softmax)")

    print(f"\nGenerated sequences:")
    for seq, score in results[0]:
        print(f"  Seq: {seq}, Score: {score:.4f}")

    eos_in_best = False
    if results[0]:
        best_seq, best_score = results[0][0]
        if eos_token_id in best_seq:
            eos_in_best = True
            print(f"\n[PASS] Best sequence contains EOS token - EOS beam correctly retained")
        else:
            print(f"\n[FAIL] Best sequence does NOT contain EOS token - EOS beam was wrongly discarded")
            print("This happens if finished beams are removed from the pool too early.")
            print("With correct EOS retention: the EOS beam (stopped at step 1 with score=-3.0/1^0.6=-3.0)")
            print("would beat continuing beams (logprob=-4.0 at step 1, then -5.0 at step 2, etc.)")
    print()


def run_all_tests():
    """Run all tests."""
    test_greedy_equivalence()
    test_per_batch_independence()
    test_eos_retention()

    print("=" * 60)
    print("ALL TESTS COMPLETED")
    print("=" * 60)


if __name__ == "__main__":
    run_all_tests()