Files
llm_programming_tests/glm5/fuse/test_fused.cu
T
sleepy 8e72eef09c feat: add model comparisons and sanitize session files
- Rename gamma to glm5 and model to minimax-m2.7
- Add model_comparison/ directory with head-to-head analyses
- Sanitize all session.jsonl files: remove absolute paths and usernames
- Remove __pycache__ artifacts
- Add .gitignore
2026-04-23 11:16:01 +02:00

199 lines
6.6 KiB
Plaintext

// =============================================================================
// Test / Benchmark: Fused Softmax + Top-K
// =============================================================================
// Compile:
// nvcc -O3 -arch=sm_80 -o test_fused test_fused.cu fused_softmax_topk.cuh
//
// Run:
// ./test_fused
// =============================================================================
#include "fused_softmax_topk.cuh"
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <algorithm>
#include <vector>
#include <numeric>
// ---------- CPU reference implementation ----------
void cpu_softmax_topk(const float* logits, int* indices, float* probs,
int B, int T, int V, int K) {
for (int bt = 0; bt < B * T; bt++) {
const float* row = logits + bt * V;
int* out_idx = indices + bt * K;
float* out_prob = probs + bt * K;
// Numerically stable softmax
float max_val = *std::max_element(row, row + V);
float sum = 0.0f;
std::vector<float> exp_vals(V);
for (int v = 0; v < V; v++) {
exp_vals[v] = expf(row[v] - max_val);
sum += exp_vals[v];
}
float inv_sum = 1.0f / sum;
for (int v = 0; v < V; v++) {
exp_vals[v] *= inv_sum;
}
// Top-K by sorting (simple but correct)
std::vector<int> idx(V);
std::iota(idx.begin(), idx.end(), 0);
std::partial_sort(idx.begin(), idx.begin() + K, idx.end(),
[&](int a, int b) { return exp_vals[a] > exp_vals[b]; });
for (int k = 0; k < K; k++) {
out_idx[k] = idx[k];
out_prob[k] = exp_vals[idx[k]];
}
}
}
// ---------- Verification ----------
bool verify(const float* ref_probs, const int* ref_idx,
const float* gpu_probs, const int* gpu_idx,
int B, int T, int K, float tol = 1e-4f) {
bool ok = true;
int failures = 0;
for (int bt = 0; bt < B * T && failures < 10; bt++) {
for (int k = 0; k < K; k++) {
int ri = ref_idx[bt * K + k];
int gi = gpu_idx[bt * K + k];
float rp = ref_probs[bt * K + k];
float gp = gpu_probs[bt * K + k];
// Index must match (probabilities might have ties, but for random data they won't)
if (ri != gi) {
// Check if probability is close (might be a tie)
if (fabsf(rp - gp) > tol) {
printf("FAIL [bt=%d, k=%d]: ref_idx=%d gpu_idx=%d ref_prob=%.8f gpu_prob=%.8f\n",
bt, k, ri, gi, rp, gp);
ok = false;
failures++;
}
}
// Probability must match
if (fabsf(rp - gp) > tol) {
printf("FAIL [bt=%d, k=%d]: idx=%d ref_prob=%.8f gpu_prob=%.8f diff=%.2e\n",
bt, k, gi, rp, gp, fabsf(rp - gp));
ok = false;
failures++;
}
}
}
return ok;
}
// ---------- Main ----------
int main() {
constexpr int B = 4;
constexpr int T = 8;
constexpr int V = 1024; // manageable for CPU verification
constexpr int K = 10;
constexpr int N = B * T;
printf("=== Fused Softmax + Top-K Test ===\n");
printf("Shape: [B=%d, T=%d, V=%d], K=%d\n\n", B, T, V, K);
// Allocate and initialize
size_t logits_bytes = (size_t)N * V * sizeof(float);
size_t idx_bytes = (size_t)N * K * sizeof(int);
size_t prob_bytes = (size_t)N * K * sizeof(float);
std::vector<float> h_logits(N * V);
std::vector<int> h_idx_gpu(N * K);
std::vector<float> h_prob_gpu(N * K);
std::vector<int> h_idx_ref(N * K);
std::vector<float> h_prob_ref(N * K);
// Random logits with large range to stress numerical stability
srand(42);
for (auto& x : h_logits) {
x = ((float)rand() / RAND_MAX - 0.5f) * 40.0f; // range [-20, 20]
}
// GPU allocation
float *d_logits, *d_probs;
int *d_indices;
cudaMalloc(&d_logits, logits_bytes);
cudaMalloc(&d_indices, idx_bytes);
cudaMalloc(&d_probs, prob_bytes);
cudaMemcpy(d_logits, h_logits.data(), logits_bytes, cudaMemcpyHostToDevice);
// Launch kernel
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
printf("Launching fused kernel...\n");
cudaEventRecord(start);
launch_fused_softmax_topk<K>(d_logits, d_indices, d_probs, B, T, V);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float ms = 0;
cudaEventElapsedTime(&ms, start, stop);
printf("Kernel time: %.3f ms\n\n", ms);
// Copy results back
cudaMemcpy(h_idx_gpu.data(), d_indices, idx_bytes, cudaMemcpyDeviceToHost);
cudaMemcpy(h_prob_gpu.data(), d_probs, prob_bytes, cudaMemcpyDeviceToHost);
// CPU reference
printf("Running CPU reference...\n");
cpu_softmax_topk(h_logits.data(), h_idx_ref.data(), h_prob_ref.data(),
B, T, V, K);
// Verify
printf("Verifying...\n");
bool pass = verify(h_prob_ref.data(), h_idx_ref.data(),
h_prob_gpu.data(), h_idx_gpu.data(),
B, T, K);
printf("\n%s\n", pass ? "✓ ALL TESTS PASSED" : "✗ TESTS FAILED");
// Print a sample row
int row = 0;
printf("\nSample output (row %d):\n", row);
printf(" %-6s %-12s %-12s %-12s\n", "k", "Index", "GPU Prob", "Ref Prob");
printf(" %-6s %-12s %-12s %-12s\n", "---", "-----", "--------", "--------");
for (int k = 0; k < K; k++) {
printf(" %-6d %-12d %-12.8f %-12.8f\n", k,
h_idx_gpu[row * K + k],
h_prob_gpu[row * K + k],
h_prob_ref[row * K + k]);
}
// Check probability sums
float sum_gpu = 0, sum_ref = 0;
for (int k = 0; k < K; k++) {
sum_gpu += h_prob_gpu[row * K + k];
sum_ref += h_prob_ref[row * K + k];
}
printf("\n Sum of top-%d probs: GPU=%.8f Ref=%.8f\n", K, sum_gpu, sum_ref);
printf(" (Note: sum < 1.0 because K << V; these should match)\n");
// Bandwidth estimate
size_t total_read = logits_bytes;
size_t total_write = idx_bytes + prob_bytes;
double bw = (total_read + total_write) / (ms * 1e-3) / 1e9;
printf("\nEstimated effective bandwidth: %.1f GB/s\n", bw);
printf(" Reads: %zu bytes (%.1f KB)\n", total_read, total_read / 1024.0);
printf(" Writes: %zu bytes (%.1f KB)\n", total_write, total_write / 1024.0);
// Cleanup
cudaFree(d_logits);
cudaFree(d_indices);
cudaFree(d_probs);
cudaEventDestroy(start);
cudaEventDestroy(stop);
return pass ? 0 : 1;
}