#include "arg.h" #include "common.h" #include "log.h" #include "llama.h" #include "sampling.h" #include #include #include #include #include #include struct op_stat { int count = 0; double total_bytes_in = 0; double total_bytes_out = 0; }; static std::map g_op_stats; static bool g_collect = false; static bool eval_callback(struct ggml_tensor * t, bool ask, void * user_data) { (void)user_data; if (!g_collect) return false; if (t->op == GGML_OP_NONE) return false; if (!ask) return false; // after compute, just observe auto & s = g_op_stats[t->op]; s.count++; s.total_bytes_out += (double)ggml_nbytes(t); for (int i = 0; i < GGML_MAX_SRC; i++) { if (t->src[i]) s.total_bytes_in += (double)ggml_nbytes(t->src[i]); } return false; // no sync needed } int main(int argc, char ** argv) { std::setlocale(LC_NUMERIC, "C"); common_params params; params.n_predict = 32; params.cb_eval = eval_callback; params.cb_eval_user_data = nullptr; if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } llama_backend_init(); llama_numa_init(params.numa); common_init(); auto llama_init = common_init_from_params(params); auto * model = llama_init->model(); auto * ctx = llama_init->context(); if (!model || !ctx) { LOG_ERR("failed to init\n"); return 1; } const llama_vocab * vocab = llama_model_get_vocab(model); const bool add_bos = llama_vocab_get_add_bos(vocab); std::string prompt = params.prompt.empty() ? "The" : params.prompt; std::vector tokens = common_tokenize(ctx, prompt, add_bos, true); // Prefill LOG_INF("Prefilling %zu tokens...\n", tokens.size()); if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size())) != 0) { LOG_ERR("prefill failed\n"); return 1; } // Start collecting op stats g_collect = true; int n_gen = params.n_predict; LOG_INF("Generating %d tokens...\n", n_gen); llama_token new_token = common_sampler_sample(llama_init->sampler(0), ctx, -1); auto t_start = ggml_time_us(); for (int i = 0; i < n_gen; i++) { if (llama_decode(ctx, llama_batch_get_one(&new_token, 1)) != 0) { LOG_ERR("decode failed at step %d\n", i); break; } new_token = common_sampler_sample(llama_init->sampler(0), ctx, -1); } auto t_end = ggml_time_us(); double total_ms = (t_end - t_start) / 1000.0; double tps = n_gen / (total_ms / 1000.0); LOG("\n=== Tokgen Graph Profile ===\n"); LOG("Total: %d tokens in %.1f ms (%.1f tok/s)\n", n_gen, total_ms, tps); LOG("Total ops per decode step: %d\n\n", g_op_stats.empty() ? 
    double total_bytes = 0;
    for (const auto & [op, s] : g_op_stats) {
        total_bytes += s.total_bytes_out;
    }

    // sort by bytes_out descending
    std::vector<std::pair<ggml_op, op_stat>> sorted(g_op_stats.begin(), g_op_stats.end());
    std::sort(sorted.begin(), sorted.end(), [](const auto & a, const auto & b) {
        return a.second.total_bytes_out > b.second.total_bytes_out;
    });

    LOG("%-20s %8s %14s %14s %10s\n", "Op", "PerTick", "BytesIn/tk", "BytesOut/tk", "Frac%");
    LOG("%-20s %8s %14s %14s %10s\n", "---", "---", "---", "---", "---");
    for (const auto & [op, s] : sorted) {
        const double per_tick      = (double) s.count / n_gen;
        const double bytes_in_per  = s.total_bytes_in  / n_gen;
        const double bytes_out_per = s.total_bytes_out / n_gen;
        const double frac          = s.total_bytes_out / total_bytes * 100;
        LOG("%-20s %8.1f %14.0f %14.0f %9.1f%%\n",
                ggml_op_name(op), per_tick, bytes_in_per, bytes_out_per, frac);
    }

    // Estimate effective bandwidth if dominated by memory ops
    const double model_gib = (double) llama_model_size(model) / (1ull << 30);
    const double eff_bw    = tps * model_gib * 1.074; // GiB -> GB
    LOG("\nModel size: %.2f GiB\n", model_gib);
    LOG("Effective BW (model * tps): %.0f GB/s\n", eff_bw);
    LOG("M4 Max theoretical BW: ~400 GB/s\n");
    LOG("BW utilization: %.1f%%\n", eff_bw / 400 * 100);

    llama_perf_context_print(ctx);

    common_sampler_free(smpl);
    llama_backend_free();

    return 0;
}