[docs] add GIT.md with workflow and agent instructions

2026-04-30 18:11:44 +02:00
parent 683c5acb90
commit 222626cfdc
5 changed files with 786 additions and 0 deletions
@@ -1,7 +1,12 @@
 set(TARGET llama-eval-callback)
 add_executable(${TARGET} eval-callback.cpp)
+
+# Second executable built from eval-callback-profile.cpp; it links the same
+# libraries and uses the same C++ standard as ${TARGET} below.
+set(TARGET_PROFILE llama-eval-callback-profile)
+add_executable(${TARGET_PROFILE} eval-callback-profile.cpp)
+# NOTE(review): only ${TARGET} gets an install() rule; ${TARGET_PROFILE} is
+# built but not installed - confirm this is intended.
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET_PROFILE} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET_PROFILE} PRIVATE cxx_std_17)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

 if(LLAMA_BUILD_TESTS)
@@ -0,0 +1,136 @@
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+#include "sampling.h"
+#include <clocale>
+#include <numeric>
+#include <map>
+#include <string>
+#include <algorithm>
+#include <vector>
+
+// Accumulated statistics for one ggml op type, summed over every graph node
+// of that type that eval_callback() records.
+struct op_stat {
+    // number of graph nodes recorded with this op
+    int count = 0;
+    // sum of ggml_nbytes() over all non-null source tensors of those nodes
+    double total_bytes_in = 0;
+    // sum of ggml_nbytes() over the nodes' own (destination) tensors
+    double total_bytes_out = 0;
+};
+
+// Per-op statistics filled in by eval_callback() and reported by main().
+static std::map<enum ggml_op, op_stat> g_op_stats;
+// Recording switch: eval_callback() ignores every node while this is false.
+// main() sets it to true after the prefill decode, so only generation is profiled.
+static bool g_collect = false;
+
+// Eval callback (installed through common_params::cb_eval) that tallies
+// per-op statistics into g_op_stats while g_collect is true.
+//
+//   t         - graph node offered by the scheduler
+//   ask       - true:  the scheduler asks whether this node's data is wanted
+//               false: the scheduler hands over an already computed node
+//   user_data - unused
+//
+// All bookkeeping happens in the ask phase, using only tensor metadata (op
+// type and byte sizes). The callback therefore always answers "not wanted"
+// (false) there, so no tensor data has to be made available to it.
+//
+// Per the eval-callback contract documented in ggml-backend.h, returning false
+// when ask == false cancels the graph compute. With the answers given here
+// that phase is not expected to be reached, but it must never abort a decode,
+// so it returns true.
+static bool eval_callback(struct ggml_tensor * t, bool ask, void * user_data) {
+    (void) user_data;
+
+    if (!ask) {
+        // post-compute notification: nothing to record, let the compute continue
+        return true;
+    }
+
+    if (!g_collect) {
+        return false;
+    }
+    if (t->op == GGML_OP_NONE) {
+        // not the result of an operation - nothing to count
+        return false;
+    }
+
+    auto & stat = g_op_stats[t->op];
+    stat.count++;
+    stat.total_bytes_out += static_cast<double>(ggml_nbytes(t));
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (t->src[i]) {
+            stat.total_bytes_in += static_cast<double>(ggml_nbytes(t->src[i]));
+        }
+    }
+
+    return false;  // tensor data is not needed, only the metadata read above
+}
+
+// Token-generation graph profiler.
+//
+// Parses the common llama.cpp arguments, prefills the prompt ("The" when none
+// is given), then decodes up to params.n_predict tokens one at a time while
+// eval_callback() records per-op statistics. Afterwards it prints a table of
+// ops sorted by output bytes plus a rough bandwidth estimate derived from the
+// model size and the measured tokens per second.
+//
+// All per-token figures are normalised by the number of decode steps that
+// actually completed, not by the requested count, so an early decode failure
+// does not skew the averages.
+//
+// Returns 0 on success, 1 when parsing, initialisation or the prefill fails,
+// or when no decode step completed.
+int main(int argc, char ** argv) {
+    std::setlocale(LC_NUMERIC, "C");
+
+    common_params params;
+    // set before parsing, so the parser presumably may override it from argv
+    params.n_predict = 32;
+    params.cb_eval = eval_callback;
+    params.cb_eval_user_data = nullptr;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+        return 1;
+    }
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    common_init();
+
+    auto llama_init = common_init_from_params(params);
+    auto * model = llama_init->model();
+    auto * ctx = llama_init->context();
+
+    if (!model || !ctx) {
+        LOG_ERR("failed to init\n");
+        return 1;
+    }
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    const bool add_bos = llama_vocab_get_add_bos(vocab);
+    std::string prompt = params.prompt.empty() ? "The" : params.prompt;
+    std::vector<llama_token> tokens = common_tokenize(ctx, prompt, add_bos, true);
+
+    if (tokens.empty()) {
+        // an empty batch cannot be decoded and would leave nothing to sample from
+        LOG_ERR("prompt produced no tokens\n");
+        return 1;
+    }
+
+    // Prefill (g_collect is still false, so these ops are not recorded)
+    LOG_INF("Prefilling %zu tokens...\n", tokens.size());
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size())) != 0) {
+        LOG_ERR("prefill failed\n");
+        return 1;
+    }
+
+    // Start collecting op stats
+    g_collect = true;
+
+    const int n_gen = params.n_predict;
+    LOG_INF("Generating %d tokens...\n", n_gen);
+
+    llama_token new_token = common_sampler_sample(llama_init->sampler(0), ctx, -1);
+
+    const auto t_start = ggml_time_us();
+
+    // n_done counts the decode steps that completed; it is the divisor for
+    // every per-token figure below
+    int n_done = 0;
+    while (n_done < n_gen) {
+        if (llama_decode(ctx, llama_batch_get_one(&new_token, 1)) != 0) {
+            LOG_ERR("decode failed at step %d\n", n_done);
+            break;
+        }
+        new_token = common_sampler_sample(llama_init->sampler(0), ctx, -1);
+        n_done++;
+    }
+
+    const auto t_end = ggml_time_us();
+
+    if (n_done <= 0) {
+        // n_predict <= 0 or the very first step failed: every average below
+        // would be a division by zero
+        LOG_ERR("no tokens were generated, nothing to profile\n");
+        return 1;
+    }
+    if (n_done < n_gen) {
+        LOG_INF("only %d of %d tokens were generated; statistics cover the completed steps\n", n_done, n_gen);
+    }
+
+    // NOTE(review): the measured time includes sampling and the eval-callback
+    // overhead, so tok/s may be lower than in a run without the callback.
+    const double total_ms = (t_end - t_start) / 1000.0;
+    const double tps = total_ms > 0.0 ? n_done / (total_ms / 1000.0) : 0.0;
+
+    int total_ops = 0;
+    double total_bytes = 0;
+    for (const auto & entry : g_op_stats) {
+        total_ops   += entry.second.count;
+        total_bytes += entry.second.total_bytes_out;
+    }
+
+    LOG("\n=== Tokgen Graph Profile ===\n");
+    LOG("Total: %d tokens in %.1f ms (%.1f tok/s)\n", n_done, total_ms, tps);
+    LOG("Total ops per decode step: %d\n\n", total_ops / n_done);
+
+    // sort by bytes_out descending
+    std::vector<std::pair<enum ggml_op, op_stat>> sorted(g_op_stats.begin(), g_op_stats.end());
+    std::sort(sorted.begin(), sorted.end(), [](const auto & a, const auto & b) {
+        return a.second.total_bytes_out > b.second.total_bytes_out;
+    });
+
+    LOG("%-20s %8s %14s %14s %10s\n", "Op", "PerTick", "BytesIn/tk", "BytesOut/tk", "Frac%");
+    LOG("%-20s %8s %14s %14s %10s\n", "---", "---", "---", "---", "---");
+
+    for (const auto & [op, s] : sorted) {
+        const double per_tick = (double)s.count / n_done;
+        const double bytes_in_per = s.total_bytes_in / n_done;
+        const double bytes_out_per = s.total_bytes_out / n_done;
+        // share of all recorded output bytes; 0 when nothing was written at all
+        const double frac = total_bytes > 0.0 ? s.total_bytes_out / total_bytes * 100 : 0.0;
+        LOG("%-20s %8.1f %14.0f %14.0f %9.1f%%\n",
+            ggml_op_name(op), per_tick, bytes_in_per, bytes_out_per, frac);
+    }
+
+    // Estimate effective bandwidth if dominated by memory ops: assumes the
+    // whole model is read once per generated token.
+    // 1.074 ~= 2^30 / 1e9 converts GiB to GB.
+    const double model_gib = (double)llama_model_size(model) / (1ull << 30);
+    const double eff_bw = tps * model_gib * 1.074;
+    LOG("\nModel size: %.2f GiB\n", model_gib);
+    LOG("Effective BW (model * tps): %.0f GB/s\n", eff_bw);
+    // NOTE(review): the 400 GB/s reference is hard-coded for one specific
+    // machine; the utilisation figure is meaningless on other hardware.
+    LOG("M4 Max theoretical BW: ~400 GB/s\n");
+    LOG("BW utilization: %.1f%%\n", eff_bw / 400 * 100);
+
+    llama_perf_context_print(ctx);
+    llama_backend_free();
+    return 0;
+}