[docs] add GIT.md with workflow and agent instructions
CI (3rd-party) / ubuntu-24-llguidance (push) Waiting to run
CI (android) / android (push) Waiting to run
CI (android) / android-ndk (push) Waiting to run
CI (apple) / macOS-latest-ios (push) Waiting to run
CI (apple) / macos-latest-ios-xcode (push) Waiting to run
CI (apple) / macOS-latest-tvos (push) Waiting to run
CI (apple) / macOS-latest-visionos (push) Waiting to run
CI (apple) / macOS-latest-swift (generic/platform=iOS) (push) Blocked by required conditions
CI (apple) / macOS-latest-swift (generic/platform=macOS) (push) Blocked by required conditions
CI (apple) / macOS-latest-swift (generic/platform=tvOS) (push) Blocked by required conditions
CI (cann) / openEuler-latest-cann (aarch64, Release, 310p, off) (push) Waiting to run
CI (cann) / openEuler-latest-cann (aarch64, Release, 910b, off) (push) Waiting to run
CI (cann) / openEuler-latest-cann (aarch64, Release, 910b, on) (push) Waiting to run
CI (cann) / openEuler-latest-cann (x86, Release, 310p, off) (push) Waiting to run
CI (cann) / openEuler-latest-cann (x86, Release, 910b, off) (push) Waiting to run
CI (cann) / openEuler-latest-cann (x86, Release, 910b, on) (push) Waiting to run
CI (riscv) / ubuntu-riscv64-native-sanitizer (Debug, ADDRESS) (push) Waiting to run
CI (riscv) / ubuntu-riscv64-native-sanitizer (Debug, THREAD) (push) Waiting to run
CI (riscv) / ubuntu-riscv64-native-sanitizer (Debug, UNDEFINED) (push) Waiting to run
CI (sanitize) / ubuntu-latest-sanitizer (Debug, ADDRESS) (push) Waiting to run
CI (sanitize) / ubuntu-latest-sanitizer (Debug, THREAD) (push) Waiting to run
CI (sanitize) / ubuntu-latest-sanitizer (Debug, UNDEFINED) (push) Waiting to run
CI (openvino) / ubuntu-24-openvino-GPU (push) Has been cancelled
CI (self-hosted) / ggml-ci-nvidia-cuda (push) Waiting to run
CI (self-hosted) / ggml-ci-nvidia-vulkan-cm (push) Waiting to run
CI (self-hosted) / ggml-ci-nvidia-vulkan-cm2 (push) Waiting to run
CI (self-hosted) / ggml-ci-mac-metal (push) Waiting to run
CI (self-hosted) / ggml-ci-mac-webgpu (push) Waiting to run
CI (self-hosted) / ggml-ci-mac-vulkan (push) Waiting to run
CI (self-hosted) / ggml-ci-linux-intel-vulkan (push) Waiting to run
CI (self-hosted) / ggml-ci-win-intel-vulkan (push) Waiting to run
CI (sycl) / ubuntu-24-sycl (fp16, ON) (push) Waiting to run
CI (sycl) / ubuntu-24-sycl (fp32, OFF) (push) Waiting to run
CI (sycl) / windows-latest-sycl (push) Waiting to run
CI (vulkan) / ubuntu-24-vulkan-llvmpipe (push) Waiting to run
CI / build-cmake-pkg (push) Waiting to run
CI / macOS-latest-arm64 (push) Waiting to run
CI / macOS-latest-x64 (push) Waiting to run
CI / macOS-latest-arm64-webgpu (push) Waiting to run
CI / ubuntu-cpu (arm64, ubuntu-24.04-arm) (push) Waiting to run
CI / ubuntu-cpu (ppc64le, ubuntu-24.04-ppc64le) (push) Waiting to run
CI / ubuntu-cpu (s390x, ubuntu-24.04-s390x) (push) Waiting to run
CI / ubuntu-cpu (x64, ubuntu-22.04) (push) Waiting to run
CI / android-arm64 (push) Waiting to run
CI / ubuntu-latest-rpc (push) Waiting to run
CI / ubuntu-24-vulkan (arm64, ubuntu-24.04-arm) (push) Waiting to run
CI / ubuntu-24-vulkan (x64, ubuntu-24.04) (push) Waiting to run
CI / ubuntu-24-webgpu (push) Waiting to run
CI / ubuntu-24-webgpu-wasm (push) Waiting to run
CI / ubuntu-22-hip (push) Waiting to run
CI / ubuntu-22-musa (push) Waiting to run
CI / windows-latest (arm64, llvm-arm64, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON) (push) Waiting to run
CI / windows-latest (arm64, llvm-arm64-opencl-adreno, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON) (push) Waiting to run
CI / windows-latest (x64, cpu-x64 (static), -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF) (push) Waiting to run
CI / windows-latest (x64, openblas-x64, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DG… (push) Waiting to run
CI / windows-latest (x64, vulkan-x64, -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON) (push) Waiting to run
CI / ubuntu-latest-cuda (push) Waiting to run
CI / windows-2022-cuda (12.4) (push) Waiting to run
CI / windows-latest-hip (push) Waiting to run
CI / ubuntu-cpu-riscv64-native (push) Waiting to run
CI / ggml-ci-x64-cpu-low-perf (push) Waiting to run
CI / ggml-ci-arm64-cpu-low-perf (push) Waiting to run
CI / ggml-ci-x64-cpu-high-perf (push) Waiting to run
CI / ggml-ci-arm64-cpu-high-perf (push) Waiting to run
CI / ggml-ci-arm64-cpu-high-perf-sve (push) Waiting to run
CI / ggml-ci-arm64-cpu-kleidiai (push) Waiting to run
CI / ggml-ci-arm64-cpu-kleidiai-graviton4 (push) Waiting to run
EditorConfig Checker / editorconfig (push) Waiting to run
Release / macOS-cpu (arm64, arm64, -DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON, macos-14) (push) Waiting to run
Release / macOS-cpu (arm64, arm64-kleidiai, -DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON -DGGML_CPU_KLEIDIAI=ON, macos-14) (push) Waiting to run
Release / macOS-cpu (x64, x64, -DGGML_METAL=OFF -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3, macos-15-intel) (push) Waiting to run
Release / ubuntu-cpu (arm64, ubuntu-24.04-arm) (push) Waiting to run
Release / ubuntu-cpu (s390x, ubuntu-24.04-s390x) (push) Waiting to run
Release / ubuntu-cpu (x64, ubuntu-22.04) (push) Waiting to run
Release / ubuntu-vulkan (arm64, ubuntu-24.04-arm) (push) Waiting to run
Release / ubuntu-vulkan (x64, ubuntu-22.04) (push) Waiting to run
Release / android-arm64 (push) Waiting to run
Release / ubuntu-24-openvino (push) Waiting to run
Release / windows-cpu (arm64) (push) Waiting to run
Release / windows-cpu (x64) (push) Waiting to run
Release / windows (arm64, opencl-adreno, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON, ggml-opencl) (push) Waiting to run
Release / windows (x64, vulkan, -DGGML_VULKAN=ON, ggml-vulkan) (push) Waiting to run
Release / windows-cuda (12.4) (push) Waiting to run
Release / windows-cuda (13.1) (push) Waiting to run
Release / windows-sycl (push) Waiting to run
Release / ubuntu-24-sycl (fp16, ON) (push) Waiting to run
Release / ubuntu-24-sycl (fp32, OFF) (push) Waiting to run
Release / ubuntu-22-rocm (7.2.1, x64, gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1150;gfx1200;gfx1201) (push) Waiting to run
Release / windows-hip (gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032, radeon) (push) Waiting to run
Release / ios-xcode-build (push) Waiting to run
Release / openEuler-cann (aarch64, Release, 310p, off) (push) Waiting to run
Release / openEuler-cann (aarch64, Release, 910b, on) (push) Waiting to run
Release / openEuler-cann (x86, Release, 310p, off) (push) Waiting to run
Release / openEuler-cann (x86, Release, 910b, on) (push) Waiting to run
Release / release (push) Blocked by required conditions
Server (sanitize) / server (RelWithDebInfo, ADDRESS) (push) Waiting to run
Server (sanitize) / server (RelWithDebInfo, UNDEFINED) (push) Waiting to run
Server (self-hosted) / server-metal (GPUx2, backend-sampling) (push) Waiting to run
Server (self-hosted) / server-metal (GPUx2) (push) Waiting to run
Server (self-hosted) / server-metal (GPUx1) (push) Waiting to run
Server (self-hosted) / server-metal (GPUx1, backend-sampling) (push) Waiting to run
Server / server (default) (push) Waiting to run
Server / server (backend-sampling) (push) Waiting to run
Server / server-windows (push) Waiting to run
CI (openvino) / ubuntu-24-openvino-CPU (push) Has been cancelled
CI (self-hosted) / ggml-ci-intel-openvino-gpu-low-perf (push) Has been cancelled
CI (3rd-party) / ubuntu-24-llguidance (push) Waiting to run
CI (android) / android (push) Waiting to run
CI (android) / android-ndk (push) Waiting to run
CI (apple) / macOS-latest-ios (push) Waiting to run
CI (apple) / macos-latest-ios-xcode (push) Waiting to run
CI (apple) / macOS-latest-tvos (push) Waiting to run
CI (apple) / macOS-latest-visionos (push) Waiting to run
CI (apple) / macOS-latest-swift (generic/platform=iOS) (push) Blocked by required conditions
CI (apple) / macOS-latest-swift (generic/platform=macOS) (push) Blocked by required conditions
CI (apple) / macOS-latest-swift (generic/platform=tvOS) (push) Blocked by required conditions
CI (cann) / openEuler-latest-cann (aarch64, Release, 310p, off) (push) Waiting to run
CI (cann) / openEuler-latest-cann (aarch64, Release, 910b, off) (push) Waiting to run
CI (cann) / openEuler-latest-cann (aarch64, Release, 910b, on) (push) Waiting to run
CI (cann) / openEuler-latest-cann (x86, Release, 310p, off) (push) Waiting to run
CI (cann) / openEuler-latest-cann (x86, Release, 910b, off) (push) Waiting to run
CI (cann) / openEuler-latest-cann (x86, Release, 910b, on) (push) Waiting to run
CI (riscv) / ubuntu-riscv64-native-sanitizer (Debug, ADDRESS) (push) Waiting to run
CI (riscv) / ubuntu-riscv64-native-sanitizer (Debug, THREAD) (push) Waiting to run
CI (riscv) / ubuntu-riscv64-native-sanitizer (Debug, UNDEFINED) (push) Waiting to run
CI (sanitize) / ubuntu-latest-sanitizer (Debug, ADDRESS) (push) Waiting to run
CI (sanitize) / ubuntu-latest-sanitizer (Debug, THREAD) (push) Waiting to run
CI (sanitize) / ubuntu-latest-sanitizer (Debug, UNDEFINED) (push) Waiting to run
CI (openvino) / ubuntu-24-openvino-GPU (push) Has been cancelled
CI (self-hosted) / ggml-ci-nvidia-cuda (push) Waiting to run
CI (self-hosted) / ggml-ci-nvidia-vulkan-cm (push) Waiting to run
CI (self-hosted) / ggml-ci-nvidia-vulkan-cm2 (push) Waiting to run
CI (self-hosted) / ggml-ci-mac-metal (push) Waiting to run
CI (self-hosted) / ggml-ci-mac-webgpu (push) Waiting to run
CI (self-hosted) / ggml-ci-mac-vulkan (push) Waiting to run
CI (self-hosted) / ggml-ci-linux-intel-vulkan (push) Waiting to run
CI (self-hosted) / ggml-ci-win-intel-vulkan (push) Waiting to run
CI (sycl) / ubuntu-24-sycl (fp16, ON) (push) Waiting to run
CI (sycl) / ubuntu-24-sycl (fp32, OFF) (push) Waiting to run
CI (sycl) / windows-latest-sycl (push) Waiting to run
CI (vulkan) / ubuntu-24-vulkan-llvmpipe (push) Waiting to run
CI / build-cmake-pkg (push) Waiting to run
CI / macOS-latest-arm64 (push) Waiting to run
CI / macOS-latest-x64 (push) Waiting to run
CI / macOS-latest-arm64-webgpu (push) Waiting to run
CI / ubuntu-cpu (arm64, ubuntu-24.04-arm) (push) Waiting to run
CI / ubuntu-cpu (ppc64le, ubuntu-24.04-ppc64le) (push) Waiting to run
CI / ubuntu-cpu (s390x, ubuntu-24.04-s390x) (push) Waiting to run
CI / ubuntu-cpu (x64, ubuntu-22.04) (push) Waiting to run
CI / android-arm64 (push) Waiting to run
CI / ubuntu-latest-rpc (push) Waiting to run
CI / ubuntu-24-vulkan (arm64, ubuntu-24.04-arm) (push) Waiting to run
CI / ubuntu-24-vulkan (x64, ubuntu-24.04) (push) Waiting to run
CI / ubuntu-24-webgpu (push) Waiting to run
CI / ubuntu-24-webgpu-wasm (push) Waiting to run
CI / ubuntu-22-hip (push) Waiting to run
CI / ubuntu-22-musa (push) Waiting to run
CI / windows-latest (arm64, llvm-arm64, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON) (push) Waiting to run
CI / windows-latest (arm64, llvm-arm64-opencl-adreno, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON) (push) Waiting to run
CI / windows-latest (x64, cpu-x64 (static), -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF) (push) Waiting to run
CI / windows-latest (x64, openblas-x64, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DG… (push) Waiting to run
CI / windows-latest (x64, vulkan-x64, -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON) (push) Waiting to run
CI / ubuntu-latest-cuda (push) Waiting to run
CI / windows-2022-cuda (12.4) (push) Waiting to run
CI / windows-latest-hip (push) Waiting to run
CI / ubuntu-cpu-riscv64-native (push) Waiting to run
CI / ggml-ci-x64-cpu-low-perf (push) Waiting to run
CI / ggml-ci-arm64-cpu-low-perf (push) Waiting to run
CI / ggml-ci-x64-cpu-high-perf (push) Waiting to run
CI / ggml-ci-arm64-cpu-high-perf (push) Waiting to run
CI / ggml-ci-arm64-cpu-high-perf-sve (push) Waiting to run
CI / ggml-ci-arm64-cpu-kleidiai (push) Waiting to run
CI / ggml-ci-arm64-cpu-kleidiai-graviton4 (push) Waiting to run
EditorConfig Checker / editorconfig (push) Waiting to run
Release / macOS-cpu (arm64, arm64, -DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON, macos-14) (push) Waiting to run
Release / macOS-cpu (arm64, arm64-kleidiai, -DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON -DGGML_CPU_KLEIDIAI=ON, macos-14) (push) Waiting to run
Release / macOS-cpu (x64, x64, -DGGML_METAL=OFF -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3, macos-15-intel) (push) Waiting to run
Release / ubuntu-cpu (arm64, ubuntu-24.04-arm) (push) Waiting to run
Release / ubuntu-cpu (s390x, ubuntu-24.04-s390x) (push) Waiting to run
Release / ubuntu-cpu (x64, ubuntu-22.04) (push) Waiting to run
Release / ubuntu-vulkan (arm64, ubuntu-24.04-arm) (push) Waiting to run
Release / ubuntu-vulkan (x64, ubuntu-22.04) (push) Waiting to run
Release / android-arm64 (push) Waiting to run
Release / ubuntu-24-openvino (push) Waiting to run
Release / windows-cpu (arm64) (push) Waiting to run
Release / windows-cpu (x64) (push) Waiting to run
Release / windows (arm64, opencl-adreno, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON, ggml-opencl) (push) Waiting to run
Release / windows (x64, vulkan, -DGGML_VULKAN=ON, ggml-vulkan) (push) Waiting to run
Release / windows-cuda (12.4) (push) Waiting to run
Release / windows-cuda (13.1) (push) Waiting to run
Release / windows-sycl (push) Waiting to run
Release / ubuntu-24-sycl (fp16, ON) (push) Waiting to run
Release / ubuntu-24-sycl (fp32, OFF) (push) Waiting to run
Release / ubuntu-22-rocm (7.2.1, x64, gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1150;gfx1200;gfx1201) (push) Waiting to run
Release / windows-hip (gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032, radeon) (push) Waiting to run
Release / ios-xcode-build (push) Waiting to run
Release / openEuler-cann (aarch64, Release, 310p, off) (push) Waiting to run
Release / openEuler-cann (aarch64, Release, 910b, on) (push) Waiting to run
Release / openEuler-cann (x86, Release, 310p, off) (push) Waiting to run
Release / openEuler-cann (x86, Release, 910b, on) (push) Waiting to run
Release / release (push) Blocked by required conditions
Server (sanitize) / server (RelWithDebInfo, ADDRESS) (push) Waiting to run
Server (sanitize) / server (RelWithDebInfo, UNDEFINED) (push) Waiting to run
Server (self-hosted) / server-metal (GPUx2, backend-sampling) (push) Waiting to run
Server (self-hosted) / server-metal (GPUx2) (push) Waiting to run
Server (self-hosted) / server-metal (GPUx1) (push) Waiting to run
Server (self-hosted) / server-metal (GPUx1, backend-sampling) (push) Waiting to run
Server / server (default) (push) Waiting to run
Server / server (backend-sampling) (push) Waiting to run
Server / server-windows (push) Waiting to run
CI (openvino) / ubuntu-24-openvino-CPU (push) Has been cancelled
CI (self-hosted) / ggml-ci-intel-openvino-gpu-low-perf (push) Has been cancelled
This commit is contained in:
@@ -1,7 +1,12 @@
|
||||
set(TARGET llama-eval-callback)
|
||||
add_executable(${TARGET} eval-callback.cpp)
|
||||
|
||||
set(TARGET_PROFILE llama-eval-callback-profile)
|
||||
add_executable(${TARGET_PROFILE} eval-callback-profile.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET_PROFILE} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET_PROFILE} PRIVATE cxx_std_17)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
if(LLAMA_BUILD_TESTS)
|
||||
|
||||
@@ -0,0 +1,136 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
#include "sampling.h"
|
||||
#include <clocale>
|
||||
#include <numeric>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
|
||||
struct op_stat {
|
||||
int count = 0;
|
||||
double total_bytes_in = 0;
|
||||
double total_bytes_out = 0;
|
||||
};
|
||||
|
||||
static std::map<enum ggml_op, op_stat> g_op_stats;
|
||||
static bool g_collect = false;
|
||||
|
||||
static bool eval_callback(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||
(void)user_data;
|
||||
if (!g_collect) return false;
|
||||
if (t->op == GGML_OP_NONE) return false;
|
||||
if (!ask) return false; // after compute, just observe
|
||||
|
||||
auto & s = g_op_stats[t->op];
|
||||
s.count++;
|
||||
s.total_bytes_out += (double)ggml_nbytes(t);
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (t->src[i]) s.total_bytes_in += (double)ggml_nbytes(t->src[i]);
|
||||
}
|
||||
|
||||
return false; // no sync needed
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
|
||||
common_params params;
|
||||
params.n_predict = 32;
|
||||
params.cb_eval = eval_callback;
|
||||
params.cb_eval_user_data = nullptr;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
common_init();
|
||||
|
||||
auto llama_init = common_init_from_params(params);
|
||||
auto * model = llama_init->model();
|
||||
auto * ctx = llama_init->context();
|
||||
|
||||
if (!model || !ctx) {
|
||||
LOG_ERR("failed to init\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
const bool add_bos = llama_vocab_get_add_bos(vocab);
|
||||
std::string prompt = params.prompt.empty() ? "The" : params.prompt;
|
||||
std::vector<llama_token> tokens = common_tokenize(ctx, prompt, add_bos, true);
|
||||
|
||||
// Prefill
|
||||
LOG_INF("Prefilling %zu tokens...\n", tokens.size());
|
||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size())) != 0) {
|
||||
LOG_ERR("prefill failed\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Start collecting op stats
|
||||
g_collect = true;
|
||||
|
||||
int n_gen = params.n_predict;
|
||||
LOG_INF("Generating %d tokens...\n", n_gen);
|
||||
|
||||
llama_token new_token = common_sampler_sample(llama_init->sampler(0), ctx, -1);
|
||||
|
||||
auto t_start = ggml_time_us();
|
||||
|
||||
for (int i = 0; i < n_gen; i++) {
|
||||
if (llama_decode(ctx, llama_batch_get_one(&new_token, 1)) != 0) {
|
||||
LOG_ERR("decode failed at step %d\n", i);
|
||||
break;
|
||||
}
|
||||
new_token = common_sampler_sample(llama_init->sampler(0), ctx, -1);
|
||||
}
|
||||
|
||||
auto t_end = ggml_time_us();
|
||||
|
||||
double total_ms = (t_end - t_start) / 1000.0;
|
||||
double tps = n_gen / (total_ms / 1000.0);
|
||||
|
||||
LOG("\n=== Tokgen Graph Profile ===\n");
|
||||
LOG("Total: %d tokens in %.1f ms (%.1f tok/s)\n", n_gen, total_ms, tps);
|
||||
LOG("Total ops per decode step: %d\n\n", g_op_stats.empty() ? 0 :
|
||||
std::accumulate(g_op_stats.begin(), g_op_stats.end(), 0, [](int s, const auto & p) { return s + p.second.count; }) / n_gen);
|
||||
|
||||
double total_bytes = 0;
|
||||
for (auto & [op, s] : g_op_stats) total_bytes += s.total_bytes_out;
|
||||
|
||||
// sort by bytes_out descending
|
||||
std::vector<std::pair<enum ggml_op, op_stat>> sorted(g_op_stats.begin(), g_op_stats.end());
|
||||
std::sort(sorted.begin(), sorted.end(), [](auto & a, auto & b) {
|
||||
return a.second.total_bytes_out > b.second.total_bytes_out;
|
||||
});
|
||||
|
||||
LOG("%-20s %8s %14s %14s %10s\n", "Op", "PerTick", "BytesIn/tk", "BytesOut/tk", "Frac%");
|
||||
LOG("%-20s %8s %14s %14s %10s\n", "---", "---", "---", "---", "---");
|
||||
|
||||
for (auto & [op, s] : sorted) {
|
||||
double per_tick = (double)s.count / n_gen;
|
||||
double bytes_in_per = s.total_bytes_in / n_gen;
|
||||
double bytes_out_per = s.total_bytes_out / n_gen;
|
||||
double frac = s.total_bytes_out / total_bytes * 100;
|
||||
LOG("%-20s %8.1f %14.0f %14.0f %9.1f%%\n",
|
||||
ggml_op_name(op), per_tick, bytes_in_per, bytes_out_per, frac);
|
||||
}
|
||||
|
||||
// Estimate effective bandwidth if dominated by memory ops
|
||||
double model_gib = (double)llama_model_size(model) / (1ull << 30);
|
||||
double eff_bw = tps * model_gib * 1.074;
|
||||
LOG("\nModel size: %.2f GiB\n", model_gib);
|
||||
LOG("Effective BW (model * tps): %.0f GB/s\n", eff_bw);
|
||||
LOG("M4 Max theoretical BW: ~400 GB/s\n");
|
||||
LOG("BW utilization: %.1f%%\n", eff_bw / 400 * 100);
|
||||
|
||||
llama_perf_context_print(ctx);
|
||||
llama_backend_free();
|
||||
return 0;
|
||||
}
|
||||
Reference in New Issue
Block a user