Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 31ce8b1ae5 | |||
| 06f05e71c1 | |||
| eeb79b026b |
@@ -52,59 +52,9 @@ Issue labels:
|
|||||||
2. Make changes, commit with `[area] description` conventions (see below)
|
2. Make changes, commit with `[area] description` conventions (see below)
|
||||||
3. Push branch: `git push gitea feature/<name>`
|
3. Push branch: `git push gitea feature/<name>`
|
||||||
4. Create PR on Gitea targeting `master`
|
4. Create PR on Gitea targeting `master`
|
||||||
5. Before merge: build, benchmark (record in BENCHMARKS.md), perplexity check if kernel changed, **coherence test** (see below)
|
5. Before merge: build, benchmark (record in BENCHMARKS.md), perplexity check if kernel changed
|
||||||
6. Squash-merge to master
|
6. Squash-merge to master
|
||||||
|
|
||||||
## Pre-Merge Coherence Tests
|
|
||||||
|
|
||||||
**Mandatory before any merge to master.** Run on both `master` and the PR branch to detect silent correctness regressions.
|
|
||||||
|
|
||||||
> **IMPORTANT:** macOS has no `timeout` command. Use `gtimeout` (from `brew install coreutils`).
|
|
||||||
|
|
||||||
### Quick Coherence Test (4B model, ~30s)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
gtimeout 60 ./build-build/bin/llama-cli \
|
|
||||||
-m ~/.llama/models/Qwen3.5-4B-Q4_0.gguf \
|
|
||||||
-n 64 -p "Once upon a time" \
|
|
||||||
--temp 0 -s 42 -st
|
|
||||||
```
|
|
||||||
|
|
||||||
### Perplexity Check (kernel changes only)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
gtimeout 120 ./build-build/bin/llama-perplexity \
|
|
||||||
-m ~/.llama/models/Qwen3.5-4B-Q4_0.gguf \
|
|
||||||
-f /tmp/coherence_test.txt -t 1 --chunks 1 -c 128
|
|
||||||
```
|
|
||||||
|
|
||||||
### Verification
|
|
||||||
|
|
||||||
- **llama-cli**: PR output must be coherent speech (not gibberish). Does not need to be bit-perfect vs master.
|
|
||||||
- **perplexity**: PR perplexity must match master within floating-point tolerance (<0.1% delta)
|
|
||||||
- **Gibberish output = block merge.** Re-dispatch with specific feedback.
|
|
||||||
|
|
||||||
### Timeout Policy
|
|
||||||
|
|
||||||
**All test commands MUST use `gtimeout`** to prevent hangs:
|
|
||||||
- Inference/cli: `gtimeout 60` (60s)
|
|
||||||
- Perplexity: `gtimeout 120` (120s)
|
|
||||||
- Benchmark: `gtimeout 300` (5min)
|
|
||||||
- Build: `gtimeout 300` (5min)
|
|
||||||
|
|
||||||
A hung test is a test failure. Do not retry without investigating the hang.
|
|
||||||
|
|
||||||
### IMPORTANT: llama-cli interactive mode
|
|
||||||
|
|
||||||
llama-cli enters interactive REPL after generating, flooding output with `>` prompts. This is NOT a correctness failure — it's the CLI waiting for input.
|
|
||||||
**Always use `--single-turn` (`-st`) flag to prevent this:**
|
|
||||||
|
|
||||||
```bash
|
|
||||||
gtimeout 60 ./build-build/bin/llama-cli -m ~/.llama/models/Qwen3.5-4B-Q4_0.gguf -n 64 -p "Once upon a time" --temp 0 -s 42 -st
|
|
||||||
```
|
|
||||||
|
|
||||||
Without `-st`, you will see `>` garbage and the process will hang. DO NOT attempt to "fix" the kernel because of this.
|
|
||||||
|
|
||||||
## Commit Messages
|
## Commit Messages
|
||||||
|
|
||||||
Format: `[area] short description (max 72 chars)`
|
Format: `[area] short description (max 72 chars)`
|
||||||
@@ -127,23 +77,15 @@ When working autonomously, agents MUST:
|
|||||||
2. **Create a branch** for any code change: `feature/<issue-number>-<short-desc>`
|
2. **Create a branch** for any code change: `feature/<issue-number>-<short-desc>`
|
||||||
3. **Reference the issue** in commits: `[area] description (#123)`
|
3. **Reference the issue** in commits: `[area] description (#123)`
|
||||||
4. **Run benchmarks** before/after kernel changes and record in BENCHMARKS.md
|
4. **Run benchmarks** before/after kernel changes and record in BENCHMARKS.md
|
||||||
5. **Run perplexity** to verify correctness after any kernel change (with timeout):
|
5. **Run perplexity** to verify correctness after any kernel change:
|
||||||
```bash
|
```bash
|
||||||
gtimeout 120 ./build-build/bin/llama-perplexity -m MODEL.gguf -f /tmp/coherence_test.txt -t 1 --chunks 1 -c 128
|
./build-build/bin/llama-perplexity -m MODEL.gguf -f /tmp/coherence_test.txt -t 1 --chunks 1 -c 128
|
||||||
```
|
```
|
||||||
6. **Run coherence test** before any merge (with timeout):
|
6. **Build succeeds** before pushing:
|
||||||
```bash
|
```bash
|
||||||
gtimeout 60 ./build-build/bin/llama-cli -m ~/.llama/models/Qwen3.5-4B-Q4_0.gguf -n 64 -p "Once upon a time" --temp 0 -s 42 -st
|
cmake --build build-build -j$(sysctl -n hw.ncpu)
|
||||||
```
|
```
|
||||||
Output must be coherent speech (not gibberish).
|
7. **Push branch** to gitea, then **create PR via Gitea API** (not via git push)
|
||||||
7. **Build succeeds** before pushing (with timeout):
|
|
||||||
```bash
|
|
||||||
gtimeout 300 cmake --build build-build -j$(sysctl -n hw.ncpu)
|
|
||||||
```
|
|
||||||
8. **Push branch** to gitea, then **create PR via Gitea API** (not via git push)
|
|
||||||
|
|
||||||
> **NOTE:** macOS has no `timeout` command. Always use `gtimeout` (from `brew install coreutils`).
|
|
||||||
> **NOTE:** Always use `-st` flag with llama-cli to prevent interactive mode `>` prompts.
|
|
||||||
|
|
||||||
## Build
|
## Build
|
||||||
|
|
||||||
@@ -171,20 +113,6 @@ cmake --build build-build --target llama-eval-callback-profile -j$(sysctl -n hw.
|
|||||||
./build-build/bin/llama-perplexity -m MODEL.gguf -f /tmp/coherence_test.txt -t 1 --chunks 1 -c 128
|
./build-build/bin/llama-perplexity -m MODEL.gguf -f /tmp/coherence_test.txt -t 1 --chunks 1 -c 128
|
||||||
```
|
```
|
||||||
|
|
||||||
## MLX Benchmarking
|
|
||||||
|
|
||||||
MLX-lm is the performance target. Models at `~/.omlx/models/`.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Quick generation test
|
|
||||||
mlx_lm.generate --model ~/.omlx/models/Qwen3.6-27B-Q4_0 --prompt "Once upon a time" --max-tokens 128
|
|
||||||
|
|
||||||
# Benchmark with timing
|
|
||||||
time mlx_lm.generate --model ~/.omlx/models/Qwen3.6-27B-Q4_0 --prompt "Once upon a time" --max-tokens 4096
|
|
||||||
```
|
|
||||||
|
|
||||||
Compare llama.cpp results against MLX baselines. Record in BENCHMARKS.md.
|
|
||||||
|
|
||||||
## Profiling Tools
|
## Profiling Tools
|
||||||
|
|
||||||
| Tool | What it does |
|
| Tool | What it does |
|
||||||
@@ -224,47 +152,19 @@ ANALYSIS_QWEN3_5_MXFP4.md — MXFP4 format analysis
|
|||||||
## Gitea API
|
## Gitea API
|
||||||
|
|
||||||
Base: `https://git.kokoham.com/api/v1`
|
Base: `https://git.kokoham.com/api/v1`
|
||||||
Token in `~/Documents/personal/projects/.env` as `GITEA_TOKEN`.
|
Token in `~/.gitea_token` (not committed).
|
||||||
|
Local API from server: `http://127.0.0.1:18431/api/v1`
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
export $(grep -v '^#' ~/Documents/personal/projects/.env | xargs)
|
|
||||||
|
|
||||||
# Create issue
|
# Create issue
|
||||||
curl -X POST "https://git.kokoham.com/api/v1/repos/sleepy/llama.cpp/issues" \
|
curl -X POST "http://127.0.0.1:18431/api/v1/repos/sleepy/llama.cpp/issues" \
|
||||||
-H "Authorization: token $GITEA_TOKEN" \
|
-H "Authorization: token $(cat ~/.gitea_token)" \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-d '{"title":"...","body":"...","labels":["perf"]}'
|
-d '{"title":"...","body":"...","labels":["perf"]}'
|
||||||
|
|
||||||
# Create PR
|
# Create PR
|
||||||
curl -X POST "https://git.kokoham.com/api/v1/repos/sleepy/llama.cpp/pulls" \
|
curl -X POST "http://127.0.0.1:18431/api/v1/repos/sleepy/llama.cpp/pulls" \
|
||||||
-H "Authorization: token $GITEA_TOKEN" \
|
-H "Authorization: token $(cat ~/.gitea_token)" \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-d '{"title":"...","body":"...","head":"feature/xyz","base":"master"}'
|
-d '{"title":"...","body":"...","head":"feature/xyz","base":"master"}'
|
||||||
|
|
||||||
# Comment on issue/PR
|
|
||||||
curl -X POST "https://git.kokoham.com/api/v1/repos/sleepy/llama.cpp/issues/{number}/comments" \
|
|
||||||
-H "Authorization: token $GITEA_TOKEN" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{"body":"..."}'
|
|
||||||
|
|
||||||
# Close issue
|
|
||||||
curl -X PATCH "https://git.kokoham.com/api/v1/repos/sleepy/llama.cpp/issues/{number}" \
|
|
||||||
-H "Authorization: token $GITEA_TOKEN" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{"state":"closed"}'
|
|
||||||
|
|
||||||
# Merge PR
|
|
||||||
curl -X POST "https://git.kokoham.com/api/v1/repos/sleepy/llama.cpp/pulls/{number}/merge" \
|
|
||||||
-H "Authorization: token $GITEA_TOKEN" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{"do_force_merge":false,"merge_title":"..."}'
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Onboarding — What to Read
|
|
||||||
|
|
||||||
For new agents or sessions, read in this order:
|
|
||||||
1. **GIT.md** — this file (workflow, tests, commands, file locations)
|
|
||||||
2. **BENCHMARKS.md** — all benchmark results, track progress toward 22 t/s
|
|
||||||
3. **ANALYSIS_QWEN3_5_MXFP4.md** — MXFP4 format analysis (if relevant)
|
|
||||||
4. **Issue #40** — target t/s goal and MLX comparison guidelines
|
|
||||||
5. **MLX reference**: `../mlx-lm/mlx/include/mlx/backend/metal/kernels/quantized.h` — qmv_fast_impl
|
|
||||||
|
|||||||
@@ -732,6 +732,7 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv(ggml_meta
|
|||||||
int nr1 = 1; // number of src1 rows per threadgroup
|
int nr1 = 1; // number of src1 rows per threadgroup
|
||||||
|
|
||||||
size_t smem = 0; // shared memory
|
size_t smem = 0; // shared memory
|
||||||
|
bool contig = false;
|
||||||
|
|
||||||
const ggml_type tsrc0 = op->src[0]->type;
|
const ggml_type tsrc0 = op->src[0]->type;
|
||||||
const ggml_type tsrc1 = op->src[1]->type;
|
const ggml_type tsrc1 = op->src[1]->type;
|
||||||
@@ -766,6 +767,7 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv(ggml_meta
|
|||||||
{
|
{
|
||||||
nsg = N_SG_Q4_0;
|
nsg = N_SG_Q4_0;
|
||||||
nr0 = N_R0_Q4_0;
|
nr0 = N_R0_Q4_0;
|
||||||
|
contig = ne00 >= 256;
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q4_1:
|
case GGML_TYPE_Q4_1:
|
||||||
{
|
{
|
||||||
@@ -877,7 +879,7 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv(ggml_meta
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
snprintf(base, 256, "kernel_mul_mv_%s_%s%s", ggml_type_name(tsrc0), ggml_type_name(tsrc1), suffix);
|
snprintf(base, 256, "kernel_mul_mv_%s_%s%s%s", ggml_type_name(tsrc0), ggml_type_name(tsrc1), contig ? "_c" : "", suffix);
|
||||||
snprintf(name, 256, "%s_nsg=%d", base, nsg);
|
snprintf(name, 256, "%s_nsg=%d", base, nsg);
|
||||||
|
|
||||||
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
|
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
|
||||||
|
|||||||
@@ -3533,6 +3533,94 @@ kernel void kernel_mul_mv_q4_0_f32(
|
|||||||
mul_vec_q_n_f32_impl<block_q4_0, N_R0_Q4_0, constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
|
mul_vec_q_n_f32_impl<block_q4_0, N_R0_Q4_0, constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Q4_0 kernel with contiguous uint32_t weight reads (MLX-style)
|
||||||
|
// Each thread reads 4 contiguous uint32_t packs per block instead of
|
||||||
|
// 8 strided uint16_t reads, improving memory coalescing on Apple GPUs.
|
||||||
|
kernel void kernel_mul_mv_q4_0_f32_c(
|
||||||
|
constant ggml_metal_kargs_mul_mv & args,
|
||||||
|
device const char * src0,
|
||||||
|
device const char * src1,
|
||||||
|
device char * dst,
|
||||||
|
threadgroup char * shmem [[threadgroup(0)]],
|
||||||
|
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||||
|
ushort tiisg[[thread_index_in_simdgroup]],
|
||||||
|
ushort sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||||
|
const short NSG = FC_mul_mv_nsg;
|
||||||
|
|
||||||
|
constexpr short NR0 = N_R0_Q4_0;
|
||||||
|
constexpr short NW = N_SIMDWIDTH;
|
||||||
|
|
||||||
|
const int nb = args.ne00 / QK4_0;
|
||||||
|
|
||||||
|
const int r0 = (tgpig.x * NSG + sgitg) * NR0;
|
||||||
|
const int r1 = tgpig.y;
|
||||||
|
const int im = tgpig.z;
|
||||||
|
|
||||||
|
const uint i12 = im % args.ne12;
|
||||||
|
const uint i13 = im / args.ne12;
|
||||||
|
|
||||||
|
const uint64_t offset1 = r1 * args.nb11 + (i12) * args.nb12 + (i13) * args.nb13;
|
||||||
|
|
||||||
|
device const float * y = (device const float *) (src1 + offset1);
|
||||||
|
|
||||||
|
device const block_q4_0 * ax[NR0];
|
||||||
|
FOR_UNROLL (int row = 0; row < NR0; ++row) {
|
||||||
|
const uint64_t offset0 = (r0 + row) * args.nb01 + (i12 / args.r2) * args.nb02 + (i13 / args.r3) * args.nb03;
|
||||||
|
ax[row] = (device const block_q4_0 *) ((device char *) src0 + offset0);
|
||||||
|
}
|
||||||
|
|
||||||
|
float sumf[NR0] = {0.f};
|
||||||
|
|
||||||
|
const short ix = (tiisg / (NW / 16));
|
||||||
|
const short il = (tiisg % (NW / 16)) * 8;
|
||||||
|
const int ib0 = ix;
|
||||||
|
|
||||||
|
const uint q_off = il / 8;
|
||||||
|
|
||||||
|
device const float * yb = y + ib0 * QK4_0 + il;
|
||||||
|
|
||||||
|
for (int ib = ib0; ib < nb; ib += 16) {
|
||||||
|
float sumy = 0.f;
|
||||||
|
|
||||||
|
FOR_UNROLL (short i = 0; i < 8; i += 2) {
|
||||||
|
sumy += yb[i + 0] + yb[i + 1] + yb[i + 16] + yb[i + 17];
|
||||||
|
}
|
||||||
|
|
||||||
|
FOR_UNROLL (short row = 0; row < NR0; row++) {
|
||||||
|
const float d = ax[row][ib].d;
|
||||||
|
device const uint32_t * qs = (device const uint32_t *) (ax[row][ib].qs);
|
||||||
|
|
||||||
|
const uint32_t q0 = qs[q_off];
|
||||||
|
const uint32_t q1 = qs[q_off + 2];
|
||||||
|
|
||||||
|
float acc = 0.f;
|
||||||
|
|
||||||
|
FOR_UNROLL (short i = 0; i < 8; i += 2) {
|
||||||
|
const uint ni = i / 2;
|
||||||
|
|
||||||
|
acc += ((q0 >> (4 * ni)) & 0xF) * yb[i + 0]
|
||||||
|
+ ((q0 >> (4 * (ni + 1))) & 0xF) * yb[i + 1]
|
||||||
|
+ ((q1 >> (4 * ni)) & 0xF) * yb[i + 16]
|
||||||
|
+ ((q1 >> (4 * (ni + 1))) & 0xF) * yb[i + 17];
|
||||||
|
}
|
||||||
|
|
||||||
|
sumf[row] += d * (acc + sumy * -8.f);
|
||||||
|
}
|
||||||
|
|
||||||
|
yb += QK4_0 * 16;
|
||||||
|
}
|
||||||
|
|
||||||
|
device float * dst_f32 = (device float *) dst + im * args.ne0 * args.ne1 + r1 * args.ne0;
|
||||||
|
|
||||||
|
for (int row = 0; row < NR0; ++row) {
|
||||||
|
const float tot = simd_sum(sumf[row]);
|
||||||
|
|
||||||
|
if (tiisg == 0 && r0 + row < args.ne01) {
|
||||||
|
dst_f32[r0 + row] = tot;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
kernel void kernel_mul_mv_q4_1_f32(
|
kernel void kernel_mul_mv_q4_1_f32(
|
||||||
constant ggml_metal_kargs_mul_mv & args,
|
constant ggml_metal_kargs_mul_mv & args,
|
||||||
device const char * src0,
|
device const char * src0,
|
||||||
|
|||||||
Reference in New Issue
Block a user