diff --git a/.gitignore b/.gitignore index e43b0f9..0949377 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ .DS_Store +logs/ diff --git a/repos/patch/README.md b/repos/patch/README.md new file mode 100644 index 0000000..0b51a6e --- /dev/null +++ b/repos/patch/README.md @@ -0,0 +1,93 @@ +# Intel GPU Diagnosis Patches + +Patches generated by a 3-model council (GLM-5.1, Minimax-M2.7, Kimi k2p5) analyzing +Intel Arc GPU performance issues in llama.cpp. Non-overlapping training data ensures +blind spots are caught through cross-review. + +## Quick Start + +```bash +# Apply all phases at once: +cd repos +./patch/apply.sh + +# Apply one phase at a time (recommended for testing): +./patch/apply-phase.sh 1 # Apply phase 1 +./patch/apply-phase.sh 2 # Apply phase 2 (after testing phase 1) +./patch/apply-phase.sh 3 # Apply phase 3 (after testing phase 2) + +# Dry-run / reverse: +./patch/apply-phase.sh 1 --dry-run +./patch/apply-phase.sh 2 --reverse + +# Check what's applied: +./patch/apply.sh --status +``` + +## Phases + +### Phase 1 — SYCL Sync (safest, highest impact) +| Patch | File | Change | Decision | +|-------|------|--------|----------| +| 0001 | ggml-sycl.cpp:217 | Graph default 1→0 | Approved 3/3 | + +Enables SYCL graph execution by default. Eliminates 8 blocking `.wait()` calls. +Expected 10-30% token generation speedup for single-GPU dense LLMs. + +### Phase 2 — SYCL Kernel Tuning (depends on Phase 1) +| Patch | File | Change | Decision | +|-------|------|--------|----------| +| 0001 | common.hpp:90-91 | VER_GEN12 1M→1200, VER_GEN13→1300 | Approved 3/3 | +| 0002 | presets.hpp:57,60 | DMMV_X 32→64, MMV_Y 1→2 | Approved 3/3, needs bench | +| 0003 | common.hpp:106,109 | DMMV_X 32→64, MMV_Y 1→2 | Approved 3/3, needs bench | + +Fixes the VER_GEN12 placeholder (1M) that routed all Intel GPUs to NVIDIA Ampere paths. +Tunes DMMV thread parameters for Arc hardware. Expected 5-15% additional improvement. + +### Phase 3 — Vulkan Intel (independent of Phase 1/2) +| Patch | File | Change | Decision | +|-------|------|--------|----------| +| 0001 | ggml-vulkan.cpp:302,349 | Arc 140T device-ID Xe2 override | Approved 1/3 | + +Fixes Arrow Lake H misdetection as non-Xe2. Enables cooperative matrix. +Only affects Arc 140T systems. + +## Testing Protocol + +On Intel Arc GPU test machine: + +```bash +cd repos + +# Apply one phase +./patch/apply-phase.sh 1 + +# Build +source /opt/intel/oneapi/setvars.sh +cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \ + -DCMAKE_BUILD_TYPE=Release -DGGML_AVX2=ON -DGGML_AVX=ON -DGGML_FMA=ON -DGGML_F16C=ON +cmake --build build -j$(nproc) + +# Test +ctest --test-dir build --output-on-failure + +# Benchmark before/after each phase +./build/bin/llama-bench -m -ngl 99 -p 512 -n 128 +``` + +## Council Deliberations + +Stored in `../logs/` (gitignored). Key files: +- `logs/decisions.md` — All 6 council decisions with rationale +- `logs/M-sync-overhead-20260415.md` — Agent-M sync analysis +- `logs/K-kernel-tuning-20260415.md` — Agent-K kernel tuning analysis +- `logs/M-review-K-20260415.md` — Cross-review +- `logs/K-review-M-20260415.md` — Cross-review (caught memset_tensor error) + +## Deferred to Future Phases + +- Q4_K DMMV reorder (medium complexity) +- Q6_K DMMV reorder (medium complexity) +- Q5_K reorder for both DMMV and MMVQ (hard) +- Host-buffer double-copy elimination +- Async memory ops decoupled from graph diff --git a/repos/patch/apply-phase.sh b/repos/patch/apply-phase.sh new file mode 100755 index 0000000..e1313ca --- /dev/null +++ b/repos/patch/apply-phase.sh @@ -0,0 +1,147 @@ +#!/usr/bin/env bash +# +# Apply patches for a single phase. +# Usage: +# apply-phase.sh 1 # Apply phase 1 patches +# apply-phase.sh 2 --dry-run # Dry-run phase 2 +# apply-phase.sh 2 --reverse # Reverse phase 2 +# apply-phase.sh all # Apply all phases in order +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPOS_DIR="$(dirname "$SCRIPT_DIR")" + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[0;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +PHASE_NAME="" +DRY_RUN=0 +REVERSE=0 + +if [[ $# -lt 1 ]]; then + echo "Usage: $0 [--dry-run|--reverse]" + echo "Phases: 1=sycl-sync, 2=sycl-kernel, 3=vulkan-intel" + exit 1 +fi + +PHASE_ARG="$1" +shift + +for arg in "$@"; do + case "$arg" in + --dry-run) DRY_RUN=1 ;; + --reverse) REVERSE=1 ;; + esac +done + +apply_phase() { + local phase_num="$1" + local phase_dir="" + + case "$phase_num" in + 1) phase_dir="phase1-sycl-sync"; PHASE_NAME="SYCL Sync (graph + async)" ;; + 2) phase_dir="phase2-sycl-kernel"; PHASE_NAME="SYCL Kernel (VER_GEN + DMMV tuning)" ;; + 3) phase_dir="phase3-vulkan-intel"; PHASE_NAME="Vulkan Intel (Arc 140T Xe2 override)" ;; + *) + echo -e "${RED}Unknown phase: $phase_num${NC}" + exit 1 + ;; + esac + + local patch_dir="$SCRIPT_DIR/$phase_dir" + local target_repo="$REPOS_DIR/llama.cpp" + + if [[ ! -d "$target_repo" ]]; then + echo -e "${RED}Repo not found: $target_repo${NC}" + exit 1 + fi + + if [[ ! -d "$patch_dir" ]]; then + echo -e "${YELLOW}No patch directory: $patch_dir${NC}" + return 0 + fi + + local patches=($(ls "$patch_dir"/*.patch 2>/dev/null | sort)) + if [[ ${#patches[@]} -eq 0 ]]; then + echo -e "${YELLOW}No patches in $patch_dir${NC}" + return 0 + fi + + local action="APPLYING" + local git_flag="" + if [[ $DRY_RUN -eq 1 ]]; then + action="CHECKING" + git_flag="--check" + fi + if [[ $REVERSE -eq 1 ]]; then + action="REVERSING" + git_flag="-R" + if [[ $DRY_RUN -eq 1 ]]; then + git_flag="-R --check" + fi + fi + + local applied=0 failed=0 + + for patch_file in "${patches[@]}"; do + local patch_name=$(basename "$patch_file") + echo -n -e " ${BLUE}${action}${NC} ${phase_dir}/${patch_name} ... " + + if (cd "$target_repo" && git apply $git_flag "$patch_file" 2>/dev/null); then + if [[ $DRY_RUN -eq 1 ]]; then + echo -e "${GREEN}[OK dry-run]${NC}" + else + echo -e "${GREEN}[OK]${NC}" + fi + ((applied++)) || true + else + if [[ $DRY_RUN -eq 1 ]]; then + echo -e "${RED}[FAIL dry-run]${NC}" + else + echo -e "${RED}[FAIL]${NC}" + fi + ((failed++)) || true + fi + done + + echo -e " ${GREEN}$applied ok${NC}, ${RED}$failed failed${NC}" + return $failed +} + +if [[ "$PHASE_ARG" == "all" ]]; then + echo -e "${BLUE}=== All Phases ===${NC}" + if [[ $DRY_RUN -eq 1 ]]; then echo -e "(dry-run)"; fi + if [[ $REVERSE -eq 1 ]]; then echo -e "(reversing in reverse order: 3, 2, 1)"; fi + echo "" + total_fail=0 + + if [[ $REVERSE -eq 1 ]]; then + for p in 3 2 1; do + echo -e "${BLUE}[Phase $p]${NC}" + apply_phase $p || ((total_fail++)) || true + echo "" + done + else + for p in 1 2 3; do + echo -e "${BLUE}[Phase $p]${NC}" + apply_phase $p || ((total_fail++)) || true + echo "" + done + fi + + if [[ $total_fail -gt 0 ]]; then + echo -e "${RED}$total_fail phase(s) had failures.${NC}" + exit 1 + fi + echo -e "${GREEN}All phases complete.${NC}" +else + echo -e "${BLUE}=== Phase $PHASE_ARG: $PHASE_NAME ===${NC}" + if [[ $DRY_RUN -eq 1 ]]; then echo -e "(dry-run)"; fi + if [[ $REVERSE -eq 1 ]]; then echo -e "(reversing)"; fi + echo "" + apply_phase "$PHASE_ARG" +fi diff --git a/repos/patch/apply.sh b/repos/patch/apply.sh new file mode 100755 index 0000000..44a5b16 --- /dev/null +++ b/repos/patch/apply.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +# +# apply.sh — Apply all Intel GPU diagnosis patches +# +# Delegates to apply-phase.sh. See that script for options. +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[0;33m' +NC='\033[0m' + +# Parse args to detect --reverse and pass through +REVERSE=0 +DRY_RUN=0 +for arg in "$@"; do + case "$arg" in + --reverse) REVERSE=1 ;; + --dry-run) DRY_RUN=1 ;; + --status) + # Status mode — check each patch + echo "Patch status:" + for phase_dir in "$SCRIPT_DIR"/phase*/; do + phase_name=$(basename "$phase_dir") + echo "" + echo " [$phase_name]" + for patch_file in "$phase_dir"*.patch; do + [[ -f "$patch_file" ]] || continue + patch_name=$(basename "$patch_file") + target_repo="$(dirname "$SCRIPT_DIR")/llama.cpp" + if (cd "$target_repo" && git apply --check "$patch_file" 2>/dev/null); then + echo -e " ${YELLOW}NOT APPLIED${NC} $patch_name" + elif (cd "$target_repo" && git apply --check -R "$patch_file" 2>/dev/null); then + echo -e " ${GREEN}APPLIED${NC} $patch_name" + else + echo -e " ${RED}CONFLICT${NC} $patch_name" + fi + done + done + exit 0 + ;; + esac +done + +exec "$SCRIPT_DIR/apply-phase.sh" all "$@" diff --git a/repos/patch/phase1-sycl-sync/0001-enable-sycl-graph-by-default.patch b/repos/patch/phase1-sycl-sync/0001-enable-sycl-graph-by-default.patch new file mode 100644 index 0000000..ece5d96 --- /dev/null +++ b/repos/patch/phase1-sycl-sync/0001-enable-sycl-graph-by-default.patch @@ -0,0 +1,13 @@ +diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp +index ea79d25..e93201c 100644 +--- a/ggml/src/ggml-sycl/ggml-sycl.cpp ++++ b/ggml/src/ggml-sycl/ggml-sycl.cpp +@@ -214,7 +214,7 @@ static void ggml_check_sycl() try { + if (!initialized) { + g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); + g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0); +- g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1); ++ g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 0); + g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0); + g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0); + diff --git a/repos/patch/phase1-sycl-sync/README.md b/repos/patch/phase1-sycl-sync/README.md new file mode 100644 index 0000000..334a011 --- /dev/null +++ b/repos/patch/phase1-sycl-sync/README.md @@ -0,0 +1,26 @@ +# Phase 1 — SYCL Synchronization + +## 0001-enable-sycl-graph-by-default.patch + +Changes `GGML_SYCL_DISABLE_GRAPH` default from 1 (disabled) to 0 (enabled). + +### What it does +- Enables SYCL graph execution for single-GPU dense LLM inference +- Enables async memory operations (tied to graph support in upstream code) +- Eliminates 8 blocking `.wait()` calls in reorder functions (Q4_0, Q8_0, Q4_K, Q6_K) + +### What it does NOT affect +- MoE models (MUL_MAT_ID) — `check_graph_compatibility()` auto-disables graphs +- CONCAT operations — auto-disabled +- Multi-GPU setups — always disabled +- Users can override: `GGML_SYCL_DISABLE_GRAPH=1` + +### Expected impact +10-30% token generation speedup on single-GPU dense LLM inference. + +### Testing checklist +- [ ] Build succeeds with `-DGGML_SYCL=ON` +- [ ] `GGML_SYCL_DEBUG=1` shows "SYCL-GRAPH" messages for dense models +- [ ] Dense model inference produces correct output +- [ ] MoE model falls back gracefully (logs "disabling SYCL graphs") +- [ ] `GGML_SYCL_DISABLE_GRAPH=1` restores old behavior diff --git a/repos/patch/phase2-sycl-kernel/0001-fix-ver-gen-thresholds.patch b/repos/patch/phase2-sycl-kernel/0001-fix-ver-gen-thresholds.patch new file mode 100644 index 0000000..e5c6da1 --- /dev/null +++ b/repos/patch/phase2-sycl-kernel/0001-fix-ver-gen-thresholds.patch @@ -0,0 +1,15 @@ +diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp +index fd84c91..48cc91f 100644 +--- a/ggml/src/ggml-sycl/common.hpp ++++ b/ggml/src/ggml-sycl/common.hpp +@@ -87,8 +87,8 @@ extern int g_ggml_sycl_enable_flash_attention; + #define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP + #define VER_4VEC 610 // todo for hardware optimize. + #define VER_GEN9 700 // todo for hardware optimize. +-#define VER_GEN12 1000000 // todo for hardware optimize. +-#define VER_GEN13 (VER_GEN12 + 1030) // todo for hardware optimize. ++#define VER_GEN12 1200 // Intel discrete GPUs (DG2/Alchemist, major=12) ++#define VER_GEN13 1300 // Intel Xe2/Battlemage GPUs (major=13) + + #define GGML_SYCL_MAX_NODES 8192 // TODO: adapt to hardwares + diff --git a/repos/patch/phase2-sycl-kernel/0002-tune-dmmv-xy-for-arc.patch b/repos/patch/phase2-sycl-kernel/0002-tune-dmmv-xy-for-arc.patch new file mode 100644 index 0000000..57863dd --- /dev/null +++ b/repos/patch/phase2-sycl-kernel/0002-tune-dmmv-xy-for-arc.patch @@ -0,0 +1,17 @@ +diff --git a/ggml/src/ggml-sycl/presets.hpp b/ggml/src/ggml-sycl/presets.hpp +index dc4dad1..a2e4ce2 100644 +--- a/ggml/src/ggml-sycl/presets.hpp ++++ b/ggml/src/ggml-sycl/presets.hpp +@@ -54,10 +54,10 @@ + + // dmmv = dequantize_mul_mat_vec + #ifndef GGML_SYCL_DMMV_X +-#define GGML_SYCL_DMMV_X 32 ++#define GGML_SYCL_DMMV_X 64 + #endif + #ifndef GGML_SYCL_MMV_Y +-#define GGML_SYCL_MMV_Y 1 ++#define GGML_SYCL_MMV_Y 2 + #endif + + #ifndef K_QUANTS_PER_ITERATION diff --git a/repos/patch/phase2-sycl-kernel/0003-tune-dmmv-xy-common-hpp.patch b/repos/patch/phase2-sycl-kernel/0003-tune-dmmv-xy-common-hpp.patch new file mode 100644 index 0000000..7feb07c --- /dev/null +++ b/repos/patch/phase2-sycl-kernel/0003-tune-dmmv-xy-common-hpp.patch @@ -0,0 +1,17 @@ +diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp +index fd84c91..dd5cf1a 100644 +--- a/ggml/src/ggml-sycl/common.hpp ++++ b/ggml/src/ggml-sycl/common.hpp +@@ -103,10 +103,10 @@ extern int g_ggml_sycl_enable_flash_attention; + + // dmmv = dequantize_mul_mat_vec + #ifndef GGML_SYCL_DMMV_X +-#define GGML_SYCL_DMMV_X 32 ++#define GGML_SYCL_DMMV_X 64 + #endif + #ifndef GGML_SYCL_MMV_Y +-#define GGML_SYCL_MMV_Y 1 ++#define GGML_SYCL_MMV_Y 2 + #endif + + typedef sycl::queue *queue_ptr; diff --git a/repos/patch/phase2-sycl-kernel/README.md b/repos/patch/phase2-sycl-kernel/README.md new file mode 100644 index 0000000..46446b6 --- /dev/null +++ b/repos/patch/phase2-sycl-kernel/README.md @@ -0,0 +1,37 @@ +# Phase 2 — SYCL Kernel Tuning + +**Depends on:** Phase 1 (should be applied and tested first) + +## 0001-fix-ver-gen-thresholds.patch + +Fixes VER_GEN12 (1,000,000 → 1,200) and VER_GEN13 (1,001,030 → 1,300). + +The original VER_GEN12 value was an unreachable placeholder that caused all Intel +Arc GPUs (cc≈1255 for A770) to fall through to the NVIDIA Ampere tuning path in +all MMQ kernels. After this patch, Intel discrete GPUs use the VER_GEN12 path. + +## 0002-tune-dmmv-xy-for-arc.patch + +Changes presets.hpp: DMMV_X 32→64, MMV_Y 1→2. + +Doubles the data processed per thread in DMMV kernels and doubles rows per +work-group. All common model widths (4096-14336) are divisible by 64. + +## 0003-tune-dmmv-xy-common-hpp.patch + +Same changes as 0002 but in common.hpp (duplicate definitions). + +### Expected impact +5-15% additional improvement on top of Phase 1. + +### ⚠️ Needs Benchmarking +DMMV_X=64 and MMV_Y=2 were chosen analytically, not empirically. If MMV_Y=2 +causes register spills (check with `GGML_SYCL_DEBUG=1`), revert 0002+0003 and +try DMMV_X=64 with MMV_Y=1 only. + +### Testing checklist +- [ ] Build succeeds +- [ ] Unit tests pass +- [ ] Dense model inference produces correct output +- [ ] No assertion failures (`ncols % GGML_SYCL_DMMV_X == 0`) +- [ ] Benchmark comparison vs Phase 1 only diff --git a/repos/patch/phase3-vulkan-intel/0001-arc-140t-xe2-override.patch b/repos/patch/phase3-vulkan-intel/0001-arc-140t-xe2-override.patch new file mode 100644 index 0000000..5daac4c --- /dev/null +++ b/repos/patch/phase3-vulkan-intel/0001-arc-140t-xe2-override.patch @@ -0,0 +1,34 @@ +diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +index b2a54bd..3469f6f 100644 +--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp ++++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +@@ -302,6 +302,14 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& + return vk_device_architecture::OTHER; + } + ++ // Arrow Lake H (Arc 140T) misreports minSubgroupSize=8 despite being Xe2. ++ // Device-ID override to force Xe2 classification. ++ // See: https://github.com/ggml-org/llama.cpp/issues/20776 ++ const uint32_t devid = props.deviceID; ++ if (devid == 0x7D51 || devid == 0x7D45) { ++ return vk_device_architecture::INTEL_XE2; ++ } ++ + vk::PhysicalDeviceProperties2 props2; + vk::PhysicalDeviceShaderCorePropertiesAMD shader_core_props_amd; + vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR integer_dot_props; +@@ -341,6 +349,14 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& + return vk_device_architecture::OTHER; + } + ++ // Arrow Lake H (Arc 140T) misreports minSubgroupSize=8 despite being Xe2. ++ // Device-ID override to force Xe2 classification. ++ // See: https://github.com/ggml-org/llama.cpp/issues/20776 ++ const uint32_t devid = props.deviceID; ++ if (devid == 0x7D51 || devid == 0x7D45) { ++ return vk_device_architecture::INTEL_XE2; ++ } ++ + vk::PhysicalDeviceProperties2 props2; + vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props; + diff --git a/repos/patch/phase3-vulkan-intel/README.md b/repos/patch/phase3-vulkan-intel/README.md new file mode 100644 index 0000000..208f84c --- /dev/null +++ b/repos/patch/phase3-vulkan-intel/README.md @@ -0,0 +1,30 @@ +# Phase 3 — Vulkan Intel Fixes + +**Depends on:** Phase 1 and 2 (should be applied and tested first) + +## 0001-arc-140t-xe2-override.patch + +Adds device-ID override for Intel Arc 140T (Arrow Lake H) to force INTEL_XE2 +classification in the Vulkan backend. + +### Problem +Arc 140T reports minSubgroupSize=8 instead of 16. The Vulkan backend uses +minSubgroupSize to detect Xe2. When misreported, the 140T is classified as +OTHER, disabling cooperative matrix and all dependent optimizations. + +### Fix +Checks for Arrow Lake H device IDs (0x7D51, 0x7D45) before the minSubgroupSize +check and returns INTEL_XE2 directly. + +### Applies to +Both the EXT and KHR code paths in ggml-vulkan.cpp. + +### Impact +Only affects Arrow Lake H (Arc 140T) systems. No effect on other hardware. + +### Testing checklist +- [ ] Build succeeds with Vulkan support +- [ ] Arc 140T: device classified as INTEL_XE2 +- [ ] Arc 140T: cooperative matrix shaders used for matmul +- [ ] Other Intel GPUs: no change in behavior +- [ ] Non-Intel GPUs: no change in behavior