openvino: driver setup, CI split, thread safety, and NPU optimizations (#21944)

* Thread safety per request only

* Fix ROPE yarn case

* Fix sticky stateful config

* Use i4/i8 directly for symmetric quant (see the sketch after this list)

* Use weightless caching

* Add WeightlessCacheAttribute to reduce NPU memory usage

* Gelu tanh support (#125)

* Imrope support (#126)

* fix(openvino): explicit ov::Tensor frees in ggml_backend_openvino_free

* add GPU,NPU support in OV Dockerfile

* add build-openvino.yml ci

* Fix sticky stateful config

* add concurrency to ov-gpu ci runs. Move OV CI to build-openvino.yml

* fix thread-safety of shared runtime context

* rope type abstraction for frontend translations

* fix editorconfig
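For reference on the symmetric-quant bullets above: asymmetric quantization dequantizes as w = (q - zp) * s with unsigned storage, while the symmetric path now stores signed i4/i8 directly and drops the zero-point tensor entirely. A minimal sketch of the two forms (illustrative helper names, not the backend's API):

```cpp
#include <cstdint>

// Asymmetric: unsigned quant value plus a per-block zero point.
static float dequant_asymmetric(uint8_t q, uint8_t zp, float s) {
    return (static_cast<int>(q) - static_cast<int>(zp)) * s;
}

// Symmetric: signed i4/i8 value, no zero point needed (w = q * s).
static float dequant_symmetric(int8_t q, float s) {
    return q * s;
}
```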

---------

Co-authored-by: Mustafa Cavus <mustafa.cavus@intel.com>
Co-authored-by: Dan Hoffman <dhoff749@gmail.com>
Co-authored-by: Ravi Panchumarthy <ravi.panchumarthy@intel.com>
Author: Zijun Yu
Date: 2026-04-21 23:58:34 +08:00 (committed via GitHub)
parent 606fa42f5d
commit 52f1096f21
21 changed files with 823 additions and 544 deletions
+48 -2
@@ -2,7 +2,19 @@ ARG OPENVINO_VERSION_MAJOR=2026.0
ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
ARG UBUNTU_VERSION=24.04
-# Optional proxy build arguments - empty by default
+# Intel GPU driver versions. https://github.com/intel/compute-runtime/releases
ARG IGC_VERSION=v2.30.1
ARG IGC_VERSION_FULL=2_2.30.1+20950
ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
ARG IGDGMM_VERSION=22.9.0
# Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases
ARG NPU_DRIVER_VERSION=v1.32.0
ARG NPU_DRIVER_FULL=v1.32.0.20260402-23905121947
ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2
# Optional proxy build arguments
ARG http_proxy=
ARG https_proxy=
@@ -78,13 +90,47 @@ ARG http_proxy
ARG https_proxy
RUN apt-get update \
-&& apt-get install -y libgomp1 libtbb12 curl \
+&& apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
# Install GPU drivers
ARG IGC_VERSION
ARG IGC_VERSION_FULL
ARG COMPUTE_RUNTIME_VERSION
ARG COMPUTE_RUNTIME_VERSION_FULL
ARG IGDGMM_VERSION
RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& dpkg --install *.deb \
&& rm -rf /tmp/neo/
# Install NPU drivers
ARG NPU_DRIVER_VERSION
ARG NPU_DRIVER_FULL
ARG LIBZE1_VERSION
RUN mkdir /tmp/npu/ && cd /tmp/npu/ \
&& wget https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
&& tar -xf linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
&& dpkg --install *.deb \
&& rm -rf /tmp/npu/
RUN cd /tmp \
&& wget https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb \
&& dpkg --install libze1_${LIBZE1_VERSION}_amd64.deb \
&& rm libze1_${LIBZE1_VERSION}_amd64.deb
COPY --from=build /app/lib/ /app/

### Full (all binaries)
+120
@@ -0,0 +1,120 @@
name: CI (openvino)
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: [
'.github/workflows/build-openvino.yml',
'**/CMakeLists.txt',
'**/*.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
]
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/build-openvino.yml',
'ggml/src/ggml-openvino/**'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
jobs:
ubuntu-24-openvino:
name: ubuntu-24-openvino-${{ matrix.openvino_device }}
concurrency:
group: openvino-${{ matrix.variant }}-${{ github.head_ref || github.ref }}
cancel-in-progress: false
strategy:
matrix:
include:
- variant: cpu
runner: '"ubuntu-24.04"'
openvino_device: "CPU"
- variant: gpu
runner: '["self-hosted","Linux","Intel","OpenVINO"]'
openvino_device: "GPU"
runs-on: ${{ fromJSON(matrix.runner) }}
env:
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
if: runner.environment == 'github-hosted'
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
- name: Use OpenVINO Toolkit Cache
if: runner.environment == 'github-hosted'
uses: actions/cache@v5
id: cache-openvino
with:
path: ./openvino_toolkit
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
- name: Setup OpenVINO Toolkit
if: steps.cache-openvino.outputs.cache-hit != 'true'
uses: ./.github/actions/linux-setup-openvino
with:
path: ./openvino_toolkit
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
version_full: ${{ env.OPENVINO_VERSION_FULL }}
- name: Install OpenVINO dependencies
run: |
cd ./openvino_toolkit
chmod +x ./install_dependencies/install_openvino_dependencies.sh
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
- name: Build
id: cmake_build
run: |
source ./openvino_toolkit/setupvars.sh
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENVINO=ON
time cmake --build build/ReleaseOV --config Release -j $(nproc)
- name: Test
id: cmake_test
# TODO: fix and re-enable the `test-llama-archs` test below
run: |
cd ${{ github.workspace }}
if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
export GGML_OPENVINO_DEVICE=GPU
fi
ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
+4
@@ -265,6 +265,10 @@ jobs:
ggml-ci-intel-openvino-gpu-low-perf:
runs-on: [self-hosted, Linux, Intel, OpenVINO]
concurrency:
group: openvino-gpu-${{ github.head_ref || github.ref }}
cancel-in-progress: false
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"
-80
@@ -656,86 +656,6 @@ jobs:
-DGGML_SYCL_F16=ON
time cmake --build build --config Release -j $(nproc)
ubuntu-24-openvino:
name: ubuntu-24-openvino-${{ matrix.openvino_device }}
strategy:
matrix:
include:
- variant: cpu
runner: '"ubuntu-24.04"'
openvino_device: "CPU"
- variant: gpu
runner: '["self-hosted","Linux","X64","Intel"]'
openvino_device: "GPU"
runs-on: ${{ fromJSON(matrix.runner) }}
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
if: runner.environment == 'github-hosted'
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
- name: Use OpenVINO Toolkit Cache
if: runner.environment == 'github-hosted'
uses: actions/cache@v5
id: cache-openvino
with:
path: ./openvino_toolkit
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
- name: Setup OpenVINO Toolkit
if: steps.cache-openvino.outputs.cache-hit != 'true'
uses: ./.github/actions/linux-setup-openvino
with:
path: ./openvino_toolkit
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
version_full: ${{ env.OPENVINO_VERSION_FULL }}
- name: Install OpenVINO dependencies
run: |
cd ./openvino_toolkit
chmod +x ./install_dependencies/install_openvino_dependencies.sh
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
- name: Build
id: cmake_build
run: |
source ./openvino_toolkit/setupvars.sh
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENVINO=ON
time cmake --build build/ReleaseOV --config Release -j $(nproc)
- name: Test
id: cmake_test
# TODO: fix and re-enable the `test-llama-archs` test below
run: |
cd ${{ github.workspace }}
if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
export GGML_OPENVINO_DEVICE=GPU
fi
ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
windows-latest:
runs-on: windows-2025
-3
@@ -244,7 +244,6 @@ build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf"
- `-fa 1` is required when running llama-bench with the OpenVINO backend.
- `GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1`
- `llama-server` with OpenVINO backend supports only one chat session/thread, when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled.
-- For Intel GPU, NPU detection in containers, GPU, NPU user-space drivers/libraries must be present inside the image. We will include in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile)

> [!NOTE]
> The OpenVINO backend is actively under development. Fixes are underway, and this document will continue to be updated as issues are resolved.
@@ -274,8 +273,6 @@ docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_p
Run llama.cpp with OpenVINO backend Docker container.
Save sample models in `~/models` as [shown above](#3-download-sample-model). It will be mounted to the container in the examples below.
-> [!NOTE]
-> Intel GPU, NPU detection in containers will be included in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile).

```bash
# Run Docker container
+15 -5
@@ -19,7 +19,6 @@
#include <iomanip>
#include <map>
#include <memory>
-#include <mutex>
#include <openvino/core/dimension.hpp>
#include <openvino/core/except.hpp>
#include <openvino/core/node.hpp>
@@ -207,8 +206,22 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
break;
}
case GGML_OP_ROPE: {
const int mode = node->op_params[2];
switch (mode) {
case GGML_ROPE_TYPE_NEOX: {
op_case = 0x00010000;
break;
}
case GGML_ROPE_TYPE_IMROPE: {
op_case = 0x00020000;
break;
}
default:
op_case = 0x00000000;
break;
}
if (node->src[0]->op == GGML_OP_VIEW) {
-op_case = 2;
+op_case = (op_case | 0x00000002);
}
break;
}
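For reference, the ROPE `op_case` above is a small bit-packed value: the high 16 bits carry the rope variant (0 = normal, 1 = NEOX, 2 = IMROPE) and the low 16 bits carry layout flags such as the view-source bit. A minimal sketch of the scheme, with hypothetical helper names:

```cpp
// Hypothetical helpers mirroring the op_case packing used above.
static int encode_rope_case(int variant, bool src_is_view) {
    return (variant << 16) | (src_is_view ? 0x00000002 : 0x00000000);
}

static int rope_variant(int op_case) { return (op_case & 0xFFFF0000) >> 16; } // consumed in translate_rope
static int rope_flags(int op_case)   { return op_case & 0x0000FFFF; }
```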
@@ -573,9 +586,6 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
}

std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) {
-static std::mutex weights_mutex;
-std::lock_guard<std::mutex> lock(weights_mutex);
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
auto * nodes = cgraph->nodes;
auto n_nodes = cgraph->n_nodes;
+18 -11
@@ -6,6 +6,7 @@
#include <cstring>
#include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
#include <openvino/runtime/intel_npu/level_zero/level_zero.hpp>
+#include <openvino/runtime/properties.hpp>
#include <optional>

ov::Core & ov_singleton_core() {
@@ -42,11 +43,13 @@ void ggml_openvino_device_config::init() {
{"NPUW_DQ", "YES" }, {"NPUW_DQ", "YES" },
{"NPUW_DQ_FULL", "NO" }, {"NPUW_DQ_FULL", "NO" },
}; };
if (cache_dir) { if (cache_dir && strlen(cache_dir) > 0) {
compile_config["NPUW_CACHE_DIR"] = cache_dir; compile_config["NPUW_CACHE_DIR"] = cache_dir;
compile_config.insert(ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));
} }
} else if (cache_dir) { } else if (cache_dir && strlen(cache_dir) > 0) {
ov_singleton_core().set_property(ov::cache_dir(cache_dir)); compile_config.insert(ov::cache_dir(cache_dir));
compile_config.insert(ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));
} }
// Initialize remote context with queue sharing for GPU // Initialize remote context with queue sharing for GPU
@@ -259,10 +262,12 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t);
-// For symmetric quantization, we only need one zp value (not one per block)
-// Zero points are stored in U4 or U8 format matching the weight type
-size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
-layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;
+// For symmetric quantization, no zp needed (weights stored as signed)
+if (layout.is_symmetric) {
+layout.zp_size = 0;
+} else {
+layout.zp_size = layout.is_u4 ? ((n_blocks + 1) / 2) : n_blocks;
+}
layout.weights_offset = 0;
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
@@ -313,10 +318,12 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
// Scales: F16 per block
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
-// Zero points: U4 or U8 matching weight type
-// For symmetric quantization, we only need one zp value (not one per block)
-size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
-layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;
+// For symmetric quantization, no zp needed (weights stored as signed)
+if (layout.is_symmetric) {
+layout.zp_size = 0;
+} else {
+layout.zp_size = layout.is_u4 ? ((n_blocks + 1) / 2) : n_blocks;
+}
// Layout in buffer: [weights | scales | zp] with alignment
layout.weights_offset = 0;
+29 -13
@@ -145,13 +145,18 @@ static void * ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer
return ctx->data;
}
static bool is_stateful_enabled() {
static const auto * stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION");
return stateful && *stateful != '\0' && strcmp(stateful, "0") != 0;
}
static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
// GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name);
ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
// Put kvcache on device memory for GPU (NPU memory is too small even for kvcache)
if (strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote && ggml_openvino_get_device_name() == "GPU" &&
-!getenv("GGML_OPENVINO_STATEFUL_EXECUTION")) {
+!is_stateful_enabled()) {
GGML_ASSERT(ctx->tensor_extras.empty());
auto device = ctx->device;
auto size = ctx->size;
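Note the semantics change in is_stateful_enabled() above: an unset, empty, or "0" value of GGML_OPENVINO_STATEFUL_EXECUTION now disables stateful execution, whereas the old bare getenv() check treated any set value, including "0", as enabled. A quick truth-table sketch of the difference:

```cpp
#include <cstdlib>
#include <cstring>

static bool is_stateful_enabled() {
    static const auto * stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION");
    return stateful && *stateful != '\0' && strcmp(stateful, "0") != 0;
}

// GGML_OPENVINO_STATEFUL_EXECUTION unset -> false (old getenv() check: false)
// GGML_OPENVINO_STATEFUL_EXECUTION=""   -> false (old check: true)
// GGML_OPENVINO_STATEFUL_EXECUTION=0    -> false (old check: true)
// GGML_OPENVINO_STATEFUL_EXECUTION=1    -> true  (old check: true)
```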
@@ -600,6 +605,14 @@ bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft) {
static void ggml_backend_openvino_free(ggml_backend_t backend) {
ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *) backend->context;
if (ctx->runtime_context) {
auto r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->runtime_context);
if (--r_ctx->backend_count == 0) {
r_ctx->clear_caches();
}
}
delete ctx;
delete backend;
}
@@ -644,7 +657,12 @@ static ggml_guid_t ggml_backend_openvino_guid(void) {
}

static std::shared_ptr<ov_runtime_context> get_ov_runtime_context_ptr() {
-static std::shared_ptr<ov_runtime_context> r_ctx = std::make_shared<ov_runtime_context>();
+static std::shared_ptr<ov_runtime_context> r_ctx = [] {
+auto ctx = std::make_shared<ov_runtime_context>();
+ctx->device = ggml_openvino_get_device_name();
+ctx->stateful = is_stateful_enabled() && !ggml_openvino_is_npu();
+return ctx;
+}();
return r_ctx;
}
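Moving the device/stateful assignment into the immediately-invoked lambda gives thread-safe one-time setup: C++11 guarantees a function-local static is initialized exactly once even under concurrent first calls, so later ggml_backend_openvino_init() calls can no longer race on re-assigning the shared context. A generic sketch of the pattern (names illustrative):

```cpp
#include <memory>

struct shared_state {
    int backend_count = 0;
    // ... device name, caches, flags ...
};

static std::shared_ptr<shared_state> get_shared_state() {
    // The lambda body runs exactly once (C++11 magic statics), so multi-step
    // configuration happens atomically with respect to other first callers.
    static std::shared_ptr<shared_state> state = [] {
        auto s = std::make_shared<shared_state>();
        // one-time configuration goes here
        return s;
    }();
    return state;
}
```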
@@ -669,8 +687,7 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) {
}
std::shared_ptr<ov_runtime_context> r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->runtime_context);
-r_ctx->device = ggml_openvino_get_device_name();
-r_ctx->stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION") && !ggml_openvino_is_npu();
+r_ctx->backend_count++;

ggml_backend_t openvino_backend = new ggml_backend{
/* .guid = */ ggml_backend_openvino_guid(),
@@ -883,7 +900,7 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
const int32_t * op_params = op->op_params;
const int n_dims = op_params[1];
const int mode = op_params[2];
-if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX) {
+if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX && mode != GGML_ROPE_TYPE_IMROPE) {
// GGML_LOG_WARN("OpenVINO backend does not support ROPE with mode %d\n", mode);
return true;
}
@@ -896,14 +913,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
// GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type)); // GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type));
return true; return true;
} }
float freq_scale;
float ext_factor;
memcpy(&freq_scale, op_params + 6, sizeof(float));
memcpy(&ext_factor, op_params + 7, sizeof(float));
if (ext_factor != 0.0f) {
// GGML_LOG_WARN("OpenVINO backend does not support ROPE with ext_factor %f != 0.0f\n", ext_factor);
return true;
}
if (op->src[0]->op == GGML_OP_VIEW) { if (op->src[0]->op == GGML_OP_VIEW) {
if (op->src[0]->view_src->ne[1] != op->src[0]->ne[2]) { if (op->src[0]->view_src->ne[1] != op->src[0]->ne[2]) {
// GGML_LOG_WARN( // GGML_LOG_WARN(
@@ -913,6 +922,12 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
return true;
}
}
if (mode == GGML_ROPE_TYPE_IMROPE &&
(op->src[2] != 0 || ((const float *) op_params)[6] != 1 || ((const float *) op_params)[7] != 0 ||
((const float *) op_params)[8] != 1)) {
// GGML_LOG_WARN("OpenVINO backend does not support IMROPE with freq_factors, freq_scale, ext_factor, and attn_factor\n");
return true;
}
break;
}
default:
@@ -942,6 +957,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
// GGML_OP_SOFT_MAX,
GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY};
static const std::set<ggml_unary_op> supported_unary_ops{
GGML_UNARY_OP_GELU,
GGML_UNARY_OP_SILU,
};
static const std::set<ggml_glu_op> supported_glu_ops{
+180 -108
@@ -46,6 +46,7 @@ void unpack_32_4(const uint8_t * data, uint8_t * dst) {
// Extracts (weight, scales, zp) from Q4_0 tensors.
// Data layout is: |16 bit scale|32 x 4bit weights|.
// When zp_arr is empty (symmetric), weights are stored as signed i4 (value - 8).
void extract_q4_0_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
@@ -55,28 +56,32 @@ void extract_q4_0_data(const ggml_tensor * tensor,
auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+bool is_symmetric = (weights_arr.get_element_type() == ov::element::i4); // Signed i4 path
+if (!is_symmetric) {
auto * zp = static_cast<uint8_t *>(zp_arr.data());
-bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
-// For Q4_0, zero point is always 8
-if (is_scalar_zp) {
-zp[0] = 8 | (8 << 4); // Pack two 4-bit values
-}
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
-// For asymmetric quantization, compute per-block zero points
-if (!is_scalar_zp) {
// Pack two 4-bit zero points per byte
if (i % 2 == 0) {
zp[i / 2] = 8; // Lower nibble
} else {
zp[i / 2] |= (8 << 4); // Upper nibble
}
-}
unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
});
+} else {
+// Symmetric: unpack as u4 then convert to i4 by subtracting 8 (XOR each nibble)
+ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
+scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
+unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
+// Convert u4 to i4: subtract 8 from each nibble. XOR 0x88 flips each nibble by 8.
+for (int j = 0; j < 16; ++j) {
+weights[i * 16 + j] ^= 0x88;
+}
+});
+}
}
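The u4-to-i4 conversion above works because, in 4-bit two's complement, subtracting 8 is the same as flipping the top bit, so XOR-ing a packed byte with 0x88 rebiases both nibbles from [0, 15] to [-8, 7] in one operation. A small self-check of that identity:

```cpp
#include <cassert>

int main() {
    for (int v = 0; v < 16; ++v) {
        int n = (v ^ 8) & 0xF;               // nibble after the XOR
        int as_signed = n >= 8 ? n - 16 : n; // read it as 4-bit two's complement
        assert(as_signed == v - 8);          // XOR with 8 == subtract 8
    }
    return 0;
}
```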
// Extracts (weight, scales, zp) from Q4_1 tensors.
@@ -123,6 +128,7 @@ void extract_q4_1_data(const ggml_tensor * tensor,
// Extracts (weight, scales, zp) from Q8_0 tensors.
// Data layout is: |16 bit scale|32 x 8bit weights|.
// When zp_arr is empty (symmetric), weights are stored as signed i8 directly.
void extract_q8_0_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
@@ -133,29 +139,30 @@ void extract_q8_0_data(const ggml_tensor * tensor,
auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8); // Signed i8 path
+if (!is_symmetric) {
auto * zp = static_cast<uint8_t *>(zp_arr.data());
-bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
-// For Q8_0, zero point is always 128
-if (is_scalar_zp) {
-zp[0] = 128;
-}
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
uint8_t * block_data = data + i * bytes_per_block;
scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
-// For asymmetric quantization, store per-block zero points
-if (!is_scalar_zp) {
zp[i] = 128;
-}
for (size_t j = 0; j < weights_per_block; ++j) {
-uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes.
-// Original data is in int8_t, so we add a bias of -128 and invert the first bit.
-x ^= 1 << 7;
+uint8_t x = block_data[j + 2];
+x ^= 1 << 7; // Convert int8 to uint8 by flipping sign bit
weights[i * weights_per_block + j] = x;
}
});
+} else {
+// Symmetric: store original int8 values directly (no unsigned bias)
+ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
+uint8_t * block_data = data + i * bytes_per_block;
+scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
+// Copy int8 weights as-is (the tensor element type is i8)
+memcpy(weights + i * weights_per_block, block_data + 2, weights_per_block);
+});
+}
}

void unpack_256_4(const uint8_t * data, uint8_t * dst) {
@@ -256,33 +263,21 @@ void extract_q6_k_data(const ggml_tensor * tensor,
auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8); // Signed i8 path
+if (!is_symmetric) {
auto * zp = static_cast<uint8_t *>(zp_arr.data());
-bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
-// For Q6_K, zero point is always 32
-if (is_scalar_zp) {
-zp[0] = 32;
-}
ov::parallel_for(n_super_block, [&](size_t i) {
uint8_t * block_data = data + i * bytes_per_block;
-float scale_factor =
-static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104))); // (128+64+16)/2
+float scale_factor = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104)));
for (size_t j = 0; j < 16; j++) {
scales[j + i * 16] =
ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
-// For asymmetric quantization, store per-block zero points
-if (!is_scalar_zp) {
zp[j + i * 16] = 32;
-}
}
uint8_t * ql = block_data;
uint8_t * qh = block_data + 128;
for (int64_t j = 0; j < 32; ++j) {
weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4);
weights[i * 256 + j + 32] = (ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4);
@@ -294,6 +289,36 @@ void extract_q6_k_data(const ggml_tensor * tensor,
weights[i * 256 + j + 224] = (ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4);
}
});
} else {
// Symmetric: subtract 32 from each weight to store as signed i8
ov::parallel_for(n_super_block, [&](size_t i) {
uint8_t * block_data = data + i * bytes_per_block;
float scale_factor = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104)));
for (size_t j = 0; j < 16; j++) {
scales[j + i * 16] =
ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
}
uint8_t * ql = block_data;
uint8_t * qh = block_data + 128;
auto * signed_weights = reinterpret_cast<int8_t *>(weights);
for (int64_t j = 0; j < 32; ++j) {
signed_weights[i * 256 + j] = static_cast<int8_t>((ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4)) - 32;
signed_weights[i * 256 + j + 32] =
static_cast<int8_t>((ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4)) - 32;
signed_weights[i * 256 + j + 64] = static_cast<int8_t>((ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4)) - 32;
signed_weights[i * 256 + j + 96] =
static_cast<int8_t>((ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4)) - 32;
signed_weights[i * 256 + j + 128] =
static_cast<int8_t>((ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4)) - 32;
signed_weights[i * 256 + j + 160] =
static_cast<int8_t>((ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4)) - 32;
signed_weights[i * 256 + j + 192] =
static_cast<int8_t>((ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4)) - 32;
signed_weights[i * 256 + j + 224] =
static_cast<int8_t>((ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4)) - 32;
}
});
}
}

static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
@@ -389,11 +414,10 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
size_t group_size,
bool use_bias) {
ov::Shape orig_shape = weight.get_shape();
+bool is_signed = (weight.get_element_type() == ov::element::i8); // Symmetric: signed weights, no ZP
// Expand dimensions for scales and zp/bias
auto scale_shape = scales.get_shape();
-auto zp_shape = zp.get_shape();
-bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization
ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size};
@@ -403,25 +427,35 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
} else {
scale_shape.push_back(1);
scales.set_shape(scale_shape);
-// For symmetric quantization, zp remains scalar (don't resize)
-if (!is_scalar_zp) {
+if (!is_signed && zp.get_size() > 0) {
+auto zp_shape = zp.get_shape();
zp_shape.push_back(1);
zp.set_shape(zp_shape);
}
}
-// Create graph nodes
+auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
+ov::Output<ov::Node> result;
+if (is_signed) {
+// Signed path: q * s (no zero point subtraction needed)
+auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::i8, packed_shape,
+static_cast<uint8_t *>(weight.data()), nullptr);
+weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
+auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
+result = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+} else {
+// Unsigned path
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u8, packed_shape,
static_cast<uint8_t *>(weight.data()), nullptr);
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
-auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
-ov::Output<ov::Node> result;
-if (use_bias && !is_scalar_zp) {
+if (use_bias && zp.get_size() > 0) {
// Bias path: w * s + b (zp tensor holds f16 bias values)
auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
-auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+auto w_s =
+std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
} else {
// Zero point path: (w - zp) * s
@@ -435,6 +469,7 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY);
result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
}
}
if (packed_shape.size() != 2) {
// If not requantized channel-wise case, reshape back to original shape
@@ -452,11 +487,10 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
size_t group_size,
bool use_bias) {
ov::Shape orig_weight_shape = weight.get_shape();
+bool is_signed = (weight.get_element_type() == ov::element::i4); // Symmetric: signed weights, no ZP
// Expand dimensions for scales and zp/bias
ov::Shape scale_shape = scales.get_shape();
-auto zp_shape = zp.get_shape();
-bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization
// Create INT4 weight tensor
ov::Shape packed_shape = {orig_weight_shape[0], orig_weight_shape[1] / group_size, group_size};
@@ -467,24 +501,35 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
} else {
scale_shape.push_back(1);
scales.set_shape(scale_shape);
-// For symmetric quantization, zp remains scalar (don't resize)
-if (!is_scalar_zp) {
+if (!is_signed && zp.get_size() > 0) {
+auto zp_shape = zp.get_shape();
zp_shape.push_back(1);
zp.set_shape(zp_shape);
}
}
+auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
+ov::Output<ov::Node> result;
+if (is_signed) {
+// Signed path: q * s (no zero point subtraction needed)
+auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::i4, packed_shape,
+static_cast<uint8_t *>(weight.data()), nullptr);
+weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
+auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
+result = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+} else {
+// Unsigned path
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape,
static_cast<uint8_t *>(weight.data()), nullptr);
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
-auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
-ov::Output<ov::Node> result;
-if (use_bias && !is_scalar_zp) {
+if (use_bias && zp.get_size() > 0) {
// Bias path: w * s + b (zp tensor holds f16 bias values)
auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
-auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+auto w_s =
+std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
} else {
// Zero point path: (w - zp) * s
@@ -498,6 +543,7 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
}
}
if (packed_shape.size() != 2) {
// If not requantized channel-wise case, reshape back to original shape
@@ -699,25 +745,33 @@ OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, vo
// Quantized path (normal extraction or quantized requant)
// Create weight/scale/zp tensors - shared between both paths
-ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
+// For symmetric quantization, use signed types (i4/i8) and no ZP tensor
+ov::element::Type weight_type = layout.is_symmetric ? (layout.is_u4 ? ov::element::i4 : ov::element::i8) :
+(layout.is_u4 ? ov::element::u4 : ov::element::u8);
ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
-ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
if (output_base_ptr) {
uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
result.weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
result.scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
-result.zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset);
+if (!layout.is_symmetric) {
+ov::element::Type zp_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
+result.zp = ov::Tensor(zp_type, scale_shape, buf_base + layout.zp_offset);
+}
+// else: result.zp remains default-constructed (empty) for symmetric
} else {
result.weights = ov::Tensor(weight_type, node_shape);
result.scales = ov::Tensor(ov::element::f16, scale_shape);
-if (use_bias && !layout.is_symmetric) {
-// bias only has effect for asymmetric quant
-result.zp = ov::Tensor(ov::element::f16, zp_shape);
-} else {
-result.zp = ov::Tensor(weight_type, zp_shape);
-}
+if (!layout.is_symmetric) {
+if (use_bias) {
+result.zp = ov::Tensor(ov::element::f16, scale_shape);
+} else {
+ov::element::Type zp_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
+result.zp = ov::Tensor(zp_type, scale_shape);
+}
+}
+// else: result.zp remains default-constructed (empty) for symmetric
}
if (layout.is_requant && layout.requant_type.has_value()) {
result.weight_node = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block,
@@ -741,18 +795,13 @@ void quantize_q4_0(const float * x,
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+bool is_symmetric = (weights_arr.get_element_type() == ov::element::i4); // Signed i4 path
+if (!is_symmetric) {
auto * zp = static_cast<uint8_t *>(zp_arr.data());
-bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
-// For Q4_0, zero point is always 8
-if (is_scalar_zp) {
-zp[0] = 8 | (8 << 4); // Pack two 4-bit values
-}
for (int i = 0; i < nb; i++) {
-float amax = 0.0f; // absolute max
+float amax = 0.0f;
float max = 0.0f;
for (int j = 0; j < qk; j++) {
const float v = x[i * qk + j];
if (amax < fabsf(v)) {
@@ -760,34 +809,24 @@ void quantize_q4_0(const float * x,
max = v;
}
}
const float d = max / -8;
if (d == 0) {
scales[i] = ov::float16(1.0f);
-// zp is already set to 8 for symmetric, or set per-block for asymmetric
-if (!is_scalar_zp) {
if (i % 2 == 0) {
zp[i / 2] = 8;
} else {
zp[i / 2] |= (8 << 4);
}
-}
memset(weights + i * qk / 2, 8 | (8 << 4), qk / 2);
continue;
}
const float id = 1.0f / d;
scales[i] = ov::float16(d);
-// For asymmetric quantization, store per-block zero points
-if (!is_scalar_zp) {
if (i % 2 == 0) {
zp[i / 2] = 8;
} else {
zp[i / 2] |= (8 << 4);
}
-}
for (int j = 0; j < qk / 2; ++j) {
const float x0 = x[i * qk + 2 * j] * id;
const float x1 = x[i * qk + 2 * j + 1] * id;
@@ -796,6 +835,37 @@ void quantize_q4_0(const float * x,
weights[i * qk / 2 + j] = xi0 | (xi1 << 4);
}
}
} else {
// Symmetric: produce signed i4 values in [-8, 7]
for (int i = 0; i < nb; i++) {
float amax = 0.0f;
float max = 0.0f;
for (int j = 0; j < qk; j++) {
const float v = x[i * qk + j];
if (amax < fabsf(v)) {
amax = fabsf(v);
max = v;
}
}
const float d = max / -8;
if (d == 0) {
scales[i] = ov::float16(1.0f);
// i4 value 0 packed: 0x00
memset(weights + i * qk / 2, 0, qk / 2);
continue;
}
const float id = 1.0f / d;
scales[i] = ov::float16(d);
for (int j = 0; j < qk / 2; ++j) {
const float x0 = x[i * qk + 2 * j] * id;
const float x1 = x[i * qk + 2 * j + 1] * id;
// Signed i4: range [-8, 7]. Quantize as round(x*id), then pack as 4-bit two's complement.
int8_t si0 = (int8_t) std::max(-8, std::min(7, (int) roundf(x0)));
int8_t si1 = (int8_t) std::max(-8, std::min(7, (int) roundf(x1)));
weights[i * qk / 2 + j] = (si0 & 0x0F) | ((si1 & 0x0F) << 4);
}
}
}
}
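In both branches above the block scale is d = max / -8, where max is the value with the largest magnitude (sign kept), so that extreme value quantizes exactly to -8, the bottom of the signed 4-bit range; the symmetric branch then just rounds, clamps to [-8, 7], and packs two's-complement nibbles. A scalar sketch of that quantize step:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Quantize one value against the block scale d = max / -8; clamp to i4 range.
static int8_t quantize_i4(float x, float d) {
    if (d == 0.0f) {
        return 0;  // all-zero block
    }
    int q = (int) roundf(x / d);
    return (int8_t) std::max(-8, std::min(7, q));
}
```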
void quantize_q8_0(const float * x,
@@ -809,38 +879,44 @@ void quantize_q8_0(const float * x,
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8); // Signed i8 path
+if (!is_symmetric) {
auto * zp = static_cast<uint8_t *>(zp_arr.data());
-bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
-// For Q8_0, zero point is always 128
-if (is_scalar_zp) {
-zp[0] = 128;
-}
for (int i = 0; i < nb; i++) {
-float amax = 0.0f; // absolute max
+float amax = 0.0f;
for (int j = 0; j < qk; j++) {
const float v = x[i * qk + j];
-if (amax < fabsf(v)) {
-amax = fabsf(v);
-}
+amax = std::max(amax, fabsf(v));
}
const float d = amax / 127.0f;
const float id = d ? 1.0f / d : 0.0f;
scales[i] = ov::float16(d);
-// For asymmetric quantization, store per-block zero points
-if (!is_scalar_zp) {
zp[i] = 128;
-}
for (int j = 0; j < qk; ++j) {
const float x0 = x[i * qk + j] * id;
const int8_t xi0 = roundf(x0);
weights[i * qk + j] = (uint8_t) (xi0 + 128);
}
}
} else {
// Symmetric: store signed int8 values directly
auto * signed_weights = reinterpret_cast<int8_t *>(weights);
for (int i = 0; i < nb; i++) {
float amax = 0.0f;
for (int j = 0; j < qk; j++) {
const float v = x[i * qk + j];
amax = std::max(amax, fabsf(v));
}
const float d = amax / 127.0f;
const float id = d ? 1.0f / d : 0.0f;
scales[i] = ov::float16(d);
for (int j = 0; j < qk; ++j) {
const float x0 = x[i * qk + j] * id;
signed_weights[i * qk + j] = (int8_t) roundf(x0);
}
}
}
}

void quantize_q8_1(const float * x,
@@ -861,12 +937,8 @@ void quantize_q8_1(const float * x,
for (int j = 0; j < qk; j++) {
const float v = x[i * qk + j];
-if (v < min) {
-min = v;
-}
-if (v > max) {
-max = v;
-}
+min = std::min(v, min);
+max = std::max(v, max);
}
const float d = (max - min) / ((1 << 8) - 1);
+33 -7
@@ -9,12 +9,17 @@
#include <openvino/op/add.hpp>
#include <openvino/op/concat.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/cos.hpp>
#include <openvino/op/gather.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/shape_of.hpp>
#include <openvino/op/sin.hpp>
#include <openvino/op/slice.hpp>
#include <openvino/op/split.hpp>
#include <openvino/op/subtract.hpp>
#include <openvino/op/transpose.hpp>
#include <openvino/op/unsqueeze.hpp>
#include <vector>
@@ -33,6 +38,12 @@ OutputVector translate_rope(const NodeContext & context) {
auto data_node = context.get_input(0).get_node_shared_ptr();
auto output_shape = context.get_output_shape().to_shape();
int32_t * op_params = context.get_output_op_params();
const int mode = (op_case & 0xFFFF0000) >> 16;
op_case = (op_case & 0x0000FFFF);
constexpr int TYPE_NORMAL = 0;
constexpr int TYPE_NEOX = 1;
constexpr int TYPE_IMROPE = 2;
Output<Node> cos_theta_node;
Output<Node> sin_theta_node;
@@ -45,7 +56,7 @@ OutputVector translate_rope(const NodeContext & context) {
if (context.get_input_size() == 3) {
rope_freqs_weight = context.get_input(2).get_node_shared_ptr();
}
-auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight);
+auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight, mode == TYPE_IMROPE);
sin_theta_node = sin_cos.first;
cos_theta_node = sin_cos.second;
}
@@ -65,11 +76,7 @@ OutputVector translate_rope(const NodeContext & context) {
}
}
-const int mode = op_params[2];
-constexpr int ROPE_TYPE_NORMAL = 0;
-constexpr int ROPE_TYPE_NEOX = 2;
-if (mode == ROPE_TYPE_NORMAL) {
+if (mode == TYPE_NORMAL) {
auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
@@ -97,7 +104,7 @@ OutputVector translate_rope(const NodeContext & context) {
auto data_shape = ov::op::v0::Constant::create(
ov::element::i64, {4}, std::vector<int64_t>{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]});
res = std::make_shared<ov::op::v1::Reshape>(stack, data_shape, false);
-} else if (mode == ROPE_TYPE_NEOX) {
+} else if (mode == TYPE_NEOX) {
auto data_split = std::make_shared<ov::op::v1::Split>(
data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {-1}), 2);
Output<Node> slice_data_node_0 = data_split->outputs()[0];
@@ -112,6 +119,25 @@ OutputVector translate_rope(const NodeContext & context) {
std::make_shared<ov::op::v1::Multiply>(slice_data_node_1, cos_theta_node));
res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{first_half_node, second_half_node}, -1);
} else if (mode == TYPE_IMROPE) {
int64_t n_dims = data_node->get_shape()[3];
auto cos_sin_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4}, std::vector<int64_t>{1,-1,1,(n_dims >> 1)});
auto cos_reshaped = std::make_shared<ov::op::v1::Reshape>(cos_theta_node, cos_sin_shape, true);
auto sin_reshaped = std::make_shared<ov::op::v1::Reshape>(sin_theta_node, cos_sin_shape, true);
auto split_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {3});
auto split_a = std::make_shared<ov::op::v1::Split>(data_node, split_axis, 2);
auto x0 = split_a->output(0);
auto x1 = split_a->output(1);
auto mul_a = std::make_shared<ov::op::v1::Multiply>(x0, cos_reshaped);
auto mul_b = std::make_shared<ov::op::v1::Multiply>(x1, sin_reshaped);
auto sub = std::make_shared<ov::op::v1::Subtract>(mul_a, mul_b);
auto mul_c = std::make_shared<ov::op::v1::Multiply>(x0, sin_reshaped);
auto mul_d = std::make_shared<ov::op::v1::Multiply>(x1, cos_reshaped);
auto add = std::make_shared<ov::op::v1::Add>(mul_c, mul_d);
res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{sub, add}, 3);
}
return rename_outputs_with_suffix({res}, context.get_name());
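The IMROPE branch above is the rotate-half formulation: the last axis is split into halves (x0, x1) and rotated as (x0·cosθ − x1·sinθ, x0·sinθ + x1·cosθ) before being concatenated back. A scalar sketch of what the Split/Multiply/Subtract/Add/Concat subgraph computes:

```cpp
#include <cstddef>

// Rotate-half RoPE on one row: x0 = x[0..half), x1 = x[half..n_dims).
static void rope_rotate_half(const float * x, float * out, size_t n_dims,
                             const float * cos_t, const float * sin_t) {
    size_t half = n_dims / 2;
    for (size_t i = 0; i < half; ++i) {
        out[i]        = x[i] * cos_t[i] - x[i + half] * sin_t[i];
        out[i + half] = x[i] * sin_t[i] + x[i + half] * cos_t[i];
    }
}
```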
@@ -0,0 +1,25 @@
#include "../node_context.h"
#include "../op_table.h"
#include "../utils.h"
#include <openvino/core/node_output.hpp>
#include <openvino/op/gelu.hpp>
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
OutputVector translate_unary_gelu(const NodeContext & context) {
num_inputs_check(context, 1, 1);
auto input = context.get_input(0);
auto res = std::make_shared<ov::op::v7::Gelu>(input);
return rename_outputs_with_suffix({res}, context.get_name());
}
} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov
@@ -31,6 +31,7 @@ std::unordered_map<std::string, CreatorFunction> get_supported_ops() {
{"GGML_OP_SOFT_MAX", op::translate_soft_max }, {"GGML_OP_SOFT_MAX", op::translate_soft_max },
{"GGML_OP_SUB", op::translate_1to1_match_2_inputs<v1::Subtract>}, {"GGML_OP_SUB", op::translate_1to1_match_2_inputs<v1::Subtract>},
{"GGML_OP_TRANSPOSE", op::translate_transpose }, {"GGML_OP_TRANSPOSE", op::translate_transpose },
{"GGML_UNARY_OP_GELU", op::translate_unary_gelu },
{"GGML_UNARY_OP_SILU", op::translate_unary_silu }, {"GGML_UNARY_OP_SILU", op::translate_unary_silu },
{"GGML_OP_VIEW", op::translate_view }, {"GGML_OP_VIEW", op::translate_view },
{"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu }, {"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu },
@@ -21,6 +21,7 @@ GGML_OP_CONVERTER(translate_rms_norm);
GGML_OP_CONVERTER(translate_rope); GGML_OP_CONVERTER(translate_rope);
GGML_OP_CONVERTER(translate_scale); GGML_OP_CONVERTER(translate_scale);
GGML_OP_CONVERTER(translate_unary_silu); GGML_OP_CONVERTER(translate_unary_silu);
GGML_OP_CONVERTER(translate_unary_gelu);
GGML_OP_CONVERTER(translate_soft_max); GGML_OP_CONVERTER(translate_soft_max);
GGML_OP_CONVERTER(translate_transpose); GGML_OP_CONVERTER(translate_transpose);
GGML_OP_CONVERTER(translate_view); GGML_OP_CONVERTER(translate_view);
@@ -1,123 +0,0 @@
#include "eliminate_zp.h"
#include <openvino/core/graph_util.hpp>
#include <openvino/core/parallel.hpp>
#include <openvino/core/rt_info.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/subtract.hpp>
#include <openvino/pass/pattern/op/label.hpp>
#include <openvino/pass/pattern/op/pattern.hpp>
#include <openvino/pass/pattern/op/wrap_type.hpp>
namespace ov {
namespace frontend {
namespace ggml {
namespace pass {
EliminateZeroPoints::EliminateZeroPoints() {
// Find pattern:
// (Multiply Any(scale)
// (Subtract (Convert Constant(data)))
// (Convert Constant(zero_point)))
// where zero_point is a scalar
// If data is u4 and zp value is 8 (q4_0), Replace the Subtract with an i4 Constant whose value is data - zp_val
// If data is u8 and zp value is 128 (q8_0) or 32 (q6_k), Replace the Subtract with an i8 Constant
auto m_data_constant = ov::pass::pattern::wrap_type<ov::op::v0::Constant>();
auto m_data_convert = ov::pass::pattern::wrap_type<ov::op::v0::Convert>({m_data_constant});
auto m_zp_constant = ov::pass::pattern::wrap_type<ov::op::v0::Constant>();
auto m_zp_convert = ov::pass::pattern::wrap_type<ov::op::v0::Convert>({m_zp_constant});
auto m_subtract = ov::pass::pattern::wrap_type<ov::op::v1::Subtract>({m_data_convert, m_zp_convert});
auto m_scale = ov::pass::pattern::any_input();
auto m_multiply = ov::pass::pattern::wrap_type<ov::op::v1::Multiply>({m_scale, m_subtract});
const auto callback = [=](ov::pass::pattern::Matcher & m) {
const auto & pattern_map = m.get_pattern_value_map();
auto multiply_node =
std::dynamic_pointer_cast<ov::op::v1::Multiply>(pattern_map.at(m_multiply).get_node_shared_ptr());
auto subtract_node =
std::dynamic_pointer_cast<ov::op::v1::Subtract>(pattern_map.at(m_subtract).get_node_shared_ptr());
auto data_constant =
std::dynamic_pointer_cast<ov::op::v0::Constant>(pattern_map.at(m_data_constant).get_node_shared_ptr());
auto zp_constant =
std::dynamic_pointer_cast<ov::op::v0::Constant>(pattern_map.at(m_zp_constant).get_node_shared_ptr());
if (!multiply_node || !subtract_node || !data_constant || !zp_constant) {
return false;
}
if (ov::shape_size(zp_constant->get_shape()) != 1) {
return false;
}
auto data_type = data_constant->get_element_type();
auto zp_data = zp_constant->cast_vector<int>();
if (zp_data.empty()) {
return false;
}
int zp_value = zp_data[0];
bool should_eliminate = false;
ov::element::Type target_type;
if (data_type == ov::element::u4 && zp_value == 8) {
should_eliminate = true;
target_type = ov::element::i4;
} else if (data_type == ov::element::u8 && (zp_value == 128 || zp_value == 32)) {
should_eliminate = true;
target_type = ov::element::i8;
}
if (!should_eliminate) {
return false;
}
auto data_shape = data_constant->get_shape();
size_t total_elements = ov::shape_size(data_shape);
std::shared_ptr<ov::op::v0::Constant> new_constant;
// TODO improve performance
if (data_type == ov::element::u4) {
auto data_values = data_constant->cast_vector<uint8_t>();
std::vector<int8_t> adjusted_values(total_elements);
ov::parallel_for(total_elements, [&](size_t i) {
adjusted_values[i] = static_cast<int8_t>(static_cast<int>(data_values[i]) - 8);
});
new_constant = std::make_shared<ov::op::v0::Constant>(target_type, data_shape, adjusted_values);
} else if (data_type == ov::element::u8) {
auto data_values = data_constant->cast_vector<uint8_t>();
std::vector<int8_t> adjusted_values(total_elements);
ov::parallel_for(total_elements, [&, zp_value](size_t i) {
adjusted_values[i] = static_cast<int8_t>(static_cast<int>(data_values[i]) - zp_value);
});
new_constant = std::make_shared<ov::op::v0::Constant>(target_type, data_shape, adjusted_values);
}
auto new_convert =
std::make_shared<ov::op::v0::Convert>(new_constant, subtract_node->get_output_element_type(0));
ov::replace_node(subtract_node, new_convert);
return true;
};
register_matcher(
std::make_shared<ov::pass::pattern::Matcher>(m_multiply, "ov::frontend::ggml::pass::EliminateZeroPoints"),
callback);
}
} // namespace pass
} // namespace ggml
} // namespace frontend
} // namespace ov
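Editor's note: this removed pass (superseded by emitting i4/i8 directly for symmetric quantization) relied on the fold being lossless. A toy stand-alone illustration of why, not the pass itself: u4 values span 0..15, so subtracting the q4_0 zero point of 8 lands exactly in the i4 range -8..7; q8_0's u8 payload (0..255, zp 128) and q6_k's (0..63, zp 32) fit i8 the same way.

#include <cstdint>
#include <vector>

// Fold a scalar zero point into unsigned quantized data. The result stays in
// the signed range whenever the quantized values span [0, 2*zp) and zp <= 128.
std::vector<int8_t> fold_zero_point(const std::vector<uint8_t> & data, int zp) {
    std::vector<int8_t> out(data.size());
    for (size_t i = 0; i < data.size(); ++i) {
        out[i] = static_cast<int8_t>(static_cast<int>(data[i]) - zp);
    }
    return out;
}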
@@ -1,17 +0,0 @@
#include "openvino/pass/matcher_pass.hpp"
namespace ov {
namespace frontend {
namespace ggml {
namespace pass {
class EliminateZeroPoints : public ov::pass::MatcherPass {
public:
OPENVINO_MATCHER_PASS_RTTI("ov::frontend::ggml::pass::EliminateZeroPoints")
EliminateZeroPoints();
};
} // namespace pass
} // namespace ggml
} // namespace frontend
} // namespace ov
@@ -0,0 +1,41 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <openvino/core/core_visibility.hpp>
#include <openvino/core/node.hpp>
#include <openvino/core/runtime_attribute.hpp>
namespace ov {
/**
* @brief Holds weightless caching attributes of a single constant.
*
* WeightlessCacheAttribute class represents runtime info attribute that holds
* the values of original size of the constant in bytes and the binary offset of the
* constant's data in the weights file used by the weightless caching mechanism. It's
* not copyable in case the data was changed (the original node was replaced by a new
* one produced during the transformation pipeline) - in that case weightless caching
* can't be used for that constant.
*/
class OPENVINO_API WeightlessCacheAttribute : public RuntimeAttribute {
public:
OPENVINO_RTTI("WeightlessCacheAttribute", "0", RuntimeAttribute)
WeightlessCacheAttribute() = delete;
WeightlessCacheAttribute(size_t original_size, size_t bin_offset, ov::element::Type original_dtype)
: original_size(original_size),
bin_offset(bin_offset),
original_dtype(original_dtype) {}
bool is_copyable() const override;
size_t original_size;
size_t bin_offset;
ov::element::Type original_dtype;
};
} // namespace ov
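Editor's note: a hedged sketch of how a consumer could read the attribute back from a constant's rt_info, mirroring the lookup convention the translate session uses below (and assuming the usual ov::Any semantics of rt_info values):

#include <openvino/op/constant.hpp>

// Returns true and fills `offset` if the constant carries the attribute.
bool try_get_weightless_offset(const std::shared_ptr<ov::op::v0::Constant> & cnst, size_t & offset) {
    const auto & rt_info = cnst->get_rt_info();
    auto it = rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static());
    if (it == rt_info.end()) {
        return false;
    }
    const auto & attr = it->second.as<ov::WeightlessCacheAttribute>();
    offset = attr.bin_offset;  // unique key, not a real file offset (see below)
    return true;
}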
@@ -3,15 +3,16 @@
#include "ggml-openvino/openvino/node_context.h" #include "ggml-openvino/openvino/node_context.h"
#include "ggml-openvino/openvino/utils.h" #include "ggml-openvino/openvino/utils.h"
#include "input_model.h" #include "input_model.h"
#include "pass/eliminate_zp.h"
#include "pass/mark_decompression_convert_constant_folding.h" #include "pass/mark_decompression_convert_constant_folding.h"
#include "pass/squeeze_matmul.h" #include "pass/squeeze_matmul.h"
#include "rt_info/weightless_caching_attributes.hpp"
#include <cstdint> #include <cstdint>
#include <cstdlib> #include <cstdlib>
#include <map> #include <map>
#include <memory> #include <memory>
#include <openvino/core/node.hpp> #include <openvino/core/node.hpp>
#include <openvino/core/preprocess/pre_post_process.hpp>
#include <openvino/op/add.hpp> #include <openvino/op/add.hpp>
#include <openvino/op/broadcast.hpp> #include <openvino/op/broadcast.hpp>
#include <openvino/op/concat.hpp> #include <openvino/op/concat.hpp>
@@ -33,7 +34,6 @@
#include <openvino/op/unsqueeze.hpp> #include <openvino/op/unsqueeze.hpp>
#include <openvino/pass/constant_folding.hpp> #include <openvino/pass/constant_folding.hpp>
#include <openvino/pass/make_stateful.hpp> #include <openvino/pass/make_stateful.hpp>
#include <openvino/core/preprocess/pre_post_process.hpp>
namespace ov { namespace ov {
namespace frontend { namespace frontend {
@@ -240,6 +240,31 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo
resulting_model = std::make_shared<Model>(results, used_params); resulting_model = std::make_shared<Model>(results, used_params);
apply_transformations(resulting_model); apply_transformations(resulting_model);
// Set WeightlessCacheAttribute on large constants to avoid unnecessary memory copies
// in the NPUW plugin. Without this attribute, NPUW's LazyTensor constructor
// (lazy_tensor.cpp, op::Const::Const) will memcpy every constant "in case export
// occurs", doubling memory usage per compile_model call.
//
// The bin_offset field serves as a unique key (not a real file offset) — this is
// the same convention the GPU plugin uses for non-IR models (see
// Plugin::set_weightless_cache_attributes in intel_gpu/src/plugin/plugin.cpp).
// Each constant must have a distinct bin_offset, otherwise GPU's weightless cache
// import will map multiple constants to the same data.
//
// Small constants (< 16 elements) are excluded since they may be introduced by
// optimization patterns and the overhead is negligible.
size_t offset = 0;
for (auto & node : resulting_model->get_ordered_ops()) {
if (auto cnst = ov::as_type_ptr<ov::op::v0::Constant>(node);
cnst && cnst->get_byte_size() / cnst->get_element_type().size() >= 16) {
auto & rt_info = cnst->get_rt_info();
if (rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static()) == rt_info.end()) {
rt_info[ov::WeightlessCacheAttribute::get_type_info_static()] =
ov::WeightlessCacheAttribute(cnst->get_byte_size(), offset++, cnst->get_element_type());
}
}
}
return resulting_model;
}
@@ -257,7 +282,6 @@ std::shared_ptr<Model> TranslateSession::apply_transformations(std::shared_ptr<M
}
if (ggml_model_decoder->is_static()) {
manager.register_pass<pass::EliminateZeroPoints>();
manager.register_pass<pass::SqueezeMatmul>();
}
manager.run_passes(model);
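Editor's note: for orientation, the ov::pass::Manager pattern used in apply_transformations is register-then-run; passes execute in registration order over the whole model. A minimal sketch with a stock OpenVINO pass (the frontend-specific passes above follow the same shape):

#include <openvino/pass/constant_folding.hpp>
#include <openvino/pass/manager.hpp>

void run_minimal_pipeline(const std::shared_ptr<ov::Model> & model) {
    ov::pass::Manager manager;
    manager.register_pass<ov::pass::ConstantFolding>();  // stock pass, for illustration
    manager.run_passes(model);  // executes registered passes in order
}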
+38 -7
@@ -2,6 +2,7 @@
#include "ggml-impl.h" #include "ggml-impl.h"
#include <cmath>
#include <cstddef> #include <cstddef>
#include <ctime> #include <ctime>
#include <memory> #include <memory>
@@ -13,6 +14,7 @@
#include <openvino/op/gather.hpp>
#include <openvino/op/maximum.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/shape_of.hpp>
#include <openvino/op/sin.hpp>
#include <openvino/op/squeeze.hpp>
@@ -87,8 +89,11 @@ ov::Output<ov::Node> rope_yarn_ramp_mix(int n_dims, const float corr_dims[2], fl
auto ramp_y =
std::make_shared<ov::op::v1::Divide>(std::make_shared<ov::op::v1::Subtract>(dim_ids, corr_low), denom);
auto ramp_clamped = std::make_shared<ov::op::v0::Clamp>(ramp_y, 0.0f, 1.0f);
// rope_yarn_ramp returns (1 - clamp(y)), so invert before scaling
auto one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f});
auto ramp_inverted = std::make_shared<ov::op::v1::Subtract>(one, ramp_clamped);
auto ext_factor_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {ext_factor});
- auto ramp_mix = std::make_shared<ov::op::v1::Multiply>(ramp_clamped, ext_factor_node);
+ auto ramp_mix = std::make_shared<ov::op::v1::Multiply>(ramp_inverted, ext_factor_node);
return ramp_mix;
}
@@ -115,6 +120,7 @@ void ggml_rope_yarn_corr_dims(int n_dims,
std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params,
std::shared_ptr<ov::Node> inp_pos,
std::shared_ptr<ov::Node> rope_freqs_weight,
bool imrope,
bool stateful) {
if (stateful) {
inp_pos = std::make_shared<ov::op::v0::Squeeze>(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
@@ -122,6 +128,13 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
auto pos_perm =
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{3}, std::vector<int64_t>{2, 1, 0});
inp_pos = std::make_shared<ov::op::v1::Transpose>(inp_pos, pos_perm);
} else if (imrope) {
inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
auto pos_shape = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{5}, {0, 0, 0, 4, -1});
inp_pos = std::make_shared<ov::op::v1::Reshape>(inp_pos, pos_shape, true);
auto pos_transpose_shape =
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{5}, std::vector<int64_t>{0, 1, 2, 4, 3});
inp_pos = std::make_shared<ov::op::v1::Transpose>(inp_pos, pos_transpose_shape);
} else {
inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
auto pos_perm =
@@ -136,6 +149,7 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
float beta_fast;
float beta_slow;
const int n_dims = rope_params[1];
const size_t n_dims_half = n_dims >> 1;
const int n_ctx_orig = rope_params[4];
memcpy(&freq_base, rope_params + 5, sizeof(float));
memcpy(&freq_scale, rope_params + 6, sizeof(float));
@@ -146,16 +160,31 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
const float theta_scale = powf(freq_base, -2.0f / n_dims);
std::vector<float> factor(n_dims_half);
Output<Node> freq_factors;
Output<Node> theta;
float mscale = attn_factor;
if (imrope) {
std::vector<int64_t> gather_indices(n_dims_half);
for (size_t j = 0; j < n_dims_half; j++) {
gather_indices[j] = j % 3;
factor[j] = std::pow(theta_scale, j);
}
auto gather_indices_const =
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{n_dims_half}, gather_indices);
auto gather_axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {4});
inp_pos = std::make_shared<ov::op::v8::Gather>(inp_pos, gather_indices_const, gather_axis);
auto factor_const = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{n_dims_half}, factor);
theta = std::make_shared<ov::op::v1::Multiply>(inp_pos, factor_const);
} else {
float corr_dims[2];
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
std::vector<float> factor(n_dims / 2);
factor[0] = 1.0f;
for (size_t i = 1; i < factor.size(); i++) {
factor[i] = theta_scale * factor[i - 1];
}
Output<Node> freq_factors;
if (stateful) {
freq_factors =
std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor);
@@ -171,8 +200,6 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
auto theta_interp = std::make_shared<ov::op::v1::Multiply>(
theta_extrap, ov::op::v0::Constant::create(ov::element::f32, {1}, {freq_scale}));
Output<Node> theta;
float mscale = attn_factor;
if (ext_factor == 0.0f) {
theta = theta_interp;
} else {
@@ -189,14 +216,18 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
std::make_shared<ov::op::v1::Multiply>(theta_extrap, ramp_mix));
mscale *= (1.0f + 0.1f * std::log(1.0f / freq_scale));
}
}
Output<Node> cos_theta = std::make_shared<ov::op::v0::Cos>(theta);
Output<Node> sin_theta = std::make_shared<ov::op::v0::Sin>(theta);
if (!imrope) {
auto mscale_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {mscale});
cos_theta = std::make_shared<ov::op::v1::Multiply>(cos_theta, mscale_node);
sin_theta = std::make_shared<ov::op::v1::Multiply>(sin_theta, mscale_node);
}
return std::make_pair(sin_theta, cos_theta);
}
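Editor's note: the ramp fix above matches ggml's scalar rope_yarn_ramp, which returns 1 - clamp(y) rather than clamp(y); the previous graph code skipped the inversion. A scalar sketch of the corrected mix weight (variable names illustrative):

#include <algorithm>

// YaRN mix weight per rotary dimension: ext_factor below corr_low, 0 above
// corr_high, linear in between. Mirrors ggml's rope_yarn_ramp * ext_factor.
float yarn_ramp_mix(float dim_id, float corr_low, float corr_high, float ext_factor) {
    const float denom = std::max(0.001f, corr_high - corr_low);
    const float y = (dim_id - corr_low) / denom;
    return (1.0f - std::clamp(y, 0.0f, 1.0f)) * ext_factor;  // the (1 - ...) was missing
}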
+1
@@ -67,6 +67,7 @@ OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::
std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t* rope_params,
std::shared_ptr<ov::Node> inp_pos,
std::shared_ptr<ov::Node> rope_freqs_weight = nullptr,
bool imrope = false,
bool stateful = false);
ov::Output<ov::Node> process_view_input(const NodeContext& context, int input_index, int slice_len = 0);
+94 -37
@@ -81,8 +81,8 @@ ov::Tensor create_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<ov_runtime_context> r_ctx) {
auto & core = ov_singleton_core();
const auto & config = ggml_openvino_get_compile_config();
- auto device = r_ctx->device;
+ const auto & device = r_ctx->device;
- bool stateful = r_ctx->stateful;
+ const auto & stateful = r_ctx->stateful;
static auto is_static = false;
if (is_naive(cgraph)) {
@@ -106,14 +106,26 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
int64_t infer_end_time;
{
- std::lock_guard<std::mutex> lock(r_ctx->ov_compute_mutex);
- auto it = r_ctx->decoder_cache.find(key);
- cache_hit = it != r_ctx->decoder_cache.end();
+ std::shared_ptr<decoder_runtime_ctx> entry;
ModelParams old_m_params;
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
auto it = r_ctx->decoder_cache.find(key);
cache_hit = it != r_ctx->decoder_cache.end();
if (cache_hit) {
- ggml_decoder = it->second;
+ entry = it->second;
} else {
auto mutex = std::make_shared<std::mutex>();
entry = std::make_shared<decoder_runtime_ctx>(mutex);
r_ctx->decoder_cache[key] = entry;
}
}
std::lock_guard<std::mutex> lock(*(entry->mutex));
if (cache_hit) {
ggml_decoder = entry->ptr;
old_m_params = ggml_decoder->get_model_params();
cache_hit = old_m_params.can_reuse_dynamically(m_params);
}
@@ -126,7 +138,10 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
ggml_decoder->update_io(cgraph);
}
ggml_decoder->add_extra_inputs();
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
infer_request = r_ctx->infer_request_cache.at(key);
}
if (stateful) {
const auto * inp_pos = get_inp_pos_tensor(cgraph);
@@ -170,7 +185,10 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
conversion_end_time = decoder_end_time;
compile_end_time = decoder_end_time;
} else {
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
r_ctx->infer_request_cache.erase(key);
}
std::shared_ptr<ov::Model> model;
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
@@ -199,8 +217,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
}
compile_end_time = ggml_time_us();
infer_request = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
- r_ctx->infer_request_cache[key] = infer_request;
- r_ctx->decoder_cache[key] = ggml_decoder;
+ entry->ptr = ggml_decoder;
std::vector<std::string> ov_input_names;
std::vector<std::string> ov_output_names;
@@ -210,8 +227,13 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
for (const auto & ov_output : model->get_results()) {
ov_output_names.push_back(ov_output->get_friendly_name());
}
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
r_ctx->infer_request_cache[key] = infer_request;
r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
}
if (stateful) {
const auto * inp_pos = get_inp_pos_tensor(cgraph);
@@ -224,8 +246,13 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
}
}
- auto ov_input_names = r_ctx->ov_input_names_cache[key];
- auto ov_output_names = r_ctx->ov_output_names_cache[key];
+ std::vector<std::string> ov_input_names;
+ std::vector<std::string> ov_output_names;
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
ov_input_names = r_ctx->ov_input_names_cache[key];
ov_output_names = r_ctx->ov_output_names_cache[key];
}
for (size_t i = 0; i < ov_input_names.size(); i++) {
auto param_name = ov_input_names[i];
@@ -306,12 +333,26 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
int64_t compile_end_time;
int64_t infer_end_time;
- auto it = r_ctx->decoder_cache.find(key);
- cache_hit = it != r_ctx->decoder_cache.end();
+ std::shared_ptr<decoder_runtime_ctx> entry;
ModelParams old_m_params;
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
auto it = r_ctx->decoder_cache.find(key);
cache_hit = it != r_ctx->decoder_cache.end();
if (cache_hit) {
- ggml_decoder = it->second;
+ entry = it->second;
} else {
auto mutex = std::make_shared<std::mutex>();
entry = std::make_shared<decoder_runtime_ctx>(mutex);
r_ctx->decoder_cache[key] = entry;
}
}
std::lock_guard<std::mutex> lock(*(entry->mutex));
if (cache_hit) {
ggml_decoder = entry->ptr;
old_m_params = ggml_decoder->get_model_params();
cache_hit = old_m_params.can_reuse_statically(m_params);
}
@@ -325,14 +366,21 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
ggml_decoder->update_io(cgraph);
}
ggml_decoder->add_extra_inputs();
- infer_request = is_prefill ? r_ctx->infer_request_cache_prefill.at(key) : r_ctx->infer_request_cache.at(key);
+ {
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
infer_request =
is_prefill ? r_ctx->infer_request_cache_prefill.at(key) : r_ctx->infer_request_cache.at(key);
}
decoder_end_time = ggml_time_us();
conversion_end_time = decoder_end_time;
compile_end_time = decoder_end_time;
} else {
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
r_ctx->infer_request_cache.erase(key);
r_ctx->infer_request_cache_prefill.erase(key);
}
std::shared_ptr<ov::Model> model;
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
@@ -372,16 +420,14 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
compiled_model_decode = core.compile_model(model_decode, device, config);
}
- r_ctx->infer_request_cache_prefill[key] =
- std::make_shared<ov::InferRequest>(compiled_model_prefill.create_infer_request());
- r_ctx->infer_request_cache[key] =
- std::make_shared<ov::InferRequest>(compiled_model_decode.create_infer_request());
+ auto infer_request_prefill = std::make_shared<ov::InferRequest>(compiled_model_prefill.create_infer_request());
+ auto infer_request_decode = std::make_shared<ov::InferRequest>(compiled_model_decode.create_infer_request());
compile_end_time = ggml_time_us();
model = is_prefill ? model_prefill : model_decode;
ggml_decoder = is_prefill ? ggml_decoder_prefill : ggml_decoder_decode;
- infer_request = is_prefill ? r_ctx->infer_request_cache_prefill[key] : r_ctx->infer_request_cache[key];
+ infer_request = is_prefill ? infer_request_prefill : infer_request_decode;
- r_ctx->decoder_cache[key] = ggml_decoder;
+ entry->ptr = ggml_decoder;
std::vector<std::string> ov_input_names;
std::vector<std::string> ov_output_names;
@@ -391,18 +437,29 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
for (const auto & ov_output : model->get_results()) {
ov_output_names.push_back(ov_output->get_friendly_name());
}
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
r_ctx->infer_request_cache_prefill[key] = infer_request_prefill;
r_ctx->infer_request_cache[key] = infer_request_decode;
r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
}
}
- auto ov_input_names = r_ctx->ov_input_names_cache[key];
- auto ov_output_names = r_ctx->ov_output_names_cache[key];
+ std::vector<std::string> ov_input_names_local;
+ std::vector<std::string> ov_output_names_local;
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
ov_input_names_local = r_ctx->ov_input_names_cache[key];
ov_output_names_local = r_ctx->ov_output_names_cache[key];
}
if (is_prefill) {
auto inp_len = inp_pos->ne[0];
for (int chunk_index = 0; chunk_index * prefill_chunk_size < inp_len; chunk_index++) {
- for (size_t i = 0; i < ov_input_names.size(); i++) {
+ for (size_t i = 0; i < ov_input_names_local.size(); i++) {
- auto param_name = ov_input_names[i];
+ auto param_name = ov_input_names_local[i];
auto input_tensor = get_ov_input_tensor_static_prefill(ggml_decoder, param_name, chunk_index);
infer_request->set_input_tensor(i, input_tensor);
@@ -412,8 +469,8 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
}
}
- for (size_t i = 0; i < ov_output_names.size(); i++) {
+ for (size_t i = 0; i < ov_output_names_local.size(); i++) {
- auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]);
+ auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names_local[i]);
auto output_tensor = create_ov_output_tensor(ggml_decoder, infer_request, i, ggml_tensor);
infer_request->set_output_tensor(i, output_tensor);
}
@@ -421,16 +478,16 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
infer_request->infer();
if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
- for (size_t i = 0; i < ov_output_names.size(); i++) {
+ for (size_t i = 0; i < ov_output_names_local.size(); i++) {
const auto output_tensor = infer_request->get_output_tensor(i);
- print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
+ print_output_tensor_info(ov_output_names_local[i], output_tensor, output_tensor.data());
}
}
}
infer_end_time = ggml_time_us();
} else {
- for (size_t i = 0; i < ov_input_names.size(); i++) {
+ for (size_t i = 0; i < ov_input_names_local.size(); i++) {
- auto param_name = ov_input_names[i];
+ auto param_name = ov_input_names_local[i];
auto input_tensor = get_ov_input_tensor_static_decode(ggml_decoder, param_name);
infer_request->set_input_tensor(i, input_tensor);
@@ -440,8 +497,8 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
}
}
- for (size_t i = 0; i < ov_output_names.size(); i++) {
+ for (size_t i = 0; i < ov_output_names_local.size(); i++) {
- auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]);
+ auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names_local[i]);
auto output_tensor = create_ov_output_tensor(ggml_decoder, infer_request, i, ggml_tensor);
infer_request->set_output_tensor(i, output_tensor);
}
@@ -450,9 +507,9 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
infer_end_time = ggml_time_us();
if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
- for (size_t i = 0; i < ov_output_names.size(); i++) {
+ for (size_t i = 0; i < ov_output_names_local.size(); i++) {
const auto output_tensor = infer_request->get_output_tensor(i);
- print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
+ print_output_tensor_info(ov_output_names_local[i], output_tensor, output_tensor.data());
}
}
}
+23 -3
@@ -3,12 +3,15 @@
#include "ggml-impl.h" #include "ggml-impl.h"
#include <algorithm> #include <algorithm>
#include <atomic>
#include <cstddef> #include <cstddef>
#include <memory> #include <memory>
#include <mutex>
#include <openvino/runtime/core.hpp> #include <openvino/runtime/core.hpp>
#include <openvino/runtime/infer_request.hpp> #include <openvino/runtime/infer_request.hpp>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <utility>
#include <vector> #include <vector>
struct graph_key { struct graph_key {
@@ -40,11 +43,17 @@ struct graph_key_hash {
}
};
struct decoder_runtime_ctx {
decoder_runtime_ctx(std::shared_ptr<std::mutex> mutex) : mutex(std::move(mutex)) {}
std::shared_ptr<std::mutex> mutex;
std::shared_ptr<GgmlOvDecoder> ptr;
};
struct ov_runtime_context {
- std::mutex ov_compute_mutex;
+ mutable std::mutex ctx_mutex;
std::string device;
bool stateful;
- std::unordered_map<graph_key, std::shared_ptr<GgmlOvDecoder>, graph_key_hash> decoder_cache;
+ std::unordered_map<graph_key, std::shared_ptr<decoder_runtime_ctx>, graph_key_hash> decoder_cache;
std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache;
std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache_prefill;
std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_input_names_cache;
@@ -53,11 +62,22 @@ struct ov_runtime_context {
// Simultaneous stateful inference request support to be added.
size_t stateful_kv_size;
std::map<std::string, std::string> kv_state_input_name_map;
std::atomic<int> backend_count;
ov_runtime_context() :
device("CPU"),
stateful(false),
- stateful_kv_size(0) {}
+ stateful_kv_size(0),
backend_count(0) {}
void clear_caches() {
std::lock_guard<std::mutex> lock(ctx_mutex);
decoder_cache.clear();
infer_request_cache.clear();
infer_request_cache_prefill.clear();
ov_input_names_cache.clear();
ov_output_names_cache.clear();
}
};
enum ggml_status ov_graph_compute(struct ggml_cgraph * cgraph, ggml_backend_t backend);
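Editor's note: ov_runtime_context encodes a two-level locking scheme: ctx_mutex guards only the cache maps, while each decoder_runtime_ctx carries its own mutex that serializes the expensive compile/infer path per graph key. A distilled sketch of the lookup step the compute functions above follow (illustrative helper, not part of the header):

#include <memory>
#include <mutex>

std::shared_ptr<decoder_runtime_ctx> get_or_create_entry(ov_runtime_context & r_ctx, const graph_key & key) {
    std::lock_guard<std::mutex> map_lock(r_ctx.ctx_mutex);  // short critical section
    auto it = r_ctx.decoder_cache.find(key);
    if (it != r_ctx.decoder_cache.end()) {
        return it->second;
    }
    auto entry = std::make_shared<decoder_runtime_ctx>(std::make_shared<std::mutex>());
    r_ctx.decoder_cache[key] = entry;
    return entry;
}
// The caller then holds std::lock_guard<std::mutex>(*entry->mutex) for the long
// compile/infer work, so unrelated graphs are not serialized against each other.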