openvino: driver setup, CI split, thread safety, and NPU optimizations (#21944)

* Thread safety per request only

* Fix ROPE yarn case

* Fix sticky stateful config

* Use i4/i8 directly for symmetric quant (see the sketch after this list)

* Use weightless caching

* Add WeightlessCacheAttribute to reduce NPU memory usage

* Gelu tanh support (#125)

* Imrope support (#126)

* fix(openvino): explicit ov::Tensor frees in ggml_backend_openvino_free

* add GPU,NPU support in OV Dockerfile

* add build-openvino.yml ci

* Fix sticky stateful config

* add concurrency to ov-gpu ci runs. Move OV CI to build-openvino.yml

* fix thread-safety of shared runtime context

* rope type abstraction for frontend translations

* fix editorconfig
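For reference on the symmetric-quant bullets above: asymmetric quantization dequantizes as w = (q - zp) * s with unsigned storage, while the symmetric path now stores signed i4/i8 directly and drops the zero-point tensor entirely. A minimal sketch of the two forms (illustrative helper names, not the backend's API):

```cpp
#include <cstdint>

// Asymmetric: unsigned quant value plus a per-block zero point.
static float dequant_asymmetric(uint8_t q, uint8_t zp, float s) {
    return (static_cast<int>(q) - static_cast<int>(zp)) * s;
}

// Symmetric: signed i4/i8 value, no zero point needed (w = q * s).
static float dequant_symmetric(int8_t q, float s) {
    return q * s;
}
```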

---------

Co-authored-by: Mustafa Cavus <mustafa.cavus@intel.com>
Co-authored-by: Dan Hoffman <dhoff749@gmail.com>
Co-authored-by: Ravi Panchumarthy <ravi.panchumarthy@intel.com>
Author: Zijun Yu
Date: 2026-04-21 23:58:34 +08:00 (committed via GitHub)
parent 606fa42f5d
commit 52f1096f21
21 changed files with 823 additions and 544 deletions
+48 -2
@@ -2,7 +2,19 @@ ARG OPENVINO_VERSION_MAJOR=2026.0
ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
ARG UBUNTU_VERSION=24.04
-# Optional proxy build arguments - empty by default
+# Intel GPU driver versions. https://github.com/intel/compute-runtime/releases
ARG IGC_VERSION=v2.30.1
ARG IGC_VERSION_FULL=2_2.30.1+20950
ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
ARG IGDGMM_VERSION=22.9.0
# Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases
ARG NPU_DRIVER_VERSION=v1.32.0
ARG NPU_DRIVER_FULL=v1.32.0.20260402-23905121947
ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2
# Optional proxy build arguments
ARG http_proxy=
ARG https_proxy=
@@ -78,13 +90,47 @@ ARG http_proxy
ARG https_proxy
RUN apt-get update \
-&& apt-get install -y libgomp1 libtbb12 curl \
+&& apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
# Install GPU drivers
ARG IGC_VERSION
ARG IGC_VERSION_FULL
ARG COMPUTE_RUNTIME_VERSION
ARG COMPUTE_RUNTIME_VERSION_FULL
ARG IGDGMM_VERSION
RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& dpkg --install *.deb \
&& rm -rf /tmp/neo/
# Install NPU drivers
ARG NPU_DRIVER_VERSION
ARG NPU_DRIVER_FULL
ARG LIBZE1_VERSION
RUN mkdir /tmp/npu/ && cd /tmp/npu/ \
&& wget https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
&& tar -xf linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
&& dpkg --install *.deb \
&& rm -rf /tmp/npu/
RUN cd /tmp \
&& wget https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb \
&& dpkg --install libze1_${LIBZE1_VERSION}_amd64.deb \
&& rm libze1_${LIBZE1_VERSION}_amd64.deb
COPY --from=build /app/lib/ /app/

### Full (all binaries)
+120
@@ -0,0 +1,120 @@
name: CI (openvino)
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: [
'.github/workflows/build-openvino.yml',
'**/CMakeLists.txt',
'**/*.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
]
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/build-openvino.yml',
'ggml/src/ggml-openvino/**'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
jobs:
ubuntu-24-openvino:
name: ubuntu-24-openvino-${{ matrix.openvino_device }}
concurrency:
group: openvino-${{ matrix.variant }}-${{ github.head_ref || github.ref }}
cancel-in-progress: false
strategy:
matrix:
include:
- variant: cpu
runner: '"ubuntu-24.04"'
openvino_device: "CPU"
- variant: gpu
runner: '["self-hosted","Linux","Intel","OpenVINO"]'
openvino_device: "GPU"
runs-on: ${{ fromJSON(matrix.runner) }}
env:
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
if: runner.environment == 'github-hosted'
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
- name: Use OpenVINO Toolkit Cache
if: runner.environment == 'github-hosted'
uses: actions/cache@v5
id: cache-openvino
with:
path: ./openvino_toolkit
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
- name: Setup OpenVINO Toolkit
if: steps.cache-openvino.outputs.cache-hit != 'true'
uses: ./.github/actions/linux-setup-openvino
with:
path: ./openvino_toolkit
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
version_full: ${{ env.OPENVINO_VERSION_FULL }}
- name: Install OpenVINO dependencies
run: |
cd ./openvino_toolkit
chmod +x ./install_dependencies/install_openvino_dependencies.sh
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
- name: Build
id: cmake_build
run: |
source ./openvino_toolkit/setupvars.sh
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENVINO=ON
time cmake --build build/ReleaseOV --config Release -j $(nproc)
- name: Test
id: cmake_test
# TODO: fix and re-enable the `test-llama-archs` test below
run: |
cd ${{ github.workspace }}
if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
export GGML_OPENVINO_DEVICE=GPU
fi
ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
+4
@@ -265,6 +265,10 @@ jobs:
ggml-ci-intel-openvino-gpu-low-perf:
runs-on: [self-hosted, Linux, Intel, OpenVINO]
concurrency:
group: openvino-gpu-${{ github.head_ref || github.ref }}
cancel-in-progress: false
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"
-80
@@ -656,86 +656,6 @@ jobs:
-DGGML_SYCL_F16=ON
time cmake --build build --config Release -j $(nproc)
ubuntu-24-openvino:
name: ubuntu-24-openvino-${{ matrix.openvino_device }}
strategy:
matrix:
include:
- variant: cpu
runner: '"ubuntu-24.04"'
openvino_device: "CPU"
- variant: gpu
runner: '["self-hosted","Linux","X64","Intel"]'
openvino_device: "GPU"
runs-on: ${{ fromJSON(matrix.runner) }}
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
if: runner.environment == 'github-hosted'
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
- name: Use OpenVINO Toolkit Cache
if: runner.environment == 'github-hosted'
uses: actions/cache@v5
id: cache-openvino
with:
path: ./openvino_toolkit
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
- name: Setup OpenVINO Toolkit
if: steps.cache-openvino.outputs.cache-hit != 'true'
uses: ./.github/actions/linux-setup-openvino
with:
path: ./openvino_toolkit
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
version_full: ${{ env.OPENVINO_VERSION_FULL }}
- name: Install OpenVINO dependencies
run: |
cd ./openvino_toolkit
chmod +x ./install_dependencies/install_openvino_dependencies.sh
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
- name: Build
id: cmake_build
run: |
source ./openvino_toolkit/setupvars.sh
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENVINO=ON
time cmake --build build/ReleaseOV --config Release -j $(nproc)
- name: Test
id: cmake_test
# TODO: fix and re-enable the `test-llama-archs` test below
run: |
cd ${{ github.workspace }}
if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
export GGML_OPENVINO_DEVICE=GPU
fi
ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
windows-latest:
runs-on: windows-2025
-3
@@ -244,7 +244,6 @@ build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf"
- `-fa 1` is required when running llama-bench with the OpenVINO backend.
- `GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1`
- `llama-server` with OpenVINO backend supports only one chat session/thread, when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled.
-- For Intel GPU, NPU detection in containers, GPU, NPU user-space drivers/libraries must be present inside the image. We will include in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile)

> [!NOTE]
> The OpenVINO backend is actively under development. Fixes are underway, and this document will continue to be updated as issues are resolved.
@@ -274,8 +273,6 @@ docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_p
Run llama.cpp with OpenVINO backend Docker container.
Save sample models in `~/models` as [shown above](#3-download-sample-model). It will be mounted to the container in the examples below.
-> [!NOTE]
-> Intel GPU, NPU detection in containers will be included in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile).

```bash
# Run Docker container
+15 -5
@@ -19,7 +19,6 @@
#include <iomanip>
#include <map>
#include <memory>
-#include <mutex>
#include <openvino/core/dimension.hpp>
#include <openvino/core/except.hpp>
#include <openvino/core/node.hpp>
@@ -207,8 +206,22 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
break;
}
case GGML_OP_ROPE: {
const int mode = node->op_params[2];
switch (mode) {
case GGML_ROPE_TYPE_NEOX: {
op_case = 0x00010000;
break;
}
case GGML_ROPE_TYPE_IMROPE: {
op_case = 0x00020000;
break;
}
default:
op_case = 0x00000000;
break;
}
if (node->src[0]->op == GGML_OP_VIEW) {
-op_case = 2;
+op_case = (op_case | 0x00000002);
}
break;
}
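For reference, the ROPE `op_case` above is a small bit-packed value: the high 16 bits carry the rope variant (0 = normal, 1 = NEOX, 2 = IMROPE) and the low 16 bits carry layout flags such as the view-source bit. A minimal sketch of the scheme, with hypothetical helper names:

```cpp
// Hypothetical helpers mirroring the op_case packing used above.
static int encode_rope_case(int variant, bool src_is_view) {
    return (variant << 16) | (src_is_view ? 0x00000002 : 0x00000000);
}

static int rope_variant(int op_case) { return (op_case & 0xFFFF0000) >> 16; } // consumed in translate_rope
static int rope_flags(int op_case)   { return op_case & 0x0000FFFF; }
```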
@@ -573,9 +586,6 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
}

std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) {
-static std::mutex weights_mutex;
-std::lock_guard<std::mutex> lock(weights_mutex);
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
auto * nodes = cgraph->nodes;
auto n_nodes = cgraph->n_nodes;
+18 -11
@@ -6,6 +6,7 @@
#include <cstring>
#include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
#include <openvino/runtime/intel_npu/level_zero/level_zero.hpp>
+#include <openvino/runtime/properties.hpp>
#include <optional>

ov::Core & ov_singleton_core() {
@@ -42,11 +43,13 @@ void ggml_openvino_device_config::init() {
{"NPUW_DQ", "YES" }, {"NPUW_DQ", "YES" },
{"NPUW_DQ_FULL", "NO" }, {"NPUW_DQ_FULL", "NO" },
}; };
if (cache_dir) { if (cache_dir && strlen(cache_dir) > 0) {
compile_config["NPUW_CACHE_DIR"] = cache_dir; compile_config["NPUW_CACHE_DIR"] = cache_dir;
compile_config.insert(ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));
} }
} else if (cache_dir) { } else if (cache_dir && strlen(cache_dir) > 0) {
ov_singleton_core().set_property(ov::cache_dir(cache_dir)); compile_config.insert(ov::cache_dir(cache_dir));
compile_config.insert(ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));
} }
// Initialize remote context with queue sharing for GPU // Initialize remote context with queue sharing for GPU
@@ -259,10 +262,12 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t);
-// For symmetric quantization, we only need one zp value (not one per block)
-// Zero points are stored in U4 or U8 format matching the weight type
-size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
-layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;
+// For symmetric quantization, no zp needed (weights stored as signed)
+if (layout.is_symmetric) {
+layout.zp_size = 0;
+} else {
+layout.zp_size = layout.is_u4 ? ((n_blocks + 1) / 2) : n_blocks;
+}
layout.weights_offset = 0;
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
@@ -313,10 +318,12 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
// Scales: F16 per block
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
-// Zero points: U4 or U8 matching weight type
-// For symmetric quantization, we only need one zp value (not one per block)
-size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
-layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;
+// For symmetric quantization, no zp needed (weights stored as signed)
+if (layout.is_symmetric) {
+layout.zp_size = 0;
+} else {
+layout.zp_size = layout.is_u4 ? ((n_blocks + 1) / 2) : n_blocks;
+}
// Layout in buffer: [weights | scales | zp] with alignment
layout.weights_offset = 0;
+29 -13
@@ -145,13 +145,18 @@ static void * ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer
return ctx->data;
}
static bool is_stateful_enabled() {
static const auto * stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION");
return stateful && *stateful != '\0' && strcmp(stateful, "0") != 0;
}
static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
// GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name);
ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
// Put kvcache on device memory for GPU (NPU memory is too small even for kvcache)
if (strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote && ggml_openvino_get_device_name() == "GPU" &&
-!getenv("GGML_OPENVINO_STATEFUL_EXECUTION")) {
+!is_stateful_enabled()) {
GGML_ASSERT(ctx->tensor_extras.empty());
auto device = ctx->device;
auto size = ctx->size;
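Note the semantics change in is_stateful_enabled() above: an unset, empty, or "0" value of GGML_OPENVINO_STATEFUL_EXECUTION now disables stateful execution, whereas the old bare getenv() check treated any set value, including "0", as enabled. A quick truth-table sketch of the difference:

```cpp
#include <cstdlib>
#include <cstring>

static bool is_stateful_enabled() {
    static const auto * stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION");
    return stateful && *stateful != '\0' && strcmp(stateful, "0") != 0;
}

// GGML_OPENVINO_STATEFUL_EXECUTION unset -> false (old getenv() check: false)
// GGML_OPENVINO_STATEFUL_EXECUTION=""   -> false (old check: true)
// GGML_OPENVINO_STATEFUL_EXECUTION=0    -> false (old check: true)
// GGML_OPENVINO_STATEFUL_EXECUTION=1    -> true  (old check: true)
```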
@@ -600,6 +605,14 @@ bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft) {
static void ggml_backend_openvino_free(ggml_backend_t backend) {
ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *) backend->context;
if (ctx->runtime_context) {
auto r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->runtime_context);
if (--r_ctx->backend_count == 0) {
r_ctx->clear_caches();
}
}
delete ctx;
delete backend;
}
@@ -644,7 +657,12 @@ static ggml_guid_t ggml_backend_openvino_guid(void) {
}

static std::shared_ptr<ov_runtime_context> get_ov_runtime_context_ptr() {
-static std::shared_ptr<ov_runtime_context> r_ctx = std::make_shared<ov_runtime_context>();
+static std::shared_ptr<ov_runtime_context> r_ctx = [] {
+auto ctx = std::make_shared<ov_runtime_context>();
+ctx->device = ggml_openvino_get_device_name();
+ctx->stateful = is_stateful_enabled() && !ggml_openvino_is_npu();
+return ctx;
+}();
return r_ctx;
}
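Moving the device/stateful assignment into the immediately-invoked lambda gives thread-safe one-time setup: C++11 guarantees a function-local static is initialized exactly once even under concurrent first calls, so later ggml_backend_openvino_init() calls can no longer race on re-assigning the shared context. A generic sketch of the pattern (names illustrative):

```cpp
#include <memory>

struct shared_state {
    int backend_count = 0;
    // ... device name, caches, flags ...
};

static std::shared_ptr<shared_state> get_shared_state() {
    // The lambda body runs exactly once (C++11 magic statics), so multi-step
    // configuration happens atomically with respect to other first callers.
    static std::shared_ptr<shared_state> state = [] {
        auto s = std::make_shared<shared_state>();
        // one-time configuration goes here
        return s;
    }();
    return state;
}
```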
@@ -669,8 +687,7 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) {
}
std::shared_ptr<ov_runtime_context> r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->runtime_context);
-r_ctx->device = ggml_openvino_get_device_name();
-r_ctx->stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION") && !ggml_openvino_is_npu();
+r_ctx->backend_count++;

ggml_backend_t openvino_backend = new ggml_backend{
/* .guid = */ ggml_backend_openvino_guid(),
@@ -883,7 +900,7 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
const int32_t * op_params = op->op_params;
const int n_dims = op_params[1];
const int mode = op_params[2];
-if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX) {
+if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX && mode != GGML_ROPE_TYPE_IMROPE) {
// GGML_LOG_WARN("OpenVINO backend does not support ROPE with mode %d\n", mode);
return true;
}
@@ -896,14 +913,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
// GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type)); // GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type));
return true; return true;
} }
float freq_scale;
float ext_factor;
memcpy(&freq_scale, op_params + 6, sizeof(float));
memcpy(&ext_factor, op_params + 7, sizeof(float));
if (ext_factor != 0.0f) {
// GGML_LOG_WARN("OpenVINO backend does not support ROPE with ext_factor %f != 0.0f\n", ext_factor);
return true;
}
if (op->src[0]->op == GGML_OP_VIEW) { if (op->src[0]->op == GGML_OP_VIEW) {
if (op->src[0]->view_src->ne[1] != op->src[0]->ne[2]) { if (op->src[0]->view_src->ne[1] != op->src[0]->ne[2]) {
// GGML_LOG_WARN( // GGML_LOG_WARN(
@@ -913,6 +922,12 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
return true;
}
}
if (mode == GGML_ROPE_TYPE_IMROPE &&
(op->src[2] != 0 || ((const float *) op_params)[6] != 1 || ((const float *) op_params)[7] != 0 ||
((const float *) op_params)[8] != 1)) {
// GGML_LOG_WARN("OpenVINO backend does not support IMROPE with freq_factors, freq_scale, ext_factor, and attn_factor\n");
return true;
}
break;
}
default:
@@ -942,6 +957,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
// GGML_OP_SOFT_MAX,
GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY};
static const std::set<ggml_unary_op> supported_unary_ops{
GGML_UNARY_OP_GELU,
GGML_UNARY_OP_SILU,
};
static const std::set<ggml_glu_op> supported_glu_ops{
+180 -108
@@ -46,6 +46,7 @@ void unpack_32_4(const uint8_t * data, uint8_t * dst) {
// Extracts (weight, scales, zp) from Q4_0 tensors.
// Data layout is: |16 bit scale|32 x 4bit weights|.
// When zp_arr is empty (symmetric), weights are stored as signed i4 (value - 8).
void extract_q4_0_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
@@ -55,28 +56,32 @@ void extract_q4_0_data(const ggml_tensor * tensor,
auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+bool is_symmetric = (weights_arr.get_element_type() == ov::element::i4); // Signed i4 path
+if (!is_symmetric) {
auto * zp = static_cast<uint8_t *>(zp_arr.data());
-bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
-// For Q4_0, zero point is always 8
-if (is_scalar_zp) {
-zp[0] = 8 | (8 << 4); // Pack two 4-bit values
-}
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
-// For asymmetric quantization, compute per-block zero points
-if (!is_scalar_zp) {
// Pack two 4-bit zero points per byte
if (i % 2 == 0) {
zp[i / 2] = 8; // Lower nibble
} else {
zp[i / 2] |= (8 << 4); // Upper nibble
}
-}
unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
});
+} else {
+// Symmetric: unpack as u4 then convert to i4 by subtracting 8 (XOR each nibble)
+ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
+scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
+unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
+// Convert u4 to i4: subtract 8 from each nibble. XOR 0x88 flips each nibble by 8.
+for (int j = 0; j < 16; ++j) {
+weights[i * 16 + j] ^= 0x88;
+}
+});
+}
}
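The u4-to-i4 conversion above works because, in 4-bit two's complement, subtracting 8 is the same as flipping the top bit, so XOR-ing a packed byte with 0x88 rebiases both nibbles from [0, 15] to [-8, 7] in one operation. A small self-check of that identity:

```cpp
#include <cassert>

int main() {
    for (int v = 0; v < 16; ++v) {
        int n = (v ^ 8) & 0xF;               // nibble after the XOR
        int as_signed = n >= 8 ? n - 16 : n; // read it as 4-bit two's complement
        assert(as_signed == v - 8);          // XOR with 8 == subtract 8
    }
    return 0;
}
```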
// Extracts (weight, scales, zp) from Q4_1 tensors.
@@ -123,6 +128,7 @@ void extract_q4_1_data(const ggml_tensor * tensor,
// Extracts (weight, scales, zp) from Q8_0 tensors.
// Data layout is: |16 bit scale|32 x 8bit weights|.
// When zp_arr is empty (symmetric), weights are stored as signed i8 directly.
void extract_q8_0_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
@@ -133,29 +139,30 @@ void extract_q8_0_data(const ggml_tensor * tensor,
auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8); // Signed i8 path
+if (!is_symmetric) {
auto * zp = static_cast<uint8_t *>(zp_arr.data());
-bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
-// For Q8_0, zero point is always 128
-if (is_scalar_zp) {
-zp[0] = 128;
-}
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
uint8_t * block_data = data + i * bytes_per_block;
scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
-// For asymmetric quantization, store per-block zero points
-if (!is_scalar_zp) {
zp[i] = 128;
-}
for (size_t j = 0; j < weights_per_block; ++j) {
-uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes.
-// Original data is in int8_t, so we add a bias of -128 and invert the first bit.
-x ^= 1 << 7;
+uint8_t x = block_data[j + 2];
+x ^= 1 << 7; // Convert int8 to uint8 by flipping sign bit
weights[i * weights_per_block + j] = x;
}
});
+} else {
+// Symmetric: store original int8 values directly (no unsigned bias)
+ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
+uint8_t * block_data = data + i * bytes_per_block;
+scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
+// Copy int8 weights as-is (the tensor element type is i8)
+memcpy(weights + i * weights_per_block, block_data + 2, weights_per_block);
+});
+}
}

void unpack_256_4(const uint8_t * data, uint8_t * dst) {
@@ -256,33 +263,21 @@ void extract_q6_k_data(const ggml_tensor * tensor,
auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8); // Signed i8 path
+if (!is_symmetric) {
auto * zp = static_cast<uint8_t *>(zp_arr.data());
-bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
-// For Q6_K, zero point is always 32
-if (is_scalar_zp) {
-zp[0] = 32;
-}
ov::parallel_for(n_super_block, [&](size_t i) {
uint8_t * block_data = data + i * bytes_per_block;
-float scale_factor =
-static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104))); // (128+64+16)/2
+float scale_factor = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104)));
for (size_t j = 0; j < 16; j++) {
scales[j + i * 16] =
ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
-// For asymmetric quantization, store per-block zero points
-if (!is_scalar_zp) {
zp[j + i * 16] = 32;
-}
}
uint8_t * ql = block_data;
uint8_t * qh = block_data + 128;
for (int64_t j = 0; j < 32; ++j) {
weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4);
weights[i * 256 + j + 32] = (ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4);
@@ -294,6 +289,36 @@ void extract_q6_k_data(const ggml_tensor * tensor,
weights[i * 256 + j + 224] = (ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4);
}
});
} else {
// Symmetric: subtract 32 from each weight to store as signed i8
ov::parallel_for(n_super_block, [&](size_t i) {
uint8_t * block_data = data + i * bytes_per_block;
float scale_factor = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104)));
for (size_t j = 0; j < 16; j++) {
scales[j + i * 16] =
ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
}
uint8_t * ql = block_data;
uint8_t * qh = block_data + 128;
auto * signed_weights = reinterpret_cast<int8_t *>(weights);
for (int64_t j = 0; j < 32; ++j) {
signed_weights[i * 256 + j] = static_cast<int8_t>((ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4)) - 32;
signed_weights[i * 256 + j + 32] =
static_cast<int8_t>((ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4)) - 32;
signed_weights[i * 256 + j + 64] = static_cast<int8_t>((ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4)) - 32;
signed_weights[i * 256 + j + 96] =
static_cast<int8_t>((ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4)) - 32;
signed_weights[i * 256 + j + 128] =
static_cast<int8_t>((ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4)) - 32;
signed_weights[i * 256 + j + 160] =
static_cast<int8_t>((ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4)) - 32;
signed_weights[i * 256 + j + 192] =
static_cast<int8_t>((ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4)) - 32;
signed_weights[i * 256 + j + 224] =
static_cast<int8_t>((ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4)) - 32;
}
});
}
}

static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
@@ -389,11 +414,10 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
size_t group_size,
bool use_bias) {
ov::Shape orig_shape = weight.get_shape();
+bool is_signed = (weight.get_element_type() == ov::element::i8); // Symmetric: signed weights, no ZP
// Expand dimensions for scales and zp/bias
auto scale_shape = scales.get_shape();
-auto zp_shape = zp.get_shape();
-bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization
ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size};
@@ -403,25 +427,35 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
} else {
scale_shape.push_back(1);
scales.set_shape(scale_shape);
-// For symmetric quantization, zp remains scalar (don't resize)
-if (!is_scalar_zp) {
+if (!is_signed && zp.get_size() > 0) {
+auto zp_shape = zp.get_shape();
zp_shape.push_back(1);
zp.set_shape(zp_shape);
}
}
-// Create graph nodes
+auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
+ov::Output<ov::Node> result;
+if (is_signed) {
+// Signed path: q * s (no zero point subtraction needed)
+auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::i8, packed_shape,
+static_cast<uint8_t *>(weight.data()), nullptr);
+weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
+auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
+result = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+} else {
+// Unsigned path
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u8, packed_shape,
static_cast<uint8_t *>(weight.data()), nullptr);
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
-auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
-ov::Output<ov::Node> result;
-if (use_bias && !is_scalar_zp) {
+if (use_bias && zp.get_size() > 0) {
// Bias path: w * s + b (zp tensor holds f16 bias values)
auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
-auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+auto w_s =
+std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
} else {
// Zero point path: (w - zp) * s
@@ -435,6 +469,7 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY);
result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
}
}
if (packed_shape.size() != 2) {
// If not requantized channel-wise case, reshape back to original shape
@@ -452,11 +487,10 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
size_t group_size,
bool use_bias) {
ov::Shape orig_weight_shape = weight.get_shape();
+bool is_signed = (weight.get_element_type() == ov::element::i4); // Symmetric: signed weights, no ZP
// Expand dimensions for scales and zp/bias
ov::Shape scale_shape = scales.get_shape();
-auto zp_shape = zp.get_shape();
-bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization
// Create INT4 weight tensor
ov::Shape packed_shape = {orig_weight_shape[0], orig_weight_shape[1] / group_size, group_size};
@@ -467,24 +501,35 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
} else {
scale_shape.push_back(1);
scales.set_shape(scale_shape);
-// For symmetric quantization, zp remains scalar (don't resize)
-if (!is_scalar_zp) {
+if (!is_signed && zp.get_size() > 0) {
+auto zp_shape = zp.get_shape();
zp_shape.push_back(1);
zp.set_shape(zp_shape);
}
}
+auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
+ov::Output<ov::Node> result;
+if (is_signed) {
+// Signed path: q * s (no zero point subtraction needed)
+auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::i4, packed_shape,
+static_cast<uint8_t *>(weight.data()), nullptr);
+weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
+auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
+result = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+} else {
+// Unsigned path
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape,
static_cast<uint8_t *>(weight.data()), nullptr);
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
-auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
-ov::Output<ov::Node> result;
-if (use_bias && !is_scalar_zp) {
+if (use_bias && zp.get_size() > 0) {
// Bias path: w * s + b (zp tensor holds f16 bias values)
auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
-auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+auto w_s =
+std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
} else {
// Zero point path: (w - zp) * s
@@ -498,6 +543,7 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
}
}
if (packed_shape.size() != 2) {
// If not requantized channel-wise case, reshape back to original shape
@@ -699,25 +745,33 @@ OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, vo
// Quantized path (normal extraction or quantized requant)
// Create weight/scale/zp tensors - shared between both paths
-ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
+// For symmetric quantization, use signed types (i4/i8) and no ZP tensor
+ov::element::Type weight_type = layout.is_symmetric ? (layout.is_u4 ? ov::element::i4 : ov::element::i8) :
+(layout.is_u4 ? ov::element::u4 : ov::element::u8);
ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
-ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
if (output_base_ptr) {
uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
result.weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
result.scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
-result.zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset);
+if (!layout.is_symmetric) {
+ov::element::Type zp_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
+result.zp = ov::Tensor(zp_type, scale_shape, buf_base + layout.zp_offset);
+}
+// else: result.zp remains default-constructed (empty) for symmetric
} else {
result.weights = ov::Tensor(weight_type, node_shape);
result.scales = ov::Tensor(ov::element::f16, scale_shape);
-if (use_bias && !layout.is_symmetric) {
-// bias only has effect for asymmetric quant
-result.zp = ov::Tensor(ov::element::f16, zp_shape);
-} else {
-result.zp = ov::Tensor(weight_type, zp_shape);
-}
+if (!layout.is_symmetric) {
+if (use_bias) {
+result.zp = ov::Tensor(ov::element::f16, scale_shape);
+} else {
+ov::element::Type zp_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
+result.zp = ov::Tensor(zp_type, scale_shape);
+}
+}
+// else: result.zp remains default-constructed (empty) for symmetric
}
if (layout.is_requant && layout.requant_type.has_value()) {
result.weight_node = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block,
@@ -741,18 +795,13 @@ void quantize_q4_0(const float * x,
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+bool is_symmetric = (weights_arr.get_element_type() == ov::element::i4); // Signed i4 path
+if (!is_symmetric) {
auto * zp = static_cast<uint8_t *>(zp_arr.data());
-bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
-// For Q4_0, zero point is always 8
-if (is_scalar_zp) {
-zp[0] = 8 | (8 << 4); // Pack two 4-bit values
-}
for (int i = 0; i < nb; i++) {
-float amax = 0.0f; // absolute max
+float amax = 0.0f;
float max = 0.0f;
for (int j = 0; j < qk; j++) {
const float v = x[i * qk + j];
if (amax < fabsf(v)) {
@@ -760,34 +809,24 @@ void quantize_q4_0(const float * x,
max = v;
}
}
const float d = max / -8;
if (d == 0) {
scales[i] = ov::float16(1.0f);
-// zp is already set to 8 for symmetric, or set per-block for asymmetric
-if (!is_scalar_zp) {
if (i % 2 == 0) {
zp[i / 2] = 8;
} else {
zp[i / 2] |= (8 << 4);
}
-}
memset(weights + i * qk / 2, 8 | (8 << 4), qk / 2);
continue;
}
const float id = 1.0f / d;
scales[i] = ov::float16(d);
-// For asymmetric quantization, store per-block zero points
-if (!is_scalar_zp) {
if (i % 2 == 0) {
zp[i / 2] = 8;
} else {
zp[i / 2] |= (8 << 4);
}
-}
for (int j = 0; j < qk / 2; ++j) {
const float x0 = x[i * qk + 2 * j] * id;
const float x1 = x[i * qk + 2 * j + 1] * id;
@@ -796,6 +835,37 @@ void quantize_q4_0(const float * x,
weights[i * qk / 2 + j] = xi0 | (xi1 << 4);
}
}
} else {
// Symmetric: produce signed i4 values in [-8, 7]
for (int i = 0; i < nb; i++) {
float amax = 0.0f;
float max = 0.0f;
for (int j = 0; j < qk; j++) {
const float v = x[i * qk + j];
if (amax < fabsf(v)) {
amax = fabsf(v);
max = v;
}
}
const float d = max / -8;
if (d == 0) {
scales[i] = ov::float16(1.0f);
// i4 value 0 packed: 0x00
memset(weights + i * qk / 2, 0, qk / 2);
continue;
}
const float id = 1.0f / d;
scales[i] = ov::float16(d);
for (int j = 0; j < qk / 2; ++j) {
const float x0 = x[i * qk + 2 * j] * id;
const float x1 = x[i * qk + 2 * j + 1] * id;
// Signed i4: range [-8, 7]. Quantize as round(x*id), then pack as 4-bit two's complement.
int8_t si0 = (int8_t) std::max(-8, std::min(7, (int) roundf(x0)));
int8_t si1 = (int8_t) std::max(-8, std::min(7, (int) roundf(x1)));
weights[i * qk / 2 + j] = (si0 & 0x0F) | ((si1 & 0x0F) << 4);
}
}
}
}
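In both branches above the block scale is d = max / -8, where max is the value with the largest magnitude (sign kept), so that extreme value quantizes exactly to -8, the bottom of the signed 4-bit range; the symmetric branch then just rounds, clamps to [-8, 7], and packs two's-complement nibbles. A scalar sketch of that quantize step:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Quantize one value against the block scale d = max / -8; clamp to i4 range.
static int8_t quantize_i4(float x, float d) {
    if (d == 0.0f) {
        return 0;  // all-zero block
    }
    int q = (int) roundf(x / d);
    return (int8_t) std::max(-8, std::min(7, q));
}
```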
void quantize_q8_0(const float * x,
@@ -809,38 +879,44 @@ void quantize_q8_0(const float * x,
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+bool is_symmetric = (weights_arr.get_element_type() == ov::element::i8); // Signed i8 path
+if (!is_symmetric) {
auto * zp = static_cast<uint8_t *>(zp_arr.data());
-bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
-// For Q8_0, zero point is always 128
-if (is_scalar_zp) {
-zp[0] = 128;
-}
for (int i = 0; i < nb; i++) {
-float amax = 0.0f; // absolute max
+float amax = 0.0f;
for (int j = 0; j < qk; j++) {
const float v = x[i * qk + j];
-if (amax < fabsf(v)) {
-amax = fabsf(v);
-}
+amax = std::max(amax, fabsf(v));
}
const float d = amax / 127.0f;
const float id = d ? 1.0f / d : 0.0f;
scales[i] = ov::float16(d);
-// For asymmetric quantization, store per-block zero points
-if (!is_scalar_zp) {
zp[i] = 128;
-}
for (int j = 0; j < qk; ++j) {
const float x0 = x[i * qk + j] * id;
const int8_t xi0 = roundf(x0);
weights[i * qk + j] = (uint8_t) (xi0 + 128);
}
}
} else {
// Symmetric: store signed int8 values directly
auto * signed_weights = reinterpret_cast<int8_t *>(weights);
for (int i = 0; i < nb; i++) {
float amax = 0.0f;
for (int j = 0; j < qk; j++) {
const float v = x[i * qk + j];
amax = std::max(amax, fabsf(v));
}
const float d = amax / 127.0f;
const float id = d ? 1.0f / d : 0.0f;
scales[i] = ov::float16(d);
for (int j = 0; j < qk; ++j) {
const float x0 = x[i * qk + j] * id;
signed_weights[i * qk + j] = (int8_t) roundf(x0);
}
}
}
}

void quantize_q8_1(const float * x,
@@ -861,12 +937,8 @@ void quantize_q8_1(const float * x,
for (int j = 0; j < qk; j++) {
const float v = x[i * qk + j];
-if (v < min) {
-min = v;
-}
-if (v > max) {
-max = v;
-}
+min = std::min(v, min);
+max = std::max(v, max);
}
const float d = (max - min) / ((1 << 8) - 1);
+33 -7
@@ -9,12 +9,17 @@
#include <openvino/op/add.hpp>
#include <openvino/op/concat.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/cos.hpp>
#include <openvino/op/gather.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/shape_of.hpp>
#include <openvino/op/sin.hpp>
#include <openvino/op/slice.hpp>
#include <openvino/op/split.hpp>
#include <openvino/op/subtract.hpp>
#include <openvino/op/transpose.hpp>
#include <openvino/op/unsqueeze.hpp>
#include <vector>
@@ -33,6 +38,12 @@ OutputVector translate_rope(const NodeContext & context) {
auto data_node = context.get_input(0).get_node_shared_ptr();
auto output_shape = context.get_output_shape().to_shape();
int32_t * op_params = context.get_output_op_params();
const int mode = (op_case & 0xFFFF0000) >> 16;
op_case = (op_case & 0x0000FFFF);
constexpr int TYPE_NORMAL = 0;
constexpr int TYPE_NEOX = 1;
constexpr int TYPE_IMROPE = 2;
Output<Node> cos_theta_node;
Output<Node> sin_theta_node;
@@ -45,7 +56,7 @@ OutputVector translate_rope(const NodeContext & context) {
if (context.get_input_size() == 3) {
rope_freqs_weight = context.get_input(2).get_node_shared_ptr();
}
-auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight);
+auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight, mode == TYPE_IMROPE);
sin_theta_node = sin_cos.first;
cos_theta_node = sin_cos.second;
}
@@ -65,11 +76,7 @@ OutputVector translate_rope(const NodeContext & context) {
}
}
-const int mode = op_params[2];
-constexpr int ROPE_TYPE_NORMAL = 0;
-constexpr int ROPE_TYPE_NEOX = 2;
-if (mode == ROPE_TYPE_NORMAL) {
+if (mode == TYPE_NORMAL) {
auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
@@ -97,7 +104,7 @@ OutputVector translate_rope(const NodeContext & context) {
auto data_shape = ov::op::v0::Constant::create(
ov::element::i64, {4}, std::vector<int64_t>{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]});
res = std::make_shared<ov::op::v1::Reshape>(stack, data_shape, false);
-} else if (mode == ROPE_TYPE_NEOX) {
+} else if (mode == TYPE_NEOX) {
auto data_split = std::make_shared<ov::op::v1::Split>(
data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {-1}), 2);
Output<Node> slice_data_node_0 = data_split->outputs()[0];
@@ -112,6 +119,25 @@ OutputVector translate_rope(const NodeContext & context) {
std::make_shared<ov::op::v1::Multiply>(slice_data_node_1, cos_theta_node));
res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{first_half_node, second_half_node}, -1);
} else if (mode == TYPE_IMROPE) {
int64_t n_dims = data_node->get_shape()[3];
auto cos_sin_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4}, std::vector<int64_t>{1,-1,1,(n_dims >> 1)});
auto cos_reshaped = std::make_shared<ov::op::v1::Reshape>(cos_theta_node, cos_sin_shape, true);
auto sin_reshaped = std::make_shared<ov::op::v1::Reshape>(sin_theta_node, cos_sin_shape, true);
auto split_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {3});
auto split_a = std::make_shared<ov::op::v1::Split>(data_node, split_axis, 2);
auto x0 = split_a->output(0);
auto x1 = split_a->output(1);
auto mul_a = std::make_shared<ov::op::v1::Multiply>(x0, cos_reshaped);
auto mul_b = std::make_shared<ov::op::v1::Multiply>(x1, sin_reshaped);
auto sub = std::make_shared<ov::op::v1::Subtract>(mul_a, mul_b);
auto mul_c = std::make_shared<ov::op::v1::Multiply>(x0, sin_reshaped);
auto mul_d = std::make_shared<ov::op::v1::Multiply>(x1, cos_reshaped);
auto add = std::make_shared<ov::op::v1::Add>(mul_c, mul_d);
res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{sub, add}, 3);
}
return rename_outputs_with_suffix({res}, context.get_name());
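The IMROPE branch above is the rotate-half formulation: the last axis is split into halves (x0, x1) and rotated as (x0·cosθ − x1·sinθ, x0·sinθ + x1·cosθ) before being concatenated back. A scalar sketch of what the Split/Multiply/Subtract/Add/Concat subgraph computes:

```cpp
#include <cstddef>

// Rotate-half RoPE on one row: x0 = x[0..half), x1 = x[half..n_dims).
static void rope_rotate_half(const float * x, float * out, size_t n_dims,
                             const float * cos_t, const float * sin_t) {
    size_t half = n_dims / 2;
    for (size_t i = 0; i < half; ++i) {
        out[i]        = x[i] * cos_t[i] - x[i + half] * sin_t[i];
        out[i + half] = x[i] * sin_t[i] + x[i + half] * cos_t[i];
    }
}
```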
@@ -0,0 +1,25 @@
#include "../node_context.h"
#include "../op_table.h"
#include "../utils.h"
#include <openvino/core/node_output.hpp>
#include <openvino/op/gelu.hpp>
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
OutputVector translate_unary_gelu(const NodeContext & context) {
num_inputs_check(context, 1, 1);
auto input = context.get_input(0);
auto res = std::make_shared<ov::op::v7::Gelu>(input);
return rename_outputs_with_suffix({res}, context.get_name());
}
} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov
@@ -31,6 +31,7 @@ std::unordered_map<std::string, CreatorFunction> get_supported_ops() {
{"GGML_OP_SOFT_MAX", op::translate_soft_max }, {"GGML_OP_SOFT_MAX", op::translate_soft_max },
{"GGML_OP_SUB", op::translate_1to1_match_2_inputs<v1::Subtract>}, {"GGML_OP_SUB", op::translate_1to1_match_2_inputs<v1::Subtract>},
{"GGML_OP_TRANSPOSE", op::translate_transpose }, {"GGML_OP_TRANSPOSE", op::translate_transpose },
{"GGML_UNARY_OP_GELU", op::translate_unary_gelu },
{"GGML_UNARY_OP_SILU", op::translate_unary_silu }, {"GGML_UNARY_OP_SILU", op::translate_unary_silu },
{"GGML_OP_VIEW", op::translate_view }, {"GGML_OP_VIEW", op::translate_view },
{"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu }, {"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu },
@@ -21,6 +21,7 @@ GGML_OP_CONVERTER(translate_rms_norm);
GGML_OP_CONVERTER(translate_rope); GGML_OP_CONVERTER(translate_rope);
GGML_OP_CONVERTER(translate_scale); GGML_OP_CONVERTER(translate_scale);
GGML_OP_CONVERTER(translate_unary_silu); GGML_OP_CONVERTER(translate_unary_silu);
GGML_OP_CONVERTER(translate_unary_gelu);
GGML_OP_CONVERTER(translate_soft_max); GGML_OP_CONVERTER(translate_soft_max);
GGML_OP_CONVERTER(translate_transpose); GGML_OP_CONVERTER(translate_transpose);
GGML_OP_CONVERTER(translate_view); GGML_OP_CONVERTER(translate_view);
@@ -1,123 +0,0 @@
#include "eliminate_zp.h"
#include <openvino/core/graph_util.hpp>
#include <openvino/core/parallel.hpp>
#include <openvino/core/rt_info.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/subtract.hpp>
#include <openvino/pass/pattern/op/label.hpp>
#include <openvino/pass/pattern/op/pattern.hpp>
#include <openvino/pass/pattern/op/wrap_type.hpp>
namespace ov {
namespace frontend {
namespace ggml {
namespace pass {
EliminateZeroPoints::EliminateZeroPoints() {
// Find pattern:
// (Multiply Any(scale)
// (Subtract (Convert Constant(data)))
// (Convert Constant(zero_point)))
// where zero_point is a scalar
// If data is u4 and zp value is 8 (q4_0), Replace the Subtract with an i4 Constant whose value is data - zp_val
// If data is u8 and zp value is 128 (q8_0) or 32 (q6_k), Replace the Subtract with an i8 Constant
auto m_data_constant = ov::pass::pattern::wrap_type<ov::op::v0::Constant>();
auto m_data_convert = ov::pass::pattern::wrap_type<ov::op::v0::Convert>({m_data_constant});
auto m_zp_constant = ov::pass::pattern::wrap_type<ov::op::v0::Constant>();
auto m_zp_convert = ov::pass::pattern::wrap_type<ov::op::v0::Convert>({m_zp_constant});
auto m_subtract = ov::pass::pattern::wrap_type<ov::op::v1::Subtract>({m_data_convert, m_zp_convert});
auto m_scale = ov::pass::pattern::any_input();
auto m_multiply = ov::pass::pattern::wrap_type<ov::op::v1::Multiply>({m_scale, m_subtract});
const auto callback = [=](ov::pass::pattern::Matcher & m) {
const auto & pattern_map = m.get_pattern_value_map();
auto multiply_node =
std::dynamic_pointer_cast<ov::op::v1::Multiply>(pattern_map.at(m_multiply).get_node_shared_ptr());
auto subtract_node =
std::dynamic_pointer_cast<ov::op::v1::Subtract>(pattern_map.at(m_subtract).get_node_shared_ptr());
auto data_constant =
std::dynamic_pointer_cast<ov::op::v0::Constant>(pattern_map.at(m_data_constant).get_node_shared_ptr());
auto zp_constant =
std::dynamic_pointer_cast<ov::op::v0::Constant>(pattern_map.at(m_zp_constant).get_node_shared_ptr());
if (!multiply_node || !subtract_node || !data_constant || !zp_constant) {
return false;
}
if (ov::shape_size(zp_constant->get_shape()) != 1) {
return false;
}
auto data_type = data_constant->get_element_type();
auto zp_data = zp_constant->cast_vector<int>();
if (zp_data.empty()) {
return false;
}
int zp_value = zp_data[0];
bool should_eliminate = false;
ov::element::Type target_type;
if (data_type == ov::element::u4 && zp_value == 8) {
should_eliminate = true;
target_type = ov::element::i4;
} else if (data_type == ov::element::u8 && (zp_value == 128 || zp_value == 32)) {
should_eliminate = true;
target_type = ov::element::i8;
}
if (!should_eliminate) {
return false;
}
auto data_shape = data_constant->get_shape();
size_t total_elements = ov::shape_size(data_shape);
std::shared_ptr<ov::op::v0::Constant> new_constant;
// TODO improve performance
if (data_type == ov::element::u4) {
auto data_values = data_constant->cast_vector<uint8_t>();
std::vector<int8_t> adjusted_values(total_elements);
ov::parallel_for(total_elements, [&](size_t i) {
adjusted_values[i] = static_cast<int8_t>(static_cast<int>(data_values[i]) - 8);
});
new_constant = std::make_shared<ov::op::v0::Constant>(target_type, data_shape, adjusted_values);
} else if (data_type == ov::element::u8) {
auto data_values = data_constant->cast_vector<uint8_t>();
std::vector<int8_t> adjusted_values(total_elements);
ov::parallel_for(total_elements, [&, zp_value](size_t i) {
adjusted_values[i] = static_cast<int8_t>(static_cast<int>(data_values[i]) - zp_value);
});
new_constant = std::make_shared<ov::op::v0::Constant>(target_type, data_shape, adjusted_values);
}
auto new_convert =
std::make_shared<ov::op::v0::Convert>(new_constant, subtract_node->get_output_element_type(0));
ov::replace_node(subtract_node, new_convert);
return true;
};
register_matcher(
std::make_shared<ov::pass::pattern::Matcher>(m_multiply, "ov::frontend::ggml::pass::EliminateZeroPoints"),
callback);
}
} // namespace pass
} // namespace ggml
} // namespace frontend
} // namespace ov
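Editor's note: this removed pass (superseded by emitting i4/i8 directly for symmetric quantization) relied on the fold being lossless. A toy stand-alone illustration of why, not the pass itself: u4 values span 0..15, so subtracting the q4_0 zero point of 8 lands exactly in the i4 range -8..7; q8_0's u8 payload (0..255, zp 128) and q6_k's (0..63, zp 32) fit i8 the same way.

#include <cstdint>
#include <vector>

// Fold a scalar zero point into unsigned quantized data. The result stays in
// the signed range whenever the quantized values span [0, 2*zp) and zp <= 128.
std::vector<int8_t> fold_zero_point(const std::vector<uint8_t> & data, int zp) {
    std::vector<int8_t> out(data.size());
    for (size_t i = 0; i < data.size(); ++i) {
        out[i] = static_cast<int8_t>(static_cast<int>(data[i]) - zp);
    }
    return out;
}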
@@ -1,17 +0,0 @@
#include "openvino/pass/matcher_pass.hpp"
namespace ov {
namespace frontend {
namespace ggml {
namespace pass {
class EliminateZeroPoints : public ov::pass::MatcherPass {
public:
OPENVINO_MATCHER_PASS_RTTI("ov::frontend::ggml::pass::EliminateZeroPoints")
EliminateZeroPoints();
};
} // namespace pass
} // namespace ggml
} // namespace frontend
} // namespace ov
@@ -0,0 +1,41 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <openvino/core/core_visibility.hpp>
#include <openvino/core/node.hpp>
#include <openvino/core/runtime_attribute.hpp>
namespace ov {
/**
* @brief Holds weightless caching attributes of a single constant.
*
* WeightlessCacheAttribute class represents runtime info attribute that holds
* the values of original size of the constant in bytes and the binary offset of the
* constant's data in the weights file used by the weightless caching mechanism. It's
* not copyable in case the data was changed (the original node was replaced by a new
* one produced during the transformation pipeline) - in that case weightless caching
* can't be used for that constant.
*/
class OPENVINO_API WeightlessCacheAttribute : public RuntimeAttribute {
public:
OPENVINO_RTTI("WeightlessCacheAttribute", "0", RuntimeAttribute)
WeightlessCacheAttribute() = delete;
WeightlessCacheAttribute(size_t original_size, size_t bin_offset, ov::element::Type original_dtype)
: original_size(original_size),
bin_offset(bin_offset),
original_dtype(original_dtype) {}
bool is_copyable() const override;
size_t original_size;
size_t bin_offset;
ov::element::Type original_dtype;
};
} // namespace ov
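Editor's note: a hedged sketch of how a consumer could read the attribute back from a constant's rt_info, mirroring the lookup convention the translate session uses below (and assuming the usual ov::Any semantics of rt_info values):

#include <openvino/op/constant.hpp>

// Returns true and fills `offset` if the constant carries the attribute.
bool try_get_weightless_offset(const std::shared_ptr<ov::op::v0::Constant> & cnst, size_t & offset) {
    const auto & rt_info = cnst->get_rt_info();
    auto it = rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static());
    if (it == rt_info.end()) {
        return false;
    }
    const auto & attr = it->second.as<ov::WeightlessCacheAttribute>();
    offset = attr.bin_offset;  // unique key, not a real file offset (see below)
    return true;
}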
@@ -3,15 +3,16 @@
#include "ggml-openvino/openvino/node_context.h" #include "ggml-openvino/openvino/node_context.h"
#include "ggml-openvino/openvino/utils.h" #include "ggml-openvino/openvino/utils.h"
#include "input_model.h" #include "input_model.h"
#include "pass/eliminate_zp.h"
#include "pass/mark_decompression_convert_constant_folding.h" #include "pass/mark_decompression_convert_constant_folding.h"
#include "pass/squeeze_matmul.h" #include "pass/squeeze_matmul.h"
#include "rt_info/weightless_caching_attributes.hpp"
#include <cstdint> #include <cstdint>
#include <cstdlib> #include <cstdlib>
#include <map> #include <map>
#include <memory> #include <memory>
#include <openvino/core/node.hpp> #include <openvino/core/node.hpp>
#include <openvino/core/preprocess/pre_post_process.hpp>
#include <openvino/op/add.hpp> #include <openvino/op/add.hpp>
#include <openvino/op/broadcast.hpp> #include <openvino/op/broadcast.hpp>
#include <openvino/op/concat.hpp> #include <openvino/op/concat.hpp>
@@ -33,7 +34,6 @@
#include <openvino/op/unsqueeze.hpp> #include <openvino/op/unsqueeze.hpp>
#include <openvino/pass/constant_folding.hpp> #include <openvino/pass/constant_folding.hpp>
#include <openvino/pass/make_stateful.hpp> #include <openvino/pass/make_stateful.hpp>
#include <openvino/core/preprocess/pre_post_process.hpp>
namespace ov { namespace ov {
namespace frontend { namespace frontend {
@@ -240,6 +240,31 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo
resulting_model = std::make_shared<Model>(results, used_params); resulting_model = std::make_shared<Model>(results, used_params);
apply_transformations(resulting_model); apply_transformations(resulting_model);
// Set WeightlessCacheAttribute on large constants to avoid unnecessary memory copies
// in the NPUW plugin. Without this attribute, NPUW's LazyTensor constructor
// (lazy_tensor.cpp, op::Const::Const) will memcpy every constant "in case export
// occurs", doubling memory usage per compile_model call.
//
// The bin_offset field serves as a unique key (not a real file offset) — this is
// the same convention the GPU plugin uses for non-IR models (see
// Plugin::set_weightless_cache_attributes in intel_gpu/src/plugin/plugin.cpp).
// Each constant must have a distinct bin_offset, otherwise GPU's weightless cache
// import will map multiple constants to the same data.
//
// Small constants (< 16 elements) are excluded since they may be introduced by
// optimization patterns and the overhead is negligible.
size_t offset = 0;
for (auto & node : resulting_model->get_ordered_ops()) {
if (auto cnst = ov::as_type_ptr<ov::op::v0::Constant>(node);
cnst && cnst->get_byte_size() / cnst->get_element_type().size() >= 16) {
auto & rt_info = cnst->get_rt_info();
if (rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static()) == rt_info.end()) {
rt_info[ov::WeightlessCacheAttribute::get_type_info_static()] =
ov::WeightlessCacheAttribute(cnst->get_byte_size(), offset++, cnst->get_element_type());
}
}
}
return resulting_model;
}
@@ -257,7 +282,6 @@ std::shared_ptr<Model> TranslateSession::apply_transformations(std::shared_ptr<M
}
if (ggml_model_decoder->is_static()) {
manager.register_pass<pass::EliminateZeroPoints>();
manager.register_pass<pass::SqueezeMatmul>();
}
manager.run_passes(model);
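Editor's note: for orientation, the ov::pass::Manager pattern used in apply_transformations is register-then-run; passes execute in registration order over the whole model. A minimal sketch with a stock OpenVINO pass (the frontend-specific passes above follow the same shape):

#include <openvino/pass/constant_folding.hpp>
#include <openvino/pass/manager.hpp>

void run_minimal_pipeline(const std::shared_ptr<ov::Model> & model) {
    ov::pass::Manager manager;
    manager.register_pass<ov::pass::ConstantFolding>();  // stock pass, for illustration
    manager.run_passes(model);  // executes registered passes in order
}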
+38 -7
@@ -2,6 +2,7 @@
#include "ggml-impl.h" #include "ggml-impl.h"
#include <cmath>
#include <cstddef> #include <cstddef>
#include <ctime> #include <ctime>
#include <memory> #include <memory>
@@ -13,6 +14,7 @@
#include <openvino/op/gather.hpp>
#include <openvino/op/maximum.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/shape_of.hpp>
#include <openvino/op/sin.hpp>
#include <openvino/op/squeeze.hpp>
@@ -87,8 +89,11 @@ ov::Output<ov::Node> rope_yarn_ramp_mix(int n_dims, const float corr_dims[2], fl
auto ramp_y =
std::make_shared<ov::op::v1::Divide>(std::make_shared<ov::op::v1::Subtract>(dim_ids, corr_low), denom);
auto ramp_clamped = std::make_shared<ov::op::v0::Clamp>(ramp_y, 0.0f, 1.0f);
// rope_yarn_ramp returns (1 - clamp(y)), so invert before scaling
auto one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1, 1}, {1.0f});
auto ramp_inverted = std::make_shared<ov::op::v1::Subtract>(one, ramp_clamped);
auto ext_factor_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {ext_factor});
- auto ramp_mix = std::make_shared<ov::op::v1::Multiply>(ramp_clamped, ext_factor_node);
+ auto ramp_mix = std::make_shared<ov::op::v1::Multiply>(ramp_inverted, ext_factor_node);
return ramp_mix;
}
@@ -115,6 +120,7 @@ void ggml_rope_yarn_corr_dims(int n_dims,
std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params,
std::shared_ptr<ov::Node> inp_pos,
std::shared_ptr<ov::Node> rope_freqs_weight,
bool imrope,
bool stateful) {
if (stateful) {
inp_pos = std::make_shared<ov::op::v0::Squeeze>(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
@@ -122,6 +128,13 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
auto pos_perm =
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{3}, std::vector<int64_t>{2, 1, 0});
inp_pos = std::make_shared<ov::op::v1::Transpose>(inp_pos, pos_perm);
} else if (imrope) {
inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
auto pos_shape = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{5}, {0, 0, 0, 4, -1});
inp_pos = std::make_shared<ov::op::v1::Reshape>(inp_pos, pos_shape, true);
auto pos_transpose_shape =
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{5}, std::vector<int64_t>{0, 1, 2, 4, 3});
inp_pos = std::make_shared<ov::op::v1::Transpose>(inp_pos, pos_transpose_shape);
} else {
inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
auto pos_perm =
@@ -136,6 +149,7 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
float beta_fast;
float beta_slow;
const int n_dims = rope_params[1];
const size_t n_dims_half = n_dims >> 1;
const int n_ctx_orig = rope_params[4];
memcpy(&freq_base, rope_params + 5, sizeof(float));
memcpy(&freq_scale, rope_params + 6, sizeof(float));
@@ -146,16 +160,31 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
const float theta_scale = powf(freq_base, -2.0f / n_dims);
std::vector<float> factor(n_dims_half);
Output<Node> freq_factors;
Output<Node> theta;
float mscale = attn_factor;
if (imrope) {
std::vector<int64_t> gather_indices(n_dims_half);
for (size_t j = 0; j < n_dims_half; j++) {
gather_indices[j] = j % 3;
factor[j] = std::pow(theta_scale, j);
}
auto gather_indices_const =
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{n_dims_half}, gather_indices);
auto gather_axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {4});
inp_pos = std::make_shared<ov::op::v8::Gather>(inp_pos, gather_indices_const, gather_axis);
auto factor_const = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{n_dims_half}, factor);
theta = std::make_shared<ov::op::v1::Multiply>(inp_pos, factor_const);
} else {
float corr_dims[2];
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
std::vector<float> factor(n_dims / 2);
factor[0] = 1.0f;
for (size_t i = 1; i < factor.size(); i++) {
factor[i] = theta_scale * factor[i - 1];
}
Output<Node> freq_factors;
if (stateful) {
freq_factors =
std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor);
@@ -171,8 +200,6 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
auto theta_interp = std::make_shared<ov::op::v1::Multiply>(
theta_extrap, ov::op::v0::Constant::create(ov::element::f32, {1}, {freq_scale}));
Output<Node> theta;
float mscale = attn_factor;
if (ext_factor == 0.0f) {
theta = theta_interp;
} else {
@@ -189,14 +216,18 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
std::make_shared<ov::op::v1::Multiply>(theta_extrap, ramp_mix));
mscale *= (1.0f + 0.1f * std::log(1.0f / freq_scale));
}
}
Output<Node> cos_theta = std::make_shared<ov::op::v0::Cos>(theta);
Output<Node> sin_theta = std::make_shared<ov::op::v0::Sin>(theta);
if (!imrope) {
auto mscale_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {mscale});
cos_theta = std::make_shared<ov::op::v1::Multiply>(cos_theta, mscale_node);
sin_theta = std::make_shared<ov::op::v1::Multiply>(sin_theta, mscale_node);
}
return std::make_pair(sin_theta, cos_theta);
}
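Editor's note: the ramp fix above matches ggml's scalar rope_yarn_ramp, which returns 1 - clamp(y) rather than clamp(y); the previous graph code skipped the inversion. A scalar sketch of the corrected mix weight (variable names illustrative):

#include <algorithm>

// YaRN mix weight per rotary dimension: ext_factor below corr_low, 0 above
// corr_high, linear in between. Mirrors ggml's rope_yarn_ramp * ext_factor.
float yarn_ramp_mix(float dim_id, float corr_low, float corr_high, float ext_factor) {
    const float denom = std::max(0.001f, corr_high - corr_low);
    const float y = (dim_id - corr_low) / denom;
    return (1.0f - std::clamp(y, 0.0f, 1.0f)) * ext_factor;  // the (1 - ...) was missing
}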
+1
@@ -67,6 +67,7 @@ OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::
std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t* rope_params,
std::shared_ptr<ov::Node> inp_pos,
std::shared_ptr<ov::Node> rope_freqs_weight = nullptr,
bool imrope = false,
bool stateful = false);
ov::Output<ov::Node> process_view_input(const NodeContext& context, int input_index, int slice_len = 0);
+94 -37
@@ -81,8 +81,8 @@ ov::Tensor create_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<ov_runtime_context> r_ctx) {
auto & core = ov_singleton_core();
const auto & config = ggml_openvino_get_compile_config();
- auto device = r_ctx->device;
+ const auto & device = r_ctx->device;
- bool stateful = r_ctx->stateful;
+ const auto & stateful = r_ctx->stateful;
static auto is_static = false;
if (is_naive(cgraph)) {
@@ -106,14 +106,26 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
int64_t infer_end_time;
{
- std::lock_guard<std::mutex> lock(r_ctx->ov_compute_mutex);
- auto it = r_ctx->decoder_cache.find(key);
- cache_hit = it != r_ctx->decoder_cache.end();
+ std::shared_ptr<decoder_runtime_ctx> entry;
ModelParams old_m_params;
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
auto it = r_ctx->decoder_cache.find(key);
cache_hit = it != r_ctx->decoder_cache.end();
if (cache_hit) {
- ggml_decoder = it->second;
+ entry = it->second;
} else {
auto mutex = std::make_shared<std::mutex>();
entry = std::make_shared<decoder_runtime_ctx>(mutex);
r_ctx->decoder_cache[key] = entry;
}
}
std::lock_guard<std::mutex> lock(*(entry->mutex));
if (cache_hit) {
ggml_decoder = entry->ptr;
old_m_params = ggml_decoder->get_model_params();
cache_hit = old_m_params.can_reuse_dynamically(m_params);
}
@@ -126,7 +138,10 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
ggml_decoder->update_io(cgraph);
}
ggml_decoder->add_extra_inputs();
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
infer_request = r_ctx->infer_request_cache.at(key);
}
if (stateful) {
const auto * inp_pos = get_inp_pos_tensor(cgraph);
@@ -170,7 +185,10 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
conversion_end_time = decoder_end_time;
compile_end_time = decoder_end_time;
} else {
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
r_ctx->infer_request_cache.erase(key);
}
std::shared_ptr<ov::Model> model;
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
@@ -199,8 +217,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
}
compile_end_time = ggml_time_us();
infer_request = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
- r_ctx->infer_request_cache[key] = infer_request;
- r_ctx->decoder_cache[key] = ggml_decoder;
+ entry->ptr = ggml_decoder;
std::vector<std::string> ov_input_names;
std::vector<std::string> ov_output_names;
@@ -210,8 +227,13 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
for (const auto & ov_output : model->get_results()) {
ov_output_names.push_back(ov_output->get_friendly_name());
}
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
r_ctx->infer_request_cache[key] = infer_request;
r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
}
if (stateful) {
const auto * inp_pos = get_inp_pos_tensor(cgraph);
@@ -224,8 +246,13 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
}
}
- auto ov_input_names = r_ctx->ov_input_names_cache[key];
- auto ov_output_names = r_ctx->ov_output_names_cache[key];
+ std::vector<std::string> ov_input_names;
+ std::vector<std::string> ov_output_names;
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
ov_input_names = r_ctx->ov_input_names_cache[key];
ov_output_names = r_ctx->ov_output_names_cache[key];
}
for (size_t i = 0; i < ov_input_names.size(); i++) {
auto param_name = ov_input_names[i];
@@ -306,12 +333,26 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
int64_t compile_end_time;
int64_t infer_end_time;
- auto it = r_ctx->decoder_cache.find(key);
- cache_hit = it != r_ctx->decoder_cache.end();
+ std::shared_ptr<decoder_runtime_ctx> entry;
ModelParams old_m_params;
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
auto it = r_ctx->decoder_cache.find(key);
cache_hit = it != r_ctx->decoder_cache.end();
if (cache_hit) {
- ggml_decoder = it->second;
+ entry = it->second;
} else {
auto mutex = std::make_shared<std::mutex>();
entry = std::make_shared<decoder_runtime_ctx>(mutex);
r_ctx->decoder_cache[key] = entry;
}
}
std::lock_guard<std::mutex> lock(*(entry->mutex));
if (cache_hit) {
ggml_decoder = entry->ptr;
old_m_params = ggml_decoder->get_model_params();
cache_hit = old_m_params.can_reuse_statically(m_params);
}
@@ -325,14 +366,21 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
ggml_decoder->update_io(cgraph);
}
ggml_decoder->add_extra_inputs();
- infer_request = is_prefill ? r_ctx->infer_request_cache_prefill.at(key) : r_ctx->infer_request_cache.at(key);
+ {
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
infer_request =
is_prefill ? r_ctx->infer_request_cache_prefill.at(key) : r_ctx->infer_request_cache.at(key);
}
decoder_end_time = ggml_time_us();
conversion_end_time = decoder_end_time;
compile_end_time = decoder_end_time;
} else {
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
r_ctx->infer_request_cache.erase(key);
r_ctx->infer_request_cache_prefill.erase(key);
}
std::shared_ptr<ov::Model> model;
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
@@ -372,16 +420,14 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
compiled_model_decode = core.compile_model(model_decode, device, config);
}
- r_ctx->infer_request_cache_prefill[key] =
- std::make_shared<ov::InferRequest>(compiled_model_prefill.create_infer_request());
- r_ctx->infer_request_cache[key] =
- std::make_shared<ov::InferRequest>(compiled_model_decode.create_infer_request());
+ auto infer_request_prefill = std::make_shared<ov::InferRequest>(compiled_model_prefill.create_infer_request());
+ auto infer_request_decode = std::make_shared<ov::InferRequest>(compiled_model_decode.create_infer_request());
compile_end_time = ggml_time_us();
model = is_prefill ? model_prefill : model_decode;
ggml_decoder = is_prefill ? ggml_decoder_prefill : ggml_decoder_decode;
- infer_request = is_prefill ? r_ctx->infer_request_cache_prefill[key] : r_ctx->infer_request_cache[key];
+ infer_request = is_prefill ? infer_request_prefill : infer_request_decode;
- r_ctx->decoder_cache[key] = ggml_decoder;
+ entry->ptr = ggml_decoder;
std::vector<std::string> ov_input_names;
std::vector<std::string> ov_output_names;
@@ -391,18 +437,29 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
for (const auto & ov_output : model->get_results()) {
ov_output_names.push_back(ov_output->get_friendly_name());
}
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
r_ctx->infer_request_cache_prefill[key] = infer_request_prefill;
r_ctx->infer_request_cache[key] = infer_request_decode;
r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
}
}
- auto ov_input_names = r_ctx->ov_input_names_cache[key];
- auto ov_output_names = r_ctx->ov_output_names_cache[key];
+ std::vector<std::string> ov_input_names_local;
+ std::vector<std::string> ov_output_names_local;
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
ov_input_names_local = r_ctx->ov_input_names_cache[key];
ov_output_names_local = r_ctx->ov_output_names_cache[key];
}
if (is_prefill) {
auto inp_len = inp_pos->ne[0];
for (int chunk_index = 0; chunk_index * prefill_chunk_size < inp_len; chunk_index++) {
- for (size_t i = 0; i < ov_input_names.size(); i++) {
+ for (size_t i = 0; i < ov_input_names_local.size(); i++) {
- auto param_name = ov_input_names[i];
+ auto param_name = ov_input_names_local[i];
auto input_tensor = get_ov_input_tensor_static_prefill(ggml_decoder, param_name, chunk_index);
infer_request->set_input_tensor(i, input_tensor);
@@ -412,8 +469,8 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
}
}
- for (size_t i = 0; i < ov_output_names.size(); i++) {
+ for (size_t i = 0; i < ov_output_names_local.size(); i++) {
- auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]);
+ auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names_local[i]);
auto output_tensor = create_ov_output_tensor(ggml_decoder, infer_request, i, ggml_tensor);
infer_request->set_output_tensor(i, output_tensor);
}
@@ -421,16 +478,16 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
infer_request->infer();
if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
- for (size_t i = 0; i < ov_output_names.size(); i++) {
+ for (size_t i = 0; i < ov_output_names_local.size(); i++) {
const auto output_tensor = infer_request->get_output_tensor(i);
- print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
+ print_output_tensor_info(ov_output_names_local[i], output_tensor, output_tensor.data());
}
}
}
infer_end_time = ggml_time_us();
} else {
- for (size_t i = 0; i < ov_input_names.size(); i++) {
+ for (size_t i = 0; i < ov_input_names_local.size(); i++) {
- auto param_name = ov_input_names[i];
+ auto param_name = ov_input_names_local[i];
auto input_tensor = get_ov_input_tensor_static_decode(ggml_decoder, param_name);
infer_request->set_input_tensor(i, input_tensor);
@@ -440,8 +497,8 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
}
}
- for (size_t i = 0; i < ov_output_names.size(); i++) {
+ for (size_t i = 0; i < ov_output_names_local.size(); i++) {
- auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]);
+ auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names_local[i]);
auto output_tensor = create_ov_output_tensor(ggml_decoder, infer_request, i, ggml_tensor);
infer_request->set_output_tensor(i, output_tensor);
}
@@ -450,9 +507,9 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
infer_end_time = ggml_time_us();
if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
- for (size_t i = 0; i < ov_output_names.size(); i++) {
+ for (size_t i = 0; i < ov_output_names_local.size(); i++) {
const auto output_tensor = infer_request->get_output_tensor(i);
- print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
+ print_output_tensor_info(ov_output_names_local[i], output_tensor, output_tensor.data());
}
}
}
+23 -3
@@ -3,12 +3,15 @@
#include "ggml-impl.h" #include "ggml-impl.h"
#include <algorithm> #include <algorithm>
#include <atomic>
#include <cstddef> #include <cstddef>
#include <memory> #include <memory>
#include <mutex>
#include <openvino/runtime/core.hpp> #include <openvino/runtime/core.hpp>
#include <openvino/runtime/infer_request.hpp> #include <openvino/runtime/infer_request.hpp>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <utility>
#include <vector> #include <vector>
struct graph_key { struct graph_key {
@@ -40,11 +43,17 @@ struct graph_key_hash {
}
};
struct decoder_runtime_ctx {
decoder_runtime_ctx(std::shared_ptr<std::mutex> mutex) : mutex(std::move(mutex)) {}
std::shared_ptr<std::mutex> mutex;
std::shared_ptr<GgmlOvDecoder> ptr;
};
struct ov_runtime_context {
- std::mutex ov_compute_mutex;
+ mutable std::mutex ctx_mutex;
std::string device;
bool stateful;
- std::unordered_map<graph_key, std::shared_ptr<GgmlOvDecoder>, graph_key_hash> decoder_cache;
+ std::unordered_map<graph_key, std::shared_ptr<decoder_runtime_ctx>, graph_key_hash> decoder_cache;
std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache;
std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache_prefill;
std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_input_names_cache;
@@ -53,11 +62,22 @@ struct ov_runtime_context {
// Simultaneous stateful inference request support to be added.
size_t stateful_kv_size;
std::map<std::string, std::string> kv_state_input_name_map;
std::atomic<int> backend_count;
ov_runtime_context() :
device("CPU"),
stateful(false),
- stateful_kv_size(0) {}
+ stateful_kv_size(0),
backend_count(0) {}
void clear_caches() {
std::lock_guard<std::mutex> lock(ctx_mutex);
decoder_cache.clear();
infer_request_cache.clear();
infer_request_cache_prefill.clear();
ov_input_names_cache.clear();
ov_output_names_cache.clear();
}
};
enum ggml_status ov_graph_compute(struct ggml_cgraph * cgraph, ggml_backend_t backend);
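Editor's note: ov_runtime_context encodes a two-level locking scheme: ctx_mutex guards only the cache maps, while each decoder_runtime_ctx carries its own mutex that serializes the expensive compile/infer path per graph key. A distilled sketch of the lookup step the compute functions above follow (illustrative helper, not part of the header):

#include <memory>
#include <mutex>

std::shared_ptr<decoder_runtime_ctx> get_or_create_entry(ov_runtime_context & r_ctx, const graph_key & key) {
    std::lock_guard<std::mutex> map_lock(r_ctx.ctx_mutex);  // short critical section
    auto it = r_ctx.decoder_cache.find(key);
    if (it != r_ctx.decoder_cache.end()) {
        return it->second;
    }
    auto entry = std::make_shared<decoder_runtime_ctx>(std::make_shared<std::mutex>());
    r_ctx.decoder_cache[key] = entry;
    return entry;
}
// The caller then holds std::lock_guard<std::mutex>(*entry->mutex) for the long
// compile/infer work, so unrelated graphs are not serialized against each other.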