openvino: driver setup, CI split, thread safety, and NPU optimizations (#21944)
* Thread safety per request only
* Fix ROPE yarn case
* Fix sticky stateful config
* Use i4/i8 directly for symmetric quant
* Use weightless caching
* Add WeightlessCacheAttribute to reduce NPU memory usage
* Gelu tanh support (#125)
* Imrope support (#126)
* fix(openvino): explicit ov::Tensor frees in ggml_backend_openvino_free
* add GPU,NPU support in OV Dockerfile
* add build-openvino.yml ci
* Fix sticky stateful config
* add concurrency to ov-gpu ci runs. Move OV CI to build-openvino.yml
* fix thread-safety of shared runtime context
* rope type abstraction for frontend translations
* fix editorconfig

---------

Co-authored-by: Mustafa Cavus <mustafa.cavus@intel.com>
Co-authored-by: Dan Hoffman <dhoff749@gmail.com>
Co-authored-by: Ravi Panchumarthy <ravi.panchumarthy@intel.com>
This commit is contained in:
@@ -19,7 +19,6 @@
|
||||
#include <iomanip>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <openvino/core/dimension.hpp>
|
||||
#include <openvino/core/except.hpp>
|
||||
#include <openvino/core/node.hpp>
|
||||
@@ -207,8 +206,22 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
|
||||
break;
|
||||
}
|
||||
case GGML_OP_ROPE: {
|
||||
const int mode = node->op_params[2];
|
||||
switch (mode) {
|
||||
case GGML_ROPE_TYPE_NEOX: {
|
||||
op_case = 0x00010000;
|
||||
break;
|
||||
}
|
||||
case GGML_ROPE_TYPE_IMROPE: {
|
||||
op_case = 0x00020000;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
op_case = 0x00000000;
|
||||
break;
|
||||
}
|
||||
if (node->src[0]->op == GGML_OP_VIEW) {
|
||||
op_case = 2;
|
||||
op_case = (op_case | 0x00000002);
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -573,9 +586,6 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
|
||||
}
|
||||
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) {
|
||||
static std::mutex weights_mutex;
|
||||
std::lock_guard<std::mutex> lock(weights_mutex);
|
||||
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
|
||||
auto * nodes = cgraph->nodes;
|
||||
auto n_nodes = cgraph->n_nodes;
|
||||
|
||||
Reference in New Issue
Block a user