openvino: driver setup, CI split, thread safety, and NPU optimizations (#21944)

* Thread safety per request only

* Fix RoPE YaRN case

* Fix sticky stateful config

* Use i4/i8 directly for symmetric quant

* Use weightless caching

* Add WeightlessCacheAttribute to reduce NPU memory usage

* GELU tanh support (#125)

* Imrope support (#126)

* fix(openvino): explicit ov::Tensor frees in ggml_backend_openvino_free

* Add GPU, NPU support in OV Dockerfile

* Add build-openvino.yml CI

* Fix sticky stateful config

* Add concurrency to ov-gpu CI runs; move OV CI to build-openvino.yml

* Fix thread safety of shared runtime context (see the locking sketch after this list)

* RoPE type abstraction for frontend translations

* Fix editorconfig
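
Note on the thread-safety items above: the diff below replaces the single ov_compute_mutex with two levels of locking, a short-lived r_ctx->ctx_mutex guarding the cache maps plus a per-entry mutex inside each decoder_runtime_ctx that serializes work on one cached graph only. What follows is a minimal standalone sketch of that pattern, not the commit's code; only the names decoder_runtime_ctx, ctx_mutex, and decoder_cache come from the diff, and the surrounding types are simplified stand-ins.

    #include <map>
    #include <memory>
    #include <mutex>
    #include <string>

    struct GgmlOvDecoder {};  // simplified stand-in for the real decoder type

    // Per-graph cache entry: its mutex serializes compute for this entry only,
    // so requests that map to different graphs can run concurrently.
    struct decoder_runtime_ctx {
        explicit decoder_runtime_ctx(std::shared_ptr<std::mutex> m) : mutex(std::move(m)) {}
        std::shared_ptr<std::mutex> mutex;
        std::shared_ptr<GgmlOvDecoder> ptr;
    };

    struct ov_runtime_context {
        std::mutex ctx_mutex;  // guards the cache maps; held only for find/insert
        std::map<std::string, std::shared_ptr<decoder_runtime_ctx>> decoder_cache;
    };

    void compute(ov_runtime_context & r_ctx, const std::string & key) {
        std::shared_ptr<decoder_runtime_ctx> entry;
        {
            // Level 1: short map-level lock to find-or-insert the entry.
            std::lock_guard<std::mutex> map_lock(r_ctx.ctx_mutex);
            auto it = r_ctx.decoder_cache.find(key);
            if (it != r_ctx.decoder_cache.end()) {
                entry = it->second;
            } else {
                entry = std::make_shared<decoder_runtime_ctx>(std::make_shared<std::mutex>());
                r_ctx.decoder_cache[key] = entry;
            }
        }
        // Level 2: long per-entry lock around the expensive
        // convert/compile/infer work, so a slow compile for one graph
        // never blocks requests that hit other entries.
        std::lock_guard<std::mutex> lock(*entry->mutex);
        // ... per-graph work would go here ...
    }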

---------

Co-authored-by: Mustafa Cavus <mustafa.cavus@intel.com>
Co-authored-by: Dan Hoffman <dhoff749@gmail.com>
Co-authored-by: Ravi Panchumarthy <ravi.panchumarthy@intel.com>
Zijun Yu authored on 2026-04-21 23:58:34 +08:00, committed by GitHub
parent 606fa42f5d
commit 52f1096f21

21 changed files with 823 additions and 544 deletions
@@ -81,8 +81,8 @@ ov::Tensor create_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
 enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<ov_runtime_context> r_ctx) {
     auto & core = ov_singleton_core();
     const auto & config = ggml_openvino_get_compile_config();
-    auto device = r_ctx->device;
-    bool stateful = r_ctx->stateful;
+    const auto & device = r_ctx->device;
+    const auto & stateful = r_ctx->stateful;
     static auto is_static = false;
     if (is_naive(cgraph)) {
@@ -106,14 +106,26 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
     int64_t infer_end_time;
     {
-        std::lock_guard<std::mutex> lock(r_ctx->ov_compute_mutex);
-        auto it = r_ctx->decoder_cache.find(key);
-        cache_hit = it != r_ctx->decoder_cache.end();
+        std::shared_ptr<decoder_runtime_ctx> entry;
+        ModelParams old_m_params;
+        {
+            std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
+            auto it = r_ctx->decoder_cache.find(key);
+            cache_hit = it != r_ctx->decoder_cache.end();
+            if (cache_hit) {
+                entry = it->second;
+            } else {
+                auto mutex = std::make_shared<std::mutex>();
+                entry = std::make_shared<decoder_runtime_ctx>(mutex);
+                r_ctx->decoder_cache[key] = entry;
+            }
+        }
+        std::lock_guard<std::mutex> lock(*(entry->mutex));
         if (cache_hit) {
-            ggml_decoder = it->second;
+            ggml_decoder = entry->ptr;
             old_m_params = ggml_decoder->get_model_params();
             cache_hit = old_m_params.can_reuse_dynamically(m_params);
         }
@@ -126,7 +138,10 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
                 ggml_decoder->update_io(cgraph);
             }
             ggml_decoder->add_extra_inputs();
-            infer_request = r_ctx->infer_request_cache.at(key);
+            {
+                std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
+                infer_request = r_ctx->infer_request_cache.at(key);
+            }
             if (stateful) {
                 const auto * inp_pos = get_inp_pos_tensor(cgraph);
@@ -170,7 +185,10 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
             conversion_end_time = decoder_end_time;
             compile_end_time = decoder_end_time;
         } else {
-            r_ctx->infer_request_cache.erase(key);
+            {
+                std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
+                r_ctx->infer_request_cache.erase(key);
+            }
             std::shared_ptr<ov::Model> model;
             auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
@@ -199,8 +217,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
             }
             compile_end_time = ggml_time_us();
             infer_request = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
-            r_ctx->infer_request_cache[key] = infer_request;
-            r_ctx->decoder_cache[key] = ggml_decoder;
+            entry->ptr = ggml_decoder;
             std::vector<std::string> ov_input_names;
             std::vector<std::string> ov_output_names;
@@ -210,8 +227,13 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
             for (const auto & ov_output : model->get_results()) {
                 ov_output_names.push_back(ov_output->get_friendly_name());
             }
-            r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
-            r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
+            {
+                std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
+                r_ctx->infer_request_cache[key] = infer_request;
+                r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
+                r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
+            }
             if (stateful) {
                 const auto * inp_pos = get_inp_pos_tensor(cgraph);
@@ -224,8 +246,13 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
         }
     }
-    auto ov_input_names = r_ctx->ov_input_names_cache[key];
-    auto ov_output_names = r_ctx->ov_output_names_cache[key];
+    std::vector<std::string> ov_input_names;
+    std::vector<std::string> ov_output_names;
+    {
+        std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
+        ov_input_names = r_ctx->ov_input_names_cache[key];
+        ov_output_names = r_ctx->ov_output_names_cache[key];
+    }
     for (size_t i = 0; i < ov_input_names.size(); i++) {
         auto param_name = ov_input_names[i];
@@ -306,12 +333,26 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
     int64_t compile_end_time;
     int64_t infer_end_time;
-    auto it = r_ctx->decoder_cache.find(key);
-    cache_hit = it != r_ctx->decoder_cache.end();
+    std::shared_ptr<decoder_runtime_ctx> entry;
+    ModelParams old_m_params;
+    {
+        std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
+        auto it = r_ctx->decoder_cache.find(key);
+        cache_hit = it != r_ctx->decoder_cache.end();
+        if (cache_hit) {
+            entry = it->second;
+        } else {
+            auto mutex = std::make_shared<std::mutex>();
+            entry = std::make_shared<decoder_runtime_ctx>(mutex);
+            r_ctx->decoder_cache[key] = entry;
+        }
+    }
+    std::lock_guard<std::mutex> lock(*(entry->mutex));
     if (cache_hit) {
-        ggml_decoder = it->second;
+        ggml_decoder = entry->ptr;
         old_m_params = ggml_decoder->get_model_params();
         cache_hit = old_m_params.can_reuse_statically(m_params);
     }
@@ -325,14 +366,21 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
            ggml_decoder->update_io(cgraph);
        }
        ggml_decoder->add_extra_inputs();
-        infer_request = is_prefill ? r_ctx->infer_request_cache_prefill.at(key) : r_ctx->infer_request_cache.at(key);
+        {
+            std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
+            infer_request =
+                is_prefill ? r_ctx->infer_request_cache_prefill.at(key) : r_ctx->infer_request_cache.at(key);
+        }
        decoder_end_time = ggml_time_us();
        conversion_end_time = decoder_end_time;
        compile_end_time = decoder_end_time;
    } else {
-        r_ctx->infer_request_cache.erase(key);
-        r_ctx->infer_request_cache_prefill.erase(key);
+        {
+            std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
+            r_ctx->infer_request_cache.erase(key);
+            r_ctx->infer_request_cache_prefill.erase(key);
+        }
        std::shared_ptr<ov::Model> model;
        auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
@@ -372,16 +420,14 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
            compiled_model_decode = core.compile_model(model_decode, device, config);
        }
-        r_ctx->infer_request_cache_prefill[key] =
-            std::make_shared<ov::InferRequest>(compiled_model_prefill.create_infer_request());
-        r_ctx->infer_request_cache[key] =
-            std::make_shared<ov::InferRequest>(compiled_model_decode.create_infer_request());
+        auto infer_request_prefill = std::make_shared<ov::InferRequest>(compiled_model_prefill.create_infer_request());
+        auto infer_request_decode = std::make_shared<ov::InferRequest>(compiled_model_decode.create_infer_request());
        compile_end_time = ggml_time_us();
        model = is_prefill ? model_prefill : model_decode;
        ggml_decoder = is_prefill ? ggml_decoder_prefill : ggml_decoder_decode;
-        infer_request = is_prefill ? r_ctx->infer_request_cache_prefill[key] : r_ctx->infer_request_cache[key];
-        r_ctx->decoder_cache[key] = ggml_decoder;
+        infer_request = is_prefill ? infer_request_prefill : infer_request_decode;
+        entry->ptr = ggml_decoder;
        std::vector<std::string> ov_input_names;
        std::vector<std::string> ov_output_names;
@@ -391,18 +437,29 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
        for (const auto & ov_output : model->get_results()) {
            ov_output_names.push_back(ov_output->get_friendly_name());
        }
-        r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
-        r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
+        {
+            std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
+            r_ctx->infer_request_cache_prefill[key] = infer_request_prefill;
+            r_ctx->infer_request_cache[key] = infer_request_decode;
+            r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
+            r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
+        }
    }
-    auto ov_input_names = r_ctx->ov_input_names_cache[key];
-    auto ov_output_names = r_ctx->ov_output_names_cache[key];
+    std::vector<std::string> ov_input_names_local;
+    std::vector<std::string> ov_output_names_local;
+    {
+        std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
+        ov_input_names_local = r_ctx->ov_input_names_cache[key];
+        ov_output_names_local = r_ctx->ov_output_names_cache[key];
+    }
    if (is_prefill) {
        auto inp_len = inp_pos->ne[0];
        for (int chunk_index = 0; chunk_index * prefill_chunk_size < inp_len; chunk_index++) {
-            for (size_t i = 0; i < ov_input_names.size(); i++) {
-                auto param_name = ov_input_names[i];
+            for (size_t i = 0; i < ov_input_names_local.size(); i++) {
+                auto param_name = ov_input_names_local[i];
                auto input_tensor = get_ov_input_tensor_static_prefill(ggml_decoder, param_name, chunk_index);
                infer_request->set_input_tensor(i, input_tensor);
@@ -412,8 +469,8 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
                }
            }
-            for (size_t i = 0; i < ov_output_names.size(); i++) {
-                auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]);
+            for (size_t i = 0; i < ov_output_names_local.size(); i++) {
+                auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names_local[i]);
                auto output_tensor = create_ov_output_tensor(ggml_decoder, infer_request, i, ggml_tensor);
                infer_request->set_output_tensor(i, output_tensor);
            }
@@ -421,16 +478,16 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
            infer_request->infer();
            if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
-                for (size_t i = 0; i < ov_output_names.size(); i++) {
+                for (size_t i = 0; i < ov_output_names_local.size(); i++) {
                    const auto output_tensor = infer_request->get_output_tensor(i);
-                    print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
+                    print_output_tensor_info(ov_output_names_local[i], output_tensor, output_tensor.data());
                }
            }
        }
        infer_end_time = ggml_time_us();
    } else {
-        for (size_t i = 0; i < ov_input_names.size(); i++) {
-            auto param_name = ov_input_names[i];
+        for (size_t i = 0; i < ov_input_names_local.size(); i++) {
+            auto param_name = ov_input_names_local[i];
            auto input_tensor = get_ov_input_tensor_static_decode(ggml_decoder, param_name);
            infer_request->set_input_tensor(i, input_tensor);
@@ -440,8 +497,8 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
            }
        }
-        for (size_t i = 0; i < ov_output_names.size(); i++) {
-            auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]);
+        for (size_t i = 0; i < ov_output_names_local.size(); i++) {
+            auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names_local[i]);
            auto output_tensor = create_ov_output_tensor(ggml_decoder, infer_request, i, ggml_tensor);
            infer_request->set_output_tensor(i, output_tensor);
        }
@@ -450,9 +507,9 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
        infer_end_time = ggml_time_us();
        if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
-            for (size_t i = 0; i < ov_output_names.size(); i++) {
+            for (size_t i = 0; i < ov_output_names_local.size(); i++) {
                const auto output_tensor = infer_request->get_output_tensor(i);
-                print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
+                print_output_tensor_info(ov_output_names_local[i], output_tensor, output_tensor.data());
            }
        }
    }
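
A closing note on the pattern both functions now follow: expensive objects (infer requests, name lists) are built into locals with no lock held, published into the shared caches under a brief ctx_mutex section, and read back by copying under that same lock so iteration never holds it. A condensed standalone sketch, under the same simplified assumptions as the earlier one rather than the commit's actual types:

    #include <map>
    #include <mutex>
    #include <string>
    #include <vector>

    std::mutex ctx_mutex;  // stand-in for r_ctx->ctx_mutex
    std::map<std::string, std::vector<std::string>> ov_input_names_cache;

    // Publish: the expensive work that produced `names` happened before this
    // point, lock-free; only the map insertion is serialized.
    void publish(const std::string & key, std::vector<std::string> names) {
        std::lock_guard<std::mutex> lock(ctx_mutex);
        ov_input_names_cache[key] = std::move(names);
    }

    // Read: copy out under the lock, then iterate the copy without holding it,
    // mirroring the ov_input_names_local copies in the diff above.
    std::vector<std::string> snapshot(const std::string & key) {
        std::lock_guard<std::mutex> lock(ctx_mutex);
        return ov_input_names_cache[key];
    }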