model : full modern bert support (#18330)
* full modern bert support
* added gelu op in rank pooling for modern bert
* still working on stuff, added mean calculation before classifier head
* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* first layer is dense, as per modern bert research paper
* Update src/llama-graph.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* fixed set input for mean pooling to check if pooling type is ranking since modern bert does mean & rank
* Update src/llama-graph.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
@@ -11003,13 +11003,17 @@ class ModernBertModel(BertModel):
|
|||||||
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
|
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
|
||||||
|
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
# these layers act as MLM head, so we don't need them
|
|
||||||
if name.startswith("decoder."):
|
|
||||||
return
|
|
||||||
|
|
||||||
if name.startswith("model."):
|
if name.startswith("model."):
|
||||||
name = name[6:]
|
name = name[6:]
|
||||||
|
|
||||||
|
if self.cls_out_labels:
|
||||||
|
# For BertForSequenceClassification (direct projection layer)
|
||||||
|
if name == "classifier.weight":
|
||||||
|
name = "classifier.out_proj.weight"
|
||||||
|
|
||||||
|
if name == "classifier.bias":
|
||||||
|
name = "classifier.out_proj.bias"
|
||||||
|
|
||||||
yield from super().modify_tensors(data_torch, name, bid)
|
yield from super().modify_tensors(data_torch, name, bid)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -652,6 +652,7 @@ class MODEL_TENSOR(IntEnum):
|
|||||||
ENC_OUTPUT_NORM = auto()
|
ENC_OUTPUT_NORM = auto()
|
||||||
CLS = auto() # classifier
|
CLS = auto() # classifier
|
||||||
CLS_OUT = auto() # classifier output projection
|
CLS_OUT = auto() # classifier output projection
|
||||||
|
CLS_NORM = auto()
|
||||||
CONV1D = auto()
|
CONV1D = auto()
|
||||||
CONVNEXT_DW = auto()
|
CONVNEXT_DW = auto()
|
||||||
CONVNEXT_NORM = auto()
|
CONVNEXT_NORM = auto()
|
||||||
@@ -1088,6 +1089,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|||||||
MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
|
MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
|
||||||
MODEL_TENSOR.CLS: "cls",
|
MODEL_TENSOR.CLS: "cls",
|
||||||
MODEL_TENSOR.CLS_OUT: "cls.output",
|
MODEL_TENSOR.CLS_OUT: "cls.output",
|
||||||
|
MODEL_TENSOR.CLS_NORM: "cls.norm",
|
||||||
MODEL_TENSOR.CONV1D: "conv1d",
|
MODEL_TENSOR.CONV1D: "conv1d",
|
||||||
MODEL_TENSOR.CONVNEXT_DW: "convnext.{bid}.dw",
|
MODEL_TENSOR.CONVNEXT_DW: "convnext.{bid}.dw",
|
||||||
MODEL_TENSOR.CONVNEXT_NORM: "convnext.{bid}.norm",
|
MODEL_TENSOR.CONVNEXT_NORM: "convnext.{bid}.norm",
|
||||||
@@ -1507,6 +1509,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||||||
MODEL_TENSOR.FFN_NORM,
|
MODEL_TENSOR.FFN_NORM,
|
||||||
MODEL_TENSOR.CLS,
|
MODEL_TENSOR.CLS,
|
||||||
MODEL_TENSOR.CLS_OUT,
|
MODEL_TENSOR.CLS_OUT,
|
||||||
|
MODEL_TENSOR.CLS_NORM,
|
||||||
],
|
],
|
||||||
MODEL_ARCH.NOMIC_BERT: [
|
MODEL_ARCH.NOMIC_BERT: [
|
||||||
MODEL_TENSOR.TOKEN_EMBD,
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
|||||||
@@ -1240,6 +1240,10 @@ class TensorNameMap:
|
|||||||
MODEL_TENSOR.CLS_OUT: (
|
MODEL_TENSOR.CLS_OUT: (
|
||||||
"classifier.out_proj", # roberta
|
"classifier.out_proj", # roberta
|
||||||
),
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.CLS_NORM: (
|
||||||
|
"head.norm", # modern-bert
|
||||||
|
),
|
||||||
#############################################################################
|
#############################################################################
|
||||||
|
|
||||||
MODEL_TENSOR.CONVNEXT_DW: (
|
MODEL_TENSOR.CONVNEXT_DW: (
|
||||||
|
|||||||
@@ -367,6 +367,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
|
|||||||
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
|
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
|
||||||
{ LLM_TENSOR_CLS, "cls" },
|
{ LLM_TENSOR_CLS, "cls" },
|
||||||
{ LLM_TENSOR_CLS_OUT, "cls.output" },
|
{ LLM_TENSOR_CLS_OUT, "cls.output" },
|
||||||
|
{ LLM_TENSOR_CLS_NORM, "cls.norm" },
|
||||||
{ LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
|
{ LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
|
||||||
{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
|
{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
|
||||||
{ LLM_TENSOR_SSM_A_NOSCAN, "blk.%d.ssm_a" },
|
{ LLM_TENSOR_SSM_A_NOSCAN, "blk.%d.ssm_a" },
|
||||||
@@ -828,6 +829,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
|
|||||||
LLM_TENSOR_FFN_NORM,
|
LLM_TENSOR_FFN_NORM,
|
||||||
LLM_TENSOR_CLS,
|
LLM_TENSOR_CLS,
|
||||||
LLM_TENSOR_CLS_OUT,
|
LLM_TENSOR_CLS_OUT,
|
||||||
|
LLM_TENSOR_CLS_NORM,
|
||||||
};
|
};
|
||||||
case LLM_ARCH_JINA_BERT_V2:
|
case LLM_ARCH_JINA_BERT_V2:
|
||||||
return {
|
return {
|
||||||
@@ -2518,6 +2520,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
|||||||
{LLM_TENSOR_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
{LLM_TENSOR_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
||||||
{LLM_TENSOR_CLS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
{LLM_TENSOR_CLS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
||||||
{LLM_TENSOR_CLS_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
{LLM_TENSOR_CLS_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
||||||
|
{LLM_TENSOR_CLS_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
|
||||||
{LLM_TENSOR_DENSE_2_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
|
{LLM_TENSOR_DENSE_2_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
|
||||||
{LLM_TENSOR_DENSE_3_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
|
{LLM_TENSOR_DENSE_3_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Dense layer output
|
||||||
{LLM_TENSOR_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
|
{LLM_TENSOR_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
|
||||||
|
|||||||
@@ -497,6 +497,7 @@ enum llm_tensor {
|
|||||||
LLM_TENSOR_ENC_OUTPUT_NORM,
|
LLM_TENSOR_ENC_OUTPUT_NORM,
|
||||||
LLM_TENSOR_CLS,
|
LLM_TENSOR_CLS,
|
||||||
LLM_TENSOR_CLS_OUT,
|
LLM_TENSOR_CLS_OUT,
|
||||||
|
LLM_TENSOR_CLS_NORM,
|
||||||
LLM_TENSOR_CONV1D,
|
LLM_TENSOR_CONV1D,
|
||||||
LLM_TENSOR_CONVNEXT_DW,
|
LLM_TENSOR_CONVNEXT_DW,
|
||||||
LLM_TENSOR_CONVNEXT_NORM,
|
LLM_TENSOR_CONVNEXT_NORM,
|
||||||
|
|||||||
@@ -2761,6 +2761,7 @@ void llama_context::opt_init(struct llama_model * model, struct llama_opt_params
|
|||||||
llama_set_param(model->cls_b, param_filter, param_filter_ud);
|
llama_set_param(model->cls_b, param_filter, param_filter_ud);
|
||||||
llama_set_param(model->cls_out, param_filter, param_filter_ud);
|
llama_set_param(model->cls_out, param_filter, param_filter_ud);
|
||||||
llama_set_param(model->cls_out_b, param_filter, param_filter_ud);
|
llama_set_param(model->cls_out_b, param_filter, param_filter_ud);
|
||||||
|
llama_set_param(model->cls_norm, param_filter, param_filter_ud);
|
||||||
|
|
||||||
for (struct llama_layer & layer : model->layers) {
|
for (struct llama_layer & layer : model->layers) {
|
||||||
for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
|
for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
|
||||||
|
|||||||
+24
-5
@@ -185,7 +185,10 @@ bool llm_graph_input_out_ids::can_reuse(const llm_graph_params & params) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
|
void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
|
||||||
if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
|
if (cparams.embeddings &&
|
||||||
|
(cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN ||
|
||||||
|
cparams.pooling_type == LLAMA_POOLING_TYPE_RANK )) {
|
||||||
|
|
||||||
const int64_t n_tokens = ubatch->n_tokens;
|
const int64_t n_tokens = ubatch->n_tokens;
|
||||||
const int64_t n_seq_tokens = ubatch->n_seq_tokens;
|
const int64_t n_seq_tokens = ubatch->n_seq_tokens;
|
||||||
const int64_t n_seqs_unq = ubatch->n_seqs_unq;
|
const int64_t n_seqs_unq = ubatch->n_seqs_unq;
|
||||||
@@ -2437,7 +2440,8 @@ void llm_graph_context::build_pooling(
|
|||||||
ggml_tensor * cls,
|
ggml_tensor * cls,
|
||||||
ggml_tensor * cls_b,
|
ggml_tensor * cls_b,
|
||||||
ggml_tensor * cls_out,
|
ggml_tensor * cls_out,
|
||||||
ggml_tensor * cls_out_b) const {
|
ggml_tensor * cls_out_b,
|
||||||
|
ggml_tensor * cls_norm) const {
|
||||||
if (!cparams.embeddings) {
|
if (!cparams.embeddings) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@@ -2476,8 +2480,15 @@ void llm_graph_context::build_pooling(
|
|||||||
} break;
|
} break;
|
||||||
case LLAMA_POOLING_TYPE_RANK:
|
case LLAMA_POOLING_TYPE_RANK:
|
||||||
{
|
{
|
||||||
ggml_tensor * inp_cls = build_inp_cls();
|
if (arch == LLM_ARCH_MODERN_BERT) {
|
||||||
cur = ggml_get_rows(ctx0, inp, inp_cls);
|
// modern bert gte reranker builds mean first then applies prediction head and classifier
|
||||||
|
// https://github.com/huggingface/transformers/blob/main/src/transformers/models/modernbert/modular_modernbert.py#L1404-1411
|
||||||
|
ggml_tensor * inp_mean = build_inp_mean();
|
||||||
|
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
|
||||||
|
} else {
|
||||||
|
ggml_tensor * inp_cls = build_inp_cls();
|
||||||
|
cur = ggml_get_rows(ctx0, inp, inp_cls);
|
||||||
|
}
|
||||||
|
|
||||||
// classification head
|
// classification head
|
||||||
// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
|
// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
|
||||||
@@ -2486,7 +2497,15 @@ void llm_graph_context::build_pooling(
|
|||||||
if (cls_b) {
|
if (cls_b) {
|
||||||
cur = ggml_add(ctx0, cur, cls_b);
|
cur = ggml_add(ctx0, cur, cls_b);
|
||||||
}
|
}
|
||||||
cur = ggml_tanh(ctx0, cur);
|
if (arch == LLM_ARCH_MODERN_BERT) {
|
||||||
|
cur = ggml_gelu(ctx0, cur);
|
||||||
|
} else {
|
||||||
|
cur = ggml_tanh(ctx0, cur);
|
||||||
|
}
|
||||||
|
if (cls_norm) {
|
||||||
|
// head norm
|
||||||
|
cur = build_norm(cur, cls_norm, NULL, LLM_NORM, -1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
|
// some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
|
||||||
|
|||||||
+2
-1
@@ -1000,7 +1000,8 @@ struct llm_graph_context {
|
|||||||
ggml_tensor * cls,
|
ggml_tensor * cls,
|
||||||
ggml_tensor * cls_b,
|
ggml_tensor * cls_b,
|
||||||
ggml_tensor * cls_out,
|
ggml_tensor * cls_out,
|
||||||
ggml_tensor * cls_out_b) const;
|
ggml_tensor * cls_out_b,
|
||||||
|
ggml_tensor * cls_norm) const;
|
||||||
|
|
||||||
//
|
//
|
||||||
// sampling (backend sampling)
|
// sampling (backend sampling)
|
||||||
|
|||||||
@@ -271,6 +271,7 @@ void llama_model_saver::add_tensors_from_model() {
|
|||||||
add_tensor(model.cls_b);
|
add_tensor(model.cls_b);
|
||||||
add_tensor(model.cls_out);
|
add_tensor(model.cls_out);
|
||||||
add_tensor(model.cls_out_b);
|
add_tensor(model.cls_out_b);
|
||||||
|
add_tensor(model.cls_norm);
|
||||||
|
|
||||||
for (const struct llama_layer & layer : model.layers) {
|
for (const struct llama_layer & layer : model.layers) {
|
||||||
for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
|
for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
|
||||||
|
|||||||
+6
-5
@@ -908,7 +908,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||||||
|
|
||||||
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
|
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
|
||||||
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
|
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
|
||||||
hparams.set_swa_pattern(swa_period);
|
hparams.set_swa_pattern(swa_period, true);
|
||||||
} else {
|
} else {
|
||||||
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
||||||
}
|
}
|
||||||
@@ -3513,9 +3513,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
|
cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
|
||||||
cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
|
cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
|
||||||
cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
|
cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
|
||||||
|
cls_norm = create_tensor(tn(LLM_TENSOR_CLS_NORM, "weight"), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||||
|
|
||||||
} break;
|
} break;
|
||||||
case LLM_ARCH_NEO_BERT:
|
case LLM_ARCH_NEO_BERT:
|
||||||
@@ -8734,7 +8735,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// add on pooling layer
|
// add on pooling layer
|
||||||
llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
|
llm->build_pooling(cls, cls_b, cls_out, cls_out_b, cls_norm);
|
||||||
|
|
||||||
// add backend sampling layers (if any)
|
// add backend sampling layers (if any)
|
||||||
llm->build_sampling();
|
llm->build_sampling();
|
||||||
|
|||||||
@@ -475,6 +475,7 @@ struct llama_model {
|
|||||||
struct ggml_tensor * cls_b = nullptr;
|
struct ggml_tensor * cls_b = nullptr;
|
||||||
struct ggml_tensor * cls_out = nullptr;
|
struct ggml_tensor * cls_out = nullptr;
|
||||||
struct ggml_tensor * cls_out_b = nullptr;
|
struct ggml_tensor * cls_out_b = nullptr;
|
||||||
|
struct ggml_tensor * cls_norm = nullptr;
|
||||||
|
|
||||||
struct ggml_tensor * conv1d = nullptr;
|
struct ggml_tensor * conv1d = nullptr;
|
||||||
struct ggml_tensor * conv1d_b = nullptr;
|
struct ggml_tensor * conv1d_b = nullptr;
|
||||||
|
|||||||
@@ -104,13 +104,6 @@ llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const ll
|
|||||||
LLM_NORM, -1);
|
LLM_NORM, -1);
|
||||||
cb(cur, "final_norm_out", -1);
|
cb(cur, "final_norm_out", -1);
|
||||||
|
|
||||||
if (hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
|
|
||||||
// extracting cls token
|
|
||||||
cur = ggml_view_1d(ctx0, cur, hparams.n_embd, 0);
|
|
||||||
cb(cur, "cls_pooled_embd", -1);
|
|
||||||
}
|
|
||||||
|
|
||||||
cb(cur, "res_embd", -1);
|
|
||||||
res->t_embd = cur;
|
res->t_embd = cur;
|
||||||
ggml_build_forward_expand(gf, cur);
|
ggml_build_forward_expand(gf, cur);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user