model-conversion : add device option to embd run orig model (#18386)

This commit refactors the original model embedding script to include a
device selection option. Users can now specify the device (cpu, cuda,
mps, auto) via command-line arguments. It also refactors the code to be
more structured.
This commit is contained in:
Daniel Bevenius
2025-12-29 13:37:02 +01:00
committed by GitHub
parent 0c8986403b
commit 7cbec34a63
@@ -2,6 +2,7 @@
import argparse
import os
import sys
import numpy as np
import importlib
from pathlib import Path
@@ -9,32 +10,52 @@ from pathlib import Path
from transformers import AutoTokenizer, AutoConfig, AutoModel
import torch
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
parser = argparse.ArgumentParser(description='Process model with specified path')
parser.add_argument('--model-path', '-m', help='Path to the model')
parser.add_argument('--prompts-file', '-p', help='Path to file containing prompts (one per line)')
parser.add_argument('--use-sentence-transformers', action='store_true',
help='Use SentenceTransformer to apply all numbered layers (01_Pooling, 02_Dense, 03_Dense, 04_Normalize)')
args = parser.parse_args()
def parse_arguments():
parser = argparse.ArgumentParser(description='Run original embedding model')
parser.add_argument(
'--model-path',
'-m',
help='Path to the model'
)
parser.add_argument(
'--prompts-file',
'-p',
help='Path to file containing prompts (one per line)'
)
parser.add_argument(
'--use-sentence-transformers',
action='store_true',
help=('Use SentenceTransformer to apply all numbered layers '
'(01_Pooling, 02_Dense, 03_Dense, 04_Normalize)')
)
parser.add_argument(
'--device',
'-d',
help='Device to use (cpu, cuda, mps, auto)',
default='auto'
)
return parser.parse_args()
def read_prompt_from_file(file_path):
try:
with open(file_path, 'r', encoding='utf-8') as f:
return f.read().strip()
except FileNotFoundError:
print(f"Error: Prompts file '{file_path}' not found")
exit(1)
except Exception as e:
print(f"Error reading prompts file: {e}")
exit(1)
model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path)
if model_path is None:
parser.error("Model path must be specified either via --model-path argument or EMBEDDING_MODEL_PATH environment variable")
# Determine if we should use SentenceTransformer
use_sentence_transformers = args.use_sentence_transformers or os.environ.get('USE_SENTENCE_TRANSFORMERS', '').lower() in ('1', 'true', 'yes')
def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device="auto"):
if device == "cpu":
device_map = {"": "cpu"}
print("Forcing CPU usage")
elif device == "auto":
# On Mac, "auto" device_map can cause issues with accelerate
# So we detect the best device manually
if torch.cuda.is_available():
device_map = {"": "cuda"}
print("Using CUDA")
elif torch.backends.mps.is_available():
device_map = {"": "mps"}
print("Using MPS (Apple Metal)")
else:
device_map = {"": "cpu"}
print("Using CPU")
else:
device_map = {"": device}
if use_sentence_transformers:
from sentence_transformers import SentenceTransformer
@@ -44,7 +65,6 @@ if use_sentence_transformers:
config = model[0].auto_model.config # type: ignore
else:
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
# This can be used to override the sliding window size for manual testing. This
@@ -52,9 +72,9 @@ else:
# and compare it with the converted .gguf model.
if hasattr(config, 'sliding_window'):
original_sliding_window = config.sliding_window
#original_sliding_window = 6
print(f"Modified sliding window: {original_sliding_window} -> {config.sliding_window}")
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
print(f"Using unreleased model: {unreleased_model_name}")
if unreleased_model_name:
model_name_lower = unreleased_model_name.lower()
@@ -64,32 +84,81 @@ else:
try:
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
model = model_class.from_pretrained(model_path, config=config, trust_remote_code=True)
model = model_class.from_pretrained(
model_path,
device_map=device_map,
offload_folder="offload",
trust_remote_code=True,
config=config
)
except (ImportError, AttributeError) as e:
print(f"Failed to import or load model: {e}")
exit(1)
sys.exit(1)
else:
model = AutoModel.from_pretrained(model_path, config=config, trust_remote_code=True)
model = AutoModel.from_pretrained(
model_path,
device_map=device_map,
offload_folder="offload",
trust_remote_code=True,
config=config
)
print(f"Model class: {type(model)}")
print(f"Model file: {type(model).__module__}")
# Verify the model is using the correct sliding window
if not use_sentence_transformers:
if hasattr(model.config, 'sliding_window'): # type: ignore
print(f"Model's sliding_window: {model.config.sliding_window}") # type: ignore
else:
print("Model config does not have sliding_window attribute")
return model, tokenizer, config
def get_prompt(args):
if args.prompts_file:
try:
with open(args.prompts_file, 'r', encoding='utf-8') as f:
return f.read().strip()
except FileNotFoundError:
print(f"Error: Prompts file '{args.prompts_file}' not found")
sys.exit(1)
except Exception as e:
print(f"Error reading prompts file: {e}")
sys.exit(1)
else:
return "Hello world today"
def main():
args = parse_arguments()
model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path)
if model_path is None:
print("Error: Model path must be specified either via --model-path argument "
"or EMBEDDING_MODEL_PATH environment variable")
sys.exit(1)
# Determine if we should use SentenceTransformer
use_st = (
args.use_sentence_transformers or os.environ.get('USE_SENTENCE_TRANSFORMERS', '').lower() in ('1', 'true', 'yes')
)
model, tokenizer, config = load_model_and_tokenizer(model_path, use_st, args.device)
# Get the device the model is on
if not use_st:
device = next(model.parameters()).device
else:
# For SentenceTransformer, get device from the underlying model
device = next(model[0].auto_model.parameters()).device # type: ignore
model_name = os.path.basename(model_path)
if args.prompts_file:
prompt_text = read_prompt_from_file(args.prompts_file)
prompt_text = get_prompt(args)
texts = [prompt_text]
else:
texts = ["Hello world today"]
with torch.no_grad():
if use_sentence_transformers:
if use_st:
embeddings = model.encode(texts, convert_to_numpy=True)
all_embeddings = embeddings # Shape: [batch_size, hidden_size]
@@ -120,6 +189,8 @@ with torch.no_grad():
for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
print(f"{token_id:6d} -> '{token_str}'")
# Move inputs to the same device as the model
encoded = {k: v.to(device) for k, v in encoded.items()}
outputs = model(**encoded)
hidden_states = outputs.last_hidden_state # Shape: [batch_size, seq_len, hidden_size]
@@ -175,3 +246,7 @@ with torch.no_grad():
print("")
print(f"Saved bin embeddings to: {bin_filename}")
print(f"Saved txt embeddings to: {txt_filename}")
if __name__ == "__main__":
main()