code

#2
by asdgad - opened

What is the code to run this model?

import torch
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Xenova/Qwen1.5-1.8B-Chat")

# Try loading the quantized version instead. It is often smaller and might not
# rely on the external .onnx_data file, thus avoiding the file-naming issue.
model = ORTModelForCausalLM.from_pretrained(
    "Xenova/Qwen1.5-1.8B-Chat",
    file_name="decoder_model_merged_quantized.onnx",  # <-- CHANGE: use the quantized file
    subfolder="onnx",
)

inputs = tokenizer("My name is Arthur and I live in", return_tensors="pt")

# Note: max_length counts the prompt tokens as well; use max_new_tokens instead
# if you only want to limit the number of newly generated tokens.
gen_tokens = model.generate(**inputs, do_sample=True, temperature=0.9, min_length=20, max_length=20)

print(tokenizer.batch_decode(gen_tokens))
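
Since Qwen1.5-1.8B-Chat is a chat-tuned model, you can also build the prompt with the tokenizer's chat template instead of a raw string. A minimal sketch, assuming the Xenova repo ships the same chat template and ONNX file layout as used above:

from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Xenova/Qwen1.5-1.8B-Chat")
model = ORTModelForCausalLM.from_pretrained(
    "Xenova/Qwen1.5-1.8B-Chat",
    file_name="decoder_model_merged_quantized.onnx",
    subfolder="onnx",
)

# Build a chat-formatted prompt (assumes the tokenizer includes Qwen's chat template)
messages = [{"role": "user", "content": "Give me a short introduction to large language models."}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt")

# max_new_tokens counts only the newly generated tokens, not the prompt
gen_tokens = model.generate(**inputs, do_sample=True, temperature=0.7, max_new_tokens=128)

# Drop the prompt tokens and decode only the model's reply
reply = tokenizer.batch_decode(gen_tokens[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]
print(reply)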

Install the ONNX Runtime backend for Optimum first:

pip install "optimum-onnx[onnxruntime]"

Running the script above then downloads the quantized model and produces the following output:


onnx/decoder_model_merged_quantized.onnx: 100% 1.87G/1.87G [00:40<00:00, 69.6MB/s]

Setting pad_token_id to eos_token_id:151643 for open-end generation.

['My name is Arthur and I live in the city of Cambridge, England. Cambridge is a great place']
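
The "Setting pad_token_id" message is just an informational notice for open-ended generation. If you want to silence it, pass the pad token explicitly; a minimal sketch reusing the model, tokenizer, and inputs from the script above:

# Optional: set pad_token_id explicitly to avoid the open-end generation warning
gen_tokens = model.generate(
    **inputs,
    do_sample=True,
    temperature=0.9,
    max_length=20,
    pad_token_id=tokenizer.eos_token_id,  # 151643 for this tokenizer, per the log above
)
print(tokenizer.batch_decode(gen_tokens, skip_special_tokens=True))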
