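"""Fine-tune unsloth/gemma-3-270m-it with LoRA (Unsloth + TRL SFTTrainer) to extract
restaurant-reservation fields (num_people, reservation_date, phone_num) from Chinese
diner-call messages as strict JSON, then merge the adapter into an FP16 model and
upload it to the Hugging Face Hub."""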

import json
import re
import shutil
import sys
from collections import defaultdict

import torch
from unsloth import FastLanguageModel  # imported before transformers/trl, which Unsloth patches
from datasets import load_dataset, Dataset
from trl import SFTTrainer
from transformers import TrainingArguments, TrainerCallback, set_seed
from sklearn.model_selection import train_test_split
import numpy as np
from huggingface_hub import HfApi, create_repo

set_seed(3407)  # fix all RNGs for reproducibility
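
# The upload step at the end requires Hub authentication, e.g. via
# `huggingface-cli login` beforehand or an HF_TOKEN environment variable.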

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Training configuration and Hub repositories.
max_seq_length = 256
model_name = "unsloth/gemma-3-270m-it"
output_dir = "gemma-3-270m-custom-ner-sft-chat"
hf_dataset_repo = "Luigi/dinercall-ner"
hf_model_repo = "Luigi/gemma-3-270m-it-dinercall-ner"
system_prompt = """你是一個助理,負責從用戶消息中提取預訂資訊並以JSON格式輸出。 |
|
|
JSON必須包含三個字段: num_people, reservation_date, phone_num。 |
|
|
如果某個字段沒有信息,使用空字符串。只輸出JSON,不要添加任何其他文字。""" |
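# The target output shape is exactly this object (empty string = field absent):
#   {"num_people": "", "reservation_date": "", "phone_num": ""}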

# Demo inputs (zh). The first translates to: "Hi, I'd like to book a table for
# tomorrow at 7 pm, four adults, my phone is 0912-345-678."
DEMO_SAMPLES = [
    "你好,我想訂明天晚上7點的位子,四位成人,電話是0912-345-678",
    "週六下午三點,兩位,電話0987654321",
    "預約下週三中午12點半,5人用餐,聯絡電話0912345678",
]


def load_dataset_from_hf(tokenizer):
    """Load the NER dataset from the Hub and format it for chat-style SFT."""
    try:
        dataset = load_dataset(hf_dataset_repo, split="train")

        formatted_data = []  # chat-templated strings for the trainer
        raw_eval_data = []   # raw text/label pairs for metric computation

        for example in dataset:
            # Normalize missing fields to empty strings, as the system prompt requires.
            output_dict = {
                "num_people": example["num_people"] if example["num_people"] else "",
                "reservation_date": example["reservation_date"] if example["reservation_date"] else "",
                "phone_num": example["phone_num"] if example["phone_num"] else "",
            }

            # Render a full system/user/assistant exchange with the model's chat
            # template; the assistant turn is the target JSON string.
            chat_template = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": example["text"]},
                {"role": "assistant", "content": json.dumps(output_dict, ensure_ascii=False)},
            ]
            formatted_text = tokenizer.apply_chat_template(
                chat_template,
                tokenize=False,
                add_generation_prompt=False,
            )
            formatted_data.append({"text": formatted_text})

            raw_eval_data.append({
                "text": example["text"],
                "output": output_dict,
            })

        # The identical random_state keeps the formatted and raw eval splits aligned.
        train_data, eval_data = train_test_split(formatted_data, test_size=0.2, random_state=42)
        _, raw_eval_data = train_test_split(raw_eval_data, test_size=0.2, random_state=42)

        print(f"Dataset: {len(formatted_data)} total, {len(train_data)} train, {len(eval_data)} eval")
        return Dataset.from_list(train_data), Dataset.from_list(eval_data), raw_eval_data

    except Exception as e:
        print(f"Error loading dataset from HF: {e}")
        sys.exit(1)


def validate_json(output):
    """Extract the first JSON object from `output` and attempt to parse it."""
    try:
        json_match = re.search(r'\{[\s\S]*\}', output)
        if not json_match:
            return False, None, "No JSON found"

        json_str = json_match.group(0)
        # Tolerate a trailing comma before the closing brace.
        json_str = re.sub(r',\s*\}', '}', json_str)
        parsed = json.loads(json_str)
        return True, parsed, "Valid JSON"
    except json.JSONDecodeError:
        return False, None, "Invalid JSON"
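
# For example, validate_json('answer: {"num_people": "2",}') strips the trailing
# comma and returns (True, {"num_people": "2"}, "Valid JSON").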


def evaluate_model(model, tokenizer, eval_samples, max_samples=20):
    """Evaluate JSON validity, exact match, and per-field accuracy on eval samples."""
    model.eval()

    json_validity = []
    exact_matches = []
    field_accuracy = defaultdict(list)

    for example in eval_samples[:max_samples]:
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": example["text"]},
        ]
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        inputs = tokenizer(prompt, return_tensors="pt")
        if device == "cuda":
            inputs = inputs.to("cuda")

        eos_token_id = tokenizer.eos_token_id
        with torch.no_grad():
            # Greedy decoding for deterministic, comparable metrics.
            outputs = model.generate(
                **inputs,
                max_new_tokens=64,
                do_sample=False,
                pad_token_id=eos_token_id,
                eos_token_id=eos_token_id,
            )

        # Decode only the newly generated tokens, not the echoed prompt.
        prompt_length = len(inputs.input_ids[0])
        assistant_output = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        is_valid, parsed, _ = validate_json(assistant_output)
        json_validity.append(1 if is_valid else 0)

        if is_valid:
            exact_matches.append(1 if parsed == example["output"] else 0)

            for field in ["num_people", "reservation_date", "phone_num"]:
                if field in parsed and field in example["output"]:
                    field_accuracy[field].append(1 if parsed[field] == example["output"][field] else 0)

    json_accuracy = np.mean(json_validity) * 100 if json_validity else 0
    exact_match_rate = np.mean(exact_matches) * 100 if exact_matches else 0

    field_accuracy_rates = {}
    for field, accuracies in field_accuracy.items():
        field_accuracy_rates[field] = np.mean(accuracies) * 100 if accuracies else 0

    return {
        "json_accuracy": json_accuracy,
        "exact_match": exact_match_rate,
        "field_accuracy": field_accuracy_rates,
    }
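
# Illustrative return value (the numbers here are hypothetical):
#   {"json_accuracy": 95.0, "exact_match": 80.0,
#    "field_accuracy": {"num_people": 90.0, "reservation_date": 85.0, "phone_num": 95.0}}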


def run_demo(model, tokenizer, step="Initial"):
    """Generate and validate outputs for the fixed demo samples."""
    print(f"\n{'='*50}")
    print(f"DEMO: {step}")
    print(f"{'='*50}")

    model.eval()

    for i, text in enumerate(DEMO_SAMPLES):
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ]
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        inputs = tokenizer(prompt, return_tensors="pt")
        if device == "cuda":
            inputs = inputs.to("cuda")

        eos_token_id = tokenizer.eos_token_id
        with torch.no_grad():
            # Low-temperature sampling: near-deterministic, unlike the greedy eval loop.
            outputs = model.generate(
                **inputs,
                max_new_tokens=64,
                do_sample=True,
                temperature=0.1,
                pad_token_id=eos_token_id,
                eos_token_id=eos_token_id,
            )

        prompt_length = len(inputs.input_ids[0])
        assistant_output = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        print(f"\nSample {i+1}: {text}")
        print(f"Output: {assistant_output}")

        is_valid, parsed, message = validate_json(assistant_output)
        status = "✅ VALID" if is_valid else "❌ INVALID"
        print(f"{status}: {message}")
        if is_valid:
            print(json.dumps(parsed, indent=2, ensure_ascii=False))


class EvaluationCallback(TrainerCallback):
    """Runs the custom JSON metrics every `eval_interval` steps and keeps the best model."""

    def __init__(self, tokenizer, eval_data, eval_interval=500):
        self.tokenizer = tokenizer
        self.eval_data = eval_data
        self.eval_interval = eval_interval
        self.best_accuracy = 0

    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % self.eval_interval == 0:
            print(f"\n{'='*60}")
            print(f"EVALUATION AT STEP {state.global_step}")
            print(f"{'='*60}")

            model = kwargs['model']
            metrics = evaluate_model(model, self.tokenizer, self.eval_data)

            print(f"JSON Accuracy: {metrics['json_accuracy']:.2f}%")
            print(f"Exact Match: {metrics['exact_match']:.2f}%")
            print("Field-level Accuracy:")
            for field, accuracy in metrics['field_accuracy'].items():
                print(f"  {field}: {accuracy:.2f}%")

            # Checkpoint whenever JSON validity improves.
            if metrics['json_accuracy'] > self.best_accuracy:
                self.best_accuracy = metrics['json_accuracy']
                print(f"New best accuracy: {self.best_accuracy:.2f}%")
                model.save_pretrained(f"{output_dir}/best_model")
                self.tokenizer.save_pretrained(f"{output_dir}/best_model")

            # evaluate_model() put the model in eval mode; restore training mode.
            model.train()

    def on_train_end(self, args, state, control, **kwargs):
        print(f"\n{'='*60}")
        print("FINAL EVALUATION")
        print(f"{'='*60}")

        model = kwargs['model']
        metrics = evaluate_model(model, self.tokenizer, self.eval_data)

        print(f"Final JSON Accuracy: {metrics['json_accuracy']:.2f}%")
        print(f"Final Exact Match: {metrics['exact_match']:.2f}%")
        print("Final Field-level Accuracy:")
        for field, accuracy in metrics['field_accuracy'].items():
            print(f"  {field}: {accuracy:.2f}%")


def merge_and_upload_model(model, tokenizer, repo_name):
    """Merge the LoRA adapter, convert to FP16, and upload to the Hugging Face Hub."""
    try:
        create_repo(repo_id=repo_name, repo_type="model", exist_ok=True)

        # Fold the LoRA weights into the base model and drop the adapter wrapper.
        model = model.merge_and_unload()
        model = model.half()

        final_model_dir = "./final_model"
        model.save_pretrained(final_model_dir)
        tokenizer.save_pretrained(final_model_dir)

        print(f"Uploading final model to {repo_name}...")
        api = HfApi()
        api.upload_folder(
            folder_path=final_model_dir,
            repo_id=repo_name,
            repo_type="model",
        )
        print(f"Final model successfully uploaded to {repo_name}")

        # Clean up the local export directory.
        shutil.rmtree(final_model_dir, ignore_errors=True)

    except Exception as e:
        print(f"Error merging/uploading model: {e}")
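
# Once uploaded, the merged FP16 checkpoint can be loaded with plain transformers
# (a sketch; assumes the repo exists and you have access):
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#   tokenizer = AutoTokenizer.from_pretrained(hf_model_repo)
#   model = AutoModelForCausalLM.from_pretrained(hf_model_repo)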


if __name__ == "__main__":
    # Load the base model in FP16 (no quantization) with Unsloth.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=torch.float16,
        load_in_4bit=False,
    )

    # Attach LoRA adapters to the attention and MLP projections.
    model = FastLanguageModel.get_peft_model(
        model,
        r=32,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_alpha=32,
        lora_dropout=0.1,
        bias="none",
        use_gradient_checkpointing=True,
    )

    # eval_dataset is not passed to the trainer (eval_strategy="no");
    # metrics come from raw_eval_data via EvaluationCallback instead.
    train_dataset, eval_dataset, raw_eval_data = load_dataset_from_hf(tokenizer)

    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,  # effective batch size: 4 x 2 = 8
        num_train_epochs=5,
        logging_steps=10,
        save_steps=500,
        fp16=True,
        bf16=False,
        eval_strategy="no",
    )

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_dataset,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        callbacks=[EvaluationCallback(tokenizer, raw_eval_data, eval_interval=500)],
    )

    # Baseline demo before any fine-tuning, for comparison with the final run.
    run_demo(model, tokenizer, "Pre-training")

    print("Starting training...")
    trainer.train()

    run_demo(model, tokenizer, "Final")

    # Save the adapter checkpoint, then merge and push the full FP16 model.
    trainer.save_model(output_dir)
    merge_and_upload_model(model, tokenizer, hf_model_repo)

    print("Training completed. Final model saved and uploaded.")