# Install required packages (scikit-learn is also needed, for train_test_split below)
# !pip install -q "git+https://github.com/huggingface/trl.git" unsloth huggingface_hub scikit-learn
import torch
import json
import re
from unsloth import FastLanguageModel
from datasets import load_dataset, Dataset
from trl import SFTTrainer
from transformers import TrainingArguments, TrainerCallback, set_seed
from sklearn.model_selection import train_test_split
import numpy as np
import sys
from collections import defaultdict
from huggingface_hub import HfApi, login
set_seed(3407)
# Check for CUDA
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
# Configuration
max_seq_length = 256
model_name = "unsloth/gemma-3-270m-it"
output_dir = "gemma-3-270m-custom-ner-sft-chat"
hf_dataset_repo = "Luigi/dinercall-ner"
hf_model_repo = "Luigi/gemma-3-270m-it-dinercall-ner"  # model repo for the fine-tuned weights (distinct from the dataset repo above)
# System prompt (Traditional Chinese). English gloss: "You are an assistant that extracts
# reservation information from the user's message and outputs it as JSON. The JSON must
# contain three fields: num_people, reservation_date, phone_num. Use an empty string for
# any field with no information. Output only the JSON, with no extra text."
system_prompt = """你是一個助理,負責從用戶消息中提取預訂資訊並以JSON格式輸出。
JSON必須包含三個字段: num_people, reservation_date, phone_num。
如果某個字段沒有信息,使用空字符串。只輸出JSON,不要添加任何其他文字。"""
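# The target completion is therefore a single JSON object of the form
#   {"num_people": "...", "reservation_date": "...", "phone_num": "..."}
# with empty strings for any field the message does not mention.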
# Demo samples for quick testing
DEMO_SAMPLES = [
"你好,我想訂明天晚上7點的位子,四位成人,電話是0912-345-678",
"週六下午三點,兩位,電話0987654321",
"預約下週三中午12點半,5人用餐,聯絡電話0912345678"
]
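# English gloss of the demo samples above:
#   1) "Hi, I'd like to book a table for tomorrow at 7 pm, four adults, my phone is 0912-345-678"
#   2) "Saturday at 3 pm, two people, phone 0987654321"
#   3) "Book next Wednesday at 12:30 noon, 5 people dining, contact number 0912345678"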
# Load and prepare dataset
def load_dataset_from_hf(tokenizer):
try:
# Load dataset from Hugging Face
dataset = load_dataset(hf_dataset_repo, split="train")
formatted_data = []
raw_eval_data = []
for example in dataset:
# Create the output JSON structure
output_dict = {
"num_people": example["num_people"] if example["num_people"] else "",
"reservation_date": example["reservation_date"] if example["reservation_date"] else "",
"phone_num": example["phone_num"] if example["phone_num"] else ""
}
# Create chat template format
chat_template = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": example["text"]},
{"role": "assistant", "content": json.dumps(output_dict, ensure_ascii=False)}
]
formatted_text = tokenizer.apply_chat_template(
chat_template,
tokenize=False,
add_generation_prompt=False
)
formatted_data.append({"text": formatted_text})
# Store raw data for evaluation
raw_eval_data.append({
"text": example["text"],
"output": output_dict
})
# Split data (both calls use the same random_state on lists of identical length and order,
# so raw_eval_data lines up one-to-one with the formatted eval split)
train_data, eval_data = train_test_split(formatted_data, test_size=0.2, random_state=42)
_, raw_eval_data = train_test_split(raw_eval_data, test_size=0.2, random_state=42)
print(f"Dataset: {len(formatted_data)} total, {len(train_data)} train, {len(eval_data)} eval")
return Dataset.from_list(train_data), Dataset.from_list(eval_data), raw_eval_data
except Exception as e:
print(f"Error loading dataset from HF: {e}")
sys.exit(1)
# JSON validation function (unchanged)
def validate_json(output):
try:
json_match = re.search(r'\{[\s\S]*\}', output)
if not json_match:
return False, None, "No JSON found"
json_str = json_match.group(0)
json_str = re.sub(r',\s*\}', '}', json_str)
parsed = json.loads(json_str)
return True, parsed, "Valid JSON"
except Exception:
return False, None, "Invalid JSON"
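# A quick illustration of the behaviour above (hypothetical input): the greedy regex pulls
# the {...} span out of any surrounding text and the re.sub drops a trailing comma before
# the closing brace, so
#   validate_json('好的 {"num_people": "2",} 謝謝')
# would return (True, {"num_people": "2"}, "Valid JSON").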
# Evaluation function (unchanged)
def evaluate_model(model, tokenizer, eval_samples, max_samples=20):
"""Evaluate model performance on a set of samples"""
model.eval()
json_validity = []
exact_matches = []
field_accuracy = defaultdict(list)
for i, example in enumerate(eval_samples[:max_samples]):
# Create prompt for inference
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": example["text"]}
]
prompt = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
inputs = tokenizer(prompt, return_tensors="pt", padding=True)
if device == "cuda":
inputs = inputs.to("cuda")
# Generate response
eos_token_id = tokenizer.eos_token_id
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=64,
temperature=0.1,  # ignored here: do_sample=False below means greedy decoding
pad_token_id=eos_token_id,
eos_token_id=eos_token_id,
do_sample=False,
)
# Extract only the assistant's response
prompt_length = len(inputs.input_ids[0])
assistant_output = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
# Validate JSON
is_valid, parsed, _ = validate_json(assistant_output)
json_validity.append(1 if is_valid else 0)
# Calculate accuracy metrics
if is_valid:
# Check exact match
exact_match = 1 if parsed == example["output"] else 0
exact_matches.append(exact_match)
# Check field-level accuracy
for field in ["num_people", "reservation_date", "phone_num"]:
if field in parsed and field in example["output"]:
field_match = 1 if parsed[field] == example["output"][field] else 0
field_accuracy[field].append(field_match)
# Calculate metrics
json_accuracy = np.mean(json_validity) * 100 if json_validity else 0
exact_match_rate = np.mean(exact_matches) * 100 if exact_matches else 0
field_accuracy_rates = {}
for field, accuracies in field_accuracy.items():
field_accuracy_rates[field] = np.mean(accuracies) * 100 if accuracies else 0
return {
"json_accuracy": json_accuracy,
"exact_match": exact_match_rate,
"field_accuracy": field_accuracy_rates
}
# Demo function (unchanged)
def run_demo(model, tokenizer, step="Initial"):
print(f"\n{'='*50}")
print(f"DEMO: {step}")
print(f"{'='*50}")
model.eval()
for i, text in enumerate(DEMO_SAMPLES):
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": text}
]
prompt = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
inputs = tokenizer(prompt, return_tensors="pt", padding=True)
if device == "cuda":
inputs = inputs.to("cuda")
# Generate response
eos_token_id = tokenizer.eos_token_id
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=64,
temperature=0.1,
pad_token_id=eos_token_id,
eos_token_id=eos_token_id,
do_sample=True,  # the demo samples with temperature 0.1, unlike the greedy decoding in evaluate_model
)
# Extract only the assistant's response
prompt_length = len(inputs.input_ids[0])
assistant_output = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print(f"\nSample {i+1}: {text}")
print(f"Output: {assistant_output}")
is_valid, parsed, message = validate_json(assistant_output)
status = "✅ VALID" if is_valid else "❌ INVALID"
print(f"{status}: {message}")
if is_valid:
print(json.dumps(parsed, indent=2, ensure_ascii=False))
# Custom callback for evaluation (unchanged)
class EvaluationCallback(TrainerCallback):
def __init__(self, tokenizer, eval_data, eval_interval=500):
self.tokenizer = tokenizer
self.eval_data = eval_data
self.eval_interval = eval_interval
self.best_accuracy = 0
def on_step_end(self, args, state, control, **kwargs):
if state.global_step % self.eval_interval == 0:
print(f"\n{'='*60}")
print(f"EVALUATION AT STEP {state.global_step}")
print(f"{'='*60}")
# Get the model from the trainer
model = kwargs['model']
# Run evaluation
metrics = evaluate_model(model, self.tokenizer, self.eval_data)
print(f"JSON Accuracy: {metrics['json_accuracy']:.2f}%")
print(f"Exact Match: {metrics['exact_match']:.2f}%")
print("Field-level Accuracy:")
for field, accuracy in metrics['field_accuracy'].items():
print(f" {field}: {accuracy:.2f}%")
# Save best model
if metrics['json_accuracy'] > self.best_accuracy:
self.best_accuracy = metrics['json_accuracy']
print(f"New best accuracy: {self.best_accuracy:.2f}%")
model.save_pretrained(f"{output_dir}/best_model")
self.tokenizer.save_pretrained(f"{output_dir}/best_model")
def on_train_end(self, args, state, control, **kwargs):
print(f"\n{'='*60}")
print("FINAL EVALUATION")
print(f"{'='*60}")
# Get the model from the trainer
model = kwargs['model']
# Run final evaluation
metrics = evaluate_model(model, self.tokenizer, self.eval_data)
print(f"Final JSON Accuracy: {metrics['json_accuracy']:.2f}%")
print(f"Final Exact Match: {metrics['exact_match']:.2f}%")
print("Final Field-level Accuracy:")
for field, accuracy in metrics['field_accuracy'].items():
print(f" {field}: {accuracy:.2f}%")
# Function to merge and upload model - MODIFIED
def merge_and_upload_model(model, tokenizer, repo_name):
"""Merge LoRA adapter, convert to FP16, and upload to Hugging Face Hub"""
try:
# First, create the repository if it doesn't exist
from huggingface_hub import create_repo
create_repo(
repo_id=repo_name,
repo_type="model",
exist_ok=True
)
# Merge the model with LoRA weights
model = model.merge_and_unload()
# CONVERT TO FP16 - This is the key change
model = model.half()
# Create a temporary directory for the final model
final_model_dir = "./final_model"
# Save the merged model to the temporary directory
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)
# Upload only the final model to Hugging Face Hub
print(f"Uploading final model to {repo_name}...")
from huggingface_hub import HfApi
api = HfApi()
# Upload only the final model directory
api.upload_folder(
folder_path=final_model_dir,
repo_id=repo_name,
repo_type="model"
)
print(f"Final model successfully uploaded to {repo_name}")
# Clean up the temporary directory
import shutil
shutil.rmtree(final_model_dir, ignore_errors=True)
except Exception as e:
print(f"Error merging/uploading model: {e}")
# Main execution
if __name__ == "__main__":
# Load model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
model_name=model_name,
max_seq_length=max_seq_length,
dtype=torch.float16,
load_in_4bit=False,
)
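# Loading the base weights in fp16 without 4-bit quantization keeps the later LoRA merge and
# fp16 upload straightforward (no dequantization step), and the 270M model should comfortably
# fit in memory at this precision.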
model = FastLanguageModel.get_peft_model(
model,
r=32,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
lora_alpha=32,
lora_dropout=0.1,
bias="none",
use_gradient_checkpointing=True,
)
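# With r=32 and lora_alpha=32 the LoRA scaling factor (lora_alpha / r) is 1.0, and adapters
# are attached to every attention and MLP projection listed in target_modules.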
# Load datasets from HF
train_dataset, eval_dataset, raw_eval_data = load_dataset_from_hf(tokenizer)
# Training arguments (without built-in evaluation to avoid EmptyLogits error)
training_args = TrainingArguments(
output_dir=output_dir,
learning_rate=2e-5,
per_device_train_batch_size=4,
gradient_accumulation_steps=2,
num_train_epochs=5,
logging_steps=10,
save_steps=500,
fp16=True, # fp16 mixed precision; bf16 stays off below since older GPUs (e.g. free-tier T4s) do not support it
bf16=False,
eval_strategy="no", # Disable built-in evaluation
)
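# Effective batch size per optimizer step: 4 (per-device batch) x 2 (gradient accumulation) = 8 sequences.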
# Initialize trainer with custom callback
trainer = SFTTrainer(
model=model,
tokenizer=tokenizer,
args=training_args,
train_dataset=train_dataset,
dataset_text_field="text",
max_seq_length=max_seq_length,
callbacks=[EvaluationCallback(tokenizer, raw_eval_data, eval_interval=500)],
)
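# Note: newer TRL releases move dataset_text_field and max_seq_length into SFTConfig and rename
# the tokenizer argument to processing_class, so if the git install at the top pulls a current
# TRL these keyword arguments may need adjusting (Unsloth may also patch SFTTrainer to keep
# accepting them).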
# Run initial demo
run_demo(model, tokenizer, "Pre-training")
# Train
print("Starting training...")
trainer.train()
# Run final demo
run_demo(model, tokenizer, "Final")
# Save the model locally first
trainer.save_model(output_dir)
# Merge and upload only the final model
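# Uploading requires an authenticated Hugging Face token (e.g. via huggingface_hub.login(),
# `huggingface-cli login`, or the HF_TOKEN environment variable); note that `login` is
# imported at the top of this script but never called.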
merge_and_upload_model(model, tokenizer, hf_model_repo)
print(f"Training completed. Final model saved and uploaded.")