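"""Fine-tune unsloth/gemma-3-270m-it with LoRA (Unsloth + TRL SFTTrainer) to extract
restaurant-reservation fields (num_people, reservation_date, phone_num) from Chinese
diner-call messages as strict JSON, then merge the adapter into an FP16 model and
upload it to the Hugging Face Hub."""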

import json
import re
import shutil
import sys
from collections import defaultdict

import torch
from unsloth import FastLanguageModel  # imported before transformers/trl, which Unsloth patches
from datasets import load_dataset, Dataset
from trl import SFTTrainer
from transformers import TrainingArguments, TrainerCallback, set_seed
from sklearn.model_selection import train_test_split
import numpy as np
from huggingface_hub import HfApi, create_repo

set_seed(3407)  # fix all RNGs for reproducibility
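
# The upload step at the end requires Hub authentication, e.g. via
# `huggingface-cli login` beforehand or an HF_TOKEN environment variable.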

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Training configuration and Hub repositories.
max_seq_length = 256
model_name = "unsloth/gemma-3-270m-it"
output_dir = "gemma-3-270m-custom-ner-sft-chat"
hf_dataset_repo = "Luigi/dinercall-ner"
hf_model_repo = "Luigi/gemma-3-270m-it-dinercall-ner"
system_prompt = """你是一個助理,負責從用戶消息中提取預訂資訊並以JSON格式輸出。 |
|
|
JSON必須包含三個字段: num_people, reservation_date, phone_num。 |
|
|
如果某個字段沒有信息,使用空字符串。只輸出JSON,不要添加任何其他文字。""" |
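# The target output shape is exactly this object (empty string = field absent):
#   {"num_people": "", "reservation_date": "", "phone_num": ""}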

# Demo inputs (zh). The first translates to: "Hi, I'd like to book a table for
# tomorrow at 7 pm, four adults, my phone is 0912-345-678."
DEMO_SAMPLES = [
    "你好,我想訂明天晚上7點的位子,四位成人,電話是0912-345-678",
    "週六下午三點,兩位,電話0987654321",
    "預約下週三中午12點半,5人用餐,聯絡電話0912345678",
]


def load_dataset_from_hf(tokenizer):
    """Load the NER dataset from the Hub and format it for chat-style SFT."""
    try:
        dataset = load_dataset(hf_dataset_repo, split="train")

        formatted_data = []  # chat-templated strings for the trainer
        raw_eval_data = []   # raw text/label pairs for metric computation

        for example in dataset:
            # Normalize missing fields to empty strings, as the system prompt requires.
            output_dict = {
                "num_people": example["num_people"] if example["num_people"] else "",
                "reservation_date": example["reservation_date"] if example["reservation_date"] else "",
                "phone_num": example["phone_num"] if example["phone_num"] else "",
            }

            # Render a full system/user/assistant exchange with the model's chat
            # template; the assistant turn is the target JSON string.
            chat_template = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": example["text"]},
                {"role": "assistant", "content": json.dumps(output_dict, ensure_ascii=False)},
            ]
            formatted_text = tokenizer.apply_chat_template(
                chat_template,
                tokenize=False,
                add_generation_prompt=False,
            )
            formatted_data.append({"text": formatted_text})

            raw_eval_data.append({
                "text": example["text"],
                "output": output_dict,
            })

        # The identical random_state keeps the formatted and raw eval splits aligned.
        train_data, eval_data = train_test_split(formatted_data, test_size=0.2, random_state=42)
        _, raw_eval_data = train_test_split(raw_eval_data, test_size=0.2, random_state=42)

        print(f"Dataset: {len(formatted_data)} total, {len(train_data)} train, {len(eval_data)} eval")
        return Dataset.from_list(train_data), Dataset.from_list(eval_data), raw_eval_data

    except Exception as e:
        print(f"Error loading dataset from HF: {e}")
        sys.exit(1)


def validate_json(output):
    """Extract the first JSON object from `output` and attempt to parse it."""
    try:
        json_match = re.search(r'\{[\s\S]*\}', output)
        if not json_match:
            return False, None, "No JSON found"

        json_str = json_match.group(0)
        # Tolerate a trailing comma before the closing brace.
        json_str = re.sub(r',\s*\}', '}', json_str)
        parsed = json.loads(json_str)
        return True, parsed, "Valid JSON"
    except json.JSONDecodeError:
        return False, None, "Invalid JSON"
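
# For example, validate_json('answer: {"num_people": "2",}') strips the trailing
# comma and returns (True, {"num_people": "2"}, "Valid JSON").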


def evaluate_model(model, tokenizer, eval_samples, max_samples=20):
    """Evaluate JSON validity, exact match, and per-field accuracy on eval samples."""
    model.eval()

    json_validity = []
    exact_matches = []
    field_accuracy = defaultdict(list)

    for example in eval_samples[:max_samples]:
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": example["text"]},
        ]
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        inputs = tokenizer(prompt, return_tensors="pt")
        if device == "cuda":
            inputs = inputs.to("cuda")

        eos_token_id = tokenizer.eos_token_id
        with torch.no_grad():
            # Greedy decoding for deterministic, comparable metrics.
            outputs = model.generate(
                **inputs,
                max_new_tokens=64,
                do_sample=False,
                pad_token_id=eos_token_id,
                eos_token_id=eos_token_id,
            )

        # Decode only the newly generated tokens, not the echoed prompt.
        prompt_length = len(inputs.input_ids[0])
        assistant_output = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        is_valid, parsed, _ = validate_json(assistant_output)
        json_validity.append(1 if is_valid else 0)

        if is_valid:
            exact_matches.append(1 if parsed == example["output"] else 0)

            for field in ["num_people", "reservation_date", "phone_num"]:
                if field in parsed and field in example["output"]:
                    field_accuracy[field].append(1 if parsed[field] == example["output"][field] else 0)

    json_accuracy = np.mean(json_validity) * 100 if json_validity else 0
    exact_match_rate = np.mean(exact_matches) * 100 if exact_matches else 0

    field_accuracy_rates = {}
    for field, accuracies in field_accuracy.items():
        field_accuracy_rates[field] = np.mean(accuracies) * 100 if accuracies else 0

    return {
        "json_accuracy": json_accuracy,
        "exact_match": exact_match_rate,
        "field_accuracy": field_accuracy_rates,
    }
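
# Illustrative return value (the numbers here are hypothetical):
#   {"json_accuracy": 95.0, "exact_match": 80.0,
#    "field_accuracy": {"num_people": 90.0, "reservation_date": 85.0, "phone_num": 95.0}}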


def run_demo(model, tokenizer, step="Initial"):
    """Generate and validate outputs for the fixed demo samples."""
    print(f"\n{'='*50}")
    print(f"DEMO: {step}")
    print(f"{'='*50}")

    model.eval()

    for i, text in enumerate(DEMO_SAMPLES):
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ]
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        inputs = tokenizer(prompt, return_tensors="pt")
        if device == "cuda":
            inputs = inputs.to("cuda")

        eos_token_id = tokenizer.eos_token_id
        with torch.no_grad():
            # Low-temperature sampling: near-deterministic, unlike the greedy eval loop.
            outputs = model.generate(
                **inputs,
                max_new_tokens=64,
                do_sample=True,
                temperature=0.1,
                pad_token_id=eos_token_id,
                eos_token_id=eos_token_id,
            )

        prompt_length = len(inputs.input_ids[0])
        assistant_output = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        print(f"\nSample {i+1}: {text}")
        print(f"Output: {assistant_output}")

        is_valid, parsed, message = validate_json(assistant_output)
        status = "✅ VALID" if is_valid else "❌ INVALID"
        print(f"{status}: {message}")
        if is_valid:
            print(json.dumps(parsed, indent=2, ensure_ascii=False))


class EvaluationCallback(TrainerCallback):
    """Runs the custom JSON metrics every `eval_interval` steps and keeps the best model."""

    def __init__(self, tokenizer, eval_data, eval_interval=500):
        self.tokenizer = tokenizer
        self.eval_data = eval_data
        self.eval_interval = eval_interval
        self.best_accuracy = 0

    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % self.eval_interval == 0:
            print(f"\n{'='*60}")
            print(f"EVALUATION AT STEP {state.global_step}")
            print(f"{'='*60}")

            model = kwargs['model']
            metrics = evaluate_model(model, self.tokenizer, self.eval_data)

            print(f"JSON Accuracy: {metrics['json_accuracy']:.2f}%")
            print(f"Exact Match: {metrics['exact_match']:.2f}%")
            print("Field-level Accuracy:")
            for field, accuracy in metrics['field_accuracy'].items():
                print(f"  {field}: {accuracy:.2f}%")

            # Checkpoint whenever JSON validity improves.
            if metrics['json_accuracy'] > self.best_accuracy:
                self.best_accuracy = metrics['json_accuracy']
                print(f"New best accuracy: {self.best_accuracy:.2f}%")
                model.save_pretrained(f"{output_dir}/best_model")
                self.tokenizer.save_pretrained(f"{output_dir}/best_model")

            # evaluate_model() put the model in eval mode; restore training mode.
            model.train()

    def on_train_end(self, args, state, control, **kwargs):
        print(f"\n{'='*60}")
        print("FINAL EVALUATION")
        print(f"{'='*60}")

        model = kwargs['model']
        metrics = evaluate_model(model, self.tokenizer, self.eval_data)

        print(f"Final JSON Accuracy: {metrics['json_accuracy']:.2f}%")
        print(f"Final Exact Match: {metrics['exact_match']:.2f}%")
        print("Final Field-level Accuracy:")
        for field, accuracy in metrics['field_accuracy'].items():
            print(f"  {field}: {accuracy:.2f}%")


def merge_and_upload_model(model, tokenizer, repo_name):
    """Merge the LoRA adapter, convert to FP16, and upload to the Hugging Face Hub."""
    try:
        create_repo(repo_id=repo_name, repo_type="model", exist_ok=True)

        # Fold the LoRA weights into the base model and drop the adapter wrapper.
        model = model.merge_and_unload()
        model = model.half()

        final_model_dir = "./final_model"
        model.save_pretrained(final_model_dir)
        tokenizer.save_pretrained(final_model_dir)

        print(f"Uploading final model to {repo_name}...")
        api = HfApi()
        api.upload_folder(
            folder_path=final_model_dir,
            repo_id=repo_name,
            repo_type="model",
        )
        print(f"Final model successfully uploaded to {repo_name}")

        # Clean up the local export directory.
        shutil.rmtree(final_model_dir, ignore_errors=True)

    except Exception as e:
        print(f"Error merging/uploading model: {e}")
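
# Once uploaded, the merged FP16 checkpoint can be loaded with plain transformers
# (a sketch; assumes the repo exists and you have access):
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#   tokenizer = AutoTokenizer.from_pretrained(hf_model_repo)
#   model = AutoModelForCausalLM.from_pretrained(hf_model_repo)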


if __name__ == "__main__":
    # Load the base model in FP16 (no quantization) with Unsloth.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=torch.float16,
        load_in_4bit=False,
    )

    # Attach LoRA adapters to the attention and MLP projections.
    model = FastLanguageModel.get_peft_model(
        model,
        r=32,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_alpha=32,
        lora_dropout=0.1,
        bias="none",
        use_gradient_checkpointing=True,
    )

    # eval_dataset is not passed to the trainer (eval_strategy="no");
    # metrics come from raw_eval_data via EvaluationCallback instead.
    train_dataset, eval_dataset, raw_eval_data = load_dataset_from_hf(tokenizer)

    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,  # effective batch size: 4 x 2 = 8
        num_train_epochs=5,
        logging_steps=10,
        save_steps=500,
        fp16=True,
        bf16=False,
        eval_strategy="no",
    )

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_dataset,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        callbacks=[EvaluationCallback(tokenizer, raw_eval_data, eval_interval=500)],
    )

    # Baseline demo before any fine-tuning, for comparison with the final run.
    run_demo(model, tokenizer, "Pre-training")

    print("Starting training...")
    trainer.train()

    run_demo(model, tokenizer, "Final")

    # Save the adapter checkpoint, then merge and push the full FP16 model.
    trainer.save_model(output_dir)
    merge_and_upload_model(model, tokenizer, hf_model_repo)

    print("Training completed. Final model saved and uploaded.")