# CodeFeel / train_model.py
# jasvir-singh1021's picture
# Create train_model.py
# e915946 verified
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
import numpy as np
# Load the "imranraad/github-emotion-love" dataset. Later code in this script
# tokenizes its "modified_comment" column and uses its "train" and "test" splits.
dataset = load_dataset("imranraad/github-emotion-love")
# Label names for the multi-label setup. format_labels reads one batch column
# per name, and the list length is passed to the model as num_labels.
emotions = ["Anger", "Love", "Fear", "Joy", "Sadness", "Surprise"]
# Checkpoint name shared by the tokenizer created here and by the
# classification model loaded further down.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
def tokenize(batch):
    """Tokenize the ``modified_comment`` texts of one batch.

    Args:
        batch: Mapping with a ``modified_comment`` entry holding the texts
            to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns when asked to pad to
        ``max_length``, truncate, and use ``max_length=128``.
    """
    comments = batch['modified_comment']
    # Fixed-length encoding: every example is padded/truncated to 128.
    encode_options = {
        "padding": 'max_length',
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(comments, **encode_options)
# Run tokenize over the dataset in batches; the mapped result replaces `dataset`.
dataset = dataset.map(tokenize, batched=True)
# Convert the per-emotion columns into one list of floats per example
def format_labels(batch, label_columns=None):
    """Attach a multi-label ``labels`` entry to a batch.

    For every example in the batch, the values of the label columns are
    collected (in ``label_columns`` order) into one list and cast to
    ``float``. The cast matters: the multi-label loss compares the model's
    float logits against these targets, and integer targets (e.g. 0/1
    columns) lead to a Float/Long dtype error in PyTorch's
    BCE-with-logits loss.

    Args:
        batch: Mapping from column name to a list of per-example values.
            All label columns are expected to have the same length.
        label_columns: Names of the columns to combine. Defaults to the
            module-level ``emotions`` list when omitted.

    Returns:
        The same ``batch`` object, with ``batch["labels"]`` set to a list of
        per-example float lists.
    """
    if label_columns is None:
        label_columns = emotions
    columns = [batch[name] for name in label_columns]
    # zip(*columns) walks the columns in lockstep, yielding one row per example.
    batch["labels"] = [[float(value) for value in row] for row in zip(*columns)]
    return batch
# Run format_labels over the dataset in batches so each example gains a
# "labels" entry; the mapped result replaces `dataset`.
dataset = dataset.map(format_labels, batched=True)
# Load the classification model from the same checkpoint as the tokenizer,
# with one output per entry in `emotions` and the multi-label problem type.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    problem_type="multi_label_classification",
    num_labels=len(emotions),
)
# Training configuration handed to the Trainer below.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version before upgrading.
training_args = TrainingArguments(
    # Output locations.
    output_dir="./model",
    logging_dir="./logs",
    # Evaluation and saving are both configured per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)
# Evaluation metric
def compute_metrics(pred):
    """Compute element-wise accuracy for multi-label predictions.

    Args:
        pred: An object that unpacks into ``(logits, labels)``; both must be
            convertible with ``torch.tensor`` and comparable element-wise.

    Returns:
        ``{"accuracy": value}``, where ``value`` is the fraction of individual
        label decisions (sigmoid probability above 0.5) equal to the target.
    """
    raw_scores, targets = pred
    probabilities = torch.sigmoid(torch.tensor(raw_scores))
    # A label counts as predicted when its probability exceeds 0.5.
    decisions = (probabilities > 0.5).float()
    matches = decisions == torch.tensor(targets)
    return {"accuracy": matches.float().mean().item()}
# Wire the model, training configuration, data splits, tokenizer and metric
# function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

# Train, then write the resulting model to ./model.
trainer.train()
trainer.save_model("./model")