#!/usr/bin/env python3
"""
TransLingo Training Script for Google Colab
This script is designed to be copied and run in Google Colab cells
"""
# ============================================
# CELL 1: Check GPU and Setup
# ============================================
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
# ============================================
# CELL 2: Mount Google Drive (Optional)
# ============================================
# from google.colab import drive
# drive.mount('/content/drive')
# DRIVE_PATH = '/content/drive/MyDrive/translingo_checkpoints'
# import os
# os.makedirs(DRIVE_PATH, exist_ok=True)
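# To resume a previous run, saved checkpoints can be copied back from Drive
# before training (a sketch; paths assume the checkpoints/ layout used in
# Cell 4 below):
# import shutil
# if os.path.isdir(DRIVE_PATH):
#     shutil.copytree(DRIVE_PATH, 'checkpoints', dirs_exist_ok=True)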
# ============================================
# CELL 3: Clone Repository
# ============================================
# !git clone https://github.com/YOUR_USERNAME/translingo.git
# %cd translingo
# !pip install -r requirements.txt
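# If requirements.txt fails to resolve, the third-party packages this script
# imports can be installed directly (torch ships with Colab; no version pins
# are given in the source):
# !pip install pyyaml tqdm sentencepiece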
# ============================================
# CELL 4: Main Training Script
# ============================================
import os
import sys
import yaml
import logging
from tqdm import tqdm
# Add project root to path (__file__ is undefined inside a Colab cell,
# so fall back to the current working directory, e.g. after %cd translingo)
try:
    sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
except NameError:
    sys.path.append(os.getcwd())
from data.download import DataDownloader
from data.preprocessing import create_dataloaders
from training.train import Trainer
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def main():
    """Main training function for Colab"""
    # 1. Download and prepare data
    logger.info("Downloading Multi30k dataset...")
    downloader = DataDownloader()
    train_data, valid_data, test_data = downloader.download_multi30k()
    if not train_data:
        logger.error("Failed to download data!")
        return
    logger.info(f"Train samples: {len(train_data)}")
    logger.info(f"Valid samples: {len(valid_data)}")
    logger.info(f"Test samples: {len(test_data)}")
    # 2. Train tokenizer
    logger.info("Training tokenizer...")
    downloader.prepare_tokenizer(train_data)

    # 3. Create trainer
    logger.info("Initializing trainer...")
    trainer = Trainer('configs/config.yaml')

    # Optional: modify config for faster testing
    # trainer.config['training']['num_epochs'] = 5
    # trainer.config['model']['n_layers'] = 2  # Fewer layers for testing

    # 4. Create dataloaders
    tokenizer_path = os.path.join('data', 'processed', 'tokenizer.model')
    train_loader, valid_loader, test_loader = create_dataloaders(
        train_data,
        valid_data,
        test_data,
        tokenizer_path,
        batch_size=trainer.config['training']['batch_size'],
        num_workers=2 if torch.cuda.is_available() else 0
    )
    logger.info(f"Train batches: {len(train_loader)}")
    logger.info(f"Valid batches: {len(valid_loader)}")

    # 5. Train model
    logger.info("Starting training...")
    logger.info(f"Device: {trainer.device}")
    logger.info(f"Epochs: {trainer.config['training']['num_epochs']}")
    logger.info(f"Batch size: {trainer.config['training']['batch_size']}")
    trainer.train(train_loader, valid_loader)

    # 6. Evaluate on test set
    logger.info("Evaluating on test set...")
    test_metrics = trainer.validate(test_loader)
    logger.info(f"Test Loss: {test_metrics['loss']:.4f}")
    logger.info(f"Test BLEU: {test_metrics['bleu']:.2f}")
    # 7. Save to Google Drive if mounted
    if 'DRIVE_PATH' in globals():
        os.makedirs(f"{DRIVE_PATH}/logs", exist_ok=True)
        os.system(f"cp -r checkpoints/* {DRIVE_PATH}/")
        os.system(f"cp -r logs/* {DRIVE_PATH}/logs/")
        logger.info(f"Results saved to Google Drive: {DRIVE_PATH}")
if __name__ == "__main__":
    main()
# ============================================
# CELL 5: Quick Translation Test
# ============================================
"""
import sentencepiece as spm
# Load tokenizer
sp = spm.SentencePieceProcessor()
sp.load('data/processed/tokenizer.model')
# Test sentences
test_sentences = [
    "Guten Morgen!",
    "Wie geht es dir?",
    "Das Wetter ist heute schön."
]
# Load best model (create_model must be imported from the project's model
# module first; adjust this import to match the repo layout)
# from model import create_model
checkpoint = torch.load('checkpoints/best.pt', map_location='cuda')
model = create_model(checkpoint['config'])
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()
model.to('cuda')
# Translate
with torch.no_grad():
    for sentence in test_sentences:
        tokens = sp.encode(sentence)
        src = torch.tensor([sp.bos_id()] + tokens + [sp.eos_id()]).unsqueeze(0).cuda()
        translation = model.generate(src, max_length=50)
        translated_tokens = translation[0].cpu().numpy().tolist()
        translated_text = sp.decode(translated_tokens)
        print(f"Source: {sentence}")
        print(f"Translation: {translated_text}")
        print()
"""
# ============================================
# CELL 6: Download Results
# ============================================
"""
# Zip and download checkpoints
!zip -r translingo_checkpoints.zip checkpoints/
!zip -r translingo_logs.zip logs/
from google.colab import files
files.download('translingo_checkpoints.zip')
files.download('translingo_logs.zip')
"""