Spaces:
Sleeping
Sleeping
File size: 12,179 Bytes
ca2c89c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 |
{% extends "base.html" %}
{% block title %}Tokenization - NLP Ultimate Tutorial{% endblock %}
{% block content %}
<div class="container">
<!-- Header Section: page title plus a short explanation of what tokenization is -->
<div class="row mb-4">
  <div class="col-12">
    <div class="card">
      <div class="card-header">
        <h1 class="mb-0">
          <!-- Icon is purely decorative, so it is hidden from assistive technology -->
          <i class="fas fa-cut" aria-hidden="true"></i>
          Tokenization
        </h1>
      </div>
      <div class="card-body">
        <p class="lead">Break text into smaller units called tokens using various tokenization methods.</p>
        <div class="alert alert-info">
          <i class="fas fa-info-circle" aria-hidden="true"></i>
          <strong>About:</strong> Tokenization is the process of breaking text into smaller units called tokens, which can be words, characters, or subwords.
        </div>
      </div>
    </div>
  </div>
</div>
{% include "_analysis_nav.html" %}
<!-- Text Input Section: textarea, sample picker and the Analyze / Clear actions.
     The element ids used here (textInput, sampleSelect, processBtn, clearBtn)
     are looked up by the inline script in this template. -->
<div class="row mb-4">
  <div class="col-12">
    <div class="card">
      <div class="card-header">
        <!-- The id lets the textarea below use this visible heading as its accessible name -->
        <h3 class="mb-0" id="textInputHeading">
          <i class="fas fa-keyboard" aria-hidden="true"></i>
          Enter your text:
        </h3>
      </div>
      <div class="card-body">
        <div class="row mb-3">
          <div class="col-md-8">
            <!-- A placeholder is not a label; aria-labelledby ties the control to the heading above -->
            <textarea id="textInput" class="form-control" rows="6" placeholder="Enter or paste your text here..." aria-labelledby="textInputHeading">The quick brown fox jumps over the lazy dog. It was a beautiful day in May of 2023!</textarea>
          </div>
          <div class="col-md-4">
            <label for="sampleSelect" class="form-label">Or choose a sample:</label>
            <select id="sampleSelect" class="form-select">
              <option value="Custom">Custom</option>
              <option value="News Article">News Article</option>
              <option value="Product Review">Product Review</option>
              <option value="Scientific Text">Scientific Text</option>
              <option value="Literary Text">Literary Text</option>
            </select>
          </div>
        </div>
        <div class="d-flex justify-content-between align-items-center">
          <div>
            <!-- Explicit type="button": the default type is "submit", which would post any enclosing form -->
            <button type="button" id="processBtn" class="btn btn-primary btn-lg">
              <i class="fas fa-cut" aria-hidden="true"></i>
              Analyze Tokens
            </button>
          </div>
          <div>
            <button type="button" id="clearBtn" class="btn btn-outline-secondary">
              <i class="fas fa-trash" aria-hidden="true"></i>
              Clear
            </button>
          </div>
        </div>
      </div>
    </div>
  </div>
</div>
<!-- Tokenization Methods Info: four static cards, one per tokenization method -->
<div class="row mb-4">
  <div class="col-12">
    <div class="card">
      <div class="card-header">
        <h3 class="mb-0">
          <i class="fas fa-info-circle" aria-hidden="true"></i>
          Tokenization Methods
        </h3>
      </div>
      <div class="card-body">
        <div class="row">
          <!-- Card titles are h4 so the outline does not jump from h3 to h5;
               Bootstrap's .h5 class keeps the original visual size. -->
          <div class="col-md-3">
            <div class="card h-100">
              <div class="card-body text-center">
                <i class="fas fa-font fa-2x text-primary mb-2" aria-hidden="true"></i>
                <h4 class="h5">Word Tokenization</h4>
                <p class="small">Splits text into individual words and punctuation marks using NLTK.</p>
              </div>
            </div>
          </div>
          <div class="col-md-3">
            <div class="card h-100">
              <div class="card-body text-center">
                <i class="fas fa-paragraph fa-2x text-success mb-2" aria-hidden="true"></i>
                <h4 class="h5">Sentence Tokenization</h4>
                <p class="small">Divides text into sentences using punctuation and linguistic rules.</p>
              </div>
            </div>
          </div>
          <div class="col-md-3">
            <div class="card h-100">
              <div class="card-body text-center">
                <i class="fas fa-brain fa-2x text-info mb-2" aria-hidden="true"></i>
                <h4 class="h5">Linguistic Tokenization</h4>
                <p class="small">Advanced tokenization with spaCy including POS tags and dependencies.</p>
              </div>
            </div>
          </div>
          <div class="col-md-3">
            <div class="card h-100">
              <div class="card-body text-center">
                <i class="fas fa-puzzle-piece fa-2x text-warning mb-2" aria-hidden="true"></i>
                <h4 class="h5">Subword Tokenization</h4>
                <p class="small">Breaks words into smaller units using BERT WordPiece and GPT-2 BPE.</p>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
  </div>
</div>
<!-- Results Section: #resultsContainer is rewritten by the inline script
     (loading spinner, error alert, or the tokenization output). -->
<div class="row">
  <div class="col-12">
    <div class="card">
      <div class="card-header">
        <h3 class="mb-0">
          <i class="fas fa-chart-bar" aria-hidden="true"></i>
          Tokenization Results
        </h3>
      </div>
      <div class="card-body">
        <!-- Polite live region: content injected after a request finishes is
             announced to screen-reader users without stealing focus. -->
        <div id="resultsContainer" aria-live="polite">
          <div class="text-center text-muted py-5">
            <i class="fas fa-arrow-up fa-2x mb-3" aria-hidden="true"></i>
            <p>Click "Analyze Tokens" to see tokenization results</p>
          </div>
        </div>
      </div>
    </div>
  </div>
</div>
</div>
{% endblock %}
{% block extra_scripts %}
<script>
/**
 * Page initialisation, run once the DOM has been parsed.
 *
 * - Restores text stored in sessionStorage when the 'carryTextOnNextPage'
 *   flag is '1', then clears that flag.
 * - Wires the sample dropdown, the Analyze and Clear buttons, and the
 *   Ctrl+Enter / Ctrl+L keyboard shortcuts.
 */
document.addEventListener('DOMContentLoaded', function() {
    const PROCESS_BTN_IDLE_HTML = '<i class="fas fa-cut"></i> Analyze Tokens';
    const PROCESS_BTN_BUSY_HTML = '<i class="fas fa-spinner fa-spin"></i> Processing...';
    // Same bytes as the placeholder markup rendered by the template.
    const EMPTY_RESULTS_HTML = [
        '',
        '<div class="text-center text-muted py-5">',
        '<i class="fas fa-arrow-up fa-2x mb-3"></i>',
        '<p>Click "Analyze Tokens" to see tokenization results</p>',
        '</div>',
        ''
    ].join('\n');

    const textInput = document.getElementById('textInput');
    const sampleSelect = document.getElementById('sampleSelect');
    const processBtn = document.getElementById('processBtn');
    const clearBtn = document.getElementById('clearBtn');
    const resultsContainer = document.getElementById('resultsContainer');

    // Only carry over when using Quick Nav; otherwise leave defaults
    const shouldCarry = sessionStorage.getItem('carryTextOnNextPage') === '1';
    if (shouldCarry) {
        if (sampleSelect) sampleSelect.value = 'Custom';
        const storedText = sessionStorage.getItem('analysisText');
        if (storedText && textInput) textInput.value = storedText;
        sessionStorage.removeItem('carryTextOnNextPage');
    }

    // Sample text dropdown handler
    if (sampleSelect && textInput) {
        sampleSelect.addEventListener('change', function() {
            const sampleType = sampleSelect.value;
            if (sampleType === 'Custom') {
                textInput.value = '';
                return;
            }
            // Get sample text from server
            fetch('/api/sample-text', {
                method: 'POST',
                headers: {
                    'Content-Type': 'application/json',
                },
                body: JSON.stringify({sample_type: sampleType})
            })
            .then(response => {
                if (!response.ok) {
                    throw new Error('HTTP ' + response.status);
                }
                return response.json();
            })
            .then(data => {
                // Drop a late response if the user has picked something else meanwhile.
                if (sampleSelect.value !== sampleType) return;
                if (!data || typeof data.text !== 'string') {
                    throw new Error('response did not contain sample text');
                }
                textInput.value = data.text;
            })
            .catch(error => {
                showError('Failed to load sample text: ' + error.message);
            });
        });
    }

    // Process button handler
    if (processBtn) {
        processBtn.addEventListener('click', function() {
            const text = textInput ? textInput.value.trim() : '';
            if (!text) {
                alert('Please enter some text to tokenize.');
                return;
            }
            // Show loading state
            processBtn.innerHTML = PROCESS_BTN_BUSY_HTML;
            processBtn.disabled = true;
            const restoreButton = () => {
                processBtn.innerHTML = PROCESS_BTN_IDLE_HTML;
                processBtn.disabled = false;
            };
            let pending;
            try {
                pending = processTokenization();
            } catch (error) {
                // Never leave the button stuck in the disabled state.
                restoreButton();
                throw error;
            }
            if (pending && typeof pending.then === 'function') {
                // Re-enable exactly when the request settles.
                pending.then(restoreButton, restoreButton);
            } else {
                // No completion signal available: fall back to a fixed delay.
                setTimeout(restoreButton, 2000);
            }
        });
    }

    // Clear button handler
    if (clearBtn) {
        clearBtn.addEventListener('click', function() {
            if (textInput) textInput.value = '';
            if (resultsContainer) resultsContainer.innerHTML = EMPTY_RESULTS_HTML;
        });
    }

    // Keyboard shortcuts
    document.addEventListener('keydown', function(e) {
        // Ctrl+Enter to process (click() is a no-op while the button is disabled)
        if (e.ctrlKey && e.key === 'Enter') {
            if (processBtn) processBtn.click();
        }
        // Ctrl+L to clear
        // NOTE(review): Ctrl+L is also a common browser shortcut for focusing the
        // address bar; preventDefault() suppresses it — confirm this is intended.
        if (e.ctrlKey && e.key === 'l') {
            e.preventDefault();
            if (clearBtn) clearBtn.click();
        }
    });
});
/**
 * Send the current textarea content to /api/tokenization and render the outcome.
 *
 * Shows the loading placeholder in #resultsContainer while the request runs.
 * On a payload with a truthy `success` it passes `result` to displayResults();
 * on any other payload, a network failure, or an unparsable body it calls
 * showError().
 *
 * @returns {Promise<void>} Settles once the request has finished and the
 *     results container has been updated, so callers can restore their own
 *     UI state at the right moment. Resolves immediately when the input is empty.
 */
function processTokenization() {
    const text = document.getElementById('textInput').value.trim();
    if (!text) {
        alert('Please enter some text to tokenize.');
        return Promise.resolve();
    }
    showLoading('resultsContainer');
    return fetch('/api/tokenization', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
        },
        body: JSON.stringify({text: text})
    })
    .then(response =>
        // Parse the body even for non-2xx statuses so a JSON error payload can
        // still be shown; a non-JSON body becomes a readable error instead of
        // a raw parser message.
        response.json().catch(() => {
            throw new Error('unexpected server response (HTTP ' + response.status + ')');
        })
    )
    .then(data => {
        if (data && data.success) {
            displayResults(data.result);
        } else {
            showError((data && data.error) || 'An error occurred while processing the text');
        }
    })
    .catch(error => {
        showError('Failed to process text: ' + error.message);
    })
    .finally(() => {
        // Only clears the container if the spinner is still what it shows.
        hideLoading('resultsContainer');
    });
}
/**
 * Replace the content of an element with the "Analyzing tokens..." spinner.
 * Does nothing when no element with the given id exists.
 *
 * @param {string} elementId - id of the element to fill with the spinner markup.
 */
function showLoading(elementId) {
    const target = document.getElementById(elementId);
    if (!target) {
        return;
    }
    // Joined with '\n' (including the leading and trailing empty entries) so the
    // resulting markup is one line per element, wrapped in newlines.
    const spinnerMarkup = [
        '',
        '<div class="text-center py-4">',
        '<div class="spinner-border text-primary" role="status">',
        '<span class="visually-hidden">Loading...</span>',
        '</div>',
        '<p class="mt-2">Analyzing tokens...</p>',
        '</div>',
        ''
    ].join('\n');
    target.innerHTML = spinnerMarkup;
}
/**
 * Empty an element, but only if it is still showing the loading spinner.
 * Content that does not contain 'spinner-border' is left untouched, and a
 * missing element is ignored.
 *
 * @param {string} elementId - id of the element to clear.
 */
function hideLoading(elementId) {
    const target = document.getElementById(elementId);
    if (!target) {
        return;
    }
    const showsSpinner = target.innerHTML.includes('spinner-border');
    if (showsSpinner) {
        target.innerHTML = '';
    }
}
/**
 * Render an error alert inside the given element.
 *
 * The message is HTML-escaped before insertion: it may carry server-provided
 * or exception text, which must be displayed as text and never parsed as markup.
 * Does nothing when no element with the given id exists.
 *
 * @param {*} message - Text to display; converted with String().
 * @param {string} [elementId='resultsContainer'] - id of the element to fill.
 */
function showError(message, elementId = 'resultsContainer') {
    const element = document.getElementById(elementId);
    if (!element) {
        return;
    }
    const HTML_ESCAPES = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;'
    };
    const safeMessage = String(message).replace(/[&<>"']/g, ch => HTML_ESCAPES[ch]);
    element.innerHTML = [
        '',
        '<div class="alert alert-danger fade-in">',
        '<i class="fas fa-exclamation-triangle"></i>',
        '<strong>Error:</strong> ' + safeMessage,
        '</div>',
        ''
    ].join('\n');
}
/**
 * Insert tokenization output into #resultsContainer and scroll it into view.
 *
 * A null or undefined result is rendered as an empty container rather than
 * the literal text "undefined".
 *
 * NOTE(review): `result` is inserted as raw HTML. This assumes the server has
 * already escaped any user-supplied text inside it — confirm against the
 * /api/tokenization implementation.
 *
 * @param {*} result - Markup to display; non-string values are converted with String().
 */
function displayResults(result) {
    const container = document.getElementById('resultsContainer');
    if (!container) {
        return;
    }
    container.innerHTML = result == null ? '' : String(result);
    container.classList.add('fade-in');
    // Scroll to results
    container.scrollIntoView({ behavior: 'smooth', block: 'start' });
}
</script>
{% endblock %}
|