import matplotlib
matplotlib.use('Agg') # Use non-interactive backend
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import re
import string
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from wordcloud import WordCloud
from utils.model_loader import download_nltk_resources
from utils.helpers import fig_to_html
from nltk.util import ngrams
def preprocessing_handler(text_input):
"""Generate HTML for text preprocessing display"""
output_html = []
# Add result area container
output_html.append('<div class="result-area">')
output_html.append('<h2>Text Preprocessing</h2>')
output_html.append("""
Text preprocessing is the process of cleaning and transforming raw text into a format that can be easily analyzed by NLP models.
""")
# Model info
output_html.append("""
Tools & Libraries Used:
- NLTK - For stopwords, tokenization, stemming and lemmatization
- Regular Expressions - For pattern matching and text cleaning
- WordCloud - For visualizing word frequency
""")
# Ensure NLTK resources are downloaded
download_nltk_resources()
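# Assumption: download_nltk_resources() is this project's helper for fetching
# the NLTK data the steps below rely on (punkt, stopwords, wordnet). If any of
# those are missing, sent_tokenize / word_tokenize / stopwords / lemmatize
# will raise LookupError.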
try:
# Original Text
output_html.append('<h3>Original Text</h3>')
output_html.append(f'<div class="text-display">{text_input}</div>')
# Text statistics
word_count = len(text_input.split())
char_count = len(text_input)
sentence_count = len(nltk.sent_tokenize(text_input))
stats_html = f"""
<div class="stats">
<p><b>{word_count}</b> Words &nbsp; <b>{char_count}</b> Characters &nbsp; <b>{sentence_count}</b> Sentences</p>
</div>
"""
output_html.append(stats_html)
# NEW SECTION: Text Cleaning with Regular Expressions
output_html.append('<h2>Text Cleaning with Regular Expressions</h2>')
output_html.append("""
Regular expressions (regex) provide powerful pattern matching capabilities for cleaning and processing text data.
Common text cleaning tasks include removing URLs, HTML tags, special characters, and normalizing text formats.
""")
# Several regex cleaning examples
url_pattern = r'https?://\S+|www\.\S+'
html_pattern = r'<.*?>'
whitespace_pattern = r'\s+'
email_pattern = r'\S+@\S+'
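# Quick sanity check of what these patterns match (illustrative inputs only):
#   re.sub(url_pattern, '[URL]', 'see https://example.com now')  -> 'see [URL] now'
#   re.sub(html_pattern, '', '<b>bold</b> text')                 -> 'bold text'
#   re.sub(whitespace_pattern, ' ', 'a \t b\n\nc')               -> 'a b c'
#   re.sub(email_pattern, '[EMAIL]', 'mail me@example.com')      -> 'mail [EMAIL]'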
# Original text for comparison
text_cleaned = text_input
# 1. Remove URLs
urls_cleaned = re.sub(url_pattern, '[URL]', text_cleaned)
# 2. Remove HTML tags
html_cleaned = re.sub(html_pattern, '', urls_cleaned)
# 3. Remove extra whitespace
whitespace_cleaned = re.sub(whitespace_pattern, ' ', html_cleaned).strip()
# 4. Remove email addresses
email_cleaned = re.sub(email_pattern, '[EMAIL]', whitespace_cleaned)
# 5. Fix common contractions
contractions = {
r"won't": "will not",
r"can't": "cannot",
r"n't": " not",
r"'re": " are",
r"'s": " is",
r"'d": " would",
r"'ll": " will",
r"'t": " not",
r"'ve": " have",
r"'m": " am"
}
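# Note: insertion order matters in this dict -- the specific forms ("won't",
# "can't") must be substituted before the generic suffix rules ("n't", "'t"),
# so "won't" becomes "will not" rather than "wo not". The "'s" rule is a rough
# heuristic: it also expands possessives ("John's" -> "John is").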
contraction_cleaned = email_cleaned
for pattern, replacement in contractions.items():
    contraction_cleaned = re.sub(pattern, replacement, contraction_cleaned)
# Display the regex cleaning examples in a table
output_html.append("""
Regex Text Cleaning Operations
| Operation |
Regex Pattern |
Description |
| URL Removal |
https?://\\S+|www\\.\\S+ |
Removes or replaces web URLs in text |
| HTML Tag Removal |
<.*?> |
Strips HTML/XML markup tags |
| Whitespace Normalization |
\\s+ |
Replaces multiple spaces, tabs, and newlines with a single space |
| Email Anonymization |
\\S+@\\S+ |
Redacts email addresses for privacy |
| Contraction Expansion |
Multiple patterns |
Expands contractions like "don't" to "do not" |
""")
# Example of cleaned text
output_html.append("""
Example of Text After Regex Cleaning
""")
output_html.append(f"{text_input}")
output_html.append("""
""")
output_html.append(f"{contraction_cleaned}")
output_html.append("""
""")
output_html.append("""
Why Use Regex for Text Cleaning?
- Precision: Regular expressions allow for precise pattern matching
- Flexibility: Can be customized for domain-specific cleaning needs
- Efficiency: Processes text in a single pass for better performance
- Standardization: Creates consistent formatting across documents
""")
# Word length distribution
word_lengths = [len(word) for word in text_input.split()]
fig = plt.figure(figsize=(10, 4))
plt.hist(word_lengths, bins=range(1, max(word_lengths, default=1) + 2), alpha=0.7, color='#1976D2')
plt.xlabel('Word Length')
plt.ylabel('Frequency')
plt.title('Word Length Distribution')
plt.grid(alpha=0.3)
plt.tight_layout()
output_html.append('<h2>Word Length Distribution</h2>')
output_html.append(fig_to_html(fig))
# Case Normalization
output_html.append('<h2>Case Normalization</h2>')
lowercase_text = text_input.lower()
uppercase_text = text_input.upper()
case_html = f"""
"""
output_html.append(case_html)
# Remove Punctuation & Special Characters
output_html.append('<h2>Removing Punctuation and Special Characters</h2>')
# Count original punctuation
punc_count = sum(1 for char in text_input if char in string.punctuation)
# Remove punctuation
no_punct_text = re.sub(r'[^\w\s]', '', text_input)
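# Note: [^\w\s] removes anything that is not a word character or whitespace,
# so digits and underscores survive, e.g.
#   re.sub(r'[^\w\s]', '', "Hello, world! It's 9_am.") -> 'Hello world Its 9_am'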
punct_html = f"""
{text_input}
Contains {punc_count} punctuation marks
{no_punct_text}
Removed {punc_count} punctuation marks
"""
output_html.append(punct_html)
# Show removed punctuation
punct_chars = [char for char in text_input if char in string.punctuation]
punct_freq = Counter(punct_chars)
if punct_freq:
    output_html.append('<h3>Punctuation Distribution</h3>')
    fig = plt.figure(figsize=(10, 4))
    plt.bar(list(punct_freq.keys()), list(punct_freq.values()), color='#1976D2')
    plt.xlabel('Punctuation')
    plt.ylabel('Frequency')
    plt.title('Punctuation Distribution')
    plt.tight_layout()
    output_html.append(fig_to_html(fig))
# Tokenization
output_html.append('<h2>Tokenization</h2>')
# Word tokenization
words = nltk.word_tokenize(text_input)
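# word_tokenize first runs Punkt sentence splitting, then a Treebank-style
# word tokenizer, which splits contractions and keeps punctuation as separate
# tokens, e.g.
#   nltk.word_tokenize("Don't stop.") -> ['Do', "n't", 'stop', '.']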
# Create a multi-column layout for word tokens
output_html.append('<h3>Word Tokens</h3>')
output_html.append(f'<p>Total tokens: {len(words)} (showing first 50)</p>')
# Create a multi-column table layout
tokens_html = """
| # |
Token |
Length |
# |
Token |
Length |
# |
Token |
Length |
"""
# Create rows with 3 tokens per row
for i in range(0, min(50, len(words)), 3):
tokens_html += ""
for j in range(3):
if i + j < min(50, len(words)):
token = words[i + j]
tokens_html += f'| {i + j + 1} | {token} | {len(token)} | '
else:
tokens_html += ' | | | '
tokens_html += "
"
tokens_html += """
"""
output_html.append(tokens_html)
# Sentence tokenization
sentences = nltk.sent_tokenize(text_input)
output_html.append('<h3>Sentence Tokens</h3>')
output_html.append(f'<p>Total sentences: {len(sentences)}</p>')
for i, sentence in enumerate(sentences[:5]):
    output_html.append(f'<p>{i + 1}. {sentence}</p>')
if len(sentences) > 5:
    output_html.append(f'<p>... and {len(sentences) - 5} more sentences.</p>')
# Stopwords Removal
output_html.append('<h2>Stopword Removal</h2>')
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word.lower() not in stop_words]
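# The membership check lowercases each token, so 'The' and 'the' are both
# dropped; punctuation tokens produced by word_tokenize are not in the
# stopword list and therefore remain in filtered_words.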
# Count stopwords
stopword_count = len(words) - len(filtered_words)
stopword_percentage = (stopword_count / len(words)) * 100 if words else 0
output_html.append(f"""
Original Words
{len(words)}
After Stopword Removal
{len(filtered_words)}
Stopwords Removed
{stopword_count} ({stopword_percentage:.1f}%)
""")
# Display common stopwords in the text
text_stopwords = [word for word in words if word.lower() in stop_words]
stop_freq = Counter(text_stopwords).most_common(10)
if stop_freq:
    output_html.append('<h3>Most Common Stopwords in Text</h3>')
    # Stopword frequency table, two stopwords per row
    stopwords_html = """
<table>
<tr>
<th>#</th><th>Stopword</th><th>Frequency</th>
<th>#</th><th>Stopword</th><th>Frequency</th>
</tr>
"""
    for i in range(0, len(stop_freq), 2):
        stopwords_html += "<tr>"
        for j in range(2):
            if i + j < len(stop_freq):
                stopword, freq = stop_freq[i + j]
                stopwords_html += f'<td>{i + j + 1}</td><td>{stopword}</td><td>{freq}</td>'
            else:
                stopwords_html += '<td></td><td></td><td></td>'
        stopwords_html += "</tr>"
    stopwords_html += """
</table>
"""
    output_html.append(stopwords_html)
# Visualization of before and after
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
# Before
ax[0].hist([len(word) for word in words], bins=range(1, 15), alpha=0.7, color='#1976D2')
ax[0].set_title('Word Length Before Stopword Removal')
ax[0].set_xlabel('Word Length')
ax[0].set_ylabel('Frequency')
# After
ax[1].hist([len(word) for word in filtered_words], bins=range(1, 15), alpha=0.7, color='#4CAF50')
ax[1].set_title('Word Length After Stopword Removal')
ax[1].set_xlabel('Word Length')
ax[1].set_ylabel('Frequency')
plt.tight_layout()
output_html.append(fig_to_html(fig))
# Stemming and Lemmatization
output_html.append('<h2>Stemming and Lemmatization</h2>')
# Porter stemmer (rule-based suffix stripping)
stemmer = PorterStemmer()
# WordNet lemmatizer (vocabulary-based reduction to lemma form)
lemmatizer = WordNetLemmatizer()
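# Rough contrast between the two (typical outputs):
#   stemmer.stem('studies')         -> 'studi'  (suffix stripping, not a real word)
#   lemmatizer.lemmatize('studies') -> 'study'  (dictionary lookup, real word)
# Note lemmatize() treats words as nouns by default, so verbs like 'running'
# pass through unchanged unless pos='v' is supplied.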
# Create comparison DataFrame
comparison_data = []
for i in range(min(20, len(filtered_words))):  # Show first 20 examples
    if filtered_words[i].isalpha():  # Only include alphabetic words
        comparison_data.append({
            'Original': filtered_words[i],
            'Stemmed': stemmer.stem(filtered_words[i]),
            'Lemmatized': lemmatizer.lemmatize(filtered_words[i])
        })
comparison_df = pd.DataFrame(comparison_data)
output_html.append('<h3>Stemming vs. Lemmatization Comparison</h3>')
# Create a custom table for stemming vs lemmatization comparison
comparison_html = """
<table>
<tr><th>Original</th><th>Stemmed</th><th>Lemmatized</th></tr>
"""
# Add comparison data rows
for _, row in comparison_df.iterrows():
    comparison_html += (
        f"<tr><td>{row['Original']}</td>"
        f"<td>{row['Stemmed']}</td>"
        f"<td>{row['Lemmatized']}</td></tr>"
    )
comparison_html += """
</table>
"""
output_html.append(comparison_html)
output_html.append("""
Stemming vs. Lemmatization
- Stemming - Cuts off word endings based on common patterns, faster but less accurate
- Lemmatization - Uses vocabulary and morphological analysis, slower but produces actual words
""")
# NEW SECTION: N-gram Analysis
output_html.append('<h2>N-gram Analysis</h2>')
output_html.append("""
N-grams are contiguous sequences of n items from text. In NLP, they are used to capture word patterns and relationships,
and are helpful for language modeling, prediction, and feature extraction.
""")
# Process text for n-grams (use filtered_words to avoid stopwords)
# Convert to lowercase for consistency
clean_words = [word.lower() for word in filtered_words if word.isalnum()]
# Generate n-grams
bigrams_list = list(ngrams(clean_words, 2))
trigrams_list = list(ngrams(clean_words, 3))
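# nltk.util.ngrams yields sliding tuples over the sequence, e.g.
#   list(ngrams(['data', 'science', 'rocks'], 2))
#   -> [('data', 'science'), ('science', 'rocks')]
# With fewer tokens than n it yields nothing, which the checks below handle.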
# Count frequencies
bigram_freq = Counter(bigrams_list)
trigram_freq = Counter(trigrams_list)
# Get most common
common_bigrams = bigram_freq.most_common(15)
common_trigrams = trigram_freq.most_common(15)
# Format for display
bigram_labels = [' '.join(bg) for bg, _ in common_bigrams]
bigram_values = [count for _, count in common_bigrams]
trigram_labels = [' '.join(tg) for tg, _ in common_trigrams]
trigram_values = [count for _, count in common_trigrams]
# Explanation of n-grams
output_html.append("""
What are N-grams?
- Unigrams - Single words (e.g., "climate")
- Bigrams - Two consecutive words (e.g., "climate change")
- Trigrams - Three consecutive words (e.g., "global climate change")
N-grams capture contextual relationships between words and are valuable for many NLP tasks including language modeling,
machine translation, speech recognition, and text classification.
""")
# Create visualizations for bigrams and trigrams
if common_bigrams:
    # Bigram visualization
    output_html.append('<h3>Most Common Bigrams</h3>')
    fig = plt.figure(figsize=(10, 6))
    plt.barh(range(len(bigram_labels)), bigram_values, align='center', color='#1976D2')
    plt.yticks(range(len(bigram_labels)), bigram_labels)
    plt.xlabel('Frequency')
    plt.title('Most Common Bigrams')
    plt.tight_layout()
    output_html.append(fig_to_html(fig))
    # Bigram frequency table, two bigrams per row
    bigram_html = """
<table>
<tr>
<th>#</th><th>Bigram</th><th>Freq</th>
<th>#</th><th>Bigram</th><th>Freq</th>
</tr>
"""
    for i in range(0, len(common_bigrams), 2):
        bigram_html += "<tr>"
        for j in range(2):
            if i + j < len(common_bigrams):
                bigram, freq = common_bigrams[i + j]
                bigram_text = ' '.join(bigram)
                bigram_html += f'<td>{i + j + 1}</td><td>{bigram_text}</td><td>{freq}</td>'
            else:
                bigram_html += '<td></td><td></td><td></td>'
        bigram_html += "</tr>"
    bigram_html += """
</table>
"""
    output_html.append(bigram_html)
else:
    output_html.append('<p>Not enough text to generate meaningful bigrams.</p>')
if common_trigrams:
    # Trigram visualization
    output_html.append('<h3>Most Common Trigrams</h3>')
    fig = plt.figure(figsize=(10, 6))
    plt.barh(range(len(trigram_labels)), trigram_values, align='center', color='#4CAF50')
    plt.yticks(range(len(trigram_labels)), trigram_labels)
    plt.xlabel('Frequency')
    plt.title('Most Common Trigrams')
    plt.tight_layout()
    output_html.append(fig_to_html(fig))
    # Trigram frequency table, two trigrams per row
    trigram_html = """
<table>
<tr>
<th>#</th><th>Trigram</th><th>Freq</th>
<th>#</th><th>Trigram</th><th>Freq</th>
</tr>
"""
    for i in range(0, len(common_trigrams), 2):
        trigram_html += "<tr>"
        for j in range(2):
            if i + j < len(common_trigrams):
                trigram, freq = common_trigrams[i + j]
                trigram_text = ' '.join(trigram)
                trigram_html += f'<td>{i + j + 1}</td><td>{trigram_text}</td><td>{freq}</td>'
            else:
                trigram_html += '<td></td><td></td><td></td>'
        trigram_html += "</tr>"
    trigram_html += """
</table>
"""
    output_html.append(trigram_html)
else:
    output_html.append('<p>Not enough text to generate meaningful trigrams.</p>')
# Applications of N-grams
output_html.append("""
Applications of N-gram Analysis
- Language Modeling - Predicting the next word in a sequence
- Machine Translation - Improving translation quality
- Text Classification - Using n-grams as features
- Spelling Correction - Suggesting correct spellings
- Information Retrieval - Enhancing search results
- Sentiment Analysis - Capturing phrase-level sentiments
""")
# Word Cloud
output_html.append('<h2>Word Cloud</h2>')
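# WordCloud.generate() raises ValueError when it receives no words (e.g. if
# every token was filtered out above), which is one reason this step gets its
# own try/except instead of relying on the outer handler.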
try:
    # Create word cloud from filtered words
    wordcloud_text = ' '.join(filtered_words)
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        colormap='viridis',
        max_words=100,
        contour_width=1,
        contour_color='#1976D2'
    ).generate(wordcloud_text)
    # Display word cloud
    fig = plt.figure(figsize=(12, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()
    output_html.append(fig_to_html(fig))
except Exception as e:
    output_html.append(f'<p class="error">Failed to generate word cloud: {str(e)}</p>')
# Word Frequency
output_html.append('<h2>Word Frequency</h2>')
# Calculate word frequencies
word_freq = Counter(filtered_words)
most_common = word_freq.most_common(20)
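# Counter.most_common(20) returns (word, count) pairs sorted by count,
# descending, e.g. Counter(['a', 'b', 'a']).most_common(1) -> [('a', 2)]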
# Create horizontal bar chart
fig = plt.figure(figsize=(12, 16))
plt.barh(range(len(most_common)), [val[1] for val in most_common], align='center', color='#1976D2')
plt.yticks(range(len(most_common)), [val[0] for val in most_common])
plt.xlabel('Frequency')
plt.title('Top 20 Words')
plt.tight_layout(pad=3.0)
# Render chart
output_html.append('<h3>Top 20 Words Chart</h3>')
output_html.append(fig_to_html(fig))
output_html.append('<h3>Frequency Table</h3>')
# Create a multi-column layout for word frequency
freq_html = """
| # |
Word |
Freq |
# |
Word |
Freq |
"""
# Create rows with 2 words per row
for i in range(0, len(most_common), 2):
freq_html += ""
for j in range(2):
if i + j < len(most_common):
word, freq = most_common[i + j]
freq_html += f'| {i + j + 1} | {word} | {freq} | '
else:
freq_html += ' | | | '
freq_html += "
"
freq_html += """
"""
output_html.append(freq_html)
except Exception as e:
    output_html.append(f"""
<div class="error">
<h3>Error</h3>
<p>Failed to process text: {str(e)}</p>
</div>
""")
# About text preprocessing
output_html.append("""
What is Text Preprocessing?
Text preprocessing is the first step in NLP pipelines that transforms raw text into a clean, structured format
suitable for analysis. It includes various techniques to standardize text and reduce noise.
Common Preprocessing Steps:
- Tokenization - Splitting text into individual words or sentences
- Normalization - Converting text to lowercase, removing accents, etc.
- Noise Removal - Removing punctuation, special characters, HTML tags, etc.
- Stopword Removal - Filtering out common words that add little meaning
- Stemming/Lemmatization - Reducing words to their root forms
- Spelling Correction - Fixing typos and errors
Why Preprocess Text?
- Reduces dimensionality and noise in the data
- Standardizes text for consistent analysis
- Improves performance of downstream NLP tasks
- Makes text more suitable for machine learning models
""")
output_html.append('</div>')  # Close result-area div
return '\n'.join(output_html)