import matplotlib
matplotlib.use('Agg')  # Use non-interactive backend
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import re
import string
import base64
import io
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from wordcloud import WordCloud
from utils.model_loader import download_nltk_resources
from utils.helpers import fig_to_html, df_to_html_table
from nltk.util import ngrams


def preprocessing_handler(text_input):
    """Generate HTML for the text preprocessing display."""
    output_html = []

    # Add result area container
    output_html.append('<div class="result-area">')
    output_html.append('<h2>Text Preprocessing</h2>')
    output_html.append("""
    <p>Text preprocessing is the process of cleaning and transforming raw text into a format that can be easily analyzed by NLP models.</p>
    """)

    # Model info
    output_html.append("""
    <p><strong>Tools & Libraries Used:</strong> NLTK, WordCloud, Matplotlib, pandas</p>
    """)

    # Ensure NLTK resources are downloaded
    download_nltk_resources()

    try:
        # Original Text
        output_html.append('<h3>Original Text</h3>')
        output_html.append(f'<div>{text_input}</div>')

        # Text statistics
        word_count = len(text_input.split())
        char_count = len(text_input)
        sentence_count = len(nltk.sent_tokenize(text_input))

        stats_html = f"""
        <div>
            <p><strong>{word_count}</strong> Words</p>
            <p><strong>{char_count}</strong> Characters</p>
            <p><strong>{sentence_count}</strong> Sentences</p>
        </div>
        """
        output_html.append(stats_html)
        # NEW SECTION: Text Cleaning with Regular Expressions
        output_html.append('<h3>Text Cleaning with Regular Expressions</h3>')
        output_html.append("""
        <p>Regular expressions (regex) provide powerful pattern matching capabilities for cleaning and processing text data. Common text cleaning tasks include removing URLs, HTML tags, special characters, and normalizing text formats.</p>
        """)

        # Several regex cleaning examples
        url_pattern = r'https?://\S+|www\.\S+'
        html_pattern = r'<.*?>'
        whitespace_pattern = r'\s+'
        email_pattern = r'\S+@\S+'

        # Original text for comparison
        text_cleaned = text_input

        # 1. Remove URLs
        urls_cleaned = re.sub(url_pattern, '[URL]', text_cleaned)

        # 2. Remove HTML tags
        html_cleaned = re.sub(html_pattern, '', urls_cleaned)

        # 3. Remove extra whitespace
        whitespace_cleaned = re.sub(whitespace_pattern, ' ', html_cleaned).strip()

        # 4. Remove email addresses
        email_cleaned = re.sub(email_pattern, '[EMAIL]', whitespace_cleaned)

        # 5. Fix common contractions
        contractions = {
            r"won't": "will not",
            r"can't": "cannot",
            r"n't": " not",
            r"'re": " are",
            r"'s": " is",
            r"'d": " would",
            r"'ll": " will",
            r"'t": " not",
            r"'ve": " have",
            r"'m": " am"
        }
        contraction_cleaned = email_cleaned
        for pattern, replacement in contractions.items():
            contraction_cleaned = re.sub(pattern, replacement, contraction_cleaned)
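        # Illustrative example (hypothetical input, shown only as a comment): the text
        #   "Visit https://example.com or mail me@example.com  -- we can't wait!"
        # comes out of the steps above as
        #   "Visit [URL] or mail [EMAIL] -- we cannot wait!"
        # The contraction substitutions are order-sensitive: "won't" and "can't" must be
        # expanded before the generic "n't" rule, which is why they come first in the dict.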
        # Display the regex cleaning examples in a table
        output_html.append("""
        <h4>Regex Text Cleaning Operations</h4>
        <table>
            <tr><th>Operation</th><th>Regex Pattern</th><th>Description</th></tr>
            <tr><td>URL Removal</td><td>https?://\\S+|www\\.\\S+</td><td>Removes or replaces web URLs in text</td></tr>
            <tr><td>HTML Tag Removal</td><td>&lt;.*?&gt;</td><td>Strips HTML/XML markup tags</td></tr>
            <tr><td>Whitespace Normalization</td><td>\\s+</td><td>Replaces multiple spaces, tabs, and newlines with a single space</td></tr>
            <tr><td>Email Anonymization</td><td>\\S+@\\S+</td><td>Redacts email addresses for privacy</td></tr>
            <tr><td>Contraction Expansion</td><td>Multiple patterns</td><td>Expands contractions like "don't" to "do not"</td></tr>
        </table>
        """)

        # Example of cleaned text
        output_html.append("""
        <h4>Example of Text After Regex Cleaning</h4>
        <p><strong>Before Cleaning</strong></p>
        """)
        output_html.append(f"<div>{text_input}</div>")
        output_html.append("""
        <p><strong>After Regex Cleaning</strong></p>
        """)
        output_html.append(f"<div>{contraction_cleaned}</div>")

        output_html.append("""
        <h4>Why Use Regex for Text Cleaning?</h4>
        """)

        # Word length distribution
        word_lengths = [len(word) for word in text_input.split()]
        fig = plt.figure(figsize=(10, 4))
        plt.hist(word_lengths, bins=range(1, max(word_lengths) + 2), alpha=0.7, color='#1976D2')
        plt.xlabel('Word Length')
        plt.ylabel('Frequency')
        plt.title('Word Length Distribution')
        plt.grid(alpha=0.3)
        plt.tight_layout()

        output_html.append('<h3>Word Length Distribution</h3>')
        output_html.append(fig_to_html(fig))
        # Case Normalization
        output_html.append('<h3>Case Normalization</h3>')

        lowercase_text = text_input.lower()
        uppercase_text = text_input.upper()

        case_html = f"""
        <p><strong>Original Text</strong></p>
        <div>{text_input}</div>
        <p><strong>Lowercase Text</strong></p>
        <div>{lowercase_text}</div>
        <p><strong>Uppercase Text</strong></p>
        <div>{uppercase_text}</div>
        """
        output_html.append(case_html)
        # Remove Punctuation & Special Characters
        output_html.append('<h3>Punctuation & Special Characters Removal</h3>')

        # Count original punctuation
        punc_count = sum([1 for char in text_input if char in string.punctuation])

        # Remove punctuation
        no_punct_text = re.sub(r'[^\w\s]', '', text_input)

        punct_html = f"""
        <p><strong>Original Text</strong></p>
        <div>{text_input}</div>
        <p><em>Contains {punc_count} punctuation marks</em></p>
        <p><strong>Without Punctuation</strong></p>
        <div>{no_punct_text}</div>
        <p><em>Removed {punc_count} punctuation marks</em></p>
        """
        output_html.append(punct_html)

        # Show removed punctuation
        punct_chars = [char for char in text_input if char in string.punctuation]
        punct_freq = Counter(punct_chars)
        if punct_freq:
            output_html.append('<h4>Punctuation Distribution</h4>')
            fig = plt.figure(figsize=(10, 4))
            plt.bar(list(punct_freq.keys()), list(punct_freq.values()), color='#1976D2')
            plt.xlabel('Punctuation')
            plt.ylabel('Frequency')
            plt.title('Punctuation Distribution')
            plt.tight_layout()
            output_html.append(fig_to_html(fig))
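        # Illustrative example (hypothetical input): re.sub(r'[^\w\s]', '', "Hello, world!")
        # returns 'Hello world'. Note that \w includes the underscore, so underscores survive
        # this step even though string.punctuation counts them.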
        # Tokenization
        output_html.append('<h3>Tokenization</h3>')

        # Word tokenization
        words = nltk.word_tokenize(text_input)

        # Create a multi-column layout for word tokens
        output_html.append('<h4>Word Tokens</h4>')
        output_html.append(f'<p>Total tokens: {len(words)} (showing first 50)</p>')

        # Create a multi-column table layout
        tokens_html = """
        <table>
            <tr><th>#</th><th>Token</th><th>Length</th><th>#</th><th>Token</th><th>Length</th><th>#</th><th>Token</th><th>Length</th></tr>
        """
        # Create rows with 3 tokens per row
        for i in range(0, min(50, len(words)), 3):
            tokens_html += "<tr>"
            for j in range(3):
                if i + j < min(50, len(words)):
                    token = words[i + j]
                    tokens_html += f'<td>{i + j + 1}</td><td>{token}</td><td>{len(token)}</td>'
                else:
                    tokens_html += '<td></td><td></td><td></td>'
            tokens_html += "</tr>"
        tokens_html += """
        </table>
        """
        output_html.append(tokens_html)
        # Sentence tokenization
        sentences = nltk.sent_tokenize(text_input)
        output_html.append('<h4>Sentence Tokens</h4>')
        output_html.append(f'<p>Total sentences: {len(sentences)}</p>')
        for i, sentence in enumerate(sentences[:5]):
            output_html.append(f'<p>{i+1}. {sentence}</p>')
        if len(sentences) > 5:
            output_html.append(f'<p>... and {len(sentences) - 5} more sentences.</p>')
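        # Illustrative examples (hypothetical inputs):
        #   nltk.word_tokenize("Don't stop!") -> ['Do', "n't", 'stop', '!']
        #     (the Treebank tokenizer splits contractions and keeps punctuation as tokens)
        #   nltk.sent_tokenize("Hello Mr. Smith. How are you?") -> two sentences, since the
        #     pre-trained Punkt model treats "Mr." as an abbreviation, not a sentence end.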
        # Stopwords Removal
        output_html.append('<h3>Stopwords Removal</h3>')

        stop_words = set(stopwords.words('english'))
        filtered_words = [word for word in words if word.lower() not in stop_words]

        # Count stopwords
        stopword_count = len(words) - len(filtered_words)
        stopword_percentage = (stopword_count / len(words)) * 100 if words else 0

        output_html.append(f"""
        <div>
            <p><strong>Original Words:</strong> {len(words)}</p>
            <p><strong>After Stopword Removal:</strong> {len(filtered_words)}</p>
            <p><strong>Stopwords Removed:</strong> {stopword_count} ({stopword_percentage:.1f}%)</p>
        </div>
        """)

        # Display common stopwords in the text
        text_stopwords = [word for word in words if word.lower() in stop_words]
        stop_freq = Counter(text_stopwords).most_common(10)

        if stop_freq:
            output_html.append('<h4>Most Common Stopwords in Text</h4>')

            # Create a multi-column layout for stopwords
            stopwords_html = """
            <table>
                <tr><th>#</th><th>Stopword</th><th>Frequency</th><th>#</th><th>Stopword</th><th>Frequency</th></tr>
            """
            # Create rows with 2 stopwords per row
            for i in range(0, len(stop_freq), 2):
                stopwords_html += "<tr>"
                for j in range(2):
                    if i + j < len(stop_freq):
                        stopword, freq = stop_freq[i + j]
                        stopwords_html += f'<td>{i + j + 1}</td><td>{stopword}</td><td>{freq}</td>'
                    else:
                        stopwords_html += '<td></td><td></td><td></td>'
                stopwords_html += "</tr>"
            stopwords_html += """
            </table>
            """
            output_html.append(stopwords_html)
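        # Illustrative example (hypothetical input): for words = ['This', 'is', 'a', 'test'],
        # the filter above keeps only ['test'], because 'this', 'is' and 'a' are all in
        # NLTK's English stopword list; matching on word.lower() also removes capitalised
        # stopwords such as 'This'.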
        # Visualization of before and after
        fig, ax = plt.subplots(1, 2, figsize=(12, 5))

        # Before
        ax[0].hist([len(word) for word in words], bins=range(1, 15), alpha=0.7, color='#1976D2')
        ax[0].set_title('Word Length Before Stopword Removal')
        ax[0].set_xlabel('Word Length')
        ax[0].set_ylabel('Frequency')

        # After
        ax[1].hist([len(word) for word in filtered_words], bins=range(1, 15), alpha=0.7, color='#4CAF50')
        ax[1].set_title('Word Length After Stopword Removal')
        ax[1].set_xlabel('Word Length')
        ax[1].set_ylabel('Frequency')

        plt.tight_layout()
        output_html.append(fig_to_html(fig))

        # Stemming and Lemmatization
        output_html.append('<h3>Stemming & Lemmatization</h3>')

        # Apply stemming (Porter Stemmer)
        stemmer = PorterStemmer()
        stemmed_words = [stemmer.stem(word) for word in filtered_words[:100]]  # Limit to first 100 words for performance

        # Apply lemmatization
        lemmatizer = WordNetLemmatizer()
        lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words[:100]]  # Limit to first 100 words

        # Create comparison DataFrame
        comparison_data = []
        for i in range(min(20, len(filtered_words))):  # Show first 20 examples
            if i < len(filtered_words) and filtered_words[i].isalpha():  # Only include alphabetic words
                comparison_data.append({
                    'Original': filtered_words[i],
                    'Stemmed': stemmer.stem(filtered_words[i]),
                    'Lemmatized': lemmatizer.lemmatize(filtered_words[i])
                })
        comparison_df = pd.DataFrame(comparison_data)
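        # Illustrative example (hypothetical inputs): PorterStemmer().stem('studies') gives
        # 'studi', while WordNetLemmatizer().lemmatize('studies') gives 'study'. Stemming
        # chops suffixes heuristically; lemmatization maps to a dictionary form and treats
        # words as nouns unless a POS tag is supplied.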
        output_html.append('<h4>Stemming vs. Lemmatization Comparison</h4>')

        # Create a custom table for stemming vs lemmatization comparison
        comparison_html = """
        <table>
            <tr><th>Original</th><th>Stemmed</th><th>Lemmatized</th></tr>
        """
        # Add comparison data rows
        for _, row in comparison_df.iterrows():
            comparison_html += f"""
            <tr><td>{row['Original']}</td><td>{row['Stemmed']}</td><td>{row['Lemmatized']}</td></tr>
            """
        comparison_html += """
        </table>
        """
        output_html.append(comparison_html)

        output_html.append("""
        <h4>Stemming vs. Lemmatization</h4>
        """)

        # NEW SECTION: N-gram Analysis
        output_html.append('<h3>N-gram Analysis</h3>')
        output_html.append("""
        <p>N-grams are contiguous sequences of n items from text. In NLP, they are used to capture word patterns and relationships, and are helpful for language modeling, prediction, and feature extraction.</p>
        """)

        # Process text for n-grams (use filtered_words to avoid stopwords)
        # Convert to lowercase for consistency
        clean_words = [word.lower() for word in filtered_words if word.isalnum()]

        # Generate n-grams
        bigrams_list = list(ngrams(clean_words, 2))
        trigrams_list = list(ngrams(clean_words, 3))

        # Count frequencies
        bigram_freq = Counter(bigrams_list)
        trigram_freq = Counter(trigrams_list)

        # Get most common
        common_bigrams = bigram_freq.most_common(15)
        common_trigrams = trigram_freq.most_common(15)

        # Format for display
        bigram_labels = [' '.join(bg) for bg, _ in common_bigrams]
        bigram_values = [count for _, count in common_bigrams]
        trigram_labels = [' '.join(tg) for tg, _ in common_trigrams]
        trigram_values = [count for _, count in common_trigrams]

        # Create DataFrames for display
        bigram_df = pd.DataFrame({'Bigram': bigram_labels, 'Frequency': bigram_values})
        trigram_df = pd.DataFrame({'Trigram': trigram_labels, 'Frequency': trigram_values})
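        # Illustrative example (hypothetical input): list(ngrams(['natural', 'language',
        # 'processing'], 2)) returns [('natural', 'language'), ('language', 'processing')];
        # with n=3 it returns the single trigram ('natural', 'language', 'processing').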
        # Explanation of n-grams
        output_html.append("""
        <h4>What are N-grams?</h4>
        <p>N-grams capture contextual relationships between words and are valuable for many NLP tasks including language modeling, machine translation, speech recognition, and text classification.</p>
        """)

        # Create visualizations for bigrams and trigrams
        if bigram_labels and len(bigram_values) > 0:
            # Bigram visualization
            output_html.append('<h4>Most Common Bigrams</h4>')
            fig = plt.figure(figsize=(10, 6))
            plt.barh(range(len(bigram_labels)), bigram_values, align='center', color='#1976D2')
            plt.yticks(range(len(bigram_labels)), bigram_labels)
            plt.xlabel('Frequency')
            plt.title('Most Common Bigrams')
            plt.tight_layout()
            output_html.append(fig_to_html(fig))

            # Create a multi-column layout for bigrams
            bigram_html = """
            <table>
                <tr><th>#</th><th>Bigram</th><th>Freq</th><th>#</th><th>Bigram</th><th>Freq</th></tr>
            """
            # Create rows with 2 bigrams per row
            for i in range(0, len(common_bigrams), 2):
                bigram_html += "<tr>"
                for j in range(2):
                    if i + j < len(common_bigrams):
                        bigram, freq = common_bigrams[i + j]
                        bigram_text = ' '.join(bigram)
                        bigram_html += f'<td>{i + j + 1}</td><td>{bigram_text}</td><td>{freq}</td>'
                    else:
                        bigram_html += '<td></td><td></td><td></td>'
                bigram_html += "</tr>"
            bigram_html += """
            </table>
            """
            output_html.append(bigram_html)
        else:
            output_html.append('<p>Not enough text to generate meaningful bigrams.</p>')

        if trigram_labels and len(trigram_values) > 0:
            # Trigram visualization
            output_html.append('<h4>Most Common Trigrams</h4>')
            fig = plt.figure(figsize=(10, 6))
            plt.barh(range(len(trigram_labels)), trigram_values, align='center', color='#4CAF50')
            plt.yticks(range(len(trigram_labels)), trigram_labels)
            plt.xlabel('Frequency')
            plt.title('Most Common Trigrams')
            plt.tight_layout()
            output_html.append(fig_to_html(fig))

            # Create a multi-column layout for trigrams
            trigram_html = """
            <table>
                <tr><th>#</th><th>Trigram</th><th>Freq</th><th>#</th><th>Trigram</th><th>Freq</th></tr>
            """
            # Create rows with 2 trigrams per row
            for i in range(0, len(common_trigrams), 2):
                trigram_html += "<tr>"
                for j in range(2):
                    if i + j < len(common_trigrams):
                        trigram, freq = common_trigrams[i + j]
                        trigram_text = ' '.join(trigram)
                        trigram_html += f'<td>{i + j + 1}</td><td>{trigram_text}</td><td>{freq}</td>'
                    else:
                        trigram_html += '<td></td><td></td><td></td>'
                trigram_html += "</tr>"
            trigram_html += """
            </table>
            """
            output_html.append(trigram_html)
        else:
            output_html.append('<p>Not enough text to generate meaningful trigrams.</p>')

        # Applications of N-grams
        output_html.append("""
        <h4>Applications of N-gram Analysis</h4>
        """)
        # Word Cloud
        output_html.append('<h3>Word Cloud</h3>')
        try:
            # Create word cloud from filtered words
            wordcloud_text = ' '.join(filtered_words)
            wordcloud = WordCloud(
                width=800,
                height=400,
                background_color='white',
                colormap='viridis',
                max_words=100,
                contour_width=1,
                contour_color='#1976D2'
            ).generate(wordcloud_text)

            # Display word cloud
            fig = plt.figure(figsize=(12, 8))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.tight_layout()
            output_html.append(fig_to_html(fig))
        except Exception as e:
            output_html.append(f"<p>Failed to generate word cloud: {str(e)}</p>")
        # Word Frequency
        output_html.append('<h3>Word Frequency Analysis</h3>')

        # Calculate word frequencies
        word_freq = Counter(filtered_words)
        most_common = word_freq.most_common(20)
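        # Illustrative example (hypothetical input): Counter(['nlp', 'text', 'nlp']).most_common(2)
        # returns [('nlp', 2), ('text', 1)] -- (word, count) pairs sorted by descending count.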
        # Create DataFrame
        freq_df = pd.DataFrame(most_common, columns=['Word', 'Frequency'])

        # Create horizontal bar chart
        fig = plt.figure(figsize=(12, 16))
        plt.barh(range(len(most_common)), [val[1] for val in most_common], align='center', color='#1976D2')
        plt.yticks(range(len(most_common)), [val[0] for val in most_common])
        plt.xlabel('Frequency')
        plt.title('Top 20 Words')
        plt.subplots_adjust(left=0.15, right=0.95, top=0.95, bottom=0.1)
        plt.tight_layout(pad=3.0)

        # Render chart
        output_html.append(fig_to_html(fig))

        # Create a multi-column layout for word frequency
        freq_html = """
        <table>
            <tr><th>#</th><th>Word</th><th>Freq</th><th>#</th><th>Word</th><th>Freq</th></tr>
        """
        # Create rows with 2 words per row
        for i in range(0, len(most_common), 2):
            freq_html += "<tr>"
            for j in range(2):
                if i + j < len(most_common):
                    word, freq = most_common[i + j]
                    freq_html += f'<td>{i + j + 1}</td><td>{word}</td><td>{freq}</td>'
                else:
                    freq_html += '<td></td><td></td><td></td>'
            freq_html += "</tr>"
        freq_html += """
        </table>
        """
        output_html.append(freq_html)
    except Exception as e:
        output_html.append(f"""
        <h3>Error</h3>
        <p>Failed to process text: {str(e)}</p>
        """)

    # About text preprocessing
    output_html.append("""
    <h3>About Text Preprocessing</h3>

    <h4>What is Text Preprocessing?</h4>
    <p>Text preprocessing is the first step in an NLP pipeline: it transforms raw text into a clean, structured format suitable for analysis. It includes various techniques to standardize text and reduce noise.</p>

    <h4>Common Preprocessing Steps:</h4>
    <ul>
        <li><strong>Tokenization</strong> - Splitting text into individual words or sentences</li>
        <li><strong>Normalization</strong> - Converting text to lowercase, removing accents, etc.</li>
        <li><strong>Noise Removal</strong> - Removing punctuation, special characters, HTML tags, etc.</li>
        <li><strong>Stopword Removal</strong> - Filtering out common words that add little meaning</li>
        <li><strong>Stemming/Lemmatization</strong> - Reducing words to their root forms</li>
        <li><strong>Spelling Correction</strong> - Fixing typos and errors</li>
    </ul>

    <h4>Why Preprocess Text?</h4>
    <ul>
        <li>Reduces dimensionality and noise in the data</li>
        <li>Standardizes text for consistent analysis</li>
        <li>Improves performance of downstream NLP tasks</li>
        <li>Makes text more suitable for machine learning models</li>
    </ul>
    """)

    output_html.append('</div>')  # Close result-area div

    return '\n'.join(output_html)
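

# Minimal usage sketch (illustrative only): it assumes the utils package from this
# project is importable and that the NLTK corpora can be downloaded on first run.
# It writes the generated report to a local HTML file for a quick visual check.
if __name__ == "__main__":
    sample_text = (
        "Dr. Smith can't attend the meeting at https://example.com today. "
        "Please email smith@example.com if you have questions!"
    )
    html_report = preprocessing_handler(sample_text)
    with open("preprocessing_report.html", "w", encoding="utf-8") as f:
        f.write(html_report)
    print("Report written to preprocessing_report.html")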