Spaces:

aradhyapavan
/

nlp-ultimate-tutor

Sleeping

File size: 37,687 Bytes

ca2c89c

import matplotlib
matplotlib.use('Agg')  # Use non-interactive backend
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import re
import string
import base64
import io
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from wordcloud import WordCloud
from utils.model_loader import download_nltk_resources
from utils.helpers import fig_to_html, df_to_html_table
from nltk.util import ngrams

def preprocessing_handler(text_input):
    """Generate HTML for text preprocessing display"""
    output_html = []
    
    # Add result area container
    output_html.append('<div class="result-area">')
    output_html.append('<h2 class="task-header">Text Preprocessing</h2>')
    
    output_html.append("""
    <div class="alert alert-info">
    <i class="fas fa-info-circle"></i>
    Text preprocessing is the process of cleaning and transforming raw text into a format that can be easily analyzed by NLP models.
    </div>
    """)
    
    # Model info
    output_html.append("""
    <div class="alert alert-info">
        <h4><i class="fas fa-tools"></i> Tools & Libraries Used:</h4>
        <ul>
            <li><b>NLTK</b> - For stopwords, tokenization, stemming and lemmatization</li>
            <li><b>Regular Expressions</b> - For pattern matching and text cleaning</li>
            <li><b>WordCloud</b> - For visualizing word frequency</li>
        </ul>
    </div>
    """)
    
    # Ensure NLTK resources are downloaded
    download_nltk_resources()
    
    try:
        # Original Text
        output_html.append('<h3 class="task-subheader">Original Text</h3>')
        output_html.append(f'<div class="card"><div class="card-body"><div class="text-content" style="word-wrap: break-word; word-break: break-word; overflow-wrap: break-word; max-height: 500px; overflow-y: auto; padding: 15px; background-color: #f8f9fa; border-radius: 5px; border: 1px solid #e9ecef; line-height: 1.6;">{text_input}</div></div></div>')
        
        # Text statistics
        word_count = len(text_input.split())
        char_count = len(text_input)
        sentence_count = len(nltk.sent_tokenize(text_input))
        
        stats_html = f"""
        <div class="stats-container">
            <div class="row">
                <div class="col-md-4">
                    <div class="card text-center stats-card">
                        <div class="card-body">
                            <h3 class="metric-blue">{word_count}</h3>
                            <p>Words</p>
                        </div>
                    </div>
                </div>
                <div class="col-md-4">
                    <div class="card text-center stats-card">
                        <div class="card-body">
                            <h3 class="metric-green">{char_count}</h3>
                            <p>Characters</p>
                        </div>
                    </div>
                </div>
                <div class="col-md-4">
                    <div class="card text-center stats-card">
                        <div class="card-body">
                            <h3 class="metric-orange">{sentence_count}</h3>
                            <p>Sentences</p>
                        </div>
                    </div>
                </div>
            </div>
        </div>
        """
        output_html.append(stats_html)
        
        # NEW SECTION: Text Cleaning with Regular Expressions
        output_html.append('<div class="section-divider"></div>')
        output_html.append('<h3 class="task-subheader">Text Cleaning with Regular Expressions</h3>')
        
        output_html.append("""
        <div class="alert alert-light">
            <p>Regular expressions (regex) provide powerful pattern matching capabilities for cleaning and processing text data. 
            Common text cleaning tasks include removing URLs, HTML tags, special characters, and normalizing text formats.</p>
        </div>
        """)
        
        # Several regex cleaning examples
        url_pattern = r'https?://\S+|www\.\S+'
        html_pattern = r'<.*?>'
        whitespace_pattern = r'\s+'
        email_pattern = r'\S+@\S+'
        
        # Original text for comparison
        text_cleaned = text_input
        
        # 1. Remove URLs
        urls_cleaned = re.sub(url_pattern, '[URL]', text_cleaned)
        
        # 2. Remove HTML tags
        html_cleaned = re.sub(html_pattern, '', urls_cleaned)
        
        # 3. Remove extra whitespace
        whitespace_cleaned = re.sub(whitespace_pattern, ' ', html_cleaned).strip()
        
        # 4. Remove email addresses
        email_cleaned = re.sub(email_pattern, '[EMAIL]', whitespace_cleaned)
        
        # 5. Fix common contractions
        contractions = {
            r"won't": "will not",
            r"can't": "cannot",
            r"n't": " not",
            r"'re": " are",
            r"'s": " is",
            r"'d": " would",
            r"'ll": " will",
            r"'t": " not",
            r"'ve": " have",
            r"'m": " am"
        }
        
        contraction_cleaned = email_cleaned
        for pattern, replacement in contractions.items():
            contraction_cleaned = re.sub(pattern, replacement, contraction_cleaned)
        
        # Display the regex cleaning examples in a table
        output_html.append("""
        <h4>Regex Text Cleaning Operations</h4>
        <div class="table-responsive">
        <table class="table table-striped">
            <thead class="table-primary">
                <tr>
                    <th>Operation</th>
                    <th>Regex Pattern</th>
                    <th>Description</th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td>URL Removal</td>
                    <td><code>https?://\\S+|www\\.\\S+</code></td>
                    <td>Removes or replaces web URLs in text</td>
                </tr>
                <tr>
                    <td>HTML Tag Removal</td>
                    <td><code>&lt;.*?&gt;</code></td>
                    <td>Strips HTML/XML markup tags</td>
                </tr>
                <tr>
                    <td>Whitespace Normalization</td>
                    <td><code>\\s+</code></td>
                    <td>Replaces multiple spaces, tabs, and newlines with a single space</td>
                </tr>
                <tr>
                    <td>Email Anonymization</td>
                    <td><code>\\S+@\\S+</code></td>
                    <td>Redacts email addresses for privacy</td>
                </tr>
                <tr>
                    <td>Contraction Expansion</td>
                    <td><code>Multiple patterns</code></td>
                    <td>Expands contractions like "don't" to "do not"</td>
                </tr>
            </tbody>
        </table>
        </div>
        """)
        
        # Example of cleaned text
        output_html.append("""
        <h4>Example of Text After Regex Cleaning</h4>
        <div class="row">
            <div class="col-md-6">
                <div class="card">
                    <div class="card-header">
                        <h5 class="mb-0">Before Cleaning</h5>
                    </div>
                    <div class="card-body">
                        <div class="text-content" style="word-wrap: break-word; word-break: break-word; overflow-wrap: break-word; max-height: 400px; overflow-y: auto; padding: 15px; background-color: #f8f9fa; border-radius: 5px; border: 1px solid #e9ecef; line-height: 1.6;">""")
        output_html.append(f"{text_input}")
        output_html.append("""</div>
                    </div>
                </div>
            </div>
            <div class="col-md-6">
                <div class="card">
                    <div class="card-header">
                        <h5 class="mb-0">After Regex Cleaning</h5>
                    </div>
                    <div class="card-body">
                        <div class="text-content" style="word-wrap: break-word; word-break: break-word; overflow-wrap: break-word; max-height: 400px; overflow-y: auto; padding: 15px; background-color: #f8f9fa; border-radius: 5px; border: 1px solid #e9ecef; line-height: 1.6;">""")
        output_html.append(f"{contraction_cleaned}")
        output_html.append("""</div>
                    </div>
                </div>
            </div>
        </div>
        """)
        
        output_html.append("""
        <div class="alert alert-success">
            <h4><i class="fas fa-lightbulb"></i> Why Use Regex for Text Cleaning?</h4>
            <ul>
                <li><b>Precision:</b> Regular expressions allow for precise pattern matching</li>
                <li><b>Flexibility:</b> Can be customized for domain-specific cleaning needs</li>
                <li><b>Efficiency:</b> Processes text in a single pass for better performance</li>
                <li><b>Standardization:</b> Creates consistent formatting across documents</li>
            </ul>
        </div>
        """)
        
        # Word length distribution
        word_lengths = [len(word) for word in text_input.split()]
        fig = plt.figure(figsize=(10, 4))
        plt.hist(word_lengths, bins=range(1, max(word_lengths) + 2), alpha=0.7, color='#1976D2')
        plt.xlabel('Word Length')
        plt.ylabel('Frequency')
        plt.title('Word Length Distribution')
        plt.grid(alpha=0.3)
        plt.tight_layout()
        
        output_html.append('<div class="section-divider"></div>')
        output_html.append('<h3 class="task-subheader">Word Length Distribution</h3>')
        output_html.append(fig_to_html(fig))
        
        # Case Normalization
        output_html.append('<div class="section-divider"></div>')
        output_html.append('<h3 class="task-subheader">Case Normalization</h3>')
        
        lowercase_text = text_input.lower()
        uppercase_text = text_input.upper()
        
        case_html = f"""
        <div class="row">
            <div class="col-md-4">
                <div class="card">
                    <div class="card-header">
                        <h5 class="mb-0">Original Text</h5>
                    </div>
                    <div class="card-body">
                        <div class="text-content" style="word-wrap: break-word; word-break: break-word; overflow-wrap: break-word; max-height: 400px; overflow-y: auto; padding: 15px; background-color: #f8f9fa; border-radius: 5px; border: 1px solid #e9ecef; line-height: 1.6;">{text_input}</div>
                    </div>
                </div>
            </div>
            <div class="col-md-4">
                <div class="card">
                    <div class="card-header">
                        <h5 class="mb-0">Lowercase Text</h5>
                    </div>
                    <div class="card-body">
                        <div class="text-content" style="word-wrap: break-word; word-break: break-word; overflow-wrap: break-word; max-height: 400px; overflow-y: auto; padding: 15px; background-color: #f8f9fa; border-radius: 5px; border: 1px solid #e9ecef; line-height: 1.6;">{lowercase_text}</div>
                    </div>
                </div>
            </div>
            <div class="col-md-4">
                <div class="card">
                    <div class="card-header">
                        <h5 class="mb-0">Uppercase Text</h5>
                    </div>
                    <div class="card-body">
                        <div class="text-content" style="word-wrap: break-word; word-break: break-word; overflow-wrap: break-word; max-height: 400px; overflow-y: auto; padding: 15px; background-color: #f8f9fa; border-radius: 5px; border: 1px solid #e9ecef; line-height: 1.6;">{uppercase_text}</div>
                    </div>
                </div>
            </div>
        </div>
        """
        output_html.append(case_html)
        
        # Remove Punctuation & Special Characters
        output_html.append('<div class="section-divider"></div>')
        output_html.append('<h3 class="task-subheader">Punctuation & Special Characters Removal</h3>')
        
        # Count original punctuation
        punc_count = sum([1 for char in text_input if char in string.punctuation])
        
        # Remove punctuation
        no_punct_text = re.sub(r'[^\w\s]', '', text_input)
        
        punct_html = f"""
        <div class="row">
            <div class="col-md-6">
                <div class="card">
                    <div class="card-header">
                        <h5 class="mb-0">Original Text</h5>
                    </div>
                    <div class="card-body">
                        <div class="text-content" style="word-wrap: break-word; word-break: break-word; overflow-wrap: break-word; max-height: 400px; overflow-y: auto; padding: 15px; background-color: #f8f9fa; border-radius: 5px; border: 1px solid #e9ecef; line-height: 1.6;">{text_input}</div>
                        <small class="text-muted">Contains {punc_count} punctuation marks</small>
                    </div>
                </div>
            </div>
            <div class="col-md-6">
                <div class="card">
                    <div class="card-header">
                        <h5 class="mb-0">Without Punctuation</h5>
                    </div>
                    <div class="card-body">
                        <div class="text-content" style="word-wrap: break-word; word-break: break-word; overflow-wrap: break-word; max-height: 400px; overflow-y: auto; padding: 15px; background-color: #f8f9fa; border-radius: 5px; border: 1px solid #e9ecef; line-height: 1.6;">{no_punct_text}</div>
                        <small class="text-muted">Removed {punc_count} punctuation marks</small>
                    </div>
                </div>
            </div>
        </div>
        """
        output_html.append(punct_html)
        
        # Show removed punctuation
        punct_chars = [char for char in text_input if char in string.punctuation]
        punct_freq = Counter(punct_chars)
        
        if punct_freq:
            output_html.append('<h4>Punctuation Distribution</h4>')
            
            fig = plt.figure(figsize=(10, 4))
            plt.bar(punct_freq.keys(), punct_freq.values(), color='#1976D2')
            plt.xlabel('Punctuation')
            plt.ylabel('Frequency')
            plt.title('Punctuation Distribution')
            plt.tight_layout()
            
            output_html.append(fig_to_html(fig))
        
        # Tokenization
        output_html.append('<div class="section-divider"></div>')
        output_html.append('<h3 class="task-subheader">Tokenization</h3>')
        
        # Word tokenization
        words = nltk.word_tokenize(text_input)
        
        # Create a multi-column layout for word tokens
        output_html.append('<h4>Word Tokens</h4>')
        output_html.append(f'<p>Total tokens: {len(words)} (showing first 50)</p>')
        
        # Create a multi-column table layout
        tokens_html = """
        <div class="table-responsive">
        <table class="table table-striped table-hover" style="table-layout: fixed;">
            <thead class="table-primary">
                <tr>
                    <th style="width: 8%;">#</th>
                    <th style="width: 25%;">Token</th>
                    <th style="width: 12%;">Length</th>
                    <th style="width: 8%;">#</th>
                    <th style="width: 25%;">Token</th>
                    <th style="width: 12%;">Length</th>
                    <th style="width: 8%;">#</th>
                    <th style="width: 25%;">Token</th>
                    <th style="width: 12%;">Length</th>
                </tr>
            </thead>
            <tbody>
        """
        
        # Create rows with 3 tokens per row
        for i in range(0, min(50, len(words)), 3):
            tokens_html += "<tr>"
            for j in range(3):
                if i + j < min(50, len(words)):
                    token = words[i + j]
                    tokens_html += f'<td>{i + j + 1}</td><td><code>{token}</code></td><td><span class="badge bg-secondary">{len(token)}</span></td>'
                else:
                    tokens_html += '<td></td><td></td><td></td>'
            tokens_html += "</tr>"
        
        tokens_html += """
            </tbody>
        </table>
        </div>
        """
        
        output_html.append(tokens_html)
        
        # Sentence tokenization
        sentences = nltk.sent_tokenize(text_input)
        
        output_html.append('<h4>Sentence Tokens</h4>')
        output_html.append(f'<p>Total sentences: {len(sentences)}</p>')
        
        for i, sentence in enumerate(sentences[:5]):
            output_html.append(f'<div class="card mb-2"><div class="card-body"><strong>{i+1}.</strong> {sentence}</div></div>')
        
        if len(sentences) > 5:
            output_html.append(f'<p class="text-muted">... and {len(sentences) - 5} more sentences.</p>')
        
        # Stopwords Removal
        output_html.append('<div class="section-divider"></div>')
        output_html.append('<h3 class="task-subheader">Stopwords Removal</h3>')
        
        stop_words = set(stopwords.words('english'))
        filtered_words = [word for word in words if word.lower() not in stop_words]
        
        # Count stopwords
        stopword_count = len(words) - len(filtered_words)
        stopword_percentage = (stopword_count / len(words)) * 100 if words else 0
        
        output_html.append(f"""
        <div class="row mb-3">
            <div class="col-md-4">
                <div class="card text-center">
                    <div class="card-body">
                        <h5>Original Words</h5>
                        <h3 class="text-primary">{len(words)}</h3>
                    </div>
                </div>
            </div>
            <div class="col-md-4">
                <div class="card text-center">
                    <div class="card-body">
                        <h5>After Stopword Removal</h5>
                        <h3 class="text-success">{len(filtered_words)}</h3>
                    </div>
                </div>
            </div>
            <div class="col-md-4">
                <div class="card text-center">
                    <div class="card-body">
                        <h5>Stopwords Removed</h5>
                        <h3 class="text-warning">{stopword_count} ({stopword_percentage:.1f}%)</h3>
                    </div>
                </div>
            </div>
        </div>
        """)
        
        # Display common stopwords in the text
        text_stopwords = [word for word in words if word.lower() in stop_words]
        stop_freq = Counter(text_stopwords).most_common(10)
        
        if stop_freq:
            output_html.append('<h4>Most Common Stopwords in Text</h4>')
            
            # Create a multi-column layout for stopwords
            stopwords_html = """
            <div class="table-responsive">
            <table class="table table-striped table-hover" style="table-layout: fixed;">
                <thead class="table-primary">
                    <tr>
                        <th style="width: 10%;">#</th>
                        <th style="width: 35%;">Stopword</th>
                        <th style="width: 15%;">Frequency</th>
                        <th style="width: 10%;">#</th>
                        <th style="width: 35%;">Stopword</th>
                        <th style="width: 15%;">Frequency</th>
                    </tr>
                </thead>
                <tbody>
            """
            
            # Create rows with 2 stopwords per row
            for i in range(0, len(stop_freq), 2):
                stopwords_html += "<tr>"
                for j in range(2):
                    if i + j < len(stop_freq):
                        stopword, freq = stop_freq[i + j]
                        stopwords_html += f'<td>{i + j + 1}</td><td><code>{stopword}</code></td><td><span class="badge bg-warning">{freq}</span></td>'
                    else:
                        stopwords_html += '<td></td><td></td><td></td>'
                stopwords_html += "</tr>"
            
            stopwords_html += """
                </tbody>
            </table>
            </div>
            """
            
            output_html.append(stopwords_html)
            
            # Visualization of before and after
            fig, ax = plt.subplots(1, 2, figsize=(12, 5))
            
            # Before
            ax[0].hist([len(word) for word in words], bins=range(1, 15), alpha=0.7, color='#1976D2')
            ax[0].set_title('Word Length Before Stopword Removal')
            ax[0].set_xlabel('Word Length')
            ax[0].set_ylabel('Frequency')
            
            # After
            ax[1].hist([len(word) for word in filtered_words], bins=range(1, 15), alpha=0.7, color='#4CAF50')
            ax[1].set_title('Word Length After Stopword Removal')
            ax[1].set_xlabel('Word Length')
            ax[1].set_ylabel('Frequency')
            
            plt.tight_layout()
            output_html.append(fig_to_html(fig))
        
        # Stemming and Lemmatization
        output_html.append('<div class="section-divider"></div>')
        output_html.append('<h3 class="task-subheader">Stemming & Lemmatization</h3>')
        
        # Apply stemming (Porter Stemmer)
        stemmer = PorterStemmer()
        stemmed_words = [stemmer.stem(word) for word in filtered_words[:100]]  # Limit to first 100 words for performance
        
        # Apply lemmatization
        lemmatizer = WordNetLemmatizer()
        lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words[:100]]  # Limit to first 100 words
        
        # Create comparison DataFrame
        comparison_data = []
        for i in range(min(20, len(filtered_words))):  # Show first 20 examples
            if i < len(filtered_words) and filtered_words[i].isalpha():  # Only include alphabetic words
                comparison_data.append({
                    'Original': filtered_words[i],
                    'Stemmed': stemmer.stem(filtered_words[i]),
                    'Lemmatized': lemmatizer.lemmatize(filtered_words[i])
                })
        
        comparison_df = pd.DataFrame(comparison_data)
        
        output_html.append('<h4>Stemming vs. Lemmatization Comparison</h4>')
        
        # Create a custom table for stemming vs lemmatization comparison
        comparison_html = """
        <div class="table-responsive">
        <table class="table table-striped table-hover" style="table-layout: fixed;">
            <thead class="table-primary">
                <tr>
                    <th style="width: 30%;">Original</th>
                    <th style="width: 35%;">Stemmed</th>
                    <th style="width: 35%;">Lemmatized</th>
                </tr>
            </thead>
            <tbody>
        """
        
        # Add comparison data rows
        for _, row in comparison_df.iterrows():
            comparison_html += f"""
            <tr>
                <td><code>{row['Original']}</code></td>
                <td><code>{row['Stemmed']}</code></td>
                <td><code>{row['Lemmatized']}</code></td>
            </tr>
            """
        
        comparison_html += """
            </tbody>
        </table>
        </div>
        """
        
        output_html.append(comparison_html)
        
        output_html.append("""
        <div class="alert alert-success">
            <h4><i class="fas fa-lightbulb"></i> Stemming vs. Lemmatization</h4>
            <ul>
                <li><b>Stemming</b> - Cuts off word endings based on common patterns, faster but less accurate</li>
                <li><b>Lemmatization</b> - Uses vocabulary and morphological analysis, slower but produces actual words</li>
            </ul>
        </div>
        """)

        # NEW SECTION: N-gram Analysis
        output_html.append('<div class="section-divider"></div>')
        output_html.append('<h3 class="task-subheader">N-gram Analysis</h3>')
        
        output_html.append("""
        <div class="alert alert-light">
            <p>N-grams are contiguous sequences of n items from text. In NLP, they are used to capture word patterns and relationships,
            and are helpful for language modeling, prediction, and feature extraction.</p>
        </div>
        """)
        
        # Process text for n-grams (use filtered_words to avoid stopwords)
        # Convert to lowercase for consistency
        clean_words = [word.lower() for word in filtered_words if word.isalnum()]
        
        # Generate n-grams
        bigrams_list = list(ngrams(clean_words, 2))
        trigrams_list = list(ngrams(clean_words, 3))
        
        # Count frequencies
        bigram_freq = Counter(bigrams_list)
        trigram_freq = Counter(trigrams_list)
        
        # Get most common
        common_bigrams = bigram_freq.most_common(15)
        common_trigrams = trigram_freq.most_common(15)
        
        # Format for display
        bigram_labels = [' '.join(bg) for bg, _ in common_bigrams]
        bigram_values = [count for _, count in common_bigrams]
        
        trigram_labels = [' '.join(tg) for tg, _ in common_trigrams]
        trigram_values = [count for _, count in common_trigrams]
        
        # Create DataFrames for display
        bigram_df = pd.DataFrame({
            'Bigram': [' '.join(bg) for bg, _ in common_bigrams], 
            'Frequency': [count for _, count in common_bigrams]
        })
        
        trigram_df = pd.DataFrame({
            'Trigram': [' '.join(tg) for tg, _ in common_trigrams], 
            'Frequency': [count for _, count in common_trigrams]
        })
        
        # Explanation of n-grams
        output_html.append("""
        <div class="alert alert-info">
            <h4>What are N-grams?</h4>
            <ul>
                <li><b>Unigrams</b> - Single words (e.g., "climate")</li>
                <li><b>Bigrams</b> - Two consecutive words (e.g., "climate change")</li>
                <li><b>Trigrams</b> - Three consecutive words (e.g., "global climate change")</li>
            </ul>
            <p>N-grams capture contextual relationships between words and are valuable for many NLP tasks including language modeling, 
            machine translation, speech recognition, and text classification.</p>
        </div>
        """)
        
        # Create visualizations for bigrams and trigrams
        if bigram_labels and len(bigram_values) > 0:
            # Bigram visualization
            output_html.append('<h4>Most Common Bigrams</h4>')
            
            fig = plt.figure(figsize=(10, 6))
            plt.barh(range(len(bigram_labels)), bigram_values, align='center', color='#1976D2')
            plt.yticks(range(len(bigram_labels)), bigram_labels)
            plt.xlabel('Frequency')
            plt.title('Most Common Bigrams')
            plt.tight_layout()
            
            output_html.append(fig_to_html(fig))
            
            # Create a multi-column layout for bigrams
            bigram_html = """
            <div class="table-responsive">
            <table class="table table-striped table-hover" style="table-layout: fixed;">
                <thead class="table-primary">
                    <tr>
                        <th style="width: 10%;">#</th>
                        <th style="width: 35%;">Bigram</th>
                        <th style="width: 15%;">Freq</th>
                        <th style="width: 10%;">#</th>
                        <th style="width: 35%;">Bigram</th>
                        <th style="width: 15%;">Freq</th>
                    </tr>
                </thead>
                <tbody>
            """
            
            # Create rows with 2 bigrams per row
            for i in range(0, len(common_bigrams), 2):
                bigram_html += "<tr>"
                for j in range(2):
                    if i + j < len(common_bigrams):
                        bigram, freq = common_bigrams[i + j]
                        bigram_text = ' '.join(bigram)
                        bigram_html += f'<td>{i + j + 1}</td><td><code>{bigram_text}</code></td><td><span class="badge bg-info">{freq}</span></td>'
                    else:
                        bigram_html += '<td></td><td></td><td></td>'
                bigram_html += "</tr>"
            
            bigram_html += """
                </tbody>
            </table>
            </div>
            """
            
            output_html.append(bigram_html)
        else:
            output_html.append('<p class="text-muted">Not enough text to generate meaningful bigrams.</p>')
        
        if trigram_labels and len(trigram_values) > 0:
            # Trigram visualization
            output_html.append('<h4>Most Common Trigrams</h4>')
            
            fig = plt.figure(figsize=(10, 6))
            plt.barh(range(len(trigram_labels)), trigram_values, align='center', color='#4CAF50')
            plt.yticks(range(len(trigram_labels)), trigram_labels)
            plt.xlabel('Frequency')
            plt.title('Most Common Trigrams')
            plt.tight_layout()
            
            output_html.append(fig_to_html(fig))
            
            # Create a multi-column layout for trigrams
            trigram_html = """
            <div class="table-responsive">
            <table class="table table-striped table-hover" style="table-layout: fixed;">
                <thead class="table-primary">
                    <tr>
                        <th style="width: 10%;">#</th>
                        <th style="width: 35%;">Trigram</th>
                        <th style="width: 15%;">Freq</th>
                        <th style="width: 10%;">#</th>
                        <th style="width: 35%;">Trigram</th>
                        <th style="width: 15%;">Freq</th>
                    </tr>
                </thead>
                <tbody>
            """
            
            # Create rows with 2 trigrams per row
            for i in range(0, len(common_trigrams), 2):
                trigram_html += "<tr>"
                for j in range(2):
                    if i + j < len(common_trigrams):
                        trigram, freq = common_trigrams[i + j]
                        trigram_text = ' '.join(trigram)
                        trigram_html += f'<td>{i + j + 1}</td><td><code>{trigram_text}</code></td><td><span class="badge bg-success">{freq}</span></td>'
                    else:
                        trigram_html += '<td></td><td></td><td></td>'
                trigram_html += "</tr>"
            
            trigram_html += """
                </tbody>
            </table>
            </div>
            """
            
            output_html.append(trigram_html)
        else:
            output_html.append('<p class="text-muted">Not enough text to generate meaningful trigrams.</p>')
            
        # Applications of N-grams
        output_html.append("""
        <div class="alert alert-info">
            <h4><i class="fas fa-lightbulb"></i> Applications of N-gram Analysis</h4>
            <ul>
                <li><b>Language Modeling</b> - Predicting the next word in a sequence</li>
                <li><b>Machine Translation</b> - Improving translation quality</li>
                <li><b>Text Classification</b> - Using n-grams as features</li>
                <li><b>Spelling Correction</b> - Suggesting correct spellings</li>
                <li><b>Information Retrieval</b> - Enhancing search results</li>
                <li><b>Sentiment Analysis</b> - Capturing phrase-level sentiments</li>
            </ul>
        </div>
        """)
        
        # Word Cloud
        output_html.append('<div class="section-divider"></div>')
        output_html.append('<h3 class="task-subheader">Word Cloud</h3>')
        
        try:
            # Create word cloud from filtered words
            wordcloud_text = ' '.join(filtered_words)
            wordcloud = WordCloud(
                width=800, 
                height=400, 
                background_color='white', 
                colormap='viridis',
                max_words=100, 
                contour_width=1, 
                contour_color='#1976D2'
            ).generate(wordcloud_text)
            
            # Display word cloud
            fig = plt.figure(figsize=(12, 8))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.tight_layout()
            
            output_html.append(fig_to_html(fig))
            
        except Exception as e:
            output_html.append(f"<div class='alert alert-warning'>Failed to generate word cloud: {str(e)}</div>")
        
        # Word Frequency
        output_html.append('<div class="section-divider"></div>')
        output_html.append('<h3 class="task-subheader">Word Frequency Analysis</h3>')
        
        # Calculate word frequencies
        word_freq = Counter(filtered_words)
        most_common = word_freq.most_common(20)
        
        # Create DataFrame
        freq_df = pd.DataFrame(most_common, columns=['Word', 'Frequency'])
        
        # Create horizontal bar chart
        fig = plt.figure(figsize=(12, 16))
        plt.barh(range(len(most_common)), [val[1] for val in most_common], align='center', color='#1976D2')
        plt.yticks(range(len(most_common)), [val[0] for val in most_common])
        plt.xlabel('Frequency')
        plt.title('Top 20 Words')
        plt.subplots_adjust(left=0.15, right=0.95, top=0.95, bottom=0.1)
        plt.tight_layout(pad=3.0)
        
        # Render chart
        output_html.append('<section class="wf-chart-section">')
        output_html.append('<div class="chart-container">')
        output_html.append(fig_to_html(fig))
        output_html.append('</div>')
        output_html.append('</section>')
        
        # Create a multi-column layout for word frequency
        freq_html = """
        <section class="wf-table-container">
        <div class="table-responsive">
        <table class="table table-striped table-hover" style="table-layout: fixed;">
            <thead class="table-primary">
                <tr>
                    <th style="width: 10%;">#</th>
                    <th style="width: 35%;">Word</th>
                    <th style="width: 15%;">Freq</th>
                    <th style="width: 10%;">#</th>
                    <th style="width: 35%;">Word</th>
                    <th style="width: 15%;">Freq</th>
                </tr>
            </thead>
            <tbody>
        """
        
        # Create rows with 2 words per row
        for i in range(0, len(most_common), 2):
            freq_html += "<tr>"
            for j in range(2):
                if i + j < len(most_common):
                    word, freq = most_common[i + j]
                    freq_html += f'<td>{i + j + 1}</td><td><code>{word}</code></td><td><span class="badge bg-primary">{freq}</span></td>'
                else:
                    freq_html += '<td></td><td></td><td></td>'
            freq_html += "</tr>"
        
        freq_html += """
            </tbody>
        </table>
        </div>
        </section>
        """
        
        output_html.append(freq_html)
    
    except Exception as e:
        output_html.append(f"""
        <div class="alert alert-danger">
            <h3>Error</h3>
            <p>Failed to process text: {str(e)}</p>
        </div>
        """)
    
    # About text preprocessing
    output_html.append("""
    <div class="card mt-4">
        <div class="card-header">
            <h4 class="mb-0">
                <i class="fas fa-info-circle"></i>
                About Text Preprocessing
            </h4>
        </div>
        <div class="card-body">
            <h5>What is Text Preprocessing?</h5>
            
            <p>Text preprocessing is the first step in NLP pipelines that transforms raw text into a clean, structured format
            suitable for analysis. It includes various techniques to standardize text and reduce noise.</p>
            
            <h5>Common Preprocessing Steps:</h5>
            
            <ul>
                <li><b>Tokenization</b> - Splitting text into individual words or sentences</li>
                <li><b>Normalization</b> - Converting text to lowercase, removing accents, etc.</li>
                <li><b>Noise Removal</b> - Removing punctuation, special characters, HTML tags, etc.</li>
                <li><b>Stopword Removal</b> - Filtering out common words that add little meaning</li>
                <li><b>Stemming/Lemmatization</b> - Reducing words to their root forms</li>
                <li><b>Spelling Correction</b> - Fixing typos and errors</li>
            </ul>
            
            <h5>Why Preprocess Text?</h5>
            
            <ul>
                <li>Reduces dimensionality and noise in the data</li>
                <li>Standardizes text for consistent analysis</li>
                <li>Improves performance of downstream NLP tasks</li>
                <li>Makes text more suitable for machine learning models</li>
            </ul>
        </div>
    </div>
    """)
    
    output_html.append('</div>')  # Close result-area div
    
    return '\n'.join(output_html)