import matplotlib
matplotlib.use('Agg')  # Use non-interactive backend
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import re
import string
import base64
import io
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from wordcloud import WordCloud
from utils.model_loader import download_nltk_resources
from utils.helpers import fig_to_html, df_to_html_table
from nltk.util import ngrams


def preprocessing_handler(text_input):
    """Generate HTML for the text preprocessing display."""
    output_html = []

    # Add result area container
    output_html.append('<div class="result-area">')
    output_html.append('<h2>Text Preprocessing</h2>')
    output_html.append("""
    <p>Text preprocessing is the process of cleaning and transforming raw text into a format that can be easily analyzed by NLP models.</p>
    """)

    # Model info
    output_html.append("""
    <p><strong>Tools & Libraries Used:</strong> NLTK, WordCloud, Matplotlib, pandas</p>
    """)

    # Ensure NLTK resources are downloaded
    download_nltk_resources()

    try:
        # Original Text
        output_html.append('<h3>Original Text</h3>')
        output_html.append(f'<div>{text_input}</div>')

        # Text statistics
        word_count = len(text_input.split())
        char_count = len(text_input)
        sentence_count = len(nltk.sent_tokenize(text_input))

        stats_html = f"""
        <div>
            <p><strong>{word_count}</strong> Words</p>
            <p><strong>{char_count}</strong> Characters</p>
            <p><strong>{sentence_count}</strong> Sentences</p>
        </div>
        """
        output_html.append(stats_html)
        # NEW SECTION: Text Cleaning with Regular Expressions
        output_html.append('<h3>Text Cleaning with Regular Expressions</h3>')
        output_html.append("""
        <p>Regular expressions (regex) provide powerful pattern matching capabilities for cleaning and processing text data. Common text cleaning tasks include removing URLs, HTML tags, special characters, and normalizing text formats.</p>
        """)

        # Several regex cleaning examples
        url_pattern = r'https?://\S+|www\.\S+'
        html_pattern = r'<.*?>'
        whitespace_pattern = r'\s+'
        email_pattern = r'\S+@\S+'

        # Original text for comparison
        text_cleaned = text_input

        # 1. Remove URLs
        urls_cleaned = re.sub(url_pattern, '[URL]', text_cleaned)

        # 2. Remove HTML tags
        html_cleaned = re.sub(html_pattern, '', urls_cleaned)

        # 3. Remove extra whitespace
        whitespace_cleaned = re.sub(whitespace_pattern, ' ', html_cleaned).strip()

        # 4. Remove email addresses
        email_cleaned = re.sub(email_pattern, '[EMAIL]', whitespace_cleaned)

        # 5. Fix common contractions
        contractions = {
            r"won't": "will not",
            r"can't": "cannot",
            r"n't": " not",
            r"'re": " are",
            r"'s": " is",
            r"'d": " would",
            r"'ll": " will",
            r"'t": " not",
            r"'ve": " have",
            r"'m": " am"
        }
        contraction_cleaned = email_cleaned
        for pattern, replacement in contractions.items():
            contraction_cleaned = re.sub(pattern, replacement, contraction_cleaned)
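        # Illustrative example (hypothetical input, shown only as a comment): the text
        #   "Visit https://example.com or mail me@example.com  -- we can't wait!"
        # comes out of the steps above as
        #   "Visit [URL] or mail [EMAIL] -- we cannot wait!"
        # The contraction substitutions are order-sensitive: "won't" and "can't" must be
        # expanded before the generic "n't" rule, which is why they come first in the dict.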
        # Display the regex cleaning examples in a table
        output_html.append("""
        <h4>Regex Text Cleaning Operations</h4>
        <table>
            <tr><th>Operation</th><th>Regex Pattern</th><th>Description</th></tr>
            <tr><td>URL Removal</td><td>https?://\\S+|www\\.\\S+</td><td>Removes or replaces web URLs in text</td></tr>
            <tr><td>HTML Tag Removal</td><td>&lt;.*?&gt;</td><td>Strips HTML/XML markup tags</td></tr>
            <tr><td>Whitespace Normalization</td><td>\\s+</td><td>Replaces multiple spaces, tabs, and newlines with a single space</td></tr>
            <tr><td>Email Anonymization</td><td>\\S+@\\S+</td><td>Redacts email addresses for privacy</td></tr>
            <tr><td>Contraction Expansion</td><td>Multiple patterns</td><td>Expands contractions like "don't" to "do not"</td></tr>
        </table>
        """)

        # Example of cleaned text
        output_html.append("""
        <h4>Example of Text After Regex Cleaning</h4>
        <p><strong>Before Cleaning</strong></p>
        """)
        output_html.append(f"<div>{text_input}</div>")
        output_html.append("""
        <p><strong>After Regex Cleaning</strong></p>
        """)
        output_html.append(f"<div>{contraction_cleaned}</div>")

        output_html.append("""
        <h4>Why Use Regex for Text Cleaning?</h4>
        """)

        # Word length distribution
        word_lengths = [len(word) for word in text_input.split()]
        fig = plt.figure(figsize=(10, 4))
        plt.hist(word_lengths, bins=range(1, max(word_lengths) + 2), alpha=0.7, color='#1976D2')
        plt.xlabel('Word Length')
        plt.ylabel('Frequency')
        plt.title('Word Length Distribution')
        plt.grid(alpha=0.3)
        plt.tight_layout()

        output_html.append('<h3>Word Length Distribution</h3>')
        output_html.append(fig_to_html(fig))
        # Case Normalization
        output_html.append('<h3>Case Normalization</h3>')

        lowercase_text = text_input.lower()
        uppercase_text = text_input.upper()

        case_html = f"""
        <p><strong>Original Text</strong></p>
        <div>{text_input}</div>
        <p><strong>Lowercase Text</strong></p>
        <div>{lowercase_text}</div>
        <p><strong>Uppercase Text</strong></p>
        <div>{uppercase_text}</div>
        """
        output_html.append(case_html)
        # Remove Punctuation & Special Characters
        output_html.append('<h3>Punctuation & Special Characters Removal</h3>')

        # Count original punctuation
        punc_count = sum([1 for char in text_input if char in string.punctuation])

        # Remove punctuation
        no_punct_text = re.sub(r'[^\w\s]', '', text_input)

        punct_html = f"""
        <p><strong>Original Text</strong></p>
        <div>{text_input}</div>
        <p><em>Contains {punc_count} punctuation marks</em></p>
        <p><strong>Without Punctuation</strong></p>
        <div>{no_punct_text}</div>
        <p><em>Removed {punc_count} punctuation marks</em></p>
        """
        output_html.append(punct_html)

        # Show removed punctuation
        punct_chars = [char for char in text_input if char in string.punctuation]
        punct_freq = Counter(punct_chars)
        if punct_freq:
            output_html.append('<h4>Punctuation Distribution</h4>')
            fig = plt.figure(figsize=(10, 4))
            plt.bar(list(punct_freq.keys()), list(punct_freq.values()), color='#1976D2')
            plt.xlabel('Punctuation')
            plt.ylabel('Frequency')
            plt.title('Punctuation Distribution')
            plt.tight_layout()
            output_html.append(fig_to_html(fig))
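        # Illustrative example (hypothetical input): re.sub(r'[^\w\s]', '', "Hello, world!")
        # returns 'Hello world'. Note that \w includes the underscore, so underscores survive
        # this step even though string.punctuation counts them.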
        # Tokenization
        output_html.append('<h3>Tokenization</h3>')

        # Word tokenization
        words = nltk.word_tokenize(text_input)

        # Create a multi-column layout for word tokens
        output_html.append('<h4>Word Tokens</h4>')
        output_html.append(f'<p>Total tokens: {len(words)} (showing first 50)</p>')

        # Create a multi-column table layout
        tokens_html = """
        <table>
            <tr><th>#</th><th>Token</th><th>Length</th><th>#</th><th>Token</th><th>Length</th><th>#</th><th>Token</th><th>Length</th></tr>
        """
        # Create rows with 3 tokens per row
        for i in range(0, min(50, len(words)), 3):
            tokens_html += "<tr>"
            for j in range(3):
                if i + j < min(50, len(words)):
                    token = words[i + j]
                    tokens_html += f'<td>{i + j + 1}</td><td>{token}</td><td>{len(token)}</td>'
                else:
                    tokens_html += '<td></td><td></td><td></td>'
            tokens_html += "</tr>"
        tokens_html += """
        </table>
        """
        output_html.append(tokens_html)
        # Sentence tokenization
        sentences = nltk.sent_tokenize(text_input)
        output_html.append('<h4>Sentence Tokens</h4>')
        output_html.append(f'<p>Total sentences: {len(sentences)}</p>')
        for i, sentence in enumerate(sentences[:5]):
            output_html.append(f'<p>{i+1}. {sentence}</p>')
        if len(sentences) > 5:
            output_html.append(f'<p>... and {len(sentences) - 5} more sentences.</p>')
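        # Illustrative examples (hypothetical inputs):
        #   nltk.word_tokenize("Don't stop!") -> ['Do', "n't", 'stop', '!']
        #     (the Treebank tokenizer splits contractions and keeps punctuation as tokens)
        #   nltk.sent_tokenize("Hello Mr. Smith. How are you?") -> two sentences, since the
        #     pre-trained Punkt model treats "Mr." as an abbreviation, not a sentence end.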
        # Stopwords Removal
        output_html.append('<h3>Stopwords Removal</h3>')

        stop_words = set(stopwords.words('english'))
        filtered_words = [word for word in words if word.lower() not in stop_words]

        # Count stopwords
        stopword_count = len(words) - len(filtered_words)
        stopword_percentage = (stopword_count / len(words)) * 100 if words else 0

        output_html.append(f"""
        <div>
            <p><strong>Original Words:</strong> {len(words)}</p>
            <p><strong>After Stopword Removal:</strong> {len(filtered_words)}</p>
            <p><strong>Stopwords Removed:</strong> {stopword_count} ({stopword_percentage:.1f}%)</p>
        </div>
        """)

        # Display common stopwords in the text
        text_stopwords = [word for word in words if word.lower() in stop_words]
        stop_freq = Counter(text_stopwords).most_common(10)

        if stop_freq:
            output_html.append('<h4>Most Common Stopwords in Text</h4>')

            # Create a multi-column layout for stopwords
            stopwords_html = """
            <table>
                <tr><th>#</th><th>Stopword</th><th>Frequency</th><th>#</th><th>Stopword</th><th>Frequency</th></tr>
            """
            # Create rows with 2 stopwords per row
            for i in range(0, len(stop_freq), 2):
                stopwords_html += "<tr>"
                for j in range(2):
                    if i + j < len(stop_freq):
                        stopword, freq = stop_freq[i + j]
                        stopwords_html += f'<td>{i + j + 1}</td><td>{stopword}</td><td>{freq}</td>'
                    else:
                        stopwords_html += '<td></td><td></td><td></td>'
                stopwords_html += "</tr>"
            stopwords_html += """
            </table>
            """
            output_html.append(stopwords_html)
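        # Illustrative example (hypothetical input): for words = ['This', 'is', 'a', 'test'],
        # the filter above keeps only ['test'], because 'this', 'is' and 'a' are all in
        # NLTK's English stopword list; matching on word.lower() also removes capitalised
        # stopwords such as 'This'.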
        # Visualization of before and after
        fig, ax = plt.subplots(1, 2, figsize=(12, 5))

        # Before
        ax[0].hist([len(word) for word in words], bins=range(1, 15), alpha=0.7, color='#1976D2')
        ax[0].set_title('Word Length Before Stopword Removal')
        ax[0].set_xlabel('Word Length')
        ax[0].set_ylabel('Frequency')

        # After
        ax[1].hist([len(word) for word in filtered_words], bins=range(1, 15), alpha=0.7, color='#4CAF50')
        ax[1].set_title('Word Length After Stopword Removal')
        ax[1].set_xlabel('Word Length')
        ax[1].set_ylabel('Frequency')

        plt.tight_layout()
        output_html.append(fig_to_html(fig))

        # Stemming and Lemmatization
        output_html.append('<h3>Stemming & Lemmatization</h3>')

        # Apply stemming (Porter Stemmer)
        stemmer = PorterStemmer()
        stemmed_words = [stemmer.stem(word) for word in filtered_words[:100]]  # Limit to first 100 words for performance

        # Apply lemmatization
        lemmatizer = WordNetLemmatizer()
        lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words[:100]]  # Limit to first 100 words

        # Create comparison DataFrame
        comparison_data = []
        for i in range(min(20, len(filtered_words))):  # Show first 20 examples
            if i < len(filtered_words) and filtered_words[i].isalpha():  # Only include alphabetic words
                comparison_data.append({
                    'Original': filtered_words[i],
                    'Stemmed': stemmer.stem(filtered_words[i]),
                    'Lemmatized': lemmatizer.lemmatize(filtered_words[i])
                })
        comparison_df = pd.DataFrame(comparison_data)
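        # Illustrative example (hypothetical inputs): PorterStemmer().stem('studies') gives
        # 'studi', while WordNetLemmatizer().lemmatize('studies') gives 'study'. Stemming
        # chops suffixes heuristically; lemmatization maps to a dictionary form and treats
        # words as nouns unless a POS tag is supplied.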
        output_html.append('<h4>Stemming vs. Lemmatization Comparison</h4>')

        # Create a custom table for stemming vs lemmatization comparison
        comparison_html = """
        <table>
            <tr><th>Original</th><th>Stemmed</th><th>Lemmatized</th></tr>
        """
        # Add comparison data rows
        for _, row in comparison_df.iterrows():
            comparison_html += f"""
            <tr><td>{row['Original']}</td><td>{row['Stemmed']}</td><td>{row['Lemmatized']}</td></tr>
            """
        comparison_html += """
        </table>
        """
        output_html.append(comparison_html)

        output_html.append("""
        <h4>Stemming vs. Lemmatization</h4>
        """)

        # NEW SECTION: N-gram Analysis
        output_html.append('<h3>N-gram Analysis</h3>')
        output_html.append("""
        <p>N-grams are contiguous sequences of n items from text. In NLP, they are used to capture word patterns and relationships, and are helpful for language modeling, prediction, and feature extraction.</p>
        """)

        # Process text for n-grams (use filtered_words to avoid stopwords)
        # Convert to lowercase for consistency
        clean_words = [word.lower() for word in filtered_words if word.isalnum()]

        # Generate n-grams
        bigrams_list = list(ngrams(clean_words, 2))
        trigrams_list = list(ngrams(clean_words, 3))

        # Count frequencies
        bigram_freq = Counter(bigrams_list)
        trigram_freq = Counter(trigrams_list)

        # Get most common
        common_bigrams = bigram_freq.most_common(15)
        common_trigrams = trigram_freq.most_common(15)

        # Format for display
        bigram_labels = [' '.join(bg) for bg, _ in common_bigrams]
        bigram_values = [count for _, count in common_bigrams]
        trigram_labels = [' '.join(tg) for tg, _ in common_trigrams]
        trigram_values = [count for _, count in common_trigrams]

        # Create DataFrames for display
        bigram_df = pd.DataFrame({'Bigram': bigram_labels, 'Frequency': bigram_values})
        trigram_df = pd.DataFrame({'Trigram': trigram_labels, 'Frequency': trigram_values})
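        # Illustrative example (hypothetical input): list(ngrams(['natural', 'language',
        # 'processing'], 2)) returns [('natural', 'language'), ('language', 'processing')];
        # with n=3 it returns the single trigram ('natural', 'language', 'processing').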
        # Explanation of n-grams
        output_html.append("""
        <h4>What are N-grams?</h4>
        <p>N-grams capture contextual relationships between words and are valuable for many NLP tasks including language modeling, machine translation, speech recognition, and text classification.</p>
        """)

        # Create visualizations for bigrams and trigrams
        if bigram_labels and len(bigram_values) > 0:
            # Bigram visualization
            output_html.append('<h4>Most Common Bigrams</h4>')
            fig = plt.figure(figsize=(10, 6))
            plt.barh(range(len(bigram_labels)), bigram_values, align='center', color='#1976D2')
            plt.yticks(range(len(bigram_labels)), bigram_labels)
            plt.xlabel('Frequency')
            plt.title('Most Common Bigrams')
            plt.tight_layout()
            output_html.append(fig_to_html(fig))

            # Create a multi-column layout for bigrams
            bigram_html = """
            <table>
                <tr><th>#</th><th>Bigram</th><th>Freq</th><th>#</th><th>Bigram</th><th>Freq</th></tr>
            """
            # Create rows with 2 bigrams per row
            for i in range(0, len(common_bigrams), 2):
                bigram_html += "<tr>"
                for j in range(2):
                    if i + j < len(common_bigrams):
                        bigram, freq = common_bigrams[i + j]
                        bigram_text = ' '.join(bigram)
                        bigram_html += f'<td>{i + j + 1}</td><td>{bigram_text}</td><td>{freq}</td>'
                    else:
                        bigram_html += '<td></td><td></td><td></td>'
                bigram_html += "</tr>"
            bigram_html += """
            </table>
            """
            output_html.append(bigram_html)
        else:
            output_html.append('<p>Not enough text to generate meaningful bigrams.</p>')

        if trigram_labels and len(trigram_values) > 0:
            # Trigram visualization
            output_html.append('<h4>Most Common Trigrams</h4>')
            fig = plt.figure(figsize=(10, 6))
            plt.barh(range(len(trigram_labels)), trigram_values, align='center', color='#4CAF50')
            plt.yticks(range(len(trigram_labels)), trigram_labels)
            plt.xlabel('Frequency')
            plt.title('Most Common Trigrams')
            plt.tight_layout()
            output_html.append(fig_to_html(fig))

            # Create a multi-column layout for trigrams
            trigram_html = """
            <table>
                <tr><th>#</th><th>Trigram</th><th>Freq</th><th>#</th><th>Trigram</th><th>Freq</th></tr>
            """
            # Create rows with 2 trigrams per row
            for i in range(0, len(common_trigrams), 2):
                trigram_html += "<tr>"
                for j in range(2):
                    if i + j < len(common_trigrams):
                        trigram, freq = common_trigrams[i + j]
                        trigram_text = ' '.join(trigram)
                        trigram_html += f'<td>{i + j + 1}</td><td>{trigram_text}</td><td>{freq}</td>'
                    else:
                        trigram_html += '<td></td><td></td><td></td>'
                trigram_html += "</tr>"
            trigram_html += """
            </table>
            """
            output_html.append(trigram_html)
        else:
            output_html.append('<p>Not enough text to generate meaningful trigrams.</p>')

        # Applications of N-grams
        output_html.append("""
        <h4>Applications of N-gram Analysis</h4>
        """)
        # Word Cloud
        output_html.append('<h3>Word Cloud</h3>')
        try:
            # Create word cloud from filtered words
            wordcloud_text = ' '.join(filtered_words)
            wordcloud = WordCloud(
                width=800,
                height=400,
                background_color='white',
                colormap='viridis',
                max_words=100,
                contour_width=1,
                contour_color='#1976D2'
            ).generate(wordcloud_text)

            # Display word cloud
            fig = plt.figure(figsize=(12, 8))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.tight_layout()
            output_html.append(fig_to_html(fig))
        except Exception as e:
            output_html.append(f"<p>Failed to generate word cloud: {str(e)}</p>")
        # Word Frequency
        output_html.append('<h3>Word Frequency Analysis</h3>')

        # Calculate word frequencies
        word_freq = Counter(filtered_words)
        most_common = word_freq.most_common(20)
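        # Illustrative example (hypothetical input): Counter(['nlp', 'text', 'nlp']).most_common(2)
        # returns [('nlp', 2), ('text', 1)] -- (word, count) pairs sorted by descending count.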
        # Create DataFrame
        freq_df = pd.DataFrame(most_common, columns=['Word', 'Frequency'])

        # Create horizontal bar chart
        fig = plt.figure(figsize=(12, 16))
        plt.barh(range(len(most_common)), [val[1] for val in most_common], align='center', color='#1976D2')
        plt.yticks(range(len(most_common)), [val[0] for val in most_common])
        plt.xlabel('Frequency')
        plt.title('Top 20 Words')
        plt.subplots_adjust(left=0.15, right=0.95, top=0.95, bottom=0.1)
        plt.tight_layout(pad=3.0)

        # Render chart
        output_html.append(fig_to_html(fig))

        # Create a multi-column layout for word frequency
        freq_html = """
        <table>
            <tr><th>#</th><th>Word</th><th>Freq</th><th>#</th><th>Word</th><th>Freq</th></tr>
        """
        # Create rows with 2 words per row
        for i in range(0, len(most_common), 2):
            freq_html += "<tr>"
            for j in range(2):
                if i + j < len(most_common):
                    word, freq = most_common[i + j]
                    freq_html += f'<td>{i + j + 1}</td><td>{word}</td><td>{freq}</td>'
                else:
                    freq_html += '<td></td><td></td><td></td>'
            freq_html += "</tr>"
        freq_html += """
        </table>
        """
        output_html.append(freq_html)
    except Exception as e:
        output_html.append(f"""
        <h3>Error</h3>
        <p>Failed to process text: {str(e)}</p>
        """)

    # About text preprocessing
    output_html.append("""
    <h3>About Text Preprocessing</h3>

    <h4>What is Text Preprocessing?</h4>
    <p>Text preprocessing is the first step in an NLP pipeline: it transforms raw text into a clean, structured format suitable for analysis. It includes various techniques to standardize text and reduce noise.</p>

    <h4>Common Preprocessing Steps:</h4>
    <ul>
        <li><strong>Tokenization</strong> - Splitting text into individual words or sentences</li>
        <li><strong>Normalization</strong> - Converting text to lowercase, removing accents, etc.</li>
        <li><strong>Noise Removal</strong> - Removing punctuation, special characters, HTML tags, etc.</li>
        <li><strong>Stopword Removal</strong> - Filtering out common words that add little meaning</li>
        <li><strong>Stemming/Lemmatization</strong> - Reducing words to their root forms</li>
        <li><strong>Spelling Correction</strong> - Fixing typos and errors</li>
    </ul>

    <h4>Why Preprocess Text?</h4>
    <ul>
        <li>Reduces dimensionality and noise in the data</li>
        <li>Standardizes text for consistent analysis</li>
        <li>Improves performance of downstream NLP tasks</li>
        <li>Makes text more suitable for machine learning models</li>
    </ul>
    """)

    output_html.append('</div>')  # Close result-area div

    return '\n'.join(output_html)
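

# Minimal usage sketch (illustrative only): it assumes the utils package from this
# project is importable and that the NLTK corpora can be downloaded on first run.
# It writes the generated report to a local HTML file for a quick visual check.
if __name__ == "__main__":
    sample_text = (
        "Dr. Smith can't attend the meeting at https://example.com today. "
        "Please email smith@example.com if you have questions!"
    )
    html_report = preprocessing_handler(sample_text)
    with open("preprocessing_report.html", "w", encoding="utf-8") as f:
        f.write(html_report)
    print("Report written to preprocessing_report.html")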