import matplotlib.pyplot as plt
import pandas as pd
import nltk
import networkx as nx
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from matplotlib_venn import venn2
from utils.model_loader import load_summarizer
from utils.helpers import fig_to_html, df_to_html_table
def summarization_handler(text_input, min_length=30, max_length=300, use_sampling=False):
"""Show text summarization capabilities."""
output_html = []
# Add result area container
output_html.append('<div class="result-area">')
output_html.append('<h2>Text Summarization</h2>')
output_html.append("""
Text summarization condenses text to capture its main points, enabling quicker comprehension of large volumes of information.
""")
# Model info
output_html.append("""
Models & Techniques Used:
- Extractive Summarization - Selects important sentences from the original text
- Abstractive Summarization - BART model fine-tuned on CNN/DM dataset to generate new summary text
- Performance - ROUGE scores of approximately 40-45 on CNN/DM benchmark
""")
try:
# Check if text is long enough for summarization
sentences = nltk.sent_tokenize(text_input)
word_count = len(text_input.split())
if len(sentences) < 3 or word_count < 40:
output_html.append(f"""
Text Too Short for Summarization
The provided text contains only {len(sentences)} sentences and {word_count} words.
For effective summarization, please provide a longer text (at least 3 sentences and 40 words).
""")
else:
# Original Text Section
output_html.append('<h3>Original Text</h3>')
output_html.append(f"""
Length: {word_count} words.
""")
# Text Statistics
char_count = len(text_input)
avg_sentence_length = word_count / len(sentences)
avg_word_length = sum(len(word) for word in text_input.split()) / word_count
output_html.append(f"""
Characters: {char_count} | Avg. sentence length: {avg_sentence_length:.1f} words | Avg. word length: {avg_word_length:.1f} characters
""")
# Neural Summarization Section
output_html.append('<h3>Abstractive Summarization (Neural)</h3>')
output_html.append('<p>Using a BART model to generate a human-like summary.</p>')
# Parameter summary
output_html.append(f"""
Parameters: Min Length: {min_length} | Max Length: {max_length} | Sampling: {'Enabled' if use_sampling else 'Disabled'}
""")
# Defaults so the comparison section still renders if abstractive summarization fails
abstractive_summary = ""
abstractive_word_count = 0
abstractive_reduction = 0.0
try:
# Load summarizer model
summarizer = load_summarizer()
if summarizer is None:
output_html.append("""
Failed to load the abstractive summarization model. This may be due to memory constraints or missing dependencies.
""")
else:
# Check length limitations
max_token_limit = 1024 # BART typically has 1024 token limit
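# Note: the limit is in subword tokens, not words, so word_count is only a rough proxy.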
# If text is too long, warn user and truncate
if word_count > max_token_limit:
output_html.append(f"""
⚠️ Note: Text exceeds model's length limit. Only the first ~{max_token_limit} tokens will be used for summarization.
""")
# Generate summary using the specified min_length and max_length
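# Sampling (do_sample=True) with temperature/top_p trades determinism for variety;
# length_penalty=2.0 biases beam search toward longer summaries.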
abstractive_results = summarizer(
    text_input,
    max_length=max_length,
    min_length=min_length,
    do_sample=use_sampling,
    temperature=0.7 if use_sampling else 1.0,
    top_p=0.9 if use_sampling else 1.0,
    length_penalty=2.0,
    truncation=True  # truncate inputs that exceed the model's token limit
)
abstractive_summary = abstractive_results[0]['summary_text']
# Calculate reduction statistics
abstractive_word_count = len(abstractive_summary.split())
abstractive_reduction = (1 - abstractive_word_count / word_count) * 100
# Summary Results
output_html.append(f"""
Abstractive Summary ({abstractive_reduction:.1f}% reduction)
{abstractive_summary}
Original Length: {word_count} words | Summary Length: {abstractive_word_count} words | Compression: {abstractive_reduction:.1f}%
""")
# Key Terms & Topics Section
output_html.append('<h3>Key Terms &amp; Topics</h3>')
# Extract key terms with TF-IDF
key_terms = extract_key_terms(text_input, n=10)
# Create layout stacked vertically: table first, then chart
output_html.append('<div class="stacked-layout">')
# Row 1: Key terms table (full width)
output_html.append('<div class="row">')
output_html.append('<div class="column">')
output_html.append('<h4>Key Terms</h4>')
# Create key terms table
terms_df = pd.DataFrame({
'#': range(1, len(key_terms) + 1),
'Keyword': [term[0] for term in key_terms],
'TF-IDF Score': [f"{term[1]:.4f}" for term in key_terms]
})
output_html.append(df_to_html_table(terms_df))
output_html.append('</div>')  # Close row 1 column
output_html.append('</div>')  # Close row 1
# Row 2: Term importance chart (full width)
output_html.append('<div class="row">')
output_html.append('<div class="column">')
output_html.append('<h4>Term Importance</h4>')
# Create horizontal bar chart of key terms
fig = plt.figure(figsize=(10, 8))
# Reverse the order for bottom-to-top display
terms = [term[0] for term in key_terms]
scores = [term[1] for term in key_terms]
# Sort by score for better visualization
sorted_data = sorted(zip(terms, scores), key=lambda x: x[1])
terms = [x[0] for x in sorted_data]
scores = [x[1] for x in sorted_data]
# Create horizontal bar chart
plt.barh(terms, scores, color='#1976D2')
plt.xlabel('TF-IDF Score')
plt.ylabel('Keyword')
plt.title('Key Terms by TF-IDF Score')
plt.tight_layout()
output_html.append(fig_to_html(fig))
output_html.append('</div>')  # Close row 2 column
output_html.append('</div>')  # Close row 2
output_html.append('</div>')  # Close stacked layout
except Exception as e:
output_html.append(f"""
Abstractive Summarization Error
Failed to perform abstractive summarization: {str(e)}
""")
# Extractive Summarization
output_html.append('<h3>Extractive Summarization</h3>')
output_html.append("""
Extractive summarization works by identifying important sentences in the text and extracting them to form a summary.
This implementation uses a variant of the TextRank algorithm, which is based on Google's PageRank.
""")
# Perform TextRank Summarization
num_summary_sentences = min(3, max(1, len(sentences) // 3))
extractive_summary = textrank_summarize(text_input, num_sentences=num_summary_sentences)
# Clean up the placeholder separator
extractive_summary = extractive_summary.replace("SENTBREAKOS.OS", " ")
# Calculate reduction statistics
extractive_word_count = len(extractive_summary.split())
extractive_reduction = (1 - extractive_word_count / word_count) * 100
output_html.append(f"""
Extractive Summary ({extractive_reduction:.1f}% reduction)
{extractive_summary}
""")
# Sentence importance visualization
output_html.append('<h4>Sentence Importance</h4>')
output_html.append('<p>The graph below shows the relative importance of each sentence based on the TextRank algorithm:</p>')
# Get sentence scores from TextRank
sentence_scores = textrank_sentence_scores(text_input)
# Sort sentences by their original order
sentence_items = list(sentence_scores.items())
sentence_items.sort(key=lambda x: int(x[0].split('_')[1]))
# Create visualization
fig = plt.figure(figsize=(10, 6))
bars = plt.bar(
[f"Sent {item[0].split('_')[1]}" for item in sentence_items],
[item[1] for item in sentence_items],
color='#1976D2'
)
# Highlight the top-scoring sentences that were selected for the extractive summary
top_ids = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_summary_sentences]
selected_indices = [int(idx.split('_')[1]) for idx in top_ids]
for i, bar in enumerate(bars):
if i+1 in selected_indices:
bar.set_color('#4CAF50')
plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02,
'Selected', ha='center', va='bottom', fontsize=8, rotation=90)
plt.xlabel('Sentence')
plt.ylabel('Importance Score')
plt.title('Sentence Importance Based on TextRank')
plt.xticks(rotation=45)
plt.tight_layout()
output_html.append(fig_to_html(fig))
# Compare the two approaches
output_html.append('<h3>Comparing the Two Approaches</h3>')
# Calculate overlap between summaries
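# Overlap is the Sørensen-Dice coefficient expressed as a percentage:
# |A ∩ B| / ((|A| + |B|) / 2), where A and B are the two summaries' word sets.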
extractive_words = set(re.findall(r'\b\w+\b', extractive_summary.lower()))
abstractive_words = set(re.findall(r'\b\w+\b', abstractive_summary.lower()))
common_words = extractive_words.intersection(abstractive_words)
if len(extractive_words) > 0 and len(abstractive_words) > 0:
overlap_percentage = len(common_words) / ((len(extractive_words) + len(abstractive_words)) / 2) * 100
else:
overlap_percentage = 0
# Create comparison table
comparison_data = {
'Metric': ['Word Count', 'Reduction %', 'Sentences', 'Words per Sentence', 'Unique Words'],
'Extractive': [
extractive_word_count,
f"{extractive_reduction:.1f}%",
len(nltk.sent_tokenize(extractive_summary)),
f"{extractive_word_count / max(1, len(nltk.sent_tokenize(extractive_summary))):.1f}",
len(extractive_words)
],
'Abstractive': [
abstractive_word_count,
f"{abstractive_reduction:.1f}%",
len(nltk.sent_tokenize(abstractive_summary)),
f"{abstractive_word_count / max(1, len(nltk.sent_tokenize(abstractive_summary))):.1f}",
len(abstractive_words)
]
}
comparison_df = pd.DataFrame(comparison_data)
output_html.append('<div class="row">')
# Column 1: Comparison table
output_html.append('<div class="column">')
output_html.append('<h4>Summary Statistics</h4>')
output_html.append(df_to_html_table(comparison_df))
output_html.append('</div>')  # Close column 1
# Column 2: Venn diagram of word overlap
output_html.append('<div class="column">')
output_html.append('<h4>Word Overlap Visualization</h4>')
# Create Venn diagram
fig = plt.figure(figsize=(8, 6))
venn = venn2(
subsets=(
len(extractive_words - abstractive_words),
len(abstractive_words - extractive_words),
len(common_words)
),
set_labels=('Extractive', 'Abstractive')
)
# Set region colors (a patch can be None when its region is empty)
for region_id, color in (('10', '#4CAF50'), ('01', '#03A9F4'), ('11', '#9C27B0')):
    patch = venn.get_patch_by_id(region_id)
    if patch is not None:
        patch.set_color(color)
plt.title('Word Overlap Between Summaries')
plt.text(0, -0.25, f"Overlap: {overlap_percentage:.1f}%", ha='center')
output_html.append(fig_to_html(fig))
# Show key shared and unique words
shared_words_list = list(common_words)
extractive_only = list(extractive_words - abstractive_words)
abstractive_only = list(abstractive_words - extractive_words)
# Limit the number of words shown
max_words = 10
output_html.append(f"""
Key Shared Words ({min(max_words, len(shared_words_list))} of {len(shared_words_list)})
{' '.join([f'{word}' for word in shared_words_list[:max_words]])}
Unique to Extractive ({min(max_words, len(extractive_only))} of {len(extractive_only)})
{' '.join([f'{word}' for word in extractive_only[:max_words]])}
Unique to Abstractive ({min(max_words, len(abstractive_only))} of {len(abstractive_only)})
{' '.join([f'{word}' for word in abstractive_only[:max_words]])}
""")
output_html.append('</div>')  # Close column 2
output_html.append('</div>')  # Close row
except Exception as e:
output_html.append(f"""
Error
Failed to summarize text: {str(e)}
""")
# About Text Summarization section
output_html.append("""
What is Text Summarization?
Text summarization is the process of creating a shorter version of a text while preserving its key information
and meaning. It helps users quickly grasp the main points without reading the entire document.
Two Main Approaches:
- Extractive Summarization: Selects and extracts existing sentences from the source text based on their importance
- Abstractive Summarization: Generates new sentences that capture the meaning of the source text (similar to how humans write summaries)
Applications:
- News digests - Quick summaries of news articles
- Research papers - Condensing long academic papers
- Legal documents - Summarizing complex legal text
- Meeting notes - Extracting key points from discussions
- Content curation - Creating snippets for content recommendations
""")
output_html.append('</div>')  # Close result-area div
return '\n'.join(output_html)
def extract_key_terms(text, n=10):
"""Extract key terms using TF-IDF"""
try:
# Tokenize and preprocess
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# Tokenize and clean text
words = word_tokenize(text.lower())
words = [lemmatizer.lemmatize(word) for word in words
if word.isalnum() and word not in stop_words and len(word) > 2]
# Create document for TF-IDF
document = [' '.join(words)]
# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=100)
tfidf_matrix = vectorizer.fit_transform(document)
# Get feature names and scores
feature_names = vectorizer.get_feature_names_out()
scores = tfidf_matrix.toarray()[0]
# Create term-score pairs and sort by score
term_scores = [(term, score) for term, score in zip(feature_names, scores)]
term_scores.sort(key=lambda x: x[1], reverse=True)
return term_scores[:n]
except Exception as e:
print(f"Error extracting key terms: {str(e)}")
return [("term", 0.0) for _ in range(n)] # Return empty placeholder
# TextRank extractive summarization algorithm
def textrank_summarize(text, num_sentences=3):
"""Generate an extractive summary using TextRank algorithm"""
# Tokenize text into sentences
sentences = sent_tokenize(text)
# If text is too short, return the original text
if len(sentences) <= num_sentences:
return text
# Build a graph of sentences with similarity edges
sentence_scores = textrank_sentence_scores(text)
# Sort sentences by score
# Look scores up by sentence ID rather than relying on dict value ordering
ranked_sentences = sorted(
    [(sentence_scores[f"sentence_{i+1}"], i, s) for i, s in enumerate(sentences)],
    reverse=True
)
# Select top sentences based on score
selected_sentences = sorted(ranked_sentences[:num_sentences], key=lambda x: x[1])
# Combine selected sentences
summary = "SENTBREAKOS.OS".join([s[2] for s in selected_sentences])
return summary
def textrank_sentence_scores(text):
"""Generate sentence scores using TextRank algorithm"""
# Tokenize text into sentences
sentences = sent_tokenize(text)
# Create sentence IDs
sentence_ids = [f"sentence_{i+1}" for i in range(len(sentences))]
# Create sentence graph
G = nx.Graph()
# Add nodes
for sentence_id in sentence_ids:
G.add_node(sentence_id)
# Remove stopwords and preprocess sentences
stop_words = set(stopwords.words('english'))
sentence_words = []
for sentence in sentences:
words = [word.lower() for word in word_tokenize(sentence) if word.lower() not in stop_words and word.isalnum()]
sentence_words.append(words)
# Add edges based on sentence similarity
for i in range(len(sentence_ids)):
for j in range(i+1, len(sentence_ids)):
similarity = sentence_similarity(sentence_words[i], sentence_words[j])
if similarity > 0:
G.add_edge(sentence_ids[i], sentence_ids[j], weight=similarity)
# Run PageRank
scores = nx.pagerank(G)
return scores
def sentence_similarity(words1, words2):
"""Calculate similarity between two sentences based on word overlap"""
if not words1 or not words2:
return 0
# Convert to sets for intersection
set1 = set(words1)
set2 = set(words2)
# Jaccard similarity
intersection = len(set1.intersection(set2))
union = len(set1.union(set2))
if union == 0:
return 0
return intersection / union
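# A minimal usage sketch (an assumption, not part of the handler above): it
# expects the NLTK data packages `punkt`, `stopwords`, and `wordnet` to be
# downloadable, and `utils.model_loader` / `utils.helpers` to be importable.
# Loading the BART summarizer may download a large model on first run.
if __name__ == "__main__":
    for resource in ("punkt", "stopwords", "wordnet"):
        nltk.download(resource, quiet=True)
    sample = (
        "The James Webb Space Telescope is the largest optical telescope in space. "
        "It was launched in December 2021 and orbits the Sun near the second Lagrange point. "
        "Its high resolution lets it observe objects too old and distant for Hubble. "
        "Astronomers use it to study the formation of the first galaxies. "
        "The telescope's primary mirror consists of 18 hexagonal gold-coated segments."
    )
    html = summarization_handler(sample, min_length=20, max_length=60)
    print(html[:500])  # preview the first part of the generated HTML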