import matplotlib.pyplot as plt import pandas as pd import numpy as np import nltk from collections import Counter import networkx as nx from nltk.corpus import stopwords from nltk.tokenize import sent_tokenize, word_tokenize from nltk.stem import WordNetLemmatizer import re from sklearn.feature_extraction.text import TfidfVectorizer from matplotlib_venn import venn2 from utils.model_loader import load_summarizer from utils.helpers import fig_to_html, df_to_html_table def summarization_handler(text_input, min_length=30, max_length=300, use_sampling=False): """Show text summarization capabilities.""" output_html = [] # Add result area container output_html.append('
') output_html.append('

Text Summarization

') output_html.append("""
Text summarization condenses text to capture its main points, enabling quicker comprehension of large volumes of information.
""") # Model info output_html.append("""

Models & Techniques Used:

""") try: # Check if text is long enough for summarization sentences = nltk.sent_tokenize(text_input) word_count = len(text_input.split()) if len(sentences) < 3 or word_count < 40: output_html.append(f"""

Text Too Short for Summarization

The provided text contains only {len(sentences)} sentences and {word_count} words. For effective summarization, please provide a longer text (at least 3 sentences and 40 words).

""") else: # Original Text Section output_html.append('

Original Text

') output_html.append(f"""
{text_input}

Length: {word_count} words.

""") # Text Statistics char_count = len(text_input) avg_sentence_length = word_count / len(sentences) avg_word_length = sum(len(word) for word in text_input.split()) / word_count # Neural Summarization Section output_html.append('

Neural Abstractive Summarization

') output_html.append('

Using BART model to generate a human-like summary

') # Parameter summary output_html.append(f"""
Parameters: Min Length: {min_length} | Max Length: {max_length} | Sampling: {'Enabled' if use_sampling else 'Disabled'}
""") try: # Load summarizer model summarizer = load_summarizer() if summarizer is None: output_html.append("""

Failed to load the abstractive summarization model. This may be due to memory constraints or missing dependencies.

""") else: # Check length limitations max_token_limit = 1024 # BART typically has 1024 token limit # If text is too long, warn user and truncate if word_count > max_token_limit: output_html.append(f"""

⚠️ Note: Text exceeds model's length limit. Only the first ~{max_token_limit} tokens will be used for summarization.

""") # Generate summary using the specified min_length and max_length abstractive_results = summarizer( text_input, max_length=max_length, min_length=min_length, do_sample=use_sampling, temperature=0.7 if use_sampling else 1.0, top_p=0.9 if use_sampling else 1.0, length_penalty=2.0 ) abstractive_summary = abstractive_results[0]['summary_text'] # Calculate reduction statistics abstractive_word_count = len(abstractive_summary.split()) abstractive_reduction = (1 - abstractive_word_count / word_count) * 100 # Summary Results output_html.append(f"""

Neural Summary

{abstractive_summary}
Original Length

{word_count} words

Summary Length

{abstractive_word_count} words

Compression

{abstractive_reduction:.1f}%

""") # Key Terms & Topics Section output_html.append('

Key Topics & Terms

') # Extract key terms with TF-IDF key_terms = extract_key_terms(text_input, n=10) # Create layout stacked vertically: table first, then chart output_html.append('
') # Row 1: Key terms table (full width) output_html.append('
') output_html.append('

Key Terms

') # Create key terms table terms_df = pd.DataFrame({ '#': range(1, len(key_terms) + 1), 'Keyword': [term[0] for term in key_terms], 'TF-IDF Score': [f"{term[1]:.4f}" for term in key_terms] }) output_html.append(df_to_html_table(terms_df)) output_html.append('
') # Close row 1 column output_html.append('
') # Close row 1 # Row 2: Term importance chart (full width) output_html.append('
') output_html.append('
') output_html.append('

Term Importance

') # Create horizontal bar chart of key terms fig = plt.figure(figsize=(10, 8)) # Reverse the order for bottom-to-top display terms = [term[0] for term in key_terms] scores = [term[1] for term in key_terms] # Sort by score for better visualization sorted_data = sorted(zip(terms, scores), key=lambda x: x[1]) terms = [x[0] for x in sorted_data] scores = [x[1] for x in sorted_data] # Create horizontal bar chart plt.barh(terms, scores, color='#1976D2') plt.xlabel('TF-IDF Score') plt.ylabel('Keyword') plt.title('Key Terms by TF-IDF Score') plt.tight_layout() output_html.append(fig_to_html(fig)) output_html.append('
') # Close row 2 column output_html.append('
') # Close row 2 except Exception as e: output_html.append(f"""

Abstractive Summarization Error

Failed to perform abstractive summarization: {str(e)}

""") # Extractive Summarization output_html.append('

Extractive Summarization

') output_html.append("""

Extractive summarization works by identifying important sentences in the text and extracting them to form a summary. This implementation uses a variant of the TextRank algorithm, which is based on Google's PageRank.

""") # Perform TextRank Summarization extractive_summary = textrank_summarize(text_input, num_sentences=min(3, max(1, len(sentences) // 3))) # Clean up the placeholder separator extractive_summary = extractive_summary.replace("SENTBREAKOS.OS", " ") # Calculate reduction statistics extractive_word_count = len(extractive_summary.split()) extractive_reduction = (1 - extractive_word_count / word_count) * 100 output_html.append(f"""

Extractive Summary ({extractive_reduction:.1f}% reduction)

{extractive_summary}
""") # Sentence importance visualization output_html.append('

Sentence Importance

') output_html.append('

The graph below shows the relative importance of each sentence based on the TextRank algorithm:

') # Get sentence scores from TextRank sentence_scores = textrank_sentence_scores(text_input) # Sort sentences by their original order sentence_items = list(sentence_scores.items()) sentence_items.sort(key=lambda x: int(x[0].split('_')[1])) # Create visualization fig = plt.figure(figsize=(10, 6)) bars = plt.bar( [f"Sent {item[0].split('_')[1]}" for item in sentence_items], [item[1] for item in sentence_items], color='#1976D2' ) # Highlight selected sentences selected_indices = [int(idx.split('_')[1]) for idx in sentence_scores.keys() if idx in extractive_summary.split('SENTBREAKOS.OS')] for i, bar in enumerate(bars): if i+1 in selected_indices: bar.set_color('#4CAF50') plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02, 'Selected', ha='center', va='bottom', fontsize=8, rotation=90) plt.xlabel('Sentence') plt.ylabel('Importance Score') plt.title('Sentence Importance Based on TextRank') plt.xticks(rotation=45) plt.tight_layout() output_html.append(fig_to_html(fig)) # Compare the two approaches output_html.append('

Summary Comparison

') # Calculate overlap between summaries extractive_words = set(re.findall(r'\b\w+\b', extractive_summary.lower())) abstractive_words = set(re.findall(r'\b\w+\b', abstractive_summary.lower())) common_words = extractive_words.intersection(abstractive_words) if len(extractive_words) > 0 and len(abstractive_words) > 0: overlap_percentage = len(common_words) / ((len(extractive_words) + len(abstractive_words)) / 2) * 100 else: overlap_percentage = 0 # Create comparison table comparison_data = { 'Metric': ['Word Count', 'Reduction %', 'Sentences', 'Words per Sentence', 'Unique Words'], 'Extractive': [ extractive_word_count, f"{extractive_reduction:.1f}%", len(nltk.sent_tokenize(extractive_summary)), f"{extractive_word_count / max(1, len(nltk.sent_tokenize(extractive_summary))):.1f}", len(extractive_words) ], 'Abstractive': [ abstractive_word_count, f"{abstractive_reduction:.1f}%", len(nltk.sent_tokenize(abstractive_summary)), f"{abstractive_word_count / max(1, len(nltk.sent_tokenize(abstractive_summary))):.1f}", len(abstractive_words) ] } comparison_df = pd.DataFrame(comparison_data) output_html.append('
') # Column 1: Comparison table output_html.append('
') output_html.append('

Summary Statistics

') output_html.append(df_to_html_table(comparison_df)) output_html.append('
') # Column 2: Venn diagram of word overlap output_html.append('
') output_html.append('

Word Overlap Visualization

') # Create Venn diagram fig = plt.figure(figsize=(8, 6)) venn = venn2( subsets=( len(extractive_words - abstractive_words), len(abstractive_words - extractive_words), len(common_words) ), set_labels=('Extractive', 'Abstractive') ) # Set colors venn.get_patch_by_id('10').set_color('#4CAF50') venn.get_patch_by_id('01').set_color('#03A9F4') venn.get_patch_by_id('11').set_color('#9C27B0') plt.title('Word Overlap Between Summaries') plt.text(0, -0.25, f"Overlap: {overlap_percentage:.1f}%", ha='center') output_html.append(fig_to_html(fig)) # Show key shared and unique words shared_words_list = list(common_words) extractive_only = list(extractive_words - abstractive_words) abstractive_only = list(abstractive_words - extractive_words) # Limit the number of words shown max_words = 10 output_html.append(f"""
Key Shared Words ({min(max_words, len(shared_words_list))} of {len(shared_words_list)})
{' '.join([f'{word}' for word in shared_words_list[:max_words]])}
Unique to Extractive ({min(max_words, len(extractive_only))} of {len(extractive_only)})
{' '.join([f'{word}' for word in extractive_only[:max_words]])}
Unique to Abstractive ({min(max_words, len(abstractive_only))} of {len(abstractive_only)})
{' '.join([f'{word}' for word in abstractive_only[:max_words]])}
""") output_html.append('
') # Close column 2 output_html.append('
') # Close row except Exception as e: output_html.append(f"""

Error

Failed to summarize text: {str(e)}

""") # About Text Summarization section output_html.append("""

About Text Summarization

What is Text Summarization?

Text summarization is the process of creating a shorter version of a text while preserving its key information and meaning. It helps users quickly grasp the main points without reading the entire document.

Two Main Approaches:
  • Extractive Summarization: Selects and extracts existing sentences from the source text based on their importance
  • Abstractive Summarization: Generates new sentences that capture the meaning of the source text (similar to how humans write summaries)
Applications:
  • News digests - Quick summaries of news articles
  • Research papers - Condensing long academic papers
  • Legal documents - Summarizing complex legal text
  • Meeting notes - Extracting key points from discussions
  • Content curation - Creating snippets for content recommendations
""") output_html.append('
') # Close result-area div return '\n'.join(output_html) def extract_key_terms(text, n=10): """Extract key terms using TF-IDF""" try: # Tokenize and preprocess stop_words = set(stopwords.words('english')) lemmatizer = WordNetLemmatizer() # Tokenize and clean text words = word_tokenize(text.lower()) words = [lemmatizer.lemmatize(word) for word in words if word.isalnum() and word not in stop_words and len(word) > 2] # Create document for TF-IDF document = [' '.join(words)] # Create TF-IDF vectorizer vectorizer = TfidfVectorizer(max_features=100) tfidf_matrix = vectorizer.fit_transform(document) # Get feature names and scores feature_names = vectorizer.get_feature_names_out() scores = tfidf_matrix.toarray()[0] # Create term-score pairs and sort by score term_scores = [(term, score) for term, score in zip(feature_names, scores)] term_scores.sort(key=lambda x: x[1], reverse=True) return term_scores[:n] except Exception as e: print(f"Error extracting key terms: {str(e)}") return [("term", 0.0) for _ in range(n)] # Return empty placeholder # TextRank extractive summarization algorithm def textrank_summarize(text, num_sentences=3): """Generate an extractive summary using TextRank algorithm""" # Tokenize text into sentences sentences = sent_tokenize(text) # If text is too short, return the original text if len(sentences) <= num_sentences: return text # Build a graph of sentences with similarity edges sentence_scores = textrank_sentence_scores(text) # Sort sentences by score ranked_sentences = sorted([(score, i, s) for i, (s, score) in enumerate(zip(sentences, sentence_scores.values()))], reverse=True) # Select top sentences based on score selected_sentences = sorted(ranked_sentences[:num_sentences], key=lambda x: x[1]) # Combine selected sentences summary = "SENTBREAKOS.OS".join([s[2] for s in selected_sentences]) return summary def textrank_sentence_scores(text): """Generate sentence scores using TextRank algorithm""" # Tokenize text into sentences sentences = sent_tokenize(text) # Create sentence IDs sentence_ids = [f"sentence_{i+1}" for i in range(len(sentences))] # Create sentence graph G = nx.Graph() # Add nodes for sentence_id in sentence_ids: G.add_node(sentence_id) # Remove stopwords and preprocess sentences stop_words = set(stopwords.words('english')) sentence_words = [] for sentence in sentences: words = [word.lower() for word in word_tokenize(sentence) if word.lower() not in stop_words and word.isalnum()] sentence_words.append(words) # Add edges based on sentence similarity for i in range(len(sentence_ids)): for j in range(i+1, len(sentence_ids)): similarity = sentence_similarity(sentence_words[i], sentence_words[j]) if similarity > 0: G.add_edge(sentence_ids[i], sentence_ids[j], weight=similarity) # Run PageRank scores = nx.pagerank(G) return scores def sentence_similarity(words1, words2): """Calculate similarity between two sentences based on word overlap""" if not words1 or not words2: return 0 # Convert to sets for intersection set1 = set(words1) set2 = set(words2) # Jaccard similarity intersection = len(set1.intersection(set2)) union = len(set1.union(set2)) if union == 0: return 0 return intersection / union