import matplotlib.pyplot as plt
import pandas as pd
import nltk
import networkx as nx
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from matplotlib_venn import venn2
from utils.model_loader import load_summarizer
from utils.helpers import fig_to_html, df_to_html_table
def summarization_handler(text_input, min_length=30, max_length=300, use_sampling=False):
"""Show text summarization capabilities."""
output_html = []
# Add result area container
output_html.append('<div class="result-area">')
output_html.append('<h2>Text Summarization</h2>')
output_html.append("""
Text summarization condenses text to capture its main points, enabling quicker comprehension of large volumes of information.
""")
# Model info
output_html.append("""
Models & Techniques Used:
- Extractive Summarization - Selects important sentences from the original text
- Abstractive Summarization - BART model fine-tuned on CNN/DM dataset to generate new summary text
- Performance - ROUGE scores of approximately 40-45 on CNN/DM benchmark
""")
try:
# Check if text is long enough for summarization
sentences = nltk.sent_tokenize(text_input)
word_count = len(text_input.split())
if len(sentences) < 3 or word_count < 40:
output_html.append(f"""
Text Too Short for Summarization
The provided text contains only {len(sentences)} sentences and {word_count} words.
For effective summarization, please provide a longer text (at least 3 sentences and 40 words).
""")
else:
# Original Text Section
output_html.append('<h3>Original Text</h3>')
output_html.append(f"""
Length: {word_count} words.
""")
# Text Statistics
char_count = len(text_input)
avg_sentence_length = word_count / len(sentences)
avg_word_length = sum(len(word) for word in text_input.split()) / word_count
output_html.append(f"""
Characters: {char_count} | Avg. sentence length: {avg_sentence_length:.1f} words | Avg. word length: {avg_word_length:.1f} characters
""")
# Neural Summarization Section
output_html.append('<h3>Abstractive Summarization (Neural)</h3>')
output_html.append('<p>Using a BART model to generate a human-like summary.</p>')
# Parameter summary
output_html.append(f"""
Parameters: Min Length: {min_length} | Max Length: {max_length} | Sampling: {'Enabled' if use_sampling else 'Disabled'}
""")
# Defaults so the comparison section still renders if abstractive summarization fails
abstractive_summary = ""
abstractive_word_count = 0
abstractive_reduction = 0.0
try:
# Load summarizer model
summarizer = load_summarizer()
if summarizer is None:
output_html.append("""
Failed to load the abstractive summarization model. This may be due to memory constraints or missing dependencies.
""")
else:
# Check length limitations
max_token_limit = 1024 # BART typically has 1024 token limit
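# Note: the limit is in subword tokens, not words, so word_count is only a rough proxy.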
# If text is too long, warn user and truncate
if word_count > max_token_limit:
output_html.append(f"""
⚠️ Note: Text exceeds model's length limit. Only the first ~{max_token_limit} tokens will be used for summarization.
""")
# Generate summary using the specified min_length and max_length
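# Sampling (do_sample=True) with temperature/top_p trades determinism for variety;
# length_penalty=2.0 biases beam search toward longer summaries.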
abstractive_results = summarizer(
    text_input,
    max_length=max_length,
    min_length=min_length,
    do_sample=use_sampling,
    temperature=0.7 if use_sampling else 1.0,
    top_p=0.9 if use_sampling else 1.0,
    length_penalty=2.0,
    truncation=True  # truncate inputs that exceed the model's token limit
)
abstractive_summary = abstractive_results[0]['summary_text']
# Calculate reduction statistics
abstractive_word_count = len(abstractive_summary.split())
abstractive_reduction = (1 - abstractive_word_count / word_count) * 100
# Summary Results
output_html.append(f"""
Abstractive Summary ({abstractive_reduction:.1f}% reduction)
{abstractive_summary}
Original Length: {word_count} words | Summary Length: {abstractive_word_count} words | Compression: {abstractive_reduction:.1f}%
""")
# Key Terms & Topics Section
output_html.append('<h3>Key Terms &amp; Topics</h3>')
# Extract key terms with TF-IDF
key_terms = extract_key_terms(text_input, n=10)
# Create layout stacked vertically: table first, then chart
output_html.append('<div class="stacked-layout">')
# Row 1: Key terms table (full width)
output_html.append('<div class="row">')
output_html.append('<div class="column">')
output_html.append('<h4>Key Terms</h4>')
# Create key terms table
terms_df = pd.DataFrame({
'#': range(1, len(key_terms) + 1),
'Keyword': [term[0] for term in key_terms],
'TF-IDF Score': [f"{term[1]:.4f}" for term in key_terms]
})
output_html.append(df_to_html_table(terms_df))
output_html.append('</div>')  # Close row 1 column
output_html.append('</div>')  # Close row 1
# Row 2: Term importance chart (full width)
output_html.append('<div class="row">')
output_html.append('<div class="column">')
output_html.append('<h4>Term Importance</h4>')
# Create horizontal bar chart of key terms
fig = plt.figure(figsize=(10, 8))
# Reverse the order for bottom-to-top display
terms = [term[0] for term in key_terms]
scores = [term[1] for term in key_terms]
# Sort by score for better visualization
sorted_data = sorted(zip(terms, scores), key=lambda x: x[1])
terms = [x[0] for x in sorted_data]
scores = [x[1] for x in sorted_data]
# Create horizontal bar chart
plt.barh(terms, scores, color='#1976D2')
plt.xlabel('TF-IDF Score')
plt.ylabel('Keyword')
plt.title('Key Terms by TF-IDF Score')
plt.tight_layout()
output_html.append(fig_to_html(fig))
output_html.append('</div>')  # Close row 2 column
output_html.append('</div>')  # Close row 2
output_html.append('</div>')  # Close stacked layout
except Exception as e:
output_html.append(f"""
Abstractive Summarization Error
Failed to perform abstractive summarization: {str(e)}
""")
# Extractive Summarization
output_html.append('<h3>Extractive Summarization</h3>')
output_html.append("""
Extractive summarization works by identifying important sentences in the text and extracting them to form a summary.
This implementation uses a variant of the TextRank algorithm, which is based on Google's PageRank.
""")
# Perform TextRank Summarization
num_summary_sentences = min(3, max(1, len(sentences) // 3))
extractive_summary = textrank_summarize(text_input, num_sentences=num_summary_sentences)
# Clean up the placeholder separator
extractive_summary = extractive_summary.replace("SENTBREAKOS.OS", " ")
# Calculate reduction statistics
extractive_word_count = len(extractive_summary.split())
extractive_reduction = (1 - extractive_word_count / word_count) * 100
output_html.append(f"""
Extractive Summary ({extractive_reduction:.1f}% reduction)
{extractive_summary}
""")
# Sentence importance visualization
output_html.append('<h4>Sentence Importance</h4>')
output_html.append('<p>The graph below shows the relative importance of each sentence based on the TextRank algorithm:</p>')
# Get sentence scores from TextRank
sentence_scores = textrank_sentence_scores(text_input)
# Sort sentences by their original order
sentence_items = list(sentence_scores.items())
sentence_items.sort(key=lambda x: int(x[0].split('_')[1]))
# Create visualization
fig = plt.figure(figsize=(10, 6))
bars = plt.bar(
[f"Sent {item[0].split('_')[1]}" for item in sentence_items],
[item[1] for item in sentence_items],
color='#1976D2'
)
# Highlight the top-scoring sentences that were selected for the extractive summary
top_ids = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_summary_sentences]
selected_indices = [int(idx.split('_')[1]) for idx in top_ids]
for i, bar in enumerate(bars):
if i+1 in selected_indices:
bar.set_color('#4CAF50')
plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02,
'Selected', ha='center', va='bottom', fontsize=8, rotation=90)
plt.xlabel('Sentence')
plt.ylabel('Importance Score')
plt.title('Sentence Importance Based on TextRank')
plt.xticks(rotation=45)
plt.tight_layout()
output_html.append(fig_to_html(fig))
# Compare the two approaches
output_html.append('<h3>Comparing the Two Approaches</h3>')
# Calculate overlap between summaries
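# Overlap is the Sørensen-Dice coefficient expressed as a percentage:
# |A ∩ B| / ((|A| + |B|) / 2), where A and B are the two summaries' word sets.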
extractive_words = set(re.findall(r'\b\w+\b', extractive_summary.lower()))
abstractive_words = set(re.findall(r'\b\w+\b', abstractive_summary.lower()))
common_words = extractive_words.intersection(abstractive_words)
if len(extractive_words) > 0 and len(abstractive_words) > 0:
overlap_percentage = len(common_words) / ((len(extractive_words) + len(abstractive_words)) / 2) * 100
else:
overlap_percentage = 0
# Create comparison table
comparison_data = {
'Metric': ['Word Count', 'Reduction %', 'Sentences', 'Words per Sentence', 'Unique Words'],
'Extractive': [
extractive_word_count,
f"{extractive_reduction:.1f}%",
len(nltk.sent_tokenize(extractive_summary)),
f"{extractive_word_count / max(1, len(nltk.sent_tokenize(extractive_summary))):.1f}",
len(extractive_words)
],
'Abstractive': [
abstractive_word_count,
f"{abstractive_reduction:.1f}%",
len(nltk.sent_tokenize(abstractive_summary)),
f"{abstractive_word_count / max(1, len(nltk.sent_tokenize(abstractive_summary))):.1f}",
len(abstractive_words)
]
}
comparison_df = pd.DataFrame(comparison_data)
output_html.append('<div class="row">')
# Column 1: Comparison table
output_html.append('<div class="column">')
output_html.append('<h4>Summary Statistics</h4>')
output_html.append(df_to_html_table(comparison_df))
output_html.append('</div>')  # Close column 1
# Column 2: Venn diagram of word overlap
output_html.append('<div class="column">')
output_html.append('<h4>Word Overlap Visualization</h4>')
# Create Venn diagram
fig = plt.figure(figsize=(8, 6))
venn = venn2(
subsets=(
len(extractive_words - abstractive_words),
len(abstractive_words - extractive_words),
len(common_words)
),
set_labels=('Extractive', 'Abstractive')
)
# Set region colors (a patch can be None when its region is empty)
for region_id, color in (('10', '#4CAF50'), ('01', '#03A9F4'), ('11', '#9C27B0')):
    patch = venn.get_patch_by_id(region_id)
    if patch is not None:
        patch.set_color(color)
plt.title('Word Overlap Between Summaries')
plt.text(0, -0.25, f"Overlap: {overlap_percentage:.1f}%", ha='center')
output_html.append(fig_to_html(fig))
# Show key shared and unique words
shared_words_list = list(common_words)
extractive_only = list(extractive_words - abstractive_words)
abstractive_only = list(abstractive_words - extractive_words)
# Limit the number of words shown
max_words = 10
output_html.append(f"""
Key Shared Words ({min(max_words, len(shared_words_list))} of {len(shared_words_list)})
{' '.join([f'{word}' for word in shared_words_list[:max_words]])}
Unique to Extractive ({min(max_words, len(extractive_only))} of {len(extractive_only)})
{' '.join([f'{word}' for word in extractive_only[:max_words]])}
Unique to Abstractive ({min(max_words, len(abstractive_only))} of {len(abstractive_only)})
{' '.join([f'{word}' for word in abstractive_only[:max_words]])}
""")
output_html.append('</div>')  # Close column 2
output_html.append('</div>')  # Close row
except Exception as e:
output_html.append(f"""
Error
Failed to summarize text: {str(e)}
""")
# About Text Summarization section
output_html.append("""
What is Text Summarization?
Text summarization is the process of creating a shorter version of a text while preserving its key information
and meaning. It helps users quickly grasp the main points without reading the entire document.
Two Main Approaches:
- Extractive Summarization: Selects and extracts existing sentences from the source text based on their importance
- Abstractive Summarization: Generates new sentences that capture the meaning of the source text (similar to how humans write summaries)
Applications:
- News digests - Quick summaries of news articles
- Research papers - Condensing long academic papers
- Legal documents - Summarizing complex legal text
- Meeting notes - Extracting key points from discussions
- Content curation - Creating snippets for content recommendations
""")
output_html.append('</div>')  # Close result-area div
return '\n'.join(output_html)
def extract_key_terms(text, n=10):
"""Extract key terms using TF-IDF"""
try:
# Tokenize and preprocess
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# Tokenize and clean text
words = word_tokenize(text.lower())
words = [lemmatizer.lemmatize(word) for word in words
if word.isalnum() and word not in stop_words and len(word) > 2]
# Create document for TF-IDF
document = [' '.join(words)]
# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=100)
tfidf_matrix = vectorizer.fit_transform(document)
# Get feature names and scores
feature_names = vectorizer.get_feature_names_out()
scores = tfidf_matrix.toarray()[0]
# Create term-score pairs and sort by score
term_scores = [(term, score) for term, score in zip(feature_names, scores)]
term_scores.sort(key=lambda x: x[1], reverse=True)
return term_scores[:n]
except Exception as e:
print(f"Error extracting key terms: {str(e)}")
return [("term", 0.0) for _ in range(n)] # Return empty placeholder
# TextRank extractive summarization algorithm
def textrank_summarize(text, num_sentences=3):
"""Generate an extractive summary using TextRank algorithm"""
# Tokenize text into sentences
sentences = sent_tokenize(text)
# If text is too short, return the original text
if len(sentences) <= num_sentences:
return text
# Build a graph of sentences with similarity edges
sentence_scores = textrank_sentence_scores(text)
# Sort sentences by score
# Look scores up by sentence ID rather than relying on dict value ordering
ranked_sentences = sorted(
    [(sentence_scores[f"sentence_{i+1}"], i, s) for i, s in enumerate(sentences)],
    reverse=True
)
# Select top sentences based on score
selected_sentences = sorted(ranked_sentences[:num_sentences], key=lambda x: x[1])
# Combine selected sentences
summary = "SENTBREAKOS.OS".join([s[2] for s in selected_sentences])
return summary
def textrank_sentence_scores(text):
"""Generate sentence scores using TextRank algorithm"""
# Tokenize text into sentences
sentences = sent_tokenize(text)
# Create sentence IDs
sentence_ids = [f"sentence_{i+1}" for i in range(len(sentences))]
# Create sentence graph
G = nx.Graph()
# Add nodes
for sentence_id in sentence_ids:
G.add_node(sentence_id)
# Remove stopwords and preprocess sentences
stop_words = set(stopwords.words('english'))
sentence_words = []
for sentence in sentences:
words = [word.lower() for word in word_tokenize(sentence) if word.lower() not in stop_words and word.isalnum()]
sentence_words.append(words)
# Add edges based on sentence similarity
for i in range(len(sentence_ids)):
for j in range(i+1, len(sentence_ids)):
similarity = sentence_similarity(sentence_words[i], sentence_words[j])
if similarity > 0:
G.add_edge(sentence_ids[i], sentence_ids[j], weight=similarity)
# Run PageRank
scores = nx.pagerank(G)
return scores
def sentence_similarity(words1, words2):
"""Calculate similarity between two sentences based on word overlap"""
if not words1 or not words2:
return 0
# Convert to sets for intersection
set1 = set(words1)
set2 = set(words2)
# Jaccard similarity
intersection = len(set1.intersection(set2))
union = len(set1.union(set2))
if union == 0:
return 0
return intersection / union
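# A minimal usage sketch (an assumption, not part of the handler above): it
# expects the NLTK data packages `punkt`, `stopwords`, and `wordnet` to be
# downloadable, and `utils.model_loader` / `utils.helpers` to be importable.
# Loading the BART summarizer may download a large model on first run.
if __name__ == "__main__":
    for resource in ("punkt", "stopwords", "wordnet"):
        nltk.download(resource, quiet=True)
    sample = (
        "The James Webb Space Telescope is the largest optical telescope in space. "
        "It was launched in December 2021 and orbits the Sun near the second Lagrange point. "
        "Its high resolution lets it observe objects too old and distant for Hubble. "
        "Astronomers use it to study the formation of the first galaxies. "
        "The telescope's primary mirror consists of 18 hexagonal gold-coated segments."
    )
    html = summarization_handler(sample, min_length=20, max_length=60)
    print(html[:500])  # preview the first part of the generated HTML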