# nli_checks/deploy/utils/clickbait_utils.py
# Clickbait heuristics for the fake-news detector with Gradio interface.
import re
import numpy as np
# Indicator phrases grouped by category. Entries are lowercase literal
# substrings, except "numbers_lists", whose entries are compiled regexes
# (the feature extractor below distinguishes the two by type).
clickbait_indicators = {
    "curiosity_gap": [
        "you won't believe",
        "wait until you see",
        "what happened next",
        "the reason will shock you",
        "this is why",
        "here's what happened",
        "the truth about",
        "what nobody tells you",
        "finally revealed",
    ],
    "emotional_triggers": [
        "shocking",
        "incredible",
        "amazing",
        "unbelievable",
        "stunning",
        "heartbreaking",
        "hilarious",
        "terrifying",
        "adorable",
        "outrageous",
        "mind-blowing",
        "jaw-dropping",
        "breathtaking",
    ],
    "urgency_scarcity": [
        "breaking",
        "urgent",
        "limited time",
        "before it's too late",
        "act now",
        "don't miss",
        "last chance",
        "expires soon",
    ],
    "personal_relevance": [
        "in your area",
        "people like you",
        "your age",
        "based on your",
        "you need to know",
        "this affects you",
        "for people who",
    ],
    "superlatives": [
        "ultimate",
        "perfect",
        "best ever",
        "greatest",
        "worst",
        "most amazing",
        "incredible",
        "unmatched",
        "revolutionary",
    ],
    "numbers_lists": [
        re.compile(r"\d+\s+(reasons?|ways?|things?|facts?|secrets?|tricks?|tips?)"),
        re.compile(r"one\s+(weird|simple|amazing)\s+trick"),
        re.compile(r"\d+\s+minutes?"),
        re.compile(r"in\s+\d+\s+(steps?|minutes?|days?)"),
    ],
    "authority_social_proof": [
        "doctors hate",
        "experts don't want",
        "celebrities use",
        "scientists discovered",
        "research shows",
        "studies prove",
    ],
}
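
# Illustrative check (the sample headline is invented for demonstration):
# the "numbers_lists" entries are compiled regexes, so they match
# numbered-list phrasings rather than literal substrings:
#   >>> bool(clickbait_indicators["numbers_lists"][0].search("7 secrets doctors hide"))
#   True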
def extract_enhanced_features(texts):
    """Extract handcrafted clickbait features for each text.

    Returns an array of shape (len(texts), 15): one normalized score per
    indicator category (7), punctuation/formatting features (3), length and
    structure features (3), and semantic features (2).
    """
    features = []
    for text in texts:
        if not isinstance(text, str):
            text = str(text) if text is not None else ""
        text_lower = text.lower()
        feature_vector = []

        # Clickbait pattern scores by category: literal substring match for
        # plain strings, regex search for compiled patterns ("numbers_lists").
        for category, patterns in clickbait_indicators.items():
            category_score = 0
            for pattern in patterns:
                if isinstance(pattern, str):
                    if pattern in text_lower:
                        category_score += 1
                else:  # compiled regex pattern
                    if pattern.search(text_lower):
                        category_score += 1
            # Normalize by the number of patterns in the category
            normalized_score = min(category_score / len(patterns), 1.0)
            feature_vector.append(normalized_score)

        # Punctuation and formatting features (scaled, capped at 1.0)
        exclamation_ratio = text.count("!") / max(len(text), 1)
        question_ratio = text.count("?") / max(len(text), 1)
        caps_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)
        feature_vector.extend(
            [
                min(exclamation_ratio * 10, 1.0),
                min(question_ratio * 10, 1.0),
                min(caps_ratio * 5, 1.0),
            ]
        )

        # Length and structure features
        words = text.split()
        word_count = len(words)
        avg_word_length = sum(len(word) for word in words) / max(word_count, 1)
        feature_vector.extend(
            [
                min(word_count / 20, 1.0),  # normalized word count
                min(avg_word_length / 8, 1.0),  # normalized avg word length
                1.0 if word_count > 10 else 0.0,  # long-headline indicator
            ]
        )

        # Semantic features: shouting (ALL-CAPS words) and digit-bearing tokens
        all_caps_words = sum(1 for word in words if word.isupper() and len(word) > 1)
        number_count = len(
            [word for word in words if any(char.isdigit() for char in word)]
        )
        feature_vector.extend(
            [
                min(all_caps_words / max(word_count, 1), 1.0),
                min(number_count / max(word_count, 1), 1.0),
            ]
        )

        features.append(feature_vector)
    return np.array(features)
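
# Minimal usage sketch (the sample headlines below are invented for
# illustration): each row of the returned matrix holds 15 features, i.e. the
# 7 category scores followed by the 8 punctuation/length/semantic features.
if __name__ == "__main__":
    demo_titles = [
        "You Won't BELIEVE These 7 Secrets Doctors Hate!",
        "City council approves annual budget after public hearing",
    ]
    X = extract_enhanced_features(demo_titles)
    print(X.shape)  # (2, 15)
    print(X.round(2))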