import re
import numpy as np
# Handcrafted clickbait cues, grouped by rhetorical category.
clickbait_indicators = {
    "curiosity_gap": [
        "you won't believe",
        "wait until you see",
        "what happened next",
        "the reason will shock you",
        "this is why",
        "here's what happened",
        "the truth about",
        "what nobody tells you",
        "finally revealed",
    ],
    "emotional_triggers": [
        "shocking",
        "incredible",
        "amazing",
        "unbelievable",
        "stunning",
        "heartbreaking",
        "hilarious",
        "terrifying",
        "adorable",
        "outrageous",
        "mind-blowing",
        "jaw-dropping",
        "breathtaking",
    ],
    "urgency_scarcity": [
        "breaking",
        "urgent",
        "limited time",
        "before it's too late",
        "act now",
        "don't miss",
        "last chance",
        "expires soon",
    ],
    "personal_relevance": [
        "in your area",
        "people like you",
        "your age",
        "based on your",
        "you need to know",
        "this affects you",
        "for people who",
    ],
    "superlatives": [
        "ultimate",
        "perfect",
        "best ever",
        "greatest",
        "worst",
        "most amazing",
        "incredible",
        "unmatched",
        "revolutionary",
    ],
"numbers_lists": [
r"\d+\s+(reasons?|ways?|things?|facts?|secrets?|tricks?|tips?)",
r"one\s+(weird|simple|amazing)\s+trick",
r"\d+\s+minute[s]?",
r"in\s+\d+\s+(steps?|minutes?|days?)",
],
"authority_social_proof": [
"doctors hate",
"experts don't want",
"celebrities use",
"scientists discovered",
"research shows",
"studies prove",
],
}
def extract_enhanced_features(texts):
    """Extract handcrafted clickbait features for each text.

    Returns an (n_texts, 15) float array: 7 category scores, 3 punctuation/
    formatting ratios, 3 length/structure features, and 2 token-level features.
    """
    features = []
    for text in texts:
        if not isinstance(text, str):
            text = str(text) if text is not None else ""
        text_lower = text.lower()
        feature_vector = []
        # Clickbait pattern scores by category
        for category, patterns in clickbait_indicators.items():
            category_score = 0
            for pattern in patterns:
                if isinstance(pattern, str):
                    if pattern in text_lower:
                        category_score += 1
                else:  # compiled regex pattern (see "numbers_lists")
                    if pattern.search(text_lower):
                        category_score += 1
            # Normalize by the number of patterns in the category
            normalized_score = min(category_score / len(patterns), 1.0)
            feature_vector.append(normalized_score)
        # Punctuation and formatting features (scaled, then capped at 1.0)
        exclamation_ratio = text.count("!") / max(len(text), 1)
        question_ratio = text.count("?") / max(len(text), 1)
        caps_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)
        feature_vector.extend(
            [
                min(exclamation_ratio * 10, 1.0),
                min(question_ratio * 10, 1.0),
                min(caps_ratio * 5, 1.0),
            ]
        )
        # Length and structure features
        words = text.split()
        word_count = len(words)
        avg_word_length = sum(len(word) for word in words) / max(word_count, 1)
        feature_vector.extend(
            [
                min(word_count / 20, 1.0),  # Normalized word count
                min(avg_word_length / 8, 1.0),  # Normalized avg word length
                1.0 if word_count > 10 else 0.0,  # Long-headline indicator
            ]
        )
        # Semantic features: ALL-CAPS words and digit-bearing tokens
        all_caps_words = sum(1 for word in words if word.isupper() and len(word) > 1)
        number_count = len(
            [word for word in words if any(char.isdigit() for char in word)]
        )
        feature_vector.extend(
            [
                min(all_caps_words / max(word_count, 1), 1.0),
                min(number_count / max(word_count, 1), 1.0),
            ]
        )
        features.append(feature_vector)
    return np.array(features)
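
# A minimal usage sketch (assumptions: the sample headlines below are
# illustrative, not from any dataset, and no specific scores are asserted).
# It demonstrates the (n_texts, 15) output shape and the per-category scores
# that occupy the first seven columns, in dictionary insertion order.
if __name__ == "__main__":
    sample_headlines = [
        "You Won't Believe These 7 Shocking Facts About Sleep",
        "Federal Reserve raises interest rates by 0.25 percentage points",
    ]
    feats = extract_enhanced_features(sample_headlines)
    print("Feature matrix shape:", feats.shape)  # -> (2, 15)
    for headline, row in zip(sample_headlines, feats):
        # Columns 0..6 align with clickbait_indicators' key order.
        category_scores = dict(zip(clickbait_indicators.keys(), row[:7].round(2)))
        print(headline)
        print("  category scores:", category_scores)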