import re
import numpy as np

clickbait_indicators = {
    "curiosity_gap": [
        "you won't believe",
        "wait until you see",
        "what happened next",
        "the reason will shock you",
        "this is why",
        "here's what happened",
        "the truth about",
        "what nobody tells you",
        "finally revealed",
    ],
    "emotional_triggers": [
        "shocking",
        "incredible",
        "amazing",
        "unbelievable",
        "stunning",
        "heartbreaking",
        "hilarious",
        "terrifying",
        "adorable",
        "outrageous",
        "mind-blowing",
        "jaw-dropping",
        "breathtaking",
    ],
    "urgency_scarcity": [
        "breaking",
        "urgent",
        "limited time",
        "before it's too late",
        "act now",
        "don't miss",
        "last chance",
        "expires soon",
    ],
    "personal_relevance": [
        "in your area",
        "people like you",
        "your age",
        "based on your",
        "you need to know",
        "this affects you",
        "for people who",
    ],
    "superlatives": [
        "ultimate",
        "perfect",
        "best ever",
        "greatest",
        "worst",
        "most amazing",
        "incredible",
        "unmatched",
        "revolutionary",
    ],
| "numbers_lists": [ | |
| r"\d+\s+(reasons?|ways?|things?|facts?|secrets?|tricks?|tips?)", | |
| r"one\s+(weird|simple|amazing)\s+trick", | |
| r"\d+\s+minute[s]?", | |
| r"in\s+\d+\s+(steps?|minutes?|days?)", | |
| ], | |
| "authority_social_proof": [ | |
| "doctors hate", | |
| "experts don't want", | |
| "celebrities use", | |
| "scientists discovered", | |
| "research shows", | |
| "studies prove", | |
| ], | |
| } | |
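# Note (added for clarity): each of the seven categories above contributes
# one normalized score, and the extractor below appends three punctuation,
# three length/structure, and two semantic features, so every text is
# mapped to a 15-dimensional feature vector.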

def extract_enhanced_features(texts):
    """Extract comprehensive handcrafted features."""
    features = []
    for text in texts:
        if not isinstance(text, str):
            text = str(text) if text is not None else ""
        text_lower = text.lower()
        feature_vector = []

        # Clickbait pattern scores by category
        for category, patterns in clickbait_indicators.items():
            category_score = 0
            for pattern in patterns:
                if isinstance(pattern, str):
                    if pattern in text_lower:
                        category_score += 1
                else:  # compiled regex pattern
                    if pattern.search(text_lower):
                        category_score += 1
            # Normalize by pattern count in category
            normalized_score = min(category_score / len(patterns), 1.0)
            feature_vector.append(normalized_score)
        # Punctuation and formatting features
        exclamation_ratio = text.count("!") / max(len(text), 1)
        question_ratio = text.count("?") / max(len(text), 1)
        caps_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)
        feature_vector.extend(
            [
                min(exclamation_ratio * 10, 1.0),
                min(question_ratio * 10, 1.0),
                min(caps_ratio * 5, 1.0),
            ]
        )

        # Length and structure features
        words = text.split()
        word_count = len(words)
        avg_word_length = sum(len(word) for word in words) / max(word_count, 1)
        feature_vector.extend(
            [
                min(word_count / 20, 1.0),  # Normalized word count
                min(avg_word_length / 8, 1.0),  # Normalized avg word length
                1.0 if word_count > 10 else 0.0,  # Long-headline indicator
            ]
        )

        # Semantic features
        all_caps_words = sum(1 for word in words if word.isupper() and len(word) > 1)
        number_count = len(
            [word for word in words if any(char.isdigit() for char in word)]
        )
        feature_vector.extend(
            [
                min(all_caps_words / max(word_count, 1), 1.0),
                min(number_count / max(word_count, 1), 1.0),
            ]
        )

        features.append(feature_vector)
    return np.array(features)
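
# Usage sketch (added; the headlines below are invented examples, not part
# of the original file). extract_enhanced_features accepts any iterable of
# strings and returns one 15-dimensional row per input.
if __name__ == "__main__":
    sample_headlines = [
        "10 Secrets Doctors Don't Want You to Know!",
        "Senate passes annual budget resolution",
    ]
    X = extract_enhanced_features(sample_headlines)
    print(X.shape)        # (2, 15)
    print(X[0].round(2))  # the clickbait-style headline scores on several features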