import re
import numpy as np

clickbait_indicators = {
    "curiosity_gap": [
        "you won't believe",
        "wait until you see",
        "what happened next",
        "the reason will shock you",
        "this is why",
        "here's what happened",
        "the truth about",
        "what nobody tells you",
        "finally revealed",
    ],
    "emotional_triggers": [
        "shocking",
        "incredible",
        "amazing",
        "unbelievable",
        "stunning",
        "heartbreaking",
        "hilarious",
        "terrifying",
        "adorable",
        "outrageous",
        "mind-blowing",
        "jaw-dropping",
        "breathtaking",
    ],
    "urgency_scarcity": [
        "breaking",
        "urgent",
        "limited time",
        "before it's too late",
        "act now",
        "don't miss",
        "last chance",
        "expires soon",
    ],
    "personal_relevance": [
        "in your area",
        "people like you",
        "your age",
        "based on your",
        "you need to know",
        "this affects you",
        "for people who",
    ],
    "superlatives": [
        "ultimate",
        "perfect",
        "best ever",
        "greatest",
        "worst",
        "most amazing",
        "incredible",
        "unmatched",
        "revolutionary",
    ],
    "numbers_lists": [
        r"\d+\s+(reasons?|ways?|things?|facts?|secrets?|tricks?|tips?)",
        r"one\s+(weird|simple|amazing)\s+trick",
        r"\d+\s+minute[s]?",
        r"in\s+\d+\s+(steps?|minutes?|days?)",
    ],
    "authority_social_proof": [
        "doctors hate",
        "experts don't want",
        "celebrities use",
        "scientists discovered",
        "research shows",
        "studies prove",
    ],
}


def extract_enhanced_features(texts):
    """Extract comprehensive handcrafted features"""
    features = []

    for text in texts:
        if not isinstance(text, str):
            text = str(text) if text is not None else ""

        text_lower = text.lower()
        feature_vector = []

        # Clickbait pattern scores by category
        for category, patterns in clickbait_indicators.items():
            category_score = 0
            for pattern in patterns:
                if isinstance(pattern, str):
                    # Plain substring indicator
                    if pattern in text_lower:
                        category_score += 1
                else:
                    # Pre-compiled regex (see numbers_lists above)
                    if pattern.search(text_lower):
                        category_score += 1

            # Normalize by pattern count in category
            normalized_score = min(category_score / len(patterns), 1.0)
            feature_vector.append(normalized_score)

        # Punctuation and formatting features
        exclamation_ratio = text.count("!") / max(len(text), 1)
        question_ratio = text.count("?") / max(len(text), 1)
        caps_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)

        feature_vector.extend(
            [
                min(exclamation_ratio * 10, 1.0),
                min(question_ratio * 10, 1.0),
                min(caps_ratio * 5, 1.0),
            ]
        )

        # Length and structure features
        words = text.split()
        word_count = len(words)
        avg_word_length = sum(len(word) for word in words) / max(word_count, 1)

        feature_vector.extend(
            [
                min(word_count / 20, 1.0),  # Normalized word count
                min(avg_word_length / 8, 1.0),  # Normalized avg word length
                1.0 if word_count > 10 else 0.0,  # Long headline indicator
            ]
        )

        # Semantic features
        all_caps_words = sum(1 for word in words if word.isupper() and len(word) > 1)
        number_count = len(
            [word for word in words if any(char.isdigit() for char in word)]
        )

        feature_vector.extend(
            [
                min(all_caps_words / max(word_count, 1), 1.0),
                min(number_count / max(word_count, 1), 1.0),
            ]
        )

        features.append(feature_vector)

    return np.array(features)
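

if __name__ == "__main__":
    # Minimal usage sketch. The headlines below are made-up examples for
    # illustration, not drawn from any real dataset.
    sample_headlines = [
        "You won't believe what this dog did next!",
        "10 simple tricks doctors hate",
        "Federal Reserve raises interest rates by 0.25 points",
    ]
    feats = extract_enhanced_features(sample_headlines)
    # 7 category scores + 3 punctuation + 3 length/structure + 2 semantic
    print(feats.shape)  # (3, 15)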