import re
import numpy as np

clickbait_indicators = {
    "curiosity_gap": [
        "you won't believe",
        "wait until you see",
        "what happened next",
        "the reason will shock you",
        "this is why",
        "here's what happened",
        "the truth about",
        "what nobody tells you",
        "finally revealed",
    ],
    "emotional_triggers": [
        "shocking",
        "incredible",
        "amazing",
        "unbelievable",
        "stunning",
        "heartbreaking",
        "hilarious",
        "terrifying",
        "adorable",
        "outrageous",
        "mind-blowing",
        "jaw-dropping",
        "breathtaking",
    ],
    "urgency_scarcity": [
        "breaking",
        "urgent",
        "limited time",
        "before it's too late",
        "act now",
        "don't miss",
        "last chance",
        "expires soon",
    ],
    "personal_relevance": [
        "in your area",
        "people like you",
        "your age",
        "based on your",
        "you need to know",
        "this affects you",
        "for people who",
    ],
    "superlatives": [
        "ultimate",
        "perfect",
        "best ever",
        "greatest",
        "worst",
        "most amazing",
        "incredible",
        "unmatched",
        "revolutionary",
    ],
    "numbers_lists": [
        r"\d+\s+(reasons?|ways?|things?|facts?|secrets?|tricks?|tips?)",
        r"one\s+(weird|simple|amazing)\s+trick",
        r"\d+\s+minute[s]?",
        r"in\s+\d+\s+(steps?|minutes?|days?)",
    ],
    "authority_social_proof": [
        "doctors hate",
        "experts don't want",
        "celebrities use",
        "scientists discovered",
        "research shows",
        "studies prove",
    ],
}


def extract_enhanced_features(texts):
    """Extract comprehensive handcrafted features"""
    features = []

    for text in texts:
        if not isinstance(text, str):
            text = str(text) if text is not None else ""

        text_lower = text.lower()
        feature_vector = []

        # Clickbait pattern scores by category
        for category, patterns in clickbait_indicators.items():
            category_score = 0
            for pattern in patterns:
                if isinstance(pattern, str):
                    # Plain substring indicator
                    if pattern in text_lower:
                        category_score += 1
                else:
                    # Pre-compiled regex (see numbers_lists above)
                    if pattern.search(text_lower):
                        category_score += 1

            # Normalize by pattern count in category
            normalized_score = min(category_score / len(patterns), 1.0)
            feature_vector.append(normalized_score)

        # Punctuation and formatting features
        exclamation_ratio = text.count("!") / max(len(text), 1)
        question_ratio = text.count("?") / max(len(text), 1)
        caps_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)

        feature_vector.extend(
            [
                min(exclamation_ratio * 10, 1.0),
                min(question_ratio * 10, 1.0),
                min(caps_ratio * 5, 1.0),
            ]
        )

        # Length and structure features
        words = text.split()
        word_count = len(words)
        avg_word_length = sum(len(word) for word in words) / max(word_count, 1)

        feature_vector.extend(
            [
                min(word_count / 20, 1.0),  # Normalized word count
                min(avg_word_length / 8, 1.0),  # Normalized avg word length
                1.0 if word_count > 10 else 0.0,  # Long headline indicator
            ]
        )

        # Semantic features
        all_caps_words = sum(1 for word in words if word.isupper() and len(word) > 1)
        number_count = len(
            [word for word in words if any(char.isdigit() for char in word)]
        )

        feature_vector.extend(
            [
                min(all_caps_words / max(word_count, 1), 1.0),
                min(number_count / max(word_count, 1), 1.0),
            ]
        )

        features.append(feature_vector)

    return np.array(features)
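

if __name__ == "__main__":
    # Minimal usage sketch. The headlines below are made-up examples for
    # illustration, not drawn from any real dataset.
    sample_headlines = [
        "You won't believe what this dog did next!",
        "10 simple tricks doctors hate",
        "Federal Reserve raises interest rates by 0.25 points",
    ]
    feats = extract_enhanced_features(sample_headlines)
    # 7 category scores + 3 punctuation + 3 length/structure + 2 semantic
    print(feats.shape)  # (3, 15)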