import re
import numpy as np

clickbait_indicators = {
    "curiosity_gap": [
        "you won't believe",
        "wait until you see",
        "what happened next",
        "the reason will shock you",
        "this is why",
        "here's what happened",
        "the truth about",
        "what nobody tells you",
        "finally revealed",
    ],
    "emotional_triggers": [
        "shocking",
        "incredible",
        "amazing",
        "unbelievable",
        "stunning",
        "heartbreaking",
        "hilarious",
        "terrifying",
        "adorable",
        "outrageous",
        "mind-blowing",
        "jaw-dropping",
        "breathtaking",
    ],
    "urgency_scarcity": [
        "breaking",
        "urgent",
        "limited time",
        "before it's too late",
        "act now",
        "don't miss",
        "last chance",
        "expires soon",
    ],
    "personal_relevance": [
        "in your area",
        "people like you",
        "your age",
        "based on your",
        "you need to know",
        "this affects you",
        "for people who",
    ],
    "superlatives": [
        "ultimate",
        "perfect",
        "best ever",
        "greatest",
        "worst",
        "most amazing",
        "incredible",
        "unmatched",
        "revolutionary",
    ],
| "numbers_lists": [ | |
| r"\d+\s+(reasons?|ways?|things?|facts?|secrets?|tricks?|tips?)", | |
| r"one\s+(weird|simple|amazing)\s+trick", | |
| r"\d+\s+minute[s]?", | |
| r"in\s+\d+\s+(steps?|minutes?|days?)", | |
| ], | |
| "authority_social_proof": [ | |
| "doctors hate", | |
| "experts don't want", | |
| "celebrities use", | |
| "scientists discovered", | |
| "research shows", | |
| "studies prove", | |
| ], | |
| } | |
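# Note (added for clarity): each of the seven categories above contributes
# one normalized score, and the extractor below appends three punctuation,
# three length/structure, and two semantic features, so every text is
# mapped to a 15-dimensional feature vector.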

def extract_enhanced_features(texts):
    """Extract comprehensive handcrafted features."""
    features = []
    for text in texts:
        if not isinstance(text, str):
            text = str(text) if text is not None else ""
        text_lower = text.lower()
        feature_vector = []

        # Clickbait pattern scores by category
        for category, patterns in clickbait_indicators.items():
            category_score = 0
            for pattern in patterns:
                if isinstance(pattern, str):
                    if pattern in text_lower:
                        category_score += 1
                else:  # compiled regex pattern
                    if pattern.search(text_lower):
                        category_score += 1
            # Normalize by pattern count in category
            normalized_score = min(category_score / len(patterns), 1.0)
            feature_vector.append(normalized_score)
        # Punctuation and formatting features
        exclamation_ratio = text.count("!") / max(len(text), 1)
        question_ratio = text.count("?") / max(len(text), 1)
        caps_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)
        feature_vector.extend(
            [
                min(exclamation_ratio * 10, 1.0),
                min(question_ratio * 10, 1.0),
                min(caps_ratio * 5, 1.0),
            ]
        )

        # Length and structure features
        words = text.split()
        word_count = len(words)
        avg_word_length = sum(len(word) for word in words) / max(word_count, 1)
        feature_vector.extend(
            [
                min(word_count / 20, 1.0),  # Normalized word count
                min(avg_word_length / 8, 1.0),  # Normalized avg word length
                1.0 if word_count > 10 else 0.0,  # Long-headline indicator
            ]
        )

        # Semantic features
        all_caps_words = sum(1 for word in words if word.isupper() and len(word) > 1)
        number_count = len(
            [word for word in words if any(char.isdigit() for char in word)]
        )
        feature_vector.extend(
            [
                min(all_caps_words / max(word_count, 1), 1.0),
                min(number_count / max(word_count, 1), 1.0),
            ]
        )

        features.append(feature_vector)
    return np.array(features)
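
# Usage sketch (added; the headlines below are invented examples, not part
# of the original file). extract_enhanced_features accepts any iterable of
# strings and returns one 15-dimensional row per input.
if __name__ == "__main__":
    sample_headlines = [
        "10 Secrets Doctors Don't Want You to Know!",
        "Senate passes annual budget resolution",
    ]
    X = extract_enhanced_features(sample_headlines)
    print(X.shape)        # (2, 15)
    print(X[0].round(2))  # the clickbait-style headline scores on several features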