import re
import time
import requests
from bs4 import BeautifulSoup
from newspaper import Article

def extract_content(
    url: str,
    content_cache: dict,
    cache_key: str,
    get_user_agent,
    timeout: int,
    cache_size: int,
) -> str:
    """Enhanced content extraction with newspaper3k fallback to BeautifulSoup."""
    if cache_key in content_cache:
        return content_cache[cache_key]

    try:
        # Try newspaper3k first
        article = Article(url)
        article.download()
        article.parse()
        content = article.text

        # If newspaper3k didn't get good content, fall back to BeautifulSoup
        if not content or len(content.strip()) < 100:
            content = _fallback_extraction(url, get_user_agent, timeout)

        # Clean and normalize content
        content = _clean_content(content)
        content = content[:10000]  # Increased from 8000

        # Cache result, evicting the oldest entry when the cache is full
        if len(content_cache) >= cache_size:
            oldest_key = next(iter(content_cache))
            del content_cache[oldest_key]
        content_cache[cache_key] = content
        return content
    except Exception:
        # If newspaper3k fails, try the BeautifulSoup fallback
        try:
            content = _fallback_extraction(url, get_user_agent, timeout)
            content = _clean_content(content)
            content = content[:10000]
            if len(content_cache) >= cache_size:
                oldest_key = next(iter(content_cache))
                del content_cache[oldest_key]
            content_cache[cache_key] = content
            return content
        except Exception:
            return ""

def _fallback_extraction(url: str, get_user_agent, timeout: int) -> str:
    """Fallback extraction using BeautifulSoup."""
    headers = {
        "User-Agent": get_user_agent(),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }
    time.sleep(0.5)
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    # Handle encoding
    if response.encoding is None or response.encoding.lower() in ["iso-8859-1", "ascii"]:
        response.encoding = "utf-8"
    try:
        html_content = response.text
    except UnicodeDecodeError:
        try:
            html_content = response.content.decode("utf-8", errors="ignore")
        except UnicodeDecodeError:
            html_content = response.content.decode("latin-1", errors="replace")

    soup = BeautifulSoup(html_content, "html.parser")

    # Remove irrelevant content
    for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "iframe"]):
        element.decompose()

    # Extract content using selectors
    content_selectors = [
        "article",
        "main",
        '[role="main"]',
        ".content",
        ".article-content",
        ".post-content",
        ".entry-content",
        ".article-body",
    ]
    extracted_text = ""
    for selector in content_selectors:
        elements = soup.select(selector)
        if elements:
            extracted_text = " ".join([elem.get_text(separator=" ", strip=True) for elem in elements])
            break

    if not extracted_text:
        content_elements = soup.find_all(["p", "div"], class_=lambda x: x is None or "ad" not in str(x).lower())
        extracted_text = " ".join([elem.get_text(separator=" ", strip=True) for elem in content_elements])

    if not extracted_text:
        extracted_text = soup.get_text(separator=" ", strip=True)

    return extracted_text

def _clean_content(content: str) -> str:
    """Clean and normalize extracted content."""
    # Clean problematic characters
    content = content.replace("\ufffd", " ")
    content = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x84\x86-\x9f]", " ", content)

    # Normalize unicode if available
    try:
        import unicodedata

        content = unicodedata.normalize("NFKD", content)
    except Exception:
        pass

    # Normalize whitespace and clean
    content = re.sub(r"\s+", " ", content).strip()
    content = re.sub(r"[^\x20-\x7E\u00A0-\uFFFF]", " ", content)
    return content
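

if __name__ == "__main__":
    # Minimal usage sketch: shows how extract_content might be wired up.
    # The user-agent string, cache size, timeout, and example URL below are
    # illustrative assumptions, not values taken from the surrounding project.
    def _example_user_agent() -> str:
        return "Mozilla/5.0 (compatible; ExampleBot/1.0)"

    cache: dict = {}
    example_url = "https://example.com/some-article"
    text = extract_content(
        url=example_url,
        content_cache=cache,
        cache_key=example_url,
        get_user_agent=_example_user_agent,
        timeout=10,
        cache_size=100,
    )
    print(f"Extracted {len(text)} characters")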