import logging

import httpx
from bs4 import BeautifulSoup
from langchain_community.document_loaders import WebBaseLoader
from llama_index.core.llms import ChatMessage
from llama_index.readers.web import SimpleWebPageReader
from newspaper import Article
def load_web(url: str):
    """Fetch *url* with LangChain's WebBaseLoader and return its documents."""
    return WebBaseLoader(url).load()
def llama_load_web(url: str):
    """Load *url* via LlamaIndex's SimpleWebPageReader with HTML-to-text conversion."""
    reader = SimpleWebPageReader(html_to_text=True)
    return reader.load_data([url])
def newspaper_load_web(url: str):
    """Download and parse *url* with newspaper.

    Returns:
        dict with "title" and "text" keys on success, or the empty string
        "" on any failure (kept for backward compatibility — existing
        callers may test for "").
    """
    article = Article(url)
    try:
        # Only the network/parse steps can raise; keep the try body minimal.
        article.download()
        article.parse()
    except Exception:
        # Previously the exception was swallowed silently ("except ... as e"
        # with e unused); at least record it before returning the sentinel.
        logging.exception("newspaper failed to load %s", url)
        return ""
    return {
        "title": article.title,
        "text": article.text,
    }
def html2text(url: str):
    """Fetch *url* and convert the page's HTML to plain text.

    Bug fix: the previous version passed the URL string itself to
    ``HTML2Text.handle()``, which expects HTML markup, so it converted the
    URL's characters rather than the page content. We now download the
    page first (httpx is already a dependency of this module).
    """
    import httpx
    from html2text import HTML2Text

    html = httpx.get(url).text
    converter = HTML2Text()
    converter.ignore_links = True  # drop hyperlink markup, keep readable text
    return converter.handle(html)
def httpxs(url: str):
    """GET *url* with httpx and return the response body decoded as text."""
    import httpx

    response = httpx.get(url)
    return response.text
| if __name__ == "__main__": | |
| url = "https://cn.pornhub.com/video/search?search=hongkongdoll" | |
| response = httpx.get(url) | |
| soup = BeautifulSoup(response.text, "html.parser") | |
| text_content = soup.find_all("p") | |
| print([ele.get_text() for ele in text_content if ele.get_text() != ""]) | |
| text_content = "\n".join([p.get_text() for p in text_content if p.get_text() != ""]) | |
| print(f"text_content: {text_content}") | |