import logging

import httpx
from bs4 import BeautifulSoup
from langchain_community.document_loaders import WebBaseLoader
from llama_index.core.llms import ChatMessage
from llama_index.readers.web import SimpleWebPageReader
from newspaper import Article
def load_web(url: str):
    """Fetch *url* with LangChain's WebBaseLoader and return its documents."""
    return WebBaseLoader(url).load()
def llama_load_web(url: str):
    """Load *url* via LlamaIndex's SimpleWebPageReader with HTML-to-text conversion."""
    reader = SimpleWebPageReader(html_to_text=True)
    return reader.load_data([url])
def newspaper_load_web(url: str):
    """Download and parse *url* with newspaper.

    Returns:
        dict with "title" and "text" keys on success, or the empty string
        "" on any failure (kept for backward compatibility — existing
        callers may test for "").
    """
    article = Article(url)
    try:
        # Only the network/parse steps can raise; keep the try body minimal.
        article.download()
        article.parse()
    except Exception:
        # Previously the exception was swallowed silently ("except ... as e"
        # with e unused); at least record it before returning the sentinel.
        logging.exception("newspaper failed to load %s", url)
        return ""
    return {
        "title": article.title,
        "text": article.text,
    }
def html2text(url: str):
    """Fetch *url* and convert the page's HTML to plain text.

    Bug fix: the previous version passed the URL string itself to
    ``HTML2Text.handle()``, which expects HTML markup, so it converted the
    URL's characters rather than the page content. We now download the
    page first (httpx is already a dependency of this module).
    """
    import httpx
    from html2text import HTML2Text

    html = httpx.get(url).text
    converter = HTML2Text()
    converter.ignore_links = True  # drop hyperlink markup, keep readable text
    return converter.handle(html)
def httpxs(url: str):
    """GET *url* with httpx and return the response body decoded as text."""
    import httpx

    response = httpx.get(url)
    return response.text
| if __name__ == "__main__": | |
| url = "https://cn.pornhub.com/video/search?search=hongkongdoll" | |
| response = httpx.get(url) | |
| soup = BeautifulSoup(response.text, "html.parser") | |
| text_content = soup.find_all("p") | |
| print([ele.get_text() for ele in text_content if ele.get_text() != ""]) | |
| text_content = "\n".join([p.get_text() for p in text_content if p.get_text() != ""]) | |
| print(f"text_content: {text_content}") | |