# scripts/scrape_html_to_md.py
"""Crawl a Docsify single-page site with Playwright and save each page as Markdown.

Starting from BASE_URL, every same-origin hash-route ("/#/") link is visited
recursively; the rendered <article> content is converted to Markdown with a
small YAML front-matter header, and a metadata.json index is written at the end.
"""
import os
import json
import re
from datetime import datetime
from urllib.parse import urljoin
from markdownify import markdownify as md
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup

BASE_URL = "https://tds.s-anand.net/#/2025-01/"
BASE_ORIGIN = "https://tds.s-anand.net"
OUTPUT_DIR = "markdown_files"
METADATA_FILE = "metadata.json"

visited = set()   # URLs already crawled (recursion guard)
metadata = []     # one dict per saved page, dumped to METADATA_FILE


def sanitize_filename(title):
    """Return *title* as a filesystem-safe name: forbidden chars and spaces -> '_'."""
    return re.sub(r'[\/*?:"<>|]', "_", title).strip().replace(" ", "_")


def extract_all_internal_links(page):
    """Collect the unique same-origin hash-route links currently on *page*."""
    links = page.eval_on_selector_all("a[href]", "els => els.map(el => el.href)")
    return list({link for link in links if BASE_ORIGIN in link and '/#/' in link})


def wait_for_article_and_get_html(page):
    """Wait (<=10 s) for the Docsify article container to render; return its inner HTML."""
    page.wait_for_selector("article.markdown-section#main", timeout=10000)
    return page.inner_html("article.markdown-section#main")


def crawl_page(page, url):
    """Visit *url*, save its article as Markdown, then recurse into internal links.

    Pages that fail to load or render are logged and skipped; the crawl continues.
    """
    if url in visited:
        return
    visited.add(url)
    print(f"📄 Visiting: {url}")

    try:
        page.goto(url, wait_until="domcontentloaded")
        page.wait_for_timeout(1000)  # give the SPA a moment to finish client-side rendering
        html = wait_for_article_and_get_html(page)
    except Exception as e:
        print(f"❌ Error: {e}")
        return

    title = page.title().split(" - ")[0].strip() or f"page_{len(visited)}"
    filename = sanitize_filename(title)
    # BUG FIX: previously this used the literal name "(unknown).md", so every
    # page overwrote the same file. Use the sanitized title instead.
    filepath = os.path.join(OUTPUT_DIR, f"{filename}.md")

    markdown = md(html)
    # Capture one timestamp so the front matter and metadata.json always agree.
    downloaded_at = datetime.now().isoformat()

    with open(filepath, "w", encoding="utf-8") as f:
        f.write(
            f"---\ntitle: \"{title}\"\noriginal_url: \"{url}\"\n"
            f"downloaded_at: \"{downloaded_at}\"\n---\n\n"
        )
        f.write(markdown)

    metadata.append({
        "title": title,
        "filename": f"{filename}.md",
        "original_url": url,
        "downloaded_at": downloaded_at,
    })

    for link in extract_all_internal_links(page):
        crawl_page(page, link)


def main():
    """Launch headless Chromium, crawl from BASE_URL, and write the metadata index."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        context = browser.new_context()
        page = context.new_page()

        crawl_page(page, BASE_URL)

        with open(METADATA_FILE, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2)

        print(f"✅ Completed. {len(metadata)} pages saved.")
        browser.close()


if __name__ == "__main__":
    main()