# scripts/scrape_discourse.py
import os
import json
from datetime import datetime

from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright, TimeoutError

BASE_URL = "https://discourse.onlinedegree.iitm.ac.in"
CATEGORY_ID = 34
CATEGORY_JSON_URL = f"{BASE_URL}/c/courses/tds-kb/{CATEGORY_ID}.json"
AUTH_STATE_FILE = "auth.json"
DATE_FROM = datetime(2025, 1, 1)
DATE_TO = datetime(2025, 4, 14)


def parse_date(date_str):
    """Parse Discourse timestamps, with or without fractional seconds."""
    try:
        return datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S.%fZ")
    except ValueError:
        return datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ")


def login_and_save_auth(playwright):
    """Open a visible browser, let the user log in, and save the session state."""
    print("Login required. Opening browser...")
    browser = playwright.chromium.launch(headless=False)
    context = browser.new_context()
    page = context.new_page()
    page.goto(f"{BASE_URL}/login")
    print("Log in using Google (IITM email), then press Resume in the Playwright inspector.")
    page.pause()
    context.storage_state(path=AUTH_STATE_FILE)
    print("Login state saved.")
    browser.close()


def is_authenticated(page):
    try:
        page.goto(CATEGORY_JSON_URL, timeout=10000)
        page.wait_for_selector("pre", timeout=5000)
        json.loads(page.inner_text("pre"))
        return True
    except (TimeoutError, json.JSONDecodeError):
        return False


def scrape_posts(playwright):
    os.makedirs("data", exist_ok=True)
    browser = playwright.chromium.launch(headless=True)
    context = browser.new_context(storage_state=AUTH_STATE_FILE)
    page = context.new_page()

    # Page through the category JSON until an empty topic list comes back.
    all_topics = []
    page_num = 0
    while True:
        paginated_url = f"{CATEGORY_JSON_URL}?page={page_num}"
        print(f"Fetching page {page_num}")
        page.goto(paginated_url)
        try:
            data = json.loads(page.inner_text("pre"))
        except Exception:
            data = json.loads(page.content())
        topics = data.get("topic_list", {}).get("topics", [])
        if not topics:
            break
        all_topics.extend(topics)
        page_num += 1
    print(f"Found {len(all_topics)} topics")

    # Keep only topics created inside the date window, then collect every post.
    filtered_posts = []
    for topic in all_topics:
        created_at = parse_date(topic["created_at"])
        if DATE_FROM <= created_at <= DATE_TO:
            topic_url = f"{BASE_URL}/t/{topic['slug']}/{topic['id']}.json"
            page.goto(topic_url)
            try:
                topic_data = json.loads(page.inner_text("pre"))
            except Exception:
                topic_data = json.loads(page.content())
            posts = topic_data.get("post_stream", {}).get("posts", [])
            for post in posts:
                filtered_posts.append({
                    "topic_id": topic["id"],
                    "topic_title": topic.get("title"),
                    "author": post["username"],
                    "created_at": post["created_at"],
                    # Strip HTML from the rendered ("cooked") post body.
                    "content": BeautifulSoup(post["cooked"], "html.parser").get_text()
                })

    with open("data/discourse_posts.json", "w", encoding="utf-8") as f:
        json.dump(filtered_posts, f, indent=2)
    print(f"Saved {len(filtered_posts)} posts")
    browser.close()


def main():
    with sync_playwright() as p:
        if not os.path.exists(AUTH_STATE_FILE):
            login_and_save_auth(p)
        else:
            # Check that the saved session is still valid before scraping.
            browser = p.chromium.launch(headless=True)
            context = browser.new_context(storage_state=AUTH_STATE_FILE)
            page = context.new_page()
            if not is_authenticated(page):
                browser.close()
                login_and_save_auth(p)
            else:
                browser.close()
        scrape_posts(p)


if __name__ == "__main__":
    main()
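
# Usage sketch, assuming Playwright and BeautifulSoup are installed and the
# Chromium browser has been fetched, e.g.:
#
#   pip install playwright beautifulsoup4
#   playwright install chromium
#   python scripts/scrape_discourse.py
#
# On the first run a browser window opens for the Google login; the session is
# saved to auth.json and reused afterwards. Each record written to
# data/discourse_posts.json follows the dict built in scrape_posts (the values
# below are hypothetical):
#
#   {
#     "topic_id": 12345,
#     "topic_title": "Example topic title",
#     "author": "example_user",
#     "created_at": "2025-02-01T10:00:00.000Z",
#     "content": "Plain-text body of the post"
#   }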