# scripts/scrape_discourse.py
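"""Scrape the TDS knowledge-base category of the IITM Online Degree Discourse.

Uses a saved Playwright login session (auth.json) to fetch topics created
between DATE_FROM and DATE_TO and writes their posts, as plain text, to
data/discourse_posts.json.
"""
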
import os
import json
from datetime import datetime
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright, TimeoutError
BASE_URL = "https://discourse.onlinedegree.iitm.ac.in"
CATEGORY_ID = 34
CATEGORY_JSON_URL = f"{BASE_URL}/c/courses/tds-kb/{CATEGORY_ID}.json"
AUTH_STATE_FILE = "auth.json"
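# Only topics created inside this window are scraped (bounds are inclusive).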
DATE_FROM = datetime(2025, 1, 1)
DATE_TO = datetime(2025, 4, 14)

def parse_date(date_str):
    try:
        return datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S.%fZ")
    except ValueError:
        return datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ")

def login_and_save_auth(playwright):
    print("🔐 Login required. Opening browser...")
    browser = playwright.chromium.launch(headless=False)
    context = browser.new_context()
    page = context.new_page()
    page.goto(f"{BASE_URL}/login")
    print("➡️ Log in using Google (IITM email), then press ▶️ in Playwright.")
    page.pause()
    context.storage_state(path=AUTH_STATE_FILE)
    print("✅ Login state saved.")
    browser.close()

def is_authenticated(page):
    # A logged-in session gets the raw category JSON back (Chromium shows it
    # inside a <pre> tag); a logged-out one typically lands on an HTML login
    # page instead, so parsing fails.
    try:
        page.goto(CATEGORY_JSON_URL, timeout=10000)
        page.wait_for_selector("pre", timeout=5000)
        json.loads(page.inner_text("pre"))
        return True
    except (TimeoutError, json.JSONDecodeError):
        return False

def scrape_posts(playwright):
    os.makedirs("data", exist_ok=True)
    browser = playwright.chromium.launch(headless=True)
    context = browser.new_context(storage_state=AUTH_STATE_FILE)
    page = context.new_page()

    # Page through the category listing until an empty page comes back.
    all_topics = []
    page_num = 0
    while True:
        paginated_url = f"{CATEGORY_JSON_URL}?page={page_num}"
        print(f"📦 Fetching page {page_num}")
        page.goto(paginated_url)
        try:
            # Chromium renders a raw JSON response inside a <pre> element.
            data = json.loads(page.inner_text("pre"))
        except Exception:
            data = json.loads(page.content())
        topics = data.get("topic_list", {}).get("topics", [])
        if not topics:
            break
        all_topics.extend(topics)
        page_num += 1
    print(f"✅ Found {len(all_topics)} topics")

    # Keep only topics created inside the scrape window, then collect every
    # post from each matching topic.
    filtered_posts = []
    for topic in all_topics:
        created_at = parse_date(topic["created_at"])
        if DATE_FROM <= created_at <= DATE_TO:
            topic_url = f"{BASE_URL}/t/{topic['slug']}/{topic['id']}.json"
            page.goto(topic_url)
            try:
                topic_data = json.loads(page.inner_text("pre"))
            except Exception:
                topic_data = json.loads(page.content())
            posts = topic_data.get("post_stream", {}).get("posts", [])
            for post in posts:
                filtered_posts.append({
                    "topic_id": topic["id"],
                    "topic_title": topic.get("title"),
                    "author": post["username"],
                    "created_at": post["created_at"],
                    # Strip HTML from the rendered ("cooked") post body.
                    "content": BeautifulSoup(post["cooked"], "html.parser").get_text()
                })

    with open("data/discourse_posts.json", "w", encoding="utf-8") as f:
        json.dump(filtered_posts, f, indent=2)
    print(f"✅ Saved {len(filtered_posts)} posts")
    browser.close()
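
# Each record written to data/discourse_posts.json has this shape
# (illustrative values only, not real scraped data):
#   {
#     "topic_id": 12345,
#     "topic_title": "Example topic title",
#     "author": "example_user",
#     "created_at": "2025-01-15T10:30:00.000Z",
#     "content": "Plain-text body with HTML stripped"
#   }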

def main():
    with sync_playwright() as p:
        if not os.path.exists(AUTH_STATE_FILE):
            login_and_save_auth(p)
        else:
            browser = p.chromium.launch(headless=True)
            context = browser.new_context(storage_state=AUTH_STATE_FILE)
            page = context.new_page()
            if not is_authenticated(page):
                browser.close()
                login_and_save_auth(p)
            else:
                browser.close()
        scrape_posts(p)


if __name__ == "__main__":
    main()
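
# Usage sketch (assumes the Playwright Chromium binary has been installed):
#   pip install playwright beautifulsoup4
#   playwright install chromium
#   python scripts/scrape_discourse.py
# The first run opens a visible browser window for the Google (IITM) login and
# saves the session to auth.json; subsequent runs reuse it and scrape headlessly.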