from io import StringIO
from typing import Any

import pandas as pd
import requests
from bs4 import BeautifulSoup


def process_list_element(list_element: Any, indent: int = 0) -> str:
    """Recursively convert a <ul>/<ol> element into indented plain text."""
    result = []
    is_ordered = list_element.name == "ol"

    for i, li in enumerate(list_element.find_all("li", recursive=False)):
        # Get the list item's own text, excluding any nested lists
        item_text = ""
        for content in li.contents:
            if content.name not in ["ul", "ol"]:
                item_text += str(content)
        item_text = BeautifulSoup(item_text, "html.parser").get_text().strip()

        # Use numbers for ordered lists, bullet symbols otherwise
        prefix = "  " * indent + (f"{i + 1}. " if is_ordered else "* ")
        if item_text:
            result.append(prefix + item_text)

        # Process nested lists one indent level deeper
        for nested_list in li.find_all(["ul", "ol"], recursive=False):
            nested_content = process_list_element(nested_list, indent + 1)
            if nested_content:
                result.append(nested_content)

    return "\n".join(result)


def get_wiki_content(title: str, language: str = "en") -> tuple[str, dict[str, pd.DataFrame]]:
    """
    Get Wikipedia page content and tables.

    Returns:
        A tuple containing the page content as a string and a dictionary of tables
        extracted from the page. The keys of the dictionary are "table_1", "table_2",
        etc. and the values are pandas DataFrames representing the tables.

    Example:
        content, tables = get_wiki_content("Python_(programming_language)")
        print(content)
        print(tables["table_1"])  # Access the first table

    Args:
        title: Wikipedia page title (e.g., "Python_(programming_language)")
        language: Wikipedia language (e.g., "en" for English, "ja" for Japanese)
    """
    # Build the parse API URL
    api_url = f"https://{language}.wikipedia.org/w/api.php"

    # API parameters
    params = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "text",
        "disabletoc": True,
    }

    # Send the request
    response = requests.get(api_url, params=params, timeout=30)  # type: ignore

    # Check the HTTP response
    if response.status_code != 200:
        raise Exception(f"api error: {response.status_code} - {response.text}")

    # Parse the JSON response
    data = response.json()

    # Check for API-level errors
    if "error" in data:
        raise Exception(f"api error: {data['error']['info']}")
    if "parse" not in data:
        raise Exception("api error: No parse data found")

    # Get the rendered HTML content
    html_content = data["parse"]["text"]["*"]

    # Parse the HTML twice: one tree for table extraction, one for placeholder replacement
    soup = BeautifulSoup(html_content, "html.parser")
    content_soup = BeautifulSoup(html_content, "html.parser")

    # Collect table information
    tables_dict: dict[str, pd.DataFrame] = {}
    table_ids: list[tuple[str, str]] = []  # list of (table_id, table_html)

    # Identify target tables: infoboxes and wikitables
    table_index = 1

    # First, process infoboxes (biography tables)
    infoboxes = soup.find_all("table", class_=lambda c: c and "infobox" in c)
    for table in infoboxes:
        table_id = f"table_{table_index}"
        table_ids.append((table_id, str(table)))
        table_index += 1

    # Next, process wikitables
    wikitables = soup.find_all("table", class_="wikitable")
    for table in wikitables:
        table_id = f"table_{table_index}"
        table_ids.append((table_id, str(table)))
        table_index += 1

    # Parse the extracted tables with pandas
    for table_id, table_html in table_ids:
        try:
            dfs = pd.read_html(StringIO(table_html))
            if dfs:
                tables_dict[table_id] = dfs[0]
        except Exception:
            # Skip tables that pandas fails to parse
            continue

    # Replace tables in the content with placeholders
    table_placeholders: dict[str, str] = {}

    # Handle infoboxes
    for i, table in enumerate(content_soup.find_all("table", class_=lambda c: c and "infobox" in c)):
        table_id = f"table_{i + 1}"
        if table_id in tables_dict:
            placeholder = f"{{{{{table_id}}}}}"
            table_placeholders[table_id] = placeholder
            table_placeholder_tag = content_soup.new_tag("p")
            table_placeholder_tag.string = placeholder
            table.replace_with(table_placeholder_tag)

    # Handle wikitables (indices continue after the infoboxes)
    wikitable_start_index = len(infoboxes) + 1
    for i, table in enumerate(content_soup.find_all("table", class_="wikitable")):
        table_id = f"table_{wikitable_start_index + i}"
        if table_id in tables_dict:
            placeholder = f"{{{{{table_id}}}}}"
            table_placeholders[table_id] = placeholder
            table_placeholder_tag = content_soup.new_tag("p")
            table_placeholder_tag.string = placeholder
            table.replace_with(table_placeholder_tag)

    # Strip elements that would pollute the body text:
    # footnote markers, hatnotes, navboxes, and edit-section links
    for element in content_soup.find_all("sup"):
        element.decompose()
    for element in content_soup.select("div.hatnote, div.navbox, span.mw-editsection"):
        element.decompose()

    # Collect headings, paragraphs, and lists
    elements = content_soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "ol"])

    text_content: list[str] = []
    for element in elements:
        if element.name and element.name.startswith("h"):  # type: ignore
            level = int(element.name[1])  # type: ignore
            heading_text = element.get_text().strip()
            if heading_text:  # skip empty headings
                text_content.append("\n" + "#" * level + " " + heading_text)
        elif element.name == "p":  # type: ignore
            paragraph_text = element.get_text().strip()
            if paragraph_text:  # skip empty paragraphs
                # Table placeholders ({{table_N}}) are plain <p> text at this point,
                # so they are appended just like any other paragraph
                text_content.append(paragraph_text)
        elif element.name in ["ul", "ol"] and element.parent.name not in ["li", "ul", "ol"]:  # type: ignore
            # Only process top-level lists (nested lists are handled via their parent li)
            list_content = process_list_element(element)
            if list_content:
                text_content.append(list_content)

    # Join the text content
    content = "\n\n".join(text_content)

    return content, tables_dict
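
# A minimal usage sketch, not part of the module above. It assumes network access to
# the Wikipedia API and uses "Python_(programming_language)" purely as an illustrative
# title; any page with at least one wikitable or infobox would do.
if __name__ == "__main__":
    page_content, page_tables = get_wiki_content("Python_(programming_language)", language="en")

    # Preview the extracted body text (headings are rendered as "#"-prefixed lines,
    # tables appear inline as {{table_N}} placeholders)
    print(page_content[:500])

    # List the extracted tables and show the first one, if any were parsed
    print(sorted(page_tables.keys()))
    if "table_1" in page_tables:
        print(page_tables["table_1"].head())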