# -*- coding: utf-8 -*-
"""Fetch all runeword pages from the Chinese wiki, extract each runeword's
Chinese/English name, and save the result as JSON.

Used as a reference by the wiki-sync-translate skill when translating
runeword-related pages.
"""
import os
import re
import json
import requests
from pathlib import Path

# ==================== Configuration ====================
WIKI_API_URL_CN = os.getenv("WIKI_API_URL_CN", "https://wiki.projectdiablo2.cn/w/api.php")

# Per-request timeout in seconds (applies to both connect and read).
# BUGFIX: requests has NO default timeout; without one a stalled connection
# would hang the whole script forever.
REQUEST_TIMEOUT = 30

SESSION_CN = requests.Session()
SESSION_CN.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
})
# Ignore proxy/CA settings from environment variables.
SESSION_CN.trust_env = False

# Runeword pages, grouped by equipment slot.
RUNEWORD_PAGES = [
    "RWWeapons",
    "RWChests",
    "RWQuivers",
    "RWShields",
    "RWHelms",
]

# Output location: <skill root>/references/runeword_names.json
SCRIPT_DIR = Path(__file__).resolve().parent
REFERENCES_DIR = SCRIPT_DIR.parent / "references"
OUTPUT_FILE = REFERENCES_DIR / "runeword_names.json"
# ================================================


def get_page_content(title):
    """Fetch the full wikitext of page *title* from the Chinese wiki.

    Returns the raw wikitext string, or None when the page is missing or
    the request fails (the error is printed, not raised, so one bad page
    does not abort the whole run).
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "content",
        "rvslots": "main",
        "format": "json"
    }
    try:
        r = SESSION_CN.get(WIKI_API_URL_CN, params=params, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
        data = r.json()
        pages = data["query"]["pages"]
        # "pages" is keyed by page id; with a single title there is exactly
        # one entry, so take the first value.
        page = next(iter(pages.values()))
        if "revisions" in page:
            return page["revisions"][0]["slots"]["main"]["*"]
    except Exception as e:
        print(f" 获取页面 '{title}' 时出错: {e}")
    return None


def extract_runeword_names(wikitext):
    """Extract all level-3 headings from *wikitext* as runeword names.

    Returns an empty list for None/empty input (e.g. when the fetch failed).
    """
    if not wikitext:
        return []
    return re.findall(r'===\s*(.+?)\s*===', wikitext)


def split_cn_en(full_name):
    """Split a heading of the form "<Chinese name> <English name>".

    The English name may span several words (e.g. "Flickering Flame",
    "King's Grace"), so we match the trailing run of English words from
    the end of the string.

    Returns (cn_name, en_name); cn_name is None when the heading is pure
    English or cannot be split.
    """
    match = re.search(r'((?:[A-Za-z][A-Za-z\']*(?:\'[A-Za-z]+)*)(?:\s+[A-Za-z][A-Za-z\']*(?:\'[A-Za-z]+)*)*)\s*$', full_name)
    if match:
        en_name = match.group(1)
        cn_part = full_name[:match.start()].strip()
        if cn_part:
            return cn_part, en_name
    # Pure English heading, or no trailing English run found.
    return None, full_name


def build_lookup(names):
    """Build the lookup table: English name -> full "Chinese English" title.

    Headings are "中文名 英文名"; the English part may contain several words.
    """
    lookup = {}
    for full_name in names:
        # Only the English key is needed here; the Chinese half is implied
        # by the full title stored as the value.
        _, en_name = split_cn_en(full_name)
        lookup[en_name] = full_name
    return lookup


def main():
    """Fetch every runeword page, merge the lookups, and write the JSON file."""
    REFERENCES_DIR.mkdir(exist_ok=True)
    all_lookup = {}
    page_stats = {}
    for page_title in RUNEWORD_PAGES:
        print(f"拉取: {page_title} ...", end=" ")
        wikitext = get_page_content(page_title)
        names = extract_runeword_names(wikitext)
        lookup = build_lookup(names)
        all_lookup.update(lookup)
        page_stats[page_title] = len(names)
        print(f"找到 {len(names)} 个符文之语")

    # Persist as UTF-8 JSON with the CJK characters kept readable.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(all_lookup, f, ensure_ascii=False, indent=2)

    total = len(all_lookup)
    print(f"\n总计 {total} 个符文之语名称已保存到 {OUTPUT_FILE}")
    for page, count in page_stats.items():
        print(f" {page}: {count}")


if __name__ == "__main__":
    main()