sync-pd2-wiki/.claude/skills/wiki-sync-translate/scripts/fetch_runeword_names.py

123 lines
3.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
"""
Fetch all runeword pages from the Chinese wiki, extract the Chinese and English
runeword names, and save them as JSON.
Intended as a reference for the wiki-sync-translate skill when it translates
runeword-related pages.
"""
import os
import re
import json
import requests
from pathlib import Path
# ==================== Configuration ====================
# API endpoint of the Chinese wiki; the WIKI_API_URL_CN environment variable
# overrides the built-in default.
WIKI_API_URL_CN = os.getenv(
    "WIKI_API_URL_CN",
    "https://wiki.projectdiablo2.cn/w/api.php",
)

# Shared HTTP session that sends a browser-like User-Agent on every request.
SESSION_CN = requests.Session()
SESSION_CN.headers["User-Agent"] = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
)
# Do not let the session pick up settings from the environment
# (proxy configuration and similar, per the requests documentation).
SESSION_CN.trust_env = False

# Runeword pages to fetch (one per equipment slot)
RUNEWORD_PAGES = ["RWWeapons", "RWChests", "RWQuivers", "RWShields", "RWHelms"]

# Output location
SCRIPT_DIR = Path(__file__).resolve().parent
REFERENCES_DIR = SCRIPT_DIR.parent.joinpath("references")
OUTPUT_FILE = REFERENCES_DIR.joinpath("runeword_names.json")
# ========================================================
def get_page_content(title, timeout=30):
    """Fetch the wikitext of a page from the Chinese wiki.

    Sends an ``action=query`` / ``prop=revisions`` request for *title* to
    ``WIKI_API_URL_CN`` through the shared ``SESSION_CN`` session and reads
    the main-slot content of the first revision in the response.

    Args:
        title: Title of the wiki page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script indefinitely.

    Returns:
        The page content as a string, or ``None`` when the returned page has
        no revisions or when the request / response handling failed. Failures
        are printed, never raised, so the caller can carry on with other pages.
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "content",
        "rvslots": "main",
        "format": "json"
    }
    try:
        r = SESSION_CN.get(WIKI_API_URL_CN, params=params, timeout=timeout)
        r.raise_for_status()
        data = r.json()
        pages = data["query"]["pages"]
        # Only one title is requested, so only the first page entry is read.
        page = next(iter(pages.values()))
        if "revisions" in page:
            return page["revisions"][0]["slots"]["main"]["*"]
    except Exception as e:
        # Deliberately best-effort: report the problem and fall through to None.
        print(f" 获取页面 '{title}' 时出错: {e}")
    return None
def extract_runeword_names(wikitext):
    """Extract every level-3 heading title from wikitext as a runeword name.

    A heading is only recognised when it occupies a whole line and opens with
    exactly three ``=`` characters, e.g. ``=== 精神 Spirit ===``.

    Args:
        wikitext: Raw page wikitext; may be ``None`` or empty.

    Returns:
        The heading titles in document order with surrounding blanks removed.
        An empty list when *wikitext* is falsy or contains no such heading.
    """
    if not wikitext:
        return []
    # Anchor to line boundaries and refuse a fourth opening '=' so that deeper
    # headings such as "==== Notes ====" are not reported as "= Notes".
    return re.findall(
        r'^===(?!=)[ \t]*(.+?)[ \t]*===[ \t\r]*$',
        wikitext,
        flags=re.MULTILINE,
    )
def split_cn_en(full_name):
    """Split a heading of the form "<Chinese name> <English name>".

    The English name may consist of several words (e.g. "Flickering Flame",
    "King's Grace"), so the pattern looks for a run of whitespace-separated
    Latin-letter words (apostrophes allowed) that reaches the end of the
    string.

    Returns:
        ``(cn_name, en_name)`` when such a trailing run exists and non-blank
        text precedes it; otherwise ``(None, full_name)``, which covers purely
        English titles and titles that cannot be split.
    """
    trailing_english = r'((?:[A-Za-z][A-Za-z\']*(?:\'[A-Za-z]+)*)(?:\s+[A-Za-z][A-Za-z\']*(?:\'[A-Za-z]+)*)*)\s*$'
    found = re.search(trailing_english, full_name)
    if found is None:
        # No trailing English words at all.
        return None, full_name

    # Everything before the English run is taken as the Chinese name.
    chinese = full_name[:found.start()].strip()
    if not chinese:
        # English only: nothing precedes the matched run.
        return None, full_name
    return chinese, found.group(1)
def build_lookup(names):
    """Build a lookup table from English name to full bilingual heading.

    Each heading is expected to look like "<Chinese name> <English name>",
    where the English name may contain several words. The key is the English
    part returned by ``split_cn_en``; the value is the unmodified heading.
    When two headings share an English part, the later one wins.
    """
    return {split_cn_en(heading)[1]: heading for heading in names}
def main():
    """Fetch every runeword page, merge the name lookups and write them as JSON.

    For each title in ``RUNEWORD_PAGES`` the page is downloaded, its heading
    names are extracted and folded into one dictionary, which is then written
    to ``OUTPUT_FILE``. Progress and per-page counts are printed.
    """
    REFERENCES_DIR.mkdir(exist_ok=True)

    merged = {}
    counts = {}
    for page_title in RUNEWORD_PAGES:
        print(f"拉取: {page_title} ...", end=" ")
        names = extract_runeword_names(get_page_content(page_title))
        merged.update(build_lookup(names))
        counts[page_title] = len(names)
        print(f"找到 {len(names)} 个符文之语")

    # Save as JSON, keeping non-ASCII characters readable in the file.
    with OUTPUT_FILE.open("w", encoding="utf-8") as out:
        json.dump(merged, out, ensure_ascii=False, indent=2)

    total = len(merged)
    print(f"\n总计 {total} 个符文之语名称已保存到 {OUTPUT_FILE}")
    for page, count in counts.items():
        print(f" {page}: {count}")


if __name__ == "__main__":
    main()