# -*- coding: utf-8 -*-
"""
Fetch every runeword page from the Chinese wiki, extract each runeword's
Chinese and English names, and save them as JSON.

The output is used as a reference by the wiki-sync-translate skill when it
translates runeword-related pages.
"""
|
||
|
||
import os
|
||
import re
|
||
import json
|
||
import requests
|
||
from pathlib import Path
|
||
|
||
# ==================== Configuration ====================
# API endpoint of the Chinese wiki; can be overridden via the environment.
WIKI_API_URL_CN = os.getenv("WIKI_API_URL_CN", "https://wiki.projectdiablo2.cn/w/api.php")

# Shared HTTP session used for every request to the Chinese wiki.
SESSION_CN = requests.Session()
SESSION_CN.headers["User-Agent"] = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
)
# Do not pick up proxy or other settings from the environment.
SESSION_CN.trust_env = False

# Titles of the runeword pages (one per equipment slot).
RUNEWORD_PAGES = [
    "RWWeapons",
    "RWChests",
    "RWQuivers",
    "RWShields",
    "RWHelms",
]

# Output location: <script dir>/../references/runeword_names.json
SCRIPT_DIR = Path(__file__).resolve().parent
REFERENCES_DIR = SCRIPT_DIR.parent / "references"
OUTPUT_FILE = REFERENCES_DIR / "runeword_names.json"
# =======================================================
|
||
|
||
|
||
def get_page_content(title, timeout=30):
    """Fetch the raw wikitext of one page from the Chinese wiki.

    Args:
        title: Title of the wiki page to fetch.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block the script
            indefinitely.

    Returns:
        The content of the ``main`` slot of the first revision returned by
        the API, or ``None`` when the page has no revisions or when the
        request / response handling fails (the error is printed).
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "content",
        "rvslots": "main",
        "format": "json",
    }
    try:
        r = SESSION_CN.get(WIKI_API_URL_CN, params=params, timeout=timeout)
        r.raise_for_status()
        pages = r.json()["query"]["pages"]
        # A single title was requested, so only the first entry is relevant.
        page = next(iter(pages.values()))
        if "revisions" in page:
            return page["revisions"][0]["slots"]["main"]["*"]
    except Exception as e:
        # Best-effort boundary: report the problem and let the caller
        # continue with the remaining pages.
        print(f" 获取页面 '{title}' 时出错: {e}")
    return None
|
||
|
||
|
||
def extract_runeword_names(wikitext):
    """Extract the titles of all level-3 headings from wikitext.

    Only lines of the form ``=== Title ===`` are matched. Headings of any
    other level (e.g. ``== X ==`` or ``==== X ====``) and ``===`` runs in
    the middle of a line are ignored, so no title is returned with stray
    ``=`` characters.

    Args:
        wikitext: Page source as a string; may be ``None`` or empty.

    Returns:
        A list of heading titles in document order, with surrounding
        whitespace removed. Empty when ``wikitext`` is falsy.
    """
    if not wikitext:
        return []
    # ``[^\S\n]`` is "whitespace except newline", which keeps the match on a
    # single line; the lookarounds reject headings with more than three "=".
    return re.findall(
        r'^===(?!=)[^\S\n]*(.+?)[^\S\n]*(?<!=)===[^\S\n]*$',
        wikitext,
        flags=re.MULTILINE,
    )
|
||
|
||
|
||
def split_cn_en(full_name):
    """Split a heading of the form "<Chinese name> <English name>".

    The English name may consist of several words (e.g. "Flickering Flame",
    "King's Grace"), so the longest run of English words at the end of the
    string is taken as the English name. Inside a word, straight and
    typographic apostrophes and hyphens are accepted.

    Args:
        full_name: The heading text.

    Returns:
        ``(cn_name, en_name)`` when an English tail is found and something
        precedes it; otherwise ``(None, full_name)`` (purely English title,
        or a title that cannot be split).
    """
    # One English word: starts with a letter, may continue with letters,
    # apostrophes (' or ’) or hyphens.
    word = r"[A-Za-z][A-Za-z'’\-]*"
    match = re.search(rf"({word}(?:\s+{word})*)\s*$", full_name)
    if match:
        cn_part = full_name[:match.start()].strip()
        if cn_part:
            return cn_part, match.group(1)
    # Purely English, or no English tail to split off.
    return None, full_name
|
||
|
||
|
||
def build_lookup(names):
    """Build a lookup table mapping English name -> full heading text.

    Each heading is expected to look like "<Chinese name> <English name>",
    where the English name may contain several words. The key is the
    English part returned by ``split_cn_en``; the value is the unmodified
    heading. Later headings overwrite earlier ones with the same key.
    """
    return {split_cn_en(title)[1]: title for title in names}
|
||
|
||
|
||
def main():
    """Fetch all runeword pages, merge their name lookups and save as JSON.

    For every title in ``RUNEWORD_PAGES`` the page is fetched, its level-3
    headings are extracted and turned into an "English name -> full
    heading" mapping. The merged mapping is written to ``OUTPUT_FILE``.

    Pages that could not be fetched are reported. If nothing at all was
    extracted, the output file is left untouched so that an existing
    reference file is not replaced by an empty one.
    """
    REFERENCES_DIR.mkdir(exist_ok=True)

    all_lookup = {}
    page_stats = {}
    failed_pages = []

    for page_title in RUNEWORD_PAGES:
        print(f"拉取: {page_title} ...", end=" ")
        wikitext = get_page_content(page_title)
        if wikitext is None:
            # Remember failures so they are not mistaken for empty pages.
            failed_pages.append(page_title)
        names = extract_runeword_names(wikitext)
        all_lookup.update(build_lookup(names))
        page_stats[page_title] = len(names)
        print(f"找到 {len(names)} 个符文之语")

    if failed_pages:
        print(f"警告: 以下页面获取失败: {', '.join(failed_pages)}")

    if not all_lookup:
        # Do not overwrite a previously generated file with an empty result.
        print(f"未提取到任何符文之语名称,未写入 {OUTPUT_FILE}")
        return

    # Save the merged lookup as JSON.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(all_lookup, f, ensure_ascii=False, indent=2)

    total = len(all_lookup)
    print(f"\n总计 {total} 个符文之语名称已保存到 {OUTPUT_FILE}")
    for page, count in page_stats.items():
        print(f" {page}: {count}")
|
||
|
||
|
||
# Run the sync only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|