sync-pd2-wiki/.claude/skills/wiki-sync-translate/scripts/wiki_sync.py

448 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
"""
MediaWiki wiki sync tool - AI Agent edition.

Emits JSON-formatted comparison files describing what changed on the
English wiki, so an AI agent can read them and apply matching updates
to the Chinese wiki.
"""
import argparse
import difflib
import json
import os
import re
from datetime import datetime, timedelta, timezone
from pathlib import Path

import requests
from dotenv import load_dotenv
# ==================== Configuration ====================
load_dotenv()
WIKI_API_URL_EN = os.getenv("WIKI_API_URL_EN", "https://wiki.projectdiablo2.com/w/api.php")
WIKI_API_URL_CN = os.getenv("WIKI_API_URL_CN", "https://wiki.projectdiablo2.cn/w/api.php")

# All artifacts are written under this directory; each run creates its own
# timestamped subdirectory (created lazily by save_files).
OUTPUT_DIR = Path("wiki_sync_output")
OUTPUT_DIR.mkdir(exist_ok=True)
CURRENT_OUTPUT_DIR = None  # set on the first save_files() call of a run

# File that persists the timestamp of the last successful sync.
LAST_TIMESTAMP_FILE = "last_sync_timestamp.txt"

# Session for the English wiki: identifies itself as this tool.
SESSION_EN = requests.Session()
SESSION_EN.headers.update({
    "User-Agent": "WikiSyncTool/5.0 (AI Agent Version)"
})

# Session for the Chinese wiki: browser-like UA; trust_env=False ignores
# any proxy settings from the environment.
SESSION_CN = requests.Session()
SESSION_CN.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
})
SESSION_CN.trust_env = False
# ========================================================
def load_last_timestamp():
    """Return the persisted last-sync timestamp, or None when none was saved."""
    if not os.path.exists(LAST_TIMESTAMP_FILE):
        return None
    with open(LAST_TIMESTAMP_FILE, encoding="utf-8") as fh:
        return fh.read().strip()
def save_last_timestamp(ts):
    """Persist *ts* so the next run can resume from it."""
    Path(LAST_TIMESTAMP_FILE).write_text(ts, encoding="utf-8")
def get_recent_changes(since):
    """Return {title: (revid, timestamp)} for pages edited/created since *since*.

    Walks the English wiki's recentchanges list (following API continuation
    tokens). Because results arrive oldest-first ("rcdir": "newer"), later
    entries overwrite earlier ones, leaving the newest revision per title.
    On any request or API error the partial result collected so far is
    returned.
    """
    query = {
        "action": "query",
        "list": "recentchanges",
        "rcprop": "title|ids|timestamp",
        "rctype": "edit|new",
        "rcdir": "newer",
        "rcstart": since,
        "rclimit": 500,
        "format": "json",
    }
    newest = {}
    done = False
    while not done:
        try:
            resp = SESSION_EN.get(WIKI_API_URL_EN, params=query)
            resp.raise_for_status()
            payload = resp.json()
            if "error" in payload:
                raise Exception(payload["error"])
            for change in payload.get("query", {}).get("recentchanges", []):
                newest[change["title"]] = (change["revid"], change["timestamp"])
            if "continue" in payload:
                # Merge the continuation token into the next request.
                query.update(payload["continue"])
            else:
                done = True
        except Exception as exc:
            print(f"获取最近更改时出错: {exc}")
            done = True
    return newest
def get_old_revid(title, end_time):
    """Return the revid of *title*'s last revision at or before *end_time*.

    Returns None when the page has no such revision or the request fails.
    """
    query = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "ids|timestamp",
        "rvlimit": 1,
        "rvdir": "older",       # walk backwards in time from rvstart
        "rvstart": end_time,
        "format": "json",
    }
    try:
        payload = SESSION_EN.get(WIKI_API_URL_EN, params=query).json()
        page_info = next(iter(payload["query"]["pages"].values()))
        if "revisions" in page_info:
            return page_info["revisions"][0]["revid"]
    except Exception as exc:
        print(f"获取旧版本ID时出错: {exc}")
    return None
def get_page_content(wiki_url, session, title, revid=None):
    """Fetch a page's wikitext from the given wiki.

    Returns a (content, timestamp, revid) tuple for the latest revision,
    or for the specific *revid* when one is given. Returns
    (None, None, None) when the page is missing or the request fails.
    """
    query = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "content|timestamp|ids",
        "rvslots": "main",
        "format": "json",
    }
    if revid:
        # Pin both ends of the revision range to fetch exactly that revision.
        query["rvstartid"] = revid
        query["rvendid"] = revid
    try:
        payload = session.get(wiki_url, params=query).json()
        page_info = next(iter(payload["query"]["pages"].values()))
        if "revisions" in page_info:
            revision = page_info["revisions"][0]
            text = revision["slots"]["main"]["*"]
            return text, revision["timestamp"], revision["revid"]
    except Exception as exc:
        print(f"获取页面内容时出错: {exc}")
    return None, None, None
def generate_text_diff(old_text, new_text):
    """Return a unified diff from *old_text* to *new_text*.

    An empty or missing *old_text* is treated as a brand-new page, for
    which the sentinel string "新创建页面" is returned instead of a diff.
    """
    if not old_text:
        return "新创建页面"
    diff_lines = difflib.unified_diff(
        old_text.splitlines(keepends=True),
        new_text.splitlines(keepends=True),
        lineterm='\n',
    )
    return ''.join(diff_lines)
def parse_diff_to_changes(diff_text):
    """Parse a unified diff into a flat list of structured changes.

    Each element is a dict with keys:
      - "type": "removed" or "added"
      - "old_line"/"new_line": 1-based line number in the old/new text
        (None where not applicable)
      - "old_content"/"new_content": the line text without the diff marker

    Returns [] for an empty diff or the new-page sentinel.

    Bug fix: the original checked for "---"/"+++" file headers before
    checking whether we are inside a hunk, so a removed line whose content
    starts with "--" (diff line "---…") or an added line starting with
    "++" was silently skipped and the line counters drifted. Headers only
    occur before the first hunk, so they are now skipped only there.
    """
    if not diff_text or diff_text.startswith("新创建页面"):
        return []
    changes = []
    current_old_line = 0
    current_new_line = 0
    in_hunk = False
    for line in diff_text.splitlines():
        if line.startswith('@@'):
            # Hunk header: reset counters from "@@ -a,b +c,d @@".
            match = re.match(r'@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@', line)
            if match:
                current_old_line = int(match.group(1))
                current_new_line = int(match.group(3))
                in_hunk = True
        elif not in_hunk and (line.startswith('---') or line.startswith('+++')):
            # File headers appear only before the first hunk.
            continue
        elif in_hunk:
            if line.startswith('-'):
                changes.append({
                    "type": "removed",
                    "old_line": current_old_line,
                    "new_line": None,
                    "old_content": line[1:],
                    "new_content": None
                })
                current_old_line += 1
            elif line.startswith('+'):
                changes.append({
                    "type": "added",
                    "old_line": None,
                    "new_line": current_new_line,
                    "old_content": None,
                    "new_content": line[1:]
                })
                current_new_line += 1
            elif line.startswith(' '):
                # Context line: advances both sides.
                current_old_line += 1
                current_new_line += 1
            # Other lines (e.g. "\ No newline at end of file") are ignored.
    return changes
def calculate_similarity(text1, text2):
    """Return a 0-1 similarity ratio between two text fragments.

    Either argument being empty or None yields 0.0; otherwise the
    fragments are compared with difflib.SequenceMatcher after stripping
    surrounding whitespace.
    """
    if not (text1 and text2):
        return 0.0
    matcher = difflib.SequenceMatcher(None, text1.strip(), text2.strip())
    return matcher.ratio()
def group_changes_by_line(changes, similarity_threshold=0.5):
    """Merge paired removed/added lines into "replaced" entries.

    A removed line and an added line become one "replaced" change when
    their contents score >= *similarity_threshold* with
    calculate_similarity(); pairing is greedy, highest score first.
    Unpaired changes are kept as standalone "removed"/"added" entries,
    and the combined list is sorted by line number.
    """
    removals = {c["old_line"]: c["old_content"] for c in changes if c["type"] == "removed"}
    additions = {c["new_line"]: c["new_content"] for c in changes if c["type"] == "added"}

    # Score every removed/added cross pair that clears the threshold.
    candidates = []
    for old_line, old_text in removals.items():
        for new_line, new_text in additions.items():
            score = calculate_similarity(old_text, new_text)
            if score >= similarity_threshold:
                candidates.append((score, old_line, new_line, old_text, new_text))
    # Best-scoring pairs claim their lines first (greedy matching).
    candidates.sort(key=lambda item: item[0], reverse=True)

    grouped = []
    matched_old = set()
    matched_new = set()
    for score, old_line, new_line, old_text, new_text in candidates:
        if old_line in matched_old or new_line in matched_new:
            continue
        grouped.append({
            "type": "replaced",
            "old_line": old_line,
            "new_line": new_line,
            "old_content": old_text,
            "new_content": new_text,
            "_similarity": round(score, 2)  # kept for debugging; optional
        })
        matched_old.add(old_line)
        matched_new.add(new_line)

    # Leftover removals stay as standalone "removed" entries.
    for old_line in sorted(removals):
        if old_line not in matched_old:
            grouped.append({
                "type": "removed",
                "old_line": old_line,
                "new_line": None,
                "old_content": removals[old_line],
                "new_content": None
            })
    # Leftover additions stay as standalone "added" entries.
    for new_line in sorted(additions):
        if new_line not in matched_new:
            grouped.append({
                "type": "added",
                "old_line": None,
                "new_line": new_line,
                "old_content": None,
                "new_content": additions[new_line]
            })

    grouped.sort(key=lambda c: c["old_line"] or c["new_line"] or 0)
    return grouped
def create_diff_json(title, en_old_content, en_new_content, cn_content):
    """Build the structured comparison payload for one page.

    Only English-side changes are included (no Chinese line content);
    the AI agent is expected to locate the matching Chinese passages
    itself.
    """
    # Diff the English revisions and turn the result into grouped changes.
    diff_text = generate_text_diff(en_old_content, en_new_content)
    grouped_changes = group_changes_by_line(parse_diff_to_changes(diff_text))

    # Tally change kinds in a single pass.
    counts = {"replaced": 0, "added": 0, "removed": 0}
    for change in grouped_changes:
        counts[change["type"]] += 1

    return {
        "title": title,
        "timestamp": datetime.now().isoformat(),
        "is_new_page": diff_text == "新创建页面",
        "has_cn_translation": cn_content is not None,
        "summary": {
            "total_changes": len(grouped_changes),
            "replaced": counts["replaced"],
            "added": counts["added"],
            "removed": counts["removed"],
        },
        "changes": grouped_changes,
    }
def save_files(title, diff_json, en_full_text, cn_content, timestamp, revid=None, old_full_text=None):
    """Write all output artifacts for one page.

    On the first call of a run, creates a timestamped directory under
    OUTPUT_DIR with new_pages/ and changed_pages/ subdirectories, then
    saves into the appropriate one:
      - <base>.full.txt        current English wikitext
      - <base>.cn.txt          Chinese wikitext (only if found)
      - <base>.comparison.json structured diff (the AI agent's input)
      - <base>.old.txt         previous English wikitext (only if found)
    """
    global CURRENT_OUTPUT_DIR
    if CURRENT_OUTPUT_DIR is None:
        # Lazily create one output directory per run, named by wall-clock time.
        current_time_str = datetime.now().strftime("%Y%m%d_%H%M%S")
        CURRENT_OUTPUT_DIR = OUTPUT_DIR / current_time_str
        CURRENT_OUTPUT_DIR.mkdir(exist_ok=True)
        (CURRENT_OUTPUT_DIR / "new_pages").mkdir(exist_ok=True)
        (CURRENT_OUTPUT_DIR / "changed_pages").mkdir(exist_ok=True)
        print(f"创建输出目录: {CURRENT_OUTPUT_DIR}")
    # Sanitize the page title so it is safe to use as a filename.
    safe_title = "".join(c if c.isalnum() or c in " -_." else "_" for c in title)
    # "2025-11-28T00:00:00Z"[:19] -> "20251128_000000"
    time_str = timestamp[:19].replace("-", "").replace(":", "").replace("T", "_")
    base_filename = f"{safe_title}-{time_str}-{revid}" if revid else f"{safe_title}-{time_str}"
    is_new_page = diff_json["is_new_page"]
    if is_new_page:
        target_dir = CURRENT_OUTPUT_DIR / "new_pages"
        print(f" 检测到新页面")
        # Save the full English text.
        full_file = target_dir / f"{base_filename}.full.txt"
        with open(full_file, "w", encoding="utf-8") as f:
            f.write(en_full_text)
        print(f" → 已保存: {full_file.relative_to(OUTPUT_DIR)}")
    else:
        target_dir = CURRENT_OUTPUT_DIR / "changed_pages"
        # Save the full English text.
        full_file = target_dir / f"{base_filename}.full.txt"
        with open(full_file, "w", encoding="utf-8") as f:
            f.write(en_full_text)
        print(f" → 已保存: {full_file.relative_to(OUTPUT_DIR)}")
    # Save the Chinese text, when a translation exists.
    if cn_content:
        cn_file = target_dir / f"{base_filename}.cn.txt"
        with open(cn_file, "w", encoding="utf-8") as f:
            f.write(cn_content)
        print(f" → 已保存: {cn_file.relative_to(OUTPUT_DIR)}")
    # Save the JSON comparison file (the primary output for the AI agent).
    json_file = target_dir / f"{base_filename}.comparison.json"
    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(diff_json, f, ensure_ascii=False, indent=2)
    print(f" → 已保存: {json_file.relative_to(OUTPUT_DIR)} (AI Agent 对比文件)")
    # Save the previous revision's text, when available.
    if old_full_text:
        old_file = target_dir / f"{base_filename}.old.txt"
        with open(old_file, "w", encoding="utf-8") as f:
            f.write(old_full_text)
        print(f" → 已保存: {old_file.relative_to(OUTPUT_DIR)}")
def process_single_page(title, since_time, update_timestamp=False):
    """Process one page end to end.

    Fetches the latest English revision, the last revision before
    *since_time* (the diff baseline), and the same-titled Chinese page,
    then builds the comparison JSON and writes all output files.

    Returns the latest English revision timestamp, or None when the page
    does not exist. When *update_timestamp* is true, the persisted sync
    timestamp is advanced to that value.
    """
    print(f"正在处理页面:{title}")
    # Current English revision.
    latest_content, latest_ts, latest_revid = get_page_content(WIKI_API_URL_EN, SESSION_EN, title)
    if latest_content is None:
        print("页面不存在或被删除")
        return None
    # Baseline revision: last one before since_time. Missing baseline means
    # the page is treated as newly created.
    old_revid = get_old_revid(title, since_time)
    old_content = None
    if old_revid:
        old_content, _, _ = get_page_content(WIKI_API_URL_EN, SESSION_EN, title, old_revid)
    if old_content is None:
        print(" 无法获取历史版本,视为新页面")
    # Chinese translation: simply try the same title on the CN wiki.
    print(" 搜索中文翻译...")
    cn_content = None
    cn_result, _, _ = get_page_content(WIKI_API_URL_CN, SESSION_CN, title)
    if cn_result:
        cn_content = cn_result
        print(f" 找到中文页面 ({len(cn_content)} 字符)")
    else:
        print(" 未找到中文翻译")
    # Build the structured comparison JSON.
    diff_json = create_diff_json(title, old_content, latest_content, cn_content)
    print(f" 变更统计: 替换={diff_json['summary']['replaced']}, "
          f"新增={diff_json['summary']['added']}, 删除={diff_json['summary']['removed']}")
    # Write all artifacts for this page.
    save_files(title, diff_json, latest_content, cn_content, latest_ts, latest_revid, old_content)
    if update_timestamp:
        save_last_timestamp(latest_ts)
        print(f"已更新时间戳 → {latest_ts}")
    return latest_ts
def process_all_pages_since(since_time):
    """Sync every page changed since *since_time*, then persist the newest timestamp seen."""
    print("正在获取最近变更列表...")
    recent = get_recent_changes(since_time)
    if not recent:
        print("没有发现任何变更")
        return
    newest_ts = since_time
    for page_title in recent:
        print(f"\n处理:{page_title}")
        page_ts = process_single_page(page_title, since_time)
        # Track the most recent revision timestamp across all pages.
        if page_ts and page_ts > newest_ts:
            newest_ts = page_ts
    save_last_timestamp(newest_ts)
    print(f"\n同步完成!最新时间戳: {newest_ts}")
    print(f"文件保存在: {CURRENT_OUTPUT_DIR.resolve() if CURRENT_OUTPUT_DIR else OUTPUT_DIR.resolve()}")
def main():
    """CLI entry point: parse arguments and run the sync.

    --since TS           explicit start time (e.g. 2025-11-28T00:00:00Z)
    --title NAME         process only the named page
    --update-timestamp   persist the page's timestamp (single-page mode)
    --run                actually execute (otherwise just print help)
    """
    parser = argparse.ArgumentParser(description="MediaWiki 同步工具 - AI Agent 版本")
    parser.add_argument("--since", type=str, help="起始时间,格式: 2025-11-28T00:00:00Z")
    parser.add_argument("--title", type=str, help="只处理指定页面")
    parser.add_argument("--update-timestamp", action="store_true", help="更新全局时间戳")
    parser.add_argument("--run", action="store_true", help="执行同步")
    args = parser.parse_args()
    if not args.run:
        # Require an explicit --run as a guard against accidental syncs.
        parser.print_help()
        return
    since_time = args.since or load_last_timestamp()
    if not since_time:
        # Default window: the last 24 hours. datetime.utcnow() is deprecated
        # since Python 3.12, so use an aware UTC datetime and format it
        # explicitly to the same "...Z" shape the MediaWiki API expects.
        since_time = (datetime.now(timezone.utc) - timedelta(days=1)).strftime("%Y-%m-%dT%H:%M:%SZ")
    print(f"起始时间: {since_time}")
    if args.title:
        process_single_page(args.title.strip(), since_time, args.update_timestamp)
    else:
        process_all_pages_since(since_time)


if __name__ == "__main__":
    main()