# -*- coding: utf-8 -*-
|
||
"""
|
||
MediaWiki Wiki 同步工具 - AI Agent 版本
|
||
输出 JSON 格式的对比文件,便于 AI Agent 读取和处理
|
||
"""
|
||
|
||
import argparse
import difflib
import json
import os
import re
from datetime import datetime, timedelta, timezone
from pathlib import Path

import requests
from dotenv import load_dotenv
|
||
|
||
# ==================== Configuration ====================
load_dotenv()  # pull overrides from a local .env file, if present

# API endpoints: English source wiki and Chinese mirror (env-overridable).
WIKI_API_URL_EN = os.getenv("WIKI_API_URL_EN", "https://wiki.projectdiablo2.com/w/api.php")
WIKI_API_URL_CN = os.getenv("WIKI_API_URL_CN", "https://wiki.projectdiablo2.cn/w/api.php")

# Root directory for all generated artifacts; created eagerly at import time.
OUTPUT_DIR = Path("wiki_sync_output")
OUTPUT_DIR.mkdir(exist_ok=True)

# Per-run timestamped subdirectory; created lazily by save_files().
CURRENT_OUTPUT_DIR = None
# File that remembers the last successful sync time for incremental runs.
LAST_TIMESTAMP_FILE = "last_sync_timestamp.txt"

# HTTP session for the English wiki (tool-identifying User-Agent).
SESSION_EN = requests.Session()
SESSION_EN.headers.update({
    "User-Agent": "WikiSyncTool/5.0 (AI Agent Version)"
})

# HTTP session for the Chinese wiki (browser-like User-Agent).
SESSION_CN = requests.Session()
SESSION_CN.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
})
# NOTE(review): trust_env=False makes requests ignore system proxies and
# .netrc for the CN wiki — presumably deliberate; confirm before changing.
SESSION_CN.trust_env = False
# =======================================================
|
||
|
||
def load_last_timestamp():
    """Return the timestamp saved by the previous sync run, or None if absent."""
    if not os.path.exists(LAST_TIMESTAMP_FILE):
        return None
    with open(LAST_TIMESTAMP_FILE, encoding="utf-8") as f:
        return f.read().strip()
|
||
|
||
def save_last_timestamp(ts):
    """Persist *ts* (ISO timestamp string) so the next run can resume from it."""
    Path(LAST_TIMESTAMP_FILE).write_text(ts, encoding="utf-8")
|
||
|
||
def get_recent_changes(since):
    """Fetch the newest revid of every page changed on the EN wiki since *since*.

    Follows MediaWiki API continuation until the recentchanges list is
    exhausted. Because rcdir=newer yields ascending order, later entries
    overwrite earlier ones, so only each page's latest change survives.

    Args:
        since: ISO-8601 timestamp, e.g. "2025-11-28T00:00:00Z".

    Returns:
        dict mapping page title -> (revid, timestamp). Best-effort: on a
        network/API error the partial result gathered so far is returned
        after logging the error.
    """
    params = {
        "action": "query",
        "list": "recentchanges",
        "rcprop": "title|ids|timestamp",
        "rctype": "edit|new",
        "rcdir": "newer",
        "rcstart": since,
        "rclimit": 500,
        "format": "json",
    }
    latest = {}
    while True:
        try:
            # Timeout prevents a stalled connection from hanging the sync forever.
            r = SESSION_EN.get(WIKI_API_URL_EN, params=params, timeout=30)
            r.raise_for_status()
            data = r.json()
            if "error" in data:
                raise Exception(data["error"])
            for rc in data.get("query", {}).get("recentchanges", []):
                latest[rc["title"]] = (rc["revid"], rc["timestamp"])
            if "continue" not in data:
                break
            # Merge continuation tokens into the next request.
            params.update(data["continue"])
        except Exception as e:
            # Deliberate best-effort: log and return what we have.
            print(f"获取最近更改时出错: {e}")
            break
    return latest
|
||
|
||
def get_old_revid(title, end_time):
    """Return the revid of *title*'s last revision at or before *end_time*.

    Used to pick the baseline revision for diffing.

    Args:
        title: page title on the EN wiki.
        end_time: ISO-8601 timestamp upper bound (rvdir=older walks back).

    Returns:
        The revid (int) or None when the page has no such revision or the
        request fails (error is logged, not raised).
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "ids|timestamp",
        "rvlimit": 1,
        "rvdir": "older",
        "rvstart": end_time,
        "format": "json",
    }
    try:
        # Timeout + explicit HTTP status check so failures surface as
        # readable errors instead of opaque JSON decode exceptions.
        r = SESSION_EN.get(WIKI_API_URL_EN, params=params, timeout=30)
        r.raise_for_status()
        data = r.json()
        page = next(iter(data["query"]["pages"].values()))
        if "revisions" in page:
            return page["revisions"][0]["revid"]
    except Exception as e:
        print(f"获取旧版本ID时出错: {e}")
    return None
|
||
|
||
def get_page_content(wiki_url, session, title, revid=None):
    """Fetch a page's full wikitext from a MediaWiki API.

    Args:
        wiki_url: API endpoint (…/api.php).
        session: requests.Session to use (EN or CN, with its headers).
        title: page title.
        revid: optional revision id; rvstartid == rvendid pins the request
            to that exact revision instead of the latest one.

    Returns:
        (content, timestamp, revid) of the fetched revision, or
        (None, None, None) when the page/revision is missing or the request
        fails (error is logged, not raised).
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "content|timestamp|ids",
        "rvslots": "main",
        "format": "json",
    }
    if revid:
        params["rvstartid"] = revid
        params["rvendid"] = revid

    try:
        # Timeout so a stalled wiki cannot hang the whole sync;
        # raise_for_status turns HTTP errors into explicit exceptions.
        r = session.get(wiki_url, params=params, timeout=30)
        r.raise_for_status()
        data = r.json()
        page = next(iter(data["query"]["pages"].values()))
        if "revisions" in page:
            rev = page["revisions"][0]
            return rev["slots"]["main"]["*"], rev["timestamp"], rev["revid"]
    except Exception as e:
        print(f"获取页面内容时出错: {e}")
    return None, None, None
|
||
|
||
def generate_text_diff(old_text, new_text):
    """Produce a unified diff between two page revisions.

    When there is no old text the sentinel string "新创建页面" is returned;
    downstream parsing treats that as a brand-new page.
    """
    if not old_text:
        return "新创建页面"

    diff_iter = difflib.unified_diff(
        old_text.splitlines(keepends=True),
        new_text.splitlines(keepends=True),
        lineterm='\n',
    )
    return ''.join(diff_iter)
|
||
|
||
def parse_diff_to_changes(diff_text):
    """Parse unified-diff text into structured change records.

    Each record is a dict with keys: type ("added"/"removed"), old_line,
    new_line, old_content, new_content. Context lines produce no record
    but still advance both line counters.
    """
    if not diff_text or diff_text.startswith("新创建页面"):
        return []

    hunk_header = re.compile(r'@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@')
    changes = []
    old_no = new_no = 0
    in_hunk = False

    for raw in diff_text.splitlines():
        if raw.startswith('@@'):
            m = hunk_header.match(raw)
            if m:
                old_no = int(m.group(1))
                new_no = int(m.group(3))
                in_hunk = True
            continue
        if raw.startswith('---') or raw.startswith('+++'):
            # File headers emitted by unified_diff — not content.
            continue
        if not in_hunk:
            continue
        if raw.startswith('-'):
            changes.append({
                "type": "removed",
                "old_line": old_no,
                "new_line": None,
                "old_content": raw[1:],
                "new_content": None,
            })
            old_no += 1
        elif raw.startswith('+'):
            changes.append({
                "type": "added",
                "old_line": None,
                "new_line": new_no,
                "old_content": None,
                "new_content": raw[1:],
            })
            new_no += 1
        elif raw.startswith(' '):
            # Unchanged context: advance both sides.
            old_no += 1
            new_no += 1

    return changes
|
||
|
||
def calculate_similarity(text1, text2):
    """Return the similarity ratio (0.0-1.0) of two strings.

    None or empty input yields 0.0. Leading/trailing whitespace is ignored;
    the ratio comes from difflib.SequenceMatcher.
    """
    if text1 and text2:
        return difflib.SequenceMatcher(None, text1.strip(), text2.strip()).ratio()
    return 0.0
|
||
|
||
|
||
def group_changes_by_line(changes, similarity_threshold=0.5):
    """Pair removed/added lines into "replaced" records by content similarity.

    A removed line and an added line are merged into one "replaced" entry
    only when their text similarity reaches *similarity_threshold*; pairing
    is greedy, most-similar first. Unpaired lines remain plain "removed" /
    "added" records. The result is sorted by line number.
    """
    removals = {c["old_line"]: c["old_content"] for c in changes if c["type"] == "removed"}
    additions = {c["new_line"]: c["new_content"] for c in changes if c["type"] == "added"}

    # Score every cross pair and keep only those clearing the threshold.
    candidates = [
        (calculate_similarity(old_text, new_text), old_no, new_no, old_text, new_text)
        for old_no, old_text in removals.items()
        for new_no, new_text in additions.items()
    ]
    candidates = [c for c in candidates if c[0] >= similarity_threshold]
    # Stable sort, highest similarity first, so the best matches win.
    candidates.sort(key=lambda item: item[0], reverse=True)

    grouped = []
    taken_old = set()
    taken_new = set()

    # Greedy pairing: each line participates in at most one replacement.
    for score, old_no, new_no, old_text, new_text in candidates:
        if old_no in taken_old or new_no in taken_new:
            continue
        grouped.append({
            "type": "replaced",
            "old_line": old_no,
            "new_line": new_no,
            "old_content": old_text,
            "new_content": new_text,
            "_similarity": round(score, 2),  # kept for debugging
        })
        taken_old.add(old_no)
        taken_new.add(new_no)

    # Leftover removals become standalone "removed" records.
    for old_no in sorted(removals):
        if old_no not in taken_old:
            grouped.append({
                "type": "removed",
                "old_line": old_no,
                "new_line": None,
                "old_content": removals[old_no],
                "new_content": None,
            })

    # Leftover additions become standalone "added" records.
    for new_no in sorted(additions):
        if new_no not in taken_new:
            grouped.append({
                "type": "added",
                "old_line": None,
                "new_line": new_no,
                "old_content": None,
                "new_content": additions[new_no],
            })

    grouped.sort(key=lambda rec: rec["old_line"] or rec["new_line"] or 0)
    return grouped
|
||
|
||
def create_diff_json(title, en_old_content, en_new_content, cn_content):
    """Build the structured comparison payload consumed by the AI agent.

    Only English-side changes are included; the agent is expected to match
    them against the Chinese text on its own.
    """
    # Diff the two English revisions and turn it into grouped change records.
    diff_text = generate_text_diff(en_old_content, en_new_content)
    grouped_changes = group_changes_by_line(parse_diff_to_changes(diff_text))

    # Tally change types in a single pass.
    counts = {"replaced": 0, "added": 0, "removed": 0}
    for change in grouped_changes:
        counts[change["type"]] = counts.get(change["type"], 0) + 1

    return {
        "title": title,
        "timestamp": datetime.now().isoformat(),
        "is_new_page": diff_text == "新创建页面",
        "has_cn_translation": cn_content is not None,
        "summary": {
            "total_changes": len(grouped_changes),
            "replaced": counts["replaced"],
            "added": counts["added"],
            "removed": counts["removed"],
        },
        "changes": grouped_changes,
    }
|
||
|
||
def _ensure_output_dirs():
    """Create the per-run timestamped output directory tree (once per run)."""
    global CURRENT_OUTPUT_DIR
    if CURRENT_OUTPUT_DIR is None:
        current_time_str = datetime.now().strftime("%Y%m%d_%H%M%S")
        CURRENT_OUTPUT_DIR = OUTPUT_DIR / current_time_str
        CURRENT_OUTPUT_DIR.mkdir(exist_ok=True)
        (CURRENT_OUTPUT_DIR / "new_pages").mkdir(exist_ok=True)
        (CURRENT_OUTPUT_DIR / "changed_pages").mkdir(exist_ok=True)
        print(f"创建输出目录: {CURRENT_OUTPUT_DIR}")


def _write_text(path, text, note=""):
    """Write *text* to *path* as UTF-8 and log the saved (relative) path."""
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)
    print(f" → 已保存: {path.relative_to(OUTPUT_DIR)}{note}")


def save_files(title, diff_json, en_full_text, cn_content, timestamp, revid=None, old_full_text=None):
    """Persist all artifacts for one page under the per-run output directory.

    New pages go to new_pages/ (full text only); changed pages go to
    changed_pages/ (full text plus the Chinese text when available). In both
    cases the comparison JSON — the core AI-agent output — and, when present,
    the old revision text are written alongside.

    Args:
        title: page title (sanitized into the filename).
        diff_json: payload from create_diff_json(); its "is_new_page" flag
            selects the target subdirectory.
        en_full_text: latest English wikitext.
        cn_content: Chinese wikitext or None.
        timestamp: ISO revision timestamp, embedded in the filename.
        revid: optional revision id, appended to the filename.
        old_full_text: baseline English wikitext or None.
    """
    _ensure_output_dirs()

    # Build a filesystem-safe, unique base name: title + compact time + revid.
    safe_title = "".join(c if c.isalnum() or c in " -_." else "_" for c in title)
    time_str = timestamp[:19].replace("-", "").replace(":", "").replace("T", "_")
    base_filename = f"{safe_title}-{time_str}-{revid}" if revid else f"{safe_title}-{time_str}"

    if diff_json["is_new_page"]:
        target_dir = CURRENT_OUTPUT_DIR / "new_pages"
        print(" 检测到新页面")
        _write_text(target_dir / f"{base_filename}.full.txt", en_full_text)
    else:
        target_dir = CURRENT_OUTPUT_DIR / "changed_pages"
        _write_text(target_dir / f"{base_filename}.full.txt", en_full_text)
        if cn_content:
            _write_text(target_dir / f"{base_filename}.cn.txt", cn_content)

    # Core output: the structured comparison consumed by the AI agent.
    json_file = target_dir / f"{base_filename}.comparison.json"
    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(diff_json, f, ensure_ascii=False, indent=2)
    print(f" → 已保存: {json_file.relative_to(OUTPUT_DIR)} (AI Agent 对比文件)")

    # Baseline revision, for reference when reviewing the diff.
    if old_full_text:
        _write_text(target_dir / f"{base_filename}.old.txt", old_full_text)
|
||
|
||
def process_single_page(title, since_time, update_timestamp=False):
    """Sync one wiki page end-to-end.

    Fetches the latest and baseline English revisions, looks up the Chinese
    counterpart, builds the comparison JSON, and saves all artifacts.

    Args:
        title: page title on the English wiki.
        since_time: ISO timestamp; the last revision at/before it is the
            diff baseline.
        update_timestamp: when True, persist this page's revision time as
            the global "last sync" marker.

    Returns:
        The latest revision timestamp (str), or None if the page is missing.
    """
    print(f"正在处理页面:{title}")

    # Latest English revision: (content, timestamp, revid).
    latest_content, latest_ts, latest_revid = get_page_content(WIKI_API_URL_EN, SESSION_EN, title)
    if latest_content is None:
        print("页面不存在或被删除")
        return None

    # Baseline revision at or before since_time; absent → treated as new page.
    old_revid = get_old_revid(title, since_time)
    old_content = None

    if old_revid:
        old_content, _, _ = get_page_content(WIKI_API_URL_EN, SESSION_EN, title, old_revid)
        if old_content is None:
            print(" 无法获取历史版本,视为新页面")

    # Chinese translation: try the identically-titled page on the CN wiki.
    print(" 搜索中文翻译...")
    cn_content = None

    cn_result, _, _ = get_page_content(WIKI_API_URL_CN, SESSION_CN, title)
    if cn_result:
        cn_content = cn_result
        print(f" 找到中文页面 ({len(cn_content)} 字符)")
    else:
        print(" 未找到中文翻译")

    # Build the structured comparison consumed by the AI agent.
    diff_json = create_diff_json(title, old_content, latest_content, cn_content)

    print(f" 变更统计: 替换={diff_json['summary']['replaced']}, "
          f"新增={diff_json['summary']['added']}, 删除={diff_json['summary']['removed']}")

    # Persist full text, CN text, old text and the comparison JSON.
    save_files(title, diff_json, latest_content, cn_content, latest_ts, latest_revid, old_content)

    if update_timestamp:
        save_last_timestamp(latest_ts)
        print(f"已更新时间戳 → {latest_ts}")

    return latest_ts
|
||
|
||
def process_all_pages_since(since_time):
    """Sync every page changed since *since_time*, then advance the saved timestamp."""
    print("正在获取最近变更列表...")
    recent = get_recent_changes(since_time)

    if not recent:
        print("没有发现任何变更")
        return

    # Track the newest revision timestamp seen across all pages.
    newest_ts = since_time
    for page_title in recent:
        print(f"\n处理:{page_title}")
        page_ts = process_single_page(page_title, since_time)
        if page_ts and page_ts > newest_ts:
            newest_ts = page_ts

    save_last_timestamp(newest_ts)
    print(f"\n同步完成!最新时间戳: {newest_ts}")
    print(f"文件保存在: {CURRENT_OUTPUT_DIR.resolve() if CURRENT_OUTPUT_DIR else OUTPUT_DIR.resolve()}")
|
||
|
||
def main():
    """CLI entry point: parse arguments and run the requested sync mode."""
    parser = argparse.ArgumentParser(description="MediaWiki 同步工具 - AI Agent 版本")
    parser.add_argument("--since", type=str, help="起始时间,格式: 2025-11-28T00:00:00Z")
    parser.add_argument("--title", type=str, help="只处理指定页面")
    parser.add_argument("--update-timestamp", action="store_true", help="更新全局时间戳")
    parser.add_argument("--run", action="store_true", help="执行同步")

    args = parser.parse_args()

    # Require an explicit --run so invoking with no flags just shows help.
    if not args.run:
        parser.print_help()
        return

    # Start time priority: --since flag, then saved timestamp, then 24h ago.
    since_time = args.since or load_last_timestamp()
    if not since_time:
        # datetime.utcnow() is deprecated (Python 3.12); use an aware UTC
        # datetime and format the same "YYYY-MM-DDTHH:MM:SSZ" string.
        since_time = (datetime.now(timezone.utc) - timedelta(days=1)).strftime("%Y-%m-%dT%H:%M:%SZ")
    print(f"起始时间: {since_time}")

    if args.title:
        process_single_page(args.title.strip(), since_time, args.update_timestamp)
    else:
        process_all_pages_since(since_time)
|
||
|
||
# Script entry point: only run the sync when executed directly, not on import.
if __name__ == "__main__":
    main()
|