# -*- coding: utf-8 -*-
"""MediaWiki wiki sync tool - AI Agent edition.

Fetches recent changes from the English wiki, diffs each changed page
against its last revision before the sync window, pairs removed/added
lines by content similarity, and emits JSON comparison files that an
AI agent can read and process.
"""
import os
import argparse
from pathlib import Path
from datetime import datetime, timedelta, timezone
import requests
from dotenv import load_dotenv
import difflib
import json
import re

# ==================== Configuration ====================
load_dotenv()

WIKI_API_URL_EN = os.getenv("WIKI_API_URL_EN", "https://wiki.projectdiablo2.com/w/api.php")
WIKI_API_URL_CN = os.getenv("WIKI_API_URL_CN", "https://wiki.projectdiablo2.cn/w/api.php")

OUTPUT_DIR = Path("wiki_sync_output")
OUTPUT_DIR.mkdir(exist_ok=True)
# Per-run output subdirectory; created lazily on first save.
CURRENT_OUTPUT_DIR = None

LAST_TIMESTAMP_FILE = "last_sync_timestamp.txt"

# Timeout (seconds) for every HTTP request. Without it a stalled
# connection would hang the whole sync indefinitely.
REQUEST_TIMEOUT = 30

SESSION_EN = requests.Session()
SESSION_EN.headers.update({
    "User-Agent": "WikiSyncTool/5.0 (AI Agent Version)"
})

SESSION_CN = requests.Session()
SESSION_CN.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
})
# Ignore system/environment proxy settings for the CN wiki session.
SESSION_CN.trust_env = False
# ================================================


def load_last_timestamp():
    """Return the persisted sync timestamp string, or None if absent."""
    if os.path.exists(LAST_TIMESTAMP_FILE):
        with open(LAST_TIMESTAMP_FILE, encoding="utf-8") as f:
            return f.read().strip()
    return None


def save_last_timestamp(ts):
    """Persist *ts* (ISO timestamp string) as the last sync point."""
    with open(LAST_TIMESTAMP_FILE, "w", encoding="utf-8") as f:
        f.write(ts)


def get_recent_changes(since):
    """Return the latest revid per page changed since *since*.

    Queries the EN wiki `recentchanges` list (edits and new pages),
    following API continuation. Returns a dict mapping
    title -> (revid, timestamp); later entries overwrite earlier ones,
    so each page keeps its most recent change in the window.
    """
    params = {
        "action": "query",
        "list": "recentchanges",
        "rcprop": "title|ids|timestamp",
        "rctype": "edit|new",
        "rcdir": "newer",
        "rcstart": since,
        "rclimit": 500,
        "format": "json",
    }
    latest = {}
    while True:
        try:
            r = SESSION_EN.get(WIKI_API_URL_EN, params=params, timeout=REQUEST_TIMEOUT)
            r.raise_for_status()
            data = r.json()
            if "error" in data:
                raise Exception(data["error"])
            for rc in data.get("query", {}).get("recentchanges", []):
                latest[rc["title"]] = (rc["revid"], rc["timestamp"])
            if "continue" not in data:
                break
            params.update(data["continue"])
        except Exception as e:
            print(f"获取最近更改时出错: {e}")
            break
    return latest


def get_old_revid(title, end_time):
    """Return the last revid of *title* at or before *end_time*, else None."""
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "ids|timestamp",
        "rvlimit": 1,
        "rvdir": "older",
        "rvstart": end_time,
        "format": "json",
    }
    try:
        r = SESSION_EN.get(WIKI_API_URL_EN, params=params, timeout=REQUEST_TIMEOUT).json()
        pages = r["query"]["pages"]
        page = next(iter(pages.values()))
        if "revisions" in page:
            return page["revisions"][0]["revid"]
    except Exception as e:
        print(f"获取旧版本ID时出错: {e}")
    return None


def get_page_content(wiki_url, session, title, revid=None):
    """Fetch full wikitext of *title* (optionally a specific *revid*).

    Returns (content, timestamp, revid) on success, or (None, None, None)
    if the page/revision is missing or the request fails.
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "content|timestamp|ids",
        "rvslots": "main",
        "format": "json",
    }
    if revid:
        # Pin the query to exactly one historical revision.
        params["rvstartid"] = revid
        params["rvendid"] = revid
    try:
        r = session.get(wiki_url, params=params, timeout=REQUEST_TIMEOUT).json()
        pages = r["query"]["pages"]
        page = next(iter(pages.values()))
        if "revisions" in page:
            rev = page["revisions"][0]
            return rev["slots"]["main"]["*"], rev["timestamp"], rev["revid"]
    except Exception as e:
        print(f"获取页面内容时出错: {e}")
    return None, None, None


def generate_text_diff(old_text, new_text):
    """Return a unified diff of the two texts.

    Returns the sentinel string "新创建页面" when there is no old text,
    which downstream code uses to detect newly created pages.
    """
    if not old_text:
        return "新创建页面"
    old_lines = old_text.splitlines(keepends=True)
    new_lines = new_text.splitlines(keepends=True)
    return ''.join(difflib.unified_diff(old_lines, new_lines, lineterm='\n'))


def parse_diff_to_changes(diff_text):
    """Parse unified-diff text into structured change records.

    Returns a list of dicts with keys: type ("added"/"removed"),
    old_line, new_line, old_content, new_content. Returns [] for
    empty diffs or the new-page sentinel.
    """
    if not diff_text or diff_text.startswith("新创建页面"):
        return []
    changes = []
    current_old_line = 0
    current_new_line = 0
    in_hunk = False
    for line in diff_text.splitlines():
        if line.startswith('@@'):
            # Hunk header: @@ -old_start,old_len +new_start,new_len @@
            match = re.match(r'@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@', line)
            if match:
                current_old_line = int(match.group(1))
                current_new_line = int(match.group(3))
                in_hunk = True
        elif line.startswith('---') or line.startswith('+++'):
            continue
        elif in_hunk:
            if line.startswith('-'):
                changes.append({
                    "type": "removed",
                    "old_line": current_old_line,
                    "new_line": None,
                    "old_content": line[1:],
                    "new_content": None
                })
                current_old_line += 1
            elif line.startswith('+'):
                changes.append({
                    "type": "added",
                    "old_line": None,
                    "new_line": current_new_line,
                    "old_content": None,
                    "new_content": line[1:]
                })
                current_new_line += 1
            elif line.startswith(' '):
                # Context line: advances both sides.
                current_old_line += 1
                current_new_line += 1
    return changes


def calculate_similarity(text1, text2):
    """Return similarity of two strings in [0, 1] via SequenceMatcher.

    Leading/trailing whitespace is stripped before comparison; empty
    or None input yields 0.0.
    """
    if not text1 or not text2:
        return 0.0
    t1 = text1.strip()
    t2 = text2.strip()
    return difflib.SequenceMatcher(None, t1, t2).ratio()


def group_changes_by_line(changes, similarity_threshold=0.5):
    """Pair removed/added lines into "replaced" records by similarity.

    A removed line and an added line are merged into one "replaced"
    change only when their content similarity >= *similarity_threshold*;
    pairing is greedy, most-similar first. Unpaired lines stay as plain
    "removed"/"added" records. The result is sorted by line number.
    """
    removed_by_line = {}  # old_line -> content
    added_by_line = {}    # new_line -> content
    for c in changes:
        if c["type"] == "removed":
            removed_by_line[c["old_line"]] = c["old_content"]
        elif c["type"] == "added":
            added_by_line[c["new_line"]] = c["new_content"]

    grouped = []
    used_added = set()
    used_removed = set()

    # Step 1: collect every candidate pair above the threshold.
    pairings = []
    for old_line, old_content in removed_by_line.items():
        for new_line, new_content in added_by_line.items():
            similarity = calculate_similarity(old_content, new_content)
            if similarity >= similarity_threshold:
                pairings.append((similarity, old_line, new_line, old_content, new_content))

    # Most similar candidates are consumed first.
    pairings.sort(key=lambda x: x[0], reverse=True)

    # Step 2: greedy assignment — each line participates in one pair.
    for similarity, old_line, new_line, old_content, new_content in pairings:
        if old_line not in used_removed and new_line not in used_added:
            grouped.append({
                "type": "replaced",
                "old_line": old_line,
                "new_line": new_line,
                "old_content": old_content,
                "new_content": new_content,
                "_similarity": round(similarity, 2)  # debugging aid, optional
            })
            used_removed.add(old_line)
            used_added.add(new_line)

    # Step 3: leftover removals.
    for old_line, old_content in sorted(removed_by_line.items()):
        if old_line not in used_removed:
            grouped.append({
                "type": "removed",
                "old_line": old_line,
                "new_line": None,
                "old_content": old_content,
                "new_content": None
            })

    # Step 4: leftover additions.
    for new_line, new_content in sorted(added_by_line.items()):
        if new_line not in used_added:
            grouped.append({
                "type": "added",
                "old_line": None,
                "new_line": new_line,
                "old_content": None,
                "new_content": new_content
            })

    # Order all records by whichever line number they carry.
    grouped.sort(key=lambda x: x["old_line"] or x["new_line"] or 0)
    return grouped


def create_diff_json(title, en_old_content, en_new_content, cn_content):
    """Build the structured JSON comparison for one page.

    Contains only the English-side changes plus metadata; the AI agent
    matches them against the Chinese translation itself.
    """
    diff_text = generate_text_diff(en_old_content, en_new_content)
    raw_changes = parse_diff_to_changes(diff_text)
    grouped_changes = group_changes_by_line(raw_changes)

    result = {
        "title": title,
        "timestamp": datetime.now().isoformat(),
        "is_new_page": diff_text == "新创建页面",
        "has_cn_translation": cn_content is not None,
        "summary": {
            "total_changes": len(grouped_changes),
            "replaced": len([c for c in grouped_changes if c["type"] == "replaced"]),
            "added": len([c for c in grouped_changes if c["type"] == "added"]),
            "removed": len([c for c in grouped_changes if c["type"] == "removed"])
        },
        "changes": grouped_changes
    }
    return result


def save_files(title, diff_json, en_full_text, cn_content, timestamp, revid=None, old_full_text=None):
    """Write all output files for one page into the per-run directory.

    Creates the run directory on first call. New pages go under
    new_pages/, modified ones under changed_pages/; each page gets the
    EN full text, optional CN text, the comparison JSON, and optionally
    the old EN revision.
    """
    global CURRENT_OUTPUT_DIR
    if CURRENT_OUTPUT_DIR is None:
        current_time_str = datetime.now().strftime("%Y%m%d_%H%M%S")
        CURRENT_OUTPUT_DIR = OUTPUT_DIR / current_time_str
        CURRENT_OUTPUT_DIR.mkdir(exist_ok=True)
        (CURRENT_OUTPUT_DIR / "new_pages").mkdir(exist_ok=True)
        (CURRENT_OUTPUT_DIR / "changed_pages").mkdir(exist_ok=True)
        print(f"创建输出目录: {CURRENT_OUTPUT_DIR}")

    # Sanitize the title for use in filenames.
    safe_title = "".join(c if c.isalnum() or c in " -_." else "_" for c in title)
    time_str = timestamp[:19].replace("-", "").replace(":", "").replace("T", "_")
    base_filename = f"{safe_title}-{time_str}-{revid}" if revid else f"{safe_title}-{time_str}"

    is_new_page = diff_json["is_new_page"]
    if is_new_page:
        target_dir = CURRENT_OUTPUT_DIR / "new_pages"
        print(f" 检测到新页面")
        full_file = target_dir / f"{base_filename}.full.txt"
        with open(full_file, "w", encoding="utf-8") as f:
            f.write(en_full_text)
        print(f" → 已保存: {full_file.relative_to(OUTPUT_DIR)}")
    else:
        target_dir = CURRENT_OUTPUT_DIR / "changed_pages"
        full_file = target_dir / f"{base_filename}.full.txt"
        with open(full_file, "w", encoding="utf-8") as f:
            f.write(en_full_text)
        print(f" → 已保存: {full_file.relative_to(OUTPUT_DIR)}")

    # Chinese translation, when one was found.
    if cn_content:
        cn_file = target_dir / f"{base_filename}.cn.txt"
        with open(cn_file, "w", encoding="utf-8") as f:
            f.write(cn_content)
        print(f" → 已保存: {cn_file.relative_to(OUTPUT_DIR)}")

    # Comparison JSON — the core output consumed by the AI agent.
    json_file = target_dir / f"{base_filename}.comparison.json"
    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(diff_json, f, ensure_ascii=False, indent=2)
    print(f" → 已保存: {json_file.relative_to(OUTPUT_DIR)} (AI Agent 对比文件)")

    # Previous EN revision, for reference.
    if old_full_text:
        old_file = target_dir / f"{base_filename}.old.txt"
        with open(old_file, "w", encoding="utf-8") as f:
            f.write(old_full_text)
        print(f" → 已保存: {old_file.relative_to(OUTPUT_DIR)}")


def process_single_page(title, since_time, update_timestamp=False):
    """Sync one page: fetch EN latest + old revisions, CN translation,
    build the comparison JSON, and save everything.

    Returns the latest EN revision timestamp, or None if the page is
    missing. Optionally persists that timestamp as the global sync point.
    """
    print(f"正在处理页面:{title}")

    latest_content, latest_ts, latest_revid = get_page_content(WIKI_API_URL_EN, SESSION_EN, title)
    if latest_content is None:
        print("页面不存在或被删除")
        return None

    # Revision from before the sync window, if any.
    old_revid = get_old_revid(title, since_time)
    old_content = None
    if old_revid:
        old_content, _, _ = get_page_content(WIKI_API_URL_EN, SESSION_EN, title, old_revid)
    if old_content is None:
        print(" 无法获取历史版本,视为新页面")

    print(" 搜索中文翻译...")
    cn_content = None
    # Try the identically-titled page on the CN wiki.
    cn_result, _, _ = get_page_content(WIKI_API_URL_CN, SESSION_CN, title)
    if cn_result:
        cn_content = cn_result
        print(f" 找到中文页面 ({len(cn_content)} 字符)")
    else:
        print(" 未找到中文翻译")

    diff_json = create_diff_json(title, old_content, latest_content, cn_content)
    print(f" 变更统计: 替换={diff_json['summary']['replaced']}, "
          f"新增={diff_json['summary']['added']}, 删除={diff_json['summary']['removed']}")

    save_files(title, diff_json, latest_content, cn_content, latest_ts, latest_revid, old_content)

    if update_timestamp:
        save_last_timestamp(latest_ts)
        print(f"已更新时间戳 → {latest_ts}")
    return latest_ts


def process_all_pages_since(since_time):
    """Sync every page changed since *since_time* and persist the newest
    timestamp seen as the next sync point."""
    print("正在获取最近变更列表...")
    changes = get_recent_changes(since_time)
    if not changes:
        print("没有发现任何变更")
        return

    latest_global_ts = since_time
    for title, (_revid, ts) in changes.items():
        print(f"\n处理:{title}")
        page_ts = process_single_page(title, since_time)
        # ISO-8601 strings compare correctly lexicographically.
        if page_ts and page_ts > latest_global_ts:
            latest_global_ts = page_ts

    save_last_timestamp(latest_global_ts)
    print(f"\n同步完成!最新时间戳: {latest_global_ts}")
    print(f"文件保存在: {CURRENT_OUTPUT_DIR.resolve() if CURRENT_OUTPUT_DIR else OUTPUT_DIR.resolve()}")


def main():
    """CLI entry point: parse arguments and run the requested sync."""
    parser = argparse.ArgumentParser(description="MediaWiki 同步工具 - AI Agent 版本")
    parser.add_argument("--since", type=str, help="起始时间,格式: 2025-11-28T00:00:00Z")
    parser.add_argument("--title", type=str, help="只处理指定页面")
    parser.add_argument("--update-timestamp", action="store_true", help="更新全局时间戳")
    parser.add_argument("--run", action="store_true", help="执行同步")
    args = parser.parse_args()

    if not args.run:
        parser.print_help()
        return

    since_time = args.since or load_last_timestamp()
    if not since_time:
        # Default window: the last 24 hours (UTC, MediaWiki "Z" format).
        since_time = (datetime.now(timezone.utc) - timedelta(days=1)).strftime("%Y-%m-%dT%H:%M:%SZ")
    print(f"起始时间: {since_time}")

    if args.title:
        process_single_page(args.title.strip(), since_time, args.update_timestamp)
    else:
        process_all_pages_since(since_time)


if __name__ == "__main__":
    main()