# -*- coding: utf-8 -*- """ MediaWiki 最近变更同步工具 - 增强版 支持: 1. 正常全量同步(无参数) 2. 手动指定时间起点:--since 2025-11-28T00:00:00Z 3. 只同步单个页面:--title "页面名称" 4. 单个页面时可选更新全局时间戳:--update-timestamp 5. 获取历史版本并生成diff 6. 同步中文翻译版本 7. 生成双语对比网页 """ import os import argparse from pathlib import Path from datetime import datetime import requests from dotenv import load_dotenv import difflib import json import re from urllib.parse import quote # ==================== 配置区 ==================== load_dotenv() WIKI_API_URL_EN = os.getenv("WIKI_API_URL_EN", "https://wiki.projectdiablo2.com/w/api.php") WIKI_API_URL_CN = os.getenv("WIKI_API_URL_CN", "https://wiki.projectdiablo2.cn/w/api.php") OUTPUT_DIR = Path("wiki_sync_output") OUTPUT_DIR.mkdir(exist_ok=True) # 全局变量,存储本次执行的输出目录 CURRENT_OUTPUT_DIR = None LAST_TIMESTAMP_FILE = "last_sync_timestamp.txt" SESSION_EN = requests.Session() SESSION_EN.headers.update({ "User-Agent": "WikiSyncTool/4.0 (your-email@example.com; MediaWiki Sync Bot)" }) SESSION_CN = requests.Session() SESSION_CN.headers.update({ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" }) SESSION_CN.trust_env = False # 禁用环境变量代理 # ================================================ def load_last_timestamp(): if not os.path.exists(LAST_TIMESTAMP_FILE): return None with open(LAST_TIMESTAMP_FILE, encoding="utf-8") as f: return f.read().strip() def save_last_timestamp(ts): with open(LAST_TIMESTAMP_FILE, "w", encoding="utf-8") as f: f.write(ts) def get_recent_changes(since): """获取自 since 时间后每个页面的最新 revid(自动去重)""" params = { "action": "query", "list": "recentchanges", "rcprop": "title|ids|timestamp", "rctype": "edit|new", "rcdir": "newer", "rcstart": since, "rclimit": 500, "format": "json" } latest = {} while True: try: r = SESSION_EN.get(WIKI_API_URL_EN, params=params) r.raise_for_status() response_data = r.json() if "error" in response_data: raise Exception(response_data["error"]) for rc in response_data.get("query", {}).get("recentchanges", []): latest[rc["title"]] = (rc["revid"], rc["timestamp"]) if "continue" not in response_data: break params.update(response_data["continue"]) except Exception as e: print(f"获取最近更改时出错: {e}") break return latest def get_old_revid(title, end_time): """获取 ≤ end_time 的最后一次修订的 revid(用于 fromrev)""" params = { "action": "query", "prop": "revisions", "titles": title, "rvprop": "ids|timestamp", "rvlimit": 1, "rvdir": "older", "rvstart": end_time, "format": "json" } try: r = SESSION_EN.get(WIKI_API_URL_EN, params=params).json() pages = r["query"]["pages"] page = next(iter(pages.values())) if "revisions" not in page: print(f" 页面 '{title}' 在指定时间前没有找到修订版本") return None revisions = page["revisions"] if len(revisions) >= 1: return revisions[0]["revid"] print(f" 页面 '{title}' 在指定时间前没有找到修订版本") return None except Exception as e: print(f"获取旧版本ID时出错: {e}") return None def get_page_content(wiki_url, session, title, revid=None): """获取页面完整内容""" params = { "action": "query", "prop": "revisions", "titles": title, "rvprop": "content|timestamp|ids", "rvslots": "main", "format": "json" } if revid: params["rvstartid"] = revid params["rvendid"] = revid try: r = session.get(wiki_url, params=params).json() pages = r["query"]["pages"] page = next(iter(pages.values())) if "revisions" not in page: return None, None, None rev = page["revisions"][0] content = rev["slots"]["main"]["*"] timestamp = rev["timestamp"] rev_id = rev["revid"] return content, timestamp, rev_id except Exception as e: print(f"获取页面内容时出错: {e}") return None, None, None def generate_text_diff(old_text, new_text): """生成类似git diff的文本diff""" if not old_text: return "新创建页面" old_lines = old_text.splitlines(keepends=True) new_lines = new_text.splitlines(keepends=True) differ = difflib.unified_diff( old_lines, new_lines, lineterm='\n' ) return ''.join(differ) def parse_diff_with_line_numbers(diff_text): """解析diff文本,提取详细的行号信息""" if not diff_text or diff_text.startswith("新创建页面"): return [] parsed_lines = [] current_old_line = 0 current_new_line = 0 in_hunk = False for line in diff_text.splitlines(): if line.startswith('@@'): # 解析hunk头部,格式如: @@ -start,count +start,count @@ import re match = re.match(r'@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@', line) if match: old_start = int(match.group(1)) old_count = int(match.group(2)) if match.group(2) else 1 new_start = int(match.group(3)) new_count = int(match.group(4)) if match.group(4) else 1 current_old_line = old_start current_new_line = new_start in_hunk = True parsed_lines.append({ 'type': 'hunk', 'content': line, 'old_start': old_start, 'old_count': old_count, 'new_start': new_start, 'new_count': new_count, 'old_line': None, 'new_line': None }) else: parsed_lines.append({ 'type': 'other', 'content': line, 'old_line': None, 'new_line': None }) elif line.startswith('---') or line.startswith('+++'): # 文件头信息 continue #parsed_lines.append({ # 'type': 'header', # 'content': line, # 'old_line': None, # 'new_line': None #}) elif in_hunk: if line.startswith('-'): # 删除的行 parsed_lines.append({ 'type': 'removed', 'content': line[1:], # 去掉开头的 '-' 'old_line': current_old_line, 'new_line': None }) current_old_line += 1 elif line.startswith('+'): # 新增的行 parsed_lines.append({ 'type': 'added', 'content': line[1:], # 去掉开头的 '+' 'old_line': None, 'new_line': current_new_line }) current_new_line += 1 elif line.startswith(' '): # 未变更的行 parsed_lines.append({ 'type': 'context', 'content': line[1:], # 去掉开头的 ' ' 'old_line': current_old_line, 'new_line': current_new_line }) current_old_line += 1 current_new_line += 1 else: # 其他行(如空行) parsed_lines.append({ 'type': 'other', 'content': line, 'old_line': None, 'new_line': None }) else: # 不在任何hunk中的行 parsed_lines.append({ 'type': 'other', 'content': line, 'old_line': None, 'new_line': None }) return parsed_lines def search_chinese_page(title): """在中文wiki中搜索对应的页面""" # 首先尝试直接获取页面(因为中文wiki禁用了标题搜索) params = { "action": "query", "prop": "revisions", "titles": title, "rvprop": "ids", "format": "json" } try: r = SESSION_CN.get(WIKI_API_URL_CN, params=params, timeout=10) data = r.json() pages = data.get("query", {}).get("pages", {}) for page_id, page_info in pages.items(): # page_id 为负数表示页面不存在 if page_id != "-1" and "missing" not in page_info: return page_info.get("title") except Exception as e: print(f"搜索中文页面时出错: {e}") return None def create_diff_html(title, en_diff, en_old_lines, en_new_lines, cn_content=None): """创建双语对比的HTML页面 - Word批注风格,英文变更直接显示在对应中文行右侧""" # 准备中文内容行 cn_lines = [] if cn_content: cn_lines = cn_content.splitlines() # 解析diff并获取行号信息 parsed_diff = parse_diff_with_line_numbers(en_diff) if en_diff else [] # 构建行号到diff内容的映射 - 科学处理连续diff块 en_changes_by_line = {} blank_lines_to_insert = {} # 记录需要在某行前插入的空白行及其对应的新增内容 if parsed_diff: i = 0 while i < len(parsed_diff): # 收集连续的diff块 diff_block = [] start_index = i # 收集连续的添加/删除操作(跳过hunk和header) while i < len(parsed_diff): item = parsed_diff[i] if item['type'] in ['added', 'removed']: diff_block.append(item) elif item['type'] in ['hunk', 'header'] or item['type'] == 'context': if diff_block: # 如果已经有diff块,就停止 break i += 1 # 处理连续的diff块 - 改进的连续匹配算法 if diff_block: # 使用新的匹配算法:连续的减号和加号应该按顺序匹配 removed_items = [] added_items = [] # 分离删除和新增项目 for item in diff_block: if item['type'] == 'removed': removed_items.append(item) elif item['type'] == 'added': added_items.append(item) # 进行匹配:连续块里的每一个减都应该和后续的加形成匹配替换 match_index = 0 for removed_item in removed_items: if match_index < len(added_items): # 匹配成功:形成替换 target_line = removed_item['old_line'] if target_line not in en_changes_by_line: en_changes_by_line[target_line] = [] en_changes_by_line[target_line].append({ 'type': 'replaced', 'old_content': removed_item['content'], 'new_content': added_items[match_index]['content'] }) match_index += 1 else: # 没有匹配的加:这是删除 target_line = removed_item['old_line'] if target_line not in en_changes_by_line: en_changes_by_line[target_line] = [] en_changes_by_line[target_line].append({ 'type': 'removed', 'content': removed_item['content'] }) # 处理剩余的加(没有匹配的减):这是新增,应该在左侧空行 if match_index < len(added_items): # 找到基准行号(使用最后一个删除行的行号,如果没有则使用第一个新增的行号) base_line = None if removed_items: base_line = removed_items[-1]['old_line'] elif added_items: base_line = added_items[match_index]['new_line'] - len(added_items) + match_index if base_line: remaining_additions = added_items[match_index:] blank_lines_to_insert[base_line + 1] = remaining_additions # 继续处理剩余项 else: i += 1 # HTML转义函数 def html_escape(text): if not text: return "" return (str(text) .replace("&", "&") .replace("<", "<") .replace(">", ">") .replace('"', """) .replace("'", "'")) def generate_inline_diff(old_text, new_text): """生成GitHub风格的行内字符级diff""" if not old_text or not new_text: return html_escape(new_text or "") escaped_old = html_escape(old_text) escaped_new = html_escape(new_text) # 使用difflib进行字符级别的比较 differ = difflib.SequenceMatcher(None, escaped_old, escaped_new) result = [] for tag, i1, i2, j1, j2 in differ.get_opcodes(): if tag == 'equal': # 相同的部分 result.append(escaped_new[j1:j2]) elif tag == 'replace': # 替换的部分:删除的用红色背景,新增的用绿色背景 deleted = escaped_old[i1:i2] added = escaped_new[j1:j2] result.append(f'{deleted}') result.append(f'{added}') elif tag == 'delete': # 删除的部分用红色背景 deleted = escaped_old[i1:i2] result.append(f'{deleted}') elif tag == 'insert': # 新增的部分用绿色背景 added = escaped_new[j1:j2] result.append(f'{added}') return ''.join(result) def generate_clean_new_content(old_text, new_text): """生成干净的新内容,只显示新增部分的高亮,不包含删除部分""" if not old_text or not new_text: return html_escape(new_text or "") escaped_old = html_escape(old_text) escaped_new = html_escape(new_text) # 使用difflib进行字符级别的比较 differ = difflib.SequenceMatcher(None, escaped_old, escaped_new) result = [] for tag, i1, i2, j1, j2 in differ.get_opcodes(): if tag == 'equal': # 相同的部分 result.append(escaped_new[j1:j2]) elif tag == 'replace': # 替换的部分:只显示新增的内容(绿色高亮),跳过删除的内容 added = escaped_new[j1:j2] result.append(f'{added}') elif tag == 'delete': # 删除的部分:跳过,不显示 continue elif tag == 'insert': # 新增的部分用绿色背景 added = escaped_new[j1:j2] result.append(f'{added}') return ''.join(result) # 收集变更块信息用于导航 change_blocks = [] change_block_id = 0 # 生成HTML html = f''' Wiki Diff: {title}

{title}

英文Wiki: wiki.projectdiablo2.com {f' | 中文Wiki: wiki.projectdiablo2.cn' if cn_content else ''}
中文翻译(含英文变更批注)
''' # 添加中文内容和英文变更批注 if cn_content: for i, line in enumerate(cn_lines, 1): # 检查是否需要在此行之前插入空白行 if i in blank_lines_to_insert: additions_list = blank_lines_to_insert[i] change_block_id += 1 # 添加变更块到导航列表 first_content = additions_list[0].get('content', '') if additions_list else '' preview_text = first_content[:50] + "..." if len(first_content) > 50 else first_content change_blocks.append({ 'id': change_block_id, 'line': i, 'type': '新增', 'preview': preview_text, 'count': len(additions_list) }) for idx, addition_item in enumerate(additions_list): html += f'
' html += '
' html += ' ' # 不显示行号 html += f'(新增英文内容占位)' html += '
' # 为空白行添加对应的新增批注 addition_content = addition_item.get('content', '') # 只取内容字段 escaped_addition = html_escape(addition_content) html += '
' html += f'
' html += f'
新增
' html += f'
{escaped_addition}
' html += '
' html += '
' html += '
' escaped_line = html_escape(line) has_changes = i in en_changes_by_line changes = en_changes_by_line.get(i, []) # 判断是否为空行 is_empty = not line.strip() # 如果有变更(除了新增),添加到导航列表 if has_changes and any(change['type'] in ['replaced', 'removed'] for change in changes): change_block_id += 1 preview_text = line[:50] + "..." if len(line) > 50 else line change_type = "替换" if any(change['type'] == 'replaced' for change in changes) else "删除" change_blocks.append({ 'id': change_block_id, 'line': i, 'type': change_type, 'preview': preview_text, 'count': 1 }) html += f'
' html += f'
' html += f'{i}' html += f'{escaped_line if not is_empty else "(空行)"}' html += '
' # 添加英文变更批注(只显示替换和删除操作,新增操作已经在空白行中显示) if has_changes: html += '
' for change in changes: if change['type'] == 'added': # 新增内容已经在空白行中显示,这里跳过 continue elif change['type'] == 'removed': escaped_change = html_escape(change['content']) html += f'
' html += f'
删除
' html += f'
{escaped_change}
' html += '
' elif change['type'] == 'replaced': # 生成干净的新内容(只显示新增部分的高亮,不包含删除部分) clean_new_content = generate_clean_new_content(change['old_content'], change['new_content']) html += f'
' html += f'
替换
' html += f'
{html_escape(change["old_content"])}
' html += f'
{clean_new_content}
' html += '
' html += '
' html += '
' else: html += '
未找到对应的中文翻译页面
' # 调试日志:打印change_blocks信息 print(f"DEBUG: Final change_blocks length = {len(change_blocks)}") for i, block in enumerate(change_blocks): print(f"DEBUG: Final block {i}: {block}") html += '''
''' return html def save_files(title, diff_text, full_text, timestamp, note="", revid=None, cn_content=None, old_full_text=None): global CURRENT_OUTPUT_DIR # 确保本次执行的输出目录已经创建 if CURRENT_OUTPUT_DIR is None: current_time_str = datetime.now().strftime("%Y%m%d_%H%M%S") CURRENT_OUTPUT_DIR = OUTPUT_DIR / current_time_str CURRENT_OUTPUT_DIR.mkdir(exist_ok=True) # 创建子目录 (CURRENT_OUTPUT_DIR / "new_pages").mkdir(exist_ok=True) (CURRENT_OUTPUT_DIR / "changed_pages").mkdir(exist_ok=True) print(f"创建本次执行的输出目录: {CURRENT_OUTPUT_DIR}") safe_title = "".join(c if c.isalnum() or c in " -_." else "_" for c in title) time_str = timestamp[:19].replace("-", "").replace(":", "").replace("T", "_") base_filename = f"{safe_title}-{time_str}-{revid}" if revid else f"{safe_title}-{time_str}" # 判断是否为新页面 is_new_page = diff_text and diff_text.startswith("新创建页面") if is_new_page: # 新页面:只保存完整内容到 new_pages 目录 target_dir = CURRENT_OUTPUT_DIR / "new_pages" print(f" 检测到新页面,只保存完整内容") # 保存最新完整内容 full_file = target_dir / f"{base_filename}.full.txt" if full_text: try: with open(full_file, "w", encoding="utf-8") as f: f.write(full_text) print(f" → 已保存: {full_file.relative_to(OUTPUT_DIR)} (新页面完整内容)") except Exception as e: print(f" → 保存文件 {full_file} 时出错: {e}") else: # 变更页面:保存主要文件到 changed_pages 目录,其他文件到 files 目录 target_dir = CURRENT_OUTPUT_DIR / "changed_pages" files_dir = target_dir / "files" files_dir.mkdir(exist_ok=True) # 1. 保存最新完整内容(主文件) full_file = target_dir / f"{base_filename}.full.txt" if full_text: try: with open(full_file, "w", encoding="utf-8") as f: f.write(full_text) print(f" → 已保存: {full_file.relative_to(OUTPUT_DIR)} (完整内容)") except Exception as e: print(f" → 保存文件 {full_file} 时出错: {e}") # 2. 保存中文翻译内容(主文件) if cn_content: cn_file = target_dir / f"{base_filename}.cn.txt" try: with open(cn_file, "w", encoding="utf-8") as f: f.write(cn_content) print(f" → 已保存: {cn_file.relative_to(OUTPUT_DIR)} (中文翻译)") except Exception as e: print(f" → 保存文件 {cn_file} 时出错: {e}") # 3. 创建双语对比HTML页面(主文件) en_new_lines = full_text.splitlines() if full_text else [] en_old_lines = old_full_text.splitlines() if old_full_text else [] comparison_html = create_diff_html(title, diff_text, en_old_lines, en_new_lines, cn_content) comparison_file = target_dir / f"{base_filename}.comparison.html" try: with open(comparison_file, "w", encoding="utf-8") as f: f.write(comparison_html) print(f" → 已保存: {comparison_file.relative_to(OUTPUT_DIR)} (双语对比页面)") except Exception as e: print(f" → 保存文件 {comparison_file} 时出错: {e}") if diff_text: text_diff_file = files_dir / f"{base_filename}.diff.txt" try: with open(text_diff_file, "w", encoding="utf-8") as f: f.write(diff_text) print(f" → 已保存: {text_diff_file.relative_to(OUTPUT_DIR)} (文本diff - 参考)") except Exception as e: print(f" → 保存文件 {text_diff_file} 时出错: {e}") if old_full_text: old_full_file = files_dir / f"{base_filename}.old.txt" try: with open(old_full_file, "w", encoding="utf-8") as f: f.write(old_full_text) print(f" → 已保存: {old_full_file.relative_to(OUTPUT_DIR)} (历史版本 - 参考)") except Exception as e: print(f" → 保存文件 {old_full_file} 时出错: {e}") def process_single_page(title, since_time, update_timestamp=False): """只处理单个页面""" print(f"正在单独处理页面:{title}") # 获取当前最新 revid try: latest_content, latest_ts, latest_revid = get_page_content(WIKI_API_URL_EN, SESSION_EN, title) if latest_content is None: print("页面不存在或被删除") return None # 获取旧 revid old_revid = get_old_revid(title, since_time) # 初始化变量 diff_text = None old_content = None cn_content = None if old_revid: # 获取历史版本内容 old_content, old_ts, _ = get_page_content(WIKI_API_URL_EN, SESSION_EN, title, old_revid) if old_content is not None: # 生成文本diff diff_text = generate_text_diff(old_content, latest_content) print(f" 生成了文本diff ({len(diff_text)} 字符)") else: print(f" 无法获取历史版本内容") diff_text = "新创建页面" else: # 新页面 print(" 这是新创建的页面") diff_text = "新创建页面" # 搜索对应的中文页面 print(" 搜索中文翻译...") cn_title = search_chinese_page(title) if cn_title: print(f" 找到中文页面: {cn_title}") cn_content, cn_ts, cn_revid = get_page_content(WIKI_API_URL_CN, SESSION_CN, cn_title) if cn_content: print(f" 获取中文内容成功 ({len(cn_content)} 字符)") else: print(" 无法获取中文页面内容") else: print(" 未找到对应的中文翻译页面") # 保存所有文件 save_files(title, diff_text, latest_content, latest_ts, "", latest_revid, cn_content, old_content) if update_timestamp: save_last_timestamp(latest_ts) print(f"已更新全局时间戳 → {latest_ts}") return latest_ts except Exception as e: print(f"处理页面 '{title}' 时出错: {e}") return None def process_all_pages_since(since_time): """处理自指定时间以来的所有页面变更""" print("正在获取最近变更列表...") changes = get_recent_changes(since_time) if not changes: print("没有发现任何变更") return latest_global_ts = since_time for title, (latest_revid, ts) in changes.items(): print(f"\n处理:{title}") # 复用单页处理逻辑 page_latest_ts = process_single_page(title, since_time) if page_latest_ts and page_latest_ts > latest_global_ts: latest_global_ts = page_latest_ts save_last_timestamp(latest_global_ts) print(f"\n全量同步完成!本次最新时间戳已更新为:{latest_global_ts}") print(f"文件保存在:{CURRENT_OUTPUT_DIR.resolve() if CURRENT_OUTPUT_DIR else OUTPUT_DIR.resolve()}") def main(): parser = argparse.ArgumentParser(description="MediaWiki 同步工具 - 增强版支持双语对比") parser.add_argument("--since", type=str, help="强制从指定时间开始同步,格式如 2025-11-28T00:00:00Z") parser.add_argument("--title", type=str, help="只同步指定的单个页面标题") parser.add_argument("--update-timestamp", action="store_true", help="在单页模式下,完成后仍然更新全局 last_sync_timestamp.txt") parser.add_argument("--run", action="store_true", help="执行同步操作(必须提供此参数才能真正执行同步)") args = parser.parse_args() # 如果没有提供 --run 参数,则显示帮助信息并退出 if not args.run: parser.print_help() return # 确定实际使用的 since 时间 if args.since: since_time = args.since print(f"使用命令行指定的时间起点:{since_time}") else: since_time = load_last_timestamp() if not since_time: from datetime import timedelta since_time = (datetime.utcnow() - timedelta(days=1)).isoformat(timespec='seconds') + "Z" print(f"使用上次记录的时间起点:{since_time}") # 单页面模式 if args.title: process_single_page(args.title.strip(), since_time, args.update_timestamp) return # 全量模式 - 使用复用的单页处理逻辑 process_all_pages_since(since_time) if __name__ == "__main__": main()