# -*- coding: utf-8 -*-
"""
MediaWiki recent-changes sync tool — enhanced edition.

Supports:
1. Full incremental sync (no arguments)
2. Manually specified start time: --since 2025-11-28T00:00:00Z
3. Syncing a single page only: --title "Page name"
4. Optionally updating the global timestamp in single-page mode: --update-timestamp
5. Fetching historical revisions and generating diffs
6. Syncing the Chinese translated version
7. Generating a bilingual comparison web page
"""
import os
import argparse
import difflib
import json
import re
from datetime import datetime, timedelta, timezone
from pathlib import Path
from urllib.parse import quote

import requests
from dotenv import load_dotenv

# ==================== Configuration ====================
load_dotenv()

WIKI_API_URL_EN = os.getenv("WIKI_API_URL_EN", "https://wiki.projectdiablo2.com/w/api.php")
WIKI_API_URL_CN = os.getenv("WIKI_API_URL_CN", "https://wiki.projectdiablo2.cn/w/api.php")

OUTPUT_DIR = Path("wiki_sync_output")
OUTPUT_DIR.mkdir(exist_ok=True)

# Per-run output directory; created lazily by save_files() on first write.
CURRENT_OUTPUT_DIR = None

LAST_TIMESTAMP_FILE = "last_sync_timestamp.txt"

# Timeout (seconds) for every API request so a stalled call cannot hang the sync.
REQUEST_TIMEOUT = 30

SESSION_EN = requests.Session()
SESSION_EN.headers.update({
    "User-Agent": "WikiSyncTool/4.0 (your-email@example.com; MediaWiki Sync Bot)"
})
SESSION_CN = requests.Session()
SESSION_CN.headers.update({
    "User-Agent": "WikiSyncTool/4.0 (your-email@example.com; MediaWiki Sync Bot)"
})
# ================================================


def load_last_timestamp():
    """Return the saved sync timestamp string, or None if none was saved yet."""
    if not os.path.exists(LAST_TIMESTAMP_FILE):
        return None
    with open(LAST_TIMESTAMP_FILE, encoding="utf-8") as f:
        return f.read().strip()


def save_last_timestamp(ts):
    """Persist *ts* (ISO-8601 string) as the last successful sync timestamp."""
    with open(LAST_TIMESTAMP_FILE, "w", encoding="utf-8") as f:
        f.write(ts)


def get_recent_changes(since):
    """Return the newest revid per page changed since *since* (deduplicated).

    Follows API continuation; result maps title -> (revid, timestamp).
    Errors are logged and stop pagination (best-effort, partial results kept).
    """
    params = {
        "action": "query",
        "list": "recentchanges",
        "rcprop": "title|ids|timestamp",
        "rctype": "edit|new",
        "rcdir": "newer",
        "rcstart": since,
        "rclimit": 500,
        "format": "json",
    }
    latest = {}
    while True:
        try:
            r = SESSION_EN.get(WIKI_API_URL_EN, params=params, timeout=REQUEST_TIMEOUT)
            r.raise_for_status()
            response_data = r.json()
            if "error" in response_data:
                raise Exception(response_data["error"])
            for rc in response_data.get("query", {}).get("recentchanges", []):
                # Later batches are newer (rcdir=newer), so the last write wins.
                latest[rc["title"]] = (rc["revid"], rc["timestamp"])
            if "continue" not in response_data:
                break
            params.update(response_data["continue"])
        except Exception as e:
            print(f"获取最近更改时出错: {e}")
            break
    return latest


def get_old_revid(title, end_time):
    """Return the revid of the last revision at or before *end_time* (for fromrev).

    Returns None when the page has no revision before that time or on error.
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "ids|timestamp",
        "rvlimit": 1,
        "rvdir": "older",
        "rvstart": end_time,
        "format": "json",
    }
    try:
        r = SESSION_EN.get(WIKI_API_URL_EN, params=params, timeout=REQUEST_TIMEOUT).json()
        pages = r["query"]["pages"]
        page = next(iter(pages.values()))
        if "revisions" not in page:
            print(f" 页面 '{title}' 在指定时间前没有找到修订版本")
            return None
        revisions = page["revisions"]
        if len(revisions) >= 1:
            return revisions[0]["revid"]
        print(f" 页面 '{title}' 在指定时间前没有找到修订版本")
        return None
    except Exception as e:
        print(f"获取旧版本ID时出错: {e}")
        return None


def get_page_content(wiki_url, session, title, revid=None):
    """Fetch the full wikitext of *title* (a specific *revid* if given).

    Returns (content, timestamp, revid) or (None, None, None) when the page
    is missing or the request fails.
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "content|timestamp|ids",
        "rvslots": "main",
        "format": "json",
    }
    if revid:
        # Pin both ends of the range to fetch exactly one historical revision.
        params["rvstartid"] = revid
        params["rvendid"] = revid
    try:
        r = session.get(wiki_url, params=params, timeout=REQUEST_TIMEOUT).json()
        pages = r["query"]["pages"]
        page = next(iter(pages.values()))
        if "revisions" not in page:
            return None, None, None
        rev = page["revisions"][0]
        content = rev["slots"]["main"]["*"]
        timestamp = rev["timestamp"]
        rev_id = rev["revid"]
        return content, timestamp, rev_id
    except Exception as e:
        print(f"获取页面内容时出错: {e}")
        return None, None, None


def generate_text_diff(old_text, new_text):
    """Return a git-style unified diff between two wikitext versions."""
    if not old_text:
        return "新创建页面"
    old_lines = old_text.splitlines(keepends=True)
    new_lines = new_text.splitlines(keepends=True)
    differ = difflib.unified_diff(old_lines, new_lines, lineterm='\n')
    return ''.join(differ)


def parse_diff_with_line_numbers(diff_text):
    """Parse a unified diff into records carrying old/new line numbers.

    Each record is a dict with keys: type ('hunk'/'added'/'removed'/
    'context'/'other'), content, old_line, new_line (line numbers are None
    where not applicable).
    """
    if not diff_text or diff_text.startswith("新创建页面"):
        return []

    parsed_lines = []
    current_old_line = 0
    current_new_line = 0
    in_hunk = False

    for line in diff_text.splitlines():
        if line.startswith('@@'):
            # Hunk header, e.g.: @@ -start,count +start,count @@
            match = re.match(r'@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@', line)
            if match:
                old_start = int(match.group(1))
                old_count = int(match.group(2)) if match.group(2) else 1
                new_start = int(match.group(3))
                new_count = int(match.group(4)) if match.group(4) else 1
                current_old_line = old_start
                current_new_line = new_start
                in_hunk = True
                parsed_lines.append({
                    'type': 'hunk',
                    'content': line,
                    'old_start': old_start,
                    'old_count': old_count,
                    'new_start': new_start,
                    'new_count': new_count,
                    'old_line': None,
                    'new_line': None,
                })
            else:
                parsed_lines.append({
                    'type': 'other',
                    'content': line,
                    'old_line': None,
                    'new_line': None,
                })
        elif line.startswith('---') or line.startswith('+++'):
            # File header lines carry no line-number information; skip them.
            continue
        elif in_hunk:
            if line.startswith('-'):
                parsed_lines.append({
                    'type': 'removed',
                    'content': line[1:],  # strip leading '-'
                    'old_line': current_old_line,
                    'new_line': None,
                })
                current_old_line += 1
            elif line.startswith('+'):
                parsed_lines.append({
                    'type': 'added',
                    'content': line[1:],  # strip leading '+'
                    'old_line': None,
                    'new_line': current_new_line,
                })
                current_new_line += 1
            elif line.startswith(' '):
                parsed_lines.append({
                    'type': 'context',
                    'content': line[1:],  # strip leading ' '
                    'old_line': current_old_line,
                    'new_line': current_new_line,
                })
                current_old_line += 1
                current_new_line += 1
            else:
                # e.g. blank lines inside a hunk
                parsed_lines.append({
                    'type': 'other',
                    'content': line,
                    'old_line': None,
                    'new_line': None,
                })
        else:
            # Lines outside any hunk
            parsed_lines.append({
                'type': 'other',
                'content': line,
                'old_line': None,
                'new_line': None,
            })
    return parsed_lines


def search_chinese_page(title):
    """Search the Chinese wiki for the page matching *title*; return its title or None."""
    # First try an exact (quoted) title match.
    params = {
        "action": "query",
        "list": "search",
        "srsearch": f'"{title}"',
        "srwhat": "title",
        "srlimit": 5,
        "format": "json",
    }
    try:
        r = SESSION_CN.get(WIKI_API_URL_CN, params=params, timeout=REQUEST_TIMEOUT).json()
        search_results = r.get("query", {}).get("search", [])
        if search_results:
            return search_results[0]["title"]
        # No exact match: fall back to a fuzzy (unquoted) search.
        # NOTE: pass the raw title — `requests` URL-encodes params itself;
        # pre-encoding spaces as %20 would be double-encoded and break the query.
        params["srsearch"] = title
        r = SESSION_CN.get(WIKI_API_URL_CN, params=params, timeout=REQUEST_TIMEOUT).json()
        search_results = r.get("query", {}).get("search", [])
        if search_results:
            return search_results[0]["title"]
    except Exception as e:
        print(f"搜索中文页面时出错: {e}")
    return None


def create_diff_html(title, en_diff, en_old_lines, en_new_lines, cn_content=None):
    """Build the bilingual comparison HTML page (Word-annotation style).

    English changes are rendered as annotations next to the corresponding
    Chinese line. ``en_old_lines`` / ``en_new_lines`` are kept for interface
    compatibility; rendering is driven by the parsed unified diff *en_diff*.

    NOTE(review): the original HTML/CSS markup of this template was mangled
    in transit (tags stripped); the tag structure and class names below are a
    faithful reconstruction of the visible rendering logic — confirm against
    a known-good generated page.
    """
    cn_lines = cn_content.splitlines() if cn_content else []
    parsed_diff = parse_diff_with_line_numbers(en_diff) if en_diff else []

    # Map Chinese line number -> list of English change annotations.
    en_changes_by_line = {}
    # Lines before which placeholder rows (for net additions) must be inserted.
    blank_lines_to_insert = {}

    if parsed_diff:
        i = 0
        while i < len(parsed_diff):
            # Collect one contiguous run of added/removed diff items.
            diff_block = []
            while i < len(parsed_diff):
                item = parsed_diff[i]
                if item['type'] in ('added', 'removed'):
                    diff_block.append(item)
                elif item['type'] in ('hunk', 'header', 'context'):
                    if diff_block:  # a run is open — close it here
                        break
                i += 1

            if diff_block:
                # Net line balance: +1 per addition, -1 per removal.
                line_balance = 0
                for item in diff_block:
                    if item['type'] == 'added':
                        line_balance += 1
                    elif item['type'] == 'removed':
                        line_balance -= 1

                base_line = None
                if line_balance > 0:
                    # Positive balance: extra lines need placeholder rows on
                    # the Chinese side. Anchor them at the first numbered item
                    # (removed lines preferred).
                    for item in diff_block:
                        if item['old_line']:
                            base_line = item['old_line']
                            break
                        if item['new_line'] and base_line is None:
                            base_line = item['new_line']
                    if base_line:
                        blank_lines_to_insert[base_line] = [
                            item['content'] for item in diff_block
                            if item['type'] == 'added'
                        ]

                # Classify the run into replaced / added / removed changes.
                j = 0
                while j < len(diff_block):
                    item = diff_block[j]
                    if (item['type'] == 'removed' and j + 1 < len(diff_block)
                            and diff_block[j + 1]['type'] == 'added'):
                        # A removal immediately followed by an addition is a
                        # same-line replacement.
                        next_item = diff_block[j + 1]
                        target_line = item['old_line']
                        en_changes_by_line.setdefault(target_line, []).append({
                            'type': 'replaced',
                            'old_content': item['content'],
                            'new_content': next_item['content'],
                        })
                        j += 2  # consumed both items
                    elif item['type'] == 'added' and item['new_line']:
                        # Skip additions already routed to placeholder rows.
                        if (line_balance > 0
                                and item['content'] in blank_lines_to_insert.get(base_line, [])):
                            j += 1
                            continue
                        en_changes_by_line.setdefault(item['new_line'], []).append({
                            'type': 'added',
                            'content': item['content'],
                        })
                        j += 1
                    elif item['type'] == 'removed' and item['old_line']:
                        en_changes_by_line.setdefault(item['old_line'], []).append({
                            'type': 'removed',
                            'content': item['content'],
                        })
                        j += 1
                    else:
                        j += 1
            else:
                i += 1

    def html_escape(text):
        """Escape text for safe embedding in HTML."""
        if not text:
            return ""
        return (str(text)
                .replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
                .replace('"', "&quot;")
                .replace("'", "&#x27;"))

    def generate_inline_diff(old_text, new_text):
        """GitHub-style inline character-level diff (deletions and insertions)."""
        if not old_text or not new_text:
            return html_escape(new_text or "")
        escaped_old = html_escape(old_text)
        escaped_new = html_escape(new_text)
        differ = difflib.SequenceMatcher(None, escaped_old, escaped_new)
        result = []
        for tag, i1, i2, j1, j2 in differ.get_opcodes():
            if tag == 'equal':
                result.append(escaped_new[j1:j2])
            elif tag == 'replace':
                # Deleted part on red background, inserted part on green.
                result.append(f'<del class="removed-text">{escaped_old[i1:i2]}</del>')
                result.append(f'<ins class="added-text">{escaped_new[j1:j2]}</ins>')
            elif tag == 'delete':
                result.append(f'<del class="removed-text">{escaped_old[i1:i2]}</del>')
            elif tag == 'insert':
                result.append(f'<ins class="added-text">{escaped_new[j1:j2]}</ins>')
        return ''.join(result)

    def generate_clean_new_content(old_text, new_text):
        """Render the new content only, highlighting insertions (no deletions shown)."""
        if not old_text or not new_text:
            return html_escape(new_text or "")
        escaped_old = html_escape(old_text)
        escaped_new = html_escape(new_text)
        differ = difflib.SequenceMatcher(None, escaped_old, escaped_new)
        result = []
        for tag, i1, i2, j1, j2 in differ.get_opcodes():
            if tag == 'equal':
                result.append(escaped_new[j1:j2])
            elif tag == 'replace':
                # Show only the inserted side of a replacement.
                result.append(f'<ins class="added-text">{escaped_new[j1:j2]}</ins>')
            elif tag == 'delete':
                continue  # deletions are intentionally hidden here
            elif tag == 'insert':
                result.append(f'<ins class="added-text">{escaped_new[j1:j2]}</ins>')
        return ''.join(result)

    # ---- Assemble the page -------------------------------------------------
    html = f'''<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="utf-8">
<title>Wiki Diff: {html_escape(title)}</title>
<style>
body {{ font-family: sans-serif; margin: 20px; line-height: 1.5; }}
.cn-line {{ display: flex; border-bottom: 1px solid #eee; padding: 2px 0; }}
.line-number {{ width: 3em; color: #999; text-align: right; margin-right: 1em; flex-shrink: 0; }}
.annotation {{ background: #fff8dc; border-left: 3px solid #e0a800; margin: 4px 0 4px 4em; padding: 4px 8px; }}
.tag-added {{ color: #22863a; font-weight: bold; }}
.tag-removed {{ color: #cb2431; font-weight: bold; }}
.tag-replaced {{ color: #735c0f; font-weight: bold; }}
.removed-text {{ background: #ffeef0; text-decoration: line-through; }}
.added-text {{ background: #e6ffed; text-decoration: none; }}
.placeholder {{ color: #999; font-style: italic; }}
</style>
</head>
<body>
<h1>{html_escape(title)}</h1>
<p>英文Wiki: wiki.projectdiablo2.com{' | 中文Wiki: wiki.projectdiablo2.cn' if cn_content else ''}</p>
<h2>中文翻译(含英文变更批注)</h2>
'''

    if cn_content:
        for line_no, line in enumerate(cn_lines, 1):
            # Placeholder rows for net-added English lines anchored before this line.
            if line_no in blank_lines_to_insert:
                for addition_content in blank_lines_to_insert[line_no]:
                    html += '<div class="cn-line">'
                    html += '<span class="line-number"></span>'  # no line number
                    html += '<span class="placeholder">(新增英文内容占位)</span>'
                    html += '</div>'
                    escaped_addition = html_escape(addition_content)
                    html += '<div class="annotation">'
                    html += '<span class="tag-added">新增</span> '
                    html += f'<span class="added-text">{escaped_addition}</span>'
                    html += '</div>'

            escaped_line = html_escape(line)
            changes = en_changes_by_line.get(line_no, [])
            is_empty = not line.strip()

            html += '<div class="cn-line">'
            html += f'<span class="line-number">{line_no}</span>'
            html += f'<span>{escaped_line if not is_empty else "(空行)"}</span>'
            html += '</div>'

            # Annotations: replacements and removals only — additions were
            # already rendered on placeholder rows above.
            if changes:
                for change in changes:
                    if change['type'] == 'added':
                        continue
                    if change['type'] == 'removed':
                        html += '<div class="annotation">'
                        html += '<span class="tag-removed">删除</span> '
                        html += f'<span class="removed-text">{html_escape(change["content"])}</span>'
                        html += '</div>'
                    elif change['type'] == 'replaced':
                        clean_new_content = generate_clean_new_content(
                            change['old_content'], change['new_content'])
                        html += '<div class="annotation">'
                        html += '<span class="tag-replaced">替换</span>'
                        html += f'<div class="removed-text">{html_escape(change["old_content"])}</div>'
                        html += f'<div>{clean_new_content}</div>'
                        html += '</div>'
    else:
        html += '<p class="placeholder">未找到对应的中文翻译页面</p>'

    html += '''
</body>
</html>'''
    return html


def save_files(title, diff_html, diff_text, full_text, timestamp, note="",
               revid=None, cn_content=None, old_full_text=None):
    """Write all artifacts for one page into this run's output directory.

    Artifacts (each written only when its content is available):
    official HTML diff, text diff, latest full text, old full text,
    Chinese translation, and the bilingual comparison page.
    """
    global CURRENT_OUTPUT_DIR
    # Create the per-run output directory lazily, once.
    if CURRENT_OUTPUT_DIR is None:
        current_time_str = datetime.now().strftime("%Y%m%d_%H%M%S")
        CURRENT_OUTPUT_DIR = OUTPUT_DIR / current_time_str
        CURRENT_OUTPUT_DIR.mkdir(exist_ok=True)
        print(f"创建本次执行的输出目录: {CURRENT_OUTPUT_DIR}")

    # Sanitize the title so it is safe as a file name component.
    safe_title = "".join(c if c.isalnum() or c in " -_." else "_" for c in title)
    time_str = timestamp[:19].replace("-", "").replace(":", "").replace("T", "_")
    base_filename = f"{safe_title}-{time_str}-{revid}" if revid else f"{safe_title}-{time_str}"

    files_to_save = []

    # 1. Official MediaWiki HTML diff
    if diff_html:
        files_to_save.append((CURRENT_OUTPUT_DIR / f"{base_filename}.diff.html", diff_html))
    # 2. Text-format diff
    if diff_text:
        files_to_save.append((CURRENT_OUTPUT_DIR / f"{base_filename}.diff.txt", diff_text))
    # 3. Latest full content
    if full_text:
        files_to_save.append((CURRENT_OUTPUT_DIR / f"{base_filename}.full.txt", full_text))
    # 4. Historical full content (if any)
    if old_full_text:
        files_to_save.append((CURRENT_OUTPUT_DIR / f"{base_filename}.old.txt", old_full_text))
    # 5. Chinese translation (if any)
    if cn_content:
        files_to_save.append((CURRENT_OUTPUT_DIR / f"{base_filename}.cn.txt", cn_content))
    # 6. Bilingual comparison HTML page
    if cn_content:
        en_new_lines = full_text.splitlines() if full_text else []
        en_old_lines = old_full_text.splitlines() if old_full_text else []
        comparison_html = create_diff_html(title, diff_text, en_old_lines,
                                           en_new_lines, cn_content)
        comparison_file = CURRENT_OUTPUT_DIR / f"{base_filename}.comparison.html"
        files_to_save.append((comparison_file, comparison_html))

    # Write everything; log each file once, after the write actually succeeds.
    for file_path, content in files_to_save:
        try:
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(content)
            print(f" → 已保存: {file_path.relative_to(OUTPUT_DIR)}")
        except Exception as e:
            print(f" → 保存文件 {file_path} 时出错: {e}")


def process_single_page(title, since_time, update_timestamp=False):
    """Sync a single page: fetch latest + old revisions, diff, Chinese page, save.

    Returns the latest revision timestamp on success, else None. Updates the
    global timestamp file only when *update_timestamp* is True.
    """
    print(f"正在单独处理页面:{title}")
    try:
        # Latest revision
        latest_content, latest_ts, latest_revid = get_page_content(
            WIKI_API_URL_EN, SESSION_EN, title)
        if latest_content is None:
            print("页面不存在或被删除")
            return None

        # Last revision at or before the sync start time
        old_revid = get_old_revid(title, since_time)

        diff_html = None
        diff_text = None
        old_content = None
        cn_content = None

        if old_revid:
            old_content, old_ts, _ = get_page_content(
                WIKI_API_URL_EN, SESSION_EN, title, old_revid)
            if old_content is not None:
                diff_text = generate_text_diff(old_content, latest_content)
                print(f" 生成了文本diff ({len(diff_text)} 字符)")
            else:
                print(f" 无法获取历史版本内容")
        else:
            print(" 这是新创建的页面")

        # Look up the Chinese translation
        print(" 搜索中文翻译...")
        cn_title = search_chinese_page(title)
        if cn_title:
            print(f" 找到中文页面: {cn_title}")
            cn_content, cn_ts, cn_revid = get_page_content(
                WIKI_API_URL_CN, SESSION_CN, cn_title)
            if cn_content:
                print(f" 获取中文内容成功 ({len(cn_content)} 字符)")
            else:
                print(" 无法获取中文页面内容")
        else:
            print(" 未找到对应的中文翻译页面")

        # Official rendered diff (optional, best-effort)
        if old_revid:
            diff_params = {
                "action": "compare",
                "fromrev": old_revid,
                "torev": latest_revid,
                "format": "json",
            }
            try:
                diff_resp = SESSION_EN.get(WIKI_API_URL_EN, params=diff_params,
                                           timeout=REQUEST_TIMEOUT).json()
                diff_html = diff_resp.get("compare", {}).get("*", "")
            except Exception as e:
                print(f" 获取官方HTML diff时出错: {e}")

        save_files(title, diff_html, diff_text, latest_content, latest_ts, "",
                   latest_revid, cn_content, old_content)

        if update_timestamp:
            save_last_timestamp(latest_ts)
            print(f"已更新全局时间戳 → {latest_ts}")
        return latest_ts
    except Exception as e:
        print(f"处理页面 '{title}' 时出错: {e}")
        return None


def process_all_pages_since(since_time):
    """Sync every page changed since *since_time* and advance the global timestamp."""
    print("正在获取最近变更列表...")
    changes = get_recent_changes(since_time)
    if not changes:
        print("没有发现任何变更")
        return

    latest_global_ts = since_time
    for title, (latest_revid, ts) in changes.items():
        print(f"\n处理:{title}")
        # Reuse the single-page pipeline for each changed page.
        page_latest_ts = process_single_page(title, since_time)
        if page_latest_ts and page_latest_ts > latest_global_ts:
            latest_global_ts = page_latest_ts

    save_last_timestamp(latest_global_ts)
    print(f"\n全量同步完成!本次最新时间戳已更新为:{latest_global_ts}")
    print(f"文件保存在:{CURRENT_OUTPUT_DIR.resolve() if CURRENT_OUTPUT_DIR else OUTPUT_DIR.resolve()}")


def main():
    """CLI entry point: parse arguments and dispatch single-page or full sync."""
    parser = argparse.ArgumentParser(description="MediaWiki 同步工具 - 增强版支持双语对比")
    parser.add_argument("--since", type=str,
                        help="强制从指定时间开始同步,格式如 2025-11-28T00:00:00Z")
    parser.add_argument("--title", type=str,
                        help="只同步指定的单个页面标题")
    parser.add_argument("--update-timestamp", action="store_true",
                        help="在单页模式下,完成后仍然更新全局 last_sync_timestamp.txt")
    parser.add_argument("--run", action="store_true",
                        help="执行同步操作(必须提供此参数才能真正执行同步)")
    args = parser.parse_args()

    # Without --run, only show help (safety switch against accidental syncs).
    if not args.run:
        parser.print_help()
        return

    # Determine the effective start time.
    if args.since:
        since_time = args.since
        print(f"使用命令行指定的时间起点:{since_time}")
    else:
        since_time = load_last_timestamp()
        if not since_time:
            # No saved timestamp: default to 24 hours ago (timezone-aware UTC;
            # datetime.utcnow() is deprecated).
            since_time = (datetime.now(timezone.utc) - timedelta(days=1)
                          ).strftime("%Y-%m-%dT%H:%M:%SZ")
        print(f"使用上次记录的时间起点:{since_time}")

    # Single-page mode
    if args.title:
        process_single_page(args.title.strip(), since_time, args.update_timestamp)
        return

    # Full mode — reuses the single-page pipeline per changed page.
    process_all_pages_since(since_time)


if __name__ == "__main__":
    main()