# -*- coding: utf-8 -*-
"""
MediaWiki recent-changes sync tool - enhanced edition

Supports:
1. Full sync (no arguments)
2. Manually chosen start time: --since 2025-11-28T00:00:00Z
3. Syncing a single page: --title "Page name"
4. Optionally updating the global timestamp when syncing one page: --update-timestamp
5. Fetching historic revisions and generating diffs
6. Syncing the Chinese translation
7. Generating a bilingual comparison web page
"""
import os
import argparse
from pathlib import Path
from datetime import datetime
import requests
from dotenv import load_dotenv
import difflib
import json
import re
from urllib.parse import quote

# ==================== Configuration ====================
load_dotenv()

WIKI_API_URL_EN = os.getenv("WIKI_API_URL_EN", "https://wiki.projectdiablo2.com/w/api.php")
WIKI_API_URL_CN = os.getenv("WIKI_API_URL_CN", "https://wiki.projectdiablo2.cn/w/api.php")

OUTPUT_DIR = Path("wiki_sync_output")
OUTPUT_DIR.mkdir(exist_ok=True)

# Global: output directory chosen for the current run (set elsewhere).
CURRENT_OUTPUT_DIR = None

LAST_TIMESTAMP_FILE = "last_sync_timestamp.txt"

# Network timeout (seconds) for all wiki API requests; without it a stalled
# connection would hang the sync forever.
REQUEST_TIMEOUT = 30


def _build_session():
    """Create a requests.Session carrying the bot's User-Agent header."""
    session = requests.Session()
    session.headers.update({
        "User-Agent": "WikiSyncTool/4.0 (your-email@example.com; MediaWiki Sync Bot)"
    })
    return session


SESSION_EN = _build_session()
SESSION_CN = _build_session()
# ========================================================


def load_last_timestamp():
    """Return the timestamp saved by the previous sync run, or None if absent."""
    if not os.path.exists(LAST_TIMESTAMP_FILE):
        return None
    with open(LAST_TIMESTAMP_FILE, encoding="utf-8") as f:
        return f.read().strip()


def save_last_timestamp(ts):
    """Persist *ts* (an ISO-8601 string) so the next run can resume from it."""
    with open(LAST_TIMESTAMP_FILE, "w", encoding="utf-8") as f:
        f.write(ts)


def get_recent_changes(since):
    """Return {title: (revid, timestamp)} for every page changed since *since*.

    Walks the English wiki's recentchanges list (edits and new pages) in
    chronological order, following API continuation. Because later changes
    overwrite earlier dict entries, each page keeps only its newest revid
    (automatic de-duplication).
    """
    params = {
        "action": "query",
        "list": "recentchanges",
        "rcprop": "title|ids|timestamp",
        "rctype": "edit|new",
        "rcdir": "newer",
        "rcstart": since,
        "rclimit": 500,
        "format": "json",
    }
    latest = {}
    while True:
        try:
            r = SESSION_EN.get(WIKI_API_URL_EN, params=params, timeout=REQUEST_TIMEOUT)
            r.raise_for_status()
            response_data = r.json()
            if "error" in response_data:
                raise Exception(response_data["error"])
            for rc in response_data.get("query", {}).get("recentchanges", []):
                latest[rc["title"]] = (rc["revid"], rc["timestamp"])
            if "continue" not in response_data:
                break
            # Merge the continuation token into the next request.
            params.update(response_data["continue"])
        except Exception as e:
            print(f"获取最近更改时出错: {e}")
            break
    return latest
def get_old_revid(title, end_time):
    """Return the revid of the last revision at or before *end_time* (for fromrev).

    Queries the English wiki for a single revision older than *end_time*;
    returns None when the page has no revision before that time or on error.
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "ids|timestamp",
        "rvlimit": 1,
        "rvdir": "older",
        "rvstart": end_time,
        "format": "json",
    }
    try:
        r = SESSION_EN.get(WIKI_API_URL_EN, params=params, timeout=30).json()
        pages = r["query"]["pages"]
        page = next(iter(pages.values()))
        if "revisions" not in page:
            print(f" 页面 '{title}' 在指定时间前没有找到修订版本")
            return None
        revisions = page["revisions"]
        if revisions:
            return revisions[0]["revid"]
        print(f" 页面 '{title}' 在指定时间前没有找到修订版本")
        return None
    except Exception as e:
        print(f"获取旧版本ID时出错: {e}")
        return None


def get_page_content(wiki_url, session, title, revid=None):
    """Fetch a page's full wikitext.

    When *revid* is given, fetch exactly that revision; otherwise the latest.
    Returns (content, timestamp, revid), or (None, None, None) when the page
    has no revisions or the request fails.
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "content|timestamp|ids",
        "rvslots": "main",
        "format": "json",
    }
    if revid:
        # Pin the query to a single revision.
        params["rvstartid"] = revid
        params["rvendid"] = revid
    try:
        r = session.get(wiki_url, params=params, timeout=30).json()
        pages = r["query"]["pages"]
        page = next(iter(pages.values()))
        if "revisions" not in page:
            return None, None, None
        rev = page["revisions"][0]
        content = rev["slots"]["main"]["*"]
        timestamp = rev["timestamp"]
        rev_id = rev["revid"]
        return content, timestamp, rev_id
    except Exception as e:
        print(f"获取页面内容时出错: {e}")
        return None, None, None


def generate_text_diff(old_text, new_text):
    """Generate a git-style unified diff between *old_text* and *new_text*.

    Returns the sentinel string "新创建页面" when there is no old text
    (newly created page); returns "" when the texts are identical.
    """
    if not old_text:
        return "新创建页面"
    old_lines = old_text.splitlines(keepends=True)
    new_lines = new_text.splitlines(keepends=True)
    differ = difflib.unified_diff(
        old_lines,
        new_lines,
        lineterm='\n'
    )
    return ''.join(differ)


def parse_diff_with_line_numbers(diff_text):
    """Parse a unified-diff string into per-line records with line numbers.

    Each record is a dict with 'type' ('hunk', 'context', 'removed', 'added'
    or 'other'), 'content', and 1-based 'old_line'/'new_line' numbers (None
    when the side does not apply). Hunk records also carry the parsed range
    fields (old_start/old_count/new_start/new_count). Returns [] for empty
    input or the "新创建页面" sentinel.
    """
    if not diff_text or diff_text.startswith("新创建页面"):
        return []

    parsed_lines = []
    current_old_line = 0
    current_new_line = 0
    in_hunk = False
    # Hunk header looks like: @@ -start,count +start,count @@  (counts optional)
    hunk_re = re.compile(r'@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@')

    for line in diff_text.splitlines():
        if line.startswith('@@'):
            match = hunk_re.match(line)
            if match:
                old_start = int(match.group(1))
                old_count = int(match.group(2)) if match.group(2) else 1
                new_start = int(match.group(3))
                new_count = int(match.group(4)) if match.group(4) else 1
                current_old_line = old_start
                current_new_line = new_start
                in_hunk = True
                parsed_lines.append({
                    'type': 'hunk',
                    'content': line,
                    'old_start': old_start,
                    'old_count': old_count,
                    'new_start': new_start,
                    'new_count': new_count,
                    'old_line': None,
                    'new_line': None,
                })
            else:
                parsed_lines.append({
                    'type': 'other',
                    'content': line,
                    'old_line': None,
                    'new_line': None,
                })
        elif not in_hunk and (line.startswith('---') or line.startswith('+++')):
            # File header lines ("--- "/"+++ ") only occur before the first
            # hunk; skip them. BUG FIX: previously these prefixes were also
            # swallowed *inside* hunks, so a removed line whose content began
            # with "--" (diff line "--- ...") was dropped and the running line
            # counters went out of sync.
            continue
        elif in_hunk:
            if line.startswith('-'):
                # Removed line: exists only on the old side.
                parsed_lines.append({
                    'type': 'removed',
                    'content': line[1:],
                    'old_line': current_old_line,
                    'new_line': None,
                })
                current_old_line += 1
            elif line.startswith('+'):
                # Added line: exists only on the new side.
                parsed_lines.append({
                    'type': 'added',
                    'content': line[1:],
                    'old_line': None,
                    'new_line': current_new_line,
                })
                current_new_line += 1
            elif line.startswith(' '):
                # Unchanged context line: advances both counters.
                parsed_lines.append({
                    'type': 'context',
                    'content': line[1:],
                    'old_line': current_old_line,
                    'new_line': current_new_line,
                })
                current_old_line += 1
                current_new_line += 1
            else:
                # Anything else (e.g. blank line inside a hunk).
                parsed_lines.append({
                    'type': 'other',
                    'content': line,
                    'old_line': None,
                    'new_line': None,
                })
        else:
            # Line outside any hunk.
            parsed_lines.append({
                'type': 'other',
                'content': line,
                'old_line': None,
                'new_line': None,
            })

    return parsed_lines
line.startswith('@@'): # 解析hunk头部,格式如: @@ -start,count +start,count @@ import re match = re.match(r'@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@', line) if match: old_start = int(match.group(1)) old_count = int(match.group(2)) if match.group(2) else 1 new_start = int(match.group(3)) new_count = int(match.group(4)) if match.group(4) else 1 current_old_line = old_start current_new_line = new_start in_hunk = True parsed_lines.append({ 'type': 'hunk', 'content': line, 'old_start': old_start, 'old_count': old_count, 'new_start': new_start, 'new_count': new_count, 'old_line': None, 'new_line': None }) else: parsed_lines.append({ 'type': 'other', 'content': line, 'old_line': None, 'new_line': None }) elif line.startswith('---') or line.startswith('+++'): # 文件头信息 continue #parsed_lines.append({ # 'type': 'header', # 'content': line, # 'old_line': None, # 'new_line': None #}) elif in_hunk: if line.startswith('-'): # 删除的行 parsed_lines.append({ 'type': 'removed', 'content': line[1:], # 去掉开头的 '-' 'old_line': current_old_line, 'new_line': None }) current_old_line += 1 elif line.startswith('+'): # 新增的行 parsed_lines.append({ 'type': 'added', 'content': line[1:], # 去掉开头的 '+' 'old_line': None, 'new_line': current_new_line }) current_new_line += 1 elif line.startswith(' '): # 未变更的行 parsed_lines.append({ 'type': 'context', 'content': line[1:], # 去掉开头的 ' ' 'old_line': current_old_line, 'new_line': current_new_line }) current_old_line += 1 current_new_line += 1 else: # 其他行(如空行) parsed_lines.append({ 'type': 'other', 'content': line, 'old_line': None, 'new_line': None }) else: # 不在任何hunk中的行 parsed_lines.append({ 'type': 'other', 'content': line, 'old_line': None, 'new_line': None }) return parsed_lines def search_chinese_page(title): """在中文wiki中搜索对应的页面""" # 首先尝试精确匹配 params = { "action": "query", "list": "search", "srsearch": f'"{title}"', "srwhat": "title", "srlimit": 5, "format": "json" } try: r = SESSION_CN.get(WIKI_API_URL_CN, params=params).json() search_results = r.get("query", 
{}).get("search", []) if search_results: # 返回第一个匹配的结果 return search_results[0]["title"] # 如果精确匹配没有结果,尝试模糊搜索 params["srsearch"] = title.replace(" ", "%20") r = SESSION_CN.get(WIKI_API_URL_CN, params=params).json() search_results = r.get("query", {}).get("search", []) if search_results: return search_results[0]["title"] except Exception as e: print(f"搜索中文页面时出错: {e}") return None def create_diff_html(title, en_diff, en_old_lines, en_new_lines, cn_content=None): """创建双语对比的HTML页面 - Word批注风格,英文变更直接显示在对应中文行右侧""" # 准备中文内容行 cn_lines = [] if cn_content: cn_lines = cn_content.splitlines() # 解析diff并获取行号信息 parsed_diff = parse_diff_with_line_numbers(en_diff) if en_diff else [] # 构建行号到diff内容的映射 - 科学处理连续diff块 en_changes_by_line = {} blank_lines_to_insert = {} # 记录需要在某行前插入的空白行及其对应的新增内容 if parsed_diff: i = 0 while i < len(parsed_diff): # 收集连续的diff块 diff_block = [] start_index = i # 收集连续的添加/删除操作(跳过hunk和header) while i < len(parsed_diff): item = parsed_diff[i] if item['type'] in ['added', 'removed']: diff_block.append(item) elif item['type'] in ['hunk', 'header'] or item['type'] == 'context': if diff_block: # 如果已经有diff块,就停止 break i += 1 # 处理连续的diff块 if diff_block: # 计算行数平衡 line_balance = 0 for item in diff_block: if item['type'] == 'added': line_balance += 1 elif item['type'] == 'removed': line_balance -= 1 # 如果平衡为正数,需要在中文侧添加空白行 if line_balance > 0: # 找到基准行号(第一个操作的行号) base_line = None for item in diff_block: if item['old_line']: # 优先使用删除行的行号 base_line = item['old_line'] break elif item['new_line'] and base_line is None: base_line = item['new_line'] if base_line: # 收集需要分配到空白行的新增内容 additions_for_blank_lines = [] remaining_additions = [] for item in diff_block: if item['type'] == 'added': additions_for_blank_lines.append(item['content']) # 记录需要插入的空白行和对应的内容 blank_lines_to_insert[base_line] = additions_for_blank_lines # 处理具体的diff项 j = 0 while j < len(diff_block): item = diff_block[j] # 检查是否是替换操作(删除后紧跟新增) if (item['type'] == 'removed' and j + 1 < len(diff_block) and diff_block[j + 
1]['type'] == 'added'): next_item = diff_block[j + 1] # 这是同一行的替换操作 target_line = item['old_line'] # 使用删除行的行号作为目标行号 if target_line not in en_changes_by_line: en_changes_by_line[target_line] = [] en_changes_by_line[target_line].append({ 'type': 'replaced', 'old_content': item['content'], 'new_content': next_item['content'] }) j += 2 # 跳过下一个项目,因为已经处理了 # 处理普通的添加操作(不包括需要分配到空白行的) elif item['type'] == 'added' and item['new_line']: # 如果这个新增内容已经被分配到空白行,就跳过 if line_balance > 0 and item['content'] in blank_lines_to_insert.get(base_line, []): j += 1 continue if item['new_line'] not in en_changes_by_line: en_changes_by_line[item['new_line']] = [] en_changes_by_line[item['new_line']].append({ 'type': 'added', 'content': item['content'] }) j += 1 # 处理普通的删除操作(没有对应的新增) elif item['type'] == 'removed' and item['old_line']: if item['old_line'] not in en_changes_by_line: en_changes_by_line[item['old_line']] = [] en_changes_by_line[item['old_line']].append({ 'type': 'removed', 'content': item['content'] }) j += 1 else: j += 1 # 继续处理剩余项 else: i += 1 # HTML转义函数 def html_escape(text): if not text: return "" return (str(text) .replace("&", "&") .replace("<", "<") .replace(">", ">") .replace('"', """) .replace("'", "'")) def generate_inline_diff(old_text, new_text): """生成GitHub风格的行内字符级diff""" if not old_text or not new_text: return html_escape(new_text or "") escaped_old = html_escape(old_text) escaped_new = html_escape(new_text) # 使用difflib进行字符级别的比较 differ = difflib.SequenceMatcher(None, escaped_old, escaped_new) result = [] for tag, i1, i2, j1, j2 in differ.get_opcodes(): if tag == 'equal': # 相同的部分 result.append(escaped_new[j1:j2]) elif tag == 'replace': # 替换的部分:删除的用红色背景,新增的用绿色背景 deleted = escaped_old[i1:i2] added = escaped_new[j1:j2] result.append(f'{deleted}') result.append(f'{added}') elif tag == 'delete': # 删除的部分用红色背景 deleted = escaped_old[i1:i2] result.append(f'{deleted}') elif tag == 'insert': # 新增的部分用绿色背景 added = escaped_new[j1:j2] result.append(f'{added}') return ''.join(result) 
def generate_clean_new_content(old_text, new_text): """生成干净的新内容,只显示新增部分的高亮,不包含删除部分""" if not old_text or not new_text: return html_escape(new_text or "") escaped_old = html_escape(old_text) escaped_new = html_escape(new_text) # 使用difflib进行字符级别的比较 differ = difflib.SequenceMatcher(None, escaped_old, escaped_new) result = [] for tag, i1, i2, j1, j2 in differ.get_opcodes(): if tag == 'equal': # 相同的部分 result.append(escaped_new[j1:j2]) elif tag == 'replace': # 替换的部分:只显示新增的内容(绿色高亮),跳过删除的内容 added = escaped_new[j1:j2] result.append(f'{added}') elif tag == 'delete': # 删除的部分:跳过,不显示 continue elif tag == 'insert': # 新增的部分用绿色背景 added = escaped_new[j1:j2] result.append(f'{added}') return ''.join(result) # 收集变更块信息用于导航 change_blocks = [] change_block_id = 0 # 生成HTML html = f'''