# -*- coding: utf-8 -*-
"""
MediaWiki 最近变更同步工具 - 增强版
支持:
1. 正常全量同步(无参数)
2. 手动指定时间起点:--since 2025-11-28T00:00:00Z
3. 只同步单个页面:--title "页面名称"
4. 单个页面时可选更新全局时间戳:--update-timestamp
5. 获取历史版本并生成diff
6. 同步中文翻译版本
7. 生成双语对比网页
"""
import argparse
import difflib
import json
import os
import re
from datetime import datetime, timedelta, timezone
from pathlib import Path
from urllib.parse import quote

import requests
from dotenv import load_dotenv
# ==================== Configuration ====================
load_dotenv()
# API endpoints, overridable via environment / .env file.
WIKI_API_URL_EN = os.getenv("WIKI_API_URL_EN", "https://wiki.projectdiablo2.com/w/api.php")
WIKI_API_URL_CN = os.getenv("WIKI_API_URL_CN", "https://wiki.projectdiablo2.cn/w/api.php")
OUTPUT_DIR = Path("wiki_sync_output")
OUTPUT_DIR.mkdir(exist_ok=True)
# Module-level handle to this run's timestamped output subdirectory.
# Created lazily (in save_files) so a run that saves nothing leaves no directory.
CURRENT_OUTPUT_DIR = None
# File persisting the last successful sync timestamp between runs.
LAST_TIMESTAMP_FILE = "last_sync_timestamp.txt"
# Separate sessions for the English and Chinese wikis.
SESSION_EN = requests.Session()
SESSION_EN.headers.update({
    "User-Agent": "WikiSyncTool/4.0 (your-email@example.com; MediaWiki Sync Bot)"
})
SESSION_CN = requests.Session()
SESSION_CN.headers.update({
    "User-Agent": "WikiSyncTool/4.0 (your-email@example.com; MediaWiki Sync Bot)"
})
# ================================================
def load_last_timestamp():
    """Return the saved sync-checkpoint timestamp, or None if none exists."""
    try:
        with open(LAST_TIMESTAMP_FILE, encoding="utf-8") as fh:
            return fh.read().strip()
    except FileNotFoundError:
        return None
def save_last_timestamp(ts):
    """Persist *ts* as the global sync checkpoint."""
    Path(LAST_TIMESTAMP_FILE).write_text(ts, encoding="utf-8")
def get_recent_changes(since):
    """Return ``{title: (revid, timestamp)}`` for pages changed since *since*.

    Walks the ``list=recentchanges`` API with continuation; later batches
    overwrite earlier entries for the same title, so each page keeps only
    its newest revision id (automatic de-duplication).
    Network/API errors are logged and terminate the walk early, returning
    whatever was collected so far (best-effort).
    """
    params = {
        "action": "query",
        "list": "recentchanges",
        "rcprop": "title|ids|timestamp",
        "rctype": "edit|new",
        "rcdir": "newer",
        "rcstart": since,
        "rclimit": 500,
        "format": "json",
    }
    latest = {}
    while True:
        try:
            # timeout added: requests otherwise blocks indefinitely on a hung server.
            r = SESSION_EN.get(WIKI_API_URL_EN, params=params, timeout=30)
            r.raise_for_status()
            response_data = r.json()
            if "error" in response_data:
                raise Exception(response_data["error"])
            for rc in response_data.get("query", {}).get("recentchanges", []):
                latest[rc["title"]] = (rc["revid"], rc["timestamp"])
            if "continue" not in response_data:
                break
            # Feed the continuation token back into the next request.
            params.update(response_data["continue"])
        except Exception as e:
            print(f"获取最近更改时出错: {e}")
            break
    return latest
def get_old_revid(title, end_time):
    """Return the revid of the last revision of *title* at or before *end_time*.

    Used as the ``fromrev`` side of a compare. Returns None when the page
    has no revision before that time or on request failure.
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "ids|timestamp",
        "rvlimit": 1,
        # Walk backwards from end_time: the single hit is the newest rev <= end_time.
        "rvdir": "older",
        "rvstart": end_time,
        "format": "json",
    }
    try:
        # timeout added: requests otherwise blocks indefinitely on a hung server.
        r = SESSION_EN.get(WIKI_API_URL_EN, params=params, timeout=30).json()
        page = next(iter(r["query"]["pages"].values()))
        revisions = page.get("revisions")
        if revisions:
            return revisions[0]["revid"]
        # Covers both "no revisions key" and "empty list" (was duplicated code).
        print(f" 页面 '{title}' 在指定时间前没有找到修订版本")
        return None
    except Exception as e:
        print(f"获取旧版本ID时出错: {e}")
        return None
def get_page_content(wiki_url, session, title, revid=None):
    """Fetch the full wikitext of *title* from *wiki_url* using *session*.

    When *revid* is given, pins the query to exactly that revision
    (rvstartid == rvendid selects a single revision); otherwise the latest
    revision is returned.

    Returns ``(content, timestamp, revid)`` or ``(None, None, None)`` when
    the page is missing/deleted or the request fails.
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "content|timestamp|ids",
        "rvslots": "main",
        "format": "json",
    }
    if revid:
        params["rvstartid"] = revid
        params["rvendid"] = revid
    try:
        # timeout added: requests otherwise blocks indefinitely on a hung server.
        r = session.get(wiki_url, params=params, timeout=30).json()
        page = next(iter(r["query"]["pages"].values()))
        if "revisions" not in page:
            return None, None, None
        rev = page["revisions"][0]
        return rev["slots"]["main"]["*"], rev["timestamp"], rev["revid"]
    except Exception as e:
        print(f"获取页面内容时出错: {e}")
        return None, None, None
def generate_text_diff(old_text, new_text):
    """Return a git-style unified diff between *old_text* and *new_text*.

    A falsy *old_text* means the page did not exist before, in which case
    the placeholder string "新创建页面" is returned instead of a diff.
    """
    if not old_text:
        return "新创建页面"
    diff_iter = difflib.unified_diff(
        old_text.splitlines(keepends=True),
        new_text.splitlines(keepends=True),
        lineterm='\n',
    )
    return ''.join(diff_iter)
# Hunk header format: @@ -start[,count] +start[,count] @@
# Compiled once at module level (the original re-imported `re` and
# recompiled this pattern for every hunk header).
_HUNK_HEADER_RE = re.compile(r'@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@')


def parse_diff_with_line_numbers(diff_text):
    """Parse a unified-diff string into row dicts with precise line numbers.

    Each entry carries:
      - 'type': 'hunk' | 'removed' | 'added' | 'context' | 'other'
      - 'content': the line text with its leading diff marker stripped
      - 'old_line' / 'new_line': 1-based position in the old/new text,
        or None where not applicable
    Hunk rows additionally carry old_start/old_count/new_start/new_count.
    File-header lines ('---' / '+++') are dropped.

    Returns [] for empty input or the "新创建页面" placeholder.
    """
    if not diff_text or diff_text.startswith("新创建页面"):
        return []
    parsed_lines = []
    current_old_line = 0
    current_new_line = 0
    in_hunk = False
    for line in diff_text.splitlines():
        if line.startswith('@@'):
            match = _HUNK_HEADER_RE.match(line)
            if match:
                old_start = int(match.group(1))
                old_count = int(match.group(2)) if match.group(2) else 1
                new_start = int(match.group(3))
                new_count = int(match.group(4)) if match.group(4) else 1
                # Reset the running counters to this hunk's start positions.
                current_old_line = old_start
                current_new_line = new_start
                in_hunk = True
                parsed_lines.append({
                    'type': 'hunk',
                    'content': line,
                    'old_start': old_start,
                    'old_count': old_count,
                    'new_start': new_start,
                    'new_count': new_count,
                    'old_line': None,
                    'new_line': None,
                })
            else:
                parsed_lines.append({
                    'type': 'other',
                    'content': line,
                    'old_line': None,
                    'new_line': None,
                })
        elif line.startswith(('---', '+++')):
            # File-name header lines carry no row data.
            continue
        elif in_hunk:
            if line.startswith('-'):
                parsed_lines.append({
                    'type': 'removed',
                    'content': line[1:],
                    'old_line': current_old_line,
                    'new_line': None,
                })
                current_old_line += 1
            elif line.startswith('+'):
                parsed_lines.append({
                    'type': 'added',
                    'content': line[1:],
                    'old_line': None,
                    'new_line': current_new_line,
                })
                current_new_line += 1
            elif line.startswith(' '):
                parsed_lines.append({
                    'type': 'context',
                    'content': line[1:],
                    'old_line': current_old_line,
                    'new_line': current_new_line,
                })
                current_old_line += 1
                current_new_line += 1
            else:
                # e.g. a blank line inside a hunk: keep it, but unnumbered.
                parsed_lines.append({
                    'type': 'other',
                    'content': line,
                    'old_line': None,
                    'new_line': None,
                })
        else:
            # Lines outside any hunk.
            parsed_lines.append({
                'type': 'other',
                'content': line,
                'old_line': None,
                'new_line': None,
            })
    return parsed_lines
def search_chinese_page(title):
    """Search the Chinese wiki for a page matching *title*.

    Tries an exact quoted-phrase title search first, then falls back to an
    unquoted fuzzy search. Returns the best-matching title or None.
    """
    params = {
        "action": "query",
        "list": "search",
        "srsearch": f'"{title}"',
        "srwhat": "title",
        "srlimit": 5,
        "format": "json",
    }
    try:
        r = SESSION_CN.get(WIKI_API_URL_CN, params=params, timeout=30).json()
        search_results = r.get("query", {}).get("search", [])
        if search_results:
            return search_results[0]["title"]
        # Fuzzy fallback. Bug fix: the original substituted "%20" for spaces,
        # but requests URL-encodes parameters itself, so the API received a
        # literal "%20" in the search query. Pass the raw title instead.
        params["srsearch"] = title
        r = SESSION_CN.get(WIKI_API_URL_CN, params=params, timeout=30).json()
        search_results = r.get("query", {}).get("search", [])
        if search_results:
            return search_results[0]["title"]
    except Exception as e:
        print(f"搜索中文页面时出错: {e}")
    return None
def create_diff_html(title, en_diff, en_old_lines, en_new_lines, cn_content=None):
    """Build a standalone bilingual HTML page: English diff on the left,
    Chinese translation on the right.

    NOTE(review): the HTML markup in the original function body was
    corrupted in the file (tags stripped from the string literals), leaving
    it syntactically invalid. This is a reconstruction that preserves the
    original data flow: diff rows with old/new line numbers, a
    ``data-cn-line`` attribute (plus a "中文第N行" tooltip) linking
    added/context rows to the same-numbered Chinese line, a full numbered
    listing of the new text when no diff exists, and a numbered listing of
    the Chinese text (or a "not found" notice).

    *en_old_lines* is accepted for interface compatibility but not rendered.
    """
    from html import escape  # local import: module namespace is unchanged

    cn_lines = cn_content.splitlines() if cn_content else []
    parsed_diff = parse_diff_with_line_numbers(en_diff) if en_diff else []

    def cn_attrs(line_no):
        # Link a row to the same-numbered Chinese line when one exists.
        if line_no and cn_lines and line_no <= len(cn_lines):
            return f' data-cn-line="{line_no}" title="中文第{line_no}行"'
        return ''

    en_rows = []
    if parsed_diff:
        for item in parsed_diff:
            text = escape(item['content'])
            kind = item['type']
            if kind == 'hunk':
                en_rows.append(f'<tr class="hunk"><td class="lineno"></td><td>{text}</td></tr>')
            elif kind == 'removed':
                en_rows.append(f'<tr class="removed"><td class="lineno">{item["old_line"] or ""}</td><td>{text}</td></tr>')
            elif kind == 'added':
                en_rows.append(f'<tr class="added"{cn_attrs(item["new_line"])}><td class="lineno">{item["new_line"] or ""}</td><td>{text}</td></tr>')
            elif kind == 'context':
                en_rows.append(f'<tr class="context"{cn_attrs(item["new_line"])}><td class="lineno">{item["new_line"]}</td><td>{text}</td></tr>')
            else:
                en_rows.append(f'<tr class="other"><td class="lineno"></td><td>{text}</td></tr>')
    else:
        # New page or no diff: show the banner (when applicable) and the
        # full new content with line numbers.
        if en_diff and en_diff.startswith("新创建页面"):
            en_rows.append('<tr class="hunk"><td class="lineno"></td><td>新创建页面</td></tr>')
        for i, line in enumerate(en_new_lines or [], 1):
            en_rows.append(f'<tr class="context"{cn_attrs(i)}><td class="lineno">{i}</td><td>{escape(line)}</td></tr>')

    if cn_content:
        cn_rows = ''.join(
            f'<tr><td class="lineno">{i}</td><td>{escape(line)}</td></tr>'
            for i, line in enumerate(cn_lines, 1)
        )
        cn_panel = f'<table>{cn_rows}</table>'
    else:
        cn_panel = '<p class="notice">未找到对应的中文翻译页面</p>'

    en_table = ''.join(en_rows)
    return f'''<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Wiki Diff: {escape(title)}</title>
<style>
body {{ margin: 0; font-family: monospace; }}
.panel {{ width: 50%; float: left; box-sizing: border-box; overflow-x: auto; }}
h2 {{ font-family: sans-serif; margin: 0; padding: 6px 10px; background: #f0f0f0; }}
table {{ border-collapse: collapse; width: 100%; }}
td {{ padding: 1px 6px; white-space: pre-wrap; vertical-align: top; }}
td.lineno {{ width: 3.5em; text-align: right; color: #999; user-select: none; }}
tr.hunk td {{ background: #e8f0fe; color: #1a56b0; }}
tr.added td {{ background: #e6ffec; }}
tr.removed td {{ background: #ffebe9; }}
.notice {{ padding: 10px; color: #777; }}
</style>
</head>
<body>
<div class="panel">
<h2>English Diff</h2>
<table>{en_table}</table>
</div>
<div class="panel">
<h2>中文翻译</h2>
{cn_panel}
</div>
</body>
</html>
'''
def save_files(title, diff_html, diff_text, full_text, timestamp, note="", revid=None, cn_content=None, old_full_text=None):
    """Write all artifacts for one page into this run's output directory.

    Artifacts (each written only when its content is present):
      1. ``.diff.html``       official MediaWiki-rendered diff
      2. ``.diff.txt``        unified text diff
      3. ``.full.txt``        latest full wikitext
      4. ``.old.txt``         previous revision's wikitext
      5. ``.cn.txt``          Chinese translation wikitext
      6. ``.comparison.html`` bilingual comparison page (requires cn_content)

    The run directory (a timestamped subdir of OUTPUT_DIR) is created
    lazily on first call. *note* is accepted for interface compatibility
    and currently unused.
    """
    global CURRENT_OUTPUT_DIR
    if CURRENT_OUTPUT_DIR is None:
        run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        CURRENT_OUTPUT_DIR = OUTPUT_DIR / run_stamp
        CURRENT_OUTPUT_DIR.mkdir(exist_ok=True)
        print(f"创建本次执行的输出目录: {CURRENT_OUTPUT_DIR}")

    # Sanitize the title for use in filenames; compact the ISO timestamp.
    safe_title = "".join(c if c.isalnum() or c in " -_." else "_" for c in title)
    time_str = timestamp[:19].replace("-", "").replace(":", "").replace("T", "_")
    base_filename = f"{safe_title}-{time_str}-{revid}" if revid else f"{safe_title}-{time_str}"

    files_to_save = []
    if diff_html:
        files_to_save.append((CURRENT_OUTPUT_DIR / f"{base_filename}.diff.html", diff_html))
    if diff_text:
        files_to_save.append((CURRENT_OUTPUT_DIR / f"{base_filename}.diff.txt", diff_text))
    if full_text:
        files_to_save.append((CURRENT_OUTPUT_DIR / f"{base_filename}.full.txt", full_text))
    if old_full_text:
        files_to_save.append((CURRENT_OUTPUT_DIR / f"{base_filename}.old.txt", old_full_text))
    if cn_content:
        files_to_save.append((CURRENT_OUTPUT_DIR / f"{base_filename}.cn.txt", cn_content))
        # Bilingual comparison page. (Bug fix: the original printed "已保存"
        # for this file before it was written, and then again in the write
        # loop below — the loop is now the single source of that log line.)
        en_new_lines = full_text.splitlines() if full_text else []
        en_old_lines = old_full_text.splitlines() if old_full_text else []
        comparison_html = create_diff_html(title, diff_text, en_old_lines, en_new_lines, cn_content)
        files_to_save.append((CURRENT_OUTPUT_DIR / f"{base_filename}.comparison.html", comparison_html))

    for file_path, content in files_to_save:
        try:
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(content)
            print(f" → 已保存: {file_path.relative_to(OUTPUT_DIR)}")
        except Exception as e:
            print(f" → 保存文件 {file_path} 时出错: {e}")
def process_single_page(title, since_time, update_timestamp=False):
    """Sync a single English wiki page.

    Fetches the latest revision and the last revision at/before
    *since_time*, builds a text diff and the official server-rendered diff,
    looks up the Chinese translation, and writes all artifacts via
    save_files(). When *update_timestamp* is True, advances the global
    checkpoint to this page's latest revision timestamp.

    Returns the latest revision timestamp, or None if the page is missing
    or an unexpected error occurred.
    """
    print(f"正在单独处理页面:{title}")
    try:
        latest_content, latest_ts, latest_revid = get_page_content(WIKI_API_URL_EN, SESSION_EN, title)
        if latest_content is None:
            print("页面不存在或被删除")
            return None

        old_revid = get_old_revid(title, since_time)
        diff_html = None
        diff_text = None
        old_content = None
        cn_content = None

        if old_revid:
            # Historical content for the text diff (timestamp/revid unused).
            old_content, _, _ = get_page_content(WIKI_API_URL_EN, SESSION_EN, title, old_revid)
            if old_content is not None:
                diff_text = generate_text_diff(old_content, latest_content)
                print(f" 生成了文本diff ({len(diff_text)} 字符)")
            else:
                print(" 无法获取历史版本内容")
        else:
            print(" 这是新创建的页面")

        # Look up the matching Chinese translation page.
        print(" 搜索中文翻译...")
        cn_title = search_chinese_page(title)
        if cn_title:
            print(f" 找到中文页面: {cn_title}")
            cn_content, _, _ = get_page_content(WIKI_API_URL_CN, SESSION_CN, cn_title)
            if cn_content:
                print(f" 获取中文内容成功 ({len(cn_content)} 字符)")
            else:
                print(" 无法获取中文页面内容")
        else:
            print(" 未找到对应的中文翻译页面")

        # Optional: the server-rendered HTML diff via action=compare.
        if old_revid:
            diff_params = {
                "action": "compare",
                "fromrev": old_revid,
                "torev": latest_revid,
                "format": "json",
            }
            try:
                # timeout added, consistent with the other API calls.
                diff_resp = SESSION_EN.get(WIKI_API_URL_EN, params=diff_params, timeout=30).json()
                diff_html = diff_resp.get("compare", {}).get("*", "")
            except Exception as e:
                print(f" 获取官方HTML diff时出错: {e}")

        save_files(title, diff_html, diff_text, latest_content, latest_ts, "", latest_revid, cn_content, old_content)

        if update_timestamp:
            save_last_timestamp(latest_ts)
            print(f"已更新全局时间戳 → {latest_ts}")
        return latest_ts
    except Exception as e:
        print(f"处理页面 '{title}' 时出错: {e}")
        return None
def process_all_pages_since(since_time):
    """Sync every page changed since *since_time* and advance the checkpoint.

    Reuses process_single_page() for each title, tracks the newest page
    timestamp seen, and persists it as the next run's starting point.
    """
    print("正在获取最近变更列表...")
    changes = get_recent_changes(since_time)
    if not changes:
        print("没有发现任何变更")
        return
    latest_global_ts = since_time
    # Only the titles are needed here; process_single_page re-resolves
    # revids itself (the revid/timestamp values in `changes` go unused).
    for title in changes:
        print(f"\n处理:{title}")
        page_latest_ts = process_single_page(title, since_time)
        # ISO-8601 "Z" timestamps order correctly under string comparison.
        if page_latest_ts and page_latest_ts > latest_global_ts:
            latest_global_ts = page_latest_ts
    save_last_timestamp(latest_global_ts)
    print(f"\n全量同步完成!本次最新时间戳已更新为:{latest_global_ts}")
    print(f"文件保存在:{CURRENT_OUTPUT_DIR.resolve() if CURRENT_OUTPUT_DIR else OUTPUT_DIR.resolve()}")
def main():
    """CLI entry point: parse arguments, resolve the sync start time, and
    dispatch to single-page or full-sync mode."""
    parser = argparse.ArgumentParser(description="MediaWiki 同步工具 - 增强版支持双语对比")
    parser.add_argument("--since", type=str, help="强制从指定时间开始同步,格式如 2025-11-28T00:00:00Z")
    parser.add_argument("--title", type=str, help="只同步指定的单个页面标题")
    parser.add_argument("--update-timestamp", action="store_true",
                        help="在单页模式下,完成后仍然更新全局 last_sync_timestamp.txt")
    parser.add_argument("--run", action="store_true",
                        help="执行同步操作(必须提供此参数才能真正执行同步)")
    args = parser.parse_args()

    # Safety latch: without --run, show help and do nothing.
    if not args.run:
        parser.print_help()
        return

    # Effective start time: CLI flag > saved checkpoint > 24 hours ago.
    if args.since:
        since_time = args.since
        print(f"使用命令行指定的时间起点:{since_time}")
    else:
        since_time = load_last_timestamp()
        if not since_time:
            # datetime.utcnow() is deprecated; build the same naive-UTC
            # "...Z" string from an aware timestamp instead.
            since_time = (datetime.now(timezone.utc) - timedelta(days=1)).replace(tzinfo=None).isoformat(timespec='seconds') + "Z"
        print(f"使用上次记录的时间起点:{since_time}")

    # Single-page mode.
    if args.title:
        process_single_page(args.title.strip(), since_time, args.update_timestamp)
        return

    # Full-sync mode.
    process_all_pages_since(since_time)


if __name__ == "__main__":
    main()