sync-pd2-wiki/sync.py

1122 lines
38 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
"""
MediaWiki 最近变更同步工具 - 增强版
支持:
1. 正常全量同步(无参数)
2. 手动指定时间起点:--since 2025-11-28T00:00:00Z
3. 只同步单个页面:--title "页面名称"
4. 单个页面时可选更新全局时间戳:--update-timestamp
5. 获取历史版本并生成diff
6. 同步中文翻译版本
7. 生成双语对比网页
"""
import argparse
import difflib
import json
import os
import re
from datetime import datetime, timedelta, timezone
from pathlib import Path
from urllib.parse import quote

import requests
from dotenv import load_dotenv
# ==================== Configuration ====================
load_dotenv()
# API endpoints; overridable via environment variables
WIKI_API_URL_EN = os.getenv("WIKI_API_URL_EN", "https://wiki.projectdiablo2.com/w/api.php")
WIKI_API_URL_CN = os.getenv("WIKI_API_URL_CN", "https://wiki.projectdiablo2.cn/w/api.php")
OUTPUT_DIR = Path("wiki_sync_output")
OUTPUT_DIR.mkdir(exist_ok=True)
# Global: per-run output directory, created lazily by save_files()
CURRENT_OUTPUT_DIR = None
LAST_TIMESTAMP_FILE = "last_sync_timestamp.txt"
# Separate sessions for the English and Chinese wikis
SESSION_EN = requests.Session()
SESSION_EN.headers.update({
    "User-Agent": "WikiSyncTool/4.0 (your-email@example.com; MediaWiki Sync Bot)"
})
SESSION_CN = requests.Session()
SESSION_CN.headers.update({
    "User-Agent": "WikiSyncTool/4.0 (your-email@example.com; MediaWiki Sync Bot)"
})
# ================================================
def load_last_timestamp():
    """Return the timestamp recorded by the previous sync, or None if absent."""
    ts_path = Path(LAST_TIMESTAMP_FILE)
    if not ts_path.exists():
        return None
    return ts_path.read_text(encoding="utf-8").strip()
def save_last_timestamp(ts):
    """Persist *ts* as the last successfully synced timestamp."""
    Path(LAST_TIMESTAMP_FILE).write_text(ts, encoding="utf-8")
def get_recent_changes(since):
    """Fetch the newest revid of every page changed since *since* (deduplicated).

    Follows MediaWiki API continuation until exhausted. On any error the
    partial result collected so far is returned.

    Args:
        since: ISO-8601 timestamp string (e.g. "2025-11-28T00:00:00Z").

    Returns:
        dict mapping page title -> (revid, timestamp) of its newest change.
    """
    params = {
        "action": "query",
        "list": "recentchanges",
        "rcprop": "title|ids|timestamp",
        "rctype": "edit|new",
        "rcdir": "newer",
        "rcstart": since,
        "rclimit": 500,
        "format": "json"
    }
    latest = {}
    while True:
        try:
            # timeout prevents an indefinite hang on a stalled connection
            r = SESSION_EN.get(WIKI_API_URL_EN, params=params, timeout=30)
            r.raise_for_status()
            response_data = r.json()
            if "error" in response_data:
                raise Exception(response_data["error"])
            for rc in response_data.get("query", {}).get("recentchanges", []):
                # later batches overwrite earlier ones: keeps the newest revid per title
                latest[rc["title"]] = (rc["revid"], rc["timestamp"])
            if "continue" not in response_data:
                break
            params.update(response_data["continue"])
        except Exception as e:
            print(f"获取最近更改时出错: {e}")
            break
    return latest
def get_old_revid(title, end_time):
    """Return the revid of the last revision at or before *end_time*.

    Used as the `fromrev` side of a compare. Returns None when the page has
    no revision before that time or on any API error.
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "ids|timestamp",
        "rvlimit": 1,
        "rvdir": "older",       # walk backwards in time from rvstart
        "rvstart": end_time,
        "format": "json"
    }
    try:
        # timeout guards against an indefinite hang on a stalled connection
        r = SESSION_EN.get(WIKI_API_URL_EN, params=params, timeout=30).json()
        pages = r["query"]["pages"]
        page = next(iter(pages.values()))
        revisions = page.get("revisions")
        if not revisions:
            print(f" 页面 '{title}' 在指定时间前没有找到修订版本")
            return None
        return revisions[0]["revid"]
    except Exception as e:
        print(f"获取旧版本ID时出错: {e}")
        return None
def get_page_content(wiki_url, session, title, revid=None):
    """Fetch a page's full wikitext, optionally for one specific revision.

    Args:
        wiki_url: API endpoint to query.
        session: requests.Session carrying the User-Agent headers.
        title: page title.
        revid: optional revision id; when given, exactly that revision is
            returned instead of the latest one.

    Returns:
        (content, timestamp, revid) tuple, or (None, None, None) when the
        page/revision does not exist or the request fails.
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "content|timestamp|ids",
        "rvslots": "main",
        "format": "json"
    }
    if revid:
        # pin both ends of the revision range so exactly one revision comes back
        params["rvstartid"] = revid
        params["rvendid"] = revid
    try:
        # timeout prevents an indefinite hang on a stalled connection
        r = session.get(wiki_url, params=params, timeout=30).json()
        pages = r["query"]["pages"]
        page = next(iter(pages.values()))
        if "revisions" not in page:
            return None, None, None
        rev = page["revisions"][0]
        content = rev["slots"]["main"]["*"]
        return content, rev["timestamp"], rev["revid"]
    except Exception as e:
        print(f"获取页面内容时出错: {e}")
        return None, None, None
def generate_text_diff(old_text, new_text):
    """Produce a git-style unified diff between two versions of a page.

    Returns the literal marker string "新创建页面" when there is no old text
    (i.e. the page is brand new).
    """
    if not old_text:
        return "新创建页面"
    diff_iter = difflib.unified_diff(
        old_text.splitlines(keepends=True),
        new_text.splitlines(keepends=True),
        lineterm='\n',
    )
    return ''.join(diff_iter)
def parse_diff_with_line_numbers(diff_text):
    """Parse unified-diff text into entries annotated with old/new line numbers.

    Each entry is a dict with keys 'type' ('hunk' | 'removed' | 'added' |
    'context' | 'other'), 'content' (prefix character stripped), and
    'old_line'/'new_line' (None when not applicable). Hunk entries also carry
    'old_start'/'old_count'/'new_start'/'new_count'. File header lines
    ('---' / '+++') are skipped.

    Returns [] for empty input or the new-page marker ("新创建页面").
    """
    if not diff_text or diff_text.startswith("新创建页面"):
        return []
    # Hunk header, e.g. "@@ -12,4 +13,6 @@"; counts default to 1 when omitted.
    # Compiled once here instead of re-importing `re` and rebuilding the
    # pattern on every iteration as before.
    hunk_re = re.compile(r'@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@')
    parsed_lines = []
    current_old_line = 0
    current_new_line = 0
    in_hunk = False
    for line in diff_text.splitlines():
        if line.startswith('@@'):
            match = hunk_re.match(line)
            if match:
                old_start = int(match.group(1))
                old_count = int(match.group(2)) if match.group(2) else 1
                new_start = int(match.group(3))
                new_count = int(match.group(4)) if match.group(4) else 1
                current_old_line = old_start
                current_new_line = new_start
                in_hunk = True
                parsed_lines.append({
                    'type': 'hunk',
                    'content': line,
                    'old_start': old_start,
                    'old_count': old_count,
                    'new_start': new_start,
                    'new_count': new_count,
                    'old_line': None,
                    'new_line': None
                })
            else:
                parsed_lines.append({
                    'type': 'other',
                    'content': line,
                    'old_line': None,
                    'new_line': None
                })
        elif line.startswith('---') or line.startswith('+++'):
            # file header lines carry no line-number information
            continue
        elif in_hunk:
            if line.startswith('-'):
                # removed line: exists only on the old side
                parsed_lines.append({
                    'type': 'removed',
                    'content': line[1:],
                    'old_line': current_old_line,
                    'new_line': None
                })
                current_old_line += 1
            elif line.startswith('+'):
                # added line: exists only on the new side
                parsed_lines.append({
                    'type': 'added',
                    'content': line[1:],
                    'old_line': None,
                    'new_line': current_new_line
                })
                current_new_line += 1
            elif line.startswith(' '):
                # context line: present on both sides
                parsed_lines.append({
                    'type': 'context',
                    'content': line[1:],
                    'old_line': current_old_line,
                    'new_line': current_new_line
                })
                current_old_line += 1
                current_new_line += 1
            else:
                # e.g. completely empty lines inside a hunk
                parsed_lines.append({
                    'type': 'other',
                    'content': line,
                    'old_line': None,
                    'new_line': None
                })
        else:
            # lines appearing before the first hunk
            parsed_lines.append({
                'type': 'other',
                'content': line,
                'old_line': None,
                'new_line': None
            })
    return parsed_lines
def search_chinese_page(title):
    """Search the Chinese wiki for the page corresponding to *title*.

    Tries an exact quoted title search first, then falls back to a plain
    search. Returns the best-matching Chinese page title, or None when
    nothing is found or the request fails.
    """
    # First attempt: exact (quoted) match
    params = {
        "action": "query",
        "list": "search",
        "srsearch": f'"{title}"',
        "srwhat": "title",
        "srlimit": 5,
        "format": "json"
    }
    try:
        r = SESSION_CN.get(WIKI_API_URL_CN, params=params, timeout=30).json()
        search_results = r.get("query", {}).get("search", [])
        if search_results:
            # return the first match
            return search_results[0]["title"]
        # Fallback: unquoted fuzzy search. BUG FIX: requests URL-encodes
        # parameter values itself, so we pass the raw title — the previous
        # manual `title.replace(" ", "%20")` made the wiki search for the
        # literal text "%20".
        params["srsearch"] = title
        r = SESSION_CN.get(WIKI_API_URL_CN, params=params, timeout=30).json()
        search_results = r.get("query", {}).get("search", [])
        if search_results:
            return search_results[0]["title"]
    except Exception as e:
        print(f"搜索中文页面时出错: {e}")
    return None
def create_diff_html(title, en_diff, en_old_lines, en_new_lines, cn_content=None):
    """Build the bilingual comparison HTML page (Word-annotation style).

    English changes are shown as annotations to the right of the matching
    Chinese line; net additions get blank placeholder rows on the Chinese
    side. NOTE(review): line numbers from the English diff are applied
    directly to the Chinese text — this assumes both texts line up roughly
    line-for-line; confirm against real pages.
    """
    # Chinese translation split into lines (empty when no translation exists)
    cn_lines = []
    if cn_content:
        cn_lines = cn_content.splitlines()
    # Parse the diff to get per-line change information
    parsed_diff = parse_diff_with_line_numbers(en_diff) if en_diff else []
    # Map Chinese line number -> list of change dicts for that line
    en_changes_by_line = {}
    blank_lines_to_insert = {}  # line number -> added-content strings shown as placeholder rows before it
    if parsed_diff:
        i = 0
        while i < len(parsed_diff):
            # Collect one consecutive run of diff operations
            diff_block = []
            start_index = i
            # Gather consecutive added/removed items; a hunk/header/context
            # entry terminates a non-empty run
            while i < len(parsed_diff):
                item = parsed_diff[i]
                if item['type'] in ['added', 'removed']:
                    diff_block.append(item)
                elif item['type'] in ['hunk', 'header'] or item['type'] == 'context':
                    if diff_block:  # a run has been collected — stop here
                        break
                i += 1
            # Process the collected run
            if diff_block:
                # Net line balance: +1 per addition, -1 per removal
                line_balance = 0
                for item in diff_block:
                    if item['type'] == 'added':
                        line_balance += 1
                    elif item['type'] == 'removed':
                        line_balance -= 1
                # Positive balance: the run grows the text, so blank
                # placeholder rows are needed on the Chinese side
                if line_balance > 0:
                    # Anchor line: the first operation's line number
                    base_line = None
                    for item in diff_block:
                        if item['old_line']:  # prefer the removed line's number
                            base_line = item['old_line']
                            break
                        elif item['new_line'] and base_line is None:
                            base_line = item['new_line']
                    if base_line:
                        # Collect the added content destined for placeholder rows
                        additions_for_blank_lines = []
                        remaining_additions = []
                        for item in diff_block:
                            if item['type'] == 'added':
                                additions_for_blank_lines.append(item['content'])
                        # Remember the placeholder rows to insert before base_line
                        blank_lines_to_insert[base_line] = additions_for_blank_lines
                # Walk the run and classify each operation
                j = 0
                while j < len(diff_block):
                    item = diff_block[j]
                    # A removal immediately followed by an addition is treated
                    # as a same-line replacement
                    if (item['type'] == 'removed' and j + 1 < len(diff_block) and
                            diff_block[j + 1]['type'] == 'added'):
                        next_item = diff_block[j + 1]
                        # Anchor the replacement on the removed line's number
                        target_line = item['old_line']
                        if target_line not in en_changes_by_line:
                            en_changes_by_line[target_line] = []
                        en_changes_by_line[target_line].append({
                            'type': 'replaced',
                            'old_content': item['content'],
                            'new_content': next_item['content']
                        })
                        j += 2  # the paired addition was consumed too
                    # Plain addition (unless it already went to a placeholder row)
                    elif item['type'] == 'added' and item['new_line']:
                        # Skip additions already assigned to placeholder rows
                        if line_balance > 0 and item['content'] in blank_lines_to_insert.get(base_line, []):
                            j += 1
                            continue
                        if item['new_line'] not in en_changes_by_line:
                            en_changes_by_line[item['new_line']] = []
                        en_changes_by_line[item['new_line']].append({
                            'type': 'added',
                            'content': item['content']
                        })
                        j += 1
                    # Plain removal (no paired addition)
                    elif item['type'] == 'removed' and item['old_line']:
                        if item['old_line'] not in en_changes_by_line:
                            en_changes_by_line[item['old_line']] = []
                        en_changes_by_line[item['old_line']].append({
                            'type': 'removed',
                            'content': item['content']
                        })
                        j += 1
                    else:
                        j += 1
            else:
                i += 1
    # Minimal HTML-escaping helper
    def html_escape(text):
        if not text:
            return ""
        return (str(text)
                .replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
                .replace('"', "&quot;")
                .replace("'", "&#39;"))
    def generate_inline_diff(old_text, new_text):
        """Generate a GitHub-style inline character-level diff."""
        if not old_text or not new_text:
            return html_escape(new_text or "")
        escaped_old = html_escape(old_text)
        escaped_new = html_escape(new_text)
        # Character-level comparison via difflib
        differ = difflib.SequenceMatcher(None, escaped_old, escaped_new)
        result = []
        for tag, i1, i2, j1, j2 in differ.get_opcodes():
            if tag == 'equal':
                # unchanged segment
                result.append(escaped_new[j1:j2])
            elif tag == 'replace':
                # replaced segment: old shown red, new shown green
                deleted = escaped_old[i1:i2]
                added = escaped_new[j1:j2]
                result.append(f'<span class="diff-char-removed">{deleted}</span>')
                result.append(f'<span class="diff-char-added">{added}</span>')
            elif tag == 'delete':
                # removed segment (red background)
                deleted = escaped_old[i1:i2]
                result.append(f'<span class="diff-char-removed">{deleted}</span>')
            elif tag == 'insert':
                # added segment (green background)
                added = escaped_new[j1:j2]
                result.append(f'<span class="diff-char-added">{added}</span>')
        return ''.join(result)
    def generate_clean_new_content(old_text, new_text):
        """Render only the new content with insertions highlighted; deletions omitted."""
        if not old_text or not new_text:
            return html_escape(new_text or "")
        escaped_old = html_escape(old_text)
        escaped_new = html_escape(new_text)
        # Character-level comparison via difflib
        differ = difflib.SequenceMatcher(None, escaped_old, escaped_new)
        result = []
        for tag, i1, i2, j1, j2 in differ.get_opcodes():
            if tag == 'equal':
                # unchanged segment
                result.append(escaped_new[j1:j2])
            elif tag == 'replace':
                # show only the new side of a replacement
                added = escaped_new[j1:j2]
                result.append(f'<span class="diff-char-added">{added}</span>')
            elif tag == 'delete':
                # deleted text is dropped entirely
                continue
            elif tag == 'insert':
                # added segment (green background)
                added = escaped_new[j1:j2]
                result.append(f'<span class="diff-char-added">{added}</span>')
        return ''.join(result)
    # Document head, CSS and page header
    html = f'''<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Wiki Diff: {title}</title>
<style>
* {{
margin: 0;
padding: 0;
box-sizing: border-box;
}}
body {{
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
background-color: #f5f5f5;
line-height: 1.6;
padding: 20px;
}}
.header {{
background-color: #fff;
padding: 20px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
margin-bottom: 20px;
border-radius: 8px;
}}
.header h1 {{
color: #333;
font-size: 24px;
margin-bottom: 10px;
}}
.header .meta {{
color: #666;
font-size: 14px;
}}
.content-container {{
max-width: 1200px;
margin: 0 auto;
background-color: #fff;
border-radius: 8px;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
overflow: hidden;
}}
.content-header {{
background-color: #e9ecef;
padding: 15px 20px;
font-weight: bold;
color: #495057;
border-bottom: 1px solid #dee2e6;
}}
.diff-content {{
padding: 0;
}}
.line-wrapper {{
display: flex;
border-bottom: 1px solid #f0f0f0;
position: relative;
}}
.line-wrapper:hover {{
background-color: rgba(0, 123, 255, 0.02);
}}
.line-wrapper.has-changes {{
background-color: rgba(255, 193, 7, 0.05);
}}
.main-line {{
display: flex;
flex: 1;
min-height: 24px;
align-items: center;
}}
.line-number {{
width: 60px;
text-align: right;
padding: 8px 12px;
background-color: #f8f9fa;
color: #6c757d;
font-size: 12px;
user-select: none;
flex-shrink: 0;
border-right: 1px solid #e9ecef;
}}
.line-content {{
flex: 1;
padding: 8px 12px;
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
font-size: 13px;
line-height: 1.5;
white-space: pre-wrap;
word-break: break-word;
color: #333;
}}
/* 批注样式 */
.annotation {{
width: 400px;
background-color: #f8f9fa;
border-left: 1px solid #dee2e6;
padding: 10px 14px;
font-size: 14px;
display: none;
}}
.line-wrapper.has-changes .annotation {{
display: block;
}}
.annotation-item {{
margin-bottom: 8px;
padding: 8px 10px;
border-radius: 4px;
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
font-size: 13px;
line-height: 1.5;
}}
.annotation-item:last-child {{
margin-bottom: 0;
}}
.annotation-item.added {{
background-color: #e6ffec;
border-left: 3px solid #28a745;
color: #155724;
}}
.annotation-item.removed {{
background-color: #ffeef0;
border-left: 3px solid #dc3545;
color: #721c24;
text-decoration: line-through;
}}
.annotation-item.replaced {{
margin-bottom: 8px;
}}
.annotation-item.replaced .old-content {{
background-color: #ffeef0;
border-left: 3px solid #dc3545;
color: #721c24;
text-decoration: line-through;
padding: 4px 6px;
border-radius: 3px;
margin-bottom: 4px;
}}
.annotation-item.replaced .new-content {{
background-color: #e6ffec;
border-left: 3px solid #28a745;
color: #155724;
padding: 4px 6px;
border-radius: 3px;
}}
.annotation-header {{
font-size: 11px;
color: #6c757d;
margin-bottom: 6px;
font-weight: bold;
text-transform: uppercase;
letter-spacing: 0.5px;
}}
/* GitHub风格的字符级diff样式 */
.diff-char-added {{
background-color: #acf2bd;
color: #24292f;
border-radius: 2px;
padding: 1px 2px;
}}
.diff-char-removed {{
background-color: #ffd5d5;
color: #24292f;
border-radius: 2px;
padding: 1px 2px;
}}
/* 批注项内的字符级diff样式调整 */
.annotation-item.removed {{
text-decoration: none; /* 移除删除线 */
}}
.annotation-item.replaced .old-content {{
text-decoration: none; /* 移除删除线 */
}}
/* 新页面提示 */
.new-page-notice {{
background-color: #d4edda;
color: #155724;
padding: 15px 20px;
margin: 15px;
border-radius: 4px;
border-left: 4px solid #28a745;
}}
.no-translation {{
background-color: #fff3cd;
color: #856404;
padding: 15px 20px;
margin: 15px;
border-radius: 4px;
border-left: 4px solid #ffc107;
}}
/* 响应式设计 */
@media (max-width: 1024px) {{
.annotation {{
width: 300px;
}}
}}
@media (max-width: 768px) {{
body {{
padding: 10px;
}}
.annotation {{
width: 100%;
display: block !important;
border-left: none;
border-top: 1px solid #dee2e6;
}}
.line-wrapper {{
flex-direction: column;
}}
.main-line {{
border-bottom: none;
}}
}}
/* 高亮效果 */
.line-wrapper.highlight {{
background-color: rgba(255, 235, 59, 0.3) !important;
animation: highlight 2s ease-in-out;
}}
@keyframes highlight {{
0% {{ background-color: rgba(255, 235, 59, 0.6); }}
100% {{ background-color: rgba(255, 235, 59, 0.3); }}
}}
/* 空行样式 */
.line-wrapper.empty-line .line-content {{
min-height: 24px;
color: #999;
font-style: italic;
}}
/* 空白占位行样式 */
.line-wrapper.blank-placeholder {{
background-color: #fafafa;
border-bottom: 1px solid #e9ecef;
display: flex;
}}
.line-wrapper.blank-placeholder .main-line {{
min-height: 24px;
flex: 1;
display: flex;
}}
.line-wrapper.blank-placeholder .line-number {{
color: #dee2e6;
}}
.line-wrapper.blank-placeholder .line-content {{
color: #dee2e6;
font-style: italic;
min-height: 24px;
display: flex;
align-items: center;
}}
/* 空白占位行的批注样式 */
.line-wrapper.blank-placeholder .annotation {{
width: 400px;
background-color: #f8f9fa;
border-left: 1px solid #dee2e6;
padding: 10px 14px;
font-size: 14px;
display: block;
}}
.line-wrapper.blank-placeholder .annotation .annotation-item {{
margin-bottom: 8px;
padding: 8px 10px;
border-radius: 4px;
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
font-size: 13px;
line-height: 1.5;
}}
.line-wrapper.blank-placeholder .annotation .annotation-item:last-child {{
margin-bottom: 0;
}}
.line-wrapper.blank-placeholder .annotation .annotation-item.added {{
background-color: #e6ffec;
border-left: 3px solid #28a745;
color: #155724;
}}
.line-wrapper.blank-placeholder .annotation .annotation-header {{
font-size: 10px;
color: #6c757d;
margin-bottom: 4px;
font-weight: bold;
text-transform: uppercase;
letter-spacing: 0.5px;
}}
.line-wrapper.blank-placeholder:hover {{
background-color: #f8f9fa;
}}
.line-wrapper.blank-placeholder:hover .main-line {{
background-color: rgba(0, 123, 255, 0.02);
}}
</style>
</head>
<body>
<div class="header">
<h1>{title}</h1>
<div class="meta">
<span>英文Wiki: wiki.projectdiablo2.com</span>
{f' | 中文Wiki: wiki.projectdiablo2.cn' if cn_content else ''}
</div>
</div>
<div class="content-container">
<div class="content-header">中文翻译(含英文变更批注)</div>
<div class="diff-content">
'''
    # Emit each Chinese line together with its English-change annotations
    if cn_content:
        for i, line in enumerate(cn_lines, 1):
            # Insert any blank placeholder rows scheduled before this line
            if i in blank_lines_to_insert:
                additions_list = blank_lines_to_insert[i]
                for addition_content in additions_list:
                    html += f'<div class="line-wrapper blank-placeholder">'
                    html += '<div class="main-line">'
                    html += '<span class="line-number">&nbsp;</span>'  # no line number shown
                    html += f'<span class="line-content">(新增英文内容占位)</span>'
                    html += '</div>'
                    # Attach the added-content annotation to the placeholder row
                    escaped_addition = html_escape(addition_content)
                    html += '<div class="annotation">'
                    html += f'<div class="annotation-item added">'
                    html += f'<div class="annotation-header">新增</div>'
                    html += f'<div>{escaped_addition}</div>'
                    html += '</div>'
                    html += '</div>'
                    html += '</div>'
            escaped_line = html_escape(line)
            has_changes = i in en_changes_by_line
            changes = en_changes_by_line.get(i, [])
            # blank-line detection (rendered with a placeholder label)
            is_empty = not line.strip()
            html += f'<div class="line-wrapper {"has-changes" if has_changes else ""} {"empty-line" if is_empty else ""}">'
            html += f'<div class="main-line">'
            html += f'<span class="line-number">{i}</span>'
            html += f'<span class="line-content">{escaped_line if not is_empty else "(空行)"}</span>'
            html += '</div>'
            # Annotations: only replacements and removals here — additions
            # were already rendered on placeholder rows above
            if has_changes:
                html += '<div class="annotation">'
                for change in changes:
                    if change['type'] == 'added':
                        # already shown on a placeholder row
                        continue
                    elif change['type'] == 'removed':
                        escaped_change = html_escape(change['content'])
                        html += f'<div class="annotation-item removed">'
                        html += f'<div class="annotation-header">删除</div>'
                        html += f'<div>{escaped_change}</div>'
                        html += '</div>'
                    elif change['type'] == 'replaced':
                        # new content rendered clean (insert highlights only, no deletions)
                        clean_new_content = generate_clean_new_content(change['old_content'], change['new_content'])
                        html += f'<div class="annotation-item replaced">'
                        html += f'<div class="annotation-header">替换</div>'
                        html += f'<div class="old-content">{html_escape(change["old_content"])}</div>'
                        html += f'<div class="new-content">{clean_new_content}</div>'
                        html += '</div>'
                html += '</div>'
            html += '</div>'
    else:
        html += '<div class="no-translation">未找到对应的中文翻译页面</div>'
    # Closing markup plus the click-to-highlight script
    html += '''
</div>
</div>
<script>
// 点击有变更的行时高亮
document.querySelectorAll('.line-wrapper.has-changes').forEach(lineWrapper => {{
lineWrapper.addEventListener('click', () => {{
// 移除所有高亮
document.querySelectorAll('.line-wrapper.highlight').forEach(line => {{
line.classList.remove('highlight');
}});
// 高亮当前行
lineWrapper.classList.add('highlight');
}});
}});
</script>
</body>
</html>'''
    return html
def save_files(title, diff_html, diff_text, full_text, timestamp, note="", revid=None, cn_content=None, old_full_text=None):
    """Write every artifact for one synced page into this run's output dir.

    Artifacts (each written only when its content is present): official
    HTML diff, text diff, latest full text, old revision text, Chinese
    translation, and the bilingual comparison page.

    Side effect: lazily creates and caches the per-run output directory in
    the module-global CURRENT_OUTPUT_DIR.
    """
    global CURRENT_OUTPUT_DIR
    # Create the per-run output directory on first use
    if CURRENT_OUTPUT_DIR is None:
        current_time_str = datetime.now().strftime("%Y%m%d_%H%M%S")
        CURRENT_OUTPUT_DIR = OUTPUT_DIR / current_time_str
        CURRENT_OUTPUT_DIR.mkdir(exist_ok=True)
        print(f"创建本次执行的输出目录: {CURRENT_OUTPUT_DIR}")
    # Sanitize the title so it is safe as a filename
    safe_title = "".join(c if c.isalnum() or c in " -_." else "_" for c in title)
    time_str = timestamp[:19].replace("-", "").replace(":", "").replace("T", "_")
    base_filename = f"{safe_title}-{time_str}-{revid}" if revid else f"{safe_title}-{time_str}"
    files_to_save = []
    # 1. official MediaWiki diff HTML
    diff_file = CURRENT_OUTPUT_DIR / f"{base_filename}.diff.html"
    if diff_html:
        files_to_save.append((diff_file, diff_html))
    # 2. text-format diff
    text_diff_file = CURRENT_OUTPUT_DIR / f"{base_filename}.diff.txt"
    if diff_text:
        files_to_save.append((text_diff_file, diff_text))
    # 3. latest full content
    full_file = CURRENT_OUTPUT_DIR / f"{base_filename}.full.txt"
    if full_text:
        files_to_save.append((full_file, full_text))
    # 4. old revision content (if any)
    if old_full_text:
        old_full_file = CURRENT_OUTPUT_DIR / f"{base_filename}.old.txt"
        files_to_save.append((old_full_file, old_full_text))
    # 5. Chinese translation (if any)
    if cn_content:
        cn_file = CURRENT_OUTPUT_DIR / f"{base_filename}.cn.txt"
        files_to_save.append((cn_file, cn_content))
    # 6. bilingual comparison HTML page
    if cn_content:
        en_new_lines = full_text.splitlines() if full_text else []
        en_old_lines = old_full_text.splitlines() if old_full_text else []
        comparison_html = create_diff_html(title, diff_text, en_old_lines, en_new_lines, cn_content)
        comparison_file = CURRENT_OUTPUT_DIR / f"{base_filename}.comparison.html"
        files_to_save.append((comparison_file, comparison_html))
        # BUG FIX: the old code printed "已保存" for the comparison page here,
        # before anything was written (and the write loop printed it again).
        # The write loop below is the single source of save reports.
    # Write everything out
    for file_path, content in files_to_save:
        try:
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(content)
            print(f" → 已保存: {file_path.relative_to(OUTPUT_DIR)}")
        except Exception as e:
            print(f" → 保存文件 {file_path} 时出错: {e}")
def process_single_page(title, since_time, update_timestamp=False):
    """Sync one page: fetch latest + historical revisions, diff, translate, save.

    Returns the latest revision timestamp on success, or None when the page
    is missing or an error occurred. When update_timestamp is True the
    global last-sync timestamp file is advanced to that timestamp.
    """
    print(f"正在单独处理页面:{title}")
    # Fetch the current latest revision of the English page
    try:
        latest_content, latest_ts, latest_revid = get_page_content(WIKI_API_URL_EN, SESSION_EN, title)
        if latest_content is None:
            print("页面不存在或被删除")
            return None
        # Last revision at or before since_time (None for a brand-new page)
        old_revid = get_old_revid(title, since_time)
        diff_html = None
        diff_text = None
        old_content = None
        cn_content = None
        if old_revid:
            # Fetch the historical revision and diff it against the latest
            old_content, old_ts, _ = get_page_content(WIKI_API_URL_EN, SESSION_EN, title, old_revid)
            if old_content is not None:
                diff_text = generate_text_diff(old_content, latest_content)
                print(f" 生成了文本diff ({len(diff_text)} 字符)")
            else:
                print(f" 无法获取历史版本内容")
        else:
            # No prior revision: the page is newly created
            print(" 这是新创建的页面")
        # Look up the corresponding Chinese translation
        print(" 搜索中文翻译...")
        cn_title = search_chinese_page(title)
        if cn_title:
            print(f" 找到中文页面: {cn_title}")
            cn_content, cn_ts, cn_revid = get_page_content(WIKI_API_URL_CN, SESSION_CN, cn_title)
            if cn_content:
                print(f" 获取中文内容成功 ({len(cn_content)} 字符)")
            else:
                print(" 无法获取中文页面内容")
        else:
            print(" 未找到对应的中文翻译页面")
        # Official rendered diff from the compare API (best-effort)
        if old_revid:
            diff_params = {
                "action": "compare",
                "fromrev": old_revid,
                "torev": latest_revid,
                "format": "json"
            }
            try:
                diff_resp = SESSION_EN.get(WIKI_API_URL_EN, params=diff_params).json()
                diff_html = diff_resp.get("compare", {}).get("*", "")
            except Exception as e:
                print(f" 获取官方HTML diff时出错: {e}")
        # Persist every artifact for this page
        save_files(title, diff_html, diff_text, latest_content, latest_ts, "", latest_revid, cn_content, old_content)
        if update_timestamp:
            save_last_timestamp(latest_ts)
            print(f"已更新全局时间戳 → {latest_ts}")
        return latest_ts
    except Exception as e:
        print(f"处理页面 '{title}' 时出错: {e}")
        return None
def process_all_pages_since(since_time):
    """Process every page changed since *since_time* and advance the saved timestamp.

    Delegates each page to process_single_page, then writes the newest page
    timestamp seen back to the global last-sync file.
    """
    print("正在获取最近变更列表...")
    changes = get_recent_changes(since_time)
    if not changes:
        print("没有发现任何变更")
        return
    latest_global_ts = since_time
    # The per-page (revid, timestamp) values are unused here, so iterate
    # titles only instead of unpacking and discarding .items().
    for title in changes:
        print(f"\n处理:{title}")
        # Reuse the single-page pipeline
        page_latest_ts = process_single_page(title, since_time)
        # ISO-8601 "...Z" timestamps compare correctly as plain strings
        if page_latest_ts and page_latest_ts > latest_global_ts:
            latest_global_ts = page_latest_ts
    save_last_timestamp(latest_global_ts)
    print(f"\n全量同步完成!本次最新时间戳已更新为:{latest_global_ts}")
    print(f"文件保存在:{CURRENT_OUTPUT_DIR.resolve() if CURRENT_OUTPUT_DIR else OUTPUT_DIR.resolve()}")
def main():
    """CLI entry point: parse arguments and dispatch single-page or full sync."""
    parser = argparse.ArgumentParser(description="MediaWiki 同步工具 - 增强版支持双语对比")
    parser.add_argument("--since", type=str, help="强制从指定时间开始同步,格式如 2025-11-28T00:00:00Z")
    parser.add_argument("--title", type=str, help="只同步指定的单个页面标题")
    parser.add_argument("--update-timestamp", action="store_true",
                        help="在单页模式下,完成后仍然更新全局 last_sync_timestamp.txt")
    parser.add_argument("--run", action="store_true",
                        help="执行同步操作(必须提供此参数才能真正执行同步)")
    args = parser.parse_args()
    # Safety latch: without --run just show usage and exit
    if not args.run:
        parser.print_help()
        return
    # Determine the effective start time
    if args.since:
        since_time = args.since
        print(f"使用命令行指定的时间起点:{since_time}")
    else:
        since_time = load_last_timestamp()
        if not since_time:
            # No recorded timestamp: default to 24 hours ago (UTC).
            # datetime.utcnow() is deprecated; use an aware UTC time and
            # drop the offset so the "...Z" string format stays identical.
            now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
            since_time = (now_utc - timedelta(days=1)).isoformat(timespec='seconds') + "Z"
        print(f"使用上次记录的时间起点:{since_time}")
    # Single-page mode
    if args.title:
        process_single_page(args.title.strip(), since_time, args.update_timestamp)
        return
    # Full-sync mode (reuses the single-page pipeline per title)
    process_all_pages_since(since_time)
# Script entry point
if __name__ == "__main__":
    main()