sync-pd2-wiki/sync.py

1627 lines
57 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
"""
MediaWiki 最近变更同步工具 - 增强版
支持:
1. 正常全量同步(无参数)
2. 手动指定时间起点:--since 2025-11-28T00:00:00Z
3. 只同步单个页面:--title "页面名称"
4. 单个页面时可选更新全局时间戳:--update-timestamp
5. 获取历史版本并生成diff
6. 同步中文翻译版本
7. 生成双语对比网页
"""
import os
import argparse
from pathlib import Path
from datetime import datetime
import requests
from dotenv import load_dotenv
import difflib
import json
import re
from urllib.parse import quote
# ==================== Configuration ====================
# Load environment overrides (python-dotenv) before the os.getenv() lookups below.
load_dotenv()
# MediaWiki API endpoints of the English (source) wiki and the Chinese
# (translation) wiki; each can be overridden through the environment.
WIKI_API_URL_EN = os.getenv("WIKI_API_URL_EN", "https://wiki.projectdiablo2.com/w/api.php")
WIKI_API_URL_CN = os.getenv("WIKI_API_URL_CN", "https://wiki.projectdiablo2.cn/w/api.php")
# Root directory for everything this tool writes; created at import time.
OUTPUT_DIR = Path("wiki_sync_output")
OUTPUT_DIR.mkdir(exist_ok=True)
# Global holding the output directory of the current run; it stays None
# until save_files() creates the per-run directory.
CURRENT_OUTPUT_DIR = None
# File (relative to the working directory) storing the last synced timestamp.
LAST_TIMESTAMP_FILE = "last_sync_timestamp.txt"
# One HTTP session per wiki, both sending the same identifying User-Agent.
SESSION_EN = requests.Session()
SESSION_EN.headers.update({
"User-Agent": "WikiSyncTool/4.0 (your-email@example.com; MediaWiki Sync Bot)"
})
SESSION_CN = requests.Session()
SESSION_CN.headers.update({
"User-Agent": "WikiSyncTool/4.0 (your-email@example.com; MediaWiki Sync Bot)"
})
# ========================================================
def load_last_timestamp():
    """Return the stored sync timestamp, or ``None`` if no timestamp file exists.

    The file content is returned with surrounding whitespace stripped.
    """
    if os.path.exists(LAST_TIMESTAMP_FILE):
        with open(LAST_TIMESTAMP_FILE, encoding="utf-8") as handle:
            stored = handle.read()
        return stored.strip()
    return None
def save_last_timestamp(ts):
    """Overwrite the timestamp file with ``ts`` (written verbatim as UTF-8 text)."""
    handle = open(LAST_TIMESTAMP_FILE, "w", encoding="utf-8")
    with handle:
        handle.write(ts)
def get_recent_changes(since, timeout=30):
    """Return the newest revision of every page edited or created since ``since``.

    Pages through ``list=recentchanges`` on the English wiki in ascending
    time order, so a later entry for the same title overwrites an earlier
    one and each title ends up with its most recent change.

    Args:
        since: Timestamp passed to the API as ``rcstart``.
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a stalled connection would block the sync
            indefinitely.

    Returns:
        dict mapping page title to ``(revid, timestamp)``. If a request
        fails, the error is printed and the entries collected so far are
        returned (best effort).
    """
    params = {
        "action": "query",
        "list": "recentchanges",
        "rcprop": "title|ids|timestamp",
        "rctype": "edit|new",
        "rcdir": "newer",
        "rcstart": since,
        "rclimit": 500,
        "format": "json"
    }
    latest = {}
    while True:
        try:
            r = SESSION_EN.get(WIKI_API_URL_EN, params=params, timeout=timeout)
            r.raise_for_status()
            response_data = r.json()
            if "error" in response_data:
                raise RuntimeError(response_data["error"])
            for rc in response_data.get("query", {}).get("recentchanges", []):
                latest[rc["title"]] = (rc["revid"], rc["timestamp"])
            if "continue" not in response_data:
                break
            # Feed the continuation tokens back into the next request.
            params.update(response_data["continue"])
        except Exception as e:
            # Deliberate best effort: report the problem and keep partial results.
            print(f"获取最近更改时出错: {e}")
            break
    return latest
def get_old_revid(title, end_time, timeout=30):
    """Return the revid of the last revision of ``title`` at or before ``end_time``.

    Asks the English wiki for a single revision, walking backwards in time
    (``rvdir=older``) starting at ``end_time``.

    Args:
        title: Page title on the English wiki.
        end_time: Timestamp passed to the API as ``rvstart``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The revision id, or ``None`` when the page has no revision in that
        range or the request fails (the failure is printed).
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "ids|timestamp",
        "rvlimit": 1,
        "rvdir": "older",
        "rvstart": end_time,
        "format": "json"
    }
    try:
        response = SESSION_EN.get(WIKI_API_URL_EN, params=params, timeout=timeout)
        response.raise_for_status()
        pages = response.json()["query"]["pages"]
        # A single-title query yields exactly one entry in "pages".
        page = next(iter(pages.values()))
        revisions = page.get("revisions")
        if revisions:
            return revisions[0]["revid"]
        print(f" 页面 '{title}' 在指定时间前没有找到修订版本")
        return None
    except Exception as e:
        print(f"获取旧版本ID时出错: {e}")
        return None
def get_page_content(wiki_url, session, title, revid=None, timeout=30):
    """Fetch the wikitext of a page, optionally pinned to one revision.

    Args:
        wiki_url: MediaWiki ``api.php`` endpoint to query.
        session: Object with a requests-style ``get(url, params=..., timeout=...)``.
        title: Page title.
        revid: When truthy, restricts the query to exactly this revision
            (sent as both ``rvstartid`` and ``rvendid``).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        ``(content, timestamp, revid)`` of the first revision returned, or
        ``(None, None, None)`` when the page has no revisions or the request
        fails (the failure is printed).
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "content|timestamp|ids",
        "rvslots": "main",
        "format": "json"
    }
    if revid:
        params["rvstartid"] = revid
        params["rvendid"] = revid
    try:
        response = session.get(wiki_url, params=params, timeout=timeout)
        response.raise_for_status()
        pages = response.json()["query"]["pages"]
        # A single-title query yields exactly one entry in "pages".
        page = next(iter(pages.values()))
        if "revisions" not in page:
            return None, None, None
        rev = page["revisions"][0]
        return rev["slots"]["main"]["*"], rev["timestamp"], rev["revid"]
    except Exception as e:
        print(f"获取页面内容时出错: {e}")
        return None, None, None
def generate_text_diff(old_text, new_text):
    """Return a git-style unified diff between two revisions of a page.

    Args:
        old_text: Previous page text. A falsy value means there is no
            previous revision.
        new_text: Current page text; ``None`` is treated as empty text.

    Returns:
        The marker string ``"新创建页面"`` when ``old_text`` is falsy,
        otherwise the unified diff with every line terminated by ``"\\n"``
        (an empty string when the texts have identical lines).

    Lines are compared without their line endings. Keeping the endings would
    let a final line that lacks a trailing newline run straight into the
    next diff line (``"-a+b"``), which corrupts the output for downstream
    parsing. As a consequence a change that only adds or removes the final
    newline is not reported.
    """
    if not old_text:
        return "新创建页面"
    old_lines = old_text.splitlines()
    new_lines = (new_text or "").splitlines()
    diff_lines = difflib.unified_diff(old_lines, new_lines, lineterm="")
    return "".join(f"{line}\n" for line in diff_lines)
# Hunk header of a unified diff: "@@ -start[,count] +start[,count] @@".
_HUNK_HEADER_PATTERN = r'@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@'


def parse_diff_with_line_numbers(diff_text):
    """Parse a single-file unified diff into entries annotated with line numbers.

    Args:
        diff_text: Unified diff text. An empty value or the new-page marker
            ``"新创建页面"`` yields an empty list.

    Returns:
        A list of dicts, one per diff line, each with ``type``, ``content``,
        ``old_line`` and ``new_line``:

        * ``hunk``    - a hunk header; additionally carries ``old_start``,
          ``old_count``, ``new_start`` and ``new_count`` (a missing count
          defaults to 1).
        * ``removed`` - line only in the old text (``old_line`` set).
        * ``added``   - line only in the new text (``new_line`` set).
        * ``context`` - unchanged line (both numbers set).
        * ``other``   - anything else; carries no line numbers.

        ``content`` has the leading ``-``/``+``/space marker stripped. The
        ``---``/``+++`` file header lines before the first hunk are skipped.
    """
    if not diff_text or diff_text.startswith("新创建页面"):
        return []

    def unnumbered(content):
        # Entry for a line that does not map to a position in either text.
        return {
            'type': 'other',
            'content': content,
            'old_line': None,
            'new_line': None
        }

    parsed_lines = []
    current_old_line = 0
    current_new_line = 0
    in_hunk = False
    for line in diff_text.splitlines():
        if line.startswith('@@'):
            match = re.match(_HUNK_HEADER_PATTERN, line)
            if not match:
                parsed_lines.append(unnumbered(line))
                continue
            old_start = int(match.group(1))
            old_count = int(match.group(2)) if match.group(2) else 1
            new_start = int(match.group(3))
            new_count = int(match.group(4)) if match.group(4) else 1
            current_old_line = old_start
            current_new_line = new_start
            in_hunk = True
            parsed_lines.append({
                'type': 'hunk',
                'content': line,
                'old_start': old_start,
                'old_count': old_count,
                'new_start': new_start,
                'new_count': new_count,
                'old_line': None,
                'new_line': None
            })
        elif not in_hunk:
            # File headers exist only before the first hunk. Inside a hunk a
            # line starting with "---"/"+++" is real content (for example a
            # removed "----" rule) and must not be skipped, otherwise every
            # following line number would be off.
            if line.startswith(('---', '+++')):
                continue
            parsed_lines.append(unnumbered(line))
        elif line.startswith('-'):
            parsed_lines.append({
                'type': 'removed',
                'content': line[1:],
                'old_line': current_old_line,
                'new_line': None
            })
            current_old_line += 1
        elif line.startswith('+'):
            parsed_lines.append({
                'type': 'added',
                'content': line[1:],
                'old_line': None,
                'new_line': current_new_line
            })
            current_new_line += 1
        elif line.startswith(' '):
            parsed_lines.append({
                'type': 'context',
                'content': line[1:],
                'old_line': current_old_line,
                'new_line': current_new_line
            })
            current_old_line += 1
            current_new_line += 1
        else:
            # Unmarked line inside a hunk (for example a bare empty line).
            parsed_lines.append(unnumbered(line))
    return parsed_lines
def search_chinese_page(title, timeout=30):
    """Look up the Chinese-wiki page that corresponds to ``title``.

    Runs a title search for the exact phrase first and, if that finds
    nothing, a second title search with the unquoted title.

    Args:
        title: Page title to search for.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The title of the first search hit, or ``None`` when nothing matches
        or a request fails (the failure is printed).
    """
    params = {
        "action": "query",
        "list": "search",
        "srsearch": f'"{title}"',
        "srwhat": "title",
        "srlimit": 5,
        "format": "json"
    }
    # Exact phrase first, then the plain title. The plain title is passed
    # as-is: requests URL-encodes query parameters itself, so replacing
    # spaces with "%20" by hand would search for a literal "%20".
    queries = (params["srsearch"], title)
    try:
        for query in queries:
            params["srsearch"] = query
            response = SESSION_CN.get(WIKI_API_URL_CN, params=params, timeout=timeout)
            search_results = response.json().get("query", {}).get("search", [])
            if search_results:
                return search_results[0]["title"]
    except Exception as e:
        print(f"搜索中文页面时出错: {e}")
    return None
def _html_escape(text):
    """Return ``text`` as an HTML-safe string; a falsy value yields ``""``."""
    if not text:
        return ""
    return (str(text)
            .replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
            .replace('"', "&quot;")
            .replace("'", "&#39;"))


def _highlight_additions(old_text, new_text):
    """Render ``new_text`` as HTML with the characters new relative to ``old_text`` highlighted.

    The character comparison runs on the raw text and every segment is
    escaped afterwards, so a highlight span can never split an HTML entity
    such as ``&amp;``. Deleted characters are not shown.
    """
    if not old_text or not new_text:
        return _html_escape(new_text or "")
    matcher = difflib.SequenceMatcher(None, old_text, new_text)
    pieces = []
    for tag, _i1, _i2, j1, j2 in matcher.get_opcodes():
        if tag == 'delete':
            continue
        segment = _html_escape(new_text[j1:j2])
        if tag == 'equal':
            pieces.append(segment)
        else:  # 'replace' or 'insert'
            pieces.append(f'<span class="diff-char-added">{segment}</span>')
    return ''.join(pieces)


def _map_changes_to_old_lines(parsed_diff):
    """Group parsed diff entries into per-line annotations.

    Consecutive ``removed``/``added`` entries form one run. Within a run the
    n-th removed line is paired with the n-th added line as a replacement;
    removed lines left over are plain removals, added lines left over are
    insertions.

    Returns:
        ``(changes_by_line, insertions)`` where ``changes_by_line`` maps an
        old-text line number to its ``replaced``/``removed`` records, and
        ``insertions`` maps the old-text line number *before which* new
        lines were inserted to the list of ``added`` entries.
    """
    changes_by_line = {}
    insertions = {}
    removed = []
    added = []
    # Last old-text line seen before the current run; insertions without a
    # removed line are placed right after it.
    anchor = 0

    def flush():
        paired = min(len(removed), len(added))
        for index, gone in enumerate(removed):
            if index < paired:
                record = {
                    'type': 'replaced',
                    'old_content': gone['content'],
                    'new_content': added[index]['content']
                }
            else:
                record = {'type': 'removed', 'content': gone['content']}
            changes_by_line.setdefault(gone['old_line'], []).append(record)
        leftover = added[paired:]
        if leftover:
            base_line = removed[-1]['old_line'] if removed else anchor
            insertions.setdefault(base_line + 1, []).extend(leftover)
        removed.clear()
        added.clear()

    for item in parsed_diff:
        kind = item['type']
        if kind == 'removed':
            removed.append(item)
        elif kind == 'added':
            added.append(item)
        elif kind in ('hunk', 'header', 'context'):
            if removed or added:
                flush()
            if kind == 'hunk':
                start = item.get('old_start', 1)
                # With a zero count the header names the line *before* the
                # hunk; otherwise it names the first line of the hunk.
                anchor = start - 1 if item.get('old_count', 1) else start
            elif kind == 'context':
                anchor = item['old_line']
        # 'other' entries carry no position and do not end a run.
    if removed or added:
        flush()
    return changes_by_line, insertions


def _placeholder_rows(change_id, additions):
    """Return one blank placeholder row, annotated with the new text, per added line."""
    rows = []
    for item in additions:
        rows.append(
            f'<div class="line-wrapper blank-placeholder change-block" data-change-id="{change_id}">'
            '<div class="main-line">'
            '<span class="line-number">&nbsp;</span>'
            '<span class="line-content">(新增英文内容占位)</span>'
            '</div>'
            '<div class="annotation">'
            '<div class="annotation-item added">'
            '<div class="annotation-header">新增</div>'
            f'<div>{_html_escape(item["content"])}</div>'
            '</div>'
            '</div>'
            '</div>'
        )
    return rows


def create_diff_html(title, en_diff, en_old_lines, en_new_lines, cn_content=None):
    """Build the bilingual review page as a single HTML string.

    The Chinese translation is listed line by line. Changes from the English
    diff are shown next to the Chinese line with the same line number as the
    affected line of the old English text: replacements and removals as side
    annotations, insertions as blank placeholder rows placed before the line
    they precede. Insertions anchored beyond the last Chinese line are
    appended at the end.

    Args:
        title: Page title; HTML-escaped before it is placed in the page.
        en_diff: Unified diff of the English page; falsy means no changes.
        en_old_lines: Unused; kept for interface compatibility.
        en_new_lines: Unused; kept for interface compatibility.
        cn_content: Chinese page text. When falsy, the page shows a
            "translation not found" notice instead of the line listing.

    Returns:
        The complete HTML document, including inline CSS and the navigation
        script.
    """
    cn_lines = cn_content.splitlines() if cn_content else []
    parsed_diff = parse_diff_with_line_numbers(en_diff) if en_diff else []
    en_changes_by_line, blank_lines_to_insert = _map_changes_to_old_lines(parsed_diff)

    safe_title = _html_escape(title)
    cn_meta = ' | 中文Wiki: wiki.projectdiablo2.cn' if cn_content else ''

    # Summary of every change block; currently only used for the debug output.
    change_blocks = []
    change_block_id = 0

    parts = [f'''<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Wiki Diff: {safe_title}</title>
<style>
* {{
margin: 0;
padding: 0;
box-sizing: border-box;
}}
body {{
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
background-color: #f5f5f5;
line-height: 1.6;
padding: 20px;
}}
.header {{
background-color: #fff;
padding: 20px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
margin-bottom: 20px;
border-radius: 8px;
}}
.header h1 {{
color: #333;
font-size: 24px;
margin-bottom: 10px;
}}
.header .meta {{
color: #666;
font-size: 14px;
}}
.content-container {{
max-width: 1200px;
margin: 0 auto;
background-color: #fff;
border-radius: 8px;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
overflow: hidden;
}}
.content-header {{
background-color: #e9ecef;
padding: 15px 20px;
font-weight: bold;
color: #495057;
border-bottom: 1px solid #dee2e6;
}}
.diff-content {{
padding: 0;
}}
.line-wrapper {{
display: flex;
border-bottom: 1px solid #f0f0f0;
position: relative;
}}
.line-wrapper:hover {{
background-color: rgba(0, 123, 255, 0.02);
}}
.line-wrapper.has-changes {{
background-color: rgba(255, 193, 7, 0.05);
}}
.main-line {{
display: flex;
flex: 1;
min-height: 24px;
align-items: center;
}}
.line-number {{
width: 60px;
text-align: right;
padding: 8px 12px;
background-color: #f8f9fa;
color: #6c757d;
font-size: 12px;
user-select: none;
flex-shrink: 0;
border-right: 1px solid #e9ecef;
}}
.line-content {{
flex: 1;
padding: 8px 12px;
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
font-size: 13px;
line-height: 1.5;
white-space: pre-wrap;
word-break: break-word;
color: #333;
}}
/* 批注样式 */
.annotation {{
width: 400px;
background-color: #f8f9fa;
border-left: 1px solid #dee2e6;
padding: 10px 14px;
font-size: 14px;
display: none;
}}
.line-wrapper.has-changes .annotation {{
display: block;
}}
.annotation-item {{
margin-bottom: 8px;
padding: 8px 10px;
border-radius: 4px;
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
font-size: 13px;
line-height: 1.5;
}}
.annotation-item:last-child {{
margin-bottom: 0;
}}
.annotation-item.added {{
background-color: #e6ffec;
border-left: 3px solid #28a745;
color: #155724;
}}
.annotation-item.removed {{
background-color: #ffeef0;
border-left: 3px solid #dc3545;
color: #721c24;
text-decoration: line-through;
}}
.annotation-item.replaced {{
margin-bottom: 8px;
}}
.annotation-item.replaced .old-content {{
background-color: #ffeef0;
border-left: 3px solid #dc3545;
color: #721c24;
text-decoration: line-through;
padding: 4px 6px;
border-radius: 3px;
margin-bottom: 4px;
}}
.annotation-item.replaced .new-content {{
background-color: #e6ffec;
border-left: 3px solid #28a745;
color: #155724;
padding: 4px 6px;
border-radius: 3px;
}}
.annotation-header {{
font-size: 11px;
color: #6c757d;
margin-bottom: 6px;
font-weight: bold;
text-transform: uppercase;
letter-spacing: 0.5px;
}}
/* GitHub风格的字符级diff样式 */
.diff-char-added {{
background-color: #acf2bd;
color: #24292f;
border-radius: 2px;
padding: 1px 2px;
}}
.diff-char-removed {{
background-color: #ffd5d5;
color: #24292f;
border-radius: 2px;
padding: 1px 2px;
}}
/* 批注项内的字符级diff样式调整 */
.annotation-item.removed {{
text-decoration: none; /* 移除删除线 */
}}
.annotation-item.replaced .old-content {{
text-decoration: none; /* 移除删除线 */
}}
/* 新页面提示 */
.new-page-notice {{
background-color: #d4edda;
color: #155724;
padding: 15px 20px;
margin: 15px;
border-radius: 4px;
border-left: 4px solid #28a745;
}}
.no-translation {{
background-color: #fff3cd;
color: #856404;
padding: 15px 20px;
margin: 15px;
border-radius: 4px;
border-left: 4px solid #ffc107;
}}
/* 响应式设计 */
@media (max-width: 1024px) {{
.annotation {{
width: 300px;
}}
}}
@media (max-width: 768px) {{
body {{
padding: 10px;
}}
.annotation {{
width: 100%;
display: block !important;
border-left: none;
border-top: 1px solid #dee2e6;
}}
.line-wrapper {{
flex-direction: column;
}}
.main-line {{
border-bottom: none;
}}
}}
/* 高亮效果 */
.line-wrapper.highlight {{
background-color: rgba(255, 235, 59, 0.3) !important;
animation: highlight 2s ease-in-out;
}}
@keyframes highlight {{
0% {{ background-color: rgba(255, 235, 59, 0.6); }}
100% {{ background-color: rgba(255, 235, 59, 0.3); }}
}}
/* 空行样式 */
.line-wrapper.empty-line .line-content {{
min-height: 24px;
color: #999;
font-style: italic;
}}
/* 空白占位行样式 */
.line-wrapper.blank-placeholder {{
background-color: #fafafa;
border-bottom: 1px solid #e9ecef;
display: flex;
}}
.line-wrapper.blank-placeholder .main-line {{
min-height: 24px;
flex: 1;
display: flex;
}}
.line-wrapper.blank-placeholder .line-number {{
color: #dee2e6;
}}
.line-wrapper.blank-placeholder .line-content {{
color: #dee2e6;
font-style: italic;
min-height: 24px;
display: flex;
align-items: center;
}}
/* 空白占位行的批注样式 */
.line-wrapper.blank-placeholder .annotation {{
width: 400px;
background-color: #f8f9fa;
border-left: 1px solid #dee2e6;
padding: 10px 14px;
font-size: 14px;
display: block;
}}
.line-wrapper.blank-placeholder .annotation .annotation-item {{
margin-bottom: 8px;
padding: 8px 10px;
border-radius: 4px;
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
font-size: 13px;
line-height: 1.5;
}}
.line-wrapper.blank-placeholder .annotation .annotation-item:last-child {{
margin-bottom: 0;
}}
.line-wrapper.blank-placeholder .annotation .annotation-item.added {{
background-color: #e6ffec;
border-left: 3px solid #28a745;
color: #155724;
}}
.line-wrapper.blank-placeholder .annotation .annotation-header {{
font-size: 10px;
color: #6c757d;
margin-bottom: 4px;
font-weight: bold;
text-transform: uppercase;
letter-spacing: 0.5px;
}}
.line-wrapper.blank-placeholder:hover {{
background-color: #f8f9fa;
}}
.line-wrapper.blank-placeholder:hover .main-line {{
background-color: rgba(0, 123, 255, 0.02);
}}
/* 变更块高亮样式 */
.line-wrapper.change-block {{
border-left: 3px solid #007bff;
}}
/* 导航浮窗样式 */
.navigation-float {{
position: fixed;
left: 20px;
top: 50%;
transform: translateY(-50%);
width: 280px;
max-height: 70vh;
background-color: #fff;
border: 1px solid #dee2e6;
border-radius: 8px;
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15);
z-index: 1000;
overflow: hidden;
}}
.navigation-header {{
background-color: #007bff;
color: white;
padding: 12px 16px;
font-weight: bold;
font-size: 14px;
display: flex;
justify-content: space-between;
align-items: center;
}}
.navigation-toggle {{
background: none;
border: none;
color: white;
cursor: pointer;
font-size: 16px;
padding: 4px 8px;
border-radius: 4px;
transition: background-color 0.2s;
}}
.navigation-toggle:hover {{
background-color: rgba(255, 255, 255, 0.2);
}}
.navigation-content {{
max-height: calc(70vh - 50px);
overflow-y: auto;
padding: 8px;
}}
.navigation-item {{
padding: 10px 12px;
margin-bottom: 6px;
background-color: #f8f9fa;
border-radius: 4px;
cursor: pointer;
transition: all 0.2s;
border-left: 3px solid transparent;
}}
.navigation-item:hover {{
background-color: #e9ecef;
border-left-color: #007bff;
}}
.navigation-item.active {{
background-color: #e3f2fd;
border-left-color: #007bff;
}}
.navigation-item-header {{
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 4px;
}}
.navigation-item-number {{
background-color: #007bff;
color: white;
font-size: 11px;
padding: 2px 6px;
border-radius: 10px;
font-weight: bold;
}}
.navigation-item-type {{
font-size: 11px;
padding: 2px 6px;
border-radius: 3px;
font-weight: bold;
text-transform: uppercase;
}}
.navigation-item-type.added {{
background-color: #28a745;
color: white;
}}
.navigation-item-type.replaced {{
background-color: #ffc107;
color: #212529;
}}
.navigation-item-type.removed {{
background-color: #dc3545;
color: white;
}}
.navigation-item-preview {{
font-size: 12px;
color: #6c757d;
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
line-height: 1.3;
word-break: break-word;
}}
.navigation-item-line {{
font-size: 10px;
color: #adb5bd;
margin-top: 2px;
}}
/* 导航浮窗收起状态 */
.navigation-float.collapsed .navigation-content {{
display: none;
}}
.navigation-float.collapsed {{
width: auto;
min-width: 160px;
}}
/* 响应式设计 */
@media (max-width: 768px) {{
.navigation-float {{
left: 10px;
width: 240px;
}}
.navigation-float.collapsed {{
min-width: 120px;
}}
}}
@media (max-width: 480px) {{
.navigation-float {{
position: relative;
left: auto;
top: auto;
transform: none;
width: 100%;
max-height: none;
margin-bottom: 20px;
}}
}}
</style>
</head>
<body>
<div class="header">
<h1>{safe_title}</h1>
<div class="meta">
<span>英文Wiki: wiki.projectdiablo2.com</span>
{cn_meta}
</div>
</div>
<div class="navigation-float" id="navigation-float">
<div class="navigation-header">
<span>变更导航</span>
<button class="navigation-toggle" id="navigation-toggle"></button>
</div>
<div class="navigation-content" id="navigation-content">
<!-- Navigation items will be generated here -->
</div>
</div>
<div class="content-container">
<div class="content-header">中文翻译(含英文变更批注)</div>
<div class="diff-content">
''']

    def add_insertion(anchor_line, additions):
        # Emit one change block of placeholder rows and record it for the summary.
        nonlocal change_block_id
        change_block_id += 1
        first = additions[0]['content']
        change_blocks.append({
            'id': change_block_id,
            'line': anchor_line,
            'type': '新增',
            'preview': (first[:50] + "...") if len(first) > 50 else first,
            'count': len(additions)
        })
        parts.extend(_placeholder_rows(change_block_id, additions))

    if cn_content:
        for i, line in enumerate(cn_lines, 1):
            # New English lines inserted before this position come first.
            if i in blank_lines_to_insert:
                add_insertion(i, blank_lines_to_insert[i])

            has_changes = i in en_changes_by_line
            changes = en_changes_by_line.get(i, [])
            is_empty = not line.strip()

            if has_changes:
                change_block_id += 1
                change_type = "替换" if any(c['type'] == 'replaced' for c in changes) else "删除"
                change_blocks.append({
                    'id': change_block_id,
                    'line': i,
                    'type': change_type,
                    'preview': (line[:50] + "...") if len(line) > 50 else line,
                    'count': 1
                })

            changed_cls = "has-changes" if has_changes else ""
            empty_cls = "empty-line" if is_empty else ""
            block_cls = "change-block" if has_changes else ""
            change_ref = change_block_id if has_changes else ""
            shown = "(空行)" if is_empty else _html_escape(line)
            parts.append(f'<div class="line-wrapper {changed_cls} {empty_cls} {block_cls}" data-change-id="{change_ref}">')
            parts.append('<div class="main-line">')
            parts.append(f'<span class="line-number">{i}</span>')
            parts.append(f'<span class="line-content">{shown}</span>')
            parts.append('</div>')

            # Side annotations: only replacements and removals live here;
            # insertions are rendered as placeholder rows.
            if has_changes:
                parts.append('<div class="annotation">')
                for change in changes:
                    if change['type'] == 'removed':
                        parts.append('<div class="annotation-item removed">')
                        parts.append('<div class="annotation-header">删除</div>')
                        parts.append(f'<div>{_html_escape(change["content"])}</div>')
                        parts.append('</div>')
                    elif change['type'] == 'replaced':
                        new_html = _highlight_additions(change['old_content'], change['new_content'])
                        parts.append('<div class="annotation-item replaced">')
                        parts.append('<div class="annotation-header">替换</div>')
                        parts.append(f'<div class="old-content">{_html_escape(change["old_content"])}</div>')
                        parts.append(f'<div class="new-content">{new_html}</div>')
                        parts.append('</div>')
                parts.append('</div>')
            parts.append('</div>')

        # Insertions anchored past the last Chinese line would otherwise be lost.
        for anchor_line in sorted(k for k in blank_lines_to_insert if k > len(cn_lines)):
            add_insertion(anchor_line, blank_lines_to_insert[anchor_line])
    else:
        parts.append('<div class="no-translation">未找到对应的中文翻译页面</div>')

    # Debug output describing the collected change blocks.
    print(f"DEBUG: Final change_blocks length = {len(change_blocks)}")
    for index, block in enumerate(change_blocks):
        print(f"DEBUG: Final block {index}: {block}")

    parts.append('''
</div>
</div>
<script>
// 动态生成导航项
function generateNavigationItems() {
const navContent = document.getElementById('navigation-content');
const changeBlocks = document.querySelectorAll('.change-block');
// 清空现有内容
navContent.innerHTML = '';
// 变更块信息已经在HTML中通过data属性标记
const changes = [];
// 收集所有变更块信息
changeBlocks.forEach((block, index) => {
const changeId = block.getAttribute('data-change-id');
if (changeId) {
// 获取变更类型和预览文本
const annotation = block.querySelector('.annotation');
if (annotation) {
const addedItem = annotation.querySelector('.annotation-item.added .annotation-header');
const replacedItem = annotation.querySelector('.annotation-item.replaced .annotation-header');
const removedItem = annotation.querySelector('.annotation-item.removed .annotation-header');
let type = '变更';
let typeClass = '';
let preview = '';
if (addedItem) {
type = '新增';
typeClass = 'added';
const content = annotation.querySelector('.annotation-item.added > div:last-child');
preview = content ? content.textContent.substring(0, 50) : '';
} else if (replacedItem) {
type = '替换';
typeClass = 'replaced';
const content = annotation.querySelector('.annotation-item.replaced .old-content');
preview = content ? content.textContent.substring(0, 50) : '';
} else if (removedItem) {
type = '删除';
typeClass = 'removed';
const content = annotation.querySelector('.annotation-item.removed > div:last-child');
preview = content ? content.textContent.substring(0, 50) : '';
}
if (preview) {
preview += '...';
}
// 获取行号
const lineNumber = block.querySelector('.line-number');
const line = lineNumber ? lineNumber.textContent : '?';
changes.push({
id: changeId,
type: type,
typeClass: typeClass,
preview: preview || '变更内容',
line: line
});
}
}
});
// 如果没有变更,显示未发现变更
if (changes.length === 0) {
navContent.innerHTML = '<div class="navigation-item"><div class="navigation-item-preview">未发现变更</div></div>';
return;
}
// 生成导航项
changes.forEach(change => {
const navItem = document.createElement('div');
navItem.className = 'navigation-item';
navItem.setAttribute('data-change-id', change.id);
navItem.innerHTML = `
<div class="navigation-item-header">
<span class="navigation-item-number">${change.id}</span>
<span class="navigation-item-type ${change.typeClass}">${change.type}</span>
</div>
<div class="navigation-item-preview"></div>
<div class="navigation-item-line">行 ${change.line}</div>
`;
// Preview text comes from wiki content: assign it as text, never as markup.
navItem.querySelector('.navigation-item-preview').textContent = change.preview;
// Click handling is bound once for every item in DiffNavigation.init().
navContent.appendChild(navItem);
});
}
// 导航功能
class DiffNavigation {
constructor() {
this.navFloat = document.getElementById('navigation-float');
this.navToggle = document.getElementById('navigation-toggle');
// 延迟获取导航项,确保它们已经生成
this.updateNavigationElements();
this.isCollapsed = false;
// 调试日志
console.log('DiffNavigation Constructor:');
console.log(' navFloat:', this.navFloat);
console.log(' navToggle:', this.navToggle);
console.log(' navItems length:', this.navItems.length);
console.log(' changeBlocks length:', this.changeBlocks.length);
// 打印所有导航项的详细信息
this.navItems.forEach((item, index) => {
console.log(` Navigation item ${index}:`, {
id: item.getAttribute('data-change-id'),
hasType: !!item.querySelector('.navigation-item-type'),
hasPreview: !!item.querySelector('.navigation-item-preview')
});
});
this.init();
}
updateNavigationElements() {
this.navItems = document.querySelectorAll('.navigation-item');
this.changeBlocks = document.querySelectorAll('.change-block');
}
init() {
// 绑定收起/展开按钮
this.navToggle.addEventListener('click', () => {
this.toggleCollapse();
});
// 绑定导航项点击事件
this.navItems.forEach(item => {
item.addEventListener('click', () => {
this.navigateToChange(item);
});
});
// 监听滚动,更新当前激活的导航项
window.addEventListener('scroll', () => {
this.updateActiveNavItem();
});
// 初始化当前激活项
this.updateActiveNavItem();
}
toggleCollapse() {
this.isCollapsed = !this.isCollapsed;
if (this.isCollapsed) {
this.navFloat.classList.add('collapsed');
this.navToggle.textContent = '+';
} else {
this.navFloat.classList.remove('collapsed');
this.navToggle.textContent = '';
}
}
navigateToChange(navItem) {
const changeId = navItem.getAttribute('data-change-id');
const targetBlock = document.querySelector(`[data-change-id="${changeId}"].change-block`);
if (targetBlock) {
// 滚动到目标位置
targetBlock.scrollIntoView({
behavior: 'smooth',
block: 'center'
});
// 更新激活状态
this.updateActiveNavItem();
// 添加临时高亮效果
this.highlightBlock(targetBlock);
}
}
updateActiveNavItem() {
const scrollPosition = window.scrollY + window.innerHeight / 2;
let activeItem = null;
this.changeBlocks.forEach(block => {
const blockTop = block.offsetTop;
const blockBottom = blockTop + block.offsetHeight;
if (scrollPosition >= blockTop && scrollPosition <= blockBottom) {
const changeId = block.getAttribute('data-change-id');
activeItem = document.querySelector(`[data-change-id="${changeId}"].navigation-item`);
}
});
// 更新激活状态
this.navItems.forEach(item => {
item.classList.remove('active');
});
if (activeItem) {
activeItem.classList.add('active');
// 确保激活项在视口内
this.ensureNavItemVisible(activeItem);
}
}
ensureNavItemVisible(navItem) {
const navContent = document.getElementById('navigation-content');
const itemTop = navItem.offsetTop;
const itemBottom = itemTop + navItem.offsetHeight;
const scrollTop = navContent.scrollTop;
const contentHeight = navContent.clientHeight;
if (itemTop < scrollTop) {
navContent.scrollTop = itemTop - 10;
} else if (itemBottom > scrollTop + contentHeight) {
navContent.scrollTop = itemBottom - contentHeight + 10;
}
}
highlightBlock(block) {
// 移除所有高亮
document.querySelectorAll('.line-wrapper.highlight').forEach(line => {
line.classList.remove('highlight');
});
// 添加高亮效果
block.classList.add('highlight');
// 2秒后移除高亮
setTimeout(() => {
block.classList.remove('highlight');
}, 2000);
}
// 公共方法:跳转到指定变更
goToChange(changeId) {
const navItem = document.querySelector(`[data-change-id="${changeId}"].navigation-item`);
if (navItem) {
this.navigateToChange(navItem);
}
}
// 公共方法:获取所有变更列表
getChanges() {
const changes = [];
this.navItems.forEach(item => {
const changeId = item.getAttribute('data-change-id');
const typeElement = item.querySelector('.navigation-item-type');
const previewElement = item.querySelector('.navigation-item-preview');
const lineElement = item.querySelector('.navigation-item-line');
if (changeId && typeElement && previewElement && lineElement) {
changes.push({
id: parseInt(changeId),
type: typeElement.textContent,
preview: previewElement.textContent,
line: lineElement.textContent
});
}
});
return changes;
}
}
// 初始化导航
let diffNavigation;
document.addEventListener('DOMContentLoaded', () => {
// 首先生成导航项
generateNavigationItems();
// 调试日志
console.log('=== Diff Navigation Debug ===');
console.log('Navigation float:', document.getElementById('navigation-float'));
console.log('Navigation content:', document.getElementById('navigation-content'));
console.log('Navigation items:', document.querySelectorAll('.navigation-item'));
console.log('Change blocks:', document.querySelectorAll('.change-block'));
console.log('Has changes lines:', document.querySelectorAll('.line-wrapper.has-changes'));
diffNavigation = new DiffNavigation();
// 添加键盘快捷键支持
document.addEventListener('keydown', (e) => {
if (e.ctrlKey || e.metaKey) {
if (e.key >= '1' && e.key <= '9') {
e.preventDefault();
const changeId = parseInt(e.key);
diffNavigation.goToChange(changeId);
}
}
});
});
// 原有的点击高亮功能
document.querySelectorAll('.line-wrapper.has-changes').forEach(lineWrapper => {
lineWrapper.addEventListener('click', () => {
// 移除所有高亮
document.querySelectorAll('.line-wrapper.highlight').forEach(line => {
line.classList.remove('highlight');
});
// 高亮当前行
lineWrapper.classList.add('highlight');
});
});
</script>
</body>
</html>''')
    return ''.join(parts)
def _write_text_file(path, text, label):
    """Write ``text`` to ``path`` as UTF-8 and report the outcome on stdout.

    Any failure is printed and swallowed (best effort), so one file that
    cannot be written does not stop the remaining files from being saved.

    Args:
        path: Destination ``Path``; printed relative to ``OUTPUT_DIR``.
        text: Content to write.
        label: Short description shown in parentheses in the success message.
    """
    try:
        with open(path, "w", encoding="utf-8") as f:
            f.write(text)
        print(f" → 已保存: {path.relative_to(OUTPUT_DIR)} ({label})")
    except Exception as e:
        print(f" → 保存文件 {path} 时出错: {e}")


def save_files(title, diff_html, diff_text, full_text, timestamp, note="", revid=None, cn_content=None, old_full_text=None):
    """Write all artifacts for one synced page into this run's output directory.

    The per-run directory (``OUTPUT_DIR/<YYYYmmdd_HHMMSS>``) and its
    ``new_pages`` / ``changed_pages`` sub-directories are created on the
    first call and remembered in the module-level ``CURRENT_OUTPUT_DIR``.

    New pages only get ``<base>.full.txt`` under ``new_pages``.  Changed pages
    get ``.full.txt``, ``.cn.txt`` and ``.comparison.html`` under
    ``changed_pages`` and the reference files (``.diff.html``, ``.diff.txt``,
    ``.old.txt``) under ``changed_pages/files``.

    Args:
        title: Page title; characters other than alphanumerics and `` -_.``
            are replaced by ``_`` to form the file name.
        diff_html: Official HTML diff, or a falsy value to skip it.
        diff_text: Text diff.  A value starting with ``"新创建页面"`` marks a
            new page.  ``None`` is accepted and is treated as a new page when
            ``old_full_text`` is also ``None``.
        full_text: Latest page content, or a falsy value to skip it.
        timestamp: Revision timestamp string; its first 19 characters are
            compacted into the file name.
        note: Unused; kept so existing callers keep working.
        revid: Revision id appended to the file name when truthy.
        cn_content: Chinese page content, or a falsy value to skip it.
        old_full_text: Previous page content, or a falsy value to skip it.
    """
    global CURRENT_OUTPUT_DIR

    # Lazily create the output directory tree for this execution.
    if CURRENT_OUTPUT_DIR is None:
        run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        CURRENT_OUTPUT_DIR = OUTPUT_DIR / run_stamp
        # parents=True keeps this working even if OUTPUT_DIR was removed
        # after the module was imported.
        CURRENT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        (CURRENT_OUTPUT_DIR / "new_pages").mkdir(exist_ok=True)
        (CURRENT_OUTPUT_DIR / "changed_pages").mkdir(exist_ok=True)
        print(f"创建本次执行的输出目录: {CURRENT_OUTPUT_DIR}")

    safe_title = "".join(c if c.isalnum() or c in " -_." else "_" for c in title)
    time_str = timestamp[:19].replace("-", "").replace(":", "").replace("T", "_")
    base_filename = f"{safe_title}-{time_str}-{revid}" if revid else f"{safe_title}-{time_str}"

    # Decide whether this is a new page.  Callers may pass diff_text=None
    # (no diff could be produced); calling .startswith on it used to raise
    # AttributeError, so handle that case explicitly.
    if diff_text is None:
        is_new_page = old_full_text is None
    else:
        is_new_page = diff_text.startswith("新创建页面")

    if is_new_page:
        # New page: only the full content is stored, under new_pages.
        target_dir = CURRENT_OUTPUT_DIR / "new_pages"
        print(" 检测到新页面,只保存完整内容")
        if full_text:
            _write_text_file(target_dir / f"{base_filename}.full.txt", full_text, "新页面完整内容")
        return

    # Changed page: main files go to changed_pages, reference files to files/.
    target_dir = CURRENT_OUTPUT_DIR / "changed_pages"
    files_dir = target_dir / "files"
    files_dir.mkdir(exist_ok=True)

    # 1. Latest full content (main file).
    if full_text:
        _write_text_file(target_dir / f"{base_filename}.full.txt", full_text, "完整内容")

    # 2. Chinese translation (main file).
    if cn_content:
        _write_text_file(target_dir / f"{base_filename}.cn.txt", cn_content, "中文翻译")

    # 3. Bilingual comparison page (main file); always generated.
    en_new_lines = full_text.splitlines() if full_text else []
    en_old_lines = old_full_text.splitlines() if old_full_text else []
    comparison_html = create_diff_html(title, diff_text, en_old_lines, en_new_lines, cn_content)
    _write_text_file(target_dir / f"{base_filename}.comparison.html", comparison_html, "双语对比页面")

    # Reference files, each written only when there is content for it.
    if diff_html:
        _write_text_file(files_dir / f"{base_filename}.diff.html", diff_html, "官方diff - 参考")
    if diff_text:
        _write_text_file(files_dir / f"{base_filename}.diff.txt", diff_text, "文本diff - 参考")
    if old_full_text:
        _write_text_file(files_dir / f"{base_filename}.old.txt", old_full_text, "历史版本 - 参考")
def process_single_page(title, since_time, update_timestamp=False):
    """Sync a single page: fetch it, diff it against its old revision, save files.

    Steps: fetch the latest English revision, look up the old revision via
    ``get_old_revid(title, since_time)``, build a text diff, look for a
    Chinese counterpart page, fetch the official HTML diff through the
    ``compare`` API action, then hand everything to ``save_files``.

    Args:
        title: Title of the page on the English wiki.
        since_time: Time passed to ``get_old_revid`` to pick the old revision.
        update_timestamp: When true, store the page's latest revision
            timestamp through ``save_last_timestamp`` after saving.

    Returns:
        The timestamp of the latest revision, or ``None`` when the page does
        not exist or any step raised an exception (the error is printed).
    """
    # save_files recognises new pages by this prefix on diff_text.
    new_page_marker = "新创建页面"

    print(f"正在单独处理页面:{title}")
    try:
        # Latest content, timestamp and revision id of the English page.
        latest_content, latest_ts, latest_revid = get_page_content(WIKI_API_URL_EN, SESSION_EN, title)
        if latest_content is None:
            print("页面不存在或被删除")
            return None

        old_revid = get_old_revid(title, since_time)

        diff_html = None
        old_content = None
        cn_content = None

        # diff_text must always be a string: save_files calls
        # diff_text.startswith(...), and a None here used to raise
        # AttributeError, so new pages were never written to disk.
        if old_revid:
            old_content, _, _ = get_page_content(WIKI_API_URL_EN, SESSION_EN, title, old_revid)
            if old_content is not None:
                diff_text = generate_text_diff(old_content, latest_content)
                print(f" 生成了文本diff ({len(diff_text)} 字符)")
            else:
                # Old revision exists but could not be fetched: no diff.
                diff_text = ""
                print(" 无法获取历史版本内容")
        else:
            # No old revision: this is a newly created page.
            diff_text = new_page_marker
            print(" 这是新创建的页面")

        # Look for the matching page on the Chinese wiki.
        print(" 搜索中文翻译...")
        cn_title = search_chinese_page(title)
        if cn_title:
            print(f" 找到中文页面: {cn_title}")
            cn_content, _, _ = get_page_content(WIKI_API_URL_CN, SESSION_CN, cn_title)
            if cn_content:
                print(f" 获取中文内容成功 ({len(cn_content)} 字符)")
            else:
                print(" 无法获取中文页面内容")
        else:
            print(" 未找到对应的中文翻译页面")

        # Official HTML diff (optional): a failure here is only reported,
        # the remaining files are still saved.
        if old_revid:
            diff_params = {
                "action": "compare",
                "fromrev": old_revid,
                "torev": latest_revid,
                "format": "json",
            }
            try:
                diff_resp = SESSION_EN.get(WIKI_API_URL_EN, params=diff_params).json()
                diff_html = diff_resp.get("compare", {}).get("*", "")
            except Exception as e:
                print(f" 获取官方HTML diff时出错: {e}")

        save_files(title, diff_html, diff_text, latest_content, latest_ts, "", latest_revid, cn_content, old_content)

        if update_timestamp:
            save_last_timestamp(latest_ts)
            print(f"已更新全局时间戳 → {latest_ts}")
        return latest_ts
    except Exception as e:
        # Per-page boundary: report and continue so one bad page does not
        # abort a whole sync run.
        print(f"处理页面 '{title}' 时出错: {e}")
        return None
def process_all_pages_since(since_time):
    """Sync every page reported as changed after ``since_time``.

    Each page is handled by ``process_single_page``.  The greatest timestamp
    it returns (never less than ``since_time``) is stored through
    ``save_last_timestamp`` once all pages have been processed.
    """
    print("正在获取最近变更列表...")
    changed = get_recent_changes(since_time)
    if not changed:
        print("没有发现任何变更")
        return

    newest_ts = since_time
    for page_title, (_revid, _change_ts) in changed.items():
        print(f"\n处理:{page_title}")
        # Reuse the single-page pipeline; it yields None when the page failed.
        page_ts = process_single_page(page_title, since_time)
        if not page_ts:
            continue
        if page_ts > newest_ts:
            newest_ts = page_ts

    save_last_timestamp(newest_ts)
    print(f"\n全量同步完成!本次最新时间戳已更新为:{newest_ts}")

    # Fall back to the base output directory when nothing was written.
    output_root = CURRENT_OUTPUT_DIR if CURRENT_OUTPUT_DIR else OUTPUT_DIR
    print(f"文件保存在:{output_root.resolve()}")
def main():
    """Parse the command line and run a single-page or a full sync.

    Nothing is synced unless ``--run`` is given; without it the help text is
    printed.  The starting time is taken from ``--since`` if present,
    otherwise from ``load_last_timestamp()``, otherwise it defaults to
    24 hours before now (UTC).  With ``--title`` only that page is processed;
    otherwise all pages changed since the starting time are processed.
    """
    parser = argparse.ArgumentParser(description="MediaWiki 同步工具 - 增强版支持双语对比")
    parser.add_argument("--since", type=str, help="强制从指定时间开始同步,格式如 2025-11-28T00:00:00Z")
    parser.add_argument("--title", type=str, help="只同步指定的单个页面标题")
    parser.add_argument("--update-timestamp", action="store_true",
                        help="在单页模式下,完成后仍然更新全局 last_sync_timestamp.txt")
    parser.add_argument("--run", action="store_true",
                        help="执行同步操作(必须提供此参数才能真正执行同步)")
    args = parser.parse_args()

    # Without --run, only show the help text and exit.
    if not args.run:
        parser.print_help()
        return

    # Work out the effective starting time.
    if args.since:
        since_time = args.since
        print(f"使用命令行指定的时间起点:{since_time}")
    else:
        since_time = load_last_timestamp()
        if since_time:
            print(f"使用上次记录的时间起点:{since_time}")
        else:
            # No stored timestamp: start 24 hours ago.  A timezone-aware
            # "now" replaces the deprecated datetime.utcnow(); the output
            # format (e.g. 2025-11-28T00:00:00Z) is unchanged.
            from datetime import timedelta, timezone
            day_ago = datetime.now(timezone.utc) - timedelta(days=1)
            since_time = day_ago.strftime("%Y-%m-%dT%H:%M:%SZ")
            print(f"未找到上次同步记录,默认从 24 小时前开始:{since_time}")

    # Single-page mode.
    if args.title:
        process_single_page(args.title.strip(), since_time, args.update_timestamp)
        return

    # Full mode: every changed page goes through the single-page pipeline.
    process_all_pages_since(since_time)


# Run the sync only when executed as a script, not when imported.
if __name__ == "__main__":
    main()