867 lines
29 KiB
Python
867 lines
29 KiB
Python
# -*- coding: utf-8 -*-
|
||
"""
|
||
MediaWiki 最近变更同步工具 - 增强版
|
||
支持:
|
||
1. 正常全量同步(无参数)
|
||
2. 手动指定时间起点:--since 2025-11-28T00:00:00Z
|
||
3. 只同步单个页面:--title "页面名称"
|
||
4. 单个页面时可选更新全局时间戳:--update-timestamp
|
||
5. 获取历史版本并生成diff
|
||
6. 同步中文翻译版本
|
||
7. 生成双语对比网页
|
||
"""
|
||
|
||
import os
|
||
import argparse
|
||
from pathlib import Path
|
||
from datetime import datetime
|
||
import requests
|
||
from dotenv import load_dotenv
|
||
import difflib
|
||
import json
|
||
import re
|
||
from urllib.parse import quote
|
||
|
||
# ==================== Configuration ====================
# Read a local .env file first so the environment lookups below can see it.
load_dotenv()

# MediaWiki API endpoints; each may be overridden through the environment.
WIKI_API_URL_EN = os.environ.get("WIKI_API_URL_EN", "https://wiki.projectdiablo2.com/w/api.php")
WIKI_API_URL_CN = os.environ.get("WIKI_API_URL_CN", "https://wiki.projectdiablo2.cn/w/api.php")

# Root folder for generated files; created at import time when missing.
OUTPUT_DIR = Path("wiki_sync_output")
OUTPUT_DIR.mkdir(exist_ok=True)

# Global holding the output directory of the current run (None until assigned).
CURRENT_OUTPUT_DIR = None

# File that stores the timestamp of the last completed sync.
LAST_TIMESTAMP_FILE = "last_sync_timestamp.txt"

# One HTTP session per wiki, both identifying themselves with the same User-Agent.
SESSION_EN = requests.Session()
SESSION_EN.headers["User-Agent"] = "WikiSyncTool/4.0 (your-email@example.com; MediaWiki Sync Bot)"

SESSION_CN = requests.Session()
SESSION_CN.headers["User-Agent"] = "WikiSyncTool/4.0 (your-email@example.com; MediaWiki Sync Bot)"
# =======================================================
|
||
|
||
def load_last_timestamp():
    """Return the timestamp saved by the previous sync, or None if none exists.

    The value is the whitespace-stripped content of LAST_TIMESTAMP_FILE, so an
    existing but empty file yields an empty string.
    """
    stamp_path = Path(LAST_TIMESTAMP_FILE)
    if stamp_path.exists():
        return stamp_path.read_text(encoding="utf-8").strip()
    return None
|
||
|
||
def save_last_timestamp(ts):
    """Overwrite LAST_TIMESTAMP_FILE with the timestamp string ``ts`` (UTF-8)."""
    Path(LAST_TIMESTAMP_FILE).write_text(ts, encoding="utf-8")
|
||
|
||
def get_recent_changes(since, timeout=30):
    """Return the newest change of every page edited or created since ``since``.

    The English wiki's ``list=recentchanges`` API is read oldest-first
    (``rcdir=newer``) and ``continue`` tokens are followed until the listing
    ends. Later entries overwrite earlier ones, so each title ends up mapped
    to its most recent change (automatic de-duplication).

    Args:
        since: Start timestamp sent as ``rcstart``, e.g. ``2025-11-28T00:00:00Z``.
        timeout: Seconds to wait for each HTTP request; without a timeout a
            stalled connection would block the sync forever.

    Returns:
        dict mapping page title -> ``(revid, timestamp)``. When a request
        fails, the error is printed and the entries collected so far are
        returned (best effort).
    """
    params = {
        "action": "query",
        "list": "recentchanges",
        "rcprop": "title|ids|timestamp",
        "rctype": "edit|new",
        "rcdir": "newer",
        "rcstart": since,
        "rclimit": 500,
        "format": "json",
    }
    latest = {}
    while True:
        try:
            r = SESSION_EN.get(WIKI_API_URL_EN, params=params, timeout=timeout)
            r.raise_for_status()
            response_data = r.json()
            if "error" in response_data:
                raise RuntimeError(response_data["error"])
            for rc in response_data.get("query", {}).get("recentchanges", []):
                latest[rc["title"]] = (rc["revid"], rc["timestamp"])
            if "continue" not in response_data:
                break
            # Carry the continuation tokens into the next request.
            params.update(response_data["continue"])
        except Exception as e:
            # Deliberate best effort: report and return what was gathered.
            print(f"获取最近更改时出错: {e}")
            break
    return latest
|
||
|
||
def get_old_revid(title, end_time, timeout=30):
    """Return the revid of the last revision made at or before ``end_time``.

    The result is used as the ``fromrev`` side of a diff.

    Args:
        title: Page title on the English wiki.
        end_time: Timestamp sent as ``rvstart`` with ``rvdir=older``, so the
            first revision returned is the newest one not after this time.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        The revision id, or None when the page has no such revision or the
        request fails (the reason is printed).
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "ids|timestamp",
        "rvlimit": 1,
        "rvdir": "older",
        "rvstart": end_time,
        "format": "json",
    }
    try:
        resp = SESSION_EN.get(WIKI_API_URL_EN, params=params, timeout=timeout)
        resp.raise_for_status()
        pages = resp.json()["query"]["pages"]
        page = next(iter(pages.values()))

        revisions = page.get("revisions")
        if not revisions:
            print(f" 页面 '{title}' 在指定时间前没有找到修订版本")
            return None
        return revisions[0]["revid"]
    except Exception as e:
        print(f"获取旧版本ID时出错: {e}")
        return None
|
||
|
||
def get_page_content(wiki_url, session, title, revid=None, timeout=30):
    """Fetch the full wikitext of a page.

    Args:
        wiki_url: URL of the wiki's ``api.php`` endpoint.
        session: Object with a requests-style ``get(url, params=..., timeout=...)``.
        title: Page title to fetch.
        revid: When given, that exact revision is requested (sent as both
            ``rvstartid`` and ``rvendid``); otherwise the API's default
            revision for the title is used.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        ``(content, timestamp, revid)`` of the revision, or
        ``(None, None, None)`` when the page has no revisions or the request
        fails (the error is printed).
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "content|timestamp|ids",
        "rvslots": "main",
        "format": "json",
    }
    if revid:
        params["rvstartid"] = revid
        params["rvendid"] = revid

    try:
        resp = session.get(wiki_url, params=params, timeout=timeout)
        resp.raise_for_status()
        pages = resp.json()["query"]["pages"]
        page = next(iter(pages.values()))

        if "revisions" not in page:
            return None, None, None

        rev = page["revisions"][0]
        return rev["slots"]["main"]["*"], rev["timestamp"], rev["revid"]
    except Exception as e:
        print(f"获取页面内容时出错: {e}")
        return None, None, None
|
||
|
||
def generate_text_diff(old_text, new_text):
    """Build a git-style unified diff between two versions of a page.

    Args:
        old_text: Previous wikitext. A falsy value (None or "") means there is
            no earlier version.
        new_text: Current wikitext.

    Returns:
        The marker string "新创建页面" when ``old_text`` is falsy; otherwise the
        unified diff with every line terminated by a newline, or "" when the
        two texts have identical lines.
    """
    if not old_text:
        return "新创建页面"

    # Diff newline-free lines and add the terminator ourselves. Keeping the
    # original line ends would leave a final line that lacks a newline
    # unterminated, gluing it to the next diff line (e.g. "-b+c").
    diff_lines = difflib.unified_diff(
        old_text.splitlines(),
        new_text.splitlines(),
        lineterm="",
    )
    return "".join(f"{line}\n" for line in diff_lines)
|
||
|
||
# Hunk header of a unified diff: "@@ -start[,count] +start[,count] @@".
_HUNK_HEADER_RE = re.compile(r'@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@')


def _diff_entry(kind, content, old_line=None, new_line=None, **extra):
    """Build one parsed-diff record; ``extra`` carries the hunk range fields."""
    entry = {'type': kind, 'content': content}
    entry.update(extra)
    entry['old_line'] = old_line
    entry['new_line'] = new_line
    return entry


def parse_diff_with_line_numbers(diff_text):
    """Parse unified-diff text into records carrying exact line numbers.

    Args:
        diff_text: Output of ``generate_text_diff``. An empty value or the
            "新创建页面" marker produces an empty list.

    Returns:
        A list of dicts, one per diff line, each with ``type``, ``content``,
        ``old_line`` and ``new_line``:

        * ``hunk``    - "@@" header; also has ``old_start``, ``old_count``,
          ``new_start``, ``new_count``.
        * ``header``  - "---" / "+++" file header before the first hunk.
        * ``removed`` / ``added`` / ``context`` - body lines with the leading
          marker stripped and the matching 1-based line number(s) set.
        * ``other``   - anything else.
    """
    if not diff_text or diff_text.startswith("新创建页面"):
        return []

    parsed_lines = []
    current_old_line = 0
    current_new_line = 0
    in_hunk = False

    for line in diff_text.splitlines():
        if line.startswith('@@'):
            match = _HUNK_HEADER_RE.match(line)
            if not match:
                parsed_lines.append(_diff_entry('other', line))
                continue
            old_start = int(match.group(1))
            # An omitted count means the range covers exactly one line.
            old_count = int(match.group(2)) if match.group(2) else 1
            new_start = int(match.group(3))
            new_count = int(match.group(4)) if match.group(4) else 1

            current_old_line = old_start
            current_new_line = new_start
            in_hunk = True
            parsed_lines.append(_diff_entry(
                'hunk', line,
                old_start=old_start, old_count=old_count,
                new_start=new_start, new_count=new_count,
            ))
        elif in_hunk:
            # Body lines are classified BEFORE the file-header test: a removed
            # "----" appears as "-----" and an added "++" as "+++", and these
            # must not be mistaken for "---" / "+++" headers.
            if line.startswith('-'):
                parsed_lines.append(_diff_entry('removed', line[1:], old_line=current_old_line))
                current_old_line += 1
            elif line.startswith('+'):
                parsed_lines.append(_diff_entry('added', line[1:], new_line=current_new_line))
                current_new_line += 1
            elif line.startswith(' '):
                parsed_lines.append(_diff_entry(
                    'context', line[1:],
                    old_line=current_old_line, new_line=current_new_line,
                ))
                current_old_line += 1
                current_new_line += 1
            else:
                # e.g. a completely empty line
                parsed_lines.append(_diff_entry('other', line))
        elif line.startswith(('---', '+++')):
            # File header lines, only expected before the first hunk.
            parsed_lines.append(_diff_entry('header', line))
        else:
            parsed_lines.append(_diff_entry('other', line))

    return parsed_lines
|
||
|
||
def search_chinese_page(title, timeout=30):
    """Look up the page on the Chinese wiki that corresponds to ``title``.

    Two title searches are tried in order: the exact phrase (quoted) and, if
    that finds nothing, the plain title.

    Args:
        title: Page title from the English wiki.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        The title of the first search hit, or None when nothing matches or a
        request fails (the error is printed).
    """
    params = {
        "action": "query",
        "list": "search",
        "srsearch": "",
        "srwhat": "title",
        "srlimit": 5,
        "format": "json",
    }

    try:
        # The title is passed raw: requests URL-encodes parameter values
        # itself, so pre-encoding spaces as "%20" would be encoded a second
        # time and the wiki would search for a literal "%20".
        for query in (f'"{title}"', title):
            params["srsearch"] = query
            resp = SESSION_CN.get(WIKI_API_URL_CN, params=params, timeout=timeout)
            resp.raise_for_status()
            search_results = resp.json().get("query", {}).get("search", [])
            if search_results:
                # Take the first hit.
                return search_results[0]["title"]
    except Exception as e:
        print(f"搜索中文页面时出错: {e}")

    return None
|
||
|
||
# Stylesheet of the comparison page. Plain text inserted verbatim, so braces
# are written once (no f-string escaping).
_DIFF_PAGE_CSS = """\
        * {
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }

        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
            background-color: #f5f5f5;
            line-height: 1.6;
        }

        .header {
            background-color: #fff;
            padding: 20px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
            margin-bottom: 20px;
        }

        .header h1 {
            color: #333;
            font-size: 24px;
            margin-bottom: 10px;
        }

        .header .meta {
            color: #666;
            font-size: 14px;
        }

        .container {
            display: flex;
            max-width: 100%;
            margin: 0 auto;
            background-color: #fff;
            min-height: calc(100vh - 100px);
        }

        .column {
            flex: 1;
            overflow: hidden;
            display: flex;
            flex-direction: column;
        }

        .column-header {
            background-color: #e9ecef;
            padding: 12px 20px;
            font-weight: bold;
            color: #495057;
            border-bottom: 1px solid #dee2e6;
        }

        .diff-content {
            flex: 1;
            overflow-y: auto;
            font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
            font-size: 13px;
            line-height: 1.4;
        }

        .line {
            display: flex;
            min-height: 20px;
            position: relative;
        }

        .line-number {
            width: 60px;
            text-align: right;
            padding: 0 10px;
            background-color: #f8f9fa;
            color: #6c757d;
            border-right: 1px solid #dee2e6;
            user-select: none;
            flex-shrink: 0;
        }

        .line.highlight {
            background-color: rgba(255, 235, 59, 0.3) !important;
            animation: highlight 2s ease-in-out;
        }

        @keyframes highlight {
            0% { background-color: rgba(255, 235, 59, 0.8); }
            100% { background-color: rgba(255, 235, 59, 0.3); }
        }

        .line-content {
            flex: 1;
            padding: 0 10px;
            white-space: pre-wrap;
            word-break: break-word;
        }

        /* Diff specific styles */
        .line.diff-added {
            background-color: #e6ffec;
        }

        .line.diff-added .line-content {
            background-color: #cdffd8;
            border-left: 3px solid #28a745;
        }

        .line.diff-removed {
            background-color: #ffeef0;
        }

        .line.diff-removed .line-content {
            background-color: #fdb8c0;
            border-left: 3px solid #dc3545;
            text-decoration: line-through;
        }

        .line.diff-context {
            background-color: #ffffff;
        }

        .line.diff-context .line-content {
            background-color: #ffffff;
        }

        .line.diff-hunk {
            background-color: #f8f9fa;
            color: #6c757d;
            font-style: italic;
        }

        .line.diff-hunk .line-content {
            background-color: #f1f3f4;
        }

        .line.diff-header {
            background-color: #e9ecef;
            color: #495057;
            font-style: italic;
        }

        .line.diff-header .line-content {
            background-color: #e9ecef;
        }

        /* Separator between columns */
        .separator {
            width: 1px;
            background-color: #dee2e6;
            box-shadow: 0 0 5px rgba(0,0,0,0.1);
            position: relative;
            z-index: 10;
        }

        /* Scrollbar styling */
        .diff-content::-webkit-scrollbar {
            width: 8px;
            height: 8px;
        }

        .diff-content::-webkit-scrollbar-track {
            background: #f1f1f1;
        }

        .diff-content::-webkit-scrollbar-thumb {
            background: #888;
            border-radius: 4px;
        }

        .diff-content::-webkit-scrollbar-thumb:hover {
            background: #555;
        }

        /* Responsive design */
        @media (max-width: 768px) {
            .container {
                flex-direction: column;
            }

            .separator {
                width: 100%;
                height: 1px;
            }
        }

        /* Special styling for new page */
        .new-page-notice {
            background-color: #d4edda;
            color: #155724;
            padding: 15px 20px;
            margin-bottom: 20px;
            border-left: 4px solid #28a745;
        }

        .no-translation {
            background-color: #fff3cd;
            color: #856404;
            padding: 15px 20px;
            margin-bottom: 20px;
            border-left: 4px solid #ffc107;
        }

        /* Line linking styles */
        .line[data-cn-line] {
            cursor: pointer;
        }

        .line:hover {
            background-color: rgba(0, 123, 255, 0.05);
        }
"""

# Client-side behaviour of the comparison page. Plain text as well: braces are
# single, because doubled braces are emitted literally outside an f-string and
# "scrollIntoView({{ ... }})" is a JavaScript syntax error that disables the
# whole script.
_DIFF_PAGE_SCRIPT = """\
        // 同步滚动功能
        const enDiff = document.querySelector('#en-diff');
        const cnContent = document.querySelector('#cn-content');
        const cnLines = {};

        // 构建中文行的位置映射
        if (document.getElementById('cn-lines')) {
            document.querySelectorAll('#cn-lines .line').forEach(line => {
                const lineNum = line.querySelector('.line-number').textContent;
                if (lineNum) {
                    cnLines[lineNum] = line.offsetTop;
                }
            });
        }

        // 同步滚动
        if (enDiff && cnContent) {
            enDiff.addEventListener('scroll', () => {
                cnContent.scrollTop = enDiff.scrollTop;
            });

            cnContent.addEventListener('scroll', () => {
                enDiff.scrollTop = cnContent.scrollTop;
            });
        }

        // 点击英文行时,高亮对应的中文行
        document.querySelectorAll('[data-cn-line]').forEach(enLine => {
            enLine.addEventListener('click', () => {
                const cnLineNum = enLine.getAttribute('data-cn-line');
                if (cnLineNum) {
                    const cnLine = document.getElementById(`cn-line-${cnLineNum}`);
                    if (cnLine) {
                        // 移除所有高亮
                        document.querySelectorAll('.line.highlight').forEach(line => {
                            line.classList.remove('highlight');
                        });

                        // 高亮英文行和中文行
                        enLine.classList.add('highlight');
                        cnLine.classList.add('highlight');

                        // 滚动到中文行的位置
                        cnLine.scrollIntoView({ behavior: 'smooth', block: 'center' });
                    }
                }
            });

            // 鼠标悬停时显示预览
            enLine.addEventListener('mouseenter', () => {
                const cnLineNum = enLine.getAttribute('data-cn-line');
                if (cnLineNum) {
                    const cnLine = document.getElementById(`cn-line-${cnLineNum}`);
                    if (cnLine) {
                        enLine.style.backgroundColor = 'rgba(0, 123, 255, 0.1)';
                        cnLine.style.backgroundColor = 'rgba(0, 123, 255, 0.1)';
                    }
                }
            });

            enLine.addEventListener('mouseleave', () => {
                if (!enLine.classList.contains('highlight')) {
                    enLine.style.backgroundColor = '';
                }
                const cnLineNum = enLine.getAttribute('data-cn-line');
                if (cnLineNum) {
                    const cnLine = document.getElementById(`cn-line-${cnLineNum}`);
                    if (cnLine && !cnLine.classList.contains('highlight')) {
                        cnLine.style.backgroundColor = '';
                    }
                }
            });
        });
"""


def _cn_link_attrs(line_no, cn_line_count):
    """Return ``(data attribute, tooltip)`` linking to the same-numbered Chinese line.

    Both strings are empty when ``line_no`` is missing or lies beyond the
    Chinese text.
    """
    if line_no and line_no <= cn_line_count:
        return f'data-cn-line="{line_no}"', f'中文第{line_no}行'
    return '', ''


def create_diff_html(title, en_diff, en_old_lines, en_new_lines, cn_content=None):
    """Build the bilingual comparison page as one HTML string.

    The left column shows the English diff (or, when there is no parsable
    diff, the full new text); the right column shows the Chinese page. English
    lines are linked to the Chinese line with the same number.

    Args:
        title: Page title, shown in ``<title>`` and the heading.
        en_diff: Unified diff text, the "新创建页面" marker, or a falsy value.
        en_old_lines: Lines of the old English text. Accepted for interface
            compatibility; not used when rendering.
        en_new_lines: Lines of the new English text, shown in full when the
            diff yields no parsed entries.
        cn_content: Chinese wikitext, or None/"" when no translation exists.

    Returns:
        The complete HTML document.
    """
    from html import escape  # wiki text contains "<", "&" etc. and must not be emitted raw

    cn_lines = cn_content.splitlines() if cn_content else []
    cn_count = len(cn_lines)
    parsed_diff = parse_diff_with_line_numbers(en_diff) if en_diff else []

    safe_title = escape(title)
    cn_wiki_note = ' | 中文Wiki: wiki.projectdiablo2.cn' if cn_content else ''

    parts = [
        '<!DOCTYPE html>\n'
        '<html lang="zh-CN">\n'
        '<head>\n'
        '    <meta charset="UTF-8">\n'
        '    <meta name="viewport" content="width=device-width, initial-scale=1.0">\n'
        f'    <title>Wiki Diff: {safe_title}</title>\n'
        '    <style>\n',
        _DIFF_PAGE_CSS,
        '    </style>\n'
        '</head>\n'
        '<body>\n'
        '    <div class="header">\n'
        f'        <h1>{safe_title}</h1>\n'
        '        <div class="meta">\n'
        '            <span>英文Wiki: wiki.projectdiablo2.com</span>\n'
        f'            {cn_wiki_note}\n'
        '        </div>\n'
        '    </div>\n'
        '\n'
        '    <div class="container">\n'
        '        <div class="column">\n'
        '            <div class="column-header">English Diff</div>\n'
        '            <div class="diff-content" id="en-diff">\n',
    ]

    # English column.
    if parsed_diff:
        for item in parsed_diff:
            kind = item['type']
            text = escape(item['content'])
            if kind == 'hunk':
                parts.append(f'<div class="line diff-hunk"><span class="line-content">{text}</span></div>\n')
            elif kind == 'header':
                parts.append(f'<div class="line diff-header"><span class="line-content">{text}</span></div>\n')
            elif kind == 'added':
                cn_line_attr, cn_title = _cn_link_attrs(item['new_line'], cn_count)
                parts.append(
                    f'<div class="line diff-added" {cn_line_attr} title="{cn_title}">'
                    f'<span class="line-number">{item["new_line"] or ""}</span>'
                    f'<span class="line-content">{text}</span></div>\n'
                )
            elif kind == 'removed':
                parts.append(
                    '<div class="line diff-removed" title="已删除">'
                    f'<span class="line-number">{item["old_line"] or ""}</span>'
                    f'<span class="line-content">{text}</span></div>\n'
                )
            elif kind == 'context':
                cn_line_attr, cn_title = _cn_link_attrs(item['new_line'], cn_count)
                parts.append(
                    f'<div class="line diff-context" {cn_line_attr} title="{cn_title}">'
                    f'<span class="line-number">{item["new_line"]}</span>'
                    f'<span class="line-content">{text}</span></div>\n'
                )
            else:
                parts.append(f'<div class="line"><span class="line-content">{text}</span></div>\n')
    else:
        # New page or no diff: show a notice (new page only) and the full text.
        if en_diff and en_diff.startswith("新创建页面"):
            parts.append('<div class="new-page-notice">新创建页面</div>\n')

        for i, line in enumerate(en_new_lines or [], 1):
            cn_line_attr, cn_title = _cn_link_attrs(i, cn_count)
            parts.append(
                f'<div class="line diff-context" {cn_line_attr} title="{cn_title}">'
                f'<span class="line-number">{i}</span>'
                f'<span class="line-content">{escape(line)}</span></div>\n'
            )

    parts.append(
        '            </div>\n'
        '        </div>\n'
        '\n'
        '        <div class="separator"></div>\n'
        '\n'
        '        <div class="column">\n'
        '            <div class="column-header">中文翻译</div>\n'
        '            <div class="diff-content" id="cn-content">\n'
    )

    # Chinese column.
    if cn_content:
        parts.append('<div id="cn-lines">')
        for i, line in enumerate(cn_lines, 1):
            parts.append(
                f'<div class="line diff-context" id="cn-line-{i}">'
                f'<span class="line-number">{i}</span>'
                f'<span class="line-content">{escape(line)}</span></div>\n'
            )
        parts.append('</div>')
    else:
        parts.append('<div class="no-translation">未找到对应的中文翻译页面</div>')

    parts.append(
        '\n'
        '            </div>\n'
        '        </div>\n'
        '    </div>\n'
        '\n'
        '    <script>\n'
    )
    parts.append(_DIFF_PAGE_SCRIPT)
    parts.append(
        '    </script>\n'
        '</body>\n'
        '</html>'
    )

    return ''.join(parts)
|
||
|
||
def save_files(title, diff_html, diff_text, full_text, timestamp, note="", revid=None, cn_content=None, old_full_text=None):
    """Write every artefact produced for one page into this run's output folder.

    On first use the per-run directory ``OUTPUT_DIR/<YYYYmmdd_HHMMSS>`` is
    created and remembered in the global ``CURRENT_OUTPUT_DIR``. File names
    are ``<safe title>-<timestamp>[-<revid>]`` plus one of these suffixes:

    * ``.diff.html``       - ``diff_html`` (if truthy)
    * ``.diff.txt``        - ``diff_text`` (if truthy)
    * ``.full.txt``        - ``full_text`` (if truthy)
    * ``.old.txt``         - ``old_full_text`` (if truthy)
    * ``.cn.txt``          - ``cn_content`` (if truthy)
    * ``.comparison.html`` - bilingual page, only when ``cn_content`` is truthy

    Args:
        title: Page title; characters other than alphanumerics and " -_."
            become "_" in the file name.
        diff_html: HTML diff returned by the wiki's compare API.
        diff_text: Text diff from ``generate_text_diff``.
        full_text: Latest wikitext.
        timestamp: Revision timestamp; its first 19 characters are used in
            the file name.
        note: Unused; kept for interface compatibility.
        revid: Latest revision id, appended to the file name when truthy.
        cn_content: Chinese wikitext, if any.
        old_full_text: Wikitext of the older revision, if any.

    A failure to write one file is printed and does not stop the others.
    """
    global CURRENT_OUTPUT_DIR

    # Make sure this run's output directory exists.
    if CURRENT_OUTPUT_DIR is None:
        run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        CURRENT_OUTPUT_DIR = OUTPUT_DIR / run_stamp
        # parents=True: still works if OUTPUT_DIR was removed after start-up.
        CURRENT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        print(f"创建本次执行的输出目录: {CURRENT_OUTPUT_DIR}")

    safe_title = "".join(c if c.isalnum() or c in " -_." else "_" for c in title)
    time_str = timestamp[:19].replace("-", "").replace(":", "").replace("T", "_")
    base_filename = f"{safe_title}-{time_str}-{revid}" if revid else f"{safe_title}-{time_str}"

    def target(suffix):
        return CURRENT_OUTPUT_DIR / f"{base_filename}{suffix}"

    # (path, content, label shown after the path in the success message)
    files_to_save = []

    if diff_html:       # 1. standard MediaWiki diff HTML
        files_to_save.append((target(".diff.html"), diff_html, ""))
    if diff_text:       # 2. text diff
        files_to_save.append((target(".diff.txt"), diff_text, ""))
    if full_text:       # 3. latest full content
        files_to_save.append((target(".full.txt"), full_text, ""))
    if old_full_text:   # 4. historical content
        files_to_save.append((target(".old.txt"), old_full_text, ""))
    if cn_content:
        # 5. Chinese translation
        files_to_save.append((target(".cn.txt"), cn_content, ""))

        # 6. bilingual comparison page
        en_new_lines = full_text.splitlines() if full_text else []
        en_old_lines = old_full_text.splitlines() if old_full_text else []
        comparison_html = create_diff_html(title, diff_text, en_old_lines, en_new_lines, cn_content)
        files_to_save.append((target(".comparison.html"), comparison_html, " (双语对比页面)"))

    # Success is reported only after the write, and once per file.
    for file_path, content, label in files_to_save:
        try:
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(content)
            print(f" → 已保存: {file_path.relative_to(OUTPUT_DIR)}{label}")
        except Exception as e:
            print(f" → 保存文件 {file_path} 时出错: {e}")
|
||
|
||
def process_single_page(title, since_time, update_timestamp=False):
    """Sync a single page: fetch it, diff it, find its translation, save files.

    Steps: fetch the latest English revision; find the last revision at or
    before ``since_time`` and build a text diff against it; look up and fetch
    the Chinese page; fetch the wiki's own HTML diff; write everything via
    ``save_files``.

    Args:
        title: Page title on the English wiki.
        since_time: Timestamp selecting the "old" side of the diff.
        update_timestamp: When True, store the page's latest revision
            timestamp as the global last-sync timestamp.

    Returns:
        The timestamp of the latest revision, or None when the page cannot be
        fetched or any step raises (the error is printed).
    """
    print(f"正在单独处理页面:{title}")

    try:
        # Latest revision.
        latest_content, latest_ts, latest_revid = get_page_content(WIKI_API_URL_EN, SESSION_EN, title)
        if latest_content is None:
            print("页面不存在或被删除")
            return None

        # Revision to diff against.
        old_revid = get_old_revid(title, since_time)

        diff_html = None
        diff_text = None
        old_content = None
        cn_content = None

        if old_revid:
            old_content, _, _ = get_page_content(WIKI_API_URL_EN, SESSION_EN, title, old_revid)

            if old_content is not None:
                diff_text = generate_text_diff(old_content, latest_content)
                print(f" 生成了文本diff ({len(diff_text)} 字符)")
            else:
                print(" 无法获取历史版本内容")
        else:
            # New page: generate_text_diff returns the new-page marker for a
            # missing old text, which the comparison page shows as a notice.
            print(" 这是新创建的页面")
            diff_text = generate_text_diff(None, latest_content)

        # Matching Chinese page.
        print(" 搜索中文翻译...")
        cn_title = search_chinese_page(title)
        if cn_title:
            print(f" 找到中文页面: {cn_title}")
            cn_content, _, _ = get_page_content(WIKI_API_URL_CN, SESSION_CN, cn_title)
            if cn_content:
                print(f" 获取中文内容成功 ({len(cn_content)} 字符)")
            else:
                print(" 无法获取中文页面内容")
        else:
            print(" 未找到对应的中文翻译页面")

        # Official HTML diff (optional; a failure here is not fatal).
        if old_revid:
            diff_params = {
                "action": "compare",
                "fromrev": old_revid,
                "torev": latest_revid,
                "format": "json",
            }
            try:
                # Bounded wait so a stalled connection cannot hang the sync.
                diff_resp = SESSION_EN.get(WIKI_API_URL_EN, params=diff_params, timeout=30).json()
                diff_html = diff_resp.get("compare", {}).get("*", "")
            except Exception as e:
                print(f" 获取官方HTML diff时出错: {e}")

        save_files(title, diff_html, diff_text, latest_content, latest_ts, "", latest_revid, cn_content, old_content)

        if update_timestamp:
            save_last_timestamp(latest_ts)
            print(f"已更新全局时间戳 → {latest_ts}")

        return latest_ts
    except Exception as e:
        print(f"处理页面 '{title}' 时出错: {e}")
        return None
|
||
|
||
def process_all_pages_since(since_time):
    """Sync every page that changed since ``since_time``.

    Each changed title goes through ``process_single_page``. The newest
    timestamp returned for any page (never older than ``since_time``) is then
    stored as the global last-sync timestamp. When nothing changed, the
    function returns without touching the stored timestamp.
    """
    print("正在获取最近变更列表...")
    changed_pages = get_recent_changes(since_time)
    if not changed_pages:
        print("没有发现任何变更")
        return

    newest_ts = since_time
    for page_title in changed_pages:
        print(f"\n处理:{page_title}")
        # Reuse the single-page logic for every changed title.
        page_ts = process_single_page(page_title, since_time)
        if page_ts and page_ts > newest_ts:
            newest_ts = page_ts

    save_last_timestamp(newest_ts)
    print(f"\n全量同步完成!本次最新时间戳已更新为:{newest_ts}")

    # Fall back to the root output folder when no file was written this run.
    output_location = CURRENT_OUTPUT_DIR if CURRENT_OUTPUT_DIR else OUTPUT_DIR
    print(f"文件保存在:{output_location.resolve()}")
|
||
|
||
def main():
    """Command-line entry point.

    Parses ``--since``, ``--title``, ``--update-timestamp`` and ``--run``.
    Without ``--run`` only the help text is printed. The start time is, in
    order of preference: ``--since``, the stored last-sync timestamp, or
    24 hours before now (UTC). With ``--title`` a single page is processed,
    otherwise every page changed since the start time.
    """
    parser = argparse.ArgumentParser(description="MediaWiki 同步工具 - 增强版支持双语对比")
    parser.add_argument("--since", type=str, help="强制从指定时间开始同步,格式如 2025-11-28T00:00:00Z")
    parser.add_argument("--title", type=str, help="只同步指定的单个页面标题")
    parser.add_argument("--update-timestamp", action="store_true",
                        help="在单页模式下,完成后仍然更新全局 last_sync_timestamp.txt")
    parser.add_argument("--run", action="store_true",
                        help="执行同步操作(必须提供此参数才能真正执行同步)")

    args = parser.parse_args()

    # Safety switch: do nothing but show the help unless --run was given.
    if not args.run:
        parser.print_help()
        return

    # Decide which start time to use.
    if args.since:
        since_time = args.since
        print(f"使用命令行指定的时间起点:{since_time}")
    else:
        since_time = load_last_timestamp()
        if since_time:
            print(f"使用上次记录的时间起点:{since_time}")
        else:
            from datetime import timedelta, timezone

            # Timezone-aware replacement for the deprecated datetime.utcnow();
            # formatted explicitly to keep the "...Z" form.
            one_day_ago = datetime.now(timezone.utc) - timedelta(days=1)
            since_time = one_day_ago.strftime("%Y-%m-%dT%H:%M:%SZ")
            # Say that this is a default rather than a recorded timestamp.
            print(f"未找到上次同步记录,默认从 24 小时前开始:{since_time}")

    # Single-page mode.
    if args.title:
        process_single_page(args.title.strip(), since_time, args.update_timestamp)
        return

    # Full mode - reuses the single-page logic for every changed page.
    process_all_pages_since(since_time)
|
||
|
||
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()