This commit is contained in:
wdjwxh 2025-12-03 15:54:39 +08:00
commit 8e93b5b82b
2 changed files with 562 additions and 0 deletions

164
.gitignore vendored Normal file
View File

@ -0,0 +1,164 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak
venv.bak
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# 自定义忽略项
last_sync_timestamp.txt
wiki_sync_output/

398
sync.py Normal file
View File

@ -0,0 +1,398 @@
# -*- coding: utf-8 -*-
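"""
MediaWiki recent-changes sync tool - Crimson final edition.

Every mode requires the --run flag; without it the script only prints its help.

Supported modes:
1. Regular full sync of everything changed since the last recorded timestamp
   (no extra arguments).
2. Manually chosen starting time: --since 2025-11-28T00:00:00Z
3. Sync a single page only: --title "Page name"
4. In single-page mode, optionally update the global timestamp: --update-timestamp
5. Every diff is generated by the official action=compare API.
"""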
import argparse
import html
import os
import re
from datetime import datetime
from pathlib import Path

import requests
# ==================== Configuration ====================
# MediaWiki API endpoint to sync from; change it to point at your own wiki.
WIKI_API_URL = "https://wiki.projectdiablo2.com/w/api.php"

# Root folder for all sync output; created at import time if it is missing.
OUTPUT_DIR = Path("wiki_sync_output")
OUTPUT_DIR.mkdir(exist_ok=True)

# Output sub-directory of the current run; stays None until save_files()
# creates it on first use.
CURRENT_OUTPUT_DIR = None

# File holding the timestamp recorded by the previous sync.
LAST_TIMESTAMP_FILE = "last_sync_timestamp.txt"

# One shared HTTP session so every API call carries the same User-Agent.
SESSION = requests.Session()
SESSION.headers["User-Agent"] = (
    "WikiSyncTool/3.0 (your-email@example.com; MediaWiki Sync Bot)"
)
# =======================================================
def load_last_timestamp():
    """Return the timestamp recorded by the previous sync run.

    Returns:
        The whitespace-stripped contents of ``LAST_TIMESTAMP_FILE``, or
        ``None`` when that file does not exist.
    """
    marker = Path(LAST_TIMESTAMP_FILE)
    if marker.exists():
        return marker.read_text(encoding="utf-8").strip()
    return None
def save_last_timestamp(ts):
    """Overwrite ``LAST_TIMESTAMP_FILE`` with the timestamp string *ts*."""
    Path(LAST_TIMESTAMP_FILE).write_text(ts, encoding="utf-8")
def get_recent_changes(since):
    """Return the newest change of every page edited or created since *since*.

    Pages through ``list=recentchanges`` in oldest-to-newest order
    (``rcdir=newer``), so a later entry for a title overwrites an earlier one
    and every title ends up mapped to its most recent change.

    Args:
        since: Timestamp string sent to the API as ``rcstart``.

    Returns:
        Dict mapping page title to ``(revid, timestamp)``. When a request
        fails, the error is printed and whatever was collected up to that
        point is returned (possibly an empty dict).
    """
    params = {
        "action": "query",
        "list": "recentchanges",
        "rcprop": "title|ids|timestamp",
        "rctype": "edit|new",
        "rcdir": "newer",
        "rcstart": since,
        "rclimit": 500,
        "format": "json",
    }
    latest = {}
    while True:
        try:
            # Without an explicit timeout requests waits on a stalled server
            # indefinitely, which would hang the whole sync.
            r = SESSION.get(WIKI_API_URL, params=params, timeout=30)
            r.raise_for_status()
            response_data = r.json()
            if "error" in response_data:
                raise RuntimeError(response_data["error"])
            for rc in response_data.get("query", {}).get("recentchanges", []):
                latest[rc["title"]] = (rc["revid"], rc["timestamp"])
            if "continue" not in response_data:
                break
            # Feed the continuation tokens back in to request the next batch.
            params.update(response_data["continue"])
        except Exception as e:
            # Deliberately best-effort: report the problem and keep the
            # batches that were already fetched.
            print(f"获取最近更改时出错: {e}")
            break
    return latest
def get_old_revid(title, end_time):
    """Return the revid of the last revision of *title* at or before *end_time*.

    The result is used as ``fromrev`` when diffing against the current
    revision.

    Args:
        title: Page title to look up.
        end_time: Timestamp string sent to the API as ``rvstart``; revisions
            are enumerated from there towards older ones.

    Returns:
        The revision id, or ``None`` when the page has no such revision or
        the request failed (the reason is printed).
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "ids|timestamp",
        "rvlimit": 1,  # only the newest revision not later than end_time is needed
        "rvdir": "older",
        "rvstart": end_time,
        "format": "json",
    }
    try:
        # Explicit timeout so a stalled server cannot hang the sync.
        resp = SESSION.get(WIKI_API_URL, params=params, timeout=30)
        # Log the URL that was really sent: it is percent-encoded, unlike a
        # hand-joined query string, so it stays valid for titles containing
        # spaces, '&' or non-ASCII characters.
        print(f" 请求URL: {resp.url}")
        pages = resp.json()["query"]["pages"]
        page = next(iter(pages.values()))
        revisions = page.get("revisions")
        if not revisions:
            print(f" 页面 '{title}' 在指定时间前没有找到修订版本")
            return None
        return revisions[0]["revid"]
    except Exception as e:
        # Best-effort lookup: callers treat None as "no older revision".
        print(f"获取旧版本ID时出错: {e}")
        return None
def get_official_diff_and_content(title, from_revid, to_revid):
    """Fetch the rendered diff between two revisions plus the page's text.

    Args:
        title: Page title whose content is downloaded.
        from_revid: Revision id of the older side of the diff. When falsy
            there is nothing to compare against, so no compare request is
            sent and the placeholder diff is returned.
        to_revid: Revision id of the newer side of the diff.

    Returns:
        ``(diff_html, full_text, timestamp)``. ``diff_html`` is the
        ``action=compare`` HTML, or the placeholder
        ``"<p>无法获取 diff</p>"`` when no diff was obtained. Returns
        ``(None, None, None)`` when the page has no revisions or any request
        fails (the error is printed).
    """
    print(f" 获取diff: fromrev={from_revid}, torev={to_revid}")
    diff_html = "<p>无法获取 diff</p>"
    try:
        if from_revid:
            diff_params = {
                "action": "compare",
                "fromrev": from_revid,
                "torev": to_revid,
                "format": "json",
            }
            # Explicit timeout so a stalled server cannot hang the sync.
            diff_resp = SESSION.get(
                WIKI_API_URL, params=diff_params, timeout=30
            ).json()
            print(f" Diff响应: {list(diff_resp.keys())}")
            diff_html = diff_resp.get("compare", {}).get("*", diff_html)
            print(f" Diff内容长度: {len(diff_html)} 字符")

        # NOTE(review): this query names no revision, so it returns the
        # page's current revision rather than to_revid specifically; the
        # visible caller passes the latest revid, so the two normally match.
        content_params = {
            "action": "query",
            "prop": "revisions",
            "titles": title,
            "rvprop": "content|timestamp",
            "rvslots": "main",
            "format": "json",
        }
        r = SESSION.get(WIKI_API_URL, params=content_params, timeout=30).json()
        page = next(iter(r["query"]["pages"].values()))
        if "revisions" not in page:
            return None, None, None
        rev = page["revisions"][0]
        full_text = rev["slots"]["main"]["*"]
        ts = rev["timestamp"]
        return diff_html, full_text, ts
    except Exception as e:
        # Best-effort: the caller checks for None and reports the page.
        print(f"获取diff和内容时出错: {e}")
        return None, None, None
# Stylesheet embedded in every saved diff page: git-diff-like colours for the
# table markup returned by MediaWiki's action=compare.
_DIFF_PAGE_CSS = """\
body {
    font-family: system-ui, sans-serif;
    margin: 20px;
}
table.diff {
    border-collapse: collapse;
    font-family: monospace;
    width: 100%;
    table-layout: fixed;
}
table.diff td {
    padding: 0 5px;
    vertical-align: top;
    white-space: pre-wrap;
    word-break: break-all;
    font-size: 14px;
    line-height: 1.4;
}
table.diff col.diff-marker {
    width: 20px;
    text-align: right;
    background-color: #fafafa;
}
table.diff col.diff-content {
    width: auto;
}
table.diff col.diff-addedline,
table.diff col.diff-deletedline {
    width: 50%;
}
.diff-addedline {
    background-color: #dfd;
}
.diff-addedline .diffchange {
    background-color: #9e9;
    color: #000;
}
.diff-deletedline {
    background-color: #fee8e8;
}
.diff-deletedline .diffchange {
    background-color: #faa;
    color: #000;
}
.diff-context {
    background-color: #fafafa;
}
.diff-context td {
    color: #777;
}
.diff-marker {
    font-weight: bold;
    text-align: right;
    padding: 0 4px;
}
.diff-lineno {
    background-color: #f0f0f0;
    text-align: right;
    padding: 0 4px;
}
.diff-addedline .diff-marker {
    color: #080;
}
.diff-deletedline .diff-marker {
    color: #800;
}
/* 新增的diff标记样式 */
.plus-marker {
    color: #080;
    font-weight: bold;
}
.minus-marker {
    color: #800;
    font-weight: bold;
}
/* 确保变更行有明显的视觉区分 */
.diff-addedline div,
.diff-deletedline div {
    display: inline-block;
    width: 100%;
}
/* 增加一些额外的视觉提示 */
.diff-addedline {
    border-left: 4px solid #080;
}
.diff-deletedline {
    border-left: 4px solid #800;
}
.diff-context {
    border-left: 4px solid #ccc;
}
/* 替换ins/del标签为span标签的样式 */
.diffchange.added {
    background-color: #9e9;
    color: #000;
    font-weight: bold;
    text-decoration: none;
}
.diffchange.deleted {
    background-color: #faa;
    color: #000;
    font-weight: bold;
    text-decoration: line-through;
}"""

# Marker cells arrive empty with the sign stored in a data-marker attribute;
# MediaWiki's own CSS would render it, a standalone page has to spell it out.
_MARKER_CELL_RE = re.compile(
    r'<td class="diff-marker" data-marker="([^"]*)"></td>'
)


def _marker_cell(match):
    """Rebuild one diff-marker cell with its sign as visible text."""
    marker = match.group(1)
    css_class = "plus-marker" if marker == "+" else "minus-marker"
    return (
        f'<td class="diff-marker">'
        f'<span class="{css_class}">{marker}</span></td>'
    )


def _restyle_diff_markup(diff_html):
    """Rewrite action=compare markup so it renders without MediaWiki's CSS."""
    # Only the tag name and the start of the class attribute are replaced, so
    # any further classes (e.g. "diffchange-inline") and the closing quote of
    # the original attribute stay intact and the markup remains well-formed.
    restyled = (
        diff_html
        .replace('<ins class="diffchange', '<span class="diffchange added')
        .replace("</ins>", "</span>")
        .replace('<del class="diffchange', '<span class="diffchange deleted')
        .replace("</del>", "</span>")
    )
    return _MARKER_CELL_RE.sub(_marker_cell, restyled)


def _render_diff_page(title, diff_html, timestamp):
    """Wrap a restyled diff table in a self-contained HTML page."""
    # Title and timestamp originate from the wiki, so escape them before
    # placing them in markup.
    title_text = html.escape(title)
    timestamp_text = html.escape(timestamp)
    return f'''<!DOCTYPE html>
<html><head><meta charset="utf-8"><title>Diff: {title_text}</title>
<style>
{_DIFF_PAGE_CSS}
</style></head><body>
<h2>{title_text}</h2>
<p>修改时间: {timestamp_text}</p>
{_restyle_diff_markup(diff_html)}
</body></html>'''


def save_files(title, diff_html, full_text, timestamp, note="", revid=None):
    """Write a page's diff (as standalone HTML) and its full text to disk.

    On the first call of a run, a sub-directory of ``OUTPUT_DIR`` named after
    the current local time is created and remembered in
    ``CURRENT_OUTPUT_DIR``; every file of the run goes there.

    Args:
        title: Page title; used in the HTML and, sanitised, in the file names.
        diff_html: Diff markup as returned by ``action=compare``.
        full_text: Full page text written to the ``.full.txt`` file.
        timestamp: Revision timestamp; its first 19 characters go into the
            file names.
        note: Unused; kept so existing callers keep working.
        revid: Revision id appended to the file names when truthy.

    Write failures are printed, not raised.
    """
    global CURRENT_OUTPUT_DIR
    if CURRENT_OUTPUT_DIR is None:
        run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        CURRENT_OUTPUT_DIR = OUTPUT_DIR / run_stamp
        # parents=True: still works if OUTPUT_DIR was removed after start-up.
        CURRENT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        print(f"创建本次执行的输出目录: {CURRENT_OUTPUT_DIR}")

    # Keep letters, digits and a few harmless symbols; everything else
    # (slashes, colons, ...) becomes "_" so the title is a valid file name.
    safe_title = "".join(
        c if c.isalnum() or c in " -_." else "_" for c in title
    )
    time_str = (
        timestamp[:19].replace("-", "").replace(":", "").replace("T", "_")
    )
    base_filename = (
        f"{safe_title}-{time_str}-{revid}" if revid
        else f"{safe_title}-{time_str}"
    )
    diff_file = CURRENT_OUTPUT_DIR / f"{base_filename}.diff.html"
    full_file = CURRENT_OUTPUT_DIR / f"{base_filename}.full.txt"

    page_html = _render_diff_page(title, diff_html, timestamp)
    try:
        diff_file.write_text(page_html, encoding="utf-8")
        full_file.write_text(full_text, encoding="utf-8")
        print(f" → 已保存: {diff_file.relative_to(OUTPUT_DIR)}")
        print(f" → 已保存: {full_file.relative_to(OUTPUT_DIR)}")
    except (OSError, ValueError, TypeError) as e:
        # Best-effort: one unwritable page must not abort the whole run.
        print(f" → 保存文件时出错: {e}")
        print(f" → 完整路径: {diff_file}")
        print(f" → 完整路径: {full_file}")
def process_single_page(title, since_time, update_timestamp=False):
    """Sync one page: save its diff against the state at *since_time*.

    Looks up the page's latest revision and the last revision at or before
    *since_time*, then saves the diff between the two together with the
    page's full text.

    Args:
        title: Title of the page to sync.
        since_time: Timestamp string marking the baseline state.
        update_timestamp: When true, store the latest revision's timestamp
            as the global last-sync timestamp after a successful sync.

    Returns:
        The timestamp of the page's latest revision when the page is in sync
        afterwards (files saved, or nothing newer than the baseline).
        ``None`` when the page has no revisions or when fetching failed, so
        callers never record a failed page as synced.
    """
    print(f"正在单独处理页面:{title}")
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "ids|timestamp",
        "rvlimit": 1,
        "format": "json",
    }
    try:
        # Explicit timeout so a stalled server cannot hang the sync.
        r = SESSION.get(WIKI_API_URL, params=params, timeout=30).json()
        page = next(iter(r["query"]["pages"].values()))
        if "revisions" not in page:
            print("页面不存在或被删除")
            return None
        latest = page["revisions"][0]
        latest_revid = latest["revid"]
        latest_ts = latest["timestamp"]

        old_revid = get_old_revid(title, since_time)
        if old_revid == latest_revid:
            # The baseline already is the newest revision, so the diff would
            # be empty. NOTE(review): this presumably happens on every run
            # for the page synced last, assuming rcstart is inclusive -
            # confirm against the RecentChanges API docs.
            print(f" 自 {since_time} 以来没有新的修订,跳过")
        else:
            diff_html, full_text, new_ts = get_official_diff_and_content(
                title, old_revid, latest_revid
            )
            if diff_html is None or full_text is None:
                # Leave the timestamp untouched and report failure so the
                # change is picked up again on the next run.
                print(" 警告: 未能获取完整的差异或内容数据")
                return None
            if not old_revid:
                diff_html = (
                    "<p style='color:green;font-weight:bold'>"
                    "新创建页面(无历史版本)</p>"
                )
            save_files(title, diff_html, full_text, new_ts, "", latest_revid)

        if update_timestamp:
            save_last_timestamp(latest_ts)
            print(f"已更新全局时间戳 → {latest_ts}")
        return latest_ts
    except Exception as e:
        # Per-page boundary: one broken page must not abort a batch run.
        print(f"处理页面 '{title}' 时出错: {e}")
        return None
def process_all_pages_since(since_time):
    """Sync every page that changed since *since_time*.

    Each changed page goes through ``process_single_page``. The newest
    timestamp any page reports (never older than *since_time*) is then stored
    as the global last-sync timestamp.
    """
    print("正在获取最近变更列表...")
    changes = get_recent_changes(since_time)
    if not changes:
        print("没有发现任何变更")
        return

    newest_ts = since_time
    # Only the titles are needed; the single-page routine looks up the
    # revision details itself.
    for title in changes:
        print(f"\n处理:{title}")
        page_ts = process_single_page(title, since_time)
        if not page_ts:
            continue
        newest_ts = max(newest_ts, page_ts)

    save_last_timestamp(newest_ts)
    print(f"\n全量同步完成!本次最新时间戳已更新为:{newest_ts}")
    out_dir = CURRENT_OUTPUT_DIR if CURRENT_OUTPUT_DIR else OUTPUT_DIR
    print(f"文件保存在:{out_dir.resolve()}")
def main():
    """Parse the command line and run a single-page or a full sync.

    Nothing is synced unless ``--run`` is given; without it only the help
    text is printed. The starting time is taken from ``--since``, else from
    the stored last-sync timestamp, else it defaults to 24 hours ago (UTC).
    """
    # Local import, as in the rest of this function's history: only the
    # default-start-time branch needs these two names.
    from datetime import timedelta, timezone

    parser = argparse.ArgumentParser(description="MediaWiki 同步工具 - 支持全量/单页/自定义时间")
    parser.add_argument("--since", type=str, help="强制从指定时间开始同步,格式如 2025-11-28T00:00:00Z")
    parser.add_argument("--title", type=str, help="只同步指定的单个页面标题")
    parser.add_argument("--update-timestamp", action="store_true",
                        help="在单页模式下,完成后仍然更新全局 last_sync_timestamp.txt")
    parser.add_argument("--run", action="store_true",
                        help="执行同步操作(必须提供此参数才能真正执行同步)")
    args = parser.parse_args()

    # Safety switch: without --run just show the usage and exit.
    if not args.run:
        parser.print_help()
        return

    # Work out the effective starting time.
    if args.since:
        since_time = args.since
        print(f"使用命令行指定的时间起点:{since_time}")
    else:
        since_time = load_last_timestamp()
        if since_time:
            print(f"使用上次记录的时间起点:{since_time}")
        else:
            # datetime.utcnow() is deprecated; build the same
            # "YYYY-MM-DDTHH:MM:SSZ" string from an aware UTC datetime.
            day_ago = datetime.now(timezone.utc) - timedelta(days=1)
            since_time = day_ago.strftime("%Y-%m-%dT%H:%M:%SZ")
            # Say plainly that this is the fallback, not a stored timestamp.
            print(f"未找到上次同步记录,默认从 24 小时前开始:{since_time}")

    # Single-page mode.
    if args.title:
        process_single_page(args.title.strip(), since_time, args.update_timestamp)
        return

    # Full mode: every changed page goes through the single-page routine.
    process_all_pages_since(since_time)


# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()