# -*- coding: utf-8 -*-
"""MediaWiki recent-changes sync tool.

Supported modes (nothing is executed unless ``--run`` is given):

1. Full sync of every page changed since the last recorded timestamp.
2. Explicit starting point: ``--since 2025-11-28T00:00:00Z``.
3. Single page only: ``--title "Page name"``.
4. In single-page mode, optionally update the global timestamp afterwards:
   ``--update-timestamp``.
5. All diffs come from the wiki's own ``action=compare`` API.

For every synced page two files are written into a per-run sub-directory of
``OUTPUT_DIR``: ``<name>.diff.html`` (rendered diff) and ``<name>.full.txt``
(wikitext of the new revision).
"""

import argparse
import html
import os
from datetime import datetime, timedelta, timezone
from pathlib import Path

import requests

# ==================== Configuration ====================
WIKI_API_URL = "https://wiki.projectdiablo2.com/w/api.php"  # point this at your wiki

# NOTE: the repository's .gitignore excludes both `wiki_sync_output/` and
# `last_sync_timestamp.txt`, so neither path below ends up in version control.
OUTPUT_DIR = Path("wiki_sync_output")
OUTPUT_DIR.mkdir(exist_ok=True)

# Output directory of the current run; created lazily by save_files().
CURRENT_OUTPUT_DIR = None

LAST_TIMESTAMP_FILE = "last_sync_timestamp.txt"

# Seconds to wait for the wiki before giving up; without a timeout a stalled
# connection would hang the whole sync forever.
REQUEST_TIMEOUT = 30

SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": "WikiSyncTool/3.0 (your-email@example.com; MediaWiki Sync Bot)"
})
# =======================================================

# Placeholder bodies used when no real diff is available.
_NO_DIFF_HTML = "<p>无法获取 diff</p>"
_NEW_PAGE_HTML = "<p>新创建页面(无历史版本)</p>"

# Git-diff-like colours for the compare table.  The +/- sign is drawn from the
# cell's `data-marker` attribute via CSS, so the API markup is left untouched.
_DIFF_CSS = """
body { font-family: sans-serif; margin: 1.5em; }
table.diff { width: 100%; border-collapse: collapse; table-layout: fixed;
             font-family: monospace; font-size: 13px; }
table.diff col.diff-marker { width: 2%; }
table.diff col.diff-content { width: 48%; }
td.diff-lineno { font-weight: bold; background: #f1f8ff; color: #586069; padding: 2px 6px; }
td.diff-marker { text-align: right; font-weight: bold; padding: 2px 4px; }
td.diff-marker::before { content: attr(data-marker); }
td.diff-addedline, td.diff-deletedline, td.diff-context {
    padding: 2px 6px; white-space: pre-wrap; word-wrap: break-word; vertical-align: top; }
td.diff-addedline { background: #e6ffed; }
td.diff-deletedline { background: #ffeef0; }
td.diff-context { background: #fafbfc; color: #24292e; }
ins.diffchange { background: #acf2bd; text-decoration: none; }
del.diffchange { background: #fdb8c0; text-decoration: none; }
"""


class WikiApiError(Exception):
    """Raised when the MediaWiki API answers with an ``error`` payload."""


# Everything that means "this API round-trip did not give us usable data":
# transport/HTTP failures, invalid JSON, API-level errors and unexpected
# response shapes.
_REQUEST_ERRORS = (
    requests.RequestException,
    WikiApiError,
    LookupError,
    ValueError,
    TypeError,
    AttributeError,
)


def _api_get(params, show_url=False):
    """Send one GET request to the wiki API and return the decoded JSON.

    Raises ``requests.RequestException`` on transport/HTTP errors,
    ``ValueError`` on invalid JSON and ``WikiApiError`` when the response
    contains an ``error`` member.
    """
    resp = SESSION.get(WIKI_API_URL, params=params, timeout=REQUEST_TIMEOUT)
    if show_url:
        # resp.url is the URL that was actually sent (properly encoded).
        print(f" 请求URL: {resp.url}")
    resp.raise_for_status()
    data = resp.json()
    if "error" in data:
        raise WikiApiError(data["error"])
    return data


def _first_page(data):
    """Return the first entry of ``query.pages`` or ``None`` if there is none."""
    pages = data.get("query", {}).get("pages", {})
    return next(iter(pages.values()), None)


def load_last_timestamp():
    """Return the timestamp stored by the previous run, or ``None``."""
    if not os.path.exists(LAST_TIMESTAMP_FILE):
        return None
    with open(LAST_TIMESTAMP_FILE, encoding="utf-8") as f:
        # An empty file is treated the same as a missing one.
        return f.read().strip() or None


def save_last_timestamp(ts):
    """Persist ``ts`` as the starting point for the next run."""
    with open(LAST_TIMESTAMP_FILE, "w", encoding="utf-8") as f:
        f.write(ts)


def get_recent_changes(since):
    """Return ``{title: (revid, timestamp)}`` for pages changed since ``since``.

    Changes are listed oldest-first, so later entries for the same title
    overwrite earlier ones and each page ends up with its newest revision.
    On an error the entries collected so far are returned (best effort).
    """
    params = {
        "action": "query",
        "list": "recentchanges",
        "rcprop": "title|ids|timestamp",
        "rctype": "edit|new",
        "rcdir": "newer",
        "rcstart": since,
        "rclimit": 500,
        "format": "json",
    }
    latest = {}
    while True:
        try:
            data = _api_get(params)
            for rc in data.get("query", {}).get("recentchanges", []):
                latest[rc["title"]] = (rc["revid"], rc["timestamp"])
            if "continue" not in data:
                break
            # Feed the continuation tokens back in to fetch the next batch.
            params.update(data["continue"])
        except _REQUEST_ERRORS as exc:
            print(f"获取最近更改时出错: {exc}")
            break
    return latest


def _find_old_revid(title, end_time):
    """Return the revid of the newest revision at or before ``end_time``.

    Returns ``None`` when the page has no such revision.  Unlike
    ``get_old_revid`` this lets request errors propagate, so callers can tell
    "new page" apart from "lookup failed".
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "ids|timestamp",
        "rvlimit": 1,  # only the single newest revision <= end_time is needed
        "rvdir": "older",
        "rvstart": end_time,
        "format": "json",
    }
    page = _first_page(_api_get(params, show_url=True))
    revisions = page.get("revisions") if page else None
    if not revisions:
        print(f" 页面 '{title}' 在指定时间前没有找到修订版本")
        return None
    return revisions[0]["revid"]


def get_old_revid(title, end_time):
    """Return the revid of the last revision at or before ``end_time``.

    The result is meant to be used as ``fromrev`` of a compare request.
    Returns ``None`` when no such revision exists or when the lookup fails
    (the error is printed).
    """
    try:
        return _find_old_revid(title, end_time)
    except _REQUEST_ERRORS as exc:
        print(f"获取旧版本ID时出错: {exc}")
        return None


def get_official_diff_and_content(title, from_revid, to_revid):
    """Fetch the diff ``from_revid`` -> ``to_revid`` and the text of ``to_revid``.

    Args:
        title: Page title (kept for interface compatibility; the content is
            looked up by ``to_revid`` so that diff and text always describe
            the same revision, even if the page is edited in between).
        from_revid: Older revision id, or a falsy value for a brand-new page;
            in that case no compare request is made and a placeholder diff is
            returned.
        to_revid: Newer revision id.

    Returns:
        ``(diff_html, full_text, timestamp)`` or ``(None, None, None)`` when
        the content could not be retrieved.
    """
    print(f" 获取diff: fromrev={from_revid}, torev={to_revid}")

    try:
        if from_revid:
            try:
                diff_resp = _api_get({
                    "action": "compare",
                    "fromrev": from_revid,
                    "torev": to_revid,
                    "format": "json",
                })
                print(f" Diff响应: {list(diff_resp.keys())}")
                diff_html = diff_resp.get("compare", {}).get("*", _NO_DIFF_HTML)
            except WikiApiError as exc:
                # Best effort: a refused compare must not prevent saving the
                # full text of the new revision.
                print(f" Diff响应: {exc}")
                diff_html = _NO_DIFF_HTML
        else:
            # Nothing to compare against; an empty fromrev would only
            # produce an API error.
            diff_html = _NEW_PAGE_HTML
        print(f" Diff内容长度: {len(diff_html)} 字符")

        # `revids` cannot be combined with `titles`, hence no title here.
        content = _api_get({
            "action": "query",
            "prop": "revisions",
            "revids": to_revid,
            "rvprop": "content|timestamp",
            "rvslots": "main",
            "format": "json",
        })
        page = _first_page(content)
        if not page or "revisions" not in page:
            return None, None, None
        rev = page["revisions"][0]
        return diff_html, rev["slots"]["main"]["*"], rev["timestamp"]
    except _REQUEST_ERRORS as exc:
        print(f"获取diff和内容时出错: {exc}")
        return None, None, None


def _render_diff_page(title, diff_html, timestamp):
    """Wrap ``diff_html`` into a standalone, styled HTML document."""
    safe_title = html.escape(title)
    safe_timestamp = html.escape(timestamp)
    body = diff_html
    if body.lstrip().startswith("<tr"):
        # The compare API returns bare table rows; they need a table around
        # them to render.  Placeholder paragraphs are emitted unchanged.
        body = (
            '<table class="diff">\n'
            '<colgroup><col class="diff-marker"><col class="diff-content">'
            '<col class="diff-marker"><col class="diff-content"></colgroup>\n'
            f"{body}\n"
            "</table>"
        )
    return f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Diff: {safe_title}</title>
<style>{_DIFF_CSS}</style>
</head>
<body>
<h2>{safe_title}</h2>
<p>修改时间: {safe_timestamp}</p>
{body}
</body>
</html>
"""


def save_files(title, diff_html, full_text, timestamp, note="", revid=None):
    """Write the diff page and the full wikitext of one page to disk.

    Args:
        title: Page title; also used (sanitised) as the file name stem.
        diff_html: Diff markup as returned by the compare API, or a
            placeholder paragraph.
        full_text: Wikitext of the new revision.
        timestamp: ISO timestamp of the new revision.
        note: Unused; kept so existing callers keep working.
        revid: Optional revision id appended to the file name.

    Returns:
        ``True`` when both files were written, ``False`` otherwise (the error
        is printed).
    """
    global CURRENT_OUTPUT_DIR

    # All files of one run share a directory named after the run's start time.
    if CURRENT_OUTPUT_DIR is None:
        run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        CURRENT_OUTPUT_DIR = OUTPUT_DIR / run_stamp
        # parents=True: OUTPUT_DIR may have been removed since import time.
        CURRENT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        print(f"创建本次执行的输出目录: {CURRENT_OUTPUT_DIR}")

    safe_title = "".join(c if c.isalnum() or c in " -_." else "_" for c in title)
    # "2025-11-28T00:00:00Z" -> "20251128_000000"
    time_str = timestamp[:19].replace("-", "").replace(":", "").replace("T", "_")
    base_filename = f"{safe_title}-{time_str}-{revid}" if revid else f"{safe_title}-{time_str}"

    diff_file = CURRENT_OUTPUT_DIR / f"{base_filename}.diff.html"
    full_file = CURRENT_OUTPUT_DIR / f"{base_filename}.full.txt"

    try:
        diff_file.write_text(_render_diff_page(title, diff_html, timestamp), encoding="utf-8")
        full_file.write_text(full_text, encoding="utf-8")
    except (OSError, ValueError) as exc:
        print(f" → 保存文件时出错: {exc}")
        return False

    print(f" → 已保存: {diff_file.relative_to(OUTPUT_DIR)}")
    print(f" → 已保存: {full_file.relative_to(OUTPUT_DIR)}")
    print(f" → 完整路径: {diff_file}")
    print(f" → 完整路径: {full_file}")
    return True


def process_single_page(title, since_time, update_timestamp=False, skip_unchanged=False):
    """Sync one page: diff from its state at ``since_time`` to its newest revision.

    Args:
        title: Page title.
        since_time: Timestamp whose revision serves as the diff base.
        update_timestamp: Store the page's newest timestamp as the global
            last-sync timestamp after a successful sync.
        skip_unchanged: When true, write nothing if the newest revision is
            the same one that was current at ``since_time``.

    Returns:
        The timestamp of the page's newest revision on success (or when the
        page was skipped as unchanged), ``None`` when the page is missing or
        anything failed.
    """
    print(f"正在单独处理页面:{title}")

    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvprop": "ids|timestamp",
        "rvlimit": 1,
        "format": "json",
    }
    try:
        page = _first_page(_api_get(params))
        if not page or "revisions" not in page:
            print("页面不存在或被删除")
            return None
        newest = page["revisions"][0]
        latest_revid = newest["revid"]
        latest_ts = newest["timestamp"]

        # Uses the raising variant so that a failed lookup is not mistaken
        # for "page has no older revision".
        old_revid = _find_old_revid(title, since_time)
    except _REQUEST_ERRORS as exc:
        print(f"处理页面 '{title}' 时出错: {exc}")
        return None

    if skip_unchanged and old_revid == latest_revid:
        print(" 自指定时间以来没有新的修订,跳过")
        return latest_ts

    diff_html, full_text, new_ts = get_official_diff_and_content(title, old_revid, latest_revid)
    if diff_html is None or full_text is None:
        print(" 警告: 未能获取完整的差异或内容数据")
        return None

    if not save_files(title, diff_html, full_text, new_ts, "", latest_revid):
        return None

    if update_timestamp:
        try:
            save_last_timestamp(latest_ts)
        except OSError as exc:
            print(f"处理页面 '{title}' 时出错: {exc}")
            return None
        print(f"已更新全局时间戳 → {latest_ts}")

    return latest_ts


def process_all_pages_since(since_time):
    """Sync every page that changed since ``since_time`` and store the new timestamp."""
    print("正在获取最近变更列表...")
    changes = get_recent_changes(since_time)
    if not changes:
        print("没有发现任何变更")
        return

    # Only timestamps reported by the wiki are compared with each other;
    # `since_time` may be in a different (user supplied) format.
    newest_ts = None
    failed_titles = []
    for title in changes:
        print(f"\n处理:{title}")
        # `rcstart` is inclusive, so the revision synced last time shows up
        # again; skip_unchanged keeps it from being written a second time.
        page_ts = process_single_page(title, since_time, skip_unchanged=True)
        if page_ts is None:
            failed_titles.append(title)
        elif newest_ts is None or page_ts > newest_ts:
            newest_ts = page_ts

    if failed_titles:
        print(f"\n警告: 以下 {len(failed_titles)} 个页面未能同步,可使用 --title 单独重试:")
        for title in failed_titles:
            print(f"  - {title}")

    if newest_ts is None:
        print("\n没有任何页面同步成功,时间戳保持不变")
        return

    save_last_timestamp(newest_ts)
    print(f"\n全量同步完成!本次最新时间戳已更新为:{newest_ts}")
    print(f"文件保存在:{CURRENT_OUTPUT_DIR.resolve() if CURRENT_OUTPUT_DIR else OUTPUT_DIR.resolve()}")


def main():
    """Parse the command line and run the requested sync mode."""
    parser = argparse.ArgumentParser(description="MediaWiki 同步工具 - 支持全量/单页/自定义时间")
    parser.add_argument("--since", type=str, help="强制从指定时间开始同步,格式如 2025-11-28T00:00:00Z")
    parser.add_argument("--title", type=str, help="只同步指定的单个页面标题")
    parser.add_argument("--update-timestamp", action="store_true",
                        help="在单页模式下,完成后仍然更新全局 last_sync_timestamp.txt")
    parser.add_argument("--run", action="store_true",
                        help="执行同步操作(必须提供此参数才能真正执行同步)")

    args = parser.parse_args()

    # Safety switch: without --run only the help text is shown.
    if not args.run:
        parser.print_help()
        return

    # Work out the starting point: command line > stored timestamp > 24h ago.
    if args.since:
        since_time = args.since
        print(f"使用命令行指定的时间起点:{since_time}")
    else:
        since_time = load_last_timestamp()
        if since_time:
            print(f"使用上次记录的时间起点:{since_time}")
        else:
            one_day_ago = datetime.now(timezone.utc) - timedelta(days=1)
            since_time = one_day_ago.strftime("%Y-%m-%dT%H:%M:%SZ")
            print(f"未找到上次同步记录,默认从 24 小时前开始:{since_time}")

    # Single-page mode.
    if args.title is not None:
        title = args.title.strip()
        if not title:
            parser.error("--title 不能为空")
        process_single_page(title, since_time, args.update_timestamp)
        return

    # Full mode (re-uses the single-page logic for every changed page).
    process_all_pages_since(since_time)


if __name__ == "__main__":
    main()