From b4e11b037cca863e1016d87375b5213b8706909c Mon Sep 17 00:00:00 2001 From: tukuaiai Date: Tue, 10 Feb 2026 17:09:49 +0800 Subject: [PATCH] feat: skills - add markdown-to-epub --- i18n/zh/skills/05-生产力/AGENTS.md | 30 ++ .../05-生产力/markdown-to-epub/SKILL.md | 92 +++++ .../markdown-to-epub/agents/openai.yaml | 4 + .../markdown-to-epub/scripts/build_epub.py | 389 ++++++++++++++++++ 4 files changed, 515 insertions(+) create mode 100644 i18n/zh/skills/05-生产力/AGENTS.md create mode 100644 i18n/zh/skills/05-生产力/markdown-to-epub/SKILL.md create mode 100644 i18n/zh/skills/05-生产力/markdown-to-epub/agents/openai.yaml create mode 100644 i18n/zh/skills/05-生产力/markdown-to-epub/scripts/build_epub.py diff --git a/i18n/zh/skills/05-生产力/AGENTS.md b/i18n/zh/skills/05-生产力/AGENTS.md new file mode 100644 index 0000000..45e889a --- /dev/null +++ b/i18n/zh/skills/05-生产力/AGENTS.md @@ -0,0 +1,30 @@ +# AGENTS.md(i18n/zh/skills/05-生产力) + +本目录用于收纳「生产力类」技能:偏向内容生产、格式转换与交付物构建。 + +## 目录结构 + +```text +i18n/zh/skills/05-生产力/ +├── AGENTS.md +└── markdown-to-epub/ + ├── SKILL.md + ├── agents/ + │ └── openai.yaml + └── scripts/ + └── build_epub.py +``` + +## 模块职责与边界 + +- `markdown-to-epub/`:将 Markdown 手稿 + 本地图片资产稳定转换为 EPUB,并做最小可用的完整性校验。 + - `markdown-to-epub/SKILL.md`:面向使用者的入口文档(触发条件、边界、快速上手、排错)。 + - `markdown-to-epub/agents/openai.yaml`:Codex Skill 的交互入口元数据(展示名、默认提示语)。 + - `markdown-to-epub/scripts/build_epub.py`:核心实现脚本(重写图片引用、拷贝资产、调用 `ebook-convert`、输出报告)。 + +## 依赖与上下游 + +- 上游输入:Markdown 手稿文件、同目录或指定根目录下的本地图片。 +- 外部依赖:Calibre `ebook-convert`(用于实际转换)。 +- 下游输出:EPUB 文件 + `build_dir/` 工作目录(规范化 Markdown、assets、转换日志、报告 JSON)。 + diff --git a/i18n/zh/skills/05-生产力/markdown-to-epub/SKILL.md b/i18n/zh/skills/05-生产力/markdown-to-epub/SKILL.md new file mode 100644 index 0000000..b176f6b --- /dev/null +++ b/i18n/zh/skills/05-生产力/markdown-to-epub/SKILL.md @@ -0,0 +1,92 @@ +--- +name: markdown-to-epub +description: "将 Markdown 手稿与本地图片资产转换为可校验的 EPUB:修复/归一化图片引用与扩展名,保持标题层级 TOC,并做基础包结构检查。" +--- + +# markdown-to-epub Skill 
+ +把 Markdown 手稿(含本地图片)稳定构建为 EPUB:规范化图片引用、拷贝资产到可重复的构建目录、调用 Calibre `ebook-convert` 转换,并输出可核查报告。 + +## When to Use This Skill + +触发条件(满足其一即可): +- 需要把一份(或多份)Markdown 手稿打包交付为 EPUB。 +- 图片引用混乱(URL 编码、路径飘忽、扩展名不可信如 `.bin/.idunno`),需要自动归一化。 +- 需要在转换后做最基本的 EPUB 包结构检查(OPF/NCX/NAV、图片数量等)。 + +## Not For / Boundaries + +- 不负责生成/改写正文内容(不会修改源手稿,只在构建目录里产出规范化版本)。 +- 不下载远程图片(`http(s)`/`data:` 引用会保持原样)。 +- 不替代真正的排版/校对流程(这里只做可交付构建与结构验证)。 + +## Quick Start + +从仓库根目录执行(推荐 `python3`): + +```bash +python3 i18n/zh/skills/05-生产力/markdown-to-epub/scripts/build_epub.py \ + --input-md "./book.md" \ + --output-epub "./book.epub" \ + --title "Book Title" \ + --authors "Author Name" \ + --language "zh-CN" +``` + +脚本会创建构建工作区(默认 `build_epub/`),包含: +- `book.normalized.md` +- `assets/`:拷贝后的图片(会按真实文件签名推断扩展名) +- `conversion.log` +- `report.json` + +## 依赖 + +- 需要安装 Calibre,并确保 `ebook-convert` 在 `PATH` 中(或用 `--ebook-convert-bin` 指定路径)。 + +## Missing Asset Recovery + +如果 Markdown 里引用了图片但文件找不到,可以提供一个 JSON 映射表(按「basename」匹配): + +```json +{ + "missing-file.idunno": "replacement-file.idunno" +} +``` + +然后重跑(示例): + +```bash +python3 i18n/zh/skills/05-生产力/markdown-to-epub/scripts/build_epub.py \ + --input-md "./book.md" \ + --output-epub "./book.epub" \ + --fallback-map "./fallback-map.json" +``` + +## Operational Rules + +- 优先使用 `ebook-convert`;缺失时明确报错并快速失败。 +- 源手稿只读;所有输出写入 `build_dir/`。 +- TOC 以标题层级(`h1/h2/h3`)为准。 +- 缺失资产必须显式报告;严格模式下不允许静默跳过。 +- 命令保持非交互式。 + +## Script Interface + +`scripts/build_epub.py` 参数: +- `--input-md`(必选):源 Markdown 路径 +- `--output-epub`(可选):输出 EPUB 路径,默认为当前工作目录下与输入 Markdown 同名的 `.epub` 文件 +- `--source-root`(可选):解析图片引用的根目录,默认使用 Markdown 所在目录 +- `--build-dir`(可选):构建工作区目录,默认为当前工作目录下的 `build_epub/` +- `--fallback-map`(可选):JSON 映射(缺失图片 basename → 替换 basename) +- `--title` / `--authors` / `--language`:传给 `ebook-convert` 的元数据 +- `--input-encoding`:输入 Markdown 编码,默认 `utf-8` +- `--strict-missing`:严格模式(有任何本地图片无法解析则失败,默认开启) +- `--no-strict-missing`:关闭严格模式(保留未解析链接,继续转换) +- `--ebook-convert-bin`:`ebook-convert` 可执行文件名/路径,默认 
`ebook-convert` + +## Validation Checklist + +- 确认 EPUB 文件生成且大小不是「几 KB 的空壳」。 +- 确认 EPUB(zip)内包含 OPF 与 NCX/NAV。 +- 确认 EPUB 内图片数量不低于对手稿的预期。 +- 严格模式下确认 `report.json` 的 `missing_images` 为空。 diff --git a/i18n/zh/skills/05-生产力/markdown-to-epub/agents/openai.yaml b/i18n/zh/skills/05-生产力/markdown-to-epub/agents/openai.yaml new file mode 100644 index 0000000..e9c1986 --- /dev/null +++ b/i18n/zh/skills/05-生产力/markdown-to-epub/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "Markdown → EPUB 构建器" + short_description: "把 Markdown 手稿 + 本地图片资产转换为可校验的 EPUB。" + default_prompt: "使用 $markdown-to-epub 把我的 Markdown 手稿和本地图片资产转换成可校验的 EPUB 文件。" diff --git a/i18n/zh/skills/05-生产力/markdown-to-epub/scripts/build_epub.py b/i18n/zh/skills/05-生产力/markdown-to-epub/scripts/build_epub.py new file mode 100644 index 0000000..0232e94 --- /dev/null +++ b/i18n/zh/skills/05-生产力/markdown-to-epub/scripts/build_epub.py @@ -0,0 +1,389 @@ +#!/usr/bin/env python3 +""" +Build a robust EPUB from Markdown with local image assets. 
# build_epub.py - build an EPUB from a Markdown manuscript and its local images.
#
# Features:
# - Normalize Markdown image references into build_dir/assets
# - Detect real image extensions from file signatures (.png/.jpg/.gif/.webp/.bmp/.svg)
# - Optionally resolve missing files via a fallback JSON map
# - Convert using Calibre ebook-convert
# - Emit a conversion report JSON for verification

from __future__ import annotations

import argparse
import json
import re
import shutil
import subprocess
import sys
import urllib.parse
import zipfile
from dataclasses import dataclass
from hashlib import sha1
from pathlib import Path
from typing import Dict, List, Optional, Tuple


IMAGE_PATTERN = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
REMOTE_PREFIXES = ("http://", "https://", "data:")
VALID_IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg", ".bmp"}

# Optional trailing link title inside the parentheses: ![alt](path "title").
# Only *quoted* titles are recognised so that bare paths containing spaces
# (common in exported Markdown) keep resolving as a single path.
TITLE_SUFFIX_PATTERN = re.compile(
    r"""^(?P<target>.*?)(?P<title>\s+(?:"[^"]*"|'[^']*'))$""",
    re.DOTALL,
)
# File names inside the EPUB archive that count as images.
EPUB_IMAGE_NAME_PATTERN = re.compile(
    r"\.(png|jpg|jpeg|gif|svg|webp|bmp)$", flags=re.IGNORECASE
)


@dataclass
class RewriteResult:
    """Outcome of rewriting image references and staging assets."""

    # Rewritten Markdown file written into the build directory.
    normalized_markdown: Path
    # Directory that received the copied image files.
    assets_dir: Path
    # Number of image references matched in the manuscript (remote ones included).
    total_refs: int
    # Number of references that were rewritten to point into assets_dir.
    rewritten_refs: int
    # Number of distinct source files copied.
    copied_assets: int
    # Sorted, de-duplicated basenames of local references that could not be resolved.
    missing_images: List[str]


def _looks_like_svg(data: bytes) -> bool:
    """Return True when the leading bytes look like an SVG/XML document."""
    # Skip a UTF-8 BOM and leading whitespace before sniffing the prolog.
    head = data[:4096].lstrip(b"\xef\xbb\xbf \t\r\n").lower()
    return head.startswith((b"<?xml", b"<svg", b"<!doctype svg")) and b"<svg" in head


def detect_extension(file_path: Path, data: bytes) -> str:
    """Pick the extension to use for a copied asset.

    The file content wins over the file name: known binary signatures
    (PNG, JPEG, GIF, WebP, BMP) and an SVG text sniff are checked first.

    Args:
        file_path: Source path; only its name and suffix are consulted.
        data: Full file content.

    Returns:
        A lower-case extension including the dot. When nothing matches, the
        current suffix is kept if it is a known image suffix, otherwise
        ``".bin"`` is returned.
    """
    if file_path.name.lower().endswith(".svg"):
        return ".svg"
    if data.startswith(b"\x89PNG\r\n\x1a\n"):
        return ".png"
    if data.startswith(b"\xff\xd8\xff"):
        return ".jpg"
    if data.startswith((b"GIF87a", b"GIF89a")):
        return ".gif"
    if data.startswith(b"RIFF") and len(data) >= 12 and data[8:12] == b"WEBP":
        return ".webp"
    if data.startswith(b"BM"):
        return ".bmp"
    # SVG has no magic number; sniff the text prolog so that SVG files with an
    # untrustworthy extension are still recognised.
    if _looks_like_svg(data):
        return ".svg"
    current_ext = file_path.suffix.lower()
    if current_ext in VALID_IMAGE_EXTS:
        return current_ext
    return ".bin"


def decode_reference(reference: str) -> str:
    """Strip surrounding whitespace and percent-decode an image reference."""
    return urllib.parse.unquote(reference.strip())


def split_image_destination(raw: str) -> Tuple[str, str]:
    """Split the text inside ``(...)`` of an image into target and title.

    Handles the two Markdown forms that would otherwise be mistaken for part
    of the file name: ``<path with spaces>`` and ``path "title"``.

    Args:
        raw: Text captured between the parentheses of an image reference.

    Returns:
        ``(target, title_suffix)``. ``title_suffix`` is either empty or the
        original title including its leading whitespace, ready to be appended
        after the rewritten target.
    """
    text = raw.strip()
    if text.startswith("<"):
        end = text.find(">")
        if end != -1:
            return text[1:end], text[end + 1:].rstrip()
    match = TITLE_SUFFIX_PATTERN.match(text)
    if match and match.group("target").strip():
        return match.group("target").strip(), match.group("title")
    return text, ""


def resolve_source_file(
    source_root: Path,
    decoded_ref: str,
    fallback_map: Dict[str, str],
) -> Tuple[Optional[Path], str]:
    """Locate the file an image reference points to.

    Lookup order: the reference relative to ``source_root`` (skipped for
    absolute references), then the bare basename directly under
    ``source_root`` (covers exported "folder/file" style references), then
    the fallback-map entry for that basename.

    Args:
        source_root: Directory image references are resolved against.
        decoded_ref: Percent-decoded reference text.
        fallback_map: Mapping of missing basename -> replacement file name.

    Returns:
        ``(path, basename)`` where ``path`` is ``None`` if nothing was found.
    """
    decoded_ref = decoded_ref.replace("\\", "/")
    basename = Path(decoded_ref).name
    candidates: List[Path] = []

    # Keep the relative path when possible.
    rel_path = Path(decoded_ref)
    if not rel_path.is_absolute():
        candidates.append((source_root / rel_path).resolve())

    # Basename directly under the root.
    candidates.append((source_root / basename).resolve())

    # Compare exact paths: lower-casing would conflate files that differ only
    # by case on case-sensitive filesystems.
    checked = set()
    for candidate in candidates:
        if candidate in checked:
            continue
        checked.add(candidate)
        if candidate.is_file():
            return candidate, basename

    fallback_name = fallback_map.get(basename)
    if fallback_name:
        fallback_candidate = (source_root / fallback_name).resolve()
        if fallback_candidate.is_file():
            return fallback_candidate, basename

    return None, basename


def rewrite_markdown_and_copy_assets(
    input_md: Path,
    source_root: Path,
    build_dir: Path,
    input_encoding: str,
    fallback_map: Dict[str, str],
    strict_missing: bool,
) -> RewriteResult:
    """Copy referenced local images into ``build_dir/assets`` and rewrite links.

    The source manuscript is only read. The rewritten text is written to
    ``build_dir/book.normalized.md`` as UTF-8. Remote (``http(s)``/``data:``)
    references are left untouched, and link titles are preserved.

    Args:
        input_md: Source Markdown file.
        source_root: Directory image references are resolved against.
        build_dir: Working directory receiving the outputs.
        input_encoding: Encoding used to read ``input_md``.
        fallback_map: Mapping of missing basename -> replacement file name.
        strict_missing: Fail when any local reference cannot be resolved.

    Returns:
        A ``RewriteResult`` with counters and the list of missing basenames.

    Raises:
        FileNotFoundError: In strict mode, when local images are missing. The
            normalized Markdown has already been written at that point.
    """
    assets_dir = build_dir / "assets"
    assets_dir.mkdir(parents=True, exist_ok=True)

    text = input_md.read_text(encoding=input_encoding)
    # Resolved source path -> file name used inside assets_dir.
    copied_name_by_source: Dict[str, str] = {}
    missing_images: List[str] = []
    total_refs = 0
    rewritten_refs = 0

    def replace(match: re.Match[str]) -> str:
        nonlocal total_refs, rewritten_refs
        total_refs += 1
        alt_text = match.group(1)
        target, title_suffix = split_image_destination(match.group(2))

        if target.lower().startswith(REMOTE_PREFIXES):
            return match.group(0)

        decoded = decode_reference(target)
        source_file, missing_name = resolve_source_file(source_root, decoded, fallback_map)
        if source_file is None:
            missing_images.append(missing_name)
            return match.group(0)

        source_key = str(source_file.resolve())
        if source_key in copied_name_by_source:
            target_name = copied_name_by_source[source_key]
        else:
            data = source_file.read_bytes()
            ext = detect_extension(source_file, data)
            target_name = f"{source_file.stem}{ext}"
            target_path = assets_dir / target_name
            # Different files sharing a stem must not overwrite each other:
            # disambiguate with a short content digest.
            if target_path.exists() and target_path.read_bytes() != data:
                digest = sha1(data).hexdigest()[:8]
                target_name = f"{source_file.stem}-{digest}{ext}"
                target_path = assets_dir / target_name
            target_path.write_bytes(data)
            copied_name_by_source[source_key] = target_name

        rewritten_refs += 1
        return f"![{alt_text}](assets/{target_name}{title_suffix})"

    rewritten = IMAGE_PATTERN.sub(replace, text)
    normalized_md = build_dir / "book.normalized.md"
    normalized_md.write_text(rewritten, encoding="utf-8")

    unique_missing = sorted(set(missing_images))
    if strict_missing and unique_missing:
        msg = (
            "Missing local image files detected. "
            f"Count={len(unique_missing)}; examples={unique_missing[:10]}"
        )
        raise FileNotFoundError(msg)

    return RewriteResult(
        normalized_markdown=normalized_md,
        assets_dir=assets_dir,
        total_refs=total_refs,
        rewritten_refs=rewritten_refs,
        copied_assets=len(copied_name_by_source),
        missing_images=unique_missing,
    )


def run_ebook_convert(
    ebook_convert_bin: str,
    normalized_md: Path,
    output_epub: Path,
    title: Optional[str],
    authors: Optional[str],
    language: Optional[str],
    input_encoding: str,
    conversion_log: Path,
) -> None:
    """Run Calibre's ``ebook-convert`` and record its output.

    The command line, stdout, stderr and exit code are written to
    ``conversion_log`` whether or not the conversion succeeds.

    Args:
        ebook_convert_bin: Executable name or path.
        normalized_md: Markdown file to convert.
        output_epub: Destination EPUB path.
        title: Optional title metadata.
        authors: Optional authors metadata.
        language: Optional language metadata.
        input_encoding: Encoding of ``normalized_md``.
        conversion_log: File receiving the captured process output.

    Raises:
        FileNotFoundError: When the executable cannot be found.
        RuntimeError: When the process exits with a non-zero status.
    """
    # Fail fast with an actionable message instead of a bare OS error.
    if shutil.which(ebook_convert_bin) is None:
        raise FileNotFoundError(
            f"ebook-convert executable not found: {ebook_convert_bin!r}. "
            "Install Calibre or pass --ebook-convert-bin with the full path."
        )

    cmd = [
        ebook_convert_bin,
        str(normalized_md),
        str(output_epub),
        "--input-encoding",
        input_encoding,
        "--level1-toc",
        "//h:h1",
        "--level2-toc",
        "//h:h2",
        "--level3-toc",
        "//h:h3",
    ]

    if title:
        cmd.extend(["--title", title])
    if authors:
        cmd.extend(["--authors", authors])
    if language:
        cmd.extend(["--language", language])

    proc = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8", errors="replace")
    conversion_log.write_text(
        "\n".join(
            [
                f"COMMAND: {' '.join(cmd)}",
                "",
                "STDOUT:",
                proc.stdout,
                "",
                "STDERR:",
                proc.stderr,
                "",
                f"EXIT_CODE: {proc.returncode}",
            ]
        ),
        encoding="utf-8",
    )
    if proc.returncode != 0:
        raise RuntimeError(f"ebook-convert failed with exit code {proc.returncode}")


def inspect_epub(epub_file: Path) -> Dict[str, object]:
    """Collect basic structural facts about a generated EPUB archive.

    NOTE(review): the original return statement was not recoverable from the
    source; the key names below are a reconstruction. Confirm them against
    any consumer of ``report.json``.

    Args:
        epub_file: EPUB (zip) file to inspect.

    Returns:
        A dict with the file size, the number of archive entries, the number
        of image entries, whether an OPF and an NCX/NAV entry exist, and the
        number of ``<navPoint`` elements found in NCX entries.

    Raises:
        FileNotFoundError: When ``epub_file`` does not exist.
    """
    if not epub_file.exists():
        raise FileNotFoundError(f"EPUB not found: {epub_file}")

    with zipfile.ZipFile(epub_file) as zf:
        names = zf.namelist()
        image_files = [n for n in names if EPUB_IMAGE_NAME_PATTERN.search(n)]
        has_opf = any(n.lower().endswith(".opf") for n in names)
        has_ncx_or_nav = any(n.lower().endswith(".ncx") or "nav" in n.lower() for n in names)
        nav_points = 0
        for name in names:
            if name.lower().endswith(".ncx"):
                content = zf.read(name).decode("utf-8", errors="ignore")
                nav_points += len(re.findall(r"<navPoint\b", content))

    return {
        "file_size_bytes": epub_file.stat().st_size,
        "entry_count": len(names),
        "image_count": len(image_files),
        "has_opf": has_opf,
        "has_ncx_or_nav": has_ncx_or_nav,
        "nav_points": nav_points,
    }


def load_fallback_map(path: Optional[Path]) -> Dict[str, str]:
    """Load the optional ``missing basename -> replacement`` JSON mapping.

    Args:
        path: JSON file to read, or ``None`` when no map was supplied.

    Returns:
        The string-to-string entries of the JSON object; entries whose value
        is not a string are skipped. Empty when ``path`` is ``None``.

    Raises:
        ValueError: When the JSON document is not an object.
    """
    if path is None:
        return {}
    content = path.read_text(encoding="utf-8-sig")
    raw = json.loads(content)
    if not isinstance(raw, dict):
        raise ValueError("--fallback-map must be a JSON object")
    return {
        key: value
        for key, value in raw.items()
        if isinstance(key, str) and isinstance(value, str)
    }


def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse command-line options.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="从 Markdown 与本地图片资产构建 EPUB。")
    parser.add_argument("--input-md", required=True, type=Path, help="源 Markdown 路径。")
    parser.add_argument(
        "--output-epub",
        type=Path,
        help="输出 EPUB 路径。默认:当前目录下与输入 Markdown 同名的 .epub 文件。",
    )
    parser.add_argument(
        "--source-root",
        type=Path,
        help="解析图片引用的根目录。默认:Markdown 所在目录。",
    )
    parser.add_argument(
        "--build-dir",
        type=Path,
        default=Path.cwd() / "build_epub",
        help="构建工作区目录(规范化 Markdown / assets / 日志 / 报告)。",
    )
    parser.add_argument(
        "--fallback-map",
        type=Path,
        help="JSON 映射:缺失图片 basename → 替换 basename。",
    )
    parser.add_argument("--title", help="EPUB 标题元数据。")
    parser.add_argument("--authors", help="EPUB 作者元数据。")
    parser.add_argument("--language", default="zh-CN", help="EPUB 语言元数据。")
    parser.add_argument("--input-encoding", default="utf-8", help="输入 Markdown 编码。")
    parser.add_argument("--ebook-convert-bin", default="ebook-convert", help="ebook-convert 可执行文件名/路径。")
    parser.add_argument(
        "--strict-missing",
        action="store_true",
        default=True,
        help="严格模式:任何本地图片无法解析则失败(默认开启)。",
    )
    parser.add_argument(
        "--no-strict-missing",
        action="store_false",
        dest="strict_missing",
        help="关闭严格模式:即使存在未解析的本地图片引用也继续转换。",
    )
    parser.add_argument(
        "--clean-build-dir",
        action="store_true",
        help="转换前清空 build-dir。",
    )
    return parser.parse_args(argv)


def _ensure_safe_to_clean(build_dir: Path, protected: List[Path]) -> None:
    """Refuse to wipe a build directory that contains any input path."""
    for path in protected:
        if path == build_dir or build_dir in path.parents:
            raise ValueError(
                f"Refusing to clean build dir {build_dir}: it contains input path {path}"
            )


def main(argv: Optional[List[str]] = None) -> int:
    """Run the full pipeline: stage assets, convert, inspect, and report.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        Process exit status (0 on success); failures propagate as exceptions.
    """
    args = parse_args(argv)

    input_md = args.input_md.resolve()
    if not input_md.exists():
        raise FileNotFoundError(f"Markdown not found: {input_md}")

    output_epub = (
        args.output_epub.resolve()
        if args.output_epub
        else (Path.cwd() / f"{input_md.stem}.epub").resolve()
    )
    source_root = args.source_root.resolve() if args.source_root else input_md.parent.resolve()
    build_dir = args.build_dir.resolve()
    fallback_path = args.fallback_map.resolve() if args.fallback_map else None

    if args.clean_build_dir and build_dir.exists():
        # The manuscript is read-only input; never delete it along with the
        # build directory.
        protected = [p for p in (input_md, source_root, fallback_path) if p is not None]
        _ensure_safe_to_clean(build_dir, protected)
        shutil.rmtree(build_dir)
    build_dir.mkdir(parents=True, exist_ok=True)
    output_epub.parent.mkdir(parents=True, exist_ok=True)

    fallback_map = load_fallback_map(fallback_path)

    rewrite_result = rewrite_markdown_and_copy_assets(
        input_md=input_md,
        source_root=source_root,
        build_dir=build_dir,
        input_encoding=args.input_encoding,
        fallback_map=fallback_map,
        strict_missing=args.strict_missing,
    )

    conversion_log = build_dir / "conversion.log"
    run_ebook_convert(
        ebook_convert_bin=args.ebook_convert_bin,
        normalized_md=rewrite_result.normalized_markdown,
        output_epub=output_epub,
        title=args.title,
        authors=args.authors,
        language=args.language,
        # The normalized manuscript is always written as UTF-8, regardless of
        # the encoding of the original input.
        input_encoding="utf-8",
        conversion_log=conversion_log,
    )

    epub_info = inspect_epub(output_epub)
    report = {
        "input_markdown": str(input_md),
        "output_epub": str(output_epub),
        "build_dir": str(build_dir),
        "total_image_refs": rewrite_result.total_refs,
        "rewritten_image_refs": rewrite_result.rewritten_refs,
        "copied_assets": rewrite_result.copied_assets,
        "missing_images": rewrite_result.missing_images,
        "epub": epub_info,
    }
    report_path = build_dir / "report.json"
    report_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")

    print(json.dumps(report, ensure_ascii=False, indent=2))
    return 0


if __name__ == "__main__":
    try:
        raise SystemExit(main())
    except Exception as exc:  # pragma: no cover
        print(f"错误:{exc}", file=sys.stderr)
        raise