vibe-coding-cn/assets/repo/prompts-library/scripts/md_to_jsonl.py

74 lines
2.1 KiB
Python

#!/usr/bin/env python3
"""
将 prompt_docs 目录下的 md 文件转换为 JSONL 格式
用法:
python md_to_jsonl.py <prompt_docs目录>
python md_to_jsonl.py prompt_docs/prompt_docs_2025_1222_004537
"""
import json
import re
import sys
from pathlib import Path
REPO_ROOT = Path(__file__).resolve().parent.parent
OUTPUT_DIR = REPO_ROOT / "prompt_jsonl"
def convert(docs_dir: Path):
prompts_dir = docs_dir / "prompts"
if not prompts_dir.exists():
print(f"❌ 找不到 prompts 目录: {prompts_dir}")
return
# 输出文件名基于输入目录名
output_file = OUTPUT_DIR / f"{docs_dir.name}.jsonl"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
records = []
for category_dir in sorted(prompts_dir.iterdir()):
if not category_dir.is_dir():
continue
m = re.match(r'\((\d+)\)_(.+)', category_dir.name)
cat_id, cat_name = (m.groups() if m else (0, category_dir.name))
for md_file in sorted(category_dir.glob("*.md")):
if md_file.name == "index.md":
continue
fm = re.match(r'\((\d+),(\d+)\)_(.+)\.md', md_file.name)
if not fm:
continue
row, col, title = fm.groups()
content = md_file.read_text(encoding='utf-8')
records.append({
"category_id": int(cat_id),
"category": cat_name,
"row": int(row),
"col": int(col),
"title": title[:80],
"content": content
})
with open(output_file, 'w', encoding='utf-8') as f:
for r in records:
f.write(json.dumps(r, ensure_ascii=False) + '\n')
print(f"✅ 转换完成: {len(records)} 条 → {output_file}")
def main():
if len(sys.argv) < 2:
print(__doc__)
sys.exit(1)
docs_dir = Path(sys.argv[1])
if not docs_dir.is_absolute():
docs_dir = REPO_ROOT / docs_dir
convert(docs_dir)
if __name__ == "__main__":
main()