vibe-coding-cn/assets/repo/prompts-library/scripts/verify_integrity.py

110 lines
3.6 KiB
Python

import json
import os
from collections import defaultdict
# Path of the JSONL dataset that verify() checks for existence and then reads.
# It is a relative path, so it is resolved against the current working
# directory at run time.
jsonl_path = "prompt_jsonl/prompt_docs_refactored.jsonl"
# Root directory that verify() walks to count Markdown documents (every
# "*.md" file whose name does not start with "index"). Also a relative path.
docs_root = "prompt_docs/prompt_docs_refactored/prompts"
def _safe_len(value):
    """Return len(value), or 0 when the value has no length (e.g. a number)."""
    try:
        return len(value)
    except TypeError:
        return 0


def _load_jsonl(path, errors):
    """Read *path* as JSON Lines and return the records that are JSON objects.

    Blank lines are skipped. A line that is not valid JSON, or that decodes to
    something other than an object, is reported on stdout and also recorded in
    *errors* so that it shows up in the final summary.
    """
    records = []
    with open(path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            text = line.strip()
            if not text:
                continue
            try:
                record = json.loads(text)
            except json.JSONDecodeError:
                # Preview the stripped text so a short line does not drag its
                # trailing newline into the message.
                print(f"❌ 错误: 发现无效的 JSON 行: {text[:50]}...")
                errors.append(f"❌ 无效的 JSON 行 (Line {line_no}): {text[:50]}...")
                continue
            if not isinstance(record, dict):
                # The per-item checks rely on dict.get(); anything else would crash.
                print(f"❌ 错误: 第 {line_no} 行不是 JSON 对象: {text[:50]}...")
                errors.append(f"❌ 非对象的 JSON 行 (Line {line_no}): {text[:50]}...")
                continue
            records.append(record)
    return records


def _check_items(data, expected_categories, errors):
    """Run the per-record rules and return the row values grouped by category.

    Problems are appended to *errors*. Records with missing or non-string
    fields are reported instead of raising.
    """
    rows_by_category = defaultdict(list)
    for item in data:
        raw_cat = item.get('category')
        # A missing/non-string category is rendered as text so it can be used
        # as a grouping key and padded when printed.
        cat = raw_cat if isinstance(raw_cat, str) else str(raw_cat)
        row = item.get('row')
        col = item.get('col')
        title = item.get('title')
        content = item.get('content')

        # Collected for the row-continuity check.
        rows_by_category[cat].append(row)

        # Check 1: the category must be one of the known ones.
        if cat not in expected_categories:
            # The title may be absent or not a string; never slice it blindly.
            title_preview = title[:20] if isinstance(title, str) else repr(title)
            errors.append(f"❌ 未知分类: '{cat}' (Title: {title_preview}...)")

        # Check 2: every record must sit in column 1.
        if col != 1:
            errors.append(f"❌ 列未归位: Category '{cat}', Row {row}, Col {col} (应为 1)")

        # Check 3: basic completeness of title and content.
        if not title:
            errors.append(f"⚠️ 警告: 标题为空 (Category '{cat}', Row {row})")
        content_len = _safe_len(content) if content else 0
        if not content or content_len < 5:
            errors.append(
                f"⚠️ 警告: 内容过短或为空 (Category '{cat}', Row {row}, Content len: {content_len})"
            )
    return rows_by_category


def _check_row_continuity(rows_by_category, errors):
    """Print one status line per category and record non-contiguous row numbers.

    A category passes when its rows are exactly 1..count. Rows that are
    missing or not numeric cannot be ordered, so they are excluded from the
    sort; their absence makes the comparison fail and the category is flagged.
    """
    for cat, rows in rows_by_category.items():
        count = len(rows)
        numeric_rows = sorted(r for r in rows if isinstance(r, (int, float)))
        max_row = numeric_rows[-1] if numeric_rows else "N/A"
        status = "✅ 正常"
        if numeric_rows != list(range(1, count + 1)):
            status = "❌ 异常 (行号不连续或重复)"
            errors.append(f"行号错误: {cat} (Expect 1-{count}, Got max {max_row})")
        print(f"{cat.ljust(10)}: {count} 条 | Max Row: {max_row} | {status}")


def _count_markdown_files(root):
    """Count the "*.md" files under *root* whose names do not start with "index"."""
    return sum(
        1
        for _, _, files in os.walk(root)
        for name in files
        if name.endswith(".md") and not name.startswith("index")
    )


def verify():
    """Check the JSONL dataset and the Markdown tree for consistency.

    Reads the module-level ``jsonl_path`` and ``docs_root``. The report is
    printed to stdout and nothing is returned. Steps:

    1. Load the JSONL file (returns early if the file does not exist).
    2. Validate each record: known category, ``col == 1``, non-empty title,
       content of at least 5 characters.
    3. Check that the rows of each category are exactly 1..N.
    4. Compare the number of records with the number of Markdown files.
    5. Print a summary listing every problem found.

    Malformed records (missing title/row/category, non-object lines, invalid
    JSON) are reported as problems rather than aborting the run.
    """
    print("=== 开始全面完整性检查 ===\n")

    # 1. Load the JSONL data.
    if not os.path.exists(jsonl_path):
        print(f"❌ 错误: JSONL 文件不存在: {jsonl_path}")
        return

    errors = []
    data = _load_jsonl(jsonl_path, errors)
    total_items = len(data)
    print(f"✅ JSONL 读取成功,共 {total_items} 条数据。")

    # 2. Per-record rules.
    expected_categories = {
        "内容创作", "商业分析", "学习教育", "提示词工程", "综合杂项", "编程技术", "逻辑工具箱"
    }
    categories = _check_items(data, expected_categories, errors)

    # Row continuity per category.
    print("\n--- 分类与行号连续性检查 ---")
    _check_row_continuity(categories, errors)

    # 3. The document tree should hold one Markdown file per record.
    print("\n--- 文档文件同步检查 ---")
    files_found = 0
    if os.path.exists(docs_root):
        files_found = _count_markdown_files(docs_root)
    else:
        print(f"❌ 文档目录不存在: {docs_root}")
        # Recorded as well, so an empty dataset with no docs tree is not "perfect".
        errors.append(f"❌ 文档目录不存在: {docs_root}")

    print(f"JSONL 条目数: {total_items}")
    print(f"Markdown 文件数: {files_found}")

    if total_items == files_found:
        print("✅ 文件数量一致")
    else:
        print(f"❌ 文件数量不匹配! (差值: {files_found - total_items})")
        errors.append("文件系统数量与 JSONL 不一致")

    # 4. Summary.
    print("\n=== 检查总结 ===")
    if not errors:
        print("🎉 完美!所有检查通过。数据结构完整、规范。")
    else:
        print(f"发现 {len(errors)} 个问题,请检视:")
        for err in errors:
            print(err)
# Script entry point: run the integrity check only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    verify()