From a4c70e2ef8f65796049b3ba354da176a314d69ba Mon Sep 17 00:00:00 2001 From: tukuaiai Date: Thu, 12 Feb 2026 03:22:57 +0800 Subject: [PATCH] feat: skills-skills - vendor Skill Seekers tool --- skills/skills-skills/AGENTS.md | 18 +- skills/skills-skills/SKILL.md | 37 + skills/skills-skills/references/index.md | 1 + .../skills-skills/references/skill-seekers.md | 39 + .../scripts/Skill_Seekers-development/LICENSE | 21 + .../configs/ansible-core.json | 31 + .../configs/astro.json | 30 + .../configs/claude-code.json | 37 + .../configs/django.json | 34 + .../configs/django_unified.json | 49 + .../configs/example_pdf.json | 17 + .../configs/fastapi.json | 33 + .../configs/fastapi_unified.json | 45 + .../configs/fastapi_unified_test.json | 41 + .../configs/godot-large-example.json | 63 + .../configs/godot.json | 47 + .../configs/godot_github.json | 19 + .../configs/godot_unified.json | 50 + .../configs/hono.json | 18 + .../configs/kubernetes.json | 48 + .../configs/laravel.json | 34 + .../configs/python-tutorial-test.json | 17 + .../configs/react.json | 31 + .../configs/react_github.json | 15 + .../configs/react_unified.json | 44 + .../configs/steam-economy-complete.json | 108 + .../configs/tailwind.json | 30 + .../configs/test-manual.json | 17 + .../configs/vue.json | 31 + .../demo_conflicts.py | 195 ++ .../example-mcp-config.json | 11 + .../Skill_Seekers-development/mypy.ini | 13 + .../Skill_Seekers-development/pyproject.toml | 149 ++ .../requirements.txt | 42 + .../Skill_Seekers-development/setup_mcp.sh | 266 +++ .../src/skill_seekers/__init__.py | 22 + .../src/skill_seekers/cli/__init__.py | 39 + .../src/skill_seekers/cli/code_analyzer.py | 500 +++++ .../src/skill_seekers/cli/config_validator.py | 376 ++++ .../skill_seekers/cli/conflict_detector.py | 513 +++++ .../src/skill_seekers/cli/constants.py | 72 + .../src/skill_seekers/cli/doc_scraper.py | 1822 +++++++++++++++++ .../src/skill_seekers/cli/enhance_skill.py | 273 +++ .../skill_seekers/cli/enhance_skill_local.py | 
451 ++++ .../src/skill_seekers/cli/estimate_pages.py | 288 +++ .../src/skill_seekers/cli/generate_router.py | 274 +++ .../src/skill_seekers/cli/github_scraper.py | 900 ++++++++ .../skill_seekers/cli/llms_txt_detector.py | 66 + .../skill_seekers/cli/llms_txt_downloader.py | 94 + .../src/skill_seekers/cli/llms_txt_parser.py | 74 + .../src/skill_seekers/cli/main.py | 285 +++ .../src/skill_seekers/cli/merge_sources.py | 513 +++++ .../src/skill_seekers/cli/package_multi.py | 81 + .../src/skill_seekers/cli/package_skill.py | 220 ++ .../skill_seekers/cli/pdf_extractor_poc.py | 1222 +++++++++++ .../src/skill_seekers/cli/pdf_scraper.py | 401 ++++ .../src/skill_seekers/cli/quality_checker.py | 480 +++++ .../src/skill_seekers/cli/run_tests.py | 228 +++ .../src/skill_seekers/cli/split_config.py | 320 +++ .../skill_seekers/cli/test_unified_simple.py | 192 ++ .../src/skill_seekers/cli/unified_scraper.py | 450 ++++ .../cli/unified_skill_builder.py | 444 ++++ .../src/skill_seekers/cli/upload_skill.py | 175 ++ .../src/skill_seekers/cli/utils.py | 224 ++ .../src/skill_seekers/mcp/__init__.py | 27 + .../src/skill_seekers/mcp/requirements.txt | 9 + .../src/skill_seekers/mcp/server.py | 1064 ++++++++++ .../src/skill_seekers/mcp/tools/__init__.py | 19 + .../scripts/skill-seekers-bootstrap.sh | 69 + .../scripts/skill-seekers-configs | 1 + .../scripts/skill-seekers-import.sh | 81 + .../skills-skills/scripts/skill-seekers-src | 1 + .../scripts/skill-seekers-update.sh | 118 ++ skills/skills-skills/scripts/skill-seekers.sh | 66 + 74 files changed, 14134 insertions(+), 1 deletion(-) create mode 100644 skills/skills-skills/references/skill-seekers.md create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/LICENSE create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/configs/ansible-core.json create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/configs/astro.json create mode 100644 
skills/skills-skills/scripts/Skill_Seekers-development/configs/claude-code.json create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/configs/django.json create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/configs/django_unified.json create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/configs/example_pdf.json create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/configs/fastapi.json create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/configs/fastapi_unified.json create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/configs/fastapi_unified_test.json create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/configs/godot-large-example.json create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/configs/godot.json create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/configs/godot_github.json create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/configs/godot_unified.json create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/configs/hono.json create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/configs/kubernetes.json create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/configs/laravel.json create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/configs/python-tutorial-test.json create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/configs/react.json create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/configs/react_github.json create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/configs/react_unified.json create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/configs/steam-economy-complete.json create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/configs/tailwind.json create mode 100644 
skills/skills-skills/scripts/Skill_Seekers-development/configs/test-manual.json create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/configs/vue.json create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/demo_conflicts.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/example-mcp-config.json create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/mypy.ini create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/pyproject.toml create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/requirements.txt create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/setup_mcp.sh create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/__init__.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/__init__.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/code_analyzer.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/config_validator.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/conflict_detector.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/constants.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/doc_scraper.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/enhance_skill.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/enhance_skill_local.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/estimate_pages.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/generate_router.py create mode 100644 
skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/github_scraper.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/llms_txt_detector.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/llms_txt_downloader.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/llms_txt_parser.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/main.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/merge_sources.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/package_multi.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/package_skill.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/pdf_extractor_poc.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/pdf_scraper.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/quality_checker.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/run_tests.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/split_config.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/test_unified_simple.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/unified_scraper.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/unified_skill_builder.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/upload_skill.py create mode 100644 
skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/utils.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/mcp/__init__.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/mcp/requirements.txt create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/mcp/server.py create mode 100644 skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/mcp/tools/__init__.py create mode 100755 skills/skills-skills/scripts/skill-seekers-bootstrap.sh create mode 120000 skills/skills-skills/scripts/skill-seekers-configs create mode 100755 skills/skills-skills/scripts/skill-seekers-import.sh create mode 120000 skills/skills-skills/scripts/skill-seekers-src create mode 100755 skills/skills-skills/scripts/skill-seekers-update.sh create mode 100755 skills/skills-skills/scripts/skill-seekers.sh diff --git a/skills/skills-skills/AGENTS.md b/skills/skills-skills/AGENTS.md index 390dd7a..ae4aa2e 100644 --- a/skills/skills-skills/AGENTS.md +++ b/skills/skills-skills/AGENTS.md @@ -12,12 +12,20 @@ skills-skills/ | |-- template-minimal.md | `-- template-complete.md |-- scripts/ +| |-- Skill_Seekers-development/ | |-- create-skill.sh +| |-- skill-seekers-bootstrap.sh +| |-- skill-seekers-configs -> Skill_Seekers-development/configs +| |-- skill-seekers-import.sh +| |-- skill-seekers.sh +| |-- skill-seekers-src -> Skill_Seekers-development/src +| |-- skill-seekers-update.sh | `-- validate-skill.sh `-- references/ |-- index.md |-- README.md |-- anti-patterns.md + |-- skill-seekers.md |-- quality-checklist.md `-- skill-spec.md ``` @@ -28,14 +36,22 @@ skills-skills/ - `skills/skills-skills/assets/template-minimal.md`: minimal template (small domains / quick bootstrap). - `skills/skills-skills/assets/template-complete.md`: full template (production-grade / complex domains). 
- `skills/skills-skills/scripts/create-skill.sh`: scaffold generator (minimal/full, output dir, overwrite). +- `skills/skills-skills/scripts/Skill_Seekers-development/`: vendored Skill Seekers source snapshot (code + configs; excludes upstream Markdown docs). +- `skills/skills-skills/scripts/skill-seekers-bootstrap.sh`: create a local venv and install deps for the vendored Skill Seekers tool. +- `skills/skills-skills/scripts/skill-seekers.sh`: run Skill Seekers from vendored source (docs/github/pdf -> output//). +- `skills/skills-skills/scripts/skill-seekers-import.sh`: import output// into the canonical skills// tree. +- `skills/skills-skills/scripts/skill-seekers-update.sh`: update the vendored source snapshot from upstream (network required). - `skills/skills-skills/scripts/validate-skill.sh`: spec validator (supports `--strict`). - `skills/skills-skills/references/index.md`: navigation for this meta-skill's reference docs. - `skills/skills-skills/references/README.md`: upstream official reference (lightly adjusted to keep links working in this repo). - `skills/skills-skills/references/skill-spec.md`: the local Skill spec (MUST/SHOULD/NEVER). - `skills/skills-skills/references/quality-checklist.md`: quality gate checklist + scoring. - `skills/skills-skills/references/anti-patterns.md`: common failure modes and how to fix them. +- `skills/skills-skills/references/skill-seekers.md`: how to use the vendored tool as a mandatory first-draft generator. ## Dependencies & Boundaries -- `scripts/*.sh`: depend only on `bash` + common POSIX tooling (`sed/awk/grep/find`), no network required. +- `scripts/*.sh`: depend on `bash` + common POSIX tooling; some scripts require extra tooling: + - `skill-seekers-bootstrap.sh`: requires `python3` + `pip` (network required for PyPI). + - `skill-seekers-update.sh`: requires `curl` + `tar` + `rsync` (network required). 
- This directory is about "how to build Skills", not about any specific domain; domain knowledge belongs in `skills//`. diff --git a/skills/skills-skills/SKILL.md b/skills/skills-skills/SKILL.md index 9123d7f..4d4df40 100644 --- a/skills/skills-skills/SKILL.md +++ b/skills/skills-skills/SKILL.md @@ -37,6 +37,41 @@ Your output MUST include: 3. Long-form docs moved to `references/` with a `references/index.md` 4. A pre-delivery checklist (Quality Gate) +### Built-in Tool (Mandatory): Skill Seekers (Vendored) + +This repo vendors the Skill Seekers source code inside this meta-skill so you can generate a first-draft Skill from: +- Documentation websites +- GitHub repositories +- PDFs + +Bootstrap dependencies (once): + +```bash +./skills/skills-skills/scripts/skill-seekers-bootstrap.sh +``` + +Run Skill Seekers (from vendored source): + +```bash +./skills/skills-skills/scripts/skill-seekers.sh -- --version +./skills/skills-skills/scripts/skill-seekers.sh -- scrape --config ./skills/skills-skills/scripts/Skill_Seekers-development/configs/react.json +./skills/skills-skills/scripts/skill-seekers.sh -- github --repo facebook/react --name react +``` + +Import the generated skill into this repo's canonical `skills/` tree: + +```bash +./skills/skills-skills/scripts/skill-seekers-import.sh react +./skills/skills-skills/scripts/skill-seekers-import.sh react --force +``` + +Update the vendored source snapshot (optional, network required): + +```bash +./skills/skills-skills/scripts/skill-seekers-update.sh +./skills/skills-skills/scripts/skill-seekers-update.sh --ref main +``` + ### Recommended Layout (Minimal -> Full) ``` @@ -134,6 +169,7 @@ Trigger when any of these applies: ### Workflow (Material -> Skill) Do not skip steps: +0. If your source material is a docs site / GitHub repo / PDF: generate a first draft with the vendored Skill Seekers tool, then import into `skills//` 1. Scope: write MUST/SHOULD/NEVER (three sentences total is fine) 2. 
Extract patterns: pick 10-20 high-frequency patterns (commands/snippets/flows) 3. Add examples: >= 3 end-to-end examples (input -> steps -> acceptance) @@ -229,6 +265,7 @@ Local docs: - `references/quality-checklist.md` - `references/anti-patterns.md` - `references/README.md` (upstream official reference) +- `references/skill-seekers.md` (vendored tool integration + workflow) External (official): - https://support.claude.com/en/articles/12512176-what-are-skills diff --git a/skills/skills-skills/references/index.md b/skills/skills-skills/references/index.md index 61a2421..915f991 100644 --- a/skills/skills-skills/references/index.md +++ b/skills/skills-skills/references/index.md @@ -11,6 +11,7 @@ This directory contains long-form documentation that supports the `skills-skills - [`skill-spec.md`](skill-spec.md): normative spec (MUST/SHOULD/NEVER) for a production-grade Skill in this repo - [`quality-checklist.md`](quality-checklist.md): quality gate checklist + scoring rubric - [`anti-patterns.md`](anti-patterns.md): common failure modes and how to fix them +- [`skill-seekers.md`](skill-seekers.md): vendored Skill Seekers tool integration (workflow + commands) ## Upstream / Official Reference diff --git a/skills/skills-skills/references/skill-seekers.md b/skills/skills-skills/references/skill-seekers.md new file mode 100644 index 0000000..3985a78 --- /dev/null +++ b/skills/skills-skills/references/skill-seekers.md @@ -0,0 +1,39 @@ +# Skill Seekers(内置工具)使用说明 + +本目录把 `Skill_Seekers-development` 的源码作为 `skills-skills` 的必备工具内置,用于把「文档 / GitHub 仓库 / PDF」快速转成一个可落地的 Skill 初稿。 + +## 目录约定 + +- 工具源码:`skills/skills-skills/scripts/Skill_Seekers-development/` +- 运行入口:`skills/skills-skills/scripts/skill-seekers.sh` +- 依赖初始化:`skills/skills-skills/scripts/skill-seekers-bootstrap.sh` +- 导入到本仓库:`skills/skills-skills/scripts/skill-seekers-import.sh` +- 更新源码快照:`skills/skills-skills/scripts/skill-seekers-update.sh`(需要网络) + +## 推荐工作流(强约束) + +1. 用 Skill Seekers 生成初稿到 `output//` +2. 
导入到 `skills//` +3. 用 `validate-skill.sh --strict` 做质量闸门 +4. 回到 `skills-skills` 的规范对 `SKILL.md` 做“可激活性”与“边界”修订 + +## 最小可执行示例 + +```bash +# 1) 初始化(只需一次) +./skills/skills-skills/scripts/skill-seekers-bootstrap.sh + +# 2) 生成(示例:抓 docs 配置) +./skills/skills-skills/scripts/skill-seekers.sh -- scrape --config ./skills/skills-skills/scripts/Skill_Seekers-development/configs/react.json + +# 3) 导入到 skills/ +./skills/skills-skills/scripts/skill-seekers-import.sh react + +# 4) 严格校验 +./skills/skills-skills/scripts/validate-skill.sh skills/react --strict +``` + +## 设计原则 + +- `skills/skills-skills/` 负责:规范、模板、闸门、可激活性;不直接承载领域知识。 +- Skill Seekers 负责:抓取与初稿生成;最终交付仍以本仓库的 `validate-skill.sh --strict` 为准。 diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/LICENSE b/skills/skills-skills/scripts/Skill_Seekers-development/LICENSE new file mode 100644 index 0000000..11d6561 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 [Your Name/Username] + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/configs/ansible-core.json b/skills/skills-skills/scripts/Skill_Seekers-development/configs/ansible-core.json new file mode 100644 index 0000000..764cead --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/configs/ansible-core.json @@ -0,0 +1,31 @@ +{ + "name": "ansible-core", + "description": "Ansible Core 2.19 skill for automation and configuration management", + "base_url": "https://docs.ansible.com/ansible-core/2.19/", + "selectors": { + "main_content": "div[role=main]", + "title": "title", + "code_blocks": "pre" + }, + "url_patterns": { + "include": [], + "exclude": ["/_static/", "/_images/", "/_downloads/", "/search.html", "/genindex.html", "/py-modindex.html", "/index.html", "/roadmap/"] + }, + "categories": { + "getting_started": ["getting_started", "getting-started", "introduction", "overview"], + "installation": ["installation_guide", "installation", "setup"], + "inventory": ["inventory_guide", "inventory"], + "playbooks": ["playbook_guide", "playbooks", "playbook"], + "modules": ["module_plugin_guide", "modules", "plugins"], + "collections": ["collections_guide", "collections"], + "vault": ["vault_guide", "vault", "encryption"], + "commands": ["command_guide", "commands", "cli"], + "porting": ["porting_guides", "porting", "migration"], + "os_specific": ["os_guide", "platform"], + "tips": ["tips_tricks", "tips", "tricks", "best-practices"], + "community": ["community", "contributing", "contributions"], + "development": ["dev_guide", "development", "developing"] + }, + "rate_limit": 0.5, + "max_pages": 800 +} diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/configs/astro.json 
b/skills/skills-skills/scripts/Skill_Seekers-development/configs/astro.json new file mode 100644 index 0000000..89b2798 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/configs/astro.json @@ -0,0 +1,30 @@ +{ + "name": "astro", + "description": "Astro web framework for content-focused websites. Use for Astro components, islands architecture, content collections, SSR/SSG, and modern web development.", + "base_url": "https://docs.astro.build/en/getting-started/", + "start_urls": [ + "https://docs.astro.build/en/getting-started/", + "https://docs.astro.build/en/install/auto/", + "https://docs.astro.build/en/core-concepts/project-structure/", + "https://docs.astro.build/en/core-concepts/astro-components/", + "https://docs.astro.build/en/core-concepts/astro-pages/" + ], + "selectors": { + "main_content": "article", + "title": "h1", + "code_blocks": "pre code" + }, + "url_patterns": { + "include": ["/en/"], + "exclude": ["/blog", "/integrations"] + }, + "categories": { + "getting_started": ["getting-started", "install", "tutorial"], + "core_concepts": ["core-concepts", "project-structure", "components", "pages"], + "guides": ["guides", "deploy", "migrate"], + "configuration": ["configuration", "config", "typescript"], + "integrations": ["integrations", "framework", "adapter"] + }, + "rate_limit": 0.5, + "max_pages": 100 +} \ No newline at end of file diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/configs/claude-code.json b/skills/skills-skills/scripts/Skill_Seekers-development/configs/claude-code.json new file mode 100644 index 0000000..c84e709 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/configs/claude-code.json @@ -0,0 +1,37 @@ +{ + "name": "claude-code", + "description": "Claude Code CLI and development environment. 
Use for Claude Code features, tools, workflows, MCP integration, configuration, and AI-assisted development.", + "base_url": "https://docs.claude.com/en/docs/claude-code/", + "start_urls": [ + "https://docs.claude.com/en/docs/claude-code/overview", + "https://docs.claude.com/en/docs/claude-code/quickstart", + "https://docs.claude.com/en/docs/claude-code/common-workflows", + "https://docs.claude.com/en/docs/claude-code/mcp", + "https://docs.claude.com/en/docs/claude-code/settings", + "https://docs.claude.com/en/docs/claude-code/troubleshooting", + "https://docs.claude.com/en/docs/claude-code/iam" + ], + "selectors": { + "main_content": "#content-container", + "title": "h1", + "code_blocks": "pre code" + }, + "url_patterns": { + "include": ["/claude-code/"], + "exclude": ["/api-reference/", "/claude-ai/", "/claude.ai/", "/prompt-engineering/", "/changelog/"] + }, + "categories": { + "getting_started": ["overview", "quickstart", "installation", "setup", "terminal-config"], + "workflows": ["workflow", "common-workflows", "git", "testing", "debugging", "interactive"], + "mcp": ["mcp", "model-context-protocol"], + "configuration": ["config", "settings", "preferences", "customize", "hooks", "statusline", "model-config", "memory", "output-styles"], + "agents": ["agent", "task", "subagent", "sub-agent", "specialized"], + "skills": ["skill", "agent-skill"], + "integrations": ["ide-integrations", "vs-code", "jetbrains", "plugin", "marketplace"], + "deployment": ["bedrock", "vertex", "deployment", "network", "gateway", "devcontainer", "sandboxing", "third-party"], + "reference": ["reference", "api", "command", "cli-reference", "slash", "checkpointing", "headless", "sdk"], + "enterprise": ["iam", "security", "monitoring", "analytics", "costs", "legal", "data-usage"] + }, + "rate_limit": 0.5, + "max_pages": 200 +} diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/configs/django.json b/skills/skills-skills/scripts/Skill_Seekers-development/configs/django.json 
new file mode 100644 index 0000000..70f84b6 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/configs/django.json @@ -0,0 +1,34 @@ +{ + "name": "django", + "description": "Django web framework for Python. Use for Django models, views, templates, ORM, authentication, and web development.", + "base_url": "https://docs.djangoproject.com/en/stable/", + "start_urls": [ + "https://docs.djangoproject.com/en/stable/intro/", + "https://docs.djangoproject.com/en/stable/topics/db/models/", + "https://docs.djangoproject.com/en/stable/topics/http/views/", + "https://docs.djangoproject.com/en/stable/topics/templates/", + "https://docs.djangoproject.com/en/stable/topics/forms/", + "https://docs.djangoproject.com/en/stable/topics/auth/", + "https://docs.djangoproject.com/en/stable/ref/models/" + ], + "selectors": { + "main_content": "article", + "title": "h1", + "code_blocks": "pre" + }, + "url_patterns": { + "include": ["/intro/", "/topics/", "/ref/", "/howto/"], + "exclude": ["/faq/", "/misc/", "/releases/"] + }, + "categories": { + "getting_started": ["intro", "tutorial", "install"], + "models": ["models", "database", "orm", "queries"], + "views": ["views", "urlconf", "routing"], + "templates": ["templates", "template"], + "forms": ["forms", "form"], + "authentication": ["auth", "authentication", "user"], + "api": ["ref", "reference"] + }, + "rate_limit": 0.3, + "max_pages": 500 +} diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/configs/django_unified.json b/skills/skills-skills/scripts/Skill_Seekers-development/configs/django_unified.json new file mode 100644 index 0000000..7bb2db2 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/configs/django_unified.json @@ -0,0 +1,49 @@ +{ + "name": "django", + "description": "Complete Django framework knowledge combining official documentation and Django codebase. 
Use when building Django applications, understanding ORM internals, or debugging Django issues.", + "merge_mode": "rule-based", + "sources": [ + { + "type": "documentation", + "base_url": "https://docs.djangoproject.com/en/stable/", + "extract_api": true, + "selectors": { + "main_content": "article", + "title": "h1", + "code_blocks": "pre" + }, + "url_patterns": { + "include": [], + "exclude": ["/search/", "/genindex/"] + }, + "categories": { + "getting_started": ["intro", "tutorial", "install"], + "models": ["models", "orm", "queries", "database"], + "views": ["views", "urls", "templates"], + "forms": ["forms", "modelforms"], + "admin": ["admin"], + "api": ["ref/"], + "topics": ["topics/"], + "security": ["security", "csrf", "authentication"] + }, + "rate_limit": 0.5, + "max_pages": 300 + }, + { + "type": "github", + "repo": "django/django", + "include_issues": true, + "max_issues": 100, + "include_changelog": true, + "include_releases": true, + "include_code": true, + "code_analysis_depth": "surface", + "file_patterns": [ + "django/db/**/*.py", + "django/views/**/*.py", + "django/forms/**/*.py", + "django/contrib/admin/**/*.py" + ] + } + ] +} diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/configs/example_pdf.json b/skills/skills-skills/scripts/Skill_Seekers-development/configs/example_pdf.json new file mode 100644 index 0000000..08c7475 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/configs/example_pdf.json @@ -0,0 +1,17 @@ +{ + "name": "example_manual", + "description": "Example PDF documentation skill", + "pdf_path": "docs/manual.pdf", + "extract_options": { + "chunk_size": 10, + "min_quality": 5.0, + "extract_images": true, + "min_image_size": 100 + }, + "categories": { + "getting_started": ["introduction", "getting started", "quick start", "setup"], + "tutorial": ["tutorial", "guide", "walkthrough", "example"], + "api": ["api", "reference", "function", "class", "method"], + "advanced": ["advanced", 
"optimization", "performance", "best practices"] + } +} diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/configs/fastapi.json b/skills/skills-skills/scripts/Skill_Seekers-development/configs/fastapi.json new file mode 100644 index 0000000..f08a08c --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/configs/fastapi.json @@ -0,0 +1,33 @@ +{ + "name": "fastapi", + "description": "FastAPI modern Python web framework. Use for building APIs, async endpoints, dependency injection, and Python backend development.", + "base_url": "https://fastapi.tiangolo.com/", + "start_urls": [ + "https://fastapi.tiangolo.com/tutorial/", + "https://fastapi.tiangolo.com/tutorial/first-steps/", + "https://fastapi.tiangolo.com/tutorial/path-params/", + "https://fastapi.tiangolo.com/tutorial/body/", + "https://fastapi.tiangolo.com/tutorial/dependencies/", + "https://fastapi.tiangolo.com/advanced/", + "https://fastapi.tiangolo.com/reference/" + ], + "selectors": { + "main_content": "article", + "title": "h1", + "code_blocks": "pre code" + }, + "url_patterns": { + "include": ["/tutorial/", "/advanced/", "/reference/"], + "exclude": ["/help/", "/external-links/", "/deployment/"] + }, + "categories": { + "getting_started": ["first-steps", "tutorial", "intro"], + "path_operations": ["path", "operations", "routing"], + "request_data": ["request", "body", "query", "parameters"], + "dependencies": ["dependencies", "injection"], + "security": ["security", "oauth", "authentication"], + "database": ["database", "sql", "orm"] + }, + "rate_limit": 0.5, + "max_pages": 250 +} diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/configs/fastapi_unified.json b/skills/skills-skills/scripts/Skill_Seekers-development/configs/fastapi_unified.json new file mode 100644 index 0000000..6f76b9e --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/configs/fastapi_unified.json @@ -0,0 +1,45 @@ +{ + "name": "fastapi", + "description": 
"Complete FastAPI knowledge combining official documentation and FastAPI codebase. Use when building FastAPI applications, understanding async patterns, or working with Pydantic models.", + "merge_mode": "rule-based", + "sources": [ + { + "type": "documentation", + "base_url": "https://fastapi.tiangolo.com/", + "extract_api": true, + "selectors": { + "main_content": "article", + "title": "h1", + "code_blocks": "pre code" + }, + "url_patterns": { + "include": [], + "exclude": ["/img/", "/js/"] + }, + "categories": { + "getting_started": ["tutorial", "first-steps"], + "path_operations": ["path-params", "query-params", "body"], + "dependencies": ["dependencies"], + "security": ["security", "oauth2"], + "database": ["sql-databases"], + "advanced": ["advanced", "async", "middleware"], + "deployment": ["deployment"] + }, + "rate_limit": 0.5, + "max_pages": 150 + }, + { + "type": "github", + "repo": "tiangolo/fastapi", + "include_issues": true, + "max_issues": 100, + "include_changelog": true, + "include_releases": true, + "include_code": true, + "code_analysis_depth": "surface", + "file_patterns": [ + "fastapi/**/*.py" + ] + } + ] +} diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/configs/fastapi_unified_test.json b/skills/skills-skills/scripts/Skill_Seekers-development/configs/fastapi_unified_test.json new file mode 100644 index 0000000..cd18825 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/configs/fastapi_unified_test.json @@ -0,0 +1,41 @@ +{ + "name": "fastapi_test", + "description": "FastAPI test - unified scraping with limited pages", + "merge_mode": "rule-based", + "sources": [ + { + "type": "documentation", + "base_url": "https://fastapi.tiangolo.com/", + "extract_api": true, + "selectors": { + "main_content": "article", + "title": "h1", + "code_blocks": "pre code" + }, + "url_patterns": { + "include": [], + "exclude": ["/img/", "/js/"] + }, + "categories": { + "getting_started": ["tutorial", "first-steps"], + 
"path_operations": ["path-params", "query-params"], + "api": ["reference"] + }, + "rate_limit": 0.5, + "max_pages": 20 + }, + { + "type": "github", + "repo": "tiangolo/fastapi", + "include_issues": false, + "include_changelog": false, + "include_releases": true, + "include_code": true, + "code_analysis_depth": "surface", + "file_patterns": [ + "fastapi/routing.py", + "fastapi/applications.py" + ] + } + ] +} diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/configs/godot-large-example.json b/skills/skills-skills/scripts/Skill_Seekers-development/configs/godot-large-example.json new file mode 100644 index 0000000..a4d04b9 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/configs/godot-large-example.json @@ -0,0 +1,63 @@ +{ + "name": "godot", + "description": "Godot Engine game development. Use for Godot projects, GDScript/C# coding, scene setup, node systems, 2D/3D development, physics, animation, UI, shaders, or any Godot-specific questions.", + "base_url": "https://docs.godotengine.org/en/stable/", + "start_urls": [ + "https://docs.godotengine.org/en/stable/getting_started/introduction/index.html", + "https://docs.godotengine.org/en/stable/tutorials/scripting/gdscript/index.html", + "https://docs.godotengine.org/en/stable/tutorials/2d/index.html", + "https://docs.godotengine.org/en/stable/tutorials/3d/index.html", + "https://docs.godotengine.org/en/stable/tutorials/physics/index.html", + "https://docs.godotengine.org/en/stable/tutorials/animation/index.html", + "https://docs.godotengine.org/en/stable/classes/index.html" + ], + "selectors": { + "main_content": "div[role='main']", + "title": "title", + "code_blocks": "pre" + }, + "url_patterns": { + "include": [ + "/getting_started/", + "/tutorials/", + "/classes/" + ], + "exclude": [ + "/genindex.html", + "/search.html", + "/_static/", + "/_sources/" + ] + }, + "categories": { + "getting_started": ["introduction", "getting_started", "first", "your_first"], + "scripting": 
["scripting", "gdscript", "c#", "csharp"], + "2d": ["/2d/", "sprite", "canvas", "tilemap"], + "3d": ["/3d/", "spatial", "mesh", "3d_"], + "physics": ["physics", "collision", "rigidbody", "characterbody"], + "animation": ["animation", "tween", "animationplayer"], + "ui": ["ui", "control", "gui", "theme"], + "shaders": ["shader", "material", "visual_shader"], + "audio": ["audio", "sound"], + "networking": ["networking", "multiplayer", "rpc"], + "export": ["export", "platform", "deploy"] + }, + "rate_limit": 0.5, + "max_pages": 40000, + + "_comment": "=== NEW: Split Strategy Configuration ===", + "split_strategy": "router", + "split_config": { + "target_pages_per_skill": 5000, + "create_router": true, + "split_by_categories": ["scripting", "2d", "3d", "physics", "shaders"], + "router_name": "godot", + "parallel_scraping": true + }, + + "_comment2": "=== NEW: Checkpoint Configuration ===", + "checkpoint": { + "enabled": true, + "interval": 1000 + } +} diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/configs/godot.json b/skills/skills-skills/scripts/Skill_Seekers-development/configs/godot.json new file mode 100644 index 0000000..acd49f2 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/configs/godot.json @@ -0,0 +1,47 @@ +{ + "name": "godot", + "description": "Godot Engine game development. 
Use for Godot projects, GDScript/C# coding, scene setup, node systems, 2D/3D development, physics, animation, UI, shaders, or any Godot-specific questions.", + "base_url": "https://docs.godotengine.org/en/stable/", + "start_urls": [ + "https://docs.godotengine.org/en/stable/getting_started/introduction/index.html", + "https://docs.godotengine.org/en/stable/tutorials/scripting/gdscript/index.html", + "https://docs.godotengine.org/en/stable/tutorials/2d/index.html", + "https://docs.godotengine.org/en/stable/tutorials/3d/index.html", + "https://docs.godotengine.org/en/stable/tutorials/physics/index.html", + "https://docs.godotengine.org/en/stable/tutorials/animation/index.html", + "https://docs.godotengine.org/en/stable/classes/index.html" + ], + "selectors": { + "main_content": "div[role='main']", + "title": "title", + "code_blocks": "pre" + }, + "url_patterns": { + "include": [ + "/getting_started/", + "/tutorials/", + "/classes/" + ], + "exclude": [ + "/genindex.html", + "/search.html", + "/_static/", + "/_sources/" + ] + }, + "categories": { + "getting_started": ["introduction", "getting_started", "first", "your_first"], + "scripting": ["scripting", "gdscript", "c#", "csharp"], + "2d": ["/2d/", "sprite", "canvas", "tilemap"], + "3d": ["/3d/", "spatial", "mesh", "3d_"], + "physics": ["physics", "collision", "rigidbody", "characterbody"], + "animation": ["animation", "tween", "animationplayer"], + "ui": ["ui", "control", "gui", "theme"], + "shaders": ["shader", "material", "visual_shader"], + "audio": ["audio", "sound"], + "networking": ["networking", "multiplayer", "rpc"], + "export": ["export", "platform", "deploy"] + }, + "rate_limit": 0.5, + "max_pages": 500 +} diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/configs/godot_github.json b/skills/skills-skills/scripts/Skill_Seekers-development/configs/godot_github.json new file mode 100644 index 0000000..e33c66f --- /dev/null +++ 
b/skills/skills-skills/scripts/Skill_Seekers-development/configs/godot_github.json @@ -0,0 +1,19 @@ +{ + "name": "godot", + "repo": "godotengine/godot", + "description": "Godot Engine - Multi-platform 2D and 3D game engine", + "github_token": null, + "include_issues": true, + "max_issues": 100, + "include_changelog": true, + "include_releases": true, + "include_code": false, + "file_patterns": [ + "core/**/*.h", + "core/**/*.cpp", + "scene/**/*.h", + "scene/**/*.cpp", + "servers/**/*.h", + "servers/**/*.cpp" + ] +} diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/configs/godot_unified.json b/skills/skills-skills/scripts/Skill_Seekers-development/configs/godot_unified.json new file mode 100644 index 0000000..3366dea --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/configs/godot_unified.json @@ -0,0 +1,50 @@ +{ + "name": "godot", + "description": "Complete Godot Engine knowledge base combining official documentation and source code analysis", + "merge_mode": "claude-enhanced", + "sources": [ + { + "type": "documentation", + "base_url": "https://docs.godotengine.org/en/stable/", + "extract_api": true, + "selectors": { + "main_content": "div[role='main']", + "title": "title", + "code_blocks": "pre" + }, + "url_patterns": { + "include": [], + "exclude": ["/search.html", "/_static/", "/_images/"] + }, + "categories": { + "getting_started": ["introduction", "getting_started", "step_by_step"], + "scripting": ["scripting", "gdscript", "c_sharp"], + "2d": ["2d", "canvas", "sprite", "animation"], + "3d": ["3d", "spatial", "mesh", "shader"], + "physics": ["physics", "collision", "rigidbody"], + "api": ["api", "class", "reference", "method"] + }, + "rate_limit": 0.5, + "max_pages": 500 + }, + { + "type": "github", + "repo": "godotengine/godot", + "github_token": null, + "code_analysis_depth": "deep", + "include_code": true, + "include_issues": true, + "max_issues": 100, + "include_changelog": true, + "include_releases": true, + 
"file_patterns": [ + "core/**/*.h", + "core/**/*.cpp", + "scene/**/*.h", + "scene/**/*.cpp", + "servers/**/*.h", + "servers/**/*.cpp" + ] + } + ] +} diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/configs/hono.json b/skills/skills-skills/scripts/Skill_Seekers-development/configs/hono.json new file mode 100644 index 0000000..e27ca41 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/configs/hono.json @@ -0,0 +1,18 @@ +{ + "name": "hono", + "description": "Hono web application framework for building fast, lightweight APIs. Use for Hono routing, middleware, context handling, and modern JavaScript/TypeScript web development.", + "llms_txt_url": "https://hono.dev/llms-full.txt", + "base_url": "https://hono.dev/docs", + "selectors": { + "main_content": "article", + "title": "h1", + "code_blocks": "pre code" + }, + "url_patterns": { + "include": [], + "exclude": [] + }, + "categories": {}, + "rate_limit": 0.5, + "max_pages": 50 +} \ No newline at end of file diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/configs/kubernetes.json b/skills/skills-skills/scripts/Skill_Seekers-development/configs/kubernetes.json new file mode 100644 index 0000000..717794b --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/configs/kubernetes.json @@ -0,0 +1,48 @@ +{ + "name": "kubernetes", + "description": "Kubernetes container orchestration platform. 
Use for K8s clusters, deployments, pods, services, networking, storage, configuration, and DevOps tasks.", + "base_url": "https://kubernetes.io/docs/", + "start_urls": [ + "https://kubernetes.io/docs/home/", + "https://kubernetes.io/docs/concepts/", + "https://kubernetes.io/docs/tasks/", + "https://kubernetes.io/docs/tutorials/", + "https://kubernetes.io/docs/reference/" + ], + "selectors": { + "main_content": "main", + "title": "h1", + "code_blocks": "pre code" + }, + "url_patterns": { + "include": [ + "/docs/concepts/", + "/docs/tasks/", + "/docs/tutorials/", + "/docs/reference/", + "/docs/setup/" + ], + "exclude": [ + "/search/", + "/blog/", + "/training/", + "/partners/", + "/community/", + "/_print/", + "/case-studies/" + ] + }, + "categories": { + "getting_started": ["getting-started", "setup", "learning-environment"], + "concepts": ["concepts", "overview", "architecture"], + "workloads": ["workloads", "pods", "deployments", "replicaset", "statefulset", "daemonset"], + "services": ["services", "networking", "ingress", "service"], + "storage": ["storage", "volumes", "persistent"], + "configuration": ["configuration", "configmap", "secret"], + "security": ["security", "rbac", "policies", "authentication"], + "tasks": ["tasks", "administer", "configure"], + "tutorials": ["tutorials", "stateless", "stateful"] + }, + "rate_limit": 0.5, + "max_pages": 1000 +} diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/configs/laravel.json b/skills/skills-skills/scripts/Skill_Seekers-development/configs/laravel.json new file mode 100644 index 0000000..f68c9bf --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/configs/laravel.json @@ -0,0 +1,34 @@ +{ + "name": "laravel", + "description": "Laravel PHP web framework. 
Use for Laravel models, routes, controllers, Blade templates, Eloquent ORM, authentication, and PHP web development.", + "base_url": "https://laravel.com/docs/9.x/", + "start_urls": [ + "https://laravel.com/docs/9.x/installation", + "https://laravel.com/docs/9.x/routing", + "https://laravel.com/docs/9.x/controllers", + "https://laravel.com/docs/9.x/views", + "https://laravel.com/docs/9.x/blade", + "https://laravel.com/docs/9.x/eloquent", + "https://laravel.com/docs/9.x/migrations", + "https://laravel.com/docs/9.x/authentication" + ], + "selectors": { + "main_content": "#main-content", + "title": "h1", + "code_blocks": "pre" + }, + "url_patterns": { + "include": ["/docs/9.x/", "/docs/10.x/", "/docs/11.x/"], + "exclude": ["/api/", "/packages/"] + }, + "categories": { + "getting_started": ["installation", "configuration", "structure", "deployment"], + "routing": ["routing", "middleware", "controllers"], + "views": ["views", "blade", "templates"], + "models": ["eloquent", "database", "migrations", "seeding", "queries"], + "authentication": ["authentication", "authorization", "passwords"], + "api": ["api", "resources", "requests", "responses"] + }, + "rate_limit": 0.3, + "max_pages": 500 +} diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/configs/python-tutorial-test.json b/skills/skills-skills/scripts/Skill_Seekers-development/configs/python-tutorial-test.json new file mode 100644 index 0000000..240b0be --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/configs/python-tutorial-test.json @@ -0,0 +1,17 @@ +{ + "name": "python-tutorial-test", + "description": "Python tutorial for testing MCP tools", + "base_url": "https://docs.python.org/3/tutorial/", + "selectors": { + "main_content": "article", + "title": "h1", + "code_blocks": "pre code" + }, + "url_patterns": { + "include": [], + "exclude": [] + }, + "categories": {}, + "rate_limit": 0.3, + "max_pages": 10 +} \ No newline at end of file diff --git 
a/skills/skills-skills/scripts/Skill_Seekers-development/configs/react.json b/skills/skills-skills/scripts/Skill_Seekers-development/configs/react.json new file mode 100644 index 0000000..e6f4c92 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/configs/react.json @@ -0,0 +1,31 @@ +{ + "name": "react", + "description": "React framework for building user interfaces. Use for React components, hooks, state management, JSX, and modern frontend development.", + "base_url": "https://react.dev/", + "start_urls": [ + "https://react.dev/learn", + "https://react.dev/learn/quick-start", + "https://react.dev/learn/thinking-in-react", + "https://react.dev/reference/react", + "https://react.dev/reference/react-dom", + "https://react.dev/reference/react/hooks" + ], + "selectors": { + "main_content": "article", + "title": "h1", + "code_blocks": "pre code" + }, + "url_patterns": { + "include": ["/learn", "/reference"], + "exclude": ["/community", "/blog"] + }, + "categories": { + "getting_started": ["quick-start", "installation", "tutorial"], + "hooks": ["usestate", "useeffect", "usememo", "usecallback", "usecontext", "useref", "hook"], + "components": ["component", "props", "jsx"], + "state": ["state", "context", "reducer"], + "api": ["api", "reference"] + }, + "rate_limit": 0.5, + "max_pages": 300 +} diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/configs/react_github.json b/skills/skills-skills/scripts/Skill_Seekers-development/configs/react_github.json new file mode 100644 index 0000000..4c8b86a --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/configs/react_github.json @@ -0,0 +1,15 @@ +{ + "name": "react", + "repo": "facebook/react", + "description": "React JavaScript library for building user interfaces", + "github_token": null, + "include_issues": true, + "max_issues": 100, + "include_changelog": true, + "include_releases": true, + "include_code": false, + "file_patterns": [ + "packages/**/*.js", + 
"packages/**/*.ts" + ] +} diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/configs/react_unified.json b/skills/skills-skills/scripts/Skill_Seekers-development/configs/react_unified.json new file mode 100644 index 0000000..437bd1d --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/configs/react_unified.json @@ -0,0 +1,44 @@ +{ + "name": "react", + "description": "Complete React knowledge base combining official documentation and React codebase insights. Use when working with React, understanding API changes, or debugging React internals.", + "merge_mode": "rule-based", + "sources": [ + { + "type": "documentation", + "base_url": "https://react.dev/", + "extract_api": true, + "selectors": { + "main_content": "article", + "title": "h1", + "code_blocks": "pre code" + }, + "url_patterns": { + "include": [], + "exclude": ["/blog/", "/community/"] + }, + "categories": { + "getting_started": ["learn", "installation", "quick-start"], + "components": ["components", "props", "state"], + "hooks": ["hooks", "usestate", "useeffect", "usecontext"], + "api": ["api", "reference"], + "advanced": ["context", "refs", "portals", "suspense"] + }, + "rate_limit": 0.5, + "max_pages": 200 + }, + { + "type": "github", + "repo": "facebook/react", + "include_issues": true, + "max_issues": 100, + "include_changelog": true, + "include_releases": true, + "include_code": true, + "code_analysis_depth": "surface", + "file_patterns": [ + "packages/react/src/**/*.js", + "packages/react-dom/src/**/*.js" + ] + } + ] +} diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/configs/steam-economy-complete.json b/skills/skills-skills/scripts/Skill_Seekers-development/configs/steam-economy-complete.json new file mode 100644 index 0000000..2642cd9 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/configs/steam-economy-complete.json @@ -0,0 +1,108 @@ +{ + "name": "steam-economy-complete", + "description": "Complete Steam Economy 
system including inventory, microtransactions, trading, and monetization. Use for ISteamInventory API, ISteamEconomy API, IInventoryService Web API, Steam Wallet integration, in-app purchases, item definitions, trading, crafting, market integration, and all economy features for game developers.", + "base_url": "https://partner.steamgames.com/doc/", + "start_urls": [ + "https://partner.steamgames.com/doc/features/inventory", + "https://partner.steamgames.com/doc/features/microtransactions", + "https://partner.steamgames.com/doc/features/microtransactions/implementation", + "https://partner.steamgames.com/doc/api/ISteamInventory", + "https://partner.steamgames.com/doc/webapi/ISteamEconomy", + "https://partner.steamgames.com/doc/webapi/IInventoryService", + "https://partner.steamgames.com/doc/features/inventory/economy" + ], + "selectors": { + "main_content": "div.documentation_bbcode", + "title": "div.docPageTitle", + "code_blocks": "div.bb_code" + }, + "url_patterns": { + "include": [ + "/features/inventory", + "/features/microtransactions", + "/api/ISteamInventory", + "/webapi/ISteamEconomy", + "/webapi/IInventoryService" + ], + "exclude": [ + "/home", + "/sales", + "/marketing", + "/legal", + "/finance", + "/login", + "/search", + "/steamworks/apps", + "/steamworks/partner" + ] + }, + "categories": { + "getting_started": [ + "overview", + "getting started", + "introduction", + "quickstart", + "setup" + ], + "inventory_system": [ + "inventory", + "item definition", + "item schema", + "item properties", + "itemdefs", + "ISteamInventory" + ], + "microtransactions": [ + "microtransaction", + "purchase", + "payment", + "checkout", + "wallet", + "transaction" + ], + "economy_api": [ + "ISteamEconomy", + "economy", + "asset", + "context" + ], + "inventory_webapi": [ + "IInventoryService", + "webapi", + "web api", + "http" + ], + "trading": [ + "trading", + "trade", + "exchange", + "market" + ], + "crafting": [ + "crafting", + "recipe", + "combine", + "exchange" + ], + 
"pricing": [ + "pricing", + "price", + "cost", + "currency" + ], + "implementation": [ + "integration", + "implementation", + "configure", + "best practices" + ], + "examples": [ + "example", + "sample", + "tutorial", + "walkthrough" + ] + }, + "rate_limit": 0.7, + "max_pages": 1000 +} diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/configs/tailwind.json b/skills/skills-skills/scripts/Skill_Seekers-development/configs/tailwind.json new file mode 100644 index 0000000..38a11d7 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/configs/tailwind.json @@ -0,0 +1,30 @@ +{ + "name": "tailwind", + "description": "Tailwind CSS utility-first framework for rapid UI development. Use for Tailwind utilities, responsive design, custom configurations, and modern CSS workflows.", + "base_url": "https://tailwindcss.com/docs", + "start_urls": [ + "https://tailwindcss.com/docs/installation", + "https://tailwindcss.com/docs/utility-first", + "https://tailwindcss.com/docs/responsive-design", + "https://tailwindcss.com/docs/hover-focus-and-other-states" + ], + "selectors": { + "main_content": "div.prose", + "title": "h1", + "code_blocks": "pre code" + }, + "url_patterns": { + "include": ["/docs"], + "exclude": ["/blog", "/resources"] + }, + "categories": { + "getting_started": ["installation", "editor-setup", "intellisense"], + "core_concepts": ["utility-first", "responsive", "hover-focus", "dark-mode"], + "layout": ["container", "columns", "flex", "grid"], + "typography": ["font-family", "font-size", "text-align", "text-color"], + "backgrounds": ["background-color", "background-image", "gradient"], + "customization": ["configuration", "theme", "plugins"] + }, + "rate_limit": 0.5, + "max_pages": 100 +} diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/configs/test-manual.json b/skills/skills-skills/scripts/Skill_Seekers-development/configs/test-manual.json new file mode 100644 index 0000000..cfbcba5 --- /dev/null +++ 
b/skills/skills-skills/scripts/Skill_Seekers-development/configs/test-manual.json @@ -0,0 +1,17 @@ +{ + "name": "test-manual", + "description": "Manual test config", + "base_url": "https://test.example.com/", + "selectors": { + "main_content": "article", + "title": "h1", + "code_blocks": "pre code" + }, + "url_patterns": { + "include": [], + "exclude": [] + }, + "categories": {}, + "rate_limit": 0.5, + "max_pages": 50 +} \ No newline at end of file diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/configs/vue.json b/skills/skills-skills/scripts/Skill_Seekers-development/configs/vue.json new file mode 100644 index 0000000..dc39d13 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/configs/vue.json @@ -0,0 +1,31 @@ +{ + "name": "vue", + "description": "Vue.js progressive JavaScript framework. Use for Vue components, reactivity, composition API, and frontend development.", + "base_url": "https://vuejs.org/", + "start_urls": [ + "https://vuejs.org/guide/introduction.html", + "https://vuejs.org/guide/quick-start.html", + "https://vuejs.org/guide/essentials/application.html", + "https://vuejs.org/guide/components/registration.html", + "https://vuejs.org/guide/reusability/composables.html", + "https://vuejs.org/api/" + ], + "selectors": { + "main_content": "main", + "title": "h1", + "code_blocks": "pre code" + }, + "url_patterns": { + "include": ["/guide/", "/api/", "/examples/"], + "exclude": ["/about/", "/sponsor/", "/partners/"] + }, + "categories": { + "getting_started": ["quick-start", "introduction", "essentials"], + "components": ["component", "props", "events"], + "reactivity": ["reactivity", "reactive", "ref", "computed"], + "composition_api": ["composition", "setup"], + "api": ["api", "reference"] + }, + "rate_limit": 0.5, + "max_pages": 200 +} diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/demo_conflicts.py b/skills/skills-skills/scripts/Skill_Seekers-development/demo_conflicts.py new file mode 
100644 index 0000000..776ad50 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/demo_conflicts.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 +""" +Demo: Conflict Detection and Reporting + +This demonstrates the unified scraper's ability to detect and report +conflicts between documentation and code implementation. +""" + +import sys +import json +from pathlib import Path + +# Add CLI to path +sys.path.insert(0, str(Path(__file__).parent / 'cli')) + +from conflict_detector import ConflictDetector + +print("=" * 70) +print("UNIFIED SCRAPER - CONFLICT DETECTION DEMO") +print("=" * 70) +print() + +# Load test data +print("📂 Loading test data...") +print(" - Documentation APIs from example docs") +print(" - Code APIs from example repository") +print() + +with open('cli/conflicts.json', 'r') as f: + conflicts_data = json.load(f) + +conflicts = conflicts_data['conflicts'] +summary = conflicts_data['summary'] + +print(f"✅ Loaded {summary['total']} conflicts") +print() + +# Display summary +print("=" * 70) +print("CONFLICT SUMMARY") +print("=" * 70) +print() + +print(f"📊 **Total Conflicts**: {summary['total']}") +print() + +print("**By Type:**") +for conflict_type, count in summary['by_type'].items(): + if count > 0: + emoji = "📖" if conflict_type == "missing_in_docs" else "💻" if conflict_type == "missing_in_code" else "⚠️" + print(f" {emoji} {conflict_type}: {count}") +print() + +print("**By Severity:**") +for severity, count in summary['by_severity'].items(): + if count > 0: + emoji = "🔴" if severity == "high" else "🟡" if severity == "medium" else "🟢" + print(f" {emoji} {severity.upper()}: {count}") +print() + +# Display detailed conflicts +print("=" * 70) +print("DETAILED CONFLICT REPORTS") +print("=" * 70) +print() + +# Group by severity +high = [c for c in conflicts if c['severity'] == 'high'] +medium = [c for c in conflicts if c['severity'] == 'medium'] +low = [c for c in conflicts if c['severity'] == 'low'] + +# Show high severity first +if 
high: + print("🔴 **HIGH SEVERITY CONFLICTS** (Requires immediate attention)") + print("-" * 70) + for conflict in high: + print() + print(f"**API**: `{conflict['api_name']}`") + print(f"**Type**: {conflict['type']}") + print(f"**Issue**: {conflict['difference']}") + print(f"**Suggestion**: {conflict['suggestion']}") + + if conflict['docs_info']: + print(f"\n**Documented as**:") + print(f" Signature: {conflict['docs_info'].get('raw_signature', 'N/A')}") + + if conflict['code_info']: + print(f"\n**Implemented as**:") + params = conflict['code_info'].get('parameters', []) + param_str = ', '.join(f"{p['name']}: {p.get('type_hint', 'Any')}" for p in params if p['name'] != 'self') + print(f" Signature: {conflict['code_info']['name']}({param_str})") + print(f" Return type: {conflict['code_info'].get('return_type', 'None')}") + print(f" Location: {conflict['code_info'].get('source', 'N/A')}:{conflict['code_info'].get('line', '?')}") + print() + +# Show medium severity +if medium: + print("🟡 **MEDIUM SEVERITY CONFLICTS** (Review recommended)") + print("-" * 70) + for conflict in medium[:3]: # Show first 3 + print() + print(f"**API**: `{conflict['api_name']}`") + print(f"**Type**: {conflict['type']}") + print(f"**Issue**: {conflict['difference']}") + + if conflict['code_info']: + print(f"**Location**: {conflict['code_info'].get('source', 'N/A')}") + + if len(medium) > 3: + print(f"\n ... 
and {len(medium) - 3} more medium severity conflicts") + print() + +# Example: How conflicts appear in final skill +print("=" * 70) +print("HOW CONFLICTS APPEAR IN SKILL.MD") +print("=" * 70) +print() + +example_conflict = high[0] if high else medium[0] if medium else conflicts[0] + +print("```markdown") +print("## 🔧 API Reference") +print() +print("### ⚠️ APIs with Conflicts") +print() +print(f"#### `{example_conflict['api_name']}`") +print() +print(f"⚠️ **Conflict**: {example_conflict['difference']}") +print() + +if example_conflict.get('docs_info'): + print("**Documentation says:**") + print("```") + print(example_conflict['docs_info'].get('raw_signature', 'N/A')) + print("```") + print() + +if example_conflict.get('code_info'): + print("**Code implementation:**") + print("```python") + params = example_conflict['code_info'].get('parameters', []) + param_strs = [] + for p in params: + if p['name'] == 'self': + continue + param_str = p['name'] + if p.get('type_hint'): + param_str += f": {p['type_hint']}" + if p.get('default'): + param_str += f" = {p['default']}" + param_strs.append(param_str) + + sig = f"def {example_conflict['code_info']['name']}({', '.join(param_strs)})" + if example_conflict['code_info'].get('return_type'): + sig += f" -> {example_conflict['code_info']['return_type']}" + + print(sig) + print("```") +print() + +print("*Source: both (conflict)*") +print("```") +print() + +# Key takeaways +print("=" * 70) +print("KEY TAKEAWAYS") +print("=" * 70) +print() + +print("✅ **What the Unified Scraper Does:**") +print(" 1. Extracts APIs from both documentation and code") +print(" 2. Compares them to detect discrepancies") +print(" 3. Classifies conflicts by type and severity") +print(" 4. Provides actionable suggestions") +print(" 5. 
Shows both versions transparently in the skill") +print() + +print("⚠️ **Common Conflict Types:**") +print(" - **Missing in docs**: Undocumented features in code") +print(" - **Missing in code**: Documented but not implemented") +print(" - **Signature mismatch**: Different parameters/types") +print(" - **Description mismatch**: Different explanations") +print() + +print("🎯 **Value:**") +print(" - Identifies documentation gaps") +print(" - Catches outdated documentation") +print(" - Highlights implementation differences") +print(" - Creates single source of truth showing reality") +print() + +print("=" * 70) +print("END OF DEMO") +print("=" * 70) diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/example-mcp-config.json b/skills/skills-skills/scripts/Skill_Seekers-development/example-mcp-config.json new file mode 100644 index 0000000..80d946c --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/example-mcp-config.json @@ -0,0 +1,11 @@ +{ + "mcpServers": { + "skill-seeker": { + "command": "python3", + "args": [ + "/mnt/1ece809a-2821-4f10-aecb-fcdf34760c0b/Git/Skill_Seekers/mcp/server.py" + ], + "cwd": "/mnt/1ece809a-2821-4f10-aecb-fcdf34760c0b/Git/Skill_Seekers" + } + } +} diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/mypy.ini b/skills/skills-skills/scripts/Skill_Seekers-development/mypy.ini new file mode 100644 index 0000000..857c31c --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/mypy.ini @@ -0,0 +1,13 @@ +[mypy] +python_version = 3.10 +warn_return_any = False +warn_unused_configs = True +disallow_untyped_defs = False +check_untyped_defs = True +ignore_missing_imports = True +no_implicit_optional = True +show_error_codes = True + +# Gradual typing - be lenient for now +disallow_incomplete_defs = False +disallow_untyped_calls = False diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/pyproject.toml 
b/skills/skills-skills/scripts/Skill_Seekers-development/pyproject.toml new file mode 100644 index 0000000..91c8391 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/pyproject.toml @@ -0,0 +1,149 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "skill-seekers" +version = "2.1.1" +description = "Convert documentation websites, GitHub repositories, and PDFs into Claude AI skills" +readme = "README.md" +requires-python = ">=3.10" +license = {text = "MIT"} +authors = [ + {name = "Yusuf Karaaslan"} +] +keywords = [ + "claude", + "ai", + "documentation", + "scraping", + "skills", + "llm", + "mcp", + "automation" +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Topic :: Software Development :: Documentation", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Text Processing :: Markup :: Markdown", +] + +# Core dependencies +dependencies = [ + "requests>=2.32.5", + "beautifulsoup4>=4.14.2", + "PyGithub>=2.5.0", + "mcp>=1.18.0", + "httpx>=0.28.1", + "httpx-sse>=0.4.3", + "PyMuPDF>=1.24.14", + "Pillow>=11.0.0", + "pytesseract>=0.3.13", + "pydantic>=2.12.3", + "pydantic-settings>=2.11.0", + "python-dotenv>=1.1.1", + "jsonschema>=4.25.1", + "click>=8.3.0", + "Pygments>=2.19.2", +] + +[project.optional-dependencies] +# Development dependencies +dev = [ + "pytest>=8.4.2", + "pytest-cov>=7.0.0", + "coverage>=7.11.0", +] + +# MCP server dependencies (included by default, but optional) +mcp = [ + "mcp>=1.18.0", + "httpx>=0.28.1", + "httpx-sse>=0.4.3", + "uvicorn>=0.38.0", + "starlette>=0.48.0", + 
"sse-starlette>=3.0.2", +] + +# All optional dependencies combined +all = [ + "pytest>=8.4.2", + "pytest-cov>=7.0.0", + "coverage>=7.11.0", + "mcp>=1.18.0", + "httpx>=0.28.1", + "httpx-sse>=0.4.3", + "uvicorn>=0.38.0", + "starlette>=0.48.0", + "sse-starlette>=3.0.2", +] + +[project.urls] +Homepage = "https://github.com/yusufkaraaslan/Skill_Seekers" +Repository = "https://github.com/yusufkaraaslan/Skill_Seekers" +"Bug Tracker" = "https://github.com/yusufkaraaslan/Skill_Seekers/issues" +Documentation = "https://github.com/yusufkaraaslan/Skill_Seekers#readme" + +[project.scripts] +# Main unified CLI +skill-seekers = "skill_seekers.cli.main:main" + +# Individual tool entry points +skill-seekers-scrape = "skill_seekers.cli.doc_scraper:main" +skill-seekers-github = "skill_seekers.cli.github_scraper:main" +skill-seekers-pdf = "skill_seekers.cli.pdf_scraper:main" +skill-seekers-unified = "skill_seekers.cli.unified_scraper:main" +skill-seekers-enhance = "skill_seekers.cli.enhance_skill_local:main" +skill-seekers-package = "skill_seekers.cli.package_skill:main" +skill-seekers-upload = "skill_seekers.cli.upload_skill:main" +skill-seekers-estimate = "skill_seekers.cli.estimate_pages:main" + +[tool.setuptools] +packages = ["skill_seekers", "skill_seekers.cli", "skill_seekers.mcp", "skill_seekers.mcp.tools"] + +[tool.setuptools.package-dir] +"" = "src" + +[tool.setuptools.package-data] +skill_seekers = ["py.typed"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +python_files = ["test_*.py"] +python_classes = ["Test*"] +python_functions = ["test_*"] +addopts = "-v --tb=short --strict-markers" + +[tool.coverage.run] +source = ["src/skill_seekers"] +omit = ["*/tests/*", "*/__pycache__/*", "*/venv/*"] + +[tool.coverage.report] +exclude_lines = [ + "pragma: no cover", + "def __repr__", + "raise AssertionError", + "raise NotImplementedError", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", + "@abstractmethod", +] + +[tool.uv] +dev-dependencies = [ + "pytest>=8.4.2", + 
"pytest-cov>=7.0.0", + "coverage>=7.11.0", +] + +[tool.uv.sources] +# Use PyPI for all dependencies diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/requirements.txt b/skills/skills-skills/scripts/Skill_Seekers-development/requirements.txt new file mode 100644 index 0000000..c6e9ced --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/requirements.txt @@ -0,0 +1,42 @@ +annotated-types==0.7.0 +anyio==4.11.0 +attrs==25.4.0 +beautifulsoup4==4.14.2 +certifi==2025.10.5 +charset-normalizer==3.4.4 +click==8.3.0 +coverage==7.11.0 +h11==0.16.0 +httpcore==1.0.9 +httpx==0.28.1 +httpx-sse==0.4.3 +idna==3.11 +iniconfig==2.3.0 +jsonschema==4.25.1 +jsonschema-specifications==2025.9.1 +mcp==1.18.0 +packaging==25.0 +pluggy==1.6.0 +pydantic==2.12.3 +pydantic-settings==2.11.0 +pydantic_core==2.41.4 +PyGithub==2.5.0 +Pygments==2.19.2 +PyMuPDF==1.24.14 +Pillow==11.0.0 +pytesseract==0.3.13 +pytest==8.4.2 +pytest-cov==7.0.0 +python-dotenv==1.1.1 +python-multipart==0.0.20 +referencing==0.37.0 +requests==2.32.5 +rpds-py==0.27.1 +sniffio==1.3.1 +soupsieve==2.8 +sse-starlette==3.0.2 +starlette==0.48.0 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.5.0 +uvicorn==0.38.0 diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/setup_mcp.sh b/skills/skills-skills/scripts/Skill_Seekers-development/setup_mcp.sh new file mode 100644 index 0000000..4047102 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/setup_mcp.sh @@ -0,0 +1,266 @@ +#!/bin/bash +# Skill Seeker MCP Server - Quick Setup Script +# This script automates the MCP server setup for Claude Code + +set -e # Exit on error + +echo "==================================================" +echo "Skill Seeker MCP Server - Quick Setup" +echo "==================================================" +echo "" + +# Colors for output +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +# Step 1: Check Python version +echo "Step 1: 
Checking Python version..." +if ! command -v python3 &> /dev/null; then + echo -e "${RED}❌ Error: python3 not found${NC}" + echo "Please install Python 3.7 or higher" + exit 1 +fi + +PYTHON_VERSION=$(python3 --version | cut -d' ' -f2) +echo -e "${GREEN}✓${NC} Python $PYTHON_VERSION found" +echo "" + +# Step 2: Get repository path +REPO_PATH=$(pwd) +echo "Step 2: Repository location" +echo "Path: $REPO_PATH" +echo "" + +# Step 3: Install dependencies +echo "Step 3: Installing Python dependencies..." + +# Check if we're in a virtual environment +if [[ -n "$VIRTUAL_ENV" ]]; then + echo -e "${GREEN}✓${NC} Virtual environment detected: $VIRTUAL_ENV" + PIP_INSTALL_CMD="pip install" +elif [[ -d "venv" ]]; then + echo -e "${YELLOW}⚠${NC} Virtual environment found but not activated" + echo "Activating venv..." + source venv/bin/activate + PIP_INSTALL_CMD="pip install" +else + echo -e "${YELLOW}⚠${NC} No virtual environment found" + echo "It's recommended to use a virtual environment to avoid conflicts." + echo "" + read -p "Would you like to create one now? (y/n) " -n 1 -r + echo "" + + if [[ $REPLY =~ ^[Yy]$ ]]; then + echo "Creating virtual environment..." + python3 -m venv venv || { + echo -e "${RED}❌ Failed to create virtual environment${NC}" + echo "Falling back to system install..." + PIP_INSTALL_CMD="pip3 install --user --break-system-packages" + } + + if [[ -d "venv" ]]; then + source venv/bin/activate + PIP_INSTALL_CMD="pip install" + echo -e "${GREEN}✓${NC} Virtual environment created and activated" + fi + else + echo "Proceeding with system install (using --user --break-system-packages)..." + echo -e "${YELLOW}Note:${NC} This may override system-managed packages" + PIP_INSTALL_CMD="pip3 install --user --break-system-packages" + fi +fi + +echo "This will install: mcp, requests, beautifulsoup4" +read -p "Continue? (y/n) " -n 1 -r +echo "" + +if [[ $REPLY =~ ^[Yy]$ ]]; then + echo "Installing package in editable mode..." + $PIP_INSTALL_CMD -e . 
|| { + echo -e "${RED}❌ Failed to install package${NC}" + exit 1 + } + + echo -e "${GREEN}✓${NC} Dependencies installed successfully" +else + echo "Skipping dependency installation" +fi +echo "" + +# Step 4: Test MCP server +echo "Step 4: Testing MCP server..." +timeout 3 python3 src/skill_seekers/mcp/server.py 2>/dev/null || { + if [ $? -eq 124 ]; then + echo -e "${GREEN}✓${NC} MCP server starts correctly (timeout expected)" + else + echo -e "${YELLOW}⚠${NC} MCP server test inconclusive, but may still work" + fi +} +echo "" + +# Step 5: Optional - Run tests +echo "Step 5: Run test suite? (optional)" +read -p "Run MCP tests to verify everything works? (y/n) " -n 1 -r +echo "" + +if [[ $REPLY =~ ^[Yy]$ ]]; then + # Check if pytest is installed + if ! command -v pytest &> /dev/null; then + echo "Installing pytest..." + $PIP_INSTALL_CMD pytest || { + echo -e "${YELLOW}⚠${NC} Could not install pytest, skipping tests" + } + fi + + if command -v pytest &> /dev/null; then + echo "Running MCP server tests..." 
+ python3 -m pytest tests/test_mcp_server.py -v --tb=short || { + echo -e "${RED}❌ Some tests failed${NC}" + echo "The server may still work, but please check the errors above" + } + fi +else + echo "Skipping tests" +fi +echo "" + +# Step 6: Configure Claude Code +echo "Step 6: Configure Claude Code" +echo "==================================================" +echo "" +echo "You need to add this configuration to Claude Code:" +echo "" +echo -e "${YELLOW}Configuration file:${NC} ~/.config/claude-code/mcp.json" +echo "" +echo "Add this JSON configuration (paths are auto-detected for YOUR system):" +echo "" +echo -e "${GREEN}{" +echo " \"mcpServers\": {" +echo " \"skill-seeker\": {" +echo " \"command\": \"python3\"," +echo " \"args\": [" +echo " \"$REPO_PATH/src/skill_seekers/mcp/server.py\"" +echo " ]," +echo " \"cwd\": \"$REPO_PATH\"" +echo " }" +echo " }" +echo -e "}${NC}" +echo "" +echo -e "${YELLOW}Note:${NC} The paths above are YOUR actual paths (not placeholders!)" +echo "" + +# Ask if user wants auto-configure +echo "" +read -p "Auto-configure Claude Code now? (y/n) " -n 1 -r +echo "" + +if [[ $REPLY =~ ^[Yy]$ ]]; then + # Check if config already exists + if [ -f ~/.config/claude-code/mcp.json ]; then + echo -e "${YELLOW}⚠ Warning: ~/.config/claude-code/mcp.json already exists${NC}" + echo "Current contents:" + cat ~/.config/claude-code/mcp.json + echo "" + read -p "Overwrite? (y/n) " -n 1 -r + echo "" + if [[ ! 
$REPLY =~ ^[Yy]$ ]]; then + echo "Skipping auto-configuration" + echo "Please manually add the skill-seeker server to your config" + exit 0 + fi + fi + + # Create config directory + mkdir -p ~/.config/claude-code + + # Write configuration with actual expanded path + cat > ~/.config/claude-code/mcp.json << EOF +{ + "mcpServers": { + "skill-seeker": { + "command": "python3", + "args": [ + "$REPO_PATH/src/skill_seekers/mcp/server.py" + ], + "cwd": "$REPO_PATH" + } + } +} +EOF + + echo -e "${GREEN}✓${NC} Configuration written to ~/.config/claude-code/mcp.json" + echo "" + echo "Configuration contents:" + cat ~/.config/claude-code/mcp.json + echo "" + + # Verify the path exists + if [ -f "$REPO_PATH/src/skill_seekers/mcp/server.py" ]; then + echo -e "${GREEN}✓${NC} Verified: MCP server file exists at $REPO_PATH/src/skill_seekers/mcp/server.py" + else + echo -e "${RED}❌ Warning: MCP server not found at $REPO_PATH/src/skill_seekers/mcp/server.py${NC}" + echo "Please check the path!" + fi +else + echo "Skipping auto-configuration" + echo "Please manually configure Claude Code using the JSON above" + echo "" + echo "IMPORTANT: Replace \$REPO_PATH with the actual path: $REPO_PATH" +fi +echo "" + +# Step 7: Test the configuration +if [ -f ~/.config/claude-code/mcp.json ]; then + echo "Step 7: Testing MCP configuration..." + echo "Checking if paths are correct..." 
+ + # Extract the configured path + if command -v jq &> /dev/null; then + CONFIGURED_PATH=$(jq -r '.mcpServers["skill-seeker"].args[0]' ~/.config/claude-code/mcp.json 2>/dev/null || echo "") + if [ -n "$CONFIGURED_PATH" ] && [ -f "$CONFIGURED_PATH" ]; then + echo -e "${GREEN}✓${NC} MCP server path is valid: $CONFIGURED_PATH" + elif [ -n "$CONFIGURED_PATH" ]; then + echo -e "${YELLOW}⚠${NC} Warning: Configured path doesn't exist: $CONFIGURED_PATH" + fi + else + echo "Install 'jq' for config validation: brew install jq (macOS) or apt install jq (Linux)" + fi +fi +echo "" + +# Step 8: Final instructions +echo "==================================================" +echo "Setup Complete!" +echo "==================================================" +echo "" +echo "Next steps:" +echo "" +echo " 1. ${YELLOW}Restart Claude Code${NC} (quit and reopen, don't just close window)" +echo " 2. In Claude Code, test with: ${GREEN}\"List all available configs\"${NC}" +echo " 3. You should see 9 Skill Seeker tools available" +echo "" +echo "Available MCP Tools:" +echo " • generate_config - Create new config files" +echo " • estimate_pages - Estimate scraping time" +echo " • scrape_docs - Scrape documentation" +echo " • package_skill - Create .zip files" +echo " • list_configs - Show available configs" +echo " • validate_config - Validate config files" +echo "" +echo "Example commands to try in Claude Code:" +echo " • ${GREEN}List all available configs${NC}" +echo " • ${GREEN}Validate configs/react.json${NC}" +echo " • ${GREEN}Generate config for Tailwind at https://tailwindcss.com/docs${NC}" +echo "" +echo "Documentation:" +echo " • MCP Setup Guide: ${YELLOW}docs/MCP_SETUP.md${NC}" +echo " • Full docs: ${YELLOW}README.md${NC}" +echo "" +echo "Troubleshooting:" +echo " • Check logs: ~/Library/Logs/Claude Code/ (macOS)" +echo " • Test server: python3 src/skill_seekers/mcp/server.py" +echo " • Run tests: python3 -m pytest tests/test_mcp_server.py -v" +echo "" +echo "Happy skill creating! 
🚀" diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/__init__.py b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/__init__.py new file mode 100644 index 0000000..752904b --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/__init__.py @@ -0,0 +1,22 @@ +""" +Skill Seekers - Convert documentation, GitHub repos, and PDFs into Claude AI skills. + +This package provides tools for automatically scraping, organizing, and packaging +documentation from various sources into uploadable Claude AI skills. +""" + +__version__ = "2.0.0" +__author__ = "Yusuf Karaaslan" +__license__ = "MIT" + +# Expose main components for easier imports +from skill_seekers.cli import __version__ as cli_version +from skill_seekers.mcp import __version__ as mcp_version + +__all__ = [ + "__version__", + "__author__", + "__license__", + "cli_version", + "mcp_version", +] diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/__init__.py b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/__init__.py new file mode 100644 index 0000000..d782d5d --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/__init__.py @@ -0,0 +1,39 @@ +"""Skill Seekers CLI tools package. + +This package provides command-line tools for converting documentation +websites into Claude AI skills. 
+ +Main modules: + - doc_scraper: Main documentation scraping and skill building tool + - llms_txt_detector: Detect llms.txt files at documentation URLs + - llms_txt_downloader: Download llms.txt content + - llms_txt_parser: Parse llms.txt markdown content + - pdf_scraper: Extract documentation from PDF files + - enhance_skill: AI-powered skill enhancement (API-based) + - enhance_skill_local: AI-powered skill enhancement (local) + - estimate_pages: Estimate page count before scraping + - package_skill: Package skills into .zip files + - upload_skill: Upload skills to Claude + - utils: Shared utility functions +""" + +from .llms_txt_detector import LlmsTxtDetector +from .llms_txt_downloader import LlmsTxtDownloader +from .llms_txt_parser import LlmsTxtParser + +try: + from .utils import open_folder, read_reference_files +except ImportError: + # utils.py might not exist in all configurations + open_folder = None + read_reference_files = None + +__version__ = "2.0.0" + +__all__ = [ + "LlmsTxtDetector", + "LlmsTxtDownloader", + "LlmsTxtParser", + "open_folder", + "read_reference_files", +] diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/code_analyzer.py b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/code_analyzer.py new file mode 100644 index 0000000..cf33b16 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/code_analyzer.py @@ -0,0 +1,500 @@ +#!/usr/bin/env python3 +""" +Code Analyzer for GitHub Repositories + +Extracts code signatures at configurable depth levels: +- surface: File tree only (existing behavior) +- deep: Parse files for signatures, parameters, types +- full: Complete AST analysis (future enhancement) + +Supports multiple languages with language-specific parsers. 
+""" + +import ast +import re +import logging +from typing import Dict, List, Any, Optional +from dataclasses import dataclass, asdict + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +@dataclass +class Parameter: + """Represents a function parameter.""" + name: str + type_hint: Optional[str] = None + default: Optional[str] = None + + +@dataclass +class FunctionSignature: + """Represents a function/method signature.""" + name: str + parameters: List[Parameter] + return_type: Optional[str] = None + docstring: Optional[str] = None + line_number: Optional[int] = None + is_async: bool = False + is_method: bool = False + decorators: List[str] = None + + def __post_init__(self): + if self.decorators is None: + self.decorators = [] + + +@dataclass +class ClassSignature: + """Represents a class signature.""" + name: str + base_classes: List[str] + methods: List[FunctionSignature] + docstring: Optional[str] = None + line_number: Optional[int] = None + + +class CodeAnalyzer: + """ + Analyzes code at different depth levels. + """ + + def __init__(self, depth: str = 'surface'): + """ + Initialize code analyzer. + + Args: + depth: Analysis depth ('surface', 'deep', 'full') + """ + self.depth = depth + + def analyze_file(self, file_path: str, content: str, language: str) -> Dict[str, Any]: + """ + Analyze a single file based on depth level. + + Args: + file_path: Path to file in repository + content: File content as string + language: Programming language (Python, JavaScript, etc.) 
+ + Returns: + Dict containing extracted signatures + """ + if self.depth == 'surface': + return {} # Surface level doesn't analyze individual files + + logger.debug(f"Analyzing {file_path} (language: {language}, depth: {self.depth})") + + try: + if language == 'Python': + return self._analyze_python(content, file_path) + elif language in ['JavaScript', 'TypeScript']: + return self._analyze_javascript(content, file_path) + elif language in ['C', 'C++']: + return self._analyze_cpp(content, file_path) + else: + logger.debug(f"No analyzer for language: {language}") + return {} + except Exception as e: + logger.warning(f"Error analyzing {file_path}: {e}") + return {} + + def _analyze_python(self, content: str, file_path: str) -> Dict[str, Any]: + """Analyze Python file using AST.""" + try: + tree = ast.parse(content) + except SyntaxError as e: + logger.debug(f"Syntax error in {file_path}: {e}") + return {} + + classes = [] + functions = [] + + for node in ast.walk(tree): + if isinstance(node, ast.ClassDef): + class_sig = self._extract_python_class(node) + classes.append(asdict(class_sig)) + elif isinstance(node, ast.FunctionDef) or isinstance(node, ast.AsyncFunctionDef): + # Only top-level functions (not methods) + # Fix AST parser to check isinstance(parent.body, list) before 'in' operator + is_method = False + try: + is_method = any(isinstance(parent, ast.ClassDef) + for parent in ast.walk(tree) + if hasattr(parent, 'body') and isinstance(parent.body, list) and node in parent.body) + except (TypeError, AttributeError): + # If body is not iterable or check fails, assume it's a top-level function + is_method = False + + if not is_method: + func_sig = self._extract_python_function(node) + functions.append(asdict(func_sig)) + + return { + 'classes': classes, + 'functions': functions + } + + def _extract_python_class(self, node: ast.ClassDef) -> ClassSignature: + """Extract class signature from AST node.""" + # Extract base classes + bases = [] + for base in node.bases: + 
if isinstance(base, ast.Name): + bases.append(base.id) + elif isinstance(base, ast.Attribute): + bases.append(f"{base.value.id}.{base.attr}" if hasattr(base.value, 'id') else base.attr) + + # Extract methods + methods = [] + for item in node.body: + if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)): + method_sig = self._extract_python_function(item, is_method=True) + methods.append(method_sig) + + # Extract docstring + docstring = ast.get_docstring(node) + + return ClassSignature( + name=node.name, + base_classes=bases, + methods=methods, + docstring=docstring, + line_number=node.lineno + ) + + def _extract_python_function(self, node, is_method: bool = False) -> FunctionSignature: + """Extract function signature from AST node.""" + # Extract parameters + params = [] + for arg in node.args.args: + param_type = None + if arg.annotation: + param_type = ast.unparse(arg.annotation) if hasattr(ast, 'unparse') else None + + params.append(Parameter( + name=arg.arg, + type_hint=param_type + )) + + # Extract defaults + defaults = node.args.defaults + if defaults: + # Defaults are aligned to the end of params + num_no_default = len(params) - len(defaults) + for i, default in enumerate(defaults): + param_idx = num_no_default + i + if param_idx < len(params): + try: + params[param_idx].default = ast.unparse(default) if hasattr(ast, 'unparse') else str(default) + except: + params[param_idx].default = "..." 
+ + # Extract return type + return_type = None + if node.returns: + try: + return_type = ast.unparse(node.returns) if hasattr(ast, 'unparse') else None + except: + pass + + # Extract decorators + decorators = [] + for decorator in node.decorator_list: + try: + if hasattr(ast, 'unparse'): + decorators.append(ast.unparse(decorator)) + elif isinstance(decorator, ast.Name): + decorators.append(decorator.id) + except: + pass + + # Extract docstring + docstring = ast.get_docstring(node) + + return FunctionSignature( + name=node.name, + parameters=params, + return_type=return_type, + docstring=docstring, + line_number=node.lineno, + is_async=isinstance(node, ast.AsyncFunctionDef), + is_method=is_method, + decorators=decorators + ) + + def _analyze_javascript(self, content: str, file_path: str) -> Dict[str, Any]: + """ + Analyze JavaScript/TypeScript file using regex patterns. + + Note: This is a simplified approach. For production, consider using + a proper JS/TS parser like esprima or ts-morph. 
+ """ + classes = [] + functions = [] + + # Extract class definitions + class_pattern = r'class\s+(\w+)(?:\s+extends\s+(\w+))?\s*\{' + for match in re.finditer(class_pattern, content): + class_name = match.group(1) + base_class = match.group(2) if match.group(2) else None + + # Try to extract methods (simplified) + class_block_start = match.end() + # This is a simplification - proper parsing would track braces + class_block_end = content.find('}', class_block_start) + if class_block_end != -1: + class_body = content[class_block_start:class_block_end] + methods = self._extract_js_methods(class_body) + else: + methods = [] + + classes.append({ + 'name': class_name, + 'base_classes': [base_class] if base_class else [], + 'methods': methods, + 'docstring': None, + 'line_number': content[:match.start()].count('\n') + 1 + }) + + # Extract top-level functions + func_pattern = r'(?:async\s+)?function\s+(\w+)\s*\(([^)]*)\)' + for match in re.finditer(func_pattern, content): + func_name = match.group(1) + params_str = match.group(2) + is_async = 'async' in match.group(0) + + params = self._parse_js_parameters(params_str) + + functions.append({ + 'name': func_name, + 'parameters': params, + 'return_type': None, # JS doesn't have type annotations (unless TS) + 'docstring': None, + 'line_number': content[:match.start()].count('\n') + 1, + 'is_async': is_async, + 'is_method': False, + 'decorators': [] + }) + + # Extract arrow functions assigned to const/let + arrow_pattern = r'(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?\(([^)]*)\)\s*=>' + for match in re.finditer(arrow_pattern, content): + func_name = match.group(1) + params_str = match.group(2) + is_async = 'async' in match.group(0) + + params = self._parse_js_parameters(params_str) + + functions.append({ + 'name': func_name, + 'parameters': params, + 'return_type': None, + 'docstring': None, + 'line_number': content[:match.start()].count('\n') + 1, + 'is_async': is_async, + 'is_method': False, + 'decorators': [] + }) + + 
return { + 'classes': classes, + 'functions': functions + } + + def _extract_js_methods(self, class_body: str) -> List[Dict]: + """Extract method signatures from class body.""" + methods = [] + + # Match method definitions + method_pattern = r'(?:async\s+)?(\w+)\s*\(([^)]*)\)' + for match in re.finditer(method_pattern, class_body): + method_name = match.group(1) + params_str = match.group(2) + is_async = 'async' in match.group(0) + + # Skip constructor keyword detection + if method_name in ['if', 'for', 'while', 'switch']: + continue + + params = self._parse_js_parameters(params_str) + + methods.append({ + 'name': method_name, + 'parameters': params, + 'return_type': None, + 'docstring': None, + 'line_number': None, + 'is_async': is_async, + 'is_method': True, + 'decorators': [] + }) + + return methods + + def _parse_js_parameters(self, params_str: str) -> List[Dict]: + """Parse JavaScript parameter string.""" + params = [] + + if not params_str.strip(): + return params + + # Split by comma (simplified - doesn't handle complex default values) + param_list = [p.strip() for p in params_str.split(',')] + + for param in param_list: + if not param: + continue + + # Check for default value + if '=' in param: + name, default = param.split('=', 1) + name = name.strip() + default = default.strip() + else: + name = param + default = None + + # Check for type annotation (TypeScript) + type_hint = None + if ':' in name: + name, type_hint = name.split(':', 1) + name = name.strip() + type_hint = type_hint.strip() + + params.append({ + 'name': name, + 'type_hint': type_hint, + 'default': default + }) + + return params + + def _analyze_cpp(self, content: str, file_path: str) -> Dict[str, Any]: + """ + Analyze C/C++ header file using regex patterns. + + Note: This is a simplified approach focusing on header files. + For production, consider using libclang or similar. 
+ """ + classes = [] + functions = [] + + # Extract class definitions (simplified - doesn't handle nested classes) + class_pattern = r'class\s+(\w+)(?:\s*:\s*public\s+(\w+))?\s*\{' + for match in re.finditer(class_pattern, content): + class_name = match.group(1) + base_class = match.group(2) if match.group(2) else None + + classes.append({ + 'name': class_name, + 'base_classes': [base_class] if base_class else [], + 'methods': [], # Simplified - would need to parse class body + 'docstring': None, + 'line_number': content[:match.start()].count('\n') + 1 + }) + + # Extract function declarations + func_pattern = r'(\w+(?:\s*\*|\s*&)?)\s+(\w+)\s*\(([^)]*)\)' + for match in re.finditer(func_pattern, content): + return_type = match.group(1).strip() + func_name = match.group(2) + params_str = match.group(3) + + # Skip common keywords + if func_name in ['if', 'for', 'while', 'switch', 'return']: + continue + + params = self._parse_cpp_parameters(params_str) + + functions.append({ + 'name': func_name, + 'parameters': params, + 'return_type': return_type, + 'docstring': None, + 'line_number': content[:match.start()].count('\n') + 1, + 'is_async': False, + 'is_method': False, + 'decorators': [] + }) + + return { + 'classes': classes, + 'functions': functions + } + + def _parse_cpp_parameters(self, params_str: str) -> List[Dict]: + """Parse C++ parameter string.""" + params = [] + + if not params_str.strip() or params_str.strip() == 'void': + return params + + # Split by comma (simplified) + param_list = [p.strip() for p in params_str.split(',')] + + for param in param_list: + if not param: + continue + + # Check for default value + default = None + if '=' in param: + param, default = param.rsplit('=', 1) + param = param.strip() + default = default.strip() + + # Extract type and name (simplified) + # Format: "type name" or "type* name" or "type& name" + parts = param.split() + if len(parts) >= 2: + param_type = ' '.join(parts[:-1]) + param_name = parts[-1] + else: + param_type 
= param + param_name = "unknown" + + params.append({ + 'name': param_name, + 'type_hint': param_type, + 'default': default + }) + + return params + + +if __name__ == '__main__': + # Test the analyzer + python_code = ''' +class Node2D: + """Base class for 2D nodes.""" + + def move_local_x(self, delta: float, snap: bool = False) -> None: + """Move node along local X axis.""" + pass + + async def tween_position(self, target: tuple, duration: float = 1.0): + """Animate position to target.""" + pass + +def create_sprite(texture: str) -> Node2D: + """Create a new sprite node.""" + return Node2D() +''' + + analyzer = CodeAnalyzer(depth='deep') + result = analyzer.analyze_file('test.py', python_code, 'Python') + + print("Analysis Result:") + print(f"Classes: {len(result.get('classes', []))}") + print(f"Functions: {len(result.get('functions', []))}") + + if result.get('classes'): + cls = result['classes'][0] + print(f"\nClass: {cls['name']}") + print(f" Methods: {len(cls['methods'])}") + for method in cls['methods']: + params = ', '.join([f"{p['name']}: {p['type_hint']}" + (f" = {p['default']}" if p.get('default') else "") + for p in method['parameters']]) + print(f" {method['name']}({params}) -> {method['return_type']}") diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/config_validator.py b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/config_validator.py new file mode 100644 index 0000000..b8391de --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/config_validator.py @@ -0,0 +1,376 @@ +#!/usr/bin/env python3 +""" +Unified Config Validator + +Validates unified config format that supports multiple sources: +- documentation (website scraping) +- github (repository scraping) +- pdf (PDF document scraping) + +Also provides backward compatibility detection for legacy configs. 
+"""
+
+import json
+import logging
+from typing import Dict, Any, List, Optional, Union
+from pathlib import Path
+
+# NOTE(review): logging.basicConfig() at import time configures the root
+# logger process-wide — confirm this is intended for library use.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class ConfigValidator:
+    """
+    Validates unified config format and provides backward compatibility.
+    """
+
+    # Valid 'type' values a unified config's 'sources' entries may declare
+    VALID_SOURCE_TYPES = {'documentation', 'github', 'pdf'}
+
+    # Valid top-level 'merge_mode' values (defaults to 'rule-based')
+    VALID_MERGE_MODES = {'rule-based', 'claude-enhanced'}
+
+    # Valid 'code_analysis_depth' values for github sources
+    VALID_DEPTH_LEVELS = {'surface', 'deep', 'full'}
+
+    def __init__(self, config_or_path: Union[Dict[str, Any], str]):
+        """
+        Initialize validator with config dict or file path.
+
+        Args:
+            config_or_path: Either a config dict or path to config JSON file
+
+        Raises:
+            ValueError: if a path is given and the file is missing or not
+                valid JSON.
+        """
+        if isinstance(config_or_path, dict):
+            # In-memory config: there is no backing file to reload from.
+            self.config_path = None
+            self.config = config_or_path
+        else:
+            self.config_path = config_or_path
+            self.config = self._load_config()
+        # Cache format detection once; validate() branches on this flag.
+        self.is_unified = self._detect_format()
+
+    def _load_config(self) -> Dict[str, Any]:
+        """Load JSON config file.
+
+        Raises:
+            ValueError: wraps both a missing file and malformed JSON, so
+                callers only need to catch one exception type.
+        """
+        try:
+            with open(self.config_path, 'r', encoding='utf-8') as f:
+                return json.load(f)
+        except FileNotFoundError:
+            raise ValueError(f"Config file not found: {self.config_path}")
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid JSON in config file: {e}")
+
+    def _detect_format(self) -> bool:
+        """
+        Detect if config is unified format or legacy.
+
+        Returns:
+            True if unified format (has 'sources' array)
+            False if legacy format
+        """
+        return 'sources' in self.config and isinstance(self.config['sources'], list)
+
+    def validate(self) -> bool:
+        """
+        Validate config based on detected format.
+ + Returns: + True if valid + + Raises: + ValueError if invalid with detailed error message + """ + if self.is_unified: + return self._validate_unified() + else: + return self._validate_legacy() + + def _validate_unified(self) -> bool: + """Validate unified config format.""" + logger.info("Validating unified config format...") + + # Required top-level fields + if 'name' not in self.config: + raise ValueError("Missing required field: 'name'") + + if 'description' not in self.config: + raise ValueError("Missing required field: 'description'") + + if 'sources' not in self.config: + raise ValueError("Missing required field: 'sources'") + + # Validate sources array + sources = self.config['sources'] + + if not isinstance(sources, list): + raise ValueError("'sources' must be an array") + + if len(sources) == 0: + raise ValueError("'sources' array cannot be empty") + + # Validate merge_mode (optional) + merge_mode = self.config.get('merge_mode', 'rule-based') + if merge_mode not in self.VALID_MERGE_MODES: + raise ValueError(f"Invalid merge_mode: '{merge_mode}'. Must be one of {self.VALID_MERGE_MODES}") + + # Validate each source + for i, source in enumerate(sources): + self._validate_source(source, i) + + logger.info(f"✅ Unified config valid: {len(sources)} sources") + return True + + def _validate_source(self, source: Dict[str, Any], index: int): + """Validate individual source configuration.""" + # Check source has 'type' field + if 'type' not in source: + raise ValueError(f"Source {index}: Missing required field 'type'") + + source_type = source['type'] + + if source_type not in self.VALID_SOURCE_TYPES: + raise ValueError( + f"Source {index}: Invalid type '{source_type}'. 
" + f"Must be one of {self.VALID_SOURCE_TYPES}" + ) + + # Type-specific validation + if source_type == 'documentation': + self._validate_documentation_source(source, index) + elif source_type == 'github': + self._validate_github_source(source, index) + elif source_type == 'pdf': + self._validate_pdf_source(source, index) + + def _validate_documentation_source(self, source: Dict[str, Any], index: int): + """Validate documentation source configuration.""" + if 'base_url' not in source: + raise ValueError(f"Source {index} (documentation): Missing required field 'base_url'") + + # Optional but recommended fields + if 'selectors' not in source: + logger.warning(f"Source {index} (documentation): No 'selectors' specified, using defaults") + + if 'max_pages' in source and not isinstance(source['max_pages'], int): + raise ValueError(f"Source {index} (documentation): 'max_pages' must be an integer") + + def _validate_github_source(self, source: Dict[str, Any], index: int): + """Validate GitHub source configuration.""" + if 'repo' not in source: + raise ValueError(f"Source {index} (github): Missing required field 'repo'") + + # Validate repo format (owner/repo) + repo = source['repo'] + if '/' not in repo: + raise ValueError( + f"Source {index} (github): Invalid repo format '{repo}'. " + f"Must be 'owner/repo' (e.g., 'facebook/react')" + ) + + # Validate code_analysis_depth if specified + if 'code_analysis_depth' in source: + depth = source['code_analysis_depth'] + if depth not in self.VALID_DEPTH_LEVELS: + raise ValueError( + f"Source {index} (github): Invalid code_analysis_depth '{depth}'. 
" + f"Must be one of {self.VALID_DEPTH_LEVELS}" + ) + + # Validate max_issues if specified + if 'max_issues' in source and not isinstance(source['max_issues'], int): + raise ValueError(f"Source {index} (github): 'max_issues' must be an integer") + + def _validate_pdf_source(self, source: Dict[str, Any], index: int): + """Validate PDF source configuration.""" + if 'path' not in source: + raise ValueError(f"Source {index} (pdf): Missing required field 'path'") + + # Check if file exists + pdf_path = source['path'] + if not Path(pdf_path).exists(): + logger.warning(f"Source {index} (pdf): File not found: {pdf_path}") + + def _validate_legacy(self) -> bool: + """ + Validate legacy config format (backward compatibility). + + Legacy configs are the old format used by doc_scraper, github_scraper, pdf_scraper. + """ + logger.info("Detected legacy config format (backward compatible)") + + # Detect which legacy type based on fields + if 'base_url' in self.config: + logger.info("Legacy type: documentation") + elif 'repo' in self.config: + logger.info("Legacy type: github") + elif 'pdf' in self.config or 'path' in self.config: + logger.info("Legacy type: pdf") + else: + raise ValueError("Cannot detect legacy config type (missing base_url, repo, or pdf)") + + return True + + def convert_legacy_to_unified(self) -> Dict[str, Any]: + """ + Convert legacy config to unified format. 
+ + Returns: + Unified config dict + """ + if self.is_unified: + logger.info("Config already in unified format") + return self.config + + logger.info("Converting legacy config to unified format...") + + # Detect legacy type and convert + if 'base_url' in self.config: + return self._convert_legacy_documentation() + elif 'repo' in self.config: + return self._convert_legacy_github() + elif 'pdf' in self.config or 'path' in self.config: + return self._convert_legacy_pdf() + else: + raise ValueError("Cannot convert: unknown legacy format") + + def _convert_legacy_documentation(self) -> Dict[str, Any]: + """Convert legacy documentation config to unified.""" + unified = { + 'name': self.config.get('name', 'unnamed'), + 'description': self.config.get('description', 'Documentation skill'), + 'merge_mode': 'rule-based', + 'sources': [ + { + 'type': 'documentation', + **{k: v for k, v in self.config.items() + if k not in ['name', 'description']} + } + ] + } + return unified + + def _convert_legacy_github(self) -> Dict[str, Any]: + """Convert legacy GitHub config to unified.""" + unified = { + 'name': self.config.get('name', 'unnamed'), + 'description': self.config.get('description', 'GitHub repository skill'), + 'merge_mode': 'rule-based', + 'sources': [ + { + 'type': 'github', + **{k: v for k, v in self.config.items() + if k not in ['name', 'description']} + } + ] + } + return unified + + def _convert_legacy_pdf(self) -> Dict[str, Any]: + """Convert legacy PDF config to unified.""" + unified = { + 'name': self.config.get('name', 'unnamed'), + 'description': self.config.get('description', 'PDF document skill'), + 'merge_mode': 'rule-based', + 'sources': [ + { + 'type': 'pdf', + **{k: v for k, v in self.config.items() + if k not in ['name', 'description']} + } + ] + } + return unified + + def get_sources_by_type(self, source_type: str) -> List[Dict[str, Any]]: + """ + Get all sources of a specific type. 
+ + Args: + source_type: 'documentation', 'github', or 'pdf' + + Returns: + List of sources matching the type + """ + if not self.is_unified: + # For legacy, convert and get sources + unified = self.convert_legacy_to_unified() + sources = unified['sources'] + else: + sources = self.config['sources'] + + return [s for s in sources if s.get('type') == source_type] + + def has_multiple_sources(self) -> bool: + """Check if config has multiple sources (requires merging).""" + if not self.is_unified: + return False + return len(self.config['sources']) > 1 + + def needs_api_merge(self) -> bool: + """ + Check if config needs API merging. + + Returns True if both documentation and github sources exist + with API extraction enabled. + """ + if not self.has_multiple_sources(): + return False + + has_docs_api = any( + s.get('type') == 'documentation' and s.get('extract_api', True) + for s in self.config['sources'] + ) + + has_github_code = any( + s.get('type') == 'github' and s.get('include_code', False) + for s in self.config['sources'] + ) + + return has_docs_api and has_github_code + + +def validate_config(config_path: str) -> ConfigValidator: + """ + Validate config file and return validator instance. + + Args: + config_path: Path to config JSON file + + Returns: + ConfigValidator instance + + Raises: + ValueError if config is invalid + """ + validator = ConfigValidator(config_path) + validator.validate() + return validator + + +if __name__ == '__main__': + import sys + + if len(sys.argv) < 2: + print("Usage: python config_validator.py ") + sys.exit(1) + + config_file = sys.argv[1] + + try: + validator = validate_config(config_file) + + print(f"\n✅ Config valid!") + print(f" Format: {'Unified' if validator.is_unified else 'Legacy'}") + print(f" Name: {validator.config.get('name')}") + + if validator.is_unified: + sources = validator.config['sources'] + print(f" Sources: {len(sources)}") + for i, source in enumerate(sources): + print(f" {i+1}. 
@dataclass
class Conflict:
    """One discrepancy between documentation and code for a single API.

    Produced by ConflictDetector; serialized to JSON via dataclasses.asdict.
    """
    type: str  # 'missing_in_docs', 'missing_in_code', 'signature_mismatch', 'description_mismatch'
    severity: str  # 'low', 'medium', 'high'
    api_name: str  # e.g. 'func', 'Class' or 'Class.method'
    docs_info: Optional[Dict[str, Any]] = None  # API info from the docs side, when available
    code_info: Optional[Dict[str, Any]] = None  # API info from the code side, when available
    difference: Optional[str] = None  # human-readable description of the discrepancy
    suggestion: Optional[str] = None  # recommended remediation
+ + Args: + docs_data: Data from documentation scraper + github_data: Data from GitHub scraper with code analysis + """ + self.docs_data = docs_data + self.github_data = github_data + + # Extract API information from both sources + self.docs_apis = self._extract_docs_apis() + self.code_apis = self._extract_code_apis() + + logger.info(f"Loaded {len(self.docs_apis)} APIs from documentation") + logger.info(f"Loaded {len(self.code_apis)} APIs from code") + + def _extract_docs_apis(self) -> Dict[str, Dict[str, Any]]: + """ + Extract API information from documentation data. + + Returns: + Dict mapping API name to API info + """ + apis = {} + + # Documentation structure varies, but typically has 'pages' or 'references' + pages = self.docs_data.get('pages', {}) + + # Handle both dict and list formats + if isinstance(pages, dict): + # Format: {url: page_data, ...} + for url, page_data in pages.items(): + content = page_data.get('content', '') + title = page_data.get('title', '') + + # Simple heuristic: if title or URL contains "api", "reference", "class", "function" + # it might be an API page + if any(keyword in title.lower() or keyword in url.lower() + for keyword in ['api', 'reference', 'class', 'function', 'method']): + + # Extract API signatures from content (simplified) + extracted_apis = self._parse_doc_content_for_apis(content, url) + apis.update(extracted_apis) + elif isinstance(pages, list): + # Format: [{url: '...', apis: [...]}, ...] + for page in pages: + url = page.get('url', '') + page_apis = page.get('apis', []) + + # If APIs are already extracted in the page data + for api in page_apis: + api_name = api.get('name', '') + if api_name: + apis[api_name] = { + 'parameters': api.get('parameters', []), + 'return_type': api.get('return_type', 'Any'), + 'source_url': url + } + + return apis + + def _parse_doc_content_for_apis(self, content: str, source_url: str) -> Dict[str, Dict]: + """ + Parse documentation content to extract API signatures. 
+ + This is a simplified approach - real implementation would need + to understand the documentation format (Sphinx, JSDoc, etc.) + """ + apis = {} + + # Look for function/method signatures in code blocks + # Common patterns: + # - function_name(param1, param2) + # - ClassName.method_name(param1, param2) + # - def function_name(param1: type, param2: type) -> return_type + + import re + + # Pattern for common API signatures + patterns = [ + # Python style: def name(params) -> return + r'def\s+(\w+)\s*\(([^)]*)\)(?:\s*->\s*(\w+))?', + # JavaScript style: function name(params) + r'function\s+(\w+)\s*\(([^)]*)\)', + # C++ style: return_type name(params) + r'(\w+)\s+(\w+)\s*\(([^)]*)\)', + # Method style: ClassName.method_name(params) + r'(\w+)\.(\w+)\s*\(([^)]*)\)' + ] + + for pattern in patterns: + for match in re.finditer(pattern, content): + groups = match.groups() + + # Parse based on pattern matched + if 'def' in pattern: + # Python function + name = groups[0] + params_str = groups[1] + return_type = groups[2] if len(groups) > 2 else None + elif 'function' in pattern: + # JavaScript function + name = groups[0] + params_str = groups[1] + return_type = None + elif '.' 
in pattern: + # Class method + class_name = groups[0] + method_name = groups[1] + name = f"{class_name}.{method_name}" + params_str = groups[2] if len(groups) > 2 else groups[1] + return_type = None + else: + # C++ function + return_type = groups[0] + name = groups[1] + params_str = groups[2] + + # Parse parameters + params = self._parse_param_string(params_str) + + apis[name] = { + 'name': name, + 'parameters': params, + 'return_type': return_type, + 'source': source_url, + 'raw_signature': match.group(0) + } + + return apis + + def _parse_param_string(self, params_str: str) -> List[Dict]: + """Parse parameter string into list of parameter dicts.""" + if not params_str.strip(): + return [] + + params = [] + for param in params_str.split(','): + param = param.strip() + if not param: + continue + + # Try to extract name and type + param_info = {'name': param, 'type': None, 'default': None} + + # Check for type annotation (: type) + if ':' in param: + parts = param.split(':', 1) + param_info['name'] = parts[0].strip() + type_part = parts[1].strip() + + # Check for default value (= value) + if '=' in type_part: + type_str, default_str = type_part.split('=', 1) + param_info['type'] = type_str.strip() + param_info['default'] = default_str.strip() + else: + param_info['type'] = type_part + + # Check for default without type (= value) + elif '=' in param: + parts = param.split('=', 1) + param_info['name'] = parts[0].strip() + param_info['default'] = parts[1].strip() + + params.append(param_info) + + return params + + def _extract_code_apis(self) -> Dict[str, Dict[str, Any]]: + """ + Extract API information from GitHub code analysis. 
+ + Returns: + Dict mapping API name to API info + """ + apis = {} + + code_analysis = self.github_data.get('code_analysis', {}) + if not code_analysis: + return apis + + # Support both 'files' and 'analyzed_files' keys + files = code_analysis.get('files', code_analysis.get('analyzed_files', [])) + + for file_info in files: + file_path = file_info.get('file', 'unknown') + + # Extract classes and their methods + for class_info in file_info.get('classes', []): + class_name = class_info['name'] + + # Add class itself + apis[class_name] = { + 'name': class_name, + 'type': 'class', + 'source': file_path, + 'line': class_info.get('line_number'), + 'base_classes': class_info.get('base_classes', []), + 'docstring': class_info.get('docstring') + } + + # Add methods + for method in class_info.get('methods', []): + method_name = f"{class_name}.{method['name']}" + apis[method_name] = { + 'name': method_name, + 'type': 'method', + 'parameters': method.get('parameters', []), + 'return_type': method.get('return_type'), + 'source': file_path, + 'line': method.get('line_number'), + 'docstring': method.get('docstring'), + 'is_async': method.get('is_async', False) + } + + # Extract standalone functions + for func_info in file_info.get('functions', []): + func_name = func_info['name'] + apis[func_name] = { + 'name': func_name, + 'type': 'function', + 'parameters': func_info.get('parameters', []), + 'return_type': func_info.get('return_type'), + 'source': file_path, + 'line': func_info.get('line_number'), + 'docstring': func_info.get('docstring'), + 'is_async': func_info.get('is_async', False) + } + + return apis + + def detect_all_conflicts(self) -> List[Conflict]: + """ + Detect all types of conflicts. + + Returns: + List of Conflict objects + """ + logger.info("Detecting conflicts between documentation and code...") + + conflicts = [] + + # 1. Find APIs missing in documentation + conflicts.extend(self._find_missing_in_docs()) + + # 2. 
Find APIs missing in code + conflicts.extend(self._find_missing_in_code()) + + # 3. Find signature mismatches + conflicts.extend(self._find_signature_mismatches()) + + logger.info(f"Found {len(conflicts)} conflicts total") + + return conflicts + + def _find_missing_in_docs(self) -> List[Conflict]: + """Find APIs that exist in code but not in documentation.""" + conflicts = [] + + for api_name, code_info in self.code_apis.items(): + # Simple name matching (can be enhanced with fuzzy matching) + if api_name not in self.docs_apis: + # Check if it's a private/internal API (often not documented) + is_private = api_name.startswith('_') or '__' in api_name + severity = 'low' if is_private else 'medium' + + conflicts.append(Conflict( + type='missing_in_docs', + severity=severity, + api_name=api_name, + code_info=code_info, + difference=f"API exists in code ({code_info['source']}) but not found in documentation", + suggestion="Add documentation for this API" if not is_private else "Consider if this internal API should be documented" + )) + + logger.info(f"Found {len(conflicts)} APIs missing in documentation") + return conflicts + + def _find_missing_in_code(self) -> List[Conflict]: + """Find APIs that are documented but don't exist in code.""" + conflicts = [] + + for api_name, docs_info in self.docs_apis.items(): + if api_name not in self.code_apis: + conflicts.append(Conflict( + type='missing_in_code', + severity='high', # This is serious - documented but doesn't exist + api_name=api_name, + docs_info=docs_info, + difference=f"API documented ({docs_info.get('source', 'unknown')}) but not found in code", + suggestion="Update documentation to remove this API, or add it to codebase" + )) + + logger.info(f"Found {len(conflicts)} APIs missing in code") + return conflicts + + def _find_signature_mismatches(self) -> List[Conflict]: + """Find APIs where signature differs between docs and code.""" + conflicts = [] + + # Find APIs that exist in both + common_apis = 
set(self.docs_apis.keys()) & set(self.code_apis.keys()) + + for api_name in common_apis: + docs_info = self.docs_apis[api_name] + code_info = self.code_apis[api_name] + + # Compare signatures + mismatch = self._compare_signatures(docs_info, code_info) + + if mismatch: + conflicts.append(Conflict( + type='signature_mismatch', + severity=mismatch['severity'], + api_name=api_name, + docs_info=docs_info, + code_info=code_info, + difference=mismatch['difference'], + suggestion=mismatch['suggestion'] + )) + + logger.info(f"Found {len(conflicts)} signature mismatches") + return conflicts + + def _compare_signatures(self, docs_info: Dict, code_info: Dict) -> Optional[Dict]: + """ + Compare signatures between docs and code. + + Returns: + Dict with mismatch details if conflict found, None otherwise + """ + docs_params = docs_info.get('parameters', []) + code_params = code_info.get('parameters', []) + + # Compare parameter counts + if len(docs_params) != len(code_params): + return { + 'severity': 'medium', + 'difference': f"Parameter count mismatch: docs has {len(docs_params)}, code has {len(code_params)}", + 'suggestion': f"Documentation shows {len(docs_params)} parameters, but code has {len(code_params)}" + } + + # Compare parameter names and types + for i, (doc_param, code_param) in enumerate(zip(docs_params, code_params)): + doc_name = doc_param.get('name', '') + code_name = code_param.get('name', '') + + # Parameter name mismatch + if doc_name != code_name: + # Use fuzzy matching for slight variations + similarity = SequenceMatcher(None, doc_name, code_name).ratio() + if similarity < 0.8: # Not similar enough + return { + 'severity': 'medium', + 'difference': f"Parameter {i+1} name mismatch: '{doc_name}' in docs vs '{code_name}' in code", + 'suggestion': f"Update documentation to use parameter name '{code_name}'" + } + + # Type mismatch + doc_type = doc_param.get('type') + code_type = code_param.get('type_hint') + + if doc_type and code_type and doc_type != code_type: + 
return { + 'severity': 'low', + 'difference': f"Parameter '{doc_name}' type mismatch: '{doc_type}' in docs vs '{code_type}' in code", + 'suggestion': f"Verify correct type for parameter '{doc_name}'" + } + + # Compare return types if both have them + docs_return = docs_info.get('return_type') + code_return = code_info.get('return_type') + + if docs_return and code_return and docs_return != code_return: + return { + 'severity': 'low', + 'difference': f"Return type mismatch: '{docs_return}' in docs vs '{code_return}' in code", + 'suggestion': "Verify correct return type" + } + + return None + + def generate_summary(self, conflicts: List[Conflict]) -> Dict[str, Any]: + """ + Generate summary statistics for conflicts. + + Args: + conflicts: List of Conflict objects + + Returns: + Summary dict with statistics + """ + summary = { + 'total': len(conflicts), + 'by_type': {}, + 'by_severity': {}, + 'apis_affected': len(set(c.api_name for c in conflicts)) + } + + # Count by type + for conflict_type in ['missing_in_docs', 'missing_in_code', 'signature_mismatch', 'description_mismatch']: + count = sum(1 for c in conflicts if c.type == conflict_type) + summary['by_type'][conflict_type] = count + + # Count by severity + for severity in ['low', 'medium', 'high']: + count = sum(1 for c in conflicts if c.severity == severity) + summary['by_severity'][severity] = count + + return summary + + def save_conflicts(self, conflicts: List[Conflict], output_path: str): + """ + Save conflicts to JSON file. 
+ + Args: + conflicts: List of Conflict objects + output_path: Path to output JSON file + """ + data = { + 'conflicts': [asdict(c) for c in conflicts], + 'summary': self.generate_summary(conflicts) + } + + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(data, f, indent=2, ensure_ascii=False) + + logger.info(f"Conflicts saved to: {output_path}") + + +if __name__ == '__main__': + import sys + + if len(sys.argv) < 3: + print("Usage: python conflict_detector.py ") + sys.exit(1) + + docs_file = sys.argv[1] + github_file = sys.argv[2] + + # Load data + with open(docs_file, 'r') as f: + docs_data = json.load(f) + + with open(github_file, 'r') as f: + github_data = json.load(f) + + # Detect conflicts + detector = ConflictDetector(docs_data, github_data) + conflicts = detector.detect_all_conflicts() + + # Print summary + summary = detector.generate_summary(conflicts) + print("\n📊 Conflict Summary:") + print(f" Total conflicts: {summary['total']}") + print(f" APIs affected: {summary['apis_affected']}") + print("\n By Type:") + for conflict_type, count in summary['by_type'].items(): + if count > 0: + print(f" {conflict_type}: {count}") + print("\n By Severity:") + for severity, count in summary['by_severity'].items(): + if count > 0: + emoji = '🔴' if severity == 'high' else '🟡' if severity == 'medium' else '🟢' + print(f" {emoji} {severity}: {count}") + + # Save to file + output_file = 'conflicts.json' + detector.save_conflicts(conflicts, output_file) + print(f"\n✅ Full report saved to: {output_file}") diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/constants.py b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/constants.py new file mode 100644 index 0000000..2685e93 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/constants.py @@ -0,0 +1,72 @@ +"""Configuration constants for Skill Seekers CLI. 
"""Configuration constants for Skill Seekers CLI.

This module centralizes all magic numbers and configuration values used
across the CLI tools to improve maintainability and clarity.
"""

# ===== SCRAPING CONFIGURATION =====

# Default scraping limits
DEFAULT_RATE_LIMIT = 0.5  # seconds to sleep between HTTP requests
DEFAULT_MAX_PAGES = 500  # maximum pages to scrape per run
DEFAULT_CHECKPOINT_INTERVAL = 1000  # pages scraped between checkpoint saves
DEFAULT_ASYNC_MODE = False  # use async mode for parallel scraping (opt-in)

# Content analysis limits
CONTENT_PREVIEW_LENGTH = 500  # characters of content inspected for categorization
MAX_PAGES_WARNING_THRESHOLD = 10000  # warn if a config asks for more pages than this

# Quality thresholds (weighted keyword scoring for page categorization)
MIN_CATEGORIZATION_SCORE = 2  # minimum score required to assign a category
URL_MATCH_POINTS = 3  # points awarded for a URL keyword match
TITLE_MATCH_POINTS = 2  # points awarded for a title keyword match
CONTENT_MATCH_POINTS = 1  # points awarded for a content keyword match

# ===== ENHANCEMENT CONFIGURATION =====

# API-based enhancement limits (uses Anthropic API)
API_CONTENT_LIMIT = 100000  # max characters sent for API enhancement
API_PREVIEW_LIMIT = 40000  # max characters shown in preview

# Local enhancement limits (uses Claude Code Max)
LOCAL_CONTENT_LIMIT = 50000  # max characters for local enhancement
LOCAL_PREVIEW_LIMIT = 20000  # max characters shown in preview

# ===== PAGE ESTIMATION =====

# Estimation and discovery settings
DEFAULT_MAX_DISCOVERY = 1000  # default max pages to discover during estimation
DISCOVERY_THRESHOLD = 10000  # discovery count at which warnings are emitted

# ===== FILE LIMITS =====

# Output and processing limits
MAX_REFERENCE_FILES = 100  # maximum reference files generated per skill
MAX_CODE_BLOCKS_PER_PAGE = 5  # maximum code blocks extracted per page

# ===== EXPORT CONSTANTS =====

# Keep this list in sync with the constants defined above.
__all__ = [
    # Scraping
    'DEFAULT_RATE_LIMIT',
    'DEFAULT_MAX_PAGES',
    'DEFAULT_CHECKPOINT_INTERVAL',
    'DEFAULT_ASYNC_MODE',
    'CONTENT_PREVIEW_LENGTH',
    'MAX_PAGES_WARNING_THRESHOLD',
    'MIN_CATEGORIZATION_SCORE',
    'URL_MATCH_POINTS',
    'TITLE_MATCH_POINTS',
    'CONTENT_MATCH_POINTS',
    # Enhancement
    'API_CONTENT_LIMIT',
    'API_PREVIEW_LIMIT',
    'LOCAL_CONTENT_LIMIT',
    'LOCAL_PREVIEW_LIMIT',
    # Estimation
    'DEFAULT_MAX_DISCOVERY',
    'DISCOVERY_THRESHOLD',
    # Limits
    'MAX_REFERENCE_FILES',
    'MAX_CODE_BLOCKS_PER_PAGE',
]
def setup_logging(verbose: bool = False, quiet: bool = False) -> None:
    """Configure root logging according to verbosity flags.

    Args:
        verbose: Enable DEBUG level logging.
        quiet: Restrict output to WARNING and above.
    """
    # quiet takes precedence over verbose.
    level = logging.WARNING if quiet else (logging.DEBUG if verbose else logging.INFO)
    logging.basicConfig(
        level=level,
        format='%(message)s',
        force=True
    )


class DocToSkillConverter:
    def __init__(self, config: Dict[str, Any], dry_run: bool = False, resume: bool = False) -> None:
        """Initialize the converter from a scrape config.

        Args:
            config: Scrape configuration (requires 'name' and 'base_url').
            dry_run: When True, skip directory creation and checkpointing.
            resume: When True, restore crawl state from a saved checkpoint.
        """
        self.config = config
        self.name = config['name']
        self.base_url = config['base_url']
        self.dry_run = dry_run
        self.resume = resume

        # Output layout: raw page data and the generated skill side by side.
        self.data_dir = f"output/{self.name}_data"
        self.skill_dir = f"output/{self.name}"
        self.checkpoint_file = f"{self.data_dir}/checkpoint.json"

        # Checkpointing configuration.
        checkpoint_config = config.get('checkpoint', {})
        self.checkpoint_enabled = checkpoint_config.get('enabled', False)
        self.checkpoint_interval = checkpoint_config.get('interval', DEFAULT_CHECKPOINT_INTERVAL)

        # llms.txt detection state; a non-bool flag falls back to False.
        raw_skip = config.get('skip_llms_txt', False)
        if isinstance(raw_skip, bool):
            self.skip_llms_txt = raw_skip
        else:
            logger.warning(
                "Invalid value for 'skip_llms_txt': %r (expected bool). Defaulting to False.",
                raw_skip
            )
            self.skip_llms_txt = False
        self.llms_txt_detected = False
        self.llms_txt_variant = None
        self.llms_txt_variants: List[str] = []  # every variant downloaded so far

        # Parallel scraping configuration.
        self.workers = config.get('workers', 1)
        self.async_mode = config.get('async_mode', DEFAULT_ASYNC_MODE)

        # Crawl state; multiple starting URLs are supported.
        self.visited_urls: set[str] = set()
        self.pending_urls = deque(config.get('start_urls', [self.base_url]))
        self.pages: List[Dict[str, Any]] = []
        self.pages_scraped = 0

        # A shared-state lock is only needed when scraping in parallel.
        if self.workers > 1:
            import threading
            self.lock = threading.Lock()

        # Create output directories (skipped in dry-run mode).
        if not dry_run:
            os.makedirs(f"{self.data_dir}/pages", exist_ok=True)
            for sub in ("references", "scripts", "assets"):
                os.makedirs(f"{self.skill_dir}/{sub}", exist_ok=True)

        # Restore previous progress when asked to resume.
        if resume and not dry_run:
            self.load_checkpoint()
+ + Args: + url (str): URL to validate + + Returns: + bool: True if URL matches include patterns and doesn't match exclude patterns + """ + if not url.startswith(self.base_url): + return False + + # Include patterns + includes = self.config.get('url_patterns', {}).get('include', []) + if includes and not any(pattern in url for pattern in includes): + return False + + # Exclude patterns + excludes = self.config.get('url_patterns', {}).get('exclude', []) + if any(pattern in url for pattern in excludes): + return False + + return True + + def save_checkpoint(self) -> None: + """Save progress checkpoint""" + if not self.checkpoint_enabled or self.dry_run: + return + + checkpoint_data = { + "config": self.config, + "visited_urls": list(self.visited_urls), + "pending_urls": list(self.pending_urls), + "pages_scraped": self.pages_scraped, + "last_updated": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), + "checkpoint_interval": self.checkpoint_interval + } + + try: + with open(self.checkpoint_file, 'w') as f: + json.dump(checkpoint_data, f, indent=2) + logger.info(" 💾 Checkpoint saved (%d pages)", self.pages_scraped) + except Exception as e: + logger.warning(" ⚠️ Failed to save checkpoint: %s", e) + + def load_checkpoint(self) -> None: + """Load progress from checkpoint""" + if not os.path.exists(self.checkpoint_file): + logger.info("ℹ️ No checkpoint found, starting fresh") + return + + try: + with open(self.checkpoint_file, 'r') as f: + checkpoint_data = json.load(f) + + self.visited_urls = set(checkpoint_data["visited_urls"]) + self.pending_urls = deque(checkpoint_data["pending_urls"]) + self.pages_scraped = checkpoint_data["pages_scraped"] + + logger.info("✅ Resumed from checkpoint") + logger.info(" Pages already scraped: %d", self.pages_scraped) + logger.info(" URLs visited: %d", len(self.visited_urls)) + logger.info(" URLs pending: %d", len(self.pending_urls)) + logger.info(" Last updated: %s", checkpoint_data['last_updated']) + logger.info("") + + except 
Exception as e: + logger.warning("⚠️ Failed to load checkpoint: %s", e) + logger.info(" Starting fresh") + + def clear_checkpoint(self) -> None: + """Remove checkpoint file""" + if os.path.exists(self.checkpoint_file): + try: + os.remove(self.checkpoint_file) + logger.info("✅ Checkpoint cleared") + except Exception as e: + logger.warning("⚠️ Failed to clear checkpoint: %s", e) + + def extract_content(self, soup: Any, url: str) -> Dict[str, Any]: + """Extract content with improved code and pattern detection""" + page = { + 'url': url, + 'title': '', + 'content': '', + 'headings': [], + 'code_samples': [], + 'patterns': [], # NEW: Extract common patterns + 'links': [] + } + + selectors = self.config.get('selectors', {}) + + # Extract title + title_elem = soup.select_one(selectors.get('title', 'title')) + if title_elem: + page['title'] = self.clean_text(title_elem.get_text()) + + # Find main content + main_selector = selectors.get('main_content', 'div[role="main"]') + main = soup.select_one(main_selector) + + if not main: + logger.warning("⚠ No content: %s", url) + return page + + # Extract headings with better structure + for h in main.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']): + text = self.clean_text(h.get_text()) + if text: + page['headings'].append({ + 'level': h.name, + 'text': text, + 'id': h.get('id', '') + }) + + # Extract code with language detection + code_selector = selectors.get('code_blocks', 'pre code') + for code_elem in main.select(code_selector): + code = code_elem.get_text() + if len(code.strip()) > 10: + # Try to detect language + lang = self.detect_language(code_elem, code) + page['code_samples'].append({ + 'code': code.strip(), + 'language': lang + }) + + # Extract patterns (NEW: common code patterns) + page['patterns'] = self.extract_patterns(main, page['code_samples']) + + # Extract paragraphs + paragraphs = [] + for p in main.find_all('p'): + text = self.clean_text(p.get_text()) + if text and len(text) > 20: # Skip very short paragraphs + 
paragraphs.append(text) + + page['content'] = '\n\n'.join(paragraphs) + + # Extract links from entire page (not just main content) + # This allows discovery of navigation links outside the main content area + for link in soup.find_all('a', href=True): + href = urljoin(url, link['href']) + # Strip anchor fragments to avoid treating #anchors as separate pages + href = href.split('#')[0] + if self.is_valid_url(href) and href not in page['links']: + page['links'].append(href) + + return page + + def _extract_language_from_classes(self, classes): + """Extract language from class list + + Supports multiple patterns: + - language-{lang} (e.g., "language-python") + - lang-{lang} (e.g., "lang-javascript") + - brush: {lang} (e.g., "brush: java") + - bare language name (e.g., "python", "java") + + """ + # Define common programming languages + known_languages = [ + "javascript", "java", "xml", "html", "python", "bash", "cpp", "typescript", + "go", "rust", "php", "ruby", "swift", "kotlin", "csharp", "c", "sql", + "yaml", "json", "markdown", "css", "scss", "sass", "jsx", "tsx", "vue", + "shell", "powershell", "r", "scala", "dart", "perl", "lua", "elixir" + ] + + for cls in classes: + # Clean special characters (except word chars and hyphens) + cls = re.sub(r'[^\w-]', '', cls) + + if 'language-' in cls: + return cls.replace('language-', '') + + if 'lang-' in cls: + return cls.replace('lang-', '') + + # Check for brush: pattern (e.g., "brush: java") + if 'brush' in cls.lower(): + lang = cls.lower().replace('brush', '').strip() + if lang in known_languages: + return lang + + # Check for bare language name + if cls in known_languages: + return cls + + return None + + def detect_language(self, elem, code): + """Detect programming language from code block""" + + # Check element classes + lang = self._extract_language_from_classes(elem.get('class', [])) + if lang: + return lang + + # Check parent pre element + parent = elem.parent + if parent and parent.name == 'pre': + lang = 
self._extract_language_from_classes(parent.get('class', [])) + if lang: + return lang + + # Heuristic detection + if 'import ' in code and 'from ' in code: + return 'python' + if 'const ' in code or 'let ' in code or '=>' in code: + return 'javascript' + if 'func ' in code and 'var ' in code: + return 'gdscript' + if 'def ' in code and ':' in code: + return 'python' + if '#include' in code or 'int main' in code: + return 'cpp' + # C# detection + if 'using System' in code or 'namespace ' in code: + return 'csharp' + if '{ get; set; }' in code: + return 'csharp' + if any(keyword in code for keyword in ['public class ', 'private class ', 'internal class ', 'public static void ']): + return 'csharp' + + return 'unknown' + + def extract_patterns(self, main: Any, code_samples: List[Dict[str, Any]]) -> List[Dict[str, str]]: + """Extract common coding patterns (NEW FEATURE)""" + patterns = [] + + # Look for "Example:" or "Pattern:" sections + for elem in main.find_all(['p', 'div']): + text = elem.get_text().lower() + if any(word in text for word in ['example:', 'pattern:', 'usage:', 'typical use']): + # Get the code that follows + next_code = elem.find_next(['pre', 'code']) + if next_code: + patterns.append({ + 'description': self.clean_text(elem.get_text()), + 'code': next_code.get_text().strip() + }) + + return patterns[:5] # Limit to 5 most relevant patterns + + def clean_text(self, text: str) -> str: + """Clean text content""" + text = re.sub(r'\s+', ' ', text) + return text.strip() + + def save_page(self, page: Dict[str, Any]) -> None: + """Save page data""" + url_hash = hashlib.md5(page['url'].encode()).hexdigest()[:10] + safe_title = re.sub(r'[^\w\s-]', '', page['title'])[:50] + safe_title = re.sub(r'[-\s]+', '_', safe_title) + + filename = f"{safe_title}_{url_hash}.json" + filepath = os.path.join(self.data_dir, "pages", filename) + + with open(filepath, 'w', encoding='utf-8') as f: + json.dump(page, f, indent=2, ensure_ascii=False) + + def scrape_page(self, url: 
str) -> None: + """Scrape a single page with thread-safe operations. + + Args: + url (str): URL to scrape + + Returns: + dict or None: Page data dict on success, None on failure + + Note: + Uses threading locks when workers > 1 for thread safety + """ + try: + # Scraping part (no lock needed - independent) + headers = {'User-Agent': 'Mozilla/5.0 (Documentation Scraper)'} + response = requests.get(url, headers=headers, timeout=30) + response.raise_for_status() + + soup = BeautifulSoup(response.content, 'html.parser') + page = self.extract_content(soup, url) + + # Thread-safe operations (lock required) + if self.workers > 1: + with self.lock: + logger.info(" %s", url) + self.save_page(page) + self.pages.append(page) + + # Add new URLs + for link in page['links']: + if link not in self.visited_urls and link not in self.pending_urls: + self.pending_urls.append(link) + else: + # Single-threaded mode (no lock needed) + logger.info(" %s", url) + self.save_page(page) + self.pages.append(page) + + # Add new URLs + for link in page['links']: + if link not in self.visited_urls and link not in self.pending_urls: + self.pending_urls.append(link) + + # Rate limiting + rate_limit = self.config.get('rate_limit', DEFAULT_RATE_LIMIT) + if rate_limit > 0: + time.sleep(rate_limit) + + except Exception as e: + if self.workers > 1: + with self.lock: + logger.error(" ✗ Error scraping %s: %s: %s", url, type(e).__name__, e) + else: + logger.error(" ✗ Error scraping page: %s: %s", type(e).__name__, e) + logger.error(" URL: %s", url) + + async def scrape_page_async(self, url: str, semaphore: asyncio.Semaphore, client: httpx.AsyncClient) -> None: + """Scrape a single page asynchronously. 
+ + Args: + url: URL to scrape + semaphore: Asyncio semaphore for concurrency control + client: Shared httpx AsyncClient for connection pooling + + Note: + Uses asyncio.Lock for async-safe operations instead of threading.Lock + """ + async with semaphore: # Limit concurrent requests + try: + # Async HTTP request + headers = {'User-Agent': 'Mozilla/5.0 (Documentation Scraper)'} + response = await client.get(url, headers=headers, timeout=30.0) + response.raise_for_status() + + # BeautifulSoup parsing (still synchronous, but fast) + soup = BeautifulSoup(response.content, 'html.parser') + page = self.extract_content(soup, url) + + # Async-safe operations (no lock needed - single event loop) + logger.info(" %s", url) + self.save_page(page) + self.pages.append(page) + + # Add new URLs + for link in page['links']: + if link not in self.visited_urls and link not in self.pending_urls: + self.pending_urls.append(link) + + # Rate limiting + rate_limit = self.config.get('rate_limit', DEFAULT_RATE_LIMIT) + if rate_limit > 0: + await asyncio.sleep(rate_limit) + + except Exception as e: + logger.error(" ✗ Error scraping %s: %s: %s", url, type(e).__name__, e) + + def _try_llms_txt(self) -> bool: + """ + Try to use llms.txt instead of HTML scraping. + Downloads ALL available variants and stores with .md extension. 
+ + Returns: + True if llms.txt was found and processed successfully + """ + logger.info("\n🔍 Checking for llms.txt at %s...", self.base_url) + + # Check for explicit config URL first + explicit_url = self.config.get('llms_txt_url') + if explicit_url: + logger.info("\n📌 Using explicit llms_txt_url from config: %s", explicit_url) + + # Download explicit file first + downloader = LlmsTxtDownloader(explicit_url) + content = downloader.download() + + if content: + # Save explicit file with proper .md extension + filename = downloader.get_proper_filename() + filepath = os.path.join(self.skill_dir, "references", filename) + os.makedirs(os.path.dirname(filepath), exist_ok=True) + + with open(filepath, 'w', encoding='utf-8') as f: + f.write(content) + logger.info(" 💾 Saved %s (%d chars)", filename, len(content)) + + # Also try to detect and download ALL other variants + detector = LlmsTxtDetector(self.base_url) + variants = detector.detect_all() + + if variants: + logger.info("\n🔍 Found %d total variant(s), downloading remaining...", len(variants)) + for variant_info in variants: + url = variant_info['url'] + variant = variant_info['variant'] + + # Skip the explicit one we already downloaded + if url == explicit_url: + continue + + logger.info(" 📥 Downloading %s...", variant) + extra_downloader = LlmsTxtDownloader(url) + extra_content = extra_downloader.download() + + if extra_content: + extra_filename = extra_downloader.get_proper_filename() + extra_filepath = os.path.join(self.skill_dir, "references", extra_filename) + with open(extra_filepath, 'w', encoding='utf-8') as f: + f.write(extra_content) + logger.info(" ✓ %s (%d chars)", extra_filename, len(extra_content)) + + # Parse explicit file for skill building + parser = LlmsTxtParser(content) + pages = parser.parse() + + if pages: + for page in pages: + self.save_page(page) + self.pages.append(page) + + self.llms_txt_detected = True + self.llms_txt_variant = 'explicit' + return True + + # Auto-detection: Find ALL 
variants + detector = LlmsTxtDetector(self.base_url) + variants = detector.detect_all() + + if not variants: + logger.info("ℹ️ No llms.txt found, using HTML scraping") + return False + + logger.info("✅ Found %d llms.txt variant(s)", len(variants)) + + # Download ALL variants + downloaded = {} + for variant_info in variants: + url = variant_info['url'] + variant = variant_info['variant'] + + logger.info(" 📥 Downloading %s...", variant) + downloader = LlmsTxtDownloader(url) + content = downloader.download() + + if content: + filename = downloader.get_proper_filename() + downloaded[variant] = { + 'content': content, + 'filename': filename, + 'size': len(content) + } + logger.info(" ✓ %s (%d chars)", filename, len(content)) + + if not downloaded: + logger.warning("⚠️ Failed to download any variants, falling back to HTML scraping") + return False + + # Save ALL variants to references/ + os.makedirs(os.path.join(self.skill_dir, "references"), exist_ok=True) + + for variant, data in downloaded.items(): + filepath = os.path.join(self.skill_dir, "references", data['filename']) + with open(filepath, 'w', encoding='utf-8') as f: + f.write(data['content']) + logger.info(" 💾 Saved %s", data['filename']) + + # Parse LARGEST variant for skill building + largest = max(downloaded.items(), key=lambda x: x[1]['size']) + logger.info("\n📄 Parsing %s for skill building...", largest[1]['filename']) + + parser = LlmsTxtParser(largest[1]['content']) + pages = parser.parse() + + if not pages: + logger.warning("⚠️ Failed to parse llms.txt, falling back to HTML scraping") + return False + + logger.info(" ✓ Parsed %d sections", len(pages)) + + # Save pages for skill building + for page in pages: + self.save_page(page) + self.pages.append(page) + + self.llms_txt_detected = True + self.llms_txt_variants = list(downloaded.keys()) + + return True + + def scrape_all(self) -> None: + """Scrape all pages (supports llms.txt and HTML scraping) + + Routes to async version if async_mode is enabled in 
config. + """ + # Route to async version if enabled + if self.async_mode: + asyncio.run(self.scrape_all_async()) + return + + # Try llms.txt first (unless dry-run or explicitly disabled) + if not self.dry_run and not self.skip_llms_txt: + llms_result = self._try_llms_txt() + if llms_result: + logger.info("\n✅ Used llms.txt (%s) - skipping HTML scraping", self.llms_txt_variant) + self.save_summary() + return + + # HTML scraping (sync/thread-based logic) + logger.info("\n" + "=" * 60) + if self.dry_run: + logger.info("DRY RUN: %s", self.name) + else: + logger.info("SCRAPING: %s", self.name) + logger.info("=" * 60) + logger.info("Base URL: %s", self.base_url) + + if self.dry_run: + logger.info("Mode: Preview only (no actual scraping)\n") + else: + logger.info("Output: %s", self.data_dir) + if self.workers > 1: + logger.info("Workers: %d parallel threads", self.workers) + logger.info("") + + max_pages = self.config.get('max_pages', DEFAULT_MAX_PAGES) + + # Handle unlimited mode + if max_pages is None or max_pages == -1: + logger.warning("⚠️ UNLIMITED MODE: No page limit (will scrape all pages)\n") + unlimited = True + else: + unlimited = False + + # Dry run: preview first 20 URLs + preview_limit = 20 if self.dry_run else max_pages + + # Single-threaded mode (original sequential logic) + if self.workers <= 1: + while self.pending_urls and (unlimited or len(self.visited_urls) < preview_limit): + url = self.pending_urls.popleft() + + if url in self.visited_urls: + continue + + self.visited_urls.add(url) + + if self.dry_run: + # Just show what would be scraped + logger.info(" [Preview] %s", url) + try: + headers = {'User-Agent': 'Mozilla/5.0 (Documentation Scraper - Dry Run)'} + response = requests.get(url, headers=headers, timeout=10) + soup = BeautifulSoup(response.content, 'html.parser') + + main_selector = self.config.get('selectors', {}).get('main_content', 'div[role="main"]') + main = soup.select_one(main_selector) + + if main: + for link in main.find_all('a', 
href=True): + href = urljoin(url, link['href']) + if self.is_valid_url(href) and href not in self.visited_urls: + self.pending_urls.append(href) + except Exception as e: + # Failed to extract links in fast mode, continue anyway + logger.warning("⚠️ Warning: Could not extract links from %s: %s", url, e) + else: + self.scrape_page(url) + self.pages_scraped += 1 + + if self.checkpoint_enabled and self.pages_scraped % self.checkpoint_interval == 0: + self.save_checkpoint() + + if len(self.visited_urls) % 10 == 0: + logger.info(" [%d pages]", len(self.visited_urls)) + + # Multi-threaded mode (parallel scraping) + else: + from concurrent.futures import ThreadPoolExecutor, as_completed + + logger.info("🚀 Starting parallel scraping with %d workers\n", self.workers) + + with ThreadPoolExecutor(max_workers=self.workers) as executor: + futures = [] + + while self.pending_urls and (unlimited or len(self.visited_urls) < preview_limit): + # Get next batch of URLs (thread-safe) + batch = [] + batch_size = min(self.workers * 2, len(self.pending_urls)) + + with self.lock: + for _ in range(batch_size): + if not self.pending_urls: + break + url = self.pending_urls.popleft() + + if url not in self.visited_urls: + self.visited_urls.add(url) + batch.append(url) + + # Submit batch to executor + for url in batch: + if unlimited or len(self.visited_urls) <= preview_limit: + future = executor.submit(self.scrape_page, url) + futures.append(future) + + # Wait for some to complete before submitting more + completed = 0 + for future in as_completed(futures[:batch_size]): + # Check for exceptions + try: + future.result() # Raises exception if scrape_page failed + except Exception as e: + with self.lock: + logger.warning(" ⚠️ Worker exception: %s", e) + + completed += 1 + + with self.lock: + self.pages_scraped += 1 + + if self.checkpoint_enabled and self.pages_scraped % self.checkpoint_interval == 0: + self.save_checkpoint() + + if self.pages_scraped % 10 == 0: + logger.info(" [%d pages 
scraped]", self.pages_scraped) + + # Remove completed futures + futures = [f for f in futures if not f.done()] + + # Wait for remaining futures + for future in as_completed(futures): + # Check for exceptions + try: + future.result() + except Exception as e: + with self.lock: + logger.warning(" ⚠️ Worker exception: %s", e) + + with self.lock: + self.pages_scraped += 1 + + if self.dry_run: + logger.info("\n✅ Dry run complete: would scrape ~%d pages", len(self.visited_urls)) + if len(self.visited_urls) >= preview_limit: + logger.info(" (showing first %d, actual scraping may find more)", preview_limit) + logger.info("\n💡 To actually scrape, run without --dry-run") + else: + logger.info("\n✅ Scraped %d pages", len(self.visited_urls)) + self.save_summary() + + async def scrape_all_async(self) -> None: + """Scrape all pages asynchronously (async/await version). + + This method provides significantly better performance for parallel scraping + compared to thread-based scraping, with lower memory overhead and better + CPU utilization. + + Performance: ~2-3x faster than sync mode with same worker count. 
+ """ + # Try llms.txt first (unless dry-run or explicitly disabled) + if not self.dry_run and not self.skip_llms_txt: + llms_result = self._try_llms_txt() + if llms_result: + logger.info("\n✅ Used llms.txt (%s) - skipping HTML scraping", self.llms_txt_variant) + self.save_summary() + return + + # HTML scraping (async version) + logger.info("\n" + "=" * 60) + if self.dry_run: + logger.info("DRY RUN (ASYNC): %s", self.name) + else: + logger.info("SCRAPING (ASYNC): %s", self.name) + logger.info("=" * 60) + logger.info("Base URL: %s", self.base_url) + + if self.dry_run: + logger.info("Mode: Preview only (no actual scraping)\n") + else: + logger.info("Output: %s", self.data_dir) + logger.info("Workers: %d concurrent tasks (async)", self.workers) + logger.info("") + + max_pages = self.config.get('max_pages', DEFAULT_MAX_PAGES) + + # Handle unlimited mode + if max_pages is None or max_pages == -1: + logger.warning("⚠️ UNLIMITED MODE: No page limit (will scrape all pages)\n") + unlimited = True + preview_limit = float('inf') + else: + unlimited = False + preview_limit = 20 if self.dry_run else max_pages + + # Create semaphore for concurrency control + semaphore = asyncio.Semaphore(self.workers) + + # Create shared HTTP client with connection pooling + async with httpx.AsyncClient( + timeout=30.0, + limits=httpx.Limits(max_connections=self.workers * 2) + ) as client: + tasks = [] + + while self.pending_urls and (unlimited or len(self.visited_urls) < preview_limit): + # Get next batch of URLs + batch = [] + batch_size = min(self.workers * 2, len(self.pending_urls)) + + for _ in range(batch_size): + if not self.pending_urls: + break + url = self.pending_urls.popleft() + + if url not in self.visited_urls: + self.visited_urls.add(url) + batch.append(url) + + # Create async tasks for batch + for url in batch: + if unlimited or len(self.visited_urls) <= preview_limit: + if self.dry_run: + logger.info(" [Preview] %s", url) + else: + task = asyncio.create_task( + 
self.scrape_page_async(url, semaphore, client) + ) + tasks.append(task) + + # Wait for batch to complete before continuing + if tasks: + await asyncio.gather(*tasks, return_exceptions=True) + tasks = [] + self.pages_scraped = len(self.visited_urls) + + # Progress indicator + if self.pages_scraped % 10 == 0 and not self.dry_run: + logger.info(" [%d pages scraped]", self.pages_scraped) + + # Checkpoint saving + if not self.dry_run and self.checkpoint_enabled: + if self.pages_scraped % self.checkpoint_interval == 0: + self.save_checkpoint() + + # Wait for any remaining tasks + if tasks: + await asyncio.gather(*tasks, return_exceptions=True) + + if self.dry_run: + logger.info("\n✅ Dry run complete: would scrape ~%d pages", len(self.visited_urls)) + if len(self.visited_urls) >= preview_limit: + logger.info(" (showing first %d, actual scraping may find more)", int(preview_limit)) + logger.info("\n💡 To actually scrape, run without --dry-run") + else: + logger.info("\n✅ Scraped %d pages (async mode)", len(self.visited_urls)) + self.save_summary() + + def save_summary(self) -> None: + """Save scraping summary""" + summary = { + 'name': self.name, + 'total_pages': len(self.pages), + 'base_url': self.base_url, + 'llms_txt_detected': self.llms_txt_detected, + 'llms_txt_variant': self.llms_txt_variant, + 'pages': [{'title': p['title'], 'url': p['url']} for p in self.pages] + } + + with open(f"{self.data_dir}/summary.json", 'w', encoding='utf-8') as f: + json.dump(summary, f, indent=2, ensure_ascii=False) + + def load_scraped_data(self) -> List[Dict[str, Any]]: + """Load previously scraped data""" + pages = [] + pages_dir = Path(self.data_dir) / "pages" + + if not pages_dir.exists(): + return [] + + for json_file in pages_dir.glob("*.json"): + try: + with open(json_file, 'r', encoding='utf-8') as f: + pages.append(json.load(f)) + except Exception as e: + logger.error("⚠️ Error loading scraped data file %s: %s: %s", json_file, type(e).__name__, e) + logger.error(" Suggestion: 
File may be corrupted, consider re-scraping with --fresh") + + return pages + + def smart_categorize(self, pages: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]: + """Improved categorization with better pattern matching""" + category_defs = self.config.get('categories', {}) + + # Default smart categories if none provided + if not category_defs: + category_defs = self.infer_categories(pages) + + categories: Dict[str, List[Dict[str, Any]]] = {cat: [] for cat in category_defs.keys()} + categories['other'] = [] + + for page in pages: + url = page['url'].lower() + title = page['title'].lower() + content = page.get('content', '').lower()[:CONTENT_PREVIEW_LENGTH] # Check first N chars for categorization + + categorized = False + + # Match against keywords + for cat, keywords in category_defs.items(): + score = 0 + for keyword in keywords: + keyword = keyword.lower() + if keyword in url: + score += 3 + if keyword in title: + score += 2 + if keyword in content: + score += 1 + + if score >= MIN_CATEGORIZATION_SCORE: # Threshold for categorization + categories[cat].append(page) + categorized = True + break + + if not categorized: + categories['other'].append(page) + + # Remove empty categories + categories = {k: v for k, v in categories.items() if v} + + return categories + + def infer_categories(self, pages: List[Dict[str, Any]]) -> Dict[str, List[str]]: + """Infer categories from URL patterns (IMPROVED)""" + url_segments: defaultdict[str, int] = defaultdict(int) + + for page in pages: + path = urlparse(page['url']).path + segments = [s for s in path.split('/') if s and s not in ['en', 'stable', 'latest', 'docs']] + + for seg in segments: + url_segments[seg] += 1 + + # Top segments become categories + top_segments = sorted(url_segments.items(), key=lambda x: x[1], reverse=True)[:8] + + categories = {} + for seg, count in top_segments: + if count >= 3: # At least 3 pages + categories[seg] = [seg] + + # Add common defaults + if 'tutorial' not in categories and 
any('tutorial' in url for url in [p['url'] for p in pages]): + categories['tutorials'] = ['tutorial', 'guide', 'getting-started'] + + if 'api' not in categories and any('api' in url or 'reference' in url for url in [p['url'] for p in pages]): + categories['api'] = ['api', 'reference', 'class'] + + return categories + + def generate_quick_reference(self, pages: List[Dict[str, Any]]) -> List[Dict[str, str]]: + """Generate quick reference from common patterns (NEW FEATURE)""" + quick_ref = [] + + # Collect all patterns + all_patterns = [] + for page in pages: + all_patterns.extend(page.get('patterns', [])) + + # Get most common code patterns + seen_codes = set() + for pattern in all_patterns: + code = pattern['code'] + if code not in seen_codes and len(code) < 300: + quick_ref.append(pattern) + seen_codes.add(code) + if len(quick_ref) >= 15: + break + + return quick_ref + + def create_reference_file(self, category: str, pages: List[Dict[str, Any]]) -> None: + """Create enhanced reference file""" + if not pages: + return + + lines = [] + lines.append(f"# {self.name.title()} - {category.replace('_', ' ').title()}\n") + lines.append(f"**Pages:** {len(pages)}\n") + lines.append("---\n") + + for page in pages: + lines.append(f"## {page['title']}\n") + lines.append(f"**URL:** {page['url']}\n") + + # Table of contents from headings + if page.get('headings'): + lines.append("**Contents:**") + for h in page['headings'][:10]: + level = int(h['level'][1]) if len(h['level']) > 1 else 1 + indent = " " * max(0, level - 2) + lines.append(f"{indent}- {h['text']}") + lines.append("") + + # Content (NO TRUNCATION) + if page.get('content'): + lines.append(page['content']) + lines.append("") + + # Code examples with language (NO TRUNCATION) + if page.get('code_samples'): + lines.append("**Examples:**\n") + for i, sample in enumerate(page['code_samples'][:4], 1): + lang = sample.get('language', 'unknown') + code = sample.get('code', sample if isinstance(sample, str) else '') + 
lines.append(f"Example {i} ({lang}):") + lines.append(f"```{lang}") + lines.append(code) # Full code, no truncation + lines.append("```\n") + + lines.append("---\n") + + filepath = os.path.join(self.skill_dir, "references", f"{category}.md") + with open(filepath, 'w', encoding='utf-8') as f: + f.write('\n'.join(lines)) + + logger.info(" ✓ %s.md (%d pages)", category, len(pages)) + + def create_enhanced_skill_md(self, categories: Dict[str, List[Dict[str, Any]]], quick_ref: List[Dict[str, str]]) -> None: + """Create SKILL.md with actual examples (IMPROVED)""" + description = self.config.get('description', f'Comprehensive assistance with {self.name}') + + # Extract actual code examples from docs + example_codes = [] + for pages in categories.values(): + for page in pages[:3]: # First 3 pages per category + for sample in page.get('code_samples', [])[:2]: # First 2 samples per page + code = sample.get('code', sample if isinstance(sample, str) else '') + lang = sample.get('language', 'unknown') + if len(code) < 200 and lang != 'unknown': + example_codes.append((lang, code)) + if len(example_codes) >= 10: + break + if len(example_codes) >= 10: + break + if len(example_codes) >= 10: + break + + content = f"""--- +name: {self.name} +description: {description} +--- + +# {self.name.title()} Skill + +Comprehensive assistance with {self.name} development, generated from official documentation. 
+ +## When to Use This Skill + +This skill should be triggered when: +- Working with {self.name} +- Asking about {self.name} features or APIs +- Implementing {self.name} solutions +- Debugging {self.name} code +- Learning {self.name} best practices + +## Quick Reference + +### Common Patterns + +""" + + # Add actual quick reference patterns + if quick_ref: + for i, pattern in enumerate(quick_ref[:8], 1): + content += f"**Pattern {i}:** {pattern.get('description', 'Example pattern')}\n\n" + content += "```\n" + content += pattern.get('code', '')[:300] + content += "\n```\n\n" + else: + content += "*Quick reference patterns will be added as you use the skill.*\n\n" + + # Add example codes from docs + if example_codes: + content += "### Example Code Patterns\n\n" + for i, (lang, code) in enumerate(example_codes[:5], 1): + content += f"**Example {i}** ({lang}):\n```{lang}\n{code}\n```\n\n" + + content += f"""## Reference Files + +This skill includes comprehensive documentation in `references/`: + +""" + + for cat in sorted(categories.keys()): + content += f"- **{cat}.md** - {cat.replace('_', ' ').title()} documentation\n" + + content += """ +Use `view` to read specific reference files when detailed information is needed. + +## Working with This Skill + +### For Beginners +Start with the getting_started or tutorials reference files for foundational concepts. + +### For Specific Features +Use the appropriate category reference file (api, guides, etc.) for detailed information. + +### For Code Examples +The quick reference section above contains common patterns extracted from the official docs. + +## Resources + +### references/ +Organized documentation extracted from official sources. These files contain: +- Detailed explanations +- Code examples with language annotations +- Links to original documentation +- Table of contents for quick navigation + +### scripts/ +Add helper scripts here for common automation tasks. 
+ +### assets/ +Add templates, boilerplate, or example projects here. + +## Notes + +- This skill was automatically generated from official documentation +- Reference files preserve the structure and examples from source docs +- Code examples include language detection for better syntax highlighting +- Quick reference patterns are extracted from common usage examples in the docs + +## Updating + +To refresh this skill with updated documentation: +1. Re-run the scraper with the same configuration +2. The skill will be rebuilt with the latest information +""" + + filepath = os.path.join(self.skill_dir, "SKILL.md") + with open(filepath, 'w', encoding='utf-8') as f: + f.write(content) + + logger.info(" ✓ SKILL.md (enhanced with %d examples)", len(example_codes)) + + def create_index(self, categories: Dict[str, List[Dict[str, Any]]]) -> None: + """Create navigation index""" + lines = [] + lines.append(f"# {self.name.title()} Documentation Index\n") + lines.append("## Categories\n") + + for cat, pages in sorted(categories.items()): + lines.append(f"### {cat.replace('_', ' ').title()}") + lines.append(f"**File:** `{cat}.md`") + lines.append(f"**Pages:** {len(pages)}\n") + + filepath = os.path.join(self.skill_dir, "references", "index.md") + with open(filepath, 'w', encoding='utf-8') as f: + f.write('\n'.join(lines)) + + logger.info(" ✓ index.md") + + def build_skill(self) -> bool: + """Build the skill from scraped data. + + Loads scraped JSON files, categorizes pages, extracts patterns, + and generates SKILL.md and reference files. 
+ + Returns: + bool: True if build succeeded, False otherwise + """ + logger.info("\n" + "=" * 60) + logger.info("BUILDING SKILL: %s", self.name) + logger.info("=" * 60 + "\n") + + # Load data + logger.info("Loading scraped data...") + pages = self.load_scraped_data() + + if not pages: + logger.error("✗ No scraped data found!") + return False + + logger.info(" ✓ Loaded %d pages\n", len(pages)) + + # Categorize + logger.info("Categorizing pages...") + categories = self.smart_categorize(pages) + logger.info(" ✓ Created %d categories\n", len(categories)) + + # Generate quick reference + logger.info("Generating quick reference...") + quick_ref = self.generate_quick_reference(pages) + logger.info(" ✓ Extracted %d patterns\n", len(quick_ref)) + + # Create reference files + logger.info("Creating reference files...") + for cat, cat_pages in categories.items(): + self.create_reference_file(cat, cat_pages) + + # Create index + self.create_index(categories) + logger.info("") + + # Create enhanced SKILL.md + logger.info("Creating SKILL.md...") + self.create_enhanced_skill_md(categories, quick_ref) + + logger.info("\n✅ Skill built: %s/", self.skill_dir) + return True + + +def validate_config(config: Dict[str, Any]) -> Tuple[List[str], List[str]]: + """Validate configuration structure and values. + + Args: + config (dict): Configuration dictionary to validate + + Returns: + tuple: (errors, warnings) where each is a list of strings + + Example: + >>> errors, warnings = validate_config({'name': 'test', 'base_url': 'https://example.com'}) + >>> if errors: + ... 
print("Invalid config:", errors) + """ + errors = [] + warnings = [] + + # Required fields + required_fields = ['name', 'base_url'] + for field in required_fields: + if field not in config: + errors.append(f"Missing required field: '{field}'") + + # Validate name (alphanumeric, hyphens, underscores only) + if 'name' in config: + if not re.match(r'^[a-zA-Z0-9_-]+$', config['name']): + errors.append(f"Invalid name: '{config['name']}' (use only letters, numbers, hyphens, underscores)") + + # Validate base_url + if 'base_url' in config: + if not config['base_url'].startswith(('http://', 'https://')): + errors.append(f"Invalid base_url: '{config['base_url']}' (must start with http:// or https://)") + + # Validate selectors structure + if 'selectors' in config: + if not isinstance(config['selectors'], dict): + errors.append("'selectors' must be a dictionary") + else: + recommended_selectors = ['main_content', 'title', 'code_blocks'] + for selector in recommended_selectors: + if selector not in config['selectors']: + warnings.append(f"Missing recommended selector: '{selector}'") + else: + warnings.append("Missing 'selectors' section (recommended)") + + # Validate url_patterns + if 'url_patterns' in config: + if not isinstance(config['url_patterns'], dict): + errors.append("'url_patterns' must be a dictionary") + else: + for key in ['include', 'exclude']: + if key in config['url_patterns']: + if not isinstance(config['url_patterns'][key], list): + errors.append(f"'url_patterns.{key}' must be a list") + + # Validate categories + if 'categories' in config: + if not isinstance(config['categories'], dict): + errors.append("'categories' must be a dictionary") + else: + for cat_name, keywords in config['categories'].items(): + if not isinstance(keywords, list): + errors.append(f"'categories.{cat_name}' must be a list of keywords") + + # Validate rate_limit + if 'rate_limit' in config: + try: + rate = float(config['rate_limit']) + if rate < 0: + errors.append(f"'rate_limit' must 
be non-negative (got {rate})") + elif rate > 10: + warnings.append(f"'rate_limit' is very high ({rate}s) - this may slow down scraping significantly") + except (ValueError, TypeError): + errors.append(f"'rate_limit' must be a number (got {config['rate_limit']})") + + # Validate max_pages + if 'max_pages' in config: + max_p_value = config['max_pages'] + + # Allow None for unlimited + if max_p_value is None: + warnings.append("'max_pages' is None (unlimited) - this will scrape ALL pages. Use with caution!") + else: + try: + max_p = int(max_p_value) + # Allow -1 for unlimited + if max_p == -1: + warnings.append("'max_pages' is -1 (unlimited) - this will scrape ALL pages. Use with caution!") + elif max_p < 1: + errors.append(f"'max_pages' must be at least 1 or -1 for unlimited (got {max_p})") + elif max_p > MAX_PAGES_WARNING_THRESHOLD: + warnings.append(f"'max_pages' is very high ({max_p}) - scraping may take a very long time") + except (ValueError, TypeError): + errors.append(f"'max_pages' must be an integer, -1, or null (got {config['max_pages']})") + + # Validate start_urls if present + if 'start_urls' in config: + if not isinstance(config['start_urls'], list): + errors.append("'start_urls' must be a list") + else: + for url in config['start_urls']: + if not url.startswith(('http://', 'https://')): + errors.append(f"Invalid start_url: '{url}' (must start with http:// or https://)") + + return errors, warnings + + +def load_config(config_path: str) -> Dict[str, Any]: + """Load and validate configuration from JSON file. 
+ + Args: + config_path (str): Path to JSON configuration file + + Returns: + dict: Validated configuration dictionary + + Raises: + SystemExit: If config is invalid or file not found + + Example: + >>> config = load_config('configs/react.json') + >>> print(config['name']) + 'react' + """ + try: + with open(config_path, 'r') as f: + config = json.load(f) + except json.JSONDecodeError as e: + logger.error("❌ Error: Invalid JSON in config file: %s", config_path) + logger.error(" Details: %s", e) + logger.error(" Suggestion: Check syntax at line %d, column %d", e.lineno, e.colno) + sys.exit(1) + except FileNotFoundError: + logger.error("❌ Error: Config file not found: %s", config_path) + logger.error(" Suggestion: Create a config file or use an existing one from configs/") + logger.error(" Available configs: react.json, vue.json, django.json, godot.json") + sys.exit(1) + + # Validate config + errors, warnings = validate_config(config) + + # Show warnings (non-blocking) + if warnings: + logger.warning("⚠️ Configuration warnings in %s:", config_path) + for warning in warnings: + logger.warning(" - %s", warning) + logger.info("") + + # Show errors (blocking) + if errors: + logger.error("❌ Configuration validation errors in %s:", config_path) + for error in errors: + logger.error(" - %s", error) + logger.error("\n Suggestion: Fix the above errors or check configs/ for working examples") + sys.exit(1) + + return config + + +def interactive_config() -> Dict[str, Any]: + """Interactive configuration wizard for creating new configs. + + Prompts user for all required configuration fields step-by-step + and returns a complete configuration dictionary. + + Returns: + dict: Complete configuration dictionary with user-provided values + + Example: + >>> config = interactive_config() + # User enters: name=react, url=https://react.dev, etc. 
+ >>> config['name'] + 'react' + """ + logger.info("\n" + "="*60) + logger.info("Documentation to Skill Converter") + logger.info("="*60 + "\n") + + config: Dict[str, Any] = {} + + # Basic info + config['name'] = input("Skill name (e.g., 'react', 'godot'): ").strip() + config['description'] = input("Skill description: ").strip() + config['base_url'] = input("Base URL (e.g., https://docs.example.com/): ").strip() + + if not config['base_url'].endswith('/'): + config['base_url'] += '/' + + # Selectors + logger.info("\nCSS Selectors (press Enter for defaults):") + selectors = {} + selectors['main_content'] = input(" Main content [div[role='main']]: ").strip() or "div[role='main']" + selectors['title'] = input(" Title [title]: ").strip() or "title" + selectors['code_blocks'] = input(" Code blocks [pre code]: ").strip() or "pre code" + config['selectors'] = selectors + + # URL patterns + logger.info("\nURL Patterns (comma-separated, optional):") + include = input(" Include: ").strip() + exclude = input(" Exclude: ").strip() + config['url_patterns'] = { + 'include': [p.strip() for p in include.split(',') if p.strip()], + 'exclude': [p.strip() for p in exclude.split(',') if p.strip()] + } + + # Settings + rate = input(f"\nRate limit (seconds) [{DEFAULT_RATE_LIMIT}]: ").strip() + config['rate_limit'] = float(rate) if rate else DEFAULT_RATE_LIMIT + + max_p = input(f"Max pages [{DEFAULT_MAX_PAGES}]: ").strip() + config['max_pages'] = int(max_p) if max_p else DEFAULT_MAX_PAGES + + return config + + +def check_existing_data(name: str) -> Tuple[bool, int]: + """Check if scraped data already exists for a skill. + + Args: + name (str): Skill name to check + + Returns: + tuple: (exists, page_count) where exists is bool and page_count is int + + Example: + >>> exists, count = check_existing_data('react') + >>> if exists: + ... 
print(f"Found {count} existing pages") + """ + data_dir = f"output/{name}_data" + if os.path.exists(data_dir) and os.path.exists(f"{data_dir}/summary.json"): + with open(f"{data_dir}/summary.json", 'r') as f: + summary = json.load(f) + return True, summary.get('total_pages', 0) + return False, 0 + + +def setup_argument_parser() -> argparse.ArgumentParser: + """Setup and configure command-line argument parser. + + Creates an ArgumentParser with all CLI options for the doc scraper tool, + including configuration, scraping, enhancement, and performance options. + + Returns: + argparse.ArgumentParser: Configured argument parser + + Example: + >>> parser = setup_argument_parser() + >>> args = parser.parse_args(['--config', 'configs/react.json']) + >>> print(args.config) + configs/react.json + """ + parser = argparse.ArgumentParser( + description='Convert documentation websites to Claude skills', + formatter_class=argparse.RawDescriptionHelpFormatter + ) + + parser.add_argument('--interactive', '-i', action='store_true', + help='Interactive configuration mode') + parser.add_argument('--config', '-c', type=str, + help='Load configuration from file (e.g., configs/godot.json)') + parser.add_argument('--name', type=str, + help='Skill name') + parser.add_argument('--url', type=str, + help='Base documentation URL') + parser.add_argument('--description', '-d', type=str, + help='Skill description') + parser.add_argument('--skip-scrape', action='store_true', + help='Skip scraping, use existing data') + parser.add_argument('--dry-run', action='store_true', + help='Preview what will be scraped without actually scraping') + parser.add_argument('--enhance', action='store_true', + help='Enhance SKILL.md using Claude API after building (requires API key)') + parser.add_argument('--enhance-local', action='store_true', + help='Enhance SKILL.md using Claude Code (no API key needed, runs in background)') + parser.add_argument('--interactive-enhancement', action='store_true', + help='Open 
terminal window for enhancement (use with --enhance-local)') + parser.add_argument('--api-key', type=str, + help='Anthropic API key for --enhance (or set ANTHROPIC_API_KEY)') + parser.add_argument('--resume', action='store_true', + help='Resume from last checkpoint (for interrupted scrapes)') + parser.add_argument('--fresh', action='store_true', + help='Clear checkpoint and start fresh') + parser.add_argument('--rate-limit', '-r', type=float, metavar='SECONDS', + help=f'Override rate limit in seconds (default: from config or {DEFAULT_RATE_LIMIT}). Use 0 for no delay.') + parser.add_argument('--workers', '-w', type=int, metavar='N', + help='Number of parallel workers for faster scraping (default: 1, max: 10)') + parser.add_argument('--async', dest='async_mode', action='store_true', + help='Enable async mode for better parallel performance (2-3x faster than threads)') + parser.add_argument('--no-rate-limit', action='store_true', + help='Disable rate limiting completely (same as --rate-limit 0)') + parser.add_argument('--verbose', '-v', action='store_true', + help='Enable verbose output (DEBUG level logging)') + parser.add_argument('--quiet', '-q', action='store_true', + help='Minimize output (WARNING level logging only)') + + return parser + + +def get_configuration(args: argparse.Namespace) -> Dict[str, Any]: + """Load or create configuration from command-line arguments. + + Handles three configuration modes: + 1. Load from JSON file (--config) + 2. Interactive configuration wizard (--interactive or missing args) + 3. Quick mode from command-line arguments (--name, --url) + + Also applies CLI overrides for rate limiting and worker count. 
+ + Args: + args: Parsed command-line arguments from argparse + + Returns: + dict: Configuration dictionary with all required fields + + Example: + >>> args = parser.parse_args(['--name', 'react', '--url', 'https://react.dev']) + >>> config = get_configuration(args) + >>> print(config['name']) + react + """ + # Get base configuration + if args.config: + config = load_config(args.config) + elif args.interactive or not (args.name and args.url): + config = interactive_config() + else: + config = { + 'name': args.name, + 'description': args.description or f'Comprehensive assistance with {args.name}', + 'base_url': args.url, + 'selectors': { + 'main_content': "div[role='main']", + 'title': 'title', + 'code_blocks': 'pre code' + }, + 'url_patterns': {'include': [], 'exclude': []}, + 'rate_limit': DEFAULT_RATE_LIMIT, + 'max_pages': DEFAULT_MAX_PAGES + } + + # Apply CLI overrides for rate limiting + if args.no_rate_limit: + config['rate_limit'] = 0 + logger.info("⚡ Rate limiting disabled") + elif args.rate_limit is not None: + config['rate_limit'] = args.rate_limit + if args.rate_limit == 0: + logger.info("⚡ Rate limiting disabled") + else: + logger.info("⚡ Rate limit override: %ss per page", args.rate_limit) + + # Apply CLI overrides for worker count + if args.workers: + # Validate workers count + if args.workers < 1: + logger.error("❌ Error: --workers must be at least 1 (got %d)", args.workers) + logger.error(" Suggestion: Use --workers 1 (default) or omit the flag") + sys.exit(1) + if args.workers > 10: + logger.warning("⚠️ Warning: --workers capped at 10 (requested %d)", args.workers) + args.workers = 10 + config['workers'] = args.workers + if args.workers > 1: + logger.info("🚀 Parallel scraping enabled: %d workers", args.workers) + + # Apply CLI override for async mode + if args.async_mode: + config['async_mode'] = True + if config.get('workers', 1) > 1: + logger.info("⚡ Async mode enabled (2-3x faster than threads)") + else: + logger.warning("⚠️ Async mode enabled 
but workers=1. Consider using --workers 4 for better performance") + + return config + + +def execute_scraping_and_building(config: Dict[str, Any], args: argparse.Namespace) -> Optional['DocToSkillConverter']: + """Execute the scraping and skill building process. + + Handles dry run mode, existing data checks, scraping with checkpoints, + keyboard interrupts, and skill building. This is the core workflow + orchestration for the scraping phase. + + Args: + config (dict): Configuration dictionary with scraping parameters + args: Parsed command-line arguments + + Returns: + DocToSkillConverter: The converter instance after scraping/building, + or None if process was aborted + + Example: + >>> config = {'name': 'react', 'base_url': 'https://react.dev'} + >>> converter = execute_scraping_and_building(config, args) + >>> if converter: + ... print("Scraping complete!") + """ + # Dry run mode - preview only + if args.dry_run: + logger.info("\n" + "=" * 60) + logger.info("DRY RUN MODE") + logger.info("=" * 60) + logger.info("This will show what would be scraped without saving anything.\n") + + converter = DocToSkillConverter(config, dry_run=True) + converter.scrape_all() + + logger.info("\n📋 Configuration Summary:") + logger.info(" Name: %s", config['name']) + logger.info(" Base URL: %s", config['base_url']) + logger.info(" Max pages: %d", config.get('max_pages', DEFAULT_MAX_PAGES)) + logger.info(" Rate limit: %ss", config.get('rate_limit', DEFAULT_RATE_LIMIT)) + logger.info(" Categories: %d", len(config.get('categories', {}))) + return None + + # Check for existing data + exists, page_count = check_existing_data(config['name']) + + if exists and not args.skip_scrape and not args.fresh: + # Check force_rescrape flag from config + if config.get('force_rescrape', False): + # Auto-delete cached data and rescrape + logger.info("\n✓ Found existing data: %d pages", page_count) + logger.info(" force_rescrape enabled - deleting cached data and rescraping") + import shutil + data_dir
= f"output/{config['name']}_data" + if os.path.exists(data_dir): + shutil.rmtree(data_dir) + logger.info(f" Deleted: {data_dir}") + else: + # Only prompt if force_rescrape is False + logger.info("\n✓ Found existing data: %d pages", page_count) + response = input("Use existing data? (y/n): ").strip().lower() + if response == 'y': + args.skip_scrape = True + elif exists and args.fresh: + logger.info("\n✓ Found existing data: %d pages", page_count) + logger.info(" --fresh flag set, will re-scrape from scratch") + + # Create converter + converter = DocToSkillConverter(config, resume=args.resume) + + # Handle fresh start (clear checkpoint) + if args.fresh: + converter.clear_checkpoint() + + # Scrape or skip + if not args.skip_scrape: + try: + converter.scrape_all() + # Save final checkpoint + if converter.checkpoint_enabled: + converter.save_checkpoint() + logger.info("\n💾 Final checkpoint saved") + # Clear checkpoint after successful completion + converter.clear_checkpoint() + logger.info("✅ Scraping complete - checkpoint cleared") + except KeyboardInterrupt: + logger.warning("\n\nScraping interrupted.") + if converter.checkpoint_enabled: + converter.save_checkpoint() + logger.info("💾 Progress saved to checkpoint") + logger.info(" Resume with: --config %s --resume", args.config if args.config else 'config.json') + response = input("Continue with skill building? (y/n): ").strip().lower() + if response != 'y': + return None + else: + logger.info("\n⏭️ Skipping scrape, using existing data") + + # Build skill + success = converter.build_skill() + + if not success: + sys.exit(1) + + return converter + + +def execute_enhancement(config: Dict[str, Any], args: argparse.Namespace) -> None: + """Execute optional SKILL.md enhancement with Claude. + + Supports two enhancement modes: + 1. API-based enhancement (requires ANTHROPIC_API_KEY) + 2. 
Local enhancement using Claude Code (no API key needed) + + Prints appropriate messages and suggestions based on whether + enhancement was requested and whether it succeeded. + + Args: + config (dict): Configuration dictionary with skill name + args: Parsed command-line arguments with enhancement flags + + Example: + >>> execute_enhancement(config, args) + # Runs enhancement if --enhance or --enhance-local flag is set + """ + import subprocess + + # Optional enhancement with Claude API + if args.enhance: + logger.info("\n" + "=" * 60) + logger.info("ENHANCING SKILL.MD WITH CLAUDE API") + logger.info("=" * 60 + "\n") + + try: + enhance_cmd = ['python3', 'cli/enhance_skill.py', f'output/{config["name"]}/'] + if args.api_key: + enhance_cmd.extend(['--api-key', args.api_key]) + + result = subprocess.run(enhance_cmd, check=True) + if result.returncode == 0: + logger.info("\n✅ Enhancement complete!") + except subprocess.CalledProcessError: + logger.warning("\n⚠ Enhancement failed, but skill was still built") + except FileNotFoundError: + logger.warning("\n⚠ enhance_skill.py not found. Run manually:") + logger.info(" skill-seekers-enhance output/%s/", config['name']) + + # Optional enhancement with Claude Code (local, no API key) + if args.enhance_local: + logger.info("\n" + "=" * 60) + if args.interactive_enhancement: + logger.info("ENHANCING SKILL.MD WITH CLAUDE CODE (INTERACTIVE)") + else: + logger.info("ENHANCING SKILL.MD WITH CLAUDE CODE (HEADLESS)") + logger.info("=" * 60 + "\n") + + try: + enhance_cmd = ['skill-seekers-enhance', f'output/{config["name"]}/'] + if args.interactive_enhancement: + enhance_cmd.append('--interactive-enhancement') + + result = subprocess.run(enhance_cmd, check=True) + + if result.returncode == 0: + logger.info("\n✅ Enhancement complete!") + except subprocess.CalledProcessError: + logger.warning("\n⚠ Enhancement failed, but skill was still built") + except FileNotFoundError: + logger.warning("\n⚠ skill-seekers-enhance command not found. 
Run manually:") + logger.info(" skill-seekers-enhance output/%s/", config['name']) + + # Print packaging instructions + logger.info("\n📦 Package your skill:") + logger.info(" skill-seekers-package output/%s/", config['name']) + + # Suggest enhancement if not done + if not args.enhance and not args.enhance_local: + logger.info("\n💡 Optional: Enhance SKILL.md with Claude:") + logger.info(" Local (recommended): skill-seekers-enhance output/%s/", config['name']) + logger.info(" or re-run with: --enhance-local") + logger.info(" API-based: skill-seekers-enhance-api output/%s/", config['name']) + logger.info(" or re-run with: --enhance") + logger.info("\n💡 Tip: Use --interactive-enhancement with --enhance-local to open terminal window") + + +def main() -> None: + parser = setup_argument_parser() + args = parser.parse_args() + + # Setup logging based on verbosity flags + setup_logging(verbose=args.verbose, quiet=args.quiet) + + config = get_configuration(args) + + # Execute scraping and building + converter = execute_scraping_and_building(config, args) + + # Exit if dry run or aborted + if converter is None: + return + + # Execute enhancement and print instructions + execute_enhancement(config, args) + + +if __name__ == "__main__": + main() diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/enhance_skill.py b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/enhance_skill.py new file mode 100644 index 0000000..50df45b --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/enhance_skill.py @@ -0,0 +1,273 @@ +#!/usr/bin/env python3 +""" +SKILL.md Enhancement Script +Uses Claude API to improve SKILL.md by analyzing reference documentation. 
+ +Usage: + skill-seekers enhance output/steam-inventory/ + skill-seekers enhance output/react/ + skill-seekers enhance output/godot/ --api-key YOUR_API_KEY +""" + +import os +import sys +import json +import argparse +from pathlib import Path + +# Add parent directory to path for imports when run as script +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from skill_seekers.cli.constants import API_CONTENT_LIMIT, API_PREVIEW_LIMIT +from skill_seekers.cli.utils import read_reference_files + +try: + import anthropic +except ImportError: + print("❌ Error: anthropic package not installed") + print("Install with: pip3 install anthropic") + sys.exit(1) + + +class SkillEnhancer: + def __init__(self, skill_dir, api_key=None): + self.skill_dir = Path(skill_dir) + self.references_dir = self.skill_dir / "references" + self.skill_md_path = self.skill_dir / "SKILL.md" + + # Get API key + self.api_key = api_key or os.environ.get('ANTHROPIC_API_KEY') + if not self.api_key: + raise ValueError( + "No API key provided. 
Set ANTHROPIC_API_KEY environment variable " + "or use --api-key argument" + ) + + self.client = anthropic.Anthropic(api_key=self.api_key) + + def read_current_skill_md(self): + """Read existing SKILL.md""" + if not self.skill_md_path.exists(): + return None + return self.skill_md_path.read_text(encoding='utf-8') + + def enhance_skill_md(self, references, current_skill_md): + """Use Claude to enhance SKILL.md""" + + # Build prompt + prompt = self._build_enhancement_prompt(references, current_skill_md) + + print("\n🤖 Asking Claude to enhance SKILL.md...") + print(f" Input: {len(prompt):,} characters") + + try: + message = self.client.messages.create( + model="claude-sonnet-4-20250514", + max_tokens=4096, + temperature=0.3, + messages=[{ + "role": "user", + "content": prompt + }] + ) + + enhanced_content = message.content[0].text + return enhanced_content + + except Exception as e: + print(f"❌ Error calling Claude API: {e}") + return None + + def _build_enhancement_prompt(self, references, current_skill_md): + """Build the prompt for Claude""" + + # Extract skill name and description + skill_name = self.skill_dir.name + + prompt = f"""You are enhancing a Claude skill's SKILL.md file. This skill is about: {skill_name} + +I've scraped documentation and organized it into reference files. Your job is to create an EXCELLENT SKILL.md that will help Claude use this documentation effectively. + +CURRENT SKILL.MD: +{'```markdown' if current_skill_md else '(none - create from scratch)'} +{current_skill_md or 'No existing SKILL.md'} +{'```' if current_skill_md else ''} + +REFERENCE DOCUMENTATION: +""" + + for filename, content in references.items(): + prompt += f"\n\n## {filename}\n```markdown\n{content[:30000]}\n```\n" + + prompt += """ + +YOUR TASK: +Create an enhanced SKILL.md that includes: + +1. **Clear "When to Use This Skill" section** - Be specific about trigger conditions +2.
**Excellent Quick Reference section** - Extract 5-10 of the BEST, most practical code examples from the reference docs + - Choose SHORT, clear examples that demonstrate common tasks + - Include both simple and intermediate examples + - Annotate examples with clear descriptions + - Use proper language tags (cpp, python, javascript, json, etc.) +3. **Detailed Reference Files description** - Explain what's in each reference file +4. **Practical "Working with This Skill" section** - Give users clear guidance on how to navigate the skill +5. **Key Concepts section** (if applicable) - Explain core concepts +6. **Keep the frontmatter** (---\nname: ...\n---) intact + +IMPORTANT: +- Extract REAL examples from the reference docs, don't make them up +- Prioritize SHORT, clear examples (5-20 lines max) +- Make it actionable and practical +- Don't be too verbose - be concise but useful +- Maintain the markdown structure for Claude skills +- Keep code examples properly formatted with language tags + +OUTPUT: +Return ONLY the complete SKILL.md content, starting with the frontmatter (---). 
+""" + + return prompt + + def save_enhanced_skill_md(self, content): + """Save the enhanced SKILL.md""" + # Backup original + if self.skill_md_path.exists(): + backup_path = self.skill_md_path.with_suffix('.md.backup') + self.skill_md_path.rename(backup_path) + print(f" 💾 Backed up original to: {backup_path.name}") + + # Save enhanced version + self.skill_md_path.write_text(content, encoding='utf-8') + print(f" ✅ Saved enhanced SKILL.md") + + def run(self): + """Main enhancement workflow""" + print(f"\n{'='*60}") + print(f"ENHANCING SKILL: {self.skill_dir.name}") + print(f"{'='*60}\n") + + # Read reference files + print("📖 Reading reference documentation...") + references = read_reference_files( + self.skill_dir, + max_chars=API_CONTENT_LIMIT, + preview_limit=API_PREVIEW_LIMIT + ) + + if not references: + print("❌ No reference files found to analyze") + return False + + print(f" ✓ Read {len(references)} reference files") + total_size = sum(len(c) for c in references.values()) + print(f" ✓ Total size: {total_size:,} characters\n") + + # Read current SKILL.md + current_skill_md = self.read_current_skill_md() + if current_skill_md: + print(f" ℹ Found existing SKILL.md ({len(current_skill_md)} chars)") + else: + print(f" ℹ No existing SKILL.md, will create new one") + + # Enhance with Claude + enhanced = self.enhance_skill_md(references, current_skill_md) + + if not enhanced: + print("❌ Enhancement failed") + return False + + print(f" ✓ Generated enhanced SKILL.md ({len(enhanced)} chars)\n") + + # Save + print("💾 Saving enhanced SKILL.md...") + self.save_enhanced_skill_md(enhanced) + + print(f"\n✅ Enhancement complete!") + print(f"\nNext steps:") + print(f" 1. Review: {self.skill_md_path}") + print(f" 2. If you don't like it, restore backup: {self.skill_md_path.with_suffix('.md.backup')}") + print(f" 3. 
Package your skill:") + print(f" skill-seekers package {self.skill_dir}/") + + return True + + +def main(): + parser = argparse.ArgumentParser( + description='Enhance SKILL.md using Claude API', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Using ANTHROPIC_API_KEY environment variable + export ANTHROPIC_API_KEY=sk-ant-... + skill-seekers enhance output/steam-inventory/ + + # Providing API key directly + skill-seekers enhance output/react/ --api-key sk-ant-... + + # Show what would be done (dry run) + skill-seekers enhance output/godot/ --dry-run +""" + ) + + parser.add_argument('skill_dir', type=str, + help='Path to skill directory (e.g., output/steam-inventory/)') + parser.add_argument('--api-key', type=str, + help='Anthropic API key (or set ANTHROPIC_API_KEY env var)') + parser.add_argument('--dry-run', action='store_true', + help='Show what would be done without calling API') + + args = parser.parse_args() + + # Validate skill directory + skill_dir = Path(args.skill_dir) + if not skill_dir.exists(): + print(f"❌ Error: Directory not found: {skill_dir}") + sys.exit(1) + + if not skill_dir.is_dir(): + print(f"❌ Error: Not a directory: {skill_dir}") + sys.exit(1) + + # Dry run mode + if args.dry_run: + print(f"🔍 DRY RUN MODE") + print(f" Would enhance: {skill_dir}") + print(f" References: {skill_dir / 'references'}") + print(f" SKILL.md: {skill_dir / 'SKILL.md'}") + + refs_dir = skill_dir / "references" + if refs_dir.exists(): + ref_files = list(refs_dir.glob("*.md")) + print(f" Found {len(ref_files)} reference files:") + for rf in ref_files: + size = rf.stat().st_size + print(f" - {rf.name} ({size:,} bytes)") + + print("\nTo actually run enhancement:") + print(f" skill-seekers enhance {skill_dir}") + return + + # Create enhancer and run + try: + enhancer = SkillEnhancer(skill_dir, api_key=args.api_key) + success = enhancer.run() + sys.exit(0 if success else 1) + + except ValueError as e: + print(f"❌ Error: {e}") + print("\nSet 
your API key:") + print(" export ANTHROPIC_API_KEY=sk-ant-...") + print("Or provide it directly:") + print(f" skill-seekers enhance {skill_dir} --api-key sk-ant-...") + sys.exit(1) + except Exception as e: + print(f"❌ Unexpected error: {e}") + import traceback + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/enhance_skill_local.py b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/enhance_skill_local.py new file mode 100644 index 0000000..99480c5 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/enhance_skill_local.py @@ -0,0 +1,451 @@ +#!/usr/bin/env python3 +""" +SKILL.md Enhancement Script (Local - Using Claude Code) +Opens a new terminal with Claude Code to enhance SKILL.md, then reports back. +No API key needed - uses your existing Claude Code Max plan! + +Usage: + skill-seekers enhance output/steam-inventory/ + skill-seekers enhance output/react/ + +Terminal Selection: + The script automatically detects which terminal app to use: + 1. SKILL_SEEKER_TERMINAL env var (highest priority) + Example: export SKILL_SEEKER_TERMINAL="Ghostty" + 2. TERM_PROGRAM env var (current terminal) + 3. Terminal.app (fallback) + + Supported terminals: Ghostty, iTerm, Terminal, WezTerm +""" + +import os +import sys +import time +import subprocess +import tempfile +from pathlib import Path + +# Add parent directory to path for imports when run as script +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from skill_seekers.cli.constants import LOCAL_CONTENT_LIMIT, LOCAL_PREVIEW_LIMIT +from skill_seekers.cli.utils import read_reference_files + + +def detect_terminal_app(): + """Detect which terminal app to use with cascading priority. + + Priority order: + 1. SKILL_SEEKER_TERMINAL environment variable (explicit user preference) + 2. 
TERM_PROGRAM environment variable (inherit current terminal) + 3. Terminal.app (fallback default) + + Returns: + tuple: (terminal_app_name, detection_method) + - terminal_app_name (str): Name of terminal app to launch (e.g., "Ghostty", "Terminal") + - detection_method (str): How the terminal was detected (for logging) + + Examples: + >>> os.environ['SKILL_SEEKER_TERMINAL'] = 'Ghostty' + >>> detect_terminal_app() + ('Ghostty', 'SKILL_SEEKER_TERMINAL') + + >>> os.environ['TERM_PROGRAM'] = 'iTerm.app' + >>> detect_terminal_app() + ('iTerm', 'TERM_PROGRAM') + """ + # Map TERM_PROGRAM values to macOS app names + TERMINAL_MAP = { + 'Apple_Terminal': 'Terminal', + 'iTerm.app': 'iTerm', + 'ghostty': 'Ghostty', + 'WezTerm': 'WezTerm', + } + + # Priority 1: Check SKILL_SEEKER_TERMINAL env var (explicit preference) + preferred_terminal = os.environ.get('SKILL_SEEKER_TERMINAL', '').strip() + if preferred_terminal: + return preferred_terminal, 'SKILL_SEEKER_TERMINAL' + + # Priority 2: Check TERM_PROGRAM (inherit current terminal) + term_program = os.environ.get('TERM_PROGRAM', '').strip() + if term_program and term_program in TERMINAL_MAP: + return TERMINAL_MAP[term_program], 'TERM_PROGRAM' + + # Priority 3: Fallback to Terminal.app + if term_program: + # TERM_PROGRAM is set but unknown + return 'Terminal', f'unknown TERM_PROGRAM ({term_program})' + else: + # No TERM_PROGRAM set + return 'Terminal', 'default' + + +class LocalSkillEnhancer: + def __init__(self, skill_dir): + self.skill_dir = Path(skill_dir) + self.references_dir = self.skill_dir / "references" + self.skill_md_path = self.skill_dir / "SKILL.md" + + def create_enhancement_prompt(self): + """Create the prompt file for Claude Code""" + + # Read reference files + references = read_reference_files( + self.skill_dir, + max_chars=LOCAL_CONTENT_LIMIT, + preview_limit=LOCAL_PREVIEW_LIMIT + ) + + if not references: + print("❌ No reference files found") + return None + + # Read current SKILL.md + current_skill_md = "" + if 
self.skill_md_path.exists(): + current_skill_md = self.skill_md_path.read_text(encoding='utf-8') + + # Build prompt + prompt = f"""I need you to enhance the SKILL.md file for the {self.skill_dir.name} skill. + +CURRENT SKILL.MD: +{'-'*60} +{current_skill_md if current_skill_md else '(No existing SKILL.md - create from scratch)'} +{'-'*60} + +REFERENCE DOCUMENTATION: +{'-'*60} +""" + + for filename, content in references.items(): + prompt += f"\n## {filename}\n{content[:15000]}\n" + + prompt += f""" +{'-'*60} + +YOUR TASK: +Create an EXCELLENT SKILL.md file that will help Claude use this documentation effectively. + +Requirements: +1. **Clear "When to Use This Skill" section** + - Be SPECIFIC about trigger conditions + - List concrete use cases + +2. **Excellent Quick Reference section** + - Extract 5-10 of the BEST, most practical code examples from the reference docs + - Choose SHORT, clear examples (5-20 lines max) + - Include both simple and intermediate examples + - Use proper language tags (cpp, python, javascript, json, etc.) + - Add clear descriptions for each example + +3. **Detailed Reference Files description** + - Explain what's in each reference file + - Help users navigate the documentation + +4. **Practical "Working with This Skill" section** + - Clear guidance for beginners, intermediate, and advanced users + - Navigation tips + +5.
**Key Concepts section** (if applicable) + - Explain core concepts + - Define important terminology + +IMPORTANT: +- Extract REAL examples from the reference docs above +- Prioritize SHORT, clear examples +- Make it actionable and practical +- Keep the frontmatter (---\\nname: ...\\n---) intact +- Use proper markdown formatting + +SAVE THE RESULT: +Save the complete enhanced SKILL.md to: {self.skill_md_path.absolute()} + +First, backup the original to: {self.skill_md_path.with_suffix('.md.backup').absolute()} +""" + + return prompt + + def run(self, headless=True, timeout=600): + """Main enhancement workflow + + Args: + headless: If True, run claude directly without opening terminal (default: True) + timeout: Maximum time to wait for enhancement in seconds (default: 600 = 10 minutes) + """ + print(f"\n{'='*60}") + print(f"LOCAL ENHANCEMENT: {self.skill_dir.name}") + print(f"{'='*60}\n") + + # Validate + if not self.skill_dir.exists(): + print(f"❌ Directory not found: {self.skill_dir}") + return False + + # Read reference files + print("📖 Reading reference documentation...") + references = read_reference_files( + self.skill_dir, + max_chars=LOCAL_CONTENT_LIMIT, + preview_limit=LOCAL_PREVIEW_LIMIT + ) + + if not references: + print("❌ No reference files found to analyze") + return False + + print(f" ✓ Read {len(references)} reference files") + total_size = sum(len(c) for c in references.values()) + print(f" ✓ Total size: {total_size:,} characters\n") + + # Create prompt + print("📝 Creating enhancement prompt...") + prompt = self.create_enhancement_prompt() + + if not prompt: + return False + + # Save prompt to temp file + with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8') as f: + prompt_file = f.name + f.write(prompt) + + print(f" ✓ Prompt saved ({len(prompt):,} characters)\n") + + # Headless mode: Run claude directly without opening terminal + if headless: + return self._run_headless(prompt_file, timeout) + + # Terminal mode: 
Launch Claude Code in new terminal + print("🚀 Launching Claude Code in new terminal...") + print(" This will:") + print(" 1. Open a new terminal window") + print(" 2. Run Claude Code with the enhancement task") + print(" 3. Claude will read the docs and enhance SKILL.md") + print(" 4. Terminal will auto-close when done") + print() + + # Create a shell script to run in the terminal + shell_script = f'''#!/bin/bash +claude {prompt_file} +echo "" +echo "✅ Enhancement complete!" +echo "Press any key to close..." +read -n 1 +rm {prompt_file} +''' + + # Save shell script + with tempfile.NamedTemporaryFile(mode='w', suffix='.sh', delete=False) as f: + script_file = f.name + f.write(shell_script) + + os.chmod(script_file, 0o755) + + # Launch in new terminal (macOS specific) + if sys.platform == 'darwin': + # Detect which terminal app to use + terminal_app, detection_method = detect_terminal_app() + + # Show detection info + if detection_method == 'SKILL_SEEKER_TERMINAL': + print(f" Using terminal: {terminal_app} (from SKILL_SEEKER_TERMINAL)") + elif detection_method == 'TERM_PROGRAM': + print(f" Using terminal: {terminal_app} (inherited from current terminal)") + elif detection_method.startswith('unknown TERM_PROGRAM'): + print(f"⚠️ {detection_method}") + print(f" → Using Terminal.app as fallback") + else: + print(f" Using terminal: {terminal_app} (default)") + + try: + subprocess.Popen(['open', '-a', terminal_app, script_file]) + except Exception as e: + print(f"⚠️ Error launching {terminal_app}: {e}") + print(f"\nManually run: {script_file}") + return False + else: + print("⚠️ Auto-launch only works on macOS") + print(f"\nManually run this command in a new terminal:") + print(f" claude '{prompt_file}'") + print(f"\nThen delete the prompt file:") + print(f" rm '{prompt_file}'") + return False + + print("✅ New terminal launched with Claude Code!") + print() + print("📊 Status:") + print(f" - Prompt file: {prompt_file}") + print(f" - Skill directory: 
{self.skill_dir.absolute()}") + print(f" - SKILL.md will be saved to: {self.skill_md_path.absolute()}") + print(f" - Original backed up to: {self.skill_md_path.with_suffix('.md.backup').absolute()}") + print() + print("⏳ Wait for Claude Code to finish in the other terminal...") + print(" (Usually takes 30-60 seconds)") + print() + print("💡 When done:") + print(f" 1. Check the enhanced SKILL.md: {self.skill_md_path}") + print(f" 2. If you don't like it, restore: mv {self.skill_md_path.with_suffix('.md.backup')} {self.skill_md_path}") + print(f" 3. Package: skill-seekers package {self.skill_dir}/") + + return True + + def _run_headless(self, prompt_file, timeout): + """Run Claude enhancement in headless mode (no terminal window) + + Args: + prompt_file: Path to prompt file + timeout: Maximum seconds to wait + + Returns: + bool: True if enhancement succeeded + """ + import time + from pathlib import Path + + print("✨ Running Claude Code enhancement (headless mode)...") + print(f" Timeout: {timeout} seconds ({timeout//60} minutes)") + print() + + # Record initial state + initial_mtime = self.skill_md_path.stat().st_mtime if self.skill_md_path.exists() else 0 + initial_size = self.skill_md_path.stat().st_size if self.skill_md_path.exists() else 0 + + # Start timer + start_time = time.time() + + try: + # Run claude command directly (this WAITS for completion) + print(" Running: claude {prompt_file}") + print(" ⏳ Please wait...") + print() + + result = subprocess.run( + ['claude', prompt_file], + capture_output=True, + text=True, + timeout=timeout + ) + + elapsed = time.time() - start_time + + # Check if successful + if result.returncode == 0: + # Verify SKILL.md was actually updated + if self.skill_md_path.exists(): + new_mtime = self.skill_md_path.stat().st_mtime + new_size = self.skill_md_path.stat().st_size + + if new_mtime > initial_mtime and new_size > initial_size: + print(f"✅ Enhancement complete! 
({elapsed:.1f} seconds)") + print(f" SKILL.md updated: {new_size:,} bytes") + print() + + # Clean up prompt file + try: + os.unlink(prompt_file) + except: + pass + + return True + else: + print(f"⚠️ Claude finished but SKILL.md was not updated") + print(f" This might indicate an error during enhancement") + print() + return False + else: + print(f"❌ SKILL.md not found after enhancement") + return False + else: + print(f"❌ Claude Code returned error (exit code: {result.returncode})") + if result.stderr: + print(f" Error: {result.stderr[:200]}") + return False + + except subprocess.TimeoutExpired: + elapsed = time.time() - start_time + print(f"\n⚠️ Enhancement timed out after {elapsed:.0f} seconds") + print(f" Timeout limit: {timeout} seconds") + print() + print(" Possible reasons:") + print(" - Skill is very large (many references)") + print(" - Claude is taking longer than usual") + print(" - Network issues") + print() + print(" Try:") + print(" 1. Use terminal mode: --interactive-enhancement") + print(" 2. Reduce reference content") + print(" 3. 
Try again later") + + # Clean up + try: + os.unlink(prompt_file) + except: + pass + + return False + + except FileNotFoundError: + print("❌ 'claude' command not found") + print() + print(" Make sure Claude Code CLI is installed:") + print(" See: https://docs.claude.com/claude-code") + print() + print(" Try terminal mode instead: --interactive-enhancement") + + return False + + except Exception as e: + print(f"❌ Unexpected error: {e}") + return False + + +def main(): + import argparse + + parser = argparse.ArgumentParser( + description="Enhance a skill with Claude Code (local)", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Headless mode (default - runs in background) + skill-seekers enhance output/react/ + + # Interactive mode (opens terminal window) + skill-seekers enhance output/react/ --interactive-enhancement + + # Custom timeout + skill-seekers enhance output/react/ --timeout 1200 +""" + ) + + parser.add_argument( + 'skill_directory', + help='Path to skill directory (e.g., output/react/)' + ) + + parser.add_argument( + '--interactive-enhancement', + action='store_true', + help='Open terminal window for enhancement (default: headless mode)' + ) + + parser.add_argument( + '--timeout', + type=int, + default=600, + help='Timeout in seconds for headless mode (default: 600 = 10 minutes)' + ) + + args = parser.parse_args() + + # Run enhancement + enhancer = LocalSkillEnhancer(args.skill_directory) + headless = not args.interactive_enhancement # Invert: default is headless + success = enhancer.run(headless=headless, timeout=args.timeout) + + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/estimate_pages.py b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/estimate_pages.py new file mode 100644 index 0000000..c2a23b0 --- /dev/null +++ 
def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
    """
    Estimate total pages that will be scraped

    Performs a breadth-first link discovery (HEAD to filter non-HTML, then
    GET to harvest links) without saving any page content.

    Args:
        config: Configuration dictionary
        max_discovery: Maximum pages to discover (safety limit, use -1 for unlimited)
        timeout: Timeout for HTTP requests in seconds

    Returns:
        dict with estimation results
    """
    from collections import deque  # local import: FIFO queue with O(1) popleft

    base_url = config['base_url']
    start_urls = config.get('start_urls', [base_url])
    url_patterns = config.get('url_patterns', {'include': [], 'exclude': []})
    rate_limit = config.get('rate_limit', DEFAULT_RATE_LIMIT)

    visited = set()
    # FIX: the original kept `pending` as a plain list and tested membership
    # with `full_url not in pending` plus `pending.pop(0)` — both O(n) per
    # link, i.e. quadratic over the crawl.  A deque with a companion `queued`
    # set preserves the exact FIFO order while making both operations O(1).
    pending = deque(start_urls)
    queued = set(start_urls)
    discovered = 0

    include_patterns = url_patterns.get('include', [])
    exclude_patterns = url_patterns.get('exclude', [])

    # Handle unlimited mode
    unlimited = (max_discovery == -1 or max_discovery is None)

    print(f"🔍 Estimating pages for: {config['name']}")
    print(f"📍 Base URL: {base_url}")
    print(f"🎯 Start URLs: {len(start_urls)}")
    print(f"⏱️  Rate limit: {rate_limit}s")

    if unlimited:
        print(f"🔢 Max discovery: UNLIMITED (will discover all pages)")
        print(f"⚠️  WARNING: This may take a long time!")
    else:
        print(f"🔢 Max discovery: {max_discovery}")

    print()

    start_time = time.time()

    # Loop condition: stop if no more URLs, or if limit reached (when not unlimited)
    while pending and (unlimited or discovered < max_discovery):
        url = pending.popleft()

        # Skip if already visited
        if url in visited:
            continue

        visited.add(url)
        discovered += 1

        # Progress indicator (carriage return keeps it on one line)
        if discovered % 10 == 0:
            elapsed = time.time() - start_time
            rate = discovered / elapsed if elapsed > 0 else 0
            print(f"⏳ Discovered: {discovered} pages ({rate:.1f} pages/sec)", end='\r')

        try:
            # HEAD request first to check if page exists (faster)
            head_response = requests.head(url, timeout=timeout, allow_redirects=True)

            # Skip non-HTML content
            content_type = head_response.headers.get('Content-Type', '')
            if 'text/html' not in content_type:
                continue

            # Now GET the page to find links
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Find all links
            for link in soup.find_all('a', href=True):
                href = link['href']
                full_url = urljoin(url, href)

                # Normalize URL: drop query string and fragment so the same
                # page is not counted multiple times
                parsed = urlparse(full_url)
                full_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"

                # Check if URL is valid
                if not is_valid_url(full_url, base_url, include_patterns, exclude_patterns):
                    continue

                # Add to pending if never queued and not visited
                if full_url not in visited and full_url not in queued:
                    pending.append(full_url)
                    queued.add(full_url)

            # Rate limiting
            time.sleep(rate_limit)

        except requests.RequestException:
            # Silently skip errors during estimation (best-effort crawl)
            pass
        except Exception:
            # Silently skip other errors (e.g. malformed HTML)
            pass

    elapsed = time.time() - start_time

    # Results
    results = {
        'discovered': discovered,
        'pending': len(pending),
        'estimated_total': discovered + len(pending),
        'elapsed_seconds': round(elapsed, 2),
        'discovery_rate': round(discovered / elapsed if elapsed > 0 else 0, 2),
        'hit_limit': (not unlimited) and (discovered >= max_discovery),
        'unlimited': unlimited
    }

    return results
def is_valid_url(url, base_url, include_patterns, exclude_patterns):
    """Check if URL should be crawled.

    A URL qualifies when it lives under base_url, matches no exclude
    pattern, and — if include patterns are given — matches at least one
    include pattern. Pattern matching is plain substring containment.
    """
    # Must be same domain.
    # FIX: the original used a bare prefix test (url.startswith(base)),
    # which wrongly accepted look-alike hosts such as
    # "https://example.com.evil.org" for base "https://example.com" and
    # sibling paths such as ".../docs2" for base ".../docs".  Require an
    # exact match or a '/' boundary right after the base.
    base = base_url.rstrip('/')
    if url != base and not url.startswith(base + '/'):
        return False

    # Check exclude patterns first
    if exclude_patterns:
        for pattern in exclude_patterns:
            if pattern in url:
                return False

    # Check include patterns (if specified)
    if include_patterns:
        for pattern in include_patterns:
            if pattern in url:
                return True
        return False

    # If no include patterns, accept by default
    return True


def print_results(results, config):
    """Print estimation results and recommendations to stdout.

    Args:
        results: dict produced by estimate_pages()
        config:  the scrape configuration the estimate was run against
    """
    print()
    print("=" * 70)
    print("📊 ESTIMATION RESULTS")
    print("=" * 70)
    print()
    print(f"Config: {config['name']}")
    print(f"Base URL: {config['base_url']}")
    print()
    print(f"✅ Pages Discovered: {results['discovered']}")
    print(f"⏳ Pages Pending: {results['pending']}")
    print(f"📈 Estimated Total: {results['estimated_total']}")
    print()
    print(f"⏱️  Time Elapsed: {results['elapsed_seconds']}s")
    print(f"⚡ Discovery Rate: {results['discovery_rate']} pages/sec")

    if results.get('unlimited', False):
        print()
        print("✅ UNLIMITED MODE - Discovered all reachable pages")
        print(f"   Total pages: {results['estimated_total']}")
    elif results['hit_limit']:
        print()
        print("⚠️  Hit discovery limit - actual total may be higher")
        print("   Increase max_discovery parameter for more accurate estimate")

    print()
    print("=" * 70)
    print("💡 RECOMMENDATIONS")
    print("=" * 70)
    print()

    estimated = results['estimated_total']
    current_max = config.get('max_pages', 100)

    if estimated <= current_max:
        print(f"✅ Current max_pages ({current_max}) is sufficient")
    else:
        recommended = min(estimated + 50, DISCOVERY_THRESHOLD)  # Add 50 buffer, cap at threshold
        print(f"⚠️  Current max_pages ({current_max}) may be too low")
        print(f"📝 Recommended max_pages: {recommended}")
        print(f"   (Estimated {estimated} + 50 buffer)")

    # Estimate time for full scrape (one request per page, rate_limit apart)
    rate_limit = config.get('rate_limit', DEFAULT_RATE_LIMIT)
    estimated_time = (estimated * rate_limit) / 60  # in minutes

    print()
    print(f"⏱️  Estimated full scrape time: {estimated_time:.1f} minutes")
    print(f"   (Based on rate_limit: {rate_limit}s)")

    print()


def load_config(config_path):
    """Load configuration from a JSON file; exit(1) with a message on failure."""
    try:
        with open(config_path, 'r') as f:
            config = json.load(f)
        return config
    except FileNotFoundError:
        print(f"❌ Error: Config file not found: {config_path}")
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"❌ Error: Invalid JSON in config file: {e}")
        sys.exit(1)


def main():
    """Main entry point.

    Exit codes: 0 success, 1 error/interrupt, 2 estimation hit the
    discovery limit (result is a lower bound).
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Estimate page count for Skill Seeker configs',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Estimate pages for a config
  skill-seekers estimate configs/react.json

  # Estimate with higher discovery limit
  skill-seekers estimate configs/godot.json --max-discovery 2000

  # Quick estimate (stop at 100 pages)
  skill-seekers estimate configs/vue.json --max-discovery 100
        """
    )

    parser.add_argument('config', help='Path to config JSON file')
    parser.add_argument('--max-discovery', '-m', type=int, default=DEFAULT_MAX_DISCOVERY,
                        help=f'Maximum pages to discover (default: {DEFAULT_MAX_DISCOVERY}, use -1 for unlimited)')
    parser.add_argument('--unlimited', '-u', action='store_true',
                        help='Remove discovery limit - discover all pages (same as --max-discovery -1)')
    parser.add_argument('--timeout', '-t', type=int, default=30,
                        help='HTTP request timeout in seconds (default: 30)')

    args = parser.parse_args()

    # Handle unlimited flag (--unlimited overrides --max-discovery)
    max_discovery = -1 if args.unlimited else args.max_discovery

    # Load config
    config = load_config(args.config)

    # Run estimation
    try:
        results = estimate_pages(config, max_discovery, args.timeout)
        print_results(results, config)

        # Return exit code based on results
        if results['hit_limit']:
            return 2  # Warning: hit limit
        return 0  # Success

    except KeyboardInterrupt:
        print("\n\n⚠️  Estimation interrupted by user")
        return 1
    except Exception as e:
        print(f"\n\n❌ Error during estimation: {e}")
        return 1


if __name__ == '__main__':
    sys.exit(main())
class RouterGenerator:
    """Generates router skills that direct to specialized sub-skills.

    Given several sub-skill config files (e.g. godot-2d.json, godot-3d.json),
    builds a hub config plus a SKILL.md that routes user questions to the
    right specialized skill by keyword.
    """

    def __init__(self, config_paths: List[str], router_name: str = None):
        # Load every sub-skill config up front; load_config exits on failure,
        # so after __init__ all configs are known-valid JSON.
        self.config_paths = [Path(p) for p in config_paths]
        self.configs = [self.load_config(p) for p in self.config_paths]
        self.router_name = router_name or self.infer_router_name()
        self.base_config = self.configs[0]  # Use first as template

    def load_config(self, path: Path) -> Dict[str, Any]:
        """Load a config file; prints an error and exits(1) on any failure."""
        try:
            with open(path, 'r') as f:
                return json.load(f)
        except Exception as e:
            print(f"❌ Error loading {path}: {e}")
            sys.exit(1)

    def infer_router_name(self) -> str:
        """Infer router name from sub-skill names.

        Uses the segment of the first config's name before the first dash
        (e.g. "godot-2d" -> "godot"); falls back to "router" when there are
        no configs, or to the full name when it has no dash.
        """
        # Find common prefix
        names = [cfg['name'] for cfg in self.configs]
        if not names:
            return "router"

        # Get common prefix before first dash
        first_name = names[0]
        if '-' in first_name:
            return first_name.split('-')[0]
        return first_name

    def extract_routing_keywords(self) -> Dict[str, List[str]]:
        """Extract keywords for routing to each skill.

        Returns:
            Mapping of sub-skill name -> list of keywords, drawn from the
            config's 'categories' keys plus the name suffix after the dash.
        """
        routing = {}

        for config in self.configs:
            name = config['name']
            keywords = []

            # Extract from categories
            if 'categories' in config:
                keywords.extend(config['categories'].keys())

            # Extract from name (part after dash)
            if '-' in name:
                skill_topic = name.split('-', 1)[1]
                keywords.append(skill_topic)

            routing[name] = keywords

        return routing

    def generate_skill_md(self) -> str:
        """Generate router SKILL.md content (markdown) describing the
        sub-skills, the keyword routing table, and usage examples."""
        routing_keywords = self.extract_routing_keywords()

        skill_md = f"""# {self.router_name.replace('-', ' ').title()} Documentation (Router)

## When to Use This Skill

{self.base_config.get('description', f'Use for {self.router_name} development and programming.')}

This is a router skill that directs your questions to specialized sub-skills for efficient, focused assistance.

## How It Works

This skill analyzes your question and activates the appropriate specialized skill(s):

"""

        # List sub-skills
        for config in self.configs:
            name = config['name']
            desc = config.get('description', '')
            # Remove router name prefix from description if present
            if desc.startswith(f"{self.router_name.title()} -"):
                desc = desc.split(' - ', 1)[1]

            skill_md += f"### {name}\n{desc}\n\n"

        # Routing logic
        skill_md += """## Routing Logic

The router analyzes your question for topic keywords and activates relevant skills:

**Keywords → Skills:**
"""

        for skill_name, keywords in routing_keywords.items():
            keyword_str = ", ".join(keywords)
            skill_md += f"- {keyword_str} → **{skill_name}**\n"

        # Quick reference
        # NOTE(review): the example questions below are Godot-flavored even
        # for non-Godot routers; only the skill names are templated — confirm
        # this is intentional.
        skill_md += f"""

## Quick Reference

For quick answers, this router provides basic overview information. For detailed documentation, the specialized skills contain comprehensive references.

### Getting Started

1. Ask your question naturally - mention the topic area
2. The router will activate the appropriate skill(s)
3. You'll receive focused, detailed answers from specialized documentation

### Examples

**Question:** "How do I create a 2D sprite?"
**Activates:** {self.router_name}-2d skill

**Question:** "GDScript function syntax"
**Activates:** {self.router_name}-scripting skill

**Question:** "Physics collision handling in 3D"
**Activates:** {self.router_name}-3d + {self.router_name}-physics skills

### All Available Skills

"""

        # List all skills
        for config in self.configs:
            skill_md += f"- **{config['name']}**\n"

        skill_md += f"""

## Need Help?

Simply ask your question and mention the topic. The router will find the right specialized skill for you!

---

*This is a router skill. For complete documentation, see the specialized skills listed above.*
"""

        return skill_md

    def create_router_config(self) -> Dict[str, Any]:
        """Create router configuration.

        The router reuses the first sub-skill's scraping settings; keys
        prefixed with '_' are router metadata, not scraper options.
        """
        routing_keywords = self.extract_routing_keywords()

        router_config = {
            "name": self.router_name,
            "description": self.base_config.get('description', f'{self.router_name.title()} documentation router'),
            "base_url": self.base_config['base_url'],
            "selectors": self.base_config.get('selectors', {}),
            "url_patterns": self.base_config.get('url_patterns', {}),
            "rate_limit": self.base_config.get('rate_limit', 0.5),
            "max_pages": 500,  # Router only scrapes overview pages
            "_router": True,
            "_sub_skills": [cfg['name'] for cfg in self.configs],
            "_routing_keywords": routing_keywords
        }

        return router_config

    def generate(self, output_dir: Path = None) -> Tuple[Path, Path]:
        """Generate router skill and config.

        Args:
            output_dir: Where to write the router config JSON; defaults to
                the directory of the first input config.

        Returns:
            (config_path, skill_path) of the files written.
        """
        if output_dir is None:
            output_dir = self.config_paths[0].parent

        output_dir = Path(output_dir)

        # Generate SKILL.md
        skill_md = self.generate_skill_md()
        # NOTE(review): SKILL.md lands in <output_dir's parent>/output/<name>/ —
        # assumes configs live one level below the project root; verify when
        # --output-dir points elsewhere.
        skill_path = output_dir.parent / f"output/{self.router_name}/SKILL.md"
        skill_path.parent.mkdir(parents=True, exist_ok=True)

        with open(skill_path, 'w') as f:
            f.write(skill_md)

        # Generate config
        router_config = self.create_router_config()
        config_path = output_dir / f"{self.router_name}.json"

        with open(config_path, 'w') as f:
            json.dump(router_config, f, indent=2)

        return config_path, skill_path
print("") + print(f"{'='*60}") + print("NEXT STEPS") + print(f"{'='*60}") + print(f"1. Review router SKILL.md: {skill_path}") + print(f"2. Optionally scrape router (for overview pages):") + print(f" skill-seekers scrape --config {config_path}") + print("3. Package router skill:") + print(f" skill-seekers package output/{generator.router_name}/") + print("4. Upload router + all sub-skills to Claude") + print("") + + +if __name__ == "__main__": + main() diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/github_scraper.py b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/github_scraper.py new file mode 100644 index 0000000..861f6c6 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/github_scraper.py @@ -0,0 +1,900 @@ +#!/usr/bin/env python3 +""" +GitHub Repository to Claude Skill Converter (Tasks C1.1-C1.12) + +Converts GitHub repositories into Claude AI skills by extracting: +- README and documentation +- Code structure and signatures +- GitHub Issues, Changelog, and Releases +- Usage examples from tests + +Usage: + skill-seekers github --repo facebook/react + skill-seekers github --config configs/react_github.json + skill-seekers github --repo owner/repo --token $GITHUB_TOKEN +""" + +import os +import sys +import json +import re +import argparse +import logging +from pathlib import Path +from typing import Dict, List, Optional, Any +from datetime import datetime + +try: + from github import Github, GithubException, Repository + from github.GithubException import RateLimitExceededException +except ImportError: + print("Error: PyGithub not installed. 
Run: pip install PyGithub") + sys.exit(1) + +# Configure logging FIRST (before using logger) +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + +# Import code analyzer for deep code analysis +try: + from .code_analyzer import CodeAnalyzer + CODE_ANALYZER_AVAILABLE = True +except ImportError: + CODE_ANALYZER_AVAILABLE = False + logger.warning("Code analyzer not available - deep analysis disabled") + +# Directories to exclude from local repository analysis +EXCLUDED_DIRS = { + 'venv', 'env', '.venv', '.env', # Virtual environments + 'node_modules', '__pycache__', '.pytest_cache', # Dependencies and caches + '.git', '.svn', '.hg', # Version control + 'build', 'dist', '*.egg-info', # Build artifacts + 'htmlcov', '.coverage', # Coverage reports + '.tox', '.nox', # Testing environments + '.mypy_cache', '.ruff_cache', # Linter caches +} + + +class GitHubScraper: + """ + GitHub Repository Scraper (C1.1-C1.9) + + Extracts repository information for skill generation: + - Repository structure + - README files + - Code comments and docstrings + - Programming language detection + - Function/class signatures + - Test examples + - GitHub Issues + - CHANGELOG + - Releases + """ + + def __init__(self, config: Dict[str, Any], local_repo_path: Optional[str] = None): + """Initialize GitHub scraper with configuration.""" + self.config = config + self.repo_name = config['repo'] + self.name = config.get('name', self.repo_name.split('/')[-1]) + self.description = config.get('description', f'Skill for {self.repo_name}') + + # Local repository path (optional - enables unlimited analysis) + self.local_repo_path = local_repo_path or config.get('local_repo_path') + if self.local_repo_path: + self.local_repo_path = os.path.expanduser(self.local_repo_path) + logger.info(f"Local repository mode enabled: {self.local_repo_path}") + + # Configure directory exclusions (smart defaults + optional customization) + 
self.excluded_dirs = set(EXCLUDED_DIRS) # Start with smart defaults + + # Option 1: Replace mode - Use only specified exclusions + if 'exclude_dirs' in config: + self.excluded_dirs = set(config['exclude_dirs']) + logger.warning( + f"Using custom directory exclusions ({len(self.excluded_dirs)} dirs) - " + "defaults overridden" + ) + logger.debug(f"Custom exclusions: {sorted(self.excluded_dirs)}") + + # Option 2: Extend mode - Add to default exclusions + elif 'exclude_dirs_additional' in config: + additional = set(config['exclude_dirs_additional']) + self.excluded_dirs = self.excluded_dirs.union(additional) + logger.info( + f"Added {len(additional)} custom directory exclusions " + f"(total: {len(self.excluded_dirs)})" + ) + logger.debug(f"Additional exclusions: {sorted(additional)}") + + # GitHub client setup (C1.1) + token = self._get_token() + self.github = Github(token) if token else Github() + self.repo: Optional[Repository.Repository] = None + + # Options + self.include_issues = config.get('include_issues', True) + self.max_issues = config.get('max_issues', 100) + self.include_changelog = config.get('include_changelog', True) + self.include_releases = config.get('include_releases', True) + self.include_code = config.get('include_code', False) + self.code_analysis_depth = config.get('code_analysis_depth', 'surface') # 'surface', 'deep', 'full' + self.file_patterns = config.get('file_patterns', []) + + # Initialize code analyzer if deep analysis requested + self.code_analyzer = None + if self.code_analysis_depth != 'surface' and CODE_ANALYZER_AVAILABLE: + self.code_analyzer = CodeAnalyzer(depth=self.code_analysis_depth) + logger.info(f"Code analysis depth: {self.code_analysis_depth}") + + # Output paths + self.skill_dir = f"output/{self.name}" + self.data_file = f"output/{self.name}_github_data.json" + + # Extracted data storage + self.extracted_data = { + 'repo_info': {}, + 'readme': '', + 'file_tree': [], + 'languages': {}, + 'signatures': [], + 'test_examples': 
[], + 'issues': [], + 'changelog': '', + 'releases': [] + } + + def _get_token(self) -> Optional[str]: + """ + Get GitHub token from env var or config (both options supported). + Priority: GITHUB_TOKEN env var > config file > None + """ + # Try environment variable first (recommended) + token = os.getenv('GITHUB_TOKEN') + if token: + logger.info("Using GitHub token from GITHUB_TOKEN environment variable") + return token + + # Fall back to config file + token = self.config.get('github_token') + if token: + logger.warning("Using GitHub token from config file (less secure)") + return token + + logger.warning("No GitHub token provided - using unauthenticated access (lower rate limits)") + return None + + def scrape(self) -> Dict[str, Any]: + """ + Main scraping entry point. + Executes all C1 tasks in sequence. + """ + try: + logger.info(f"Starting GitHub scrape for: {self.repo_name}") + + # C1.1: Fetch repository + self._fetch_repository() + + # C1.2: Extract README + self._extract_readme() + + # C1.3-C1.6: Extract code structure + self._extract_code_structure() + + # C1.7: Extract Issues + if self.include_issues: + self._extract_issues() + + # C1.8: Extract CHANGELOG + if self.include_changelog: + self._extract_changelog() + + # C1.9: Extract Releases + if self.include_releases: + self._extract_releases() + + # Save extracted data + self._save_data() + + logger.info(f"✅ Scraping complete! Data saved to: {self.data_file}") + return self.extracted_data + + except RateLimitExceededException: + logger.error("GitHub API rate limit exceeded. 
Please wait or use authentication token.") + raise + except GithubException as e: + logger.error(f"GitHub API error: {e}") + raise + except Exception as e: + logger.error(f"Unexpected error during scraping: {e}") + raise + + def _fetch_repository(self): + """C1.1: Fetch repository structure using GitHub API.""" + logger.info(f"Fetching repository: {self.repo_name}") + + try: + self.repo = self.github.get_repo(self.repo_name) + + # Extract basic repo info + self.extracted_data['repo_info'] = { + 'name': self.repo.name, + 'full_name': self.repo.full_name, + 'description': self.repo.description, + 'url': self.repo.html_url, + 'homepage': self.repo.homepage, + 'stars': self.repo.stargazers_count, + 'forks': self.repo.forks_count, + 'open_issues': self.repo.open_issues_count, + 'default_branch': self.repo.default_branch, + 'created_at': self.repo.created_at.isoformat() if self.repo.created_at else None, + 'updated_at': self.repo.updated_at.isoformat() if self.repo.updated_at else None, + 'language': self.repo.language, + 'license': self.repo.license.name if self.repo.license else None, + 'topics': self.repo.get_topics() + } + + logger.info(f"Repository fetched: {self.repo.full_name} ({self.repo.stargazers_count} stars)") + + except GithubException as e: + if e.status == 404: + raise ValueError(f"Repository not found: {self.repo_name}") + raise + + def _extract_readme(self): + """C1.2: Extract README.md files.""" + logger.info("Extracting README...") + + # Try common README locations + readme_files = ['README.md', 'README.rst', 'README.txt', 'README', + 'docs/README.md', '.github/README.md'] + + for readme_path in readme_files: + try: + content = self.repo.get_contents(readme_path) + if content: + self.extracted_data['readme'] = content.decoded_content.decode('utf-8') + logger.info(f"README found: {readme_path}") + return + except GithubException: + continue + + logger.warning("No README found in repository") + + def _extract_code_structure(self): + """ + C1.3-C1.6: 
Extract code structure, languages, signatures, and test examples. + Surface layer only - no full implementation code. + """ + logger.info("Extracting code structure...") + + # C1.4: Get language breakdown + self._extract_languages() + + # Get file tree + self._extract_file_tree() + + # Extract signatures and test examples + if self.include_code: + self._extract_signatures_and_tests() + + def _extract_languages(self): + """C1.4: Detect programming languages in repository.""" + logger.info("Detecting programming languages...") + + try: + languages = self.repo.get_languages() + total_bytes = sum(languages.values()) + + self.extracted_data['languages'] = { + lang: { + 'bytes': bytes_count, + 'percentage': round((bytes_count / total_bytes) * 100, 2) if total_bytes > 0 else 0 + } + for lang, bytes_count in languages.items() + } + + logger.info(f"Languages detected: {', '.join(languages.keys())}") + + except GithubException as e: + logger.warning(f"Could not fetch languages: {e}") + + def should_exclude_dir(self, dir_name: str) -> bool: + """Check if directory should be excluded from analysis.""" + return dir_name in self.excluded_dirs or dir_name.startswith('.') + + def _extract_file_tree(self): + """Extract repository file tree structure (dual-mode: GitHub API or local filesystem).""" + logger.info("Building file tree...") + + if self.local_repo_path: + # Local filesystem mode - unlimited files + self._extract_file_tree_local() + else: + # GitHub API mode - limited by API rate limits + self._extract_file_tree_github() + + def _extract_file_tree_local(self): + """Extract file tree from local filesystem (unlimited files).""" + if not os.path.exists(self.local_repo_path): + logger.error(f"Local repository path not found: {self.local_repo_path}") + return + + file_tree = [] + for root, dirs, files in os.walk(self.local_repo_path): + # Exclude directories in-place to prevent os.walk from descending into them + dirs[:] = [d for d in dirs if not self.should_exclude_dir(d)] + + 
# Calculate relative path from repo root + rel_root = os.path.relpath(root, self.local_repo_path) + if rel_root == '.': + rel_root = '' + + # Add directories + for dir_name in dirs: + dir_path = os.path.join(rel_root, dir_name) if rel_root else dir_name + file_tree.append({ + 'path': dir_path, + 'type': 'dir', + 'size': None + }) + + # Add files + for file_name in files: + file_path = os.path.join(rel_root, file_name) if rel_root else file_name + full_path = os.path.join(root, file_name) + try: + file_size = os.path.getsize(full_path) + except OSError: + file_size = None + + file_tree.append({ + 'path': file_path, + 'type': 'file', + 'size': file_size + }) + + self.extracted_data['file_tree'] = file_tree + logger.info(f"File tree built (local mode): {len(file_tree)} items") + + def _extract_file_tree_github(self): + """Extract file tree from GitHub API (rate-limited).""" + try: + contents = self.repo.get_contents("") + file_tree = [] + + while contents: + file_content = contents.pop(0) + + file_info = { + 'path': file_content.path, + 'type': file_content.type, + 'size': file_content.size if file_content.type == 'file' else None + } + file_tree.append(file_info) + + if file_content.type == "dir": + contents.extend(self.repo.get_contents(file_content.path)) + + self.extracted_data['file_tree'] = file_tree + logger.info(f"File tree built (GitHub API mode): {len(file_tree)} items") + + except GithubException as e: + logger.warning(f"Could not build file tree: {e}") + + def _extract_signatures_and_tests(self): + """ + C1.3, C1.5, C1.6: Extract signatures, docstrings, and test examples. 
+ + Extraction depth depends on code_analysis_depth setting: + - surface: File tree only (minimal) + - deep: Parse files for signatures, parameters, types + - full: Complete AST analysis (future enhancement) + """ + if self.code_analysis_depth == 'surface': + logger.info("Code extraction: Surface level (file tree only)") + return + + if not self.code_analyzer: + logger.warning("Code analyzer not available - skipping deep analysis") + return + + logger.info(f"Extracting code signatures ({self.code_analysis_depth} analysis)...") + + # Get primary language for the repository + languages = self.extracted_data.get('languages', {}) + if not languages: + logger.warning("No languages detected - skipping code analysis") + return + + # Determine primary language + primary_language = max(languages.items(), key=lambda x: x[1]['bytes'])[0] + logger.info(f"Primary language: {primary_language}") + + # Determine file extensions to analyze + extension_map = { + 'Python': ['.py'], + 'JavaScript': ['.js', '.jsx'], + 'TypeScript': ['.ts', '.tsx'], + 'C': ['.c', '.h'], + 'C++': ['.cpp', '.hpp', '.cc', '.hh', '.cxx'] + } + + extensions = extension_map.get(primary_language, []) + if not extensions: + logger.warning(f"No file extensions mapped for {primary_language}") + return + + # Analyze files matching patterns and extensions + analyzed_files = [] + file_tree = self.extracted_data.get('file_tree', []) + + for file_info in file_tree: + file_path = file_info['path'] + + # Check if file matches extension + if not any(file_path.endswith(ext) for ext in extensions): + continue + + # Check if file matches patterns (if specified) + if self.file_patterns: + import fnmatch + if not any(fnmatch.fnmatch(file_path, pattern) for pattern in self.file_patterns): + continue + + # Analyze this file + try: + # Read file content based on mode + if self.local_repo_path: + # Local mode - read from filesystem + full_path = os.path.join(self.local_repo_path, file_path) + with open(full_path, 'r', 
encoding='utf-8') as f: + content = f.read() + else: + # GitHub API mode - fetch from API + file_content = self.repo.get_contents(file_path) + content = file_content.decoded_content.decode('utf-8') + + analysis_result = self.code_analyzer.analyze_file( + file_path, + content, + primary_language + ) + + if analysis_result and (analysis_result.get('classes') or analysis_result.get('functions')): + analyzed_files.append({ + 'file': file_path, + 'language': primary_language, + **analysis_result + }) + + logger.debug(f"Analyzed {file_path}: " + f"{len(analysis_result.get('classes', []))} classes, " + f"{len(analysis_result.get('functions', []))} functions") + + except Exception as e: + logger.debug(f"Could not analyze {file_path}: {e}") + continue + + # Limit number of files analyzed to avoid rate limits (GitHub API mode only) + if not self.local_repo_path and len(analyzed_files) >= 50: + logger.info(f"Reached analysis limit (50 files, GitHub API mode)") + break + + self.extracted_data['code_analysis'] = { + 'depth': self.code_analysis_depth, + 'language': primary_language, + 'files_analyzed': len(analyzed_files), + 'files': analyzed_files + } + + # Calculate totals + total_classes = sum(len(f.get('classes', [])) for f in analyzed_files) + total_functions = sum(len(f.get('functions', [])) for f in analyzed_files) + + logger.info(f"Code analysis complete: {len(analyzed_files)} files, " + f"{total_classes} classes, {total_functions} functions") + + def _extract_issues(self): + """C1.7: Extract GitHub Issues (open/closed, labels, milestones).""" + logger.info(f"Extracting GitHub Issues (max {self.max_issues})...") + + try: + # Fetch recent issues (open + closed) + issues = self.repo.get_issues(state='all', sort='updated', direction='desc') + + issue_list = [] + for issue in issues[:self.max_issues]: + # Skip pull requests (they appear in issues) + if issue.pull_request: + continue + + issue_data = { + 'number': issue.number, + 'title': issue.title, + 'state': issue.state, 
+ 'labels': [label.name for label in issue.labels], + 'milestone': issue.milestone.title if issue.milestone else None, + 'created_at': issue.created_at.isoformat() if issue.created_at else None, + 'updated_at': issue.updated_at.isoformat() if issue.updated_at else None, + 'closed_at': issue.closed_at.isoformat() if issue.closed_at else None, + 'url': issue.html_url, + 'body': issue.body[:500] if issue.body else None # First 500 chars + } + issue_list.append(issue_data) + + self.extracted_data['issues'] = issue_list + logger.info(f"Extracted {len(issue_list)} issues") + + except GithubException as e: + logger.warning(f"Could not fetch issues: {e}") + + def _extract_changelog(self): + """C1.8: Extract CHANGELOG.md and release notes.""" + logger.info("Extracting CHANGELOG...") + + # Try common changelog locations + changelog_files = ['CHANGELOG.md', 'CHANGES.md', 'HISTORY.md', + 'CHANGELOG.rst', 'CHANGELOG.txt', 'CHANGELOG', + 'docs/CHANGELOG.md', '.github/CHANGELOG.md'] + + for changelog_path in changelog_files: + try: + content = self.repo.get_contents(changelog_path) + if content: + self.extracted_data['changelog'] = content.decoded_content.decode('utf-8') + logger.info(f"CHANGELOG found: {changelog_path}") + return + except GithubException: + continue + + logger.warning("No CHANGELOG found in repository") + + def _extract_releases(self): + """C1.9: Extract GitHub Releases with version history.""" + logger.info("Extracting GitHub Releases...") + + try: + releases = self.repo.get_releases() + + release_list = [] + for release in releases: + release_data = { + 'tag_name': release.tag_name, + 'name': release.title, + 'body': release.body, + 'draft': release.draft, + 'prerelease': release.prerelease, + 'created_at': release.created_at.isoformat() if release.created_at else None, + 'published_at': release.published_at.isoformat() if release.published_at else None, + 'url': release.html_url, + 'tarball_url': release.tarball_url, + 'zipball_url': release.zipball_url + } + 
release_list.append(release_data) + + self.extracted_data['releases'] = release_list + logger.info(f"Extracted {len(release_list)} releases") + + except GithubException as e: + logger.warning(f"Could not fetch releases: {e}") + + def _save_data(self): + """Save extracted data to JSON file.""" + os.makedirs('output', exist_ok=True) + + with open(self.data_file, 'w', encoding='utf-8') as f: + json.dump(self.extracted_data, f, indent=2, ensure_ascii=False) + + logger.info(f"Data saved to: {self.data_file}") + + +class GitHubToSkillConverter: + """ + Convert extracted GitHub data to Claude skill format (C1.10). + """ + + def __init__(self, config: Dict[str, Any]): + """Initialize converter with configuration.""" + self.config = config + self.name = config.get('name', config['repo'].split('/')[-1]) + self.description = config.get('description', f'Skill for {config["repo"]}') + + # Paths + self.data_file = f"output/{self.name}_github_data.json" + self.skill_dir = f"output/{self.name}" + + # Load extracted data + self.data = self._load_data() + + def _load_data(self) -> Dict[str, Any]: + """Load extracted GitHub data from JSON.""" + if not os.path.exists(self.data_file): + raise FileNotFoundError(f"Data file not found: {self.data_file}") + + with open(self.data_file, 'r', encoding='utf-8') as f: + return json.load(f) + + def build_skill(self): + """Build complete skill structure.""" + logger.info(f"Building skill for: {self.name}") + + # Create directories + os.makedirs(self.skill_dir, exist_ok=True) + os.makedirs(f"{self.skill_dir}/references", exist_ok=True) + os.makedirs(f"{self.skill_dir}/scripts", exist_ok=True) + os.makedirs(f"{self.skill_dir}/assets", exist_ok=True) + + # Generate SKILL.md + self._generate_skill_md() + + # Generate reference files + self._generate_references() + + logger.info(f"✅ Skill built successfully: {self.skill_dir}/") + + def _generate_skill_md(self): + """Generate main SKILL.md file.""" + repo_info = self.data.get('repo_info', {}) + + # 
Generate skill name (lowercase, hyphens only, max 64 chars) + skill_name = self.name.lower().replace('_', '-').replace(' ', '-')[:64] + + # Truncate description to 1024 chars if needed + desc = self.description[:1024] if len(self.description) > 1024 else self.description + + skill_content = f"""--- +name: {skill_name} +description: {desc} +--- + +# {repo_info.get('name', self.name)} + +{self.description} + +## Description + +{repo_info.get('description', 'GitHub repository skill')} + +**Repository:** [{repo_info.get('full_name', 'N/A')}]({repo_info.get('url', '#')}) +**Language:** {repo_info.get('language', 'N/A')} +**Stars:** {repo_info.get('stars', 0):,} +**License:** {repo_info.get('license', 'N/A')} + +## When to Use This Skill + +Use this skill when you need to: +- Understand how to use {self.name} +- Look up API documentation +- Find usage examples +- Check for known issues or recent changes +- Review release history + +## Quick Reference + +### Repository Info +- **Homepage:** {repo_info.get('homepage', 'N/A')} +- **Topics:** {', '.join(repo_info.get('topics', []))} +- **Open Issues:** {repo_info.get('open_issues', 0)} +- **Last Updated:** {repo_info.get('updated_at', 'N/A')[:10]} + +### Languages +{self._format_languages()} + +### Recent Releases +{self._format_recent_releases()} + +## Available References + +- `references/README.md` - Complete README documentation +- `references/CHANGELOG.md` - Version history and changes +- `references/issues.md` - Recent GitHub issues +- `references/releases.md` - Release notes +- `references/file_structure.md` - Repository structure + +## Usage + +See README.md for complete usage instructions and examples. 
+ +--- + +**Generated by Skill Seeker** | GitHub Repository Scraper +""" + + skill_path = f"{self.skill_dir}/SKILL.md" + with open(skill_path, 'w', encoding='utf-8') as f: + f.write(skill_content) + + logger.info(f"Generated: {skill_path}") + + def _format_languages(self) -> str: + """Format language breakdown.""" + languages = self.data.get('languages', {}) + if not languages: + return "No language data available" + + lines = [] + for lang, info in sorted(languages.items(), key=lambda x: x[1]['bytes'], reverse=True): + lines.append(f"- **{lang}:** {info['percentage']:.1f}%") + + return '\n'.join(lines) + + def _format_recent_releases(self) -> str: + """Format recent releases (top 3).""" + releases = self.data.get('releases', []) + if not releases: + return "No releases available" + + lines = [] + for release in releases[:3]: + lines.append(f"- **{release['tag_name']}** ({release['published_at'][:10]}): {release['name']}") + + return '\n'.join(lines) + + def _generate_references(self): + """Generate all reference files.""" + # README + if self.data.get('readme'): + readme_path = f"{self.skill_dir}/references/README.md" + with open(readme_path, 'w', encoding='utf-8') as f: + f.write(self.data['readme']) + logger.info(f"Generated: {readme_path}") + + # CHANGELOG + if self.data.get('changelog'): + changelog_path = f"{self.skill_dir}/references/CHANGELOG.md" + with open(changelog_path, 'w', encoding='utf-8') as f: + f.write(self.data['changelog']) + logger.info(f"Generated: {changelog_path}") + + # Issues + if self.data.get('issues'): + self._generate_issues_reference() + + # Releases + if self.data.get('releases'): + self._generate_releases_reference() + + # File structure + if self.data.get('file_tree'): + self._generate_file_structure_reference() + + def _generate_issues_reference(self): + """Generate issues.md reference file.""" + issues = self.data['issues'] + + content = f"# GitHub Issues\n\nRecent issues from the repository ({len(issues)} total).\n\n" + + # 
Group by state + open_issues = [i for i in issues if i['state'] == 'open'] + closed_issues = [i for i in issues if i['state'] == 'closed'] + + content += f"## Open Issues ({len(open_issues)})\n\n" + for issue in open_issues[:20]: + labels = ', '.join(issue['labels']) if issue['labels'] else 'No labels' + content += f"### #{issue['number']}: {issue['title']}\n" + content += f"**Labels:** {labels} | **Created:** {issue['created_at'][:10]}\n" + content += f"[View on GitHub]({issue['url']})\n\n" + + content += f"\n## Recently Closed Issues ({len(closed_issues)})\n\n" + for issue in closed_issues[:10]: + labels = ', '.join(issue['labels']) if issue['labels'] else 'No labels' + content += f"### #{issue['number']}: {issue['title']}\n" + content += f"**Labels:** {labels} | **Closed:** {issue['closed_at'][:10]}\n" + content += f"[View on GitHub]({issue['url']})\n\n" + + issues_path = f"{self.skill_dir}/references/issues.md" + with open(issues_path, 'w', encoding='utf-8') as f: + f.write(content) + logger.info(f"Generated: {issues_path}") + + def _generate_releases_reference(self): + """Generate releases.md reference file.""" + releases = self.data['releases'] + + content = f"# Releases\n\nVersion history for this repository ({len(releases)} releases).\n\n" + + for release in releases: + content += f"## {release['tag_name']}: {release['name']}\n" + content += f"**Published:** {release['published_at'][:10]}\n" + if release['prerelease']: + content += f"**Pre-release**\n" + content += f"\n{release['body']}\n\n" + content += f"[View on GitHub]({release['url']})\n\n---\n\n" + + releases_path = f"{self.skill_dir}/references/releases.md" + with open(releases_path, 'w', encoding='utf-8') as f: + f.write(content) + logger.info(f"Generated: {releases_path}") + + def _generate_file_structure_reference(self): + """Generate file_structure.md reference file.""" + file_tree = self.data['file_tree'] + + content = f"# Repository File Structure\n\n" + content += f"Total items: 
{len(file_tree)}\n\n" + content += "```\n" + + # Build tree structure + for item in file_tree: + indent = " " * item['path'].count('/') + icon = "📁" if item['type'] == 'dir' else "📄" + content += f"{indent}{icon} {os.path.basename(item['path'])}\n" + + content += "```\n" + + structure_path = f"{self.skill_dir}/references/file_structure.md" + with open(structure_path, 'w', encoding='utf-8') as f: + f.write(content) + logger.info(f"Generated: {structure_path}") + + +def main(): + """C1.10: CLI tool entry point.""" + parser = argparse.ArgumentParser( + description='GitHub Repository to Claude Skill Converter', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + skill-seekers github --repo facebook/react + skill-seekers github --config configs/react_github.json + skill-seekers github --repo owner/repo --token $GITHUB_TOKEN + """ + ) + + parser.add_argument('--repo', help='GitHub repository (owner/repo)') + parser.add_argument('--config', help='Path to config JSON file') + parser.add_argument('--token', help='GitHub personal access token') + parser.add_argument('--name', help='Skill name (default: repo name)') + parser.add_argument('--description', help='Skill description') + parser.add_argument('--no-issues', action='store_true', help='Skip GitHub issues') + parser.add_argument('--no-changelog', action='store_true', help='Skip CHANGELOG') + parser.add_argument('--no-releases', action='store_true', help='Skip releases') + parser.add_argument('--max-issues', type=int, default=100, help='Max issues to fetch') + parser.add_argument('--scrape-only', action='store_true', help='Only scrape, don\'t build skill') + + args = parser.parse_args() + + # Build config from args or file + if args.config: + with open(args.config, 'r') as f: + config = json.load(f) + elif args.repo: + config = { + 'repo': args.repo, + 'name': args.name or args.repo.split('/')[-1], + 'description': args.description or f'GitHub repository skill for {args.repo}', + 
'github_token': args.token, + 'include_issues': not args.no_issues, + 'include_changelog': not args.no_changelog, + 'include_releases': not args.no_releases, + 'max_issues': args.max_issues + } + else: + parser.error('Either --repo or --config is required') + + try: + # Phase 1: Scrape GitHub repository + scraper = GitHubScraper(config) + scraper.scrape() + + if args.scrape_only: + logger.info("Scrape complete (--scrape-only mode)") + return + + # Phase 2: Build skill + converter = GitHubToSkillConverter(config) + converter.build_skill() + + logger.info(f"\n✅ Success! Skill created at: output/{config.get('name', config['repo'].split('/')[-1])}/") + logger.info(f"Next step: skill-seekers-package output/{config.get('name', config['repo'].split('/')[-1])}/") + + except Exception as e: + logger.error(f"Error: {e}") + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/llms_txt_detector.py b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/llms_txt_detector.py new file mode 100644 index 0000000..688fdb7 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/llms_txt_detector.py @@ -0,0 +1,66 @@ +# ABOUTME: Detects and validates llms.txt file availability at documentation URLs +# ABOUTME: Supports llms-full.txt, llms.txt, and llms-small.txt variants + +import requests +from typing import Optional, Dict, List +from urllib.parse import urlparse + +class LlmsTxtDetector: + """Detect llms.txt files at documentation URLs""" + + VARIANTS = [ + ('llms-full.txt', 'full'), + ('llms.txt', 'standard'), + ('llms-small.txt', 'small') + ] + + def __init__(self, base_url: str): + self.base_url = base_url.rstrip('/') + + def detect(self) -> Optional[Dict[str, str]]: + """ + Detect available llms.txt variant. 
+ + Returns: + Dict with 'url' and 'variant' keys, or None if not found + """ + parsed = urlparse(self.base_url) + root_url = f"{parsed.scheme}://{parsed.netloc}" + + for filename, variant in self.VARIANTS: + url = f"{root_url}/{filename}" + + if self._check_url_exists(url): + return {'url': url, 'variant': variant} + + return None + + def detect_all(self) -> List[Dict[str, str]]: + """ + Detect all available llms.txt variants. + + Returns: + List of dicts with 'url' and 'variant' keys for each found variant + """ + found_variants = [] + + for filename, variant in self.VARIANTS: + parsed = urlparse(self.base_url) + root_url = f"{parsed.scheme}://{parsed.netloc}" + url = f"{root_url}/{filename}" + + if self._check_url_exists(url): + found_variants.append({ + 'url': url, + 'variant': variant + }) + + return found_variants + + def _check_url_exists(self, url: str) -> bool: + """Check if URL returns 200 status""" + try: + response = requests.head(url, timeout=5, allow_redirects=True) + return response.status_code == 200 + except requests.RequestException: + return False diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/llms_txt_downloader.py b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/llms_txt_downloader.py new file mode 100644 index 0000000..1049f86 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/llms_txt_downloader.py @@ -0,0 +1,94 @@ +"""ABOUTME: Downloads llms.txt files from documentation URLs with retry logic""" +"""ABOUTME: Validates markdown content and handles timeouts with exponential backoff""" + +import requests +import time +from typing import Optional + +class LlmsTxtDownloader: +    """Download llms.txt content from URLs with retry logic""" + +    def __init__(self, url: str, timeout: int = 30, max_retries: int = 3): + self.url = url + self.timeout = timeout + self.max_retries = max_retries + + def get_proper_filename(self) -> str: + """ + 
Extract filename from URL and convert .txt to .md + + Returns: + Proper filename with .md extension + + Examples: + https://hono.dev/llms-full.txt -> llms-full.md + https://hono.dev/llms.txt -> llms.md + https://hono.dev/llms-small.txt -> llms-small.md + """ + # Extract filename from URL + from urllib.parse import urlparse + parsed = urlparse(self.url) + filename = parsed.path.split('/')[-1] + + # Replace .txt with .md + if filename.endswith('.txt'): + filename = filename[:-4] + '.md' + + return filename + + def _is_markdown(self, content: str) -> bool: + """ + Check if content looks like markdown. + + Returns: + True if content contains markdown patterns + """ + markdown_patterns = ['# ', '## ', '```', '- ', '* ', '`'] + return any(pattern in content for pattern in markdown_patterns) + + def download(self) -> Optional[str]: + """ + Download llms.txt content with retry logic. + + Returns: + String content or None if download fails + """ + headers = { + 'User-Agent': 'Skill-Seekers-llms.txt-Reader/1.0' + } + + for attempt in range(self.max_retries): + try: + response = requests.get( + self.url, + headers=headers, + timeout=self.timeout + ) + response.raise_for_status() + + content = response.text + + # Validate content is not empty + if len(content) < 100: + print(f"⚠️ Content too short ({len(content)} chars), rejecting") + return None + + # Validate content looks like markdown + if not self._is_markdown(content): + print(f"⚠️ Content doesn't look like markdown") + return None + + return content + + except requests.RequestException as e: + if attempt < self.max_retries - 1: + # Calculate exponential backoff delay: 1s, 2s, 4s, etc. 
+ delay = 2 ** attempt + print(f"⚠️ Attempt {attempt + 1}/{self.max_retries} failed: {e}") + print(f" Retrying in {delay}s...") + time.sleep(delay) + else: + print(f"❌ Failed to download {self.url} after {self.max_retries} attempts: {e}") + return None + + return None diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/llms_txt_parser.py b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/llms_txt_parser.py new file mode 100644 index 0000000..e288c92 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/llms_txt_parser.py @@ -0,0 +1,74 @@ +"""ABOUTME: Parses llms.txt markdown content into structured page data""" +"""ABOUTME: Extracts titles, content, code samples, and headings from markdown""" + +import re +from typing import List, Dict + +class LlmsTxtParser: + """Parse llms.txt markdown content into page structures""" + + def __init__(self, content: str): + self.content = content + + def parse(self) -> List[Dict]: + """ + Parse markdown content into page structures. 
+ + Returns: + List of page dicts with title, content, code_samples, headings + """ + pages = [] + + # Split by h1 headers (# Title) + sections = re.split(r'\n# ', self.content) + + for section in sections: + if not section.strip(): + continue + + # First line is title + lines = section.split('\n') + title = lines[0].strip('#').strip() + + # Parse content + page = self._parse_section('\n'.join(lines[1:]), title) + pages.append(page) + + return pages + + def _parse_section(self, content: str, title: str) -> Dict: + """Parse a single section into page structure""" + page = { + 'title': title, + 'content': '', + 'code_samples': [], + 'headings': [], + 'url': f'llms-txt#{title.lower().replace(" ", "-")}', + 'links': [] + } + + # Extract code blocks + code_blocks = re.findall(r'```(\w+)?\n(.*?)```', content, re.DOTALL) + for lang, code in code_blocks: + page['code_samples'].append({ + 'code': code.strip(), + 'language': lang or 'unknown' + }) + + # Extract h2/h3 headings + headings = re.findall(r'^(#{2,3})\s+(.+)$', content, re.MULTILINE) + for level_markers, text in headings: + page['headings'].append({ + 'level': f'h{len(level_markers)}', + 'text': text.strip(), + 'id': text.lower().replace(' ', '-') + }) + + # Remove code blocks from content for plain text + content_no_code = re.sub(r'```.*?```', '', content, flags=re.DOTALL) + + # Extract paragraphs + paragraphs = [p.strip() for p in content_no_code.split('\n\n') if len(p.strip()) > 20] + page['content'] = '\n\n'.join(paragraphs) + + return page diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/main.py b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/main.py new file mode 100644 index 0000000..dcf677d --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/main.py @@ -0,0 +1,285 @@ +#!/usr/bin/env python3 +""" +Skill Seekers - Unified CLI Entry Point + +Provides a git-style unified command-line interface 
for all Skill Seekers tools. + +Usage: + skill-seekers [options] + +Commands: + scrape Scrape documentation website + github Scrape GitHub repository + pdf Extract from PDF file + unified Multi-source scraping (docs + GitHub + PDF) + enhance AI-powered enhancement (local, no API key) + package Package skill into .zip file + upload Upload skill to Claude + estimate Estimate page count before scraping + +Examples: + skill-seekers scrape --config configs/react.json + skill-seekers github --repo microsoft/TypeScript + skill-seekers unified --config configs/react_unified.json + skill-seekers package output/react/ +""" + +import sys +import argparse +from typing import List, Optional + + +def create_parser() -> argparse.ArgumentParser: + """Create the main argument parser with subcommands.""" + parser = argparse.ArgumentParser( + prog="skill-seekers", + description="Convert documentation, GitHub repos, and PDFs into Claude AI skills", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Scrape documentation + skill-seekers scrape --config configs/react.json + + # Scrape GitHub repository + skill-seekers github --repo microsoft/TypeScript --name typescript + + # Multi-source scraping (unified) + skill-seekers unified --config configs/react_unified.json + + # AI-powered enhancement + skill-seekers enhance output/react/ + + # Package and upload + skill-seekers package output/react/ + skill-seekers upload output/react.zip + +For more information: https://github.com/yusufkaraaslan/Skill_Seekers + """ + ) + + parser.add_argument( + "--version", + action="version", + version="%(prog)s 2.1.1" + ) + + subparsers = parser.add_subparsers( + dest="command", + title="commands", + description="Available Skill Seekers commands", + help="Command to run" + ) + + # === scrape subcommand === + scrape_parser = subparsers.add_parser( + "scrape", + help="Scrape documentation website", + description="Scrape documentation website and generate skill" + ) + 
scrape_parser.add_argument("--config", help="Config JSON file") + scrape_parser.add_argument("--name", help="Skill name") + scrape_parser.add_argument("--url", help="Documentation URL") + scrape_parser.add_argument("--description", help="Skill description") + scrape_parser.add_argument("--skip-scrape", action="store_true", help="Skip scraping, use cached data") + scrape_parser.add_argument("--enhance", action="store_true", help="AI enhancement (API)") + scrape_parser.add_argument("--enhance-local", action="store_true", help="AI enhancement (local)") + scrape_parser.add_argument("--dry-run", action="store_true", help="Dry run mode") + scrape_parser.add_argument("--async", dest="async_mode", action="store_true", help="Use async scraping") + scrape_parser.add_argument("--workers", type=int, help="Number of async workers") + + # === github subcommand === + github_parser = subparsers.add_parser( + "github", + help="Scrape GitHub repository", + description="Scrape GitHub repository and generate skill" + ) + github_parser.add_argument("--config", help="Config JSON file") + github_parser.add_argument("--repo", help="GitHub repo (owner/repo)") + github_parser.add_argument("--name", help="Skill name") + github_parser.add_argument("--description", help="Skill description") + + # === pdf subcommand === + pdf_parser = subparsers.add_parser( + "pdf", + help="Extract from PDF file", + description="Extract content from PDF and generate skill" + ) + pdf_parser.add_argument("--config", help="Config JSON file") + pdf_parser.add_argument("--pdf", help="PDF file path") + pdf_parser.add_argument("--name", help="Skill name") + pdf_parser.add_argument("--description", help="Skill description") + pdf_parser.add_argument("--from-json", help="Build from extracted JSON") + + # === unified subcommand === + unified_parser = subparsers.add_parser( + "unified", + help="Multi-source scraping (docs + GitHub + PDF)", + description="Combine multiple sources into one skill" + ) + 
unified_parser.add_argument("--config", required=True, help="Unified config JSON file") + unified_parser.add_argument("--merge-mode", help="Merge mode (rule-based, claude-enhanced)") + unified_parser.add_argument("--dry-run", action="store_true", help="Dry run mode") + + # === enhance subcommand === + enhance_parser = subparsers.add_parser( + "enhance", + help="AI-powered enhancement (local, no API key)", + description="Enhance SKILL.md using Claude Code (local)" + ) + enhance_parser.add_argument("skill_directory", help="Skill directory path") + + # === package subcommand === + package_parser = subparsers.add_parser( + "package", + help="Package skill into .zip file", + description="Package skill directory into uploadable .zip" + ) + package_parser.add_argument("skill_directory", help="Skill directory path") + package_parser.add_argument("--no-open", action="store_true", help="Don't open output folder") + package_parser.add_argument("--upload", action="store_true", help="Auto-upload after packaging") + + # === upload subcommand === + upload_parser = subparsers.add_parser( + "upload", + help="Upload skill to Claude", + description="Upload .zip file to Claude via Anthropic API" + ) + upload_parser.add_argument("zip_file", help=".zip file to upload") + upload_parser.add_argument("--api-key", help="Anthropic API key") + + # === estimate subcommand === + estimate_parser = subparsers.add_parser( + "estimate", + help="Estimate page count before scraping", + description="Estimate total pages for documentation scraping" + ) + estimate_parser.add_argument("config", help="Config JSON file") + estimate_parser.add_argument("--max-discovery", type=int, help="Max pages to discover") + + return parser + + +def main(argv: Optional[List[str]] = None) -> int: + """Main entry point for the unified CLI. 
+ + Args: + argv: Command-line arguments (defaults to sys.argv) + + Returns: + Exit code (0 for success, non-zero for error) + """ + parser = create_parser() + args = parser.parse_args(argv) + + if not args.command: + parser.print_help() + return 1 + + # Delegate to the appropriate tool + try: + if args.command == "scrape": + from skill_seekers.cli.doc_scraper import main as scrape_main + # Convert args namespace to sys.argv format for doc_scraper + sys.argv = ["doc_scraper.py"] + if args.config: + sys.argv.extend(["--config", args.config]) + if args.name: + sys.argv.extend(["--name", args.name]) + if args.url: + sys.argv.extend(["--url", args.url]) + if args.description: + sys.argv.extend(["--description", args.description]) + if args.skip_scrape: + sys.argv.append("--skip-scrape") + if args.enhance: + sys.argv.append("--enhance") + if args.enhance_local: + sys.argv.append("--enhance-local") + if args.dry_run: + sys.argv.append("--dry-run") + if args.async_mode: + sys.argv.append("--async") + if args.workers: + sys.argv.extend(["--workers", str(args.workers)]) + return scrape_main() or 0 + + elif args.command == "github": + from skill_seekers.cli.github_scraper import main as github_main + sys.argv = ["github_scraper.py"] + if args.config: + sys.argv.extend(["--config", args.config]) + if args.repo: + sys.argv.extend(["--repo", args.repo]) + if args.name: + sys.argv.extend(["--name", args.name]) + if args.description: + sys.argv.extend(["--description", args.description]) + return github_main() or 0 + + elif args.command == "pdf": + from skill_seekers.cli.pdf_scraper import main as pdf_main + sys.argv = ["pdf_scraper.py"] + if args.config: + sys.argv.extend(["--config", args.config]) + if args.pdf: + sys.argv.extend(["--pdf", args.pdf]) + if args.name: + sys.argv.extend(["--name", args.name]) + if args.description: + sys.argv.extend(["--description", args.description]) + if args.from_json: + sys.argv.extend(["--from-json", args.from_json]) + return pdf_main() or 0 
+ + elif args.command == "unified": + from skill_seekers.cli.unified_scraper import main as unified_main + sys.argv = ["unified_scraper.py", "--config", args.config] + if args.merge_mode: + sys.argv.extend(["--merge-mode", args.merge_mode]) + if args.dry_run: + sys.argv.append("--dry-run") + return unified_main() or 0 + + elif args.command == "enhance": + from skill_seekers.cli.enhance_skill_local import main as enhance_main + sys.argv = ["enhance_skill_local.py", args.skill_directory] + return enhance_main() or 0 + + elif args.command == "package": + from skill_seekers.cli.package_skill import main as package_main + sys.argv = ["package_skill.py", args.skill_directory] + if args.no_open: + sys.argv.append("--no-open") + if args.upload: + sys.argv.append("--upload") + return package_main() or 0 + + elif args.command == "upload": + from skill_seekers.cli.upload_skill import main as upload_main + sys.argv = ["upload_skill.py", args.zip_file] + if args.api_key: + sys.argv.extend(["--api-key", args.api_key]) + return upload_main() or 0 + + elif args.command == "estimate": + from skill_seekers.cli.estimate_pages import main as estimate_main + sys.argv = ["estimate_pages.py", args.config] + if args.max_discovery: + sys.argv.extend(["--max-discovery", str(args.max_discovery)]) + return estimate_main() or 0 + + else: + print(f"Error: Unknown command '{args.command}'", file=sys.stderr) + parser.print_help() + return 1 + + except KeyboardInterrupt: + print("\n\nInterrupted by user", file=sys.stderr) + return 130 + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/merge_sources.py b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/merge_sources.py new file mode 100644 index 0000000..552ac82 --- /dev/null +++ 
b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/merge_sources.py @@ -0,0 +1,513 @@ +#!/usr/bin/env python3 +""" +Source Merger for Multi-Source Skills + +Merges documentation and code data intelligently: +- Rule-based merge: Fast, deterministic rules +- Claude-enhanced merge: AI-powered reconciliation + +Handles conflicts and creates unified API reference. +""" + +import json +import logging +import subprocess +import tempfile +import os +from pathlib import Path +from typing import Dict, List, Any, Optional +from .conflict_detector import Conflict, ConflictDetector + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class RuleBasedMerger: + """ + Rule-based API merger using deterministic rules. + + Rules: + 1. If API only in docs → Include with [DOCS_ONLY] tag + 2. If API only in code → Include with [UNDOCUMENTED] tag + 3. If both match perfectly → Include normally + 4. If conflict → Include both versions with [CONFLICT] tag, prefer code signature + """ + + def __init__(self, docs_data: Dict, github_data: Dict, conflicts: List[Conflict]): + """ + Initialize rule-based merger. + + Args: + docs_data: Documentation scraper data + github_data: GitHub scraper data + conflicts: List of detected conflicts + """ + self.docs_data = docs_data + self.github_data = github_data + self.conflicts = conflicts + + # Build conflict index for fast lookup + self.conflict_index = {c.api_name: c for c in conflicts} + + # Extract APIs from both sources + detector = ConflictDetector(docs_data, github_data) + self.docs_apis = detector.docs_apis + self.code_apis = detector.code_apis + + def merge_all(self) -> Dict[str, Any]: + """ + Merge all APIs using rule-based logic. 
+ + Returns: + Dict containing merged API data + """ + logger.info("Starting rule-based merge...") + + merged_apis = {} + + # Get all unique API names + all_api_names = set(self.docs_apis.keys()) | set(self.code_apis.keys()) + + for api_name in sorted(all_api_names): + merged_api = self._merge_single_api(api_name) + merged_apis[api_name] = merged_api + + logger.info(f"Merged {len(merged_apis)} APIs") + + return { + 'merge_mode': 'rule-based', + 'apis': merged_apis, + 'summary': { + 'total_apis': len(merged_apis), + 'docs_only': sum(1 for api in merged_apis.values() if api['status'] == 'docs_only'), + 'code_only': sum(1 for api in merged_apis.values() if api['status'] == 'code_only'), + 'matched': sum(1 for api in merged_apis.values() if api['status'] == 'matched'), + 'conflict': sum(1 for api in merged_apis.values() if api['status'] == 'conflict') + } + } + + def _merge_single_api(self, api_name: str) -> Dict[str, Any]: + """ + Merge a single API using rules. + + Args: + api_name: Name of the API to merge + + Returns: + Merged API dict + """ + in_docs = api_name in self.docs_apis + in_code = api_name in self.code_apis + has_conflict = api_name in self.conflict_index + + # Rule 1: Only in docs + if in_docs and not in_code: + conflict = self.conflict_index.get(api_name) + return { + 'name': api_name, + 'status': 'docs_only', + 'source': 'documentation', + 'data': self.docs_apis[api_name], + 'warning': 'This API is documented but not found in codebase', + 'conflict': conflict.__dict__ if conflict else None + } + + # Rule 2: Only in code + if in_code and not in_docs: + is_private = api_name.startswith('_') + conflict = self.conflict_index.get(api_name) + return { + 'name': api_name, + 'status': 'code_only', + 'source': 'code', + 'data': self.code_apis[api_name], + 'warning': 'This API exists in code but is not documented' if not is_private else 'Internal/private API', + 'conflict': conflict.__dict__ if conflict else None + } + + # Both exist - check for conflicts + 
docs_info = self.docs_apis[api_name] + code_info = self.code_apis[api_name] + + # Rule 3: Both match perfectly (no conflict) + if not has_conflict: + return { + 'name': api_name, + 'status': 'matched', + 'source': 'both', + 'docs_data': docs_info, + 'code_data': code_info, + 'merged_signature': self._create_merged_signature(code_info, docs_info), + 'merged_description': docs_info.get('docstring') or code_info.get('docstring') + } + + # Rule 4: Conflict exists - prefer code signature, keep docs description + conflict = self.conflict_index[api_name] + + return { + 'name': api_name, + 'status': 'conflict', + 'source': 'both', + 'docs_data': docs_info, + 'code_data': code_info, + 'conflict': conflict.__dict__, + 'resolution': 'prefer_code_signature', + 'merged_signature': self._create_merged_signature(code_info, docs_info), + 'merged_description': docs_info.get('docstring') or code_info.get('docstring'), + 'warning': conflict.difference + } + + def _create_merged_signature(self, code_info: Dict, docs_info: Dict) -> str: + """ + Create merged signature preferring code data. + + Args: + code_info: API info from code + docs_info: API info from docs + + Returns: + Merged signature string + """ + name = code_info.get('name', docs_info.get('name')) + params = code_info.get('parameters', docs_info.get('parameters', [])) + return_type = code_info.get('return_type', docs_info.get('return_type')) + + # Build parameter string + param_strs = [] + for param in params: + param_str = param['name'] + if param.get('type_hint'): + param_str += f": {param['type_hint']}" + if param.get('default'): + param_str += f" = {param['default']}" + param_strs.append(param_str) + + signature = f"{name}({', '.join(param_strs)})" + + if return_type: + signature += f" -> {return_type}" + + return signature + + +class ClaudeEnhancedMerger: + """ + Claude-enhanced API merger using local Claude Code. + + Opens Claude Code in a new terminal to intelligently reconcile conflicts. 
+ Uses the same approach as enhance_skill_local.py. + """ + + def __init__(self, docs_data: Dict, github_data: Dict, conflicts: List[Conflict]): + """ + Initialize Claude-enhanced merger. + + Args: + docs_data: Documentation scraper data + github_data: GitHub scraper data + conflicts: List of detected conflicts + """ + self.docs_data = docs_data + self.github_data = github_data + self.conflicts = conflicts + + # First do rule-based merge as baseline + self.rule_merger = RuleBasedMerger(docs_data, github_data, conflicts) + + def merge_all(self) -> Dict[str, Any]: + """ + Merge all APIs using Claude enhancement. + + Returns: + Dict containing merged API data + """ + logger.info("Starting Claude-enhanced merge...") + + # Create temporary workspace + workspace_dir = self._create_workspace() + + # Launch Claude Code for enhancement + logger.info("Launching Claude Code for intelligent merging...") + logger.info("Claude will analyze conflicts and create reconciled API reference") + + try: + self._launch_claude_merge(workspace_dir) + + # Read enhanced results + merged_data = self._read_merged_results(workspace_dir) + + logger.info("Claude-enhanced merge complete") + return merged_data + + except Exception as e: + logger.error(f"Claude enhancement failed: {e}") + logger.info("Falling back to rule-based merge") + return self.rule_merger.merge_all() + + def _create_workspace(self) -> str: + """ + Create temporary workspace with merge context. + + Returns: + Path to workspace directory + """ + workspace = tempfile.mkdtemp(prefix='skill_merge_') + logger.info(f"Created merge workspace: {workspace}") + + # Write context files for Claude + self._write_context_files(workspace) + + return workspace + + def _write_context_files(self, workspace: str): + """Write context files for Claude to analyze.""" + + # 1. 
Write conflicts summary + conflicts_file = os.path.join(workspace, 'conflicts.json') + with open(conflicts_file, 'w') as f: + json.dump({ + 'conflicts': [c.__dict__ for c in self.conflicts], + 'summary': { + 'total': len(self.conflicts), + 'by_type': self._count_by_field('type'), + 'by_severity': self._count_by_field('severity') + } + }, f, indent=2) + + # 2. Write documentation APIs + docs_apis_file = os.path.join(workspace, 'docs_apis.json') + detector = ConflictDetector(self.docs_data, self.github_data) + with open(docs_apis_file, 'w') as f: + json.dump(detector.docs_apis, f, indent=2) + + # 3. Write code APIs + code_apis_file = os.path.join(workspace, 'code_apis.json') + with open(code_apis_file, 'w') as f: + json.dump(detector.code_apis, f, indent=2) + + # 4. Write merge instructions for Claude + instructions = """# API Merge Task + +You are merging API documentation from two sources: +1. Official documentation (user-facing) +2. Source code analysis (implementation reality) + +## Context Files: +- `conflicts.json` - All detected conflicts between sources +- `docs_apis.json` - APIs from documentation +- `code_apis.json` - APIs from source code + +## Your Task: +For each conflict, reconcile the differences intelligently: + +1. **Prefer code signatures as source of truth** + - Use actual parameter names, types, defaults from code + - Code is what actually runs, docs might be outdated + +2. **Keep documentation descriptions** + - Docs are user-friendly, code comments might be technical + - Keep the docs' explanation of what the API does + +3. **Add implementation notes for discrepancies** + - If docs differ from code, explain the difference + - Example: "⚠️ The `snap` parameter exists in code but is not documented" + +4. **Flag missing APIs clearly** + - Missing in docs → Add [UNDOCUMENTED] tag + - Missing in code → Add [REMOVED] or [DOCS_ERROR] tag + +5. 
**Create unified API reference** + - One definitive signature per API + - Clear warnings about conflicts + - Implementation notes where helpful + +## Output Format: +Create `merged_apis.json` with this structure: + +```json +{ + "apis": { + "API.name": { + "signature": "final_signature_here", + "parameters": [...], + "return_type": "type", + "description": "user-friendly description", + "implementation_notes": "Any discrepancies or warnings", + "source": "both|docs_only|code_only", + "confidence": "high|medium|low" + } + } +} +``` + +Take your time to analyze each conflict carefully. The goal is to create the most accurate and helpful API reference possible. +""" + + instructions_file = os.path.join(workspace, 'MERGE_INSTRUCTIONS.md') + with open(instructions_file, 'w') as f: + f.write(instructions) + + logger.info(f"Wrote context files to {workspace}") + + def _count_by_field(self, field: str) -> Dict[str, int]: + """Count conflicts by a specific field.""" + counts = {} + for conflict in self.conflicts: + value = getattr(conflict, field) + counts[value] = counts.get(value, 0) + 1 + return counts + + def _launch_claude_merge(self, workspace: str): + """ + Launch Claude Code to perform merge. + + Similar to enhance_skill_local.py approach. + """ + # Create a script that Claude will execute + script_path = os.path.join(workspace, 'merge_script.sh') + + script_content = f"""#!/bin/bash +# Automatic merge script for Claude Code + +cd "{workspace}" + +echo "📊 Analyzing conflicts..." +cat conflicts.json | head -20 + +echo "" +echo "📖 Documentation APIs: $(cat docs_apis.json | grep -c '\"name\"')" +echo "💻 Code APIs: $(cat code_apis.json | grep -c '\"name\"')" +echo "" +echo "Please review the conflicts and create merged_apis.json" +echo "Follow the instructions in MERGE_INSTRUCTIONS.md" +echo "" +echo "When done, save merged_apis.json and close this terminal." + +# Wait for user to complete merge +read -p "Press Enter when merge is complete..." 
+""" + + with open(script_path, 'w') as f: + f.write(script_content) + + os.chmod(script_path, 0o755) + + # Open new terminal with Claude Code + # Try different terminal emulators + terminals = [ + ['x-terminal-emulator', '-e'], + ['gnome-terminal', '--'], + ['xterm', '-e'], + ['konsole', '-e'] + ] + + for terminal_cmd in terminals: + try: + cmd = terminal_cmd + ['bash', script_path] + subprocess.Popen(cmd) + logger.info(f"Opened terminal with {terminal_cmd[0]}") + break + except FileNotFoundError: + continue + + # Wait for merge to complete + merged_file = os.path.join(workspace, 'merged_apis.json') + logger.info(f"Waiting for merged results at: {merged_file}") + logger.info("Close the terminal when done to continue...") + + # Poll for file existence + import time + timeout = 3600 # 1 hour max + elapsed = 0 + while not os.path.exists(merged_file) and elapsed < timeout: + time.sleep(5) + elapsed += 5 + + if not os.path.exists(merged_file): + raise TimeoutError("Claude merge timed out after 1 hour") + + def _read_merged_results(self, workspace: str) -> Dict[str, Any]: + """Read merged results from workspace.""" + merged_file = os.path.join(workspace, 'merged_apis.json') + + if not os.path.exists(merged_file): + raise FileNotFoundError(f"Merged results not found: {merged_file}") + + with open(merged_file, 'r') as f: + merged_data = json.load(f) + + return { + 'merge_mode': 'claude-enhanced', + **merged_data + } + + +def merge_sources(docs_data_path: str, + github_data_path: str, + output_path: str, + mode: str = 'rule-based') -> Dict[str, Any]: + """ + Merge documentation and GitHub data. 
+ + Args: + docs_data_path: Path to documentation data JSON + github_data_path: Path to GitHub data JSON + output_path: Path to save merged output + mode: 'rule-based' or 'claude-enhanced' + + Returns: + Merged data dict + """ + # Load data + with open(docs_data_path, 'r') as f: + docs_data = json.load(f) + + with open(github_data_path, 'r') as f: + github_data = json.load(f) + + # Detect conflicts + detector = ConflictDetector(docs_data, github_data) + conflicts = detector.detect_all_conflicts() + + logger.info(f"Detected {len(conflicts)} conflicts") + + # Merge based on mode + if mode == 'claude-enhanced': + merger = ClaudeEnhancedMerger(docs_data, github_data, conflicts) + else: + merger = RuleBasedMerger(docs_data, github_data, conflicts) + + merged_data = merger.merge_all() + + # Save merged data + with open(output_path, 'w') as f: + json.dump(merged_data, f, indent=2, ensure_ascii=False) + + logger.info(f"Merged data saved to: {output_path}") + + return merged_data + + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser(description='Merge documentation and code sources') + parser.add_argument('docs_data', help='Path to documentation data JSON') + parser.add_argument('github_data', help='Path to GitHub data JSON') + parser.add_argument('--output', '-o', default='merged_data.json', help='Output file path') + parser.add_argument('--mode', '-m', choices=['rule-based', 'claude-enhanced'], + default='rule-based', help='Merge mode') + + args = parser.parse_args() + + merged = merge_sources(args.docs_data, args.github_data, args.output, args.mode) + + # Print summary + summary = merged.get('summary', {}) + print(f"\n✅ Merge complete ({merged.get('merge_mode')})") + print(f" Total APIs: {summary.get('total_apis', 0)}") + print(f" Matched: {summary.get('matched', 0)}") + print(f" Docs only: {summary.get('docs_only', 0)}") + print(f" Code only: {summary.get('code_only', 0)}") + print(f" Conflicts: {summary.get('conflict', 0)}") + 
print(f"\n📄 Saved to: {args.output}") diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/package_multi.py b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/package_multi.py new file mode 100644 index 0000000..bffdb9c --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/package_multi.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 +""" +Multi-Skill Packager + +Package multiple skills at once. Useful for packaging router + sub-skills together. +""" + +import sys +import argparse +from pathlib import Path +import subprocess + + +def package_skill(skill_dir: Path) -> bool: + """Package a single skill""" + try: + result = subprocess.run( + [sys.executable, str(Path(__file__).parent / "package_skill.py"), str(skill_dir)], + capture_output=True, + text=True + ) + return result.returncode == 0 + except Exception as e: + print(f"❌ Error packaging {skill_dir}: {e}") + return False + + +def main(): + parser = argparse.ArgumentParser( + description="Package multiple skills at once", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Package all godot skills + python3 package_multi.py output/godot*/ + + # Package specific skills + python3 package_multi.py output/godot-2d/ output/godot-3d/ output/godot-scripting/ + """ + ) + + parser.add_argument( + 'skill_dirs', + nargs='+', + help='Skill directories to package' + ) + + args = parser.parse_args() + + print(f"\n{'='*60}") + print(f"MULTI-SKILL PACKAGER") + print(f"{'='*60}\n") + + skill_dirs = [Path(d) for d in args.skill_dirs] + success_count = 0 + total_count = len(skill_dirs) + + for skill_dir in skill_dirs: + if not skill_dir.exists(): + print(f"⚠️ Skipping (not found): {skill_dir}") + continue + + if not (skill_dir / "SKILL.md").exists(): + print(f"⚠️ Skipping (no SKILL.md): {skill_dir}") + continue + + print(f"📦 Packaging: {skill_dir.name}") + if package_skill(skill_dir): + 
success_count += 1 + print(f" ✅ Success") + else: + print(f" ❌ Failed") + print("") + + print(f"{'='*60}") + print(f"SUMMARY: {success_count}/{total_count} skills packaged") + print(f"{'='*60}\n") + + +if __name__ == "__main__": + main() diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/package_skill.py b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/package_skill.py new file mode 100644 index 0000000..cf251d0 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/package_skill.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python3 +""" +Simple Skill Packager +Packages a skill directory into a .zip file for Claude. + +Usage: + skill-seekers package output/steam-inventory/ + skill-seekers package output/react/ + skill-seekers package output/react/ --no-open # Don't open folder +""" + +import os +import sys +import zipfile +import argparse +from pathlib import Path + +# Import utilities +try: + from utils import ( + open_folder, + print_upload_instructions, + format_file_size, + validate_skill_directory + ) + from quality_checker import SkillQualityChecker, print_report +except ImportError: + # If running from different directory, add cli to path + sys.path.insert(0, str(Path(__file__).parent)) + from utils import ( + open_folder, + print_upload_instructions, + format_file_size, + validate_skill_directory + ) + from quality_checker import SkillQualityChecker, print_report + + +def package_skill(skill_dir, open_folder_after=True, skip_quality_check=False): + """ + Package a skill directory into a .zip file + + Args: + skill_dir: Path to skill directory + open_folder_after: Whether to open the output folder after packaging + skip_quality_check: Skip quality checks before packaging + + Returns: + tuple: (success, zip_path) where success is bool and zip_path is Path or None + """ + skill_path = Path(skill_dir) + + # Validate skill directory + is_valid, error_msg = 
validate_skill_directory(skill_path) + if not is_valid: + print(f"❌ Error: {error_msg}") + return False, None + + # Run quality checks (unless skipped) + if not skip_quality_check: + print("\n" + "=" * 60) + print("QUALITY CHECK") + print("=" * 60) + + checker = SkillQualityChecker(skill_path) + report = checker.check_all() + + # Print report + print_report(report, verbose=False) + + # If there are errors or warnings, ask user to confirm + if report.has_errors or report.has_warnings: + print("=" * 60) + response = input("\nContinue with packaging? (y/n): ").strip().lower() + if response != 'y': + print("\n❌ Packaging cancelled by user") + return False, None + print() + else: + print("=" * 60) + print() + + # Create zip filename + skill_name = skill_path.name + zip_path = skill_path.parent / f"{skill_name}.zip" + + print(f"📦 Packaging skill: {skill_name}") + print(f" Source: {skill_path}") + print(f" Output: {zip_path}") + + # Create zip file + with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf: + for root, dirs, files in os.walk(skill_path): + # Skip backup files + files = [f for f in files if not f.endswith('.backup')] + + for file in files: + file_path = Path(root) / file + arcname = file_path.relative_to(skill_path) + zf.write(file_path, arcname) + print(f" + {arcname}") + + # Get zip size + zip_size = zip_path.stat().st_size + print(f"\n✅ Package created: {zip_path}") + print(f" Size: {zip_size:,} bytes ({format_file_size(zip_size)})") + + # Open folder in file browser + if open_folder_after: + print(f"\n📂 Opening folder: {zip_path.parent}") + open_folder(zip_path.parent) + + # Print upload instructions + print_upload_instructions(zip_path) + + return True, zip_path + + +def main(): + parser = argparse.ArgumentParser( + description="Package a skill directory into a .zip file for Claude", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Package skill with quality checks (recommended) + skill-seekers package 
output/react/ + + # Package skill without opening folder + skill-seekers package output/react/ --no-open + + # Skip quality checks (faster, but not recommended) + skill-seekers package output/react/ --skip-quality-check + + # Package and auto-upload to Claude + skill-seekers package output/react/ --upload + + # Get help + skill-seekers package --help + """ + ) + + parser.add_argument( + 'skill_dir', + help='Path to skill directory (e.g., output/react/)' + ) + + parser.add_argument( + '--no-open', + action='store_true', + help='Do not open the output folder after packaging' + ) + + parser.add_argument( + '--skip-quality-check', + action='store_true', + help='Skip quality checks before packaging' + ) + + parser.add_argument( + '--upload', + action='store_true', + help='Automatically upload to Claude after packaging (requires ANTHROPIC_API_KEY)' + ) + + args = parser.parse_args() + + success, zip_path = package_skill( + args.skill_dir, + open_folder_after=not args.no_open, + skip_quality_check=args.skip_quality_check + ) + + if not success: + sys.exit(1) + + # Auto-upload if requested + if args.upload: + # Check if API key is set BEFORE attempting upload + api_key = os.environ.get('ANTHROPIC_API_KEY', '').strip() + + if not api_key: + # No API key - show helpful message but DON'T fail + print("\n" + "="*60) + print("💡 Automatic Upload") + print("="*60) + print() + print("To enable automatic upload:") + print(" 1. Get API key from https://console.anthropic.com/") + print(" 2. Set: export ANTHROPIC_API_KEY=sk-ant-...") + print(" 3. Run package_skill.py with --upload flag") + print() + print("For now, use manual upload (instructions above) ☝️") + print("="*60) + # Exit successfully - packaging worked! 
+ sys.exit(0) + + # API key exists - try upload + try: + from upload_skill import upload_skill_api + print("\n" + "="*60) + upload_success, message = upload_skill_api(zip_path) + if not upload_success: + print(f"❌ Upload failed: {message}") + print() + print("💡 Try manual upload instead (instructions above) ☝️") + print("="*60) + # Exit successfully - packaging worked even if upload failed + sys.exit(0) + else: + print("="*60) + sys.exit(0) + except ImportError: + print("\n❌ Error: upload_skill.py not found") + sys.exit(1) + + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/pdf_extractor_poc.py b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/pdf_extractor_poc.py new file mode 100644 index 0000000..f8c0fe8 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/pdf_extractor_poc.py @@ -0,0 +1,1222 @@ +#!/usr/bin/env python3 +""" +PDF Text Extractor - Complete Feature Set (Tasks B1.2 + B1.3 + B1.4 + B1.5 + Priority 2 & 3) + +Extracts text, code blocks, and images from PDF documentation files. +Uses PyMuPDF (fitz) for fast, high-quality extraction. 
+ +Features: + - Text and markdown extraction + - Code block detection (font, indent, pattern) + - Language detection with confidence scoring (19+ languages) (B1.4) + - Syntax validation and quality scoring (B1.4) + - Quality statistics and filtering (B1.4) + - Image extraction to files (B1.5) + - Image filtering by size (B1.5) + - Page chunking and chapter detection (B1.3) + - Code block merging across pages (B1.3) + +Advanced Features (Priority 2 & 3): + - OCR support for scanned PDFs (requires pytesseract) (Priority 2) + - Password-protected PDF support (Priority 2) + - Table extraction (Priority 2) + - Parallel page processing (Priority 3) + - Caching of expensive operations (Priority 3) + +Usage: + # Basic extraction + python3 pdf_extractor_poc.py input.pdf + python3 pdf_extractor_poc.py input.pdf --output output.json + python3 pdf_extractor_poc.py input.pdf --verbose + + # Quality filtering + python3 pdf_extractor_poc.py input.pdf --min-quality 5.0 + + # Image extraction + python3 pdf_extractor_poc.py input.pdf --extract-images + python3 pdf_extractor_poc.py input.pdf --extract-images --image-dir images/ + + # Advanced features + python3 pdf_extractor_poc.py scanned.pdf --ocr + python3 pdf_extractor_poc.py encrypted.pdf --password mypassword + python3 pdf_extractor_poc.py input.pdf --extract-tables + python3 pdf_extractor_poc.py large.pdf --parallel --workers 8 + +Example: + python3 pdf_extractor_poc.py docs/manual.pdf -o output.json -v \ + --chunk-size 15 --min-quality 6.0 --extract-images \ + --extract-tables --parallel +""" + +import os +import sys +import json +import re +import argparse +from pathlib import Path + +# Check if PyMuPDF is installed +try: + import fitz # PyMuPDF +except ImportError: + print("ERROR: PyMuPDF not installed") + print("Install with: pip install PyMuPDF") + sys.exit(1) + +# Optional dependencies for advanced features +try: + import pytesseract + from PIL import Image + TESSERACT_AVAILABLE = True +except ImportError: + 
TESSERACT_AVAILABLE = False + +try: + import concurrent.futures + CONCURRENT_AVAILABLE = True +except ImportError: + CONCURRENT_AVAILABLE = False + + +class PDFExtractor: + """Extract text and code from PDF documentation""" + + def __init__(self, pdf_path, verbose=False, chunk_size=10, min_quality=0.0, + extract_images=False, image_dir=None, min_image_size=100, + use_ocr=False, password=None, extract_tables=False, + parallel=False, max_workers=None, use_cache=True): + self.pdf_path = pdf_path + self.verbose = verbose + self.chunk_size = chunk_size # Pages per chunk (0 = no chunking) + self.min_quality = min_quality # Minimum quality score (0-10) + self.extract_images = extract_images # Extract images to files (NEW in B1.5) + self.image_dir = image_dir # Directory to save images (NEW in B1.5) + self.min_image_size = min_image_size # Minimum image dimension (NEW in B1.5) + + # Advanced features (Priority 2 & 3) + self.use_ocr = use_ocr # OCR for scanned PDFs (Priority 2) + self.password = password # Password for encrypted PDFs (Priority 2) + self.extract_tables = extract_tables # Extract tables (Priority 2) + self.parallel = parallel # Parallel processing (Priority 3) + self.max_workers = max_workers or os.cpu_count() # Worker threads (Priority 3) + self.use_cache = use_cache # Cache expensive operations (Priority 3) + + self.doc = None + self.pages = [] + self.chapters = [] # Detected chapters/sections + self.extracted_images = [] # List of extracted image info (NEW in B1.5) + self._cache = {} # Cache for expensive operations (Priority 3) + + def log(self, message): + """Print message if verbose mode enabled""" + if self.verbose: + print(message) + + def extract_text_with_ocr(self, page): + """ + Extract text from scanned PDF page using OCR (Priority 2). + Falls back to regular text extraction if OCR is not available. 
+ + Args: + page: PyMuPDF page object + + Returns: + str: Extracted text + """ + # Try regular text extraction first + text = page.get_text("text").strip() + + # If page has very little text, it might be scanned + if len(text) < 50 and self.use_ocr: + if not TESSERACT_AVAILABLE: + self.log("⚠️ OCR requested but pytesseract not installed") + self.log(" Install with: pip install pytesseract Pillow") + return text + + try: + # Render page as image + pix = page.get_pixmap() + img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) + + # Run OCR + ocr_text = pytesseract.image_to_string(img) + self.log(f" OCR extracted {len(ocr_text)} chars (was {len(text)})") + return ocr_text if len(ocr_text) > len(text) else text + + except Exception as e: + self.log(f" OCR failed: {e}") + return text + + return text + + def extract_tables_from_page(self, page): + """ + Extract tables from PDF page (Priority 2). + Uses PyMuPDF's table detection. + + Args: + page: PyMuPDF page object + + Returns: + list: List of extracted tables as dicts + """ + if not self.extract_tables: + return [] + + tables = [] + try: + # PyMuPDF table extraction + tabs = page.find_tables() + for idx, tab in enumerate(tabs.tables): + table_data = { + 'table_index': idx, + 'rows': tab.extract(), + 'bbox': tab.bbox, + 'row_count': len(tab.extract()), + 'col_count': len(tab.extract()[0]) if tab.extract() else 0 + } + tables.append(table_data) + self.log(f" Found table {idx}: {table_data['row_count']}x{table_data['col_count']}") + + except Exception as e: + self.log(f" Table extraction failed: {e}") + + return tables + + def get_cached(self, key): + """ + Get cached value (Priority 3). + + Args: + key: Cache key + + Returns: + Cached value or None + """ + if not self.use_cache: + return None + return self._cache.get(key) + + def set_cached(self, key, value): + """ + Set cached value (Priority 3). 
+ + Args: + key: Cache key + value: Value to cache + """ + if self.use_cache: + self._cache[key] = value + + def detect_language_from_code(self, code): + """ + Detect programming language from code content using patterns. + Enhanced in B1.4 with confidence scoring. + + Returns (language, confidence) tuple + """ + code_lower = code.lower() + + # Language detection patterns with weights + patterns = { + 'python': [ + (r'\bdef\s+\w+\s*\(', 3), + (r'\bimport\s+\w+', 2), + (r'\bclass\s+\w+:', 3), + (r'\bfrom\s+\w+\s+import', 2), + (r':\s*$', 1), # Lines ending with : + (r'^\s{4}|\t', 1), # Indentation + ], + 'javascript': [ + (r'\bfunction\s+\w+\s*\(', 3), + (r'\bconst\s+\w+\s*=', 2), + (r'\blet\s+\w+\s*=', 2), + (r'=>', 2), + (r'\bconsole\.log', 2), + (r'\bvar\s+\w+\s*=', 1), + ], + 'java': [ + (r'\bpublic\s+class\s+\w+', 4), + (r'\bprivate\s+\w+\s+\w+', 2), + (r'\bSystem\.out\.println', 3), + (r'\bpublic\s+static\s+void', 3), + ], + 'cpp': [ + (r'#include\s*<', 3), + (r'\bstd::', 3), + (r'\bnamespace\s+\w+', 2), + (r'cout\s*<<', 3), + (r'\bvoid\s+\w+\s*\(', 1), + ], + 'c': [ + (r'#include\s+<\w+\.h>', 4), + (r'\bprintf\s*\(', 3), + (r'\bmain\s*\(', 2), + (r'\bstruct\s+\w+', 2), + ], + 'csharp': [ + (r'\bnamespace\s+\w+', 3), + (r'\bpublic\s+class\s+\w+', 3), + (r'\busing\s+System', 3), + ], + 'go': [ + (r'\bfunc\s+\w+\s*\(', 3), + (r'\bpackage\s+\w+', 4), + (r':=', 2), + (r'\bfmt\.Print', 2), + ], + 'rust': [ + (r'\bfn\s+\w+\s*\(', 4), + (r'\blet\s+mut\s+\w+', 3), + (r'\bprintln!', 3), + (r'\bimpl\s+\w+', 2), + ], + 'php': [ + (r'<\?php', 5), + (r'\$\w+\s*=', 2), + (r'\bfunction\s+\w+\s*\(', 1), + ], + 'ruby': [ + (r'\bdef\s+\w+', 3), + (r'\bend\b', 2), + (r'\brequire\s+[\'"]', 2), + ], + 'swift': [ + (r'\bfunc\s+\w+\s*\(', 3), + (r'\bvar\s+\w+:', 2), + (r'\blet\s+\w+:', 2), + ], + 'kotlin': [ + (r'\bfun\s+\w+\s*\(', 4), + (r'\bval\s+\w+\s*=', 2), + (r'\bvar\s+\w+\s*=', 2), + ], + 'shell': [ + (r'#!/bin/bash', 5), + (r'#!/bin/sh', 5), + (r'\becho\s+', 1), + 
(r'\$\{?\w+\}?', 1), + ], + 'sql': [ + (r'\bSELECT\s+', 4), + (r'\bFROM\s+', 3), + (r'\bWHERE\s+', 2), + (r'\bINSERT\s+INTO', 4), + (r'\bCREATE\s+TABLE', 4), + ], + 'html': [ + (r'', 1), + ], + } + + # Calculate confidence scores for each language + scores = {} + for lang, lang_patterns in patterns.items(): + score = 0 + for pattern, weight in lang_patterns: + if re.search(pattern, code, re.IGNORECASE | re.MULTILINE): + score += weight + if score > 0: + scores[lang] = score + + if not scores: + return 'unknown', 0 + + # Get language with highest score + best_lang = max(scores, key=scores.get) + confidence = min(scores[best_lang] / 10.0, 1.0) # Normalize to 0-1 + + return best_lang, confidence + + def validate_code_syntax(self, code, language): + """ + Validate code syntax (basic checks). + Enhanced in B1.4 with syntax validation. + + Returns (is_valid, issues) tuple + """ + issues = [] + + # Common syntax checks + if not code.strip(): + return False, ['Empty code block'] + + # Language-specific validation + if language == 'python': + # Check indentation consistency + lines = code.split('\n') + indent_chars = set() + for line in lines: + if line.startswith(' '): + indent_chars.add('space') + elif line.startswith('\t'): + indent_chars.add('tab') + + if len(indent_chars) > 1: + issues.append('Mixed tabs and spaces') + + # Check for unclosed brackets/parens + open_count = code.count('(') + code.count('[') + code.count('{') + close_count = code.count(')') + code.count(']') + code.count('}') + if abs(open_count - close_count) > 2: # Allow small mismatch + issues.append('Unbalanced brackets') + + elif language in ['javascript', 'java', 'cpp', 'c', 'csharp', 'go']: + # Check for balanced braces + open_braces = code.count('{') + close_braces = code.count('}') + if abs(open_braces - close_braces) > 1: + issues.append('Unbalanced braces') + + elif language == 'json': + # Try to parse JSON + try: + json.loads(code) + except (json.JSONDecodeError, ValueError) as e: + 
issues.append(f'Invalid JSON syntax: {str(e)[:50]}') + + # General checks + # Check if code looks like natural language (too many common words) + common_words = ['the', 'and', 'for', 'with', 'this', 'that', 'have', 'from'] + word_count = sum(1 for word in common_words if word in code.lower()) + if word_count > 5 and len(code.split()) < 50: + issues.append('May be natural language, not code') + + # Check code/comment ratio + comment_lines = sum(1 for line in code.split('\n') if line.strip().startswith(('#', '//', '/*', '*', '--'))) + total_lines = len([l for l in code.split('\n') if l.strip()]) + if total_lines > 0 and comment_lines / total_lines > 0.7: + issues.append('Mostly comments') + + return len(issues) == 0, issues + + def score_code_quality(self, code, language, confidence): + """ + Score the quality/usefulness of detected code block. + New in B1.4. + + Returns quality score (0-10) + """ + score = 5.0 # Start with neutral score + + # Factor 1: Language detection confidence + score += confidence * 2.0 + + # Factor 2: Code length (not too short, not too long) + code_length = len(code.strip()) + if 20 <= code_length <= 500: + score += 1.0 + elif 500 < code_length <= 2000: + score += 0.5 + elif code_length < 10: + score -= 2.0 + + # Factor 3: Number of lines + lines = [l for l in code.split('\n') if l.strip()] + if 2 <= len(lines) <= 50: + score += 1.0 + elif len(lines) > 100: + score -= 1.0 + + # Factor 4: Has function/class definitions + if re.search(r'\b(def|function|class|func|fn|public class)\b', code): + score += 1.5 + + # Factor 5: Has meaningful variable names (not just x, y, i) + meaningful_vars = re.findall(r'\b[a-z_][a-z0-9_]{3,}\b', code.lower()) + if len(meaningful_vars) >= 2: + score += 1.0 + + # Factor 6: Syntax validation + is_valid, issues = self.validate_code_syntax(code, language) + if is_valid: + score += 1.0 + else: + score -= len(issues) * 0.5 + + # Clamp score to 0-10 range + return max(0, min(10, score)) + + def 
detect_code_blocks_by_font(self, page): + """ + Detect code blocks by analyzing font properties. + Monospace fonts typically indicate code. + + Returns list of detected code blocks with metadata. + """ + code_blocks = [] + blocks = page.get_text("dict")["blocks"] + + monospace_fonts = ['courier', 'mono', 'consolas', 'menlo', 'monaco', 'dejavu'] + + current_code = [] + current_font = None + + for block in blocks: + if 'lines' not in block: + continue + + for line in block['lines']: + for span in line['spans']: + font = span['font'].lower() + text = span['text'] + + # Check if font is monospace + is_monospace = any(mf in font for mf in monospace_fonts) + + if is_monospace: + # Accumulate code text + current_code.append(text) + current_font = span['font'] + else: + # End of code block + if current_code: + code_text = ''.join(current_code).strip() + if len(code_text) > 10: # Minimum code length + lang, confidence = self.detect_language_from_code(code_text) + quality = self.score_code_quality(code_text, lang, confidence) + is_valid, issues = self.validate_code_syntax(code_text, lang) + + code_blocks.append({ + 'code': code_text, + 'language': lang, + 'confidence': confidence, + 'quality_score': quality, + 'is_valid': is_valid, + 'validation_issues': issues if not is_valid else [], + 'font': current_font, + 'detection_method': 'font' + }) + current_code = [] + current_font = None + + # Handle final code block + if current_code: + code_text = ''.join(current_code).strip() + if len(code_text) > 10: + lang, confidence = self.detect_language_from_code(code_text) + quality = self.score_code_quality(code_text, lang, confidence) + is_valid, issues = self.validate_code_syntax(code_text, lang) + + code_blocks.append({ + 'code': code_text, + 'language': lang, + 'confidence': confidence, + 'quality_score': quality, + 'is_valid': is_valid, + 'validation_issues': issues if not is_valid else [], + 'font': current_font, + 'detection_method': 'font' + }) + + return code_blocks + + def 
detect_code_blocks_by_indent(self, text): + """ + Detect code blocks by indentation patterns. + Code often has consistent indentation. + + Returns list of detected code blocks. + """ + code_blocks = [] + lines = text.split('\n') + current_block = [] + indent_pattern = None + + for line in lines: + # Check for indentation (4 spaces or tab) + if line.startswith(' ') or line.startswith('\t'): + # Start or continue code block + if not indent_pattern: + indent_pattern = line[:4] if line.startswith(' ') else '\t' + current_block.append(line) + else: + # End of code block + if current_block and len(current_block) >= 2: # At least 2 lines + code_text = '\n'.join(current_block).strip() + if len(code_text) > 20: # Minimum code length + lang, confidence = self.detect_language_from_code(code_text) + quality = self.score_code_quality(code_text, lang, confidence) + is_valid, issues = self.validate_code_syntax(code_text, lang) + + code_blocks.append({ + 'code': code_text, + 'language': lang, + 'confidence': confidence, + 'quality_score': quality, + 'is_valid': is_valid, + 'validation_issues': issues if not is_valid else [], + 'detection_method': 'indent' + }) + current_block = [] + indent_pattern = None + + # Handle final block + if current_block and len(current_block) >= 2: + code_text = '\n'.join(current_block).strip() + if len(code_text) > 20: + lang, confidence = self.detect_language_from_code(code_text) + quality = self.score_code_quality(code_text, lang, confidence) + is_valid, issues = self.validate_code_syntax(code_text, lang) + + code_blocks.append({ + 'code': code_text, + 'language': lang, + 'confidence': confidence, + 'quality_score': quality, + 'is_valid': is_valid, + 'validation_issues': issues if not is_valid else [], + 'detection_method': 'indent' + }) + + return code_blocks + + def detect_code_blocks_by_pattern(self, text): + """ + Detect code blocks by common code patterns (keywords, syntax). + + Returns list of detected code snippets. 
+ """ + code_blocks = [] + + # Common code patterns that span multiple lines + patterns = [ + # Function definitions + (r'((?:def|function|func|fn|public|private)\s+\w+\s*\([^)]*\)\s*[{:]?[^}]*[}]?)', 'function'), + # Class definitions + (r'(class\s+\w+[^{]*\{[^}]*\})', 'class'), + # Import statements block + (r'((?:import|require|use|include)[^\n]+(?:\n(?:import|require|use|include)[^\n]+)*)', 'imports'), + ] + + for pattern, block_type in patterns: + matches = re.finditer(pattern, text, re.MULTILINE | re.DOTALL) + for match in matches: + code_text = match.group(1).strip() + if len(code_text) > 15: + lang, confidence = self.detect_language_from_code(code_text) + quality = self.score_code_quality(code_text, lang, confidence) + is_valid, issues = self.validate_code_syntax(code_text, lang) + + code_blocks.append({ + 'code': code_text, + 'language': lang, + 'confidence': confidence, + 'quality_score': quality, + 'is_valid': is_valid, + 'validation_issues': issues if not is_valid else [], + 'detection_method': 'pattern', + 'pattern_type': block_type + }) + + return code_blocks + + def detect_chapter_start(self, page_data): + """ + Detect if a page starts a new chapter/section. + + Returns (is_chapter_start, chapter_title) tuple. + """ + headings = page_data.get('headings', []) + + # Check for h1 or h2 at start of page + if headings: + first_heading = headings[0] + # H1 headings are strong indicators of chapters + if first_heading['level'] in ['h1', 'h2']: + return True, first_heading['text'] + + # Check for specific chapter markers in text + text = page_data.get('text', '') + first_line = text.split('\n')[0] if text else '' + + chapter_patterns = [ + r'^Chapter\s+\d+', + r'^Part\s+\d+', + r'^Section\s+\d+', + r'^\d+\.\s+[A-Z]', # "1. 
Introduction" + ] + + for pattern in chapter_patterns: + if re.match(pattern, first_line, re.IGNORECASE): + return True, first_line.strip() + + return False, None + + def merge_continued_code_blocks(self, pages): + """ + Merge code blocks that are split across pages. + + Detects when a code block at the end of one page continues + on the next page. + """ + for i in range(len(pages) - 1): + current_page = pages[i] + next_page = pages[i + 1] + + # Check if current page has code blocks + if not current_page['code_samples']: + continue + + # Get last code block of current page + last_code = current_page['code_samples'][-1] + + # Check if next page starts with code + if not next_page['code_samples']: + continue + + first_next_code = next_page['code_samples'][0] + + # Same language and detection method = likely continuation + if (last_code['language'] == first_next_code['language'] and + last_code['detection_method'] == first_next_code['detection_method']): + + # Check if last code block looks incomplete (doesn't end with closing brace/etc) + last_code_text = last_code['code'].rstrip() + continuation_indicators = [ + not last_code_text.endswith('}'), + not last_code_text.endswith(';'), + last_code_text.endswith(','), + last_code_text.endswith('\\'), + ] + + if any(continuation_indicators): + # Merge the code blocks + merged_code = last_code['code'] + '\n' + first_next_code['code'] + last_code['code'] = merged_code + last_code['merged_from_next_page'] = True + + # Remove the first code block from next page + next_page['code_samples'].pop(0) + next_page['code_blocks_count'] -= 1 + + self.log(f" Merged code block from page {i+1} to {i+2}") + + return pages + + def create_chunks(self, pages): + """ + Create chunks of pages for better organization. 
+ + Returns array of chunks, each containing: + - chunk_number + - start_page, end_page + - pages (array) + - chapter_title (if detected) + """ + if self.chunk_size == 0: + # No chunking - return all pages as one chunk + return [{ + 'chunk_number': 1, + 'start_page': 1, + 'end_page': len(pages), + 'pages': pages, + 'chapter_title': None + }] + + chunks = [] + current_chunk = [] + chunk_start = 0 + current_chapter = None + + for i, page in enumerate(pages): + # Check if this page starts a new chapter + is_chapter, chapter_title = self.detect_chapter_start(page) + + if is_chapter and current_chunk: + # Save current chunk before starting new one + chunks.append({ + 'chunk_number': len(chunks) + 1, + 'start_page': chunk_start + 1, + 'end_page': i, + 'pages': current_chunk, + 'chapter_title': current_chapter + }) + current_chunk = [] + chunk_start = i + current_chapter = chapter_title + + if not current_chapter and is_chapter: + current_chapter = chapter_title + + current_chunk.append(page) + + # Check if chunk size reached (but don't break chapters) + if not is_chapter and len(current_chunk) >= self.chunk_size: + chunks.append({ + 'chunk_number': len(chunks) + 1, + 'start_page': chunk_start + 1, + 'end_page': i + 1, + 'pages': current_chunk, + 'chapter_title': current_chapter + }) + current_chunk = [] + chunk_start = i + 1 + current_chapter = None + + # Add remaining pages as final chunk + if current_chunk: + chunks.append({ + 'chunk_number': len(chunks) + 1, + 'start_page': chunk_start + 1, + 'end_page': len(pages), + 'pages': current_chunk, + 'chapter_title': current_chapter + }) + + return chunks + + def extract_images_from_page(self, page, page_num): + """ + Extract images from a PDF page and save to disk (NEW in B1.5). + + Returns list of extracted image metadata. 
+ """ + if not self.extract_images: + # Just count images, don't extract + return [] + + extracted = [] + image_list = page.get_images() + + for img_index, img in enumerate(image_list): + try: + xref = img[0] # Image XREF number + base_image = self.doc.extract_image(xref) + + if not base_image: + continue + + image_bytes = base_image["image"] + image_ext = base_image["ext"] # png, jpeg, etc. + width = base_image.get("width", 0) + height = base_image.get("height", 0) + + # Filter out small images (icons, bullets, etc.) + if width < self.min_image_size or height < self.min_image_size: + self.log(f" Skipping small image: {width}x{height}") + continue + + # Generate filename + pdf_basename = Path(self.pdf_path).stem + image_filename = f"{pdf_basename}_page{page_num+1}_img{img_index+1}.{image_ext}" + + # Save image + image_path = Path(self.image_dir) / image_filename + image_path.parent.mkdir(parents=True, exist_ok=True) + + with open(image_path, "wb") as f: + f.write(image_bytes) + + # Store metadata + image_info = { + 'filename': image_filename, + 'path': str(image_path), + 'page_number': page_num + 1, + 'width': width, + 'height': height, + 'format': image_ext, + 'size_bytes': len(image_bytes), + 'xref': xref + } + + extracted.append(image_info) + self.extracted_images.append(image_info) + self.log(f" Extracted image: {image_filename} ({width}x{height})") + + except Exception as e: + self.log(f" Error extracting image {img_index}: {e}") + continue + + return extracted + + def extract_page(self, page_num): + """ + Extract content from a single PDF page. + + Returns dict with page content, code blocks, and metadata. 
+ """ + # Check cache first (Priority 3) + cache_key = f"page_{page_num}" + cached = self.get_cached(cache_key) + if cached is not None: + self.log(f" Page {page_num + 1}: Using cached data") + return cached + + page = self.doc.load_page(page_num) + + # Extract plain text (with OCR if enabled - Priority 2) + if self.use_ocr: + text = self.extract_text_with_ocr(page) + else: + text = page.get_text("text") + + # Extract markdown (better structure preservation) + markdown = page.get_text("markdown") + + # Extract tables (Priority 2) + tables = self.extract_tables_from_page(page) + + # Get page images (for diagrams) + images = page.get_images() + + # Extract images to files (NEW in B1.5) + extracted_images = self.extract_images_from_page(page, page_num) + + # Detect code blocks using multiple methods + font_code_blocks = self.detect_code_blocks_by_font(page) + indent_code_blocks = self.detect_code_blocks_by_indent(text) + pattern_code_blocks = self.detect_code_blocks_by_pattern(text) + + # Merge and deduplicate code blocks + all_code_blocks = font_code_blocks + indent_code_blocks + pattern_code_blocks + + # Simple deduplication by code content + unique_code = {} + for block in all_code_blocks: + code_hash = hash(block['code']) + if code_hash not in unique_code: + unique_code[code_hash] = block + else: + # Keep the one with higher quality score + if block['quality_score'] > unique_code[code_hash]['quality_score']: + unique_code[code_hash] = block + + code_samples = list(unique_code.values()) + + # Filter by minimum quality (NEW in B1.4) + if self.min_quality > 0: + code_samples_before = len(code_samples) + code_samples = [c for c in code_samples if c['quality_score'] >= self.min_quality] + filtered_count = code_samples_before - len(code_samples) + if filtered_count > 0: + self.log(f" Filtered out {filtered_count} low-quality code blocks (min_quality={self.min_quality})") + + # Sort by quality score (highest first) + code_samples.sort(key=lambda x: x['quality_score'], 
reverse=True) + + # Extract headings from markdown + headings = [] + for line in markdown.split('\n'): + if line.startswith('#'): + level = len(line) - len(line.lstrip('#')) + text = line.lstrip('#').strip() + if text: + headings.append({ + 'level': f'h{level}', + 'text': text + }) + + page_data = { + 'page_number': page_num + 1, # 1-indexed for humans + 'text': text.strip(), + 'markdown': markdown.strip(), + 'headings': headings, + 'code_samples': code_samples, + 'images_count': len(images), + 'extracted_images': extracted_images, # NEW in B1.5 + 'tables': tables, # NEW in Priority 2 + 'char_count': len(text), + 'code_blocks_count': len(code_samples), + 'tables_count': len(tables) # NEW in Priority 2 + } + + # Cache the result (Priority 3) + self.set_cached(cache_key, page_data) + + self.log(f" Page {page_num + 1}: {len(text)} chars, {len(code_samples)} code blocks, {len(headings)} headings, {len(extracted_images)} images, {len(tables)} tables") + + return page_data + + def extract_all(self): + """ + Extract content from all pages of the PDF. + Enhanced with password support and parallel processing. + + Returns dict with metadata and pages array. 
+ """ + print(f"\n📄 Extracting from: {self.pdf_path}") + + # Open PDF (with password support - Priority 2) + try: + self.doc = fitz.open(self.pdf_path) + + # Handle encrypted PDFs (Priority 2) + if self.doc.is_encrypted: + if self.password: + print(f" 🔐 PDF is encrypted, trying password...") + if self.doc.authenticate(self.password): + print(f" ✅ Password accepted") + else: + print(f" ❌ Invalid password") + return None + else: + print(f" ❌ PDF is encrypted but no password provided") + print(f" Use --password option to provide password") + return None + + except Exception as e: + print(f"❌ Error opening PDF: {e}") + return None + + print(f" Pages: {len(self.doc)}") + print(f" Metadata: {self.doc.metadata}") + + # Set up image directory (NEW in B1.5) + if self.extract_images and not self.image_dir: + pdf_basename = Path(self.pdf_path).stem + self.image_dir = f"output/{pdf_basename}_images" + print(f" Image directory: {self.image_dir}") + + # Show feature status + if self.use_ocr: + status = "✅ enabled" if TESSERACT_AVAILABLE else "⚠️ not available (install pytesseract)" + print(f" OCR: {status}") + if self.extract_tables: + print(f" Table extraction: ✅ enabled") + if self.parallel: + status = "✅ enabled" if CONCURRENT_AVAILABLE else "⚠️ not available" + print(f" Parallel processing: {status} ({self.max_workers} workers)") + if self.use_cache: + print(f" Caching: ✅ enabled") + + print("") + + # Extract each page (with parallel processing - Priority 3) + if self.parallel and CONCURRENT_AVAILABLE and len(self.doc) > 5: + print(f"🚀 Extracting {len(self.doc)} pages in parallel ({self.max_workers} workers)...") + with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor: + page_numbers = list(range(len(self.doc))) + self.pages = list(executor.map(self.extract_page, page_numbers)) + else: + # Sequential extraction + for page_num in range(len(self.doc)): + page_data = self.extract_page(page_num) + self.pages.append(page_data) + + # Merge code 
blocks that span across pages + self.log("\n🔗 Merging code blocks across pages...") + self.pages = self.merge_continued_code_blocks(self.pages) + + # Create chunks + self.log(f"\n📦 Creating chunks (chunk_size={self.chunk_size})...") + chunks = self.create_chunks(self.pages) + + # Build summary + total_chars = sum(p['char_count'] for p in self.pages) + total_code_blocks = sum(p['code_blocks_count'] for p in self.pages) + total_headings = sum(len(p['headings']) for p in self.pages) + total_images = sum(p['images_count'] for p in self.pages) + total_tables = sum(p['tables_count'] for p in self.pages) # NEW in Priority 2 + + # Detect languages used + languages = {} + all_code_blocks_list = [] + for page in self.pages: + for code in page['code_samples']: + lang = code['language'] + languages[lang] = languages.get(lang, 0) + 1 + all_code_blocks_list.append(code) + + # Calculate quality statistics (NEW in B1.4) + quality_stats = {} + if all_code_blocks_list: + quality_scores = [c['quality_score'] for c in all_code_blocks_list] + confidences = [c['confidence'] for c in all_code_blocks_list] + valid_count = sum(1 for c in all_code_blocks_list if c['is_valid']) + + quality_stats = { + 'average_quality': sum(quality_scores) / len(quality_scores), + 'average_confidence': sum(confidences) / len(confidences), + 'valid_code_blocks': valid_count, + 'invalid_code_blocks': total_code_blocks - valid_count, + 'validation_rate': valid_count / total_code_blocks if total_code_blocks > 0 else 0, + 'high_quality_blocks': sum(1 for s in quality_scores if s >= 7.0), + 'medium_quality_blocks': sum(1 for s in quality_scores if 4.0 <= s < 7.0), + 'low_quality_blocks': sum(1 for s in quality_scores if s < 4.0), + } + + # Extract chapter information + chapters = [] + for chunk in chunks: + if chunk['chapter_title']: + chapters.append({ + 'title': chunk['chapter_title'], + 'start_page': chunk['start_page'], + 'end_page': chunk['end_page'] + }) + + result = { + 'source_file': self.pdf_path, + 
def main():
    """CLI entry point: parse options, run PDFExtractor, write/print JSON."""
    parser = argparse.ArgumentParser(
        description='Extract text and code blocks from PDF documentation',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Extract from PDF
  python3 pdf_extractor_poc.py input.pdf

  # Save to JSON file
  python3 pdf_extractor_poc.py input.pdf --output result.json

  # Verbose mode
  python3 pdf_extractor_poc.py input.pdf --verbose

  # Extract and save
  python3 pdf_extractor_poc.py docs/python.pdf -o python_extracted.json -v
        """
    )

    parser.add_argument('pdf_file', help='Path to PDF file to extract')
    parser.add_argument('-o', '--output', help='Output JSON file path (default: print to stdout)')
    parser.add_argument('-v', '--verbose', action='store_true', help='Verbose output')
    parser.add_argument('--pretty', action='store_true', help='Pretty-print JSON output')
    parser.add_argument('--chunk-size', type=int, default=10,
                        help='Pages per chunk (0 = no chunking, default: 10)')
    parser.add_argument('--no-merge', action='store_true',
                        help='Disable merging code blocks across pages')
    parser.add_argument('--min-quality', type=float, default=0.0,
                        help='Minimum code quality score (0-10, default: 0 = no filtering)')
    parser.add_argument('--extract-images', action='store_true',
                        help='Extract images to files (NEW in B1.5)')
    parser.add_argument('--image-dir', type=str, default=None,
                        help='Directory to save extracted images (default: output/{pdf_name}_images)')
    parser.add_argument('--min-image-size', type=int, default=100,
                        help='Minimum image dimension in pixels (filters icons, default: 100)')

    # Advanced features (Priority 2 & 3)
    parser.add_argument('--ocr', action='store_true',
                        help='Use OCR for scanned PDFs (requires pytesseract)')
    parser.add_argument('--password', type=str, default=None,
                        help='Password for encrypted PDF')
    parser.add_argument('--extract-tables', action='store_true',
                        help='Extract tables from PDF (Priority 2)')
    parser.add_argument('--parallel', action='store_true',
                        help='Process pages in parallel (Priority 3)')
    parser.add_argument('--workers', type=int, default=None,
                        help='Number of parallel workers (default: CPU count)')
    parser.add_argument('--no-cache', action='store_true',
                        help='Disable caching of expensive operations')

    args = parser.parse_args()

    # Validate input file early so we fail fast with a clear message.
    if not os.path.exists(args.pdf_file):
        print(f"❌ Error: File not found: {args.pdf_file}")
        sys.exit(1)

    if not args.pdf_file.lower().endswith('.pdf'):
        print(f"⚠️  Warning: File does not have .pdf extension")

    # BUG FIX: --no-merge was parsed but never consulted anywhere, so it was
    # silently ignored. Until PDFExtractor exposes a merge toggle, warn the
    # user explicitly instead of pretending the flag took effect.
    if args.no_merge:
        print("⚠️  Warning: --no-merge is not implemented yet; cross-page code blocks will still be merged")

    # Extract
    extractor = PDFExtractor(
        args.pdf_file,
        verbose=args.verbose,
        chunk_size=args.chunk_size,
        min_quality=args.min_quality,
        extract_images=args.extract_images,
        image_dir=args.image_dir,
        min_image_size=args.min_image_size,
        # Advanced features (Priority 2 & 3)
        use_ocr=args.ocr,
        password=args.password,
        extract_tables=args.extract_tables,
        parallel=args.parallel,
        max_workers=args.workers,
        use_cache=not args.no_cache
    )
    result = extractor.extract_all()

    # extract_all() returns None on open/decrypt failure.
    if result is None:
        sys.exit(1)

    # Output: file when -o was given, otherwise stdout.
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            if args.pretty:
                json.dump(result, f, indent=2, ensure_ascii=False)
            else:
                json.dump(result, f, ensure_ascii=False)
        print(f"\n💾 Saved to: {args.output}")
    else:
        if args.pretty:
            print("\n" + json.dumps(result, indent=2, ensure_ascii=False))
        else:
            print(json.dumps(result, ensure_ascii=False))
"""
PDF Documentation to Claude Skill Converter (Task B1.6)

Converts PDF documentation into Claude AI skills.
Uses pdf_extractor_poc.py for extraction, builds skill structure.

Usage:
    python3 pdf_scraper.py --config configs/manual_pdf.json
    python3 pdf_scraper.py --pdf manual.pdf --name myskill
    python3 pdf_scraper.py --from-json manual_extracted.json
"""

import os
import sys
import json
import re
import argparse
from pathlib import Path


class PDFToSkillConverter:
    """Convert PDF documentation to a Claude skill directory tree.

    Workflow: extract_pdf() (or load_extracted_data()) populates
    self.extracted_data, then build_skill() writes SKILL.md plus
    references/ and assets/ under output/<name>/.
    """

    def __init__(self, config):
        # config: dict with at least 'name'; optional keys: 'pdf_path',
        # 'description', 'extract_options', 'categories'.
        self.config = config
        self.name = config['name']
        self.pdf_path = config.get('pdf_path', '')
        self.description = config.get('description', f'Documentation skill for {self.name}')

        # Output locations (relative to the current working directory).
        self.skill_dir = f"output/{self.name}"
        self.data_file = f"output/{self.name}_extracted.json"

        # Options forwarded to PDFExtractor.
        self.extract_options = config.get('extract_options', {})

        # Keyword-based categorization: {category_key: [keywords]}.
        self.categories = config.get('categories', {})

        # Populated by extract_pdf() or load_extracted_data().
        self.extracted_data = None

    def extract_pdf(self):
        """Extract content from self.pdf_path via PDFExtractor.

        Returns:
            True on success (extracted data cached to self.data_file).
        Raises:
            RuntimeError: if extraction produced no result.
        """
        # Imported lazily so --from-json workflows do not require the PDF
        # extraction dependencies to be importable at module load time.
        from .pdf_extractor_poc import PDFExtractor

        print(f"\n🔍 Extracting from PDF: {self.pdf_path}")

        extractor = PDFExtractor(
            self.pdf_path,
            verbose=True,
            chunk_size=self.extract_options.get('chunk_size', 10),
            min_quality=self.extract_options.get('min_quality', 5.0),
            extract_images=self.extract_options.get('extract_images', True),
            image_dir=f"{self.skill_dir}/assets/images",
            min_image_size=self.extract_options.get('min_image_size', 100)
        )

        result = extractor.extract_all()

        if not result:
            print("❌ Extraction failed")
            raise RuntimeError(f"Failed to extract PDF: {self.pdf_path}")

        # Ensure the output directory exists before caching the raw data
        # (fix: previously open() failed on a fresh checkout with no output/).
        os.makedirs(os.path.dirname(self.data_file) or '.', exist_ok=True)
        with open(self.data_file, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

        print(f"\n💾 Saved extracted data to: {self.data_file}")
        self.extracted_data = result
        return True

    def load_extracted_data(self, json_path):
        """Load previously extracted data from JSON and return True."""
        print(f"\n📂 Loading extracted data from: {json_path}")

        with open(json_path, 'r', encoding='utf-8') as f:
            self.extracted_data = json.load(f)

        print(f"✅ Loaded {self.extracted_data['total_pages']} pages")
        return True

    def categorize_content(self):
        """Group extracted pages into categories.

        Precedence: PDF chapter metadata, then keyword matching against
        self.categories, then a single catch-all 'content' category.
        Returns {category_key: {'title': str, 'pages': [page, ...]}}.
        """
        print(f"\n📋 Categorizing content...")

        categorized = {}

        # Use chapters if available
        if self.extracted_data.get('chapters'):
            for chapter in self.extracted_data['chapters']:
                category_key = self._sanitize_filename(chapter['title'])
                categorized[category_key] = {
                    'title': chapter['title'],
                    'pages': []
                }

            # Assign each page to the chapter whose page range contains it.
            for page in self.extracted_data['pages']:
                page_num = page['page_number']

                for chapter in self.extracted_data['chapters']:
                    if chapter['start_page'] <= page_num <= chapter['end_page']:
                        category_key = self._sanitize_filename(chapter['title'])
                        categorized[category_key]['pages'].append(page)
                        break

        # Fall back to keyword-based categorization
        elif self.categories:
            # Check if categories is already in the right format (for tests):
            # if the first value is a list of page dicts, use it as-is.
            first_value = next(iter(self.categories.values()))
            if isinstance(first_value, list) and first_value and isinstance(first_value[0], dict):
                for cat_key, pages in self.categories.items():
                    categorized[cat_key] = {
                        'title': cat_key.replace('_', ' ').title(),
                        'pages': pages
                    }
            else:
                # Keyword-based categorization: initialize empty buckets...
                for cat_key, keywords in self.categories.items():
                    categorized[cat_key] = {
                        'title': cat_key.replace('_', ' ').title(),
                        'pages': []
                    }

                # ...then score each page against every category's keywords.
                for page in self.extracted_data['pages']:
                    text = page.get('text', '').lower()
                    headings_text = ' '.join([h['text'] for h in page.get('headings', [])]).lower()

                    scores = {}
                    for cat_key, keywords in self.categories.items():
                        # Handle both string keywords and other shapes defensively.
                        if isinstance(keywords, list):
                            score = sum(1 for kw in keywords
                                        if isinstance(kw, str) and (kw.lower() in text or kw.lower() in headings_text))
                        else:
                            score = 0
                        if score > 0:
                            scores[cat_key] = score

                    # Assign to highest scoring category, or an 'other' bucket.
                    if scores:
                        best_cat = max(scores, key=scores.get)
                        categorized[best_cat]['pages'].append(page)
                    else:
                        if 'other' not in categorized:
                            categorized['other'] = {'title': 'Other', 'pages': []}
                        categorized['other']['pages'].append(page)

        else:
            # No categorization info at all - use a single category.
            categorized['content'] = {
                'title': 'Content',
                'pages': self.extracted_data['pages']
            }

        print(f"✅ Created {len(categorized)} categories")
        for cat_key, cat_data in categorized.items():
            print(f"   - {cat_data['title']}: {len(cat_data['pages'])} pages")

        return categorized

    def build_skill(self):
        """Build the complete skill structure under self.skill_dir."""
        print(f"\n🏗️  Building skill: {self.name}")

        # Create directories
        os.makedirs(f"{self.skill_dir}/references", exist_ok=True)
        os.makedirs(f"{self.skill_dir}/scripts", exist_ok=True)
        os.makedirs(f"{self.skill_dir}/assets", exist_ok=True)

        # Categorize content
        categorized = self.categorize_content()

        # Generate reference files
        print(f"\n📝 Generating reference files...")
        for cat_key, cat_data in categorized.items():
            self._generate_reference_file(cat_key, cat_data)

        # Generate index
        self._generate_index(categorized)

        # Generate SKILL.md
        self._generate_skill_md(categorized)

        print(f"\n✅ Skill built successfully: {self.skill_dir}/")
        print(f"\n📦 Next step: Package with: skill-seekers package {self.skill_dir}/")

    def _generate_reference_file(self, cat_key, cat_data):
        """Generate a reference markdown file for one category."""
        filename = f"{self.skill_dir}/references/{cat_key}.md"

        with open(filename, 'w', encoding='utf-8') as f:
            f.write(f"# {cat_data['title']}\n\n")

            for page in cat_data['pages']:
                # Add headings as section markers
                if page.get('headings'):
                    f.write(f"## {page['headings'][0]['text']}\n\n")

                # Add text content, capped at 1000 chars/page to bound file size.
                if page.get('text'):
                    text = page['text'][:1000]
                    f.write(f"{text}\n\n")

                # Add code samples ('code_samples' or 'code_blocks' for compatibility).
                code_list = page.get('code_samples') or page.get('code_blocks')
                if code_list:
                    f.write("### Code Examples\n\n")
                    for code in code_list[:3]:  # Limit to top 3
                        lang = code.get('language', '')
                        f.write(f"```{lang}\n{code['code']}\n```\n\n")

                # Add images, saved into assets/ and referenced relatively.
                if page.get('images'):
                    assets_dir = os.path.join(self.skill_dir, 'assets')
                    os.makedirs(assets_dir, exist_ok=True)

                    f.write("### Images\n\n")
                    for img in page['images']:
                        img_filename = f"page_{page['page_number']}_img_{img['index']}.png"
                        img_path = os.path.join(assets_dir, img_filename)

                        with open(img_path, 'wb') as img_file:
                            img_file.write(img['data'])

                        f.write(f"![Image {img['index']}](../assets/{img_filename})\n\n")

                f.write("---\n\n")

        # Fix: report the actual generated path (was a broken placeholder).
        print(f"   Generated: {filename}")

    def _generate_index(self, categorized):
        """Generate references/index.md listing categories and statistics."""
        filename = f"{self.skill_dir}/references/index.md"

        with open(filename, 'w', encoding='utf-8') as f:
            f.write(f"# {self.name.title()} Documentation Reference\n\n")
            f.write("## Categories\n\n")

            for cat_key, cat_data in categorized.items():
                page_count = len(cat_data['pages'])
                f.write(f"- [{cat_data['title']}]({cat_key}.md) ({page_count} pages)\n")

            f.write("\n## Statistics\n\n")
            stats = self.extracted_data.get('quality_statistics', {})
            f.write(f"- Total pages: {self.extracted_data.get('total_pages', 0)}\n")
            f.write(f"- Code blocks: {self.extracted_data.get('total_code_blocks', 0)}\n")
            f.write(f"- Images: {self.extracted_data.get('total_images', 0)}\n")
            if stats:
                f.write(f"- Average code quality: {stats.get('average_quality', 0):.1f}/10\n")
                f.write(f"- Valid code blocks: {stats.get('valid_code_blocks', 0)}\n")

        # Fix: report the actual generated path (was a broken placeholder).
        print(f"   Generated: {filename}")

    def _generate_skill_md(self, categorized):
        """Generate the main SKILL.md file with YAML frontmatter."""
        filename = f"{self.skill_dir}/SKILL.md"

        # Skill name constraints: lowercase, hyphens only, max 64 chars.
        skill_name = self.name.lower().replace('_', '-').replace(' ', '-')[:64]

        # Description constraint: max 1024 chars.
        desc = self.description[:1024] if len(self.description) > 1024 else self.description

        with open(filename, 'w', encoding='utf-8') as f:
            # Write YAML frontmatter
            f.write(f"---\n")
            f.write(f"name: {skill_name}\n")
            f.write(f"description: {desc}\n")
            f.write(f"---\n\n")

            f.write(f"# {self.name.title()} Documentation Skill\n\n")
            f.write(f"{self.description}\n\n")

            f.write("## When to use this skill\n\n")
            f.write(f"Use this skill when the user asks about {self.name} documentation, ")
            f.write("including API references, tutorials, examples, and best practices.\n\n")

            f.write("## What's included\n\n")
            f.write("This skill contains:\n\n")
            for cat_key, cat_data in categorized.items():
                f.write(f"- **{cat_data['title']}**: {len(cat_data['pages'])} pages\n")

            f.write("\n## Quick Reference\n\n")

            # Collect all code samples, then keep the 5 highest-quality ones.
            all_code = []
            for page in self.extracted_data['pages']:
                all_code.extend(page.get('code_samples', []))

            all_code.sort(key=lambda x: x.get('quality_score', 0), reverse=True)
            top_code = all_code[:5]

            if top_code:
                f.write("### Top Code Examples\n\n")
                for i, code in enumerate(top_code, 1):
                    # Fix: 'language' may be absent - use .get like everywhere else.
                    lang = code.get('language', '')
                    quality = code.get('quality_score', 0)
                    f.write(f"**Example {i}** (Quality: {quality:.1f}/10):\n\n")
                    f.write(f"```{lang}\n{code['code'][:300]}...\n```\n\n")

            f.write("## Navigation\n\n")
            f.write("See `references/index.md` for complete documentation structure.\n\n")

            # Add language statistics
            langs = self.extracted_data.get('languages_detected', {})
            if langs:
                f.write("## Languages Covered\n\n")
                for lang, count in sorted(langs.items(), key=lambda x: x[1], reverse=True):
                    f.write(f"- {lang}: {count} examples\n")

        # Fix: report the actual generated path (was a broken placeholder).
        print(f"   Generated: {filename}")

    def _sanitize_filename(self, name):
        """Convert an arbitrary title to a safe snake_case filename stem."""
        # Drop special chars, then collapse spaces/hyphens to underscores.
        safe = re.sub(r'[^\w\s-]', '', name.lower())
        safe = re.sub(r'[-\s]+', '_', safe)
        return safe


def main():
    """CLI entry point: --config, --pdf/--name, or --from-json mode."""
    parser = argparse.ArgumentParser(
        description='Convert PDF documentation to Claude skill',
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument('--config', help='PDF config JSON file')
    parser.add_argument('--pdf', help='Direct PDF file path')
    parser.add_argument('--name', help='Skill name (with --pdf)')
    parser.add_argument('--from-json', help='Build skill from extracted JSON')
    parser.add_argument('--description', help='Skill description')

    args = parser.parse_args()

    # Validate inputs
    if not (args.config or args.pdf or args.from_json):
        parser.error("Must specify --config, --pdf, or --from-json")

    # Load or create config
    if args.config:
        with open(args.config, 'r') as f:
            config = json.load(f)
    elif args.from_json:
        # Build skill from previously extracted JSON - no PDF step needed.
        name = Path(args.from_json).stem.replace('_extracted', '')
        config = {
            'name': name,
            'description': args.description or f'Documentation skill for {name}'
        }
        converter = PDFToSkillConverter(config)
        converter.load_extracted_data(args.from_json)
        converter.build_skill()
        return
    else:
        # Direct PDF mode
        if not args.name:
            parser.error("Must specify --name with --pdf")
        config = {
            'name': args.name,
            'pdf_path': args.pdf,
            'description': args.description or f'Documentation skill for {args.name}',
            'extract_options': {
                'chunk_size': 10,
                'min_quality': 5.0,
                'extract_images': True,
                'min_image_size': 100
            }
        }

    # Create converter
    converter = PDFToSkillConverter(config)

    # Extract if needed (extract_pdf raises on failure; the falsy check is a
    # defensive belt-and-braces for future return-value based implementations).
    if config.get('pdf_path'):
        if not converter.extract_pdf():
            sys.exit(1)

    # Build skill
    converter.build_skill()


if __name__ == '__main__':
    main()
"""
Quality Checker for Claude Skills
Validates skill quality, checks links, and generates quality reports.

Usage:
    python3 quality_checker.py output/react/
    python3 quality_checker.py output/godot/ --verbose
"""

import os
import re
import sys
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass, field


@dataclass
class QualityIssue:
    """Represents a quality issue found during validation."""
    level: str      # 'error', 'warning', 'info'
    category: str   # 'enhancement', 'content', 'links', 'structure'
    message: str
    file: Optional[str] = None
    line: Optional[int] = None


@dataclass
class QualityReport:
    """Complete quality report for a skill."""
    skill_name: str
    skill_path: Path
    errors: List[QualityIssue] = field(default_factory=list)
    warnings: List[QualityIssue] = field(default_factory=list)
    info: List[QualityIssue] = field(default_factory=list)

    def add_error(self, category: str, message: str, file: str = None, line: int = None):
        """Add an error to the report."""
        self.errors.append(QualityIssue('error', category, message, file, line))

    def add_warning(self, category: str, message: str, file: str = None, line: int = None):
        """Add a warning to the report."""
        self.warnings.append(QualityIssue('warning', category, message, file, line))

    def add_info(self, category: str, message: str, file: str = None, line: int = None):
        """Add info to the report."""
        self.info.append(QualityIssue('info', category, message, file, line))

    @property
    def has_errors(self) -> bool:
        """Check if there are any errors."""
        return len(self.errors) > 0

    @property
    def has_warnings(self) -> bool:
        """Check if there are any warnings."""
        return len(self.warnings) > 0

    @property
    def is_excellent(self) -> bool:
        """Check if quality is excellent (no errors, no warnings)."""
        return not self.has_errors and not self.has_warnings

    @property
    def quality_score(self) -> float:
        """Calculate quality score (0-100): -15 per error, -5 per warning."""
        score = 100.0
        score -= len(self.errors) * 15
        score -= len(self.warnings) * 5
        # Never go below 0
        return max(0.0, score)

    @property
    def quality_grade(self) -> str:
        """Get quality grade (A-F) from the quality score."""
        score = self.quality_score
        if score >= 90:
            return 'A'
        elif score >= 80:
            return 'B'
        elif score >= 70:
            return 'C'
        elif score >= 60:
            return 'D'
        else:
            return 'F'


class SkillQualityChecker:
    """Validates skill quality and generates reports."""

    def __init__(self, skill_dir: Path):
        """Initialize quality checker.

        Args:
            skill_dir: Path to skill directory
        """
        self.skill_dir = Path(skill_dir)
        self.skill_md_path = self.skill_dir / "SKILL.md"
        self.references_dir = self.skill_dir / "references"
        self.report = QualityReport(
            skill_name=self.skill_dir.name,
            skill_path=self.skill_dir
        )

    def check_all(self) -> QualityReport:
        """Run all quality checks and return report.

        Returns:
            QualityReport: Complete quality report
        """
        self._check_skill_structure()      # basic structure
        self._check_enhancement_quality()  # enhancement verification
        self._check_content_quality()      # content quality
        self._check_links()                # link validation
        return self.report

    def _check_skill_structure(self):
        """Check basic skill structure (SKILL.md and references/ presence)."""
        if not self.skill_md_path.exists():
            self.report.add_error(
                'structure',
                'SKILL.md file not found',
                str(self.skill_md_path)
            )
            return

        if not self.references_dir.exists():
            self.report.add_warning(
                'structure',
                'references/ directory not found - skill may be incomplete',
                str(self.references_dir)
            )
        elif not list(self.references_dir.glob('*.md')):
            self.report.add_warning(
                'structure',
                'references/ directory is empty - no reference documentation found',
                str(self.references_dir)
            )

    def _check_enhancement_quality(self):
        """Check if SKILL.md was properly enhanced (not left as a template)."""
        if not self.skill_md_path.exists():
            return

        content = self.skill_md_path.read_text(encoding='utf-8')

        # Template indicators: signs SKILL.md was never enhanced.
        template_indicators = [
            "TODO:",
            "[Add description]",
            "[Framework specific tips]",
            "coming soon",
        ]

        for indicator in template_indicators:
            if indicator.lower() in content.lower():
                self.report.add_warning(
                    'enhancement',
                    f'Found template placeholder: "{indicator}" - SKILL.md may not be enhanced',
                    'SKILL.md'
                )

        # Positive signals of enhancement.
        enhancement_indicators = {
            'code_examples': re.compile(r'```[\w-]+\n', re.MULTILINE),
            'real_examples': re.compile(r'Example:', re.IGNORECASE),
            'sections': re.compile(r'^## .+', re.MULTILINE),
        }

        code_blocks = len(enhancement_indicators['code_examples'].findall(content))
        real_examples = len(enhancement_indicators['real_examples'].findall(content))
        sections = len(enhancement_indicators['sections'].findall(content))

        # Quality thresholds
        if code_blocks == 0:
            self.report.add_warning(
                'enhancement',
                'No code examples found in SKILL.md - consider enhancing',
                'SKILL.md'
            )
        elif code_blocks < 3:
            self.report.add_info(
                'enhancement',
                f'Only {code_blocks} code examples found - more examples would improve quality',
                'SKILL.md'
            )
        else:
            self.report.add_info(
                'enhancement',
                f'✓ Found {code_blocks} code examples',
                'SKILL.md'
            )

        if sections < 4:
            self.report.add_warning(
                'enhancement',
                f'Only {sections} sections found - SKILL.md may be too basic',
                'SKILL.md'
            )
        else:
            self.report.add_info(
                'enhancement',
                f'✓ Found {sections} sections',
                'SKILL.md'
            )

    def _check_content_quality(self):
        """Check content quality: frontmatter, code fences, key sections, refs."""
        if not self.skill_md_path.exists():
            return

        content = self.skill_md_path.read_text(encoding='utf-8')

        # Check YAML frontmatter
        if not content.startswith('---'):
            self.report.add_error(
                'content',
                'Missing YAML frontmatter - SKILL.md must start with ---',
                'SKILL.md',
                1
            )
        else:
            try:
                frontmatter_match = re.match(r'^---\n(.*?)\n---', content, re.DOTALL)
                if frontmatter_match:
                    frontmatter = frontmatter_match.group(1)

                    # 'name' is required by the skill format.
                    if 'name:' not in frontmatter:
                        self.report.add_error(
                            'content',
                            'Missing "name:" field in YAML frontmatter',
                            'SKILL.md',
                            2
                        )

                    # 'description' is required too - fix: previously its
                    # absence was silently ignored.
                    if 'description:' in frontmatter:
                        self.report.add_info(
                            'content',
                            '✓ YAML frontmatter includes description',
                            'SKILL.md'
                        )
                    else:
                        self.report.add_warning(
                            'content',
                            'Missing "description:" field in YAML frontmatter',
                            'SKILL.md',
                            2
                        )
                else:
                    self.report.add_error(
                        'content',
                        'Invalid YAML frontmatter format',
                        'SKILL.md',
                        1
                    )
            except Exception as e:
                self.report.add_error(
                    'content',
                    f'Error parsing YAML frontmatter: {e}',
                    'SKILL.md',
                    1
                )

        # Check code block language tags
        code_blocks_without_lang = re.findall(r'```\n[^`]', content)
        if code_blocks_without_lang:
            self.report.add_warning(
                'content',
                f'Found {len(code_blocks_without_lang)} code blocks without language tags',
                'SKILL.md'
            )

        # Check for "When to Use" section
        if 'when to use' not in content.lower():
            self.report.add_warning(
                'content',
                'Missing "When to Use This Skill" section',
                'SKILL.md'
            )
        else:
            self.report.add_info(
                'content',
                '✓ Found "When to Use" section',
                'SKILL.md'
            )

        # Check reference files
        if self.references_dir.exists():
            ref_files = list(self.references_dir.glob('*.md'))
            if ref_files:
                self.report.add_info(
                    'content',
                    f'✓ Found {len(ref_files)} reference files',
                    'references/'
                )

                # Reference files should be discoverable from SKILL.md.
                mentioned_refs = 0
                for ref_file in ref_files:
                    if ref_file.name in content:
                        mentioned_refs += 1

                if mentioned_refs == 0:
                    self.report.add_warning(
                        'content',
                        'Reference files exist but none are mentioned in SKILL.md',
                        'SKILL.md'
                    )

    def _check_links(self):
        """Check internal markdown links resolve relative to the skill dir."""
        if not self.skill_md_path.exists():
            return

        content = self.skill_md_path.read_text(encoding='utf-8')

        # Find all markdown links [text](path)
        link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
        links = link_pattern.findall(content)

        broken_links = []

        for text, link in links:
            # Skip external links (http/https)
            if link.startswith('http://') or link.startswith('https://'):
                continue

            # Skip anchor links
            if link.startswith('#'):
                continue

            # Check if file exists (relative to SKILL.md)
            link_path = self.skill_dir / link
            if not link_path.exists():
                broken_links.append((text, link))

        if broken_links:
            for text, link in broken_links:
                self.report.add_warning(
                    'links',
                    f'Broken link: [{text}]({link})',
                    'SKILL.md'
                )
        else:
            if links:
                # Fix: only count links that were actually validated - exclude
                # external and anchor links (previously '#...' anchors were
                # reported as "valid internal links" without being checked).
                internal_links = [
                    l for t, l in links
                    if not (l.startswith('http://') or l.startswith('https://')
                            or l.startswith('#'))
                ]
                if internal_links:
                    self.report.add_info(
                        'links',
                        f'✓ All {len(internal_links)} internal links are valid',
                        'SKILL.md'
                    )


def _issue_location(issue: QualityIssue) -> str:
    """Format the ' (file:line)' suffix for a report line; '' when unknown."""
    if issue.file and issue.line:
        return f" ({issue.file}:{issue.line})"
    if issue.file:
        return f" ({issue.file})"
    return ""


def print_report(report: QualityReport, verbose: bool = False):
    """Print quality report to console.

    Args:
        report: Quality report to print
        verbose: Show all info messages
    """
    print("\n" + "=" * 60)
    print(f"QUALITY REPORT: {report.skill_name}")
    print("=" * 60)
    print()

    # Quality score
    print(f"Quality Score: {report.quality_score:.1f}/100 (Grade: {report.quality_grade})")
    print()

    # Errors
    if report.errors:
        print(f"❌ ERRORS ({len(report.errors)}):")
        for issue in report.errors:
            print(f"  [{issue.category}] {issue.message}{_issue_location(issue)}")
        print()

    # Warnings
    if report.warnings:
        print(f"⚠️  WARNINGS ({len(report.warnings)}):")
        for issue in report.warnings:
            print(f"  [{issue.category}] {issue.message}{_issue_location(issue)}")
        print()

    # Info (only in verbose mode)
    if verbose and report.info:
        print(f"ℹ️  INFO ({len(report.info)}):")
        for issue in report.info:
            print(f"  [{issue.category}] {issue.message}{_issue_location(issue)}")
        print()

    # Summary
    if report.is_excellent:
        print("✅ EXCELLENT! No issues found.")
    elif not report.has_errors:
        print("✓ GOOD! No errors, but some warnings to review.")
    else:
        print("❌ NEEDS IMPROVEMENT! Please fix errors before packaging.")

    print()


def main():
    """Main entry point."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Check skill quality and generate report",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic quality check
  python3 quality_checker.py output/react/

  # Verbose mode (show all info)
  python3 quality_checker.py output/godot/ --verbose

  # Exit with error code if issues found
  python3 quality_checker.py output/django/ --strict
"""
    )

    parser.add_argument(
        'skill_directory',
        help='Path to skill directory (e.g., output/react/)'
    )

    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Show all info messages'
    )

    parser.add_argument(
        '--strict',
        action='store_true',
        help='Exit with error code if any warnings or errors found'
    )

    args = parser.parse_args()

    # Check if directory exists
    skill_dir = Path(args.skill_directory)
    if not skill_dir.exists():
        print(f"❌ Directory not found: {skill_dir}")
        sys.exit(1)

    # Run quality checks
    checker = SkillQualityChecker(skill_dir)
    report = checker.check_all()

    # Print report
    print_report(report, verbose=args.verbose)

    # Exit code: --strict fails on warnings too; errors always fail.
    if args.strict and (report.has_errors or report.has_warnings):
        sys.exit(1)
    elif report.has_errors:
        sys.exit(1)
    else:
        sys.exit(0)


if __name__ == "__main__":
    main()
StringIO +from pathlib import Path + + +class ColoredTextTestResult(unittest.TextTestResult): + """Custom test result class with colored output""" + + # ANSI color codes + GREEN = '\033[92m' + RED = '\033[91m' + YELLOW = '\033[93m' + BLUE = '\033[94m' + RESET = '\033[0m' + BOLD = '\033[1m' + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.test_results = [] + + def addSuccess(self, test): + super().addSuccess(test) + self.test_results.append(('PASS', test)) + if self.showAll: + self.stream.write(f"{self.GREEN}✓ PASS{self.RESET}\n") + elif self.dots: + self.stream.write(f"{self.GREEN}.{self.RESET}") + self.stream.flush() + + def addError(self, test, err): + super().addError(test, err) + self.test_results.append(('ERROR', test)) + if self.showAll: + self.stream.write(f"{self.RED}✗ ERROR{self.RESET}\n") + elif self.dots: + self.stream.write(f"{self.RED}E{self.RESET}") + self.stream.flush() + + def addFailure(self, test, err): + super().addFailure(test, err) + self.test_results.append(('FAIL', test)) + if self.showAll: + self.stream.write(f"{self.RED}✗ FAIL{self.RESET}\n") + elif self.dots: + self.stream.write(f"{self.RED}F{self.RESET}") + self.stream.flush() + + def addSkip(self, test, reason): + super().addSkip(test, reason) + self.test_results.append(('SKIP', test)) + if self.showAll: + self.stream.write(f"{self.YELLOW}⊘ SKIP{self.RESET}\n") + elif self.dots: + self.stream.write(f"{self.YELLOW}s{self.RESET}") + self.stream.flush() + + +class ColoredTextTestRunner(unittest.TextTestRunner): + """Custom test runner with colored output""" + resultclass = ColoredTextTestResult + + +def discover_tests(test_dir='tests'): + """Discover all test files in the tests directory""" + loader = unittest.TestLoader() + start_dir = test_dir + pattern = 'test_*.py' + + suite = loader.discover(start_dir, pattern=pattern) + return suite + + +def run_specific_suite(suite_name): + """Run a specific test suite""" + loader = unittest.TestLoader() + + 
suite_map = { + 'config': 'tests.test_config_validation', + 'features': 'tests.test_scraper_features', + 'integration': 'tests.test_integration' + } + + if suite_name not in suite_map: + print(f"Unknown test suite: {suite_name}") + print(f"Available suites: {', '.join(suite_map.keys())}") + return None + + module_name = suite_map[suite_name] + try: + suite = loader.loadTestsFromName(module_name) + return suite + except Exception as e: + print(f"Error loading test suite '{suite_name}': {e}") + return None + + +def print_summary(result): + """Print a detailed test summary""" + total = result.testsRun + passed = total - len(result.failures) - len(result.errors) - len(result.skipped) + failed = len(result.failures) + errors = len(result.errors) + skipped = len(result.skipped) + + print("\n" + "="*70) + print("TEST SUMMARY") + print("="*70) + + # Overall stats + print(f"\n{ColoredTextTestResult.BOLD}Total Tests:{ColoredTextTestResult.RESET} {total}") + print(f"{ColoredTextTestResult.GREEN}✓ Passed:{ColoredTextTestResult.RESET} {passed}") + if failed > 0: + print(f"{ColoredTextTestResult.RED}✗ Failed:{ColoredTextTestResult.RESET} {failed}") + if errors > 0: + print(f"{ColoredTextTestResult.RED}✗ Errors:{ColoredTextTestResult.RESET} {errors}") + if skipped > 0: + print(f"{ColoredTextTestResult.YELLOW}⊘ Skipped:{ColoredTextTestResult.RESET} {skipped}") + + # Success rate + if total > 0: + success_rate = (passed / total) * 100 + color = ColoredTextTestResult.GREEN if success_rate == 100 else \ + ColoredTextTestResult.YELLOW if success_rate >= 80 else \ + ColoredTextTestResult.RED + print(f"\n{color}Success Rate: {success_rate:.1f}%{ColoredTextTestResult.RESET}") + + # Category breakdown + if hasattr(result, 'test_results'): + print(f"\n{ColoredTextTestResult.BOLD}Test Breakdown by Category:{ColoredTextTestResult.RESET}") + + categories = {} + for status, test in result.test_results: + test_name = str(test) + # Extract test class name + if '.' 
in test_name: + class_name = test_name.split('.')[0].split()[-1] + if class_name not in categories: + categories[class_name] = {'PASS': 0, 'FAIL': 0, 'ERROR': 0, 'SKIP': 0} + categories[class_name][status] += 1 + + for category, stats in sorted(categories.items()): + total_cat = sum(stats.values()) + passed_cat = stats['PASS'] + print(f" {category}: {passed_cat}/{total_cat} passed") + + print("\n" + "="*70) + + # Return status + return failed == 0 and errors == 0 + + +def main(): + """Main test runner""" + import argparse + + parser = argparse.ArgumentParser( + description='Run tests for Skill Seeker', + formatter_class=argparse.RawDescriptionHelpFormatter + ) + + parser.add_argument('--suite', '-s', type=str, + help='Run specific test suite (config, features, integration)') + parser.add_argument('--verbose', '-v', action='store_true', + help='Verbose output (show each test)') + parser.add_argument('--quiet', '-q', action='store_true', + help='Quiet output (minimal output)') + parser.add_argument('--failfast', '-f', action='store_true', + help='Stop on first failure') + parser.add_argument('--list', '-l', action='store_true', + help='List all available tests') + + args = parser.parse_args() + + # Set verbosity + verbosity = 1 + if args.verbose: + verbosity = 2 + elif args.quiet: + verbosity = 0 + + print(f"\n{ColoredTextTestResult.BOLD}{'='*70}{ColoredTextTestResult.RESET}") + print(f"{ColoredTextTestResult.BOLD}SKILL SEEKER TEST SUITE{ColoredTextTestResult.RESET}") + print(f"{ColoredTextTestResult.BOLD}{'='*70}{ColoredTextTestResult.RESET}\n") + + # Discover or load specific suite + if args.suite: + print(f"Running test suite: {ColoredTextTestResult.BLUE}{args.suite}{ColoredTextTestResult.RESET}\n") + suite = run_specific_suite(args.suite) + if suite is None: + return 1 + else: + print(f"Running {ColoredTextTestResult.BLUE}all tests{ColoredTextTestResult.RESET}\n") + suite = discover_tests() + + # List tests + if args.list: + print("\nAvailable tests:\n") + for 
test_group in suite: + for test in test_group: + print(f" - {test}") + print() + return 0 + + # Run tests + runner = ColoredTextTestRunner( + verbosity=verbosity, + failfast=args.failfast + ) + + result = runner.run(suite) + + # Print summary + success = print_summary(result) + + # Return appropriate exit code + return 0 if success else 1 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/split_config.py b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/split_config.py new file mode 100644 index 0000000..40551ad --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/split_config.py @@ -0,0 +1,320 @@ +#!/usr/bin/env python3 +""" +Config Splitter for Large Documentation Sites + +Splits large documentation configs into multiple smaller, focused skill configs. +Supports multiple splitting strategies: category-based, size-based, and automatic. 
+""" + +import json +import sys +import argparse +from pathlib import Path +from typing import Dict, List, Any, Tuple +from collections import defaultdict + + +class ConfigSplitter: + """Splits large documentation configs into multiple focused configs""" + + def __init__(self, config_path: str, strategy: str = "auto", target_pages: int = 5000): + self.config_path = Path(config_path) + self.strategy = strategy + self.target_pages = target_pages + self.config = self.load_config() + self.base_name = self.config['name'] + + def load_config(self) -> Dict[str, Any]: + """Load configuration from file""" + try: + with open(self.config_path, 'r') as f: + return json.load(f) + except FileNotFoundError: + print(f"❌ Error: Config file not found: {self.config_path}") + sys.exit(1) + except json.JSONDecodeError as e: + print(f"❌ Error: Invalid JSON in config file: {e}") + sys.exit(1) + + def get_split_strategy(self) -> str: + """Determine split strategy""" + # Check if strategy is defined in config + if 'split_strategy' in self.config: + config_strategy = self.config['split_strategy'] + if config_strategy != "none": + return config_strategy + + # Use provided strategy or auto-detect + if self.strategy == "auto": + max_pages = self.config.get('max_pages', 500) + + if max_pages < 5000: + print(f"ℹ️ Small documentation ({max_pages} pages) - no splitting needed") + return "none" + elif max_pages < 10000 and 'categories' in self.config: + print(f"ℹ️ Medium documentation ({max_pages} pages) - category split recommended") + return "category" + elif 'categories' in self.config and len(self.config['categories']) >= 3: + print(f"ℹ️ Large documentation ({max_pages} pages) - router + categories recommended") + return "router" + else: + print(f"ℹ️ Large documentation ({max_pages} pages) - size-based split") + return "size" + + return self.strategy + + def split_by_category(self, create_router: bool = False) -> List[Dict[str, Any]]: + """Split config by categories""" + if 'categories' not in 
self.config: + print("❌ Error: No categories defined in config") + sys.exit(1) + + categories = self.config['categories'] + split_categories = self.config.get('split_config', {}).get('split_by_categories') + + # If specific categories specified, use only those + if split_categories: + categories = {k: v for k, v in categories.items() if k in split_categories} + + configs = [] + + for category_name, keywords in categories.items(): + # Create new config for this category + new_config = self.config.copy() + new_config['name'] = f"{self.base_name}-{category_name}" + new_config['description'] = f"{self.base_name.capitalize()} - {category_name.replace('_', ' ').title()}. {self.config.get('description', '')}" + + # Update URL patterns to focus on this category + url_patterns = new_config.get('url_patterns', {}) + + # Add category keywords to includes + includes = url_patterns.get('include', []) + for keyword in keywords: + if keyword.startswith('/'): + includes.append(keyword) + + if includes: + url_patterns['include'] = list(set(includes)) + new_config['url_patterns'] = url_patterns + + # Keep only this category + new_config['categories'] = {category_name: keywords} + + # Remove split config from child + if 'split_strategy' in new_config: + del new_config['split_strategy'] + if 'split_config' in new_config: + del new_config['split_config'] + + # Adjust max_pages estimate + if 'max_pages' in new_config: + new_config['max_pages'] = self.target_pages + + configs.append(new_config) + + print(f"✅ Created {len(configs)} category-based configs") + + # Optionally create router config + if create_router: + router_config = self.create_router_config(configs) + configs.insert(0, router_config) + print(f"✅ Created router config: {router_config['name']}") + + return configs + + def split_by_size(self) -> List[Dict[str, Any]]: + """Split config by size (page count)""" + max_pages = self.config.get('max_pages', 500) + num_splits = (max_pages + self.target_pages - 1) // self.target_pages 
+ + configs = [] + + for i in range(num_splits): + new_config = self.config.copy() + part_num = i + 1 + new_config['name'] = f"{self.base_name}-part{part_num}" + new_config['description'] = f"{self.base_name.capitalize()} - Part {part_num}. {self.config.get('description', '')}" + new_config['max_pages'] = self.target_pages + + # Remove split config from child + if 'split_strategy' in new_config: + del new_config['split_strategy'] + if 'split_config' in new_config: + del new_config['split_config'] + + configs.append(new_config) + + print(f"✅ Created {len(configs)} size-based configs ({self.target_pages} pages each)") + return configs + + def create_router_config(self, sub_configs: List[Dict[str, Any]]) -> Dict[str, Any]: + """Create a router config that references sub-skills""" + router_name = self.config.get('split_config', {}).get('router_name', self.base_name) + + router_config = { + "name": router_name, + "description": self.config.get('description', ''), + "base_url": self.config['base_url'], + "selectors": self.config['selectors'], + "url_patterns": self.config.get('url_patterns', {}), + "rate_limit": self.config.get('rate_limit', 0.5), + "max_pages": 500, # Router only needs overview pages + "_router": True, + "_sub_skills": [cfg['name'] for cfg in sub_configs], + "_routing_keywords": { + cfg['name']: list(cfg.get('categories', {}).keys()) + for cfg in sub_configs + } + } + + return router_config + + def split(self) -> List[Dict[str, Any]]: + """Execute split based on strategy""" + strategy = self.get_split_strategy() + + print(f"\n{'='*60}") + print(f"CONFIG SPLITTER: {self.base_name}") + print(f"{'='*60}") + print(f"Strategy: {strategy}") + print(f"Target pages per skill: {self.target_pages}") + print("") + + if strategy == "none": + print("ℹ️ No splitting required") + return [self.config] + + elif strategy == "category": + return self.split_by_category(create_router=False) + + elif strategy == "router": + create_router = self.config.get('split_config', 
{}).get('create_router', True) + return self.split_by_category(create_router=create_router) + + elif strategy == "size": + return self.split_by_size() + + else: + print(f"❌ Error: Unknown strategy: {strategy}") + sys.exit(1) + + def save_configs(self, configs: List[Dict[str, Any]], output_dir: Path = None) -> List[Path]: + """Save configs to files""" + if output_dir is None: + output_dir = self.config_path.parent + + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + saved_files = [] + + for config in configs: + filename = f"{config['name']}.json" + filepath = output_dir / filename + + with open(filepath, 'w') as f: + json.dump(config, f, indent=2) + + saved_files.append(filepath) + print(f" 💾 Saved: {filepath}") + + return saved_files + + +def main(): + parser = argparse.ArgumentParser( + description="Split large documentation configs into multiple focused skills", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Auto-detect strategy + python3 split_config.py configs/godot.json + + # Use category-based split + python3 split_config.py configs/godot.json --strategy category + + # Use router + categories + python3 split_config.py configs/godot.json --strategy router + + # Custom target size + python3 split_config.py configs/godot.json --target-pages 3000 + + # Dry run (don't save files) + python3 split_config.py configs/godot.json --dry-run + +Split Strategies: + none - No splitting (single skill) + auto - Automatically choose best strategy + category - Split by categories defined in config + router - Create router + category-based sub-skills + size - Split by page count + """ + ) + + parser.add_argument( + 'config', + help='Path to config file (e.g., configs/godot.json)' + ) + + parser.add_argument( + '--strategy', + choices=['auto', 'none', 'category', 'router', 'size'], + default='auto', + help='Splitting strategy (default: auto)' + ) + + parser.add_argument( + '--target-pages', + type=int, + 
default=5000, + help='Target pages per skill (default: 5000)' + ) + + parser.add_argument( + '--output-dir', + help='Output directory for configs (default: same as input)' + ) + + parser.add_argument( + '--dry-run', + action='store_true', + help='Show what would be created without saving files' + ) + + args = parser.parse_args() + + # Create splitter + splitter = ConfigSplitter(args.config, args.strategy, args.target_pages) + + # Split config + configs = splitter.split() + + if args.dry_run: + print(f"\n{'='*60}") + print("DRY RUN - No files saved") + print(f"{'='*60}") + print(f"Would create {len(configs)} config files:") + for cfg in configs: + is_router = cfg.get('_router', False) + router_marker = " (ROUTER)" if is_router else "" + print(f" 📄 {cfg['name']}.json{router_marker}") + else: + print(f"\n{'='*60}") + print("SAVING CONFIGS") + print(f"{'='*60}") + saved_files = splitter.save_configs(configs, args.output_dir) + + print(f"\n{'='*60}") + print("NEXT STEPS") + print(f"{'='*60}") + print("1. Review generated configs") + print("2. Scrape each config:") + for filepath in saved_files: + print(f" skill-seekers scrape --config {filepath}") + print("3. Package skills:") + print(" skill-seekers-package-multi configs/-*.json") + print("") + + +if __name__ == "__main__": + main() diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/test_unified_simple.py b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/test_unified_simple.py new file mode 100644 index 0000000..f759fd1 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/test_unified_simple.py @@ -0,0 +1,192 @@ +#!/usr/bin/env python3 +""" +Simple Integration Tests for Unified Multi-Source Scraper + +Focuses on real-world usage patterns rather than unit tests. 
+""" + +import os +import sys +import json +import tempfile +from pathlib import Path + +# Add CLI to path +sys.path.insert(0, str(Path(__file__).parent)) + +from .config_validator import validate_config + +def test_validate_existing_unified_configs(): + """Test that all existing unified configs are valid""" + configs_dir = Path(__file__).parent.parent / 'configs' + + unified_configs = [ + 'godot_unified.json', + 'react_unified.json', + 'django_unified.json', + 'fastapi_unified.json' + ] + + for config_name in unified_configs: + config_path = configs_dir / config_name + if config_path.exists(): + print(f"\n✓ Validating {config_name}...") + validator = validate_config(str(config_path)) + assert validator.is_unified, f"{config_name} should be unified format" + assert validator.needs_api_merge(), f"{config_name} should need API merging" + print(f" Sources: {len(validator.config['sources'])}") + print(f" Merge mode: {validator.config.get('merge_mode')}") + + +def test_backward_compatibility(): + """Test that legacy configs still work""" + configs_dir = Path(__file__).parent.parent / 'configs' + + legacy_configs = [ + 'react.json', + 'godot.json', + 'django.json' + ] + + for config_name in legacy_configs: + config_path = configs_dir / config_name + if config_path.exists(): + print(f"\n✓ Validating legacy {config_name}...") + validator = validate_config(str(config_path)) + assert not validator.is_unified, f"{config_name} should be legacy format" + print(f" Format: Legacy") + + +def test_create_temp_unified_config(): + """Test creating a unified config from scratch""" + config = { + "name": "test_unified", + "description": "Test unified config", + "merge_mode": "rule-based", + "sources": [ + { + "type": "documentation", + "base_url": "https://example.com/docs", + "extract_api": True, + "max_pages": 50 + }, + { + "type": "github", + "repo": "test/repo", + "include_code": True, + "code_analysis_depth": "surface" + } + ] + } + + with tempfile.NamedTemporaryFile(mode='w', 
suffix='.json', delete=False) as f: + json.dump(config, f) + config_path = f.name + + try: + print("\n✓ Validating temp unified config...") + validator = validate_config(config_path) + assert validator.is_unified + assert validator.needs_api_merge() + assert len(validator.config['sources']) == 2 + print(" ✓ Config is valid unified format") + print(f" Sources: {len(validator.config['sources'])}") + finally: + os.unlink(config_path) + + +def test_mixed_source_types(): + """Test config with documentation, GitHub, and PDF sources""" + config = { + "name": "test_mixed", + "description": "Test mixed sources", + "merge_mode": "rule-based", + "sources": [ + { + "type": "documentation", + "base_url": "https://example.com" + }, + { + "type": "github", + "repo": "test/repo" + }, + { + "type": "pdf", + "path": "/path/to/manual.pdf" + } + ] + } + + with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: + json.dump(config, f) + config_path = f.name + + try: + print("\n✓ Validating mixed source types...") + validator = validate_config(config_path) + assert validator.is_unified + assert len(validator.config['sources']) == 3 + + # Check each source type + source_types = [s['type'] for s in validator.config['sources']] + assert 'documentation' in source_types + assert 'github' in source_types + assert 'pdf' in source_types + print(" ✓ All 3 source types validated") + finally: + os.unlink(config_path) + + +def test_config_validation_errors(): + """Test that invalid configs are rejected""" + # Invalid source type + config = { + "name": "test", + "description": "Test", + "sources": [ + {"type": "invalid_type", "url": "https://example.com"} + ] + } + + with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: + json.dump(config, f) + config_path = f.name + + try: + print("\n✓ Testing invalid source type...") + try: + # validate_config() calls .validate() automatically + validator = validate_config(config_path) + assert False, "Should have 
raised error for invalid source type" + except ValueError as e: + assert "Invalid" in str(e) or "invalid" in str(e) + print(" ✓ Invalid source type correctly rejected") + finally: + os.unlink(config_path) + + +# Run tests +if __name__ == '__main__': + print("=" * 60) + print("Running Unified Scraper Integration Tests") + print("=" * 60) + + try: + test_validate_existing_unified_configs() + test_backward_compatibility() + test_create_temp_unified_config() + test_mixed_source_types() + test_config_validation_errors() + + print("\n" + "=" * 60) + print("✅ All integration tests passed!") + print("=" * 60) + + except AssertionError as e: + print(f"\n❌ Test failed: {e}") + sys.exit(1) + except Exception as e: + print(f"\n❌ Unexpected error: {e}") + import traceback + traceback.print_exc() + sys.exit(1) diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/unified_scraper.py b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/unified_scraper.py new file mode 100644 index 0000000..81d2bc1 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/unified_scraper.py @@ -0,0 +1,450 @@ +#!/usr/bin/env python3 +""" +Unified Multi-Source Scraper + +Orchestrates scraping from multiple sources (documentation, GitHub, PDF), +detects conflicts, merges intelligently, and builds unified skills. + +This is the main entry point for unified config workflow. 
+ +Usage: + skill-seekers unified --config configs/godot_unified.json + skill-seekers unified --config configs/react_unified.json --merge-mode claude-enhanced +""" + +import os +import sys +import json +import logging +import argparse +import subprocess +from pathlib import Path +from typing import Dict, List, Any, Optional + +# Import validators and scrapers +try: + from config_validator import ConfigValidator, validate_config + from conflict_detector import ConflictDetector + from merge_sources import RuleBasedMerger, ClaudeEnhancedMerger + from unified_skill_builder import UnifiedSkillBuilder +except ImportError as e: + print(f"Error importing modules: {e}") + print("Make sure you're running from the project root directory") + sys.exit(1) + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + + +class UnifiedScraper: + """ + Orchestrates multi-source scraping and merging. + + Main workflow: + 1. Load and validate unified config + 2. Scrape all sources (docs, GitHub, PDF) + 3. Detect conflicts between sources + 4. Merge intelligently (rule-based or Claude-enhanced) + 5. Build unified skill + """ + + def __init__(self, config_path: str, merge_mode: Optional[str] = None): + """ + Initialize unified scraper. 
+ + Args: + config_path: Path to unified config JSON + merge_mode: Override config merge_mode ('rule-based' or 'claude-enhanced') + """ + self.config_path = config_path + + # Validate and load config + logger.info(f"Loading config: {config_path}") + self.validator = validate_config(config_path) + self.config = self.validator.config + + # Determine merge mode + self.merge_mode = merge_mode or self.config.get('merge_mode', 'rule-based') + logger.info(f"Merge mode: {self.merge_mode}") + + # Storage for scraped data + self.scraped_data = {} + + # Output paths + self.name = self.config['name'] + self.output_dir = f"output/{self.name}" + self.data_dir = f"output/{self.name}_unified_data" + + os.makedirs(self.output_dir, exist_ok=True) + os.makedirs(self.data_dir, exist_ok=True) + + def scrape_all_sources(self): + """ + Scrape all configured sources. + + Routes to appropriate scraper based on source type. + """ + logger.info("=" * 60) + logger.info("PHASE 1: Scraping all sources") + logger.info("=" * 60) + + if not self.validator.is_unified: + logger.warning("Config is not unified format, converting...") + self.config = self.validator.convert_legacy_to_unified() + + sources = self.config.get('sources', []) + + for i, source in enumerate(sources): + source_type = source['type'] + logger.info(f"\n[{i+1}/{len(sources)}] Scraping {source_type} source...") + + try: + if source_type == 'documentation': + self._scrape_documentation(source) + elif source_type == 'github': + self._scrape_github(source) + elif source_type == 'pdf': + self._scrape_pdf(source) + else: + logger.warning(f"Unknown source type: {source_type}") + except Exception as e: + logger.error(f"Error scraping {source_type}: {e}") + logger.info("Continuing with other sources...") + + logger.info(f"\n✅ Scraped {len(self.scraped_data)} sources successfully") + + def _scrape_documentation(self, source: Dict[str, Any]): + """Scrape documentation website.""" + # Create temporary config for doc scraper + doc_config = { + 
'name': f"{self.name}_docs", + 'base_url': source['base_url'], + 'selectors': source.get('selectors', {}), + 'url_patterns': source.get('url_patterns', {}), + 'categories': source.get('categories', {}), + 'rate_limit': source.get('rate_limit', 0.5), + 'max_pages': source.get('max_pages', 100) + } + + # Write temporary config + temp_config_path = os.path.join(self.data_dir, 'temp_docs_config.json') + with open(temp_config_path, 'w') as f: + json.dump(doc_config, f, indent=2) + + # Run doc_scraper as subprocess + logger.info(f"Scraping documentation from {source['base_url']}") + + doc_scraper_path = Path(__file__).parent / "doc_scraper.py" + cmd = [sys.executable, str(doc_scraper_path), '--config', temp_config_path] + + result = subprocess.run(cmd, capture_output=True, text=True) + + if result.returncode != 0: + logger.error(f"Documentation scraping failed: {result.stderr}") + return + + # Load scraped data + docs_data_file = f"output/{doc_config['name']}_data/summary.json" + + if os.path.exists(docs_data_file): + with open(docs_data_file, 'r') as f: + summary = json.load(f) + + self.scraped_data['documentation'] = { + 'pages': summary.get('pages', []), + 'data_file': docs_data_file + } + + logger.info(f"✅ Documentation: {summary.get('total_pages', 0)} pages scraped") + else: + logger.warning("Documentation data file not found") + + # Clean up temp config + if os.path.exists(temp_config_path): + os.remove(temp_config_path) + + def _scrape_github(self, source: Dict[str, Any]): + """Scrape GitHub repository.""" + sys.path.insert(0, str(Path(__file__).parent)) + + try: + from github_scraper import GitHubScraper + except ImportError: + logger.error("github_scraper.py not found") + return + + # Create config for GitHub scraper + github_config = { + 'repo': source['repo'], + 'name': f"{self.name}_github", + 'github_token': source.get('github_token'), + 'include_issues': source.get('include_issues', True), + 'max_issues': source.get('max_issues', 100), + 
'include_changelog': source.get('include_changelog', True), + 'include_releases': source.get('include_releases', True), + 'include_code': source.get('include_code', True), + 'code_analysis_depth': source.get('code_analysis_depth', 'surface'), + 'file_patterns': source.get('file_patterns', []), + 'local_repo_path': source.get('local_repo_path') # Pass local_repo_path from config + } + + # Scrape + logger.info(f"Scraping GitHub repository: {source['repo']}") + scraper = GitHubScraper(github_config) + github_data = scraper.scrape() + + # Save data + github_data_file = os.path.join(self.data_dir, 'github_data.json') + with open(github_data_file, 'w') as f: + json.dump(github_data, f, indent=2, ensure_ascii=False) + + self.scraped_data['github'] = { + 'data': github_data, + 'data_file': github_data_file + } + + logger.info(f"✅ GitHub: Repository scraped successfully") + + def _scrape_pdf(self, source: Dict[str, Any]): + """Scrape PDF document.""" + sys.path.insert(0, str(Path(__file__).parent)) + + try: + from pdf_scraper import PDFToSkillConverter + except ImportError: + logger.error("pdf_scraper.py not found") + return + + # Create config for PDF scraper + pdf_config = { + 'name': f"{self.name}_pdf", + 'pdf': source['path'], + 'extract_tables': source.get('extract_tables', False), + 'ocr': source.get('ocr', False), + 'password': source.get('password') + } + + # Scrape + logger.info(f"Scraping PDF: {source['path']}") + converter = PDFToSkillConverter(pdf_config) + pdf_data = converter.extract_all() + + # Save data + pdf_data_file = os.path.join(self.data_dir, 'pdf_data.json') + with open(pdf_data_file, 'w') as f: + json.dump(pdf_data, f, indent=2, ensure_ascii=False) + + self.scraped_data['pdf'] = { + 'data': pdf_data, + 'data_file': pdf_data_file + } + + logger.info(f"✅ PDF: {len(pdf_data.get('pages', []))} pages extracted") + + def detect_conflicts(self) -> List: + """ + Detect conflicts between documentation and code. 
+ + Only applicable if both documentation and GitHub sources exist. + + Returns: + List of conflicts + """ + logger.info("\n" + "=" * 60) + logger.info("PHASE 2: Detecting conflicts") + logger.info("=" * 60) + + if not self.validator.needs_api_merge(): + logger.info("No API merge needed (only one API source)") + return [] + + # Get documentation and GitHub data + docs_data = self.scraped_data.get('documentation', {}) + github_data = self.scraped_data.get('github', {}) + + if not docs_data or not github_data: + logger.warning("Missing documentation or GitHub data for conflict detection") + return [] + + # Load data files + with open(docs_data['data_file'], 'r') as f: + docs_json = json.load(f) + + with open(github_data['data_file'], 'r') as f: + github_json = json.load(f) + + # Detect conflicts + detector = ConflictDetector(docs_json, github_json) + conflicts = detector.detect_all_conflicts() + + # Save conflicts + conflicts_file = os.path.join(self.data_dir, 'conflicts.json') + detector.save_conflicts(conflicts, conflicts_file) + + # Print summary + summary = detector.generate_summary(conflicts) + logger.info(f"\n📊 Conflict Summary:") + logger.info(f" Total: {summary['total']}") + logger.info(f" By Type:") + for ctype, count in summary['by_type'].items(): + if count > 0: + logger.info(f" - {ctype}: {count}") + logger.info(f" By Severity:") + for severity, count in summary['by_severity'].items(): + if count > 0: + emoji = '🔴' if severity == 'high' else '🟡' if severity == 'medium' else '🟢' + logger.info(f" {emoji} {severity}: {count}") + + return conflicts + + def merge_sources(self, conflicts: List): + """ + Merge data from multiple sources. 
+ + Args: + conflicts: List of detected conflicts + """ + logger.info("\n" + "=" * 60) + logger.info(f"PHASE 3: Merging sources ({self.merge_mode})") + logger.info("=" * 60) + + if not conflicts: + logger.info("No conflicts to merge") + return None + + # Get data files + docs_data = self.scraped_data.get('documentation', {}) + github_data = self.scraped_data.get('github', {}) + + # Load data + with open(docs_data['data_file'], 'r') as f: + docs_json = json.load(f) + + with open(github_data['data_file'], 'r') as f: + github_json = json.load(f) + + # Choose merger + if self.merge_mode == 'claude-enhanced': + merger = ClaudeEnhancedMerger(docs_json, github_json, conflicts) + else: + merger = RuleBasedMerger(docs_json, github_json, conflicts) + + # Merge + merged_data = merger.merge_all() + + # Save merged data + merged_file = os.path.join(self.data_dir, 'merged_data.json') + with open(merged_file, 'w') as f: + json.dump(merged_data, f, indent=2, ensure_ascii=False) + + logger.info(f"✅ Merged data saved: {merged_file}") + + return merged_data + + def build_skill(self, merged_data: Optional[Dict] = None): + """ + Build final unified skill. + + Args: + merged_data: Merged API data (if conflicts were resolved) + """ + logger.info("\n" + "=" * 60) + logger.info("PHASE 4: Building unified skill") + logger.info("=" * 60) + + # Load conflicts if they exist + conflicts = [] + conflicts_file = os.path.join(self.data_dir, 'conflicts.json') + if os.path.exists(conflicts_file): + with open(conflicts_file, 'r') as f: + conflicts_data = json.load(f) + conflicts = conflicts_data.get('conflicts', []) + + # Build skill + builder = UnifiedSkillBuilder( + self.config, + self.scraped_data, + merged_data, + conflicts + ) + + builder.build() + + logger.info(f"✅ Unified skill built: {self.output_dir}/") + + def run(self): + """ + Execute complete unified scraping workflow. 
+ """ + logger.info("\n" + "🚀 " * 20) + logger.info(f"Unified Scraper: {self.config['name']}") + logger.info("🚀 " * 20 + "\n") + + try: + # Phase 1: Scrape all sources + self.scrape_all_sources() + + # Phase 2: Detect conflicts (if applicable) + conflicts = self.detect_conflicts() + + # Phase 3: Merge sources (if conflicts exist) + merged_data = None + if conflicts: + merged_data = self.merge_sources(conflicts) + + # Phase 4: Build skill + self.build_skill(merged_data) + + logger.info("\n" + "✅ " * 20) + logger.info("Unified scraping complete!") + logger.info("✅ " * 20 + "\n") + + logger.info(f"📁 Output: {self.output_dir}/") + logger.info(f"📁 Data: {self.data_dir}/") + + except KeyboardInterrupt: + logger.info("\n\n⚠️ Scraping interrupted by user") + sys.exit(1) + except Exception as e: + logger.error(f"\n\n❌ Error during scraping: {e}") + import traceback + traceback.print_exc() + sys.exit(1) + + +def main(): + """Main entry point.""" + parser = argparse.ArgumentParser( + description='Unified multi-source scraper', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Basic usage with unified config + skill-seekers unified --config configs/godot_unified.json + + # Override merge mode + skill-seekers unified --config configs/react_unified.json --merge-mode claude-enhanced + + # Backward compatible with legacy configs + skill-seekers unified --config configs/react.json + """ + ) + + parser.add_argument('--config', '-c', required=True, + help='Path to unified config JSON file') + parser.add_argument('--merge-mode', '-m', + choices=['rule-based', 'claude-enhanced'], + help='Override config merge mode') + + args = parser.parse_args() + + # Create and run scraper + scraper = UnifiedScraper(args.config, args.merge_mode) + scraper.run() + + +if __name__ == '__main__': + main() diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/unified_skill_builder.py 
b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/unified_skill_builder.py new file mode 100644 index 0000000..dd3051d --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/unified_skill_builder.py @@ -0,0 +1,444 @@ +#!/usr/bin/env python3 +""" +Unified Skill Builder + +Generates final skill structure from merged multi-source data: +- SKILL.md with merged APIs and conflict warnings +- references/ with organized content by source +- Inline conflict markers (⚠️) +- Separate conflicts summary section + +Supports mixed sources (documentation, GitHub, PDF) and highlights +discrepancies transparently. +""" + +import os +import json +import logging +from pathlib import Path +from typing import Dict, List, Any, Optional + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class UnifiedSkillBuilder: + """ + Builds unified skill from multi-source data. + """ + + def __init__(self, config: Dict, scraped_data: Dict, + merged_data: Optional[Dict] = None, conflicts: Optional[List] = None): + """ + Initialize skill builder. 
+ + Args: + config: Unified config dict + scraped_data: Dict of scraped data by source type + merged_data: Merged API data (if conflicts were resolved) + conflicts: List of detected conflicts + """ + self.config = config + self.scraped_data = scraped_data + self.merged_data = merged_data + self.conflicts = conflicts or [] + + self.name = config['name'] + self.description = config['description'] + self.skill_dir = f"output/{self.name}" + + # Create directories + os.makedirs(self.skill_dir, exist_ok=True) + os.makedirs(f"{self.skill_dir}/references", exist_ok=True) + os.makedirs(f"{self.skill_dir}/scripts", exist_ok=True) + os.makedirs(f"{self.skill_dir}/assets", exist_ok=True) + + def build(self): + """Build complete skill structure.""" + logger.info(f"Building unified skill: {self.name}") + + # Generate main SKILL.md + self._generate_skill_md() + + # Generate reference files by source + self._generate_references() + + # Generate conflicts report (if any) + if self.conflicts: + self._generate_conflicts_report() + + logger.info(f"✅ Unified skill built: {self.skill_dir}/") + + def _generate_skill_md(self): + """Generate main SKILL.md file.""" + skill_path = os.path.join(self.skill_dir, 'SKILL.md') + + # Generate skill name (lowercase, hyphens only, max 64 chars) + skill_name = self.name.lower().replace('_', '-').replace(' ', '-')[:64] + + # Truncate description to 1024 chars if needed + desc = self.description[:1024] if len(self.description) > 1024 else self.description + + content = f"""--- +name: {skill_name} +description: {desc} +--- + +# {self.name.title()} + +{self.description} + +## 📚 Sources + +This skill combines knowledge from multiple sources: + +""" + + # List sources + for source in self.config.get('sources', []): + source_type = source['type'] + if source_type == 'documentation': + content += f"- ✅ **Documentation**: {source.get('base_url', 'N/A')}\n" + content += f" - Pages: {source.get('max_pages', 'unlimited')}\n" + elif source_type == 'github': + 
content += f"- ✅ **GitHub Repository**: {source.get('repo', 'N/A')}\n" + content += f" - Code Analysis: {source.get('code_analysis_depth', 'surface')}\n" + content += f" - Issues: {source.get('max_issues', 0)}\n" + elif source_type == 'pdf': + content += f"- ✅ **PDF Document**: {source.get('path', 'N/A')}\n" + + # Data quality section + if self.conflicts: + content += f"\n## ⚠️ Data Quality\n\n" + content += f"**{len(self.conflicts)} conflicts detected** between sources.\n\n" + + # Count by type + by_type = {} + for conflict in self.conflicts: + ctype = conflict.type if hasattr(conflict, 'type') else conflict.get('type', 'unknown') + by_type[ctype] = by_type.get(ctype, 0) + 1 + + content += "**Conflict Breakdown:**\n" + for ctype, count in by_type.items(): + content += f"- {ctype}: {count}\n" + + content += f"\nSee `references/conflicts.md` for detailed conflict information.\n" + + # Merged API section (if available) + if self.merged_data: + content += self._format_merged_apis() + + # Quick reference from each source + content += "\n## 📖 Reference Documentation\n\n" + content += "Organized by source:\n\n" + + for source in self.config.get('sources', []): + source_type = source['type'] + content += f"- [{source_type.title()}](references/{source_type}/)\n" + + # When to use this skill + content += f"\n## 💡 When to Use This Skill\n\n" + content += f"Use this skill when you need to:\n" + content += f"- Understand how to use {self.name}\n" + content += f"- Look up API documentation\n" + content += f"- Find usage examples\n" + + if 'github' in self.scraped_data: + content += f"- Check for known issues or recent changes\n" + content += f"- Review release history\n" + + content += "\n---\n\n" + content += "*Generated by Skill Seeker's unified multi-source scraper*\n" + + with open(skill_path, 'w', encoding='utf-8') as f: + f.write(content) + + logger.info(f"Created SKILL.md") + + def _format_merged_apis(self) -> str: + """Format merged APIs section with inline conflict 
warnings.""" + if not self.merged_data: + return "" + + content = "\n## 🔧 API Reference\n\n" + content += "*Merged from documentation and code analysis*\n\n" + + apis = self.merged_data.get('apis', {}) + + if not apis: + return content + "*No APIs to display*\n" + + # Group APIs by status + matched = {k: v for k, v in apis.items() if v.get('status') == 'matched'} + conflicts = {k: v for k, v in apis.items() if v.get('status') == 'conflict'} + docs_only = {k: v for k, v in apis.items() if v.get('status') == 'docs_only'} + code_only = {k: v for k, v in apis.items() if v.get('status') == 'code_only'} + + # Show matched APIs first + if matched: + content += "### ✅ Verified APIs\n\n" + content += "*Documentation and code agree*\n\n" + for api_name, api_data in list(matched.items())[:10]: # Limit to first 10 + content += self._format_api_entry(api_data, inline_conflict=False) + + # Show conflicting APIs with warnings + if conflicts: + content += "\n### ⚠️ APIs with Conflicts\n\n" + content += "*Documentation and code differ*\n\n" + for api_name, api_data in list(conflicts.items())[:10]: + content += self._format_api_entry(api_data, inline_conflict=True) + + # Show undocumented APIs + if code_only: + content += f"\n### 💻 Undocumented APIs\n\n" + content += f"*Found in code but not in documentation ({len(code_only)} total)*\n\n" + for api_name, api_data in list(code_only.items())[:5]: + content += self._format_api_entry(api_data, inline_conflict=False) + + # Show removed/missing APIs + if docs_only: + content += f"\n### 📖 Documentation-Only APIs\n\n" + content += f"*Documented but not found in code ({len(docs_only)} total)*\n\n" + for api_name, api_data in list(docs_only.items())[:5]: + content += self._format_api_entry(api_data, inline_conflict=False) + + content += f"\n*See references/api/ for complete API documentation*\n" + + return content + + def _format_api_entry(self, api_data: Dict, inline_conflict: bool = False) -> str: + """Format a single API entry.""" + name = 
api_data.get('name', 'Unknown') + signature = api_data.get('merged_signature', name) + description = api_data.get('merged_description', '') + warning = api_data.get('warning', '') + + entry = f"#### `{signature}`\n\n" + + if description: + entry += f"{description}\n\n" + + # Add inline conflict warning + if inline_conflict and warning: + entry += f"⚠️ **Conflict**: {warning}\n\n" + + # Show both versions if available + conflict = api_data.get('conflict', {}) + if conflict: + docs_info = conflict.get('docs_info') + code_info = conflict.get('code_info') + + if docs_info and code_info: + entry += "**Documentation says:**\n" + entry += f"```\n{docs_info.get('raw_signature', 'N/A')}\n```\n\n" + entry += "**Code implementation:**\n" + entry += f"```\n{self._format_code_signature(code_info)}\n```\n\n" + + # Add source info + source = api_data.get('source', 'unknown') + entry += f"*Source: {source}*\n\n" + + entry += "---\n\n" + + return entry + + def _format_code_signature(self, code_info: Dict) -> str: + """Format code signature for display.""" + name = code_info.get('name', '') + params = code_info.get('parameters', []) + return_type = code_info.get('return_type') + + param_strs = [] + for param in params: + param_str = param.get('name', '') + if param.get('type_hint'): + param_str += f": {param['type_hint']}" + if param.get('default'): + param_str += f" = {param['default']}" + param_strs.append(param_str) + + sig = f"{name}({', '.join(param_strs)})" + if return_type: + sig += f" -> {return_type}" + + return sig + + def _generate_references(self): + """Generate reference files organized by source.""" + logger.info("Generating reference files...") + + # Generate references for each source type + if 'documentation' in self.scraped_data: + self._generate_docs_references() + + if 'github' in self.scraped_data: + self._generate_github_references() + + if 'pdf' in self.scraped_data: + self._generate_pdf_references() + + # Generate merged API reference if available + if 
self.merged_data: + self._generate_merged_api_reference() + + def _generate_docs_references(self): + """Generate references from documentation source.""" + docs_dir = os.path.join(self.skill_dir, 'references', 'documentation') + os.makedirs(docs_dir, exist_ok=True) + + # Create index + index_path = os.path.join(docs_dir, 'index.md') + with open(index_path, 'w') as f: + f.write("# Documentation\n\n") + f.write("Reference from official documentation.\n\n") + + logger.info("Created documentation references") + + def _generate_github_references(self): + """Generate references from GitHub source.""" + github_dir = os.path.join(self.skill_dir, 'references', 'github') + os.makedirs(github_dir, exist_ok=True) + + github_data = self.scraped_data['github']['data'] + + # Create README reference + if github_data.get('readme'): + readme_path = os.path.join(github_dir, 'README.md') + with open(readme_path, 'w') as f: + f.write("# Repository README\n\n") + f.write(github_data['readme']) + + # Create issues reference + if github_data.get('issues'): + issues_path = os.path.join(github_dir, 'issues.md') + with open(issues_path, 'w') as f: + f.write("# GitHub Issues\n\n") + f.write(f"{len(github_data['issues'])} recent issues.\n\n") + + for issue in github_data['issues'][:20]: + f.write(f"## #{issue['number']}: {issue['title']}\n\n") + f.write(f"**State**: {issue['state']}\n") + if issue.get('labels'): + f.write(f"**Labels**: {', '.join(issue['labels'])}\n") + f.write(f"**URL**: {issue.get('url', 'N/A')}\n\n") + + # Create releases reference + if github_data.get('releases'): + releases_path = os.path.join(github_dir, 'releases.md') + with open(releases_path, 'w') as f: + f.write("# Releases\n\n") + + for release in github_data['releases'][:10]: + f.write(f"## {release['tag_name']}: {release.get('name', 'N/A')}\n\n") + f.write(f"**Published**: {release.get('published_at', 'N/A')[:10]}\n\n") + if release.get('body'): + f.write(release['body'][:500]) + f.write("\n\n") + + 
logger.info("Created GitHub references") + + def _generate_pdf_references(self): + """Generate references from PDF source.""" + pdf_dir = os.path.join(self.skill_dir, 'references', 'pdf') + os.makedirs(pdf_dir, exist_ok=True) + + # Create index + index_path = os.path.join(pdf_dir, 'index.md') + with open(index_path, 'w') as f: + f.write("# PDF Documentation\n\n") + f.write("Reference from PDF document.\n\n") + + logger.info("Created PDF references") + + def _generate_merged_api_reference(self): + """Generate merged API reference file.""" + api_dir = os.path.join(self.skill_dir, 'references', 'api') + os.makedirs(api_dir, exist_ok=True) + + api_path = os.path.join(api_dir, 'merged_api.md') + + with open(api_path, 'w') as f: + f.write("# Merged API Reference\n\n") + f.write("*Combined from documentation and code analysis*\n\n") + + apis = self.merged_data.get('apis', {}) + + for api_name in sorted(apis.keys()): + api_data = apis[api_name] + entry = self._format_api_entry(api_data, inline_conflict=True) + f.write(entry) + + logger.info(f"Created merged API reference ({len(apis)} APIs)") + + def _generate_conflicts_report(self): + """Generate detailed conflicts report.""" + conflicts_path = os.path.join(self.skill_dir, 'references', 'conflicts.md') + + with open(conflicts_path, 'w') as f: + f.write("# Conflict Report\n\n") + f.write(f"Found **{len(self.conflicts)}** conflicts between sources.\n\n") + + # Group by severity + high = [c for c in self.conflicts if (hasattr(c, 'severity') and c.severity == 'high') or c.get('severity') == 'high'] + medium = [c for c in self.conflicts if (hasattr(c, 'severity') and c.severity == 'medium') or c.get('severity') == 'medium'] + low = [c for c in self.conflicts if (hasattr(c, 'severity') and c.severity == 'low') or c.get('severity') == 'low'] + + f.write("## Severity Breakdown\n\n") + f.write(f"- 🔴 **High**: {len(high)} (action required)\n") + f.write(f"- 🟡 **Medium**: {len(medium)} (review recommended)\n") + f.write(f"- 🟢 
**Low**: {len(low)} (informational)\n\n") + + # List high severity conflicts + if high: + f.write("## 🔴 High Severity\n\n") + f.write("*These conflicts require immediate attention*\n\n") + + for conflict in high: + api_name = conflict.api_name if hasattr(conflict, 'api_name') else conflict.get('api_name', 'Unknown') + diff = conflict.difference if hasattr(conflict, 'difference') else conflict.get('difference', 'N/A') + + f.write(f"### {api_name}\n\n") + f.write(f"**Issue**: {diff}\n\n") + + # List medium severity + if medium: + f.write("## 🟡 Medium Severity\n\n") + + for conflict in medium[:20]: # Limit to 20 + api_name = conflict.api_name if hasattr(conflict, 'api_name') else conflict.get('api_name', 'Unknown') + diff = conflict.difference if hasattr(conflict, 'difference') else conflict.get('difference', 'N/A') + + f.write(f"### {api_name}\n\n") + f.write(f"{diff}\n\n") + + logger.info(f"Created conflicts report") + + +if __name__ == '__main__': + # Test with mock data + import sys + + if len(sys.argv) < 2: + print("Usage: python unified_skill_builder.py ") + sys.exit(1) + + config_path = sys.argv[1] + + with open(config_path, 'r') as f: + config = json.load(f) + + # Mock scraped data + scraped_data = { + 'github': { + 'data': { + 'readme': '# Test Repository', + 'issues': [], + 'releases': [] + } + } + } + + builder = UnifiedSkillBuilder(config, scraped_data) + builder.build() + + print(f"\n✅ Test skill built in: output/{config['name']}/") diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/upload_skill.py b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/upload_skill.py new file mode 100644 index 0000000..0694195 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/upload_skill.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python3 +""" +Automatic Skill Uploader +Uploads a skill .zip file to Claude using the Anthropic API + +Usage: + # Set API key (one-time) + 
export ANTHROPIC_API_KEY=sk-ant-... + + # Upload skill + python3 upload_skill.py output/react.zip + python3 upload_skill.py output/godot.zip +""" + +import os +import sys +import json +import argparse +from pathlib import Path + +# Import utilities +try: + from utils import ( + get_api_key, + get_upload_url, + print_upload_instructions, + validate_zip_file + ) +except ImportError: + sys.path.insert(0, str(Path(__file__).parent)) + from utils import ( + get_api_key, + get_upload_url, + print_upload_instructions, + validate_zip_file + ) + + +def upload_skill_api(zip_path): + """ + Upload skill to Claude via Anthropic API + + Args: + zip_path: Path to skill .zip file + + Returns: + tuple: (success, message) + """ + # Check for requests library + try: + import requests + except ImportError: + return False, "requests library not installed. Run: pip install requests" + + # Validate zip file + is_valid, error_msg = validate_zip_file(zip_path) + if not is_valid: + return False, error_msg + + # Get API key + api_key = get_api_key() + if not api_key: + return False, "ANTHROPIC_API_KEY not set. Run: export ANTHROPIC_API_KEY=sk-ant-..." 
+ + zip_path = Path(zip_path) + skill_name = zip_path.stem + + print(f"📤 Uploading skill: {skill_name}") + print(f" Source: {zip_path}") + print(f" Size: {zip_path.stat().st_size:,} bytes") + print() + + # Prepare API request + api_url = "https://api.anthropic.com/v1/skills" + headers = { + "x-api-key": api_key, + "anthropic-version": "2023-06-01", + "anthropic-beta": "skills-2025-10-02" + } + + try: + # Read zip file + with open(zip_path, 'rb') as f: + zip_data = f.read() + + # Upload skill + print("⏳ Uploading to Anthropic API...") + + files = { + 'files[]': (zip_path.name, zip_data, 'application/zip') + } + + response = requests.post( + api_url, + headers=headers, + files=files, + timeout=60 + ) + + # Check response + if response.status_code == 200: + print() + print("✅ Skill uploaded successfully!") + print() + print("Your skill is now available in Claude at:") + print(f" {get_upload_url()}") + print() + return True, "Upload successful" + + elif response.status_code == 401: + return False, "Authentication failed. Check your ANTHROPIC_API_KEY" + + elif response.status_code == 400: + error_msg = response.json().get('error', {}).get('message', 'Unknown error') + return False, f"Invalid skill format: {error_msg}" + + else: + error_msg = response.json().get('error', {}).get('message', 'Unknown error') + return False, f"Upload failed ({response.status_code}): {error_msg}" + + except requests.exceptions.Timeout: + return False, "Upload timed out. Try again or use manual upload" + + except requests.exceptions.ConnectionError: + return False, "Connection error. Check your internet connection" + + except Exception as e: + return False, f"Unexpected error: {str(e)}" + + +def main(): + parser = argparse.ArgumentParser( + description="Upload a skill .zip file to Claude via Anthropic API", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Setup: + 1. Get your Anthropic API key from https://console.anthropic.com/ + 2. 
Set the API key: + export ANTHROPIC_API_KEY=sk-ant-... + +Examples: + # Upload skill + python3 upload_skill.py output/react.zip + + # Upload with explicit path + python3 upload_skill.py /path/to/skill.zip + +Requirements: + - ANTHROPIC_API_KEY environment variable must be set + - requests library (pip install requests) + """ + ) + + parser.add_argument( + 'zip_file', + help='Path to skill .zip file (e.g., output/react.zip)' + ) + + args = parser.parse_args() + + # Upload skill + success, message = upload_skill_api(args.zip_file) + + if success: + sys.exit(0) + else: + print(f"\n❌ Upload failed: {message}") + print() + print("📝 Manual upload instructions:") + print_upload_instructions(args.zip_file) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/utils.py b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/utils.py new file mode 100644 index 0000000..2432cd1 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/cli/utils.py @@ -0,0 +1,224 @@ +#!/usr/bin/env python3 +""" +Utility functions for Skill Seeker CLI tools +""" + +import os +import sys +import subprocess +import platform +from pathlib import Path +from typing import Optional, Tuple, Dict, Union + + +def open_folder(folder_path: Union[str, Path]) -> bool: + """ + Open a folder in the system file browser + + Args: + folder_path: Path to folder to open + + Returns: + bool: True if successful, False otherwise + """ + folder_path = Path(folder_path).resolve() + + if not folder_path.exists(): + print(f"⚠️ Folder not found: {folder_path}") + return False + + system = platform.system() + + try: + if system == "Linux": + # Try xdg-open first (standard) + subprocess.run(["xdg-open", str(folder_path)], check=True) + elif system == "Darwin": # macOS + subprocess.run(["open", str(folder_path)], check=True) + elif system == "Windows": + 
subprocess.run(["explorer", str(folder_path)], check=True) + else: + print(f"⚠️ Unknown operating system: {system}") + return False + + return True + + except subprocess.CalledProcessError: + print(f"⚠️ Could not open folder automatically") + return False + except FileNotFoundError: + print(f"⚠️ File browser not found on system") + return False + + +def has_api_key() -> bool: + """ + Check if ANTHROPIC_API_KEY is set in environment + + Returns: + bool: True if API key is set, False otherwise + """ + api_key = os.environ.get('ANTHROPIC_API_KEY', '').strip() + return len(api_key) > 0 + + +def get_api_key() -> Optional[str]: + """ + Get ANTHROPIC_API_KEY from environment + + Returns: + str: API key or None if not set + """ + api_key = os.environ.get('ANTHROPIC_API_KEY', '').strip() + return api_key if api_key else None + + +def get_upload_url() -> str: + """ + Get the Claude skills upload URL + + Returns: + str: Claude skills upload URL + """ + return "https://claude.ai/skills" + + +def print_upload_instructions(zip_path: Union[str, Path]) -> None: + """ + Print clear upload instructions for manual upload + + Args: + zip_path: Path to the .zip file to upload + """ + zip_path = Path(zip_path) + + print() + print("╔══════════════════════════════════════════════════════════╗") + print("║ NEXT STEP ║") + print("╚══════════════════════════════════════════════════════════╝") + print() + print(f"📤 Upload to Claude: {get_upload_url()}") + print() + print(f"1. Go to {get_upload_url()}") + print("2. Click \"Upload Skill\"") + print(f"3. Select: {zip_path}") + print("4. Done! 
✅") + print() + + +def format_file_size(size_bytes: int) -> str: + """ + Format file size in human-readable format + + Args: + size_bytes: Size in bytes + + Returns: + str: Formatted size (e.g., "45.3 KB") + """ + if size_bytes < 1024: + return f"{size_bytes} bytes" + elif size_bytes < 1024 * 1024: + return f"{size_bytes / 1024:.1f} KB" + else: + return f"{size_bytes / (1024 * 1024):.1f} MB" + + +def validate_skill_directory(skill_dir: Union[str, Path]) -> Tuple[bool, Optional[str]]: + """ + Validate that a directory is a valid skill directory + + Args: + skill_dir: Path to skill directory + + Returns: + tuple: (is_valid, error_message) + """ + skill_path = Path(skill_dir) + + if not skill_path.exists(): + return False, f"Directory not found: {skill_dir}" + + if not skill_path.is_dir(): + return False, f"Not a directory: {skill_dir}" + + skill_md = skill_path / "SKILL.md" + if not skill_md.exists(): + return False, f"SKILL.md not found in {skill_dir}" + + return True, None + + +def validate_zip_file(zip_path: Union[str, Path]) -> Tuple[bool, Optional[str]]: + """ + Validate that a file is a valid skill .zip file + + Args: + zip_path: Path to .zip file + + Returns: + tuple: (is_valid, error_message) + """ + zip_path = Path(zip_path) + + if not zip_path.exists(): + return False, f"File not found: {zip_path}" + + if not zip_path.is_file(): + return False, f"Not a file: {zip_path}" + + if not zip_path.suffix == '.zip': + return False, f"Not a .zip file: {zip_path}" + + return True, None + + +def read_reference_files(skill_dir: Union[str, Path], max_chars: int = 100000, preview_limit: int = 40000) -> Dict[str, str]: + """Read reference files from a skill directory with size limits. + + This function reads markdown files from the references/ subdirectory + of a skill, applying both per-file and total content limits. 
+ + Args: + skill_dir (str or Path): Path to skill directory + max_chars (int): Maximum total characters to read (default: 100000) + preview_limit (int): Maximum characters per file (default: 40000) + + Returns: + dict: Dictionary mapping filename to content + + Example: + >>> refs = read_reference_files('output/react/', max_chars=50000) + >>> len(refs) + 5 + """ + from pathlib import Path + + skill_path = Path(skill_dir) + references_dir = skill_path / "references" + references: Dict[str, str] = {} + + if not references_dir.exists(): + print(f"⚠ No references directory found at {references_dir}") + return references + + total_chars = 0 + for ref_file in sorted(references_dir.glob("*.md")): + if ref_file.name == "index.md": + continue + + content = ref_file.read_text(encoding='utf-8') + + # Limit size per file + if len(content) > preview_limit: + content = content[:preview_limit] + "\n\n[Content truncated...]" + + references[ref_file.name] = content + total_chars += len(content) + + # Stop if we've read enough + if total_chars > max_chars: + print(f" ℹ Limiting input to {max_chars:,} characters") + break + + return references diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/mcp/__init__.py b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/mcp/__init__.py new file mode 100644 index 0000000..4616b37 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/mcp/__init__.py @@ -0,0 +1,27 @@ +"""Skill Seekers MCP (Model Context Protocol) server package. + +This package provides MCP server integration for Claude Code, allowing +natural language interaction with Skill Seekers tools. 
+ +Main modules: + - server: MCP server implementation with 9 tools + +Available MCP Tools: + - list_configs: List all available preset configurations + - generate_config: Generate a new config file for any docs site + - validate_config: Validate a config file structure + - estimate_pages: Estimate page count before scraping + - scrape_docs: Scrape and build a skill + - package_skill: Package skill into .zip file (with auto-upload) + - upload_skill: Upload .zip to Claude + - split_config: Split large documentation configs + - generate_router: Generate router/hub skills + +Usage: + The MCP server is typically run by Claude Code via configuration + in ~/.config/claude-code/mcp.json +""" + +__version__ = "2.0.0" + +__all__ = [] diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/mcp/requirements.txt b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/mcp/requirements.txt new file mode 100644 index 0000000..18088ef --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/mcp/requirements.txt @@ -0,0 +1,9 @@ +# MCP Server dependencies +mcp>=1.0.0 + +# CLI tool dependencies (shared) +requests>=2.31.0 +beautifulsoup4>=4.12.0 + +# Optional: for API-based enhancement +# anthropic>=0.18.0 diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/mcp/server.py b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/mcp/server.py new file mode 100644 index 0000000..4e054de --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/mcp/server.py @@ -0,0 +1,1064 @@ +#!/usr/bin/env python3 +""" +Skill Seeker MCP Server +Model Context Protocol server for generating Claude AI skills from documentation +""" + +import asyncio +import json +import os +import subprocess +import sys +import time +from pathlib import Path +from typing import Any + +# Import external MCP package +# NOTE: Directory renamed from 'mcp/' 
to 'skill_seeker_mcp/' to avoid shadowing the external mcp package +MCP_AVAILABLE = False +Server = None +Tool = None +TextContent = None + +try: + from mcp.server import Server + from mcp.types import Tool, TextContent + MCP_AVAILABLE = True +except ImportError as e: + if __name__ == "__main__": + print("❌ Error: mcp package not installed") + print("Install with: pip install mcp") + print(f"Import error: {e}") + sys.exit(1) + + +# Initialize MCP server (only if MCP is available) +app = Server("skill-seeker") if MCP_AVAILABLE and Server is not None else None + +# Path to CLI tools +CLI_DIR = Path(__file__).parent.parent / "cli" + +# Helper decorator that works even when app is None +def safe_decorator(decorator_func): + """Returns the decorator if MCP is available, otherwise returns a no-op""" + if MCP_AVAILABLE and app is not None: + return decorator_func + else: + # Return a decorator that just returns the function unchanged + def noop_decorator(func): + return func + return noop_decorator + + +def run_subprocess_with_streaming(cmd, timeout=None): + """ + Run subprocess with real-time output streaming. + Returns (stdout, stderr, returncode). + + This solves the blocking issue where long-running processes (like scraping) + would cause MCP to appear frozen. Now we stream output as it comes. 
+ """ + try: + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + bufsize=1, # Line buffered + universal_newlines=True + ) + + stdout_lines = [] + stderr_lines = [] + start_time = time.time() + + # Read output line by line as it comes + while True: + # Check timeout + if timeout and (time.time() - start_time) > timeout: + process.kill() + stderr_lines.append(f"\n⚠️ Process killed after {timeout}s timeout") + break + + # Check if process finished + if process.poll() is not None: + break + + # Read available output (non-blocking) + try: + import select + readable, _, _ = select.select([process.stdout, process.stderr], [], [], 0.1) + + if process.stdout in readable: + line = process.stdout.readline() + if line: + stdout_lines.append(line) + + if process.stderr in readable: + line = process.stderr.readline() + if line: + stderr_lines.append(line) + except: + # Fallback for Windows (no select) + time.sleep(0.1) + + # Get any remaining output + remaining_stdout, remaining_stderr = process.communicate() + if remaining_stdout: + stdout_lines.append(remaining_stdout) + if remaining_stderr: + stderr_lines.append(remaining_stderr) + + stdout = ''.join(stdout_lines) + stderr = ''.join(stderr_lines) + returncode = process.returncode + + return stdout, stderr, returncode + + except Exception as e: + return "", f"Error running subprocess: {str(e)}", 1 + + +@safe_decorator(app.list_tools() if app else lambda: lambda f: f) +async def list_tools() -> list[Tool]: + """List available tools""" + return [ + Tool( + name="generate_config", + description="Generate a config file for documentation scraping. 
Interactively creates a JSON config for any documentation website.", + inputSchema={ + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "Skill name (lowercase, alphanumeric, hyphens, underscores)", + }, + "url": { + "type": "string", + "description": "Base documentation URL (must include http:// or https://)", + }, + "description": { + "type": "string", + "description": "Description of when to use this skill", + }, + "max_pages": { + "type": "integer", + "description": "Maximum pages to scrape (default: 100, use -1 for unlimited)", + "default": 100, + }, + "unlimited": { + "type": "boolean", + "description": "Remove all limits - scrape all pages (default: false). Overrides max_pages.", + "default": False, + }, + "rate_limit": { + "type": "number", + "description": "Delay between requests in seconds (default: 0.5)", + "default": 0.5, + }, + }, + "required": ["name", "url", "description"], + }, + ), + Tool( + name="estimate_pages", + description="Estimate how many pages will be scraped from a config. Fast preview without downloading content.", + inputSchema={ + "type": "object", + "properties": { + "config_path": { + "type": "string", + "description": "Path to config JSON file (e.g., configs/react.json)", + }, + "max_discovery": { + "type": "integer", + "description": "Maximum pages to discover during estimation (default: 1000, use -1 for unlimited)", + "default": 1000, + }, + "unlimited": { + "type": "boolean", + "description": "Remove discovery limit - estimate all pages (default: false). Overrides max_discovery.", + "default": False, + }, + }, + "required": ["config_path"], + }, + ), + Tool( + name="scrape_docs", + description="Scrape documentation and build Claude skill. Supports both single-source (legacy) and unified multi-source configs. Creates SKILL.md and reference files. Automatically detects llms.txt files for 10x faster processing. 
Falls back to HTML scraping if not available.", + inputSchema={ + "type": "object", + "properties": { + "config_path": { + "type": "string", + "description": "Path to config JSON file (e.g., configs/react.json or configs/godot_unified.json)", + }, + "unlimited": { + "type": "boolean", + "description": "Remove page limit - scrape all pages (default: false). Overrides max_pages in config.", + "default": False, + }, + "enhance_local": { + "type": "boolean", + "description": "Open terminal for local enhancement with Claude Code (default: false)", + "default": False, + }, + "skip_scrape": { + "type": "boolean", + "description": "Skip scraping, use cached data (default: false)", + "default": False, + }, + "dry_run": { + "type": "boolean", + "description": "Preview what will be scraped without saving (default: false)", + "default": False, + }, + "merge_mode": { + "type": "string", + "description": "Override merge mode for unified configs: 'rule-based' or 'claude-enhanced' (default: from config)", + }, + }, + "required": ["config_path"], + }, + ), + Tool( + name="package_skill", + description="Package a skill directory into a .zip file ready for Claude upload. Automatically uploads if ANTHROPIC_API_KEY is set.", + inputSchema={ + "type": "object", + "properties": { + "skill_dir": { + "type": "string", + "description": "Path to skill directory (e.g., output/react/)", + }, + "auto_upload": { + "type": "boolean", + "description": "Try to upload automatically if API key is available (default: true). 
If false, only package without upload attempt.", + "default": True, + }, + }, + "required": ["skill_dir"], + }, + ), + Tool( + name="upload_skill", + description="Upload a skill .zip file to Claude automatically (requires ANTHROPIC_API_KEY)", + inputSchema={ + "type": "object", + "properties": { + "skill_zip": { + "type": "string", + "description": "Path to skill .zip file (e.g., output/react.zip)", + }, + }, + "required": ["skill_zip"], + }, + ), + Tool( + name="list_configs", + description="List all available preset configurations.", + inputSchema={ + "type": "object", + "properties": {}, + }, + ), + Tool( + name="validate_config", + description="Validate a config file for errors.", + inputSchema={ + "type": "object", + "properties": { + "config_path": { + "type": "string", + "description": "Path to config JSON file", + }, + }, + "required": ["config_path"], + }, + ), + Tool( + name="split_config", + description="Split large documentation config into multiple focused skills. For 10K+ page documentation.", + inputSchema={ + "type": "object", + "properties": { + "config_path": { + "type": "string", + "description": "Path to config JSON file (e.g., configs/godot.json)", + }, + "strategy": { + "type": "string", + "description": "Split strategy: auto, none, category, router, size (default: auto)", + "default": "auto", + }, + "target_pages": { + "type": "integer", + "description": "Target pages per skill (default: 5000)", + "default": 5000, + }, + "dry_run": { + "type": "boolean", + "description": "Preview without saving files (default: false)", + "default": False, + }, + }, + "required": ["config_path"], + }, + ), + Tool( + name="generate_router", + description="Generate router/hub skill for split documentation. 
Creates intelligent routing to sub-skills.", + inputSchema={ + "type": "object", + "properties": { + "config_pattern": { + "type": "string", + "description": "Config pattern for sub-skills (e.g., 'configs/godot-*.json')", + }, + "router_name": { + "type": "string", + "description": "Router skill name (optional, inferred from configs)", + }, + }, + "required": ["config_pattern"], + }, + ), + Tool( + name="scrape_pdf", + description="Scrape PDF documentation and build Claude skill. Extracts text, code, and images from PDF files.", + inputSchema={ + "type": "object", + "properties": { + "config_path": { + "type": "string", + "description": "Path to PDF config JSON file (e.g., configs/manual_pdf.json)", + }, + "pdf_path": { + "type": "string", + "description": "Direct PDF path (alternative to config_path)", + }, + "name": { + "type": "string", + "description": "Skill name (required with pdf_path)", + }, + "description": { + "type": "string", + "description": "Skill description (optional)", + }, + "from_json": { + "type": "string", + "description": "Build from extracted JSON file (e.g., output/manual_extracted.json)", + }, + }, + "required": [], + }, + ), + Tool( + name="scrape_github", + description="Scrape GitHub repository and build Claude skill. 
Extracts README, Issues, Changelog, Releases, and code structure.", + inputSchema={ + "type": "object", + "properties": { + "repo": { + "type": "string", + "description": "GitHub repository (owner/repo, e.g., facebook/react)", + }, + "config_path": { + "type": "string", + "description": "Path to GitHub config JSON file (e.g., configs/react_github.json)", + }, + "name": { + "type": "string", + "description": "Skill name (default: repo name)", + }, + "description": { + "type": "string", + "description": "Skill description", + }, + "token": { + "type": "string", + "description": "GitHub personal access token (or use GITHUB_TOKEN env var)", + }, + "no_issues": { + "type": "boolean", + "description": "Skip GitHub issues extraction (default: false)", + "default": False, + }, + "no_changelog": { + "type": "boolean", + "description": "Skip CHANGELOG extraction (default: false)", + "default": False, + }, + "no_releases": { + "type": "boolean", + "description": "Skip releases extraction (default: false)", + "default": False, + }, + "max_issues": { + "type": "integer", + "description": "Maximum issues to fetch (default: 100)", + "default": 100, + }, + "scrape_only": { + "type": "boolean", + "description": "Only scrape, don't build skill (default: false)", + "default": False, + }, + }, + "required": [], + }, + ), + ] + + +@safe_decorator(app.call_tool() if app else lambda: lambda f: f) +async def call_tool(name: str, arguments: Any) -> list[TextContent]: + """Handle tool calls""" + + try: + if name == "generate_config": + return await generate_config_tool(arguments) + elif name == "estimate_pages": + return await estimate_pages_tool(arguments) + elif name == "scrape_docs": + return await scrape_docs_tool(arguments) + elif name == "package_skill": + return await package_skill_tool(arguments) + elif name == "upload_skill": + return await upload_skill_tool(arguments) + elif name == "list_configs": + return await list_configs_tool(arguments) + elif name == "validate_config": + 
return await validate_config_tool(arguments) + elif name == "split_config": + return await split_config_tool(arguments) + elif name == "generate_router": + return await generate_router_tool(arguments) + elif name == "scrape_pdf": + return await scrape_pdf_tool(arguments) + elif name == "scrape_github": + return await scrape_github_tool(arguments) + else: + return [TextContent(type="text", text=f"Unknown tool: {name}")] + + except Exception as e: + return [TextContent(type="text", text=f"Error: {str(e)}")] + + +async def generate_config_tool(args: dict) -> list[TextContent]: + """Generate a config file""" + name = args["name"] + url = args["url"] + description = args["description"] + max_pages = args.get("max_pages", 100) + unlimited = args.get("unlimited", False) + rate_limit = args.get("rate_limit", 0.5) + + # Handle unlimited mode + if unlimited: + max_pages = None + limit_msg = "unlimited (no page limit)" + elif max_pages == -1: + max_pages = None + limit_msg = "unlimited (no page limit)" + else: + limit_msg = str(max_pages) + + # Create config + config = { + "name": name, + "description": description, + "base_url": url, + "selectors": { + "main_content": "article", + "title": "h1", + "code_blocks": "pre code" + }, + "url_patterns": { + "include": [], + "exclude": [] + }, + "categories": {}, + "rate_limit": rate_limit, + "max_pages": max_pages + } + + # Save to configs directory + config_path = Path("configs") / f"{name}.json" + config_path.parent.mkdir(exist_ok=True) + + with open(config_path, 'w') as f: + json.dump(config, f, indent=2) + + result = f"""✅ Config created: {config_path} + +Configuration: + Name: {name} + URL: {url} + Max pages: {limit_msg} + Rate limit: {rate_limit}s + +Next steps: + 1. Review/edit config: cat {config_path} + 2. Estimate pages: Use estimate_pages tool + 3. Scrape docs: Use scrape_docs tool + +Note: Default selectors may need adjustment for your documentation site. 
+""" + + return [TextContent(type="text", text=result)] + + +async def estimate_pages_tool(args: dict) -> list[TextContent]: + """Estimate page count""" + config_path = args["config_path"] + max_discovery = args.get("max_discovery", 1000) + unlimited = args.get("unlimited", False) + + # Handle unlimited mode + if unlimited or max_discovery == -1: + max_discovery = -1 + timeout = 1800 # 30 minutes for unlimited discovery + else: + # Estimate: 0.5s per page discovered + timeout = max(300, max_discovery // 2) # Minimum 5 minutes + + # Run estimate_pages.py + cmd = [ + sys.executable, + str(CLI_DIR / "estimate_pages.py"), + config_path, + "--max-discovery", str(max_discovery) + ] + + progress_msg = f"🔄 Estimating page count...\n" + progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n" + + stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout) + + output = progress_msg + stdout + + if returncode == 0: + return [TextContent(type="text", text=output)] + else: + return [TextContent(type="text", text=f"{output}\n\n❌ Error:\n{stderr}")] + + +async def scrape_docs_tool(args: dict) -> list[TextContent]: + """Scrape documentation - auto-detects unified vs legacy format""" + config_path = args["config_path"] + unlimited = args.get("unlimited", False) + enhance_local = args.get("enhance_local", False) + skip_scrape = args.get("skip_scrape", False) + dry_run = args.get("dry_run", False) + merge_mode = args.get("merge_mode") + + # Load config to detect format + with open(config_path, 'r') as f: + config = json.load(f) + + # Detect if unified format (has 'sources' array) + is_unified = 'sources' in config and isinstance(config['sources'], list) + + # Handle unlimited mode by modifying config temporarily + if unlimited: + # Set max_pages to None (unlimited) + if is_unified: + # For unified configs, set max_pages on documentation sources + for source in config.get('sources', []): + if source.get('type') == 'documentation': + source['max_pages'] = 
None + else: + # For legacy configs + config['max_pages'] = None + + # Create temporary config file + temp_config_path = config_path.replace('.json', '_unlimited_temp.json') + with open(temp_config_path, 'w') as f: + json.dump(config, f, indent=2) + + config_to_use = temp_config_path + else: + config_to_use = config_path + + # Choose scraper based on format + if is_unified: + scraper_script = "unified_scraper.py" + progress_msg = f"🔄 Starting unified multi-source scraping...\n" + progress_msg += f"📦 Config format: Unified (multiple sources)\n" + else: + scraper_script = "doc_scraper.py" + progress_msg = f"🔄 Starting scraping process...\n" + progress_msg += f"📦 Config format: Legacy (single source)\n" + + # Build command + cmd = [ + sys.executable, + str(CLI_DIR / scraper_script), + "--config", config_to_use + ] + + # Add merge mode for unified configs + if is_unified and merge_mode: + cmd.extend(["--merge-mode", merge_mode]) + + # Add --fresh to avoid user input prompts when existing data found + if not skip_scrape: + cmd.append("--fresh") + + if enhance_local: + cmd.append("--enhance-local") + if skip_scrape: + cmd.append("--skip-scrape") + if dry_run: + cmd.append("--dry-run") + + # Determine timeout based on operation type + if dry_run: + timeout = 300 # 5 minutes for dry run + elif skip_scrape: + timeout = 600 # 10 minutes for building from cache + elif unlimited: + timeout = None # No timeout for unlimited mode (user explicitly requested) + else: + # Read config to estimate timeout + try: + if is_unified: + # For unified configs, estimate based on all sources + total_pages = 0 + for source in config.get('sources', []): + if source.get('type') == 'documentation': + total_pages += source.get('max_pages', 500) + max_pages = total_pages or 500 + else: + max_pages = config.get('max_pages', 500) + + # Estimate: 30s per page + buffer + timeout = max(3600, max_pages * 35) # Minimum 1 hour, or 35s per page + except: + timeout = 14400 # Default: 4 hours + + # Add 
progress message + if timeout: + progress_msg += f"⏱️ Maximum time allowed: {timeout // 60} minutes\n" + else: + progress_msg += f"⏱️ Unlimited mode - no timeout\n" + progress_msg += f"📝 Progress will be shown below:\n\n" + + # Run scraper with streaming + stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout) + + # Clean up temporary config + if unlimited and Path(config_to_use).exists(): + Path(config_to_use).unlink() + + output = progress_msg + stdout + + if returncode == 0: + return [TextContent(type="text", text=output)] + else: + error_output = output + f"\n\n❌ Error:\n{stderr}" + return [TextContent(type="text", text=error_output)] + + +async def package_skill_tool(args: dict) -> list[TextContent]: + """Package skill to .zip and optionally auto-upload""" + skill_dir = args["skill_dir"] + auto_upload = args.get("auto_upload", True) + + # Check if API key exists - only upload if available + has_api_key = os.environ.get('ANTHROPIC_API_KEY', '').strip() + should_upload = auto_upload and has_api_key + + # Run package_skill.py + cmd = [ + sys.executable, + str(CLI_DIR / "package_skill.py"), + skill_dir, + "--no-open", # Don't open folder in MCP context + "--skip-quality-check" # Skip interactive quality checks in MCP context + ] + + # Add upload flag only if we have API key + if should_upload: + cmd.append("--upload") + + # Timeout: 5 minutes for packaging + upload + timeout = 300 + + progress_msg = "📦 Packaging skill...\n" + if should_upload: + progress_msg += "📤 Will auto-upload if successful\n" + progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n" + + stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout) + + output = progress_msg + stdout + + if returncode == 0: + if should_upload: + # Upload succeeded + output += "\n\n✅ Skill packaged and uploaded automatically!" + output += "\n Your skill is now available in Claude!" 
+ elif auto_upload and not has_api_key: + # User wanted upload but no API key + output += "\n\n📝 Skill packaged successfully!" + output += "\n" + output += "\n💡 To enable automatic upload:" + output += "\n 1. Get API key from https://console.anthropic.com/" + output += "\n 2. Set: export ANTHROPIC_API_KEY=sk-ant-..." + output += "\n" + output += "\n📤 Manual upload:" + output += "\n 1. Find the .zip file in your output/ folder" + output += "\n 2. Go to https://claude.ai/skills" + output += "\n 3. Click 'Upload Skill' and select the .zip file" + else: + # auto_upload=False, just packaged + output += "\n\n✅ Skill packaged successfully!" + output += "\n Upload manually to https://claude.ai/skills" + + return [TextContent(type="text", text=output)] + else: + return [TextContent(type="text", text=f"{output}\n\n❌ Error:\n{stderr}")] + + +async def upload_skill_tool(args: dict) -> list[TextContent]: + """Upload skill .zip to Claude""" + skill_zip = args["skill_zip"] + + # Run upload_skill.py + cmd = [ + sys.executable, + str(CLI_DIR / "upload_skill.py"), + skill_zip + ] + + # Timeout: 5 minutes for upload + timeout = 300 + + progress_msg = "📤 Uploading skill to Claude...\n" + progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n" + + stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout) + + output = progress_msg + stdout + + if returncode == 0: + return [TextContent(type="text", text=output)] + else: + return [TextContent(type="text", text=f"{output}\n\n❌ Error:\n{stderr}")] + + +async def list_configs_tool(args: dict) -> list[TextContent]: + """List available configs""" + configs_dir = Path("configs") + + if not configs_dir.exists(): + return [TextContent(type="text", text="No configs directory found")] + + configs = list(configs_dir.glob("*.json")) + + if not configs: + return [TextContent(type="text", text="No config files found")] + + result = "📋 Available Configs:\n\n" + + for config_file in sorted(configs): + try: + with 
open(config_file) as f: + config = json.load(f) + name = config.get("name", config_file.stem) + desc = config.get("description", "No description") + url = config.get("base_url", "") + + result += f" • {config_file.name}\n" + result += f" Name: {name}\n" + result += f" URL: {url}\n" + result += f" Description: {desc}\n\n" + except Exception as e: + result += f" • {config_file.name} - Error reading: {e}\n\n" + + return [TextContent(type="text", text=result)] + + +async def validate_config_tool(args: dict) -> list[TextContent]: + """Validate a config file - supports both legacy and unified formats""" + config_path = args["config_path"] + + # Import validation classes + sys.path.insert(0, str(CLI_DIR)) + + try: + # Check if file exists + if not Path(config_path).exists(): + return [TextContent(type="text", text=f"❌ Error: Config file not found: {config_path}")] + + # Try unified config validator first + try: + from config_validator import validate_config + validator = validate_config(config_path) + + result = f"✅ Config is valid!\n\n" + + # Show format + if validator.is_unified: + result += f"📦 Format: Unified (multi-source)\n" + result += f" Name: {validator.config['name']}\n" + result += f" Sources: {len(validator.config.get('sources', []))}\n" + + # Show sources + for i, source in enumerate(validator.config.get('sources', []), 1): + result += f"\n Source {i}: {source['type']}\n" + if source['type'] == 'documentation': + result += f" URL: {source.get('base_url', 'N/A')}\n" + result += f" Max pages: {source.get('max_pages', 'Not set')}\n" + elif source['type'] == 'github': + result += f" Repo: {source.get('repo', 'N/A')}\n" + result += f" Code depth: {source.get('code_analysis_depth', 'surface')}\n" + elif source['type'] == 'pdf': + result += f" Path: {source.get('path', 'N/A')}\n" + + # Show merge settings if applicable + if validator.needs_api_merge(): + merge_mode = validator.config.get('merge_mode', 'rule-based') + result += f"\n Merge mode: {merge_mode}\n" + 
result += f" API merging: Required (docs + code sources)\n" + + else: + result += f"📦 Format: Legacy (single source)\n" + result += f" Name: {validator.config['name']}\n" + result += f" Base URL: {validator.config.get('base_url', 'N/A')}\n" + result += f" Max pages: {validator.config.get('max_pages', 'Not set')}\n" + result += f" Rate limit: {validator.config.get('rate_limit', 'Not set')}s\n" + + return [TextContent(type="text", text=result)] + + except ImportError: + # Fall back to legacy validation + from doc_scraper import validate_config + import json + + with open(config_path, 'r') as f: + config = json.load(f) + + # Validate config - returns (errors, warnings) tuple + errors, warnings = validate_config(config) + + if errors: + result = f"❌ Config validation failed:\n\n" + for error in errors: + result += f" • {error}\n" + else: + result = f"✅ Config is valid!\n\n" + result += f"📦 Format: Legacy (single source)\n" + result += f" Name: {config['name']}\n" + result += f" Base URL: {config['base_url']}\n" + result += f" Max pages: {config.get('max_pages', 'Not set')}\n" + result += f" Rate limit: {config.get('rate_limit', 'Not set')}s\n" + + if warnings: + result += f"\n⚠️ Warnings:\n" + for warning in warnings: + result += f" • {warning}\n" + + return [TextContent(type="text", text=result)] + + except Exception as e: + return [TextContent(type="text", text=f"❌ Error: {str(e)}")] + + +async def split_config_tool(args: dict) -> list[TextContent]: + """Split large config into multiple focused configs""" + config_path = args["config_path"] + strategy = args.get("strategy", "auto") + target_pages = args.get("target_pages", 5000) + dry_run = args.get("dry_run", False) + + # Run split_config.py + cmd = [ + sys.executable, + str(CLI_DIR / "split_config.py"), + config_path, + "--strategy", strategy, + "--target-pages", str(target_pages) + ] + + if dry_run: + cmd.append("--dry-run") + + # Timeout: 5 minutes for config splitting + timeout = 300 + + progress_msg = "✂️ 
Splitting configuration...\n" + progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n" + + stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout) + + output = progress_msg + stdout + + if returncode == 0: + return [TextContent(type="text", text=output)] + else: + return [TextContent(type="text", text=f"{output}\n\n❌ Error:\n{stderr}")] + + +async def generate_router_tool(args: dict) -> list[TextContent]: + """Generate router skill for split documentation""" + import glob + + config_pattern = args["config_pattern"] + router_name = args.get("router_name") + + # Expand glob pattern + config_files = glob.glob(config_pattern) + + if not config_files: + return [TextContent(type="text", text=f"❌ No config files match pattern: {config_pattern}")] + + # Run generate_router.py + cmd = [ + sys.executable, + str(CLI_DIR / "generate_router.py"), + ] + config_files + + if router_name: + cmd.extend(["--name", router_name]) + + # Timeout: 5 minutes for router generation + timeout = 300 + + progress_msg = "🧭 Generating router skill...\n" + progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n" + + stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout) + + output = progress_msg + stdout + + if returncode == 0: + return [TextContent(type="text", text=output)] + else: + return [TextContent(type="text", text=f"{output}\n\n❌ Error:\n{stderr}")] + + +async def scrape_pdf_tool(args: dict) -> list[TextContent]: + """Scrape PDF documentation and build skill""" + config_path = args.get("config_path") + pdf_path = args.get("pdf_path") + name = args.get("name") + description = args.get("description") + from_json = args.get("from_json") + + # Build command + cmd = [sys.executable, str(CLI_DIR / "pdf_scraper.py")] + + # Mode 1: Config file + if config_path: + cmd.extend(["--config", config_path]) + + # Mode 2: Direct PDF + elif pdf_path and name: + cmd.extend(["--pdf", pdf_path, "--name", name]) + if description: + 
cmd.extend(["--description", description]) + + # Mode 3: From JSON + elif from_json: + cmd.extend(["--from-json", from_json]) + + else: + return [TextContent(type="text", text="❌ Error: Must specify --config, --pdf + --name, or --from-json")] + + # Run pdf_scraper.py with streaming (can take a while) + timeout = 600 # 10 minutes for PDF extraction + + progress_msg = "📄 Scraping PDF documentation...\n" + progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n" + + stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout) + + output = progress_msg + stdout + + if returncode == 0: + return [TextContent(type="text", text=output)] + else: + return [TextContent(type="text", text=f"{output}\n\n❌ Error:\n{stderr}")] + + +async def scrape_github_tool(args: dict) -> list[TextContent]: + """Scrape GitHub repository to Claude skill (C1.11)""" + repo = args.get("repo") + config_path = args.get("config_path") + name = args.get("name") + description = args.get("description") + token = args.get("token") + no_issues = args.get("no_issues", False) + no_changelog = args.get("no_changelog", False) + no_releases = args.get("no_releases", False) + max_issues = args.get("max_issues", 100) + scrape_only = args.get("scrape_only", False) + + # Build command + cmd = [sys.executable, str(CLI_DIR / "github_scraper.py")] + + # Mode 1: Config file + if config_path: + cmd.extend(["--config", config_path]) + + # Mode 2: Direct repo + elif repo: + cmd.extend(["--repo", repo]) + if name: + cmd.extend(["--name", name]) + if description: + cmd.extend(["--description", description]) + if token: + cmd.extend(["--token", token]) + if no_issues: + cmd.append("--no-issues") + if no_changelog: + cmd.append("--no-changelog") + if no_releases: + cmd.append("--no-releases") + if max_issues != 100: + cmd.extend(["--max-issues", str(max_issues)]) + if scrape_only: + cmd.append("--scrape-only") + + else: + return [TextContent(type="text", text="❌ Error: Must specify --repo or 
--config")] + + # Run github_scraper.py with streaming (can take a while) + timeout = 600 # 10 minutes for GitHub scraping + + progress_msg = "🐙 Scraping GitHub repository...\n" + progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n" + + stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout) + + output = progress_msg + stdout + + if returncode == 0: + return [TextContent(type="text", text=output)] + else: + return [TextContent(type="text", text=f"{output}\n\n❌ Error:\n{stderr}")] + + +async def main(): + """Run the MCP server""" + if not MCP_AVAILABLE or app is None: + print("❌ Error: MCP server cannot start - MCP package not available") + sys.exit(1) + + from mcp.server.stdio import stdio_server + + async with stdio_server() as (read_stream, write_stream): + await app.run( + read_stream, + write_stream, + app.create_initialization_options() + ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/mcp/tools/__init__.py b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/mcp/tools/__init__.py new file mode 100644 index 0000000..388f312 --- /dev/null +++ b/skills/skills-skills/scripts/Skill_Seekers-development/src/skill_seekers/mcp/tools/__init__.py @@ -0,0 +1,19 @@ +"""MCP tools subpackage. + +This package will contain modularized MCP tool implementations. + +Planned structure (for future refactoring): + - scraping_tools.py: Tools for scraping (estimate_pages, scrape_docs) + - building_tools.py: Tools for building (package_skill, validate_config) + - deployment_tools.py: Tools for deployment (upload_skill) + - config_tools.py: Tools for configs (list_configs, generate_config) + - advanced_tools.py: Advanced tools (split_config, generate_router) + +Current state: + All tools are currently implemented in mcp/server.py + This directory is a placeholder for future modularization. 
+""" + +__version__ = "2.0.0" + +__all__ = [] diff --git a/skills/skills-skills/scripts/skill-seekers-bootstrap.sh b/skills/skills-skills/scripts/skill-seekers-bootstrap.sh new file mode 100755 index 0000000..6a0f077 --- /dev/null +++ b/skills/skills-skills/scripts/skill-seekers-bootstrap.sh @@ -0,0 +1,69 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# ==================== Purpose ==================== +# Bootstraps a local venv for the vendored Skill Seekers source code. +# +# Output: +# - Creates: skills/skills-skills/scripts/.venv-skill-seekers/ + +usage() { + cat <<'EOF' +Usage: + skill-seekers-bootstrap.sh [--venv <dir>] + +Examples: + ./skills/skills-skills/scripts/skill-seekers-bootstrap.sh + ./skills/skills-skills/scripts/skill-seekers-bootstrap.sh --venv ./skills/skills-skills/scripts/.venv-skill-seekers +EOF +} + +die() { + echo "Error: $*" >&2 + exit 1 +} + +script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +tool_dir="${script_dir}/Skill_Seekers-development" +default_venv="${script_dir}/.venv-skill-seekers" + +venv_dir="$default_venv" + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + usage + exit 0 + ;; + --venv) + [[ $# -ge 2 ]] || die "--venv requires a directory argument" + venv_dir="$2" + shift 2 + ;; + --) + shift + break + ;; + -*) + die "Unknown argument: $1 (use --help)" + ;; + *) + die "Unexpected positional argument: $1 (use --help)" + ;; + esac +done + +[[ -d "$tool_dir" ]] || die "Missing vendored tool dir: $tool_dir" +[[ -f "$tool_dir/requirements.txt" ]] || die "Missing requirements.txt: $tool_dir/requirements.txt" +command -v python3 >/dev/null 2>&1 || die "python3 not found" + +if [[ ! 
-d "$venv_dir" ]]; then + python3 -m venv "$venv_dir" +fi + +"$venv_dir/bin/python" -m pip install --upgrade pip >/dev/null +"$venv_dir/bin/pip" install -r "$tool_dir/requirements.txt" + +echo "OK: venv ready: $venv_dir" + diff --git a/skills/skills-skills/scripts/skill-seekers-configs b/skills/skills-skills/scripts/skill-seekers-configs new file mode 120000 index 0000000..2068f4b --- /dev/null +++ b/skills/skills-skills/scripts/skill-seekers-configs @@ -0,0 +1 @@ +Skill_Seekers-development/configs \ No newline at end of file diff --git a/skills/skills-skills/scripts/skill-seekers-import.sh b/skills/skills-skills/scripts/skill-seekers-import.sh new file mode 100755 index 0000000..bacc55b --- /dev/null +++ b/skills/skills-skills/scripts/skill-seekers-import.sh @@ -0,0 +1,81 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# ==================== Purpose ==================== +# Import Skill Seekers output/NAME/ into this repo's skills/NAME/. + +usage() { + cat <<'EOF' +Usage: + skill-seekers-import.sh <skill-name> [--force] + +Behavior: + - Source: ./output/<skill-name>/ + - Dest: ./skills/<skill-name>/ + - By default, refuses to overwrite an existing skills/<skill-name>/SKILL.md + +Examples: + ./skills/skills-skills/scripts/skill-seekers-import.sh react + ./skills/skills-skills/scripts/skill-seekers-import.sh react --force +EOF +} + +die() { + echo "Error: $*" >&2 + exit 1 +} + +force=0 +skill_name="" + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + usage + exit 0 + ;; + --force) + force=1 + shift + ;; + --) + shift + break + ;; + -*) + die "Unknown argument: $1 (use --help)" + ;; + *) + if [[ -z "$skill_name" ]]; then + skill_name="$1" + shift + else + die "Extra argument: $1 (only one is allowed)" + fi + ;; + esac +done + +[[ -n "$skill_name" ]] || { usage; exit 1; } +if [[ ! "$skill_name" =~ ^[a-z][a-z0-9-]*$ ]]; then + die "skill-name must match ^[a-z][a-z0-9-]*$ (e.g. my-skill)" +fi + +repo_root="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../../.." 
&& pwd)" +src_dir="${repo_root}/output/${skill_name}" +dest_dir="${repo_root}/skills/${skill_name}" + +[[ -d "$src_dir" ]] || die "Missing Skill Seekers output dir: $src_dir" +[[ -f "$src_dir/SKILL.md" ]] || die "Missing output SKILL.md: $src_dir/SKILL.md" + +mkdir -p "$dest_dir" + +if [[ -f "$dest_dir/SKILL.md" && "$force" -ne 1 ]]; then + die "Refusing to overwrite existing: $dest_dir/SKILL.md (use --force)" +fi + +rsync -a --delete "$src_dir"/ "$dest_dir"/ + +echo "OK: imported to: $dest_dir" + diff --git a/skills/skills-skills/scripts/skill-seekers-src b/skills/skills-skills/scripts/skill-seekers-src new file mode 120000 index 0000000..4685fe0 --- /dev/null +++ b/skills/skills-skills/scripts/skill-seekers-src @@ -0,0 +1 @@ +Skill_Seekers-development/src \ No newline at end of file diff --git a/skills/skills-skills/scripts/skill-seekers-update.sh b/skills/skills-skills/scripts/skill-seekers-update.sh new file mode 100755 index 0000000..590842b --- /dev/null +++ b/skills/skills-skills/scripts/skill-seekers-update.sh @@ -0,0 +1,118 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# ==================== Purpose ==================== +# Update the vendored Skill Seekers source snapshot inside this repo. +# +# Notes: +# - This keeps ONLY "source + configs + runtime manifests" to avoid importing upstream Markdown docs +# (which would affect this repo's markdownlint). 
 + +usage() { + cat <<'EOF' +Usage: + skill-seekers-update.sh [--repo <owner/repo>] [--ref <git-ref>] [--dry-run] + +Defaults: + --repo yusufkaraaslan/Skill_Seekers + --ref main + +Examples: + ./skills/skills-skills/scripts/skill-seekers-update.sh + ./skills/skills-skills/scripts/skill-seekers-update.sh --ref v2.1.1 + ./skills/skills-skills/scripts/skill-seekers-update.sh --dry-run +EOF +} + +die() { + echo "Error: $*" >&2 + exit 1 +} + +script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +target_dir="${script_dir}/Skill_Seekers-development" + +repo="yusufkaraaslan/Skill_Seekers" +ref="main" +dry_run=0 + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + usage + exit 0 + ;; + --repo) + [[ $# -ge 2 ]] || die "--repo requires an argument like owner/repo" + repo="$2" + shift 2 + ;; + --ref) + [[ $# -ge 2 ]] || die "--ref requires a git ref (branch/tag/commit)" + ref="$2" + shift 2 + ;; + --dry-run) + dry_run=1 + shift + ;; + --) + shift + break + ;; + *) + die "Unknown argument: $1 (use --help)" + ;; + esac +done + +command -v curl >/dev/null 2>&1 || die "curl not found" +command -v tar >/dev/null 2>&1 || die "tar not found" +command -v rsync >/dev/null 2>&1 || die "rsync not found" + +tmp_dir="$(mktemp -d)" +cleanup() { rm -rf "$tmp_dir"; } +trap cleanup EXIT + +archive_url="https://codeload.github.com/${repo}/tar.gz/${ref}" +archive_path="${tmp_dir}/skill-seekers.tgz" + +curl -fsSL "$archive_url" -o "$archive_path" +tar -xzf "$archive_path" -C "$tmp_dir" + +extracted_root="$(find "$tmp_dir" -mindepth 1 -maxdepth 1 -type d | head -n 1)" +[[ -n "$extracted_root" ]] || die "Failed to locate extracted archive root" + +if [[ "$dry_run" -eq 1 ]]; then + echo "DRY RUN:" + echo " repo: $repo" + echo " ref: $ref" + echo " from: $extracted_root" + echo " to: $target_dir" + exit 0 +fi + +mkdir -p "$target_dir" + +rsync -a --delete \ + --exclude '.git' \ + --exclude '*.md' \ + --exclude 'docs/' \ + --exclude 'tests/' \ + --exclude '.claude/' \ + --exclude '.gitignore' \ + --exclude 
'CHANGELOG.md' \ + --exclude 'ROADMAP.md' \ + --exclude 'FUTURE_RELEASES.md' \ + --exclude 'ASYNC_SUPPORT.md' \ + --exclude 'STRUCTURE.md' \ + --exclude 'CONTRIBUTING.md' \ + --exclude 'QUICKSTART.md' \ + --exclude 'BULLETPROOF_QUICKSTART.md' \ + --exclude 'FLEXIBLE_ROADMAP.md' \ + "$extracted_root"/ \ + "$target_dir"/ + +echo "OK: updated vendored source in: $target_dir" + diff --git a/skills/skills-skills/scripts/skill-seekers.sh b/skills/skills-skills/scripts/skill-seekers.sh new file mode 100755 index 0000000..fe2a927 --- /dev/null +++ b/skills/skills-skills/scripts/skill-seekers.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# ==================== Purpose ==================== +# Run Skill Seekers from vendored source with a local venv. +# +# This script does NOT auto-install dependencies. +# Run skill-seekers-bootstrap.sh once if you see ImportError. + +usage() { + cat <<'EOF' +Usage: + skill-seekers.sh [--venv <dir>] -- <skill-seekers args...> + +Examples: + ./skills/skills-skills/scripts/skill-seekers.sh -- --version + ./skills/skills-skills/scripts/skill-seekers.sh -- scrape --config ./skills/skills-skills/scripts/Skill_Seekers-development/configs/react.json + ./skills/skills-skills/scripts/skill-seekers.sh -- github --repo facebook/react --name react +EOF +} + +die() { + echo "Error: $*" >&2 + exit 1 +} + +script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +tool_dir="${script_dir}/Skill_Seekers-development" +tool_src="${tool_dir}/src" +default_venv="${script_dir}/.venv-skill-seekers" + +venv_dir="$default_venv" + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + usage + exit 0 + ;; + --venv) + [[ $# -ge 2 ]] || die "--venv requires a directory argument" + venv_dir="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + die "Expected '--' before skill-seekers arguments (use --help)" + ;; + esac +done + +[[ -d "$tool_src" ]] || die "Missing vendored source dir: $tool_src" + +python_bin="python3" +if [[ -x "$venv_dir/bin/python" ]]; then + 
python_bin="$venv_dir/bin/python" +fi + +export PYTHONPATH="$tool_src${PYTHONPATH:+:$PYTHONPATH}" + +exec "$python_bin" -m skill_seekers.cli.main "$@" +