TradingAgents/.claude/lib/file_discovery.py

355 lines
10 KiB
Python

#!/usr/bin/env python3
"""
File Discovery Engine - Comprehensive plugin file discovery for installation
This module provides comprehensive file discovery for plugin installation,
ensuring 100% file coverage (all 201+ files) instead of the current ~76% (152 files).
Key Features:
- Recursive directory traversal (finds all files, not just *.md)
- Intelligent exclusion patterns (cache, build artifacts, hidden files)
- Nested skill structure support (skills/[name].skill/docs/...)
- Installation manifest generation
- Coverage validation
Current Problem:
- install.sh uses shallow glob patterns (*.md) - misses Python files
- Only copies ~152 of 201 files (76% coverage)
- Missing: All 9 scripts/, 23 of 48 lib/ files, 3 agent implementations
Solution:
- Comprehensive recursive file discovery
- Structured copy preserving directory hierarchy
- Validation to detect missing files
Usage:
    from file_discovery import FileDiscovery

    # Discover all files
    discovery = FileDiscovery(plugin_dir)
    files = discovery.discover_all_files()  # Returns list of Path objects

    # Generate manifest
    manifest = discovery.generate_manifest()

    # Validate against manifest
    missing = discovery.validate_against_manifest(manifest)
Date: 2025-11-17
Issue: GitHub #80 (Bootstrap overhaul - 100% file coverage)
Agent: implementer
Design Patterns:
See library-design-patterns skill for standardized design patterns.
See file-organization skill for directory structure patterns.
"""
import fnmatch
import json
from pathlib import Path
from typing import List, Dict, Any

# Security utilities for path validation and audit logging
try:
    from plugins.autonomous_dev.lib.security_utils import validate_path, audit_log
except ImportError:
    from security_utils import validate_path, audit_log
# Exclusion patterns for file discovery.
# Entries containing "*" are shell-style globs matched against the file name;
# every other entry is an exact name matched against any path component.
EXCLUDE_PATTERNS = {
    # Cache and build artifacts
    "__pycache__",
    "*.pyc",
    "*.pyo",
    "*.pyd",
    ".pytest_cache",
    "*.egg-info",
    ".eggs",
    "*.egg",
    "build",
    "dist",
    # Version control
    ".git",
    ".gitignore",
    ".gitattributes",
    # IDE and editor files
    ".vscode",
    ".idea",
    "*.swp",
    "*.swo",
    ".DS_Store",
    # Temporary files
    "*.tmp",
    "*.bak",
    "*.log",
    "*~",
}

# Directory patterns to exclude (partial match). These are matched as
# substrings of *directory* names only, never of the file name itself, so
# "pkg.egg-info/PKG-INFO" is excluded while "lib/build_utils.py" is kept.
EXCLUDE_DIR_PATTERNS = [
    ".egg-info",
    "__pycache__",
    ".pytest_cache",
    ".git",
    ".eggs",
    "build",
    "dist",
]

# Hidden files to INCLUDE (exceptions to hidden file exclusion)
INCLUDE_HIDDEN = {
    ".env.example",
}


class FileDiscovery:
    """Comprehensive file discovery for plugin installation.

    Discovers all files in plugin directory with intelligent exclusions,
    supporting nested structures (skills, lib, scripts, etc.).

    Attributes:
        plugin_dir: Path to plugin directory (e.g., plugins/autonomous-dev/)

    Examples:
        >>> discovery = FileDiscovery(plugin_dir)
        >>> files = discovery.discover_all_files()
        >>> print(f"Found {len(files)} files")
        >>> manifest = discovery.generate_manifest()
    """

    def __init__(self, plugin_dir: Path):
        """Initialize file discovery for plugin directory.

        Args:
            plugin_dir: Path to plugin directory

        Raises:
            FileNotFoundError: If plugin directory doesn't exist
            ValueError: If path validation fails (path traversal, symlink)
        """
        # Validate plugin directory path (prevents CWE-22, CWE-59).
        # NOTE(review): the path is resolved *before* validate_path sees it,
        # so a symlinked plugin_dir is already dereferenced at validation
        # time - confirm this matches validate_path's expectations.
        self.plugin_dir = validate_path(
            Path(plugin_dir).resolve(),
            purpose="plugin directory",
            allow_missing=False
        )

        # Audit log initialization
        audit_log("file_discovery", "initialized", {
            "plugin_dir": str(self.plugin_dir)
        })

    def discover_all_files(self) -> List[Path]:
        """Discover all files in plugin directory recursively.

        Symlinks are never returned; each one encountered is recorded via
        audit_log. Directories and paths matching the exclusion rules (see
        _should_exclude) are skipped.

        Returns:
            Sorted list of Path objects (rooted at plugin_dir) for all
            discovered files

        Raises:
            FileNotFoundError: If plugin directory doesn't exist

        Examples:
            >>> files = discovery.discover_all_files()
            >>> for file in files:
            ...     print(file.relative_to(plugin_dir))
        """
        if not self.plugin_dir.exists():
            raise FileNotFoundError(
                f"Plugin directory not found: {self.plugin_dir}\n"
                f"Expected structure: plugins/autonomous-dev/"
            )

        files = []

        # Recursively walk directory tree
        for path in self.plugin_dir.rglob("*"):
            # Security: Skip symlinks to prevent CWE-59 (Symlink Following).
            # This check must come before is_dir(): is_dir() follows
            # symlinks, so a link to a directory would otherwise be dropped
            # without leaving an audit entry.
            if path.is_symlink():
                audit_log("file_discovery", "skipped_symlink", {
                    "path": str(path),
                    "reason": "Symlinks not allowed in plugin distribution"
                })
                continue

            # Skip directories (we only want files)
            if path.is_dir():
                continue

            # Skip if matches exclusion pattern
            if self._should_exclude(path):
                continue

            files.append(path)

        return sorted(files)  # Sort for deterministic ordering

    @staticmethod
    def _is_excluded_name(name: str) -> bool:
        """Return True if a single path component is excluded by exact name
        or because it is hidden (dot-prefixed) and not in INCLUDE_HIDDEN."""
        if name in EXCLUDE_PATTERNS:
            return True
        return name.startswith(".") and name not in INCLUDE_HIDDEN

    def _should_exclude(self, path: Path) -> bool:
        """Check if path should be excluded from discovery.

        Exclusion rules:
        - Cache directories (__pycache__, .pytest_cache)
        - Build artifacts (*.pyc, *.egg-info)
        - Version control (.git/)
        - Hidden files (.*) EXCEPT .env.example
        - Temporary files (*.tmp, *.bak)

        Args:
            path: Path to check (must be located under plugin_dir)

        Returns:
            True if path should be excluded, False otherwise

        Raises:
            ValueError: If path is not located under plugin_dir
                (raised by Path.relative_to)
        """
        relative_parts = path.relative_to(self.plugin_dir).parts

        # Directory components: exact names, hidden directories, and the
        # substring patterns. The substring patterns are deliberately NOT
        # applied to the file name; doing so would drop legitimate files
        # such as "build_utils.py" or "distribution.md".
        for directory in relative_parts[:-1]:
            if self._is_excluded_name(directory):
                return True
            if any(fragment in directory for fragment in EXCLUDE_DIR_PATTERNS):
                return True

        # File name: exact names and hidden files ...
        name = path.name
        if self._is_excluded_name(name):
            return True

        # ... then wildcard patterns (*.pyc, *~, etc.). fnmatchcase keeps
        # the comparison case-sensitive on every platform.
        return any(
            fnmatch.fnmatchcase(name, pattern)
            for pattern in EXCLUDE_PATTERNS
            if "*" in pattern
        )

    def count_files(self) -> int:
        """Count total number of files discovered.

        Returns:
            Total file count

        Examples:
            >>> count = discovery.count_files()
            >>> print(f"Total files: {count}")
        """
        return len(self.discover_all_files())

    def _manifest_key(self, file_path: Path) -> str:
        """Return file_path relative to plugin_dir as a "/"-separated string.

        Shared by generate_manifest and validate_against_manifest so both
        sides of the comparison are always spelled the same way.
        """
        return file_path.relative_to(self.plugin_dir).as_posix()

    def generate_manifest(self) -> Dict[str, Any]:
        """Generate installation manifest with file metadata.

        Manifest format:
            {
                "version": "1.0",
                "total_files": 201,
                "files": [
                    {"path": "commands/auto-implement.md", "size": 1234},
                    {"path": "lib/security_utils.py", "size": 5678},
                    ...
                ]
            }

        Returns:
            Manifest dictionary

        Examples:
            >>> manifest = discovery.generate_manifest()
            >>> print(f"Total files: {manifest['total_files']}")
        """
        files = self.discover_all_files()

        manifest: Dict[str, Any] = {
            "version": "1.0",
            "total_files": len(files),
            "files": [
                {
                    "path": self._manifest_key(file_path),  # Unix-style paths
                    "size": file_path.stat().st_size,
                }
                for file_path in files
            ],
        }

        return manifest

    def save_manifest(self, manifest_path: Path) -> None:
        """Save installation manifest to JSON file.

        Args:
            manifest_path: Path to save manifest
                (e.g., config/installation_manifest.json). A plain string
                is accepted as well.

        Examples:
            >>> manifest_path = plugin_dir / "config" / "installation_manifest.json"
            >>> discovery.save_manifest(manifest_path)
        """
        # Build the manifest before touching the filesystem so a manifest
        # file created here is never part of its own listing.
        manifest = self.generate_manifest()

        # Create parent directory if needed
        manifest_path = Path(manifest_path)
        manifest_path.parent.mkdir(parents=True, exist_ok=True)

        # Save as formatted JSON; explicit encoding keeps the output
        # independent of the platform's locale.
        with open(manifest_path, "w", encoding="utf-8") as f:
            json.dump(manifest, f, indent=2)

    def validate_against_manifest(self, manifest: Dict[str, Any]) -> List[str]:
        """Validate current files against installation manifest.

        Detects files that are in manifest but missing from filesystem.

        Args:
            manifest: Installation manifest dictionary (must contain a
                "files" list of {"path": ...} entries)

        Returns:
            Sorted list of missing file paths (relative to plugin_dir)

        Raises:
            KeyError: If the manifest has no "files" entry or an entry
                lacks "path"

        Examples:
            >>> missing = discovery.validate_against_manifest(manifest)
            >>> if missing:
            ...     print(f"Missing {len(missing)} files:")
            ...     for file in missing:
            ...         print(f"  - {file}")
        """
        current_relative = {
            self._manifest_key(f) for f in self.discover_all_files()
        }
        expected_files = {entry["path"] for entry in manifest["files"]}

        return sorted(expected_files - current_relative)

    def get_relative_path(self, file_path: Path) -> Path:
        """Get relative path for file (for copying).

        Args:
            file_path: Absolute path to file

        Returns:
            Relative path from plugin_dir

        Raises:
            ValueError: If file_path is not located under plugin_dir
                (raised by Path.relative_to)

        Examples:
            >>> abs_path = plugin_dir / "lib" / "nested" / "utils.py"
            >>> rel_path = discovery.get_relative_path(abs_path)
            >>> print(rel_path)  # lib/nested/utils.py
        """
        return file_path.relative_to(self.plugin_dir)