TradingAgents/.claude/lib/feature_dependency_analyzer.py

509 lines
16 KiB
Python

#!/usr/bin/env python3
"""
Feature dependency analyzer for smart batch ordering.
This module analyzes feature descriptions to detect dependencies and
optimizes execution order using topological sort (Kahn's algorithm).
Features:
- Keyword-based dependency detection (requires, depends, after, before, uses, needs)
- File reference detection (.py, .md, .json, etc.)
- Topological sort for optimal execution order
- Circular dependency detection
- ASCII dependency graph visualization
- Security validations (CWE-22, CWE-78)
- Performance limits (feature-count cap, timeout)
Usage:
>>> from feature_dependency_analyzer import analyze_dependencies, topological_sort
>>> features = ["Add auth", "Add tests for auth"]
>>> deps = analyze_dependencies(features)
>>> ordered = topological_sort(features, deps)
Security:
- Input sanitization for feature text
- Resource limits (MAX_FEATURES=1000, TIMEOUT_SECONDS=5)
- No shell execution
- Path traversal protection (CWE-22)
Date: 2025-12-23
Issue: #157 (Smart dependency ordering for /batch-implement)
Version: 1.0.0
"""
import heapq
import re
import sys
import time
from pathlib import Path
from typing import Any, Dict, List, Set
# Add this file's directory to sys.path (once) so that the
# `from validation import ...` below can resolve a module located next to
# this file.
lib_path = Path(__file__).parent
if str(lib_path) not in sys.path:
    sys.path.insert(0, str(lib_path))
try:
    from validation import sanitize_text_input
except ImportError:
    # Graceful degradation if validation is not available: define a minimal
    # stand-in with the same name so the rest of this module keeps working.
    def sanitize_text_input(text: str) -> str:
        """Fallback sanitization: coerce to ``str`` and keep the first 10000 characters."""
        return str(text)[:10000]  # Basic length limit
# =============================================================================
# Constants
# =============================================================================
# Words that signal "this feature relies on something else".
DEPENDENCY_KEYWORDS = {
    "requires",
    "depends",
    "after",
    "before",
    "uses",
    "needs",
}

# File extensions treated as file references inside feature text.
FILE_KEYWORDS = {
    ".py",
    ".md",
    ".json",
    ".yaml",
    ".yml",
    ".sh",
    ".ts",
    ".js",
    ".tsx",
    ".jsx",
}

# Resource limits enforced by analyze_dependencies().
MAX_FEATURES = 1000  # maximum number of features accepted in one call
TIMEOUT_SECONDS = 5  # time budget for keyword extraction, in seconds
# =============================================================================
# Exceptions
# =============================================================================
class FeatureDependencyError(Exception):
    """Root of the exception hierarchy for feature dependency operations."""


class CircularDependencyError(FeatureDependencyError):
    """Signals that the dependency graph contains a cycle."""


class AnalysisTimeoutError(FeatureDependencyError):
    """Signals that dependency analysis ran past its time limit."""
# =============================================================================
# Core Functions
# =============================================================================
def detect_keywords(feature_text: str) -> Set[str]:
    """Extract dependency-related keywords from a feature description.

    The text is sanitized with ``sanitize_text_input`` and then scanned in
    three passes:

    1. Dependency keywords (``DEPENDENCY_KEYWORDS``): for every whole-word
       occurrence, up to two words following it are captured; each captured
       word longer than two characters that is not a filler word is kept.
    2. File references (``FILE_KEYWORDS``): every ``name<ext>`` token in the
       lowercased text is kept. The extension must end at a word boundary,
       so ``config.json`` is reported once and does not additionally produce
       ``config.js`` (likewise ``app.tsx`` does not produce ``app.ts``).
    3. Tech terms: capitalized words, or lowercase words ending in an
       upper-case suffix such as ``API`` or ``DB``, found in the
       original-case text; lowercased and kept unless they are common
       action verbs or generic words.

    Args:
        feature_text: Feature description text.

    Returns:
        Set of detected keywords (lowercase).

    Examples:
        >>> detect_keywords("Add login that requires authentication")
        {'authentication'}
        >>> sorted(detect_keywords("Update auth.py to add JWT"))
        ['auth.py', 'jwt']
    """
    text = sanitize_text_input(feature_text)
    text_lower = text.lower()
    keywords: Set[str] = set()

    # Pass 1: words that follow a dependency keyword.
    filler_words = {'the', 'and', 'for', 'that', 'this', 'with'}
    for keyword in DEPENDENCY_KEYWORDS:
        pattern = rf'\b{keyword}\b\s+(\w+(?:\s+\w+)?)'
        for match in re.finditer(pattern, text_lower):
            for part in match.group(1).split():
                if len(part) > 2 and part not in filler_words:
                    keywords.add(part)

    # Pass 2: file references. The trailing \b stops a shorter extension from
    # matching inside a longer one (".js" in ".json", ".ts" in ".tsx").
    for file_ext in FILE_KEYWORDS:
        pattern = rf'(\w+{re.escape(file_ext)})\b'
        for match in re.finditer(pattern, text_lower):
            keywords.add(match.group(1))

    # Pass 3: capitalized words / tech-suffixed terms. This runs on the
    # original-case text because the pattern relies on capitalization.
    tech_pattern = r'\b([A-Z][A-Za-z0-9]+|[a-z]+(?:API|DB|JWT|HTTP|SQL|REST|CRUD))\b'
    # Common action verbs and generic words carry no dependency information.
    stop_words = {'add', 'update', 'fix', 'remove', 'delete', 'create', 'implement',
                  'typo', 'documentation', 'file', 'code', 'change', 'modify'}
    for match in re.finditer(tech_pattern, text):
        word = match.group(1).lower()
        if len(word) > 2 and word not in stop_words:
            keywords.add(word)

    return keywords
def build_dependency_graph(features: List[str], keywords: Dict[int, Set[str]]) -> Dict[int, List[int]]:
    """Build a dependency graph by matching terms across features.

    For every ordered pair ``(i, j)`` with ``i != j`` the rules below are
    tried in order; the first one that matches records "i depends on j":

    1. Test features: ``i`` mentions "test" (substring match, so "tests",
       "testing" and words such as "unittests" count), ``j`` does not, and
       the two share at least one topic word.
    2. Requires/creates: ``i`` contains "<dependency keyword> X" (also
       "<keyword> on X" / "<keyword> the X") and ``j`` starts an
       "add/create/implement X" phrase with the same X.
    3. Shared topics: ``i`` contains a dependency keyword, is not a test
       feature, ``j`` comes earlier in the list and they share a topic word.
    4. File references: ``i`` and ``j`` reference the same file (taken from
       ``keywords``) and ``j`` comes earlier in the list.

    Dependency keywords are matched as whole words, consistent with
    ``detect_keywords``; "causes" therefore no longer counts as "uses".

    Args:
        features: List of feature descriptions.
        keywords: Dict mapping feature index to its detected keywords; only
            entries that contain a file extension are used here (Rule 4).

    Returns:
        Dict mapping every feature index to the list of indices it depends
        on, in ascending order of discovery.

    Example:
        >>> features = ["Add auth", "Add tests for auth"]
        >>> keywords = {0: {"auth"}, 1: {"tests", "auth"}}
        >>> build_dependency_graph(features, keywords)
        {0: [], 1: [0]}
    """
    count = len(features)
    deps: Dict[int, List[int]] = {i: [] for i in range(count)}

    # Verbs and filler words that never identify a feature's subject.
    skip_words = {'add', 'update', 'fix', 'remove', 'delete', 'create', 'implement',
                  'the', 'and', 'for', 'that', 'this', 'with', 'to', 'from',
                  'test', 'tests', 'testing'}
    create_pattern = re.compile(r'(?:add|create|implement)\s+(\w+)')
    # Whole-word keyword detection (Rule 3 gate).
    keyword_patterns = [re.compile(rf'\b{re.escape(kw)}\b') for kw in DEPENDENCY_KEYWORDS]
    # "<keyword> X", skipping a leading "on"/"the"/"a"/"an" so that
    # "depends on auth" yields "auth" rather than "on".
    # NOTE(review): "before" is handled like every other keyword, i.e.
    # "A before B" records B as something A requires - confirm intent.
    requires_patterns = [
        re.compile(rf'\b{re.escape(kw)}\b\s+(?:(?:on|the|an|a)\s+)*(\w+)')
        for kw in DEPENDENCY_KEYWORDS
    ]

    # Per-feature facts are computed once up front instead of once per pair.
    topics: List[Set[str]] = []
    is_test: List[bool] = []
    has_dependency_keyword: List[bool] = []
    creates: List[Set[str]] = []
    requires: List[Set[str]] = []
    file_refs: List[Set[str]] = []
    for i, feature in enumerate(features):
        lowered = feature.lower()
        topics.append({w for w in lowered.split() if len(w) > 2 and w not in skip_words})
        is_test.append('test' in lowered)
        has_dependency_keyword.append(any(p.search(lowered) for p in keyword_patterns))
        # Only the first "add/create/implement X" phrase counts as created.
        created = create_pattern.search(lowered)
        creates.append({created.group(1)} if created else set())
        requires.append({m.group(1) for p in requires_patterns for m in p.finditer(lowered)})
        file_refs.append({k for k in keywords.get(i, set())
                          if any(ext in k for ext in FILE_KEYWORDS)})

    for i in range(count):
        for j in range(count):
            if i == j:
                continue
            if is_test[i] and not is_test[j] and topics[i] & topics[j]:
                # Rule 1: a test depends on the non-test feature it mentions.
                deps[i].append(j)
            elif requires[i] & creates[j]:
                # Rule 2: i requires something j creates (any position).
                deps[i].append(j)
            elif (has_dependency_keyword[i] and not is_test[i] and j < i
                  and topics[i] & topics[j]):
                # Rule 3: keyword-bearing feature shares a topic with an
                # earlier feature.
                deps[i].append(j)
            elif j < i and file_refs[i] & file_refs[j]:
                # Rule 4: the earlier feature mentioning a file is assumed
                # to be the one that creates it.
                deps[i].append(j)

    return deps
def analyze_dependencies(features: List[str]) -> Dict[int, List[int]]:
    """Main entry point - detect dependencies via keyword matching.

    Extracts keywords from every feature with ``detect_keywords`` and then
    hands them to ``build_dependency_graph``.

    Args:
        features: List of feature descriptions.

    Returns:
        Dict mapping feature index to list of dependency indices.

    Raises:
        ValueError: If features list is too large (>MAX_FEATURES).
        AnalysisTimeoutError: If keyword extraction runs longer than
            TIMEOUT_SECONDS. The limit is checked before each feature is
            processed; graph construction itself is not interrupted.

    Examples:
        >>> features = ["Add auth", "Add tests for auth"]
        >>> analyze_dependencies(features)
        {0: [], 1: [0]}
    """
    # Validate input size before doing any work.
    if len(features) > MAX_FEATURES:
        raise ValueError(f"Too many features ({len(features)} > {MAX_FEATURES})")

    # A monotonic clock is used so system clock adjustments can neither
    # trigger nor suppress the timeout.
    started = time.monotonic()

    keywords: Dict[int, Set[str]] = {}
    for i, feature in enumerate(features):
        if time.monotonic() - started > TIMEOUT_SECONDS:
            raise AnalysisTimeoutError(f"Analysis exceeded {TIMEOUT_SECONDS}s timeout")
        keywords[i] = detect_keywords(feature)

    return build_dependency_graph(features, keywords)
def topological_sort(features: List[str], deps: Dict[int, List[int]]) -> List[int]:
    """Order features using Kahn's algorithm.

    Returns feature indices so that every feature appears after all of its
    dependencies. Among features that are ready at the same time, the one
    with the lowest index goes first, which keeps the result as close to the
    original order as the dependencies allow.

    The dependency lists are normalised before sorting: self-dependencies,
    duplicate entries and indices that do not refer to a feature are
    ignored, so they cannot be mistaken for a cycle.

    Args:
        features: List of feature descriptions (only its length is used).
        deps: Dict mapping feature index to dependency indices. Features
            without an entry are treated as having no dependencies.

    Returns:
        List of feature indices in execution order.

    Raises:
        CircularDependencyError: If circular dependencies are detected.

    Examples:
        >>> features = ["Add auth", "Add tests"]
        >>> deps = {0: [], 1: [0]}
        >>> topological_sort(features, deps)
        [0, 1]
    """
    total = len(features)
    if not total:
        return []

    # unmet[i]      -> distinct, valid dependencies of i not yet emitted
    # dependents[d] -> features waiting on d (reverse edges), so finishing a
    #                  node does not require rescanning the whole graph
    unmet: Dict[int, Set[int]] = {i: set() for i in range(total)}
    dependents: Dict[int, List[int]] = {i: [] for i in range(total)}
    for node, dependencies in deps.items():
        if node not in unmet:
            continue  # entry for an index that is not a feature
        for dep in dependencies:
            if dep == node or dep not in unmet or dep in unmet[node]:
                continue  # self-reference, unknown index, or duplicate
            unmet[node].add(dep)
            dependents[dep].append(node)

    in_degree = {i: len(waiting_on) for i, waiting_on in unmet.items()}

    # Min-heap of ready nodes: always emit the lowest ready index.
    ready = [i for i in range(total) if in_degree[i] == 0]
    heapq.heapify(ready)

    sorted_order: List[int] = []
    while ready:
        current = heapq.heappop(ready)
        sorted_order.append(current)
        for node in dependents[current]:
            in_degree[node] -= 1
            if in_degree[node] == 0:
                heapq.heappush(ready, node)

    # Any node never emitted is part of (or downstream of) a cycle.
    if len(sorted_order) != total:
        raise CircularDependencyError(
            f"Circular dependency detected: {len(sorted_order)} of {len(features)} features ordered"
        )
    return sorted_order
def visualize_graph(features: List[str], deps: Dict[int, List[int]]) -> str:
    """Render the dependency graph as plain text for user review.

    The output starts with a "Feature Dependency Graph:" header followed by
    a blank line. Each feature then gets a "[index] description" line
    (description cut to 60 characters plus "..." when longer), one
    "depends on" line per dependency (dependency description cut to 50
    characters plus "..." when longer), and a trailing blank line.

    Args:
        features: List of feature descriptions.
        deps: Dict mapping feature index to dependency indices; a missing
            entry means the feature has no dependencies.

    Returns:
        The rendered graph as a newline-joined string, or
        "No features to visualize." when ``features`` is empty.
    """
    if not features:
        return "No features to visualize."

    def _clip(text: str, limit: int) -> str:
        # Shorten over-long descriptions and mark the cut with an ellipsis.
        if len(text) > limit:
            return text[:limit] + "..."
        return text

    output = ["Feature Dependency Graph:", ""]
    for index, description in enumerate(features):
        output.append(f"[{index}] {_clip(description, 60)}")
        for dep in deps.get(index, []) or ():
            output.append(f" └─> depends on [{dep}] {_clip(features[dep], 50)}")
        output.append("")  # separator after each feature
    return "\n".join(output)
# =============================================================================
# Helper Functions
# =============================================================================
def detect_circular_dependencies(deps: Dict[int, List[int]]) -> List[List[int]]:
    """Detect circular dependencies in graph.

    Runs a depth-first search from every key of ``deps`` (in ascending
    order), following "depends on" edges. Whenever an edge leads back to a
    node on the current search path, the portion of the path from that node
    onward is recorded as a cycle.

    The search is iterative, so long dependency chains cannot exhaust
    Python's recursion limit.

    Args:
        deps: Dependency graph mapping a node to the nodes it depends on.
            Nodes that only appear as dependencies are treated as having no
            dependencies of their own.

    Returns:
        List of circular dependency chains. Each chain starts and ends with
        the same node, e.g. ``[0, 1, 0]``; the list is empty when the graph
        is acyclic.
    """
    cycles: List[List[int]] = []
    visited: Set[int] = set()

    for start in sorted(deps):
        if start in visited:
            continue

        # `path` is the current DFS path, `on_path` the same nodes as a set
        # for O(1) membership tests, and `pending` holds one neighbor
        # iterator per node on the path.
        visited.add(start)
        path = [start]
        on_path = {start}
        pending = [iter(deps.get(start, []))]

        while pending:
            for neighbor in pending[-1]:
                if neighbor not in visited:
                    # Descend into the unvisited neighbor.
                    visited.add(neighbor)
                    path.append(neighbor)
                    on_path.add(neighbor)
                    pending.append(iter(deps.get(neighbor, [])))
                    break
                if neighbor in on_path:
                    # Back edge: the path from `neighbor` onward is a cycle.
                    cycles.append(path[path.index(neighbor):] + [neighbor])
            else:
                # All neighbors handled: backtrack one level.
                pending.pop()
                on_path.discard(path.pop())

    return cycles
def get_execution_order_stats(features: List[str], deps: Dict[int, List[int]],
                              ordered: List[int]) -> Dict[str, Any]:
    """Get statistics about execution order optimization.

    Counts are taken per feature index, and a feature without an entry in
    ``deps`` is treated as having no dependencies.

    Args:
        features: List of feature descriptions.
        deps: Dependency graph mapping feature index to dependency indices.
        ordered: Ordered list of feature indices. Accepted for interface
            compatibility; not used in the calculation.

    Returns:
        Dict with the keys:
            total_features: Number of features.
            total_dependencies: Sum of the lengths of all dependency lists.
            independent_features: Features with no dependencies.
            dependent_features: Features with at least one dependency.
            optimization_ratio: independent_features / total_features, or
                0.0 when there are no features.
    """
    total = len(features)
    # Look dependencies up per feature so that missing entries count as
    # "no dependencies" instead of silently counting as dependent.
    per_feature = [deps.get(i) or [] for i in range(total)]
    total_deps = sum(len(d) for d in per_feature)
    independent = sum(1 for d in per_feature if not d)

    return {
        "total_features": total,
        "total_dependencies": total_deps,
        "independent_features": independent,
        "dependent_features": total - independent,
        "optimization_ratio": independent / total if total else 0.0,
    }
# =============================================================================
# Module Exports
# =============================================================================
# Public API of this module (names exported by `from ... import *`).
__all__ = [
    # Core functions
    "analyze_dependencies",
    "topological_sort",
    "visualize_graph",
    "detect_keywords",
    "build_dependency_graph",
    # Exceptions
    "FeatureDependencyError",
    "CircularDependencyError",
    "AnalysisTimeoutError",
    # Constants
    "DEPENDENCY_KEYWORDS",
    "FILE_KEYWORDS",
    "MAX_FEATURES",  # exported alongside the other limits it is documented with
    "TIMEOUT_SECONDS",
    # Helper functions
    "detect_circular_dependencies",
    "get_execution_order_stats",
]