509 lines
16 KiB
Python
509 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Feature dependency analyzer for smart batch ordering.
|
|
|
|
This module analyzes feature descriptions to detect dependencies and
|
|
optimizes execution order using topological sort (Kahn's algorithm).
|
|
|
|
Features:
|
|
- Keyword-based dependency detection (requires, depends, after, before, uses)
|
|
- File reference detection (.py, .md, .json, etc.)
|
|
- Topological sort for optimal execution order
|
|
- Circular dependency detection
|
|
- ASCII dependency graph visualization
|
|
- Security validations (CWE-22, CWE-78)
|
|
- Performance limits (timeout, memory)
|
|
|
|
Usage:
|
|
>>> from feature_dependency_analyzer import analyze_dependencies, topological_sort
|
|
>>> features = ["Add auth", "Add tests for auth"]
|
|
>>> deps = analyze_dependencies(features)
|
|
>>> ordered = topological_sort(features, deps)
|
|
|
|
Security:
|
|
- Input sanitization for feature text
|
|
- Resource limits (MAX_FEATURES=1000, TIMEOUT_SECONDS=5)
|
|
- No shell execution
|
|
- Path traversal protection (CWE-22)
|
|
|
|
Date: 2025-12-23
|
|
Issue: #157 (Smart dependency ordering for /batch-implement)
|
|
Version: 1.0.0
|
|
"""
|
|
|
|
import re
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Dict, List, Set, Any
|
|
import sys
|
|
|
|
# Add lib directory to path for validation imports
|
|
lib_path = Path(__file__).parent
|
|
if str(lib_path) not in sys.path:
|
|
sys.path.insert(0, str(lib_path))
|
|
|
|
try:
|
|
from validation import sanitize_text_input
|
|
except ImportError:
|
|
# Graceful degradation if validation not available
|
|
def sanitize_text_input(text: str) -> str:
|
|
"""Fallback sanitization."""
|
|
return str(text)[:10000] # Basic length limit
|
|
|
|
|
|
# =============================================================================
|
|
# Constants
|
|
# =============================================================================
|
|
|
|
DEPENDENCY_KEYWORDS = {"requires", "depends", "after", "before", "uses", "needs"}
|
|
FILE_KEYWORDS = {".py", ".md", ".json", ".yaml", ".yml", ".sh", ".ts", ".js", ".tsx", ".jsx"}
|
|
MAX_FEATURES = 1000
|
|
TIMEOUT_SECONDS = 5
|
|
|
|
|
|
# =============================================================================
|
|
# Exceptions
|
|
# =============================================================================
|
|
|
|
class FeatureDependencyError(Exception):
|
|
"""Base exception for feature dependency operations."""
|
|
pass
|
|
|
|
|
|
class CircularDependencyError(FeatureDependencyError):
|
|
"""Raised when circular dependencies detected."""
|
|
pass
|
|
|
|
|
|
class AnalysisTimeoutError(FeatureDependencyError):
|
|
"""Raised when analysis exceeds timeout."""
|
|
pass
|
|
|
|
|
|
# =============================================================================
|
|
# Core Functions
|
|
# =============================================================================
|
|
|
|
def detect_keywords(feature_text: str) -> Set[str]:
|
|
"""Extract dependency keywords from feature text.
|
|
|
|
Detects:
|
|
- Dependency keywords: requires, depends, after, before, uses, needs
|
|
- File references: .py, .md, .json, .yaml, .yml, .sh, .ts, .js
|
|
|
|
Args:
|
|
feature_text: Feature description text
|
|
|
|
Returns:
|
|
Set of detected keywords (lowercase)
|
|
|
|
Examples:
|
|
>>> detect_keywords("Add login that requires authentication")
|
|
{'authentication'}
|
|
>>> detect_keywords("Update auth.py to add JWT")
|
|
{'auth.py', 'jwt'}
|
|
"""
|
|
# Sanitize input
|
|
text = sanitize_text_input(feature_text)
|
|
text_lower = text.lower()
|
|
|
|
keywords = set()
|
|
|
|
# Detect dependency keywords
|
|
for keyword in DEPENDENCY_KEYWORDS:
|
|
pattern = rf'\b{keyword}\b\s+(\w+(?:\s+\w+)?)'
|
|
matches = re.finditer(pattern, text_lower)
|
|
for match in matches:
|
|
# Extract the word(s) after the keyword
|
|
extracted = match.group(1).strip()
|
|
# Split on common stop words and take meaningful parts
|
|
parts = extracted.split()
|
|
for part in parts:
|
|
if len(part) > 2 and part not in {'the', 'and', 'for', 'that', 'this', 'with'}:
|
|
keywords.add(part)
|
|
|
|
# Detect file references
|
|
for file_ext in FILE_KEYWORDS:
|
|
pattern = rf'(\w+{re.escape(file_ext)})'
|
|
matches = re.finditer(pattern, text_lower)
|
|
for match in matches:
|
|
keywords.add(match.group(1))
|
|
|
|
# Also extract significant words (nouns, tech terms)
|
|
# Look for capitalized words or common tech terms
|
|
tech_pattern = r'\b([A-Z][A-Za-z0-9]+|[a-z]+(?:API|DB|JWT|HTTP|SQL|REST|CRUD))\b'
|
|
matches = re.finditer(tech_pattern, text)
|
|
|
|
# Filter out common action verbs and generic words
|
|
stop_words = {'add', 'update', 'fix', 'remove', 'delete', 'create', 'implement',
|
|
'typo', 'documentation', 'file', 'code', 'change', 'modify'}
|
|
|
|
for match in matches:
|
|
word = match.group(1).lower()
|
|
if len(word) > 2 and word not in stop_words:
|
|
keywords.add(word)
|
|
|
|
return keywords
|
|
|
|
|
|
def build_dependency_graph(features: List[str], keywords: Dict[int, Set[str]]) -> Dict[int, List[int]]:
|
|
"""Build dependency graph from keywords.
|
|
|
|
Match keywords across features to detect dependencies.
|
|
If feature B's keywords match feature A's significant terms,
|
|
then B depends on A.
|
|
|
|
Logic:
|
|
- Features with "test", "tests", "testing" depend on features they test
|
|
- Features with dependency keywords (requires, depends, after, uses) depend on referenced features
|
|
- File references create dependencies (feature modifying file.py depends on feature creating file.py)
|
|
|
|
Args:
|
|
features: List of feature descriptions
|
|
keywords: Dict mapping feature index to keywords
|
|
|
|
Returns:
|
|
Dict mapping feature index to list of dependency indices
|
|
|
|
Example:
|
|
>>> features = ["Add auth", "Add tests for auth"]
|
|
>>> keywords = {0: {"auth"}, 1: {"tests", "auth"}}
|
|
>>> build_dependency_graph(features, keywords)
|
|
{0: [], 1: [0]}
|
|
"""
|
|
deps: Dict[int, List[int]] = {i: [] for i in range(len(features))}
|
|
|
|
# Extract main subject/topic from each feature
|
|
feature_topics: Dict[int, Set[str]] = {}
|
|
for i, feature in enumerate(features):
|
|
feature_lower = feature.lower()
|
|
topics = set()
|
|
|
|
# Extract main nouns/topics (skip verbs like "add", "update", "fix")
|
|
skip_words = {'add', 'update', 'fix', 'remove', 'delete', 'create', 'implement',
|
|
'the', 'and', 'for', 'that', 'this', 'with', 'to', 'from', 'test', 'tests', 'testing'}
|
|
words = feature_lower.split()
|
|
for word in words:
|
|
if len(word) > 2 and word not in skip_words:
|
|
topics.add(word)
|
|
|
|
feature_topics[i] = topics
|
|
|
|
# Build dependencies based on feature relationships
|
|
for i in range(len(features)):
|
|
feature_i = features[i].lower()
|
|
keywords_i = keywords.get(i, set())
|
|
topics_i = feature_topics[i]
|
|
|
|
# Check if this is a test/dependent feature
|
|
is_test = any(word in feature_i for word in ['test', 'tests', 'testing'])
|
|
has_dependency_keyword = any(kw in feature_i for kw in DEPENDENCY_KEYWORDS)
|
|
|
|
# Extract what feature i creates vs what it requires
|
|
creates_i = set()
|
|
requires_i = set()
|
|
|
|
# Pattern: "Add X" or "Create X" creates X
|
|
create_match = re.search(r'(?:add|create|implement)\s+(\w+)', feature_i)
|
|
if create_match:
|
|
creates_i.add(create_match.group(1))
|
|
|
|
# Pattern: "requires X", "depends on X", "after X", "using X"
|
|
for kw in DEPENDENCY_KEYWORDS:
|
|
pattern = rf'{kw}\s+(\w+)'
|
|
matches = re.finditer(pattern, feature_i)
|
|
for match in matches:
|
|
requires_i.add(match.group(1))
|
|
|
|
for j in range(len(features)):
|
|
if i == j:
|
|
continue
|
|
|
|
feature_j = features[j].lower()
|
|
topics_j = feature_topics[j]
|
|
|
|
# What does feature j create?
|
|
creates_j = set()
|
|
create_match_j = re.search(r'(?:add|create|implement)\s+(\w+)', feature_j)
|
|
if create_match_j:
|
|
creates_j.add(create_match_j.group(1))
|
|
|
|
# Skip if j is also a test (tests don't depend on other tests typically)
|
|
is_j_test = any(word in feature_j for word in ['test', 'tests', 'testing'])
|
|
|
|
# Rule 1: Test features depend on non-test features they reference
|
|
if is_test and not is_j_test:
|
|
# Check if feature i (test) references topics from feature j
|
|
if topics_i & topics_j:
|
|
if j not in deps[i]:
|
|
deps[i].append(j)
|
|
continue
|
|
|
|
# Rule 2: If feature i REQUIRES something that feature j CREATES, i depends on j
|
|
if requires_i & creates_j:
|
|
if j not in deps[i]:
|
|
deps[i].append(j)
|
|
continue
|
|
|
|
# Rule 3: Features with dependency keywords depend on earlier features with shared topics
|
|
# Only if j comes before i (temporal ordering)
|
|
if has_dependency_keyword and not is_test and j < i:
|
|
# Check if feature i has dependency keyword pointing to feature j topics
|
|
if topics_i & topics_j:
|
|
if j not in deps[i]:
|
|
deps[i].append(j)
|
|
continue
|
|
|
|
# Rule 4: File references - feature modifying file depends on feature creating it
|
|
# (Earlier features that mention a file are assumed to create it)
|
|
file_refs_i = {k for k in keywords_i if any(ext in k for ext in FILE_KEYWORDS)}
|
|
file_refs_j = {k for k in keywords.get(j, set()) if any(ext in k for ext in FILE_KEYWORDS)}
|
|
|
|
if file_refs_i & file_refs_j and j < i: # Only depend on earlier features
|
|
if j not in deps[i]:
|
|
deps[i].append(j)
|
|
|
|
return deps
|
|
|
|
|
|
def analyze_dependencies(features: List[str]) -> Dict[int, List[int]]:
|
|
"""Main entry point - detect dependencies via keyword matching.
|
|
|
|
Args:
|
|
features: List of feature descriptions
|
|
|
|
Returns:
|
|
Dict mapping feature index to list of dependency indices
|
|
|
|
Raises:
|
|
ValueError: If features list is too large (>MAX_FEATURES)
|
|
AnalysisTimeoutError: If analysis exceeds TIMEOUT_SECONDS
|
|
|
|
Examples:
|
|
>>> features = ["Add auth", "Add tests for auth"]
|
|
>>> analyze_dependencies(features)
|
|
{0: [], 1: [0]}
|
|
"""
|
|
# Validate input size
|
|
if len(features) > MAX_FEATURES:
|
|
raise ValueError(f"Too many features ({len(features)} > {MAX_FEATURES})")
|
|
|
|
start_time = time.time()
|
|
|
|
# Extract keywords from each feature
|
|
keywords: Dict[int, Set[str]] = {}
|
|
for i, feature in enumerate(features):
|
|
# Check timeout
|
|
if time.time() - start_time > TIMEOUT_SECONDS:
|
|
raise AnalysisTimeoutError(f"Analysis exceeded {TIMEOUT_SECONDS}s timeout")
|
|
|
|
keywords[i] = detect_keywords(feature)
|
|
|
|
# Build dependency graph
|
|
deps = build_dependency_graph(features, keywords)
|
|
|
|
return deps
|
|
|
|
|
|
def topological_sort(features: List[str], deps: Dict[int, List[int]]) -> List[int]:
|
|
"""Order features using Kahn's algorithm.
|
|
|
|
Returns features in dependency-respecting order.
|
|
If circular dependencies detected, returns original order.
|
|
|
|
Args:
|
|
features: List of feature descriptions
|
|
deps: Dict mapping feature index to dependency indices
|
|
|
|
Returns:
|
|
List of feature indices in execution order
|
|
|
|
Raises:
|
|
CircularDependencyError: If circular dependencies detected
|
|
|
|
Examples:
|
|
>>> features = ["Add auth", "Add tests"]
|
|
>>> deps = {0: [], 1: [0]}
|
|
>>> topological_sort(features, deps)
|
|
[0, 1]
|
|
"""
|
|
# Handle empty graph
|
|
if not features:
|
|
return []
|
|
|
|
# Remove self-dependencies (ignore them)
|
|
clean_deps = {}
|
|
for i, dependencies in deps.items():
|
|
clean_deps[i] = [d for d in dependencies if d != i]
|
|
|
|
# Calculate in-degree for each node
|
|
in_degree = {i: 0 for i in range(len(features))}
|
|
for i, dependencies in clean_deps.items():
|
|
in_degree[i] = len(dependencies)
|
|
|
|
# Queue of nodes with no dependencies
|
|
queue = [i for i, degree in in_degree.items() if degree == 0]
|
|
sorted_order = []
|
|
|
|
while queue:
|
|
# Sort queue to prefer original order (stable sort)
|
|
queue.sort()
|
|
|
|
current = queue.pop(0)
|
|
sorted_order.append(current)
|
|
|
|
# Update in-degrees for nodes that depend on current
|
|
for i, dependencies in clean_deps.items():
|
|
if current in dependencies:
|
|
in_degree[i] -= 1
|
|
if in_degree[i] == 0:
|
|
queue.append(i)
|
|
|
|
# Check for circular dependencies
|
|
if len(sorted_order) != len(features):
|
|
# Circular dependency detected
|
|
# Return original order as fallback
|
|
raise CircularDependencyError(
|
|
f"Circular dependency detected: {len(sorted_order)} of {len(features)} features ordered"
|
|
)
|
|
|
|
return sorted_order
|
|
|
|
|
|
def visualize_graph(features: List[str], deps: Dict[int, List[int]]) -> str:
|
|
"""Generate ASCII dependency graph for user review.
|
|
|
|
Args:
|
|
features: List of feature descriptions
|
|
deps: Dict mapping feature index to dependency indices
|
|
|
|
Returns:
|
|
Multi-line string showing dependencies
|
|
|
|
Examples:
|
|
>>> features = ["Add auth", "Add tests"]
|
|
>>> deps = {0: [], 1: [0]}
|
|
>>> print(visualize_graph(features, deps))
|
|
Feature Dependency Graph:
|
|
|
|
[0] Add auth
|
|
|
|
[1] Add tests
|
|
└─> depends on [0] Add auth
|
|
"""
|
|
if not features:
|
|
return "No features to visualize."
|
|
|
|
lines = ["Feature Dependency Graph:", ""]
|
|
|
|
for i, feature in enumerate(features):
|
|
# Truncate long features
|
|
display_feature = feature[:60] + "..." if len(feature) > 60 else feature
|
|
|
|
lines.append(f"[{i}] {display_feature}")
|
|
|
|
# Show dependencies
|
|
dependencies = deps.get(i, [])
|
|
if dependencies:
|
|
for dep_idx in dependencies:
|
|
dep_feature = features[dep_idx]
|
|
dep_display = dep_feature[:50] + "..." if len(dep_feature) > 50 else dep_feature
|
|
lines.append(f" └─> depends on [{dep_idx}] {dep_display}")
|
|
|
|
lines.append("") # Blank line between features
|
|
|
|
return "\n".join(lines)
|
|
|
|
|
|
# =============================================================================
|
|
# Helper Functions
|
|
# =============================================================================
|
|
|
|
def detect_circular_dependencies(deps: Dict[int, List[int]]) -> List[List[int]]:
|
|
"""Detect circular dependencies in graph.
|
|
|
|
Args:
|
|
deps: Dependency graph
|
|
|
|
Returns:
|
|
List of circular dependency chains
|
|
"""
|
|
cycles = []
|
|
visited = set()
|
|
rec_stack = set()
|
|
|
|
def dfs(node: int, path: List[int]) -> None:
|
|
"""DFS to detect cycles."""
|
|
visited.add(node)
|
|
rec_stack.add(node)
|
|
path.append(node)
|
|
|
|
for neighbor in deps.get(node, []):
|
|
if neighbor not in visited:
|
|
dfs(neighbor, path.copy())
|
|
elif neighbor in rec_stack:
|
|
# Cycle detected
|
|
cycle_start = path.index(neighbor)
|
|
cycle = path[cycle_start:] + [neighbor]
|
|
cycles.append(cycle)
|
|
|
|
rec_stack.remove(node)
|
|
|
|
for i in range(len(deps)):
|
|
if i not in visited:
|
|
dfs(i, [])
|
|
|
|
return cycles
|
|
|
|
|
|
def get_execution_order_stats(features: List[str], deps: Dict[int, List[int]],
|
|
ordered: List[int]) -> Dict[str, Any]:
|
|
"""Get statistics about execution order optimization.
|
|
|
|
Args:
|
|
features: List of feature descriptions
|
|
deps: Dependency graph
|
|
ordered: Ordered list of feature indices
|
|
|
|
Returns:
|
|
Dict with statistics
|
|
"""
|
|
total_deps = sum(len(d) for d in deps.values())
|
|
independent = sum(1 for d in deps.values() if len(d) == 0)
|
|
|
|
return {
|
|
"total_features": len(features),
|
|
"total_dependencies": total_deps,
|
|
"independent_features": independent,
|
|
"dependent_features": len(features) - independent,
|
|
"optimization_ratio": independent / len(features) if features else 0.0,
|
|
}
|
|
|
|
|
|
# =============================================================================
|
|
# Module Exports
|
|
# =============================================================================
|
|
|
|
__all__ = [
|
|
# Core functions
|
|
"analyze_dependencies",
|
|
"topological_sort",
|
|
"visualize_graph",
|
|
"detect_keywords",
|
|
"build_dependency_graph",
|
|
|
|
# Exceptions
|
|
"FeatureDependencyError",
|
|
"CircularDependencyError",
|
|
"AnalysisTimeoutError",
|
|
|
|
# Constants
|
|
"DEPENDENCY_KEYWORDS",
|
|
"FILE_KEYWORDS",
|
|
"TIMEOUT_SECONDS",
|
|
|
|
# Helper functions
|
|
"detect_circular_dependencies",
|
|
"get_execution_order_stats",
|
|
]
|