"""Search utilities for researcher agent.
|
|
|
|
Provides utilities for:
|
|
- Web fetch caching
|
|
- Source quality scoring
|
|
- Pattern quality scoring
|
|
- Knowledge base freshness checking
|
|
"""
|
|
|
|
import hashlib
|
|
import re
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Tuple
|
|
|
|
|
|


class WebFetchCache:
    """Cache for web fetch results to reduce duplicate API calls.

    Caches fetched URLs with a 7-day TTL to avoid re-fetching the same
    content. Saves API costs and improves performance.

    Usage:
        cache = WebFetchCache()

        # Try cache first
        content = cache.get(url)
        if not content:
            content = fetch_from_web(url)
            cache.set(url, content)
    """

    def __init__(self, cache_dir: Optional[Path] = None, ttl_days: int = 7):
        """Initialize web fetch cache.

        Args:
            cache_dir: Directory to store cached files. Defaults to .claude/cache/web-fetch
            ttl_days: Time to live in days. Default 7 days.
        """
        if cache_dir is None:
            cache_dir = Path(".claude/cache/web-fetch")

        self.cache_dir = Path(cache_dir)
        self.ttl_days = ttl_days
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _get_cache_path(self, url: str) -> Path:
        """Get cache file path for URL."""
        # md5 is used here as a stable cache key, not for security
        url_hash = hashlib.md5(url.encode()).hexdigest()
        return self.cache_dir / f"{url_hash}.md"

    def get(self, url: str) -> Optional[str]:
        """Get cached content if fresh.

        Args:
            url: URL to look up in the cache

        Returns:
            Cached content if it exists and is fresh, None otherwise
        """
        cache_file = self._get_cache_path(url)

        if not cache_file.exists():
            return None

        try:
            content = cache_file.read_text()

            # Extract expiry date
            if "**Expires**:" in content:
                for line in content.split("\n"):
                    if "**Expires**:" in line:
                        # Split only on the first colon so the ISO timestamp
                        # keeps its own colons
                        expires_str = line.split(":", 1)[1].strip()
                        expires = datetime.fromisoformat(expires_str)

                        # Check if expired
                        if datetime.now() > expires:
                            cache_file.unlink()
                            return None

                        break

            # Extract content (after the header separator)
            if "---" in content:
                parts = content.split("---", 1)
                if len(parts) == 2:
                    return parts[1].strip()

            return content

        except Exception:
            # If any error reading cache, treat as miss
            return None

    def set(self, url: str, content: str) -> None:
        """Cache content with TTL.

        Args:
            url: URL being cached
            content: Content to cache
        """
        cache_file = self._get_cache_path(url)

        expires = datetime.now() + timedelta(days=self.ttl_days)

        cached = f"""# Cached Web Fetch

**URL**: {url}
**Fetched**: {datetime.now().isoformat()}
**Expires**: {expires.isoformat()}

---

{content}"""

        cache_file.write_text(cached)

    def clear_expired(self) -> int:
        """Remove all expired cache entries.

        Returns:
            Number of entries removed
        """
        removed = 0
        for cache_file in self.cache_dir.glob("*.md"):
            try:
                content = cache_file.read_text()

                if "**Expires**:" in content:
                    for line in content.split("\n"):
                        if "**Expires**:" in line:
                            expires_str = line.split(":", 1)[1].strip()
                            expires = datetime.fromisoformat(expires_str)

                            if datetime.now() > expires:
                                cache_file.unlink()
                                removed += 1
                            break
            except Exception:
                # If can't read, remove to be safe
                cache_file.unlink()
                removed += 1

        return removed
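

# Housekeeping sketch (illustrative): expired entries are only evicted
# lazily when `get` touches them, so a periodic sweep keeps the cache
# directory small:
#
#     cache = WebFetchCache()
#     removed = cache.clear_expired()
#     print(f"Pruned {removed} stale cache entries")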


def score_source(url: str, title: str = "", snippet: str = "") -> float:
    """Score source quality for prioritization.

    Scores sources based on:
    - Authority (official docs, well-known sites)
    - Recency (content up to two years old preferred)
    - Content indicators (tutorials, code examples)

    Args:
        url: Source URL
        title: Page title
        snippet: Text snippet from search result

    Returns:
        Quality score from 0.0 to 1.0
    """
    score = 0.0
    url_lower = url.lower()
    title_lower = title.lower()
    snippet_lower = snippet.lower()

    # Authority scoring (0.5 max)
    high_authority = [
        "python.org", "docs.python.org",
        "github.com", "anthropic.com", "docs.anthropic.com",
        "martinfowler.com", "realpython.com",
        "auth0.com", "owasp.org",
        "readthedocs.io", "readthedocs.org",
    ]

    medium_authority = [
        "stackoverflow.com", "medium.com",
        "dev.to", "hackernoon.com",
        "thoughtworks.com", "elastic.co",
    ]

    if any(auth in url_lower for auth in high_authority):
        score += 0.5
    elif any(auth in url_lower for auth in medium_authority):
        score += 0.3
    else:
        score += 0.1  # Base score for any source

    # Recency scoring (0.3 max)
    # Extract year from snippet or title. The window (2023-2025) is
    # hardcoded and needs periodic updating.
    year_pattern = r'\b(202[3-5])\b'
    year_match = re.search(year_pattern, snippet_lower + " " + title_lower)

    if year_match:
        year = int(year_match.group(1))
        current_year = datetime.now().year
        years_old = current_year - year

        if years_old == 0:
            score += 0.3  # Current year
        elif years_old == 1:
            score += 0.2  # Last year
        elif years_old == 2:
            score += 0.1  # 2 years old
        # Older than 2 years: no recency bonus

    # Content quality indicators (each match adds its bonus; the total
    # score is capped at 1.0 below)
    quality_indicators = {
        "tutorial": 0.05,
        "guide": 0.05,
        "best practices": 0.1,
        "example": 0.05,
        "code": 0.05,
        "documentation": 0.05,
        "official": 0.1,
    }

    combined_text = title_lower + " " + snippet_lower
    for indicator, points in quality_indicators.items():
        if indicator in combined_text:
            score += points

    # Cap at 1.0
    return min(1.0, score)
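

# Worked example (illustrative; assumes this is run in 2025):
#
#     score_source(
#         "https://docs.python.org/3/tutorial/",
#         title="Python Tutorial",
#         snippet="Official documentation, updated 2025",
#     )
#     # docs.python.org        -> +0.5  (high authority)
#     # "2025" = current year  -> +0.3  (recency)
#     # "tutorial" + "documentation" + "official" -> +0.05 + 0.05 + 0.1
#     # total = 1.0 after the cap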


def score_pattern(
    file_path: str,
    content: str,
    keyword_relevance: float = 0.5,
    has_tests: bool = False,
    has_docstrings: bool = False,
    line_count: int = 0,
    last_modified_days: Optional[int] = None
) -> float:
    """Score codebase pattern quality.

    Scores patterns based on:
    - Keyword relevance (how well it matches the search)
    - Has tests (indicates quality)
    - Has docstrings (indicates documentation)
    - Substantial code (>50 lines)
    - Recently modified (indicates maintenance)

    Args:
        file_path: Path to file containing pattern (not currently used in
            scoring; kept for future heuristics)
        content: File content (used as a docstring-detection fallback)
        keyword_relevance: How relevant to search (0.0-1.0)
        has_tests: Whether tests exist for this pattern
        has_docstrings: Whether docstrings present
        line_count: Number of lines in file
        last_modified_days: Days since last modification

    Returns:
        Quality score from 0.0 to 1.0
    """
    score = 0.0

    # Keyword relevance (0.0-0.2)
    score += keyword_relevance * 0.2

    # Has tests (+0.3)
    if has_tests:
        score += 0.3

    # Has docstrings (+0.2)
    if has_docstrings:
        score += 0.2
    elif '"""' in content or "'''" in content:
        # Simple heuristic if not explicitly checked
        score += 0.2

    # Substantial code (+0.2)
    if line_count > 50:
        score += 0.2
    elif line_count > 20:
        score += 0.1

    # Recently modified (+0.1)
    if last_modified_days is not None:
        if last_modified_days < 30:
            score += 0.1
        elif last_modified_days < 90:
            score += 0.05

    return min(1.0, score)
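

# Worked example (illustrative):
#
#     score_pattern(
#         "src/auth/jwt_handler.py",   # hypothetical path; not used in scoring
#         content="...",
#         keyword_relevance=0.8,       # 0.8 * 0.2 = +0.16
#         has_tests=True,              # +0.3
#         has_docstrings=True,         # +0.2
#         line_count=120,              # >50 lines -> +0.2
#         last_modified_days=14,       # <30 days  -> +0.1
#     )
#     # total = 0.96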


def check_knowledge_freshness(knowledge_file: Path, max_age_days: int = 180) -> Tuple[bool, int, str]:
    """Check if knowledge base entry is fresh.

    Args:
        knowledge_file: Path to knowledge file
        max_age_days: Maximum age in days before considering stale

    Returns:
        Tuple of (is_fresh, age_in_days, status_message)
    """
    if not knowledge_file.exists():
        return False, -1, "File does not exist"

    try:
        content = knowledge_file.read_text()

        # Extract date from frontmatter
        date_pattern = r'\*\*Date(?:\s+Researched)?\*\*:\s*(\d{4}-\d{2}-\d{2})'
        match = re.search(date_pattern, content)

        if not match:
            return False, -1, "No date found in file"

        date_str = match.group(1)
        research_date = datetime.strptime(date_str, "%Y-%m-%d")
        age_days = (datetime.now() - research_date).days

        # Check freshness
        if age_days < 0:
            return False, age_days, "Future date (invalid)"
        elif age_days <= max_age_days:
            return True, age_days, f"Fresh ({age_days} days old)"
        else:
            return False, age_days, f"Stale ({age_days} days old, max {max_age_days})"

    except Exception as e:
        return False, -1, f"Error reading file: {str(e)}"
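

# Example (illustrative): a knowledge file is fresh while its `**Date**:`
# (or `**Date Researched**:`) header is within max_age_days:
#
#     # .claude/knowledge/research/jwt.md (hypothetical path) containing:
#     #     **Date Researched**: 2025-06-01
#     is_fresh, age, status = check_knowledge_freshness(
#         Path(".claude/knowledge/research/jwt.md")
#     )
#     # -> (True, <age>, "Fresh (<age> days old)") while age <= 180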


def extract_keywords(text: str, min_length: int = 3, max_keywords: int = 10) -> List[str]:
    """Extract keywords from user request for codebase search.

    Args:
        text: User request text
        min_length: Minimum keyword length
        max_keywords: Maximum keywords to return

    Returns:
        List of keywords sorted by relevance
    """
    # Common stop words to exclude
    stop_words = {
        "the", "and", "for", "are", "but", "not", "you", "all",
        "can", "her", "was", "one", "our", "out", "this", "that",
        "have", "has", "had", "with", "from", "what", "when", "where",
        "how", "why", "should", "would", "could", "implement", "create",
        "add", "make", "use", "using", "need", "want",
    }

    # Extract purely alphabetic words (tokens containing digits,
    # e.g. "oauth2", are skipped by this pattern)
    words = re.findall(r'\b[a-z]+\b', text.lower())

    # Filter and count
    keyword_counts: Dict[str, int] = {}
    for word in words:
        if len(word) >= min_length and word not in stop_words:
            keyword_counts[word] = keyword_counts.get(word, 0) + 1

    # Sort by frequency (descending), then alphabetically
    sorted_keywords = sorted(
        keyword_counts.items(),
        key=lambda x: (-x[1], x[0])
    )

    # Return top keywords
    return [k for k, _ in sorted_keywords[:max_keywords]]
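

# Worked example (deterministic):
#
#     extract_keywords(
#         "implement user authentication with JWT tokens for secure API access"
#     )
#     # "implement", "with", and "for" are stop words; every survivor
#     # appears once, so ties sort alphabetically:
#     # ['access', 'api', 'authentication', 'jwt', 'secure', 'tokens', 'user']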


def parse_index_entry(index_content: str, topic: str) -> Optional[Dict[str, str]]:
    """Parse INDEX.md to find knowledge about a topic.

    Args:
        index_content: Content of INDEX.md file
        topic: Topic to search for (e.g., "authentication")

    Returns:
        Dictionary with entry details if found, None otherwise
    """
    topic_lower = topic.lower()

    # Split into sections
    sections = index_content.split("## ")

    for section in sections:
        if not section.strip():
            continue

        # Check if topic mentioned in section
        if topic_lower not in section.lower():
            continue

        # Extract entry details
        entry: Dict[str, str] = {}

        # Extract title (first line)
        lines = section.split("\n")
        if lines:
            entry["title"] = lines[0].strip()

        # Extract file path
        file_match = re.search(r'\*\*File\*\*:\s*`([^`]+)`', section)
        if file_match:
            entry["file"] = file_match.group(1)

        # Extract date
        date_match = re.search(r'\*\*Date\*\*:\s*(\d{4}-\d{2}-\d{2})', section)
        if date_match:
            entry["date"] = date_match.group(1)

        # Extract description
        desc_match = re.search(r'\*\*Description\*\*:\s*([^\n]+)', section)
        if desc_match:
            entry["description"] = desc_match.group(1)

        if "file" in entry:
            return entry

    return None
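

# Example (illustrative): given an INDEX.md section such as
#
#     ## Authentication
#     **File**: `research/jwt.md`
#     **Date**: 2025-01-15
#     **Description**: JWT handling patterns
#
# parse_index_entry(index_content, "authentication") returns
# {"title": "Authentication", "file": "research/jwt.md",
#  "date": "2025-01-15", "description": "JWT handling patterns"}.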


def bootstrap_knowledge_base(
    workspace_kb: Optional[Path] = None,
    template_kb: Optional[Path] = None
) -> Tuple[bool, str]:
    """Bootstrap knowledge base from plugin template if not exists.

    Creates .claude/knowledge/ by copying from the plugin's templates/knowledge/
    if the workspace knowledge base doesn't exist yet.

    Args:
        workspace_kb: Path to workspace knowledge base. Defaults to .claude/knowledge
        template_kb: Path to template knowledge base. Defaults to
            plugins/autonomous-dev/templates/knowledge

    Returns:
        Tuple of (success, message)
    """
    if workspace_kb is None:
        workspace_kb = Path(".claude/knowledge")

    if template_kb is None:
        template_kb = Path("plugins/autonomous-dev/templates/knowledge")

    # Check if workspace knowledge base already exists
    if workspace_kb.exists():
        # Already bootstrapped
        return True, "Knowledge base already exists"

    # Check if template exists
    if not template_kb.exists():
        # Create minimal structure without template
        try:
            workspace_kb.mkdir(parents=True, exist_ok=True)
            (workspace_kb / "best-practices").mkdir(exist_ok=True)
            (workspace_kb / "patterns").mkdir(exist_ok=True)
            (workspace_kb / "research").mkdir(exist_ok=True)

            # Create minimal INDEX.md
            index_content = """# Knowledge Base Index

**Last Updated**: {date}
**Purpose**: Persistent, organized knowledge for autonomous-dev plugin

## How to Use This Knowledge Base

### For Agents
Before researching a topic:
1. Read this INDEX to check if knowledge already exists
2. If found, read the specific file (avoids duplicate research)
3. If not found, research and save new findings here

### For Humans
- Browse by category below
- Each entry includes: topic, file path, date researched, brief description

---

## Best Practices

*(No entries yet)*

## Patterns

*(No entries yet)*

## Research

*(No entries yet)*
""".format(date=datetime.now().strftime("%Y-%m-%d"))

            (workspace_kb / "INDEX.md").write_text(index_content)

            return True, "Created minimal knowledge base structure (no template found)"

        except Exception as e:
            return False, f"Failed to create knowledge base: {str(e)}"

    # Copy template to workspace
    try:
        shutil.copytree(template_kb, workspace_kb)
        return True, f"Initialized knowledge base from template: {template_kb}"

    except Exception as e:
        return False, f"Failed to copy template: {str(e)}"
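

# Usage sketch (illustrative): safe to call repeatedly at startup.
#
#     ok, msg = bootstrap_knowledge_base()
#     # First run copies the template (or builds the minimal structure);
#     # later runs return (True, "Knowledge base already exists").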


# Example usage and testing
if __name__ == "__main__":
    print("=== Search Utilities Tests ===\n")

    # Test WebFetchCache
    print("1. Web Fetch Cache")
    cache = WebFetchCache(Path("/tmp/test-cache"))
    test_url = "https://example.com/article"

    # Should be miss first time
    result = cache.get(test_url)
    print(f"  Cache miss: {result is None}")

    # Set cache
    cache.set(test_url, "Test content")

    # Should be hit second time
    result = cache.get(test_url)
    print(f"  Cache hit: {result == 'Test content'}")
    print()

    # Test source scoring
    print("2. Source Quality Scoring")
    scores = [
        ("https://docs.python.org/guide", "Python Guide 2025", "Tutorial", "High authority + recent"),
        ("https://github.com/user/repo", "Example code", "Code examples 2024", "High authority"),
        ("https://medium.com/article", "Tutorial", "How to do X", "Medium authority"),
        ("https://random.com/post", "Old post", "Post from 2020", "Low authority + old"),
    ]

    for url, title, snippet, description in scores:
        score = score_source(url, title, snippet)
        print(f"  {description}: {score:.2f}")
    print()

    # Test pattern scoring
    print("3. Pattern Quality Scoring")
    patterns = [
        ("High quality", 0.9, True, True, 200, 10),
        ("Medium quality", 0.6, False, True, 100, 60),
        ("Low quality", 0.3, False, False, 30, 365),
    ]

    for desc, relevance, tests, docs, lines, days in patterns:
        score = score_pattern("test.py", "content", relevance, tests, docs, lines, days)
        print(f"  {desc}: {score:.2f}")
    print()

    # Test keyword extraction
    print("4. Keyword Extraction")
    text = "implement user authentication with JWT tokens for secure API access"
    keywords = extract_keywords(text)
    print(f"  Keywords: {', '.join(keywords)}")
    print()
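
    # Additional smoke tests (illustrative additions)

    # Test knowledge freshness with a freshly written temp file
    print("5. Knowledge Freshness")
    kb_file = Path("/tmp/test-knowledge.md")
    kb_file.write_text(
        f"# Test Topic\n\n**Date**: {datetime.now().strftime('%Y-%m-%d')}\n"
    )
    is_fresh, age_days, status = check_knowledge_freshness(kb_file)
    print(f"  Fresh entry: {is_fresh} ({status})")
    print()

    # Test index parsing against an in-memory INDEX.md snippet
    print("6. Index Entry Parsing")
    sample_index = (
        "# Knowledge Base Index\n\n"
        "## Authentication\n"
        "**File**: `research/jwt.md`\n"
        "**Date**: 2025-01-15\n"
        "**Description**: JWT handling patterns\n"
    )
    entry = parse_index_entry(sample_index, "authentication")
    print(f"  Found entry: {entry is not None and entry['file'] == 'research/jwt.md'}")
    print()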
print("✅ All tests complete!")
|