TradingAgents/.claude/lib/search_utils.py


"""Search utilities for researcher agent.
Provides utilities for:
- Web fetch caching
- Source quality scoring
- Pattern quality scoring
- Knowledge base freshness checking
"""
import hashlib
import re
import shutil
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Tuple
class WebFetchCache:
"""Cache for web fetch results to reduce duplicate API calls.
    Caches fetched URLs with a 7-day TTL to avoid re-fetching the same content.
    Saves API costs and improves performance.
Usage:
cache = WebFetchCache()
# Try cache first
content = cache.get(url)
if not content:
content = fetch_from_web(url)
cache.set(url, content)
"""
def __init__(self, cache_dir: Optional[Path] = None, ttl_days: int = 7):
"""Initialize web fetch cache.
Args:
cache_dir: Directory to store cached files. Defaults to .claude/cache/web-fetch
ttl_days: Time to live in days. Default 7 days.
"""
if cache_dir is None:
cache_dir = Path(".claude/cache/web-fetch")
self.cache_dir = Path(cache_dir)
self.ttl_days = ttl_days
self.cache_dir.mkdir(parents=True, exist_ok=True)
def _get_cache_path(self, url: str) -> Path:
"""Get cache file path for URL."""
url_hash = hashlib.md5(url.encode()).hexdigest()
return self.cache_dir / f"{url_hash}.md"
def get(self, url: str) -> Optional[str]:
"""Get cached content if fresh.
Args:
url: URL to fetch from cache
Returns:
Cached content if exists and fresh, None otherwise
"""
cache_file = self._get_cache_path(url)
if not cache_file.exists():
return None
try:
content = cache_file.read_text()
# Extract expiry date
if "**Expires**:" in content:
for line in content.split("\n"):
if "**Expires**:" in line:
expires_str = line.split(":", 1)[1].strip()
expires = datetime.fromisoformat(expires_str)
# Check if expired
if datetime.now() > expires:
cache_file.unlink()
return None
break
# Extract content (after separator)
if "---" in content:
parts = content.split("---", 1)
if len(parts) == 2:
return parts[1].strip()
return content
except Exception:
# If any error reading cache, treat as miss
return None
def set(self, url: str, content: str) -> None:
"""Cache content with TTL.
Args:
url: URL being cached
content: Content to cache
"""
cache_file = self._get_cache_path(url)
expires = datetime.now() + timedelta(days=self.ttl_days)
cached = f"""# Cached Web Fetch
**URL**: {url}
**Fetched**: {datetime.now().isoformat()}
**Expires**: {expires.isoformat()}
---
{content}"""
cache_file.write_text(cached)
def clear_expired(self) -> int:
"""Remove all expired cache entries.
Returns:
Number of entries removed
"""
removed = 0
for cache_file in self.cache_dir.glob("*.md"):
try:
content = cache_file.read_text()
if "**Expires**:" in content:
for line in content.split("\n"):
if "**Expires**:" in line:
expires_str = line.split(":", 1)[1].strip()
expires = datetime.fromisoformat(expires_str)
if datetime.now() > expires:
cache_file.unlink()
removed += 1
break
except Exception:
# If can't read, remove to be safe
cache_file.unlink()
removed += 1
return removed
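# Illustrative sketch of the on-disk file that WebFetchCache.set() writes and
# get() parses (all values below are invented for illustration):
#
#   # Cached Web Fetch
#   **URL**: https://example.com/article
#   **Fetched**: 2025-01-01T12:00:00
#   **Expires**: 2025-01-08T12:00:00
#   ---
#   <fetched page content>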
def score_source(url: str, title: str = "", snippet: str = "") -> float:
"""Score source quality for prioritization.
Scores sources based on:
- Authority (official docs, well-known sites)
    - Recency (content from the last two years preferred)
- Content indicators (tutorials, code examples)
Args:
url: Source URL
title: Page title
snippet: Text snippet from search result
Returns:
Quality score from 0.0 to 1.0
"""
score = 0.0
url_lower = url.lower()
title_lower = title.lower()
snippet_lower = snippet.lower()
# Authority scoring (0.5 max)
high_authority = [
"python.org", "docs.python.org",
"github.com", "anthropic.com", "docs.anthropic.com",
"martinfowler.com", "realpython.com",
"auth0.com", "owasp.org",
"readthedocs.io", "readthedocs.org",
]
medium_authority = [
"stackoverflow.com", "medium.com",
"dev.to", "hackernoon.com",
"thoughtworks.com", "elastic.co",
]
if any(auth in url_lower for auth in high_authority):
score += 0.5
elif any(auth in url_lower for auth in medium_authority):
score += 0.3
else:
score += 0.1 # Base score for any source
# Recency scoring (0.3 max)
    # Extract a recent year from snippet or title (hardcoded range; update over time)
    year_pattern = r'\b(202[3-5])\b'
year_match = re.search(year_pattern, snippet_lower + " " + title_lower)
if year_match:
year = int(year_match.group(1))
current_year = datetime.now().year
years_old = current_year - year
if years_old == 0:
score += 0.3 # Current year
elif years_old == 1:
score += 0.2 # Last year
elif years_old == 2:
score += 0.1 # 2 years old
# Older than 2 years: no recency bonus
# Content quality indicators (0.2 max)
quality_indicators = {
"tutorial": 0.05,
"guide": 0.05,
"best practices": 0.1,
"example": 0.05,
"code": 0.05,
"documentation": 0.05,
"official": 0.1,
}
    combined_text = title_lower + " " + snippet_lower
    indicator_score = 0.0
    for indicator, points in quality_indicators.items():
        if indicator in combined_text:
            indicator_score += points
    score += min(0.2, indicator_score)  # cap content indicators at 0.2 per the scheme above
    # Cap total at 1.0
    return min(1.0, score)
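# Worked example (illustrative; assumes the current year is 2025):
#   score_source("https://docs.python.org/3/tutorial/", "Python Tutorial",
#                "Official tutorial, 2025")
#   -> 0.5 (high authority) + 0.3 (current year) + 0.15 ("tutorial" + "official") = 0.95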
def score_pattern(
file_path: str,
content: str,
keyword_relevance: float = 0.5,
has_tests: bool = False,
has_docstrings: bool = False,
line_count: int = 0,
last_modified_days: Optional[int] = None
) -> float:
"""Score codebase pattern quality.
Scores patterns based on:
- Keyword relevance (how well it matches search)
- Has tests (indicates quality)
- Has docstrings (indicates documentation)
- Substantial code (>50 lines)
- Recently modified (indicates maintenance)
Args:
file_path: Path to file containing pattern
content: File content
keyword_relevance: How relevant to search (0.0-1.0)
has_tests: Whether tests exist for this pattern
has_docstrings: Whether docstrings present
line_count: Number of lines in file
last_modified_days: Days since last modification
Returns:
Quality score from 0.0 to 1.0
"""
score = 0.0
# Keyword relevance (0.0-0.2)
score += keyword_relevance * 0.2
# Has tests (+0.3)
if has_tests:
score += 0.3
# Has docstrings (+0.2)
if has_docstrings:
score += 0.2
elif '"""' in content or "'''" in content:
        # Fallback heuristic when the caller did not check for docstrings
score += 0.2
# Substantial code (+0.2)
if line_count > 50:
score += 0.2
elif line_count > 20:
score += 0.1
# Recently modified (+0.1)
if last_modified_days is not None:
if last_modified_days < 30:
score += 0.1
elif last_modified_days < 90:
score += 0.05
return min(1.0, score)
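# Worked example (illustrative): relevance 0.9 with tests, docstrings, 200 lines,
# and a modification 10 days ago scores 0.9*0.2 + 0.3 + 0.2 + 0.2 + 0.1 = 0.98.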
def check_knowledge_freshness(knowledge_file: Path, max_age_days: int = 180) -> Tuple[bool, int, str]:
"""Check if knowledge base entry is fresh.
Args:
knowledge_file: Path to knowledge file
max_age_days: Maximum age in days before considering stale
Returns:
Tuple of (is_fresh, age_in_days, status_message)
"""
if not knowledge_file.exists():
return False, -1, "File does not exist"
try:
content = knowledge_file.read_text()
        # Extract the research date from the file's metadata header
date_pattern = r'\*\*Date(?:\s+Researched)?\*\*:\s*(\d{4}-\d{2}-\d{2})'
match = re.search(date_pattern, content)
if not match:
return False, -1, "No date found in file"
date_str = match.group(1)
research_date = datetime.strptime(date_str, "%Y-%m-%d")
age_days = (datetime.now() - research_date).days
# Check freshness
if age_days < 0:
return False, age_days, "Future date (invalid)"
elif age_days <= max_age_days:
return True, age_days, f"Fresh ({age_days} days old)"
else:
return False, age_days, f"Stale ({age_days} days old, max {max_age_days})"
except Exception as e:
return False, -1, f"Error reading file: {str(e)}"
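# The regex above matches metadata lines of either form (dates invented):
#   **Date Researched**: 2025-01-15
#   **Date**: 2025-01-15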
def extract_keywords(text: str, min_length: int = 3, max_keywords: int = 10) -> List[str]:
"""Extract keywords from user request for codebase search.
Args:
text: User request text
min_length: Minimum keyword length
max_keywords: Maximum keywords to return
Returns:
        List of keywords sorted by frequency (most frequent first, ties alphabetical)
"""
# Common stop words to exclude
stop_words = {
"the", "and", "for", "are", "but", "not", "you", "all",
"can", "her", "was", "one", "our", "out", "this", "that",
"have", "has", "had", "with", "from", "what", "when", "where",
"how", "why", "should", "would", "could", "implement", "create",
"add", "make", "use", "using", "need", "want",
}
# Extract words
words = re.findall(r'\b[a-z]+\b', text.lower())
# Filter and count
keyword_counts: Dict[str, int] = {}
for word in words:
if len(word) >= min_length and word not in stop_words:
keyword_counts[word] = keyword_counts.get(word, 0) + 1
# Sort by frequency, then alphabetically
sorted_keywords = sorted(
keyword_counts.items(),
key=lambda x: (-x[1], x[0])
)
# Return top keywords
return [k for k, _ in sorted_keywords[:max_keywords]]
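# Worked example:
#   extract_keywords("implement user authentication with JWT tokens")
#   -> ['authentication', 'jwt', 'tokens', 'user']
# ("implement" and "with" are stop words; equal frequencies sort alphabetically)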
def parse_index_entry(index_content: str, topic: str) -> Optional[Dict[str, str]]:
"""Parse INDEX.md to find knowledge about a topic.
Args:
index_content: Content of INDEX.md file
topic: Topic to search for (e.g., "authentication")
Returns:
Dictionary with entry details if found, None otherwise
"""
topic_lower = topic.lower()
# Split into sections
sections = index_content.split("## ")
for section in sections:
if not section.strip():
continue
# Check if topic mentioned in section
if topic_lower not in section.lower():
continue
# Extract entry details
entry = {}
# Extract title (first line)
lines = section.split("\n")
if lines:
entry["title"] = lines[0].strip()
# Extract file path
file_match = re.search(r'\*\*File\*\*:\s*`([^`]+)`', section)
if file_match:
entry["file"] = file_match.group(1)
# Extract date
date_match = re.search(r'\*\*Date\*\*:\s*(\d{4}-\d{2}-\d{2})', section)
if date_match:
entry["date"] = date_match.group(1)
# Extract description
desc_match = re.search(r'\*\*Description\*\*:\s*([^\n]+)', section)
if desc_match:
entry["description"] = desc_match.group(1)
if "file" in entry:
return entry
return None
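# Illustrative INDEX.md section shape that parse_index_entry() expects
# (all field values invented):
#
#   ## Authentication
#   **File**: `best-practices/authentication.md`
#   **Date**: 2025-01-15
#   **Description**: JWT validation and session-handling patterns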
def bootstrap_knowledge_base(
workspace_kb: Optional[Path] = None,
template_kb: Optional[Path] = None
) -> Tuple[bool, str]:
"""Bootstrap knowledge base from plugin template if not exists.
Creates .claude/knowledge/ by copying from plugin templates/knowledge/
if the workspace knowledge base doesn't exist yet.
Args:
workspace_kb: Path to workspace knowledge base. Defaults to .claude/knowledge
template_kb: Path to template knowledge base. Defaults to plugins/.../templates/knowledge
Returns:
Tuple of (success, message)
"""
if workspace_kb is None:
workspace_kb = Path(".claude/knowledge")
if template_kb is None:
template_kb = Path("plugins/autonomous-dev/templates/knowledge")
# Check if workspace knowledge base already exists
if workspace_kb.exists():
# Already bootstrapped
return True, "Knowledge base already exists"
# Check if template exists
if not template_kb.exists():
# Create minimal structure without template
try:
workspace_kb.mkdir(parents=True, exist_ok=True)
(workspace_kb / "best-practices").mkdir(exist_ok=True)
(workspace_kb / "patterns").mkdir(exist_ok=True)
(workspace_kb / "research").mkdir(exist_ok=True)
# Create minimal INDEX.md
index_content = """# Knowledge Base Index
**Last Updated**: {date}
**Purpose**: Persistent, organized knowledge for autonomous-dev plugin
## How to Use This Knowledge Base
### For Agents
Before researching a topic:
1. Read this INDEX to check if knowledge already exists
2. If found, read the specific file (avoids duplicate research)
3. If not found, research and save new findings here
### For Humans
- Browse by category below
- Each entry includes: topic, file path, date researched, brief description
---
## Best Practices
*(No entries yet)*
## Patterns
*(No entries yet)*
## Research
*(No entries yet)*
""".format(date=datetime.now().strftime("%Y-%m-%d"))
(workspace_kb / "INDEX.md").write_text(index_content)
return True, "Created minimal knowledge base structure (no template found)"
except Exception as e:
return False, f"Failed to create knowledge base: {str(e)}"
# Copy template to workspace
try:
import shutil
shutil.copytree(template_kb, workspace_kb)
return True, f"Initialized knowledge base from template: {template_kb}"
except Exception as e:
return False, f"Failed to copy template: {str(e)}"
# Example usage and testing
if __name__ == "__main__":
print("=== Search Utilities Tests ===\n")
# Test WebFetchCache
print("1. Web Fetch Cache")
    import tempfile
    cache = WebFetchCache(Path(tempfile.mkdtemp(prefix="test-cache-")))
test_url = "https://example.com/article"
# Should be miss first time
result = cache.get(test_url)
print(f" Cache miss: {result is None}")
# Set cache
cache.set(test_url, "Test content")
# Should be hit second time
result = cache.get(test_url)
print(f" Cache hit: {result == 'Test content'}")
print()
# Test source scoring
print("2. Source Quality Scoring")
scores = [
("https://docs.python.org/guide", "Python Guide 2025", "Tutorial", "High authority + recent"),
("https://github.com/user/repo", "Example code", "Code examples 2024", "High authority"),
("https://medium.com/article", "Tutorial", "How to do X", "Medium authority"),
("https://random.com/post", "Old post", "Post from 2020", "Low authority + old"),
]
for url, title, snippet, description in scores:
score = score_source(url, title, snippet)
print(f" {description}: {score:.2f}")
print()
# Test pattern scoring
print("3. Pattern Quality Scoring")
patterns = [
("High quality", 0.9, True, True, 200, 10),
("Medium quality", 0.6, False, True, 100, 60),
("Low quality", 0.3, False, False, 30, 365),
]
for desc, relevance, tests, docs, lines, days in patterns:
score = score_pattern("test.py", "content", relevance, tests, docs, lines, days)
print(f" {desc}: {score:.2f}")
print()
# Test keyword extraction
print("4. Keyword Extraction")
text = "implement user authentication with JWT tokens for secure API access"
keywords = extract_keywords(text)
print(f" Keywords: {', '.join(keywords)}")
print()
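    # Additional illustrative checks for the remaining helpers, using throwaway
    # temp files (paths and field values below are invented for the demo).
    print("5. Knowledge Freshness Check")
    kb_file = Path(tempfile.mkdtemp(prefix="test-kb-")) / "topic.md"
    kb_file.write_text(f"**Date Researched**: {datetime.now().strftime('%Y-%m-%d')}\n\nFindings...")
    is_fresh, age, status = check_knowledge_freshness(kb_file)
    print(f"   Fresh today: {is_fresh} ({status})")
    print()
    print("6. Index Entry Parsing")
    sample_index = (
        "# Knowledge Base Index\n\n"
        "## Authentication\n"
        "**File**: `best-practices/authentication.md`\n"
        "**Date**: 2025-01-15\n"
        "**Description**: JWT validation patterns\n"
    )
    entry = parse_index_entry(sample_index, "authentication")
    print(f"   Entry found: {entry is not None and entry.get('file') == 'best-practices/authentication.md'}")
    print()
    print("7. Knowledge Base Bootstrap")
    tmp_root = Path(tempfile.mkdtemp(prefix="test-bootstrap-"))
    ok, msg = bootstrap_knowledge_base(
        workspace_kb=tmp_root / "knowledge",
        template_kb=tmp_root / "missing-template",
    )
    print(f"   Bootstrapped: {ok} ({msg})")
    print()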
print("✅ All tests complete!")