TradingAgents/.claude/lib/search_utils.py

"""Search utilities for researcher agent.

Provides utilities for:
- Web fetch caching
- Source quality scoring
- Pattern quality scoring
- Knowledge base freshness checking
"""

import hashlib
import re
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Tuple


class WebFetchCache:
    """Cache for web fetch results to reduce duplicate API calls.

    Caches fetched URLs with 7-day TTL to avoid re-fetching same content.
    Saves API costs and improves performance.

    Usage:
        cache = WebFetchCache()

        # Try cache first
        content = cache.get(url)
        if not content:
            content = fetch_from_web(url)
            cache.set(url, content)
    """

    def __init__(self, cache_dir: Optional[Path] = None, ttl_days: int = 7):
        """Initialize web fetch cache.

        Args:
            cache_dir: Directory to store cached files. Defaults to .claude/cache/web-fetch
            ttl_days: Time to live in days. Default 7 days.
        """
        if cache_dir is None:
            cache_dir = Path(".claude/cache/web-fetch")

        self.cache_dir = Path(cache_dir)
        self.ttl_days = ttl_days
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _get_cache_path(self, url: str) -> Path:
        """Get cache file path for URL."""
        url_hash = hashlib.md5(url.encode()).hexdigest()
        return self.cache_dir / f"{url_hash}.md"

    def get(self, url: str) -> Optional[str]:
        """Get cached content if fresh.

        Args:
            url: URL to fetch from cache

        Returns:
            Cached content if exists and fresh, None otherwise
        """
        cache_file = self._get_cache_path(url)

        if not cache_file.exists():
            return None

        try:
            content = cache_file.read_text()

            # Extract expiry date
            if "**Expires**:" in content:
                for line in content.split("\n"):
                    if "**Expires**:" in line:
                        expires_str = line.split(":", 1)[1].strip()
                        expires = datetime.fromisoformat(expires_str)

                        # Check if expired
                        if datetime.now() > expires:
                            cache_file.unlink()
                            return None

                        break

            # Extract content (after separator)
            if "---" in content:
                parts = content.split("---", 1)
                if len(parts) == 2:
                    return parts[1].strip()

            return content

        except Exception:
            # If any error reading cache, treat as miss
            return None

    def set(self, url: str, content: str) -> None:
        """Cache content with TTL.

        Args:
            url: URL being cached
            content: Content to cache
        """
        cache_file = self._get_cache_path(url)

        expires = datetime.now() + timedelta(days=self.ttl_days)

        cached = f"""# Cached Web Fetch

**URL**: {url}
**Fetched**: {datetime.now().isoformat()}
**Expires**: {expires.isoformat()}

---

{content}"""

        cache_file.write_text(cached)

    def clear_expired(self) -> int:
        """Remove all expired cache entries.

        Returns:
            Number of entries removed
        """
        removed = 0
        for cache_file in self.cache_dir.glob("*.md"):
            try:
                content = cache_file.read_text()

                if "**Expires**:" in content:
                    for line in content.split("\n"):
                        if "**Expires**:" in line:
                            expires_str = line.split(":", 1)[1].strip()
                            expires = datetime.fromisoformat(expires_str)

                            if datetime.now() > expires:
                                cache_file.unlink()
                                removed += 1
                            break
            except Exception:
                # If can't read, remove to be safe
                cache_file.unlink()
                removed += 1

        return removed


def score_source(url: str, title: str = "", snippet: str = "") -> float:
    """Score source quality for prioritization.

    Scores sources based on:
    - Authority (official docs, well-known sites)
    - Recency (2024-2025 content preferred)
    - Content indicators (tutorials, code examples)

    Args:
        url: Source URL
        title: Page title
        snippet: Text snippet from search result

    Returns:
        Quality score from 0.0 to 1.0
    """
    score = 0.0
    url_lower = url.lower()
    title_lower = title.lower()
    snippet_lower = snippet.lower()

    # Authority scoring (0.5 max)
    high_authority = [
        "python.org", "docs.python.org",
        "github.com", "anthropic.com", "docs.anthropic.com",
        "martinfowler.com", "realpython.com",
        "auth0.com", "owasp.org",
        "readthedocs.io", "readthedocs.org",
    ]

    medium_authority = [
        "stackoverflow.com", "medium.com",
        "dev.to", "hackernoon.com",
        "thoughtworks.com", "elastic.co",
    ]

    if any(auth in url_lower for auth in high_authority):
        score += 0.5
    elif any(auth in url_lower for auth in medium_authority):
        score += 0.3
    else:
        score += 0.1  # Base score for any source

    # Recency scoring (0.3 max)
    # Extract year from snippet or title
    year_pattern = r'\b(202[3-5]|2025)\b'
    year_match = re.search(year_pattern, snippet_lower + " " + title_lower)

    if year_match:
        year = int(year_match.group(1))
        current_year = datetime.now().year
        years_old = current_year - year

        if years_old == 0:
            score += 0.3  # Current year
        elif years_old == 1:
            score += 0.2  # Last year
        elif years_old == 2:
            score += 0.1  # 2 years old
        # Older than 2 years: no recency bonus

    # Content quality indicators (0.2 max)
    quality_indicators = {
        "tutorial": 0.05,
        "guide": 0.05,
        "best practices": 0.1,
        "example": 0.05,
        "code": 0.05,
        "documentation": 0.05,
        "official": 0.1,
    }

    combined_text = title_lower + " " + snippet_lower
    for indicator, points in quality_indicators.items():
        if indicator in combined_text:
            score += points

    # Cap at 1.0
    return min(1.0, score)


def score_pattern(
    file_path: str,
    content: str,
    keyword_relevance: float = 0.5,
    has_tests: bool = False,
    has_docstrings: bool = False,
    line_count: int = 0,
    last_modified_days: Optional[int] = None
) -> float:
    """Score codebase pattern quality.

    Scores patterns based on:
    - Keyword relevance (how well it matches search)
    - Has tests (indicates quality)
    - Has docstrings (indicates documentation)
    - Substantial code (>50 lines)
    - Recently modified (indicates maintenance)

    Args:
        file_path: Path to file containing pattern
        content: File content
        keyword_relevance: How relevant to search (0.0-1.0)
        has_tests: Whether tests exist for this pattern
        has_docstrings: Whether docstrings present
        line_count: Number of lines in file
        last_modified_days: Days since last modification

    Returns:
        Quality score from 0.0 to 1.0
    """
    score = 0.0

    # Keyword relevance (0.0-0.2)
    score += keyword_relevance * 0.2

    # Has tests (+0.3)
    if has_tests:
        score += 0.3

    # Has docstrings (+0.2)
    if has_docstrings:
        score += 0.2
    elif '"""' in content or "'''" in content:
        # Simple heuristic if not explicitly checked
        score += 0.2

    # Substantial code (+0.2)
    if line_count > 50:
        score += 0.2
    elif line_count > 20:
        score += 0.1

    # Recently modified (+0.1)
    if last_modified_days is not None:
        if last_modified_days < 30:
            score += 0.1
        elif last_modified_days < 90:
            score += 0.05

    return min(1.0, score)


def check_knowledge_freshness(knowledge_file: Path, max_age_days: int = 180) -> Tuple[bool, int, str]:
    """Check if knowledge base entry is fresh.

    Args:
        knowledge_file: Path to knowledge file
        max_age_days: Maximum age in days before considering stale

    Returns:
        Tuple of (is_fresh, age_in_days, status_message)
    """
    if not knowledge_file.exists():
        return False, -1, "File does not exist"

    try:
        content = knowledge_file.read_text()

        # Extract date from frontmatter
        date_pattern = r'\*\*Date(?:\s+Researched)?\*\*:\s*(\d{4}-\d{2}-\d{2})'
        match = re.search(date_pattern, content)

        if not match:
            return False, -1, "No date found in file"

        date_str = match.group(1)
        research_date = datetime.strptime(date_str, "%Y-%m-%d")
        age_days = (datetime.now() - research_date).days

        # Check freshness
        if age_days < 0:
            return False, age_days, "Future date (invalid)"
        elif age_days <= max_age_days:
            return True, age_days, f"Fresh ({age_days} days old)"
        else:
            return False, age_days, f"Stale ({age_days} days old, max {max_age_days})"

    except Exception as e:
        return False, -1, f"Error reading file: {str(e)}"


def extract_keywords(text: str, min_length: int = 3, max_keywords: int = 10) -> List[str]:
    """Extract keywords from user request for codebase search.

    Args:
        text: User request text
        min_length: Minimum keyword length
        max_keywords: Maximum keywords to return

    Returns:
        List of keywords sorted by relevance
    """
    # Common stop words to exclude
    stop_words = {
        "the", "and", "for", "are", "but", "not", "you", "all",
        "can", "her", "was", "one", "our", "out", "this", "that",
        "have", "has", "had", "with", "from", "what", "when", "where",
        "how", "why", "should", "would", "could", "implement", "create",
        "add", "make", "use", "using", "need", "want",
    }

    # Extract words
    words = re.findall(r'\b[a-z]+\b', text.lower())

    # Filter and count
    keyword_counts: Dict[str, int] = {}
    for word in words:
        if len(word) >= min_length and word not in stop_words:
            keyword_counts[word] = keyword_counts.get(word, 0) + 1

    # Sort by frequency, then alphabetically
    sorted_keywords = sorted(
        keyword_counts.items(),
        key=lambda x: (-x[1], x[0])
    )

    # Return top keywords
    return [k for k, _ in sorted_keywords[:max_keywords]]


def parse_index_entry(index_content: str, topic: str) -> Optional[Dict[str, str]]:
    """Parse INDEX.md to find knowledge about a topic.

    Args:
        index_content: Content of INDEX.md file
        topic: Topic to search for (e.g., "authentication")

    Returns:
        Dictionary with entry details if found, None otherwise
    """
    topic_lower = topic.lower()

    # Split into sections
    sections = index_content.split("## ")

    for section in sections:
        if not section.strip():
            continue

        # Check if topic mentioned in section
        if topic_lower not in section.lower():
            continue

        # Extract entry details
        entry = {}

        # Extract title (first line)
        lines = section.split("\n")
        if lines:
            entry["title"] = lines[0].strip()

        # Extract file path
        file_match = re.search(r'\*\*File\*\*:\s*`([^`]+)`', section)
        if file_match:
            entry["file"] = file_match.group(1)

        # Extract date
        date_match = re.search(r'\*\*Date\*\*:\s*(\d{4}-\d{2}-\d{2})', section)
        if date_match:
            entry["date"] = date_match.group(1)

        # Extract description
        desc_match = re.search(r'\*\*Description\*\*:\s*([^\n]+)', section)
        if desc_match:
            entry["description"] = desc_match.group(1)

        if "file" in entry:
            return entry

    return None


def bootstrap_knowledge_base(
    workspace_kb: Optional[Path] = None,
    template_kb: Optional[Path] = None
) -> Tuple[bool, str]:
    """Bootstrap knowledge base from plugin template if not exists.

    Creates .claude/knowledge/ by copying from plugin templates/knowledge/
    if the workspace knowledge base doesn't exist yet.

    Args:
        workspace_kb: Path to workspace knowledge base. Defaults to .claude/knowledge
        template_kb: Path to template knowledge base. Defaults to plugins/.../templates/knowledge

    Returns:
        Tuple of (success, message)
    """
    if workspace_kb is None:
        workspace_kb = Path(".claude/knowledge")

    if template_kb is None:
        template_kb = Path("plugins/autonomous-dev/templates/knowledge")

    # Check if workspace knowledge base already exists
    if workspace_kb.exists():
        # Already bootstrapped
        return True, "Knowledge base already exists"

    # Check if template exists
    if not template_kb.exists():
        # Create minimal structure without template
        try:
            workspace_kb.mkdir(parents=True, exist_ok=True)
            (workspace_kb / "best-practices").mkdir(exist_ok=True)
            (workspace_kb / "patterns").mkdir(exist_ok=True)
            (workspace_kb / "research").mkdir(exist_ok=True)

            # Create minimal INDEX.md
            index_content = """# Knowledge Base Index

**Last Updated**: {date}
**Purpose**: Persistent, organized knowledge for autonomous-dev plugin

## How to Use This Knowledge Base

### For Agents
Before researching a topic:
1. Read this INDEX to check if knowledge already exists
2. If found, read the specific file (avoids duplicate research)
3. If not found, research and save new findings here

### For Humans
- Browse by category below
- Each entry includes: topic, file path, date researched, brief description

---

## Best Practices

*(No entries yet)*

## Patterns

*(No entries yet)*

## Research

*(No entries yet)*
""".format(date=datetime.now().strftime("%Y-%m-%d"))

            (workspace_kb / "INDEX.md").write_text(index_content)

            return True, "Created minimal knowledge base structure (no template found)"

        except Exception as e:
            return False, f"Failed to create knowledge base: {str(e)}"

    # Copy template to workspace
    try:
        import shutil
        shutil.copytree(template_kb, workspace_kb)
        return True, f"Initialized knowledge base from template: {template_kb}"

    except Exception as e:
        return False, f"Failed to copy template: {str(e)}"


# Example usage and testing
if __name__ == "__main__":
    print("=== Search Utilities Tests ===\n")

    # Test WebFetchCache
    print("1. Web Fetch Cache")
    cache = WebFetchCache(Path("/tmp/test-cache"))
    test_url = "https://example.com/article"

    # Should be miss first time
    result = cache.get(test_url)
    print(f"   Cache miss: {result is None}")

    # Set cache
    cache.set(test_url, "Test content")

    # Should be hit second time
    result = cache.get(test_url)
    print(f"   Cache hit: {result == 'Test content'}")
    print()

    # Test source scoring
    print("2. Source Quality Scoring")
    scores = [
        ("https://docs.python.org/guide", "Python Guide 2025", "Tutorial", "High authority + recent"),
        ("https://github.com/user/repo", "Example code", "Code examples 2024", "High authority"),
        ("https://medium.com/article", "Tutorial", "How to do X", "Medium authority"),
        ("https://random.com/post", "Old post", "Post from 2020", "Low authority + old"),
    ]

    for url, title, snippet, description in scores:
        score = score_source(url, title, snippet)
        print(f"   {description}: {score:.2f}")
    print()

    # Test pattern scoring
    print("3. Pattern Quality Scoring")
    patterns = [
        ("High quality", 0.9, True, True, 200, 10),
        ("Medium quality", 0.6, False, True, 100, 60),
        ("Low quality", 0.3, False, False, 30, 365),
    ]

    for desc, relevance, tests, docs, lines, days in patterns:
        score = score_pattern("test.py", "content", relevance, tests, docs, lines, days)
        print(f"   {desc}: {score:.2f}")
    print()

    # Test keyword extraction
    print("4. Keyword Extraction")
    text = "implement user authentication with JWT tokens for secure API access"
    keywords = extract_keywords(text)
    print(f"   Keywords: {', '.join(keywords)}")
    print()

    print("✅ All tests complete!")