# TradingAgents/.claude/lib/project_md_parser.py

"""
PROJECT.md parsing and validation.

Parses PROJECT.md to extract GOALS, SCOPE (included/excluded), and CONSTRAINTS.
Provides structured access to project governance information.
"""

import re
from pathlib import Path
from typing import Dict, Any, Optional, List


class ProjectMdParser:
    """Parse PROJECT.md into goals, scope (included/excluded) and constraints.

    On construction the file is read once and four lists are populated:

    * ``goals``          - list items under the ``## GOALS`` heading
    * ``scope_included`` - list items under ``## SCOPE`` containing ``✅``
    * ``scope_excluded`` - list items under ``## SCOPE`` containing ``❌``
    * ``constraints``    - list items under the ``## CONSTRAINTS`` heading

    The raw file text is kept in ``content``.
    """

    # A level-2 heading; marks where the current section stops.
    _NEXT_SECTION_RE = re.compile(r"^##\s+", re.MULTILINE)

    # Start of the next subsection: an h3 heading, or a line that *begins*
    # with bold text.  Bold text inside a bullet ("- **Item** - ...") must
    # not end the subsection, so the bold alternative is anchored to the
    # start of the line.
    _NEXT_SUBSECTION_RE = re.compile(
        r"^(?:###\s+|[ \t]*\*\*[^*\n].*?\*\*)", re.MULTILINE
    )

    # Exactly one leading list marker ("-", "*" or "1.") plus an optional
    # status emoji.  A single pattern is used so that a bullet whose text
    # starts with a number ("- 3.11 or newer") keeps that number.
    _LIST_MARKER_RE = re.compile(r"^(?:[-*]|\d+\.)\s*[❌✅]?\s*")

    _NUMBERED_ITEM_RE = re.compile(r"^\d+\.")
    _BOLD_RE = re.compile(r"\*\*(.*?)\*\*")

    def __init__(self, project_md_path: Path):
        """
        Read PROJECT.md and parse its GOALS, SCOPE and CONSTRAINTS sections.

        Args:
            project_md_path: Path to PROJECT.md file

        Raises:
            FileNotFoundError: If ``project_md_path`` does not exist.
        """
        self.project_md_path = project_md_path

        if not project_md_path.exists():
            raise FileNotFoundError(f"PROJECT.md not found at: {project_md_path}")

        # Decode explicitly as UTF-8: scope parsing relies on the ✅ / ❌
        # characters, which the platform default encoding may not decode.
        self.content = project_md_path.read_text(encoding="utf-8")
        self.goals = self._parse_section("GOALS")

        # SCOPE items are split by their status emoji
        self.scope_included = self._parse_section("SCOPE", emoji_filter='✅')
        self.scope_excluded = self._parse_section("SCOPE", emoji_filter='❌')

        self.constraints = self._parse_section("CONSTRAINTS")

    def _parse_section(
        self,
        section_name: str,
        subsection: Optional[str] = None,
        emoji_filter: Optional[str] = None
    ) -> List[str]:
        """
        Parse the list items of one section of PROJECT.md

        Args:
            section_name: Literal name of main section (GOALS, SCOPE, CONSTRAINTS)
            subsection: Optional subsection name (e.g., "In Scope"); matched
                first as a ``###`` heading, then case-insensitively inside a
                ``**bold**`` header
            emoji_filter: Optional emoji to filter items (e.g., '✅' or '❌');
                only lines containing it are considered

        Returns:
            List of cleaned items in the section; empty if the section (or
            the requested subsection) is not present
        """
        # The heading may carry extra text after the name (e.g. an emoji).
        section_match = re.search(
            rf"^##\s+{re.escape(section_name)}\b", self.content, re.MULTILINE
        )
        if not section_match:
            return []

        # The body starts after the *whole* heading line, so trailing heading
        # text such as "## GOALS - 2024" is never mistaken for a list item.
        heading_end = self.content.find('\n', section_match.end())
        start = len(self.content) if heading_end == -1 else heading_end

        # The body runs until the next ## heading (or end of file).
        next_section = self._NEXT_SECTION_RE.search(self.content, start)
        end = next_section.start() if next_section else len(self.content)
        section_content = self.content[start:end]

        if subsection:
            section_content = self._extract_subsection(section_content, subsection)
            if section_content is None:
                return []

        return self._extract_items(section_content, emoji_filter)

    def _extract_subsection(self, section_content: str, subsection: str) -> Optional[str]:
        """Return the text of ``subsection`` within a section body, or None if absent."""
        escaped = re.escape(subsection)

        # Try ### header first (h3)
        header = re.search(rf"^###\s+{escaped}\s*$", section_content, re.MULTILINE)

        # Otherwise accept a **bold** header containing the name, e.g.
        # "**What's IN Scope**" for subsection="In Scope".
        if not header:
            header = re.search(rf"\*\*.*?{escaped}.*?\*\*", section_content, re.IGNORECASE)

        if not header:
            return None

        body_start = header.end()
        following = self._NEXT_SUBSECTION_RE.search(section_content, body_start)
        body_end = following.start() if following else len(section_content)
        return section_content[body_start:body_end]

    def _extract_items(self, text: str, emoji_filter: Optional[str]) -> List[str]:
        """Collect cleaned bullet / numbered list items from ``text``."""
        items = []
        for raw_line in text.split('\n'):
            line = raw_line.strip()

            # Skip bold header lines ("**Title**" or "**Title**: ...")
            if line.startswith('**') and (':' in line or line.endswith('**')):
                continue

            # Skip horizontal rules (---, ***, etc.)
            if line.startswith(('---', '***')) or line == '--':
                continue

            # Apply emoji filter if specified
            if emoji_filter and emoji_filter not in line:
                continue

            # Only bullet points (-, *) and numbered lists (1., 2., ...) count
            if not (line.startswith(('-', '*')) or self._NUMBERED_ITEM_RE.match(line)):
                continue

            item = self._clean_item(line)
            if item and not item.endswith(':'):  # Skip headers and empty items
                items.append(item)

        return items

    def _clean_item(self, line: str) -> str:
        """Strip the list marker, status emoji, bold markup and trailing explanation."""
        item = self._LIST_MARKER_RE.sub('', line, count=1).strip()
        item = self._BOLD_RE.sub(r'\1', item)

        # Keep only the headline of "Goal - explanation text" style items
        if ' - ' in item:
            item = item.split(' - ')[0].strip()

        return item

    def to_dict(self) -> Dict[str, Any]:
        """Convert parsed PROJECT.md to dictionary"""
        return {
            'goals': self.goals,
            'scope': {
                'included': self.scope_included,
                'excluded': self.scope_excluded
            },
            'constraints': self.constraints
        }