TradingAgents/.claude/lib/project_md_parser.py

138 lines
5.0 KiB
Python

"""
PROJECT.md parsing and validation.
Parses PROJECT.md to extract GOALS, SCOPE (included/excluded), and CONSTRAINTS.
Provides structured access to project governance information.
"""
import re
from pathlib import Path
from typing import Dict, Any, Optional, List
class ProjectMdParser:
"""Parse and validate PROJECT.md"""
def __init__(self, project_md_path: Path):
"""
Initialize parser
Args:
project_md_path: Path to PROJECT.md file
"""
self.project_md_path = project_md_path
if not project_md_path.exists():
raise FileNotFoundError(f"PROJECT.md not found at: {project_md_path}")
self.content = project_md_path.read_text()
self.goals = self._parse_section("GOALS")
# Parse SCOPE section by emoji
self.scope_included = self._parse_section("SCOPE", emoji_filter='')
self.scope_excluded = self._parse_section("SCOPE", emoji_filter='')
self.constraints = self._parse_section("CONSTRAINTS")
def _parse_section(
self,
section_name: str,
subsection: Optional[str] = None,
emoji_filter: Optional[str] = None
) -> List[str]:
"""
Parse a section from PROJECT.md
Args:
section_name: Name of main section (GOALS, SCOPE, CONSTRAINTS)
subsection: Optional subsection name (e.g., "In Scope")
emoji_filter: Optional emoji to filter items (e.g., '' or '')
Returns:
List of items in the section
"""
# Find section (allow any characters after section name, like emojis)
section_pattern = rf"^##\s+{section_name}\b"
section_match = re.search(section_pattern, self.content, re.MULTILINE)
if not section_match:
return []
# Extract section content (until next ## heading)
start = section_match.end()
next_section = re.search(r"^##\s+", self.content[start:], re.MULTILINE)
end = start + next_section.start() if next_section else len(self.content)
section_content = self.content[start:end]
# If subsection specified, extract that
if subsection:
# Try ### header first (h3)
subsection_pattern = rf"^###\s+{subsection}\s*$"
subsection_match = re.search(subsection_pattern, section_content, re.MULTILINE)
# If not found, try **bold** header with flexible matching
if not subsection_match:
# Match "**What's IN Scope**" for subsection="In Scope"
# Use case-insensitive and partial matching
subsection_pattern = rf"\*\*.*?{re.escape(subsection)}.*?\*\*"
subsection_match = re.search(subsection_pattern, section_content, re.IGNORECASE)
if not subsection_match:
return []
subsection_start = subsection_match.end()
# Find next subsection (either ### or **)
next_subsection = re.search(r"(^###\s+|\*\*.*?\*\*)", section_content[subsection_start:], re.MULTILINE)
subsection_end = subsection_start + next_subsection.start() if next_subsection else len(section_content)
section_content = section_content[subsection_start:subsection_end]
# Extract bullet points and numbered lists
items = []
for line in section_content.split('\n'):
line = line.strip()
# Skip section headers (lines with ** that end with : or **:)
if line.startswith('**') and (':' in line or line.endswith('**')):
continue
# Skip horizontal rules (---, ***, etc.)
if line.startswith('---') or line.startswith('***') or line == '--':
continue
# Apply emoji filter if specified
if emoji_filter and emoji_filter not in line:
continue
# Match bullet points (-, *) or numbered lists (1., 2., etc.)
if line.startswith('-') or line.startswith('*') or re.match(r'^\d+\.', line):
# Remove leading marker and ❌/✅ symbols
item = re.sub(r'^[-*]\s*[❌✅]?\s*', '', line).strip()
item = re.sub(r'^\d+\.\s*[❌✅]?\s*', '', item).strip()
# Remove **bold** markers
item = re.sub(r'\*\*(.*?)\*\*', r'\1', item)
# Extract main content before dash or hyphen (for items like "Goal - explanation")
# This gets "Goal" from "Goal - explanation text"
if ' - ' in item:
item = item.split(' - ')[0].strip()
if item and not item.endswith(':'): # Skip headers and empty items
items.append(item)
return items
def to_dict(self) -> Dict[str, Any]:
"""Convert parsed PROJECT.md to dictionary"""
return {
'goals': self.goals,
'scope': {
'included': self.scope_included,
'excluded': self.scope_excluded
},
'constraints': self.constraints
}