#!/usr/bin/env python3
"""
Security Utilities - Shared security validation and audit logging

This module provides centralized security functions for path validation,
input sanitization, and audit logging to prevent common vulnerabilities:
- CWE-22: Path Traversal
- CWE-59: Improper Link Resolution Before File Access
- CWE-117: Improper Output Neutralization for Logs

All security-sensitive operations in the codebase should use these utilities
to ensure consistent security enforcement.

Security Features:
- Whitelist-based path validation (PROJECT_ROOT, ~/.claude/, and system temp in test mode)
- Symlink detection and rejection
- Path traversal prevention (reject .., resolve symlinks)
- Pytest format validation (test_file.py::test_name pattern)
- Thread-safe audit logging with rotation (10MB limit)
- Clear error messages for security violations

Usage:
    from security_utils import validate_path, validate_pytest_path, audit_log

    # Path validation (whitelist-based)
    try:
        safe_path = validate_path(user_path, "session file")
    except ValueError as e:
        print(f"Security violation: {e}")

    # Pytest path validation
    try:
        safe_pytest = validate_pytest_path(pytest_path, "test execution")
    except ValueError as e:
        print(f"Invalid pytest path: {e}")

    # Audit logging
    audit_log("path_validation", "success", {
        "operation": "validate_session_file",
        "path": str(safe_path),
        "user": os.getenv("USER")
    })

Date: 2025-11-07
Issue: GitHub #46 (CRITICAL path validation bypass)
Agent: implementer


Design Patterns:
    See library-design-patterns skill for standardized design patterns.
"""

import json
import logging
import os
import re
import tempfile
import threading
from datetime import datetime, timezone
from logging.handlers import RotatingFileHandler
from pathlib import Path
from typing import Optional, Dict, Any


# Project root for whitelist validation (DYNAMIC DETECTION for cross-project use)
def _detect_project_root() -> Path:
    """Dynamically detect project root from current working directory.

    Detection strategy (prioritizes .git over .claude):
    1. Search ALL the way up for .git first (git repos take precedence)
    2. If no .git found, search for .claude directory
    3. Fall back to CWD if no markers found

    This enables auto-approval to work across ALL projects, not just autonomous-dev.
    Prioritizing .git prevents nested .claude directories (e.g., plugins/autonomous-dev/.claude)
    from being incorrectly detected as project root.

    Returns:
        Detected project root directory

    Security Note:
        validate_path() still enforces security boundaries - this just makes
        those boundaries project-specific instead of hardcoded to plugin location.
    """
    start = Path.cwd()

    # Priority 1: Search ALL the way up for .git (git repos take precedence)
    current = start
    for _ in range(10):
        if (current / ".git").exists():
            return current.resolve()
        if current.parent == current:
            break  # Reached filesystem root
        current = current.parent

    # Priority 2: Search for .claude if no .git found
    current = start
    for _ in range(10):
        if (current / ".claude").exists():
            return current.resolve()
        if current.parent == current:
            break  # Reached filesystem root
        current = current.parent

    # Fall back to current working directory
    return Path.cwd().resolve()

PROJECT_ROOT = _detect_project_root()

# Whitelist of allowed directories (relative to PROJECT_ROOT)
ALLOWED_DIRS = [
    "",  # PROJECT_ROOT itself
    "docs/sessions",  # Session logs
    ".claude",  # Claude configuration
    "plugins/autonomous-dev/lib",  # Library files
    "scripts",  # Scripts
    "tests",  # Test files
]

# System temp directory (allowed in test mode)
SYSTEM_TEMP = Path(tempfile.gettempdir()).resolve()

# Claude home directory (~/.claude/) - allowed for Claude Code system operations
# This is a fixed, known location for:
# - Plan mode files (~/.claude/plans/)
# - Global CLAUDE.md (~/.claude/CLAUDE.md)
# - Global settings (~/.claude/settings.json)
# Security: Still validates symlinks and path traversal within this directory
CLAUDE_HOME_DIR = Path.home() / ".claude"

# Thread-safe logger for audit logs
_audit_logger: Optional[logging.Logger] = None
_audit_logger_lock = threading.Lock()

# Input validation constants
MAX_MESSAGE_LENGTH = 10000  # 10KB max message length
MAX_PATH_LENGTH = 4096  # POSIX PATH_MAX limit
PYTEST_PATH_PATTERN = re.compile(r'^[\w/.-]+\.py(?:::[\w\[\],_-]+)?$')


def _get_audit_logger() -> logging.Logger:
    """Get or create thread-safe audit logger with rotation.

    Returns:
        Configured logger for security audit events

    Logger Configuration:
    - File: logs/security_audit.log
    - Format: JSON with timestamp, event type, status, context
    - Rotation: 10MB max size, keep 5 backup files
    - Thread-safe: Uses threading.Lock for concurrent access
    
See error-handling-patterns skill for exception hierarchy and error handling best practices.
"""
    global _audit_logger

    if _audit_logger is not None:
        return _audit_logger

    with _audit_logger_lock:
        # Double-check pattern to prevent race condition
        if _audit_logger is not None:
            return _audit_logger

        # Create logs directory
        log_dir = PROJECT_ROOT / "logs"
        log_dir.mkdir(exist_ok=True)

        # Configure logger
        logger = logging.getLogger("security_audit")
        logger.setLevel(logging.INFO)
        logger.propagate = False  # Don't propagate to root logger

        # Create rotating file handler (10MB max, 5 backups)
        log_file = log_dir / "security_audit.log"
        handler = RotatingFileHandler(
            log_file,
            maxBytes=10 * 1024 * 1024,  # 10MB
            backupCount=5,
            encoding='utf-8'
        )

        # JSON format for structured logging
        handler.setFormatter(logging.Formatter('%(message)s'))
        logger.addHandler(handler)

        _audit_logger = logger
        return _audit_logger


def audit_log(event_type: str, status: str, context: Dict[str, Any]) -> None:
    """Log security event to audit log.

    Args:
        event_type: Type of security event (e.g., "path_validation", "input_sanitization")
        status: Event status ("success", "failure", "warning")
        context: Additional context dict (operation, path, user, etc.)

    Security Note:
    - All path validation operations should be audited
    - Failed validations are logged for security monitoring
    - Thread-safe for concurrent agent execution
    """
    logger = _get_audit_logger()

    # Create audit record
    record = {
        "timestamp": datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
        "event_type": event_type,
        "status": status,
        "context": context
    }

    # Log as JSON
    logger.info(json.dumps(record))


def validate_path(
    path: Path | str,
    purpose: str,
    allow_missing: bool = False,
    test_mode: Optional[bool] = None
) -> Path:
    """Validate path is within project boundaries (whitelist-based).

    Args:
        path: Path to validate
        purpose: Human-readable description of what this path is for
        allow_missing: Whether to allow non-existent paths
        test_mode: Override test mode detection (None = auto-detect)

    Returns:
        Resolved, validated Path object

    Raises:
        ValueError: If path is outside project, is a symlink, or contains traversal

    Security Design (GitHub Issue #46):
    ===================================
    This function uses WHITELIST validation (allow known safe locations) instead
    of BLACKLIST validation (block known bad patterns).

    Validation Layers:
    1. String-level checks: Reject obvious traversal (.., absolute system paths)
    2. Symlink detection: Reject symlinks before resolution
    3. Path resolution: Normalize path to absolute form
    4. Whitelist validation: Ensure path is in PROJECT_ROOT or allowed temp dirs

    Allowed Locations (always):
    ===========================
    - PROJECT_ROOT and subdirectories
    - ~/.claude/ directory (Claude Code system files: plans, CLAUDE.md, settings)

    Test Mode (additional):
    =======================
    When pytest runs, it creates temp directories outside PROJECT_ROOT.
    Test mode additionally allows:
    - System temp directory (tempfile.gettempdir())

    Blocked Locations:
    ==================
    - /etc/, /usr/, /bin/, /sbin/, /var/log/ (system directories)
    - Arbitrary paths outside whitelist

    Attack Scenarios Blocked:
    =========================
    - Relative traversal: "../../etc/passwd" (blocked by check #1)
    - Absolute system paths: "/etc/passwd" (blocked by check #4)
    - Symlink escapes: "link" -> "/etc/passwd" (blocked by check #2)
    - Mixed traversal: "subdir/../../etc" (blocked by check #3 after resolve)
    """
    # Convert to Path if string
    if isinstance(path, str):
        path = Path(path)

    # Detect test mode
    if test_mode is None:
        test_mode = os.getenv("PYTEST_CURRENT_TEST") is not None

    # SECURITY LAYER 1: String-level validation
    path_str = str(path)

    # Reject obvious traversal patterns
    if ".." in path_str:
        audit_log("path_validation", "failure", {
            "operation": f"validate_{purpose.replace(' ', '_')}",
            "path": path_str,
            "reason": "path_traversal_attempt",
            "pattern": ".."
        })
        raise ValueError(
            f"Path traversal attempt detected: {path}\n"
            f"Purpose: {purpose}\n"
            f"Paths containing '..' are not allowed.\n"
            f"Expected: Path within project or allowed directories\n"
            f"See: docs/SECURITY.md#path-validation"
        )

    # Reject excessively long paths (potential buffer overflow)
    if len(path_str) > MAX_PATH_LENGTH:
        audit_log("path_validation", "failure", {
            "operation": f"validate_{purpose.replace(' ', '_')}",
            "path": path_str[:100] + "...",
            "reason": "path_too_long",
            "length": len(path_str)
        })
        raise ValueError(
            f"Path too long: {len(path_str)} characters\n"
            f"Purpose: {purpose}\n"
            f"Maximum allowed: {MAX_PATH_LENGTH} characters\n"
            f"Expected: Reasonable path length"
        )

    # SECURITY LAYER 2: Symlink detection (before resolution)
    if path.exists() and path.is_symlink():
        audit_log("path_validation", "failure", {
            "operation": f"validate_{purpose.replace(' ', '_')}",
            "path": path_str,
            "reason": "symlink_detected"
        })
        raise ValueError(
            f"Symlinks are not allowed: {path}\n"
            f"Purpose: {purpose}\n"
            f"Symlinks can be used to escape directory boundaries.\n"
            f"Expected: Regular file or directory path\n"
            f"See: docs/SECURITY.md#symlink-policy"
        )

    # SECURITY LAYER 3: Path resolution and normalization
    try:
        resolved_path = path.resolve()

        # Check resolved path for symlinks (catches symlinks in parent dirs)
        if not allow_missing and resolved_path.exists() and resolved_path.is_symlink():
            audit_log("path_validation", "failure", {
                "operation": f"validate_{purpose.replace(' ', '_')}",
                "path": path_str,
                "resolved": str(resolved_path),
                "reason": "symlink_in_resolved_path"
            })
            raise ValueError(
                f"Path contains symlink: {path}\n"
                f"Resolved path is a symlink: {resolved_path}\n"
                f"Purpose: {purpose}\n"
                f"Expected: Regular path without symlinks\n"
                f"See: docs/SECURITY.md#symlink-policy"
            )

    except (OSError, RuntimeError) as e:
        audit_log("path_validation", "failure", {
            "operation": f"validate_{purpose.replace(' ', '_')}",
            "path": path_str,
            "reason": "resolution_error",
            "error": str(e)
        })
        raise ValueError(
            f"Invalid path: {path}\n"
            f"Purpose: {purpose}\n"
            f"Error: {e}\n"
            f"Expected: Valid filesystem path"
        )

    # SECURITY LAYER 4: Whitelist validation
    is_in_project = False
    is_in_allowed_temp = False
    is_in_claude_home = False

    # Check if path is in PROJECT_ROOT
    try:
        resolved_path.relative_to(PROJECT_ROOT)
        is_in_project = True
    except ValueError:
        pass

    # Check if path is in ~/.claude/ (Claude Code system directory)
    # This allows plan mode, global CLAUDE.md, and other Claude Code features
    try:
        resolved_path.relative_to(CLAUDE_HOME_DIR.resolve())
        is_in_claude_home = True
    except ValueError:
        pass

    # In test mode, also check system temp directory
    if test_mode:
        try:
            resolved_path.relative_to(SYSTEM_TEMP)
            is_in_allowed_temp = True
        except ValueError:
            pass

    # Validate against whitelist
    if not is_in_project and not is_in_claude_home and not (test_mode and is_in_allowed_temp):
        audit_log("path_validation", "failure", {
            "operation": f"validate_{purpose.replace(' ', '_')}",
            "path": path_str,
            "resolved": str(resolved_path),
            "reason": "outside_whitelist",
            "test_mode": test_mode
        })

        error_msg = f"Path outside allowed locations: {path}\n"
        error_msg += f"Purpose: {purpose}\n"
        error_msg += f"Resolved path: {resolved_path}\n"
        error_msg += f"Allowed locations:\n"
        error_msg += f"  - Project root: {PROJECT_ROOT}\n"
        error_msg += f"  - Claude home: {CLAUDE_HOME_DIR}\n"

        if test_mode:
            error_msg += f"  - System temp: {SYSTEM_TEMP}\n"
            error_msg += f"Test mode uses WHITELIST approach for security.\n"
        else:
            error_msg += f"Production mode requires path within allowed locations.\n"

        error_msg += f"See: docs/SECURITY.md#path-validation"
        raise ValueError(error_msg)

    # Success - log and return
    audit_log("path_validation", "success", {
        "operation": f"validate_{purpose.replace(' ', '_')}",
        "path": path_str,
        "resolved": str(resolved_path),
        "test_mode": test_mode
    })

    return resolved_path


def validate_pytest_path(
    pytest_path: str,
    purpose: str = "pytest execution"
) -> str:
    """Validate pytest path format (test_file.py::test_name).

    Args:
        pytest_path: Pytest path to validate (e.g., "tests/test_foo.py::test_bar")
        purpose: Human-readable description of what this path is for

    Returns:
        Validated pytest path string

    Raises:
        ValueError: If format is invalid or contains suspicious patterns

    Valid Formats:
    - tests/test_security.py
    - tests/test_security.py::test_path_validation
    - tests/test_security.py::TestClass::test_method
    - tests/test_security.py::test_method[param1,param2]

    Security Design:
    ================
    Pytest paths can be used to execute arbitrary Python code if not validated.
    This function uses regex validation to ensure only legitimate pytest paths.

    Pattern: ^[\\w/.-]+\\.py(?:::[\\w\\[\\],_-]+)?$
    - [\\w/.-]+: Alphanumeric, slash, dot, hyphen (file path)
    - \\.py: Must be Python file
    - (?:::[\\w\\[\\],_-]+)?: Optional test specifier with :: prefix
    - [\\w\\[\\],_-]+: Test names with parameters in brackets

    Attack Scenarios Blocked:
    =========================
    - Shell injection: "test.py; rm -rf /" (blocked by regex)
    - Code injection: "test.py::test(); os.system('cmd')" (blocked by regex)
    - Path traversal: "../../etc/test.py" (blocked by .. check)
    """
    # String-level validation
    if not pytest_path or not isinstance(pytest_path, str):
        raise ValueError(
            f"Invalid pytest path: {pytest_path}\n"
            f"Purpose: {purpose}\n"
            f"Expected: Non-empty string\n"
            f"Format: test_file.py or test_file.py::test_name"
        )

    # Reject traversal attempts
    if ".." in pytest_path:
        audit_log("pytest_validation", "failure", {
            "operation": f"validate_{purpose.replace(' ', '_')}",
            "path": pytest_path,
            "reason": "path_traversal_attempt"
        })
        raise ValueError(
            f"Path traversal attempt in pytest path: {pytest_path}\n"
            f"Purpose: {purpose}\n"
            f"Paths containing '..' are not allowed.\n"
            f"Expected: tests/test_file.py or tests/test_file.py::test_name"
        )

    # Validate format with regex
    if not PYTEST_PATH_PATTERN.match(pytest_path):
        audit_log("pytest_validation", "failure", {
            "operation": f"validate_{purpose.replace(' ', '_')}",
            "path": pytest_path,
            "reason": "invalid_format"
        })
        raise ValueError(
            f"Invalid pytest path format: {pytest_path}\n"
            f"Purpose: {purpose}\n"
            f"Expected format:\n"
            f"  - test_file.py\n"
            f"  - test_file.py::test_name\n"
            f"  - test_file.py::TestClass::test_method\n"
            f"  - test_file.py::test_name[param1,param2]\n"
            f"Pattern: alphanumeric, slash, dot, hyphen, underscore only"
        )

    # Extract file path component
    file_path = pytest_path.split("::")[0]

    # Validate file path component against whitelist
    try:
        validate_path(Path(file_path), f"{purpose} (file component)", allow_missing=True)
    except ValueError as e:
        audit_log("pytest_validation", "failure", {
            "operation": f"validate_{purpose.replace(' ', '_')}",
            "path": pytest_path,
            "reason": "file_path_validation_failed",
            "error": str(e)
        })
        raise ValueError(
            f"Pytest file path validation failed: {pytest_path}\n"
            f"Purpose: {purpose}\n"
            f"File path: {file_path}\n"
            f"Error: {e}"
        )

    # Success
    audit_log("pytest_validation", "success", {
        "operation": f"validate_{purpose.replace(' ', '_')}",
        "path": pytest_path
    })

    return pytest_path


def validate_input_length(
    value: str,
    max_length: int,
    field_name: str,
    purpose: str = "input validation"
) -> str:
    """Validate input string length to prevent resource exhaustion.

    Args:
        value: Input string to validate
        max_length: Maximum allowed length
        field_name: Name of the field being validated
        purpose: Human-readable description

    Returns:
        Validated string

    Raises:
        ValueError: If string exceeds max_length

    Security Rationale:
    ===================
    Unbounded string inputs can cause:
    - Memory exhaustion (OOM kills)
    - Log file bloat (disk exhaustion)
    - DoS via resource consumption

    This function enforces reasonable limits on all user inputs.
    """
    if not isinstance(value, str):
        raise ValueError(
            f"Invalid {field_name}: must be string\n"
            f"Purpose: {purpose}\n"
            f"Got: {type(value).__name__}"
        )

    if len(value) > max_length:
        audit_log("input_validation", "failure", {
            "operation": f"validate_{purpose.replace(' ', '_')}",
            "field": field_name,
            "length": len(value),
            "max_length": max_length,
            "reason": "length_exceeded"
        })
        raise ValueError(
            f"Invalid {field_name}: {field_name} too long ({len(value)} characters)\n"
            f"Purpose: {purpose}\n"
            f"Maximum allowed: {max_length} characters\n"
            f"Preview: {value[:100]}..."
        )

    return value


def validate_agent_name(agent_name: str, purpose: str = "agent tracking") -> str:
    """Validate agent name format.

    Args:
        agent_name: Agent name to validate
        purpose: Human-readable description

    Returns:
        Validated agent name

    Raises:
        ValueError: If agent name format is invalid

    Valid Format:
    - 1-255 characters
    - Alphanumeric, hyphen, underscore only
    - No spaces or special characters

    Examples:
    - researcher ✓
    - test-master ✓
    - doc_master ✓
    - security auditor ✗ (space not allowed)
    - researcher; rm -rf / ✗ (semicolon not allowed)
    """
    # Length validation
    validate_input_length(agent_name, 255, "agent_name", purpose)

    # Format validation
    if not agent_name:
        raise ValueError(
            f"Agent name cannot be empty\n"
            f"Purpose: {purpose}\n"
            f"Expected: Non-empty string (e.g., 'researcher', 'test-master')"
        )

    # Alphanumeric + hyphen/underscore only
    if not re.match(r'^[\w-]+$', agent_name):
        audit_log("input_validation", "failure", {
            "operation": f"validate_{purpose.replace(' ', '_')}",
            "field": "agent_name",
            "value": agent_name,
            "reason": "invalid_characters"
        })
        raise ValueError(
            f"Invalid agent name: {agent_name}\n"
            f"Purpose: {purpose}\n"
            f"Allowed characters: alphanumeric, hyphen, underscore\n"
            f"Examples: 'researcher', 'test-master', 'doc_master'"
        )

    return agent_name


def validate_github_issue(issue_number: int, purpose: str = "issue tracking") -> int:
    """Validate GitHub issue number.

    Args:
        issue_number: Issue number to validate
        purpose: Human-readable description

    Returns:
        Validated issue number

    Raises:
        ValueError: If issue number is invalid

    Valid Range: 1 to 999999
    - GitHub issue numbers are typically < 1 million
    - Prevents integer overflow or negative values
    """
    if not isinstance(issue_number, int):
        raise ValueError(
            f"Invalid GitHub issue number: must be integer\n"
            f"Purpose: {purpose}\n"
            f"Got: {type(issue_number).__name__}"
        )

    if issue_number < 1 or issue_number > 999999:
        audit_log("input_validation", "failure", {
            "operation": f"validate_{purpose.replace(' ', '_')}",
            "field": "github_issue",
            "value": issue_number,
            "reason": "out_of_range"
        })
        raise ValueError(
            f"Invalid GitHub issue number: {issue_number}\n"
            f"Purpose: {purpose}\n"
            f"Expected range: 1 to 999999\n"
            f"Provided: {issue_number}"
        )

    return issue_number


# Export all public functions
__all__ = [
    "validate_path",
    "validate_pytest_path",
    "validate_input_length",
    "validate_agent_name",
    "validate_github_issue",
    "audit_log",
    "PROJECT_ROOT",
    "SYSTEM_TEMP",
    "CLAUDE_HOME_DIR",
]