# TradingAgents/backend/app/services/pdf_generator.py

# -*- coding: utf-8 -*-
"""
PDF Generation Service for Analyst Reports
Converts markdown reports to PDF format with Chinese character support
"""
import io
import re
from typing import Optional
from datetime import datetime
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import cm
from reportlab.lib.enums import TA_LEFT, TA_CENTER
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib.colors import HexColor
import markdown


class PDFGenerator:
    """Generate PDF reports from markdown content"""

    # Emoji -> replacement-text table used by _replace_emojis() before the
    # report text is laid out.
    # Despite the attribute name, every value below is plain ASCII or the
    # empty string (an empty string removes the emoji entirely).
    # The original author's rationale: the STSong-Light font has issues with
    # certain Unicode symbols, so only ASCII replacements are used.
    # NOTE(review): some keys (e.g. the warning sign and the arrows) appear to
    # include a trailing U+FE0F variation selector - confirm when editing, as
    # the lookup is an exact substring match.
    EMOJI_TO_UNICODE = {
        # Status & indicators - ASCII only
        '✅': '[OK]',
        '❌': '[X]',
        '⚠️': '[!]',
        '⚡': '*',
        '🔔': 'o',

        # Rating & quality - ASCII only
        '⭐': '*',
        '🌟': '*',
        '💎': '+',
        '🏆': '#',

        # Charts & analytics - ASCII or removed
        '📊': '',
        '📈': '^',
        '📉': 'v',
        '📋': '-',
        '📌': '*',

        # Money & business - ASCII currency letters
        '💰': '$',
        '💵': '$',
        '💴': 'Y',  # Japanese yen
        '💶': 'E',  # Euro
        '💷': 'P',  # British pound
        '💸': '$',
        '💹': '^',

        # Direction & movement - ASCII arrows
        '🚀': '^^',
        '⬆️': '^',
        '⬇️': 'v',
        '➡️': '>',
        '⬅️': '<',
        '🔼': '^',
        '🔽': 'v',

        # Symbols - ASCII only
        '🎯': 'o',
        '🔥': '*',
        '💡': '*',
        '⚙️': '*',
        '🔧': '>',
        '🔨': '>',

        # AI & tech - removed
        '🤖': '',
        '💻': '',
        '📱': '',
        '🖥️': '',

        # People & roles - removed
        '👤': '',
        '👥': '',
        '🔬': '',
        '📚': '',

        # Time - simple ASCII
        '⏰': 'o',
        '📅': '-',
        '⏱️': 'o',

        # Other common emojis - ASCII or removed
        '✨': '*',
        '🎨': '',
        '📝': '-',
        '📄': '-',
        '🗂️': '=',
        '🌐': 'o',
        '🔗': '~',
        '💼': '',
    }
    """Generate PDF reports from markdown content"""

    def __init__(self):
        """Register a font for Chinese text and remember its name.

        Candidates are tried in order; the first that registers wins:

        1. ReportLab CID fonts, registered by name only:
           ``STSong-Light``, ``MSung-Light``, ``STSongStd-Light``.
        2. The Arial Unicode TrueType file at a fixed
           ``/System/Library/Fonts/Supplemental`` path.
        3. ``Helvetica`` as the last resort (limited Chinese support).

        The selected font name is stored in ``custom_font``,
        ``chinese_font`` and ``primary_font``. Progress and failures are
        reported with ``print``.
        """
        import os
        from reportlab.pdfbase.cidfonts import UnicodeCIDFont

        # (font name, success message) in priority order.
        cid_candidates = (
            ('STSong-Light',
             "✅ Using STSong-Light CID font - Perfect Chinese character spacing"),
            ('MSung-Light',
             "✅ Using MSung-Light CID font - Perfect Traditional Chinese spacing"),
            ('STSongStd-Light',
             "✅ Using STSongStd-Light CID font"),
        )

        selected_font = None

        # Method 1: built-in CID fonts.
        for font_name, success_message in cid_candidates:
            try:
                pdfmetrics.registerFont(UnicodeCIDFont(font_name))
            except Exception:
                # Deliberately `Exception`, not a bare `except`, so that
                # KeyboardInterrupt / SystemExit are never swallowed here.
                continue
            selected_font = font_name
            print(success_message)
            break

        # Method 2: TrueType fallback when no CID font could be registered.
        if selected_font is None:
            print("⚠️  CID fonts not available, trying TTF fonts...")
            arial_unicode_path = '/System/Library/Fonts/Supplemental/Arial Unicode.ttf'
            failure = None
            if os.path.exists(arial_unicode_path):
                try:
                    pdfmetrics.registerFont(TTFont('ArialUnicode', arial_unicode_path))
                except Exception as e:
                    failure = e
                else:
                    selected_font = 'ArialUnicode'
                    print("✅ Using Arial Unicode MS (TTF) - Good Chinese support")
            else:
                failure = "Arial Unicode not found"

            # Method 3: final fallback to a built-in Latin font.
            if selected_font is None:
                print(f"❌ Font registration failed: {failure}")
                print("⚠️  Using Helvetica (limited Chinese character support)")
                selected_font = 'Helvetica'

        # All three attributes are kept for backward compatibility; they
        # always hold the same font name.
        self.custom_font = selected_font
        self.chinese_font = selected_font
        self.primary_font = selected_font

    def generate_analyst_report_pdf(
        self,
        analyst_name: str,
        ticker: str,
        analysis_date: str,
        report_content: str,
    ) -> bytes:
        """
        Generate a PDF from analyst report content

        The document is an A4 page with 1.5 cm margins containing a centred
        title (the analyst name), a "ticker | date" subtitle, and the report
        body. Markdown headings of level 1-6 (``#`` .. ``######``) all use
        the same heading style; every other non-empty line becomes a body
        paragraph and every empty line a small vertical spacer.

        All text handed to ``Paragraph`` is passed through
        ``_escape_html`` first, because ``Paragraph`` interprets its input
        as markup; this covers the title, the subtitle and headings as well
        as body text.

        Args:
            analyst_name: Name of the analyst
            ticker: Stock ticker symbol
            analysis_date: Date of analysis
            report_content: Markdown formatted report content

        Returns:
            PDF file content as bytes
        """
        buffer = io.BytesIO()
        try:
            # Create PDF document with reduced margins for more content space
            doc = SimpleDocTemplate(
                buffer,
                pagesize=A4,
                rightMargin=1.5*cm,
                leftMargin=1.5*cm,
                topMargin=1.5*cm,
                bottomMargin=1.5*cm,
            )

            styles = getSampleStyleSheet()

            # Custom styles with proper spacing and wrapping
            title_style = ParagraphStyle(
                'CustomTitle',
                parent=styles['Heading1'],
                fontName=self.primary_font,
                fontSize=24,
                textColor=HexColor('#1a1a1a'),
                spaceAfter=30,
                alignment=TA_CENTER,
                wordWrap='CJK',
            )

            subtitle_style = ParagraphStyle(
                'CustomSubtitle',
                parent=styles['Normal'],
                fontName=self.primary_font,
                fontSize=12,
                textColor=HexColor('#666666'),
                spaceAfter=12,
                alignment=TA_CENTER,
                wordWrap='CJK',
            )

            heading_style = ParagraphStyle(
                'CustomHeading',
                parent=styles['Heading2'],
                fontName=self.primary_font,
                fontSize=16,
                textColor=HexColor('#2c3e50'),
                spaceAfter=12,
                spaceBefore=16,
                wordWrap='CJK',
            )

            body_style = ParagraphStyle(
                'CustomBody',
                parent=styles['Normal'],
                fontName=self.primary_font,
                fontSize=9,
                leading=14,
                textColor=HexColor('#333333'),
                spaceAfter=8,
                wordWrap='CJK',
                splitLongWords=True,
                allowOrphans=0,
                allowWidows=0,
            )

            # Replace emojis BEFORE the title is built; previously the
            # replacement ran after the title Paragraph already existed and
            # therefore never reached the PDF.
            title = self._escape_html(self._replace_emojis(f"{analyst_name}"))
            metadata = self._escape_html(f"{ticker} | {analysis_date}")

            # Container for the 'Flowable' objects
            elements = [
                Paragraph(title, title_style),
                Spacer(1, 0.3*cm),
                Paragraph(metadata, subtitle_style),
                Spacer(1, 0.5*cm),
            ]

            # STEP 1: replace emojis BEFORE markdown cleaning
            # (a missing report is treated as an empty one).
            report_content = self._replace_emojis(report_content or "")

            # STEP 2: clean markdown formatting
            content = self._clean_markdown(report_content)

            # One to six '#' followed by whitespace marks a heading line.
            heading_pattern = re.compile(r'#{1,6}\s+(.+)')

            for para in content.split('\n'):
                para = para.strip()
                if not para:
                    elements.append(Spacer(1, 0.2*cm))
                    continue

                heading = heading_pattern.match(para)
                if heading:
                    # Headings are escaped exactly like body text so that
                    # characters such as '<' or '&' cannot break the markup.
                    text = self._escape_html(heading.group(1))
                    elements.append(Paragraph(text, heading_style))
                else:
                    text = self._escape_html(para)
                    elements.append(Paragraph(text, body_style))

            # Build PDF
            doc.build(elements)

            return buffer.getvalue()
        finally:
            # Release the buffer even when building the document fails.
            buffer.close()

    def _clean_markdown(self, text: str) -> str:
        """
        Strip markdown formatting so the text can be laid out as plain lines.

        Heading markers (``#``) are intentionally left in place; the caller
        uses them to pick the heading style.

        Processing order matters and is chosen so that one rule cannot
        corrupt the input of another:

        * fenced code blocks are dropped first, so emphasis rules never
          match across a fence;
        * list bullets are converted before emphasis is stripped, so the
          ``*`` of a ``* item`` line is never mistaken for an italic marker;
        * emphasis rules are confined to a single line, and ``_`` / ``__``
          are ignored inside words (``snake_case`` and ``BRK_B`` survive);
        * all line-anchored rules use ``[ \\t]`` instead of ``\\s`` so they
          cannot swallow neighbouring blank lines.

        Args:
            text: Markdown text (``None`` or empty yields an empty string)

        Returns:
            Cleaned text with leading/trailing whitespace removed
        """
        import unicodedata

        if not text:
            return ''

        per_line = re.MULTILINE

        # 0. Normalize Unicode and line endings.
        # NOTE(review): NFKC also folds full-width punctuation to ASCII;
        # kept as in the original implementation.
        text = unicodedata.normalize('NFKC', text)
        text = text.replace('\r\n', '\n').replace('\r', '\n')

        # 1. Remove fenced code blocks (content included).
        text = re.sub(r'```[^`]*?```', '', text)

        # 2. Remove markdown links but keep the link text.
        text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)

        # 3. Remove horizontal rules.
        text = re.sub(r'^[ \t]*[\-\*_]{3,}[ \t]*$', '', text, flags=per_line)

        # 4. Remove table delimiter rows, with any number of columns
        #    (e.g. "|---|:---:|").
        text = re.sub(
            r'^[ \t]*\|?[ \t]*:?-+:?[ \t]*(?:\|[ \t]*:?-+:?[ \t]*)*\|?[ \t]*$',
            '',
            text,
            flags=per_line,
        )

        # 5. Normalize bullet markers.
        text = re.sub(r'^[ \t]*[\*\-\+][ \t]+', '• ', text, flags=per_line)

        # 6. Remove bold markers.
        text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
        text = re.sub(r'(?<!\w)__(?=\S)(.+?)(?<=\S)__(?!\w)', r'\1', text)

        # 7. Remove italic markers. The opening marker must be followed by,
        #    and the closing marker preceded by, a non-space character, so
        #    "a * b * c" is left alone.
        text = re.sub(r'(?<!\*)\*(?=\S)([^\*\n]+?)(?<=\S)\*(?!\*)', r'\1', text)
        text = re.sub(r'(?<!\w)_(?=\S)([^_\n]+?)(?<=\S)_(?!\w)', r'\1', text)

        # 8. Unwrap inline code.
        text = re.sub(r'`([^`]+?)`', r'\1', text)

        # 9. Remove outer table pipes, space out the inner ones.
        text = re.sub(r'^[ \t]*\|', '', text, flags=per_line)
        text = re.sub(r'\|[ \t]*$', '', text, flags=per_line)
        text = re.sub(r'\|', ' | ', text)

        # 10. Collapse runs of spaces.
        text = re.sub(r' {2,}', ' ', text)

        # 11. Remove lines that consist only of markdown symbols.
        text = re.sub(r'^[\*_`~#\-\+]+[ \t]*$', '', text, flags=per_line)

        # 12. Drop trailing whitespace, then collapse excess blank lines.
        text = re.sub(r'[ \t]+$', '', text, flags=per_line)
        text = re.sub(r'\n{3,}', '\n\n', text)

        return text.strip()

    def _escape_html(self, text: str) -> str:
        """
        Replace markup-significant characters with named entities.

        ``&``, ``<``, ``>``, ``"`` and ``'`` become ``&amp;``, ``&lt;``,
        ``&gt;``, ``&quot;`` and ``&apos;``. Each input character is mapped
        exactly once, so an ampersand introduced by one substitution is
        never escaped a second time.

        Args:
            text: Text to escape

        Returns:
            Escaped text
        """
        entity_table = str.maketrans({
            '&': '&amp;',
            '<': '&lt;',
            '>': '&gt;',
            '"': '&quot;',
            "'": '&apos;',
        })
        return text.translate(entity_table)

    def _replace_emojis(self, text: str) -> str:
        """
        Replace emoji characters with their ``EMOJI_TO_UNICODE`` substitutes.

        The substitutes in that table are short ASCII strings (or an empty
        string, which removes the emoji).

        Matching ignores emoji/text variation selectors (U+FE0F / U+FE0E):
        they are removed from the text and from the table keys before
        comparing. The same emoji is therefore replaced whether or not it
        carries a selector, and no orphaned selector is left behind.

        Args:
            text: Text containing potential emoji characters

        Returns:
            Text with known emojis replaced; falsy input (``None`` or ``''``)
            is returned unchanged
        """
        if not text:
            return text

        # Deletes both variation selectors in a single pass.
        drop_selectors = {0xFE0F: None, 0xFE0E: None}
        text = text.translate(drop_selectors)

        for emoji, replacement in self.EMOJI_TO_UNICODE.items():
            base_emoji = emoji.translate(drop_selectors)
            # Guard: replacing the empty string would insert the
            # replacement between every character.
            if base_emoji:
                text = text.replace(base_emoji, replacement)

        return text