TradingAgents/backend/app/services/pdf_generator.py

273 lines
9.6 KiB
Python

# -*- coding: utf-8 -*-
"""
PDF Generation Service for Analyst Reports
Converts markdown reports to PDF format with Chinese character support
"""
import io
import re
from typing import Optional
from datetime import datetime
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import cm
from reportlab.lib.enums import TA_LEFT, TA_CENTER
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib.colors import HexColor
import markdown
class PDFGenerator:
    """Generate PDF reports from markdown content.

    On construction a font is registered with ReportLab: the bundled
    Cactus Classical Serif TrueType font when its file can be found and
    loaded, otherwise one of ReportLab's CID fonts, otherwise plain
    Helvetica. The chosen font name is exposed as ``primary_font`` and used
    by every paragraph style.
    """

    # Name under which the bundled TrueType font is registered.
    _CUSTOM_FONT_NAME = 'CactusClassicalSerif'
    # CID fonts tried, in order, when the bundled font is unavailable.
    _CID_FALLBACK_FONTS = ('STSong-Light', 'STHeiti-Light', 'MSung-Light')
    # ATX-style markdown heading: 1-6 hashes, whitespace, then the text.
    _HEADING_RE = re.compile(r'#{1,6}\s+(.+)')

    def __init__(self):
        """Initialize PDF generator with Chinese font support.

        Sets three attributes:
            custom_font: registered name of the bundled font, or None.
            chinese_font: font used for CJK text (custom font, a CID font,
                or 'Helvetica' as a last resort).
            primary_font: the font name used by all paragraph styles.
        """
        import os

        self.custom_font = None

        # backend/app/services/pdf_generator.py -> services -> app -> backend -> root
        current_file = os.path.abspath(__file__)
        root_dir = os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.dirname(current_file)))
        )
        custom_font_path = os.path.join(
            root_dir,
            'Cactus_Classical_Serif',
            'CactusClassicalSerif-Regular.ttf',
        )

        print(f"Attempting to load font from: {custom_font_path}")
        if os.path.exists(custom_font_path):
            self._register_custom_font(custom_font_path)
        else:
            print(f"Custom font file not found at {custom_font_path}")

        # Fall back to the current working directory whenever the primary
        # location did not yield a usable font (file missing OR unreadable).
        if not self.custom_font:
            cwd_path = os.path.join(
                os.getcwd(),
                'Cactus_Classical_Serif',
                'CactusClassicalSerif-Regular.ttf',
            )
            if cwd_path != custom_font_path and os.path.exists(cwd_path):
                self._register_custom_font(cwd_path, " from CWD")

        if self.custom_font:
            self.chinese_font = self.custom_font
        else:
            self.chinese_font = self._register_cid_fallback()

        # Set primary font: use custom font if available, otherwise Chinese font
        self.primary_font = self.custom_font or self.chinese_font

    def _register_custom_font(self, font_path: str, origin: str = "") -> bool:
        """Try to register the bundled TrueType font found at *font_path*.

        Args:
            font_path: Path to the ``.ttf`` file.
            origin: Suffix appended to the log messages (e.g. " from CWD").

        Returns:
            True when the font was registered, False otherwise.
        """
        try:
            pdfmetrics.registerFont(TTFont(self._CUSTOM_FONT_NAME, font_path))
        except Exception as e:
            # Best effort: a broken font file must not prevent PDF generation.
            print(f"Error registering custom font{origin}: {e}")
            return False
        self.custom_font = self._CUSTOM_FONT_NAME
        print(f"Successfully registered Cactus Classical Serif font{origin}")
        return True

    def _register_cid_fallback(self) -> str:
        """Register the first available CID font and return its name.

        Returns:
            The registered CID font name, or 'Helvetica' when none of the
            candidates could be registered.
        """
        from reportlab.pdfbase.cidfonts import UnicodeCIDFont

        for font_name in self._CID_FALLBACK_FONTS:
            try:
                pdfmetrics.registerFont(UnicodeCIDFont(font_name))
            except Exception:
                # Unknown/unavailable CID font: try the next candidate.
                continue
            if font_name == 'STSong-Light':
                print("Registered STSong-Light as fallback (Warning: May not be visible in all viewers)")
            return font_name

        print("Warning: Could not register Chinese font")
        return 'Helvetica'

    def _build_styles(self) -> dict:
        """Return the paragraph styles keyed by 'title', 'subtitle', 'heading', 'body'."""
        base = getSampleStyleSheet()
        return {
            'title': ParagraphStyle(
                'CustomTitle',
                parent=base['Heading1'],
                fontName=self.primary_font,
                fontSize=24,
                textColor=HexColor('#1a1a1a'),
                spaceAfter=30,
                alignment=TA_CENTER,
                wordWrap='CJK',
            ),
            'subtitle': ParagraphStyle(
                'CustomSubtitle',
                parent=base['Normal'],
                fontName=self.primary_font,
                fontSize=12,
                textColor=HexColor('#666666'),
                spaceAfter=12,
                alignment=TA_CENTER,
                wordWrap='CJK',
            ),
            'heading': ParagraphStyle(
                'CustomHeading',
                parent=base['Heading2'],
                fontName=self.primary_font,
                fontSize=16,
                textColor=HexColor('#2c3e50'),
                spaceAfter=12,
                spaceBefore=16,
                wordWrap='CJK',
            ),
            'body': ParagraphStyle(
                'CustomBody',
                parent=base['Normal'],
                fontName=self.primary_font,
                fontSize=10,
                leading=16,  # Increased from 14 for better readability
                textColor=HexColor('#333333'),
                spaceAfter=10,
                wordWrap='CJK',
                splitLongWords=True,
            ),
        }

    def generate_analyst_report_pdf(
        self,
        analyst_name: str,
        ticker: str,
        analysis_date: str,
        report_content: str,
    ) -> bytes:
        """Generate a PDF from analyst report content.

        Args:
            analyst_name: Name of the analyst (used as the document title).
            ticker: Stock ticker symbol.
            analysis_date: Date of analysis.
            report_content: Markdown formatted report content; ``None`` or
                an empty string produces a PDF with only title and metadata.

        Returns:
            PDF file content as bytes.
        """
        styles = self._build_styles()

        # Every string handed to Paragraph is escaped: ReportLab parses
        # paragraph text as XML-like markup, so a raw '<' or '&' in a title,
        # ticker or heading would otherwise be misread as markup.
        elements = [
            Paragraph(self._escape_html(f"{analyst_name}"), styles['title']),
            Spacer(1, 0.3*cm),
            Paragraph(self._escape_html(f"{ticker} | {analysis_date}"), styles['subtitle']),
            Spacer(1, 0.5*cm),
        ]

        content = self._clean_markdown(report_content or "")
        for line in content.split('\n'):
            line = line.strip()
            if not line:
                # Blank source line -> small vertical gap.
                elements.append(Spacer(1, 0.2*cm))
                continue

            heading = self._HEADING_RE.fullmatch(line)
            if heading:
                # All heading levels share one style.
                elements.append(
                    Paragraph(self._escape_html(heading.group(1)), styles['heading'])
                )
            else:
                elements.append(Paragraph(self._escape_html(line), styles['body']))

        # The context manager releases the buffer even if build() raises.
        with io.BytesIO() as buffer:
            doc = SimpleDocTemplate(
                buffer,
                pagesize=A4,
                rightMargin=2*cm,
                leftMargin=2*cm,
                topMargin=2*cm,
                bottomMargin=2*cm,
            )
            doc.build(elements)
            return buffer.getvalue()

    def _clean_markdown(self, text: str) -> str:
        """Strip inline markdown formatting, keeping heading markers.

        Links are reduced to their label, emphasis/code markers are removed,
        fenced code blocks are dropped, and bullet prefixes and horizontal
        rules are deleted. Leading ``#`` heading markers are left in place
        so the caller can still recognise headings.

        Args:
            text: Markdown text (``None``/empty yields an empty string).

        Returns:
            Cleaned text.
        """
        if not text:
            return ""

        # Remove markdown links but keep text
        text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)

        # Remove bold/italic markers carefully to avoid orphan characters
        text = re.sub(r'\*\*([^\*]+)\*\*', r'\1', text)
        text = re.sub(r'(?<!\*)\*([^\*]+)\*(?!\*)', r'\1', text)  # Avoid double asterisks
        text = re.sub(r'__([^_]+)__', r'\1', text)
        # Underscore emphasis only counts when the underscores are not
        # glued to word characters; this keeps snake_case identifiers
        # such as "close_50_sma" intact.
        text = re.sub(r'(?<!\w)_([^_]+)_(?!\w)', r'\1', text)

        # Remove code blocks
        text = re.sub(r'```[^`]*```', ' ', text, flags=re.DOTALL)  # Replace with space not empty
        text = re.sub(r'`([^`]+)`', r'\1', text)

        # Clean up bullet points
        text = re.sub(r'^\s*[\*\-\+]\s+', '', text, flags=re.MULTILINE)

        # Remove horizontal rules
        text = re.sub(r'^[\-\*\_]{3,}\s*$', '', text, flags=re.MULTILINE)

        # Remove orphaned single characters that might be markdown artifacts.
        # '#' is deliberately NOT in this set: stripping it would delete the
        # heading markers that heading detection relies on.
        text = re.sub(r'(?<=[^\w])([*_`~])(?=[^\w])', '', text)

        # Collapse runs of spaces last so gaps left by the removals above
        # are tidied as well.
        text = re.sub(r' {2,}', ' ', text)

        return text

    def _escape_html(self, text: str) -> str:
        """Escape ``&``, ``<`` and ``>`` so text is safe inside a Paragraph.

        Args:
            text: Text to escape.

        Returns:
            Escaped text.
        """
        # '&' must be replaced first, otherwise the entities produced for
        # '<' and '>' would themselves be re-escaped.
        text = text.replace('&', '&amp;')
        text = text.replace('<', '&lt;')
        text = text.replace('>', '&gt;')
        return text