This commit is contained in:
MarkLo 2025-11-25 05:35:54 +08:00
parent 4cf7e808d7
commit 15babc2bea
4 changed files with 57 additions and 20 deletions

View File

@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
"""
PDF Generation Service for Analyst Reports
Converts markdown reports to PDF format with Chinese character support
@ -101,7 +102,7 @@ class PDFGenerator:
# Define styles
styles = getSampleStyleSheet()
# Custom styles with Cactus Classical Serif font
# Custom styles with proper spacing and wrapping
title_style = ParagraphStyle(
'CustomTitle',
parent=styles['Heading1'],
@ -110,6 +111,7 @@ class PDFGenerator:
textColor=HexColor('#1a1a1a'),
spaceAfter=30,
alignment=TA_CENTER,
wordWrap='CJK',
)
subtitle_style = ParagraphStyle(
@ -118,8 +120,9 @@ class PDFGenerator:
fontName=self.primary_font,
fontSize=12,
textColor=HexColor('#666666'),
spaceAfter=20,
spaceAfter=12,
alignment=TA_CENTER,
wordWrap='CJK',
)
heading_style = ParagraphStyle(
@ -129,7 +132,8 @@ class PDFGenerator:
fontSize=16,
textColor=HexColor('#2c3e50'),
spaceAfter=12,
spaceBefore=12,
spaceBefore=16,
wordWrap='CJK',
)
body_style = ParagraphStyle(
@ -137,9 +141,11 @@ class PDFGenerator:
parent=styles['Normal'],
fontName=self.primary_font,
fontSize=10,
leading=14,
leading=16, # Increased from 14 for better readability
textColor=HexColor('#333333'),
spaceAfter=8,
spaceAfter=10,
wordWrap='CJK',
splitLongWords=True,
)
# Add title
@ -178,11 +184,8 @@ class PDFGenerator:
else:
# Regular paragraph - escape HTML chars and handle special characters
text = self._escape_html(para)
try:
elements.append(Paragraph(text, body_style))
except Exception as e:
# If paragraph fails, add as plain text
elements.append(Paragraph(text.encode('ascii', 'xmlcharrefreplace').decode(), body_style))
# Ensure proper UTF-8 handling
elements.append(Paragraph(text, body_style))
# Build PDF
doc.build(elements)
@ -206,14 +209,14 @@ class PDFGenerator:
# Remove markdown links but keep text
text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
# Remove bold/italic markers
# Remove bold/italic markers carefully to avoid orphan characters
text = re.sub(r'\*\*([^\*]+)\*\*', r'\1', text)
text = re.sub(r'\*([^\*]+)\*', r'\1', text)
text = re.sub(r'(?<!\*)\*([^\*]+)\*(?!\*)', r'\1', text) # Avoid double asterisks
text = re.sub(r'__([^_]+)__', r'\1', text)
text = re.sub(r'_([^_]+)_', r'\1', text)
text = re.sub(r'(?<!_)_([^_]+)_(?!_)', r'\1', text) # Avoid double underscores
# Remove code blocks
text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)
text = re.sub(r'```[^`]*```', ' ', text, flags=re.DOTALL) # Replace with space not empty
text = re.sub(r'`([^`]+)`', r'\1', text)
# Clean up bullet points
@ -222,6 +225,12 @@ class PDFGenerator:
# Remove horizontal rules
text = re.sub(r'^[\-\*\_]{3,}\s*$', '', text, flags=re.MULTILINE)
# Remove multiple consecutive spaces
text = re.sub(r' {2,}', ' ', text)
# Remove orphaned single characters that might be markdown artifacts
text = re.sub(r'(?<=[^\w])([*_`~#])(?=[^\w])', '', text)
return text
def _escape_html(self, text: str) -> str:

View File

@ -47,10 +47,16 @@ def create_bear_researcher(llm, memory):
# 目標:將每個報告限制在合理的字符數內,總共不超過約 15000 字符(約 20000-30000 tokens)
def truncate_text(text, max_chars):
    """Truncate ``text`` to about ``max_chars`` characters, preferring a sentence boundary.

    Args:
        text: The string to shorten.
        max_chars: Maximum number of characters to keep before the marker.

    Returns:
        ``text`` unchanged when it already fits. Otherwise the text cut just
        after the last delimiter found in the final 20% of the allowed
        window, followed by a truncation marker; if no delimiter lies in
        that range, a hard cut at ``max_chars`` followed by the marker.
    """
    if len(text) <= max_chars:
        return text

    truncated = text[:max_chars]
    # Delimiters are tried in priority order; the first one whose last
    # occurrence lies beyond 80% of the window wins.
    # NOTE(review): the CJK punctuation here was reconstructed; the previous
    # list held empty strings, and ``rfind('')`` always returns
    # ``len(truncated)``, so the boundary search never ran and one extra
    # character was kept. Confirm the exact characters against history.
    for delimiter in ('。', '\n', '!', '?', ' '):
        last_pos = truncated.rfind(delimiter)
        if last_pos > max_chars * 0.8:  # keep at least 80% of the content
            return text[:last_pos + 1] + "\n\n...(為控制長度已精簡)"
    # No suitable boundary: cut at the character limit.
    return truncated + "...(為控制長度已精簡)"
# 為每個報告設置合理的字符限制
# 模型 gpt-4.1-mini 的限制是 8192 tokens

View File

@ -47,10 +47,21 @@ def create_bull_researcher(llm, memory):
# 目標:將每個報告限制在合理的字符數內,總共不超過約 15000 字符(約 20000-30000 tokens)
def truncate_text(text, max_chars):
    """Truncate ``text`` to about ``max_chars`` characters, preferring a sentence boundary.

    Args:
        text: The string to shorten.
        max_chars: Maximum number of characters to keep before the marker.

    Returns:
        ``text`` unchanged when it already fits. Otherwise the text cut just
        after the last delimiter found in the final 20% of the allowed
        window, followed by a truncation marker; if no delimiter lies in
        that range, a hard cut at ``max_chars`` followed by the marker.
    """
    if len(text) <= max_chars:
        return text

    # Look for a sentence-ending marker near max_chars.
    truncated = text[:max_chars]
    # Delimiters are tried in priority order; the first one whose last
    # occurrence lies beyond 80% of the window wins.
    # NOTE(review): the CJK punctuation here was reconstructed; the previous
    # list held empty strings, and ``rfind('')`` always returns
    # ``len(truncated)``, so the boundary search never ran and one extra
    # character was kept. Confirm the exact characters against history.
    for delimiter in ('。', '\n', '!', '?', ' '):
        last_pos = truncated.rfind(delimiter)
        if last_pos > max_chars * 0.8:  # keep at least 80% of the content
            return text[:last_pos + 1] + "\n\n...(為控制長度已精簡)"
    # No suitable delimiter found: cut at the character limit.
    return truncated + "...(為控制長度已精簡)"
# 為每個報告設置合理的字符限制
# 模型 gpt-4.1-mini 的限制是 8192 tokens

View File

@ -41,10 +41,21 @@ def create_trader(llm, memory):
# 定義文本截斷函數以避免超過 token 限制
def truncate_text(text, max_chars):
    """Truncate ``text`` to about ``max_chars`` characters, preferring a sentence boundary.

    Args:
        text: The string to shorten.
        max_chars: Maximum number of characters to keep before the marker.

    Returns:
        ``text`` unchanged when it already fits. Otherwise the text cut just
        after the last delimiter found in the final 20% of the allowed
        window, followed by a truncation marker; if no delimiter lies in
        that range, a hard cut at ``max_chars`` followed by the marker.
    """
    if len(text) <= max_chars:
        return text

    # Look for a sentence-ending marker near max_chars.
    truncated = text[:max_chars]
    # Delimiters are tried in priority order; the first one whose last
    # occurrence lies beyond 80% of the window wins.
    # NOTE(review): the CJK punctuation here was reconstructed; the previous
    # list held empty strings, and ``rfind('')`` always returns
    # ``len(truncated)``, so the boundary search never ran and one extra
    # character was kept. Confirm the exact characters against history.
    for delimiter in ('。', '\n', '!', '?', ' '):
        last_pos = truncated.rfind(delimiter)
        if last_pos > max_chars * 0.8:  # keep at least 80% of the content
            return text[:last_pos + 1] + "\n\n...(為控制長度已精簡)"
    # No suitable delimiter found: cut at the character limit.
    return truncated + "...(為控制長度已精簡)"
# 截斷各類報告以控制 token 使用量
# 這些報告將用於記憶檢索embedding和 LLM prompt