This commit is contained in:
MarkLo 2025-11-25 05:35:54 +08:00
parent 4cf7e808d7
commit 15babc2bea
4 changed files with 57 additions and 20 deletions

View File

@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
""" """
PDF Generation Service for Analyst Reports PDF Generation Service for Analyst Reports
Converts markdown reports to PDF format with Chinese character support Converts markdown reports to PDF format with Chinese character support
@@ -101,7 +102,7 @@ class PDFGenerator:
# Define styles # Define styles
styles = getSampleStyleSheet() styles = getSampleStyleSheet()
# Custom styles with Cactus Classical Serif font # Custom styles with proper spacing and wrapping
title_style = ParagraphStyle( title_style = ParagraphStyle(
'CustomTitle', 'CustomTitle',
parent=styles['Heading1'], parent=styles['Heading1'],
@@ -110,6 +111,7 @@ class PDFGenerator:
textColor=HexColor('#1a1a1a'), textColor=HexColor('#1a1a1a'),
spaceAfter=30, spaceAfter=30,
alignment=TA_CENTER, alignment=TA_CENTER,
wordWrap='CJK',
) )
subtitle_style = ParagraphStyle( subtitle_style = ParagraphStyle(
@@ -118,8 +120,9 @@ class PDFGenerator:
fontName=self.primary_font, fontName=self.primary_font,
fontSize=12, fontSize=12,
textColor=HexColor('#666666'), textColor=HexColor('#666666'),
spaceAfter=20, spaceAfter=12,
alignment=TA_CENTER, alignment=TA_CENTER,
wordWrap='CJK',
) )
heading_style = ParagraphStyle( heading_style = ParagraphStyle(
@@ -129,7 +132,8 @@ class PDFGenerator:
fontSize=16, fontSize=16,
textColor=HexColor('#2c3e50'), textColor=HexColor('#2c3e50'),
spaceAfter=12, spaceAfter=12,
spaceBefore=12, spaceBefore=16,
wordWrap='CJK',
) )
body_style = ParagraphStyle( body_style = ParagraphStyle(
@@ -137,9 +141,11 @@ class PDFGenerator:
parent=styles['Normal'], parent=styles['Normal'],
fontName=self.primary_font, fontName=self.primary_font,
fontSize=10, fontSize=10,
leading=14, leading=16, # Increased from 14 for better readability
textColor=HexColor('#333333'), textColor=HexColor('#333333'),
spaceAfter=8, spaceAfter=10,
wordWrap='CJK',
splitLongWords=True,
) )
# Add title # Add title
@@ -178,11 +184,8 @@ class PDFGenerator:
else: else:
# Regular paragraph - escape HTML chars and handle special characters # Regular paragraph - escape HTML chars and handle special characters
text = self._escape_html(para) text = self._escape_html(para)
try: # Ensure proper UTF-8 handling
elements.append(Paragraph(text, body_style)) elements.append(Paragraph(text, body_style))
except Exception as e:
# If paragraph fails, add as plain text
elements.append(Paragraph(text.encode('ascii', 'xmlcharrefreplace').decode(), body_style))
# Build PDF # Build PDF
doc.build(elements) doc.build(elements)
@@ -206,14 +209,14 @@ class PDFGenerator:
# Remove markdown links but keep text # Remove markdown links but keep text
text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text) text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
# Remove bold/italic markers # Remove bold/italic markers carefully to avoid orphan characters
text = re.sub(r'\*\*([^\*]+)\*\*', r'\1', text) text = re.sub(r'\*\*([^\*]+)\*\*', r'\1', text)
text = re.sub(r'\*([^\*]+)\*', r'\1', text) text = re.sub(r'(?<!\*)\*([^\*]+)\*(?!\*)', r'\1', text) # Avoid double asterisks
text = re.sub(r'__([^_]+)__', r'\1', text) text = re.sub(r'__([^_]+)__', r'\1', text)
text = re.sub(r'_([^_]+)_', r'\1', text) text = re.sub(r'(?<!_)_([^_]+)_(?!_)', r'\1', text) # Avoid double underscores
# Remove code blocks # Remove code blocks
text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL) text = re.sub(r'```[^`]*```', ' ', text, flags=re.DOTALL) # Replace with space not empty
text = re.sub(r'`([^`]+)`', r'\1', text) text = re.sub(r'`([^`]+)`', r'\1', text)
# Clean up bullet points # Clean up bullet points
@@ -222,6 +225,12 @@ class PDFGenerator:
# Remove horizontal rules # Remove horizontal rules
text = re.sub(r'^[\-\*\_]{3,}\s*$', '', text, flags=re.MULTILINE) text = re.sub(r'^[\-\*\_]{3,}\s*$', '', text, flags=re.MULTILINE)
# Remove multiple consecutive spaces
text = re.sub(r' {2,}', ' ', text)
# Remove orphaned single characters that might be markdown artifacts
text = re.sub(r'(?<=[^\w])([*_`~#])(?=[^\w])', '', text)
return text return text
def _escape_html(self, text: str) -> str: def _escape_html(self, text: str) -> str:

View File

@@ -47,10 +47,16 @@ def create_bear_researcher(llm, memory):
# 目標:將每個報告限制在合理的字符數內,總共不超過約 15000 字符(約 20000-30000 tokens) # 目標:將每個報告限制在合理的字符數內,總共不超過約 15000 字符(約 20000-30000 tokens)
def truncate_text(text, max_chars): def truncate_text(text, max_chars):
"""截斷文本到指定字符數""" """智能截斷文本到指定字符數,在句子邊界處截斷"""
if len(text) <= max_chars: if len(text) <= max_chars:
return text return text
return text[:max_chars] + "\n...(內容已截斷)"
truncated = text[:max_chars]
for delimiter in ['。', '\n', ',', '、', ' ']:
last_pos = truncated.rfind(delimiter)
if last_pos > max_chars * 0.8:
return text[:last_pos + 1] + "\n\n...(為控制長度已精簡)"
return truncated + "...(為控制長度已精簡)"
# 為每個報告設置合理的字符限制 # 為每個報告設置合理的字符限制
# 模型 gpt-4.1-mini 的限制是 8192 tokens # 模型 gpt-4.1-mini 的限制是 8192 tokens

View File

@@ -47,10 +47,21 @@ def create_bull_researcher(llm, memory):
# 目標:將每個報告限制在合理的字符數內,總共不超過約 15000 字符(約 20000-30000 tokens) # 目標:將每個報告限制在合理的字符數內,總共不超過約 15000 字符(約 20000-30000 tokens)
def truncate_text(text, max_chars): def truncate_text(text, max_chars):
"""截斷文本到指定字符數""" """智能截斷文本到指定字符數,在句子邊界處截斷"""
if len(text) <= max_chars: if len(text) <= max_chars:
return text return text
return text[:max_chars] + "\n...(內容已截斷)"
# 在max_chars附近尋找句子結束標記
truncated = text[:max_chars]
# 尋找最後一個句號、換行或逗號
for delimiter in ['。', '\n', ',', '、', ' ']:
last_pos = truncated.rfind(delimiter)
if last_pos > max_chars * 0.8: # 至少保留80%的內容
return text[:last_pos + 1] + "\n\n...(為控制長度已精簡)"
# 如果找不到合適的分隔符,直接在字符處截斷
return truncated + "...(為控制長度已精簡)"
# 為每個報告設置合理的字符限制 # 為每個報告設置合理的字符限制
# 模型 gpt-4.1-mini 的限制是 8192 tokens # 模型 gpt-4.1-mini 的限制是 8192 tokens

View File

@@ -41,10 +41,21 @@ def create_trader(llm, memory):
# 定義文本截斷函數以避免超過 token 限制 # 定義文本截斷函數以避免超過 token 限制
def truncate_text(text, max_chars): def truncate_text(text, max_chars):
"""截斷文本到指定字符數""" """智能截斷文本到指定字符數,在句子邊界處截斷"""
if len(text) <= max_chars: if len(text) <= max_chars:
return text return text
return text[:max_chars] + "\n...(內容已截斷)"
# 在max_chars附近尋找句子結束標記
truncated = text[:max_chars]
# 尋找最後一個句號、換行或逗號
for delimiter in ['。', '\n', ',', '、', ' ']:
last_pos = truncated.rfind(delimiter)
if last_pos > max_chars * 0.8: # 至少保留80%的內容
return text[:last_pos + 1] + "\n\n...(為控制長度已精簡)"
# 如果找不到合適的分隔符,直接在字符處截斷
return truncated + "...(為控制長度已精簡)"
# 截斷各類報告以控制 token 使用量 # 截斷各類報告以控制 token 使用量
# 這些報告將用於記憶檢索(embedding)和 LLM prompt # 這些報告將用於記憶檢索(embedding)和 LLM prompt