This commit is contained in:
parent
4cf7e808d7
commit
15babc2bea
|
|
@ -1,3 +1,4 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
PDF Generation Service for Analyst Reports
|
||||
Converts markdown reports to PDF format with Chinese character support
|
||||
|
|
@ -101,7 +102,7 @@ class PDFGenerator:
|
|||
# Define styles
|
||||
styles = getSampleStyleSheet()
|
||||
|
||||
# Custom styles with Cactus Classical Serif font
|
||||
# Custom styles with proper spacing and wrapping
|
||||
title_style = ParagraphStyle(
|
||||
'CustomTitle',
|
||||
parent=styles['Heading1'],
|
||||
|
|
@ -110,6 +111,7 @@ class PDFGenerator:
|
|||
textColor=HexColor('#1a1a1a'),
|
||||
spaceAfter=30,
|
||||
alignment=TA_CENTER,
|
||||
wordWrap='CJK',
|
||||
)
|
||||
|
||||
subtitle_style = ParagraphStyle(
|
||||
|
|
@ -118,8 +120,9 @@ class PDFGenerator:
|
|||
fontName=self.primary_font,
|
||||
fontSize=12,
|
||||
textColor=HexColor('#666666'),
|
||||
spaceAfter=20,
|
||||
spaceAfter=12,
|
||||
alignment=TA_CENTER,
|
||||
wordWrap='CJK',
|
||||
)
|
||||
|
||||
heading_style = ParagraphStyle(
|
||||
|
|
@ -129,7 +132,8 @@ class PDFGenerator:
|
|||
fontSize=16,
|
||||
textColor=HexColor('#2c3e50'),
|
||||
spaceAfter=12,
|
||||
spaceBefore=12,
|
||||
spaceBefore=16,
|
||||
wordWrap='CJK',
|
||||
)
|
||||
|
||||
body_style = ParagraphStyle(
|
||||
|
|
@ -137,9 +141,11 @@ class PDFGenerator:
|
|||
parent=styles['Normal'],
|
||||
fontName=self.primary_font,
|
||||
fontSize=10,
|
||||
leading=14,
|
||||
leading=16, # Increased from 14 for better readability
|
||||
textColor=HexColor('#333333'),
|
||||
spaceAfter=8,
|
||||
spaceAfter=10,
|
||||
wordWrap='CJK',
|
||||
splitLongWords=True,
|
||||
)
|
||||
|
||||
# Add title
|
||||
|
|
@ -178,11 +184,8 @@ class PDFGenerator:
|
|||
else:
|
||||
# Regular paragraph - escape HTML chars and handle special characters
|
||||
text = self._escape_html(para)
|
||||
try:
|
||||
elements.append(Paragraph(text, body_style))
|
||||
except Exception as e:
|
||||
# If paragraph fails, add as plain text
|
||||
elements.append(Paragraph(text.encode('ascii', 'xmlcharrefreplace').decode(), body_style))
|
||||
# Ensure proper UTF-8 handling
|
||||
elements.append(Paragraph(text, body_style))
|
||||
|
||||
# Build PDF
|
||||
doc.build(elements)
|
||||
|
|
@ -206,14 +209,14 @@ class PDFGenerator:
|
|||
# Remove markdown links but keep text
|
||||
text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
|
||||
|
||||
# Remove bold/italic markers
|
||||
# Remove bold/italic markers carefully to avoid orphan characters
|
||||
text = re.sub(r'\*\*([^\*]+)\*\*', r'\1', text)
|
||||
text = re.sub(r'\*([^\*]+)\*', r'\1', text)
|
||||
text = re.sub(r'(?<!\*)\*([^\*]+)\*(?!\*)', r'\1', text) # Avoid double asterisks
|
||||
text = re.sub(r'__([^_]+)__', r'\1', text)
|
||||
text = re.sub(r'_([^_]+)_', r'\1', text)
|
||||
text = re.sub(r'(?<!_)_([^_]+)_(?!_)', r'\1', text) # Avoid double underscores
|
||||
|
||||
# Remove code blocks
|
||||
text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)
|
||||
text = re.sub(r'```[^`]*```', ' ', text, flags=re.DOTALL) # Replace with space not empty
|
||||
text = re.sub(r'`([^`]+)`', r'\1', text)
|
||||
|
||||
# Clean up bullet points
|
||||
|
|
@ -222,6 +225,12 @@ class PDFGenerator:
|
|||
# Remove horizontal rules
|
||||
text = re.sub(r'^[\-\*\_]{3,}\s*$', '', text, flags=re.MULTILINE)
|
||||
|
||||
# Remove multiple consecutive spaces
|
||||
text = re.sub(r' {2,}', ' ', text)
|
||||
|
||||
# Remove orphaned single characters that might be markdown artifacts
|
||||
text = re.sub(r'(?<=[^\w])([*_`~#])(?=[^\w])', '', text)
|
||||
|
||||
return text
|
||||
|
||||
def _escape_html(self, text: str) -> str:
|
||||
|
|
|
|||
|
|
@ -47,10 +47,16 @@ def create_bear_researcher(llm, memory):
|
|||
# 目標:將每個報告限制在合理的字符數內,總共不超過約 15000 字符(約 20000-30000 tokens)
|
||||
|
||||
def truncate_text(text, max_chars):
|
||||
"""截斷文本到指定字符數"""
|
||||
"""智能截斷文本到指定字符數,在句子邊界處截斷"""
|
||||
if len(text) <= max_chars:
|
||||
return text
|
||||
return text[:max_chars] + "\n...(內容已截斷)"
|
||||
|
||||
truncated = text[:max_chars]
|
||||
for delimiter in ['。', '\n', ',', '、', ' ']:
|
||||
last_pos = truncated.rfind(delimiter)
|
||||
if last_pos > max_chars * 0.8:
|
||||
return text[:last_pos + 1] + "\n\n...(為控制長度已精簡)"
|
||||
return truncated + "...(為控制長度已精簡)"
|
||||
|
||||
# 為每個報告設置合理的字符限制
|
||||
# 模型 gpt-4.1-mini 的限制是 8192 tokens
|
||||
|
|
|
|||
|
|
@ -47,10 +47,21 @@ def create_bull_researcher(llm, memory):
|
|||
# 目標:將每個報告限制在合理的字符數內,總共不超過約 15000 字符(約 20000-30000 tokens)
|
||||
|
||||
def truncate_text(text, max_chars):
|
||||
"""截斷文本到指定字符數"""
|
||||
"""智能截斷文本到指定字符數,在句子邊界處截斷"""
|
||||
if len(text) <= max_chars:
|
||||
return text
|
||||
return text[:max_chars] + "\n...(內容已截斷)"
|
||||
|
||||
# 在max_chars附近尋找句子結束標記
|
||||
truncated = text[:max_chars]
|
||||
|
||||
# 尋找最後一個句號、換行或逗號
|
||||
for delimiter in ['。', '\n', ',', '、', ' ']:
|
||||
last_pos = truncated.rfind(delimiter)
|
||||
if last_pos > max_chars * 0.8: # 至少保留80%的內容
|
||||
return text[:last_pos + 1] + "\n\n...(為控制長度已精簡)"
|
||||
|
||||
# 如果找不到合適的分隔符,直接在字符處截斷
|
||||
return truncated + "...(為控制長度已精簡)"
|
||||
|
||||
# 為每個報告設置合理的字符限制
|
||||
# 模型 gpt-4.1-mini 的限制是 8192 tokens
|
||||
|
|
|
|||
|
|
@ -41,10 +41,21 @@ def create_trader(llm, memory):
|
|||
|
||||
# 定義文本截斷函數以避免超過 token 限制
|
||||
def truncate_text(text, max_chars):
|
||||
"""截斷文本到指定字符數"""
|
||||
"""智能截斷文本到指定字符數,在句子邊界處截斷"""
|
||||
if len(text) <= max_chars:
|
||||
return text
|
||||
return text[:max_chars] + "\n...(內容已截斷)"
|
||||
|
||||
# 在max_chars附近尋找句子結束標記
|
||||
truncated = text[:max_chars]
|
||||
|
||||
# 尋找最後一個句號、換行或逗號
|
||||
for delimiter in ['。', '\n', ',', '、', ' ']:
|
||||
last_pos = truncated.rfind(delimiter)
|
||||
if last_pos > max_chars * 0.8: # 至少保留80%的內容
|
||||
return text[:last_pos + 1] + "\n\n...(為控制長度已精簡)"
|
||||
|
||||
# 如果找不到合適的分隔符,直接在字符處截斷
|
||||
return truncated + "...(為控制長度已精簡)"
|
||||
|
||||
# 截斷各類報告以控制 token 使用量
|
||||
# 這些報告將用於記憶檢索(embedding)和 LLM prompt
|
||||
|
|
|
|||
Loading…
Reference in New Issue