This commit is contained in:
parent
4cf7e808d7
commit
15babc2bea
|
|
@ -1,3 +1,4 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
"""
|
"""
|
||||||
PDF Generation Service for Analyst Reports
|
PDF Generation Service for Analyst Reports
|
||||||
Converts markdown reports to PDF format with Chinese character support
|
Converts markdown reports to PDF format with Chinese character support
|
||||||
|
|
@ -101,7 +102,7 @@ class PDFGenerator:
|
||||||
# Define styles
|
# Define styles
|
||||||
styles = getSampleStyleSheet()
|
styles = getSampleStyleSheet()
|
||||||
|
|
||||||
# Custom styles with Cactus Classical Serif font
|
# Custom styles with proper spacing and wrapping
|
||||||
title_style = ParagraphStyle(
|
title_style = ParagraphStyle(
|
||||||
'CustomTitle',
|
'CustomTitle',
|
||||||
parent=styles['Heading1'],
|
parent=styles['Heading1'],
|
||||||
|
|
@ -110,6 +111,7 @@ class PDFGenerator:
|
||||||
textColor=HexColor('#1a1a1a'),
|
textColor=HexColor('#1a1a1a'),
|
||||||
spaceAfter=30,
|
spaceAfter=30,
|
||||||
alignment=TA_CENTER,
|
alignment=TA_CENTER,
|
||||||
|
wordWrap='CJK',
|
||||||
)
|
)
|
||||||
|
|
||||||
subtitle_style = ParagraphStyle(
|
subtitle_style = ParagraphStyle(
|
||||||
|
|
@ -118,8 +120,9 @@ class PDFGenerator:
|
||||||
fontName=self.primary_font,
|
fontName=self.primary_font,
|
||||||
fontSize=12,
|
fontSize=12,
|
||||||
textColor=HexColor('#666666'),
|
textColor=HexColor('#666666'),
|
||||||
spaceAfter=20,
|
spaceAfter=12,
|
||||||
alignment=TA_CENTER,
|
alignment=TA_CENTER,
|
||||||
|
wordWrap='CJK',
|
||||||
)
|
)
|
||||||
|
|
||||||
heading_style = ParagraphStyle(
|
heading_style = ParagraphStyle(
|
||||||
|
|
@ -129,7 +132,8 @@ class PDFGenerator:
|
||||||
fontSize=16,
|
fontSize=16,
|
||||||
textColor=HexColor('#2c3e50'),
|
textColor=HexColor('#2c3e50'),
|
||||||
spaceAfter=12,
|
spaceAfter=12,
|
||||||
spaceBefore=12,
|
spaceBefore=16,
|
||||||
|
wordWrap='CJK',
|
||||||
)
|
)
|
||||||
|
|
||||||
body_style = ParagraphStyle(
|
body_style = ParagraphStyle(
|
||||||
|
|
@ -137,9 +141,11 @@ class PDFGenerator:
|
||||||
parent=styles['Normal'],
|
parent=styles['Normal'],
|
||||||
fontName=self.primary_font,
|
fontName=self.primary_font,
|
||||||
fontSize=10,
|
fontSize=10,
|
||||||
leading=14,
|
leading=16, # Increased from 14 for better readability
|
||||||
textColor=HexColor('#333333'),
|
textColor=HexColor('#333333'),
|
||||||
spaceAfter=8,
|
spaceAfter=10,
|
||||||
|
wordWrap='CJK',
|
||||||
|
splitLongWords=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Add title
|
# Add title
|
||||||
|
|
@ -178,11 +184,8 @@ class PDFGenerator:
|
||||||
else:
|
else:
|
||||||
# Regular paragraph - escape HTML chars and handle special characters
|
# Regular paragraph - escape HTML chars and handle special characters
|
||||||
text = self._escape_html(para)
|
text = self._escape_html(para)
|
||||||
try:
|
# Ensure proper UTF-8 handling
|
||||||
elements.append(Paragraph(text, body_style))
|
elements.append(Paragraph(text, body_style))
|
||||||
except Exception as e:
|
|
||||||
# If paragraph fails, add as plain text
|
|
||||||
elements.append(Paragraph(text.encode('ascii', 'xmlcharrefreplace').decode(), body_style))
|
|
||||||
|
|
||||||
# Build PDF
|
# Build PDF
|
||||||
doc.build(elements)
|
doc.build(elements)
|
||||||
|
|
@ -206,14 +209,14 @@ class PDFGenerator:
|
||||||
# Remove markdown links but keep text
|
# Remove markdown links but keep text
|
||||||
text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
|
text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
|
||||||
|
|
||||||
# Remove bold/italic markers
|
# Remove bold/italic markers carefully to avoid orphan characters
|
||||||
text = re.sub(r'\*\*([^\*]+)\*\*', r'\1', text)
|
text = re.sub(r'\*\*([^\*]+)\*\*', r'\1', text)
|
||||||
text = re.sub(r'\*([^\*]+)\*', r'\1', text)
|
text = re.sub(r'(?<!\*)\*([^\*]+)\*(?!\*)', r'\1', text) # Avoid double asterisks
|
||||||
text = re.sub(r'__([^_]+)__', r'\1', text)
|
text = re.sub(r'__([^_]+)__', r'\1', text)
|
||||||
text = re.sub(r'_([^_]+)_', r'\1', text)
|
text = re.sub(r'(?<!_)_([^_]+)_(?!_)', r'\1', text) # Avoid double underscores
|
||||||
|
|
||||||
# Remove code blocks
|
# Remove code blocks
|
||||||
text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)
|
text = re.sub(r'```[^`]*```', ' ', text, flags=re.DOTALL) # Replace with space not empty
|
||||||
text = re.sub(r'`([^`]+)`', r'\1', text)
|
text = re.sub(r'`([^`]+)`', r'\1', text)
|
||||||
|
|
||||||
# Clean up bullet points
|
# Clean up bullet points
|
||||||
|
|
@ -222,6 +225,12 @@ class PDFGenerator:
|
||||||
# Remove horizontal rules
|
# Remove horizontal rules
|
||||||
text = re.sub(r'^[\-\*\_]{3,}\s*$', '', text, flags=re.MULTILINE)
|
text = re.sub(r'^[\-\*\_]{3,}\s*$', '', text, flags=re.MULTILINE)
|
||||||
|
|
||||||
|
# Remove multiple consecutive spaces
|
||||||
|
text = re.sub(r' {2,}', ' ', text)
|
||||||
|
|
||||||
|
# Remove orphaned single characters that might be markdown artifacts
|
||||||
|
text = re.sub(r'(?<=[^\w])([*_`~#])(?=[^\w])', '', text)
|
||||||
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def _escape_html(self, text: str) -> str:
|
def _escape_html(self, text: str) -> str:
|
||||||
|
|
|
||||||
|
|
@ -47,10 +47,16 @@ def create_bear_researcher(llm, memory):
|
||||||
# 目標:將每個報告限制在合理的字符數內,總共不超過約 15000 字符(約 20000-30000 tokens)
|
# 目標:將每個報告限制在合理的字符數內,總共不超過約 15000 字符(約 20000-30000 tokens)
|
||||||
|
|
||||||
def truncate_text(text, max_chars, min_keep_ratio=0.8):
    """Truncate ``text`` to about ``max_chars`` characters at a natural boundary.

    The text is first cut at ``max_chars``. The cut is then moved back to the
    last sentence/clause delimiter found inside that prefix, provided the
    delimiter lies beyond ``max_chars * min_keep_ratio`` so that most of the
    allowed content is kept. Delimiters are tried in priority order: sentence
    end, newline, comma, enumeration comma, space.

    Args:
        text: The report text. ``None`` or an empty string is returned as-is.
        max_chars: Maximum number of characters of ``text`` to keep.
        min_keep_ratio: Fraction of ``max_chars`` that must be kept for a
            delimiter to be accepted as the cut point (default 0.8).

    Returns:
        ``text`` unchanged when it already fits; otherwise the shortened text
        followed by a marker saying the content was condensed. The marker is
        appended after the cut, so the result can be slightly longer than
        ``max_chars``.
    """
    # Missing or empty reports pass through instead of failing on len(None).
    if not text or len(text) <= max_chars:
        return text

    truncated = text[:max_chars]
    threshold = max_chars * min_keep_ratio

    # Each tier groups equivalent delimiters; both the full-width and ASCII
    # comma are covered because Chinese text normally uses the full-width one.
    delimiter_tiers = (
        ('。', '!', '?'),
        ('\n',),
        (',', ','),
        ('、',),
        (' ',),
    )
    for tier in delimiter_tiers:
        # rfind returns -1 when absent, which can never pass the threshold.
        last_pos = max(truncated.rfind(delimiter) for delimiter in tier)
        if last_pos > threshold:
            return text[:last_pos + 1] + "\n\n...(為控制長度已精簡)"

    # No suitable delimiter close enough to the limit: hard cut.
    return truncated + "...(為控制長度已精簡)"
|
||||||
|
|
||||||
# 為每個報告設置合理的字符限制
|
# 為每個報告設置合理的字符限制
|
||||||
# 模型 gpt-4.1-mini 的限制是 8192 tokens
|
# 模型 gpt-4.1-mini 的限制是 8192 tokens
|
||||||
|
|
|
||||||
|
|
@ -47,10 +47,21 @@ def create_bull_researcher(llm, memory):
|
||||||
# 目標:將每個報告限制在合理的字符數內,總共不超過約 15000 字符(約 20000-30000 tokens)
|
# 目標:將每個報告限制在合理的字符數內,總共不超過約 15000 字符(約 20000-30000 tokens)
|
||||||
|
|
||||||
def truncate_text(text, max_chars, min_keep_ratio=0.8):
    """Truncate ``text`` to about ``max_chars`` characters at a natural boundary.

    The text is first cut at ``max_chars``. The cut is then moved back to the
    last sentence/clause delimiter found inside that prefix, provided the
    delimiter lies beyond ``max_chars * min_keep_ratio`` so that most of the
    allowed content is kept. Delimiters are tried in priority order: sentence
    end, newline, comma, enumeration comma, space.

    Args:
        text: The report text. ``None`` or an empty string is returned as-is.
        max_chars: Maximum number of characters of ``text`` to keep.
        min_keep_ratio: Fraction of ``max_chars`` that must be kept for a
            delimiter to be accepted as the cut point (default 0.8).

    Returns:
        ``text`` unchanged when it already fits; otherwise the shortened text
        followed by a marker saying the content was condensed. The marker is
        appended after the cut, so the result can be slightly longer than
        ``max_chars``.
    """
    # Missing or empty reports pass through instead of failing on len(None).
    if not text or len(text) <= max_chars:
        return text

    # Look for a sentence boundary near max_chars.
    truncated = text[:max_chars]
    threshold = max_chars * min_keep_ratio

    # Each tier groups equivalent delimiters; both the full-width and ASCII
    # comma are covered because Chinese text normally uses the full-width one.
    delimiter_tiers = (
        ('。', '!', '?'),
        ('\n',),
        (',', ','),
        ('、',),
        (' ',),
    )
    for tier in delimiter_tiers:
        # rfind returns -1 when absent, which can never pass the threshold.
        last_pos = max(truncated.rfind(delimiter) for delimiter in tier)
        if last_pos > threshold:
            return text[:last_pos + 1] + "\n\n...(為控制長度已精簡)"

    # No suitable delimiter close enough to the limit: hard cut.
    return truncated + "...(為控制長度已精簡)"
|
||||||
|
|
||||||
# 為每個報告設置合理的字符限制
|
# 為每個報告設置合理的字符限制
|
||||||
# 模型 gpt-4.1-mini 的限制是 8192 tokens
|
# 模型 gpt-4.1-mini 的限制是 8192 tokens
|
||||||
|
|
|
||||||
|
|
@ -41,10 +41,21 @@ def create_trader(llm, memory):
|
||||||
|
|
||||||
# 定義文本截斷函數以避免超過 token 限制
|
# 定義文本截斷函數以避免超過 token 限制
|
||||||
def truncate_text(text, max_chars, min_keep_ratio=0.8):
    """Truncate ``text`` to about ``max_chars`` characters at a natural boundary.

    The text is first cut at ``max_chars``. The cut is then moved back to the
    last sentence/clause delimiter found inside that prefix, provided the
    delimiter lies beyond ``max_chars * min_keep_ratio`` so that most of the
    allowed content is kept. Delimiters are tried in priority order: sentence
    end, newline, comma, enumeration comma, space.

    Args:
        text: The report text. ``None`` or an empty string is returned as-is.
        max_chars: Maximum number of characters of ``text`` to keep.
        min_keep_ratio: Fraction of ``max_chars`` that must be kept for a
            delimiter to be accepted as the cut point (default 0.8).

    Returns:
        ``text`` unchanged when it already fits; otherwise the shortened text
        followed by a marker saying the content was condensed. The marker is
        appended after the cut, so the result can be slightly longer than
        ``max_chars``.
    """
    # Missing or empty reports pass through instead of failing on len(None).
    if not text or len(text) <= max_chars:
        return text

    # Look for a sentence boundary near max_chars.
    truncated = text[:max_chars]
    threshold = max_chars * min_keep_ratio

    # Each tier groups equivalent delimiters; both the full-width and ASCII
    # comma are covered because Chinese text normally uses the full-width one.
    delimiter_tiers = (
        ('。', '!', '?'),
        ('\n',),
        (',', ','),
        ('、',),
        (' ',),
    )
    for tier in delimiter_tiers:
        # rfind returns -1 when absent, which can never pass the threshold.
        last_pos = max(truncated.rfind(delimiter) for delimiter in tier)
        if last_pos > threshold:
            return text[:last_pos + 1] + "\n\n...(為控制長度已精簡)"

    # No suitable delimiter close enough to the limit: hard cut.
    return truncated + "...(為控制長度已精簡)"
|
||||||
|
|
||||||
# 截斷各類報告以控制 token 使用量
|
# 截斷各類報告以控制 token 使用量
|
||||||
# 這些報告將用於記憶檢索(embedding)和 LLM prompt
|
# 這些報告將用於記憶檢索(embedding)和 LLM prompt
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue