This commit is contained in:
parent
dbee0c4817
commit
8fcc3d6256
|
|
@ -612,14 +612,14 @@ class PDFGenerator:
|
||||||
|
|
||||||
def _clean_markdown(self, text: str) -> str:
|
def _clean_markdown(self, text: str) -> str:
|
||||||
"""
|
"""
|
||||||
Clean markdown formatting for PDF - IMPROVED VERSION
|
Clean markdown formatting for PDF - AGGRESSIVE VERSION
|
||||||
Simplified regex patterns to prevent encoding artifacts
|
Removes ALL markdown syntax to produce clean text
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text: Markdown text
|
text: Markdown text
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Cleaned text
|
Cleaned text with no markdown syntax
|
||||||
"""
|
"""
|
||||||
import unicodedata
|
import unicodedata
|
||||||
|
|
||||||
|
|
@ -629,49 +629,54 @@ class PDFGenerator:
|
||||||
# 1. Remove markdown links but keep text
|
# 1. Remove markdown links but keep text
|
||||||
text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
|
text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
|
||||||
|
|
||||||
# 2. Remove bold markers (simplified version)
|
# 2. Remove bold markers - MULTIPLE PASSES for nested cases
|
||||||
text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
|
# Handle **text** pattern (greedy removal)
|
||||||
text = re.sub(r'__(.+?)__', r'\1', text)
|
for _ in range(3): # Multiple passes to handle nested/adjacent
|
||||||
|
text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
|
||||||
|
text = re.sub(r'__([^_]+)__', r'\1', text)
|
||||||
|
|
||||||
# 3. Remove italic markers (SIMPLIFIED - avoid complex lookahead/lookbehind)
|
# 3. Handle remaining isolated ** or __
|
||||||
# Only match single * or _ that are NOT part of ** or __
|
text = re.sub(r'\*\*', '', text)
|
||||||
text = re.sub(r'(?<![\*])\*([^\*]+?)\*(?![\*])', r'\1', text)
|
text = re.sub(r'__', '', text)
|
||||||
|
|
||||||
|
# 4. Remove italic markers (single * or _)
|
||||||
|
text = re.sub(r'(?<![*])\*([^*]+?)\*(?![*])', r'\1', text)
|
||||||
text = re.sub(r'(?<![_])_([^_]+?)_(?![_])', r'\1', text)
|
text = re.sub(r'(?<![_])_([^_]+?)_(?![_])', r'\1', text)
|
||||||
|
|
||||||
# 4. Remove code blocks
|
# 5. Remove remaining isolated * that aren't bullet points
|
||||||
|
# Keep * at start of line (bullet points)
|
||||||
|
text = re.sub(r'(?<!^)(?<!\n)\*(?![*\s])', '', text)
|
||||||
|
|
||||||
|
# 6. Remove code blocks
|
||||||
text = re.sub(r'```[^`]*?```', '', text, flags=re.DOTALL)
|
text = re.sub(r'```[^`]*?```', '', text, flags=re.DOTALL)
|
||||||
text = re.sub(r'`([^`]+?)`', r'\1', text)
|
text = re.sub(r'`([^`]+?)`', r'\1', text)
|
||||||
|
|
||||||
# 5. Clean up bullet points - USE ASCII DASH, NOT UNICODE BULLET
|
# 7. Clean up bullet points - normalize to simple dash
|
||||||
# Unicode bullet • (U+2022) renders as '煉' in STSong-Light font!
|
|
||||||
text = re.sub(r'^\s*[\*\-\+]\s+', '- ', text, flags=re.MULTILINE)
|
text = re.sub(r'^\s*[\*\-\+]\s+', '- ', text, flags=re.MULTILINE)
|
||||||
|
|
||||||
# 6. Remove horizontal rules
|
# 8. Remove horizontal rules
|
||||||
text = re.sub(r'^[\-\*_]{3,}\s*$', '', text, flags=re.MULTILINE)
|
text = re.sub(r'^[\-\*_]{3,}\s*$', '', text, flags=re.MULTILINE)
|
||||||
|
|
||||||
# 7. Clean table separators (simplified)
|
# 9. Clean table separators
|
||||||
text = re.sub(r'^\s*\|?\s*:?-+:?\s*\|?\s*$', '', text, flags=re.MULTILINE)
|
text = re.sub(r'^\s*\|?\s*:?-+:?\s*\|?\s*$', '', text, flags=re.MULTILINE)
|
||||||
|
|
||||||
# 8. Remove table | symbols (keep content)
|
# 10. Remove table | symbols but keep content
|
||||||
text = re.sub(r'^\s*\|', '', text, flags=re.MULTILINE)
|
text = re.sub(r'^\s*\|', '', text, flags=re.MULTILINE)
|
||||||
text = re.sub(r'\|\s*$', '', text, flags=re.MULTILINE)
|
text = re.sub(r'\|\s*$', '', text, flags=re.MULTILINE)
|
||||||
text = re.sub(r'\|', ' | ', text)
|
text = re.sub(r'\|', ' | ', text)
|
||||||
|
|
||||||
# 9. Clean excess spaces
|
# 11. Remove heading markers (# ## ### etc)
|
||||||
|
text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)
|
||||||
|
|
||||||
|
# 12. Clean excess spaces
|
||||||
text = re.sub(r' {2,}', ' ', text)
|
text = re.sub(r' {2,}', ' ', text)
|
||||||
|
|
||||||
# 10. Clean excess blank lines
|
# 13. Clean excess blank lines
|
||||||
text = re.sub(r'\n{3,}', '\n\n', text)
|
text = re.sub(r'\n{3,}', '\n\n', text)
|
||||||
|
|
||||||
# 11. Remove isolated markdown symbols (SIMPLIFIED - no complex patterns)
|
# 14. Remove isolated markdown symbols on their own lines
|
||||||
# Remove lines that only contain markdown symbols
|
|
||||||
text = re.sub(r'^[\*_`~#\-\+]+\s*$', '', text, flags=re.MULTILINE)
|
text = re.sub(r'^[\*_`~#\-\+]+\s*$', '', text, flags=re.MULTILINE)
|
||||||
|
|
||||||
# 12. REMOVED problematic Unicode filter that was corrupting Chinese characters
|
|
||||||
# The string comparison '\u4e00' <= char <= '\u9fff' was comparing UTF-8 bytes,
|
|
||||||
# not Unicode code points, causing characters like '經' to be corrupted.
|
|
||||||
# Unicode normalization at the start (line 237) is sufficient.
|
|
||||||
|
|
||||||
return text.strip()
|
return text.strip()
|
||||||
|
|
||||||
def _escape_html(self, text: str) -> str:
|
def _escape_html(self, text: str) -> str:
|
||||||
|
|
@ -1083,24 +1088,17 @@ class PDFGenerator:
|
||||||
# Track which analysts are in the reports
|
# Track which analysts are in the reports
|
||||||
report_analyst_names = [r.get('analyst_name', '') for r in reports]
|
report_analyst_names = [r.get('analyst_name', '') for r in reports]
|
||||||
|
|
||||||
# Page numbering: Page 1 starts from chart page
|
# Build TOC as simple list (no page numbers since reports span multiple pages)
|
||||||
# Cover and TOC don't have page numbers
|
|
||||||
current_page = 1
|
|
||||||
|
|
||||||
# Build TOC table data
|
|
||||||
table_data = []
|
table_data = []
|
||||||
table_data.append([
|
table_data.append([
|
||||||
Paragraph('<b>章 節</b>', styles['toc_item']),
|
Paragraph('<b>報告內容</b>', styles['toc_section']),
|
||||||
Paragraph('<b>頁 碼</b>', styles['toc_item'])
|
|
||||||
])
|
])
|
||||||
|
|
||||||
# Add chart page entry if available
|
# Add chart page entry if available
|
||||||
if has_chart:
|
if has_chart:
|
||||||
table_data.append([
|
table_data.append([
|
||||||
Paragraph(' 價格走勢圖 & 交易量柱狀圖', styles['toc_item']),
|
Paragraph(' 價格走勢圖 & 交易量柱狀圖', styles['toc_item']),
|
||||||
Paragraph(f'{current_page}', styles['toc_item'])
|
|
||||||
])
|
])
|
||||||
current_page += 1
|
|
||||||
|
|
||||||
# Use teams if provided
|
# Use teams if provided
|
||||||
if teams:
|
if teams:
|
||||||
|
|
@ -1113,37 +1111,31 @@ class PDFGenerator:
|
||||||
if team_report_count == 0:
|
if team_report_count == 0:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Add team separator entry
|
# Add team separator entry
|
||||||
table_data.append([
|
table_data.append([
|
||||||
Paragraph(f'<b>{team_name} ({team_report_count} 位)</b>', styles['toc_section']),
|
Paragraph(f'<b>{team_name} ({team_report_count} 位)</b>', styles['toc_section']),
|
||||||
Paragraph(f'{current_page}', styles['toc_item'])
|
|
||||||
])
|
])
|
||||||
current_page += 1 # Team separator page
|
|
||||||
|
|
||||||
# Add each analyst in this team
|
# Add each analyst in this team
|
||||||
for analyst_name in team_members:
|
for analyst_name in team_members:
|
||||||
if analyst_name in report_analyst_names:
|
if analyst_name in report_analyst_names:
|
||||||
table_data.append([
|
table_data.append([
|
||||||
Paragraph(f' {analyst_name}', styles['toc_item']),
|
Paragraph(f' - {analyst_name}', styles['toc_item']),
|
||||||
Paragraph(f'{current_page}', styles['toc_item'])
|
|
||||||
])
|
])
|
||||||
current_page += 1
|
|
||||||
|
|
||||||
# Create table
|
# Create table (single column)
|
||||||
col_widths = [14*cm, 2*cm]
|
col_widths = [16*cm]
|
||||||
toc_table = Table(table_data, colWidths=col_widths)
|
toc_table = Table(table_data, colWidths=col_widths)
|
||||||
|
|
||||||
# Style the table
|
# Style the table
|
||||||
table_style = TableStyle([
|
table_style = TableStyle([
|
||||||
('ALIGN', (0, 0), (0, -1), 'LEFT'),
|
('ALIGN', (0, 0), (-1, -1), 'LEFT'),
|
||||||
('ALIGN', (1, 0), (1, -1), 'RIGHT'),
|
|
||||||
('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
|
('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
|
||||||
('FONTNAME', (0, 0), (-1, -1), self.primary_font),
|
('FONTNAME', (0, 0), (-1, -1), self.primary_font),
|
||||||
('FONTSIZE', (0, 0), (-1, -1), 11),
|
('FONTSIZE', (0, 0), (-1, -1), 11),
|
||||||
('BOTTOMPADDING', (0, 0), (-1, -1), 6),
|
('BOTTOMPADDING', (0, 0), (-1, -1), 5),
|
||||||
('TOPPADDING', (0, 0), (-1, -1), 6),
|
('TOPPADDING', (0, 0), (-1, -1), 5),
|
||||||
('LINEBELOW', (0, 0), (-1, 0), 1, black), # Header line
|
('LINEBELOW', (0, 0), (-1, 0), 1, black), # Header line
|
||||||
('LINEBELOW', (0, -1), (-1, -1), 0.5, lightgrey), # Bottom line
|
|
||||||
])
|
])
|
||||||
toc_table.setStyle(table_style)
|
toc_table.setStyle(table_style)
|
||||||
|
|
||||||
|
|
@ -1413,23 +1405,30 @@ class PDFGenerator:
|
||||||
i = end_idx
|
i = end_idx
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Check heading levels
|
# Check heading levels (### ## #)
|
||||||
if line.startswith('### '):
|
if line.startswith('### '):
|
||||||
text = line[4:]
|
text = self._clean_markdown(line[4:])
|
||||||
elements.append(Paragraph(self._escape_html(text), styles['heading']))
|
elements.append(Paragraph(self._escape_html(text), styles['heading']))
|
||||||
elif line.startswith('## '):
|
elif line.startswith('## '):
|
||||||
text = line[3:]
|
text = self._clean_markdown(line[3:])
|
||||||
elements.append(Paragraph(self._escape_html(text), styles['heading']))
|
elements.append(Paragraph(self._escape_html(text), styles['heading']))
|
||||||
elif line.startswith('# '):
|
elif line.startswith('# '):
|
||||||
text = line[2:]
|
text = self._clean_markdown(line[2:])
|
||||||
elements.append(Paragraph(self._escape_html(text), styles['heading']))
|
elements.append(Paragraph(self._escape_html(text), styles['heading']))
|
||||||
elif line.startswith('**') and line.endswith('**'):
|
# Numbered headings like "1. Title" or "**1. Title**"
|
||||||
# Bold text as heading
|
elif re.match(r'^\*?\*?\d+\.', line):
|
||||||
text = line[2:-2]
|
# Extract the content, clean markdown
|
||||||
|
text = self._clean_markdown(line)
|
||||||
elements.append(Paragraph(self._escape_html(text), styles['heading']))
|
elements.append(Paragraph(self._escape_html(text), styles['heading']))
|
||||||
|
# Bold text as heading **text**
|
||||||
|
elif line.startswith('**') and '**' in line[2:]:
|
||||||
|
text = self._clean_markdown(line)
|
||||||
|
elements.append(Paragraph(self._escape_html(text), styles['heading']))
|
||||||
|
# Bullet points
|
||||||
elif line.startswith('- ') or line.startswith('* '):
|
elif line.startswith('- ') or line.startswith('* '):
|
||||||
# Bullet points - use simple dash instead of Unicode bullet
|
# Clean markdown from bullet content
|
||||||
text = ' - ' + line[2:]
|
bullet_content = self._clean_markdown(line[2:])
|
||||||
|
text = ' - ' + bullet_content
|
||||||
elements.append(Paragraph(self._escape_html(text), styles['body']))
|
elements.append(Paragraph(self._escape_html(text), styles['body']))
|
||||||
else:
|
else:
|
||||||
# Clean any remaining markdown
|
# Clean any remaining markdown
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue