From 8fcc3d6256acc65b810fb9ec093ef67ced27e58d Mon Sep 17 00:00:00 2001 From: MarkLo Date: Tue, 16 Dec 2025 01:57:40 +0800 Subject: [PATCH] --- backend/app/services/pdf_generator.py | 109 +++++++++++++------------- 1 file changed, 54 insertions(+), 55 deletions(-) diff --git a/backend/app/services/pdf_generator.py b/backend/app/services/pdf_generator.py index 46c5393f..2a0af002 100644 --- a/backend/app/services/pdf_generator.py +++ b/backend/app/services/pdf_generator.py @@ -612,14 +612,14 @@ class PDFGenerator: def _clean_markdown(self, text: str) -> str: """ - Clean markdown formatting for PDF - IMPROVED VERSION - Simplified regex patterns to prevent encoding artifacts + Clean markdown formatting for PDF - AGGRESSIVE VERSION + Removes ALL markdown syntax to produce clean text Args: text: Markdown text Returns: - Cleaned text + Cleaned text with no markdown syntax """ import unicodedata @@ -629,49 +629,54 @@ class PDFGenerator: # 1. Remove markdown links but keep text text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text) - # 2. Remove bold markers (simplified version) - text = re.sub(r'\*\*(.+?)\*\*', r'\1', text) - text = re.sub(r'__(.+?)__', r'\1', text) + # 2. Remove bold markers - MULTIPLE PASSES for nested cases + # Handle **text** pattern (greedy removal) + for _ in range(3): # Multiple passes to handle nested/adjacent + text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text) + text = re.sub(r'__([^_]+)__', r'\1', text) - # 3. Remove italic markers (SIMPLIFIED - avoid complex lookahead/lookbehind) - # Only match single * or _ that are NOT part of ** or __ - text = re.sub(r'(? str: @@ -1083,24 +1088,17 @@ class PDFGenerator: # Track which analysts are in the reports report_analyst_names = [r.get('analyst_name', '') for r in reports] - # Page numbering: Page 1 starts from chart page - # Cover and TOC don't have page numbers - current_page = 1 - - # Build TOC table data + # Build TOC as simple list (no page numbers since reports span multiple pages) table_data = [] table_data.append([ - Paragraph('章 節', styles['toc_item']), - Paragraph('頁 碼', styles['toc_item']) + Paragraph('報告內容', styles['toc_section']), ]) # Add chart page entry if available if has_chart: table_data.append([ Paragraph(' 價格走勢圖 & 交易量柱狀圖', styles['toc_item']), - Paragraph(f'{current_page}', styles['toc_item']) ]) - current_page += 1 # Use teams if provided if teams: @@ -1113,37 +1111,31 @@ class PDFGenerator: if team_report_count == 0: continue - # Add team separator entry + # Add team separator entry table_data.append([ Paragraph(f'{team_name} ({team_report_count} 位)', styles['toc_section']), - Paragraph(f'{current_page}', styles['toc_item']) ]) - current_page += 1 # Team separator page # Add each analyst in this team for analyst_name in team_members: if analyst_name in report_analyst_names: table_data.append([ - Paragraph(f' {analyst_name}', styles['toc_item']), - Paragraph(f'{current_page}', styles['toc_item']) + Paragraph(f' - {analyst_name}', styles['toc_item']), ]) - current_page += 1 - # Create table - col_widths = [14*cm, 2*cm] + # Create table (single column) + col_widths = [16*cm] toc_table = Table(table_data, colWidths=col_widths) # Style the table table_style = TableStyle([ - ('ALIGN', (0, 0), (0, -1), 'LEFT'), - ('ALIGN', (1, 0), (1, -1), 'RIGHT'), + ('ALIGN', (0, 0), (-1, -1), 'LEFT'), ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'), ('FONTNAME', (0, 0), (-1, -1), self.primary_font), ('FONTSIZE', (0, 0), (-1, -1), 11), - ('BOTTOMPADDING', (0, 0), (-1, -1), 6), - ('TOPPADDING', (0, 0), (-1, -1), 6), + ('BOTTOMPADDING', (0, 0), (-1, -1), 5), + ('TOPPADDING', (0, 0), (-1, -1), 5), ('LINEBELOW', (0, 0), (-1, 0), 1, black), # Header line - ('LINEBELOW', (0, -1), (-1, -1), 0.5, lightgrey), # Bottom line ]) toc_table.setStyle(table_style) @@ -1413,23 +1405,30 @@ class PDFGenerator: i = end_idx continue - # Check heading levels + # Check heading levels (### ## #) if line.startswith('### '): - text = line[4:] + text = self._clean_markdown(line[4:]) elements.append(Paragraph(self._escape_html(text), styles['heading'])) elif line.startswith('## '): - text = line[3:] + text = self._clean_markdown(line[3:]) elements.append(Paragraph(self._escape_html(text), styles['heading'])) elif line.startswith('# '): - text = line[2:] + text = self._clean_markdown(line[2:]) elements.append(Paragraph(self._escape_html(text), styles['heading'])) - elif line.startswith('**') and line.endswith('**'): - # Bold text as heading - text = line[2:-2] + # Numbered headings like "1. Title" or "**1. Title**" + elif re.match(r'^\*?\*?\d+\.', line): + # Extract the content, clean markdown + text = self._clean_markdown(line) elements.append(Paragraph(self._escape_html(text), styles['heading'])) + # Bold text as heading **text** + elif line.startswith('**') and '**' in line[2:]: + text = self._clean_markdown(line) + elements.append(Paragraph(self._escape_html(text), styles['heading'])) + # Bullet points elif line.startswith('- ') or line.startswith('* '): - # Bullet points - use simple dash instead of Unicode bullet - text = ' - ' + line[2:] + # Clean markdown from bullet content + bullet_content = self._clean_markdown(line[2:]) + text = ' - ' + bullet_content elements.append(Paragraph(self._escape_html(text), styles['body'])) else: # Clean any remaining markdown