"""Parsers for OpenAI-generated news text (global and stock-specific)."""
import re
from typing import List, Dict, Any
def _extract_urls(text: str) -> List[str]:
|
|
url_pattern = re.compile(r"https?://[^\s)]+")
|
|
return url_pattern.findall(text or "")
|
|
|
|
|
|
def _strip_md(s: str) -> str:
|
|
if not s:
|
|
return s
|
|
# Remove simple markdown bold/italics markers
|
|
return re.sub(r"[*_`]+", "", s).strip()
|
|
|
|
|
|
def parse_global_news(raw_text: str) -> List[Dict[str, Any]]:
    """
    Parses global news text produced by get_global_news_openai into a list of items.

    Expected patterns include enumerated bold headings like:

        1. **October 25, 2025: "Headline"**
           - Trading Relevance: ...
           (source links)

    Args:
        raw_text: Raw model output text.

    Returns:
        A list of dicts with keys: date, headline, relevance, sources, raw.
        Empty list for empty or non-string input.
    """
    if not raw_text or not isinstance(raw_text, str):
        return []

    items: List[Dict[str, Any]] = []

    # Find each enumerated bold heading; an item's body runs from the end of
    # its heading to the start of the next heading (or end of text).
    headers = list(re.finditer(r"(?m)^\s*\d+\.\s+\*\*(.+?)\*\*\s*$", raw_text))
    if not headers:
        # Fallback: bold lines that start with a date-like pattern.
        headers = list(
            re.finditer(r"(?m)^\s*\*\*([A-Za-z]+\s+\d{1,2},\s+\d{4}.*)\*\*\s*$", raw_text)
        )

    boundaries = [(m.start(), m.end(), m.group(1)) for m in headers]
    text_len = len(raw_text)
    for i, (_, end, header_text) in enumerate(boundaries):
        next_start = boundaries[i + 1][0] if i + 1 < len(boundaries) else text_len
        block = raw_text[end:next_start].strip()

        header = header_text.strip()
        # Date like "October 25, 2025" anywhere in the header.
        date_match = re.search(r"([A-Za-z]+\s+\d{1,2},\s+\d{4})", header)
        # Headline in straight OR curly quotes — model output uses both, and
        # the colon fallback below already anticipates curly quotes.
        quoted_headline = re.search(r"[\"\u201c]([^\"\u201d]+)[\"\u201d]", header)
        headline_after_colon = None
        if ":" in header:
            # "<date>: <headline>" form; drop surrounding quotes if present.
            headline_after_colon = header.split(":", 1)[1].strip().strip("\"\u201c\u201d")

        date_str = date_match.group(1) if date_match else None
        headline = (
            quoted_headline.group(1)
            if quoted_headline
            else (headline_after_colon or _strip_md(header))
        )

        # First "Trading Relevance:" line — capture the rest of that line.
        rel_match = re.search(r"(?i)Trading\s+Relevance:\s*(.+)", block)
        relevance = rel_match.group(1).strip() if rel_match else ""

        sources = _extract_urls(block + " " + header)

        items.append(
            {
                "date": date_str,
                "headline": headline,
                "relevance": relevance,
                # De-duplicate while preserving first-seen order.
                "sources": list(dict.fromkeys(sources)),
                "raw": header + "\n" + block,
            }
        )
    return items
def parse_stock_news(raw_text: str) -> List[Dict[str, Any]]:
    """
    Parses company-specific news text from get_stock_news_openai into a list of items.

    Expected patterns include bold enumerated sections like:

        **1. Topic**
        Description ... (url)

    Args:
        raw_text: Raw model output text.

    Returns:
        A list of dicts with keys: title, summary, sources, raw.
        Empty list for empty or non-string input.
    """
    if not raw_text or not isinstance(raw_text, str):
        return []

    items: List[Dict[str, Any]] = []

    # Headings like "**1. Something**" on their own line.
    headers = list(re.finditer(r"(?m)^\s*\*\*\s*\d+\.\s*(.+?)\s*\*\*\s*$", raw_text))
    if not headers:
        # Fallback: numbered lines even without bold markers.
        headers = list(re.finditer(r"(?m)^\s*\d+\.\s+(.+?)\s*$", raw_text))

    if headers:
        boundaries = [(m.start(), m.end(), m.group(1)) for m in headers]
        text_len = len(raw_text)
        for i, (_, end, header_text) in enumerate(boundaries):
            next_start = boundaries[i + 1][0] if i + 1 < len(boundaries) else text_len
            # Item body: from end of this heading to start of the next one.
            block = raw_text[end:next_start].strip()
            title = _strip_md(header_text)
            sources = _extract_urls(block + " " + title)
            items.append(
                {
                    "title": title,
                    "summary": block,
                    # De-duplicate while preserving first-seen order.
                    "sources": list(dict.fromkeys(sources)),
                    "raw": f"{title}\n{block}",
                }
            )
    else:
        # Last resort: split into paragraphs; keep any paragraph that has a
        # URL or enough text to plausibly be a news item.
        paragraphs = [p.strip() for p in re.split(r"\n\s*\n", raw_text) if p.strip()]
        for p in paragraphs:
            urls = _extract_urls(p)
            if urls or len(p) > 120:
                items.append(
                    {
                        # Strip markdown from the first line for consistency
                        # with the heading branch, then truncate.
                        "title": _strip_md(p.split("\n", 1)[0])[:80],
                        "summary": p,
                        "sources": list(dict.fromkeys(urls)),
                        "raw": p,
                    }
                )

    return items