"""
Dagster operations for TradingAgents news collection workflow.
"""
|
|
|
|
import asyncio
import logging
from collections import Counter
from datetime import datetime, timezone
from typing import Any

from dagster import (
    AssetMaterialization,
    MetadataValue,
    OpExecutionContext,
    op,
)

from tradingagents.config import TradingAgentsConfig
from tradingagents.domains.news.news_service import NewsService
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@op
def get_tracked_tickers(context: OpExecutionContext) -> list[str]:
    """
    Get the list of ticker symbols to process.

    The universe is currently hard-coded; it is a candidate for moving into
    configuration (e.g. TradingAgentsConfig) so it can change without a deploy.
    The previous try/except wrapper was removed: a list literal and a single
    log call cannot raise anything worth catching, and the bare re-raise added
    no information.

    Args:
        context: Dagster operation context (used for logging).

    Returns:
        List of ticker symbols to process.
    """
    # NOTE(review): hard-coded default universe — make configurable when needed.
    tickers = ["AAPL", "GOOGL", "MSFT", "TSLA"]
    context.log.info(f"Processing {len(tickers)} tickers: {tickers}")
    return tickers
|
|
|
|
|
|
@op
def fetch_google_news_articles(
    context: OpExecutionContext, ticker: str
) -> dict[str, Any]:
    """
    Fetch news articles for a single ticker from Google News.

    Args:
        context: Dagster operation context
        ticker: Stock ticker symbol

    Returns:
        Dictionary with keys:
            ticker: the input symbol
            articles: list of article dicts (index, ticker, title, url,
                source, published_date, summary)
            status: "success", "no_articles", or "error"
            total_found: number of articles returned
            error: error message (present only when status == "error")
    """
    try:
        context.log.info(f"Fetching articles for ticker: {ticker}")

        # Initialize NewsService per-op invocation.
        # NOTE(review): building the service inline with a None dependency is a
        # stopgap — this should be injected as a Dagster resource.
        config = TradingAgentsConfig.from_env()
        news_service = NewsService.build(None, config)  # Will be replaced with resource

        # Get Google News articles
        google_client = news_service.google_client
        google_articles = google_client.get_company_news(ticker)

        if not google_articles:
            context.log.warning(f"No articles found for {ticker}")
            return {
                "ticker": ticker,
                "articles": [],
                "status": "no_articles",
                "total_found": 0,
            }

        # Convert to a plain dict format so downstream ops stay decoupled from
        # the client's article type.
        article_list = [
            {
                "index": i,
                "ticker": ticker,
                "title": article.title,
                "url": article.link,
                "source": article.source,
                "published_date": article.published,
                "summary": article.summary,
            }
            for i, article in enumerate(google_articles)
        ]

        context.log.info(f"Found {len(article_list)} articles for {ticker}")

        # De-duplicate and sort sources so the metadata string is deterministic
        # across runs (joining a raw set yields arbitrary ordering).
        sources = ", ".join(sorted({article["source"] for article in article_list}))

        # Log asset materialization
        context.log_event(
            AssetMaterialization(
                asset_key=f"google_news_articles_{ticker}",
                description=f"Fetched {len(article_list)} articles for {ticker}",
                metadata={
                    "ticker": MetadataValue.text(ticker),
                    "total_articles": MetadataValue.int(len(article_list)),
                    "sources": MetadataValue.text(sources),
                    "fetched_at": MetadataValue.text(datetime.now(timezone.utc).isoformat()),
                },
            )
        )

        return {
            "ticker": ticker,
            "articles": article_list,
            "status": "success",
            "total_found": len(article_list),
        }

    except Exception as e:
        # Per-ticker failures are reported in the payload rather than raised so
        # one bad ticker does not fail the whole run.
        context.log.error(f"Error fetching articles for {ticker}: {e}")
        return {
            "ticker": ticker,
            "articles": [],
            "status": "error",
            "error": str(e),
            "total_found": 0,
        }
|
|
|
|
|
|
def _scrape_content(context, scraper, url: str, article_data: dict[str, Any]):
    """Scrape the article body; fall back to the Google News summary on failure.

    Returns (scrape_result, content, author, publish_date).
    """
    scrape_result = scraper.scrape_article(url)
    if scrape_result.status in ["SUCCESS", "ARCHIVE_SUCCESS"]:
        content = scrape_result.content
        author = scrape_result.author
        publish_date = scrape_result.publish_date
        context.log.info(f"Successfully scraped {len(content)} characters")
    else:
        # Best-effort fallback: keep the pipeline moving with the feed summary.
        content = article_data.get("summary", "")
        author = ""
        publish_date = article_data.get("published_date", "")
        context.log.warning(
            f"Scraping failed, using summary: {scrape_result.status}"
        )
    return scrape_result, content, author, publish_date


def _analyze_sentiment(context, news_service, title: str, content: str) -> dict[str, Any]:
    """Run OpenRouter sentiment analysis; fall back to a neutral result on failure."""
    try:
        # Use real OpenRouter sentiment analysis
        openrouter_client = news_service._openrouter_client
        llm_result = openrouter_client.analyze_sentiment(f"{title} {content}")
        sentiment_result = {
            "sentiment": llm_result.sentiment,
            "confidence": llm_result.confidence,
            "reasoning": llm_result.reasoning or "LLM analysis complete",
        }
        context.log.info(
            f"Sentiment: {sentiment_result['sentiment']} (confidence: {sentiment_result['confidence']})"
        )
        return sentiment_result
    except Exception as e:
        context.log.warning(f"OpenRouter sentiment analysis failed: {e}, using fallback")
        return {
            "sentiment": "neutral",
            "confidence": 0.0,
            "reasoning": f"Analysis failed: {str(e)}",
        }


def _generate_embeddings(context, news_service, title: str, content: str) -> dict[str, Any]:
    """Create title/content embeddings; fall back to 1536-dim zero vectors on failure."""
    try:
        # Use real OpenRouter embeddings
        openrouter_client = news_service._openrouter_client
        title_embedding = openrouter_client.create_embedding(title)
        content_embedding = openrouter_client.create_embedding(content)
        vector_result = {
            "title_embedding": title_embedding,
            "content_embedding": content_embedding,
            "embedding_model": "text-embedding-3-small",
            "embedding_dimensions": len(title_embedding),
        }
        context.log.info(
            f"Generated {len(vector_result['title_embedding'])}-dim embeddings"
        )
        return vector_result
    except Exception as e:
        context.log.warning(f"OpenRouter embedding generation failed: {e}, using zero vectors")
        return {
            "title_embedding": [0.0] * 1536,
            "content_embedding": [0.0] * 1536,
            "embedding_model": "text-embedding-3-small",
            "embedding_dimensions": 1536,
            "error": str(e),
        }


def _store_article(
    context,
    news_service,
    article_data: dict[str, Any],
    title: str,
    url: str,
    content: str,
    author: str,
    publish_date: str,
    sentiment_result: dict[str, Any],
    vector_result: dict[str, Any],
) -> str:
    """Persist the processed article via the news repository.

    Returns "success" or "error"; storage failures are logged, not raised.
    """

    async def store() -> None:
        from datetime import date

        from tradingagents.domains.news.news_repository import NewsArticle

        # Convert the label/confidence pair to a signed score in [-1, 1].
        sentiment_confidence = sentiment_result.get("confidence", 0.0)
        sentiment_label = sentiment_result.get("sentiment", "neutral")
        if sentiment_label == "positive":
            sentiment_score = sentiment_confidence
        elif sentiment_label == "negative":
            sentiment_score = -sentiment_confidence
        else:
            sentiment_score = 0.0

        news_article = NewsArticle(
            headline=title,
            url=url,
            source=article_data["source"],
            # NOTE(review): assumes publish_date starts with an ISO date; a
            # non-ISO value raises here and is reported as a storage error.
            # "2025-01-01" is the hard-coded fallback for missing dates.
            published_date=date.fromisoformat(
                publish_date[:10] if publish_date else "2025-01-01"
            ),
            summary=content,
            author=author,
            sentiment_score=sentiment_score,
            sentiment_confidence=sentiment_confidence,
            sentiment_label=sentiment_label,
            title_embedding=vector_result.get("title_embedding"),
            content_embedding=vector_result.get("content_embedding"),
        )
        await news_service.repository.upsert_batch([news_article], article_data["ticker"])

    try:
        asyncio.run(store())
        context.log.info("Successfully stored article")
        return "success"
    except Exception as e:
        context.log.error(f"Error storing article: {e}")
        return "error"


@op
def fetch_and_process_article(
    context: OpExecutionContext, article_data: dict[str, Any]
) -> dict[str, Any]:
    """
    Complete processing pipeline for a single article:
    - Scrape content
    - LLM sentiment analysis
    - Vector embeddings
    - Store in database

    Each step degrades gracefully (summary fallback, neutral sentiment,
    zero vectors, storage-error flag) so one failing stage does not abort
    the article.

    Args:
        context: Dagster operation context
        article_data: Article information including url, title, ticker,
            index, source, and optional summary/published_date

    Returns:
        The input article_data merged with content, author, publish_date,
        scrape_status, sentiment, vectors, storage_status, and processed_at.
        On an unexpected top-level failure, an error-shaped dict with
        status fields set to "error".
    """
    try:
        url = article_data["url"]
        title = article_data["title"]
        ticker = article_data["ticker"]

        context.log.info(f"Processing article: {title[:50]}...")

        # Initialize NewsService
        # NOTE(review): built inline per-op; should become a Dagster resource.
        config = TradingAgentsConfig.from_env()
        news_service = NewsService.build(None, config)

        context.log.info("Step 1: Scraping content...")
        scrape_result, content, author, publish_date = _scrape_content(
            context, news_service.article_scraper, url, article_data
        )

        context.log.info("Step 2: Analyzing sentiment...")
        sentiment_result = _analyze_sentiment(context, news_service, title, content)

        context.log.info("Step 3: Generating embeddings...")
        vector_result = _generate_embeddings(context, news_service, title, content)

        context.log.info("Step 4: Storing in database...")
        storage_status = _store_article(
            context,
            news_service,
            article_data,
            title,
            url,
            content,
            author,
            publish_date,
            sentiment_result,
            vector_result,
        )

        # Return complete processed article
        processed_article = {
            **article_data,
            "content": content,
            "author": author,
            "publish_date": publish_date,
            "scrape_status": scrape_result.status,
            "sentiment": sentiment_result,
            "vectors": vector_result,
            "storage_status": storage_status,
            "processed_at": datetime.now(timezone.utc).isoformat(),
        }

        # Log asset materialization
        context.log_event(
            AssetMaterialization(
                asset_key=f"processed_article_{ticker}_{article_data['index']}",
                description=f"Completely processed article: {title[:50]}...",
                metadata={
                    "ticker": MetadataValue.text(ticker),
                    "url": MetadataValue.text(url),
                    "scrape_status": MetadataValue.text(scrape_result.status),
                    "sentiment": MetadataValue.text(sentiment_result["sentiment"]),
                    "content_length": MetadataValue.int(len(content)),
                    "storage_status": MetadataValue.text(storage_status),
                    "processed_at": MetadataValue.text(datetime.now(timezone.utc).isoformat()),
                },
            )
        )

        return processed_article

    except Exception as e:
        context.log.error(f"Error processing article {article_data['url']}: {e}")
        return {
            **article_data,
            "content": "",
            "scrape_status": "error",
            "sentiment": {
                "sentiment": "neutral",
                "confidence": 0.0,
                "reasoning": f"Error: {str(e)}",
            },
            "vectors": {
                "title_embedding": [],
                "content_embedding": [],
                "error": str(e),
            },
            "storage_status": "error",
            "error": str(e),
        }
|
|
|
|
|
|
@op
def collect_ticker_results(
    context: OpExecutionContext, processed_articles: list[dict[str, Any]]
) -> dict[str, Any]:
    """
    Collect and summarize results for a ticker.

    Args:
        context: Dagster operation context
        processed_articles: List of fully processed articles (all assumed to
            belong to the same ticker; the ticker is read from the first one)

    Returns:
        Summary dict with ticker, status, total_processed, successful_scrapes,
        successful_storage, sentiment_summary, and completion_time.
        {"status": "no_articles", ...} for empty input,
        {"status": "error", ...} on unexpected failure.
    """
    try:
        if not processed_articles:
            return {"status": "no_articles", "total_processed": 0}

        ticker = processed_articles[0]["ticker"]

        # Calculate statistics
        total_processed = len(processed_articles)
        successful_scrapes = sum(
            1
            for a in processed_articles
            if a.get("scrape_status") in ["SUCCESS", "ARCHIVE_SUCCESS"]
        )
        successful_storage = sum(
            1 for a in processed_articles if a.get("storage_status") == "success"
        )

        # Single-pass sentiment tally (Counter) instead of three .count() scans
        # over an intermediate list; unknown labels are ignored, as before.
        label_counts = Counter(
            a.get("sentiment", {}).get("sentiment", "neutral")
            for a in processed_articles
        )
        sentiment_counts = {
            "positive": label_counts["positive"],
            "negative": label_counts["negative"],
            "neutral": label_counts["neutral"],
        }

        results = {
            "ticker": ticker,
            "status": "completed",
            "total_processed": total_processed,
            "successful_scrapes": successful_scrapes,
            "successful_storage": successful_storage,
            "sentiment_summary": sentiment_counts,
            "completion_time": datetime.now(timezone.utc).isoformat(),
        }

        context.log.info(
            f"Completed {ticker}: {total_processed} articles, {successful_storage} stored"
        )

        # Log asset materialization
        context.log_event(
            AssetMaterialization(
                asset_key=f"ticker_results_{ticker}",
                description=f"Completed news processing for {ticker}",
                metadata={
                    "ticker": MetadataValue.text(results.get("ticker", "")),
                    "status": MetadataValue.text(results.get("status", "")),
                    "total_processed": MetadataValue.int(results.get("total_processed", 0)),
                    "successful_scrapes": MetadataValue.int(results.get("successful_scrapes", 0)),
                    "successful_storage": MetadataValue.int(results.get("successful_storage", 0)),
                    "completion_time": MetadataValue.text(results.get("completion_time", "")),
                },
            )
        )

        return results

    except Exception as e:
        context.log.error(f"Error collecting ticker results: {e}")
        return {"status": "error", "error": str(e)}
|
|
|
|
|
|
@op
def collect_all_results(
    context: OpExecutionContext, ticker_results: list[dict[str, Any]]
) -> dict[str, Any]:
    """
    Aggregate per-ticker summaries into a single run-level summary.

    Args:
        context: Dagster operation context
        ticker_results: List of per-ticker summary dicts

    Returns:
        Overall summary dict (status, counts, overall_sentiment,
        completion_time, ticker_results). {"status": "no_results", ...} for
        empty input, {"status": "error", ...} on unexpected failure.
    """
    try:
        if not ticker_results:
            return {"status": "no_results", "total_tickers": 0}

        # Accumulate all run-level statistics in one pass over the summaries.
        successful_tickers = 0
        total_articles = 0
        total_stored = 0
        overall_sentiment = {"positive": 0, "negative": 0, "neutral": 0}
        for summary in ticker_results:
            if summary.get("status") == "completed":
                successful_tickers += 1
            total_articles += summary.get("total_processed", 0)
            total_stored += summary.get("successful_storage", 0)
            per_ticker_sentiment = summary.get("sentiment_summary", {})
            for label in overall_sentiment:
                overall_sentiment[label] += per_ticker_sentiment.get(label, 0)

        total_tickers = len(ticker_results)
        results = {
            "status": "completed",
            "total_tickers": total_tickers,
            "successful_tickers": successful_tickers,
            "total_articles": total_articles,
            "total_stored": total_stored,
            "overall_sentiment": overall_sentiment,
            "completion_time": datetime.now(timezone.utc).isoformat(),
            "ticker_results": ticker_results,
        }

        context.log.info(
            f"Completed all tickers: {total_tickers} tickers, {total_articles} articles, {total_stored} stored"
        )

        # Emit a materialization so the run summary shows up as an asset.
        context.log_event(
            AssetMaterialization(
                asset_key="daily_news_collection_summary",
                description="Completed daily news collection for all tickers",
                metadata={
                    "status": MetadataValue.text(results.get("status", "")),
                    "total_tickers": MetadataValue.int(results.get("total_tickers", 0)),
                    "successful_tickers": MetadataValue.int(results.get("successful_tickers", 0)),
                    "total_articles": MetadataValue.int(results.get("total_articles", 0)),
                    "total_stored": MetadataValue.int(results.get("total_stored", 0)),
                    "completion_time": MetadataValue.text(results.get("completion_time", "")),
                },
            )
        )

        return results

    except Exception as e:
        context.log.error(f"Error collecting all results: {e}")
        return {"status": "error", "error": str(e)}
|