TradingAgents/tradingagents/workflows/ops.py

480 lines
16 KiB
Python

"""
Dagster operations for TradingAgents news collection workflow.
"""
import asyncio
import logging
from datetime import datetime, timezone
from typing import Any
from dagster import (
AssetMaterialization,
OpExecutionContext,
op,
MetadataValue,
)
from tradingagents.config import TradingAgentsConfig
from tradingagents.domains.news.news_service import NewsService
logger = logging.getLogger(__name__)
@op
def get_tracked_tickers(context: OpExecutionContext) -> list[str]:
    """
    Get list of tickers to process from configuration.

    The list is currently hard-coded; wiring it to run config or a
    Dagster resource is a known follow-up.

    Returns:
        List of ticker symbols to process
    """
    # FIX: dropped the original try/except wrapper — neither building a
    # literal list nor logging can realistically raise, so the handler
    # was dead code that only obscured the function.
    tickers = ["AAPL", "GOOGL", "MSFT", "TSLA"]
    context.log.info(f"Processing {len(tickers)} tickers: {tickers}")
    return tickers
@op
def fetch_google_news_articles(
    context: OpExecutionContext, ticker: str
) -> dict[str, Any]:
    """
    Fetch news articles for a single ticker from Google News.

    Args:
        context: Dagster operation context
        ticker: Stock ticker symbol

    Returns:
        Dictionary with ticker, article list, "status"
        ("success" / "no_articles" / "error") and "total_found".
        Failures are reported in the return value rather than raised, so
        one bad ticker does not abort the whole run.
    """
    try:
        context.log.info(f"Fetching articles for ticker: {ticker}")
        # Initialize NewsService
        # NOTE(review): service is rebuilt on every invocation; the inline
        # comment below says a Dagster resource is the intended replacement.
        config = TradingAgentsConfig.from_env()
        news_service = NewsService.build(None, config)  # Will be replaced with resource
        # Get Google News articles
        google_client = news_service.google_client
        google_articles = google_client.get_company_news(ticker)
        if not google_articles:
            context.log.warning(f"No articles found for {ticker}")
            return {
                "ticker": ticker,
                "articles": [],
                "status": "no_articles",
                "total_found": 0,
            }
        # Convert to simple dict format for downstream ops; "index" is used
        # later to build per-article asset keys.
        article_list = [
            {
                "index": i,
                "ticker": ticker,
                "title": article.title,
                "url": article.link,
                "source": article.source,
                "published_date": article.published,
                "summary": article.summary,
            }
            for i, article in enumerate(google_articles)
        ]
        context.log.info(f"Found {len(article_list)} articles for {ticker}")
        # FIX: sort the de-duplicated source names so the metadata string is
        # deterministic — set iteration order is arbitrary, which made this
        # field differ between otherwise identical runs.
        unique_sources = sorted({article["source"] for article in article_list})
        # Log asset materialization
        context.log_event(
            AssetMaterialization(
                asset_key=f"google_news_articles_{ticker}",
                description=f"Fetched {len(article_list)} articles for {ticker}",
                metadata={
                    "ticker": MetadataValue.text(ticker),
                    "total_articles": MetadataValue.int(len(article_list)),
                    "sources": MetadataValue.text(", ".join(unique_sources)),
                    "fetched_at": MetadataValue.text(datetime.now(timezone.utc).isoformat()),
                },
            )
        )
        return {
            "ticker": ticker,
            "articles": article_list,
            "status": "success",
            "total_found": len(article_list),
        }
    except Exception as e:
        context.log.error(f"Error fetching articles for {ticker}: {e}")
        return {
            "ticker": ticker,
            "articles": [],
            "status": "error",
            "error": str(e),
            "total_found": 0,
        }
@op
def fetch_and_process_article(
    context: OpExecutionContext, article_data: dict[str, Any]
) -> dict[str, Any]:
    """
    Complete processing pipeline for a single article:
    - Scrape content
    - LLM sentiment analysis
    - Vector embeddings
    - Store in database

    Each step degrades gracefully: scrape failure falls back to the Google
    News summary, LLM/embedding failures fall back to neutral/zero-vector
    placeholders, and storage failure is recorded in "storage_status".
    Only an unexpected error outside those handlers produces the terminal
    error-shaped return at the bottom.

    Args:
        context: Dagster operation context
        article_data: Article information including URL (dict produced by
            fetch_google_news_articles: "url", "title", "ticker", "index",
            plus optional "summary" / "published_date" fallbacks)

    Returns:
        Processed article data with all processing results
    """
    try:
        url = article_data["url"]
        title = article_data["title"]
        ticker = article_data["ticker"]
        context.log.info(f"Processing article: {title[:50]}...")
        # Initialize NewsService
        # NOTE(review): built fresh per article; sharing via a Dagster
        # resource would avoid repeated client construction.
        config = TradingAgentsConfig.from_env()
        news_service = NewsService.build(None, config)
        scraper = news_service.article_scraper
        # Step 1: Scrape content
        context.log.info("Step 1: Scraping content...")
        scrape_result = scraper.scrape_article(url)
        if scrape_result.status in ["SUCCESS", "ARCHIVE_SUCCESS"]:
            content = scrape_result.content
            author = scrape_result.author
            publish_date = scrape_result.publish_date
            context.log.info(f"Successfully scraped {len(content)} characters")
        else:
            # Fall back to the feed-provided summary and date when the
            # scraper could not retrieve the page.
            content = article_data.get("summary", "")
            author = ""
            publish_date = article_data.get("published_date", "")
            context.log.warning(
                f"Scraping failed, using summary: {scrape_result.status}"
            )
        # Step 2: LLM Sentiment Analysis
        context.log.info("Step 2: Analyzing sentiment...")
        try:
            # Use real OpenRouter sentiment analysis
            # NOTE(review): reaches into a private attribute of NewsService;
            # a public accessor would be cleaner — confirm intended API.
            openrouter_client = news_service._openrouter_client
            sentiment_llm_result = openrouter_client.analyze_sentiment(f"{title} {content}")
            sentiment_result = {
                "sentiment": sentiment_llm_result.sentiment,
                "confidence": sentiment_llm_result.confidence,
                "reasoning": sentiment_llm_result.reasoning or "LLM analysis complete",
            }
            context.log.info(
                f"Sentiment: {sentiment_result['sentiment']} (confidence: {sentiment_result['confidence']})"
            )
        except Exception as e:
            # Neutral fallback keeps the pipeline moving on LLM failure.
            context.log.warning(f"OpenRouter sentiment analysis failed: {e}, using fallback")
            sentiment_result = {
                "sentiment": "neutral",
                "confidence": 0.0,
                "reasoning": f"Analysis failed: {str(e)}",
            }
        # Step 3: Vector Embeddings
        context.log.info("Step 3: Generating embeddings...")
        try:
            # Use real OpenRouter embeddings
            openrouter_client = news_service._openrouter_client
            title_embedding = openrouter_client.create_embedding(title)
            content_embedding = openrouter_client.create_embedding(content)
            vector_result = {
                "title_embedding": title_embedding,
                "content_embedding": content_embedding,
                "embedding_model": "text-embedding-3-small",
                "embedding_dimensions": len(title_embedding),
            }
            context.log.info(
                f"Generated {len(vector_result['title_embedding'])}-dim embeddings"
            )
        except Exception as e:
            # Zero vectors as placeholders; the 1536 width matches the
            # advertised text-embedding-3-small dimensionality.
            context.log.warning(f"OpenRouter embedding generation failed: {e}, using zero vectors")
            vector_result = {
                "title_embedding": [0.0] * 1536,
                "content_embedding": [0.0] * 1536,
                "embedding_model": "text-embedding-3-small",
                "embedding_dimensions": 1536,
                "error": str(e),
            }
        # Step 4: Store in database
        context.log.info("Step 4: Storing in database...")
        async def store_article():
            """Build a NewsArticle row from the step results and upsert it."""
            from datetime import date
            from tradingagents.domains.news.news_repository import NewsArticle
            # Convert sentiment result to database format: signed score in
            # [-1, 1] derived from label direction times confidence.
            sentiment_score = None
            sentiment_confidence = sentiment_result.get("confidence", 0.0)
            sentiment_label = sentiment_result.get("sentiment", "neutral")
            if sentiment_label == "positive":
                sentiment_score = sentiment_confidence
            elif sentiment_label == "negative":
                sentiment_score = -sentiment_confidence
            else:
                sentiment_score = 0.0
            # NOTE(review): assumes publish_date is an ISO-8601 string
            # (slicing [:10] takes YYYY-MM-DD) — a datetime object here would
            # raise inside this coroutine and be reported as a storage error.
            # The "2025-01-01" fallback for a missing date looks arbitrary;
            # confirm the intended sentinel.
            news_article = NewsArticle(
                headline=title,
                url=url,
                source=article_data["source"],
                published_date=date.fromisoformat(
                    publish_date[:10] if publish_date else "2025-01-01"
                ),
                summary=content,
                author=author,
                sentiment_score=sentiment_score,
                sentiment_confidence=sentiment_confidence,
                sentiment_label=sentiment_label,
                title_embedding=vector_result.get("title_embedding"),
                content_embedding=vector_result.get("content_embedding"),
            )
            repository = news_service.repository
            await repository.upsert_batch([news_article], ticker)
        try:
            # NOTE(review): asyncio.run assumes no event loop is already
            # running in this thread — confirm for the Dagster executor used.
            asyncio.run(store_article())
            storage_status = "success"
            context.log.info("Successfully stored article")
        except Exception as e:
            storage_status = "error"
            context.log.error(f"Error storing article: {e}")
        # Return complete processed article
        processed_article = {
            **article_data,
            "content": content,
            "author": author,
            "publish_date": publish_date,
            "scrape_status": scrape_result.status,
            "sentiment": sentiment_result,
            "vectors": vector_result,
            "storage_status": storage_status,
            "processed_at": datetime.now(timezone.utc).isoformat(),
        }
        # Log asset materialization
        context.log_event(
            AssetMaterialization(
                asset_key=f"processed_article_{ticker}_{article_data['index']}",
                description=f"Completely processed article: {title[:50]}...",
                metadata={
                    "ticker": MetadataValue.text(ticker),
                    "url": MetadataValue.text(url),
                    "scrape_status": MetadataValue.text(scrape_result.status),
                    "sentiment": MetadataValue.text(sentiment_result["sentiment"]),
                    "content_length": MetadataValue.int(len(content)),
                    "storage_status": MetadataValue.text(storage_status),
                    "processed_at": MetadataValue.text(datetime.now(timezone.utc).isoformat()),
                },
            )
        )
        return processed_article
    except Exception as e:
        # Terminal fallback: return an error-shaped record instead of
        # raising, so downstream aggregation ops still receive one entry
        # per article.
        context.log.error(f"Error processing article {article_data['url']}: {e}")
        return {
            **article_data,
            "content": "",
            "scrape_status": "error",
            "sentiment": {
                "sentiment": "neutral",
                "confidence": 0.0,
                "reasoning": f"Error: {str(e)}",
            },
            "vectors": {
                "title_embedding": [],
                "content_embedding": [],
                "error": str(e),
            },
            "storage_status": "error",
            "error": str(e),
        }
@op
def collect_ticker_results(
    context: OpExecutionContext, processed_articles: list[dict[str, Any]]
) -> dict[str, Any]:
    """
    Summarize the processing outcome for a single ticker.

    Args:
        context: Dagster operation context
        processed_articles: List of fully processed articles

    Returns:
        Summary dict with scrape/storage counts and a sentiment breakdown,
        or an error-shaped dict if summarization itself fails.
    """
    try:
        if not processed_articles:
            return {"status": "no_articles", "total_processed": 0}
        ticker = processed_articles[0]["ticker"]
        # Tally everything in a single pass over the articles.
        scrape_ok = 0
        stored_ok = 0
        sentiment_counts = {"positive": 0, "negative": 0, "neutral": 0}
        for article in processed_articles:
            if article.get("scrape_status") in ["SUCCESS", "ARCHIVE_SUCCESS"]:
                scrape_ok += 1
            if article.get("storage_status") == "success":
                stored_ok += 1
            label = article.get("sentiment", {}).get("sentiment", "neutral")
            if label in sentiment_counts:
                sentiment_counts[label] += 1
        results = {
            "ticker": ticker,
            "status": "completed",
            "total_processed": len(processed_articles),
            "successful_scrapes": scrape_ok,
            "successful_storage": stored_ok,
            "sentiment_summary": sentiment_counts,
            "completion_time": datetime.now(timezone.utc).isoformat(),
        }
        context.log.info(
            f"Completed {ticker}: {len(processed_articles)} articles, {stored_ok} stored"
        )
        # Log asset materialization
        context.log_event(
            AssetMaterialization(
                asset_key=f"ticker_results_{ticker}",
                description=f"Completed news processing for {ticker}",
                metadata={
                    "ticker": MetadataValue.text(results.get("ticker", "")),
                    "status": MetadataValue.text(results.get("status", "")),
                    "total_processed": MetadataValue.int(results.get("total_processed", 0)),
                    "successful_scrapes": MetadataValue.int(results.get("successful_scrapes", 0)),
                    "successful_storage": MetadataValue.int(results.get("successful_storage", 0)),
                    "completion_time": MetadataValue.text(results.get("completion_time", "")),
                },
            )
        )
        return results
    except Exception as e:
        context.log.error(f"Error collecting ticker results: {e}")
        return {"status": "error", "error": str(e)}
@op
def collect_all_results(
    context: OpExecutionContext, ticker_results: list[dict[str, Any]]
) -> dict[str, Any]:
    """
    Roll up per-ticker summaries into one run-level summary.

    Args:
        context: Dagster operation context
        ticker_results: List of ticker result summaries

    Returns:
        Overall summary dict (counts, aggregated sentiment, per-ticker
        details), or an error-shaped dict if aggregation itself fails.
    """
    try:
        if not ticker_results:
            return {"status": "no_results", "total_tickers": 0}
        # Accumulate all run-level statistics in one pass.
        completed = 0
        articles_total = 0
        stored_total = 0
        overall_sentiment = {"positive": 0, "negative": 0, "neutral": 0}
        for summary in ticker_results:
            if summary.get("status") == "completed":
                completed += 1
            articles_total += summary.get("total_processed", 0)
            stored_total += summary.get("successful_storage", 0)
            breakdown = summary.get("sentiment_summary", {})
            for label in overall_sentiment:
                overall_sentiment[label] += breakdown.get(label, 0)
        results = {
            "status": "completed",
            "total_tickers": len(ticker_results),
            "successful_tickers": completed,
            "total_articles": articles_total,
            "total_stored": stored_total,
            "overall_sentiment": overall_sentiment,
            "completion_time": datetime.now(timezone.utc).isoformat(),
            "ticker_results": ticker_results,
        }
        context.log.info(
            f"Completed all tickers: {len(ticker_results)} tickers, {articles_total} articles, {stored_total} stored"
        )
        # Log asset materialization
        context.log_event(
            AssetMaterialization(
                asset_key="daily_news_collection_summary",
                description="Completed daily news collection for all tickers",
                metadata={
                    "status": MetadataValue.text(results.get("status", "")),
                    "total_tickers": MetadataValue.int(results.get("total_tickers", 0)),
                    "successful_tickers": MetadataValue.int(results.get("successful_tickers", 0)),
                    "total_articles": MetadataValue.int(results.get("total_articles", 0)),
                    "total_stored": MetadataValue.int(results.get("total_stored", 0)),
                    "completion_time": MetadataValue.text(results.get("completion_time", "")),
                },
            )
        )
        return results
    except Exception as e:
        context.log.error(f"Error collecting all results: {e}")
        return {"status": "error", "error": str(e)}