# TradingAgents/tradingagents/workflows/ops.py
"""
Dagster operations for TradingAgents news collection workflow.
"""

import asyncio
import logging
from datetime import datetime, timezone
from typing import Any

from dagster import (
    AssetMaterialization,
    OpExecutionContext,
    op,
)

from tradingagents.config import TradingAgentsConfig
from tradingagents.domains.news.news_service import NewsService

logger = logging.getLogger(__name__)


@op
def get_tracked_tickers(context: OpExecutionContext) -> list[str]:
    """
    Get the list of tickers to process. Currently a hardcoded default list.

    Returns:
        List of ticker symbols to process
    """
    try:
        # Default ticker list - can be made configurable (see the sketch below)
        tickers = ["AAPL", "GOOGL", "MSFT", "TSLA"]
        context.log.info(f"Processing {len(tickers)} tickers: {tickers}")
        return tickers
    except Exception as e:
        context.log.error(f"Error getting tracked tickers: {e}")
        raise
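

# A config-driven variant (sketch only, not wired into any job here): Dagster's
# Pydantic-based `Config` lets the ticker list come from run config instead of
# being hardcoded. `TickerConfig` and `get_tracked_tickers_configurable` are
# illustrative names, not existing APIs of this project.
from dagster import Config  # imported here to keep the sketch self-contained


class TickerConfig(Config):
    """Run-config schema for the ticker list (illustrative)."""

    tickers: list[str] = ["AAPL", "GOOGL", "MSFT", "TSLA"]


@op
def get_tracked_tickers_configurable(
    context: OpExecutionContext, config: TickerConfig
) -> list[str]:
    """Config-driven alternative to get_tracked_tickers (sketch)."""
    context.log.info(f"Processing {len(config.tickers)} tickers: {config.tickers}")
    return config.tickers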


@op
def fetch_google_news_articles(
    context: OpExecutionContext, ticker: str
) -> dict[str, Any]:
    """
    Fetch news articles for a single ticker from Google News.

    Args:
        context: Dagster operation context
        ticker: Stock ticker symbol

    Returns:
        Dictionary with ticker and article list
    """
    try:
        context.log.info(f"Fetching articles for ticker: {ticker}")

        # Initialize NewsService
        config = TradingAgentsConfig.from_env()
        news_service = NewsService.build(None, config)  # Will be replaced with resource

        # Get Google News articles
        google_client = news_service.google_client
        google_articles = google_client.get_company_news(ticker)

        if not google_articles:
            context.log.warning(f"No articles found for {ticker}")
            return {
                "ticker": ticker,
                "articles": [],
                "status": "no_articles",
                "total_found": 0,
            }

        # Convert to simple dict format
        article_list = []
        for i, article in enumerate(google_articles):
            article_list.append(
                {
                    "index": i,
                    "ticker": ticker,
                    "title": article.title,
                    "url": article.link,
                    "source": article.source,
                    "published_date": article.published,
                    "summary": article.summary,
                }
            )

        context.log.info(f"Found {len(article_list)} articles for {ticker}")

        # Log asset materialization
        context.log_event(
            AssetMaterialization(
                asset_key=f"google_news_articles_{ticker}",
                description=f"Fetched {len(article_list)} articles for {ticker}",
                metadata={
                    "ticker": ticker,
                    "total_articles": len(article_list),
"sources": {article["source"] for article in article_list},
"fetched_at": datetime.now(timezone.utc).isoformat(),
},
)
)
return {
"ticker": ticker,
"articles": article_list,
"status": "success",
"total_found": len(article_list),
}
except Exception as e:
context.log.error(f"Error fetching articles for {ticker}: {e}")
return {
"ticker": ticker,
"articles": [],
"status": "error",
"error": str(e),
"total_found": 0,
}
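

# Resource sketch: the op above constructs NewsService inline
# (NewsService.build(None, config)), and the inline comment notes this will be
# replaced with a Dagster resource. One possible shape for that, using
# Dagster's ConfigurableResource -- all names below are illustrative
# assumptions, not existing project APIs:
#
#     from dagster import ConfigurableResource
#
#     class NewsServiceResource(ConfigurableResource):
#         def get_service(self) -> NewsService:
#             return NewsService.build(None, TradingAgentsConfig.from_env())
#
#     @op
#     def fetch_google_news_articles(
#         context: OpExecutionContext, news: NewsServiceResource, ticker: str
#     ) -> dict[str, Any]:
#         news_service = news.get_service()
#         ...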


@op
def fetch_and_process_article(
    context: OpExecutionContext, article_data: dict[str, Any]
) -> dict[str, Any]:
    """
    Complete processing pipeline for a single article:
    - Scrape content
    - LLM sentiment analysis
    - Vector embeddings
    - Store in database

    Args:
        context: Dagster operation context
        article_data: Article information including URL

    Returns:
        Processed article data with all processing results
    """
    try:
        url = article_data["url"]
        title = article_data["title"]
        ticker = article_data["ticker"]
        context.log.info(f"Processing article: {title[:50]}...")

        # Initialize NewsService
        config = TradingAgentsConfig.from_env()
        news_service = NewsService.build(None, config)
        scraper = news_service.article_scraper

        # Step 1: Scrape content
        context.log.info("Step 1: Scraping content...")
        scrape_result = scraper.scrape_article(url)
        if scrape_result.status in ["SUCCESS", "ARCHIVE_SUCCESS"]:
            content = scrape_result.content
            author = scrape_result.author
            publish_date = scrape_result.publish_date
            context.log.info(f"Successfully scraped {len(content)} characters")
        else:
            content = article_data.get("summary", "")
            author = ""
            publish_date = article_data.get("published_date", "")
            context.log.warning(
                f"Scraping failed, using summary: {scrape_result.status}"
            )

        # Step 2: LLM Sentiment Analysis
        context.log.info("Step 2: Analyzing sentiment...")
        sentiment_result = {
            "sentiment": "positive",  # TODO: Implement OpenRouter LLM
            "confidence": 0.75,  # TODO: Implement OpenRouter LLM
            "reasoning": "LLM analysis placeholder",
        }
        context.log.info(
            f"Sentiment: {sentiment_result['sentiment']} (confidence: {sentiment_result['confidence']})"
        )
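
        # One way to replace the placeholder above (sketch only): OpenRouter
        # exposes an OpenAI-compatible chat API, so the `openai` client pointed
        # at https://openrouter.ai/api/v1 can run the classification. The model
        # name, prompt, and `openrouter_api_key` config field are illustrative
        # assumptions, not existing project APIs:
        #
        #     from openai import OpenAI
        #
        #     client = OpenAI(
        #         base_url="https://openrouter.ai/api/v1",
        #         api_key=config.openrouter_api_key,  # hypothetical config field
        #     )
        #     response = client.chat.completions.create(
        #         model="openai/gpt-4o-mini",
        #         messages=[{
        #             "role": "user",
        #             "content": (
        #                 f"Classify the sentiment of this article toward {ticker} "
        #                 f"as positive, negative, or neutral:\n{content[:4000]}"
        #             ),
        #         }],
        #     )
        #     sentiment_label = response.choices[0].message.content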

        # Step 3: Vector Embeddings
        context.log.info("Step 3: Generating embeddings...")
        vector_result = {
            "title_embedding": [0.0] * 1536,  # TODO: Implement OpenAI embeddings
            "content_embedding": [0.0] * 1536,  # TODO: Implement OpenAI embeddings
            "embedding_model": "text-embedding-3-small",
            "embedding_dimensions": 1536,
        }
        context.log.info(
            f"Generated {len(vector_result['title_embedding'])}-dim embeddings"
        )
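
        # A sketch of real embeddings, assuming an OpenAI key in the
        # environment. text-embedding-3-small returns 1536-dimensional vectors,
        # matching the placeholder dimensions above:
        #
        #     from openai import OpenAI
        #
        #     client = OpenAI()  # reads OPENAI_API_KEY from the environment
        #     resp = client.embeddings.create(
        #         model="text-embedding-3-small",
        #         input=[title, content[:8000]],
        #     )
        #     title_vec, content_vec = (d.embedding for d in resp.data)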

        # Step 4: Store in database
        context.log.info("Step 4: Storing in database...")

        async def store_article():
            from datetime import date

            from tradingagents.domains.news.news_repository import NewsArticle

            news_article = NewsArticle(
                headline=title,
                url=url,
                source=article_data["source"],
                published_date=date.fromisoformat(
                    publish_date[:10] if publish_date else "2025-01-01"
                ),
                summary=content,
                author=author,
            )
            repository = news_service.repository
            await repository.upsert_batch([news_article], ticker)

        try:
            asyncio.run(store_article())
            storage_status = "success"
            context.log.info("Successfully stored article")
        except Exception as e:
            storage_status = "error"
            context.log.error(f"Error storing article: {e}")

        # Return complete processed article
        processed_article = {
            **article_data,
            "content": content,
            "author": author,
            "publish_date": publish_date,
            "scrape_status": scrape_result.status,
            "sentiment": sentiment_result,
            "vectors": vector_result,
            "storage_status": storage_status,
            "processed_at": datetime.now(timezone.utc).isoformat(),
        }

        # Log asset materialization
        context.log_event(
            AssetMaterialization(
                asset_key=f"processed_article_{ticker}_{article_data['index']}",
                description=f"Completely processed article: {title[:50]}...",
                metadata={
                    "ticker": ticker,
                    "url": url,
                    "scrape_status": scrape_result.status,
                    "sentiment": sentiment_result["sentiment"],
                    "content_length": len(content),
                    "storage_status": storage_status,
                    "processed_at": datetime.now(timezone.utc).isoformat(),
                },
            )
        )

        return processed_article
    except Exception as e:
        context.log.error(f"Error processing article {article_data['url']}: {e}")
        return {
            **article_data,
            "content": "",
            "scrape_status": "error",
            "sentiment": {
                "sentiment": "neutral",
                "confidence": 0.0,
                "reasoning": f"Error: {str(e)}",
            },
            "vectors": {
                "title_embedding": [],
                "content_embedding": [],
                "error": str(e),
            },
            "storage_status": "error",
            "error": str(e),
        }


@op
def collect_ticker_results(
    context: OpExecutionContext, processed_articles: list[dict[str, Any]]
) -> dict[str, Any]:
    """
    Collect and summarize results for a ticker.

    Args:
        context: Dagster operation context
        processed_articles: List of fully processed articles

    Returns:
        Summary results for the ticker
    """
    try:
        if not processed_articles:
            return {"status": "no_articles", "total_processed": 0}

        ticker = processed_articles[0]["ticker"]

        # Calculate statistics
        total_processed = len(processed_articles)
        successful_scrapes = sum(
            1
            for a in processed_articles
            if a.get("scrape_status") in ["SUCCESS", "ARCHIVE_SUCCESS"]
        )
        successful_storage = sum(
            1 for a in processed_articles if a.get("storage_status") == "success"
        )

        # Sentiment analysis
        sentiments = [
            a.get("sentiment", {}).get("sentiment", "neutral")
            for a in processed_articles
        ]
        sentiment_counts = {
            "positive": sentiments.count("positive"),
            "negative": sentiments.count("negative"),
            "neutral": sentiments.count("neutral"),
        }

        results = {
            "ticker": ticker,
            "status": "completed",
            "total_processed": total_processed,
            "successful_scrapes": successful_scrapes,
            "successful_storage": successful_storage,
            "sentiment_summary": sentiment_counts,
            "completion_time": datetime.now(timezone.utc).isoformat(),
        }

        context.log.info(
            f"Completed {ticker}: {total_processed} articles, {successful_storage} stored"
        )

        # Log asset materialization
        context.log_event(
            AssetMaterialization(
                asset_key=f"ticker_results_{ticker}",
                description=f"Completed news processing for {ticker}",
                metadata=results,
            )
        )

        return results
    except Exception as e:
        context.log.error(f"Error collecting ticker results: {e}")
        return {"status": "error", "error": str(e)}


@op
def collect_all_results(
    context: OpExecutionContext, ticker_results: list[dict[str, Any]]
) -> dict[str, Any]:
    """
    Collect and summarize results for all tickers.

    Args:
        context: Dagster operation context
        ticker_results: List of ticker result summaries

    Returns:
        Overall summary results
    """
    try:
        if not ticker_results:
            return {"status": "no_results", "total_tickers": 0}

        # Calculate overall statistics
        total_tickers = len(ticker_results)
        successful_tickers = sum(
            1 for r in ticker_results if r.get("status") == "completed"
        )
        total_articles = sum(r.get("total_processed", 0) for r in ticker_results)
        total_stored = sum(r.get("successful_storage", 0) for r in ticker_results)

        # Aggregate sentiment data
        overall_sentiment = {
            "positive": sum(
                r.get("sentiment_summary", {}).get("positive", 0)
                for r in ticker_results
            ),
            "negative": sum(
                r.get("sentiment_summary", {}).get("negative", 0)
                for r in ticker_results
            ),
            "neutral": sum(
                r.get("sentiment_summary", {}).get("neutral", 0)
                for r in ticker_results
            ),
        }

        results = {
            "status": "completed",
            "total_tickers": total_tickers,
            "successful_tickers": successful_tickers,
            "total_articles": total_articles,
            "total_stored": total_stored,
            "overall_sentiment": overall_sentiment,
            "completion_time": datetime.now(timezone.utc).isoformat(),
            "ticker_results": ticker_results,
        }

        context.log.info(
            f"Completed all tickers: {total_tickers} tickers, {total_articles} articles, {total_stored} stored"
        )

        # Log asset materialization
        context.log_event(
            AssetMaterialization(
                asset_key="daily_news_collection_summary",
                description="Completed daily news collection for all tickers",
                metadata=results,
            )
        )

        return results
    except Exception as e:
        context.log.error(f"Error collecting all results: {e}")
        return {"status": "error", "error": str(e)}