# tradingagents/workflows/news_assets.py
"""
Dagster assets for TradingAgents news collection workflow.
Replaces the op-based approach with declarative assets.
"""

import asyncio
import logging
from datetime import date, datetime, timezone

import pandas as pd
from dagster import (
    AssetExecutionContext,
    DailyPartitionsDefinition,
    MetadataValue,
    asset,
)

from tradingagents.config import TradingAgentsConfig
from tradingagents.domains.news.news_repository import NewsArticle
from tradingagents.domains.news.news_service import (
    ArticleData,
    NewsService,
)

logger = logging.getLogger(__name__)

# Daily partitions for time-series data
DAILY_PARTITIONS = DailyPartitionsDefinition(start_date="2024-01-01")
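
# Asset lineage (each asset below is daily-partitioned and feeds the next):
#   raw_google_news_feeds -> scraped_article_content -> article_sentiment_analysis
#   -> article_vector_embeddings -> news_articles_table
#   -> daily_sentiment_summary / trending_topics_analysis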


@asset(partitions_def=DAILY_PARTITIONS)
def raw_google_news_feeds(context: AssetExecutionContext) -> pd.DataFrame:
    """
    Raw RSS feeds from Google News by ticker and date.

    This asset fetches raw article metadata from Google News RSS feeds
    for all tracked tickers on the given partition date.
    """
    partition_date = context.partition_key
    context.log.info(f"Fetching raw Google News feeds for {partition_date}")

    # Initialize NewsService
    config = TradingAgentsConfig.from_env()
    news_service = NewsService.build(None, config)
    google_client = news_service.google_client

    # Get tracked tickers
    tickers = ["AAPL", "GOOGL", "MSFT", "TSLA"]  # TODO: Make configurable

    # Collect all articles
    all_articles = []
    for ticker in tickers:
        try:
            context.log.info(f"Fetching articles for {ticker}")
            google_articles = google_client.get_company_news(ticker)
            if not google_articles:
                context.log.warning(f"No articles found for {ticker}")
                continue
            # Convert to DataFrame format
            for article in google_articles:
                all_articles.append(
                    {
                        "ticker": ticker,
                        "title": article.title,
                        "url": article.link,
                        "source": article.source,
                        "published_date": article.published,
                        "summary": article.summary,
                        "fetch_date": partition_date,
                        "fetch_timestamp": datetime.now(timezone.utc).isoformat(),
                    }
                )
        except Exception as e:
            context.log.error(f"Error fetching articles for {ticker}: {e}")
            continue

    # Create DataFrame
    df = pd.DataFrame(all_articles)
    if df.empty:
        context.log.warning("No articles found for any tickers")
        return df

    # Log metadata
    context.add_output_metadata(
        {
            "total_articles": len(df),
            "tickers": df["ticker"].unique().tolist(),
            "sources": df["source"].unique().tolist(),
            "fetch_date": partition_date,
            "preview": MetadataValue.md(
                df.head().to_markdown() if not df.empty else "No data"
            ),
        }
    )
    context.log.info(f"Fetched {len(df)} raw articles for {partition_date}")
    return df


@asset(partitions_def=DAILY_PARTITIONS)
def scraped_article_content(
    context: AssetExecutionContext, raw_google_news_feeds: pd.DataFrame
) -> pd.DataFrame:
    """
    Full article content extracted via newspaper4k.

    This asset takes the raw RSS feeds and scrapes the full article content
    from each URL, handling paywalls and extraction failures gracefully.
    """
    partition_date = context.partition_key
    context.log.info(f"Scraping article content for {partition_date}")

    if raw_google_news_feeds.empty:
        context.log.warning("No raw articles to scrape")
        return pd.DataFrame()

    # Initialize scraper
    config = TradingAgentsConfig.from_env()
    news_service = NewsService.build(None, config)
    scraper = news_service.article_scraper

    # Process each article
    scraped_articles = []
    for idx, row in raw_google_news_feeds.iterrows():
        url = str(row["url"])
        ticker = str(row["ticker"])
        title = str(row["title"])
        try:
            context.log.info(
                f"Scraping article {idx + 1}/{len(raw_google_news_feeds)}: {title[:50]}..."
            )
            # Scrape content
            scrape_result = scraper.scrape_article(url)
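            # SUCCESS and ARCHIVE_SUCCESS are treated as full-content wins;
            # any other status falls back to the RSS summary below.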
            if scrape_result.status in ["SUCCESS", "ARCHIVE_SUCCESS"]:
                content = scrape_result.content or ""
                author = scrape_result.author or ""
                publish_date = scrape_result.publish_date or ""
                scrape_status = scrape_result.status
            else:
                # Fallback to RSS data
                content = str(row.get("summary", ""))
                author = ""
                publish_date = str(row.get("published_date", ""))
                scrape_status = "rss_fallback"
                context.log.warning(
                    f"Scraping failed for {url}, using RSS summary: {scrape_result.status}"
                )
            # Create enhanced article record
            scraped_articles.append(
                {
                    "ticker": ticker,
                    "title": title,
                    "url": url,
                    "source": str(row["source"]),
                    "published_date": publish_date,
                    "author": author,
                    "content": content,
                    "summary": str(row["summary"]),  # Keep original summary
                    "scrape_status": scrape_status,
                    "content_length": len(content) if content else 0,
                    "fetch_date": partition_date,
                    "scraped_timestamp": datetime.now(timezone.utc).isoformat(),
                }
            )
        except Exception as e:
            context.log.error(f"Error scraping article {url}: {e}")
            # Add failed record
            scraped_articles.append(
                {
                    "ticker": ticker,
                    "title": title,
                    "url": url,
                    "source": str(row["source"]),
                    "published_date": str(row.get("published_date", "")),
                    "author": "",
                    "content": str(row.get("summary", "")),
                    "summary": str(row["summary"]),
                    "scrape_status": "error",
                    "content_length": 0,
                    "fetch_date": partition_date,
                    "scraped_timestamp": datetime.now(timezone.utc).isoformat(),
                    "error": str(e),
                }
            )

    # Create DataFrame
    df = pd.DataFrame(scraped_articles)

    # Log metadata
    successful_scrapes = df["scrape_status"].isin(["SUCCESS", "ARCHIVE_SUCCESS"]).sum()
    context.add_output_metadata(
        {
            "total_articles": len(df),
            "successful_scrapes": int(successful_scrapes),
            "failed_scrapes": int(len(df) - successful_scrapes),
            "avg_content_length": float(df["content_length"].mean())
            if len(df) > 0
            else 0,
            "scrape_statuses": df["scrape_status"].value_counts().to_dict(),
            "preview": MetadataValue.md(
                df.head().to_markdown() if not df.empty else "No data"
            ),
        }
    )
    context.log.info(
        f"Scraped content for {len(df)} articles ({int(successful_scrapes)} successful)"
    )
    return df


@asset(partitions_def=DAILY_PARTITIONS)
def article_sentiment_analysis(
    context: AssetExecutionContext, scraped_article_content: pd.DataFrame
) -> pd.DataFrame:
    """
    LLM sentiment analysis via OpenRouter.

    This asset analyzes the sentiment of each scraped article using
    OpenRouter's LLM models with keyword fallback.
    """
    partition_date = context.partition_key
    context.log.info(f"Analyzing sentiment for {partition_date}")

    if scraped_article_content.empty:
        context.log.warning("No scraped articles to analyze")
        return pd.DataFrame()

    # Initialize NewsService with OpenRouter
    config = TradingAgentsConfig.from_env()
    news_service = NewsService.build(None, config)

    # Process sentiment for each article
    analyzed_articles = []
    for idx, row in scraped_article_content.iterrows():
        content = str(row["content"])
        title = str(row["title"])
        url = str(row["url"])
        ticker = str(row["ticker"])
        try:
            context.log.info(
                f"Analyzing sentiment for article {idx + 1}/{len(scraped_article_content)}: {title[:50]}..."
            )
            # Create ArticleData for sentiment analysis
            article_data = ArticleData(
                title=title,
                content=content,
                author=str(row["author"]),
                source=str(row["source"]),
                date=str(row["fetch_date"]),
                url=url,
            )
            # Calculate sentiment using NewsService
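            # asyncio.run starts a fresh event loop for every article; simple,
            # but it serializes the LLM calls one request at a time.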
            sentiment_score = asyncio.run(
                news_service._calculate_sentiment_summary([article_data])
            )
            analyzed_articles.append(
                {
                    "ticker": ticker,
                    "title": title,
                    "url": url,
                    "source": str(row["source"]),
                    "published_date": str(row["published_date"]),
                    "author": str(row["author"]),
                    "content": content,
                    "summary": str(row["summary"]),
                    "scrape_status": str(row["scrape_status"]),
                    "content_length": int(row["content_length"]),
                    "fetch_date": partition_date,
                    "sentiment_score": sentiment_score.score,
                    "sentiment_confidence": sentiment_score.confidence,
                    "sentiment_label": sentiment_score.label,
                    "analyzed_timestamp": datetime.now(timezone.utc).isoformat(),
                }
            )
        except Exception as e:
            context.log.error(f"Error analyzing sentiment for {url}: {e}")
            # Add record with neutral sentiment
            analyzed_articles.append(
                {
                    "ticker": ticker,
                    "title": title,
                    "url": url,
                    "source": str(row["source"]),
                    "published_date": str(row["published_date"]),
                    "author": str(row["author"]),
                    "content": content,
                    "summary": str(row["summary"]),
                    "scrape_status": str(row["scrape_status"]),
                    "content_length": int(row["content_length"]),
                    "fetch_date": partition_date,
                    "sentiment_score": 0.0,
                    "sentiment_confidence": 0.0,
                    "sentiment_label": "neutral",
                    "analyzed_timestamp": datetime.now(timezone.utc).isoformat(),
                    "sentiment_error": str(e),
                }
            )

    # Create DataFrame
    df = pd.DataFrame(analyzed_articles)

    # Log metadata
    sentiment_counts = df["sentiment_label"].value_counts().to_dict()
    avg_confidence = float(df["sentiment_confidence"].mean()) if len(df) > 0 else 0.0
    context.add_output_metadata(
        {
            "total_articles": len(df),
            "sentiment_distribution": sentiment_counts,
            "avg_confidence": avg_confidence,
            "avg_sentiment_score": float(df["sentiment_score"].mean())
            if len(df) > 0
            else 0.0,
            "preview": MetadataValue.md(
                df.head().to_markdown() if not df.empty else "No data"
            ),
        }
    )
    context.log.info(f"Analyzed sentiment for {len(df)} articles")
    return df


@asset(partitions_def=DAILY_PARTITIONS)
def article_vector_embeddings(
    context: AssetExecutionContext, article_sentiment_analysis: pd.DataFrame
) -> pd.DataFrame:
    """
    Vector embeddings for RAG using OpenRouter.

    This asset generates 1536-dimension vector embeddings for each article
    to enable semantic search and RAG-powered agent context.
    """
    partition_date = context.partition_key
    context.log.info(f"Generating embeddings for {partition_date}")

    if article_sentiment_analysis.empty:
        context.log.warning("No analyzed articles to embed")
        return pd.DataFrame()

    # Initialize OpenRouter client for embeddings
    config = TradingAgentsConfig.from_env()
    news_service = NewsService.build(None, config)

    if not news_service.openrouter_client:
        context.log.warning(
            "OpenRouter client not available, using placeholder embeddings"
        )
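        # The hardcoded 1536 assumes the configured embedding model emits
        # 1536-dimensional vectors (true of e.g. OpenAI's
        # text-embedding-3-small); adjust if the model differs.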
        # Create placeholder embeddings
        df = article_sentiment_analysis.copy()
        df["title_embedding"] = [[0.0] * 1536] * len(df)
        df["content_embedding"] = [[0.0] * 1536] * len(df)
        df["embedding_model"] = "placeholder"
        df["embedding_dimensions"] = 1536
        df["embedded_timestamp"] = datetime.now(timezone.utc).isoformat()
        context.add_output_metadata(
            {
                "total_articles": len(df),
                "embedding_model": "placeholder",
                "embedding_dimensions": 1536,
                "preview": MetadataValue.md(
                    df.head().to_markdown() if not df.empty else "No data"
                ),
            }
        )
        return df

    # Process embeddings for each article
    embedded_articles = []
    for idx, row in article_sentiment_analysis.iterrows():
        title = str(row["title"])
        content = str(row["content"])
        url = str(row["url"])
        ticker = str(row["ticker"])
        try:
            context.log.info(
                f"Generating embeddings for article {idx + 1}/{len(article_sentiment_analysis)}: {title[:50]}..."
            )
            # Generate real embeddings using NewsService
            try:
                title_embedding, content_embedding = (
                    news_service.generate_article_embeddings(title, content)
                )
            except Exception as e:
                context.log.warning(f"Failed to generate embeddings for {url}: {e}")
                # Fallback to placeholder embeddings
                title_embedding = [0.0] * 1536
                content_embedding = [0.0] * 1536
            embedded_articles.append(
                {
                    "ticker": ticker,
                    "title": title,
                    "url": url,
                    "source": str(row["source"]),
                    "published_date": str(row["published_date"]),
                    "author": str(row["author"]),
                    "content": content,
                    "summary": str(row["summary"]),
                    "scrape_status": str(row["scrape_status"]),
                    "content_length": int(row["content_length"]),
                    "fetch_date": partition_date,
                    "sentiment_score": float(row["sentiment_score"]),
                    "sentiment_confidence": float(row["sentiment_confidence"]),
                    "sentiment_label": str(row["sentiment_label"]),
                    "title_embedding": title_embedding,
                    "content_embedding": content_embedding,
                    "embedding_model": config.news_embedding_llm,
                    "embedding_dimensions": 1536,
                    "embedded_timestamp": datetime.now(timezone.utc).isoformat(),
                }
            )
        except Exception as e:
            context.log.error(f"Error generating embeddings for {url}: {e}")
            # Add record with placeholder embeddings
            embedded_articles.append(
                {
                    "ticker": ticker,
                    "title": title,
                    "url": url,
                    "source": str(row["source"]),
                    "published_date": str(row["published_date"]),
                    "author": str(row["author"]),
                    "content": content,
                    "summary": str(row["summary"]),
                    "scrape_status": str(row["scrape_status"]),
                    "content_length": int(row["content_length"]),
                    "fetch_date": partition_date,
                    "sentiment_score": float(row["sentiment_score"]),
                    "sentiment_confidence": float(row["sentiment_confidence"]),
                    "sentiment_label": str(row["sentiment_label"]),
                    "title_embedding": [0.0] * 1536,
                    "content_embedding": [0.0] * 1536,
                    "embedding_model": "error-placeholder",
                    "embedding_dimensions": 1536,
                    "embedded_timestamp": datetime.now(timezone.utc).isoformat(),
                    "embedding_error": str(e),
                }
            )

    # Create DataFrame
    df = pd.DataFrame(embedded_articles)

    # Log metadata
    context.add_output_metadata(
        {
            "total_articles": len(df),
            "embedding_model": str(df["embedding_model"].iloc[0])
            if not df.empty
            else "none",
            "embedding_dimensions": 1536,
            "preview": MetadataValue.md(
                df.head().to_markdown() if not df.empty else "No data"
            ),
        }
    )
    context.log.info(f"Generated embeddings for {len(df)} articles")
    return df


@asset(partitions_def=DAILY_PARTITIONS)
def news_articles_table(
    context: AssetExecutionContext, article_vector_embeddings: pd.DataFrame
) -> None:
    """
    Final storage in PostgreSQL with TimescaleDB hypertable.

    This asset stores the fully processed articles with embeddings
    in the PostgreSQL database for use by trading agents.
    """
    partition_date = context.partition_key
    context.log.info(f"Storing articles in database for {partition_date}")

    if article_vector_embeddings.empty:
        context.log.warning("No embedded articles to store")
        return

    # Initialize NewsService and repository
    config = TradingAgentsConfig.from_env()
    news_service = NewsService.build(None, config)
    repository = news_service.repository

    if not repository:
        context.log.error("No repository available for storage")
        return

    # Convert DataFrame rows to NewsArticle objects
    stored_count = 0
    failed_count = 0
    for _idx, row in article_vector_embeddings.iterrows():
        try:
            # Create NewsArticle object
            news_article = NewsArticle(
                headline=str(row["title"]),
                url=str(row["url"]),
                source=str(row["source"]),
                published_date=date.fromisoformat(str(row["published_date"])[:10])
                if str(row["published_date"])
                else date.today(),
                summary=str(row["content"]),  # Use full content as summary
                author=str(row["author"]),
                # TODO: Add embedding fields to NewsArticle model
            )
            # Store in database (async operation)
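            # upsert_batch is invoked with a single-item list per row, and
            # asyncio.run opens a new event loop on each call; batching the
            # articles per ticker would cut that overhead.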
            asyncio.run(repository.upsert_batch([news_article], str(row["ticker"])))
            stored_count += 1
        except Exception as e:
            context.log.error(f"Error storing article {row['url']}: {e}")
            failed_count += 1

    # Log metadata
    context.add_output_metadata(
        {
            "total_articles": len(article_vector_embeddings),
            "stored_successfully": stored_count,
            "failed_to_store": failed_count,
            "storage_rate": stored_count / len(article_vector_embeddings)
            if len(article_vector_embeddings) > 0
            else 0,
            "tickers": article_vector_embeddings["ticker"].unique().tolist(),
        }
    )
    context.log.info(
        f"Stored {stored_count} articles in database ({failed_count} failed)"
    )


@asset(partitions_def=DAILY_PARTITIONS, deps=[news_articles_table])
def daily_sentiment_summary(context: AssetExecutionContext) -> pd.DataFrame:
"""
Aggregated sentiment by ticker/date for trading agents.
This asset creates daily sentiment summaries that can be used
by trading agents for market context and decision making.
"""
partition_date = context.partition_key
context.log.info(f"Creating daily sentiment summary for {partition_date}")
# Initialize NewsService and repository
config = TradingAgentsConfig.from_env()
news_service = NewsService.build(None, config)
repository = news_service.repository
if not repository:
context.log.error("No repository available for sentiment summary")
return pd.DataFrame()
# Get tracked tickers
tickers = ["AAPL", "GOOGL", "MSFT", "TSLA"] # TODO: Make configurable
summary_data = []
try:
# Query articles for each ticker on this date
for ticker in tickers:
try:
# Convert partition date to date object
start_date = date.fromisoformat(partition_date)
end_date = start_date # Same day for daily summary
# Get articles from repository (following test pattern)
news_articles = asyncio.run(
repository.list_by_date_range(
symbol=ticker,
start_date=start_date,
end_date=end_date,
)
)
if not news_articles:
context.log.debug(
f"No articles found for {ticker} on {partition_date}"
)
continue
# Convert NewsArticle objects to ArticleData objects (following test pattern)
articles = []
for article in news_articles:
articles.append(
ArticleData(
title=article.headline,
content=article.summary or "",
author=article.author or "",
source=article.source,
date=article.published_date.isoformat(),
url=article.url,
)
)
# Calculate sentiment summary using NewsService (following test pattern)
sentiment_summary = asyncio.run(
news_service._calculate_sentiment_summary(articles)
)
# Create summary record
summary_data.append(
{
"date": partition_date,
"ticker": ticker,
"total_articles": len(articles),
"positive_articles": sum(
1
for a in articles
if hasattr(a, "sentiment")
and a.sentiment
and a.sentiment.label == "positive"
),
"negative_articles": sum(
1
for a in articles
if hasattr(a, "sentiment")
and a.sentiment
and a.sentiment.label == "negative"
),
"neutral_articles": sum(
1
for a in articles
if hasattr(a, "sentiment")
and a.sentiment
and a.sentiment.label == "neutral"
),
"avg_sentiment_score": sentiment_summary.score,
"avg_confidence": sentiment_summary.confidence,
"dominant_sentiment": sentiment_summary.label,
}
)
context.log.debug(
f"Created sentiment summary for {ticker}: {len(articles)} articles"
)
except Exception as e:
context.log.error(f"Error creating sentiment summary for {ticker}: {e}")
continue
except Exception as e:
context.log.error(f"Error in daily sentiment summary: {e}")
# Create DataFrame with proper columns
summary_df = pd.DataFrame(summary_data)
context.add_output_metadata(
{
"summary_date": partition_date,
"total_tickers": len(summary_df),
"total_articles": summary_df["total_articles"].sum()
if not summary_df.empty
else 0,
"preview": MetadataValue.md(summary_df.head().to_markdown())
if not summary_df.empty
else "No data",
}
)
context.log.info(f"Created sentiment summary for {len(summary_df)} tickers")
return summary_df


@asset(partitions_def=DAILY_PARTITIONS, deps=[news_articles_table])
def trending_topics_analysis(context: AssetExecutionContext) -> pd.DataFrame:
"""
Extracted trending topics for market context.
This asset analyzes article titles and content to identify
trending topics that may impact market conditions.
"""
partition_date = context.partition_key
context.log.info(f"Analyzing trending topics for {partition_date}")
# Initialize NewsService and repository
config = TradingAgentsConfig.from_env()
news_service = NewsService.build(None, config)
repository = news_service.repository
if not repository:
context.log.error("No repository available for trending topics analysis")
return pd.DataFrame()
# Get tracked tickers
tickers = ["AAPL", "GOOGL", "MSFT", "TSLA"] # TODO: Make configurable
topics_data = []
try:
# Collect all articles for topic analysis
all_articles = []
for ticker in tickers:
try:
# Convert partition date to date object
start_date = date.fromisoformat(partition_date)
end_date = start_date # Same day for daily analysis
# Get articles from repository
news_articles = asyncio.run(
repository.list_by_date_range(
symbol=ticker,
start_date=start_date,
end_date=end_date,
)
)
if not news_articles:
continue
# Convert NewsArticle objects to ArticleData objects
for article in news_articles:
all_articles.append(
ArticleData(
title=article.headline,
content=article.summary or "",
author=article.author or "",
source=article.source,
date=article.published_date.isoformat(),
url=article.url,
)
)
except Exception as e:
context.log.error(f"Error fetching articles for {ticker}: {e}")
continue
if all_articles:
# Extract trending topics using NewsService (following test pattern)
trending_topics = news_service._extract_trending_topics(all_articles)
# Create topic records with frequency analysis
for topic in trending_topics:
# Count articles containing this topic
topic_articles = [
article
for article in all_articles
if topic.lower() in article.title.lower()
or topic.lower() in article.content.lower()
]
# Calculate average sentiment for articles with this topic
if topic_articles:
sentiment_summary = asyncio.run(
news_service._calculate_sentiment_summary(topic_articles)
)
avg_sentiment = sentiment_summary.score
else:
avg_sentiment = 0.0
# Get related tickers for this topic
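                # Case-insensitive substring matching can over-match: a short
                # ticker symbol may appear inside an unrelated word.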
                related_tickers = []
                for ticker in tickers:
                    ticker_articles = [
                        article
                        for article in topic_articles
                        if ticker.lower() in article.title.lower()
                        or ticker.lower() in article.content.lower()
                    ]
                    if ticker_articles:
                        related_tickers.append(ticker)

                topics_data.append(
                    {
                        "date": partition_date,
                        "topic": topic,
                        "frequency": len(topic_articles),
                        "sentiment_score": avg_sentiment,
                        "related_tickers": ",".join(related_tickers)
                        if related_tickers
                        else "",
                        "sample_articles": ",".join(
                            [article.url for article in topic_articles[:3]]
                        ),
                    }
                )

            context.log.debug(
                f"Identified {len(trending_topics)} trending topics from {len(all_articles)} articles"
            )
    except Exception as e:
        context.log.error(f"Error in trending topics analysis: {e}")

    # Create DataFrame
    topics_df = pd.DataFrame(topics_data)
    context.add_output_metadata(
        {
            "analysis_date": partition_date,
            "total_topics": len(topics_df),
            "preview": MetadataValue.md(topics_df.head().to_markdown())
            if not topics_df.empty
            else "No data",
        }
    )
    context.log.info(f"Identified {len(topics_df)} trending topics")
    return topics_df
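

# ---------------------------------------------------------------------------
# Wiring sketch (illustrative, not part of the original module): one way to
# group the assets above into a partitioned job with a daily schedule. The
# job and schedule names below are hypothetical.
from dagster import AssetSelection, build_schedule_from_partitioned_job, define_asset_job

news_pipeline_job = define_asset_job(
    name="news_pipeline_job",  # hypothetical name
    selection=AssetSelection.assets(
        raw_google_news_feeds,
        scraped_article_content,
        article_sentiment_analysis,
        article_vector_embeddings,
        news_articles_table,
        daily_sentiment_summary,
        trending_topics_analysis,
    ),
    partitions_def=DAILY_PARTITIONS,
)

# Materialize each day's partition once it closes.
news_pipeline_schedule = build_schedule_from_partitioned_job(news_pipeline_job)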