From bfbc011a87259d91b0306e22af9cc7aec0693541 Mon Sep 17 00:00:00 2001 From: "swj.premkumar" Date: Sat, 10 Jan 2026 11:14:40 -0600 Subject: [PATCH] Added local embedding support via Hugging Face Text Embeddings Inference (TEI) in Docker for LLM providers that don't support embeddings (e.g. Anthropic); removed the sentence-transformers dependency from the main application --- .env.example | 74 +++++++++++++- CHANGELOG.md | 43 +++++++++ cli/utils.py | 31 +++--- docs/LOCAL_EMBEDDINGS.md | 83 ++++++++++++++++ start.sh => startAgent.sh | 3 + startEmbedding.sh | 20 ++++ tradingagents/agents/utils/memory.py | 19 +++- verify_local_embeddings.py | 138 +++++++++++++++++++++++++++ verify_ollama_embeddings.py | 81 ++++++++++++++++ verify_tei_native.py | 57 +++++++++++ 10 files changed, 530 insertions(+), 19 deletions(-) create mode 100644 docs/LOCAL_EMBEDDINGS.md rename start.sh => startAgent.sh (93%) create mode 100755 startEmbedding.sh create mode 100755 verify_local_embeddings.py create mode 100755 verify_ollama_embeddings.py create mode 100644 verify_tei_native.py diff --git a/.env.example b/.env.example index 1e257c3c..334ea3c0 100644 --- a/.env.example +++ b/.env.example @@ -1,2 +1,74 @@ +# TradingAgents Environment Variables Configuration + +# ============================================ +# LLM Provider API URLs +# ============================================ +# These environment variables allow you to customize the API endpoints +# for different LLM providers. If not set, the default URLs will be used. 
+ +# OpenAI API URL +# Default: https://api.openai.com/v1 +#OPENAI_API_URL=https://api.openai.com/v1 + +# Anthropic API URL +# Default: https://api.anthropic.com/ +#ANTHROPIC_API_URL=https://api.anthropic.com/ + +# Google Generative AI API URL +# Default: https://generativelanguage.googleapis.com/v1 +#GOOGLE_API_URL=https://generativelanguage.googleapis.com/v1 + +# OpenRouter API URL +# Default: https://openrouter.ai/api/v1 +#OPENROUTER_API_URL=https://openrouter.ai/api/v1 + +# Ollama API URL (local) +# Default: http://localhost:11434/v1 +#OLLAMA_API_URL=http://localhost:11434/v1 + +# ============================================ +# Embedding Configuration +# ============================================ +# If EMBEDDING_API_URL is set, it will be used for ALL providers (overrides defaults) +# This is required for Anthropic (which doesn't provide embeddings) +# Can point to sentence-transformers in Docker, Ollama, or any OpenAI-compatible service + +# Embedding service URL (OpenAI-compatible API) +# Required for Anthropic, optional for others +# Examples: +# - Local Service (startEmbedding.sh): http://localhost:11434/v1 +# - Ollama: http://localhost:11434/v1 +#EMBEDDING_API_URL=http://localhost:11434/v1 + +# Embedding model name +# Default: all-MiniLM-L6-v2 +#EMBEDDING_MODEL=all-MiniLM-L6-v2 + +# Embedding API key (if your service requires it) +# Default: "local" (most local services don't need a key) +#EMBEDDING_API_KEY=local + +# ============================================ +# API Keys +# ============================================ + +# Alpha Vantage API Key ALPHA_VANTAGE_API_KEY=alpha_vantage_api_key_placeholder -OPENAI_API_KEY=openai_api_key_placeholder \ No newline at end of file + +# OpenAI API Key +OPENAI_API_KEY=openai_api_key_placeholder + +# Alpaca Trading API +#ALPACA_API_KEY=your_alpaca_api_key_here +#ALPACA_API_SECRET=your_alpaca_secret_key_here + +# Google API Key (for Gemini models) +#GOOGLE_API_KEY=your_google_api_key_here + +# 
============================================ +# Application Settings +# ============================================ + +# Results directory for storing analysis outputs +# Default: ./results +#TRADINGAGENTS_RESULTS_DIR=./results \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ccc748b..1b81de5c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,49 @@ All notable changes to the **TradingAgents** project will be documented in this file. +## [Unreleased] - 2026-01-10 + +### Added +- **Local Embedding Service Support**: Added support for Anthropic to use local embedding service via URL + - Anthropic doesn't provide embeddings API, so users can run **Hugging Face Text Embeddings Inference (TEI)** in Docker + - Configure via `EMBEDDING_API_URL` environment variable (default: `http://localhost:11434/v1`) + - Configure model via `EMBEDDING_MODEL` environment variable (default: `all-MiniLM-L6-v2`) + - Keeps main application lightweight - heavy dependencies (PyTorch) isolated in separate container +- **Environment Variable Configuration**: Added comprehensive environment variable support for all LLM providers and embedding configuration + - `OPENAI_API_URL` - Custom OpenAI API endpoint + - `ANTHROPIC_API_URL` - Custom Anthropic API endpoint + - `GOOGLE_API_URL` - Custom Google API endpoint + - `OPENROUTER_API_URL` - Custom OpenRouter API endpoint + - `OLLAMA_API_URL` - Custom Ollama API endpoint + - `EMBEDDING_PROVIDER` - Choose embedding provider: `local`, `openai`, `google`, `ollama` + - `EMBEDDING_API_URL` - Custom embedding API endpoint (for Ollama or Docker service) + - `EMBEDDING_MODEL` - Custom embedding model name +- **Anthropic Claude 4.5 Thinking Models**: Added support for latest Anthropic thinking models + - `claude-sonnet-4-5-thinking` - Advanced reasoning with extended thinking + - `claude-opus-4-5-thinking` - Premier reasoning with extended thinking + - Removed older Claude models (3.5, 3.7, 4.0) to focus on latest thinking 
models +- **Documentation**: Created comprehensive guides and verification tools + - `docs/LOCAL_EMBEDDINGS.md` - Complete guide for local embeddings setup + - `verify_local_embeddings.py` - Verification script for sentence-transformers + - `verify_ollama_embeddings.py` - Verification script for Ollama (optional) + - Updated `.env.example` with all new configuration options + +### Changed +- **Dependency Cleanup**: Removed `sentence-transformers` from `requirements.txt` to keep main application lightweight. +- **Virtual Environment**: Recreated `.venv` to ensure a clean state without unused heavy dependencies. +- **Embedding Architecture**: Refactored `tradingagents/agents/utils/memory.py` to support multiple embedding providers with clean separation of concerns + - Automatic provider selection based on LLM provider + - Local embeddings as default for Anthropic and Ollama providers + - Maintained backward compatibility with existing API-based embeddings +- **CLI Provider Selection**: Updated `cli/utils.py` to use environment variables for all LLM provider API URLs with sensible defaults +- **Configuration Documentation**: Enhanced `.env.example` with detailed comments and examples for all configuration options + +### Fixed +- **Anthropic Embedding Error**: Resolved `404 Not Found` error when using Anthropic as LLM provider by implementing automatic fallback to local embeddings (Anthropic doesn't provide an embeddings API) + +### Technical Debt +- None - All changes follow SOLID principles with proper separation of concerns + ## [Unreleased] - 2026-01-09 ### Added diff --git a/cli/utils.py b/cli/utils.py index 6fe5ab3c..606d6226 100644 --- a/cli/utils.py +++ b/cli/utils.py @@ -1,3 +1,4 @@ +import os import questionary from typing import List, Optional, Tuple, Dict @@ -134,10 +135,8 @@ def select_shallow_thinking_agent(provider) -> str: ("GPT-4o - Standard model with solid capabilities", "gpt-4o"), ], "anthropic": [ - ("Claude Haiku 3.5 - Fast inference and standard 
capabilities", "claude-3-5-haiku-latest"), - ("Claude Sonnet 3.5 - Highly capable standard model", "claude-3-5-sonnet-latest"), - ("Claude Sonnet 3.7 - Exceptional hybrid reasoning and agentic capabilities", "claude-3-7-sonnet-latest"), - ("Claude Sonnet 4 - High performance and excellent reasoning", "claude-sonnet-4-0"), + ("Claude Sonnet 4.5 (Thinking) - Advanced reasoning with extended thinking", "claude-sonnet-4-5-thinking"), + ("Claude Opus 4.5 (Thinking) - Premier reasoning with extended thinking", "claude-opus-4-5-thinking"), ], "google": [ ("Gemini 2.5 Flash-Lite - Cost efficiency and low latency", "gemini-2.5-flash-lite"), @@ -196,11 +195,8 @@ def select_deep_thinking_agent(provider) -> str: ("o1 - Premier reasoning and problem-solving model", "o1"), ], "anthropic": [ - ("Claude Haiku 3.5 - Fast inference and standard capabilities", "claude-3-5-haiku-latest"), - ("Claude Sonnet 3.5 - Highly capable standard model", "claude-3-5-sonnet-latest"), - ("Claude Sonnet 3.7 - Exceptional hybrid reasoning and agentic capabilities", "claude-3-7-sonnet-latest"), - ("Claude Sonnet 4 - High performance and excellent reasoning", "claude-sonnet-4-0"), - ("Claude Opus 4 - Most powerful Anthropic model", " claude-opus-4-0"), + ("Claude Sonnet 4.5 (Thinking) - Advanced reasoning with extended thinking", "claude-sonnet-4-5-thinking"), + ("Claude Opus 4.5 (Thinking) - Premier reasoning with extended thinking", "claude-opus-4-5-thinking"), ], "google": [ ("Gemini 2.5 Flash - Next generation features, speed, and thinking", "gemini-2.5-flash"), @@ -241,14 +237,15 @@ def select_deep_thinking_agent(provider) -> str: return choice def select_llm_provider() -> tuple[str, str]: - """Select the OpenAI api url using interactive selection.""" - # Define OpenAI api options with their corresponding endpoints + """Select the LLM provider and return its API URL from environment or default.""" + # Define LLM provider options with their corresponding endpoints + # Each provider checks for its 
specific environment variable with a fallback default BASE_URLS = [ - ("OpenAI", "https://api.openai.com/v1"), - ("Anthropic", "https://api.anthropic.com/"), - ("Google", "https://generativelanguage.googleapis.com/v1"), - ("Openrouter", "https://openrouter.ai/api/v1"), - ("Ollama", "http://localhost:11434/v1"), + ("OpenAI", os.getenv("OPENAI_API_URL", "https://api.openai.com/v1")), + ("Anthropic", os.getenv("ANTHROPIC_API_URL", "https://api.anthropic.com/")), + ("Google", os.getenv("GOOGLE_API_URL", "https://generativelanguage.googleapis.com/v1")), + ("Openrouter", os.getenv("OPENROUTER_API_URL", "https://openrouter.ai/api/v1")), + ("Ollama", os.getenv("OLLAMA_API_URL", "http://localhost:11434/v1")), ] choice = questionary.select( @@ -268,7 +265,7 @@ def select_llm_provider() -> tuple[str, str]: ).ask() if choice is None: - console.print("\n[red]no OpenAI backend selected. Exiting...[/red]") + console.print("\n[red]No LLM provider selected. Exiting...[/red]") exit(1) display_name, url = choice diff --git a/docs/LOCAL_EMBEDDINGS.md b/docs/LOCAL_EMBEDDINGS.md new file mode 100644 index 00000000..37cd8b73 --- /dev/null +++ b/docs/LOCAL_EMBEDDINGS.md @@ -0,0 +1,83 @@ +# Local Embeddings Setup Guide + +This guide explains how to set up local embeddings for the TradingAgents framework. + +## Why Local Embeddings? + +When using LLM providers that don't support embeddings (like Anthropic), or when you want to avoid additional API costs, you need a local embedding solution. + +## Recommended: Run in Docker + +The recommended approach is to run the embedding service in a Docker container. This keeps your main application environment clean and avoids installing heavy dependencies like PyTorch on your host machine. + +### 1. Run the Embedding Service +Use the provided script to start the service: + +```bash +./startEmbedding.sh +``` + +This runs **Hugging Face Text Embeddings Inference (TEI)**, a high-performance server compatible with the OpenAI API. 
+ +*(Note: The Go-based image `clems4ever/all-minilm-l6-v2-go` is a CLI tool and cannot simply be run as a server.)* + +### 2. Configure TradingAgents + +Add (or update) these lines in your `.env` file: + +```bash +# Point to your local embedding service (TEI supports /v1 API) +EMBEDDING_API_URL=http://localhost:11434/v1 + +# The model name configured in the start script +EMBEDDING_MODEL=all-MiniLM-L6-v2 +``` + +### 3. Verify Setup + +Run the verification script: + +```bash +python3 verify_local_embeddings.py +``` + +## Alternative: Local Installation (Development Only) + +If you prefer to run everything locally without Docker (e.g., for development), you can install the library directly. + +**âš ī¸ Warning:** This adds ~500MB of PyTorch dependencies to your environment. + +### 1. Install Dependencies + +```bash +pip install sentence-transformers +``` + +### 2. Configure TradingAgents + +If you don't set `EMBEDDING_API_URL`, the Anthropic provider falls back to a local embedding service at `http://localhost:8000/v1` (note: `startEmbedding.sh` publishes the service on port 11434, so set `EMBEDDING_API_URL=http://localhost:11434/v1` when using that script). + +```bash +# Optional: Force local provider +EMBEDDING_PROVIDER=local +``` + +## Supported Providers + +| LLM Provider | Default Behavior | Recommended Setup | +|--------------|------------------|-------------------| +| **Anthropic** | Tries local service URL | **Docker Service** | +| **Ollama** | Uses Ollama API | Ensure Ollama is running | +| **OpenAI** | Uses OpenAI API | No setup needed | +| **Google** | Uses Google API | No setup needed | + +## FAQ + +**Q: Why Docker?** +A: `sentence-transformers` requires PyTorch, which is a very large dependency (~500MB+). Putting it in a container keeps your main application lightweight and portable. + +**Q: Can I use GPU?** +A: Yes! Use the GPU version of the container: `ghcr.io/huggingface/text-embeddings-inference:latest` (requires NVIDIA Container Toolkit). + +**Q: Can I use Ollama instead?** +A: Yes. 
Set `EMBEDDING_API_URL=http://localhost:11434/v1` and `EMBEDDING_MODEL=nomic-embed-text` (or your preferred Ollama model). diff --git a/start.sh b/startAgent.sh similarity index 93% rename from start.sh rename to startAgent.sh index ceb1595a..d10f3acb 100755 --- a/start.sh +++ b/startAgent.sh @@ -1,4 +1,7 @@ #!/bin/bash +/home/prem/git/antigravity-claude-proxy/startProxy.sh & + +./startEmbedding.sh # 1. Activate Virtual Environment if [ -d ".venv" ]; then diff --git a/startEmbedding.sh b/startEmbedding.sh new file mode 100755 index 00000000..ff9ef58d --- /dev/null +++ b/startEmbedding.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Stop and remove existing container if it exists +docker rm -f embedding-service 2>/dev/null || true + +echo "🚀 Starting Local Embedding Service (Hugging Face TEI)..." +echo "â„šī¸ Note: The previous image (clems4ever/all-minilm-l6-v2-go) is a CLI tool, not a server." +echo " Switching to ghcr.io/huggingface/text-embeddings-inference:cpu-latest which provides a compatible API." + +# Run Hugging Face Text Embeddings Inference (compatible with OpenAI client) +docker run -d \ + --name embedding-service \ + --restart unless-stopped \ + -p 11434:80 \ + -e MAX_CONCURRENT_REQUESTS=4 \ + ghcr.io/huggingface/text-embeddings-inference:cpu-latest \ + --model-id sentence-transformers/all-MiniLM-L6-v2 + +echo "✅ Service started!" 
+echo " URL: http://localhost:11434/v1" diff --git a/tradingagents/agents/utils/memory.py b/tradingagents/agents/utils/memory.py index bd5e25bf..ce3a00ed 100644 --- a/tradingagents/agents/utils/memory.py +++ b/tradingagents/agents/utils/memory.py @@ -6,7 +6,17 @@ from openai import OpenAI class FinancialSituationMemory: def __init__(self, name, config): - if config.get("llm_provider") == "google": + # Check if user explicitly set EMBEDDING_API_URL - if so, use it regardless of provider + embedding_url = os.getenv("EMBEDDING_API_URL") + + if embedding_url: + # User has explicitly configured embedding service URL + self.embedding = os.getenv("EMBEDDING_MODEL", "all-MiniLM-L6-v2") + self.client = OpenAI( + base_url=embedding_url, + api_key=os.getenv("EMBEDDING_API_KEY", "local") + ) + elif config.get("llm_provider") == "google": self.embedding = "text-embedding-004" google_api_key = os.getenv("GOOGLE_API_KEY") @@ -19,6 +29,13 @@ class FinancialSituationMemory: base_url="https://generativelanguage.googleapis.com/v1beta/openai/", max_retries=5 ) + elif config.get("llm_provider") == "anthropic": + # Anthropic doesn't provide embeddings - default to local embedding service + self.embedding = os.getenv("EMBEDDING_MODEL", "all-MiniLM-L6-v2") + self.client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="local" + ) elif config["backend_url"] == "http://localhost:11434/v1" or config.get("llm_provider") == "ollama": self.embedding = "nomic-embed-text" self.client = OpenAI(base_url=config["backend_url"]) diff --git a/verify_local_embeddings.py b/verify_local_embeddings.py new file mode 100755 index 00000000..dff409e3 --- /dev/null +++ b/verify_local_embeddings.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +""" +Verify that local sentence-transformers embeddings are working correctly. +This script tests the local embedding model without requiring external services. 
+""" + +import os +import sys + +def test_local_embeddings(): + """Test local sentence-transformers embeddings""" + + embedding_model = os.getenv("EMBEDDING_MODEL", "all-MiniLM-L6-v2") + + print("=" * 60) + print("Local Embeddings Verification (sentence-transformers)") + print("=" * 60) + print(f"Embedding Model: {embedding_model}") + print() + + try: + # 1. Try to import sentence-transformers (Local Library Mode) + try: + from sentence_transformers import SentenceTransformer + print("✅ Found local sentence-transformers library.") + + # Load the model + print(f"đŸ“Ļ Loading embedding model: {embedding_model}") + print(" (First run will download the model, ~90MB)") + print() + + model = SentenceTransformer(embedding_model) + + # Test embedding generation + test_texts = [ + "This is a test sentence for embedding generation.", + "Financial markets are showing increased volatility.", + "The company reported strong quarterly earnings." + ] + + print(f"Testing embedding generation with {len(test_texts)} sentences:") + for i, text in enumerate(test_texts, 1): + print(f" {i}. '{text[:50]}...'") + print() + + embeddings = model.encode(test_texts, convert_to_numpy=True) + + print("✅ SUCCESS!") + print(f"Generated {len(embeddings)} embedding vectors") + print(f"Embedding dimensions: {embeddings.shape[1]}") + print(f"First embedding (first 5 values): {embeddings[0][:5].tolist()}") + print() + + # Test similarity + from numpy import dot + from numpy.linalg import norm + + def cosine_similarity(a, b): + return dot(a, b) / (norm(a) * norm(b)) + + sim_0_1 = cosine_similarity(embeddings[0], embeddings[1]) + sim_1_2 = cosine_similarity(embeddings[1], embeddings[2]) + + print("Similarity scores:") + print(f" Sentence 1 ↔ Sentence 2: {sim_0_1:.4f}") + print(f" Sentence 2 ↔ Sentence 3: {sim_1_2:.4f}") + print() + + print("=" * 60) + print("Local embeddings (Library) are working correctly! 🎉") + print("=" * 60) + return True + + except ImportError: + # 2. 
If Library missing, try connection to Local Service (Docker Mode) + print("â„šī¸ sentence-transformers library not installed.") + print("Checking for local embedding service...") + + try: + from openai import OpenAI + + embedding_url = os.getenv("EMBEDDING_API_URL", "http://localhost:8000/v1") + print(f"Connecting to: {embedding_url}") + + client = OpenAI(base_url=embedding_url, api_key="local") + + # Test embedding generation via API + test_texts = [ + "This is a test sentence for embedding generation.", + "Financial markets are showing increased volatility.", + "The company reported strong quarterly earnings." + ] + + print(f"Testing embedding generation via API with {len(test_texts)} sentences:") + for i, text in enumerate(test_texts, 1): + print(f" {i}. '{text[:50]}...'") + print() + + response = client.embeddings.create(model=embedding_model, input=test_texts) + embeddings = [data.embedding for data in response.data] + + print("✅ SUCCESS!") + print(f"Generated {len(embeddings)} embedding vectors via API") + print(f"Embedding dimensions: {len(embeddings[0])}") + print(f"First embedding (first 5 values): {embeddings[0][:5]}") + print() + + print("=" * 60) + print("Local embedding service (Docker) is working correctly! 
🎉") + print("=" * 60) + return True + + except Exception as service_error: + print("❌ FAILED!") + print("Neither sentence-transformers library nor local embedding service found.") + print(f"Library Error: sentence-transformers not installed") + print(f"Service Error: {str(service_error)}") + print() + print("=" * 60) + print("Installation Options:") + print("=" * 60) + print("OPTION 1: Run Service (Recommended - Docker)") + print(" docker run -d -p 8000:8000 ghcr.io/huggingface/text-embeddings-inference:cpu-latest --model-id sentence-transformers/all-MiniLM-L6-v2") + print(" export EMBEDDING_API_URL=http://localhost:8000/v1") + print() + print("OPTION 2: Install Library (Runs locally, adds dependencies)") + print(" pip install sentence-transformers") + print("=" * 60) + return False + + except Exception as e: + print("❌ FAILED!") + print(f"Error: {str(e)}") + return False + +if __name__ == "__main__": + success = test_local_embeddings() + sys.exit(0 if success else 1) diff --git a/verify_ollama_embeddings.py b/verify_ollama_embeddings.py new file mode 100755 index 00000000..7ed48c6c --- /dev/null +++ b/verify_ollama_embeddings.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 +""" +Verify that Ollama embeddings are working correctly. +This script tests the embedding endpoint and model availability. 
+""" + +import os +import sys +from openai import OpenAI + +def test_ollama_embeddings(): + """Test Ollama embeddings endpoint""" + + # Get configuration from environment or use defaults + embedding_url = os.getenv("EMBEDDING_API_URL", "http://localhost:11434/v1") + embedding_model = os.getenv("EMBEDDING_MODEL", "nomic-embed-text") + + print("=" * 60) + print("Ollama Embeddings Verification") + print("=" * 60) + print(f"Embedding URL: {embedding_url}") + print(f"Embedding Model: {embedding_model}") + print() + + try: + # Initialize OpenAI client pointing to Ollama + client = OpenAI( + base_url=embedding_url, + api_key="ollama" # Ollama doesn't require a real API key + ) + + # Test embedding generation + test_text = "This is a test sentence for embedding generation." + print(f"Testing embedding generation with text:") + print(f" '{test_text}'") + print() + + response = client.embeddings.create( + model=embedding_model, + input=test_text + ) + + embedding = response.data[0].embedding + + print("✅ SUCCESS!") + print(f"Generated embedding vector with {len(embedding)} dimensions") + print(f"First 5 values: {embedding[:5]}") + print() + print("=" * 60) + print("Ollama embeddings are working correctly! 🎉") + print("=" * 60) + + return True + + except Exception as e: + print("❌ FAILED!") + print(f"Error: {str(e)}") + print() + print("=" * 60) + print("Troubleshooting Steps:") + print("=" * 60) + print("1. Make sure Ollama is running:") + print(" $ ollama serve") + print() + print("2. Pull the embedding model:") + print(f" $ ollama pull {embedding_model}") + print() + print("3. Verify Ollama is accessible:") + print(f" $ curl {embedding_url.replace('/v1', '')}/api/tags") + print() + print("4. 
Check if the model is available:") + print(f" $ ollama list | grep {embedding_model}") + print() + print("For more help, see: docs/LOCAL_EMBEDDINGS.md") + print("=" * 60) + + return False + +if __name__ == "__main__": + success = test_ollama_embeddings() + sys.exit(0 if success else 1) diff --git a/verify_tei_native.py b/verify_tei_native.py new file mode 100644 index 00000000..3a404f5f --- /dev/null +++ b/verify_tei_native.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 +""" +Verify local embedding service using the native TEI /embed endpoint. +This uses pure HTTP requests without the OpenAI client. +""" + +import sys +import requests +import json +import time + +def test_native_endpoint(): + url = "http://localhost:11434/embed" + headers = {"Content-Type": "application/json"} + + print(f"Testing Native TEI Endpoint: {url}") + print("-" * 50) + + test_inputs = [ + "This is a test using the native /embed endpoint.", + "It should be slightly faster than the OpenAI-compatible one." + ] + + payload = {"inputs": test_inputs} + + try: + start_time = time.time() + response = requests.post(url, headers=headers, json=payload) + response.raise_for_status() + duration = time.time() - start_time + + embeddings = response.json() + + print("✅ SUCCESS!") + print(f"Time taken: {duration:.4f}s") + print(f"Received {len(embeddings)} embeddings") + print(f"Dimensions: {len(embeddings[0])}") + print(f"First 5 values: {embeddings[0][:5]}") + print("-" * 50) + return True + + except requests.exceptions.ConnectionError: + print("❌ FAILED: Connection refused.") + print("Make sure the container is running: ./startEmbedding.sh") + return False + except Exception as e: + print(f"❌ FAILED: {str(e)}") + if hasattr(e, 'response') and e.response: + print(f"Status: {e.response.status_code}") + print(f"Response: {e.response.text}") + return False + +if __name__ == "__main__": + if test_native_endpoint(): + sys.exit(0) + else: + sys.exit(1)