feat(llm): add Gemini adapter (#28)

feat: Add support for Gemini model
This commit is contained in:
JackChen 2026-04-05 11:48:32 +08:00 committed by GitHub
commit 9a81a13982
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 1506 additions and 801 deletions

View File

@ -29,7 +29,12 @@ Requires Node.js >= 18.
npm install @jackchen_me/open-multi-agent npm install @jackchen_me/open-multi-agent
``` ```
Set `ANTHROPIC_API_KEY` (and optionally `OPENAI_API_KEY` or `GITHUB_TOKEN` for Copilot) in your environment. Local models via Ollama require no API key — see [example 06](examples/06-local-model.ts). Set the API key for your provider. Local models via Ollama require no API key — see [example 06](examples/06-local-model.ts).
- `ANTHROPIC_API_KEY`
- `OPENAI_API_KEY`
- `GEMINI_API_KEY`
- `GITHUB_TOKEN` (for Copilot)
Three agents, one goal — the framework handles the rest: Three agents, one goal — the framework handles the rest:
@ -156,6 +161,7 @@ npx tsx examples/01-single-agent.ts
│ - stream() │ │ - AnthropicAdapter │ │ - stream() │ │ - AnthropicAdapter │
└────────┬──────────┘ │ - OpenAIAdapter │ └────────┬──────────┘ │ - OpenAIAdapter │
│ │ - CopilotAdapter │ │ │ - CopilotAdapter │
│ │ - GeminiAdapter │
│ └──────────────────────┘ │ └──────────────────────┘
┌────────▼──────────┐ ┌────────▼──────────┐
│ AgentRunner │ ┌──────────────────────┐ │ AgentRunner │ ┌──────────────────────┐
@ -183,6 +189,7 @@ npx tsx examples/01-single-agent.ts
| OpenAI (GPT) | `provider: 'openai'` | `OPENAI_API_KEY` | Verified | | OpenAI (GPT) | `provider: 'openai'` | `OPENAI_API_KEY` | Verified |
| Grok (xAI) | `provider: 'grok'` | `XAI_API_KEY` | Verified | | Grok (xAI) | `provider: 'grok'` | `XAI_API_KEY` | Verified |
| GitHub Copilot | `provider: 'copilot'` | `GITHUB_TOKEN` | Verified | | GitHub Copilot | `provider: 'copilot'` | `GITHUB_TOKEN` | Verified |
| Gemini | `provider: 'gemini'` | `GEMINI_API_KEY` | Verified |
| Ollama / vLLM / LM Studio | `provider: 'openai'` + `baseURL` | — | Verified | | Ollama / vLLM / LM Studio | `provider: 'openai'` + `baseURL` | — | Verified |
| llama.cpp server | `provider: 'openai'` + `baseURL` | — | Verified | | llama.cpp server | `provider: 'openai'` + `baseURL` | — | Verified |

View File

@ -155,6 +155,7 @@ npx tsx examples/01-single-agent.ts
│ - stream() │ │ - AnthropicAdapter │ │ - stream() │ │ - AnthropicAdapter │
└────────┬──────────┘ │ - OpenAIAdapter │ └────────┬──────────┘ │ - OpenAIAdapter │
│ │ - CopilotAdapter │ │ │ - CopilotAdapter │
│ │ - GeminiAdapter │
│ └──────────────────────┘ │ └──────────────────────┘
┌────────▼──────────┐ ┌────────▼──────────┐
│ AgentRunner │ ┌──────────────────────┐ │ AgentRunner │ ┌──────────────────────┐
@ -181,6 +182,7 @@ npx tsx examples/01-single-agent.ts
| Anthropic (Claude) | `provider: 'anthropic'` | `ANTHROPIC_API_KEY` | 已验证 | | Anthropic (Claude) | `provider: 'anthropic'` | `ANTHROPIC_API_KEY` | 已验证 |
| OpenAI (GPT) | `provider: 'openai'` | `OPENAI_API_KEY` | 已验证 | | OpenAI (GPT) | `provider: 'openai'` | `OPENAI_API_KEY` | 已验证 |
| GitHub Copilot | `provider: 'copilot'` | `GITHUB_TOKEN` | 已验证 | | GitHub Copilot | `provider: 'copilot'` | `GITHUB_TOKEN` | 已验证 |
| Gemini | `provider: 'gemini'` | `GEMINI_API_KEY` | 已验证 |
| Ollama / vLLM / LM Studio | `provider: 'openai'` + `baseURL` | — | 已验证 | | Ollama / vLLM / LM Studio | `provider: 'openai'` + `baseURL` | — | 已验证 |
已验证支持 tool-calling 的本地模型:**Gemma 4**(见[示例 08](examples/08-gemma4-local.ts))。 已验证支持 tool-calling 的本地模型:**Gemma 4**(见[示例 08](examples/08-gemma4-local.ts))。

48
examples/13-gemini.ts Normal file
View File

@ -0,0 +1,48 @@
/**
 * Quick smoke test for the Gemini adapter.
 *
 * Run:
 *   npx tsx examples/13-gemini.ts
 *
 * Requires an API key: the adapter reads GEMINI_API_KEY (or GOOGLE_API_KEY
 * as a fallback) from the environment when no key is passed explicitly.
 */
import { OpenMultiAgent } from '../src/index.js'
import type { OrchestratorEvent } from '../src/types.js'

// Orchestrator configured so agents default to the Gemini provider/model.
const orchestrator = new OpenMultiAgent({
  defaultModel: 'gemini-2.5-flash',
  defaultProvider: 'gemini',
  // Log agent lifecycle transitions as they arrive.
  onProgress: (event: OrchestratorEvent) => {
    if (event.type === 'agent_start') {
      console.log(`[start] agent=${event.agent}`)
    } else if (event.type === 'agent_complete') {
      console.log(`[complete] agent=${event.agent}`)
    }
  },
})

console.log('Testing Gemini adapter with gemini-2.5-flash...\n')

// Single-turn, small token budget — just enough to prove the round trip works.
const result = await orchestrator.runAgent(
  {
    name: 'assistant',
    model: 'gemini-2.5-flash',
    provider: 'gemini',
    systemPrompt: 'You are a helpful assistant. Keep answers brief.',
    maxTurns: 1,
    maxTokens: 256,
  },
  'What is 2 + 2? Reply in one sentence.',
)

if (result.success) {
  console.log('\nAgent output:')
  console.log('─'.repeat(60))
  console.log(result.output)
  console.log('─'.repeat(60))
  console.log(`\nTokens: input=${result.tokenUsage.input_tokens}, output=${result.tokenUsage.output_tokens}`)
} else {
  // Non-zero exit so CI or shell scripts can detect the failure.
  console.error('Agent failed:', result.output)
  process.exit(1)
}

1753
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@ -41,6 +41,14 @@
"openai": "^4.73.0", "openai": "^4.73.0",
"zod": "^3.23.0" "zod": "^3.23.0"
}, },
"peerDependencies": {
"@google/genai": "^1.48.0"
},
"peerDependenciesMeta": {
"@google/genai": {
"optional": true
}
},
"devDependencies": { "devDependencies": {
"@types/node": "^22.0.0", "@types/node": "^22.0.0",
"tsx": "^4.21.0", "tsx": "^4.21.0",

View File

@ -11,6 +11,7 @@
* *
* const anthropic = createAdapter('anthropic') * const anthropic = createAdapter('anthropic')
* const openai = createAdapter('openai', process.env.OPENAI_API_KEY) * const openai = createAdapter('openai', process.env.OPENAI_API_KEY)
* const gemini = createAdapter('gemini', process.env.GEMINI_API_KEY)
* ``` * ```
*/ */
@ -37,7 +38,7 @@ import type { LLMAdapter } from '../types.js'
* Additional providers can be integrated by implementing {@link LLMAdapter} * Additional providers can be integrated by implementing {@link LLMAdapter}
* directly and bypassing this factory. * directly and bypassing this factory.
*/ */
export type SupportedProvider = 'anthropic' | 'copilot' | 'grok' | 'openai' export type SupportedProvider = 'anthropic' | 'copilot' | 'grok' | 'openai' | 'gemini'
/** /**
* Instantiate the appropriate {@link LLMAdapter} for the given provider. * Instantiate the appropriate {@link LLMAdapter} for the given provider.
@ -46,6 +47,7 @@ export type SupportedProvider = 'anthropic' | 'copilot' | 'grok' | 'openai'
* explicitly: * explicitly:
* - `anthropic` `ANTHROPIC_API_KEY` * - `anthropic` `ANTHROPIC_API_KEY`
* - `openai` `OPENAI_API_KEY` * - `openai` `OPENAI_API_KEY`
* - `gemini` `GEMINI_API_KEY` / `GOOGLE_API_KEY`
* - `grok` `XAI_API_KEY` * - `grok` `XAI_API_KEY`
* - `copilot` `GITHUB_COPILOT_TOKEN` / `GITHUB_TOKEN`, or interactive * - `copilot` `GITHUB_COPILOT_TOKEN` / `GITHUB_TOKEN`, or interactive
* OAuth2 device flow if neither is set * OAuth2 device flow if neither is set
@ -75,6 +77,10 @@ export async function createAdapter(
const { CopilotAdapter } = await import('./copilot.js') const { CopilotAdapter } = await import('./copilot.js')
return new CopilotAdapter(apiKey) return new CopilotAdapter(apiKey)
} }
case 'gemini': {
const { GeminiAdapter } = await import('./gemini.js')
return new GeminiAdapter(apiKey)
}
case 'openai': { case 'openai': {
const { OpenAIAdapter } = await import('./openai.js') const { OpenAIAdapter } = await import('./openai.js')
return new OpenAIAdapter(apiKey, baseURL) return new OpenAIAdapter(apiKey, baseURL)

378
src/llm/gemini.ts Normal file
View File

@ -0,0 +1,378 @@
/**
* @fileoverview Google Gemini adapter implementing {@link LLMAdapter}.
*
* Built for `@google/genai` (the unified Google Gen AI SDK, v1.x), NOT the
* legacy `@google/generative-ai` package.
*
* Converts between the framework's internal {@link ContentBlock} types and the
* `@google/genai` SDK's wire format, handling tool definitions, system prompts,
* and both batch and streaming response paths.
*
* API key resolution order:
* 1. `apiKey` constructor argument
* 2. `GEMINI_API_KEY` environment variable
* 3. `GOOGLE_API_KEY` environment variable
*
* @example
* ```ts
* import { GeminiAdapter } from './gemini.js'
*
* const adapter = new GeminiAdapter()
* const response = await adapter.chat(messages, {
* model: 'gemini-2.5-flash',
* maxTokens: 1024,
* })
* ```
*/
import {
GoogleGenAI,
FunctionCallingConfigMode,
type Content,
type FunctionDeclaration,
type GenerateContentConfig,
type GenerateContentResponse,
type Part,
type Tool as GeminiTool,
} from '@google/genai'
import type {
ContentBlock,
LLMAdapter,
LLMChatOptions,
LLMMessage,
LLMResponse,
LLMStreamOptions,
LLMToolDef,
StreamEvent,
ToolUseBlock,
} from '../types.js'
// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------
/**
 * Translate a framework role into the Gemini wire-format role.
 *
 * The Gemini API names the assistant turn `"model"`; user turns keep `"user"`.
 */
function toGeminiRole(role: 'user' | 'assistant'): string {
  const roleMap = { user: 'user', assistant: 'model' } as const
  return roleMap[role]
}
/**
* Convert framework messages into Gemini's {@link Content}[] format.
*
* Key differences from Anthropic:
* - Gemini uses `"model"` instead of `"assistant"`.
* - `functionResponse` parts (tool results) must appear in `"user"` turns.
* - `functionCall` parts appear in `"model"` turns.
* - We build a name lookup map from tool_use blocks so tool_result blocks
* can resolve the function name required by Gemini's `functionResponse`.
*/
function toGeminiContents(messages: LLMMessage[]): Content[] {
// First pass: build id → name map for resolving tool results.
const toolNameById = new Map<string, string>()
for (const msg of messages) {
for (const block of msg.content) {
if (block.type === 'tool_use') {
toolNameById.set(block.id, block.name)
}
}
}
return messages.map((msg): Content => {
const parts: Part[] = msg.content.map((block): Part => {
switch (block.type) {
case 'text':
return { text: block.text }
case 'tool_use':
return {
functionCall: {
id: block.id,
name: block.name,
args: block.input,
},
}
case 'tool_result': {
const name = toolNameById.get(block.tool_use_id) ?? block.tool_use_id
return {
functionResponse: {
id: block.tool_use_id,
name,
response: {
content:
typeof block.content === 'string'
? block.content
: JSON.stringify(block.content),
isError: block.is_error ?? false,
},
},
}
}
case 'image':
return {
inlineData: {
mimeType: block.source.media_type,
data: block.source.data,
},
}
default: {
const _exhaustive: never = block
throw new Error(`Unhandled content block type: ${JSON.stringify(_exhaustive)}`)
}
}
})
return { role: toGeminiRole(msg.role), parts }
})
}
/**
 * Translate framework tool definitions into a Gemini `tools` config array.
 *
 * `@google/genai` expects the JSON schema under `parametersJsonSchema` (not
 * `parameters` or `input_schema`), and all function declarations are grouped
 * under a single tool entry.
 */
function toGeminiTools(tools: readonly LLMToolDef[]): GeminiTool[] {
  const declarations: FunctionDeclaration[] = []
  for (const tool of tools) {
    declarations.push({
      name: tool.name,
      description: tool.description,
      parametersJsonSchema: tool.inputSchema as Record<string, unknown>,
    })
  }
  return [{ functionDeclarations: declarations }]
}
/**
 * Assemble the {@link GenerateContentConfig} shared by chat() and stream().
 *
 * Tool wiring (`tools` + `toolConfig`) is only attached when the caller
 * supplied tool definitions; both stay undefined otherwise.
 */
function buildConfig(
  options: LLMChatOptions | LLMStreamOptions,
): GenerateContentConfig {
  const toolList = options.tools ? toGeminiTools(options.tools) : undefined
  const callingConfig = options.tools
    ? { functionCallingConfig: { mode: FunctionCallingConfigMode.AUTO } }
    : undefined
  return {
    // Default cap keeps runaway generations bounded when the caller sets none.
    maxOutputTokens: options.maxTokens ?? 4096,
    temperature: options.temperature,
    systemInstruction: options.systemPrompt,
    tools: toolList,
    toolConfig: callingConfig,
  }
}
/**
 * Mint a unique ID string for fabricated tool-use blocks.
 *
 * Gemini responses (streaming ones in particular) may omit function-call IDs,
 * yet the framework's {@link ToolUseBlock} contract requires one — so a
 * timestamped random ID is generated whenever the API leaves it out.
 */
function generateId(): string {
  const stamp = Date.now()
  const suffix = Math.random().toString(36).substring(2, 9)
  return ['gemini', stamp, suffix].join('-')
}
/**
 * Read the function-call ID off a Gemini part, minting one when absent.
 *
 * The `id` field exists in newer API versions but may be missing from older
 * responses, so it is read through a conservative cast with a generated
 * fallback from {@link generateId}.
 */
function getFunctionCallId(part: Part): string {
  const call = part.functionCall as { id?: string } | undefined
  return call?.id ?? generateId()
}
/**
 * Translate a raw Gemini {@link GenerateContentResponse} into the framework's
 * {@link LLMResponse} shape.
 *
 * Only text and functionCall parts are surfaced; other part types (e.g.
 * inlineData echoes) are intentionally dropped. Gemini can report a STOP
 * finish reason even when it returned function calls, so `stop_reason` is
 * derived from the extracted content as well as the finish reason.
 */
function fromGeminiResponse(
  response: GenerateContentResponse,
  id: string,
  model: string,
): LLMResponse {
  const firstCandidate = response.candidates?.[0]
  const blocks: ContentBlock[] = []

  for (const part of firstCandidate?.content?.parts ?? []) {
    if (part.text !== undefined && part.text !== '') {
      blocks.push({ type: 'text', text: part.text })
      continue
    }
    if (part.functionCall !== undefined) {
      blocks.push({
        type: 'tool_use',
        id: getFunctionCallId(part),
        name: part.functionCall.name ?? '',
        input: (part.functionCall.args ?? {}) as Record<string, unknown>,
      })
    }
    // Any other part type is silently skipped.
  }

  // MAX_TOKENS wins; otherwise the presence of tool calls implies tool_use.
  const finish = firstCandidate?.finishReason as string | undefined
  const calledTools = blocks.some((b) => b.type === 'tool_use')
  let stop_reason: LLMResponse['stop_reason'] = 'end_turn'
  if (finish === 'MAX_TOKENS') {
    stop_reason = 'max_tokens'
  } else if (calledTools) {
    stop_reason = 'tool_use'
  }

  const meta = response.usageMetadata
  return {
    id,
    content: blocks,
    model,
    stop_reason,
    usage: {
      input_tokens: meta?.promptTokenCount ?? 0,
      output_tokens: meta?.candidatesTokenCount ?? 0,
    },
  }
}
// ---------------------------------------------------------------------------
// Adapter implementation
// ---------------------------------------------------------------------------
/**
 * LLM adapter backed by the Google Gemini API via `@google/genai`.
 *
 * A single instance may be shared across concurrent agent runs; the
 * underlying SDK client holds no per-request state.
 *
 * API key resolution order: constructor argument, then `GEMINI_API_KEY`,
 * then `GOOGLE_API_KEY`.
 */
export class GeminiAdapter implements LLMAdapter {
  readonly name = 'gemini'
  readonly #client: GoogleGenAI

  /**
   * @param apiKey - Explicit API key; when omitted, the `GEMINI_API_KEY` and
   *   `GOOGLE_API_KEY` environment variables are consulted in that order.
   */
  constructor(apiKey?: string) {
    const resolvedKey =
      apiKey ?? process.env['GEMINI_API_KEY'] ?? process.env['GOOGLE_API_KEY']
    this.#client = new GoogleGenAI({ apiKey: resolvedKey })
  }

  /**
   * Run a non-streaming chat request and return the complete
   * {@link LLMResponse}.
   *
   * Sends the whole conversation through `models.generateContent()`, the
   * idiomatic batch entry point of `@google/genai`.
   */
  async chat(messages: LLMMessage[], options: LLMChatOptions): Promise<LLMResponse> {
    const responseId = generateId()
    const result = await this.#client.models.generateContent({
      model: options.model,
      contents: toGeminiContents(messages),
      config: buildConfig(options),
    })
    return fromGeminiResponse(result, responseId, options.model)
  }

  /**
   * Run a streaming chat request, yielding {@link StreamEvent}s as chunks
   * arrive from `models.generateContentStream()` (an
   * `AsyncGenerator<GenerateContentResponse>` of per-chunk deltas).
   *
   * The SDK exposes no `finalMessage()` helper like the Anthropic SDK, so
   * content blocks and token counts are accumulated while streaming and
   * folded into the terminal `done` event's {@link LLMResponse}.
   *
   * Event sequence (matching the Anthropic adapter):
   * - zero or more `text` events with incremental deltas
   * - zero or more `tool_use` events (one per call; args are not streamed)
   * - exactly one terminal `done` or `error` event
   */
  async *stream(
    messages: LLMMessage[],
    options: LLMStreamOptions,
  ): AsyncIterable<StreamEvent> {
    const responseId = generateId()
    try {
      const chunkStream = await this.#client.models.generateContentStream({
        model: options.model,
        contents: toGeminiContents(messages),
        config: buildConfig(options),
      })

      // State gathered across chunks for the terminal `done` payload.
      const blocks: ContentBlock[] = []
      let promptTokens = 0
      let completionTokens = 0
      let finishReason: string | undefined

      for await (const chunk of chunkStream) {
        const candidate = chunk.candidates?.[0]
        // Token counts are emitted on the final chunk; keep the latest seen.
        const meta = chunk.usageMetadata
        if (meta) {
          promptTokens = meta.promptTokenCount ?? promptTokens
          completionTokens = meta.candidatesTokenCount ?? completionTokens
        }
        if (candidate?.finishReason) {
          finishReason = candidate.finishReason as string
        }
        for (const part of candidate?.content?.parts ?? []) {
          if (part.text) {
            blocks.push({ type: 'text', text: part.text })
            yield { type: 'text', data: part.text } satisfies StreamEvent
          } else if (part.functionCall) {
            const call: ToolUseBlock = {
              type: 'tool_use',
              id: getFunctionCallId(part),
              name: part.functionCall.name ?? '',
              input: (part.functionCall.args ?? {}) as Record<string, unknown>,
            }
            blocks.push(call)
            yield { type: 'tool_use', data: call } satisfies StreamEvent
          }
        }
      }

      // Derive stop_reason: MAX_TOKENS wins, then tool calls, else end_turn.
      let stop_reason: LLMResponse['stop_reason'] = 'end_turn'
      if (finishReason === 'MAX_TOKENS') {
        stop_reason = 'max_tokens'
      } else if (blocks.some((b) => b.type === 'tool_use')) {
        stop_reason = 'tool_use'
      }

      const finalResponse: LLMResponse = {
        id: responseId,
        content: blocks,
        model: options.model,
        stop_reason,
        usage: { input_tokens: promptTokens, output_tokens: completionTokens },
      }
      yield { type: 'done', data: finalResponse } satisfies StreamEvent
    } catch (err) {
      const error = err instanceof Error ? err : new Error(String(err))
      yield { type: 'error', data: error } satisfies StreamEvent
    }
  }
}

View File

@ -194,7 +194,7 @@ export interface BeforeRunHookContext {
export interface AgentConfig { export interface AgentConfig {
readonly name: string readonly name: string
readonly model: string readonly model: string
readonly provider?: 'anthropic' | 'copilot' | 'grok' | 'openai' readonly provider?: 'anthropic' | 'copilot' | 'grok' | 'openai' | 'gemini'
/** /**
* Custom base URL for OpenAI-compatible APIs (Ollama, vLLM, LM Studio, etc.). * Custom base URL for OpenAI-compatible APIs (Ollama, vLLM, LM Studio, etc.).
* Note: local servers that don't require auth still need `apiKey` set to a * Note: local servers that don't require auth still need `apiKey` set to a
@ -338,7 +338,7 @@ export interface OrchestratorEvent {
export interface OrchestratorConfig { export interface OrchestratorConfig {
readonly maxConcurrency?: number readonly maxConcurrency?: number
readonly defaultModel?: string readonly defaultModel?: string
readonly defaultProvider?: 'anthropic' | 'copilot' | 'grok' | 'openai' readonly defaultProvider?: 'anthropic' | 'copilot' | 'grok' | 'openai' | 'gemini'
readonly defaultBaseURL?: string readonly defaultBaseURL?: string
readonly defaultApiKey?: string readonly defaultApiKey?: string
readonly onProgress?: (event: OrchestratorEvent) => void readonly onProgress?: (event: OrchestratorEvent) => void

View File

@ -0,0 +1,97 @@
import { describe, it, expect, vi, beforeEach } from 'vitest'

// ---------------------------------------------------------------------------
// Mock GoogleGenAI constructor (must be hoisted for Vitest)
// ---------------------------------------------------------------------------
const GoogleGenAIMock = vi.hoisted(() => vi.fn())

vi.mock('@google/genai', () => ({
  GoogleGenAI: GoogleGenAIMock,
  FunctionCallingConfigMode: { AUTO: 'AUTO' },
}))

import { GeminiAdapter } from '../src/llm/gemini.js'
import { createAdapter } from '../src/llm/adapter.js'

/**
 * Run `fn` with the given environment variables applied, restoring the
 * previous values (including absence) afterwards — even if `fn` throws.
 * A value of `undefined` deletes the variable for the duration of `fn`.
 */
function withEnv(vars: Record<string, string | undefined>, fn: () => void): void {
  const saved: Array<[string, string | undefined]> = Object.keys(vars).map(
    (key) => [key, process.env[key]],
  )
  for (const [key, value] of Object.entries(vars)) {
    if (value === undefined) {
      delete process.env[key]
    } else {
      process.env[key] = value
    }
  }
  try {
    fn()
  } finally {
    for (const [key, value] of saved) {
      if (value === undefined) {
        delete process.env[key]
      } else {
        process.env[key] = value
      }
    }
  }
}

// ---------------------------------------------------------------------------
// GeminiAdapter tests
// ---------------------------------------------------------------------------
describe('GeminiAdapter', () => {
  beforeEach(() => {
    GoogleGenAIMock.mockClear()
  })

  it('has name "gemini"', () => {
    const adapter = new GeminiAdapter()
    expect(adapter.name).toBe('gemini')
  })

  it('uses GEMINI_API_KEY by default', () => {
    withEnv({ GEMINI_API_KEY: 'gemini-env-key', GOOGLE_API_KEY: undefined }, () => {
      new GeminiAdapter()
      expect(GoogleGenAIMock).toHaveBeenCalledWith(
        expect.objectContaining({ apiKey: 'gemini-env-key' }),
      )
    })
  })

  it('falls back to GOOGLE_API_KEY when GEMINI_API_KEY is unset', () => {
    withEnv({ GEMINI_API_KEY: undefined, GOOGLE_API_KEY: 'google-env-key' }, () => {
      new GeminiAdapter()
      expect(GoogleGenAIMock).toHaveBeenCalledWith(
        expect.objectContaining({ apiKey: 'google-env-key' }),
      )
    })
  })

  it('allows overriding apiKey explicitly', () => {
    new GeminiAdapter('explicit-key')
    expect(GoogleGenAIMock).toHaveBeenCalledWith(
      expect.objectContaining({ apiKey: 'explicit-key' }),
    )
  })

  it('createAdapter("gemini") returns GeminiAdapter instance', async () => {
    const adapter = await createAdapter('gemini')
    expect(adapter).toBeInstanceOf(GeminiAdapter)
  })
})