feat: add context management strategies (sliding-window, summarize, custom) to prevent unbounded conversation growth

2026-04-09 19:40:15 +03:00 · 2026-04-09 19:40:15 +03:00 · eb484d9bbf
parent f1c7477a26
commit eb484d9bbf
7 changed files with 415 additions and 1 deletions
--- a/examples/01-single-agent.ts
+++ b/examples/01-single-agent.ts
@ -114,6 +114,8 @@ const conversationAgent = new Agent(
    model: 'claude-sonnet-4-6',
    systemPrompt: 'You are a TypeScript tutor. Give short, direct answers.',
    maxTurns: 2,
    // Keep only the most recent turn in long prompt() conversations.
    contextStrategy: { type: 'sliding-window', maxTurns: 1 },
  },
  new ToolRegistry(), // no tools needed for this conversation
  new ToolExecutor(new ToolRegistry()),
--- a/src/agent/agent.ts
+++ b/src/agent/agent.ts
@ -153,6 +153,7 @@ export class Agent {
      agentRole: this.config.systemPrompt?.slice(0, 50) ?? 'assistant',
      loopDetection: this.config.loopDetection,
      maxTokenBudget: this.config.maxTokenBudget,
      contextStrategy: this.config.contextStrategy,
    }
    this.runner = new AgentRunner(
--- a/src/agent/runner.ts
+++ b/src/agent/runner.ts
@ -29,10 +29,12 @@ import type {
  LoopDetectionConfig,
  LoopDetectionInfo,
  LLMToolDef,
  ContextStrategy,
 } from '../types.js'
 import { TokenBudgetExceededError } from '../errors.js'
 import { LoopDetector } from './loop-detector.js'
 import { emitTrace } from '../utils/trace.js'
 import { estimateTokens } from '../utils/tokens.js'
 import type { ToolRegistry } from '../tool/framework.js'
 import type { ToolExecutor } from '../tool/executor.js'
@ -94,6 +96,8 @@ export interface RunnerOptions {
  readonly loopDetection?: LoopDetectionConfig
  /** Maximum cumulative tokens (input + output) allowed for this run. */
  readonly maxTokenBudget?: number
  /** Optional context compression strategy for long multi-turn runs. */
  readonly contextStrategy?: ContextStrategy
 }
 /**
@ -191,6 +195,10 @@ const ZERO_USAGE: TokenUsage = { input_tokens: 0, output_tokens: 0 }
 */
 export class AgentRunner {
  private readonly maxTurns: number
  /**
   * Most recent summary produced by `summarizeMessages`, or `null` if none
   * has been generated yet. `oldSignature` is the serialized "old" portion of
   * the conversation the summary was built from; when a later call computes
   * the same signature, the cached `summaryMessage` is reused instead of
   * issuing another LLM call.
   */
  private summarizeCache: {
    oldSignature: string
    summaryMessage: LLMMessage
  } | null = null
  constructor(
    private readonly adapter: LLMAdapter,
@ -201,6 +209,168 @@ export class AgentRunner {
    this.maxTurns = options.maxTurns ?? 10
  }
  /**
   * Renders a single message as JSON text. The output is used both as the
   * summarize-cache signature and as the transcript handed to the summarizer.
   */
  private serializeMessage(message: LLMMessage): string {
    const serialized = JSON.stringify(message)
    return serialized
  }
  /**
   * Keeps the first user message plus the most recent `maxTurns` turns
   * (two messages per turn) and replaces everything in between with a short
   * truncation marker.
   *
   * @param messages - Full conversation so far. Never mutated.
   * @param maxTurns - Number of recent turns to keep. Values `<= 0`, `NaN`
   *   and non-finite values disable truncation; fractional values are rounded
   *   up so the window stays aligned to whole turns.
   * @returns The original array when nothing needs to be dropped, otherwise a
   *   new array `[firstUser?, marker, ...recentMessages]`.
   */
  private truncateToSlidingWindow(messages: LLMMessage[], maxTurns: number): LLMMessage[] {
    const windowSize = Math.ceil(maxTurns) * 2
    if (!Number.isFinite(windowSize) || windowSize <= 0) {
      return messages
    }

    const firstUserIndex = messages.findIndex(m => m.role === 'user')
    const firstUser = firstUserIndex >= 0 ? messages[firstUserIndex]! : null
    // With no user message findIndex yields -1, so this copies the whole list.
    const afterFirst = messages.slice(firstUserIndex + 1)

    if (afterFirst.length <= windowSize) {
      return messages
    }

    // Start of the retained window. A window that begins with a tool_result
    // would reference a tool_use that has just been dropped, which tool-use
    // APIs reject — so skip forward past any such orphaned messages.
    let start = afterFirst.length - windowSize
    while (
      start < afterFirst.length &&
      afterFirst[start]!.content.some(block => block.type === 'tool_result')
    ) {
      start++
    }
    const kept = afterFirst.slice(start)

    const result: LLMMessage[] = []
    if (firstUser !== null) {
      result.push(firstUser)
    }

    // `start` messages were dropped (always >= 1 here). Round up so that an
    // odd number of dropped messages is still reported instead of being
    // removed silently.
    const droppedPairs = Math.ceil(start / 2)
    result.push({
      role: 'user',
      content: [{
        type: 'text',
        text: `[Earlier conversation history truncated — ${droppedPairs} turn(s) removed]`,
      }],
    })

    result.push(...kept)
    return result
  }
  /**
   * Replaces the older part of the conversation with an LLM-written summary
   * once the estimated size exceeds `maxTokens`.
   *
   * The first user message is always kept verbatim. The messages after it are
   * split roughly in half: the older half is summarized into a single user
   * message, the newer half is kept as-is. The split point is moved forward
   * past any `tool_result` message so the retained half never starts with a
   * tool result whose matching `tool_use` was summarized away.
   *
   * The last summary is cached by the serialized old portion, so an identical
   * old portion does not trigger a second summary call.
   *
   * NOTE(review): the summary call's token usage is only reported through
   * `onTrace`; it is not returned from this method, so callers cannot add it
   * to their own usage totals from the return value.
   *
   * @param messages - Full conversation so far. Never mutated.
   * @param maxTokens - Estimated-token threshold above which summarization runs.
   * @param summaryModel - Model for the summary call; defaults to the runner's model.
   * @param baseChatOptions - Chat options reused for the summary call (tools are removed).
   * @param turns - Current turn number, recorded on the trace event.
   * @param options - Run options; `onTrace` receives an `llm_call` event with `phase: 'summary'`.
   * @returns The original array when no summarization is needed, otherwise
   *   `[firstUser, summaryMessage, ...recentPortion]`.
   */
  private async summarizeMessages(
    messages: LLMMessage[],
    maxTokens: number,
    summaryModel: string | undefined,
    baseChatOptions: LLMChatOptions,
    turns: number,
    options: RunOptions,
  ): Promise<LLMMessage[]> {
    const estimated = estimateTokens(messages)
    if (estimated <= maxTokens || messages.length < 4) {
      return messages
    }

    const firstUserIndex = messages.findIndex(m => m.role === 'user')
    if (firstUserIndex < 0 || firstUserIndex === messages.length - 1) {
      return messages
    }

    const firstUser = messages[firstUserIndex]!
    const rest = messages.slice(firstUserIndex + 1)
    if (rest.length < 2) {
      return messages
    }

    // Summarize at least one full exchange, and otherwise the older half.
    let splitAt = Math.max(2, Math.floor(rest.length / 2))
    // Never cut between a tool_use and its tool_result: if the recent portion
    // would begin with a tool_result, fold that message into the old portion.
    while (
      splitAt < rest.length &&
      rest[splitAt]!.content.some(block => block.type === 'tool_result')
    ) {
      splitAt++
    }
    const oldPortion = rest.slice(0, splitAt)
    const recentPortion = rest.slice(splitAt)

    const oldSignature = oldPortion.map(m => this.serializeMessage(m)).join('\n')
    if (this.summarizeCache !== null && this.summarizeCache.oldSignature === oldSignature) {
      return [firstUser, this.summarizeCache.summaryMessage, ...recentPortion]
    }

    const summaryPrompt = [
      'Summarize the following conversation history for an LLM.',
      '- Preserve user goals, constraints, and decisions.',
      '- Keep key tool outputs and unresolved questions.',
      '- Use concise bullets.',
      '- Do not fabricate details.',
    ].join('\n')

    const summaryInput: LLMMessage[] = [
      {
        role: 'user',
        content: [
          { type: 'text', text: summaryPrompt },
          { type: 'text', text: `\n\nConversation:\n${oldSignature}` },
        ],
      },
    ]

    // Same options as a normal turn, but without tools: the summarizer must
    // answer in plain text.
    const summaryOptions: LLMChatOptions = {
      ...baseChatOptions,
      model: summaryModel ?? this.options.model,
      tools: undefined,
    }

    const summaryStartMs = Date.now()
    const summaryResponse = await this.adapter.chat(summaryInput, summaryOptions)
    if (options.onTrace) {
      const summaryEndMs = Date.now()
      emitTrace(options.onTrace, {
        type: 'llm_call',
        runId: options.runId ?? '',
        taskId: options.taskId,
        agent: options.traceAgent ?? this.options.agentName ?? 'unknown',
        model: summaryOptions.model,
        phase: 'summary',
        turn: turns,
        tokens: summaryResponse.usage,
        startMs: summaryStartMs,
        endMs: summaryEndMs,
        durationMs: summaryEndMs - summaryStartMs,
      })
    }

    const summaryText = extractText(summaryResponse.content).trim()
    const summaryMessage: LLMMessage = {
      role: 'user',
      content: [{
        type: 'text',
        text: summaryText.length > 0
          ? `[Conversation summary]\n${summaryText}`
          : '[Conversation summary unavailable]',
      }],
    }

    this.summarizeCache = { oldSignature, summaryMessage }
    return [firstUser, summaryMessage, ...recentPortion]
  }
  /**
   * Dispatches to the configured context-compaction strategy and returns the
   * message list to use for the next LLM call.
   *
   * @throws Error when a custom `compress` callback returns something other
   *   than a non-empty array.
   */
  private async applyContextStrategy(
    messages: LLMMessage[],
    strategy: ContextStrategy,
    baseChatOptions: LLMChatOptions,
    turns: number,
    options: RunOptions,
  ): Promise<LLMMessage[]> {
    switch (strategy.type) {
      case 'sliding-window':
        return this.truncateToSlidingWindow(messages, strategy.maxTurns)

      case 'summarize':
        return this.summarizeMessages(
          messages,
          strategy.maxTokens,
          strategy.summaryModel,
          baseChatOptions,
          turns,
          options,
        )

      default: {
        // Remaining variant: user-supplied compression callback.
        const tokenEstimate = estimateTokens(messages)
        const replacement = await strategy.compress(messages, tokenEstimate)
        const isUsable = Array.isArray(replacement) && replacement.length > 0
        if (!isUsable) {
          throw new Error('contextStrategy.custom.compress must return a non-empty LLMMessage[]')
        }
        return replacement
      }
    }
  }
  // -------------------------------------------------------------------------
  // Tool resolution
  // -------------------------------------------------------------------------
@ -313,7 +483,7 @@ export class AgentRunner {
    options: RunOptions = {},
  ): AsyncGenerator<StreamEvent> {
    // Working copy of the conversation — mutated as turns progress.
-    const conversationMessages: LLMMessage[] = [...initialMessages]
+    let conversationMessages: LLMMessage[] = [...initialMessages]
    // Accumulated state across all turns.
    let totalUsage: TokenUsage = ZERO_USAGE
@ -363,6 +533,17 @@ export class AgentRunner {
        turns++
        // Optionally compact context before each LLM call after the first turn.
        if (this.options.contextStrategy && turns > 1) {
          conversationMessages = await this.applyContextStrategy(
            conversationMessages,
            this.options.contextStrategy,
            baseChatOptions,
            turns,
            options,
          )
        }
        // ------------------------------------------------------------------
        // Step 1: Call the LLM and collect the full response for this turn.
        // ------------------------------------------------------------------
@ -376,6 +557,7 @@ export class AgentRunner {
            taskId: options.taskId,
            agent: options.traceAgent ?? this.options.agentName ?? 'unknown',
            model: this.options.model,
            phase: 'turn',
            turn: turns,
            tokens: response.usage,
            startMs: llmStartMs,
--- a/src/index.ts
+++ b/src/index.ts
@ -153,6 +153,7 @@ export type {
  ToolCallRecord,
  LoopDetectionConfig,
  LoopDetectionInfo,
  ContextStrategy,
  // Team
  TeamConfig,
--- a/src/types.ts
+++ b/src/types.ts
@ -65,6 +65,18 @@ export interface LLMMessage {
  readonly content: ContentBlock[]
 }
 /**
  * Context management strategy for long-running agent conversations.
  *
  * A discriminated union on `type`; the runner picks the compaction behavior
  * from the variant supplied.
  */
 export type ContextStrategy =
  // Keep the first user message plus the most recent `maxTurns` turns
  // (two messages per turn); a value <= 0 disables truncation.
  | { type: 'sliding-window'; maxTurns: number }
  // Summarize older history once the estimated token count exceeds
  // `maxTokens`. `summaryModel` overrides the model used for the summary
  // call; when omitted the runner's own model is used.
  | { type: 'summarize'; maxTokens: number; summaryModel?: string }
  // Caller-supplied compaction. `compress` receives the current messages and
  // their estimated token count and may be sync or async. It must return a
  // non-empty array, otherwise the runner throws.
  | {
    type: 'custom'
    compress: (
      messages: LLMMessage[],
      estimatedTokens: number,
    ) => Promise<LLMMessage[]> | LLMMessage[]
  }
 /** Token accounting for a single API call. */
 export interface TokenUsage {
  readonly input_tokens: number
@ -215,6 +227,8 @@ export interface AgentConfig {
  readonly maxTokens?: number
  /** Maximum cumulative tokens (input + output) allowed for this run. */
  readonly maxTokenBudget?: number
  /** Optional context compression policy to control input growth across turns. */
  readonly contextStrategy?: ContextStrategy
  readonly temperature?: number
  /**
   * Maximum wall-clock time (in milliseconds) for the entire agent run.
@ -487,6 +501,8 @@ export interface TraceEventBase {
 /** Trace event emitted for each LLM call made during a run. */
 export interface LLMCallTrace extends TraceEventBase {
  readonly type: 'llm_call'
  /** Model name the call was issued with. */
  readonly model: string
  /** Distinguishes normal turn calls from context-summary calls. */
  readonly phase?: 'turn' | 'summary'
  /** Turn counter of the run at the time of the call. */
  readonly turn: number
  /** Token usage reported by the LLM response for this call. */
  readonly tokens: TokenUsage
 }
--- a/src/utils/tokens.ts
+++ b/src/utils/tokens.ts
@ -0,0 +1,27 @@
 import type { LLMMessage } from '../types.js'
 /**
  * Rough token estimate for a list of messages, based purely on character
  * counts so no model-specific tokenizer is required.
  *
  * Per content block: text contributes its length, tool results the length of
  * their content, tool calls the length of their JSON-encoded input, and
  * images a flat 64 characters. Any other block type contributes nothing.
  *
  * @param messages - Messages whose content blocks are measured.
  * @returns Total characters divided by four, rounded up.
  */
 export function estimateTokens(messages: LLMMessage[]): number {
  const totalChars = messages
    .flatMap(message => message.content)
    .reduce((sum, block) => {
      switch (block.type) {
        case 'text':
          return sum + block.text.length
        case 'tool_result':
          return sum + block.content.length
        case 'tool_use':
          return sum + JSON.stringify(block.input).length
        case 'image':
          // Flat placeholder cost for non-text payloads.
          return sum + 64
        default:
          return sum
      }
    }, 0)

  // Heuristic ratio for English text: about four characters per token.
  return Math.ceil(totalChars / 4)
 }
--- a/tests/context-strategy.test.ts
+++ b/tests/context-strategy.test.ts
@ -0,0 +1,185 @@
 import { describe, it, expect, vi } from 'vitest'
 import { z } from 'zod'
 import { AgentRunner } from '../src/agent/runner.js'
 import { ToolRegistry, defineTool } from '../src/tool/framework.js'
 import { ToolExecutor } from '../src/tool/executor.js'
 import type { LLMAdapter, LLMChatOptions, LLMMessage, LLMResponse, TraceEvent } from '../src/types.js'
 /** Builds a mock end-of-turn LLM response carrying a single text block. */
 function textResponse(text: string): LLMResponse {
  const randomSuffix = Math.random().toString(36).slice(2)
  const response: LLMResponse = {
    id: `resp-${randomSuffix}`,
    content: [{ type: 'text', text }],
    model: 'mock-model',
    stop_reason: 'end_turn',
    usage: { input_tokens: 10, output_tokens: 20 },
  }
  return response
 }
 /** Builds a mock LLM response that requests a single call to `toolName` with `input`. */
 function toolUseResponse(toolName: string, input: Record<string, unknown>): LLMResponse {
  const randomId = (prefix: string): string =>
    `${prefix}-${Math.random().toString(36).slice(2)}`

  const responseId = randomId('resp')
  const toolCallId = randomId('tu')

  return {
    id: responseId,
    content: [{ type: 'tool_use', id: toolCallId, name: toolName, input }],
    model: 'mock-model',
    stop_reason: 'tool_use',
    usage: { input_tokens: 15, output_tokens: 25 },
  }
 }
 /**
  * Creates a tool registry containing a single `echo` tool (returns its
  * `message` input as `data`) together with an executor bound to that registry.
  */
 function buildRegistryAndExecutor(): { registry: ToolRegistry; executor: ToolExecutor } {
  const registry = new ToolRegistry()

  const echoTool = defineTool({
    name: 'echo',
    description: 'Echo input',
    inputSchema: z.object({ message: z.string() }),
    execute: async ({ message }) => ({ data: message }),
  })
  registry.register(echoTool)

  const executor = new ToolExecutor(registry)
  return { registry, executor }
 }
 describe('AgentRunner contextStrategy', () => {
  // Baseline: with no contextStrategy configured, the second LLM call must
  // receive more messages than the first (history grows, nothing is compacted).
  it('keeps baseline behavior when contextStrategy is not set', async () => {
    // Snapshot of the messages passed to each adapter.chat() call.
    const calls: LLMMessage[][] = []
    const adapter: LLMAdapter = {
      name: 'mock',
      async chat(messages) {
        calls.push(messages.map(m => ({ role: m.role, content: m.content })))
        // First call requests the echo tool; every later call ends the run.
        return calls.length === 1
          ? toolUseResponse('echo', { message: 'hello' })
          : textResponse('done')
      },
      async *stream() {
        /* unused */
      },
    }
    const { registry, executor } = buildRegistryAndExecutor()
    const runner = new AgentRunner(adapter, registry, executor, {
      model: 'mock-model',
      allowedTools: ['echo'],
      maxTurns: 4,
    })
    await runner.run([{ role: 'user', content: [{ type: 'text', text: 'start' }] }])
    expect(calls).toHaveLength(2)
    // The first call sees only the initial user message.
    expect(calls[0]).toHaveLength(1)
    expect(calls[1]!.length).toBeGreaterThan(calls[0]!.length)
  })
  // Sliding window with maxTurns: 1 — after several tool turns the final LLM
  // call must still start with the original prompt and must contain the
  // truncation marker inserted by the runner.
  it('sliding-window truncates old turns and preserves the first user message', async () => {
    // Snapshot of the messages passed to each adapter.chat() call.
    const calls: LLMMessage[][] = []
    // Scripted responses, consumed in order: three tool turns, then a final answer.
    const responses = [
      toolUseResponse('echo', { message: 't1' }),
      toolUseResponse('echo', { message: 't2' }),
      toolUseResponse('echo', { message: 't3' }),
      textResponse('done'),
    ]
    let idx = 0
    const adapter: LLMAdapter = {
      name: 'mock',
      async chat(messages) {
        calls.push(messages.map(m => ({ role: m.role, content: m.content })))
        return responses[idx++]!
      },
      async *stream() {
        /* unused */
      },
    }
    const { registry, executor } = buildRegistryAndExecutor()
    const runner = new AgentRunner(adapter, registry, executor, {
      model: 'mock-model',
      allowedTools: ['echo'],
      maxTurns: 8,
      contextStrategy: { type: 'sliding-window', maxTurns: 1 },
    })
    await runner.run([{ role: 'user', content: [{ type: 'text', text: 'original prompt' }] }])
    // Inspect only the last LLM call, where the most history has accumulated.
    const laterCall = calls[calls.length - 1]!
    const firstUserText = laterCall[0]!.content[0]
    expect(firstUserText).toMatchObject({ type: 'text', text: 'original prompt' })
    // Some text block in that call must mention 'truncated' (the marker message).
    const flattenedText = laterCall.flatMap(m => m.content.filter(c => c.type === 'text'))
    expect(flattenedText.some(c => c.type === 'text' && c.text.includes('truncated'))).toBe(true)
  })
  // Summarize strategy with a very low maxTokens: expects one extra LLM call
  // made with a single message and no tools (the summary request), and an
  // 'llm_call' trace event tagged phase: 'summary'.
  it('summarize strategy replaces old context and emits summary trace call', async () => {
    // Messages and options passed to each adapter.chat() call.
    const calls: Array<{ messages: LLMMessage[]; options: LLMChatOptions }> = []
    const traces: TraceEvent[] = []
    // Scripted responses, consumed in call order. The long tool inputs push the
    // estimated size past maxTokens; the third entry is presumably the one
    // returned to the summary request — this depends on the runner's call order.
    const responses = [
      toolUseResponse('echo', { message: 'first turn payload '.repeat(20) }),
      toolUseResponse('echo', { message: 'second turn payload '.repeat(20) }),
      textResponse('This is a concise summary.'),
      textResponse('final answer'),
    ]
    let idx = 0
    const adapter: LLMAdapter = {
      name: 'mock',
      async chat(messages, options) {
        calls.push({ messages: messages.map(m => ({ role: m.role, content: m.content })), options })
        return responses[idx++]!
      },
      async *stream() {
        /* unused */
      },
    }
    const { registry, executor } = buildRegistryAndExecutor()
    const runner = new AgentRunner(adapter, registry, executor, {
      model: 'mock-model',
      allowedTools: ['echo'],
      maxTurns: 8,
      contextStrategy: { type: 'summarize', maxTokens: 20 },
    })
    await runner.run(
      [{ role: 'user', content: [{ type: 'text', text: 'start' }] }],
      { onTrace: (e) => { traces.push(e) }, runId: 'run-summary', traceAgent: 'context-agent' },
    )
    // The summary request is identified by its shape: one message, tools unset.
    const summaryCall = calls.find(c => c.messages.length === 1 && c.options.tools === undefined)
    expect(summaryCall).toBeDefined()
    const llmTraces = traces.filter(t => t.type === 'llm_call')
    expect(llmTraces.some(t => t.type === 'llm_call' && t.phase === 'summary')).toBe(true)
  })
  // Custom strategy: the user-supplied compress callback must be invoked
  // exactly once in a two-call run, and its return value must be what the
  // second LLM call receives.
  it('custom strategy calls compress callback and uses returned messages', async () => {
    // Compressor that keeps only the last message of the conversation.
    const compress = vi.fn((messages: LLMMessage[]) => messages.slice(-1))
    // Snapshot of the messages passed to each adapter.chat() call.
    const calls: LLMMessage[][] = []
    // Scripted responses, consumed in order: one tool turn, then a final answer.
    const responses = [
      toolUseResponse('echo', { message: 'hello' }),
      textResponse('done'),
    ]
    let idx = 0
    const adapter: LLMAdapter = {
      name: 'mock',
      async chat(messages) {
        calls.push(messages.map(m => ({ role: m.role, content: m.content })))
        return responses[idx++]!
      },
      async *stream() {
        /* unused */
      },
    }
    const { registry, executor } = buildRegistryAndExecutor()
    const runner = new AgentRunner(adapter, registry, executor, {
      model: 'mock-model',
      allowedTools: ['echo'],
      maxTurns: 4,
      contextStrategy: {
        type: 'custom',
        compress,
      },
    })
    await runner.run([{ role: 'user', content: [{ type: 'text', text: 'custom prompt' }] }])
    expect(compress).toHaveBeenCalledOnce()
    // The second call carries exactly the single message returned by compress.
    expect(calls[1]).toHaveLength(1)
  })
 })