// Tests for the AgentRunner `contextStrategy` option: baseline (unset),
// sliding-window, summarize, custom, and compact strategies. Every test drives
// the runner with a scripted mock LLMAdapter and inspects the messages the
// adapter received on each chat call.

import { describe, it, expect, vi } from 'vitest'
import { z } from 'zod'
import { AgentRunner } from '../src/agent/runner.js'
import { ToolRegistry, defineTool } from '../src/tool/framework.js'
import { ToolExecutor } from '../src/tool/executor.js'
import type { LLMAdapter, LLMChatOptions, LLMMessage, LLMResponse, TraceEvent } from '../src/types.js'

/**
 * Builds a mock LLM response holding a single text block.
 *
 * The response ends the turn (`stop_reason: 'end_turn'`) and reports a fixed
 * usage of 10 input / 20 output tokens, which token-accounting tests rely on.
 *
 * @param text - Text placed in the response's only content block.
 * @returns A mock response with a random id.
 */
function textResponse(text: string): LLMResponse {
  return {
    id: `resp-${Math.random().toString(36).slice(2)}`,
    content: [{ type: 'text', text }],
    model: 'mock-model',
    stop_reason: 'end_turn',
    usage: { input_tokens: 10, output_tokens: 20 },
  }
}

/**
 * Builds a mock LLM response holding a single tool_use block.
 *
 * The response requests a tool call (`stop_reason: 'tool_use'`) and reports a
 * fixed usage of 15 input / 25 output tokens, which token-accounting tests
 * rely on.
 *
 * @param toolName - Name of the tool the mock model asks to invoke.
 * @param input - Arguments object passed through as the tool_use input.
 * @returns A mock response with random response and tool_use ids.
 */
function toolUseResponse(toolName: string, input: Record<string, unknown>): LLMResponse {
  return {
    id: `resp-${Math.random().toString(36).slice(2)}`,
    content: [{
      type: 'tool_use',
      id: `tu-${Math.random().toString(36).slice(2)}`,
      name: toolName,
      input,
    }],
    model: 'mock-model',
    stop_reason: 'tool_use',
    usage: { input_tokens: 15, output_tokens: 25 },
  }
}

/**
 * Creates a registry containing one `echo` tool, plus an executor bound to it.
 * The tool returns its `message` input unchanged as `data`.
 */
function buildRegistryAndExecutor(): { registry: ToolRegistry; executor: ToolExecutor } {
  const registry = new ToolRegistry()
  registry.register(
    defineTool({
      name: 'echo',
      description: 'Echo input',
      inputSchema: z.object({ message: z.string() }),
      async execute({ message }) {
        return { data: message }
      },
    }),
  )
  return { registry, executor: new ToolExecutor(registry) }
}

describe('AgentRunner contextStrategy', () => {
  it('keeps baseline behavior when contextStrategy is not set', async () => {
    const calls: LLMMessage[][] = []
    const adapter: LLMAdapter = {
      name: 'mock',
      async chat(messages) {
        // Record a per-call copy of the message list (role + content).
        calls.push(messages.map(m => ({ role: m.role, content: m.content })))
        return calls.length === 1
          ? toolUseResponse('echo', { message: 'hello' })
          : textResponse('done')
      },
      async *stream() {
        /* unused */
      },
    }
    const { registry, executor } = buildRegistryAndExecutor()
    const runner = new AgentRunner(adapter, registry, executor, {
      model: 'mock-model',
      allowedTools: ['echo'],
      maxTurns: 4,
    })

    await runner.run([{ role: 'user', content: [{ type: 'text', text: 'start' }] }])

    expect(calls).toHaveLength(2)
    expect(calls[0]).toHaveLength(1)
    expect(calls[1]!.length).toBeGreaterThan(calls[0]!.length)
  })

  it('sliding-window truncates old turns and preserves the first user message', async () => {
    const calls: LLMMessage[][] = []
    const responses = [
      toolUseResponse('echo', { message: 't1' }),
      toolUseResponse('echo', { message: 't2' }),
      toolUseResponse('echo', { message: 't3' }),
      textResponse('done'),
    ]
    let idx = 0
    const adapter: LLMAdapter = {
      name: 'mock',
      async chat(messages) {
        calls.push(messages.map(m => ({ role: m.role, content: m.content })))
        return responses[idx++]!
      },
      async *stream() {
        /* unused */
      },
    }
    const { registry, executor } = buildRegistryAndExecutor()
    const runner = new AgentRunner(adapter, registry, executor, {
      model: 'mock-model',
      allowedTools: ['echo'],
      maxTurns: 8,
      contextStrategy: { type: 'sliding-window', maxTurns: 1 },
    })

    await runner.run([{ role: 'user', content: [{ type: 'text', text: 'original prompt' }] }])

    const laterCall = calls[calls.length - 1]!
    const firstUserText = laterCall[0]!.content[0]
    expect(firstUserText).toMatchObject({ type: 'text', text: 'original prompt' })
    const flattenedText = laterCall.flatMap(m => m.content.filter(c => c.type === 'text'))
    expect(flattenedText.some(c => c.type === 'text' && c.text.includes('truncated'))).toBe(true)
  })

  it('summarize strategy replaces old context and emits summary trace call', async () => {
    const calls: Array<{ messages: LLMMessage[]; options: LLMChatOptions }> = []
    const traces: TraceEvent[] = []
    const responses = [
      toolUseResponse('echo', { message: 'first turn payload '.repeat(20) }),
      toolUseResponse('echo', { message: 'second turn payload '.repeat(20) }),
      textResponse('This is a concise summary.'),
      textResponse('final answer'),
    ]
    let idx = 0
    const adapter: LLMAdapter = {
      name: 'mock',
      async chat(messages, options) {
        calls.push({ messages: messages.map(m => ({ role: m.role, content: m.content })), options })
        return responses[idx++]!
      },
      async *stream() {
        /* unused */
      },
    }
    const { registry, executor } = buildRegistryAndExecutor()
    const runner = new AgentRunner(adapter, registry, executor, {
      model: 'mock-model',
      allowedTools: ['echo'],
      maxTurns: 8,
      contextStrategy: { type: 'summarize', maxTokens: 20 },
    })

    const result = await runner.run(
      [{ role: 'user', content: [{ type: 'text', text: 'start' }] }],
      { onTrace: (e) => { traces.push(e) }, runId: 'run-summary', traceAgent: 'context-agent' },
    )

    // The summary request is identified as a single-message call made without tools.
    const summaryCall = calls.find(c => c.messages.length === 1 && c.options.tools === undefined)
    expect(summaryCall).toBeDefined()
    const llmTraces = traces.filter(t => t.type === 'llm_call')
    expect(llmTraces.some(t => t.type === 'llm_call' && t.phase === 'summary')).toBe(true)

    // Summary adapter usage must count toward RunResult.tokenUsage (maxTokenBudget).
    expect(result.tokenUsage.input_tokens).toBe(15 + 15 + 10 + 10)
    expect(result.tokenUsage.output_tokens).toBe(25 + 25 + 20 + 20)

    // After compaction, summary text is folded into the next user turn (not a
    // standalone user message), preserving user/assistant alternation.
    const turnAfterSummary = calls.find(
      c => c.messages.some(
        m => m.role === 'user' && m.content.some(
          b => b.type === 'text' && b.text.includes('[Conversation summary]'),
        ),
      ),
    )
    expect(turnAfterSummary).toBeDefined()
    const rolesAfterFirstUser = turnAfterSummary!.messages.map(m => m.role).join(',')
    expect(rolesAfterFirstUser).not.toMatch(/^user,user/)
  })

  it('custom strategy calls compress callback and uses returned messages', async () => {
    const compress = vi.fn((messages: LLMMessage[]) => messages.slice(-1))
    const calls: LLMMessage[][] = []
    const responses = [
      toolUseResponse('echo', { message: 'hello' }),
      textResponse('done'),
    ]
    let idx = 0
    const adapter: LLMAdapter = {
      name: 'mock',
      async chat(messages) {
        calls.push(messages.map(m => ({ role: m.role, content: m.content })))
        return responses[idx++]!
      },
      async *stream() {
        /* unused */
      },
    }
    const { registry, executor } = buildRegistryAndExecutor()
    const runner = new AgentRunner(adapter, registry, executor, {
      model: 'mock-model',
      allowedTools: ['echo'],
      maxTurns: 4,
      contextStrategy: {
        type: 'custom',
        compress,
      },
    })

    await runner.run([{ role: 'user', content: [{ type: 'text', text: 'custom prompt' }] }])

    expect(compress).toHaveBeenCalledOnce()
    expect(calls[1]).toHaveLength(1)
  })

  // ---------------------------------------------------------------------------
  // compact strategy
  // ---------------------------------------------------------------------------

  describe('compact strategy', () => {
    const longText = 'x'.repeat(3000)
    const longToolResult = 'result-data '.repeat(100) // ~1200 chars

    /**
     * Build a mock adapter that answers with `responseCount - 1` echo tool_use
     * responses followed by a final 'done' text response. A copy of the
     * message list passed to each chat call is appended to `calls`.
     */
    function buildMultiTurnAdapter(
      responseCount: number,
      calls: LLMMessage[][],
    ): LLMAdapter {
      const responses: LLMResponse[] = []
      for (let i = 0; i < responseCount - 1; i++) {
        responses.push(toolUseResponse('echo', { message: `turn-${i}` }))
      }
      responses.push(textResponse('done'))
      let idx = 0
      return {
        name: 'mock',
        async chat(messages) {
          calls.push(messages.map(m => ({ role: m.role, content: m.content })))
          return responses[idx++]!
        },
        async *stream() {
          /* unused */
        },
      }
    }

    /** Build a registry with an echo tool that returns a fixed result string. */
    function buildEchoRegistry(result: string): { registry: ToolRegistry; executor: ToolExecutor } {
      const registry = new ToolRegistry()
      registry.register(
        defineTool({
          name: 'echo',
          description: 'Echo input',
          inputSchema: z.object({ message: z.string() }),
          async execute() {
            return { data: result }
          },
        }),
      )
      return { registry, executor: new ToolExecutor(registry) }
    }

    it('does not activate below maxTokens threshold', async () => {
      const calls: LLMMessage[][] = []
      const adapter = buildMultiTurnAdapter(3, calls)
      const { registry, executor } = buildEchoRegistry('short')
      const runner = new AgentRunner(adapter, registry, executor, {
        model: 'mock-model',
        allowedTools: ['echo'],
        maxTurns: 8,
        contextStrategy: { type: 'compact', maxTokens: 999999 },
      })

      await runner.run([{ role: 'user', content: [{ type: 'text', text: 'start' }] }])

      // On the 3rd call (turn 3), all previous messages should still be intact
      // because estimated tokens are way below the threshold.
      const lastCall = calls[calls.length - 1]!
      const allToolResults = lastCall.flatMap(m =>
        m.content.filter(b => b.type === 'tool_result'),
      )
      for (const tr of allToolResults) {
        if (tr.type === 'tool_result') {
          expect(tr.content).not.toContain('compacted')
        }
      }
    })

    it('compresses old tool_result blocks when tokens exceed threshold', async () => {
      const calls: LLMMessage[][] = []
      const adapter = buildMultiTurnAdapter(4, calls)
      const { registry, executor } = buildEchoRegistry(longToolResult)
      const runner = new AgentRunner(adapter, registry, executor, {
        model: 'mock-model',
        allowedTools: ['echo'],
        maxTurns: 8,
        contextStrategy: {
          type: 'compact',
          maxTokens: 20, // very low to always trigger
          preserveRecentTurns: 1, // only protect the most recent turn
          minToolResultChars: 100,
        },
      })

      await runner.run([{ role: 'user', content: [{ type: 'text', text: 'start' }] }])

      // On the last call, old tool results should have compact markers.
      const lastCall = calls[calls.length - 1]!
      const toolResults = lastCall.flatMap(m =>
        m.content.filter(b => b.type === 'tool_result'),
      )
      const compacted = toolResults.filter(
        b => b.type === 'tool_result' && b.content.includes('compacted'),
      )
      expect(compacted.length).toBeGreaterThan(0)
      // Marker should include tool name.
      for (const tr of compacted) {
        if (tr.type === 'tool_result') {
          expect(tr.content).toMatch(/\[Tool result: echo/)
        }
      }
    })

    it('preserves the first user message', async () => {
      const calls: LLMMessage[][] = []
      const adapter = buildMultiTurnAdapter(4, calls)
      const { registry, executor } = buildEchoRegistry(longToolResult)
      const runner = new AgentRunner(adapter, registry, executor, {
        model: 'mock-model',
        allowedTools: ['echo'],
        maxTurns: 8,
        contextStrategy: {
          type: 'compact',
          maxTokens: 20,
          preserveRecentTurns: 1,
          minToolResultChars: 100,
        },
      })

      await runner.run([{ role: 'user', content: [{ type: 'text', text: 'original prompt' }] }])

      const lastCall = calls[calls.length - 1]!
      const firstUser = lastCall.find(m => m.role === 'user')!
      expect(firstUser.content[0]).toMatchObject({ type: 'text', text: 'original prompt' })
    })

    it('preserves tool_use blocks in old turns', async () => {
      const calls: LLMMessage[][] = []
      const adapter = buildMultiTurnAdapter(4, calls)
      const { registry, executor } = buildEchoRegistry(longToolResult)
      const runner = new AgentRunner(adapter, registry, executor, {
        model: 'mock-model',
        allowedTools: ['echo'],
        maxTurns: 8,
        contextStrategy: {
          type: 'compact',
          maxTokens: 20,
          preserveRecentTurns: 1,
          minToolResultChars: 100,
        },
      })

      await runner.run([{ role: 'user', content: [{ type: 'text', text: 'start' }] }])

      // Every assistant message should still have its tool_use block.
      const lastCall = calls[calls.length - 1]!
      const assistantMsgs = lastCall.filter(m => m.role === 'assistant')
      for (const msg of assistantMsgs) {
        const toolUses = msg.content.filter(b => b.type === 'tool_use')
        // The last assistant message is "done" (text only), others have tool_use.
        if (msg.content.some(b => b.type === 'text' && b.text === 'done')) continue
        expect(toolUses.length).toBeGreaterThan(0)
      }
    })

    it('preserves error tool_result blocks', async () => {
      const calls: LLMMessage[][] = []
      const responses: LLMResponse[] = [
        toolUseResponse('echo', { message: 'will-fail' }),
        toolUseResponse('echo', { message: 'ok' }),
        textResponse('done'),
      ]
      let idx = 0
      const adapter: LLMAdapter = {
        name: 'mock',
        async chat(messages) {
          calls.push(messages.map(m => ({ role: m.role, content: m.content })))
          return responses[idx++]!
        },
        async *stream() {
          /* unused */
        },
      }
      // Tool that fails on first call, succeeds on second.
      let callCount = 0
      const registry = new ToolRegistry()
      registry.register(
        defineTool({
          name: 'echo',
          description: 'Echo input',
          inputSchema: z.object({ message: z.string() }),
          async execute() {
            callCount++
            if (callCount === 1) {
              throw new Error('deliberate error '.repeat(40))
            }
            return { data: longToolResult }
          },
        }),
      )
      const executor = new ToolExecutor(registry)
      const runner = new AgentRunner(adapter, registry, executor, {
        model: 'mock-model',
        allowedTools: ['echo'],
        maxTurns: 8,
        contextStrategy: {
          type: 'compact',
          maxTokens: 20,
          preserveRecentTurns: 1,
          minToolResultChars: 50,
        },
      })

      await runner.run([{ role: 'user', content: [{ type: 'text', text: 'start' }] }])

      const lastCall = calls[calls.length - 1]!
      const errorResults = lastCall.flatMap(m =>
        m.content.filter(b => b.type === 'tool_result' && b.is_error),
      )
      // Error results should still have their original content (not compacted).
      for (const er of errorResults) {
        if (er.type === 'tool_result') {
          expect(er.content).not.toContain('compacted')
          expect(er.content).toContain('deliberate error')
        }
      }
    })

    it('does not re-compress markers from compressToolResults', async () => {
      const calls: LLMMessage[][] = []
      const adapter = buildMultiTurnAdapter(4, calls)
      const { registry, executor } = buildEchoRegistry(longToolResult)
      const runner = new AgentRunner(adapter, registry, executor, {
        model: 'mock-model',
        allowedTools: ['echo'],
        maxTurns: 8,
        compressToolResults: { minChars: 100 },
        contextStrategy: {
          type: 'compact',
          maxTokens: 20,
          preserveRecentTurns: 1,
          minToolResultChars: 10,
        },
      })

      await runner.run([{ role: 'user', content: [{ type: 'text', text: 'start' }] }])

      const lastCall = calls[calls.length - 1]!
      const allToolResults = lastCall.flatMap(m =>
        m.content.filter(b => b.type === 'tool_result'),
      )
      // No result should contain nested markers.
      for (const tr of allToolResults) {
        if (tr.type === 'tool_result') {
          // Should not have a compact marker wrapping another marker.
          const markerCount = (tr.content.match(/\[Tool/g) || []).length
          expect(markerCount).toBeLessThanOrEqual(1)
        }
      }
    })

    it('truncates long assistant text blocks in old turns', async () => {
      const calls: LLMMessage[][] = []
      const responses: LLMResponse[] = [
        // First turn: assistant with long text + tool_use
        {
          id: 'r1',
          content: [
            { type: 'text', text: longText },
            { type: 'tool_use', id: 'tu-1', name: 'echo', input: { message: 'hi' } },
          ],
          model: 'mock-model',
          stop_reason: 'tool_use',
          usage: { input_tokens: 10, output_tokens: 20 },
        },
        toolUseResponse('echo', { message: 'turn2' }),
        textResponse('done'),
      ]
      let idx = 0
      const adapter: LLMAdapter = {
        name: 'mock',
        async chat(messages) {
          calls.push(messages.map(m => ({ role: m.role, content: m.content })))
          return responses[idx++]!
        },
        async *stream() {
          /* unused */
        },
      }
      const { registry, executor } = buildEchoRegistry('short')
      const runner = new AgentRunner(adapter, registry, executor, {
        model: 'mock-model',
        allowedTools: ['echo'],
        maxTurns: 8,
        contextStrategy: {
          type: 'compact',
          maxTokens: 20,
          preserveRecentTurns: 1,
          minTextBlockChars: 500,
          textBlockExcerptChars: 100,
        },
      })

      await runner.run([{ role: 'user', content: [{ type: 'text', text: 'start' }] }])

      const lastCall = calls[calls.length - 1]!
      // The first assistant message (old zone) should have its text truncated.
      const firstAssistant = lastCall.find(m => m.role === 'assistant')!
      const textBlocks = firstAssistant.content.filter(b => b.type === 'text')
      const truncated = textBlocks.find(
        b => b.type === 'text' && b.text.includes('truncated'),
      )
      expect(truncated).toBeDefined()
      if (truncated && truncated.type === 'text') {
        expect(truncated.text.length).toBeLessThan(longText.length)
        expect(truncated.text).toContain(`${longText.length} chars total`)
      }
    })

    it('keeps recent turns intact within preserveRecentTurns', async () => {
      const calls: LLMMessage[][] = []
      const adapter = buildMultiTurnAdapter(4, calls)
      const { registry, executor } = buildEchoRegistry(longToolResult)
      const runner = new AgentRunner(adapter, registry, executor, {
        model: 'mock-model',
        allowedTools: ['echo'],
        maxTurns: 8,
        contextStrategy: {
          type: 'compact',
          maxTokens: 20,
          preserveRecentTurns: 1,
          minToolResultChars: 100,
        },
      })

      await runner.run([{ role: 'user', content: [{ type: 'text', text: 'start' }] }])

      // The most recent tool_result (last user message with tool_result) should
      // still contain the original long content.
      const lastCall = calls[calls.length - 1]!
      const userMsgs = lastCall.filter(m => m.role === 'user')
      const lastUserWithToolResult = [...userMsgs]
        .reverse()
        .find(m => m.content.some(b => b.type === 'tool_result'))
      expect(lastUserWithToolResult).toBeDefined()
      const recentTr = lastUserWithToolResult!.content.find(b => b.type === 'tool_result')
      if (recentTr && recentTr.type === 'tool_result') {
        expect(recentTr.content).not.toContain('compacted')
        expect(recentTr.content).toContain('result-data')
      }
    })

    it('does not compact when all turns fit in preserveRecentTurns', async () => {
      const calls: LLMMessage[][] = []
      const adapter = buildMultiTurnAdapter(3, calls)
      const { registry, executor } = buildEchoRegistry(longToolResult)
      const runner = new AgentRunner(adapter, registry, executor, {
        model: 'mock-model',
        allowedTools: ['echo'],
        maxTurns: 8,
        contextStrategy: {
          type: 'compact',
          maxTokens: 20,
          preserveRecentTurns: 10, // way more than actual turns
          minToolResultChars: 100,
        },
      })

      await runner.run([{ role: 'user', content: [{ type: 'text', text: 'start' }] }])

      // All tool results should still have original content.
      const lastCall = calls[calls.length - 1]!
      const toolResults = lastCall.flatMap(m =>
        m.content.filter(b => b.type === 'tool_result'),
      )
      for (const tr of toolResults) {
        if (tr.type === 'tool_result') {
          expect(tr.content).not.toContain('compacted')
        }
      }
    })

    it('maintains correct role alternation after compaction', async () => {
      const calls: LLMMessage[][] = []
      const adapter = buildMultiTurnAdapter(5, calls)
      const { registry, executor } = buildEchoRegistry(longToolResult)
      const runner = new AgentRunner(adapter, registry, executor, {
        model: 'mock-model',
        allowedTools: ['echo'],
        maxTurns: 10,
        contextStrategy: {
          type: 'compact',
          maxTokens: 20,
          preserveRecentTurns: 1,
          minToolResultChars: 100,
        },
      })

      await runner.run([{ role: 'user', content: [{ type: 'text', text: 'start' }] }])

      // Check all LLM calls for role alternation.
      for (const callMsgs of calls) {
        for (let i = 1; i < callMsgs.length; i++) {
          expect(callMsgs[i]!.role).not.toBe(callMsgs[i - 1]!.role)
        }
      }
    })

    it('returns ZERO_USAGE (no LLM cost from compaction)', async () => {
      const calls: LLMMessage[][] = []
      const adapter = buildMultiTurnAdapter(4, calls)
      const { registry, executor } = buildEchoRegistry(longToolResult)
      const runner = new AgentRunner(adapter, registry, executor, {
        model: 'mock-model',
        allowedTools: ['echo'],
        maxTurns: 8,
        contextStrategy: {
          type: 'compact',
          maxTokens: 20,
          preserveRecentTurns: 1,
          minToolResultChars: 100,
        },
      })

      const result = await runner.run([
        { role: 'user', content: [{ type: 'text', text: 'start' }] },
      ])

      // Token usage should only reflect the 4 actual LLM calls (no extra from compaction).
      // Each toolUseResponse: input=15, output=25. textResponse: input=10, output=20.
      // 3 tool calls + 1 final = (15*3 + 10) input, (25*3 + 20) output.
      expect(result.tokenUsage.input_tokens).toBe(15 * 3 + 10)
      expect(result.tokenUsage.output_tokens).toBe(25 * 3 + 20)
    })
  })
})