From 1529dd1346baa5408a2b98100182093a1e07b39e Mon Sep 17 00:00:00 2001 From: JackChen Date: Thu, 16 Apr 2026 18:09:37 +0800 Subject: [PATCH] feat: post-consumption tool result compression (#116) Replace consumed tool results with compact markers before each LLM call, freeing context budget in multi-turn agent runs. A tool result is "consumed" once the assistant has produced a response after seeing it. - Add `compressToolResults` option to AgentConfig / RunnerOptions - Runs before contextStrategy (lightweight, no LLM calls) - Error results and short results (< minChars, default 500) are skipped - 9 test cases covering default off, compression, parallel tools, 4+ turn compounding, error exemption, custom threshold, and contextStrategy coexistence --- src/agent/agent.ts | 1 + src/agent/runner.ts | 83 +++++ src/types.ts | 15 + tests/tool-result-compression.test.ts | 456 ++++++++++++++++++++++++++ 4 files changed, 555 insertions(+) create mode 100644 tests/tool-result-compression.test.ts diff --git a/src/agent/agent.ts b/src/agent/agent.ts index 7270e93..d7c808e 100644 --- a/src/agent/agent.ts +++ b/src/agent/agent.ts @@ -154,6 +154,7 @@ export class Agent { loopDetection: this.config.loopDetection, maxTokenBudget: this.config.maxTokenBudget, contextStrategy: this.config.contextStrategy, + compressToolResults: this.config.compressToolResults, } this.runner = new AgentRunner( diff --git a/src/agent/runner.ts b/src/agent/runner.ts index df1cbc0..9f910a2 100644 --- a/src/agent/runner.ts +++ b/src/agent/runner.ts @@ -98,6 +98,11 @@ export interface RunnerOptions { readonly maxTokenBudget?: number /** Optional context compression strategy for long multi-turn runs. */ readonly contextStrategy?: ContextStrategy + /** + * Compress tool results that the agent has already processed. + * See {@link AgentConfig.compressToolResults} for details. 
+ */ + readonly compressToolResults?: boolean | { readonly minChars?: number } } /** @@ -176,6 +181,9 @@ function addTokenUsage(a: TokenUsage, b: TokenUsage): TokenUsage { const ZERO_USAGE: TokenUsage = { input_tokens: 0, output_tokens: 0 } +/** Default minimum content length before tool result compression kicks in. */ +const DEFAULT_MIN_COMPRESS_CHARS = 500 + /** * Prepends synthetic framing text to the first user message so we never emit * consecutive `user` turns (Bedrock) and summaries do not concatenate onto @@ -569,6 +577,12 @@ export class AgentRunner { turns++ + // Compress consumed tool results before context strategy (lightweight, + // no LLM calls) so the strategy operates on already-reduced messages. + if (this.options.compressToolResults && turns > 1) { + conversationMessages = this.compressConsumedToolResults(conversationMessages) + } + // Optionally compact context before each LLM call after the first turn. if (this.options.contextStrategy && turns > 1) { const compacted = await this.applyContextStrategy( @@ -846,6 +860,75 @@ export class AgentRunner { // Private helpers // ------------------------------------------------------------------------- + /** + * Replace consumed tool results with compact markers. + * + * A tool_result is "consumed" when the assistant has produced a response + * after seeing it (i.e. there is an assistant message following the user + * message that contains the tool_result). The most recent user message + * with tool results is always kept intact — the LLM is about to see it. + * + * Error results and results shorter than `minChars` are never compressed. + */ + private compressConsumedToolResults(messages: LLMMessage[]): LLMMessage[] { + const config = this.options.compressToolResults + if (!config) return messages + + const minChars = typeof config === 'object' + ? (config.minChars ?? DEFAULT_MIN_COMPRESS_CHARS) + : DEFAULT_MIN_COMPRESS_CHARS + + // Find the last user message that carries tool_result blocks. 
+ let lastToolResultUserIdx = -1 + for (let i = messages.length - 1; i >= 0; i--) { + if ( + messages[i]!.role === 'user' && + messages[i]!.content.some(b => b.type === 'tool_result') + ) { + lastToolResultUserIdx = i + break + } + } + + // Nothing to compress if no message precedes the last tool-result user message. + if (lastToolResultUserIdx <= 0) return messages + + let anyChanged = false + const result = messages.map((msg, idx) => { + // Only compress user messages that appear before the last tool-result message. + if (msg.role !== 'user' || idx >= lastToolResultUserIdx) return msg + + const hasToolResult = msg.content.some(b => b.type === 'tool_result') + if (!hasToolResult) return msg + + let msgChanged = false + const newContent = msg.content.map((block): ContentBlock => { + if (block.type !== 'tool_result') return block + + // Never compress error results — they carry diagnostic value. + if (block.is_error) return block + + // Skip short results — the marker itself has overhead. + if (block.content.length < minChars) return block + + msgChanged = true + return { + type: 'tool_result', + tool_use_id: block.tool_use_id, + content: `[Tool output compressed — ${block.content.length} chars, already processed]`, + } satisfies ToolResultBlock + }) + + if (msgChanged) { + anyChanged = true + return { role: msg.role, content: newContent } as LLMMessage + } + return msg + }) + + return anyChanged ? result : messages + } + /** * Build the {@link ToolUseContext} passed to every tool execution. * Identifies this runner as the invoking agent. diff --git a/src/types.ts b/src/types.ts index 2ebed59..f61decb 100644 --- a/src/types.ts +++ b/src/types.ts @@ -270,6 +270,21 @@ export interface AgentConfig { * takes priority over this value. */ readonly maxToolOutputChars?: number + /** + * Compress tool results that the agent has already processed. + * + * In multi-turn runs, tool results persist in the conversation even after the + * agent has acted on them. 
When enabled, consumed tool results (those followed + * by an assistant response) are replaced with a short marker before the next + * LLM call, freeing context budget for new reasoning. + * + * - `true` — enable with default threshold (500 chars) + * - `{ minChars: N }` — only compress results of at least N characters + * - `false` / `undefined` — disabled (default) + * + * Error tool results are never compressed. + */ + readonly compressToolResults?: boolean | { readonly minChars?: number } /** * Optional Zod schema for structured output. When set, the agent's final * output is parsed as JSON and validated against this schema. A single diff --git a/tests/tool-result-compression.test.ts b/tests/tool-result-compression.test.ts new file mode 100644 index 0000000..085da56 --- /dev/null +++ b/tests/tool-result-compression.test.ts @@ -0,0 +1,456 @@ +import { describe, it, expect } from 'vitest' +import { z } from 'zod' +import { AgentRunner } from '../src/agent/runner.js' +import { ToolRegistry, defineTool } from '../src/tool/framework.js' +import { ToolExecutor } from '../src/tool/executor.js' +import type { LLMAdapter, LLMMessage, LLMResponse } from '../src/types.js' + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +function textResponse(text: string): LLMResponse { + return { + id: `resp-${Math.random().toString(36).slice(2)}`, + content: [{ type: 'text', text }], + model: 'mock-model', + stop_reason: 'end_turn', + usage: { input_tokens: 10, output_tokens: 20 }, + } +} + +function toolUseResponse(toolName: string, input: Record): LLMResponse { + return { + id: `resp-${Math.random().toString(36).slice(2)}`, + content: [{ + type: 'tool_use', + id: `tu-${Math.random().toString(36).slice(2)}`, + name: toolName, + input, + }], + model: 'mock-model', + stop_reason: 'tool_use', + usage: { input_tokens: 15, output_tokens: 25 }, + } +} + +function 
buildRegistryAndExecutor( + toolOutput: string = 'x'.repeat(600), +): { registry: ToolRegistry; executor: ToolExecutor } { + const registry = new ToolRegistry() + registry.register( + defineTool({ + name: 'echo', + description: 'Echo input', + inputSchema: z.object({ message: z.string() }), + async execute() { + return { data: toolOutput } + }, + }), + ) + return { registry, executor: new ToolExecutor(registry) } +} + +function buildErrorRegistryAndExecutor(): { registry: ToolRegistry; executor: ToolExecutor } { + const registry = new ToolRegistry() + registry.register( + defineTool({ + name: 'fail', + description: 'Always fails', + inputSchema: z.object({ message: z.string() }), + async execute() { + return { data: 'E'.repeat(600), isError: true } + }, + }), + ) + return { registry, executor: new ToolExecutor(registry) } +} + +/** Extract all tool_result content strings from messages sent to the LLM. */ +function extractToolResultContents(messages: LLMMessage[]): string[] { + return messages.flatMap(m => + m.content + .filter((b): b is { type: 'tool_result'; tool_use_id: string; content: string; is_error?: boolean } => + b.type === 'tool_result') + .map(b => b.content), + ) +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +describe('AgentRunner compressToolResults', () => { + it('does NOT compress when compressToolResults is not set (default)', async () => { + const calls: LLMMessage[][] = [] + const longOutput = 'x'.repeat(600) + const responses = [ + toolUseResponse('echo', { message: 't1' }), + toolUseResponse('echo', { message: 't2' }), + textResponse('done'), + ] + let idx = 0 + const adapter: LLMAdapter = { + name: 'mock', + async chat(messages) { + calls.push(messages.map(m => ({ role: m.role, content: [...m.content] }))) + return responses[idx++]! 
+ }, + async *stream() { /* unused */ }, + } + const { registry, executor } = buildRegistryAndExecutor(longOutput) + const runner = new AgentRunner(adapter, registry, executor, { + model: 'mock-model', + allowedTools: ['echo'], + maxTurns: 5, + // compressToolResults not set + }) + + await runner.run([{ role: 'user', content: [{ type: 'text', text: 'start' }] }]) + + // Turn 3 should still see full tool results from turn 1 + const turn3Messages = calls[2]! + const allToolResults = extractToolResultContents(turn3Messages) + expect(allToolResults.every(c => c === longOutput)).toBe(true) + }) + + it('compresses consumed tool results on turn 3+', async () => { + const calls: LLMMessage[][] = [] + const longOutput = 'x'.repeat(600) + const responses = [ + toolUseResponse('echo', { message: 't1' }), + toolUseResponse('echo', { message: 't2' }), + textResponse('done'), + ] + let idx = 0 + const adapter: LLMAdapter = { + name: 'mock', + async chat(messages) { + calls.push(messages.map(m => ({ role: m.role, content: [...m.content] }))) + return responses[idx++]! + }, + async *stream() { /* unused */ }, + } + const { registry, executor } = buildRegistryAndExecutor(longOutput) + const runner = new AgentRunner(adapter, registry, executor, { + model: 'mock-model', + allowedTools: ['echo'], + maxTurns: 5, + compressToolResults: true, + }) + + await runner.run([{ role: 'user', content: [{ type: 'text', text: 'start' }] }]) + + // Turn 3: the LLM should see a compressed marker for turn 1 results + // but the full output for turn 2 results (most recent, not yet consumed). + const turn3Messages = calls[2]! 
+ const allToolResults = extractToolResultContents(turn3Messages) + expect(allToolResults).toHaveLength(2) + + // First result (turn 1) should be compressed + expect(allToolResults[0]).toContain('compressed') + expect(allToolResults[0]).toContain('600 chars') + + // Second result (turn 2, most recent) should be preserved in full + expect(allToolResults[1]).toBe(longOutput) + }) + + it('preserves tool_use_id on compressed results', async () => { + const calls: LLMMessage[][] = [] + const longOutput = 'x'.repeat(600) + const responses = [ + toolUseResponse('echo', { message: 't1' }), + toolUseResponse('echo', { message: 't2' }), + textResponse('done'), + ] + let idx = 0 + const adapter: LLMAdapter = { + name: 'mock', + async chat(messages) { + calls.push(messages.map(m => ({ role: m.role, content: [...m.content] }))) + return responses[idx++]! + }, + async *stream() { /* unused */ }, + } + const { registry, executor } = buildRegistryAndExecutor(longOutput) + const runner = new AgentRunner(adapter, registry, executor, { + model: 'mock-model', + allowedTools: ['echo'], + maxTurns: 5, + compressToolResults: true, + }) + + await runner.run([{ role: 'user', content: [{ type: 'text', text: 'start' }] }]) + + // Turn 3: verify compressed result still has tool_use_id + const turn3Messages = calls[2]! 
+ const toolResultBlocks = turn3Messages.flatMap(m => + m.content.filter(b => b.type === 'tool_result'), + ) + for (const block of toolResultBlocks) { + expect(block).toHaveProperty('tool_use_id') + expect((block as { tool_use_id: string }).tool_use_id).toBeTruthy() + } + }) + + it('skips short tool results below minChars threshold', async () => { + const calls: LLMMessage[][] = [] + const shortOutput = 'short' // 5 chars, well below 500 default + const responses = [ + toolUseResponse('echo', { message: 't1' }), + toolUseResponse('echo', { message: 't2' }), + textResponse('done'), + ] + let idx = 0 + const adapter: LLMAdapter = { + name: 'mock', + async chat(messages) { + calls.push(messages.map(m => ({ role: m.role, content: [...m.content] }))) + return responses[idx++]! + }, + async *stream() { /* unused */ }, + } + const { registry, executor } = buildRegistryAndExecutor(shortOutput) + const runner = new AgentRunner(adapter, registry, executor, { + model: 'mock-model', + allowedTools: ['echo'], + maxTurns: 5, + compressToolResults: true, + }) + + await runner.run([{ role: 'user', content: [{ type: 'text', text: 'start' }] }]) + + // Turn 3: short results should NOT be compressed + const turn3Messages = calls[2]! + const allToolResults = extractToolResultContents(turn3Messages) + expect(allToolResults.every(c => c === shortOutput)).toBe(true) + }) + + it('respects custom minChars threshold', async () => { + const calls: LLMMessage[][] = [] + const output = 'x'.repeat(200) + const responses = [ + toolUseResponse('echo', { message: 't1' }), + toolUseResponse('echo', { message: 't2' }), + textResponse('done'), + ] + let idx = 0 + const adapter: LLMAdapter = { + name: 'mock', + async chat(messages) { + calls.push(messages.map(m => ({ role: m.role, content: [...m.content] }))) + return responses[idx++]! 
+ }, + async *stream() { /* unused */ }, + } + const { registry, executor } = buildRegistryAndExecutor(output) + const runner = new AgentRunner(adapter, registry, executor, { + model: 'mock-model', + allowedTools: ['echo'], + maxTurns: 5, + compressToolResults: { minChars: 100 }, + }) + + await runner.run([{ role: 'user', content: [{ type: 'text', text: 'start' }] }]) + + // With minChars=100, the 200-char output should be compressed + const turn3Messages = calls[2]! + const allToolResults = extractToolResultContents(turn3Messages) + expect(allToolResults[0]).toContain('compressed') + expect(allToolResults[0]).toContain('200 chars') + }) + + it('never compresses error tool results', async () => { + const calls: LLMMessage[][] = [] + const responses = [ + toolUseResponse('fail', { message: 't1' }), + toolUseResponse('fail', { message: 't2' }), + textResponse('done'), + ] + let idx = 0 + const adapter: LLMAdapter = { + name: 'mock', + async chat(messages) { + calls.push(messages.map(m => ({ role: m.role, content: [...m.content] }))) + return responses[idx++]! + }, + async *stream() { /* unused */ }, + } + const { registry, executor } = buildErrorRegistryAndExecutor() + const runner = new AgentRunner(adapter, registry, executor, { + model: 'mock-model', + allowedTools: ['fail'], + maxTurns: 5, + compressToolResults: true, + }) + + await runner.run([{ role: 'user', content: [{ type: 'text', text: 'start' }] }]) + + // Error results should never be compressed even if long + const turn3Messages = calls[2]! 
+ const allToolResults = extractToolResultContents(turn3Messages) + expect(allToolResults.every(c => c === 'E'.repeat(600))).toBe(true) + }) + + it('compresses selectively in multi-block tool_result messages (parallel tool calls)', async () => { + const calls: LLMMessage[][] = [] + // Two tools: one returns long output, one returns short output + const registry = new ToolRegistry() + registry.register( + defineTool({ + name: 'long_tool', + description: 'Returns long output', + inputSchema: z.object({ msg: z.string() }), + async execute() { return { data: 'L'.repeat(600) } }, + }), + ) + registry.register( + defineTool({ + name: 'short_tool', + description: 'Returns short output', + inputSchema: z.object({ msg: z.string() }), + async execute() { return { data: 'S'.repeat(50) } }, + }), + ) + const executor = new ToolExecutor(registry) + + // Turn 1: model calls both tools in parallel + const parallelResponse: LLMResponse = { + id: 'resp-parallel', + content: [ + { type: 'tool_use', id: 'tu-long', name: 'long_tool', input: { msg: 'a' } }, + { type: 'tool_use', id: 'tu-short', name: 'short_tool', input: { msg: 'b' } }, + ], + model: 'mock-model', + stop_reason: 'tool_use', + usage: { input_tokens: 15, output_tokens: 25 }, + } + const responses = [ + parallelResponse, + toolUseResponse('long_tool', { msg: 't2' }), + textResponse('done'), + ] + let idx = 0 + const adapter: LLMAdapter = { + name: 'mock', + async chat(messages) { + calls.push(messages.map(m => ({ role: m.role, content: [...m.content] }))) + return responses[idx++]! + }, + async *stream() { /* unused */ }, + } + + const runner = new AgentRunner(adapter, registry, executor, { + model: 'mock-model', + allowedTools: ['long_tool', 'short_tool'], + maxTurns: 5, + compressToolResults: true, + }) + + await runner.run([{ role: 'user', content: [{ type: 'text', text: 'start' }] }]) + + // Turn 3: the parallel results from turn 1 should be selectively compressed. + // The long_tool result (600 chars) → compressed. 
The short_tool result (50 chars) → kept. + const turn3Messages = calls[2]! + const turn1ToolResults = turn3Messages.flatMap(m => + m.content.filter((b): b is { type: 'tool_result'; tool_use_id: string; content: string } => + b.type === 'tool_result'), + ) + // Find the results from turn 1 (first user message with tool_results) + const firstToolResultMsg = turn3Messages.find( + m => m.role === 'user' && m.content.some(b => b.type === 'tool_result'), + )! + const blocks = firstToolResultMsg.content.filter( + (b): b is { type: 'tool_result'; tool_use_id: string; content: string } => + b.type === 'tool_result', + ) + + // One should be compressed (long), one should be intact (short) + const compressedBlocks = blocks.filter(b => b.content.includes('compressed')) + const intactBlocks = blocks.filter(b => !b.content.includes('compressed')) + expect(compressedBlocks).toHaveLength(1) + expect(compressedBlocks[0]!.content).toContain('600 chars') + expect(intactBlocks).toHaveLength(1) + expect(intactBlocks[0]!.content).toBe('S'.repeat(50)) + }) + + it('compounds compression across 4+ turns', async () => { + const calls: LLMMessage[][] = [] + const longOutput = 'x'.repeat(600) + const responses = [ + toolUseResponse('echo', { message: 't1' }), + toolUseResponse('echo', { message: 't2' }), + toolUseResponse('echo', { message: 't3' }), + textResponse('done'), + ] + let idx = 0 + const adapter: LLMAdapter = { + name: 'mock', + async chat(messages) { + calls.push(messages.map(m => ({ role: m.role, content: [...m.content] }))) + return responses[idx++]! 
+ }, + async *stream() { /* unused */ }, + } + const { registry, executor } = buildRegistryAndExecutor(longOutput) + const runner = new AgentRunner(adapter, registry, executor, { + model: 'mock-model', + allowedTools: ['echo'], + maxTurns: 6, + compressToolResults: true, + }) + + await runner.run([{ role: 'user', content: [{ type: 'text', text: 'start' }] }]) + + // Turn 4: turns 1 and 2 should both be compressed, turn 3 should be intact + const turn4Messages = calls[3]! + const allToolResults = extractToolResultContents(turn4Messages) + expect(allToolResults).toHaveLength(3) + + // First two are compressed (turns 1 & 2) + expect(allToolResults[0]).toContain('compressed') + expect(allToolResults[1]).toContain('compressed') + + // Last one (turn 3, most recent) preserved + expect(allToolResults[2]).toBe(longOutput) + }) + + it('works together with contextStrategy', async () => { + const calls: LLMMessage[][] = [] + const longOutput = 'x'.repeat(600) + const responses = [ + toolUseResponse('echo', { message: 't1' }), + toolUseResponse('echo', { message: 't2' }), + textResponse('done'), + ] + let idx = 0 + const adapter: LLMAdapter = { + name: 'mock', + async chat(messages) { + calls.push(messages.map(m => ({ role: m.role, content: [...m.content] }))) + return responses[idx++]! + }, + async *stream() { /* unused */ }, + } + const { registry, executor } = buildRegistryAndExecutor(longOutput) + const runner = new AgentRunner(adapter, registry, executor, { + model: 'mock-model', + allowedTools: ['echo'], + maxTurns: 5, + compressToolResults: true, + contextStrategy: { type: 'sliding-window', maxTurns: 10 }, + }) + + const result = await runner.run([ + { role: 'user', content: [{ type: 'text', text: 'start' }] }, + ]) + + // Should complete without error; both features coexist + expect(result.output).toBe('done') + + // Turn 3 should have compressed turn 1 results + const turn3Messages = calls[2]! 
+ const allToolResults = extractToolResultContents(turn3Messages) + expect(allToolResults[0]).toContain('compressed') + }) +})