open-multi-agent/tests/tool-result-compression.tes...

import { describe, it, expect } from 'vitest'
import { z } from 'zod'
import { AgentRunner } from '../src/agent/runner.js'
import { ToolRegistry, defineTool } from '../src/tool/framework.js'
import { ToolExecutor } from '../src/tool/executor.js'
import type { LLMAdapter, LLMMessage, LLMResponse } from '../src/types.js'

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/**
 * Build a mock LLM response whose only content is one text block.
 *
 * @param text - Text placed in the single content block.
 * @returns A response with a random `resp-` id, `stop_reason` `'end_turn'`
 *   and fixed usage counts (10 input / 20 output tokens).
 */
function textResponse(text: string): LLMResponse {
  const randomSuffix = Math.random().toString(36).slice(2)
  const response: LLMResponse = {
    id: `resp-${randomSuffix}`,
    content: [{ type: 'text', text: text }],
    model: 'mock-model',
    stop_reason: 'end_turn',
    usage: { input_tokens: 10, output_tokens: 20 },
  }
  return response
}

/**
 * Build a mock LLM response that requests exactly one tool call.
 *
 * @param toolName - Name written into the `tool_use` block.
 * @param input - Arguments object attached to the `tool_use` block as-is.
 * @returns A response with a random `resp-` id, one `tool_use` block carrying
 *   its own random `tu-` id, `stop_reason` `'tool_use'` and fixed usage
 *   counts (15 input / 25 output tokens).
 */
function toolUseResponse(toolName: string, input: Record<string, unknown>): LLMResponse {
  // Response id is generated before the tool-use id, mirroring field order.
  const responseId = `resp-${Math.random().toString(36).slice(2)}`
  const toolUseId = `tu-${Math.random().toString(36).slice(2)}`
  const response: LLMResponse = {
    id: responseId,
    content: [{ type: 'tool_use', id: toolUseId, name: toolName, input: input }],
    model: 'mock-model',
    stop_reason: 'tool_use',
    usage: { input_tokens: 15, output_tokens: 25 },
  }
  return response
}

/**
 * Create a registry holding a single `echo` tool, plus an executor built
 * from that registry.
 *
 * The tool ignores its validated input and always resolves with
 * `{ data: toolOutput }`.
 *
 * @param toolOutput - String the tool returns; defaults to 600 `'x'` characters.
 * @returns The registry and the executor constructed from it.
 */
function buildRegistryAndExecutor(
  toolOutput: string = 'x'.repeat(600),
): { registry: ToolRegistry; executor: ToolExecutor } {
  const registry = new ToolRegistry()
  const echoTool = defineTool({
    name: 'echo',
    description: 'Echo input',
    inputSchema: z.object({ message: z.string() }),
    execute: async () => ({ data: toolOutput }),
  })
  registry.register(echoTool)

  const executor = new ToolExecutor(registry)
  return { registry, executor }
}

/**
 * Create a registry holding a single `fail` tool, plus an executor built
 * from that registry.
 *
 * The tool always resolves with 600 `'E'` characters as `data` and
 * `isError: true`.
 *
 * @returns The registry and the executor constructed from it.
 */
function buildErrorRegistryAndExecutor(): { registry: ToolRegistry; executor: ToolExecutor } {
  const registry = new ToolRegistry()
  const failingTool = defineTool({
    name: 'fail',
    description: 'Always fails',
    inputSchema: z.object({ message: z.string() }),
    execute: async () => ({ data: 'E'.repeat(600), isError: true }),
  })
  registry.register(failingTool)

  const executor = new ToolExecutor(registry)
  return { registry, executor }
}

/**
 * Collect the `content` string of every `tool_result` block found in the
 * given messages, in message order and then block order.
 *
 * @param messages - Messages whose content blocks are scanned.
 * @returns The collected strings; empty when no `tool_result` block exists.
 */
function extractToolResultContents(messages: LLMMessage[]): string[] {
  type ToolResultBlock = { type: 'tool_result'; tool_use_id: string; content: string; is_error?: boolean }

  const contents: string[] = []
  for (const message of messages) {
    for (const block of message.content) {
      if (block.type !== 'tool_result') continue
      contents.push((block as ToolResultBlock).content)
    }
  }
  return contents
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

describe('AgentRunner compressToolResults', () => {
  /** Shape of the tool_result blocks these tests inspect. */
  type ToolResultBlock = {
    type: 'tool_result'
    tool_use_id: string
    content: string
    is_error?: boolean
  }

  /** Options object passed as the fourth `AgentRunner` constructor argument. */
  type RunnerOptions = NonNullable<ConstructorParameters<typeof AgentRunner>[3]>

  /** Type guard selecting `tool_result` blocks out of a message's content. */
  const isToolResult = (block: LLMMessage['content'][number]): block is ToolResultBlock =>
    block.type === 'tool_result'

  /** Script used by most tests: two `echo` tool calls followed by a final text answer. */
  function twoEchoCallsThenDone(): LLMResponse[] {
    return [
      toolUseResponse('echo', { message: 't1' }),
      toolUseResponse('echo', { message: 't2' }),
      textResponse('done'),
    ]
  }

  /**
   * Run an `AgentRunner` against a mock adapter that replays `responses` in
   * order, starting from a single user message with the text `start`.
   *
   * @param responses - Responses returned by successive `chat()` calls.
   * @param tools - Registry and executor handed to the runner.
   * @param options - Runner options under test.
   * @returns `calls` (a snapshot of the messages received by each `chat()`
   *   call, in call order) and `result` (whatever `runner.run` resolved with).
   */
  async function runScripted(
    responses: readonly LLMResponse[],
    tools: { registry: ToolRegistry; executor: ToolExecutor },
    options: RunnerOptions,
  ) {
    const calls: LLMMessage[][] = []
    let nextResponse = 0
    const adapter: LLMAdapter = {
      name: 'mock',
      async chat(messages) {
        // Record copies of the message objects and their content arrays —
        // presumably so later changes to the runner's own arrays cannot
        // rewrite what was captured for an earlier turn.
        calls.push(messages.map(m => ({ role: m.role, content: [...m.content] })))
        const response = responses[nextResponse++]
        if (response === undefined) {
          // Fail loudly instead of handing `undefined` to the runner.
          throw new Error(
            `mock adapter: chat() call #${nextResponse} but only ${responses.length} responses were scripted`,
          )
        }
        return response
      },
      async *stream() { /* unused */ },
    }

    const runner = new AgentRunner(adapter, tools.registry, tools.executor, options)
    const result = await runner.run([{ role: 'user', content: [{ type: 'text', text: 'start' }] }])
    return { calls, result }
  }

  /**
   * Return the messages the adapter received on the given 1-based turn,
   * throwing a descriptive error when the runner made fewer calls than that.
   */
  function messagesSentOnTurn(calls: LLMMessage[][], turn: number): LLMMessage[] {
    const messages = calls[turn - 1]
    if (messages === undefined) {
      throw new Error(`expected at least ${turn} LLM calls, but only ${calls.length} were made`)
    }
    return messages
  }

  it('does NOT compress when compressToolResults is not set (default)', async () => {
    const longOutput = 'x'.repeat(600)
    const { calls } = await runScripted(
      twoEchoCallsThenDone(),
      buildRegistryAndExecutor(longOutput),
      {
        model: 'mock-model',
        allowedTools: ['echo'],
        maxTurns: 5,
        // compressToolResults not set
      },
    )

    // Turn 3 should still see full tool results from turn 1
    const allToolResults = extractToolResultContents(messagesSentOnTurn(calls, 3))
    // Guard against `.every` passing vacuously on an empty list.
    expect(allToolResults).toHaveLength(2)
    expect(allToolResults.every(c => c === longOutput)).toBe(true)
  })

  it('compresses consumed tool results on turn 3+', async () => {
    const longOutput = 'x'.repeat(600)
    const { calls } = await runScripted(
      twoEchoCallsThenDone(),
      buildRegistryAndExecutor(longOutput),
      {
        model: 'mock-model',
        allowedTools: ['echo'],
        maxTurns: 5,
        compressToolResults: true,
      },
    )

    // Turn 3: the LLM should see a compressed marker for turn 1 results
    // but the full output for turn 2 results (most recent, not yet consumed).
    const allToolResults = extractToolResultContents(messagesSentOnTurn(calls, 3))
    expect(allToolResults).toHaveLength(2)

    // First result (turn 1) should be compressed
    expect(allToolResults[0]).toContain('compressed')
    expect(allToolResults[0]).toContain('600 chars')

    // Second result (turn 2, most recent) should be preserved in full
    expect(allToolResults[1]).toBe(longOutput)
  })

  it('preserves tool_use_id on compressed results', async () => {
    const longOutput = 'x'.repeat(600)
    const { calls } = await runScripted(
      twoEchoCallsThenDone(),
      buildRegistryAndExecutor(longOutput),
      {
        model: 'mock-model',
        allowedTools: ['echo'],
        maxTurns: 5,
        compressToolResults: true,
      },
    )

    // Turn 3: verify compressed result still has tool_use_id
    const toolResultBlocks = messagesSentOnTurn(calls, 3).flatMap(m => m.content.filter(isToolResult))
    // Make sure there is something to check and that the first block really
    // is a compressed one; otherwise the loop below proves nothing.
    expect(toolResultBlocks).toHaveLength(2)
    expect(toolResultBlocks[0]?.content).toContain('compressed')

    for (const block of toolResultBlocks) {
      expect(block).toHaveProperty('tool_use_id')
      expect(block.tool_use_id).toBeTruthy()
    }
  })

  it('skips short tool results below minChars threshold', async () => {
    const shortOutput = 'short' // 5 chars, well below 500 default
    const { calls } = await runScripted(
      twoEchoCallsThenDone(),
      buildRegistryAndExecutor(shortOutput),
      {
        model: 'mock-model',
        allowedTools: ['echo'],
        maxTurns: 5,
        compressToolResults: true,
      },
    )

    // Turn 3: short results should NOT be compressed
    const allToolResults = extractToolResultContents(messagesSentOnTurn(calls, 3))
    // Guard against `.every` passing vacuously on an empty list.
    expect(allToolResults).toHaveLength(2)
    expect(allToolResults.every(c => c === shortOutput)).toBe(true)
  })

  it('respects custom minChars threshold', async () => {
    const output = 'x'.repeat(200)
    const { calls } = await runScripted(
      twoEchoCallsThenDone(),
      buildRegistryAndExecutor(output),
      {
        model: 'mock-model',
        allowedTools: ['echo'],
        maxTurns: 5,
        compressToolResults: { minChars: 100 },
      },
    )

    // With minChars=100, the 200-char output should be compressed
    const allToolResults = extractToolResultContents(messagesSentOnTurn(calls, 3))
    expect(allToolResults[0]).toContain('compressed')
    expect(allToolResults[0]).toContain('200 chars')
  })

  it('never compresses error tool results', async () => {
    const { calls } = await runScripted(
      [
        toolUseResponse('fail', { message: 't1' }),
        toolUseResponse('fail', { message: 't2' }),
        textResponse('done'),
      ],
      buildErrorRegistryAndExecutor(),
      {
        model: 'mock-model',
        allowedTools: ['fail'],
        maxTurns: 5,
        compressToolResults: true,
      },
    )

    // Error results should never be compressed even if long
    const allToolResults = extractToolResultContents(messagesSentOnTurn(calls, 3))
    // Guard against `.every` passing vacuously on an empty list.
    expect(allToolResults).toHaveLength(2)
    expect(allToolResults.every(c => c === 'E'.repeat(600))).toBe(true)
  })

  it('compresses selectively in multi-block tool_result messages (parallel tool calls)', async () => {
    // Two tools: one returns long output, one returns short output
    const registry = new ToolRegistry()
    registry.register(
      defineTool({
        name: 'long_tool',
        description: 'Returns long output',
        inputSchema: z.object({ msg: z.string() }),
        async execute() { return { data: 'L'.repeat(600) } },
      }),
    )
    registry.register(
      defineTool({
        name: 'short_tool',
        description: 'Returns short output',
        inputSchema: z.object({ msg: z.string() }),
        async execute() { return { data: 'S'.repeat(50) } },
      }),
    )
    const executor = new ToolExecutor(registry)

    // Turn 1: model calls both tools in parallel
    const parallelResponse: LLMResponse = {
      id: 'resp-parallel',
      content: [
        { type: 'tool_use', id: 'tu-long', name: 'long_tool', input: { msg: 'a' } },
        { type: 'tool_use', id: 'tu-short', name: 'short_tool', input: { msg: 'b' } },
      ],
      model: 'mock-model',
      stop_reason: 'tool_use',
      usage: { input_tokens: 15, output_tokens: 25 },
    }

    const { calls } = await runScripted(
      [
        parallelResponse,
        toolUseResponse('long_tool', { msg: 't2' }),
        textResponse('done'),
      ],
      { registry, executor },
      {
        model: 'mock-model',
        allowedTools: ['long_tool', 'short_tool'],
        maxTurns: 5,
        compressToolResults: true,
      },
    )

    // Turn 3: the parallel results from turn 1 should be selectively compressed.
    // The long_tool result (600 chars) → compressed. The short_tool result (50 chars) → kept.
    const turn3Messages = messagesSentOnTurn(calls, 3)

    // Find the results from turn 1 (first user message with tool_results)
    const firstToolResultMsg = turn3Messages.find(
      m => m.role === 'user' && m.content.some(b => b.type === 'tool_result'),
    )
    if (firstToolResultMsg === undefined) {
      throw new Error('no user message carrying tool_result blocks was sent on turn 3')
    }
    const blocks = firstToolResultMsg.content.filter(isToolResult)

    // One should be compressed (long), one should be intact (short)
    const compressedBlocks = blocks.filter(b => b.content.includes('compressed'))
    const intactBlocks = blocks.filter(b => !b.content.includes('compressed'))
    expect(compressedBlocks).toHaveLength(1)
    expect(compressedBlocks[0]?.content).toContain('600 chars')
    expect(intactBlocks).toHaveLength(1)
    expect(intactBlocks[0]?.content).toBe('S'.repeat(50))
  })

  it('compounds compression across 4+ turns', async () => {
    const longOutput = 'x'.repeat(600)
    const { calls } = await runScripted(
      [
        toolUseResponse('echo', { message: 't1' }),
        toolUseResponse('echo', { message: 't2' }),
        toolUseResponse('echo', { message: 't3' }),
        textResponse('done'),
      ],
      buildRegistryAndExecutor(longOutput),
      {
        model: 'mock-model',
        allowedTools: ['echo'],
        maxTurns: 6,
        compressToolResults: true,
      },
    )

    // Turn 4: turns 1 and 2 should both be compressed, turn 3 should be intact
    const allToolResults = extractToolResultContents(messagesSentOnTurn(calls, 4))
    expect(allToolResults).toHaveLength(3)

    // First two are compressed (turns 1 & 2)
    expect(allToolResults[0]).toContain('compressed')
    expect(allToolResults[1]).toContain('compressed')

    // Last one (turn 3, most recent) preserved
    expect(allToolResults[2]).toBe(longOutput)
  })

  it('works together with contextStrategy', async () => {
    const longOutput = 'x'.repeat(600)
    const { calls, result } = await runScripted(
      twoEchoCallsThenDone(),
      buildRegistryAndExecutor(longOutput),
      {
        model: 'mock-model',
        allowedTools: ['echo'],
        maxTurns: 5,
        compressToolResults: true,
        contextStrategy: { type: 'sliding-window', maxTurns: 10 },
      },
    )

    // Should complete without error; both features coexist
    expect(result.output).toBe('done')

    // Turn 3 should have compressed turn 1 results
    const allToolResults = extractToolResultContents(messagesSentOnTurn(calls, 3))
    expect(allToolResults[0]).toContain('compressed')
  })
})