diff --git a/src/index.ts b/src/index.ts index 8841357..98ca209 100644 --- a/src/index.ts +++ b/src/index.ts @@ -108,6 +108,7 @@ export { export { createAdapter } from './llm/adapter.js' export type { SupportedProvider } from './llm/adapter.js' +export { VLLMAdapter } from './llm/vllm.js' // --------------------------------------------------------------------------- // Memory @@ -166,4 +167,7 @@ export type { // Memory MemoryEntry, MemoryStore, + + // vLLM + VLLMConfig, } from './types.js' diff --git a/src/llm/adapter.ts b/src/llm/adapter.ts index 5b032c5..bd1064d 100644 --- a/src/llm/adapter.ts +++ b/src/llm/adapter.ts @@ -11,6 +11,7 @@ * * const anthropic = createAdapter('anthropic') * const openai = createAdapter('openai', process.env.OPENAI_API_KEY) + * const vllm = createAdapter('vllm', { baseURL: 'http://localhost:8000/v1', model: 'llama3' }) * ``` */ @@ -28,45 +29,59 @@ export type { ToolUseBlock, ToolResultBlock, ImageBlock, + VLLMConfig, } from '../types.js' -import type { LLMAdapter } from '../types.js' +import type { LLMAdapter, VLLMConfig } from '../types.js' /** * The set of LLM providers supported out of the box. * Additional providers can be integrated by implementing {@link LLMAdapter} * directly and bypassing this factory. */ -export type SupportedProvider = 'anthropic' | 'openai' +export type SupportedProvider = 'anthropic' | 'openai' | 'vllm' /** * Instantiate the appropriate {@link LLMAdapter} for the given provider. * - * API keys fall back to the standard environment variables - * (`ANTHROPIC_API_KEY` / `OPENAI_API_KEY`) when not supplied explicitly. + * For `'anthropic'` and `'openai'`, the second argument is an optional API key + * string (falls back to `ANTHROPIC_API_KEY` / `OPENAI_API_KEY` env vars). + * + * For `'vllm'`, the second argument must be a {@link VLLMConfig} object. * * Adapters are imported lazily so that projects using only one provider * are not forced to install the SDK for the other. 
* * @param provider - Which LLM provider to target. - * @param apiKey - Optional API key override; falls back to env var. + * @param config - API key string (for anthropic/openai) or VLLMConfig (for vllm). * @throws {Error} When the provider string is not recognised. */ export async function createAdapter( provider: SupportedProvider, - apiKey?: string, + config?: string | VLLMConfig, ): Promise<LLMAdapter> { switch (provider) { case 'anthropic': { const { AnthropicAdapter } = await import('./anthropic.js') + const apiKey = typeof config === 'string' ? config : undefined return new AnthropicAdapter(apiKey) } case 'openai': { const { OpenAIAdapter } = await import('./openai.js') + const apiKey = typeof config === 'string' ? config : undefined return new OpenAIAdapter(apiKey) } + case 'vllm': { + const { VLLMAdapter } = await import('./vllm.js') + if (typeof config === 'object' && config !== null && 'baseURL' in config) { + return new VLLMAdapter(config as VLLMConfig) + } + throw new Error( + 'createAdapter("vllm") requires a VLLMConfig object as the second argument ' + + '(e.g. { baseURL: "http://localhost:8000/v1", model: "llama3" }).', + ) + } default: { - // The `never` cast here makes TypeScript enforce exhaustiveness. const _exhaustive: never = provider throw new Error(`Unsupported LLM provider: ${String(_exhaustive)}`) } diff --git a/src/llm/openai-compat.ts b/src/llm/openai-compat.ts new file mode 100644 index 0000000..ba3ef9e --- /dev/null +++ b/src/llm/openai-compat.ts @@ -0,0 +1,252 @@ +/** + * @fileoverview Shared OpenAI-format helpers for adapters that speak the + * OpenAI Chat Completions wire format (OpenAI, vLLM, etc.). + * + * Both {@link OpenAIAdapter} and {@link VLLMAdapter} import from this module + * to avoid duplicating conversion logic. 
+ * + * @module @vcg/agent-sdk + */ + +import type OpenAI from 'openai' +import type { + ChatCompletion, + ChatCompletionAssistantMessageParam, + ChatCompletionMessageParam, + ChatCompletionMessageToolCall, + ChatCompletionTool, + ChatCompletionToolMessageParam, + ChatCompletionUserMessageParam, +} from 'openai/resources/chat/completions/index.js' + +import type { + ContentBlock, + LLMMessage, + LLMResponse, + LLMToolDef, + TextBlock, + ToolUseBlock, +} from '../types.js' + +// --------------------------------------------------------------------------- +// Framework -> OpenAI format +// --------------------------------------------------------------------------- + +/** + * Convert a framework {@link LLMToolDef} to an OpenAI {@link ChatCompletionTool}. + */ +export function toOpenAITool(tool: LLMToolDef): ChatCompletionTool { + return { + type: 'function', + function: { + name: tool.name, + description: tool.description, + parameters: tool.inputSchema as Record<string, unknown>, + }, + } +} + +/** + * Determine whether a framework message contains any `tool_result` content + * blocks, which must be serialised as separate OpenAI `tool`-role messages. + */ +export function hasToolResults(msg: LLMMessage): boolean { + return msg.content.some((b) => b.type === 'tool_result') +} + +/** + * Convert framework messages into OpenAI {@link ChatCompletionMessageParam} entries. + * + * Expands `tool_result` blocks into separate `tool`-role messages as required + * by the OpenAI wire format. 
+ */ +export function toOpenAIMessages(messages: LLMMessage[]): ChatCompletionMessageParam[] { + const result: ChatCompletionMessageParam[] = [] + + for (const msg of messages) { + if (msg.role === 'assistant') { + result.push(toOpenAIAssistantMessage(msg)) + } else { + // user role + if (!hasToolResults(msg)) { + result.push(toOpenAIUserMessage(msg)) + } else { + const nonToolBlocks = msg.content.filter((b) => b.type !== 'tool_result') + if (nonToolBlocks.length > 0) { + result.push(toOpenAIUserMessage({ role: 'user', content: nonToolBlocks })) + } + + for (const block of msg.content) { + if (block.type === 'tool_result') { + const toolMsg: ChatCompletionToolMessageParam = { + role: 'tool', + tool_call_id: block.tool_use_id, + content: block.content, + } + result.push(toolMsg) + } + } + } + } + } + + return result +} + +/** + * Convert a `user`-role framework message into an OpenAI user message. + */ +export function toOpenAIUserMessage(msg: LLMMessage): ChatCompletionUserMessageParam { + if (msg.content.length === 1 && msg.content[0]?.type === 'text') { + return { role: 'user', content: msg.content[0].text } + } + + type ContentPart = OpenAI.Chat.ChatCompletionContentPartText | OpenAI.Chat.ChatCompletionContentPartImage + const parts: ContentPart[] = [] + + for (const block of msg.content) { + if (block.type === 'text') { + parts.push({ type: 'text', text: block.text }) + } else if (block.type === 'image') { + parts.push({ + type: 'image_url', + image_url: { + url: `data:${block.source.media_type};base64,${block.source.data}`, + }, + }) + } + } + + return { role: 'user', content: parts } +} + +/** + * Convert an `assistant`-role framework message into an OpenAI assistant message. 
+ */ +export function toOpenAIAssistantMessage(msg: LLMMessage): ChatCompletionAssistantMessageParam { + const toolCalls: ChatCompletionMessageToolCall[] = [] + const textParts: string[] = [] + + for (const block of msg.content) { + if (block.type === 'tool_use') { + toolCalls.push({ + id: block.id, + type: 'function', + function: { + name: block.name, + arguments: JSON.stringify(block.input), + }, + }) + } else if (block.type === 'text') { + textParts.push(block.text) + } + } + + const assistantMsg: ChatCompletionAssistantMessageParam = { + role: 'assistant', + content: textParts.length > 0 ? textParts.join('') : null, + } + + if (toolCalls.length > 0) { + assistantMsg.tool_calls = toolCalls + } + + return assistantMsg +} + +// --------------------------------------------------------------------------- +// OpenAI format -> Framework +// --------------------------------------------------------------------------- + +/** + * Convert an OpenAI {@link ChatCompletion} into a framework {@link LLMResponse}. + */ +export function fromOpenAICompletion(completion: ChatCompletion): LLMResponse { + const choice = completion.choices[0] + if (choice === undefined) { + throw new Error('OpenAI returned a completion with no choices') + } + + const content: ContentBlock[] = [] + const message = choice.message + + if (message.content !== null && message.content !== undefined) { + const textBlock: TextBlock = { type: 'text', text: message.content } + content.push(textBlock) + } + + for (const toolCall of message.tool_calls ?? []) { + let parsedInput: Record<string, unknown> = {} + try { + const parsed: unknown = JSON.parse(toolCall.function.arguments) + if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) { + parsedInput = parsed as Record<string, unknown> + } + } catch { + // Malformed arguments from the model — surface as empty object. 
+ } + + const toolUseBlock: ToolUseBlock = { + type: 'tool_use', + id: toolCall.id, + name: toolCall.function.name, + input: parsedInput, + } + content.push(toolUseBlock) + } + + const stopReason = normalizeFinishReason(choice.finish_reason ?? 'stop') + + return { + id: completion.id, + content, + model: completion.model, + stop_reason: stopReason, + usage: { + input_tokens: completion.usage?.prompt_tokens ?? 0, + output_tokens: completion.usage?.completion_tokens ?? 0, + }, + } +} + +/** + * Normalize an OpenAI `finish_reason` string to the framework's canonical + * stop-reason vocabulary. + * + * - `'stop'` -> `'end_turn'` + * - `'tool_calls'` -> `'tool_use'` + * - `'length'` -> `'max_tokens'` + * - `'content_filter'` -> `'content_filter'` + * - anything else -> passed through unchanged + */ +export function normalizeFinishReason(reason: string): string { + switch (reason) { + case 'stop': return 'end_turn' + case 'tool_calls': return 'tool_use' + case 'length': return 'max_tokens' + case 'content_filter': return 'content_filter' + default: return reason + } +} + +// --------------------------------------------------------------------------- +// Message list assembly +// --------------------------------------------------------------------------- + +/** + * Prepend a system message when `systemPrompt` is provided, then append the + * converted conversation messages. 
+ */ +export function buildOpenAIMessageList( + messages: LLMMessage[], + systemPrompt: string | undefined, +): ChatCompletionMessageParam[] { + const result: ChatCompletionMessageParam[] = [] + + if (systemPrompt !== undefined && systemPrompt.length > 0) { + result.push({ role: 'system', content: systemPrompt }) + } + + result.push(...toOpenAIMessages(messages)) + return result +} diff --git a/src/llm/openai.ts b/src/llm/openai.ts index a53ab24..568e539 100644 --- a/src/llm/openai.ts +++ b/src/llm/openai.ts @@ -4,10 +4,10 @@ * Converts between the framework's internal {@link ContentBlock} types and the * OpenAI Chat Completions wire format. Key mapping decisions: * - * - Framework `tool_use` blocks in assistant messages → OpenAI `tool_calls` - * - Framework `tool_result` blocks in user messages → OpenAI `tool` role messages - * - Framework `image` blocks in user messages → OpenAI image content parts - * - System prompt in {@link LLMChatOptions} → prepended `system` message + * - Framework `tool_use` blocks in assistant messages -> OpenAI `tool_calls` + * - Framework `tool_result` blocks in user messages -> OpenAI `tool` role messages + * - Framework `image` blocks in user messages -> OpenAI image content parts + * - System prompt in {@link LLMChatOptions} -> prepended `system` message * * Because OpenAI and Anthropic use fundamentally different role-based structures * for tool calling (Anthropic embeds tool results in user-role content arrays; @@ -31,16 +31,7 @@ */ import OpenAI from 'openai' -import type { - ChatCompletion, - ChatCompletionAssistantMessageParam, - ChatCompletionChunk, - ChatCompletionMessageParam, - ChatCompletionMessageToolCall, - ChatCompletionTool, - ChatCompletionToolMessageParam, - ChatCompletionUserMessageParam, -} from 'openai/resources/chat/completions/index.js' +import type { ChatCompletionChunk } from 'openai/resources/chat/completions/index.js' import type { ContentBlock, @@ -55,231 +46,12 @@ import type { ToolUseBlock, } from 
'../types.js' -// --------------------------------------------------------------------------- -// Internal helpers — framework → OpenAI -// --------------------------------------------------------------------------- - -/** - * Convert a framework {@link LLMToolDef} to an OpenAI {@link ChatCompletionTool}. - * - * OpenAI wraps the function definition inside a `function` key and a `type` - * discriminant. The `inputSchema` is already a JSON Schema object. - */ -function toOpenAITool(tool: LLMToolDef): ChatCompletionTool { - return { - type: 'function', - function: { - name: tool.name, - description: tool.description, - parameters: tool.inputSchema as Record<string, unknown>, - }, - } -} - -/** - * Determine whether a framework message contains any `tool_result` content - * blocks, which must be serialised as separate OpenAI `tool`-role messages. - */ -function hasToolResults(msg: LLMMessage): boolean { - return msg.content.some((b) => b.type === 'tool_result') -} - -/** - * Convert a single framework {@link LLMMessage} into one or more OpenAI - * {@link ChatCompletionMessageParam} entries. - * - * The expansion is necessary because OpenAI represents tool results as - * top-level messages with role `tool`, whereas in our model they are content - * blocks inside a `user` message. 
- * - * Expansion rules: - * - A `user` message containing only text/image blocks → single user message - * - A `user` message containing `tool_result` blocks → one `tool` message per - * tool_result block; any remaining text/image blocks are folded into an - * additional user message prepended to the group - * - An `assistant` message → single assistant message with optional tool_calls - */ -function toOpenAIMessages(messages: LLMMessage[]): ChatCompletionMessageParam[] { - const result: ChatCompletionMessageParam[] = [] - - for (const msg of messages) { - if (msg.role === 'assistant') { - result.push(toOpenAIAssistantMessage(msg)) - } else { - // user role - if (!hasToolResults(msg)) { - result.push(toOpenAIUserMessage(msg)) - } else { - // Split: text/image blocks become a user message (if any exist), then - // each tool_result block becomes an independent tool message. - const nonToolBlocks = msg.content.filter((b) => b.type !== 'tool_result') - if (nonToolBlocks.length > 0) { - result.push(toOpenAIUserMessage({ role: 'user', content: nonToolBlocks })) - } - - for (const block of msg.content) { - if (block.type === 'tool_result') { - const toolMsg: ChatCompletionToolMessageParam = { - role: 'tool', - tool_call_id: block.tool_use_id, - content: block.content, - } - result.push(toolMsg) - } - } - } - } - } - - return result -} - -/** - * Convert a `user`-role framework message into an OpenAI user message. - * Image blocks are converted to the OpenAI image_url content part format. - */ -function toOpenAIUserMessage(msg: LLMMessage): ChatCompletionUserMessageParam { - // If the entire content is a single text block, use the compact string form - // to keep the request payload smaller. 
- if (msg.content.length === 1 && msg.content[0]?.type === 'text') { - return { role: 'user', content: msg.content[0].text } - } - - type ContentPart = OpenAI.Chat.ChatCompletionContentPartText | OpenAI.Chat.ChatCompletionContentPartImage - const parts: ContentPart[] = [] - - for (const block of msg.content) { - if (block.type === 'text') { - parts.push({ type: 'text', text: block.text }) - } else if (block.type === 'image') { - parts.push({ - type: 'image_url', - image_url: { - url: `data:${block.source.media_type};base64,${block.source.data}`, - }, - }) - } - // tool_result blocks are handled by the caller (toOpenAIMessages); skip here. - } - - return { role: 'user', content: parts } -} - -/** - * Convert an `assistant`-role framework message into an OpenAI assistant message. - * - * Any `tool_use` blocks become `tool_calls`; `text` blocks become the message content. - */ -function toOpenAIAssistantMessage(msg: LLMMessage): ChatCompletionAssistantMessageParam { - const toolCalls: ChatCompletionMessageToolCall[] = [] - const textParts: string[] = [] - - for (const block of msg.content) { - if (block.type === 'tool_use') { - toolCalls.push({ - id: block.id, - type: 'function', - function: { - name: block.name, - arguments: JSON.stringify(block.input), - }, - }) - } else if (block.type === 'text') { - textParts.push(block.text) - } - } - - const assistantMsg: ChatCompletionAssistantMessageParam = { - role: 'assistant', - content: textParts.length > 0 ? textParts.join('') : null, - } - - if (toolCalls.length > 0) { - assistantMsg.tool_calls = toolCalls - } - - return assistantMsg -} - -// --------------------------------------------------------------------------- -// Internal helpers — OpenAI → framework -// --------------------------------------------------------------------------- - -/** - * Convert an OpenAI {@link ChatCompletion} into a framework {@link LLMResponse}. 
- * - * We take only the first choice (index 0), consistent with how the framework - * is designed for single-output agents. - */ -function fromOpenAICompletion(completion: ChatCompletion): LLMResponse { - const choice = completion.choices[0] - if (choice === undefined) { - throw new Error('OpenAI returned a completion with no choices') - } - - const content: ContentBlock[] = [] - const message = choice.message - - if (message.content !== null && message.content !== undefined) { - const textBlock: TextBlock = { type: 'text', text: message.content } - content.push(textBlock) - } - - for (const toolCall of message.tool_calls ?? []) { - let parsedInput: Record<string, unknown> = {} - try { - const parsed: unknown = JSON.parse(toolCall.function.arguments) - if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) { - parsedInput = parsed as Record<string, unknown> - } - } catch { - // Malformed arguments from the model — surface as empty object. - } - - const toolUseBlock: ToolUseBlock = { - type: 'tool_use', - id: toolCall.id, - name: toolCall.function.name, - input: parsedInput, - } - content.push(toolUseBlock) - } - - const stopReason = normalizeFinishReason(choice.finish_reason ?? 'stop') - - return { - id: completion.id, - content, - model: completion.model, - stop_reason: stopReason, - usage: { - input_tokens: completion.usage?.prompt_tokens ?? 0, - output_tokens: completion.usage?.completion_tokens ?? 0, - }, - } -} - -/** - * Normalize an OpenAI `finish_reason` string to the framework's canonical - * stop-reason vocabulary so consumers never need to branch on provider-specific - * strings. 
- * - * Mapping: - * - `'stop'` → `'end_turn'` - * - `'tool_calls'` → `'tool_use'` - * - `'length'` → `'max_tokens'` - * - `'content_filter'` → `'content_filter'` - * - anything else → passed through unchanged - */ -function normalizeFinishReason(reason: string): string { - switch (reason) { - case 'stop': return 'end_turn' - case 'tool_calls': return 'tool_use' - case 'length': return 'max_tokens' - case 'content_filter': return 'content_filter' - default: return reason - } -} +import { + toOpenAITool, + fromOpenAICompletion, + normalizeFinishReason, + buildOpenAIMessageList, +} from './openai-compat.js' // --------------------------------------------------------------------------- // Adapter implementation @@ -308,9 +80,6 @@ export class OpenAIAdapter implements LLMAdapter { /** * Send a synchronous (non-streaming) chat request and return the complete * {@link LLMResponse}. - * - * Throws an `OpenAI.APIError` on non-2xx responses. Callers should catch and - * handle these (e.g. rate limits, context length exceeded). */ async chat(messages: LLMMessage[], options: LLMChatOptions): Promise { const openAIMessages = buildOpenAIMessageList(messages, options.systemPrompt) @@ -338,12 +107,6 @@ export class OpenAIAdapter implements LLMAdapter { /** * Send a streaming chat request and yield {@link StreamEvent}s incrementally. - * - * Sequence guarantees match {@link AnthropicAdapter.stream}: - * - Zero or more `text` events - * - Zero or more `tool_use` events (emitted once per tool call, after - * arguments have been fully assembled) - * - Exactly one terminal event: `done` or `error` */ async *stream( messages: LLMMessage[], @@ -351,7 +114,6 @@ export class OpenAIAdapter implements LLMAdapter { ): AsyncIterable { const openAIMessages = buildOpenAIMessageList(messages, options.systemPrompt) - // We request usage in the final chunk so we can include it in the `done` event. 
const streamResponse = await this.#client.chat.completions.create( { model: options.model, @@ -367,20 +129,17 @@ export class OpenAIAdapter implements LLMAdapter { }, ) - // Accumulate state across chunks. let completionId = '' let completionModel = '' let finalFinishReason: string = 'stop' let inputTokens = 0 let outputTokens = 0 - // tool_calls are streamed piecemeal; key = tool call index const toolCallBuffers = new Map< number, { id: string; name: string; argsJson: string } >() - // Full text accumulator for the `done` response. let fullText = '' try { @@ -388,7 +147,6 @@ export class OpenAIAdapter implements LLMAdapter { completionId = chunk.id completionModel = chunk.model - // Usage is only populated in the final chunk when stream_options.include_usage is set. if (chunk.usage !== null && chunk.usage !== undefined) { inputTokens = chunk.usage.prompt_tokens outputTokens = chunk.usage.completion_tokens @@ -399,14 +157,12 @@ export class OpenAIAdapter implements LLMAdapter { const delta = choice.delta - // --- text delta --- if (delta.content !== null && delta.content !== undefined) { fullText += delta.content const textEvent: StreamEvent = { type: 'text', data: delta.content } yield textEvent } - // --- tool call delta --- for (const toolCallDelta of delta.tool_calls ?? []) { const idx = toolCallDelta.index @@ -419,7 +175,6 @@ export class OpenAIAdapter implements LLMAdapter { } const buf = toolCallBuffers.get(idx) - // buf is guaranteed to exist: we just set it above. if (buf !== undefined) { if (toolCallDelta.id) buf.id = toolCallDelta.id if (toolCallDelta.function?.name) buf.name = toolCallDelta.function.name @@ -434,7 +189,6 @@ export class OpenAIAdapter implements LLMAdapter { } } - // Emit accumulated tool_use events after the stream ends. 
const finalToolUseBlocks: ToolUseBlock[] = [] for (const buf of toolCallBuffers.values()) { let parsedInput: Record<string, unknown> = {} @@ -458,7 +212,6 @@ export class OpenAIAdapter implements LLMAdapter { yield toolUseEvent } - // Build the complete content array for the done response. const doneContent: ContentBlock[] = [] if (fullText.length > 0) { const textBlock: TextBlock = { type: 'text', text: fullText } doneContent.push(textBlock) } @@ -484,31 +237,6 @@ export class OpenAIAdapter implements LLMAdapter { } } -// --------------------------------------------------------------------------- -// Private utility -// --------------------------------------------------------------------------- - -/** - * Prepend a system message when `systemPrompt` is provided, then append the - * converted conversation messages. - * - * OpenAI represents system instructions as a message with `role: 'system'` - * at the top of the array, not as a separate API parameter. - */ -function buildOpenAIMessageList( - messages: LLMMessage[], - systemPrompt: string | undefined, -): ChatCompletionMessageParam[] { - const result: ChatCompletionMessageParam[] = [] - - if (systemPrompt !== undefined && systemPrompt.length > 0) { - result.push({ role: 'system', content: systemPrompt }) - } - - result.push(...toOpenAIMessages(messages)) - return result -} - -// Re-export types that consumers of this module commonly need alongside the adapter. export type { ContentBlock, diff --git a/src/llm/vllm.ts b/src/llm/vllm.ts new file mode 100644 index 0000000..7f848ec --- /dev/null +++ b/src/llm/vllm.ts @@ -0,0 +1,248 @@ +/** + * @fileoverview vLLM adapter implementing {@link LLMAdapter}. + * + * vLLM exposes an OpenAI-compatible API, so this adapter reuses all shared + * helpers from `openai-compat.ts` and simply points the `openai` client at + * a custom `baseURL`. 
+ * + * @module @vcg/agent-sdk + */ + +import OpenAI from 'openai' +import type { ChatCompletionChunk } from 'openai/resources/chat/completions/index.js' + +import type { + ContentBlock, + LLMAdapter, + LLMChatOptions, + LLMMessage, + LLMResponse, + LLMStreamOptions, + StreamEvent, + TextBlock, + ToolUseBlock, + VLLMConfig, +} from '../types.js' + +import { + toOpenAITool, + fromOpenAICompletion, + normalizeFinishReason, + buildOpenAIMessageList, +} from './openai-compat.js' + +// --------------------------------------------------------------------------- +// VLLMAdapter +// --------------------------------------------------------------------------- + +/** + * LLM adapter for vLLM inference servers. + * + * vLLM is OpenAI-compatible, so this adapter reuses the same message + * conversion and response parsing logic as the OpenAI adapter. The key + * difference is the configurable `baseURL` pointing at a self-hosted + * vLLM instance. + * + * @example + * ```ts + * const adapter = new VLLMAdapter({ + * baseURL: 'http://localhost:8000/v1', + * model: 'meta-llama/Llama-3-70b-chat-hf', + * }) + * const response = await adapter.chat(messages, { model: 'meta-llama/Llama-3-70b-chat-hf' }) + * ``` + */ +export class VLLMAdapter implements LLMAdapter { + readonly name = 'vllm' + + readonly #client: OpenAI + readonly #config: VLLMConfig + + constructor(config: VLLMConfig) { + this.#config = config + this.#client = new OpenAI({ + baseURL: config.baseURL, + apiKey: config.apiKey ?? 'dummy', + timeout: config.timeout, + maxRetries: config.maxRetries, + }) + } + + // ------------------------------------------------------------------------- + // healthCheck() + // ------------------------------------------------------------------------- + + /** + * Check whether the vLLM server is reachable by hitting `GET {baseURL}/health`. + * + * Returns `true` if the server responds with a 2xx status, `false` otherwise. 
*/ + async healthCheck(): Promise<boolean> { + try { + // Strip trailing /v1 if present to hit the root health endpoint + const base = this.#config.baseURL.replace(/\/v1\/?$/, '') + const response = await fetch(`${base}/health`, { + signal: AbortSignal.timeout(this.#config.timeout ?? 5000), + }) + return response.ok + } catch { + return false + } + } + + // ------------------------------------------------------------------------- + // chat() + // ------------------------------------------------------------------------- + + async chat(messages: LLMMessage[], options: LLMChatOptions): Promise<LLMResponse> { + const openAIMessages = buildOpenAIMessageList(messages, options.systemPrompt) + + const completion = await this.#client.chat.completions.create( + { + model: options.model ?? this.#config.model, + messages: openAIMessages, + max_tokens: options.maxTokens, + temperature: options.temperature, + tools: options.tools ? options.tools.map(toOpenAITool) : undefined, + stream: false, + }, + { + signal: options.abortSignal, + }, + ) + + return fromOpenAICompletion(completion) + } + + // ------------------------------------------------------------------------- + // stream() + // ------------------------------------------------------------------------- + + async *stream( + messages: LLMMessage[], + options: LLMStreamOptions, + ): AsyncIterable<StreamEvent> { + const openAIMessages = buildOpenAIMessageList(messages, options.systemPrompt) + + const streamResponse = await this.#client.chat.completions.create( + { + model: options.model ?? this.#config.model, + messages: openAIMessages, + max_tokens: options.maxTokens, + temperature: options.temperature, + tools: options.tools ? 
options.tools.map(toOpenAITool) : undefined, + stream: true, + stream_options: { include_usage: true }, + }, + { + signal: options.abortSignal, + }, + ) + + let completionId = '' + let completionModel = '' + let finalFinishReason: string = 'stop' + let inputTokens = 0 + let outputTokens = 0 + + const toolCallBuffers = new Map< + number, + { id: string; name: string; argsJson: string } + >() + + let fullText = '' + + try { + for await (const chunk of streamResponse) { + completionId = chunk.id + completionModel = chunk.model + + if (chunk.usage !== null && chunk.usage !== undefined) { + inputTokens = chunk.usage.prompt_tokens + outputTokens = chunk.usage.completion_tokens + } + + const choice: ChatCompletionChunk.Choice | undefined = chunk.choices[0] + if (choice === undefined) continue + + const delta = choice.delta + + if (delta.content !== null && delta.content !== undefined) { + fullText += delta.content + const textEvent: StreamEvent = { type: 'text', data: delta.content } + yield textEvent + } + + for (const toolCallDelta of delta.tool_calls ?? []) { + const idx = toolCallDelta.index + + if (!toolCallBuffers.has(idx)) { + toolCallBuffers.set(idx, { + id: toolCallDelta.id ?? '', + name: toolCallDelta.function?.name ?? 
'', + argsJson: '', + }) + } + + const buf = toolCallBuffers.get(idx) + if (buf !== undefined) { + if (toolCallDelta.id) buf.id = toolCallDelta.id + if (toolCallDelta.function?.name) buf.name = toolCallDelta.function.name + if (toolCallDelta.function?.arguments) { + buf.argsJson += toolCallDelta.function.arguments + } + } + } + + if (choice.finish_reason !== null && choice.finish_reason !== undefined) { + finalFinishReason = choice.finish_reason + } + } + + const finalToolUseBlocks: ToolUseBlock[] = [] + for (const buf of toolCallBuffers.values()) { + let parsedInput: Record<string, unknown> = {} + try { + const parsed: unknown = JSON.parse(buf.argsJson) + if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) { + parsedInput = parsed as Record<string, unknown> + } + } catch { + // Malformed JSON — surface as empty object. + } + + const toolUseBlock: ToolUseBlock = { + type: 'tool_use', + id: buf.id, + name: buf.name, + input: parsedInput, + } + finalToolUseBlocks.push(toolUseBlock) + const toolUseEvent: StreamEvent = { type: 'tool_use', data: toolUseBlock } + yield toolUseEvent + } + + const doneContent: ContentBlock[] = [] + if (fullText.length > 0) { + const textBlock: TextBlock = { type: 'text', text: fullText } + doneContent.push(textBlock) + } + doneContent.push(...finalToolUseBlocks) + + const finalResponse: LLMResponse = { + id: completionId, + content: doneContent, + model: completionModel, + stop_reason: normalizeFinishReason(finalFinishReason), + usage: { input_tokens: inputTokens, output_tokens: outputTokens }, + } + + const doneEvent: StreamEvent = { type: 'done', data: finalResponse } + yield doneEvent + } catch (err) { + const error = err instanceof Error ? 
err : new Error(String(err)) + const errorEvent: StreamEvent = { type: 'error', data: error } + yield errorEvent + } + } +} diff --git a/src/types.ts b/src/types.ts index f980c68..8b150b6 100644 --- a/src/types.ts +++ b/src/types.ts @@ -186,7 +186,7 @@ export interface ToolDefinition> { export interface AgentConfig { readonly name: string readonly model: string - readonly provider?: 'anthropic' | 'openai' + readonly provider?: 'anthropic' | 'openai' | 'vllm' readonly systemPrompt?: string /** Names of tools (from the tool registry) available to this agent. */ readonly tools?: readonly string[] @@ -285,10 +285,28 @@ export interface OrchestratorEvent { export interface OrchestratorConfig { readonly maxConcurrency?: number readonly defaultModel?: string - readonly defaultProvider?: 'anthropic' | 'openai' + readonly defaultProvider?: 'anthropic' | 'openai' | 'vllm' onProgress?: (event: OrchestratorEvent) => void } +// --------------------------------------------------------------------------- +// vLLM configuration +// --------------------------------------------------------------------------- + +/** Configuration for connecting to a vLLM inference server. */ +export interface VLLMConfig { + /** Base URL of the vLLM server (e.g. `'http://localhost:8000/v1'`). */ + readonly baseURL: string + /** Model name to use for requests (e.g. `'meta-llama/Llama-3-70b-chat-hf'`). */ + readonly model: string + /** Optional API key for authenticated vLLM deployments. */ + readonly apiKey?: string + /** Request timeout in milliseconds. */ + readonly timeout?: number + /** Maximum number of retries on transient errors. */ + readonly maxRetries?: number +} + // --------------------------------------------------------------------------- // Memory // ---------------------------------------------------------------------------