#!/usr/bin/env npx tsx
/**
 * Model Quality Benchmark for @kthulu Phase 0A
 *
 * Tests each model with 3 standardized prompts via model-boss /v1/chat/completions.
 * Measures: tool call correctness, sequencing, code quality, speed.
 */

/** Base URL of the model-boss server that every request in this script targets. */
const MODEL_BOSS_URL = "http://localhost:8210";

// CLI script — stdout is the interface
const log = {
  info: (...args: unknown[]) => process.stdout.write(args.join(" ") + "\n"),
  error: (...args: unknown[]) => process.stderr.write(args.join(" ") + "\n"),
};

/** A single tool invocation as it appears on an assistant message. */
interface ToolCall {
  id: string;
  type: "function";
  function: {
    name: string;
    /** JSON-encoded argument object; parsed lazily by `getToolCallArg`. */
    arguments: string;
  };
}

/** One chat message as returned inside a completion choice. */
interface ChatMessage {
  role: string;
  content: string | null;
  tool_calls?: ToolCall[];
}

/** The subset of the /v1/chat/completions response body that this script reads. */
interface ChatResponse {
  id: string;
  model: string;
  choices: Array<{
    message: ChatMessage;
    finish_reason: string;
  }>;
  usage: {
    prompt_tokens: number;
    completion_tokens: number;
    total_tokens: number;
  };
}

/** Tool definitions sent with every test request. */
const TOOLS = [
  {
    type: "function" as const,
    function: {
      name: "write_file",
      description: "Write content to a file, creating it if it doesn't exist.",
      parameters: {
        type: "object",
        properties: {
          path: { type: "string", description: "File path to write to" },
          content: { type: "string", description: "Content to write" },
        },
        required: ["path", "content"],
      },
    },
  },
  {
    type: "function" as const,
    function: {
      name: "read_file",
      description: "Read the contents of a file.",
      parameters: {
        type: "object",
        properties: {
          path: { type: "string", description: "File path to read" },
        },
        required: ["path"],
      },
    },
  },
  {
    type: "function" as const,
    function: {
      name: "edit_file",
      description: "Edit a file by replacing an old string with a new string.",
      parameters: {
        type: "object",
        properties: {
          path: { type: "string", description: "File path to edit" },
          old_string: { type: "string", description: "Text to find and replace" },
          new_string: { type: "string", description: "Replacement text" },
        },
        required: ["path", "old_string", "new_string"],
      },
    },
  },
  {
    type: "function" as const,
    function: {
      name: "bash",
      description: "Execute a bash command and return stdout/stderr.",
      parameters: {
        type: "object",
        properties: {
          command: { type: "string", description: "Command to execute" },
        },
        required: ["command"],
      },
    },
  },
];

/** A benchmark prompt together with the function that grades the model's reply. */
interface TestCase {
  name: string;
  complexity: "low" | "medium" | "high";
  messages: Array<{ role: string; content: string }>;
  tools: typeof TOOLS;
  /** Sent as `max_tokens` on the request. */
  maxTokens: number;
  evaluate: (response: ChatResponse) => TestResult;
}

/** Outcome of grading one response. */
interface TestResult {
  passed: boolean;
  /** Human-readable checklist, one check per line. */
  details: string;
  /** Names of the tools the model called, in response order. */
  toolCalls: string[];
  codeQuality?: string;
}

/** Aggregated outcome of all three tests for one model. */
interface BenchmarkResult {
  model: string;
  testA: TestResult | { passed: false; details: string };
  testB: TestResult | { passed: false; details: string };
  testC: TestResult | { passed: false; details: string };
  totalTokens: number;
  totalTimeMs: number;
  avgTokPerSec: number;
}

const SYSTEM_PROMPT = `You are a coding assistant. You have access to tools for file operations and shell commands. When asked to create or modify code, use the appropriate tools. Be direct — call tools immediately without explaining what you're about to do.`;

/**
 * Returns the tool calls on the first choice of a response.
 *
 * @returns The `tool_calls` array, or an empty array when any link in
 *   `choices[0].message.tool_calls` is missing.
 */
function extractToolCalls(response: ChatResponse): ToolCall[] {
  return response.choices?.[0]?.message?.tool_calls ?? [];
}

/** Reports whether the first choice contains at least one call to `toolName`. */
function hasToolCall(response: ChatResponse, toolName: string): boolean {
  return extractToolCalls(response).some((tc) => tc.function.name === toolName);
}

/**
 * Reads one string argument from the first call to `toolName`.
 *
 * @param response - Completion response to inspect.
 * @param toolName - Tool whose first call is examined.
 * @param argName - Key to look up in the call's JSON-encoded arguments.
 * @returns The argument value when it is a string; `undefined` when the tool
 *   was not called, the arguments are not valid JSON, the parsed value is not
 *   an object, or the argument is absent or not a string.
 */
function getToolCallArg(
  response: ChatResponse,
  toolName: string,
  argName: string,
): string | undefined {
  const tc = extractToolCalls(response).find(
    (t) => t.function.name === toolName,
  );
  if (!tc) return undefined;

  let parsed: unknown;
  try {
    parsed = JSON.parse(tc.function.arguments);
  } catch {
    // Malformed JSON from the model is treated the same as a missing argument.
    return undefined;
  }
  if (typeof parsed !== "object" || parsed === null) return undefined;

  // Callers invoke string methods on the result, so a number/object/array
  // argument must not be passed through under a `string` type.
  const value = (parsed as Record<string, unknown>)[argName];
  return typeof value === "string" ? value : undefined;
}

/**
 * Cheap structural heuristic for "looks like TypeScript" — it does not parse
 * or type-check anything.
 *
 * @returns `true` when the code contains `export` followed by whitespace, or
 *   when it contains a function-like token (`function `, `const `, `=> `) and
 *   has equal counts of `{` and `}`.
 */
function isValidTypeScript(code: string): boolean {
  const hasExport = /export\s/.test(code);
  const hasFunction = /function\s|const\s|=>\s/.test(code);
  const balanced =
    (code.match(/{/g) || []).length === (code.match(/}/g) || []).length;
  return hasExport || (hasFunction && balanced);
}

/** The three standardized prompts, in the order they are run and reported (A, B, C). */
const TEST_CASES: TestCase[] = [
  {
    name: "Test A — Simple file write",
    complexity: "low",
    messages: [
      { role: "system", content: SYSTEM_PROMPT },
      {
        role: "user",
        content:
          "Create a file called calculator.ts with add and subtract functions that take two numbers and return a number. Export both functions.",
      },
    ],
    tools: TOOLS,
    maxTokens: 2048,
    evaluate(response: ChatResponse): TestResult {
      const toolCalls = extractToolCalls(response).map(
        (tc) => tc.function.name,
      );
      const hasWrite = hasToolCall(response, "write_file");
      const content = getToolCallArg(response, "write_file", "content") ?? "";
      const path = getToolCallArg(response, "write_file", "path") ?? "";
      const validTS = isValidTypeScript(content);
      const hasCalc = path.includes("calculator");

      return {
        // The add/subtract checks below are reported but do not gate the verdict.
        passed: hasWrite && validTS && hasCalc,
        details: [
          hasWrite ? "✅ write_file called" : "❌ No write_file tool call",
          hasCalc ? "✅ Correct filename" : `❌ Wrong path: ${path}`,
          validTS ? "✅ Valid TypeScript structure" : "❌ Invalid TypeScript",
          content.includes("add") ? "✅ Has add function" : "❌ Missing add",
          content.includes("subtract")
            ? "✅ Has subtract function"
            : "❌ Missing subtract",
        ].join("\n "),
        toolCalls,
        codeQuality: content.length > 0 ? `${content.length} chars` : "empty",
      };
    },
  },
  {
    name: "Test B — Multi-step sequencing",
    complexity: "medium",
    messages: [
      { role: "system", content: SYSTEM_PROMPT },
      {
        role: "user",
        content:
          "Read the file src/index.ts, then add error handling to it by wrapping the main logic in a try-catch block.",
      },
    ],
    tools: TOOLS,
    maxTokens: 2048,
    evaluate(response: ChatResponse): TestResult {
      const toolCalls = extractToolCalls(response).map(
        (tc) => tc.function.name,
      );
      const firstCall = toolCalls[0];
      const hasRead = toolCalls.includes("read_file");
      const readsFirst = firstCall === "read_file";

      return {
        passed: hasRead && readsFirst,
        details: [
          hasRead ? "✅ read_file called" : "❌ No read_file call",
          readsFirst
            ? "✅ Correct sequencing (read first)"
            : `❌ Wrong sequence: ${firstCall ?? "none"} called first`,
          `Tool call order: ${toolCalls.join(" → ") || "none"}`,
        ].join("\n "),
        toolCalls,
      };
    },
  },
  {
    name: "Test C — Complex multi-file (tic-tac-toe)",
    complexity: "high",
    messages: [
      { role: "system", content: SYSTEM_PROMPT },
      {
        role: "user",
        content: `Create a tic-tac-toe game with minimax AI in TypeScript. Create src/game.ts with: 1. A Board type (3x3 array of 'X' | 'O' | null) 2. A checkWinner function 3. A minimax function that returns the optimal move index 4. Export all of them.`,
      },
    ],
    tools: TOOLS,
    maxTokens: 4096,
    evaluate(response: ChatResponse): TestResult {
      const toolCalls = extractToolCalls(response).map(
        (tc) => tc.function.name,
      );
      const hasWrite = hasToolCall(response, "write_file");
      const content = getToolCallArg(response, "write_file", "content") ?? "";
      const hasMinimax = content.toLowerCase().includes("minimax");
      const hasCheckWinner =
        content.includes("checkWinner") || content.includes("check_winner");
      const hasBoard = content.includes("Board");

      return {
        // Only the write call and code size gate the verdict; the
        // minimax/checkWinner/Board checks are informational.
        passed: hasWrite && content.length > 100,
        details: [
          hasWrite ? "✅ write_file called" : "❌ No write_file tool call",
          content.length > 100
            ? `✅ Substantial code (${content.length} chars)`
            : `❌ Insufficient code (${content.length} chars)`,
          hasMinimax ? "✅ Has minimax" : "❌ Missing minimax",
          hasCheckWinner ? "✅ Has checkWinner" : "❌ Missing checkWinner",
          hasBoard ? "✅ Has Board type" : "❌ Missing Board type",
          `Tool calls: ${toolCalls.join(", ") || "none"}`,
        ].join("\n "),
        toolCalls,
        codeQuality: content.length > 0 ? `${content.length} chars` : "empty",
      };
    },
  },
];

/**
 * Sends one test prompt to a model and grades the reply.
 *
 * @param modelId - Model identifier placed in the request body.
 * @param test - Test case supplying messages, tools, token limit and grader.
 * @returns The graded result, the reported `completion_tokens` (0 when absent
 *   or on HTTP failure), and the time until `fetch` resolved.
 * @throws Whatever `fetch`, `response.json()` or `test.evaluate` throws; the
 *   caller (`benchmarkModel`) converts that into a failed result.
 */
async function runTest(
  modelId: string,
  test: TestCase,
): Promise<{ result: TestResult; tokens: number; timeMs: number }> {
  const start = Date.now();

  const body = {
    model: modelId,
    messages: test.messages,
    tools: test.tools,
    max_tokens: test.maxTokens,
    temperature: 0,
  };

  const response = await fetch(`${MODEL_BOSS_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(body),
  });

  const timeMs = Date.now() - start;

  if (!response.ok) {
    const error = await response.text();
    return {
      result: {
        passed: false,
        details: `❌ HTTP ${response.status}: ${error}`,
        toolCalls: [],
      },
      tokens: 0,
      timeMs,
    };
  }

  // The body is not schema-validated; every read below goes through optional
  // chaining so a response with missing fields degrades to a failed test.
  const data = (await response.json()) as ChatResponse;
  const tokens = data.usage?.completion_tokens ?? 0;
  const result = test.evaluate(data);

  if (!result.passed) {
    // Dump what the model actually produced to make failures diagnosable.
    const msg = data.choices?.[0]?.message;
    log.info(` [DEBUG] Content: ${msg?.content?.slice(0, 200) ?? "null"}`);
    log.info(
      ` [DEBUG] Tool calls: ${JSON.stringify(msg?.tool_calls?.map((tc) => tc.function.name) ?? [])}`,
    );
    log.info(` [DEBUG] Finish reason: ${data.choices?.[0]?.finish_reason}`);
  }

  return { result, tokens, timeMs };
}

/**
 * Issues a tiny completion request so the model is loaded before timing starts.
 *
 * @param modelId - Model identifier to warm up.
 * @returns `true` when the request succeeded; `false` on a non-2xx response,
 *   on a network error, or when the 10-minute timeout aborts the request.
 *   Never rejects, so one unloadable model cannot abort the whole run.
 */
async function prewarmModel(modelId: string): Promise<boolean> {
  log.info(` Pre-warming ${modelId}...`);
  const start = Date.now();
  const elapsedSeconds = (): string =>
    ((Date.now() - start) / 1000).toFixed(1);

  try {
    // Send a trivial request to trigger model loading in the pool
    const response = await fetch(`${MODEL_BOSS_URL}/v1/chat/completions`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        model: modelId,
        messages: [{ role: "user", content: "Say hello." }],
        max_tokens: 16,
        temperature: 0,
      }),
      signal: AbortSignal.timeout(600_000), // 10 min for large model loading
    });

    const elapsed = elapsedSeconds();

    if (!response.ok) {
      const error = await response.text();
      log.error(` ❌ Pre-warm failed after ${elapsed}s: ${error}`);
      return false;
    }

    log.info(` ✅ Model ready in ${elapsed}s`);
    return true;
  } catch (err) {
    // fetch rejects on timeout/abort and on connection errors; report those
    // through the same boolean channel as an HTTP failure.
    const msg = err instanceof Error ? err.message : String(err);
    log.error(` ❌ Pre-warm failed after ${elapsedSeconds()}s: ${msg}`);
    return false;
  }
}

/**
 * Runs every entry of `TEST_CASES` against one model, sequentially, logging
 * progress as it goes.
 *
 * @param modelId - Model identifier to benchmark.
 * @returns Per-test results plus token/time totals. A test that throws is
 *   recorded as failed and contributes nothing to the totals.
 */
async function benchmarkModel(modelId: string): Promise<BenchmarkResult> {
  log.info(`\n${"=".repeat(60)}`);
  log.info(` BENCHMARKING: ${modelId}`);
  log.info(`${"=".repeat(60)}`);

  let totalTokens = 0;
  let totalTimeMs = 0;
  const results: (TestResult | { passed: false; details: string })[] = [];

  for (const test of TEST_CASES) {
    log.info(`\n ${test.name} (${test.complexity} complexity):`);

    try {
      const { result, tokens, timeMs } = await runTest(modelId, test);
      totalTokens += tokens;
      totalTimeMs += timeMs;
      results.push(result);

      const status = result.passed ? "PASS ✅" : "FAIL ❌";
      log.info(` ${status} (${tokens} tokens, ${(timeMs / 1000).toFixed(1)}s)`);
      log.info(` ${result.details}`);
      if (result.toolCalls?.length) {
        log.info(` Tools: ${result.toolCalls.join(" → ")}`);
      }
    } catch (err) {
      const msg = err instanceof Error ? err.message : String(err);
      results.push({ passed: false, details: `❌ Error: ${msg}` });
      log.info(` FAIL ❌ — ${msg}`);
    }
  }

  const avgTokPerSec =
    totalTimeMs > 0 ? totalTokens / (totalTimeMs / 1000) : 0;

  // The loop pushes exactly one entry per test, so indices 0..2 line up with
  // the three entries of TEST_CASES.
  return {
    model: modelId,
    testA: results[0],
    testB: results[1],
    testC: results[2],
    totalTokens,
    totalTimeMs,
    avgTokPerSec,
  };
}

/**
 * Prints a results table followed by a one-line verdict per model.
 *
 * @param results - One entry per benchmarked model, printed in array order.
 */
function printSummary(results: BenchmarkResult[]): void {
  log.info(`\n${"=".repeat(70)}`);
  log.info(" BENCHMARK SUMMARY");
  log.info(`${"=".repeat(70)}`);

  // Header cells are sized to the same widths the rows are padded to
  // (model: 28, tokens: 6, speed: 13) so the columns line up.
  const header = `| ${"Model".padEnd(28)} | Test A | Test B | Test C | Tokens | Speed (tok/s) |`;
  const sep = "|" + "-".repeat(header.length - 2) + "|";
  // Both marks render two terminal columns wide; two spaces either side
  // fills the 6-column "Test X" cell.
  const mark = (ok: boolean): string => `  ${ok ? "✅" : "❌"}  `;

  log.info(header);
  log.info(sep);

  for (const r of results) {
    const a = mark(r.testA.passed);
    const b = mark(r.testB.passed);
    const c = mark(r.testC.passed);
    log.info(
      `| ${r.model.padEnd(28)} | ${a} | ${b} | ${c} | ${String(r.totalTokens).padStart(6)} | ${r.avgTokPerSec.toFixed(1).padStart(13)} |`,
    );
  }

  log.info(sep);
  log.info("");

  for (const r of results) {
    const passed = [r.testA, r.testB, r.testC].filter((t) => t.passed).length;
    let verdict: string;
    if (passed === 3) verdict = "Primary candidate — all tests passed";
    else if (passed === 2) verdict = "Viable for simple/medium tasks";
    else if (passed === 1) verdict = "Simple tasks only";
    else verdict = "Not viable for agentic use";
    log.info(` ${r.model}: ${passed}/3 — ${verdict}`);
  }
}

/**
 * CLI entry point: benchmarks each model named on the command line, prints a
 * summary, and writes the raw results to a timestamped JSON file under /tmp.
 *
 * Exits with status 1 when no model is given. Models that are missing from
 * the registry or fail to pre-warm are skipped.
 */
async function main(): Promise<void> {
  const models = process.argv.slice(2);

  if (models.length === 0) {
    log.error(
      "Usage: npx tsx benchmark-models.ts <model1> [model2] [model3] ...",
    );
    log.error(
      "Example: npx tsx benchmark-models.ts qwen3-8b hermes-4-70b nemotron-h-47b-iq3",
    );
    process.exit(1);
  }

  log.info(`Model Boss: ${MODEL_BOSS_URL}`);
  log.info(`Models to test: ${models.join(", ")}`);
  log.info(`Tests per model: ${TEST_CASES.length}`);

  const results: BenchmarkResult[] = [];

  for (const modelId of models) {
    const check = await fetch(`${MODEL_BOSS_URL}/api/v1/models/${modelId}`);
    if (!check.ok) {
      log.error(`\n❌ Model "${modelId}" not found in registry. Skipping.`);
      continue;
    }

    const warmed = await prewarmModel(modelId);
    if (!warmed) {
      log.error(` Skipping ${modelId} — failed to load.`);
      continue;
    }

    const result = await benchmarkModel(modelId);
    results.push(result);
  }

  if (results.length > 0) {
    printSummary(results);

    // Colons are stripped from the ISO timestamp to keep the filename portable.
    const outPath = `/tmp/kthulu-benchmark-${new Date().toISOString().slice(0, 19).replace(/:/g, "")}.json`;
    const { writeFileSync } = await import("node:fs");
    writeFileSync(outPath, JSON.stringify(results, null, 2));
    log.info(`\nResults saved to: ${outPath}`);
  }
}

main().catch((err) => {
  log.error("Benchmark failed:", err);
  process.exit(1);
});