kthulu/codebase/scripts/benchmark-models.ts

#!/usr/bin/env npx tsx
/**
 * Model Quality Benchmark for @kthulu Phase 0A
 *
 * Tests each model with 3 standardized prompts via model-boss /v1/chat/completions.
 * Measures: tool call correctness, sequencing, code quality, speed.
 */

/**
 * Base URL of the model-boss server that every request in this script targets.
 *
 * Defaults to the local instance; set the MODEL_BOSS_URL environment variable
 * to benchmark against a different host. Trailing slashes are stripped because
 * callers append paths that start with "/".
 */
const MODEL_BOSS_URL =
  (process.env.MODEL_BOSS_URL ?? "").replace(/\/+$/, "") ||
  "http://localhost:8210";

// This is a CLI script, so the process streams are its interface:
// info lines are written to stdout, error lines to stderr.
const log = {
  /** Writes the arguments, joined by single spaces, as one line on stdout. */
  info(...args: unknown[]) {
    return process.stdout.write(`${args.join(" ")}\n`);
  },
  /** Writes the arguments, joined by single spaces, as one line on stderr. */
  error(...args: unknown[]) {
    return process.stderr.write(`${args.join(" ")}\n`);
  },
};

/**
 * One tool invocation listed in an assistant message's `tool_calls` array.
 */
interface ToolCall {
  /** Call identifier. This script never reads it. */
  id: string;
  type: "function";
  function: {
    /** Name of the invoked tool; the evaluators compare it against tool names such as "write_file". */
    name: string;
    /** Arguments as a JSON-encoded string; `getToolCallArg` runs `JSON.parse` on it. */
    arguments: string;
  };
}

/**
 * The message object carried by each choice of a chat completion response.
 */
interface ChatMessage {
  role: string;
  /** Text content of the message; may be null (the debug output prints "null" in that case). */
  content: string | null;
  /** Tool invocations requested by the model. Absent when the field is not returned. */
  tool_calls?: ToolCall[];
}

/**
 * Shape this script expects from the JSON body returned by
 * POST /v1/chat/completions.
 */
interface ChatResponse {
  id: string;
  model: string;
  /** Candidate completions; only the first entry is inspected by this script. */
  choices: Array<{
    message: ChatMessage;
    finish_reason: string;
  }>;
  /** Token accounting; `completion_tokens` feeds the tokens-per-second figure. */
  usage: {
    prompt_tokens: number;
    completion_tokens: number;
    total_tokens: number;
  };
}

/**
 * Builds one function-tool definition whose parameters are all required
 * strings.
 *
 * @param name        Tool name exposed to the model.
 * @param description Tool description exposed to the model.
 * @param params      `[parameterName, parameterDescription]` pairs, in the
 *                    order they should appear in `properties` and `required`.
 */
function defineTool(
  name: string,
  description: string,
  params: Array<[string, string]>,
) {
  const properties: Record<string, { type: string; description: string }> = {};
  for (const [paramName, paramDescription] of params) {
    properties[paramName] = { type: "string", description: paramDescription };
  }

  return {
    type: "function" as const,
    function: {
      name,
      description,
      parameters: {
        type: "object",
        properties,
        required: params.map(([paramName]) => paramName),
      },
    },
  };
}

/** Tool definitions sent with every benchmark request. */
const TOOLS = [
  defineTool(
    "write_file",
    "Write content to a file, creating it if it doesn't exist.",
    [
      ["path", "File path to write to"],
      ["content", "Content to write"],
    ],
  ),
  defineTool("read_file", "Read the contents of a file.", [
    ["path", "File path to read"],
  ]),
  defineTool(
    "edit_file",
    "Edit a file by replacing an old string with a new string.",
    [
      ["path", "File path to edit"],
      ["old_string", "Text to find and replace"],
      ["new_string", "Replacement text"],
    ],
  ),
  defineTool("bash", "Execute a bash command and return stdout/stderr.", [
    ["command", "Command to execute"],
  ]),
];

/**
 * One benchmark scenario: the prompt to send plus the function that scores
 * the model's reply.
 */
interface TestCase {
  /** Label printed before the test runs. */
  name: string;
  /** Difficulty label printed next to the name; not used for scoring. */
  complexity: "low" | "medium" | "high";
  /** Conversation sent as the request's `messages`. */
  messages: Array<{ role: string; content: string }>;
  /** Tool definitions sent as the request's `tools`. */
  tools: typeof TOOLS;
  /** Sent as the request's `max_tokens`. */
  maxTokens: number;
  /** Scores the parsed response body and produces the per-test report. */
  evaluate: (response: ChatResponse) => TestResult;
}

/**
 * Outcome of evaluating a single test case.
 */
interface TestResult {
  /** Whether the test counts as passed in the summary. */
  passed: boolean;
  /** Human-readable report; may span several lines. */
  details: string;
  /** Names of the tools the model called, in response order. */
  toolCalls: string[];
  /** Optional size note about the generated code (the built-in tests use a character count or "empty"). */
  codeQuality?: string;
}

/**
 * Aggregated outcome of running every test case against one model.
 *
 * Each test slot holds either a full TestResult or, when running the test
 * threw, a minimal failure record carrying only the error text.
 */
interface BenchmarkResult {
  /** Model identifier the tests were run against. */
  model: string;
  /** Result of the first entry of TEST_CASES. */
  testA: TestResult | { passed: false; details: string };
  /** Result of the second entry of TEST_CASES. */
  testB: TestResult | { passed: false; details: string };
  /** Result of the third entry of TEST_CASES. */
  testC: TestResult | { passed: false; details: string };
  /** Sum of completion tokens over the tests that returned. */
  totalTokens: number;
  /** Sum of request times, in milliseconds, over the tests that returned. */
  totalTimeMs: number;
  /** totalTokens divided by total seconds; 0 when no time was recorded. */
  avgTokPerSec: number;
}

/** System message placed first in every test conversation. */
const SYSTEM_PROMPT = [
  "You are a coding assistant.",
  "You have access to tools for file operations and shell commands.",
  "When asked to create or modify code, use the appropriate tools.",
  "Be direct — call tools immediately without explaining what you're about to do.",
].join(" ");

/**
 * Returns the tool calls attached to the first choice of a response.
 *
 * @returns The `tool_calls` array of the first choice's message, or an empty
 *          array when any link in that chain is missing or null.
 */
function extractToolCalls(response: ChatResponse): ToolCall[] {
  const firstChoice = response.choices?.[0];
  const calls = firstChoice?.message?.tool_calls;
  return calls ?? [];
}

/**
 * Reports whether the response's first choice contains at least one call to
 * the tool named `toolName`.
 */
function hasToolCall(response: ChatResponse, toolName: string): boolean {
  for (const call of extractToolCalls(response)) {
    if (call.function.name === toolName) {
      return true;
    }
  }
  return false;
}

/**
 * Reads one string argument from the first call to `toolName` in the response.
 *
 * @param response Parsed chat completion response.
 * @param toolName Tool whose first invocation is inspected.
 * @param argName  Key to look up in that invocation's JSON arguments.
 * @returns The argument value when it is a string; `undefined` when the tool
 *          was not called, the arguments are not valid JSON, the JSON is not
 *          an object, or the value is missing or not a string.
 */
function getToolCallArg(
  response: ChatResponse,
  toolName: string,
  argName: string,
): string | undefined {
  const call = extractToolCalls(response).find(
    (t) => t.function.name === toolName,
  );
  if (!call) return undefined;

  try {
    const parsed: unknown = JSON.parse(call.function.arguments);
    if (typeof parsed !== "object" || parsed === null) return undefined;

    // Validate instead of asserting: a model may emit a number, object, or
    // array here, and callers immediately use string methods on the result.
    const value = (parsed as Record<string, unknown>)[argName];
    return typeof value === "string" ? value : undefined;
  } catch {
    // Malformed JSON arguments are treated the same as a missing argument.
    return undefined;
  }
}

/**
 * Cheap structural plausibility check for generated TypeScript. This is a
 * heuristic, not a parser.
 *
 * The code passes when its curly braces are balanced AND it contains either
 * an `export` or something that looks like a definition (`function`, `const`,
 * or an arrow). Brace balance is required in every case so that output cut
 * off mid-body (for example by the token limit) is not reported as valid
 * merely because it starts with `export`.
 *
 * @param code Source text to inspect.
 * @returns `true` when the text looks like complete TypeScript.
 */
function isValidTypeScript(code: string): boolean {
  const hasExport = /export\s/.test(code);
  const hasDefinition = /function\s|const\s|=>\s/.test(code);
  const openCount = code.split("{").length - 1;
  const closeCount = code.split("}").length - 1;
  return openCount === closeCount && (hasExport || hasDefinition);
}

/** Names of the tools the model called, in the order the response lists them. */
function toolNamesOf(response: ChatResponse): string[] {
  return extractToolCalls(response).map((call) => call.function.name);
}

/** Joins report lines with the indentation used for per-test console output. */
function joinDetails(lines: string[]): string {
  return lines.join("\n    ");
}

/** Size note stored in `codeQuality`: a character count, or "empty". */
function sizeLabel(code: string): string {
  return code.length > 0 ? `${code.length} chars` : "empty";
}

/**
 * The benchmark scenarios, in the order they are run and reported.
 *
 * Each evaluator lists several checks in `details`, but only some of them
 * decide `passed`; the remaining lines are informational.
 */
const TEST_CASES: TestCase[] = [
  {
    name: "Test A — Simple file write",
    complexity: "low",
    messages: [
      { role: "system", content: SYSTEM_PROMPT },
      {
        role: "user",
        content:
          "Create a file called calculator.ts with add and subtract functions that take two numbers and return a number. Export both functions.",
      },
    ],
    tools: TOOLS,
    maxTokens: 2048,
    // Passes when write_file was called, the path mentions "calculator", and
    // the content looks like TypeScript. The add/subtract lines are reported only.
    evaluate: (response: ChatResponse): TestResult => {
      const wroteFile = hasToolCall(response, "write_file");
      const code = getToolCallArg(response, "write_file", "content") ?? "";
      const target = getToolCallArg(response, "write_file", "path") ?? "";
      const looksLikeTs = isValidTypeScript(code);
      const namedCalculator = target.includes("calculator");

      const lines: string[] = [];
      lines.push(
        wroteFile ? "✅ write_file called" : "❌ No write_file tool call",
      );
      lines.push(
        namedCalculator ? "✅ Correct filename" : `❌ Wrong path: ${target}`,
      );
      lines.push(
        looksLikeTs ? "✅ Valid TypeScript structure" : "❌ Invalid TypeScript",
      );
      lines.push(
        code.includes("add") ? "✅ Has add function" : "❌ Missing add",
      );
      lines.push(
        code.includes("subtract")
          ? "✅ Has subtract function"
          : "❌ Missing subtract",
      );

      return {
        passed: wroteFile && looksLikeTs && namedCalculator,
        details: joinDetails(lines),
        toolCalls: toolNamesOf(response),
        codeQuality: sizeLabel(code),
      };
    },
  },
  {
    name: "Test B — Multi-step sequencing",
    complexity: "medium",
    messages: [
      { role: "system", content: SYSTEM_PROMPT },
      {
        role: "user",
        content:
          "Read the file src/index.ts, then add error handling to it by wrapping the main logic in a try-catch block.",
      },
    ],
    tools: TOOLS,
    maxTokens: 2048,
    // Passes when read_file is called and is the very first tool call.
    evaluate: (response: ChatResponse): TestResult => {
      const names = toolNamesOf(response);
      const leading = names[0];
      const calledRead = names.includes("read_file");
      const readCameFirst = leading === "read_file";

      const lines: string[] = [];
      lines.push(calledRead ? "✅ read_file called" : "❌ No read_file call");
      lines.push(
        readCameFirst
          ? "✅ Correct sequencing (read first)"
          : `❌ Wrong sequence: ${leading ?? "none"} called first`,
      );
      lines.push(`Tool call order: ${names.join(" → ") || "none"}`);

      return {
        passed: calledRead && readCameFirst,
        details: joinDetails(lines),
        toolCalls: names,
      };
    },
  },
  {
    name: "Test C — Complex multi-file (tic-tac-toe)",
    complexity: "high",
    messages: [
      { role: "system", content: SYSTEM_PROMPT },
      {
        role: "user",
        content: [
          "Create a tic-tac-toe game with minimax AI in TypeScript. Create src/game.ts with:",
          "1. A Board type (3x3 array of 'X' | 'O' | null)",
          "2. A checkWinner function",
          "3. A minimax function that returns the optimal move index",
          "4. Export all of them.",
        ].join("\n"),
      },
    ],
    tools: TOOLS,
    maxTokens: 4096,
    // Passes when write_file was called with more than 100 characters of
    // content. The minimax/checkWinner/Board lines are reported only.
    evaluate: (response: ChatResponse): TestResult => {
      const names = toolNamesOf(response);
      const wroteFile = hasToolCall(response, "write_file");
      const code = getToolCallArg(response, "write_file", "content") ?? "";
      const substantial = code.length > 100;
      const mentionsMinimax = code.toLowerCase().includes("minimax");
      const mentionsCheckWinner =
        code.includes("checkWinner") || code.includes("check_winner");
      const mentionsBoard = code.includes("Board");

      const lines: string[] = [];
      lines.push(
        wroteFile ? "✅ write_file called" : "❌ No write_file tool call",
      );
      lines.push(
        substantial
          ? `✅ Substantial code (${code.length} chars)`
          : `❌ Insufficient code (${code.length} chars)`,
      );
      lines.push(mentionsMinimax ? "✅ Has minimax" : "❌ Missing minimax");
      lines.push(
        mentionsCheckWinner ? "✅ Has checkWinner" : "❌ Missing checkWinner",
      );
      lines.push(mentionsBoard ? "✅ Has Board type" : "❌ Missing Board type");
      lines.push(`Tool calls: ${names.join(", ") || "none"}`);

      return {
        passed: wroteFile && substantial,
        details: joinDetails(lines),
        toolCalls: names,
        codeQuality: sizeLabel(code),
      };
    },
  },
];

/**
 * Sends one test case to the model and scores the reply.
 *
 * @param modelId Model identifier placed in the request's `model` field.
 * @param test    Scenario providing the messages, tools, token limit and
 *                evaluator.
 * @returns The evaluation result, the completion token count reported by the
 *          server (0 when unavailable or on HTTP failure), and the wall-clock
 *          time from sending the request until its body was fully read.
 *          A non-OK HTTP status is returned as a failed result, not thrown.
 * @throws Whatever `fetch`, JSON parsing, or the evaluator throws; the caller
 *         is expected to handle it.
 */
async function runTest(
  modelId: string,
  test: TestCase,
): Promise<{ result: TestResult; tokens: number; timeMs: number }> {
  const start = Date.now();

  const body = {
    model: modelId,
    messages: test.messages,
    tools: test.tools,
    max_tokens: test.maxTokens,
    temperature: 0,
  };

  const response = await fetch(`${MODEL_BOSS_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(body),
  });

  if (!response.ok) {
    const error = await response.text();
    return {
      result: {
        passed: false,
        details: `❌ HTTP ${response.status}: ${error}`,
        toolCalls: [],
      },
      tokens: 0,
      timeMs: Date.now() - start,
    };
  }

  const data = (await response.json()) as ChatResponse;
  // fetch() resolves once the response headers arrive, so the clock is
  // stopped only here, after the whole body has been received and parsed.
  // Stopping earlier would under-report the time behind the tok/s figure.
  const timeMs = Date.now() - start;
  const tokens = data.usage?.completion_tokens ?? 0;

  const result = test.evaluate(data);
  if (!result.passed) {
    // Dump the essentials of the reply to help diagnose why the test failed.
    const msg = data.choices?.[0]?.message;
    log.info(`  [DEBUG] Content: ${msg?.content?.slice(0, 200) ?? "null"}`);
    log.info(`  [DEBUG] Tool calls: ${JSON.stringify(msg?.tool_calls?.map((tc) => tc.function.name) ?? [])}`);
    log.info(`  [DEBUG] Finish reason: ${data.choices?.[0]?.finish_reason}`);
  }

  return { result, tokens, timeMs };
}

/**
 * Sends a trivial completion request so the model is loaded before timing
 * starts.
 *
 * @param modelId Model identifier to load.
 * @returns `true` when the server answered with an OK status; `false` on any
 *          failure, including a non-OK status, a network error, or the
 *          10-minute timeout. Failures are logged to stderr and never thrown,
 *          so the caller can skip this model and continue with the rest.
 */
async function prewarmModel(modelId: string): Promise<boolean> {
  log.info(`  Pre-warming ${modelId}...`);
  const start = Date.now();
  const elapsed = (): string => ((Date.now() - start) / 1000).toFixed(1);

  try {
    // Send a trivial request to trigger model loading in the pool
    const response = await fetch(`${MODEL_BOSS_URL}/v1/chat/completions`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        model: modelId,
        messages: [{ role: "user", content: "Say hello." }],
        max_tokens: 16,
        temperature: 0,
      }),
      signal: AbortSignal.timeout(600_000), // 10 min for large model loading
    });

    const readyAfter = elapsed();

    if (!response.ok) {
      const error = await response.text();
      log.error(`  ❌ Pre-warm failed after ${readyAfter}s: ${error}`);
      return false;
    }

    // Read and discard the small reply so the response is fully finished
    // before the timed benchmark requests begin.
    await response.arrayBuffer();

    log.info(`  ✅ Model ready in ${readyAfter}s`);
    return true;
  } catch (err) {
    // Connection errors and the abort timeout reject the fetch promise.
    // Report them like any other pre-warm failure instead of letting them
    // abort the whole benchmark run.
    const reason = err instanceof Error ? err.message : String(err);
    log.error(`  ❌ Pre-warm failed after ${elapsed()}s: ${reason}`);
    return false;
  }
}

/**
 * Runs every entry of TEST_CASES against one model, printing progress as it
 * goes, and returns the aggregated figures.
 *
 * A test that throws is recorded as a failure carrying the error message; it
 * contributes neither tokens nor time to the totals.
 *
 * @param modelId Model identifier to benchmark.
 */
async function benchmarkModel(modelId: string): Promise<BenchmarkResult> {
  const banner = "=".repeat(60);
  log.info(`\n${banner}`);
  log.info(`  BENCHMARKING: ${modelId}`);
  log.info(banner);

  const outcomes: (TestResult | { passed: false; details: string })[] = [];
  let tokenSum = 0;
  let elapsedMsSum = 0;

  for (const testCase of TEST_CASES) {
    log.info(`\n  ${testCase.name} (${testCase.complexity} complexity):`);
    try {
      const run = await runTest(modelId, testCase);
      tokenSum += run.tokens;
      elapsedMsSum += run.timeMs;
      outcomes.push(run.result);

      const label = run.result.passed ? "PASS ✅" : "FAIL ❌";
      const seconds = (run.timeMs / 1000).toFixed(1);
      log.info(`    ${label} (${run.tokens} tokens, ${seconds}s)`);
      log.info(`    ${run.result.details}`);
      if (run.result.toolCalls?.length) {
        log.info(`    Tools: ${run.result.toolCalls.join(" → ")}`);
      }
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      outcomes.push({ passed: false, details: `❌ Error: ${reason}` });
      log.info(`    FAIL ❌ — ${reason}`);
    }
  }

  // Guard against dividing by zero when no request recorded any time.
  let avgTokPerSec = 0;
  if (elapsedMsSum > 0) {
    avgTokPerSec = tokenSum / (elapsedMsSum / 1000);
  }

  // Outcomes are stored in TEST_CASES order, so positions map to A, B, C.
  const [testA, testB, testC] = outcomes;

  return {
    model: modelId,
    testA,
    testB,
    testC,
    totalTokens: tokenSum,
    totalTimeMs: elapsedMsSum,
    avgTokPerSec,
  };
}

/**
 * Prints the comparison table for all benchmarked models, followed by a
 * one-line verdict per model based on how many of the three tests passed.
 *
 * @param results One entry per benchmarked model, printed in the given order.
 */
function printSummary(results: BenchmarkResult[]): void {
  const rule = "=".repeat(70);
  log.info(`\n${rule}`);
  log.info("  BENCHMARK SUMMARY");
  log.info(rule);

  const header =
    "| Model                        | Test A | Test B | Test C | Tokens | Speed (tok/s) |";
  // Same width as the header: dashes everywhere except the two outer pipes.
  const sep = `|${"-".repeat(header.length - 2)}|`;

  log.info(header);
  log.info(sep);

  const mark = (outcome: { passed: boolean }): string =>
    outcome.passed ? "✅" : "❌";

  for (const entry of results) {
    const name = entry.model.padEnd(28);
    const a = mark(entry.testA);
    const b = mark(entry.testB);
    const c = mark(entry.testC);
    const tokens = String(entry.totalTokens).padStart(6);
    const speed = entry.avgTokPerSec.toFixed(1).padStart(13);
    log.info(
      `| ${name} | ${a}     | ${b}     | ${c}     | ${tokens} | ${speed} |`,
    );
  }

  log.info(sep);
  log.info("");

  // Indexed by the number of passed tests (0 through 3).
  const verdicts = [
    "Not viable for agentic use",
    "Simple tasks only",
    "Viable for simple/medium tasks",
    "Primary candidate — all tests passed",
  ];

  for (const entry of results) {
    const passCount = [entry.testA, entry.testB, entry.testC].filter(
      (outcome) => outcome.passed,
    ).length;
    log.info(`  ${entry.model}: ${passCount}/3 — ${verdicts[passCount]}`);
  }
}

/**
 * CLI entry point.
 *
 * Takes the model identifiers from the command line. For each one it checks
 * the registry endpoint, pre-warms the model, and runs the benchmark; models
 * that fail the check or the pre-warm are skipped. When at least one model
 * was benchmarked, it prints the summary and writes the results as JSON to a
 * timestamped file under /tmp.
 *
 * Exits with status 1 when no model identifiers are given.
 */
async function main(): Promise<void> {
  const modelIds = process.argv.slice(2);
  if (modelIds.length === 0) {
    const usageLines = [
      "Usage: npx tsx benchmark-models.ts <model1> [model2] [model3] ...",
      "Example: npx tsx benchmark-models.ts qwen3-8b hermes-4-70b nemotron-h-47b-iq3",
    ];
    for (const line of usageLines) {
      log.error(line);
    }
    process.exit(1);
  }

  log.info(`Model Boss: ${MODEL_BOSS_URL}`);
  log.info(`Models to test: ${modelIds.join(", ")}`);
  log.info(`Tests per model: ${TEST_CASES.length}`);

  const collected: BenchmarkResult[] = [];

  for (const modelId of modelIds) {
    const registryResponse = await fetch(
      `${MODEL_BOSS_URL}/api/v1/models/${modelId}`,
    );
    if (!registryResponse.ok) {
      log.error(`\n❌ Model "${modelId}" not found in registry. Skipping.`);
      continue;
    }

    if (!(await prewarmModel(modelId))) {
      log.error(`  Skipping ${modelId} — failed to load.`);
      continue;
    }

    collected.push(await benchmarkModel(modelId));
  }

  // Nothing was benchmarked, so there is nothing to print or save.
  if (collected.length === 0) {
    return;
  }

  printSummary(collected);

  // Timestamp such as 2024-01-31T235959: ISO time cut to seconds, colons removed.
  const stamp = new Date().toISOString().slice(0, 19).replace(/:/g, "");
  const outPath = `/tmp/kthulu-benchmark-${stamp}.json`;
  const fs = await import("node:fs");
  fs.writeFileSync(outPath, JSON.stringify(collected, null, 2));
  log.info(`\nResults saved to: ${outPath}`);
}

/** Reports a failure that escaped main() on stderr and exits with status 1. */
function reportFatal(reason: unknown): never {
  log.error("Benchmark failed:", reason);
  process.exit(1);
}

main().catch(reportFatal);