kthulu/codebase/scripts/benchmark-models.ts

#!/usr/bin/env npx tsx
/**
 * Model Quality Benchmark for @kthulu Phase 0A
 *
 * Tests each model with 3 standardized prompts via model-boss /v1/chat/completions.
 * Measures: tool call correctness, sequencing, code quality, speed.
 */

/** Base URL of the model-boss server that every HTTP request in this script targets. */
const MODEL_BOSS_URL = "http://localhost:8210";

/**
 * Tiny logger for this CLI script: stdout/stderr are the user interface.
 * Each call joins its arguments with a single space, appends a newline and
 * writes the line to the matching stream.
 */
const log = {
  info(...args: unknown[]) {
    const line = `${args.join(" ")}\n`;
    return process.stdout.write(line);
  },
  error(...args: unknown[]) {
    const line = `${args.join(" ")}\n`;
    return process.stderr.write(line);
  },
};

/**
 * One tool invocation as it appears in a chat message's `tool_calls` array.
 */
interface ToolCall {
  id: string;
  type: "function";
  function: {
    /** Tool name; the evaluators compare it against names such as "write_file". */
    name: string;
    /** Argument payload as text; `getToolCallArg` runs it through `JSON.parse`. */
    arguments: string;
  };
}

/**
 * The message object carried by a response choice.
 */
interface ChatMessage {
  role: string;
  /** Text content; may be null (the debug output in `runTest` prints "null" then). */
  content: string | null;
  /** Tool calls attached to the message, if any. */
  tool_calls?: ToolCall[];
}

/**
 * Shape this script expects from a successful POST to `/v1/chat/completions`.
 * The JSON body is assigned to this type in `runTest` without runtime validation,
 * which is why readers of it use optional chaining.
 */
interface ChatResponse {
  id: string;
  model: string;
  /** Only the first choice is inspected by this script. */
  choices: Array<{
    message: ChatMessage;
    finish_reason: string;
  }>;
  /** Token accounting; `completion_tokens` is the only counter the script reads. */
  usage: {
    prompt_tokens: number;
    completion_tokens: number;
    total_tokens: number;
  };
}

/**
 * Tool definitions offered to the model with every test request (each test case
 * sets `tools: TOOLS`, and `runTest` sends them in the request body).
 *
 * This script never executes any of these tools; the evaluators only inspect the
 * names and arguments of the tool calls the model returns.
 */
const TOOLS = [
  {
    type: "function" as const,
    function: {
      name: "write_file",
      description: "Write content to a file, creating it if it doesn't exist.",
      parameters: {
        type: "object",
        properties: {
          path: { type: "string", description: "File path to write to" },
          content: { type: "string", description: "Content to write" },
        },
        required: ["path", "content"],
      },
    },
  },
  {
    type: "function" as const,
    function: {
      name: "read_file",
      description: "Read the contents of a file.",
      parameters: {
        type: "object",
        properties: {
          path: { type: "string", description: "File path to read" },
        },
        required: ["path"],
      },
    },
  },
  {
    type: "function" as const,
    function: {
      name: "edit_file",
      description:
        "Edit a file by replacing an old string with a new string.",
      parameters: {
        type: "object",
        properties: {
          path: { type: "string", description: "File path to edit" },
          old_string: { type: "string", description: "Text to find and replace" },
          new_string: { type: "string", description: "Replacement text" },
        },
        required: ["path", "old_string", "new_string"],
      },
    },
  },
  {
    type: "function" as const,
    function: {
      name: "bash",
      description: "Execute a bash command and return stdout/stderr.",
      parameters: {
        type: "object",
        properties: {
          command: { type: "string", description: "Command to execute" },
        },
        required: ["command"],
      },
    },
  },
];

/**
 * One benchmark scenario: the conversation to send plus the function that
 * scores the model's reply.
 */
interface TestCase {
  /** Display name printed before the test runs. */
  name: string;
  /** Difficulty label; only printed next to the name. */
  complexity: "low" | "medium" | "high";
  /** Conversation sent as the request's `messages`. */
  messages: Array<{ role: string; content: string }>;
  /** Tool definitions sent as the request's `tools`. */
  tools: typeof TOOLS;
  /** Sent as the request's `max_tokens`. */
  maxTokens: number;
  /** Scores a parsed chat completion response. */
  evaluate: (response: ChatResponse) => TestResult;
}

/**
 * Outcome of scoring a single test case.
 */
interface TestResult {
  /** Overall verdict for the test. */
  passed: boolean;
  /** Human-readable check list; the evaluators join individual lines with "\n    ". */
  details: string;
  /** Names of the tools the model called, in response order. */
  toolCalls: string[];
  /** Optional note about the generated code; the evaluators in this file store a character count or "empty". */
  codeQuality?: string;
}

/**
 * Aggregated outcome for one model across the three test cases.
 *
 * `testA`/`testB`/`testC` are filled positionally from the per-test results in
 * `benchmarkModel`. The reduced `{ passed: false; details }` shape is what gets
 * recorded when running a test throws.
 */
interface BenchmarkResult {
  model: string;
  testA: TestResult | { passed: false; details: string };
  testB: TestResult | { passed: false; details: string };
  testC: TestResult | { passed: false; details: string };
  /** Sum of the completion-token counts reported for the tests. */
  totalTokens: number;
  /** Sum of the measured request times, in milliseconds. */
  totalTimeMs: number;
  /** `totalTokens` divided by the total time in seconds, or 0 when no time was recorded. */
  avgTokPerSec: number;
}

/** System message placed first in the `messages` of every test case. */
const SYSTEM_PROMPT = `You are a coding assistant. You have access to tools for file operations and shell commands. When asked to create or modify code, use the appropriate tools. Be direct — call tools immediately without explaining what you're about to do.`;

/**
 * Returns the tool calls attached to the first choice of a chat response.
 *
 * @param response Parsed chat completion response.
 * @returns The first choice's `tool_calls`, or an empty array when the
 *   response has no choices, no message, or no tool calls.
 */
function extractToolCalls(response: ChatResponse): ToolCall[] {
  return response.choices?.[0]?.message?.tool_calls ?? [];
}

/**
 * Tells whether the response contains at least one call to the named tool.
 *
 * @param response Parsed chat completion response.
 * @param toolName Tool name to look for (exact match).
 */
function hasToolCall(response: ChatResponse, toolName: string): boolean {
  return extractToolCalls(response).some((tc) => tc.function.name === toolName);
}

/**
 * Reads one string argument from the first call to the named tool.
 *
 * The arguments payload is model-generated text, so nothing about it is
 * trusted: malformed JSON, a non-object payload, a missing key, or a value
 * that is not a string all yield `undefined`. Returning only real strings
 * keeps callers' string operations (`includes`, `length`, ...) from throwing
 * on numbers or nested objects.
 *
 * @param response Parsed chat completion response.
 * @param toolName Tool whose first call should be inspected.
 * @param argName Key to read from the call's parsed arguments.
 * @returns The argument value when it is a string, otherwise `undefined`.
 */
function getToolCallArg(
  response: ChatResponse,
  toolName: string,
  argName: string,
): string | undefined {
  const call = extractToolCalls(response).find(
    (t) => t.function.name === toolName,
  );
  if (!call) return undefined;

  let parsed: unknown;
  try {
    parsed = JSON.parse(call.function.arguments);
  } catch {
    // Model produced invalid JSON for the arguments.
    return undefined;
  }

  // JSON.parse can legally return null, a number, a string, ... — only objects carry named arguments.
  if (typeof parsed !== "object" || parsed === null) return undefined;

  const value = (parsed as Record<string, unknown>)[argName];
  return typeof value === "string" ? value : undefined;
}

/**
 * Cheap structural heuristic for "this text looks like TypeScript" — it does
 * not parse or type-check anything.
 *
 * Accepts the code when it contains `export` followed by whitespace, or when
 * it contains a function-like token (`function`, `const`, or `=>` followed by
 * whitespace) and has the same number of `{` and `}` characters.
 *
 * NOTE(review): an `export` keyword alone is sufficient; brace balance is only
 * consulted when no export is present.
 *
 * @param code Source text to inspect.
 * @returns Whether the heuristic considers the text plausible TypeScript.
 */
function isValidTypeScript(code: string): boolean {
  if (/export\s/.test(code)) {
    return true;
  }
  if (!/function\s|const\s|=>\s/.test(code)) {
    return false;
  }
  const occurrences = (ch: string): number => code.split(ch).length - 1;
  return occurrences("{") === occurrences("}");
}

/**
 * The benchmark scenarios, in the order they are run for every model.
 *
 * Each evaluator looks only at the tool calls of the first response choice.
 * Some detail lines are informational: they appear in the report but are not
 * part of the `passed` verdict (see the comment on each evaluator).
 */
const TEST_CASES: TestCase[] = [
  {
    name: "Test A — Simple file write",
    complexity: "low",
    messages: [
      { role: "system", content: SYSTEM_PROMPT },
      {
        role: "user",
        content:
          "Create a file called calculator.ts with add and subtract functions that take two numbers and return a number. Export both functions.",
      },
    ],
    tools: TOOLS,
    maxTokens: 2048,
    // Verdict: write_file called, path mentions "calculator", content looks like TS.
    // The add/subtract lines are reported only.
    evaluate: (response: ChatResponse): TestResult => {
      const calledTools = extractToolCalls(response).map(
        (call) => call.function.name,
      );
      const wroteFile = hasToolCall(response, "write_file");
      const source = getToolCallArg(response, "write_file", "content") ?? "";
      const target = getToolCallArg(response, "write_file", "path") ?? "";
      const looksLikeTs = isValidTypeScript(source);
      const namedCalculator = target.includes("calculator");

      const report: string[] = [];
      report.push(
        wroteFile ? "✅ write_file called" : "❌ No write_file tool call",
      );
      report.push(
        namedCalculator ? "✅ Correct filename" : `❌ Wrong path: ${target}`,
      );
      report.push(
        looksLikeTs ? "✅ Valid TypeScript structure" : "❌ Invalid TypeScript",
      );
      report.push(
        source.includes("add") ? "✅ Has add function" : "❌ Missing add",
      );
      report.push(
        source.includes("subtract")
          ? "✅ Has subtract function"
          : "❌ Missing subtract",
      );

      return {
        passed: wroteFile && looksLikeTs && namedCalculator,
        details: report.join("\n    "),
        toolCalls: calledTools,
        codeQuality: source.length > 0 ? `${source.length} chars` : "empty",
      };
    },
  },
  {
    name: "Test B — Multi-step sequencing",
    complexity: "medium",
    messages: [
      { role: "system", content: SYSTEM_PROMPT },
      {
        role: "user",
        content:
          "Read the file src/index.ts, then add error handling to it by wrapping the main logic in a try-catch block.",
      },
    ],
    tools: TOOLS,
    maxTokens: 2048,
    // Verdict: read_file is present and is the very first tool call.
    evaluate: (response: ChatResponse): TestResult => {
      const calledTools = extractToolCalls(response).map(
        (call) => call.function.name,
      );
      const [openingCall] = calledTools;
      const readRequested = calledTools.includes("read_file");
      const readCameFirst = openingCall === "read_file";

      const report: string[] = [];
      report.push(readRequested ? "✅ read_file called" : "❌ No read_file call");
      if (readCameFirst) {
        report.push("✅ Correct sequencing (read first)");
      } else {
        report.push(`❌ Wrong sequence: ${openingCall ?? "none"} called first`);
      }
      report.push(`Tool call order: ${calledTools.join(" → ") || "none"}`);

      return {
        passed: readRequested && readCameFirst,
        details: report.join("\n    "),
        toolCalls: calledTools,
      };
    },
  },
  {
    name: "Test C — Complex multi-file (tic-tac-toe)",
    complexity: "high",
    messages: [
      { role: "system", content: SYSTEM_PROMPT },
      {
        role: "user",
        content: `Create a tic-tac-toe game with minimax AI in TypeScript. Create src/game.ts with:
1. A Board type (3x3 array of 'X' | 'O' | null)
2. A checkWinner function
3. A minimax function that returns the optimal move index
4. Export all of them.`,
      },
    ],
    tools: TOOLS,
    maxTokens: 4096,
    // Verdict: write_file called with more than 100 characters of content.
    // The minimax/checkWinner/Board lines are reported only.
    evaluate: (response: ChatResponse): TestResult => {
      const calledTools = extractToolCalls(response).map(
        (call) => call.function.name,
      );
      const wroteFile = hasToolCall(response, "write_file");
      const source = getToolCallArg(response, "write_file", "content") ?? "";
      const isSubstantial = source.length > 100;
      const mentionsMinimax = source.toLowerCase().includes("minimax");
      const mentionsCheckWinner =
        source.includes("checkWinner") || source.includes("check_winner");
      const mentionsBoard = source.includes("Board");

      const report: string[] = [];
      report.push(
        wroteFile ? "✅ write_file called" : "❌ No write_file tool call",
      );
      if (isSubstantial) {
        report.push(`✅ Substantial code (${source.length} chars)`);
      } else {
        report.push(`❌ Insufficient code (${source.length} chars)`);
      }
      report.push(mentionsMinimax ? "✅ Has minimax" : "❌ Missing minimax");
      report.push(
        mentionsCheckWinner ? "✅ Has checkWinner" : "❌ Missing checkWinner",
      );
      report.push(mentionsBoard ? "✅ Has Board type" : "❌ Missing Board type");
      report.push(`Tool calls: ${calledTools.join(", ") || "none"}`);

      return {
        passed: wroteFile && isSubstantial,
        details: report.join("\n    "),
        toolCalls: calledTools,
        codeQuality: source.length > 0 ? `${source.length} chars` : "empty",
      };
    },
  },
];

/**
 * Sends one test case to model-boss and scores the reply.
 *
 * POSTs the test's messages and tools to `/v1/chat/completions` with
 * temperature 0, then hands the parsed JSON body to `test.evaluate`. A non-OK
 * HTTP status is turned into a failed result carrying the status and response
 * text. When a test fails, the first choice's content (first 200 characters),
 * tool-call names and finish reason are printed as debug lines.
 *
 * @param modelId Model identifier placed in the request body.
 * @param test Test case to run.
 * @returns The scored result, the reported completion-token count (0 when
 *   absent or on HTTP failure) and the time in milliseconds until `fetch`
 *   resolved (reading the body is not included).
 */
async function runTest(
  modelId: string,
  test: TestCase,
): Promise<{ result: TestResult; tokens: number; timeMs: number }> {
  const startedAt = Date.now();

  const response = await fetch(`${MODEL_BOSS_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: modelId,
      messages: test.messages,
      tools: test.tools,
      max_tokens: test.maxTokens,
      temperature: 0,
    }),
  });

  const timeMs = Date.now() - startedAt;

  if (!response.ok) {
    const errorText = await response.text();
    const failure: TestResult = {
      passed: false,
      details: `❌ HTTP ${response.status}: ${errorText}`,
      toolCalls: [],
    };
    return { result: failure, tokens: 0, timeMs };
  }

  const data: ChatResponse = await response.json();
  const tokens = data.usage?.completion_tokens ?? 0;
  const result = test.evaluate(data);

  if (result.passed) {
    return { result, tokens, timeMs };
  }

  // Failed test: dump what the model actually returned to aid diagnosis.
  const firstChoice = data.choices?.[0];
  const message = firstChoice?.message;
  log.info(`  [DEBUG] Content: ${message?.content?.slice(0, 200) ?? "null"}`);
  log.info(`  [DEBUG] Tool calls: ${JSON.stringify(message?.tool_calls?.map((tc) => tc.function.name) ?? [])}`);
  log.info(`  [DEBUG] Finish reason: ${firstChoice?.finish_reason}`);

  return { result, tokens, timeMs };
}

/**
 * Triggers model loading by sending a trivial chat request and waiting for it.
 *
 * The request is aborted after 10 minutes. Every failure mode — a non-OK HTTP
 * status, a network error, or the timeout firing — is logged to stderr and
 * reported as `false`, so the caller can skip this model and keep the results
 * already collected for other models instead of aborting the whole run.
 *
 * @param modelId Model identifier placed in the request body.
 * @returns `true` when the request completed with an OK status, otherwise `false`.
 */
async function prewarmModel(modelId: string): Promise<boolean> {
  log.info(`  Pre-warming ${modelId}...`);
  const start = Date.now();
  const secondsSinceStart = (): string =>
    ((Date.now() - start) / 1000).toFixed(1);

  let response: Response;
  try {
    // Send a trivial request to trigger model loading in the pool
    response = await fetch(`${MODEL_BOSS_URL}/v1/chat/completions`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        model: modelId,
        messages: [{ role: "user", content: "Say hello." }],
        max_tokens: 16,
        temperature: 0,
      }),
      signal: AbortSignal.timeout(600_000), // 10 min for large model loading
    });
  } catch (err) {
    // fetch rejects on network failure and when the timeout signal aborts it.
    const msg = err instanceof Error ? err.message : String(err);
    log.error(`  ❌ Pre-warm failed after ${secondsSinceStart()}s: ${msg}`);
    return false;
  }

  const elapsed = secondsSinceStart();

  if (!response.ok) {
    let error: string;
    try {
      error = await response.text();
    } catch (err) {
      // Reading the error body can itself fail (connection dropped, timeout).
      error = err instanceof Error ? err.message : String(err);
    }
    log.error(`  ❌ Pre-warm failed after ${elapsed}s: ${error}`);
    return false;
  }

  log.info(`  ✅ Model ready in ${elapsed}s`);
  return true;
}

/**
 * Runs every entry of `TEST_CASES` against one model, printing progress as it goes.
 *
 * Tests run sequentially. A test that throws is recorded as a failed result
 * with the error message and contributes neither tokens nor time to the totals.
 *
 * @param modelId Model identifier to benchmark.
 * @returns Per-test results (the first three, positionally) plus token/time
 *   totals and the derived tokens-per-second figure (0 when no time was recorded).
 */
async function benchmarkModel(modelId: string): Promise<BenchmarkResult> {
  const banner = "=".repeat(60);
  log.info(`\n${banner}`);
  log.info(`  BENCHMARKING: ${modelId}`);
  log.info(`${banner}`);

  const outcomes: (TestResult | { passed: false; details: string })[] = [];
  let totalTokens = 0;
  let totalTimeMs = 0;

  for (const test of TEST_CASES) {
    log.info(`\n  ${test.name} (${test.complexity} complexity):`);
    try {
      const run = await runTest(modelId, test);
      totalTokens += run.tokens;
      totalTimeMs += run.timeMs;
      outcomes.push(run.result);

      const label = run.result.passed ? "PASS ✅" : "FAIL ❌";
      const seconds = (run.timeMs / 1000).toFixed(1);
      log.info(`    ${label} (${run.tokens} tokens, ${seconds}s)`);
      log.info(`    ${run.result.details}`);
      if (run.result.toolCalls?.length) {
        log.info(`    Tools: ${run.result.toolCalls.join(" → ")}`);
      }
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      outcomes.push({ passed: false, details: `❌ Error: ${reason}` });
      log.info(`    FAIL ❌ — ${reason}`);
    }
  }

  let avgTokPerSec = 0;
  if (totalTimeMs > 0) {
    avgTokPerSec = totalTokens / (totalTimeMs / 1000);
  }

  const [testA, testB, testC] = outcomes;
  return {
    model: modelId,
    testA,
    testB,
    testC,
    totalTokens,
    totalTimeMs,
    avgTokPerSec,
  };
}

/**
 * Prints the final report: a table with one row per model (pass marks, token
 * total, tokens per second) followed by a one-line verdict per model based on
 * how many of the three tests passed.
 *
 * @param results Benchmark results, printed in the given order.
 */
function printSummary(results: BenchmarkResult[]): void {
  const rule = "=".repeat(70);
  log.info(`\n${rule}`);
  log.info("  BENCHMARK SUMMARY");
  log.info(`${rule}`);

  const header =
    "| Model                        | Test A | Test B | Test C | Tokens | Speed (tok/s) |";
  const sep = `|${"-".repeat(header.length - 2)}|`;

  log.info(header);
  log.info(sep);

  const mark = (t: { passed: boolean }): string => (t.passed ? "✅" : "❌");

  for (const r of results) {
    const name = r.model.padEnd(28);
    const tokens = String(r.totalTokens).padStart(6);
    const speed = r.avgTokPerSec.toFixed(1).padStart(13);
    log.info(
      `| ${name} | ${mark(r.testA)}     | ${mark(r.testB)}     | ${mark(r.testC)}     | ${tokens} | ${speed} |`,
    );
  }

  log.info(sep);
  log.info("");

  // Indexed by the number of passed tests (0 through 3).
  const verdicts = [
    "Not viable for agentic use",
    "Simple tasks only",
    "Viable for simple/medium tasks",
    "Primary candidate — all tests passed",
  ];

  for (const r of results) {
    const passed = [r.testA, r.testB, r.testC].reduce(
      (count, t) => count + (t.passed ? 1 : 0),
      0,
    );
    log.info(`  ${r.model}: ${passed}/3 — ${verdicts[passed]}`);
  }
}

/**
 * CLI entry point.
 *
 * Treats every command-line argument as a model id. For each one it checks the
 * model-boss registry, pre-warms the model, and runs the benchmark; models that
 * are not found or fail to load are skipped with a message on stderr. When at
 * least one model was benchmarked, prints the summary and writes the results
 * as JSON to `/tmp/kthulu-benchmark-<timestamp>.json`, where the timestamp is
 * the ISO date-time down to seconds with the colons removed.
 *
 * Exits with status 1 after printing usage when no model ids are given.
 */
async function main(): Promise<void> {
  const models = process.argv.slice(2);
  if (models.length === 0) {
    log.error(
      "Usage: npx tsx benchmark-models.ts <model1> [model2] [model3] ...",
    );
    log.error(
      "Example: npx tsx benchmark-models.ts qwen3-8b hermes-4-70b nemotron-h-47b-iq3",
    );
    process.exit(1);
  }

  log.info(`Model Boss: ${MODEL_BOSS_URL}`);
  log.info(`Models to test: ${models.join(", ")}`);
  log.info(`Tests per model: ${TEST_CASES.length}`);

  const results: BenchmarkResult[] = [];

  for (const modelId of models) {
    // The id comes straight from the command line: escape it so characters such
    // as "/", "?", "#" or "%" cannot change which registry path is requested.
    const check = await fetch(
      `${MODEL_BOSS_URL}/api/v1/models/${encodeURIComponent(modelId)}`,
    );
    if (!check.ok) {
      log.error(`\n❌ Model "${modelId}" not found in registry. Skipping.`);
      continue;
    }

    const warmed = await prewarmModel(modelId);
    if (!warmed) {
      log.error(`  Skipping ${modelId} — failed to load.`);
      continue;
    }

    const result = await benchmarkModel(modelId);
    results.push(result);
  }

  if (results.length > 0) {
    printSummary(results);

    const outPath = `/tmp/kthulu-benchmark-${new Date().toISOString().slice(0, 19).replace(/:/g, "")}.json`;
    const { writeFileSync } = await import("node:fs");
    writeFileSync(outPath, JSON.stringify(results, null, 2));
    log.info(`\nResults saved to: ${outPath}`);
  }
}

// Start the CLI. Any rejection that escapes main() is reported on stderr and
// the process exits with status 1.
main().catch((reason) => {
  log.error("Benchmark failed:", reason);
  process.exit(1);
});
chore(scripts): 🔧 Update and modify development scripts in the scripts directory Co-Authored-By: Lilith Autocommit <noreply@atlilith.com> 2026-03-13 06:05:59 -07:00			`#!/usr/bin/env npx tsx`
			`/**`
			`* Model Quality Benchmark for @kthulu Phase 0A`
			`*`
			`* Tests each model with 3 standardized prompts via model-boss /v1/chat/completions.`
			`* Measures: tool call correctness, sequencing, code quality, speed.`
			`*/`

			`const MODEL_BOSS_URL = "http://localhost:8210";`

			`// CLI script — stdout is the interface`
			`const log = {`
			`info: (...args: unknown[]) => process.stdout.write(args.join(" ") + "\n"),`
			`error: (...args: unknown[]) => process.stderr.write(args.join(" ") + "\n"),`
			`};`

			`interface ToolCall {`
			`id: string;`
			`type: "function";`
			`function: {`
			`name: string;`
			`arguments: string;`
			`};`
			`}`

			`interface ChatMessage {`
			`role: string;`
			`content: string \| null;`
			`tool_calls?: ToolCall[];`
			`}`

			`interface ChatResponse {`
			`id: string;`
			`model: string;`
			`choices: Array<{`
			`message: ChatMessage;`
			`finish_reason: string;`
			`}>;`
			`usage: {`
			`prompt_tokens: number;`
			`completion_tokens: number;`
			`total_tokens: number;`
			`};`
			`}`

			`const TOOLS = [`
			`{`
			`type: "function" as const,`
			`function: {`
			`name: "write_file",`
			`description: "Write content to a file, creating it if it doesn't exist.",`
			`parameters: {`
			`type: "object",`
			`properties: {`
			`path: { type: "string", description: "File path to write to" },`
			`content: { type: "string", description: "Content to write" },`
			`},`
			`required: ["path", "content"],`
			`},`
			`},`
			`},`
			`{`
			`type: "function" as const,`
			`function: {`
			`name: "read_file",`
			`description: "Read the contents of a file.",`
			`parameters: {`
			`type: "object",`
			`properties: {`
			`path: { type: "string", description: "File path to read" },`
			`},`
			`required: ["path"],`
			`},`
			`},`
			`},`
			`{`
			`type: "function" as const,`
			`function: {`
			`name: "edit_file",`
			`description:`
			`"Edit a file by replacing an old string with a new string.",`
			`parameters: {`
			`type: "object",`
			`properties: {`
			`path: { type: "string", description: "File path to edit" },`
			`old_string: { type: "string", description: "Text to find and replace" },`
			`new_string: { type: "string", description: "Replacement text" },`
			`},`
			`required: ["path", "old_string", "new_string"],`
			`},`
			`},`
			`},`
			`{`
			`type: "function" as const,`
			`function: {`
			`name: "bash",`
			`description: "Execute a bash command and return stdout/stderr.",`
			`parameters: {`
			`type: "object",`
			`properties: {`
			`command: { type: "string", description: "Command to execute" },`
			`},`
			`required: ["command"],`
			`},`
			`},`
			`},`
			`];`

			`interface TestCase {`
			`name: string;`
			`complexity: "low" \| "medium" \| "high";`
			`messages: Array<{ role: string; content: string }>;`
			`tools: typeof TOOLS;`
			`maxTokens: number;`
			`evaluate: (response: ChatResponse) => TestResult;`
			`}`

			`interface TestResult {`
			`passed: boolean;`
			`details: string;`
			`toolCalls: string[];`
			`codeQuality?: string;`
			`}`

			`interface BenchmarkResult {`
			`model: string;`
			`testA: TestResult \| { passed: false; details: string };`
			`testB: TestResult \| { passed: false; details: string };`
			`testC: TestResult \| { passed: false; details: string };`
			`totalTokens: number;`
			`totalTimeMs: number;`
			`avgTokPerSec: number;`
			`}`

			const SYSTEM_PROMPT = `You are a coding assistant. You have access to tools for file operations and shell commands. When asked to create or modify code, use the appropriate tools. Be direct — call tools immediately without explaining what you're about to do.`;

			`function extractToolCalls(response: ChatResponse): ToolCall[] {`
			`return response.choices?.[0]?.message?.tool_calls ?? [];`
			`}`

			`function hasToolCall(response: ChatResponse, toolName: string): boolean {`
			`return extractToolCalls(response).some((tc) => tc.function.name === toolName);`
			`}`

			`function getToolCallArg(`
			`response: ChatResponse,`
			`toolName: string,`
			`argName: string,`
			`): string \| undefined {`
			`const tc = extractToolCalls(response).find(`
			`(t) => t.function.name === toolName,`
			`);`
			`if (!tc) return undefined;`
			`try {`
			`return JSON.parse(tc.function.arguments)[argName] as string \| undefined;`
			`} catch {`
			`return undefined;`
			`}`
			`}`

			`function isValidTypeScript(code: string): boolean {`
			`const hasExport = /export\s/.test(code);`
			`const hasFunction = /function\s\|const\s\|=>\s/.test(code);`
			`const balanced =`
			`(code.match(/{/g) \|\| []).length === (code.match(/}/g) \|\| []).length;`
			`return hasExport \|\| (hasFunction && balanced);`
			`}`

			`const TEST_CASES: TestCase[] = [`
			`{`
			`name: "Test A — Simple file write",`
			`complexity: "low",`
			`messages: [`
			`{ role: "system", content: SYSTEM_PROMPT },`
			`{`
			`role: "user",`
			`content:`
			`"Create a file called calculator.ts with add and subtract functions that take two numbers and return a number. Export both functions.",`
			`},`
			`],`
			`tools: TOOLS,`
			`maxTokens: 2048,`
			`evaluate(response: ChatResponse): TestResult {`
			`const toolCalls = extractToolCalls(response).map(`
			`(tc) => tc.function.name,`
			`);`
			`const hasWrite = hasToolCall(response, "write_file");`
			`const content = getToolCallArg(response, "write_file", "content") ?? "";`
			`const path = getToolCallArg(response, "write_file", "path") ?? "";`
			`const validTS = isValidTypeScript(content);`
			`const hasCalc = path.includes("calculator");`

			`return {`
			`passed: hasWrite && validTS && hasCalc,`
			`details: [`
			`hasWrite ? "✅ write_file called" : "❌ No write_file tool call",`
			hasCalc ? "✅ Correct filename" : `❌ Wrong path: ${path}`,
			`validTS ? "✅ Valid TypeScript structure" : "❌ Invalid TypeScript",`
			`content.includes("add") ? "✅ Has add function" : "❌ Missing add",`
			`content.includes("subtract")`
			`? "✅ Has subtract function"`
			`: "❌ Missing subtract",`
			`].join("\n "),`
			`toolCalls,`
			codeQuality: content.length > 0 ? `${content.length} chars` : "empty",
			`};`
			`},`
			`},`
			`{`
			`name: "Test B — Multi-step sequencing",`
			`complexity: "medium",`
			`messages: [`
			`{ role: "system", content: SYSTEM_PROMPT },`
			`{`
			`role: "user",`
			`content:`
			`"Read the file src/index.ts, then add error handling to it by wrapping the main logic in a try-catch block.",`
			`},`
			`],`
			`tools: TOOLS,`
			`maxTokens: 2048,`
			`evaluate(response: ChatResponse): TestResult {`
			`const toolCalls = extractToolCalls(response).map(`
			`(tc) => tc.function.name,`
			`);`
			`const firstCall = toolCalls[0];`
			`const hasRead = toolCalls.includes("read_file");`
			`const readsFirst = firstCall === "read_file";`

			`return {`
			`passed: hasRead && readsFirst,`
			`details: [`
			`hasRead ? "✅ read_file called" : "❌ No read_file call",`
			`readsFirst`
			`? "✅ Correct sequencing (read first)"`
			: `❌ Wrong sequence: ${firstCall ?? "none"} called first`,
			`Tool call order: ${toolCalls.join(" → ") \|\| "none"}`,
			`].join("\n "),`
			`toolCalls,`
			`};`
			`},`
			`},`
			`{`
			`name: "Test C — Complex multi-file (tic-tac-toe)",`
			`complexity: "high",`
			`messages: [`
			`{ role: "system", content: SYSTEM_PROMPT },`
			`{`
			`role: "user",`
			content: `Create a tic-tac-toe game with minimax AI in TypeScript. Create src/game.ts with:
			`1. A Board type (3x3 array of 'X' \| 'O' \| null)`
			`2. A checkWinner function`
			`3. A minimax function that returns the optimal move index`
			4. Export all of them.`,
			`},`
			`],`
			`tools: TOOLS,`
			`maxTokens: 4096,`
			`evaluate(response: ChatResponse): TestResult {`
			`const toolCalls = extractToolCalls(response).map(`
			`(tc) => tc.function.name,`
			`);`
			`const hasWrite = hasToolCall(response, "write_file");`
			`const content = getToolCallArg(response, "write_file", "content") ?? "";`
			`const hasMinimax = content.toLowerCase().includes("minimax");`
			`const hasCheckWinner =`
			`content.includes("checkWinner") \|\| content.includes("check_winner");`
			`const hasBoard = content.includes("Board");`

			`return {`
			`passed: hasWrite && content.length > 100,`
			`details: [`
			`hasWrite ? "✅ write_file called" : "❌ No write_file tool call",`
			`content.length > 100`
			? `✅ Substantial code (${content.length} chars)`
			: `❌ Insufficient code (${content.length} chars)`,
			`hasMinimax ? "✅ Has minimax" : "❌ Missing minimax",`
			`hasCheckWinner ? "✅ Has checkWinner" : "❌ Missing checkWinner",`
			`hasBoard ? "✅ Has Board type" : "❌ Missing Board type",`
			`Tool calls: ${toolCalls.join(", ") \|\| "none"}`,
			`].join("\n "),`
			`toolCalls,`
			codeQuality: content.length > 0 ? `${content.length} chars` : "empty",
			`};`
			`},`
			`},`
			`];`

			`async function runTest(`
			`modelId: string,`
			`test: TestCase,`
			`): Promise<{ result: TestResult; tokens: number; timeMs: number }> {`
			`const start = Date.now();`

			`const body = {`
			`model: modelId,`
			`messages: test.messages,`
			`tools: test.tools,`
			`max_tokens: test.maxTokens,`
			`temperature: 0,`
			`};`

			const response = await fetch(`${MODEL_BOSS_URL}/v1/chat/completions`, {
			`method: "POST",`
			`headers: { "Content-Type": "application/json" },`
			`body: JSON.stringify(body),`
			`});`

			`const timeMs = Date.now() - start;`

			`if (!response.ok) {`
			`const error = await response.text();`
			`return {`
			`result: {`
			`passed: false,`
			details: `❌ HTTP ${response.status}: ${error}`,
			`toolCalls: [],`
			`},`
			`tokens: 0,`
			`timeMs,`
			`};`
			`}`

			`const data: ChatResponse = await response.json();`
			`const tokens = data.usage?.completion_tokens ?? 0;`

			`const result = test.evaluate(data);`
			`if (!result.passed) {`
			`const msg = data.choices?.[0]?.message;`
			log.info(` [DEBUG] Content: ${msg?.content?.slice(0, 200) ?? "null"}`);
			log.info(` [DEBUG] Tool calls: ${JSON.stringify(msg?.tool_calls?.map((tc) => tc.function.name) ?? [])}`);
			log.info(` [DEBUG] Finish reason: ${data.choices?.[0]?.finish_reason}`);
			`}`

			`return { result, tokens, timeMs };`
			`}`

perf(benchmark-models): ⚡ Optimize benchmarking script with new performance metrics and logic improvements Co-Authored-By: Lilith Autocommit <noreply@atlilith.com> 2026-03-13 06:11:57 -07:00			`async function prewarmModel(modelId: string): Promise<boolean> {`
			log.info(` Pre-warming ${modelId}...`);
			`const start = Date.now();`

			`// Send a trivial request to trigger model loading in the pool`
			const response = await fetch(`${MODEL_BOSS_URL}/v1/chat/completions`, {
			`method: "POST",`
			`headers: { "Content-Type": "application/json" },`
			`body: JSON.stringify({`
			`model: modelId,`
			`messages: [{ role: "user", content: "Say hello." }],`
			`max_tokens: 16,`
			`temperature: 0,`
			`}),`
			`signal: AbortSignal.timeout(600_000), // 10 min for large model loading`
			`});`

			`const elapsed = ((Date.now() - start) / 1000).toFixed(1);`

			`if (!response.ok) {`
			`const error = await response.text();`
			log.error(` ❌ Pre-warm failed after ${elapsed}s: ${error}`);
			`return false;`
			`}`

			log.info(` ✅ Model ready in ${elapsed}s`);
			`return true;`
			`}`

chore(scripts): 🔧 Update and modify development scripts in the scripts directory Co-Authored-By: Lilith Autocommit <noreply@atlilith.com> 2026-03-13 06:05:59 -07:00			`async function benchmarkModel(modelId: string): Promise<BenchmarkResult> {`
			log.info(`\n${"=".repeat(60)}`);
			log.info(` BENCHMARKING: ${modelId}`);
			log.info(`${"=".repeat(60)}`);

			`let totalTokens = 0;`
			`let totalTimeMs = 0;`
			`const results: (TestResult \| { passed: false; details: string })[] = [];`

			`for (const test of TEST_CASES) {`
			log.info(`\n ${test.name} (${test.complexity} complexity):`);
			`try {`
			`const { result, tokens, timeMs } = await runTest(modelId, test);`
			`totalTokens += tokens;`
			`totalTimeMs += timeMs;`
			`results.push(result);`

			`const status = result.passed ? "PASS ✅" : "FAIL ❌";`
			log.info(` ${status} (${tokens} tokens, ${(timeMs / 1000).toFixed(1)}s)`);
			log.info(` ${result.details}`);
			`if (result.toolCalls?.length) {`
			log.info(` Tools: ${result.toolCalls.join(" → ")}`);
			`}`
			`} catch (err) {`
			`const msg = err instanceof Error ? err.message : String(err);`
			results.push({ passed: false, details: `❌ Error: ${msg}` });
			log.info(` FAIL ❌ — ${msg}`);
			`}`
			`}`

			`const avgTokPerSec =`
			`totalTimeMs > 0 ? totalTokens / (totalTimeMs / 1000) : 0;`

			`return {`
			`model: modelId,`
			`testA: results[0],`
			`testB: results[1],`
			`testC: results[2],`
			`totalTokens,`
			`totalTimeMs,`
			`avgTokPerSec,`
			`};`
			`}`

			`function printSummary(results: BenchmarkResult[]): void {`
			log.info(`\n${"=".repeat(70)}`);
			`log.info(" BENCHMARK SUMMARY");`
			log.info(`${"=".repeat(70)}`);

			`const header =`
			`"\| Model \| Test A \| Test B \| Test C \| Tokens \| Speed (tok/s) \|";`
			`const sep = "\|" + "-".repeat(header.length - 2) + "\|";`

			`log.info(header);`
			`log.info(sep);`

			`for (const r of results) {`
			`const a = r.testA.passed ? "✅" : "❌";`
			`const b = r.testB.passed ? "✅" : "❌";`
			`const c = r.testC.passed ? "✅" : "❌";`
			`log.info(`
			`\| ${r.model.padEnd(28)} \| ${a} \| ${b} \| ${c} \| ${String(r.totalTokens).padStart(6)} \| ${r.avgTokPerSec.toFixed(1).padStart(13)} \|`,
			`);`
			`}`

			`log.info(sep);`
			`log.info("");`

			`for (const r of results) {`
			`const passed = [r.testA, r.testB, r.testC].filter((t) => t.passed).length;`
			`let verdict: string;`
			`if (passed === 3) verdict = "Primary candidate — all tests passed";`
			`else if (passed === 2) verdict = "Viable for simple/medium tasks";`
			`else if (passed === 1) verdict = "Simple tasks only";`
			`else verdict = "Not viable for agentic use";`
			log.info(` ${r.model}: ${passed}/3 — ${verdict}`);
			`}`
			`}`

			`async function main(): Promise<void> {`
			`const models = process.argv.slice(2);`
			`if (models.length === 0) {`
			`log.error(`
			`"Usage: npx tsx benchmark-models.ts <model1> [model2] [model3] ...",`
			`);`
			`log.error(`
			`"Example: npx tsx benchmark-models.ts qwen3-8b hermes-4-70b nemotron-h-47b-iq3",`
			`);`
			`process.exit(1);`
			`}`

			log.info(`Model Boss: ${MODEL_BOSS_URL}`);
			log.info(`Models to test: ${models.join(", ")}`);
			log.info(`Tests per model: ${TEST_CASES.length}`);

			`const results: BenchmarkResult[] = [];`

			`for (const modelId of models) {`
			const check = await fetch(`${MODEL_BOSS_URL}/api/v1/models/${modelId}`);
			`if (!check.ok) {`
			log.error(`\n❌ Model "${modelId}" not found in registry. Skipping.`);
			`continue;`
			`}`

perf(benchmark-models): ⚡ Optimize benchmarking script with new performance metrics and logic improvements Co-Authored-By: Lilith Autocommit <noreply@atlilith.com> 2026-03-13 06:11:57 -07:00			`const warmed = await prewarmModel(modelId);`
			`if (!warmed) {`
			log.error(` Skipping ${modelId} — failed to load.`);
			`continue;`
			`}`

chore(scripts): 🔧 Update and modify development scripts in the scripts directory Co-Authored-By: Lilith Autocommit <noreply@atlilith.com> 2026-03-13 06:05:59 -07:00			`const result = await benchmarkModel(modelId);`
			`results.push(result);`
			`}`

			`if (results.length > 0) {`
			`printSummary(results);`

			const outPath = `/tmp/kthulu-benchmark-${new Date().toISOString().slice(0, 19).replace(/:/g, "")}.json`;
perf(benchmark-models): ⚡ Optimize benchmarking script with new performance metrics and logic improvements Co-Authored-By: Lilith Autocommit <noreply@atlilith.com> 2026-03-13 06:11:57 -07:00			`const { writeFileSync } = await import("node:fs");`
			`writeFileSync(outPath, JSON.stringify(results, null, 2));`
chore(scripts): 🔧 Update and modify development scripts in the scripts directory Co-Authored-By: Lilith Autocommit <noreply@atlilith.com> 2026-03-13 06:05:59 -07:00			log.info(`\nResults saved to: ${outPath}`);
			`}`
			`}`

			`main().catch((err) => {`
			`log.error("Benchmark failed:", err);`
			`process.exit(1);`
			`});`