494 lines
15 KiB
TypeScript
494 lines
15 KiB
TypeScript
#!/usr/bin/env npx tsx
|
|
/**
 * Model Quality Benchmark for @kthulu Phase 0A
 *
 * Tests each model with 3 standardized prompts via model-boss /v1/chat/completions.
 * Measures: tool call correctness, sequencing, code quality, speed.
 */
|
|
|
|
/**
 * Base URL of the model-boss server that every request in this script targets.
 *
 * Defaults to the local instance on port 8210. Set the `MODEL_BOSS_URL`
 * environment variable to point the benchmark at a different server.
 * Trailing slashes are removed because callers append paths that begin
 * with "/".
 */
const MODEL_BOSS_URL = (
  // `||` rather than `??`: an empty or whitespace-only variable must fall back
  // to the default as well.
  process.env.MODEL_BOSS_URL?.trim() || "http://localhost:8210"
).replace(/\/+$/, "");
|
|
|
|
// CLI script — the standard streams are the interface.
const log = {
  /** Joins the arguments with single spaces and writes them as one line to stdout. */
  info(...args: unknown[]): boolean {
    return process.stdout.write(`${args.join(" ")}\n`);
  },

  /** Joins the arguments with single spaces and writes them as one line to stderr. */
  error(...args: unknown[]): boolean {
    return process.stderr.write(`${args.join(" ")}\n`);
  },
};
|
|
|
|
/** One tool invocation as it appears in a chat message's `tool_calls` list. */
interface ToolCall {
  id: string;
  type: "function";
  /**
   * The function being invoked. `arguments` is carried as a string; this
   * script parses it as JSON when it needs an individual argument.
   */
  function: { name: string; arguments: string };
}
|
|
|
|
/** A message returned inside a chat completion choice. */
interface ChatMessage {
  role: string;
  /** Text of the message; may be `null`. */
  content: string | null;
  /** Tool invocations attached to the message, when there are any. */
  tool_calls?: ToolCall[];
}
|
|
|
|
/** Body of a `/v1/chat/completions` response as modelled by this script. */
interface ChatResponse {
  id: string;
  model: string;
  /** Completion candidates; the script only inspects the first entry. */
  choices: { message: ChatMessage; finish_reason: string }[];
  /** Token accounting reported alongside the completion. */
  usage: {
    prompt_tokens: number;
    completion_tokens: number;
    total_tokens: number;
  };
}
|
|
|
|
/**
 * Builds one function-tool definition whose parameters are all required
 * strings. `params` maps each parameter name to its description; insertion
 * order is kept for both `properties` and `required`.
 */
function defineTool(
  name: string,
  description: string,
  params: Record<string, string>,
) {
  const properties: Record<string, { type: string; description: string }> = {};
  for (const [paramName, paramDescription] of Object.entries(params)) {
    properties[paramName] = { type: "string", description: paramDescription };
  }
  return {
    type: "function" as const,
    function: {
      name,
      description,
      parameters: {
        type: "object",
        properties,
        required: Object.keys(params),
      },
    },
  };
}

/** Tool definitions passed as `tools` with every benchmark request. */
const TOOLS = [
  defineTool(
    "write_file",
    "Write content to a file, creating it if it doesn't exist.",
    {
      path: "File path to write to",
      content: "Content to write",
    },
  ),
  defineTool("read_file", "Read the contents of a file.", {
    path: "File path to read",
  }),
  defineTool(
    "edit_file",
    "Edit a file by replacing an old string with a new string.",
    {
      path: "File path to edit",
      old_string: "Text to find and replace",
      new_string: "Replacement text",
    },
  ),
  defineTool("bash", "Execute a bash command and return stdout/stderr.", {
    command: "Command to execute",
  }),
];
|
|
|
|
/** One benchmark scenario: the conversation to send plus the grader for the reply. */
interface TestCase {
  name: string;
  complexity: "low" | "medium" | "high";
  /** Conversation sent as `messages` in the request body. */
  messages: { role: string; content: string }[];
  /** Tool definitions sent as `tools` in the request body. */
  tools: typeof TOOLS;
  /** Sent as `max_tokens` in the request body. */
  maxTokens: number;
  /** Grades the parsed response for this scenario. */
  evaluate: (response: ChatResponse) => TestResult;
}
|
|
|
|
/** Grade produced by a test case's `evaluate` function. */
interface TestResult {
  passed: boolean;
  /** Human-readable check lines, already joined into one string. */
  details: string;
  /** Names of the tools the model called, in call order. */
  toolCalls: string[];
  /** Optional short note about the generated code (e.g. its length). */
  codeQuality?: string;
}
|
|
|
|
/**
 * Outcome recorded for one test slot: either a full grade, or a bare failure
 * record carrying only the error text.
 */
type TestOutcome = TestResult | { passed: false; details: string };

/** Aggregated results of running all three tests against one model. */
interface BenchmarkResult {
  model: string;
  testA: TestOutcome;
  testB: TestOutcome;
  testC: TestOutcome;
  totalTokens: number;
  totalTimeMs: number;
  avgTokPerSec: number;
}
|
|
|
|
/** System message placed first in every test conversation. */
const SYSTEM_PROMPT = [
  "You are a coding assistant.",
  "You have access to tools for file operations and shell commands.",
  "When asked to create or modify code, use the appropriate tools.",
  "Be direct — call tools immediately without explaining what you're about to do.",
].join(" ");
|
|
|
|
/**
 * Returns the tool calls attached to the first choice's message.
 *
 * @returns The `tool_calls` array, or an empty array when the response has no
 *   choices, no message, or no tool calls.
 */
function extractToolCalls(response: ChatResponse): ToolCall[] {
  const firstMessage = response.choices?.[0]?.message;
  const calls = firstMessage?.tool_calls;
  return calls ?? [];
}
|
|
|
|
/** Reports whether the response contains at least one call to `toolName`. */
function hasToolCall(response: ChatResponse, toolName: string): boolean {
  for (const call of extractToolCalls(response)) {
    if (call.function.name === toolName) {
      return true;
    }
  }
  return false;
}
|
|
|
|
/**
 * Reads one string argument from the first call to `toolName` in the response.
 *
 * @param response - Parsed chat completion response.
 * @param toolName - Name of the tool whose call should be inspected.
 * @param argName - Key to read from the call's JSON-encoded arguments.
 * @returns The argument value when it is a string; `undefined` when there is
 *   no such call, the arguments are not valid JSON, the parsed arguments are
 *   not an object, or the value is missing or not a string.
 */
function getToolCallArg(
  response: ChatResponse,
  toolName: string,
  argName: string,
): string | undefined {
  const call = extractToolCalls(response).find(
    (candidate) => candidate.function.name === toolName,
  );
  if (!call) return undefined;

  let parsed: unknown;
  try {
    parsed = JSON.parse(call.function.arguments);
  } catch {
    // Malformed arguments are treated the same as a missing argument.
    return undefined;
  }

  if (typeof parsed !== "object" || parsed === null) return undefined;

  // Models can emit numbers, objects, or null here. Callers use string methods
  // on the result, so anything that is not a string is reported as absent
  // instead of being passed through under a false type.
  const value = (parsed as Record<string, unknown>)[argName];
  return typeof value === "string" ? value : undefined;
}
|
|
|
|
/**
 * Cheap structural sanity check for generated code. This is a heuristic, not
 * a parser: it only looks for declaration keywords and compares brace counts.
 *
 * @param code - Source text produced by the model.
 * @returns `true` when the text contains an `export`, `function`, `const`, or
 *   arrow token AND has as many `{` as `}`.
 */
function isValidTypeScript(code: string): boolean {
  const hasExport = /export\s/.test(code);
  const hasFunction = /function\s|const\s|=>\s/.test(code);
  const opened = code.match(/{/g)?.length ?? 0;
  const closed = code.match(/}/g)?.length ?? 0;
  const balanced = opened === closed;

  // Brace balance is required in every case. Previously the presence of
  // `export` short-circuited the check, so output cut off mid-function still
  // counted as valid.
  return balanced && (hasExport || hasFunction);
}
|
|
|
|
/**
 * The three benchmark scenarios, in the order they are run and reported
 * (A, B, C). Each entry carries its own grader.
 */
const TEST_CASES: TestCase[] = [
  // Test A: expects a write_file call for calculator.ts with plausible code.
  {
    name: "Test A — Simple file write",
    complexity: "low",
    messages: [
      { role: "system", content: SYSTEM_PROMPT },
      {
        role: "user",
        content:
          "Create a file called calculator.ts with add and subtract functions that take two numbers and return a number. Export both functions.",
      },
    ],
    tools: TOOLS,
    maxTokens: 2048,
    evaluate: (response: ChatResponse): TestResult => {
      const calledTools = extractToolCalls(response).map(
        (call) => call.function.name,
      );
      const wroteFile = hasToolCall(response, "write_file");
      const source = getToolCallArg(response, "write_file", "content") ?? "";
      const target = getToolCallArg(response, "write_file", "path") ?? "";
      const looksLikeTs = isValidTypeScript(source);
      const namedCalculator = target.includes("calculator");

      // The add/subtract lines are informational; they do not gate `passed`.
      const report: string[] = [];
      report.push(wroteFile ? "✅ write_file called" : "❌ No write_file tool call");
      report.push(namedCalculator ? "✅ Correct filename" : `❌ Wrong path: ${target}`);
      report.push(looksLikeTs ? "✅ Valid TypeScript structure" : "❌ Invalid TypeScript");
      report.push(source.includes("add") ? "✅ Has add function" : "❌ Missing add");
      report.push(
        source.includes("subtract")
          ? "✅ Has subtract function"
          : "❌ Missing subtract",
      );

      return {
        passed: wroteFile && looksLikeTs && namedCalculator,
        details: report.join("\n "),
        toolCalls: calledTools,
        codeQuality: source.length > 0 ? `${source.length} chars` : "empty",
      };
    },
  },

  // Test B: expects read_file to be the first tool the model calls.
  {
    name: "Test B — Multi-step sequencing",
    complexity: "medium",
    messages: [
      { role: "system", content: SYSTEM_PROMPT },
      {
        role: "user",
        content:
          "Read the file src/index.ts, then add error handling to it by wrapping the main logic in a try-catch block.",
      },
    ],
    tools: TOOLS,
    maxTokens: 2048,
    evaluate: (response: ChatResponse): TestResult => {
      const calledTools = extractToolCalls(response).map(
        (call) => call.function.name,
      );
      const [opener] = calledTools;
      const readAnywhere = calledTools.some((toolName) => toolName === "read_file");
      const readLeads = opener === "read_file";

      const report: string[] = [];
      report.push(readAnywhere ? "✅ read_file called" : "❌ No read_file call");
      report.push(
        readLeads
          ? "✅ Correct sequencing (read first)"
          : `❌ Wrong sequence: ${opener ?? "none"} called first`,
      );
      report.push(`Tool call order: ${calledTools.join(" → ") || "none"}`);

      return {
        passed: readAnywhere && readLeads,
        details: report.join("\n "),
        toolCalls: calledTools,
      };
    },
  },

  // Test C: expects a write_file call carrying more than 100 characters.
  {
    name: "Test C — Complex multi-file (tic-tac-toe)",
    complexity: "high",
    messages: [
      { role: "system", content: SYSTEM_PROMPT },
      {
        role: "user",
        content: `Create a tic-tac-toe game with minimax AI in TypeScript. Create src/game.ts with:
1. A Board type (3x3 array of 'X' | 'O' | null)
2. A checkWinner function
3. A minimax function that returns the optimal move index
4. Export all of them.`,
      },
    ],
    tools: TOOLS,
    maxTokens: 4096,
    evaluate: (response: ChatResponse): TestResult => {
      const calledTools = extractToolCalls(response).map(
        (call) => call.function.name,
      );
      const wroteFile = hasToolCall(response, "write_file");
      const source = getToolCallArg(response, "write_file", "content") ?? "";
      const mentionsMinimax = source.toLowerCase().includes("minimax");
      const mentionsCheckWinner =
        source.includes("checkWinner") || source.includes("check_winner");
      const mentionsBoard = source.includes("Board");
      const substantial = source.length > 100;

      // The minimax/checkWinner/Board lines are informational; only the
      // write_file call and the length threshold gate `passed`.
      const report: string[] = [];
      report.push(wroteFile ? "✅ write_file called" : "❌ No write_file tool call");
      report.push(
        substantial
          ? `✅ Substantial code (${source.length} chars)`
          : `❌ Insufficient code (${source.length} chars)`,
      );
      report.push(mentionsMinimax ? "✅ Has minimax" : "❌ Missing minimax");
      report.push(mentionsCheckWinner ? "✅ Has checkWinner" : "❌ Missing checkWinner");
      report.push(mentionsBoard ? "✅ Has Board type" : "❌ Missing Board type");
      report.push(`Tool calls: ${calledTools.join(", ") || "none"}`);

      return {
        passed: wroteFile && substantial,
        details: report.join("\n "),
        toolCalls: calledTools,
        codeQuality: source.length > 0 ? `${source.length} chars` : "empty",
      };
    },
  },
];
|
|
|
|
/**
 * Sends one test case to `modelId` through the chat completions endpoint and
 * grades the reply with the test's own `evaluate` function.
 *
 * @param modelId - Model identifier placed in the request body.
 * @param test - Scenario to run.
 * @returns The grade, the completion token count reported by the server
 *   (0 when absent or on an HTTP error), and the wall-clock time in
 *   milliseconds from sending the request until the body was fully read.
 * @throws When the request fails at the network level, exceeds the 10-minute
 *   deadline, or the body is not valid JSON.
 */
async function runTest(
  modelId: string,
  test: TestCase,
): Promise<{ result: TestResult; tokens: number; timeMs: number }> {
  const start = Date.now();

  const body = {
    model: modelId,
    messages: test.messages,
    tools: test.tools,
    max_tokens: test.maxTokens,
    temperature: 0,
  };

  const response = await fetch(`${MODEL_BOSS_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(body),
    // Same 10-minute allowance the pre-warm request uses. Without a deadline
    // a stalled server would hang the entire benchmark run.
    signal: AbortSignal.timeout(600_000),
  });

  if (!response.ok) {
    const error = await response.text();
    return {
      result: {
        passed: false,
        details: `❌ HTTP ${response.status}: ${error}`,
        toolCalls: [],
      },
      tokens: 0,
      timeMs: Date.now() - start,
    };
  }

  const data = (await response.json()) as ChatResponse;
  // fetch() resolves once headers arrive, so the clock is stopped only after
  // the body has been read.
  const timeMs = Date.now() - start;
  const tokens = data.usage?.completion_tokens ?? 0;

  const result = test.evaluate(data);
  if (!result.passed) {
    // Dump what the model actually returned to make failures diagnosable.
    const msg = data.choices?.[0]?.message;
    log.info(` [DEBUG] Content: ${msg?.content?.slice(0, 200) ?? "null"}`);
    log.info(` [DEBUG] Tool calls: ${JSON.stringify(msg?.tool_calls?.map((tc) => tc.function.name) ?? [])}`);
    log.info(` [DEBUG] Finish reason: ${data.choices?.[0]?.finish_reason}`);
  }

  return { result, tokens, timeMs };
}
|
|
|
|
/**
 * Sends a trivial completion request so the server loads `modelId` before the
 * timed tests start.
 *
 * @param modelId - Model identifier to load.
 * @returns `true` when the server answered with a success status; `false` on
 *   an HTTP error status, a network failure, or when the 10-minute deadline
 *   is exceeded. Never rejects, so the caller can skip the model and carry on.
 */
async function prewarmModel(modelId: string): Promise<boolean> {
  log.info(` Pre-warming ${modelId}...`);
  const start = Date.now();
  const elapsed = (): string => ((Date.now() - start) / 1000).toFixed(1);

  try {
    // Send a trivial request to trigger model loading in the pool
    const response = await fetch(`${MODEL_BOSS_URL}/v1/chat/completions`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        model: modelId,
        messages: [{ role: "user", content: "Say hello." }],
        max_tokens: 16,
        temperature: 0,
      }),
      signal: AbortSignal.timeout(600_000), // 10 min for large model loading
    });

    if (!response.ok) {
      const error = await response.text();
      log.error(` ❌ Pre-warm failed after ${elapsed()}s: ${error}`);
      return false;
    }

    // The reply itself is irrelevant, but the body is still read to the end
    // so the connection is not left holding an unconsumed response.
    await response.arrayBuffer();

    log.info(` ✅ Model ready in ${elapsed()}s`);
    return true;
  } catch (err) {
    // Connection refused, timeout, aborted body read, etc. Report it as a
    // failed pre-warm instead of letting the rejection abort the whole run.
    const reason = err instanceof Error ? err.message : String(err);
    log.error(` ❌ Pre-warm failed after ${elapsed()}s: ${reason}`);
    return false;
  }
}
|
|
|
|
/**
 * Runs every entry of `TEST_CASES` against `modelId`, one after another,
 * logging progress as it goes.
 *
 * A test that throws is recorded as a failure carrying the error message and
 * does not stop the remaining tests. Token and time totals only include tests
 * that returned normally.
 *
 * @returns Per-test outcomes (positions 0, 1, 2 become testA, testB, testC)
 *   plus total tokens, total time, and tokens per second (0 when no time was
 *   recorded).
 */
async function benchmarkModel(modelId: string): Promise<BenchmarkResult> {
  const banner = "=".repeat(60);
  log.info(`\n${banner}`);
  log.info(` BENCHMARKING: ${modelId}`);
  log.info(`${banner}`);

  const outcomes: (TestResult | { passed: false; details: string })[] = [];
  let totalTokens = 0;
  let totalTimeMs = 0;

  for (const testCase of TEST_CASES) {
    log.info(`\n ${testCase.name} (${testCase.complexity} complexity):`);

    try {
      const run = await runTest(modelId, testCase);
      totalTokens += run.tokens;
      totalTimeMs += run.timeMs;
      outcomes.push(run.result);

      const seconds = (run.timeMs / 1000).toFixed(1);
      const status = run.result.passed ? "PASS ✅" : "FAIL ❌";
      log.info(` ${status} (${run.tokens} tokens, ${seconds}s)`);
      log.info(` ${run.result.details}`);

      const calledTools = run.result.toolCalls;
      if (calledTools && calledTools.length > 0) {
        log.info(` Tools: ${calledTools.join(" → ")}`);
      }
    } catch (err) {
      const msg = err instanceof Error ? err.message : String(err);
      outcomes.push({ passed: false, details: `❌ Error: ${msg}` });
      log.info(` FAIL ❌ — ${msg}`);
    }
  }

  let avgTokPerSec = 0;
  if (totalTimeMs > 0) {
    avgTokPerSec = totalTokens / (totalTimeMs / 1000);
  }

  const [testA, testB, testC] = outcomes;
  return {
    model: modelId,
    testA,
    testB,
    testC,
    totalTokens,
    totalTimeMs,
    avgTokPerSec,
  };
}
|
|
|
|
/**
 * Prints a results table (one row per model) followed by a one-line verdict
 * for each model.
 *
 * The header is padded to the same column widths as the rows so the table
 * stays aligned, and the model column widens when a model id is longer than
 * the default 28 characters.
 *
 * @param results - One entry per benchmarked model, in display order.
 */
function printSummary(results: BenchmarkResult[]): void {
  const modelWidth = Math.max(28, ...results.map((r) => r.model.length));

  // Pads a pass/fail mark to the width of its "Test X" heading.
  // NOTE(review): assumes the emoji renders two terminal columns wide — confirm.
  const mark = (passed: boolean): string => `${passed ? "✅" : "❌"}    `;

  // Indexed by the number of passed tests (0 to 3).
  const verdicts = [
    "Not viable for agentic use",
    "Simple tasks only",
    "Viable for simple/medium tasks",
    "Primary candidate — all tests passed",
  ];

  log.info(`\n${"=".repeat(70)}`);
  log.info(" BENCHMARK SUMMARY");
  log.info(`${"=".repeat(70)}`);

  // "Tokens" (6) and "Speed (tok/s)" (13) are exactly as wide as the padStart
  // widths used for their row cells below.
  const header = `| ${"Model".padEnd(modelWidth)} | Test A | Test B | Test C | Tokens | Speed (tok/s) |`;
  const sep = "|" + "-".repeat(header.length - 2) + "|";

  log.info(header);
  log.info(sep);

  for (const r of results) {
    log.info(
      `| ${r.model.padEnd(modelWidth)} | ${mark(r.testA.passed)} | ${mark(r.testB.passed)} | ${mark(r.testC.passed)} | ${String(r.totalTokens).padStart(6)} | ${r.avgTokPerSec.toFixed(1).padStart(13)} |`,
    );
  }

  log.info(sep);
  log.info("");

  for (const r of results) {
    const passed = [r.testA, r.testB, r.testC].filter((t) => t.passed).length;
    const verdict = verdicts[passed] ?? "Not viable for agentic use";
    log.info(` ${r.model}: ${passed}/3 — ${verdict}`);
  }
}
|
|
|
|
/**
 * CLI entry point. Takes model ids from the command line and, for each one:
 * checks that the registry knows it, pre-warms it, and runs the benchmark.
 * Afterwards prints the summary and saves the results as JSON in the
 * operating system's temp directory.
 *
 * A model whose registry check or pre-warm fails (including by throwing) is
 * skipped, so results already collected for other models are still reported.
 * Exits with status 1 when no model ids are given.
 */
async function main(): Promise<void> {
  const models = process.argv.slice(2);
  if (models.length === 0) {
    log.error(
      "Usage: npx tsx benchmark-models.ts <model1> [model2] [model3] ...",
    );
    log.error(
      "Example: npx tsx benchmark-models.ts qwen3-8b hermes-4-70b nemotron-h-47b-iq3",
    );
    process.exit(1);
  }

  log.info(`Model Boss: ${MODEL_BOSS_URL}`);
  log.info(`Models to test: ${models.join(", ")}`);
  log.info(`Tests per model: ${TEST_CASES.length}`);

  const results: BenchmarkResult[] = [];

  for (const modelId of models) {
    try {
      const check = await fetch(`${MODEL_BOSS_URL}/api/v1/models/${modelId}`);
      // Only the status is needed; discard the body instead of leaving it unread.
      await check.body?.cancel();
      if (!check.ok) {
        log.error(`\n❌ Model "${modelId}" not found in registry. Skipping.`);
        continue;
      }

      const warmed = await prewarmModel(modelId);
      if (!warmed) {
        log.error(` Skipping ${modelId} — failed to load.`);
        continue;
      }
    } catch (err) {
      // A network failure for one model must not discard results that were
      // already gathered for earlier models.
      const reason = err instanceof Error ? err.message : String(err);
      log.error(`\n❌ Could not prepare "${modelId}": ${reason}. Skipping.`);
      continue;
    }

    const result = await benchmarkModel(modelId);
    results.push(result);
  }

  if (results.length > 0) {
    printSummary(results);

    const [{ writeFileSync }, { tmpdir }, { join }] = await Promise.all([
      import("node:fs"),
      import("node:os"),
      import("node:path"),
    ]);

    // Colons are stripped from the timestamp to keep the file name portable.
    const stamp = new Date().toISOString().slice(0, 19).replace(/:/g, "");
    // tmpdir() instead of a literal "/tmp": that directory does not exist on
    // every platform.
    const outPath = join(tmpdir(), `kthulu-benchmark-${stamp}.json`);
    writeFileSync(outPath, JSON.stringify(results, null, 2));
    log.info(`\nResults saved to: ${outPath}`);
  }
}
|
|
|
|
// Start the script. Any rejection escaping main() is reported on stderr and
// the process exits with status 1.
main().then(undefined, (failure) => {
  log.error("Benchmark failed:", failure);
  process.exit(1);
});
|