2026-03-13 06:05:59 -07:00
#!/usr/bin/env npx tsx
/**
 * Model Quality Benchmark for @kthulu Phase 0A
 *
 * Tests each model with 3 standardized prompts via model-boss /v1/chat/completions.
 * Measures: tool call correctness, sequencing, code quality, speed.
 */
/** Base URL of the Model Boss server that every HTTP request in this script targets. */
const MODEL_BOSS_URL = "http://localhost:8210";

/** Renders log arguments as one line: space-joined, newline-terminated. */
function formatLogLine(parts: unknown[]): string {
  return parts.join(" ") + "\n";
}

// CLI script — stdout is the interface
/**
 * Minimal logger writing straight to the process streams.
 * `info` goes to stdout, `error` to stderr; both return whatever
 * `stream.write` returns.
 */
const log = {
  info(...args: unknown[]): boolean {
    return process.stdout.write(formatLogLine(args));
  },
  error(...args: unknown[]): boolean {
    return process.stderr.write(formatLogLine(args));
  },
};
/** One tool invocation attached to an assistant message. */
interface ToolCall {
  /** Identifier of this call as given in the response. */
  id: string;
  /** Always the literal "function". */
  type: "function";
  /**
   * The function being invoked. `arguments` is a string; elsewhere in this
   * file it is parsed with `JSON.parse`.
   */
  function: { name: string; arguments: string };
}
/** A single message as it appears inside a chat-completions response choice. */
interface ChatMessage {
  role: string;
  /** Text of the message, or null when there is none. */
  content: string | null;
  /** Tool invocations requested by the message; absent when there are none. */
  tool_calls?: Array<ToolCall>;
}
/**
 * Shape this script expects for the JSON body returned by the
 * chat-completions endpoint.
 */
interface ChatResponse {
  id: string;
  model: string;
  /** Candidate replies; only the first entry is inspected by this script. */
  choices: { message: ChatMessage; finish_reason: string }[];
  /** Token accounting reported alongside the reply. */
  usage: {
    prompt_tokens: number;
    completion_tokens: number;
    total_tokens: number;
  };
}
/** Schema entry for one string-typed tool parameter. */
function stringParam(description: string): { type: string; description: string } {
  return { type: "string", description };
}

/**
 * Assembles one function-tool definition.
 *
 * @param name - Tool name the model is expected to call.
 * @param description - Human-readable description sent with the tool.
 * @param properties - Parameter schemas keyed by parameter name.
 * @param required - Names of the parameters listed as required.
 */
function defineTool(
  name: string,
  description: string,
  properties: Record<string, { type: string; description: string }>,
  required: string[],
) {
  return {
    type: "function" as const,
    function: {
      name,
      description,
      parameters: { type: "object", properties, required },
    },
  };
}

/**
 * Tool definitions offered to the model in every test request:
 * write_file, read_file, edit_file and bash, in that order.
 */
const TOOLS = [
  defineTool(
    "write_file",
    "Write content to a file, creating it if it doesn't exist.",
    {
      path: stringParam("File path to write to"),
      content: stringParam("Content to write"),
    },
    ["path", "content"],
  ),
  defineTool(
    "read_file",
    "Read the contents of a file.",
    { path: stringParam("File path to read") },
    ["path"],
  ),
  defineTool(
    "edit_file",
    "Edit a file by replacing an old string with a new string.",
    {
      path: stringParam("File path to edit"),
      old_string: stringParam("Text to find and replace"),
      new_string: stringParam("Replacement text"),
    },
    ["path", "old_string", "new_string"],
  ),
  defineTool(
    "bash",
    "Execute a bash command and return stdout/stderr.",
    { command: stringParam("Command to execute") },
    ["command"],
  ),
];
/** One benchmark scenario: the request to send and how to score the reply. */
interface TestCase {
  /** Label printed when the test runs. */
  name: string;
  complexity: "low" | "medium" | "high";
  /** Conversation sent as the request's `messages`. */
  messages: { role: string; content: string }[];
  /** Tool definitions sent as the request's `tools`. */
  tools: typeof TOOLS;
  /** Sent as the request's `max_tokens`. */
  maxTokens: number;
  /** Scores the parsed response for this scenario. */
  evaluate: (response: ChatResponse) => TestResult;
}
/** Outcome of evaluating one model response against one test case. */
interface TestResult {
  /** Whether the response met the test's pass criteria. */
  passed: boolean;
  /** Human-readable report of the individual checks. */
  details: string;
  /** Names of the tools the model called, in response order. */
  toolCalls: string[];
  /** Optional short note about the generated code (e.g. its size). */
  codeQuality?: string;
}
/**
 * What gets recorded for a single test slot: either the evaluator's full
 * result, or a bare failure record when no evaluation result is available.
 */
type RecordedOutcome = TestResult | { passed: false; details: string };

/** Aggregate results for one model across the three tests. */
interface BenchmarkResult {
  model: string;
  testA: RecordedOutcome;
  testB: RecordedOutcome;
  testC: RecordedOutcome;
  /** Sum of the token counts reported by the individual test runs. */
  totalTokens: number;
  /** Sum of the per-test elapsed times, in milliseconds. */
  totalTimeMs: number;
  /** Tokens per second over the whole run. */
  avgTokPerSec: number;
}
/**
 * System message placed first in every test conversation.
 *
 * NOTE(review): the leading and trailing space are reproduced from the
 * existing value — confirm whether they are intentional.
 */
const SYSTEM_PROMPT = [
  " You are a coding assistant.",
  "You have access to tools for file operations and shell commands.",
  "When asked to create or modify code, use the appropriate tools.",
  "Be direct — call tools immediately without explaining what you're about to do. ",
].join(" ");
/**
 * Returns the tool calls of the first choice in `response`.
 *
 * @returns The `tool_calls` array of `choices[0].message`, or an empty array
 *   when any link in that chain is missing.
 */
function extractToolCalls(response: ChatResponse): ToolCall[] {
  const firstChoice = response.choices?.[0];
  const requestedCalls = firstChoice?.message?.tool_calls;
  return requestedCalls ?? [];
}
/**
 * Reports whether the response's first choice calls the tool named `toolName`.
 *
 * @param response - Parsed chat-completions response.
 * @param toolName - Exact function name to look for.
 */
function hasToolCall(response: ChatResponse, toolName: string): boolean {
  for (const call of extractToolCalls(response)) {
    if (call.function.name === toolName) {
      return true;
    }
  }
  return false;
}
/**
 * Reads one string argument from the first call to `toolName` in the response.
 *
 * @param response - Parsed chat-completions response.
 * @param toolName - Function name of the tool call to inspect; only the first
 *   matching call is considered.
 * @param argName - Key to read from the call's JSON-encoded arguments.
 * @returns The argument value when it is a string; `undefined` when the tool
 *   was not called, the arguments are not valid JSON, the parsed arguments
 *   are not an object, or the value is missing or not a string.
 */
function getToolCallArg(
  response: ChatResponse,
  toolName: string,
  argName: string,
): string | undefined {
  const call = extractToolCalls(response).find(
    (candidate) => candidate.function.name === toolName,
  );
  if (!call) return undefined;

  let parsed: unknown;
  try {
    parsed = JSON.parse(call.function.arguments);
  } catch {
    return undefined;
  }

  // The arguments come from model output, so their shape is not guaranteed:
  // only hand back values that really are strings, as the return type promises.
  if (typeof parsed !== "object" || parsed === null) return undefined;
  const value = (parsed as Record<string, unknown>)[argName];
  return typeof value === "string" ? value : undefined;
}
/**
 * Cheap structural heuristic for "looks like TypeScript" — not a parser.
 *
 * Accepts the text when it contains `export` followed by whitespace, or when
 * it contains `function`/`const`/`=>` followed by whitespace and has equal
 * numbers of `{` and `}` characters.
 */
function isValidTypeScript(code: string): boolean {
  if (/export\s/.test(code)) {
    return true;
  }
  if (!/function\s|const\s|=>\s/.test(code)) {
    return false;
  }
  const openCount = (code.match(/{/g) || []).length;
  const closeCount = (code.match(/}/g) || []).length;
  return openCount === closeCount;
}
/**
 * The standardized prompts every model is run against. Order matters: the
 * results are reported positionally as Test A, Test B and Test C.
 *
 * Each `evaluate` inspects only the first choice of the response, and argument
 * values are taken from the first call to the relevant tool.
 */
const TEST_CASES: TestCase[] = [
  {
    name: "Test A — Simple file write",
    complexity: "low",
    messages: [
      { role: "system", content: SYSTEM_PROMPT },
      {
        role: "user",
        content:
          "Create a file called calculator.ts with add and subtract functions that take two numbers and return a number. Export both functions.",
      },
    ],
    tools: TOOLS,
    maxTokens: 2048,
    /**
     * Passes when write_file is called, its path contains "calculator", and
     * its content passes the TypeScript heuristic. The add/subtract checks
     * are reported in the details but do not affect the verdict.
     */
    evaluate(response: ChatResponse): TestResult {
      const toolCalls = extractToolCalls(response).map(
        (tc) => tc.function.name,
      );
      const hasWrite = hasToolCall(response, "write_file");
      const content = getToolCallArg(response, "write_file", "content") ?? "";
      const path = getToolCallArg(response, "write_file", "path") ?? "";
      const validTS = isValidTypeScript(content);
      const hasCalc = path.includes("calculator");
      return {
        passed: hasWrite && validTS && hasCalc,
        details: [
          hasWrite ? "✅ write_file called" : "❌ No write_file tool call",
          hasCalc ? "✅ Correct filename" : `❌ Wrong path: ${path}`,
          validTS ? "✅ Valid TypeScript structure" : "❌ Invalid TypeScript",
          content.includes("add") ? "✅ Has add function" : "❌ Missing add",
          content.includes("subtract")
            ? "✅ Has subtract function"
            : "❌ Missing subtract",
        ].join("\n "),
        toolCalls,
        codeQuality: content.length > 0 ? `${content.length} chars` : "empty",
      };
    },
  },
  {
    name: "Test B — Multi-step sequencing",
    complexity: "medium",
    messages: [
      { role: "system", content: SYSTEM_PROMPT },
      {
        role: "user",
        content:
          "Read the file src/index.ts, then add error handling to it by wrapping the main logic in a try-catch block.",
      },
    ],
    tools: TOOLS,
    maxTokens: 2048,
    /** Passes when the first tool call in the response is read_file. */
    evaluate(response: ChatResponse): TestResult {
      const toolCalls = extractToolCalls(response).map(
        (tc) => tc.function.name,
      );
      const firstCall = toolCalls[0];
      const hasRead = toolCalls.includes("read_file");
      const readsFirst = firstCall === "read_file";
      return {
        passed: hasRead && readsFirst,
        details: [
          hasRead ? "✅ read_file called" : "❌ No read_file call",
          readsFirst
            ? "✅ Correct sequencing (read first)"
            : `❌ Wrong sequence: ${firstCall ?? "none"} called first`,
          `Tool call order: ${toolCalls.join(" → ") || "none"}`,
        ].join("\n "),
        toolCalls,
      };
    },
  },
  {
    name: "Test C — Complex multi-file (tic-tac-toe)",
    complexity: "high",
    messages: [
      { role: "system", content: SYSTEM_PROMPT },
      {
        role: "user",
        content: `Create a tic-tac-toe game with minimax AI in TypeScript. Create src/game.ts with:
1. A Board type (3x3 array of 'X' | 'O' | null)
2. A checkWinner function
3. A minimax function that returns the optimal move index
4. Export all of them.`,
      },
    ],
    tools: TOOLS,
    maxTokens: 4096,
    /**
     * Passes when write_file is called with more than 100 characters of
     * content. The minimax / checkWinner / Board checks are reported in the
     * details but do not affect the verdict.
     */
    evaluate(response: ChatResponse): TestResult {
      const toolCalls = extractToolCalls(response).map(
        (tc) => tc.function.name,
      );
      const hasWrite = hasToolCall(response, "write_file");
      const content = getToolCallArg(response, "write_file", "content") ?? "";
      const hasMinimax = content.toLowerCase().includes("minimax");
      const hasCheckWinner =
        content.includes("checkWinner") || content.includes("check_winner");
      const hasBoard = content.includes("Board");
      return {
        passed: hasWrite && content.length > 100,
        details: [
          hasWrite ? "✅ write_file called" : "❌ No write_file tool call",
          content.length > 100
            ? `✅ Substantial code (${content.length} chars)`
            : `❌ Insufficient code (${content.length} chars)`,
          hasMinimax ? "✅ Has minimax" : "❌ Missing minimax",
          hasCheckWinner ? "✅ Has checkWinner" : "❌ Missing checkWinner",
          hasBoard ? "✅ Has Board type" : "❌ Missing Board type",
          `Tool calls: ${toolCalls.join(", ") || "none"}`,
        ].join("\n "),
        toolCalls,
        codeQuality: content.length > 0 ? `${content.length} chars` : "empty",
      };
    },
  },
];
/**
 * Sends one test case to the chat-completions endpoint and scores the reply.
 *
 * @param modelId - Value sent as the request's `model` field.
 * @param test - Scenario providing the messages, tools, token limit and evaluator.
 * @returns The evaluation result, the reported `usage.completion_tokens`
 *   (0 when absent) and the elapsed time in milliseconds. A non-OK HTTP
 *   status is returned as a failed result with 0 tokens rather than thrown.
 *   Rejections from `fetch` or from parsing the body are not caught here.
 */
async function runTest(
  modelId: string,
  test: TestCase,
): Promise<{ result: TestResult; tokens: number; timeMs: number }> {
  const start = Date.now();
  const body = {
    model: modelId,
    messages: test.messages,
    tools: test.tools,
    max_tokens: test.maxTokens,
    temperature: 0,
  };
  const response = await fetch(`${MODEL_BOSS_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(body),
  });
  // Captured as soon as fetch() resolves, i.e. before the body is read below.
  const timeMs = Date.now() - start;

  if (!response.ok) {
    const error = await response.text();
    return {
      result: {
        passed: false,
        details: `❌ HTTP ${response.status}: ${error}`,
        toolCalls: [],
      },
      tokens: 0,
      timeMs,
    };
  }

  const data: ChatResponse = await response.json();
  const tokens = data.usage?.completion_tokens ?? 0;
  const result = test.evaluate(data);

  // On failure, dump what the model actually returned to help diagnose why.
  if (!result.passed) {
    const msg = data.choices?.[0]?.message;
    log.info(` [DEBUG] Content: ${msg?.content?.slice(0, 200) ?? "null"}`);
    log.info(
      ` [DEBUG] Tool calls: ${JSON.stringify(msg?.tool_calls?.map((tc) => tc.function.name) ?? [])}`,
    );
    log.info(` [DEBUG] Finish reason: ${data.choices?.[0]?.finish_reason}`);
  }
  return { result, tokens, timeMs };
}
2026-03-13 06:11:57 -07:00
/**
 * Sends a trivial chat request so the model is loaded before it is timed.
 *
 * @param modelId - Value sent as the request's `model` field.
 * @returns `true` when the request succeeds; `false` when the server answers
 *   with a non-OK status or the request itself fails (network error, or the
 *   10-minute timeout elapsing). Failures are logged to stderr, never thrown,
 *   so the caller can skip this model and carry on with the next one.
 */
async function prewarmModel(modelId: string): Promise<boolean> {
  log.info(`Pre-warming ${modelId}...`);
  const start = Date.now();
  const elapsedSeconds = (): string =>
    ((Date.now() - start) / 1000).toFixed(1);

  try {
    // Send a trivial request to trigger model loading in the pool
    const response = await fetch(`${MODEL_BOSS_URL}/v1/chat/completions`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        model: modelId,
        messages: [{ role: "user", content: "Say hello." }],
        max_tokens: 16,
        temperature: 0,
      }),
      signal: AbortSignal.timeout(600_000), // 10 min for large model loading
    });
    const elapsed = elapsedSeconds();
    if (!response.ok) {
      const error = await response.text();
      log.error(`❌ Pre-warm failed after ${elapsed}s: ${error}`);
      return false;
    }
    log.info(`✅ Model ready in ${elapsed}s`);
    return true;
  } catch (err) {
    // fetch rejects on timeout/abort and on network failure; report it the
    // same way as an HTTP error instead of aborting the whole benchmark run.
    const reason = err instanceof Error ? err.message : String(err);
    log.error(`❌ Pre-warm failed after ${elapsedSeconds()}s: ${reason}`);
    return false;
  }
}
2026-03-13 06:05:59 -07:00
/**
 * Runs every entry of TEST_CASES against one model, logging progress as it goes.
 *
 * @param modelId - Model to benchmark.
 * @returns Per-test outcomes (taken positionally: first test → `testA`,
 *   second → `testB`, third → `testC`), the summed token count and elapsed
 *   time, and the resulting tokens-per-second (0 when no time was recorded).
 *   An exception from a test run is recorded as a failed outcome for that
 *   slot and contributes no tokens or time; it does not stop the remaining tests.
 */
async function benchmarkModel(modelId: string): Promise<BenchmarkResult> {
  const banner = "=".repeat(60);
  log.info(`\n${banner}`);
  log.info(` BENCHMARKING: ${modelId}`);
  log.info(banner);

  let totalTokens = 0;
  let totalTimeMs = 0;
  const results: (TestResult | { passed: false; details: string })[] = [];

  for (const test of TEST_CASES) {
    log.info(`\n${test.name} (${test.complexity} complexity):`);
    try {
      const { result, tokens, timeMs } = await runTest(modelId, test);
      totalTokens += tokens;
      totalTimeMs += timeMs;
      results.push(result);

      const status = result.passed ? "PASS ✅" : "FAIL ❌";
      log.info(
        ` ${status} (${tokens} tokens, ${(timeMs / 1000).toFixed(1)}s)`,
      );
      // Leading space matches the "\n " separator used inside `details`.
      log.info(` ${result.details}`);
      if (result.toolCalls?.length) {
        log.info(` Tools: ${result.toolCalls.join(" → ")}`);
      }
    } catch (err) {
      const msg = err instanceof Error ? err.message : String(err);
      results.push({ passed: false, details: `❌ Error: ${msg}` });
      log.info(` FAIL ❌ — ${msg}`);
    }
  }

  const avgTokPerSec =
    totalTimeMs > 0 ? totalTokens / (totalTimeMs / 1000) : 0;
  return {
    model: modelId,
    testA: results[0],
    testB: results[1],
    testC: results[2],
    totalTokens,
    totalTimeMs,
    avgTokPerSec,
  };
}
/**
 * Prints a comparison table for all benchmarked models, followed by a
 * one-line verdict per model based on how many of the three tests passed.
 *
 * @param results - One entry per benchmarked model, printed in the given order.
 */
function printSummary(results: BenchmarkResult[]): void {
  // Header and rows are built from the same widths so the columns line up.
  const MODEL_WIDTH = 28;
  const TOKENS_WIDTH = 6;
  const SPEED_WIDTH = 13;
  const formatRow = (cells: string[]): string => `| ${cells.join(" | ")} |`;
  // Pads the mark to the width of the "Test X" headings, assuming the emoji
  // renders two columns wide (typical for terminals — adjust if yours differs).
  const markCell = (passed: boolean): string => `  ${passed ? "✅" : "❌"}  `;

  const banner = "=".repeat(70);
  log.info(`\n${banner}`);
  log.info(" BENCHMARK SUMMARY");
  log.info(banner);

  const header = formatRow([
    "Model".padEnd(MODEL_WIDTH),
    "Test A",
    "Test B",
    "Test C",
    "Tokens".padStart(TOKENS_WIDTH),
    "Speed (tok/s)".padStart(SPEED_WIDTH),
  ]);
  const sep = "|" + "-".repeat(header.length - 2) + "|";
  log.info(header);
  log.info(sep);
  for (const r of results) {
    log.info(
      formatRow([
        r.model.padEnd(MODEL_WIDTH),
        markCell(r.testA.passed),
        markCell(r.testB.passed),
        markCell(r.testC.passed),
        String(r.totalTokens).padStart(TOKENS_WIDTH),
        r.avgTokPerSec.toFixed(1).padStart(SPEED_WIDTH),
      ]),
    );
  }
  log.info(sep);
  log.info("");

  for (const r of results) {
    const passed = [r.testA, r.testB, r.testC].filter((t) => t.passed).length;
    let verdict: string;
    if (passed === 3) verdict = "Primary candidate — all tests passed";
    else if (passed === 2) verdict = "Viable for simple/medium tasks";
    else if (passed === 1) verdict = "Simple tasks only";
    else verdict = "Not viable for agentic use";
    log.info(`${r.model}: ${passed}/3 — ${verdict}`);
  }
}
/**
 * CLI entry point. Takes model IDs from the command line and, for each one:
 * checks it exists in the registry, pre-warms it, then benchmarks it.
 * Models that fail the registry check or the pre-warm are skipped.
 *
 * When at least one model was benchmarked, prints the summary and writes the
 * raw results as JSON to a timestamped file under /tmp.
 * Exits with status 1 when no model IDs are given.
 */
async function main(): Promise<void> {
  const models = process.argv.slice(2);
  if (models.length === 0) {
    log.error(
      "Usage: npx tsx benchmark-models.ts <model1> [model2] [model3] ...",
    );
    log.error(
      "Example: npx tsx benchmark-models.ts qwen3-8b hermes-4-70b nemotron-h-47b-iq3",
    );
    process.exit(1);
  }

  log.info(`Model Boss: ${MODEL_BOSS_URL}`);
  log.info(`Models to test: ${models.join(", ")}`);
  log.info(`Tests per model: ${TEST_CASES.length}`);

  const results: BenchmarkResult[] = [];
  for (const modelId of models) {
    const check = await fetch(`${MODEL_BOSS_URL}/api/v1/models/${modelId}`);
    if (!check.ok) {
      log.error(`\n❌ Model "${modelId}" not found in registry. Skipping.`);
      continue;
    }

    const warmed = await prewarmModel(modelId);
    if (!warmed) {
      log.error(`Skipping ${modelId} — failed to load.`);
      continue;
    }

    const result = await benchmarkModel(modelId);
    results.push(result);
  }

  if (results.length > 0) {
    printSummary(results);

    // Timestamp is the ISO date-time down to seconds with the colons removed,
    // e.g. 2026-03-13T130559.
    const stamp = new Date().toISOString().slice(0, 19).replace(/:/g, "");
    const outPath = `/tmp/kthulu-benchmark-${stamp}.json`;
    const { writeFileSync } = await import("node:fs");
    writeFileSync(outPath, JSON.stringify(results, null, 2));
    log.info(`\nResults saved to: ${outPath}`);
  }
}
// Start the benchmark; any rejection escaping main() is reported on stderr
// and turned into a non-zero exit status.
function reportFatal(failure: unknown): void {
  log.error("Benchmark failed:", failure);
  process.exit(1);
}

main().catch(reportFatal);