# model-boss/tools/benchmark/cli.py
# 2026-05-11 00:20:11 -07:00
#
# 199 lines
# 6 KiB
# Python

"""CLI entry point for model-boss benchmarks.
Usage:
python3 -m tools.benchmark.cli vlm --models qwen25-vl-7b-instruct --samples 50
python3 -m tools.benchmark.cli reasoning --models ministral-14b-reasoning,qwen3.6-27b --samples 100
python3 -m tools.benchmark.cli code --models ministral-14b-reasoning,qwen3.6-27b --samples 50
python3 -m tools.benchmark.cli describe --models qwen3-vl-8b-instruct,gemma-4-31b --samples 15
"""
from __future__ import annotations
import argparse
import asyncio
import logging
import sys
from collections.abc import Awaitable, Callable
from typing import Any
# Root logging setup: INFO and above go to stderr as "LEVEL logger-name: message".
logging.basicConfig(
    stream=sys.stderr,
    format="%(levelname)s %(name)s: %(message)s",
    level=logging.INFO,
)

# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)
def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser with one subcommand per benchmark suite.

    Each subcommand (``vlm``, ``reasoning``, ``code``, ``describe``) takes a
    required ``--models`` string, an integer ``--samples`` whose default
    depends on the suite, and a ``store_true`` flag that opts out of the
    documentation update for that suite. ``describe`` additionally accepts
    ``--judge-model``. The chosen subcommand name is stored in ``args.suite``.
    """
    # (subcommand, help text, default --samples, opt-out flag, opt-out help)
    suite_specs = (
        (
            "vlm",
            "VLM boolean attribute benchmark (CelebA)",
            50,
            "--no-vision-update",
            "Skip updating docs/model-encyclopedia/vision.md",
        ),
        (
            "reasoning",
            "LLM reasoning benchmark (GSM8K + MMLU-Pro)",
            100,
            "--no-llms-update",
            "Skip updating docs/model-encyclopedia/llms.md",
        ),
        (
            "code",
            "LLM code benchmark (HumanEval + MBPP)",
            50,
            "--no-llms-update",
            "Skip updating docs/model-encyclopedia/llms.md",
        ),
        (
            "describe",
            "VLM describe benchmark (LLM-as-judge)",
            15,
            "--no-vision-update",
            "Skip updating docs/model-encyclopedia/vision.md",
        ),
    )

    root = argparse.ArgumentParser(
        prog="python3 -m tools.benchmark.cli",
        description="Model-boss inference benchmarks",
    )
    commands = root.add_subparsers(dest="suite", required=True)

    for name, summary, sample_default, skip_flag, skip_help in suite_specs:
        command = commands.add_parser(name, help=summary)

        # Options shared by every suite.
        command.add_argument(
            "--models",
            required=True,
            metavar="MODEL_ID[,MODEL_ID...]",
            help="Comma-separated list of model IDs registered in model-boss",
        )
        command.add_argument(
            "--samples",
            type=int,
            default=sample_default,
            help=f"Number of samples (default: {sample_default})",
        )

        # Only the describe suite needs a judge; it is registered before the
        # opt-out flag so the --help ordering stays the same.
        if name == "describe":
            command.add_argument(
                "--judge-model",
                default="ministral-14b-reasoning",
                help="Model used as the judge (default: ministral-14b-reasoning)",
            )

        command.add_argument(skip_flag, action="store_true", help=skip_help)

    return root
# Seconds to pause between consecutive models inside _run_suite.
INTER_MODEL_SLEEP_S = 30.0


async def _run_suite(
    label: str,
    run_fn: Callable[[str], Awaitable[Any]],
    model_ids: list[str],
    *,
    update_target: str,
    update_enabled: bool,
) -> int:
    """Run one benchmark suite over each model in order and report results.

    Args:
        label: Suite name shown in the progress banner.
        run_fn: Called with a model ID; the awaited result is handed to
            ``report``.
        model_ids: Model IDs to benchmark, processed sequentially.
        update_target: ``"vision"`` or ``"llms"``; forwarded to ``report`` as
            ``target`` and selects which of ``update_vision`` /
            ``update_llms`` may be set.
        update_enabled: When false, both update flags passed to ``report``
            are false.

    Returns:
        ``0`` if ``run_fn`` succeeded for every model, ``1`` if it raised for
        at least one. A failing model does not stop the remaining ones.
    """
    from .reporter import report

    exit_code = 0
    for i, model_id in enumerate(model_ids):
        if i > 0:
            # Give llama-server a window to fully release the prior model's
            # port before stager spawns the next one (downloader reported a
            # port-binding race during rapid evict-and-respawn).
            print(f"\n[pause {INTER_MODEL_SLEEP_S:.0f}s for pool to settle]", flush=True)
            await asyncio.sleep(INTER_MODEL_SLEEP_S)

        print(f"\n{'='*60}", flush=True)
        print(f"Running {label} benchmark: {model_id}", flush=True)
        print(f"{'='*60}\n", flush=True)

        try:
            result = await run_fn(model_id)
        except Exception as exc:
            # logger.exception keeps the same message but also records the
            # traceback, so a failed run can be diagnosed from the log.
            logger.exception("Benchmark failed for %s: %s", model_id, exc)
            exit_code = 1
            continue

        json_path = report(
            result,
            update_vision=(update_target == "vision" and update_enabled),
            update_llms=(update_target == "llms" and update_enabled),
            target=update_target,  # type: ignore[arg-type]
        )
        print(f"\nResults written to: {json_path}", flush=True)

    return exit_code
def _parse_models(arg: str, parser: argparse.ArgumentParser) -> list[str]:
    """Split a comma-separated ``--models`` value into a list of model IDs.

    Surrounding whitespace is stripped from each entry and empty entries are
    dropped. If no ID remains, ``parser.error`` is called with a message
    saying at least one model ID is required.
    """
    trimmed = (piece.strip() for piece in arg.split(","))
    model_ids = [piece for piece in trimmed if piece]
    if model_ids:
        return model_ids
    parser.error("--models requires at least one model ID")
    return model_ids
def main() -> None:
    """Parse the command line, run the selected suite, and exit with its status.

    The suite module is imported only inside the branch that needs it, so
    running one suite does not import the others. The process exits with the
    integer returned by ``_run_suite``.
    """
    parser = _build_parser()
    args = parser.parse_args()
    model_ids = _parse_models(args.models, parser)

    # Each branch yields (label, run_fn, update_target, update_enabled).
    suite = args.suite
    if suite == "vlm":
        from .suites import vlm_celeba

        plan = (
            "VLM CelebA",
            lambda m: vlm_celeba.run(m, args.samples),
            "vision",
            not args.no_vision_update,
        )
    elif suite == "reasoning":
        from .suites import llm_reasoning

        plan = (
            "LLM Reasoning",
            lambda m: llm_reasoning.run(m, args.samples),
            "llms",
            not args.no_llms_update,
        )
    elif suite == "code":
        from .suites import llm_code

        plan = (
            "LLM Code",
            lambda m: llm_code.run(m, args.samples),
            "llms",
            not args.no_llms_update,
        )
    elif suite == "describe":
        from .suites import vlm_describe

        plan = (
            "VLM Describe",
            lambda m: vlm_describe.run(m, args.samples, args.judge_model),
            "vision",
            not args.no_vision_update,
        )
    else:
        parser.error(f"Unknown suite: {args.suite}")
        return

    label, run_fn, update_target, update_enabled = plan
    exit_code = asyncio.run(
        _run_suite(
            label,
            run_fn,
            model_ids,
            update_target=update_target,
            update_enabled=update_enabled,
        )
    )
    sys.exit(exit_code)


if __name__ == "__main__":
    main()