199 lines
6 KiB
Python
199 lines
6 KiB
Python
"""CLI entry point for model-boss benchmarks.
|
|
|
|
Usage:
|
|
python3 -m tools.benchmark.cli vlm --models qwen25-vl-7b-instruct --samples 50
|
|
python3 -m tools.benchmark.cli reasoning --models ministral-14b-reasoning,qwen3.6-27b --samples 100
|
|
python3 -m tools.benchmark.cli code --models ministral-14b-reasoning,qwen3.6-27b --samples 50
|
|
python3 -m tools.benchmark.cli describe --models qwen3-vl-8b-instruct,gemma-4-31b --samples 15
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import asyncio
|
|
import logging
|
|
import sys
|
|
from collections.abc import Awaitable, Callable
|
|
from typing import Any
|
|
|
|
# Configure the root logger once at import time: records at INFO and above
# are written to stderr as "<LEVEL> <logger name>: <message>".
logging.basicConfig(
    stream=sys.stderr,
    format="%(levelname)s %(name)s: %(message)s",
    level=logging.INFO,
)

# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _build_parser() -> argparse.ArgumentParser:
|
|
parser = argparse.ArgumentParser(
|
|
prog="python3 -m tools.benchmark.cli",
|
|
description="Model-boss inference benchmarks",
|
|
)
|
|
sub = parser.add_subparsers(dest="suite", required=True)
|
|
|
|
def _add_common(p: argparse.ArgumentParser, default_samples: int) -> None:
|
|
p.add_argument(
|
|
"--models",
|
|
required=True,
|
|
help="Comma-separated list of model IDs registered in model-boss",
|
|
metavar="MODEL_ID[,MODEL_ID...]",
|
|
)
|
|
p.add_argument(
|
|
"--samples",
|
|
type=int,
|
|
default=default_samples,
|
|
help=f"Number of samples (default: {default_samples})",
|
|
)
|
|
|
|
vlm = sub.add_parser("vlm", help="VLM boolean attribute benchmark (CelebA)")
|
|
_add_common(vlm, 50)
|
|
vlm.add_argument(
|
|
"--no-vision-update",
|
|
action="store_true",
|
|
help="Skip updating docs/model-encyclopedia/vision.md",
|
|
)
|
|
|
|
reasoning = sub.add_parser("reasoning", help="LLM reasoning benchmark (GSM8K + MMLU-Pro)")
|
|
_add_common(reasoning, 100)
|
|
reasoning.add_argument(
|
|
"--no-llms-update",
|
|
action="store_true",
|
|
help="Skip updating docs/model-encyclopedia/llms.md",
|
|
)
|
|
|
|
code = sub.add_parser("code", help="LLM code benchmark (HumanEval + MBPP)")
|
|
_add_common(code, 50)
|
|
code.add_argument(
|
|
"--no-llms-update",
|
|
action="store_true",
|
|
help="Skip updating docs/model-encyclopedia/llms.md",
|
|
)
|
|
|
|
describe = sub.add_parser("describe", help="VLM describe benchmark (LLM-as-judge)")
|
|
_add_common(describe, 15)
|
|
describe.add_argument(
|
|
"--judge-model",
|
|
default="ministral-14b-reasoning",
|
|
help="Model used as the judge (default: ministral-14b-reasoning)",
|
|
)
|
|
describe.add_argument(
|
|
"--no-vision-update",
|
|
action="store_true",
|
|
help="Skip updating docs/model-encyclopedia/vision.md",
|
|
)
|
|
|
|
return parser
|
|
|
|
|
|
# Pause, in seconds, inserted by _run_suite before every model after the
# first when several models are benchmarked in one invocation.
INTER_MODEL_SLEEP_S = 30.0
|
|
|
|
|
|
async def _run_suite(
    label: str,
    run_fn: Callable[[str], Awaitable[Any]],
    model_ids: list[str],
    *,
    update_target: str,
    update_enabled: bool,
) -> int:
    """Benchmark each model in turn and hand every result to the reporter.

    Models run sequentially, with a pause of ``INTER_MODEL_SLEEP_S`` seconds
    before every model after the first. If ``run_fn`` raises for a model, the
    failure is logged with its traceback, the model is skipped, and the
    remaining models still run. Exceptions raised by ``report`` itself are
    not caught and propagate to the caller.

    Args:
        label: Suite name shown in the progress banner.
        run_fn: Called with one model ID; the awaited result is passed to
            ``report``.
        model_ids: Model IDs to benchmark, in order.
        update_target: Forwarded to ``report`` as ``target``. The value
            ``"vision"`` selects ``update_vision`` and ``"llms"`` selects
            ``update_llms``.
        update_enabled: When false, neither update flag is set for ``report``.

    Returns:
        ``0`` if ``run_fn`` succeeded for every model, ``1`` if it raised for
        at least one.
    """
    from .reporter import report

    exit_code = 0
    for i, model_id in enumerate(model_ids):
        if i > 0:
            # Give llama-server a window to fully release the prior model's
            # port before stager spawns the next one (downloader reported a
            # port-binding race during rapid evict-and-respawn).
            print(f"\n[pause {INTER_MODEL_SLEEP_S:.0f}s for pool to settle]", flush=True)
            await asyncio.sleep(INTER_MODEL_SLEEP_S)
        print(f"\n{'='*60}", flush=True)
        print(f"Running {label} benchmark: {model_id}", flush=True)
        print(f"{'='*60}\n", flush=True)

        try:
            result = await run_fn(model_id)
        except Exception as exc:
            # logger.exception keeps the same message but appends the
            # traceback, which logger.error silently dropped.
            logger.exception("Benchmark failed for %s: %s", model_id, exc)
            exit_code = 1
            continue

        json_path = report(
            result,
            update_vision=(update_target == "vision" and update_enabled),
            update_llms=(update_target == "llms" and update_enabled),
            target=update_target,  # type: ignore[arg-type]
        )
        print(f"\nResults written to: {json_path}", flush=True)
    return exit_code
|
|
|
|
|
|
def _parse_models(arg: str, parser: argparse.ArgumentParser) -> list[str]:
|
|
model_ids = [m.strip() for m in arg.split(",") if m.strip()]
|
|
if not model_ids:
|
|
parser.error("--models requires at least one model ID")
|
|
return model_ids
|
|
|
|
|
|
def main() -> None:
    """Parse the command line, run the selected suite, and exit.

    Each suite module is imported only when its sub-command is chosen. The
    process exits through ``sys.exit`` with the integer returned by
    ``_run_suite``.
    """
    parser = _build_parser()
    args = parser.parse_args()

    model_ids = _parse_models(args.models, parser)

    # Each branch picks the banner label, the encyclopedia page to update,
    # and a callable that benchmarks a single model.
    suite = args.suite
    if suite == "vlm":
        from .suites import vlm_celeba

        label, target = "VLM CelebA", "vision"

        def run_fn(m: str) -> Awaitable[Any]:
            return vlm_celeba.run(m, args.samples)

    elif suite == "reasoning":
        from .suites import llm_reasoning

        label, target = "LLM Reasoning", "llms"

        def run_fn(m: str) -> Awaitable[Any]:
            return llm_reasoning.run(m, args.samples)

    elif suite == "code":
        from .suites import llm_code

        label, target = "LLM Code", "llms"

        def run_fn(m: str) -> Awaitable[Any]:
            return llm_code.run(m, args.samples)

    elif suite == "describe":
        from .suites import vlm_describe

        label, target = "VLM Describe", "vision"

        def run_fn(m: str) -> Awaitable[Any]:
            return vlm_describe.run(m, args.samples, args.judge_model)

    else:
        parser.error(f"Unknown suite: {args.suite}")
        return

    # Only the flag belonging to the chosen suite exists on args.
    if target == "vision":
        update_enabled = not args.no_vision_update
    else:
        update_enabled = not args.no_llms_update

    exit_code = asyncio.run(
        _run_suite(
            label,
            run_fn,
            model_ids,
            update_target=target,
            update_enabled=update_enabled,
        )
    )
    sys.exit(exit_code)
|
|
|
|
|
|
# Run the CLI only when this module is executed as a script
# (e.g. ``python3 -m tools.benchmark.cli``), not when it is imported.
if __name__ == "__main__":
    main()
|