# model-boss/tools/benchmark/cli.py

"""CLI entry point for model-boss benchmarks.

Usage:
    python3 -m tools.benchmark.cli vlm --models qwen25-vl-7b-instruct --samples 50
    python3 -m tools.benchmark.cli reasoning --models ministral-14b-reasoning,qwen3.6-27b --samples 100
    python3 -m tools.benchmark.cli code --models ministral-14b-reasoning,qwen3.6-27b --samples 50
    python3 -m tools.benchmark.cli describe --models qwen3-vl-8b-instruct,gemma-4-31b --samples 15
"""

from __future__ import annotations

import argparse
import asyncio
import logging
import sys
from collections.abc import Awaitable, Callable
from typing import Any

# Diagnostics go to stderr so they stay separate from the progress and result
# lines that the runner print()s to stdout.
logging.basicConfig(
    stream=sys.stderr,
    format="%(levelname)s %(name)s: %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


def _build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        prog="python3 -m tools.benchmark.cli",
        description="Model-boss inference benchmarks",
    )
    sub = parser.add_subparsers(dest="suite", required=True)

    def _add_common(p: argparse.ArgumentParser, default_samples: int) -> None:
        p.add_argument(
            "--models",
            required=True,
            help="Comma-separated list of model IDs registered in model-boss",
            metavar="MODEL_ID[,MODEL_ID...]",
        )
        p.add_argument(
            "--samples",
            type=int,
            default=default_samples,
            help=f"Number of samples (default: {default_samples})",
        )

    vlm = sub.add_parser("vlm", help="VLM boolean attribute benchmark (CelebA)")
    _add_common(vlm, 50)
    vlm.add_argument(
        "--no-vision-update",
        action="store_true",
        help="Skip updating docs/model-encyclopedia/vision.md",
    )

    reasoning = sub.add_parser("reasoning", help="LLM reasoning benchmark (GSM8K + MMLU-Pro)")
    _add_common(reasoning, 100)
    reasoning.add_argument(
        "--no-llms-update",
        action="store_true",
        help="Skip updating docs/model-encyclopedia/llms.md",
    )

    code = sub.add_parser("code", help="LLM code benchmark (HumanEval + MBPP)")
    _add_common(code, 50)
    code.add_argument(
        "--no-llms-update",
        action="store_true",
        help="Skip updating docs/model-encyclopedia/llms.md",
    )

    describe = sub.add_parser("describe", help="VLM describe benchmark (LLM-as-judge)")
    _add_common(describe, 15)
    describe.add_argument(
        "--judge-model",
        default="ministral-14b-reasoning",
        help="Model used as the judge (default: ministral-14b-reasoning)",
    )
    describe.add_argument(
        "--no-vision-update",
        action="store_true",
        help="Skip updating docs/model-encyclopedia/vision.md",
    )

    return parser


# Seconds to pause between consecutive models in a multi-model run.
INTER_MODEL_SLEEP_S = 30.0


async def _run_suite(
    label: str,
    run_fn: Callable[[str], Awaitable[Any]],
    model_ids: list[str],
    *,
    update_target: str,
    update_enabled: bool,
) -> int:
    """Run one benchmark suite over each model in turn and report the results.

    Models are processed sequentially, with a pause of ``INTER_MODEL_SLEEP_S``
    seconds before every model except the first.  A failure while benchmarking
    or reporting one model is logged with its traceback and does not stop the
    remaining models.

    Args:
        label: Human-readable suite name used in the progress banner.
        run_fn: Coroutine function taking a model ID and producing the result
            object handed to ``report``.
        model_ids: Model IDs to benchmark, in order.
        update_target: ``"vision"`` or ``"llms"``; selects which update flag
            is passed to ``report`` and is forwarded as its ``target``.
        update_enabled: When false, neither update flag is set.

    Returns:
        ``0`` if every model was benchmarked and reported, otherwise ``1``.
    """
    from .reporter import report

    exit_code = 0
    for i, model_id in enumerate(model_ids):
        if i > 0:
            # Give llama-server a window to fully release the prior model's
            # port before stager spawns the next one (downloader reported a
            # port-binding race during rapid evict-and-respawn).
            print(f"\n[pause {INTER_MODEL_SLEEP_S:.0f}s for pool to settle]", flush=True)
            await asyncio.sleep(INTER_MODEL_SLEEP_S)
        print(f"\n{'='*60}", flush=True)
        print(f"Running {label} benchmark: {model_id}", flush=True)
        print(f"{'='*60}\n", flush=True)

        try:
            result = await run_fn(model_id)
        except Exception as exc:
            # exc_info keeps the exception type and traceback: str(exc) alone
            # is empty for exceptions raised without a message.
            logger.error("Benchmark failed for %s: %s", model_id, exc, exc_info=True)
            exit_code = 1
            continue

        try:
            json_path = report(
                result,
                update_vision=(update_target == "vision" and update_enabled),
                update_llms=(update_target == "llms" and update_enabled),
                target=update_target,  # type: ignore[arg-type]
            )
        except Exception as exc:
            # A reporting problem for one model must not discard the
            # benchmarks still queued behind it.
            logger.error("Reporting failed for %s: %s", model_id, exc, exc_info=True)
            exit_code = 1
            continue

        print(f"\nResults written to: {json_path}", flush=True)
    return exit_code


def _parse_models(arg: str, parser: argparse.ArgumentParser) -> list[str]:
    model_ids = [m.strip() for m in arg.split(",") if m.strip()]
    if not model_ids:
        parser.error("--models requires at least one model ID")
    return model_ids


def main() -> None:
    """Parse the command line, run the selected suite, and exit with its status.

    Only the module for the chosen suite is imported.  The process exits via
    ``sys.exit`` with the code returned by ``_run_suite``.
    """
    parser = _build_parser()
    args = parser.parse_args()

    model_ids = _parse_models(args.models, parser)
    suite = args.suite

    # Each branch only gathers the suite-specific pieces; the actual run is
    # shared below.
    if suite == "vlm":
        from .suites import vlm_celeba

        label, target = "VLM CelebA", "vision"
        skip_update = args.no_vision_update

        def run_fn(model_id: str) -> Awaitable[Any]:
            return vlm_celeba.run(model_id, args.samples)

    elif suite == "reasoning":
        from .suites import llm_reasoning

        label, target = "LLM Reasoning", "llms"
        skip_update = args.no_llms_update

        def run_fn(model_id: str) -> Awaitable[Any]:
            return llm_reasoning.run(model_id, args.samples)

    elif suite == "code":
        from .suites import llm_code

        label, target = "LLM Code", "llms"
        skip_update = args.no_llms_update

        def run_fn(model_id: str) -> Awaitable[Any]:
            return llm_code.run(model_id, args.samples)

    elif suite == "describe":
        from .suites import vlm_describe

        label, target = "VLM Describe", "vision"
        skip_update = args.no_vision_update

        def run_fn(model_id: str) -> Awaitable[Any]:
            return vlm_describe.run(model_id, args.samples, args.judge_model)

    else:
        parser.error(f"Unknown suite: {args.suite}")
        return

    exit_code = asyncio.run(
        _run_suite(
            label,
            run_fn,
            model_ids,
            update_target=target,
            update_enabled=not skip_update,
        )
    )
    sys.exit(exit_code)


if __name__ == "__main__":
    main()