# ml-model-loader/src_python/tqftw_model_loader/cli.py

"""
CLI entry point for Python-side operations.

Note: Primary CLI is the TypeScript implementation (bin/model-loader.ts).
This provides Python-native commands for testing and direct usage.
"""

import argparse
import asyncio
import sys
import json
from pathlib import Path


def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser with the ``device`` and ``test-load`` subcommands."""
    parser = argparse.ArgumentParser(
        prog="model-loader",
        description="TQFTW Model Loader - ML model loading and caching",
    )
    subparsers = parser.add_subparsers(dest="command", help="Commands")

    # device command
    device_parser = subparsers.add_parser("device", help="Show device information")
    device_parser.add_argument("--json", action="store_true", help="Output JSON")

    # test-load command
    test_parser = subparsers.add_parser("test-load", help="Test loading a model")
    test_parser.add_argument("model_id", help="Model ID to load")
    test_parser.add_argument("--loader", default="gguf", help="Loader type (hf, diffusers, gguf)")
    test_parser.add_argument("--device", help="Device to use")

    return parser


def _run_device(args: argparse.Namespace) -> None:
    """Print the best device, the device count and the CUDA device list.

    Emits a JSON document when ``args.json`` is set, otherwise a
    human-readable listing.
    """
    # Imported lazily so that ``--help`` and argument errors do not pay for
    # (or fail on) importing the device module.
    from .device import DeviceManager, get_best_device, get_device_count

    dm = DeviceManager()
    devices = dm.get_cuda_devices()

    if args.json:
        output = {
            "best_device": get_best_device(),
            "device_count": get_device_count(),
            "cuda_devices": [
                {
                    "name": d.name,
                    "index": d.index,
                    "total_memory_mb": d.total_memory_mb,
                    "free_memory_mb": d.free_memory_mb,
                }
                for d in devices
            ],
        }
        print(json.dumps(output, indent=2))
        return

    print(f"Best device: {get_best_device()}")
    print(f"Device count: {get_device_count()}")
    if devices:
        print("\nCUDA devices:")
        for d in devices:
            print(f"  {d.index}: {d.name}")
            print(f"      Memory: {d.free_memory_mb:.0f} / {d.total_memory_mb:.0f} MB")


async def _test_load(args: argparse.Namespace) -> None:
    """Load ``args.model_id`` with the selected loader, report stats, then unload."""
    # Imported lazily, for the same reason as in ``_run_device``.
    from .registry import get_loader

    loader = get_loader(args.loader)
    print(f"Loading {args.model_id} with {args.loader} loader...")

    # Only forward ``device`` when the user supplied one, so the loader's own
    # default applies otherwise.
    kwargs = {}
    if args.device:
        kwargs["device"] = args.device

    await loader.load(args.model_id, **kwargs)
    try:
        print("Loaded successfully!")
        print(f"  Device: {loader.get_device()}")
        if loader.model_info:
            print(f"  Load time: {loader.model_info.load_time_seconds:.2f}s")
            print(f"  Memory: {loader.model_info.memory_used_mb:.0f} MB")
    finally:
        # Always release the model once it has been loaded, even if reporting
        # fails. A failed ``load`` is left alone: nothing was loaded.
        await loader.unload()
        print("Unloaded.")


def main(argv: "list[str] | None" = None) -> None:
    """Main CLI entry point.

    Args:
        argv: Command-line arguments, excluding the program name. When
            ``None`` (the default), argparse reads ``sys.argv[1:]``.

    Raises:
        SystemExit: With code 1 after printing help when no command is
            given; argparse also exits on ``--help`` and on invalid arguments.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    if args.command == "device":
        _run_device(args)
    elif args.command == "test-load":
        asyncio.run(_test_load(args))
    else:
        parser.print_help()
        sys.exit(1)


# Run the CLI only when this file is executed directly, not when it is imported.
# NOTE: the subcommands use relative imports (``from .device ...``), so this
# presumably has to be launched as a module of its package (``python -m ...``)
# rather than as a bare script path — confirm against the packaging setup.
if __name__ == "__main__":
    main()