prospector/tooling/eval/gpu.py

#!/usr/bin/env python3
"""prospector GPU — thin config over @cocotte/infra-tools.

Provisions the H100 that serves the 27B AEON generator (quinn-oss :8000) and, on
demand, the trained LoRA classifier (quinn-classifier :8001). All lifecycle logic
(region fallback, external no-secret reaper, container serving, mesh-join hook)
lives in `infra_tools`; this file supplies only prospector's specifics.

  gpu.py up | serve-classifier | reap | install-reaper | down | status
Env: IDLE_MIN (default 20), MAX_HOURS (default 4).
"""
import os

from infra_tools import GpuConfig, cli, mesh

# Name of the model volume: interpolated into the /dev/disk/by-id device path that
# USERDATA mounts at /mnt/models, and passed to GpuConfig as `volume`.
VOL = "prospector-models"

# Optional region-agnostic mesh reach: reserved nyc3-segment slot 10.9.0.20. The WG
# privkey comes from the operator vault (NEVER the repo); hub params from net-tools'
# mesh-hosts.json. Absent key -> no mesh (falls back to the nyc2-volume path). The
# one-time hub-side registration of this slot's pubkey on citron is an operator prod
# step (the auto-mode classifier blocks the agent from prod-infra writes).
_WG_KEY = os.path.expanduser("~/.vault/prospector_gpu_wg.key")


def _read_wg_key(path):
    """Return the contents of the key file at *path*, stripped of surrounding whitespace.

    The file is read inside a ``with`` block so the handle is closed
    deterministically instead of being left to the garbage collector. Errors
    from ``open`` propagate to the caller unchanged.
    """
    with open(path) as key_file:
        return key_file.read().strip()


# Mesh-join value handed to GpuConfig; the empty string means "no mesh" when the
# vault key is not present on this machine.
MESH = mesh.join_from_net_tools(_read_wg_key(_WG_KEY), "10.9.0.20") if os.path.exists(_WG_KEY) else ""

# Generator boot script (27B AEON, served as `quinn-oss` on 127.0.0.1:8000):
#   1. mount the model volume at /mnt/models unless it is already a mountpoint;
#      the trailing `|| true` lets the script continue when the mount fails
#      (e.g. the volume is not attached),
#   2. make sure the Hugging Face cache directory exists,
#   3. start the vLLM container with that cache directory bind-mounted.
# The docker invocation is kept as a backslash-continued, two-space-indented
# multi-line shell command, hence the " \\\n  " separator.
USERDATA = "\n".join((
    "mkdir -p /mnt/models && (mountpoint -q /mnt/models"
    f" || mount -o discard,defaults /dev/disk/by-id/scsi-0DO_Volume_{VOL} /mnt/models) || true",
    "mkdir -p /mnt/models/hf",
    " \\\n  ".join((
        "docker run -d --name vllm --gpus all --restart unless-stopped",
        "-v /mnt/models/hf:/root/.cache/huggingface -p 127.0.0.1:8000:8000",
        "vllm/vllm-openai:latest --model AEON-7/Qwen3.6-27B-AEON-Ultimate-Uncensored-BF16",
        "--served-model-name quinn-oss --enforce-eager --gpu-memory-utilization 0.92"
        " --max-model-len 16384 --max-num-seqs 128",
    )),
))

# Classifier: 7B base model plus the trained LoRA adapter, listening on :8001. It can
# share a bigger card with the 27B generator, but on an 80GB H100 the 27B alone needs
# ~74GB, so serve the two separately there.
# The command first force-removes any previous `vllm-classifier` container (errors
# silenced), then starts a fresh one. Fragments are joined with single spaces.
CLASSIFIER_CMD = " ".join((
    # Drop a stale container so the name is free for the new run.
    "docker rm -f vllm-classifier 2>/dev/null;",
    "docker run -d --name vllm-classifier --gpus all --restart unless-stopped",
    # HF cache is writable; the model volume (where the adapter lives) is read-only.
    "-v /mnt/models/hf:/root/.cache/huggingface",
    "-v /mnt/models:/mnt/models:ro",
    "-p 127.0.0.1:8001:8001",
    "vllm/vllm-openai:latest --model Qwen/Qwen2.5-7B-Instruct --port 8001",
    "--enable-lora --lora-modules quinn-classifier=/mnt/models/lora-classifier --max-lora-rank 16",
    "--served-model-name quinn-classifier-base --enforce-eager",
    "--gpu-memory-utilization 0.25 --max-model-len 8192",
))

# prospector's GpuConfig: everything infra_tools needs that is specific to this project.
CFG = GpuConfig(
    # Identity and access (ssh_key is presumably a provider-side key ID — confirm
    # against infra_tools).
    name="prospector-gpu",
    tags=["prospector", "gpu", "eval"],
    ssh_key="57416878",
    # Model volume and the region/VPC it is tied to.
    volume=VOL,
    volume_region="nyc2",
    vpc="b4f86d48-767b-48e3-b1c9-8b6bda8d70d4",
    # Boot script that starts the generator, plus the optional mesh-join value
    # (empty string when no WG key is available).
    userdata=USERDATA,
    mesh_join=MESH,
)

if __name__ == "__main__":
    # Subcommand name -> shell command string passed to infra_tools' CLI as `serves`;
    # `serve-classifier` is the only project-specific one.
    extra_serves = {"serve-classifier": CLASSIFIER_CMD}
    cli(
        CFG,
        __file__,
        serves=extra_serves,
        reaper_label="com.prospector.gpu-reaper",
    )