#!/usr/bin/env python3
"""prospector GPU — thin config over @cocotte/infra-tools.

Provisions the H100 that serves the 27B AEON generator (quinn-oss :8000) and, on
demand, the trained LoRA classifier (quinn-classifier :8001). All lifecycle logic
(region fallback, external no-secret reaper, container serving, mesh-join hook)
lives in `infra_tools`; this file supplies only prospector's specifics.

  gpu.py up | serve-classifier | reap | install-reaper | down | status

Env: IDLE_MIN (default 20), MAX_HOURS (default 4).
"""
import os

from infra_tools import GpuConfig, cli, mesh

# Name of the block-storage volume holding the model weights; it is interpolated
# into the device path mounted by USERDATA and passed to GpuConfig as `volume`.
VOL = "prospector-models"

# Optional region-agnostic mesh reach: reserved nyc3-segment slot 10.9.0.20. The WG
# privkey comes from the operator vault (NEVER the repo); hub params from net-tools'
# mesh-hosts.json. Absent key -> no mesh (falls back to the nyc2-volume path). The
# one-time hub-side registration of this slot's pubkey on citron is an operator prod
# step (the auto-mode classifier blocks the agent from prod-infra writes).
_WG_KEY = os.path.expanduser("~/.vault/prospector_gpu_wg.key")
_MESH_SLOT = "10.9.0.20"


def _mesh_join(key_path, slot_ip):
    """Build the mesh-join hook for *slot_ip* from the key stored at *key_path*.

    Returns "" when *key_path* does not exist (no mesh). Otherwise reads the key,
    strips surrounding whitespace, and returns whatever
    `mesh.join_from_net_tools(key, slot_ip)` produces.
    """
    if not os.path.exists(key_path):
        return ""
    # Read inside a context manager so the handle on the private key is closed
    # deterministically instead of lingering until garbage collection.
    with open(key_path, encoding="utf-8") as key_file:
        private_key = key_file.read().strip()
    return mesh.join_from_net_tools(private_key, slot_ip)


MESH = _mesh_join(_WG_KEY, _MESH_SLOT)

# Generator: 27B AEON on :8000 (mounts the model volume when it's attached).
# The mount step ends in `|| true`, so a missing volume does not abort the script.
# NOTE: `\\` below is an escaped backslash, i.e. a shell line continuation.
USERDATA = f"""mkdir -p /mnt/models && (mountpoint -q /mnt/models || mount -o discard,defaults /dev/disk/by-id/scsi-0DO_Volume_{VOL} /mnt/models) || true
mkdir -p /mnt/models/hf
docker run -d --name vllm --gpus all --restart unless-stopped \\
  -v /mnt/models/hf:/root/.cache/huggingface -p 127.0.0.1:8000:8000 \\
  vllm/vllm-openai:latest --model AEON-7/Qwen3.6-27B-AEON-Ultimate-Uncensored-BF16 \\
  --served-model-name quinn-oss --enforce-eager --gpu-memory-utilization 0.92 --max-model-len 16384 --max-num-seqs 128"""

# Classifier: 7B base + the trained LoRA adapter on :8001 (coexists with the 27B on
# a bigger card; on an 80GB H100 the 27B alone needs ~74GB, so serve them separately).
# Shell command run by the `serve-classifier` subcommand: drop any previous
# classifier container, then start vLLM on port 8001 with the LoRA adapter loaded
# from the read-only model mount. Fragments are joined with single spaces.
CLASSIFIER_CMD = " ".join(
    (
        # Remove a stale container first; errors are discarded if none exists.
        "docker rm -f vllm-classifier 2>/dev/null;",
        "docker run -d --name vllm-classifier --gpus all --restart unless-stopped",
        "-v /mnt/models/hf:/root/.cache/huggingface -v /mnt/models:/mnt/models:ro -p 127.0.0.1:8001:8001",
        "vllm/vllm-openai:latest --model Qwen/Qwen2.5-7B-Instruct --port 8001",
        "--enable-lora --lora-modules quinn-classifier=/mnt/models/lora-classifier --max-lora-rank 16",
        "--served-model-name quinn-classifier-base --enforce-eager --gpu-memory-utilization 0.25 --max-model-len 8192",
    )
)

# prospector's droplet settings, handed to infra_tools unchanged.
CFG = GpuConfig(
    name="prospector-gpu",
    tags=["prospector", "gpu", "eval"],
    ssh_key="57416878",
    volume=VOL,
    volume_region="nyc2",
    vpc="b4f86d48-767b-48e3-b1c9-8b6bda8d70d4",
    userdata=USERDATA,
    mesh_join=MESH,
)

if __name__ == "__main__":
    # `serves` maps the extra subcommand name to the command string it runs.
    cli(
        CFG,
        __file__,
        serves={"serve-classifier": CLASSIFIER_CMD},
        reaper_label="com.prospector.gpu-reaper",
    )