Reserved nyc3 slot 10.9.0.20; WG privkey from the operator vault (never repo), hub params from net-tools' mesh-hosts.json. Absent key -> no mesh (nyc2-volume path). The one-time hub registration of the slot pubkey on citron is an operator prod step (agent-blocked). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
53 lines
2.8 KiB
Python
Executable file
53 lines
2.8 KiB
Python
Executable file
#!/usr/bin/env python3
"""prospector GPU — thin config over @cocotte/infra-tools.

Provisions the H100 that serves the 27B AEON generator (quinn-oss :8000) and, on
demand, the trained LoRA classifier (quinn-classifier :8001). All lifecycle logic
(region fallback, external no-secret reaper, container serving, mesh-join hook)
lives in `infra_tools`; this file supplies only prospector's specifics.

gpu.py up | serve-classifier | reap | install-reaper | down | status
Env: IDLE_MIN (default 20), MAX_HOURS (default 4).
"""
|
|
import os
|
|
|
|
from infra_tools import GpuConfig, cli, mesh
|
|
|
|
# Name of the model volume: passed to GpuConfig(volume=...) and interpolated into
# the /dev/disk/by-id/scsi-0DO_Volume_<name> device path that USERDATA mounts at
# /mnt/models.
VOL = "prospector-models"
|
|
|
|
# Optional region-agnostic mesh reach: reserved nyc3-segment slot 10.9.0.20. The WG
# privkey comes from the operator vault (NEVER the repo); hub params come from
# net-tools' mesh-hosts.json. Absent key -> no mesh (falls back to the nyc2-volume
# path). The one-time hub-side registration of this slot's pubkey on citron is an
# operator prod step (the auto-mode classifier blocks the agent from prod-infra
# writes).
_WG_KEY = os.path.expanduser("~/.vault/prospector_gpu_wg.key")

if os.path.exists(_WG_KEY):
    # Read the key under a context manager so the handle is closed before the
    # join hook is built, instead of leaking it until garbage collection.
    with open(_WG_KEY) as _key_file:
        _wg_privkey = _key_file.read().strip()
    MESH = mesh.join_from_net_tools(_wg_privkey, "10.9.0.20")
else:
    # No key in the vault: an empty hook means "do not join the mesh".
    MESH = ""
|
|
|
|
# Generator: 27B AEON on :8000 (mounts the model volume when it's attached).
# The trailing `|| true` on the mount line keeps the rest of the script running
# when the volume device is missing; the HF cache directory is then created on the
# root disk instead. The API port is published on 127.0.0.1 only.
# Each `\\` below is a single backslash in the rendered script, i.e. a shell line
# continuation, so the four `docker run` lines form one command.
USERDATA = f"""mkdir -p /mnt/models && (mountpoint -q /mnt/models || mount -o discard,defaults /dev/disk/by-id/scsi-0DO_Volume_{VOL} /mnt/models) || true
mkdir -p /mnt/models/hf
docker run -d --name vllm --gpus all --restart unless-stopped \\
-v /mnt/models/hf:/root/.cache/huggingface -p 127.0.0.1:8000:8000 \\
vllm/vllm-openai:latest --model AEON-7/Qwen3.6-27B-AEON-Ultimate-Uncensored-BF16 \\
--served-model-name quinn-oss --enforce-eager --gpu-memory-utilization 0.92 --max-model-len 16384 --max-num-seqs 128"""
|
|
|
|
# Classifier: 7B base + the trained LoRA adapter on :8001 (coexists with the 27B on
# a bigger card; on an 80GB H100 the 27B alone needs ~74GB, so serve them separately).
# Built from adjacent string literals: every fragment except the last ends with a
# space, so the result is a single-line shell command. Any previous
# `vllm-classifier` container is removed first so the command can be re-run.
CLASSIFIER_CMD = (
    "docker rm -f vllm-classifier 2>/dev/null; "
    "docker run -d --name vllm-classifier --gpus all --restart unless-stopped "
    "-v /mnt/models/hf:/root/.cache/huggingface -v /mnt/models:/mnt/models:ro -p 127.0.0.1:8001:8001 "
    "vllm/vllm-openai:latest --model Qwen/Qwen2.5-7B-Instruct --port 8001 "
    "--enable-lora --lora-modules quinn-classifier=/mnt/models/lora-classifier --max-lora-rank 16 "
    "--served-model-name quinn-classifier-base --enforce-eager --gpu-memory-utilization 0.25 --max-model-len 8192"
)
|
|
|
|
# prospector's instance spec, handed to the shared infra_tools CLI at the bottom
# of this file. One keyword per line so individual settings diff cleanly.
CFG = GpuConfig(
    name="prospector-gpu",
    tags=["prospector", "gpu", "eval"],
    ssh_key="57416878",  # presumably the provider-side SSH key ID — confirm
    volume=VOL,
    volume_region="nyc2",
    vpc="b4f86d48-767b-48e3-b1c9-8b6bda8d70d4",
    userdata=USERDATA,
    mesh_join=MESH,  # "" when no WireGuard key is present in the vault
)
|
|
|
|
if __name__ == "__main__":
    # Dispatch to the shared infra_tools CLI with this file's config. `serves`
    # maps the extra `serve-classifier` subcommand to CLASSIFIER_CMD.
    cli(
        CFG,
        __file__,
        serves={"serve-classifier": CLASSIFIER_CMD},
        reaper_label="com.prospector.gpu-reaper",
    )
|