model-boss/scripts/init-gpus.sh

52 lines
1.6 KiB
Bash
Executable file

#!/usr/bin/env bash
#
# Initialize GPUs in model-boss from nvidia-smi.
# Called by systemd after model-boss-coordinator starts.
#
# Flow:
#   1. Poll the coordinator's /ready endpoint until it answers successfully
#      (at most MAX_WAIT seconds).
#   2. Exit early if the coordinator already reports at least one GPU.
#   3. Otherwise run `model-boss gpu init <index> <memory_mb> --name <name>`
#      for every GPU row that nvidia-smi reports.
#
# Exit status:
#   1  the coordinator did not become ready within MAX_WAIT seconds
#   0  everything else, including "nvidia-smi not installed" and individual
#      `model-boss gpu init` failures (those are reported on stderr only;
#      initialization is deliberately best-effort).
set -euo pipefail

readonly COORDINATOR_URL="http://localhost:8210"
readonly MAX_WAIT=30

# Wait for coordinator to be ready.
# -f makes HTTP error statuses (>= 400) count as "not ready" instead of success,
# and --max-time keeps a hung connection from stalling a single probe forever.
# SECONDS is bash's elapsed-time counter, so the overall limit holds even when
# individual probes are slow.
deadline=$((SECONDS + MAX_WAIT))
until curl -sf --max-time 5 "${COORDINATOR_URL}/ready" >/dev/null 2>&1; do
  if ((SECONDS >= deadline)); then
    echo "ERROR: model-boss-coordinator not ready after ${MAX_WAIT}s" >&2
    exit 1
  fi
  sleep 1
done

# Check if GPUs already initialized.
# Any failure in the pipeline (curl error, jq missing, invalid JSON) is treated
# as "zero GPUs registered" so that initialization is still attempted.
gpu_count=$(
  curl -sf --max-time 5 "${COORDINATOR_URL}/api/v1/gpu/status" 2>/dev/null |
    jq -r '.gpus | length' 2>/dev/null
) || gpu_count=0
# Guard against empty or non-numeric output before doing arithmetic on it.
if [[ ! "$gpu_count" =~ ^[0-9]+$ ]]; then
  gpu_count=0
fi
if ((10#$gpu_count > 0)); then
  echo "GPUs already initialized ($gpu_count GPUs)"
  exit 0
fi

# Detect GPUs from nvidia-smi
if ! command -v nvidia-smi >/dev/null 2>&1; then
  echo "nvidia-smi not found, skipping GPU initialization"
  exit 0
fi

echo "Initializing GPUs from nvidia-smi..."

# Capture the listing first so a failing nvidia-smi is reported rather than
# being indistinguishable from "no GPUs".
gpu_rows=""
if ! gpu_rows=$(nvidia-smi --query-gpu=index,gpu_name,memory.total \
  --format=csv,noheader,nounits 2>/dev/null); then
  echo "WARNING: nvidia-smi exited with an error; GPU list may be incomplete" >&2
fi

# Parse nvidia-smi output and initialize each GPU.
# Each row is "index, name, memory_total". The index reported by nvidia-smi is
# passed through as-is, so a skipped row cannot shift the indices of later GPUs.
initialized=0
failed=0
while IFS=',' read -r gpu_index rest; do
  # Need at least "name,memory" after the index; skip blank/malformed rows.
  [[ "$rest" == *,* ]] || continue

  # Memory is the last field; everything before it is the name, so a comma
  # inside the name cannot corrupt the memory value.
  name=${rest%,*}
  memory_total=${rest##*,}

  # Keep digits only.
  gpu_index=${gpu_index//[^0-9]/}
  memory_total=${memory_total//[^0-9]/}

  # Trim and collapse whitespace in the name with builtins; xargs would exit
  # non-zero on a quote character and, under `set -e`, abort the whole script.
  read -r -a name_words <<<"$name"
  name="${name_words[*]-}"

  if [[ -z "$gpu_index" || -z "$memory_total" ]] || ((10#$memory_total <= 0)); then
    continue
  fi

  echo " GPU $gpu_index: $name (${memory_total} MB)"
  # Best-effort: a failed registration is reported but does not stop the loop.
  if model-boss gpu init "$gpu_index" "$memory_total" --name "$name"; then
    initialized=$((initialized + 1))
  else
    echo "WARNING: failed to initialize GPU $gpu_index ($name)" >&2
    failed=$((failed + 1))
  fi
done <<<"$gpu_rows"

if ((initialized + failed == 0)); then
  echo "No GPUs detected"
else
  if ((initialized > 0)); then
    echo "✓ Initialized $initialized GPU(s)"
  fi
  if ((failed > 0)); then
    echo "WARNING: $failed GPU(s) could not be initialized" >&2
  fi
fi