511 lines
16 KiB
Markdown
511 lines
16 KiB
Markdown
# Implementation Guide - How It Should Work
|
|
|
|
## Core Workflow
|
|
|
|
### Initialization Flow
|
|
|
|
```python
|
|
# 1. Initialize NVML
|
|
pynvml.nvmlInit()
|
|
|
|
# 2. Enumerate GPUs
|
|
gpu_manager = GPUManager()
|
|
gpus = gpu_manager.list_devices()
|
|
# Returns: [GPUDevice(index=0, name="RTX 3090", ...), GPUDevice(index=1, ...)]
|
|
|
|
# 3. Get device handles
|
|
gpu0 = gpu_manager.get_device(0)
|
|
```
|
|
|
|
### Clock Control Flow
|
|
|
|
```python
|
|
# 1. Check current clocks
|
|
clock_controller = ClockController()
|
|
clocks = clock_controller.get_clocks(gpu0)
|
|
# Returns: ClockInfo(core=1815, memory=9501, shader=1815)
|
|
|
|
# 2. Validate offset
|
|
validate_clock_offset(offset=+100, domain="core") # Raises if the core offset exceeds 200 MHz
|
|
|
|
# 3. Apply offset
|
|
clock_controller.set_clock_offset(gpu0, core=+100, memory=+500)
|
|
|
|
# 4. Verify applied
|
|
new_clocks = clock_controller.get_clocks(gpu0)
|
|
# Returns: ClockInfo(core=1915, memory=10001, ...)
|
|
```
|
|
|
|
### Fan Control Flow
|
|
|
|
```python
|
|
# 1. Check current fan speed
|
|
fan_controller = FanController()
|
|
current_speed = fan_controller.get_fan_speed(gpu0)
|
|
# Returns: 48 (percent)
|
|
|
|
# 2. Apply manual speed
|
|
fan_controller.set_fan_speed(gpu0, speed=70)
|
|
|
|
# 3. Or apply temperature curve
|
|
fan_curve = [(60, 50), (70, 70), (75, 85), (80, 100)]
|
|
fan_controller.apply_curve(gpu0, fan_curve)
|
|
|
|
# 4. Background task monitors temp and adjusts fan
|
|
async def fan_curve_daemon():
    """Poll the GPU temperature forever and drive the fan from the curve.

    Each pass reads the temperature of ``gpu0``, maps it through
    ``fan_curve`` and applies the resulting speed, then sleeps.
    """
    poll_seconds = 5  # Update every 5 seconds
    while True:
        set_fan_speed(gpu0, interpolate_curve(get_temperature(gpu0), fan_curve))
        await asyncio.sleep(poll_seconds)
|
|
```
|
|
|
|
### Telemetry Streaming Flow
|
|
|
|
```python
|
|
# 1. Create collector
|
|
telemetry = TelemetryCollector()
|
|
|
|
# 2. Collect one-time snapshot
|
|
metrics = telemetry.collect(gpu0)
|
|
# Returns: GPUMetrics(temp=75, fan=48, power=367.99, ...)
|
|
|
|
# 3. Stream continuously
|
|
async for metrics in telemetry.stream(gpu0, interval=1.0):
|
|
print(f"Temp: {metrics.temperature}°C")
|
|
# Yields every 1 second
|
|
|
|
# 4. WebSocket broadcasts to frontend
|
|
async def telemetry_websocket(websocket: WebSocket):
    """Accept a WebSocket client and push one telemetry frame per second.

    ``stream_all`` takes the list of devices to sample as its first
    argument, so the device list is fetched from the GPU manager first.
    """
    await websocket.accept()
    devices = gpu_manager.list_devices()
    async for metrics in telemetry.stream_all(devices, interval=1.0):
        await websocket.send_json(metrics)
|
|
```
|
|
|
|
### Profile Management Flow
|
|
|
|
```python
|
|
# 1. Load profile from YAML
|
|
profile_manager = ProfileManager()
|
|
profile = profile_manager.load(Path("configs/balanced.yaml"))
|
|
# Returns: ProfileConfig(name="Balanced", core_offset=100, ...)
|
|
|
|
# 2. Validate profile
|
|
# Pydantic automatically validates during load
|
|
|
|
# 3. Apply profile to GPU
|
|
profile_manager.apply(gpu0, profile)
|
|
# Internally calls:
|
|
# - clock_controller.set_clock_offset(gpu0, profile.core_offset, profile.memory_offset)
|
|
# - fan_controller.apply_curve(gpu0, profile.fan_curve)
|
|
|
|
# 4. Save current settings as profile
|
|
current_profile = profile_manager.capture(gpu0)
|
|
profile_manager.save(current_profile, Path("~/.config/nvidia-oc/profiles/my-profile.yaml"))
|
|
```
|
|
|
|
## Component Implementation Details
|
|
|
|
### GPUManager (core/gpu.py)
|
|
|
|
**Purpose:** Enumerate and manage GPU device handles.
|
|
|
|
**Key Methods:**
|
|
```python
|
|
class GPUManager:
    """Enumerate NVIDIA GPUs through NVML and hand out device records.

    NVML is initialized in the constructor and shut down when the manager
    is finalized. The device list is cached at construction time.
    """

    def __init__(self):
        """Initialize NVML once at startup and cache the device list."""
        # Set the attributes before nvmlInit() so __del__ stays safe even
        # when initialization raises.
        self._initialized = False
        self._devices: List[GPUDevice] = []
        pynvml.nvmlInit()
        self._initialized = True
        self._refresh_devices()

    def list_devices(self) -> List[GPUDevice]:
        """Return all NVIDIA GPUs found by the last refresh.

        A shallow copy is returned so callers cannot mutate the cache.
        """
        return list(self._devices)

    def get_device(self, index: int) -> GPUDevice:
        """Get specific GPU by index.

        Raises:
            ValueError: if ``index`` is negative or not below the number of
                cached devices. Negative values are rejected explicitly so
                that ``-1`` does not silently select the last GPU.
        """
        if not 0 <= index < len(self._devices):
            raise ValueError(f"GPU index {index} out of range")
        return self._devices[index]

    def _refresh_devices(self) -> None:
        """Query NVML for all GPU devices and replace the cached list."""
        # Build a fresh list so that repeated refreshes do not accumulate
        # duplicate entries.
        devices = []
        for i in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            name = self._as_text(pynvml.nvmlDeviceGetName(handle))
            uuid = self._as_text(pynvml.nvmlDeviceGetUUID(handle))
            devices.append(GPUDevice(index=i, name=name, uuid=uuid, handle=handle))
        self._devices = devices

    @staticmethod
    def _as_text(value):
        """Return ``value`` as ``str``, decoding it when NVML returned bytes."""
        # Depending on the pynvml release, string getters return either
        # bytes or str; calling .decode() unconditionally breaks on str.
        return value.decode() if isinstance(value, bytes) else value

    def __del__(self):
        """Shutdown NVML on cleanup, but only if this instance initialized it."""
        if getattr(self, "_initialized", False):
            self._initialized = False
            try:
                pynvml.nvmlShutdown()
            except Exception:
                # Finalizers may run during interpreter teardown, when the
                # module or driver is already gone; nothing useful can be
                # done with the error here.
                pass
|
|
```
|
|
|
|
**Error Handling:**
|
|
- Catch `pynvml.NVMLError` and wrap in custom exceptions
|
|
- Handle missing NVIDIA driver gracefully
|
|
- Retry initialization on transient failures
|
|
|
|
### ClockController (core/clock.py)
|
|
|
|
**Purpose:** Read and modify GPU clock offsets.
|
|
|
|
**Key Methods:**
|
|
```python
|
|
class ClockController:
    """Read GPU clocks through NVML and write clock offsets via nvidia-settings."""

    def get_clocks(self, device: GPUDevice) -> ClockInfo:
        """Report the current graphics, memory and SM clocks in MHz."""
        handle = device.handle
        return ClockInfo(
            core=pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_GRAPHICS),
            memory=pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM),
            shader=pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_SM),
        )

    def set_clock_offset(self, device: GPUDevice, core: int, memory: int) -> None:
        """Apply core and memory clock offsets (requires Coolbits enabled).

        Both offsets are validated before anything is written. The write
        itself goes through the ``nvidia-settings`` command because NVML
        offers no direct offset API; a non-zero exit status raises
        ``subprocess.CalledProcessError``.
        """
        for offset, domain in ((core, "core"), (memory, "memory")):
            validate_clock_offset(offset, domain)

        command = [
            "nvidia-settings",
            "-a", f"[gpu:{device.index}]/GPUGraphicsClockOffsetAllPerformanceLevels={core}",
            "-a", f"[gpu:{device.index}]/GPUMemoryTransferRateOffsetAllPerformanceLevels={memory}",
        ]
        subprocess.run(command, check=True)

    def reset_clocks(self, device: GPUDevice) -> None:
        """Return the device to stock clocks by applying zero offsets."""
        self.set_clock_offset(device, core=0, memory=0)
|
|
```
|
|
|
|
**Important Notes:**
|
|
- NVML can READ clocks but cannot WRITE offsets
|
|
- Must use `nvidia-settings` command for writes (requires X11 or virtual X)
|
|
- Offsets are not retained across reboots on their own; they must be re-applied at startup (for example by loading a saved nvidia-settings configuration)
|
|
- Coolbits must be enabled in Xorg config
|
|
|
|
### FanController (core/fan.py)
|
|
|
|
**Purpose:** Control GPU fan speeds.
|
|
|
|
**Key Methods:**
|
|
```python
|
|
class FanController:
    """Control GPU fan speed manually or from a temperature curve."""

    # Running curve monitors keyed by GPU index. Kept on the class (and
    # deliberately shared) so a monitor started through one controller
    # instance can be replaced or cancelled through another one.
    _curve_tasks = {}

    def get_fan_speed(self, device: GPUDevice) -> int:
        """Get current fan speed as percentage."""
        speed = pynvml.nvmlDeviceGetFanSpeed(device.handle)
        return speed  # Returns 0-100

    def set_fan_speed(self, device: GPUDevice, speed: int) -> None:
        """Set manual fan speed (0-100%)."""
        validate_fan_speed(speed)
        pynvml.nvmlDeviceSetFanSpeed_v2(device.handle, 0, speed)  # 0 = fan index

    def apply_curve(self, device: GPUDevice, curve: FanCurve) -> None:
        """Apply temperature-based fan curve.

        Starts a background task that re-evaluates the curve periodically.
        Any monitor already running for the same GPU is cancelled first so
        two curves never fight over the fan.

        Raises:
            ValueError: if ``curve`` has no points.
            RuntimeError: from ``asyncio.create_task`` when there is no
                running event loop.
        """
        if not curve:
            raise ValueError("Fan curve must contain at least one point")
        self._cancel_curve(device)
        # Keep a reference: the event loop only holds weak references to
        # tasks, so an unreferenced monitor could be garbage collected.
        task = asyncio.create_task(self._curve_monitor(device, curve))
        FanController._curve_tasks[device.index] = task

    def _cancel_curve(self, device: GPUDevice) -> None:
        """Stop the curve monitor registered for ``device``, if there is one."""
        task = FanController._curve_tasks.pop(device.index, None)
        if task is not None and not task.done():
            task.cancel()

    async def _curve_monitor(self, device: GPUDevice, curve: FanCurve) -> None:
        """Background task to apply fan curve."""
        while True:
            temp = pynvml.nvmlDeviceGetTemperature(device.handle, pynvml.NVML_TEMPERATURE_GPU)
            target_speed = self._interpolate_curve(temp, curve)
            self.set_fan_speed(device, target_speed)
            await asyncio.sleep(5)  # Update every 5 seconds

    def _interpolate_curve(self, temp: int, curve: FanCurve) -> int:
        """Linear interpolation between curve points.

        ``curve`` is a sequence of ``(temperature, fan_speed)`` pairs. Below
        the first point the first speed is used, above the last point the
        last speed is used, and in between the speed is interpolated and
        truncated to an integer.

        Raises:
            ValueError: if ``curve`` has no points.
        """
        if not curve:
            raise ValueError("Fan curve must contain at least one point")
        # Interpolation is only meaningful on ascending temperatures; sort
        # so an unordered curve cannot produce negative ratios.
        points = sorted(curve, key=lambda point: point[0])
        for i, (temp_threshold, fan_speed) in enumerate(points):
            if temp < temp_threshold:
                if i == 0:
                    return fan_speed
                prev_temp, prev_speed = points[i - 1]
                ratio = (temp - prev_temp) / (temp_threshold - prev_temp)
                return int(prev_speed + ratio * (fan_speed - prev_speed))
        return points[-1][1]  # Max speed if beyond all thresholds

    def enable_auto(self, device: GPUDevice) -> None:
        """Re-enable automatic fan control."""
        # A running curve monitor would override the automatic policy on
        # its next tick, so stop it first.
        self._cancel_curve(device)
        pynvml.nvmlDeviceSetDefaultFanSpeed_v2(device.handle, 0)
|
|
```
|
|
|
|
**Fan Curve Algorithm:**
|
|
- Linear interpolation between defined points
|
|
- Example: Temp=67°C, curve=[(60,50), (70,70)] → speed = 50 + (67-60)/(70-60) * (70-50) = 64%
|
|
|
|
### TelemetryCollector (core/telemetry.py)
|
|
|
|
**Purpose:** Collect and stream GPU metrics.
|
|
|
|
**Key Methods:**
|
|
```python
|
|
class TelemetryCollector:
    """Collect GPU metrics from NVML, once or as an endless async stream."""

    def collect(self, device: GPUDevice) -> GPUMetrics:
        """Collect all metrics for a GPU.

        Memory figures are reported in MB and power draw in watts.
        """
        handle = device.handle
        # Query memory once so used/total come from the same sample (and to
        # avoid a redundant NVML call).
        memory = pynvml.nvmlDeviceGetMemoryInfo(handle)

        return GPUMetrics(
            timestamp=time.time(),
            temperature=pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU),
            fan_speed=pynvml.nvmlDeviceGetFanSpeed(handle),
            power_draw=pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0,  # mW to W
            core_clock=pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_GRAPHICS),
            memory_clock=pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM),
            utilization=pynvml.nvmlDeviceGetUtilizationRates(handle).gpu,
            memory_used=memory.used // 1024 // 1024,  # Bytes to MB
            memory_total=memory.total // 1024 // 1024,
        )

    async def stream(self, device: GPUDevice, interval: float) -> AsyncGenerator[GPUMetrics, None]:
        """Stream metrics at specified interval.

        Yields a fresh snapshot, then sleeps ``interval`` seconds; the
        generator never finishes on its own.
        """
        while True:
            yield self.collect(device)
            await asyncio.sleep(interval)

    async def stream_all(self, devices: List[GPUDevice], interval: float) -> AsyncGenerator[Dict, None]:
        """Stream metrics for all GPUs.

        Each yielded dict has a ``"timestamp"`` and a ``"gpus"`` list with
        one ``asdict`` snapshot per device, in the order of ``devices``.
        """
        while True:
            metrics = {
                "timestamp": time.time(),
                "gpus": [asdict(self.collect(device)) for device in devices],
            }
            yield metrics
            await asyncio.sleep(interval)
|
|
```
|
|
|
|
### ProfileManager (core/profile.py)
|
|
|
|
**Purpose:** Load, validate, and apply profiles.
|
|
|
|
**Key Methods:**
|
|
```python
|
|
class ProfileManager:
    """Load, save, apply and capture overclocking profiles."""

    def load(self, path: Path) -> ProfileConfig:
        """Load profile from YAML file.

        A leading ``~`` in ``path`` is expanded to the user's home directory.

        Raises:
            ValueError: if the file does not contain a YAML mapping (for
                example when it is empty).
        """
        source = Path(path).expanduser()
        with open(source, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
        if not isinstance(data, dict):
            # safe_load returns None for an empty file; ``**None`` would
            # otherwise fail with an unhelpful TypeError.
            raise ValueError(f"Profile file {source} does not contain a YAML mapping")
        return ProfileConfig(**data)  # Pydantic validation

    def save(self, profile: ProfileConfig, path: Path) -> None:
        """Save profile to YAML file.

        A leading ``~`` in ``path`` is expanded and missing parent
        directories are created, so a path such as
        ``~/.config/nvidia-oc/profiles/my-profile.yaml`` works on first use.
        """
        target = Path(path).expanduser()
        target.parent.mkdir(parents=True, exist_ok=True)
        with open(target, "w", encoding="utf-8") as f:
            yaml.dump(profile.dict(), f)

    def apply(self, device: GPUDevice, profile: ProfileConfig) -> None:
        """Apply profile to GPU.

        Clock offsets are applied first. Then the fan curve is started, or
        automatic fan control is restored when the profile has no curve.
        """
        # NOTE(review): profile.power_limit is not applied here — confirm
        # whether that is intended.
        clock_controller = ClockController()
        fan_controller = FanController()

        # Apply clock offsets
        clock_controller.set_clock_offset(
            device,
            core=profile.core_offset,
            memory=profile.memory_offset,
        )

        # Apply fan curve or fall back to automatic control
        if profile.fan_curve is not None:
            fan_controller.apply_curve(device, profile.fan_curve)
        else:
            fan_controller.enable_auto(device)

    def capture(self, device: GPUDevice) -> ProfileConfig:
        """Capture current GPU settings as profile.

        NVML can only report absolute clocks, not offsets, so the captured
        profile currently carries placeholder values and does not read
        anything from ``device``.
        """
        # NOTE(review): a manual fan speed cannot be represented yet;
        # fan_curve=None means "automatic" when the profile is applied.
        return ProfileConfig(
            name="Custom",
            core_offset=0,  # Note: NVML can't read offsets, only absolute clocks
            memory_offset=0,
            power_limit=100,
            fan_curve=None,
        )
|
|
```
|
|
|
|
## CLI Implementation (cli/main.py)
|
|
|
|
**Framework:** Click
|
|
|
|
**Structure:**
|
|
```python
|
|
# Root command group; sub-commands attach themselves via @cli.command().
@click.group()
@click.version_option()
def cli():
    """NVIDIA GPU Overclocking Tool"""
|
|
|
|
@cli.command()
@click.option("--watch", is_flag=True, help="Live monitoring mode")
def status(watch: bool):
    """Show GPU status"""
    if not watch:
        # One-time status
        console.print(generate_table())
        return

    # Live mode: the renderable handed to Live is a snapshot, so a new
    # table has to be pushed on every tick or the display never changes.
    try:
        with Live(generate_table(), refresh_per_second=1) as live:
            while True:
                time.sleep(1)
                live.update(generate_table())
    except KeyboardInterrupt:
        # Ctrl+C is the normal way to leave watch mode; exit quietly.
        pass
|
|
|
|
@cli.command("set-clock")
@click.option("--gpu", type=int, required=True)
@click.option("--core", type=int, required=True)
@click.option("--memory", type=int, required=True)
def set_clock(gpu: int, core: int, memory: int):
    """Set clock offsets"""
    # Show a spinner while the device is looked up and the offsets applied.
    with console.status(f"Applying clocks to GPU {gpu}..."):
        target = GPUManager().get_device(gpu)
        ClockController().set_clock_offset(target, core, memory)
    # Only reached when nothing above raised.
    console.print(f"[green]✓[/green] Clocks applied: core={core:+d} MHz, memory={memory:+d} MHz")
|
|
```
|
|
|
|
## API Implementation (api/main.py)
|
|
|
|
**Framework:** FastAPI
|
|
|
|
**Structure:**
|
|
```python
|
|
app = FastAPI(title="NVIDIA OC API", version="0.1.0")
|
|
|
|
# Initialize GPU manager at startup
|
|
@app.on_event("startup")
async def startup():
    """Create the module-wide GPU manager and telemetry collector.

    Both objects are stored in module globals so the route handlers can
    reach them.
    """
    global gpu_manager, telemetry_collector
    gpu_manager, telemetry_collector = GPUManager(), TelemetryCollector()
|
|
|
|
@app.get("/api/gpus")
async def list_gpus():
    """List all GPUs"""
    # Expose only the JSON-friendly identity fields of each device.
    payload = []
    for device in gpu_manager.list_devices():
        payload.append({"index": device.index, "name": device.name, "uuid": device.uuid})
    return payload
|
|
|
|
@app.post("/api/gpus/{gpu_id}/clock")
async def set_clock(gpu_id: int, request: ClockRequest):
    """Set clock offsets"""
    from fastapi import HTTPException

    # An unknown GPU index is a client error: report 404 instead of letting
    # the ValueError surface as a 500.
    try:
        device = gpu_manager.get_device(gpu_id)
    except ValueError as exc:
        raise HTTPException(status_code=404, detail=str(exc)) from exc

    controller = ClockController()
    controller.set_clock_offset(device, request.core, request.memory)
    return {"status": "success", "core": request.core, "memory": request.memory}
|
|
|
|
@app.websocket("/ws/telemetry")
async def telemetry_websocket(websocket: WebSocket):
    """Stream live telemetry"""
    from fastapi import WebSocketDisconnect

    await websocket.accept()
    devices = gpu_manager.list_devices()
    try:
        async for metrics in telemetry_collector.stream_all(devices, interval=1.0):
            await websocket.send_json(metrics)
    except WebSocketDisconnect:
        # The client closed the connection; stop streaming instead of
        # letting the disconnect propagate as an unhandled error.
        return
|
|
```
|
|
|
|
## Frontend Implementation (frontend/src/App.tsx)
|
|
|
|
**Framework:** React 19
|
|
|
|
**Structure:**
|
|
```tsx
|
|
// Root component: lists every GPU and feeds each card its live telemetry.
export const App = () => {
  const { gpus, loading, updateClock, updateFan } = useGPUData();
  const { metrics, connectionState } = useWebSocket('ws://localhost:8000/ws/telemetry');
  const { showToast } = useToast();

  // Apply clock offsets and report the outcome as a toast.
  const handleClockUpdate = async (gpuId: number, core: number, memory: number) => {
    try {
      await updateClock(gpuId, core, memory);
      showToast('Clock offsets applied', 'success');
    } catch (error) {
      // A caught value is `unknown` under strict TypeScript, so narrow it
      // before reading `.message`.
      const message = error instanceof Error ? error.message : String(error);
      showToast(`Error: ${message}`, 'error');
    }
  };

  return (
    <ThemeProvider initialTheme={cyberpunkAdapter}>
      <ToastProvider>
        <Container size="2xl">
          {loading && <Skeleton />}
          {gpus.map(gpu => (
            <GPUCard
              key={gpu.id}
              gpu={gpu}
              metrics={metrics?.gpus?.[gpu.id]}
              onClockUpdate={handleClockUpdate}
            />
          ))}
        </Container>
      </ToastProvider>
    </ThemeProvider>
  );
};
|
|
```
|
|
|
|
## Testing Strategy
|
|
|
|
### Unit Tests (pytest)
|
|
|
|
**Mock NVML:**
|
|
```python
|
|
@pytest.fixture
def mock_nvml(monkeypatch):
    """Replace the NVML entry points used by GPUManager with fakes.

    The functions are patched on the ``pynvml`` module itself, so every
    module that calls ``pynvml.<function>`` sees the fakes.
    ``monkeypatch.setattr("pynvml", obj)`` is not usable here: a string
    target must be a dotted ``module.attribute`` path.
    """
    import pynvml

    fakes = {
        "nvmlInit": lambda: None,
        "nvmlShutdown": lambda: None,
        "nvmlDeviceGetCount": lambda: 2,
        "nvmlDeviceGetHandleByIndex": lambda idx: f"handle_{idx}",
        "nvmlDeviceGetName": lambda handle: b"RTX 3090",
        "nvmlDeviceGetUUID": lambda handle: f"GPU-{handle}".encode(),
        # ... mock other methods
    }
    for name, fake in fakes.items():
        monkeypatch.setattr(pynvml, name, fake)
|
|
|
|
def test_gpu_manager(mock_nvml):
    """GPUManager reports the two mocked GPUs with the mocked name."""
    found = GPUManager().list_devices()
    assert len(found) == 2
    first = found[0]
    assert first.name == "RTX 3090"
|
|
```
|
|
|
|
### Integration Tests
|
|
|
|
**API Testing:**
|
|
```python
|
|
from fastapi.testclient import TestClient
|
|
|
|
def test_list_gpus():
    """GET /api/gpus answers 200 with at least one GPU."""
    # Startup handlers only run when TestClient is used as a context
    # manager; without it the app's GPU manager is never created.
    with TestClient(app) as client:
        response = client.get("/api/gpus")
    assert response.status_code == 200
    assert len(response.json()) > 0
|
|
|
|
def test_set_clock():
    """POST /api/gpus/0/clock answers 200 and reports success."""
    # Startup handlers only run when TestClient is used as a context
    # manager; without it the app's GPU manager is never created.
    with TestClient(app) as client:
        response = client.post("/api/gpus/0/clock", json={"core": 100, "memory": 500})
    assert response.status_code == 200
    assert response.json()["status"] == "success"
|
|
```
|
|
|
|
### Stress Tests
|
|
|
|
**24-Hour Burn-In:**
|
|
```bash
|
|
# Apply performance profile
|
|
nvidia-oc profile apply performance
|
|
|
|
# Run ML training workload
|
|
python train.py --epochs 100 &
|
|
|
|
# Monitor in separate terminal
|
|
nvidia-oc status --watch
|
|
|
|
# Check for crashes after 24 hours
|
|
# Acceptable: 0 crashes, 0 CUDA errors
|
|
```
|
|
|
|
---
|
|
|
|
**Last Updated:** 2026-01-14
|