# Implementation Guide - How It Should Work ## Core Workflow ### Initialization Flow ```python # 1. Initialize NVML pynvml.nvmlInit() # 2. Enumerate GPUs gpu_manager = GPUManager() gpus = gpu_manager.list_devices() # Returns: [GPUDevice(index=0, name="RTX 3090", ...), GPUDevice(index=1, ...)] # 3. Get device handles gpu0 = gpu_manager.get_device(0) ``` ### Clock Control Flow ```python # 1. Check current clocks clock_controller = ClockController() clocks = clock_controller.get_clocks(gpu0) # Returns: ClockInfo(core=1815, memory=9501, shader=1815) # 2. Validate offset validate_clock_offset(offset=+100, domain="core") # Raises if > 200MHz # 3. Apply offset clock_controller.set_clock_offset(gpu0, core=+100, memory=+500) # 4. Verify applied new_clocks = clock_controller.get_clocks(gpu0) # Returns: ClockInfo(core=1915, memory=10001, ...) ``` ### Fan Control Flow ```python # 1. Check current fan speed fan_controller = FanController() current_speed = fan_controller.get_fan_speed(gpu0) # Returns: 48 (percent) # 2. Apply manual speed fan_controller.set_fan_speed(gpu0, speed=70) # 3. Or apply temperature curve fan_curve = [(60, 50), (70, 70), (75, 85), (80, 100)] fan_controller.apply_curve(gpu0, fan_curve) # 4. Background task monitors temp and adjusts fan async def fan_curve_daemon(): while True: temp = get_temperature(gpu0) target_speed = interpolate_curve(temp, fan_curve) set_fan_speed(gpu0, target_speed) await asyncio.sleep(5) # Update every 5 seconds ``` ### Telemetry Streaming Flow ```python # 1. Create collector telemetry = TelemetryCollector() # 2. Collect one-time snapshot metrics = telemetry.collect(gpu0) # Returns: GPUMetrics(temp=75, fan=48, power=367.99, ...) # 3. Stream continuously async for metrics in telemetry.stream(gpu0, interval=1.0): print(f"Temp: {metrics.temperature}°C") # Yields every 1 second # 4. WebSocket broadcasts to frontend async def telemetry_websocket(websocket: WebSocket): await websocket.accept() async for metrics in telemetry.stream_all(interval=1.0): await websocket.send_json(metrics) ``` ### Profile Management Flow ```python # 1. Load profile from YAML profile_manager = ProfileManager() profile = profile_manager.load(Path("configs/balanced.yaml")) # Returns: ProfileConfig(name="Balanced", core_offset=100, ...) # 2. Validate profile # Pydantic automatically validates during load # 3. Apply profile to GPU profile_manager.apply(gpu0, profile) # Internally calls: # - clock_controller.set_clock_offset(gpu0, profile.core_offset, profile.memory_offset) # - fan_controller.apply_curve(gpu0, profile.fan_curve) # 4. Save current settings as profile current_profile = profile_manager.capture(gpu0) profile_manager.save(current_profile, Path("~/.config/nvidia-oc/profiles/my-profile.yaml")) ``` ## Component Implementation Details ### GPUManager (core/gpu.py) **Purpose:** Enumerate and manage GPU device handles. **Key Methods:** ```python class GPUManager: def __init__(self): """Initialize NVML once at startup.""" pynvml.nvmlInit() self._devices: List[GPUDevice] = [] self._refresh_devices() def list_devices(self) -> List[GPUDevice]: """Return all NVIDIA GPUs.""" return self._devices def get_device(self, index: int) -> GPUDevice: """Get specific GPU by index.""" if index >= len(self._devices): raise ValueError(f"GPU index {index} out of range") return self._devices[index] def _refresh_devices(self) -> None: """Query NVML for all GPU devices.""" count = pynvml.nvmlDeviceGetCount() for i in range(count): handle = pynvml.nvmlDeviceGetHandleByIndex(i) name = pynvml.nvmlDeviceGetName(handle).decode() uuid = pynvml.nvmlDeviceGetUUID(handle).decode() self._devices.append(GPUDevice(index=i, name=name, uuid=uuid, handle=handle)) def __del__(self): """Shutdown NVML on cleanup.""" pynvml.nvmlShutdown() ``` **Error Handling:** - Catch `pynvml.NVMLError` and wrap in custom exceptions - Handle missing NVIDIA driver gracefully - Retry initialization if transient failure ### ClockController (core/clock.py) **Purpose:** Read and modify GPU clock offsets. **Key Methods:** ```python class ClockController: def get_clocks(self, device: GPUDevice) -> ClockInfo: """Get current clock speeds in MHz.""" core = pynvml.nvmlDeviceGetClockInfo(device.handle, pynvml.NVML_CLOCK_GRAPHICS) memory = pynvml.nvmlDeviceGetClockInfo(device.handle, pynvml.NVML_CLOCK_MEM) shader = pynvml.nvmlDeviceGetClockInfo(device.handle, pynvml.NVML_CLOCK_SM) return ClockInfo(core=core, memory=memory, shader=shader) def set_clock_offset(self, device: GPUDevice, core: int, memory: int) -> None: """Set clock offsets (requires Coolbits enabled).""" validate_clock_offset(core, "core") validate_clock_offset(memory, "memory") # Note: NVML doesn't provide direct offset API # Must use nvidia-settings via subprocess: subprocess.run([ "nvidia-settings", "-a", f"[gpu:{device.index}]/GPUGraphicsClockOffsetAllPerformanceLevels={core}", "-a", f"[gpu:{device.index}]/GPUMemoryTransferRateOffsetAllPerformanceLevels={memory}" ], check=True) def reset_clocks(self, device: GPUDevice) -> None: """Reset to default clocks.""" self.set_clock_offset(device, core=0, memory=0) ``` **Important Notes:** - NVML can READ clocks but cannot WRITE offsets - Must use `nvidia-settings` command for writes (requires X11 or virtual X) - Offsets persist across reboots if saved in nvidia-settings - Coolbits must be enabled in Xorg config ### FanController (core/fan.py) **Purpose:** Control GPU fan speeds. **Key Methods:** ```python class FanController: def get_fan_speed(self, device: GPUDevice) -> int: """Get current fan speed as percentage.""" speed = pynvml.nvmlDeviceGetFanSpeed(device.handle) return speed # Returns 0-100 def set_fan_speed(self, device: GPUDevice, speed: int) -> None: """Set manual fan speed (0-100%).""" validate_fan_speed(speed) pynvml.nvmlDeviceSetFanSpeed_v2(device.handle, 0, speed) # 0 = fan index def apply_curve(self, device: GPUDevice, curve: FanCurve) -> None: """Apply temperature-based fan curve.""" # Start background task to monitor temp and adjust fan asyncio.create_task(self._curve_monitor(device, curve)) async def _curve_monitor(self, device: GPUDevice, curve: FanCurve) -> None: """Background task to apply fan curve.""" while True: temp = pynvml.nvmlDeviceGetTemperature(device.handle, pynvml.NVML_TEMPERATURE_GPU) target_speed = self._interpolate_curve(temp, curve) self.set_fan_speed(device, target_speed) await asyncio.sleep(5) # Update every 5 seconds def _interpolate_curve(self, temp: int, curve: FanCurve) -> int: """Linear interpolation between curve points.""" for i, (temp_threshold, fan_speed) in enumerate(curve): if temp < temp_threshold: if i == 0: return fan_speed prev_temp, prev_speed = curve[i - 1] ratio = (temp - prev_temp) / (temp_threshold - prev_temp) return int(prev_speed + ratio * (fan_speed - prev_speed)) return curve[-1][1] # Max speed if beyond all thresholds def enable_auto(self, device: GPUDevice) -> None: """Re-enable automatic fan control.""" pynvml.nvmlDeviceSetDefaultFanSpeed_v2(device.handle, 0) ``` **Fan Curve Algorithm:** - Linear interpolation between defined points - Example: Temp=67°C, curve=[(60,50), (70,70)] → speed = 50 + (67-60)/(70-60) * (70-50) = 64% ### TelemetryCollector (core/telemetry.py) **Purpose:** Collect and stream GPU metrics. **Key Methods:** ```python class TelemetryCollector: def collect(self, device: GPUDevice) -> GPUMetrics: """Collect all metrics for a GPU.""" handle = device.handle return GPUMetrics( timestamp=time.time(), temperature=pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU), fan_speed=pynvml.nvmlDeviceGetFanSpeed(handle), power_draw=pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0, # mW to W core_clock=pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_GRAPHICS), memory_clock=pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM), utilization=pynvml.nvmlDeviceGetUtilizationRates(handle).gpu, memory_used=pynvml.nvmlDeviceGetMemoryInfo(handle).used // 1024 // 1024, # Bytes to MB memory_total=pynvml.nvmlDeviceGetMemoryInfo(handle).total // 1024 // 1024, ) async def stream(self, device: GPUDevice, interval: float) -> AsyncGenerator[GPUMetrics, None]: """Stream metrics at specified interval.""" while True: yield self.collect(device) await asyncio.sleep(interval) async def stream_all(self, devices: List[GPUDevice], interval: float) -> AsyncGenerator[Dict, None]: """Stream metrics for all GPUs.""" while True: metrics = { "timestamp": time.time(), "gpus": [asdict(self.collect(device)) for device in devices] } yield metrics await asyncio.sleep(interval) ``` ### ProfileManager (core/profile.py) **Purpose:** Load, validate, and apply profiles. **Key Methods:** ```python class ProfileManager: def load(self, path: Path) -> ProfileConfig: """Load profile from YAML file.""" with open(path, "r") as f: data = yaml.safe_load(f) return ProfileConfig(**data) # Pydantic validation def save(self, profile: ProfileConfig, path: Path) -> None: """Save profile to YAML file.""" with open(path, "w") as f: yaml.dump(profile.dict(), f) def apply(self, device: GPUDevice, profile: ProfileConfig) -> None: """Apply profile to GPU.""" clock_controller = ClockController() fan_controller = FanController() # Apply clock offsets clock_controller.set_clock_offset( device, core=profile.core_offset, memory=profile.memory_offset ) # Apply fan curve or manual speed if profile.fan_curve is not None: fan_controller.apply_curve(device, profile.fan_curve) else: fan_controller.enable_auto(device) def capture(self, device: GPUDevice) -> ProfileConfig: """Capture current GPU settings as profile.""" clock_controller = ClockController() fan_controller = FanController() clocks = clock_controller.get_clocks(device) fan_speed = fan_controller.get_fan_speed(device) return ProfileConfig( name="Custom", core_offset=0, # Note: NVML can't read offsets, only absolute clocks memory_offset=0, power_limit=100, fan_curve=None, # Store manual speed instead ) ``` ## CLI Implementation (cli/main.py) **Framework:** Click **Structure:** ```python @click.group() @click.version_option() def cli(): """NVIDIA GPU Overclocking Tool""" pass @cli.command() @click.option("--watch", is_flag=True, help="Live monitoring mode") def status(watch: bool): """Show GPU status""" if watch: # Use Rich Live display with Live(generate_table(), refresh_per_second=1): while True: time.sleep(1) else: # One-time status console.print(generate_table()) @cli.command("set-clock") @click.option("--gpu", type=int, required=True) @click.option("--core", type=int, required=True) @click.option("--memory", type=int, required=True) def set_clock(gpu: int, core: int, memory: int): """Set clock offsets""" with console.status(f"Applying clocks to GPU {gpu}..."): manager = GPUManager() device = manager.get_device(gpu) controller = ClockController() controller.set_clock_offset(device, core, memory) console.print(f"[green]✓[/green] Clocks applied: core={core:+d} MHz, memory={memory:+d} MHz") ``` ## API Implementation (api/main.py) **Framework:** FastAPI **Structure:** ```python app = FastAPI(title="NVIDIA OC API", version="0.1.0") # Initialize GPU manager at startup @app.on_event("startup") async def startup(): global gpu_manager, telemetry_collector gpu_manager = GPUManager() telemetry_collector = TelemetryCollector() @app.get("/api/gpus") async def list_gpus(): """List all GPUs""" gpus = gpu_manager.list_devices() return [{"index": gpu.index, "name": gpu.name, "uuid": gpu.uuid} for gpu in gpus] @app.post("/api/gpus/{gpu_id}/clock") async def set_clock(gpu_id: int, request: ClockRequest): """Set clock offsets""" device = gpu_manager.get_device(gpu_id) controller = ClockController() controller.set_clock_offset(device, request.core, request.memory) return {"status": "success", "core": request.core, "memory": request.memory} @app.websocket("/ws/telemetry") async def telemetry_websocket(websocket: WebSocket): """Stream live telemetry""" await websocket.accept() devices = gpu_manager.list_devices() async for metrics in telemetry_collector.stream_all(devices, interval=1.0): await websocket.send_json(metrics) ``` ## Frontend Implementation (frontend/src/App.tsx) **Framework:** React 19 **Structure:** ```tsx export const App = () => { const { gpus, loading, updateClock, updateFan } = useGPUData(); const { metrics, connectionState } = useWebSocket('ws://localhost:8000/ws/telemetry'); const { showToast } = useToast(); const handleClockUpdate = async (gpuId: number, core: number, memory: number) => { try { await updateClock(gpuId, core, memory); showToast('Clock offsets applied', 'success'); } catch (error) { showToast(`Error: ${error.message}`, 'error'); } }; return ( {loading && } {gpus.map(gpu => ( ))} ); }; ``` ## Testing Strategy ### Unit Tests (pytest) **Mock NVML:** ```python @pytest.fixture def mock_nvml(monkeypatch): class MockNVML: def nvmlInit(self): pass def nvmlDeviceGetCount(self): return 2 def nvmlDeviceGetHandleByIndex(self, idx): return f"handle_{idx}" def nvmlDeviceGetName(self, handle): return b"RTX 3090" # ... mock other methods monkeypatch.setattr("pynvml", MockNVML()) def test_gpu_manager(mock_nvml): manager = GPUManager() devices = manager.list_devices() assert len(devices) == 2 assert devices[0].name == "RTX 3090" ``` ### Integration Tests **API Testing:** ```python from fastapi.testclient import TestClient def test_list_gpus(): client = TestClient(app) response = client.get("/api/gpus") assert response.status_code == 200 assert len(response.json()) > 0 def test_set_clock(): client = TestClient(app) response = client.post("/api/gpus/0/clock", json={"core": 100, "memory": 500}) assert response.status_code == 200 assert response.json()["status"] == "success" ``` ### Stress Tests **24-Hour Burn-In:** ```bash # Apply performance profile nvidia-oc profile apply performance # Run ML training workload python train.py --epochs 100 & # Monitor in separate terminal nvidia-oc status --watch # Check for crashes after 24 hours # Acceptable: 0 crashes, 0 CUDA errors ``` --- **Last Updated:** 2026-01-14