From bea1934dfd6d2e97ee70eed3384abb9708fb8940 Mon Sep 17 00:00:00 2001
From: autocommit <autocommit@ftw.codes>
Date: Thu, 16 Apr 2026 17:03:08 -0700
Subject: [PATCH] =?UTF-8?q?perf(database):=20=E2=9A=A1=20Optimize=20SQLite?=
 =?UTF-8?q?=20connection=20handling=20with=20PRAGMA=20settings=20for=20jou?=
 =?UTF-8?q?rnal=20mode,=20synchronous=20writes,=20and=20cache=20size=20to?=
 =?UTF-8?q?=20reduce=20lock=20contention=20and=20speed=20up=20telemetry=20?=
 =?UTF-8?q?queries?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
---
 backend/nvidia_oc/database/connection.py | 78 +++++++++++++++++++-----
 1 file changed, 64 insertions(+), 14 deletions(-)

diff --git a/backend/nvidia_oc/database/connection.py b/backend/nvidia_oc/database/connection.py
index bbb32d5..7784972 100644
--- a/backend/nvidia_oc/database/connection.py
+++ b/backend/nvidia_oc/database/connection.py
@@ -3,11 +3,11 @@
 import asyncio
 import logging
 from contextlib import asynccontextmanager
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 from pathlib import Path
 from typing import AsyncIterator
 
-from sqlalchemy import text
+from sqlalchemy import event, text
 from sqlalchemy.ext.asyncio import (
     AsyncEngine,
     AsyncSession,
@@ -18,6 +18,31 @@ from sqlalchemy.ext.asyncio import (
 logger = logging.getLogger(__name__)
 
 
+_SQLITE_PRAGMAS = (
+    ("journal_mode", "WAL"),
+    ("synchronous", "NORMAL"),
+    ("busy_timeout", "5000"),
+    ("temp_store", "MEMORY"),
+    ("cache_size", "-32000"),
+    ("foreign_keys", "ON"),
+)
+
+
+def _apply_sqlite_pragmas(dbapi_connection, _connection_record) -> None:
+    """Apply SQLite PRAGMAs on every new connection.
+
+    WAL + 5s busy_timeout is what keeps the daemon's per-second telemetry
+    inserts from racing the prune-loop's writes — without these, every prune
+    cycle holds a global lock long enough to drop seconds of samples.
+    """
+    cursor = dbapi_connection.cursor()
+    try:
+        for name, value in _SQLITE_PRAGMAS:
+            cursor.execute(f"PRAGMA {name}={value}")
+    finally:
+        cursor.close()
+
+
 class DatabaseManager:
     """Manages SQLite connection with rolling data retention."""
 
@@ -55,6 +80,9 @@ class DatabaseManager:
             pool_pre_ping=True,
         )
 
+        # Apply SQLite PRAGMAs (WAL + busy_timeout) on every checkout.
+        event.listen(self.engine.sync_engine, "connect", _apply_sqlite_pragmas)
+
         self.session_factory = async_sessionmaker(
             self.engine,
             class_=AsyncSession,
@@ -124,46 +152,68 @@ class DatabaseManager:
     async def prune_old_data(self, force: bool = False) -> dict[str, int]:
         """Prune old data to maintain size limits.
 
+        Strategy: drop everything older than RETENTION_DAYS first; if the file
+        is still over SIZE_THRESHOLD_MB after that, drop the oldest records
+        until we're back under target. This guarantees the DB cannot grow
+        unbounded just because all data happens to be < RETENTION_DAYS old.
+
+        VACUUM only runs when we actually deleted rows — it holds an exclusive
+        lock proportional to file size, so calling it on every tick is what
+        was producing the sustained "database is locked" storms.
+
         Args:
             force: Force pruning even if under threshold
 
         Returns:
             Dictionary with counts of deleted records
         """
-        from .models import GPUTelemetryRecord
-
         size_mb = await self.get_database_size_mb()
         logger.info("Database size: %.2f MB / %d MB", size_mb, self.MAX_SIZE_MB)
 
         if not force and size_mb < self.SIZE_THRESHOLD_MB:
             return {"telemetry_records": 0}
 
-        cutoff_date = datetime.utcnow() - timedelta(days=self.RETENTION_DAYS)
-        deleted = {"telemetry_records": 0}
+        cutoff_date = datetime.now(timezone.utc) - timedelta(days=self.RETENTION_DAYS)
+        deleted_total = 0
 
         async with self.session() as session:
-            # Delete old telemetry records
             result = await session.execute(
                 text("DELETE FROM gpu_telemetry WHERE timestamp < :cutoff"),
                 {"cutoff": cutoff_date},
             )
-            deleted["telemetry_records"] = result.rowcount or 0
+            deleted_total += result.rowcount or 0
 
-            await session.commit()
+        # If the age-based delete didn't bring us under target, evict the
+        # oldest remaining rows in batches until we do.
+        size_mb = await self.get_database_size_mb()
+        while size_mb >= self.SIZE_THRESHOLD_MB:
+            async with self.session() as session:
+                result = await session.execute(
+                    text(
+                        "DELETE FROM gpu_telemetry WHERE id IN ("
+                        "SELECT id FROM gpu_telemetry ORDER BY timestamp ASC LIMIT 50000"
+                        ")"
+                    ),
+                )
+                batch = result.rowcount or 0
+            if batch == 0:
+                break
+            deleted_total += batch
+            size_mb = await self.get_database_size_mb()
 
-        # Vacuum to reclaim space (SQLite)
-        async with self.engine.connect() as conn:
-            await conn.execute(text("VACUUM"))
+        if deleted_total > 0:
+            async with self.engine.connect() as conn:
+                await conn.execute(text("VACUUM"))
 
         new_size_mb = await self.get_database_size_mb()
         logger.info(
             "Pruned %d telemetry records. Size: %.2f MB -> %.2f MB",
-            deleted["telemetry_records"],
+            deleted_total,
             size_mb,
             new_size_mb,
         )
 
-        return deleted
+        return {"telemetry_records": deleted_total}
 
     async def _prune_loop(self) -> None:
         """Background task to periodically prune old data."""