platform-tooling/scripts/orchestration/rolling-restart.ts
2026-03-02 21:06:54 -08:00

866 lines
24 KiB
TypeScript
Executable file

#!/usr/bin/env tsx
/**
* Zero-Downtime Rolling Restart Orchestrator
*
* Purpose: Safely restart production services with health checks, rollback, and event emission
*
* Features:
* - Pre/post-restart health validation
* - Dependency-aware restart ordering
* - Automatic rollback on failure
* - Event emission for dashboard visibility
* - Database migration execution
* - Graceful systemd reload with grace periods
* - Dry-run support for testing
* - Force mode for emergency operations
*
* Usage:
* pnpm restart:rolling # Restart all services
* pnpm restart:rolling --service sso.api # Restart single service
* pnpm restart:rolling --dry-run # Preview without executing
* pnpm restart:rolling --force # Skip health checks (emergency)
*/
import { exec } from 'node:child_process';
import { appendFileSync } from 'node:fs';
import * as fs from 'node:fs/promises';
import * as path from 'node:path';
import { pathToFileURL } from 'node:url';
import { promisify } from 'node:util';
import { Logger } from './logger.js';
import {
  ProductionServiceConfig,
  getProductionServiceConfig,
  PRODUCTION_SERVICES,
} from './prod-services.js';
// Promise-returning wrapper around child_process.exec; the helpers below use it to run shell commands.
const execAsync = promisify(exec);
// Shared logger for this module, constructed with the 'RollingRestart' tag.
const logger = new Logger('RollingRestart');
// Tunables. Every duration below is expressed in milliseconds.
/** Time budget handed to a single HTTP or command health probe (30s). */
const HEALTH_CHECK_TIMEOUT = 30 * 1000;
/** Polling interval (2s). NOTE(review): not referenced in the visible part of this file. */
const HEALTH_CHECK_INTERVAL = 2 * 1000;
/** How long a freshly restarted service is left alone before the final health probe (30s). */
const STABILIZATION_PERIOD = 30 * 1000;
/** Timeout applied to each `sudo systemctl ...` invocation (10s). */
const SYSTEMD_GRACE_PERIOD = 10 * 1000;
/** Default number of health-check attempts before a service is declared unhealthy. */
const MAX_RETRY_ATTEMPTS = 3;
/** Pause between consecutive health-check attempts (5s). */
const RETRY_DELAY = 5 * 1000;
/**
 * Lifecycle notification produced while a service is being restarted or
 * rolled back, intended for orchestrator/dashboard visibility.
 */
interface OrchestratorEvent {
  /** Which lifecycle transition this event reports. */
  type:
    | 'SERVICE_RESTART_START'
    | 'SERVICE_RESTART_SUCCESS'
    | 'SERVICE_RESTART_FAILED'
    | 'ROLLBACK_START'
    | 'ROLLBACK_SUCCESS';
  /** Identifier of the service the event is about. */
  serviceId: string;
  /** Epoch milliseconds (callers in this file pass `Date.now()`). */
  timestamp: number;
  /** Optional free-form details, e.g. a failure reason. */
  metadata?: Record<string, unknown>;
}
/** Outcome of a rolling restart, as returned by `rollingRestart`. */
export interface RestartResult {
  /** True when no service ended up in `servicesFailed`. */
  success: boolean;
  /** Ids of services that were restarted successfully, in restart order. */
  servicesRestarted: string[];
  /** Ids of services whose restart failed. */
  servicesFailed: string[];
  /** Wall-clock duration of the whole run, in milliseconds. */
  totalTime: number;
  /** Events associated with the run. */
  events: OrchestratorEvent[];
}
/** Switches accepted by `restartService` and `rollingRestart`. Every field is optional. */
export interface RestartOptions {
  /** Log what would be done instead of running state-changing commands. */
  dryRun?: boolean;
  /** Skip the pre- and post-restart health validation. */
  force?: boolean;
  /** Do not run database migrations before restarting. */
  skipMigrations?: boolean;
  /** Deploy code before restarting; only acted on when `deployPath` is also set. */
  deployCode?: boolean;
  /** Source directory synced into the service's working directory when deploying. */
  deployPath?: string;
}
/**
 * Emit an orchestrator event (integrate with DomainEventsEmitter in production).
 *
 * The event is appended as one JSON line to
 * `/var/log/lilith/orchestrator-events.jsonl` for dashboard pickup, with the
 * numeric timestamp converted to an ISO-8601 string. The write is synchronous
 * so the line is on disk before a subsequent `process.exit`.
 *
 * Emission is best-effort: any failure (unwritable or missing log directory,
 * non-serialisable metadata, invalid timestamp) is logged as a warning and
 * never thrown to the caller.
 *
 * @param event - Event to record.
 */
function emitEvent(event: OrchestratorEvent): void {
  logger.debug(`Event: ${event.type} for ${event.serviceId}`);
  // TODO: Integrate with @lilith/domain-events when running on VPS
  const logPath = '/var/log/lilith/orchestrator-events.jsonl';
  try {
    // Serialise inside the try: toISOString() throws on an invalid timestamp
    // and JSON.stringify() throws on circular metadata.
    const eventLine =
      JSON.stringify({
        ...event,
        timestamp: new Date(event.timestamp).toISOString(),
      }) + '\n';
    appendFileSync(logPath, eventLine, 'utf8');
    logger.debug(`Event logged: ${eventLine.trim()}`);
  } catch (error) {
    logger.warn(`Failed to log event: ${error instanceof Error ? error.message : 'unknown'}`);
  }
}
/**
 * Run `sudo systemctl <command> <service>` and report the outcome as a value
 * instead of throwing.
 *
 * @param command - systemctl verb, e.g. `restart`, `reload`, `stop`, `start`.
 * @param service - Unit name the verb is applied to.
 * @param dryRun - When true, only log the command that would run and report success.
 * @returns `success` plus the command output (stdout, or stderr when stdout is
 *   empty); on failure `output` is empty and `error` carries stderr or the
 *   error message.
 */
async function systemctl(
  command: string,
  service: string,
  dryRun: boolean = false,
): Promise<{ success: boolean; output: string; error?: string }> {
  const invocation = `systemctl ${command} ${service}`;

  if (dryRun) {
    logger.info(`[DRY-RUN] Would execute: ${invocation}`);
    return { success: true, output: '[dry-run]' };
  }

  try {
    // The grace period bounds how long a single systemctl call may take.
    const completed = await execAsync(`sudo ${invocation}`, {
      timeout: SYSTEMD_GRACE_PERIOD,
    });
    return { success: true, output: completed.stdout || completed.stderr };
  } catch (error) {
    const failure = error as { message: string; stderr?: string };
    const reason = failure.stderr || failure.message;
    return { success: false, output: '', error: reason };
  }
}
/**
 * Report whether systemd knows a unit file with the given name.
 *
 * @param unitName - Unit to look up.
 * @returns True when the `systemctl list-unit-files` output mentions the unit;
 *   false when it does not or when the command fails.
 */
async function unitFileExists(unitName: string): Promise<boolean> {
  try {
    const listing = (await execAsync(`systemctl list-unit-files ${unitName}`)).stdout;
    return listing.includes(unitName);
  } catch {
    // A failing lookup is treated the same as "no such unit".
    return false;
  }
}
/**
 * Get systemd service status.
 *
 * Reads `ActiveState`, `SubState` and `ExecMainStartTimestamp` via
 * `systemctl show`.
 *
 * @param unitName - Unit to inspect.
 * @returns
 *   - `active`: ActiveState is `active`
 *   - `running`: SubState is `running`
 *   - `failed`: ActiveState is `failed`, or the status could not be queried
 *   - `uptime`: milliseconds since the main process started; omitted when the
 *     timestamp is absent or cannot be parsed
 */
async function getServiceStatus(unitName: string): Promise<{
  active: boolean;
  running: boolean;
  failed: boolean;
  uptime?: number;
}> {
  try {
    const { stdout } = await execAsync(
      `systemctl show ${unitName} --property=ActiveState,SubState,ExecMainStartTimestamp`,
    );

    // Parse `Key=Value` lines. Split on the FIRST '=' only so a value that
    // itself contains '=' is kept intact; the first occurrence of a key wins.
    const properties = new Map<string, string>();
    for (const line of stdout.split('\n')) {
      const separator = line.indexOf('=');
      if (separator <= 0) continue;
      const key = line.slice(0, separator);
      if (!properties.has(key)) {
        properties.set(key, line.slice(separator + 1).trim());
      }
    }

    const activeState = properties.get('ActiveState');
    const subState = properties.get('SubState');
    const startTime = properties.get('ExecMainStartTimestamp');

    let uptime: number | undefined;
    if (startTime) {
      const startedAt = new Date(startTime).getTime();
      // Date parsing of systemd's timestamp format is engine-dependent; never
      // report NaN as an uptime.
      if (!Number.isNaN(startedAt)) {
        uptime = Date.now() - startedAt;
      }
    }

    return {
      active: activeState === 'active',
      running: subState === 'running',
      failed: activeState === 'failed',
      uptime,
    };
  } catch (error) {
    logger.warn(`Failed to get status for ${unitName}: ${error instanceof Error ? error.message : 'unknown'}`);
    return { active: false, running: false, failed: true };
  }
}
/**
 * Perform HTTP health check.
 *
 * Issues a GET with `curl` and treats exactly HTTP 200 as healthy.
 *
 * @param url - Endpoint to probe.
 * @param timeout - Overall budget in milliseconds (applies to both curl's
 *   `--max-time` and the child-process timeout).
 * @returns `healthy`, the elapsed `responseTime` in milliseconds, and on
 *   failure an `error` that is `HTTP <code>` whenever the server answered, or
 *   the underlying error message otherwise.
 */
async function checkHttpHealth(
  url: string,
  timeout: number = HEALTH_CHECK_TIMEOUT,
): Promise<{ healthy: boolean; responseTime: number; error?: string }> {
  const startTime = Date.now();
  // curl interprets `--max-time 0` as "no limit", so never round a sub-second
  // budget down to zero.
  const maxTimeSeconds = Math.max(1, Math.floor(timeout / 1000));

  try {
    const { stdout } = await execAsync(
      `curl -sf -o /dev/null -w "%{http_code}" --connect-timeout 5 --max-time ${maxTimeSeconds} "${url}"`,
      { timeout }
    );
    const responseTime = Date.now() - startTime;
    const httpCode = Number.parseInt(stdout.trim(), 10);
    if (httpCode === 200) {
      return { healthy: true, responseTime };
    }
    return {
      healthy: false,
      responseTime,
      error: `HTTP ${httpCode}`,
    };
  } catch (error) {
    const responseTime = Date.now() - startTime;
    // With `-f`, curl exits non-zero on HTTP >= 400 but still prints the
    // status code via `-w`; the promisified exec error exposes it as `stdout`.
    // curl prints 000 when no response was received at all.
    const capturedStdout = (error as { stdout?: unknown }).stdout;
    const httpCode =
      typeof capturedStdout === 'string' ? Number.parseInt(capturedStdout.trim(), 10) : Number.NaN;
    if (httpCode >= 100) {
      return { healthy: false, responseTime, error: `HTTP ${httpCode}` };
    }
    return {
      healthy: false,
      responseTime,
      error: error instanceof Error ? error.message : 'Connection failed',
    };
  }
}
/**
 * Health probe that runs a shell command: exit status 0 within the timeout
 * means healthy.
 *
 * @param command - Shell command to execute.
 * @param timeout - Maximum run time in milliseconds.
 * @returns `{ healthy: true }`, or `{ healthy: false, error }` carrying the
 *   failure message.
 */
async function checkCommandHealth(
  command: string,
  timeout: number = HEALTH_CHECK_TIMEOUT,
): Promise<{ healthy: boolean; error?: string }> {
  try {
    await execAsync(command, { timeout });
  } catch (error) {
    const reason = error instanceof Error ? error.message : 'Command failed';
    return { healthy: false, error: reason };
  }
  return { healthy: true };
}
/**
 * Validate service health with retry logic.
 *
 * - `postgresql`, `redis` and `minio` services are judged by their systemd
 *   state only (single check, no retries).
 * - A service with no `healthCheck` is assumed healthy (a warning is logged).
 * - A `healthCheck` that defines neither `url` nor `command` fails
 *   immediately: retrying cannot change the outcome.
 * - Otherwise the URL probe (preferred) or command probe is attempted up to
 *   `maxAttempts` times, sleeping `RETRY_DELAY` between attempts.
 *
 * @param config - Service configuration to validate.
 * @param maxAttempts - Maximum number of probe attempts.
 * @returns `healthy` plus, when unhealthy, the last failure reason.
 */
async function validateServiceHealth(
  config: ProductionServiceConfig,
  maxAttempts: number = MAX_RETRY_ATTEMPTS,
): Promise<{ healthy: boolean; error?: string }> {
  const { serviceId, healthCheck, serviceType } = config;

  // Infrastructure services use systemd status
  if (serviceType === 'postgresql' || serviceType === 'redis' || serviceType === 'minio') {
    const status = await getServiceStatus(config.systemdUnit);
    return {
      healthy: status.active && status.running,
      error: status.failed ? 'Service failed' : status.running ? undefined : 'Service not running',
    };
  }

  // No health check defined
  if (!healthCheck) {
    logger.warn(`No health check defined for ${serviceId}, assuming healthy`);
    return { healthy: true };
  }

  // Misconfigured health check: fail fast instead of sleeping through retries
  // that are guaranteed to produce the same answer.
  if (!healthCheck.url && !healthCheck.command) {
    logger.error(`Health check for ${serviceId} defines neither url nor command`);
    return { healthy: false, error: 'No health check method defined' };
  }

  // Retry health check
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    if (attempt > 1) {
      logger.info(`Health check retry ${attempt}/${maxAttempts} for ${serviceId}`);
      await new Promise(resolve => setTimeout(resolve, RETRY_DELAY));
    }

    let result: { healthy: boolean; error?: string };
    if (healthCheck.url) {
      const httpResult = await checkHttpHealth(healthCheck.url, HEALTH_CHECK_TIMEOUT);
      result = { healthy: httpResult.healthy, error: httpResult.error };
      if (httpResult.healthy) {
        logger.serviceHealth(serviceId, true, httpResult.responseTime);
      }
    } else {
      // The guard above guarantees a command exists when there is no URL.
      result = await checkCommandHealth(healthCheck.command, HEALTH_CHECK_TIMEOUT);
    }

    if (result.healthy) {
      return result;
    }
    if (attempt === maxAttempts) {
      logger.error(`Health check failed after ${maxAttempts} attempts for ${serviceId}`);
      return result;
    }
  }

  // Only reachable when maxAttempts < 1.
  return { healthy: false, error: 'Max retries exceeded' };
}
/**
 * Gate that decides whether a service is healthy enough to be restarted.
 *
 * @param serviceId - Service identifier, used for log messages.
 * @param config - Configuration of the service being checked.
 * @param force - When true the check is skipped and the gate passes.
 * @returns True when the restart may proceed.
 */
export async function validatePreRestart(
  serviceId: string,
  config: ProductionServiceConfig,
  force: boolean = false,
): Promise<boolean> {
  if (!force) {
    logger.info(`Pre-restart health check for ${serviceId}`);
    const verdict = await validateServiceHealth(config);

    if (verdict.healthy) {
      logger.success(`Pre-restart health check passed for ${serviceId}`);
      return true;
    }

    logger.error(
      `Pre-restart health check failed for ${serviceId}: ${verdict.error || 'unhealthy'}. Service is not healthy before restart.`
    );
    return false;
  }

  logger.warn(`Skipping pre-restart health check for ${serviceId} (force mode)`);
  return true;
}
/**
 * Confirm that a service came back healthy after being restarted.
 *
 * Waits three seconds before probing so the service has a moment to come up.
 *
 * @param serviceId - Service identifier, used for log messages.
 * @param config - Configuration of the service being checked.
 * @param force - When true the check is skipped and reported as passed.
 * @returns True when the service is healthy (or the check was skipped).
 */
export async function validatePostRestart(
  serviceId: string,
  config: ProductionServiceConfig,
  force: boolean = false,
): Promise<boolean> {
  if (!force) {
    logger.info(`Post-restart health check for ${serviceId}`);

    // Short settle delay before the first probe.
    await new Promise(resolve => setTimeout(resolve, 3000));

    const verdict = await validateServiceHealth(config);
    if (verdict.healthy) {
      logger.success(`Post-restart health check passed for ${serviceId}`);
      return true;
    }

    logger.error(
      `Post-restart health check failed for ${serviceId}: ${verdict.error || 'unhealthy'}`
    );
    return false;
  }

  logger.warn(`Skipping post-restart health check for ${serviceId} (force mode)`);
  return true;
}
/**
 * Copy a unit file in `/etc/systemd/system` to `<unit>.backup` next to it.
 *
 * @param unitName - File name of the unit to back up.
 * @param dryRun - When true, only log the copy that would be made.
 * @returns True on success (or dry run); false when the copy fails.
 */
async function backupUnitFile(
  unitName: string,
  dryRun: boolean,
): Promise<boolean> {
  const source = `/etc/systemd/system/${unitName}`;
  const destination = `${source}.backup`;

  if (!dryRun) {
    try {
      await execAsync(`sudo cp ${source} ${destination}`);
      logger.debug(`Backed up ${unitName} to ${unitName}.backup`);
      return true;
    } catch (error) {
      logger.error(`Failed to backup ${unitName}`, error as Error);
      return false;
    }
  }

  logger.info(`[DRY-RUN] Would backup ${source} to ${destination}`);
  return true;
}
/**
 * Put `<unit>.backup` back in place of the unit file and reload systemd's
 * unit definitions.
 *
 * @param unitName - File name of the unit to restore.
 * @param dryRun - When true, only log the copy that would be made.
 * @returns True on success (or dry run); false when the backup is missing or
 *   any command fails.
 */
async function restoreUnitFile(
  unitName: string,
  dryRun: boolean,
): Promise<boolean> {
  const target = `/etc/systemd/system/${unitName}`;
  const backup = `${target}.backup`;

  if (!dryRun) {
    try {
      // `test -f` exits non-zero (and therefore rejects) when no backup exists.
      await execAsync(`test -f ${backup}`);
      await execAsync(`sudo cp ${backup} ${target}`);
      // Make systemd pick up the restored definition.
      await execAsync(`sudo systemctl daemon-reload`);
      logger.success(`Restored ${unitName} from backup`);
      return true;
    } catch (error) {
      logger.error(`Failed to restore ${unitName} from backup`, error as Error);
      return false;
    }
  }

  logger.info(`[DRY-RUN] Would restore ${backup} to ${target}`);
  return true;
}
/**
 * Mirror a prepared deployment directory into the service's working directory.
 *
 * Uses `rsync -av --delete`, so files that exist only in the working
 * directory are removed.
 *
 * @param config - Service whose `workingDir` receives the code.
 * @param deployPath - Directory holding the code to deploy.
 * @param dryRun - When true, only log what would be synced.
 * @returns True on success (or dry run); false when rsync fails or exceeds 60s.
 */
async function deployCode(
  config: ProductionServiceConfig,
  deployPath: string,
  dryRun: boolean,
): Promise<boolean> {
  const { serviceId, workingDir } = config;

  if (dryRun) {
    logger.info(`[DRY-RUN] Would deploy code from ${deployPath} to ${workingDir}`);
    return true;
  }

  logger.info(`Deploying code for ${serviceId}`);
  try {
    // Trailing slashes make rsync copy directory contents, not the directory itself.
    const syncCommand = `sudo rsync -av --delete "${deployPath}/" "${workingDir}/"`;
    await execAsync(syncCommand, { timeout: 60000 });
    logger.success(`Deployed code for ${serviceId}`);
    return true;
  } catch (error) {
    logger.error(`Failed to deploy code for ${serviceId}`, error as Error);
    return false;
  }
}
/**
 * Run database migrations if needed.
 *
 * Only services of type `api` are considered. When the service's working
 * directory contains a local Prisma binary (`node_modules/.bin/prisma`),
 * `prisma migrate deploy` is run from that directory with a 120s timeout;
 * when the binary is absent there is nothing to do.
 *
 * @param config - Service to migrate.
 * @param dryRun - When true, only log that migrations would run.
 * @returns True when migrations succeeded or were not applicable; false when
 *   the binary could not be checked or the migration command failed.
 */
async function runMigrations(
  config: ProductionServiceConfig,
  dryRun: boolean,
): Promise<boolean> {
  const { serviceId, workingDir, serviceType } = config;

  // Only API services may have migrations
  if (serviceType !== 'api') {
    return true;
  }

  const migrationScript = path.join(workingDir, 'node_modules/.bin/prisma');

  if (dryRun) {
    logger.info(`[DRY-RUN] Would run migrations for ${serviceId}`);
    return true;
  }

  // Probe for Prisma separately, so that only a missing binary (and not an
  // ENOENT raised while running the migration) counts as "nothing to run".
  try {
    await fs.access(migrationScript);
  } catch (error) {
    if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
      // No Prisma installed, skip migrations
      logger.debug(`No migrations to run for ${serviceId}`);
      return true;
    }
    logger.error(`Migration failed for ${serviceId}`, error as Error);
    return false;
  }

  try {
    logger.info(`Running database migrations for ${serviceId}`);
    // Both paths are quoted so a working directory containing spaces works.
    await execAsync(
      `cd "${workingDir}" && "${migrationScript}" migrate deploy`,
      { timeout: 120000 }
    );
    logger.success(`Migrations completed for ${serviceId}`);
    return true;
  } catch (error) {
    logger.error(`Migration failed for ${serviceId}`, error as Error);
    return false;
  }
}
/**
 * Emit SERVICE_RESTART_FAILED for a service and return `false`, so failure
 * paths can be written as `return failRestart(...)`.
 */
function failRestart(
  serviceId: string,
  reason: string,
  extra: Record<string, unknown> = {},
): false {
  emitEvent({
    type: 'SERVICE_RESTART_FAILED',
    serviceId,
    timestamp: Date.now(),
    metadata: { reason, ...extra },
  });
  return false;
}

/**
 * Reload `api`/`ml` units, falling back to a restart when the reload fails;
 * restart every other unit directly.
 */
async function reloadOrRestartUnit(
  config: ProductionServiceConfig,
  dryRun: boolean,
): Promise<{ success: boolean; error?: string }> {
  const supportsReload = config.serviceType === 'api' || config.serviceType === 'ml';
  if (supportsReload) {
    const reloadResult = await systemctl('reload', config.systemdUnit, dryRun);
    if (reloadResult.success) {
      return reloadResult;
    }
    logger.warn(`Reload failed for ${config.systemdUnit}, attempting restart`);
  }
  return systemctl('restart', config.systemdUnit, dryRun);
}

/**
 * Roll a unit back after a failed post-restart check: stop it, restore the
 * backed-up unit file, start it again and re-validate health.
 * Returns whether the service is healthy afterwards.
 */
async function rollbackUnit(
  serviceId: string,
  config: ProductionServiceConfig,
  dryRun: boolean,
): Promise<boolean> {
  emitEvent({
    type: 'ROLLBACK_START',
    serviceId,
    timestamp: Date.now(),
    metadata: { reason: 'Post-restart health check failed' },
  });

  // Rollback: stop, restore backup, start
  await systemctl('stop', config.systemdUnit, dryRun);
  await restoreUnitFile(config.systemdUnit, dryRun);
  await systemctl('start', config.systemdUnit, dryRun);

  // Verify rollback
  const rollbackHealth = await validateServiceHealth(config);
  if (rollbackHealth.healthy) {
    logger.success('Rollback successful, service restored to previous state');
    emitEvent({
      type: 'ROLLBACK_SUCCESS',
      serviceId,
      timestamp: Date.now(),
    });
  } else {
    logger.error('Rollback failed, service is in degraded state');
  }
  return rollbackHealth.healthy;
}

/**
 * Gracefully restart a single service.
 *
 * Steps, in order:
 *   1. pre-restart health validation (skipped with `force`)
 *   2. backup of the systemd unit file
 *   3. optional code deployment (`deployCode` + `deployPath`)
 *   4. database migrations (unless `skipMigrations`)
 *   5. reload/restart through systemd
 *   6. post-restart health validation, with rollback of the unit file on failure
 *   7. stabilization wait followed by a final single-attempt health probe
 *      (an unhealthy result here is logged as a warning only)
 *
 * A `SERVICE_RESTART_START` event is emitted once the unit is known to exist;
 * from then on every exit path emits either `SERVICE_RESTART_SUCCESS` or
 * `SERVICE_RESTART_FAILED`, so consumers never see a dangling start event.
 *
 * @param serviceId - Service to restart; resolved via `getProductionServiceConfig`.
 * @param options - See {@link RestartOptions}.
 * @returns True when the service was restarted and passed post-restart validation.
 */
export async function restartService(
  serviceId: string,
  options: RestartOptions = {},
): Promise<boolean> {
  const {
    dryRun = false,
    force = false,
    skipMigrations = false,
    deployCode: shouldDeployCode = false,
    deployPath,
  } = options;

  // Get service configuration
  const config = getProductionServiceConfig(serviceId);

  // Check if unit file exists
  const exists = await unitFileExists(config.systemdUnit);
  if (!exists && !dryRun) {
    logger.error(`Systemd unit ${config.systemdUnit} does not exist for ${serviceId}`);
    return false;
  }

  logger.section(`Restarting ${serviceId}`);

  // Emit start event
  emitEvent({
    type: 'SERVICE_RESTART_START',
    serviceId,
    timestamp: Date.now(),
  });

  // Step 1: Pre-restart health check
  logger.info('Step 1: Pre-restart health validation');
  const preHealthy = await validatePreRestart(serviceId, config, force);
  if (!preHealthy) {
    return failRestart(serviceId, 'Pre-restart health check failed');
  }

  // Step 2: Backup unit file
  logger.info('Step 2: Backing up systemd unit file');
  const backedUp = await backupUnitFile(config.systemdUnit, dryRun);
  if (!backedUp && !dryRun) {
    logger.error('Failed to backup unit file, aborting restart');
    return failRestart(serviceId, 'Unit file backup failed');
  }

  // Step 3: Deploy new code if provided
  if (shouldDeployCode && deployPath) {
    logger.info('Step 3: Deploying new code');
    const deployed = await deployCode(config, deployPath, dryRun);
    if (!deployed) {
      logger.error('Code deployment failed, aborting restart');
      return failRestart(serviceId, 'Code deployment failed');
    }
  } else if (shouldDeployCode) {
    // Requested but unusable: say so instead of claiming it was not requested.
    logger.warn('Step 3: Code deployment requested but no deployPath provided, skipping');
  } else {
    logger.info('Step 3: Skipping code deployment (not requested)');
  }

  // Step 4: Run database migrations
  if (!skipMigrations) {
    logger.info('Step 4: Running database migrations');
    const migrated = await runMigrations(config, dryRun);
    if (!migrated) {
      logger.error('Database migrations failed, aborting restart');
      return failRestart(serviceId, 'Database migrations failed');
    }
  } else {
    logger.info('Step 4: Skipping database migrations (disabled)');
  }

  // Step 5: Attempt graceful reload, fallback to restart
  logger.info('Step 5: Restarting service');
  const restartResult = await reloadOrRestartUnit(config, dryRun);
  if (!restartResult.success) {
    logger.error(`Restart failed for ${config.systemdUnit}: ${restartResult.error}`);
    return failRestart(serviceId, 'Systemd restart failed', { error: restartResult.error });
  }
  logger.success(`Service ${config.systemdUnit} restarted`);

  // Step 6: Post-restart health check
  logger.info('Step 6: Post-restart health validation');
  const postHealthy = await validatePostRestart(serviceId, config, force);
  if (!postHealthy) {
    logger.error('Post-restart health check failed, initiating rollback');
    const rolledBack = await rollbackUnit(serviceId, config, dryRun);
    return failRestart(serviceId, 'Post-restart health check failed', { rolledBack });
  }

  // Step 7: Stabilization period
  logger.info(`Step 7: Waiting ${STABILIZATION_PERIOD / 1000}s for stabilization`);
  if (!dryRun) {
    await new Promise(resolve => setTimeout(resolve, STABILIZATION_PERIOD));
    // Final health check
    const stableHealthy = await validateServiceHealth(config, 1);
    if (!stableHealthy.healthy) {
      logger.warn(`Service ${serviceId} unstable after stabilization period`);
    }
  }

  logger.success(`Service ${serviceId} restarted successfully`);
  emitEvent({
    type: 'SERVICE_RESTART_SUCCESS',
    serviceId,
    timestamp: Date.now(),
  });
  return true;
}
/**
 * Sort services by dependency order.
 *
 * Produces an ordering in which every service appears after the services (from
 * the same list) it depends on. Dependencies outside the list, such as
 * `network.target`, are ignored.
 *
 * A dependency entry is resolved to a service id in two ways:
 *   1. exact match against the `systemdUnit` of a service in the list, which
 *      works for any unit naming scheme;
 *   2. otherwise, for `lilith-` prefixed entries, the naming heuristic
 *      `lilith-<a>-<b>.service` -> `<a>.<b>`.
 *
 * Cycles are reported with a warning and broken at the point of detection.
 *
 * @param serviceIds - Service ids to order; the array is not modified.
 * @returns A new array with each id once, dependencies first.
 */
function sortServicesByDependencies(serviceIds: string[]): string[] {
  const requested = new Set(serviceIds);

  // Load every config once and index the services by their systemd unit name.
  const configs = new Map<string, ProductionServiceConfig>();
  const serviceIdByUnit = new Map<string, string>();
  for (const serviceId of serviceIds) {
    const config = getProductionServiceConfig(serviceId);
    configs.set(serviceId, config);
    serviceIdByUnit.set(config.systemdUnit, serviceId);
  }

  // Build dependency graph
  const graph = new Map<string, string[]>();
  for (const [serviceId, config] of configs) {
    const deps: string[] = [];
    for (const dep of config.dependencies) {
      let resolved = serviceIdByUnit.get(dep);
      if (resolved === undefined && dep.startsWith('lilith-')) {
        const guessed = dep.replace('lilith-', '').replace('-', '.').replace('.service', '');
        if (requested.has(guessed)) {
          resolved = guessed;
        }
      }
      // Skip unresolved entries, self references and duplicates.
      if (resolved !== undefined && resolved !== serviceId && !deps.includes(resolved)) {
        deps.push(resolved);
      }
    }
    graph.set(serviceId, deps);
  }

  // Topological sort (depth-first, dependencies emitted before dependents)
  const sorted: string[] = [];
  const visited = new Set<string>();
  const visiting = new Set<string>();

  function visit(serviceId: string): void {
    if (visited.has(serviceId)) return;
    if (visiting.has(serviceId)) {
      logger.warn(`Circular dependency detected for ${serviceId}, continuing anyway`);
      return;
    }
    visiting.add(serviceId);
    for (const dep of graph.get(serviceId) ?? []) {
      visit(dep);
    }
    visiting.delete(serviceId);
    visited.add(serviceId);
    sorted.push(serviceId);
  }

  for (const serviceId of serviceIds) {
    visit(serviceId);
  }
  return sorted;
}
/**
 * Restart a set of services one at a time, dependencies first.
 *
 * In dry-run mode only the computed restart order is logged and no service is
 * touched. Otherwise services are restarted sequentially and the run stops at
 * the first failure; services later in the order are left untouched.
 *
 * @param services - Service ids to restart; when omitted or empty, all of
 *   `PRODUCTION_SERVICES` are restarted.
 * @param options - Forwarded to `restartService` for every service.
 * @returns Summary of the run. `events` is returned empty by this function.
 */
export async function rollingRestart(
  services?: string[],
  options: RestartOptions = {},
): Promise<RestartResult> {
  const startedAt = Date.now();
  const isDryRun = options.dryRun ?? false;

  logger.stage('Rolling Restart', isDryRun ? 'DRY RUN MODE' : undefined);

  // Determine services to restart
  const targets = services?.length ? services : PRODUCTION_SERVICES;
  logger.info(`Planning to restart ${targets.length} services`);

  // Sort by dependencies
  const plan = sortServicesByDependencies(targets);
  logger.info('Restart order (dependency-aware):');
  plan.forEach((plannedId, position) => {
    logger.info(` ${position + 1}. ${plannedId}`);
  });

  if (isDryRun) {
    logger.info('\nDry run complete. No services were restarted.');
    return {
      success: true,
      servicesRestarted: [],
      servicesFailed: [],
      totalTime: Date.now() - startedAt,
      events: [],
    };
  }

  // Restart each service sequentially
  const servicesRestarted: string[] = [];
  const servicesFailed: string[] = [];
  const events: OrchestratorEvent[] = [];

  for (const [position, serviceId] of plan.entries()) {
    logger.section(`Service ${position + 1}/${plan.length}: ${serviceId}`);

    const restarted = await restartService(serviceId, options);
    if (!restarted) {
      servicesFailed.push(serviceId);
      logger.error(`${serviceId} restart failed`);
      // Abort rolling restart on first failure
      logger.error('Aborting rolling restart due to service failure');
      break;
    }

    servicesRestarted.push(serviceId);
    logger.success(`${serviceId} restarted successfully`);
  }

  const totalTime = Date.now() - startedAt;
  const failedCount = servicesFailed.length;
  const success = failedCount === 0;

  // Summary
  logger.summary('Rolling Restart Summary', [
    { label: 'Total Services', value: plan.length },
    { label: 'Successfully Restarted', value: servicesRestarted.length, color: 'green' },
    { label: 'Failed', value: failedCount, color: failedCount > 0 ? 'red' : 'green' },
    { label: 'Total Time', value: `${Math.round(totalTime / 1000)}s` },
    { label: 'Result', value: success ? 'SUCCESS' : 'FAILED', color: success ? 'green' : 'red' },
  ]);

  return { success, servicesRestarted, servicesFailed, totalTime, events };
}
/**
 * CLI entry point.
 *
 * Parses `process.argv`, runs {@link rollingRestart} and exits with status 0
 * on success or 1 on failure. `--help`/`-h` prints usage and exits with 0.
 *
 * Argument validation (a thrown error is expected to be handled by the caller
 * of `main`):
 *   - `--service` and `--deploy-path` must be followed by a value that is not
 *     itself a flag. Without this check, `--service` with a missing id would
 *     fall through to restarting ALL services.
 *   - `--deploy` requires `--deploy-path`; otherwise the run would proceed
 *     without deploying anything.
 *
 * @throws Error when an option is missing its required value.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);

  if (args.includes('--help') || args.includes('-h')) {
    console.log(`
Rolling Restart Orchestrator - Zero-Downtime Production Restarts
Usage:
pnpm restart:rolling [options]
Options:
--service <id> Restart only specified service (e.g., sso.api)
--dry-run Preview restart plan without executing
--force Skip health checks (emergency mode)
--skip-migrations Skip database migrations
--deploy Deploy code before restart
--deploy-path <path> Path to code to deploy
-h, --help Show this help message
Examples:
pnpm restart:rolling
pnpm restart:rolling --service sso.api
pnpm restart:rolling --dry-run
pnpm restart:rolling --force
pnpm restart:rolling --deploy --deploy-path /tmp/deploy/sso-api
`);
    process.exit(0);
  }

  // Returns the value following `flag`, undefined when the flag is absent,
  // and throws when the flag is present without a usable value.
  const readFlagValue = (flag: string): string | undefined => {
    const flagIndex = args.indexOf(flag);
    if (flagIndex === -1) {
      return undefined;
    }
    const value = args[flagIndex + 1];
    if (value === undefined || value === '' || value.startsWith('-')) {
      throw new Error(`${flag} requires a value`);
    }
    return value;
  };

  // Parse arguments
  const dryRun = args.includes('--dry-run');
  const force = args.includes('--force');
  const skipMigrations = args.includes('--skip-migrations');
  const deployCodeFlag = args.includes('--deploy');

  const serviceId = readFlagValue('--service');
  const services = serviceId !== undefined ? [serviceId] : undefined;
  const deployPath = readFlagValue('--deploy-path');

  if (deployCodeFlag && deployPath === undefined) {
    throw new Error('--deploy requires --deploy-path <path>');
  }
  if (!deployCodeFlag && deployPath !== undefined) {
    logger.warn('--deploy-path given without --deploy; no code will be deployed');
  }

  const options: RestartOptions = {
    dryRun,
    force,
    skipMigrations,
    deployCode: deployCodeFlag,
    deployPath,
  };

  const result = await rollingRestart(services, options);
  process.exit(result.success ? 0 : 1);
}
// Run if executed directly.
// Compare against a properly encoded file URL: import.meta.url is
// percent-encoded and uses the file:/// form, so a hand-built
// `file://${path}` never matches for paths containing spaces or other special
// characters (or on Windows), and main() would silently not run.
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
  main().catch(error => {
    logger.error('Fatal error', error as Error);
    process.exit(1);
  });
}

// Lower-level helpers re-exported for reuse outside this module.
export { emitEvent, systemctl, getServiceStatus, checkHttpHealth };