From 565f4924e8c1202496e1ea8ce18eb78159752af1 Mon Sep 17 00:00:00 2001 From: Chris Wanstrath Date: Sun, 15 Feb 2026 08:43:58 -0800 Subject: [PATCH] we do it better now --- PID.md | 374 --------------------------------------------------------- 1 file changed, 374 deletions(-) delete mode 100644 PID.md diff --git a/PID.md b/PID.md deleted file mode 100644 index 87a2a1f..0000000 --- a/PID.md +++ /dev/null @@ -1,374 +0,0 @@ -# PID File Tracking for Robust Process Management - -## Problem Statement - -When the Toes host process crashes unexpectedly (OOM, SIGKILL, power loss, kernel panic), child app processes continue running as orphans. On restart, Toes has no knowledge of these processes: - -- **Port conflicts**: Orphans hold ports, new instances fail to bind -- **Resource waste**: Zombie processes consume memory/CPU -- **State confusion**: App appears "stopped" but is actually running -- **Data corruption**: Multiple instances may write to same files - -Currently, Toes only handles graceful shutdown (SIGTERM/SIGINT). There's no recovery mechanism for ungraceful termination. - -## Proposed Solution: PID File Tracking - -### Design - -Store PID files in `TOES_DIR/pids/`: - -``` -${TOES_DIR}/pids/ - clock.pid # Contains: 12345 - todo.pid # Contains: 12389 - weather.pid # Contains: 12402 -``` - -### Lifecycle - -``` -┌─────────────┐ ┌─────────────┐ ┌─────────────┐ -│ App Start │────▶│ Write PID │────▶│ Running │ -└─────────────┘ └─────────────┘ └─────────────┘ - │ - ┌─────────────┐ │ - │ Delete PID │◀──────────┘ - └─────────────┘ App Exit -``` - -On host startup: -``` -┌─────────────┐ ┌─────────────┐ ┌─────────────┐ -│ Host Init │────▶│ Scan PIDs │────▶│Kill Orphans │ -└─────────────┘ └─────────────┘ └─────────────┘ - │ - ▼ - ┌─────────────┐ - │Clean Stale │ - │ PID Files │ - └─────────────┘ -``` - -### Implementation - -#### 1. 
PID Directory Setup - -```typescript -const PIDS_DIR = join(TOES_DIR, 'pids') - -function ensurePidsDir() { - if (!existsSync(PIDS_DIR)) { - mkdirSync(PIDS_DIR, { recursive: true }) - } -} -``` - -#### 2. Write PID on Start - -In `runApp()`, after spawning: - -```typescript -const proc = Bun.spawn(['bun', 'run', 'toes'], { ... }) -app.proc = proc - -// Write PID file -const pidFile = join(PIDS_DIR, `${dir}.pid`) -writeFileSync(pidFile, String(proc.pid)) -``` - -#### 3. Delete PID on Exit - -In the `proc.exited.then()` handler: - -```typescript -proc.exited.then(code => { - // Remove PID file - const pidFile = join(PIDS_DIR, `${dir}.pid`) - if (existsSync(pidFile)) { - unlinkSync(pidFile) - } - - // ... existing cleanup -}) -``` - -#### 4. Orphan Cleanup on Startup - -New function called during `initApps()`: - -```typescript -function cleanupOrphanProcesses() { - ensurePidsDir() - - for (const file of readdirSync(PIDS_DIR)) { - if (!file.endsWith('.pid')) continue - - const appName = file.replace('.pid', '') - const pidFile = join(PIDS_DIR, file) - const pid = parseInt(readFileSync(pidFile, 'utf-8').trim(), 10) - - if (isNaN(pid)) { - // Invalid PID file, remove it - unlinkSync(pidFile) - hostLog(`Removed invalid PID file: ${file}`) - continue - } - - if (isProcessRunning(pid)) { - // Orphan found - kill it - hostLog(`Found orphan process for ${appName} (PID ${pid}), terminating...`) - try { - process.kill(pid, 'SIGTERM') - - // Give it 5 seconds, then SIGKILL - setTimeout(() => { - if (isProcessRunning(pid)) { - hostLog(`Orphan ${appName} (PID ${pid}) didn't terminate, sending SIGKILL`) - process.kill(pid, 'SIGKILL') - } - }, 5000) - } catch (e) { - // Process may have exited between check and kill - hostLog(`Failed to kill orphan ${appName}: ${e}`) - } - } - - // Remove stale PID file - unlinkSync(pidFile) - } -} - -function isProcessRunning(pid: number): boolean { - try { - // Sending signal 0 checks if process exists without killing it - process.kill(pid, 0) - 
return true - } catch { - return false - } -} -``` - -#### 5. Integration Point - -Update `initApps()`: - -```typescript -export function initApps() { - initPortPool() - setupShutdownHandlers() - cleanupOrphanProcesses() // <-- Add here, before discovery - rotateLogs() - createAppSymlinks() - discoverApps() - runApps() -} -``` - -### Edge Cases - -| Scenario | Handling | -|----------|----------| -| PID reused by OS | Check if process command matches expected pattern before killing | -| PID file corrupted | Delete invalid files, log warning | -| Multiple Toes instances | Use file locking or instance ID in PID path | -| App renamed while running | Old PID file orphaned; cleanup handles it | -| Permission denied on kill | Log error, continue with other orphans | - -### Enhanced: Validate Process Identity - -To avoid killing an unrelated process that reused the PID: - -```typescript -function isOurProcess(pid: number, appName: string): boolean { - try { - // On macOS/Linux, check /proc or use ps - const result = Bun.spawnSync(['ps', '-p', String(pid), '-o', 'args=']) - const cmd = new TextDecoder().decode(result.stdout).trim() - - // Check if it looks like a Toes app process - return cmd.includes('bun') && cmd.includes('toes') - } catch { - return false - } -} -``` - ---- - -## Related Recommendations - -### 1. Store Port in PID File - -Extend PID files to include port for faster recovery: - -``` -# clock.pid -12345 -3001 -``` - -Or use JSON: -```json -{"pid": 12345, "port": 3001, "started": 1706900000000} -``` - -This allows Toes to reclaim the exact port on restart, avoiding port shuffling. - -### 2. 
Circuit Breaker for Crash Loops - -Add crash tracking to prevent infinite restart loops: - -```typescript -interface CrashRecord { - timestamp: number - exitCode: number -} - -// Store in TOES_DIR/crashes/<app>.json -const CRASH_WINDOW = 3600000 // 1 hour -const MAX_CRASHES = 10 - -function recordCrash(appName: string, exitCode: number) { - const file = join(TOES_DIR, 'crashes', `${appName}.json`) - const crashes: CrashRecord[] = existsSync(file) - ? JSON.parse(readFileSync(file, 'utf-8')) - : [] - - // Add new crash - crashes.push({ timestamp: Date.now(), exitCode }) - - // Prune old crashes - const cutoff = Date.now() - CRASH_WINDOW - const recent = crashes.filter(c => c.timestamp > cutoff) - - writeFileSync(file, JSON.stringify(recent)) - - return recent.length -} - -function shouldCircuitBreak(appName: string): boolean { - const file = join(TOES_DIR, 'crashes', `${appName}.json`) - if (!existsSync(file)) return false - - const crashes: CrashRecord[] = JSON.parse(readFileSync(file, 'utf-8')) - const cutoff = Date.now() - CRASH_WINDOW - const recent = crashes.filter(c => c.timestamp > cutoff) - - return recent.length >= MAX_CRASHES -} -``` - -### 3. Track Restart Timer for Cancellation - -Store scheduled restart timers on the app object: - -```typescript -export type App = SharedApp & { - // ... existing fields - restartTimer?: Timer // <-- Add this -} -``` - -Update `scheduleRestart()`: - -```typescript -function scheduleRestart(app: App, dir: string) { - // Cancel any existing scheduled restart - if (app.restartTimer) { - clearTimeout(app.restartTimer) - } - - // ... existing delay calculation ... - - app.restartTimer = setTimeout(() => { - app.restartTimer = undefined - // ... existing restart logic - }, delay) -} -``` - -Update `clearTimers()`: - -```typescript -const clearTimers = (app: App) => { - // ... existing timer cleanup ... - - if (app.restartTimer) { - clearTimeout(app.restartTimer) - app.restartTimer = undefined - } -} -``` - -### 4. 
Exit Code Classification - -```typescript -function classifyExit(code: number | null): 'restart' | 'invalid' | 'stop' { - if (code === null) return 'restart' // Killed by signal - if (code === 0) return 'stop' // Clean exit - if (code === 2) return 'invalid' // Bad arguments/config - if (code >= 128) { - // Killed by signal (128 + signal number) - const signal = code - 128 - if (signal === 9) return 'restart' // SIGKILL (OOM?) - if (signal === 15) return 'stop' // SIGTERM (intentional) - } - return 'restart' // Default: try again -} -``` - -### 5. Install Timeout - -Wrap `bun install` with a timeout: - -```typescript -async function installWithTimeout(cwd: string, timeout = 60000): Promise<boolean> { - const install = Bun.spawn(['bun', 'install'], { - cwd, - stdout: 'pipe', - stderr: 'pipe' - }) - - const timeoutPromise = new Promise((_, reject) => { - setTimeout(() => { - install.kill() - reject(new Error('Install timeout')) - }, timeout) - }) - - try { - await Promise.race([install.exited, timeoutPromise]) - return install.exitCode === 0 - } catch (e) { - return false - } -} -``` - ---- - -## Implementation Priority - -| Change | Effort | Impact | Priority | -|--------|--------|--------|----------| -| PID file tracking | Medium | High | **1** | -| Orphan cleanup on startup | Medium | High | **1** | -| Track restart timer | Low | Medium | **2** | -| Install timeout | Low | Medium | **2** | -| Circuit breaker | Medium | Medium | **3** | -| Exit code classification | Low | Low | **4** | -| Process identity validation | Medium | Low | **5** | - ---- - -## Testing Checklist - -- [ ] Host crashes while apps running → orphans cleaned on restart -- [ ] App crashes → PID file removed, restart scheduled -- [ ] App stopped manually → PID file removed, no restart -- [ ] Stale PID file (process gone) → file cleaned up -- [ ] PID reused by unrelated process → not killed (with identity check) -- [ ] Multiple rapid restarts → circuit breaker triggers -- [ ] Rename app while running → 
handled gracefully -- [ ] `bun install` hangs → times out, app marked failed