From b1fc698b9ab52ec09b9e5a9a1ccba7da8e92d53c Mon Sep 17 00:00:00 2001 From: Chris Wanstrath Date: Wed, 4 Feb 2026 09:52:19 -0800 Subject: [PATCH] PID ideas --- PID.md | 374 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 374 insertions(+) create mode 100644 PID.md diff --git a/PID.md b/PID.md new file mode 100644 index 0000000..87a2a1f --- /dev/null +++ b/PID.md @@ -0,0 +1,374 @@ +# PID File Tracking for Robust Process Management + +## Problem Statement + +When the Toes host process crashes unexpectedly (OOM, SIGKILL, power loss, kernel panic), child app processes continue running as orphans. On restart, Toes has no knowledge of these processes: + +- **Port conflicts**: Orphans hold ports, new instances fail to bind +- **Resource waste**: Orphaned processes keep consuming memory/CPU +- **State confusion**: App appears "stopped" but is actually running +- **Data corruption**: Multiple instances may write to the same files + +Currently, Toes only handles graceful shutdown (SIGTERM/SIGINT). There's no recovery mechanism for ungraceful termination. + +## Proposed Solution: PID File Tracking + +### Design + +Store PID files in `TOES_DIR/pids/`: + +``` +${TOES_DIR}/pids/ + clock.pid # Contains: 12345 + todo.pid # Contains: 12389 + weather.pid # Contains: 12402 +``` + +### Lifecycle + +``` +┌─────────────┐ ┌─────────────┐ ┌─────────────┐ +│ App Start │────▶│ Write PID │────▶│ Running │ +└─────────────┘ └─────────────┘ └─────────────┘ + │ + ┌─────────────┐ │ + │ Delete PID │◀──────────┘ + └─────────────┘ App Exit +``` + +On host startup: +``` +┌─────────────┐ ┌─────────────┐ ┌─────────────┐ +│ Host Init │────▶│ Scan PIDs │────▶│Kill Orphans │ +└─────────────┘ └─────────────┘ └─────────────┘ + │ + ▼ + ┌─────────────┐ + │Clean Stale │ + │ PID Files │ + └─────────────┘ +``` + +### Implementation + +#### 1. 
PID Directory Setup + +```typescript +const PIDS_DIR = join(TOES_DIR, 'pids') + +function ensurePidsDir() { + if (!existsSync(PIDS_DIR)) { + mkdirSync(PIDS_DIR, { recursive: true }) + } +} +``` + +#### 2. Write PID on Start + +In `runApp()`, after spawning: + +```typescript +const proc = Bun.spawn(['bun', 'run', 'toes'], { ... }) +app.proc = proc + +// Write PID file +const pidFile = join(PIDS_DIR, `${dir}.pid`) +writeFileSync(pidFile, String(proc.pid)) +``` + +#### 3. Delete PID on Exit + +In the `proc.exited.then()` handler: + +```typescript +proc.exited.then(code => { + // Remove PID file + const pidFile = join(PIDS_DIR, `${dir}.pid`) + if (existsSync(pidFile)) { + unlinkSync(pidFile) + } + + // ... existing cleanup +}) +``` + +#### 4. Orphan Cleanup on Startup + +New function called during `initApps()`: + +```typescript +function cleanupOrphanProcesses() { + ensurePidsDir() + + for (const file of readdirSync(PIDS_DIR)) { + if (!file.endsWith('.pid')) continue + + const appName = file.replace('.pid', '') + const pidFile = join(PIDS_DIR, file) + const pid = parseInt(readFileSync(pidFile, 'utf-8').trim(), 10) + + if (isNaN(pid)) { + // Invalid PID file, remove it + unlinkSync(pidFile) + hostLog(`Removed invalid PID file: ${file}`) + continue + } + + if (isProcessRunning(pid)) { + // Orphan found - kill it + hostLog(`Found orphan process for ${appName} (PID ${pid}), terminating...`) + try { + process.kill(pid, 'SIGTERM') + + // Give it 5 seconds, then SIGKILL + setTimeout(() => { + if (isProcessRunning(pid)) { + hostLog(`Orphan ${appName} (PID ${pid}) didn't terminate, sending SIGKILL`) + process.kill(pid, 'SIGKILL') + } + }, 5000) + } catch (e) { + // Process may have exited between check and kill + hostLog(`Failed to kill orphan ${appName}: ${e}`) + } + } + + // Remove stale PID file + unlinkSync(pidFile) + } +} + +function isProcessRunning(pid: number): boolean { + try { + // Sending signal 0 checks if process exists without killing it + process.kill(pid, 0) + 
return true + } catch { + return false + } +} +``` + +#### 5. Integration Point + +Update `initApps()`: + +```typescript +export function initApps() { + initPortPool() + setupShutdownHandlers() + cleanupOrphanProcesses() // <-- Add here, before discovery + rotateLogs() + createAppSymlinks() + discoverApps() + runApps() +} +``` + +### Edge Cases + +| Scenario | Handling | +|----------|----------| +| PID reused by OS | Check if process command matches expected pattern before killing | +| PID file corrupted | Delete invalid files, log warning | +| Multiple Toes instances | Use file locking or instance ID in PID path | +| App renamed while running | Old PID file orphaned; cleanup handles it | +| Permission denied on kill | Log error, continue with other orphans | + +### Enhanced: Validate Process Identity + +To avoid killing an unrelated process that reused the PID: + +```typescript +function isOurProcess(pid: number, appName: string): boolean { + try { + // On macOS/Linux, check /proc or use ps + const result = Bun.spawnSync(['ps', '-p', String(pid), '-o', 'args=']) + const cmd = new TextDecoder().decode(result.stdout).trim() + + // Check if it looks like a Toes app process + return cmd.includes('bun') && cmd.includes('toes') + } catch { + return false + } +} +``` + +--- + +## Related Recommendations + +### 1. Store Port in PID File + +Extend PID files to include port for faster recovery: + +``` +# clock.pid +12345 +3001 +``` + +Or use JSON: +```json +{"pid": 12345, "port": 3001, "started": 1706900000000} +``` + +This allows Toes to reclaim the exact port on restart, avoiding port shuffling. + +### 2. 
Circuit Breaker for Crash Loops + +Add crash tracking to prevent infinite restart loops: + +```typescript +interface CrashRecord { + timestamp: number + exitCode: number +} + +// Store in TOES_DIR/crashes/<appName>.json +const CRASH_WINDOW = 3600000 // 1 hour +const MAX_CRASHES = 10 + +function recordCrash(appName: string, exitCode: number) { + const file = join(TOES_DIR, 'crashes', `${appName}.json`) + const crashes: CrashRecord[] = existsSync(file) + ? JSON.parse(readFileSync(file, 'utf-8')) + : [] + + // Add new crash + crashes.push({ timestamp: Date.now(), exitCode }) + + // Prune old crashes + const cutoff = Date.now() - CRASH_WINDOW + const recent = crashes.filter(c => c.timestamp > cutoff) + + writeFileSync(file, JSON.stringify(recent)) + + return recent.length +} + +function shouldCircuitBreak(appName: string): boolean { + const file = join(TOES_DIR, 'crashes', `${appName}.json`) + if (!existsSync(file)) return false + + const crashes: CrashRecord[] = JSON.parse(readFileSync(file, 'utf-8')) + const cutoff = Date.now() - CRASH_WINDOW + const recent = crashes.filter(c => c.timestamp > cutoff) + + return recent.length >= MAX_CRASHES +} +``` + +### 3. Track Restart Timer for Cancellation + +Store scheduled restart timers on the app object: + +```typescript +export type App = SharedApp & { + // ... existing fields + restartTimer?: Timer // <-- Add this +} +``` + +Update `scheduleRestart()`: + +```typescript +function scheduleRestart(app: App, dir: string) { + // Cancel any existing scheduled restart + if (app.restartTimer) { + clearTimeout(app.restartTimer) + } + + // ... existing delay calculation ... + + app.restartTimer = setTimeout(() => { + app.restartTimer = undefined + // ... existing restart logic + }, delay) +} +``` + +Update `clearTimers()`: + +```typescript +const clearTimers = (app: App) => { + // ... existing timer cleanup ... + + if (app.restartTimer) { + clearTimeout(app.restartTimer) + app.restartTimer = undefined + } +} +``` + +### 4. 
Exit Code Classification + +```typescript +function classifyExit(code: number | null): 'restart' | 'invalid' | 'stop' { + if (code === null) return 'restart' // Killed by signal + if (code === 0) return 'stop' // Clean exit + if (code === 2) return 'invalid' // Bad arguments/config + if (code >= 128) { + // Killed by signal (128 + signal number) + const signal = code - 128 + if (signal === 9) return 'restart' // SIGKILL (OOM?) + if (signal === 15) return 'stop' // SIGTERM (intentional) + } + return 'restart' // Default: try again +} +``` + +### 5. Install Timeout + +Wrap `bun install` with a timeout: + +```typescript +async function installWithTimeout(cwd: string, timeout = 60000): Promise<boolean> { + const install = Bun.spawn(['bun', 'install'], { + cwd, + stdout: 'pipe', + stderr: 'pipe' + }) + + const timeoutPromise = new Promise<never>((_, reject) => { + setTimeout(() => { + install.kill() + reject(new Error('Install timeout')) + }, timeout) + }) + + try { + await Promise.race([install.exited, timeoutPromise]) + return install.exitCode === 0 + } catch (e) { + return false + } +} +``` + +--- + +## Implementation Priority + +| Change | Effort | Impact | Priority | +|--------|--------|--------|----------| +| PID file tracking | Medium | High | **1** | +| Orphan cleanup on startup | Medium | High | **1** | +| Track restart timer | Low | Medium | **2** | +| Install timeout | Low | Medium | **2** | +| Circuit breaker | Medium | Medium | **3** | +| Exit code classification | Low | Low | **4** | +| Process identity validation | Medium | Low | **5** | + +--- + +## Testing Checklist + +- [ ] Host crashes while apps running → orphans cleaned on restart +- [ ] App crashes → PID file removed, restart scheduled +- [ ] App stopped manually → PID file removed, no restart +- [ ] Stale PID file (process gone) → file cleaned up +- [ ] PID reused by unrelated process → not killed (with identity check) +- [ ] Multiple rapid restarts → circuit breaker triggers +- [ ] Rename app while running → 
handled gracefully +- [ ] `bun install` hangs → times out, app marked failed