PID ideas
This commit is contained in:
parent
a3f36a0c98
commit
b1fc698b9a
374
PID.md
Normal file
374
PID.md
Normal file
|
|
@ -0,0 +1,374 @@
|
|||
# PID File Tracking for Robust Process Management
|
||||
|
||||
## Problem Statement
|
||||
|
||||
When the Toes host process crashes unexpectedly (OOM, SIGKILL, power loss, kernel panic), child app processes continue running as orphans. On restart, Toes has no knowledge of these processes:
|
||||
|
||||
- **Port conflicts**: Orphans hold ports, new instances fail to bind
|
||||
- **Resource waste**: Orphaned processes keep running and consume memory/CPU
|
||||
- **State confusion**: App appears "stopped" but is actually running
|
||||
- **Data corruption**: Multiple instances may write to same files
|
||||
|
||||
Currently, Toes only handles graceful shutdown (SIGTERM/SIGINT). There's no recovery mechanism for ungraceful termination.
|
||||
|
||||
## Proposed Solution: PID File Tracking
|
||||
|
||||
### Design
|
||||
|
||||
Store PID files in `TOES_DIR/pids/`:
|
||||
|
||||
```
|
||||
${TOES_DIR}/pids/
|
||||
clock.pid # Contains: 12345
|
||||
todo.pid # Contains: 12389
|
||||
weather.pid # Contains: 12402
|
||||
```
|
||||
|
||||
### Lifecycle
|
||||
|
||||
```
|
||||
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
|
||||
│ App Start │────▶│ Write PID │────▶│ Running │
|
||||
└─────────────┘ └─────────────┘ └─────────────┘
|
||||
│
|
||||
┌─────────────┐ │
|
||||
│ Delete PID │◀──────────┘
|
||||
└─────────────┘ App Exit
|
||||
```
|
||||
|
||||
On host startup:
|
||||
```
|
||||
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
|
||||
│ Host Init │────▶│ Scan PIDs │────▶│Kill Orphans │
|
||||
└─────────────┘ └─────────────┘ └─────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────┐
|
||||
│Clean Stale │
|
||||
│ PID Files │
|
||||
└─────────────┘
|
||||
```
|
||||
|
||||
### Implementation
|
||||
|
||||
#### 1. PID Directory Setup
|
||||
|
||||
```typescript
|
||||
const PIDS_DIR = join(TOES_DIR, 'pids')
|
||||
|
||||
/**
 * Makes sure the PID directory (`PIDS_DIR`) exists.
 *
 * `mkdirSync` with `recursive: true` is a no-op when the directory is already
 * present, so no separate `existsSync` check is needed. Dropping the check
 * removes a check-then-create race and surfaces an error right away if the
 * path exists but is not a directory.
 */
function ensurePidsDir(): void {
  mkdirSync(PIDS_DIR, { recursive: true })
}
|
||||
```
|
||||
|
||||
#### 2. Write PID on Start
|
||||
|
||||
In `runApp()`, after spawning:
|
||||
|
||||
```typescript
|
||||
const proc = Bun.spawn(['bun', 'run', 'toes'], { ... })
|
||||
app.proc = proc
|
||||
|
||||
// Write PID file
|
||||
const pidFile = join(PIDS_DIR, `${dir}.pid`)
|
||||
writeFileSync(pidFile, String(proc.pid))
|
||||
```
|
||||
|
||||
#### 3. Delete PID on Exit
|
||||
|
||||
In the `proc.exited.then()` handler:
|
||||
|
||||
```typescript
|
||||
proc.exited.then(code => {
|
||||
// Remove PID file
|
||||
const pidFile = join(PIDS_DIR, `${dir}.pid`)
|
||||
if (existsSync(pidFile)) {
|
||||
unlinkSync(pidFile)
|
||||
}
|
||||
|
||||
// ... existing cleanup
|
||||
})
|
||||
```
|
||||
|
||||
#### 4. Orphan Cleanup on Startup
|
||||
|
||||
New function called during `initApps()`:
|
||||
|
||||
```typescript
|
||||
/**
 * Terminates app processes left over from a previous host run and removes
 * their PID files.
 *
 * For every `*.pid` file in `PIDS_DIR`:
 * - an unparsable or non-positive PID is treated as an invalid file and removed;
 * - a PID that is still running gets SIGTERM, followed by SIGKILL if it is
 *   still alive 5 seconds later;
 * - the PID file is removed in every case.
 *
 * A failure while handling one PID file is logged and does not prevent the
 * remaining files from being processed.
 *
 * NOTE: this function returns immediately. The SIGKILL escalation runs from a
 * timer, so an orphan may still be alive when the caller continues.
 */
function cleanupOrphanProcesses() {
  ensurePidsDir()

  for (const file of readdirSync(PIDS_DIR)) {
    if (!file.endsWith('.pid')) continue

    // Strip only the trailing extension; replace('.pid', '') would remove the
    // first occurrence and mangle names such as "my.pid.app.pid".
    const appName = file.slice(0, -'.pid'.length)
    const pidFile = join(PIDS_DIR, file)

    try {
      const pid = parseInt(readFileSync(pidFile, 'utf-8').trim(), 10)

      // PID 0 and negative PIDs address whole process groups when passed to
      // kill(), so they must never be signalled from here.
      if (isNaN(pid) || pid <= 0) {
        unlinkSync(pidFile)
        hostLog(`Removed invalid PID file: ${file}`)
        continue
      }

      if (isProcessRunning(pid)) {
        hostLog(`Found orphan process for ${appName} (PID ${pid}), terminating...`)
        try {
          process.kill(pid, 'SIGTERM')

          // Give it 5 seconds, then SIGKILL
          setTimeout(() => {
            try {
              if (isProcessRunning(pid)) {
                hostLog(`Orphan ${appName} (PID ${pid}) didn't terminate, sending SIGKILL`)
                process.kill(pid, 'SIGKILL')
              }
            } catch (e) {
              // The process can exit between the check and the kill; an
              // exception escaping a timer callback would be uncaught.
              hostLog(`Failed to kill orphan ${appName}: ${e}`)
            }
          }, 5000)
        } catch (e) {
          // Process may have exited between check and kill
          hostLog(`Failed to kill orphan ${appName}: ${e}`)
        }
      }

      // Remove stale PID file
      unlinkSync(pidFile)
    } catch (e) {
      // Keep going: one unreadable or undeletable file must not abort the scan.
      hostLog(`Failed to clean up PID file ${file}: ${e}`)
    }
  }
}
|
||||
|
||||
/**
 * Reports whether a single process with the given PID currently exists.
 *
 * @param pid - Process ID to probe. Must be a positive integer; anything else
 *   yields `false` without issuing a signal.
 * @returns `true` if the process exists (even when it belongs to another
 *   user), `false` otherwise.
 */
function isProcessRunning(pid: number): boolean {
  // kill() interprets 0 and negative values as process groups, so probing
  // them would say nothing about one specific process.
  if (!Number.isInteger(pid) || pid <= 0) return false

  try {
    // Sending signal 0 checks if process exists without killing it
    process.kill(pid, 0)
    return true
  } catch (e) {
    // EPERM means the process exists but we may not signal it; every other
    // failure (ESRCH, invalid argument) means there is no such process.
    return (
      typeof e === 'object' &&
      e !== null &&
      'code' in e &&
      (e as { code: unknown }).code === 'EPERM'
    )
  }
}
|
||||
```
|
||||
|
||||
#### 5. Integration Point
|
||||
|
||||
Update `initApps()`:
|
||||
|
||||
```typescript
|
||||
export function initApps() {
|
||||
initPortPool()
|
||||
setupShutdownHandlers()
|
||||
cleanupOrphanProcesses() // <-- Add here, before discovery
|
||||
rotateLogs()
|
||||
createAppSymlinks()
|
||||
discoverApps()
|
||||
runApps()
|
||||
}
|
||||
```
|
||||
|
||||
### Edge Cases
|
||||
|
||||
| Scenario | Handling |
|
||||
|----------|----------|
|
||||
| PID reused by OS | Check if process command matches expected pattern before killing |
|
||||
| PID file corrupted | Delete invalid files, log warning |
|
||||
| Multiple Toes instances | Use file locking or instance ID in PID path |
|
||||
| App renamed while running | Old PID file orphaned; cleanup handles it |
|
||||
| Permission denied on kill | Log error, continue with other orphans |
|
||||
|
||||
### Enhanced: Validate Process Identity
|
||||
|
||||
To avoid killing an unrelated process that reused the PID:
|
||||
|
||||
```typescript
|
||||
/**
 * Heuristic check that `pid` still belongs to a Toes app process rather than
 * an unrelated process that reused the PID.
 *
 * Runs `ps -p <pid> -o args=` and looks at the reported command line.
 *
 * @param pid - Process ID to inspect.
 * @param appName - Name of the app the PID file belonged to. It is not used
 *   by the match below.
 * @returns `true` when the command line contains both "bun" and "toes";
 *   `false` otherwise, including when `ps` cannot be run.
 */
function isOurProcess(pid: number, appName: string): boolean {
  let commandLine: string

  try {
    // `-o args=` prints only the command line, with no header row.
    const { stdout } = Bun.spawnSync(['ps', '-p', String(pid), '-o', 'args='])
    commandLine = new TextDecoder().decode(stdout).trim()
  } catch {
    return false
  }

  // Check if it looks like a Toes app process
  return ['bun', 'toes'].every(marker => commandLine.includes(marker))
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Related Recommendations
|
||||
|
||||
### 1. Store Port in PID File
|
||||
|
||||
Extend PID files to include port for faster recovery:
|
||||
|
||||
```
|
||||
# clock.pid
|
||||
12345
|
||||
3001
|
||||
```
|
||||
|
||||
Or use JSON:
|
||||
```json
|
||||
{"pid": 12345, "port": 3001, "started": 1706900000000}
|
||||
```
|
||||
|
||||
This allows Toes to reclaim the exact port on restart, avoiding port shuffling.
|
||||
|
||||
### 2. Circuit Breaker for Crash Loops
|
||||
|
||||
Add crash tracking to prevent infinite restart loops:
|
||||
|
||||
```typescript
|
||||
interface CrashRecord {
|
||||
timestamp: number
|
||||
exitCode: number
|
||||
}
|
||||
|
||||
// Store in TOES_DIR/crashes/<app>.json
|
||||
const CRASH_WINDOW = 3600000 // 1 hour
|
||||
const MAX_CRASHES = 10
|
||||
|
||||
/**
 * Appends a crash to the app's crash history file and prunes entries older
 * than `CRASH_WINDOW`.
 *
 * The history lives in `TOES_DIR/crashes/<appName>.json`. The directory is
 * created on demand. A history file that cannot be parsed, or that does not
 * contain an array, is discarded and replaced: this runs while handling a
 * crash, so it must not fail because of a damaged bookkeeping file.
 *
 * @param appName - App whose crash is being recorded.
 * @param exitCode - Exit code of the crashed process.
 * @returns Number of crashes inside the window, including this one.
 */
function recordCrash(appName: string, exitCode: number): number {
  const crashDir = join(TOES_DIR, 'crashes')
  const file = join(crashDir, `${appName}.json`)

  let crashes: CrashRecord[] = []
  if (existsSync(file)) {
    try {
      const parsed: unknown = JSON.parse(readFileSync(file, 'utf-8'))
      if (Array.isArray(parsed)) {
        // Keep only entries that carry a usable timestamp.
        crashes = parsed.filter(
          (c: Partial<CrashRecord> | null) => typeof c?.timestamp === 'number'
        )
      }
    } catch {
      // Corrupt history: start over instead of throwing.
      crashes = []
    }
  }

  // Add new crash
  const now = Date.now()
  crashes.push({ timestamp: now, exitCode })

  // Prune old crashes
  const cutoff = now - CRASH_WINDOW
  const recent = crashes.filter(c => c.timestamp > cutoff)

  // writeFileSync does not create missing parent directories.
  mkdirSync(crashDir, { recursive: true })
  writeFileSync(file, JSON.stringify(recent))

  return recent.length
}
|
||||
|
||||
/**
 * Decides whether an app has crashed often enough to stop restarting it.
 *
 * Reads `TOES_DIR/crashes/<appName>.json` and counts the entries whose
 * timestamp falls inside the last `CRASH_WINDOW` milliseconds.
 *
 * @param appName - App to check.
 * @returns `true` when at least `MAX_CRASHES` recent crashes are recorded.
 *   Returns `false` when there is no history, or when the history file cannot
 *   be read or parsed, because a damaged file is no evidence of a crash loop.
 */
function shouldCircuitBreak(appName: string): boolean {
  const file = join(TOES_DIR, 'crashes', `${appName}.json`)
  if (!existsSync(file)) return false

  let parsed: unknown
  try {
    parsed = JSON.parse(readFileSync(file, 'utf-8'))
  } catch {
    // Fail open rather than throwing out of the restart path.
    return false
  }
  if (!Array.isArray(parsed)) return false

  const cutoff = Date.now() - CRASH_WINDOW
  const recentCount = parsed.filter(
    (c: Partial<CrashRecord> | null) =>
      typeof c?.timestamp === 'number' && c.timestamp > cutoff
  ).length

  return recentCount >= MAX_CRASHES
}
|
||||
```
|
||||
|
||||
### 3. Track Restart Timer for Cancellation
|
||||
|
||||
Store scheduled restart timers on the app object:
|
||||
|
||||
```typescript
|
||||
export type App = SharedApp & {
|
||||
// ... existing fields
|
||||
restartTimer?: Timer // <-- Add this
|
||||
}
|
||||
```
|
||||
|
||||
Update `scheduleRestart()`:
|
||||
|
||||
```typescript
|
||||
function scheduleRestart(app: App, dir: string) {
|
||||
// Cancel any existing scheduled restart
|
||||
if (app.restartTimer) {
|
||||
clearTimeout(app.restartTimer)
|
||||
}
|
||||
|
||||
// ... existing delay calculation ...
|
||||
|
||||
app.restartTimer = setTimeout(() => {
|
||||
app.restartTimer = undefined
|
||||
// ... existing restart logic
|
||||
}, delay)
|
||||
}
|
||||
```
|
||||
|
||||
Update `clearTimers()`:
|
||||
|
||||
```typescript
|
||||
const clearTimers = (app: App) => {
|
||||
// ... existing timer cleanup ...
|
||||
|
||||
if (app.restartTimer) {
|
||||
clearTimeout(app.restartTimer)
|
||||
app.restartTimer = undefined
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 4. Exit Code Classification
|
||||
|
||||
```typescript
|
||||
/**
 * Maps a process exit code to the action the supervisor should take.
 *
 * @param code - Exit code, or `null` when no code is available because the
 *   process was killed by a signal.
 * @returns
 *   - `'stop'` for a clean exit (0) or a deliberate shutdown signal
 *     (SIGINT = 130, SIGTERM = 143);
 *   - `'invalid'` for exit code 2 (bad arguments/config);
 *   - `'restart'` for everything else, including SIGKILL (137).
 */
function classifyExit(code: number | null): 'restart' | 'invalid' | 'stop' {
  if (code === null) return 'restart' // Killed by signal
  if (code === 0) return 'stop' // Clean exit
  if (code === 2) return 'invalid' // Bad arguments/config

  if (code >= 128) {
    // Killed by signal (128 + signal number)
    const signal = code - 128
    if (signal === 9) return 'restart' // SIGKILL (OOM?)
    // SIGINT and SIGTERM are both intentional shutdown requests, so neither
    // should trigger a restart.
    if (signal === 2 || signal === 15) return 'stop'
  }

  return 'restart' // Default: try again
}
|
||||
```
|
||||
|
||||
### 5. Install Timeout
|
||||
|
||||
Wrap `bun install` with a timeout:
|
||||
|
||||
```typescript
|
||||
/**
 * Runs `bun install` in `cwd` and gives up if it takes too long.
 *
 * @param cwd - Directory to install dependencies in.
 * @param timeout - Maximum time to wait, in milliseconds (default 60000).
 * @returns `true` if the install exited with code 0; `false` if it failed,
 *   timed out (the process is killed in that case), or could not be awaited.
 */
async function installWithTimeout(cwd: string, timeout = 60000): Promise<boolean> {
  // NOTE(review): stdout/stderr are piped but never read here; confirm that
  // an unread pipe cannot stall a very chatty install.
  const install = Bun.spawn(['bun', 'install'], {
    cwd,
    stdout: 'pipe',
    stderr: 'pipe'
  })

  let timer: ReturnType<typeof setTimeout> | undefined
  const timeoutPromise = new Promise<never>((_, reject) => {
    timer = setTimeout(() => {
      install.kill()
      reject(new Error('Install timeout'))
    }, timeout)
  })

  try {
    await Promise.race([install.exited, timeoutPromise])
    return install.exitCode === 0
  } catch {
    return false
  } finally {
    // Without this the timer stays pending for the full timeout after a fast
    // install, then fires kill() on a process that has already exited.
    clearTimeout(timer)
  }
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Priority
|
||||
|
||||
| Change | Effort | Impact | Priority |
|
||||
|--------|--------|--------|----------|
|
||||
| PID file tracking | Medium | High | **1** |
|
||||
| Orphan cleanup on startup | Medium | High | **1** |
|
||||
| Track restart timer | Low | Medium | **2** |
|
||||
| Install timeout | Low | Medium | **2** |
|
||||
| Circuit breaker | Medium | Medium | **3** |
|
||||
| Exit code classification | Low | Low | **4** |
|
||||
| Process identity validation | Medium | Low | **5** |
|
||||
|
||||
---
|
||||
|
||||
## Testing Checklist
|
||||
|
||||
- [ ] Host crashes while apps running → orphans cleaned on restart
|
||||
- [ ] App crashes → PID file removed, restart scheduled
|
||||
- [ ] App stopped manually → PID file removed, no restart
|
||||
- [ ] Stale PID file (process gone) → file cleaned up
|
||||
- [ ] PID reused by unrelated process → not killed (with identity check)
|
||||
- [ ] Multiple rapid restarts → circuit breaker triggers
|
||||
- [ ] Rename app while running → handled gracefully
|
||||
- [ ] `bun install` hangs → times out, app marked failed
|
||||
Loading…
Reference in New Issue
Block a user