health checks
This commit is contained in:
parent
02fca1313c
commit
a396f740a5
|
|
@ -33,6 +33,7 @@ let _shuttingDown = false
|
||||||
export type App = SharedApp & {
|
export type App = SharedApp & {
|
||||||
consecutiveHealthFailures?: number
|
consecutiveHealthFailures?: number
|
||||||
healthCheckTimer?: Timer
|
healthCheckTimer?: Timer
|
||||||
|
isHttpApp?: boolean
|
||||||
lastRestartTime?: number
|
lastRestartTime?: number
|
||||||
manuallyStopped?: boolean
|
manuallyStopped?: boolean
|
||||||
proc?: Subprocess
|
proc?: Subprocess
|
||||||
|
|
@ -184,11 +185,12 @@ export function renameApp(oldName: string, newName: string): { ok: boolean, erro
|
||||||
|
|
||||||
/**
 * Starts the app registered under `dir`.
 *
 * Does nothing when the app is unknown, when its state is anything other
 * than 'stopped' or 'invalid', or when `isApp(dir)` reports that the
 * directory is not an app. Otherwise the manual-stop flag and any recorded
 * error are cleared and the app is launched on the port from `getPort(dir)`.
 */
export function startApp(dir: string) {
  const app = _apps.get(dir)
  if (!app) return

  // Only apps that are at rest (stopped, or previously marked invalid) may be started
  const startable = app.state === 'stopped' || app.state === 'invalid'
  if (!startable || !isApp(dir)) return

  // An explicit start overrides an earlier manual stop and clears the last error
  app.manuallyStopped = false
  app.error = undefined

  // runApp is async; the launch is intentionally not awaited here
  void runApp(dir, getPort(dir))
}
|
||||||
|
|
||||||
|
|
@ -448,6 +450,23 @@ function initPortPool() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Transitions `app` to the 'running' state and begins health monitoring.
 *
 * @param app - The app record to update.
 * @param port - Port handed to the HTTP health checker (used only when `isHttpApp` is true).
 * @param isHttpApp - Whether the app is monitored over HTTP; stored on the app
 *   and used to pick between the HTTP and the process-only health checks.
 */
function markAsRunning(app: App, port: number, isHttpApp: boolean) {
  // Cancel the pending startup timer, if any
  const pendingStartup = app.startupTimer
  if (pendingStartup) {
    clearTimeout(pendingStartup)
    app.startupTimer = undefined
  }

  app.state = 'running'
  app.started = Date.now()
  app.isHttpApp = isHttpApp
  update()

  // Non-HTTP apps get the process-only monitor; HTTP apps are monitored on their port
  if (!isHttpApp) {
    startProcessHealthChecks(app)
    return
  }
  startHealthChecks(app, port)
}
|
||||||
|
|
||||||
function loadApp(dir: string): LoadResult {
|
function loadApp(dir: string): LoadResult {
|
||||||
try {
|
try {
|
||||||
const pkgPath = join(APPS_DIR, dir, 'current', 'package.json')
|
const pkgPath = join(APPS_DIR, dir, 'current', 'package.json')
|
||||||
|
|
@ -582,19 +601,71 @@ async function runApp(dir: string, port: number) {
|
||||||
stderr: 'pipe',
|
stderr: 'pipe',
|
||||||
})
|
})
|
||||||
|
|
||||||
// Clear startup timer and set state to running
|
app.proc = proc
|
||||||
if (app.startupTimer) {
|
|
||||||
clearTimeout(app.startupTimer)
|
// Check if process is alive using ps(1) - more reliable than Bun's API.
// `ps -p <pid>` is run and only its exit code is inspected: 0 is treated as
// "alive", anything else as "not alive". The command's output is never read,
// so stdout/stderr are discarded rather than piped.
// Resolves to false when ps cannot be spawned or awaiting it throws.
const isProcessAlive = async (pid: number): Promise<boolean> => {
  try {
    const ps = Bun.spawn(['ps', '-p', String(pid)], { stdout: 'ignore', stderr: 'ignore' })
    return (await ps.exited) === 0
  } catch {
    // Could not run ps at all - report the process as not alive
    return false
  }
}
|
||||||
|
|
||||||
app.state = 'running'
|
// Poll to verify app started - tries /ok for HTTP apps, falls back to survival check.
//
// Each iteration:
//   1. stops if the spawned process is no longer alive,
//   2. probes http://localhost:<port>/ok with a 2s abort timeout:
//        - 2xx response     -> app is marked running as an HTTP app
//        - non-2xx response -> the error is recorded on the app and the process is killed
//        - request failure  -> ignored (not ready yet, or not an HTTP app)
//   3. once 5s have elapsed without an HTTP answer, marks the app running as a
//      non-HTTP app,
//   4. otherwise sleeps 500ms and tries again.
//
// The app can be stopped or restarted while a check is awaited, so ownership
// (still 'starting' and still this `proc`) is re-verified after every await
// before the app is mutated or the process is killed.
const pollStartup = async () => {
  const pollInterval = 500
  const probeTimeout = 2000
  const survivalThreshold = 5000 // Consider non-HTTP apps running after 5s
  const startTime = Date.now()
  const pid = proc.pid

  // True while this poller is still responsible for the app's startup
  const ownsStartup = () => app.state === 'starting' && app.proc === proc

  while (ownsStartup()) {
    // First check if process is still alive
    const alive = await isProcessAlive(pid)
    if (!alive) {
      info(app, 'Process died during startup')
      // proc.exited handler will clean up
      return
    }

    // Try /ok endpoint for HTTP apps
    const controller = new AbortController()
    const timeout = setTimeout(() => controller.abort(), probeTimeout)
    try {
      const response = await fetch(`http://localhost:${port}/ok`, {
        signal: controller.signal,
      })

      // The app may have been stopped or restarted while the request was in flight
      if (!ownsStartup()) return

      if (response.ok) {
        // HTTP app is running and healthy
        markAsRunning(app, port, true)
        return
      }

      // App responded but /ok returned error - mark as error and kill
      info(app, `/ok returned ${response.status}`)
      app.error = `Health check failed: /ok returned ${response.status}`
      // Kill the process this poller was started for, never a newer one
      proc.kill()
      return
    } catch {
      // Connection failed - app not ready yet or not an HTTP app
    } finally {
      // Always release the abort timer, including when fetch throws
      clearTimeout(timeout)
    }

    // A failed probe can take up to the abort timeout; re-check ownership
    if (!ownsStartup()) return

    // If process survived long enough, consider it running (non-HTTP app)
    if (Date.now() - startTime >= survivalThreshold) {
      info(app, 'No /ok endpoint, marking as running (process survived 5s)')
      markAsRunning(app, port, false)
      return
    }

    await new Promise(resolve => setTimeout(resolve, pollInterval))
  }
}
|
||||||
|
|
||||||
|
pollStartup()
|
||||||
|
|
||||||
const streamOutput = async (stream: ReadableStream<Uint8Array> | null, streamType: 'stdout' | 'stderr') => {
|
const streamOutput = async (stream: ReadableStream<Uint8Array> | null, streamType: 'stdout' | 'stderr') => {
|
||||||
if (!stream) return
|
if (!stream) return
|
||||||
|
|
@ -639,8 +710,8 @@ async function runApp(dir: string, port: number) {
|
||||||
releasePort(app.port)
|
releasePort(app.port)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Reset to stopped state (or invalid if no longer valid)
|
// Reset to stopped state (or invalid if error or no longer valid)
|
||||||
app.state = isApp(dir) ? 'stopped' : 'invalid'
|
app.state = (isApp(dir) && !app.error) ? 'stopped' : 'invalid'
|
||||||
app.proc = undefined
|
app.proc = undefined
|
||||||
app.port = undefined
|
app.port = undefined
|
||||||
app.started = undefined
|
app.started = undefined
|
||||||
|
|
@ -733,6 +804,38 @@ function startHealthChecks(app: App, port: number) {
|
||||||
}, HEALTH_CHECK_INTERVAL)
|
}, HEALTH_CHECK_INTERVAL)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Starts periodic liveness checks for a non-HTTP app.
 *
 * Every `HEALTH_CHECK_INTERVAL` the app's pid is looked up with `ps -p <pid>`.
 * Exit code 0 resets `consecutiveHealthFailures`; a missing pid, a non-zero
 * exit code, or a failure to run ps is reported through
 * `handleHealthCheckFailure`, exactly once per tick.
 *
 * The interval stops itself as soon as the app is no longer 'running'. A
 * result that arrives after the app left 'running' or changed pid is dropped,
 * because it describes a process the app no longer owns.
 *
 * @param app - The app whose process is monitored; its `healthCheckTimer` is
 *   replaced by the new interval handle.
 */
function startProcessHealthChecks(app: App) {
  // Stop a previously registered interval instead of orphaning it
  if (app.healthCheckTimer) {
    clearInterval(app.healthCheckTimer)
  }

  // For non-HTTP apps, just verify process is still alive using ps(1)
  const timer: ReturnType<typeof setInterval> = setInterval(async () => {
    if (app.state !== 'running') {
      // Stop this interval; only unset the app's handle if it is still ours
      clearInterval(timer)
      if (app.healthCheckTimer === timer) {
        app.healthCheckTimer = undefined
      }
      return
    }

    const pid = app.proc?.pid
    if (!pid) {
      handleHealthCheckFailure(app)
      return
    }

    let alive = false
    try {
      // Only the exit code matters, so ps output is discarded rather than piped
      const ps = Bun.spawn(['ps', '-p', String(pid)], { stdout: 'ignore', stderr: 'ignore' })
      alive = (await ps.exited) === 0
    } catch {
      alive = false
    }

    // The app may have stopped or been restarted while ps was running
    if (app.state !== 'running' || app.proc?.pid !== pid) return

    if (alive) {
      // Process is alive
      app.consecutiveHealthFailures = 0
    } else {
      handleHealthCheckFailure(app)
    }
  }, HEALTH_CHECK_INTERVAL)

  app.healthCheckTimer = timer
}
|
||||||
|
|
||||||
function startShutdownTimeout(app: App) {
|
function startShutdownTimeout(app: App) {
|
||||||
app.shutdownTimer = setTimeout(() => {
|
app.shutdownTimer = setTimeout(() => {
|
||||||
if (app.proc && (app.state === 'stopping' || app.state === 'running')) {
|
if (app.proc && (app.state === 'stopping' || app.state === 'running')) {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user