Compare commits

..

4 Commits

Author SHA1 Message Date
1015e20cf9 [cron] reload jobs on renames/deploys 2026-02-15 08:44:48 -08:00
565f4924e8 we do it better now 2026-02-15 08:43:58 -08:00
c49cc2e078 tailscale docs 2026-02-15 08:37:21 -08:00
bf14ba4ba1 new event API 2026-02-15 08:36:58 -08:00
13 changed files with 276 additions and 379 deletions

View File

@ -138,7 +138,10 @@ The server sets these on each app process: `PORT`, `APPS_DIR`, `TOES_URL`, `TOES
### SSE Streaming ### SSE Streaming
`/api/apps/stream` pushes the full app list on every state change. Client reconnects automatically. The `onChange()` callback system in `apps.ts` notifies listeners. Two SSE endpoints serve different consumers:
- `/api/apps/stream` -- Full app state snapshots on every change. Used by the dashboard UI. Driven by `onChange()` in `apps.ts`.
- `/api/events/stream` -- Discrete lifecycle events (`app:start`, `app:stop`, `app:activate`, `app:create`, `app:delete`). Used by app processes to react to other apps' lifecycle changes. Driven by `emit()`/`onEvent()` in `apps.ts`. Apps subscribe via `on()` from `@because/toes/tools`.
## Coding Guidelines ## Coding Guidelines

374
PID.md
View File

@ -1,374 +0,0 @@
# PID File Tracking for Robust Process Management
## Problem Statement
When the Toes host process crashes unexpectedly (OOM, SIGKILL, power loss, kernel panic), child app processes continue running as orphans. On restart, Toes has no knowledge of these processes:
- **Port conflicts**: Orphans hold ports, new instances fail to bind
- **Resource waste**: Orphaned processes keep running and consume memory/CPU
- **State confusion**: App appears "stopped" but is actually running
- **Data corruption**: Multiple instances may write to same files
Currently, Toes only handles graceful shutdown (SIGTERM/SIGINT). There's no recovery mechanism for ungraceful termination.
## Proposed Solution: PID File Tracking
### Design
Store PID files in `TOES_DIR/pids/`:
```
${TOES_DIR}/pids/
clock.pid # Contains: 12345
todo.pid # Contains: 12389
weather.pid # Contains: 12402
```
### Lifecycle
```
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ App Start │────▶│ Write PID │────▶│ Running │
└─────────────┘ └─────────────┘ └─────────────┘
┌─────────────┐ │
│ Delete PID │◀──────────┘
└─────────────┘ App Exit
```
On host startup:
```
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ Host Init │────▶│ Scan PIDs │────▶│Kill Orphans │
└─────────────┘ └─────────────┘ └─────────────┘
┌─────────────┐
│Clean Stale │
│ PID Files │
└─────────────┘
```
### Implementation
#### 1. PID Directory Setup
```typescript
const PIDS_DIR = join(TOES_DIR, 'pids')
/**
 * Makes sure the PID directory exists, creating it (and any missing parent
 * directories) when it is absent. Does nothing if the path is already there.
 */
function ensurePidsDir() {
  if (existsSync(PIDS_DIR)) return
  mkdirSync(PIDS_DIR, { recursive: true })
}
```
#### 2. Write PID on Start
In `runApp()`, after spawning:
```typescript
const proc = Bun.spawn(['bun', 'run', 'toes'], { ... })
app.proc = proc
// Write PID file
const pidFile = join(PIDS_DIR, `${dir}.pid`)
writeFileSync(pidFile, String(proc.pid))
```
#### 3. Delete PID on Exit
In the `proc.exited.then()` handler:
```typescript
proc.exited.then(code => {
// Remove PID file
const pidFile = join(PIDS_DIR, `${dir}.pid`)
if (existsSync(pidFile)) {
unlinkSync(pidFile)
}
// ... existing cleanup
})
```
#### 4. Orphan Cleanup on Startup
New function called during `initApps()`:
```typescript
/**
 * Terminates app processes left behind by a previous host run and removes
 * their PID files. Meant to be called once while the host starts up.
 *
 * For every `*.pid` file in PIDS_DIR:
 * - contents that are not a plain positive integer: logged as invalid
 * - PID of a live process: SIGTERM is sent, followed by SIGKILL after
 *   5 seconds if the process is still running
 * - in all cases the PID file is removed afterwards
 *
 * A failure while handling one file is logged and does not stop the scan.
 */
function cleanupOrphanProcesses() {
  ensurePidsDir()

  for (const file of readdirSync(PIDS_DIR)) {
    if (!file.endsWith('.pid')) continue

    // Strip only the trailing extension; String.replace would remove the
    // first '.pid' it finds, which may sit in the middle of the name.
    const appName = file.slice(0, -'.pid'.length)
    const pidFile = join(PIDS_DIR, file)

    try {
      const raw = readFileSync(pidFile, 'utf-8').trim()
      // Digits only: parseInt would accept '123abc', and a 0 or negative
      // value passed to process.kill addresses a whole process group.
      const pid = /^\d+$/.test(raw) ? Number(raw) : NaN

      if (!Number.isSafeInteger(pid) || pid <= 0) {
        // Invalid PID file, removed below
        hostLog(`Removed invalid PID file: ${file}`)
      } else if (pid !== process.pid && isProcessRunning(pid)) {
        // Orphan found - kill it. (A recycled PID equal to our own is skipped
        // so the host never signals itself.)
        hostLog(`Found orphan process for ${appName} (PID ${pid}), terminating...`)
        try {
          process.kill(pid, 'SIGTERM')
          // Give it 5 seconds, then SIGKILL
          setTimeout(() => {
            if (!isProcessRunning(pid)) return
            hostLog(`Orphan ${appName} (PID ${pid}) didn't terminate, sending SIGKILL`)
            try {
              process.kill(pid, 'SIGKILL')
            } catch (e) {
              // Process may have exited between check and kill; an uncaught
              // throw inside a timer would take the host down.
              hostLog(`Failed to kill orphan ${appName}: ${e}`)
            }
          }, 5000)
        } catch (e) {
          // Process may have exited between check and kill
          hostLog(`Failed to kill orphan ${appName}: ${e}`)
        }
      }
    } catch (e) {
      hostLog(`Failed to process PID file ${file}: ${e}`)
    }

    // Remove stale PID file
    try {
      unlinkSync(pidFile)
    } catch (e) {
      hostLog(`Failed to remove PID file ${file}: ${e}`)
    }
  }
}
/**
 * Reports whether a process with the given PID currently exists.
 *
 * @param pid - Process ID to probe. Values that are not positive integers
 *   return `false` without probing, because 0 and negative numbers address
 *   process groups rather than a single process.
 * @returns `true` when the process exists, including when it exists but the
 *   current user is not permitted to signal it.
 */
function isProcessRunning(pid: number): boolean {
  if (!Number.isInteger(pid) || pid <= 0) return false

  try {
    // Sending signal 0 checks if process exists without killing it
    process.kill(pid, 0)
    return true
  } catch (e: unknown) {
    // EPERM: the process is there, we just may not signal it. Anything else
    // (typically ESRCH) means there is no such process.
    return typeof e === 'object' && e !== null && (e as { code?: unknown }).code === 'EPERM'
  }
}
```
#### 5. Integration Point
Update `initApps()`:
```typescript
/**
 * Host startup sequence. The calls run in the order listed; orphan cleanup is
 * placed ahead of app discovery and launch.
 */
export function initApps() {
initPortPool()
setupShutdownHandlers()
// Clear leftovers from a previous host run before anything is started:
// orphaned processes still hold their ports, so new instances would fail to bind.
cleanupOrphanProcesses() // <-- Add here, before discovery
rotateLogs()
createAppSymlinks()
discoverApps()
runApps()
}
```
### Edge Cases
| Scenario | Handling |
|----------|----------|
| PID reused by OS | Check if process command matches expected pattern before killing |
| PID file corrupted | Delete invalid files, log warning |
| Multiple Toes instances | Use file locking or instance ID in PID path |
| App renamed while running | Old PID file orphaned; cleanup handles it |
| Permission denied on kill | Log error, continue with other orphans |
### Enhanced: Validate Process Identity
To avoid killing an unrelated process that reused the PID:
```typescript
/**
 * Heuristic check that a PID belongs to a Toes app rather than an unrelated
 * process that happens to have received the same PID.
 *
 * Asks `ps` for the process's argument string and accepts it when that string
 * mentions both "bun" and "toes".
 *
 * NOTE(review): `appName` is accepted but not consulted by the check.
 *
 * @param pid - Process ID to inspect.
 * @param appName - Name of the app the PID file belonged to.
 * @returns `true` if the command line looks like a Toes app process; `false`
 *   otherwise, including when the lookup itself throws.
 */
function isOurProcess(pid: number, appName: string): boolean {
  let commandLine: string

  try {
    // `args=` prints the full command line with no header row
    const ps = Bun.spawnSync(['ps', '-p', String(pid), '-o', 'args='])
    commandLine = new TextDecoder().decode(ps.stdout).trim()
  } catch {
    return false
  }

  // Check if it looks like a Toes app process
  return ['bun', 'toes'].every(token => commandLine.includes(token))
}
```
---
## Related Recommendations
### 1. Store Port in PID File
Extend PID files to include port for faster recovery:
```
# clock.pid
12345
3001
```
Or use JSON:
```json
{"pid": 12345, "port": 3001, "started": 1706900000000}
```
This allows Toes to reclaim the exact port on restart, avoiding port shuffling.
### 2. Circuit Breaker for Crash Loops
Add crash tracking to prevent infinite restart loops:
```typescript
/** One recorded crash of an app. */
interface CrashRecord {
  /** When the crash was recorded (value of Date.now()). */
  timestamp: number
  /** Exit code the app terminated with. */
  exitCode: number
}

// Store in TOES_DIR/crashes/<app>.json
const CRASH_WINDOW = 3600000 // 1 hour
const MAX_CRASHES = 10

/** Directory that holds one crash-history file per app. */
function crashesDir(): string {
  return join(TOES_DIR, 'crashes')
}

/** Path of the JSON file holding the crash history of `appName`. */
function crashFile(appName: string): string {
  return join(crashesDir(), `${appName}.json`)
}

/** Runtime check that a parsed JSON value has the CrashRecord shape. */
function isCrashRecord(value: unknown): value is CrashRecord {
  if (typeof value !== 'object' || value === null) return false
  const { timestamp, exitCode } = value as Record<string, unknown>
  return typeof timestamp === 'number' && typeof exitCode === 'number'
}

/**
 * Loads the crashes of `appName` that are newer than `cutoff`.
 * A missing, unreadable, or malformed history file yields an empty list so a
 * damaged file can never break crash handling.
 */
function readRecentCrashes(appName: string, cutoff: number): CrashRecord[] {
  const file = crashFile(appName)
  if (!existsSync(file)) return []

  try {
    const parsed: unknown = JSON.parse(readFileSync(file, 'utf-8'))
    if (!Array.isArray(parsed)) return []
    return parsed.filter(isCrashRecord).filter(c => c.timestamp > cutoff)
  } catch {
    return []
  }
}

/**
 * Appends a crash to the app's history, drops entries older than
 * CRASH_WINDOW, and persists the result.
 *
 * @param appName - App whose crash is being recorded.
 * @param exitCode - Exit code of the crashed process.
 * @returns Number of crashes inside the window, including this one.
 */
function recordCrash(appName: string, exitCode: number) {
  // Sample the clock once so the new entry and the cutoff agree
  const now = Date.now()
  const recent = readRecentCrashes(appName, now - CRASH_WINDOW)

  // Add new crash
  recent.push({ timestamp: now, exitCode })

  // The directory does not exist before the first crash is recorded
  mkdirSync(crashesDir(), { recursive: true })
  writeFileSync(crashFile(appName), JSON.stringify(recent))
  return recent.length
}

/**
 * Tells whether the app crashed often enough recently that restarts should
 * stop.
 *
 * @param appName - App to check.
 * @returns `true` once MAX_CRASHES or more crashes fall inside CRASH_WINDOW.
 */
function shouldCircuitBreak(appName: string): boolean {
  const cutoff = Date.now() - CRASH_WINDOW
  return readRecentCrashes(appName, cutoff).length >= MAX_CRASHES
}
```
### 3. Track Restart Timer for Cancellation
Store scheduled restart timers on the app object:
```typescript
export type App = SharedApp & {
// ... existing fields
restartTimer?: Timer // <-- Add this
}
```
Update `scheduleRestart()`:
```typescript
function scheduleRestart(app: App, dir: string) {
// Cancel any existing scheduled restart
if (app.restartTimer) {
clearTimeout(app.restartTimer)
}
// ... existing delay calculation ...
app.restartTimer = setTimeout(() => {
app.restartTimer = undefined
// ... existing restart logic
}, delay)
}
```
Update `clearTimers()`:
```typescript
const clearTimers = (app: App) => {
// ... existing timer cleanup ...
if (app.restartTimer) {
clearTimeout(app.restartTimer)
app.restartTimer = undefined
}
}
```
### 4. Exit Code Classification
```typescript
/**
 * Maps a process exit code to the action the supervisor should take.
 *
 * @param code - Exit code, or `null` when the process was killed by a signal.
 * @returns `'stop'` for a clean exit (0) or SIGTERM (128 + 15), `'invalid'`
 *   for exit code 2 (bad arguments/config), and `'restart'` for everything
 *   else, including `null` and SIGKILL (128 + 9).
 */
function classifyExit(code: number | null): 'restart' | 'invalid' | 'stop' {
  // Shells report death-by-signal as 128 + signal number
  const SIGNAL_EXIT_BASE = 128
  const SIGTERM = 15

  switch (code) {
    case 0: // Clean exit
    case SIGNAL_EXIT_BASE + SIGTERM: // SIGTERM (intentional)
      return 'stop'
    case 2: // Bad arguments/config
      return 'invalid'
    default: // null, SIGKILL (OOM?), and any other failure: try again
      return 'restart'
  }
}
```
### 5. Install Timeout
Wrap `bun install` with a timeout:
```typescript
/**
 * Runs `bun install` in `cwd` and gives up if it takes longer than `timeout`.
 *
 * @param cwd - Directory to run the install in.
 * @param timeout - Maximum time to wait, in milliseconds (default 60000).
 * @returns `true` if the install exited with code 0; `false` if it failed or
 *   was killed because the timeout elapsed.
 */
async function installWithTimeout(cwd: string, timeout = 60000): Promise<boolean> {
  // NOTE(review): stdout/stderr are piped but never read here - confirm the
  // runtime does not stall the child once a pipe buffer fills up.
  const install = Bun.spawn(['bun', 'install'], {
    cwd,
    stdout: 'pipe',
    stderr: 'pipe'
  })

  let timer: ReturnType<typeof setTimeout> | undefined
  const timeoutPromise = new Promise<never>((_, reject) => {
    timer = setTimeout(() => {
      install.kill()
      reject(new Error('Install timeout'))
    }, timeout)
  })

  try {
    await Promise.race([install.exited, timeoutPromise])
    return install.exitCode === 0
  } catch {
    return false
  } finally {
    // Without this the timer stays armed after a fast install and later
    // fires kill() on a process that has already exited.
    clearTimeout(timer)
  }
}
```
---
## Implementation Priority
| Change | Effort | Impact | Priority |
|--------|--------|--------|----------|
| PID file tracking | Medium | High | **1** |
| Orphan cleanup on startup | Medium | High | **1** |
| Track restart timer | Low | Medium | **2** |
| Install timeout | Low | Medium | **2** |
| Circuit breaker | Medium | Medium | **3** |
| Exit code classification | Low | Low | **4** |
| Process identity validation | Medium | Low | **5** |
---
## Testing Checklist
- [ ] Host crashes while apps running → orphans cleaned on restart
- [ ] App crashes → PID file removed, restart scheduled
- [ ] App stopped manually → PID file removed, no restart
- [ ] Stale PID file (process gone) → file cleaned up
- [ ] PID reused by unrelated process → not killed (with identity check)
- [ ] Multiple rapid restarts → circuit breaker triggers
- [ ] Rename app while running → handled gracefully
- [ ] `bun install` hangs → times out, app marked failed

View File

@ -1,6 +1,6 @@
import { Hype } from '@because/hype' import { Hype } from '@because/hype'
import { define, stylesToCSS } from '@because/forge' import { define, stylesToCSS } from '@because/forge'
import { baseStyles, ToolScript, theme } from '@because/toes/tools' import { baseStyles, on, ToolScript, theme } from '@because/toes/tools'
import { discoverCronJobs } from './lib/discovery' import { discoverCronJobs } from './lib/discovery'
import { scheduleJob, stopJob } from './lib/scheduler' import { scheduleJob, stopJob } from './lib/scheduler'
import { executeJob } from './lib/executor' import { executeJob } from './lib/executor'
@ -691,6 +691,11 @@ watch(APPS_DIR, { recursive: true }, (_event, filename) => {
debounceTimer = setTimeout(rediscover, 100) debounceTimer = setTimeout(rediscover, 100)
}) })
// Re-scan for cron jobs whenever the host reports that an app was activated
// or deleted.
on(['app:activate', 'app:delete'], ({ type, app: appName }) => {
  console.log(`[cron] ${type} ${appName}, rediscovering jobs...`)
  rediscover()
})
init() init()
export default app.defaults export default app.defaults

149
docs/TAILSCALE.md Normal file
View File

@ -0,0 +1,149 @@
# Tailscale
Connect your Toes appliance to your Tailscale network for secure access from anywhere.
Tailscale is pre-installed on the appliance but not configured. The user authenticates through the dashboard or CLI — no SSH required.
## how it works
1. User clicks "Connect to Tailscale" in the dashboard (or runs `toes tailscale connect`)
2. Toes runs `tailscale login` and captures the auth URL
3. Dashboard shows the URL and a QR code
4. User visits the URL and authenticates with Tailscale
5. Toes detects the connection, runs `tailscale serve --bg 80`
6. Appliance is now accessible at `https://<hostname>.<tailnet>.ts.net`
## dashboard
Settings area shows one of three states:
**Not connected:**
- "Connect to Tailscale" button
**Connecting:**
- Auth URL as a clickable link
- QR code for mobile
- Polls `tailscale status` until authenticated
**Connected:**
- Tailnet URL (clickable)
- Tailnet name
- Device hostname
- `tailscale serve` toggle
- "Disconnect" button
## cli
```bash
toes tailscale # show status
toes tailscale connect # start auth flow, print URL, wait
toes tailscale disconnect # log out of tailnet
toes tailscale serve # toggle tailscale serve on/off
```
### `toes tailscale`
```
Tailscale: connected
Tailnet: user@github
Hostname: toes.tail1234.ts.net
IP: 100.64.0.1
Serve: on (port 80)
```
Or when not connected:
```
Tailscale: not connected
Run `toes tailscale connect` to get started.
```
### `toes tailscale connect`
```
Visit this URL to authenticate:
https://login.tailscale.com/a/abc123
Waiting for authentication... done!
Connected to tailnet user@github
https://toes.tail1234.ts.net
```
## server api
All endpoints shell out to the `tailscale` CLI and parse output.
### `GET /api/tailscale`
Returns current status.
```json
{
"installed": true,
"connected": true,
"hostname": "toes",
"tailnetName": "user@github",
"url": "https://toes.tail1234.ts.net",
"ip": "100.64.0.1",
"serving": true
}
```
When not connected:
```json
{
"installed": true,
"connected": false
}
```
When tailscale isn't installed:
```json
{
"installed": false
}
```
### `POST /api/tailscale/connect`
Runs `tailscale login`. Returns the auth URL.
```json
{
"authUrl": "https://login.tailscale.com/a/abc123"
}
```
### `POST /api/tailscale/disconnect`
Runs `tailscale logout`.
### `POST /api/tailscale/serve`
Toggles `tailscale serve`. Body:
```json
{ "enabled": true }
```
## install
`scripts/install.sh` installs tailscale and enables the daemon, but does not authenticate:
```bash
curl -fsSL https://tailscale.com/install.sh | sh
sudo systemctl enable tailscaled
```
## permissions
The `toes` user needs passwordless sudo for tailscale commands. Add to sudoers during install:
```
toes ALL=(ALL) NOPASSWD: /usr/bin/tailscale
```
This lets the server run `sudo tailscale login`, `sudo tailscale serve`, etc. without a password prompt.

View File

@ -21,7 +21,8 @@ function convert(app: BackendApp): SharedApp {
return { ...rest, pid: proc?.pid } return { ...rest, pid: proc?.pid }
} }
// SSE endpoint for real-time app state updates // SSE: full app state snapshots for the dashboard UI (every state change)
// For discrete lifecycle events consumed by app processes, see /api/events/stream
router.sse('/stream', (send) => { router.sse('/stream', (send) => {
const broadcast = () => { const broadcast = () => {
const apps: SharedApp[] = allApps().map(({ const apps: SharedApp[] = allApps().map(({

14
src/server/api/events.ts Normal file
View File

@ -0,0 +1,14 @@
import { onEvent } from '$apps'
import { Hype } from '@because/hype'
const router = Hype.router()
// SSE: individual lifecycle events (app:start, app:stop, app:activate, ...)
// for app processes. /api/apps/stream sends full state snapshots to the
// dashboard; this endpoint instead forwards each event as it is emitted so
// apps can react to specific lifecycle changes. The unsubscribe function
// from onEvent() is handed back to the router.
router.sse('/stream', send => onEvent(event => send(event)))
export default router

View File

@ -1,4 +1,4 @@
import { APPS_DIR, allApps, registerApp, removeApp, restartApp, startApp } from '$apps' import { APPS_DIR, allApps, emit, registerApp, removeApp, restartApp, startApp } from '$apps'
import { computeHash, generateManifest } from '../sync' import { computeHash, generateManifest } from '../sync'
import { loadGitignore } from '@gitignore' import { loadGitignore } from '@gitignore'
import { cpSync, existsSync, mkdirSync, readdirSync, readFileSync, realpathSync, renameSync, rmSync, symlinkSync, unlinkSync, watch, writeFileSync } from 'fs' import { cpSync, existsSync, mkdirSync, readdirSync, readFileSync, realpathSync, renameSync, rmSync, symlinkSync, unlinkSync, watch, writeFileSync } from 'fs'
@ -330,6 +330,8 @@ router.post('/apps/:app/activate', async c => {
console.error(`Failed to clean up old versions: ${e}`) console.error(`Failed to clean up old versions: ${e}`)
} }
emit({ type: 'app:activate', app: appName, version })
// Register new app or restart existing // Register new app or restart existing
const app = allApps().find(a => a.name === appName) const app = allApps().find(a => a.name === appName)
if (!app) { if (!app) {

View File

@ -1,4 +1,5 @@
import type { App as SharedApp, AppState } from '@types' import type { App as SharedApp, AppState } from '@types'
import type { ToesEvent, ToesEventInput, ToesEventType } from '../shared/events'
import type { Subprocess } from 'bun' import type { Subprocess } from 'bun'
import { DEFAULT_EMOJI } from '@types' import { DEFAULT_EMOJI } from '@types'
import { appendFileSync, existsSync, mkdirSync, readdirSync, readFileSync, realpathSync, renameSync, symlinkSync, unlinkSync, writeFileSync } from 'fs' import { appendFileSync, existsSync, mkdirSync, readdirSync, readFileSync, realpathSync, renameSync, symlinkSync, unlinkSync, writeFileSync } from 'fs'
@ -31,6 +32,7 @@ const STARTUP_TIMEOUT = 30000
const _appPorts = new Map<string, number>() const _appPorts = new Map<string, number>()
const _apps = new Map<string, App>() const _apps = new Map<string, App>()
const _availablePorts: number[] = [] const _availablePorts: number[] = []
const _eventListeners = new Set<(event: ToesEvent) => void>()
const _listeners = new Set<() => void>() const _listeners = new Set<() => void>()
let _shuttingDown = false let _shuttingDown = false
@ -106,11 +108,22 @@ export async function initApps() {
runApps() runApps()
} }
/**
 * Broadcasts a lifecycle event to every callback registered with onEvent().
 *
 * The event is stamped with the current time once, so all listeners receive
 * the same timestamp. A listener that throws is logged and skipped; it
 * neither blocks the remaining listeners nor propagates into the caller.
 *
 * @param event - Event to publish, without its `time` field.
 */
export function emit(event: ToesEventInput) {
  // Cast: ToesEventInput is DistributiveOmit<ToesEvent, 'time'>, so adding time
  // back produces ToesEvent. TS can't prove this because spreads don't distribute.
  const stamped = { ...event, time: Date.now() } as ToesEvent

  for (const cb of _eventListeners) {
    try {
      cb(stamped)
    } catch (e) {
      console.error(`Event listener failed for ${stamped.type}: ${e}`)
    }
  }
}
export function onChange(cb: () => void) { export function onChange(cb: () => void) {
_listeners.add(cb) _listeners.add(cb)
return () => _listeners.delete(cb) return () => _listeners.delete(cb)
} }
/**
 * Registers a callback for lifecycle events published through emit().
 *
 * @param cb - Invoked with each emitted event.
 * @returns A function that removes the callback again.
 */
export function onEvent(cb: (event: ToesEvent) => void) {
  const unsubscribe = () => _eventListeners.delete(cb)
  _eventListeners.add(cb)
  return unsubscribe
}
export function removeApp(dir: string) { export function removeApp(dir: string) {
const app = _apps.get(dir) const app = _apps.get(dir)
if (!app) return if (!app) return
@ -130,6 +143,7 @@ export function removeApp(dir: string) {
_apps.delete(dir) _apps.delete(dir)
update() update()
emit({ type: 'app:delete', app: dir })
} }
export function registerApp(dir: string) { export function registerApp(dir: string) {
@ -141,6 +155,7 @@ export function registerApp(dir: string) {
const tool = pkg.toes?.tool const tool = pkg.toes?.tool
_apps.set(dir, { name: dir, state, icon, error, tool }) _apps.set(dir, { name: dir, state, icon, error, tool })
update() update()
emit({ type: 'app:create', app: dir })
if (!error) { if (!error) {
runApp(dir, getPort(dir)) runApp(dir, getPort(dir))
} }
@ -191,6 +206,8 @@ export async function renameApp(oldName: string, newName: string): Promise<{ ok:
renameTunnelConfig(oldName, newName) renameTunnelConfig(oldName, newName)
update() update()
emit({ type: 'app:delete', app: oldName })
emit({ type: 'app:create', app: newName })
// Restart if it was running // Restart if it was running
if (wasRunning) { if (wasRunning) {
@ -502,6 +519,7 @@ function markAsRunning(app: App, port: number, isHttpApp: boolean) {
app.started = Date.now() app.started = Date.now()
app.isHttpApp = isHttpApp app.isHttpApp = isHttpApp
update() update()
emit({ type: 'app:start', app: app.name })
openTunnelIfEnabled(app.name, port) openTunnelIfEnabled(app.name, port)
if (isHttpApp) { if (isHttpApp) {
@ -737,6 +755,7 @@ async function runApp(dir: string, port: number) {
app.port = undefined app.port = undefined
app.started = undefined app.started = undefined
update() update()
if (!_shuttingDown) emit({ type: 'app:stop', app: dir })
// Schedule restart if appropriate // Schedule restart if appropriate
if (shouldAutoRestart(app, code)) { if (shouldAutoRestart(app, code)) {

View File

@ -1,5 +1,6 @@
import { allApps, initApps, TOES_URL } from '$apps' import { allApps, initApps, TOES_URL } from '$apps'
import appsRouter from './api/apps' import appsRouter from './api/apps'
import eventsRouter from './api/events'
import syncRouter from './api/sync' import syncRouter from './api/sync'
import systemRouter from './api/system' import systemRouter from './api/system'
import { Hype } from '@because/hype' import { Hype } from '@because/hype'
@ -7,6 +8,7 @@ import { Hype } from '@because/hype'
const app = new Hype({ layout: false, logging: !!process.env.DEBUG }) const app = new Hype({ layout: false, logging: !!process.env.DEBUG })
app.route('/api/apps', appsRouter) app.route('/api/apps', appsRouter)
app.route('/api/events', eventsRouter)
app.route('/api/sync', syncRouter) app.route('/api/sync', syncRouter)
app.route('/api/system', systemRouter) app.route('/api/system', systemRouter)

17
src/shared/events.ts Normal file
View File

@ -0,0 +1,17 @@
/** Fields carried by every lifecycle event. */
interface BaseEvent {
  /** Name of the app the event is about. */
  app: string
  /** Time the event was emitted; the server fills this in with Date.now(). */
  time: number
}

/** A lifecycle event, discriminated by its `type` tag. */
export type ToesEvent =
  | BaseEvent & { type: 'app:activate'; version: string }
  | BaseEvent & { type: 'app:create' }
  | BaseEvent & { type: 'app:delete' }
  | BaseEvent & { type: 'app:start' }
  | BaseEvent & { type: 'app:stop' }

/**
 * Every event tag. Derived from ToesEvent so the list cannot drift out of
 * sync with the union when an event is added or removed.
 */
export type ToesEventType = ToesEvent['type']

/** Omit that is applied to each member of a union separately. */
type DistributiveOmit<T, K extends PropertyKey> = T extends unknown ? Omit<T, K> : never

/** Event as passed to emit(): everything except `time`, which is added on emit. */
export type ToesEventInput = DistributiveOmit<ToesEvent, 'time'>

57
src/tools/events.ts Normal file
View File

@ -0,0 +1,57 @@
import type { ToesEvent, ToesEventType } from '../shared/events'
export type { ToesEvent, ToesEventType }
// Signature of the handlers that apps register through on().
type EventCallback = (event: ToesEvent) => void
// One registration: the event types it is interested in and the handler to call.
interface Listener {
types: ToesEventType[]
callback: EventCallback
}
// All active registrations; on() adds and removes entries, and the stream's
// message handler dispatches to them.
const _listeners = new Set<Listener>()
// The single shared event-stream connection; undefined while none is open.
let _es: EventSource | undefined
/**
 * Opens the shared event stream to the Toes server unless a usable one
 * already exists.
 *
 * Incoming messages are parsed as JSON and handed to every registered
 * listener whose `types` include the event's type. If the stream ends up
 * CLOSED while listeners are still registered, a new connection is opened
 * after a short delay.
 */
function ensureConnection() {
  if (_es && _es.readyState !== EventSource.CLOSED) return
  if (_es) _es.close()

  // Pause before re-opening a closed stream so a server that keeps rejecting
  // the request is not hit in a tight loop.
  const reconnectDelayMs = 1000

  const url = `${process.env.TOES_URL}/api/events/stream`
  const es = new EventSource(url)
  _es = es

  es.onerror = () => {
    // Handlers look at their own instance, not whatever _es points to now
    if (es.readyState !== EventSource.CLOSED) return
    console.warn('[toes] Event stream closed, reconnecting...')
    if (_es === es) _es = undefined
    setTimeout(() => {
      if (_listeners.size > 0) ensureConnection()
    }, reconnectDelayMs)
  }

  es.onmessage = (msg) => {
    let parsed: unknown
    try {
      parsed = JSON.parse(msg.data)
    } catch (e) {
      console.warn('[toes] Failed to parse event:', e)
      return
    }

    if (
      typeof parsed !== 'object' ||
      parsed === null ||
      typeof (parsed as { type?: unknown }).type !== 'string'
    ) {
      console.warn('[toes] Failed to parse event:', parsed)
      return
    }
    const event = parsed as ToesEvent

    // Dispatch outside the parse try/catch: a failing listener is reported as
    // such and does not stop delivery to the others.
    for (const l of _listeners) {
      if (!l.types.includes(event.type)) continue
      try {
        l.callback(event)
      } catch (e) {
        console.error('[toes] Event listener failed:', e)
      }
    }
  }
}
/**
 * Subscribes to one or more lifecycle event types from the Toes server.
 *
 * Registers the listener and makes sure the shared event stream is open.
 *
 * @param type - A single event type or a list of types to listen for.
 * @param callback - Called with each matching event.
 * @returns A function that removes the subscription; when the last
 *   subscription is removed, the shared stream is closed.
 */
export function on(type: ToesEventType | ToesEventType[], callback: EventCallback): () => void {
  const types = Array.isArray(type) ? type : [type]
  const listener: Listener = { types, callback }

  _listeners.add(listener)
  ensureConnection()

  return function unsubscribe() {
    _listeners.delete(listener)
    // Keep the stream while anyone is still listening
    if (_listeners.size > 0 || !_es) return
    _es.close()
    _es = undefined
  }
}

View File

@ -1,3 +1,5 @@
export { theme } from '../client/themes' export { theme } from '../client/themes'
export { loadAppEnv } from './env' export { loadAppEnv } from './env'
export type { ToesEvent, ToesEventType } from './events'
export { on } from './events'
export { baseStyles, ToolScript } from './scripts.tsx' export { baseStyles, ToolScript } from './scripts.tsx'

View File

@ -1,5 +1,5 @@
{ {
"exclude": ["templates"], "exclude": ["apps", "templates"],
"compilerOptions": { "compilerOptions": {
// Environment setup & latest features // Environment setup & latest features
"lib": [ "lib": [