From a53db50b1ac2078c2117e94c936232db0b6e88f1 Mon Sep 17 00:00:00 2001 From: Corey Johnson Date: Sun, 12 Oct 2025 16:33:53 -0700 Subject: [PATCH] wip --- CLAUDE.md | 111 ++++++++++++++++++++++++++++- src/compiler/compiler.ts | 142 ++++++++++++++++++++++---------------- src/parser/parser.test.ts | 20 +++--- 3 files changed, 202 insertions(+), 71 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index a9564d0..00532ad 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -35,6 +35,31 @@ Shrimp is a shell-like scripting language that combines command-line simplicity Key references: [Lezer System Guide](https://lezer.codemirror.net/docs/guide/) | [Lezer API](https://lezer.codemirror.net/docs/ref/) +## Reading the Codebase: What to Look For + +When exploring Shrimp, focus on these key files in order: + +1. **src/parser/shrimp.grammar** - Language syntax rules + + - Note the `expressionWithoutIdentifier` pattern and its comment + - See how `consumeToTerminator` handles statement-level parsing + +2. **src/parser/tokenizer.ts** - How Identifier vs Word is determined + + - Check the emoji Unicode ranges and surrogate pair handling + - See context-aware termination logic (`;`, `)`, `:`) + +3. **src/compiler/compiler.ts** - CST to bytecode transformation + + - See how functions become labels in `fnLabels` map + - Check short-circuit logic for `and`/`or` (lines 267-282) + - Notice `TRY_CALL` emission for bare identifiers (line 152) + +4. **packages/ReefVM/src/vm.ts** - Bytecode execution + - See `TRY_CALL` fall-through to `CALL` (lines 357-375) + - Check `TRY_LOAD` string coercion (lines 135-145) + - Notice NOSE-style named parameter binding (lines 425-443) + ## Development Commands ### Running Files @@ -141,14 +166,69 @@ function parseExpression(input: string) { **Whitespace-sensitive parsing**: Spaces distinguish operators from identifiers (`x-1` vs `x - 1`). This enables natural shell-like syntax. 
-**Identifier vs Word tokenization**: Custom tokenizer determines if a token is an assignable identifier (lowercase/emoji start) or a non-assignable word (paths, URLs). This allows `./file.txt` without quotes. +**Identifier vs Word tokenization**: The custom tokenizer (tokenizer.ts) is sophisticated: -**Ambiguous identifier resolution**: Bare identifiers like `myVar` could be function calls or variable references. The parser creates `FunctionCallOrIdentifier` nodes, resolved at runtime. +- **Surrogate pair handling**: Processes emoji as full Unicode code points (lines 51-65) +- **Context-aware termination**: Stops at `;`, `)`, `:` only when followed by whitespace (lines 19-24) + - This allows `basename ./cool;` to parse correctly + - But `basename ./cool; 2` treats the semicolon as a terminator +- **GLR state checking**: Uses `stack.canShift(Word)` to decide whether to track identifier validity +- **Permissive Words**: Anything that's not an identifier is a Word (paths, URLs, @mentions, #hashtags) + +**Why this matters**: This complexity is what enables shell-like syntax. Without it, you'd need quotes around `./file.txt` or special handling for paths. + +**Identifier rules**: Must start with lowercase letter or emoji, can contain lowercase, digits, dashes, and emoji. + +**Word rules**: Everything else that isn't whitespace or a delimiter. + +**Ambiguous identifier resolution**: Bare identifiers like `myVar` could be function calls or variable references. The parser creates `FunctionCallOrIdentifier` nodes, resolved at runtime using the `TRY_CALL` opcode. 
+ +**How it works**: + +- The compiler emits `TRY_CALL varname` for bare identifiers (src/compiler/compiler.ts:152) +- ReefVM checks if the variable is a function at runtime (vm.ts:357-373) +- If it's a function, TRY_CALL intentionally falls through to the CALL opcode (no break statement) +- If it's not a function, or is undefined, it pushes the value/string and returns +- This runtime resolution enables shell-like "echo hello" without quotes + +**Unbound symbols become strings**: When `TRY_LOAD` encounters an undefined variable, it pushes the variable name as a string (vm.ts:135-145). This is a first-class language feature implemented as a VM opcode, not a parser trick. **Expression-oriented design**: Everything returns a value - commands, assignments, functions. This enables composition and functional patterns. **EOF handling**: The grammar uses `(statement | newlineOrSemicolon)+ eof?` to handle empty lines and end-of-file without infinite loops. +## Compiler Architecture + +**Function compilation strategy**: The compiler doesn't create inline function objects. Instead it: + +1. Generates unique labels (`.func_0`, `.func_1`) for each function body (compiler.ts:137) +2. Stores function body instructions in `fnLabels` map during compilation +3. Appends all function bodies to the end of bytecode with RETURN instructions (compiler.ts:36-41) +4. Emits `MAKE_FUNCTION` with parameters and label reference + +This approach keeps the main program linear and allows ReefVM to jump to function bodies by label. + +**Short-circuit logic**: ReefVM has no AND/OR opcodes. The compiler implements short-circuit evaluation using: + +```typescript +// For `a and b`: +LOAD a +DUP // Duplicate so we can return it if falsy +JUMP_IF_FALSE skip // If false, skip evaluating b +POP // Remove duplicate if we're continuing +LOAD b // Evaluate right side +skip: +``` + +See compiler.ts:267-282 for the full implementation. The `or` operator uses `JUMP_IF_TRUE` instead. The `skip` label above is illustrative: the compiler passes a numeric operand computed from the right-hand side (`rightInstructions.length + 1`) rather than a named label. 
+ +**If/else compilation**: The compiler combines numeric-offset and label-based jumps: + +- `JUMP_IF_FALSE` skips the then-block (using a numeric operand of `thenBlockInstructions.length + 1`) when the condition is false +- Each branch ends with `JUMP endLabel` to skip remaining branches +- The final label marks where all branches converge +- If there's no else branch, the compiler emits `PUSH null` as the default value + ## Grammar Development ### Grammar Structure @@ -206,6 +286,21 @@ The `toMatchTree` helper compares parser output with expected CST structure. **Empty line parsing**: The grammar structure `(statement | newlineOrSemicolon)+ eof?` allows proper empty line and EOF handling. +### Why expressionWithoutIdentifier Exists + +The grammar has an unusual pattern: `expressionWithoutIdentifier`. This exists to solve a GLR conflict: + +``` +consumeToTerminator { + ambiguousFunctionCall | // → FunctionCallOrIdentifier → Identifier + expression // → Identifier +} +``` + +Without `expressionWithoutIdentifier`, parsing `my-var` at statement level creates two paths that both want the Identifier token. The grammar comment (shrimp.grammar lines 157-164) explains we "gave up trying to use GLR to fix it." + +**The solution**: Remove Identifier from the `expression` path by creating `expressionWithoutIdentifier`, forcing standalone identifiers through `ambiguousFunctionCall`. This favors pragmatism over theoretical purity. + ## Testing Strategy ### Parser Tests (`src/parser/parser.test.ts`) @@ -259,3 +354,15 @@ When grammar isn't parsing correctly: 2. **Test simpler cases first** - build up from basic to complex 3. **Use `toMatchTree` output** - see what the parser actually produces 4. **Check external tokenizer** - identifier vs word logic in `tokenizers.ts` + +## Common Misconceptions + +**"The parser handles unbound symbols as strings"** → False. The _VM_ does this via the `TRY_LOAD` opcode. The parser creates `FunctionCallOrIdentifier` nodes; the compiler emits `TRY_LOAD`/`TRY_CALL`; the VM resolves at runtime. + +**"Words are just paths"** → False. 
Words are _anything_ that isn't an identifier. Paths, URLs, `@mentions`, `#hashtags` all parse as Words. The tokenizer accepts any non-whitespace that doesn't match identifier rules. + +**"Functions are first-class values"** → True, but they're compiled to labels, not inline bytecode. The VM creates closures with label references, not embedded instructions. + +**"The grammar is simple"** → False. It has pragmatic workarounds for GLR conflicts (`expressionWithoutIdentifier`), complex EOF handling, and relies heavily on the external tokenizer for correctness. + +**"Short-circuit logic is a VM feature"** → False. It's a compiler pattern using `DUP`, `JUMP_IF_FALSE/TRUE`, and `POP`. The VM has no AND/OR opcodes. diff --git a/src/compiler/compiler.ts b/src/compiler/compiler.ts index eef5396..94fcda6 100644 --- a/src/compiler/compiler.ts +++ b/src/compiler/compiler.ts @@ -3,7 +3,7 @@ import { parser } from '#parser/shrimp.ts' import * as terms from '#parser/shrimp.terms' import type { SyntaxNode, Tree } from '@lezer/common' import { assert, errorMessage } from '#utils/utils' -import { toBytecode, type Bytecode } from 'reefvm' +import { toBytecode, type Bytecode, type ProgramItem } from 'reefvm' import { checkTreeForErrors, getAllChildren, @@ -15,9 +15,10 @@ import { getNamedArgParts, } from '#compiler/utils' +type Label = `.${string}` export class Compiler { - instructions: string[] = [] - fnLabels = new Map() + instructions: ProgramItem[] = [] + fnLabels = new Map() ifLabelCount = 0 bytecode: Bytecode @@ -34,13 +35,14 @@ export class Compiler { // Add the labels for (const [label, labelInstructions] of this.fnLabels) { - this.instructions.push(`${label}:`) - this.instructions.push(...labelInstructions.map((instr) => ` ${instr}`)) - this.instructions.push(' RETURN') + this.instructions.push([`${label}:`]) + this.instructions.push(...labelInstructions) + this.instructions.push(['RETURN']) } - // console.log(`\n🤖 
instructions:\n----------------\n${this.instructions.join('\n')}\n\n`) - this.bytecode = toBytecode(this.instructions.join('\n')) + // logInstructions(this.instructions) + + this.bytecode = toBytecode(this.instructions) } catch (error) { if (error instanceof CompilerError) { throw new Error(error.toReadableString(input)) @@ -60,46 +62,50 @@ export class Compiler { child = child.nextSibling } - this.instructions.push('HALT') + this.instructions.push(['HALT']) } - #compileNode(node: SyntaxNode, input: string): string[] { + #compileNode(node: SyntaxNode, input: string): ProgramItem[] { const value = input.slice(node.from, node.to) switch (node.type.id) { case terms.Number: - return [`PUSH ${value}`] + const number = Number(value) + if (Number.isNaN(number)) + throw new CompilerError(`Invalid number literal: ${value}`, node.from, node.to) + + return [[`PUSH`, number]] case terms.String: const strValue = value.slice(1, -1).replace(/\\/g, '') - return [`PUSH "${strValue}"`] + return [[`PUSH`, strValue]] case terms.Boolean: { - return [`PUSH ${value}`] + return [[`PUSH`, value === 'true']] } case terms.Identifier: { - return [`TRY_LOAD ${value}`] + return [[`TRY_LOAD`, value]] } case terms.BinOp: { const { left, op, right } = getBinaryParts(node) - const instructions: string[] = [] + const instructions: ProgramItem[] = [] instructions.push(...this.#compileNode(left, input)) instructions.push(...this.#compileNode(right, input)) const opValue = input.slice(op.from, op.to) switch (opValue) { case '+': - instructions.push('ADD') + instructions.push(['ADD']) break case '-': - instructions.push('SUB') + instructions.push(['SUB']) break case '*': - instructions.push('MUL') + instructions.push(['MUL']) break case '/': - instructions.push('DIV') + instructions.push(['DIV']) break default: throw new CompilerError(`Unsupported binary operator: ${opValue}`, op.from, op.to) @@ -110,10 +116,10 @@ export class Compiler { case terms.Assign: { const { identifier, right } = 
getAssignmentParts(node) - const instructions: string[] = [] + const instructions: ProgramItem[] = [] instructions.push(...this.#compileNode(right, input)) const identifierName = input.slice(identifier.from, identifier.to) - instructions.push(`STORE ${identifierName}`) + instructions.push(['STORE', identifierName]) return instructions } @@ -127,23 +133,23 @@ export class Compiler { case terms.FunctionDef: { const { paramNames, bodyNode } = getFunctionDefParts(node, input) - const instructions: string[] = [] - const functionName = `.func_${this.fnLabels.size}` - const bodyInstructions: string[] = [] - if (this.fnLabels.has(functionName)) { - throw new CompilerError(`Function name collision: ${functionName}`, node.from, node.to) + const instructions: ProgramItem[] = [] + const functionLabel: Label = `.func_${this.fnLabels.size}` + const bodyInstructions: ProgramItem[] = [] + if (this.fnLabels.has(functionLabel)) { + throw new CompilerError(`Function name collision: ${functionLabel}`, node.from, node.to) } - this.fnLabels.set(functionName, bodyInstructions) + this.fnLabels.set(functionLabel, bodyInstructions) - instructions.push(`MAKE_FUNCTION (${paramNames}) ${functionName}`) + instructions.push(['MAKE_FUNCTION', paramNames, functionLabel]) bodyInstructions.push(...this.#compileNode(bodyNode, input)) return instructions } case terms.FunctionCallOrIdentifier: { - return [`TRY_CALL ${value}`] + return [['TRY_CALL', value]] } /* @@ -161,7 +167,7 @@ export class Compiler { */ case terms.FunctionCall: { const { identifierNode, namedArgs, positionalArgs } = getFunctionCallParts(node, input) - const instructions: string[] = [] + const instructions: ProgramItem[] = [] instructions.push(...this.#compileNode(identifierNode, input)) positionalArgs.forEach((arg) => { @@ -170,13 +176,13 @@ export class Compiler { namedArgs.forEach((arg) => { const { name, valueNode } = getNamedArgParts(arg, input) - instructions.push(`PUSH "${name}"`) + instructions.push(['PUSH', name]) 
instructions.push(...this.#compileNode(valueNode, input)) }) - instructions.push(`PUSH ${positionalArgs.length}`) - instructions.push(`PUSH ${namedArgs.length}`) - instructions.push(`CALL`) + instructions.push(['PUSH', positionalArgs.length]) + instructions.push(['PUSH', namedArgs.length]) + instructions.push(['CALL']) return instructions } @@ -193,86 +199,84 @@ export class Compiler { node, input ) - const instructions: string[] = [] + const instructions: ProgramItem[] = [] instructions.push(...this.#compileNode(conditionNode, input)) this.ifLabelCount++ - const elseLabel = `.else_${this.ifLabelCount}` - const endLabel = `.end_${this.ifLabelCount}` + const endLabel: Label = `.end_${this.ifLabelCount}` const thenBlockInstructions = this.#compileNode(thenBlock, input) - instructions.push(`JUMP_IF_FALSE #${thenBlockInstructions.length + 1}`) + instructions.push(['JUMP_IF_FALSE', thenBlockInstructions.length + 1]) instructions.push(...thenBlockInstructions) - instructions.push(`JUMP ${endLabel}`) + instructions.push(['JUMP', endLabel]) // Else if - elseIfBlocks.forEach(({ conditional, thenBlock }, index) => { + elseIfBlocks.forEach(({ conditional, thenBlock }) => { instructions.push(...this.#compileNode(conditional, input)) const elseIfInstructions = this.#compileNode(thenBlock, input) - instructions.push(`JUMP_IF_FALSE #${elseIfInstructions.length + 1}`) + instructions.push(['JUMP_IF_FALSE', elseIfInstructions.length + 1]) instructions.push(...elseIfInstructions) - instructions.push(`JUMP ${endLabel}`) + instructions.push(['JUMP', endLabel]) }) // Else - instructions.push(`${elseLabel}:`) if (elseThenBlock) { - const elseThenInstructions = this.#compileNode(elseThenBlock, input).map((i) => ` ${i}`) + const elseThenInstructions = this.#compileNode(elseThenBlock, input) instructions.push(...elseThenInstructions) } else { - instructions.push(` PUSH null`) + instructions.push(['PUSH', null]) } - instructions.push(`${endLabel}:`) + instructions.push([`${endLabel}:`]) 
return instructions } // - `EQ`, `NEQ`, `LT`, `GT`, `LTE`, `GTE` - Pop 2, push boolean case terms.ConditionalOp: { - const instructions: string[] = [] + const instructions: ProgramItem[] = [] const { left, op, right } = getBinaryParts(node) - const leftInstructions: string[] = this.#compileNode(left, input) - const rightInstructions: string[] = this.#compileNode(right, input) + const leftInstructions: ProgramItem[] = this.#compileNode(left, input) + const rightInstructions: ProgramItem[] = this.#compileNode(right, input) const opValue = input.slice(op.from, op.to) switch (opValue) { case '=': - instructions.push(...leftInstructions, ...rightInstructions, 'EQ') + instructions.push(...leftInstructions, ...rightInstructions, ['EQ']) break case '!=': - instructions.push(...leftInstructions, ...rightInstructions, 'NEQ') + instructions.push(...leftInstructions, ...rightInstructions, ['NEQ']) break case '<': - instructions.push(...leftInstructions, ...rightInstructions, 'LT') + instructions.push(...leftInstructions, ...rightInstructions, ['LT']) break case '>': - instructions.push(...leftInstructions, ...rightInstructions, 'GT') + instructions.push(...leftInstructions, ...rightInstructions, ['GT']) break case '<=': - instructions.push(...leftInstructions, ...rightInstructions, 'LTE') + instructions.push(...leftInstructions, ...rightInstructions, ['LTE']) break case '>=': - instructions.push(...leftInstructions, ...rightInstructions, 'GTE') + instructions.push(...leftInstructions, ...rightInstructions, ['GTE']) break case 'and': instructions.push(...leftInstructions) - instructions.push('DUP') - instructions.push(`JUMP_IF_FALSE #${rightInstructions.length + 1}`) - instructions.push('POP') + instructions.push(['DUP']) + instructions.push(['JUMP_IF_FALSE', rightInstructions.length + 1]) + instructions.push(['POP']) instructions.push(...rightInstructions) break case 'or': instructions.push(...leftInstructions) - instructions.push('PUSH 9') - instructions.push(`JUMP_IF_TRUE 
#${rightInstructions.length + 1}`) - instructions.push('POP') + instructions.push(['DUP']) + instructions.push(['JUMP_IF_TRUE', rightInstructions.length + 1]) + instructions.push(['POP']) instructions.push(...rightInstructions) break @@ -289,3 +293,19 @@ export class Compiler { } } } + +const logInstructions = (instructions: ProgramItem[]) => { + const instructionsString = instructions + .map((parts) => { + const isPush = parts[0] === 'PUSH' + return parts + .map((part, i) => { + const partAsString = typeof part == 'string' && isPush ? `'${part}'` : part!.toString() + return i > 0 ? partAsString : part + }) + .join(' ') + }) + .join('\n') + + console.log(`\n🤖 instructions:\n----------------\n${instructionsString}\n\n`) +} diff --git a/src/parser/parser.test.ts b/src/parser/parser.test.ts index 2adb6c9..bd7074b 100644 --- a/src/parser/parser.test.ts +++ b/src/parser/parser.test.ts @@ -245,16 +245,17 @@ describe('BinOp', () => { describe('Fn', () => { test('parses function no parameters', () => { - expect('fn: 1').toMatchTree(` + expect('fn: 1 end').toMatchTree(` FunctionDef keyword fn Params colon : - Number 1`) + Number 1 + end end`) }) test('parses function with single parameter', () => { - expect('fn x: x + 1').toMatchTree(` + expect('fn x: x + 1 end').toMatchTree(` FunctionDef keyword fn Params @@ -263,11 +264,12 @@ describe('Fn', () => { BinOp Identifier x operator + - Number 1`) + Number 1 + end end`) }) test('parses function with multiple parameters', () => { - expect('fn x y: x * y').toMatchTree(` + expect('fn x y: x * y end').toMatchTree(` FunctionDef keyword fn Params @@ -277,7 +279,8 @@ describe('Fn', () => { BinOp Identifier x operator * - Identifier y`) + Identifier y + end end`) }) test('parses multiline function with multiple statements', () => { @@ -381,7 +384,7 @@ describe('Assign', () => { }) test('parses assignment with functions', () => { - expect('add = fn a b: a + b').toMatchTree(` + expect('add = fn a b: a + b end').toMatchTree(` Assign 
Identifier add operator = @@ -394,7 +397,8 @@ describe('Assign', () => { BinOp Identifier a operator + - Identifier b`) + Identifier b + end end`) }) })