From a53db50b1ac2078c2117e94c936232db0b6e88f1 Mon Sep 17 00:00:00 2001
From: Corey Johnson <probablycorey@gmail.com>
Date: Sun, 12 Oct 2025 16:33:53 -0700
Subject: [PATCH] wip

---
 CLAUDE.md                 | 111 ++++++++++++++++++++++++++++-
 src/compiler/compiler.ts  | 142 ++++++++++++++++++++++----------------
 src/parser/parser.test.ts |  20 +++---
 3 files changed, 202 insertions(+), 71 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index a9564d0..00532ad 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -35,6 +35,31 @@ Shrimp is a shell-like scripting language that combines command-line simplicity
 
 Key references: [Lezer System Guide](https://lezer.codemirror.net/docs/guide/) | [Lezer API](https://lezer.codemirror.net/docs/ref/)
 
+## Reading the Codebase: What to Look For
+
+When exploring Shrimp, focus on these key files in order:
+
+1. **src/parser/shrimp.grammar** - Language syntax rules
+
+   - Note the `expressionWithoutIdentifier` pattern and its comment
+   - See how `consumeToTerminator` handles statement-level parsing
+
+2. **src/parser/tokenizer.ts** - How Identifier vs Word is determined
+
+   - Check the emoji Unicode ranges and surrogate pair handling
+   - See context-aware termination logic (`;`, `)`, `:`)
+
+3. **src/compiler/compiler.ts** - CST to bytecode transformation
+
+   - See how functions become labels in `fnLabels` map
+   - Check short-circuit logic for `and`/`or` (lines 267-282)
+   - Notice `TRY_CALL` emission for bare identifiers (line 152)
+
+4. **packages/ReefVM/src/vm.ts** - Bytecode execution
+   - See `TRY_CALL` fall-through to `CALL` (lines 357-375)
+   - Check `TRY_LOAD` string coercion (lines 135-145)
+   - Notice NOSE-style named parameter binding (lines 425-443)
+
 ## Development Commands
 
 ### Running Files
@@ -141,14 +166,69 @@ function parseExpression(input: string) {
 
 **Whitespace-sensitive parsing**: Spaces distinguish operators from identifiers (`x-1` vs `x - 1`). This enables natural shell-like syntax.
 
-**Identifier vs Word tokenization**: Custom tokenizer determines if a token is an assignable identifier (lowercase/emoji start) or a non-assignable word (paths, URLs). This allows `./file.txt` without quotes.
+**Identifier vs Word tokenization**: The custom tokenizer (tokenizer.ts) is sophisticated:
 
-**Ambiguous identifier resolution**: Bare identifiers like `myVar` could be function calls or variable references. The parser creates `FunctionCallOrIdentifier` nodes, resolved at runtime.
+- **Surrogate pair handling**: Processes emoji as full Unicode code points (lines 51-65)
+- **Context-aware termination**: Stops at `;`, `)`, `:` only when followed by whitespace (lines 19-24)
+  - This allows `basename ./cool;` to parse correctly
+  - But `basename ./cool; 2` treats the semicolon as a terminator
+- **GLR state checking**: Uses `stack.canShift(Word)` to decide whether to track identifier validity
+- **Permissive Words**: Anything that's not an identifier is a Word (paths, URLs, @mentions, #hashtags)
+
+**Why this matters**: This complexity is what enables shell-like syntax. Without it, you'd need quotes around `./file.txt` or special handling for paths.
+
+**Identifier rules**: Must start with lowercase letter or emoji, can contain lowercase, digits, dashes, and emoji.
+
+**Word rules**: Everything else that isn't whitespace or a delimiter.
+
+**Ambiguous identifier resolution**: Bare identifiers like `my-var` could be function calls or variable references. The parser creates `FunctionCallOrIdentifier` nodes, resolved at runtime using the `TRY_CALL` opcode.
+
+**How it works**:
+
+- The compiler emits `TRY_CALL varname` for bare identifiers (src/compiler/compiler.ts:152)
+- ReefVM checks if the variable is a function at runtime (vm.ts:357-373)
+- If it's a function, TRY_CALL intentionally falls through to CALL opcode (no break statement)
+- If it's not a function or undefined, it pushes the value/string and returns
+- This runtime resolution enables shell-like "echo hello" without quotes
+
+**Unbound symbols become strings**: When `TRY_LOAD` encounters an undefined variable, it pushes the variable name as a string (vm.ts:135-145). This is a first-class language feature implemented as a VM opcode, not a parser trick.
 
 **Expression-oriented design**: Everything returns a value - commands, assignments, functions. This enables composition and functional patterns.
 
 **EOF handling**: The grammar uses `(statement | newlineOrSemicolon)+ eof?` to handle empty lines and end-of-file without infinite loops.
 
+## Compiler Architecture
+
+**Function compilation strategy**: The compiler doesn't create inline function objects. Instead it:
+
+1. Generates unique labels (`.func_0`, `.func_1`) for each function body (compiler.ts:137)
+2. Stores function body instructions in `fnLabels` map during compilation
+3. Appends all function bodies to the end of bytecode with RETURN instructions (compiler.ts:36-41)
+4. Emits `MAKE_FUNCTION` with parameters and label reference
+
+This approach keeps the main program linear and allows ReefVM to jump to function bodies by label.
+
+**Short-circuit logic**: ReefVM has no AND/OR opcodes. The compiler implements short-circuit evaluation using:
+
+```typescript
+// For `a and b`:
+LOAD a
+DUP                    // Duplicate so we can return it if falsy
+JUMP_IF_FALSE skip     // If false, skip evaluating b
+POP                    // Remove duplicate if we're continuing
+LOAD b                 // Evaluate right side
+skip:
+```
+
+See compiler.ts:267-282 for the full implementation. The `or` operator uses `JUMP_IF_TRUE` instead.
+
+**If/else compilation**: The compiler combines relative jumps with a shared end label:
+
+- `JUMP_IF_FALSE` skips the then-block by a relative instruction count when the condition is false
+- Each branch ends with `JUMP endLabel` to skip remaining branches
+- The final label marks where all branches converge
+- If there's no else branch, compiler emits `PUSH null` as the default value
+
 ## Grammar Development
 
 ### Grammar Structure
@@ -206,6 +286,21 @@ The `toMatchTree` helper compares parser output with expected CST structure.
 
 **Empty line parsing**: The grammar structure `(statement | newlineOrSemicolon)+ eof?` allows proper empty line and EOF handling.
 
+### Why expressionWithoutIdentifier Exists
+
+The grammar has an unusual pattern: `expressionWithoutIdentifier`. This exists to solve a GLR conflict:
+
+```
+consumeToTerminator {
+  ambiguousFunctionCall |   // → FunctionCallOrIdentifier → Identifier
+  expression                 // → Identifier
+}
+```
+
+Without `expressionWithoutIdentifier`, parsing `my-var` at statement level creates two paths that both want the Identifier token. The grammar comment (shrimp.grammar lines 157-164) explains we "gave up trying to use GLR to fix it."
+
+**The solution**: Remove Identifier from the `expression` path by creating `expressionWithoutIdentifier`, forcing standalone identifiers through `ambiguousFunctionCall`. This is pragmatic over theoretical purity.
+
 ## Testing Strategy
 
 ### Parser Tests (`src/parser/parser.test.ts`)
@@ -259,3 +354,15 @@ When grammar isn't parsing correctly:
 2. **Test simpler cases first** - build up from basic to complex
 3. **Use `toMatchTree` output** - see what the parser actually produces
 4. **Check external tokenizer** - identifier vs word logic in `tokenizers.ts`
+
+## Common Misconceptions
+
+**"The parser handles unbound symbols as strings"** → False. The _VM_ does this via `TRY_LOAD` opcode. The parser creates `FunctionCallOrIdentifier` nodes; the compiler emits `TRY_LOAD`/`TRY_CALL`; the VM resolves at runtime.
+
+**"Words are just paths"** → False. Words are _anything_ that isn't an identifier. Paths, URLs, `@mentions`, `#hashtags` all parse as Words. The tokenizer accepts any non-whitespace that doesn't match identifier rules.
+
+**"Functions are first-class values"** → True, but they're compiled to labels, not inline bytecode. The VM creates closures with label references, not embedded instructions.
+
+**"The grammar is simple"** → False. It has pragmatic workarounds for GLR conflicts (`expressionWithoutIdentifier`), complex EOF handling, and relies heavily on the external tokenizer for correctness.
+
+**"Short-circuit logic is a VM feature"** → False. It's a compiler pattern using `DUP`, `JUMP_IF_FALSE/TRUE`, and `POP`. The VM has no AND/OR opcodes.
diff --git a/src/compiler/compiler.ts b/src/compiler/compiler.ts
index eef5396..94fcda6 100644
--- a/src/compiler/compiler.ts
+++ b/src/compiler/compiler.ts
@@ -3,7 +3,7 @@ import { parser } from '#parser/shrimp.ts'
 import * as terms from '#parser/shrimp.terms'
 import type { SyntaxNode, Tree } from '@lezer/common'
 import { assert, errorMessage } from '#utils/utils'
-import { toBytecode, type Bytecode } from 'reefvm'
+import { toBytecode, type Bytecode, type ProgramItem } from 'reefvm'
 import {
   checkTreeForErrors,
   getAllChildren,
@@ -15,9 +15,10 @@ import {
   getNamedArgParts,
 } from '#compiler/utils'
 
+type Label = `.${string}`
 export class Compiler {
-  instructions: string[] = []
-  fnLabels = new Map<string, string[]>()
+  instructions: ProgramItem[] = []
+  fnLabels = new Map<Label, ProgramItem[]>()
   ifLabelCount = 0
   bytecode: Bytecode
 
@@ -34,13 +35,14 @@ export class Compiler {
 
       // Add the labels
       for (const [label, labelInstructions] of this.fnLabels) {
-        this.instructions.push(`${label}:`)
-        this.instructions.push(...labelInstructions.map((instr) => `  ${instr}`))
-        this.instructions.push('  RETURN')
+        this.instructions.push([`${label}:`])
+        this.instructions.push(...labelInstructions)
+        this.instructions.push(['RETURN'])
       }
 
-      // console.log(`\n🤖 instructions:\n----------------\n${this.instructions.join('\n')}\n\n`)
-      this.bytecode = toBytecode(this.instructions.join('\n'))
+      // logInstructions(this.instructions)
+
+      this.bytecode = toBytecode(this.instructions)
     } catch (error) {
       if (error instanceof CompilerError) {
         throw new Error(error.toReadableString(input))
@@ -60,46 +62,50 @@ export class Compiler {
       child = child.nextSibling
     }
 
-    this.instructions.push('HALT')
+    this.instructions.push(['HALT'])
   }
 
-  #compileNode(node: SyntaxNode, input: string): string[] {
+  #compileNode(node: SyntaxNode, input: string): ProgramItem[] {
     const value = input.slice(node.from, node.to)
     switch (node.type.id) {
       case terms.Number:
-        return [`PUSH ${value}`]
+        const number = Number(value)
+        if (Number.isNaN(number))
+          throw new CompilerError(`Invalid number literal: ${value}`, node.from, node.to)
+
+        return [[`PUSH`, number]]
 
       case terms.String:
         const strValue = value.slice(1, -1).replace(/\\/g, '')
-        return [`PUSH "${strValue}"`]
+        return [[`PUSH`, strValue]]
 
       case terms.Boolean: {
-        return [`PUSH ${value}`]
+        return [[`PUSH`, value === 'true']]
       }
 
       case terms.Identifier: {
-        return [`TRY_LOAD ${value}`]
+        return [[`TRY_LOAD`, value]]
       }
 
       case terms.BinOp: {
         const { left, op, right } = getBinaryParts(node)
-        const instructions: string[] = []
+        const instructions: ProgramItem[] = []
         instructions.push(...this.#compileNode(left, input))
         instructions.push(...this.#compileNode(right, input))
 
         const opValue = input.slice(op.from, op.to)
         switch (opValue) {
           case '+':
-            instructions.push('ADD')
+            instructions.push(['ADD'])
             break
           case '-':
-            instructions.push('SUB')
+            instructions.push(['SUB'])
             break
           case '*':
-            instructions.push('MUL')
+            instructions.push(['MUL'])
             break
           case '/':
-            instructions.push('DIV')
+            instructions.push(['DIV'])
             break
           default:
             throw new CompilerError(`Unsupported binary operator: ${opValue}`, op.from, op.to)
@@ -110,10 +116,10 @@ export class Compiler {
 
       case terms.Assign: {
         const { identifier, right } = getAssignmentParts(node)
-        const instructions: string[] = []
+        const instructions: ProgramItem[] = []
         instructions.push(...this.#compileNode(right, input))
         const identifierName = input.slice(identifier.from, identifier.to)
-        instructions.push(`STORE ${identifierName}`)
+        instructions.push(['STORE', identifierName])
 
         return instructions
       }
@@ -127,23 +133,23 @@ export class Compiler {
 
       case terms.FunctionDef: {
         const { paramNames, bodyNode } = getFunctionDefParts(node, input)
-        const instructions: string[] = []
-        const functionName = `.func_${this.fnLabels.size}`
-        const bodyInstructions: string[] = []
-        if (this.fnLabels.has(functionName)) {
-          throw new CompilerError(`Function name collision: ${functionName}`, node.from, node.to)
+        const instructions: ProgramItem[] = []
+        const functionLabel: Label = `.func_${this.fnLabels.size}`
+        const bodyInstructions: ProgramItem[] = []
+        if (this.fnLabels.has(functionLabel)) {
+          throw new CompilerError(`Function name collision: ${functionLabel}`, node.from, node.to)
         }
 
-        this.fnLabels.set(functionName, bodyInstructions)
+        this.fnLabels.set(functionLabel, bodyInstructions)
 
-        instructions.push(`MAKE_FUNCTION (${paramNames}) ${functionName}`)
+        instructions.push(['MAKE_FUNCTION', paramNames, functionLabel])
         bodyInstructions.push(...this.#compileNode(bodyNode, input))
 
         return instructions
       }
 
       case terms.FunctionCallOrIdentifier: {
-        return [`TRY_CALL ${value}`]
+        return [['TRY_CALL', value]]
       }
 
       /*
@@ -161,7 +167,7 @@ export class Compiler {
       */
       case terms.FunctionCall: {
         const { identifierNode, namedArgs, positionalArgs } = getFunctionCallParts(node, input)
-        const instructions: string[] = []
+        const instructions: ProgramItem[] = []
         instructions.push(...this.#compileNode(identifierNode, input))
 
         positionalArgs.forEach((arg) => {
@@ -170,13 +176,13 @@ export class Compiler {
 
         namedArgs.forEach((arg) => {
           const { name, valueNode } = getNamedArgParts(arg, input)
-          instructions.push(`PUSH "${name}"`)
+          instructions.push(['PUSH', name])
           instructions.push(...this.#compileNode(valueNode, input))
         })
 
-        instructions.push(`PUSH ${positionalArgs.length}`)
-        instructions.push(`PUSH ${namedArgs.length}`)
-        instructions.push(`CALL`)
+        instructions.push(['PUSH', positionalArgs.length])
+        instructions.push(['PUSH', namedArgs.length])
+        instructions.push(['CALL'])
         return instructions
       }
 
@@ -193,86 +199,84 @@ export class Compiler {
           node,
           input
         )
-        const instructions: string[] = []
+        const instructions: ProgramItem[] = []
         instructions.push(...this.#compileNode(conditionNode, input))
         this.ifLabelCount++
-        const elseLabel = `.else_${this.ifLabelCount}`
-        const endLabel = `.end_${this.ifLabelCount}`
+        const endLabel: Label = `.end_${this.ifLabelCount}`
 
         const thenBlockInstructions = this.#compileNode(thenBlock, input)
-        instructions.push(`JUMP_IF_FALSE #${thenBlockInstructions.length + 1}`)
+        instructions.push(['JUMP_IF_FALSE', thenBlockInstructions.length + 1])
         instructions.push(...thenBlockInstructions)
-        instructions.push(`JUMP ${endLabel}`)
+        instructions.push(['JUMP', endLabel])
 
         // Else if
-        elseIfBlocks.forEach(({ conditional, thenBlock }, index) => {
+        elseIfBlocks.forEach(({ conditional, thenBlock }) => {
           instructions.push(...this.#compileNode(conditional, input))
           const elseIfInstructions = this.#compileNode(thenBlock, input)
-          instructions.push(`JUMP_IF_FALSE #${elseIfInstructions.length + 1}`)
+          instructions.push(['JUMP_IF_FALSE', elseIfInstructions.length + 1])
           instructions.push(...elseIfInstructions)
-          instructions.push(`JUMP ${endLabel}`)
+          instructions.push(['JUMP', endLabel])
         })
 
         // Else
-        instructions.push(`${elseLabel}:`)
         if (elseThenBlock) {
-          const elseThenInstructions = this.#compileNode(elseThenBlock, input).map((i) => `  ${i}`)
+          const elseThenInstructions = this.#compileNode(elseThenBlock, input)
           instructions.push(...elseThenInstructions)
         } else {
-          instructions.push(`  PUSH null`)
+          instructions.push(['PUSH', null])
         }
 
-        instructions.push(`${endLabel}:`)
+        instructions.push([`${endLabel}:`])
 
         return instructions
       }
 
       // - `EQ`, `NEQ`, `LT`, `GT`, `LTE`, `GTE` - Pop 2, push boolean
       case terms.ConditionalOp: {
-        const instructions: string[] = []
+        const instructions: ProgramItem[] = []
         const { left, op, right } = getBinaryParts(node)
-        const leftInstructions: string[] = this.#compileNode(left, input)
-        const rightInstructions: string[] = this.#compileNode(right, input)
+        const leftInstructions: ProgramItem[] = this.#compileNode(left, input)
+        const rightInstructions: ProgramItem[] = this.#compileNode(right, input)
 
         const opValue = input.slice(op.from, op.to)
         switch (opValue) {
           case '=':
-            instructions.push(...leftInstructions, ...rightInstructions, 'EQ')
+            instructions.push(...leftInstructions, ...rightInstructions, ['EQ'])
             break
 
           case '!=':
-            instructions.push(...leftInstructions, ...rightInstructions, 'NEQ')
+            instructions.push(...leftInstructions, ...rightInstructions, ['NEQ'])
             break
 
           case '<':
-            instructions.push(...leftInstructions, ...rightInstructions, 'LT')
+            instructions.push(...leftInstructions, ...rightInstructions, ['LT'])
             break
 
           case '>':
-            instructions.push(...leftInstructions, ...rightInstructions, 'GT')
+            instructions.push(...leftInstructions, ...rightInstructions, ['GT'])
             break
 
           case '<=':
-            instructions.push(...leftInstructions, ...rightInstructions, 'LTE')
+            instructions.push(...leftInstructions, ...rightInstructions, ['LTE'])
             break
 
           case '>=':
-            instructions.push(...leftInstructions, ...rightInstructions, 'GTE')
+            instructions.push(...leftInstructions, ...rightInstructions, ['GTE'])
             break
 
           case 'and':
             instructions.push(...leftInstructions)
-            instructions.push('DUP')
-            instructions.push(`JUMP_IF_FALSE #${rightInstructions.length + 1}`)
-            instructions.push('POP')
+            instructions.push(['DUP'])
+            instructions.push(['JUMP_IF_FALSE', rightInstructions.length + 1])
+            instructions.push(['POP'])
             instructions.push(...rightInstructions)
             break
 
           case 'or':
             instructions.push(...leftInstructions)
-            instructions.push('PUSH 9')
-            instructions.push(`JUMP_IF_TRUE #${rightInstructions.length + 1}`)
-            instructions.push('POP')
+            instructions.push(['DUP'])
+            instructions.push(['JUMP_IF_TRUE', rightInstructions.length + 1])
+            instructions.push(['POP'])
             instructions.push(...rightInstructions)
 
             break
@@ -289,3 +293,19 @@ export class Compiler {
     }
   }
 }
+
+// Debug helper: prints one instruction per line, quoting string operands of PUSH.
+const logInstructions = (instructions: ProgramItem[]) => {
+  const instructionsString = instructions
+    .map((parts) => {
+      const isPush = parts[0] === 'PUSH'
+      return parts
+        .map((part, i) =>
+          // String() rather than part!.toString(): operands can be null (e.g. `PUSH null`).
+          i > 0 && isPush && typeof part === 'string' ? `'${part}'` : String(part)
+        )
+        .join(' ')
+    })
+    .join('\n')
+  console.log(`\n🤖 instructions:\n----------------\n${instructionsString}\n\n`)
+}
diff --git a/src/parser/parser.test.ts b/src/parser/parser.test.ts
index 2adb6c9..bd7074b 100644
--- a/src/parser/parser.test.ts
+++ b/src/parser/parser.test.ts
@@ -245,16 +245,17 @@ describe('BinOp', () => {
 
 describe('Fn', () => {
   test('parses function no parameters', () => {
-    expect('fn: 1').toMatchTree(`
+    expect('fn: 1 end').toMatchTree(`
       FunctionDef
         keyword fn
         Params 
         colon :
-        Number 1`)
+        Number 1
+        end end`)
   })
 
   test('parses function with single parameter', () => {
-    expect('fn x: x + 1').toMatchTree(`
+    expect('fn x: x + 1 end').toMatchTree(`
       FunctionDef
         keyword fn
         Params
@@ -263,11 +264,12 @@ describe('Fn', () => {
         BinOp
           Identifier x
           operator +
-          Number 1`)
+          Number 1
+        end end`)
   })
 
   test('parses function with multiple parameters', () => {
-    expect('fn x y: x * y').toMatchTree(`
+    expect('fn x y: x * y end').toMatchTree(`
       FunctionDef
         keyword fn
         Params
@@ -277,7 +279,8 @@ describe('Fn', () => {
         BinOp
           Identifier x
           operator *
-          Identifier y`)
+          Identifier y
+        end end`)
   })
 
   test('parses multiline function with multiple statements', () => {
@@ -381,7 +384,7 @@ describe('Assign', () => {
   })
 
   test('parses assignment with functions', () => {
-    expect('add = fn a b: a + b').toMatchTree(`
+    expect('add = fn a b: a + b end').toMatchTree(`
       Assign
         Identifier add
         operator =
@@ -394,7 +397,8 @@ describe('Assign', () => {
           BinOp
             Identifier a
             operator +
-            Identifier b`)
+            Identifier b
+          end end`)
   })
 })