diff --git a/CLAUDE.md b/CLAUDE.md
index e3404cd..581c100 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -195,6 +195,18 @@ function parseExpression(input: string) {
 **Expression-oriented design**: Everything returns a value - commands, assignments, functions. This enables composition and functional patterns.
 
+**Scope-aware property access (DotGet)**: The parser uses Lezer's `@context` feature to track variable scope at parse time. When it encounters `obj.prop`, it checks whether `obj` is in scope:
+- **In scope** → Parses as `DotGet(IdentifierBeforeDot, Identifier)` → compiles to `TRY_LOAD obj; PUSH 'prop'; DOT_GET`
+- **Not in scope** → Parses as `Word("obj.prop")` → compiles to `PUSH 'obj.prop'` (treated as a file path/string)
+
+Implementation files:
+- **src/parser/scopeTracker.ts**: ContextTracker that maintains an immutable scope chain
+- **src/parser/tokenizer.ts**: External tokenizer checks `stack.context` to decide whether a dot creates a DotGet or a Word
+- Scope tracking: Captures variables from assignments (`x = 5`) and function parameters (`fn x:`)
+- See `src/parser/tests/dot-get.test.ts` for comprehensive examples
+
+**Why this matters**: This enables shell-like file paths (`readme.txt`) while supporting dictionary/array access (`config.path`) without quotes, determined entirely at parse time based on lexical scope.
+
 **EOF handling**: The grammar uses `(statement | newlineOrSemicolon)+ eof?` to handle empty lines and end-of-file without infinite loops.
 
 ## Compiler Architecture
diff --git a/packages/ReefVM b/packages/ReefVM
index 0844e99..1a18a71 160000
--- a/packages/ReefVM
+++ b/packages/ReefVM
@@ -1 +1 @@
-Subproject commit 0844e99d2d04fb9ba0999f25248a17430bdc5ee6
+Subproject commit 1a18a713d7ae86b03a6bef38cc53d12ecfbf9627
diff --git a/src/compiler/compiler.ts b/src/compiler/compiler.ts
index 8cc0836..23fca89 100644
--- a/src/compiler/compiler.ts
+++ b/src/compiler/compiler.ts
@@ -9,6 +9,7 @@ import {
   getAllChildren,
   getAssignmentParts,
   getBinaryParts,
+  getDotGetParts,
   getFunctionCallParts,
   getFunctionDefParts,
   getIfExprParts,
@@ -17,8 +18,8 @@ import {
   getStringParts,
 } from '#compiler/utils'
 
-// const DEBUG = false
-const DEBUG = true
+const DEBUG = false
+// const DEBUG = true
 
 type Label = `.${string}`
 
@@ -189,6 +190,19 @@ export class Compiler {
       return [[`TRY_LOAD`, value]]
     }
 
+    case terms.Word: {
+      return [['PUSH', value]]
+    }
+
+    case terms.DotGet: {
+      const { objectName, propertyName } = getDotGetParts(node, input)
+      const instructions: ProgramItem[] = []
+      instructions.push(['TRY_LOAD', objectName])
+      instructions.push(['PUSH', propertyName])
+      instructions.push(['DOT_GET'])
+      return instructions
+    }
+
     case terms.BinOp: {
       const { left, op, right } = getBinaryParts(node)
       const instructions: ProgramItem[] = []
diff --git a/src/compiler/tests/compiler.test.ts b/src/compiler/tests/compiler.test.ts
index 07c03b5..3cff986 100644
--- a/src/compiler/tests/compiler.test.ts
+++ b/src/compiler/tests/compiler.test.ts
@@ -213,7 +213,7 @@ describe('Regex', () => {
   })
 })
 
-describe.only('native functions', () => {
+describe.skip('native functions', () => {
   test('print function', () => {
     const add = (x: number, y: number) => x + y
     expect(`add 5 9`).toEvaluateTo(14, { add })
diff --git a/src/compiler/utils.ts b/src/compiler/utils.ts
index a67833b..937efe5 100644
--- a/src/compiler/utils.ts
+++ b/src/compiler/utils.ts
@@ -40,9 +40,9 @@ export const getAssignmentParts = (node: SyntaxNode) => {
   const children = getAllChildren(node)
   const [left, equals, right] = children
 
-  if (!left || left.type.id !== terms.Identifier) {
+  if (!left || left.type.id !== terms.AssignableIdentifier) {
     throw new CompilerError(
-      `Assign left child must be an Identifier, got ${left ? left.type.name : 'none'}`,
+      `Assign left child must be an AssignableIdentifier, got ${left ? left.type.name : 'none'}`,
       node.from,
       node.to
     )
@@ -70,9 +70,9 @@ export const getFunctionDefParts = (node: SyntaxNode, input: string) => {
   }
 
   const paramNames = getAllChildren(paramsNode).map((param) => {
-    if (param.type.id !== terms.Identifier) {
+    if (param.type.id !== terms.AssignableIdentifier) {
       throw new CompilerError(
-        `FunctionDef params must be Identifiers, got ${param.type.name}`,
+        `FunctionDef params must be AssignableIdentifiers, got ${param.type.name}`,
        param.from,
        param.to
      )
@@ -198,3 +198,37 @@ export const getStringParts = (node: SyntaxNode, input: string) => {
 
   return { parts, hasInterpolation: parts.length > 0 }
 }
+
+export const getDotGetParts = (node: SyntaxNode, input: string) => {
+  const children = getAllChildren(node)
+  const [object, property] = children
+
+  if (children.length !== 2) {
+    throw new CompilerError(
+      `DotGet expected 2 identifier children, got ${children.length}`,
+      node.from,
+      node.to
+    )
+  }
+
+  if (object.type.id !== terms.IdentifierBeforeDot) {
+    throw new CompilerError(
+      `DotGet object must be an IdentifierBeforeDot, got ${object.type.name}`,
+      object.from,
+      object.to
+    )
+  }
+
+  if (property.type.id !== terms.Identifier) {
+    throw new CompilerError(
+      `DotGet property must be an Identifier, got ${property.type.name}`,
+      property.from,
+      property.to
+    )
+  }
+
+  const objectName = input.slice(object.from, object.to)
+  const propertyName = input.slice(property.from, property.to)
+
+  return { objectName, propertyName }
+}
diff --git a/src/parser/scopeTracker.ts b/src/parser/scopeTracker.ts
new file mode 100644
index 0000000..8854cad
--- /dev/null
+++ b/src/parser/scopeTracker.ts
@@ -0,0 +1,96 @@
+import { ContextTracker, InputStream } from '@lezer/lr'
+import * as terms from './shrimp.terms'
+
+export class Scope {
+  constructor(public parent: Scope | null, public vars: Set<string> = new Set()) {}
+
+  has(name: string): boolean {
+    return this.vars.has(name) || (this.parent?.has(name) ?? false)
+  }
+
+  hash(): number {
+    let h = 0
+    for (const name of this.vars) {
+      for (let i = 0; i < name.length; i++) {
+        h = (h << 5) - h + name.charCodeAt(i)
+        h |= 0
+      }
+    }
+    if (this.parent) {
+      h = (h << 5) - h + this.parent.hash()
+      h |= 0
+    }
+    return h
+  }
+
+  // Immutable operations: each returns a Scope instead of mutating in place
+
+  static add(scope: Scope, ...names: string[]): Scope {
+    const newVars = new Set(scope.vars)
+    names.forEach((name) => newVars.add(name))
+    return new Scope(scope.parent, newVars)
+  }
+
+  push(): Scope {
+    return new Scope(this, new Set())
+  }
+
+  pop(): Scope {
+    return this.parent ?? this
+  }
+}
this + } +} + +// Tracker context that combines Scope with temporary pending identifiers +class TrackerContext { + constructor(public scope: Scope, public pendingIds: string[] = []) {} +} + +// Extract identifier text from input stream +const readIdentifierText = (input: InputStream, start: number, end: number): string => { + let text = '' + for (let i = start; i < end; i++) { + const offset = i - input.pos + const ch = input.peek(offset) + if (ch === -1) break + text += String.fromCharCode(ch) + } + return text +} + +export const trackScope = new ContextTracker({ + start: new TrackerContext(new Scope(null, new Set())), + + shift(context, term, stack, input) { + if (term !== terms.AssignableIdentifier) return context + + const text = readIdentifierText(input, input.pos, stack.pos) + return new TrackerContext(context.scope, [...context.pendingIds, text]) + }, + + reduce(context, term) { + // Add assignment variable to scope + if (term === terms.Assign) { + const varName = context.pendingIds.at(-1) + if (!varName) return context + return new TrackerContext(Scope.add(context.scope, varName), context.pendingIds.slice(0, -1)) + } + + // Push new scope and add all parameters + if (term === terms.Params) { + let newScope = context.scope.push() + if (context.pendingIds.length > 0) { + newScope = Scope.add(newScope, ...context.pendingIds) + } + return new TrackerContext(newScope, []) + } + + // Pop scope when exiting function + if (term === terms.FunctionDef) { + return new TrackerContext(context.scope.pop(), []) + } + + return context + }, + + hash: (context) => context.scope.hash(), +}) diff --git a/src/parser/shrimp.grammar b/src/parser/shrimp.grammar index 6cd94ce..1c6521a 100644 --- a/src/parser/shrimp.grammar +++ b/src/parser/shrimp.grammar @@ -1,5 +1,7 @@ @external propSource highlighting from "./highlight" +@context trackScope from "./scopeTracker" + @skip { space } @top Program { item* } @@ -21,7 +23,7 @@ Underscore { "_" } Null { "null" } Regex { "//" (![/\\\n[] | "\\" ![\n] | "[" (![\n\\\]] | "\\" ![\n])* "]")+ ("//" $[gimsuy]*)? } // Stolen from the lezer JavaScript grammar - "fn" [@name=keyword] + Fn[@name=keyword] { "fn" } "if" [@name=keyword] "elsif" [@name=keyword] "else" [@name=keyword] @@ -41,7 +43,7 @@ } -@external tokens tokenizer from "./tokenizer" { Identifier, Word } +@external tokens tokenizer from "./tokenizer" { Identifier, AssignableIdentifier, Word, IdentifierBeforeDot } @precedence { pipe @left, @@ -60,6 +62,7 @@ item { consumeToTerminator { PipeExpr | ambiguousFunctionCall | + DotGet | IfExpr | FunctionDef | Assign | @@ -105,11 +108,11 @@ FunctionDef { } singleLineFunctionDef { - "fn" Params colon consumeToTerminator end + Fn Params colon consumeToTerminator end } multilineFunctionDef { - "fn" Params colon newlineOrSemicolon block end + Fn Params colon newlineOrSemicolon block end } IfExpr { @@ -148,11 +151,11 @@ ConditionalOp { } Params { - Identifier* + AssignableIdentifier* } Assign { - Identifier "=" consumeToTerminator + AssignableIdentifier "=" consumeToTerminator } BinOp { @@ -167,12 +170,16 @@ ParenExpr { } expression { - expressionWithoutIdentifier | Identifier + expressionWithoutIdentifier | DotGet | Identifier } @skip {} { + DotGet { + IdentifierBeforeDot "." Identifier + } + String { "'" stringContent* "'" } - + } stringContent { diff --git a/src/parser/shrimp.terms.ts b/src/parser/shrimp.terms.ts index 6ecdf01..a6c6615 100644 --- a/src/parser/shrimp.terms.ts +++ b/src/parser/shrimp.terms.ts @@ -1,32 +1,36 @@ // This file was generated by lezer-generator. 
You probably shouldn't edit it. export const Identifier = 1, - Word = 2, - Program = 3, - PipeExpr = 4, - FunctionCall = 5, - PositionalArg = 6, - ParenExpr = 7, - FunctionCallOrIdentifier = 8, - BinOp = 9, - ConditionalOp = 14, - String = 23, - StringFragment = 24, - Interpolation = 25, - EscapeSeq = 26, - Number = 27, - Boolean = 28, - Regex = 29, - Null = 30, - FunctionDef = 31, - Params = 33, - colon = 34, - end = 35, - Underscore = 36, - NamedArg = 37, - NamedArgPrefix = 38, - IfExpr = 40, - ThenBlock = 43, - ElsifExpr = 44, - ElseExpr = 46, - Assign = 48 + AssignableIdentifier = 2, + Word = 3, + IdentifierBeforeDot = 4, + Program = 5, + PipeExpr = 6, + FunctionCall = 7, + PositionalArg = 8, + ParenExpr = 9, + FunctionCallOrIdentifier = 10, + BinOp = 11, + ConditionalOp = 16, + String = 25, + StringFragment = 26, + Interpolation = 27, + EscapeSeq = 28, + Number = 29, + Boolean = 30, + Regex = 31, + Null = 32, + DotGet = 33, + FunctionDef = 34, + Fn = 35, + Params = 36, + colon = 37, + end = 38, + Underscore = 39, + NamedArg = 40, + NamedArgPrefix = 41, + IfExpr = 43, + ThenBlock = 46, + ElsifExpr = 47, + ElseExpr = 49, + Assign = 51 diff --git a/src/parser/shrimp.ts b/src/parser/shrimp.ts index fa92a29..616e218 100644 --- a/src/parser/shrimp.ts +++ b/src/parser/shrimp.ts @@ -1,23 +1,25 @@ // This file was generated by lezer-generator. You probably shouldn't edit it. import {LRParser} from "@lezer/lr" import {tokenizer} from "./tokenizer" +import {trackScope} from "./scopeTracker" import {highlighting} from "./highlight" export const parser = LRParser.deserialize({ version: 14, - states: ".WQVQaOOO#OQbO'#CdO#`QPO'#CeO#nQPO'#DjO$nQaO'#CcO$uOSO'#CsOOQ`'#Dn'#DnO%TQPO'#DmO%lQaO'#DxOOQ`'#C{'#C{OOQO'#Dk'#DkO%tQPO'#DjO&SQaO'#D|OOQO'#DU'#DUOOQO'#Dj'#DjO&ZQPO'#DiOOQ`'#Di'#DiOOQ`'#D_'#D_QVQaOOOOQ`'#Dm'#DmOOQ`'#Cb'#CbO&cQaO'#DROOQ`'#Dl'#DlOOQ`'#D`'#D`O&pQbO,58{O'aQaO,59xO&SQaO,59PO&SQaO,59PO'nQbO'#CdO(yQPO'#CeO)ZQPO,58}O)lQPO,58}O)gQPO,58}O*gQPO,58}O*oQaO'#CuO*wQWO'#CvOOOO'#Dr'#DrOOOO'#Da'#DaO+]OSO,59_OOQ`,59_,59_OOQ`'#Db'#DbO+kQaO'#C}O+sQPO,5:dO+xQaO'#DdO+}QPO,58zO,`QPO,5:hO,gQPO,5:hOOQ`,5:T,5:TOOQ`-E7]-E7]OOQ`,59m,59mOOQ`-E7^-E7^OOQO1G/d1G/dOOQO1G.k1G.kO,lQPO1G.kO&SQaO,59UO&SQaO,59UOOQ`1G.i1G.iOOOO,59a,59aOOOO,59b,59bOOOO-E7_-E7_OOQ`1G.y1G.yOOQ`-E7`-E7`O-WQaO1G0OO-hQbO'#CdOOQO,5:O,5:OOOQO-E7b-E7bO.XQaO1G0SOOQO1G.p1G.pO.iQPO1G.pO.sQPO7+%jO.xQaO7+%kOOQO'#DW'#DWOOQO7+%n7+%nO/YQaO7+%oOOQ`<qAN>qO&SQaO'#DYOOQO'#De'#DeO0mQPOAN>uO0xQPO'#D[OOQOAN>uAN>uO0}QPOAN>uO1SQPO,59tO1ZQPO,59tOOQO-E7c-E7cOOQOG24aG24aO1`QPOG24aO1eQPO,59vO1jQPO1G/`OOQOLD){LD){O.xQaO1G/bO/YQaO7+$zOOQO7+$|7+$|OOQO<uAN>uO&yQaO'#D]OOQO'#Dh'#DhO0nQPOAN>yO0yQPO'#D_OOQOAN>yAN>yO1OQPOAN>yO1TQPO,59wO1[QPO,59wOOQO-E7f-E7fOOQOG24eG24eO1aQPOG24eO1fQPO,59yO1kQPO1G/cOOQOLD*PLD*PO.yQaO1G/eO/ZQaO7+$}OOQO7+%P7+%POOQO<T#a#b:m#b#cBh#c#o:m#o;'S$_;'S;=`$v<%lO$_V>Y[hSOt$_uw$_x!_$_!_!`:S!`#O$_#P#T$_#T#g:m#g#h?O#h#o:m#o;'S$_;'S;=`$v<%lO$_V?T^hSOt$_uw$_x!_$_!_!`:S!`#O$_#P#T$_#T#X:m#X#Y@P#Y#]:m#]#^@v#^#o:m#o;'S$_;'S;=`$v<%lO$_V@WY!PPhSOt$_uw$_x!_$_!_!`:S!`#O$_#P#T$_#T#o:m#o;'S$_;'S;=`$v<%lO$_V@{[hSOt$_uw$_x!_$_!_!`:S!`#O$_#P#T$_#T#Y:m#Y#ZAq#Z#o:m#o;'S$_;'S;=`$v<%lO$_VAxY}PhSOt$_uw$_x!_$_!_!`:S!`#O$_#P#T$_#T#o:m#o;'S$_;'S;=`$v<%lO$_VBm[hSOt$_uw$_x!_$_!_!`:S!`#O$_#P#T$_#T#W:m#W#XCc#X#o:m#o;'S$_;'S;=`$v<%lO$_VCjYhSsROt$_uw$_x!_$_!_!`:S!`#O$_#P#T$_#T#o:m#o;'S$_;'S;=`$v<%lO$_VD_]hSOt$_uw$_x!_$_!_!`:S!`#O$_#P#T$_#T#UEW#U#b:m#b#cHn#c#o:m#o;'S$_;'S;=`$v<%lO$_VE][hSOt$_uw$_x!_$_!_!`:S!`#O$_#P#T$_#T#`:m#`#aFR#a#o:m#o;'S$_;'S;=`$v<%lO$_VFW[hSOt$_uw$_x!_$_!_!`:S!`#O$_#P#T
$_#T#g:m#g#hF|#h#o:m#o;'S$_;'S;=`$v<%lO$_VGR[hSOt$_uw$_x!_$_!_!`:S!`#O$_#P#T$_#T#X:m#X#YGw#Y#o:m#o;'S$_;'S;=`$v<%lO$_VHOYlRhSOt$_uw$_x!_$_!_!`:S!`#O$_#P#T$_#T#o:m#o;'S$_;'S;=`$v<%lO$_VHuYpRhSOt$_uw$_x!_$_!_!`:S!`#O$_#P#T$_#T#o:m#o;'S$_;'S;=`$v<%lO$_VIj[hSOt$_uw$_x!_$_!_!`:S!`#O$_#P#T$_#T#Y:m#Y#ZJ`#Z#o:m#o;'S$_;'S;=`$v<%lO$_VJgYyPhSOt$_uw$_x!_$_!_!`:S!`#O$_#P#T$_#T#o:m#o;'S$_;'S;=`$v<%lO$__K^[!iWhSOt$_uw$_x!_$_!_!`:S!`#O$_#P#T$_#T#i:m#i#jLS#j#o:m#o;'S$_;'S;=`$v<%lO$_VLX[hSOt$_uw$_x!_$_!_!`:S!`#O$_#P#T$_#T#`:m#`#aL}#a#o:m#o;'S$_;'S;=`$v<%lO$_VMS[hSOt$_uw$_x!_$_!_!`:S!`#O$_#P#T$_#T#`:m#`#aMx#a#o:m#o;'S$_;'S;=`$v<%lO$_VNPYnRhSOt$_uw$_x!_$_!_!`:S!`#O$_#P#T$_#T#o:m#o;'S$_;'S;=`$v<%lO$_VNt[hSOt$_uw$_x!_$_!_!`:S!`#O$_#P#T$_#T#f:m#f#g! j#g#o:m#o;'S$_;'S;=`$v<%lO$_V! qYfRhSOt$_uw$_x!_$_!_!`:S!`#O$_#P#T$_#T#o:m#o;'S$_;'S;=`$v<%lO$_^!!hY!kWhSOt$_uw$_x!_$_!_!`:S!`#O$_#P#T$_#T#o:m#o;'S$_;'S;=`$v<%lO$__!#_[!jWhSOt$_uw$_x!_$_!_!`:S!`#O$_#P#T$_#T#f:m#f#g!$T#g#o:m#o;'S$_;'S;=`$v<%lO$_V!$Y[hSOt$_uw$_x!_$_!_!`:S!`#O$_#P#T$_#T#i:m#i#jF|#j#o:m#o;'S$_;'S;=`$v<%lO$_V!%VUwRhSOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_~!%nO!r~", + tokenData: "!&X~R!SOX$_XY$|YZ%gZp$_pq$|qr&Qrt$_tu'Yuw$_wx'_xy'dyz'}z{(h{|)R|}$_}!O)l!O!P,b!P!Q,{!Q![*]![!]5j!]!^%g!^!_6T!_!`7_!`!a7x!a#O$_#O#P9S#P#R$_#R#S9X#S#T$_#T#U9r#U#X;W#X#Y=m#Y#ZDs#Z#];W#]#^JO#^#b;W#b#cKp#c#d! Y#d#f;W#f#g!!z#g#h;W#h#i!#q#i#o;W#o#p$_#p#q!%i#q;'S$_;'S;=`$v<%l~$_~O$_~~!&SS$dUjSOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_S$yP;=`<%l$__%TUjS!_ZOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_V%nUjS!rROt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_V&VWjSOt$_uw$_x!_$_!_!`&o!`#O$_#P;'S$_;'S;=`$v<%lO$_V&vUbRjSOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_~'_O!j~~'dO!h~V'kUjS!fROt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_V(UUjS!gROt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_V(oU[RjSOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_V)YU^RjSOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_V)sWjS_ROt$_uw$_x!Q$_!Q![*]![#O$_#P;'S$_;'S;=`$v<%lO$_V*dYjSmROt$_uw$_x!O$_!O!P+S!P!Q$_!Q![*]![#O$_#P;'S$_;'S;=`$v<%lO$_V+XWjSOt$_uw$_x!Q$_!Q![+q![#O$_#P;'S$_;'S;=`$v<%lO$_V+xWjSmROt$_uw$_x!Q$_!Q![+q![#O$_#P;'S$_;'S;=`$v<%lO$_T,iU!oPjSOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_V-SWjS]ROt$_uw$_x!P$_!P!Q-l!Q#O$_#P;'S$_;'S;=`$v<%lO$_V-q^jSOY.mYZ$_Zt.mtu/puw.mwx/px!P.m!P!Q$_!Q!}.m!}#O4c#O#P2O#P;'S.m;'S;=`5d<%lO.mV.t^jSoROY.mYZ$_Zt.mtu/puw.mwx/px!P.m!P!Q2e!Q!}.m!}#O4c#O#P2O#P;'S.m;'S;=`5d<%lO.mR/uXoROY/pZ!P/p!P!Q0b!Q!}/p!}#O1P#O#P2O#P;'S/p;'S;=`2_<%lO/pR0eP!P!Q0hR0mUoR#Z#[0h#]#^0h#a#b0h#g#h0h#i#j0h#m#n0hR1SVOY1PZ#O1P#O#P1i#P#Q/p#Q;'S1P;'S;=`1x<%lO1PR1lSOY1PZ;'S1P;'S;=`1x<%lO1PR1{P;=`<%l1PR2RSOY/pZ;'S/p;'S;=`2_<%lO/pR2bP;=`<%l/pV2jWjSOt$_uw$_x!P$_!P!Q3S!Q#O$_#P;'S$_;'S;=`$v<%lO$_V3ZbjSoROt$_uw$_x#O$_#P#Z$_#Z#[3S#[#]$_#]#^3S#^#a$_#a#b3S#b#g$_#g#h3S#h#i$_#i#j3S#j#m$_#m#n3S#n;'S$_;'S;=`$v<%lO$_V4h[jSOY4cYZ$_Zt4ctu1Puw4cwx1Px#O4c#O#P1i#P#Q.m#Q;'S4c;'S;=`5^<%lO4cV5aP;=`<%l4cV5gP;=`<%l.mT5qUjSuPOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_V6[WcRjSOt$_uw$_x!_$_!_!`6t!`#O$_#P;'S$_;'S;=`$v<%lO$_V6{UdRjSOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_V7fUaRjSOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_V8PWeRjSOt$_uw$_x!_$_!_!`8i!`#O$_#P;'S$_;'S;=`$v<%lO$_V8pUfRjSOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_~9XO!k~V9`UjSwROt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_V9w[jSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#b;W#b#c;{#c#o;W#o;'S$_;'S;=`$v<%lO$_U:tUyQjSOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_U;]YjSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$_Vn#a#b;W#b#cCR#c#o;W#o;'S$_;'S;=`$v<%lO$_V>s[jSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#g;W#g#h?i#h#o;W#o;'S$_;'S;=`$v<%lO$_V?n^jSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#X;W#X#Y@j#Y#];W#]#^Aa#^#o;W#o;'S$_;'S;=`$v<%lO$_V@qY!SPjSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_
#T#o;W#o;'S$_;'S;=`$v<%lO$_VAf[jSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#Y;W#Y#ZB[#Z#o;W#o;'S$_;'S;=`$v<%lO$_VBcY!QPjSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$_VCW[jSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#W;W#W#XC|#X#o;W#o;'S$_;'S;=`$v<%lO$_VDTYjSvROt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$_VDx]jSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#UEq#U#b;W#b#cIX#c#o;W#o;'S$_;'S;=`$v<%lO$_VEv[jSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#`;W#`#aFl#a#o;W#o;'S$_;'S;=`$v<%lO$_VFq[jSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#g;W#g#hGg#h#o;W#o;'S$_;'S;=`$v<%lO$_VGl[jSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#X;W#X#YHb#Y#o;W#o;'S$_;'S;=`$v<%lO$_VHiYnRjSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$_VI`YsRjSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$_VJT[jSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#Y;W#Y#ZJy#Z#o;W#o;'S$_;'S;=`$v<%lO$_VKQY|PjSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$__Kw[!lWjSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#i;W#i#jLm#j#o;W#o;'S$_;'S;=`$v<%lO$_VLr[jSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#`;W#`#aMh#a#o;W#o;'S$_;'S;=`$v<%lO$_VMm[jSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#`;W#`#aNc#a#o;W#o;'S$_;'S;=`$v<%lO$_VNjYpRjSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$_V! _[jSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#f;W#f#g!!T#g#o;W#o;'S$_;'S;=`$v<%lO$_V!![YhRjSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$_^!#RY!nWjSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$__!#x[!mWjSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#f;W#f#g!$n#g#o;W#o;'S$_;'S;=`$v<%lO$_V!$s[jSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#i;W#i#jGg#j#o;W#o;'S$_;'S;=`$v<%lO$_V!%pUzRjSOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_~!&XO!v~", tokenizers: [0, 1, 2, 3, tokenizer], - topRules: {"Program":[0,3]}, - tokenPrec: 767 + topRules: {"Program":[0,5]}, + tokenPrec: 768 }) diff --git a/src/parser/tests/basics.test.ts b/src/parser/tests/basics.test.ts index 94f84db..1505f62 100644 --- a/src/parser/tests/basics.test.ts +++ b/src/parser/tests/basics.test.ts @@ -10,7 +10,7 @@ describe('null', () => { test('parses null in assignments', () => { expect('a = null').toMatchTree(` Assign - Identifier a + AssignableIdentifier a operator = Null null`) }) @@ -212,11 +212,11 @@ describe('newlines', () => { expect(`x = 5 y = 2`).toMatchTree(` Assign - Identifier x + AssignableIdentifier x operator = Number 5 Assign - Identifier y + AssignableIdentifier y operator = Number 2`) }) @@ -224,11 +224,11 @@ y = 2`).toMatchTree(` test('parses statements separated by semicolons', () => { expect(`x = 5; y = 2`).toMatchTree(` Assign - Identifier x + AssignableIdentifier x operator = Number 5 Assign - Identifier y + AssignableIdentifier y operator = Number 2`) }) @@ -236,7 +236,7 @@ y = 2`).toMatchTree(` test('parses statement with word and a semicolon', () => { expect(`a = hello; 2`).toMatchTree(` Assign - Identifier a + AssignableIdentifier a operator = FunctionCallOrIdentifier Identifier hello @@ -248,7 +248,7 @@ describe('Assign', () => { test('parses simple assignment', () => { expect('x = 5').toMatchTree(` Assign - Identifier x + AssignableIdentifier x operator = Number 5`) }) @@ -256,7 +256,7 @@ describe('Assign', () => { test('parses assignment with addition', () => { expect('x = 5 + 3').toMatchTree(` Assign - Identifier x + AssignableIdentifier x operator = BinOp Number 5 @@ -267,13 +267,13 @@ describe('Assign', () => { test('parses assignment with functions', () => { expect('add = fn a b: a + b end').toMatchTree(` Assign - Identifier add + AssignableIdentifier add operator = FunctionDef keyword fn Params - Identifier a - Identifier b + 
AssignableIdentifier a
+            AssignableIdentifier b
           colon :
           BinOp
             Identifier a
@@ -282,3 +282,40 @@
       end end`)
   })
 })
+
+describe('DotGet whitespace sensitivity', () => {
+  test('no whitespace - DotGet works when identifier in scope', () => {
+    expect('basename = 5; basename.prop').toMatchTree(`
+      Assign
+        AssignableIdentifier basename
+        operator =
+        Number 5
+      DotGet
+        IdentifierBeforeDot basename
+        Identifier prop`)
+  })
+
+  test('slash with spaces - NOT DotGet, parses as division', () => {
+    expect('basename = 5; basename / prop').toMatchTree(`
+      Assign
+        AssignableIdentifier basename
+        operator =
+        Number 5
+      BinOp
+        Identifier basename
+        operator /
+        Identifier prop`)
+  })
+
+  test('dot followed by slash is Word, not DotGet', () => {
+    expect('basename ./cool').toMatchTree(`
+      FunctionCall
+        Identifier basename
+        PositionalArg
+          Word ./cool`)
+  })
+
+  test('identifier not in scope with dot becomes Word', () => {
+    expect('readme.txt').toMatchTree(`Word readme.txt`)
+  })
+})
diff --git a/src/parser/tests/control-flow.test.ts b/src/parser/tests/control-flow.test.ts
index 250e0b8..88ec3ad 100644
--- a/src/parser/tests/control-flow.test.ts
+++ b/src/parser/tests/control-flow.test.ts
@@ -19,7 +19,7 @@ describe('if/elsif/else', () => {
 
     expect('a = if x: 2').toMatchTree(`
       Assign
-        Identifier a
+        AssignableIdentifier a
         operator =
         IfExpr
           keyword if
diff --git a/src/parser/tests/dot-get.test.ts b/src/parser/tests/dot-get.test.ts
new file mode 100644
index 0000000..d11341b
--- /dev/null
+++ b/src/parser/tests/dot-get.test.ts
@@ -0,0 +1,148 @@
+import { describe, test, expect } from 'bun:test'
+import '../../testSetup'
+
+describe('DotGet', () => {
+  test('readme.txt is Word when readme not in scope', () => {
+    expect('readme.txt').toMatchTree(`Word readme.txt`)
+  })
+
+  test('readme.txt is Word when used in function', () => {
+    expect('echo readme.txt').toMatchTree(`
+      FunctionCall
+        Identifier echo
+        PositionalArg
+          Word readme.txt`)
+  })
+
+  test('obj.prop is DotGet when obj is assigned', () => {
+    expect('obj = 5; obj.prop').toMatchTree(`
+      Assign
+        AssignableIdentifier obj
+        operator =
+        Number 5
+      DotGet
+        IdentifierBeforeDot obj
+        Identifier prop
+    `)
+  })
+
+  test('function parameters are in scope within function body', () => {
+    expect('fn config: config.path end').toMatchTree(`
+      FunctionDef
+        keyword fn
+        Params
+          AssignableIdentifier config
+        colon :
+        DotGet
+          IdentifierBeforeDot config
+          Identifier path
+        end end
+    `)
+  })
+
+  test('parameters out of scope outside function', () => {
+    expect('fn x: x.prop end; x.prop').toMatchTree(`
+      FunctionDef
+        keyword fn
+        Params
+          AssignableIdentifier x
+        colon :
+        DotGet
+          IdentifierBeforeDot x
+          Identifier prop
+        end end
+      Word x.prop
+    `)
+  })
+
+  test('multiple parameters work correctly', () => {
+    expect(`fn x y:
+  x.foo
+  y.bar
+end`).toMatchTree(`
+      FunctionDef
+        keyword fn
+        Params
+          AssignableIdentifier x
+          AssignableIdentifier y
+        colon :
+        DotGet
+          IdentifierBeforeDot x
+          Identifier foo
+        DotGet
+          IdentifierBeforeDot y
+          Identifier bar
+        end end
+    `)
+  })
+
+  test('nested functions with scope isolation', () => {
+    expect(`fn x:
+  x.outer
+  fn y: y.inner end
+end`).toMatchTree(`
+      FunctionDef
+        keyword fn
+        Params
+          AssignableIdentifier x
+        colon :
+        DotGet
+          IdentifierBeforeDot x
+          Identifier outer
+        FunctionDef
+          keyword fn
+          Params
+            AssignableIdentifier y
+          colon :
+          DotGet
+            IdentifierBeforeDot y
+            Identifier inner
+          end end
+      end end
+    `)
+  })
+
+  test('dot get works as function argument', () => {
+    expect('config = 42; echo config.path').toMatchTree(`
+      Assign
+        AssignableIdentifier config
+        operator =
+        Number 42
+      FunctionCall
+        Identifier echo
+        PositionalArg
+          DotGet
+            IdentifierBeforeDot config
+            Identifier path
+    `)
+  })
+
+  test('mixed file paths and dot get', () => {
+    expect('config = 42; cat readme.txt; echo config.path').toMatchTree(`
+      Assign
+        AssignableIdentifier config
+        operator =
+        Number 42
+      FunctionCall
+        Identifier cat
+        PositionalArg
+          Word readme.txt
+      FunctionCall
+        Identifier echo
+        PositionalArg
+          DotGet
+            IdentifierBeforeDot config
+            Identifier path
+    `)
+  })
+
+  test("dot get doesn't work with spaces", () => {
+    expect('obj . prop').toMatchTree(`
+      FunctionCall
+        Identifier obj
+        PositionalArg
+          Word .
+        PositionalArg
+          Identifier prop`)
+  })
+})
diff --git a/src/parser/tests/functions.test.ts b/src/parser/tests/functions.test.ts
index f24eaed..f9632a5 100644
--- a/src/parser/tests/functions.test.ts
+++ b/src/parser/tests/functions.test.ts
@@ -72,7 +72,7 @@ describe('Fn', () => {
       FunctionDef
         keyword fn
         Params
-          Identifier x
+          AssignableIdentifier x
         colon :
         BinOp
           Identifier x
@@ -86,8 +86,8 @@ describe('Fn', () => {
       FunctionDef
         keyword fn
         Params
-          Identifier x
-          Identifier y
+          AssignableIdentifier x
+          AssignableIdentifier y
         colon :
         BinOp
           Identifier x
@@ -104,8 +104,8 @@ end`).toMatchTree(`
       FunctionDef
         keyword fn
         Params
-          Identifier x
-          Identifier y
+          AssignableIdentifier x
+          AssignableIdentifier y
         colon :
         BinOp
           Identifier x
diff --git a/src/parser/tests/multiline.test.ts b/src/parser/tests/multiline.test.ts
index 11993e9..f71faab 100644
--- a/src/parser/tests/multiline.test.ts
+++ b/src/parser/tests/multiline.test.ts
@@ -21,16 +21,16 @@ describe('multiline', () => {
 add 3 4
 `).toMatchTree(`
       Assign
-        Identifier add
+        AssignableIdentifier add
         operator =
         FunctionDef
           keyword fn
           Params
-            Identifier a
-            Identifier b
+            AssignableIdentifier a
+            AssignableIdentifier b
           colon :
           Assign
-            Identifier result
+            AssignableIdentifier result
             operator =
             BinOp
               Identifier a
@@ -63,8 +63,8 @@ end
         FunctionDef
           keyword fn
           Params
-            Identifier x
-            Identifier y
+            AssignableIdentifier x
+            AssignableIdentifier y
           colon :
           FunctionCallOrIdentifier
             Identifier x
diff --git a/src/parser/tests/pipes.test.ts b/src/parser/tests/pipes.test.ts
index 25eb829..61d6f73 100644
--- a/src/parser/tests/pipes.test.ts
+++ b/src/parser/tests/pipes.test.ts
@@ -50,7 +50,7 @@ describe('pipe expressions', () => {
   test('pipe expression in assignment', () => {
     expect('result = echo hello | grep h').toMatchTree(`
       Assign
-        Identifier result
+        AssignableIdentifier result
         operator =
         PipeExpr
           FunctionCall
@@ -77,7 +77,7 @@ describe('pipe expressions', () => {
         FunctionDef
           keyword fn
           Params
-            Identifier x
+            AssignableIdentifier x
           colon :
           FunctionCallOrIdentifier
             Identifier x
diff --git a/src/parser/tokenizer.ts b/src/parser/tokenizer.ts
index 07fbc97..767c2b6 100644
--- a/src/parser/tokenizer.ts
+++ b/src/parser/tokenizer.ts
@@ -1,30 +1,107 @@
 import { ExternalTokenizer, InputStream, Stack } from '@lezer/lr'
-import { Identifier, Word } from './shrimp.terms'
+import { Identifier, AssignableIdentifier, Word, IdentifierBeforeDot } from './shrimp.terms'
 
 // The only chars that can't be words are whitespace, apostrophes, closing parens, and EOF.
-export const tokenizer = new ExternalTokenizer((input: InputStream, stack: Stack) => {
-  let ch = getFullCodePoint(input, 0)
-  if (!isWordChar(ch)) return
+export const tokenizer = new ExternalTokenizer(
+  (input: InputStream, stack: Stack) => {
+    const ch = getFullCodePoint(input, 0)
+    if (!isWordChar(ch)) return
 
-  let pos = getCharSize(ch)
-  let isValidIdentifier = isLowercaseLetter(ch) || isEmoji(ch)
-  const canBeWord = stack.canShift(Word)
+    const isValidStart = isLowercaseLetter(ch) || isEmoji(ch)
+    const canBeWord = stack.canShift(Word)
+
+    // Consume all word characters, tracking if it remains a valid identifier
+    const { pos, isValidIdentifier, stoppedAtDot } = consumeWordToken(
+      input,
+      isValidStart,
+      canBeWord
+    )
+
+    // Check if we should emit IdentifierBeforeDot for property access
+    if (stoppedAtDot) {
+      const dotGetToken = checkForDotGet(input, stack, pos)
+
+      if (dotGetToken) {
+        input.advance(pos)
+        input.acceptToken(dotGetToken)
+      } else {
+        // Not in scope - continue consuming the dot as part of the word
+        const afterDot = consumeRestOfWord(input, pos + 1, canBeWord)
+        input.advance(afterDot)
+        input.acceptToken(Word)
+      }
+
+      return
+    }
+
+    // Advance past the token we consumed
+    input.advance(pos)
+
+    // Choose which token to emit
+    if (isValidIdentifier) {
+      const token = chooseIdentifierToken(input, stack)
+      input.acceptToken(token)
+    } else {
+      input.acceptToken(Word)
+    }
+  },
+  { contextual: true }
+)
+
+// Build identifier text from input stream, handling surrogate pairs for emoji
+const buildIdentifierText = (input: InputStream, length: number): string => {
+  let text = ''
+  for (let i = 0; i < length; i++) {
+    const charCode = input.peek(i)
+    if (charCode === -1) break
+
+    // Handle surrogate pairs for emoji (UTF-16 encoding)
+    if (charCode >= 0xd800 && charCode <= 0xdbff && i + 1 < length) {
+      const low = input.peek(i + 1)
+      if (low >= 0xdc00 && low <= 0xdfff) {
+        text += String.fromCharCode(charCode, low)
+        i++ // Skip the low surrogate
+        continue
+      }
+    }
+    text += String.fromCharCode(charCode)
+  }
+  return text
+}
+
+// Consume word characters, tracking if it remains a valid identifier
+// Returns the position after consuming, whether it's a valid identifier, and if we stopped at a dot
+const consumeWordToken = (
+  input: InputStream,
+  isValidStart: boolean,
+  canBeWord: boolean
+): { pos: number; isValidIdentifier: boolean; stoppedAtDot: boolean } => {
+  let pos = getCharSize(getFullCodePoint(input, 0))
+  let isValidIdentifier = isValidStart
+  let stoppedAtDot = false
 
   while (true) {
-    ch = getFullCodePoint(input, pos)
+    const ch = getFullCodePoint(input, pos)
 
+    // Stop at dot if we have a valid identifier (might be property access)
+    if (ch === 46 /* . */ && isValidIdentifier) {
+      stoppedAtDot = true
+      break
+    }
+
+    // Stop if we hit a non-word character
     if (!isWordChar(ch)) break
 
-    // Certain characters might end a word or identifier if they are followed by whitespace.
-    // This allows things like `a = hello; 2` of if `x: y` to parse correctly.
+    // Context-aware termination: semicolon/colon can end a word if followed by whitespace
+    // This allows `hello; 2` to parse correctly while `hello;world` stays as one word
     if (canBeWord && (ch === 59 /* ; */ || ch === 58) /* : */) {
       const nextCh = getFullCodePoint(input, pos + 1)
       if (!isWordChar(nextCh)) break
     }
 
-    // Track identifier validity
-    if (!isLowercaseLetter(ch) && !isDigit(ch) && ch !== 45 && !isEmoji(ch)) {
+    // Track identifier validity: must be lowercase, digit, dash, or emoji
+    if (!isLowercaseLetter(ch) && !isDigit(ch) && ch !== 45 /* - */ && !isEmoji(ch)) {
       if (!canBeWord) break
       isValidIdentifier = false
     }
@@ -32,19 +109,73 @@ export const tokenizer = new ExternalTokenizer((input: InputStream, stack: Stack
     pos += getCharSize(ch)
   }
 
-  input.advance(pos)
-  input.acceptToken(isValidIdentifier ? Identifier : Word)
-})
+  return { pos, isValidIdentifier, stoppedAtDot }
+}
+
+// Consume the rest of a word after we've decided not to treat a dot as DotGet
+// Used when we have "file.txt" - we already consumed "file", now consume ".txt"
+const consumeRestOfWord = (input: InputStream, startPos: number, canBeWord: boolean): number => {
+  let pos = startPos
+  while (true) {
+    const ch = getFullCodePoint(input, pos)
+
+    // Stop if we hit a non-word character
+    if (!isWordChar(ch)) break
+
+    // Context-aware termination for semicolon/colon
+    if (canBeWord && (ch === 59 /* ; */ || ch === 58) /* : */) {
+      const nextCh = getFullCodePoint(input, pos + 1)
+      if (!isWordChar(nextCh)) break
+    }
+
+    pos += getCharSize(ch)
+  }
+  return pos
+}
+
+// Check if this identifier is in scope (for property access detection)
+// Returns IdentifierBeforeDot token if in scope, null otherwise
+const checkForDotGet = (input: InputStream, stack: Stack, pos: number): number | null => {
+  const identifierText = buildIdentifierText(input, pos)
+  const context = stack.context as { scope: { has(name: string): boolean } } | undefined
+
+  // If identifier is in scope, this is property access (e.g., obj.prop)
+  // If not in scope, it should be consumed as a Word (e.g., file.txt)
+  return context?.scope.has(identifierText) ? IdentifierBeforeDot : null
+}
+
+// Decide between AssignableIdentifier and Identifier using grammar state + peek-ahead
+const chooseIdentifierToken = (input: InputStream, stack: Stack): number => {
+  const canAssignable = stack.canShift(AssignableIdentifier)
+  const canRegular = stack.canShift(Identifier)
+
+  // Only one option is valid - use it
+  if (canAssignable && !canRegular) return AssignableIdentifier
+  if (canRegular && !canAssignable) return Identifier
+
+  // Both possible (ambiguous context) - peek ahead for '=' to disambiguate
+  // This happens at statement start where both `x = 5` (assign) and `echo x` (call) are valid
+  let peekPos = 0
+  while (true) {
+    const ch = getFullCodePoint(input, peekPos)
+    if (isWhiteSpace(ch)) {
+      peekPos += getCharSize(ch)
+    } else {
+      break
+    }
+  }
+
+  const nextCh = getFullCodePoint(input, peekPos)
+  return nextCh === 61 /* = */ ? AssignableIdentifier : Identifier
+}
+
+// Character classification helpers
 const isWhiteSpace = (ch: number): boolean => {
-  return ch === 32 /* space */ || ch === 10 /* \n */ || ch === 9 /* tab */ || ch === 13 /* \r */
+  return ch === 32 /* space */ || ch === 9 /* tab */ || ch === 13 /* \r */
 }
 
 const isWordChar = (ch: number): boolean => {
-  const closingParen = ch === 41 /* ) */
-  const eof = ch === -1
-
-  return !isWhiteSpace(ch) && !closingParen && !eof
+  return !isWhiteSpace(ch) && ch !== 10 /* \n */ && ch !== 41 /* ) */ && ch !== -1 /* EOF */
 }
 
 const isLowercaseLetter = (ch: number): boolean => {
@@ -68,7 +199,7 @@ const getFullCodePoint = (input: InputStream, pos: number): number => {
     }
   }
 
-  return ch // Single code unit
+  return ch
 }
 
 const isEmoji = (ch: number): boolean => {
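
Usage sketch (not part of the patch above): a minimal check of the scope-dependent parse this diff documents in CLAUDE.md. It assumes the generated export in src/parser/shrimp.ts and Lezer's standard Tree.toString(); the node names follow the tests in this diff, and the printed shapes are approximate.

import { parser } from './src/parser/shrimp'

// 'readme' was never assigned, so the tokenizer keeps 'readme.txt' as a single Word (file-path-like).
console.log(parser.parse('readme.txt').toString())
// roughly: Program(Word)

// After 'obj = 5', the ContextTracker has 'obj' in scope, so 'obj.prop' parses as a DotGet node.
console.log(parser.parse('obj = 5; obj.prop').toString())
// roughly: Program(Assign(AssignableIdentifier,Number),DotGet(IdentifierBeforeDot,Identifier))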