diff --git a/CLAUDE.md b/CLAUDE.md index 97cde30..e3404cd 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -295,6 +295,7 @@ These discoveries came from implementing string interpolation with external toke **The most surprising discovery**: Rule names determine whether nodes appear in the parse tree. **Lowercase rules get inlined** (no tree nodes): + ```lezer statement { assign | expr } // ❌ No "statement" node assign { x "=" y } // ❌ No "assign" node @@ -302,6 +303,7 @@ expr { x | y } // ❌ No "expr" node ``` **Capitalized rules create tree nodes**: + ```lezer Statement { Assign | Expr } // ✅ Creates Statement node Assign { x "=" y } // ✅ Creates Assign node @@ -339,6 +341,7 @@ Example: `x = 42` was parsing as `Program(Identifier,"=",Number)` instead of `Pr **Reality**: External tokenizers work perfectly inside `@skip {}` blocks! The tokenizer gets called even when skip is disabled. **Working pattern**: + ```lezer @external tokens tokenizer from "./tokenizer" { Identifier, Word } @@ -357,6 +360,7 @@ Interpolation { ### 4. Single-Character Tokens Can Be Literals **Initial approach**: Define every single character as a token: + ```lezer @tokens { dollar[@name="$"] { "$" } @@ -365,13 +369,14 @@ Interpolation { ``` **Simpler approach**: Just use literals in the grammar rules: + ```lezer Interpolation { "$" Identifier | // Literal "$" "$" "(" expr ")" } -StringEscape { +EscapeSeq { "\\" ("$" | "n" | ...) // Literal "\\" } ``` diff --git a/src/compiler/compiler.ts b/src/compiler/compiler.ts index f8a0eed..df60f2b 100644 --- a/src/compiler/compiler.ts +++ b/src/compiler/compiler.ts @@ -23,7 +23,7 @@ const DEBUG = false type Label = `.${string}` // Process escape sequences in strings -function processEscapeSequence(escapeSeq: string): string { +function processEscapeSeq(escapeSeq: string): string { // escapeSeq includes the backslash, e.g., "\n", "\$", "\\" if (escapeSeq.length !== 2) return escapeSeq @@ -130,9 +130,9 @@ export class Compiler { instructions.push(['PUSH', partValue]) break - case terms.StringEscape: + case terms.EscapeSeq: // Process escape sequence and push the result - const processed = processEscapeSequence(partValue) + const processed = processEscapeSeq(partValue) instructions.push(['PUSH', processed]) break diff --git a/src/compiler/utils.ts b/src/compiler/utils.ts index 0a4cf97..a67833b 100644 --- a/src/compiler/utils.ts +++ b/src/compiler/utils.ts @@ -177,7 +177,7 @@ export const getStringParts = (node: SyntaxNode, input: string) => { return ( child.type.id === terms.StringFragment || child.type.id === terms.Interpolation || - child.type.id === terms.StringEscape + child.type.id === terms.EscapeSeq ) }) @@ -186,10 +186,10 @@ export const getStringParts = (node: SyntaxNode, input: string) => { if ( part.type.id !== terms.StringFragment && part.type.id !== terms.Interpolation && - part.type.id !== terms.StringEscape + part.type.id !== terms.EscapeSeq ) { throw new CompilerError( - `String child must be StringFragment, Interpolation, or StringEscape, got ${part.type.name}`, + `String child must be StringFragment, Interpolation, or EscapeSeq, got ${part.type.name}`, part.from, part.to ) diff --git a/src/parser/shrimp.grammar b/src/parser/shrimp.grammar index f922672..5bf3af2 100644 --- a/src/parser/shrimp.grammar +++ b/src/parser/shrimp.grammar @@ -39,7 +39,7 @@ } -@external tokens tokenizer from "./tokenizer" { Identifier, Word } +@external tokens tokenizer from "./tokenizer" { Identifier, WordFragment } @precedence { pipe @left, @@ -170,12 +170,13 @@ expression { @skip {} { String { 
"'" stringContent* "'" } + } stringContent { StringFragment | Interpolation | - StringEscape + EscapeSeq } Interpolation { @@ -183,10 +184,18 @@ Interpolation { "$" ParenExpr } -StringEscape { +EscapeSeq { "\\" ("$" | "n" | "t" | "r" | "\\" | "'") } + +Word { wordContent+ } + +wordContent { + WordFragment | Interpolation | EscapeSeq +} + + // We need expressionWithoutIdentifier to avoid conflicts in consumeToTerminator. // Without this, when parsing "my-var" at statement level, the parser can't decide: // - ambiguousFunctionCall → FunctionCallOrIdentifier → Identifier diff --git a/src/parser/shrimp.terms.ts b/src/parser/shrimp.terms.ts index 965a67d..539b754 100644 --- a/src/parser/shrimp.terms.ts +++ b/src/parser/shrimp.terms.ts @@ -1,7 +1,7 @@ // This file was generated by lezer-generator. You probably shouldn't edit it. export const Identifier = 1, - Word = 2, + WordFragment = 2, Program = 3, PipeExpr = 4, FunctionCall = 5, @@ -10,21 +10,22 @@ export const FunctionCallOrIdentifier = 8, BinOp = 9, ConditionalOp = 14, - String = 23, - StringFragment = 24, - Interpolation = 25, - StringEscape = 26, - Number = 27, - Boolean = 28, - FunctionDef = 29, - Params = 31, - colon = 32, - end = 33, - Underscore = 34, - NamedArg = 35, - NamedArgPrefix = 36, - IfExpr = 38, - ThenBlock = 41, - ElsifExpr = 42, - ElseExpr = 44, - Assign = 46 + Word = 23, + Interpolation = 24, + EscapeSeq = 25, + String = 26, + StringFragment = 27, + Number = 28, + Boolean = 29, + FunctionDef = 30, + Params = 32, + colon = 33, + end = 34, + Underscore = 35, + NamedArg = 36, + NamedArgPrefix = 37, + IfExpr = 39, + ThenBlock = 42, + ElsifExpr = 43, + ElseExpr = 45, + Assign = 47 diff --git a/src/parser/shrimp.ts b/src/parser/shrimp.ts index 835f63c..427658d 100644 --- a/src/parser/shrimp.ts +++ b/src/parser/shrimp.ts @@ -4,20 +4,20 @@ import {tokenizer} from "./tokenizer" import {highlighting} from "./highlight" export const parser = LRParser.deserialize({ version: 14, - states: 
".WQVQaOOO!rQbO'#CdO#SQPO'#CeO#bQPO'#DhO$[QaO'#CcO$cOSO'#CsOOQ`'#Dl'#DlO$qQPO'#DkO%YQaO'#DvOOQ`'#Cy'#CyOOQO'#Di'#DiO%bQPO'#DhO%pQaO'#DzOOQO'#DS'#DSOOQO'#Dh'#DhO%wQPO'#DgOOQ`'#Dg'#DgOOQ`'#D]'#D]QVQaOOOOQ`'#Dk'#DkOOQ`'#Cb'#CbO&PQaO'#DPOOQ`'#Dj'#DjOOQ`'#D^'#D^O&^QbO,58{O&}QaO,59vO%pQaO,59PO%pQaO,59PO'[QbO'#CdO(gQPO'#CeO(wQPO,58}O)YQPO,58}O)TQPO,58}O*TQPO,58}O*]QaO'#CuO*eQWO'#CvOOOO'#Dp'#DpOOOO'#D_'#D_O*yOSO,59_OOQ`,59_,59_OOQ`'#D`'#D`O+XQaO'#C{O+aQPO,5:bO+fQaO'#DbO+kQPO,58zO+|QPO,5:fO,TQPO,5:fOOQ`,5:R,5:ROOQ`-E7Z-E7ZOOQ`,59k,59kOOQ`-E7[-E7[OOQO1G/b1G/bOOQO1G.k1G.kO,YQPO1G.kO%pQaO,59UO%pQaO,59UOOQ`1G.i1G.iOOOO,59a,59aOOOO,59b,59bOOOO-E7]-E7]OOQ`1G.y1G.yOOQ`-E7^-E7^O,tQaO1G/|O-UQbO'#CdOOQO,59|,59|OOQO-E7`-E7`O-uQaO1G0QOOQO1G.p1G.pO.VQPO1G.pO.aQPO7+%hO.fQaO7+%iOOQO'#DU'#DUOOQO7+%l7+%lO.vQaO7+%mOOQ`<oAN>oO%pQaO'#DWOOQO'#Dc'#DcO0ZQPOAN>sO0fQPO'#DYOOQOAN>sAN>sO0kQPOAN>sO0pQPO,59rO0wQPO,59rOOQO-E7a-E7aOOQOG24_G24_O0|QPOG24_O1RQPO,59tO1WQPO1G/^OOQOLD)yLD)yO.fQaO1G/`O.vQaO7+$xOOQO7+$z7+$zOOQO<rAN>rO(QQaO'#DXOOQO'#De'#DeO1}QPOAN>vO2YQPO'#DZOOQOAN>vAN>vO2_QPOAN>vO2dQPO,59sO2kQPO,59sOOQO-E7c-E7cOOQOG24bG24bO2pQPOG24bO2uQPO,59uO2zQPO1G/_OOQOLD)|LD)|O0YQaO1G/aO0jQaO7+$yOOQO7+${7+${OOQO<S[hSOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#g2i#g#h>x#h#o2i#o;'S$_;'S;=`$v<%lO$_V>}[hSOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#X2i#X#Y?s#Y#o2i#o;'S$_;'S;=`$v<%lO$_V?zYlRhSOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#o2i#o;'S$_;'S;=`$v<%lO$_V@qYnRhSOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#o2i#o;'S$_;'S;=`$v<%lO$_VAf[hSOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#Y2i#Y#ZB[#Z#o2i#o;'S$_;'S;=`$v<%lO$_VBcYwPhSOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#o2i#o;'S$_;'S;=`$v<%lO$_^CYY!gWhSOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#o2i#o;'S$_;'S;=`$v<%lO$_VC}[hSOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#f2i#f#gDs#g#o2i#o;'S$_;'S;=`$v<%lO$_VDzYfRhSOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#o2i#o;'S$_;'S;=`$v<%lO$_^EqY!iWhSOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#o2i#o;'S$_;'S;=`$v<%lO$__Fh[!hWhSOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#f2i#f#gG^#g#o2i#o;'S$_;'S;=`$v<%lO$_VGc[hSOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#i2i#i#j>x#j#o2i#o;'S$_;'S;=`$v<%lO$_VH`UuRhSOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_~HwO!p~", + repeatNodeCount: 8, + tokenData: "Hw~R!SOX$_XY$|YZ%gZp$_pq$|qr&Qrt$_tu'Yuw$_wx'_xy'dyz'}z{(h{|)R|}$_}!O)l!O!P$_!P!Q,b!Q![*]![!],{!]!^%g!^!_-f!_!`.p!`!a/Z!a#O$_#O#P0e#P#R$_#R#S0j#S#T$_#T#U1T#U#X2i#X#Y5O#Y#ZS[kWOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#g2i#g#h>x#h#o2i#o;'S$_;'S;=`$v<%lO$_Z>}[kWOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#X2i#X#Y?s#Y#o2i#o;'S$_;'S;=`$v<%lO$_Z?zYmRkWOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#o2i#o;'S$_;'S;=`$v<%lO$_Z@qYoRkWOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#o2i#o;'S$_;'S;=`$v<%lO$_ZAf[kWOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#Y2i#Y#ZB[#Z#o2i#o;'S$_;'S;=`$v<%lO$_ZBcYxPkWOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#o2i#o;'S$_;'S;=`$v<%lO$_^CYY!hSkWOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#o2i#o;'S$_;'S;=`$v<%lO$_ZC}[kWOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#f2i#f#gDs#g#o2i#o;'S$_;'S;=`$v<%lO$_ZDzYfRkWOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#o2i#o;'S$_;'S;=`$v<%lO$_^EqY!jSkWOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#o2i#o;'S$_;'S;=`$v<%lO$__Fh[!iSkWOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#f2i#f#gG^#g#o2i#o;'S$_;'S;=`$v<%lO$_ZGc[kWOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#i2i#i#j>x#j#o2i#o;'S$_;'S;=`$v<%lO$_ZH`UvRkWOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_~HwO!s~", tokenizers: [0, 1, 2, 3, tokenizer], topRules: {"Program":[0,3]}, - tokenPrec: 749 + tokenPrec: 829 }) diff --git a/src/parser/tests/strings.test.ts b/src/parser/tests/strings.test.ts index 037ac55..00f4cd3 100644 --- a/src/parser/tests/strings.test.ts +++ b/src/parser/tests/strings.test.ts @@ -55,7 +55,7 @@ describe('string escape sequences', 
() => { expect("'price is \\$10'").toMatchTree(` String StringFragment ${'price is '} - StringEscape \\$ + EscapeSeq \\$ StringFragment 10 `) }) @@ -64,7 +64,7 @@ describe('string escape sequences', () => { expect("'it\\'s working'").toMatchTree(` String StringFragment ${'it'} - StringEscape \\' + EscapeSeq \\' StringFragment ${'s working'} `) }) @@ -73,7 +73,7 @@ describe('string escape sequences', () => { expect("'path\\\\file'").toMatchTree(` String StringFragment path - StringEscape \\\\ + EscapeSeq \\\\ StringFragment file `) }) @@ -82,7 +82,7 @@ describe('string escape sequences', () => { expect("'line1\\nline2'").toMatchTree(` String StringFragment line1 - StringEscape \\n + EscapeSeq \\n StringFragment line2 `) }) @@ -91,7 +91,7 @@ describe('string escape sequences', () => { expect("'col1\\tcol2'").toMatchTree(` String StringFragment col1 - StringEscape \\t + EscapeSeq \\t StringFragment col2 `) }) @@ -100,7 +100,7 @@ describe('string escape sequences', () => { expect("'text\\rmore'").toMatchTree(` String StringFragment text - StringEscape \\r + EscapeSeq \\r StringFragment more `) }) @@ -108,11 +108,11 @@ describe('string escape sequences', () => { test('multiple escape sequences', () => { expect("'\\$10\\nTotal: \\$20'").toMatchTree(` String - StringEscape \\$ + EscapeSeq \\$ StringFragment 10 - StringEscape \\n + EscapeSeq \\n StringFragment ${'Total: '} - StringEscape \\$ + EscapeSeq \\$ StringFragment 20 `) }) diff --git a/src/parser/tests/word-interpolation.test.ts b/src/parser/tests/word-interpolation.test.ts new file mode 100644 index 0000000..ca98b9d --- /dev/null +++ b/src/parser/tests/word-interpolation.test.ts @@ -0,0 +1,195 @@ +import { describe, expect, test } from 'bun:test' +import '../shrimp.grammar' // Importing this so changes cause it to retest! 
+
+describe('word interpolation', () => {
+  test('word with variable interpolation', () => {
+    expect('path/$file').toMatchTree(`
+      Word
+        WordFragment path/
+        Interpolation
+          Identifier file
+    `)
+  })
+
+  test('word with expression interpolation', () => {
+    expect('prefix-$(123)').toMatchTree(`
+      Word
+        WordFragment prefix-
+        Interpolation
+          leftParen
+          Number 123
+          rightParen
+    `)
+  })
+
+  test('multiple interpolations in word', () => {
+    expect('$user/$file').toMatchTree(`
+      Word
+        Interpolation
+          Identifier user
+        WordFragment /
+        Interpolation
+          Identifier file
+    `)
+  })
+
+  test('dollar not followed by identifier stays in word', () => {
+    expect('price$10').toMatchTree(`
+      Word
+        WordFragment price$10
+    `)
+  })
+
+  test('escaped dollar in word', () => {
+    expect('price\\$10').toMatchTree(`
+      Word
+        WordFragment price
+        EscapeSeq
+        WordFragment 10
+    `)
+  })
+
+  test('interpolation at start of word', () => {
+    expect('$HOME/documents').toMatchTree(`
+      Word
+        Interpolation
+          Identifier HOME
+        WordFragment /documents
+    `)
+  })
+
+  test('interpolation at end of word', () => {
+    expect('./path/$filename').toMatchTree(`
+      Word
+        WordFragment ./path/
+        Interpolation
+          Identifier filename
+    `)
+  })
+
+  test('complex expression interpolation', () => {
+    expect('output-$(add 1 2).txt').toMatchTree(`
+      Word
+        WordFragment output-
+        Interpolation
+          leftParen
+          FunctionCall
+            Identifier add
+            PositionalArg
+              Number 1
+            PositionalArg
+              Number 2
+          rightParen
+        WordFragment .txt
+    `)
+  })
+
+  test('emoji in interpolated identifier', () => {
+    expect('hello/$😎file').toMatchTree(`
+      Word
+        WordFragment hello/
+        Interpolation
+          Identifier 😎file
+    `)
+  })
+
+  test('escaped space in word', () => {
+    expect('my\\ file.txt').toMatchTree(`
+      Word
+        WordFragment my
+        EscapeSeq
+        WordFragment file.txt
+    `)
+  })
+
+  test('multiple escapes and interpolations', () => {
+    expect('pre\\$fix-$var-\\$end').toMatchTree(`
+      Word
+        WordFragment pre
+        EscapeSeq
+        WordFragment fix-
+        Interpolation
+          Identifier var
+        WordFragment -
+        EscapeSeq
+        WordFragment end
+    `)
+  })
+
+  test('plain word without interpolation still works', () => {
+    expect('./file.txt').toMatchTree(`
+      Word
+        WordFragment ./file.txt
+    `)
+  })
+
+  test('word with URL-like content', () => {
+    expect('https://example.com/$path').toMatchTree(`
+      Word
+        WordFragment https://example.com/
+        Interpolation
+          Identifier path
+    `)
+  })
+
+  test('nested expression in interpolation', () => {
+    expect('file-$(multiply (add 1 2) 3).txt').toMatchTree(`
+      Word
+        WordFragment file-
+        Interpolation
+          leftParen
+          FunctionCall
+            Identifier multiply
+            PositionalArg
+              ParenExpr
+                leftParen
+                FunctionCall
+                  Identifier add
+                  PositionalArg
+                    Number 1
+                  PositionalArg
+                    Number 2
+                rightParen
+            PositionalArg
+              Number 3
+          rightParen
+        WordFragment .txt
+    `)
+  })
+})
+
+describe('word interpolation in function calls', () => {
+  test('function call with interpolated word argument', () => {
+    expect('cat /home/$user/file.txt').toMatchTree(`
+      FunctionCall
+        Identifier cat
+        PositionalArg
+          Word
+            WordFragment /home/
+            Interpolation
+              Identifier user
+            WordFragment /file.txt
+    `)
+  })
+
+  test('multiple interpolated word arguments', () => {
+    expect('cp $src/$file $dest/$file').toMatchTree(`
+      FunctionCall
+        Identifier cp
+        PositionalArg
+          Word
+            Interpolation
+              Identifier src
+            WordFragment /
+            Interpolation
+              Identifier file
+        PositionalArg
+          Word
+            Interpolation
+              Identifier dest
+            WordFragment /
+            Interpolation
+              Identifier file
+    `)
+  })
+})
diff --git a/src/parser/tokenizer.ts b/src/parser/tokenizer.ts
index 09ebe55..572afd5 100644
--- a/src/parser/tokenizer.ts
+++ b/src/parser/tokenizer.ts
@@ -1,5 +1,5 @@
 import { ExternalTokenizer, InputStream, Stack } from '@lezer/lr'
-import { Identifier, Word } from './shrimp.terms'
+import { Identifier, Word, WordFragment } from './shrimp.terms'
 
 // The only chars that can't be words are whitespace, apostrophes, closing parens, and EOF.
 
@@ -16,6 +16,15 @@ export const tokenizer = new ExternalTokenizer((input: InputStream, stack: Stack
 
     if (!isWordChar(ch)) break
 
+    // Stop at $ if it's followed by an identifier start or (
+    // This allows word interpolation like path/$file or result-$(expr)
+    if (ch === 36 /* $ */) {
+      const nextCh = getFullCodePoint(input, pos + 1)
+      if (isLowercaseLetter(nextCh) || isEmoji(nextCh) || nextCh === 40 /* ( */) {
+        break
+      }
+    }
+
     // Certain characters might end a word or identifier if they are followed by whitespace.
     // This allows things like `a = hello; 2` or `if x: y` to parse correctly.
 
@@ -34,7 +43,7 @@
   }
 
   input.advance(pos)
-  input.acceptToken(isValidIdentifier ? Identifier : Word)
+  input.acceptToken(isValidIdentifier ? Identifier : WordFragment)
 })
 
 const isWhiteSpace = (ch: number): boolean => {
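
The tokenizer change above is the heart of word interpolation: a word token now ends early at a `$` that begins an interpolation, so the grammar's `Interpolation` rule can take over mid-word. Below is a minimal standalone sketch of that boundary rule, using a hypothetical `splitWord` helper over a plain string; the real tokenizer works on Lezer's `InputStream` and uses `getFullCodePoint`/`isEmoji` to handle astral characters like `😎`.

```typescript
// Sketch only: mirrors the `$`-boundary check from the tokenizer diff above.
const isLowerAscii = (ch: string) => ch >= 'a' && ch <= 'z'

// Returns the pieces a word splits into, ending each piece where the
// tokenizer would stop: at a `$` followed by an identifier start or `(`.
function splitWord(word: string): string[] {
  const fragments: string[] = []
  let start = 0
  for (let i = 0; i < word.length; i++) {
    if (word[i] !== '$') continue
    const next = word[i + 1] ?? ''
    if (isLowerAscii(next) || next === '(') {
      if (i > start) fragments.push(word.slice(start, i)) // WordFragment
      fragments.push('$') // in the real parser, Interpolation takes over here
      start = i + 1
    }
  }
  if (start < word.length) fragments.push(word.slice(start))
  return fragments
}

console.log(splitWord('path/$file')) // ['path/', '$', 'file']
console.log(splitWord('price$10'))   // ['price$10'] ($ before a digit stays in the word)
```

Note how `price$10` stays a single fragment, matching the "dollar not followed by identifier stays in word" test: the split only happens when the character after `$` could actually start an interpolation.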
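
On the compiler side, `processEscapeSeq` (renamed from `processEscapeSequence`) turns a two-character escape such as `\n` into its runtime value before it is pushed. The diff only shows the function's guard clause; the following is a plausible sketch of the rest of the body, inferred from the grammar's `EscapeSeq` rule (`"\\" ("$" | "n" | "t" | "r" | "\\" | "'")`) rather than taken from the actual source.

```typescript
// Assumption: the real body in src/compiler/compiler.ts is not shown in the
// diff; this mapping is inferred from the grammar's EscapeSeq alternatives.
function processEscapeSeq(escapeSeq: string): string {
  // escapeSeq includes the backslash, e.g., "\\n", "\\$", "\\\\"
  if (escapeSeq.length !== 2) return escapeSeq
  switch (escapeSeq[1]) {
    case 'n': return '\n'
    case 't': return '\t'
    case 'r': return '\r'
    // "$", "'", and "\\" escape to the literal character itself
    default: return escapeSeq[1]
  }
}
```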