diff --git a/src/parser/shrimp.grammar b/src/parser/shrimp.grammar index 5bf3af2..e126797 100644 --- a/src/parser/shrimp.grammar +++ b/src/parser/shrimp.grammar @@ -39,7 +39,7 @@ } -@external tokens tokenizer from "./tokenizer" { Identifier, WordFragment } +@external tokens tokenizer from "./tokenizer" { Identifier, Word } @precedence { pipe @left, @@ -188,14 +188,6 @@ EscapeSeq { "\\" ("$" | "n" | "t" | "r" | "\\" | "'") } - -Word { wordContent+ } - -wordContent { - WordFragment | Interpolation | EscapeSeq -} - - // We need expressionWithoutIdentifier to avoid conflicts in consumeToTerminator. // Without this, when parsing "my-var" at statement level, the parser can't decide: // - ambiguousFunctionCall → FunctionCallOrIdentifier → Identifier diff --git a/src/parser/shrimp.terms.ts b/src/parser/shrimp.terms.ts index 539b754..9990b95 100644 --- a/src/parser/shrimp.terms.ts +++ b/src/parser/shrimp.terms.ts @@ -1,7 +1,7 @@ // This file was generated by lezer-generator. You probably shouldn't edit it. export const Identifier = 1, - WordFragment = 2, + Word = 2, Program = 3, PipeExpr = 4, FunctionCall = 5, @@ -10,22 +10,21 @@ export const FunctionCallOrIdentifier = 8, BinOp = 9, ConditionalOp = 14, - Word = 23, - Interpolation = 24, - EscapeSeq = 25, - String = 26, - StringFragment = 27, - Number = 28, - Boolean = 29, - FunctionDef = 30, - Params = 32, - colon = 33, - end = 34, - Underscore = 35, - NamedArg = 36, - NamedArgPrefix = 37, - IfExpr = 39, - ThenBlock = 42, - ElsifExpr = 43, - ElseExpr = 45, - Assign = 47 + String = 23, + StringFragment = 24, + Interpolation = 25, + EscapeSeq = 26, + Number = 27, + Boolean = 28, + FunctionDef = 29, + Params = 31, + colon = 32, + end = 33, + Underscore = 34, + NamedArg = 35, + NamedArgPrefix = 36, + IfExpr = 38, + ThenBlock = 41, + ElsifExpr = 42, + ElseExpr = 44, + Assign = 46 diff --git a/src/parser/shrimp.ts b/src/parser/shrimp.ts index 427658d..18ecbf2 100644 --- a/src/parser/shrimp.ts +++ b/src/parser/shrimp.ts @@ -4,20 +4,20 @@ import {tokenizer} from "./tokenizer" import {highlighting} from "./highlight" export const parser = LRParser.deserialize({ version: 14, - states: ".vQVQaOOO#OQbO'#CdO#`QPO'#CeO#nQPO'#DjO$nQaO'#CcO$uQaO'#CtO$}QSO'#CuOOQ`'#Dq'#DqOOQ`'#D`'#D`O%cQaO'#CsO&sOWO'#CvOOQ`'#Dn'#DnO'RQPO'#DmO'jQaO'#DyOOQ`'#Cz'#CzOOQO'#Dk'#DkO'rQPO'#DjO(QQaO'#D}OOQO'#DT'#DTOOQO'#Dj'#DjO(XQPO'#DiOOQ`'#Di'#DiOOQ`'#D^'#D^QVQaOOOOQ`'#Cs'#CsOOQ`'#Dm'#DmOOQ`'#Cb'#CbO(aQaO'#DQOOQ`'#Dl'#DlOOQ`'#D_'#D_O(nQbO,58{O)_QaO,59wO(QQaO,59PO(QQaO,59PO)lQbO'#CdO*wQPO'#CeO+XQPO,58}O+jQPO,58}O+eQPO,58}O,eQPO,58}OOQ`,59`,59`OOQ`,59a,59aOOQ`-E7^-E7^OOOO'#Dx'#DxOOOO'#Da'#DaO,mOWO,59bOOQ`,59b,59bOOQ`'#Db'#DbO,{QaO'#C|O-TQPO,5:eO-YQaO'#DdO-_QPO,58zO-pQPO,5:iO-wQPO,5:iOOQ`,5:T,5:TOOQ`-E7[-E7[OOQ`,59l,59lOOQ`-E7]-E7]OOQO1G/c1G/cOOQO1G.k1G.kO-|QPO1G.kO(QQaO,59UO(QQaO,59UOOQ`1G.i1G.iOOOO-E7_-E7_OOQ`1G.|1G.|OOQ`-E7`-E7`O.hQaO1G0PO.xQbO'#CdOOQO,5:O,5:OOOQO-E7b-E7bO/iQaO1G0TOOQO1G.p1G.pO/yQPO1G.pO0TQPO7+%kO0YQaO7+%lOOQO'#DV'#DVOOQO7+%o7+%oO0jQaO7+%pOOQ`<rAN>rO(QQaO'#DXOOQO'#De'#DeO1}QPOAN>vO2YQPO'#DZOOQOAN>vAN>vO2_QPOAN>vO2dQPO,59sO2kQPO,59sOOQO-E7c-E7cOOQOG24bG24bO2pQPOG24bO2uQPO,59uO2zQPO1G/_OOQOLD)|LD)|O0YQaO1G/aO0jQaO7+$yOOQO7+${7+${OOQO<oAN>oO%pQaO'#DWOOQO'#Dc'#DcO0ZQPOAN>sO0fQPO'#DYOOQOAN>sAN>sO0kQPOAN>sO0pQPO,59rO0wQPO,59rOOQO-E7a-E7aOOQOG24_G24_O0|QPOG24_O1RQPO,59tO1WQPO1G/^OOQOLD)yLD)yO.fQaO1G/`O.vQaO7+$xOOQO7+$z7+$zOOQO<S[kWOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#g2i#g#h>x#h#o2i#o;'S$_;'S;=`$v<%lO$_Z>}[kWOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#X2i#X#Y?s#Y#o2i#o;'S$_;'S;=`$v<%lO$_Z?zYmRkWOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#o2i#o;'S$_;'S;=`$v<%lO$_Z@qYoRkWOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#o2i#o;'S$_;'S;=`$v<%lO$_ZAf[kWOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#Y2i#Y#ZB[#Z#o2i#o;'S$_;'S;=`$v<%lO$_ZBcYxPkWOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#o2i#o;'S$_;'S;=`$v<%lO$_^CYY!hSkWOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#o2i#o;'S$_;'S;=`$v<%lO$_ZC}[kWOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#f2i#f#gDs#g#o2i#o;'S$_;'S;=`$v<%lO$_ZDzYfRkWOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#o2i#o;'S$_;'S;=`$v<%lO$_^EqY!jSkWOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#o2i#o;'S$_;'S;=`$v<%lO$__Fh[!iSkWOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#f2i#f#gG^#g#o2i#o;'S$_;'S;=`$v<%lO$_ZGc[kWOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#i2i#i#j>x#j#o2i#o;'S$_;'S;=`$v<%lO$_ZH`UvRkWOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_~HwO!s~", + repeatNodeCount: 7, + tokenData: "Hw~R!SOX$_XY$|YZ%gZp$_pq$|qr&Qrt$_tu'Yuw$_wx'_xy'dyz'}z{(h{|)R|}$_}!O)l!O!P$_!P!Q,b!Q![*]![!],{!]!^%g!^!_-f!_!`.p!`!a/Z!a#O$_#O#P0e#P#R$_#R#S0j#S#T$_#T#U1T#U#X2i#X#Y5O#Y#ZS[hSOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#g2i#g#h>x#h#o2i#o;'S$_;'S;=`$v<%lO$_V>}[hSOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#X2i#X#Y?s#Y#o2i#o;'S$_;'S;=`$v<%lO$_V?zYlRhSOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#o2i#o;'S$_;'S;=`$v<%lO$_V@qYnRhSOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#o2i#o;'S$_;'S;=`$v<%lO$_VAf[hSOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#Y2i#Y#ZB[#Z#o2i#o;'S$_;'S;=`$v<%lO$_VBcYwPhSOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#o2i#o;'S$_;'S;=`$v<%lO$_^CYY!gWhSOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#o2i#o;'S$_;'S;=`$v<%lO$_VC}[hSOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#f2i#f#gDs#g#o2i#o;'S$_;'S;=`$v<%lO$_VDzYfRhSOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#o2i#o;'S$_;'S;=`$v<%lO$_^EqY!iWhSOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#o2i#o;'S$_;'S;=`$v<%lO$__Fh[!hWhSOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#f2i#f#gG^#g#o2i#o;'S$_;'S;=`$v<%lO$_VGc[hSOt$_uw$_x!_$_!_!`2O!`#O$_#P#T$_#T#i2i#i#j>x#j#o2i#o;'S$_;'S;=`$v<%lO$_VH`UuRhSOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_~HwO!p~", tokenizers: [0, 1, 2, 3, tokenizer], topRules: {"Program":[0,3]}, - tokenPrec: 829 + tokenPrec: 749 }) diff --git a/src/parser/tests/basics.test.ts b/src/parser/tests/basics.test.ts index cdbba7c..42b8db1 100644 --- a/src/parser/tests/basics.test.ts +++ b/src/parser/tests/basics.test.ts @@ -268,3 +268,29 @@ describe('Assign', () => { end end`) }) }) + +describe('Word escapes', () => { + test('parses escaped spaces in words', () => { + expect('echo my\\ file').toMatchTree(` + FunctionCall + Identifier echo + PositionalArg + Word my\\ file`) + }) + + test('parses multiple escaped spaces', () => { + expect('cat file\\ with\\ spaces.txt').toMatchTree(` + FunctionCall + Identifier cat + PositionalArg + Word file\\ with\\ spaces.txt`) + }) + + test('parses escaped backslash', () => { + expect('echo path\\\\file').toMatchTree(` + FunctionCall + Identifier echo + PositionalArg + Word path\\\\file`) + }) +}) diff --git a/src/parser/tests/word-interpolation.test.ts b/src/parser/tests/word-interpolation.test.ts deleted file mode 100644 index ca98b9d..0000000 --- a/src/parser/tests/word-interpolation.test.ts +++ /dev/null @@ -1,195 +0,0 @@ -import { describe, expect, test } from 'bun:test' -import '../shrimp.grammar' // Importing this so changes cause it to retest! - -describe('word interpolation', () => { - test.only('word with variable interpolation', () => { - expect('path/$file').toMatchTree(` - Word - WordFragment path/ - Interpolation - Identifier file - `) - }) - - test('word with expression interpolation', () => { - expect('prefix-$(123)').toMatchTree(` - Word - WordFragment prefix- - Interpolation - leftParen - Number 123 - rightParen - `) - }) - - test('multiple interpolations in word', () => { - expect('$user/$file').toMatchTree(` - Word - Interpolation - Identifier user - WordFragment / - Interpolation - Identifier file - `) - }) - - test('dollar not followed by identifier stays in word', () => { - expect('price$10').toMatchTree(` - Word - WordFragment price$10 - `) - }) - - test('escaped dollar in word', () => { - expect('price\\$10').toMatchTree(` - Word - WordFragment price - EscapeSeq - WordFragment 10 - `) - }) - - test('interpolation at start of word', () => { - expect('$HOME/documents').toMatchTree(` - Word - Interpolation - Identifier HOME - WordFragment /documents - `) - }) - - test('interpolation at end of word', () => { - expect('./path/$filename').toMatchTree(` - Word - WordFragment ./path/ - Interpolation - Identifier filename - `) - }) - - test('complex expression interpolation', () => { - expect('output-$(add 1 2).txt').toMatchTree(` - Word - WordFragment output- - Interpolation - leftParen - FunctionCall - Identifier add - PositionalArg - Number 1 - PositionalArg - Number 2 - rightParen - WordFragment .txt - `) - }) - - test('emoji in interpolated identifier', () => { - expect('hello/$😎file').toMatchTree(` - Word - WordFragment hello/ - Interpolation - Identifier 😎file - `) - }) - - test('escaped space in word', () => { - expect('my\\ file.txt').toMatchTree(` - Word - WordFragment my - EscapeSeq - WordFragment file.txt - `) - }) - - test('multiple escapes and interpolations', () => { - expect('pre\\$fix-$var-\\$end').toMatchTree(` - Word - WordFragment pre - EscapeSeq - WordFragment fix- - Interpolation - Identifier var - WordFragment - - EscapeSeq - WordFragment end - `) - }) - - test('plain word without interpolation still works', () => { - expect('./file.txt').toMatchTree(` - Word - WordFragment ./file.txt - `) - }) - - test('word with URL-like content', () => { - expect('https://example.com/$path').toMatchTree(` - Word - WordFragment https://example.com/ - Interpolation - Identifier path - `) - }) - - test('nested expression in interpolation', () => { - expect('file-$(multiply (add 1 2) 3).txt').toMatchTree(` - Word - WordFragment file- - Interpolation - leftParen - FunctionCall - Identifier multiply - PositionalArg - ParenExpr - leftParen - FunctionCall - Identifier add - PositionalArg - Number 1 - PositionalArg - Number 2 - rightParen - PositionalArg - Number 3 - rightParen - WordFragment .txt - `) - }) -}) - -describe('word interpolation in function calls', () => { - test('function call with interpolated word argument', () => { - expect('cat /home/$user/file.txt').toMatchTree(` - FunctionCall - Identifier cat - PositionalArg - Word - WordFragment /home/ - Interpolation - Identifier user - WordFragment /file.txt - `) - }) - - test('multiple interpolated word arguments', () => { - expect('cp $src/$file $dest/$file').toMatchTree(` - FunctionCall - Identifier cp - PositionalArg - Word - Interpolation - Identifier src - WordFragment / - Interpolation - Identifier file - PositionalArg - Word - Interpolation - Identifier dest - WordFragment / - Interpolation - Identifier file - `) - }) -}) diff --git a/src/parser/tokenizer.ts b/src/parser/tokenizer.ts index 572afd5..b9f2060 100644 --- a/src/parser/tokenizer.ts +++ b/src/parser/tokenizer.ts @@ -1,5 +1,5 @@ import { ExternalTokenizer, InputStream, Stack } from '@lezer/lr' -import { Identifier, Word, WordFragment } from './shrimp.terms' +import { Identifier, Word } from './shrimp.terms' // The only chars that can't be words are whitespace, apostrophes, closing parens, and EOF. @@ -16,18 +16,19 @@ export const tokenizer = new ExternalTokenizer((input: InputStream, stack: Stack if (!isWordChar(ch)) break - // Stop at $ if it's followed by identifier start or ( - // This allows word interpolation like path/$file or result-$(expr) - if (ch === 36 /* $ */) { - const nextCh = getFullCodePoint(input, pos + 1) - if (isLowercaseLetter(nextCh) || isEmoji(nextCh) || nextCh === 40 /* ( */) { - break + // Handle backslash escapes: consume backslash + next char + if (ch === 92 /* \ */) { + isValidIdentifier = false + pos += getCharSize(ch) // skip backslash + const nextCh = getFullCodePoint(input, pos) + if (nextCh !== -1) { // if not EOF + pos += getCharSize(nextCh) // skip escaped char } + continue } // Certain characters might end a word or identifier if they are followed by whitespace. // This allows things like `a = hello; 2` of if `x: y` to parse correctly. - // to work as expected. if (canBeWord && (ch === 59 /* ; */ || ch === 58) /* : */) { const nextCh = getFullCodePoint(input, pos + 1) if (!isWordChar(nextCh)) break @@ -43,7 +44,7 @@ export const tokenizer = new ExternalTokenizer((input: InputStream, stack: Stack } input.advance(pos) - input.acceptToken(isValidIdentifier ? Identifier : WordFragment) + input.acceptToken(isValidIdentifier ? Identifier : Word) }) const isWhiteSpace = (ch: number): boolean => {