From 8a290903647761550fb8f80998e4129da39ea526 Mon Sep 17 00:00:00 2001 From: Corey Johnson Date: Fri, 17 Oct 2025 10:40:28 -0700 Subject: [PATCH] fix(parser): make DotGet whitespace-sensitive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add IdentifierBeforeDot token emitted when identifier immediately precedes '.' - Move DotGet into @skip {} block using IdentifierBeforeDot - Prevents 'basename . prop' from parsing as DotGet - Allows 'basename.prop' to work as expected when identifier is in scope - Fixes test: 'a word can be contained in parens' 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- src/parser/shrimp.grammar | 18 +++--- src/parser/shrimp.terms.ts | 60 ++++++++++--------- src/parser/shrimp.ts | 20 +++---- src/parser/tests/basics.test.ts | 37 ++++++++++++ src/parser/tests/dot-get.test.ts | 28 ++++++--- src/parser/tokenizer.ts | 100 ++++++++++++++++++------------- 6 files changed, 166 insertions(+), 97 deletions(-) diff --git a/src/parser/shrimp.grammar b/src/parser/shrimp.grammar index 95509d8..08122f4 100644 --- a/src/parser/shrimp.grammar +++ b/src/parser/shrimp.grammar @@ -23,7 +23,7 @@ Underscore { "_" } Null { "null" } Regex { "//" (![/\\\n[] | "\\" ![\n] | "[" (![\n\\\]] | "\\" ![\n])* "]")+ ("//" $[gimsuy]*)? 
} // Stolen from the lezer JavaScript grammar - "fn" [@name=keyword] + Fn[@name=keyword] { "fn" } "if" [@name=keyword] "elsif" [@name=keyword] "else" [@name=keyword] @@ -43,7 +43,7 @@ } -@external tokens tokenizer from "./tokenizer" { Identifier, Word } +@external tokens tokenizer from "./tokenizer" { Identifier, Word, IdentifierBeforeDot } @precedence { pipe @left, @@ -108,11 +108,11 @@ FunctionDef { } singleLineFunctionDef { - "fn" Params colon consumeToTerminator end + Fn Params colon consumeToTerminator end } multilineFunctionDef { - "fn" Params colon newlineOrSemicolon block end + Fn Params colon newlineOrSemicolon block end } IfExpr { @@ -158,10 +158,6 @@ Assign { Identifier "=" consumeToTerminator } -DotGet { - Identifier "." Identifier -} - BinOp { (expression | BinOp) !multiplicative "*" (expression | BinOp) | (expression | BinOp) !multiplicative "/" (expression | BinOp) | @@ -178,8 +174,12 @@ expression { } @skip {} { + DotGet { + IdentifierBeforeDot "." Identifier + } + String { "'" stringContent* "'" } - + } stringContent { diff --git a/src/parser/shrimp.terms.ts b/src/parser/shrimp.terms.ts index b7aeb71..80a01ed 100644 --- a/src/parser/shrimp.terms.ts +++ b/src/parser/shrimp.terms.ts @@ -2,32 +2,34 @@ export const Identifier = 1, Word = 2, - Program = 3, - PipeExpr = 4, - FunctionCall = 5, - PositionalArg = 6, - ParenExpr = 7, - FunctionCallOrIdentifier = 8, - BinOp = 9, - ConditionalOp = 14, - String = 23, - StringFragment = 24, - Interpolation = 25, - EscapeSeq = 26, - Number = 27, - Boolean = 28, - Regex = 29, - Null = 30, - DotGet = 31, - FunctionDef = 32, - Params = 34, - colon = 35, - end = 36, - Underscore = 37, - NamedArg = 38, - NamedArgPrefix = 39, - IfExpr = 41, - ThenBlock = 44, - ElsifExpr = 45, - ElseExpr = 47, - Assign = 49 + IdentifierBeforeDot = 3, + Program = 4, + PipeExpr = 5, + FunctionCall = 6, + PositionalArg = 7, + ParenExpr = 8, + FunctionCallOrIdentifier = 9, + BinOp = 10, + ConditionalOp = 15, + String = 24, + StringFragment 
= 25, + Interpolation = 26, + EscapeSeq = 27, + Number = 28, + Boolean = 29, + Regex = 30, + Null = 31, + DotGet = 32, + FunctionDef = 33, + Fn = 34, + Params = 35, + colon = 36, + end = 37, + Underscore = 38, + NamedArg = 39, + NamedArgPrefix = 40, + IfExpr = 42, + ThenBlock = 45, + ElsifExpr = 46, + ElseExpr = 48, + Assign = 50 diff --git a/src/parser/shrimp.ts b/src/parser/shrimp.ts index c4d2886..16de5fa 100644 --- a/src/parser/shrimp.ts +++ b/src/parser/shrimp.ts @@ -5,21 +5,21 @@ import {trackScope} from "./scopeTracker" import {highlighting} from "./highlight" export const parser = LRParser.deserialize({ version: 14, - states: ".pQVQaOOO#RQbO'#CdO#cQPO'#CeO#qQPO'#DkO$qQaO'#CcO$xOSO'#CsOOQ`'#Do'#DoO%WQPO'#DnO%oQaO'#DzOOQ`'#C|'#C|OOQO'#Dl'#DlO%wQPO'#DkO&VQaO'#EOOOQO'#DV'#DVOOQO'#Dk'#DkO&^QPO'#DjOOQ`'#Dj'#DjOOQ`'#D`'#D`QVQaOOO&wQbO'#DnO'qQaO,59gOOQ`'#Dn'#DnOOQ`'#Cb'#CbO'vQaO'#DSOOQ`'#Dm'#DmOOQ`'#Da'#DaO(TQbO,58{O(tQaO,59yO&VQaO,59PO&VQaO,59PO)RQbO'#CdO*^QPO'#CeO*nQPO,58}O+wQPO,58}O*zQPO,58}O,OQPO,58}O,WQaO'#CuO,`QWO'#CvOOOO'#Ds'#DsOOOO'#Db'#DbO,tOSO,59_OOQ`,59_,59_OOQ`'#Dc'#DcO-SQaO'#DOO-[QPO,5:fO-aQaO'#DeO-fQPO,58zO-wQPO,5:jO.OQPO'#DnO.fQPO,5:jOOQ`,5:U,5:UOOQ`-E7^-E7^OOQ`1G/R1G/ROOQ`,59n,59nOOQ`-E7_-E7_OOQO1G/e1G/eOOQO1G.k1G.kO.kQPO1G.kO&VQaO,59UO&VQaO,59UOOQ`1G.i1G.iOOOO,59a,59aOOOO,59b,59bOOOO-E7`-E7`OOQ`1G.y1G.yOOQ`-E7a-E7aO/VQaO1G0QO/gQbO'#CdOOQO,5:P,5:POOQO-E7c-E7cO0WQaO1G0UOOQO1G.p1G.pO0hQPO1G.pO0rQPO7+%lO0wQaO7+%mOOQO'#DX'#DXOOQO7+%p7+%pO1XQaO7+%qOOQ`<sAN>sO&VQaO'#DZOOQO'#Df'#DfO2lQPOAN>wO2wQPO'#D]OOQOAN>wAN>wO2|QPOAN>wO3RQPO,59uO3YQPO,59uOOQO-E7d-E7dOOQOG24cG24cO3_QPOG24cO3dQPO,59wO3iQPO1G/aOOQOLD)}LD)}O0wQaO1G/cO1XQaO7+${OOQO7+$}7+$}OOQO<tAN>tO&bQaO'#D[OOQO'#Dg'#DgO1QQPOAN>xO1]QPO'#D^OOQOAN>xAN>xO1bQPOAN>xO1gQPO,59vO1nQPO,59vOOQO-E7e-E7eOOQOG24dG24dO1sQPOG24dO1xQPO,59xO1}QPO1G/bOOQOLD*OLD*OO/]QaO1G/dO/mQaO7+$|OOQO7+%O7+%OOOQO<n#a#b;W#b#cCR#c#o;W#o;'S$_;'S;=`$v<%lO$_V>s[hSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#g;W#g#h?i#h#o;W#o;'S$_;'S;=`$v<%lO$_V?n^hSOt$_uw
$_x!_$_!_!`:m!`#O$_#P#T$_#T#X;W#X#Y@j#Y#];W#]#^Aa#^#o;W#o;'S$_;'S;=`$v<%lO$_V@qY!QPhSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$_VAf[hSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#Y;W#Y#ZB[#Z#o;W#o;'S$_;'S;=`$v<%lO$_VBcY!OPhSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$_VCW[hSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#W;W#W#XC|#X#o;W#o;'S$_;'S;=`$v<%lO$_VDTYhStROt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$_VDx]hSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#UEq#U#b;W#b#cIX#c#o;W#o;'S$_;'S;=`$v<%lO$_VEv[hSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#`;W#`#aFl#a#o;W#o;'S$_;'S;=`$v<%lO$_VFq[hSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#g;W#g#hGg#h#o;W#o;'S$_;'S;=`$v<%lO$_VGl[hSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#X;W#X#YHb#Y#o;W#o;'S$_;'S;=`$v<%lO$_VHiYlRhSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$_VI`YqRhSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$_VJT[hSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#Y;W#Y#ZJy#Z#o;W#o;'S$_;'S;=`$v<%lO$_VKQYzPhSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$__Kw[!jWhSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#i;W#i#jLm#j#o;W#o;'S$_;'S;=`$v<%lO$_VLr[hSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#`;W#`#aMh#a#o;W#o;'S$_;'S;=`$v<%lO$_VMm[hSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#`;W#`#aNc#a#o;W#o;'S$_;'S;=`$v<%lO$_VNjYnRhSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$_V! _[hSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#f;W#f#g!!T#g#o;W#o;'S$_;'S;=`$v<%lO$_V!![YfRhSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$_^!#RY!lWhSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$__!#x[!kWhSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#f;W#f#g!$n#g#o;W#o;'S$_;'S;=`$v<%lO$_V!$s[hSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#i;W#i#jGg#j#o;W#o;'S$_;'S;=`$v<%lO$_V!%pUxRhSOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_~!&XO!t~", + tokenData: "!&X~R!SOX$_XY$|YZ%gZp$_pq$|qr&Qrt$_tu'Yuw$_wx'_xy'dyz'}z{(h{|)R|}$_}!O)l!O!P,b!P!Q,{!Q![*]![!]5j!]!^%g!^!_6T!_!`7_!`!a7x!a#O$_#O#P9S#P#R$_#R#S9X#S#T$_#T#U9r#U#X;W#X#Y=m#Y#ZDs#Z#];W#]#^JO#^#b;W#b#cKp#c#d! 
Y#d#f;W#f#g!!z#g#h;W#h#i!#q#i#o;W#o#p$_#p#q!%i#q;'S$_;'S;=`$v<%l~$_~O$_~~!&SS$dUiSOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_S$yP;=`<%l$__%TUiS!^ZOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_V%nUiS!qROt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_V&VWiSOt$_uw$_x!_$_!_!`&o!`#O$_#P;'S$_;'S;=`$v<%lO$_V&vUaRiSOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_~'_O!i~~'dO!g~V'kUiS!eROt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_V(UUiS!fROt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_V(oUZRiSOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_V)YU]RiSOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_V)sWiS^ROt$_uw$_x!Q$_!Q![*]![#O$_#P;'S$_;'S;=`$v<%lO$_V*dYiSlROt$_uw$_x!O$_!O!P+S!P!Q$_!Q![*]![#O$_#P;'S$_;'S;=`$v<%lO$_V+XWiSOt$_uw$_x!Q$_!Q![+q![#O$_#P;'S$_;'S;=`$v<%lO$_V+xWiSlROt$_uw$_x!Q$_!Q![+q![#O$_#P;'S$_;'S;=`$v<%lO$_T,iU!nPiSOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_V-SWiS[ROt$_uw$_x!P$_!P!Q-l!Q#O$_#P;'S$_;'S;=`$v<%lO$_V-q^iSOY.mYZ$_Zt.mtu/puw.mwx/px!P.m!P!Q$_!Q!}.m!}#O4c#O#P2O#P;'S.m;'S;=`5d<%lO.mV.t^iSnROY.mYZ$_Zt.mtu/puw.mwx/px!P.m!P!Q2e!Q!}.m!}#O4c#O#P2O#P;'S.m;'S;=`5d<%lO.mR/uXnROY/pZ!P/p!P!Q0b!Q!}/p!}#O1P#O#P2O#P;'S/p;'S;=`2_<%lO/pR0eP!P!Q0hR0mUnR#Z#[0h#]#^0h#a#b0h#g#h0h#i#j0h#m#n0hR1SVOY1PZ#O1P#O#P1i#P#Q/p#Q;'S1P;'S;=`1x<%lO1PR1lSOY1PZ;'S1P;'S;=`1x<%lO1PR1{P;=`<%l1PR2RSOY/pZ;'S/p;'S;=`2_<%lO/pR2bP;=`<%l/pV2jWiSOt$_uw$_x!P$_!P!Q3S!Q#O$_#P;'S$_;'S;=`$v<%lO$_V3ZbiSnROt$_uw$_x#O$_#P#Z$_#Z#[3S#[#]$_#]#^3S#^#a$_#a#b3S#b#g$_#g#h3S#h#i$_#i#j3S#j#m$_#m#n3S#n;'S$_;'S;=`$v<%lO$_V4h[iSOY4cYZ$_Zt4ctu1Puw4cwx1Px#O4c#O#P1i#P#Q.m#Q;'S4c;'S;=`5^<%lO4cV5aP;=`<%l4cV5gP;=`<%l.mT5qUiStPOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_V6[WbRiSOt$_uw$_x!_$_!_!`6t!`#O$_#P;'S$_;'S;=`$v<%lO$_V6{UcRiSOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_V7fU`RiSOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_V8PWdRiSOt$_uw$_x!_$_!_!`8i!`#O$_#P;'S$_;'S;=`$v<%lO$_V8pUeRiSOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_~9XO!j~V9`UiSvROt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_V9w[iSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#b;W#b#c;{#c#o;W#o;'S$_;'S;=`$v<%lO$_U:tUxQiSOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_U;]YiSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$_Vn#a#b;W#b#cCR#c#o;W
#o;'S$_;'S;=`$v<%lO$_V>s[iSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#g;W#g#h?i#h#o;W#o;'S$_;'S;=`$v<%lO$_V?n^iSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#X;W#X#Y@j#Y#];W#]#^Aa#^#o;W#o;'S$_;'S;=`$v<%lO$_V@qY!RPiSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$_VAf[iSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#Y;W#Y#ZB[#Z#o;W#o;'S$_;'S;=`$v<%lO$_VBcY!PPiSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$_VCW[iSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#W;W#W#XC|#X#o;W#o;'S$_;'S;=`$v<%lO$_VDTYiSuROt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$_VDx]iSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#UEq#U#b;W#b#cIX#c#o;W#o;'S$_;'S;=`$v<%lO$_VEv[iSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#`;W#`#aFl#a#o;W#o;'S$_;'S;=`$v<%lO$_VFq[iSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#g;W#g#hGg#h#o;W#o;'S$_;'S;=`$v<%lO$_VGl[iSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#X;W#X#YHb#Y#o;W#o;'S$_;'S;=`$v<%lO$_VHiYmRiSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$_VI`YrRiSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$_VJT[iSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#Y;W#Y#ZJy#Z#o;W#o;'S$_;'S;=`$v<%lO$_VKQY{PiSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$__Kw[!kWiSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#i;W#i#jLm#j#o;W#o;'S$_;'S;=`$v<%lO$_VLr[iSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#`;W#`#aMh#a#o;W#o;'S$_;'S;=`$v<%lO$_VMm[iSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#`;W#`#aNc#a#o;W#o;'S$_;'S;=`$v<%lO$_VNjYoRiSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$_V! 
_[iSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#f;W#f#g!!T#g#o;W#o;'S$_;'S;=`$v<%lO$_V!![YgRiSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$_^!#RY!mWiSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#o;W#o;'S$_;'S;=`$v<%lO$__!#x[!lWiSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#f;W#f#g!$n#g#o;W#o;'S$_;'S;=`$v<%lO$_V!$s[iSOt$_uw$_x!_$_!_!`:m!`#O$_#P#T$_#T#i;W#i#jGg#j#o;W#o;'S$_;'S;=`$v<%lO$_V!%pUyRiSOt$_uw$_x#O$_#P;'S$_;'S;=`$v<%lO$_~!&XO!u~", tokenizers: [0, 1, 2, 3, tokenizer], - topRules: {"Program":[0,3]}, - tokenPrec: 858 + topRules: {"Program":[0,4]}, + tokenPrec: 786 }) diff --git a/src/parser/tests/basics.test.ts b/src/parser/tests/basics.test.ts index 94f84db..fe82c7a 100644 --- a/src/parser/tests/basics.test.ts +++ b/src/parser/tests/basics.test.ts @@ -282,3 +282,40 @@ describe('Assign', () => { end end`) }) }) + +describe('DotGet whitespace sensitivity', () => { + test('no whitespace - DotGet works when identifier in scope', () => { + expect('basename = 5; basename.prop').toMatchTree(` + Assign + Identifier basename + operator = + Number 5 + DotGet + IdentifierBeforeDot basename + Identifier prop`) + }) + + test('space before dot - NOT DotGet, parses as division', () => { + expect('basename = 5; basename / prop').toMatchTree(` + Assign + Identifier basename + operator = + Number 5 + BinOp + Identifier basename + operator / + Identifier prop`) + }) + + test('dot followed by slash is Word, not DotGet', () => { + expect('basename ./cool').toMatchTree(` + FunctionCall + Identifier basename + PositionalArg + Word ./cool`) + }) + + test('identifier not in scope with dot becomes Word', () => { + expect('readme.txt').toMatchTree(`Word readme.txt`) + }) +}) diff --git a/src/parser/tests/dot-get.test.ts b/src/parser/tests/dot-get.test.ts index 18e774f..3442186 100644 --- a/src/parser/tests/dot-get.test.ts +++ b/src/parser/tests/dot-get.test.ts @@ -13,7 +13,7 @@ describe('DotGet', () => { operator = Number 5 DotGet - Identifier obj + IdentifierBeforeDot obj Identifier prop `) }) @@ -26,7 
+26,7 @@ describe('DotGet', () => { Identifier config colon : DotGet - Identifier config + IdentifierBeforeDot config Identifier path end end `) @@ -40,7 +40,7 @@ describe('DotGet', () => { Identifier x colon : DotGet - Identifier x + IdentifierBeforeDot x Identifier prop end end Word x.prop @@ -59,10 +59,10 @@ end`).toMatchTree(` Identifier y colon : DotGet - Identifier x + IdentifierBeforeDot x Identifier foo DotGet - Identifier y + IdentifierBeforeDot y Identifier bar end end `) @@ -79,7 +79,7 @@ end`).toMatchTree(` Identifier x colon : DotGet - Identifier x + IdentifierBeforeDot x Identifier outer FunctionDef keyword fn @@ -87,7 +87,7 @@ end`).toMatchTree(` Identifier y colon : DotGet - Identifier y + IdentifierBeforeDot y Identifier inner end end end end @@ -104,7 +104,7 @@ end`).toMatchTree(` Identifier echo PositionalArg DotGet - Identifier config + IdentifierBeforeDot config Identifier path `) }) @@ -123,8 +123,18 @@ end`).toMatchTree(` Identifier echo PositionalArg DotGet - Identifier config + IdentifierBeforeDot config Identifier path `) }) + + test("dot get doesn't work with spaces", () => { + expect('obj . prop').toMatchTree(` + FunctionCall + Identifier obj + PositionalArg + Word . + PositionalArg + Identifier prop`) + }) }) diff --git a/src/parser/tokenizer.ts b/src/parser/tokenizer.ts index 03d874d..1d3c708 100644 --- a/src/parser/tokenizer.ts +++ b/src/parser/tokenizer.ts @@ -1,55 +1,75 @@ import { ExternalTokenizer, InputStream, Stack } from '@lezer/lr' -import { Identifier, Word } from './shrimp.terms' +import { Identifier, Word, IdentifierBeforeDot } from './shrimp.terms' import type { Scope } from './scopeTracker' // The only chars that can't be words are whitespace, apostrophes, closing parens, and EOF. 
-export const tokenizer = new ExternalTokenizer((input: InputStream, stack: Stack) => {
-  let ch = getFullCodePoint(input, 0)
-  if (!isWordChar(ch)) return
+export const tokenizer = new ExternalTokenizer(
+  (input: InputStream, stack: Stack) => {
+    let ch = getFullCodePoint(input, 0)
+    // Not a word character (whitespace, apostrophe, closing paren, EOF): emit no token.
+    if (!isWordChar(ch)) return

-  let pos = getCharSize(ch)
-  let isValidIdentifier = isLowercaseLetter(ch) || isEmoji(ch)
-  const canBeWord = stack.canShift(Word)
+    let pos = getCharSize(ch)
+    let isValidIdentifier = isLowercaseLetter(ch) || isEmoji(ch)
+    const canBeWord = stack.canShift(Word)

-  while (true) {
-    ch = getFullCodePoint(input, pos)
+    while (true) {
+      ch = getFullCodePoint(input, pos)

-    // Check for dot and scope - property access detection
-    if (ch === 46 /* . */ && isValidIdentifier) {
-      const identifierText = input.read(input.pos, input.pos + pos)
-      const scope = stack.context as Scope | undefined
+      // Check for dot and scope - property access detection
+      if (ch === 46 /* . */ && isValidIdentifier) {
+        // Build identifier text by peeking character by character
+        let identifierText = ''
+        for (let i = 0; i < pos; i++) {
+          const charCode = input.peek(i)
+          if (charCode === -1) break
+          // Handle surrogate pairs for emoji
+          if (charCode >= 0xd800 && charCode <= 0xdbff && i + 1 < pos) {
+            const low = input.peek(i + 1)
+            if (low >= 0xdc00 && low <= 0xdfff) {
+              identifierText += String.fromCharCode(charCode, low)
+              i++ // Skip the low surrogate
+              continue
+            }
+          }
+          identifierText += String.fromCharCode(charCode)
+        }

-      if (scope?.has(identifierText)) {
-        // In scope - stop here, let grammar parse property access
-        input.advance(pos)
-        input.acceptToken(Identifier)
-        return
+        const scope = stack.context as Scope | undefined
+
+        if (scope?.has(identifierText)) {
+          // In scope - stop here, let grammar parse property access
+          input.advance(pos)
+          input.acceptToken(IdentifierBeforeDot)
+          return
+        }
+        // Not in scope - continue consuming as Word (fall through)
       }
-      // Not in scope - continue consuming as Word (fall through)
+
+      if (!isWordChar(ch)) break
+
+      // Certain characters might end a word or identifier if they are followed by whitespace.
+      // This allows things like `a = hello; 2` or `if x: y` to parse correctly.
+      if (canBeWord && (ch === 59 /* ; */ || ch === 58 /* : */)) {
+        const nextCh = getFullCodePoint(input, pos + 1)
+        if (!isWordChar(nextCh)) break
+      }
+
+      // Track identifier validity
+      if (!isLowercaseLetter(ch) && !isDigit(ch) && ch !== 45 && !isEmoji(ch)) {
+        if (!canBeWord) break
+        isValidIdentifier = false
+      }
+
+      pos += getCharSize(ch)
     }

-    if (!isWordChar(ch)) break
-
-    // Certain characters might end a word or identifier if they are followed by whitespace.
-    // This allows things like `a = hello; 2` of if `x: y` to parse correctly.
-    if (canBeWord && (ch === 59 /* ; */ || ch === 58) /* : */) {
-      const nextCh = getFullCodePoint(input, pos + 1)
-      if (!isWordChar(nextCh)) break
-    }
-
-    // Track identifier validity
-    if (!isLowercaseLetter(ch) && !isDigit(ch) && ch !== 45 && !isEmoji(ch)) {
-      if (!canBeWord) break
-      isValidIdentifier = false
-    }
-
-    pos += getCharSize(ch)
-  }
-
-  input.advance(pos)
-  input.acceptToken(isValidIdentifier ? Identifier : Word)
-}, { contextual: true })
+    input.advance(pos)
+    input.acceptToken(isValidIdentifier ? Identifier : Word)
+  },
+  { contextual: true }
+)

 const isWhiteSpace = (ch: number): boolean => {
   return ch === 32 /* space */ || ch === 10 /* \n */ || ch === 9 /* tab */ || ch === 13 /* \r */