diff --git a/bunfig.toml b/bunfig.toml
index 8877354..6cfa412 100644
--- a/bunfig.toml
+++ b/bunfig.toml
@@ -1,4 +1,7 @@
 [serve.static]
 plugins = ["bun-plugin-tailwind"]
 
-env = "BUN_PUBLIC_*"
\ No newline at end of file
+env = "BUN_PUBLIC_*"
+
+[test]
+preload = ["./src/testSetup.ts"]
diff --git a/package.json b/package.json
index 1535fb4..c83ee53 100644
--- a/package.json
+++ b/package.json
@@ -4,6 +4,7 @@
   "private": true,
   "type": "module",
   "scripts": {
+    "pretest": "bun generate-parser",
     "serve": "bun --hot src/server/server.tsx",
     "generate-parser": "lezer-generator src/parser/shrimp.grammar --typeScript -o src/parser/shrimp.ts"
   },
diff --git a/src/parser/shrimp.grammar b/src/parser/shrimp.grammar
index 790ebbe..d8c9d4a 100644
--- a/src/parser/shrimp.grammar
+++ b/src/parser/shrimp.grammar
@@ -1,17 +1,14 @@
 @external propSource highlighting from "./highlight.js"
-
 @top Program { expr* }
 
 @skip { space }
 
 @tokens {
-  @precedence { fn Boolean Identifier }
-
   space { @whitespace+ }
   Number { $[0-9]+ ('.' $[0-9]+)? }
   Boolean { "true" | "false" }
   String { '"' !["]* '"' }
-  Identifier { $[A-Za-z_]$[A-Za-z_0-9-]* }
+
   fn[@name=Keyword] { "fn" }
   equals[@name=Operator] { "=" }
   ":"[@name=Colon]
@@ -23,6 +20,10 @@
   rightParen[@name=Paren] { ")" }
 }
 
+@external tokens identifierTokenizer from "./tokenizers" {
+  Identifier
+}
+
 @precedence {
   multiplicative @left,
   additive @left,
diff --git a/src/parser/shrimp.terms.ts b/src/parser/shrimp.terms.ts
index 07cd39e..1c25326 100644
--- a/src/parser/shrimp.terms.ts
+++ b/src/parser/shrimp.terms.ts
@@ -1,8 +1,8 @@
 // This file was generated by lezer-generator. You probably shouldn't edit it.
 export const
-  Program = 1,
-  Assignment = 2,
-  Identifier = 3,
+  Identifier = 1,
+  Program = 2,
+  Assignment = 3,
   equals = 4,
   Function = 5,
   fn = 6,
diff --git a/src/parser/shrimp.test.ts b/src/parser/shrimp.test.ts
index 7529434..35efeeb 100644
--- a/src/parser/shrimp.test.ts
+++ b/src/parser/shrimp.test.ts
@@ -1,9 +1,27 @@
-import { regenerateParser } from '@/parser/test-helper'
-import { expect, beforeAll, describe, test } from 'bun:test'
+import { expect, describe, test } from 'bun:test'
+
+describe('Identifier', () => {
+  test('parses simple identifiers', () => {
+    expect('hyphenated-var').toMatchTree(`Identifier hyphenated-var`)
+    expect('var').toMatchTree(`Identifier var`)
+    expect('var123').toMatchTree(`Identifier var123`)
+  })
+
+  test('fails on underscores and capital letters', () => {
+    expect('myVar').toFailParse()
+    expect('underscore_var').toFailParse()
+    expect('_leadingUnderscore').toFailParse()
+    expect('trailingUnderscore_').toFailParse()
+    expect('mixed-123_var').toFailParse()
+  })
+
+  test('parses identifiers with emojis', () => {
+    expect('var😊').toMatchTree(`Identifier var😊`)
+    expect('😊').toMatchTree(`Identifier 😊`)
+  })
+})
 
 describe('BinOp', () => {
-  beforeAll(() => regenerateParser())
-
   test('addition tests', () => {
     expect('2 + 3').toMatchTree(`
       BinOp
@@ -60,8 +78,6 @@ describe('BinOp', () => {
 })
 
 describe('Fn', () => {
-  beforeAll(() => regenerateParser())
-
   test('parses function with single parameter', () => {
     expect('fn x: x + 1').toMatchTree(`
       Function
@@ -109,8 +125,6 @@ describe('Fn', () => {
 })
 
 describe('Identifier', () => {
-  beforeAll(() => regenerateParser())
-
   test('parses hyphenated identifiers correctly', () => {
     expect('my-var - another-var').toMatchTree(`
       BinOp
@@ -133,8 +147,6 @@ describe('Identifier', () => {
 })
 
 describe('Assignment', () => {
-  beforeAll(() => regenerateParser())
-
   test('parses assignment with addition', () => {
     expect('x = 5 + 3').toMatchTree(`
       Assignment
@@ -165,8 +177,6 @@ describe('Assignment', () => {
 })
 
 describe('Parentheses', () => {
-  beforeAll(() => regenerateParser())
-
   test('parses expressions with parentheses correctly', () => {
     expect('(2 + 3) * 4').toMatchTree(`
       BinOp
@@ -205,8 +215,6 @@ describe('Parentheses', () => {
 })
 
 describe('multiline', () => {
-  beforeAll(() => regenerateParser())
-
   test('parses multiline expressions', () => {
     expect(`
       5 + 4
diff --git a/src/parser/shrimp.ts b/src/parser/shrimp.ts
index f25632f..e25ac62 100644
--- a/src/parser/shrimp.ts
+++ b/src/parser/shrimp.ts
@@ -1,18 +1,19 @@
 // This file was generated by lezer-generator. You probably shouldn't edit it.
 import {LRParser} from "@lezer/lr"
+import {identifierTokenizer} from "./tokenizers"
 import {highlighting} from "./highlight.js"
 export const parser = LRParser.deserialize({
   version: 14,
-  states: "$OQVQPOOOkQPO'#CuO!fQPO'#CaO!nQPO'#CoOOQO'#Cu'#CuOVQPO'#CuOOQO'#Ct'#CtQVQPOOOVQPO,58xOOQO'#Cp'#CpO#cQPO'#CcO#kQPO,58{OVQPO,59POVQPO,59PO#pQPO,59aOOQO-E6m-E6mO$RQPO1G.dOOQO-E6n-E6nOVQPO1G.gOOQO1G.k1G.kO$yQPO1G.kOOQO1G.{1G.{O%qQPO7+$R",
-  stateData: "&n~OgOS~ORPOUQO^SO_SO`SOaTO~OSWORiXUiXYiXZiX[iX]iX^iX_iX`iXaiXeiXbiX~ORXOWVP~OY[OZ[O[]O]]ORcXUcX^cX_cX`cXacXecX~ORXOWVX~OWbO~OY[OZ[O[]O]]ObeO~OY[OZ[O[]O]]ORQiUQi^Qi_Qi`QiaQieQibQi~OY[OZ[ORXiUXi[Xi]Xi^Xi_Xi`XiaXieXibXi~OY[OZ[O[]O]]ORTqUTq^Tq_Tq`TqaTqeTqbTq~OU`R`~",
-  goto: "!hjPPkPPkPtPkPPPPPPPPPw}PPP!Tk_UOTVW[]bRZQQVOR_VQYQRaYSROVQ^TQ`WQc[Qd]Rfb",
-  nodeNames: "⚠ Program Assignment Identifier Operator Function Keyword Params Colon BinOp Operator Operator Operator Operator Number String Boolean Paren Paren",
+  states: "$OQVQROOOkQRO'#CuO!fQRO'#CaO!nQRO'#CoOOQQ'#Cu'#CuOVQRO'#CuOOQQ'#Ct'#CtQVQROOOVQRO,58yOOQQ'#Cp'#CpO#cQRO'#CcO#kQPO,58{OVQRO,59POVQRO,59PO#pQPO,59aOOQQ-E6m-E6mO$RQRO1G.eOOQQ-E6n-E6nOVQRO1G.gOOQQ1G.k1G.kO$yQRO1G.kOOQQ1G.{1G.{O%qQRO7+$R",
+  stateData: "&i~OgOS~OPPOUQO^SO_SO`SOaTO~OSWOPiXUiXYiXZiX[iX]iX^iX_iX`iXaiXeiXbiX~OPXOWVP~OY[OZ[O[]O]]OPcXUcX^cX_cX`cXacXecX~OPXOWVX~OWbO~OY[OZ[O[]O]]ObeO~OY[OZ[O[]O]]OPRiURi^Ri_Ri`RiaRieRibRi~OY[OZ[OPXiUXi[Xi]Xi^Xi_Xi`XiaXieXibXi~OY[OZ[O[]O]]OPTqUTq^Tq_Tq`TqaTqeTqbTq~O",
+  goto: "!hjPPPkPkPtPkPPPPPPPPPw}PPP!Tk_UOTVW[]bRZQQVOR_VQYQRaYSROVQ^TQ`WQc[Qd]Rfb",
+  nodeNames: "⚠ Identifier Program Assignment Operator Function Keyword Params Colon BinOp Operator Operator Operator Operator Number String Boolean Paren Paren",
   maxTerm: 25,
   propSources: [highlighting],
   skippedNodes: [0],
   repeatNodeCount: 2,
-  tokenData: "*f~RkX^!vpq!vrs#kxy$Yyz$_z{$d{|$i}!O$n!P!Q$s!Q![$x![!]%c!_!`%h!c!}%m#R#S%m#T#Y%m#Y#Z&R#Z#h%m#h#i)`#i#o%m#y#z!v$f$g!v#BY#BZ!v$IS$I_!v$I|$JO!v$JT$JU!v$KV$KW!v&FU&FV!v~!{Yg~X^!vpq!v#y#z!v$f$g!v#BY#BZ!v$IS$I_!v$I|$JO!v$JT$JU!v$KV$KW!v&FU&FV!v~#nTOr#krs#}s;'S#k;'S;=`$S<%lO#k~$SO_~~$VP;=`<%l#k~$_Oa~~$dOb~~$iOY~~$nO[~~$sO]~~$xOZ~~$}Q^~!O!P%T!Q![$x~%WP!Q![%Z~%`P^~!Q![%Z~%hOW~~%mOS~~%rTR~}!O%m!Q![%m!c!}%m#R#S%m#T#o%m~&WWR~}!O%m!Q![%m!c!}%m#R#S%m#T#U&p#U#b%m#b#c(x#c#o%m~&uVR~}!O%m!Q![%m!c!}%m#R#S%m#T#`%m#`#a'[#a#o%m~'aVR~}!O%m!Q![%m!c!}%m#R#S%m#T#g%m#g#h'v#h#o%m~'{VR~}!O%m!Q![%m!c!}%m#R#S%m#T#X%m#X#Y(b#Y#o%m~(iT`~R~}!O%m!Q![%m!c!}%m#R#S%m#T#o%m~)PTU~R~}!O%m!Q![%m!c!}%m#R#S%m#T#o%m~)eVR~}!O%m!Q![%m!c!}%m#R#S%m#T#f%m#f#g)z#g#o%m~*PVR~}!O%m!Q![%m!c!}%m#R#S%m#T#i%m#i#j'v#j#o%m",
-  tokenizers: [0],
-  topRules: {"Program":[0,1]},
-  tokenPrec: 255
+  tokenData: "&a~RfX^!gpq!grs#[xy#yyz$Oz{$T{|$Y}!O$_!P!Q$d!Q![$i![!]%S!_!`%X#Y#Z%^#h#i&T#y#z!g$f$g!g#BY#BZ!g$IS$I_!g$I|$JO!g$JT$JU!g$KV$KW!g&FU&FV!g~!lYg~X^!gpq!g#y#z!g$f$g!g#BY#BZ!g$IS$I_!g$I|$JO!g$JT$JU!g$KV$KW!g&FU&FV!g~#_TOr#[rs#ns;'S#[;'S;=`#s<%lO#[~#sO_~~#vP;=`<%l#[~$OOa~~$TOb~~$YOY~~$_O[~~$dO]~~$iOZ~~$nQ^~!O!P$t!Q![$i~$wP!Q![$z~%PP^~!Q![$z~%XOW~~%^OS~~%aQ#T#U%g#b#c&O~%jP#`#a%m~%pP#g#h%s~%vP#X#Y%y~&OO`~~&TOU~~&WP#f#g&Z~&^P#i#j%s",
+  tokenizers: [0, identifierTokenizer],
+  topRules: {"Program":[0,2]},
+  tokenPrec: 0
 })
diff --git a/src/parser/tokenizers.ts b/src/parser/tokenizers.ts
new file mode 100644
index 0000000..85ad52b
--- /dev/null
+++ b/src/parser/tokenizers.ts
@@ -0,0 +1,82 @@
+import { ExternalTokenizer, InputStream } from '@lezer/lr'
+import { Identifier } from './shrimp.terms'
+
+function isLowercaseLetter(ch: number): boolean {
+  return ch >= 97 && ch <= 122 // a-z
+}
+
+function isDigit(ch: number): boolean {
+  return ch >= 48 && ch <= 57 // 0-9
+}
+
+function getFullCodePoint(input: InputStream, pos: number): number {
+  const ch = input.peek(pos)
+
+  // Check if this is a high surrogate (0xD800-0xDBFF)
+  if (ch >= 0xd800 && ch <= 0xdbff) {
+    const low = input.peek(pos + 1)
+    // Check if next is low surrogate (0xDC00-0xDFFF)
+    if (low >= 0xdc00 && low <= 0xdfff) {
+      // Combine surrogate pair into full code point
+      return 0x10000 + ((ch & 0x3ff) << 10) + (low & 0x3ff)
+    }
+  }
+
+  return ch // Single code unit
+}
+
+function isEmoji(ch: number): boolean {
+  return (
+    // Basic Emoticons
+    (ch >= 0x1f600 && ch <= 0x1f64f) ||
+    // Miscellaneous Symbols and Pictographs
+    (ch >= 0x1f300 && ch <= 0x1f5ff) ||
+    // Transport and Map Symbols
+    (ch >= 0x1f680 && ch <= 0x1f6ff) ||
+    // Regional Indicator Symbols (flags)
+    (ch >= 0x1f1e6 && ch <= 0x1f1ff) ||
+    // Miscellaneous Symbols (hearts, stars, weather)
+    (ch >= 0x2600 && ch <= 0x26ff) ||
+    // Dingbats (scissors, pencils, etc)
+    (ch >= 0x2700 && ch <= 0x27bf) ||
+    // Supplemental Symbols and Pictographs (newer emojis)
+    (ch >= 0x1f900 && ch <= 0x1f9ff) ||
+    // Symbols and Pictographs Extended-A (newest emojis)
+    (ch >= 0x1fa70 && ch <= 0x1faff) ||
+    // Various Asian Characters with emoji presentation
+    (ch >= 0x1f018 && ch <= 0x1f270) ||
+    // Variation Selectors (for emoji presentation)
+    (ch >= 0xfe00 && ch <= 0xfe0f) ||
+    // Additional miscellaneous items
+    (ch >= 0x238c && ch <= 0x2454) ||
+    // Combining Diacritical Marks for Symbols
+    (ch >= 0x20d0 && ch <= 0x20ff)
+  )
+}
+
+export const identifierTokenizer = new ExternalTokenizer((input: InputStream) => {
+  const ch = getFullCodePoint(input, 0)
+
+  if (isLowercaseLetter(ch) || isEmoji(ch)) {
+    let pos = ch > 0xffff ? 2 : 1 // emoji takes 2 UTF-16 code units
+
+    // Continue consuming identifier characters
+    while (true) {
+      const nextCh = getFullCodePoint(input, pos)
+
+      if (
+        isLowercaseLetter(nextCh) ||
+        isDigit(nextCh) ||
+        nextCh === 45 /* - */ ||
+        isEmoji(nextCh)
+      ) {
+        pos += nextCh > 0xffff ? 2 : 1 // advance by 1 or 2 UTF-16 code units
+      } else {
+        break
+      }
+    }
+
+    input.advance(pos) // advance by total length
+    input.acceptToken(Identifier)
+  }
+})
diff --git a/src/parser/test-helper.ts b/src/testSetup.ts
similarity index 61%
rename from src/parser/test-helper.ts
rename to src/testSetup.ts
index d1bc9ae..ee9badf 100644
--- a/src/parser/test-helper.ts
+++ b/src/testSetup.ts
@@ -1,25 +1,34 @@
-import { beforeAll, expect } from 'bun:test'
+import { expect } from 'bun:test'
 import { Tree, TreeCursor } from '@lezer/common'
-import grammarFile from './shrimp.grammar'
-import { parser } from './shrimp.ts'
+import { parser } from './parser/shrimp.ts'
 import { $ } from 'bun'
 
-// Regenerate the parser if the grammar file is newer than the generated parser
-// This makes --watch work without needing to manually regenerate the parser
-export const regenerateParser = async () => {
-  const grammarStat = await Bun.file('src/parser/shrimp.grammar').stat()
-  const jsStat = await Bun.file('src/parser/shrimp.ts').stat()
+const regenerateParser = async () => {
+  let generate = true
+  try {
+    const grammarStat = await Bun.file('./src/parser/shrimp.grammar').stat()
+    const tokenizerStat = await Bun.file('./src/parser/tokenizers.ts').stat()
+    const parserStat = await Bun.file('./src/parser/shrimp.ts').stat()
 
-  if (grammarStat.mtime <= jsStat.mtime) return
-
-  console.log(`Regenerating parser from ${grammarFile}...`)
-  await $`bun generate-parser `
+    if (grammarStat.mtime <= parserStat.mtime && tokenizerStat.mtime <= parserStat.mtime) {
+      generate = false
+    }
+  } catch (e) {
+    console.error('Error checking or regenerating parser:', e)
+  } finally {
+    if (generate) {
+      await $`bun generate-parser`
+    }
+  }
 }
 
+await regenerateParser()
+
 // Type declaration for TypeScript
 declare module 'bun:test' {
   interface Matchers {
     toMatchTree(expected: string): T
+    toFailParse(): T
   }
 }
 
@@ -46,6 +55,45 @@ expect.extend({
       }
     }
   },
+  toFailParse(received: unknown) {
+    if (typeof received !== 'string') {
+      return {
+        message: () => 'toFailParse can only be used with string values',
+        pass: false,
+      }
+    }
+
+    try {
+      const tree = parser.parse(received)
+      let hasErrors = false
+      tree.iterate({
+        enter(n) {
+          if (n.type.isError) {
+            hasErrors = true
+            return false
+          }
+        },
+      })
+
+      if (hasErrors) {
+        return {
+          message: () => `Expected input to fail parsing, and it did.`,
+          pass: true,
+        }
+      } else {
+        const actual = treeToString(tree, received)
+        return {
+          message: () => `Expected input to fail parsing, but it parsed successfully:\n${actual}`,
+          pass: false,
+        }
+      }
+    } catch (error) {
+      return {
+        message: () => `Parsing threw an error: ${(error as Error).message}`,
+        pass: false,
+      }
+    }
+  },
 })
 
 const treeToString = (tree: Tree, input: string): string => {