This commit is contained in:
Corey Johnson 2025-09-29 11:40:32 -07:00
parent 7585f0e8a2
commit 0168d7f933
8 changed files with 186 additions and 42 deletions

View File

@ -2,3 +2,6 @@
[serve.static]
plugins = ["bun-plugin-tailwind"]
env = "BUN_PUBLIC_*"
[test]
preload = ["./src/testSetup.ts"]

View File

@ -4,6 +4,7 @@
"private": true,
"type": "module",
"scripts": {
"pretest": "bun generate-parser",
"serve": "bun --hot src/server/server.tsx",
"generate-parser": "lezer-generator src/parser/shrimp.grammar --typeScript -o src/parser/shrimp.ts"
},

View File

@ -1,17 +1,14 @@
@external propSource highlighting from "./highlight.js"
@top Program { expr* }
@skip { space }
@tokens {
@precedence { fn Boolean Identifier }
space { @whitespace+ }
Number { $[0-9]+ ('.' $[0-9]+)? }
Boolean { "true" | "false" }
String { '"' !["]* '"' }
Identifier { $[A-Za-z_]$[A-Za-z_0-9-]* }
fn[@name=Keyword] { "fn" }
equals[@name=Operator] { "=" }
":"[@name=Colon]
@ -23,6 +20,10 @@
rightParen[@name=Paren] { ")" }
}
@external tokens identifierTokenizer from "./tokenizers" {
Identifier
}
@precedence {
multiplicative @left,
additive @left,

View File

@ -1,8 +1,8 @@
// This file was generated by lezer-generator. You probably shouldn't edit it.
export const
Program = 1,
Assignment = 2,
Identifier = 3,
Identifier = 1,
Program = 2,
Assignment = 3,
equals = 4,
Function = 5,
fn = 6,

View File

@ -1,9 +1,27 @@
import { regenerateParser } from '@/parser/test-helper'
import { expect, beforeAll, describe, test } from 'bun:test'
import { expect, describe, test } from 'bun:test'
describe('Identifier', () => {
test('parses simple identifiers', () => {
expect('hyphenated-var').toMatchTree(`Identifier hyphenated-var`)
expect('var').toMatchTree(`Identifier var`)
expect('var123').toMatchTree(`Identifier var123`)
})
test('fails on underscores and capital letters', () => {
expect('myVar').toFailParse()
expect('underscore_var').toFailParse()
expect('_leadingUnderscore').toFailParse()
expect('trailingUnderscore_').toFailParse()
expect('mixed-123_var').toFailParse()
})
test('parses identifiers with emojis', () => {
expect('var😊').toMatchTree(`Identifier var😊`)
expect('😊').toMatchTree(`Identifier 😊`)
})
})
describe('BinOp', () => {
beforeAll(() => regenerateParser())
test('addition tests', () => {
expect('2 + 3').toMatchTree(`
BinOp
@ -60,8 +78,6 @@ describe('BinOp', () => {
})
describe('Fn', () => {
beforeAll(() => regenerateParser())
test('parses function with single parameter', () => {
expect('fn x: x + 1').toMatchTree(`
Function
@ -109,8 +125,6 @@ describe('Fn', () => {
})
describe('Identifier', () => {
beforeAll(() => regenerateParser())
test('parses hyphenated identifiers correctly', () => {
expect('my-var - another-var').toMatchTree(`
BinOp
@ -133,8 +147,6 @@ describe('Identifier', () => {
})
describe('Assignment', () => {
beforeAll(() => regenerateParser())
test('parses assignment with addition', () => {
expect('x = 5 + 3').toMatchTree(`
Assignment
@ -165,8 +177,6 @@ describe('Assignment', () => {
})
describe('Parentheses', () => {
beforeAll(() => regenerateParser())
test('parses expressions with parentheses correctly', () => {
expect('(2 + 3) * 4').toMatchTree(`
BinOp
@ -205,8 +215,6 @@ describe('Parentheses', () => {
})
describe('multiline', () => {
beforeAll(() => regenerateParser())
test('parses multiline expressions', () => {
expect(`
5 + 4

View File

@ -1,18 +1,19 @@
// This file was generated by lezer-generator. You probably shouldn't edit it.
import {LRParser} from "@lezer/lr"
import {identifierTokenizer} from "./tokenizers"
import {highlighting} from "./highlight.js"
export const parser = LRParser.deserialize({
version: 14,
states: "$OQVQPOOOkQPO'#CuO!fQPO'#CaO!nQPO'#CoOOQO'#Cu'#CuOVQPO'#CuOOQO'#Ct'#CtQVQPOOOVQPO,58xOOQO'#Cp'#CpO#cQPO'#CcO#kQPO,58{OVQPO,59POVQPO,59PO#pQPO,59aOOQO-E6m-E6mO$RQPO1G.dOOQO-E6n-E6nOVQPO1G.gOOQO1G.k1G.kO$yQPO1G.kOOQO1G.{1G.{O%qQPO7+$R",
stateData: "&n~OgOS~ORPOUQO^SO_SO`SOaTO~OSWORiXUiXYiXZiX[iX]iX^iX_iX`iXaiXeiXbiX~ORXOWVP~OY[OZ[O[]O]]ORcXUcX^cX_cX`cXacXecX~ORXOWVX~OWbO~OY[OZ[O[]O]]ObeO~OY[OZ[O[]O]]ORQiUQi^Qi_Qi`QiaQieQibQi~OY[OZ[ORXiUXi[Xi]Xi^Xi_Xi`XiaXieXibXi~OY[OZ[O[]O]]ORTqUTq^Tq_Tq`TqaTqeTqbTq~OU`R`~",
goto: "!hjPPkPPkPtPkPPPPPPPPPw}PPP!Tk_UOTVW[]bRZQQVOR_VQYQRaYSROVQ^TQ`WQc[Qd]Rfb",
nodeNames: "⚠ Program Assignment Identifier Operator Function Keyword Params Colon BinOp Operator Operator Operator Operator Number String Boolean Paren Paren",
states: "$OQVQROOOkQRO'#CuO!fQRO'#CaO!nQRO'#CoOOQQ'#Cu'#CuOVQRO'#CuOOQQ'#Ct'#CtQVQROOOVQRO,58yOOQQ'#Cp'#CpO#cQRO'#CcO#kQPO,58{OVQRO,59POVQRO,59PO#pQPO,59aOOQQ-E6m-E6mO$RQRO1G.eOOQQ-E6n-E6nOVQRO1G.gOOQQ1G.k1G.kO$yQRO1G.kOOQQ1G.{1G.{O%qQRO7+$R",
stateData: "&i~OgOS~OPPOUQO^SO_SO`SOaTO~OSWOPiXUiXYiXZiX[iX]iX^iX_iX`iXaiXeiXbiX~OPXOWVP~OY[OZ[O[]O]]OPcXUcX^cX_cX`cXacXecX~OPXOWVX~OWbO~OY[OZ[O[]O]]ObeO~OY[OZ[O[]O]]OPRiURi^Ri_Ri`RiaRieRibRi~OY[OZ[OPXiUXi[Xi]Xi^Xi_Xi`XiaXieXibXi~OY[OZ[O[]O]]OPTqUTq^Tq_Tq`TqaTqeTqbTq~O",
goto: "!hjPPPkPkPtPkPPPPPPPPPw}PPP!Tk_UOTVW[]bRZQQVOR_VQYQRaYSROVQ^TQ`WQc[Qd]Rfb",
nodeNames: "⚠ Identifier Program Assignment Operator Function Keyword Params Colon BinOp Operator Operator Operator Operator Number String Boolean Paren Paren",
maxTerm: 25,
propSources: [highlighting],
skippedNodes: [0],
repeatNodeCount: 2,
tokenData: "*f~RkX^!vpq!vrs#kxy$Yyz$_z{$d{|$i}!O$n!P!Q$s!Q![$x![!]%c!_!`%h!c!}%m#R#S%m#T#Y%m#Y#Z&R#Z#h%m#h#i)`#i#o%m#y#z!v$f$g!v#BY#BZ!v$IS$I_!v$I|$JO!v$JT$JU!v$KV$KW!v&FU&FV!v~!{Yg~X^!vpq!v#y#z!v$f$g!v#BY#BZ!v$IS$I_!v$I|$JO!v$JT$JU!v$KV$KW!v&FU&FV!v~#nTOr#krs#}s;'S#k;'S;=`$S<%lO#k~$SO_~~$VP;=`<%l#k~$_Oa~~$dOb~~$iOY~~$nO[~~$sO]~~$xOZ~~$}Q^~!O!P%T!Q![$x~%WP!Q![%Z~%`P^~!Q![%Z~%hOW~~%mOS~~%rTR~}!O%m!Q![%m!c!}%m#R#S%m#T#o%m~&WWR~}!O%m!Q![%m!c!}%m#R#S%m#T#U&p#U#b%m#b#c(x#c#o%m~&uVR~}!O%m!Q![%m!c!}%m#R#S%m#T#`%m#`#a'[#a#o%m~'aVR~}!O%m!Q![%m!c!}%m#R#S%m#T#g%m#g#h'v#h#o%m~'{VR~}!O%m!Q![%m!c!}%m#R#S%m#T#X%m#X#Y(b#Y#o%m~(iT`~R~}!O%m!Q![%m!c!}%m#R#S%m#T#o%m~)PTU~R~}!O%m!Q![%m!c!}%m#R#S%m#T#o%m~)eVR~}!O%m!Q![%m!c!}%m#R#S%m#T#f%m#f#g)z#g#o%m~*PVR~}!O%m!Q![%m!c!}%m#R#S%m#T#i%m#i#j'v#j#o%m",
tokenizers: [0],
topRules: {"Program":[0,1]},
tokenPrec: 255
tokenData: "&a~RfX^!gpq!grs#[xy#yyz$Oz{$T{|$Y}!O$_!P!Q$d!Q![$i![!]%S!_!`%X#Y#Z%^#h#i&T#y#z!g$f$g!g#BY#BZ!g$IS$I_!g$I|$JO!g$JT$JU!g$KV$KW!g&FU&FV!g~!lYg~X^!gpq!g#y#z!g$f$g!g#BY#BZ!g$IS$I_!g$I|$JO!g$JT$JU!g$KV$KW!g&FU&FV!g~#_TOr#[rs#ns;'S#[;'S;=`#s<%lO#[~#sO_~~#vP;=`<%l#[~$OOa~~$TOb~~$YOY~~$_O[~~$dO]~~$iOZ~~$nQ^~!O!P$t!Q![$i~$wP!Q![$z~%PP^~!Q![$z~%XOW~~%^OS~~%aQ#T#U%g#b#c&O~%jP#`#a%m~%pP#g#h%s~%vP#X#Y%y~&OO`~~&TOU~~&WP#f#g&Z~&^P#i#j%s",
tokenizers: [0, identifierTokenizer],
topRules: {"Program":[0,2]},
tokenPrec: 0
})

82
src/parser/tokenizers.ts Normal file
View File

@ -0,0 +1,82 @@
import { ExternalTokenizer, InputStream } from '@lezer/lr'
import { Identifier } from './shrimp.terms'
// True when `ch` is an ASCII lowercase letter ('a'..'z').
function isLowercaseLetter(ch: number): boolean {
  const a = 0x61 // 'a'
  const z = 0x7a // 'z'
  return a <= ch && ch <= z
}
// True when `ch` is an ASCII decimal digit ('0'..'9').
function isDigit(ch: number): boolean {
  return 0x30 <= ch && ch <= 0x39 // '0'..'9'
}
// Reads the full Unicode code point at `pos` in the input stream.
// When the code unit at `pos` is a UTF-16 high surrogate and the next
// unit is a matching low surrogate, the pair is combined into a single
// supplementary-plane code point; otherwise the raw code unit is
// returned as-is (this also covers lone/unpaired surrogates and EOF).
function getFullCodePoint(input: InputStream, pos: number): number {
  const high = input.peek(pos)
  const isHighSurrogate = high >= 0xd800 && high <= 0xdbff
  if (!isHighSurrogate) return high

  const low = input.peek(pos + 1)
  const isLowSurrogate = low >= 0xdc00 && low <= 0xdfff
  if (!isLowSurrogate) return high // unpaired high surrogate

  // Standard UTF-16 decoding: 0x10000 + (highBits << 10) + lowBits.
  return 0x10000 + ((high & 0x3ff) << 10) + (low & 0x3ff)
}
// Inclusive code-point ranges treated as "emoji" for identifier purposes.
const EMOJI_RANGES: ReadonlyArray<readonly [number, number]> = [
  [0x1f600, 0x1f64f], // Basic Emoticons
  [0x1f300, 0x1f5ff], // Miscellaneous Symbols and Pictographs
  [0x1f680, 0x1f6ff], // Transport and Map Symbols
  [0x1f1e6, 0x1f1ff], // Regional Indicator Symbols (flags)
  [0x2600, 0x26ff],   // Miscellaneous Symbols (hearts, stars, weather)
  [0x2700, 0x27bf],   // Dingbats (scissors, pencils, etc)
  [0x1f900, 0x1f9ff], // Supplemental Symbols and Pictographs (newer emojis)
  [0x1fa70, 0x1faff], // Symbols and Pictographs Extended-A (newest emojis)
  [0x1f018, 0x1f270], // Various Asian Characters with emoji presentation
  [0xfe00, 0xfe0f],   // Variation Selectors (for emoji presentation)
  [0x238c, 0x2454],   // Additional miscellaneous items
  [0x20d0, 0x20ff],   // Combining Diacritical Marks for Symbols
]

// True when code point `ch` falls inside any of the emoji ranges above.
function isEmoji(ch: number): boolean {
  return EMOJI_RANGES.some(([lo, hi]) => lo <= ch && ch <= hi)
}
// External tokenizer for Identifier tokens: the first character must be a
// lowercase ASCII letter or an emoji; subsequent characters may also be
// digits or hyphens. Positions are tracked in UTF-16 code units, so
// supplementary-plane emojis advance the stream by two units.
export const identifierTokenizer = new ExternalTokenizer((input: InputStream) => {
  // Width of a code point in UTF-16 code units (2 for astral-plane chars).
  const width = (cp: number) => (cp > 0xffff ? 2 : 1)

  const first = getFullCodePoint(input, 0)
  if (!isLowercaseLetter(first) && !isEmoji(first)) return

  let length = width(first)
  for (;;) {
    const cp = getFullCodePoint(input, length)
    const isTailChar =
      isLowercaseLetter(cp) || isDigit(cp) || cp === 0x2d /* '-' */ || isEmoji(cp)
    if (!isTailChar) break
    length += width(cp)
  }

  input.advance(length) // consume the whole identifier
  input.acceptToken(Identifier)
})

View File

@ -1,25 +1,34 @@
import { beforeAll, expect } from 'bun:test'
import { expect } from 'bun:test'
import { Tree, TreeCursor } from '@lezer/common'
import grammarFile from './shrimp.grammar'
import { parser } from './shrimp.ts'
import { parser } from './parser/shrimp.ts'
import { $ } from 'bun'
// Regenerate the parser if the grammar file is newer than the generated parser
// This makes --watch work without needing to manually regenerate the parser
export const regenerateParser = async () => {
const grammarStat = await Bun.file('src/parser/shrimp.grammar').stat()
const jsStat = await Bun.file('src/parser/shrimp.ts').stat()
const regenerateParser = async () => {
let generate = true
try {
const grammarStat = await Bun.file('./src/parser/shrimp.grammar').stat()
const tokenizerStat = await Bun.file('./src/parser/tokenizers.ts').stat()
const parserStat = await Bun.file('./src/parser/shrimp.ts').stat()
if (grammarStat.mtime <= jsStat.mtime) return
console.log(`Regenerating parser from ${grammarFile}...`)
if (grammarStat.mtime <= parserStat.mtime && tokenizerStat.mtime <= parserStat.mtime) {
generate = false
}
} catch (e) {
console.error('Error checking or regenerating parser:', e)
} finally {
if (generate) {
await $`bun generate-parser`
}
}
}
await regenerateParser()
// Type declaration for TypeScript
declare module 'bun:test' {
interface Matchers<T> {
toMatchTree(expected: string): T
toFailParse(): T
}
}
@ -46,6 +55,45 @@ expect.extend({
}
}
},
// Custom matcher: passes when the input string does NOT parse cleanly,
// i.e. the resulting tree contains at least one lezer error node.
toFailParse(received: unknown) {
  if (typeof received !== 'string') {
    return {
      // Fixed: this message previously said 'toMatchTree' — a copy-paste
      // error from the sibling matcher that made failures misleading.
      message: () => 'toFailParse can only be used with string values',
      pass: false,
    }
  }
  try {
    const tree = parser.parse(received)
    // Walk the tree looking for error nodes; stop at the first one found.
    let hasErrors = false
    tree.iterate({
      enter(n) {
        if (n.type.isError) {
          hasErrors = true
          return false // don't descend further — one error is enough
        }
      },
    })
    if (hasErrors) {
      return {
        message: () => `Expected input to fail parsing, and it did.`,
        pass: true,
      }
    } else {
      const actual = treeToString(tree, received)
      return {
        message: () => `Expected input to fail parsing, but it parsed successfully:\n${actual}`,
        pass: false,
      }
    }
  } catch (error) {
    // NOTE(review): a thrown parse error arguably also means "failed to
    // parse"; currently treated as a matcher failure — confirm intent.
    return {
      message: () => `Parsing threw an error: ${(error as Error).message}`,
      pass: false,
    }
  }
},
})
const treeToString = (tree: Tree, input: string): string => {