This commit is contained in:
Corey Johnson 2025-09-29 11:40:32 -07:00
parent 7585f0e8a2
commit 0168d7f933
8 changed files with 186 additions and 42 deletions

View File

@ -2,3 +2,6 @@
[serve.static]
plugins = ["bun-plugin-tailwind"]
env = "BUN_PUBLIC_*"
[test]
preload = ["./src/testSetup.ts"]

View File

@ -4,6 +4,7 @@
"private": true,
"type": "module",
"scripts": {
"pretest": "bun generate-parser",
"serve": "bun --hot src/server/server.tsx",
"generate-parser": "lezer-generator src/parser/shrimp.grammar --typeScript -o src/parser/shrimp.ts"
},

View File

@ -1,17 +1,14 @@
@external propSource highlighting from "./highlight.js"
@top Program { expr* }
@skip { space }
@tokens {
@precedence { fn Boolean Identifier }
space { @whitespace+ }
Number { $[0-9]+ ('.' $[0-9]+)? }
Boolean { "true" | "false" }
String { '"' !["]* '"' }
Identifier { $[A-Za-z_]$[A-Za-z_0-9-]* }
fn[@name=Keyword] { "fn" }
equals[@name=Operator] { "=" }
":"[@name=Colon]
@ -23,6 +20,10 @@
rightParen[@name=Paren] { ")" }
}
@external tokens identifierTokenizer from "./tokenizers" {
Identifier
}
@precedence {
multiplicative @left,
additive @left,

View File

@ -1,8 +1,8 @@
// This file was generated by lezer-generator. You probably shouldn't edit it.
export const
Program = 1,
Assignment = 2,
Identifier = 3,
Identifier = 1,
Program = 2,
Assignment = 3,
equals = 4,
Function = 5,
fn = 6,

View File

@ -1,9 +1,27 @@
import { regenerateParser } from '@/parser/test-helper'
import { expect, beforeAll, describe, test } from 'bun:test'
import { expect, describe, test } from 'bun:test'
describe('Identifier', () => {
test('parses simple identifiers', () => {
expect('hyphenated-var').toMatchTree(`Identifier hyphenated-var`)
expect('var').toMatchTree(`Identifier var`)
expect('var123').toMatchTree(`Identifier var123`)
})
test('fails on underscores and capital letters', () => {
expect('myVar').toFailParse()
expect('underscore_var').toFailParse()
expect('_leadingUnderscore').toFailParse()
expect('trailingUnderscore_').toFailParse()
expect('mixed-123_var').toFailParse()
})
test('parses identifiers with emojis', () => {
expect('var😊').toMatchTree(`Identifier var😊`)
expect('😊').toMatchTree(`Identifier 😊`)
})
})
describe('BinOp', () => {
beforeAll(() => regenerateParser())
test('addition tests', () => {
expect('2 + 3').toMatchTree(`
BinOp
@ -60,8 +78,6 @@ describe('BinOp', () => {
})
describe('Fn', () => {
beforeAll(() => regenerateParser())
test('parses function with single parameter', () => {
expect('fn x: x + 1').toMatchTree(`
Function
@ -109,8 +125,6 @@ describe('Fn', () => {
})
describe('Identifier', () => {
beforeAll(() => regenerateParser())
test('parses hyphenated identifiers correctly', () => {
expect('my-var - another-var').toMatchTree(`
BinOp
@ -133,8 +147,6 @@ describe('Identifier', () => {
})
describe('Assignment', () => {
beforeAll(() => regenerateParser())
test('parses assignment with addition', () => {
expect('x = 5 + 3').toMatchTree(`
Assignment
@ -165,8 +177,6 @@ describe('Assignment', () => {
})
describe('Parentheses', () => {
beforeAll(() => regenerateParser())
test('parses expressions with parentheses correctly', () => {
expect('(2 + 3) * 4').toMatchTree(`
BinOp
@ -205,8 +215,6 @@ describe('Parentheses', () => {
})
describe('multiline', () => {
beforeAll(() => regenerateParser())
test('parses multiline expressions', () => {
expect(`
5 + 4

View File

@ -1,18 +1,19 @@
// This file was generated by lezer-generator. You probably shouldn't edit it.
import {LRParser} from "@lezer/lr"
import {identifierTokenizer} from "./tokenizers"
import {highlighting} from "./highlight.js"
export const parser = LRParser.deserialize({
version: 14,
states: "$OQVQPOOOkQPO'#CuO!fQPO'#CaO!nQPO'#CoOOQO'#Cu'#CuOVQPO'#CuOOQO'#Ct'#CtQVQPOOOVQPO,58xOOQO'#Cp'#CpO#cQPO'#CcO#kQPO,58{OVQPO,59POVQPO,59PO#pQPO,59aOOQO-E6m-E6mO$RQPO1G.dOOQO-E6n-E6nOVQPO1G.gOOQO1G.k1G.kO$yQPO1G.kOOQO1G.{1G.{O%qQPO7+$R",
stateData: "&n~OgOS~ORPOUQO^SO_SO`SOaTO~OSWORiXUiXYiXZiX[iX]iX^iX_iX`iXaiXeiXbiX~ORXOWVP~OY[OZ[O[]O]]ORcXUcX^cX_cX`cXacXecX~ORXOWVX~OWbO~OY[OZ[O[]O]]ObeO~OY[OZ[O[]O]]ORQiUQi^Qi_Qi`QiaQieQibQi~OY[OZ[ORXiUXi[Xi]Xi^Xi_Xi`XiaXieXibXi~OY[OZ[O[]O]]ORTqUTq^Tq_Tq`TqaTqeTqbTq~OU`R`~",
goto: "!hjPPkPPkPtPkPPPPPPPPPw}PPP!Tk_UOTVW[]bRZQQVOR_VQYQRaYSROVQ^TQ`WQc[Qd]Rfb",
nodeNames: "⚠ Program Assignment Identifier Operator Function Keyword Params Colon BinOp Operator Operator Operator Operator Number String Boolean Paren Paren",
states: "$OQVQROOOkQRO'#CuO!fQRO'#CaO!nQRO'#CoOOQQ'#Cu'#CuOVQRO'#CuOOQQ'#Ct'#CtQVQROOOVQRO,58yOOQQ'#Cp'#CpO#cQRO'#CcO#kQPO,58{OVQRO,59POVQRO,59PO#pQPO,59aOOQQ-E6m-E6mO$RQRO1G.eOOQQ-E6n-E6nOVQRO1G.gOOQQ1G.k1G.kO$yQRO1G.kOOQQ1G.{1G.{O%qQRO7+$R",
stateData: "&i~OgOS~OPPOUQO^SO_SO`SOaTO~OSWOPiXUiXYiXZiX[iX]iX^iX_iX`iXaiXeiXbiX~OPXOWVP~OY[OZ[O[]O]]OPcXUcX^cX_cX`cXacXecX~OPXOWVX~OWbO~OY[OZ[O[]O]]ObeO~OY[OZ[O[]O]]OPRiURi^Ri_Ri`RiaRieRibRi~OY[OZ[OPXiUXi[Xi]Xi^Xi_Xi`XiaXieXibXi~OY[OZ[O[]O]]OPTqUTq^Tq_Tq`TqaTqeTqbTq~O",
goto: "!hjPPPkPkPtPkPPPPPPPPPw}PPP!Tk_UOTVW[]bRZQQVOR_VQYQRaYSROVQ^TQ`WQc[Qd]Rfb",
nodeNames: "⚠ Identifier Program Assignment Operator Function Keyword Params Colon BinOp Operator Operator Operator Operator Number String Boolean Paren Paren",
maxTerm: 25,
propSources: [highlighting],
skippedNodes: [0],
repeatNodeCount: 2,
tokenData: "*f~RkX^!vpq!vrs#kxy$Yyz$_z{$d{|$i}!O$n!P!Q$s!Q![$x![!]%c!_!`%h!c!}%m#R#S%m#T#Y%m#Y#Z&R#Z#h%m#h#i)`#i#o%m#y#z!v$f$g!v#BY#BZ!v$IS$I_!v$I|$JO!v$JT$JU!v$KV$KW!v&FU&FV!v~!{Yg~X^!vpq!v#y#z!v$f$g!v#BY#BZ!v$IS$I_!v$I|$JO!v$JT$JU!v$KV$KW!v&FU&FV!v~#nTOr#krs#}s;'S#k;'S;=`$S<%lO#k~$SO_~~$VP;=`<%l#k~$_Oa~~$dOb~~$iOY~~$nO[~~$sO]~~$xOZ~~$}Q^~!O!P%T!Q![$x~%WP!Q![%Z~%`P^~!Q![%Z~%hOW~~%mOS~~%rTR~}!O%m!Q![%m!c!}%m#R#S%m#T#o%m~&WWR~}!O%m!Q![%m!c!}%m#R#S%m#T#U&p#U#b%m#b#c(x#c#o%m~&uVR~}!O%m!Q![%m!c!}%m#R#S%m#T#`%m#`#a'[#a#o%m~'aVR~}!O%m!Q![%m!c!}%m#R#S%m#T#g%m#g#h'v#h#o%m~'{VR~}!O%m!Q![%m!c!}%m#R#S%m#T#X%m#X#Y(b#Y#o%m~(iT`~R~}!O%m!Q![%m!c!}%m#R#S%m#T#o%m~)PTU~R~}!O%m!Q![%m!c!}%m#R#S%m#T#o%m~)eVR~}!O%m!Q![%m!c!}%m#R#S%m#T#f%m#f#g)z#g#o%m~*PVR~}!O%m!Q![%m!c!}%m#R#S%m#T#i%m#i#j'v#j#o%m",
tokenizers: [0],
topRules: {"Program":[0,1]},
tokenPrec: 255
tokenData: "&a~RfX^!gpq!grs#[xy#yyz$Oz{$T{|$Y}!O$_!P!Q$d!Q![$i![!]%S!_!`%X#Y#Z%^#h#i&T#y#z!g$f$g!g#BY#BZ!g$IS$I_!g$I|$JO!g$JT$JU!g$KV$KW!g&FU&FV!g~!lYg~X^!gpq!g#y#z!g$f$g!g#BY#BZ!g$IS$I_!g$I|$JO!g$JT$JU!g$KV$KW!g&FU&FV!g~#_TOr#[rs#ns;'S#[;'S;=`#s<%lO#[~#sO_~~#vP;=`<%l#[~$OOa~~$TOb~~$YOY~~$_O[~~$dO]~~$iOZ~~$nQ^~!O!P$t!Q![$i~$wP!Q![$z~%PP^~!Q![$z~%XOW~~%^OS~~%aQ#T#U%g#b#c&O~%jP#`#a%m~%pP#g#h%s~%vP#X#Y%y~&OO`~~&TOU~~&WP#f#g&Z~&^P#i#j%s",
tokenizers: [0, identifierTokenizer],
topRules: {"Program":[0,2]},
tokenPrec: 0
})

82
src/parser/tokenizers.ts Normal file
View File

@ -0,0 +1,82 @@
import { ExternalTokenizer, InputStream } from '@lezer/lr'
import { Identifier } from './shrimp.terms'
// True when `ch` is an ASCII lowercase letter ('a'..'z').
function isLowercaseLetter(ch: number): boolean {
  const a = 0x61 // 'a'
  const z = 0x7a // 'z'
  return a <= ch && ch <= z
}
// True when `ch` is an ASCII decimal digit ('0'..'9').
function isDigit(ch: number): boolean {
  return 0x30 <= ch && ch <= 0x39 // '0'..'9'
}
// Reads the full Unicode code point at `pos` in the input stream.
// When the code unit at `pos` is a UTF-16 high surrogate and the next
// unit is a matching low surrogate, the pair is combined into a single
// supplementary-plane code point; otherwise the raw code unit is
// returned as-is (this also covers lone/unpaired surrogates and EOF).
function getFullCodePoint(input: InputStream, pos: number): number {
  const high = input.peek(pos)
  const isHighSurrogate = high >= 0xd800 && high <= 0xdbff
  if (!isHighSurrogate) return high

  const low = input.peek(pos + 1)
  const isLowSurrogate = low >= 0xdc00 && low <= 0xdfff
  if (!isLowSurrogate) return high // unpaired high surrogate

  // Standard UTF-16 decoding: 0x10000 + (highBits << 10) + lowBits.
  return 0x10000 + ((high & 0x3ff) << 10) + (low & 0x3ff)
}
// Inclusive code-point ranges treated as "emoji" for identifier purposes.
const EMOJI_RANGES: ReadonlyArray<readonly [number, number]> = [
  [0x1f600, 0x1f64f], // Basic Emoticons
  [0x1f300, 0x1f5ff], // Miscellaneous Symbols and Pictographs
  [0x1f680, 0x1f6ff], // Transport and Map Symbols
  [0x1f1e6, 0x1f1ff], // Regional Indicator Symbols (flags)
  [0x2600, 0x26ff],   // Miscellaneous Symbols (hearts, stars, weather)
  [0x2700, 0x27bf],   // Dingbats (scissors, pencils, etc)
  [0x1f900, 0x1f9ff], // Supplemental Symbols and Pictographs (newer emojis)
  [0x1fa70, 0x1faff], // Symbols and Pictographs Extended-A (newest emojis)
  [0x1f018, 0x1f270], // Various Asian Characters with emoji presentation
  [0xfe00, 0xfe0f],   // Variation Selectors (for emoji presentation)
  [0x238c, 0x2454],   // Additional miscellaneous items
  [0x20d0, 0x20ff],   // Combining Diacritical Marks for Symbols
]

// True when code point `ch` falls inside any of the emoji ranges above.
function isEmoji(ch: number): boolean {
  return EMOJI_RANGES.some(([lo, hi]) => lo <= ch && ch <= hi)
}
// External tokenizer for Identifier tokens: the first character must be a
// lowercase ASCII letter or an emoji; subsequent characters may also be
// digits or hyphens. Positions are tracked in UTF-16 code units, so
// supplementary-plane emojis advance the stream by two units.
export const identifierTokenizer = new ExternalTokenizer((input: InputStream) => {
  // Width of a code point in UTF-16 code units (2 for astral-plane chars).
  const width = (cp: number) => (cp > 0xffff ? 2 : 1)

  const first = getFullCodePoint(input, 0)
  if (!isLowercaseLetter(first) && !isEmoji(first)) return

  let length = width(first)
  for (;;) {
    const cp = getFullCodePoint(input, length)
    const isTailChar =
      isLowercaseLetter(cp) || isDigit(cp) || cp === 0x2d /* '-' */ || isEmoji(cp)
    if (!isTailChar) break
    length += width(cp)
  }

  input.advance(length) // consume the whole identifier
  input.acceptToken(Identifier)
})

View File

@ -1,25 +1,34 @@
import { beforeAll, expect } from 'bun:test'
import { expect } from 'bun:test'
import { Tree, TreeCursor } from '@lezer/common'
import grammarFile from './shrimp.grammar'
import { parser } from './shrimp.ts'
import { parser } from './parser/shrimp.ts'
import { $ } from 'bun'
// Regenerate the parser if the grammar file is newer than the generated parser
// This makes --watch work without needing to manually regenerate the parser
export const regenerateParser = async () => {
const grammarStat = await Bun.file('src/parser/shrimp.grammar').stat()
const jsStat = await Bun.file('src/parser/shrimp.ts').stat()
const regenerateParser = async () => {
let generate = true
try {
const grammarStat = await Bun.file('./src/parser/shrimp.grammar').stat()
const tokenizerStat = await Bun.file('./src/parser/tokenizers.ts').stat()
const parserStat = await Bun.file('./src/parser/shrimp.ts').stat()
if (grammarStat.mtime <= jsStat.mtime) return
console.log(`Regenerating parser from ${grammarFile}...`)
if (grammarStat.mtime <= parserStat.mtime && tokenizerStat.mtime <= parserStat.mtime) {
generate = false
}
} catch (e) {
console.error('Error checking or regenerating parser:', e)
} finally {
if (generate) {
await $`bun generate-parser`
}
}
}
await regenerateParser()
// Type declaration for TypeScript
declare module 'bun:test' {
interface Matchers<T> {
toMatchTree(expected: string): T
toFailParse(): T
}
}
@ -46,6 +55,45 @@ expect.extend({
}
}
},
// Custom matcher: passes when the input string does NOT parse cleanly,
// i.e. the resulting tree contains at least one lezer error node.
toFailParse(received: unknown) {
  if (typeof received !== 'string') {
    return {
      // Fixed: this message previously said 'toMatchTree' — a copy-paste
      // error from the sibling matcher that made failures misleading.
      message: () => 'toFailParse can only be used with string values',
      pass: false,
    }
  }
  try {
    const tree = parser.parse(received)
    // Walk the tree looking for error nodes; stop at the first one found.
    let hasErrors = false
    tree.iterate({
      enter(n) {
        if (n.type.isError) {
          hasErrors = true
          return false // don't descend further — one error is enough
        }
      },
    })
    if (hasErrors) {
      return {
        message: () => `Expected input to fail parsing, and it did.`,
        pass: true,
      }
    } else {
      const actual = treeToString(tree, received)
      return {
        message: () => `Expected input to fail parsing, but it parsed successfully:\n${actual}`,
        pass: false,
      }
    }
  } catch (error) {
    // NOTE(review): a thrown parse error arguably also means "failed to
    // parse"; currently treated as a matcher failure — confirm intent.
    return {
      message: () => `Parsing threw an error: ${(error as Error).message}`,
      pass: false,
    }
  }
},
})
const treeToString = (tree: Tree, input: string): string => {