From 7cf7ac3703bfe1fa924d9883710a580625e031e0 Mon Sep 17 00:00:00 2001
From: Chris Wanstrath
Date: Sun, 26 Oct 2025 13:02:05 -0700
Subject: [PATCH] allow more unicode in variable names

---
Review notes (kept below the three-dash line so they are not part of the
commit message; the diff content itself is unchanged):

 * Test group titles: the new describe() groups are labelled
   "(not currently supported)" / "(currently supported)", but every test in
   them asserts that the symbol parses as an Identifier, and the tokenizer
   hunk in this same patch adds the matching ranges. The labels look stale
   once this patch is applied; consider dropping the parenthesised suffixes
   in a follow-up.
 * Latin-1 Supplement range (0x00a0-0x00ff): the check accepts every code
   point in the block, which includes U+00A0 NO-BREAK SPACE and U+00AD SOFT
   HYPHEN (both invisible) as well as U+00D7 and U+00F7. TODO confirm this is
   intended; a narrower range may be safer.
 * Case rule: the identifier check still requires isLowercaseLetter() for
   ASCII, while the new Latin-1, Greek and Mathematical Alphanumeric ranges
   also contain uppercase letters. Presumably acceptable, but verify against
   the language's identifier rules.
 * The mathematical pi (U+1D70B) case is tested twice: once under
   'Identifier' and once under 'Mathematical Alphanumeric Symbols'.

 src/parser/tests/basics.test.ts | 198 ++++++++++++++++++++++++++++++++
 src/parser/tokenizer.ts         |  28 ++++-
 2 files changed, 221 insertions(+), 5 deletions(-)

diff --git a/src/parser/tests/basics.test.ts b/src/parser/tests/basics.test.ts
index 3299ff3..1716240 100644
--- a/src/parser/tests/basics.test.ts
+++ b/src/parser/tests/basics.test.ts
@@ -30,6 +30,204 @@ describe('Identifier', () => {
       FunctionCallOrIdentifier
         Identifier moo-😊-34`)
   })
+
+  test('parses mathematical unicode symbols like 𝜋 as identifiers', () => {
+    expect('𝜋').toMatchTree(`
+      FunctionCallOrIdentifier
+        Identifier 𝜋`)
+  })
+})
+
+describe('Unicode Symbol Support', () => {
+  describe('Emoji (currently supported)', () => {
+    test('Basic Emoticons (U+1F600-U+1F64F)', () => {
+      expect('😀').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier 😀`)
+
+      expect('😊-counter').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier 😊-counter`)
+    })
+
+    test('Miscellaneous Symbols and Pictographs (U+1F300-U+1F5FF)', () => {
+      expect('🌍').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier 🌍`)
+
+      expect('🔥-handler').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier 🔥-handler`)
+    })
+
+    test('Transport and Map Symbols (U+1F680-U+1F6FF)', () => {
+      expect('🚀').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier 🚀`)
+
+      expect('🚀-launch').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier 🚀-launch`)
+    })
+
+    test('Regional Indicator Symbols / Flags (U+1F1E6-U+1F1FF)', () => {
+      // Note: Flags are typically two regional indicators combined
+      expect('🇺').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier 🇺`)
+    })
+
+    test('Supplemental Symbols and Pictographs (U+1F900-U+1F9FF)', () => {
+      expect('🤖').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier 🤖`)
+
+      expect('🦀-lang').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier 🦀-lang`)
+    })
+
+    test('Dingbats (U+2700-U+27BF)', () => {
+      expect('✂').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier ✂`)
+
+      expect('✨-magic').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier ✨-magic`)
+    })
+
+    test('Miscellaneous Symbols (U+2600-U+26FF)', () => {
+      expect('⚡').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier ⚡`)
+
+      expect('☀-bright').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier ☀-bright`)
+    })
+  })
+
+  describe('Greek Letters (not currently supported)', () => {
+    test('Greek lowercase alpha α (U+03B1)', () => {
+      expect('α').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier α`)
+    })
+
+    test('Greek lowercase beta β (U+03B2)', () => {
+      expect('β').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier β`)
+    })
+
+    test('Greek lowercase lambda λ (U+03BB)', () => {
+      expect('λ').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier λ`)
+    })
+
+    test('Greek lowercase pi π (U+03C0)', () => {
+      // Note: This is different from mathematical pi 𝜋
+      expect('π').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier π`)
+    })
+  })
+
+  describe('Mathematical Alphanumeric Symbols (not currently supported)', () => {
+    test('Mathematical italic small pi 𝜋 (U+1D70B)', () => {
+      expect('𝜋').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier 𝜋`)
+    })
+
+    test('Mathematical bold small x 𝐱 (U+1D431)', () => {
+      expect('𝐱').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier 𝐱`)
+    })
+
+    test('Mathematical script capital F 𝓕 (U+1D4D5)', () => {
+      expect('𝓕').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier 𝓕`)
+    })
+  })
+
+  describe('Mathematical Operators (not currently supported)', () => {
+    test('Infinity symbol ∞ (U+221E)', () => {
+      expect('∞').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier ∞`)
+    })
+
+    test('Sum symbol ∑ (U+2211)', () => {
+      expect('∑').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier ∑`)
+    })
+
+    test('Integral symbol ∫ (U+222B)', () => {
+      expect('∫').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier ∫`)
+    })
+  })
+
+  describe('Superscripts and Subscripts (not currently supported)', () => {
+    test('Superscript two ² (U+00B2)', () => {
+      expect('x²').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier x²`)
+    })
+
+    test('Subscript two ₂ (U+2082)', () => {
+      expect('h₂o').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier h₂o`)
+    })
+  })
+
+  describe('Arrows (not currently supported)', () => {
+    test('Rightward arrow → (U+2192)', () => {
+      expect('→').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier →`)
+    })
+
+    test('Leftward arrow ← (U+2190)', () => {
+      expect('←').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier ←`)
+    })
+
+    test('Double rightward arrow ⇒ (U+21D2)', () => {
+      expect('⇒').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier ⇒`)
+    })
+  })
+
+  describe('CJK Symbols (not currently supported)', () => {
+    test('Hiragana あ (U+3042)', () => {
+      expect('あ').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier あ`)
+    })
+
+    test('Katakana カ (U+30AB)', () => {
+      expect('カ').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier カ`)
+    })
+
+    test('CJK Unified Ideograph 中 (U+4E2D)', () => {
+      expect('中').toMatchTree(`
+        FunctionCallOrIdentifier
+          Identifier 中`)
+    })
+  })
 })
 
 describe('Parentheses', () => {
diff --git a/src/parser/tokenizer.ts b/src/parser/tokenizer.ts
index 8963ffb..8df852a 100644
--- a/src/parser/tokenizer.ts
+++ b/src/parser/tokenizer.ts
@@ -14,7 +14,7 @@ export const tokenizer = new ExternalTokenizer(
     // Don't consume things that start with - or + followed by a digit (negative/positive numbers)
     if ((ch === 45 /* - */ || ch === 43) /* + */ && isDigit(input.peek(1))) return
 
-    const isValidStart = isLowercaseLetter(ch) || isEmoji(ch)
+    const isValidStart = isLowercaseLetter(ch) || isEmojiOrUnicode(ch)
     const canBeWord = stack.canShift(Word)
 
     // Consume all word characters, tracking if it remains a valid identifier
@@ -106,8 +106,8 @@ const consumeWordToken = (
       if (!isWordChar(nextCh)) break
     }
 
-    // Track identifier validity: must be lowercase, digit, dash, or emoji
-    if (!isLowercaseLetter(ch) && !isDigit(ch) && ch !== 45 /* - */ && !isEmoji(ch)) {
+    // Track identifier validity: must be lowercase, digit, dash, or emoji/unicode
+    if (!isLowercaseLetter(ch) && !isDigit(ch) && ch !== 45 /* - */ && !isEmojiOrUnicode(ch)) {
       if (!canBeWord) break
       isValidIdentifier = false
     }
@@ -217,7 +217,7 @@ const getFullCodePoint = (input: InputStream, pos: number): number => {
   return ch
 }
 
-const isEmoji = (ch: number): boolean => {
+const isEmojiOrUnicode = (ch: number): boolean => {
   return (
     // Basic Emoticons
     (ch >= 0x1f600 && ch <= 0x1f64f) ||
@@ -242,7 +242,25 @@ const isEmoji = (ch: number): boolean => {
     // Additional miscellaneous items
     (ch >= 0x238c && ch <= 0x2454) ||
     // Combining Diacritical Marks for Symbols
-    (ch >= 0x20d0 && ch <= 0x20ff)
+    (ch >= 0x20d0 && ch <= 0x20ff) ||
+    // Latin-1 Supplement (includes ², ³, ¹ and other special chars)
+    (ch >= 0x00a0 && ch <= 0x00ff) ||
+    // Greek and Coptic (U+0370-U+03FF)
+    (ch >= 0x0370 && ch <= 0x03ff) ||
+    // Mathematical Alphanumeric Symbols (U+1D400-U+1D7FF)
+    (ch >= 0x1d400 && ch <= 0x1d7ff) ||
+    // Mathematical Operators (U+2200-U+22FF)
+    (ch >= 0x2200 && ch <= 0x22ff) ||
+    // Superscripts and Subscripts (U+2070-U+209F)
+    (ch >= 0x2070 && ch <= 0x209f) ||
+    // Arrows (U+2190-U+21FF)
+    (ch >= 0x2190 && ch <= 0x21ff) ||
+    // Hiragana (U+3040-U+309F)
+    (ch >= 0x3040 && ch <= 0x309f) ||
+    // Katakana (U+30A0-U+30FF)
+    (ch >= 0x30a0 && ch <= 0x30ff) ||
+    // CJK Unified Ideographs (U+4E00-U+9FFF)
+    (ch >= 0x4e00 && ch <= 0x9fff)
   )
 }
 