import { ExternalTokenizer, InputStream, Stack } from '@lezer/lr'
import { Identifier, AssignableIdentifier, Word, IdentifierBeforeDot, Do } from './shrimp.terms'

// doobie doobie do (we need the `do` keyword to know when we're defining params)
// Returns the `Do` term for the text "do" and -1 for every other identifier.
export function specializeKeyword(ident: string) {
  return ident === 'do' ? Do : -1
}

// tell the dotGet searcher about builtin globals
export const globals: string[] = []

// Replace the contents of `globals` in place (the array identity is kept).
// The spread arguments are collected before `splice` mutates the array, so
// calling `setGlobals(globals)` with the exported array itself does not wipe it.
export const setGlobals = (newGlobals: string[]) => {
  globals.splice(0, globals.length, ...newGlobals)
}

// Emits Word, Identifier, AssignableIdentifier or IdentifierBeforeDot.
// The only chars that can't be words are whitespace, newlines, closing parens,
// closing brackets, and EOF (see isWordChar).
export const tokenizer = new ExternalTokenizer(
  (input: InputStream, stack: Stack) => {
    const ch = getFullCodePoint(input, 0)
    if (!isWordChar(ch)) return

    // Don't consume things that start with digits - let Number token handle it
    if (isDigit(ch)) return

    // Don't consume things that start with - or + followed by a digit (negative/positive numbers)
    if ((ch === 45 /* - */ || ch === 43) /* + */ && isDigit(input.peek(1))) return

    const isValidStart = isLowercaseLetter(ch) || isEmojiOrUnicode(ch)
    const canBeWord = stack.canShift(Word)

    // Consume all word characters, tracking if it remains a valid identifier
    const { pos, isValidIdentifier, stoppedAtDot } = consumeWordToken(
      input,
      isValidStart,
      canBeWord
    )

    // Check if we should emit IdentifierBeforeDot for property access
    if (stoppedAtDot) {
      const dotGetToken = checkForDotGet(input, stack, pos)

      if (dotGetToken) {
        input.advance(pos)
        input.acceptToken(dotGetToken)
      } else {
        // Not in scope - continue consuming the dot as part of the word
        const afterDot = consumeRestOfWord(input, pos + 1, canBeWord)
        input.advance(afterDot)
        input.acceptToken(Word)
      }

      return
    }

    // Advance past the token we consumed
    input.advance(pos)

    // Choose which token to emit
    if (isValidIdentifier) {
      const token = chooseIdentifierToken(input, stack)
      input.acceptToken(token)
    } else {
      input.acceptToken(Word)
    }
  },
  { contextual: true }
)

// Build identifier text from the first `length` code units of the input stream.
// Code units are appended one by one; a surrogate pair appended as two
// consecutive units yields the same string as appending the pair together, so
// emoji need no special handling here. Stops early if EOF (-1) is reached.
const buildIdentifierText = (input: InputStream, length: number): string => {
  let text = ''

  for (let i = 0; i < length; i++) {
    const codeUnit = input.peek(i)
    if (codeUnit === -1) break
    text += String.fromCharCode(codeUnit)
  }

  return text
}

// Shared stop rule for word consumption: true when the character `ch` at
// offset `pos` ends the current word.
//  - any non-word character ends the word
//  - context-aware termination: when a Word can be shifted, a semicolon/colon
//    ends the word if the character after it is not a word character.
//    This allows `hello; 2` to parse correctly while `hello;world` stays as one word
const isWordBoundary = (input: InputStream, pos: number, ch: number, canBeWord: boolean): boolean => {
  if (!isWordChar(ch)) return true

  if (canBeWord && (ch === 59 /* ; */ || ch === 58) /* : */) {
    const nextCh = getFullCodePoint(input, pos + 1)
    if (!isWordChar(nextCh)) return true
  }

  return false
}

// Consume word characters, tracking if it remains a valid identifier
// Returns the position after consuming, whether it's a valid identifier, and if we stopped at a dot
const consumeWordToken = (
  input: InputStream,
  isValidStart: boolean,
  canBeWord: boolean
): { pos: number; isValidIdentifier: boolean; stoppedAtDot: boolean } => {
  // The first character was already vetted by the caller; start right after it
  let pos = getCharSize(getFullCodePoint(input, 0))
  let isValidIdentifier = isValidStart
  let stoppedAtDot = false

  while (true) {
    const ch = getFullCodePoint(input, pos)

    // Stop at dot if we have a valid identifier (might be property access)
    if (ch === 46 /* . */ && isValidIdentifier) {
      stoppedAtDot = true
      break
    }

    // Stop at a non-word character or a terminating semicolon/colon
    if (isWordBoundary(input, pos, ch, canBeWord)) break

    // Track identifier validity: must be lowercase, digit, dash, question mark, or emoji/unicode
    const isIdentifierChar =
      isLowercaseLetter(ch) ||
      isDigit(ch) ||
      ch === 45 /* - */ ||
      ch === 63 /* ? */ ||
      isEmojiOrUnicode(ch)

    if (!isIdentifierChar) {
      // If a Word cannot be shifted here, end the token before this character
      if (!canBeWord) break
      isValidIdentifier = false
    }

    pos += getCharSize(ch)
  }

  return { pos, isValidIdentifier, stoppedAtDot }
}

// Consume the rest of a word after we've decided not to treat a dot as DotGet
// Used when we have "file.txt" - we already consumed "file", now consume ".txt"
// Returns the offset just past the last consumed character.
const consumeRestOfWord = (input: InputStream, startPos: number, canBeWord: boolean): number => {
  let pos = startPos

  while (true) {
    const ch = getFullCodePoint(input, pos)
    if (isWordBoundary(input, pos, ch, canBeWord)) break
    pos += getCharSize(ch)
  }

  return pos
}

// Check if this identifier is in scope (for property access detection)
// Returns IdentifierBeforeDot token if in scope, null otherwise
const checkForDotGet = (input: InputStream, stack: Stack, pos: number): number | null => {
  const identifierText = buildIdentifierText(input, pos)
  const context = stack.context as { scope: { has(name: string): boolean } } | undefined

  // If identifier is in scope, this is property access (e.g., obj.prop)
  // If not in scope, it should be consumed as a Word (e.g., file.txt)
  const inScope = context?.scope.has(identifierText) || globals.includes(identifierText)
  return inScope ? IdentifierBeforeDot : null
}

// First characters of the compound assignment operators: +=, -=, *=, /=, %=
const compoundAssignmentOperators: readonly number[] = [
  43 /* + */,
  45 /* - */,
  42 /* * */,
  47 /* / */,
  37 /* % */,
]

// Decide between AssignableIdentifier and Identifier using grammar state + peek-ahead
// Called after the identifier has been advanced past, so offset 0 is the first
// character following it.
const chooseIdentifierToken = (input: InputStream, stack: Stack): number => {
  const canAssignable = stack.canShift(AssignableIdentifier)
  const canRegular = stack.canShift(Identifier)

  // Only one option is valid - use it
  if (canAssignable && !canRegular) return AssignableIdentifier
  if (canRegular && !canAssignable) return Identifier

  // Both possible (ambiguous context) - peek ahead for '=' to disambiguate
  // This happens at statement start where both `x = 5` (assign) and `echo x` (call) are valid
  let peekPos = 0
  // Space, tab and \r are all single code units, so stepping by one is exact
  while (isWhiteSpace(getFullCodePoint(input, peekPos))) peekPos++

  const nextCh = getFullCodePoint(input, peekPos)
  const nextCh2 = getFullCodePoint(input, peekPos + 1)

  // Check for compound assignment operators: +=, -=, *=, /=, %=
  if (compoundAssignmentOperators.includes(nextCh) && nextCh2 === 61 /* = */) {
    // Found compound operator, check if it's followed by whitespace
    const charAfterOp = getFullCodePoint(input, peekPos + 2)
    if (isWhiteSpace(charAfterOp) || charAfterOp === -1 /* EOF */) {
      return AssignableIdentifier
    }
  }

  if (nextCh === 61 /* = */) {
    // Found '=', but check if it's followed by whitespace
    // If '=' is followed by non-whitespace (like '=cool*'), it won't be tokenized as Eq
    // In that case, this should be Identifier (for function call), not AssignableIdentifier
    // (nextCh2 is the character right after the '=')
    if (isWhiteSpace(nextCh2) || nextCh2 === -1 /* EOF */) {
      return AssignableIdentifier
    }
  }

  return Identifier
}

// Character classification helpers

// Space, tab or carriage return. Newline is deliberately not included here;
// isWordChar checks it separately.
const isWhiteSpace = (ch: number): boolean => {
  return ch === 32 /* space */ || ch === 9 /* tab */ || ch === 13 /* \r */
}

// Everything except whitespace, newline, `)`, `]` and EOF can be part of a word
const isWordChar = (ch: number): boolean => {
  return (
    !isWhiteSpace(ch) &&
    ch !== 10 /* \n */ &&
    ch !== 41 /* ) */ &&
    ch !== 93 /* ] */ &&
    ch !== -1 /* EOF */
  )
}

const isLowercaseLetter = (ch: number): boolean => {
  return ch >= 97 && ch <= 122 // a-z
}

const isDigit = (ch: number): boolean => {
  return ch >= 48 && ch <= 57 // 0-9
}

// Read the code point at `pos`, combining a high+low surrogate pair into one
// full code point. An unpaired surrogate (or any other unit, including -1 for
// EOF) is returned unchanged.
const getFullCodePoint = (input: InputStream, pos: number): number => {
  const ch = input.peek(pos)

  // Check if this is a high surrogate (0xD800-0xDBFF)
  if (ch >= 0xd800 && ch <= 0xdbff) {
    const low = input.peek(pos + 1)
    // Check if next is low surrogate (0xDC00-0xDFFF)
    if (low >= 0xdc00 && low <= 0xdfff) {
      // Combine surrogate pair into full code point
      return 0x10000 + ((ch & 0x3ff) << 10) + (low & 0x3ff)
    }
  }

  return ch
}

// Inclusive [first, last] code point ranges accepted by isEmojiOrUnicode
const emojiOrUnicodeRanges: ReadonlyArray<readonly [number, number]> = [
  [0x1f600, 0x1f64f], // Basic Emoticons
  [0x1f300, 0x1f5ff], // Miscellaneous Symbols and Pictographs
  [0x1f680, 0x1f6ff], // Transport and Map Symbols
  [0x1f1e6, 0x1f1ff], // Regional Indicator Symbols (flags)
  [0x2600, 0x26ff], // Miscellaneous Symbols (hearts, stars, weather)
  [0x2700, 0x27bf], // Dingbats (scissors, pencils, etc)
  [0x1f900, 0x1f9ff], // Supplemental Symbols and Pictographs (newer emojis)
  [0x1fa70, 0x1faff], // Symbols and Pictographs Extended-A (newest emojis)
  [0x1f018, 0x1f270], // Various Asian Characters with emoji presentation
  [0xfe00, 0xfe0f], // Variation Selectors (for emoji presentation)
  [0x238c, 0x2454], // Additional miscellaneous items
  [0x20d0, 0x20ff], // Combining Diacritical Marks for Symbols
  [0x00a0, 0x00ff], // Latin-1 Supplement (includes ², ³, ¹ and other special chars)
  [0x0370, 0x03ff], // Greek and Coptic (U+0370-U+03FF)
  [0x1d400, 0x1d7ff], // Mathematical Alphanumeric Symbols (U+1D400-U+1D7FF)
  [0x2200, 0x22ff], // Mathematical Operators (U+2200-U+22FF)
  [0x2070, 0x209f], // Superscripts and Subscripts (U+2070-U+209F)
  [0x2190, 0x21ff], // Arrows (U+2190-U+21FF)
  [0x3040, 0x309f], // Hiragana (U+3040-U+309F)
  [0x30a0, 0x30ff], // Katakana (U+30A0-U+30FF)
  [0x4e00, 0x9fff], // CJK Unified Ideographs (U+4E00-U+9FFF)
]

// True when the code point falls in one of the emoji/symbol/script ranges above
const isEmojiOrUnicode = (ch: number): boolean => {
  for (const [first, last] of emojiOrUnicodeRanges) {
    if (ch >= first && ch <= last) return true
  }
  return false
}

const getCharSize = (ch: number) => (ch > 0xffff ? 2 : 1) // emoji takes 2 UTF-16 code units