import { ExternalTokenizer, InputStream, Stack } from '@lezer/lr' import { Identifier, Word, IdentifierBeforeDot } from './shrimp.terms' import type { Scope } from './scopeTracker' // The only chars that can't be words are whitespace, apostrophes, closing parens, and EOF. export const tokenizer = new ExternalTokenizer( (input: InputStream, stack: Stack) => { let ch = getFullCodePoint(input, 0) console.log(`🌭 checking char ${String.fromCodePoint(ch)}`) if (!isWordChar(ch)) return let pos = getCharSize(ch) let isValidIdentifier = isLowercaseLetter(ch) || isEmoji(ch) const canBeWord = stack.canShift(Word) while (true) { ch = getFullCodePoint(input, pos) // Check for dot and scope - property access detection if (ch === 46 /* . */ && isValidIdentifier) { // Build identifier text by peeking character by character let identifierText = '' for (let i = 0; i < pos; i++) { const charCode = input.peek(i) if (charCode === -1) break // Handle surrogate pairs for emoji if (charCode >= 0xd800 && charCode <= 0xdbff && i + 1 < pos) { const low = input.peek(i + 1) if (low >= 0xdc00 && low <= 0xdfff) { identifierText += String.fromCharCode(charCode, low) i++ // Skip the low surrogate continue } } identifierText += String.fromCharCode(charCode) } const scope = stack.context as Scope | undefined if (scope?.has(identifierText)) { // In scope - stop here, let grammar parse property access input.advance(pos) input.acceptToken(IdentifierBeforeDot) return } // Not in scope - continue consuming as Word (fall through) } if (!isWordChar(ch)) break // Certain characters might end a word or identifier if they are followed by whitespace. // This allows things like `a = hello; 2` of if `x: y` to parse correctly. if (canBeWord && (ch === 59 /* ; */ || ch === 58) /* : */) { const nextCh = getFullCodePoint(input, pos + 1) if (!isWordChar(nextCh)) break } // Track identifier validity if (!isLowercaseLetter(ch) && !isDigit(ch) && ch !== 45 && !isEmoji(ch)) { if (!canBeWord) break isValidIdentifier = false } pos += getCharSize(ch) } input.advance(pos) input.acceptToken(isValidIdentifier ? Identifier : Word) }, { contextual: true } ) const isWhiteSpace = (ch: number): boolean => { return ch === 32 /* space */ || ch === 10 /* \n */ || ch === 9 /* tab */ || ch === 13 /* \r */ } const isWordChar = (ch: number): boolean => { const closingParen = ch === 41 /* ) */ const eof = ch === -1 return !isWhiteSpace(ch) && !closingParen && !eof } const isLowercaseLetter = (ch: number): boolean => { return ch >= 97 && ch <= 122 // a-z } const isDigit = (ch: number): boolean => { return ch >= 48 && ch <= 57 // 0-9 } const getFullCodePoint = (input: InputStream, pos: number): number => { const ch = input.peek(pos) // Check if this is a high surrogate (0xD800-0xDBFF) if (ch >= 0xd800 && ch <= 0xdbff) { const low = input.peek(pos + 1) // Check if next is low surrogate (0xDC00-0xDFFF) if (low >= 0xdc00 && low <= 0xdfff) { // Combine surrogate pair into full code point return 0x10000 + ((ch & 0x3ff) << 10) + (low & 0x3ff) } } return ch // Single code unit } const isEmoji = (ch: number): boolean => { return ( // Basic Emoticons (ch >= 0x1f600 && ch <= 0x1f64f) || // Miscellaneous Symbols and Pictographs (ch >= 0x1f300 && ch <= 0x1f5ff) || // Transport and Map Symbols (ch >= 0x1f680 && ch <= 0x1f6ff) || // Regional Indicator Symbols (flags) (ch >= 0x1f1e6 && ch <= 0x1f1ff) || // Miscellaneous Symbols (hearts, stars, weather) (ch >= 0x2600 && ch <= 0x26ff) || // Dingbats (scissors, pencils, etc) (ch >= 0x2700 && ch <= 0x27bf) || // Supplemental Symbols and Pictographs (newer emojis) (ch >= 0x1f900 && ch <= 0x1f9ff) || // Symbols and Pictographs Extended-A (newest emojis) (ch >= 0x1fa70 && ch <= 0x1faff) || // Various Asian Characters with emoji presentation (ch >= 0x1f018 && ch <= 0x1f270) || // Variation Selectors (for emoji presentation) (ch >= 0xfe00 && ch <= 0xfe0f) || // Additional miscellaneous items (ch >= 0x238c && ch <= 0x2454) || // Combining Diacritical Marks for Symbols (ch >= 0x20d0 && ch <= 0x20ff) ) } const getCharSize = (ch: number) => (ch > 0xffff ? 2 : 1) // emoji takes 2 UTF-16 code units