- Add IdentifierBeforeDot token emitted when identifier immediately precedes '.'
- Move DotGet into @skip {} block using IdentifierBeforeDot
- Prevents 'basename . prop' from parsing as DotGet
- Allows 'basename.prop' to work as expected when identifier is in scope
- Fixes test: 'a word can be contained in parens'
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
139 lines
4.6 KiB
TypeScript
139 lines
4.6 KiB
TypeScript
import { ExternalTokenizer, InputStream, Stack } from '@lezer/lr'
|
|
import { Identifier, Word, IdentifierBeforeDot } from './shrimp.terms'
|
|
import type { Scope } from './scopeTracker'
|
|
|
|
// The only chars that can't be words are whitespace, closing parens, and EOF (see isWordChar).
|
|
|
|
/**
 * External tokenizer that scans one run of word characters and emits
 * `Identifier`, `Word`, or `IdentifierBeforeDot`.
 *
 * Behavior:
 * - Emits nothing when the first code point is not a word character
 *   (whitespace, `)`, or EOF).
 * - The run is an `Identifier` when it starts with a lowercase letter or an
 *   emoji and every following char is a lowercase letter, digit, `-`, or emoji;
 *   otherwise it is a `Word`.
 * - When a `.` follows a still-valid identifier and the text before the dot is
 *   present in the `Scope` held in `stack.context`, only the identifier is
 *   consumed and `IdentifierBeforeDot` is emitted so the grammar can parse the
 *   property access. If the name is not in scope the dot is consumed as part
 *   of a `Word`.
 * - When `Word` cannot be shifted in the current parse state, scanning stops
 *   at the first character that is not valid inside an identifier.
 */
export const tokenizer = new ExternalTokenizer(
  (input: InputStream, stack: Stack) => {
    let ch = getFullCodePoint(input, 0)
    // Deliberately no debug logging here: `String.fromCodePoint(ch)` throws a
    // RangeError when `ch` is -1 (EOF), which crashed the tokenizer at end of input.
    if (!isWordChar(ch)) return

    // `pos` is an offset in UTF-16 code units from the token start.
    let pos = getCharSize(ch)
    let isValidIdentifier = isLowercaseLetter(ch) || isEmoji(ch)
    const canBeWord = stack.canShift(Word)

    while (true) {
      ch = getFullCodePoint(input, pos)

      // Check for dot and scope - property access detection
      if (ch === 46 /* . */ && isValidIdentifier) {
        // Rebuild the identifier text from the code units scanned so far.
        // Appending unit by unit keeps surrogate pairs intact, so emoji
        // identifiers come out as the same string they were declared with.
        let identifierText = ''
        for (let i = 0; i < pos; i++) {
          const codeUnit = input.peek(i)
          if (codeUnit === -1) break
          identifierText += String.fromCharCode(codeUnit)
        }

        const scope = stack.context as Scope | undefined

        if (scope?.has(identifierText)) {
          // In scope - stop here, let grammar parse property access
          input.advance(pos)
          input.acceptToken(IdentifierBeforeDot)
          return
        }
        // Not in scope - continue consuming as Word (fall through)
      }

      if (!isWordChar(ch)) break

      // Certain characters might end a word or identifier if they are followed by whitespace.
      // This allows things like `a = hello; 2` or `if x: y` to parse correctly.
      if (canBeWord && (ch === 59 /* ; */ || ch === 58 /* : */)) {
        // `;` and `:` are single code units, so the next char is at pos + 1.
        const nextCh = getFullCodePoint(input, pos + 1)
        if (!isWordChar(nextCh)) break
      }

      // Track identifier validity
      if (!isLowercaseLetter(ch) && !isDigit(ch) && ch !== 45 /* - */ && !isEmoji(ch)) {
        // Only identifiers are acceptable here, so end the token before this char.
        if (!canBeWord) break
        isValidIdentifier = false
      }

      pos += getCharSize(ch)
    }

    input.advance(pos)
    input.acceptToken(isValidIdentifier ? Identifier : Word)
  },
  { contextual: true }
)
|
|
|
|
/** Returns true when `ch` is a space, tab, line feed, or carriage return. */
const isWhiteSpace = (ch: number): boolean => {
  switch (ch) {
    case 9: // tab
    case 10: // \n
    case 13: // \r
    case 32: // space
      return true
    default:
      return false
  }
}
|
|
|
|
/**
 * Returns true when `ch` may appear inside a word token: anything except
 * EOF (-1), a closing paren, or whitespace.
 */
const isWordChar = (ch: number): boolean => {
  if (ch === -1) return false // EOF
  if (ch === 41 /* ) */) return false
  return !isWhiteSpace(ch)
}
|
|
|
|
/** Returns true when `ch` is the code point of an ASCII lowercase letter (a-z). */
const isLowercaseLetter = (ch: number): boolean => {
  const lowerA = 97
  const lowerZ = 122
  return ch >= lowerA && ch <= lowerZ
}
|
|
|
|
/** Returns true when `ch` is the code point of an ASCII decimal digit (0-9). */
const isDigit = (ch: number): boolean => {
  const zero = 48
  const nine = 57
  return ch >= zero && ch <= nine
}
|
|
|
|
/**
 * Reads the code point that starts `pos` code units ahead of the stream position.
 *
 * A high surrogate immediately followed by a low surrogate is combined into
 * one supplementary code point; in every other case the single value returned
 * by `input.peek(pos)` is passed through unchanged.
 */
const getFullCodePoint = (input: InputStream, pos: number): number => {
  const lead = input.peek(pos)

  // Anything outside 0xD800-0xDBFF cannot start a surrogate pair.
  const leadIsHighSurrogate = lead >= 0xd800 && lead <= 0xdbff
  if (!leadIsHighSurrogate) return lead

  const trail = input.peek(pos + 1)

  // An unpaired high surrogate is returned as-is.
  const trailIsLowSurrogate = trail >= 0xdc00 && trail <= 0xdfff
  if (!trailIsLowSurrogate) return lead

  // Standard UTF-16 decoding of a surrogate pair.
  return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff)
}
|
|
|
|
// Inclusive [first, last] code point ranges that isEmoji accepts.
const EMOJI_RANGES: ReadonlyArray<readonly [number, number]> = [
  [0x1f600, 0x1f64f], // Basic Emoticons
  [0x1f300, 0x1f5ff], // Miscellaneous Symbols and Pictographs
  [0x1f680, 0x1f6ff], // Transport and Map Symbols
  [0x1f1e6, 0x1f1ff], // Regional Indicator Symbols (flags)
  [0x2600, 0x26ff], // Miscellaneous Symbols (hearts, stars, weather)
  [0x2700, 0x27bf], // Dingbats (scissors, pencils, etc)
  [0x1f900, 0x1f9ff], // Supplemental Symbols and Pictographs (newer emojis)
  [0x1fa70, 0x1faff], // Symbols and Pictographs Extended-A (newest emojis)
  [0x1f018, 0x1f270], // Various Asian Characters with emoji presentation
  [0xfe00, 0xfe0f], // Variation Selectors (for emoji presentation)
  [0x238c, 0x2454], // Additional miscellaneous items
  [0x20d0, 0x20ff], // Combining Diacritical Marks for Symbols
]

/** Returns true when the code point `ch` lies in one of the ranges in EMOJI_RANGES. */
const isEmoji = (ch: number): boolean => {
  for (const [first, last] of EMOJI_RANGES) {
    if (ch >= first && ch <= last) return true
  }
  return false
}
|
|
|
|
/**
 * Number of UTF-16 code units occupied by code point `ch`: 2 for code points
 * above 0xFFFF (e.g. most emoji), 1 for everything else.
 */
const getCharSize = (ch: number): number => {
  if (ch > 0xffff) return 2
  return 1
}
|