349 lines
11 KiB
TypeScript
349 lines
11 KiB
TypeScript
import { ExternalTokenizer, InputStream, Stack } from '@lezer/lr'
|
|
import { Identifier, AssignableIdentifier, Word, IdentifierBeforeDot, Do, CurlyString } from './shrimp.terms'
|
|
|
|
// Keyword specializer: the grammar needs `do` as its own token so it can tell
// when parameters are being defined. Returns the Do term for the exact text
// `do`, and -1 (no specialization) for every other identifier.
export function specializeKeyword(ident: string) {
  if (ident === 'do') return Do
  return -1
}
|
|
|
|
// Names of builtin globals. The dot-get check treats `name.` as property
// access when `name` appears in this list (or in the current scope).
export const globals: string[] = []

// Replace the contents of `globals` in place (the array identity is kept, so
// existing references see the new names).
//   newGlobals - either a list of names, or an object whose own enumerable
//                keys are the names.
export const setGlobals = (newGlobals: string[] | Record<string, any>) => {
  // Snapshot the names BEFORE clearing: if the caller hands us `globals`
  // itself, clearing first would empty the very array we are about to read.
  const names: string[] = Array.isArray(newGlobals) ? [...newGlobals] : Object.keys(newGlobals)

  globals.length = 0
  // Push one at a time rather than `push(...names)`, which can exceed the
  // engine's argument-count limit for very large lists.
  for (const name of names) globals.push(name)
}
|
|
|
|
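// Word characters are everything except whitespace (space, tab, \r, \n), `)`, `]`, and EOF — see isWordChar.
// NOTE(review): this comment previously listed apostrophes as excluded, but isWordChar does not reject them; confirm intent.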
|
|
|
|
// External tokenizer for the text at the current input position. Depending on
// what it finds it accepts one of CurlyString, IdentifierBeforeDot, Word,
// Identifier, or AssignableIdentifier — or accepts nothing, leaving the input
// to other tokens.
export const tokenizer = new ExternalTokenizer(
  (input: InputStream, stack: Stack) => {
    const first = getFullCodePoint(input, 0)

    // `{` starts a curly string; that helper decides whether to accept anything
    if (first === 123 /* { */) return consumeCurlyString(input, stack)

    // Not a word at all, or starts with a digit (the Number token handles those)
    if (!isWordChar(first) || isDigit(first)) return

    // A sign immediately followed by a digit is a signed number, not a word
    const isSign = first === 45 /* - */ || first === 43 /* + */
    if (isSign && isDigit(input.peek(1))) return

    const canBeWord = stack.canShift(Word)
    const scan = consumeWordToken(input, isIdentStart(first), canBeWord)

    // Common case: the scan did not stop at a dot
    if (!scan.stoppedAtDot) {
      input.advance(scan.pos)
      // chooseIdentifierToken peeks relative to the advanced position, so it
      // must run after the advance above
      input.acceptToken(scan.isValidIdentifier ? chooseIdentifierToken(input, stack) : Word)
      return
    }

    // Stopped at a dot: property access if the identifier is known ...
    const dotGetToken = checkForDotGet(input, stack, scan.pos)
    if (dotGetToken) {
      input.advance(scan.pos)
      input.acceptToken(dotGetToken)
      return
    }

    // ... otherwise the dot is just part of a word (e.g. file.txt); keep
    // consuming from the character after the dot
    input.advance(consumeRestOfWord(input, scan.pos + 1, canBeWord))
    input.acceptToken(Word)
  },
  { contextual: true }
)
|
|
|
|
// Read the first `length` UTF-16 code units from the input (stopping early at
// EOF, which peek reports as -1) and return them as a string.
// Code units are copied verbatim and in order, so the two halves of a
// surrogate pair (emoji) end up adjacent and the pair stays intact.
const buildIdentifierText = (input: InputStream, length: number): string => {
  let text = ''
  let offset = 0

  while (offset < length) {
    const unit = input.peek(offset)
    if (unit === -1) break

    text += String.fromCharCode(unit)
    offset++
  }

  return text
}
|
|
|
|
// Scan forward over word characters without advancing the input.
// Returns:
//   pos               - offset (in UTF-16 code units) just past the scanned text
//   isValidIdentifier - whether everything scanned is still a valid identifier
//   stoppedAtDot      - true when the scan paused at a `.` after a valid
//                       identifier (possible property access)
const consumeWordToken = (
  input: InputStream,
  isValidStart: boolean,
  canBeWord: boolean
): { pos: number; isValidIdentifier: boolean; stoppedAtDot: boolean } => {
  // The first character is taken as-is; scanning starts right after it
  let offset = getCharSize(getFullCodePoint(input, 0))
  let stillIdentifier = isValidStart

  for (;;) {
    const code = getFullCodePoint(input, offset)

    // A dot after a valid identifier may be property access - let the caller decide
    if (code === 46 /* . */ && stillIdentifier) {
      return { pos: offset, isValidIdentifier: stillIdentifier, stoppedAtDot: true }
    }

    if (!isWordChar(code)) break

    // `;` or `:` ends a word only when the character after it is not a word
    // character, so `hello; 2` splits while `hello;world` stays one word
    const isSeparator = code === 59 /* ; */ || code === 58 /* : */
    if (canBeWord && isSeparator && !isWordChar(getFullCodePoint(input, offset + 1))) break

    // Identifier characters are lowercase letters, digits, `-`, `?`, and
    // emoji/unicode. Anything else either ends the token (when a Word is not
    // allowed here) or demotes it to a plain word.
    if (!isIdentChar(code)) {
      if (!canBeWord) break
      stillIdentifier = false
    }

    offset += getCharSize(code)
  }

  return { pos: offset, isValidIdentifier: stillIdentifier, stoppedAtDot: false }
}
|
|
|
|
// Continue scanning a word from `startPos` once a dot has been ruled out as
// property access. For "file.txt" the caller already scanned "file"; this
// scans on from the character after the dot. Returns the offset just past the
// word; the input itself is not advanced.
const consumeRestOfWord = (input: InputStream, startPos: number, canBeWord: boolean): number => {
  let end = startPos

  for (;;) {
    const code = getFullCodePoint(input, end)
    if (!isWordChar(code)) return end

    // `;` / `:` followed by a non-word character terminates the word
    const isSeparator = code === 59 /* ; */ || code === 58 /* : */
    if (canBeWord && isSeparator && !isWordChar(getFullCodePoint(input, end + 1))) return end

    end += getCharSize(code)
  }
}
|
|
|
|
// Accept a { curly string }, counting nested braces so that
// { braces { inside } braces } form a single token. Accepts nothing when a
// CurlyString cannot be shifted here, or when EOF arrives before the braces
// balance.
const consumeCurlyString = (input: InputStream, stack: Stack) => {
  if (!stack.canShift(CurlyString)) return

  let openBraces = 0

  for (let offset = 0; ; offset++) {
    const unit = input.peek(offset)
    if (unit < 0) return // EOF before the closing brace - not a valid curly string

    if (unit === 123 /* { */) {
      openBraces++
    } else if (unit === 125 /* } */) {
      openBraces--
      if (openBraces === 0) {
        // Token ends just past this final `}`
        input.acceptToken(CurlyString, offset + 1)
        return
      }
    }
  }
}
|
|
|
|
// Decide whether the `pos` code units at the current position name something
// known, which makes the following dot a property access (obj.prop) rather
// than part of a word (file.txt).
// Returns IdentifierBeforeDot when the name is in the parse context's scope
// or in `globals`; otherwise null.
const checkForDotGet = (input: InputStream, stack: Stack, pos: number): number | null => {
  const name = buildIdentifierText(input, pos)
  const context = stack.context as { scope: { has(name: string): boolean } } | undefined

  if (context?.scope.has(name)) return IdentifierBeforeDot
  if (globals.includes(name)) return IdentifierBeforeDot
  return null
}
|
|
|
|
// Pick AssignableIdentifier or Identifier for an identifier that has just
// been consumed. When the grammar state allows only one of them, that one is
// used. When it is ambiguous (e.g. at statement start, where both `x = 5` and
// `echo x` are valid) the text that follows decides: an assignment operator
// followed by whitespace or EOF means AssignableIdentifier.
const chooseIdentifierToken = (input: InputStream, stack: Stack): number => {
  const canAssignable = stack.canShift(AssignableIdentifier)
  const canRegular = stack.canShift(Identifier)

  // Exactly one of the two is shiftable - nothing to disambiguate
  if (canAssignable !== canRegular) return canAssignable ? AssignableIdentifier : Identifier

  // Skip whitespace after the identifier
  let at = 0
  for (let ws = getFullCodePoint(input, at); isWhiteSpace(ws); ws = getFullCodePoint(input, at)) {
    at += getCharSize(ws)
  }

  const c0 = getFullCodePoint(input, at)
  const c1 = getFullCodePoint(input, at + 1)
  const c2 = getFullCodePoint(input, at + 2)

  // Length of the assignment operator starting at `at`, or 0 if there is none.
  // The three shapes start with different characters, so at most one matches.
  let operatorLength = 0
  if (c0 === 63 /* ? */ && c1 === 63 /* ? */ && c2 === 61 /* = */) {
    operatorLength = 3 // ??=
  } else if ([43 /* + */, 45 /* - */, 42 /* * */, 47 /* / */, 37 /* % */].includes(c0) && c1 === 61 /* = */) {
    operatorLength = 2 // += -= *= /= %=
  } else if (c0 === 61 /* = */) {
    operatorLength = 1 // =
  }

  if (operatorLength === 0) return Identifier

  // An operator glued to what follows (like `=cool*`) will not be tokenized as
  // an assignment operator, so the identifier is a regular one (function call)
  const afterOperator = getFullCodePoint(input, at + operatorLength)
  return isWhiteSpace(afterOperator) || afterOperator === -1 /* EOF */ ? AssignableIdentifier : Identifier
}
|
|
|
|
// Character classification helpers
|
|
// True when `ch` may begin an identifier: a lowercase ASCII letter or a code
// point accepted by isEmojiOrUnicode.
export const isIdentStart = (ch: number): boolean => isLowercaseLetter(ch) || isEmojiOrUnicode(ch)
|
|
|
|
// True when `ch` may appear inside an identifier: a lowercase ASCII letter, a
// digit, `-`, `?`, or a code point accepted by isEmojiOrUnicode.
export const isIdentChar = (ch: number): boolean => {
  if (ch === 45 /* - */ || ch === 63 /* ? */) return true
  return isLowercaseLetter(ch) || isDigit(ch) || isEmojiOrUnicode(ch)
}
|
|
|
|
// True for the inline whitespace characters: space, tab, and carriage return.
// Newline (\n) is deliberately not included here.
const isWhiteSpace = (ch: number): boolean => {
  switch (ch) {
    case 32: // space
    case 9: // tab
    case 13: // \r
      return true
    default:
      return false
  }
}
|
|
|
|
// True when `ch` can be part of a word: anything other than whitespace,
// newline, `)`, `]`, or EOF (-1).
const isWordChar = (ch: number): boolean => {
  if (isWhiteSpace(ch)) return false

  const isTerminator = ch === 10 /* \n */ || ch === 41 /* ) */ || ch === 93 /* ] */ || ch === -1 /* EOF */
  return !isTerminator
}
|
|
|
|
// True for ASCII lowercase letters a-z (char codes 97 through 122).
const isLowercaseLetter = (ch: number): boolean => 97 /* a */ <= ch && ch <= 122 /* z */
|
|
|
|
// True for ASCII digits 0-9 (char codes 48 through 57).
const isDigit = (ch: number): boolean => 48 /* 0 */ <= ch && ch <= 57 /* 9 */
|
|
|
|
// Read the code point at offset `pos`. A high surrogate (0xD800-0xDBFF)
// followed by a low surrogate (0xDC00-0xDFFF) is combined into one code point
// above 0xFFFF; anything else - including a lone surrogate or the -1 that
// peek reports at EOF - is returned unchanged.
const getFullCodePoint = (input: InputStream, pos: number): number => {
  const lead = input.peek(pos)

  const isHighSurrogate = lead >= 0xd800 && lead <= 0xdbff
  if (!isHighSurrogate) return lead

  const trail = input.peek(pos + 1)
  const isLowSurrogate = trail >= 0xdc00 && trail <= 0xdfff
  if (!isLowSurrogate) return lead

  // Each surrogate carries 10 payload bits; the pair encodes (code point - 0x10000)
  return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff)
}
|
|
|
|
// Inclusive [first, last] code point ranges accepted by isEmojiOrUnicode.
const EMOJI_OR_UNICODE_RANGES: ReadonlyArray<readonly [number, number]> = [
  [0x1f600, 0x1f64f], // Basic emoticons
  [0x1f300, 0x1f5ff], // Miscellaneous symbols and pictographs
  [0x1f680, 0x1f6ff], // Transport and map symbols
  [0x1f1e6, 0x1f1ff], // Regional indicator symbols (flags)
  [0x2600, 0x26ff], // Miscellaneous symbols (hearts, stars, weather)
  [0x2700, 0x27bf], // Dingbats (scissors, pencils, etc)
  [0x1f900, 0x1f9ff], // Supplemental symbols and pictographs (newer emojis)
  [0x1fa70, 0x1faff], // Symbols and pictographs extended-A (newest emojis)
  [0x1f018, 0x1f270], // Various Asian characters with emoji presentation
  [0xfe00, 0xfe0f], // Variation selectors (for emoji presentation)
  [0x238c, 0x2454], // Additional miscellaneous items
  [0x20d0, 0x20ff], // Combining diacritical marks for symbols
  [0x00a0, 0x00ff], // Latin-1 supplement (includes ², ³, ¹ and other special chars)
  [0x0370, 0x03ff], // Greek and Coptic
  [0x1d400, 0x1d7ff], // Mathematical alphanumeric symbols
  [0x2200, 0x22ff], // Mathematical operators
  [0x2070, 0x209f], // Superscripts and subscripts
  [0x2190, 0x21ff], // Arrows
  [0x3040, 0x309f], // Hiragana
  [0x30a0, 0x30ff], // Katakana
  [0x4e00, 0x9fff], // CJK unified ideographs
]

// True when the code point `ch` falls inside any of the ranges listed above.
const isEmojiOrUnicode = (ch: number): boolean => {
  for (const [first, last] of EMOJI_OR_UNICODE_RANGES) {
    if (ch >= first && ch <= last) return true
  }
  return false
}
|
|
|
|
// Number of UTF-16 code units a code point occupies: 2 when it lies above
// 0xFFFF (e.g. most emoji), otherwise 1.
const getCharSize = (ch: number) => {
  if (ch > 0xffff) return 2
  return 1
}
|