This commit is contained in:
Corey Johnson 2025-10-17 21:13:49 -07:00
parent b0d5a7f50c
commit 78ae96fc72
2 changed files with 169 additions and 138 deletions

View File

@ -2,23 +2,20 @@ import { ContextTracker, InputStream } from '@lezer/lr'
import * as terms from './shrimp.terms' import * as terms from './shrimp.terms'
export class Scope { export class Scope {
/**
 * @param parent Enclosing scope, or `null` for the outermost scope.
 * @param vars   Names declared directly in this scope; defaults to an empty set.
 */
constructor(public parent: Scope | null, public vars = new Set<string>()) {}
/**
 * Reports whether `name` is declared in this scope or in any enclosing scope.
 *
 * `Set.prototype.has` always returns a boolean, so chaining the parent lookup
 * with `??` would never consult the parent. The chain is walked explicitly
 * instead, and the result is always a real boolean.
 *
 * @param name Identifier text to look up.
 * @returns `true` if some scope on the parent chain contains `name`.
 */
has(name: string): boolean {
  for (let scope: Scope | null = this; scope !== null; scope = scope.parent) {
    if (scope.vars.has(name)) return true
  }
  return false
}
/**
 * Returns a new scope with the same parent whose variable set is a copy of
 * this scope's set plus `names`. This scope is left unmodified.
 *
 * @param names Identifiers to declare in the returned scope.
 */
add(...names: string[]): Scope {
  const newVars = new Set(this.vars)
  for (const name of names) newVars.add(name)
  return new Scope(this.parent, newVars)
}
/**
 * Opens a nested scope: returns a new, empty scope whose parent is this one.
 */
push(): Scope {
  return new Scope(this, new Set<string>())
}
pop(): Scope { pop(): Scope {
@ -43,10 +40,7 @@ export class Scope {
// Wrapper that adds temporary state for identifier capture // Wrapper that adds temporary state for identifier capture
export class ScopeContext { export class ScopeContext {
/**
 * @param scope      The scope in effect at this point of the parse.
 * @param pendingIds Identifiers captured but not yet added to a scope;
 *                   defaults to an empty list.
 */
constructor(public scope: Scope, public pendingIds: string[] = []) {}
// Helper to append identifier to pending list // Helper to append identifier to pending list
withPending(id: string): ScopeContext { withPending(id: string): ScopeContext {
@ -57,24 +51,19 @@ export class ScopeContext {
/**
 * Moves the most recently captured pending identifier into the current scope.
 *
 * @returns This context unchanged when the last pending entry is missing or
 *          an empty string; otherwise a new context whose scope includes that
 *          identifier and whose pending list no longer contains it.
 */
consumeLast(): ScopeContext {
  const varName = this.pendingIds.at(-1)
  if (!varName) return this

  return new ScopeContext(this.scope.add(varName), this.pendingIds.slice(0, -1))
}
/**
 * Pushes a new child scope and declares every pending identifier in it.
 *
 * @returns A new context holding the child scope and an empty pending list.
 */
consumeAll(): ScopeContext {
  const child = this.scope.push()
  const populated = this.pendingIds.length > 0 ? child.add(...this.pendingIds) : child
  return new ScopeContext(populated)
}
/**
 * Discards all pending identifiers without adding any of them to the scope.
 *
 * @returns A new context with the same scope and an empty pending list.
 */
clearPending(): ScopeContext {
  return new ScopeContext(this.scope)
}
} }
@ -94,24 +83,19 @@ export const trackScope = new ContextTracker<ScopeContext>({
start: new ScopeContext(new Scope(null, new Set())), start: new ScopeContext(new Scope(null, new Set())),
// Called when a token is shifted. Only AssignableIdentifier tokens are
// captured: their text is read via readIdentifierText (defined outside this
// view) and appended to the context's pending list. Every other token leaves
// the context untouched.
shift(context, term, stack, input) {
  if (term !== terms.AssignableIdentifier) return context

  const text = readIdentifierText(input, input.pos, stack.pos)
  return context.withPending(text)
},
reduce(context, term) { reduce(context, term) {
// Add assignment variable to scope
if (term === terms.Assign) return context.consumeLast() if (term === terms.Assign) return context.consumeLast()
// Push new scope and add all parameters
if (term === terms.Params) return context.consumeAll() if (term === terms.Params) return context.consumeAll()
// Pop scope when exiting function // Pop scope when exiting function
if (term === terms.FunctionDef) { if (term === terms.FunctionDef) {
return new ScopeContext(context.scope.pop(), []) return new ScopeContext(context.scope.pop())
} }
return context return context

View File

@ -6,115 +6,43 @@ import type { ScopeContext } from './scopeTracker'
// External tokenizer for word-like input. Starting at the current position it
// consumes a run of word characters and emits one of:
//   - IdentifierBeforeDot: a valid identifier followed by '.', where the
//     identifier is in scope (the grammar then parses the property access)
//   - Word: a valid identifier followed by '.' that is NOT in scope (the dot
//     and the rest of the word are consumed too, e.g. `file.txt`), or any run
//     that is not a valid identifier
//   - AssignableIdentifier / Identifier: a valid identifier, disambiguated by
//     chooseIdentifierToken
// Emits nothing when the first character is not a word character.
export const tokenizer = new ExternalTokenizer(
  (input: InputStream, stack: Stack) => {
    const ch = getFullCodePoint(input, 0)
    if (!isWordChar(ch)) return

    const isValidStart = isLowercaseLetter(ch) || isEmoji(ch)
    const canBeWord = stack.canShift(Word)

    // Consume all word characters, tracking whether the run is still a valid identifier
    const { pos, isValidIdentifier, stoppedAtDot } = consumeWordToken(
      input,
      isValidStart,
      canBeWord
    )

    // A valid identifier directly followed by '.' is either property access or a dotted word
    if (stoppedAtDot) {
      const dotGetToken = checkForDotGet(input, stack, pos)

      if (dotGetToken !== null) {
        // In scope - stop before the dot and let the grammar parse the property access
        input.advance(pos)
        input.acceptToken(dotGetToken)
      } else {
        // Not in scope - the dot and whatever follows belong to the same Word
        const afterDot = consumeRestOfWord(input, pos + 1, canBeWord)
        input.advance(afterDot)
        input.acceptToken(Word)
      }
      return
    }

    // Advance past the consumed run BEFORE choosing the token:
    // chooseIdentifierToken peeks relative to the new position.
    input.advance(pos)

    if (isValidIdentifier) {
      input.acceptToken(chooseIdentifierToken(input, stack))
    } else {
      input.acceptToken(Word)
    }
  },
  { contextual: true }
)
// Reads up to `length` code units from the input (relative to the current
// position) and returns them as a string. Stops early if the input reports
// end-of-input (-1). Appending UTF-16 code units one at a time reproduces
// surrogate pairs (emoji) exactly, so no pair-specific handling is required.
const buildIdentifierText = (input: InputStream, length: number): string => {
  let collected = ''

  for (let offset = 0; offset < length; offset += 1) {
    const unit = input.peek(offset)
    if (unit === -1) return collected
    collected += String.fromCharCode(unit)
  }

  return collected
}
// Scans forward from the current input position over one word-like token.
// Returns:
//   pos               - offset just past the last consumed character
//   isValidIdentifier - whether every consumed character is allowed in an identifier
//   stoppedAtDot      - true when the scan halted on '.' while still a valid identifier
const consumeWordToken = (
  input: InputStream,
  isValidStart: boolean,
  canBeWord: boolean
): { pos: number; isValidIdentifier: boolean; stoppedAtDot: boolean } => {
  // The first character is always part of the token; the caller has classified it.
  let offset = getCharSize(getFullCodePoint(input, 0))
  let stillIdentifier = isValidStart

  for (;;) {
    const code = getFullCodePoint(input, offset)

    // '.' after a valid identifier may be property access - let the caller decide
    if (code === 46 /* . */ && stillIdentifier) {
      return { pos: offset, isValidIdentifier: stillIdentifier, stoppedAtDot: true }
    }

    if (!isWordChar(code)) break

    // ';' and ':' end the word only when the character after them is not a word
    // character, so `hello; 2` splits but `hello;world` stays together
    const isStatementPunctuation = code === 59 /* ; */ || code === 58 /* : */
    if (canBeWord && isStatementPunctuation && !isWordChar(getFullCodePoint(input, offset + 1))) {
      break
    }

    // Identifier characters: lowercase letter, digit, '-', or emoji
    const isIdentifierChar =
      isLowercaseLetter(code) || isDigit(code) || code === 45 /* - */ || isEmoji(code)
    if (!isIdentifierChar) {
      // If a Word cannot be shifted here, the token must end at this character
      if (!canBeWord) break
      stillIdentifier = false
    }

    offset += getCharSize(code)
  }

  return { pos: offset, isValidIdentifier: stillIdentifier, stoppedAtDot: false }
}
// Continues scanning a word from `startPos` and returns the offset where it
// ends. Used once a dot has been ruled out as property access: for `file.txt`
// the caller has already scanned `file` and passes the offset just past the dot.
const consumeRestOfWord = (input: InputStream, startPos: number, canBeWord: boolean): number => {
  let cursor = startPos

  for (;;) {
    const code = getFullCodePoint(input, cursor)
    if (!isWordChar(code)) return cursor

    // ';' or ':' followed by a non-word character terminates the word
    const isStatementPunctuation = code === 59 /* ; */ || code === 58 /* : */
    if (canBeWord && isStatementPunctuation && !isWordChar(getFullCodePoint(input, cursor + 1))) {
      return cursor
    }

    cursor += getCharSize(code)
  }
}
// Decides whether the identifier occupying the first `pos` code units of the
// input is a known variable. Returns the IdentifierBeforeDot token when the
// parse context's scope contains it (property access such as obj.prop), and
// null when it does not or when no context is available (a plain word such
// as file.txt).
const checkForDotGet = (input: InputStream, stack: Stack, pos: number): number | null => {
  const name = buildIdentifierText(input, pos)
  const context = stack.context as ScopeContext | undefined

  if (context?.scope?.has(name)) return IdentifierBeforeDot
  return null
}
// Picks the token for a valid identifier. When the grammar state accepts
// exactly one of AssignableIdentifier / Identifier, that one is returned.
// Otherwise (e.g. at statement start, where both `x = 5` and `echo x` are
// possible) the characters that isWhiteSpace accepts are skipped and the
// identifier counts as assignable only if the next character is '='.
// Peeking starts at offset 0, so the caller must already have advanced past
// the identifier.
const chooseIdentifierToken = (input: InputStream, stack: Stack): number => {
  const assignableOk = stack.canShift(AssignableIdentifier)
  const regularOk = stack.canShift(Identifier)

  // Exactly one is shiftable - no lookahead needed
  if (assignableOk !== regularOk) {
    return assignableOk ? AssignableIdentifier : Identifier
  }

  let offset = 0
  let code = getFullCodePoint(input, offset)
  while (isWhiteSpace(code)) {
    offset += getCharSize(code)
    code = getFullCodePoint(input, offset)
  }

  if (code === 61 /* = */) return AssignableIdentifier
  return Identifier
}
// Character classification helpers
// True for horizontal whitespace only: space, tab and carriage return.
// Newline (10) is deliberately NOT included, so callers that skip whitespace
// with this helper stay on the current line.
const isWhiteSpace = (ch: number): boolean => {
  return ch === 32 /* space */ || ch === 9 /* tab */ || ch === 13 /* \r */
}

// True for any character that can be part of a word: everything except
// horizontal whitespace, newline, a closing paren, and end-of-input (-1).
// Newline is excluded explicitly because isWhiteSpace does not cover it.
const isWordChar = (ch: number): boolean => {
  return !isWhiteSpace(ch) && ch !== 10 /* \n */ && ch !== 41 /* ) */ && ch !== -1 /* EOF */
}
const isLowercaseLetter = (ch: number): boolean => { const isLowercaseLetter = (ch: number): boolean => {
@ -154,7 +201,7 @@ const getFullCodePoint = (input: InputStream, pos: number): number => {
} }
} }
return ch // Single code unit return ch
} }
const isEmoji = (ch: number): boolean => { const isEmoji = (ch: number): boolean => {