wip
This commit is contained in:
parent
b0d5a7f50c
commit
78ae96fc72
|
|
@ -2,23 +2,20 @@ import { ContextTracker, InputStream } from '@lezer/lr'
|
||||||
import * as terms from './shrimp.terms'
|
import * as terms from './shrimp.terms'
|
||||||
|
|
||||||
export class Scope {
|
export class Scope {
|
||||||
constructor(
|
constructor(public parent: Scope | null, public vars = new Set<string>()) {}
|
||||||
public parent: Scope | null,
|
|
||||||
public vars: Set<string>
|
|
||||||
) {}
|
|
||||||
|
|
||||||
has(name: string): boolean {
|
has(name: string): boolean {
|
||||||
return this.vars.has(name) || (this.parent?.has(name) ?? false)
|
// Set.has() always yields a boolean, so `??` would never consult the parent scope; use `||` and treat a missing parent as "not found".
return this.vars.has(name) || (this.parent?.has(name) ?? false)
|
||||||
}
|
}
|
||||||
|
|
||||||
add(...names: string[]): Scope {
|
add(...names: string[]): Scope {
|
||||||
const newVars = new Set(this.vars)
|
const newVars = new Set(this.vars)
|
||||||
names.forEach(name => newVars.add(name))
|
names.forEach((name) => newVars.add(name))
|
||||||
return new Scope(this.parent, newVars)
|
return new Scope(this.parent, newVars)
|
||||||
}
|
}
|
||||||
|
|
||||||
push(): Scope {
|
push(): Scope {
|
||||||
return new Scope(this, new Set())
|
return new Scope(this)
|
||||||
}
|
}
|
||||||
|
|
||||||
pop(): Scope {
|
pop(): Scope {
|
||||||
|
|
@ -43,10 +40,7 @@ export class Scope {
|
||||||
|
|
||||||
// Wrapper that adds temporary state for identifier capture
|
// Wrapper that adds temporary state for identifier capture
|
||||||
export class ScopeContext {
|
export class ScopeContext {
|
||||||
constructor(
|
constructor(public scope: Scope, public pendingIds: string[] = []) {}
|
||||||
public scope: Scope,
|
|
||||||
public pendingIds: string[] = []
|
|
||||||
) {}
|
|
||||||
|
|
||||||
// Helper to append identifier to pending list
|
// Helper to append identifier to pending list
|
||||||
withPending(id: string): ScopeContext {
|
withPending(id: string): ScopeContext {
|
||||||
|
|
@ -57,24 +51,19 @@ export class ScopeContext {
|
||||||
consumeLast(): ScopeContext {
|
consumeLast(): ScopeContext {
|
||||||
const varName = this.pendingIds.at(-1)
|
const varName = this.pendingIds.at(-1)
|
||||||
if (!varName) return this
|
if (!varName) return this
|
||||||
return new ScopeContext(
|
return new ScopeContext(this.scope.add(varName), this.pendingIds.slice(0, -1))
|
||||||
this.scope.add(varName),
|
|
||||||
this.pendingIds.slice(0, -1)
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Helper to consume all pending identifiers and add to new scope
|
// Helper to consume all pending identifiers and add to new scope
|
||||||
consumeAll(): ScopeContext {
|
consumeAll(): ScopeContext {
|
||||||
const newScope = this.scope.push()
|
let newScope = this.scope.push()
|
||||||
return new ScopeContext(
|
newScope = this.pendingIds.length > 0 ? newScope.add(...this.pendingIds) : newScope
|
||||||
this.pendingIds.length > 0 ? newScope.add(...this.pendingIds) : newScope,
|
return new ScopeContext(newScope)
|
||||||
[]
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Helper to clear pending without adding to scope
|
// Helper to clear pending without adding to scope
|
||||||
clearPending(): ScopeContext {
|
clearPending(): ScopeContext {
|
||||||
return new ScopeContext(this.scope, [])
|
return new ScopeContext(this.scope)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -94,24 +83,19 @@ export const trackScope = new ContextTracker<ScopeContext>({
|
||||||
start: new ScopeContext(new Scope(null, new Set())),
|
start: new ScopeContext(new Scope(null, new Set())),
|
||||||
|
|
||||||
shift(context, term, stack, input) {
|
shift(context, term, stack, input) {
|
||||||
// Only capture AssignableIdentifier tokens
|
if (term !== terms.AssignableIdentifier) return context
|
||||||
if (term === terms.AssignableIdentifier) {
|
|
||||||
const text = readIdentifierText(input, input.pos, stack.pos)
|
const text = readIdentifierText(input, input.pos, stack.pos)
|
||||||
return context.withPending(text)
|
return context.withPending(text)
|
||||||
}
|
|
||||||
return context
|
|
||||||
},
|
},
|
||||||
|
|
||||||
reduce(context, term) {
|
reduce(context, term) {
|
||||||
// Add assignment variable to scope
|
|
||||||
if (term === terms.Assign) return context.consumeLast()
|
if (term === terms.Assign) return context.consumeLast()
|
||||||
|
|
||||||
// Push new scope and add all parameters
|
|
||||||
if (term === terms.Params) return context.consumeAll()
|
if (term === terms.Params) return context.consumeAll()
|
||||||
|
|
||||||
// Pop scope when exiting function
|
// Pop scope when exiting function
|
||||||
if (term === terms.FunctionDef) {
|
if (term === terms.FunctionDef) {
|
||||||
return new ScopeContext(context.scope.pop(), [])
|
return new ScopeContext(context.scope.pop())
|
||||||
}
|
}
|
||||||
|
|
||||||
return context
|
return context
|
||||||
|
|
|
||||||
|
|
@ -6,115 +6,43 @@ import type { ScopeContext } from './scopeTracker'
|
||||||
|
|
||||||
export const tokenizer = new ExternalTokenizer(
|
export const tokenizer = new ExternalTokenizer(
|
||||||
(input: InputStream, stack: Stack) => {
|
(input: InputStream, stack: Stack) => {
|
||||||
let ch = getFullCodePoint(input, 0)
|
const ch = getFullCodePoint(input, 0)
|
||||||
if (!isWordChar(ch)) return
|
if (!isWordChar(ch)) return
|
||||||
|
|
||||||
let pos = getCharSize(ch)
|
const isValidStart = isLowercaseLetter(ch) || isEmoji(ch)
|
||||||
let isValidIdentifier = isLowercaseLetter(ch) || isEmoji(ch)
|
|
||||||
const canBeWord = stack.canShift(Word)
|
const canBeWord = stack.canShift(Word)
|
||||||
|
|
||||||
while (true) {
|
// Consume all word characters, tracking if it remains a valid identifier
|
||||||
ch = getFullCodePoint(input, pos)
|
const { pos, isValidIdentifier, stoppedAtDot } = consumeWordToken(
|
||||||
|
input,
|
||||||
|
isValidStart,
|
||||||
|
canBeWord
|
||||||
|
)
|
||||||
|
|
||||||
// Check for dot and scope - property access detection
|
// Check if we should emit IdentifierBeforeDot for property access
|
||||||
if (ch === 46 /* . */ && isValidIdentifier) {
|
if (stoppedAtDot) {
|
||||||
// Build identifier text by peeking character by character
|
const dotGetToken = checkForDotGet(input, stack, pos)
|
||||||
let identifierText = ''
|
|
||||||
for (let i = 0; i < pos; i++) {
|
|
||||||
const charCode = input.peek(i)
|
|
||||||
if (charCode === -1) break
|
|
||||||
// Handle surrogate pairs for emoji
|
|
||||||
if (charCode >= 0xd800 && charCode <= 0xdbff && i + 1 < pos) {
|
|
||||||
const low = input.peek(i + 1)
|
|
||||||
if (low >= 0xdc00 && low <= 0xdfff) {
|
|
||||||
identifierText += String.fromCharCode(charCode, low)
|
|
||||||
i++ // Skip the low surrogate
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
}
|
|
||||||
identifierText += String.fromCharCode(charCode)
|
|
||||||
}
|
|
||||||
|
|
||||||
const scopeContext = stack.context as ScopeContext | undefined
|
if (dotGetToken) {
|
||||||
const scope = scopeContext?.scope
|
input.advance(pos)
|
||||||
|
input.acceptToken(dotGetToken)
|
||||||
if (scope?.has(identifierText)) {
|
|
||||||
// In scope - stop here, let grammar parse property access
|
|
||||||
input.advance(pos)
|
|
||||||
input.acceptToken(IdentifierBeforeDot)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
// Not in scope - continue consuming as Word (fall through)
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!isWordChar(ch)) break
|
|
||||||
|
|
||||||
// Certain characters might end a word or identifier if they are followed by whitespace.
|
|
||||||
// This allows things like `a = hello; 2` or `if x: y` to parse correctly.
|
|
||||||
if (canBeWord && (ch === 59 /* ; */ || ch === 58) /* : */) {
|
|
||||||
const nextCh = getFullCodePoint(input, pos + 1)
|
|
||||||
if (!isWordChar(nextCh)) break
|
|
||||||
}
|
|
||||||
|
|
||||||
// Track identifier validity
|
|
||||||
if (!isLowercaseLetter(ch) && !isDigit(ch) && ch !== 45 && !isEmoji(ch)) {
|
|
||||||
if (!canBeWord) break
|
|
||||||
isValidIdentifier = false
|
|
||||||
}
|
|
||||||
|
|
||||||
pos += getCharSize(ch)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Build identifier text BEFORE advancing (for debug and peek-ahead)
|
|
||||||
let identifierText = ''
|
|
||||||
if (isValidIdentifier) {
|
|
||||||
for (let i = 0; i < pos; i++) {
|
|
||||||
const charCode = input.peek(i)
|
|
||||||
if (charCode === -1) break
|
|
||||||
if (charCode >= 0xd800 && charCode <= 0xdbff && i + 1 < pos) {
|
|
||||||
const low = input.peek(i + 1)
|
|
||||||
if (low >= 0xdc00 && low <= 0xdfff) {
|
|
||||||
identifierText += String.fromCharCode(charCode, low)
|
|
||||||
i++
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
}
|
|
||||||
identifierText += String.fromCharCode(charCode)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
input.advance(pos)
|
|
||||||
if (isValidIdentifier) {
|
|
||||||
const canAssignable = stack.canShift(AssignableIdentifier)
|
|
||||||
const canRegular = stack.canShift(Identifier)
|
|
||||||
|
|
||||||
if (canAssignable && !canRegular) {
|
|
||||||
// Only AssignableIdentifier valid (e.g., in Params)
|
|
||||||
input.acceptToken(AssignableIdentifier)
|
|
||||||
} else if (canRegular && !canAssignable) {
|
|
||||||
// Only Identifier valid (e.g., in function args)
|
|
||||||
input.acceptToken(Identifier)
|
|
||||||
} else {
|
} else {
|
||||||
// BOTH possible (ambiguous) - peek ahead for '='
|
// Not in scope - continue consuming the dot as part of the word
|
||||||
// Note: we're peeking from current position (after advance), so start at 0
|
const afterDot = consumeRestOfWord(input, pos + 1, canBeWord)
|
||||||
let peekPos = 0
|
input.advance(afterDot)
|
||||||
// Skip whitespace (space, tab, CR, but NOT newline - assignment must be on same line)
|
input.acceptToken(Word)
|
||||||
while (true) {
|
|
||||||
const ch = getFullCodePoint(input, peekPos)
|
|
||||||
if (ch === 32 || ch === 9 || ch === 13) { // space, tab, CR
|
|
||||||
peekPos += getCharSize(ch)
|
|
||||||
} else {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Check if next non-whitespace char is '='
|
|
||||||
const nextCh = getFullCodePoint(input, peekPos)
|
|
||||||
if (nextCh === 61 /* = */) {
|
|
||||||
input.acceptToken(AssignableIdentifier)
|
|
||||||
} else {
|
|
||||||
input.acceptToken(Identifier)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Advance past the token we consumed
|
||||||
|
input.advance(pos)
|
||||||
|
|
||||||
|
// Choose which token to emit
|
||||||
|
if (isValidIdentifier) {
|
||||||
|
const token = chooseIdentifierToken(input, stack)
|
||||||
|
input.acceptToken(token)
|
||||||
} else {
|
} else {
|
||||||
input.acceptToken(Word)
|
input.acceptToken(Word)
|
||||||
}
|
}
|
||||||
|
|
@ -122,15 +50,134 @@ export const tokenizer = new ExternalTokenizer(
|
||||||
{ contextual: true }
|
{ contextual: true }
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Build identifier text from input stream, handling surrogate pairs for emoji
|
||||||
|
const buildIdentifierText = (input: InputStream, length: number): string => {
|
||||||
|
let text = ''
|
||||||
|
for (let i = 0; i < length; i++) {
|
||||||
|
const charCode = input.peek(i)
|
||||||
|
if (charCode === -1) break
|
||||||
|
|
||||||
|
// Handle surrogate pairs for emoji (UTF-16 encoding)
|
||||||
|
if (charCode >= 0xd800 && charCode <= 0xdbff && i + 1 < length) {
|
||||||
|
const low = input.peek(i + 1)
|
||||||
|
if (low >= 0xdc00 && low <= 0xdfff) {
|
||||||
|
text += String.fromCharCode(charCode, low)
|
||||||
|
i++ // Skip the low surrogate
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
text += String.fromCharCode(charCode)
|
||||||
|
}
|
||||||
|
return text
|
||||||
|
}
|
||||||
|
|
||||||
|
// Consume word characters, tracking if it remains a valid identifier
|
||||||
|
// Returns the position after consuming, whether it's a valid identifier, and if we stopped at a dot
|
||||||
|
const consumeWordToken = (
|
||||||
|
input: InputStream,
|
||||||
|
isValidStart: boolean,
|
||||||
|
canBeWord: boolean
|
||||||
|
): { pos: number; isValidIdentifier: boolean; stoppedAtDot: boolean } => {
|
||||||
|
let pos = getCharSize(getFullCodePoint(input, 0))
|
||||||
|
let isValidIdentifier = isValidStart
|
||||||
|
let stoppedAtDot = false
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
const ch = getFullCodePoint(input, pos)
|
||||||
|
|
||||||
|
// Stop at dot if we have a valid identifier (might be property access)
|
||||||
|
if (ch === 46 /* . */ && isValidIdentifier) {
|
||||||
|
stoppedAtDot = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stop if we hit a non-word character
|
||||||
|
if (!isWordChar(ch)) break
|
||||||
|
|
||||||
|
// Context-aware termination: semicolon/colon can end a word if followed by whitespace
|
||||||
|
// This allows `hello; 2` to parse correctly while `hello;world` stays as one word
|
||||||
|
if (canBeWord && (ch === 59 /* ; */ || ch === 58) /* : */) {
|
||||||
|
const nextCh = getFullCodePoint(input, pos + 1)
|
||||||
|
if (!isWordChar(nextCh)) break
|
||||||
|
}
|
||||||
|
|
||||||
|
// Track identifier validity: must be lowercase, digit, dash, or emoji
|
||||||
|
if (!isLowercaseLetter(ch) && !isDigit(ch) && ch !== 45 /* - */ && !isEmoji(ch)) {
|
||||||
|
if (!canBeWord) break
|
||||||
|
isValidIdentifier = false
|
||||||
|
}
|
||||||
|
|
||||||
|
pos += getCharSize(ch)
|
||||||
|
}
|
||||||
|
|
||||||
|
return { pos, isValidIdentifier, stoppedAtDot }
|
||||||
|
}
|
||||||
|
|
||||||
|
// Consume the rest of a word after we've decided not to treat a dot as DotGet
|
||||||
|
// Used when we have "file.txt" - we already consumed "file", now consume ".txt"
|
||||||
|
const consumeRestOfWord = (input: InputStream, startPos: number, canBeWord: boolean): number => {
|
||||||
|
let pos = startPos
|
||||||
|
while (true) {
|
||||||
|
const ch = getFullCodePoint(input, pos)
|
||||||
|
|
||||||
|
// Stop if we hit a non-word character
|
||||||
|
if (!isWordChar(ch)) break
|
||||||
|
|
||||||
|
// Context-aware termination for semicolon/colon
|
||||||
|
if (canBeWord && (ch === 59 /* ; */ || ch === 58) /* : */) {
|
||||||
|
const nextCh = getFullCodePoint(input, pos + 1)
|
||||||
|
if (!isWordChar(nextCh)) break
|
||||||
|
}
|
||||||
|
|
||||||
|
pos += getCharSize(ch)
|
||||||
|
}
|
||||||
|
return pos
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if this identifier is in scope (for property access detection)
|
||||||
|
// Returns IdentifierBeforeDot token if in scope, null otherwise
|
||||||
|
const checkForDotGet = (input: InputStream, stack: Stack, pos: number): number | null => {
|
||||||
|
const identifierText = buildIdentifierText(input, pos)
|
||||||
|
const scopeContext = stack.context as ScopeContext | undefined
|
||||||
|
const scope = scopeContext?.scope
|
||||||
|
|
||||||
|
// If identifier is in scope, this is property access (e.g., obj.prop)
|
||||||
|
// If not in scope, it should be consumed as a Word (e.g., file.txt)
|
||||||
|
return scope?.has(identifierText) ? IdentifierBeforeDot : null
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decide between AssignableIdentifier and Identifier using grammar state + peek-ahead
|
||||||
|
const chooseIdentifierToken = (input: InputStream, stack: Stack): number => {
|
||||||
|
const canAssignable = stack.canShift(AssignableIdentifier)
|
||||||
|
const canRegular = stack.canShift(Identifier)
|
||||||
|
|
||||||
|
// Only one option is valid - use it
|
||||||
|
if (canAssignable && !canRegular) return AssignableIdentifier
|
||||||
|
if (canRegular && !canAssignable) return Identifier
|
||||||
|
|
||||||
|
// Both possible (ambiguous context) - peek ahead for '=' to disambiguate
|
||||||
|
// This happens at statement start where both `x = 5` (assign) and `echo x` (call) are valid
|
||||||
|
let peekPos = 0
|
||||||
|
while (true) {
|
||||||
|
const ch = getFullCodePoint(input, peekPos)
|
||||||
|
if (isWhiteSpace(ch)) {
|
||||||
|
peekPos += getCharSize(ch)
|
||||||
|
} else {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const nextCh = getFullCodePoint(input, peekPos)
|
||||||
|
return nextCh === 61 /* = */ ? AssignableIdentifier : Identifier
|
||||||
|
}
|
||||||
|
|
||||||
|
// Character classification helpers
|
||||||
const isWhiteSpace = (ch: number): boolean => {
|
const isWhiteSpace = (ch: number): boolean => {
|
||||||
return ch === 32 /* space */ || ch === 10 /* \n */ || ch === 9 /* tab */ || ch === 13 /* \r */
|
return ch === 32 /* space */ || ch === 9 /* tab */ || ch === 13 /* \r */
|
||||||
}
|
}
|
||||||
|
|
||||||
const isWordChar = (ch: number): boolean => {
|
const isWordChar = (ch: number): boolean => {
|
||||||
const closingParen = ch === 41 /* ) */
|
return !isWhiteSpace(ch) && ch !== 10 /* \n */ && ch !== 41 /* ) */ && ch !== -1 /* EOF */
|
||||||
const eof = ch === -1
|
|
||||||
|
|
||||||
return !isWhiteSpace(ch) && !closingParen && !eof
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const isLowercaseLetter = (ch: number): boolean => {
|
const isLowercaseLetter = (ch: number): boolean => {
|
||||||
|
|
@ -154,7 +201,7 @@ const getFullCodePoint = (input: InputStream, pos: number): number => {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return ch // Single code unit
|
return ch
|
||||||
}
|
}
|
||||||
|
|
||||||
const isEmoji = (ch: number): boolean => {
|
const isEmoji = (ch: number): boolean => {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user