wip

2025-10-17 21:13:49 -07:00 · 2025-10-17 21:13:49 -07:00 · 78ae96fc72
commit 78ae96fc72
parent b0d5a7f50c
2 changed files with 169 additions and 138 deletions
--- a/src/parser/scopeTracker.ts
+++ b/src/parser/scopeTracker.ts
@ -2,23 +2,20 @@ import { ContextTracker, InputStream } from '@lezer/lr'
 import * as terms from './shrimp.terms'
 export class Scope {
-  constructor(
+  constructor(public parent: Scope | null, public vars = new Set<string>()) {}
    public parent: Scope | null,
    public vars: Set<string>
  ) {}
  has(name: string): boolean {
-    return this.vars.has(name) || (this.parent?.has(name) ?? false)
+    // `Set.has` never returns null/undefined, so `??` would stop at a local miss (`false`)
+    // and never consult the parent scope; `||` is required to walk up the scope chain.
+    return this.vars.has(name) || (this.parent?.has(name) ?? false)
  }
  add(...names: string[]): Scope {
    const newVars = new Set(this.vars)
-    names.forEach(name => newVars.add(name))
+    names.forEach((name) => newVars.add(name))
    return new Scope(this.parent, newVars)
  }
  push(): Scope {
-    return new Scope(this, new Set())
+    return new Scope(this)
  }
  pop(): Scope {
@ -43,10 +40,7 @@ export class Scope {
 // Wrapper that adds temporary state for identifier capture
 export class ScopeContext {
-  constructor(
+  constructor(public scope: Scope, public pendingIds: string[] = []) {}
    public scope: Scope,
    public pendingIds: string[] = []
  ) {}
  // Helper to append identifier to pending list
  withPending(id: string): ScopeContext {
@ -57,24 +51,19 @@ export class ScopeContext {
  consumeLast(): ScopeContext {
    const varName = this.pendingIds.at(-1)
    if (!varName) return this
-    return new ScopeContext(
+    return new ScopeContext(this.scope.add(varName), this.pendingIds.slice(0, -1))
      this.scope.add(varName),
      this.pendingIds.slice(0, -1)
    )
  }
  // Helper to consume all pending identifiers and add to new scope
  consumeAll(): ScopeContext {
-    const newScope = this.scope.push()
+    let newScope = this.scope.push()
-    return new ScopeContext(
+    newScope = this.pendingIds.length > 0 ? newScope.add(...this.pendingIds) : newScope
-      this.pendingIds.length > 0 ? newScope.add(...this.pendingIds) : newScope,
+    return new ScopeContext(newScope)
      []
    )
  }
  // Helper to clear pending without adding to scope
  clearPending(): ScopeContext {
-    return new ScopeContext(this.scope, [])
+    return new ScopeContext(this.scope)
  }
 }
@ -94,24 +83,19 @@ export const trackScope = new ContextTracker<ScopeContext>({
  start: new ScopeContext(new Scope(null, new Set())),
  shift(context, term, stack, input) {
-    // Only capture AssignableIdentifier tokens
+    if (term !== terms.AssignableIdentifier) return context
-    if (term === terms.AssignableIdentifier) {
+
-      const text = readIdentifierText(input, input.pos, stack.pos)
+    const text = readIdentifierText(input, input.pos, stack.pos)
-      return context.withPending(text)
+    return context.withPending(text)
    }
    return context
  },
  reduce(context, term) {
    // Add assignment variable to scope
    if (term === terms.Assign) return context.consumeLast()
    // Push new scope and add all parameters
    if (term === terms.Params) return context.consumeAll()
    // Pop scope when exiting function
    if (term === terms.FunctionDef) {
-      return new ScopeContext(context.scope.pop(), [])
+      return new ScopeContext(context.scope.pop())
    }
    return context
--- a/src/parser/tokenizer.ts
+++ b/src/parser/tokenizer.ts
@ -6,115 +6,43 @@ import type { ScopeContext } from './scopeTracker'
 export const tokenizer = new ExternalTokenizer(
  (input: InputStream, stack: Stack) => {
-    let ch = getFullCodePoint(input, 0)
+    const ch = getFullCodePoint(input, 0)
    if (!isWordChar(ch)) return
-    let pos = getCharSize(ch)
+    const isValidStart = isLowercaseLetter(ch) || isEmoji(ch)
    let isValidIdentifier = isLowercaseLetter(ch) || isEmoji(ch)
    const canBeWord = stack.canShift(Word)
-    while (true) {
+    // Consume all word characters, tracking if it remains a valid identifier
-      ch = getFullCodePoint(input, pos)
+    const { pos, isValidIdentifier, stoppedAtDot } = consumeWordToken(
      input,
      isValidStart,
      canBeWord
    )
-      // Check for dot and scope - property access detection
+    // Check if we should emit IdentifierBeforeDot for property access
-      if (ch === 46 /* . */ && isValidIdentifier) {
+    if (stoppedAtDot) {
-        // Build identifier text by peeking character by character
+      const dotGetToken = checkForDotGet(input, stack, pos)
        let identifierText = ''
        for (let i = 0; i < pos; i++) {
          const charCode = input.peek(i)
          if (charCode === -1) break
          // Handle surrogate pairs for emoji
          if (charCode >= 0xd800 && charCode <= 0xdbff && i + 1 < pos) {
            const low = input.peek(i + 1)
            if (low >= 0xdc00 && low <= 0xdfff) {
              identifierText += String.fromCharCode(charCode, low)
              i++ // Skip the low surrogate
              continue
            }
          }
          identifierText += String.fromCharCode(charCode)
        }
-        const scopeContext = stack.context as ScopeContext | undefined
+      if (dotGetToken) {
-        const scope = scopeContext?.scope
+        input.advance(pos)
-
+        input.acceptToken(dotGetToken)
        if (scope?.has(identifierText)) {
          // In scope - stop here, let grammar parse property access
          input.advance(pos)
          input.acceptToken(IdentifierBeforeDot)
          return
        }
        // Not in scope - continue consuming as Word (fall through)
      }
      if (!isWordChar(ch)) break
      // Certain characters might end a word or identifier if they are followed by whitespace.
      // This allows things like `a = hello; 2` or `if x: y` to parse correctly.
      if (canBeWord && (ch === 59 /* ; */ || ch === 58) /* : */) {
        const nextCh = getFullCodePoint(input, pos + 1)
        if (!isWordChar(nextCh)) break
      }
      // Track identifier validity
      if (!isLowercaseLetter(ch) && !isDigit(ch) && ch !== 45 && !isEmoji(ch)) {
        if (!canBeWord) break
        isValidIdentifier = false
      }
      pos += getCharSize(ch)
    }
    // Build identifier text BEFORE advancing (for debug and peek-ahead)
    let identifierText = ''
    if (isValidIdentifier) {
      for (let i = 0; i < pos; i++) {
        const charCode = input.peek(i)
        if (charCode === -1) break
        if (charCode >= 0xd800 && charCode <= 0xdbff && i + 1 < pos) {
          const low = input.peek(i + 1)
          if (low >= 0xdc00 && low <= 0xdfff) {
            identifierText += String.fromCharCode(charCode, low)
            i++
            continue
          }
        }
        identifierText += String.fromCharCode(charCode)
      }
    }
    input.advance(pos)
    if (isValidIdentifier) {
      const canAssignable = stack.canShift(AssignableIdentifier)
      const canRegular = stack.canShift(Identifier)
      if (canAssignable && !canRegular) {
        // Only AssignableIdentifier valid (e.g., in Params)
        input.acceptToken(AssignableIdentifier)
      } else if (canRegular && !canAssignable) {
        // Only Identifier valid (e.g., in function args)
        input.acceptToken(Identifier)
      } else {
-        // BOTH possible (ambiguous) - peek ahead for '='
+        // Not in scope - continue consuming the dot as part of the word
-        // Note: we're peeking from current position (after advance), so start at 0
+        const afterDot = consumeRestOfWord(input, pos + 1, canBeWord)
-        let peekPos = 0
+        input.advance(afterDot)
-        // Skip whitespace (space, tab, CR, but NOT newline - assignment must be on same line)
+        input.acceptToken(Word)
        while (true) {
          const ch = getFullCodePoint(input, peekPos)
          if (ch === 32 || ch === 9 || ch === 13) { // space, tab, CR
            peekPos += getCharSize(ch)
          } else {
            break
          }
        }
        // Check if next non-whitespace char is '='
        const nextCh = getFullCodePoint(input, peekPos)
        if (nextCh === 61 /* = */) {
          input.acceptToken(AssignableIdentifier)
        } else {
          input.acceptToken(Identifier)
        }
      }
      return
    }
    // Advance past the token we consumed
    input.advance(pos)
    // Choose which token to emit
    if (isValidIdentifier) {
      const token = chooseIdentifierToken(input, stack)
      input.acceptToken(token)
    } else {
      input.acceptToken(Word)
    }
@ -122,15 +50,134 @@ export const tokenizer = new ExternalTokenizer(
  { contextual: true }
 )
 // Assemble the identifier's text by peeking at the first `length` UTF-16 code units of the
 // input. Reading is done with `input.peek`, and the scan ends early if a peek returns -1.
 const buildIdentifierText = (input: InputStream, length: number): string => {
  const isHighSurrogate = (unit: number): boolean => unit >= 0xd800 && unit <= 0xdbff
  const isLowSurrogate = (unit: number): boolean => unit >= 0xdc00 && unit <= 0xdfff

  const pieces: string[] = []
  let offset = 0

  while (offset < length) {
    const unit = input.peek(offset)
    if (unit === -1) break

    // Emoji are encoded as a surrogate pair (UTF-16); emit both halves together
    // when the second half is still inside the requested range.
    const pairFits = isHighSurrogate(unit) && offset + 1 < length
    const trailing = pairFits ? input.peek(offset + 1) : -1
    if (pairFits && isLowSurrogate(trailing)) {
      pieces.push(String.fromCharCode(unit, trailing))
      offset += 2 // step over the low surrogate as well
      continue
    }

    pieces.push(String.fromCharCode(unit))
    offset += 1
  }

  return pieces.join('')
 }
 // Scan forward over the characters of a word, starting just past its first character.
 // Reports where the scan ended (`pos`), whether everything seen so far still qualifies as
 // an identifier (`isValidIdentifier`), and whether the scan ended on a dot that follows a
 // valid identifier (`stoppedAtDot`).
 const consumeWordToken = (
  input: InputStream,
  isValidStart: boolean,
  canBeWord: boolean
 ): { pos: number; isValidIdentifier: boolean; stoppedAtDot: boolean } => {
  const DOT = 46
  const DASH = 45
  const COLON = 58
  const SEMICOLON = 59

  // Identifier characters: lowercase letter, digit, dash, or emoji
  const isIdentifierChar = (code: number): boolean =>
    isLowercaseLetter(code) || isDigit(code) || code === DASH || isEmoji(code)

  // The first character was already vetted by the caller; begin right after it
  let offset = getCharSize(getFullCodePoint(input, 0))
  let stillIdentifier = isValidStart

  for (;;) {
    const current = getFullCodePoint(input, offset)

    // A dot after a valid identifier may be property access - hand the decision to the caller
    if (current === DOT && stillIdentifier) {
      return { pos: offset, isValidIdentifier: stillIdentifier, stoppedAtDot: true }
    }

    // Anything that is not a word character ends the token
    if (!isWordChar(current)) break

    // A semicolon/colon ends the word only when the character after it is not a word
    // character, so `hello; 2` splits while `hello;world` stays as one word
    const isTerminator = current === SEMICOLON || current === COLON
    if (canBeWord && isTerminator && !isWordChar(getFullCodePoint(input, offset + 1))) break

    // A non-identifier character either ends the token (when a Word is not allowed here)
    // or demotes the token from identifier to plain word
    if (!isIdentifierChar(current)) {
      if (!canBeWord) break
      stillIdentifier = false
    }

    offset += getCharSize(current)
  }

  return { pos: offset, isValidIdentifier: stillIdentifier, stoppedAtDot: false }
 }
 // Keep scanning a word once the caller has decided a dot is NOT property access.
 // Example: for "file.txt" the caller already consumed "file"; this picks up from
 // `startPos` and returns the position where the word ends.
 const consumeRestOfWord = (input: InputStream, startPos: number, canBeWord: boolean): number => {
  let cursor = startPos

  // Loop for as long as the character under the cursor is a word character
  for (
    let ch = getFullCodePoint(input, cursor);
    isWordChar(ch);
    ch = getFullCodePoint(input, cursor)
  ) {
    // Semicolon/colon ends the word when the character after it is not a word character
    const isTerminator = ch === 59 /* ; */ || ch === 58 /* : */
    if (canBeWord && isTerminator && !isWordChar(getFullCodePoint(input, cursor + 1))) break

    cursor += getCharSize(ch)
  }

  return cursor
 }
 // Decide whether the identifier spanning the first `pos` units of input is a known variable.
 // Returns the IdentifierBeforeDot token when the tracked scope contains it (property access,
 // e.g. obj.prop); returns null otherwise so the caller can treat it as a Word (e.g. file.txt).
 const checkForDotGet = (input: InputStream, stack: Stack, pos: number): number | null => {
  const context = stack.context as ScopeContext | undefined

  // Without a tracked scope nothing can be "in scope"
  if (!context?.scope) return null

  const name = buildIdentifierText(input, pos)
  return context.scope.has(name) ? IdentifierBeforeDot : null
 }
 // Pick AssignableIdentifier or Identifier for a valid identifier, based on what the parser
 // can shift in its current state and, when both are acceptable, on a peek-ahead for '='.
 const chooseIdentifierToken = (input: InputStream, stack: Stack): number => {
  const assignableOk = stack.canShift(AssignableIdentifier)
  const regularOk = stack.canShift(Identifier)

  // Exactly one of the two is shiftable - the grammar has already decided
  if (assignableOk !== regularOk) return assignableOk ? AssignableIdentifier : Identifier

  // Ambiguous context (e.g. statement start, where both `x = 5` and `echo x` are valid):
  // skip whatever isWhiteSpace accepts, then look at the first character after it
  let offset = 0
  for (
    let ch = getFullCodePoint(input, offset);
    isWhiteSpace(ch);
    ch = getFullCodePoint(input, offset)
  ) {
    offset += getCharSize(ch)
  }

  const following = getFullCodePoint(input, offset)
  if (following === 61 /* = */) return AssignableIdentifier
  return Identifier
 }
 // Character classification helpers
 const isWhiteSpace = (ch: number): boolean => {
-  return ch === 32 /* space */ || ch === 10 /* \n */ || ch === 9 /* tab */ || ch === 13 /* \r */
+  return ch === 32 /* space */ || ch === 9 /* tab */ || ch === 13 /* \r */
 }
 const isWordChar = (ch: number): boolean => {
-  const closingParen = ch === 41 /* ) */
+  return !isWhiteSpace(ch) && ch !== 10 /* \n */ && ch !== 41 /* ) */ && ch !== -1 /* EOF */
  const eof = ch === -1
  return !isWhiteSpace(ch) && !closingParen && !eof
 }
 const isLowercaseLetter = (ch: number): boolean => {
@ -154,7 +201,7 @@ const getFullCodePoint = (input: InputStream, pos: number): number => {
    }
  }
-  return ch // Single code unit
+  return ch
 }
 const isEmoji = (ch: number): boolean => {