wip

2025-10-17 21:13:49 -07:00 · 2025-10-17 21:13:49 -07:00 · 78ae96fc72
commit 78ae96fc72
parent b0d5a7f50c
2 changed files with 169 additions and 138 deletions
--- a/src/parser/scopeTracker.ts
+++ b/src/parser/scopeTracker.ts
@ -2,23 +2,20 @@ import { ContextTracker, InputStream } from '@lezer/lr'
 import * as terms from './shrimp.terms'

 export class Scope {
-  constructor(
-    public parent: Scope | null,
-    public vars: Set<string>
-  ) {}
+  constructor(public parent: Scope | null, public vars = new Set<string>()) {}

  has(name: string): boolean {
-    return this.vars.has(name) || (this.parent?.has(name) ?? false)
+    return this.vars.has(name) ?? this.parent?.has(name)
  }

  add(...names: string[]): Scope {
    const newVars = new Set(this.vars)
-    names.forEach(name => newVars.add(name))
+    names.forEach((name) => newVars.add(name))
    return new Scope(this.parent, newVars)
  }

  push(): Scope {
-    return new Scope(this, new Set())
+    return new Scope(this)
  }

  pop(): Scope {
@ -43,10 +40,7 @@ export class Scope {

 // Wrapper that adds temporary state for identifier capture
 export class ScopeContext {
-  constructor(
-    public scope: Scope,
-    public pendingIds: string[] = []
-  ) {}
+  constructor(public scope: Scope, public pendingIds: string[] = []) {}

  // Helper to append identifier to pending list
  withPending(id: string): ScopeContext {
@ -57,24 +51,19 @@ export class ScopeContext {
  consumeLast(): ScopeContext {
    const varName = this.pendingIds.at(-1)
    if (!varName) return this
-    return new ScopeContext(
-      this.scope.add(varName),
-      this.pendingIds.slice(0, -1)
-    )
+    return new ScopeContext(this.scope.add(varName), this.pendingIds.slice(0, -1))
  }

  // Helper to consume all pending identifiers and add to new scope
  consumeAll(): ScopeContext {
-    const newScope = this.scope.push()
-    return new ScopeContext(
-      this.pendingIds.length > 0 ? newScope.add(...this.pendingIds) : newScope,
-      []
-    )
+    let newScope = this.scope.push()
+    newScope = this.pendingIds.length > 0 ? newScope.add(...this.pendingIds) : newScope
+    return new ScopeContext(newScope)
  }

  // Helper to clear pending without adding to scope
  clearPending(): ScopeContext {
-    return new ScopeContext(this.scope, [])
+    return new ScopeContext(this.scope)
  }
 }

@ -94,24 +83,19 @@ export const trackScope = new ContextTracker<ScopeContext>({
  start: new ScopeContext(new Scope(null, new Set())),

  shift(context, term, stack, input) {
-    // Only capture AssignableIdentifier tokens
-    if (term === terms.AssignableIdentifier) {
-      const text = readIdentifierText(input, input.pos, stack.pos)
-      return context.withPending(text)
-    }
-    return context
+    if (term !== terms.AssignableIdentifier) return context
+
+    const text = readIdentifierText(input, input.pos, stack.pos)
+    return context.withPending(text)
  },

  reduce(context, term) {
-    // Add assignment variable to scope
    if (term === terms.Assign) return context.consumeLast()
-
-    // Push new scope and add all parameters
    if (term === terms.Params) return context.consumeAll()

    // Pop scope when exiting function
    if (term === terms.FunctionDef) {
-      return new ScopeContext(context.scope.pop(), [])
+      return new ScopeContext(context.scope.pop())
    }

    return context
--- a/src/parser/tokenizer.ts
+++ b/src/parser/tokenizer.ts
@ -6,115 +6,43 @@ import type { ScopeContext } from './scopeTracker'

 export const tokenizer = new ExternalTokenizer(
  (input: InputStream, stack: Stack) => {
-    let ch = getFullCodePoint(input, 0)
+    const ch = getFullCodePoint(input, 0)
    if (!isWordChar(ch)) return

-    let pos = getCharSize(ch)
-    let isValidIdentifier = isLowercaseLetter(ch) || isEmoji(ch)
+    const isValidStart = isLowercaseLetter(ch) || isEmoji(ch)
    const canBeWord = stack.canShift(Word)

-    while (true) {
-      ch = getFullCodePoint(input, pos)
+    // Consume all word characters, tracking if it remains a valid identifier
+    const { pos, isValidIdentifier, stoppedAtDot } = consumeWordToken(
+      input,
+      isValidStart,
+      canBeWord
+    )

-      // Check for dot and scope - property access detection
-      if (ch === 46 /* . */ && isValidIdentifier) {
-        // Build identifier text by peeking character by character
-        let identifierText = ''
-        for (let i = 0; i < pos; i++) {
-          const charCode = input.peek(i)
-          if (charCode === -1) break
-          // Handle surrogate pairs for emoji
-          if (charCode >= 0xd800 && charCode <= 0xdbff && i + 1 < pos) {
-            const low = input.peek(i + 1)
-            if (low >= 0xdc00 && low <= 0xdfff) {
-              identifierText += String.fromCharCode(charCode, low)
-              i++ // Skip the low surrogate
-              continue
-            }
-          }
-          identifierText += String.fromCharCode(charCode)
-        }
+    // Check if we should emit IdentifierBeforeDot for property access
+    if (stoppedAtDot) {
+      const dotGetToken = checkForDotGet(input, stack, pos)

-        const scopeContext = stack.context as ScopeContext | undefined
-        const scope = scopeContext?.scope
-
-        if (scope?.has(identifierText)) {
-          // In scope - stop here, let grammar parse property access
-          input.advance(pos)
-          input.acceptToken(IdentifierBeforeDot)
-          return
-        }
-        // Not in scope - continue consuming as Word (fall through)
-      }
-
-      if (!isWordChar(ch)) break
-
-      // Certain characters might end a word or identifier if they are followed by whitespace.
-      // This allows things like `a = hello; 2` of if `x: y` to parse correctly.
-      if (canBeWord && (ch === 59 /* ; */ || ch === 58) /* : */) {
-        const nextCh = getFullCodePoint(input, pos + 1)
-        if (!isWordChar(nextCh)) break
-      }
-
-      // Track identifier validity
-      if (!isLowercaseLetter(ch) && !isDigit(ch) && ch !== 45 && !isEmoji(ch)) {
-        if (!canBeWord) break
-        isValidIdentifier = false
-      }
-
-      pos += getCharSize(ch)
-    }
-
-    // Build identifier text BEFORE advancing (for debug and peek-ahead)
-    let identifierText = ''
-    if (isValidIdentifier) {
-      for (let i = 0; i < pos; i++) {
-        const charCode = input.peek(i)
-        if (charCode === -1) break
-        if (charCode >= 0xd800 && charCode <= 0xdbff && i + 1 < pos) {
-          const low = input.peek(i + 1)
-          if (low >= 0xdc00 && low <= 0xdfff) {
-            identifierText += String.fromCharCode(charCode, low)
-            i++
-            continue
-          }
-        }
-        identifierText += String.fromCharCode(charCode)
-      }
-    }
-
-    input.advance(pos)
-    if (isValidIdentifier) {
-      const canAssignable = stack.canShift(AssignableIdentifier)
-      const canRegular = stack.canShift(Identifier)
-
-      if (canAssignable && !canRegular) {
-        // Only AssignableIdentifier valid (e.g., in Params)
-        input.acceptToken(AssignableIdentifier)
-      } else if (canRegular && !canAssignable) {
-        // Only Identifier valid (e.g., in function args)
-        input.acceptToken(Identifier)
+      if (dotGetToken) {
+        input.advance(pos)
+        input.acceptToken(dotGetToken)
      } else {
-        // BOTH possible (ambiguous) - peek ahead for '='
-        // Note: we're peeking from current position (after advance), so start at 0
-        let peekPos = 0
-        // Skip whitespace (space, tab, CR, but NOT newline - assignment must be on same line)
-        while (true) {
-          const ch = getFullCodePoint(input, peekPos)
-          if (ch === 32 || ch === 9 || ch === 13) { // space, tab, CR
-            peekPos += getCharSize(ch)
-          } else {
-            break
-          }
-        }
-        // Check if next non-whitespace char is '='
-        const nextCh = getFullCodePoint(input, peekPos)
-        if (nextCh === 61 /* = */) {
-          input.acceptToken(AssignableIdentifier)
-        } else {
-          input.acceptToken(Identifier)
-        }
+        // Not in scope - continue consuming the dot as part of the word
+        const afterDot = consumeRestOfWord(input, pos + 1, canBeWord)
+        input.advance(afterDot)
+        input.acceptToken(Word)
      }
+
+      return
+    }
+
+    // Advance past the token we consumed
+    input.advance(pos)
+
+    // Choose which token to emit
+    if (isValidIdentifier) {
+      const token = chooseIdentifierToken(input, stack)
+      input.acceptToken(token)
    } else {
      input.acceptToken(Word)
    }
@ -122,15 +50,134 @@ export const tokenizer = new ExternalTokenizer(
  { contextual: true }
 )

+// Build identifier text from input stream, handling surrogate pairs for emoji
+const buildIdentifierText = (input: InputStream, length: number): string => {
+  let text = ''
+  for (let i = 0; i < length; i++) {
+    const charCode = input.peek(i)
+    if (charCode === -1) break
+
+    // Handle surrogate pairs for emoji (UTF-16 encoding)
+    if (charCode >= 0xd800 && charCode <= 0xdbff && i + 1 < length) {
+      const low = input.peek(i + 1)
+      if (low >= 0xdc00 && low <= 0xdfff) {
+        text += String.fromCharCode(charCode, low)
+        i++ // Skip the low surrogate
+        continue
+      }
+    }
+    text += String.fromCharCode(charCode)
+  }
+  return text
+}
+
+// Consume word characters, tracking if it remains a valid identifier
+// Returns the position after consuming, whether it's a valid identifier, and if we stopped at a dot
+const consumeWordToken = (
+  input: InputStream,
+  isValidStart: boolean,
+  canBeWord: boolean
+): { pos: number; isValidIdentifier: boolean; stoppedAtDot: boolean } => {
+  let pos = getCharSize(getFullCodePoint(input, 0))
+  let isValidIdentifier = isValidStart
+  let stoppedAtDot = false
+
+  while (true) {
+    const ch = getFullCodePoint(input, pos)
+
+    // Stop at dot if we have a valid identifier (might be property access)
+    if (ch === 46 /* . */ && isValidIdentifier) {
+      stoppedAtDot = true
+      break
+    }
+
+    // Stop if we hit a non-word character
+    if (!isWordChar(ch)) break
+
+    // Context-aware termination: semicolon/colon can end a word if followed by whitespace
+    // This allows `hello; 2` to parse correctly while `hello;world` stays as one word
+    if (canBeWord && (ch === 59 /* ; */ || ch === 58) /* : */) {
+      const nextCh = getFullCodePoint(input, pos + 1)
+      if (!isWordChar(nextCh)) break
+    }
+
+    // Track identifier validity: must be lowercase, digit, dash, or emoji
+    if (!isLowercaseLetter(ch) && !isDigit(ch) && ch !== 45 /* - */ && !isEmoji(ch)) {
+      if (!canBeWord) break
+      isValidIdentifier = false
+    }
+
+    pos += getCharSize(ch)
+  }
+
+  return { pos, isValidIdentifier, stoppedAtDot }
+}
+
+// Consume the rest of a word after we've decided not to treat a dot as DotGet
+// Used when we have "file.txt" - we already consumed "file", now consume ".txt"
+const consumeRestOfWord = (input: InputStream, startPos: number, canBeWord: boolean): number => {
+  let pos = startPos
+  while (true) {
+    const ch = getFullCodePoint(input, pos)
+
+    // Stop if we hit a non-word character
+    if (!isWordChar(ch)) break
+
+    // Context-aware termination for semicolon/colon
+    if (canBeWord && (ch === 59 /* ; */ || ch === 58) /* : */) {
+      const nextCh = getFullCodePoint(input, pos + 1)
+      if (!isWordChar(nextCh)) break
+    }
+
+    pos += getCharSize(ch)
+  }
+  return pos
+}
+
+// Check if this identifier is in scope (for property access detection)
+// Returns IdentifierBeforeDot token if in scope, null otherwise
+const checkForDotGet = (input: InputStream, stack: Stack, pos: number): number | null => {
+  const identifierText = buildIdentifierText(input, pos)
+  const scopeContext = stack.context as ScopeContext | undefined
+  const scope = scopeContext?.scope
+
+  // If identifier is in scope, this is property access (e.g., obj.prop)
+  // If not in scope, it should be consumed as a Word (e.g., file.txt)
+  return scope?.has(identifierText) ? IdentifierBeforeDot : null
+}
+
+// Decide between AssignableIdentifier and Identifier using grammar state + peek-ahead
+const chooseIdentifierToken = (input: InputStream, stack: Stack): number => {
+  const canAssignable = stack.canShift(AssignableIdentifier)
+  const canRegular = stack.canShift(Identifier)
+
+  // Only one option is valid - use it
+  if (canAssignable && !canRegular) return AssignableIdentifier
+  if (canRegular && !canAssignable) return Identifier
+
+  // Both possible (ambiguous context) - peek ahead for '=' to disambiguate
+  // This happens at statement start where both `x = 5` (assign) and `echo x` (call) are valid
+  let peekPos = 0
+  while (true) {
+    const ch = getFullCodePoint(input, peekPos)
+    if (isWhiteSpace(ch)) {
+      peekPos += getCharSize(ch)
+    } else {
+      break
+    }
+  }
+
+  const nextCh = getFullCodePoint(input, peekPos)
+  return nextCh === 61 /* = */ ? AssignableIdentifier : Identifier
+}
+
+// Character classification helpers
 const isWhiteSpace = (ch: number): boolean => {
-  return ch === 32 /* space */ || ch === 10 /* \n */ || ch === 9 /* tab */ || ch === 13 /* \r */
+  return ch === 32 /* space */ || ch === 9 /* tab */ || ch === 13 /* \r */
 }

 const isWordChar = (ch: number): boolean => {
-  const closingParen = ch === 41 /* ) */
-  const eof = ch === -1
-
-  return !isWhiteSpace(ch) && !closingParen && !eof
+  return !isWhiteSpace(ch) && ch !== 10 /* \n */ && ch !== 41 /* ) */ && ch !== -1 /* EOF */
 }

 const isLowercaseLetter = (ch: number): boolean => {
@ -154,7 +201,7 @@ const getFullCodePoint = (input: InputStream, pos: number): number => {
    }
  }

-  return ch // Single code unit
+  return ch
 }

 const isEmoji = (ch: number): boolean => {