shrimp/src/parser/tokenizer2.ts

import { isDebug } from '#utils/utils'

/** A single lexical token produced by `Scanner.tokenize`. */
export type Token = {
  /** Lexical category of the token. */
  type: TokenType
  /** Source text of the token; only populated for token types listed in `valueTokens`. */
  value?: string
  /** Start offset into the input string (inclusive, in UTF-16 code units). */
  from: number
  /** End offset into the input string (exclusive, in UTF-16 code units). */
  to: number
}

/**
 * Lexical categories emitted by the scanner.
 *
 * This is a numeric enum, so member order determines the numeric values; the scanner's
 * debug output relies on the reverse mapping (`TokenType[n]`) to print member names.
 */
export enum TokenType {
  // `#` up to (not including) the end of the line
  Comment,

  // a word found in the `keywords` set
  Keyword,
  // a word found in the `operators` set, or a `.` used for property access
  Operator,

  // `\n`; only emitted outside of parens and brackets
  Newline,
  Semicolon,
  Colon,
  // a word that is exactly `_`
  Underscore,

  OpenParen,
  CloseParen,
  OpenBracket,
  CloseBracket,

  // a word made up solely of identifier characters
  Identifier,
  // any other run of word characters (also the fallback for invalid numbers and regexes)
  Word,
  // a non-identifier word ending in `=`
  NamedArgPrefix,

  // the literal `null`
  Null,
  // the literals `true` and `false`
  Boolean,
  Number,
  // quoted ('...' or "...") and curly ({...}) strings
  String,
  // `//pattern//flags`
  Regex,
}

// Token types whose source text is copied into `Token.value` when the token is pushed.
// Every other type is fully described by its `type` and offsets.
const valueTokens = new Set<TokenType>([
  // trivia, keywords and operators
  TokenType.Comment, TokenType.Keyword, TokenType.Operator,
  // names
  TokenType.Identifier, TokenType.Word, TokenType.NamedArgPrefix,
  // literals
  TokenType.Boolean, TokenType.Number, TokenType.String, TokenType.Regex,
  // placeholder
  TokenType.Underscore,
])

// Every spelling that is classified as an Operator token when it appears as a whole word.
const operators = new Set<string>([
  // assignment
  '=',
  // logic
  'or', 'and',
  // bitwise
  'band', 'bor', 'bxor', '>>>', '>>', '<<',
  // compound assignment
  '??=', '+=', '-=', '*=', '/=', '%=',
  // nullish
  '??',
  // math
  '**', '*', '/', '+', '-', '%',
  // comparison
  '>=', '<=', '!=', '==', '>', '<',
  // property access
  '.',
  // pipe
  '|',
])

// Reserved words: a whole word found in this set is classified as a Keyword token.
// (The original list contained 'if' twice; the duplicate was a no-op and has been removed.)
const keywords = new Set([
  'import',
  'end',
  'do',
  'if',
  'while',
  'else',
  'try',
  'catch',
  'finally',
  'throw',
  'not',
])

// helper: tagged template that evaluates to the char code of the first character of the
// interpolated text, e.g. c`#` is 35 (an empty template yields NaN)
function c(strings: TemplateStringsArray, ...values: any[]) {
  let text = ''
  strings.forEach((chunk, i) => {
    // null/undefined interpolations (including the missing one after the last chunk) add nothing
    text += chunk + (values[i] ?? '')
  })
  return text.charCodeAt(0)
}

/**
 * Converts a character code back into a one-character string (the inverse of the `c` tag).
 *
 * @param c a UTF-16 code unit or a full Unicode code point
 * @returns the corresponding string
 */
function s(c: number): string {
  // String.fromCharCode keeps only the low 16 bits, which would mangle code points above
  // the BMP (e.g. emoji); those must go through String.fromCodePoint instead.
  // fromCodePoint throws on non-integers and out-of-range values, hence the guard.
  const isAstral = Number.isInteger(c) && c > 0xffff && c <= 0x10ffff
  return isAstral ? String.fromCodePoint(c) : String.fromCharCode(c)
}

/**
 * Hand-written scanner that turns shrimp source text into a flat list of tokens.
 *
 * State model: `char` is the code point most recently consumed by `next()` and `pos` is
 * the UTF-16 offset just past it, so the current character starts at
 * `pos - getCharSize(char)`. A `char` of 0 means end of input.
 */
export class Scanner {
  /** Source text currently being tokenized. */
  input = ''
  /** UTF-16 offset of the next unread code unit (just past `char`). */
  pos = 0
  /** Offset at which the token currently being read begins. */
  start = 0
  /** Code point most recently consumed by `next()`; 0 at end of input. */
  char = 0
  /** Code point that was consumed immediately before `char`. */
  prev = 0
  /** Number of currently open `(`; Newline tokens are suppressed while it is non-zero. */
  inParen = 0
  /** Number of currently open `[`; Newline tokens are suppressed while it is non-zero. */
  inBracket = 0
  /** Tokens produced so far. The same array instance is reused across `tokenize` calls. */
  tokens: Token[] = []
  /** Whether the character before `char` was whitespace (true at the start of input). */
  prevIsWhitespace = true

  /** Restores the scanner to its initial state so it can tokenize a new input. */
  reset() {
    this.input = ''
    this.pos = 0
    this.start = 0
    this.char = 0
    this.prev = 0
    // nesting depth must not leak from a previous (possibly unbalanced) input, otherwise
    // Newline tokens would be suppressed in the next one
    this.inParen = 0
    this.inBracket = 0
    this.tokens.length = 0
    this.prevIsWhitespace = true
  }

  /**
   * Returns the code point `count` code units after the current read position without
   * consuming anything (0 when past the end of input).
   */
  peek(count = 0): number {
    return getFullCodePoint(this.input, this.pos + count)
  }

  /** Consumes one code point, updating `prev`, `char`, `pos` and `prevIsWhitespace`. */
  next(): number {
    this.prevIsWhitespace = isWhitespace(this.char)
    this.prev = this.char
    this.char = this.peek()
    this.pos += getCharSize(this.char)

    return this.char
  }

  /**
   * Appends a token and starts the next token at the current read position.
   *
   * @param type token type to emit
   * @param from start offset; defaults to `this.start`
   * @param to end offset (exclusive); defaults to the offset where `char` begins, i.e. the
   *   token ends just before the current character
   */
  push(type: TokenType, from?: number, to?: number) {
    from ??= this.start
    to ??= this.pos - getCharSize(this.char)
    if (to < from) to = from

    this.tokens.push(
      Object.assign(
        {},
        {
          type,
          from,
          to,
        },
        // only value-carrying token types get a copy of their source text
        valueTokens.has(type) ? { value: this.input.slice(from, to) } : {}
      )
    )

    if (isDebug()) {
      const tok = this.tokens.at(-1)
      console.log(`≫ PUSH(${from},${to})`, TokenType[tok?.type || 0], '—', tok?.value)
    }

    this.start = this.pos
  }

  /** Emits a one-code-unit token covering the current character. */
  pushChar(type: TokenType) {
    this.push(type, this.pos - 1, this.pos)
  }

  /**
   * Turns shrimp code into shrimp tokens that get fed into the parser.
   *
   * NOTE: the returned array is the scanner's own `tokens` array; it is cleared and reused
   * by the next call to `tokenize`, so copy it if it must outlive that call.
   *
   * @param input source text to scan
   * @returns the tokens found in `input`, in source order
   */
  tokenize(input: string): Token[] {
    this.reset()
    this.input = input
    this.next()

    // the order of the checks below matters: earlier checks take precedence
    while (this.char > 0) {
      const char = this.char

      if (char === c`#`) {
        this.readComment()
        continue
      }

      if (isBracket(char)) {
        this.readBracket()
        continue
      }

      if (isStringDelim(char)) {
        this.readString(char)
        continue
      }

      if (char === c`{`) {
        this.readCurlyString()
        continue
      }

      if (isIdentStart(char)) {
        this.readWordOrIdent(true) // true = started with identifier char
        continue
      }

      // a digit, or a sign immediately followed by a digit
      if (isDigit(char) || ((char === c`-` || char === c`+`) && isDigit(this.peek()))) {
        this.readNumber()
        continue
      }

      if (char === c`:`) {
        this.pushChar(TokenType.Colon)
        this.next()
        continue
      }

      // whitespace-sensitive dot as operator (property access) only after identifier/number
      if (char === c`.`) {
        if (this.canBeDotGet(this.tokens.at(-1))) {
          this.pushChar(TokenType.Operator)
          this.next()
          continue
        }
      }

      if (char === c`/` && this.peek() === c`/`) {
        this.readRegex()
        continue
      }

      if (isWordChar(char)) {
        this.readWordOrIdent(false) // false = didn't start with identifier char
        continue
      }

      if (char === c`\n`) {
        // newlines inside parens/brackets are not statement separators
        if (this.inParen === 0 && this.inBracket === 0) this.pushChar(TokenType.Newline)
        this.next()
        continue
      }

      if (char === c`;`) {
        this.pushChar(TokenType.Semicolon)
        this.next()
        continue
      }

      // anything else (e.g. spaces and tabs) is skipped
      this.next()
    }

    return this.tokens
  }

  /** Reads a `#` comment up to, but not including, the end of the line. */
  readComment() {
    this.start = this.pos - 1
    while (this.char !== c`\n` && this.char > 0) this.next()
    this.push(TokenType.Comment)
  }

  /** Emits the paren/bracket token for the current character and tracks nesting depth. */
  readBracket() {
    switch (this.char) {
      case c`(`:
        this.inParen++
        this.pushChar(TokenType.OpenParen)
        break
      case c`)`:
        // ignore unmatched closers so the depth never goes negative; a negative depth
        // would suppress every following Newline token
        if (this.inParen > 0) this.inParen--
        this.pushChar(TokenType.CloseParen)
        break
      case c`[`:
        this.inBracket++
        this.pushChar(TokenType.OpenBracket)
        break
      case c`]`:
        if (this.inBracket > 0) this.inBracket--
        this.pushChar(TokenType.CloseBracket)
        break
    }
    this.next()
  }

  /**
   * Reads a quoted string, including both delimiters, as a single String token.
   * An unterminated string runs to the end of input.
   *
   * @param delim char code of the opening quote
   */
  readString(delim: number) {
    this.start = this.pos - 1
    this.next() // skip opening delim
    // a delimiter directly preceded by a backslash does not close the string
    // NOTE(review): an escaped backslash right before the closing quote (e.g. 'a\\') is
    // also treated as escaping the quote — confirm whether that is intended
    while (this.char > 0 && (this.char !== delim || (this.char === delim && this.prev === c`\\`)))
      this.next()
    // skip closing delim; at end of input there is none, and advancing again would put the
    // token's `to` offset one past the end of the input
    if (this.char > 0) this.next()

    this.push(TokenType.String)
  }

  /** Reads a `{...}` string, honoring nested braces, as a single String token. */
  readCurlyString() {
    this.start = this.pos - 1
    let depth = 1
    this.next()

    while (depth > 0 && this.char > 0) {
      if (this.char === c`{`) depth++
      if (this.char === c`}`) depth--
      this.next()
    }

    this.push(TokenType.String)
  }

  /**
   * Reads a run of word characters and classifies it (keyword, operator, identifier, ...).
   *
   * @param startedWithIdentChar whether the first character is an identifier-start
   *   character; only then may a `.` split the word for property access
   */
  readWordOrIdent(startedWithIdentChar: boolean) {
    this.start = this.pos - getCharSize(this.char)

    while (isWordChar(this.char)) {
      // stop at colon if followed by whitespace (e.g., 'do x: echo x end')
      if (this.char === c`:`) {
        const nextCh = this.peek()
        if (isWhitespace(nextCh) || nextCh === 0) break
      }

      // stop at equal sign (named arg) - but only if what we've read so far is an identifier
      if (this.char === c`=`) {
        const soFar = this.input.slice(this.start, this.pos - getCharSize(this.char))
        if (isIdentifer(soFar)) {
          // consume the `=` so it becomes part of the word
          this.next()
          break
        }
      }

      // stop at dot only if it would create a valid property access
      // AND only if we started with an identifier character (not for Words like README.txt)
      if (startedWithIdentChar && this.char === c`.`) {
        const nextCh = this.peek()
        if (isIdentStart(nextCh) || isDigit(nextCh) || nextCh === c`(`) {
          const soFar = this.input.slice(this.start, this.pos - getCharSize(this.char))
          if (isIdentifer(soFar)) break
        }
      }

      this.next()
    }

    const word = this.input.slice(this.start, this.pos - getCharSize(this.char))

    // classify the token based on what we read
    if (word === '_') this.push(TokenType.Underscore)
    else if (word === 'null') this.push(TokenType.Null)
    else if (word === 'true' || word === 'false') this.push(TokenType.Boolean)
    else if (isKeyword(word)) this.push(TokenType.Keyword)
    else if (isOperator(word)) this.push(TokenType.Operator)
    else if (isIdentifer(word)) this.push(TokenType.Identifier)
    else if (word.endsWith('=')) this.push(TokenType.NamedArgPrefix)
    else this.push(TokenType.Word)
  }

  /**
   * Reads a word that starts with a digit or sign; emits Number if the text is a valid
   * number literal and Word otherwise.
   */
  readNumber() {
    this.start = this.pos - 1
    while (isWordChar(this.char)) {
      // stop at dot unless it's part of the number
      if (this.char === c`.`) {
        const nextCh = this.peek()
        if (!isDigit(nextCh)) break
      }

      // stop at colon
      if (this.char === c`:`) {
        const nextCh = this.peek()
        if (isWhitespace(nextCh) || nextCh === 0) break
      }
      this.next()
    }
    const ident = this.input.slice(this.start, this.pos - 1)
    this.push(isNumber(ident) ? TokenType.Number : TokenType.Word)
  }

  /**
   * Reads a `//pattern//flags` regex literal. Text that is delimited but is not a valid
   * regex becomes a Word; a `//` with no closing `//` is re-read as an ordinary word.
   */
  readRegex() {
    // remember where we are so an unterminated `//` can be re-read as a plain word
    const resumePos = this.pos
    const resumeChar = this.char
    const resumePrev = this.prev
    const resumePrevIsWhitespace = this.prevIsWhitespace

    this.start = this.pos - 1
    this.next() // skip 2nd /

    while (this.char > 0) {
      if (this.char === c`/` && this.peek() === c`/`) {
        this.next() // skip /
        this.next() // skip /

        // read regex flags
        while (this.char > 0 && isIdentStart(this.char)) this.next()

        // validate regex
        const to = this.pos - getCharSize(this.char)
        const regexText = this.input.slice(this.start, to)
        const [_, pattern, flags] = regexText.match(/^\/\/(.*)\/\/([gimsuy]*)$/) || []

        if (pattern) {
          try {
            new RegExp(pattern, flags)
            this.push(TokenType.Regex)
            return
          } catch {
            // invalid regex - fall through to Word
          }
        }

        // invalid regex is treated as Word
        this.push(TokenType.Word)
        return
      }

      this.next()
    }

    // No closing `//` before end of input (e.g. a path such as //server/share). Scanning
    // on would swallow the rest of the input without emitting any token, so rewind to the
    // first slash and read an ordinary word instead.
    this.pos = resumePos
    this.char = resumeChar
    this.prev = resumePrev
    this.prevIsWhitespace = resumePrevIsWhitespace
    this.readWordOrIdent(false)
  }

  /**
   * Decides whether a `.` at the current position is a property-access operator: it must
   * directly follow (no whitespace) an identifier, number, `)` or `]`.
   *
   * @param lastToken the most recently emitted token, if any
   */
  canBeDotGet(lastToken?: Token): boolean {
    return (
      !this.prevIsWhitespace &&
      !!lastToken &&
      (lastToken.type === TokenType.Identifier ||
        lastToken.type === TokenType.Number ||
        lastToken.type === TokenType.CloseParen ||
        lastToken.type === TokenType.CloseBracket)
    )
  }
}

// Number literal grammars. Digit groups may be separated by single underscores.
//
// The separators are written as `(_digits)*` rather than `(_?digits)*`: both accept the
// same strings, but the optional underscore lets the regex engine split a run of digits
// in exponentially many ways, so a long digit run followed by a non-digit
// (e.g. "1111111111111111111111111111111x") would effectively hang the tokenizer.
const NUMBER_PATTERNS: readonly RegExp[] = [
  // regular number
  /^[+-]?\d+(_\d+)*(\.\d+(_\d+)*)?$/,
  // binary
  /^[+-]?0b[01]+(_[01]+)*(\.[01]_?[01]*)?$/,
  // octal
  /^[+-]?0o[0-7]+(_[0-7]+)*(\.[0-7]_?[0-7]*)?$/,
  // hex (integer part now follows the same underscore rule as the other bases, so
  // short groups such as 0xf_f are accepted too)
  /^[+-]?0x[0-9a-f]+(_[0-9a-f]+)*(\.[0-9a-f]_?[0-9a-f]*)?$/i,
]

/**
 * Reports whether `word` is a valid decimal, binary (0b), octal (0o) or hex (0x) number
 * literal, with optional sign and optional fractional part.
 */
const isNumber = (word: string): boolean => {
  return NUMBER_PATTERNS.some((pattern) => pattern.test(word))
}

/**
 * Reports whether `s` is a well-formed identifier: a non-empty string whose first code
 * point is an identifier-start character and whose remaining code points are identifier
 * characters.
 */
const isIdentifer = (s: string): boolean => {
  if (s.length === 0) return false

  // decode into full code points so a surrogate pair counts as a single character
  const codePoints: number[] = []
  for (let offset = 0; offset < s.length; ) {
    const codePoint = getFullCodePoint(s, offset)
    codePoints.push(codePoint)
    offset += getCharSize(codePoint)
  }

  const [first, ...rest] = codePoints
  if (!isIdentStart(first!)) return false
  if (rest.length === 0) return true

  // the final character is checked with its own predicate, everything between with isIdentChar
  const last = rest.pop()!
  return rest.every(isIdentChar) && isIdentEnd(last)
}

/** Reports whether `ch` opens/closes a quoted string (single or double quote). */
const isStringDelim = (ch: number): boolean => {
  if (ch === c`'`) return true
  return ch === c`"`
}

/**
 * Reports whether `char` may begin an identifier: a lowercase ASCII letter, `$`, or one
 * of the emoji/Unicode ranges accepted by `isEmojiOrUnicode`.
 *
 * @param char a code point, or a string whose first character is tested
 */
export const isIdentStart = (char: number | string): boolean => {
  // codePointAt (not charCodeAt) so that a string starting with an astral character such
  // as an emoji yields the full code point instead of a lone high surrogate.
  // An empty string maps to NaN, which fails every check below.
  const ch = typeof char === 'string' ? (char.codePointAt(0) ?? NaN) : char
  return isLowercaseLetter(ch) || isEmojiOrUnicode(ch) || ch === 36 /* $ */
}

/**
 * Reports whether `char` may appear inside an identifier: anything that can start one,
 * plus digits, `-` and `?`.
 *
 * @param char a code point, or a string whose first character is tested
 */
export const isIdentChar = (char: number | string): boolean => {
  // codePointAt (not charCodeAt) so that a string starting with an astral character such
  // as an emoji yields the full code point instead of a lone high surrogate.
  // An empty string maps to NaN, which fails every check below.
  const ch = typeof char === 'string' ? (char.codePointAt(0) ?? NaN) : char
  return isIdentStart(ch) || isDigit(ch) || ch === 45 /* - */ || ch === 63 /* ? */
}

/** Reports whether `char` may end an identifier; currently the same rule as `isIdentChar`. */
const isIdentEnd = (char: number | string): boolean => isIdentChar(char)

/** Reports whether `ch` is the char code of an ASCII lowercase letter (a-z). */
const isLowercaseLetter = (ch: number): boolean => {
  const lowerA = 97
  const lowerZ = 122
  return lowerA <= ch && ch <= lowerZ
}

/** Reports whether `ch` is the char code of an ASCII digit (0-9). */
const isDigit = (ch: number): boolean => {
  const zero = 48
  const nine = 57
  return zero <= ch && ch <= nine
}

/**
 * Reports whether `ch` counts as whitespace for the scanner. The end-of-input markers
 * (0 and -1) are deliberately included.
 */
const isWhitespace = (ch: number): boolean => {
  switch (ch) {
    case 32: // space
    case 9: // tab
    case 13: // \r
    case 10: // \n
    case -1: // EOF
    case 0: // EOF
      return true
    default:
      return false
  }
}

/**
 * Reports whether `ch` can be part of a word: anything except whitespace, end of input,
 * `;`, `(`, `)` and `]`.
 */
const isWordChar = (ch: number): boolean => {
  if (isWhitespace(ch)) return false

  switch (ch) {
    case 10: // \n
    case 59: // ;
    case 40: // (
    case 41: // )
    case 93: // ]
    case -1: // EOF
      return false
    default:
      return true
  }
}

/** Reports whether `word` is, in its entirety, one of the known operators. */
const isOperator = (word: string): boolean => operators.has(word)

/** Reports whether `word` is, in its entirety, one of the reserved keywords. */
const isKeyword = (word: string): boolean => keywords.has(word)

/** Reports whether `char` is one of `(`, `)`, `[` or `]`. */
const isBracket = (char: number): boolean => {
  switch (char) {
    case c`(`:
    case c`)`:
    case c`[`:
    case c`]`:
      return true
    default:
      return false
  }
}

/**
 * Number of UTF-16 code units the code point `ch` occupies in a string: 2 for code points
 * above the BMP (e.g. emoji), 1 for everything else.
 */
const getCharSize = (ch: number) => {
  if (ch > 0xffff) return 2
  return 1
}

/**
 * Returns the Unicode code point that starts at UTF-16 offset `pos` of `input`.
 *
 * A valid surrogate pair is combined into a single code point; a lone surrogate is
 * returned as-is. Offsets outside the string yield 0.
 */
const getFullCodePoint = (input: string, pos: number): number => {
  const unit = input[pos]?.charCodeAt(0) || 0

  // only a high surrogate (0xD800-0xDBFF) can begin a pair
  const isHighSurrogate = unit >= 0xd800 && unit <= 0xdbff
  if (!isHighSurrogate) return unit

  // it must be followed by a low surrogate (0xDC00-0xDFFF) to form a pair
  const trailing = input[pos + 1]?.charCodeAt(0) || 0
  const isLowSurrogate = trailing >= 0xdc00 && trailing <= 0xdfff
  if (!isLowSurrogate) return unit

  // combine the low 10 bits of each half into the full code point
  return 0x10000 + ((unit & 0x3ff) << 10) + (trailing & 0x3ff)
}

// Inclusive [first, last] code point ranges that are accepted as identifier characters
// beyond plain ASCII. Block names follow the labels used when the ranges were chosen.
const IDENT_UNICODE_RANGES: ReadonlyArray<readonly [number, number]> = [
  [0x1f600, 0x1f64f], // basic emoticons
  [0x1f300, 0x1f5ff], // miscellaneous symbols and pictographs
  [0x1f680, 0x1f6ff], // transport and map symbols
  [0x1f1e6, 0x1f1ff], // regional indicator symbols (flags)
  [0x2600, 0x26ff], // miscellaneous symbols (hearts, stars, weather)
  [0x2700, 0x27bf], // dingbats (scissors, pencils, etc)
  [0x1f900, 0x1f9ff], // supplemental symbols and pictographs (newer emojis)
  [0x1fa70, 0x1faff], // symbols and pictographs extended-A (newest emojis)
  [0x1f018, 0x1f270], // various Asian characters with emoji presentation
  [0xfe00, 0xfe0f], // variation selectors (for emoji presentation)
  [0x238c, 0x2454], // additional miscellaneous items
  [0x20d0, 0x20ff], // combining diacritical marks for symbols
  [0x00a0, 0x00ff], // Latin-1 supplement (includes ², ³, ¹ and other special chars)
  [0x0370, 0x03ff], // Greek and Coptic
  [0x1d400, 0x1d7ff], // mathematical alphanumeric symbols
  [0x2200, 0x22ff], // mathematical operators
  [0x2070, 0x209f], // superscripts and subscripts
  [0x2190, 0x21ff], // arrows
  [0x3040, 0x309f], // Hiragana
  [0x30a0, 0x30ff], // Katakana
  [0x4e00, 0x9fff], // CJK unified ideographs
]

/** Reports whether code point `ch` falls inside any of the accepted non-ASCII ranges. */
const isEmojiOrUnicode = (ch: number): boolean => {
  for (const [first, last] of IDENT_UNICODE_RANGES) {
    if (ch >= first && ch <= last) return true
  }
  return false
}