shrimp/src/parser/tokenizer2.ts

// Debug switch: the raw DEBUG environment variable when it is set to a
// non-empty string, otherwise `false`. Enables token logging in Scanner.push.
const DEBUG: string | false = process.env.DEBUG || false

/** A single lexical token emitted by the scanner. */
export type Token = {
  /** Kind of token. */
  type: TokenType
  /** Source text `input.slice(from, to)`; only set for types in `valueTokens`. */
  value?: string
  /** Offset into the input string where the token starts. */
  from: number
  /** Offset into the input string where the token ends (exclusive for `value`). */
  to: number
}

/**
 * Every kind of token the scanner can emit.
 * Numeric enum: values are assigned in declaration order starting at 0.
 */
export enum TokenType {
  // `#` up to (not including) the end of the line
  Comment = 0,

  // reserved words and operators
  Keyword,
  Operator,

  // separators and punctuation
  Newline,
  Semicolon,
  Colon,
  Underscore,

  // grouping
  OpenParen,
  CloseParen,
  OpenBracket,
  CloseBracket,

  // names and bare words
  Identifier,
  Word,
  NamedArgPrefix,

  // literals
  Null,
  Boolean,
  Number,
  String,
}

// Token types for which Scanner.push records the matched source text as `value`.
const valueTokens = new Set<TokenType>([
  TokenType.Comment,
  TokenType.Keyword,
  TokenType.Operator,
  TokenType.Identifier,
  TokenType.Word,
  TokenType.NamedArgPrefix,
  TokenType.Boolean,
  TokenType.Number,
  TokenType.String,
])

// Every spelling the scanner classifies as TokenType.Operator.
const operators = new Set<string>([
  '=',                                       // assignment
  'or', 'and',                               // logic
  'band', 'bor', 'bxor', '>>>', '>>', '<<',  // bitwise
  '??=', '+=', '-=', '*=', '/=', '%=',       // compound assignment
  '??',                                      // nullish
  '**', '*', '/', '+', '-', '%',             // math
  '>=', '<=', '!=', '==', '>', '<',          // comparison
])

// Reserved words the scanner classifies as TokenType.Keyword.
const keywords = new Set<string>([
  'import',
  'end', 'do',                         // block delimiters
  'if', 'while', 'else',               // control flow
  'try', 'catch', 'finally', 'throw',  // error handling
])

/**
 * Tagged-template helper: c`#` evaluates to the code point of the first
 * character of the (cooked) template text, e.g. c`\n` is 10.
 *
 * Uses `codePointAt` so that a character outside the BMP yields its full code
 * point (the same representation the scanner compares against) rather than
 * only its high surrogate. Results for BMP characters are unchanged.
 *
 * @returns the first code point, or NaN when the template text is empty.
 */
function c(strings: TemplateStringsArray, ...values: any[]) {
  const text = strings.reduce((result, str, i) => result + str + (values[i] ?? ""), "")
  return text.codePointAt(0) ?? NaN
}

/**
 * Inverse of the `c` helper: turns a character code back into a string.
 *
 * Code points above 0xFFFF (which the scanner produces for surrogate pairs)
 * are converted with `String.fromCodePoint`; `String.fromCharCode` would
 * silently truncate them to 16 bits. All other inputs keep the original
 * `fromCharCode` behavior, so this never throws.
 */
function s(c: number): string {
  const isAstral = Number.isInteger(c) && c > 0xffff && c <= 0x10ffff
  return isAstral ? String.fromCodePoint(c) : String.fromCharCode(c)
}

/**
 * Hand-written scanner that turns shrimp source text into a flat token list.
 *
 * Cursor model: `next()` loads the code point at `pos` into `char` and then
 * advances `pos` past it, so `pos` always points just after the current
 * character. A `char` of 0 means end of input.
 */
export class Scanner {
  /** Source text being scanned. */
  input = ''
  /** UTF-16 offset just past the current character. */
  pos = 0
  /** Default `from` offset for the next pushed token. */
  start = 0
  /** Code point under the cursor; 0 at end of input. */
  char = 0
  /** Code point that was current before the last `next()`. */
  prev = 0
  /** Open `(` depth; Newline tokens are suppressed while it is non-zero. */
  inParen = 0
  /** Open `[` depth; Newline tokens are suppressed while it is non-zero. */
  inBracket = 0
  /**
   * Tokens produced so far. The same array instance is returned by
   * `tokenize()` and emptied in place by `reset()`.
   */
  tokens: Token[] = []

  /** Restore the scanner to its initial state so it can be reused. */
  reset() {
    this.input = ''
    this.pos = 0
    this.start = 0
    this.char = 0
    this.prev = 0
    // Nesting depth must not leak from a previous (possibly unbalanced) input,
    // otherwise newlines of the next input would be silently dropped.
    this.inParen = 0
    this.inBracket = 0
    this.tokens.length = 0
  }

  /** Code point `count` UTF-16 units after `pos`, without consuming anything. */
  peek(count = 0): number {
    return getFullCodePoint(this.input, this.pos + count)
  }

  /** Advance to the next code point and return it (0 at end of input). */
  next(): number {
    this.prev = this.char
    this.char = this.peek()
    this.pos += getCharSize(this.char)
    return this.char
  }

  /**
   * Append a token.
   *
   * @param type token kind
   * @param from start offset; defaults to `this.start`
   * @param to end offset; defaults to the offset of the current character
   *
   * Afterwards `start` is set to `pos`, i.e. one past the current character.
   */
  push(type: TokenType, from?: number, to?: number) {
    from ??= this.start
    to ??= this.pos - getCharSize(this.char)
    if (to < from) to = from

    const token: Token = { type, from, to }
    if (valueTokens.has(type)) token.value = this.input.slice(from, to)
    this.tokens.push(token)

    if (DEBUG) {
      console.log(`≫ PUSH(${from},${to})`, TokenType[type], '—', token.value)
    }

    this.start = this.pos
  }

  /**
   * Turn shrimp code into shrimp tokens that get fed into the parser.
   *
   * Each iteration dispatches on the current character; the order of the
   * checks matters (e.g. identifiers are tried before numbers and bare words).
   * Characters matching no rule (space, tab, `\r`) are skipped.
   */
  tokenize(input: string): Token[] {
    this.reset()
    this.input = input
    this.next()

    while (this.char > 0) {
      const char = this.char
      if (char === c`#`) {
        this.readComment()
        continue
      }

      if (isBracket(char)) {
        this.readBracket()
        continue
      }

      if (isStringDelim(char)) {
        this.readString(char)
        continue
      }

      if (char === c`{`) {
        this.readCurlyString()
        continue
      }

      if (isIdentStart(char)) {
        this.readIdentOrKeyword()
        continue
      }

      // a sign only starts a number when a digit follows immediately
      if (isDigit(char) || ((char === c`-` || char === c`+`) && isDigit(this.peek()))) {
        this.readNumber()
        continue
      }

      if (char === c`:`) {
        // push() leaves `start` one past the character that ended the previous
        // token, so `start - 1` is the colon's own offset when the colon
        // directly follows that token.
        this.push(TokenType.Colon, this.start - 1, this.pos)
        this.next()
        continue
      }

      if (isWordChar(char)) {
        this.readWord()
        continue
      }

      if (char === c`\n`) {
        // newlines inside (...) or [...] are not statement separators
        if (this.inParen === 0 && this.inBracket === 0)
          this.push(TokenType.Newline)
        this.next()
        continue
      }

      if (char === c`;`) {
        this.push(TokenType.Semicolon)
        this.next()
        continue
      }

      this.next()
    }

    return this.tokens
  }

  /** Read a `#` comment up to, but not including, the end of the line. */
  readComment() {
    // Anchor the token at the `#` itself; `start` may still point at skipped
    // whitespace (e.g. indentation) left over from the previous token.
    this.start = this.pos - getCharSize(this.char)
    while (this.char !== c`\n` && this.char > 0) this.next()
    this.push(TokenType.Comment)
  }

  /** Emit a paren/bracket token and keep the nesting depth up to date. */
  readBracket() {
    switch (this.char) {
      case c`(`:
        this.inParen++
        this.push(TokenType.OpenParen); break
      case c`)`:
        // never go negative: a stray `)` must not suppress later newlines
        if (this.inParen > 0) this.inParen--
        this.push(TokenType.CloseParen); break
      case c`[`:
        this.inBracket++
        this.push(TokenType.OpenBracket); break
      case c`]`:
        if (this.inBracket > 0) this.inBracket--
        this.push(TokenType.CloseBracket); break
    }
    this.next()
  }

  /**
   * Read a quoted string, including both delimiters.
   * A backslash escapes the following character, so `\\` followed by the
   * delimiter ends the string while `\` followed by the delimiter does not.
   *
   * @param delim code point of the opening quote
   */
  readString(delim: number) {
    this.start = this.pos - 1
    this.next() // skip opening delim

    let escaped = false
    while (this.char > 0 && (escaped || this.char !== delim)) {
      escaped = !escaped && this.char === c`\\`
      this.next()
    }

    // skip closing delim; on an unterminated string we are already at EOF and
    // must not advance further, or the token would extend past the input
    if (this.char === delim) this.next()

    this.push(TokenType.String)
  }

  /** Read a `{ ... }` string, honoring nested braces. */
  readCurlyString() {
    // Anchor the token at the opening `{` rather than at a stale `start`.
    this.start = this.pos - getCharSize(this.char)
    let depth = 1
    this.next()

    while (depth > 0 && this.char > 0) {
      if (this.char === c`{`) depth++
      if (this.char === c`}`) depth--
      this.next()
    }

    this.push(TokenType.String)
  }

  /**
   * Read a run of word characters that begins with an identifier-start
   * character and classify it as Null, Boolean, Keyword, Operator,
   * Identifier, NamedArgPrefix or Word (checked in that order).
   */
  readIdentOrKeyword() {
    this.start = this.pos - getCharSize(this.char)

    while (isWordChar(this.char)) {
      // stop at colon if followed by whitespace (e.g., 'do x: echo x end')
      if (this.char === c`:`) {
        const nextCh = this.peek()
        if (isWhitespace(nextCh) || nextCh === 0) break
      }

      // stop at equal sign (named arg); the `=` is part of the token
      if (this.char === c`=`) {
        this.next()
        break
      }

      this.next()
    }

    const ident = this.input.slice(this.start, this.pos - getCharSize(this.char))

    if (ident === 'null')
      this.push(TokenType.Null)

    else if (ident === 'true' || ident === 'false')
      this.push(TokenType.Boolean)

    else if (isKeyword(ident))
      this.push(TokenType.Keyword)

    else if (isOperator(ident))
      this.push(TokenType.Operator) // only things like `and` and `or`

    else if (isIdentifer(ident))
      this.push(TokenType.Identifier)

    else if (ident.endsWith('='))
      this.push(TokenType.NamedArgPrefix)

    else
      this.push(TokenType.Word)
  }

  /**
   * Read a run of word characters that begins with a digit or sign and emit
   * Number when the text is a valid numeric literal, Word otherwise.
   */
  readNumber() {
    this.start = this.pos - 1
    while (isWordChar(this.char)) {
      // stop at colon followed by whitespace or end of input
      if (this.char === c`:`) {
        const nextCh = this.peek()
        if (isWhitespace(nextCh) || nextCh === 0) break
      }
      this.next()
    }
    const ident = this.input.slice(this.start, this.pos - 1)
    this.push(isNumber(ident) ? TokenType.Number : TokenType.Word)
  }

  /** Read any other run of word characters: Underscore, Operator or Word. */
  readWord() {
    this.start = this.pos - getCharSize(this.char)

    while (isWordChar(this.char)) this.next()

    const word = this.input.slice(this.start, this.pos - getCharSize(this.char))

    if (word === '_')
      this.push(TokenType.Underscore)

    else if (operators.has(word))
      this.push(TokenType.Operator)

    else
      this.push(TokenType.Word)
  }
}

// Numeric literal shapes accepted by isNumber. Digit groups may be separated
// by single underscores. The underscore is mandatory inside each repeated
// group: with an optional separator (`\d+(_?\d+)*`) the same digits can be
// split in exponentially many ways, which makes the regex engine backtrack
// catastrophically on long inputs that end up not matching.
const NUMBER_PATTERNS: readonly RegExp[] = [
  // decimal: 42, 1_000, 3.14
  /^[+-]?\d+(_\d+)*(\.\d+(_\d+)*)?$/,
  // binary: 0b1010
  /^[+-]?0b[01]+(_[01]+)*(\.[01]+(_[01]+)*)?$/,
  // octal: 0o17
  /^[+-]?0o[0-7]+(_[0-7]+)*(\.[0-7]+(_[0-7]+)*)?$/,
  // hex (case-insensitive): 0xff, 0xDEAD_BEEF
  /^[+-]?0x[0-9a-f]+(_[0-9a-f]+)*(\.[0-9a-f]+(_[0-9a-f]+)*)?$/i,
]

/**
 * Whether `word` is, in its entirety, a numeric literal: an optionally signed
 * decimal, binary (`0b`), octal (`0o`) or hex (`0x`) number with an optional
 * fractional part. Underscores are allowed only between two digits.
 */
const isNumber = (word: string): boolean =>
  NUMBER_PATTERNS.some(pattern => pattern.test(word))

/**
 * Whether `s` is a valid identifier: a non-empty sequence of code points
 * whose first satisfies isIdentStart, whose last (if there is more than one)
 * satisfies isIdentEnd, and whose middle ones all satisfy isIdentChar.
 */
const isIdentifer = (s: string): boolean => {
  // decode the string into full code points (surrogate pairs count as one)
  const codePoints: number[] = []
  for (let i = 0; i < s.length;) {
    const cp = getFullCodePoint(s, i)
    codePoints.push(cp)
    i += getCharSize(cp)
  }

  const [first, ...rest] = codePoints
  if (first === undefined || !isIdentStart(first)) return false

  // `rest` becomes the middle section once the final code point is removed
  const last = rest.pop()
  if (last === undefined) return true

  return rest.every(cp => isIdentChar(cp)) && isIdentEnd(last)
}

/** Whether `ch` is a quote character that opens/closes a string (`'` or `"`). */
const isStringDelim = (ch: number): boolean => {
  switch (ch) {
    case c`'`:
    case c`"`:
      return true
    default:
      return false
  }
}

/**
 * Whether a character may begin an identifier: a lowercase ASCII letter, a
 * code point accepted by isEmojiOrUnicode, or `$`.
 *
 * @param char a code point, or a string whose first character is inspected.
 *   Strings are decoded with `codePointAt` so that a character outside the
 *   BMP (e.g. an emoji) is seen as one code point instead of a lone high
 *   surrogate. An empty string is never an identifier start.
 */
const isIdentStart = (char: number | string): boolean => {
  const ch = typeof char === 'string' ? (char.codePointAt(0) ?? 0) : char
  return isLowercaseLetter(ch) || isEmojiOrUnicode(ch) || ch === 36 /* $ */
}

/**
 * Whether a character may appear inside an identifier: anything accepted by
 * isIdentStart, an ASCII digit, `-` or `?`.
 *
 * @param char a code point, or a string whose first character is inspected.
 *   Strings are decoded with `codePointAt` so that a character outside the
 *   BMP is seen as one code point instead of a lone high surrogate.
 */
const isIdentChar = (char: number | string): boolean => {
  const ch = typeof char === 'string' ? (char.codePointAt(0) ?? 0) : char
  return isIdentStart(ch) || isDigit(ch) || ch === 45 /* - */ || ch === 63 /* ? */
}

/** Whether a character may end an identifier; currently the same rule as isIdentChar. */
const isIdentEnd = (char: number | string): boolean => isIdentChar(char)

/** Whether `ch` is an ASCII lowercase letter. */
const isLowercaseLetter = (ch: number): boolean =>
  ch >= 97 /* a */ && ch <= 122 /* z */

/** Whether `ch` is an ASCII decimal digit. */
const isDigit = (ch: number): boolean =>
  ch >= 48 /* 0 */ && ch <= 57 /* 9 */

/**
 * Whether `ch` is whitespace for the scanner's purposes. The end-of-input
 * markers (0 and -1) are treated as whitespace as well.
 */
const isWhitespace = (ch: number): boolean => {
  switch (ch) {
    case 32: // space
    case 9:  // tab
    case 13: // \r
    case 10: // \n
    case -1: // EOF
    case 0:  // EOF
      return true
    default:
      return false
  }
}

/**
 * Whether `ch` can be part of a bare word: anything that is not whitespace,
 * a statement terminator (`\n`, `;`), a closing `)` / `]`, or end of input.
 */
const isWordChar = (ch: number): boolean => {
  switch (ch) {
    case 10: // \n
    case 59: // ;
    case 41: // )
    case 93: // ]
    case -1: // EOF
      return false
    default:
      return !isWhitespace(ch)
  }
}

/** Whether `word` is one of the spellings listed in `operators`. */
const isOperator = (word: string): boolean => operators.has(word)

/** Whether `word` is one of the reserved words listed in `keywords`. */
const isKeyword = (word: string): boolean => keywords.has(word)

/** Whether `char` is one of the grouping characters `(`, `)`, `[` or `]`. */
const isBracket = (char: number): boolean => {
  switch (char) {
    case c`(`:
    case c`)`:
    case c`[`:
    case c`]`:
      return true
    default:
      return false
  }
}

/**
 * Number of UTF-16 code units occupied by code point `ch`: 2 for anything
 * above 0xFFFF (encoded as a surrogate pair, e.g. emoji), otherwise 1.
 */
const getCharSize = (ch: number) => {
  if (ch > 0xffff) return 2
  return 1
}

/**
 * Code point that starts at UTF-16 offset `pos` of `input`.
 *
 * A valid surrogate pair is combined into a single code point; a lone
 * surrogate is returned as-is. Delegates to the built-in
 * `String.prototype.codePointAt` instead of decoding surrogates by hand.
 *
 * @returns the code point, or 0 when `pos` is outside the string.
 */
const getFullCodePoint = (input: string, pos: number): number =>
  input.codePointAt(pos) ?? 0

// Inclusive [low, high] code point ranges accepted by isEmojiOrUnicode.
const UNICODE_IDENT_RANGES: ReadonlyArray<readonly [number, number]> = [
  [0x1f600, 0x1f64f], // Basic Emoticons
  [0x1f300, 0x1f5ff], // Miscellaneous Symbols and Pictographs
  [0x1f680, 0x1f6ff], // Transport and Map Symbols
  [0x1f1e6, 0x1f1ff], // Regional Indicator Symbols (flags)
  [0x2600, 0x26ff],   // Miscellaneous Symbols (hearts, stars, weather)
  [0x2700, 0x27bf],   // Dingbats (scissors, pencils, etc)
  [0x1f900, 0x1f9ff], // Supplemental Symbols and Pictographs (newer emojis)
  [0x1fa70, 0x1faff], // Symbols and Pictographs Extended-A (newest emojis)
  [0x1f018, 0x1f270], // Various Asian Characters with emoji presentation
  [0xfe00, 0xfe0f],   // Variation Selectors (for emoji presentation)
  [0x238c, 0x2454],   // Additional miscellaneous items
  [0x20d0, 0x20ff],   // Combining Diacritical Marks for Symbols
  [0x00a0, 0x00ff],   // Latin-1 Supplement (includes ², ³, ¹ and other special chars)
  [0x0370, 0x03ff],   // Greek and Coptic
  [0x1d400, 0x1d7ff], // Mathematical Alphanumeric Symbols
  [0x2200, 0x22ff],   // Mathematical Operators
  [0x2070, 0x209f],   // Superscripts and Subscripts
  [0x2190, 0x21ff],   // Arrows
  [0x3040, 0x309f],   // Hiragana
  [0x30a0, 0x30ff],   // Katakana
  [0x4e00, 0x9fff],   // CJK Unified Ideographs
]

/**
 * Whether code point `ch` falls in one of the emoji / symbol / non-ASCII
 * script ranges that are allowed in identifiers.
 */
const isEmojiOrUnicode = (ch: number): boolean => {
  for (const [low, high] of UNICODE_IDENT_RANGES) {
    if (ch >= low && ch <= high) return true
  }
  return false
}