// 509 lines · 11 KiB · TypeScript
/**
 * Debug switch, read once at module load from the `DEBUG` environment variable.
 * Any non-empty value enables the token logging in `Scanner.push`; otherwise `false`.
 */
const DEBUG: string | false = process.env.DEBUG || false
|
|
|
|
/**
 * A single lexical token produced by `Scanner`.
 *
 * `from` and `to` are offsets into the scanned input string, counted in
 * UTF-16 code units; `value` is `input.slice(from, to)` when present.
 */
export type Token = {
  /** Category of the token. */
  type: TokenType
  /** Source text of the token; only set for the token types listed in `valueTokens`. */
  value?: string
  /** Start offset of the token in the input. */
  from: number
  /** End offset of the token in the input (exclusive when slicing). */
  to: number
}
|
|
|
|
/**
 * Kinds of token the scanner can emit.
 *
 * This is a numeric enum: member order defines the numeric values, and
 * `TokenType[n]` is used for debug output, so members must not be reordered.
 */
export enum TokenType {
  /** `#` up to (not including) the end of the line. */
  Comment,

  /** A word found in the `keywords` set. */
  Keyword,
  /** A word found in the `operators` set. */
  Operator,

  /** Line break outside of parentheses and brackets. */
  Newline,
  /** `;` */
  Semicolon,
  /** `:` */
  Colon,
  /** A word that is exactly `_`. */
  Underscore,

  /** `(` */
  OpenParen,
  /** `)` */
  CloseParen,
  /** `[` */
  OpenBracket,
  /** `]` */
  CloseBracket,

  /** A word that satisfies `isIdentifer`. */
  Identifier,
  /** Any other run of word characters. */
  Word,
  /** An identifier-like run ending in `=`, e.g. `name=`. */
  NamedArgPrefix,

  /** The literal `null`. */
  Null,
  /** The literals `true` and `false`. */
  Boolean,
  /** A word accepted by `isNumber`. */
  Number,
  /** A quoted string or a `{ ... }` curly string. */
  String,
}
|
|
|
|
/** Token types whose source text is copied onto `Token.value` by `Scanner.push`. */
const valueTokens = new Set<TokenType>([
  TokenType.Comment,
  TokenType.Keyword,
  TokenType.Operator,
  TokenType.Identifier,
  TokenType.Word,
  TokenType.NamedArgPrefix,
  TokenType.Boolean,
  TokenType.Number,
  TokenType.String,
])
|
|
|
|
/** Every word the scanner classifies as `TokenType.Operator`. */
const operators = new Set<string>([
  // assignment
  '=',

  // logic
  'or', 'and',

  // bitwise
  'band', 'bor', 'bxor', '>>>', '>>', '<<',

  // compound assignment
  '??=', '+=', '-=', '*=', '/=', '%=',

  // nullish
  '??',

  // math
  '**', '*', '/', '+', '-', '%',

  // comparison
  '>=', '<=', '!=', '==', '>', '<',
])
|
|
|
|
/** Reserved words the scanner classifies as `TokenType.Keyword`. */
const keywords = new Set<string>([
  'import',
  'end',
  'do',
  'if',
  'while',
  'else',
  'try',
  'catch',
  'finally',
  'throw',
])
|
|
|
|
// helper
|
|
/**
 * Tagged-template helper that yields the UTF-16 code of the first character
 * of the template, e.g. c`#` is 35 and c`\n` is 10 (cooked escapes are used).
 *
 * @param strings literal parts of the template
 * @param values interpolated values; `null`/`undefined` are treated as empty
 * @returns the char code of the first character, or `NaN` for an empty template
 */
function c(strings: TemplateStringsArray, ...values: unknown[]): number {
  // Fast path: a plain literal such as c`#` has one string part and no values.
  // The scanner evaluates these on every character, so skip the concatenation.
  if (values.length === 0) return (strings[0] ?? '').charCodeAt(0)

  let text = ''
  strings.forEach((part, i) => {
    text += part + String(values[i] ?? '')
  })
  return text.charCodeAt(0)
}
|
|
|
|
/**
 * Inverse of the scanner's character reading: turns a code point back into a string.
 *
 * The scanner works with full code points (surrogate pairs are combined), so
 * values above 0xFFFF must be encoded with `String.fromCodePoint`;
 * `String.fromCharCode` would truncate them to 16 bits.
 *
 * @param c a code point; values that are not valid code points (negative,
 *          fractional, above 0x10FFFF) fall back to `String.fromCharCode`
 *          so this never throws
 */
function s(c: number): string {
  const isCodePoint = Number.isInteger(c) && c >= 0 && c <= 0x10ffff
  return isCodePoint ? String.fromCodePoint(c) : String.fromCharCode(c)
}
|
|
|
|
/**
 * Hand-written scanner that turns shrimp source text into a flat `Token[]`.
 *
 * Cursor model: `char` is the code point currently being examined and `pos`
 * is the offset just past it (offsets are UTF-16 code units). `start` is the
 * offset where the token being built begins.
 */
export class Scanner {
  /** Text being scanned. */
  input = ''
  /** Offset just past `char`. */
  pos = 0
  /** Offset where the current token starts. */
  start = 0
  /** Current code point; 0 once the input is exhausted. */
  char = 0
  /** Code point that was current before the last `next()`. */
  prev = 0
  /** Nesting depth of `(`; newlines are not emitted while it is non-zero. */
  inParen = 0
  /** Nesting depth of `[`; newlines are not emitted while it is non-zero. */
  inBracket = 0
  /** Tokens produced so far by the current `tokenize` call. */
  tokens: Token[] = []

  /** Restores the scanner to its initial state so it can be reused. */
  reset() {
    this.input = ''
    this.pos = 0
    this.start = 0
    this.char = 0
    this.prev = 0
    // Nesting depth must not leak from one input to the next: an unbalanced
    // `(` or `[` would otherwise suppress every Newline of later inputs.
    this.inParen = 0
    this.inBracket = 0
    // Use a fresh array so a result returned by an earlier tokenize() call
    // is not emptied behind the caller's back.
    this.tokens = []
  }

  /** Returns the code point `count` code units past `pos` without consuming it (0 past the end). */
  peek(count = 0): number {
    return getFullCodePoint(this.input, this.pos + count)
  }

  /** Consumes one code point: it becomes `char`, and `pos` moves past it. */
  next(): number {
    this.prev = this.char
    this.char = this.peek()
    this.pos += getCharSize(this.char)
    return this.char
  }

  /**
   * Appends a token and moves `start` to `pos`.
   *
   * @param type token type
   * @param from start offset; defaults to `start`
   * @param to end offset; defaults to the offset of the current character,
   *           i.e. the token ends just before `char`
   */
  push(type: TokenType, from?: number, to?: number) {
    from ??= this.start
    to ??= this.pos - getCharSize(this.char)
    if (to < from) to = from

    const token: Token = { type, from, to }
    if (valueTokens.has(type)) token.value = this.input.slice(from, to)
    this.tokens.push(token)

    if (DEBUG) {
      console.log(`≫ PUSH(${from},${to})`, TokenType[token.type], '—', token.value)
    }

    this.start = this.pos
  }

  // turn shrimp code into shrimp tokens that get fed into the parser
  tokenize(input: string): Token[] {
    this.reset()
    this.input = input
    this.next()

    // The order of the checks below matters: earlier branches win.
    while (this.char > 0) {
      const char = this.char

      if (char === c`#`) {
        this.readComment()
        continue
      }

      if (isBracket(char)) {
        this.readBracket()
        continue
      }

      if (isStringDelim(char)) {
        this.readString(char)
        continue
      }

      if (char === c`{`) {
        this.readCurlyString()
        continue
      }

      if (isIdentStart(char)) {
        this.readIdentOrKeyword()
        continue
      }

      // a digit, or a sign immediately followed by a digit
      if (isDigit(char) || ((char === c`-` || char === c`+`) && isDigit(this.peek()))) {
        this.readNumber()
        continue
      }

      if (char === c`:`) {
        // The colon is the current character, so it spans [pos - 1, pos).
        // (`start` cannot be used here: it is only adjacent to the colon
        // when the colon directly follows a token that was just pushed.)
        this.push(TokenType.Colon, this.pos - 1, this.pos)
        this.next()
        continue
      }

      if (isWordChar(char)) {
        this.readWord()
        continue
      }

      if (char === c`\n`) {
        // line breaks inside (...) or [...] are not significant
        if (this.inParen === 0 && this.inBracket === 0) this.push(TokenType.Newline)
        this.next()
        continue
      }

      if (char === c`;`) {
        this.push(TokenType.Semicolon)
        this.next()
        continue
      }

      // anything else (e.g. spaces, tabs) is skipped
      this.next()
    }

    return this.tokens
  }

  /** Reads a `#` comment up to, but not including, the end of the line. */
  readComment() {
    // Anchor the token at the `#`; `start` may still point before skipped whitespace.
    this.start = this.pos - 1
    while (this.char !== c`\n` && this.char > 0) this.next()
    this.push(TokenType.Comment)
  }

  /** Emits a paren/bracket token for the current character and tracks nesting depth. */
  readBracket() {
    switch (this.char) {
      case c`(`:
        this.inParen++
        this.push(TokenType.OpenParen)
        break
      case c`)`:
        this.inParen--
        this.push(TokenType.CloseParen)
        break
      case c`[`:
        this.inBracket++
        this.push(TokenType.OpenBracket)
        break
      case c`]`:
        this.inBracket--
        this.push(TokenType.CloseBracket)
        break
    }
    this.next()
  }

  /**
   * Reads a quoted string, including both delimiters, into a String token.
   *
   * @param delim code of the opening quote character
   */
  readString(delim: number) {
    this.start = this.pos - 1
    this.next() // skip opening delim

    // Track escapes explicitly so that `\\` followed by the delimiter closes
    // the string: the backslash there is itself escaped, not escaping the quote.
    let escaped = false
    while (this.char > 0 && (escaped || this.char !== delim)) {
      escaped = !escaped && this.char === c`\\`
      this.next()
    }

    // skip closing delim; an unterminated string ends at EOF and must not advance past it
    if (this.char === delim) this.next()

    this.push(TokenType.String)
  }

  /** Reads a `{ ... }` string, honouring nested braces, into a String token. */
  readCurlyString() {
    // Anchor the token at the opening brace; `start` may be stale here
    // (e.g. after skipped whitespace, or one past the brace after `name=`).
    this.start = this.pos - 1
    let depth = 1
    this.next()

    while (depth > 0 && this.char > 0) {
      if (this.char === c`{`) depth++
      if (this.char === c`}`) depth--
      this.next()
    }

    this.push(TokenType.String)
  }

  /** Reads a run of word characters starting with an identifier character and classifies it. */
  readIdentOrKeyword() {
    this.start = this.pos - getCharSize(this.char)

    while (isWordChar(this.char)) {
      // stop at colon if followed by whitespace (e.g., 'do x: echo x end')
      if (this.char === c`:`) {
        const nextCh = this.peek()
        if (isWhitespace(nextCh) || nextCh === 0) break
      }

      // stop at equal sign (named arg); the `=` is part of the token
      if (this.char === c`=`) {
        this.next()
        break
      }

      this.next()
    }

    const ident = this.input.slice(this.start, this.pos - getCharSize(this.char))

    if (ident === 'null') this.push(TokenType.Null)
    else if (ident === 'true' || ident === 'false') this.push(TokenType.Boolean)
    else if (isKeyword(ident)) this.push(TokenType.Keyword)
    else if (isOperator(ident)) this.push(TokenType.Operator) // only things like `and` and `or`
    else if (isIdentifer(ident)) this.push(TokenType.Identifier)
    else if (ident.endsWith('=')) this.push(TokenType.NamedArgPrefix)
    else this.push(TokenType.Word)
  }

  /** Reads a run of word characters starting with a digit or sign; Number if `isNumber` accepts it, else Word. */
  readNumber() {
    this.start = this.pos - 1
    while (isWordChar(this.char)) {
      // stop at colon if followed by whitespace
      if (this.char === c`:`) {
        const nextCh = this.peek()
        if (isWhitespace(nextCh) || nextCh === 0) break
      }
      this.next()
    }
    const ident = this.input.slice(this.start, this.pos - 1)
    this.push(isNumber(ident) ? TokenType.Number : TokenType.Word)
  }

  /** Reads any other run of word characters: `_`, a symbolic operator, or a plain Word. */
  readWord() {
    this.start = this.pos - getCharSize(this.char)

    while (isWordChar(this.char)) this.next()

    const word = this.input.slice(this.start, this.pos - getCharSize(this.char))

    if (word === '_') this.push(TokenType.Underscore)
    else if (operators.has(word)) this.push(TokenType.Operator)
    else this.push(TokenType.Word)
  }
}
|
|
|
|
/**
 * Reports whether `word` is a numeric literal: decimal, binary (`0b`),
 * octal (`0o`) or hex (`0x`), with an optional sign, an optional fractional
 * part, and single underscores allowed between digits.
 *
 * The digit groups are written as `D+(_D+)*` rather than `D+(_?D+)*`: the
 * optional underscore makes the nested quantifiers ambiguous, which causes
 * exponential backtracking on inputs such as a long digit run followed by a
 * letter. Both forms accept the same integer digit sequences.
 */
const isNumber = (word: string): boolean => {
  // regular number
  if (/^[+-]?\d+(?:_\d+)*(?:\.\d+(?:_\d+)*)?$/.test(word)) return true

  // binary
  if (/^[+-]?0b[01]+(?:_[01]+)*(?:\.[01]+(?:_[01]+)*)?$/.test(word)) return true

  // octal
  if (/^[+-]?0o[0-7]+(?:_[0-7]+)*(?:\.[0-7]+(?:_[0-7]+)*)?$/.test(word)) return true

  // hex (case-insensitive)
  return /^[+-]?0x[0-9a-f]+(?:_[0-9a-f]+)*(?:\.[0-9a-f]+(?:_[0-9a-f]+)*)?$/i.test(word)
}
|
|
|
|
/**
 * Reports whether `s` is a valid identifier: the first code point must
 * satisfy `isIdentStart`, interior code points `isIdentChar`, and the last
 * code point `isIdentEnd`. The empty string is not an identifier.
 */
const isIdentifer = (s: string): boolean => {
  if (s.length === 0) return false

  // Split into full code points so a surrogate pair counts as one character.
  const chars: number[] = []
  for (let pos = 0; pos < s.length; ) {
    const codePoint = getFullCodePoint(s, pos)
    pos += getCharSize(codePoint)
    chars.push(codePoint)
  }

  const first = chars[0]
  const last = chars[chars.length - 1]
  if (first === undefined || last === undefined) return false

  if (!isIdentStart(first)) return false
  if (chars.length === 1) return true

  // For two characters the interior slice is empty and `every` is trivially true.
  return chars.slice(1, -1).every((ch) => isIdentChar(ch)) && isIdentEnd(last)
}
|
|
|
|
/** True for the two quote characters that open and close a string: `'` and `"`. */
const isStringDelim = (ch: number): boolean => {
  // Plain constants: this runs for every scanned character.
  return ch === 39 /* ' */ || ch === 34 /* " */
}
|
|
|
|
/**
 * True when `char` may begin an identifier: a lowercase ASCII letter, `$`,
 * or a code point accepted by `isEmojiOrUnicode`.
 *
 * @param char a code point, or a string whose first code point is tested
 */
const isIdentStart = (char: number | string): boolean => {
  // codePointAt (not charCodeAt) so that a string starting with a surrogate
  // pair is tested as one code point; an empty string yields NaN, which
  // matches nothing.
  const ch = typeof char === 'string' ? (char.codePointAt(0) ?? NaN) : char
  return isLowercaseLetter(ch) || isEmojiOrUnicode(ch) || ch === 36 /* $ */
}
|
|
|
|
/**
 * True when `char` may appear inside an identifier: anything `isIdentStart`
 * accepts, plus ASCII digits, `-` and `?`.
 *
 * @param char a code point, or a string whose first code point is tested
 */
const isIdentChar = (char: number | string): boolean => {
  // codePointAt (not charCodeAt) so that a string starting with a surrogate
  // pair is tested as one code point; an empty string yields NaN.
  const ch = typeof char === 'string' ? (char.codePointAt(0) ?? NaN) : char
  return isIdentStart(ch) || isDigit(ch) || ch === 45 /* - */ || ch === 63 /* ? */
}
|
|
|
|
/** True when `char` may end an identifier; currently the same rule as `isIdentChar`. */
const isIdentEnd = (char: number | string): boolean => isIdentChar(char)
|
|
|
|
/** True for the ASCII lowercase letters `a` (97) through `z` (122). */
const isLowercaseLetter = (ch: number): boolean => 97 <= ch && ch <= 122
|
|
|
|
/** True for the ASCII digits `0` (48) through `9` (57). */
const isDigit = (ch: number): boolean => 48 <= ch && ch <= 57
|
|
|
|
/**
 * True for space, tab, carriage return and line feed, and also for the
 * end-of-input markers 0 and -1.
 */
const isWhitespace = (ch: number): boolean => {
  switch (ch) {
    case 32: // space
    case 9: // tab
    case 13: // \r
    case 10: // \n
    case -1: // EOF
    case 0: // EOF
      return true
    default:
      return false
  }
}
|
|
|
|
/**
 * True when `ch` can be part of a word: anything except whitespace,
 * end of input, `;`, `)` and `]`.
 *
 * Line feed and the EOF markers are rejected by `isWhitespace`, so they
 * need no separate checks here.
 */
const isWordChar = (ch: number): boolean => {
  if (isWhitespace(ch)) return false
  return ch !== 59 /* ; */ && ch !== 41 /* ) */ && ch !== 93 /* ] */
}
|
|
|
|
/** True when `word` is a member of the `operators` set. */
const isOperator = (word: string): boolean => operators.has(word)
|
|
|
|
/** True when `word` is a member of the `keywords` set. */
const isKeyword = (word: string): boolean => keywords.has(word)
|
|
|
|
/** True for the four bracket characters handled by `Scanner.readBracket`: `(`, `)`, `[`, `]`. */
const isBracket = (char: number): boolean => {
  // Plain constants: this runs for every scanned character.
  return char === 40 /* ( */ || char === 41 /* ) */ || char === 91 /* [ */ || char === 93 /* ] */
}
|
|
|
|
/**
 * Number of UTF-16 code units a code point occupies: 2 for code points
 * above 0xFFFF (e.g. emoji, stored as a surrogate pair), otherwise 1.
 */
const getCharSize = (ch: number) => {
  if (ch > 0xffff) return 2
  return 1
}
|
|
|
|
/**
 * Returns the Unicode code point that starts at offset `pos` of `input`.
 *
 * A high surrogate followed by a low surrogate is combined into a single
 * code point; an unpaired surrogate is returned as-is. Returns 0 when `pos`
 * is outside the string (callers treat 0 as end of input).
 *
 * @param input string to read from
 * @param pos offset in UTF-16 code units (expected to be an integer)
 */
const getFullCodePoint = (input: string, pos: number): number => {
  // codePointAt performs the surrogate-pair decoding and yields undefined
  // for out-of-range offsets.
  return input.codePointAt(pos) ?? 0
}
|
|
|
|
/**
 * Inclusive code point ranges accepted by `isEmojiOrUnicode`.
 * Some ranges overlap; that is harmless because any match is sufficient.
 */
const emojiOrUnicodeRanges: ReadonlyArray<readonly [number, number]> = [
  [0x1f600, 0x1f64f], // Basic Emoticons
  [0x1f300, 0x1f5ff], // Miscellaneous Symbols and Pictographs
  [0x1f680, 0x1f6ff], // Transport and Map Symbols
  [0x1f1e6, 0x1f1ff], // Regional Indicator Symbols (flags)
  [0x2600, 0x26ff], // Miscellaneous Symbols (hearts, stars, weather)
  [0x2700, 0x27bf], // Dingbats (scissors, pencils, etc)
  [0x1f900, 0x1f9ff], // Supplemental Symbols and Pictographs (newer emojis)
  [0x1fa70, 0x1faff], // Symbols and Pictographs Extended-A (newest emojis)
  [0x1f018, 0x1f270], // Various Asian Characters with emoji presentation
  [0xfe00, 0xfe0f], // Variation Selectors (for emoji presentation)
  [0x238c, 0x2454], // Additional miscellaneous items
  [0x20d0, 0x20ff], // Combining Diacritical Marks for Symbols
  [0x00a0, 0x00ff], // Latin-1 Supplement (includes ², ³, ¹ and other special chars)
  [0x0370, 0x03ff], // Greek and Coptic
  [0x1d400, 0x1d7ff], // Mathematical Alphanumeric Symbols
  [0x2200, 0x22ff], // Mathematical Operators
  [0x2070, 0x209f], // Superscripts and Subscripts
  [0x2190, 0x21ff], // Arrows
  [0x3040, 0x309f], // Hiragana
  [0x30a0, 0x30ff], // Katakana
  [0x4e00, 0x9fff], // CJK Unified Ideographs
]

/**
 * True when the code point `ch` falls in one of the non-ASCII ranges that
 * are allowed in identifiers (emoji, symbols, Greek, kana, CJK, ...).
 */
const isEmojiOrUnicode = (ch: number): boolean => {
  // Every range starts at U+00A0 or above, so ASCII input exits immediately.
  if (ch < 0x00a0) return false
  return emojiOrUnicodeRanges.some(([lo, hi]) => ch >= lo && ch <= hi)
}
|