// When truthy, every pushed token is logged to the console.
const DEBUG = process.env.DEBUG || false

/**
 * A single lexical token produced by {@link Scanner}.
 *
 * `from` / `to` are offsets into the scanned input string in UTF-16 code
 * units, with `to` exclusive (they are used directly with `String.slice`).
 * `value` is only populated for the token types listed in `valueTokens`.
 */
export type Token = {
  type: TokenType
  value?: string,
  from: number,
  to: number,
}

/** Every kind of token the scanner can emit. */
export enum TokenType {
  Comment,

  Keyword,
  Operator,

  Newline,
  Semicolon,
  Colon,
  Underscore,

  OpenParen,
  CloseParen,
  OpenBracket,
  CloseBracket,

  Identifier,
  Word,
  NamedArgPrefix,

  Null,
  Boolean,
  Number,
  String,
  Regex,
}

// Token types whose source text is copied into `Token.value` by `Scanner.push`.
const valueTokens = new Set([
  TokenType.Comment,

  TokenType.Keyword,
  TokenType.Operator,

  TokenType.Identifier,
  TokenType.Word,
  TokenType.NamedArgPrefix,

  TokenType.Boolean,
  TokenType.Number,
  TokenType.String,
  TokenType.Regex,

  TokenType.Underscore
])

// Words that are classified as `TokenType.Operator` by `readWordOrIdent`.
const operators = new Set([
  // assignment
  '=',

  // logic
  'or',
  'and',

  // bitwise
  'band',
  'bor',
  'bxor',
  '>>>',
  '>>',
  '<<',

  // compound assignment
  '??=',
  '+=',
  '-=',
  '*=',
  '/=',
  '%=',

  // nullish
  '??',

  // math
  '**',
  '*',
  '/',
  '+',
  '-',
  '%',

  // comparison
  '>=',
  '<=',
  '!=',
  '==',
  '>',
  '<',

  // property access
  '.',

  // pipe
  '|',
])

// Words that are classified as `TokenType.Keyword` by `readWordOrIdent`.
const keywords = new Set([
  'use',
  'end',
  'do',
  'if',
  'while',
  'else',
  'try',
  'catch',
  'finally',
  'throw',
])

// helper
// Tagged template that returns the UTF-16 code of the first character of the
// (cooked) template text, so c`#` === 35 and c`\n` === 10.
function c(strings: TemplateStringsArray, ...values: any[]) {
  return strings.reduce((result, str, i) => result + str + (values[i] ?? ""), "").charCodeAt(0)
}

// Inverse of `c`: turns a UTF-16 char code back into a one-unit string.
function s(c: number): string {
  return String.fromCharCode(c)
}

/**
 * Hand-written tokenizer.
 *
 * The scanner keeps a one-character lookbehind: `char` is the code point
 * currently being examined and `pos` already points *past* it, so the offset
 * of `char` in `input` is `pos - getCharSize(char)`. A `char` of 0 means
 * end of input.
 */
export class Scanner {
  input = ''
  pos = 0
  start = 0
  char = 0
  prev = 0
  inParen = 0
  inBracket = 0
  tokens: Token[] = []
  prevIsWhitespace = true

  /**
   * Restores the scanner to its initial state so it can be reused.
   *
   * The token array is emptied in place (not replaced), so an array returned
   * by an earlier `tokenize()` call is the same object and is cleared too.
   */
  reset() {
    this.input = ''
    this.pos = 0
    this.start = 0
    this.char = 0
    this.prev = 0
    // Nesting depth must be cleared as well: otherwise an unclosed `(` or `[`
    // in a previous input keeps suppressing Newline tokens in the next one.
    this.inParen = 0
    this.inBracket = 0
    this.tokens.length = 0
    this.prevIsWhitespace = true
  }

  /** Returns the code point `count` code units past `pos` (0 when out of range) without consuming it. */
  peek(count = 0): number {
    return getFullCodePoint(this.input, this.pos + count)
  }

  /**
   * Consumes one code point: shifts `char` into `prev`, loads the code point
   * at `pos` into `char` and advances `pos` past it. At end of input `char`
   * becomes 0 and `pos` still advances by one.
   */
  next(): number {
    this.prevIsWhitespace = isWhitespace(this.char)
    this.prev = this.char
    this.char = this.peek()
    this.pos += getCharSize(this.char)
    return this.char
  }

  /**
   * Appends a token of `type` covering `input.slice(from, to)`.
   *
   * @param type - kind of token to emit
   * @param from - start offset; defaults to `this.start`
   * @param to - exclusive end offset; defaults to the offset of the current
   *   (not yet consumed) `char`
   */
  push(type: TokenType, from?: number, to?: number) {
    from ??= this.start
    to ??= this.pos - getCharSize(this.char)
    if (to < from) to = from

    const token: Token = { type, from, to }
    // only value-carrying token types get their source text attached
    if (valueTokens.has(type)) token.value = this.input.slice(from, to)
    this.tokens.push(token)

    if (DEBUG) {
      console.log(`≫ PUSH(${from},${to})`, TokenType[token.type], '—', token.value)
    }

    this.start = this.pos
  }

  /** Emits a token for the current character; assumes it is one code unit wide. */
  pushChar(type: TokenType) {
    this.push(type, this.pos - 1, this.pos)
  }

  /**
   * turn shrimp code into shrimp tokens that get fed into the parser
   *
   * @param input - source text to scan
   * @returns the scanner's token array (reused across calls, see `reset`)
   */
  tokenize(input: string): Token[] {
    this.reset()
    this.input = input
    this.next()

    while (this.char > 0) {
      const char = this.char

      if (char === c`#`) {
        this.readComment()
        continue
      }

      if (isBracket(char)) {
        this.readBracket()
        continue
      }

      if (isStringDelim(char)) {
        this.readString(char)
        continue
      }

      if (char === c`{`) {
        this.readCurlyString()
        continue
      }

      if (isIdentStart(char)) {
        this.readWordOrIdent(true) // true = started with identifier char
        continue
      }

      if (isDigit(char) || ((char === c`-` || char === c`+`) && isDigit(this.peek()))) {
        this.readNumber()
        continue
      }

      if (char === c`:`) {
        this.pushChar(TokenType.Colon)
        this.next()
        continue
      }

      // whitespace-sensitive dot as operator (property access) only after identifier/number
      if (char === c`.`) {
        if (this.canBeDotGet(this.tokens.at(-1))) {
          this.pushChar(TokenType.Operator)
          this.next()
          continue
        }
        // otherwise fall through: the dot starts a Word below
      }

      if (char === c`/` && this.peek() === c`/`) {
        this.readRegex()
        continue
      }

      if (isWordChar(char)) {
        this.readWordOrIdent(false) // false = didn't start with identifier char
        continue
      }

      if (char === c`\n`) {
        // newlines inside (...) or [...] are not significant
        if (this.inParen === 0 && this.inBracket === 0) this.pushChar(TokenType.Newline)
        this.next()
        continue
      }

      if (char === c`;`) {
        this.pushChar(TokenType.Semicolon)
        this.next()
        continue
      }

      // anything else (e.g. spaces, tabs) is skipped
      this.next()
    }

    return this.tokens
  }

  /** Reads a `#` comment up to (not including) the end of the line. */
  readComment() {
    this.start = this.pos - 1
    while (this.char !== c`\n` && this.char > 0) this.next()
    this.push(TokenType.Comment)
  }

  /**
   * Emits the token for `(`, `)`, `[` or `]` and tracks nesting depth.
   *
   * NOTE(review): an unmatched closer drives the depth negative, which also
   * suppresses Newline tokens afterwards — confirm whether that is intended.
   */
  readBracket() {
    switch (this.char) {
      case c`(`:
        this.inParen++
        this.pushChar(TokenType.OpenParen)
        break

      case c`)`:
        this.inParen--
        this.pushChar(TokenType.CloseParen)
        break

      case c`[`:
        this.inBracket++
        this.pushChar(TokenType.OpenBracket)
        break

      case c`]`:
        this.inBracket--
        this.pushChar(TokenType.CloseBracket)
        break
    }

    this.next()
  }

  /**
   * Reads a quoted string, delimiters included, ending at the next `delim`
   * that is not directly preceded by a backslash (or at end of input).
   *
   * NOTE(review): any backslash before the delimiter counts as an escape, so
   * a string ending in an escaped backslash does not close at that quote —
   * confirm whether that is intended.
   */
  readString(delim: number) {
    this.start = this.pos - 1
    this.next() // skip opening delim
    while (this.char > 0 && (this.char !== delim || (this.char === delim && this.prev === c`\\`)))
      this.next()
    this.next() // skip closing delim

    this.push(TokenType.String)
  }

  /** Reads a `{ ... }` string, braces included, honouring nested braces. */
  readCurlyString() {
    this.start = this.pos - 1
    let depth = 1
    this.next()

    while (depth > 0 && this.char > 0) {
      if (this.char === c`{`) depth++
      if (this.char === c`}`) depth--
      this.next()
    }

    this.push(TokenType.String)
  }

  /**
   * Reads a run of word characters and classifies it as Underscore, Null,
   * Boolean, Keyword, Operator, Identifier, NamedArgPrefix or Word.
   *
   * @param startedWithIdentChar - true when the first character satisfied
   *   `isIdentStart`; only then may a `.` split the word for property access
   */
  readWordOrIdent(startedWithIdentChar: boolean) {
    this.start = this.pos - getCharSize(this.char)

    while (isWordChar(this.char)) {
      // stop at colon if followed by whitespace (e.g., 'do x: echo x end')
      if (this.char === c`:`) {
        const nextCh = this.peek()
        if (isWhitespace(nextCh) || nextCh === 0) break
      }

      // stop at equal sign (named arg) - but only if what we've read so far is an identifier
      if (this.char === c`=`) {
        const soFar = this.input.slice(this.start, this.pos - getCharSize(this.char))
        if (isIdentifer(soFar)) {
          this.next() // keep the '=' as part of the token
          break
        }
      }

      // stop at dot only if it would create a valid property access
      // AND only if we started with an identifier character (not for Words like README.txt)
      if (startedWithIdentChar && this.char === c`.`) {
        const nextCh = this.peek()
        if (isIdentStart(nextCh) || isDigit(nextCh) || nextCh === c`(`) {
          const soFar = this.input.slice(this.start, this.pos - getCharSize(this.char))
          if (isIdentifer(soFar)) break
        }
      }

      this.next()
    }

    const word = this.input.slice(this.start, this.pos - getCharSize(this.char))

    // classify the token based on what we read
    if (word === '_')
      this.push(TokenType.Underscore)
    else if (word === 'null')
      this.push(TokenType.Null)
    else if (word === 'true' || word === 'false')
      this.push(TokenType.Boolean)
    else if (isKeyword(word))
      this.push(TokenType.Keyword)
    else if (isOperator(word))
      this.push(TokenType.Operator)
    else if (isIdentifer(word))
      this.push(TokenType.Identifier)
    else if (word.endsWith('='))
      this.push(TokenType.NamedArgPrefix)
    else
      this.push(TokenType.Word)
  }

  /**
   * Reads a run of word characters that began with a digit or sign and emits
   * Number when it matches `isNumber`, otherwise Word.
   */
  readNumber() {
    this.start = this.pos - 1

    while (isWordChar(this.char)) {
      // stop at dot unless it's part of the number
      if (this.char === c`.`) {
        const nextCh = this.peek()
        if (!isDigit(nextCh)) break
      }

      // stop at colon
      if (this.char === c`:`) {
        const nextCh = this.peek()
        if (isWhitespace(nextCh) || nextCh === 0) break
      }

      this.next()
    }

    const ident = this.input.slice(this.start, this.pos - 1)
    this.push(isNumber(ident) ? TokenType.Number : TokenType.Word)
  }

  /**
   * Reads a `//pattern//flags` literal. Emits Regex when the text compiles
   * with `new RegExp`, otherwise Word.
   *
   * NOTE(review): when no closing `//` is found the loop runs to end of input
   * and no token is emitted — confirm whether that is intended.
   */
  readRegex() {
    this.start = this.pos - 1
    this.next() // skip 2nd /

    while (this.char > 0) {
      if (this.char === c`/` && this.peek() === c`/`) {
        this.next() // skip /
        this.next() // skip /

        // read regex flags
        while (this.char > 0 && isIdentStart(this.char))
          this.next()

        // validate regex
        const to = this.pos - getCharSize(this.char)
        const regexText = this.input.slice(this.start, to)
        const [_, pattern, flags] = regexText.match(/^\/\/(.*)\/\/([gimsuy]*)$/) || []

        if (pattern) {
          try {
            new RegExp(pattern, flags)
            this.push(TokenType.Regex)
            break
          } catch {
            // invalid regex - fall through to Word
          }
        }

        // invalid regex is treated as Word
        this.push(TokenType.Word)
        break
      }

      this.next()
    }
  }

  /**
   * Whether a `.` at the current position is a property-access operator:
   * it must directly follow (no whitespace) an Identifier, Number,
   * CloseParen or CloseBracket token.
   */
  canBeDotGet(lastToken?: Token): boolean {
    return !this.prevIsWhitespace && !!lastToken &&
      (lastToken.type === TokenType.Identifier ||
        lastToken.type === TokenType.Number ||
        lastToken.type === TokenType.CloseParen ||
        lastToken.type === TokenType.CloseBracket)
  }
}

// Whether `word` is a decimal, binary (0b), octal (0o) or hex (0x) literal,
// with optional sign, `_` separators and fractional part.
const isNumber = (word: string): boolean => {
  // regular number
  if (/^[+-]?\d+(_?\d+)*(\.(\d+(_?\d+)*))?$/.test(word))
    return true

  // binary
  if (/^[+-]?0b[01]+(_?[01]+)*(\.[01](_?[01]*))?$/.test(word))
    return true

  // octal
  if (/^[+-]?0o[0-7]+(_?[0-7]+)*(\.[0-7](_?[0-7]*))?$/.test(word))
    return true

  // hex
  if (/^[+-]?0x[0-9a-f]+([0-9a-f]_?[0-9a-f]+)*(\.([0-9a-f]_?[0-9a-f]*))?$/i.test(word))
    return true

  return false
}

// Whether `s` is a valid identifier: non-empty, first code point satisfies
// isIdentStart, middle ones isIdentChar, last one isIdentEnd.
const isIdentifer = (s: string): boolean => {
  if (s.length === 0) return false

  // decode into full code points so surrogate pairs count as one character
  let pos = 0
  const chars: number[] = []
  while (pos < s.length) {
    const out = getFullCodePoint(s, pos)
    pos += getCharSize(out)
    chars.push(out)
  }

  if (chars.length === 1)
    return isIdentStart(chars[0]!)
  else if (chars.length === 2)
    return isIdentStart(chars[0]!) && isIdentEnd(chars[1]!)
  else
    return isIdentStart(chars[0]!) &&
      chars.slice(1, chars.length - 1).every(isIdentChar) &&
      isIdentEnd(chars.at(-1)!)
}

// Single or double quote.
const isStringDelim = (ch: number): boolean => {
  return ch === c`'` || ch === c`"`
}

// Characters that may begin an identifier: a-z, `$`, or anything accepted by
// isEmojiOrUnicode. A string argument is judged by its first UTF-16 code unit.
const isIdentStart = (char: number | string): boolean => {
  const ch = typeof char === 'string' ? char.charCodeAt(0) : char
  return isLowercaseLetter(ch) || isEmojiOrUnicode(ch) || ch === 36 /* $ */
}

// Characters allowed after the first one: identifier starts, digits, `-`, `?`.
const isIdentChar = (char: number | string): boolean => {
  const ch = typeof char === 'string' ? char.charCodeAt(0) : char
  return isIdentStart(ch) || isDigit(ch) || ch === 45 /* - */ || ch === 63 /* ? */
}

// Characters allowed in final position; currently the same set as isIdentChar.
const isIdentEnd = (char: number | string): boolean => {
  return isIdentChar(char)
}

const isLowercaseLetter = (ch: number): boolean => {
  return ch >= 97 && ch <= 122 // a-z
}

const isDigit = (ch: number): boolean => {
  return ch >= 48 && ch <= 57 // 0-9
}

// Space, tab, CR, LF — and the end-of-input sentinels.
const isWhitespace = (ch: number): boolean => {
  return ch === 32 /* space */ || ch === 9 /* tab */ ||
    ch === 13 /* \r */ || ch === 10 /* \n */ ||
    ch === -1 || ch === 0 /* EOF */
}

// Anything that can continue a Word: not whitespace, `;`, `)`, `]` or EOF.
const isWordChar = (ch: number): boolean => {
  return (
    !isWhitespace(ch) &&
    ch !== 10 /* \n */ &&
    ch !== 59 /* ; */ &&
    ch !== 41 /* ) */ &&
    ch !== 93 /* ] */ &&
    ch !== -1 /* EOF */
  )
}

const isOperator = (word: string): boolean => {
  return operators.has(word)
}

const isKeyword = (word: string): boolean => {
  return keywords.has(word)
}

const isBracket = (char: number): boolean => {
  return char === c`(` || char === c`)` || char === c`[` || char === c`]`
}

// Number of UTF-16 code units occupied by code point `ch`.
const getCharSize = (ch: number) =>
  (ch > 0xffff ? 2 : 1) // emoji takes 2 UTF-16 code units

// Code point starting at `pos`: a valid surrogate pair is combined into one
// code point, a lone surrogate is returned as-is, out of range yields 0.
const getFullCodePoint = (input: string, pos: number): number => {
  return input.codePointAt(pos) ?? 0
}

// Non-ASCII code points that are accepted as identifier characters.
const isEmojiOrUnicode = (ch: number): boolean => {
  return (
    // Basic Emoticons
    (ch >= 0x1f600 && ch <= 0x1f64f) ||
    // Miscellaneous Symbols and Pictographs
    (ch >= 0x1f300 && ch <= 0x1f5ff) ||
    // Transport and Map Symbols
    (ch >= 0x1f680 && ch <= 0x1f6ff) ||
    // Regional Indicator Symbols (flags)
    (ch >= 0x1f1e6 && ch <= 0x1f1ff) ||
    // Miscellaneous Symbols (hearts, stars, weather)
    (ch >= 0x2600 && ch <= 0x26ff) ||
    // Dingbats (scissors, pencils, etc)
    (ch >= 0x2700 && ch <= 0x27bf) ||
    // Supplemental Symbols and Pictographs (newer emojis)
    (ch >= 0x1f900 && ch <= 0x1f9ff) ||
    // Symbols and Pictographs Extended-A (newest emojis)
    (ch >= 0x1fa70 && ch <= 0x1faff) ||
    // Various Asian Characters with emoji presentation
    (ch >= 0x1f018 && ch <= 0x1f270) ||
    // Variation Selectors (for emoji presentation)
    (ch >= 0xfe00 && ch <= 0xfe0f) ||
    // Additional miscellaneous items
    (ch >= 0x238c && ch <= 0x2454) ||
    // Combining Diacritical Marks for Symbols
    (ch >= 0x20d0 && ch <= 0x20ff) ||
    // Latin-1 Supplement (includes ², ³, ¹ and other special chars)
    (ch >= 0x00a0 && ch <= 0x00ff) ||
    // Greek and Coptic (U+0370-U+03FF)
    (ch >= 0x0370 && ch <= 0x03ff) ||
    // Mathematical Alphanumeric Symbols (U+1D400-U+1D7FF)
    (ch >= 0x1d400 && ch <= 0x1d7ff) ||
    // Mathematical Operators (U+2200-U+22FF)
    (ch >= 0x2200 && ch <= 0x22ff) ||
    // Superscripts and Subscripts (U+2070-U+209F)
    (ch >= 0x2070 && ch <= 0x209f) ||
    // Arrows (U+2190-U+21FF)
    (ch >= 0x2190 && ch <= 0x21ff) ||
    // Hiragana (U+3040-U+309F)
    (ch >= 0x3040 && ch <= 0x309f) ||
    // Katakana (U+30A0-U+30FF)
    (ch >= 0x30a0 && ch <= 0x30ff) ||
    // CJK Unified Ideographs (U+4E00-U+9FFF)
    (ch >= 0x4e00 && ch <= 0x9fff)
  )
}