diff --git a/bun.lock b/bun.lock index 1767760..2732c07 100644 --- a/bun.lock +++ b/bun.lock @@ -16,6 +16,8 @@ "@lezer/highlight": "^1.2.1", "@lezer/lr": "^1.4.2", "@types/bun": "latest", + "diff": "^8.0.2", + "kleur": "^4.1.5", }, }, }, @@ -60,8 +62,12 @@ "csstype": ["csstype@3.1.3", "", {}, "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw=="], + "diff": ["diff@8.0.2", "", {}, "sha512-sSuxWU5j5SR9QQji/o2qMvqRNYRDOcBTgsJ/DeCf4iSN4gW+gNMXM7wFIP+fdXZxoNiAnHUTGjCr+TSWXdRDKg=="], + "hono": ["hono@4.10.4", "", {}, "sha512-YG/fo7zlU3KwrBL5vDpWKisLYiM+nVstBQqfr7gCPbSYURnNEP9BDxEMz8KfsDR9JX0lJWDRNc6nXX31v7ZEyg=="], + "kleur": ["kleur@4.1.5", "", {}, "sha512-o+NO+8WrRiQEE4/7nwRJhN1HWpVmJm511pBHUxPLtp0BUISzlBplORYSmTclCnJvQq2tKu/sgl3xVpkc7ZWuQQ=="], + "reefvm": ["reefvm@git+https://git.nose.space/defunkt/reefvm#3e2e68b31f504347225a4d705c7568a0957d629e", { "peerDependencies": { "typescript": "^5" } }, "3e2e68b31f504347225a4d705c7568a0957d629e"], "style-mod": ["style-mod@4.1.3", "", {}, "sha512-i/n8VsZydrugj3Iuzll8+x/00GH2vnYsk1eomD8QiRrSAeW6ItbCQDtfXCeJHd0iwiNagqjQkvpvREEPtW3IoQ=="], diff --git a/package.json b/package.json index f167e90..1c67e2d 100644 --- a/package.json +++ b/package.json @@ -24,7 +24,9 @@ "devDependencies": { "@lezer/highlight": "^1.2.1", "@lezer/lr": "^1.4.2", - "@types/bun": "latest" + "@types/bun": "latest", + "diff": "^8.0.2", + "kleur": "^4.1.5" }, "prettier": { "semi": false, diff --git a/src/parser/tests/tokens.test.ts b/src/parser/tests/tokens.test.ts new file mode 100644 index 0000000..5f5e3bf --- /dev/null +++ b/src/parser/tests/tokens.test.ts @@ -0,0 +1,593 @@ +import { expect, describe, test } from 'bun:test' + +describe('constant types', () => { + test('null', () => { + expect(`null`).toBeToken('Null') + }) + + test('boolean', () => { + expect(`true`).toMatchToken('Boolean', 'true') + expect(`false`).toMatchToken('Boolean', 'false') + }) +}) + +describe('numbers', () => { + test('non-numbers', () 
=> { + expect(`1st`).toMatchToken('Word', '1st') + expect(`1_`).toMatchToken('Word', '1_') + expect(`100.`).toMatchToken('Word', '100.') + }) + + test('simple numbers', () => { + expect(`1`).toMatchToken('Number', '1') + expect(`200`).toMatchToken('Number', '200') + expect(`5.20`).toMatchToken('Number', '5.20') + expect(`0.20`).toMatchToken('Number', '0.20') + expect(`-20`).toMatchToken('Number', '-20') + expect(`+20`).toMatchToken('Number', '+20') + expect(`-2134.34`).toMatchToken('Number', '-2134.34') + expect(`+20.5325`).toMatchToken('Number', '+20.5325') + expect(`1_000`).toMatchToken('Number', '1_000') + expect(`53_232_220`).toMatchToken('Number', '53_232_220') + }) + + test('binary numbers', () => { + expect('0b110').toMatchToken('Number', '0b110') + }) + + test('hex numbers', () => { + expect('0xdeadbeef').toMatchToken('Number', '0xdeadbeef') + expect('0x02d3f4').toMatchToken('Number', '0x02d3f4') + }) + + test('hex numbers uppercase', () => { + expect('0xFF').toMatchToken('Number', '0xFF') + }) + + test('octal numbers', () => { + expect('0o644').toMatchToken('Number', '0o644') + expect('0o055').toMatchToken('Number', '0o055') + }) + + test('negative binary', () => { + expect('-0b110').toMatchToken('Number', '-0b110') + }) + + test('negative hex', () => { + expect('-0xFF').toMatchToken('Number', '-0xFF') + }) + + test('negative octal', () => { + expect('-0o755').toMatchToken('Number', '-0o755') + }) + + test('positive prefix binary', () => { + expect('+0b110').toMatchToken('Number', '+0b110') + }) + + test('positive prefix hex', () => { + expect('+0xFF').toMatchToken('Number', '+0xFF') + }) + + test('positive prefix octal', () => { + expect('+0o644').toMatchToken('Number', '+0o644') + }) + + test('underscores in number', () => { + expect(`1_000`).toMatchToken('Number', '1_000') + expect(`1_0`).toMatchToken('Number', '1_0') + expect('0b11_0').toMatchToken('Number', '0b11_0') + expect('0xdead_beef').toMatchToken('Number', '0xdead_beef') + 
expect('0o64_4').toMatchToken('Number', '0o64_4') + }) +}) + +describe('identifiers', () => { + test('regular', () => { + expect('name').toBeToken('Identifier') + expect('bobby-mcgee').toBeToken('Identifier') + expect('starts-with?').toBeToken('Identifier') + expect('📢').toMatchToken('Identifier', '📢') + expect(' 📢 ').toMatchToken('Identifier', '📢') + expect(' oink-🐷-oink').toMatchToken('Identifier', 'oink-🐷-oink') + expect('$').toMatchToken('Identifier', '$') + expect('$cool').toMatchToken('Identifier', '$cool') + }) + + test('one character identifiers', () => { + expect('a').toMatchToken('Identifier', 'a') + expect('z').toMatchToken('Identifier', 'z') + expect('$').toMatchToken('Identifier', '$') + expect('📢').toMatchToken('Identifier', '📢') + expect('?').toBeToken('Word') // ? alone is not valid identifier start + }) + + test('two character identifiers', () => { + expect('ab').toMatchToken('Identifier', 'ab') + expect('a1').toMatchToken('Identifier', 'a1') + expect('a-').toMatchToken('Identifier', 'a-') + expect('a?').toMatchToken('Identifier', 'a?') // ? valid at end + expect('ab?').toMatchToken('Identifier', 'ab?') + }) + + test('three+ character identifiers', () => { + expect('abc').toMatchToken('Identifier', 'abc') + expect('a-b').toMatchToken('Identifier', 'a-b') + expect('a1b').toMatchToken('Identifier', 'a1b') + expect('abc?').toMatchToken('Identifier', 'abc?') // ? 
valid at end + expect('a-b-c?').toMatchToken('Identifier', 'a-b-c?') + }) + + test('edge cases', () => { + expect('-bobby-mcgee').toBeToken('Word') + expect('starts-with??').toMatchToken('Identifier', 'starts-with??') + expect('starts?with?').toMatchToken('Identifier', 'starts?with?') + expect('a??b').toMatchToken('Identifier', 'a??b') + expect('oink-oink!').toBeToken('Word') + expect('dog#pound').toMatchToken('Word', 'dog#pound') + expect('http://website.com').toMatchToken('Word', 'http://website.com') + expect('school$cool').toMatchToken('Identifier', 'school$cool') + }) +}) + +describe('paths', () => { + test('starting with ./', () => { + expect('./tmp').toMatchToken('Word', './tmp') + }) + + test('starting with /', () => { + expect('/home/chris/dev').toMatchToken('Word', '/home/chris/dev') + }) + + test('ending with ext', () => { + expect('readme.txt').toMatchToken('Word', 'readme.txt') + expect('README.md').toMatchToken('Word', 'README.md') + }) + + test('all sorts of weird stuff', () => { + expect('dog#pound').toMatchToken('Word', 'dog#pound') + expect('my/kinda/place').toMatchToken('my/kinda/place') + expect('file://%/$##/@40!/index.php').toMatchToken('Word', 'file://%/$##/@40!/index.php') + }) +}) + +describe('strings', () => { + test('single quoted', () => { + expect(`'hello world'`).toMatchToken('String', `'hello world'`) + expect(`'it\\'s a beautiful world'`).toMatchToken("'it\\'s a beautiful world'") + }) + + test('double quoted', () => { + expect(`"hello world"`).toMatchToken('String', `"hello world"`) + expect(`"it's a beautiful world"`).toMatchToken('String', `"it's a beautiful world"`) + }) + + test('empty strings', () => { + expect(`''`).toMatchToken('String', `''`) + expect(`""`).toMatchToken('String', `""`) + }) + + test('escape sequences', () => { + expect(`'hello\\nworld'`).toMatchToken('String', `'hello\\nworld'`) + expect(`'tab\\there'`).toMatchToken('String', `'tab\\there'`) + expect(`'quote\\''`).toMatchToken('String', `'quote\\''`) + 
expect(`'backslash\\\\'`).toMatchToken('String', `'backslash\\\\'`) + expect(`'dollar\\$sign'`).toMatchToken('String', `'dollar\\$sign'`) + }) + + test('unclosed strings - error case', () => { + // These should either fail or produce unexpected results + expect(`'hello`).toMatchToken('String', `'hello`) + expect(`"world`).toMatchToken('String', `"world`) + }) +}) + +describe('curly strings', () => { + test('curly quoted', () => { + expect('{ one two three }').toMatchToken('String', `{ one two three }`) + }) + + test('work on multiple lines', () => { + expect(`{ + one + two + three }`).toMatchToken('String', `{ + one + two + three }`) + }) + + test('can contain other curlies', () => { + expect(`{ { one } + two + { three } }`).toMatchToken('String', `{ { one } + two + { three } }`) + }) + + test('empty curly string', () => { + expect('{}').toMatchToken('String', '{}') + }) + + test('unclosed curly string - error case', () => { + // Should either fail or produce unexpected results + expect('{ hello').toMatchToken('String', '{ hello') + expect('{ nested { unclosed }').toMatchToken('String', '{ nested { unclosed }') + }) +}) + +describe('operators', () => { + test('math operators', () => { + // assignment + expect('=').toMatchToken('Operator', '=') + + // logic + expect('or').toMatchToken('Operator', 'or') + expect('and').toMatchToken('Operator', 'and') + + // bitwise + expect('band').toMatchToken('Operator', 'band') + expect('bor').toMatchToken('Operator', 'bor') + expect('bxor').toMatchToken('Operator', 'bxor') + expect('>>>').toMatchToken('Operator', '>>>') + expect('>>').toMatchToken('Operator', '>>') + expect('<<').toMatchToken('Operator', '<<') + + // compound assignment + expect('??=').toMatchToken('Operator', '??=') + expect('+=').toMatchToken('Operator', '+=') + expect('-=').toMatchToken('Operator', '-=') + expect('*=').toMatchToken('Operator', '*=') + expect('/=').toMatchToken('Operator', '/=') + expect('%=').toMatchToken('Operator', '%=') + + // nullish + 
expect('??').toMatchToken('Operator', '??') + + // math + expect('**').toMatchToken('Operator', '**') + expect('*').toMatchToken('Operator', '*') + expect('/').toMatchToken('Operator', '/') + expect('+').toMatchToken('Operator', '+') + expect('-').toMatchToken('Operator', '-') + expect('%').toMatchToken('Operator', '%') + + // comparison + expect('>=').toMatchToken('Operator', '>=') + expect('<=').toMatchToken('Operator', '<=') + expect('!=').toMatchToken('Operator', '!=') + expect('==').toMatchToken('Operator', '==') + expect('>').toMatchToken('Operator', '>') + expect('<').toMatchToken('Operator', '<') + }) +}) + +describe('keywords', () => { + test('keywords', () => { + expect(`import`).toMatchToken('Keyword', 'import') + + expect(`end`).toMatchToken('Keyword', 'end') + expect(`do`).toMatchToken('Keyword', 'do') + + expect(`while`).toMatchToken('Keyword', 'while') + + expect(`if`).toMatchToken('Keyword', 'if') + expect(`else`).toMatchToken('Keyword', 'else') + + expect(`try`).toMatchToken('Keyword', 'try') + expect(`catch`).toMatchToken('Keyword', 'catch') + expect(`finally`).toMatchToken('Keyword', 'finally') + expect(`throw`).toMatchToken('Keyword', 'throw') + }) +}) + +describe('punctuation', () => { + test('underscore', () => { + expect(`_`).toBeToken('Underscore') + expect(`__`).toMatchToken('Word', '__') + }) + + test('semicolon', () => { + expect(`;`).toBeToken('Semicolon') + }) + + test('newline', () => { + expect('\n').toBeToken('Newline') + }) + + test('colon', () => { + expect(':').toBeToken('Colon') + }) +}) + +describe('comments', () => { + test('comments', () => { + expect(`# hey friends`).toMatchToken('Comment', '# hey friends') + expect(`#hey-friends`).toMatchToken('Comment', '#hey-friends') + }) +}) + +describe('brackets', () => { + test('parens', () => { + expect(`(`).toBeToken('OpenParen') + expect(`)`).toBeToken('CloseParen') + }) + + test('staples', () => { + expect(`[`).toBeToken('OpenBracket') + expect(`]`).toBeToken('CloseBracket') + }) 
+}) + +describe('multiple tokens', () => { + test('constants work fine', () => { + expect(`null true false`).toMatchTokens( + { type: 'Null' }, + { type: 'Boolean', value: 'true' }, + { type: 'Boolean', value: 'false' }, + ) + }) + + test('numbers', () => { + expect(`100 -400.42 null`).toMatchTokens( + { type: 'Number', value: '100' }, + { type: 'Number', value: '-400.42' }, + { type: 'Null' }, + ) + }) + + test('whitespace', () => { + expect(` + 'hello world' + + 'goodbye world' + `).toMatchTokens( + { type: 'Newline' }, + { type: 'String', value: "'hello world'" }, + { type: 'Newline' }, + { type: 'Newline' }, + { type: 'String', value: "'goodbye world'" }, + { type: 'Newline' }, + ) + }) + + test('newline in parens is ignored', () => { + expect(`( + 'hello world' + + 'goodbye world' + )`).toMatchTokens( + { type: 'OpenParen' }, + { type: 'String', value: "'hello world'" }, + { type: 'String', value: "'goodbye world'" }, + { type: 'CloseParen' }, + ) + }) + + test('newline in brackets is ignored', () => { + expect(`[ + a b +c d + +e + +f + + ]`).toMatchTokens( + { type: 'OpenBracket' }, + { type: 'Identifier', value: "a" }, + { type: 'Identifier', value: "b" }, + { type: 'Identifier', value: "c" }, + { type: 'Identifier', value: "d" }, + { type: 'Identifier', value: "e" }, + { type: 'Identifier', value: "f" }, + { type: 'CloseBracket' }, + ) + }) + + test('function call', () => { + expect('echo hello world').toMatchTokens( + { type: 'Identifier', value: 'echo' }, + { type: 'Identifier', value: 'hello' }, + { type: 'Identifier', value: 'world' }, + ) + }) + + test('assignment', () => { + expect('x = 5').toMatchTokens( + { type: 'Identifier', value: 'x' }, + { type: 'Operator', value: '=' }, + { type: 'Number', value: '5' }, + ) + }) + + test('math expression', () => { + expect('1 + 2 * 3').toMatchTokens( + { type: 'Number', value: '1' }, + { type: 'Operator', value: '+' }, + { type: 'Number', value: '2' }, + { type: 'Operator', value: '*' }, + { type: 'Number', 
value: '3' }, + ) + }) + + test('inline comment', () => { + expect('x = 5 # set x').toMatchTokens( + { type: 'Identifier', value: 'x' }, + { type: 'Operator', value: '=' }, + { type: 'Number', value: '5' }, + { type: 'Comment', value: '# set x' }, + ) + }) + + test('line comment', () => { + expect('x = 5 \n# hello\n set x').toMatchTokens( + { type: 'Identifier', value: 'x' }, + { type: 'Operator', value: '=' }, + { type: 'Number', value: '5' }, + { type: 'Newline' }, + { type: 'Comment', value: '# hello' }, + { type: 'Newline' }, + { type: 'Identifier', value: 'set' }, + { type: 'Identifier', value: 'x' }, + ) + }) + + test('colons separate tokens', () => { + expect('x do: y').toMatchTokens( + { type: 'Identifier', value: 'x' }, + { type: 'Keyword', value: 'do' }, + { type: 'Colon' }, + { type: 'Identifier', value: 'y' }, + ) + + expect('x: y').toMatchTokens( + { type: 'Identifier', value: 'x' }, + { type: 'Colon' }, + { type: 'Identifier', value: 'y' }, + ) + + expect('5: y').toMatchTokens( + { type: 'Number', value: '5' }, + { type: 'Colon' }, + { type: 'Identifier', value: 'y' }, + ) + + expect(` +do x: + y +end`).toMatchTokens( + { type: 'Newline' }, + { type: 'Keyword', value: 'do' }, + { type: 'Identifier', value: 'x' }, + { type: 'Colon' }, + { type: 'Newline' }, + { type: 'Identifier', value: 'y' }, + { type: 'Newline' }, + { type: 'Keyword', value: 'end' }, + ) + }) + + test('semicolons separate statements', () => { + expect('x; y').toMatchTokens( + { type: 'Identifier', value: 'x' }, + { type: 'Semicolon' }, + { type: 'Identifier', value: 'y' }, + ) + }) + + test('semicolons in parens', () => { + expect('(x; y)').toMatchTokens( + { type: 'OpenParen' }, + { type: 'Identifier', value: 'x' }, + { type: 'Semicolon' }, + { type: 'Identifier', value: 'y' }, + { type: 'CloseParen' }, + ) + }) +}) + +describe('nesting edge cases', () => { + test('deeply nested parens', () => { + expect('((nested))').toMatchTokens( + { type: 'OpenParen' }, + { type: 'OpenParen' }, 
+ { type: 'Identifier', value: 'nested' }, + { type: 'CloseParen' }, + { type: 'CloseParen' }, + ) + }) + + test('mixed nesting', () => { + expect('([combo])').toMatchTokens( + { type: 'OpenParen' }, + { type: 'OpenBracket' }, + { type: 'Identifier', value: 'combo' }, + { type: 'CloseBracket' }, + { type: 'CloseParen' }, + ) + }) +}) + +describe('invalid numbers that should be words', () => { + test('invalid binary', () => { + expect('0b2').toMatchToken('Word', '0b2') + expect('0b123').toMatchToken('Word', '0b123') + }) + + test('invalid octal', () => { + expect('0o8').toMatchToken('Word', '0o8') + expect('0o999').toMatchToken('Word', '0o999') + }) + + test('invalid hex', () => { + expect('0xGGG').toMatchToken('Word', '0xGGG') + expect('0xZZZ').toMatchToken('Word', '0xZZZ') + }) + + test('multiple decimal points', () => { + expect('1.2.3').toMatchToken('Word', '1.2.3') + }) +}) + +describe('unicode and emoji', () => { + test('greek letters', () => { + expect('αβγ').toMatchToken('Identifier', 'αβγ') + expect('delta-δ').toMatchToken('Identifier', 'delta-δ') + }) + + test('math symbols', () => { + expect('∑').toMatchToken('Identifier', '∑') + expect('∏').toMatchToken('Identifier', '∏') + }) + + test('CJK characters', () => { + expect('你好').toMatchToken('Identifier', '你好') + expect('こんにちは').toMatchToken('Identifier', 'こんにちは') + }) +}) + +describe('empty and whitespace input', () => { + test('empty string', () => { + expect('').toMatchTokens() + }) + + test('only whitespace', () => { + expect(' ').toMatchTokens() + }) + + test('only tabs', () => { + expect('\t\t\t').toMatchTokens() + }) + + test('only newlines', () => { + expect('\n\n\n').toMatchTokens( + { type: 'Newline' }, + { type: 'Newline' }, + { type: 'Newline' }, + ) + }) +}) + +describe('named args', () => { + test("don't need spaces", () => { + expect(`named=arg`).toMatchTokens( + { type: 'NamedArgPrefix', value: 'named=' }, + { type: 'Identifier', value: 'arg' }, + ) + }) + + test("can have spaces", () => { + 
const DEBUG = process.env.DEBUG || false

/**
 * A single lexical token.
 *
 * `from`/`to` are UTF-16 offsets into the scanned input (`to` is exclusive).
 * `value` is only present for the token types listed in `valueTokens`.
 */
export type Token = {
  type: TokenType
  value?: string
  from: number
  to: number
}

/** Every kind of token the scanner can emit. Member order is part of the public contract. */
export enum TokenType {
  Comment,

  Keyword,
  Operator,

  Newline,
  Semicolon,
  Colon,
  Underscore,

  OpenParen,
  CloseParen,
  OpenBracket,
  CloseBracket,

  Identifier,
  Word,
  NamedArgPrefix,

  Null,
  Boolean,
  Number,
  String,
}

/** Token types whose source text is copied into `Token.value`. */
const valueTokens = new Set<TokenType>([
  TokenType.Comment,
  TokenType.Keyword,
  TokenType.Operator,
  TokenType.Identifier,
  TokenType.Word,
  TokenType.NamedArgPrefix,
  TokenType.Boolean,
  TokenType.Number,
  TokenType.String,
])

const operators = new Set<string>([
  // assignment
  '=',

  // logic
  'or',
  'and',

  // bitwise
  'band',
  'bor',
  'bxor',
  '>>>',
  '>>',
  '<<',

  // compound assignment
  '??=',
  '+=',
  '-=',
  '*=',
  '/=',
  '%=',

  // nullish
  '??',

  // math
  '**',
  '*',
  '/',
  '+',
  '-',
  '%',

  // comparison
  '>=',
  '<=',
  '!=',
  '==',
  '>',
  '<',
])

const keywords = new Set<string>([
  'import',
  'end',
  'do',
  'while',
  'if',
  'else',
  'try',
  'catch',
  'finally',
  'throw',
])

/**
 * Tagged-template helper: c`#` evaluates to the code point of the first
 * character of the (cooked) template text.
 */
function c(strings: TemplateStringsArray, ...values: unknown[]): number {
  const text = strings.reduce((result, str, i) => result + str + String(values[i] ?? ''), '')
  return text.codePointAt(0) ?? 0
}

// Character constants are computed once here instead of on every loop iteration.
const HASH = c`#`
const OPEN_PAREN = c`(`
const CLOSE_PAREN = c`)`
const OPEN_BRACKET = c`[`
const CLOSE_BRACKET = c`]`
const OPEN_CURLY = c`{`
const CLOSE_CURLY = c`}`
const SINGLE_QUOTE = c`'`
const DOUBLE_QUOTE = c`"`
const BACKSLASH = c`\\`
const COLON = c`:`
const SEMICOLON = c`;`
const EQUALS = c`=`
const MINUS = c`-`
const PLUS = c`+`
const NEWLINE = c`\n`

/**
 * Hand-written scanner that turns shrimp source text into a flat token list.
 *
 * Cursor model: `char` is the code point currently under inspection and `pos`
 * is the offset just past it, so the offset of `char` itself is
 * `pos - getCharSize(char)`. A `char` of 0 means end of input.
 */
export class Scanner {
  input = ''
  pos = 0
  start = 0
  char = 0
  prev = 0
  inParen = 0
  inBracket = 0
  tokens: Token[] = []

  /** Restores the scanner to its initial state so the instance can be reused. */
  reset() {
    this.input = ''
    this.pos = 0
    this.start = 0
    this.char = 0
    this.prev = 0
    // Nesting depth must not leak from a previous (possibly unbalanced) input.
    this.inParen = 0
    this.inBracket = 0
    // Allocate a fresh array: truncating in place would empty the array an
    // earlier tokenize() call already handed to its caller.
    this.tokens = []
  }

  /** Returns the code point `count` UTF-16 units past `pos` without consuming it (0 when out of range). */
  peek(count = 0): number {
    return getFullCodePoint(this.input, this.pos + count)
  }

  /**
   * Consumes one code point and makes it the current `char`.
   * At end of input `char` becomes 0 and `pos` stops at `input.length + 1`,
   * so repeated calls cannot push token offsets past the end of the input.
   */
  next(): number {
    this.prev = this.char
    this.char = this.peek()
    this.pos = Math.min(this.pos + getCharSize(this.char), this.input.length + 1)
    return this.char
  }

  /**
   * Appends a token.
   *
   * @param type token type to emit
   * @param from start offset; defaults to `this.start`
   * @param to exclusive end offset; defaults to the offset of the current
   *   `char`, i.e. the current character is treated as lookahead
   */
  push(type: TokenType, from?: number, to?: number) {
    const begin = from ?? this.start
    let end = to ?? this.pos - getCharSize(this.char)
    if (end < begin) end = begin

    const token: Token = valueTokens.has(type)
      ? { type, from: begin, to: end, value: this.input.slice(begin, end) }
      : { type, from: begin, to: end }
    this.tokens.push(token)

    if (DEBUG) {
      console.log(`≫ PUSH(${begin},${end})`, TokenType[token.type], '—', token.value)
    }

    this.start = this.pos
  }

  /** Emits a token spanning exactly the current character, then advances past it. */
  pushChar(type: TokenType) {
    this.push(type, this.pos - getCharSize(this.char), this.pos)
    this.next()
  }

  /**
   * Turns shrimp code into shrimp tokens that get fed into the parser.
   *
   * @param input source text
   * @returns the tokens in source order; newlines inside `()` / `[]` are not emitted
   */
  tokenize(input: string): Token[] {
    this.reset()
    this.input = input
    this.next()

    while (this.char > 0) {
      const char = this.char

      if (char === HASH) {
        this.readComment()
      } else if (isBracket(char)) {
        this.readBracket()
      } else if (isStringDelim(char)) {
        this.readString(char)
      } else if (char === OPEN_CURLY) {
        this.readCurlyString()
      } else if (isIdentStart(char)) {
        this.readIdentOrKeyword()
      } else if (isDigit(char) || ((char === MINUS || char === PLUS) && isDigit(this.peek()))) {
        this.readNumber()
      } else if (char === COLON) {
        this.pushChar(TokenType.Colon)
      } else if (isWordChar(char)) {
        this.readWord()
      } else if (char === NEWLINE) {
        // Newlines are insignificant while inside parens or brackets.
        if (this.inParen === 0 && this.inBracket === 0) this.pushChar(TokenType.Newline)
        else this.next()
      } else if (char === SEMICOLON) {
        this.pushChar(TokenType.Semicolon)
      } else {
        // Plain whitespace (or anything unrecognised) is skipped.
        this.next()
      }
    }

    return this.tokens
  }

  /** Reads from `#` up to, but not including, the next newline or end of input. */
  readComment() {
    this.start = this.pos - getCharSize(this.char)
    while (this.char !== NEWLINE && this.char > 0) this.next()
    this.push(TokenType.Comment)
  }

  /** Emits the bracket under the cursor and tracks nesting depth (never below zero). */
  readBracket() {
    switch (this.char) {
      case OPEN_PAREN:
        this.inParen++
        this.pushChar(TokenType.OpenParen)
        break
      case CLOSE_PAREN:
        // A stray closer must not drive the depth negative, which would
        // suppress every following Newline token.
        this.inParen = Math.max(0, this.inParen - 1)
        this.pushChar(TokenType.CloseParen)
        break
      case OPEN_BRACKET:
        this.inBracket++
        this.pushChar(TokenType.OpenBracket)
        break
      case CLOSE_BRACKET:
        this.inBracket = Math.max(0, this.inBracket - 1)
        this.pushChar(TokenType.CloseBracket)
        break
      default:
        this.next()
    }
  }

  /**
   * Reads a quoted string including both delimiters. A backslash escapes the
   * character after it, so `\\` followed by the delimiter closes the string.
   * An unclosed string runs to the end of the input.
   */
  readString(delim: number) {
    this.start = this.pos - getCharSize(this.char)
    this.next() // skip opening delim

    while (this.char > 0 && this.char !== delim) {
      if (this.char === BACKSLASH) this.next() // skip the escaped character
      this.next()
    }
    if (this.char === delim) this.next() // skip closing delim

    this.push(TokenType.String)
  }

  /** Reads a `{ ... }` string with balanced nested curlies; unclosed input runs to the end. */
  readCurlyString() {
    this.start = this.pos - getCharSize(this.char)
    let depth = 1
    this.next()

    while (depth > 0 && this.char > 0) {
      if (this.char === OPEN_CURLY) depth++
      if (this.char === CLOSE_CURLY) depth--
      this.next()
    }

    this.push(TokenType.String)
  }

  /** Reads a run starting with an identifier character and classifies it. */
  readIdentOrKeyword() {
    this.start = this.pos - getCharSize(this.char)

    while (isWordChar(this.char)) {
      // stop at colon if followed by whitespace (e.g., 'do x: echo x end')
      if (this.char === COLON && isWhitespace(this.peek())) break

      // stop right after an equal sign (named arg)
      if (this.char === EQUALS) {
        this.next()
        break
      }

      this.next()
    }

    const ident = this.input.slice(this.start, this.pos - getCharSize(this.char))
    this.push(classifyIdent(ident))
  }

  /** Reads a run starting with a digit or sign; emits Number if it parses as one, otherwise Word. */
  readNumber() {
    this.start = this.pos - getCharSize(this.char)

    while (isWordChar(this.char)) {
      // stop at colon if followed by whitespace
      if (this.char === COLON && isWhitespace(this.peek())) break
      this.next()
    }

    const text = this.input.slice(this.start, this.pos - getCharSize(this.char))
    this.push(isNumber(text) ? TokenType.Number : TokenType.Word)
  }

  /** Reads any other run of word characters: `_`, a symbolic operator, or a generic Word. */
  readWord() {
    this.start = this.pos - getCharSize(this.char)

    while (isWordChar(this.char)) this.next()

    const word = this.input.slice(this.start, this.pos - getCharSize(this.char))

    if (word === '_') this.push(TokenType.Underscore)
    else if (operators.has(word)) this.push(TokenType.Operator)
    else this.push(TokenType.Word)
  }
}

/** Decides which token type a run that started with an identifier character becomes. */
const classifyIdent = (ident: string): TokenType => {
  if (ident === 'null') return TokenType.Null
  if (ident === 'true' || ident === 'false') return TokenType.Boolean
  if (isKeyword(ident)) return TokenType.Keyword
  if (isOperator(ident)) return TokenType.Operator // only things like `and` and `or`
  if (isIdentifier(ident)) return TokenType.Identifier
  if (ident.endsWith('=')) return TokenType.NamedArgPrefix
  return TokenType.Word
}

// Digit groups are separated by single underscores. Writing the separator as a
// mandatory `_` (instead of `_?`) keeps matching linear; the optional form
// backtracks exponentially on a long digit run followed by a non-digit.
const DECIMAL_NUMBER = /^[+-]?\d+(?:_\d+)*(?:\.\d+(?:_\d+)*)?$/
const BINARY_NUMBER = /^[+-]?0b[01]+(?:_[01]+)*(?:\.[01]+(?:_[01]+)*)?$/
const OCTAL_NUMBER = /^[+-]?0o[0-7]+(?:_[0-7]+)*(?:\.[0-7]+(?:_[0-7]+)*)?$/
const HEX_NUMBER = /^[+-]?0x[0-9a-f]+(?:_[0-9a-f]+)*(?:\.[0-9a-f]+(?:_[0-9a-f]+)*)?$/i

/** True when `word` is a decimal, binary (`0b`), octal (`0o`) or hex (`0x`) literal. */
const isNumber = (word: string): boolean =>
  DECIMAL_NUMBER.test(word) ||
  BINARY_NUMBER.test(word) ||
  OCTAL_NUMBER.test(word) ||
  HEX_NUMBER.test(word)

/** True when `s` starts with an identifier-start character and continues with identifier characters. */
const isIdentifier = (s: string): boolean => {
  if (s.length === 0) return false

  let first = true
  // for..of walks the string by code point, so emoji count as one character
  for (const glyph of s) {
    const cp = glyph.codePointAt(0) ?? 0
    const ok = first ? isIdentStart(cp) : isIdentEnd(cp)
    if (!ok) return false
    first = false
  }

  return true
}

/** Normalises a code point or a string (its first code point) to a number. */
const toCodePoint = (char: number | string): number =>
  typeof char === 'string' ? char.codePointAt(0) ?? 0 : char

const isStringDelim = (ch: number): boolean => ch === SINGLE_QUOTE || ch === DOUBLE_QUOTE

const isIdentStart = (char: number | string): boolean => {
  const ch = toCodePoint(char)
  return isLowercaseLetter(ch) || isEmojiOrUnicode(ch) || ch === 36 /* $ */
}

const isIdentChar = (char: number | string): boolean => {
  const ch = toCodePoint(char)
  return isIdentStart(ch) || isDigit(ch) || ch === 45 /* - */ || ch === 63 /* ? */
}

const isIdentEnd = (char: number | string): boolean => isIdentChar(char)

const isLowercaseLetter = (ch: number): boolean => ch >= 97 && ch <= 122 // a-z

const isDigit = (ch: number): boolean => ch >= 48 && ch <= 57 // 0-9

const isWhitespace = (ch: number): boolean =>
  ch === 32 /* space */ ||
  ch === 9 /* tab */ ||
  ch === 13 /* \r */ ||
  ch === 10 /* \n */ ||
  ch === -1 ||
  ch === 0 /* EOF */

/** Anything that is not whitespace/EOF, `;`, `)` or `]` can be part of a word. */
const isWordChar = (ch: number): boolean =>
  !isWhitespace(ch) && ch !== SEMICOLON && ch !== CLOSE_PAREN && ch !== CLOSE_BRACKET

const isOperator = (word: string): boolean => operators.has(word)

const isKeyword = (word: string): boolean => keywords.has(word)

const isBracket = (char: number): boolean =>
  char === OPEN_PAREN || char === CLOSE_PAREN || char === OPEN_BRACKET || char === CLOSE_BRACKET

/** Number of UTF-16 code units a code point occupies (astral characters such as emoji take 2). */
const getCharSize = (ch: number): number => (ch > 0xffff ? 2 : 1)

/**
 * Returns the full code point starting at `pos`, combining a surrogate pair
 * when one is present, or 0 when `pos` is outside the input.
 */
const getFullCodePoint = (input: string, pos: number): number => input.codePointAt(pos) ?? 0

/** True for the emoji / non-ASCII ranges that are allowed to start an identifier. */
const isEmojiOrUnicode = (ch: number): boolean => {
  return (
    // Basic Emoticons
    (ch >= 0x1f600 && ch <= 0x1f64f) ||
    // Miscellaneous Symbols and Pictographs
    (ch >= 0x1f300 && ch <= 0x1f5ff) ||
    // Transport and Map Symbols
    (ch >= 0x1f680 && ch <= 0x1f6ff) ||
    // Regional Indicator Symbols (flags)
    (ch >= 0x1f1e6 && ch <= 0x1f1ff) ||
    // Miscellaneous Symbols (hearts, stars, weather)
    (ch >= 0x2600 && ch <= 0x26ff) ||
    // Dingbats (scissors, pencils, etc)
    (ch >= 0x2700 && ch <= 0x27bf) ||
    // Supplemental Symbols and Pictographs (newer emojis)
    (ch >= 0x1f900 && ch <= 0x1f9ff) ||
    // Symbols and Pictographs Extended-A (newest emojis)
    (ch >= 0x1fa70 && ch <= 0x1faff) ||
    // Various Asian Characters with emoji presentation
    (ch >= 0x1f018 && ch <= 0x1f270) ||
    // Variation Selectors (for emoji presentation)
    (ch >= 0xfe00 && ch <= 0xfe0f) ||
    // Additional miscellaneous items
    (ch >= 0x238c && ch <= 0x2454) ||
    // Combining Diacritical Marks for Symbols
    (ch >= 0x20d0 && ch <= 0x20ff) ||
    // Latin-1 Supplement (includes ², ³, ¹ and other special chars)
    (ch >= 0x00a0 && ch <= 0x00ff) ||
    // Greek and Coptic (U+0370-U+03FF)
    (ch >= 0x0370 && ch <= 0x03ff) ||
    // Mathematical Alphanumeric Symbols (U+1D400-U+1D7FF)
    (ch >= 0x1d400 && ch <= 0x1d7ff) ||
    // Mathematical Operators (U+2200-U+22FF)
    (ch >= 0x2200 && ch <= 0x22ff) ||
    // Superscripts and Subscripts (U+2070-U+209F)
    (ch >= 0x2070 && ch <= 0x209f) ||
    // Arrows (U+2190-U+21FF)
    (ch >= 0x2190 && ch <= 0x21ff) ||
    // Hiragana (U+3040-U+309F)
    (ch >= 0x3040 && ch <= 0x309f) ||
    // Katakana (U+30A0-U+30FF)
    (ch >= 0x30a0 && ch <= 0x30ff) ||
    // CJK Unified Ideographs (U+4E00-U+9FFF)
    (ch >= 0x4e00 && ch <= 0x9fff)
  )
}
typeOrValue : undefined + + try { + const tokens = tokenize(received) + const token = tokens[0] as Token + + if (!token) { + return { + message: () => `Expected token to be ${expectedValue.replaceAll('\n', '\\n')}, got ${token}`, + pass: false, + } + } + + if (expectedType && TokenType[expectedType as keyof typeof TokenType] !== token.type) { + return { + message: () => `Expected token to be ${expectedType}, but got ${TokenType[token.type]}`, + pass: false + } + } + + return { + message: () => `Expected token to be ${expectedValue.replaceAll('\n', '\\n')}, but got ${token.value}`, + pass: token.value === expectedValue + } + } catch (error) { + return { + message: () => `Tokenization failed: ${errorMessage(error)} `, + pass: false, + } + } + }, + toMatchTokens(received: unknown, ...tokens: { type: string, value?: string }[]) { + assert(typeof received === 'string', 'toMatchTokens can only be used with string values') + + try { + const result = tokenize(received).map(t => toHumanToken(t)) + + if (result.length === 0 && tokens.length > 0) { + return { + message: () => `Expected tokens ${JSON.stringify(tokens)}, got nothing`, + pass: false, + } + } + + const expected = JSON.stringify(tokens, null, 2) + const actual = JSON.stringify(result, null, 2) + + return { + message: () => `Tokens don't match: \n\n${diff(actual, expected)}`, + pass: expected == actual + } + } catch (error) { + return { + message: () => `Tokenization failed: ${errorMessage(error)} `, + pass: false, + } + } + } }) +const tokenize = (code: string): Token[] => { + const scanner = new Scanner + return scanner.tokenize(code) +} + +const toHumanToken = (tok: Token): { type: string, value: string } => { + return { + type: TokenType[tok.type], + value: tok.value + } +} + const trimWhitespace = (str: string): string => { const lines = str.split('\n').filter((line) => line.trim().length > 0) const firstLine = lines[0] @@ -157,10 +262,33 @@ const trimWhitespace = (str: string): string => { if 
(!line.startsWith(leadingWhitespace)) { let foundWhitespace = line.match(/^(\s*)/)?.[1] || '' throw new Error( - `Line has inconsistent leading whitespace: "${line}" (found "${foundWhitespace}", expected "${leadingWhitespace}")` + `Line has inconsistent leading whitespace: "${line}"(found "${foundWhitespace}", expected "${leadingWhitespace}")` ) } return line.slice(leadingWhitespace.length) }) .join('\n') } + +const diff = (a: string, b: string): string => { + const expected = a.trim() + const actual = b.trim() + const lines = [] + + if (expected !== actual) { + const changes = diffLines(actual, expected) + for (const part of changes) { + const sign = part.added ? "+" : part.removed ? "-" : " " + let line = sign + part.value + if (part.added) { + line = color.green(line) + } else if (part.removed) { + line = color.red(line) + } + + lines.push(line.endsWith("\n") || line.endsWith("\n\u001b[39m") ? line : line + "\n") + } + } + + return lines.join('\n') +} \ No newline at end of file