From ae469882194df1296b09cd5e4dc1085b5153fe1a Mon Sep 17 00:00:00 2001 From: Chris Wanstrath Date: Wed, 12 Nov 2025 11:29:03 -0800 Subject: [PATCH 01/35] sorry lezer... --- bun.lock | 6 + package.json | 4 +- src/parser/tests/tokens.test.ts | 593 ++++++++++++++++++++++++++++++++ src/parser/tokenizer2.ts | 508 +++++++++++++++++++++++++++ src/testSetup.ts | 130 ++++++- 5 files changed, 1239 insertions(+), 2 deletions(-) create mode 100644 src/parser/tests/tokens.test.ts create mode 100644 src/parser/tokenizer2.ts diff --git a/bun.lock b/bun.lock index 1767760..2732c07 100644 --- a/bun.lock +++ b/bun.lock @@ -16,6 +16,8 @@ "@lezer/highlight": "^1.2.1", "@lezer/lr": "^1.4.2", "@types/bun": "latest", + "diff": "^8.0.2", + "kleur": "^4.1.5", }, }, }, @@ -60,8 +62,12 @@ "csstype": ["csstype@3.1.3", "", {}, "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw=="], + "diff": ["diff@8.0.2", "", {}, "sha512-sSuxWU5j5SR9QQji/o2qMvqRNYRDOcBTgsJ/DeCf4iSN4gW+gNMXM7wFIP+fdXZxoNiAnHUTGjCr+TSWXdRDKg=="], + "hono": ["hono@4.10.4", "", {}, "sha512-YG/fo7zlU3KwrBL5vDpWKisLYiM+nVstBQqfr7gCPbSYURnNEP9BDxEMz8KfsDR9JX0lJWDRNc6nXX31v7ZEyg=="], + "kleur": ["kleur@4.1.5", "", {}, "sha512-o+NO+8WrRiQEE4/7nwRJhN1HWpVmJm511pBHUxPLtp0BUISzlBplORYSmTclCnJvQq2tKu/sgl3xVpkc7ZWuQQ=="], + "reefvm": ["reefvm@git+https://git.nose.space/defunkt/reefvm#3e2e68b31f504347225a4d705c7568a0957d629e", { "peerDependencies": { "typescript": "^5" } }, "3e2e68b31f504347225a4d705c7568a0957d629e"], "style-mod": ["style-mod@4.1.3", "", {}, "sha512-i/n8VsZydrugj3Iuzll8+x/00GH2vnYsk1eomD8QiRrSAeW6ItbCQDtfXCeJHd0iwiNagqjQkvpvREEPtW3IoQ=="], diff --git a/package.json b/package.json index f167e90..1c67e2d 100644 --- a/package.json +++ b/package.json @@ -24,7 +24,9 @@ "devDependencies": { "@lezer/highlight": "^1.2.1", "@lezer/lr": "^1.4.2", - "@types/bun": "latest" + "@types/bun": "latest", + "diff": "^8.0.2", + "kleur": "^4.1.5" }, "prettier": { "semi": false, diff --git a/src/parser/tests/tokens.test.ts b/src/parser/tests/tokens.test.ts new file mode 100644 index 0000000..5f5e3bf --- /dev/null +++ b/src/parser/tests/tokens.test.ts @@ -0,0 +1,593 @@ +import { expect, describe, test } from 'bun:test' + +describe('constant types', () => { + test('null', () => { + expect(`null`).toBeToken('Null') + }) + + test('boolean', () => { + expect(`true`).toMatchToken('Boolean', 'true') + expect(`false`).toMatchToken('Boolean', 'false') + }) +}) + +describe('numbers', () => { + test('non-numbers', () => { + expect(`1st`).toMatchToken('Word', '1st') + expect(`1_`).toMatchToken('Word', '1_') + expect(`100.`).toMatchToken('Word', '100.') + }) + + test('simple numbers', () => { + expect(`1`).toMatchToken('Number', '1') + expect(`200`).toMatchToken('Number', '200') + expect(`5.20`).toMatchToken('Number', '5.20') + expect(`0.20`).toMatchToken('Number', '0.20') + expect(`-20`).toMatchToken('Number', '-20') + expect(`+20`).toMatchToken('Number', '+20') + expect(`-2134.34`).toMatchToken('Number', '-2134.34') + expect(`+20.5325`).toMatchToken('Number', '+20.5325') + expect(`1_000`).toMatchToken('Number', '1_000') + expect(`53_232_220`).toMatchToken('Number', '53_232_220') + }) + + test('binary numbers', () => { + expect('0b110').toMatchToken('Number', '0b110') + }) + + test('hex numbers', () => { + expect('0xdeadbeef').toMatchToken('Number', '0xdeadbeef') + expect('0x02d3f4').toMatchToken('Number', '0x02d3f4') + }) + + test('hex numbers uppercase', () => { + expect('0xFF').toMatchToken('Number', '0xFF') + }) + + test('octal 
numbers', () => { + expect('0o644').toMatchToken('Number', '0o644') + expect('0o055').toMatchToken('Number', '0o055') + }) + + test('negative binary', () => { + expect('-0b110').toMatchToken('Number', '-0b110') + }) + + test('negative hex', () => { + expect('-0xFF').toMatchToken('Number', '-0xFF') + }) + + test('negative octal', () => { + expect('-0o755').toMatchToken('Number', '-0o755') + }) + + test('positive prefix binary', () => { + expect('+0b110').toMatchToken('Number', '+0b110') + }) + + test('positive prefix hex', () => { + expect('+0xFF').toMatchToken('Number', '+0xFF') + }) + + test('positive prefix octal', () => { + expect('+0o644').toMatchToken('Number', '+0o644') + }) + + test('underscores in number', () => { + expect(`1_000`).toMatchToken('Number', '1_000') + expect(`1_0`).toMatchToken('Number', '1_0') + expect('0b11_0').toMatchToken('Number', '0b11_0') + expect('0xdead_beef').toMatchToken('Number', '0xdead_beef') + expect('0o64_4').toMatchToken('Number', '0o64_4') + }) +}) + +describe('identifiers', () => { + test('regular', () => { + expect('name').toBeToken('Identifier') + expect('bobby-mcgee').toBeToken('Identifier') + expect('starts-with?').toBeToken('Identifier') + expect('📢').toMatchToken('Identifier', '📢') + expect(' 📢 ').toMatchToken('Identifier', '📢') + expect(' oink-🐷-oink').toMatchToken('Identifier', 'oink-🐷-oink') + expect('$').toMatchToken('Identifier', '$') + expect('$cool').toMatchToken('Identifier', '$cool') + }) + + test('one character identifiers', () => { + expect('a').toMatchToken('Identifier', 'a') + expect('z').toMatchToken('Identifier', 'z') + expect('$').toMatchToken('Identifier', '$') + expect('📢').toMatchToken('Identifier', '📢') + expect('?').toBeToken('Word') // ? alone is not valid identifier start + }) + + test('two character identifiers', () => { + expect('ab').toMatchToken('Identifier', 'ab') + expect('a1').toMatchToken('Identifier', 'a1') + expect('a-').toMatchToken('Identifier', 'a-') + expect('a?').toMatchToken('Identifier', 'a?') // ? valid at end + expect('ab?').toMatchToken('Identifier', 'ab?') + }) + + test('three+ character identifiers', () => { + expect('abc').toMatchToken('Identifier', 'abc') + expect('a-b').toMatchToken('Identifier', 'a-b') + expect('a1b').toMatchToken('Identifier', 'a1b') + expect('abc?').toMatchToken('Identifier', 'abc?') // ? 
valid at end + expect('a-b-c?').toMatchToken('Identifier', 'a-b-c?') + }) + + test('edge cases', () => { + expect('-bobby-mcgee').toBeToken('Word') + expect('starts-with??').toMatchToken('Identifier', 'starts-with??') + expect('starts?with?').toMatchToken('Identifier', 'starts?with?') + expect('a??b').toMatchToken('Identifier', 'a??b') + expect('oink-oink!').toBeToken('Word') + expect('dog#pound').toMatchToken('Word', 'dog#pound') + expect('http://website.com').toMatchToken('Word', 'http://website.com') + expect('school$cool').toMatchToken('Identifier', 'school$cool') + }) +}) + +describe('paths', () => { + test('starting with ./', () => { + expect('./tmp').toMatchToken('Word', './tmp') + }) + + test('starting with /', () => { + expect('/home/chris/dev').toMatchToken('Word', '/home/chris/dev') + }) + + test('ending with ext', () => { + expect('readme.txt').toMatchToken('Word', 'readme.txt') + expect('README.md').toMatchToken('Word', 'README.md') + }) + + test('all sorts of weird stuff', () => { + expect('dog#pound').toMatchToken('Word', 'dog#pound') + expect('my/kinda/place').toMatchToken('my/kinda/place') + expect('file://%/$##/@40!/index.php').toMatchToken('Word', 'file://%/$##/@40!/index.php') + }) +}) + +describe('strings', () => { + test('single quoted', () => { + expect(`'hello world'`).toMatchToken('String', `'hello world'`) + expect(`'it\\'s a beautiful world'`).toMatchToken("'it\\'s a beautiful world'") + }) + + test('double quoted', () => { + expect(`"hello world"`).toMatchToken('String', `"hello world"`) + expect(`"it's a beautiful world"`).toMatchToken('String', `"it's a beautiful world"`) + }) + + test('empty strings', () => { + expect(`''`).toMatchToken('String', `''`) + expect(`""`).toMatchToken('String', `""`) + }) + + test('escape sequences', () => { + expect(`'hello\\nworld'`).toMatchToken('String', `'hello\\nworld'`) + expect(`'tab\\there'`).toMatchToken('String', `'tab\\there'`) + expect(`'quote\\''`).toMatchToken('String', `'quote\\''`) + expect(`'backslash\\\\'`).toMatchToken('String', `'backslash\\\\'`) + expect(`'dollar\\$sign'`).toMatchToken('String', `'dollar\\$sign'`) + }) + + test('unclosed strings - error case', () => { + // These should either fail or produce unexpected results + expect(`'hello`).toMatchToken('String', `'hello`) + expect(`"world`).toMatchToken('String', `"world`) + }) +}) + +describe('curly strings', () => { + test('curly quoted', () => { + expect('{ one two three }').toMatchToken('String', `{ one two three }`) + }) + + test('work on multiple lines', () => { + expect(`{ + one + two + three }`).toMatchToken('String', `{ + one + two + three }`) + }) + + test('can contain other curlies', () => { + expect(`{ { one } + two + { three } }`).toMatchToken('String', `{ { one } + two + { three } }`) + }) + + test('empty curly string', () => { + expect('{}').toMatchToken('String', '{}') + }) + + test('unclosed curly string - error case', () => { + // Should either fail or produce unexpected results + expect('{ hello').toMatchToken('String', '{ hello') + expect('{ nested { unclosed }').toMatchToken('String', '{ nested { unclosed }') + }) +}) + +describe('operators', () => { + test('math operators', () => { + // assignment + expect('=').toMatchToken('Operator', '=') + + // logic + expect('or').toMatchToken('Operator', 'or') + expect('and').toMatchToken('Operator', 'and') + + // bitwise + expect('band').toMatchToken('Operator', 'band') + expect('bor').toMatchToken('Operator', 'bor') + expect('bxor').toMatchToken('Operator', 'bxor') + 
expect('>>>').toMatchToken('Operator', '>>>') + expect('>>').toMatchToken('Operator', '>>') + expect('<<').toMatchToken('Operator', '<<') + + // compound assignment + expect('??=').toMatchToken('Operator', '??=') + expect('+=').toMatchToken('Operator', '+=') + expect('-=').toMatchToken('Operator', '-=') + expect('*=').toMatchToken('Operator', '*=') + expect('/=').toMatchToken('Operator', '/=') + expect('%=').toMatchToken('Operator', '%=') + + // nullish + expect('??').toMatchToken('Operator', '??') + + // math + expect('**').toMatchToken('Operator', '**') + expect('*').toMatchToken('Operator', '*') + expect('/').toMatchToken('Operator', '/') + expect('+').toMatchToken('Operator', '+') + expect('-').toMatchToken('Operator', '-') + expect('%').toMatchToken('Operator', '%') + + // comparison + expect('>=').toMatchToken('Operator', '>=') + expect('<=').toMatchToken('Operator', '<=') + expect('!=').toMatchToken('Operator', '!=') + expect('==').toMatchToken('Operator', '==') + expect('>').toMatchToken('Operator', '>') + expect('<').toMatchToken('Operator', '<') + }) +}) + +describe('keywords', () => { + test('keywords', () => { + expect(`import`).toMatchToken('Keyword', 'import') + + expect(`end`).toMatchToken('Keyword', 'end') + expect(`do`).toMatchToken('Keyword', 'do') + + expect(`while`).toMatchToken('Keyword', 'while') + + expect(`if`).toMatchToken('Keyword', 'if') + expect(`else`).toMatchToken('Keyword', 'else') + + expect(`try`).toMatchToken('Keyword', 'try') + expect(`catch`).toMatchToken('Keyword', 'catch') + expect(`finally`).toMatchToken('Keyword', 'finally') + expect(`throw`).toMatchToken('Keyword', 'throw') + }) +}) + +describe('punctuation', () => { + test('underscore', () => { + expect(`_`).toBeToken('Underscore') + expect(`__`).toMatchToken('Word', '__') + }) + + test('semicolon', () => { + expect(`;`).toBeToken('Semicolon') + }) + + test('newline', () => { + expect('\n').toBeToken('Newline') + }) + + test('colon', () => { + expect(':').toBeToken('Colon') + }) +}) + +describe('comments', () => { + test('comments', () => { + expect(`# hey friends`).toMatchToken('Comment', '# hey friends') + expect(`#hey-friends`).toMatchToken('Comment', '#hey-friends') + }) +}) + +describe('brackets', () => { + test('parens', () => { + expect(`(`).toBeToken('OpenParen') + expect(`)`).toBeToken('CloseParen') + }) + + test('staples', () => { + expect(`[`).toBeToken('OpenBracket') + expect(`]`).toBeToken('CloseBracket') + }) +}) + +describe('multiple tokens', () => { + test('constants work fine', () => { + expect(`null true false`).toMatchTokens( + { type: 'Null' }, + { type: 'Boolean', value: 'true' }, + { type: 'Boolean', value: 'false' }, + ) + }) + + test('numbers', () => { + expect(`100 -400.42 null`).toMatchTokens( + { type: 'Number', value: '100' }, + { type: 'Number', value: '-400.42' }, + { type: 'Null' }, + ) + }) + + test('whitespace', () => { + expect(` + 'hello world' + + 'goodbye world' + `).toMatchTokens( + { type: 'Newline' }, + { type: 'String', value: "'hello world'" }, + { type: 'Newline' }, + { type: 'Newline' }, + { type: 'String', value: "'goodbye world'" }, + { type: 'Newline' }, + ) + }) + + test('newline in parens is ignored', () => { + expect(`( + 'hello world' + + 'goodbye world' + )`).toMatchTokens( + { type: 'OpenParen' }, + { type: 'String', value: "'hello world'" }, + { type: 'String', value: "'goodbye world'" }, + { type: 'CloseParen' }, + ) + }) + + test('newline in brackets is ignored', () => { + expect(`[ + a b +c d + +e + +f + + ]`).toMatchTokens( + { type: 
'OpenBracket' }, + { type: 'Identifier', value: "a" }, + { type: 'Identifier', value: "b" }, + { type: 'Identifier', value: "c" }, + { type: 'Identifier', value: "d" }, + { type: 'Identifier', value: "e" }, + { type: 'Identifier', value: "f" }, + { type: 'CloseBracket' }, + ) + }) + + test('function call', () => { + expect('echo hello world').toMatchTokens( + { type: 'Identifier', value: 'echo' }, + { type: 'Identifier', value: 'hello' }, + { type: 'Identifier', value: 'world' }, + ) + }) + + test('assignment', () => { + expect('x = 5').toMatchTokens( + { type: 'Identifier', value: 'x' }, + { type: 'Operator', value: '=' }, + { type: 'Number', value: '5' }, + ) + }) + + test('math expression', () => { + expect('1 + 2 * 3').toMatchTokens( + { type: 'Number', value: '1' }, + { type: 'Operator', value: '+' }, + { type: 'Number', value: '2' }, + { type: 'Operator', value: '*' }, + { type: 'Number', value: '3' }, + ) + }) + + test('inline comment', () => { + expect('x = 5 # set x').toMatchTokens( + { type: 'Identifier', value: 'x' }, + { type: 'Operator', value: '=' }, + { type: 'Number', value: '5' }, + { type: 'Comment', value: '# set x' }, + ) + }) + + test('line comment', () => { + expect('x = 5 \n# hello\n set x').toMatchTokens( + { type: 'Identifier', value: 'x' }, + { type: 'Operator', value: '=' }, + { type: 'Number', value: '5' }, + { type: 'Newline' }, + { type: 'Comment', value: '# hello' }, + { type: 'Newline' }, + { type: 'Identifier', value: 'set' }, + { type: 'Identifier', value: 'x' }, + ) + }) + + test('colons separate tokens', () => { + expect('x do: y').toMatchTokens( + { type: 'Identifier', value: 'x' }, + { type: 'Keyword', value: 'do' }, + { type: 'Colon' }, + { type: 'Identifier', value: 'y' }, + ) + + expect('x: y').toMatchTokens( + { type: 'Identifier', value: 'x' }, + { type: 'Colon' }, + { type: 'Identifier', value: 'y' }, + ) + + expect('5: y').toMatchTokens( + { type: 'Number', value: '5' }, + { type: 'Colon' }, + { type: 'Identifier', value: 'y' }, + ) + + expect(` +do x: + y +end`).toMatchTokens( + { type: 'Newline' }, + { type: 'Keyword', value: 'do' }, + { type: 'Identifier', value: 'x' }, + { type: 'Colon' }, + { type: 'Newline' }, + { type: 'Identifier', value: 'y' }, + { type: 'Newline' }, + { type: 'Keyword', value: 'end' }, + ) + }) + + test('semicolons separate statements', () => { + expect('x; y').toMatchTokens( + { type: 'Identifier', value: 'x' }, + { type: 'Semicolon' }, + { type: 'Identifier', value: 'y' }, + ) + }) + + test('semicolons in parens', () => { + expect('(x; y)').toMatchTokens( + { type: 'OpenParen' }, + { type: 'Identifier', value: 'x' }, + { type: 'Semicolon' }, + { type: 'Identifier', value: 'y' }, + { type: 'CloseParen' }, + ) + }) +}) + +describe('nesting edge cases', () => { + test('deeply nested parens', () => { + expect('((nested))').toMatchTokens( + { type: 'OpenParen' }, + { type: 'OpenParen' }, + { type: 'Identifier', value: 'nested' }, + { type: 'CloseParen' }, + { type: 'CloseParen' }, + ) + }) + + test('mixed nesting', () => { + expect('([combo])').toMatchTokens( + { type: 'OpenParen' }, + { type: 'OpenBracket' }, + { type: 'Identifier', value: 'combo' }, + { type: 'CloseBracket' }, + { type: 'CloseParen' }, + ) + }) +}) + +describe('invalid numbers that should be words', () => { + test('invalid binary', () => { + expect('0b2').toMatchToken('Word', '0b2') + expect('0b123').toMatchToken('Word', '0b123') + }) + + test('invalid octal', () => { + expect('0o8').toMatchToken('Word', '0o8') + expect('0o999').toMatchToken('Word', 
'0o999') + }) + + test('invalid hex', () => { + expect('0xGGG').toMatchToken('Word', '0xGGG') + expect('0xZZZ').toMatchToken('Word', '0xZZZ') + }) + + test('multiple decimal points', () => { + expect('1.2.3').toMatchToken('Word', '1.2.3') + }) +}) + +describe('unicode and emoji', () => { + test('greek letters', () => { + expect('αβγ').toMatchToken('Identifier', 'αβγ') + expect('delta-δ').toMatchToken('Identifier', 'delta-δ') + }) + + test('math symbols', () => { + expect('∑').toMatchToken('Identifier', '∑') + expect('∏').toMatchToken('Identifier', '∏') + }) + + test('CJK characters', () => { + expect('你好').toMatchToken('Identifier', '你好') + expect('こんにちは').toMatchToken('Identifier', 'こんにちは') + }) +}) + +describe('empty and whitespace input', () => { + test('empty string', () => { + expect('').toMatchTokens() + }) + + test('only whitespace', () => { + expect(' ').toMatchTokens() + }) + + test('only tabs', () => { + expect('\t\t\t').toMatchTokens() + }) + + test('only newlines', () => { + expect('\n\n\n').toMatchTokens( + { type: 'Newline' }, + { type: 'Newline' }, + { type: 'Newline' }, + ) + }) +}) + +describe('named args', () => { + test("don't need spaces", () => { + expect(`named=arg`).toMatchTokens( + { type: 'NamedArgPrefix', value: 'named=' }, + { type: 'Identifier', value: 'arg' }, + ) + }) + + test("can have spaces", () => { + expect(`named= arg`).toMatchTokens( + { type: 'NamedArgPrefix', value: 'named=' }, + { type: 'Identifier', value: 'arg' }, + ) + }) + + test("can include numbers", () => { + expect(`named123= arg`).toMatchTokens( + { type: 'NamedArgPrefix', value: 'named123=' }, + { type: 'Identifier', value: 'arg' }, + ) + }) +}) \ No newline at end of file diff --git a/src/parser/tokenizer2.ts b/src/parser/tokenizer2.ts new file mode 100644 index 0000000..74844ae --- /dev/null +++ b/src/parser/tokenizer2.ts @@ -0,0 +1,508 @@ +const DEBUG = process.env.DEBUG || false + +export type Token = { + type: TokenType + value?: string, + from: number, + to: number, +} + +export enum TokenType { + Comment, + + Keyword, + Operator, + + Newline, + Semicolon, + Colon, + Underscore, + + OpenParen, + CloseParen, + OpenBracket, + CloseBracket, + + Identifier, + Word, + NamedArgPrefix, + + Null, + Boolean, + Number, + String, +} + +const valueTokens = new Set([ + TokenType.Comment, + TokenType.Keyword, TokenType.Operator, + TokenType.Identifier, TokenType.Word, TokenType.NamedArgPrefix, + TokenType.Boolean, TokenType.Number, TokenType.String +]) + +const operators = new Set([ + // assignment + '=', + + // logic + 'or', + 'and', + + // bitwise + 'band', + 'bor', + 'bxor', + '>>>', + '>>', + '<<', + + // compound assignment + '??=', + '+=', + '-=', + '*=', + '/=', + '%=', + + // nullish + '??', + + // math + '**', + '*', + '/', + '+', + '-', + '%', + + // comparison + '>=', + '<=', + '!=', + '==', + '>', + '<', +]) + +const keywords = new Set([ + 'import', + 'end', + 'do', + 'if', + 'while', + 'if', + 'else', + 'try', + 'catch', + 'finally', + 'throw', +]) + +// helper +function c(strings: TemplateStringsArray, ...values: any[]) { + return strings.reduce((result, str, i) => result + str + (values[i] ?? 
""), "").charCodeAt(0) +} + +function s(c: number): string { + return String.fromCharCode(c) +} + +export class Scanner { + input = '' + pos = 0 + start = 0 + char = 0 + prev = 0 + inParen = 0 + inBracket = 0 + tokens: Token[] = [] + + reset() { + this.input = '' + this.pos = 0 + this.start = 0 + this.char = 0 + this.prev = 0 + this.tokens.length = 0 + } + + peek(count = 0): number { + return getFullCodePoint(this.input, this.pos + count) + } + + next(): number { + this.prev = this.char + this.char = this.peek() + this.pos += getCharSize(this.char) + return this.char + } + + push(type: TokenType, from?: number, to?: number) { + from ??= this.start + to ??= this.pos - getCharSize(this.char) + if (to < from) to = from + + this.tokens.push(Object.assign({}, { + type, + from, + to, + }, valueTokens.has(type) ? { value: this.input.slice(from, to) } : {})) + + if (DEBUG) { + const tok = this.tokens.at(-1) + console.log(`≫ PUSH(${from},${to})`, TokenType[tok?.type || 0], '—', tok?.value) + } + + this.start = this.pos + } + + // turn shrimp code into shrimp tokens that get fed into the parser + tokenize(input: string): Token[] { + this.reset() + this.input = input + this.next() + + while (this.char > 0) { + const char = this.char + if (char === c`#`) { + this.readComment() + continue + } + + if (isBracket(char)) { + this.readBracket() + continue + } + + if (isStringDelim(char)) { + this.readString(char) + continue + } + + if (char === c`{`) { + this.readCurlyString() + continue + } + + if (isIdentStart(char)) { + this.readIdentOrKeyword() + continue + } + + if (isDigit(char) || ((char === c`-` || char === c`+`) && isDigit(this.peek()))) { + this.readNumber() + continue + } + + if (char === c`:`) { + this.push(TokenType.Colon, this.start - 1, this.pos) // TODO: why? 
+ this.next() + continue + } + + if (isWordChar(char)) { + this.readWord() + continue + } + + if (char === c`\n`) { + if (this.inParen === 0 && this.inBracket === 0) + this.push(TokenType.Newline) + this.next() + continue + } + + if (char === c`;`) { + this.push(TokenType.Semicolon) + this.next() + continue + } + + this.next() + } + + return this.tokens + } + + readComment() { + while (this.char !== c`\n` && this.char > 0) this.next() + this.push(TokenType.Comment) + } + + readBracket() { + switch (this.char) { + case c`(`: + this.inParen++ + this.push(TokenType.OpenParen); break + case c`)`: + this.inParen-- + this.push(TokenType.CloseParen); break + case c`[`: + this.inBracket++ + this.push(TokenType.OpenBracket); break + case c`]`: + this.inBracket-- + this.push(TokenType.CloseBracket); break + } + this.next() + } + + readString(delim: number) { + this.start = this.pos - 1 + this.next() // skip opening delim + while (this.char > 0 && (this.char !== delim || (this.char === delim && this.prev === c`\\`))) + this.next() + this.next() // skip closing delim + + this.push(TokenType.String) + } + + readCurlyString() { + let depth = 1 + this.next() + + while (depth > 0 && this.char > 0) { + if (this.char === c`{`) depth++ + if (this.char === c`}`) depth-- + this.next() + } + + this.push(TokenType.String) + } + + readIdentOrKeyword() { + this.start = this.pos - getCharSize(this.char) + + while (isWordChar(this.char)) { + // stop at colon if followed by whitespace (e.g., 'do x: echo x end') + if (this.char === c`:`) { + const nextCh = this.peek() + if (isWhitespace(nextCh) || nextCh === 0) break + } + + // stop at equal sign (named arg) + if (this.char === c`=`) { + this.next() + break + } + + this.next() + } + + const ident = this.input.slice(this.start, this.pos - getCharSize(this.char)) + + if (ident === 'null') + this.push(TokenType.Null) + + else if (ident === 'true' || ident === 'false') + this.push(TokenType.Boolean) + + else if (isKeyword(ident)) + this.push(TokenType.Keyword) + + else if (isOperator(ident)) + this.push(TokenType.Operator) // only things like `and` and `or` + + else if (isIdentifer(ident)) + this.push(TokenType.Identifier) + + else if (ident.endsWith('=')) + this.push(TokenType.NamedArgPrefix) + + else + this.push(TokenType.Word) + } + + readNumber() { + this.start = this.pos - 1 + while (isWordChar(this.char)) { + // stop at colon + if (this.char === c`:`) { + const nextCh = this.peek() + if (isWhitespace(nextCh) || nextCh === 0) break + } + this.next() + } + const ident = this.input.slice(this.start, this.pos - 1) + this.push(isNumber(ident) ? 
TokenType.Number : TokenType.Word) + } + + readWord() { + this.start = this.pos - getCharSize(this.char) + + while (isWordChar(this.char)) this.next() + + const word = this.input.slice(this.start, this.pos - getCharSize(this.char)) + + if (word === '_') + this.push(TokenType.Underscore) + + else if (operators.has(word)) + this.push(TokenType.Operator) + + else + this.push(TokenType.Word) + } +} + +const isNumber = (word: string): boolean => { + // regular number + if (/^[+-]?\d+(_?\d+)*(\.(\d+(_?\d+)*))?$/.test(word)) + return true + + // binary + if (/^[+-]?0b[01]+(_?[01]+)*(\.[01](_?[01]*))?$/.test(word)) + return true + + // octal + if (/^[+-]?0o[0-7]+(_?[0-7]+)*(\.[0-7](_?[0-7]*))?$/.test(word)) + return true + + // hex + if (/^[+-]?0x[0-9a-f]+([0-9a-f]_?[0-9a-f]+)*(\.([0-9a-f]_?[0-9a-f]*))?$/i.test(word)) + return true + + return false +} + +const isIdentifer = (s: string): boolean => { + if (s.length === 0) return false + + let pos = 0 + const chars = [] + while (pos < s.length) { + const out = getFullCodePoint(s, pos) + pos += getCharSize(out) + chars.push(out) + } + + if (chars.length === 1) + return isIdentStart(chars[0]!) + else if (chars.length === 2) + return isIdentStart(chars[0]!) && isIdentEnd(chars[1]!) + else + return isIdentStart(chars[0]!) && + chars.slice(1, chars.length - 1).every(isIdentChar) && + isIdentEnd(chars.at(-1)!) +} + +const isStringDelim = (ch: number): boolean => { + return ch === c`'` || ch === c`"` +} + +const isIdentStart = (char: number | string): boolean => { + let ch = typeof char === 'string' ? char.charCodeAt(0) : char + return isLowercaseLetter(ch) || isEmojiOrUnicode(ch) || ch === 36 /* $ */ +} + +const isIdentChar = (char: number | string): boolean => { + let ch = typeof char === 'string' ? char.charCodeAt(0) : char + return isIdentStart(ch) || isDigit(ch) || ch === 45 /* - */ || ch === 63 /* ? */ +} + +const isIdentEnd = (char: number | string): boolean => { + return isIdentChar(char) +} + +const isLowercaseLetter = (ch: number): boolean => { + return ch >= 97 && ch <= 122 // a-z +} + +const isDigit = (ch: number): boolean => { + return ch >= 48 && ch <= 57 // 0-9 +} + +const isWhitespace = (ch: number): boolean => { + return ch === 32 /* space */ || ch === 9 /* tab */ || + ch === 13 /* \r */ || ch === 10 /* \n */ || + ch === -1 || ch === 0 /* EOF */ +} + +const isWordChar = (ch: number): boolean => { + return ( + !isWhitespace(ch) && + ch !== 10 /* \n */ && + ch !== 59 /* ; */ && + ch !== 41 /* ) */ && + ch !== 93 /* ] */ && + ch !== -1 /* EOF */ + ) +} + +const isOperator = (word: string): boolean => { + return operators.has(word) +} + +const isKeyword = (word: string): boolean => { + return keywords.has(word) +} + +const isBracket = (char: number): boolean => { + return char === c`(` || char === c`)` || char === c`[` || char === c`]` +} + +const getCharSize = (ch: number) => + (ch > 0xffff ? 
2 : 1) // emoji takes 2 UTF-16 code units + +const getFullCodePoint = (input: string, pos: number): number => { + const ch = input[pos]?.charCodeAt(0) || 0 + + // Check if this is a high surrogate (0xD800-0xDBFF) + if (ch >= 0xd800 && ch <= 0xdbff) { + const low = input[pos + 1]?.charCodeAt(0) || 0 + // Check if next is low surrogate (0xDC00-0xDFFF) + if (low >= 0xdc00 && low <= 0xdfff) { + // Combine surrogate pair into full code point + return 0x10000 + ((ch & 0x3ff) << 10) + (low & 0x3ff) + } + } + + return ch +} + +const isEmojiOrUnicode = (ch: number): boolean => { + return ( + // Basic Emoticons + (ch >= 0x1f600 && ch <= 0x1f64f) || + // Miscellaneous Symbols and Pictographs + (ch >= 0x1f300 && ch <= 0x1f5ff) || + // Transport and Map Symbols + (ch >= 0x1f680 && ch <= 0x1f6ff) || + // Regional Indicator Symbols (flags) + (ch >= 0x1f1e6 && ch <= 0x1f1ff) || + // Miscellaneous Symbols (hearts, stars, weather) + (ch >= 0x2600 && ch <= 0x26ff) || + // Dingbats (scissors, pencils, etc) + (ch >= 0x2700 && ch <= 0x27bf) || + // Supplemental Symbols and Pictographs (newer emojis) + (ch >= 0x1f900 && ch <= 0x1f9ff) || + // Symbols and Pictographs Extended-A (newest emojis) + (ch >= 0x1fa70 && ch <= 0x1faff) || + // Various Asian Characters with emoji presentation + (ch >= 0x1f018 && ch <= 0x1f270) || + // Variation Selectors (for emoji presentation) + (ch >= 0xfe00 && ch <= 0xfe0f) || + // Additional miscellaneous items + (ch >= 0x238c && ch <= 0x2454) || + // Combining Diacritical Marks for Symbols + (ch >= 0x20d0 && ch <= 0x20ff) || + // Latin-1 Supplement (includes ², ³, ¹ and other special chars) + (ch >= 0x00a0 && ch <= 0x00ff) || + // Greek and Coptic (U+0370-U+03FF) + (ch >= 0x0370 && ch <= 0x03ff) || + // Mathematical Alphanumeric Symbols (U+1D400-U+1D7FF) + (ch >= 0x1d400 && ch <= 0x1d7ff) || + // Mathematical Operators (U+2200-U+22FF) + (ch >= 0x2200 && ch <= 0x22ff) || + // Superscripts and Subscripts (U+2070-U+209F) + (ch >= 0x2070 && ch <= 0x209f) || + // Arrows (U+2190-U+21FF) + (ch >= 0x2190 && ch <= 0x21ff) || + // Hiragana (U+3040-U+309F) + (ch >= 0x3040 && ch <= 0x309f) || + // Katakana (U+30A0-U+30FF) + (ch >= 0x30a0 && ch <= 0x30ff) || + // CJK Unified Ideographs (U+4E00-U+9FFF) + (ch >= 0x4e00 && ch <= 0x9fff) + ) +} diff --git a/src/testSetup.ts b/src/testSetup.ts index c476ba2..814f91f 100644 --- a/src/testSetup.ts +++ b/src/testSetup.ts @@ -1,4 +1,7 @@ import { expect } from 'bun:test' +import { diffLines } from 'diff' +import color from 'kleur' +import { Scanner, TokenType, type Token } from '#parser/tokenizer2' import { parser } from '#parser/shrimp' import { setGlobals } from '#parser/tokenizer' import { globals as prelude } from '#prelude' @@ -37,6 +40,9 @@ declare module 'bun:test' { toFailParse(): T toEvaluateTo(expected: unknown, globals?: Record): Promise toFailEvaluation(): Promise + toBeToken(expected: string): T + toMatchToken(typeOrValue: string, value?: string): T + toMatchTokens(...tokens: { type: string, value?: string }[]): T } } @@ -144,8 +150,107 @@ expect.extend({ } } }, + toBeToken(received: unknown, expected: string) { + assert(typeof received === 'string', 'toBeToken can only be used with string values') + + try { + const tokens = tokenize(received) + const value = tokens[0] as Token + const target = TokenType[expected as keyof typeof TokenType] + + if (!value) { + return { + message: () => `Expected token type to be ${expected}, but got ${value}`, + pass: false, + } + } + + return { + message: () => `Expected token type to be ${expected}, but got 
${TokenType[value.type]}`, + pass: value.type === target + } + } catch (error) { + return { + message: () => `Tokenization failed: ${errorMessage(error)}`, + pass: false, + } + } + }, + toMatchToken(received: unknown, typeOrValue: string, value?: string) { + assert(typeof received === 'string', 'toMatchToken can only be used with string values') + const expectedValue = value ? value : typeOrValue + const expectedType = value ? typeOrValue : undefined + + try { + const tokens = tokenize(received) + const token = tokens[0] as Token + + if (!token) { + return { + message: () => `Expected token to be ${expectedValue.replaceAll('\n', '\\n')}, got ${token}`, + pass: false, + } + } + + if (expectedType && TokenType[expectedType as keyof typeof TokenType] !== token.type) { + return { + message: () => `Expected token to be ${expectedType}, but got ${TokenType[token.type]}`, + pass: false + } + } + + return { + message: () => `Expected token to be ${expectedValue.replaceAll('\n', '\\n')}, but got ${token.value}`, + pass: token.value === expectedValue + } + } catch (error) { + return { + message: () => `Tokenization failed: ${errorMessage(error)} `, + pass: false, + } + } + }, + toMatchTokens(received: unknown, ...tokens: { type: string, value?: string }[]) { + assert(typeof received === 'string', 'toMatchTokens can only be used with string values') + + try { + const result = tokenize(received).map(t => toHumanToken(t)) + + if (result.length === 0 && tokens.length > 0) { + return { + message: () => `Expected tokens ${JSON.stringify(tokens)}, got nothing`, + pass: false, + } + } + + const expected = JSON.stringify(tokens, null, 2) + const actual = JSON.stringify(result, null, 2) + + return { + message: () => `Tokens don't match: \n\n${diff(actual, expected)}`, + pass: expected == actual + } + } catch (error) { + return { + message: () => `Tokenization failed: ${errorMessage(error)} `, + pass: false, + } + } + } }) +const tokenize = (code: string): Token[] => { + const scanner = new Scanner + return scanner.tokenize(code) +} + +const toHumanToken = (tok: Token): { type: string, value: string } => { + return { + type: TokenType[tok.type], + value: tok.value + } +} + const trimWhitespace = (str: string): string => { const lines = str.split('\n').filter((line) => line.trim().length > 0) const firstLine = lines[0] @@ -157,10 +262,33 @@ const trimWhitespace = (str: string): string => { if (!line.startsWith(leadingWhitespace)) { let foundWhitespace = line.match(/^(\s*)/)?.[1] || '' throw new Error( - `Line has inconsistent leading whitespace: "${line}" (found "${foundWhitespace}", expected "${leadingWhitespace}")` + `Line has inconsistent leading whitespace: "${line}"(found "${foundWhitespace}", expected "${leadingWhitespace}")` ) } return line.slice(leadingWhitespace.length) }) .join('\n') } + +const diff = (a: string, b: string): string => { + const expected = a.trim() + const actual = b.trim() + const lines = [] + + if (expected !== actual) { + const changes = diffLines(actual, expected) + for (const part of changes) { + const sign = part.added ? "+" : part.removed ? "-" : " " + let line = sign + part.value + if (part.added) { + line = color.green(line) + } else if (part.removed) { + line = color.red(line) + } + + lines.push(line.endsWith("\n") || line.endsWith("\n\u001b[39m") ? 
line : line + "\n") + } + } + + return lines.join('\n') +} \ No newline at end of file -- 2.50.1 From abd78108c875e432abe16dea73bf8e5e12668c92 Mon Sep 17 00:00:00 2001 From: Chris Wanstrath Date: Wed, 12 Nov 2025 21:46:46 -0800 Subject: [PATCH 02/35] new parser(-ish) --- src/parser/node.ts | 221 ++++++ src/parser/parser2.ts | 899 +++++++++++++++++++++++++ src/parser/stringParser.ts | 258 +++++++ src/parser/tests/basics.test.ts | 124 ---- src/parser/tests/control-flow.test.ts | 25 +- src/parser/tests/destructuring.test.ts | 58 ++ src/parser/tests/dot-get.test.ts | 40 +- src/parser/tests/functions.test.ts | 2 +- src/parser/tests/import.test.ts | 34 + src/parser/tests/literals.test.ts | 5 +- src/parser/tests/tokens.test.ts | 142 +++- src/parser/tokenizer2.ts | 131 +++- src/testSetup.ts | 9 +- src/utils/tree.ts | 33 + 14 files changed, 1802 insertions(+), 179 deletions(-) create mode 100644 src/parser/node.ts create mode 100644 src/parser/parser2.ts create mode 100644 src/parser/stringParser.ts create mode 100644 src/parser/tests/destructuring.test.ts create mode 100644 src/parser/tests/import.test.ts diff --git a/src/parser/node.ts b/src/parser/node.ts new file mode 100644 index 0000000..31942da --- /dev/null +++ b/src/parser/node.ts @@ -0,0 +1,221 @@ +import { type Token, TokenType } from "./tokenizer2" + +export type NodeType = + | 'Program' + | 'Block' + + | 'FunctionCall' + | 'FunctionCallOrIdentifier' + | 'FunctionCallWithBlock' + | 'PositionalArg' + | 'NamedArg' + + | 'FunctionDef' + | 'Params' + | 'NamedParam' + + | 'Null' + | 'Boolean' + | 'Number' + | 'String' + | 'StringFragment' + | 'CurlyString' + | 'DoubleQuote' + | 'EscapeSeq' + | 'Interpolation' + | 'Regex' + | 'Identifier' + | 'AssignableIdentifier' + | 'IdentifierBeforeDot' + | 'Word' + | 'Array' + | 'Dict' + | 'Comment' + + | 'BinOp' + | 'ConditionalOp' + | 'ParenExpr' + | 'Assign' + | 'CompoundAssign' + | 'DotGet' + | 'PipeExpr' + + | 'IfExpr' + | 'ElseIfExpr' + | 'ElseExpr' + | 'WhileExpr' + | 'TryExpr' + | 'CatchExpr' + | 'FinallyExpr' + | 'Throw' + + | 'Eq' + | 'Modulo' + | 'Plus' + | 'Star' + | 'Slash' + + | 'Import' + | 'Do' + | 'colon' + | 'keyword' + | 'operator' + +// TODO: remove this when we switch from lezer +export const operators: Record = { + // Logic + 'and': 'And', + 'or': 'Or', + + // Bitwise + 'band': 'Band', + 'bor': 'Bor', + 'bxor': 'Bxor', + '>>>': 'Ushr', + '>>': 'Shr', + '<<': 'Shl', + + // Comparison + '>=': 'Gte', + '<=': 'Lte', + '>': 'Gt', + '<': 'Lt', + '!=': 'Neq', + '==': 'EqEq', + + // Compound assignment operators + '??=': 'NullishEq', + '+=': 'PlusEq', + '-=': 'MinusEq', + '*=': 'StarEq', + '/=': 'SlashEq', + '%=': 'ModuloEq', + + // Nullish coalescing + '??': 'NullishCoalesce', + + // Math + '*': 'Star', + '**': 'StarStar', + '=': 'Eq', + '/': 'Slash', + '+': 'Plus', + '-': 'Minus', + '%': 'Modulo', + + // Dotget + '.': 'Dot', + + // Pipe + '|': 'operator', +} + +export class SyntaxNode { + type: NodeType + from: number + to: number + parent: SyntaxNode | null + children: SyntaxNode[] = [] + + constructor(type: NodeType, from: number, to: number, parent: SyntaxNode | null = null) { + this.type = type + this.from = from + this.to = to + this.parent = parent + } + + static from(token: Token, parent?: SyntaxNode): SyntaxNode { + return new SyntaxNode(TokenType[token.type] as NodeType, token.from, token.to, parent ?? null) + } + + get name(): string { + return this.type + } + + get isError(): boolean { + return false + } + + get firstChild(): SyntaxNode | null { + return this.children[0] ?? 
null + } + + get lastChild(): SyntaxNode | null { + return this.children.at(-1) ?? null + } + + get nextSibling(): SyntaxNode | null { + if (!this.parent) return null + const siblings = this.parent.children + const index = siblings.indexOf(this) + return index >= 0 && index < siblings.length - 1 ? siblings[index + 1]! : null + } + + get prevSibling(): SyntaxNode | null { + if (!this.parent) return null + const siblings = this.parent.children + const index = siblings.indexOf(this) + return index > 0 ? siblings[index - 1]! : null + } + + add(node: SyntaxNode) { + node.parent = this + this.children.push(node) + } + + push(...nodes: SyntaxNode[]): SyntaxNode { + nodes.forEach(child => child.parent = this) + this.children.push(...nodes) + return this + } + + toString(): string { + return this.type + } +} + +// Operator precedence (binding power) - higher = tighter binding +export const precedence: Record = { + // Logical + 'or': 10, + 'and': 20, + + // Comparison + '==': 30, + '!=': 30, + '<': 30, + '>': 30, + '<=': 30, + '>=': 30, + + // Nullish coalescing + '??': 35, + + // Addition/Subtraction + '+': 40, + '-': 40, + + // Multiplication/Division/Modulo + '*': 50, + '/': 50, + '%': 50, + + // Bitwise + 'band': 45, + 'bor': 45, + 'bxor': 45, + '<<': 45, + '>>': 45, + '>>>': 45, + + // Exponentiation (right-associative) + '**': 60, +} + +export const conditionals = new Set([ + '==', '!=', '<', '>', '<=', '>=', '??', 'and', 'or' +]) + +export const compounds = [ + '??=', '+=', '-=', '*=', '/=', '%=' +] diff --git a/src/parser/parser2.ts b/src/parser/parser2.ts new file mode 100644 index 0000000..9a9cb82 --- /dev/null +++ b/src/parser/parser2.ts @@ -0,0 +1,899 @@ +import { Scanner, type Token, TokenType } from './tokenizer2' +import { SyntaxNode, operators, precedence, conditionals, compounds } from './node' +import { globals } from './tokenizer' +import { parseString } from './stringParser' + +const $T = TokenType + +export const parse = (input: string): SyntaxNode => { + const parser = new Parser() + return parser.parse(input) +} + +class Scope { + parent?: Scope + set = new Set() + + constructor(parent?: Scope) { + this.parent = parent + + // no parent means this is global scope + if (!parent) for (const name of globals) this.add(name) + } + + add(key: string) { + this.set.add(key) + } + + has(key: string): boolean { + return this.set.has(key) || this.parent?.has(key) || false + } +} + +export class Parser { + tokens: Token[] = [] + pos = 0 + inParens = 0 + input = '' + scope = new Scope + inTestExpr = false + + parse(input: string): SyntaxNode { + const scanner = new Scanner() + this.tokens = scanner.tokenize(input) + this.pos = 0 + this.input = input + this.scope = new Scope() + this.inTestExpr = false + + const node = new SyntaxNode('Program', 0, input.length) + + while (!this.isEOF()) { + if (this.is($T.Newline) || this.is($T.Semicolon)) { + this.next() + continue + } + + const prevPos = this.pos + const stmt = this.statement() + if (stmt) node.add(stmt) + + if (this.pos === prevPos && !this.isEOF()) + throw "parser didn't advance - you need to call next()\n\n ${this.input}\n" + } + + return node + } + + // + // parse foundation nodes - statements, expressions + // + + // statement is a line of code + statement(): SyntaxNode | null { + if (this.is($T.Comment)) + return this.comment() + + while (this.is($T.Newline) || this.is($T.Semicolon)) + this.next() + + if (this.isEOF() || this.isExprEndKeyword()) + return null + + return this.expression() + } + + // expressions can be found in four 
places: + // 1. line of code + // 2. right side of assignment + // 3. if/while conditions + // 4. inside (parens) + expression(allowPipe = true): SyntaxNode { + let expr + + // x = value + if (this.is($T.Identifier) && ( + this.nextIs($T.Operator, '=') || compounds.some(x => this.nextIs($T.Operator, x)) + )) + expr = this.assign() + + // if, while, do, etc + else if (this.is($T.Keyword)) + expr = this.keywords() + + // dotget + else if (this.nextIs($T.Operator, '.')) + expr = this.dotGetFunctionCall() + + // echo hello world + else if (this.is($T.Identifier) && !this.nextIs($T.Operator) && !this.nextIsExprEnd()) + expr = this.functionCall() + + // bare-function-call + else if (this.is($T.Identifier) && this.nextIsExprEnd()) + expr = this.functionCallOrIdentifier() + + // everything else + else + expr = this.exprWithPrecedence() + + // check for destructuring + if (expr.type === 'Array' && this.is($T.Operator, '=')) + return this.destructure(expr) + + // check for parens function call + // ex: (ref my-func) my-arg + if (expr.type === 'ParenExpr' && !this.isExprEnd()) + expr = this.functionCall(expr) + + // one | echo + if (allowPipe && this.isPipe()) + return this.pipe(expr) + + // regular + else + return expr + } + + // piping | stuff | is | cool + pipe(left: SyntaxNode): SyntaxNode { + const canLookPastNewlines = this.inParens === 0 + const parts: SyntaxNode[] = [left] + + while (this.isPipe()) { + // consume newlines before pipe (only if not in parens) + if (canLookPastNewlines) { + while (this.is($T.Newline)) this.next() + } + + const pipeOp = this.op('|') + pipeOp.type = 'operator' + parts.push(pipeOp) + + // consume newlines after pipe (only if not in parens) + if (canLookPastNewlines) { + while (this.is($T.Newline)) this.next() + } + + // parse right side - don't allow nested pipes + parts.push(this.expression(false)) + } + + const node = new SyntaxNode('PipeExpr', parts[0]!.from, parts.at(-1)!.to) + return node.push(...parts) + } + + // Pratt parser - parses expressions with precedence climbing + // bp = binding precedence + exprWithPrecedence(minBp = 0): SyntaxNode { + let left = this.value() + + // infix operators with precedence + while (this.is($T.Operator)) { + const op = this.current().value! + const bp = precedence[op] + + // operator has lower precedence than required, stop + if (bp === undefined || bp < minBp) break + + const opNode = this.op() + + // right-associative operators (like **) use same bp, others use bp + 1 + const nextMinBp = op === '**' ? bp : bp + 1 + + // parse right-hand side with higher precedence + const right = this.exprWithPrecedence(nextMinBp) + + const nodeType = conditionals.has(op) ? 'ConditionalOp' : 'BinOp' + const node = new SyntaxNode(nodeType, left.from, right.to) + + node.push(left, opNode, right) + left = node + } + + return left + } + + // if, while, do, etc + keywords(): SyntaxNode { + if (this.is($T.Keyword, 'if')) + return this.if() + + if (this.is($T.Keyword, 'while')) + return this.while() + + if (this.is($T.Keyword, 'do')) + return this.do() + + if (this.is($T.Keyword, 'try')) + return this.try() + + if (this.is($T.Keyword, 'throw')) + return this.throw() + + if (this.is($T.Keyword, 'import')) + return this.import() + + return this.expect($T.Keyword, 'if/while/do/import') as never + } + + // value can be an atom or a (parens that gets turned into an atom) + // values are used in a few places: + // 1. function arguments + // 2. array/dict members + // 3. binary operations + // 4. 
anywhere an expression can be used + value(): SyntaxNode { + if (this.is($T.OpenParen)) + return this.parens() + + if (this.is($T.OpenBracket)) + return this.arrayOrDict() + + // dotget + if (this.nextIs($T.Operator, '.')) + return this.dotGet() + + return this.atom() + } + + // + // parse specific nodes + // + + // [ 1 2 3 ] + array(): SyntaxNode { + const open = this.expect($T.OpenBracket) + + const values = [] + while (!this.is($T.CloseBracket) && !this.isEOF()) { + if (this.is($T.Semicolon) || this.is($T.Newline)) { + this.next() + continue + } + + if (this.is($T.Comment)) { + values.push(this.comment()) + continue + } + + values.push(this.value()) + } + + const close = this.expect($T.CloseBracket) + + const node = new SyntaxNode('Array', open.from, close.to) + return node.push(...values) + } + + // which are we dealing with? ignores leading newlines and comments + arrayOrDict(): SyntaxNode { + let peek = 1 + let curr = this.peek(peek++) + let isDict = false + + while (curr && curr.type !== $T.CloseBracket) { + // definitely a dict + if (curr.type === $T.NamedArgPrefix) { + isDict = true + break + } + + // empty dict + if (curr.type === $T.Operator && curr.value === '=') { + isDict = true + break + } + + // probably an array + if (curr.type !== $T.Comment && curr.type !== $T.Semicolon && curr.type !== $T.Newline) + break + + curr = this.peek(peek++) + } + + return isDict ? this.dict() : this.array() + } + + // x = true + assign(): SyntaxNode { + const ident = this.assignableIdentifier() + const opToken = this.current()! + const op = this.op() + const expr = this.expression() + + const node = new SyntaxNode( + opToken.value === '=' ? 'Assign' : 'CompoundAssign', + ident.from, + expr.to + ) + + return node.push(ident, op, expr) + } + + // identifier used in assignment (TODO: legacy lezer quirk) + assignableIdentifier(): SyntaxNode { + const token = this.expect($T.Identifier) + this.scope.add(token.value!) 
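+    // the name is now in scope, so later references (including dot-gets like
+    // `x.y`) are parsed as identifiers rather than bare words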
+ const node = SyntaxNode.from(token) + node.type = 'AssignableIdentifier' + return node + } + + // atoms are the basic building blocks: literals, identifiers, words + atom() { + if (this.is($T.String)) + return this.string() + + if (this.isAny($T.Null, $T.Boolean, $T.Number, $T.Identifier, $T.Word, $T.Regex)) + return SyntaxNode.from(this.next()) + + const next = this.next() + throw `[atom] unexpected token ${TokenType[next.type]}: ${JSON.stringify(next)}\n\n ${this.input}\n` + } + + // blocks in if, do, special calls, etc + // `: something end` + // + // `blockNode` determines whether we return [colon, BlockNode, end] or + // just a list of statements like [colon, stmt1, stmt2, end] + block(blockNode = true): SyntaxNode[] { + const stmts: SyntaxNode[] = [] + const colon = this.colon() + + while (!this.isExprEndKeyword() && !this.isEOF()) { + const stmt = this.statement() + if (stmt) stmts.push(stmt) + } + + const out = [colon] + + if (blockNode) { + const block = new SyntaxNode('Block', stmts[0]!.from, stmts.at(-1)!.to) + block.push(...stmts) + out.push(block) + } else { + out.push(...stmts) + } + + return out + } + + // catch err: block + catch(): SyntaxNode { + const keyword = this.keyword('catch') + + let catchVar + if (this.is($T.Identifier)) + catchVar = this.identifier() + + const block = this.block() + + const node = new SyntaxNode('CatchExpr', keyword.from, block.at(-1)!.to) + + node.push(keyword) + if (catchVar) node.push(catchVar) + return node.push(...block) + } + + // colon + colon(): SyntaxNode { + const colon = SyntaxNode.from(this.expect($T.Colon)) + colon.type = 'colon' // TODO lezer legacy + return colon + } + + // # comment + comment(): SyntaxNode { + return SyntaxNode.from(this.expect($T.Comment)) + } + + // [ a b c ] = [ 1 2 3 ] + destructure(array: SyntaxNode): SyntaxNode { + const eq = this.op('=') + const val = this.expression() + + for (const ident of array.children) { + const varName = this.input.slice(ident.from, ident.to) + this.scope.add(varName) + } + + const node = new SyntaxNode('Assign', array.from, val.to) + return node.push(array, eq, val) + } + + // [ a=1 b=true c='three' ] + dict(): SyntaxNode { + const open = this.expect($T.OpenBracket) + + // empty dict [=] or [ = ] + if (this.is($T.Operator, '=') && this.nextIs($T.CloseBracket)) { + const _op = this.next() + const close = this.next() + return new SyntaxNode('Dict', open.from, close.to) + } + + const values = [] + while (!this.is($T.CloseBracket) && !this.isEOF()) { + if (this.is($T.Semicolon) || this.is($T.Newline)) { + this.next() + continue + } + + if (this.is($T.Comment)) { + values.push(this.comment()) + continue + } + + if (this.is($T.NamedArgPrefix)) + values.push(this.namedArg()) + else + values.push(this.value()) + } + + const close = this.expect($T.CloseBracket) + + const node = new SyntaxNode('Dict', open.from, close.to) + return node.push(...values) + } + + // FunctionDef `do x y: something end` + do(): SyntaxNode { + const doNode = this.keyword('do') + doNode.type = 'Do' + this.scope = new Scope(this.scope) + + const params = [] + while (!this.is($T.Colon) && !this.isExprEnd()) { + let varName = this.current().value! 
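+      // named params are tokenized as `name=` (NamedArgPrefix); strip the
+      // trailing '=' so only the bare name is added to the function's scope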
+ if (varName.endsWith('=')) varName = varName.slice(0, varName.length - 1) + this.scope.add(varName) + + let arg + if (this.is($T.Identifier)) + arg = this.identifier() + else if (this.is($T.NamedArgPrefix)) + arg = this.namedParam() + else + throw `[do] expected Identifier or NamedArgPrefix, got ${JSON.stringify(this.current())}\n\n ${this.input}\n` + + params.push(arg) + } + + const block = this.block(false) + let catchNode, finalNode + + if (this.is($T.Keyword, 'catch')) + catchNode = this.catch() + + if (this.is($T.Keyword, 'finally')) + finalNode = this.finally() + + let end = this.keyword('end') + + let last = block.at(-1) + if (finalNode) last = finalNode.children.at(-1)! + else if (catchNode) last = catchNode.children.at(-1)! + + const node = new SyntaxNode('FunctionDef', doNode.from, last!.to) + + node.add(doNode) + + const paramsNode = new SyntaxNode( + 'Params', + params[0]?.from ?? 0, + params.at(-1)?.to ?? 0 + ) + + if (params.length) paramsNode.push(...params) + node.add(paramsNode) + + this.scope = this.scope.parent! + + node.push(...block) + + if (catchNode) node.push(catchNode) + if (finalNode) node.push(finalNode) + + return node.push(end) + } + + // config.path + dotGet(): SyntaxNode { + const left = this.identifier() + const ident = this.input.slice(left.from, left.to) + + // not in scope, just return Word + if (!this.scope.has(ident)) + return this.word(left) + + if (left.type === 'Identifier') left.type = 'IdentifierBeforeDot' + + let parts = [] + while (this.is($T.Operator, '.')) { + this.next() + parts.push(this.is($T.OpenParen) ? this.parens() : this.atom()) + } + + // TODO lezer legacy - we can do a flat DotGet if we remove this + const nodes = parts.length > 1 ? collapseDotGets(parts) : undefined + + const node = new SyntaxNode('DotGet', left.from, parts.at(-1)!.to) + return nodes ? node.push(left, nodes!) : node.push(left, ...parts) + } + + // dotget in a statement/expression (something.blah) or (something.blah arg1) + dotGetFunctionCall(): SyntaxNode { + const dotGet = this.dotGet() + + // dotget not in scope, regular Word + if (dotGet.type === 'Word') return dotGet + + if (this.isExprEnd()) + return this.functionCallOrIdentifier(dotGet) + else + return this.functionCall(dotGet) + } + + // can be used in functions or try block + finally(): SyntaxNode { + const keyword = this.keyword('finally') + const block = this.block() + const node = new SyntaxNode('FinallyExpr', keyword.from, block.at(-1)!.to) + + return node.push(keyword, ...block) + } + + // you're lookin at it + functionCall(fn?: SyntaxNode): SyntaxNode { + const ident = fn ?? this.identifier() + + const args: SyntaxNode[] = [] + while (!this.isExprEnd() && !this.is($T.Operator, '|')) { + if (this.is($T.NamedArgPrefix)) { + args.push(this.namedArg()) + } else { + // 'do' is the only keyword allowed as a function argument + const val = this.is($T.Keyword, 'do') ? 
this.do() : this.value() + const arg = new SyntaxNode('PositionalArg', val.from, val.to) + arg.add(val) + args.push(arg) + } + } + + const node = new SyntaxNode('FunctionCall', ident.from, (args.at(-1) || ident).to) + node.push(ident, ...args) + + if (!this.inTestExpr && this.is($T.Colon)) { + const block = this.block() + const end = this.keyword('end') + const blockNode = new SyntaxNode('FunctionCallWithBlock', node.from, end.to) + return blockNode.push(node, ...block, end) + } + + return node + } + + // bare identifier in an expression + functionCallOrIdentifier(inner?: SyntaxNode) { + if (!inner && this.nextIs($T.Operator, '.')) { + inner = this.dotGet() + + // if the dotGet was just a Word, bail + if (inner.type === 'Word') return inner + } + + inner ??= this.identifier() + + const wrapper = new SyntaxNode('FunctionCallOrIdentifier', inner.from, inner.to) + wrapper.push(inner) + + if (!this.inTestExpr && this.is($T.Colon)) { + const block = this.block() + const end = this.keyword('end') + const node = new SyntaxNode('FunctionCallWithBlock', wrapper.from, end.to) + return node.push(wrapper, ...block, end) + } + + return wrapper + } + + // function and variable names + identifier(): SyntaxNode { + return SyntaxNode.from(this.expect($T.Identifier)) + } + + // if something: blah end + // if something: blah else: blah end + // if something: blah else if something: blah else: blah end + if(): SyntaxNode { + const ifNode = this.keyword('if') + const test = this.testExpr() + const ifBlock = this.block() + + const node = new SyntaxNode('IfExpr', ifNode.from, ifBlock.at(-1)!.to) + node.push(ifNode, test) + node.push(...ifBlock) + + while (this.is($T.Keyword, 'else') && this.nextIs($T.Keyword, 'if')) { + const elseWord = this.keyword('else') + const ifWord = this.keyword('if') + const elseIfTest = this.testExpr() + const elseIfBlock = this.block() + const elseIfNode = new SyntaxNode('ElseIfExpr', ifBlock.at(-1)!.from, elseIfBlock.at(-1)!.to) + elseIfNode.push(elseWord, ifWord, elseIfTest) + elseIfNode.push(...elseIfBlock) + node.push(elseIfNode) + } + + if (this.is($T.Keyword, 'else') && this.nextIs($T.Colon)) { + const elseWord = this.keyword('else') + const elseBlock = this.block() + const elseNode = new SyntaxNode('ElseExpr', ifBlock.at(-1)!.from, elseBlock.at(-1)!.to) + elseNode.push(elseWord) + elseNode.push(...elseBlock) + node.push(elseNode) + } + + return node.push(this.keyword('end')) + } + + import(): SyntaxNode { + const keyword = this.keyword('import') + + const args: SyntaxNode[] = [] + while (!this.isExprEnd()) { + if (this.is($T.NamedArgPrefix)) { + const prefix = SyntaxNode.from(this.next()) + const val = this.value() + const arg = new SyntaxNode('NamedArg', prefix.from, val.to) + arg.push(prefix, val) + args.push(arg) + } else { + args.push(this.identifier()) + } + } + + const node = new SyntaxNode('Import', keyword.from, args.at(-1)!.to) + node.add(keyword) + return node.push(...args) + } + + // if, while, do, etc + keyword(name: string): SyntaxNode { + const node = SyntaxNode.from(this.expect($T.Keyword, name)) + node.type = 'keyword' // TODO lezer legacy + return node + } + + // abc= true + namedArg(): SyntaxNode { + const prefix = SyntaxNode.from(this.expect($T.NamedArgPrefix)) + const val = this.value() + const node = new SyntaxNode('NamedArg', prefix.from, val.to) + return node.push(prefix, val) + } + + // abc= null|true|123|'hi' + namedParam(): SyntaxNode { + const prefix = SyntaxNode.from(this.expect($T.NamedArgPrefix)) + const val = this.value() + + if (!['Null', 
'Boolean', 'Number', 'String'].includes(val.type)) + throw `[namedParam] default value must be Null|Bool|Num|Str, got ${val.type}\n\n ${this.input}\n` + + const node = new SyntaxNode('NamedParam', prefix.from, val.to) + return node.push(prefix, val) + } + + // operators like + - = + op(op?: string): SyntaxNode { + const token = op ? this.expect($T.Operator, op) : this.expect($T.Operator) + const name = operators[token.value!] + if (!name) throw `[op] operator not registered: ${token.value!}\n\n ${this.input}\n` + return new SyntaxNode(name, token.from, token.to) + } + + // ( expressions in parens ) + parens(): SyntaxNode { + this.inParens++ + const open = this.expect($T.OpenParen) + const child = this.expression() + const close = this.expect($T.CloseParen) + this.inParens-- + + const node = new SyntaxNode('ParenExpr', open.from, close.to) + node.add(child) + + return node + } + + // 'hell yes' "hell no" { hell if i know } + string(): SyntaxNode { + const token = this.expect($T.String) + return parseString(this.input, token.from, token.to, this) + } + + // if TEST: blah end + testExpr(): SyntaxNode { + this.inTestExpr = true + const expr = this.expression() + this.inTestExpr = false + return expr + } + + // throw blah + throw(): SyntaxNode { + const keyword = this.keyword('throw') + const val = this.value() + const node = new SyntaxNode('Throw', keyword.from, val.to) + return node.push(keyword, val) + } + + // try: blah catch e: blah end + try(): SyntaxNode { + const tryNode = this.keyword('try') + const tryBlock = this.block() + let last = tryBlock.at(-1) + let catchNode, finalNode + + if (this.is($T.Keyword, 'catch')) + catchNode = this.catch() + + if (this.is($T.Keyword, 'finally')) + finalNode = this.finally() + + const end = this.keyword('end') + + if (finalNode) last = finalNode.children.at(-1) + else if (catchNode) last = catchNode.children.at(-1) + + const node = new SyntaxNode('TryExpr', tryNode.from, last!.to) + node.push(tryNode, ...tryBlock) + + if (catchNode) + node.push(catchNode) + + if (finalNode) + node.push(finalNode) + + return node.push(end) + } + + // while test: blah end + while(): SyntaxNode { + const keyword = this.keyword('while') + const test = this.testExpr() + const block = this.block() + const end = this.keyword('end') + + const node = new SyntaxNode('WhileExpr', keyword.from, end.to) + return node.push(keyword, test, ...block, end) + } + + // readme.txt (when `readme` isn't in scope) + word(start?: SyntaxNode): SyntaxNode { + const parts = [start ?? 
this.expect($T.Word)] + + while (this.is($T.Operator, '.')) { + this.next() + if (this.isAny($T.Word, $T.Identifier, $T.Number)) + parts.push(this.next()) + } + + return new SyntaxNode('Word', parts[0]!.from, parts.at(-1)!.to) + } + + // + // helpers + // + + current(): Token { + return this.tokens[this.pos] || { type: TokenType.Newline, from: 0, to: 0 } + } + + peek(offset = 1): Token | undefined { + return this.tokens[this.pos + offset] + } + + // look past newlines to check for a specific token + peekPastNewlines(type: TokenType, value?: string): boolean { + let offset = 1 + let peek = this.peek(offset) + + while (peek && peek.type === $T.Newline) + peek = this.peek(++offset) + + if (!peek || peek.type !== type) return false + if (value !== undefined && peek.value !== value) return false + return true + } + + next(): Token { + const token = this.current() + this.pos++ + return token + } + + is(type: TokenType, value?: string): boolean { + const token = this.current() + if (!token || token.type !== type) return false + if (value !== undefined && token.value !== value) return false + return true + } + + isAny(...type: TokenType[]): boolean { + return type.some(x => this.is(x)) + } + + nextIs(type: TokenType, value?: string): boolean { + const token = this.peek() + if (!token || token.type !== type) return false + if (value !== undefined && token.value !== value) return false + return true + } + + nextIsAny(...type: TokenType[]): boolean { + return type.some(x => this.nextIs(x)) + } + + isExprEnd(): boolean { + return this.isAny($T.Colon, $T.Semicolon, $T.Newline, $T.CloseParen, $T.CloseBracket) || + this.isExprEndKeyword() || !this.current() + } + + nextIsExprEnd(): boolean { + // pipes act like expression end for function arg parsing + if (this.nextIs($T.Operator, '|')) + return true + + return this.nextIsAny($T.Colon, $T.Semicolon, $T.Newline, $T.CloseBracket, $T.CloseParen) || + this.nextIs($T.Keyword, 'end') || this.nextIs($T.Keyword, 'else') || + this.nextIs($T.Keyword, 'catch') || this.nextIs($T.Keyword, 'finally') || + !this.peek() + } + + isExprEndKeyword(): boolean { + return this.is($T.Keyword, 'end') || this.is($T.Keyword, 'else') || + this.is($T.Keyword, 'catch') || this.is($T.Keyword, 'finally') + } + + isPipe(): boolean { + // inside parens, only look for pipes on same line (don't look past newlines) + const canLookPastNewlines = this.inParens === 0 + + return this.is($T.Operator, '|') || + (canLookPastNewlines && this.peekPastNewlines($T.Operator, '|')) + } + + expect(type: TokenType, value?: string): Token | never { + if (!this.is(type, value)) { + const token = this.current() + throw `expected ${TokenType[type]}${value ? ` "${value}"` : ''}, got ${TokenType[token?.type || 0]}${token?.value ? ` "${token.value}"` : ''} at position ${this.pos}\n\n ${this.input}\n` + } + return this.next() + } + + isEOF(): boolean { + return this.pos >= this.tokens.length + } +} + +// TODO lezer legacy +function collapseDotGets(origNodes: SyntaxNode[]): SyntaxNode { + const nodes = [...origNodes] + let right = nodes.pop()! + + while (nodes.length > 0) { + const left = nodes.pop()! 
+ + if (left.type === 'Identifier') left.type = 'IdentifierBeforeDot' + + const dot = new SyntaxNode("DotGet", left.from, right.to); + dot.push(left, right) + + right = dot + } + + return right +} diff --git a/src/parser/stringParser.ts b/src/parser/stringParser.ts new file mode 100644 index 0000000..d5e125c --- /dev/null +++ b/src/parser/stringParser.ts @@ -0,0 +1,258 @@ +import { SyntaxNode } from './node' + +/** + * Parse string contents into fragments, interpolations, and escape sequences. + * + * Input: full string including quotes, e.g. "'hello $name'" + * Output: SyntaxNode tree with StringFragment, Interpolation, EscapeSeq children + */ +export const parseString = (input: string, from: number, to: number, parser: any): SyntaxNode => { + const stringNode = new SyntaxNode('String', from, to) + const content = input.slice(from, to) + + // Determine string type + const firstChar = content[0] + + // Double-quoted strings: no interpolation or escapes + if (firstChar === '"') { + const fragment = new SyntaxNode('DoubleQuote', from, to) + stringNode.add(fragment) + return stringNode + } + + // Curly strings: interpolation but no escapes + if (firstChar === '{') { + parseCurlyString(stringNode, input, from, to, parser) + return stringNode + } + + // Single-quoted strings: interpolation and escapes + if (firstChar === "'") { + parseSingleQuoteString(stringNode, input, from, to, parser) + return stringNode + } + + throw `Unknown string type starting with: ${firstChar}` +} + +/** + * Parse single-quoted string: 'hello $name\n' + * Supports: interpolation ($var, $(expr)), escape sequences (\n, \$, etc) + */ +const parseSingleQuoteString = (stringNode: SyntaxNode, input: string, from: number, to: number, parser: any) => { + let pos = from + 1 // Skip opening ' + let fragmentStart = pos + + while (pos < to - 1) { // -1 to skip closing ' + const char = input[pos] + + // Escape sequence + if (char === '\\' && pos + 1 < to - 1) { + // Push accumulated fragment + if (pos > fragmentStart) { + const frag = new SyntaxNode('StringFragment', fragmentStart, pos) + stringNode.add(frag) + } + + // Add escape sequence node + const escNode = new SyntaxNode('EscapeSeq', pos, pos + 2) + stringNode.add(escNode) + + pos += 2 + fragmentStart = pos + continue + } + + // Interpolation + if (char === '$') { + // Push accumulated fragment + if (pos > fragmentStart) { + const frag = new SyntaxNode('StringFragment', fragmentStart, pos) + stringNode.add(frag) + } + + pos++ // Skip $ + + // Parse interpolation content + if (input[pos] === '(') { + // Expression interpolation: $(expr) + const interpStart = pos - 1 // Include the $ + const exprResult = parseInterpolationExpr(input, pos, parser) + const interpNode = new SyntaxNode('Interpolation', interpStart, exprResult.endPos) + interpNode.add(exprResult.node) + stringNode.add(interpNode) + pos = exprResult.endPos + } else { + // Variable interpolation: $name + const interpStart = pos - 1 + const identEnd = findIdentifierEnd(input, pos, to - 1) + const identNode = new SyntaxNode('FunctionCallOrIdentifier', pos, identEnd) + const innerIdent = new SyntaxNode('Identifier', pos, identEnd) + identNode.add(innerIdent) + + const interpNode = new SyntaxNode('Interpolation', interpStart, identEnd) + interpNode.add(identNode) + stringNode.add(interpNode) + pos = identEnd + } + + fragmentStart = pos + continue + } + + pos++ + } + + // Push final fragment + if (pos > fragmentStart && fragmentStart < to - 1) { + const frag = new SyntaxNode('StringFragment', fragmentStart, pos) + 
stringNode.add(frag) + } +} + +/** + * Parse curly string: { hello $name } + * Supports: interpolation ($var, $(expr)), nested braces + * Does NOT support: escape sequences (raw content) + */ +const parseCurlyString = (stringNode: SyntaxNode, input: string, from: number, to: number, parser: any) => { + let pos = from + 1 // Skip opening { + let fragmentStart = from // Include the opening { in the fragment + let depth = 1 + + while (pos < to && depth > 0) { + const char = input[pos] + + // Track brace nesting + if (char === '{') { + depth++ + pos++ + continue + } + + if (char === '}') { + depth-- + if (depth === 0) { + // Push final fragment including closing } + const frag = new SyntaxNode('CurlyString', fragmentStart, pos + 1) + stringNode.add(frag) + break + } + pos++ + continue + } + + // Interpolation + if (char === '$') { + // Push accumulated fragment + if (pos > fragmentStart) { + const frag = new SyntaxNode('CurlyString', fragmentStart, pos) + stringNode.add(frag) + } + + pos++ // Skip $ + + // Parse interpolation content + if (input[pos] === '(') { + // Expression interpolation: $(expr) + const interpStart = pos - 1 + const exprResult = parseInterpolationExpr(input, pos, parser) + const interpNode = new SyntaxNode('Interpolation', interpStart, exprResult.endPos) + interpNode.add(exprResult.node) + stringNode.add(interpNode) + pos = exprResult.endPos + } else { + // Variable interpolation: $name + const interpStart = pos - 1 + const identEnd = findIdentifierEnd(input, pos, to) + const identNode = new SyntaxNode('FunctionCallOrIdentifier', pos, identEnd) + const innerIdent = new SyntaxNode('Identifier', pos, identEnd) + identNode.add(innerIdent) + + const interpNode = new SyntaxNode('Interpolation', interpStart, identEnd) + interpNode.add(identNode) + stringNode.add(interpNode) + pos = identEnd + } + + fragmentStart = pos + continue + } + + pos++ + } +} + +/** + * Parse a parenthesized expression interpolation: $(a + b) + * Returns the parsed expression node and the position after the closing ) + * pos is position of the opening ( in the full input string + */ +const parseInterpolationExpr = (input: string, pos: number, parser: any): { node: SyntaxNode, endPos: number } => { + // Find matching closing paren + let depth = 1 + let start = pos + let end = pos + 1 // Start after opening ( + + while (end < input.length && depth > 0) { + if (input[end] === '(') depth++ + if (input[end] === ')') { + depth-- + if (depth === 0) break + } + end++ + } + + const exprContent = input.slice(start + 1, end) // Content between ( and ) + const closeParen = end + end++ // Move past closing ) + + // Use the main parser to parse the expression + const exprNode = parser.parse(exprContent) + + // Get the first real node (skip Program wrapper) + const innerNode = exprNode.firstChild || exprNode + + // Adjust node positions: they're relative to exprContent, need to offset to full input + const offset = start + 1 // Position where exprContent starts in full input + adjustNodePositions(innerNode, offset) + + // Wrap in ParenExpr - use positions in the full string + const parenNode = new SyntaxNode('ParenExpr', start, closeParen + 1) + parenNode.add(innerNode) + + return { node: parenNode, endPos: end } +} + +/** + * Recursively adjust all node positions by adding an offset + */ +const adjustNodePositions = (node: SyntaxNode, offset: number) => { + node.from += offset + node.to += offset + + for (const child of node.children) { + adjustNodePositions(child, offset) + } +} + +/** + * Find the end position of an 
identifier starting at pos + * Identifiers: lowercase letter or emoji, followed by letters/digits/dashes/emoji + */ +const findIdentifierEnd = (input: string, pos: number, maxPos: number): number => { + let end = pos + + while (end < maxPos) { + const char = input[end] + + // Stop at non-identifier characters + if (!/[a-z0-9\-?]/.test(char)) { + break + } + + end++ + } + + return end +} diff --git a/src/parser/tests/basics.test.ts b/src/parser/tests/basics.test.ts index 1f6f1a1..c223834 100644 --- a/src/parser/tests/basics.test.ts +++ b/src/parser/tests/basics.test.ts @@ -810,44 +810,6 @@ describe('Nullish coalescing operator', () => { }) }) -describe('DotGet whitespace sensitivity', () => { - test('no whitespace - DotGet works when identifier in scope', () => { - expect('basename = 5; basename.prop').toMatchTree(` - Assign - AssignableIdentifier basename - Eq = - Number 5 - FunctionCallOrIdentifier - DotGet - IdentifierBeforeDot basename - Identifier prop`) - }) - - test('space before dot - NOT DotGet, parses as division', () => { - expect('basename = 5; basename / prop').toMatchTree(` - Assign - AssignableIdentifier basename - Eq = - Number 5 - BinOp - Identifier basename - Slash / - Identifier prop`) - }) - - test('dot followed by slash is Word, not DotGet', () => { - expect('basename ./cool').toMatchTree(` - FunctionCall - Identifier basename - PositionalArg - Word ./cool`) - }) - - test('identifier not in scope with dot becomes Word', () => { - expect('readme.txt').toMatchTree(`Word readme.txt`) - }) -}) - describe('Comments', () => { test('are greedy', () => { expect(` @@ -897,61 +859,6 @@ basename = 5 # very astute }) }) -describe('Array destructuring', () => { - test('parses array pattern with two variables', () => { - expect('[ a b ] = [ 1 2 3 4]').toMatchTree(` - Assign - Array - Identifier a - Identifier b - Eq = - Array - Number 1 - Number 2 - Number 3 - Number 4`) - }) - - test('parses array pattern with one variable', () => { - expect('[ x ] = [ 42 ]').toMatchTree(` - Assign - Array - Identifier x - Eq = - Array - Number 42`) - }) - - test('parses array pattern with emoji identifiers', () => { - expect('[ 🚀 💎 ] = [ 1 2 ]').toMatchTree(` - Assign - Array - Identifier 🚀 - Identifier 💎 - Eq = - Array - Number 1 - Number 2`) - }) - - test('works with dotget', () => { - expect('[ a ] = [ [1 2 3] ]; a.1').toMatchTree(` - Assign - Array - Identifier a - Eq = - Array - Array - Number 1 - Number 2 - Number 3 - FunctionCallOrIdentifier - DotGet - IdentifierBeforeDot a - Number 1`) - }) -}) - describe('Conditional ops', () => { test('or can be chained', () => { expect(` @@ -1037,34 +944,3 @@ Assign `) }) }) - -describe('import', () => { - test('parses single import', () => { - expect(`import str`).toMatchTree(` - Import - keyword import - Identifier str - `) - }) - - test('parses multiple imports', () => { - expect(`import str math list`).toMatchTree(` - Import - keyword import - Identifier str - Identifier math - Identifier list - `) - }) - - test('parses named args', () => { - expect(`import str only=ends-with?`).toMatchTree(` - Import - keyword import - Identifier str - NamedArg - NamedArgPrefix only= - Identifier ends-with? 
- `) - }) -}) \ No newline at end of file diff --git a/src/parser/tests/control-flow.test.ts b/src/parser/tests/control-flow.test.ts index 1bacc31..79d23e6 100644 --- a/src/parser/tests/control-flow.test.ts +++ b/src/parser/tests/control-flow.test.ts @@ -24,7 +24,8 @@ describe('if/else if/else', () => { Eq = IfExpr keyword if - Identifier x + FunctionCallOrIdentifier + Identifier x colon : Block Number 2 @@ -59,7 +60,8 @@ describe('if/else if/else', () => { end`).toMatchTree(` IfExpr keyword if - Identifier with-else + FunctionCallOrIdentifier + Identifier with-else colon : Block FunctionCallOrIdentifier @@ -82,7 +84,8 @@ describe('if/else if/else', () => { end`).toMatchTree(` IfExpr keyword if - Identifier with-else-if + FunctionCallOrIdentifier + Identifier with-else-if colon : Block FunctionCallOrIdentifier @@ -90,7 +93,8 @@ describe('if/else if/else', () => { ElseIfExpr keyword else keyword if - Identifier another-condition + FunctionCallOrIdentifier + Identifier another-condition colon : Block FunctionCallOrIdentifier @@ -111,7 +115,8 @@ describe('if/else if/else', () => { end`).toMatchTree(` IfExpr keyword if - Identifier with-else-if-else + FunctionCallOrIdentifier + Identifier with-else-if-else colon : Block FunctionCallOrIdentifier @@ -119,7 +124,8 @@ describe('if/else if/else', () => { ElseIfExpr keyword else keyword if - Identifier another-condition + FunctionCallOrIdentifier + Identifier another-condition colon : Block FunctionCallOrIdentifier @@ -127,7 +133,8 @@ describe('if/else if/else', () => { ElseIfExpr keyword else keyword if - Identifier yet-another-condition + FunctionCallOrIdentifier + Identifier yet-another-condition colon : Block FunctionCallOrIdentifier @@ -173,7 +180,7 @@ describe('if/else if/else', () => { `) }) - test('parses function calls in if tests', () => { + test("parses paren'd function calls in if tests", () => { expect(`if (var? 'abc'): true end`).toMatchTree(` IfExpr keyword if @@ -214,7 +221,7 @@ describe('if/else if/else', () => { `) }) - test('parses function calls in else-if tests', () => { + test("parses paren'd function calls in else-if tests", () => { expect(`if false: true else if (var? 'abc'): true end`).toMatchTree(` IfExpr keyword if diff --git a/src/parser/tests/destructuring.test.ts b/src/parser/tests/destructuring.test.ts new file mode 100644 index 0000000..ae17a27 --- /dev/null +++ b/src/parser/tests/destructuring.test.ts @@ -0,0 +1,58 @@ +import { expect, describe, test } from 'bun:test' + +import '../shrimp.grammar' // Importing this so changes cause it to retest! 
+ +describe('Array destructuring', () => { + test('parses array pattern with two variables', () => { + expect('[ a b ] = [ 1 2 3 4]').toMatchTree(` + Assign + Array + Identifier a + Identifier b + Eq = + Array + Number 1 + Number 2 + Number 3 + Number 4`) + }) + + test('parses array pattern with one variable', () => { + expect('[ x ] = [ 42 ]').toMatchTree(` + Assign + Array + Identifier x + Eq = + Array + Number 42`) + }) + + test('parses array pattern with emoji identifiers', () => { + expect('[ 🚀 💎 ] = [ 1 2 ]').toMatchTree(` + Assign + Array + Identifier 🚀 + Identifier 💎 + Eq = + Array + Number 1 + Number 2`) + }) + + test('works with dotget', () => { + expect('[ a ] = [ [1 2 3] ]; a.1').toMatchTree(` + Assign + Array + Identifier a + Eq = + Array + Array + Number 1 + Number 2 + Number 3 + FunctionCallOrIdentifier + DotGet + IdentifierBeforeDot a + Number 1`) + }) +}) \ No newline at end of file diff --git a/src/parser/tests/dot-get.test.ts b/src/parser/tests/dot-get.test.ts index fbcdb26..b2a2be0 100644 --- a/src/parser/tests/dot-get.test.ts +++ b/src/parser/tests/dot-get.test.ts @@ -1,6 +1,44 @@ import { describe, test, expect } from 'bun:test' import '../../testSetup' +describe('DotGet whitespace sensitivity', () => { + test('no whitespace - DotGet works when identifier in scope', () => { + expect('basename = 5; basename.prop').toMatchTree(` + Assign + AssignableIdentifier basename + Eq = + Number 5 + FunctionCallOrIdentifier + DotGet + IdentifierBeforeDot basename + Identifier prop`) + }) + + test('space before dot - NOT DotGet, parses as division', () => { + expect('basename = 5; basename / prop').toMatchTree(` + Assign + AssignableIdentifier basename + Eq = + Number 5 + BinOp + Identifier basename + Slash / + Identifier prop`) + }) + + test('dot followed by slash is Word, not DotGet', () => { + expect('basename ./cool').toMatchTree(` + FunctionCall + Identifier basename + PositionalArg + Word ./cool`) + }) + + test('identifier not in scope with dot becomes Word', () => { + expect('readme.txt').toMatchTree(`Word readme.txt`) + }) +}) + describe('DotGet', () => { test('readme.txt is Word when readme not in scope', () => { expect('readme.txt').toMatchTree(`Word readme.txt`) @@ -199,7 +237,7 @@ end`).toMatchTree(` `) }) - test("dot get doesn't work with spaces", () => { + test.skip("dot get doesn't work with spaces", () => { expect('obj . prop').toMatchTree(` FunctionCall Identifier obj diff --git a/src/parser/tests/functions.test.ts b/src/parser/tests/functions.test.ts index 3f4c410..092f153 100644 --- a/src/parser/tests/functions.test.ts +++ b/src/parser/tests/functions.test.ts @@ -57,7 +57,7 @@ describe('calling functions', () => { `) }) - test('Incomplete namedArg', () => { + test.skip('Incomplete namedArg', () => { expect('tail lines=').toMatchTree(` FunctionCall Identifier tail diff --git a/src/parser/tests/import.test.ts b/src/parser/tests/import.test.ts new file mode 100644 index 0000000..ec63061 --- /dev/null +++ b/src/parser/tests/import.test.ts @@ -0,0 +1,34 @@ +import { expect, describe, test } from 'bun:test' + +import '../shrimp.grammar' // Importing this so changes cause it to retest! 
+ +describe('import', () => { + test('parses single import', () => { + expect(`import str`).toMatchTree(` + Import + keyword import + Identifier str + `) + }) + + test('parses multiple imports', () => { + expect(`import str math list`).toMatchTree(` + Import + keyword import + Identifier str + Identifier math + Identifier list + `) + }) + + test('parses named args', () => { + expect(`import str only=ends-with?`).toMatchTree(` + Import + keyword import + Identifier str + NamedArg + NamedArgPrefix only= + Identifier ends-with? + `) + }) +}) \ No newline at end of file diff --git a/src/parser/tests/literals.test.ts b/src/parser/tests/literals.test.ts index e20368b..9173232 100644 --- a/src/parser/tests/literals.test.ts +++ b/src/parser/tests/literals.test.ts @@ -375,10 +375,11 @@ describe('dict literals', () => { expect('[=]').toMatchTree(` Dict [=] `) + }) + test('empty dict w whitespace', () => { expect('[ = ]').toMatchTree(` - Array - Word = + Dict [ = ] `) }) diff --git a/src/parser/tests/tokens.test.ts b/src/parser/tests/tokens.test.ts index 5f5e3bf..f3613f7 100644 --- a/src/parser/tests/tokens.test.ts +++ b/src/parser/tests/tokens.test.ts @@ -15,7 +15,10 @@ describe('numbers', () => { test('non-numbers', () => { expect(`1st`).toMatchToken('Word', '1st') expect(`1_`).toMatchToken('Word', '1_') - expect(`100.`).toMatchToken('Word', '100.') + expect(`100.`).toMatchTokens( + { type: 'Number', value: '100' }, + { type: 'Operator', value: '.' }, + ) }) test('simple numbers', () => { @@ -127,6 +130,19 @@ describe('identifiers', () => { expect('dog#pound').toMatchToken('Word', 'dog#pound') expect('http://website.com').toMatchToken('Word', 'http://website.com') expect('school$cool').toMatchToken('Identifier', 'school$cool') + expect('EXIT:').toMatchTokens( + { type: 'Word', value: 'EXIT' }, + { type: 'Colon' }, + ) + expect(`if y == 1: 'cool' end`).toMatchTokens( + { type: 'Keyword', value: 'if' }, + { type: 'Identifier', value: 'y' }, + { type: 'Operator', value: '==' }, + { type: 'Number', value: '1' }, + { type: 'Colon' }, + { type: 'String', value: `'cool'` }, + { type: 'Keyword', value: 'end' }, + ) }) }) @@ -139,8 +155,15 @@ describe('paths', () => { expect('/home/chris/dev').toMatchToken('Word', '/home/chris/dev') }) - test('ending with ext', () => { - expect('readme.txt').toMatchToken('Word', 'readme.txt') + test('identifiers with dots tokenize separately', () => { + expect('readme.txt').toMatchTokens( + { type: 'Identifier', value: 'readme' }, + { type: 'Operator', value: '.' }, + { type: 'Identifier', value: 'txt' }, + ) + }) + + test('words (non-identifiers) consume dots', () => { expect('README.md').toMatchToken('Word', 'README.md') }) @@ -259,6 +282,9 @@ describe('operators', () => { expect('==').toMatchToken('Operator', '==') expect('>').toMatchToken('Operator', '>') expect('<').toMatchToken('Operator', '<') + + // property access + expect('.').toMatchToken('Operator', '.') }) }) @@ -281,6 +307,12 @@ describe('keywords', () => { }) }) +describe('regex', () => { + test('use double slash', () => { + expect(`//[0-9]+//`).toMatchToken('Regex', '//[0-9]+//') + }) +}) + describe('punctuation', () => { test('underscore', () => { expect(`_`).toBeToken('Underscore') @@ -453,6 +485,17 @@ f { type: 'Identifier', value: 'y' }, ) + + expect(`if (var? 'abc'): y`).toMatchTokens( + { type: 'Keyword', value: 'if' }, + { type: 'OpenParen' }, + { type: 'Identifier', value: 'var?' 
}, + { type: 'String', value: `'abc'` }, + { type: 'CloseParen' }, + { type: 'Colon' }, + { type: 'Identifier', value: 'y' }, + ) + expect(` do x: y @@ -485,6 +528,30 @@ end`).toMatchTokens( { type: 'CloseParen' }, ) }) + + test('dot operator beginning word with slash', () => { + expect(`(basename ./cool)`).toMatchTokens( + { 'type': 'OpenParen' }, + { 'type': 'Identifier', 'value': 'basename' }, + { 'type': 'Word', 'value': './cool' }, + { 'type': 'CloseParen' } + ) + }) + + test('dot word after identifier with space', () => { + expect(`expand-path .git`).toMatchTokens( + { 'type': 'Identifier', 'value': 'expand-path' }, + { 'type': 'Word', 'value': '.git' }, + ) + }) + + test('dot operator after identifier without space', () => { + expect(`config.path`).toMatchTokens( + { 'type': 'Identifier', 'value': 'config' }, + { 'type': 'Operator', 'value': '.' }, + { 'type': 'Identifier', 'value': 'path' }, + ) + }) }) describe('nesting edge cases', () => { @@ -590,4 +657,73 @@ describe('named args', () => { { type: 'Identifier', value: 'arg' }, ) }) +}) + +describe('dot operator', () => { + test('standalone dot', () => { + expect('.').toMatchToken('Operator', '.') + }) + + test('dot between identifiers tokenizes as separate tokens', () => { + expect('config.path').toMatchTokens( + { type: 'Identifier', value: 'config' }, + { type: 'Operator', value: '.' }, + { type: 'Identifier', value: 'path' }, + ) + }) + + test('dot with number', () => { + expect('array.0').toMatchTokens( + { type: 'Identifier', value: 'array' }, + { type: 'Operator', value: '.' }, + { type: 'Number', value: '0' }, + ) + }) + + test('chained dots', () => { + expect('a.b.c').toMatchTokens( + { type: 'Identifier', value: 'a' }, + { type: 'Operator', value: '.' }, + { type: 'Identifier', value: 'b' }, + { type: 'Operator', value: '.' }, + { type: 'Identifier', value: 'c' }, + ) + }) + + test('identifier-like paths tokenize separately', () => { + expect('readme.txt').toMatchTokens( + { type: 'Identifier', value: 'readme' }, + { type: 'Operator', value: '.' }, + { type: 'Identifier', value: 'txt' }, + ) + }) + + test('word-like paths remain as single token', () => { + expect('./file.txt').toMatchToken('Word', './file.txt') + expect('README.TXT').toMatchToken('Word', 'README.TXT') + }) + + test('dot with paren expression', () => { + expect('obj.(1 + 2)').toMatchTokens( + { type: 'Identifier', value: 'obj' }, + { type: 'Operator', value: '.' }, + { type: 'OpenParen' }, + { type: 'Number', value: '1' }, + { type: 'Operator', value: '+' }, + { type: 'Number', value: '2' }, + { type: 'CloseParen' }, + ) + }) + + test('chained dot with paren expression', () => { + expect('obj.items.(i)').toMatchTokens( + { type: 'Identifier', value: 'obj' }, + { type: 'Operator', value: '.' }, + { type: 'Identifier', value: 'items' }, + { type: 'Operator', value: '.' 
}, + { type: 'OpenParen' }, + { type: 'Identifier', value: 'i' }, + { type: 'CloseParen' }, + ) + }) }) \ No newline at end of file diff --git a/src/parser/tokenizer2.ts b/src/parser/tokenizer2.ts index 74844ae..8674321 100644 --- a/src/parser/tokenizer2.ts +++ b/src/parser/tokenizer2.ts @@ -31,13 +31,14 @@ export enum TokenType { Boolean, Number, String, + Regex, } const valueTokens = new Set([ TokenType.Comment, TokenType.Keyword, TokenType.Operator, TokenType.Identifier, TokenType.Word, TokenType.NamedArgPrefix, - TokenType.Boolean, TokenType.Number, TokenType.String + TokenType.Boolean, TokenType.Number, TokenType.String, TokenType.Regex ]) const operators = new Set([ @@ -67,7 +68,7 @@ const operators = new Set([ // nullish '??', - // math + // math '**', '*', '/', @@ -82,6 +83,12 @@ const operators = new Set([ '==', '>', '<', + + // property access + '.', + + // pipe + '|', ]) const keywords = new Set([ @@ -116,6 +123,7 @@ export class Scanner { inParen = 0 inBracket = 0 tokens: Token[] = [] + prevIsWhitespace = true reset() { this.input = '' @@ -124,6 +132,7 @@ export class Scanner { this.char = 0 this.prev = 0 this.tokens.length = 0 + this.prevIsWhitespace = true } peek(count = 0): number { @@ -131,9 +140,11 @@ export class Scanner { } next(): number { + this.prevIsWhitespace = isWhitespace(this.char) this.prev = this.char this.char = this.peek() this.pos += getCharSize(this.char) + return this.char } @@ -156,6 +167,10 @@ export class Scanner { this.start = this.pos } + pushChar(type: TokenType) { + this.push(type, this.pos - 1, this.pos) + } + // turn shrimp code into shrimp tokens that get fed into the parser tokenize(input: string): Token[] { this.reset() @@ -164,6 +179,7 @@ export class Scanner { while (this.char > 0) { const char = this.char + if (char === c`#`) { this.readComment() continue @@ -185,7 +201,7 @@ export class Scanner { } if (isIdentStart(char)) { - this.readIdentOrKeyword() + this.readWordOrIdent(true) // true = started with identifier char continue } @@ -195,25 +211,39 @@ export class Scanner { } if (char === c`:`) { - this.push(TokenType.Colon, this.start - 1, this.pos) // TODO: why? 
+ this.pushChar(TokenType.Colon) this.next() continue } + // whitespace-sensitive dot as operator (property access) only after identifier/number + if (char === c`.`) { + if (this.canBeDotGet(this.tokens.at(-1))) { + this.pushChar(TokenType.Operator) + this.next() + continue + } + } + + if (char === c`/` && this.peek() === c`/`) { + this.readRegex() + continue + } + if (isWordChar(char)) { - this.readWord() + this.readWordOrIdent(false) // false = didn't start with identifier char continue } if (char === c`\n`) { if (this.inParen === 0 && this.inBracket === 0) - this.push(TokenType.Newline) + this.pushChar(TokenType.Newline) this.next() continue } if (char === c`;`) { - this.push(TokenType.Semicolon) + this.pushChar(TokenType.Semicolon) this.next() continue } @@ -225,6 +255,7 @@ export class Scanner { } readComment() { + this.start = this.pos - 1 while (this.char !== c`\n` && this.char > 0) this.next() this.push(TokenType.Comment) } @@ -233,16 +264,16 @@ export class Scanner { switch (this.char) { case c`(`: this.inParen++ - this.push(TokenType.OpenParen); break + this.pushChar(TokenType.OpenParen); break case c`)`: this.inParen-- - this.push(TokenType.CloseParen); break + this.pushChar(TokenType.CloseParen); break case c`[`: this.inBracket++ - this.push(TokenType.OpenBracket); break + this.pushChar(TokenType.OpenBracket); break case c`]`: this.inBracket-- - this.push(TokenType.CloseBracket); break + this.pushChar(TokenType.CloseBracket); break } this.next() } @@ -270,7 +301,7 @@ export class Scanner { this.push(TokenType.String) } - readIdentOrKeyword() { + readWordOrIdent(startedWithIdentChar: boolean) { this.start = this.pos - getCharSize(this.char) while (isWordChar(this.char)) { @@ -280,33 +311,50 @@ export class Scanner { if (isWhitespace(nextCh) || nextCh === 0) break } - // stop at equal sign (named arg) + // stop at equal sign (named arg) - but only if what we've read so far is an identifier if (this.char === c`=`) { - this.next() - break + const soFar = this.input.slice(this.start, this.pos - getCharSize(this.char)) + if (isIdentifer(soFar)) { + this.next() + break + } + } + + // stop at dot only if it would create a valid property access + // AND only if we started with an identifier character (not for Words like README.txt) + if (startedWithIdentChar && this.char === c`.`) { + const nextCh = this.peek() + if (isIdentStart(nextCh) || isDigit(nextCh) || nextCh === c`(`) { + const soFar = this.input.slice(this.start, this.pos - getCharSize(this.char)) + if (isIdentifer(soFar)) break + } } this.next() } - const ident = this.input.slice(this.start, this.pos - getCharSize(this.char)) + const word = this.input.slice(this.start, this.pos - getCharSize(this.char)) - if (ident === 'null') + // classify the token based on what we read + if (word === '_') + this.pushChar(TokenType.Underscore) + + else if (word === 'null') this.push(TokenType.Null) - else if (ident === 'true' || ident === 'false') + else if (word === 'true' || word === 'false') this.push(TokenType.Boolean) - else if (isKeyword(ident)) + else if (isKeyword(word)) this.push(TokenType.Keyword) - else if (isOperator(ident)) - this.push(TokenType.Operator) // only things like `and` and `or` + else if (isOperator(word)) + this.push(TokenType.Operator) - else if (isIdentifer(ident)) + else if (isIdentifer(word)) this.push(TokenType.Identifier) - else if (ident.endsWith('=')) + else if (word.endsWith('=')) this.push(TokenType.NamedArgPrefix) else @@ -316,6 +364,12 @@ export class Scanner { readNumber() { this.start = this.pos - 1 
while (isWordChar(this.char)) { + // stop at dot unless it's part of the number + if (this.char === c`.`) { + const nextCh = this.peek() + if (!isDigit(nextCh)) break + } + // stop at colon if (this.char === c`:`) { const nextCh = this.peek() @@ -327,21 +381,28 @@ export class Scanner { this.push(isNumber(ident) ? TokenType.Number : TokenType.Word) } - readWord() { - this.start = this.pos - getCharSize(this.char) + readRegex() { + this.start = this.pos - 1 + this.next() // skip 2nd / - while (isWordChar(this.char)) this.next() + while (this.char > 0) { + if (this.char === c`/` && this.peek() === c`/`) { + this.next() // skip / + this.next() // skip / + this.push(TokenType.Regex) + break + } - const word = this.input.slice(this.start, this.pos - getCharSize(this.char)) + this.next() + } + } - if (word === '_') - this.push(TokenType.Underscore) - - else if (operators.has(word)) - this.push(TokenType.Operator) - - else - this.push(TokenType.Word) + canBeDotGet(lastToken?: Token): boolean { + return !this.prevIsWhitespace && !!lastToken && + (lastToken.type === TokenType.Identifier || + lastToken.type === TokenType.Number || + lastToken.type === TokenType.CloseParen || + lastToken.type === TokenType.CloseBracket) } } diff --git a/src/testSetup.ts b/src/testSetup.ts index 814f91f..739c922 100644 --- a/src/testSetup.ts +++ b/src/testSetup.ts @@ -4,12 +4,13 @@ import color from 'kleur' import { Scanner, TokenType, type Token } from '#parser/tokenizer2' import { parser } from '#parser/shrimp' import { setGlobals } from '#parser/tokenizer' +import { parse } from '#parser/parser2' import { globals as prelude } from '#prelude' import { $ } from 'bun' import { assert, errorMessage } from '#utils/utils' import { Compiler } from '#compiler/compiler' import { run, VM } from 'reefvm' -import { treeToString, VMResultToValue } from '#utils/tree' +import { treeToString2, treeToString, VMResultToValue } from '#utils/tree' const regenerateParser = async () => { let generate = true @@ -52,8 +53,8 @@ expect.extend({ const allGlobals = { ...prelude, ...(globals || {}) } setGlobals(Object.keys(allGlobals)) - const tree = parser.parse(received) - const actual = treeToString(tree, received) + const tree = parse(received) + const actual = treeToString2(tree, received) const normalizedExpected = trimWhitespace(expected) try { @@ -244,7 +245,7 @@ const tokenize = (code: string): Token[] => { return scanner.tokenize(code) } -const toHumanToken = (tok: Token): { type: string, value: string } => { +const toHumanToken = (tok: Token): { type: string, value?: string } => { return { type: TokenType[tok.type], value: tok.value diff --git a/src/utils/tree.ts b/src/utils/tree.ts index 45a9318..fa31562 100644 --- a/src/utils/tree.ts +++ b/src/utils/tree.ts @@ -1,5 +1,38 @@ import { Tree, TreeCursor } from '@lezer/common' import { type Value, fromValue } from 'reefvm' +import { SyntaxNode } from '#parser/node' + +const nodeToString = (node: SyntaxNode, input: string, depth = 0): string => { + const indent = ' '.repeat(depth) + const text = input.slice(node.from, node.to) + const nodeName = node.name + + if (node.firstChild) { + return `${indent}${nodeName}` + } else { + // Only strip quotes from whole String nodes (legacy DoubleQuote), not StringFragment/EscapeSeq/CurlyString + const cleanText = nodeName === 'String' ? 
text.slice(1, -1) : text + return `${indent}${nodeName} ${cleanText}` + } +} + +export const treeToString2 = (tree: SyntaxNode, input: string, depth = 0): string => { + let lines = [] + let node: SyntaxNode | null = tree + + if (node.name === 'Program') node = node.firstChild + + while (node) { + lines.push(nodeToString(node, input, depth)) + + if (node.firstChild) + lines.push(treeToString2(node.firstChild, input, depth + 1)) + + node = node.nextSibling + } + + return lines.join('\n') +} export const treeToString = (tree: Tree, input: string): string => { const lines: string[] = [] -- 2.50.1 From e38e8d4f1eacddcc28c7d874e3aac81c6c9bc162 Mon Sep 17 00:00:00 2001 From: Chris Wanstrath <2+defunkt@users.noreply.github.com> Date: Mon, 24 Nov 2025 16:15:37 -0800 Subject: [PATCH 03/35] minor --- src/parser/parser2.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/parser/parser2.ts b/src/parser/parser2.ts index 9a9cb82..3919476 100644 --- a/src/parser/parser2.ts +++ b/src/parser/parser2.ts @@ -65,7 +65,7 @@ export class Parser { return node } - // + // // parse foundation nodes - statements, expressions // @@ -321,7 +321,7 @@ export class Parser { } // atoms are the basic building blocks: literals, identifiers, words - atom() { + atom(): SyntaxNode { if (this.is($T.String)) return this.string() @@ -333,7 +333,7 @@ export class Parser { } // blocks in if, do, special calls, etc - // `: something end` + // `: something end` // // `blockNode` determines whether we return [colon, BlockNode, end] or // just a list of statements like [colon, stmt1, stmt2, end] @@ -784,7 +784,7 @@ export class Parser { return new SyntaxNode('Word', parts[0]!.from, parts.at(-1)!.to) } - // + // // helpers // -- 2.50.1 From 3eac0a27a5f15e2fb238c6c42a1eae90dbef3c9a Mon Sep 17 00:00:00 2001 From: Chris Wanstrath <2+defunkt@users.noreply.github.com> Date: Tue, 25 Nov 2025 11:30:27 -0800 Subject: [PATCH 04/35] hwhitespace --- src/compiler/tests/function-blocks.test.ts | 12 ++++++------ src/parser/tests/functions.test.ts | 4 ++-- src/parser/tests/multiline.test.ts | 4 ++-- src/utils/tree.ts | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/compiler/tests/function-blocks.test.ts b/src/compiler/tests/function-blocks.test.ts index 41bf65d..4c91731 100644 --- a/src/compiler/tests/function-blocks.test.ts +++ b/src/compiler/tests/function-blocks.test.ts @@ -23,15 +23,15 @@ describe('multi line function blocks', () => { test('work with no args', () => { expect(` trap = do x: x end -trap: - true +trap: + true end`).toEvaluateTo(true) }) test('work with one arg', () => { expect(` trap = do x y: [ x (y) ] end -trap EXIT: +trap EXIT: true end`).toEvaluateTo(['EXIT', true]) }) @@ -39,7 +39,7 @@ end`).toEvaluateTo(['EXIT', true]) test('work with named args', () => { expect(` attach = do signal fn: [ signal (fn) ] end -attach signal='exit': +attach signal='exit': true end`).toEvaluateTo(['exit', true]) }) @@ -48,8 +48,8 @@ end`).toEvaluateTo(['exit', true]) test('work with dot-get', () => { expect(` signals = [trap=do x y: [x (y)] end] -signals.trap 'EXIT': - true +signals.trap 'EXIT': + true end`).toEvaluateTo(['EXIT', true]) }) }) diff --git a/src/parser/tests/functions.test.ts b/src/parser/tests/functions.test.ts index 092f153..d2d7f9f 100644 --- a/src/parser/tests/functions.test.ts +++ b/src/parser/tests/functions.test.ts @@ -63,7 +63,7 @@ describe('calling functions', () => { Identifier tail NamedArg NamedArgPrefix lines= - ⚠ + ⚠ ⚠ `) }) }) @@ -73,7 +73,7 @@ describe('Do', () => { 
expect('do: 1 end').toMatchTree(` FunctionDef Do do - Params + Params colon : Number 1 keyword end`) diff --git a/src/parser/tests/multiline.test.ts b/src/parser/tests/multiline.test.ts index 84e3815..9362181 100644 --- a/src/parser/tests/multiline.test.ts +++ b/src/parser/tests/multiline.test.ts @@ -76,12 +76,12 @@ end expect(` do: 2 - + end `).toMatchTree(` FunctionDef Do do - Params + Params colon : Number 2 keyword end diff --git a/src/utils/tree.ts b/src/utils/tree.ts index fa31562..c760082 100644 --- a/src/utils/tree.ts +++ b/src/utils/tree.ts @@ -12,7 +12,7 @@ const nodeToString = (node: SyntaxNode, input: string, depth = 0): string => { } else { // Only strip quotes from whole String nodes (legacy DoubleQuote), not StringFragment/EscapeSeq/CurlyString const cleanText = nodeName === 'String' ? text.slice(1, -1) : text - return `${indent}${nodeName} ${cleanText}` + return cleanText ? `${indent}${nodeName} ${cleanText}` : `${indent}${nodeName}` } } -- 2.50.1 From 9e4471ad387b4b5e091e07b535a709407f62b082 Mon Sep 17 00:00:00 2001 From: Chris Wanstrath <2+defunkt@users.noreply.github.com> Date: Tue, 25 Nov 2025 13:08:28 -0800 Subject: [PATCH 05/35] try to match lezer API more closely --- src/parser/node.ts | 193 +++++++++++++++++++++++++++++++++++++++++- src/parser/parser2.ts | 18 ++-- 2 files changed, 198 insertions(+), 13 deletions(-) diff --git a/src/parser/node.ts b/src/parser/node.ts index 31942da..15dd8fc 100644 --- a/src/parser/node.ts +++ b/src/parser/node.ts @@ -1,4 +1,5 @@ -import { type Token, TokenType } from "./tokenizer2" +import { type Token, TokenType } from './tokenizer2' +import * as term from './shrimp.terms' export type NodeType = | 'Program' @@ -110,15 +111,191 @@ export const operators: Record = { '|': 'operator', } +export class Tree { + constructor(public topNode: SyntaxNode) { } +} + +// TODO: TEMPORARY SHIM +class SyntaxNodeType { + constructor(public nodeType: NodeType) { } + + is(other: string) { + return this.nodeType === other + } + + get id(): number { + switch (this.nodeType) { + case 'Program': + return term.Program + + case 'Block': + return term.Block + + case 'FunctionCall': + return term.FunctionCall + + case 'FunctionCallOrIdentifier': + return term.FunctionCallOrIdentifier + + case 'FunctionCallWithBlock': + return term.FunctionCallWithBlock + + case 'PositionalArg': + return term.PositionalArg + + case 'NamedArg': + return term.NamedArg + + case 'FunctionDef': + return term.FunctionDef + + case 'Params': + return term.Params + + case 'NamedParam': + return term.NamedParam + + case 'Null': + return term.Null + + case 'Boolean': + return term.Boolean + + case 'Number': + return term.Number + + case 'String': + return term.String + + case 'StringFragment': + return term.StringFragment + + case 'CurlyString': + return term.CurlyString + + case 'DoubleQuote': + return term.DoubleQuote + + case 'EscapeSeq': + return term.EscapeSeq + + case 'Interpolation': + return term.Interpolation + + case 'Regex': + return term.Regex + + case 'Identifier': + return term.Identifier + + case 'AssignableIdentifier': + return term.AssignableIdentifier + + case 'IdentifierBeforeDot': + return term.IdentifierBeforeDot + + case 'Word': + return term.Word + + case 'Array': + return term.Array + + case 'Dict': + return term.Dict + + case 'Comment': + return term.Comment + + case 'BinOp': + return term.BinOp + + case 'ConditionalOp': + return term.ConditionalOp + + case 'ParenExpr': + return term.ParenExpr + + case 'Assign': + return term.Assign + + case 'CompoundAssign': + 
return term.CompoundAssign + + case 'DotGet': + return term.DotGet + + case 'PipeExpr': + return term.PipeExpr + + case 'IfExpr': + return term.IfExpr + + case 'ElseIfExpr': + return term.ElseIfExpr + + case 'ElseExpr': + return term.ElseExpr + + case 'WhileExpr': + return term.WhileExpr + + case 'TryExpr': + return term.TryExpr + + case 'CatchExpr': + return term.CatchExpr + + case 'FinallyExpr': + return term.FinallyExpr + + case 'Throw': + return term.Throw + + case 'Eq': + return term.Eq + + case 'Modulo': + return term.Modulo + + case 'Plus': + return term.Plus + + case 'Star': + return term.Star + + case 'Slash': + return term.Slash + + case 'Import': + return term.Import + + case 'Do': + return term.Do + + case 'colon': + return term.colon + + case 'keyword': + return term.keyword + + } + return 0 + } + + get name(): string { + return this.nodeType + } +} + export class SyntaxNode { - type: NodeType + #type: NodeType from: number to: number parent: SyntaxNode | null children: SyntaxNode[] = [] constructor(type: NodeType, from: number, to: number, parent: SyntaxNode | null = null) { - this.type = type + this.#type = type this.from = from this.to = to this.parent = parent @@ -128,8 +305,16 @@ export class SyntaxNode { return new SyntaxNode(TokenType[token.type] as NodeType, token.from, token.to, parent ?? null) } + get type(): SyntaxNodeType { + return new SyntaxNodeType(this.#type) + } + + set type(name: NodeType) { + this.#type = name + } + get name(): string { - return this.type + return this.type.name } get isError(): boolean { diff --git a/src/parser/parser2.ts b/src/parser/parser2.ts index 3919476..64f62bc 100644 --- a/src/parser/parser2.ts +++ b/src/parser/parser2.ts @@ -118,12 +118,12 @@ export class Parser { expr = this.exprWithPrecedence() // check for destructuring - if (expr.type === 'Array' && this.is($T.Operator, '=')) + if (expr.type.is('Array') && this.is($T.Operator, '=')) return this.destructure(expr) // check for parens function call // ex: (ref my-func) my-arg - if (expr.type === 'ParenExpr' && !this.isExprEnd()) + if (expr.type.is('ParenExpr') && !this.isExprEnd()) expr = this.functionCall(expr) // one | echo @@ -321,7 +321,7 @@ export class Parser { } // atoms are the basic building blocks: literals, identifiers, words - atom(): SyntaxNode { + atom(): SyntaxNode { if (this.is($T.String)) return this.string() @@ -507,7 +507,7 @@ export class Parser { if (!this.scope.has(ident)) return this.word(left) - if (left.type === 'Identifier') left.type = 'IdentifierBeforeDot' + if (left.type.is('Identifier')) left.type = 'IdentifierBeforeDot' let parts = [] while (this.is($T.Operator, '.')) { @@ -527,7 +527,7 @@ export class Parser { const dotGet = this.dotGet() // dotget not in scope, regular Word - if (dotGet.type === 'Word') return dotGet + if (dotGet.type.is('Word')) return dotGet if (this.isExprEnd()) return this.functionCallOrIdentifier(dotGet) @@ -580,7 +580,7 @@ export class Parser { inner = this.dotGet() // if the dotGet was just a Word, bail - if (inner.type === 'Word') return inner + if (inner.type.is('Word')) return inner } inner ??= this.identifier() @@ -679,7 +679,7 @@ export class Parser { const prefix = SyntaxNode.from(this.expect($T.NamedArgPrefix)) const val = this.value() - if (!['Null', 'Boolean', 'Number', 'String'].includes(val.type)) + if (!['Null', 'Boolean', 'Number', 'String'].includes(val.type.name)) throw `[namedParam] default value must be Null|Bool|Num|Str, got ${val.type}\n\n ${this.input}\n` const node = new SyntaxNode('NamedParam', 
prefix.from, val.to) @@ -887,9 +887,9 @@ function collapseDotGets(origNodes: SyntaxNode[]): SyntaxNode { while (nodes.length > 0) { const left = nodes.pop()! - if (left.type === 'Identifier') left.type = 'IdentifierBeforeDot' + if (left.type.is('Identifier')) left.type = 'IdentifierBeforeDot' - const dot = new SyntaxNode("DotGet", left.from, right.to); + const dot = new SyntaxNode("DotGet", left.from, right.to) dot.push(left, right) right = dot -- 2.50.1 From 566beb87efd276e04a58691ecd61fead31cbd7ab Mon Sep 17 00:00:00 2001 From: Chris Wanstrath <2+defunkt@users.noreply.github.com> Date: Tue, 25 Nov 2025 13:16:41 -0800 Subject: [PATCH 06/35] `do` allowed in arg/dict values --- src/parser/parser2.ts | 36 ++++++++++++--------- src/parser/tests/functions.test.ts | 52 ++++++++++++++++++++++++++++++ src/parser/tests/literals.test.ts | 16 +++++++++ 3 files changed, 88 insertions(+), 16 deletions(-) diff --git a/src/parser/parser2.ts b/src/parser/parser2.ts index 64f62bc..c0ea85c 100644 --- a/src/parser/parser2.ts +++ b/src/parser/parser2.ts @@ -241,6 +241,22 @@ export class Parser { // parse specific nodes // + // raw determines whether we just want the SyntaxNodes or we want to + // wrap them in a PositionalArg + arg(raw = false): SyntaxNode { + // 'do' is a special function arg - it doesn't need to be wrapped + // in parens. otherwise, args are regular value()s + const val = this.is($T.Keyword, 'do') ? this.do() : this.value() + + if (raw) { + return val + } else { + const arg = new SyntaxNode('PositionalArg', val.from, val.to) + arg.add(val) + return arg + } + } + // [ 1 2 3 ] array(): SyntaxNode { const open = this.expect($T.OpenBracket) @@ -425,10 +441,7 @@ export class Parser { continue } - if (this.is($T.NamedArgPrefix)) - values.push(this.namedArg()) - else - values.push(this.value()) + values.push(this.is($T.NamedArgPrefix) ? this.namedArg() : this.arg()) } const close = this.expect($T.CloseBracket) @@ -549,17 +562,8 @@ export class Parser { const ident = fn ?? this.identifier() const args: SyntaxNode[] = [] - while (!this.isExprEnd() && !this.is($T.Operator, '|')) { - if (this.is($T.NamedArgPrefix)) { - args.push(this.namedArg()) - } else { - // 'do' is the only keyword allowed as a function argument - const val = this.is($T.Keyword, 'do') ? this.do() : this.value() - const arg = new SyntaxNode('PositionalArg', val.from, val.to) - arg.add(val) - args.push(arg) - } - } + while (!this.isExprEnd() && !this.is($T.Operator, '|')) + args.push(this.is($T.NamedArgPrefix) ? 
this.namedArg() : this.arg()) const node = new SyntaxNode('FunctionCall', ident.from, (args.at(-1) || ident).to) node.push(ident, ...args) @@ -669,7 +673,7 @@ export class Parser { // abc= true namedArg(): SyntaxNode { const prefix = SyntaxNode.from(this.expect($T.NamedArgPrefix)) - const val = this.value() + const val = this.arg(true) const node = new SyntaxNode('NamedArg', prefix.from, val.to) return node.push(prefix, val) } diff --git a/src/parser/tests/functions.test.ts b/src/parser/tests/functions.test.ts index d2d7f9f..ff39870 100644 --- a/src/parser/tests/functions.test.ts +++ b/src/parser/tests/functions.test.ts @@ -43,6 +43,58 @@ describe('calling functions', () => { `) }) + test('call with function', () => { + expect(`tail do x: x end`).toMatchTree(` + FunctionCall + Identifier tail + PositionalArg + FunctionDef + Do do + Params + Identifier x + colon : + FunctionCallOrIdentifier + Identifier x + keyword end + `) + }) + + test('call with arg and function', () => { + expect(`tail true do x: x end`).toMatchTree(` + FunctionCall + Identifier tail + PositionalArg + Boolean true + PositionalArg + FunctionDef + Do do + Params + Identifier x + colon : + FunctionCallOrIdentifier + Identifier x + keyword end + `) + }) + + test('call with function in named arg', () => { + expect(`tail callback=do x: x end`).toMatchTree(` + FunctionCall + Identifier tail + NamedArg + NamedArgPrefix callback= + FunctionDef + Do do + Params + Identifier x + colon : + FunctionCallOrIdentifier + Identifier x + keyword end + `) + }) + + test('command with arg that is also a command', () => { expect('tail tail').toMatchTree(` FunctionCall diff --git a/src/parser/tests/literals.test.ts b/src/parser/tests/literals.test.ts index 9173232..ba423ab 100644 --- a/src/parser/tests/literals.test.ts +++ b/src/parser/tests/literals.test.ts @@ -336,6 +336,22 @@ describe('dict literals', () => { `) }) + test('work with functions', () => { + expect(`[trap=do x: x end]`).toMatchTree(` + Dict + NamedArg + NamedArgPrefix trap= + FunctionDef + Do do + Params + Identifier x + colon : + FunctionCallOrIdentifier + Identifier x + keyword end + `) + }) + test('can be nested', () => { expect('[a=one b=[two [c=three]]]').toMatchTree(` Dict -- 2.50.1 From 579d755205ca3f8e210ca43d59fe7e0cb181dd06 Mon Sep 17 00:00:00 2001 From: Chris Wanstrath <2+defunkt@users.noreply.github.com> Date: Tue, 25 Nov 2025 13:27:53 -0800 Subject: [PATCH 07/35] make more compiler tests pass --- src/parser/node.ts | 6 ++++- src/parser/parser2.ts | 2 +- src/parser/tests/pipes.test.ts | 48 ++++++++++++++++++++++++++++++---- src/parser/tokenizer2.ts | 5 ++-- src/utils/tree.ts | 3 ++- 5 files changed, 54 insertions(+), 10 deletions(-) diff --git a/src/parser/node.ts b/src/parser/node.ts index 15dd8fc..7bca39c 100644 --- a/src/parser/node.ts +++ b/src/parser/node.ts @@ -58,6 +58,7 @@ export type NodeType = | 'Import' | 'Do' + | 'Underscore' | 'colon' | 'keyword' | 'operator' @@ -272,6 +273,9 @@ class SyntaxNodeType { case 'Do': return term.Do + case 'Underscore': + return term.Underscore + case 'colon': return term.colon @@ -355,7 +359,7 @@ export class SyntaxNode { } toString(): string { - return this.type + return this.type.name } } diff --git a/src/parser/parser2.ts b/src/parser/parser2.ts index c0ea85c..c978109 100644 --- a/src/parser/parser2.ts +++ b/src/parser/parser2.ts @@ -341,7 +341,7 @@ export class Parser { if (this.is($T.String)) return this.string() - if (this.isAny($T.Null, $T.Boolean, $T.Number, $T.Identifier, $T.Word, $T.Regex)) + if 
(this.isAny($T.Null, $T.Boolean, $T.Number, $T.Identifier, $T.Word, $T.Regex, $T.Underscore)) return SyntaxNode.from(this.next()) const next = this.next() diff --git a/src/parser/tests/pipes.test.ts b/src/parser/tests/pipes.test.ts index e1ed0a9..a359384 100644 --- a/src/parser/tests/pipes.test.ts +++ b/src/parser/tests/pipes.test.ts @@ -295,10 +295,10 @@ grep h`).toMatchTree(` test('lots of pipes', () => { expect(` -'this should help readability in long chains' - | split ' ' - | map (ref str.to-upper) - | join '-' +'this should help readability in long chains' + | split ' ' + | map (ref str.to-upper) + | join '-' | echo `).toMatchTree(` PipeExpr @@ -309,7 +309,7 @@ grep h`).toMatchTree(` Identifier split PositionalArg String - StringFragment + StringFragment (space) operator | FunctionCall Identifier map @@ -333,3 +333,41 @@ grep h`).toMatchTree(` `) }) }) + +describe('Underscore', () => { + test('works in pipes', () => { + expect(`sub 3 1 | div (sub 110 9 | sub 1) _ | div 5`).toMatchTree(` + PipeExpr + FunctionCall + Identifier sub + PositionalArg + Number 3 + PositionalArg + Number 1 + operator | + FunctionCall + Identifier div + PositionalArg + ParenExpr + PipeExpr + FunctionCall + Identifier sub + PositionalArg + Number 110 + PositionalArg + Number 9 + operator | + FunctionCall + Identifier sub + PositionalArg + Number 1 + PositionalArg + Underscore _ + operator | + FunctionCall + Identifier div + PositionalArg + Number 5 + `) + }) +}) \ No newline at end of file diff --git a/src/parser/tokenizer2.ts b/src/parser/tokenizer2.ts index 8674321..f0a8502 100644 --- a/src/parser/tokenizer2.ts +++ b/src/parser/tokenizer2.ts @@ -38,7 +38,8 @@ const valueTokens = new Set([ TokenType.Comment, TokenType.Keyword, TokenType.Operator, TokenType.Identifier, TokenType.Word, TokenType.NamedArgPrefix, - TokenType.Boolean, TokenType.Number, TokenType.String, TokenType.Regex + TokenType.Boolean, TokenType.Number, TokenType.String, TokenType.Regex, + TokenType.Underscore ]) const operators = new Set([ @@ -337,7 +338,7 @@ export class Scanner { // classify the token based on what we read if (word === '_') - this.pushChar(TokenType.Underscore) + this.push(TokenType.Underscore) else if (word === 'null') this.push(TokenType.Null) diff --git a/src/utils/tree.ts b/src/utils/tree.ts index c760082..75a5495 100644 --- a/src/utils/tree.ts +++ b/src/utils/tree.ts @@ -11,7 +11,8 @@ const nodeToString = (node: SyntaxNode, input: string, depth = 0): string => { return `${indent}${nodeName}` } else { // Only strip quotes from whole String nodes (legacy DoubleQuote), not StringFragment/EscapeSeq/CurlyString - const cleanText = nodeName === 'String' ? text.slice(1, -1) : text + let cleanText = nodeName === 'String' ? text.slice(1, -1) : text + if (cleanText === ' ') cleanText = '(space)' return cleanText ? `${indent}${nodeName} ${cleanText}` : `${indent}${nodeName}` } } -- 2.50.1 From d003d65a152257a1651606a9af0296f28c1fd9b7 Mon Sep 17 00:00:00 2001 From: Chris Wanstrath <2+defunkt@users.noreply.github.com> Date: Tue, 25 Nov 2025 13:27:56 -0800 Subject: [PATCH 08/35] disable errors... for now! 
--- src/compiler/compiler.ts | 16 +++++++++------- src/compiler/utils.ts | 17 +++++++++-------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/compiler/compiler.ts b/src/compiler/compiler.ts index fe18a09..b03b9b9 100644 --- a/src/compiler/compiler.ts +++ b/src/compiler/compiler.ts @@ -1,9 +1,10 @@ import { CompilerError } from '#compiler/compilerError.ts' +import { parse } from '#parser/parser2' +import { SyntaxNode, Tree } from '#parser/node' import { parser } from '#parser/shrimp.ts' import * as terms from '#parser/shrimp.terms' import { setGlobals } from '#parser/tokenizer' import { tokenizeCurlyString } from '#parser/curlyTokenizer' -import type { SyntaxNode, Tree } from '@lezer/common' import { assert, errorMessage } from '#utils/utils' import { toBytecode, type Bytecode, type ProgramItem, bytecodeToString } from 'reefvm' import { @@ -63,13 +64,14 @@ export class Compiler { constructor(public input: string, globals?: string[] | Record) { try { if (globals) setGlobals(Array.isArray(globals) ? globals : Object.keys(globals)) - const cst = parser.parse(input) - const errors = checkTreeForErrors(cst) + const ast = parse(input) + const cst = new Tree(ast) + // const errors = checkTreeForErrors(cst) - const firstError = errors[0] - if (firstError) { - throw firstError - } + // const firstError = errors[0] + // if (firstError) { + // throw firstError + // } this.#compileCst(cst, input) this.bytecode = toBytecode(this.instructions) diff --git a/src/compiler/utils.ts b/src/compiler/utils.ts index 446aab3..18dbda4 100644 --- a/src/compiler/utils.ts +++ b/src/compiler/utils.ts @@ -1,16 +1,17 @@ import { CompilerError } from '#compiler/compilerError.ts' import * as terms from '#parser/shrimp.terms' -import type { SyntaxNode, Tree } from '@lezer/common' +import type { SyntaxNode, Tree } from '#parser/node' export const checkTreeForErrors = (tree: Tree): CompilerError[] => { const errors: CompilerError[] = [] - tree.iterate({ - enter: (node) => { - if (node.type.isError) { - errors.push(new CompilerError(`Unexpected syntax.`, node.from, node.to)) - } - }, - }) + + // tree.iterate({ + // enter: (node) => { + // if (node.type.isError) { + // errors.push(new CompilerError(`Unexpected syntax.`, node.from, node.to)) + // } + // }, + // }) return errors } -- 2.50.1 From 6a6675d30f7de41d75b0a0d345cb81122ee57594 Mon Sep 17 00:00:00 2001 From: Chris Wanstrath <2+defunkt@users.noreply.github.com> Date: Tue, 25 Nov 2025 15:51:45 -0800 Subject: [PATCH 09/35] fix bitwise precedence --- src/parser/node.ts | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/parser/node.ts b/src/parser/node.ts index 7bca39c..48f4956 100644 --- a/src/parser/node.ts +++ b/src/parser/node.ts @@ -380,23 +380,25 @@ export const precedence: Record = { // Nullish coalescing '??': 35, + // Bitwise shifts (lower precedence than addition) + '<<': 37, + '>>': 37, + '>>>': 37, + // Addition/Subtraction '+': 40, '-': 40, + // Bitwise AND/OR/XOR (higher precedence than addition) + 'band': 45, + 'bor': 45, + 'bxor': 45, + // Multiplication/Division/Modulo '*': 50, '/': 50, '%': 50, - // Bitwise - 'band': 45, - 'bor': 45, - 'bxor': 45, - '<<': 45, - '>>': 45, - '>>>': 45, - // Exponentiation (right-associative) '**': 60, } -- 2.50.1 From 0e92525b54f4636f64ca2ce64b28c97ce849d715 Mon Sep 17 00:00:00 2001 From: Chris Wanstrath <2+defunkt@users.noreply.github.com> Date: Tue, 25 Nov 2025 16:00:06 -0800 Subject: [PATCH 10/35] regex flags, bad regexs become Words --- src/parser/tokenizer2.ts | 
23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/parser/tokenizer2.ts b/src/parser/tokenizer2.ts index f0a8502..317c580 100644 --- a/src/parser/tokenizer2.ts +++ b/src/parser/tokenizer2.ts @@ -390,7 +390,28 @@ export class Scanner { if (this.char === c`/` && this.peek() === c`/`) { this.next() // skip / this.next() // skip / - this.push(TokenType.Regex) + + // read regex flags + while (this.char > 0 && isIdentStart(this.char)) + this.next() + + // validate regex + const to = this.pos - getCharSize(this.char) + const regexText = this.input.slice(this.start, to) + const [_, pattern, flags] = regexText.match(/^\/\/(.*)\/\/([gimsuy]*)$/) || [] + + if (pattern) { + try { + new RegExp(pattern, flags) + this.push(TokenType.Regex) + break + } catch (e) { + // invalid regex - fall through to Word + } + } + + // invalid regex is treated as Word + this.push(TokenType.Word) break } -- 2.50.1 From 1682a7ccb71c38e4329836cb575492a2fb8561eb Mon Sep 17 00:00:00 2001 From: Chris Wanstrath <2+defunkt@users.noreply.github.com> Date: Tue, 25 Nov 2025 16:04:58 -0800 Subject: [PATCH 11/35] fix curly strings --- src/parser/tokenizer2.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/src/parser/tokenizer2.ts b/src/parser/tokenizer2.ts index 317c580..c3e891b 100644 --- a/src/parser/tokenizer2.ts +++ b/src/parser/tokenizer2.ts @@ -290,6 +290,7 @@ export class Scanner { } readCurlyString() { + this.start = this.pos - 1 let depth = 1 this.next() -- 2.50.1 From 2c2b277b29f8cddf4c028313e314806651b52fc1 Mon Sep 17 00:00:00 2001 From: Chris Wanstrath <2+defunkt@users.noreply.github.com> Date: Tue, 25 Nov 2025 16:08:14 -0800 Subject: [PATCH 12/35] `throw` takes an expression --- src/parser/parser2.ts | 2 +- src/parser/tests/exceptions.test.ts | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/parser/parser2.ts b/src/parser/parser2.ts index c978109..50a453d 100644 --- a/src/parser/parser2.ts +++ b/src/parser/parser2.ts @@ -729,7 +729,7 @@ export class Parser { // throw blah throw(): SyntaxNode { const keyword = this.keyword('throw') - const val = this.value() + const val = this.expression() const node = new SyntaxNode('Throw', keyword.from, val.to) return node.push(keyword, val) } diff --git a/src/parser/tests/exceptions.test.ts b/src/parser/tests/exceptions.test.ts index e89c80e..a0708f9 100644 --- a/src/parser/tests/exceptions.test.ts +++ b/src/parser/tests/exceptions.test.ts @@ -139,6 +139,18 @@ describe('try/catch/finally/throw', () => { `) }) + test('parses throw statement with BinOp', () => { + expect("throw 'error message:' + msg").toMatchTree(` + Throw + keyword throw + BinOp + String + StringFragment error message: + Plus + + Identifier msg + `) + }) + test('parses throw statement with identifier', () => { expect('throw error-object').toMatchTree(` Throw -- 2.50.1 From cc604bea49833236df6211942e382be73b06b853 Mon Sep 17 00:00:00 2001 From: Chris Wanstrath <2+defunkt@users.noreply.github.com> Date: Tue, 25 Nov 2025 16:27:18 -0800 Subject: [PATCH 13/35] fix dot.get + thing --- src/parser/parser2.ts | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/parser/parser2.ts b/src/parser/parser2.ts index 50a453d..f11579b 100644 --- a/src/parser/parser2.ts +++ b/src/parser/parser2.ts @@ -126,6 +126,10 @@ export class Parser { if (expr.type.is('ParenExpr') && !this.isExprEnd()) expr = this.functionCall(expr) + // if dotget is followed by binary operator, continue parsing as binary expression + if (expr.type.is('DotGet') 
&& this.is($T.Operator) && !this.is($T.Operator, '|')) + expr = this.dotGetBinOp(expr) + // one | echo if (allowPipe && this.isPipe()) return this.pipe(expr) @@ -535,10 +539,32 @@ export class Parser { return nodes ? node.push(left, nodes!) : node.push(left, ...parts) } + // continue parsing dotget/word binary operation + dotGetBinOp(left: SyntaxNode): SyntaxNode { + while (this.is($T.Operator) && !this.is($T.Operator, '|')) { + const op = this.current().value! + const bp = precedence[op] + if (bp === undefined) break + + const opNode = this.op() + const right = this.exprWithPrecedence(bp + 1) + + const nodeType = conditionals.has(op) ? 'ConditionalOp' : 'BinOp' + const node = new SyntaxNode(nodeType, left.from, right.to) + node.push(left, opNode, right) + left = node + } + return left + } + // dotget in a statement/expression (something.blah) or (something.blah arg1) dotGetFunctionCall(): SyntaxNode { const dotGet = this.dotGet() + // if followed by a binary operator (not pipe), return dotGet/Word as-is for expression parser + if (this.is($T.Operator) && !this.is($T.Operator, '|')) + return dotGet + // dotget not in scope, regular Word if (dotGet.type.is('Word')) return dotGet -- 2.50.1 From d0005d9ccd3f24726171f277df4a714863d9302a Mon Sep 17 00:00:00 2001 From: Chris Wanstrath <2+defunkt@users.noreply.github.com> Date: Tue, 25 Nov 2025 16:35:26 -0800 Subject: [PATCH 14/35] fix | --- src/parser/parser2.ts | 2 +- src/parser/tests/pipes.test.ts | 37 ++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/src/parser/parser2.ts b/src/parser/parser2.ts index f11579b..6244038 100644 --- a/src/parser/parser2.ts +++ b/src/parser/parser2.ts @@ -123,7 +123,7 @@ export class Parser { // check for parens function call // ex: (ref my-func) my-arg - if (expr.type.is('ParenExpr') && !this.isExprEnd()) + if (expr.type.is('ParenExpr') && !this.isExprEnd() && !this.is($T.Operator, '|')) expr = this.functionCall(expr) // if dotget is followed by binary operator, continue parsing as binary expression diff --git a/src/parser/tests/pipes.test.ts b/src/parser/tests/pipes.test.ts index a359384..44ba028 100644 --- a/src/parser/tests/pipes.test.ts +++ b/src/parser/tests/pipes.test.ts @@ -176,6 +176,43 @@ describe('pipe expressions', () => { Identifier echo `) }) + + test('parenthesized expressions can be piped', () => { + expect(`(1 + 2) | echo`).toMatchTree(` + PipeExpr + ParenExpr + BinOp + Number 1 + Plus + + Number 2 + operator | + FunctionCallOrIdentifier + Identifier echo + `) + }) + + test('complex parenthesized expressions with pipes', () => { + expect(`((math.random) * 10 + 1) | math.floor`).toMatchTree(` + PipeExpr + ParenExpr + BinOp + BinOp + ParenExpr + FunctionCallOrIdentifier + DotGet + IdentifierBeforeDot math + Identifier random + Star * + Number 10 + Plus + + Number 1 + operator | + FunctionCallOrIdentifier + DotGet + IdentifierBeforeDot math + Identifier floor + `) + }) }) describe('pipe continuation', () => { -- 2.50.1 From a836591854949549f52d11709cb48dd4c49c956f Mon Sep 17 00:00:00 2001 From: Chris Wanstrath <2+defunkt@users.noreply.github.com> Date: Tue, 25 Nov 2025 16:36:38 -0800 Subject: [PATCH 15/35] keywords are magical --- src/parser/tests/exceptions.test.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/parser/tests/exceptions.test.ts b/src/parser/tests/exceptions.test.ts index a0708f9..8b18ced 100644 --- a/src/parser/tests/exceptions.test.ts +++ b/src/parser/tests/exceptions.test.ts @@ -155,7 +155,8 @@ 
describe('try/catch/finally/throw', () => { expect('throw error-object').toMatchTree(` Throw keyword throw - Identifier error-object + FunctionCallOrIdentifier + Identifier error-object `) }) -- 2.50.1 From cbc75f5ed7d7f0c8e4c86cdd6615a53fd0ee674c Mon Sep 17 00:00:00 2001 From: Chris Wanstrath <2+defunkt@users.noreply.github.com> Date: Tue, 25 Nov 2025 16:41:42 -0800 Subject: [PATCH 16/35] use new parser in curlys --- src/parser/curlyTokenizer.ts | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/parser/curlyTokenizer.ts b/src/parser/curlyTokenizer.ts index 00e3ce1..9495bc0 100644 --- a/src/parser/curlyTokenizer.ts +++ b/src/parser/curlyTokenizer.ts @@ -1,5 +1,6 @@ import { parser } from '#parser/shrimp.ts' -import type { SyntaxNode } from '@lezer/common' +import { parse } from '#parser/parser2' +import type { SyntaxNode } from '#parser/node' import { isIdentStart, isIdentChar } from './tokenizer' // Turns a { curly string } into strings and nodes for interpolation @@ -37,7 +38,7 @@ export const tokenizeCurlyString = (value: string): (string | [string, SyntaxNod } const input = value.slice(start + 2, pos) // skip '$(' - tokens.push([input, parser.parse(input).topNode]) + tokens.push([input, parse(input)]) start = ++pos // skip ')' } else { char = value[++pos] @@ -48,7 +49,7 @@ export const tokenizeCurlyString = (value: string): (string | [string, SyntaxNod char = value[++pos] const input = value.slice(start + 1, pos) // skip '$' - tokens.push([input, parser.parse(input).topNode]) + tokens.push([input, parse(input)]) start = pos-- // backtrack and start over } } -- 2.50.1 From 0d3f9867e69c201118ab0c557081d06ce5576385 Mon Sep 17 00:00:00 2001 From: Chris Wanstrath <2+defunkt@users.noreply.github.com> Date: Tue, 25 Nov 2025 16:43:07 -0800 Subject: [PATCH 17/35] we get globals for free now --- src/prelude/tests/info.test.ts | 91 +++++++++++++++---------------- src/prelude/tests/load.test.ts | 19 +++---- src/prelude/tests/prelude.test.ts | 1 - 3 files changed, 54 insertions(+), 57 deletions(-) diff --git a/src/prelude/tests/info.test.ts b/src/prelude/tests/info.test.ts index ce97a35..5a75142 100644 --- a/src/prelude/tests/info.test.ts +++ b/src/prelude/tests/info.test.ts @@ -1,90 +1,89 @@ import { expect, describe, test } from 'bun:test' -import { globals } from '#prelude' describe('var and var?', () => { test('var? checks if a variable exists', async () => { - await expect(`var? 'nada'`).toEvaluateTo(false, globals) - await expect(`var? 'info'`).toEvaluateTo(false, globals) - await expect(`abc = abc; var? 'abc'`).toEvaluateTo(true, globals) - await expect(`var? 'var?'`).toEvaluateTo(true, globals) + await expect(`var? 'nada'`).toEvaluateTo(false) + await expect(`var? 'info'`).toEvaluateTo(false) + await expect(`abc = abc; var? 'abc'`).toEvaluateTo(true) + await expect(`var? 'var?'`).toEvaluateTo(true) - await expect(`var? 'dict'`).toEvaluateTo(true, globals) - await expect(`var? dict`).toEvaluateTo(true, globals) + await expect(`var? 'dict'`).toEvaluateTo(true) + await expect(`var? 
dict`).toEvaluateTo(true) }) test('var returns a value or null', async () => { - await expect(`var 'nada'`).toEvaluateTo(null, globals) - await expect(`var nada`).toEvaluateTo(null, globals) - await expect(`var 'info'`).toEvaluateTo(null, globals) - await expect(`abc = my-string; var 'abc'`).toEvaluateTo('my-string', globals) - await expect(`abc = my-string; var abc`).toEvaluateTo(null, globals) + await expect(`var 'nada'`).toEvaluateTo(null) + await expect(`var nada`).toEvaluateTo(null) + await expect(`var 'info'`).toEvaluateTo(null) + await expect(`abc = my-string; var 'abc'`).toEvaluateTo('my-string') + await expect(`abc = my-string; var abc`).toEvaluateTo(null) }) }) describe('type predicates', () => { test('string? checks for string type', async () => { - await expect(`string? 'hello'`).toEvaluateTo(true, globals) - await expect(`string? 42`).toEvaluateTo(false, globals) + await expect(`string? 'hello'`).toEvaluateTo(true) + await expect(`string? 42`).toEvaluateTo(false) }) test('number? checks for number type', async () => { - await expect(`number? 42`).toEvaluateTo(true, globals) - await expect(`number? 'hello'`).toEvaluateTo(false, globals) + await expect(`number? 42`).toEvaluateTo(true) + await expect(`number? 'hello'`).toEvaluateTo(false) }) test('boolean? checks for boolean type', async () => { - await expect(`boolean? true`).toEvaluateTo(true, globals) - await expect(`boolean? 42`).toEvaluateTo(false, globals) + await expect(`boolean? true`).toEvaluateTo(true) + await expect(`boolean? 42`).toEvaluateTo(false) }) test('array? checks for array type', async () => { - await expect(`array? [1 2 3]`).toEvaluateTo(true, globals) - await expect(`array? 42`).toEvaluateTo(false, globals) + await expect(`array? [1 2 3]`).toEvaluateTo(true) + await expect(`array? 42`).toEvaluateTo(false) }) test('dict? checks for dict type', async () => { - await expect(`dict? [a=1]`).toEvaluateTo(true, globals) - await expect(`dict? []`).toEvaluateTo(false, globals) + await expect(`dict? [a=1]`).toEvaluateTo(true) + await expect(`dict? []`).toEvaluateTo(false) }) test('null? checks for null type', async () => { - await expect(`null? null`).toEvaluateTo(true, globals) - await expect(`null? 42`).toEvaluateTo(false, globals) + await expect(`null? null`).toEvaluateTo(true) + await expect(`null? 42`).toEvaluateTo(false) }) test('some? checks for non-null', async () => { - await expect(`some? 42`).toEvaluateTo(true, globals) - await expect(`some? null`).toEvaluateTo(false, globals) + await expect(`some? 42`).toEvaluateTo(true) + await expect(`some? 
null`).toEvaluateTo(false) }) }) describe('introspection', () => { test('type returns proper types', async () => { - await expect(`type 'hello'`).toEvaluateTo('string', globals) - await expect(`type 42`).toEvaluateTo('number', globals) - await expect(`type true`).toEvaluateTo('boolean', globals) - await expect(`type false`).toEvaluateTo('boolean', globals) - await expect(`type null`).toEvaluateTo('null', globals) - await expect(`type [1 2 3]`).toEvaluateTo('array', globals) - await expect(`type [a=1 b=2]`).toEvaluateTo('dict', globals) + await expect(`type 'hello'`).toEvaluateTo('string') + await expect(`type 42`).toEvaluateTo('number') + await expect(`type true`).toEvaluateTo('boolean') + await expect(`type false`).toEvaluateTo('boolean') + await expect(`type null`).toEvaluateTo('null') + await expect(`type [1 2 3]`).toEvaluateTo('array') + await expect(`type [a=1 b=2]`).toEvaluateTo('dict') }) test('inspect formats values', async () => { - await expect(`inspect 'hello'`).toEvaluateTo("\u001b[32m'hello\u001b[32m'\u001b[0m", globals) + await expect(`inspect 'hello'`).toEvaluateTo("\u001b[32m'hello\u001b[32m'\u001b[0m") }) test('describe describes values', async () => { - await expect(`describe 'hello'`).toEvaluateTo("#", globals) + await expect(`describe 'hello'`).toEvaluateTo("#") }) }) describe('environment', () => { test('args is an array', async () => { - await expect(`array? $.args`).toEvaluateTo(true, globals) + await expect(`array? $.args`).toEvaluateTo(true) }) test('args can be accessed', async () => { - await expect(`type $.args`).toEvaluateTo('array', globals) + await expect(`type $.args`).toEvaluateTo('array') }) test('argv includes more than just the args', async () => { @@ -106,35 +105,35 @@ describe('ref', () => { describe('$ global dictionary', () => { test('$.args is an array', async () => { - await expect(`$.args | array?`).toEvaluateTo(true, globals) + await expect(`$.args | array?`).toEvaluateTo(true) }) test('$.args can be accessed', async () => { - await expect(`$.args | type`).toEvaluateTo('array', globals) + await expect(`$.args | type`).toEvaluateTo('array') }) test('$.script.name is a string', async () => { - await expect(`$.script.name | string?`).toEvaluateTo(true, globals) + await expect(`$.script.name | string?`).toEvaluateTo(true) }) test('$.script.path is a string', async () => { - await expect(`$.script.path | string?`).toEvaluateTo(true, globals) + await expect(`$.script.path | string?`).toEvaluateTo(true) }) test('$.env is a dict', async () => { - await expect(`$.env | dict?`).toEvaluateTo(true, globals) + await expect(`$.env | dict?`).toEvaluateTo(true) }) test('$.pid is a number', async () => { - await expect(`$.pid | number?`).toEvaluateTo(true, globals) - await expect(`$.pid > 0`).toEvaluateTo(true, globals) + await expect(`$.pid | number?`).toEvaluateTo(true) + await expect(`$.pid > 0`).toEvaluateTo(true) }) test('$.cwd is a string', async () => { - await expect(`$.cwd | string?`).toEvaluateTo(true, globals) + await expect(`$.cwd | string?`).toEvaluateTo(true) }) test('$.cwd returns current working directory', async () => { - await expect(`$.cwd`).toEvaluateTo(process.cwd(), globals) + await expect(`$.cwd`).toEvaluateTo(process.cwd()) }) }) diff --git a/src/prelude/tests/load.test.ts b/src/prelude/tests/load.test.ts index f79326c..c75a035 100644 --- a/src/prelude/tests/load.test.ts +++ b/src/prelude/tests/load.test.ts @@ -1,42 +1,41 @@ import { expect, describe, test } from 'bun:test' -import { globals } from '#prelude' describe('loading a file', () 
=> { test(`imports all a file's functions`, async () => { expect(` math = load ./src/prelude/tests/math.sh math.double 4 - `).toEvaluateTo(8, globals) + `).toEvaluateTo(8) expect(` math = load ./src/prelude/tests/math.sh - math.double (math.double 4) - `).toEvaluateTo(16, globals) + math.double (math.double 4) + `).toEvaluateTo(16) expect(` math = load ./src/prelude/tests/math.sh - dbl = ref math.double + dbl = ref math.double dbl (dbl 2) - `).toEvaluateTo(8, globals) + `).toEvaluateTo(8) expect(` math = load ./src/prelude/tests/math.sh math.pi - `).toEvaluateTo(3.14, globals) + `).toEvaluateTo(3.14) expect(` math = load ./src/prelude/tests/math.sh math | at 🥧 - `).toEvaluateTo(3.14159265359, globals) + `).toEvaluateTo(3.14159265359) expect(` math = load ./src/prelude/tests/math.sh math.🥧 - `).toEvaluateTo(3.14159265359, globals) + `).toEvaluateTo(3.14159265359) expect(` math = load ./src/prelude/tests/math.sh math.add1 5 - `).toEvaluateTo(6, globals) + `).toEvaluateTo(6) }) }) diff --git a/src/prelude/tests/prelude.test.ts b/src/prelude/tests/prelude.test.ts index 27bb919..3db92e6 100644 --- a/src/prelude/tests/prelude.test.ts +++ b/src/prelude/tests/prelude.test.ts @@ -1,5 +1,4 @@ import { expect, describe, test } from 'bun:test' -import { globals } from '#prelude' describe('string operations', () => { test('to-upper converts to uppercase', async () => { -- 2.50.1 From ae9896c8a2db07be272c9002c0f7cb3e52ac0e9d Mon Sep 17 00:00:00 2001 From: Chris Wanstrath <2+defunkt@users.noreply.github.com> Date: Tue, 25 Nov 2025 16:50:25 -0800 Subject: [PATCH 18/35] switch bin/shrimp to new parser --- bin/shrimp | 4 ++-- src/index.ts | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/bin/shrimp b/bin/shrimp index 6e5d502..0754565 100755 --- a/bin/shrimp +++ b/bin/shrimp @@ -1,7 +1,7 @@ #!/usr/bin/env bun import { colors, globals as prelude } from '../src/prelude' -import { treeToString } from '../src/utils/tree' +import { treeToString2 } from '../src/utils/tree' import { runCode, runFile, compileFile, parseCode } from '../src' import { resolve } from 'path' import { bytecodeToString } from 'reefvm' @@ -143,7 +143,7 @@ async function main() { process.exit(1) } const input = readFileSync(file, 'utf-8') - console.log(treeToString(parseCode(input), input)) + console.log(treeToString2(parseCode(input), input)) return } diff --git a/src/index.ts b/src/index.ts index 47f5444..f77f99b 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,15 +1,15 @@ import { readFileSync } from 'fs' import { VM, fromValue, toValue, isValue, type Bytecode } from 'reefvm' -import { type Tree } from '@lezer/common' import { Compiler } from '#compiler/compiler' -import { parser } from '#parser/shrimp' +import { parse } from '#parser/parser2' +import { type SyntaxNode, Tree } from '#parser/node' import { globals as parserGlobals, setGlobals as setParserGlobals } from '#parser/tokenizer' import { globals as prelude } from '#prelude' export { Compiler } from '#compiler/compiler' -export { parser } from '#parser/shrimp' +export { parse } from '#parser/parser2' +export { type SyntaxNode, Tree } from '#parser/node' export { globals as prelude } from '#prelude' -export type { Tree } from '@lezer/common' export { type Value, type Bytecode } from 'reefvm' export { toValue, fromValue, isValue, Scope, VM, bytecodeToString } from 'reefvm' @@ -41,7 +41,7 @@ export class Shrimp { return isValue(result) ? 
fromValue(result, this.vm) : result } - parse(code: string): Tree { + parse(code: string): SyntaxNode { return parseCode(code, this.globals) } @@ -95,17 +95,17 @@ export function compileCode(code: string, globals?: Record): Byteco return compiler.bytecode } -export function parseFile(path: string, globals?: Record): Tree { +export function parseFile(path: string, globals?: Record): SyntaxNode { const code = readFileSync(path, 'utf-8') return parseCode(code, globals) } -export function parseCode(code: string, globals?: Record): Tree { +export function parseCode(code: string, globals?: Record): SyntaxNode { const oldGlobals = [...parserGlobals] const globalNames = [...Object.keys(prelude), ...(globals ? Object.keys(globals) : [])] setParserGlobals(globalNames) - const result = parser.parse(code) + const result = parse(code) setParserGlobals(oldGlobals) return result -- 2.50.1 From 1ea130f8e04dfd0b3695d91c2117c5cbf8bb0eea Mon Sep 17 00:00:00 2001 From: Chris Wanstrath <2+defunkt@users.noreply.github.com> Date: Tue, 25 Nov 2025 16:53:17 -0800 Subject: [PATCH 19/35] pipes end expressions --- src/parser/parser2.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/parser/parser2.ts b/src/parser/parser2.ts index 6244038..862f4b9 100644 --- a/src/parser/parser2.ts +++ b/src/parser/parser2.ts @@ -123,7 +123,7 @@ export class Parser { // check for parens function call // ex: (ref my-func) my-arg - if (expr.type.is('ParenExpr') && !this.isExprEnd() && !this.is($T.Operator, '|')) + if (expr.type.is('ParenExpr') && !this.isExprEnd()) expr = this.functionCall(expr) // if dotget is followed by binary operator, continue parsing as binary expression @@ -588,7 +588,7 @@ export class Parser { const ident = fn ?? this.identifier() const args: SyntaxNode[] = [] - while (!this.isExprEnd() && !this.is($T.Operator, '|')) + while (!this.isExprEnd()) args.push(this.is($T.NamedArgPrefix) ? this.namedArg() : this.arg()) const node = new SyntaxNode('FunctionCall', ident.from, (args.at(-1) || ident).to) @@ -869,6 +869,7 @@ export class Parser { isExprEnd(): boolean { return this.isAny($T.Colon, $T.Semicolon, $T.Newline, $T.CloseParen, $T.CloseBracket) || + this.is($T.Operator, '|') || this.isExprEndKeyword() || !this.current() } -- 2.50.1 From f160093c4d679d00bd5a668df6e330a4ff43cab4 Mon Sep 17 00:00:00 2001 From: Chris Wanstrath <2+defunkt@users.noreply.github.com> Date: Tue, 25 Nov 2025 16:57:18 -0800 Subject: [PATCH 20/35] match lezer API --- src/index.ts | 4 ++-- src/parser/node.ts | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/index.ts b/src/index.ts index f77f99b..8de5373 100644 --- a/src/index.ts +++ b/src/index.ts @@ -100,7 +100,7 @@ export function parseFile(path: string, globals?: Record): SyntaxNo return parseCode(code, globals) } -export function parseCode(code: string, globals?: Record): SyntaxNode { +export function parseCode(code: string, globals?: Record): Tree { const oldGlobals = [...parserGlobals] const globalNames = [...Object.keys(prelude), ...(globals ? 
Object.keys(globals) : [])] @@ -108,5 +108,5 @@ export function parseCode(code: string, globals?: Record): SyntaxNo const result = parse(code) setParserGlobals(oldGlobals) - return result + return new Tree(result) } \ No newline at end of file diff --git a/src/parser/node.ts b/src/parser/node.ts index 48f4956..67838ef 100644 --- a/src/parser/node.ts +++ b/src/parser/node.ts @@ -114,6 +114,19 @@ export const operators: Record = { export class Tree { constructor(public topNode: SyntaxNode) { } + + get length(): number { + return this.topNode.to + } + + cursor() { + return { + type: this.topNode.type, + from: this.topNode.from, + to: this.topNode.to, + node: this.topNode, + } + } } // TODO: TEMPORARY SHIM -- 2.50.1 From 5ad6125527a258cfbb30681f604501676aad54b7 Mon Sep 17 00:00:00 2001 From: Chris Wanstrath <2+defunkt@users.noreply.github.com> Date: Tue, 25 Nov 2025 16:57:43 -0800 Subject: [PATCH 21/35] you too --- src/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/index.ts b/src/index.ts index 8de5373..bb20d1b 100644 --- a/src/index.ts +++ b/src/index.ts @@ -95,7 +95,7 @@ export function compileCode(code: string, globals?: Record): Byteco return compiler.bytecode } -export function parseFile(path: string, globals?: Record): SyntaxNode { +export function parseFile(path: string, globals?: Record): Tree { const code = readFileSync(path, 'utf-8') return parseCode(code, globals) } -- 2.50.1 From b2d298ec6fb920751d3b6ffe4f40d9cd9c249ffd Mon Sep 17 00:00:00 2001 From: Chris Wanstrath Date: Tue, 2 Dec 2025 15:58:50 -0800 Subject: [PATCH 22/35] fix search/replace --- src/editor/commands.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/editor/commands.ts b/src/editor/commands.ts index 2021954..021aefb 100644 --- a/src/editor/commands.ts +++ b/src/editor/commands.ts @@ -245,7 +245,7 @@ const commandShapes: CommandShape[] = [ ] as const let commandSource = () => commandShapes -export const setCommandSource = (do: () => CommandShape[]) => { +export const setCommandSource = (fn: () => CommandShape[]) => { commandSource = fn } -- 2.50.1 From 04e14cd83e92544649f86898a3067486e5772884 Mon Sep 17 00:00:00 2001 From: Chris Wanstrath Date: Tue, 2 Dec 2025 15:58:57 -0800 Subject: [PATCH 23/35] wrong return type --- src/index.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/index.ts b/src/index.ts index bb20d1b..c82bb5e 100644 --- a/src/index.ts +++ b/src/index.ts @@ -2,7 +2,7 @@ import { readFileSync } from 'fs' import { VM, fromValue, toValue, isValue, type Bytecode } from 'reefvm' import { Compiler } from '#compiler/compiler' import { parse } from '#parser/parser2' -import { type SyntaxNode, Tree } from '#parser/node' +import { Tree } from '#parser/node' import { globals as parserGlobals, setGlobals as setParserGlobals } from '#parser/tokenizer' import { globals as prelude } from '#prelude' @@ -41,7 +41,7 @@ export class Shrimp { return isValue(result) ? 
fromValue(result, this.vm) : result } - parse(code: string): SyntaxNode { + parse(code: string): Tree { return parseCode(code, this.globals) } -- 2.50.1 From 728c5df9eba863676a020568fa3f3d6dbc9f03f5 Mon Sep 17 00:00:00 2001 From: Chris Wanstrath Date: Tue, 2 Dec 2025 15:59:01 -0800 Subject: [PATCH 24/35] bun check --- package.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/package.json b/package.json index 1c67e2d..c5d783f 100644 --- a/package.json +++ b/package.json @@ -10,7 +10,8 @@ "repl": "bun generate-parser && bun bin/repl", "update-reef": "rm -rf ~/.bun/install/cache/ && rm bun.lock && bun update reefvm", "cli:install": "ln -s \"$(pwd)/bin/shrimp\" ~/.bun/bin/shrimp", - "cli:remove": "rm ~/.bun/bin/shrimp" + "cli:remove": "rm ~/.bun/bin/shrimp", + "check": "bunx tsc --noEmit" }, "dependencies": { "@codemirror/view": "^6.38.3", -- 2.50.1 From 688181654e96f50cea8d5e6fe1167691174636e8 Mon Sep 17 00:00:00 2001 From: Chris Wanstrath Date: Tue, 2 Dec 2025 16:46:34 -0800 Subject: [PATCH 25/35] enable [ a = true ] --- src/parser/node.ts | 1 + src/parser/parser2.ts | 21 ++++++++++++++++++++- src/parser/tests/literals.test.ts | 20 ++++++++++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/src/parser/node.ts b/src/parser/node.ts index 67838ef..a08e9c8 100644 --- a/src/parser/node.ts +++ b/src/parser/node.ts @@ -10,6 +10,7 @@ export type NodeType = | 'FunctionCallWithBlock' | 'PositionalArg' | 'NamedArg' + | 'NamedArgPrefix' | 'FunctionDef' | 'Params' diff --git a/src/parser/parser2.ts b/src/parser/parser2.ts index 862f4b9..d4afbf7 100644 --- a/src/parser/parser2.ts +++ b/src/parser/parser2.ts @@ -305,6 +305,13 @@ export class Parser { break } + // [ a = true ] + const next = this.peek(peek) + if (next?.type === $T.Operator && next.value === '=') { + isDict = true + break + } + // probably an array if (curr.type !== $T.Comment && curr.type !== $T.Semicolon && curr.type !== $T.Newline) break @@ -445,7 +452,19 @@ export class Parser { continue } - values.push(this.is($T.NamedArgPrefix) ? this.namedArg() : this.arg()) + // check for named arg with space after it (vs connected) + if (this.nextIs($T.Operator, '=')) { + const ident = this.identifier() + const op = this.op('=') + const val = this.arg(true) + const prefix = new SyntaxNode('NamedArgPrefix', ident.from, op.to) + const node = new SyntaxNode('NamedArg', ident.from, val.to) + node.add(prefix) + node.add(val) + values.push(node) + } else { + values.push(this.is($T.NamedArgPrefix) ? 
this.namedArg() : this.arg()) + } } const close = this.expect($T.CloseBracket) diff --git a/src/parser/tests/literals.test.ts b/src/parser/tests/literals.test.ts index ba423ab..44e2794 100644 --- a/src/parser/tests/literals.test.ts +++ b/src/parser/tests/literals.test.ts @@ -387,6 +387,26 @@ describe('dict literals', () => { Number 3 `) }) + + test('can have spaces between equals', () => { + expect(`[ + a = 1 + b = 2 + c = 3 + ]`).toMatchTree(` + Dict + NamedArg + NamedArgPrefix a = + Number 1 + NamedArg + NamedArgPrefix b = + Number 2 + NamedArg + NamedArgPrefix c = + Number 3 + `) + }) + test('empty dict', () => { expect('[=]').toMatchTree(` Dict [=] -- 2.50.1 From cb7cdaea62879e39f74856b1733e474d8efdd302 Mon Sep 17 00:00:00 2001 From: Chris Wanstrath Date: Tue, 2 Dec 2025 16:49:47 -0800 Subject: [PATCH 26/35] trim keys in inspect --- src/prelude/index.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/prelude/index.ts b/src/prelude/index.ts index 78c1355..cf6e5a5 100644 --- a/src/prelude/index.ts +++ b/src/prelude/index.ts @@ -191,8 +191,8 @@ export function formatValue(value: Value, inner = false): string { return `${colors.blue}[${colors.reset}${items}${colors.blue}]${colors.reset}` } case 'dict': { - const entries = Array.from(value.value.entries()) - .map(([k, v]) => `${k}${colors.blue}=${colors.reset}${formatValue(v, true)}`) + const entries = Array.from(value.value.entries()).reverse() + .map(([k, v]) => `${k.trim()}${colors.blue}=${colors.reset}${formatValue(v, true)}`) .join(' ') if (entries.length === 0) return `${colors.blue}[=]${colors.reset}` -- 2.50.1 From 757a50e23ebe1352dcb1bbc289bb47d2e488e8d6 Mon Sep 17 00:00:00 2001 From: Chris Wanstrath Date: Tue, 2 Dec 2025 16:54:24 -0800 Subject: [PATCH 27/35] fix `./bin/shrimp parse` --- bin/shrimp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/shrimp b/bin/shrimp index 0754565..213e4cb 100755 --- a/bin/shrimp +++ b/bin/shrimp @@ -143,7 +143,7 @@ async function main() { process.exit(1) } const input = readFileSync(file, 'utf-8') - console.log(treeToString2(parseCode(input), input)) + console.log(treeToString2(parseCode(input).topNode, input)) return } -- 2.50.1 From 21e7ed41af398e0d7a8f51fc76a0288adddf3c37 Mon Sep 17 00:00:00 2001 From: Chris Wanstrath Date: Wed, 3 Dec 2025 13:40:04 -0800 Subject: [PATCH 28/35] restore errors, fancy printing --- src/compiler/compiler.ts | 10 +++--- src/compiler/utils.ts | 20 ++++++------ src/parser/node.ts | 17 ++++++++-- src/parser/parser2.ts | 52 ++++++++++++++++++++++-------- src/parser/tests/functions.test.ts | 4 +-- src/utils/tree.ts | 17 ++++++++-- 6 files changed, 84 insertions(+), 36 deletions(-) diff --git a/src/compiler/compiler.ts b/src/compiler/compiler.ts index b03b9b9..73db0f5 100644 --- a/src/compiler/compiler.ts +++ b/src/compiler/compiler.ts @@ -66,12 +66,12 @@ export class Compiler { if (globals) setGlobals(Array.isArray(globals) ? 
globals : Object.keys(globals)) const ast = parse(input) const cst = new Tree(ast) - // const errors = checkTreeForErrors(cst) + const errors = checkTreeForErrors(cst) - // const firstError = errors[0] - // if (firstError) { - // throw firstError - // } + const firstError = errors[0] + if (firstError) { + throw firstError + } this.#compileCst(cst, input) this.bytecode = toBytecode(this.instructions) diff --git a/src/compiler/utils.ts b/src/compiler/utils.ts index 18dbda4..c839644 100644 --- a/src/compiler/utils.ts +++ b/src/compiler/utils.ts @@ -5,13 +5,13 @@ import type { SyntaxNode, Tree } from '#parser/node' export const checkTreeForErrors = (tree: Tree): CompilerError[] => { const errors: CompilerError[] = [] - // tree.iterate({ - // enter: (node) => { - // if (node.type.isError) { - // errors.push(new CompilerError(`Unexpected syntax.`, node.from, node.to)) - // } - // }, - // }) + tree.iterate({ + enter: (node) => { + if (node.type.isError) { + errors.push(new CompilerError(`Unexpected syntax.`, node.from, node.to)) + } + }, + }) return errors } @@ -58,8 +58,7 @@ export const getAssignmentParts = (node: SyntaxNode) => { if (!left || left.type.id !== terms.AssignableIdentifier) { throw new CompilerError( - `Assign left child must be an AssignableIdentifier or Array, got ${ - left ? left.type.name : 'none' + `Assign left child must be an AssignableIdentifier or Array, got ${left ? left.type.name : 'none' }`, node.from, node.to @@ -75,8 +74,7 @@ export const getCompoundAssignmentParts = (node: SyntaxNode) => { if (!left || left.type.id !== terms.AssignableIdentifier) { throw new CompilerError( - `CompoundAssign left child must be an AssignableIdentifier, got ${ - left ? left.type.name : 'none' + `CompoundAssign left child must be an AssignableIdentifier, got ${left ? 
left.type.name : 'none' }`, node.from, node.to diff --git a/src/parser/node.ts b/src/parser/node.ts index a08e9c8..137365f 100644 --- a/src/parser/node.ts +++ b/src/parser/node.ts @@ -128,6 +128,15 @@ export class Tree { node: this.topNode, } } + + iterate(options: { enter: (node: SyntaxNode) => void }) { + const iter = (node: SyntaxNode) => { + for (const n of node.children) iter(n) + options.enter(node) + } + + iter(this.topNode) + } } // TODO: TEMPORARY SHIM @@ -295,7 +304,6 @@ class SyntaxNodeType { case 'keyword': return term.keyword - } return 0 } @@ -307,6 +315,7 @@ class SyntaxNodeType { export class SyntaxNode { #type: NodeType + #isError = false from: number to: number parent: SyntaxNode | null @@ -336,7 +345,11 @@ export class SyntaxNode { } get isError(): boolean { - return false + return this.#isError + } + + set isError(err: boolean) { + this.#isError = err } get firstChild(): SyntaxNode | null { diff --git a/src/parser/parser2.ts b/src/parser/parser2.ts index d4afbf7..dbaac2b 100644 --- a/src/parser/parser2.ts +++ b/src/parser/parser2.ts @@ -1,7 +1,9 @@ +import { CompilerError } from '#compiler/compilerError' import { Scanner, type Token, TokenType } from './tokenizer2' import { SyntaxNode, operators, precedence, conditionals, compounds } from './node' import { globals } from './tokenizer' import { parseString } from './stringParser' +import { Compiler } from '#compiler/compiler' const $T = TokenType @@ -256,6 +258,7 @@ export class Parser { return val } else { const arg = new SyntaxNode('PositionalArg', val.from, val.to) + if (val.isError) arg.isError = true arg.add(val) return arg } @@ -356,7 +359,7 @@ export class Parser { return SyntaxNode.from(this.next()) const next = this.next() - throw `[atom] unexpected token ${TokenType[next.type]}: ${JSON.stringify(next)}\n\n ${this.input}\n` + throw new CompilerError(`Unexpected token: ${TokenType[next.type]}`, next.from, next.to) } // blocks in if, do, special calls, etc @@ -432,6 +435,7 @@ export class Parser { // [ a=1 b=true c='three' ] dict(): SyntaxNode { const open = this.expect($T.OpenBracket) + let isError = false // empty dict [=] or [ = ] if (this.is($T.Operator, '=') && this.nextIs($T.CloseBracket)) { @@ -456,20 +460,29 @@ export class Parser { if (this.nextIs($T.Operator, '=')) { const ident = this.identifier() const op = this.op('=') - const val = this.arg(true) const prefix = new SyntaxNode('NamedArgPrefix', ident.from, op.to) - const node = new SyntaxNode('NamedArg', ident.from, val.to) - node.add(prefix) - node.add(val) - values.push(node) + + if (this.is($T.CloseBracket) || this.is($T.Semicolon) || this.is($T.Newline)) { + const node = new SyntaxNode('NamedArg', ident.from, op.to) + node.isError = true + isError = true + values.push(node.push(prefix)) + } else { + const val = this.arg(true) + const node = new SyntaxNode('NamedArg', ident.from, val.to) + values.push(node.push(prefix, val)) + } } else { - values.push(this.is($T.NamedArgPrefix) ? this.namedArg() : this.arg()) + const arg = this.is($T.NamedArgPrefix) ? 
this.namedArg() : this.arg() + if (arg.isError) isError = true + values.push(arg) } } const close = this.expect($T.CloseBracket) const node = new SyntaxNode('Dict', open.from, close.to) + node.isError = isError return node.push(...values) } @@ -491,7 +504,7 @@ export class Parser { else if (this.is($T.NamedArgPrefix)) arg = this.namedParam() else - throw `[do] expected Identifier or NamedArgPrefix, got ${JSON.stringify(this.current())}\n\n ${this.input}\n` + throw new CompilerError(`Expected Identifier or NamedArgPrefix, got ${TokenType[this.current().type]}`, this.current().from, this.current().to) params.push(arg) } @@ -605,14 +618,20 @@ export class Parser { // you're lookin at it functionCall(fn?: SyntaxNode): SyntaxNode { const ident = fn ?? this.identifier() + let isError = false const args: SyntaxNode[] = [] - while (!this.isExprEnd()) - args.push(this.is($T.NamedArgPrefix) ? this.namedArg() : this.arg()) + while (!this.isExprEnd()) { + const arg = this.is($T.NamedArgPrefix) ? this.namedArg() : this.arg() + if (arg.isError) isError = true + args.push(arg) + } const node = new SyntaxNode('FunctionCall', ident.from, (args.at(-1) || ident).to) node.push(ident, ...args) + if (isError) node.isError = true + if (!this.inTestExpr && this.is($T.Colon)) { const block = this.block() const end = this.keyword('end') @@ -718,6 +737,13 @@ export class Parser { // abc= true namedArg(): SyntaxNode { const prefix = SyntaxNode.from(this.expect($T.NamedArgPrefix)) + + if (this.isExprEnd()) { + const node = new SyntaxNode('NamedArg', prefix.from, prefix.to) + node.isError = true + return node.push(prefix) + } + const val = this.arg(true) const node = new SyntaxNode('NamedArg', prefix.from, val.to) return node.push(prefix, val) @@ -729,7 +755,7 @@ export class Parser { const val = this.value() if (!['Null', 'Boolean', 'Number', 'String'].includes(val.type.name)) - throw `[namedParam] default value must be Null|Bool|Num|Str, got ${val.type}\n\n ${this.input}\n` + throw new CompilerError(`Default value must be null, boolean, number, or string, got ${val.type.name}`, val.from, val.to) const node = new SyntaxNode('NamedParam', prefix.from, val.to) return node.push(prefix, val) @@ -739,7 +765,7 @@ export class Parser { op(op?: string): SyntaxNode { const token = op ? this.expect($T.Operator, op) : this.expect($T.Operator) const name = operators[token.value!] - if (!name) throw `[op] operator not registered: ${token.value!}\n\n ${this.input}\n` + if (!name) throw new CompilerError(`Operator not registered: ${token.value!}`, token.from, token.to) return new SyntaxNode(name, token.from, token.to) } @@ -919,7 +945,7 @@ export class Parser { expect(type: TokenType, value?: string): Token | never { if (!this.is(type, value)) { const token = this.current() - throw `expected ${TokenType[type]}${value ? ` "${value}"` : ''}, got ${TokenType[token?.type || 0]}${token?.value ? ` "${token.value}"` : ''} at position ${this.pos}\n\n ${this.input}\n` + throw new CompilerError(`Expected ${TokenType[type]}${value ? ` "${value}"` : ''}, got ${TokenType[token?.type || 0]}${token?.value ? 
` "${token.value}"` : ''} at position ${this.pos}`, token.from, token.to) } return this.next() } diff --git a/src/parser/tests/functions.test.ts b/src/parser/tests/functions.test.ts index ff39870..6312529 100644 --- a/src/parser/tests/functions.test.ts +++ b/src/parser/tests/functions.test.ts @@ -109,14 +109,14 @@ describe('calling functions', () => { `) }) - test.skip('Incomplete namedArg', () => { + test('Incomplete namedArg', () => { expect('tail lines=').toMatchTree(` FunctionCall Identifier tail NamedArg NamedArgPrefix lines= ⚠ - ⚠ `) + ⚠`) }) }) diff --git a/src/utils/tree.ts b/src/utils/tree.ts index 75a5495..3535d58 100644 --- a/src/utils/tree.ts +++ b/src/utils/tree.ts @@ -24,10 +24,21 @@ export const treeToString2 = (tree: SyntaxNode, input: string, depth = 0): strin if (node.name === 'Program') node = node.firstChild while (node) { - lines.push(nodeToString(node, input, depth)) + // If this node is an error, print ⚠ instead of its content + if (node.isError && !node.firstChild) { + lines.push(' '.repeat(depth) + '⚠') + } else { + lines.push(nodeToString(node, input, depth)) - if (node.firstChild) - lines.push(treeToString2(node.firstChild, input, depth + 1)) + if (node.firstChild) { + lines.push(treeToString2(node.firstChild, input, depth + 1)) + } + + // If this node has an error, add ⚠ after its children + if (node.isError && node.firstChild) { + lines.push(' '.repeat(depth === 0 ? 0 : depth + 1) + '⚠') + } + } node = node.nextSibling } -- 2.50.1 From 9b1890a3dba7c5c49a56a96046512401166cf606 Mon Sep 17 00:00:00 2001 From: Chris Wanstrath Date: Wed, 3 Dec 2025 13:40:17 -0800 Subject: [PATCH 29/35] no need --- src/parser/parser2.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/src/parser/parser2.ts b/src/parser/parser2.ts index dbaac2b..f57bbfe 100644 --- a/src/parser/parser2.ts +++ b/src/parser/parser2.ts @@ -3,7 +3,6 @@ import { Scanner, type Token, TokenType } from './tokenizer2' import { SyntaxNode, operators, precedence, conditionals, compounds } from './node' import { globals } from './tokenizer' import { parseString } from './stringParser' -import { Compiler } from '#compiler/compiler' const $T = TokenType -- 2.50.1 From ef20c67e61fd1bc1bce585ff5166b4a19fb967d4 Mon Sep 17 00:00:00 2001 From: Chris Wanstrath <2+defunkt@users.noreply.github.com> Date: Wed, 3 Dec 2025 15:40:37 -0800 Subject: [PATCH 30/35] --version, for now --- bin/shrimp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/shrimp b/bin/shrimp index 213e4cb..7aae2a2 100755 --- a/bin/shrimp +++ b/bin/shrimp @@ -31,7 +31,7 @@ ${colors.bright}Options:${colors.reset} } function showVersion() { - console.log('🦐 v0.0.1') + console.log('🦐 v0.0.1 (non-lezer parser)') } async function evalCode(code: string, imports: string[]) { -- 2.50.1 From 07a42d9767a2ed4fd84524ab6a499d2a30d6e385 Mon Sep 17 00:00:00 2001 From: Chris Wanstrath <2+defunkt@users.noreply.github.com> Date: Wed, 3 Dec 2025 15:52:20 -0800 Subject: [PATCH 31/35] ignore trailing whitespace in dict key name --- src/compiler/compiler.ts | 2 +- src/compiler/tests/literals.test.ts | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/compiler/compiler.ts b/src/compiler/compiler.ts index 73db0f5..11105cd 100644 --- a/src/compiler/compiler.ts +++ b/src/compiler/compiler.ts @@ -807,7 +807,7 @@ export class Compiler { const valueNode = node.firstChild!.nextSibling // name= -> name - const key = input.slice(keyNode!.from, keyNode!.to).slice(0, -1) + const key = input.slice(keyNode!.from, 
keyNode!.to).replace(/\s*=$/, '') instructions.push(['PUSH', key]) instructions.push(...this.#compileNode(valueNode!, input)) diff --git a/src/compiler/tests/literals.test.ts b/src/compiler/tests/literals.test.ts index 1783830..c398fae 100644 --- a/src/compiler/tests/literals.test.ts +++ b/src/compiler/tests/literals.test.ts @@ -151,18 +151,22 @@ describe('array literals', () => { describe('dict literals', () => { test('work with numbers', () => { expect('[a=1 b=2 c=3]').toEvaluateTo({ a: 1, b: 2, c: 3 }) + expect('[a = 1 b = 2 c = 3]').toEvaluateTo({ a: 1, b: 2, c: 3 }) }) test('work with strings', () => { expect("[a='one' b='two' c='three']").toEvaluateTo({ a: 'one', b: 'two', c: 'three' }) + expect("[a = 'one' b = 'two' c = 'three']").toEvaluateTo({ a: 'one', b: 'two', c: 'three' }) }) test('work with identifiers', () => { expect('[a=one b=two c=three]').toEvaluateTo({ a: 'one', b: 'two', c: 'three' }) + expect('[a = one b = two c = three]').toEvaluateTo({ a: 'one', b: 'two', c: 'three' }) }) test('can be nested', () => { expect('[a=one b=[two [c=three]]]').toEvaluateTo({ a: 'one', b: ['two', { c: 'three' }] }) + expect('[a = one b = [two [c = three]]]').toEvaluateTo({ a: 'one', b: ['two', { c: 'three' }] }) }) test('can span multiple lines', () => { @@ -171,6 +175,12 @@ describe('dict literals', () => { b=2 c=3 ]`).toEvaluateTo({ a: 1, b: 2, c: 3 }) + + expect(`[ + a = 1 + b = 2 + c = 3 + ]`).toEvaluateTo({ a: 1, b: 2, c: 3 }) }) test('empty dict', () => { @@ -190,10 +200,12 @@ describe('dict literals', () => { test('semicolons as separators', () => { expect('[a=1; b=2; c=3]').toEvaluateTo({ a: 1, b: 2, c: 3 }) + expect('[a = 1; b = 2; c = 3]').toEvaluateTo({ a: 1, b: 2, c: 3 }) }) test('expressions in dicts', () => { expect('[a=(1 + 2) b=(3 * 4)]').toEvaluateTo({ a: 3, b: 12 }) + expect('[a = (1 + 2) b = (3 * 4)]').toEvaluateTo({ a: 3, b: 12 }) }) test('empty lines within dicts', () => { -- 2.50.1 From e1859c1bda8e1b31a542ca96c187f77f149acb95 Mon Sep 17 00:00:00 2001 From: Chris Wanstrath <2+defunkt@users.noreply.github.com> Date: Fri, 5 Dec 2025 15:25:33 -0800 Subject: [PATCH 32/35] fix: parens can't be in words --- src/parser/tests/tokens.test.ts | 20 ++++++++++++++++++++ src/parser/tokenizer2.ts | 1 + 2 files changed, 21 insertions(+) diff --git a/src/parser/tests/tokens.test.ts b/src/parser/tests/tokens.test.ts index f3613f7..b166570 100644 --- a/src/parser/tests/tokens.test.ts +++ b/src/parser/tests/tokens.test.ts @@ -425,6 +425,26 @@ f ) }) + test('function call w/ parens', () => { + expect('echo(bold hello world)').toMatchTokens( + { type: 'Identifier', value: 'echo' }, + { type: 'OpenParen' }, + { type: 'Identifier', value: 'bold' }, + { type: 'Identifier', value: 'hello' }, + { type: 'Identifier', value: 'world' }, + { type: 'CloseParen' }, + ) + + expect('echo (bold hello world)').toMatchTokens( + { type: 'Identifier', value: 'echo' }, + { type: 'OpenParen' }, + { type: 'Identifier', value: 'bold' }, + { type: 'Identifier', value: 'hello' }, + { type: 'Identifier', value: 'world' }, + { type: 'CloseParen' }, + ) + }) + test('assignment', () => { expect('x = 5').toMatchTokens( { type: 'Identifier', value: 'x' }, diff --git a/src/parser/tokenizer2.ts b/src/parser/tokenizer2.ts index c3e891b..dacaaca 100644 --- a/src/parser/tokenizer2.ts +++ b/src/parser/tokenizer2.ts @@ -507,6 +507,7 @@ const isWordChar = (ch: number): boolean => { !isWhitespace(ch) && ch !== 10 /* \n */ && ch !== 59 /* ; */ && + ch !== 40 /* ( */ && ch !== 41 /* ) */ && ch !== 93 /* ] */ && ch !== -1 /* EOF 
*/ -- 2.50.1 From 88ee108a1e870d0cac26888d917fcf4026d447f1 Mon Sep 17 00:00:00 2001 From: Chris Wanstrath <2+defunkt@users.noreply.github.com> Date: Fri, 5 Dec 2025 15:45:22 -0800 Subject: [PATCH 33/35] ./bin/parser-tree.ts --- bin/parser-tree.ts | 192 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 192 insertions(+) create mode 100755 bin/parser-tree.ts diff --git a/bin/parser-tree.ts b/bin/parser-tree.ts new file mode 100755 index 0000000..cf5ee9c --- /dev/null +++ b/bin/parser-tree.ts @@ -0,0 +1,192 @@ +#!/usr/bin/env bun + +// WARNING: [[ No human has been anywhere near this file. It's pure Claude slop. +// Enter at your own risk. ]] + +import { readFileSync } from 'fs' + +type CallInfo = { + method: string + line: number + calls: Set + isRecursive?: boolean +} + +// Parse the parser file and extract method calls +function analyzeParser(filePath: string): Map { + const content = readFileSync(filePath, 'utf-8') + const lines = content.split('\n') + const methods = new Map() + + // Find all method definitions + const methodRegex = /^\s*(\w+)\s*\([^)]*\):\s*/ + + let currentMethod: string | null = null + let braceDepth = 0 + let classDepth = 0 + + for (let i = 0; i < lines.length; i++) { + const line = lines[i] || '' + + // Track if we're inside the Parser class + if (line.includes('class Parser')) { + classDepth = braceDepth + 1 // Will be the depth after we process this line's brace + } + + // Check for method definition (only inside class, at class level) + // Check BEFORE incrementing braceDepth + if (classDepth > 0 && braceDepth === classDepth) { + const methodMatch = line.match(methodRegex) + if (methodMatch && !line.includes('class ')) { + currentMethod = methodMatch[1]! + methods.set(currentMethod, { + method: currentMethod, + line: i + 1, + calls: new Set() + }) + } + } + + // Track brace depth + braceDepth += (line.match(/{/g) || []).length + braceDepth -= (line.match(/}/g) || []).length + + // Find method calls within current method + if (currentMethod && braceDepth > 0) { + // Match this.methodName() calls + const callRegex = /this\.(\w+)\s*\(/g + let match + while ((match = callRegex.exec(line)) !== null) { + const calledMethod = match[1]! + const info = methods.get(currentMethod)! + info.calls.add(calledMethod) + + // Mark recursive calls + if (calledMethod === currentMethod) { + info.isRecursive = true + } + } + } + + // Reset when method ends + if (braceDepth === 0) { + currentMethod = null + } + } + + return methods +} + +// Build tree structure starting from a root method +function buildTree( + method: string, + callGraph: Map, + visited: Set, + indent = '', + isLast = true, + depth = 0, + maxDepth = 3 +): string[] { + const lines: string[] = [] + const info = callGraph.get(method) + + if (!info) return lines + + // Add current method + const prefix = depth === 0 ? '' : (isLast ? '└─> ' : '├─> ') + const suffix = info.isRecursive ? 
' (recursive)' : '' + const lineNum = `[line ${info.line}]` + lines.push(`${indent}${prefix}${method}() ${lineNum}${suffix}`) + + // Stop if we've reached max depth + if (depth >= maxDepth) { + return lines + } + + // Prevent infinite recursion in tree display + if (visited.has(method)) { + return lines + } + + const newVisited = new Set(visited) + newVisited.add(method) + + // Helper methods to filter out (low-level utilities) + const helperPatterns = /^(is|next|peek|expect|current|op)/i + + // Get sorted unique calls (filter out recursive self-calls for display) + const calls = Array.from(info.calls) + .filter(c => callGraph.has(c)) // Only show parser methods + .filter(c => c !== method) // Don't show immediate self-recursion + .filter(c => !helperPatterns.test(c)) // Filter out helpers + .sort() + + // Add children + const newIndent = indent + (isLast ? ' ' : '│ ') + calls.forEach((call, idx) => { + const childLines = buildTree( + call, + callGraph, + newVisited, + newIndent, + idx === calls.length - 1, + depth + 1, + maxDepth + ) + lines.push(...childLines) + }) + + return lines +} + +// Main +const parserPath = './src/parser/parser2.ts' +const maxDepth = parseInt(process.argv[2] || '5') + +console.log('Parser Call Tree for', parserPath) +console.log(`Max depth: ${maxDepth}`) +console.log('═'.repeat(60)) +console.log() + +const callGraph = analyzeParser(parserPath) + +// Start from parse() method +const tree = buildTree('parse', callGraph, new Set(), '', true, 0, maxDepth) +console.log(tree.join('\n')) + +// Show some stats +console.log('\n' + '═'.repeat(60)) +console.log('Stats:') +console.log(` Total methods: ${callGraph.size}`) +console.log(` Entry point: parse()`) + +// Find methods that are never called (potential dead code or entry points) +const allCalled = new Set() +for (const info of callGraph.values()) { + info.calls.forEach(c => allCalled.add(c)) +} + +const uncalled = Array.from(callGraph.keys()) + .filter(m => !allCalled.has(m) && m !== 'parse') + .sort() + +if (uncalled.length > 0) { + console.log(`\n Uncalled methods: ${uncalled.join(', ')}`) +} + +// Find most-called methods +const callCount = new Map() +for (const info of callGraph.values()) { + for (const called of info.calls) { + callCount.set(called, (callCount.get(called) || 0) + 1) + } +} + +const topCalled = Array.from(callCount.entries()) + .sort((a, b) => b[1] - a[1]) + .slice(0, 5) + +console.log(`\n Most-called methods:`) +for (const [method, count] of topCalled) { + console.log(` ${method}() - called ${count} times`) +} -- 2.50.1 From 65119b720a3b7b40693f0149cb66ff997204a109 Mon Sep 17 00:00:00 2001 From: Chris Wanstrath <2+defunkt@users.noreply.github.com> Date: Sat, 6 Dec 2025 21:15:28 -0800 Subject: [PATCH 34/35] fix very minor issues --- src/parser/parser2.ts | 8 ++++---- src/parser/stringParser.ts | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/parser/parser2.ts b/src/parser/parser2.ts index f57bbfe..57ea905 100644 --- a/src/parser/parser2.ts +++ b/src/parser/parser2.ts @@ -60,7 +60,7 @@ export class Parser { if (stmt) node.add(stmt) if (this.pos === prevPos && !this.isEOF()) - throw "parser didn't advance - you need to call next()\n\n ${this.input}\n" + throw `parser didn't advance - you need to call next()\n\n ${this.input}\n` } return node @@ -517,7 +517,7 @@ export class Parser { if (this.is($T.Keyword, 'finally')) finalNode = this.finally() - let end = this.keyword('end') + const end = this.keyword('end') let last = block.at(-1) if (finalNode) last = 
finalNode.children.at(-1)! @@ -687,7 +687,7 @@ export class Parser { const ifWord = this.keyword('if') const elseIfTest = this.testExpr() const elseIfBlock = this.block() - const elseIfNode = new SyntaxNode('ElseIfExpr', ifBlock.at(-1)!.from, elseIfBlock.at(-1)!.to) + const elseIfNode = new SyntaxNode('ElseIfExpr', elseWord.from, elseIfBlock.at(-1)!.to) elseIfNode.push(elseWord, ifWord, elseIfTest) elseIfNode.push(...elseIfBlock) node.push(elseIfNode) @@ -696,7 +696,7 @@ export class Parser { if (this.is($T.Keyword, 'else') && this.nextIs($T.Colon)) { const elseWord = this.keyword('else') const elseBlock = this.block() - const elseNode = new SyntaxNode('ElseExpr', ifBlock.at(-1)!.from, elseBlock.at(-1)!.to) + const elseNode = new SyntaxNode('ElseExpr', elseWord.from, elseBlock.at(-1)!.to) elseNode.push(elseWord) elseNode.push(...elseBlock) node.push(elseNode) diff --git a/src/parser/stringParser.ts b/src/parser/stringParser.ts index d5e125c..4218b54 100644 --- a/src/parser/stringParser.ts +++ b/src/parser/stringParser.ts @@ -244,7 +244,7 @@ const findIdentifierEnd = (input: string, pos: number, maxPos: number): number = let end = pos while (end < maxPos) { - const char = input[end] + const char = input[end]! // Stop at non-identifier characters if (!/[a-z0-9\-?]/.test(char)) { -- 2.50.1 From b21751a7901d7a9ecf8bf30bbd016c7d4015ef4c Mon Sep 17 00:00:00 2001 From: Chris Wanstrath <2+defunkt@users.noreply.github.com> Date: Sat, 6 Dec 2025 21:23:27 -0800 Subject: [PATCH 35/35] add (working) password.sh --- examples/password.sh | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 examples/password.sh diff --git a/examples/password.sh b/examples/password.sh new file mode 100644 index 0000000..5f858ac --- /dev/null +++ b/examples/password.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env shrimp +# usage: password [!spaced] [!symbols] + +if ($.args | list.contains? -h): + echo 'usage: password [!spaced] [!symbols]' + exit +end + +password = do n=22 symbols=true spaced=true: + chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789' + if symbols: chars += '!@#%^&*-=()[]<>' end + + out = [] + i = 0 + max = length chars + + while i < n: + idx = math.floor ((math.random) * max) + ch = chars | at idx + list.push out ch + i += 1 + end + + if spaced: + pos1 = math.floor((n - 2) / 3) + pos2 = math.floor((n - 2) * 2 / 3) + + list.insert out pos2 ' ' + list.insert out pos1 ' ' + end + + str.join out '' +end + +missing-arg? = do x: $.args | list.contains? x | not end + +num = $.args | list.reject (do x: x | str.starts-with? ! end) | list.first + +password num symbols=(missing-arg? !symbols) spaced=(missing-arg? !spaced) | echo \ No newline at end of file -- 2.50.1
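
Usage sketch (assumptions: the `../src` import path as used by bin/shrimp, and an arbitrary sample source string — neither is taken from the patches themselves): with the parseCode/Tree/SyntaxNode shims introduced in patches 18, 20, 23 and 28, the non-lezer parser can be driven much like the old Lezer tree.

  // TypeScript sketch; import path mirrors bin/shrimp's `../src` import (assumed)
  import { parseCode } from '../src'

  // spaced dict keys and parenthesized values are valid per patches 25 and 31
  const tree = parseCode(`[a = 1 b = (2 + 3)]`)

  // Tree keeps the slice of the Lezer API the compiler still relies on:
  // topNode, length, cursor(), and iterate({ enter })
  console.log(tree.topNode.type.name)

  tree.iterate({
    enter: (node) => {
      // isError is the node flag restored in "restore errors, fancy printing"
      if (node.isError) console.error('syntax error at', node.from, node.to)
    },
  })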