import { Scanner, type Token, TokenType } from './tokenizer2'
import { SyntaxNode, operators, precedence, conditionals, compounds } from './node'
import { globals } from './tokenizer'
import { parseString } from './stringParser'

const $T = TokenType

// Convenience entry point: parses `input` with a fresh Parser and returns
// the resulting `Program` node.
export const parse = (input: string): SyntaxNode => {
  const parser = new Parser()
  return parser.parse(input)
}

// Tracks which names are defined. Scopes form a chain through `parent`;
// a lookup succeeds if the name is in this scope or any ancestor.
class Scope {
  parent?: Scope
  set = new Set<string>()

  constructor(parent?: Scope) {
    this.parent = parent

    // no parent means this is global scope
    if (!parent) for (const name of globals) this.add(name)
  }

  // define `key` in this scope
  add(key: string) {
    this.set.add(key)
  }

  // is `key` defined in this scope or any enclosing one?
  has(key: string): boolean {
    return this.set.has(key) || this.parent?.has(key) || false
  }
}

// Recursive-descent parser over the token stream produced by Scanner.
// Errors are thrown as plain strings that include the original input.
export class Parser {
  tokens: Token[] = []
  pos = 0
  // depth of ( ) nesting; pipes only look past newlines when this is 0
  inParens = 0
  input = ''
  scope = new Scope
  // true while parsing an if/while condition; suppresses `name: ... end`
  // block-call parsing so the colon can terminate the condition
  inTestExpr = false

  // Tokenizes `input`, resets all parser state, and parses statements
  // until EOF. Returns a `Program` node spanning the whole input.
  // Throws (a string) if a statement fails to consume any token.
  parse(input: string): SyntaxNode {
    const scanner = new Scanner()
    this.tokens = scanner.tokenize(input)
    this.pos = 0
    // reset along with the rest of the state: a previous parse that threw
    // inside parens would otherwise leave the counter above zero
    this.inParens = 0
    this.input = input
    this.scope = new Scope()
    this.inTestExpr = false

    const node = new SyntaxNode('Program', 0, input.length)

    while (!this.isEOF()) {
      if (this.is($T.Newline) || this.is($T.Semicolon)) {
        this.next()
        continue
      }

      const prevPos = this.pos
      const stmt = this.statement()
      if (stmt) node.add(stmt)

      // guard against infinite loops caused by a parse method that
      // returns without consuming anything
      if (this.pos === prevPos && !this.isEOF())
        throw `parser didn't advance - you need to call next()\n\n ${this.input}\n`
    }

    return node
  }

  //
  // parse foundation nodes - statements, expressions
  //

  // statement is a line of code
  // returns null at EOF or when sitting on a block-ending keyword
  statement(): SyntaxNode | null {
    if (this.is($T.Comment)) return this.comment()

    while (this.is($T.Newline) || this.is($T.Semicolon)) this.next()

    if (this.isEOF() || this.isExprEndKeyword()) return null

    return this.expression()
  }

  // expressions can be found in four places:
  // 1. line of code
  // 2. right side of assignment
  // 3. if/while conditions
  // 4. inside (parens)
  //
  // `allowPipe` is false when parsing the right side of a pipe, so that
  // pipes stay flat instead of nesting.
  expression(allowPipe = true): SyntaxNode {
    let expr

    // x = value
    if (
      this.is($T.Identifier) &&
      (this.nextIs($T.Operator, '=') || compounds.some(x => this.nextIs($T.Operator, x)))
    )
      expr = this.assign()

    // if, while, do, etc
    else if (this.is($T.Keyword)) expr = this.keywords()

    // dotget
    else if (this.nextIs($T.Operator, '.')) expr = this.dotGetFunctionCall()

    // echo hello world
    else if (this.is($T.Identifier) && !this.nextIs($T.Operator) && !this.nextIsExprEnd())
      expr = this.functionCall()

    // bare-function-call
    else if (this.is($T.Identifier) && this.nextIsExprEnd())
      expr = this.functionCallOrIdentifier()

    // everything else
    else expr = this.exprWithPrecedence()

    // check for destructuring
    if (expr.type === 'Array' && this.is($T.Operator, '=')) return this.destructure(expr)

    // check for parens function call
    // ex: (ref my-func) my-arg
    if (expr.type === 'ParenExpr' && !this.isExprEnd()) expr = this.functionCall(expr)

    // one | echo
    if (allowPipe && this.isPipe()) return this.pipe(expr)

    // regular
    return expr
  }

  // piping | stuff | is | cool
  // builds a flat PipeExpr: [left, op, expr, op, expr, ...]
  pipe(left: SyntaxNode): SyntaxNode {
    const canLookPastNewlines = this.inParens === 0
    const parts: SyntaxNode[] = [left]

    while (this.isPipe()) {
      // consume newlines before pipe (only if not in parens)
      if (canLookPastNewlines) {
        while (this.is($T.Newline)) this.next()
      }

      const pipeOp = this.op('|')
      pipeOp.type = 'operator'
      parts.push(pipeOp)

      // consume newlines after pipe (only if not in parens)
      if (canLookPastNewlines) {
        while (this.is($T.Newline)) this.next()
      }

      // parse right side - don't allow nested pipes
      parts.push(this.expression(false))
    }

    const node = new SyntaxNode('PipeExpr', parts[0]!.from, parts.at(-1)!.to)
    return node.push(...parts)
  }

  // Pratt parser - parses expressions with precedence climbing
  // bp = binding precedence
  exprWithPrecedence(minBp = 0): SyntaxNode {
    let left = this.value()

    // infix operators with precedence
    while (this.is($T.Operator)) {
      const op = this.current().value!
      const bp = precedence[op]

      // operator has lower precedence than required, stop
      if (bp === undefined || bp < minBp) break

      const opNode = this.op()

      // right-associative operators (like **) use same bp, others use bp + 1
      const nextMinBp = op === '**' ? bp : bp + 1

      // parse right-hand side with higher precedence
      const right = this.exprWithPrecedence(nextMinBp)

      const nodeType = conditionals.has(op) ? 'ConditionalOp' : 'BinOp'
      const node = new SyntaxNode(nodeType, left.from, right.to)
      node.push(left, opNode, right)
      left = node
    }

    return left
  }

  // if, while, do, etc
  // dispatches on the current keyword; any other keyword makes expect() throw
  keywords(): SyntaxNode {
    if (this.is($T.Keyword, 'if')) return this.if()
    if (this.is($T.Keyword, 'while')) return this.while()
    if (this.is($T.Keyword, 'do')) return this.do()
    if (this.is($T.Keyword, 'try')) return this.try()
    if (this.is($T.Keyword, 'throw')) return this.throw()
    if (this.is($T.Keyword, 'import')) return this.import()

    return this.expect($T.Keyword, 'if/while/do/import') as never
  }

  // value can be an atom or a (parens that gets turned into an atom)
  // values are used in a few places:
  // 1. function arguments
  // 2. array/dict members
  // 3. binary operations
  // 4. anywhere an expression can be used
  value(): SyntaxNode {
    if (this.is($T.OpenParen)) return this.parens()

    if (this.is($T.OpenBracket)) return this.arrayOrDict()

    // dotget
    if (this.nextIs($T.Operator, '.')) return this.dotGet()

    return this.atom()
  }

  //
  // parse specific nodes
  //

  // [ 1 2 3 ]
  // newlines and semicolons between members are skipped; comments are kept
  array(): SyntaxNode {
    const open = this.expect($T.OpenBracket)

    const values = []
    while (!this.is($T.CloseBracket) && !this.isEOF()) {
      if (this.is($T.Semicolon) || this.is($T.Newline)) {
        this.next()
        continue
      }

      if (this.is($T.Comment)) {
        values.push(this.comment())
        continue
      }

      values.push(this.value())
    }

    const close = this.expect($T.CloseBracket)

    const node = new SyntaxNode('Array', open.from, close.to)
    return node.push(...values)
  }

  // which are we dealing with? ignores leading newlines and comments
  // looks ahead without consuming: a named-arg prefix or a bare `=` as the
  // first meaningful token means dict, anything else means array
  arrayOrDict(): SyntaxNode {
    let peek = 1
    let curr = this.peek(peek++)
    let isDict = false

    while (curr && curr.type !== $T.CloseBracket) {
      // definitely a dict
      if (curr.type === $T.NamedArgPrefix) {
        isDict = true
        break
      }

      // empty dict
      if (curr.type === $T.Operator && curr.value === '=') {
        isDict = true
        break
      }

      // probably an array
      if (curr.type !== $T.Comment && curr.type !== $T.Semicolon && curr.type !== $T.Newline)
        break

      curr = this.peek(peek++)
    }

    return isDict ? this.dict() : this.array()
  }

  // x = true
  // produces Assign for `=` and CompoundAssign for every other operator
  assign(): SyntaxNode {
    const ident = this.assignableIdentifier()
    const opToken = this.current()!
    const op = this.op()
    const expr = this.expression()

    const node = new SyntaxNode(
      opToken.value === '=' ? 'Assign' : 'CompoundAssign',
      ident.from,
      expr.to
    )

    return node.push(ident, op, expr)
  }

  // identifier used in assignment (TODO: legacy lezer quirk)
  // also registers the name in the current scope
  assignableIdentifier(): SyntaxNode {
    const token = this.expect($T.Identifier)
    this.scope.add(token.value!)
    const node = SyntaxNode.from(token)
    node.type = 'AssignableIdentifier'
    return node
  }

  // atoms are the basic building blocks: literals, identifiers, words
  // throws on any other token (after consuming it)
  atom() {
    if (this.is($T.String)) return this.string()

    if (this.isAny($T.Null, $T.Boolean, $T.Number, $T.Identifier, $T.Word, $T.Regex))
      return SyntaxNode.from(this.next())

    const next = this.next()
    throw `[atom] unexpected token ${TokenType[next.type]}: ${JSON.stringify(next)}\n\n ${this.input}\n`
  }

  // blocks in if, do, special calls, etc
  // `: something end`
  //
  // `blockNode` determines whether we return [colon, BlockNode, end] or
  // just a list of statements like [colon, stmt1, stmt2, end]
  //
  // the closing keyword itself is NOT consumed here; callers do that
  block(blockNode = true): SyntaxNode[] {
    const stmts: SyntaxNode[] = []
    const colon = this.colon()

    while (!this.isExprEndKeyword() && !this.isEOF()) {
      const stmt = this.statement()
      if (stmt) stmts.push(stmt)
    }

    const out = [colon]

    if (blockNode) {
      // an empty body (`if x: end`) has no statements to take a span from,
      // so anchor the empty Block right after the colon instead of crashing
      const from = stmts[0]?.from ?? colon.to
      const to = stmts.at(-1)?.to ?? colon.to
      const block = new SyntaxNode('Block', from, to)
      block.push(...stmts)
      out.push(block)
    } else {
      out.push(...stmts)
    }

    return out
  }

  // catch err: block
  // the error variable is optional
  catch(): SyntaxNode {
    const keyword = this.keyword('catch')

    let catchVar
    if (this.is($T.Identifier)) catchVar = this.identifier()

    const block = this.block()

    const node = new SyntaxNode('CatchExpr', keyword.from, block.at(-1)!.to)

    node.push(keyword)
    if (catchVar) node.push(catchVar)
    return node.push(...block)
  }

  // colon
  colon(): SyntaxNode {
    const colon = SyntaxNode.from(this.expect($T.Colon))
    colon.type = 'colon' // TODO lezer legacy
    return colon
  }

  // # comment
  comment(): SyntaxNode {
    return SyntaxNode.from(this.expect($T.Comment))
  }

  // [ a b c ] = [ 1 2 3 ]
  // every child of the left-hand array is added to the current scope by
  // its source text
  destructure(array: SyntaxNode): SyntaxNode {
    const eq = this.op('=')
    const val = this.expression()

    for (const ident of array.children) {
      const varName = this.input.slice(ident.from, ident.to)
      this.scope.add(varName)
    }

    const node = new SyntaxNode('Assign', array.from, val.to)
    return node.push(array, eq, val)
  }

  // [ a=1 b=true c='three' ]
  dict(): SyntaxNode {
    const open = this.expect($T.OpenBracket)

    // empty dict [=] or [ = ]
    if (this.is($T.Operator, '=') && this.nextIs($T.CloseBracket)) {
      this.next() // consume the `=`
      const close = this.next()
      return new SyntaxNode('Dict', open.from, close.to)
    }

    const values = []
    while (!this.is($T.CloseBracket) && !this.isEOF()) {
      if (this.is($T.Semicolon) || this.is($T.Newline)) {
        this.next()
        continue
      }

      if (this.is($T.Comment)) {
        values.push(this.comment())
        continue
      }

      if (this.is($T.NamedArgPrefix)) values.push(this.namedArg())
      else values.push(this.value())
    }

    const close = this.expect($T.CloseBracket)

    const node = new SyntaxNode('Dict', open.from, close.to)
    return node.push(...values)
  }

  // FunctionDef `do x y: something end`
  // params and body are parsed in a child scope, which is popped before
  // the body nodes are attached
  do(): SyntaxNode {
    const doNode = this.keyword('do')
    doNode.type = 'Do'
    this.scope = new Scope(this.scope)

    const params = []
    while (!this.is($T.Colon) && !this.isExprEnd()) {
      // validate the token type first: other tokens may carry no `value`,
      // and we want the descriptive error rather than a TypeError
      if (!this.isAny($T.Identifier, $T.NamedArgPrefix))
        throw `[do] expected Identifier or NamedArgPrefix, got ${JSON.stringify(this.current())}\n\n ${this.input}\n`

      // named params look like `name=`; strip the `=` for the scope entry
      let varName = this.current().value!
      if (varName.endsWith('=')) varName = varName.slice(0, varName.length - 1)
      this.scope.add(varName)

      params.push(this.is($T.Identifier) ? this.identifier() : this.namedParam())
    }

    const block = this.block(false)
    let catchNode, finalNode

    if (this.is($T.Keyword, 'catch')) catchNode = this.catch()

    if (this.is($T.Keyword, 'finally')) finalNode = this.finally()

    const end = this.keyword('end')

    // NOTE(review): the node's end is the last body/catch/finally child,
    // not the `end` keyword - kept as-is
    let last = block.at(-1)
    if (finalNode) last = finalNode.children.at(-1)!
    else if (catchNode) last = catchNode.children.at(-1)!

    const node = new SyntaxNode('FunctionDef', doNode.from, last!.to)

    node.add(doNode)

    const paramsNode = new SyntaxNode('Params', params[0]?.from ?? 0, params.at(-1)?.to ?? 0)

    if (params.length) paramsNode.push(...params)
    node.add(paramsNode)

    this.scope = this.scope.parent!

    node.push(...block)

    if (catchNode) node.push(catchNode)
    if (finalNode) node.push(finalNode)

    return node.push(end)
  }

  // config.path
  // if the leading identifier isn't in scope the whole thing is a Word
  dotGet(): SyntaxNode {
    const left = this.identifier()
    const ident = this.input.slice(left.from, left.to)

    // not in scope, just return Word
    if (!this.scope.has(ident)) return this.word(left)

    if (left.type === 'Identifier') left.type = 'IdentifierBeforeDot'

    const parts = []
    while (this.is($T.Operator, '.')) {
      this.next()
      parts.push(this.is($T.OpenParen) ? this.parens() : this.atom())
    }

    // TODO lezer legacy - we can do a flat DotGet if we remove this
    const nodes = parts.length > 1 ? collapseDotGets(parts) : undefined

    const node = new SyntaxNode('DotGet', left.from, parts.at(-1)!.to)
    return nodes ? node.push(left, nodes!) : node.push(left, ...parts)
  }

  // dotget in a statement/expression (something.blah) or (something.blah arg1)
  dotGetFunctionCall(): SyntaxNode {
    const dotGet = this.dotGet()

    // dotget not in scope, regular Word
    if (dotGet.type === 'Word') return dotGet

    if (this.isExprEnd()) return this.functionCallOrIdentifier(dotGet)
    else return this.functionCall(dotGet)
  }

  // can be used in functions or try block
  finally(): SyntaxNode {
    const keyword = this.keyword('finally')
    const block = this.block()
    const node = new SyntaxNode('FinallyExpr', keyword.from, block.at(-1)!.to)

    return node.push(keyword, ...block)
  }

  // you're lookin at it
  // `fn` is an already-parsed callee (ParenExpr or DotGet); without it the
  // callee is read as an identifier. Arguments run until expression end or
  // a pipe. Outside of test expressions a trailing `: ... end` turns the
  // call into a FunctionCallWithBlock.
  functionCall(fn?: SyntaxNode): SyntaxNode {
    const ident = fn ?? this.identifier()

    const args: SyntaxNode[] = []
    while (!this.isExprEnd() && !this.is($T.Operator, '|')) {
      if (this.is($T.NamedArgPrefix)) {
        args.push(this.namedArg())
      } else {
        // 'do' is the only keyword allowed as a function argument
        const val = this.is($T.Keyword, 'do') ? this.do() : this.value()
        const arg = new SyntaxNode('PositionalArg', val.from, val.to)
        arg.add(val)
        args.push(arg)
      }
    }

    const node = new SyntaxNode('FunctionCall', ident.from, (args.at(-1) || ident).to)
    node.push(ident, ...args)

    if (!this.inTestExpr && this.is($T.Colon)) {
      const block = this.block()
      const end = this.keyword('end')
      const blockNode = new SyntaxNode('FunctionCallWithBlock', node.from, end.to)
      return blockNode.push(node, ...block, end)
    }

    return node
  }

  // bare identifier in an expression
  // `inner` is an already-parsed DotGet, if the caller has one
  functionCallOrIdentifier(inner?: SyntaxNode) {
    if (!inner && this.nextIs($T.Operator, '.')) {
      inner = this.dotGet()

      // if the dotGet was just a Word, bail
      if (inner.type === 'Word') return inner
    }

    inner ??= this.identifier()

    const wrapper = new SyntaxNode('FunctionCallOrIdentifier', inner.from, inner.to)
    wrapper.push(inner)

    if (!this.inTestExpr && this.is($T.Colon)) {
      const block = this.block()
      const end = this.keyword('end')
      const node = new SyntaxNode('FunctionCallWithBlock', wrapper.from, end.to)
      return node.push(wrapper, ...block, end)
    }

    return wrapper
  }

  // function and variable names
  identifier(): SyntaxNode {
    return SyntaxNode.from(this.expect($T.Identifier))
  }

  // if something: blah end
  // if something: blah else: blah end
  // if something: blah else if something: blah else: blah end
  if(): SyntaxNode {
    const ifNode = this.keyword('if')
    const test = this.testExpr()
    const ifBlock = this.block()

    const node = new SyntaxNode('IfExpr', ifNode.from, ifBlock.at(-1)!.to)
    node.push(ifNode, test)
    node.push(...ifBlock)

    while (this.is($T.Keyword, 'else') && this.nextIs($T.Keyword, 'if')) {
      const elseWord = this.keyword('else')
      const ifWord = this.keyword('if')
      const elseIfTest = this.testExpr()
      const elseIfBlock = this.block()

      // NOTE(review): the span starts at the *if* block rather than at the
      // `else` keyword - looks unintentional, kept as-is; confirm
      const elseIfNode = new SyntaxNode(
        'ElseIfExpr',
        ifBlock.at(-1)!.from,
        elseIfBlock.at(-1)!.to
      )
      elseIfNode.push(elseWord, ifWord, elseIfTest)
      elseIfNode.push(...elseIfBlock)
      node.push(elseIfNode)
    }

    if (this.is($T.Keyword, 'else') && this.nextIs($T.Colon)) {
      const elseWord = this.keyword('else')
      const elseBlock = this.block()

      // NOTE(review): same span quirk as ElseIfExpr above
      const elseNode = new SyntaxNode('ElseExpr', ifBlock.at(-1)!.from, elseBlock.at(-1)!.to)
      elseNode.push(elseWord)
      elseNode.push(...elseBlock)
      node.push(elseNode)
    }

    return node.push(this.keyword('end'))
  }

  // import name1 name2 opt=value
  // arguments are identifiers or named args, read until expression end
  import(): SyntaxNode {
    const keyword = this.keyword('import')

    const args: SyntaxNode[] = []
    while (!this.isExprEnd()) {
      if (this.is($T.NamedArgPrefix)) {
        const prefix = SyntaxNode.from(this.next())
        const val = this.value()
        const arg = new SyntaxNode('NamedArg', prefix.from, val.to)
        arg.push(prefix, val)
        args.push(arg)
      } else {
        args.push(this.identifier())
      }
    }

    // a bare `import` has no args; fall back to the keyword's own span
    const node = new SyntaxNode('Import', keyword.from, (args.at(-1) ?? keyword).to)
    node.add(keyword)
    return node.push(...args)
  }

  // if, while, do, etc
  // expects the keyword `name` and returns it as a 'keyword' node
  keyword(name: string): SyntaxNode {
    const node = SyntaxNode.from(this.expect($T.Keyword, name))
    node.type = 'keyword' // TODO lezer legacy
    return node
  }

  // abc= true
  namedArg(): SyntaxNode {
    const prefix = SyntaxNode.from(this.expect($T.NamedArgPrefix))
    const val = this.value()
    const node = new SyntaxNode('NamedArg', prefix.from, val.to)
    return node.push(prefix, val)
  }

  // abc= null|true|123|'hi'
  // like namedArg, but the default value must be a literal
  namedParam(): SyntaxNode {
    const prefix = SyntaxNode.from(this.expect($T.NamedArgPrefix))
    const val = this.value()

    if (!['Null', 'Boolean', 'Number', 'String'].includes(val.type))
      throw `[namedParam] default value must be Null|Bool|Num|Str, got ${val.type}\n\n ${this.input}\n`

    const node = new SyntaxNode('NamedParam', prefix.from, val.to)
    return node.push(prefix, val)
  }

  // operators like + - =
  // with `op`, that exact operator is required; the node type comes from
  // the `operators` table and unregistered operators throw
  op(op?: string): SyntaxNode {
    const token = op ? this.expect($T.Operator, op) : this.expect($T.Operator)
    const name = operators[token.value!]
    if (!name) throw `[op] operator not registered: ${token.value!}\n\n ${this.input}\n`
    return new SyntaxNode(name, token.from, token.to)
  }

  // ( expressions in parens )
  parens(): SyntaxNode {
    this.inParens++
    const open = this.expect($T.OpenParen)
    const child = this.expression()
    const close = this.expect($T.CloseParen)
    this.inParens--

    const node = new SyntaxNode('ParenExpr', open.from, close.to)
    node.add(child)

    return node
  }

  // 'hell yes' "hell no" { hell if i know }
  // the actual string parsing is delegated to parseString
  string(): SyntaxNode {
    const token = this.expect($T.String)
    return parseString(this.input, token.from, token.to, this)
  }

  // if TEST: blah end
  // NOTE(review): the flag is reset to false rather than restored to its
  // previous value, so a test nested inside another test clears it early
  testExpr(): SyntaxNode {
    this.inTestExpr = true
    const expr = this.expression()
    this.inTestExpr = false
    return expr
  }

  // throw blah
  throw(): SyntaxNode {
    const keyword = this.keyword('throw')
    const val = this.value()
    const node = new SyntaxNode('Throw', keyword.from, val.to)
    return node.push(keyword, val)
  }

  // try: blah catch e: blah end
  // catch and finally are both optional
  try(): SyntaxNode {
    const tryNode = this.keyword('try')
    const tryBlock = this.block()
    let last = tryBlock.at(-1)
    let catchNode, finalNode

    if (this.is($T.Keyword, 'catch')) catchNode = this.catch()

    if (this.is($T.Keyword, 'finally')) finalNode = this.finally()

    const end = this.keyword('end')

    if (finalNode) last = finalNode.children.at(-1)
    else if (catchNode) last = catchNode.children.at(-1)

    const node = new SyntaxNode('TryExpr', tryNode.from, last!.to)
    node.push(tryNode, ...tryBlock)

    if (catchNode) node.push(catchNode)

    if (finalNode) node.push(finalNode)

    return node.push(end)
  }

  // while test: blah end
  while(): SyntaxNode {
    const keyword = this.keyword('while')
    const test = this.testExpr()
    const block = this.block()
    const end = this.keyword('end')

    const node = new SyntaxNode('WhileExpr', keyword.from, end.to)
    return node.push(keyword, test, ...block, end)
  }

  // readme.txt (when `readme` isn't in scope)
  // swallows `.part` segments and returns a single childless Word node
  // covering the whole span
  word(start?: SyntaxNode): SyntaxNode {
    const parts = [start ?? this.expect($T.Word)]

    while (this.is($T.Operator, '.')) {
      this.next()
      if (this.isAny($T.Word, $T.Identifier, $T.Number)) parts.push(this.next())
    }

    return new SyntaxNode('Word', parts[0]!.from, parts.at(-1)!.to)
  }

  //
  // helpers
  //

  // token at the cursor; past the end this is a synthetic Newline token,
  // so callers never see undefined
  current(): Token {
    return this.tokens[this.pos] ?? { type: TokenType.Newline, from: 0, to: 0 }
  }

  // token `offset` positions ahead of the cursor, undefined past the end
  peek(offset = 1): Token | undefined {
    return this.tokens[this.pos + offset]
  }

  // look past newlines to check for a specific token
  peekPastNewlines(type: TokenType, value?: string): boolean {
    let offset = 1
    let peek = this.peek(offset)

    while (peek && peek.type === $T.Newline) peek = this.peek(++offset)

    if (!peek || peek.type !== type) return false
    if (value !== undefined && peek.value !== value) return false
    return true
  }

  // return the current token and advance the cursor
  next(): Token {
    const token = this.current()
    this.pos++
    return token
  }

  // does the current token have this type (and value, if given)?
  is(type: TokenType, value?: string): boolean {
    const token = this.current()
    if (!token || token.type !== type) return false
    if (value !== undefined && token.value !== value) return false
    return true
  }

  // is the current token any of these types?
  isAny(...type: TokenType[]): boolean {
    return type.some(x => this.is(x))
  }

  // like is(), but for the token after the current one
  nextIs(type: TokenType, value?: string): boolean {
    const token = this.peek()
    if (!token || token.type !== type) return false
    if (value !== undefined && token.value !== value) return false
    return true
  }

  // like isAny(), but for the token after the current one
  nextIsAny(...type: TokenType[]): boolean {
    return type.some(x => this.nextIs(x))
  }

  // are we sitting on something that ends an expression?
  // (EOF also counts: current() yields a synthetic Newline there)
  isExprEnd(): boolean {
    return (
      this.isAny($T.Colon, $T.Semicolon, $T.Newline, $T.CloseParen, $T.CloseBracket) ||
      this.isExprEndKeyword() ||
      this.isEOF()
    )
  }

  // is the token after the current one something that ends an expression?
  nextIsExprEnd(): boolean {
    // pipes act like expression end for function arg parsing
    if (this.nextIs($T.Operator, '|')) return true

    return (
      this.nextIsAny($T.Colon, $T.Semicolon, $T.Newline, $T.CloseBracket, $T.CloseParen) ||
      this.nextIs($T.Keyword, 'end') ||
      this.nextIs($T.Keyword, 'else') ||
      this.nextIs($T.Keyword, 'catch') ||
      this.nextIs($T.Keyword, 'finally') ||
      !this.peek()
    )
  }

  // keywords that close the current block
  isExprEndKeyword(): boolean {
    return (
      this.is($T.Keyword, 'end') ||
      this.is($T.Keyword, 'else') ||
      this.is($T.Keyword, 'catch') ||
      this.is($T.Keyword, 'finally')
    )
  }

  // is a pipe coming up?
  isPipe(): boolean {
    // inside parens, only look for pipes on same line (don't look past newlines)
    const canLookPastNewlines = this.inParens === 0

    return (
      this.is($T.Operator, '|') ||
      (canLookPastNewlines && this.peekPastNewlines($T.Operator, '|'))
    )
  }

  // consume and return the current token if it matches, otherwise throw a
  // message describing what was expected and what was found
  expect(type: TokenType, value?: string): Token | never {
    if (!this.is(type, value)) {
      const token = this.current()
      throw `expected ${TokenType[type]}${value ? ` "${value}"` : ''}, got ${TokenType[token?.type || 0]}${token?.value ? ` "${token.value}"` : ''} at position ${this.pos}\n\n ${this.input}\n`
    }
    return this.next()
  }

  // has the cursor moved past the last token?
  isEOF(): boolean {
    return this.pos >= this.tokens.length
  }
}

// TODO lezer legacy
// Folds a flat list of dot-access parts into right-nested DotGet nodes:
// [a, b, c] becomes DotGet(a, DotGet(b, c)). Identifiers that end up on
// the left of a dot are retyped to IdentifierBeforeDot. The input array
// is not mutated (the node types of its elements may be).
function collapseDotGets(origNodes: SyntaxNode[]): SyntaxNode {
  const nodes = [...origNodes]
  let right = nodes.pop()!

  while (nodes.length > 0) {
    const left = nodes.pop()!

    if (left.type === 'Identifier') left.type = 'IdentifierBeforeDot'

    const dot = new SyntaxNode('DotGet', left.from, right.to)
    dot.push(left, right)

    right = dot
  }

  return right
}