shrimp/src/compiler/compiler.ts

706 lines
22 KiB
TypeScript

import { CompilerError } from '#compiler/compilerError.ts'
import { parser } from '#parser/shrimp.ts'
import * as terms from '#parser/shrimp.terms'
import { setGlobals } from '#parser/tokenizer'
import type { SyntaxNode, Tree } from '@lezer/common'
import { assert, errorMessage } from '#utils/utils'
import { toBytecode, type Bytecode, type ProgramItem, bytecodeToString } from 'reefvm'
import {
checkTreeForErrors,
getAllChildren,
getAssignmentParts,
getCompoundAssignmentParts,
getBinaryParts,
getDotGetParts,
getFunctionCallParts,
getFunctionDefParts,
getIfExprParts,
getNamedArgParts,
getPipeExprParts,
getStringParts,
getTryExprParts,
} from '#compiler/utils'
// When true, the compiler logs every visited node and the final bytecode to the console.
const DEBUG = false
// const DEBUG = true
// A bytecode label reference: a dot followed by the label name (e.g. `.func_0`).
// Label definitions are emitted as this string with a trailing colon.
type Label = `.${string}`
// Replacement text for each recognised escape character (the character after the backslash).
const ESCAPE_REPLACEMENTS: ReadonlyMap<string, string> = new Map([
  ['n', '\n'],
  ['t', '\t'],
  ['r', '\r'],
  ['\\', '\\'],
  ["'", "'"],
  ['$', '$'],
])

/**
 * Translates a two-character escape sequence into the character it stands for.
 *
 * @param escapeSeq The raw sequence including its leading backslash, e.g. `\n`, `\$`, `\\`.
 * @returns The replacement character, or `escapeSeq` unchanged when it is not exactly
 *          two characters long or its second character is not a recognised escape.
 */
function processEscapeSeq(escapeSeq: string): string {
  // Anything that is not "<prefix><char>" is passed through untouched.
  if (escapeSeq.length !== 2) {
    return escapeSeq
  }

  const replacement = ESCAPE_REPLACEMENTS.get(escapeSeq.charAt(1))
  // Unknown escapes are kept verbatim rather than rejected.
  return replacement === undefined ? escapeSeq : replacement
}
/**
 * Compiles Shrimp source text into reefvm bytecode.
 *
 * Constructing an instance parses `input`, walks the resulting syntax tree and
 * stores the result in `bytecode`. Any failure surfaces as a plain `Error`
 * thrown from the constructor.
 */
export class Compiler {
  /** Program items (instructions and label definitions) emitted so far. */
  instructions: ProgramItem[] = []
  // Counters used to generate unique label / temporary names per construct.
  fnLabelCount = 0
  ifLabelCount = 0
  tryLabelCount = 0
  loopLabelCount = 0
  /** The compiled program, available once the constructor returns. */
  bytecode: Bytecode
  pipeCounter = 0

  /**
   * @param input   Source text to compile.
   * @param globals Optional global names handed to the tokenizer via `setGlobals`.
   * @throws Error  With a readable, source-annotated message for a `CompilerError`,
   *                or an "Unknown error during compilation" message otherwise.
   */
  constructor(public input: string, globals?: string[]) {
    try {
      if (globals) setGlobals(globals)
      const cst = parser.parse(input)
      const errors = checkTreeForErrors(cst)
      const firstError = errors[0]
      // Only the first parse error is reported.
      if (firstError) {
        throw firstError
      }
      this.#compileCst(cst, input)
      this.bytecode = toBytecode(this.instructions)
      if (DEBUG) {
        const bytecodeString = bytecodeToString(this.bytecode)
        console.log(`\n🤖 bytecode:\n----------------\n${bytecodeString}\n\n`)
      }
    } catch (error) {
      if (error instanceof CompilerError) {
        throw new Error(error.toReadableString(input))
      } else {
        throw new Error(`Unknown error during compilation:\n${errorMessage(error)}`)
      }
    }
  }

  /**
   * Compiles every top-level child of the Program node in order, appending the
   * result to `this.instructions`, then terminates the program with `HALT`.
   */
  #compileCst(cst: Tree, input: string) {
    const isProgram = cst.topNode.type.id === terms.Program
    assert(isProgram, `Expected Program node, got ${cst.topNode.type.name}`)
    let child = cst.topNode.firstChild
    while (child) {
      this.instructions.push(...this.#compileNode(child, input))
      child = child.nextSibling
    }
    this.instructions.push(['HALT'])
  }

  /**
   * Compiles a single syntax node (recursively) into program items.
   *
   * @param node  The node to compile.
   * @param input The full source text; node text is sliced out of it by position.
   * @returns The program items for this node.
   * @throws CompilerError When the node is malformed or of an unsupported type.
   */
  #compileNode(node: SyntaxNode, input: string): ProgramItem[] {
    const value = input.slice(node.from, node.to)
    if (DEBUG) console.log(`🫦 ${node.name}: ${value}`)

    switch (node.type.id) {
      case terms.Number: {
        const number = Number(value)
        if (Number.isNaN(number)) {
          throw new CompilerError(`Invalid number literal: ${value}`, node.from, node.to)
        }
        return [['PUSH', number]]
      }

      case terms.String: {
        const { parts, hasInterpolation } = getStringParts(node, input)
        // Simple string - strip the surrounding quotes and push the text directly
        if (!hasInterpolation) {
          const strValue = value.slice(1, -1)
          return [['PUSH', strValue]]
        }
        // Otherwise compile each part and concatenate them at runtime
        const instructions: ProgramItem[] = []
        parts.forEach((part) => {
          const partValue = input.slice(part.from, part.to)
          switch (part.type.id) {
            case terms.StringFragment: {
              // Plain text fragment - just push as-is
              instructions.push(['PUSH', partValue])
              break
            }
            case terms.EscapeSeq: {
              // Process escape sequence and push the result
              const processed = processEscapeSeq(partValue)
              instructions.push(['PUSH', processed])
              break
            }
            case terms.Interpolation: {
              // Interpolation contains either Identifier or ParenExpr (the $ is anonymous)
              const child = part.firstChild
              if (!child) {
                throw new CompilerError('Interpolation has no child', part.from, part.to)
              }
              // Compile the Identifier or ParenExpr
              instructions.push(...this.#compileNode(child, input))
              break
            }
            default:
              throw new CompilerError(
                `Unexpected string part: ${part.type.name}`,
                part.from,
                part.to
              )
          }
        })
        // Use STR_CONCAT to join all parts
        instructions.push(['STR_CONCAT', parts.length])
        return instructions
      }

      case terms.Boolean: {
        return [['PUSH', value === 'true']]
      }

      case terms.Null: {
        return [['PUSH', null]]
      }

      case terms.Regex: {
        // Split `//pattern//flags` into its pattern and flags
        const [_, pattern, flags] = value.match(/^\/\/(.*)\/\/([gimsuy]*)$/) || []
        if (!pattern) {
          throw new CompilerError(`Invalid regex literal: ${value}`, node.from, node.to)
        }
        let regex: RegExp
        try {
          regex = new RegExp(pattern, flags)
        } catch {
          // Report a positioned compiler error instead of the raw SyntaxError
          throw new CompilerError(`Invalid regex literal: ${value}`, node.from, node.to)
        }
        return [['PUSH', regex]]
      }

      case terms.Identifier: {
        return [['TRY_LOAD', value]]
      }

      case terms.Word: {
        return [['PUSH', value]]
      }

      case terms.DotGet: {
        const { objectName, property } = getDotGetParts(node, input)
        const instructions: ProgramItem[] = []
        instructions.push(['TRY_LOAD', objectName])
        if (property.type.id === terms.ParenExpr) {
          // Computed property: evaluate the parenthesised expression
          instructions.push(...this.#compileNode(property, input))
        } else {
          // Literal property: push its source text
          const propertyValue = input.slice(property.from, property.to)
          instructions.push(['PUSH', propertyValue])
        }
        instructions.push(['DOT_GET'])
        return instructions
      }

      case terms.BinOp: {
        const { left, op, right } = getBinaryParts(node)
        const instructions: ProgramItem[] = []
        instructions.push(...this.#compileNode(left, input))
        instructions.push(...this.#compileNode(right, input))
        const opValue = input.slice(op.from, op.to)
        switch (opValue) {
          case '+':
            instructions.push(['ADD'])
            break
          case '-':
            instructions.push(['SUB'])
            break
          case '*':
            instructions.push(['MUL'])
            break
          case '/':
            instructions.push(['DIV'])
            break
          case '%':
            instructions.push(['MOD'])
            break
          default:
            throw new CompilerError(`Unsupported binary operator: ${opValue}`, op.from, op.to)
        }
        return instructions
      }

      case terms.Assign: {
        const assignParts = getAssignmentParts(node)
        const instructions: ProgramItem[] = []
        // right-hand side
        instructions.push(...this.#compileNode(assignParts.right, input))
        // array destructuring: [ a b ] = [ 1 2 3 4 ]
        if ('arrayPattern' in assignParts) {
          const identifiers = assignParts.arrayPattern ?? []
          if (identifiers.length === 0) return instructions
          identifiers.forEach((identifier, index) => {
            instructions.push(['DUP'])
            instructions.push(['PUSH', index])
            instructions.push(['DOT_GET'])
            instructions.push(['STORE', input.slice(identifier.from, identifier.to)])
          })
          // original array still on stack as the return value
          return instructions
        }
        // simple assignment: x = value (DUP keeps the value as the expression result)
        instructions.push(['DUP'])
        const identifierName = input.slice(assignParts.identifier.from, assignParts.identifier.to)
        instructions.push(['STORE', identifierName])
        return instructions
      }

      case terms.CompoundAssign: {
        const { identifier, operator, right } = getCompoundAssignmentParts(node)
        const identifierName = input.slice(identifier.from, identifier.to)
        const instructions: ProgramItem[] = []
        // will throw if undefined
        instructions.push(['LOAD', identifierName])
        instructions.push(...this.#compileNode(right, input))
        const opValue = input.slice(operator.from, operator.to)
        switch (opValue) {
          case '+=':
            instructions.push(['ADD'])
            break
          case '-=':
            instructions.push(['SUB'])
            break
          case '*=':
            instructions.push(['MUL'])
            break
          case '/=':
            instructions.push(['DIV'])
            break
          case '%=':
            instructions.push(['MOD'])
            break
          default:
            throw new CompilerError(
              `Unknown compound operator: ${opValue}`,
              operator.from,
              operator.to
            )
        }
        // DUP and store (same as regular assignment)
        instructions.push(['DUP'])
        instructions.push(['STORE', identifierName])
        return instructions
      }

      case terms.ParenExpr: {
        const child = node.firstChild
        if (!child) return [] // I guess it is empty parentheses?
        return this.#compileNode(child, input)
      }

      case terms.FunctionDef: {
        const { paramNames, bodyNodes, catchVariable, catchBody, finallyBody } = getFunctionDefParts(
          node,
          input
        )
        const instructions: ProgramItem[] = []
        const functionLabel: Label = `.func_${this.fnLabelCount++}`
        const afterLabel: Label = `.after_${functionLabel}`
        // Skip over the inlined body when the definition itself is executed
        instructions.push(['JUMP', afterLabel])
        instructions.push([`${functionLabel}:`])

        const compileFunctionBody = () => {
          const bodyInstructions: ProgramItem[] = []
          bodyNodes.forEach((bodyNode, index) => {
            bodyInstructions.push(...this.#compileNode(bodyNode, input))
            // keep only the last expression's value
            if (index < bodyNodes.length - 1) {
              bodyInstructions.push(['POP'])
            }
          })
          return bodyInstructions
        }

        if (catchVariable || finallyBody) {
          // If function has catch or finally, wrap body in try/catch/finally
          instructions.push(
            ...this.#compileTryCatchFinally(
              compileFunctionBody,
              catchVariable,
              catchBody,
              finallyBody,
              input
            )
          )
        } else {
          instructions.push(...compileFunctionBody())
        }

        instructions.push(['RETURN'])
        instructions.push([`${afterLabel}:`])
        instructions.push(['MAKE_FUNCTION', paramNames, functionLabel])
        return instructions
      }

      case terms.FunctionCallOrIdentifier: {
        if (node.firstChild?.type.id === terms.DotGet) {
          return this.#compileNode(node.firstChild, input)
        }
        return [['TRY_CALL', value]]
      }

      /*
      ### Function Calls
      Stack order (bottom to top):
      LOAD fn
      PUSH arg1 ; Positional args
      PUSH arg2
      PUSH "name" ; Named arg key
      PUSH "value" ; Named arg value
      PUSH 2 ; Positional count
      PUSH 1 ; Named count
      CALL
      */
      case terms.FunctionCall: {
        const { identifierNode, namedArgs, positionalArgs } = getFunctionCallParts(node, input)
        const instructions: ProgramItem[] = []
        instructions.push(...this.#compileNode(identifierNode, input))
        positionalArgs.forEach((arg) => {
          instructions.push(...this.#compileNode(arg, input))
        })
        namedArgs.forEach((arg) => {
          const { name, valueNode } = getNamedArgParts(arg, input)
          instructions.push(['PUSH', name])
          instructions.push(...this.#compileNode(valueNode, input))
        })
        instructions.push(['PUSH', positionalArgs.length])
        instructions.push(['PUSH', namedArgs.length])
        instructions.push(['CALL'])
        return instructions
      }

      case terms.Block: {
        const children = getAllChildren(node)
        const instructions: ProgramItem[] = []
        children.forEach((child, index) => {
          instructions.push(...this.#compileNode(child, input))
          // keep only the last expression's value
          if (index < children.length - 1) {
            instructions.push(['POP'])
          }
        })
        return instructions
      }

      case terms.TryExpr: {
        const { tryBlock, catchVariable, catchBody, finallyBody } = getTryExprParts(node, input)
        return this.#compileTryCatchFinally(
          () => this.#compileNode(tryBlock, input),
          catchVariable,
          catchBody,
          finallyBody,
          input
        )
      }

      case terms.Throw: {
        const children = getAllChildren(node)
        const [_throwKeyword, expression] = children
        if (!expression) {
          throw new CompilerError(
            `Throw expected expression, got ${children.length} children`,
            node.from,
            node.to
          )
        }
        const instructions: ProgramItem[] = []
        instructions.push(...this.#compileNode(expression, input))
        instructions.push(['THROW'])
        return instructions
      }

      case terms.IfExpr: {
        const { conditionNode, thenBlock, elseIfBlocks, elseThenBlock } = getIfExprParts(
          node,
          input
        )
        const instructions: ProgramItem[] = []
        instructions.push(...this.#compileNode(conditionNode, input))
        this.ifLabelCount++
        const endLabel: Label = `.end_${this.ifLabelCount}`
        const thenBlockInstructions = this.#compileNode(thenBlock, input)
        // Numeric operand = number of items to skip: the branch body plus its trailing JUMP.
        // NOTE(review): this count includes label items emitted by nested constructs -
        // confirm reefvm counts relative offsets the same way.
        instructions.push(['JUMP_IF_FALSE', thenBlockInstructions.length + 1])
        instructions.push(...thenBlockInstructions)
        instructions.push(['JUMP', endLabel])
        // Else if
        elseIfBlocks.forEach(({ conditional, thenBlock }) => {
          instructions.push(...this.#compileNode(conditional, input))
          const elseIfInstructions = this.#compileNode(thenBlock, input)
          instructions.push(['JUMP_IF_FALSE', elseIfInstructions.length + 1])
          instructions.push(...elseIfInstructions)
          instructions.push(['JUMP', endLabel])
        })
        // Else
        if (elseThenBlock) {
          const elseThenInstructions = this.#compileNode(elseThenBlock, input)
          instructions.push(...elseThenInstructions)
        } else {
          // No else branch: the expression evaluates to null
          instructions.push(['PUSH', null])
        }
        instructions.push([`${endLabel}:`])
        return instructions
      }

      // - `EQ`, `NEQ`, `LT`, `GT`, `LTE`, `GTE` - Pop 2, push boolean
      case terms.ConditionalOp: {
        const instructions: ProgramItem[] = []
        const { left, op, right } = getBinaryParts(node)
        const leftInstructions: ProgramItem[] = this.#compileNode(left, input)
        const rightInstructions: ProgramItem[] = this.#compileNode(right, input)
        const opValue = input.slice(op.from, op.to)
        switch (opValue) {
          case '==':
            instructions.push(...leftInstructions, ...rightInstructions, ['EQ'])
            break
          case '!=':
            instructions.push(...leftInstructions, ...rightInstructions, ['NEQ'])
            break
          case '<':
            instructions.push(...leftInstructions, ...rightInstructions, ['LT'])
            break
          case '>':
            instructions.push(...leftInstructions, ...rightInstructions, ['GT'])
            break
          case '<=':
            instructions.push(...leftInstructions, ...rightInstructions, ['LTE'])
            break
          case '>=':
            instructions.push(...leftInstructions, ...rightInstructions, ['GTE'])
            break
          case 'and':
            // Short-circuit: keep the left value and skip the right side when the jump is taken
            instructions.push(...leftInstructions)
            instructions.push(['DUP'])
            instructions.push(['JUMP_IF_FALSE', rightInstructions.length + 1])
            instructions.push(['POP'])
            instructions.push(...rightInstructions)
            break
          case 'or':
            // Short-circuit: keep the left value and skip the right side when the jump is taken
            instructions.push(...leftInstructions)
            instructions.push(['DUP'])
            instructions.push(['JUMP_IF_TRUE', rightInstructions.length + 1])
            instructions.push(['POP'])
            instructions.push(...rightInstructions)
            break
          default:
            throw new CompilerError(`Unsupported conditional operator: ${opValue}`, op.from, op.to)
        }
        return instructions
      }

      case terms.PipeExpr: {
        const { pipedFunctionCall, pipeReceivers } = getPipeExprParts(node)
        if (!pipedFunctionCall || pipeReceivers.length === 0) {
          throw new CompilerError('PipeExpr must have at least two operands', node.from, node.to)
        }
        const instructions: ProgramItem[] = []
        instructions.push(...this.#compileNode(pipedFunctionCall, input))
        this.pipeCounter++
        // Temporary variable that carries the piped value from one stage to the next
        const pipeValName = `_pipe_value_${this.pipeCounter}`

        pipeReceivers.forEach((pipeReceiver) => {
          instructions.push(['STORE', pipeValName])
          const { identifierNode, namedArgs, positionalArgs } = getFunctionCallParts(
            pipeReceiver,
            input
          )
          instructions.push(...this.#compileNode(identifierNode, input))

          const isUnderscoreInPositionalArgs = positionalArgs.some(
            (arg) => arg.type.id === terms.Underscore
          )
          const isUnderscoreInNamedArgs = namedArgs.some((arg) => {
            const { valueNode } = getNamedArgParts(arg, input)
            return valueNode.type.id === terms.Underscore
          })
          const shouldPushPositionalArg = !isUnderscoreInPositionalArgs && !isUnderscoreInNamedArgs

          // If no underscore is explicitly used, add the piped value as the first positional arg
          if (shouldPushPositionalArg) {
            instructions.push(['LOAD', pipeValName])
          }

          positionalArgs.forEach((arg) => {
            if (arg.type.id === terms.Underscore) {
              instructions.push(['LOAD', pipeValName])
            } else {
              instructions.push(...this.#compileNode(arg, input))
            }
          })

          namedArgs.forEach((arg) => {
            const { name, valueNode } = getNamedArgParts(arg, input)
            instructions.push(['PUSH', name])
            if (valueNode.type.id === terms.Underscore) {
              instructions.push(['LOAD', pipeValName])
            } else {
              instructions.push(...this.#compileNode(valueNode, input))
            }
          })

          instructions.push(['PUSH', positionalArgs.length + (shouldPushPositionalArg ? 1 : 0)])
          instructions.push(['PUSH', namedArgs.length])
          instructions.push(['CALL'])
        })
        return instructions
      }

      case terms.Array: {
        const children = getAllChildren(node)
        // We can easily parse [=] as an empty dict, but `[ = ]` is tougher.
        // = can be a valid word, and is also valid inside words, so for now we cheat
        // and check for arrays that look like `[ = ]` to interpret them as
        // empty dicts
        if (children.length === 1 && children[0]!.name === 'Word') {
          const child = children[0]!
          if (input.slice(child.from, child.to) === '=') {
            return [['MAKE_DICT', 0]]
          }
        }
        const instructions: ProgramItem[] = children.flatMap((x) => this.#compileNode(x, input))
        instructions.push(['MAKE_ARRAY', children.length])
        return instructions
      }

      case terms.Dict: {
        const children = getAllChildren(node)
        const instructions: ProgramItem[] = []
        children.forEach((entry) => {
          const keyNode = entry.firstChild
          const valueNode = keyNode?.nextSibling
          // Report malformed entries with a source position instead of a raw TypeError
          if (!keyNode || !valueNode) {
            throw new CompilerError('Dict entry must have a key and a value', entry.from, entry.to)
          }
          // name= -> name
          const key = input.slice(keyNode.from, keyNode.to).slice(0, -1)
          instructions.push(['PUSH', key])
          instructions.push(...this.#compileNode(valueNode, input))
        })
        instructions.push(['MAKE_DICT', children.length])
        return instructions
      }

      case terms.WhileExpr: {
        const children = getAllChildren(node)
        const [_while, test, _colon, block] = children
        if (!test || !block) {
          throw new CompilerError(
            `While expected a condition and a block, got ${children.length} children`,
            node.from,
            node.to
          )
        }
        const instructions: ProgramItem[] = []
        this.loopLabelCount++
        // Label names carry no trailing colon; the colon is added only where the
        // label is defined, matching every other label in this compiler.
        const startLoop: Label = `.loop_${this.loopLabelCount}`
        const endLoop: Label = `.end_loop_${this.loopLabelCount}`
        instructions.push([`${startLoop}:`])
        instructions.push(...this.#compileNode(test, input))
        instructions.push(['JUMP_IF_FALSE', endLoop])
        // NOTE(review): the block's value is not popped per iteration and nothing is
        // pushed when the loop exits - confirm the intended stack contract for while.
        instructions.push(...this.#compileNode(block, input))
        instructions.push(['JUMP', startLoop])
        instructions.push([`${endLoop}:`])
        return instructions
      }

      default:
        throw new CompilerError(
          `Compiler doesn't know how to handle a "${node.type.name}" node.`,
          node.from,
          node.to
        )
    }
  }

  /**
   * Emits the try / catch / finally scaffolding around a body.
   *
   * @param compileTryBody Produces the program items for the protected body.
   * @param catchVariable  Name the caught value is stored under, if there is a catch clause.
   * @param catchBody      Catch clause body, if any.
   * @param finallyBody    Finally clause body, if any.
   * @param input          The full source text.
   * @returns The program items for the whole construct.
   */
  #compileTryCatchFinally(
    compileTryBody: () => ProgramItem[],
    catchVariable: string | undefined,
    catchBody: SyntaxNode | undefined,
    finallyBody: SyntaxNode | undefined,
    input: string
  ): ProgramItem[] {
    const instructions: ProgramItem[] = []
    // Labels are fixed before the body is compiled so nested try blocks get their own numbers
    this.tryLabelCount++
    const catchLabel: Label = `.catch_${this.tryLabelCount}`
    const finallyLabel: Label | undefined = finallyBody
      ? `.finally_${this.tryLabelCount}`
      : undefined
    const endLabel: Label = `.end_try_${this.tryLabelCount}`
    // Where control goes after the try body or the catch body completes
    const exitLabel: Label = finallyLabel ?? endLabel

    instructions.push(['PUSH_TRY', catchLabel])
    instructions.push(...compileTryBody())
    instructions.push(['POP_TRY'])
    instructions.push(['JUMP', exitLabel])

    // catch block
    instructions.push([`${catchLabel}:`])
    if (catchBody && catchVariable) {
      instructions.push(['STORE', catchVariable])
      const catchInstructions = this.#compileNode(catchBody, input)
      instructions.push(...catchInstructions)
      instructions.push(['JUMP', exitLabel])
    } else if (finallyLabel) {
      // no catch block: go straight to finally
      instructions.push(['JUMP', finallyLabel])
    } else {
      // no catch block and no finally: emit THROW at the catch label
      instructions.push(['THROW'])
    }

    // finally block
    if (finallyBody && finallyLabel) {
      instructions.push([`${finallyLabel}:`])
      const finallyInstructions = this.#compileNode(finallyBody, input)
      instructions.push(...finallyInstructions)
      // finally doesn't return a value
      instructions.push(['POP'])
    }

    instructions.push([`${endLabel}:`])
    return instructions
  }
}