From 87cb01392aaf4d1774c6cab27b0e0e05365a148b Mon Sep 17 00:00:00 2001 From: Chris Wanstrath Date: Wed, 10 Dec 2025 13:20:28 -0800 Subject: [PATCH] remove lezer parser & grammar --- package.json | 5 +- src/compiler/compiler.ts | 105 +++--- src/compiler/utils.ts | 51 ++- src/index.ts | 2 +- src/parser/curlyTokenizer.ts | 3 +- src/parser/node.ts | 187 +---------- src/parser/operatorTokenizer.ts | 99 ------ src/parser/parser2.ts | 8 +- src/parser/parserScopeContext.ts | 129 -------- src/parser/shrimp.grammar | 299 ----------------- src/parser/shrimp.grammar.d.ts | 4 - src/parser/shrimp.terms.ts | 82 ----- src/parser/shrimp.ts | 27 -- src/parser/tests/basics.test.ts | 16 +- src/parser/tests/bitwise.test.ts | 2 - src/parser/tests/control-flow.test.ts | 12 +- src/parser/tests/destructuring.test.ts | 2 - src/parser/tests/exceptions.test.ts | 2 - src/parser/tests/function-blocks.test.ts | 16 +- src/parser/tests/functions.test.ts | 2 - src/parser/tests/import.test.ts | 2 - src/parser/tests/literals.test.ts | 2 - src/parser/tests/multiline.test.ts | 2 - src/parser/tests/pipes.test.ts | 3 - src/parser/tests/strings.test.ts | 4 +- src/parser/tokenizer.ts | 389 ----------------------- src/parser/tokenizer2.ts | 4 +- src/testSetup.ts | 34 +- 28 files changed, 122 insertions(+), 1371 deletions(-) delete mode 100644 src/parser/operatorTokenizer.ts delete mode 100644 src/parser/parserScopeContext.ts delete mode 100644 src/parser/shrimp.grammar delete mode 100644 src/parser/shrimp.grammar.d.ts delete mode 100644 src/parser/shrimp.terms.ts delete mode 100644 src/parser/shrimp.ts delete mode 100644 src/parser/tokenizer.ts diff --git a/package.json b/package.json index c5d783f..8fdc7e2 100644 --- a/package.json +++ b/package.json @@ -5,9 +5,8 @@ "private": true, "type": "module", "scripts": { - "dev": "bun generate-parser && bun --hot src/server/server.tsx", - "generate-parser": "lezer-generator src/parser/shrimp.grammar --typeScript -o src/parser/shrimp.ts", - "repl": "bun 
generate-parser && bun bin/repl", + "dev": "bun --hot src/server/server.tsx", + "repl": "bun bin/repl", "update-reef": "rm -rf ~/.bun/install/cache/ && rm bun.lock && bun update reefvm", "cli:install": "ln -s \"$(pwd)/bin/shrimp\" ~/.bun/bin/shrimp", "cli:remove": "rm ~/.bun/bin/shrimp", diff --git a/src/compiler/compiler.ts b/src/compiler/compiler.ts index 7a6e005..a3a4326 100644 --- a/src/compiler/compiler.ts +++ b/src/compiler/compiler.ts @@ -1,9 +1,6 @@ import { CompilerError } from '#compiler/compilerError.ts' -import { parse } from '#parser/parser2' +import { parse, setGlobals } from '#parser/parser2' import { SyntaxNode, Tree } from '#parser/node' -import { parser } from '#parser/shrimp.ts' -import * as terms from '#parser/shrimp.terms' -import { setGlobals } from '#parser/tokenizer' import { tokenizeCurlyString } from '#parser/curlyTokenizer' import { assert, errorMessage } from '#utils/utils' import { toBytecode, type Bytecode, type ProgramItem, bytecodeToString } from 'reefvm' @@ -91,7 +88,7 @@ export class Compiler { } #compileCst(cst: Tree, input: string) { - const isProgram = cst.topNode.type.id === terms.Program + const isProgram = cst.topNode.type.is('Program') assert(isProgram, `Expected Program node, got ${cst.topNode.type.name}`) let child = cst.topNode.firstChild @@ -107,8 +104,8 @@ export class Compiler { const value = input.slice(node.from, node.to) if (DEBUG) console.log(`🫦 ${node.name}: ${value}`) - switch (node.type.id) { - case terms.Number: + switch (node.type.name) { + case 'Number': // Handle sign prefix for hex, binary, and octal literals // Number() doesn't parse '-0xFF', '+0xFF', '-0o77', etc. 
correctly let numberValue: number @@ -125,8 +122,8 @@ export class Compiler { return [[`PUSH`, numberValue]] - case terms.String: { - if (node.firstChild?.type.id === terms.CurlyString) + case 'String': { + if (node.firstChild?.type.is('CurlyString')) return this.#compileCurlyString(value, input) const { parts, hasInterpolation } = getStringParts(node, input) @@ -143,19 +140,19 @@ export class Compiler { parts.forEach((part) => { const partValue = input.slice(part.from, part.to) - switch (part.type.id) { - case terms.StringFragment: + switch (part.type.name) { + case 'StringFragment': // Plain text fragment - just push as-is instructions.push(['PUSH', partValue]) break - case terms.EscapeSeq: + case 'EscapeSeq': // Process escape sequence and push the result const processed = processEscapeSeq(partValue) instructions.push(['PUSH', processed]) break - case terms.Interpolation: + case 'Interpolation': // Interpolation contains either Identifier or ParenExpr (the $ is anonymous) const child = part.firstChild if (!child) { @@ -179,15 +176,15 @@ export class Compiler { return instructions } - case terms.Boolean: { + case 'Boolean': { return [[`PUSH`, value === 'true']] } - case terms.Null: { + case 'Null': { return [[`PUSH`, null]] } - case terms.Regex: { + case 'Regex': { // remove the surrounding slashes and any flags const [_, pattern, flags] = value.match(/^\/\/(.*)\/\/([gimsuy]*)$/) || [] if (!pattern) { @@ -204,15 +201,15 @@ export class Compiler { return [['PUSH', regex]] } - case terms.Identifier: { + case 'Identifier': { return [[`TRY_LOAD`, value]] } - case terms.Word: { + case 'Word': { return [['PUSH', value]] } - case terms.DotGet: { + case 'DotGet': { // DotGet is parsed into a nested tree because it's hard to parse it into a flat one. // However, we want a flat tree - so we're going to pretend like we are getting one from the parser. 
// @@ -224,7 +221,7 @@ export class Compiler { instructions.push(['TRY_LOAD', objectName]) const flattenProperty = (prop: SyntaxNode): void => { - if (prop.type.id === terms.DotGet) { + if (prop.type.is('DotGet')) { const nestedParts = getDotGetParts(prop, input) const nestedObjectValue = input.slice(nestedParts.object.from, nestedParts.object.to) @@ -233,7 +230,7 @@ export class Compiler { flattenProperty(nestedParts.property) } else { - if (prop.type.id === terms.ParenExpr) { + if (prop.type.is('ParenExpr')) { instructions.push(...this.#compileNode(prop, input)) } else { const propertyValue = input.slice(prop.from, prop.to) @@ -247,7 +244,7 @@ export class Compiler { return instructions } - case terms.BinOp: { + case 'BinOp': { const { left, op, right } = getBinaryParts(node) const instructions: ProgramItem[] = [] instructions.push(...this.#compileNode(left, input)) @@ -295,7 +292,7 @@ export class Compiler { return instructions } - case terms.Assign: { + case 'Assign': { const assignParts = getAssignmentParts(node) const instructions: ProgramItem[] = [] @@ -326,7 +323,7 @@ export class Compiler { return instructions } - case terms.CompoundAssign: { + case 'CompoundAssign': { const { identifier, operator, right } = getCompoundAssignmentParts(node) const identifierName = input.slice(identifier.from, identifier.to) const instructions: ProgramItem[] = [] @@ -388,14 +385,14 @@ export class Compiler { return instructions } - case terms.ParenExpr: { + case 'ParenExpr': { const child = node.firstChild if (!child) return [] // I guess it is empty parentheses? 
return this.#compileNode(child, input) } - case terms.FunctionDef: { + case 'FunctionDef': { const { paramNames, bodyNodes, catchVariable, catchBody, finallyBody } = getFunctionDefParts(node, input) const instructions: ProgramItem[] = [] @@ -441,8 +438,8 @@ export class Compiler { return instructions } - case terms.FunctionCallOrIdentifier: { - if (node.firstChild?.type.id === terms.DotGet) { + case 'FunctionCallOrIdentifier': { + if (node.firstChild?.type.is('DotGet')) { const instructions: ProgramItem[] = [] const callLabel: Label = `.call_dotget_${++this.labelCount}` const afterLabel: Label = `.after_dotget_${++this.labelCount}` @@ -484,8 +481,8 @@ export class Compiler { PUSH 1 ; Named count CALL */ - case terms.FunctionCallWithNewlines: - case terms.FunctionCall: { + + case 'FunctionCall': { const { identifierNode, namedArgs, positionalArgs } = getFunctionCallParts(node, input) const instructions: ProgramItem[] = [] instructions.push(...this.#compileNode(identifierNode, input)) @@ -507,7 +504,7 @@ export class Compiler { return instructions } - case terms.Block: { + case 'Block': { const children = getAllChildren(node) const instructions: ProgramItem[] = [] @@ -522,7 +519,7 @@ export class Compiler { return instructions } - case terms.FunctionCallWithBlock: { + case 'FunctionCallWithBlock': { const [fn, _colon, ...block] = getAllChildren(node) let instructions: ProgramItem[] = [] @@ -540,13 +537,13 @@ export class Compiler { instructions.push(['RETURN']) instructions.push([`${afterLabel}:`]) - if (fn?.type.id === terms.FunctionCallOrIdentifier) { + if (fn?.type.is('FunctionCallOrIdentifier')) { instructions.push(['LOAD', input.slice(fn!.from, fn!.to)]) instructions.push(['MAKE_FUNCTION', [], fnLabel]) instructions.push(['PUSH', 1]) instructions.push(['PUSH', 0]) instructions.push(['CALL']) - } else if (fn?.type.id === terms.FunctionCall) { + } else if (fn?.type.is('FunctionCall')) { let body = this.#compileNode(fn!, input) const namedArgCount = 
(body[body.length - 2]![1] as number) * 2 const startSlice = body.length - namedArgCount - 3 @@ -569,7 +566,7 @@ export class Compiler { return instructions } - case terms.TryExpr: { + case 'TryExpr': { const { tryBlock, catchVariable, catchBody, finallyBody } = getTryExprParts(node, input) return this.#compileTryCatchFinally( @@ -581,9 +578,9 @@ export class Compiler { ) } - case terms.Throw: - case terms.Not: { - const keyword = node.type.id === terms.Throw ? 'Throw' : 'Not' + case 'Throw': + case 'Not': { + const keyword = node.type.is('Throw') ? 'Throw' : 'Not' const children = getAllChildren(node) const [_throwKeyword, expression] = children if (!expression) { @@ -601,7 +598,7 @@ export class Compiler { return instructions } - case terms.IfExpr: { + case 'IfExpr': { const { conditionNode, thenBlock, elseIfBlocks, elseThenBlock } = getIfExprParts( node, input @@ -644,7 +641,7 @@ export class Compiler { } // - `EQ`, `NEQ`, `LT`, `GT`, `LTE`, `GTE` - Pop 2, push boolean - case terms.ConditionalOp: { + case 'ConditionalOp': { const instructions: ProgramItem[] = [] const { left, op, right } = getBinaryParts(node) const leftInstructions: ProgramItem[] = this.#compileNode(left, input) @@ -719,7 +716,7 @@ export class Compiler { return instructions } - case terms.PipeExpr: { + case 'PipeExpr': { const { pipedFunctionCall, pipeReceivers } = getPipeExprParts(node) if (!pipedFunctionCall || pipeReceivers.length === 0) { throw new CompilerError('PipeExpr must have at least two operands', node.from, node.to) @@ -741,11 +738,11 @@ export class Compiler { instructions.push(...this.#compileNode(identifierNode, input)) const isUnderscoreInPositionalArgs = positionalArgs.some( - (arg) => arg.type.id === terms.Underscore + (arg) => arg.type.is('Underscore') ) const isUnderscoreInNamedArgs = namedArgs.some((arg) => { const { valueNode } = getNamedArgParts(arg, input) - return valueNode.type.id === terms.Underscore + return valueNode.type.is('Underscore') }) const 
shouldPushPositionalArg = !isUnderscoreInPositionalArgs && !isUnderscoreInNamedArgs @@ -756,7 +753,7 @@ export class Compiler { } positionalArgs.forEach((arg) => { - if (arg.type.id === terms.Underscore) { + if (arg.type.is('Underscore')) { instructions.push(['LOAD', pipeValName]) } else { instructions.push(...this.#compileNode(arg, input)) @@ -766,7 +763,7 @@ export class Compiler { namedArgs.forEach((arg) => { const { name, valueNode } = getNamedArgParts(arg, input) instructions.push(['PUSH', name]) - if (valueNode.type.id === terms.Underscore) { + if (valueNode.type.is('Underscore')) { instructions.push(['LOAD', pipeValName]) } else { instructions.push(...this.#compileNode(valueNode, input)) @@ -781,14 +778,14 @@ export class Compiler { return instructions } - case terms.Array: { + case 'Array': { const children = getAllChildren(node) // We can easily parse [=] as an empty dict, but `[ = ]` is tougher. // = can be a valid word, and is also valid inside words, so for now we cheat // and check for arrays that look like `[ = ]` to interpret them as // empty dicts - if (children.length === 1 && children[0]!.type.id === terms.Word) { + if (children.length === 1 && children[0]!.type.is('Word')) { const child = children[0]! 
if (input.slice(child.from, child.to) === '=') { return [['MAKE_DICT', 0]] @@ -800,7 +797,7 @@ export class Compiler { return instructions } - case terms.Dict: { + case 'Dict': { const children = getAllChildren(node) const instructions: ProgramItem[] = [] @@ -819,7 +816,7 @@ export class Compiler { return instructions } - case terms.WhileExpr: { + case 'WhileExpr': { const [_while, test, _colon, block] = getAllChildren(node) const instructions: ProgramItem[] = [] @@ -837,11 +834,11 @@ export class Compiler { return instructions } - case terms.Import: { + case 'Import': { const instructions: ProgramItem[] = [] const [_import, ...nodes] = getAllChildren(node) - const args = nodes.filter(node => node.type.id === terms.Identifier) - const namedArgs = nodes.filter(node => node.type.id === terms.NamedArg) + const args = nodes.filter(node => node.type.is('Identifier')) + const namedArgs = nodes.filter(node => node.type.is('NamedArg')) instructions.push(['LOAD', 'import']) @@ -862,13 +859,13 @@ export class Compiler { return instructions } - case terms.Comment: { + case 'Comment': { return [] // ignore comments } default: throw new CompilerError( - `Compiler doesn't know how to handle a "${node.type.name}" (${node.type.id}) node.`, + `Compiler doesn't know how to handle a "${node.type.name}" node.`, node.from, node.to ) diff --git a/src/compiler/utils.ts b/src/compiler/utils.ts index c839644..e3816c2 100644 --- a/src/compiler/utils.ts +++ b/src/compiler/utils.ts @@ -1,5 +1,4 @@ import { CompilerError } from '#compiler/compilerError.ts' -import * as terms from '#parser/shrimp.terms' import type { SyntaxNode, Tree } from '#parser/node' export const checkTreeForErrors = (tree: Tree): CompilerError[] => { @@ -24,7 +23,7 @@ export const getAllChildren = (node: SyntaxNode): SyntaxNode[] => { child = child.nextSibling } - return children.filter((n) => n.type.id !== terms.Comment) + return children.filter((n) => !n.type.is('Comment')) } export const getBinaryParts = (node: 
SyntaxNode) => { @@ -51,12 +50,12 @@ export const getAssignmentParts = (node: SyntaxNode) => { } // array destructuring - if (left && left.type.id === terms.Array) { - const identifiers = getAllChildren(left).filter((child) => child.type.id === terms.Identifier) + if (left && left.type.is('Array')) { + const identifiers = getAllChildren(left).filter((child) => child.type.is('Identifier')) return { arrayPattern: identifiers, right } } - if (!left || left.type.id !== terms.AssignableIdentifier) { + if (!left || !left.type.is('AssignableIdentifier')) { throw new CompilerError( `Assign left child must be an AssignableIdentifier or Array, got ${left ? left.type.name : 'none' }`, @@ -72,7 +71,7 @@ export const getCompoundAssignmentParts = (node: SyntaxNode) => { const children = getAllChildren(node) const [left, operator, right] = children - if (!left || left.type.id !== terms.AssignableIdentifier) { + if (!left || !left.type.is('AssignableIdentifier')) { throw new CompilerError( `CompoundAssign left child must be an AssignableIdentifier, got ${left ? 
left.type.name : 'none' }`, @@ -103,7 +102,7 @@ export const getFunctionDefParts = (node: SyntaxNode, input: string) => { } const paramNames = getAllChildren(paramsNode).map((param) => { - if (param.type.id !== terms.Identifier && param.type.id !== terms.NamedParam) { + if (!param.type.is('Identifier') && !param.type.is('NamedParam')) { throw new CompilerError( `FunctionDef params must be Identifier or NamedParam, got ${param.type.name}`, param.from, @@ -122,7 +121,7 @@ export const getFunctionDefParts = (node: SyntaxNode, input: string) => { let finallyBody: SyntaxNode | undefined for (const child of rest) { - if (child.type.id === terms.CatchExpr) { + if (child.type.is('CatchExpr')) { catchExpr = child const catchChildren = getAllChildren(child) const [_catchKeyword, identifierNode, _colon, body] = catchChildren @@ -135,7 +134,7 @@ export const getFunctionDefParts = (node: SyntaxNode, input: string) => { } catchVariable = input.slice(identifierNode.from, identifierNode.to) catchBody = body - } else if (child.type.id === terms.FinallyExpr) { + } else if (child.type.is('FinallyExpr')) { finallyExpr = child const finallyChildren = getAllChildren(child) const [_finallyKeyword, _colon, body] = finallyChildren @@ -164,9 +163,9 @@ export const getFunctionCallParts = (node: SyntaxNode, input: string) => { throw new CompilerError(`FunctionCall expected at least 1 child, got 0`, node.from, node.to) } - const namedArgs = args.filter((arg) => arg.type.id === terms.NamedArg) + const namedArgs = args.filter((arg) => arg.type.is('NamedArg')) const positionalArgs = args - .filter((arg) => arg.type.id === terms.PositionalArg) + .filter((arg) => arg.type.is('PositionalArg')) .map((arg) => { const child = arg.firstChild if (!child) throw new CompilerError(`PositionalArg has no child`, arg.from, arg.to) @@ -207,13 +206,13 @@ export const getIfExprParts = (node: SyntaxNode, input: string) => { rest.forEach((child) => { const parts = getAllChildren(child) - if (child.type.id === 
terms.ElseExpr) { + if (child.type.is('ElseExpr')) { if (parts.length !== 3) { const message = `ElseExpr expected 1 child, got ${parts.length}` throw new CompilerError(message, child.from, child.to) } elseThenBlock = parts.at(-1) - } else if (child.type.id === terms.ElseIfExpr) { + } else if (child.type.is('ElseIfExpr')) { const [_else, _if, conditional, _colon, thenBlock] = parts if (!conditional || !thenBlock) { const names = parts.map((p) => p.type.name).join(', ') @@ -248,10 +247,10 @@ export const getStringParts = (node: SyntaxNode, input: string) => { // The text is just between the quotes const parts = children.filter((child) => { return ( - child.type.id === terms.StringFragment || - child.type.id === terms.Interpolation || - child.type.id === terms.EscapeSeq || - child.type.id === terms.CurlyString + child.type.is('StringFragment') || + child.type.is('Interpolation') || + child.type.is('EscapeSeq') || + child.type.is('CurlyString') ) }) @@ -259,10 +258,10 @@ export const getStringParts = (node: SyntaxNode, input: string) => { // Validate each part is the expected type parts.forEach((part) => { if ( - part.type.id !== terms.StringFragment && - part.type.id !== terms.Interpolation && - part.type.id !== terms.EscapeSeq && - part.type.id !== terms.CurlyString + !part.type.is('StringFragment') && + !part.type.is('Interpolation') && + !part.type.is('EscapeSeq') && + !part.type.is('CurlyString') ) { throw new CompilerError( `String child must be StringFragment, Interpolation, or EscapeSeq, got ${part.type.name}`, @@ -275,7 +274,7 @@ export const getStringParts = (node: SyntaxNode, input: string) => { // hasInterpolation means the string has interpolation ($var) or escape sequences (\n) // A simple string like 'hello' has one StringFragment but no interpolation const hasInterpolation = parts.some( - (p) => p.type.id === terms.Interpolation || p.type.id === terms.EscapeSeq + (p) => p.type.is('Interpolation') || p.type.is('EscapeSeq') ) return { parts, hasInterpolation 
} } @@ -292,7 +291,7 @@ export const getDotGetParts = (node: SyntaxNode, input: string) => { ) } - if (object.type.id !== terms.IdentifierBeforeDot && object.type.id !== terms.Dollar) { + if (!object.type.is('IdentifierBeforeDot')) { throw new CompilerError( `DotGet object must be an IdentifierBeforeDot, got ${object.type.name}`, object.from, @@ -300,7 +299,7 @@ export const getDotGetParts = (node: SyntaxNode, input: string) => { ) } - if (![terms.Identifier, terms.Number, terms.ParenExpr, terms.DotGet].includes(property.type.id)) { + if (!['Identifier', 'Number', 'ParenExpr', 'DotGet'].includes(property.type.name)) { throw new CompilerError( `DotGet property must be an Identifier, Number, ParenExpr, or DotGet, got ${property.type.name}`, property.from, @@ -334,7 +333,7 @@ export const getTryExprParts = (node: SyntaxNode, input: string) => { let finallyBody: SyntaxNode | undefined rest.forEach((child) => { - if (child.type.id === terms.CatchExpr) { + if (child.type.is('CatchExpr')) { catchExpr = child const catchChildren = getAllChildren(child) const [_catchKeyword, identifierNode, _colon, body] = catchChildren @@ -347,7 +346,7 @@ export const getTryExprParts = (node: SyntaxNode, input: string) => { } catchVariable = input.slice(identifierNode.from, identifierNode.to) catchBody = body - } else if (child.type.id === terms.FinallyExpr) { + } else if (child.type.is('FinallyExpr')) { finallyExpr = child const finallyChildren = getAllChildren(child) const [_finallyKeyword, _colon, body] = finallyChildren diff --git a/src/index.ts b/src/index.ts index c82bb5e..6062fb3 100644 --- a/src/index.ts +++ b/src/index.ts @@ -3,7 +3,7 @@ import { VM, fromValue, toValue, isValue, type Bytecode } from 'reefvm' import { Compiler } from '#compiler/compiler' import { parse } from '#parser/parser2' import { Tree } from '#parser/node' -import { globals as parserGlobals, setGlobals as setParserGlobals } from '#parser/tokenizer' +import { globals as parserGlobals, setGlobals as 
setParserGlobals } from '#parser/parser2' import { globals as prelude } from '#prelude' export { Compiler } from '#compiler/compiler' diff --git a/src/parser/curlyTokenizer.ts b/src/parser/curlyTokenizer.ts index 9495bc0..95e5af6 100644 --- a/src/parser/curlyTokenizer.ts +++ b/src/parser/curlyTokenizer.ts @@ -1,7 +1,6 @@ -import { parser } from '#parser/shrimp.ts' import { parse } from '#parser/parser2' import type { SyntaxNode } from '#parser/node' -import { isIdentStart, isIdentChar } from './tokenizer' +import { isIdentStart, isIdentChar } from './tokenizer2' // Turns a { curly string } into strings and nodes for interpolation export const tokenizeCurlyString = (value: string): (string | [string, SyntaxNode])[] => { diff --git a/src/parser/node.ts b/src/parser/node.ts index 8816fe4..ca256de 100644 --- a/src/parser/node.ts +++ b/src/parser/node.ts @@ -1,5 +1,4 @@ import { type Token, TokenType } from './tokenizer2' -import * as term from './shrimp.terms' export type NodeType = | 'Program' @@ -140,183 +139,6 @@ export class Tree { } } -// TODO: TEMPORARY SHIM -class SyntaxNodeType { - constructor(public nodeType: NodeType, public isError: boolean) { } - - is(other: string) { - return this.nodeType === other - } - - get id(): number { - switch (this.nodeType) { - case 'Program': - return term.Program - - case 'Block': - return term.Block - - case 'FunctionCall': - return term.FunctionCall - - case 'FunctionCallOrIdentifier': - return term.FunctionCallOrIdentifier - - case 'FunctionCallWithBlock': - return term.FunctionCallWithBlock - - case 'PositionalArg': - return term.PositionalArg - - case 'NamedArg': - return term.NamedArg - - case 'FunctionDef': - return term.FunctionDef - - case 'Params': - return term.Params - - case 'NamedParam': - return term.NamedParam - - case 'Null': - return term.Null - - case 'Boolean': - return term.Boolean - - case 'Number': - return term.Number - - case 'String': - return term.String - - case 'StringFragment': - return 
term.StringFragment - - case 'CurlyString': - return term.CurlyString - - case 'DoubleQuote': - return term.DoubleQuote - - case 'EscapeSeq': - return term.EscapeSeq - - case 'Interpolation': - return term.Interpolation - - case 'Regex': - return term.Regex - - case 'Identifier': - return term.Identifier - - case 'AssignableIdentifier': - return term.AssignableIdentifier - - case 'IdentifierBeforeDot': - return term.IdentifierBeforeDot - - case 'Word': - return term.Word - - case 'Array': - return term.Array - - case 'Dict': - return term.Dict - - case 'Comment': - return term.Comment - - case 'BinOp': - return term.BinOp - - case 'ConditionalOp': - return term.ConditionalOp - - case 'ParenExpr': - return term.ParenExpr - - case 'Assign': - return term.Assign - - case 'CompoundAssign': - return term.CompoundAssign - - case 'DotGet': - return term.DotGet - - case 'PipeExpr': - return term.PipeExpr - - case 'IfExpr': - return term.IfExpr - - case 'ElseIfExpr': - return term.ElseIfExpr - - case 'ElseExpr': - return term.ElseExpr - - case 'WhileExpr': - return term.WhileExpr - - case 'TryExpr': - return term.TryExpr - - case 'CatchExpr': - return term.CatchExpr - - case 'FinallyExpr': - return term.FinallyExpr - - case 'Throw': - return term.Throw - - case 'Not': - return term.Not - - case 'Eq': - return term.Eq - - case 'Modulo': - return term.Modulo - - case 'Plus': - return term.Plus - - case 'Star': - return term.Star - - case 'Slash': - return term.Slash - - case 'Import': - return term.Import - - case 'Do': - return term.Do - - case 'Underscore': - return term.Underscore - - case 'colon': - return term.colon - - case 'keyword': - return term.keyword - } - return 0 - } - - get name(): string { - return this.nodeType - } -} - export class SyntaxNode { #type: NodeType #isError = false @@ -336,8 +158,13 @@ export class SyntaxNode { return new SyntaxNode(TokenType[token.type] as NodeType, token.from, token.to, parent ?? 
null) } - get type(): SyntaxNodeType { - return new SyntaxNodeType(this.#type, this.#isError) + get type(): { type: NodeType, name: NodeType, isError: boolean, is: (other: NodeType) => boolean } { + return { + type: this.#type, + name: this.#type, + isError: this.#isError, + is: (other: NodeType) => other === this.#type + } } set type(name: NodeType) { diff --git a/src/parser/operatorTokenizer.ts b/src/parser/operatorTokenizer.ts deleted file mode 100644 index 1ef7a94..0000000 --- a/src/parser/operatorTokenizer.ts +++ /dev/null @@ -1,99 +0,0 @@ -import { ExternalTokenizer, InputStream } from '@lezer/lr' -import * as terms from './shrimp.terms' - -type Operator = { str: string; tokenName: keyof typeof terms } -const operators: Array = [ - { str: 'and', tokenName: 'And' }, - { str: 'or', tokenName: 'Or' }, - { str: 'band', tokenName: 'Band' }, - { str: 'bor', tokenName: 'Bor' }, - { str: 'bxor', tokenName: 'Bxor' }, - { str: '>>>', tokenName: 'Ushr' }, // Must come before >> - { str: '>>', tokenName: 'Shr' }, - { str: '<<', tokenName: 'Shl' }, - { str: '>=', tokenName: 'Gte' }, - { str: '<=', tokenName: 'Lte' }, - { str: '!=', tokenName: 'Neq' }, - { str: '==', tokenName: 'EqEq' }, - - // Compound assignment operators (must come before single-char operators) - { str: '??=', tokenName: 'NullishEq' }, - { str: '+=', tokenName: 'PlusEq' }, - { str: '-=', tokenName: 'MinusEq' }, - { str: '*=', tokenName: 'StarEq' }, - { str: '/=', tokenName: 'SlashEq' }, - { str: '%=', tokenName: 'ModuloEq' }, - - // Nullish coalescing (must come before it could be mistaken for other tokens) - { str: '??', tokenName: 'NullishCoalesce' }, - - // Single-char operators - { str: '*', tokenName: 'Star' }, - { str: '=', tokenName: 'Eq' }, - { str: '/', tokenName: 'Slash' }, - { str: '+', tokenName: 'Plus' }, - { str: '-', tokenName: 'Minus' }, - { str: '>', tokenName: 'Gt' }, - { str: '<', tokenName: 'Lt' }, - { str: '%', tokenName: 'Modulo' }, -] - -export const operatorTokenizer = new 
ExternalTokenizer((input: InputStream) => { - for (let operator of operators) { - if (!matchesString(input, 0, operator.str)) continue - const afterOpPos = operator.str.length - const charAfterOp = input.peek(afterOpPos) - if (!isWhitespace(charAfterOp)) continue - - // Accept the operator token - const token = terms[operator.tokenName] - if (token === undefined) { - throw new Error(`Unknown token name: ${operator.tokenName}`) - } - - input.advance(afterOpPos) - input.acceptToken(token) - - return - } -}) - -const isWhitespace = (ch: number): boolean => { - return matchesChar(ch, [' ', '\t', '\n']) -} - -const matchesChar = (ch: number, chars: (string | number)[]): boolean => { - for (const c of chars) { - if (typeof c === 'number') { - if (ch === c) { - return true - } - } else if (ch === c.charCodeAt(0)) { - return true - } - } - return false -} - -const matchesString = (input: InputStream, pos: number, str: string): boolean => { - for (let i = 0; i < str.length; i++) { - if (input.peek(pos + i) !== str.charCodeAt(i)) { - return false - } - } - return true -} - -const peek = (numChars: number, input: InputStream): string => { - let result = '' - for (let i = 0; i < numChars; i++) { - const ch = input.peek(i) - if (ch === -1) { - result += 'EOF' - break - } else { - result += String.fromCharCode(ch) - } - } - return result -} diff --git a/src/parser/parser2.ts b/src/parser/parser2.ts index e96fa42..5cfa672 100644 --- a/src/parser/parser2.ts +++ b/src/parser/parser2.ts @@ -1,11 +1,17 @@ import { CompilerError } from '#compiler/compilerError' import { Scanner, type Token, TokenType } from './tokenizer2' import { SyntaxNode, operators, precedence, conditionals, compounds } from './node' -import { globals } from './tokenizer' import { parseString } from './stringParser' const $T = TokenType +// tell the dotGet searcher about builtin globals +export const globals: string[] = [] +export const setGlobals = (newGlobals: string[] | Record<string, unknown>) => { + globals.length = 0 + 
globals.push(...(Array.isArray(newGlobals) ? newGlobals : Object.keys(newGlobals))) +} + export const parse = (input: string): SyntaxNode => { const parser = new Parser() return parser.parse(input) diff --git a/src/parser/parserScopeContext.ts b/src/parser/parserScopeContext.ts deleted file mode 100644 index 7ce09e0..0000000 --- a/src/parser/parserScopeContext.ts +++ /dev/null @@ -1,129 +0,0 @@ -import { ContextTracker, InputStream } from '@lezer/lr' -import * as terms from './shrimp.terms' - -export class Scope { - constructor(public parent: Scope | null, public vars = new Set()) { } - - has(name: string): boolean { - return this.vars.has(name) || (this.parent?.has(name) ?? false) - } - - hash(): number { - let h = 0 - for (const name of this.vars) { - for (let i = 0; i < name.length; i++) { - h = (h << 5) - h + name.charCodeAt(i) - h |= 0 - } - } - if (this.parent) { - h = (h << 5) - h + this.parent.hash() - h |= 0 - } - return h - } - - // Static methods that return new Scopes (immutable operations) - - static add(scope: Scope, ...names: string[]): Scope { - const newVars = new Set(scope.vars) - names.forEach((name) => newVars.add(name)) - return new Scope(scope.parent, newVars) - } - - push(): Scope { - return new Scope(this, new Set()) - } - - pop(): Scope { - return this.parent ?? 
this - } -} - -// Tracker context that combines Scope with temporary pending identifiers -class TrackerContext { - constructor(public scope: Scope, public pendingIds: string[] = []) { } -} - -// Extract identifier text from input stream -const readIdentifierText = (input: InputStream, start: number, end: number): string => { - let text = '' - for (let i = start; i < end; i++) { - const offset = i - input.pos - const ch = input.peek(offset) - if (ch === -1) break - text += String.fromCharCode(ch) - } - return text -} - -let inParams = false - -export const trackScope = new ContextTracker({ - start: new TrackerContext(new Scope(null, new Set())), - - shift(context, term, stack, input) { - if (term == terms.Do) inParams = true - - if (term === terms.AssignableIdentifier) { - const text = readIdentifierText(input, input.pos, stack.pos) - return new TrackerContext(Scope.add(context.scope, text), context.pendingIds) - } - - if (inParams && term === terms.Identifier) { - const text = readIdentifierText(input, input.pos, stack.pos) - return new TrackerContext(context.scope, [...context.pendingIds, text]) - } - - // Track identifiers in array destructuring: [ a b ] = ... - if (!inParams && term === terms.Identifier && isArrayDestructuring(input)) { - const text = readIdentifierText(input, input.pos, stack.pos) - return new TrackerContext(Scope.add(context.scope, text), context.pendingIds) - } - - return context - }, - - reduce(context, term) { - if (term === terms.Params) { - inParams = false - let newScope = context.scope.push() - if (context.pendingIds.length > 0) { - newScope = Scope.add(newScope, ...context.pendingIds) - } - return new TrackerContext(newScope, []) - } - - // Pop scope when exiting function - if (term === terms.FunctionDef) { - return new TrackerContext(context.scope.pop(), []) - } - - return context - }, - - hash: (context) => context.scope.hash(), -}) - -// Check if we're parsing array destructuring: [ a b ] = ... 
-const isArrayDestructuring = (input: InputStream): boolean => { - let pos = 0 - - // Find closing bracket - while (pos < 200 && input.peek(pos) !== 93 /* ] */) { - if (input.peek(pos) === -1) return false // EOF - pos++ - } - - if (input.peek(pos) !== 93 /* ] */) return false - pos++ - - // Skip whitespace - while (input.peek(pos) === 32 /* space */ || - input.peek(pos) === 9 /* tab */ || - input.peek(pos) === 10 /* \n */) { - pos++ - } - - return input.peek(pos) === 61 /* = */ -} \ No newline at end of file diff --git a/src/parser/shrimp.grammar b/src/parser/shrimp.grammar deleted file mode 100644 index e7cd9ee..0000000 --- a/src/parser/shrimp.grammar +++ /dev/null @@ -1,299 +0,0 @@ -@external propSource highlighting from "./highlight" - -@context trackScope from "./parserScopeContext" - -@skip { space | Comment } - -@top Program { item* } - -@external tokens operatorTokenizer from "./operatorTokenizer" { Star, Slash, Plus, Minus, And, Or, Eq, EqEq, Neq, Lt, Lte, Gt, Gte, Modulo, PlusEq, MinusEq, StarEq, SlashEq, ModuloEq, Band, Bor, Bxor, Shl, Shr, Ushr, NullishCoalesce, NullishEq } - -@tokens { - @precedence { Number Regex } - - StringFragment { !['\\$]+ } - DoubleQuote { '"' !["]* '"' } - NamedArgPrefix { $[a-z] $[a-z0-9-]* "=" } - Number { - ("-" | "+")? "0x" $[0-9a-fA-F]+ | - ("-" | "+")? "0b" $[01]+ | - ("-" | "+")? "0o" $[0-7]+ | - ("-" | "+")? $[0-9]+ ("_"? $[0-9]+)* ('.' $[0-9]+ ("_"? $[0-9]+)*)? - } - Boolean { "true" | "false" } - semicolon { ";" } - eof { @eof } - space { " " | "\t" } - Comment { "#" ![\n]* } - leftParen { "(" } - rightParen { ")" } - colon[closedBy="end", @name="colon"] { ":" } - Underscore { "_" } - Dollar { "$" } - Regex { "//" (![/\\\n[] | "\\" ![\n] | "[" (![\n\\\]] | "\\" ![\n])* "]")+ ("//" $[gimsuy]*)? 
} // Stolen from the lezer JavaScript grammar - "|"[@name=operator] -} - -newlineOrSemicolon { newline | semicolon } - -end { @specialize[@name=keyword] } -while { @specialize[@name=keyword] } -if { @specialize[@name=keyword] } -else { @specialize[@name=keyword] } -try { @specialize[@name=keyword] } -catch { @specialize[@name=keyword] } -finally { @specialize[@name=keyword] } -throw { @specialize[@name=keyword] } -not { @specialize[@name=keyword] } -import { @specialize[@name=keyword] } -null { @specialize[@name=Null] } - -@external tokens tokenizer from "./tokenizer" { Identifier, AssignableIdentifier, Word, IdentifierBeforeDot, CurlyString } -@external tokens pipeStartsLineTokenizer from "./tokenizer" { newline, pipeStartsLine } -@external specialize {Identifier} specializeKeyword from "./tokenizer" { Do } - -@precedence { - pipe @left, - or @left, - and @left, - nullish @left, - comparison @left, - multiplicative @left, - additive @left, - bitwise @left, - call, - functionWithNewlines -} - -item { - consumeToTerminator newlineOrSemicolon | - consumeToTerminator eof | - newlineOrSemicolon // allow blank lines -} - -consumeToTerminator { - PipeExpr | - WhileExpr | - FunctionCallWithBlock | - ambiguousFunctionCall | - TryExpr | - Throw | - Not | - Import | - IfExpr | - FunctionDef | - CompoundAssign | - Assign | - BinOp | - ConditionalOp | - expressionWithoutIdentifier -} - -PipeExpr { - pipeOperand (!pipe (pipeStartsLine? "|") newlineOrSemicolon* pipeOperand)+ -} - -pipeOperand { - consumeToTerminator -} - -WhileExpr { - while (ConditionalOp | expression) colon Block end -} - -Block { - consumeToTerminator | newlineOrSemicolon block -} - -FunctionCallWithBlock { - ambiguousFunctionCall colon Block CatchExpr? FinallyExpr? 
end -} - -FunctionCallOrIdentifier { - DotGet | Identifier -} - -ambiguousFunctionCall { - FunctionCall | FunctionCallOrIdentifier -} - -FunctionCall { - (DotGet | Identifier | ParenExpr) arg+ -} - -arg { - PositionalArg | NamedArg -} - -PositionalArg { - expression | FunctionDef | Underscore -} - -NamedArg { - NamedArgPrefix (expression | FunctionDef | Underscore) -} - -FunctionDef { - Do Params colon (consumeToTerminator | newlineOrSemicolon block) CatchExpr? FinallyExpr? end -} - -ifTest { - ConditionalOp | expression | FunctionCall -} - -IfExpr { - if ifTest colon Block ElseIfExpr* ElseExpr? end -} - -ElseIfExpr { - else if ifTest colon Block -} - -ElseExpr { - else colon Block -} - -TryExpr { - try colon Block CatchExpr? FinallyExpr? end -} - -CatchExpr { - catch Identifier colon Block -} - -FinallyExpr { - finally colon Block -} - -Throw { - throw (BinOp | ConditionalOp | expression) -} - -Not { - not (BinOp | ConditionalOp | expression) -} - -// this has to be in the parse tree so the scope tracker can use it -Import { - import NamedArg* Identifier+ NamedArg* -} - -ConditionalOp { - expression !comparison EqEq expression | - expression !comparison Neq expression | - expression !comparison Lt expression | - expression !comparison Lte expression | - expression !comparison Gt expression | - expression !comparison Gte expression | - (expression | ConditionalOp) !and And (expression | ConditionalOp) | - (expression | ConditionalOp) !or Or (expression | ConditionalOp) | - (expression | ConditionalOp) !nullish NullishCoalesce (expression | ConditionalOp) -} - -Params { - Identifier* NamedParam* -} - -NamedParam { - NamedArgPrefix (String | Number | Boolean | null) -} - -Assign { - (AssignableIdentifier | Array) Eq consumeToTerminator -} - -CompoundAssign { - AssignableIdentifier (PlusEq | MinusEq | StarEq | SlashEq | ModuloEq | NullishEq) consumeToTerminator -} - -BinOp { - expression !multiplicative Modulo expression | - (expression | BinOp) !multiplicative Star 
(expression | BinOp) | - (expression | BinOp) !multiplicative Slash (expression | BinOp) | - (expression | BinOp) !additive Plus (expression | BinOp) | - (expression | BinOp) !additive Minus (expression | BinOp) | - (expression | BinOp) !bitwise Band (expression | BinOp) | - (expression | BinOp) !bitwise Bor (expression | BinOp) | - (expression | BinOp) !bitwise Bxor (expression | BinOp) | - (expression | BinOp) !bitwise Shl (expression | BinOp) | - (expression | BinOp) !bitwise Shr (expression | BinOp) | - (expression | BinOp) !bitwise Ushr (expression | BinOp) -} - -ParenExpr { - leftParen newlineOrSemicolon* ( - FunctionCallWithNewlines | - IfExpr | - ambiguousFunctionCall | - BinOp newlineOrSemicolon* | - expressionWithoutIdentifier | - ConditionalOp newlineOrSemicolon* | - PipeExpr | - FunctionDef - ) - rightParen -} - -FunctionCallWithNewlines[@name=FunctionCall] { - (DotGet | Identifier | ParenExpr) newlineOrSemicolon+ arg !functionWithNewlines (newlineOrSemicolon+ arg)* newlineOrSemicolon* -} - -expression { - expressionWithoutIdentifier | DotGet | Identifier -} - - -@local tokens { - dot { "." } -} - -@skip {} { - DotGet { - IdentifierBeforeDot dot (DotGet | Number | Identifier | ParenExpr) | - Dollar dot (DotGet | Number | Identifier | ParenExpr) - } - - String { - "'" stringContent* "'" | CurlyString | DoubleQuote - } -} - -stringContent { - StringFragment | - Interpolation | - EscapeSeq -} - -Interpolation { - "$" FunctionCallOrIdentifier | - "$" ParenExpr -} - -EscapeSeq { - "\\" ("$" | "n" | "t" | "r" | "\\" | "'") -} - -Dict { - "[=]" | - "[" newlineOrSemicolon* NamedArg (newlineOrSemicolon | NamedArg)* "]" -} - -Array { - "[" newlineOrSemicolon* (expression (newlineOrSemicolon | expression)*)? "]" -} - -// We need expressionWithoutIdentifier to avoid conflicts in consumeToTerminator. 
-// Without this, when parsing "my-var" at statement level, the parser can't decide: -// - ambiguousFunctionCall → FunctionCallOrIdentifier → Identifier -// - expression → Identifier -// Both want the same Identifier token! So we use expressionWithoutIdentifier -// to remove Identifier from the second path, forcing standalone identifiers -// to go through ambiguousFunctionCall (which is what we want semantically). -// Yes, it is annoying and I gave up trying to use GLR to fix it. -expressionWithoutIdentifier { - ParenExpr | Word | String | Number | Boolean | Regex | Dict | Array | null -} - -block { - (consumeToTerminator? newlineOrSemicolon)* -} \ No newline at end of file diff --git a/src/parser/shrimp.grammar.d.ts b/src/parser/shrimp.grammar.d.ts deleted file mode 100644 index 248618c..0000000 --- a/src/parser/shrimp.grammar.d.ts +++ /dev/null @@ -1,4 +0,0 @@ -declare module '*.grammar' { - const content: string - export default content -} diff --git a/src/parser/shrimp.terms.ts b/src/parser/shrimp.terms.ts deleted file mode 100644 index c0c8482..0000000 --- a/src/parser/shrimp.terms.ts +++ /dev/null @@ -1,82 +0,0 @@ -// This file was generated by lezer-generator. You probably shouldn't edit it. 
-export const - Star = 1, - Slash = 2, - Plus = 3, - Minus = 4, - And = 5, - Or = 6, - Eq = 7, - EqEq = 8, - Neq = 9, - Lt = 10, - Lte = 11, - Gt = 12, - Gte = 13, - Modulo = 14, - PlusEq = 15, - MinusEq = 16, - StarEq = 17, - SlashEq = 18, - ModuloEq = 19, - Band = 20, - Bor = 21, - Bxor = 22, - Shl = 23, - Shr = 24, - Ushr = 25, - NullishCoalesce = 26, - NullishEq = 27, - Identifier = 28, - AssignableIdentifier = 29, - Word = 30, - IdentifierBeforeDot = 31, - CurlyString = 32, - newline = 103, - pipeStartsLine = 104, - Do = 33, - Comment = 34, - Program = 35, - PipeExpr = 36, - WhileExpr = 38, - keyword = 86, - ConditionalOp = 40, - ParenExpr = 41, - FunctionCallWithNewlines = 42, - DotGet = 43, - Number = 44, - Dollar = 45, - PositionalArg = 46, - FunctionDef = 47, - Params = 48, - NamedParam = 49, - NamedArgPrefix = 50, - String = 51, - StringFragment = 52, - Interpolation = 53, - FunctionCallOrIdentifier = 54, - EscapeSeq = 55, - DoubleQuote = 56, - Boolean = 57, - Null = 58, - colon = 59, - CatchExpr = 60, - Block = 62, - FinallyExpr = 63, - Underscore = 66, - NamedArg = 67, - IfExpr = 68, - FunctionCall = 70, - ElseIfExpr = 71, - ElseExpr = 73, - BinOp = 74, - Regex = 75, - Dict = 76, - Array = 77, - FunctionCallWithBlock = 78, - TryExpr = 79, - Throw = 81, - Not = 83, - Import = 85, - CompoundAssign = 87, - Assign = 88 diff --git a/src/parser/shrimp.ts b/src/parser/shrimp.ts deleted file mode 100644 index d99da52..0000000 --- a/src/parser/shrimp.ts +++ /dev/null @@ -1,27 +0,0 @@ -// This file was generated by lezer-generator. You probably shouldn't edit it. 
-import {LRParser, LocalTokenGroup} from "@lezer/lr" -import {operatorTokenizer} from "./operatorTokenizer" -import {tokenizer, pipeStartsLineTokenizer, specializeKeyword} from "./tokenizer" -import {trackScope} from "./parserScopeContext" -import {highlighting} from "./highlight" -const spec_Identifier = {__proto__:null,while:78, null:116, catch:122, finally:128, end:130, if:138, else:144, try:160, throw:164, not:168, import:172} -export const parser = LRParser.deserialize({ - version: 14, - states: "?tQYQ!SOOOOQ!Q'#Em'#EmO!vO!bO'#DXO%nQ!TO'#DdO&XOSO'#DaOOQ!R'#Da'#DaO)VQ!TO'#EpOOQ!Q'#E}'#E}O)sQRO'#DxO+{Q!TO'#ElO,iQ!SO'#DVOOQ!R'#Dz'#DzO/^Q!SO'#D{OOQ!R'#Ep'#EpO/eQ!TO'#EpO1iQ!TO'#EoO2wQ!TO'#ElO3UQRO'#EVOOQ!Q'#El'#ElO3mQ!SO'#ElO3tQrO'#EkOOQ!Q'#Ek'#EkOOQ!Q'#EX'#EXQYQ!SOOO4VQbO'#D]O4bQbO'#DrO5`QbO'#DSO6^QQO'#D}O5`QbO'#EPO5`QbO'#ERO6cQbO'#ETO6kObO,59sOOQ!Q'#D['#D[O6|QbO'#DqOOQ!Q'#Es'#EsOOQ!Q'#Ea'#EaO7WQ!SO,5:`OOQ!R'#Eo'#EoO8WQbO'#DcO8fQWO'#DeOOOO'#Eu'#EuOOOO'#E^'#E^O8zOSO,59{OOQ!R,59{,59{O5`QbO,5:dO5`QbO,5:dO5`QbO,5:dO5`QbO,5:dO5`QbO,59pO5`QbO,59pO5`QbO,59pO5`QbO,59pOOQ!Q'#EZ'#EZO,iQ!SO,59qO9YQ!TO'#DdO9dQ!TO'#EpO9nQsO,59qO9{QQO,59qO:QQrO,59qO:]QrO,59qO:kQsO,59qO;ZQsO,59qO;bQrO'#DQO;jQ!SO,5:gO;qQrO,5:fOOQ!R,5:g,5:gOfQQO'#EYOOQ!Q-E8V-E8VOOQ!Q'#E['#E[O>kQbO'#D^O>vQbO'#D_OOQO'#E]'#E]O>nQQO'#D^O?[QQO,59wO?aQcO'#EoO@^QRO'#E|OAZQRO'#E|OOQO'#E|'#E|OAbQQO,5:^OAgQRO,59nOAnQRO,59nOYQ!SO,5:iOA|Q!TO,5:kOCbQ!TO,5:kODUQ!TO,5:kODcQ!TO,5:mOEwQ!TO,5:mOFkQ!TO,5:mOFxQ!SO,5:oOOQ!Q'#Ee'#EeO6cQbO,5:oOOQ!R1G/_1G/_OOQ!Q,5:],5:]OOQ!Q-E8_-E8_OOOO'#Dd'#DdOOOO,59},59}OOOO,5:P,5:POOOO-E8[-E8[OOQ!R1G/g1G/gOOQ!R1G0O1G0OOH}Q!TO1G0OOIXQ!TO1G0OOJmQ!TO1G0OOJwQ!TO1G0OOKUQ!TO1G0OOOQ!R1G/[1G/[OLmQ!TO1G/[OLtQ!TO1G/[OL{Q!TO1G/[ONQQ!TO1G/[OMSQ!TO1G/[OOQ!Q-E8X-E8XONhQsO1G/]ONuQQO1G/]ONzQrO1G/]O! VQrO1G/]O! eQsO1G/]O! lQsO1G/]O! sQ!SO,59rO! 
}QrO1G/]OOQ!R1G/]1G/]O!!YQrO1G0QOOQ!R1G0R1G0RO!!hQ!SO1G0ROOQp'#Ec'#EcO!!YQrO1G0QOOQ!R1G0Q1G0QOOQ!Q'#Ed'#EdO!!hQ!SO1G0RO!!uQ!SO1G0^O!#gQ!SO1G0]O!$XQ!SO'#DlO!$mQ!SO'#DlO!$}QbO1G0SOOQ!Q-E8W-E8WOYQ!SO,5:tOOQ!Q,5:t,5:tOYQ!SO,5:tOOQ!Q-E8Y-E8YO!%YQQO,59xOOQO,59y,59yOOQO-E8Z-E8ZOYQ!SO1G/cOYQ!SO1G/xOYQ!SO1G/YO!%bQbO1G0TO!%mQ!SO1G0ZO!&bQ!SO1G0ZOOQ!Q-E8c-E8cO!&iQrO7+$wOOQ!R7+$w7+$wO!&tQrO1G/^O!'PQrO7+%lOOQ!R7+%l7+%lO!'_Q!SO7+%mOOQ!R7+%m7+%mOOQp-E8a-E8aOOQ!Q-E8b-E8bOOQ!Q'#E_'#E_O!'lQrO'#E_O!'zQ!SO'#E{OOQ`,5:W,5:WO!([QbO'#DjO!(aQQO'#DmOOQ!Q7+%n7+%nO!(fQbO7+%nO!(kQbO7+%nOOQ!Q1G0`1G0`OYQ!SO1G0`O!(sQ!SO7+$}O!)UQ!SO7+$}O!)cQbO7+%dO!)kQbO7+$tOOQ!Q7+%o7+%oO!)pQbO7+%oO!)uQbO7+%oO!)}Q!SO7+%uOOQ!R<tAN>tOOQ!QAN>TAN>TO!,wQbOAN>TO!,|QbOAN>TOOQ`-E8`-E8`OOQ!QAN>jAN>jO!-UQbOAN>jO4bQbO,5:aOYQ!SO,5:cOOQ!QAN>uAN>uP! sQ!SO'#EZOOQ`7+%[7+%[OOQ!QG23oG23oO!-ZQbOG23oP!,ZQbO'#DuOOQ!QG24UG24UO!-`QQO1G/{OOQ`1G/}1G/}OOQ!QLD)ZLD)ZOYQ!SO7+%gOOQ`<S!`#O$O#P;'S$O;'S;=`$g<%lO$OU>XV!USOt$Ouw$Ox#O$O#P#Q>n#Q;'S$O;'S;=`$g<%lO$OU>uU#sQ!USOt$Ouw$Ox#O$O#P;'S$O;'S;=`$g<%lO$O~?^O#k~U?eU#uQ!USOt$Ouw$Ox#O$O#P;'S$O;'S;=`$g<%lO$OU@OU!US!dQOt$Ouw$Ox#O$O#P;'S$O;'S;=`$g<%lO$OU@g^!USOt$Ouw$Ox}$O}!O@b!O!Q$O!Q![@b![!_$O!_!`Ac!`#O$O#P#T$O#T#o@b#o;'S$O;'S;=`$g<%lO$OUAjU!SQ!USOt$Ouw$Ox#O$O#P;'S$O;'S;=`$g<%lO$OUBR_!USOt$Ouw$Ox}$O}!O@b!O!Q$O!Q![@b![!_$O!_!`Ac!`#O$O#P#T$O#T#UCQ#U#o@b#o;'S$O;'S;=`$g<%lO$OUCV`!USOt$Ouw$Ox}$O}!O@b!O!Q$O!Q![@b![!_$O!_!`Ac!`#O$O#P#T$O#T#`@b#`#aDX#a#o@b#o;'S$O;'S;=`$g<%lO$OUD^`!USOt$Ouw$Ox}$O}!O@b!O!Q$O!Q![@b![!_$O!_!`Ac!`#O$O#P#T$O#T#g@b#g#hE`#h#o@b#o;'S$O;'S;=`$g<%lO$OUEe`!USOt$Ouw$Ox}$O}!O@b!O!Q$O!Q![@b![!_$O!_!`Ac!`#O$O#P#T$O#T#X@b#X#YFg#Y#o@b#o;'S$O;'S;=`$g<%lO$OUFn^!ZQ!USOt$Ouw$Ox}$O}!O@b!O!Q$O!Q![@b![!_$O!_!`Ac!`#O$O#P#T$O#T#o@b#o;'S$O;'S;=`$g<%lO$O^Gq^#lW!USOt$Ouw$Ox}$O}!O@b!O!Q$O!Q![@b![!_$O!_!`Ac!`#O$O#P#T$O#T#o@b#o;'S$O;'S;=`$g<%lO$O^Ht^#nW!USOt$Ouw$Ox}$O}!O@b!O!Q$O!Q![@b![!_$O!_!`Ac!`#O$O#P#T$O#T#o@b#o;'S$O;'S;=`$g<%lO$O^Iw`#mW!USOt$Ouw$Ox}$O}!O@b!O!Q$O!Q![@b![!_$O!_!`Ac!`#O$O#P#T$O#T#f@b#f#gJy#g#
o@b#o;'S$O;'S;=`$g<%lO$OUKO`!USOt$Ouw$Ox}$O}!O@b!O!Q$O!Q![@b![!_$O!_!`Ac!`#O$O#P#T$O#T#i@b#i#jE`#j#o@b#o;'S$O;'S;=`$g<%lO$OULXUuQ!USOt$Ouw$Ox#O$O#P;'S$O;'S;=`$g<%lO$O~LpO#v~", - tokenizers: [operatorTokenizer, 1, 2, 3, tokenizer, pipeStartsLineTokenizer, new LocalTokenGroup("[~RP!O!PU~ZO#f~~", 11)], - topRules: {"Program":[0,35]}, - specialized: [{term: 28, get: (value: any, stack: any) => (specializeKeyword(value, stack) << 1), external: specializeKeyword},{term: 28, get: (value: keyof typeof spec_Identifier) => spec_Identifier[value] || -1}], - tokenPrec: 2711 -}) diff --git a/src/parser/tests/basics.test.ts b/src/parser/tests/basics.test.ts index c223834..9d8bcd4 100644 --- a/src/parser/tests/basics.test.ts +++ b/src/parser/tests/basics.test.ts @@ -1,7 +1,5 @@ import { expect, describe, test } from 'bun:test' -import '../shrimp.grammar' // Importing this so changes cause it to retest! - describe('null', () => { test('parses null', () => { expect('null').toMatchTree(`Null null`) @@ -370,8 +368,8 @@ describe('Parentheses', () => { }) test('function call with named args on multiple lines in parens', () => { - expect(`(tail - arg1=true + expect(`(tail + arg1=true arg2=30 )`).toMatchTree(` ParenExpr @@ -386,8 +384,8 @@ describe('Parentheses', () => { `) expect(`( - tail - arg1=true + tail + arg1=true arg2=30 )`).toMatchTree(` ParenExpr @@ -425,7 +423,7 @@ describe('Parentheses', () => { }) test('function call with multiple identifiers on separate lines in parens', () => { - expect(`(echo + expect(`(echo arg1 arg2 arg3 @@ -443,8 +441,8 @@ describe('Parentheses', () => { test('function call with mulitline identifiers starting separate lines in parens', () => { expect(`( - - echo + + echo arg1 arg2 arg3 diff --git a/src/parser/tests/bitwise.test.ts b/src/parser/tests/bitwise.test.ts index 5ccc8ed..678440c 100644 --- a/src/parser/tests/bitwise.test.ts +++ b/src/parser/tests/bitwise.test.ts @@ -1,7 +1,5 @@ import { expect, describe, test } from 'bun:test' -import 
'../shrimp.grammar' // Importing this so changes cause it to retest! - describe('bitwise operators - grammar', () => { test('parses band (bitwise AND)', () => { expect('5 band 3').toMatchTree(` diff --git a/src/parser/tests/control-flow.test.ts b/src/parser/tests/control-flow.test.ts index 79d23e6..4e776ee 100644 --- a/src/parser/tests/control-flow.test.ts +++ b/src/parser/tests/control-flow.test.ts @@ -1,7 +1,5 @@ import { expect, describe, test } from 'bun:test' -import '../shrimp.grammar' // Importing this so changes cause it to retest! - describe('if/else if/else', () => { test('parses single line if', () => { expect(`if y == 1: 'cool' end`).toMatchTree(` @@ -317,8 +315,8 @@ describe('while', () => { test('multiline infinite loop', () => { expect(` - while true: - true + while true: + true end`).toMatchTree(` WhileExpr keyword while @@ -331,7 +329,7 @@ describe('while', () => { test('multiline basic expression', () => { expect(` - while a > 0: + while a > 0: true end`).toMatchTree(` WhileExpr @@ -349,8 +347,8 @@ describe('while', () => { test('multiline compound expression', () => { expect(` - while a > 0 and b < 100 and c < 1000: - true + while a > 0 and b < 100 and c < 1000: + true end`).toMatchTree(` WhileExpr keyword while diff --git a/src/parser/tests/destructuring.test.ts b/src/parser/tests/destructuring.test.ts index ae17a27..f3aa839 100644 --- a/src/parser/tests/destructuring.test.ts +++ b/src/parser/tests/destructuring.test.ts @@ -1,7 +1,5 @@ import { expect, describe, test } from 'bun:test' -import '../shrimp.grammar' // Importing this so changes cause it to retest! 
- describe('Array destructuring', () => { test('parses array pattern with two variables', () => { expect('[ a b ] = [ 1 2 3 4]').toMatchTree(` diff --git a/src/parser/tests/exceptions.test.ts b/src/parser/tests/exceptions.test.ts index 8b18ced..ae88c2a 100644 --- a/src/parser/tests/exceptions.test.ts +++ b/src/parser/tests/exceptions.test.ts @@ -1,7 +1,5 @@ import { expect, describe, test } from 'bun:test' -import '../shrimp.grammar' // Importing this so changes cause it to retest! - describe('try/catch/finally/throw', () => { test('parses try with catch', () => { expect(`try: diff --git a/src/parser/tests/function-blocks.test.ts b/src/parser/tests/function-blocks.test.ts index 80805a9..c70ac2c 100644 --- a/src/parser/tests/function-blocks.test.ts +++ b/src/parser/tests/function-blocks.test.ts @@ -1,7 +1,5 @@ import { expect, describe, test } from 'bun:test' -import '../shrimp.grammar' // Importing this so changes cause it to retest! - describe('single line function blocks', () => { test('work with no args', () => { expect(`trap: echo bye bye end`).toMatchTree(` @@ -91,8 +89,8 @@ describe('single line function blocks', () => { describe('multi line function blocks', () => { test('work with no args', () => { expect(` -trap: - echo bye bye +trap: + echo bye bye end `).toMatchTree(` FunctionCallWithBlock @@ -112,8 +110,8 @@ end test('work with one arg', () => { expect(` -trap EXIT: - echo bye bye +trap EXIT: + echo bye bye end`).toMatchTree(` FunctionCallWithBlock FunctionCall @@ -135,7 +133,7 @@ end`).toMatchTree(` test('work with named args', () => { expect(` attach signal='exit' code=1: - echo bye bye + echo bye bye end`).toMatchTree(` FunctionCallWithBlock FunctionCall @@ -163,8 +161,8 @@ end`).toMatchTree(` test('work with dot-get', () => { expect(` signals = [=] -signals.trap 'EXIT': - echo bye bye +signals.trap 'EXIT': + echo bye bye end`).toMatchTree(` Assign AssignableIdentifier signals diff --git a/src/parser/tests/functions.test.ts 
b/src/parser/tests/functions.test.ts index 6312529..1d98721 100644 --- a/src/parser/tests/functions.test.ts +++ b/src/parser/tests/functions.test.ts @@ -1,7 +1,5 @@ import { expect, describe, test } from 'bun:test' -import '../shrimp.grammar' // Importing this so changes cause it to retest! - describe('calling functions', () => { test('call with no args', () => { expect('tail').toMatchTree(` diff --git a/src/parser/tests/import.test.ts b/src/parser/tests/import.test.ts index ec63061..71537e3 100644 --- a/src/parser/tests/import.test.ts +++ b/src/parser/tests/import.test.ts @@ -1,7 +1,5 @@ import { expect, describe, test } from 'bun:test' -import '../shrimp.grammar' // Importing this so changes cause it to retest! - describe('import', () => { test('parses single import', () => { expect(`import str`).toMatchTree(` diff --git a/src/parser/tests/literals.test.ts b/src/parser/tests/literals.test.ts index 44e2794..ef10820 100644 --- a/src/parser/tests/literals.test.ts +++ b/src/parser/tests/literals.test.ts @@ -1,7 +1,5 @@ import { expect, describe, test } from 'bun:test' -import '../shrimp.grammar' // Importing this so changes cause it to retest! - describe('number literals', () => { test('binary numbers', () => { expect('0b110').toMatchTree(` diff --git a/src/parser/tests/multiline.test.ts b/src/parser/tests/multiline.test.ts index 9362181..e3842f6 100644 --- a/src/parser/tests/multiline.test.ts +++ b/src/parser/tests/multiline.test.ts @@ -1,7 +1,5 @@ import { expect, describe, test } from 'bun:test' -import '../shrimp.grammar' // Importing this so changes cause it to retest! 
- describe('multiline', () => { test('parses multiline strings', () => { expect(`'first'\n'second'`).toMatchTree(` diff --git a/src/parser/tests/pipes.test.ts b/src/parser/tests/pipes.test.ts index 44ba028..b281381 100644 --- a/src/parser/tests/pipes.test.ts +++ b/src/parser/tests/pipes.test.ts @@ -1,7 +1,4 @@ import { expect, describe, test } from 'bun:test' -import { parser } from '../shrimp' - -import '../shrimp.grammar' // Importing this so changes cause it to retest! describe('pipe expressions', () => { test('simple pipe expression', () => { diff --git a/src/parser/tests/strings.test.ts b/src/parser/tests/strings.test.ts index 01fd0ac..c8d95b6 100644 --- a/src/parser/tests/strings.test.ts +++ b/src/parser/tests/strings.test.ts @@ -1,7 +1,5 @@ import { expect, describe, test } from 'bun:test' -import '../shrimp.grammar' // Importing this so changes cause it to retest! - describe('string interpolation', () => { test('string with variable interpolation', () => { expect("'hello $name'").toMatchTree(` @@ -178,4 +176,4 @@ describe('double quoted strings', () => { String DoubleQuote "hello $(1 + 2)"`) }) -}) \ No newline at end of file +}) diff --git a/src/parser/tokenizer.ts b/src/parser/tokenizer.ts deleted file mode 100644 index cbecdb7..0000000 --- a/src/parser/tokenizer.ts +++ /dev/null @@ -1,389 +0,0 @@ -import { ExternalTokenizer, InputStream, Stack } from '@lezer/lr' -import { Identifier, AssignableIdentifier, Word, IdentifierBeforeDot, Do, CurlyString, DotGet, newline, pipeStartsLine } from './shrimp.terms' - -// doobie doobie do (we need the `do` keyword to know when we're defining params) -export function specializeKeyword(ident: string) { - return ident === 'do' ? Do : -1 -} - -// tell the dotGet searcher about builtin globals -export const globals: string[] = [] -export const setGlobals = (newGlobals: string[] | Record) => { - globals.length = 0 - globals.push(...(Array.isArray(newGlobals) ? 
newGlobals : Object.keys(newGlobals))) -} - -// The only chars that can't be words are whitespace, apostrophes, closing parens, and EOF. - -export const tokenizer = new ExternalTokenizer( - (input: InputStream, stack: Stack) => { - const ch = getFullCodePoint(input, 0) - - // Handle curly strings - if (ch === 123 /* { */) return consumeCurlyString(input, stack) - - if (!isWordChar(ch)) return - - // Don't consume things that start with digits - let Number token handle it - if (isDigit(ch)) return - - // Don't consume things that start with - or + followed by a digit (negative/positive numbers) - if ((ch === 45 /* - */ || ch === 43) /* + */ && isDigit(input.peek(1))) return - - const isValidStart = isIdentStart(ch) - const canBeWord = stack.canShift(Word) - - // Consume all word characters, tracking if it remains a valid identifier - const { pos, isValidIdentifier, stoppedAtDot } = consumeWordToken( - input, - isValidStart, - canBeWord - ) - - // Check if we should emit IdentifierBeforeDot for property access - if (stoppedAtDot) { - const dotGetToken = checkForDotGet(input, stack, pos) - - if (dotGetToken) { - input.advance(pos) - input.acceptToken(dotGetToken) - } else { - // Not in scope - continue consuming the dot as part of the word - const afterDot = consumeRestOfWord(input, pos + 1, canBeWord) - input.advance(afterDot) - input.acceptToken(Word) - } - - return - } - - // Advance past the token we consumed - input.advance(pos) - - // Choose which token to emit - if (isValidIdentifier) { - const token = chooseIdentifierToken(input, stack) - input.acceptToken(token) - } else { - input.acceptToken(Word) - } - }, - { contextual: true } -) - -// Build identifier text from input stream, handling surrogate pairs for emoji -const buildIdentifierText = (input: InputStream, length: number): string => { - let text = '' - for (let i = 0; i < length; i++) { - const charCode = input.peek(i) - if (charCode === -1) break - - // Handle surrogate pairs for emoji (UTF-16 
encoding) - if (charCode >= 0xd800 && charCode <= 0xdbff && i + 1 < length) { - const low = input.peek(i + 1) - if (low >= 0xdc00 && low <= 0xdfff) { - text += String.fromCharCode(charCode, low) - i++ // Skip the low surrogate - continue - } - } - text += String.fromCharCode(charCode) - } - return text -} - -// Consume word characters, tracking if it remains a valid identifier -// Returns the position after consuming, whether it's a valid identifier, and if we stopped at a dot -const consumeWordToken = ( - input: InputStream, - isValidStart: boolean, - canBeWord: boolean -): { pos: number; isValidIdentifier: boolean; stoppedAtDot: boolean } => { - let pos = getCharSize(getFullCodePoint(input, 0)) - let isValidIdentifier = isValidStart - let stoppedAtDot = false - - while (true) { - const ch = getFullCodePoint(input, pos) - - // Stop at dot if we have a valid identifier (might be property access) - if (ch === 46 /* . */ && isValidIdentifier) { - stoppedAtDot = true - break - } - - // Stop if we hit a non-word character - if (!isWordChar(ch)) break - - // Context-aware termination: semicolon/colon can end a word if followed by whitespace - // This allows `hello; 2` to parse correctly while `hello;world` stays as one word - if (canBeWord && (ch === 59 /* ; */ || ch === 58) /* : */) { - const nextCh = getFullCodePoint(input, pos + 1) - if (!isWordChar(nextCh)) break - } - - // Track identifier validity: must be lowercase, digit, dash, or emoji/unicode - if (!isIdentChar(ch)) { - if (!canBeWord) break - isValidIdentifier = false - } - - pos += getCharSize(ch) - } - - return { pos, isValidIdentifier, stoppedAtDot } -} - -// Consume the rest of a word after we've decided not to treat a dot as DotGet -// Used when we have "file.txt" - we already consumed "file", now consume ".txt" -const consumeRestOfWord = (input: InputStream, startPos: number, canBeWord: boolean): number => { - let pos = startPos - while (true) { - const ch = getFullCodePoint(input, pos) - - // Stop if 
we hit a non-word character - if (!isWordChar(ch)) break - - // Context-aware termination for semicolon/colon - if (canBeWord && (ch === 59 /* ; */ || ch === 58) /* : */) { - const nextCh = getFullCodePoint(input, pos + 1) - if (!isWordChar(nextCh)) break - } - - pos += getCharSize(ch) - } - return pos -} - -// Consumes { curly strings } and tracks braces so you can { have { braces { inside { braces } } } -const consumeCurlyString = (input: InputStream, stack: Stack) => { - if (!stack.canShift(CurlyString)) return - - let depth = 0 - let pos = 0 - - while (true) { - const ch = input.peek(pos) - if (ch < 0) return // EOF - invalid - - if (ch === 123) depth++ // { - else if (ch === 125) { // } - depth-- - if (depth === 0) { - pos++ // consume final } - break - } - } - - pos++ - } - - input.acceptToken(CurlyString, pos) -} - -// Check if this identifier is in scope (for property access detection) -// Returns IdentifierBeforeDot token if in scope, null otherwise -const checkForDotGet = (input: InputStream, stack: Stack, pos: number): number | null => { - const identifierText = buildIdentifierText(input, pos) - const context = stack.context as { scope: { has(name: string): boolean } } | undefined - - // Check if identifier is in scope (lexical scope or globals) - const inScope = context?.scope.has(identifierText) || globals.includes(identifierText) - - // property access - if (inScope) return IdentifierBeforeDot - - // Not in scope - check if we're inside a DotGet chain - // Inside the @skip {} block where DotGet is defined, Word cannot be shifted - // but Identifier can be. This tells us we're at the RHS of a DotGet. - const canShiftIdentifier = stack.canShift(Identifier) - const canShiftWord = stack.canShift(Word) - const inDotGetChain = canShiftIdentifier && !canShiftWord - - // continue if we're inside a DotGet - return inDotGetChain ? 
IdentifierBeforeDot : null -} - -// Decide between AssignableIdentifier and Identifier using grammar state + peek-ahead -const chooseIdentifierToken = (input: InputStream, stack: Stack): number => { - const canAssignable = stack.canShift(AssignableIdentifier) - const canRegular = stack.canShift(Identifier) - - // Only one option is valid - use it - if (canAssignable && !canRegular) return AssignableIdentifier - if (canRegular && !canAssignable) return Identifier - - // Both possible (ambiguous context) - peek ahead for '=' to disambiguate - // This happens at statement start where both `x = 5` (assign) and `echo x` (call) are valid - let peekPos = 0 - while (true) { - const ch = getFullCodePoint(input, peekPos) - if (isWhiteSpace(ch)) { - peekPos += getCharSize(ch) - } else { - break - } - } - - const nextCh = getFullCodePoint(input, peekPos) - const nextCh2 = getFullCodePoint(input, peekPos + 1) - const nextCh3 = getFullCodePoint(input, peekPos + 2) - - // Check for ??= (three-character compound operator) - if (nextCh === 63 /* ? */ && nextCh2 === 63 /* ? 
*/ && nextCh3 === 61 /* = */) { - const charAfterOp = getFullCodePoint(input, peekPos + 3) - if (isWhiteSpace(charAfterOp) || charAfterOp === -1 /* EOF */) { - return AssignableIdentifier - } - } - - // Check for compound assignment operators: +=, -=, *=, /=, %= - if ( - [43 /* + */, 45 /* - */, 42 /* * */, 47 /* / */, 37 /* % */].includes(nextCh) && - nextCh2 === 61 /* = */ - ) { - // Found compound operator, check if it's followed by whitespace - const charAfterOp = getFullCodePoint(input, peekPos + 2) - if (isWhiteSpace(charAfterOp) || charAfterOp === -1 /* EOF */) { - return AssignableIdentifier - } - } - - if (nextCh === 61 /* = */) { - // Found '=', but check if it's followed by whitespace - // If '=' is followed by non-whitespace (like '=cool*'), it won't be tokenized as Eq - // In that case, this should be Identifier (for function call), not AssignableIdentifier - const charAfterEquals = getFullCodePoint(input, peekPos + 1) - if (isWhiteSpace(charAfterEquals) || charAfterEquals === -1 /* EOF */) { - return AssignableIdentifier - } - } - return Identifier -} - -// Character classification helpers -export const isIdentStart = (ch: number): boolean => { - return isLowercaseLetter(ch) || isEmojiOrUnicode(ch) -} - -export const isIdentChar = (ch: number): boolean => { - return isLowercaseLetter(ch) || isDigit(ch) || ch === 45 /* - */ || ch === 63 /* ? 
*/ || isEmojiOrUnicode(ch) -} - -const isWhiteSpace = (ch: number): boolean => { - return ch === 32 /* space */ || ch === 9 /* tab */ || ch === 13 /* \r */ -} - -const isWordChar = (ch: number): boolean => { - return ( - !isWhiteSpace(ch) && - ch !== 10 /* \n */ && - ch !== 41 /* ) */ && - ch !== 93 /* ] */ && - ch !== -1 /* EOF */ - ) -} - -const isLowercaseLetter = (ch: number): boolean => { - return ch >= 97 && ch <= 122 // a-z -} - -const isDigit = (ch: number): boolean => { - return ch >= 48 && ch <= 57 // 0-9 -} - -const getFullCodePoint = (input: InputStream, pos: number): number => { - const ch = input.peek(pos) - - // Check if this is a high surrogate (0xD800-0xDBFF) - if (ch >= 0xd800 && ch <= 0xdbff) { - const low = input.peek(pos + 1) - // Check if next is low surrogate (0xDC00-0xDFFF) - if (low >= 0xdc00 && low <= 0xdfff) { - // Combine surrogate pair into full code point - return 0x10000 + ((ch & 0x3ff) << 10) + (low & 0x3ff) - } - } - - return ch -} - -const isEmojiOrUnicode = (ch: number): boolean => { - return ( - // Basic Emoticons - (ch >= 0x1f600 && ch <= 0x1f64f) || - // Miscellaneous Symbols and Pictographs - (ch >= 0x1f300 && ch <= 0x1f5ff) || - // Transport and Map Symbols - (ch >= 0x1f680 && ch <= 0x1f6ff) || - // Regional Indicator Symbols (flags) - (ch >= 0x1f1e6 && ch <= 0x1f1ff) || - // Miscellaneous Symbols (hearts, stars, weather) - (ch >= 0x2600 && ch <= 0x26ff) || - // Dingbats (scissors, pencils, etc) - (ch >= 0x2700 && ch <= 0x27bf) || - // Supplemental Symbols and Pictographs (newer emojis) - (ch >= 0x1f900 && ch <= 0x1f9ff) || - // Symbols and Pictographs Extended-A (newest emojis) - (ch >= 0x1fa70 && ch <= 0x1faff) || - // Various Asian Characters with emoji presentation - (ch >= 0x1f018 && ch <= 0x1f270) || - // Variation Selectors (for emoji presentation) - (ch >= 0xfe00 && ch <= 0xfe0f) || - // Additional miscellaneous items - (ch >= 0x238c && ch <= 0x2454) || - // Combining Diacritical Marks for Symbols - (ch >= 0x20d0 && 
ch <= 0x20ff) || - // Latin-1 Supplement (includes ², ³, ¹ and other special chars) - (ch >= 0x00a0 && ch <= 0x00ff) || - // Greek and Coptic (U+0370-U+03FF) - (ch >= 0x0370 && ch <= 0x03ff) || - // Mathematical Alphanumeric Symbols (U+1D400-U+1D7FF) - (ch >= 0x1d400 && ch <= 0x1d7ff) || - // Mathematical Operators (U+2200-U+22FF) - (ch >= 0x2200 && ch <= 0x22ff) || - // Superscripts and Subscripts (U+2070-U+209F) - (ch >= 0x2070 && ch <= 0x209f) || - // Arrows (U+2190-U+21FF) - (ch >= 0x2190 && ch <= 0x21ff) || - // Hiragana (U+3040-U+309F) - (ch >= 0x3040 && ch <= 0x309f) || - // Katakana (U+30A0-U+30FF) - (ch >= 0x30a0 && ch <= 0x30ff) || - // CJK Unified Ideographs (U+4E00-U+9FFF) - (ch >= 0x4e00 && ch <= 0x9fff) - ) -} - -const getCharSize = (ch: number) => (ch > 0xffff ? 2 : 1) // emoji takes 2 UTF-16 code units - -export const pipeStartsLineTokenizer = new ExternalTokenizer((input: InputStream, stack: Stack) => { - const ch = input.peek(0) - - if (ch !== 10 /* \n */) return - - // ignore whitespace - let offset = 1 - let lastNewlineOffset = 0 - - while (true) { - const ch = input.peek(offset) - if (ch === 10 /* \n */) { - lastNewlineOffset = offset - offset++ - } else if (isWhiteSpace(ch)) { - offset++ - } else { - break - } - } - - // look for pipe after skipping empty lines - if (input.peek(offset) === 124 /* | */) { - input.advance(lastNewlineOffset + 1) - input.acceptToken(pipeStartsLine) - } else { - input.advance(1) - input.acceptToken(newline) - } -}) diff --git a/src/parser/tokenizer2.ts b/src/parser/tokenizer2.ts index fd51077..4619c55 100644 --- a/src/parser/tokenizer2.ts +++ b/src/parser/tokenizer2.ts @@ -475,12 +475,12 @@ const isStringDelim = (ch: number): boolean => { return ch === c`'` || ch === c`"` } -const isIdentStart = (char: number | string): boolean => { +export const isIdentStart = (char: number | string): boolean => { let ch = typeof char === 'string' ? 
char.charCodeAt(0) : char return isLowercaseLetter(ch) || isEmojiOrUnicode(ch) || ch === 36 /* $ */ } -const isIdentChar = (char: number | string): boolean => { +export const isIdentChar = (char: number | string): boolean => { let ch = typeof char === 'string' ? char.charCodeAt(0) : char return isIdentStart(ch) || isDigit(ch) || ch === 45 /* - */ || ch === 63 /* ? */ } diff --git a/src/testSetup.ts b/src/testSetup.ts index 739c922..c76471d 100644 --- a/src/testSetup.ts +++ b/src/testSetup.ts @@ -2,36 +2,13 @@ import { expect } from 'bun:test' import { diffLines } from 'diff' import color from 'kleur' import { Scanner, TokenType, type Token } from '#parser/tokenizer2' -import { parser } from '#parser/shrimp' -import { setGlobals } from '#parser/tokenizer' -import { parse } from '#parser/parser2' +import { parse, setGlobals } from '#parser/parser2' +import { Tree } from '#parser/node' import { globals as prelude } from '#prelude' -import { $ } from 'bun' import { assert, errorMessage } from '#utils/utils' import { Compiler } from '#compiler/compiler' import { run, VM } from 'reefvm' -import { treeToString2, treeToString, VMResultToValue } from '#utils/tree' - -const regenerateParser = async () => { - let generate = true - try { - const grammarStat = await Bun.file('./src/parser/shrimp.grammar').stat() - const tokenizerStat = await Bun.file('./src/parser/tokenizer.ts').stat() - const parserStat = await Bun.file('./src/parser/shrimp.ts').stat() - - if (grammarStat.mtime <= parserStat.mtime && tokenizerStat.mtime <= parserStat.mtime) { - generate = false - } - } catch (e) { - console.error('Error checking or regenerating parser:', e) - } finally { - if (generate) { - await $`bun generate-parser` - } - } -} - -await regenerateParser() +import { treeToString2, VMResultToValue } from '#utils/tree' // Type declaration for TypeScript declare module 'bun:test' { @@ -73,7 +50,8 @@ expect.extend({ assert(typeof received === 'string', 'toFailParse can only be used with string 
values') try { - const tree = parser.parse(received) + const node = parse(received) + const tree = new Tree(node) let hasErrors = false tree.iterate({ enter(n) { @@ -90,7 +68,7 @@ expect.extend({ pass: true, } } else { - const actual = treeToString(tree, received) + const actual = treeToString2(node, received) return { message: () => `Expected input to fail parsing, but it parsed successfully:\n${actual}`, pass: false,