// shrimp/src/parser/tests/tokens.test.ts

import { expect, describe, test } from 'bun:test'

// Literal constants: `null` and the two boolean literals.
describe('constant types', () => {
  test('null', () => {
    const source = 'null'
    expect(source).toBeToken('Null')
  })

  test('boolean', () => {
    // Each boolean literal is expected as a Boolean token carrying its own text.
    for (const literal of ['true', 'false']) {
      expect(literal).toMatchToken('Boolean', literal)
    }
  })
})

// Numeric literals: decimal, signed, underscore-separated and 0b/0x/0o-prefixed forms.
describe('numbers', () => {
  // Expects `source` to match a 'Number' token whose value is the source text itself.
  const expectNumber = (source: string) => {
    expect(source).toMatchToken('Number', source)
  }

  test('non-numbers', () => {
    // Digit-led text that is expected back as a Word rather than a Number.
    for (const source of ['1st', '1_']) {
      expect(source).toMatchToken('Word', source)
    }

    // A trailing dot is expected as a separate Operator token after the Number.
    expect('100.').toMatchTokens(
      { type: 'Number', value: '100' },
      { type: 'Operator', value: '.' },
    )
  })

  test('simple numbers', () => {
    const decimals = [
      '1',
      '200',
      '5.20',
      '0.20',
      '-20',
      '+20',
      '-2134.34',
      '+20.5325',
      '1_000',
      '53_232_220',
    ]
    for (const source of decimals) {
      expectNumber(source)
    }
  })

  // One test is registered per entry (title -> inputs), in the listed order.
  const numberCases: Array<[string, string[]]> = [
    ['binary numbers', ['0b110']],
    ['hex numbers', ['0xdeadbeef', '0x02d3f4']],
    ['hex numbers uppercase', ['0xFF']],
    ['octal numbers', ['0o644', '0o055']],
    ['negative binary', ['-0b110']],
    ['negative hex', ['-0xFF']],
    ['negative octal', ['-0o755']],
    ['positive prefix binary', ['+0b110']],
    ['positive prefix hex', ['+0xFF']],
    ['positive prefix octal', ['+0o644']],
    ['underscores in number', ['1_000', '1_0', '0b11_0', '0xdead_beef', '0o64_4']],
  ]

  for (const [title, sources] of numberCases) {
    test(title, () => {
      for (const source of sources) {
        expectNumber(source)
      }
    })
  }
})

// Identifier recognition, including emoji, `$`, hyphens and `?`.
describe('identifiers', () => {
  // Expects `source` to match an 'Identifier' token whose value is the source text itself.
  const expectIdentifier = (source: string) => {
    expect(source).toMatchToken('Identifier', source)
  }

  // Runs expectIdentifier over every entry, in order.
  const expectIdentifiers = (sources: string[]) => {
    for (const source of sources) {
      expectIdentifier(source)
    }
  }

  test('regular', () => {
    for (const source of ['name', 'bobby-mcgee', 'starts-with?']) {
      expect(source).toBeToken('Identifier')
    }

    expectIdentifier('📢')
    // Leading/trailing spaces in the input are not part of the expected value.
    expect(' 📢 ').toMatchToken('Identifier', '📢')
    expect(' oink-🐷-oink').toMatchToken('Identifier', 'oink-🐷-oink')
    expectIdentifiers(['$', '$cool'])
  })

  test('one character identifiers', () => {
    expectIdentifiers(['a', 'z', '$', '📢'])
    // A lone `?` is expected as a Word: it is not a valid identifier start.
    expect('?').toBeToken('Word')
  })

  test('two character identifiers', () => {
    // `?` is accepted at the end of an identifier.
    expectIdentifiers(['ab', 'a1', 'a-', 'a?', 'ab?'])
  })

  test('three+ character identifiers', () => {
    // `?` is accepted at the end of an identifier.
    expectIdentifiers(['abc', 'a-b', 'a1b', 'abc?', 'a-b-c?'])
  })

  test('edge cases', () => {
    expect('-bobby-mcgee').toBeToken('Word')
    expectIdentifiers(['starts-with??', 'starts?with?', 'a??b'])
    expect('oink-oink!').toBeToken('Word')

    for (const source of ['dog#pound', 'http://website.com']) {
      expect(source).toMatchToken('Word', source)
    }

    expectIdentifier('school$cool')

    const exitTokens = [{ type: 'Word', value: 'EXIT' }, { type: 'Colon' }]
    expect('EXIT:').toMatchTokens(...exitTokens)

    const conditionalTokens = [
      { type: 'Keyword', value: 'if' },
      { type: 'Identifier', value: 'y' },
      { type: 'Operator', value: '==' },
      { type: 'Number', value: '1' },
      { type: 'Colon' },
      { type: 'String', value: `'cool'` },
      { type: 'Keyword', value: 'end' },
    ]
    expect(`if y == 1: 'cool' end`).toMatchTokens(...conditionalTokens)
  })
})

// Path-like input. The cases below pin which inputs come back as a single Word
// and which are split around the `.` operator.
describe('paths', () => {
  test('starting with ./', () => {
    expect('./tmp').toMatchToken('Word', './tmp')
  })

  test('starting with /', () => {
    expect('/home/chris/dev').toMatchToken('Word', '/home/chris/dev')
  })

  test('identifiers with dots tokenize separately', () => {
    expect('readme.txt').toMatchTokens(
      { type: 'Identifier', value: 'readme' },
      { type: 'Operator', value: '.' },
      { type: 'Identifier', value: 'txt' },
    )
  })

  test('words (non-identifiers) consume dots', () => {
    expect('README.md').toMatchToken('Word', 'README.md')
  })

  test('all sorts of weird stuff', () => {
    expect('dog#pound').toMatchToken('Word', 'dog#pound')
    // Pass the token type explicitly, like the neighbouring assertions, so the
    // type is pinned as well as the value.
    expect('my/kinda/place').toMatchToken('Word', 'my/kinda/place')
    expect('file://%/$##/@40!/index.php').toMatchToken('Word', 'file://%/$##/@40!/index.php')
  })
})

// Quoted string literals. In every case the expected token value is the raw
// source text, including the surrounding quotes and any backslashes.
describe('strings', () => {
  test('single quoted', () => {
    expect(`'hello world'`).toMatchToken('String', `'hello world'`)
    // Pass the token type explicitly, like the neighbouring assertions, so the
    // type is pinned as well as the value.
    expect(`'it\\'s a beautiful world'`).toMatchToken('String', "'it\\'s a beautiful world'")
  })

  test('double quoted', () => {
    expect(`"hello world"`).toMatchToken('String', `"hello world"`)
    expect(`"it's a beautiful world"`).toMatchToken('String', `"it's a beautiful world"`)
  })

  test('empty strings', () => {
    expect(`''`).toMatchToken('String', `''`)
    expect(`""`).toMatchToken('String', `""`)
  })

  test('escape sequences', () => {
    // `\\` in these literals puts one backslash into the tokenizer input; the
    // expected value keeps the backslash sequence verbatim.
    expect(`'hello\\nworld'`).toMatchToken('String', `'hello\\nworld'`)
    expect(`'tab\\there'`).toMatchToken('String', `'tab\\there'`)
    expect(`'quote\\''`).toMatchToken('String', `'quote\\''`)
    expect(`'backslash\\\\'`).toMatchToken('String', `'backslash\\\\'`)
    expect(`'dollar\\$sign'`).toMatchToken('String', `'dollar\\$sign'`)
  })

  test('unclosed strings - error case', () => {
    // Pins the behavior asserted here: input with an opening quote and no
    // closing quote is still expected as a String token holding the whole input.
    expect(`'hello`).toMatchToken('String', `'hello`)
    expect(`"world`).toMatchToken('String', `"world`)
  })
})

// Curly-brace strings: the expected String token value is the full source text, braces included.
describe('curly strings', () => {
  // Expects `source` to match a 'String' token whose value is the source text itself.
  const expectCurlyString = (source: string) => {
    expect(source).toMatchToken('String', source)
  }

  test('curly quoted', () => {
    expectCurlyString('{ one two three }')
  })

  test('work on multiple lines', () => {
    const source = `{
      one
      two
      three }`
    expectCurlyString(source)
  })

  test('can contain other curlies', () => {
    const source = `{ { one }
      two
      { three } }`
    expectCurlyString(source)
  })

  test('empty curly string', () => {
    expectCurlyString('{}')
  })

  test('unclosed curly string - error case', () => {
    // Inputs with an unmatched opening brace are still expected as one String token.
    for (const source of ['{ hello', '{ nested { unclosed }']) {
      expectCurlyString(source)
    }
  })
})

// Operators: each symbol or operator word is expected as an Operator token carrying its own text.
describe('operators', () => {
  test('math operators', () => {
    // Grouped by category; groups and entries are checked in the listed order.
    const operatorGroups: string[][] = [
      // assignment
      ['='],
      // logic
      ['or', 'and'],
      // bitwise
      ['band', 'bor', 'bxor', '>>>', '>>', '<<'],
      // compound assignment
      ['??=', '+=', '-=', '*=', '/=', '%='],
      // nullish
      ['??'],
      // math
      ['**', '*', '/', '+', '-', '%'],
      // comparison
      ['>=', '<=', '!=', '==', '>', '<'],
      // property access
      ['.'],
    ]

    for (const group of operatorGroups) {
      for (const operator of group) {
        expect(operator).toMatchToken('Operator', operator)
      }
    }
  })
})

// Reserved words: each is expected as a Keyword token carrying its own text.
describe('keywords', () => {
  test('keywords', () => {
    const reservedWords = [
      'import',
      'end',
      'do',
      'while',
      'if',
      'else',
      'try',
      'catch',
      'finally',
      'throw',
    ]

    for (const word of reservedWords) {
      expect(word).toMatchToken('Keyword', word)
    }
  })
})

// Regex literals are written between double slashes.
describe('regex', () => {
  test('use double slash', () => {
    const source = '//[0-9]+//'
    expect(source).toMatchToken('Regex', source)
  })
})

// Single-character punctuation tokens.
describe('punctuation', () => {
  test('underscore', () => {
    expect('_').toBeToken('Underscore')

    // Two underscores are expected as a Word, not an Underscore.
    const doubled = '__'
    expect(doubled).toMatchToken('Word', doubled)
  })

  // One test is registered per entry: [title, input, expected token type].
  const punctuationCases: Array<[string, string, string]> = [
    ['semicolon', ';', 'Semicolon'],
    ['newline', '\n', 'Newline'],
    ['colon', ':', 'Colon'],
  ]

  for (const [title, source, tokenType] of punctuationCases) {
    test(title, () => {
      expect(source).toBeToken(tokenType)
    })
  }
})

// `#` comments: the expected Comment token value is the full source text, `#` included.
describe('comments', () => {
  test('comments', () => {
    for (const source of ['# hey friends', '#hey-friends']) {
      expect(source).toMatchToken('Comment', source)
    }
  })
})

// Opening and closing bracket tokens.
describe('brackets', () => {
  // One test is registered per entry: [title, [input, expected token type] pairs].
  const bracketCases: Array<[string, Array<[string, string]>]> = [
    [
      'parens',
      [
        ['(', 'OpenParen'],
        [')', 'CloseParen'],
      ],
    ],
    [
      'staples',
      [
        ['[', 'OpenBracket'],
        [']', 'CloseBracket'],
      ],
    ],
  ]

  for (const [title, checks] of bracketCases) {
    test(title, () => {
      for (const [source, tokenType] of checks) {
        expect(source).toBeToken(tokenType)
      }
    })
  }
})

// Token sequences: each case feeds a multi-token source and lists the expected tokens in order.
describe('multiple tokens', () => {
  // Shape of the expected-token literals handed to toMatchTokens in this suite.
  type ExpectedToken = { type: string; value?: string }

  // Builder for tokens matched by type only (no `value` key is emitted).
  const bare = (type: string) => (): ExpectedToken => ({ type })
  // Builder for tokens matched by type and value.
  const valued = (type: string) => (value: string): ExpectedToken => ({ type, value })

  const id = valued('Identifier')
  const num = valued('Number')
  const op = valued('Operator')
  const str = valued('String')
  const kw = valued('Keyword')
  const bool = valued('Boolean')
  const word = valued('Word')
  const comment = valued('Comment')

  const nul = bare('Null')
  const newline = bare('Newline')
  const colon = bare('Colon')
  const semicolon = bare('Semicolon')
  const openParen = bare('OpenParen')
  const closeParen = bare('CloseParen')
  const openBracket = bare('OpenBracket')
  const closeBracket = bare('CloseBracket')

  test('constants work fine', () => {
    expect('null true false').toMatchTokens(nul(), bool('true'), bool('false'))
  })

  test('numbers', () => {
    expect('100 -400.42 null').toMatchTokens(num('100'), num('-400.42'), nul())
  })

  test('whitespace', () => {
    // Outside brackets every line break is expected as its own Newline token.
    expect(`
      'hello world'

      'goodbye world'
    `).toMatchTokens(
      newline(),
      str("'hello world'"),
      newline(),
      newline(),
      str("'goodbye world'"),
      newline(),
    )
  })

  test('newline in parens is ignored', () => {
    expect(`(
      'hello world'

      'goodbye world'
     )`).toMatchTokens(
      openParen(),
      str("'hello world'"),
      str("'goodbye world'"),
      closeParen(),
    )
  })

  test('newline in brackets is ignored', () => {
    expect(`[
      a b
c d

e

f

    ]`).toMatchTokens(
      openBracket(),
      id('a'),
      id('b'),
      id('c'),
      id('d'),
      id('e'),
      id('f'),
      closeBracket(),
    )
  })

  test('function call', () => {
    expect('echo hello world').toMatchTokens(id('echo'), id('hello'), id('world'))
  })

  test('assignment', () => {
    expect('x = 5').toMatchTokens(id('x'), op('='), num('5'))
  })

  test('math expression', () => {
    expect('1 + 2 * 3').toMatchTokens(num('1'), op('+'), num('2'), op('*'), num('3'))
  })

  test('inline comment', () => {
    expect('x = 5 # set x').toMatchTokens(id('x'), op('='), num('5'), comment('# set x'))
  })

  test('line comment', () => {
    expect('x = 5 \n# hello\n set x').toMatchTokens(
      id('x'),
      op('='),
      num('5'),
      newline(),
      comment('# hello'),
      newline(),
      id('set'),
      id('x'),
    )
  })

  test('colons separate tokens', () => {
    expect('x do: y').toMatchTokens(id('x'), kw('do'), colon(), id('y'))

    expect('x: y').toMatchTokens(id('x'), colon(), id('y'))

    expect('5: y').toMatchTokens(num('5'), colon(), id('y'))

    expect(`if (var? 'abc'): y`).toMatchTokens(
      kw('if'),
      openParen(),
      id('var?'),
      str(`'abc'`),
      closeParen(),
      colon(),
      id('y'),
    )

    expect(`
do x:
  y
end`).toMatchTokens(
      newline(),
      kw('do'),
      id('x'),
      colon(),
      newline(),
      id('y'),
      newline(),
      kw('end'),
    )
  })

  test('semicolons separate statements', () => {
    expect('x; y').toMatchTokens(id('x'), semicolon(), id('y'))
  })

  test('semicolons in parens', () => {
    expect('(x; y)').toMatchTokens(openParen(), id('x'), semicolon(), id('y'), closeParen())
  })

  test('dot operator beginning word with slash', () => {
    expect('(basename ./cool)').toMatchTokens(
      openParen(),
      id('basename'),
      word('./cool'),
      closeParen(),
    )
  })

  test('dot word after identifier with space', () => {
    expect('expand-path .git').toMatchTokens(id('expand-path'), word('.git'))
  })

  test('dot operator after identifier without space', () => {
    expect('config.path').toMatchTokens(id('config'), op('.'), id('path'))
  })
})

// Nested bracket pairs are expected as individual open/close tokens, in source order.
describe('nesting edge cases', () => {
  test('deeply nested parens', () => {
    const expected = [
      { type: 'OpenParen' },
      { type: 'OpenParen' },
      { type: 'Identifier', value: 'nested' },
      { type: 'CloseParen' },
      { type: 'CloseParen' },
    ]
    expect('((nested))').toMatchTokens(...expected)
  })

  test('mixed nesting', () => {
    const expected = [
      { type: 'OpenParen' },
      { type: 'OpenBracket' },
      { type: 'Identifier', value: 'combo' },
      { type: 'CloseBracket' },
      { type: 'CloseParen' },
    ]
    expect('([combo])').toMatchTokens(...expected)
  })
})

// Number-looking input with invalid digits or structure is expected as a Word carrying the full text.
describe('invalid numbers that should be words', () => {
  // One test is registered per entry (title -> inputs), in the listed order.
  const wordCases: Array<[string, string[]]> = [
    ['invalid binary', ['0b2', '0b123']],
    ['invalid octal', ['0o8', '0o999']],
    ['invalid hex', ['0xGGG', '0xZZZ']],
    ['multiple decimal points', ['1.2.3']],
  ]

  for (const [title, sources] of wordCases) {
    test(title, () => {
      for (const source of sources) {
        expect(source).toMatchToken('Word', source)
      }
    })
  }
})

// Non-ASCII text is expected as an Identifier token carrying the full text.
describe('unicode and emoji', () => {
  // One test is registered per entry (title -> inputs), in the listed order.
  const identifierCases: Array<[string, string[]]> = [
    ['greek letters', ['αβγ', 'delta-δ']],
    ['math symbols', ['∑', '∏']],
    ['CJK characters', ['你好', 'こんにちは']],
  ]

  for (const [title, sources] of identifierCases) {
    test(title, () => {
      for (const source of sources) {
        expect(source).toMatchToken('Identifier', source)
      }
    })
  }
})

// Empty, space-only and tab-only input expects no tokens; newline-only input expects Newline tokens.
describe('empty and whitespace input', () => {
  // Inputs for which an empty token list is expected: [title, input].
  const blankCases: Array<[string, string]> = [
    ['empty string', ''],
    ['only whitespace', '   '],
    ['only tabs', '\t\t\t'],
  ]

  for (const [title, source] of blankCases) {
    test(title, () => {
      expect(source).toMatchTokens()
    })
  }

  test('only newlines', () => {
    // Three line breaks in, three Newline tokens expected.
    const lineBreaks = 3
    const source = '\n'.repeat(lineBreaks)
    const expected = Array.from({ length: lineBreaks }, () => ({ type: 'Newline' }))
    expect(source).toMatchTokens(...expected)
  })
})

// Named arguments: `name=` is expected as a NamedArgPrefix token, followed by the argument's token.
describe('named args', () => {
  // One test is registered per entry: [title, input, expected prefix value].
  const namedArgCases: Array<[string, string, string]> = [
    ["don't need spaces", 'named=arg', 'named='],
    ['can have spaces', 'named= arg', 'named='],
    ['can include numbers', 'named123= arg', 'named123='],
  ]

  for (const [title, source, prefix] of namedArgCases) {
    test(title, () => {
      expect(source).toMatchTokens(
        { type: 'NamedArgPrefix', value: prefix },
        { type: 'Identifier', value: 'arg' },
      )
    })
  }
})

// The `.` operator: when it is expected to split tokens and when it stays inside a Word.
describe('dot operator', () => {
  // Shape of the expected-token literals handed to toMatchTokens in this suite.
  type ExpectedToken = { type: string; value?: string }

  const id = (value: string): ExpectedToken => ({ type: 'Identifier', value })
  const num = (value: string): ExpectedToken => ({ type: 'Number', value })
  const op = (value: string): ExpectedToken => ({ type: 'Operator', value })
  const dot = (): ExpectedToken => op('.')
  const openParen = (): ExpectedToken => ({ type: 'OpenParen' })
  const closeParen = (): ExpectedToken => ({ type: 'CloseParen' })

  test('standalone dot', () => {
    expect('.').toMatchToken('Operator', '.')
  })

  test('dot between identifiers tokenizes as separate tokens', () => {
    expect('config.path').toMatchTokens(id('config'), dot(), id('path'))
  })

  test('dot with number', () => {
    expect('array.0').toMatchTokens(id('array'), dot(), num('0'))
  })

  test('chained dots', () => {
    expect('a.b.c').toMatchTokens(id('a'), dot(), id('b'), dot(), id('c'))
  })

  test('identifier-like paths tokenize separately', () => {
    expect('readme.txt').toMatchTokens(id('readme'), dot(), id('txt'))
  })

  test('word-like paths remain as single token', () => {
    for (const source of ['./file.txt', 'README.TXT']) {
      expect(source).toMatchToken('Word', source)
    }
  })

  test('dot with paren expression', () => {
    expect('obj.(1 + 2)').toMatchTokens(
      id('obj'),
      dot(),
      openParen(),
      num('1'),
      op('+'),
      num('2'),
      closeParen(),
    )
  })

  test('chained dot with paren expression', () => {
    expect('obj.items.(i)').toMatchTokens(
      id('obj'),
      dot(),
      id('items'),
      dot(),
      openParen(),
      id('i'),
      closeParen(),
    )
  })
})