378 lines
7 KiB
Hare
378 lines
7 KiB
Hare
|
// License: MPL-2.0
|
||
|
// (c) 2022 Drew DeVault <sir@cmpwn.com>
|
||
|
use ascii;
|
||
|
use bufio;
|
||
|
use encoding::utf8;
|
||
|
use io;
|
||
|
use os;
|
||
|
use strconv;
|
||
|
use strings;
|
||
|
use memio;
|
||
|
|
||
|
// State carried by a JSON lexer between calls to [[lex]]. Create one with
// [[newlexer]] and release it with [[close]].
export type lexer = struct {
	// Input that runes are read from.
	src: io::handle,
	// Scratch buffer holding the text of the word, number or string that is
	// currently being scanned. It is reset at the start of each scan.
	strbuf: memio::stream,
	// Token pushed back by [[unlex]], or void if there is none.
	un: (token | void),
	// Rune pushed back by unget, or void if there is none.
	rb: (rune | void),
	// Current (line, column) location. The line is incremented and the
	// column reset to zero whenever a newline is consumed.
	loc: (uint, uint),
	// Location recorded when [[lex]] was entered; [[unlex]] restores it.
	prevloc: (uint, uint),
	// Location saved by [[unlex]]; restored once the pushed-back token is
	// handed out again.
	nextloc: (uint, uint),
	// Location before the most recently read rune; unget restores it.
	prevrloc: (uint, uint),
};
|
||
|
|
||
|
// Creates a new JSON lexer which reads from the given handle. Tokens are
// obtained with [[lex]]; once the caller is finished, the lexer should be
// passed to [[close]] to release its internal buffer.
export fn newlexer(src: io::handle) lexer = {
	return lexer {
		src = src,
		strbuf = memio::dynamic(),
		un = void,
		rb = void,
		// Lines are counted from one, columns from zero.
		loc = (1, 0),
		...
	};
};
|
||
|
|
||
|
// Releases the resources held by a JSON lexer. Only the lexer's internal
// string buffer is closed here; the source handle is left untouched.
export fn close(lex: *lexer) void = {
	// Any error from closing the buffer is treated as a fatal assertion.
	io::close(&lex.strbuf)!;
};
|
||
|
|
||
|
// Returns the next token from a JSON lexer, or [[io::EOF]] once the input is
// exhausted. The returned value is borrowed from the lexer and is overwritten
// by subsequent calls.
export fn lex(lex: *lexer) (token | io::EOF | error) = {
	match (lex.un) {
	case let pending: token =>
		// Hand back the token stored by unlex and restore the location
		// that was current when it was pushed back.
		lex.un = void;
		lex.prevloc = lex.loc;
		lex.loc = lex.nextloc;
		return pending;
	case void =>
		lex.prevloc = lex.loc;
	};

	const rn = match (nextrunews(lex)?) {
	case let first: rune =>
		yield first;
	case io::EOF =>
		return io::EOF;
	};

	// Punctuation and strings are recognised from their first rune alone.
	switch (rn) {
	case '[' =>
		return arraystart;
	case ']' =>
		return arrayend;
	case '{' =>
		return objstart;
	case '}' =>
		return objend;
	case ',' =>
		return comma;
	case ':' =>
		return colon;
	case '"' =>
		return scan_str(lex)?;
	case =>
		yield;
	};

	// Numbers start with a minus sign or a digit. The rune is pushed back
	// so that scan_number sees the whole literal.
	if (rn == '-' || ascii::isdigit(rn)) {
		unget(lex, rn);
		return scan_number(lex)?;
	};

	// The only remaining valid tokens are the keyword literals.
	if (ascii::isalpha(rn)) {
		unget(lex, rn);
		const word = scan_word(lex)?;
		switch (word) {
		case "true" =>
			return true;
		case "false" =>
			return false;
		case "null" =>
			return _null;
		case =>
			yield;
		};
	};

	return lex.loc: invalid;
};
|
||
|
|
||
|
// Pushes a token back onto the lexer so that the next call to [[lex]] returns
// it again. At most one token may be pending at a time; calling this while a
// token is already pending aborts the program.
export fn unlex(lex: *lexer, tok: token) void = {
	assert(lex.un is void, "encoding::json::unlex called twice in a row");
	// Remember where the lexer is now so lex can jump back here, then rewind
	// the reported location to where the token began.
	lex.nextloc = lex.loc;
	lex.loc = lex.prevloc;
	lex.un = tok;
};
|
||
|
|
||
|
// Collects consecutive ASCII letters from the input and returns them as a
// string borrowed from the lexer's scratch buffer. The first non-letter rune
// is pushed back; reaching EOF simply ends the word.
fn scan_word(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);

	for (true) {
		match (nextrune(lex)?) {
		case io::EOF =>
			break;
		case let letter: rune =>
			if (ascii::isalpha(letter)) {
				memio::appendrune(&lex.strbuf, letter)!;
				continue;
			};
			unget(lex, letter);
			break;
		};
	};

	return memio::string(&lex.strbuf)!;
};
|
||
|
|
||
|
// States of the number scanner in scan_number.
type numstate = enum {
	// Nothing consumed yet; an optional leading '-' may follow.
	SIGN,
	// Expecting the first digit of the integer part.
	START,
	// The integer part is a single '0'; no further digits may follow.
	ZERO,
	// Inside an integer part that began with a non-zero digit.
	INTEGER,
	// A '.' was consumed; at least one digit must follow.
	FRACSTART,
	// Inside the fractional digits.
	FRACTION,
	// An 'e' or 'E' was consumed; an optional '+' or '-' may follow.
	EXPSIGN,
	// Expecting the first digit of the exponent.
	EXPSTART,
	// Inside the exponent digits.
	EXPONENT,
};
|
||
|
|
||
|
// Scans a JSON number and returns it as an f64 token. The caller must have
// pushed the first rune of the number back with unget. The text is validated
// against the JSON number grammar by a small state machine (see numstate) and
// accumulated in the lexer's scratch buffer before being converted.
//
// Returns invalid if the text does not form a complete JSON number, including
// when the input ends in the middle of one (for example "-", "1." or "1e+").
fn scan_number(lex: *lexer) (token | error) = {
	memio::reset(&lex.strbuf);

	let state = numstate::SIGN;
	for (true) {
		const rn = match (nextrune(lex)?) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			break;
		};

		switch (state) {
		case numstate::SIGN =>
			state = numstate::START;
			if (rn != '-') {
				// No sign: re-read this rune in the START state.
				unget(lex, rn);
				continue;
			};
		case numstate::START =>
			switch (rn) {
			case '0' =>
				state = numstate::ZERO;
			case =>
				if (!ascii::isdigit(rn)) {
					return lex.loc: invalid;
				};
				state = numstate::INTEGER;
			};
		case numstate::ZERO =>
			switch (rn) {
			case '.' =>
				state = numstate::FRACSTART;
			case 'e', 'E' =>
				state = numstate::EXPSIGN;
			case =>
				// Leading zeroes are not permitted in JSON.
				if (ascii::isdigit(rn)) {
					return lex.loc: invalid;
				};
				unget(lex, rn);
				break;
			};
		case numstate::INTEGER =>
			switch (rn) {
			case '.' =>
				state = numstate::FRACSTART;
			case 'e', 'E' =>
				state = numstate::EXPSIGN;
			case =>
				if (!ascii::isdigit(rn)) {
					unget(lex, rn);
					break;
				};
			};
		case numstate::FRACSTART =>
			if (!ascii::isdigit(rn)) {
				return lex.loc: invalid;
			};
			state = numstate::FRACTION;
		case numstate::FRACTION =>
			switch (rn) {
			case 'e', 'E' =>
				state = numstate::EXPSIGN;
			case =>
				if (!ascii::isdigit(rn)) {
					unget(lex, rn);
					break;
				};
			};
		case numstate::EXPSIGN =>
			state = numstate::EXPSTART;
			if (rn != '+' && rn != '-') {
				// No sign: re-read this rune in the EXPSTART state.
				unget(lex, rn);
				continue;
			};
		case numstate::EXPSTART =>
			if (!ascii::isdigit(rn)) {
				return lex.loc: invalid;
			};
			state = numstate::EXPONENT;
		case numstate::EXPONENT =>
			if (!ascii::isdigit(rn)) {
				unget(lex, rn);
				break;
			};
		};

		memio::appendrune(&lex.strbuf, rn)!;
	};

	// Every in-loop break leaves the machine in an accepting state, but EOF
	// can interrupt it anywhere. Reject truncated numbers here rather than
	// relying on the float parser to be as strict as the JSON grammar.
	switch (state) {
	case numstate::ZERO, numstate::INTEGER, numstate::FRACTION,
		numstate::EXPONENT =>
		yield;
	case =>
		return lex.loc: invalid;
	};

	match (strconv::stof64(memio::string(&lex.strbuf)!)) {
	case let f: f64 =>
		return f;
	case =>
		return lex.loc: invalid;
	};
};
|
||
|
|
||
|
// Scans the body of a JSON string, the opening quote having already been
// consumed, and returns its decoded contents as a string token borrowed from
// the lexer's scratch buffer. Returns invalid if the input ends before the
// closing quote or if the string contains an unescaped control character.
fn scan_str(lex: *lexer) (token | error) = {
	memio::reset(&lex.strbuf);

	for (true) {
		const rn = match (nextrune(lex)?) {
		case io::EOF =>
			// Report the error one column past the last rune read.
			lex.loc.1 += 1;
			return lex.loc: invalid;
		case let rn: rune =>
			yield rn;
		};

		if (rn == '"') {
			break;
		};

		let decoded = rn;
		if (rn == '\\') {
			decoded = scan_escape(lex)?;
		} else if (iscntrl(rn)) {
			return lex.loc: invalid;
		};
		memio::appendrune(&lex.strbuf, decoded)!;
	};

	return memio::string(&lex.strbuf)!;
};
|
||
|
|
||
|
// Scans the remainder of a string escape sequence, the backslash having
// already been consumed, and returns the rune it denotes.
//
// "\uXXXX" escapes naming a UTF-16 high surrogate must be followed immediately
// by a second "\uXXXX" escape naming a low surrogate; the pair is combined
// into a single code point. A lone or mismatched surrogate, an unknown escape
// letter, malformed hex digits, or EOF in the middle of the sequence all yield
// invalid.
fn scan_escape(lex: *lexer) (rune | error) = {
	const rn = match (nextrune(lex)?) {
	case let rn: rune =>
		yield rn;
	case io::EOF =>
		return lex.loc: invalid;
	};

	switch (rn) {
	case '\"' =>
		return '\"';
	case '\\' =>
		return '\\';
	case '/' =>
		return '/';
	case 'b' =>
		return '\b';
	case 'f' =>
		return '\f';
	case 'n' =>
		return '\n';
	case 'r' =>
		return '\r';
	case 't' =>
		return '\t';
	case 'u' =>
		const unit = scan_escape_hex4(lex)?;
		if (unit < 0xD800 || unit > 0xDFFF) {
			// Ordinary Basic Multilingual Plane code point.
			return unit: rune;
		};
		if (unit >= 0xDC00) {
			// Low surrogate with no preceding high surrogate.
			return lex.loc: invalid;
		};

		// High surrogate: the low half must follow as another "\u" escape.
		match (nextrune(lex)?) {
		case let r: rune =>
			if (r != '\\') {
				return lex.loc: invalid;
			};
		case io::EOF =>
			return lex.loc: invalid;
		};
		match (nextrune(lex)?) {
		case let r: rune =>
			if (r != 'u') {
				return lex.loc: invalid;
			};
		case io::EOF =>
			return lex.loc: invalid;
		};
		const low = scan_escape_hex4(lex)?;
		if (low < 0xDC00 || low > 0xDFFF) {
			return lex.loc: invalid;
		};
		return (0x10000 + ((unit - 0xD800) << 10) + (low - 0xDC00)): rune;
	case =>
		return lex.loc: invalid;
	};
};

// Reads the four hexadecimal digits of a "\u" escape and returns the 16-bit
// value they encode, advancing the column by four on success. The bytes are
// read straight from the source handle; this is sound because callers invoke
// it right after nextrune, which leaves the rune push-back slot empty.
fn scan_escape_hex4(lex: *lexer) (u32 | error) = {
	let buf: [4]u8 = [0...];
	match (io::readall(lex.src, buf)?) {
	case io::EOF =>
		return lex.loc: invalid;
	case size =>
		yield;
	};
	const digits = match (strings::fromutf8(buf)) {
	case let s: str =>
		yield s;
	case =>
		return lex.loc: invalid;
	};
	match (strconv::stou32(digits, strconv::base::HEX)) {
	case let u: u32 =>
		lex.loc.1 += 4;
		return u;
	case =>
		return lex.loc: invalid;
	};
};
|
||
|
|
||
|
// Returns the next rune of input, preferring a rune pushed back by unget over
// reading from the source. Before returning, the current location is saved in
// prevrloc and then advanced: a newline moves to column zero of the following
// line, and any other rune moves one column to the right.
fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
	let r: rune = '\0';
	match (lex.rb) {
	case let pending: rune =>
		lex.rb = void;
		r = pending;
	case void =>
		match (bufio::read_rune(lex.src)) {
		case let err: io::error =>
			return err;
		case utf8::invalid =>
			return lex.loc: invalid;
		case io::EOF =>
			return io::EOF;
		case let fresh: rune =>
			r = fresh;
		};
	};

	lex.prevrloc = lex.loc;
	if (r == '\n') {
		lex.loc = (lex.loc.0 + 1, 0);
	} else {
		lex.loc.1 += 1;
	};
	return r;
};
|
||
|
|
||
|
// Returns the next rune that is not JSON whitespace, discarding any
// whitespace that precedes it. EOF and errors from nextrune are passed
// through unchanged.
fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
	for (true) {
		const rn = match (nextrune(lex)?) {
		case io::EOF =>
			return io::EOF;
		case let rn: rune =>
			yield rn;
		};
		if (!isspace(rn)) {
			return rn;
		};
	};
};
|
||
|
|
||
|
// Pushes a single rune back onto the lexer so that the next call to nextrune
// returns it, and rewinds the location to where it was before that rune was
// read. Only one rune may be pending at a time; a second push-back trips the
// assertion.
fn unget(lex: *lexer, r: rune) void = {
	assert(lex.rb is void);
	lex.loc = lex.prevrloc;
	lex.rb = r;
};
|
||
|
|
||
|
// Reports whether a rune is one of the control characters U+0000 through
// U+001F.
fn iscntrl(r: rune) bool = {
	const code = r: u32;
	return code < 0x20;
};
|
||
|
|
||
|
// Reports whether a rune is JSON whitespace. The JSON grammar allows exactly
// four characters: space, horizontal tab, line feed and carriage return.
// Form feed and vertical tab are whitespace in the ASCII sense but are not
// permitted between JSON tokens, so the set is spelled out explicitly.
fn isspace(r: rune) bool =
	r == ' ' || r == '\t' || r == '\n' || r == '\r';
|