1
Fork 0
hare-playground/vendor/hare-json/encoding/json/lex.ha
2024-06-01 16:46:01 +02:00

378 lines
7 KiB
Hare

// License: MPL-2.0
// (c) 2022 Drew DeVault <sir@cmpwn.com>
use ascii;
use bufio;
use encoding::utf8;
use io;
use os;
use strconv;
use strings;
use memio;
// State for a JSON lexer. Create one with [[newlexer]] and release it with
// [[close]].
export type lexer = struct {
// Input handle that runes are read from.
src: io::handle,
// Scratch buffer in which words, numbers and string contents are accumulated
// while scanning; it is reset at the start of each scan.
strbuf: memio::stream,
// Token pushed back by [[unlex]], or void if there is none.
un: (token | void),
// Rune pushed back by unget, or void if there is none.
rb: (rune | void),
// Current location. The first field starts at 1 and is incremented on each
// newline; the second is reset to 0 on a newline and incremented for every
// other rune (presumably line and column).
loc: (uint, uint),
// Value of loc recorded at the start of the most recent [[lex]] call;
// restored into loc by [[unlex]].
prevloc: (uint, uint),
// Value of loc saved by [[unlex]]; restored into loc when the unlexed token
// is returned again.
nextloc: (uint, uint),
// Value of loc recorded before the most recent rune was consumed; restored
// into loc by unget.
prevrloc: (uint, uint),
};
// Initializes a JSON lexer which reads its input from src. Tokens are obtained
// by calling [[lex]]; once finished, the caller should hand the lexer to
// [[close]] to release its resources.
export fn newlexer(src: io::handle) lexer = {
	return lexer {
		loc = (1, 0),
		rb = void,
		un = void,
		strbuf = memio::dynamic(),
		src = src,
		...
	};
};
// Releases the resources held by a JSON lexer (its internal scan buffer).
export fn close(lex: *lexer) void = io::close(&lex.strbuf)!;
// Produces the next token from a JSON lexer, or io::EOF once the input is
// exhausted. The returned token is borrowed from the lexer and is overwritten
// by later calls.
export fn lex(lex: *lexer) (token | io::EOF | error) = {
	// A token handed back through [[unlex]] takes priority over the input.
	match (lex.un) {
	case let pending: token =>
		lex.un = void;
		lex.prevloc = lex.loc;
		lex.loc = lex.nextloc;
		return pending;
	case void =>
		lex.prevloc = lex.loc;
	};

	const first = match (nextrunews(lex)?) {
	case let r: rune =>
		yield r;
	case io::EOF =>
		return io::EOF;
	};

	// Single-rune tokens and strings are recognized from their first rune.
	switch (first) {
	case '[' =>
		return arraystart;
	case ']' =>
		return arrayend;
	case '{' =>
		return objstart;
	case '}' =>
		return objend;
	case ',' =>
		return comma;
	case ':' =>
		return colon;
	case '"' =>
		return scan_str(lex)?;
	case =>
		yield;
	};

	// Numbers start with a digit or a minus sign; the rune is pushed back
	// so that the number scanner sees the complete literal.
	if (first == '-' || ascii::isdigit(first)) {
		unget(lex, first);
		return scan_number(lex)?;
	};

	// Anything else must be one of the keywords.
	if (!ascii::isalpha(first)) {
		return lex.loc: invalid;
	};
	unget(lex, first);

	switch (scan_word(lex)?) {
	case "true" =>
		return true;
	case "false" =>
		return false;
	case "null" =>
		return _null;
	case =>
		return lex.loc: invalid;
	};
};
// "Unlexes" a token from the lexer, such that the next call to [[lex]] will
// return that token again. Only one token can be unlexed at a time, otherwise
// the program will abort.
export fn unlex(lex: *lexer, tok: token) void = {
	assert(lex.un is void, "encoding::json::unlex called twice in a row");

	// Remember where the lexer currently is, then rewind to where the
	// token being returned started.
	lex.nextloc = lex.loc;
	lex.loc = lex.prevloc;

	lex.un = tok;
};
// Collects consecutive ASCII alphabetical runes into the lexer's scratch buffer
// and returns them as a string. Scanning stops at end of input or at the first
// non-alphabetical rune, which is pushed back for the next read.
fn scan_word(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);

	for (true) {
		match (nextrune(lex)?) {
		case io::EOF =>
			break;
		case let r: rune =>
			if (!ascii::isalpha(r)) {
				unget(lex, r);
				break;
			};
			memio::appendrune(&lex.strbuf, r)!;
		};
	};

	return memio::string(&lex.strbuf)!;
};
// States of the state machine used by scan_number to validate a number
// literal one rune at a time.
type numstate = enum {
// Initial state: an optional leading '-' may appear here.
SIGN,
// Expecting the first integer digit: '0' leads to ZERO, any other digit to
// INTEGER; anything else is invalid.
START,
// A leading '0' was read: only '.', 'e' or 'E' may extend the number, and a
// further digit is invalid.
ZERO,
// Inside the integer digits; '.', 'e' or 'E' may follow.
INTEGER,
// Just after '.': at least one digit is required.
FRACSTART,
// Inside the fractional digits; 'e' or 'E' may follow.
FRACTION,
// Just after 'e' or 'E': an optional '+' or '-' may appear here.
EXPSIGN,
// Expecting the first exponent digit: at least one digit is required.
EXPSTART,
// Inside the exponent digits.
EXPONENT,
};
// Scans a JSON number literal and returns it as an f64 token.
//
// The literal is validated one rune at a time with the numstate state machine
// while its text is accumulated in the lexer's scratch buffer; the text is then
// converted with strconv::stof64. The first rune which cannot extend the number
// is pushed back for the next read. Returns invalid (carrying the current
// location) if the literal is malformed or ends prematurely.
fn scan_number(lex: *lexer) (token | error) = {
	memio::reset(&lex.strbuf);

	let state = numstate::SIGN;
	for (true) {
		const rn = match (nextrune(lex)?) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			break;
		};

		switch (state) {
		case numstate::SIGN =>
			state = numstate::START;
			if (rn != '-') {
				// No sign: re-read this rune in the START state
				// without appending it to the buffer.
				unget(lex, rn);
				continue;
			};
		case numstate::START =>
			switch (rn) {
			case '0' =>
				state = numstate::ZERO;
			case =>
				if (!ascii::isdigit(rn)) {
					return lex.loc: invalid;
				};
				state = numstate::INTEGER;
			};
		case numstate::ZERO =>
			switch (rn) {
			case '.' =>
				state = numstate::FRACSTART;
			case 'e', 'E' =>
				state = numstate::EXPSIGN;
			case =>
				// Leading zeroes are not permitted.
				if (ascii::isdigit(rn)) {
					return lex.loc: invalid;
				};
				unget(lex, rn);
				break;
			};
		case numstate::INTEGER =>
			switch (rn) {
			case '.' =>
				state = numstate::FRACSTART;
			case 'e', 'E' =>
				state = numstate::EXPSIGN;
			case =>
				if (!ascii::isdigit(rn)) {
					unget(lex, rn);
					break;
				};
			};
		case numstate::FRACSTART =>
			if (!ascii::isdigit(rn)) {
				return lex.loc: invalid;
			};
			state = numstate::FRACTION;
		case numstate::FRACTION =>
			switch (rn) {
			case 'e', 'E' =>
				state = numstate::EXPSIGN;
			case =>
				if (!ascii::isdigit(rn)) {
					unget(lex, rn);
					break;
				};
			};
		case numstate::EXPSIGN =>
			state = numstate::EXPSTART;
			if (rn != '+' && rn != '-') {
				// No exponent sign: re-read this rune in the
				// EXPSTART state without appending it.
				unget(lex, rn);
				continue;
			};
		case numstate::EXPSTART =>
			if (!ascii::isdigit(rn)) {
				return lex.loc: invalid;
			};
			state = numstate::EXPONENT;
		case numstate::EXPONENT =>
			if (!ascii::isdigit(rn)) {
				unget(lex, rn);
				break;
			};
		};

		memio::appendrune(&lex.strbuf, rn)!;
	};

	// The loop also ends at end of input, which may leave the machine in
	// a state that still requires more runes (for example after "1." or
	// "1e"). Reject those here so that truncated literals are treated the
	// same way at EOF as when followed by another rune, rather than
	// depending on how lenient the float parser happens to be.
	switch (state) {
	case numstate::ZERO, numstate::INTEGER, numstate::FRACTION,
		numstate::EXPONENT =>
		yield;
	case =>
		return lex.loc: invalid;
	};

	match (strconv::stof64(memio::string(&lex.strbuf)!)) {
	case let f: f64 =>
		return f;
	case =>
		return lex.loc: invalid;
	};
};
// Scans the remainder of a string literal, the opening quote having already
// been consumed, and returns its decoded contents as a token. Escape sequences
// are decoded with scan_escape. Returns invalid if the input ends before the
// closing quote or if the string contains a raw control character.
fn scan_str(lex: *lexer) (token | error) = {
	memio::reset(&lex.strbuf);

	for (true) {
		const r = match (nextrune(lex)?) {
		case io::EOF =>
			// Unterminated string: report the position just past
			// the end of the input.
			lex.loc.1 += 1;
			return lex.loc: invalid;
		case let r: rune =>
			yield r;
		};

		if (r == '"') {
			break;
		};
		if (r == '\\') {
			const decoded = scan_escape(lex)?;
			memio::appendrune(&lex.strbuf, decoded)!;
			continue;
		};
		if (iscntrl(r)) {
			return lex.loc: invalid;
		};
		memio::appendrune(&lex.strbuf, r)!;
	};

	return memio::string(&lex.strbuf)!;
};
// Reads exactly four bytes from the lexer's source, interprets them as
// hexadecimal digits (the payload of a \uXXXX escape) and returns their value.
// The column is advanced by four on success. Returns invalid if the input ends
// early or the bytes are not valid hexadecimal.
fn scan_hex4(lex: *lexer) (u32 | error) = {
	let buf: [4]u8 = [0...];
	match (io::readall(lex.src, buf)?) {
	case io::EOF =>
		return lex.loc: invalid;
	case size =>
		yield;
	};
	const s = match (strings::fromutf8(buf)) {
	case let s: str =>
		yield s;
	case =>
		return lex.loc: invalid;
	};
	match (strconv::stou32(s, strconv::base::HEX)) {
	case let u: u32 =>
		lex.loc.1 += 4;
		return u;
	case =>
		return lex.loc: invalid;
	};
};

// Decodes a string escape sequence, the leading backslash having already been
// consumed, and returns the rune it denotes.
//
// \uXXXX escapes in the UTF-16 surrogate range are handled as JSON requires: a
// high surrogate (D800-DBFF) must be followed immediately by a \uXXXX low
// surrogate (DC00-DFFF), and the pair is combined into a single code point. A
// lone or mismatched surrogate is rejected, since it does not denote a valid
// rune. Returns invalid (carrying the current location) for unknown or
// truncated escapes.
fn scan_escape(lex: *lexer) (rune | error) = {
	const rn = match (nextrune(lex)?) {
	case let rn: rune =>
		yield rn;
	case io::EOF =>
		return lex.loc: invalid;
	};
	switch (rn) {
	case '\"' =>
		return '\"';
	case '\\' =>
		return '\\';
	case '/' =>
		return '/';
	case 'b' =>
		return '\b';
	case 'f' =>
		return '\f';
	case 'n' =>
		return '\n';
	case 'r' =>
		return '\r';
	case 't' =>
		return '\t';
	case 'u' =>
		const hi = scan_hex4(lex)?;
		if (hi < 0xD800 || hi > 0xDFFF) {
			// Ordinary code point in the Basic Multilingual Plane.
			return hi: rune;
		};
		if (hi >= 0xDC00) {
			// A low surrogate without a preceding high surrogate.
			return lex.loc: invalid;
		};

		// High surrogate: the next two bytes must be "\u" (0x5C 0x75),
		// introducing the low half of the pair. They are read straight
		// from the source, like the hex digits themselves.
		let pfx: [2]u8 = [0...];
		match (io::readall(lex.src, pfx)?) {
		case io::EOF =>
			return lex.loc: invalid;
		case size =>
			yield;
		};
		if (pfx[0] != 0x5C || pfx[1] != 0x75) {
			return lex.loc: invalid;
		};
		lex.loc.1 += 2;

		const lo = scan_hex4(lex)?;
		if (lo < 0xDC00 || lo > 0xDFFF) {
			return lex.loc: invalid;
		};
		const cp = 0x10000 + ((hi - 0xD800) << 10) + (lo - 0xDC00);
		return cp: rune;
	case =>
		return lex.loc: invalid;
	};
};
// Returns the next rune of input, preferring a rune pushed back by unget over
// reading from the source, and advances the lexer's location past it. Returns
// io::EOF at end of input and invalid if the source is not valid UTF-8.
fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
	const r = match (lex.rb) {
	case let pushed: rune =>
		lex.rb = void;
		yield pushed;
	case void =>
		const fresh = match (bufio::read_rune(lex.src)) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			return io::EOF;
		case utf8::invalid =>
			return lex.loc: invalid;
		case let err: io::error =>
			return err;
		};
		yield fresh;
	};

	// Record where we were so that unget can rewind, then step past the
	// rune: a newline starts the next line, anything else moves one column.
	lex.prevrloc = lex.loc;
	if (r == '\n') {
		lex.loc = (lex.loc.0 + 1, 0);
	} else {
		lex.loc.1 += 1;
	};
	return r;
};
// Returns the next rune which is not whitespace, discarding any whitespace
// that precedes it, or io::EOF if the input ends first.
fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
	for (true) {
		const r = match (nextrune(lex)?) {
		case io::EOF =>
			return io::EOF;
		case let r: rune =>
			yield r;
		};
		if (!isspace(r)) {
			return r;
		};
	};
};
// Pushes a rune back onto the lexer so that the next call to nextrune returns
// it, and rewinds the location to where it was before that rune was read. Only
// one rune may be pending at a time; a second unget aborts.
fn unget(lex: *lexer, r: rune) void = {
	assert(lex.rb is void);
	lex.loc = lex.prevrloc;
	lex.rb = r;
};
// Reports whether r is a control character which may not appear unescaped in
// a JSON string, i.e. its code point is below U+0020.
fn iscntrl(r: rune) bool = {
	const cp = r: u32;
	return cp < 0x20;
};
// Reports whether r is insignificant whitespace in JSON. Only space, horizontal
// tab, line feed and carriage return qualify; other characters which ASCII
// classifies as whitespace, such as form feed and vertical tab, do not.
fn isspace(r: rune) bool =
	r == ' ' || r == '\t' || r == '\n' || r == '\r';