// License: MPL-2.0
// (c) 2022 Drew DeVault
use ascii;
use bufio;
use encoding::utf8;
use io;
use os;
use strconv;
use strings;
use memio;

// State of a JSON lexer. Create one with [[newlexer]] and release it with
// [[close]].
export type lexer = struct {
	// Handle the JSON text is read from.
	src: io::handle,
	// Scratch buffer holding the text of the word, number or string
	// currently being scanned.
	strbuf: memio::stream,
	// Token pushed back by [[unlex]], or void if there is none.
	un: (token | void),
	// Rune pushed back by unget, or void if there is none.
	rb: (rune | void),
	// Current position. The first member is bumped on every newline, the
	// second on every other rune.
	loc: (uint, uint),
	// Position recorded at the start of the latest [[lex]] call.
	prevloc: (uint, uint),
	// Position saved by [[unlex]], restored when the token is re-read.
	nextloc: (uint, uint),
	// Position before the most recently read rune, restored by unget.
	prevrloc: (uint, uint),
};

// Creates a new JSON lexer reading from the given handle. Tokens are obtained
// with [[lex]]; the caller should pass the lexer to [[close]] once finished.
export fn newlexer(src: io::handle) lexer = lexer {
	src = src,
	strbuf = memio::dynamic(),
	un = void,
	rb = void,
	loc = (1, 0),
	...
};

// Frees state associated with a JSON lexer.
export fn close(lex: *lexer) void = {
	io::close(&lex.strbuf)!;
};

// Returns the next token from a JSON lexer. The return value is borrowed from
// the lexer and will be overwritten on subsequent calls.
export fn lex(lex: *lexer) (token | io::EOF | error) = {
	// A token handed back through unlex takes priority over the input.
	match (lex.un) {
	case let pending: token =>
		lex.un = void;
		lex.prevloc = lex.loc;
		lex.loc = lex.nextloc;
		return pending;
	case void =>
		lex.prevloc = lex.loc;
	};

	const first = match (nextrunews(lex)?) {
	case let r: rune =>
		yield r;
	case io::EOF =>
		return io::EOF;
	};

	// Punctuation and strings are identified by their first rune alone.
	switch (first) {
	case '"' =>
		return scan_str(lex)?;
	case ',' =>
		return comma;
	case ':' =>
		return colon;
	case '[' =>
		return arraystart;
	case ']' =>
		return arrayend;
	case '{' =>
		return objstart;
	case '}' =>
		return objend;
	case =>
		yield;
	};

	// Numbers: give the rune back so that scan_number sees all of it.
	if (first == '-' || ascii::isdigit(first)) {
		unget(lex, first);
		return scan_number(lex)?;
	};

	// All that is left are the keywords true, false and null.
	if (!ascii::isalpha(first)) {
		return lex.loc: invalid;
	};
	unget(lex, first);
	const word = scan_word(lex)?;
	if (word == "true") {
		return true;
	};
	if (word == "false") {
		return false;
	};
	if (word == "null") {
		return _null;
	};
	return lex.loc: invalid;
};

// "Unlexes" a token from the lexer, such that the next call to [[lex]] will
// return that token again. Only one token can be unlexed at a time, otherwise
// the program will abort.
export fn unlex(lex: *lexer, tok: token) void = {
	assert(lex.un is void, "encoding::json::unlex called twice in a row");
	lex.un = tok;
	// Remember where lexing had got to and step back to where the token
	// started; lex swaps these back when it hands the token out again.
	lex.nextloc = lex.loc;
	lex.loc = lex.prevloc;
};

// Scans until encountering a non-alphabetical character, returning the
// resulting word. The word is borrowed from the lexer's string buffer.
fn scan_word(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);

	for (true) {
		const rn = match (nextrune(lex)?) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			break;
		};
		if (!ascii::isalpha(rn)) {
			// Not part of the word: leave it for the next token.
			unget(lex, rn);
			break;
		};
		memio::appendrune(&lex.strbuf, rn)!;
	};

	return memio::string(&lex.strbuf)!;
};

// States of the number scanner. Each value names what the scanner expects or
// has just consumed.
type numstate = enum {
	// Optional leading '-'.
	SIGN,
	// First digit of the integer part.
	START,
	// Integer part is a single '0'; no further digits may follow.
	ZERO,
	// Inside an integer part that started with a non-zero digit.
	INTEGER,
	// Just after '.'; at least one digit is required.
	FRACSTART,
	// Inside the fraction digits.
	FRACTION,
	// Just after 'e' or 'E'; an optional '+' or '-' may follow.
	EXPSIGN,
	// First exponent digit; at least one digit is required.
	EXPSTART,
	// Inside the exponent digits.
	EXPONENT,
};

// Scans a JSON number and returns it as an f64 token. The text is collected
// in the lexer's string buffer while a state machine validates its shape.
// Returns invalid if the text is not a well-formed JSON number, including
// when the input ends in the middle of one (e.g. "-", "1." or "1e+").
fn scan_number(lex: *lexer) (token | error) = {
	memio::reset(&lex.strbuf);

	let state = numstate::SIGN;
	for (true) {
		const rn = match (nextrune(lex)?) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			break;
		};

		switch (state) {
		case numstate::SIGN =>
			state = numstate::START;
			if (rn != '-') {
				// No sign: re-read this rune as the first digit.
				unget(lex, rn);
				continue;
			};
		case numstate::START =>
			switch (rn) {
			case '0' =>
				state = numstate::ZERO;
			case =>
				if (!ascii::isdigit(rn)) {
					return lex.loc: invalid;
				};
				state = numstate::INTEGER;
			};
		case numstate::ZERO =>
			switch (rn) {
			case '.' =>
				state = numstate::FRACSTART;
			case 'e', 'E' =>
				state = numstate::EXPSIGN;
			case =>
				// Leading zeroes are not allowed.
				if (ascii::isdigit(rn)) {
					return lex.loc: invalid;
				};
				unget(lex, rn);
				break;
			};
		case numstate::INTEGER =>
			switch (rn) {
			case '.' =>
				state = numstate::FRACSTART;
			case 'e', 'E' =>
				state = numstate::EXPSIGN;
			case =>
				if (!ascii::isdigit(rn)) {
					unget(lex, rn);
					break;
				};
			};
		case numstate::FRACSTART =>
			if (!ascii::isdigit(rn)) {
				return lex.loc: invalid;
			};
			state = numstate::FRACTION;
		case numstate::FRACTION =>
			switch (rn) {
			case 'e', 'E' =>
				state = numstate::EXPSIGN;
			case =>
				if (!ascii::isdigit(rn)) {
					unget(lex, rn);
					break;
				};
			};
		case numstate::EXPSIGN =>
			state = numstate::EXPSTART;
			if (rn != '+' && rn != '-') {
				// No sign: re-read this rune as the first
				// exponent digit.
				unget(lex, rn);
				continue;
			};
		case numstate::EXPSTART =>
			if (!ascii::isdigit(rn)) {
				return lex.loc: invalid;
			};
			state = numstate::EXPONENT;
		case numstate::EXPONENT =>
			if (!ascii::isdigit(rn)) {
				unget(lex, rn);
				break;
			};
		};

		memio::appendrune(&lex.strbuf, rn)!;
	};

	// The loop also ends on EOF, which can happen in any state. Only these
	// four states mark a complete number; checking here keeps the result
	// independent of how lenient strconv::stof64 is about truncated text.
	switch (state) {
	case numstate::ZERO, numstate::INTEGER, numstate::FRACTION,
		numstate::EXPONENT =>
		yield;
	case =>
		return lex.loc: invalid;
	};

	match (strconv::stof64(memio::string(&lex.strbuf)!)) {
	case let f: f64 =>
		return f;
	case =>
		return lex.loc: invalid;
	};
};

// Scans the remainder of a string after its opening quote has been consumed.
// Escape sequences are decoded. The returned string is borrowed from the
// lexer's string buffer. Returns invalid on an unescaped control character
// or when the input ends before the closing quote.
fn scan_str(lex: *lexer) (token | error) = {
	memio::reset(&lex.strbuf);

	for (true) {
		const rn = match (nextrune(lex)?) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			lex.loc.1 += 1;
			return lex.loc: invalid;
		};

		switch (rn) {
		case '"' =>
			break;
		case '\\' =>
			const rn = scan_escape(lex)?;
			memio::appendrune(&lex.strbuf, rn)!;
		case =>
			if (iscntrl(rn)) {
				return lex.loc: invalid;
			};
			memio::appendrune(&lex.strbuf, rn)!;
		};
	};

	return memio::string(&lex.strbuf)!;
};

// Decodes one escape sequence after its backslash has been consumed and
// returns the rune it stands for. A \uXXXX escape naming a high surrogate must
// be followed immediately by a \uXXXX escape naming a low surrogate; the pair
// is combined into a single code point. Lone surrogates are rejected because
// they cannot be represented as a rune in a UTF-8 string.
fn scan_escape(lex: *lexer) (rune | error) = {
	const rn = match (nextrune(lex)?) {
	case let rn: rune =>
		yield rn;
	case io::EOF =>
		return lex.loc: invalid;
	};

	switch (rn) {
	case '\"' =>
		return '\"';
	case '\\' =>
		return '\\';
	case '/' =>
		return '/';
	case 'b' =>
		return '\b';
	case 'f' =>
		return '\f';
	case 'n' =>
		return '\n';
	case 'r' =>
		return '\r';
	case 't' =>
		return '\t';
	case 'u' =>
		yield;
	case =>
		return lex.loc: invalid;
	};

	const first = scan_hex4(lex)?;
	if (first < 0xD800 || first > 0xDFFF) {
		// Ordinary code point from the Basic Multilingual Plane.
		return first: rune;
	};
	if (first >= 0xDC00) {
		// Low surrogate without a preceding high surrogate.
		return lex.loc: invalid;
	};

	// High surrogate: the next six bytes must be "\u" and four hex digits
	// naming the low surrogate. No rune is pushed back at this point (the
	// nextrune call above consumed any), so reading the handle directly
	// is safe.
	let intro: [2]u8 = [0...];
	match (io::readall(lex.src, intro)?) {
	case io::EOF =>
		return lex.loc: invalid;
	case size =>
		yield;
	};
	// 0x5C is a backslash, 0x75 is a lowercase 'u'.
	if (intro[0] != 0x5C || intro[1] != 0x75) {
		return lex.loc: invalid;
	};
	lex.loc.1 += 2;

	const second = scan_hex4(lex)?;
	if (second < 0xDC00 || second > 0xDFFF) {
		return lex.loc: invalid;
	};

	const cp = 0x10000u32 + ((first - 0xD800) << 10) + (second - 0xDC00);
	return cp: rune;
};

// Reads exactly four hexadecimal digits straight from the source handle and
// returns their value, advancing the column by four on success. Every byte is
// checked explicitly rather than relying on strconv to reject a sign or any
// other non-digit byte.
fn scan_hex4(lex: *lexer) (u32 | error) = {
	let buf: [4]u8 = [0...];
	match (io::readall(lex.src, buf)?) {
	case io::EOF =>
		return lex.loc: invalid;
	case size =>
		yield;
	};

	for (let i = 0z; i < len(buf); i += 1) {
		if (!ascii::isxdigit(buf[i]: u32: rune)) {
			return lex.loc: invalid;
		};
	};

	const s = match (strings::fromutf8(buf)) {
	case let s: str =>
		yield s;
	case =>
		return lex.loc: invalid;
	};
	match (strconv::stou32(s, strconv::base::HEX)) {
	case let u: u32 =>
		lex.loc.1 += 4;
		return u;
	case =>
		return lex.loc: invalid;
	};
};

// Gets the next rune from the lexer. A rune pushed back with unget is
// returned first; otherwise one is read from the source handle. The position
// before the rune is saved in prevrloc so that unget can restore it, then the
// position is advanced: a newline bumps the first member of loc and resets
// the second, any other rune bumps the second.
fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
	if (lex.rb is rune) {
		lex.prevrloc = lex.loc;
		const r = lex.rb as rune;
		lex.rb = void;
		if (r == '\n') {
			lex.loc = (lex.loc.0 + 1, 0);
		} else {
			lex.loc.1 += 1;
		};
		return r;
	};

	match (bufio::read_rune(lex.src)) {
	case let err: io::error =>
		return err;
	case utf8::invalid =>
		return lex.loc: invalid;
	case io::EOF =>
		return io::EOF;
	case let rn: rune =>
		lex.prevrloc = lex.loc;
		if (rn == '\n') {
			lex.loc = (lex.loc.0 + 1, 0);
		} else {
			lex.loc.1 += 1;
		};
		return rn;
	};
};

// Like nextrune but skips whitespace.
fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
	for (true) {
		match (nextrune(lex)?) {
		case let rn: rune =>
			if (isspace(rn)) {
				continue;
			};
			return rn;
		case io::EOF =>
			return io::EOF;
		};
	};
};

// Pushes a rune back so that the next nextrune call returns it again, and
// restores the position recorded before it was read. Only one rune can be
// pushed back at a time; a second push aborts the program.
fn unget(lex: *lexer, r: rune) void = {
	assert(lex.rb is void);
	lex.rb = r;
	lex.loc = lex.prevrloc;
};

// Reports whether a rune is a control character that may not appear unescaped
// inside a JSON string (U+0000 through U+001F).
fn iscntrl(r: rune) bool = r: u32 < 0x20;

// Reports whether a rune is JSON whitespace. RFC 8259 permits exactly space,
// horizontal tab, line feed and carriage return between tokens, so the four
// are compared explicitly instead of using a broader character class.
fn isspace(r: rune) bool = r == ' ' || r == '\t' || r == '\n' || r == '\r';