#ifndef TOML11_SYNTAX_IMPL_HPP #define TOML11_SYNTAX_IMPL_HPP #include "../fwd/syntax_fwd.hpp" #include "../scanner.hpp" #include "../spec.hpp" #include "../version.hpp" namespace toml { inline namespace TOML11_INLINE_VERSION_NAMESPACE { namespace detail { namespace syntax { using char_type = location::char_type; template struct syntax_cache { using value_type = cxx::return_type_of_t; static_assert(std::is_base_of::value, ""); explicit syntax_cache(F f) : func_(std::move(f)), cache_(cxx::make_nullopt()) {} value_type const& at(const spec& s) { if( ! this->cache_.has_value() || this->cache_.value().first != s) { this->cache_ = std::make_pair(s, func_(s)); } return this->cache_.value().second; } private: F func_; cxx::optional> cache_; }; template syntax_cache> make_cache(F&& f) { return syntax_cache>(std::forward(f)); } // =========================================================================== // UTF-8 // avoid redundant representation and out-of-unicode sequence TOML11_INLINE character_in_range const& utf8_1byte(const spec&) { static thread_local character_in_range cache(0x00, 0x7F); return cache; } TOML11_INLINE sequence const& utf8_2bytes(const spec&) { static thread_local sequence cache( character_in_range(0xC2, 0xDF), character_in_range(0x80, 0xBF)); return cache; } TOML11_INLINE sequence const& utf8_3bytes(const spec&) { static thread_local sequence cache(/*1~2 bytes = */either( sequence(character (0xE0), character_in_range(0xA0, 0xBF)), sequence(character_in_range(0xE1, 0xEC), character_in_range(0x80, 0xBF)), sequence(character (0xED), character_in_range(0x80, 0x9F)), sequence(character_in_range(0xEE, 0xEF), character_in_range(0x80, 0xBF)) ), /*3rd byte = */ character_in_range(0x80, 0xBF)); return cache; } TOML11_INLINE sequence const& utf8_4bytes(const spec&) { static thread_local sequence cache(/*1~2 bytes = */either( sequence(character (0xF0), character_in_range(0x90, 0xBF)), sequence(character_in_range(0xF1, 0xF3), character_in_range(0x80, 0xBF)), 
sequence(character (0xF4), character_in_range(0x80, 0x8F)) ), character_in_range(0x80, 0xBF), character_in_range(0x80, 0xBF)); return cache; } // =========================================================================== // Whitespace TOML11_INLINE character_either const& wschar(const spec&) { static thread_local character_either cache(" \t"); return cache; } TOML11_INLINE repeat_at_least const& ws(const spec& sp) { static thread_local auto cache = make_cache([](const spec& s){ return repeat_at_least(0, wschar(s)); }); return cache.at(sp); } // =========================================================================== // Newline TOML11_INLINE either const& newline(const spec&) { static thread_local either cache(character(char_type('\n')), literal("\r\n")); return cache; } // =========================================================================== // Comments TOML11_INLINE either const& allowed_comment_char(const spec& sp) { static thread_local auto cache = make_cache([](const spec& s){ if(s.v1_1_0_allow_control_characters_in_comments) { return either( character_in_range(0x01, 0x09), character_in_range(0x0E, 0x7F), non_ascii(s) ); } else { return either( character(0x09), character_in_range(0x20, 0x7E), non_ascii(s) ); } }); return cache.at(sp); } // XXX Note that it does not take newline TOML11_INLINE sequence const& comment(const spec& sp) { static thread_local auto cache = make_cache([](const spec& s){ return sequence(character(char_type('#')), repeat_at_least(0, allowed_comment_char(s))); }); return cache.at(sp); } // =========================================================================== // Boolean TOML11_INLINE either const& boolean(const spec&) { static thread_local either cache(literal("true"), literal("false")); return cache; } // =========================================================================== // Integer // non-digit-graph = ([a-zA-Z]|unicode mb char) // graph = ([a-zA-Z0-9]|unicode mb char) // suffix = _ non-digit-graph (graph | _graph) 
// Numeric suffix: '_' followed by a letter (or non-ASCII character), then
// any number of letters/digits/non-ASCII characters, each of which may be
// preceded by a single '_'.
TOML11_INLINE sequence const& num_suffix(const spec& sp)
{
    static thread_local auto cache = make_cache([](const spec& cfg) {
        const auto underscore  = character(char_type('_'));
        const auto head_letter = either(alpha(cfg), non_ascii(cfg));
        const auto body_letter = either(alpha(cfg), digit(cfg), non_ascii(cfg));

        return sequence(
            underscore,
            head_letter,
            repeat_at_least(0,
                either(sequence(underscore, body_letter), body_letter))
        );
    });
    return cache.at(sp);
}

// Decimal integer: optional sign, then either a non-zero digit followed by
// at least one more digit (each optionally preceded by '_'), or one digit.
TOML11_INLINE sequence const& dec_int(const spec& sp)
{
    static thread_local auto cache = make_cache([](const spec& cfg) {
        const auto sign    = maybe(character_either("+-"));
        const auto nonzero = character_in_range(char_type('1'), char_type('9'));
        const auto tail    = repeat_at_least(1,
            either(digit(cfg), sequence(character(char_type('_')), digit(cfg))));

        return sequence(sign, either(sequence(nonzero, tail), digit(cfg)));
    });
    return cache.at(sp);
}

// Hexadecimal integer: "0x", one hex digit, then more hex digits, each
// optionally preceded by '_'.
TOML11_INLINE sequence const& hex_int(const spec& sp)
{
    static thread_local auto cache = make_cache([](const spec& cfg) {
        const auto separated = sequence(character(char_type('_')), hexdig(cfg));

        return sequence(
            literal("0x"),
            hexdig(cfg),
            repeat_at_least(0, either(hexdig(cfg), separated))
        );
    });
    return cache.at(sp);
}

// Octal integer: "0o", one digit in 0-7, then more such digits, each
// optionally preceded by '_'.
TOML11_INLINE sequence const& oct_int(const spec& s)
{
    static thread_local auto cache = make_cache([](const spec&) {
        const auto octal     = character_in_range(char_type('0'), char_type('7'));
        const auto separated = sequence(character(char_type('_')), octal);

        return sequence(
            literal("0o"),
            octal,
            repeat_at_least(0, either(octal, separated))
        );
    });
    return cache.at(s);
}

// Binary integer: "0b", one of '0'/'1', then more such digits, each
// optionally preceded by '_'.
TOML11_INLINE sequence const& bin_int(const spec& s)
{
    static thread_local auto cache = make_cache([](const spec&) {
        const auto bit       = character_either("01");
        const auto separated = sequence(character(char_type('_')), bit);

        return sequence(
            literal("0b"),
            bit,
            repeat_at_least(0, either(bit, separated))
        );
    });
    return cache.at(s);
}

// Any integer. The prefixed forms are tried before the decimal form.
TOML11_INLINE either const& integer(const spec& sp)
{
    static thread_local auto cache = make_cache([](const spec& cfg) {
        return either(hex_int(cfg), oct_int(cfg), bin_int(cfg), dec_int(cfg));
    });
    return cache.at(sp);
}

// ===========================================================================
// Floating

// One or more digits where a leading zero is allowed; every digit after the
// first may be preceded by '_'.
TOML11_INLINE sequence const& zero_prefixable_int(const spec& sp)
{
    static thread_local auto cache = make_cache([](const spec& cfg) {
        const auto separated = sequence(character('_'), digit(cfg));

        return sequence(
            digit(cfg),
            repeat_at_least(0, either(digit(cfg), separated))
        );
    });
    return cache.at(sp);
}

// '.' followed by a zero-prefixable integer.
TOML11_INLINE sequence const& fractional_part(const spec& sp)
{
    static thread_local auto cache = make_cache([](const spec& cfg) {
        return sequence(character('.'), zero_prefixable_int(cfg));
    });
    return cache.at(sp);
}

// 'e' or 'E', optional sign, then a zero-prefixable integer.
TOML11_INLINE sequence const& exponent_part(const spec& sp)
{
    static thread_local auto cache = make_cache([](const spec& cfg) {
        const auto marker = character_either("eE");
        const auto sign   = maybe(character_either("+-"));

        return sequence(marker, sign, zero_prefixable_int(cfg));
    });
    return cache.at(sp);
}

TOML11_INLINE sequence const& hex_floating(const spec& sp)
{
    static thread_local auto cache = make_cache([](const spec& cfg) {
        // C99 hexfloat (%a)
        // [+-]? 0x ( [0-9a-fA-F]*\.[0-9a-fA-F]+ | [0-9a-fA-F]+\.? ) [pP] [+-]? [0-9]+

        // - 0x(int).(frac)p[+-](int)
        // - 0x(int).p[+-](int)
        // - 0x.(frac)p[+-](int)
        // - 0x(int)p[+-](int)

        const auto sign = maybe(character_either("+-"));

        // (int)?.(frac)  |  (int).?
        const auto mantissa = either(
            sequence(
                repeat_at_least(0, hexdig(cfg)),
                character('.'),
                repeat_at_least(1, hexdig(cfg))
            ),
            sequence(
                repeat_at_least(1, hexdig(cfg)),
                maybe(character('.'))
            )
        );

        const auto exponent_digits =
            repeat_at_least(1, character_in_range('0', '9'));

        return sequence(
            sign,
            character('0'),
            character_either("xX"),
            mantissa,
            character_either("pP"),
            sign,
            exponent_digits
        );
    });
    return cache.at(sp);
}

// Floating point: a decimal integer followed by an exponent, or by a
// fraction with an optional exponent; or an optionally signed "inf"/"nan".
TOML11_INLINE either const& floating(const spec& sp)
{
    static thread_local auto cache = make_cache([](const spec& cfg) {
        const auto finite = sequence(
            dec_int(cfg),
            either(
                exponent_part(cfg),
                sequence(fractional_part(cfg), maybe(exponent_part(cfg)))
            )
        );
        const auto special = sequence(
            maybe(character_either("+-")),
            either(literal("inf"), literal("nan"))
        );

        return either(finite, special);
    });
    return cache.at(sp);
}

// ===========================================================================
// Datetime

// 4 digits, '-', 2 digits, '-', 2 digits.
TOML11_INLINE sequence const& local_date(const spec& sp)
{
    static thread_local auto cache = make_cache([](const spec& cfg) {
        const auto dash       = character('-');
        const auto two_digits = repeat_exact(2, digit(cfg));

        return sequence(
            repeat_exact(4, digit(cfg)), dash, two_digits, dash, two_digits
        );
    });
    return cache.at(sp);
}

// 2 digits, ':', 2 digits, ':', 2 digits, then an optional '.' with one or
// more digits. When `v1_1_0_make_seconds_optional` is set, everything after
// the minutes is optional as a whole.
TOML11_INLINE sequence const& local_time(const spec& sp)
{
    static thread_local auto cache = make_cache([](const spec& cfg) {
        const auto colon      = character(':');
        const auto two_digits = repeat_exact(2, digit(cfg));
        const auto subsecond  = maybe(
            sequence(character('.'), repeat_at_least(1, digit(cfg))));

        if(cfg.v1_1_0_make_seconds_optional)
        {
            return sequence(
                two_digits, colon, two_digits,
                maybe(sequence(colon, two_digits, subsecond))
            );
        }
        return sequence(
            two_digits, colon, two_digits, colon, two_digits, subsecond
        );
    });
    return cache.at(sp);
}

// 'z' / 'Z', or a sign followed by 2 digits, ':', 2 digits.
TOML11_INLINE either const& time_offset(const spec& sp)
{
    static thread_local auto cache = make_cache([](const spec& cfg) {
        const auto two_digits = repeat_exact(2, digit(cfg));
        const auto numeric    = sequence(
            character_either("+-"), two_digits, character(':'), two_digits);

        return either(character_either("zZ"), numeric);
    });
    return cache.at(sp);
}

// A local time immediately followed by a time offset.
TOML11_INLINE sequence const& full_time(const spec& sp)
{
    static thread_local auto cache = make_cache([](const spec& cfg) {
        const auto& time   = local_time(cfg);
        const auto& offset = time_offset(cfg);
        return sequence(time, offset);
    });
    return cache.at(sp);
}

// Separator between the date and the time: 'T', 't' or a space.
TOML11_INLINE character_either const& time_delim(const spec& sp)
{
    static thread_local auto cache = make_cache([](const spec&) {
        return character_either("Tt ");
    });
    return cache.at(sp);
}

// Date, delimiter, local time.
TOML11_INLINE sequence const& local_datetime(const spec& sp)
{
    static thread_local auto cache = make_cache([](const spec& cfg) {
        const auto& date  = local_date(cfg);
        const auto& delim = time_delim(cfg);
        const auto& time  = local_time(cfg);
        return sequence(date, delim, time);
    });
    return cache.at(sp);
}

// Date, delimiter, time with offset.
TOML11_INLINE sequence const& offset_datetime(const spec& sp)
{
    static thread_local auto cache = make_cache([](const spec& cfg) {
        const auto& date  = local_date(cfg);
        const auto& delim = time_delim(cfg);
        const auto& time  = full_time(cfg);
        return sequence(date, delim, time);
    });
    return cache.at(sp);
}

// ===========================================================================
// String

// 'x' followed by exactly 2 hex digits (the part after the backslash).
TOML11_INLINE sequence const& escaped_x2(const spec& sp)
{
    static thread_local auto cache = make_cache([](const spec& cfg) {
        const auto hex_digits = repeat_exact(2, hexdig(cfg));
        return sequence(character('x'), hex_digits);
    });
    return cache.at(sp);
}

// 'u' followed by exactly 4 hex digits.
TOML11_INLINE sequence const& escaped_u4(const spec& sp)
{
    static thread_local auto cache = make_cache([](const spec& cfg) {
        const auto hex_digits = repeat_exact(4, hexdig(cfg));
        return sequence(character('u'), hex_digits);
    });
    return cache.at(sp);
}

// 'U' followed by exactly 8 hex digits.
TOML11_INLINE sequence const& escaped_U8(const spec& sp)
{
    static thread_local auto cache = make_cache([](const spec& cfg) {
        const auto hex_digits = repeat_exact(8, hexdig(cfg));
        return sequence(character('U'), hex_digits);
    });
    return cache.at(sp);
}

// A backslash escape. The single-character set gains 'e' when
// `v1_1_0_add_escape_sequence_e` is set, and the \xHH form is accepted only
// when `v1_1_0_add_escape_sequence_x` is set.
TOML11_INLINE sequence const& escaped(const spec& sp)
{
    static thread_local auto cache = make_cache([](const spec& cfg) {
        const auto backslash = character('\\');
        const auto single    = cfg.v1_1_0_add_escape_sequence_e
                             ? character_either("\"\\bfnrte")
                             : character_either("\"\\bfnrt");

        if(cfg.v1_1_0_add_escape_sequence_x)
        {
            return sequence(backslash, either(
                    single,
                    escaped_u4(cfg),
                    escaped_U8(cfg),
                    escaped_x2(cfg)
                ));
        }
        return sequence(backslash, either(
                single,
                escaped_u4(cfg),
                escaped_U8(cfg)
            ));
    });
    return cache.at(sp);
}

// One element of a basic string: an unescaped character or an escape.
TOML11_INLINE either const& basic_char(const spec& sp)
{
    static thread_local auto cache = make_cache([](const spec& cfg) {
        const auto unescaped = either(
            wschar(cfg),
            character(0x21),                     // 22 is "
            character_in_range(0x23, 0x5B),      // 5C is backslash
            character_in_range(0x5D, 0x7E),      // 7F is DEL
            non_ascii(cfg)
        );
        return either(unescaped, escaped(cfg));
    });
    return cache.at(sp);
}

// '"', zero or more basic characters, '"'.
TOML11_INLINE sequence const& basic_string(const spec& sp)
{
    static thread_local auto cache = make_cache([](const spec& cfg) {
        const auto quote = character('"');
        return sequence(quote, repeat_at_least(0, basic_char(cfg)), quote);
    });
    return cache.at(sp);
}

// ---------------------------------------------------------------------------
// multiline string

// Backslash, optional whitespace, a newline, then any run of whitespace
// characters and newlines.
TOML11_INLINE sequence const& escaped_newline(const spec& sp)
{
    static thread_local auto cache = make_cache([](const spec& cfg) {
        const auto blank_run =
            repeat_at_least(0, either(wschar(cfg), newline(cfg)));

        return sequence(character('\\'), ws(cfg), newline(cfg), blank_run);
    });
    return cache.at(sp);
}

// Multi-line basic string delimited by three double quotes. One or two
// quotes may appear inside the body as long as content follows them, and up
// to two extra quotes may directly follow the closing delimiter.
TOML11_INLINE sequence const& ml_basic_string(const spec& sp)
{
    static thread_local auto cache = make_cache([](const spec& cfg) {
        const auto delimiter = literal("\"\"\"");
        const auto content   =
            either(basic_char(cfg), newline(cfg), escaped_newline(cfg));
        const auto quotes    = either(literal("\"\""), character('\"'));

        return sequence(
            delimiter,
            maybe(newline(cfg)),
            repeat_at_least(0, content),
            repeat_at_least(0,
                sequence(quotes, repeat_at_least(1, content))),
            // XXX """ and mlb_quotes are intentionally reordered to avoid
            //     unexpected match of mlb_quotes
            delimiter,
            maybe(quotes)
        );
    });
    return cache.at(sp);
}

// ---------------------------------------------------------------------------
literal string TOML11_INLINE either const& literal_char(const spec& sp) { static thread_local auto cache = make_cache([](const spec& s) { return either( character (0x09), character_in_range(0x20, 0x26), character_in_range(0x28, 0x7E), non_ascii(s) ); }); return cache.at(sp); } TOML11_INLINE sequence const& literal_string(const spec& sp) { static thread_local auto cache = make_cache([](const spec& s) { return sequence( character('\''), repeat_at_least(0, literal_char(s)), character('\'') ); }); return cache.at(sp); } TOML11_INLINE sequence const& ml_literal_string(const spec& sp) { static thread_local auto cache = make_cache([](const spec& s) { const auto mll_quotes = []() { return either(literal("''"), character('\'')); }; const auto mll_content = [&s]() { return either(literal_char(s), newline(s)); }; return sequence( literal("'''"), maybe(newline(s)), repeat_at_least(0, mll_content()), repeat_at_least(0, sequence( mll_quotes(), repeat_at_least(1, mll_content()) ) ), literal("'''"), maybe(mll_quotes()) // XXX ''' and mll_quotes are intentionally reordered to avoid // unexpected match of mll_quotes ); }); return cache.at(sp); } TOML11_INLINE either const& string(const spec& sp) { static thread_local auto cache = make_cache([](const spec& s) { return either( ml_basic_string(s), ml_literal_string(s), basic_string(s), literal_string(s) ); }); return cache.at(sp); } // =========================================================================== // Keys // to keep `expected_chars` simple TOML11_INLINE non_ascii_key_char::non_ascii_key_char(const spec& s) noexcept { assert(s.v1_1_0_allow_non_english_in_bare_keys); (void)s; // for NDEBUG } TOML11_INLINE std::uint32_t non_ascii_key_char::read_utf8(location& loc) const { // U+0000 ... U+0079 ; 0xxx_xxxx // U+0080 ... U+07FF ; 110y_yyyx 10xx_xxxx; // U+0800 ... U+FFFF ; 1110_yyyy 10yx_xxxx 10xx_xxxx // U+010000 ... 
U+10FFFF; 1111_0yyy 10yy_xxxx 10xx_xxxx 10xx_xxxx const unsigned char b1 = loc.current(); loc.advance(1); if(b1 < 0x80) { return static_cast(b1); } else if((b1 >> 5) == 6) // 0b110 == 6 { const auto b2 = loc.current(); loc.advance(1); const std::uint32_t c1 = b1 & ((1 << 5) - 1); const std::uint32_t c2 = b2 & ((1 << 6) - 1); const std::uint32_t codep = (c1 << 6) + c2; if(codep < 0x80) { return 0xFFFFFFFF; } return codep; } else if((b1 >> 4) == 14) // 0b1110 == 14 { const auto b2 = loc.current(); loc.advance(1); if(loc.eof()) {return 0xFFFFFFFF;} const auto b3 = loc.current(); loc.advance(1); const std::uint32_t c1 = b1 & ((1 << 4) - 1); const std::uint32_t c2 = b2 & ((1 << 6) - 1); const std::uint32_t c3 = b3 & ((1 << 6) - 1); const std::uint32_t codep = (c1 << 12) + (c2 << 6) + c3; if(codep < 0x800) { return 0xFFFFFFFF; } return codep; } else if((b1 >> 3) == 30) // 0b11110 == 30 { const auto b2 = loc.current(); loc.advance(1); if(loc.eof()) {return 0xFFFFFFFF;} const auto b3 = loc.current(); loc.advance(1); if(loc.eof()) {return 0xFFFFFFFF;} const auto b4 = loc.current(); loc.advance(1); const std::uint32_t c1 = b1 & ((1 << 3) - 1); const std::uint32_t c2 = b2 & ((1 << 6) - 1); const std::uint32_t c3 = b3 & ((1 << 6) - 1); const std::uint32_t c4 = b4 & ((1 << 6) - 1); const std::uint32_t codep = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4; if(codep < 0x10000) { return 0xFFFFFFFF; } return codep; } else // not a Unicode codepoint in UTF-8 { return 0xFFFFFFFF; } } TOML11_INLINE region non_ascii_key_char::scan(location& loc) const { if(loc.eof()) {return region{};} const auto first = loc; const auto cp = read_utf8(loc); if(cp == 0xFFFFFFFF) { return region{}; } // ALPHA / DIGIT / %x2D / %x5F ; a-z A-Z 0-9 - _ // / %xB2 / %xB3 / %xB9 / %xBC-BE ; superscript digits, fractions // / %xC0-D6 / %xD8-F6 / %xF8-37D ; non-symbol chars in Latin block // / %x37F-1FFF ; exclude GREEK QUESTION MARK, which is basically a semi-colon // / %x200C-200D / %x203F-2040 ; from General 
Punctuation Block, include the two tie symbols and ZWNJ, ZWJ // / %x2070-218F / %x2460-24FF ; include super-/subscripts, letterlike/numberlike forms, enclosed alphanumerics // / %x2C00-2FEF / %x3001-D7FF ; skip arrows, math, box drawing etc, skip 2FF0-3000 ideographic up/down markers and spaces // / %xF900-FDCF / %xFDF0-FFFD ; skip D800-DFFF surrogate block, E000-F8FF Private Use area, FDD0-FDEF intended for process-internal use (unicode) // / %x10000-EFFFF ; all chars outside BMP range, excluding Private Use planes (F0000-10FFFF) if(cp == 0xB2 || cp == 0xB3 || cp == 0xB9 || (0xBC <= cp && cp <= 0xBE) || (0xC0 <= cp && cp <= 0xD6 ) || (0xD8 <= cp && cp <= 0xF6) || (0xF8 <= cp && cp <= 0x37D) || (0x37F <= cp && cp <= 0x1FFF) || (0x200C <= cp && cp <= 0x200D) || (0x203F <= cp && cp <= 0x2040) || (0x2070 <= cp && cp <= 0x218F) || (0x2460 <= cp && cp <= 0x24FF) || (0x2C00 <= cp && cp <= 0x2FEF) || (0x3001 <= cp && cp <= 0xD7FF) || (0xF900 <= cp && cp <= 0xFDCF) || (0xFDF0 <= cp && cp <= 0xFFFD) || (0x10000 <= cp && cp <= 0xEFFFF) ) { return region(first, loc); } loc = first; return region{}; } TOML11_INLINE repeat_at_least const& unquoted_key(const spec& sp) { static thread_local auto cache = make_cache([](const spec& s) { const auto keychar = [&s] { if(s.v1_1_0_allow_non_english_in_bare_keys) { return either(alpha(s), digit(s), character{0x2D}, character{0x5F}, non_ascii_key_char(s)); } else { return either(alpha(s), digit(s), character{0x2D}, character{0x5F}); } }; return repeat_at_least(1, keychar()); }); return cache.at(sp); } TOML11_INLINE either const& quoted_key(const spec& sp) { static thread_local auto cache = make_cache([](const spec& s) { return either(basic_string(s), literal_string(s)); }); return cache.at(sp); } TOML11_INLINE either const& simple_key(const spec& sp) { static thread_local auto cache = make_cache([](const spec& s) { return either(unquoted_key(s), quoted_key(s)); }); return cache.at(sp); } TOML11_INLINE sequence const& dot_sep(const spec& 
sp) { static thread_local auto cache = make_cache([](const spec& s) { return sequence(ws(s), character('.'), ws(s)); }); return cache.at(sp); } TOML11_INLINE sequence const& dotted_key(const spec& sp) { static thread_local auto cache = make_cache([](const spec& s) { return sequence( simple_key(s), repeat_at_least(1, sequence(dot_sep(s), simple_key(s))) ); }); return cache.at(sp); } TOML11_INLINE sequence const& keyval_sep(const spec& sp) { static thread_local auto cache = make_cache([](const spec& s) { return sequence(ws(s), character('='), ws(s)); }); return cache.at(sp); } // =========================================================================== // Table key TOML11_INLINE sequence const& std_table(const spec& sp) { static thread_local auto cache = make_cache([](const spec& s) { return sequence(character('['), ws(s), key(s), ws(s), character(']')); }); return cache.at(sp); } TOML11_INLINE sequence const& array_table(const spec& sp) { static thread_local auto cache = make_cache([](const spec& s) { return sequence(literal("[["), ws(s), key(s), ws(s), literal("]]")); }); return cache.at(sp); } // =========================================================================== // extension: null TOML11_INLINE literal const& null_value(const spec&) { static thread_local literal cache("null"); return cache; } } // namespace syntax } // namespace detail } // TOML11_INLINE_VERSION_NAMESPACE } // namespace toml #endif // TOML11_SYNTAX_IMPL_HPP