static bool read_0_9(SerdReader* reader, Ref str, bool at_least_one) { unsigned count = 0; for (uint8_t c; is_digit((c = peek_byte(reader))); ++count) { push_byte(reader, str, eat_byte_safe(reader, c)); } if (at_least_one && count == 0) { r_err(reader, SERD_ERR_BAD_SYNTAX, "expected digit\n"); } return count; }
// Read ECHAR escape, initial \ is already eaten by caller static inline bool read_ECHAR(SerdReader* reader, Ref dest, SerdNodeFlags* flags) { const uint8_t c = peek_byte(reader); switch (c) { case 't': eat_byte_safe(reader, 't'); push_byte(reader, dest, '\t'); return true; case 'b': eat_byte_safe(reader, 'b'); push_byte(reader, dest, '\b'); return true; case 'n': *flags |= SERD_HAS_NEWLINE; eat_byte_safe(reader, 'n'); push_byte(reader, dest, '\n'); return true; case 'r': *flags |= SERD_HAS_NEWLINE; eat_byte_safe(reader, 'r'); push_byte(reader, dest, '\r'); return true; case 'f': eat_byte_safe(reader, 'f'); push_byte(reader, dest, '\f'); return true; case '\\': case '"': case '\'': push_byte(reader, dest, eat_byte_safe(reader, c)); return true; default: return false; } }
static bool read_blank(SerdReader* reader, ReadContext ctx, bool subject, Ref* dest, bool* ate_dot) { const SerdStatementFlags old_flags = *ctx.flags; bool empty; switch (peek_byte(reader)) { case '_': return (*dest = read_BLANK_NODE_LABEL(reader, ate_dot)); case '[': eat_byte_safe(reader, '['); if ((empty = peek_delim(reader, ']'))) { *ctx.flags |= (subject) ? SERD_EMPTY_S : SERD_EMPTY_O; } else { *ctx.flags |= (subject) ? SERD_ANON_S_BEGIN : SERD_ANON_O_BEGIN; if (peek_delim(reader, '=')) { if (!(*dest = read_blankName(reader)) || !eat_delim(reader, ';')) { return false; } } } if (!*dest) { *dest = blank_id(reader); } if (ctx.subject) { TRY_RET(emit_statement(reader, ctx, *dest, 0, 0)); } ctx.subject = *dest; if (!empty) { *ctx.flags &= ~(SERD_LIST_CONT); if (!subject) { *ctx.flags |= SERD_ANON_CONT; } bool ate_dot_in_list = false; read_predicateObjectList(reader, ctx, &ate_dot_in_list); if (ate_dot_in_list) { return r_err(reader, SERD_ERR_BAD_SYNTAX, "`.' inside blank\n"); } read_ws_star(reader); if (reader->end_sink) { reader->end_sink(reader->handle, deref(reader, *dest)); } *ctx.flags = old_flags; } return (eat_byte_check(reader, ']') == ']'); case '(': return read_collection(reader, ctx, dest); default: return false; // never reached } }
static bool read_PERCENT(SerdReader* reader, Ref dest) { push_byte(reader, dest, eat_byte_safe(reader, '%')); const uint8_t h1 = read_HEX(reader); const uint8_t h2 = read_HEX(reader); if (h1 && h2) { push_byte(reader, dest, h1); push_byte(reader, dest, h2); return true; } return false; }
static SerdStatus read_PN_LOCAL(SerdReader* reader, Ref dest, bool* ate_dot) { uint8_t c = peek_byte(reader); SerdStatus st; switch (c) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case ':': case '_': push_byte(reader, dest, eat_byte_safe(reader, c)); break; default: if ((st = read_PLX(reader, dest)) > SERD_FAILURE) { return st; } else if (st != SERD_SUCCESS && !read_PN_CHARS_BASE(reader, dest)) { return SERD_FAILURE; } } while ((c = peek_byte(reader))) { // Middle: (PN_CHARS | '.' | ';')* if (c == '.' || c == ':') { push_byte(reader, dest, eat_byte_safe(reader, c)); } else if ((st = read_PLX(reader, dest)) > SERD_FAILURE) { return st; } else if (st != SERD_SUCCESS && !read_PN_CHARS(reader, dest)) { break; } } SerdNode* const n = deref(reader, dest); if (n->buf[n->n_bytes - 1] == '.') { // Ate trailing dot, pop it from stack/node and inform caller --n->n_bytes; serd_stack_pop(&reader->stack, 1); *ate_dot = true; } return SERD_SUCCESS; }
static inline SerdStatus bad_char(SerdReader* reader, Ref dest, const char* fmt, uint8_t c) { r_err(reader, SERD_ERR_BAD_SYNTAX, fmt, c); push_replacement(reader, dest); // Skip bytes until the next start byte for (uint8_t b = peek_byte(reader); (b & 0x80);) { eat_byte_safe(reader, b); b = peek_byte(reader); } return SERD_SUCCESS; }
static Ref read_blankName(SerdReader* reader) { eat_byte_safe(reader, '='); if (eat_byte_check(reader, '=') != '=') { return r_err(reader, SERD_ERR_BAD_SYNTAX, "expected `='\n"); } Ref subject = 0; bool ate_dot = false; read_ws_star(reader); read_iri(reader, &subject, &ate_dot); return subject; }
// STRING_LITERAL_LONG_QUOTE and STRING_LITERAL_LONG_SINGLE_QUOTE // Initial triple quotes are already eaten by caller static Ref read_STRING_LITERAL_LONG(SerdReader* reader, SerdNodeFlags* flags, uint8_t q) { Ref ref = push_node(reader, SERD_LITERAL, "", 0); while (true) { const uint8_t c = peek_byte(reader); uint32_t code; switch (c) { case '\\': eat_byte_safe(reader, c); if (!read_ECHAR(reader, ref, flags) && !read_UCHAR(reader, ref, &code)) { r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid escape `\\%c'\n", peek_byte(reader)); return pop_node(reader, ref); } break; default: if (c == q) { eat_byte_safe(reader, q); const uint8_t q2 = eat_byte_safe(reader, peek_byte(reader)); const uint8_t q3 = peek_byte(reader); if (q2 == q && q3 == q) { // End of string eat_byte_safe(reader, q3); return ref; } else { *flags |= SERD_HAS_QUOTE; push_byte(reader, ref, c); read_character(reader, ref, flags, q2); } } else { read_character(reader, ref, flags, eat_byte_safe(reader, c)); } } } return ref; }
static SerdStatus read_PLX(SerdReader* reader, Ref dest) { uint8_t c = peek_byte(reader); switch (c) { case '%': if (!read_PERCENT(reader, dest)) { return SERD_ERR_BAD_SYNTAX; } return SERD_SUCCESS; case '\\': eat_byte_safe(reader, c); if (is_alpha(c = peek_byte(reader))) { // Escapes like \u \n etc. are not supported return SERD_ERR_BAD_SYNTAX; } else { // Allow escaping of pretty much any other character push_byte(reader, dest, eat_byte_safe(reader, c)); return SERD_SUCCESS; } default: return SERD_FAILURE; } }
// [24] ws ::= #x9 | #xA | #xD | #x20 | comment static inline bool read_ws(SerdReader* reader) { const uint8_t c = peek_byte(reader); switch (c) { case 0x9: case 0xA: case 0xD: case 0x20: eat_byte_safe(reader, c); return true; case '#': read_comment(reader); return true; default: return false; } }
// Read the remainder of a PN_PREFIX after some initial characters static SerdStatus read_PN_PREFIX_tail(SerdReader* reader, Ref dest) { uint8_t c; while ((c = peek_byte(reader))) { // Middle: (PN_CHARS | '.')* if (c == '.') { push_byte(reader, dest, eat_byte_safe(reader, c)); } else if (!read_PN_CHARS(reader, dest)) { break; } } const SerdNode* const n = deref(reader, dest); if (n->buf[n->n_bytes - 1] == '.' && !read_PN_CHARS(reader, dest)) { r_err(reader, SERD_ERR_BAD_SYNTAX, "prefix ends with `.'\n"); return SERD_ERR_BAD_SYNTAX; } return SERD_SUCCESS; }
static bool read_number(SerdReader* reader, Ref* dest, Ref* datatype, bool* ate_dot) { #define XSD_DECIMAL NS_XSD "decimal" #define XSD_DOUBLE NS_XSD "double" #define XSD_INTEGER NS_XSD "integer" Ref ref = push_node(reader, SERD_LITERAL, "", 0); uint8_t c = peek_byte(reader); bool has_decimal = false; if (c == '-' || c == '+') { push_byte(reader, ref, eat_byte_safe(reader, c)); } if ((c = peek_byte(reader)) == '.') { has_decimal = true; // decimal case 2 (e.g. '.0' or `-.0' or `+.0') push_byte(reader, ref, eat_byte_safe(reader, c)); TRY_THROW(read_0_9(reader, ref, true)); } else { // all other cases ::= ( '-' | '+' ) [0-9]+ ( . )? ( [0-9]+ )? ... TRY_THROW(is_digit(c)); read_0_9(reader, ref, true); if ((c = peek_byte(reader)) == '.') { has_decimal = true; // Annoyingly, dot can be end of statement, so tentatively eat eat_byte_safe(reader, c); c = peek_byte(reader); if (!is_digit(c) && c != 'e' && c != 'E') { *dest = ref; *ate_dot = true; // Force caller to deal with stupid grammar return true; // Next byte is not a number character, done } push_byte(reader, ref, '.'); read_0_9(reader, ref, false); } } c = peek_byte(reader); if (c == 'e' || c == 'E') { // double push_byte(reader, ref, eat_byte_safe(reader, c)); switch ((c = peek_byte(reader))) { case '+': case '-': push_byte(reader, ref, eat_byte_safe(reader, c)); default: break; } TRY_THROW(read_0_9(reader, ref, true)); *datatype = push_node(reader, SERD_URI, XSD_DOUBLE, sizeof(XSD_DOUBLE) - 1); } else if (has_decimal) { *datatype = push_node(reader, SERD_URI, XSD_DECIMAL, sizeof(XSD_DECIMAL) - 1); } else { *datatype = push_node(reader, SERD_URI, XSD_INTEGER, sizeof(XSD_INTEGER) - 1); } *dest = ref; return true; except: pop_node(reader, *datatype); pop_node(reader, ref); return false; }
// Read UCHAR escape, initial \ is already eaten by caller static inline bool read_UCHAR(SerdReader* reader, Ref dest, uint32_t* char_code) { const uint8_t b = peek_byte(reader); unsigned length = 0; switch (b) { case 'U': length = 8; break; case 'u': length = 4; break; default: return false; } eat_byte_safe(reader, b); uint8_t buf[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; for (unsigned i = 0; i < length; ++i) { if (!(buf[i] = read_HEX(reader))) { return false; } } uint32_t code; sscanf((const char*)buf, "%X", &code); unsigned size = 0; if (code < 0x00000080) { size = 1; } else if (code < 0x00000800) { size = 2; } else if (code < 0x00010000) { size = 3; } else if (code < 0x00110000) { size = 4; } else { r_err(reader, SERD_ERR_BAD_SYNTAX, "unicode character 0x%X out of range\n", code); push_replacement(reader, dest); *char_code = 0xFFFD; return true; } // Build output in buf // (Note # of bytes = # of leading 1 bits in first byte) uint32_t c = code; switch (size) { case 4: buf[3] = 0x80 | (uint8_t)(c & 0x3F); c >>= 6; c |= (16 << 12); // set bit 4 case 3: buf[2] = 0x80 | (uint8_t)(c & 0x3F); c >>= 6; c |= (32 << 6); // set bit 5 case 2: buf[1] = 0x80 | (uint8_t)(c & 0x3F); c >>= 6; c |= 0xC0; // set bits 6 and 7 case 1: buf[0] = (uint8_t)c; } for (unsigned i = 0; i < size; ++i) { push_byte(reader, dest, buf[i]); } *char_code = code; return true; }