/* Print a human-readable rendering of a character class to stdout.
 *
 * A single-character class is printed as that byte's name.  Otherwise
 * the bit-vector is scanned for bytes 1..255 (byte 0 is never examined)
 * and every maximal run of consecutive set bytes is printed either as
 * "NAME" (run of one) or "NAME-NAME" (longer run); runs are separated
 * by a single space.  No trailing newline is written.
 */
static void
print_char_class (const struct CharClass *cclass)
{
  dsk_boolean need_separator = DSK_FALSE;
  unsigned run_start = 1;

  if (IS_SINGLE_CHAR_CLASS (cclass))
    {
      printf ("%s", dsk_ascii_byte_name (SINGLE_CHAR_CLASS_GET_CHAR (cclass)));
      return;
    }

  while (run_start < 256)
    {
      unsigned run_end;

      if (!CHAR_CLASS_BITVEC_IS_SET (cclass, run_start))
        {
          run_start++;
          continue;
        }

      /* extend the run as far as consecutive bytes are set */
      run_end = run_start;
      while ((run_end+1) < 256 && CHAR_CLASS_BITVEC_IS_SET (cclass, run_end+1))
        run_end++;

      /* every run but the first is preceded by a space */
      if (need_separator)
        printf (" ");
      need_separator = DSK_TRUE;

      if (run_start == run_end)
        printf ("%s", dsk_ascii_byte_name (run_start));
      else
        printf ("%s-%s",
                dsk_ascii_byte_name (run_start),
                dsk_ascii_byte_name (run_end));

      /* resume scanning just past the run */
      run_start = run_end + 1;
    }
}
static dsk_boolean dsk_hex_decoder_process (DskOctetFilter *filter, DskBuffer *out, unsigned in_length, const uint8_t *in_data, DskError **error) { DskHexDecoder *hexdec = (DskHexDecoder *) filter; DSK_UNUSED (error); while (in_length) { if (dsk_ascii_isxdigit (*in_data)) { if (hexdec->has_nibble) { dsk_buffer_append_byte (out, (hexdec->nibble << 4) | dsk_ascii_xdigit_value (*in_data)); hexdec->has_nibble = DSK_FALSE; } else { hexdec->nibble = dsk_ascii_xdigit_value (*in_data); hexdec->has_nibble = DSK_TRUE; } in_data++; in_length--; } else if (dsk_ascii_isspace (*in_data)) { in_data++; in_length--; } else { dsk_set_error (error, "bad character %s in hex-data", dsk_ascii_byte_name (*in_data)); return DSK_FALSE; } } return DSK_TRUE; }
/* --- lexing --- */ dsk_boolean dsk_json_parser_feed (DskJsonParser *parser, size_t n_bytes, const uint8_t *bytes, DskError **error) { while (n_bytes > 0) { switch (parser->lex_state) { case JSON_LEX_STATE_INIT: while (n_bytes > 0 && dsk_ascii_isspace (*bytes)) { if (*bytes == '\n') parser->line_no++; bytes++; n_bytes--; } if (n_bytes == 0) break; switch (*bytes) { case 't': case 'T': parser->lex_state = JSON_LEX_STATE_TRUE; parser->fixed_n_chars = 1; bytes++; n_bytes--; break; case 'f': case 'F': parser->lex_state = JSON_LEX_STATE_FALSE; parser->fixed_n_chars = 1; bytes++; n_bytes--; break; case 'n': case 'N': parser->lex_state = JSON_LEX_STATE_NULL; parser->fixed_n_chars = 1; bytes++; n_bytes--; break; case '"': parser->lex_state = JSON_LEX_STATE_IN_DQ; parser->str_len = 0; bytes++; n_bytes--; break; case '-': case '+': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': parser->lex_state = JSON_LEX_STATE_IN_NUMBER; parser->str_len = 0; append_to_string_buffer (parser, 1, bytes); bytes++; n_bytes--; break; #define WRITE_CHAR_TOKEN_CASE(character, SHORTNAME) \ case character: \ if (!handle_token (parser, JSON_TOKEN_##SHORTNAME, error)) \ return DSK_FALSE; \ n_bytes--; \ bytes++; \ break WRITE_CHAR_TOKEN_CASE('{', LBRACE); WRITE_CHAR_TOKEN_CASE('}', RBRACE); WRITE_CHAR_TOKEN_CASE('[', LBRACKET); WRITE_CHAR_TOKEN_CASE(']', RBRACKET); WRITE_CHAR_TOKEN_CASE(',', COMMA); WRITE_CHAR_TOKEN_CASE(':', COLON); #undef WRITE_CHAR_TOKEN_CASE case '\n': parser->line_no++; n_bytes--; bytes++; break; case '\t': case '\r': case ' ': n_bytes--; bytes++; break; default: dsk_set_error (error, "unexpected character %s in json (line %u)", dsk_ascii_byte_name (*bytes), parser->line_no); return DSK_FALSE; } break; #define WRITE_FIXED_BAREWORD_CASE(SHORTNAME, lc, UC, length) \ case JSON_LEX_STATE_##SHORTNAME: \ if (parser->fixed_n_chars == length) \ { \ /* are we at end of string? 
*/ \ if (dsk_ascii_isalnum (*bytes)) \ { \ dsk_set_error (error, \ "got %s after '%s' (line %u)", \ dsk_ascii_byte_name (*bytes), lc, \ parser->line_no); \ return DSK_FALSE; \ } \ else \ { \ parser->lex_state = JSON_LEX_STATE_INIT; \ if (!handle_token (parser, JSON_TOKEN_##SHORTNAME, \ error)) \ return DSK_FALSE; \ } \ } \ else if (*bytes == lc[parser->fixed_n_chars] \ || *bytes == UC[parser->fixed_n_chars]) \ { \ parser->fixed_n_chars += 1; \ n_bytes--; \ bytes++; \ } \ else \ { \ dsk_set_error (error, \ "unexpected character %s (parsing %s) (line %u)", \ dsk_ascii_byte_name (*bytes), UC, parser->line_no); \ return DSK_FALSE; \ } \ break; WRITE_FIXED_BAREWORD_CASE(TRUE, "true", "TRUE", 4); WRITE_FIXED_BAREWORD_CASE(FALSE, "false", "FALSE", 5); WRITE_FIXED_BAREWORD_CASE(NULL, "null", "NULL", 4); #undef WRITE_FIXED_BAREWORD_CASE case JSON_LEX_STATE_IN_DQ: if (*bytes == '"') { // TODO ASSERT utf16_surrogate == 0 if (!handle_token (parser, JSON_TOKEN_STRING, error)) return DSK_FALSE; bytes++; n_bytes--; parser->lex_state = JSON_LEX_STATE_INIT; } else if (*bytes == '\\') { n_bytes--; bytes++; parser->bs_sequence_len = 0; parser->lex_state = JSON_LEX_STATE_IN_DQ_BS; } else { // TODO ASSERT utf16_surrogate == 0 unsigned i; if (*bytes == '\n') parser->line_no++; for (i = 1; i < n_bytes; i++) if (bytes[i] == '"' || bytes[i] == '\\') break; else if (bytes[i] == '\n') parser->line_no++; append_to_string_buffer (parser, i, bytes); n_bytes -= i; bytes += i; } break; case JSON_LEX_STATE_IN_DQ_BS: if (parser->bs_sequence_len == 0) { switch (*bytes) { #define WRITE_BS_CHAR_CASE(bschar, cchar) \ case bschar: \ /* TODO ASSERT utf16_surrogate == 0 */ \ append_char_to_string_buffer (parser, cchar); \ bytes++; \ n_bytes--; \ parser->lex_state = JSON_LEX_STATE_IN_DQ; \ break WRITE_BS_CHAR_CASE('b', '\b'); WRITE_BS_CHAR_CASE('f', '\f'); WRITE_BS_CHAR_CASE('n', '\n'); WRITE_BS_CHAR_CASE('r', '\r'); WRITE_BS_CHAR_CASE('t', '\t'); WRITE_BS_CHAR_CASE('/', '/'); WRITE_BS_CHAR_CASE('"', '"'); 
WRITE_BS_CHAR_CASE('\\', '\\'); #undef WRITE_BS_CHAR_CASE case 'u': parser->bs_sequence[parser->bs_sequence_len++] = *bytes++; n_bytes--; break; default: dsk_set_error (error, "invalid character %s after '\\' (line %u)", dsk_ascii_byte_name (*bytes), parser->line_no); return DSK_FALSE; } } else { /* must be \uxxxx (the only multi-character \ sequence) */ if (!dsk_ascii_isxdigit (*bytes)) { dsk_set_error (error, "expected 4 hex digits after \\u, got %s (line %u)", dsk_ascii_byte_name (*bytes), parser->line_no); return DSK_FALSE; } parser->bs_sequence[parser->bs_sequence_len++] = *bytes++; n_bytes--; if (parser->bs_sequence_len == 5) { char utf8buf[8]; unsigned value; parser->bs_sequence[5] = 0; value = strtoul (parser->bs_sequence + 1, NULL, 16); if (DSK_UTF16_LO_SURROGATE_START <= value && value <= DSK_UTF16_LO_SURROGATE_END) { if (parser->utf16_surrogate == 0) { dsk_set_error (error, "low (second) half of surrogate pair was encountered without high-half, line %u", parser->line_no); return DSK_FALSE; } uint32_t code = dsk_utf16_surrogate_pair_to_codepoint (parser->utf16_surrogate, value); append_to_string_buffer (parser, dsk_utf8_encode_unichar (utf8buf, code), (const uint8_t *) utf8buf); parser->utf16_surrogate = 0; } else if (DSK_UTF16_HI_SURROGATE_START <= value && value <= DSK_UTF16_HI_SURROGATE_END) { if (parser->utf16_surrogate != 0) { dsk_set_error (error, "got two first-half surrogate pairs (UTF16 surrogate \\u%04u was followed by \\%04u), line %u", parser->utf16_surrogate, value, parser->line_no); return DSK_FALSE; } parser->utf16_surrogate = value; } else { if (parser->utf16_surrogate != 0) { dsk_set_error (error, "second half of UTF16 surrogate \\u%04u was not preceded by utf16, line %u", parser->utf16_surrogate, parser->line_no); return DSK_FALSE; } append_to_string_buffer (parser, dsk_utf8_encode_unichar (utf8buf, value), (const uint8_t *) utf8buf); parser->utf16_surrogate = 0; } parser->lex_state = JSON_LEX_STATE_IN_DQ; } #if 0 else { dsk_set_error 
(error, "internal error: expected 4 hex digits (line %u)", parser->line_no); return DSK_FALSE; } #endif } break; case JSON_LEX_STATE_IN_NUMBER: if (dsk_ascii_isdigit (*bytes) || *bytes == '.' || *bytes == 'e' || *bytes == 'E' || *bytes == '+' || *bytes == '-') { append_to_string_buffer (parser, 1, bytes); bytes++; n_bytes--; } else { /* append the number token */ if (!handle_token (parser, JSON_TOKEN_NUMBER, error)) return DSK_FALSE; /* go back to init state (do not consume character) */ parser->lex_state = JSON_LEX_STATE_INIT; } break; default: dsk_error ("unhandled lex state %u", parser->lex_state); } } return DSK_TRUE; }
static dsk_boolean tokenize (const char *regex, struct Token **token_list_out, DskMemPool *pool, DskError **error) { struct Token *last = NULL; *token_list_out = NULL; while (*regex) { struct Token *t = dsk_mem_pool_alloc (pool, sizeof (struct Token)); switch (*regex) { case '*': t->type = TOKEN_STAR; regex++; break; case '+': t->type = TOKEN_PLUS; regex++; break; case '?': t->type = TOKEN_QUESTION_MARK; regex++; break; case '(': t->type = TOKEN_LPAREN; regex++; break; case ')': t->type = TOKEN_RPAREN; regex++; break; case '|': t->type = TOKEN_ALTER; regex++; break; case '[': { struct CharClass *cclass; /* parse character class */ regex++; cclass = parse_character_class (®ex, pool, error); if (cclass == NULL || *regex != ']') return DSK_FALSE; regex++; t->type = TOKEN_PATTERN; t->pattern = dsk_mem_pool_alloc (pool, sizeof (struct Pattern)); t->pattern->type = PATTERN_LITERAL; t->pattern->info.literal = cclass; break; } case '\\': { /* parse either char class or special literal */ struct CharClass *cclass; regex++; if (get_backslash_char_class (®ex, &cclass)) { t->type = TOKEN_PATTERN; t->pattern = dsk_mem_pool_alloc (pool, sizeof (struct Pattern)); t->pattern->type = PATTERN_LITERAL; t->pattern->info.literal = cclass; } else { if (regex[1] == 0) dsk_set_error (error, "unexpected backslash sequence in regex"); else dsk_set_error (error, "bad char %s after backslash", dsk_ascii_byte_name (regex[1])); return DSK_FALSE; } break; } case '.': t->type = TOKEN_PATTERN; t->pattern = dsk_mem_pool_alloc (pool, sizeof (struct Pattern)); t->pattern->type = PATTERN_LITERAL; t->pattern->info.literal = &char_class_dot; regex++; break; default: /* character literal */ t->type = TOKEN_PATTERN; t->pattern = dsk_mem_pool_alloc (pool, sizeof (struct Pattern)); t->pattern->type = PATTERN_LITERAL; t->pattern->info.literal = MK_LITERAL_CHAR_CLASS (regex[0]); regex++; break; } /* append to list */ t->prev = last; t->next = NULL; if (last) last->next = t; else *token_list_out = last = t; 
last = t; } return DSK_TRUE; }
/* Parse a [] character class expression */ static struct CharClass * parse_character_class (const char **p_regex, DskMemPool *pool, DskError **error) { const char *at = *p_regex; dsk_boolean reverse = DSK_FALSE; struct CharClass *out = dsk_mem_pool_alloc0 (pool, sizeof (struct CharClass)); if (*at == '^') { reverse = DSK_TRUE; at++; } while (*at != 0 && *at != ']') { /* this muck is structured annoyingly: we just to the label got_range_start_and_dash whenever we encounter a '-' after a single character (either literally or as a backslash sequence), to handle range expressions. */ unsigned first_value; if (*at == '\\') { struct CharClass *sub; at++; if (!get_backslash_char_class (&at, &sub)) { *p_regex = at; /* for error reporting (maybe?) */ dsk_set_error (error, "bad \\ expression (at %s)", dsk_ascii_byte_name (*at)); return NULL; } if (IS_SINGLE_CHAR_CLASS (sub) && *at == '-') { first_value = SINGLE_CHAR_CLASS_GET_CHAR (sub); at++; goto got_range_start_and_dash; } char_class_union_inplace (out, sub); } else if (at[1] == '-') { first_value = *at; at += 2; goto got_range_start_and_dash; } else { /* single character */ CHAR_CLASS_BITVEC_SET (out, *at); at++; } continue; got_range_start_and_dash: { unsigned last_value; unsigned code; if (*at == '\\') { struct CharClass *sub; const char *start; at++; start = at; if (!get_backslash_char_class (&at, &sub)) { *p_regex = at; /* for error reporting (maybe?) 
*/ dsk_set_error (error, "bad \\ expression (at %s)", dsk_ascii_byte_name (*at)); return NULL; } if (!IS_SINGLE_CHAR_CLASS (sub)) { dsk_set_error (error, "non-single-byte \\%c encountered - cannot use in range", *start); return NULL; } last_value = SINGLE_CHAR_CLASS_GET_CHAR (sub); } else if (*at == ']') { /* syntax error */ dsk_set_error (error, "unterminated character class range"); return NULL; } else { last_value = *at; at++; } if (first_value > last_value) { dsk_set_error (error, "character range is not first<last (first=%s, last=%s)", dsk_ascii_byte_name (first_value), dsk_ascii_byte_name (last_value)); return NULL; } for (code = first_value; code <= last_value; code++) CHAR_CLASS_BITVEC_SET (out, code); } } *p_regex = at; if (reverse) char_class_reverse_inplace (out); return out; }