Esempio n. 1
0
/* Write a human-readable description of CCLASS to stdout.
 *
 * A single-character class is printed as that byte's name.  Any other
 * class is printed as a space-separated list of the byte values that are
 * set, with each run of consecutive set bytes collapsed to "first-last".
 * The scan covers byte values 1..255; byte 0 is never examined.
 * No trailing newline is written.
 */
static void
print_char_class (const struct CharClass *cclass)
{
  unsigned n_runs_printed = 0;
  unsigned run_start = 1;

  if (IS_SINGLE_CHAR_CLASS (cclass))
    {
      printf ("%s", dsk_ascii_byte_name (SINGLE_CHAR_CLASS_GET_CHAR (cclass)));
      return;
    }

  while (run_start < 256)
    {
      unsigned run_end;

      if (!(CHAR_CLASS_BITVEC_IS_SET (cclass, run_start)))
        {
          run_start++;
          continue;
        }

      /* extend the run across every directly following set byte */
      run_end = run_start;
      while ((run_end + 1) < 256
          && CHAR_CLASS_BITVEC_IS_SET (cclass, run_end + 1))
        run_end++;

      /* runs after the first are separated by a single space */
      if (n_runs_printed > 0)
        printf (" ");
      n_runs_printed++;

      if (run_start != run_end)
        printf ("%s-%s",
                dsk_ascii_byte_name (run_start),
                dsk_ascii_byte_name (run_end));
      else
        printf ("%s", dsk_ascii_byte_name (run_start));

      /* resume just past the run */
      run_start = run_end + 1;
    }
}
Esempio n. 2
0
/* Octet-filter callback: decode hexadecimal text into raw bytes.
 *
 * filter     the filter instance; it is accessed as a DskHexDecoder
 * out        buffer that receives one byte per pair of hex digits
 * in_length  number of bytes available at in_data
 * in_data    the hex text to decode
 * error      set when a byte is neither a hex digit nor whitespace
 *
 * Whitespace is skipped.  A lone leading digit is remembered in the
 * decoder (nibble / has_nibble), so a digit pair may be split across two
 * calls.  Returns DSK_TRUE once all input is consumed, or DSK_FALSE
 * (with *error set) at the first invalid byte; bytes decoded before the
 * invalid one have already been appended to OUT.
 */
static dsk_boolean
dsk_hex_decoder_process (DskOctetFilter *filter,
                            DskBuffer      *out,
                            unsigned        in_length,
                            const uint8_t  *in_data,
                            DskError      **error)
{
  DskHexDecoder *hexdec = (DskHexDecoder *) filter;
  unsigned remaining;
  /* NOTE(review): error IS used on the bad-character path below; this
     marker is kept only to leave the original statements untouched. */
  DSK_UNUSED (error);

  for (remaining = in_length; remaining != 0; remaining--, in_data++)
    {
      if (dsk_ascii_isxdigit (*in_data))
        {
          if (!hexdec->has_nibble)
            {
              /* first digit of a pair: stash it until its partner shows up */
              hexdec->nibble = dsk_ascii_xdigit_value (*in_data);
              hexdec->has_nibble = DSK_TRUE;
            }
          else
            {
              /* second digit: combine with the stashed high nibble and emit */
              dsk_buffer_append_byte (out,
                                      (hexdec->nibble << 4)
                                      | dsk_ascii_xdigit_value (*in_data));
              hexdec->has_nibble = DSK_FALSE;
            }
          continue;
        }

      if (dsk_ascii_isspace (*in_data))
        continue;

      dsk_set_error (error, "bad character %s in hex-data",
                     dsk_ascii_byte_name (*in_data));
      return DSK_FALSE;
    }
  return DSK_TRUE;
}
Esempio n. 3
0
/* --- lexing --- */
/* Fail if a high (first-half) UTF-16 surrogate from an earlier \uXXXX
 * escape is still waiting for its low half.
 *
 * Returns DSK_TRUE when no surrogate is pending; otherwise sets *error
 * and returns DSK_FALSE.
 */
static dsk_boolean
json_lex_require_no_pending_surrogate (DskJsonParser *parser,
                                       DskError     **error)
{
  if (parser->utf16_surrogate == 0)
    return DSK_TRUE;
  dsk_set_error (error,
                 "high UTF16 surrogate \\u%04x was not followed by a low surrogate, line %u",
                 (unsigned) parser->utf16_surrogate, parser->line_no);
  return DSK_FALSE;
}

/* Feed a chunk of JSON text to the incremental lexer.
 *
 * parser   lexer state; it persists between calls, so a token may be
 *          split across chunk boundaries
 * n_bytes  number of bytes available at BYTES
 * bytes    the input chunk
 * error    set on the first lexical error, or on a handle_token() failure
 *
 * Each completed token is passed to handle_token().  Barewords
 * (true/false/null, matched case-insensitively) and numbers are only
 * emitted once the byte FOLLOWING them has been seen, so a bareword or
 * number at the very end of the input is still pending when this
 * function returns.
 *
 * String contents are accumulated in the parser's string buffer;
 * \uXXXX escapes are converted to UTF-8, combining a high/low surrogate
 * pair into a single code point.  An unpaired surrogate is an error.
 *
 * Returns DSK_TRUE if the whole chunk was consumed, DSK_FALSE on error.
 */
dsk_boolean
dsk_json_parser_feed     (DskJsonParser *parser,
                          size_t         n_bytes,
                          const uint8_t *bytes,
                          DskError     **error)
{
  while (n_bytes > 0)
    {
      switch (parser->lex_state)
        {
        case JSON_LEX_STATE_INIT:
          /* between tokens: skip whitespace, counting lines */
          while (n_bytes > 0 && dsk_ascii_isspace (*bytes))
            {
              if (*bytes == '\n')
                parser->line_no++;
              bytes++;
              n_bytes--;
            }
          if (n_bytes == 0)
            break;
          switch (*bytes)
            {
            /* barewords: the first letter is consumed here; the remaining
               letters are matched one at a time in the bareword states */
            case 't': case 'T':
              parser->lex_state = JSON_LEX_STATE_TRUE;
              parser->fixed_n_chars = 1;
              bytes++;
              n_bytes--;
              break;
            case 'f': case 'F':
              parser->lex_state = JSON_LEX_STATE_FALSE;
              parser->fixed_n_chars = 1;
              bytes++;
              n_bytes--;
              break;
            case 'n': case 'N':
              parser->lex_state = JSON_LEX_STATE_NULL;
              parser->fixed_n_chars = 1;
              bytes++;
              n_bytes--;
              break;
            case '"':
              parser->lex_state = JSON_LEX_STATE_IN_DQ;
              parser->str_len = 0;
              bytes++;
              n_bytes--;
              break;
            case '-': case '+':
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7': case '8': case '9':
              /* the number's text is collected in the string buffer */
              parser->lex_state = JSON_LEX_STATE_IN_NUMBER;
              parser->str_len = 0;
              append_to_string_buffer (parser, 1, bytes);
              bytes++;
              n_bytes--;
              break;

            /* single-character punctuation tokens */
#define WRITE_CHAR_TOKEN_CASE(character, SHORTNAME) \
            case character: \
              if (!handle_token (parser, JSON_TOKEN_##SHORTNAME, error)) \
                return DSK_FALSE; \
              n_bytes--; \
              bytes++; \
              break
            WRITE_CHAR_TOKEN_CASE('{', LBRACE);
            WRITE_CHAR_TOKEN_CASE('}', RBRACE);
            WRITE_CHAR_TOKEN_CASE('[', LBRACKET);
            WRITE_CHAR_TOKEN_CASE(']', RBRACKET);
            WRITE_CHAR_TOKEN_CASE(',', COMMA);
            WRITE_CHAR_TOKEN_CASE(':', COLON);
#undef WRITE_CHAR_TOKEN_CASE

            case '\n':
              parser->line_no++;
              n_bytes--;
              bytes++;
              break;
            case '\t': case '\r': case ' ':
              n_bytes--;
              bytes++;
              break;
            default:
              dsk_set_error (error,
                             "unexpected character %s in json (line %u)",
                             dsk_ascii_byte_name (*bytes), parser->line_no);
              return DSK_FALSE;
            }
          break;

        /* Bareword states: fixed_n_chars counts the letters matched so far.
           Once the whole word is matched, the next byte decides: an
           alphanumeric byte is an error, anything else ends the token and
           is left unconsumed for the INIT state. */
#define WRITE_FIXED_BAREWORD_CASE(SHORTNAME, lc, UC, length) \
        case JSON_LEX_STATE_##SHORTNAME: \
          if (parser->fixed_n_chars == length) \
            { \
              /* are we at end of string? */ \
              if (dsk_ascii_isalnum (*bytes)) \
                { \
                  dsk_set_error (error,  \
                                 "got %s after '%s' (line %u)", \
                                 dsk_ascii_byte_name (*bytes), lc, \
                                 parser->line_no); \
                  return DSK_FALSE; \
                } \
              else \
                { \
                  parser->lex_state = JSON_LEX_STATE_INIT; \
                  if (!handle_token (parser, JSON_TOKEN_##SHORTNAME, \
                                     error)) \
                    return DSK_FALSE; \
                } \
            } \
          else if (*bytes == lc[parser->fixed_n_chars] \
                || *bytes == UC[parser->fixed_n_chars]) \
            { \
              parser->fixed_n_chars += 1; \
              n_bytes--; \
              bytes++; \
            } \
          else \
            { \
              dsk_set_error (error, \
                           "unexpected character %s (parsing %s) (line %u)", \
                           dsk_ascii_byte_name (*bytes), UC, parser->line_no); \
              return DSK_FALSE; \
            } \
          break;
        WRITE_FIXED_BAREWORD_CASE(TRUE, "true", "TRUE", 4);
        WRITE_FIXED_BAREWORD_CASE(FALSE, "false", "FALSE", 5);
        WRITE_FIXED_BAREWORD_CASE(NULL, "null", "NULL", 4);
#undef WRITE_FIXED_BAREWORD_CASE

        case JSON_LEX_STATE_IN_DQ:
          if (*bytes == '"')
            {
              /* a string must not end on an unpaired high surrogate;
                 otherwise it would leak into the next string */
              if (!json_lex_require_no_pending_surrogate (parser, error))
                return DSK_FALSE;
              if (!handle_token (parser, JSON_TOKEN_STRING, error))
                return DSK_FALSE;
              bytes++;
              n_bytes--;
              parser->lex_state = JSON_LEX_STATE_INIT;
            }
          else if (*bytes == '\\')
            {
              n_bytes--;
              bytes++;
              parser->bs_sequence_len = 0;
              parser->lex_state = JSON_LEX_STATE_IN_DQ_BS;
            }
          else
            {
              unsigned i;
              /* a literal byte cannot complete a surrogate pair */
              if (!json_lex_require_no_pending_surrogate (parser, error))
                return DSK_FALSE;
              if (*bytes == '\n')
                parser->line_no++;
              /* take the whole run of plain bytes up to the next quote or
                 backslash (or the end of this chunk) in one append */
              for (i = 1; i < n_bytes; i++)
                if (bytes[i] == '"' || bytes[i] == '\\')
                  break;
                else if (bytes[i] == '\n')
                  parser->line_no++;
              append_to_string_buffer (parser, i, bytes);
              n_bytes -= i;
              bytes += i;
            }
          break;
        case JSON_LEX_STATE_IN_DQ_BS:
          if (parser->bs_sequence_len == 0)
            {
              /* first byte after the backslash selects the escape */
              switch (*bytes)
                {
#define WRITE_BS_CHAR_CASE(bschar, cchar) \
                case bschar: \
                  /* a simple escape cannot complete a surrogate pair */ \
                  if (!json_lex_require_no_pending_surrogate (parser, error)) \
                    return DSK_FALSE; \
                  append_char_to_string_buffer (parser, cchar); \
                  bytes++; \
                  n_bytes--; \
                  parser->lex_state = JSON_LEX_STATE_IN_DQ; \
                  break
                WRITE_BS_CHAR_CASE('b', '\b');
                WRITE_BS_CHAR_CASE('f', '\f');
                WRITE_BS_CHAR_CASE('n', '\n');
                WRITE_BS_CHAR_CASE('r', '\r');
                WRITE_BS_CHAR_CASE('t', '\t');
                WRITE_BS_CHAR_CASE('/', '/');
                WRITE_BS_CHAR_CASE('"', '"');
                WRITE_BS_CHAR_CASE('\\', '\\');
#undef WRITE_BS_CHAR_CASE
                case 'u':
                  /* remember the 'u'; the four hex digits follow */
                  parser->bs_sequence[parser->bs_sequence_len++] = *bytes++;
                  n_bytes--;
                  break;
                default:
                  dsk_set_error (error,
                               "invalid character %s after '\\' (line %u)",
                               dsk_ascii_byte_name (*bytes), parser->line_no);
                  return DSK_FALSE;
                }
            }
          else
            {
              /* must be \uxxxx (the only multi-character \ sequence) */
              if (!dsk_ascii_isxdigit (*bytes))
                {
                  dsk_set_error (error,
                               "expected 4 hex digits after \\u, got %s (line %u)",
                               dsk_ascii_byte_name (*bytes), parser->line_no);
                  return DSK_FALSE;
                }
              parser->bs_sequence[parser->bs_sequence_len++] = *bytes++;
              n_bytes--;
              /* 'u' plus four hex digits collected: decode the escape */
              if (parser->bs_sequence_len == 5)
                {
                  char utf8buf[8];
                  unsigned value;
                  parser->bs_sequence[5] = 0;
                  value = strtoul (parser->bs_sequence + 1, NULL, 16);
                  if (DSK_UTF16_LO_SURROGATE_START <= value
                   && value <= DSK_UTF16_LO_SURROGATE_END)
                    {
                      /* low half: needs the high half saved earlier */
                      if (parser->utf16_surrogate == 0)
                        {
                          dsk_set_error (error,
                                       "low (second) half of surrogate pair was encountered without high-half, line %u",
                                       parser->line_no);
                          return DSK_FALSE;
                        }
                      uint32_t code = dsk_utf16_surrogate_pair_to_codepoint (parser->utf16_surrogate, value);
                      append_to_string_buffer (parser,
                                               dsk_utf8_encode_unichar (utf8buf, code),
                                               (const uint8_t *) utf8buf);
                      parser->utf16_surrogate = 0;
                    }
                  else if (DSK_UTF16_HI_SURROGATE_START <= value
                        && value <= DSK_UTF16_HI_SURROGATE_END)
                    {
                      /* high half: save it and wait for the low half */
                      if (parser->utf16_surrogate != 0)
                        {
                          dsk_set_error (error,
                                       "got two first-half surrogate pairs (UTF16 surrogate \\u%04x was followed by \\u%04x), line %u",
                                       (unsigned) parser->utf16_surrogate, value, parser->line_no);
                          return DSK_FALSE;
                        }
                      parser->utf16_surrogate = value;
                    }
                  else
                    {
                      /* ordinary code point: must not follow a high half */
                      if (!json_lex_require_no_pending_surrogate (parser, error))
                        return DSK_FALSE;
                      append_to_string_buffer (parser,
                                               dsk_utf8_encode_unichar (utf8buf, value),
                                               (const uint8_t *) utf8buf);
                      parser->utf16_surrogate = 0;
                    }
                  parser->lex_state = JSON_LEX_STATE_IN_DQ;
                }
            }
          break;
        case JSON_LEX_STATE_IN_NUMBER:
          /* collect anything that can appear in a number; the collected
             text is not validated here */
          if (dsk_ascii_isdigit (*bytes)
           || *bytes == '.'
           || *bytes == 'e'
           || *bytes == 'E'
           || *bytes == '+'
           || *bytes == '-')
            {
              append_to_string_buffer (parser, 1, bytes);
              bytes++;
              n_bytes--;
            }
          else
            {
              /* append the number token */
              if (!handle_token (parser, JSON_TOKEN_NUMBER, error))
                return DSK_FALSE;

              /* go back to init state (do not consume character) */
              parser->lex_state = JSON_LEX_STATE_INIT;
            }
          break;
        default:
          dsk_error ("unhandled lex state %u", parser->lex_state);
        }
    }
  return DSK_TRUE;
}
Esempio n. 4
0
/* Split a regex string into a doubly-linked list of tokens.
 *
 * regex           NUL-terminated regular expression
 * token_list_out  receives the first token, or NULL for an empty regex
 * pool            memory pool that all tokens and patterns are taken from
 * error           set when the regex is malformed
 *
 * The operators * + ? ( ) | each become a token of their own.  A [...]
 * class, a backslash sequence, '.' and any other single character become
 * a TOKEN_PATTERN wrapping a PATTERN_LITERAL character class.
 *
 * Returns DSK_TRUE on success.  On failure returns DSK_FALSE with *error
 * set; *token_list_out may then point at a partial list.
 */
static dsk_boolean
tokenize (const char   *regex,
          struct Token **token_list_out,
          DskMemPool   *pool,
          DskError    **error)
{
  struct Token *last = NULL;
  *token_list_out = NULL;
  while (*regex)
    {
      struct Token *t = dsk_mem_pool_alloc (pool, sizeof (struct Token));
      switch (*regex)
        {
        case '*':
          t->type = TOKEN_STAR;
          regex++;
          break;
        case '+':
          t->type = TOKEN_PLUS;
          regex++;
          break;
        case '?':
          t->type = TOKEN_QUESTION_MARK;
          regex++;
          break;
        case '(':
          t->type = TOKEN_LPAREN;
          regex++;
          break;
        case ')':
          t->type = TOKEN_RPAREN;
          regex++;
          break;
        case '|':
          t->type = TOKEN_ALTER;
          regex++;
          break;
        case '[':
          {
            struct CharClass *cclass;
            /* parse character class */
            regex++;
            cclass = parse_character_class (&regex, pool, error);
            if (cclass == NULL)
              return DSK_FALSE;         /* error already set by the parser */
            if (*regex != ']')
              {
                /* the class parser stopped at end-of-string without
                   finding ']': report it rather than failing silently */
                dsk_set_error (error, "unterminated character class in regex");
                return DSK_FALSE;
              }
            regex++;
            t->type = TOKEN_PATTERN;
            t->pattern = dsk_mem_pool_alloc (pool, sizeof (struct Pattern));
            t->pattern->type = PATTERN_LITERAL;
            t->pattern->info.literal = cclass;
            break;
          }
        case '\\':
          {
            /* parse either char class or special literal */
            struct CharClass *cclass;
            regex++;
            if (get_backslash_char_class (&regex, &cclass))
              {
                t->type = TOKEN_PATTERN;
                t->pattern = dsk_mem_pool_alloc (pool, sizeof (struct Pattern));
                t->pattern->type = PATTERN_LITERAL;
                t->pattern->info.literal = cclass;
              }
            else
              {
                /* regex already points just past the backslash, so the
                   offending character is *regex.  (Looking at regex[1]
                   would read beyond the terminator for a trailing '\'.)
                   NOTE(review): assumes get_backslash_char_class leaves
                   the pointer on the offending byte when it fails. */
                if (*regex == 0)
                  dsk_set_error (error, "unexpected backslash sequence in regex");
                else
                  dsk_set_error (error, "bad char %s after backslash",
                                 dsk_ascii_byte_name ((unsigned char) *regex));
                return DSK_FALSE;
              }
            break;
          }
        case '.':
          t->type = TOKEN_PATTERN;
          t->pattern = dsk_mem_pool_alloc (pool, sizeof (struct Pattern));
          t->pattern->type = PATTERN_LITERAL;
          t->pattern->info.literal = &char_class_dot;
          regex++;
          break;
        default:
          /* character literal; go through unsigned char so that bytes
             >= 0x80 are not sign-extended where plain char is signed */
          t->type = TOKEN_PATTERN;
          t->pattern = dsk_mem_pool_alloc (pool, sizeof (struct Pattern));
          t->pattern->type = PATTERN_LITERAL;
          t->pattern->info.literal = MK_LITERAL_CHAR_CLASS ((unsigned char) regex[0]);
          regex++;
          break;
        }

      /* append to list */
      t->prev = last;
      t->next = NULL;
      if (last)
        last->next = t;
      else
        *token_list_out = t;
      last = t;
    }
  return DSK_TRUE;
}
Esempio n. 5
0
/* Parse the inside of a [] character class expression.
 *
 * p_regex  in: points just past the opening '['.
 *          out (success): points at the byte that ended the scan, which
 *          is either the closing ']' or the string's NUL terminator; the
 *          caller is expected to check which.
 * pool     memory pool the resulting class is allocated from
 * error    set when the expression is malformed
 *
 * A leading '^' inverts the class.  Elements are single bytes, backslash
 * sequences, and ranges "a-z" whose endpoints are single bytes (given
 * literally or as a single-byte backslash sequence).
 *
 * Returns the new class, or NULL with *error set.
 */
static struct CharClass *
parse_character_class (const char **p_regex,
                       DskMemPool  *pool,
                       DskError   **error)
{
  const char *at = *p_regex;
  dsk_boolean reverse = DSK_FALSE;
  struct CharClass *out = dsk_mem_pool_alloc0 (pool, sizeof (struct CharClass));
  if (*at == '^')
    {
      reverse = DSK_TRUE;
      at++;
    }
  while (*at != 0 && *at != ']')
    {
      /* this muck is structured annoyingly:  we jump to the label
         got_range_start_and_dash whenever we encounter a '-' after
         a single character (either literally or as a backslash sequence),
         to handle range expressions. */
      unsigned first_value;

      if (*at == '\\')
        {
          struct CharClass *sub;
          at++;
          if (!get_backslash_char_class (&at, &sub))
            {
              *p_regex = at;    /* for error reporting (maybe?) */
              dsk_set_error (error, "bad \\ expression (at %s)",
                             dsk_ascii_byte_name ((unsigned char) *at));
              return NULL;
            }
          if (IS_SINGLE_CHAR_CLASS (sub) && *at == '-')
            {
              first_value = SINGLE_CHAR_CLASS_GET_CHAR (sub);
              at++;
              goto got_range_start_and_dash;
            }
          char_class_union_inplace (out, sub);
        }
      else if (at[1] == '-')
        {
          /* via unsigned char: plain char may be signed, and a
             sign-extended byte >= 0x80 would be a huge range bound */
          first_value = (unsigned char) *at;
          at += 2;
          goto got_range_start_and_dash;
        }
      else
        {
          /* single character */
          unsigned byte = (unsigned char) *at;
          CHAR_CLASS_BITVEC_SET (out, byte);
          at++;
        }

      continue;
got_range_start_and_dash:
      {
        unsigned last_value;
        unsigned code;
        if (*at == '\\')
          {
            struct CharClass *sub;
            const char *start;
            at++;
            start = at;
            if (!get_backslash_char_class (&at, &sub))
              {
                *p_regex = at;    /* for error reporting (maybe?) */
                dsk_set_error (error, "bad \\ expression (at %s)",
                               dsk_ascii_byte_name ((unsigned char) *at));
                return NULL;
              }
            if (!IS_SINGLE_CHAR_CLASS (sub))
              {
                dsk_set_error (error, "non-single-byte \\%c encountered - cannot use in range", *start);
                return NULL;
              }
            last_value = SINGLE_CHAR_CLASS_GET_CHAR (sub);
          }
        else if (*at == ']' || *at == 0)
          {
            /* syntax error: the range has no end.  End-of-string must be
               caught here too, otherwise the NUL would be taken as the
               range end and AT would step past the terminator. */
            dsk_set_error (error, "unterminated character class range");
            return NULL;
          }
        else
          {
            last_value = (unsigned char) *at;
            at++;
          }

        if (first_value > last_value)
          {
            dsk_set_error (error, "character range is not first<last (first=%s, last=%s)",
                           dsk_ascii_byte_name (first_value),
                           dsk_ascii_byte_name (last_value));
            return NULL;
          }
        for (code = first_value; code <= last_value; code++)
          CHAR_CLASS_BITVEC_SET (out, code);
      }
    }
  *p_regex = at;
  if (reverse)
    char_class_reverse_inplace (out);
  return out;
}