Example #1
0
/* Returns true if the N bytes starting at S_ are acceptable as UTF-8 text,
   false otherwise.

   Every byte below 0x80 must be accepted by encoding_guess_is_ascii_text().
   Every byte at or above 0x80 must start a sequence that u8_mbtoucr() can
   decode.  The one tolerated decoding failure is a return value of -2
   (libunistring's code for a sequence that is cut short by the end of the
   buffer): in that case the buffer is accepted as text.  Any other negative
   return rejects the buffer.  */
static bool
is_all_utf8_text (const void *s_, size_t n)
{
  const uint8_t *cursor = s_;
  size_t remaining = n;

  while (remaining > 0)
    {
      size_t step;

      if (*cursor < 0x80)
        {
          /* Single-byte character: defer to the ASCII text test.  */
          if (!encoding_guess_is_ascii_text (*cursor))
            return false;
          step = 1;
        }
      else
        {
          /* Lead byte of a multibyte sequence: it must decode, unless the
             only problem is that the buffer ends in the middle of it.  */
          ucs4_t uc;
          int mblen = u8_mbtoucr (&uc, cursor, remaining);

          if (mblen < 0)
            return mblen == -2;
          step = mblen;
        }

      cursor += step;
      remaining -= step;
    }

  return true;
}
/* Returns the number of characters found in the N units starting at S.

   Each loop iteration accounts for exactly one character:
     - a sequence that u8_mbtoucr() decodes counts as one character and is
       skipped by its length (a reported length of 0 is treated as 1);
     - a sequence for which u8_mbtoucr() returns -2 counts as one character
       and ends the scan;
     - any other negative return counts as one character, and u8_mbtouc() is
       asked how many units to skip.  */
size_t
u8_mbsnlen (const uint8_t *s, size_t n)
{
  const uint8_t *cursor = s;
  size_t remaining = n;
  size_t total = 0;

  while (remaining > 0)
    {
      ucs4_t uc;
      int step = u8_mbtoucr (&uc, cursor, remaining);

      total++;

      /* Nothing usable follows a sequence truncated by the end of input.  */
      if (step == -2)
        return total;

      if (step <= 0)
        step = (step < 0 ? u8_mbtouc (&uc, cursor, remaining) : 1);

      cursor += step;
      remaining -= step;
    }

  return total;
}
Example #3
0
/* Runs the segmenter over the LENGTH bytes at INPUT, one segment per loop
   iteration, checking with assert() that every segment is consistent with
   the input bytes.  If PRINT_SEGMENTS is true, also writes a description of
   each segment to stdout.

   Besides its parameters, this function reads the file-level settings
   `mode' (passed to segmenter_init), `one_byte' (feed the segmenter one
   additional byte at a time) and `verbose' (prefix each segment with its
   line:column position and do not abbreviate single spaces).  */
static void
check_segmentation (const char *input, size_t length, bool print_segments)
{
  size_t offset, line_number, line_offset;
  struct segmenter s;
  int prev_type;

  segmenter_init (&s, mode);

  line_number = 1;
  line_offset = 0;
  prev_type = -1;               /* -1: nothing pending on the output line. */
  for (offset = 0; offset < length; )
    {
      enum segment_type type;
      const char *type_name, *p;
      int n;

      if (one_byte)
        {
          int n_newlines = 0;
          int i;

          /* Offer the segmenter progressively longer prefixes, starting
             from the empty one, until it commits to a segment.  */
          for (i = 0; i <= length - offset; i++)
            {
              /* Make a copy to ensure that segmenter_push() isn't actually
                 looking ahead. */
              char *copy;

              if (i > 0 && input[offset + i - 1] == '\n')
                n_newlines++;

              copy = xmemdup (input + offset, i);
              n = segmenter_push (&s, copy, i, &type);
              free (copy);

              if (n >= 0)
                break;
            }
          /* The prefix that settled the segment contained at most two
             new-lines.  */
          assert (n_newlines <= 2);
        }
      else
        n = segmenter_push (&s, input + offset, length - offset, &type);

      if (n < 0)
        error (EXIT_FAILURE, 0, "segmenter_push returned -1 at offset %zu",
               offset);
      assert (offset + n <= length);

      /* A new-line segment is exactly "\n" or "\r\n"; no other segment may
         contain a new-line.  */
      if (type == SEG_NEWLINE)
        assert ((n == 1 && input[offset] == '\n')
                || (n == 2
                    && input[offset] == '\r' && input[offset + 1] == '\n'));
      else
        assert (memchr (&input[offset], '\n', n) == NULL);

      if (!print_segments)
        {
          offset += n;
          continue;
        }

      if (!verbose)
        {
          /* Abbreviate a lone space that follows a non-space segment as a
             "space" suffix on the previous segment's output line.  */
          if (prev_type != SEG_SPACES && prev_type != -1
              && type == SEG_SPACES && n == 1 && input[offset] == ' ')
            {
              printf ("    space\n");
              offset++;
              prev_type = -1;
              continue;
            }
        }
      if (prev_type != -1)
        putchar ('\n');
      prev_type = type;

      if (verbose)
        printf ("%2zu:%2zu: ", line_number, offset - line_offset);

      type_name = segment_type_to_string (type);
      for (p = type_name; *p != '\0'; p++)
        putchar (tolower ((unsigned char) *p));
      if (n > 0)
        {
          int i;

          /* Pad the type name to a 16-column field (at least one space).  */
          for (i = MIN (15, strlen (type_name)); i < 16; i++)
            putchar (' ');
          for (i = 0; i < n; )
            {
              const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
              ucs4_t uc;
              int mblen;

              mblen = u8_mbtoucr (&uc, u_input + (offset + i), n - i);
              if (mblen < 0)
                {
                  int j;

                  /* Undecodable bytes: ask u8_mbtouc() how many to skip and
                     dump them in hex.  */
                  mblen = u8_mbtouc (&uc, u_input + (offset + i), n - i);
                  putchar ('<');
                  for (j = 0; j < mblen; j++)
                    {
                      if (j > 0)
                        putchar (' ');
                      /* Read the byte as unsigned: through plain `char' a
                         byte >= 0x80 would be negative where char is signed
                         and print as "ffffffXX".  */
                      printf ("%02x", u_input[offset + i + j]);
                    }
                  putchar ('>');
                }
              else
                {
                  /* Make white space and the escape characters themselves
                     visible and unambiguous in the output.  */
                  switch (uc)
                    {
                    case ' ':
                      printf ("_");
                      break;

                    case '_':
                      printf ("\\_");
                      break;

                    case '\\':
                      printf ("\\\\");
                      break;

                    case '\t':
                      printf ("\\t");
                      break;

                    case '\r':
                      printf ("\\r");
                      break;

                    case '\n':
                      printf ("\\n");
                      break;

                    case '\v':
                      printf ("\\v");
                      break;

                    default:
                      /* Other control characters and U+00A0 are shown by
                         code point; everything else is copied verbatim.  */
                      if (uc < 0x20 || uc == 0x00a0)
                        printf ("<U+%04X>", uc);
                      else
                        fwrite (input + offset + i, 1, mblen, stdout);
                      break;
                    }
                }

              i += mblen;
            }
        }

      offset += n;
      if (type == SEG_NEWLINE)
        {
          enum prompt_style prompt;

          /* Start a new line for position reporting and show the prompt
             style the segmenter reports at this point.  */
          line_number++;
          line_offset = offset;

          prompt = segmenter_get_prompt (&s);
          printf (" (%s)\n", prompt_style_to_string (prompt));
        }
    }
Example #4
0
/* Decodes the byte sequence given by the trailing arguments with
   u8_mbtoucr(), passing exactly the length of that sequence, and asserts
   that the call returns EXPECTED_RET and stores EXPECTED_UC.  The output
   variable is pre-filled with a garbage value so that a decoder which
   forgets to store a result is caught.  This is a macro, not a function, so
   that each test case expands (and asserts) on its own source line.  */
#define CHECK_MBTOUCR(EXPECTED_RET, EXPECTED_UC, ...)       \
  do                                                        \
    {                                                       \
      static const uint8_t input[] = { __VA_ARGS__ };       \
      ucs4_t uc = 0xBADFACE;                                \
      int ret = u8_mbtoucr (&uc, input, sizeof input);      \
      ASSERT (ret == (EXPECTED_RET));                       \
      ASSERT (uc == (EXPECTED_UC));                         \
    }                                                       \
  while (0)

/* Exercises u8_mbtoucr() on valid, incomplete and invalid input.  In every
   failure case the decoder is expected to store U+FFFD and to return -2 when
   the bytes seen so far could still be completed, -1 when they could not.  */
int
main ()
{
  /* NUL unit.  */
  CHECK_MBTOUCR (1, 0, 0x00);

  /* Every ISO 646 unit decodes to itself.  */
  {
    ucs4_t c;

    for (c = 0; c < 0x80; c++)
      {
        uint8_t buf[1];
        ucs4_t uc = 0xBADFACE;
        int ret;

        buf[0] = c;
        ret = u8_mbtoucr (&uc, buf, 1);
        ASSERT (ret == 1);
        ASSERT (uc == c);
      }
  }

  /* Well-formed 2-, 3- and 4-byte characters.  */
  CHECK_MBTOUCR (2, 0x00D7, 0xC3, 0x97);
  CHECK_MBTOUCR (3, 0x20AC, 0xE2, 0x82, 0xAC);
  CHECK_MBTOUCR (4, 0x10FFFD, 0xF4, 0x8F, 0xBF, 0xBD);

  /* One byte of input: a valid lead byte alone is incomplete, while 0xC1
     (overlong lead) and 0xFE (never valid) are invalid.  */
  CHECK_MBTOUCR (-1, 0xFFFD, 0xC1);
  CHECK_MBTOUCR (-2, 0xFFFD, 0xC3);
  CHECK_MBTOUCR (-2, 0xFFFD, 0xE2);
  CHECK_MBTOUCR (-2, 0xFFFD, 0xF4);
  CHECK_MBTOUCR (-1, 0xFFFD, 0xFE);

  /* Two bytes of input: E0 9F and F0 8F would be overlong, and 0xD0 is not
     a continuation byte.  */
  CHECK_MBTOUCR (-1, 0xFFFD, 0xE0, 0x9F);
  CHECK_MBTOUCR (-2, 0xFFFD, 0xE2, 0x82);
  CHECK_MBTOUCR (-1, 0xFFFD, 0xE2, 0xD0);
  CHECK_MBTOUCR (-1, 0xFFFD, 0xF0, 0x8F);
  CHECK_MBTOUCR (-2, 0xFFFD, 0xF3, 0x8F);
  CHECK_MBTOUCR (-1, 0xFFFD, 0xF3, 0xD0);

  /* Three bytes of input for a 4-byte lead.  */
  CHECK_MBTOUCR (-2, 0xFFFD, 0xF3, 0x8F, 0xBF);
  CHECK_MBTOUCR (-1, 0xFFFD, 0xF3, 0xD0, 0xBF);
  CHECK_MBTOUCR (-1, 0xFFFD, 0xF3, 0x8F, 0xD0);

  return 0;
}

#undef CHECK_MBTOUCR
Example #5
0
/* utf8conv_carefully has the calling convention of iconv, but
     - it "converts" from UTF-8 to UTF-8, i.e. it copies characters while
       validating them,
     - it stops at the first conversion error; in that case it returns
       (size_t)(-1), sets errno, and stores in *INCREMENTED whether the input
       pointer has been advanced past the error location,
     - if ONE_CHARACTER_ONLY is true, it stops after converting one
       character.
   errno is EINVAL when u8_mbtoucr reports -2 for the input, EILSEQ for any
   other input failure or when u8_uctomb reports -1, and E2BIG when the
   output buffer has no room.  *INCREMENTED is not written on success.
   *INBUF, *INBYTESLEFT, *OUTBUF and *OUTBYTESLEFT are always updated to
   reflect the progress made.  */
static size_t
utf8conv_carefully (bool one_character_only,
                    const char **inbuf, size_t *inbytesleft,
                    char **outbuf, size_t *outbytesleft,
                    bool *incremented)
{
  const char *src = *inbuf;
  size_t src_left = *inbytesleft;
  char *dst = *outbuf;
  size_t dst_left = *outbytesleft;
  size_t status = 0;

  for (;;)
    {
      ucs4_t uc;
      int consumed;
      int produced;

      /* Decode one character from the input.  */
      consumed = u8_mbtoucr (&uc, (const uint8_t *) src, src_left);
      if (consumed < 0)
        {
          /* Record the reason first, then let u8_mbtouc decide how many
             bytes the offending sequence occupies and skip them.  */
          errno = (consumed == -2 ? EINVAL : EILSEQ);
          consumed = u8_mbtouc (&uc, (const uint8_t *) src, src_left);
          src += consumed;
          src_left -= consumed;
          *incremented = true;
          status = (size_t)(-1);
          break;
        }

      /* Encode it into the output; the input is not consumed if there is
         no room.  */
      if (dst_left == 0)
        {
          errno = E2BIG;
          *incremented = false;
          status = (size_t)(-1);
          break;
        }
      produced = u8_uctomb ((uint8_t *) dst, uc, dst_left);
      if (produced == -2)
        {
          errno = E2BIG;
          *incremented = false;
          status = (size_t)(-1);
          break;
        }

      src += consumed;
      src_left -= consumed;

      if (produced == -1)
        {
          /* The character could not be encoded; it has already been
             consumed from the input.  */
          errno = EILSEQ;
          *incremented = true;
          status = (size_t)(-1);
          break;
        }

      dst += produced;
      dst_left -= produced;

      if (one_character_only || src_left == 0)
        break;
    }

  *inbuf = src;
  *inbytesleft = src_left;
  *outbuf = dst;
  *outbytesleft = dst_left;
  return status;
}
/* Converts the N units starting at S into an array of DST_UNIT, storing one
   element per character decoded by u8_mbtoucr().

   If RESULTBUF is not NULL, it is used as the initial output buffer and
   *LENGTHP gives its capacity in elements.  The return value is RESULTBUF
   if the output fits there, otherwise a malloc-allocated array.  If
   RESULTBUF is NULL, the result is always malloc-allocated (non-NULL even
   when empty).  On success, *LENGTHP is set to the number of elements
   stored.

   On failure, returns NULL with errno set to EILSEQ (u8_mbtoucr() reported
   a failure) or ENOMEM (allocation failed); any memory allocated here is
   freed and *LENGTHP is left untouched.  */
DST_UNIT *
FUNC (const SRC_UNIT *s, size_t n, DST_UNIT *resultbuf, size_t *lengthp)
{
  const SRC_UNIT *const limit = s + n;
  /* Output accumulator.  OUT is always one of: RESULTBUF, NULL, or a block
     obtained from malloc/realloc.  If USED > 0 then OUT != NULL.  */
  DST_UNIT *out = NULL;
  size_t capacity = 0;
  size_t used = 0;
  int failure;

  if (resultbuf != NULL)
    {
      out = resultbuf;
      capacity = *lengthp;
    }

  while (s < limit)
    {
      ucs4_t uc;
      /* Fetch one Unicode character from the input.  */
      int count = u8_mbtoucr (&uc, s, limit - s);

      if (count < 0)
        {
          failure = EILSEQ;
          goto fail;
        }
      s += count;

      /* Make room for one more element if necessary.  */
      if (used + 1 > capacity)
        {
          DST_UNIT *grown;

          capacity = (capacity > 0 ? 2 * capacity : 12);
          if (used + 1 > capacity)
            capacity = used + 1;

          if (out != resultbuf && out != NULL)
            grown = (DST_UNIT *) realloc (out, capacity * sizeof (DST_UNIT));
          else
            grown = (DST_UNIT *) malloc (capacity * sizeof (DST_UNIT));

          if (grown == NULL)
            {
              failure = ENOMEM;
              goto fail;
            }
          /* Moving off the caller's buffer: carry over what it holds.  */
          if (out == resultbuf && used > 0)
            memcpy ((char *) grown, (char *) out, used * sizeof (DST_UNIT));
          out = grown;
        }

      out[used++] = uc;
    }

  if (used == 0)
    {
      if (out == NULL)
        {
          /* NULL is reserved for errors, so return a real allocation.  */
          out = (DST_UNIT *) malloc (1);
          if (out == NULL)
            {
              failure = ENOMEM;
              goto fail;
            }
        }
    }
  else if (out != resultbuf && used < capacity)
    {
      /* Give back unused memory; keep the larger block if that fails.  */
      DST_UNIT *trimmed = (DST_UNIT *) realloc (out, used * sizeof (DST_UNIT));

      if (trimmed != NULL)
        out = trimmed;
    }

  *lengthp = used;
  return out;

 fail:
  /* Release only memory allocated here, never the caller's buffer.  */
  if (out != resultbuf && out != NULL)
    free (out);
  errno = failure;
  return NULL;
}