示例#1
0
const uint8_t *
u8_grapheme_prev (const uint8_t *s, const uint8_t *start)
{
  ucs4_t next;

  if (s == start)
    return NULL;

  s = u8_prev (&next, s, start);
  while (s != start)
    {
      const uint8_t *prev_s;
      ucs4_t prev;

      prev_s = u8_prev (&prev, s, start);
      if (prev_s == NULL)
        {
          /* Ill-formed UTF-8 encoding. */
          return start;
        }

      if (uc_is_grapheme_break (prev, next))
        break;

      s = prev_s;
      next = prev;
    }

  return s;
}
static int
check (const uint8_t *input, size_t input_length, ucs4_t *puc)
{
  ucs4_t uc;

  /* Test recognition when at the beginning of the string.  */
  if (u8_prev (&uc, input + input_length, input) != input)
    return 1;

  /* Test recognition when preceded by a 1-unit character.  */
  {
    uint8_t buf[100];
    uint8_t *ptr;
    size_t i;
    ucs4_t uc1;

    ptr = buf;
    *ptr++ = 'x';
    for (i = 0; i < input_length; i++)
      ptr[i] = input[i];

    if (u8_prev (&uc1, ptr + input_length, buf) != ptr)
      return 2;
    if (uc1 != uc)
      return 3;
  }

  /* Test recognition when preceded by a 2-unit character.  */
  {
    uint8_t buf[100];
    uint8_t *ptr;
    size_t i;
    ucs4_t uc1;

    ptr = buf;
    *ptr++ = 0xC3;
    *ptr++ = 0x97;
    for (i = 0; i < input_length; i++)
      ptr[i] = input[i];

    if (u8_prev (&uc1, ptr + input_length, buf) != ptr)
      return 4;
    if (uc1 != uc)
      return 5;
  }

  /* Test recognition when preceded by a 3-unit character.  */
  {
    uint8_t buf[100];
    uint8_t *ptr;
    size_t i;
    ucs4_t uc1;

    ptr = buf;
    *ptr++ = 0xE2;
    *ptr++ = 0x84;
    *ptr++ = 0x82;
    for (i = 0; i < input_length; i++)
      ptr[i] = input[i];

    if (u8_prev (&uc1, ptr + input_length, buf) != ptr)
      return 6;
    if (uc1 != uc)
      return 7;
  }

  /* Test recognition when preceded by a 4-unit character.  */
  {
    uint8_t buf[100];
    uint8_t *ptr;
    size_t i;
    ucs4_t uc1;

    ptr = buf;
    *ptr++ = 0xF0;
    *ptr++ = 0x9D;
    *ptr++ = 0x94;
    *ptr++ = 0x9E;
    for (i = 0; i < input_length; i++)
      ptr[i] = input[i];

    if (u8_prev (&uc1, ptr + input_length, buf) != ptr)
      return 8;
    if (uc1 != uc)
      return 9;
  }

  *puc = uc;
  return 0;
}
static int
check_invalid (const uint8_t *input, size_t input_length)
{
  ucs4_t uc;

  /* Test recognition when at the beginning of the string.  */
  uc = 0xBADFACE;
  if (u8_prev (&uc, input + input_length, input) != NULL)
    return 1;
  if (uc != 0xBADFACE)
    return 2;

  /* Test recognition when preceded by a 1-unit character.  */
  {
    uint8_t buf[100];
    uint8_t *ptr;
    size_t i;

    ptr = buf;
    *ptr++ = 'x';
    for (i = 0; i < input_length; i++)
      ptr[i] = input[i];

    uc = 0xBADFACE;
    if (u8_prev (&uc, ptr + input_length, buf) != NULL)
      return 3;
    if (uc != 0xBADFACE)
      return 4;
  }

  /* Test recognition when preceded by a 2-unit character.  */
  {
    uint8_t buf[100];
    uint8_t *ptr;
    size_t i;

    ptr = buf;
    *ptr++ = 0xC3;
    *ptr++ = 0x97;
    for (i = 0; i < input_length; i++)
      ptr[i] = input[i];

    uc = 0xBADFACE;
    if (u8_prev (&uc, ptr + input_length, buf) != NULL)
      return 5;
    if (uc != 0xBADFACE)
      return 6;
  }

  /* Test recognition when preceded by a 3-unit character.  */
  {
    uint8_t buf[100];
    uint8_t *ptr;
    size_t i;

    ptr = buf;
    *ptr++ = 0xE2;
    *ptr++ = 0x84;
    *ptr++ = 0x82;
    for (i = 0; i < input_length; i++)
      ptr[i] = input[i];

    uc = 0xBADFACE;
    if (u8_prev (&uc, ptr + input_length, buf) != NULL)
      return 7;
    if (uc != 0xBADFACE)
      return 8;
  }

  /* Test recognition when preceded by a 4-unit character.  */
  {
    uint8_t buf[100];
    uint8_t *ptr;
    size_t i;

    ptr = buf;
    *ptr++ = 0xF0;
    *ptr++ = 0x9D;
    *ptr++ = 0x94;
    *ptr++ = 0x9E;
    for (i = 0; i < input_length; i++)
      ptr[i] = input[i];

    uc = 0xBADFACE;
    if (u8_prev (&uc, ptr + input_length, buf) != NULL)
      return 9;
    if (uc != 0xBADFACE)
      return 10;
  }

  return 0;
}
示例#4
0
static int
mem_cd_iconveh_internal (const char *src, size_t srclen,
                         iconv_t cd, iconv_t cd1, iconv_t cd2,
                         enum iconv_ilseq_handler handler,
                         size_t extra_alloc,
                         size_t *offsets,
                         char **resultp, size_t *lengthp)
{
  /* When a conversion error occurs, we cannot start using CD1 and CD2 at
     this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
     Instead, we have to start afresh from the beginning of SRC.  */
  /* Use a temporary buffer, so that for small strings, a single malloc()
     call will be sufficient.  */
# define tmpbufsize 4096
  /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
     libiconv's UCS-4-INTERNAL encoding.  */
  union { unsigned int align; char buf[tmpbufsize]; } tmp;
# define tmpbuf tmp.buf

  char *initial_result;
  char *result;
  size_t allocated;
  size_t length;
  size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */

  if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
    {
      initial_result = *resultp;
      allocated = *lengthp;
    }
  else
    {
      initial_result = tmpbuf;
      allocated = sizeof (tmpbuf);
    }
  result = initial_result;

  /* Test whether a direct conversion is possible at all.  */
  if (cd == (iconv_t)(-1))
    goto indirectly;

  if (offsets != NULL)
    {
      size_t i;

      for (i = 0; i < srclen; i++)
        offsets[i] = (size_t)(-1);

      last_length = (size_t)(-1);
    }
  length = 0;

  /* First, try a direct conversion, and see whether a conversion error
     occurs at all.  */
  {
    const char *inptr = src;
    size_t insize = srclen;

    /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
# if defined _LIBICONV_VERSION \
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
          || defined __sun)
    /* Set to the initial state.  */
    iconv (cd, NULL, NULL, NULL, NULL);
# endif

    while (insize > 0)
      {
        char *outptr = result + length;
        size_t outsize = allocated - extra_alloc - length;
        bool incremented;
        size_t res;
        bool grow;

        if (offsets != NULL)
          {
            if (length != last_length) /* ensure that offset[] be increasing */
              {
                offsets[inptr - src] = length;
                last_length = length;
              }
            res = iconv_carefully_1 (cd,
                                     &inptr, &insize,
                                     &outptr, &outsize,
                                     &incremented);
          }
        else
          /* Use iconv_carefully instead of iconv here, because:
             - If TO_CODESET is UTF-8, we can do the error handling in this
               loop, no need for a second loop,
             - With iconv() implementations other than GNU libiconv and GNU
               libc, if we use iconv() in a big swoop, checking for an E2BIG
               return, we lose the number of irreversible conversions.  */
          res = iconv_carefully (cd,
                                 &inptr, &insize,
                                 &outptr, &outsize,
                                 &incremented);

        length = outptr - result;
        grow = (length + extra_alloc > allocated / 2);
        if (res == (size_t)(-1))
          {
            if (errno == E2BIG)
              grow = true;
            else if (errno == EINVAL)
              break;
            else if (errno == EILSEQ && handler != iconveh_error)
              {
                if (cd2 == (iconv_t)(-1))
                  {
                    /* TO_CODESET is UTF-8.  */
                    /* Error handling can produce up to 1 byte of output.  */
                    if (length + 1 + extra_alloc > allocated)
                      {
                        char *memory;

                        allocated = 2 * allocated;
                        if (length + 1 + extra_alloc > allocated)
                          abort ();
                        if (result == initial_result)
                          memory = (char *) malloc (allocated);
                        else
                          memory = (char *) realloc (result, allocated);
                        if (memory == NULL)
                          {
                            if (result != initial_result)
                              free (result);
                            errno = ENOMEM;
                            return -1;
                          }
                        if (result == initial_result)
                          memcpy (memory, initial_result, length);
                        result = memory;
                        grow = false;
                      }
                    /* The input is invalid in FROM_CODESET.  Eat up one byte
                       and emit a question mark.  */
                    if (!incremented)
                      {
                        if (insize == 0)
                          abort ();
                        inptr++;
                        insize--;
                      }
                    result[length] = '?';
                    length++;
                  }
                else
                  goto indirectly;
              }
            else
              {
                if (result != initial_result)
                  {
                    int saved_errno = errno;
                    free (result);
                    errno = saved_errno;
                  }
                return -1;
              }
          }
        if (insize == 0)
          break;
        if (grow)
          {
            char *memory;

            allocated = 2 * allocated;
            if (result == initial_result)
              memory = (char *) malloc (allocated);
            else
              memory = (char *) realloc (result, allocated);
            if (memory == NULL)
              {
                if (result != initial_result)
                  free (result);
                errno = ENOMEM;
                return -1;
              }
            if (result == initial_result)
              memcpy (memory, initial_result, length);
            result = memory;
          }
      }
  }

  /* Now get the conversion state back to the initial state.
     But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
#if defined _LIBICONV_VERSION \
    || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
         || defined __sun)
  for (;;)
    {
      char *outptr = result + length;
      size_t outsize = allocated - extra_alloc - length;
      size_t res;

      res = iconv (cd, NULL, NULL, &outptr, &outsize);
      length = outptr - result;
      if (res == (size_t)(-1))
        {
          if (errno == E2BIG)
            {
              char *memory;

              allocated = 2 * allocated;
              if (result == initial_result)
                memory = (char *) malloc (allocated);
              else
                memory = (char *) realloc (result, allocated);
              if (memory == NULL)
                {
                  if (result != initial_result)
                    free (result);
                  errno = ENOMEM;
                  return -1;
                }
              if (result == initial_result)
                memcpy (memory, initial_result, length);
              result = memory;
            }
          else
            {
              if (result != initial_result)
                {
                  int saved_errno = errno;
                  free (result);
                  errno = saved_errno;
                }
              return -1;
            }
        }
      else
        break;
    }
#endif

  /* The direct conversion succeeded.  */
  goto done;

 indirectly:
  /* The direct conversion failed.
     Use a conversion through UTF-8.  */
  if (offsets != NULL)
    {
      size_t i;

      for (i = 0; i < srclen; i++)
        offsets[i] = (size_t)(-1);

      last_length = (size_t)(-1);
    }
  length = 0;
  {
    const bool slowly = (offsets != NULL || handler == iconveh_error);
# define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
    char utf8buf[utf8bufsize + 1];
    size_t utf8len = 0;
    const char *in1ptr = src;
    size_t in1size = srclen;
    bool do_final_flush1 = true;
    bool do_final_flush2 = true;

    /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
# if defined _LIBICONV_VERSION \
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
          || defined __sun)
    /* Set to the initial state.  */
    if (cd1 != (iconv_t)(-1))
      iconv (cd1, NULL, NULL, NULL, NULL);
    if (cd2 != (iconv_t)(-1))
      iconv (cd2, NULL, NULL, NULL, NULL);
# endif

    while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
      {
        char *out1ptr = utf8buf + utf8len;
        size_t out1size = utf8bufsize - utf8len;
        bool incremented1;
        size_t res1;
        int errno1;

        /* Conversion step 1: from FROM_CODESET to UTF-8.  */
        if (in1size > 0)
          {
            if (offsets != NULL
                && length != last_length) /* ensure that offset[] be increasing */
              {
                offsets[in1ptr - src] = length;
                last_length = length;
              }
            if (cd1 != (iconv_t)(-1))
              {
                if (slowly)
                  res1 = iconv_carefully_1 (cd1,
                                            &in1ptr, &in1size,
                                            &out1ptr, &out1size,
                                            &incremented1);
                else
                  res1 = iconv_carefully (cd1,
                                          &in1ptr, &in1size,
                                          &out1ptr, &out1size,
                                          &incremented1);
              }
            else
              {
                /* FROM_CODESET is UTF-8.  */
                res1 = utf8conv_carefully (slowly,
                                           &in1ptr, &in1size,
                                           &out1ptr, &out1size,
                                           &incremented1);
              }
          }
        else if (do_final_flush1)
          {
            /* Now get the conversion state of CD1 back to the initial state.
               But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
# if defined _LIBICONV_VERSION \
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
          || defined __sun)
            if (cd1 != (iconv_t)(-1))
              res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
            else
# endif
              res1 = 0;
            do_final_flush1 = false;
            incremented1 = true;
          }
        else
          {
            res1 = 0;
            incremented1 = true;
          }
        if (res1 == (size_t)(-1)
            && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
          {
            if (result != initial_result)
              {
                int saved_errno = errno;
                free (result);
                errno = saved_errno;
              }
            return -1;
          }
        if (res1 == (size_t)(-1)
            && errno == EILSEQ && handler != iconveh_error)
          {
            /* The input is invalid in FROM_CODESET.  Eat up one byte and
               emit a question mark.  Room for the question mark was allocated
               at the end of utf8buf.  */
            if (!incremented1)
              {
                if (in1size == 0)
                  abort ();
                in1ptr++;
                in1size--;
              }
            *out1ptr++ = '?';
            res1 = 0;
          }
        errno1 = errno;
        utf8len = out1ptr - utf8buf;

        if (offsets != NULL
            || in1size == 0
            || utf8len > utf8bufsize / 2
            || (res1 == (size_t)(-1) && errno1 == E2BIG))
          {
            /* Conversion step 2: from UTF-8 to TO_CODESET.  */
            const char *in2ptr = utf8buf;
            size_t in2size = utf8len;

            while (in2size > 0
                   || (in1size == 0 && !do_final_flush1 && do_final_flush2))
              {
                char *out2ptr = result + length;
                size_t out2size = allocated - extra_alloc - length;
                bool incremented2;
                size_t res2;
                bool grow;

                if (in2size > 0)
                  {
                    if (cd2 != (iconv_t)(-1))
                      res2 = iconv_carefully (cd2,
                                              &in2ptr, &in2size,
                                              &out2ptr, &out2size,
                                              &incremented2);
                    else
                      /* TO_CODESET is UTF-8.  */
                      res2 = utf8conv_carefully (false,
                                                 &in2ptr, &in2size,
                                                 &out2ptr, &out2size,
                                                 &incremented2);
                  }
                else /* in1size == 0 && !do_final_flush1
                        && in2size == 0 && do_final_flush2 */
                  {
                    /* Now get the conversion state of CD1 back to the initial
                       state.  But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
# if defined _LIBICONV_VERSION \
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
          || defined __sun)
                    if (cd2 != (iconv_t)(-1))
                      res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
                    else
# endif
                      res2 = 0;
                    do_final_flush2 = false;
                    incremented2 = true;
                  }

                length = out2ptr - result;
                grow = (length + extra_alloc > allocated / 2);
                if (res2 == (size_t)(-1))
                  {
                    if (errno == E2BIG)
                      grow = true;
                    else if (errno == EINVAL)
                      break;
                    else if (errno == EILSEQ && handler != iconveh_error)
                      {
                        /* Error handling can produce up to 10 bytes of ASCII
                           output.  But TO_CODESET may be UCS-2, UTF-16 or
                           UCS-4, so use CD2 here as well.  */
                        char scratchbuf[10];
                        size_t scratchlen;
                        ucs4_t uc;
                        const char *inptr;
                        size_t insize;
                        size_t res;

                        if (incremented2)
                          {
                            if (u8_prev (&uc, (const uint8_t *) in2ptr,
                                         (const uint8_t *) utf8buf)
                                == NULL)
                              abort ();
                          }
                        else
                          {
                            int n;
                            if (in2size == 0)
                              abort ();
                            n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
                                                  in2size);
                            in2ptr += n;
                            in2size -= n;
                          }

                        if (handler == iconveh_escape_sequence)
                          {
                            static char hex[16] = "0123456789ABCDEF";
                            scratchlen = 0;
                            scratchbuf[scratchlen++] = '\\';
                            if (uc < 0x10000)
                              scratchbuf[scratchlen++] = 'u';
                            else
                              {
                                scratchbuf[scratchlen++] = 'U';
                                scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
                                scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
                                scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
                                scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
                              }
                            scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
                            scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
                            scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
                            scratchbuf[scratchlen++] = hex[uc & 15];
                          }
                        else
                          {
                            scratchbuf[0] = '?';
                            scratchlen = 1;
                          }

                        inptr = scratchbuf;
                        insize = scratchlen;
                        if (cd2 != (iconv_t)(-1))
                          res = iconv (cd2,
                                       (ICONV_CONST char **) &inptr, &insize,
                                       &out2ptr, &out2size);
                        else
                          {
                            /* TO_CODESET is UTF-8.  */
                            if (out2size >= insize)
                              {
                                memcpy (out2ptr, inptr, insize);
                                out2ptr += insize;
                                out2size -= insize;
                                inptr += insize;
                                insize = 0;
                                res = 0;
                              }
                            else
                              {
                                errno = E2BIG;
                                res = (size_t)(-1);
                              }
                          }
                        length = out2ptr - result;
                        if (res == (size_t)(-1) && errno == E2BIG)
                          {
                            char *memory;

                            allocated = 2 * allocated;
                            if (length + 1 + extra_alloc > allocated)
                              abort ();
                            if (result == initial_result)
                              memory = (char *) malloc (allocated);
                            else
                              memory = (char *) realloc (result, allocated);
                            if (memory == NULL)
                              {
                                if (result != initial_result)
                                  free (result);
                                errno = ENOMEM;
                                return -1;
                              }
                            if (result == initial_result)
                              memcpy (memory, initial_result, length);
                            result = memory;
                            grow = false;

                            out2ptr = result + length;
                            out2size = allocated - extra_alloc - length;
                            if (cd2 != (iconv_t)(-1))
                              res = iconv (cd2,
                                           (ICONV_CONST char **) &inptr,
                                           &insize,
                                           &out2ptr, &out2size);
                            else
                              {
                                /* TO_CODESET is UTF-8.  */
                                if (!(out2size >= insize))
                                  abort ();
                                memcpy (out2ptr, inptr, insize);
                                out2ptr += insize;
                                out2size -= insize;
                                inptr += insize;
                                insize = 0;
                                res = 0;
                              }
                            length = out2ptr - result;
                          }
# if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
                        /* Irix iconv() inserts a NUL byte if it cannot convert.
                           NetBSD iconv() inserts a question mark if it cannot
                           convert.
                           Only GNU libiconv and GNU libc are known to prefer
                           to fail rather than doing a lossy conversion.  */
                        if (res != (size_t)(-1) && res > 0)
                          {
                            errno = EILSEQ;
                            res = (size_t)(-1);
                          }
# endif
                        if (res == (size_t)(-1))
                          {
                            /* Failure converting the ASCII replacement.  */
                            if (result != initial_result)
                              {
                                int saved_errno = errno;
                                free (result);
                                errno = saved_errno;
                              }
                            return -1;
                          }
                      }
                    else
                      {
                        if (result != initial_result)
                          {
                            int saved_errno = errno;
                            free (result);
                            errno = saved_errno;
                          }
                        return -1;
                      }
                  }
                if (!(in2size > 0
                      || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
                  break;
                if (grow)
                  {
                    char *memory;

                    allocated = 2 * allocated;
                    if (result == initial_result)
                      memory = (char *) malloc (allocated);
                    else
                      memory = (char *) realloc (result, allocated);
                    if (memory == NULL)
                      {
                        if (result != initial_result)
                          free (result);
                        errno = ENOMEM;
                        return -1;
                      }
                    if (result == initial_result)
                      memcpy (memory, initial_result, length);
                    result = memory;
                  }
              }

            /* Move the remaining bytes to the beginning of utf8buf.  */
            if (in2size > 0)
              memmove (utf8buf, in2ptr, in2size);
            utf8len = in2size;
          }