Пример #1
0
U_CAPI UChar*  U_EXPORT2
u_uastrncpy(UChar *ucs1,
           const char *s2,
           int32_t n)
{
  UChar *target = ucs1;
  UErrorCode err = U_ZERO_ERROR;
  UConverter *cnv = u_getDefaultConverter(&err);
  if(U_SUCCESS(err) && cnv != NULL) {
    ucnv_reset(cnv);
    ucnv_toUnicode(cnv,
                   &target,
                   ucs1+n,
                   &s2,
                   s2+u_astrnlen(s2, n),
                   NULL,
                   TRUE,
                   &err);
    ucnv_reset(cnv); /* be good citizens */
    u_releaseDefaultConverter(cnv);
    if(U_FAILURE(err) && (err != U_BUFFER_OVERFLOW_ERROR) ) {
      *ucs1 = 0; /* failure */
    }
    if(target < (ucs1+n)) { /* U_BUFFER_OVERFLOW_ERROR isn't an err, just means no termination will happen. */
      *target = 0;  /* terminate */
    }
  } else {
    *ucs1 = 0;
  }
  return ucs1;
}
Пример #2
0
void convsample_50() {
  printf("\n\n==============================================\n"
         "Sample 50: C: ucnv_detectUnicodeSignature\n");

  //! [ucnv_detectUnicodeSignature]
  UErrorCode err = U_ZERO_ERROR;
  UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */
  char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
  int32_t signatureLength = 0;
  const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err);
  UConverter *conv = NULL;
  UChar output[100];
  UChar *target = output, *out;
  const char *source = input;
  if(encoding!=NULL && U_SUCCESS(err)){
    // should signature be discarded ?
    conv = ucnv_open(encoding, &err);
    // do the conversion
    ucnv_toUnicode(conv,
                   &target, output + UPRV_LENGTHOF(output),
                   &source, input + sizeof(input),
                   NULL, TRUE, &err);
    out = output;
    if (discardSignature){
      ++out; // ignore initial U+FEFF
    }
    while(out != target) {
      printf("%04x ", *out++);
    }
    puts("");
  }
  //! [ucnv_detectUnicodeSignature]
  puts("");
}
Пример #3
0
QString QIcuCodec::convertToUnicode(const char *chars, int length, QTextCodec::ConverterState *state) const
{
    UConverter *conv = getConverter(state);

    QString string(length + 2, Qt::Uninitialized);

    const char *end = chars + length;
    int convertedChars = 0;
    while (1) {
        UChar *uc = (UChar *)string.data();
        UChar *ucEnd = uc + string.length();
        uc += convertedChars;
        UErrorCode error = U_ZERO_ERROR;
        ucnv_toUnicode(conv,
                       &uc, ucEnd,
                       &chars, end,
                       0, false, &error);
        if (!U_SUCCESS(error) && error != U_BUFFER_OVERFLOW_ERROR) {
            qDebug() << "convertToUnicode failed:" << u_errorName(error);
            break;
        }

        convertedChars = uc - (UChar *)string.data();
        if (chars >= end)
            break;
        string.resize(string.length()*2);
    }
    string.resize(convertedChars);

    if (!state)
        ucnv_close(conv);
    return string;
}
Пример #4
0
int TextCodecICU::decodeToBuffer(UChar* target, UChar* targetLimit, const char*& source, const char* sourceLimit, int32_t* offsets, bool flush, UErrorCode& err)
{
    UChar* targetStart = target;
    err = U_ZERO_ERROR;
    ucnv_toUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, offsets, flush, &err);
    return target - targetStart;
}
Пример #5
0
UChar*
ufmt_defaultCPToUnicode(const char *s, int32_t sSize,
                        UChar *target, int32_t tSize)
{
    UChar *alias;
    UErrorCode status = U_ZERO_ERROR;
    UConverter *defConverter = u_getDefaultConverter(&status);
    
    if(U_FAILURE(status) || defConverter == 0)
        return 0;

    if(sSize <= 0) {
        sSize = uprv_strlen(s) + 1;
    }
    
    /* perform the conversion in one swoop */
    if(target != 0) {
        
        alias = target;
        ucnv_toUnicode(defConverter, &alias, alias + tSize, &s, s + sSize - 1, 
            NULL, TRUE, &status);
        
        
        /* add the null terminator */
        *alias = 0x0000;
    }
    
    u_releaseDefaultConverter(defConverter);
    
    return target;
}
Пример #6
0
inline int
mod_websocket_conv(UConverter *to, UConverter *from,
                   char **dst, size_t *dstsiz,
                   const char *src, size_t srcsiz) {
    UErrorCode err = U_ZERO_ERROR;
    size_t unisiz;
    UChar *unibuf, *punibuf, *ppunibuf;
    char *pdst;

    if (srcsiz == 0) {
        return -1;
    }
    if (!to) {
        *dst = (char *)malloc(srcsiz + 1);
        if (*dst == NULL) {
            return -1;
        }
        memcpy(*dst, src, srcsiz);
        (*dst)[srcsiz] = '\0';
        *dstsiz = srcsiz;
        return 0;
    }
    if (!from || !dst || !src || !dstsiz) {
        return -1;
    }
    unisiz = srcsiz / ucnv_getMinCharSize(from);
    unibuf = (UChar *)malloc(sizeof(UChar) * unisiz + 1);
    if (!unibuf) {
        return -1;
    }
    punibuf = unibuf;
    ucnv_toUnicode(from, &punibuf, punibuf + unisiz,
                   &src, src + srcsiz, 0, 0, &err);
    if (U_FAILURE(err)) {
        free(unibuf);
        return -1;
    }
    *punibuf = '\0';
    *dstsiz = (punibuf - unibuf) * ucnv_getMaxCharSize(to);
    *dst = (char *)malloc(*dstsiz + 1);
    if (!*dst) {
        free(unibuf);
        return -1;
    }
    pdst = *dst;
    ppunibuf = unibuf;
    ucnv_fromUnicode(to, &pdst, pdst + *dstsiz,
                     (const UChar **)&ppunibuf, punibuf, 0, 0, &err);
    free(unibuf);
    if (U_FAILURE(err)) {
        free(*dst);
        return -1;
    }
    *pdst = '\0';
    *dstsiz = pdst - *dst;
    return 0;
}
static jint NativeConverter_decode(JNIEnv* env, jclass, jlong address,
        jbyteArray source, jint sourceEnd, jcharArray target, jint targetEnd,
        jintArray data, jboolean flush) {

    UConverter* cnv = toUConverter(address);
    if (cnv == NULL) {
        maybeThrowIcuException(env, "toUConverter", U_ILLEGAL_ARGUMENT_ERROR);
        return U_ILLEGAL_ARGUMENT_ERROR;
    }
    ScopedByteArrayRO uSource(env, source);
    if (uSource.get() == NULL) {
        maybeThrowIcuException(env, "uSource", U_ILLEGAL_ARGUMENT_ERROR);
        return U_ILLEGAL_ARGUMENT_ERROR;
    }
    ScopedCharArrayRW uTarget(env, target);
    if (uTarget.get() == NULL) {
        maybeThrowIcuException(env, "uTarget", U_ILLEGAL_ARGUMENT_ERROR);
        return U_ILLEGAL_ARGUMENT_ERROR;
    }
    ScopedIntArrayRW myData(env, data);
    if (myData.get() == NULL) {
        maybeThrowIcuException(env, "myData", U_ILLEGAL_ARGUMENT_ERROR);
        return U_ILLEGAL_ARGUMENT_ERROR;
    }

    // Do the conversion.
    jint* sourceOffset = &myData[0];
    jint* targetOffset = &myData[1];
    const char* mySource = reinterpret_cast<const char*>(uSource.get() + *sourceOffset);
    const char* mySourceLimit = reinterpret_cast<const char*>(uSource.get() + sourceEnd);
    UChar* cTarget = uTarget.get() + *targetOffset;
    const UChar* cTargetLimit = uTarget.get() + targetEnd;
    UErrorCode errorCode = U_ZERO_ERROR;
    ucnv_toUnicode(cnv, &cTarget, cTargetLimit, &mySource, mySourceLimit, NULL, flush, &errorCode);
    *sourceOffset = mySource - reinterpret_cast<const char*>(uSource.get()) - *sourceOffset;
    *targetOffset = cTarget - uTarget.get() - *targetOffset;

    // If there was an error, count the problematic bytes.
    if (errorCode == U_ILLEGAL_CHAR_FOUND || errorCode == U_INVALID_CHAR_FOUND ||
        errorCode == U_TRUNCATED_CHAR_FOUND) {
        int8_t invalidByteCount = 32;
        char invalidBytes[32] = {'\0'};
        UErrorCode minorErrorCode = U_ZERO_ERROR;
        ucnv_getInvalidChars(cnv, invalidBytes, &invalidByteCount, &minorErrorCode);
        if (U_SUCCESS(minorErrorCode)) {
            myData[2] = invalidByteCount;
        }
    }

    // Managed code handles some cases; throw all other errors.
    if (shouldCodecThrow(flush, errorCode)) {
        maybeThrowIcuException(env, "ucnv_toUnicode", errorCode);
    }
    return errorCode;
}
Пример #8
0
DeprecatedString StreamingTextDecoderICU::convertUsingICU(const unsigned char* chs, int len, bool flush)
{
    // Get a converter for the passed-in encoding.
    if (!m_converterICU) {
        createICUConverter();
        if (!m_converterICU)
            return DeprecatedString();
    }

    DeprecatedString result("");
    result.reserve(len);

    UChar buffer[ConversionBufferSize];
    const char* source = reinterpret_cast<const char*>(chs);
    const char* sourceLimit = source + len;
    int32_t* offsets = NULL;
    UErrorCode err;

    do {
        UChar* target = buffer;
        const UChar* targetLimit = target + ConversionBufferSize;
        err = U_ZERO_ERROR;
        ucnv_toUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, offsets, flush, &err);
        int count = target - buffer;
        appendOmittingBOM(result, reinterpret_cast<const UChar*>(buffer), count * sizeof(UChar));
    } while (err == U_BUFFER_OVERFLOW_ERROR);

    if (U_FAILURE(err)) {
        // flush the converter so it can be reused, and not be bothered by this error.
        do {
            UChar *target = buffer;
            const UChar *targetLimit = target + ConversionBufferSize;
            err = U_ZERO_ERROR;
            ucnv_toUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, offsets, true, &err);
        } while (source < sourceLimit);
        LOG_ERROR("ICU conversion error");
        return DeprecatedString();
    }

    return result;
}
Пример #9
0
U_CAPI UBool U_EXPORT2
ucbuf_autodetect_fs(FileStream* in, const char** cp, UConverter** conv, int32_t* signatureLength, UErrorCode* error){
    char start[8];
    int32_t numRead;

    UChar target[1]={ 0 };
    UChar* pTarget;
    const char* pStart;

    /* read a few bytes */
    numRead=T_FileStream_read(in, start, sizeof(start));

    *cp = ucnv_detectUnicodeSignature(start, numRead, signatureLength, error);
    
    /* unread the bytes beyond what was consumed for U+FEFF */
    T_FileStream_rewind(in);
    if (*signatureLength > 0) {
        numRead = T_FileStream_read(in, start, *signatureLength);
    }

    if(*cp==NULL){
        *conv =NULL;
        return FALSE;
    }

    /* open the converter for the detected Unicode charset */
    *conv = ucnv_open(*cp,error);

    /* convert and ignore initial U+FEFF, and the buffer overflow */
    pTarget = target;
    pStart = start;
    ucnv_toUnicode(*conv, &pTarget, target+1, &pStart, start+*signatureLength, NULL, FALSE, error);
    *signatureLength = (int32_t)(pStart - start);
    if(*error==U_BUFFER_OVERFLOW_ERROR) {
        *error=U_ZERO_ERROR;
    }

    /* verify that we successfully read exactly U+FEFF */
    if(U_SUCCESS(*error) && (pTarget!=(target+1) || target[0]!=0xfeff)) {
        *error=U_INTERNAL_PROGRAM_ERROR;
    }


    return TRUE; 
}
Пример #10
0
static void printCString(FILE *out, UConverter *converter, const char *str, int32_t len) {
    UChar buf[256];
    const char *strEnd;

    if (len < 0) {
        len = (int32_t)uprv_strlen(str);
    }
    strEnd = str + len;

    do {
        UErrorCode err = U_ZERO_ERROR;
        UChar *bufp = buf, *bufend = buf + (sizeof(buf)/sizeof(buf[0])) - 1 ;

        ucnv_toUnicode(defaultConverter, &bufp, bufend, &str, strEnd, 0, 0, &err);
        *bufp = 0;

        printString(out, converter, buf, (int32_t)(bufp - buf));
    } while (str < strEnd);
}
Пример #11
0
/* params : desc : the document descriptor
 *          buf  : destination buffer for UTF-16 data
 * return : the length of the paragraph
 *          NO_MORE_DATA if there is no more paragraph
 *          ERR_STREAMFILE if an error occured
 *
 * reads the next paragraph and converts to UTF-16
 */
int p_read_content(struct doc_descriptor *desc, UChar *buf) {
  char *outputbuf, *src;
  UChar *dest;
  int len;
  UErrorCode err;

  len = 0;

  outputbuf = (char *) malloc(INTERNAL_BUFSIZE);

  /* reading the next paragraph */
  memset(outputbuf, '\x00', INTERNAL_BUFSIZE);
  len = parse(desc, outputbuf);
  
  if (len > 0) {
    (desc->nb_par_read) += 1;

    /* converting to UTF-16 */
    err = U_ZERO_ERROR;
    dest = buf;
    src = outputbuf;
    ucnv_toUnicode(desc->conv, &dest, dest + 2*INTERNAL_BUFSIZE,
		   &src, outputbuf + strlen(outputbuf), NULL, FALSE, &err);
    len = 2*(dest - buf);
    if (U_FAILURE(err)) {
      free(outputbuf);
      outputbuf = NULL;
      fprintf(stderr, "Unable to convert buffer\n");
      return ERR_ICU;
    }

  }

  if(outputbuf != NULL) {
    free(outputbuf);
  }

  return len;
}
Пример #12
0
static int convertUTF8toUChar(const char *src, UChar *dst, int len)
{
    static UConverter *c;
    UErrorCode status;
    UChar *p = dst;
    const char *s = src;

    if (!c) {
        status = U_ZERO_ERROR;
        c = ucnv_open("UTF-8", &status);
        if (!c) {
            fprintf(stderr, "CouchStore CollateJSON: Couldn't initialize ICU (%d)\n", (int)status);
            abort();
        }
    }

    while (len) {
        unsigned char ch = (unsigned char)(*s);
        if ((ch & 0x80)) {
            goto icu_conv;
        }
        *p++ = (UChar)(ch);
        s++;
        len--;
    }

    return p - dst;

icu_conv:
    status = U_ZERO_ERROR;
    ucnv_toUnicode(c, &p, p + len, &s, s + len, NULL, TRUE, &status);

    if (U_FAILURE(status)) {
        return -1;
    }

    return p - dst;
}
Пример #13
0
/* rewind the buf and file stream */
U_CAPI void U_EXPORT2
ucbuf_rewind(UCHARBUF* buf,UErrorCode* error){
    if(error==NULL || U_FAILURE(*error)){
        return;
    }
    if(buf){
        buf->currentPos=buf->buffer;
        buf->bufLimit=buf->buffer;
        T_FileStream_rewind(buf->in);
        buf->remaining=T_FileStream_size(buf->in)-buf->signatureLength;

        ucnv_resetToUnicode(buf->conv);
        if(buf->signatureLength>0) {
            UChar target[1]={ 0 };
            UChar* pTarget;
            char start[8];
            const char* pStart;
            int32_t numRead;

            /* read the signature bytes */
            numRead=T_FileStream_read(buf->in, start, buf->signatureLength);

            /* convert and ignore initial U+FEFF, and the buffer overflow */
            pTarget = target;
            pStart = start;
            ucnv_toUnicode(buf->conv, &pTarget, target+1, &pStart, start+numRead, NULL, FALSE, error);
            if(*error==U_BUFFER_OVERFLOW_ERROR) {
                *error=U_ZERO_ERROR;
            }

            /* verify that we successfully read exactly U+FEFF */
            if(U_SUCCESS(*error) && (numRead!=buf->signatureLength || pTarget!=(target+1) || target[0]!=0xfeff)) {
                *error=U_INTERNAL_PROGRAM_ERROR;
            }
        }
    }
}
Пример #14
0
int
getText (struct doc_descriptor *desc, UChar * buf, int size)
{
  struct meta *meta = NULL;
  char buf2[BUFSIZE];
  UErrorCode err;
  char *src;
  UChar *dest, esc[3];
  UChar name[1024], value[1024];
  int len, i, isMarkup, isJavascript, isMeta, l, j;
  int dangerousCut, fini, r, offset, endOfFile, space_added;

  space_added = 0;
  l = 0;
  fini = 0;
  endOfFile = 0;
  isJavascript = 0;
  dangerousCut = 0;
  isMarkup = 0;
  isMeta = 0;
  len = read (desc->fd, buf2, BUFSIZE);
  while (!fini && len > 0 && 2*l < size - 2)
    {

      /* consuming buffer */
      for (i = 0; 2*l < size - 2 && i < len && !dangerousCut && !fini; i++)
        {

          /* end of buffer are possible points of failure
             if a markup or a token is cut, it will not be
             parsed. */
          if (!endOfFile && i > len - 9 && (!strncmp (buf2 + i, "\x3c", 1) ||
                                            !strncmp (buf2 + i, "\x26", 1)))
            {
              dangerousCut = 1;
              break;
            }

          /* detecting end of javascript */
          if (isJavascript
              && !strncasecmp (buf2 + i, "</script>", 9))
            {
              isJavascript = 0;
              i += 9;
            }

          /* detecting new paragraph */
          if (l > 0 && !isJavascript
              && (!strncasecmp (buf2 + i, "<p", 2)
                  || !strncasecmp (buf2 + i, "<br", 3)
                  || !strncasecmp (buf2 + i, "<div", 4)))
            {
              fini = 1;
              i += 2;
              while (strncmp (buf2 + i, ">", 1))
                {
                  i++;
                }
              lseek (desc->fd, i - len, SEEK_CUR);
              break;
            }

          /* detecting begining of markup */
          if (!isJavascript && !isMarkup && !strncmp (buf2 + i, "\x3c", 1))
            {

              /* detecting begining of javascript */
              if (!strncasecmp (buf2 + i, "<script", 7))
                {
                  isJavascript = 1;

                }
              else if (!strncasecmp (buf2 + i, "<title", 6))
                {
                  err = U_ZERO_ERROR;
                  /* finding last metadata of desc */
                  if (desc->meta == NULL)
                    {
                      meta = (struct meta *) malloc (sizeof (struct meta));
                      desc->meta = meta;
                    }
                  else
                    {
                      meta = desc->meta;
                      while (meta->next != NULL)
                        {
                          meta = meta->next;
                        }
                      meta->next =
                        (struct meta *) malloc (sizeof (struct meta));
                      meta = meta->next;
                    }
                  meta->next = NULL;
                  meta->name = (UChar *) malloc (12);

                  /* filling name field */
                  meta->name_length =
                    2 * ucnv_toUChars (desc->conv, meta->name, 12, "title", 5,
                                       &err);
                  meta->name_length = u_strlen (meta->name);
                  if (U_FAILURE (err))
                    {
                      printf ("error icu\n");
                      return ERR_ICU;
                    }
                  isMeta = 1;

                }
              else if (!strncasecmp (buf2 + i, "<meta", 5))
                {
                  i += 5;
                  if (i >= size - 9)
                    {
                      strncpy (buf2, buf2 + i, len - i);
                      len =
                        read (desc->fd, buf2 + i,
                              BUFSIZE - len + i) + len - i;
                      i = 0;
                    }
                  for (; strncasecmp (buf2 + i, "name=\"", 6)  &&
                       strncmp (buf2 + i, "\x3E", 1); i++)
                    {
                      if (i >= size - 9)
                        {
                          strncpy (buf2, buf2 + i, len - i);
                          len =
                            read (desc->fd, buf2 + i,
                                  BUFSIZE - len + i) + len - i;
                          i = 0;
                        }
                    }
                  if (!strncmp (buf2 + i, "\x3E", 1))
                    {
                      continue;

                    }
                  else
                    {
                      i += 6;
                      /* get metadata name */
                      memset (name, '\x00', 2048);
                      for (j = 0; len != 0 && strncmp (buf2 + i, "\"", 1);
                           i++)
                        {
                          if (i >= size - 9)
                            {
                              strncpy (buf2, buf2 + i, len - i);
                              len =
                                read (desc->fd, buf2 + i,
                                      BUFSIZE - len + i) + len - i;
                              i = 0;
                            }
                          if (!strncmp (buf2 + i, "\x26", 1))
                            {
                              memset (esc, '\x00', 6);
                              offset = escapeChar (desc, buf2 + i, esc);
                              memcpy (name + j, esc, 2 * u_strlen (esc));
                              j += u_strlen (esc);
                              i += (offset - 1);
                            }
                          else
                            {

                              /* filling name buffer */
                              dest = name + j;
                              src = buf2 + i;
                              err = U_ZERO_ERROR;
                              ucnv_toUnicode (desc->conv, &dest, name + 1024,
                                              &src, buf2 + i + 1, NULL, FALSE,
                                              &err);
                              if (U_FAILURE (err))
                                {
                                  fprintf (stderr,
                                           "Unable to convert buffer\n");
                                  return ERR_ICU;
                                }
                              j += (dest - name - j);
                            }
                        }

                      /* get metadata value */
                      for (; strncasecmp (buf2 + i, "content=\"", 9) && strncmp (buf2 + i, "\x3E", 1); i++)
                        {
                          if (i >= size - 9)
                            {
                              strncpy (buf2, buf2 + i, len - i);
                              len =
                                read (desc->fd, buf2 + i,
                                      BUFSIZE - len + i) + len - i;
                              i = 0;
                            }
                        }
                      i += 9;
                      if (i >= size - 9)
                        {
                          strncpy (buf2, buf2 + i, len - i);
                          len =
                            read (desc->fd, buf2 + i,
                                  BUFSIZE - len + i) + len - i;
                          i = 0;
                        }
                      memset (value, '\x00', 2048);
                      for (j = 0; len != 0 && strncmp (buf2 + i, "\"", 1);
                           i++)
                        {
                          if (i >= size - 9)
                            {
                              strncpy (buf2, buf2 + i, len - i);
                              len =
                                read (desc->fd, buf2 + i,
                                      BUFSIZE - len + i) + len - i;
                              i = 0;
                            }
                          if (!strncmp (buf2 + i, "\x26", 1))
                            {
                              memset (esc, '\x00', 6);
                              offset = escapeChar (desc, buf2 + i, esc);
                              memcpy (value + j, esc, 2 * u_strlen (esc));
                              j += u_strlen (esc);
                              i += (offset - 1);
                            }
                          else
                            {

                              /* filling value buffer */
                              dest = value + j;
                              src = buf2 + i;
                              err = U_ZERO_ERROR;
                              ucnv_toUnicode (desc->conv, &dest, value + 1024,
                                              &src, buf2 + i + 1, NULL, FALSE,
                                              &err);
                              if (U_FAILURE (err))
                                {
                                  fprintf (stderr,
                                           "Unable to convert buffer\n");
                                  return ERR_ICU;
                                }
                              j += (dest - value - j);
                            }
                        }

                      /* insert metadata in list */
                      if (desc->meta == NULL)
                        {
                          meta =
                            (struct meta *) malloc (sizeof (struct meta));
                          desc->meta = meta;
                        }
                      else
                        {
                          meta = desc->meta;
                          while (meta->next != NULL)
                            {
                              meta = meta->next;
                            }
                          meta->next =
                            (struct meta *) malloc (sizeof (struct meta));
                          meta = meta->next;
                        }
                      meta->next = NULL;
                      meta->name = (UChar *) malloc (2 * u_strlen (name) + 2);
                      meta->value =
                        (UChar *) malloc (2 * u_strlen (value) + 2);
                      memset (meta->name, '\x00', 2 * u_strlen (name) + 2);
                      memset (meta->value, '\x00', 2 * u_strlen (value) + 2);
                      memcpy (meta->name, name, 2 * u_strlen (name));
                      memcpy (meta->value, value, 2 * u_strlen (value));
                      meta->name_length = u_strlen (name);
                      meta->value_length = u_strlen (value);

                      for (; strncmp (buf2 + i, "\x3E", 1); i++)
                        {
                          if (i >= size - 9)
                            {
                              strncpy (buf2, buf2 + i, len - i);
                              len =
                                read (desc->fd, buf2 + i,
                                      BUFSIZE - len + i) + len - i;
                              i = 0;
                            }
                        }
                      continue;
                    }

                }
              else
                {

                  isMarkup = 1;
                }
            }

          /* get metadata value */
          if (!isJavascript && isMeta)
            {
              for (; len != 0 && strncmp (buf2 + i, "\x3E", 1); i++)
                {
                  if (i >= size - 9)
                    {
                      strncpy (buf2, buf2 + i, len - i);
                      len =
                        read (desc->fd, buf2 + i,
                              BUFSIZE - len + i) + len - i;
                      i = 0;
                    }
                }
              i++;
              memset (value, '\x00', 2048);
              for (j = 0; len != 0 && strncmp (buf2 + i, "\x3C", 1); i++)
                {
                  if (i >= size - 9)
                    {
                      strncpy (buf2, buf2 + i, len - i);
                      len =
                        read (desc->fd, buf2 + i,
                              BUFSIZE - len + i) + len - i;
                      i = 0;
                    }
                  if (!strncmp (buf2 + i, "\x26", 1))
                    {
                      memset (esc, '\x00', 6);
                      offset = escapeChar (desc, buf2 + i, esc);
                      memcpy (value + j, esc, 2 * u_strlen (esc));
                      j += u_strlen (esc);
                      i += (offset - 1);
                    }
                  else
                    {

                      /* filling value buffer */
                      dest = value + j;
                      src = buf2 + i;
                      err = U_ZERO_ERROR;
                      ucnv_toUnicode (desc->conv, &dest, value + 1024,
                                      &src, buf2 + i + 1, NULL, FALSE, &err);
                      if (U_FAILURE (err))
                        {
                          fprintf (stderr, "Unable to convert buffer\n");
                          return ERR_ICU;
                        }
                      j += (dest - value - j);
                    }
                }
              meta->value = (UChar *) malloc (2 * (j + 1));
              memcpy (meta->value, value, 2 * u_strlen (value));
              meta->value_length = u_strlen (value);
              isMeta = 0;
              i += 7;
              continue;
            }

          /* detecting end of markup */
          if (!isJavascript && isMarkup && !strncmp (buf2 + i, "\x3e", 1))
            {
              if (!space_added && l > 0)
                {
                  buf[l] = 0x20;
                  l ++;
                  space_added = 1;
                }
              isMarkup = 0;
            }

          /* handling text */
          if (!isJavascript && !isMarkup && strncmp (buf2 + i, "\x3e", 1))
            {

              if (strncmp (buf2 + i, "\n", 1) && strncmp (buf2 + i, "\t", 1) && strncmp (buf2 + i, "\r", 1))
                {

                  /* converting tokens */
                  if (!isJavascript && !isMarkup
                      && !strncmp (buf2 + i, "\x26", 1))
                    {
                      memset (esc, '\x00', 6);
                      offset = escapeChar (desc, buf2 + i, esc);
                      if (memcmp (esc, "\x20\x00", u_strlen (esc)))
                        {
                          memcpy (buf + l, esc, 2 * u_strlen (esc));
                          l += u_strlen (esc);
                          space_added = 0;
                        }
                      else {
                        if (!space_added){
                            buf[l] = 0x20;
                            space_added = 1;
                            l++;
                        }
                      }
                      i += (offset - 1);
                    }
                  else
                    {
                      if (buf2[i] != 0x20 || !space_added){
                        /* filling output buffer */
                        dest = buf + l;
                        src = buf2 + i;
                        err = U_ZERO_ERROR;
                        ucnv_toUnicode (desc->conv, &dest, buf + size / 2,
                                        &src, buf2 + i + 1, NULL, FALSE, &err);
                        if (U_FAILURE (err))
                            {
                            fprintf (stderr, "Unable to convert buffer\n");
                            return ERR_ICU;
                            }
                        l += (dest - buf - l);
                        if (buf2[i] == 0x20) {space_added = 1;} else {space_added=0;}
                      }
                    }
                }
              else
                {
                  /* replace tabs and eol by spaces */
                  if (!space_added){
                    buf[l] = 0x20;
                    space_added = 1;
                    l++;
                  }
                }
            }
        }

      /* filling new buffer correctly */
      if (!fini)
        {
          if (dangerousCut)
            {
              r = len - i;
              strncpy (buf2, buf2 + i, r);
              len = read (desc->fd, buf2 + r, BUFSIZE - r) + r;
              if (len < 9)
                {
                  endOfFile = 1;
                }
              dangerousCut = 0;
            }
          else
            {
              len = read (desc->fd, buf2, BUFSIZE);
            }
        }

    }

  /* ending buffer properly */
  if (l > 0)
    {
      buf[l] = 0x20;
      return 2*l;
    }

  if (len == 0)
    {
      return NO_MORE_DATA;
    }

  return 2*l;
}
Пример #15
0
void charsetConverter_icu::convert
	(utility::inputStream& in, utility::outputStream& out, status* st)
{
	UErrorCode err = U_ZERO_ERROR;

	ucnv_reset(m_from);
	ucnv_reset(m_to);

	if (st)
		new (st) status();

	// From buffers
	byte_t cpInBuffer[16]; // stream data put here
	const size_t outSize = ucnv_getMinCharSize(m_from) * sizeof(cpInBuffer) * sizeof(UChar);
	std::vector <UChar> uOutBuffer(outSize); // Unicode chars end up here

	// To buffers
	// converted (char) data end up here
	const size_t cpOutBufferSz = ucnv_getMaxCharSize(m_to) * outSize;
	std::vector <char> cpOutBuffer(cpOutBufferSz);

	// Tell ICU what to do when encountering an illegal byte sequence
	if (m_options.silentlyReplaceInvalidSequences)
	{
		// Set replacement chars for when converting from Unicode to codepage
		icu::UnicodeString substString(m_options.invalidSequence.c_str());
		ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err);

		if (U_FAILURE(err))
			throw exceptions::charset_conv_error("[ICU] Error when setting substitution string.");
	}
	else
	{
		// Tell ICU top stop (and return an error) on illegal byte sequences
		ucnv_setToUCallBack
			(m_from, UCNV_TO_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);

		if (U_FAILURE(err))
			throw exceptions::charset_conv_error("[ICU] Error when setting ToU callback.");

		ucnv_setFromUCallBack
			(m_to, UCNV_FROM_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);

		if (U_FAILURE(err))
			throw exceptions::charset_conv_error("[ICU] Error when setting FromU callback.");
	}

	// Input data available
	while (!in.eof())
	{
		// Read input data into buffer
		size_t inLength = in.read(cpInBuffer, sizeof(cpInBuffer));

		// Beginning of read data
		const char* source = reinterpret_cast <const char*>(&cpInBuffer[0]);
		const char* sourceLimit = source + inLength; // end + 1

		UBool flush = in.eof();  // is this last run?

		UErrorCode toErr;

		// Loop until all source has been processed
		do
		{
			// Set up target pointers
			UChar* target = &uOutBuffer[0];
			UChar* targetLimit = &target[0] + outSize;

			toErr = U_ZERO_ERROR;
			ucnv_toUnicode(m_from, &target, targetLimit,
			               &source, sourceLimit, NULL, flush, &toErr);

			if (st)
				st->inputBytesRead += (source - reinterpret_cast <const char*>(&cpInBuffer[0]));

			if (toErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(toErr))
			{
				if (toErr == U_INVALID_CHAR_FOUND ||
				    toErr == U_TRUNCATED_CHAR_FOUND ||
				    toErr == U_ILLEGAL_CHAR_FOUND)
				{
					// Error will be thrown later (*)
				}
				else
				{
					throw exceptions::charset_conv_error("[ICU] Error converting to Unicode from " + m_source.getName());
				}
			}

			// The Unicode source is the buffer just written and the limit
			// is where the previous conversion stopped (target is moved in the conversion)
			const UChar* uSource = &uOutBuffer[0];
			UChar* uSourceLimit = &target[0];
			UErrorCode fromErr;

			// Loop until converted chars are fully written
			do
			{
				char* cpTarget = &cpOutBuffer[0];
				const char* cpTargetLimit = &cpOutBuffer[0] + cpOutBufferSz;

				fromErr = U_ZERO_ERROR;

				// Write converted bytes (Unicode) to destination codepage
				ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit,
				                 &uSource, uSourceLimit, NULL, flush, &fromErr);

				if (st)
				{
					// Decrement input bytes count by the number of input bytes in error
					char errBytes[16];
					int8_t errBytesLen = sizeof(errBytes);
					UErrorCode errBytesErr = U_ZERO_ERROR;

	 				ucnv_getInvalidChars(m_from, errBytes, &errBytesLen, &errBytesErr);

					st->inputBytesRead -= errBytesLen;
					st->outputBytesWritten += cpTarget - &cpOutBuffer[0];
				}

				// (*) If an error occurred while converting from input charset, throw it now
				if (toErr == U_INVALID_CHAR_FOUND ||
				    toErr == U_TRUNCATED_CHAR_FOUND ||
				    toErr == U_ILLEGAL_CHAR_FOUND)
				{
					throw exceptions::illegal_byte_sequence_for_charset();
				}

				if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr))
				{
					if (fromErr == U_INVALID_CHAR_FOUND ||
					    fromErr == U_TRUNCATED_CHAR_FOUND ||
					    fromErr == U_ILLEGAL_CHAR_FOUND)
					{
						throw exceptions::illegal_byte_sequence_for_charset();
					}
					else
					{
						throw exceptions::charset_conv_error("[ICU] Error converting from Unicode to " + m_dest.getName());
					}
				}

				// Write to destination stream
				out.write(&cpOutBuffer[0], (cpTarget - &cpOutBuffer[0]));

			} while (fromErr == U_BUFFER_OVERFLOW_ERROR);

		} while (toErr == U_BUFFER_OVERFLOW_ERROR);
	}
}
Пример #16
0
// ---------------------------------------------------------------------------
//  ICUTranscoder: The virtual transcoder API
// ---------------------------------------------------------------------------
unsigned int
ICUTranscoder::transcodeFrom(const  XMLByte* const          srcData
                            , const unsigned int            srcCount
                            ,       XMLCh* const            toFill
                            , const unsigned int            maxChars
                            ,       unsigned int&           bytesEaten
                            ,       unsigned char* const    charSizes)
{
    // If debugging, insure the block size is legal
    #if defined(XERCES_DEBUG)
    checkBlockSize(maxChars);
    #endif

    // Set up pointers to the start and end of the source buffer
    const XMLByte*  startSrc = srcData;
    const XMLByte*  endSrc = srcData + srcCount;

    //
    //  And now do the target buffer. This works differently according to
    //  whether XMLCh and UChar are the same size or not.
    //
    UChar* startTarget;
    if (sizeof(XMLCh) == sizeof(UChar))
        startTarget = (UChar*)toFill;
     else
        startTarget = (UChar*) getMemoryManager()->allocate
        (
            maxChars * sizeof(UChar)
        );//new UChar[maxChars];
    UChar* orgTarget = startTarget;

    //
    //  Transoode the buffer.  Buffer overflow errors are normal, occuring
    //  when the raw input buffer holds more characters than will fit in
    //  the Unicode output buffer.
    //
    UErrorCode  err = U_ZERO_ERROR;
    ucnv_toUnicode
    (
        fConverter
        , &startTarget
        , startTarget + maxChars
        , (const char**)&startSrc
        , (const char*)endSrc
        , (fFixed ? 0 : (int32_t*)fSrcOffsets)
        , false
        , &err
    );

    if ((err != U_ZERO_ERROR) && (err != U_BUFFER_OVERFLOW_ERROR))
    {
        if (orgTarget != (UChar*)toFill)
            getMemoryManager()->deallocate(orgTarget);//delete [] orgTarget;

        if (fFixed)
        {
            XMLCh tmpBuf[17];
            XMLString::binToText((unsigned int)(*startTarget), tmpBuf, 16, 16, getMemoryManager());
            ThrowXMLwithMemMgr2
            (
                TranscodingException
                , XMLExcepts::Trans_BadSrcCP
                , tmpBuf
                , getEncodingName()
                , getMemoryManager()
            );
        }
        else
        {
            ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Trans_BadSrcSeq, getMemoryManager());
        }
    }

    // Calculate the bytes eaten and store in caller's param
    bytesEaten = startSrc - srcData;

    // And the characters decoded
    const unsigned int charsDecoded = startTarget - orgTarget;

    //
    //  Translate the array of char offsets into an array of character
    //  sizes, which is what the transcoder interface semantics requires.
    //  If its fixed, then we can optimize it.
    //
    if (fFixed)
    {
        const unsigned char fillSize = (unsigned char)ucnv_getMaxCharSize(fConverter);
        memset(charSizes, fillSize, maxChars);
    }
     else
    {
        //
        //  We have to convert the series of offsets into a series of
        //  sizes. If just one char was decoded, then its the total bytes
        //  eaten. Otherwise, do a loop and subtract out each element from
        //  its previous element.
        //
        if (charsDecoded == 1)
        {
            charSizes[0] = (unsigned char)bytesEaten;
        }
         else
        {
            //  ICU does not return an extra element to allow us to figure
            //  out the last char size, so we have to compute it from the
            //  total bytes used.
            unsigned int index;
            for (index = 0; index < charsDecoded - 1; index++)
            {
                charSizes[index] = (unsigned char)(fSrcOffsets[index + 1]
                                                    - fSrcOffsets[index]);
            }
            if( charsDecoded > 0 ) {
                charSizes[charsDecoded - 1] = (unsigned char)(bytesEaten
                                              - fSrcOffsets[charsDecoded - 1]);
            }
        }
    }

    //
    //  If XMLCh and UChar are not the same size, then we need to copy over
    //  the temp buffer to the new one.
    //
    if (sizeof(UChar) != sizeof(XMLCh))
    {
        XMLCh* outPtr = toFill;
        startTarget = orgTarget;
        for (unsigned int index = 0; index < charsDecoded; index++)
            *outPtr++ = XMLCh(*startTarget++);

        // And delete the temp buffer
        getMemoryManager()->deallocate(orgTarget);//delete [] orgTarget;
    }

    // Return the chars we put into the target buffer
    return charsDecoded;
}
Пример #17
0
// Convert a file from one encoding to another
static UBool convertFile(const char *pname,
                         const char *fromcpage,
                         UConverterToUCallback toucallback,
                         const void *touctxt,
                         const char *tocpage,
                         UConverterFromUCallback fromucallback,
                         const void *fromuctxt,
                         int fallback,
                         size_t bufsz,
                         const char *translit,
                         const char *infilestr,
                         FILE * outfile, int verbose)
{
    FILE *infile;
    UBool ret = TRUE;
    UConverter *convfrom = 0;
    UConverter *convto = 0;
    UErrorCode err = U_ZERO_ERROR;
    UBool flush;
    const char *cbufp;
    char *bufp;
    char *buf = 0;

    uint32_t infoffset = 0, outfoffset = 0;   /* Where we are in the file, for error reporting. */

    const UChar *unibufbp;
    UChar *unibufp;
    UChar *unibuf = 0;
    int32_t *fromoffsets = 0, *tooffsets = 0;

    size_t rd, wr, tobufsz;

#if !UCONFIG_NO_TRANSLITERATION
    Transliterator *t = 0;      // Transliterator acting on Unicode data.
#endif
    UnicodeString u;            // String to do the transliteration.

    // Open the correct input file or connect to stdin for reading input

    if (infilestr != 0 && strcmp(infilestr, "-")) {
        infile = fopen(infilestr, "rb");
        if (infile == 0) {
            UnicodeString str1(infilestr, "");
            str1.append((UChar32) 0);
            UnicodeString str2(strerror(errno), "");
            str2.append((UChar32) 0);
            initMsg(pname);
            u_wmsg(stderr, "cantOpenInputF", str1.getBuffer(), str2.getBuffer());
            return FALSE;
        }
    } else {
        infilestr = "-";
        infile = stdin;
#ifdef WIN32
        if (setmode(fileno(stdin), O_BINARY) == -1) {
            initMsg(pname);
            u_wmsg(stderr, "cantSetInBinMode");
            return FALSE;
        }
#endif
    }

    if (verbose) {
        fprintf(stderr, "%s:\n", infilestr);
    }

#if !UCONFIG_NO_TRANSLITERATION
    // Create transliterator as needed.

    if (translit != NULL && *translit) {
        UParseError parse;
        UnicodeString str(translit), pestr;

        /* Create from rules or by ID as needed. */

        parse.line = -1;

        if (uprv_strchr(translit, ':') || uprv_strchr(translit, '>') || uprv_strchr(translit, '<') || uprv_strchr(translit, '>')) {
            t = Transliterator::createFromRules("Uconv", str, UTRANS_FORWARD, parse, err);
        } else {
            t = Transliterator::createInstance(translit, UTRANS_FORWARD, err);
        }

        if (U_FAILURE(err)) {
            str.append((UChar32) 0);
            initMsg(pname);

            if (parse.line >= 0) {
                UChar linebuf[20], offsetbuf[20];
                uprv_itou(linebuf, 20, parse.line, 10, 0);
                uprv_itou(offsetbuf, 20, parse.offset, 10, 0);
                u_wmsg(stderr, "cantCreateTranslitParseErr", str.getBuffer(),
                    u_wmsg_errorName(err), linebuf, offsetbuf);
            } else {
                u_wmsg(stderr, "cantCreateTranslit", str.getBuffer(),
                    u_wmsg_errorName(err));
            }

            if (t) {
                delete t;
                t = 0;
            }
            goto error_exit;
        }
    }
#endif

    // Create codepage converter. If the codepage or its aliases weren't
    // available, it returns NULL and a failure code. We also set the
    // callbacks, and return errors in the same way.

    convfrom = ucnv_open(fromcpage, &err);
    if (U_FAILURE(err)) {
        UnicodeString str(fromcpage, (int32_t)(uprv_strlen(fromcpage) + 1));
        initMsg(pname);
        u_wmsg(stderr, "cantOpenFromCodeset", str.getBuffer(),
            u_wmsg_errorName(err));
        goto error_exit;
    }
    ucnv_setToUCallBack(convfrom, toucallback, touctxt, 0, 0, &err);
    if (U_FAILURE(err)) {
        initMsg(pname);
        u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err));
        goto error_exit;
    }

    convto = ucnv_open(tocpage, &err);
    if (U_FAILURE(err)) {
        UnicodeString str(tocpage, (int32_t)(uprv_strlen(tocpage) + 1));
        initMsg(pname);
        u_wmsg(stderr, "cantOpenToCodeset", str.getBuffer(),
            u_wmsg_errorName(err));
        goto error_exit;
    }
    ucnv_setFromUCallBack(convto, fromucallback, fromuctxt, 0, 0, &err);
    if (U_FAILURE(err)) {
        initMsg(pname);
        u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err));
        goto error_exit;
    }
    ucnv_setFallback(convto, fallback);

    // To ensure that the buffer always is of enough size, we
    // must take the worst case scenario, that is the character in
    // the codepage that uses the most bytes and multiply it against
    // the buffer size.

    // use bufsz+1 to allow for additional BOM/signature character (U+FEFF)
    tobufsz = (bufsz+1) * ucnv_getMaxCharSize(convto);

    buf = new char[tobufsz];
    unibuf = new UChar[bufsz];

    fromoffsets = new int32_t[bufsz];
    tooffsets = new int32_t[tobufsz];

    // OK, we can convert now.

    do {
        char willexit = 0;

        rd = fread(buf, 1, bufsz, infile);
        if (ferror(infile) != 0) {
            UnicodeString str(strerror(errno));
            str.append((UChar32) 0);
            initMsg(pname);
            u_wmsg(stderr, "cantRead", str.getBuffer());
            goto error_exit;
        }

        // Convert the read buffer into the new coding
        // After the call 'unibufp' will be placed on the last
        // character that was converted in the 'unibuf'.
        // Also the 'cbufp' is positioned on the last converted
        // character.
        // At the last conversion in the file, flush should be set to
        // true so that we get all characters converted
        //
        // The converter must be flushed at the end of conversion so
        // that characters on hold also will be written.

        unibufp = unibuf;
        cbufp = buf;
        flush = rd != bufsz;
        ucnv_toUnicode(convfrom, &unibufp, unibufp + bufsz, &cbufp,
            cbufp + rd, fromoffsets, flush, &err);

        infoffset += (uint32_t)(cbufp - buf);

        if (U_FAILURE(err)) {
            char pos[32];
            sprintf(pos, "%u", infoffset - 1);
            UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1));
            initMsg(pname);
            u_wmsg(stderr, "problemCvtToU", str.getBuffer(), u_wmsg_errorName(err));
            willexit = 1;
            err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
        }

        // At the last conversion, the converted characters should be
        // equal to number of chars read.

        if (flush && !willexit && cbufp != (buf + rd)) {
            char pos[32];
            sprintf(pos, "%u", infoffset);
            UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1));
            initMsg(pname);
            u_wmsg(stderr, "premEndInput", str.getBuffer());
            willexit = 1;
        }

        // Prepare to transliterate and convert. Transliterate if needed.

#if !UCONFIG_NO_TRANSLITERATION
        if (t) {
            u.setTo(unibuf, (int32_t)(unibufp - unibuf)); // Copy into string.
            t->transliterate(u);
        } else
#endif
        {
            u.setTo(unibuf, (int32_t)(unibufp - unibuf), (int32_t)(bufsz)); // Share the buffer.
        }

        int32_t ulen = u.length();

        // Convert the Unicode buffer into the destination codepage
        // Again 'bufp' will be placed on the last converted character
        // And 'unibufbp' will be placed on the last converted unicode character
        // At the last conversion flush should be set to true to ensure that
        // all characters left get converted

        const UChar *unibufu = unibufbp = u.getBuffer();

        do {
            int32_t len = ulen > (int32_t)bufsz ? (int32_t)bufsz : ulen;

            bufp = buf;
            unibufp = (UChar *) (unibufbp + len);

            ucnv_fromUnicode(convto, &bufp, bufp + tobufsz,
                             &unibufbp,
                             unibufp,
                             tooffsets, flush, &err);

            if (U_FAILURE(err)) {
                const char *errtag;
                char pos[32];

                uint32_t erroffset =
                    dataOffset((int32_t)(bufp - buf - 1), fromoffsets, (int32_t)(bufsz), tooffsets, (int32_t)(tobufsz));
                int32_t ferroffset = (int32_t)(infoffset - (unibufp - unibufu) + erroffset);

                if ((int32_t) ferroffset < 0) {
                    ferroffset = (int32_t)(outfoffset + (bufp - buf));
                    errtag = "problemCvtFromUOut";
                } else {
                    errtag = "problemCvtFromU";
                }
                sprintf(pos, "%u", ferroffset);
                UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1));
                initMsg(pname);
                u_wmsg(stderr, errtag, str.getBuffer(),
                       u_wmsg_errorName(err));
                willexit = 1;
            }

            // At the last conversion, the converted characters should be equal to number
            // of consumed characters.
            if (flush && !willexit && unibufbp != (unibufu + (size_t) (unibufp - unibufu))) {
                char pos[32];
                sprintf(pos, "%u", infoffset);
                UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1));
                initMsg(pname);
                u_wmsg(stderr, "premEnd", str.getBuffer());
                willexit = 1;
            }

            // Finally, write the converted buffer to the output file


            rd = (size_t) (bufp - buf);
            outfoffset += (int32_t)(wr = fwrite(buf, 1, rd, outfile));
            if (wr != rd) {
                UnicodeString str(strerror(errno), "");
                initMsg(pname);
                u_wmsg(stderr, "cantWrite", str.getBuffer());
                willexit = 1;
            }

            if (willexit) {
                goto error_exit;
            }
        } while ((ulen -= (int32_t)(bufsz)) > 0);
    } while (!flush);           // Stop when we have flushed the
                                // converters (this means that it's
                                // the end of output)

    goto normal_exit;

error_exit:
    ret = FALSE;

normal_exit:
    // Cleanup.

    if (convfrom) ucnv_close(convfrom);
    if (convto) ucnv_close(convto);

#if !UCONFIG_NO_TRANSLITERATION
    if (t) delete t;
#endif

    if (buf) delete[] buf;
    if (unibuf) delete[] unibuf;

    if (fromoffsets) delete[] fromoffsets;
    if (tooffsets) delete[] tooffsets;

    if (infile != stdin) {
        fclose(infile);
    }

    return ret;
}
Пример #18
0
/**************************
* Will convert a sequence of bytes from one codepage to another.
* @param toConverterName: The name of the converter that will be used to encode the output buffer
* @param fromConverterName: The name of the converter that will be used to decode the input buffer
* @param target: Pointer to the output buffer* written
* @param targetLength: on input contains the capacity of target, on output the number of bytes copied to target
* @param source: Pointer to the input buffer
* @param sourceLength: on input contains the capacity of source, on output the number of bytes processed in "source"
* @param internal: used internally to store store state data across calls
* @param err: fills in an error status
*/
void
T_UConverter_fromCodepageToCodepage (UConverter * outConverter,
				     UConverter * inConverter,
				     char **target,
				     const char *targetLimit,
				     const char **source,
				     const char *sourceLimit,
				     int32_t* offsets,
				     int flush,
				     UErrorCode * err)
{

  UChar out_chunk[CHUNK_SIZE];
  const UChar *out_chunk_limit = out_chunk + CHUNK_SIZE;
  UChar *out_chunk_alias;
  UChar const *out_chunk_alias2;


  if (U_FAILURE (*err))    return;


  /*loops until the input buffer is completely consumed
   *or if an error has be encountered
   *first we convert from inConverter codepage to Unicode
   *then from Unicode to outConverter codepage
   */
  while ((*source != sourceLimit) && U_SUCCESS (*err))
    {
      out_chunk_alias = out_chunk;
      ucnv_toUnicode (inConverter,
		      &out_chunk_alias,
		      out_chunk_limit,
		      source,
		      sourceLimit,
		      NULL,
		      flush,
		      err);

      /*BUFFER_OVERFLOW_ERROR means that the output "CHUNK" is full
       *we will require at least another loop (it's a recoverable error)
       */

      if (U_SUCCESS (*err) || (*err == U_BUFFER_OVERFLOW_ERROR))
	{
	  *err = U_ZERO_ERROR;
	  out_chunk_alias2 = out_chunk;

	  while ((out_chunk_alias2 != out_chunk_alias) && U_SUCCESS (*err))
	    {
	      ucnv_fromUnicode (outConverter,
				target,
				targetLimit,
				&out_chunk_alias2,
				out_chunk_alias,
				NULL,
				TRUE,
				err);

	    }
	}
      else
	break;
    }

  return;
}
Пример #19
0
int32_t ucnv_toUChars (const UConverter * converter,
		       UChar * target,
		       int32_t targetSize,
		       const char *source,
		       int32_t sourceSize,
		       UErrorCode * err)
{
  const char *mySource = source;
  const char *mySource_limit = source + sourceSize;
  UConverter myConverter;
  UChar *myTarget = target;
  int32_t targetCapacity;

  if (U_FAILURE (*err))
    return 0;

  if ((converter == NULL) || (targetSize < 0) || (sourceSize < 0))
    {
      *err = U_ILLEGAL_ARGUMENT_ERROR;
      return 0;
    }
  /*Means there is no work to be done */
  if (sourceSize == 0)
    {
      /*for consistency we still need to
       *store 0 in the targetCapacity
       *if the user requires it
       */
      if (targetSize >= 1)
	{
	  target[0] = 0x0000;
	  return 1;
	}
      else
	return 0;
    }

  /*makes a local copy of the UConverter */
  myConverter = *converter;


  /*Not in pure pre-flight mode */
  if (targetSize > 0)
    {
     /*  Changed from (targetSize * 2) to (targetSize) */
      ucnv_toUnicode (&myConverter,
		      &myTarget,
		      target + (targetSize-1),	  /*Save a spot for the Null terminator */
		      &mySource,
		      mySource_limit,
		      NULL,
		      TRUE,
		      err);

      /*Null terminates the string */
      *(myTarget) = 0x0000;
    }


  /*Rigs targetCapacity to have at least one cell for zero termination */
  /*Updates targetCapacity to contain the number of bytes written to target */
  targetCapacity = 1;
  targetCapacity += myTarget - target;
  if (targetSize == 0)
    {
      *err = U_BUFFER_OVERFLOW_ERROR;
    }
  /* If the output buffer is exhausted, we need to stop writing
   * to it but if the input buffer is not exhausted,
   * we need to continue the conversion in order to store in targetSize
   * the number of bytes that was required
   */
  if (*err == U_BUFFER_OVERFLOW_ERROR)
    {
      UChar target2[CHUNK_SIZE];
      UChar *target2_alias = target2;
      const UChar *target2_limit = target2 + CHUNK_SIZE;

      /*We use a stack allocated buffer around which we loop
         (in case the output is greater than CHUNK_SIZE) */

      while (*err == U_BUFFER_OVERFLOW_ERROR)
	{
	  *err = U_ZERO_ERROR;
	  target2_alias = target2;
	  ucnv_toUnicode (&myConverter,
			  &target2_alias,
			  target2_limit,
			  &mySource,
			  mySource_limit,
			  NULL,
			  TRUE,
			  err);

	  /*updates the output parameter to contain the number of char required */
	  targetCapacity += target2_alias - target2 + 1;
	}
      (targetCapacity)--;	/*adjust for last one */
      if (U_SUCCESS (*err))
	*err = U_BUFFER_OVERFLOW_ERROR;
    }

  return targetCapacity;
}
Пример #20
0
UErrorCode convsample_40()
{
  printf("\n\n==============================================\n"
    "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");

  FILE *f;
  FILE *out;
  int32_t count;
  char inBuf[BUFFERSIZE];
  const char *source;
  const char *sourceLimit;
  UChar *uBuf;
  UChar *target;
  UChar *targetLimit;
  int32_t uBufSize = 0;
  UConverter *conv = NULL;
  UErrorCode status = U_ZERO_ERROR;
  uint32_t inbytes=0, total=0;

  f = fopen("data02.bin", "rb");
  if(!f)
  {
    fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
    return U_FILE_ACCESS_ERROR;
  }

  out = fopen("data40.utf16", "wb");
  if(!out)
  {
    fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
    fclose(f);
    return U_FILE_ACCESS_ERROR;
  }

  // **************************** START SAMPLE *******************
  conv = ucnv_openCCSID(37, UCNV_IBM, &status);
  assert(U_SUCCESS(status));

  uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
  printf("input bytes %d / min chars %d = %d UChars\n",
         BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
  uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
  assert(uBuf!=NULL);

  // grab another buffer's worth
  while((!feof(f)) && 
        ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
  {
    inbytes += count;

    // Convert bytes to unicode
    source = inBuf;
    sourceLimit = inBuf + count;
    
    do
    {
        target = uBuf;
        targetLimit = uBuf + uBufSize;
        
        ucnv_toUnicode( conv, &target, targetLimit, 
                       &source, sourceLimit, NULL,
                       feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
                                   /* is true (when no more data will come) */
                         &status);
      
        if(status == U_BUFFER_OVERFLOW_ERROR)
        {
          // simply ran out of space - we'll reset the target ptr the next
          // time through the loop.
          status = U_ZERO_ERROR;
        }
        else
        {
          //  Check other errors here.
          assert(U_SUCCESS(status));
          // Break out of the loop (by force)
        }

        // Process the Unicode
        // Todo: handle UTF-16/surrogates
        assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) ==
               (size_t)(target-uBuf));
        total += (target-uBuf);
    } while (source < sourceLimit); // while simply out of space
  }

  printf("%d bytes in,  %d UChars out.\n", inbytes, total);
  
  // ***************************** END SAMPLE ********************
  ucnv_close(conv);

  fclose(f);
  fclose(out);
  printf("\n");

  return U_ZERO_ERROR;
}
Пример #21
0
UXMLElement *
UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
    char bytes[4096], charsetBuffer[100];
    FileStream *f;
    const char *charset, *pb;
    UnicodeString src;
    UConverter *cnv;
    UChar *buffer, *pu;
    int32_t fileLength, bytesLength, length, capacity;
    UBool flush;

    if(U_FAILURE(errorCode)) {
        return NULL;
    }

    f=T_FileStream_open(filename, "rb");
    if(f==NULL) {
        errorCode=U_FILE_ACCESS_ERROR;
        return NULL;
    }

    bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
    if(bytesLength<(int32_t)sizeof(bytes)) {
        // we have already read the entire file
        fileLength=bytesLength;
    } else {
        // get the file length
        fileLength=T_FileStream_size(f);
    }

    /*
     * get the charset:
     * 1. Unicode signature
     * 2. treat as ISO-8859-1 and read XML encoding="charser"
     * 3. default to UTF-8
     */
    charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
    if(U_SUCCESS(errorCode) && charset!=NULL) {
        // open converter according to Unicode signature
        cnv=ucnv_open(charset, &errorCode);
    } else {
        // read as Latin-1 and parse the XML declaration and encoding
        cnv=ucnv_open("ISO-8859-1", &errorCode);
        if(U_FAILURE(errorCode)) {
            // unexpected error opening Latin-1 converter
            goto exit;
        }

        buffer=src.getBuffer(bytesLength);
        if(buffer==NULL) {
            // unexpected failure to reserve some string capacity
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            goto exit;
        }
        pb=bytes;
        pu=buffer;
        ucnv_toUnicode(
            cnv,
            &pu, buffer+src.getCapacity(),
            &pb, bytes+bytesLength,
            NULL, TRUE, &errorCode);
        src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
        ucnv_close(cnv);
        cnv=NULL;
        if(U_FAILURE(errorCode)) {
            // unexpected error in conversion from Latin-1
            src.remove();
            goto exit;
        }

        // parse XML declaration
        if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
            int32_t declEnd=mXMLDecl.end(errorCode);
            // go beyond <?xml
            int32_t pos=src.indexOf((UChar)x_l)+1;

            mAttrValue.reset(src);
            while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per attribute on this element.
                UnicodeString attName  = mAttrValue.group(1, errorCode);
                UnicodeString attValue = mAttrValue.group(2, errorCode);

                // Trim the quotes from the att value.  These are left over from the original regex
                //   that parsed the attribue, which couldn't conveniently strip them.
                attValue.remove(0,1);                    // one char from the beginning
                attValue.truncate(attValue.length()-1);  // and one from the end.

                if(attName==UNICODE_STRING("encoding", 8)) {
                    length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
                    charset=charsetBuffer;
                    break;
                }
                pos = mAttrValue.end(2, errorCode);
            }

            if(charset==NULL) {
                // default to UTF-8
                charset="UTF-8";
            }
            cnv=ucnv_open(charset, &errorCode);
        }
    }

    if(U_FAILURE(errorCode)) {
        // unable to open the converter
        goto exit;
    }

    // convert the file contents
    capacity=fileLength;        // estimated capacity
    src.getBuffer(capacity);
    src.releaseBuffer(0);       // zero length
    flush=FALSE;
    for(;;) {
        // convert contents of bytes[bytesLength]
        pb=bytes;
        for(;;) {
            length=src.length();
            buffer=src.getBuffer(capacity);
            if(buffer==NULL) {
                // unexpected failure to reserve some string capacity
                errorCode=U_MEMORY_ALLOCATION_ERROR;
                goto exit;
            }

            pu=buffer+length;
            ucnv_toUnicode(
                cnv, &pu, buffer+src.getCapacity(),
                &pb, bytes+bytesLength,
                NULL, FALSE, &errorCode);
            src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
            if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
                errorCode=U_ZERO_ERROR;
                capacity=(3*src.getCapacity())/2; // increase capacity by 50%
            } else {
                break;
            }
        }

        if(U_FAILURE(errorCode)) {
            break; // conversion error
        }

        if(flush) {
            break; // completely converted the file
        }

        // read next block
        bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
        if(bytesLength==0) {
            // reached end of file, convert once more to flush the converter
            flush=TRUE;
        }
    };

exit:
    ucnv_close(cnv);
    T_FileStream_close(f);

    if(U_SUCCESS(errorCode)) {
        return parse(src, errorCode);
    } else {
        return NULL;
    }
}
Пример #22
0
void charsetFilteredOutputStream_icu::writeImpl
	(const byte_t* const data, const size_t count)
{
	if (m_from == NULL || m_to == NULL)
		throw exceptions::charset_conv_error("Cannot initialize converters.");

	// Allocate buffer for Unicode chars
	const size_t uniSize = ucnv_getMinCharSize(m_from) * count * sizeof(UChar);
	std::vector <UChar> uniBuffer(uniSize);

	// Conversion loop
	UErrorCode toErr = U_ZERO_ERROR;

	const char* uniSource = reinterpret_cast <const char*>(data);
	const char* uniSourceLimit = uniSource + count;

	do
	{
		// Convert from source charset to Unicode
		UChar* uniTarget = &uniBuffer[0];
		UChar* uniTargetLimit = &uniBuffer[0] + uniSize;

		toErr = U_ZERO_ERROR;

		ucnv_toUnicode(m_from, &uniTarget, uniTargetLimit,
		               &uniSource, uniSourceLimit, NULL, /* flush */ FALSE, &toErr);

		if (U_FAILURE(toErr) && toErr != U_BUFFER_OVERFLOW_ERROR)
		{
			if (toErr == U_INVALID_CHAR_FOUND ||
			    toErr == U_TRUNCATED_CHAR_FOUND ||
			    toErr == U_ILLEGAL_CHAR_FOUND)
			{
				throw exceptions::illegal_byte_sequence_for_charset();
			}
			else
			{
				throw exceptions::charset_conv_error
					("[ICU] Error converting to Unicode from '" + m_sourceCharset.getName() + "'.");
			}
		}

		const size_t uniLength = uniTarget - &uniBuffer[0];

		// Allocate buffer for destination charset
		const size_t cpSize = ucnv_getMinCharSize(m_to) * uniLength;
		std::vector <char> cpBuffer(cpSize);

		// Convert from Unicode to destination charset
		UErrorCode fromErr = U_ZERO_ERROR;

		const UChar* cpSource = &uniBuffer[0];
		const UChar* cpSourceLimit = &uniBuffer[0] + uniLength;

		do
		{
			char* cpTarget = &cpBuffer[0];
			char* cpTargetLimit = &cpBuffer[0] + cpSize;

			fromErr = U_ZERO_ERROR;

			ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit,
							 &cpSource, cpSourceLimit, NULL, /* flush */ FALSE, &fromErr);

			if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr))
			{
				if (fromErr == U_INVALID_CHAR_FOUND ||
				    fromErr == U_TRUNCATED_CHAR_FOUND ||
				    fromErr == U_ILLEGAL_CHAR_FOUND)
				{
					throw exceptions::illegal_byte_sequence_for_charset();
				}
				else
				{
					throw exceptions::charset_conv_error
						("[ICU] Error converting from Unicode to '" + m_destCharset.getName() + "'.");
				}
			}

			const size_t cpLength = cpTarget - &cpBuffer[0];

			// Write successfully converted bytes
			m_stream.write(&cpBuffer[0], cpLength);

		} while (fromErr == U_BUFFER_OVERFLOW_ERROR);

	} while (toErr == U_BUFFER_OVERFLOW_ERROR);
}
Пример #23
0
static UBool testConvertToUnicode( const uint8_t *source, int sourcelen, const UChar *expect, int expectlen, 
               const char *codepage, UBool fallback, const int32_t *expectOffsets)
{
    UErrorCode status = U_ZERO_ERROR;
    UConverter *conv = 0;
    UChar   junkout[NEW_MAX_BUFFER]; /* FIX */
    int32_t junokout[NEW_MAX_BUFFER]; /* FIX */
    const char *src;
    const char *realSourceEnd;
    const char *srcLimit;
    UChar *targ;
    UChar *end;
    int32_t *offs;
    int i;
    UBool   checkOffsets = TRUE;
    char junk[9999];
    char offset_str[9999];
    UChar *p;
    UBool action;

    int32_t   realBufferSize;
    UChar *realBufferEnd;


    for(i=0;i<NEW_MAX_BUFFER;i++)
        junkout[i] = 0xFFFE;

    for(i=0;i<NEW_MAX_BUFFER;i++)
        junokout[i] = -1;

    setNuConvTestName(codepage, "TO");

    log_verbose("\n=========  %s\n", gNuConvTestName);

    conv = my_ucnv_open(codepage, &status);
    if(U_FAILURE(status))
    {
        log_data_err("Couldn't open converter %s\n",gNuConvTestName);
        return TRUE; /* because it has been logged */
    }

    log_verbose("Converter opened..\n");

    src = (const char *)source;
    targ = junkout;
    offs = junokout;

    realBufferSize = (sizeof(junkout)/sizeof(junkout[0]));
    realBufferEnd = junkout + realBufferSize;
    realSourceEnd = src + sourcelen;
    /*----setting the fallback routine----*/
    ucnv_setFallback (conv, fallback);
    action = ucnv_usesFallback(conv);
    if(action != fallback){
        log_err("FAIL: Error is setting fallback. Errocode=%s\n", myErrorName(status));
    }
    /*-------------------------------------*/
    if ( gOutBufferSize != realBufferSize )
      checkOffsets = FALSE;

    if( gInBufferSize != NEW_MAX_BUFFER )
      checkOffsets = FALSE;

    do
    {
        end = nct_min( targ + gOutBufferSize, realBufferEnd);
        srcLimit = nct_min(realSourceEnd, src + gInBufferSize);

        if(targ == realBufferEnd)
        {
            log_err("Error, the end would overflow the real output buffer while about to call toUnicode! tarjey=%08lx %s",targ,gNuConvTestName);
            return FALSE;
        }
        log_verbose("calling toUnicode @ %08lx to %08lx\n", targ,end);



        status = U_ZERO_ERROR;

        ucnv_toUnicode (conv,
                &targ,
                end,
                (const char **)&src,
                (const char *)srcLimit,
                checkOffsets ? offs : NULL,
                (UBool)(srcLimit == realSourceEnd), /* flush if we're at the end of hte source data */
                &status);
    } while ( (status == U_BUFFER_OVERFLOW_ERROR) || (srcLimit < realSourceEnd) ); /* while we just need another buffer */


    if(U_FAILURE(status))
    {
        log_err("Problem doing toUnicode, errcode %s %s\n", myErrorName(status), gNuConvTestName);
        return FALSE;
    }

    log_verbose("\nConversion done. %d bytes -> %d chars.\nResult :",
        sourcelen, targ-junkout);
    if(VERBOSITY)
    {

        junk[0] = 0;
        offset_str[0] = 0;

        for(p = junkout;p<targ;p++)
        {
            sprintf(junk + strlen(junk), "0x%04x, ", (0xFFFF) & (unsigned int)*p);
            sprintf(offset_str + strlen(offset_str), "0x%04x, ", (0xFFFF) & (unsigned int)junokout[p-junkout]);
        }

        log_verbose(junk);
        printUSeq(expect, expectlen);
        if ( checkOffsets )
        {
            log_verbose("\nOffsets:");
            log_verbose(offset_str);
        }
        log_verbose("\n");
    }
    ucnv_close(conv);

    log_verbose("comparing %d uchars (%d bytes)..\n",expectlen,expectlen*2);

    if (checkOffsets && (expectOffsets != 0))
    {
        if(memcmp(junokout,expectOffsets,(targ-junkout) * sizeof(int32_t)))
        {
            log_err("\n\ndid not get the expected offsets while %s \n", gNuConvTestName);
            log_err("\nGot  : ");
            for(p=junkout;p<targ;p++)
                log_err("%d, ", junokout[p-junkout]); 
            log_err("\nExpected: ");
            for(i=0; i<(targ-junkout); i++)
                log_err("%d,", expectOffsets[i]);
            log_err("");
            for(i=0; i<(targ-junkout); i++)
                log_err("0x%04X,", junkout[i]);
            log_err("");
            for(i=0; i<(src-(const char *)source); i++)
                log_err("0x%04X,", (unsigned char)source[i]);
        }
    }

    if(!memcmp(junkout, expect, expectlen*2))
    {
        log_verbose("Matches!\n");
        return TRUE;
    }
    else
    {
        log_err("String does not match. %s\n", gNuConvTestName);
        log_verbose("String does not match. %s\n", gNuConvTestName);
        printUSeqErr(junkout, expectlen);
        printf("\n");
        printUSeqErr(expect, expectlen);
        return FALSE;
    }
}
Пример #24
0
/* helper function */
static UChar* 
_strFromWCS( UChar   *dest,
             int32_t destCapacity, 
             int32_t *pDestLength,
             const wchar_t *src,
             int32_t srcLength,
             UErrorCode *pErrorCode)
{
    int32_t retVal =0, count =0 ;
    UConverter* conv = NULL;
    UChar* pTarget = NULL;
    UChar* pTargetLimit = NULL;
    UChar* target = NULL;
    
    UChar uStack [_STACK_BUFFER_CAPACITY];

    wchar_t wStack[_STACK_BUFFER_CAPACITY];
    wchar_t* pWStack = wStack;


    char cStack[_STACK_BUFFER_CAPACITY];
    int32_t cStackCap = _STACK_BUFFER_CAPACITY;
    char* pCSrc=cStack;
    char* pCSave=pCSrc;
    char* pCSrcLimit=NULL;

    const wchar_t* pSrc = src;
    const wchar_t* pSrcLimit = NULL;

    if(srcLength ==-1){
        /* if the wchar_t source is null terminated we can safely
         * assume that there are no embedded nulls, this is a fast
         * path for null terminated strings.
         */
        for(;;){
            /* convert wchars  to chars */
            retVal = uprv_wcstombs(pCSrc,src, cStackCap);
    
            if(retVal == -1){
                *pErrorCode = U_ILLEGAL_CHAR_FOUND;
                goto cleanup;
            }else if(retVal >= (cStackCap-1)){
                /* Should rarely occur */
                u_growAnyBufferFromStatic(cStack,(void**)&pCSrc,&cStackCap,
                    cStackCap * _BUFFER_CAPACITY_MULTIPLIER, 0, sizeof(char));
                pCSave = pCSrc;
            }else{
                /* converted every thing */
                pCSrc = pCSrc+retVal;
                break;
            }
        }
        
    }else{
        /* here the source is not null terminated 
         * so it may have nulls embeded and we need to
         * do some extra processing 
         */
        int32_t remaining =cStackCap;
        
        pSrcLimit = src + srcLength;

        for(;;){
            register int32_t nulLen = 0;

            /* find nulls in the string */
            while(nulLen<srcLength && pSrc[nulLen++]!=0){
            }

            if((pSrc+nulLen) < pSrcLimit){
                /* check if we have enough room in pCSrc */
                if(remaining < (nulLen * MB_CUR_MAX)){
                    /* should rarely occur */
                    int32_t len = (pCSrc-pCSave);
                    pCSrc = pCSave;
                    /* we do not have enough room so grow the buffer*/
                    u_growAnyBufferFromStatic(cStack,(void**)&pCSrc,&cStackCap,
                           _BUFFER_CAPACITY_MULTIPLIER*cStackCap+(nulLen*MB_CUR_MAX),len,sizeof(char));

                    pCSave = pCSrc;
                    pCSrc = pCSave+len;
                    remaining = cStackCap-(pCSrc - pCSave);
                }

                /* we have found a null  so convert the 
                 * chunk from begining of non-null char to null
                 */
                retVal = uprv_wcstombs(pCSrc,pSrc,remaining);

                if(retVal==-1){
                    /* an error occurred bail out */
                    *pErrorCode = U_ILLEGAL_CHAR_FOUND;
                    goto cleanup;
                }

                pCSrc += retVal+1 /* already null terminated */;

                pSrc += nulLen; /* skip past the null */
                srcLength-=nulLen; /* decrement the srcLength */
                remaining -= (pCSrc-pCSave);


            }else{
                /* the source is not null terminated and we are 
                 * end of source so we copy the source to a temp buffer
                 * null terminate it and convert wchar_ts to chars
                 */
                if(nulLen >= _STACK_BUFFER_CAPACITY){
                    /* Should rarely occcur */
                    /* allocate new buffer buffer */
                    pWStack =(wchar_t*) uprv_malloc(sizeof(wchar_t) * (nulLen + 1));
                    if(pWStack==NULL){
                        *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
                        goto cleanup;
                    }
                }
                if(nulLen>0){
                    /* copy the contents to tempStack */
                    uprv_memcpy(pWStack,pSrc,nulLen*sizeof(wchar_t));
                }
            
                /* null terminate the tempBuffer */
                pWStack[nulLen] =0 ;
            
                if(remaining < (nulLen * MB_CUR_MAX)){
                    /* Should rarely occur */
                    int32_t len = (pCSrc-pCSave);
                    pCSrc = pCSave;
                    /* we do not have enough room so grow the buffer*/
                    u_growAnyBufferFromStatic(cStack,(void**)&pCSrc,&cStackCap,
                           cStackCap+(nulLen*MB_CUR_MAX),len,sizeof(char));

                    pCSave = pCSrc;
                    pCSrc = pCSave+len;
                    remaining = cStackCap-(pCSrc - pCSave);
                }
                /* convert to chars */
                retVal = uprv_wcstombs(pCSrc,pWStack,remaining);
            
                pCSrc += retVal;
                pSrc  += nulLen;
                srcLength-=nulLen; /* decrement the srcLength */
                break;
            }
        }
    }

    /* OK..now we have converted from wchar_ts to chars now 
     * convert chars to UChars 
     */
    pCSrcLimit = pCSrc;
    pCSrc = pCSave;
    pTarget = target= dest;
    pTargetLimit = dest + destCapacity;    
    
    conv= u_getDefaultConverter(pErrorCode);
    
    if(U_FAILURE(*pErrorCode)|| conv==NULL){
        goto cleanup;
    }
    
    for(;;) {
        
        *pErrorCode = U_ZERO_ERROR;
        
        /* convert to stack buffer*/
        ucnv_toUnicode(conv,&pTarget,pTargetLimit,(const char**)&pCSrc,pCSrcLimit,NULL,(UBool)(pCSrc==pCSrcLimit),pErrorCode);
        
        /* increment count to number written to stack */
        count+= pTarget - target;
        
        if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR){
            target = uStack;
            pTarget = uStack;
            pTargetLimit = uStack + _STACK_BUFFER_CAPACITY;
        } else {
            break;
        }
        
    }
    
    if(pDestLength){
        *pDestLength =count;
    }

    u_terminateUChars(dest,destCapacity,count,pErrorCode);
    
cleanup:
 
    if(cStack != pCSave){
        uprv_free(pCSave);
    }

    if(wStack != pWStack){
        uprv_free(pWStack);
    }
    
    u_releaseDefaultConverter(conv);

    return dest;
}
Пример #25
0
void charsetFilteredOutputStream_icu::flush()
{
	if (m_from == NULL || m_to == NULL)
		throw exceptions::charset_conv_error("Cannot initialize converters.");

	// Allocate buffer for Unicode chars
	const size_t uniSize = ucnv_getMinCharSize(m_from) * 1024 * sizeof(UChar);
	std::vector <UChar> uniBuffer(uniSize);

	// Conversion loop (with flushing)
	UErrorCode toErr = U_ZERO_ERROR;

	const char* uniSource = 0;
	const char* uniSourceLimit = 0;

	do
	{
		// Convert from source charset to Unicode
		UChar* uniTarget = &uniBuffer[0];
		UChar* uniTargetLimit = &uniBuffer[0] + uniSize;

		toErr = U_ZERO_ERROR;

		ucnv_toUnicode(m_from, &uniTarget, uniTargetLimit,
		               &uniSource, uniSourceLimit, NULL, /* flush */ TRUE, &toErr);

		if (U_FAILURE(toErr) && toErr != U_BUFFER_OVERFLOW_ERROR)
		{
			throw exceptions::charset_conv_error
				("[ICU] Error converting to Unicode from '" + m_sourceCharset.getName() + "'.");
		}

		const size_t uniLength = uniTarget - &uniBuffer[0];

		// Allocate buffer for destination charset
		const size_t cpSize = ucnv_getMinCharSize(m_to) * uniLength;
		std::vector <char> cpBuffer(cpSize);

		// Convert from Unicode to destination charset
		UErrorCode fromErr = U_ZERO_ERROR;

		const UChar* cpSource = &uniBuffer[0];
		const UChar* cpSourceLimit = &uniBuffer[0] + uniLength;

		do
		{
			char* cpTarget = &cpBuffer[0];
			char* cpTargetLimit = &cpBuffer[0] + cpSize;

			fromErr = U_ZERO_ERROR;

			ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit,
							 &cpSource, cpSourceLimit, NULL, /* flush */ TRUE, &fromErr);

			if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr))
			{
				throw exceptions::charset_conv_error
					("[ICU] Error converting from Unicode to '" + m_destCharset.getName() + "'.");
			}

			const size_t cpLength = cpTarget - &cpBuffer[0];

			// Write successfully converted bytes
			m_stream.write(&cpBuffer[0], cpLength);

		} while (fromErr == U_BUFFER_OVERFLOW_ERROR);

	} while (toErr == U_BUFFER_OVERFLOW_ERROR);

	m_stream.flush();
}
Пример #26
0
CF_PRIVATE CFIndex __CFStringEncodingICUToUnicode(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, CFIndex *usedByteLen, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
    UConverter *converter;
    UErrorCode errorCode = U_ZERO_ERROR;
    const char *source = (const char *)bytes;
    const char *sourceLimit = source + numBytes;
    UTF16Char *destination = characters;
    const UTF16Char *destinationLimit = destination + maxCharLen;
    bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false);
    CFIndex status;

    if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, true))) return kCFStringEncodingConverterUnavailable;

    if (0 == maxCharLen) {
        UTF16Char buffer[MAX_BUFFER_SIZE];
        CFIndex totalLength = 0;
        
        while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) {
            destination = buffer;
            destinationLimit = destination + MAX_BUFFER_SIZE;
            
            ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode);
            
            totalLength += (destination - buffer);
            
            if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR;
        }
        
        if (NULL != usedCharLen) *usedCharLen = totalLength;
    } else {
        ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode);

        if (NULL != usedCharLen) *usedCharLen = destination - characters;
    }

    status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream));

    if (NULL != usedByteLen) {
#if HAS_ICU_BUG_6024743
	/* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_toUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */
	if (kCFStringEncodingInvalidInputStream == status) {
#define MAX_ERROR_BUFFER_LEN (32)
	    char errorBuffer[MAX_ERROR_BUFFER_LEN];
	    int8_t errorLength = MAX_ERROR_BUFFER_LEN;
#undef MAX_ERROR_BUFFER_LEN

	    errorCode = U_ZERO_ERROR;
	    
	    ucnv_getInvalidChars(converter, errorBuffer, &errorLength, &errorCode);
	    
	    if (U_ZERO_ERROR == errorCode) {
#if HAS_ICU_BUG_6025527
                // Another ICU oddness here. ucnv_getInvalidUChars() writes the '\0' terminator, and errorLength includes the extra byte.
                if ((errorLength > 0) && ('\0' == errorBuffer[errorLength - 1])) --errorLength;
#endif
		source -= errorLength;
	    } else {
		// Gah, something is terribly wrong. Reset everything
		source = (const char *)bytes; // 0 length
		if (NULL != usedCharLen) *usedCharLen = 0;
	    }
	}
#endif

	*usedByteLen = source - (const char *)bytes;
    }
    
    status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status);

    return status;
}
Пример #27
0
/* private function used for buffering input */
void
ufile_fill_uchar_buffer(UFILE *f)
{
    UErrorCode  status;
    const char  *mySource;
    const char  *mySourceEnd;
    UChar       *myTarget;
    int32_t     bufferSize;
    int32_t     maxCPBytes;
    int32_t     bytesRead;
    int32_t     availLength;
    int32_t     dataSize;
    char        charBuffer[UFILE_CHARBUFFER_SIZE];
    u_localized_string *str;

    if (f->fFile == NULL) {
        /* There is nothing to do. It's a string. */
        return;
    }

    str = &f->str;
    dataSize = (int32_t)(str->fLimit - str->fPos);
    if (f->fFileno == 0 && dataSize > 0) {
        /* Don't read from stdin too many times. There is still some data. */
        return;
    }

    /* shift the buffer if it isn't empty */
    if(dataSize != 0) {
        uprv_memmove(f->fUCBuffer, str->fPos, dataSize * sizeof(UChar));
    }


    /* record how much buffer space is available */
    availLength = UFILE_UCHARBUFFER_SIZE - dataSize;

    /* Determine the # of codepage bytes needed to fill our UChar buffer */
    /* weiv: if converter is NULL, we use invariant converter with charwidth = 1)*/
    maxCPBytes = availLength / (f->fConverter!=NULL?(2*ucnv_getMinCharSize(f->fConverter)):1);

    /* Read in the data to convert */
    if (f->fFileno == 0) {
        /* Special case. Read from stdin one line at a time. */
        char *retStr = fgets(charBuffer, ufmt_min(maxCPBytes, UFILE_CHARBUFFER_SIZE), f->fFile);
        bytesRead = (int32_t)(retStr ? uprv_strlen(charBuffer) : 0);
    }
    else {
        /* A normal file */
        bytesRead = (int32_t)fread(charBuffer,
                                   sizeof(char),
                                   ufmt_min(maxCPBytes, UFILE_CHARBUFFER_SIZE),
                                   f->fFile);
    }

    /* Set up conversion parameters */
    status      = U_ZERO_ERROR;
    mySource    = charBuffer;
    mySourceEnd = charBuffer + bytesRead;
    myTarget    = f->fUCBuffer + dataSize;
    bufferSize  = UFILE_UCHARBUFFER_SIZE;

    if(f->fConverter != NULL) { /* We have a valid converter */
        /* Perform the conversion */
        ucnv_toUnicode(f->fConverter,
                       &myTarget,
                       f->fUCBuffer + bufferSize,
                       &mySource,
                       mySourceEnd,
                       NULL,
                       (UBool)(feof(f->fFile) != 0),
                       &status);

    } else { /*weiv: do the invariant conversion */
        u_charsToUChars(mySource, myTarget, bytesRead);
        myTarget += bytesRead;
    }

    /* update the pointers into our array */
    str->fPos    = str->fBuffer;
    str->fLimit  = myTarget;
}
Пример #28
0
UErrorCode convsample_05()
{
  printf("\n\n==============================================\n"
         "Sample 05: C: count the number of letters in a UTF-8 document\n");

  FILE *f;
  int32_t count;
  char inBuf[BUFFERSIZE];
  const char *source;
  const char *sourceLimit;
  UChar *uBuf;
  UChar *target;
  UChar *targetLimit;
  UChar *p;
  int32_t uBufSize = 0;
  UConverter *conv;
  UErrorCode status = U_ZERO_ERROR;
  uint32_t letters=0, total=0;

  f = fopen("data01.txt", "r");
  if(!f)
  {
    fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
    return U_FILE_ACCESS_ERROR;
  }

  // **************************** START SAMPLE *******************
  conv = ucnv_open("utf-8", &status);
  assert(U_SUCCESS(status));

  uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
  printf("input bytes %d / min chars %d = %d UChars\n",
         BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
  uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
  assert(uBuf!=NULL);

  // grab another buffer's worth
  while((!feof(f)) && 
        ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
  {
    // Convert bytes to unicode
    source = inBuf;
    sourceLimit = inBuf + count;
    
    do
    {
        target = uBuf;
        targetLimit = uBuf + uBufSize;
        
        ucnv_toUnicode(conv, &target, targetLimit, 
                       &source, sourceLimit, NULL,
                       feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
                                   /* is true (when no more data will come) */
                       &status);
      
        if(status == U_BUFFER_OVERFLOW_ERROR)
        {
          // simply ran out of space - we'll reset the target ptr the next
          // time through the loop.
          status = U_ZERO_ERROR;
        }
        else
        {
          //  Check other errors here.
          assert(U_SUCCESS(status));
          // Break out of the loop (by force)
        }

        // Process the Unicode
        // Todo: handle UTF-16/surrogates

        for(p = uBuf; p<target; p++)
        {
          if(u_isalpha(*p))
            letters++;
          total++;
        }
    } while (source < sourceLimit); // while simply out of space
  }

  printf("%d letters out of %d total UChars.\n", letters, total);
  
  // ***************************** END SAMPLE ********************
  ucnv_close(conv);

  printf("\n");

  fclose(f);

  return U_ZERO_ERROR;
}
Пример #29
0
/* private function used for buffering input */
void
ufile_fill_uchar_buffer(UFILE *f)
{
    UErrorCode         status;
    const char        *mySource;
    const char        *mySourceEnd;
    UChar            *myTarget;
    int32_t        bufferSize;
    int32_t        maxCPBytes;
    int32_t        bytesRead;
    int32_t        availLength;
    int32_t        dataSize;


    /* shift the buffer if it isn't empty */
    dataSize = (int32_t)(f->fUCLimit - f->fUCPos);
    if(dataSize != 0) {
        memmove(f->fUCBuffer,
            f->fUCPos,
            dataSize * sizeof(UChar));
    }


    /* record how much buffer space is available */
    availLength = UFILE_UCHARBUFFER_SIZE - dataSize;

    /* Determine the # of codepage bytes needed to fill our UChar buffer */
    /* weiv: if converter is NULL, we use invariant converter with charwidth = 1)*/
    maxCPBytes = availLength / (f->fConverter!=NULL?(2*ucnv_getMinCharSize(f->fConverter)):1);

    /* Read in the data to convert */
    bytesRead = (int32_t)fread(f->fCharBuffer,
        sizeof(char),
        ufmt_min(maxCPBytes, UFILE_CHARBUFFER_SIZE),
        f->fFile);

    /* Set up conversion parameters */
    status      = U_ZERO_ERROR;
    mySource    = f->fCharBuffer;
    mySourceEnd = f->fCharBuffer + bytesRead;
    myTarget    = f->fUCBuffer + dataSize;
    bufferSize  = UFILE_UCHARBUFFER_SIZE;

    if(f->fConverter != NULL) { /* We have a valid converter */
        /* Perform the conversion */
        ucnv_toUnicode(f->fConverter,
            &myTarget,
            f->fUCBuffer + bufferSize,
            &mySource,
            mySourceEnd,
            NULL,
            (UBool)(feof(f->fFile) != 0),
            &status);

    } else { /*weiv: do the invariant conversion */
        u_charsToUChars(mySource, myTarget, bytesRead);
        myTarget += bytesRead;
    }

    /* update the pointers into our array */
    f->fUCPos    = f->fUCBuffer;
    f->fUCLimit     = myTarget;
}
Пример #30
0
/* fill the uchar buffer */
static UCHARBUF*
ucbuf_fillucbuf( UCHARBUF* buf,UErrorCode* error){
    UChar* pTarget=NULL;
    UChar* target=NULL;
    const char* source=NULL;
    char  carr[MAX_IN_BUF] = {'\0'};
    char* cbuf =  carr;
    int32_t inputRead=0;
    int32_t outputWritten=0;
    int32_t offset=0;
    const char* sourceLimit =NULL;
    int32_t cbufSize=0;
    pTarget = buf->buffer;
    /* check if we arrived here without exhausting the buffer*/
    if(buf->currentPos<buf->bufLimit){
        offset = (int32_t)(buf->bufLimit-buf->currentPos);
        memmove(buf->buffer,buf->currentPos,offset* sizeof(UChar));
    }

#if DEBUG
    memset(pTarget+offset,0xff,sizeof(UChar)*(MAX_IN_BUF-offset));
#endif
    if(buf->isBuffered){
        cbufSize = MAX_IN_BUF;
        /* read the file */
        inputRead=T_FileStream_read(buf->in,cbuf,cbufSize-offset);
        buf->remaining-=inputRead;
        
    }else{
        cbufSize = T_FileStream_size(buf->in);
        cbuf = (char*)uprv_malloc(cbufSize);
        if (cbuf == NULL) {
        	*error = U_MEMORY_ALLOCATION_ERROR;
        	return NULL;
        }
        inputRead= T_FileStream_read(buf->in,cbuf,cbufSize);
        buf->remaining-=inputRead;
    }

    /* just to be sure...*/
    if ( 0 == inputRead )
       buf->remaining = 0;

    target=pTarget;
    /* convert the bytes */
    if(buf->conv){
        /* set the callback to stop */
        UConverterToUCallback toUOldAction ;
        void* toUOldContext;
        void* toUNewContext=NULL;
        ucnv_setToUCallBack(buf->conv,
           UCNV_TO_U_CALLBACK_STOP,
           toUNewContext,
           &toUOldAction,
           (const void**)&toUOldContext,
           error);
        /* since state is saved in the converter we add offset to source*/
        target = pTarget+offset;
        source = cbuf;
        sourceLimit = source + inputRead;
        ucnv_toUnicode(buf->conv,&target,target+(buf->bufCapacity-offset),
                        &source,sourceLimit,NULL,
                        (UBool)(buf->remaining==0),error);

        if(U_FAILURE(*error)){
            char context[CONTEXT_LEN+1];
            char preContext[CONTEXT_LEN+1];
            char postContext[CONTEXT_LEN+1];
            int8_t len = CONTEXT_LEN;
            int32_t start=0;
            int32_t stop =0;
            int32_t pos =0;
            /* use erro1 to preserve the error code */
            UErrorCode error1 =U_ZERO_ERROR;
            
            if( buf->showWarning==TRUE){
                fprintf(stderr,"\n###WARNING: Encountered abnormal bytes while"
                               " converting input stream to target encoding: %s\n",
                               u_errorName(*error));
            }


            /* now get the context chars */
            ucnv_getInvalidChars(buf->conv,context,&len,&error1);
            context[len]= 0 ; /* null terminate the buffer */

            pos = (int32_t)(source - cbuf - len);

            /* for pre-context */
            start = (pos <=CONTEXT_LEN)? 0 : (pos - (CONTEXT_LEN-1));
            stop  = pos-len;

            memcpy(preContext,cbuf+start,stop-start);
            /* null terminate the buffer */
            preContext[stop-start] = 0;

            /* for post-context */
            start = pos+len;
            stop  = (int32_t)(((pos+CONTEXT_LEN)<= (sourceLimit-cbuf) )? (pos+(CONTEXT_LEN-1)) : (sourceLimit-cbuf));

            memcpy(postContext,source,stop-start);
            /* null terminate the buffer */
            postContext[stop-start] = 0;

            if(buf->showWarning ==TRUE){
                /* print out the context */
                fprintf(stderr,"\tPre-context: %s\n",preContext);
                fprintf(stderr,"\tContext: %s\n",context);
                fprintf(stderr,"\tPost-context: %s\n", postContext);
            }

            /* reset the converter */
            ucnv_reset(buf->conv);

            /* set the call back to substitute
             * and restart conversion
             */
            ucnv_setToUCallBack(buf->conv,
               UCNV_TO_U_CALLBACK_SUBSTITUTE,
               toUNewContext,
               &toUOldAction,
               (const void**)&toUOldContext,
               &error1);

            /* reset source and target start positions */
            target = pTarget+offset;
            source = cbuf;

            /* re convert */
            ucnv_toUnicode(buf->conv,&target,target+(buf->bufCapacity-offset),
                            &source,sourceLimit,NULL,
                            (UBool)(buf->remaining==0),&error1);

        }
        outputWritten = (int32_t)(target - pTarget);


#if DEBUG
        {
            int i;
            target = pTarget;
            for(i=0;i<numRead;i++){
              /*  printf("%c", (char)(*target++));*/
            }
        }
#endif

    }else{
        u_charsToUChars(cbuf,target+offset,inputRead);
        outputWritten=((buf->remaining>cbufSize)? cbufSize:inputRead+offset);
    }
    buf->currentPos = pTarget;
    buf->bufLimit=pTarget+outputWritten;
    *buf->bufLimit=0; /*NUL terminate*/
    if(cbuf!=carr){
        uprv_free(cbuf);
    }
    return buf;
}