/**
 * Converts a default-codepage string into UTF-16, strncpy-style.
 *
 * At most u_astrnlen(s2, n) source bytes are read and at most n UChars are
 * written to ucs1.  A terminating zero is stored only when the output did not
 * fill all n slots.  If the default converter cannot be obtained, or the
 * conversion fails with anything other than U_BUFFER_OVERFLOW_ERROR, ucs1[0]
 * is set to zero.
 *
 * @param ucs1 destination buffer (room for n UChars)
 * @param s2   source string in the default codepage
 * @param n    maximum number of units to read / write
 * @return ucs1
 */
U_CAPI UChar* U_EXPORT2
u_uastrncpy(UChar *ucs1, const char *s2, int32_t n)
{
    UChar *dest = ucs1;
    UChar *destLimit = ucs1 + n;
    UErrorCode status = U_ZERO_ERROR;
    UConverter *converter = u_getDefaultConverter(&status);

    if (U_FAILURE(status) || converter == NULL) {
        /* no converter available: hand back an empty string */
        *ucs1 = 0;
        return ucs1;
    }

    ucnv_reset(converter);
    ucnv_toUnicode(converter,
                   &dest, destLimit,
                   &s2, s2 + u_astrnlen(s2, n),
                   NULL, TRUE, &status);
    ucnv_reset(converter); /* leave the shared converter in a clean state */
    u_releaseDefaultConverter(converter);

    if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) {
        *ucs1 = 0; /* real conversion failure */
    }
    if (dest < destLimit) {
        /* An overflow is not treated as an error here; it only means that
         * there is no room left for the terminator. */
        *dest = 0;
    }
    return ucs1;
}
void convsample_50() { printf("\n\n==============================================\n" "Sample 50: C: ucnv_detectUnicodeSignature\n"); //! [ucnv_detectUnicodeSignature] UErrorCode err = U_ZERO_ERROR; UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */ char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' }; int32_t signatureLength = 0; const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err); UConverter *conv = NULL; UChar output[100]; UChar *target = output, *out; const char *source = input; if(encoding!=NULL && U_SUCCESS(err)){ // should signature be discarded ? conv = ucnv_open(encoding, &err); // do the conversion ucnv_toUnicode(conv, &target, output + UPRV_LENGTHOF(output), &source, input + sizeof(input), NULL, TRUE, &err); out = output; if (discardSignature){ ++out; // ignore initial U+FEFF } while(out != target) { printf("%04x ", *out++); } puts(""); } //! [ucnv_detectUnicodeSignature] puts(""); }
/*!
    Decodes \a length bytes starting at \a chars into a QString using the ICU
    converter obtained from getConverter(\a state).

    The output string starts with room for length + 2 code units and is
    doubled each time the converter stops before all input has been consumed.
    Any ICU failure other than U_BUFFER_OVERFLOW_ERROR is logged and ends the
    conversion; the units counted before the failing call are returned.
    When no \a state is supplied, the converter is closed before returning.
*/
QString QIcuCodec::convertToUnicode(const char *chars, int length, QTextCodec::ConverterState *state) const
{
    UConverter *converter = getConverter(state);

    QString result(length + 2, Qt::Uninitialized);
    const char *const inputEnd = chars + length;
    int written = 0;

    for (;;) {
        UChar *const base = (UChar *)result.data();
        UChar *const limit = base + result.length();
        UChar *cursor = base + written;

        UErrorCode status = U_ZERO_ERROR;
        ucnv_toUnicode(converter, &cursor, limit, &chars, inputEnd, 0, false, &status);
        if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
            qDebug() << "convertToUnicode failed:" << u_errorName(status);
            break;
        }

        written = cursor - base;
        if (chars >= inputEnd)
            break;

        // Input left over: the output buffer was too small, so grow and retry.
        result.resize(result.length() * 2);
    }

    result.resize(written);

    // A converter without caller-owned state is not kept by anyone else.
    if (!state)
        ucnv_close(converter);

    return result;
}
// Runs one ucnv_toUnicode() pass with m_converterICU, writing into
// [target, targetLimit) and advancing |source| past the consumed bytes.
// |err| is reset to U_ZERO_ERROR before the call and carries the ICU status
// afterwards.  Returns the number of UChars written by this pass.
int TextCodecICU::decodeToBuffer(UChar* target, UChar* targetLimit, const char*& source, const char* sourceLimit, int32_t* offsets, bool flush, UErrorCode& err)
{
    UChar* const begin = target;
    UChar* cursor = target;

    err = U_ZERO_ERROR;
    ucnv_toUnicode(m_converterICU, &cursor, targetLimit, &source, sourceLimit, offsets, flush, &err);

    return cursor - begin;
}
/**
 * Converts a default-codepage string to UTF-16 using the shared default
 * converter.
 *
 * @param s      source bytes
 * @param sSize  source size; the conversion reads sSize - 1 bytes.  When
 *               sSize <= 0 it is replaced by strlen(s) + 1.
 * @param target destination buffer; when it is 0 no conversion is performed
 * @param tSize  number of UChars the conversion may write to target
 * @return target, or 0 when the default converter could not be obtained
 *
 * NOTE(review): the terminating zero is stored at the position where the
 * conversion stopped, which is target + tSize when the output fills the
 * buffer - callers appear to need room for one extra UChar; confirm.
 */
UChar*
ufmt_defaultCPToUnicode(const char *s, int32_t sSize, UChar *target, int32_t tSize)
{
    UErrorCode status = U_ZERO_ERROR;
    UConverter *converter = u_getDefaultConverter(&status);

    if (U_FAILURE(status) || converter == 0) {
        return 0;
    }

    if (sSize <= 0) {
        sSize = uprv_strlen(s) + 1;
    }

    if (target != 0) {
        UChar *cursor = target;

        /* convert everything in a single call */
        ucnv_toUnicode(converter,
                       &cursor, target + tSize,
                       &s, s + sSize - 1,
                       NULL, TRUE, &status);

        /* terminate the output */
        *cursor = 0x0000;
    }

    u_releaseDefaultConverter(converter);
    return target;
}
/*
 * Transcode srcsiz bytes at src from the charset of `from` to the charset of
 * `to`, going through an intermediate UTF-16 buffer.
 *
 *   to      target converter; when NULL the input is copied unchanged
 *   from    source converter (required when `to` is non-NULL)
 *   dst     receives a malloc()ed, NUL-terminated result owned by the caller
 *   dstsiz  receives the result length in bytes (terminator excluded)
 *   src     input bytes
 *   srcsiz  number of input bytes; 0 is rejected
 *
 * Returns 0 on success and -1 on invalid arguments, allocation failure or
 * ICU conversion failure.  After a failure in the final conversion step *dst
 * is reset to NULL so no freed pointer is handed back.
 */
inline int mod_websocket_conv(UConverter *to, UConverter *from,
                              char **dst, size_t *dstsiz,
                              const char *src, size_t srcsiz) {
    UErrorCode err = U_ZERO_ERROR;
    size_t unisiz;
    UChar *unibuf, *punibuf, *ppunibuf;
    char *pdst;

    /* Validate up front: the pass-through branch below dereferences these. */
    if (!dst || !dstsiz || !src || srcsiz == 0) {
        return -1;
    }

    /* No target converter: plain copy plus terminator. */
    if (!to) {
        *dst = (char *)malloc(srcsiz + 1);
        if (*dst == NULL) {
            return -1;
        }
        memcpy(*dst, src, srcsiz);
        (*dst)[srcsiz] = '\0';
        *dstsiz = srcsiz;
        return 0;
    }
    if (!from) {
        return -1;
    }

    /* Step 1: source charset -> UTF-16. */
    unisiz = srcsiz / ucnv_getMinCharSize(from);
    /* unisiz units plus one whole UChar for the terminator written below */
    unibuf = (UChar *)malloc(sizeof(UChar) * (unisiz + 1));
    if (!unibuf) {
        return -1;
    }
    punibuf = unibuf;
    ucnv_toUnicode(from, &punibuf, punibuf + unisiz,
                   &src, src + srcsiz, 0, 0, &err);
    if (U_FAILURE(err)) {
        free(unibuf);
        return -1;
    }
    *punibuf = '\0';

    /* Step 2: UTF-16 -> target charset, sized for the worst case. */
    *dstsiz = (punibuf - unibuf) * ucnv_getMaxCharSize(to);
    *dst = (char *)malloc(*dstsiz + 1);
    if (!*dst) {
        free(unibuf);
        return -1;
    }
    pdst = *dst;
    ppunibuf = unibuf;
    ucnv_fromUnicode(to, &pdst, pdst + *dstsiz,
                     (const UChar **)&ppunibuf, punibuf, 0, 0, &err);
    free(unibuf);
    if (U_FAILURE(err)) {
        free(*dst);
        *dst = NULL;
        return -1;
    }
    *pdst = '\0';
    *dstsiz = pdst - *dst;
    return 0;
}
// JNI entry point: decodes bytes from |source| into UTF-16 units in |target|
// with the ICU converter identified by |address|.
//
// |data| is an int array used for in/out bookkeeping:
//   data[0]  in: start offset into source;  out: number of bytes consumed
//   data[1]  in: start offset into target;  out: number of chars produced
//   data[2]  out: count of invalid bytes, written only for illegal / invalid /
//            truncated input when ucnv_getInvalidChars succeeds
//
// Returns the ICU error code of the conversion.  U_ILLEGAL_ARGUMENT_ERROR is
// returned (after maybeThrowIcuException) when the converter or any of the
// arrays is unavailable.
static jint NativeConverter_decode(JNIEnv* env, jclass, jlong address,
        jbyteArray source, jint sourceEnd, jcharArray target, jint targetEnd,
        jintArray data, jboolean flush) {

    UConverter* converter = toUConverter(address);
    if (converter == NULL) {
        maybeThrowIcuException(env, "toUConverter", U_ILLEGAL_ARGUMENT_ERROR);
        return U_ILLEGAL_ARGUMENT_ERROR;
    }

    ScopedByteArrayRO inputBytes(env, source);
    if (inputBytes.get() == NULL) {
        maybeThrowIcuException(env, "uSource", U_ILLEGAL_ARGUMENT_ERROR);
        return U_ILLEGAL_ARGUMENT_ERROR;
    }

    ScopedCharArrayRW outputChars(env, target);
    if (outputChars.get() == NULL) {
        maybeThrowIcuException(env, "uTarget", U_ILLEGAL_ARGUMENT_ERROR);
        return U_ILLEGAL_ARGUMENT_ERROR;
    }

    ScopedIntArrayRW state(env, data);
    if (state.get() == NULL) {
        maybeThrowIcuException(env, "myData", U_ILLEGAL_ARGUMENT_ERROR);
        return U_ILLEGAL_ARGUMENT_ERROR;
    }

    // Do the conversion, starting at the offsets supplied in data[0]/data[1].
    jint& sourcePos = state[0];
    jint& targetPos = state[1];

    const char* const inputBase = reinterpret_cast<const char*>(inputBytes.get());
    const char* const inputStart = inputBase + sourcePos;
    const char* const inputLimit = inputBase + sourceEnd;
    const char* inputCursor = inputStart;

    UChar* const outputStart = outputChars.get() + targetPos;
    const UChar* const outputLimit = outputChars.get() + targetEnd;
    UChar* outputCursor = outputStart;

    UErrorCode status = U_ZERO_ERROR;
    ucnv_toUnicode(converter, &outputCursor, outputLimit, &inputCursor, inputLimit, NULL, flush, &status);

    // Report progress relative to the starting offsets.
    sourcePos = inputCursor - inputStart;
    targetPos = outputCursor - outputStart;

    // For malformed input, tell the caller how many bytes were at fault.
    const bool malformedInput = status == U_ILLEGAL_CHAR_FOUND
            || status == U_INVALID_CHAR_FOUND
            || status == U_TRUNCATED_CHAR_FOUND;
    if (malformedInput) {
        int8_t invalidByteCount = 32;
        char invalidBytes[32] = {'\0'};
        UErrorCode minorStatus = U_ZERO_ERROR;
        ucnv_getInvalidChars(converter, invalidBytes, &invalidByteCount, &minorStatus);
        if (U_SUCCESS(minorStatus)) {
            state[2] = invalidByteCount;
        }
    }

    // Managed code handles some cases; everything else becomes an exception.
    if (shouldCodecThrow(flush, status)) {
        maybeThrowIcuException(env, "ucnv_toUnicode", status);
    }
    return status;
}
// Decodes |len| bytes at |chs| with the ICU converter (created on first use)
// and returns the decoded text.  The input is converted through a fixed
// stack buffer of ConversionBufferSize UChars; each chunk is appended via
// appendOmittingBOM().  If the converter cannot be created, or ICU reports a
// failure, an empty DeprecatedString is returned.
DeprecatedString StreamingTextDecoderICU::convertUsingICU(const unsigned char* chs, int len, bool flush)
{
    // Get a converter for the passed-in encoding.
    if (!m_converterICU) {
        createICUConverter();
        if (!m_converterICU)
            return DeprecatedString();
    }

    DeprecatedString result("");
    result.reserve(len);

    UChar buffer[ConversionBufferSize];
    const UChar* const bufferLimit = buffer + ConversionBufferSize;
    const char* source = reinterpret_cast<const char*>(chs);
    const char* const sourceLimit = source + len;
    int32_t* const noOffsets = NULL;
    UErrorCode err;

    // Keep converting for as long as ICU reports that the chunk buffer filled up.
    for (;;) {
        UChar* target = buffer;
        err = U_ZERO_ERROR;
        ucnv_toUnicode(m_converterICU, &target, bufferLimit, &source, sourceLimit, noOffsets, flush, &err);
        const int count = target - buffer;
        appendOmittingBOM(result, reinterpret_cast<const UChar*>(buffer), count * sizeof(UChar));
        if (err != U_BUFFER_OVERFLOW_ERROR)
            break;
    }

    if (!U_FAILURE(err))
        return result;

    // Push the remaining input through with flush set so the converter can
    // be reused without being affected by this error.
    for (;;) {
        UChar* target = buffer;
        err = U_ZERO_ERROR;
        ucnv_toUnicode(m_converterICU, &target, bufferLimit, &source, sourceLimit, noOffsets, true, &err);
        if (!(source < sourceLimit))
            break;
    }

    LOG_ERROR("ICU conversion error");
    return DeprecatedString();
}
/**
 * Detects a Unicode signature (BOM) at the start of a file stream and opens
 * the matching converter.
 *
 * Up to 8 bytes are read and passed to ucnv_detectUnicodeSignature(); the
 * stream is then rewound and only the signature bytes are read again, so the
 * stream is left positioned just past them.  The signature is pushed through
 * the new converter and must decode to exactly one U+FEFF, otherwise *error
 * becomes U_INTERNAL_PROGRAM_ERROR.  A U_BUFFER_OVERFLOW_ERROR from that
 * one-unit conversion is cleared.
 *
 * @param in              stream to inspect
 * @param cp              receives the detected charset name, or NULL
 * @param conv            receives the opened converter, or NULL when no
 *                        signature was detected
 * @param signatureLength receives the number of bytes the converter consumed
 * @param error           ICU error code (in/out)
 * @return TRUE when a signature was detected, FALSE otherwise
 */
U_CAPI UBool U_EXPORT2
ucbuf_autodetect_fs(FileStream* in, const char** cp, UConverter** conv, int32_t* signatureLength, UErrorCode* error){
    char bytes[8];
    const char* bytePos;
    UChar decoded[1] = { 0 };
    UChar* decodedPos;
    int32_t bytesRead;

    /* peek at the first few bytes */
    bytesRead = T_FileStream_read(in, bytes, sizeof(bytes));
    *cp = ucnv_detectUnicodeSignature(bytes, bytesRead, signatureLength, error);

    /* put back everything except the bytes that make up U+FEFF */
    T_FileStream_rewind(in);
    if (*signatureLength > 0) {
        bytesRead = T_FileStream_read(in, bytes, *signatureLength);
    }

    if (*cp == NULL) {
        *conv = NULL;
        return FALSE;
    }

    /* open the converter for the detected Unicode charset */
    *conv = ucnv_open(*cp, error);

    /* feed the signature through it; the single output slot may overflow */
    decodedPos = decoded;
    bytePos = bytes;
    ucnv_toUnicode(*conv,
                   &decodedPos, decoded + 1,
                   &bytePos, bytes + *signatureLength,
                   NULL, FALSE, error);
    *signatureLength = (int32_t)(bytePos - bytes);
    if (*error == U_BUFFER_OVERFLOW_ERROR) {
        *error = U_ZERO_ERROR;
    }

    /* the signature must have produced exactly one U+FEFF */
    if (U_SUCCESS(*error) && !(decodedPos == (decoded + 1) && decoded[0] == 0xfeff)) {
        *error = U_INTERNAL_PROGRAM_ERROR;
    }

    return TRUE;
}
static void printCString(FILE *out, UConverter *converter, const char *str, int32_t len) { UChar buf[256]; const char *strEnd; if (len < 0) { len = (int32_t)uprv_strlen(str); } strEnd = str + len; do { UErrorCode err = U_ZERO_ERROR; UChar *bufp = buf, *bufend = buf + (sizeof(buf)/sizeof(buf[0])) - 1 ; ucnv_toUnicode(defaultConverter, &bufp, bufend, &str, strEnd, 0, 0, &err); *bufp = 0; printString(out, converter, buf, (int32_t)(bufp - buf)); } while (str < strEnd); }
/* params : desc : the document descriptor
 *          buf  : destination buffer for UTF-16 data
 * return : the length of the converted paragraph in bytes
 *          (2 * number of UTF-16 code units written to buf)
 *          the value returned by parse() unchanged when it is <= 0
 *          ERR_ICU if the conversion to UTF-16 failed
 *
 * reads the next paragraph with parse() into a temporary buffer of
 * INTERNAL_BUFSIZE bytes, converts it to UTF-16 with desc->conv and
 * increments desc->nb_par_read.
 *
 * NOTE(review): the malloc result is not checked before use, and the
 * conversion limit assumes buf can hold 2*INTERNAL_BUFSIZE units - confirm
 * against the callers.
 */
int p_read_content(struct doc_descriptor *desc, UChar *buf) {
  char *paragraph;
  const char *cursor;
  UChar *out;
  UErrorCode status;
  int len;

  /* zero-filled scratch buffer for the raw paragraph */
  paragraph = (char *) malloc(INTERNAL_BUFSIZE);
  memset(paragraph, '\x00', INTERNAL_BUFSIZE);

  /* reading the next paragraph */
  len = parse(desc, paragraph);
  if (len <= 0) {
    free(paragraph);
    return len;
  }
  (desc->nb_par_read) += 1;

  /* converting to UTF-16 */
  status = U_ZERO_ERROR;
  out = buf;
  cursor = paragraph;
  ucnv_toUnicode(desc->conv, &out, buf + 2*INTERNAL_BUFSIZE,
                 &cursor, paragraph + strlen(paragraph), NULL, FALSE, &status);
  free(paragraph);

  if (U_FAILURE(status)) {
    fprintf(stderr, "Unable to convert buffer\n");
    return ERR_ICU;
  }
  return 2*(out - buf);
}
static int convertUTF8toUChar(const char *src, UChar *dst, int len) { static UConverter *c; UErrorCode status; UChar *p = dst; const char *s = src; if (!c) { status = U_ZERO_ERROR; c = ucnv_open("UTF-8", &status); if (!c) { fprintf(stderr, "CouchStore CollateJSON: Couldn't initialize ICU (%d)\n", (int)status); abort(); } } while (len) { unsigned char ch = (unsigned char)(*s); if ((ch & 0x80)) { goto icu_conv; } *p++ = (UChar)(ch); s++; len--; } return p - dst; icu_conv: status = U_ZERO_ERROR; ucnv_toUnicode(c, &p, p + len, &s, s + len, NULL, TRUE, &status); if (U_FAILURE(status)) { return -1; } return p - dst; }
/*
 * Rewinds the buffer and its underlying file stream.
 *
 * Does nothing when error is NULL, already holds a failure, or buf is NULL.
 * Otherwise the buffer positions are reset, the stream is rewound, the
 * remaining size is recomputed (stream size minus signature length) and the
 * converter's to-Unicode state is reset.  If the buffer recorded a signature,
 * those bytes are read again and pushed through the converter; they must
 * decode to exactly one U+FEFF, otherwise *error is set to
 * U_INTERNAL_PROGRAM_ERROR.
 */
U_CAPI void U_EXPORT2
ucbuf_rewind(UCHARBUF* buf,UErrorCode* error){
    UChar decoded[1] = { 0 };
    UChar* decodedPos;
    char bytes[8];
    const char* bytePos;
    int32_t bytesRead;

    if (error == NULL || U_FAILURE(*error) || buf == NULL) {
        return;
    }

    buf->currentPos = buf->buffer;
    buf->bufLimit = buf->buffer;
    T_FileStream_rewind(buf->in);
    buf->remaining = T_FileStream_size(buf->in) - buf->signatureLength;

    ucnv_resetToUnicode(buf->conv);

    if (buf->signatureLength <= 0) {
        return;
    }

    /* read the signature bytes */
    bytesRead = T_FileStream_read(buf->in, bytes, buf->signatureLength);

    /* convert the initial U+FEFF; overflowing the one-unit target is expected */
    decodedPos = decoded;
    bytePos = bytes;
    ucnv_toUnicode(buf->conv,
                   &decodedPos, decoded + 1,
                   &bytePos, bytes + bytesRead,
                   NULL, FALSE, error);
    if (*error == U_BUFFER_OVERFLOW_ERROR) {
        *error = U_ZERO_ERROR;
    }

    /* verify that exactly U+FEFF was read */
    if (U_SUCCESS(*error) &&
        !(bytesRead == buf->signatureLength && decodedPos == (decoded + 1) && decoded[0] == 0xfeff)) {
        *error = U_INTERNAL_PROGRAM_ERROR;
    }
}
/*
 * getText: extracts the next run of plain text from an HTML-like byte stream.
 *
 * params : desc : document descriptor; the code uses desc->fd (read/lseek),
 *                 desc->conv (ICU converter) and desc->meta (linked list)
 *          buf  : destination buffer for UTF-16 code units
 *          size : NOTE(review): appears to be the capacity of buf in bytes -
 *                 the loops run while 2*l < size - 2 and the conversion limit
 *                 is buf + size / 2; confirm against callers
 * return : 2*l, where l is the number of code units stored in buf
 *          NO_MORE_DATA if nothing was stored and the last read returned 0
 *          ERR_ICU if an ICU conversion failed
 *
 * Input is read from desc->fd in chunks of BUFSIZE bytes and scanned one
 * byte at a time:
 *   - bytes between '<' and '>' are treated as markup and dropped; the first
 *     closing '>' after some text adds a single space;
 *   - everything from "<script" up to "</script>" is skipped;
 *   - once some text has been stored (l > 0), "<p", "<br" or "<div" ends the
 *     run: the scan moves to the tag's '>' and the file offset is moved back
 *     with lseek() by the unread part of the chunk;
 *   - "<title" appends a node named "title" to desc->meta, and the bytes up
 *     to the next '<' become its value;
 *   - "<meta" with name="..." / content="..." appends a node whose name and
 *     value are collected in 1024-unit stack buffers;
 *   - '&' sequences are passed to escapeChar(), whose return value is used
 *     as the number of input bytes to advance;
 *   - '\n', '\t' and '\r' become one space, and runs of spaces are collapsed
 *     through the space_added flag;
 *   - any other byte is converted on its own with ucnv_toUnicode (flush
 *     FALSE).
 * If a '<' or '&' falls within the last 8 bytes of a chunk (dangerousCut),
 * the tail is moved to the start of buf2 and the buffer is refilled so the
 * tag or token is not split.
 *
 * When l > 0 a space (0x20) is stored at buf[l] before returning; it is not
 * included in the returned length.
 *
 * NOTE(review): the refill checks inside the meta/title parsing compare i
 * against size - 9 although buf2 holds BUFSIZE bytes; the strncpy calls that
 * shift buf2 may operate on overlapping regions; malloc results are not
 * checked; the title name is allocated with 12 bytes while 12 is also passed
 * to ucnv_toUChars as the capacity - all worth confirming.
 */
int getText (struct doc_descriptor *desc, UChar * buf, int size) { struct meta *meta = NULL; char buf2[BUFSIZE]; UErrorCode err; char *src; UChar *dest, esc[3]; UChar name[1024], value[1024]; int len, i, isMarkup, isJavascript, isMeta, l, j; int dangerousCut, fini, r, offset, endOfFile, space_added; space_added = 0; l = 0; fini = 0; endOfFile = 0; isJavascript = 0; dangerousCut = 0; isMarkup = 0; isMeta = 0; len = read (desc->fd, buf2, BUFSIZE); while (!fini && len > 0 && 2*l < size - 2) { /* consuming buffer */ for (i = 0; 2*l < size - 2 && i < len && !dangerousCut && !fini; i++) { /* end of buffer are possible points of failure if a markup or a token is cut, it will not be parsed. */ if (!endOfFile && i > len - 9 && (!strncmp (buf2 + i, "\x3c", 1) || !strncmp (buf2 + i, "\x26", 1))) { dangerousCut = 1; break; } /* detecting end of javascript */ if (isJavascript && !strncasecmp (buf2 + i, "</script>", 9)) { isJavascript = 0; i += 9; } /* detecting new paragraph */ if (l > 0 && !isJavascript && (!strncasecmp (buf2 + i, "<p", 2) || !strncasecmp (buf2 + i, "<br", 3) || !strncasecmp (buf2 + i, "<div", 4))) { fini = 1; i += 2; while (strncmp (buf2 + i, ">", 1)) { i++; } lseek (desc->fd, i - len, SEEK_CUR); break; } /* detecting begining of markup */ if (!isJavascript && !isMarkup && !strncmp (buf2 + i, "\x3c", 1)) { /* detecting begining of javascript */ if (!strncasecmp (buf2 + i, "<script", 7)) { isJavascript = 1; } else if (!strncasecmp (buf2 + i, "<title", 6)) { err = U_ZERO_ERROR; /* finding last metadata of desc */ if (desc->meta == NULL) { meta = (struct meta *) malloc (sizeof (struct meta)); desc->meta = meta; } else { meta = desc->meta; while (meta->next != NULL) { meta = meta->next; } meta->next = (struct meta *) malloc (sizeof (struct meta)); meta = meta->next; } meta->next = NULL; meta->name = (UChar *) malloc (12); /* filling name field */ meta->name_length = 2 * ucnv_toUChars (desc->conv, meta->name, 12, "title", 5, &err); meta->name_length = u_strlen 
(meta->name); if (U_FAILURE (err)) { printf ("error icu\n"); return ERR_ICU; } isMeta = 1; } else if (!strncasecmp (buf2 + i, "<meta", 5)) { i += 5; if (i >= size - 9) { strncpy (buf2, buf2 + i, len - i); len = read (desc->fd, buf2 + i, BUFSIZE - len + i) + len - i; i = 0; } for (; strncasecmp (buf2 + i, "name=\"", 6) && strncmp (buf2 + i, "\x3E", 1); i++) { if (i >= size - 9) { strncpy (buf2, buf2 + i, len - i); len = read (desc->fd, buf2 + i, BUFSIZE - len + i) + len - i; i = 0; } } if (!strncmp (buf2 + i, "\x3E", 1)) { continue; } else { i += 6; /* get metadata name */ memset (name, '\x00', 2048); for (j = 0; len != 0 && strncmp (buf2 + i, "\"", 1); i++) { if (i >= size - 9) { strncpy (buf2, buf2 + i, len - i); len = read (desc->fd, buf2 + i, BUFSIZE - len + i) + len - i; i = 0; } if (!strncmp (buf2 + i, "\x26", 1)) { memset (esc, '\x00', 6); offset = escapeChar (desc, buf2 + i, esc); memcpy (name + j, esc, 2 * u_strlen (esc)); j += u_strlen (esc); i += (offset - 1); } else { /* filling name buffer */ dest = name + j; src = buf2 + i; err = U_ZERO_ERROR; ucnv_toUnicode (desc->conv, &dest, name + 1024, &src, buf2 + i + 1, NULL, FALSE, &err); if (U_FAILURE (err)) { fprintf (stderr, "Unable to convert buffer\n"); return ERR_ICU; } j += (dest - name - j); } } /* get metadata value */ for (; strncasecmp (buf2 + i, "content=\"", 9) && strncmp (buf2 + i, "\x3E", 1); i++) { if (i >= size - 9) { strncpy (buf2, buf2 + i, len - i); len = read (desc->fd, buf2 + i, BUFSIZE - len + i) + len - i; i = 0; } } i += 9; if (i >= size - 9) { strncpy (buf2, buf2 + i, len - i); len = read (desc->fd, buf2 + i, BUFSIZE - len + i) + len - i; i = 0; } memset (value, '\x00', 2048); for (j = 0; len != 0 && strncmp (buf2 + i, "\"", 1); i++) { if (i >= size - 9) { strncpy (buf2, buf2 + i, len - i); len = read (desc->fd, buf2 + i, BUFSIZE - len + i) + len - i; i = 0; } if (!strncmp (buf2 + i, "\x26", 1)) { memset (esc, '\x00', 6); offset = escapeChar (desc, buf2 + i, esc); memcpy (value + j, 
esc, 2 * u_strlen (esc)); j += u_strlen (esc); i += (offset - 1); } else { /* filling value buffer */ dest = value + j; src = buf2 + i; err = U_ZERO_ERROR; ucnv_toUnicode (desc->conv, &dest, value + 1024, &src, buf2 + i + 1, NULL, FALSE, &err); if (U_FAILURE (err)) { fprintf (stderr, "Unable to convert buffer\n"); return ERR_ICU; } j += (dest - value - j); } } /* insert metadata in list */ if (desc->meta == NULL) { meta = (struct meta *) malloc (sizeof (struct meta)); desc->meta = meta; } else { meta = desc->meta; while (meta->next != NULL) { meta = meta->next; } meta->next = (struct meta *) malloc (sizeof (struct meta)); meta = meta->next; } meta->next = NULL; meta->name = (UChar *) malloc (2 * u_strlen (name) + 2); meta->value = (UChar *) malloc (2 * u_strlen (value) + 2); memset (meta->name, '\x00', 2 * u_strlen (name) + 2); memset (meta->value, '\x00', 2 * u_strlen (value) + 2); memcpy (meta->name, name, 2 * u_strlen (name)); memcpy (meta->value, value, 2 * u_strlen (value)); meta->name_length = u_strlen (name); meta->value_length = u_strlen (value); for (; strncmp (buf2 + i, "\x3E", 1); i++) { if (i >= size - 9) { strncpy (buf2, buf2 + i, len - i); len = read (desc->fd, buf2 + i, BUFSIZE - len + i) + len - i; i = 0; } } continue; } } else { isMarkup = 1; } } /* get metadata value */ if (!isJavascript && isMeta) { for (; len != 0 && strncmp (buf2 + i, "\x3E", 1); i++) { if (i >= size - 9) { strncpy (buf2, buf2 + i, len - i); len = read (desc->fd, buf2 + i, BUFSIZE - len + i) + len - i; i = 0; } } i++; memset (value, '\x00', 2048); for (j = 0; len != 0 && strncmp (buf2 + i, "\x3C", 1); i++) { if (i >= size - 9) { strncpy (buf2, buf2 + i, len - i); len = read (desc->fd, buf2 + i, BUFSIZE - len + i) + len - i; i = 0; } if (!strncmp (buf2 + i, "\x26", 1)) { memset (esc, '\x00', 6); offset = escapeChar (desc, buf2 + i, esc); memcpy (value + j, esc, 2 * u_strlen (esc)); j += u_strlen (esc); i += (offset - 1); } else { /* filling value buffer */ dest = value + j; src 
= buf2 + i; err = U_ZERO_ERROR; ucnv_toUnicode (desc->conv, &dest, value + 1024, &src, buf2 + i + 1, NULL, FALSE, &err); if (U_FAILURE (err)) { fprintf (stderr, "Unable to convert buffer\n"); return ERR_ICU; } j += (dest - value - j); } } meta->value = (UChar *) malloc (2 * (j + 1)); memcpy (meta->value, value, 2 * u_strlen (value)); meta->value_length = u_strlen (value); isMeta = 0; i += 7; continue; } /* detecting end of markup */ if (!isJavascript && isMarkup && !strncmp (buf2 + i, "\x3e", 1)) { if (!space_added && l > 0) { buf[l] = 0x20; l ++; space_added = 1; } isMarkup = 0; } /* handling text */ if (!isJavascript && !isMarkup && strncmp (buf2 + i, "\x3e", 1)) { if (strncmp (buf2 + i, "\n", 1) && strncmp (buf2 + i, "\t", 1) && strncmp (buf2 + i, "\r", 1)) { /* converting tokens */ if (!isJavascript && !isMarkup && !strncmp (buf2 + i, "\x26", 1)) { memset (esc, '\x00', 6); offset = escapeChar (desc, buf2 + i, esc); if (memcmp (esc, "\x20\x00", u_strlen (esc))) { memcpy (buf + l, esc, 2 * u_strlen (esc)); l += u_strlen (esc); space_added = 0; } else { if (!space_added){ buf[l] = 0x20; space_added = 1; l++; } } i += (offset - 1); } else { if (buf2[i] != 0x20 || !space_added){ /* filling output buffer */ dest = buf + l; src = buf2 + i; err = U_ZERO_ERROR; ucnv_toUnicode (desc->conv, &dest, buf + size / 2, &src, buf2 + i + 1, NULL, FALSE, &err); if (U_FAILURE (err)) { fprintf (stderr, "Unable to convert buffer\n"); return ERR_ICU; } l += (dest - buf - l); if (buf2[i] == 0x20) {space_added = 1;} else {space_added=0;} } } } else { /* replace tabs and eol by spaces */ if (!space_added){ buf[l] = 0x20; space_added = 1; l++; } } } } /* filling new buffer correctly */ if (!fini) { if (dangerousCut) { r = len - i; strncpy (buf2, buf2 + i, r); len = read (desc->fd, buf2 + r, BUFSIZE - r) + r; if (len < 9) { endOfFile = 1; } dangerousCut = 0; } else { len = read (desc->fd, buf2, BUFSIZE); } } } /* ending buffer properly */ if (l > 0) { buf[l] = 0x20; return 2*l; } if (len 
== 0) { return NO_MORE_DATA; } return 2*l; }
void charsetConverter_icu::convert (utility::inputStream& in, utility::outputStream& out, status* st) { UErrorCode err = U_ZERO_ERROR; ucnv_reset(m_from); ucnv_reset(m_to); if (st) new (st) status(); // From buffers byte_t cpInBuffer[16]; // stream data put here const size_t outSize = ucnv_getMinCharSize(m_from) * sizeof(cpInBuffer) * sizeof(UChar); std::vector <UChar> uOutBuffer(outSize); // Unicode chars end up here // To buffers // converted (char) data end up here const size_t cpOutBufferSz = ucnv_getMaxCharSize(m_to) * outSize; std::vector <char> cpOutBuffer(cpOutBufferSz); // Tell ICU what to do when encountering an illegal byte sequence if (m_options.silentlyReplaceInvalidSequences) { // Set replacement chars for when converting from Unicode to codepage icu::UnicodeString substString(m_options.invalidSequence.c_str()); ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err); if (U_FAILURE(err)) throw exceptions::charset_conv_error("[ICU] Error when setting substitution string."); } else { // Tell ICU top stop (and return an error) on illegal byte sequences ucnv_setToUCallBack (m_from, UCNV_TO_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err); if (U_FAILURE(err)) throw exceptions::charset_conv_error("[ICU] Error when setting ToU callback."); ucnv_setFromUCallBack (m_to, UCNV_FROM_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err); if (U_FAILURE(err)) throw exceptions::charset_conv_error("[ICU] Error when setting FromU callback."); } // Input data available while (!in.eof()) { // Read input data into buffer size_t inLength = in.read(cpInBuffer, sizeof(cpInBuffer)); // Beginning of read data const char* source = reinterpret_cast <const char*>(&cpInBuffer[0]); const char* sourceLimit = source + inLength; // end + 1 UBool flush = in.eof(); // is this last run? 
UErrorCode toErr; // Loop until all source has been processed do { // Set up target pointers UChar* target = &uOutBuffer[0]; UChar* targetLimit = &target[0] + outSize; toErr = U_ZERO_ERROR; ucnv_toUnicode(m_from, &target, targetLimit, &source, sourceLimit, NULL, flush, &toErr); if (st) st->inputBytesRead += (source - reinterpret_cast <const char*>(&cpInBuffer[0])); if (toErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(toErr)) { if (toErr == U_INVALID_CHAR_FOUND || toErr == U_TRUNCATED_CHAR_FOUND || toErr == U_ILLEGAL_CHAR_FOUND) { // Error will be thrown later (*) } else { throw exceptions::charset_conv_error("[ICU] Error converting to Unicode from " + m_source.getName()); } } // The Unicode source is the buffer just written and the limit // is where the previous conversion stopped (target is moved in the conversion) const UChar* uSource = &uOutBuffer[0]; UChar* uSourceLimit = &target[0]; UErrorCode fromErr; // Loop until converted chars are fully written do { char* cpTarget = &cpOutBuffer[0]; const char* cpTargetLimit = &cpOutBuffer[0] + cpOutBufferSz; fromErr = U_ZERO_ERROR; // Write converted bytes (Unicode) to destination codepage ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit, &uSource, uSourceLimit, NULL, flush, &fromErr); if (st) { // Decrement input bytes count by the number of input bytes in error char errBytes[16]; int8_t errBytesLen = sizeof(errBytes); UErrorCode errBytesErr = U_ZERO_ERROR; ucnv_getInvalidChars(m_from, errBytes, &errBytesLen, &errBytesErr); st->inputBytesRead -= errBytesLen; st->outputBytesWritten += cpTarget - &cpOutBuffer[0]; } // (*) If an error occurred while converting from input charset, throw it now if (toErr == U_INVALID_CHAR_FOUND || toErr == U_TRUNCATED_CHAR_FOUND || toErr == U_ILLEGAL_CHAR_FOUND) { throw exceptions::illegal_byte_sequence_for_charset(); } if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr)) { if (fromErr == U_INVALID_CHAR_FOUND || fromErr == U_TRUNCATED_CHAR_FOUND || fromErr == U_ILLEGAL_CHAR_FOUND) { 
throw exceptions::illegal_byte_sequence_for_charset(); } else { throw exceptions::charset_conv_error("[ICU] Error converting from Unicode to " + m_dest.getName()); } } // Write to destination stream out.write(&cpOutBuffer[0], (cpTarget - &cpOutBuffer[0])); } while (fromErr == U_BUFFER_OVERFLOW_ERROR); } while (toErr == U_BUFFER_OVERFLOW_ERROR); } }
// --------------------------------------------------------------------------- // ICUTranscoder: The virtual transcoder API // --------------------------------------------------------------------------- unsigned int ICUTranscoder::transcodeFrom(const XMLByte* const srcData , const unsigned int srcCount , XMLCh* const toFill , const unsigned int maxChars , unsigned int& bytesEaten , unsigned char* const charSizes) { // If debugging, insure the block size is legal #if defined(XERCES_DEBUG) checkBlockSize(maxChars); #endif // Set up pointers to the start and end of the source buffer const XMLByte* startSrc = srcData; const XMLByte* endSrc = srcData + srcCount; // // And now do the target buffer. This works differently according to // whether XMLCh and UChar are the same size or not. // UChar* startTarget; if (sizeof(XMLCh) == sizeof(UChar)) startTarget = (UChar*)toFill; else startTarget = (UChar*) getMemoryManager()->allocate ( maxChars * sizeof(UChar) );//new UChar[maxChars]; UChar* orgTarget = startTarget; // // Transoode the buffer. Buffer overflow errors are normal, occuring // when the raw input buffer holds more characters than will fit in // the Unicode output buffer. // UErrorCode err = U_ZERO_ERROR; ucnv_toUnicode ( fConverter , &startTarget , startTarget + maxChars , (const char**)&startSrc , (const char*)endSrc , (fFixed ? 
0 : (int32_t*)fSrcOffsets) , false , &err ); if ((err != U_ZERO_ERROR) && (err != U_BUFFER_OVERFLOW_ERROR)) { if (orgTarget != (UChar*)toFill) getMemoryManager()->deallocate(orgTarget);//delete [] orgTarget; if (fFixed) { XMLCh tmpBuf[17]; XMLString::binToText((unsigned int)(*startTarget), tmpBuf, 16, 16, getMemoryManager()); ThrowXMLwithMemMgr2 ( TranscodingException , XMLExcepts::Trans_BadSrcCP , tmpBuf , getEncodingName() , getMemoryManager() ); } else { ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Trans_BadSrcSeq, getMemoryManager()); } } // Calculate the bytes eaten and store in caller's param bytesEaten = startSrc - srcData; // And the characters decoded const unsigned int charsDecoded = startTarget - orgTarget; // // Translate the array of char offsets into an array of character // sizes, which is what the transcoder interface semantics requires. // If its fixed, then we can optimize it. // if (fFixed) { const unsigned char fillSize = (unsigned char)ucnv_getMaxCharSize(fConverter); memset(charSizes, fillSize, maxChars); } else { // // We have to convert the series of offsets into a series of // sizes. If just one char was decoded, then its the total bytes // eaten. Otherwise, do a loop and subtract out each element from // its previous element. // if (charsDecoded == 1) { charSizes[0] = (unsigned char)bytesEaten; } else { // ICU does not return an extra element to allow us to figure // out the last char size, so we have to compute it from the // total bytes used. unsigned int index; for (index = 0; index < charsDecoded - 1; index++) { charSizes[index] = (unsigned char)(fSrcOffsets[index + 1] - fSrcOffsets[index]); } if( charsDecoded > 0 ) { charSizes[charsDecoded - 1] = (unsigned char)(bytesEaten - fSrcOffsets[charsDecoded - 1]); } } } // // If XMLCh and UChar are not the same size, then we need to copy over // the temp buffer to the new one. 
// if (sizeof(UChar) != sizeof(XMLCh)) { XMLCh* outPtr = toFill; startTarget = orgTarget; for (unsigned int index = 0; index < charsDecoded; index++) *outPtr++ = XMLCh(*startTarget++); // And delete the temp buffer getMemoryManager()->deallocate(orgTarget);//delete [] orgTarget; } // Return the chars we put into the target buffer return charsDecoded; }
// Convert a file from one encoding to another static UBool convertFile(const char *pname, const char *fromcpage, UConverterToUCallback toucallback, const void *touctxt, const char *tocpage, UConverterFromUCallback fromucallback, const void *fromuctxt, int fallback, size_t bufsz, const char *translit, const char *infilestr, FILE * outfile, int verbose) { FILE *infile; UBool ret = TRUE; UConverter *convfrom = 0; UConverter *convto = 0; UErrorCode err = U_ZERO_ERROR; UBool flush; const char *cbufp; char *bufp; char *buf = 0; uint32_t infoffset = 0, outfoffset = 0; /* Where we are in the file, for error reporting. */ const UChar *unibufbp; UChar *unibufp; UChar *unibuf = 0; int32_t *fromoffsets = 0, *tooffsets = 0; size_t rd, wr, tobufsz; #if !UCONFIG_NO_TRANSLITERATION Transliterator *t = 0; // Transliterator acting on Unicode data. #endif UnicodeString u; // String to do the transliteration. // Open the correct input file or connect to stdin for reading input if (infilestr != 0 && strcmp(infilestr, "-")) { infile = fopen(infilestr, "rb"); if (infile == 0) { UnicodeString str1(infilestr, ""); str1.append((UChar32) 0); UnicodeString str2(strerror(errno), ""); str2.append((UChar32) 0); initMsg(pname); u_wmsg(stderr, "cantOpenInputF", str1.getBuffer(), str2.getBuffer()); return FALSE; } } else { infilestr = "-"; infile = stdin; #ifdef WIN32 if (setmode(fileno(stdin), O_BINARY) == -1) { initMsg(pname); u_wmsg(stderr, "cantSetInBinMode"); return FALSE; } #endif } if (verbose) { fprintf(stderr, "%s:\n", infilestr); } #if !UCONFIG_NO_TRANSLITERATION // Create transliterator as needed. if (translit != NULL && *translit) { UParseError parse; UnicodeString str(translit), pestr; /* Create from rules or by ID as needed. 
*/ parse.line = -1; if (uprv_strchr(translit, ':') || uprv_strchr(translit, '>') || uprv_strchr(translit, '<') || uprv_strchr(translit, '>')) { t = Transliterator::createFromRules("Uconv", str, UTRANS_FORWARD, parse, err); } else { t = Transliterator::createInstance(translit, UTRANS_FORWARD, err); } if (U_FAILURE(err)) { str.append((UChar32) 0); initMsg(pname); if (parse.line >= 0) { UChar linebuf[20], offsetbuf[20]; uprv_itou(linebuf, 20, parse.line, 10, 0); uprv_itou(offsetbuf, 20, parse.offset, 10, 0); u_wmsg(stderr, "cantCreateTranslitParseErr", str.getBuffer(), u_wmsg_errorName(err), linebuf, offsetbuf); } else { u_wmsg(stderr, "cantCreateTranslit", str.getBuffer(), u_wmsg_errorName(err)); } if (t) { delete t; t = 0; } goto error_exit; } } #endif // Create codepage converter. If the codepage or its aliases weren't // available, it returns NULL and a failure code. We also set the // callbacks, and return errors in the same way. convfrom = ucnv_open(fromcpage, &err); if (U_FAILURE(err)) { UnicodeString str(fromcpage, (int32_t)(uprv_strlen(fromcpage) + 1)); initMsg(pname); u_wmsg(stderr, "cantOpenFromCodeset", str.getBuffer(), u_wmsg_errorName(err)); goto error_exit; } ucnv_setToUCallBack(convfrom, toucallback, touctxt, 0, 0, &err); if (U_FAILURE(err)) { initMsg(pname); u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err)); goto error_exit; } convto = ucnv_open(tocpage, &err); if (U_FAILURE(err)) { UnicodeString str(tocpage, (int32_t)(uprv_strlen(tocpage) + 1)); initMsg(pname); u_wmsg(stderr, "cantOpenToCodeset", str.getBuffer(), u_wmsg_errorName(err)); goto error_exit; } ucnv_setFromUCallBack(convto, fromucallback, fromuctxt, 0, 0, &err); if (U_FAILURE(err)) { initMsg(pname); u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err)); goto error_exit; } ucnv_setFallback(convto, fallback); // To ensure that the buffer always is of enough size, we // must take the worst case scenario, that is the character in // the codepage that uses the most bytes and multiply 
it against // the buffer size. // use bufsz+1 to allow for additional BOM/signature character (U+FEFF) tobufsz = (bufsz+1) * ucnv_getMaxCharSize(convto); buf = new char[tobufsz]; unibuf = new UChar[bufsz]; fromoffsets = new int32_t[bufsz]; tooffsets = new int32_t[tobufsz]; // OK, we can convert now. do { char willexit = 0; rd = fread(buf, 1, bufsz, infile); if (ferror(infile) != 0) { UnicodeString str(strerror(errno)); str.append((UChar32) 0); initMsg(pname); u_wmsg(stderr, "cantRead", str.getBuffer()); goto error_exit; } // Convert the read buffer into the new coding // After the call 'unibufp' will be placed on the last // character that was converted in the 'unibuf'. // Also the 'cbufp' is positioned on the last converted // character. // At the last conversion in the file, flush should be set to // true so that we get all characters converted // // The converter must be flushed at the end of conversion so // that characters on hold also will be written. unibufp = unibuf; cbufp = buf; flush = rd != bufsz; ucnv_toUnicode(convfrom, &unibufp, unibufp + bufsz, &cbufp, cbufp + rd, fromoffsets, flush, &err); infoffset += (uint32_t)(cbufp - buf); if (U_FAILURE(err)) { char pos[32]; sprintf(pos, "%u", infoffset - 1); UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1)); initMsg(pname); u_wmsg(stderr, "problemCvtToU", str.getBuffer(), u_wmsg_errorName(err)); willexit = 1; err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */ } // At the last conversion, the converted characters should be // equal to number of chars read. if (flush && !willexit && cbufp != (buf + rd)) { char pos[32]; sprintf(pos, "%u", infoffset); UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1)); initMsg(pname); u_wmsg(stderr, "premEndInput", str.getBuffer()); willexit = 1; } // Prepare to transliterate and convert. Transliterate if needed. #if !UCONFIG_NO_TRANSLITERATION if (t) { u.setTo(unibuf, (int32_t)(unibufp - unibuf)); // Copy into string. 
t->transliterate(u); } else #endif { u.setTo(unibuf, (int32_t)(unibufp - unibuf), (int32_t)(bufsz)); // Share the buffer. } int32_t ulen = u.length(); // Convert the Unicode buffer into the destination codepage // Again 'bufp' will be placed on the last converted character // And 'unibufbp' will be placed on the last converted unicode character // At the last conversion flush should be set to true to ensure that // all characters left get converted const UChar *unibufu = unibufbp = u.getBuffer(); do { int32_t len = ulen > (int32_t)bufsz ? (int32_t)bufsz : ulen; bufp = buf; unibufp = (UChar *) (unibufbp + len); ucnv_fromUnicode(convto, &bufp, bufp + tobufsz, &unibufbp, unibufp, tooffsets, flush, &err); if (U_FAILURE(err)) { const char *errtag; char pos[32]; uint32_t erroffset = dataOffset((int32_t)(bufp - buf - 1), fromoffsets, (int32_t)(bufsz), tooffsets, (int32_t)(tobufsz)); int32_t ferroffset = (int32_t)(infoffset - (unibufp - unibufu) + erroffset); if ((int32_t) ferroffset < 0) { ferroffset = (int32_t)(outfoffset + (bufp - buf)); errtag = "problemCvtFromUOut"; } else { errtag = "problemCvtFromU"; } sprintf(pos, "%u", ferroffset); UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1)); initMsg(pname); u_wmsg(stderr, errtag, str.getBuffer(), u_wmsg_errorName(err)); willexit = 1; } // At the last conversion, the converted characters should be equal to number // of consumed characters. 
if (flush && !willexit && unibufbp != (unibufu + (size_t) (unibufp - unibufu))) { char pos[32]; sprintf(pos, "%u", infoffset); UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1)); initMsg(pname); u_wmsg(stderr, "premEnd", str.getBuffer()); willexit = 1; } // Finally, write the converted buffer to the output file rd = (size_t) (bufp - buf); outfoffset += (int32_t)(wr = fwrite(buf, 1, rd, outfile)); if (wr != rd) { UnicodeString str(strerror(errno), ""); initMsg(pname); u_wmsg(stderr, "cantWrite", str.getBuffer()); willexit = 1; } if (willexit) { goto error_exit; } } while ((ulen -= (int32_t)(bufsz)) > 0); } while (!flush); // Stop when we have flushed the // converters (this means that it's // the end of output) goto normal_exit; error_exit: ret = FALSE; normal_exit: // Cleanup. if (convfrom) ucnv_close(convfrom); if (convto) ucnv_close(convto); #if !UCONFIG_NO_TRANSLITERATION if (t) delete t; #endif if (buf) delete[] buf; if (unibuf) delete[] unibuf; if (fromoffsets) delete[] fromoffsets; if (tooffsets) delete[] tooffsets; if (infile != stdin) { fclose(infile); } return ret; }
/**************************
 * Will convert a sequence of bytes from one codepage to another, going
 * through Unicode one CHUNK_SIZE buffer at a time.
 * @param outConverter: The converter that will be used to encode the output buffer
 * @param inConverter: The converter that will be used to decode the input buffer
 * @param target: in/out pointer into the output buffer; advanced past the bytes written
 * @param targetLimit: pointer just past the end of the output buffer
 * @param source: in/out pointer into the input buffer; advanced past the bytes consumed
 * @param sourceLimit: pointer just past the end of the input buffer
 * @param offsets: not used by this function
 * @param flush: flush flag handed to ucnv_toUnicode for inConverter
 * @param err: fills in an error status; the function returns at once if it
 *             already holds a failure on entry
 */
void
T_UConverter_fromCodepageToCodepage (UConverter * outConverter,
                                     UConverter * inConverter,
                                     char **target,
                                     const char *targetLimit,
                                     const char **source,
                                     const char *sourceLimit,
                                     int32_t* offsets,
                                     int flush,
                                     UErrorCode * err)
{
  UChar out_chunk[CHUNK_SIZE];
  const UChar *out_chunk_limit = out_chunk + CHUNK_SIZE;
  UChar *out_chunk_alias;
  UChar const *out_chunk_alias2;
  /* TRUE while the last ucnv_toUnicode call reported that out_chunk was too
   * small, i.e. another pass is needed even if the source is used up. */
  UBool pendingOutput = FALSE;

  if (U_FAILURE (*err))
    return;

  /*loops until the input buffer is completely consumed and every chunk has
   *been drained, or until an error has been encountered
   *first we convert from inConverter codepage to Unicode
   *then from Unicode to outConverter codepage
   */
  while (((*source != sourceLimit) || pendingOutput) && U_SUCCESS (*err))
    {
      out_chunk_alias = out_chunk;
      ucnv_toUnicode (inConverter,
                      &out_chunk_alias,
                      out_chunk_limit,
                      source,
                      sourceLimit,
                      NULL,
                      flush,
                      err);

      /*BUFFER_OVERFLOW_ERROR means that the output "CHUNK" is full
       *we will require at least another loop (it's a recoverable error)
       */
      pendingOutput = (UBool) (*err == U_BUFFER_OVERFLOW_ERROR);

      if (U_SUCCESS (*err) || pendingOutput)
        {
          *err = U_ZERO_ERROR;
          out_chunk_alias2 = out_chunk;

          /* NOTE(review): outConverter is always called with flush == TRUE,
           * also for intermediate chunks; confirm that this is intended. */
          while ((out_chunk_alias2 != out_chunk_alias) && U_SUCCESS (*err))
            {
              ucnv_fromUnicode (outConverter,
                                target,
                                targetLimit,
                                &out_chunk_alias2,
                                out_chunk_alias,
                                NULL,
                                TRUE,
                                err);
            }
        }
      else
        break;
    }

  return;
}
/**
 * Converts sourceSize bytes starting at source into UChars, working on a
 * local copy of the converter so that the caller's converter is not modified.
 *
 * @param converter  converter to copy and use; must not be NULL
 * @param target     output buffer; only written when targetSize > 0
 * @param targetSize capacity of target in UChars; 0 requests a pure preflight
 * @param source     input bytes
 * @param sourceSize number of input bytes
 * @param err        in/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR for a
 *                   NULL converter or a negative size, and to
 *                   U_BUFFER_OVERFLOW_ERROR when the output (including its
 *                   terminating NUL) did not fit into target.
 * @return 0 if err held a failure on entry or an argument was illegal;
 *         for sourceSize == 0, 1 if a NUL could be stored and 0 otherwise;
 *         in all other cases the number of UChars of the complete output
 *         plus one for the terminating NUL.
 */
int32_t
ucnv_toUChars (const UConverter * converter,
               UChar * target,
               int32_t targetSize,
               const char *source,
               int32_t sourceSize,
               UErrorCode * err)
{
  const char *mySource = source;
  const char *mySource_limit = source + sourceSize;
  UConverter myConverter;
  UChar *myTarget = target;
  int32_t targetCapacity;

  if (U_FAILURE (*err))
    return 0;

  if ((converter == NULL) || (targetSize < 0) || (sourceSize < 0))
    {
      *err = U_ILLEGAL_ARGUMENT_ERROR;
      return 0;
    }

  /*Means there is no work to be done */
  if (sourceSize == 0)
    {
      /*for consistency we still need to
       *store 0 in the targetCapacity
       *if the user requires it
       */
      if (targetSize >= 1)
        {
          target[0] = 0x0000;
          return 1;
        }
      else
        return 0;
    }

  /*makes a local copy of the UConverter */
  myConverter = *converter;

  /*Not in pure pre-flight mode */
  if (targetSize > 0)
    {
      /* Changed from (targetSize * 2) to (targetSize) */
      ucnv_toUnicode (&myConverter,
                      &myTarget,
                      target + (targetSize - 1),   /*Save a spot for the Null terminator */
                      &mySource,
                      mySource_limit,
                      NULL,
                      TRUE,
                      err);

      /*Null terminates the string */
      *(myTarget) = 0x0000;
    }

  /*Rigs targetCapacity to have at least one cell for zero termination */
  /*Updates targetCapacity to contain the number of UChars written to target */
  targetCapacity = 1;
  targetCapacity += myTarget - target;

  if (targetSize == 0)
    {
      *err = U_BUFFER_OVERFLOW_ERROR;
    }

  /* If the output buffer is exhausted, we need to stop writing
   * to it but if the input buffer is not exhausted,
   * we need to continue the conversion in order to report
   * the number of UChars that was required
   */
  if (*err == U_BUFFER_OVERFLOW_ERROR)
    {
      UChar target2[CHUNK_SIZE];
      UChar *target2_alias = target2;
      const UChar *target2_limit = target2 + CHUNK_SIZE;

      /*We use a stack allocated buffer around which we loop
         (in case the output is greater than CHUNK_SIZE) */
      while (*err == U_BUFFER_OVERFLOW_ERROR)
        {
          *err = U_ZERO_ERROR;
          target2_alias = target2;
          ucnv_toUnicode (&myConverter,
                          &target2_alias,
                          target2_limit,
                          &mySource,
                          mySource_limit,
                          NULL,
                          TRUE,
                          err);

          /* Count only the UChars produced by this pass.  The terminator is
           * already part of targetCapacity; adding one per pass would
           * overstate the length whenever more than one pass is needed. */
          targetCapacity += target2_alias - target2;
        }

      if (U_SUCCESS (*err))
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

  return targetCapacity;
}
/**
 * Sample 40: reads "data02.bin" as cp37 (CCSID 37) data, converts it to
 * UTF-16 one BUFFERSIZE block at a time and writes the UChars to
 * "data40.utf16".
 *
 * @return U_FILE_ACCESS_ERROR if either file cannot be opened,
 *         U_ZERO_ERROR otherwise.
 */
UErrorCode convsample_40()
{
  printf("\n\n==============================================\n"
    "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");

  FILE *f;
  FILE *out;
  int32_t count;
  char inBuf[BUFFERSIZE];
  const char *source;
  const char *sourceLimit;
  UChar *uBuf;
  UChar *target;
  UChar *targetLimit;
  int32_t uBufSize = 0;
  UConverter *conv = NULL;
  UErrorCode status = U_ZERO_ERROR;
  uint32_t inbytes=0, total=0;

  f = fopen("data02.bin", "rb");
  if(!f)
  {
    fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
    return U_FILE_ACCESS_ERROR;
  }

  out = fopen("data40.utf16", "wb");
  if(!out)
  {
    fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
    fclose(f);
    return U_FILE_ACCESS_ERROR;
  }

  // **************************** START SAMPLE *******************
  conv = ucnv_openCCSID(37, UCNV_IBM, &status);
  assert(U_SUCCESS(status));

  // One UChar per minimum-size input character covers a full input block.
  uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
  printf("input bytes %d / min chars %d = %d UChars\n",
         BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
  uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
  assert(uBuf!=NULL);

  // grab another buffer's worth
  while((!feof(f)) &&
        ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
  {
    inbytes += count;

    // Convert bytes to unicode
    source = inBuf;
    sourceLimit = inBuf + count;

    do
    {
        target = uBuf;
        targetLimit = uBuf + uBufSize;

        ucnv_toUnicode( conv, &target, targetLimit,
                       &source, sourceLimit, NULL,
                       feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
                                   /* is true (when no more data will come) */
                         &status);

        if(status == U_BUFFER_OVERFLOW_ERROR)
        {
          // simply ran out of space - we'll reset the target ptr the next
          // time through the loop.
          status = U_ZERO_ERROR;
        }
        else
        {
          //  Check other errors here.
          assert(U_SUCCESS(status));
          // Break out of the loop (by force)
        }

        // Process the Unicode
        // Todo: handle UTF-16/surrogates

        // The write must happen outside assert(): an assert expression is
        // not evaluated at all when NDEBUG is defined.
        const size_t produced = (size_t)(target-uBuf);
        const size_t written = fwrite(uBuf, sizeof(uBuf[0]), produced, out);
        assert(written == produced);
        (void)written;   // keeps NDEBUG builds free of an unused-variable warning
        total += (target-uBuf);
    } while (source < sourceLimit); // while simply out of space
  }

  printf("%d bytes in, %d UChars out.\n", inbytes, total);

  // ***************************** END SAMPLE ********************
  ucnv_close(conv);
  free(uBuf);   // release the conversion buffer allocated above

  fclose(f);
  fclose(out);
  printf("\n");

  return U_ZERO_ERROR;
}
/**
 * Reads the file `filename`, converts its bytes to Unicode and hands the
 * text to parse().
 *
 * The charset is chosen from a Unicode signature if there is one, otherwise
 * from the encoding="..." attribute of the XML declaration (read as
 * ISO-8859-1), otherwise UTF-8.
 *
 * @param filename  name of the file to read
 * @param errorCode in/out ICU error code; nothing is done if it already
 *                  indicates a failure on entry. Set to U_FILE_ACCESS_ERROR
 *                  when the file cannot be opened.
 * @return the result of parse() on the converted text, or NULL on failure
 */
UXMLElement *
UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
    char bytes[4096], charsetBuffer[100];
    FileStream *f;
    const char *charset, *pb;
    UnicodeString src;
    UConverter *cnv;
    UChar *buffer, *pu;
    int32_t fileLength, bytesLength, length, capacity;
    UBool flush;

    if(U_FAILURE(errorCode)) {
        return NULL;
    }

    f=T_FileStream_open(filename, "rb");
    if(f==NULL) {
        errorCode=U_FILE_ACCESS_ERROR;
        return NULL;
    }

    bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
    if(bytesLength<(int32_t)sizeof(bytes)) {
        // we have already read the entire file
        fileLength=bytesLength;
    } else {
        // get the file length
        fileLength=T_FileStream_size(f);
    }

    /*
     * get the charset:
     * 1. Unicode signature
     * 2. treat as ISO-8859-1 and read XML encoding="charset"
     * 3. default to UTF-8
     */
    charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
    if(U_SUCCESS(errorCode) && charset!=NULL) {
        // open converter according to Unicode signature
        cnv=ucnv_open(charset, &errorCode);
    } else {
        // read as Latin-1 and parse the XML declaration and encoding
        cnv=ucnv_open("ISO-8859-1", &errorCode);
        if(U_FAILURE(errorCode)) {
            // unexpected error opening Latin-1 converter
            goto exit;
        }

        buffer=src.getBuffer(bytesLength);
        if(buffer==NULL) {
            // unexpected failure to reserve some string capacity
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            goto exit;
        }
        pb=bytes;
        pu=buffer;
        ucnv_toUnicode(
            cnv,
            &pu, buffer+src.getCapacity(),
            &pb, bytes+bytesLength,
            NULL, TRUE, &errorCode);
        src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
        ucnv_close(cnv);
        cnv=NULL;
        if(U_FAILURE(errorCode)) {
            // unexpected error in conversion from Latin-1
            src.remove();
            goto exit;
        }

        // parse XML declaration
        if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
            int32_t declEnd=mXMLDecl.end(errorCode);
            // go beyond <?xml
            int32_t pos=src.indexOf((UChar)x_l)+1;

            mAttrValue.reset(src);
            while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per attribute on this element.
                UnicodeString attName  = mAttrValue.group(1, errorCode);
                UnicodeString attValue = mAttrValue.group(2, errorCode);

                // Trim the quotes from the att value.  These are left over from the original regex
                //   that parsed the attribute, which couldn't conveniently strip them.
                attValue.remove(0,1);                    // one char from the beginning
                attValue.truncate(attValue.length()-1);  // and one from the end.

                if(attName==UNICODE_STRING("encoding", 8)) {
                    // NOTE(review): a value of sizeof(charsetBuffer) or more
                    // characters may leave charsetBuffer unterminated - confirm.
                    length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
                    charset=charsetBuffer;
                    break;
                }
                pos = mAttrValue.end(2, errorCode);
            }
        }

        // The final converter is opened here, outside the declaration test,
        // so that a file without an XML declaration also gets one.
        if(charset==NULL) {
            // default to UTF-8
            charset="UTF-8";
        }
        cnv=ucnv_open(charset, &errorCode);
    }

    if(U_FAILURE(errorCode)) {
        // unable to open the converter
        goto exit;
    }

    // convert the file contents
    capacity=fileLength;        // estimated capacity
    src.getBuffer(capacity);
    src.releaseBuffer(0);       // zero length
    flush=FALSE;
    for(;;) {
        // convert contents of bytes[bytesLength]
        pb=bytes;
        for(;;) {
            length=src.length();
            buffer=src.getBuffer(capacity);
            if(buffer==NULL) {
                // unexpected failure to reserve some string capacity
                errorCode=U_MEMORY_ALLOCATION_ERROR;
                goto exit;
            }

            // append after the text converted so far
            pu=buffer+length;
            ucnv_toUnicode(
                cnv, &pu, buffer+src.getCapacity(),
                &pb, bytes+bytesLength,
                NULL, flush, &errorCode);
            if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
                // Keep what has been converted: pb has already moved past the
                // corresponding bytes, so dropping the text would lose it.
                src.releaseBuffer((int32_t)(pu-buffer));
                errorCode=U_ZERO_ERROR;
                capacity=(3*src.getCapacity())/2; // increase capacity by 50%
            } else {
                src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
                break;
            }
        }

        if(U_FAILURE(errorCode)) {
            break; // conversion error
        }

        if(flush) {
            break; // completely converted the file
        }

        // read next block
        bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
        if(bytesLength==0) {
            // reached end of file, convert once more to flush the converter
            flush=TRUE;
        }
    };

exit:
    ucnv_close(cnv);
    T_FileStream_close(f);

    if(U_SUCCESS(errorCode)) {
        return parse(src, errorCode);
    } else {
        return NULL;
    }
}
void charsetFilteredOutputStream_icu::writeImpl (const byte_t* const data, const size_t count) { if (m_from == NULL || m_to == NULL) throw exceptions::charset_conv_error("Cannot initialize converters."); // Allocate buffer for Unicode chars const size_t uniSize = ucnv_getMinCharSize(m_from) * count * sizeof(UChar); std::vector <UChar> uniBuffer(uniSize); // Conversion loop UErrorCode toErr = U_ZERO_ERROR; const char* uniSource = reinterpret_cast <const char*>(data); const char* uniSourceLimit = uniSource + count; do { // Convert from source charset to Unicode UChar* uniTarget = &uniBuffer[0]; UChar* uniTargetLimit = &uniBuffer[0] + uniSize; toErr = U_ZERO_ERROR; ucnv_toUnicode(m_from, &uniTarget, uniTargetLimit, &uniSource, uniSourceLimit, NULL, /* flush */ FALSE, &toErr); if (U_FAILURE(toErr) && toErr != U_BUFFER_OVERFLOW_ERROR) { if (toErr == U_INVALID_CHAR_FOUND || toErr == U_TRUNCATED_CHAR_FOUND || toErr == U_ILLEGAL_CHAR_FOUND) { throw exceptions::illegal_byte_sequence_for_charset(); } else { throw exceptions::charset_conv_error ("[ICU] Error converting to Unicode from '" + m_sourceCharset.getName() + "'."); } } const size_t uniLength = uniTarget - &uniBuffer[0]; // Allocate buffer for destination charset const size_t cpSize = ucnv_getMinCharSize(m_to) * uniLength; std::vector <char> cpBuffer(cpSize); // Convert from Unicode to destination charset UErrorCode fromErr = U_ZERO_ERROR; const UChar* cpSource = &uniBuffer[0]; const UChar* cpSourceLimit = &uniBuffer[0] + uniLength; do { char* cpTarget = &cpBuffer[0]; char* cpTargetLimit = &cpBuffer[0] + cpSize; fromErr = U_ZERO_ERROR; ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit, &cpSource, cpSourceLimit, NULL, /* flush */ FALSE, &fromErr); if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr)) { if (fromErr == U_INVALID_CHAR_FOUND || fromErr == U_TRUNCATED_CHAR_FOUND || fromErr == U_ILLEGAL_CHAR_FOUND) { throw exceptions::illegal_byte_sequence_for_charset(); } else { throw exceptions::charset_conv_error 
("[ICU] Error converting from Unicode to '" + m_destCharset.getName() + "'."); } } const size_t cpLength = cpTarget - &cpBuffer[0]; // Write successfully converted bytes m_stream.write(&cpBuffer[0], cpLength); } while (fromErr == U_BUFFER_OVERFLOW_ERROR); } while (toErr == U_BUFFER_OVERFLOW_ERROR); }
/**
 * Converts `source` to Unicode with the converter named `codepage`, feeding
 * it in pieces of gInBufferSize bytes and gOutBufferSize UChars, and compares
 * the output (and optionally the offsets) with the expected values.
 *
 * @param source        bytes to convert
 * @param sourcelen     number of bytes in source
 * @param expect        expected UChars
 * @param expectlen     number of UChars in expect
 * @param codepage      name used to open the converter
 * @param fallback      value passed to ucnv_setFallback()
 * @param expectOffsets expected offsets, or 0 to skip the offset comparison
 * @return TRUE if the first expectlen UChars match `expect`, or if the
 *         converter could not be opened (that case is logged as a data
 *         error); FALSE otherwise
 */
static UBool testConvertToUnicode( const uint8_t *source, int sourcelen, const UChar *expect, int expectlen,
                const char *codepage, UBool fallback, const int32_t *expectOffsets)
{
    UErrorCode status = U_ZERO_ERROR;
    UConverter *conv = 0;
    UChar   junkout[NEW_MAX_BUFFER]; /* FIX */
    int32_t junokout[NEW_MAX_BUFFER]; /* FIX */
    const char *src;
    const char *realSourceEnd;
    const char *srcLimit;
    UChar *targ;
    UChar *end;
    int32_t *offs;
    int i;
    UBool   checkOffsets = TRUE;
    char junk[9999];
    char offset_str[9999];
    UChar *p;
    UBool action;

    int32_t   realBufferSize;
    UChar *realBufferEnd;

    /* Pre-fill both output arrays with recognizable sentinel values. */
    for(i=0;i<NEW_MAX_BUFFER;i++)
        junkout[i] = 0xFFFE;

    for(i=0;i<NEW_MAX_BUFFER;i++)
        junokout[i] = -1;

    setNuConvTestName(codepage, "TO");

    log_verbose("\n=========  %s\n", gNuConvTestName);

    conv = my_ucnv_open(codepage, &status);
    if(U_FAILURE(status))
    {
        log_data_err("Couldn't open converter %s\n",gNuConvTestName);
        return TRUE; /* because it has been logged */
    }

    log_verbose("Converter opened..\n");

    src = (const char *)source;
    targ = junkout;
    offs = junokout;

    realBufferSize = (sizeof(junkout)/sizeof(junkout[0]));
    realBufferEnd = junkout + realBufferSize;
    realSourceEnd = src + sourcelen;

    /*----setting the fallback routine----*/
    ucnv_setFallback (conv, fallback);
    action = ucnv_usesFallback(conv);
    if(action != fallback){
        log_err("FAIL: Error is setting fallback. Errocode=%s\n", myErrorName(status));
    }
    /*-------------------------------------*/

    /* Offsets are only checked when the conversion is done in one piece. */
    if ( gOutBufferSize != realBufferSize )
        checkOffsets = FALSE;

    if( gInBufferSize != NEW_MAX_BUFFER )
        checkOffsets = FALSE;

    do
    {
        end = nct_min( targ + gOutBufferSize, realBufferEnd);
        srcLimit = nct_min(realSourceEnd, src + gInBufferSize);

        if(targ == realBufferEnd)
        {
            log_err("Error, the end would overflow the real output buffer while about to call toUnicode! tarjey=%08lx %s",targ,gNuConvTestName);
            ucnv_close(conv); /* do not leak the converter on this failure path */
            return FALSE;
        }
        log_verbose("calling toUnicode @ %08lx to %08lx\n", targ,end);

        status = U_ZERO_ERROR;

        ucnv_toUnicode (conv,
                &targ,
                end,
                (const char **)&src,
                (const char *)srcLimit,
                checkOffsets ? offs : NULL,
                (UBool)(srcLimit == realSourceEnd), /* flush if we're at the end of the source data */
                &status);
    } while ( (status == U_BUFFER_OVERFLOW_ERROR) || (srcLimit < realSourceEnd) ); /* while we just need another buffer */

    if(U_FAILURE(status))
    {
        log_err("Problem doing toUnicode, errcode %s %s\n", myErrorName(status), gNuConvTestName);
        ucnv_close(conv); /* do not leak the converter on this failure path */
        return FALSE;
    }

    log_verbose("\nConversion done. %d bytes -> %d chars.\nResult :",
        sourcelen, targ-junkout);
    if(VERBOSITY)
    {
        junk[0] = 0;
        offset_str[0] = 0;

        for(p = junkout;p<targ;p++)
        {
            sprintf(junk + strlen(junk), "0x%04x, ", (0xFFFF) & (unsigned int)*p);
            sprintf(offset_str + strlen(offset_str), "0x%04x, ", (0xFFFF) & (unsigned int)junokout[p-junkout]);
        }

        log_verbose(junk);
        printUSeq(expect, expectlen);
        if ( checkOffsets )
        {
            log_verbose("\nOffsets:");
            log_verbose(offset_str);
        }
        log_verbose("\n");
    }
    ucnv_close(conv);

    log_verbose("comparing %d uchars (%d bytes)..\n",expectlen,expectlen*2);

    if (checkOffsets && (expectOffsets != 0))
    {
        if(memcmp(junokout,expectOffsets,(targ-junkout) * sizeof(int32_t)))
        {
            log_err("\n\ndid not get the expected offsets while %s \n", gNuConvTestName);
            log_err("\nGot : ");
            for(p=junkout;p<targ;p++)
                log_err("%d, ", junokout[p-junkout]);
            log_err("\nExpected: ");
            for(i=0; i<(targ-junkout); i++)
                log_err("%d,", expectOffsets[i]);
            log_err("");
            for(i=0; i<(targ-junkout); i++)
                log_err("0x%04X,", junkout[i]);
            log_err("");
            for(i=0; i<(src-(const char *)source); i++)
                log_err("0x%04X,", (unsigned char)source[i]);
        }
    }

    if(!memcmp(junkout, expect, expectlen*2))
    {
        log_verbose("Matches!\n");
        return TRUE;
    }
    else
    {
        log_err("String does not match. %s\n", gNuConvTestName);
        log_verbose("String does not match. %s\n", gNuConvTestName);
        printUSeqErr(junkout, expectlen);
        printf("\n");
        printUSeqErr(expect, expectlen);
        return FALSE;
    }
}
/* helper function
 *
 * Converts a wchar_t string to UChars in two steps: the wide characters are
 * first turned into a char buffer with uprv_wcstombs(), and that buffer is
 * then converted with the default converter via ucnv_toUnicode().
 *
 * dest, destCapacity  output buffer and its capacity in UChars
 * pDestLength         if not NULL, receives the number of UChars counted
 * src, srcLength      input; srcLength == -1 selects the NUL-terminated path
 * pErrorCode          set to U_ILLEGAL_CHAR_FOUND when uprv_wcstombs() returns
 *                     -1 in the checked calls, and to
 *                     U_MEMORY_ALLOCATION_ERROR when the temporary wchar_t
 *                     buffer cannot be allocated
 *
 * Returns dest.
 */
static UChar*
_strFromWCS( UChar   *dest,
             int32_t destCapacity,
             int32_t *pDestLength,
             const wchar_t *src,
             int32_t srcLength,
             UErrorCode *pErrorCode)
{
    int32_t retVal =0, count =0 ;
    UConverter* conv = NULL;
    UChar* pTarget = NULL;
    UChar* pTargetLimit = NULL;
    UChar* target = NULL;

    UChar uStack [_STACK_BUFFER_CAPACITY];

    wchar_t wStack[_STACK_BUFFER_CAPACITY];
    wchar_t* pWStack = wStack;

    char cStack[_STACK_BUFFER_CAPACITY];
    int32_t cStackCap = _STACK_BUFFER_CAPACITY;
    char* pCSrc=cStack;
    /* pCSave always points at the start of the char buffer (stack or grown) */
    char* pCSave=pCSrc;
    char* pCSrcLimit=NULL;

    const wchar_t* pSrc = src;
    const wchar_t* pSrcLimit = NULL;

    if(srcLength ==-1){
        /* if the wchar_t source is null terminated we can safely
         * assume that there are no embedded nulls, this is a fast
         * path for null terminated strings.
         */
        for(;;){
            /* convert wchars to chars */
            retVal = uprv_wcstombs(pCSrc,src, cStackCap);

            if(retVal == -1){
                *pErrorCode = U_ILLEGAL_CHAR_FOUND;
                goto cleanup;
            }else if(retVal >= (cStackCap-1)){
                /* Should rarely occur */
                /* the buffer was (nearly) filled: grow it and convert again from the start */
                u_growAnyBufferFromStatic(cStack,(void**)&pCSrc,&cStackCap,
                    cStackCap * _BUFFER_CAPACITY_MULTIPLIER, 0, sizeof(char));
                pCSave = pCSrc;
            }else{
                /* converted every thing */
                pCSrc = pCSrc+retVal;
                break;
            }
        }

    }else{
        /* here the source is not null terminated
         * so it may have nulls embedded and we need to
         * do some extra processing
         */
        int32_t remaining =cStackCap;

        pSrcLimit = src + srcLength;

        for(;;){
            register int32_t nulLen = 0;

            /* find nulls in the string */
            while(nulLen<srcLength && pSrc[nulLen++]!=0){
            }

            if((pSrc+nulLen) < pSrcLimit){
                /* check if we have enough room in pCSrc */
                if(remaining < (nulLen * MB_CUR_MAX)){
                    /* should rarely occur */
                    int32_t len = (pCSrc-pCSave);
                    pCSrc = pCSave;
                    /* we do not have enough room so grow the buffer*/
                    u_growAnyBufferFromStatic(cStack,(void**)&pCSrc,&cStackCap,
                           _BUFFER_CAPACITY_MULTIPLIER*cStackCap+(nulLen*MB_CUR_MAX),len,sizeof(char));

                    /* re-establish the write position inside the new buffer */
                    pCSave = pCSrc;
                    pCSrc = pCSave+len;
                    remaining = cStackCap-(pCSrc - pCSave);
                }

                /* we have found a null  so convert the
                 * chunk from beginning of non-null char to null
                 */
                retVal = uprv_wcstombs(pCSrc,pSrc,remaining);

                if(retVal==-1){
                    /* an error occurred bail out */
                    *pErrorCode = U_ILLEGAL_CHAR_FOUND;
                    goto cleanup;
                }

                pCSrc += retVal+1 /* already null terminated */;

                pSrc += nulLen; /* skip past the null */
                srcLength-=nulLen; /* decrement the srcLength */
                /* NOTE(review): this subtracts the total number of bytes used
                 * so far, not just the bytes added by this chunk - confirm. */
                remaining -= (pCSrc-pCSave);


            }else{
                /* the source is not null terminated and we are
                 * end of source so we copy the source to a temp buffer
                 * null terminate it and convert wchar_ts to chars
                 */
                if(nulLen >= _STACK_BUFFER_CAPACITY){
                    /* Should rarely occur */
                    /* allocate new buffer buffer */
                    pWStack =(wchar_t*) uprv_malloc(sizeof(wchar_t) * (nulLen + 1));
                    if(pWStack==NULL){
                        *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
                        goto cleanup;
                    }
                }
                if(nulLen>0){
                    /* copy the contents to tempStack */
                    uprv_memcpy(pWStack,pSrc,nulLen*sizeof(wchar_t));
                }

                /* null terminate the tempBuffer */
                pWStack[nulLen] =0 ;

                if(remaining < (nulLen * MB_CUR_MAX)){
                    /* Should rarely occur */
                    int32_t len = (pCSrc-pCSave);
                    pCSrc = pCSave;
                    /* we do not have enough room so grow the buffer*/
                    u_growAnyBufferFromStatic(cStack,(void**)&pCSrc,&cStackCap,
                           cStackCap+(nulLen*MB_CUR_MAX),len,sizeof(char));

                    /* re-establish the write position inside the new buffer */
                    pCSave = pCSrc;
                    pCSrc = pCSave+len;
                    remaining = cStackCap-(pCSrc - pCSave);
                }
                /* convert to chars */
                /* NOTE(review): unlike the calls above, the result is not checked for -1 here */
                retVal = uprv_wcstombs(pCSrc,pWStack,remaining);

                pCSrc += retVal;
                pSrc  += nulLen;
                srcLength-=nulLen; /* decrement the srcLength */
                break;
            }
        }
    }

    /* OK..now we have converted from wchar_ts to chars now
     * convert chars to UChars
     */
    /* pCSrc currently marks the end of the converted chars; rewind to the start */
    pCSrcLimit = pCSrc;
    pCSrc = pCSave;
    pTarget = target= dest;
    pTargetLimit = dest + destCapacity;

    conv= u_getDefaultConverter(pErrorCode);

    if(U_FAILURE(*pErrorCode)|| conv==NULL){
        goto cleanup;
    }

    for(;;) {

        *pErrorCode = U_ZERO_ERROR;

        /* the first pass writes to dest; passes after an overflow write to uStack */
        ucnv_toUnicode(conv,&pTarget,pTargetLimit,(const char**)&pCSrc,pCSrcLimit,NULL,(UBool)(pCSrc==pCSrcLimit),pErrorCode);

        /* increment count by the number of UChars written in this pass */
        count+= pTarget - target;

        if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR){
            /* dest is full: keep converting into the scratch buffer so that
             * count ends up covering the complete output */
            target = uStack;
            pTarget = uStack;
            pTargetLimit = uStack + _STACK_BUFFER_CAPACITY;
        } else {
            break;
        }

    }

    if(pDestLength){
        *pDestLength =count;
    }

    u_terminateUChars(dest,destCapacity,count,pErrorCode);

cleanup:

    /* free the temporary buffers only if they were replaced by heap memory */
    if(cStack != pCSave){
        uprv_free(pCSave);
    }

    if(wStack != pWStack){
        uprv_free(pWStack);
    }

    /* NOTE(review): conv is still NULL when an early error jumped here */
    u_releaseDefaultConverter(conv);

    return dest;
}
void charsetFilteredOutputStream_icu::flush() { if (m_from == NULL || m_to == NULL) throw exceptions::charset_conv_error("Cannot initialize converters."); // Allocate buffer for Unicode chars const size_t uniSize = ucnv_getMinCharSize(m_from) * 1024 * sizeof(UChar); std::vector <UChar> uniBuffer(uniSize); // Conversion loop (with flushing) UErrorCode toErr = U_ZERO_ERROR; const char* uniSource = 0; const char* uniSourceLimit = 0; do { // Convert from source charset to Unicode UChar* uniTarget = &uniBuffer[0]; UChar* uniTargetLimit = &uniBuffer[0] + uniSize; toErr = U_ZERO_ERROR; ucnv_toUnicode(m_from, &uniTarget, uniTargetLimit, &uniSource, uniSourceLimit, NULL, /* flush */ TRUE, &toErr); if (U_FAILURE(toErr) && toErr != U_BUFFER_OVERFLOW_ERROR) { throw exceptions::charset_conv_error ("[ICU] Error converting to Unicode from '" + m_sourceCharset.getName() + "'."); } const size_t uniLength = uniTarget - &uniBuffer[0]; // Allocate buffer for destination charset const size_t cpSize = ucnv_getMinCharSize(m_to) * uniLength; std::vector <char> cpBuffer(cpSize); // Convert from Unicode to destination charset UErrorCode fromErr = U_ZERO_ERROR; const UChar* cpSource = &uniBuffer[0]; const UChar* cpSourceLimit = &uniBuffer[0] + uniLength; do { char* cpTarget = &cpBuffer[0]; char* cpTargetLimit = &cpBuffer[0] + cpSize; fromErr = U_ZERO_ERROR; ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit, &cpSource, cpSourceLimit, NULL, /* flush */ TRUE, &fromErr); if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr)) { throw exceptions::charset_conv_error ("[ICU] Error converting from Unicode to '" + m_destCharset.getName() + "'."); } const size_t cpLength = cpTarget - &cpBuffer[0]; // Write successfully converted bytes m_stream.write(&cpBuffer[0], cpLength); } while (fromErr == U_BUFFER_OVERFLOW_ERROR); } while (toErr == U_BUFFER_OVERFLOW_ERROR); m_stream.flush(); }
/*
 * Decodes numBytes bytes at `bytes` into UTF-16 using the ICU converter named icuName.
 *
 * When maxCharLen is 0 nothing is stored in `characters`: the input is decoded chunk by
 * chunk into a local scratch buffer purely to count the UTF-16 units it produces.
 * Otherwise a single ucnv_toUnicode() call writes at most maxCharLen units to `characters`.
 *
 * usedCharLen / usedByteLen, when non-NULL, receive the number of UTF-16 units produced and
 * the number of source bytes consumed.
 *
 * Returns kCFStringEncodingConverterUnavailable when no converter could be created.
 * Otherwise returns success, insufficient-output-buffer or invalid-input-stream, OR-ed with
 * the value returned by __CFStringEncodingConverterReleaseICUConverter().
 */
CF_PRIVATE CFIndex __CFStringEncodingICUToUnicode(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, CFIndex *usedByteLen, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
    const char *const inputStart = (const char *)bytes;
    const char *const inputEnd = inputStart + numBytes;
    const char *cursor = inputStart;
    UTF16Char *out = characters;
    const UTF16Char *outEnd = out + maxCharLen;
    UErrorCode icuStatus = U_ZERO_ERROR;
    /* The converter is flushed unless the caller marked the input as partial. */
    const bool flush = (0 == (flags & kCFStringEncodingPartialInput));
    CFIndex status;
    UConverter *cnv = __CFStringEncodingConverterCreateICUConverter(icuName, flags, true);

    if (NULL == cnv) return kCFStringEncodingConverterUnavailable;

    if (0 != maxCharLen) {
        /* Normal case: decode straight into the caller's buffer. */
        ucnv_toUnicode(cnv, (UChar **)&out, (const UChar *)outEnd, &cursor, inputEnd, NULL, flush, &icuStatus);

        if (NULL != usedCharLen) *usedCharLen = out - characters;
    } else {
        /* Length-only query: decode into a scratch buffer and add up the output. */
        UTF16Char scratch[MAX_BUFFER_SIZE];
        CFIndex produced = 0;

        while ((cursor < inputEnd) && (U_ZERO_ERROR == icuStatus)) {
            out = scratch;
            outEnd = scratch + MAX_BUFFER_SIZE;

            ucnv_toUnicode(cnv, (UChar **)&out, (const UChar *)outEnd, &cursor, inputEnd, NULL, flush, &icuStatus);

            produced += (out - scratch);

            /* Filling the scratch buffer is expected here; clear it so the loop continues. */
            if (U_BUFFER_OVERFLOW_ERROR == icuStatus) icuStatus = U_ZERO_ERROR;
        }

        if (NULL != usedCharLen) *usedCharLen = produced;
    }

    /* Map the ICU outcome onto the CFStringEncoding result codes. */
    if (U_ZERO_ERROR == icuStatus) {
        status = kCFStringEncodingConversionSuccess;
    } else if (U_BUFFER_OVERFLOW_ERROR == icuStatus) {
        status = kCFStringEncodingInsufficientOutputBufferLength;
    } else {
        status = kCFStringEncodingInvalidInputStream;
    }

    if (NULL != usedByteLen) {
#if HAS_ICU_BUG_6024743
        /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_toUnicode() is after illegal input.
           We have to keep track of any changes in this area in order to prevent future binary compatibility issues */
        if (kCFStringEncodingInvalidInputStream == status) {
#define MAX_ERROR_BUFFER_LEN (32)
            char invalidBytes[MAX_ERROR_BUFFER_LEN];
            int8_t invalidLength = MAX_ERROR_BUFFER_LEN;
#undef MAX_ERROR_BUFFER_LEN
            icuStatus = U_ZERO_ERROR;

            ucnv_getInvalidChars(cnv, invalidBytes, &invalidLength, &icuStatus);

            if (U_ZERO_ERROR != icuStatus) {
                // The invalid bytes could not be retrieved: report nothing consumed and nothing produced.
                cursor = inputStart; // 0 length
                if (NULL != usedCharLen) *usedCharLen = 0;
            } else {
#if HAS_ICU_BUG_6025527
                // Another ICU oddness here. The reported length may include a trailing '\0' terminator; drop it.
                if ((invalidLength > 0) && ('\0' == invalidBytes[invalidLength - 1])) --invalidLength;
#endif
                /* Step back so the consumed count stops before the invalid bytes. */
                cursor -= invalidLength;
            }
        }
#endif
        *usedByteLen = cursor - inputStart;
    }

    status |= __CFStringEncodingConverterReleaseICUConverter(cnv, flags, status);

    return status;
}
/*
 * private function used for buffering input
 *
 * Moves any unread UChars to the front of f->fUCBuffer, reads more bytes
 * from f->fFile, converts them to UChars behind the retained data and
 * resets the str->fPos / str->fLimit window. Does nothing when f has no
 * FILE, or when f->fFileno is 0 (stdin) and unread data is still present.
 */
void
ufile_fill_uchar_buffer(UFILE *f)
{
    UErrorCode          convStatus;
    const char          *src;
    const char          *srcLimit;
    UChar               *dst;
    UChar               *dstLimit;
    int32_t             byteBudget;
    int32_t             readLimit;
    int32_t             nRead;
    int32_t             freeUnits;
    int32_t             pending;
    char                charBuffer[UFILE_CHARBUFFER_SIZE];
    u_localized_string  *str;

    /* There is nothing to do. It's a string. */
    if (f->fFile == NULL) {
        return;
    }

    str = &f->str;
    pending = (int32_t)(str->fLimit - str->fPos);

    /* Don't read from stdin too many times. There is still some data. */
    if (f->fFileno == 0 && pending > 0) {
        return;
    }

    /* Slide the unread UChars down to the start of the buffer. */
    if (pending != 0) {
        uprv_memmove(f->fUCBuffer, str->fPos, pending * sizeof(UChar));
    }

    /* UChar slots still free behind the retained data */
    freeUnits = UFILE_UCHARBUFFER_SIZE - pending;

    /* Determine the # of codepage bytes needed to fill our UChar buffer */
    if (f->fConverter != NULL) {
        byteBudget = freeUnits / (2 * ucnv_getMinCharSize(f->fConverter));
    }
    else {
        /* weiv: no converter means the invariant converter, charwidth = 1 */
        byteBudget = freeUnits / 1;
    }
    readLimit = ufmt_min(byteBudget, UFILE_CHARBUFFER_SIZE);

    /* Read in the data to convert */
    if (f->fFileno != 0) {
        /* A normal file */
        nRead = (int32_t)fread(charBuffer, sizeof(char), readLimit, f->fFile);
    }
    else if (fgets(charBuffer, readLimit, f->fFile) != NULL) {
        /* Special case. Read from stdin one line at a time. */
        nRead = (int32_t)uprv_strlen(charBuffer);
    }
    else {
        nRead = 0;
    }

    /* Set up conversion parameters */
    convStatus = U_ZERO_ERROR;
    src        = charBuffer;
    srcLimit   = charBuffer + nRead;
    dst        = f->fUCBuffer + pending;
    dstLimit   = f->fUCBuffer + UFILE_UCHARBUFFER_SIZE;

    if (f->fConverter == NULL) {
        /* weiv: do the invariant conversion */
        u_charsToUChars(src, dst, nRead);
        dst += nRead;
    }
    else {
        /* We have a valid converter: perform the conversion, flushing once the file hit EOF */
        ucnv_toUnicode(f->fConverter,
                       &dst,
                       dstLimit,
                       &src,
                       srcLimit,
                       NULL,
                       (UBool)(feof(f->fFile) != 0),
                       &convStatus);
    }

    /* update the pointers into our array */
    str->fPos   = str->fBuffer;
    str->fLimit = dst;
}
/**
 * Sample 05: counts the letters in the UTF-8 file "data01.txt".
 *
 * The file is read in BUFFERSIZE-byte chunks; each chunk is converted to
 * UChars with a "utf-8" converter and every resulting UChar is tested with
 * u_isalpha(). The letter and total counts are printed to stdout.
 *
 * @return U_FILE_ACCESS_ERROR if the file cannot be opened, otherwise
 *         U_ZERO_ERROR. Converter and allocation failures are checked with
 *         assert() only.
 */
UErrorCode convsample_05()
{
  printf("\n\n==============================================\n"
         "Sample 05: C: count the number of letters in a UTF-8 document\n");

  FILE *f;
  int32_t count;
  char inBuf[BUFFERSIZE];
  const char *source;
  const char *sourceLimit;
  UChar *uBuf;
  UChar *target;
  UChar *targetLimit;
  UChar *p;
  int32_t uBufSize = 0;
  UConverter *conv;
  UErrorCode status = U_ZERO_ERROR;
  uint32_t letters=0, total=0;

  f = fopen("data01.txt", "r");
  if(!f)
  {
    fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
    return U_FILE_ACCESS_ERROR;
  }

  // **************************** START SAMPLE *******************
  conv = ucnv_open("utf-8", &status);
  assert(U_SUCCESS(status));

  uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
  printf("input bytes %d / min chars %d = %d UChars\n",
         BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
  uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
  assert(uBuf!=NULL);

  // grab another buffer's worth
  while((!feof(f)) &&
        ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
  {
    // Convert bytes to unicode
    source = inBuf;
    sourceLimit = inBuf + count;

    do
    {
        // Every pass starts writing at the beginning of uBuf again.
        target = uBuf;
        targetLimit = uBuf + uBufSize;

        ucnv_toUnicode(conv, &target, targetLimit,
                       &source, sourceLimit, NULL,
                       feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
                                   /* is true (when no more data will come) */
                       &status);

        if(status == U_BUFFER_OVERFLOW_ERROR)
        {
          // simply ran out of space - we'll reset the target ptr the next
          // time through the loop.
          status = U_ZERO_ERROR;
        }
        else
        {
          // Check other errors here.
          assert(U_SUCCESS(status));
          // Break out of the loop (by force)
        }

        // Process the Unicode
        // Todo: handle UTF-16/surrogates

        for(p = uBuf; p<target; p++)
        {
          if(u_isalpha(*p))
            letters++;
          total++;
        }
    } while (source < sourceLimit); // while simply out of space
  }

  printf("%d letters out of %d total UChars.\n", letters, total);

  // ***************************** END SAMPLE ********************
  ucnv_close(conv);

  // Release the conversion buffer; it was previously leaked on every call.
  free(uBuf);

  printf("\n");

  fclose(f);

  return U_ZERO_ERROR;
}
/*
 * private function used for buffering input
 *
 * Keeps the UChars between f->fUCPos and f->fUCLimit by moving them to the
 * start of f->fUCBuffer, reads more bytes from f->fFile into
 * f->fCharBuffer, converts them to UChars behind the kept data and then
 * points fUCPos / fUCLimit at the refreshed contents.
 */
void
ufile_fill_uchar_buffer(UFILE *f)
{
    UErrorCode  status   = U_ZERO_ERROR;
    int32_t     kept     = (int32_t)(f->fUCLimit - f->fUCPos);
    int32_t     room;
    int32_t     divisor;
    int32_t     wanted;
    int32_t     got;
    const char  *in;
    const char  *inEnd;
    UChar       *out;
    UChar       *outEnd;

    /* shift the buffer if it isn't empty */
    if(kept != 0) {
        memmove(f->fUCBuffer, f->fUCPos, kept * sizeof(UChar));
    }

    /* record how much buffer space is available */
    room = UFILE_UCHARBUFFER_SIZE - kept;

    /* Determine the # of codepage bytes needed to fill our UChar buffer */
    /* weiv: if converter is NULL, we use invariant converter with charwidth = 1 */
    divisor = 1;
    if(f->fConverter != NULL) {
        divisor = 2 * ucnv_getMinCharSize(f->fConverter);
    }
    wanted = room / divisor;

    /* Read in the data to convert */
    got = (int32_t)fread(f->fCharBuffer,
                         sizeof(char),
                         ufmt_min(wanted, UFILE_CHARBUFFER_SIZE),
                         f->fFile);

    /* Set up conversion parameters */
    in     = f->fCharBuffer;
    inEnd  = in + got;
    out    = f->fUCBuffer + kept;
    outEnd = f->fUCBuffer + UFILE_UCHARBUFFER_SIZE;

    if(f->fConverter == NULL) {
        /* weiv: do the invariant conversion */
        u_charsToUChars(in, out, got);
        out += got;
    }
    else {
        /* We have a valid converter; the flush flag is set once the file is at EOF */
        ucnv_toUnicode(f->fConverter,
                       &out,
                       outEnd,
                       &in,
                       inEnd,
                       NULL,
                       (UBool)(feof(f->fFile) != 0),
                       &status);
    }

    /* update the pointers into our array */
    f->fUCPos   = f->fUCBuffer;
    f->fUCLimit = out;
}
/* fill the uchar buffer */ static UCHARBUF* ucbuf_fillucbuf( UCHARBUF* buf,UErrorCode* error){ UChar* pTarget=NULL; UChar* target=NULL; const char* source=NULL; char carr[MAX_IN_BUF] = {'\0'}; char* cbuf = carr; int32_t inputRead=0; int32_t outputWritten=0; int32_t offset=0; const char* sourceLimit =NULL; int32_t cbufSize=0; pTarget = buf->buffer; /* check if we arrived here without exhausting the buffer*/ if(buf->currentPos<buf->bufLimit){ offset = (int32_t)(buf->bufLimit-buf->currentPos); memmove(buf->buffer,buf->currentPos,offset* sizeof(UChar)); } #if DEBUG memset(pTarget+offset,0xff,sizeof(UChar)*(MAX_IN_BUF-offset)); #endif if(buf->isBuffered){ cbufSize = MAX_IN_BUF; /* read the file */ inputRead=T_FileStream_read(buf->in,cbuf,cbufSize-offset); buf->remaining-=inputRead; }else{ cbufSize = T_FileStream_size(buf->in); cbuf = (char*)uprv_malloc(cbufSize); if (cbuf == NULL) { *error = U_MEMORY_ALLOCATION_ERROR; return NULL; } inputRead= T_FileStream_read(buf->in,cbuf,cbufSize); buf->remaining-=inputRead; } /* just to be sure...*/ if ( 0 == inputRead ) buf->remaining = 0; target=pTarget; /* convert the bytes */ if(buf->conv){ /* set the callback to stop */ UConverterToUCallback toUOldAction ; void* toUOldContext; void* toUNewContext=NULL; ucnv_setToUCallBack(buf->conv, UCNV_TO_U_CALLBACK_STOP, toUNewContext, &toUOldAction, (const void**)&toUOldContext, error); /* since state is saved in the converter we add offset to source*/ target = pTarget+offset; source = cbuf; sourceLimit = source + inputRead; ucnv_toUnicode(buf->conv,&target,target+(buf->bufCapacity-offset), &source,sourceLimit,NULL, (UBool)(buf->remaining==0),error); if(U_FAILURE(*error)){ char context[CONTEXT_LEN+1]; char preContext[CONTEXT_LEN+1]; char postContext[CONTEXT_LEN+1]; int8_t len = CONTEXT_LEN; int32_t start=0; int32_t stop =0; int32_t pos =0; /* use erro1 to preserve the error code */ UErrorCode error1 =U_ZERO_ERROR; if( buf->showWarning==TRUE){ fprintf(stderr,"\n###WARNING: Encountered 
abnormal bytes while" " converting input stream to target encoding: %s\n", u_errorName(*error)); } /* now get the context chars */ ucnv_getInvalidChars(buf->conv,context,&len,&error1); context[len]= 0 ; /* null terminate the buffer */ pos = (int32_t)(source - cbuf - len); /* for pre-context */ start = (pos <=CONTEXT_LEN)? 0 : (pos - (CONTEXT_LEN-1)); stop = pos-len; memcpy(preContext,cbuf+start,stop-start); /* null terminate the buffer */ preContext[stop-start] = 0; /* for post-context */ start = pos+len; stop = (int32_t)(((pos+CONTEXT_LEN)<= (sourceLimit-cbuf) )? (pos+(CONTEXT_LEN-1)) : (sourceLimit-cbuf)); memcpy(postContext,source,stop-start); /* null terminate the buffer */ postContext[stop-start] = 0; if(buf->showWarning ==TRUE){ /* print out the context */ fprintf(stderr,"\tPre-context: %s\n",preContext); fprintf(stderr,"\tContext: %s\n",context); fprintf(stderr,"\tPost-context: %s\n", postContext); } /* reset the converter */ ucnv_reset(buf->conv); /* set the call back to substitute * and restart conversion */ ucnv_setToUCallBack(buf->conv, UCNV_TO_U_CALLBACK_SUBSTITUTE, toUNewContext, &toUOldAction, (const void**)&toUOldContext, &error1); /* reset source and target start positions */ target = pTarget+offset; source = cbuf; /* re convert */ ucnv_toUnicode(buf->conv,&target,target+(buf->bufCapacity-offset), &source,sourceLimit,NULL, (UBool)(buf->remaining==0),&error1); } outputWritten = (int32_t)(target - pTarget); #if DEBUG { int i; target = pTarget; for(i=0;i<numRead;i++){ /* printf("%c", (char)(*target++));*/ } } #endif }else{ u_charsToUChars(cbuf,target+offset,inputRead); outputWritten=((buf->remaining>cbufSize)? cbufSize:inputRead+offset); } buf->currentPos = pTarget; buf->bufLimit=pTarget+outputWritten; *buf->bufLimit=0; /*NUL terminate*/ if(cbuf!=carr){ uprv_free(cbuf); } return buf; }