static UChar* toUChar(const char *src, void **freeHook) { /* Structure of the memory that we allocate on the heap */ int32_t numUChars; int32_t destSize; UChar stackBuf[2000 + sizeof(void *)/sizeof(UChar)]; StringStruct *dest; UConverter *cnv; UErrorCode status = U_ZERO_ERROR; if (src == NULL) { return NULL; }; cnv = ucnv_open(NULL, &status); if(U_FAILURE(status) || cnv == NULL) { return NULL; } ucnv_reset(cnv); numUChars = ucnv_toUChars(cnv, stackBuf, 2000, src, -1, &status); destSize = (numUChars+1) * sizeof(UChar) + sizeof(struct StringStruct); dest = (StringStruct *)malloc(destSize); if (dest != NULL) { if (status == U_BUFFER_OVERFLOW_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) { ucnv_toUChars(cnv, dest->str, numUChars+1, src, -1, &status); } else if (status == U_ZERO_ERROR) { u_strcpy(dest->str, stackBuf); } else { free(dest); dest = NULL; } } ucnv_reset(cnv); /* be good citizens */ ucnv_close(cnv); if (dest == NULL) { return NULL; } dest->link = (StringStruct*)(*freeHook); *freeHook = dest; return dest->str; }
void FStringConverter::ConvertString(const TCHAR* Source, const int32 SourceStartIndex, const int32 SourceLen, icu::UnicodeString& Destination, const bool ShouldNullTerminate) { if (SourceLen > 0) { UErrorCode ICUStatus = U_ZERO_ERROR; ucnv_reset(ICUConverter); // Get the internal buffer of the string, we're going to use it as scratch space const int32_t DestinationCapacityUChars = SourceLen * 2; UChar* InternalStringBuffer = Destination.getBuffer(DestinationCapacityUChars); // Perform the conversion into the string buffer const int32_t SourceSizeBytes = SourceLen * sizeof(TCHAR); const int32_t DestinationLength = ucnv_toUChars(ICUConverter, InternalStringBuffer, DestinationCapacityUChars, reinterpret_cast<const char*>(Source + SourceStartIndex), SourceSizeBytes, &ICUStatus); // Optionally null terminate the string if (ShouldNullTerminate) { InternalStringBuffer[DestinationLength] = 0; } // Size it back down to the correct size and release our lock on the string buffer Destination.releaseBuffer(DestinationLength); check(U_SUCCESS(ICUStatus)); } else { Destination.remove(); } }
/* params : desc : the document descriptor
 *          buf  : destination buffer for UTF-16 data; the converter is told
 *                 it may write up to 2*INTERNAL_BUFSIZE UChars
 * return : twice the number of UChars produced (i.e. the size in bytes)
 *          a non-positive value returned by getText, passed through
 *            unchanged (the original header names NO_MORE_DATA)
 *          ERR_ICU if the conversion fails or the scratch buffer cannot be
 *            allocated (no allocation-specific code is visible here)
 *
 * reads the next paragraph and converts to UTF-16
 */
int p_read_content(struct doc_descriptor *desc, UChar *buf) {
  char *outputbuf;
  int len;
  UErrorCode err;

  /* One spare, always-zero byte guarantees that strlen() below stops inside
   * the allocation even if getText fills all INTERNAL_BUFSIZE bytes. */
  outputbuf = (char *) calloc(INTERNAL_BUFSIZE + 1, 1);
  if (outputbuf == NULL) {
    fprintf(stderr, "Unable to allocate paragraph buffer\n");
    return ERR_ICU;
  }

  /* reading the next paragraph */
  len = getText(desc, outputbuf, INTERNAL_BUFSIZE);

  if (len > 0) {
    (desc->nb_par_read) += 1;

    /* converting to UTF-16 */
    err = U_ZERO_ERROR;
    len = 2 * ucnv_toUChars(desc->conv, buf, 2*INTERNAL_BUFSIZE,
                            outputbuf, (int32_t) strlen(outputbuf), &err);
    if (U_FAILURE(err)) {
      fprintf(stderr, "Unable to convert buffer\n");
      len = ERR_ICU;
    }
  }

  free(outputbuf);
  return len;
}
/**
 * Converts uiSourceLength bytes of cpacSource into pclTarget using the
 * converter held in iv_uconverter.
 *
 * Returns the value reported by ucnv_toUChars. A buffer overflow is tolerated
 * and simply reported through that value; any other conversion failure is
 * printed to cout and raised as a CodePageConversionException.
 */
size_t CodePage2UnicodeConverter::convertToUnicode(UChar * pclTarget, size_t uiTargetMaxLength, const char * cpacSource, size_t uiSourceLength)
/* ----------------------------------------------------------------------- */
{
  assert(iv_uconverter !=NULL);

  UErrorCode icuStatus = (UErrorCode)0;
  const size_t uiConverted = ucnv_toUChars(iv_uconverter, pclTarget, uiTargetMaxLength,
                                           cpacSource, uiSourceLength, &icuStatus);

  // Success and "target too small" both fall through to the normal return.
  const bool bTolerated = U_SUCCESS(icuStatus) || icuStatus == U_BUFFER_OVERFLOW_ERROR;
  if (!bTolerated) {
    cout << "ERROR: convertToUnicode " << icuStatus << endl;
    ///cerr << "CodePage2UnicodeConverter::getMaximumLength() rc= " << err << endl;
    ErrorMessage errMsg = ErrorMessage(UIMA_MSG_ID_CODEPAGE_CONV_ERROR);
    errMsg.addParam(icuStatus);
    UIMA_EXC_THROW_NEW(CodePageConversionException,
                       UIMA_ERR_CODEPAGE,
                       errMsg,
                       UIMA_MSG_ID_CODEPAGE_CONV_ERROR,
                       ErrorInfo::unrecoverable);
  }

  return uiConverted;
  //// return(uiTargetSize / sizeof(UChar)); /* as characters */
}
/**
 * Converts `len` bytes of `name` from the encoding of `conv` to UTF-8.
 *
 * @param conv  converter describing the source encoding
 * @param name  source bytes
 * @param len   number of source bytes (-1 lets ICU treat `name` as
 *              NUL-terminated)
 * @return a NUL-terminated buffer allocated with new[]; the caller owns it and
 *         releases it with delete[]. If either conversion step fails the
 *         returned string is empty.
 */
static char* convertToUtf8(UConverter* conv, const unsigned char* name, int len)
{
    const char* source = reinterpret_cast<const char*>(name);

    // Preflight: ask ICU how many UTF-16 code units the text needs instead of
    // guessing. Overflow (or "not terminated" for an empty result) is the
    // expected outcome of a zero-capacity call.
    UErrorCode status = U_ZERO_ERROR;
    int32_t utf16Length = ucnv_toUChars(conv, NULL, 0, source, len, &status);
    if (status == U_BUFFER_OVERFLOW_ERROR) {
        status = U_ZERO_ERROR;
    }
    if (U_FAILURE(status) || utf16Length < 0) {
        utf16Length = 0;
    }

    UChar* utf16 = new UChar[utf16Length + 1];
    if (U_SUCCESS(status)) {
        status = U_ZERO_ERROR;
        // Capacity is expressed in UChars, matching the allocation above.
        utf16Length = ucnv_toUChars(conv, utf16, utf16Length + 1, source, len, &status);
    }

    // A UTF-16 code unit never needs more than 3 bytes in UTF-8.
    const int32_t utf8Capacity = 3 * utf16Length + 1;
    char* utf8 = new char[utf8Capacity];

    int32_t utf8Length = ucnv_fromUChars(utf8Conv, utf8, utf8Capacity, utf16, utf16Length, &status);
    if (U_FAILURE(status) || utf8Length < 0 || utf8Length >= utf8Capacity) {
        utf8Length = 0;
    }
    utf8[utf8Length] = 0;

    delete[] utf16;
    return utf8;
}
/**
 * Sample 12: converts a short, NUL-terminated Shift-JIS byte string to
 * Unicode with the one-shot ucnv_toUChars() call and prints both the source
 * bytes and the resulting UChars. Always returns U_ZERO_ERROR.
 */
UErrorCode convsample_12()
{
  printf("\n\n==============================================\n"
         "Sample 12: C: simple sjis -> unicode conversion\n");


  // **************************** START SAMPLE *******************

  char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
  UChar target[100];
  UErrorCode status = U_ZERO_ERROR;

  // set up the converter
  UConverter *conv = ucnv_open("shift_jis", &status);
  assert(U_SUCCESS(status));

  // convert to Unicode
  // Note: we can use strlen, we know it's an 8 bit null terminated codepage
  target[6] = 0xFDCA;
  int32_t len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status);
  U_ASSERT(status);

  // close the converter
  ucnv_close(conv);

  // ***************************** END SAMPLE ********************

  // Print it out
  printBytes("src", source, strlen(source) );
  printf("\n");
  printUChars("targ", target, len);

  return U_ZERO_ERROR;
}
/**
 * Converts the raw input held by textIn to UTF-16, using a converter opened
 * for the charset name returned by getName().
 *
 * @param buf    destination buffer
 * @param cap    capacity of buf, in UChars
 * @param status ICU error code, passed to every ICU call made here
 * @return the value returned by ucnv_toUChars
 */
int32_t CharsetMatch::getUChars(UChar *buf, int32_t cap, UErrorCode *status) const
{
    UConverter *converter = ucnv_open(getName(), status);

    const char *rawBytes = (const char *) textIn->fRawInput;
    int32_t convertedLength =
        ucnv_toUChars(converter, buf, cap, rawBytes, textIn->fRawLength, status);

    ucnv_close(converter);

    return convertedLength;
}
String u16(const char *u8, int32_t u8_len, UErrorCode &error) { error = U_ZERO_ERROR; int32_t outlen = ucnv_toUChars(s_intl_request->utf8(), nullptr, 0, u8, u8_len, &error); if (error != U_BUFFER_OVERFLOW_ERROR) { return uninit_null(); } String ret = String(sizeof(UChar) * (outlen + 1), ReserveString); UChar *out = (UChar*)ret->mutableData(); error = U_ZERO_ERROR; outlen = ucnv_toUChars(s_intl_request->utf8(), out, outlen + 1, u8, u8_len, &error); if (U_FAILURE(error)) { return uninit_null(); } ret.setSize(outlen * sizeof(UChar)); return ret; }
size_t StringCharsetConverter::getConvertedLengthToUTF16(const char *str, size_t length) { UErrorCode err = U_ZERO_ERROR; int32_t size = ucnv_toUChars(cv, nullptr, 0, str, static_cast<int32_t>(length), &err); if(err != U_ZERO_ERROR && err != U_BUFFER_OVERFLOW_ERROR && err != U_STRING_NOT_TERMINATED_WARNING) return static_cast<size_t>(-1); if(length == numeric_limits<size_t>::max()) ++ size; // Size must includes null character. return static_cast<size_t>(size); }
/*
 * Round-trips one string through BOCU-1 with both the reference
 * implementation (writeString/readString) and the ICU converter, and logs an
 * error at the first point where the two disagree with each other or with the
 * original text.
 */
static void
roundtripBOCU1(UConverter *bocu1, int32_t number, const UChar *text, int32_t length) {
    UErrorCode errorCode=U_ZERO_ERROR;
    int32_t refBytesLength, icuBytesLength, refTextLength, icuTextLength;

    /* one encoded-bytes buffer and one decoded-text buffer per implementation */
    UChar *refText=malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
    UChar *icuText=malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
    char *refBytes=malloc(DEFAULT_BUFFER_SIZE);
    char *icuBytes=malloc(DEFAULT_BUFFER_SIZE);

    /* single-pass block: every failed check leaves it and falls into the cleanup */
    do {
        /* Unicode -> BOCU-1 */
        refBytesLength=writeString(text, length, (uint8_t *)refBytes);

        icuBytesLength=ucnv_fromUChars(bocu1, icuBytes, DEFAULT_BUFFER_SIZE, text, length, &errorCode);
        if(U_FAILURE(errorCode)) {
            log_err("ucnv_fromUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
            break;
        }

        if(refBytesLength!=icuBytesLength || 0!=uprv_memcmp(refBytes, icuBytes, refBytesLength)) {
            log_err("Unicode(%d)[%d] -> BOCU-1: reference[%d]!=ICU[%d]\n", number, length, refBytesLength, icuBytesLength);
            break;
        }

        /* BOCU-1 -> Unicode */
        refTextLength=readString((uint8_t *)refBytes, refBytesLength, refText);
        if(refTextLength<0) {
            break; /* readString() found an error and reported it */
        }

        icuTextLength=ucnv_toUChars(bocu1, icuText, DEFAULT_BUFFER_SIZE, icuBytes, icuBytesLength, &errorCode);
        if(U_FAILURE(errorCode)) {
            log_err("ucnv_toUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
            break;
        }

        if(length!=refTextLength || 0!=u_memcmp(text, refText, length)) {
            log_err("BOCU-1 -> Unicode: original(%d)[%d]!=reference[%d]\n", number, length, refTextLength);
            break;
        }

        if(refTextLength!=icuTextLength || 0!=u_memcmp(refText, icuText, refTextLength)) {
            log_err("BOCU-1 -> Unicode: reference(%d)[%d]!=ICU[%d]\n", number, refTextLength, icuTextLength);
            break;
        }
    } while(0);

    free(refText);
    free(icuText);
    free(refBytes);
    free(icuBytes);
}
/**
 * Converts a UTF-8 string to a NUL-terminated UTF-16 buffer stored in m_utf16.
 *
 * @param src     UTF-8 source text
 * @param length  number of bytes in src; 0 means "use strlen(src)"
 *
 * ICU failures are reported through CHECK_ICU. Empty input produces an empty,
 * terminated buffer rather than leaving m_utf16 unset.
 */
FromUTF8::FromUTF8(Char const* src, UInt length)
{
    UErrorCode err = U_ZERO_ERROR;
    UConverter *conv = ucnv_open("utf8", &err);
    CHECK_ICU(err);
    ON_BLOCK_EXIT(ucnv_close, conv);

    if (!length)
        length = static_cast<UInt>(strlen(src));

    // Preflight with a zero-capacity target to learn the required length.
    // Overflow is the expected answer for non-empty output; for empty output
    // ICU reports a "not terminated" warning instead.
    int32_t utf16_len = ucnv_toUChars(conv, 0, 0, src, length, &err);
    if (err == U_BUFFER_OVERFLOW_ERROR)
        err = U_ZERO_ERROR;
    // Anything still failing here is a real conversion error that used to be
    // ignored silently.
    CHECK_ICU(err);
    if (utf16_len < 0)
        utf16_len = 0;

    m_utf16.reset(new UChar[utf16_len + 1]);
    if (utf16_len > 0)
    {
        err = U_ZERO_ERROR;
        // The capacity includes the terminator slot, so ICU terminates the
        // string itself and raises no warning.
        ucnv_toUChars(conv, m_utf16.get(), utf16_len + 1, src, length, &err);
        CHECK_ICU(err);
    }
    m_utf16[utf16_len] = 0;
}
size_t StringCharsetConverter::convertToUTF16(wchar_t *toString, size_t toBufferLength, const char *fromString, size_t fromLength) { UErrorCode err = U_ZERO_ERROR; int32_t size = ucnv_toUChars(cv, toString, static_cast<int32_t>(toBufferLength), fromString, static_cast<int32_t>(fromLength), &err); if(err != U_ZERO_ERROR && err != U_BUFFER_OVERFLOW_ERROR && err != U_STRING_NOT_TERMINATED_WARNING) return static_cast<size_t>(-1); if(fromLength == numeric_limits<size_t>::max()) ++ size; // Size must includes null character. if(size > 0 && toString[0] == L'\uFEFF') // remove byte order mark memmove(toString, toString + 1, (-- size) * sizeof(wchar_t)); return static_cast<size_t>(size); }
int initTxt(struct doc_descriptor *desc) { UErrorCode err; char *encoding = NULL; int len, BOMlength = 0; char buf[BUFSIZE]; UChar outbuf[4*BUFSIZE]; lseek(desc->fd, 0, SEEK_SET); len = read(desc->fd, buf, BUFSIZE); /* detect BOM */ err = U_ZERO_ERROR; encoding = ucnv_detectUnicodeSignature(buf, BUFSIZE, &BOMlength, &err); if(encoding != NULL) { lseek(desc->fd, BOMlength, SEEK_SET); /* initialize converter to encoding */ err = U_ZERO_ERROR; desc->conv = ucnv_open(encoding, &err); if (U_FAILURE(err)) { fprintf(stderr, "unable to open ICU converter\n"); return ERR_ICU; } } else { /* initialize converter to UTF-8 */ err = U_ZERO_ERROR; desc->conv = ucnv_open("utf8", &err); if (U_FAILURE(err)) { fprintf(stderr, "unable to open ICU converter\n"); return ERR_ICU; } /* check the first 2048 bytes */ err = U_ZERO_ERROR; ucnv_setToUCallBack(desc->conv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &err); if (U_FAILURE(err)) { fprintf(stderr, "error setToUCallback\n"); return ERR_ICU; } err = U_ZERO_ERROR; ucnv_toUChars(desc->conv, outbuf, 4 * BUFSIZE, buf, len, &err); if (U_FAILURE(err)) { fprintf(stderr, "Unknown encoding\n"); return ERR_ICU; } lseek(desc->fd, 0, SEEK_SET); } return OK; }
/*---------------------------------------------------------------------------------------------- This method uses an ICU converter to convert a string from UTF-8 to UTF-16. Assumptions: <text> Exit conditions: <text> Parameters: <text> Return value: The number of characters required to store the fully-converted string (which may be greater than targetLen) ----------------------------------------------------------------------------------------------*/ int UnicodeConverter::Convert(const char* source, int sourceLen, UChar* target, int targetLen) { UErrorCode status = U_ZERO_ERROR; int spaceRequiredForData = ucnv_toUChars(m_converter, target, targetLen, source, sourceLen, &status); if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) { TRACE("Unable to convert from UTF-8 to UTF-16 (" << status << ")\n"); //throw std::runtime_error("Unable to convert from UTF-8 to UTF-16"); } return spaceRequiredForData; }
/**
 * Convert from an existing encoding to UTF-16
 * @param src the data read from the file
 * @param srclen the length of src in bytes
 * @param dst the destination buffer
 * @param dstlen the length of the destination in UChars
 * @param charset the charset of the src
 * @return the number of BYTES written; 0 if the converter cannot be opened.
 * If dst is too small the result is capped at the bytes that fit in dst.
 */
int convert_from_encoding( char *src, int srclen, UChar *dst, int dstlen,
    char *charset )
{
    UErrorCode status = U_ZERO_ERROR;
    int32_t len = 0;
    UConverter *conv = ucnv_open( charset, &status );

    /* ucnv_open can succeed with a warning (e.g. an ambiguous alias); the
     * converter is open in that case and must be used and closed. */
    if ( U_FAILURE(status) || conv == NULL )
    {
        fprintf(stderr,"encoding: %s\n",u_errorName(status));
        return 0;
    }

    status = U_ZERO_ERROR;
    len = ucnv_toUChars( conv, dst, dstlen, src, srclen, &status );
    if ( status != U_ZERO_ERROR )
        fprintf(stderr,"encoding: %s\n",u_errorName(status));

    /* On overflow ICU returns the length that would have been needed, not
     * the amount stored. */
    if ( status == U_BUFFER_OVERFLOW_ERROR && len > dstlen )
        len = dstlen;

    len *= sizeof(UChar);
    ucnv_close(conv);
    return len;
}
Bool CodeSet_Validate(const char *buf, // IN: the string size_t size, // IN: length of string const char *code) // IN: encoding { #if defined(NO_ICU) return CodeSetOld_Validate(buf, size, code); #else UConverter *cv; UErrorCode uerr; // ucnv_toUChars takes 32-bit int size ASSERT_NOT_IMPLEMENTED(size <= (size_t) MAX_INT32); if (size == 0) { return TRUE; } /* * Fallback if necessary. */ if (dontUseIcu) { return CodeSetOld_Validate(buf, size, code); } /* * Calling ucnv_toUChars() this way is the idiom to precompute * the length of the output. (See preflighting in the ICU User Guide.) * So if the error is not U_BUFFER_OVERFLOW_ERROR, then the input * is bad. */ uerr = U_ZERO_ERROR; cv = ucnv_open(code, &uerr); ASSERT_NOT_IMPLEMENTED(uerr == U_ZERO_ERROR); ucnv_setToUCallBack(cv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &uerr); ASSERT_NOT_IMPLEMENTED(uerr == U_ZERO_ERROR); ucnv_toUChars(cv, NULL, 0, buf, size, &uerr); ucnv_close(cv); return uerr == U_BUFFER_OVERFLOW_ERROR; #endif }
/** * How many bytes are needed to convert from an encoding to utf16? * @param src the source in the encoding * @param srclen its length in bytes * @param encoding the src's encoding * @return the number of UCHARS needed */ int measure_from_encoding( char *src, size_t srclen, char *encoding ) { UConverter *conv = NULL; UErrorCode status = U_ZERO_ERROR; int32_t len=0; conv = ucnv_open( encoding, &status ); if ( status == U_ZERO_ERROR ) { len = ucnv_toUChars( conv, NULL, 0, src, srclen, &status ); if ( status != U_BUFFER_OVERFLOW_ERROR ) { printf("encoding: %s\n",u_errorName(status)); len = 0; } ucnv_close(conv); } return len; }
/*
 * Reports whether the converter can round-trip the euro sign (U+20AC):
 * the character is converted to the converter's charset and back, and the
 * result must be exactly the original single character.
 *
 * Returns TRUE on a clean round trip, FALSE otherwise (conversion failures
 * are also logged).
 */
UBool isEuroAware(UConverter* myConv)
{
    static const UChar euroString[2] = { 0x20AC, 0x0000 };
    char target[20];
    UChar euroBack[2];
    int32_t targetSize, euroBackSize;
    UErrorCode err = U_ZERO_ERROR;
    /*const char* myName = ucnv_getName(myConv, &err);*/

    targetSize = ucnv_fromUChars(myConv,
            target,
            sizeof(target),
            euroString,
            -1,
            &err);
    if (U_FAILURE(err))
    {
        log_err("Failure Occured in ucnv_fromUChars euro roundtrip test\n");
        return FALSE;
    }

    euroBackSize = ucnv_toUChars(myConv,
            euroBack,
            2,
            target,
            targetSize,
            &err);
    if (U_FAILURE(err))
    {
        log_err("Failure Occured in ucnv_toUChars euro roundtrip test\n");
        return FALSE;
    }

    /*
     * Compare by length and first code unit instead of u_strcmp(): if the
     * converter produced two UChars, euroBack is full and not terminated
     * (ICU only raises a warning for that), and a string compare would read
     * past the end of the array.
     */
    if (euroBackSize != 1 || euroBack[0] != euroString[0])
    {
        /*      log_err("%s FAILED Euro rountrip\n", myName);*/
        return FALSE;
    }

    /*      log_verbose("%s PASSED Euro rountrip\n", myName);*/
    return TRUE;
}
// Requires free() of returned UTF16Chars. void convertUTF8ToUTF16(const NPUTF8 *UTF8Chars, int UTF8Length, NPUTF16 **UTF16Chars, unsigned int *UTF16Length) { #if USE(ICU_UNICODE) assert(UTF8Chars || UTF8Length == 0); assert(UTF16Chars); if (UTF8Length == -1) UTF8Length = static_cast<int>(strlen(UTF8Chars)); // UTF16Length maximum length is the length of the UTF8 string, plus one to include terminator // Without the plus one, it will convert ok, but a warning is generated from the converter as // there is not enough room for a terminating character. *UTF16Length = UTF8Length + 1; *UTF16Chars = 0; UErrorCode status = U_ZERO_ERROR; UConverter* conv = ucnv_open("utf8", &status); if (U_SUCCESS(status)) { *UTF16Chars = (NPUTF16 *)malloc(sizeof(NPUTF16) * (*UTF16Length)); ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_STOP, 0, 0, 0, &status); *UTF16Length = ucnv_toUChars(conv, (::UChar*)*UTF16Chars, *UTF16Length, UTF8Chars, UTF8Length, &status); ucnv_close(conv); } // Check to see if the conversion was successful // Some plugins return invalid UTF-8 in NPVariantType_String, see <http://bugs.webkit.org/show_bug.cgi?id=5163> // There is no "bad data" for latin1. It is unlikely that the plugin was really sending text in this encoding, // but it should have used UTF-8, and now we are simply avoiding a crash. if (!U_SUCCESS(status)) { *UTF16Length = UTF8Length; if (!*UTF16Chars) // If the memory wasn't allocated, allocate it. *UTF16Chars = (NPUTF16 *)malloc(sizeof(NPUTF16) * (*UTF16Length)); for (unsigned i = 0; i < *UTF16Length; i++) (*UTF16Chars)[i] = UTF8Chars[i] & 0xFF; } #else assert(!"Implement me!"); #endif }
/*
 * Copies the NUL-terminated char string s2 into ucs1, converting it with the
 * default converter. If the converter cannot be obtained or the conversion
 * fails, ucs1 is set to the empty string. Returns ucs1.
 */
U_CAPI UChar* U_EXPORT2
u_uastrcpy(UChar *ucs1,
           const char *s2 )
{
    UErrorCode status = U_ZERO_ERROR;
    UConverter *defaultCnv = u_getDefaultConverter(&status);
    int converted = 0;

    if(U_SUCCESS(status) && defaultCnv != NULL) {
        ucnv_toUChars(defaultCnv,
                      ucs1,
                      MAX_STRLEN,
                      s2,
                      (int32_t)uprv_strlen(s2),
                      &status);
        u_releaseDefaultConverter(defaultCnv);
        converted = !U_FAILURE(status);
    }

    /* both failure paths end with an empty result */
    if(!converted) {
        *ucs1 = 0;
    }
    return ucs1;
}
/**
 * Applies Arabic letter shaping (and European-to-Arabic digit replacement) to
 * `text` in place: the text is converted to UTF-16, shaped with
 * u_shapeArabic, and converted back through the same converter.
 *
 * @return 0 on success; -1 when called for (de)ciphering or when an ICU step
 *         fails. If the first conversion or the shaping step fails, `text` is
 *         left untouched.
 */
char UTF8arShaping::processText(SWBuf &text, const SWKey *key, const SWModule *module)
{
	if ((unsigned long)key < 2)	// hack, we're en(1)/de(0)ciphering
		return -1;

	// ICU calls are no-ops while the status holds a failure, so a stale error
	// from an earlier call must not leak into this one.
	err = U_ZERO_ERROR;

	const int32_t capacity = text.length();
	UChar *ustr = new UChar[capacity];
	UChar *ustr2 = new UChar[capacity];
	char result = -1;

	// Convert UTF-8 string to UTF-16 (UChars)
	int32_t len = ucnv_toUChars(conv, ustr, capacity, text.c_str(), -1, &err);

	if (U_SUCCESS(err)) {
		len = u_shapeArabic(ustr, len, ustr2, len, U_SHAPE_LETTERS_SHAPE | U_SHAPE_DIGITS_EN2AN, &err);
	}

	if (U_SUCCESS(err)) {
		// From here on the buffer of `text` is overwritten.
		text.setSize(text.size()*2);
		len = ucnv_fromUChars(conv, text.getRawData(), text.size(), ustr2, len, &err);
		if (U_SUCCESS(err)) {
			text.setSize(len);
			result = 0;
		}
		else {
			// The buffer content is unusable; do not expose it.
			text.setSize(0);
		}
	}

	delete [] ustr2;
	delete [] ustr;
	return result;
}
/**
 * Replaces `text` with its NFKD (compatibility decomposition) form: the text
 * is converted to UTF-16, normalized with unorm_normalize, and converted back
 * through the same converter.
 *
 * @return 0 on success; -1 when called for (de)ciphering or when an ICU step
 *         fails. If the first conversion or the normalization fails, `text`
 *         is left untouched.
 */
char UTF8NFKD::processText(SWBuf &text, const SWKey *key, const SWModule *module)
{
	if ((unsigned long)key < 2)	// hack, we're en(1)/de(0)ciphering
		return -1;

	// ICU calls are no-ops while the status holds a failure, so a stale error
	// from an earlier call must not leak into this one.
	err = U_ZERO_ERROR;

	int32_t len = 5 + text.length() * 5;
	source = new UChar[len + 1]; //each char could become a surrogate pair
	target = new UChar[len + 1];
	char result = -1;

	// Convert UTF-8 string to UTF-16 (UChars)
	const int32_t sourceLength = ucnv_toUChars(conv, source, len, text.c_str(), -1, &err);
	int32_t ulen = sourceLength;

	//compatability decomposition
	if (U_SUCCESS(err)) {
		ulen = unorm_normalize(source, sourceLength, UNORM_NFKD, 0, target, len, &err);
		if (err == U_BUFFER_OVERFLOW_ERROR) {
			// Some characters decompose into many more code units than the
			// initial estimate allows; ICU reported the size it needs, so
			// retry once with exactly that much room.
			delete [] target;
			target = new UChar[ulen + 1];
			err = U_ZERO_ERROR;
			ulen = unorm_normalize(source, sourceLength, UNORM_NFKD, 0, target, ulen + 1, &err);
		}
	}

	if (U_SUCCESS(err)) {
		// A UTF-16 code unit needs at most 3 bytes in UTF-8; never offer less
		// room than the original estimate did.
		int32_t outCapacity = 3 * ulen + 1;
		if (outCapacity < len)
			outCapacity = len;

		// From here on the buffer of `text` is overwritten.
		text.setSize(outCapacity);
		len = ucnv_fromUChars(conv, text.getRawData(), outCapacity, target, ulen, &err);
		if (U_SUCCESS(err)) {
			text.setSize(len);
			result = 0;
		}
		else {
			// The buffer content is unusable; do not expose it.
			text.setSize(0);
		}
	}

	delete [] source;
	delete [] target;

	return result;
}
/*
 * Extracts the next paragraph of visible text from the HTML stream behind
 * desc->fd and stores it in buf as UChars.
 *
 * params : desc : the document descriptor (uses desc->fd, desc->conv and
 *                 appends to the desc->meta list)
 *          buf  : destination buffer for the extracted text
 *          size : compared against 2*l and used as buf + size/2, i.e. it is
 *                 treated as the capacity of buf in bytes
 * return : 2*l, where l is the number of UChars stored in buf
 *          NO_MORE_DATA if nothing was stored and the last read returned 0
 *          ERR_ICU if a converter call fails
 *
 * The input is read in chunks of BUFSIZE bytes into buf2 and scanned byte by
 * byte:
 *  - everything from "<script" up to "</script>" is skipped;
 *  - "<title" creates a metadata entry named "title" whose value is the text
 *    up to the next '<';
 *  - "<meta" with name="..." and content="..." is appended to desc->meta;
 *  - any other markup is dropped, and the first tag closed after some text
 *    has been stored contributes a single space;
 *  - '&' sequences are decoded through escapeChar();
 *  - tabs, carriage returns and newlines become a space, and consecutive
 *    spaces are collapsed (tracked by space_added);
 *  - every other byte is converted one at a time through desc->conv;
 *  - once some text has been stored, "<p", "<br" or "<div" ends the
 *    paragraph and the descriptor is repositioned just before the tag's '>'
 *    so that the next call continues from there.
 * When a '<' or '&' appears within the last 8 bytes of a chunk, the tail is
 * moved to the start of buf2 and refilled before parsing continues
 * (dangerousCut), so that a tag or entity is not split across reads.
 *
 * NOTE(review): the refill checks inside the metadata loops compare i with
 * size - 9 (the output size) rather than with len or BUFSIZE - confirm this
 * is intended.
 * NOTE(review): meta->name for "title" is allocated with 12 bytes while the
 * converter is given a capacity of 12 UChars; "title" plus terminator fits
 * exactly, but the capacity argument overstates the allocation.
 * NOTE(review): strncpy is used to move data within buf2 (overlapping
 * source and destination) - memmove looks like the intended operation.
 * NOTE(review): name[] and value[] (1024 UChars each) are indexed by j
 * without an explicit bound in the escape branches - verify against inputs.
 */
int getText (struct doc_descriptor *desc, UChar * buf, int size) { struct meta *meta = NULL; char buf2[BUFSIZE]; UErrorCode err; char *src; UChar *dest, esc[3]; UChar name[1024], value[1024]; int len, i, isMarkup, isJavascript, isMeta, l, j; int dangerousCut, fini, r, offset, endOfFile, space_added; space_added = 0; l = 0; fini = 0; endOfFile = 0; isJavascript = 0; dangerousCut = 0; isMarkup = 0; isMeta = 0; len = read (desc->fd, buf2, BUFSIZE); while (!fini && len > 0 && 2*l < size - 2) { /* consuming buffer */ for (i = 0; 2*l < size - 2 && i < len && !dangerousCut && !fini; i++) { /* end of buffer are possible points of failure if a markup or a token is cut, it will not be parsed. */ if (!endOfFile && i > len - 9 && (!strncmp (buf2 + i, "\x3c", 1) || !strncmp (buf2 + i, "\x26", 1))) { dangerousCut = 1; break; } /* detecting end of javascript */ if (isJavascript && !strncasecmp (buf2 + i, "</script>", 9)) { isJavascript = 0; i += 9; } /* detecting new paragraph */ if (l > 0 && !isJavascript && (!strncasecmp (buf2 + i, "<p", 2) || !strncasecmp (buf2 + i, "<br", 3) || !strncasecmp (buf2 + i, "<div", 4))) { fini = 1; i += 2; while (strncmp (buf2 + i, ">", 1)) { i++; } lseek (desc->fd, i - len, SEEK_CUR); break; } /* detecting begining of markup */ if (!isJavascript && !isMarkup && !strncmp (buf2 + i, "\x3c", 1)) { /* detecting begining of javascript */ if (!strncasecmp (buf2 + i, "<script", 7)) { isJavascript = 1; } else if (!strncasecmp (buf2 + i, "<title", 6)) { err = U_ZERO_ERROR; /* finding last metadata of desc */ if (desc->meta == NULL) { meta = (struct meta *) malloc (sizeof (struct meta)); desc->meta = meta; } else { meta = desc->meta; while (meta->next != NULL) { meta = meta->next; } meta->next = (struct meta *) malloc (sizeof (struct meta)); meta = meta->next; } meta->next = NULL; meta->name = (UChar *) malloc (12); /* filling name field */ meta->name_length = 2 * ucnv_toUChars (desc->conv, meta->name, 12, "title", 5, &err); meta->name_length = u_strlen 
(meta->name); if (U_FAILURE (err)) { printf ("error icu\n"); return ERR_ICU; } isMeta = 1; } else if (!strncasecmp (buf2 + i, "<meta", 5)) { i += 5; if (i >= size - 9) { strncpy (buf2, buf2 + i, len - i); len = read (desc->fd, buf2 + i, BUFSIZE - len + i) + len - i; i = 0; } for (; strncasecmp (buf2 + i, "name=\"", 6) && strncmp (buf2 + i, "\x3E", 1); i++) { if (i >= size - 9) { strncpy (buf2, buf2 + i, len - i); len = read (desc->fd, buf2 + i, BUFSIZE - len + i) + len - i; i = 0; } } if (!strncmp (buf2 + i, "\x3E", 1)) { continue; } else { i += 6; /* get metadata name */ memset (name, '\x00', 2048); for (j = 0; len != 0 && strncmp (buf2 + i, "\"", 1); i++) { if (i >= size - 9) { strncpy (buf2, buf2 + i, len - i); len = read (desc->fd, buf2 + i, BUFSIZE - len + i) + len - i; i = 0; } if (!strncmp (buf2 + i, "\x26", 1)) { memset (esc, '\x00', 6); offset = escapeChar (desc, buf2 + i, esc); memcpy (name + j, esc, 2 * u_strlen (esc)); j += u_strlen (esc); i += (offset - 1); } else { /* filling name buffer */ dest = name + j; src = buf2 + i; err = U_ZERO_ERROR; ucnv_toUnicode (desc->conv, &dest, name + 1024, &src, buf2 + i + 1, NULL, FALSE, &err); if (U_FAILURE (err)) { fprintf (stderr, "Unable to convert buffer\n"); return ERR_ICU; } j += (dest - name - j); } } /* get metadata value */ for (; strncasecmp (buf2 + i, "content=\"", 9) && strncmp (buf2 + i, "\x3E", 1); i++) { if (i >= size - 9) { strncpy (buf2, buf2 + i, len - i); len = read (desc->fd, buf2 + i, BUFSIZE - len + i) + len - i; i = 0; } } i += 9; if (i >= size - 9) { strncpy (buf2, buf2 + i, len - i); len = read (desc->fd, buf2 + i, BUFSIZE - len + i) + len - i; i = 0; } memset (value, '\x00', 2048); for (j = 0; len != 0 && strncmp (buf2 + i, "\"", 1); i++) { if (i >= size - 9) { strncpy (buf2, buf2 + i, len - i); len = read (desc->fd, buf2 + i, BUFSIZE - len + i) + len - i; i = 0; } if (!strncmp (buf2 + i, "\x26", 1)) { memset (esc, '\x00', 6); offset = escapeChar (desc, buf2 + i, esc); memcpy (value + j, 
esc, 2 * u_strlen (esc)); j += u_strlen (esc); i += (offset - 1); } else { /* filling value buffer */ dest = value + j; src = buf2 + i; err = U_ZERO_ERROR; ucnv_toUnicode (desc->conv, &dest, value + 1024, &src, buf2 + i + 1, NULL, FALSE, &err); if (U_FAILURE (err)) { fprintf (stderr, "Unable to convert buffer\n"); return ERR_ICU; } j += (dest - value - j); } } /* insert metadata in list */ if (desc->meta == NULL) { meta = (struct meta *) malloc (sizeof (struct meta)); desc->meta = meta; } else { meta = desc->meta; while (meta->next != NULL) { meta = meta->next; } meta->next = (struct meta *) malloc (sizeof (struct meta)); meta = meta->next; } meta->next = NULL; meta->name = (UChar *) malloc (2 * u_strlen (name) + 2); meta->value = (UChar *) malloc (2 * u_strlen (value) + 2); memset (meta->name, '\x00', 2 * u_strlen (name) + 2); memset (meta->value, '\x00', 2 * u_strlen (value) + 2); memcpy (meta->name, name, 2 * u_strlen (name)); memcpy (meta->value, value, 2 * u_strlen (value)); meta->name_length = u_strlen (name); meta->value_length = u_strlen (value); for (; strncmp (buf2 + i, "\x3E", 1); i++) { if (i >= size - 9) { strncpy (buf2, buf2 + i, len - i); len = read (desc->fd, buf2 + i, BUFSIZE - len + i) + len - i; i = 0; } } continue; } } else { isMarkup = 1; } } /* get metadata value */ if (!isJavascript && isMeta) { for (; len != 0 && strncmp (buf2 + i, "\x3E", 1); i++) { if (i >= size - 9) { strncpy (buf2, buf2 + i, len - i); len = read (desc->fd, buf2 + i, BUFSIZE - len + i) + len - i; i = 0; } } i++; memset (value, '\x00', 2048); for (j = 0; len != 0 && strncmp (buf2 + i, "\x3C", 1); i++) { if (i >= size - 9) { strncpy (buf2, buf2 + i, len - i); len = read (desc->fd, buf2 + i, BUFSIZE - len + i) + len - i; i = 0; } if (!strncmp (buf2 + i, "\x26", 1)) { memset (esc, '\x00', 6); offset = escapeChar (desc, buf2 + i, esc); memcpy (value + j, esc, 2 * u_strlen (esc)); j += u_strlen (esc); i += (offset - 1); } else { /* filling value buffer */ dest = value + j; src 
= buf2 + i; err = U_ZERO_ERROR; ucnv_toUnicode (desc->conv, &dest, value + 1024, &src, buf2 + i + 1, NULL, FALSE, &err); if (U_FAILURE (err)) { fprintf (stderr, "Unable to convert buffer\n"); return ERR_ICU; } j += (dest - value - j); } } meta->value = (UChar *) malloc (2 * (j + 1)); memcpy (meta->value, value, 2 * u_strlen (value)); meta->value_length = u_strlen (value); isMeta = 0; i += 7; continue; } /* detecting end of markup */ if (!isJavascript && isMarkup && !strncmp (buf2 + i, "\x3e", 1)) { if (!space_added && l > 0) { buf[l] = 0x20; l ++; space_added = 1; } isMarkup = 0; } /* handling text */ if (!isJavascript && !isMarkup && strncmp (buf2 + i, "\x3e", 1)) { if (strncmp (buf2 + i, "\n", 1) && strncmp (buf2 + i, "\t", 1) && strncmp (buf2 + i, "\r", 1)) { /* converting tokens */ if (!isJavascript && !isMarkup && !strncmp (buf2 + i, "\x26", 1)) { memset (esc, '\x00', 6); offset = escapeChar (desc, buf2 + i, esc); if (memcmp (esc, "\x20\x00", u_strlen (esc))) { memcpy (buf + l, esc, 2 * u_strlen (esc)); l += u_strlen (esc); space_added = 0; } else { if (!space_added){ buf[l] = 0x20; space_added = 1; l++; } } i += (offset - 1); } else { if (buf2[i] != 0x20 || !space_added){ /* filling output buffer */ dest = buf + l; src = buf2 + i; err = U_ZERO_ERROR; ucnv_toUnicode (desc->conv, &dest, buf + size / 2, &src, buf2 + i + 1, NULL, FALSE, &err); if (U_FAILURE (err)) { fprintf (stderr, "Unable to convert buffer\n"); return ERR_ICU; } l += (dest - buf - l); if (buf2[i] == 0x20) {space_added = 1;} else {space_added=0;} } } } else { /* replace tabs and eol by spaces */ if (!space_added){ buf[l] = 0x20; space_added = 1; l++; } } } } /* filling new buffer correctly */ if (!fini) { if (dangerousCut) { r = len - i; strncpy (buf2, buf2 + i, r); len = read (desc->fd, buf2 + r, BUFSIZE - r) + r; if (len < 9) { endOfFile = 1; } dangerousCut = 0; } else { len = read (desc->fd, buf2, BUFSIZE); } } } /* ending buffer properly */ if (l > 0) { buf[l] = 0x20; return 2*l; } if (len 
== 0) { return NO_MORE_DATA; } return 2*l; }
/* test invariant-character handling */ static void TestInvariant() { /* all invariant graphic chars and some control codes (not \n!) */ const char invariantChars[]= "\t\r \"%&'()*+,-./" "0123456789:;<=>?" "ABCDEFGHIJKLMNOPQRSTUVWXYZ_" "abcdefghijklmnopqrstuvwxyz"; const UChar invariantUChars[]={ 9, 0xd, 0x20, 0x22, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5f, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0 }; const char variantChars[]="\n!#$@[\\]^`{|}~"; const UChar variantUChars[]={ 0x0a, 0x21, 0x23, 0x24, 0x40, 0x5b, 0x5c, 0x5d, 0x5e, 0x60, 0x7b, 0x7c, 0x7d, 0x7e, 0 }; const UChar nonASCIIUChars[]={ 0x80, 0xa0, 0x900, 0xff51 }; UChar us[120]; char cs[120]; int32_t i, length; /* make sure that all invariant characters convert both ways */ length=sizeof(invariantChars); u_charsToUChars(invariantChars, us, length); if(u_strcmp(us, invariantUChars)!=0) { log_err("u_charsToUChars(invariantChars) failed\n"); } u_UCharsToChars(invariantUChars, cs, length); if(strcmp(cs, invariantChars)!=0) { log_err("u_UCharsToChars(invariantUChars) failed\n"); } /* * make sure that variant characters convert from source code literals to Unicode * but not back to char * */ length=sizeof(variantChars); u_charsToUChars(variantChars, us, length); if(u_strcmp(us, variantUChars)!=0) { log_err("u_charsToUChars(variantChars) failed\n"); } #ifdef NDEBUG /* * Test u_UCharsToChars(variantUChars) only in release mode because it will * cause an assertion failure in debug builds. 
*/ u_UCharsToChars(variantUChars, cs, length); for(i=0; i<length; ++i) { if(cs[i]!=0) { log_err("u_UCharsToChars(variantUChars) converted the %d-th character to %02x instead of 00\n", i, cs[i]); } } #endif /* * Verify that invariant characters roundtrip from Unicode to the * default converter and back. */ { UConverter *cnv; UErrorCode errorCode; errorCode=U_ZERO_ERROR; cnv=ucnv_open(NULL, &errorCode); if(U_FAILURE(errorCode)) { log_err("unable to open the default converter\n"); } else { length=ucnv_fromUChars(cnv, cs, sizeof(cs), invariantUChars, -1, &errorCode); if(U_FAILURE(errorCode)) { log_err("ucnv_fromUChars(invariantUChars) failed - %s\n", u_errorName(errorCode)); } else if(length!=sizeof(invariantChars)-1 || strcmp(cs, invariantChars)!=0) { log_err("ucnv_fromUChars(invariantUChars) failed\n"); } errorCode=U_ZERO_ERROR; length=ucnv_toUChars(cnv, us, LENGTHOF(us), invariantChars, -1, &errorCode); if(U_FAILURE(errorCode)) { log_err("ucnv_toUChars(invariantChars) failed - %s\n", u_errorName(errorCode)); } else if(length!=LENGTHOF(invariantUChars)-1 || u_strcmp(us, invariantUChars)!=0) { log_err("ucnv_toUChars(invariantChars) failed\n"); } ucnv_close(cnv); } } /* API tests */ if(!uprv_isInvariantString(invariantChars, -1)) { log_err("uprv_isInvariantString(invariantChars) failed\n"); } if(!uprv_isInvariantUString(invariantUChars, -1)) { log_err("uprv_isInvariantUString(invariantUChars) failed\n"); } if(!uprv_isInvariantString(invariantChars+strlen(invariantChars), 1)) { log_err("uprv_isInvariantString(\"\\0\") failed\n"); } for(i=0; i<(sizeof(variantChars)-1); ++i) { if(uprv_isInvariantString(variantChars+i, 1)) { log_err("uprv_isInvariantString(variantChars[%d]) failed\n", i); } if(uprv_isInvariantUString(variantUChars+i, 1)) { log_err("uprv_isInvariantUString(variantUChars[%d]) failed\n", i); } } for(i=0; i<LENGTHOF(nonASCIIUChars); ++i) { if(uprv_isInvariantUString(nonASCIIUChars+i, 1)) { log_err("uprv_isInvariantUString(nonASCIIUChars[%d]) failed\n", i); } } 
}
//---------------------------------------------------------------------------- // // main for genctd // //---------------------------------------------------------------------------- int main(int argc, char **argv) { UErrorCode status = U_ZERO_ERROR; const char *wordFileName; const char *outFileName; const char *outDir = NULL; const char *copyright = NULL; // // Pick up and check the command line arguments, // using the standard ICU tool utils option handling. // U_MAIN_INIT_ARGS(argc, argv); progName = argv[0]; argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); if(argc<0) { // Unrecognized option fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); } if(options[0].doesOccur || options[1].doesOccur) { // -? or -h for help. usageAndDie(0); } if (!options[3].doesOccur || argc < 2) { fprintf(stderr, "input and output file must both be specified.\n"); usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); } outFileName = options[3].value; wordFileName = argv[1]; if (options[4].doesOccur) { u_setDataDirectory(options[4].value); } status = U_ZERO_ERROR; /* Combine the directory with the file name */ if(options[5].doesOccur) { outDir = options[5].value; } if (options[6].doesOccur) { copyright = U_COPYRIGHT_STRING; } #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO UNewDataMemory *pData; char msg[1024]; /* write message with just the name */ sprintf(msg, "genctd writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName); fprintf(stderr, "%s\n", msg); /* write the dummy data file */ pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status); udata_writeBlock(pData, msg, strlen(msg)); udata_finish(pData, &status); return (int)status; #else /* Initialize ICU */ u_init(&status); if (U_FAILURE(status)) { fprintf(stderr, "%s: can not initialize ICU. 
status = %s\n", argv[0], u_errorName(status)); exit(1); } status = U_ZERO_ERROR; // // Read in the dictionary source file // long result; long wordFileSize; FILE *file; char *wordBufferC; MutableTrieDictionary *mtd = NULL; file = fopen(wordFileName, "rb"); if( file == 0 ) { //cannot find file //create 1-line dummy file: ie 1 char, 1 value UNewDataMemory *pData; char msg[1024]; /* write message with just the name */ sprintf(msg, "%s not found, genctd writes dummy %s", wordFileName, outFileName); fprintf(stderr, "%s\n", msg); UChar c = 0x0020; mtd = new MutableTrieDictionary(c, status, TRUE); mtd->addWord(&c, 1, status, 1); } else { //read words in from input file fseek(file, 0, SEEK_END); wordFileSize = ftell(file); fseek(file, 0, SEEK_SET); wordBufferC = new char[wordFileSize+10]; result = (long)fread(wordBufferC, 1, wordFileSize, file); if (result != wordFileSize) { fprintf(stderr, "Error reading file \"%s\"\n", wordFileName); exit (-1); } wordBufferC[wordFileSize]=0; fclose(file); // // Look for a Unicode Signature (BOM) on the word file // int32_t signatureLength; const char * wordSourceC = wordBufferC; const char* encoding = ucnv_detectUnicodeSignature( wordSourceC, wordFileSize, &signatureLength, &status); if (U_FAILURE(status)) { exit(status); } if(encoding!=NULL ){ wordSourceC += signatureLength; wordFileSize -= signatureLength; } // // Open a converter to take the rule file to UTF-16 // UConverter* conv; conv = ucnv_open(encoding, &status); if (U_FAILURE(status)) { fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)); exit(status); } // // Convert the words to UChar. // Preflight first to determine required buffer size. 
// uint32_t destCap = ucnv_toUChars(conv, NULL, // dest, 0, // destCapacity, wordSourceC, wordFileSize, &status); if (status != U_BUFFER_OVERFLOW_ERROR) { fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); exit(status); }; status = U_ZERO_ERROR; UChar *wordSourceU = new UChar[destCap+1]; ucnv_toUChars(conv, wordSourceU, // dest, destCap+1, wordSourceC, wordFileSize, &status); if (U_FAILURE(status)) { fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); exit(status); }; ucnv_close(conv); // Get rid of the original file buffer delete[] wordBufferC; // Create a MutableTrieDictionary, and loop through all the lines, inserting // words. // First, pick a median character. UChar *current = wordSourceU + (destCap/2); UChar uc = *current++; UnicodeSet breaks; breaks.add(0x000A); // Line Feed breaks.add(0x000D); // Carriage Return breaks.add(0x2028); // Line Separator breaks.add(0x2029); // Paragraph Separator do { // Look for line break while (uc && !breaks.contains(uc)) { uc = *current++; } // Now skip to first non-line-break while (uc && breaks.contains(uc)) { uc = *current++; } } while (uc && (breaks.contains(uc) || u_isspace(uc))); mtd = new MutableTrieDictionary(uc, status); if (U_FAILURE(status)) { fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status)); exit(status); } // Now add the words. Words are non-space characters at the beginning of // lines, and must be at least one UChar. If a word has an associated value, // the value should follow the word on the same line after a tab character. 
current = wordSourceU; UChar *candidate = current; uc = *current++; int32_t length = 0; int count = 0; while (uc) { while (uc && !u_isspace(uc)) { ++length; uc = *current++; } UnicodeString valueString; UChar candidateValue; if(uc == 0x0009){ //separator is a tab char, read in number after space while (uc && u_isspace(uc)) { uc = *current++; } while (uc && !u_isspace(uc)) { valueString.append(uc); uc = *current++; } } if (length > 0) { count++; if(valueString.length() > 0){ mtd->setValued(TRUE); uint32_t value = 0; char* s = new char[valueString.length()]; valueString.extract(0,valueString.length(), s, valueString.length()); int n = sscanf(s, "%ud", &value); U_ASSERT(n == 1); U_ASSERT(value >= 0); mtd->addWord(candidate, length, status, (uint16_t)value); delete[] s; } else { mtd->addWord(candidate, length, status); } if (U_FAILURE(status)) { fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\" at line %d in input file\n", u_errorName(status), count); exit(status); } } // Find beginning of next line while (uc && !breaks.contains(uc)) { uc = *current++; } // Find next non-line-breaking character while (uc && breaks.contains(uc)) { uc = *current++; } candidate = current-1; length = 0; } // Get rid of the Unicode text buffer delete[] wordSourceU; } // Now, create a CompactTrieDictionary from the mutable dictionary CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status); if (U_FAILURE(status)) { fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status)); exit(status); } // Get rid of the MutableTrieDictionary delete mtd; // // Get the binary data from the dictionary. 
// uint32_t outDataSize = ctd->dataSize(); const uint8_t *outData = (const uint8_t *)ctd->data(); // // Create the output file // size_t bytesWritten; UNewDataMemory *pData; pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status); if(U_FAILURE(status)) { fprintf(stderr, "genctd: Could not open output file \"%s\", \"%s\"\n", outFileName, u_errorName(status)); exit(status); } // Write the data itself. udata_writeBlock(pData, outData, outDataSize); // finish up bytesWritten = udata_finish(pData, &status); if(U_FAILURE(status)) { fprintf(stderr, "genctd: error \"%s\" writing the output file\n", u_errorName(status)); exit(status); } if (bytesWritten != outDataSize) { fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName); exit(-1); } // Get rid of the CompactTrieDictionary delete ctd; u_cleanup(); printf("genctd: tool completed successfully.\n"); return 0; #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ }
//------------------------------------------------------------------------------------------ // // readFile Read a file into memory, and convert it to Unicode. // // Since this is just a demo program, take the simple minded approach // of always reading the whole file at once. No intelligent buffering // is done. // //------------------------------------------------------------------------------------------ void readFile(const char *name) { // // Initialize global file variables // fileName = name; fileLen = 0; // zero length prevents processing in case of errors. // // Open the file and determine its size. // FILE *file = fopen(name, "rb"); if (file == 0 ) { fprintf(stderr, "ugrep: Could not open file \"%s\"\n", fileName); return; } fseek(file, 0, SEEK_END); int rawFileLen = ftell(file); fseek(file, 0, SEEK_SET); // // Read in the file // charBuf = (char *)realloc(charBuf, rawFileLen+1); // Need error checking... int t = fread(charBuf, 1, rawFileLen, file); if (t != rawFileLen) { fprintf(stderr, "Error reading file \"%s\"\n", fileName); fclose(file); return; } charBuf[rawFileLen]=0; fclose(file); // // Look for a Unicode Signature (BOM) in the data // int32_t signatureLength; const char * charDataStart = charBuf; UErrorCode status = U_ZERO_ERROR; const char* encoding = ucnv_detectUnicodeSignature( charDataStart, rawFileLen, &signatureLength, &status); if (U_FAILURE(status)) { fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_detectUnicodeSignature()\n", u_errorName(status)); return; } if(encoding!=NULL ){ charDataStart += signatureLength; rawFileLen -= signatureLength; } // // Open a converter to take the file to UTF-16 // UConverter* conv; conv = ucnv_open(encoding, &status); if (U_FAILURE(status)) { fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_open()\n", u_errorName(status)); return; } // // Convert the file data to UChar. // Preflight first to determine required buffer size. 
// uint32_t destCap = ucnv_toUChars(conv, NULL, // dest, 0, // destCapacity, charDataStart, rawFileLen, &status); if (status != U_BUFFER_OVERFLOW_ERROR) { fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); return; }; status = U_ZERO_ERROR; ucharBuf = (UChar *)realloc(ucharBuf, (destCap+1) * sizeof(UChar)); ucnv_toUChars(conv, ucharBuf, // dest, destCap+1, charDataStart, rawFileLen, &status); if (U_FAILURE(status)) { fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); return; }; ucnv_close(conv); // // Successful conversion. Set the global size variables so that // the rest of the processing will proceed for this file. // fileLen = destCap; }
/**
 * Converts a NUL-terminated byte string to a newly allocated UTF-16 string
 * using ICU.
 *
 * The source encoding is obtained from getPlatformEncoding(Encoding); if a
 * converter for it cannot be opened, ISO-8859-1 is tried instead.
 *
 * @param src       NUL-terminated source bytes. A NULL pointer is treated
 *                  like an empty string.
 * @param Encoding  Encoding name handed to getPlatformEncoding().
 * @return A buffer allocated with new[] that the caller must release with
 *         delete[]. It holds the converted, NUL-terminated string, or an
 *         empty string when no converter could be opened or the conversion
 *         did not finish with U_ZERO_ERROR.
 */
U16Char_t* convCharStrToU16Str(const char* src, const char* Encoding)
{
    char const* const fromcode = getPlatformEncoding(Encoding);
    UErrorCode status = U_ZERO_ERROR;

#ifdef ENCCONV_DEBUG
    std::cout << "\t" "convString" << std::endl;
    std::cout << "\t\t" "fromcode = " << fromcode << std::endl;
#endif

    // Initializing ICU converter
    UConverter *conv = ucnv_open(fromcode, &status);
#ifdef CHARCONV_DEBUG
    std::cout << "\t\t" "aft ucnv_open: status = " << status << std::endl;
#endif

    if (conv == NULL)
    {
        // Try the default encoding "ISO-8859-1". The status has to be
        // cleared first: ICU functions do nothing when they are handed a
        // status that already indicates failure.
        status = U_ZERO_ERROR;
        conv = ucnv_open("ISO-8859-1", &status);
    }

    const size_t srcSizeInBytes = (src != NULL) ? std::strlen(src) : 0;
    // Destination size in UTF-16 code units, including the terminator.
    const size_t dstSizeInChars = MAX(256, (srcSizeInBytes + 1));

    U16Char_t* dst = new U16Char_t[dstSizeInChars];
    dst[0] = 0;

    // Still no converter: simply return a blank string.
    if (conv == NULL)
    {
        return dst;
    }

    status = U_ZERO_ERROR;
    // The capacity argument counts UChars, not bytes; passing the real
    // element count keeps ICU from writing past the end of dst.
    ucnv_toUChars(conv,
                  reinterpret_cast<UChar*>(dst),
                  static_cast<int32_t>(dstSizeInChars),
                  src,
                  static_cast<int32_t>(srcSizeInBytes),
                  &status);
#ifdef CHARCONV_DEBUG
    std::cout << "\t\t" "aft iconv: status = " << status << std::endl;
#endif

    if (status != U_ZERO_ERROR)
    {
        // Conversion failed or ended with a warning (for example an
        // unterminated result): hand back a blank string instead.
        dst[0] = 0;
    }

    ucnv_close(conv);
    return dst;
}
std::string ReaderUtil::Recode(const std::string& str_to_encode, const std::string& src_enc, const std::string& dst_enc) { std::string src_enc_str = src_enc; std::string dst_enc_str = dst_enc; if (src_enc.empty() || dst_enc.empty() || str_to_encode.empty()) { return str_to_encode; } if (atoi(src_enc.c_str()) > 0) { src_enc_str = ReaderUtil::CodepageToEncoding(atoi(src_enc.c_str())); } if (atoi(dst_enc.c_str()) > 0) { dst_enc_str = ReaderUtil::CodepageToEncoding(atoi(dst_enc.c_str())); } #ifdef LCF_SUPPORT_ICU UErrorCode status = U_ZERO_ERROR; int size = str_to_encode.size() * 4; UChar* unicode_str = new UChar[size]; UConverter *conv; int length; std::string result_str; conv = ucnv_open(src_enc_str.c_str(), &status); if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) { fprintf(stderr, "liblcf: ucnv_open() error for source encoding \"%s\": %s\n", src_enc_str.c_str(), u_errorName(status)); return std::string(); } status = U_ZERO_ERROR; length = ucnv_toUChars(conv, unicode_str, size, str_to_encode.c_str(), -1, &status); ucnv_close(conv); if (status != U_ZERO_ERROR) { fprintf(stderr, "liblcf: ucnv_toUChars() error when encoding \"%s\": %s\n", str_to_encode.c_str(), u_errorName(status)); delete[] unicode_str; return std::string(); } char* result = new char[length * 4]; conv = ucnv_open(dst_enc_str.c_str(), &status); if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) { fprintf(stderr, "liblcf: ucnv_open() error for destination encoding \"%s\": %s\n", dst_enc_str.c_str(), u_errorName(status)); delete[] unicode_str; delete[] result; return std::string(); } status = U_ZERO_ERROR; ucnv_fromUChars(conv, result, length * 4, unicode_str, -1, &status); ucnv_close(conv); if (status != U_ZERO_ERROR) { fprintf(stderr, "liblcf: ucnv_fromUChars() error: %s\n", u_errorName(status)); delete[] unicode_str; delete[] result; return std::string(); } result_str = result; delete[] unicode_str; delete[] result; return std::string(result_str); #else iconv_t cd = 
iconv_open(dst_enc_str.c_str(), src_enc_str.c_str()); if (cd == (iconv_t)-1) return str_to_encode; char *src = const_cast<char *>(str_to_encode.c_str()); size_t src_left = str_to_encode.size(); size_t dst_size = str_to_encode.size() * 5 + 10; char *dst = new char[dst_size]; size_t dst_left = dst_size; # ifdef ICONV_CONST char ICONV_CONST *p = src; # else char *p = src; # endif char *q = dst; size_t status = iconv(cd, &p, &src_left, &q, &dst_left); iconv_close(cd); if (status == (size_t) -1 || src_left > 0) { delete[] dst; return std::string(); } *q++ = '\0'; std::string result(dst); delete[] dst; return result; #endif }
UBool convsample_21_didSubstitute(const char *source) { UChar uchars[100]; char bytes[100]; UConverter *conv = NULL, *cloneCnv = NULL; UErrorCode status = U_ZERO_ERROR; uint32_t len, len2; int32_t cloneLen; UBool flagVal = FALSE; UConverterFromUCallback junkCB; FromUFLAGContext *flagCtx = NULL, *cloneFlagCtx = NULL; debugCBContext *debugCtx1 = NULL, *debugCtx2 = NULL, *cloneDebugCtx = NULL; printf("\n\n==============================================\n" "Sample 21: C: Test for substitution w/ callbacks & clones \n"); /* print out the original source */ printBytes("src", source); printf("\n"); /* First, convert from UTF8 to unicode */ conv = ucnv_open("utf-8", &status); U_ASSERT(status); len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status); U_ASSERT(status); printUChars("uch", uchars, len); printf("\n"); /* Now, close the converter */ ucnv_close(conv); /* Now, convert to windows-1252 */ conv = ucnv_open("windows-1252", &status); U_ASSERT(status); /* Converter starts out with the SUBSTITUTE callback set. 
*/ /* initialize our callback */ /* from the 'bottom' innermost, out * CNV -> debugCtx1[debug] -> flagCtx[flag] -> debugCtx2[debug] */ #if DEBUG_TMI printf("flagCB_fromU = %p\n", &flagCB_fromU); printf("debugCB_fromU = %p\n", &debugCB_fromU); #endif debugCtx1 = debugCB_openContext(); flagCtx = flagCB_fromU_openContext(); debugCtx2 = debugCB_openContext(); debugCtx1->subCallback = flagCB_fromU; /* debug1 -> flag */ debugCtx1->subContext = flagCtx; flagCtx->subCallback = debugCB_fromU; /* flag -> debug2 */ flagCtx->subContext = debugCtx2; debugCtx2->subCallback = UCNV_FROM_U_CALLBACK_SUBSTITUTE; debugCtx2->subContext = NULL; /* Set our special callback */ ucnv_setFromUCallBack(conv, debugCB_fromU, debugCtx1, &(debugCtx2->subCallback), &(debugCtx2->subContext), &status); U_ASSERT(status); #if DEBUG_TMI printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n", conv, debugCtx1, debugCtx1->subCallback, debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback); #endif cloneLen = 1; /* but passing in null so it will clone */ cloneCnv = ucnv_safeClone(conv, NULL, &cloneLen, &status); U_ASSERT(status); #if DEBUG_TMI printf("Cloned converter from %p -> %p. 
Closing %p.\n", conv, cloneCnv, conv); #endif ucnv_close(conv); #if DEBUG_TMI printf("%p closed.\n", conv); #endif U_ASSERT(status); /* Now, we have to extract the context */ cloneDebugCtx = NULL; cloneFlagCtx = NULL; ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx); if(cloneDebugCtx != NULL) { cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext; } printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n", cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL ); len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status); U_ASSERT(status); if(cloneFlagCtx != NULL) { flagVal = cloneFlagCtx->flag; /* it's about to go away when we close the cnv */ } else { printf("** Warning, couldn't get the subcallback \n"); } ucnv_close(cloneCnv); /* print out the original source */ printBytes("bytes", bytes, len2); return flagVal; /* true if callback was called */ }
/**
 * Sample 20: reports whether converting `source` (UTF-8) to windows-1252
 * invoked the flag callback.
 *
 * The text is converted to UTF-16 first; a windows-1252 converter with the
 * flag callback installed then converts it to bytes, and the flag recorded
 * in the callback context is returned.
 *
 * @param source NUL-terminated UTF-8 text.
 * @return the flag stored in the callback context after the conversion.
 */
UBool convsample_20_didSubstitute(const char *source)
{
  UErrorCode status = U_ZERO_ERROR;
  UChar utf16Text[100];
  char encodedBytes[100];

  printf("\n\n==============================================\n"
         "Sample 20: C: Test for substitution using callbacks\n");

  /* show the input */
  printBytes("src", source);
  printf("\n");

  /* Step 1: UTF-8 -> UTF-16, with a converter used only for this step */
  UConverter *conv = ucnv_open("utf-8", &status);
  U_ASSERT(status);

  uint32_t utf16Len = ucnv_toUChars(conv, utf16Text, 100, source, strlen(source), &status);
  U_ASSERT(status);

  printUChars("uch", utf16Text, utf16Len);
  printf("\n");

  ucnv_close(conv);

  /* Step 2: open the windows-1252 converter; its initial callback is SUBSTITUTE */
  conv = ucnv_open("windows-1252", &status);
  U_ASSERT(status);

  /* Step 3: install the flag callback; the converter's previous callback
   * and context are stored into the flag context's sub-callback slots */
  FromUFLAGContext *context = flagCB_fromU_openContext();

  ucnv_setFromUCallBack(conv,
                        flagCB_fromU,
                        context,
                        &(context->subCallback),
                        &(context->subContext),
                        &status);
  U_ASSERT(status);

  /* Step 4: UTF-16 -> windows-1252 */
  uint32_t encodedLen = ucnv_fromUChars(conv, encodedBytes, 100, utf16Text, utf16Len, &status);
  U_ASSERT(status);

  /* read the flag before the converter is closed */
  UBool substituted = context->flag;

  ucnv_close(conv);

  /* show the converted bytes */
  printBytes("bytes", encodedBytes, encodedLen);

  return substituted;
}