/**
 * Reads a size-prefixed byte string from the stream and returns it as an
 * RVNGString, decoding it according to the code page reported by
 * getCodePage().
 *
 * The size is read with readU32() and clamped to getRemainingLength(input).
 * Code page 65001 is appended as-is (it is UTF-8); code page 1252 is
 * transcoded through an ICU "windows-1252" converter. No converter is opened
 * for any other code page, so the result is empty in that case.
 *
 * @param input stream positioned at the size field of the string
 * @return the decoded string; empty if the size is 0, the code page is not
 *         handled, or the converter could not be opened. If decoding fails
 *         part-way, the text decoded up to that point is returned.
 */
librevenge::RVNGString libvisio::VSDMetaData::readCodePageString(librevenge::RVNGInputStream *input)
{
  uint32_t size = readU32(input);
  if (size > getRemainingLength(input))
    size = getRemainingLength(input);

  if (size == 0)
    return librevenge::RVNGString();

  std::vector<unsigned char> characters;
  // One extra slot for the terminator the UTF-8 branch appends.
  characters.reserve(static_cast<std::size_t>(size) + 1);
  for (uint32_t i = 0; i < size; ++i)
    characters.push_back(readU8(input));

  const uint32_t codepage = getCodePage();
  librevenge::RVNGString string;

  if (codepage == 65001)
  {
    // http://msdn.microsoft.com/en-us/library/windows/desktop/dd374130%28v=vs.85%29.aspx
    // says this is UTF-8.
    characters.push_back(0);
    string.append(reinterpret_cast<const char *>(characters.data()));
  }
  else
  {
    UErrorCode status = U_ZERO_ERROR;
    UConverter *conv = nullptr;

    switch (codepage)
    {
    case 1252:
      // http://msdn.microsoft.com/en-us/goglobal/bb964654
      conv = ucnv_open("windows-1252", &status);
      break;
    }

    if (U_SUCCESS(status) && conv)
    {
      assert(!characters.empty());
      const char *src = reinterpret_cast<const char *>(characters.data());
      const char *const srcLimit = src + characters.size();
      while (src < srcLimit)
      {
        const UChar32 ucs4Character = ucnv_getNextUChar(conv, &src, srcLimit, &status);
        // Once status holds a failure, ICU calls return immediately without
        // consuming input, so src would never advance again: stop here
        // instead of spinning forever.
        if (!U_SUCCESS(status))
          break;
        if (U_IS_UNICODE_CHAR(ucs4Character))
          appendUCS4(string, ucs4Character);
      }
    }

    if (conv)
      ucnv_close(conv);
  }

  return string;
}
/**
 * Computes the length of every element of a character vector.
 *
 * Per element:
 *  - NA_STRING gives NA_INTEGER;
 *  - Latin-1 strings (marked, or native when the native encoding is Latin-1)
 *    and strings in a single-byte native encoding give their byte count;
 *  - strings marked as bytes raise an R error;
 *  - UTF-8 strings (marked, or native when the native encoding is UTF-8) are
 *    walked with U8_NEXT; an invalid sequence gives a warning and NA_INTEGER;
 *  - anything else is decoded with the default ICU converter, one code point
 *    at a time; a converter or conversion failure gives a warning and
 *    NA_INTEGER.
 *
 * @param vec character vector
 * @return integer vector of the same length as vec
 */
SEXP R_stri_length(SEXP vec)
{
   int vec_len = LENGTH(vec);
   SEXP ret = PROTECT(allocVector(INTSXP, vec_len));
   int* retint = INTEGER(ret);

   for (int i = 0; i < vec_len; i++) {
      SEXP str = STRING_ELT(vec, i);
      if (str == NA_STRING) {
         retint[i] = NA_INTEGER;
         continue;
      }

      int str_len = LENGTH(str);

      if (getCharCE(str) == CE_LATIN1
            || (getCharCE(str) == CE_NATIVE && getNativeCE() == CE_LATIN1)) {
         retint[i] = str_len;
      }
      else if (getCharCE(str) == CE_BYTES) {
         UNPROTECT(1);
         error("Invalid encoding: bytes.");
      }
      else if (getCharCE(str) == CE_UTF8
            || (getCharCE(str) == CE_NATIVE && getNativeCE() == CE_UTF8)) {
         UChar32 out = 0;
         const char* source = CHAR(str);
         R_len_t j = 0;
         int count;
         for (count = 0; out >= 0 && j < str_len; count++) {
            U8_NEXT(source, j, str_len, out); // faster than U8_FWD_1 & flags bad UChar32s
         }

         if (out < 0) {
            warning("Invalid UTF8 string: %s", source);
            retint[i] = NA_INTEGER;
         }
         else {
            retint[i] = count;
         }
      }
      else if (native_is_singlebyte()) { // native-8bit
         retint[i] = str_len;
      }
      else { // native encoding, not 8 bit
         UErrorCode status = U_ZERO_ERROR;
         UConverter* conv = ucnv_open(NULL, &status);
         const char* source = CHAR(str);
         const char* sourceLimit = source + str_len;
         int j = 0;

         // ICU calls are no-ops once status holds a failure, so the loop
         // must stop on the first error or source would never advance.
         if (conv != NULL) {
            while (!U_FAILURE(status) && source < sourceLimit) {
               ucnv_getNextUChar(conv, &source, sourceLimit, &status);
               if (!U_FAILURE(status))
                  j++;
            }
            // Release the converter before anything below can raise an R
            // condition; the original never closed it.
            ucnv_close(conv);
         }

         if (conv == NULL || U_FAILURE(status)) {
            retint[i] = NA_INTEGER;
            warning("Could not determine the length of a string in the native encoding.");
         }
         else {
            retint[i] = j; // all right, we got it!
         }
      }
   }

   UNPROTECT(1);
   return ret;
}
/**
 * Sample 13: converts a short Big5 byte sequence to Unicode one code point
 * at a time with ucnv_getNextUChar(), printing the source bytes first and
 * then each resulting code point.
 *
 * @return always U_ZERO_ERROR; conversion problems are reported through
 *         U_ASSERT.
 */
UErrorCode convsample_13()
{
  printf("\n\n==============================================\n"
         "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");

  const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
  //  const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };

  UErrorCode status = U_ZERO_ERROR;
  const int32_t srcCount = sizeof(sourceChars);
  int32_t dstCount = 0;

  UConverter *conv = ucnv_open("Big5", &status);
  U_ASSERT(status);

  const char *source = sourceChars;
  const char *sourceLimit = sourceChars + sizeof(sourceChars);

  // **************************** START SAMPLE *******************

  printBytes("src", source, sourceLimit - source);

  // Each pass consumes exactly one character from the source buffer.
  for (; source < sourceLimit; dstCount++)
  {
    puts("");
    const UChar32 target = ucnv_getNextUChar(conv, &source, sourceLimit, &status);

    //    printBytes("src",source,sourceLimit-source);
    U_ASSERT(status);
    printUChar(target);
  }

  // ************************** END SAMPLE *************************

  printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount);
  ucnv_close(conv);

  return U_ZERO_ERROR;
}
void str_ucnv_each_uchar32_starting_from(rb_str_t *self, long start_offset_in_bytes, each_uchar32_callback_t callback) { USE_CONVERTER(cnv, self->encoding); rb_ensure_b(^{ const char *pos = self->bytes + start_offset_in_bytes; const char *end = pos + self->length_in_bytes; bool stop = false; for (;;) { const char *char_start_pos = pos; // iterate through the string one Unicode code point at a time UErrorCode err = U_ZERO_ERROR; UChar32 c = ucnv_getNextUChar(cnv, &pos, end, &err); if (err == U_INDEX_OUTOFBOUNDS_ERROR) { // end of the string break; } else if (U_FAILURE(err)) { long min_char_size = self->encoding->min_char_size; while (char_start_pos < pos) { long char_len = pos - char_start_pos; if (char_len > min_char_size) { char_len = min_char_size; } callback(U_SENTINEL, char_start_pos-self->bytes, char_len, &stop); if (stop) { return; } char_start_pos += char_len; } } else { long char_len = pos - char_start_pos; callback(c, char_start_pos-self->bytes, char_len, &stop); if (stop) { return; } } } }, ^{
/*
 * Computes the length of the string in characters by decoding it with the
 * converter for its encoding, and records on the string whether every
 * sequence decoded successfully.
 *
 * A successfully decoded code point counts as 1, or as 2 when ucs2_mode is
 * set and the code point lies outside the BMP. A span that fails to decode
 * counts as its byte width divided by the encoding's minimum character
 * size, rounded up.
 */
long
str_ucnv_length(rb_str_t *self, bool ucs2_mode)
{
    USE_CONVERTER(cnv, self->encoding);

    const char *cursor = self->bytes;
    const char *const limit = cursor + self->length_in_bytes;
    bool all_valid = true;
    long count = 0;

    while (true) {
        const char *const seq_start = cursor;

        // decode exactly one code point per pass
        UErrorCode status = U_ZERO_ERROR;
        const UChar32 cp = ucnv_getNextUChar(cnv, &cursor, limit, &status);

        if (status == U_INDEX_OUTOFBOUNDS_ERROR) {
            // nothing left to read
            break;
        }

        if (U_FAILURE(status)) {
            // undecodable span: count it in units of the minimum char size
            all_valid = false;
            const long unit = self->encoding->min_char_size;
            const long width = cursor - seq_start;
            count += div_round_up(width, unit);
            continue;
        }

        count += (ucs2_mode && !U_IS_BMP(cp)) ? 2 : 1;
    }

    ucnv_close(cnv);

    str_set_valid_encoding(self, all_valid);

    return count;
}
/*
 * Decodes the string with the converter for its encoding and stores two
 * flags on it: whether the encoding is valid and whether it is ASCII-only.
 *
 * Scanning stops at the end of the string or at the first conversion error;
 * a conversion error clears both flags.
 */
void
str_ucnv_update_flags(rb_str_t *self)
{
    USE_CONVERTER(cnv, self->encoding);

    const char *cursor = self->bytes;
    const char *const limit = cursor + self->length_in_bytes;
    bool is_valid = true;
    bool is_ascii = true;

    while (true) {
        // decode exactly one code point per pass
        UErrorCode status = U_ZERO_ERROR;
        const UChar32 cp = ucnv_getNextUChar(cnv, &cursor, limit, &status);

        if (!U_FAILURE(status)) {
            if (cp > 127) {
                is_ascii = false;
            }
            continue;
        }

        // Any failure ends the scan; only a real conversion error (as
        // opposed to running off the end) invalidates the string.
        if (status != U_INDEX_OUTOFBOUNDS_ERROR) {
            is_valid = false;
            is_ascii = false;
        }
        break;
    }

    ucnv_close(cnv);

    str_set_valid_encoding(self, is_valid);
    str_set_ascii_only(self, is_ascii);
}
/**
 * Count the number of characters in each string
 *
 * NA elements give NA. ASCII, Latin-1 and native single-byte strings give
 * their byte count. UTF-8 strings are scanned with U8_NEXT; an invalid
 * sequence gives a warning and NA. Strings marked as bytes throw a
 * StriException. Any other native encoding is decoded with the native
 * converter, one character at a time.
 *
 * Note that ICU permits only strings of length < 2^31.
 *
 * @param str R character vector
 * @return integer vector
 *
 * @version 0.1-?? (Marcin Bujarski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          Multiple input encoding support
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-27)
 *          using StriUcnv;
 *          warn on invalid utf-8 sequences
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_length(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));

   STRI__ERROR_HANDLER_BEGIN(1)

   R_len_t n_elems = LENGTH(str);
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(INTSXP, n_elems));
   int* lengths = INTEGER(ret);

   StriUcnv ucnvNative(NULL);

   for (R_len_t k = 0; k < n_elems; k++) {
      SEXP elem = STRING_ELT(str, k);
      if (elem == NA_STRING) {
         lengths[k] = NA_INTEGER;
         continue;
      }

      R_len_t n_bytes = LENGTH(elem);  // O(1) - stored by R

      if (IS_ASCII(elem) || IS_LATIN1(elem)) {
         // one byte per character
         lengths[k] = n_bytes;
         continue;
      }

      if (IS_BYTES(elem))
         throw StriException(MSG__BYTESENC);

      if (IS_UTF8(elem) || ucnvNative.isUTF8()) {
         // utf8 or native-utf8
         const char* bytes = CHAR(elem);
         UChar32 c = 0;
         R_len_t pos = 0;
         R_len_t n_chars = 0;
         for (; c >= 0 && pos < n_bytes; n_chars++) {
            U8_NEXT(bytes, pos, n_bytes, c); // faster than U8_FWD_1 & flags bad UChar32s
         }

         if (c < 0) {
            // invalid utf-8 sequence
            Rf_warning(MSG__INVALID_UTF8);
            lengths[k] = NA_INTEGER;
         }
         else {
            lengths[k] = n_chars;
         }
         continue;
      }

      if (ucnvNative.is8bit()) {
         // native-8bit
         lengths[k] = n_bytes;
         continue;
      }

      // native encoding which is neither 8-bit, nor UTF-8 (e.g. 'Big5')
      // this is weird, but we'll face it
      UConverter* uconv = ucnvNative.getConverter();
      UErrorCode status = U_ZERO_ERROR;
      const char* source = CHAR(elem);
      const char* sourceLimit = source + n_bytes;
      R_len_t n_chars = 0;
      while (source != sourceLimit) {
         /*ignore_retval=*/ucnv_getNextUChar(uconv, &source, sourceLimit, &status);
         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
         n_chars++;
      }
      lengths[k] = n_chars; // all right, we got it!
   }

   STRI__UNPROTECT_ALL
   return ret;

   STRI__ERROR_HANDLER_END({ /* no special action on error */ })
}
/**
 * Sample 06: reads "data06.txt" as UTF-8, decodes it one code point at a
 * time with ucnv_getNextUChar() and prints
 *  - how many decoded code points are letters,
 *  - how many "ie" and "g" + U+0127 digraphs were seen (case-insensitive),
 *  - the frequency of every code point that occurred.
 *
 * Only code points below charCount (0x10000, i.e. the BMP) are supported.
 *
 * @return U_ZERO_ERROR on success,
 *         U_FILE_ACCESS_ERROR if the data file cannot be opened,
 *         U_MEMORY_ALLOCATION_ERROR if the frequency table cannot be allocated,
 *         U_UNSUPPORTED_ERROR if a code point outside the table is found.
 */
UErrorCode convsample_06()
{
  printf("\n\n==============================================\n"
         "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");

  FILE *f;
  int32_t count;
  char inBuf[BUFFERSIZE];
  const char *source;
  const char *sourceLimit;
  int32_t uBufSize = 0;
  UConverter *conv;
  UErrorCode status = U_ZERO_ERROR;
  uint32_t letters=0, total=0;

  CharFreqInfo   *info;
  UChar32   charCount = 0x10000;  /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
  UChar32   p;

  uint32_t ie = 0;
  uint32_t gh = 0;
  UChar32 l = 0;

  f = fopen("data06.txt", "r");
  if(!f)
  {
    fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
    return U_FILE_ACCESS_ERROR;
  }

  info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
  if(!info)
  {
    // sizeof yields size_t, so print it through an explicit unsigned long.
    fprintf(stderr, " Couldn't allocate %lu bytes for freq counter\n",
            (unsigned long)(sizeof(CharFreqInfo)*charCount));
    // The table is dereferenced right below; bail out instead of crashing.
    fclose(f);
    return U_MEMORY_ALLOCATION_ERROR;
  }

  /* reset frequencies */
  for(p=0;p<charCount;p++)
  {
    info[p].codepoint = p;
    info[p].frequency = 0;
  }

  // **************************** START SAMPLE *******************
  conv = ucnv_open("utf-8", &status);
  assert(U_SUCCESS(status));

  // Informational only: the sample decodes char-at-a-time and needs no
  // UChar buffer, so none is allocated.
  uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
  printf("input bytes %d / min chars %d = %d UChars\n",
         BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);

  // grab another buffer's worth
  while((!feof(f)) &&
        ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
  {
    // Convert bytes to unicode
    source = inBuf;
    sourceLimit = inBuf + count;

    while(source < sourceLimit)
    {
      p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
      if(U_FAILURE(status))
      {
        fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
        // Clear the error so the next call actually consumes input.
        status = U_ZERO_ERROR;
        continue;
      }
      U_ASSERT(status);
      total++;

      if(u_isalpha(p))
        letters++;

      if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
        ie++;

      if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
        gh++;

      // Valid indices are 0 .. charCount-1, so charCount itself must be
      // rejected as well.
      if(p < 0 || p >= charCount)
      {
        fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
        ucnv_close(conv);
        fclose(f);
        free(info);
        return U_UNSUPPORTED_ERROR;
      }
      info[p].frequency++;
      l = p;
    }
  }

  fclose(f);
  ucnv_close(conv);

  printf("%d letters out of %d total UChars.\n", letters, total);
  printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);

  // now, we could sort it..

  //  qsort(info, charCount, sizeof(info[0]), charfreq_compare);

  for(p=0;p<charCount;p++)
  {
    if(info[p].frequency)
    {
      printf("% 5d U+%06X ", info[p].frequency, p);
      if(p <= 0xFFFF)
      {
        prettyPrintUChar((UChar)p);
      }
      printf("\n");
    }
  }
  free(info);
  // ***************************** END SAMPLE ********************

  printf("\n");

  return U_ZERO_ERROR;
}
/** * Count the number of characters in a string * * Note that ICU permits only strings of length < 2^31. * @param s R character vector * @return integer vector * @version 0.1 (Marcin Bujarski) * @version 0.2 (Marek Gagolewski) Multiple input encoding support * @version 0.3 (Marek Gagolewski, 2013-06-16) make StriException-friendly */ SEXP stri_length(SEXP str) { str = stri_prepare_arg_string(str, "str"); R_len_t ns = LENGTH(str); SEXP ret; UConverter* uconv = NULL; bool uconv_8bit = false; bool uconv_utf8 = false; STRI__ERROR_HANDLER_BEGIN /* Note: ICU50 permits only int-size strings in U8_NEXT and U8_FWD_1 */ #define STRI_LENGTH_CALCULATE_UTF8 \ const char* qc = CHAR(q); \ R_len_t j = 0; \ for (R_len_t i = 0; i < nq; j++) \ U8_FWD_1(qc, i, nq); \ retint[k] = j; PROTECT(ret = Rf_allocVector(INTSXP, ns)); int* retint = INTEGER(ret); for (R_len_t k = 0; k < ns; k++) { SEXP q = STRING_ELT(str, k); if (q == NA_STRING) retint[k] = NA_INTEGER; else { R_len_t nq = LENGTH(q); // O(1) - stored by R // We trust (is that a wise assumption?) // R encoding marks; However, it there is no mark, // the string may have any encoding (ascii, latin1, utf8, native) if (IS_ASCII(q) || IS_LATIN1(q)) retint[k] = nq; else if (IS_BYTES(q)) throw StriException(MSG__BYTESENC); else if (IS_UTF8(q)) { STRI_LENGTH_CALCULATE_UTF8 } else { // Any encoding - detection needed // UTF-8 strings can be fairly reliably recognized as such by a // simple algorithm, i.e., the probability that a string of // characters in any other encoding appears as valid UTF-8 is low, // diminishing with increasing string length. // We have two possibilities here: // 1. Auto detect encoding: Is this ASCII or UTF-8? If not => use Native // This won't work correctly in some cases. // e.g. (c4,85) represents ("Polish a with ogonek") in UTF-8 // and ("A umlaut", "Ellipsis") in WINDOWS-1250 // 2. 
Assume it's Native; this assumes the user working in an 8-bit environment // would convert strings to UTF-8 manually if needed - I think is's // a more reasonable approach (Native --> input via keyboard) if (!uconv) { // open ucnv on demand uconv = stri__ucnv_open((const char*)NULL); // native decoder if (!uconv) { retint[k] = NA_INTEGER; continue; } uconv_8bit = ((int)ucnv_getMaxCharSize(uconv) == 1); if (!uconv_8bit) { UErrorCode err = U_ZERO_ERROR; const char* name = ucnv_getName(uconv, &err); if (U_FAILURE(err)) throw StriException("could not query default converter"); uconv_utf8 = !strncmp("UTF-8", name, 5); } } if (uconv_8bit) { retint[k] = nq; // it's an 8-bit encoding :-) } else if (uconv_utf8) { // it's UTF-8 STRI_LENGTH_CALCULATE_UTF8 } else { // native encoding which is neither 8-bit, nor UTF-8 (e.g. 'Big5') UErrorCode err = U_ZERO_ERROR; const char* source = CHAR(q); const char* sourceLimit = source + nq; R_len_t j; for (j = 0; source != sourceLimit; j++) { if (U_FAILURE(err)) break; // error from previous iteration // iterate through each native-encoded character: ucnv_getNextUChar(uconv, &source, sourceLimit, &err); } if (U_FAILURE(err)) { // error from last iteration Rf_warning("error determining length for native, neither 8-bit- nor UTF-8-encoded string."); retint[k] = NA_INTEGER; } else retint[k] = j; // all right, we got it! } } }