void ReplaceableGlue::extractBetween(int32_t start, int32_t limit, UnicodeString & target) const { (*func->extract)(rep, start, limit, target.getBuffer(limit - start)); target.releaseBuffer(limit - start); }
/* {{{ intl_stringFromChar
 * Converts str_len bytes of UTF-8 starting at str into UTF-16 stored in ret.
 *
 * ret     receives the converted text; it is set to bogus on any failure.
 * str     UTF-8 input bytes.
 * str_len number of bytes in str; must be smaller than INT32_MAX.
 * status  receives the ICU error code (U_BUFFER_OVERFLOW_ERROR when the
 *         input is too long to be addressed with an int32_t capacity).
 *
 * Returns SUCCESS or FAILURE. */
int intl_stringFromChar(UnicodeString &ret, char *str, size_t str_len, UErrorCode *status)
{
	// ">=" rather than ">": the capacity computed below is str_len + 1, which
	// would overflow int32_t when str_len == INT32_MAX.
	if (str_len >= INT32_MAX) {
		*status = U_BUFFER_OVERFLOW_ERROR;
		ret.setToBogus();
		return FAILURE;
	}
	//the number of UTF-16 code units is not larger than that of UTF-8 code
	//units, + 1 for the terminator
	int32_t capacity = (int32_t)str_len + 1;

	//no check necessary -- if NULL will fail ahead
	UChar	*utf16 = ret.getBuffer(capacity);
	int32_t utf16_len = 0;
	*status = U_ZERO_ERROR;
	u_strFromUTF8WithSub(utf16, ret.getCapacity(), &utf16_len,
		str, (int32_t)str_len, U_SENTINEL /* no substitution */, NULL,
		status);
	ret.releaseBuffer(utf16_len);
	if (U_FAILURE(*status)) {
		ret.setToBogus();
		return FAILURE;
	}
	return SUCCESS;
}
/* Explain <xxxxx> tag to a native value
 *
 * Every "<hex digits>" tag in buf is replaced, in place, by the code point
 * the hex digits spell out (as one UTF-16 unit for BMP values, or a
 * lead/trail surrogate pair otherwise).
 *
 * Since <xxxxx> is always larger than the native value,
 * the operation will replace the tag directly in the buffer,
 * and, of course, will shift tail elements.
 *
 * Characters inside a tag that are not hex digits are ignored. A tag that is
 * missing its closing '>' is treated as ending at the end of the string.
 */
void IdnaConfTest::ExplainCodePointTag(UnicodeString& buf){
    buf.append((UChar)0);    // add a terminal NULL
    UChar* bufBase = buf.getBuffer(buf.length());
    if (bufBase == NULL){
        return;              // no writable buffer available; nothing to rewrite
    }
    UChar* p = bufBase;      // read cursor; bufBase doubles as the write cursor
    while (*p != 0){
        if ( *p != 0x3C){    // <
            *bufBase++ = *p++;
        } else {
            p++;    // skip <
            UChar32 cp = 0;
            // Stop at '>' OR at the terminator, so that an unterminated tag
            // cannot make the scan run past the end of the buffer.
            for ( ;*p != 0x3E && *p != 0; p++){   // >
                if (0x30 <= *p && *p <= 0x39){        // 0-9
                    cp = (cp * 16) + (*p - 0x30);
                } else if (0x61 <= *p && *p <= 0x66){ // a-f
                    cp = (cp * 16) + (*p - 0x61) + 10;
                } else if (0x41 <= *p && *p <= 0x46) {// A-F
                    cp = (cp * 16) + (*p - 0x41) + 10;
                }
                // no else. hope everything is good.
            }
            if (*p == 0x3E){
                p++;    // skip >
            }
            if (U_IS_BMP(cp)){
                *bufBase++ = cp;
            } else {
                *bufBase++ = U16_LEAD(cp);
                *bufBase++ = U16_TRAIL(cp);
            }
        }
    }
    *bufBase = 0;  // close our buffer
    buf.releaseBuffer();
}
/**
 * Loads the string stored under "durationUnits/<pattern>" in the given
 * resource bundle and returns it with every 'h' replaced by 'H'.
 *
 * @param resource bundle to look the key up in (with fallback)
 * @param pattern  last path segment of the resource key
 * @param status   ICU error code; nothing is done if it already indicates failure
 * @return the adjusted pattern; an empty string if status was already a
 *         failure or the resource lookup failed
 */
static UnicodeString loadNumericDateFormatterPattern(
        const UResourceBundle *resource,
        const char *pattern,
        UErrorCode &status) {
    UnicodeString result;
    if (U_FAILURE(status)) {
        return result;
    }

    // Assemble the resource key piece by piece.
    CharString key;
    key.append("durationUnits", status);
    key.append("/", status);
    key.append(pattern, status);

    LocalUResourceBundlePointer patternBundle(
            ures_getByKeyWithFallback(resource, key.data(), NULL, &status));
    if (U_FAILURE(status)) {
        return result;
    }
    getString(patternBundle.getAlias(), result, status);

    // Replace 'h' with 'H' directly in the string's own storage.
    const int32_t len = result.length();
    UChar *units = result.getBuffer(len);
    int32_t pos = 0;
    while (pos < len) {
        UChar &unit = units[pos++];
        if (unit == 0x68) { // 'h'
            unit = 0x48;    // 'H'
        }
    }
    result.releaseBuffer(len);
    return result;
}
void PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) { UChar *buffer=uni.getBuffer(-1); int32_t length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode); if(errorCode==U_BUFFER_OVERFLOW_ERROR) { errorCode=U_ZERO_ERROR; uni.releaseBuffer(0); buffer=uni.getBuffer(length); length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode); } uni.releaseBuffer(length); if(U_FAILURE(errorCode)) { fprintf(stderr, "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n", s, (long)lineNumber); } }
/*
 * Post-processes the rows collected in the `unfold` string: sorts them,
 * merges adjacent rows whose string columns compare equal (concatenating
 * their code point columns), stores the resulting row count in the
 * UCASE_UNFOLD_ROWS slot, and truncates `unfold` to the rows that remain.
 *
 * The buffer is treated as rows of UGENCASE_UNFOLD_WIDTH UChars each, with
 * row 0 skipped by the sort/merge (the row count is written into it).
 * Within a row the first UGENCASE_UNFOLD_STRING_WIDTH units are compared as
 * the string; the units after them are handled as code point columns.
 *
 * Does nothing if errorCode already indicates a failure. Sets errorCode to
 * U_BUFFER_OVERFLOW_ERROR (after printing to stderr) when a merged row would
 * need more than UGENCASE_UNFOLD_CP_WIDTH code point columns.
 */
void
CasePropsBuilder::makeUnfoldData(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }

    UChar *p, *q;
    int32_t i, j, k;

    /* sort the data */
    int32_t unfoldLength=unfold.length();
    /* number of rows after row 0 */
    int32_t unfoldRows=unfoldLength/UGENCASE_UNFOLD_WIDTH-1;
    UChar *unfoldBuffer=unfold.getBuffer(-1);
    /* sort everything after row 0; the third argument is the per-row size (presumably in bytes -- TODO confirm) */
    uprv_sortArray(unfoldBuffer+UGENCASE_UNFOLD_WIDTH, unfoldRows, UGENCASE_UNFOLD_WIDTH*2,
                   compareUnfold, NULL, FALSE, &errorCode);

    /* make unique-string rows by merging adjacent ones' code point columns */

    /* make p point to row i-1 */
    p=unfoldBuffer+UGENCASE_UNFOLD_WIDTH;

    /* i advances only when the neighbor differs, so a run of equal rows collapses into p */
    for(i=1; i<unfoldRows;) {
        if(0==u_memcmp(p, p+UGENCASE_UNFOLD_WIDTH, UGENCASE_UNFOLD_STRING_WIDTH)) {
            /* concatenate code point columns */
            q=p+UGENCASE_UNFOLD_STRING_WIDTH;
            /* find the first unused (zero) code point column of row p */
            for(j=1; j<UGENCASE_UNFOLD_CP_WIDTH && q[j]!=0; ++j) {}
            /* append the next row's non-zero code point columns */
            for(k=0; k<UGENCASE_UNFOLD_CP_WIDTH && q[UGENCASE_UNFOLD_WIDTH+k]!=0; ++j, ++k) {
                q[j]=q[UGENCASE_UNFOLD_WIDTH+k];
            }
            /* NOTE(review): the overflow is detected only after the copy loop above has run */
            if(j>UGENCASE_UNFOLD_CP_WIDTH) {
                fprintf(stderr, "genprops error: too many code points in unfold[]: %ld>%d=UGENCASE_UNFOLD_CP_WIDTH\n", (long)j, UGENCASE_UNFOLD_CP_WIDTH);
                errorCode=U_BUFFER_OVERFLOW_ERROR;
                /* NOTE(review): returns without calling unfold.releaseBuffer() */
                return;
            }

            /* move following rows up one */
            --unfoldRows;
            u_memmove(p+UGENCASE_UNFOLD_WIDTH, p+UGENCASE_UNFOLD_WIDTH*2, (unfoldRows-i)*UGENCASE_UNFOLD_WIDTH);
        } else {
            p+=UGENCASE_UNFOLD_WIDTH;
            ++i;
        }
    }

    /* record the final number of rows */
    unfoldBuffer[UCASE_UNFOLD_ROWS]=(UChar)unfoldRows;

    if(beVerbose) {
        puts("unfold data:");

        p=unfoldBuffer;
        for(i=0; i<unfoldRows; ++i) {
            p+=UGENCASE_UNFOLD_WIDTH;
            printf("[%2d] %04x %04x %04x <- %04x %04x\n",
                   (int)i, p[0], p[1], p[2], p[3], p[4]);
        }
    }

    /* keep row 0 plus the remaining rows; anything beyond is cut off */
    unfold.releaseBuffer((unfoldRows+1)*UGENCASE_UNFOLD_WIDTH);
}
void font_face_set::get_string_info(string_info & info, UnicodeString const& ustr, char_properties *format) { double avg_height = character_dimensions('X').height(); UErrorCode err = U_ZERO_ERROR; UnicodeString reordered; UnicodeString shaped; int32_t length = ustr.length(); UBiDi *bidi = ubidi_openSized(length, 0, &err); ubidi_setPara(bidi, ustr.getBuffer(), length, UBIDI_DEFAULT_LTR, 0, &err); ubidi_writeReordered(bidi, reordered.getBuffer(length), length, UBIDI_DO_MIRRORING, &err); reordered.releaseBuffer(length); u_shapeArabic(reordered.getBuffer(), length, shaped.getBuffer(length), length, U_SHAPE_LETTERS_SHAPE | U_SHAPE_LENGTH_FIXED_SPACES_NEAR | U_SHAPE_TEXT_DIRECTION_VISUAL_LTR, &err); shaped.releaseBuffer(length); if (U_SUCCESS(err)) { StringCharacterIterator iter(shaped); for (iter.setToStart(); iter.hasNext();) { UChar ch = iter.nextPostInc(); char_info char_dim = character_dimensions(ch); char_dim.format = format; char_dim.avg_height = avg_height; info.add_info(char_dim); } } #if (U_ICU_VERSION_MAJOR_NUM*100 + U_ICU_VERSION_MINOR_NUM >= 406) if (ubidi_getBaseDirection(ustr.getBuffer(), length) == UBIDI_RTL) { info.set_rtl(true); } #endif ubidi_close(bidi); }
//--------------------------------------------------------------------- // // pattern // //--------------------------------------------------------------------- UnicodeString RegexPattern::pattern() const { if (fPatternString != NULL) { return *fPatternString; } else if (fPattern == NULL) { return UnicodeString(); } else { UErrorCode status = U_ZERO_ERROR; int64_t nativeLen = utext_nativeLength(fPattern); int32_t len16 = utext_extract(fPattern, 0, nativeLen, NULL, 0, &status); // buffer overflow error UnicodeString result; status = U_ZERO_ERROR; UChar *resultChars = result.getBuffer(len16); utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning result.releaseBuffer(len16); return result; } }
/**
 * C++ wrapper around umsg_autoQuoteApostrophe().
 *
 * @param pattern the pattern to process
 * @param status  ICU error code; set to U_MEMORY_ALLOCATION_ERROR if no
 *                output buffer could be obtained
 * @return the processed pattern, or a bogus string whenever status indicates
 *         a failure (including a failure already present on entry)
 */
UnicodeString
MessageFormat::autoQuoteApostrophe(const UnicodeString& pattern, UErrorCode& status) {
  UnicodeString result;
  if (U_FAILURE(status)) {
    result.setToBogus();
    return result;
  }

  const int32_t patternLength = pattern.length();
  const UChar* source = pattern.getBuffer();
  const int32_t capacity = patternLength * 2 + 1; // space for null termination, convenience
  UChar* dest = result.getBuffer(capacity);
  if (dest != NULL) {
    const int32_t quotedLength =
        umsg_autoQuoteApostrophe(source, patternLength, dest, capacity, &status);
    // Keep the output only when the conversion succeeded.
    result.releaseBuffer(U_SUCCESS(status) ? quotedLength : 0);
  } else {
    status = U_MEMORY_ALLOCATION_ERROR;
  }

  if (U_FAILURE(status)) {
    result.setToBogus();
  }
  return result;
}
/* Returns a list of tokens, but with various normalizations performed
 * based on the token type.
 *
 * Default behavior:
 * Whitespace: dropped (removed from output)
 * Words: converted to lower case
 * Numbers: replaced with #XXX, where the number of X's is based on the
 *          format of the number; any punctuation is maintained
 * Japanese/Chinese scripts: converted to lower case
 * Email: Converted to TOKEN_EMAIL
 * URL: Converted to TOKEN_URL
 * Emoticon: Left as-is
 * Heart: Converted to TOKEN_HEART
 * Exclamation: Replaced with an empty string
 * Date: Replaced with TOKEN_DATE
 * Money: Replaced with TOKEN_MONEY
 * Time: Replaced with TOKEN_TIME
 * Acronym: converted to lower case
 * Other: replaced with empty string
 *
 * The returned array always starts with "_B_" and ends with "_E_"; tokens
 * that are empty or match s_spaceMatcher after normalization are omitted.
 */
Array f_icu_tokenize(CStrRef text) {
  // Boundary markers that indicate the beginning and end of a token stream.
  const String BEGIN_MARKER("_B_");
  const String END_MARKER("_E_");

  Array ret;
  std::vector<Token> tokens;
  TAINT_OBSERVER(TAINT_BIT_MUTATED, TAINT_BIT_NONE);

#if HAVE_OLD_LIBICU
  // inspired by the UnicodeString::setToUTF8 implementation
  int32_t length = text.length();
  int32_t unitsWritten = 0;
  UnicodeString input;
  // u_strFromUTF8WithSub() requires a valid error-code pointer; passing NULL
  // (as this code used to) is not a supported way to "ignore errors".
  UErrorCode status = U_ZERO_ERROR;
  u_strFromUTF8WithSub(input.getBuffer(length+1), length+1, &unitsWritten,
                       text.data(), length, 0xfffd, NULL, &status);
  // On failure keep an empty string rather than exposing unwritten storage.
  input.releaseBuffer(U_SUCCESS(status) ? unitsWritten : 0);
  tokenizeString(tokens, HPHP::kMaster, input);
#else
  tokenizeString(tokens, HPHP::kMaster, UnicodeString::fromUTF8(text.data()));
#endif

  int i = 0;
  ret.set(i++, BEGIN_MARKER);
  for(std::vector<Token>::iterator iter = tokens.begin();
      iter != tokens.end();
      iter++) {
    normalizeToken(*iter);
    const UnicodeString& word = iter->value;
    // Ignore spaces and empty strings.
    if(!s_spaceMatcher->matches(word) && word.length() > 0) {
      ret.set(i++, String(icuStringToUTF8(word)));
    }
  }
  ret.set(i++, END_MARKER);
  return ret;
}
void StringCaseTest::TestCasingImpl(const UnicodeString &input, const UnicodeString &output, int32_t whichCase, void *iter, const char *localeID, uint32_t options) { // UnicodeString UnicodeString result; const char *name; Locale locale(localeID); result=input; switch(whichCase) { case TEST_LOWER: name="toLower"; result.toLower(locale); break; case TEST_UPPER: name="toUpper"; result.toUpper(locale); break; #if !UCONFIG_NO_BREAK_ITERATION case TEST_TITLE: name="toTitle"; result.toTitle((BreakIterator *)iter, locale, options); break; #endif case TEST_FOLD: name="foldCase"; result.foldCase(options); break; default: name=""; break; // won't happen } if(result!=output) { dataerrln("error: UnicodeString.%s() got a wrong result for a test case from casing.res", name); } #if !UCONFIG_NO_BREAK_ITERATION if(whichCase==TEST_TITLE && options==0) { result=input; result.toTitle((BreakIterator *)iter, locale); if(result!=output) { dataerrln("error: UnicodeString.toTitle(options=0) got a wrong result for a test case from casing.res"); } } #endif // UTF-8 char utf8In[100], utf8Out[100]; int32_t utf8InLength, utf8OutLength, resultLength; UChar *buffer; IcuTestErrorCode errorCode(*this, "TestCasingImpl"); LocalUCaseMapPointer csm(ucasemap_open(localeID, options, errorCode)); #if !UCONFIG_NO_BREAK_ITERATION if(iter!=NULL) { // Clone the break iterator so that the UCaseMap can safely adopt it. 
UBreakIterator *clone=ubrk_safeClone((UBreakIterator *)iter, NULL, NULL, errorCode); ucasemap_setBreakIterator(csm.getAlias(), clone, errorCode); } #endif u_strToUTF8(utf8In, (int32_t)sizeof(utf8In), &utf8InLength, input.getBuffer(), input.length(), errorCode); switch(whichCase) { case TEST_LOWER: name="ucasemap_utf8ToLower"; utf8OutLength=ucasemap_utf8ToLower(csm.getAlias(), utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, errorCode); break; case TEST_UPPER: name="ucasemap_utf8ToUpper"; utf8OutLength=ucasemap_utf8ToUpper(csm.getAlias(), utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, errorCode); break; #if !UCONFIG_NO_BREAK_ITERATION case TEST_TITLE: name="ucasemap_utf8ToTitle"; utf8OutLength=ucasemap_utf8ToTitle(csm.getAlias(), utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, errorCode); break; #endif case TEST_FOLD: name="ucasemap_utf8FoldCase"; utf8OutLength=ucasemap_utf8FoldCase(csm.getAlias(), utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, errorCode); break; default: name=""; utf8OutLength=0; break; // won't happen } buffer=result.getBuffer(utf8OutLength); u_strFromUTF8(buffer, result.getCapacity(), &resultLength, utf8Out, utf8OutLength, errorCode); result.releaseBuffer(errorCode.isSuccess() ? resultLength : 0); if(errorCode.isFailure()) { errcheckln(errorCode, "error: %s() got an error for a test case from casing.res - %s", name, u_errorName(errorCode)); errorCode.reset(); } else if(result!=output) { errln("error: %s() got a wrong result for a test case from casing.res", name); errln("expected \"" + output + "\" got \"" + result + "\"" ); } }
void StringCaseTest::TestCasingImpl(const UnicodeString &input, const UnicodeString &output, int32_t whichCase, void *iter, const char *localeID, uint32_t options) { // UnicodeString UnicodeString result; const char *name; Locale locale(localeID); result=input; switch(whichCase) { case TEST_LOWER: name="toLower"; result.toLower(locale); break; case TEST_UPPER: name="toUpper"; result.toUpper(locale); break; #if !UCONFIG_NO_BREAK_ITERATION case TEST_TITLE: name="toTitle"; result.toTitle((BreakIterator *)iter, locale, options); break; #endif case TEST_FOLD: name="foldCase"; result.foldCase(options); break; default: name=""; break; // won't happen } if(result!=output) { errln("error: UnicodeString.%s() got a wrong result for a test case from casing.res", name); } #if !UCONFIG_NO_BREAK_ITERATION if(whichCase==TEST_TITLE && options==0) { result=input; result.toTitle((BreakIterator *)iter, locale); if(result!=output) { errln("error: UnicodeString.toTitle(options=0) got a wrong result for a test case from casing.res"); } } #endif // UTF-8 char utf8In[100], utf8Out[100]; int32_t utf8InLength, utf8OutLength, resultLength; UChar *buffer; UCaseMap *csm; UErrorCode errorCode; errorCode=U_ZERO_ERROR; csm=ucasemap_open(localeID, options, &errorCode); #if !UCONFIG_NO_BREAK_ITERATION if(iter!=NULL) { // Clone the break iterator so that the UCaseMap can safely adopt it. int32_t size=1; // Not 0 because that only gives preflighting. 
UBreakIterator *clone=ubrk_safeClone((UBreakIterator *)iter, NULL, &size, &errorCode); ucasemap_setBreakIterator(csm, clone, &errorCode); } #endif u_strToUTF8(utf8In, (int32_t)sizeof(utf8In), &utf8InLength, input.getBuffer(), input.length(), &errorCode); switch(whichCase) { case TEST_LOWER: name="ucasemap_utf8ToLower"; utf8OutLength=ucasemap_utf8ToLower(csm, utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, &errorCode); break; case TEST_UPPER: name="ucasemap_utf8ToUpper"; utf8OutLength=ucasemap_utf8ToUpper(csm, utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, &errorCode); break; #if !UCONFIG_NO_BREAK_ITERATION case TEST_TITLE: name="ucasemap_utf8ToTitle"; utf8OutLength=ucasemap_utf8ToTitle(csm, utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, &errorCode); break; #endif case TEST_FOLD: name="ucasemap_utf8FoldCase"; utf8OutLength=ucasemap_utf8FoldCase(csm, utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, &errorCode); break; default: name=""; utf8OutLength=0; break; // won't happen } buffer=result.getBuffer(utf8OutLength); u_strFromUTF8(buffer, result.getCapacity(), &resultLength, utf8Out, utf8OutLength, &errorCode); result.releaseBuffer(U_SUCCESS(errorCode) ? resultLength : 0); if(U_FAILURE(errorCode)) { errln("error: %s() got an error for a test case from casing.res - %s", name, u_errorName(errorCode)); } else if(result!=output) { errln("error: %s() got a wrong result for a test case from casing.res", name); } ucasemap_close(csm); }
/**
 * Reads the file, converts its contents to UTF-16 and parses the result.
 *
 * The charset is chosen as follows:
 *   1. a Unicode signature at the start of the file, if there is one;
 *   2. otherwise the encoding="..." attribute of the XML declaration, which
 *      is located by decoding the first block of the file as ISO-8859-1;
 *   3. otherwise (no declaration, or no encoding attribute) UTF-8.
 *
 * @param filename  path of the file to parse
 * @param errorCode ICU error code; nothing is done if it already indicates a
 *                  failure. Set to U_FILE_ACCESS_ERROR if the file cannot be
 *                  opened and to U_MEMORY_ALLOCATION_ERROR if no conversion
 *                  buffer can be obtained.
 * @return the result of parse() on the converted text, or NULL on failure
 */
UXMLElement *
UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
    char bytes[4096], charsetBuffer[100];
    FileStream *f;
    const char *charset, *pb;
    UnicodeString src;
    UConverter *cnv;
    UChar *buffer, *pu;
    int32_t fileLength, bytesLength, length, capacity;
    UBool flush;

    if(U_FAILURE(errorCode)) {
        return NULL;
    }

    f=T_FileStream_open(filename, "rb");
    if(f==NULL) {
        errorCode=U_FILE_ACCESS_ERROR;
        return NULL;
    }

    bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
    if(bytesLength<(int32_t)sizeof(bytes)) {
        // we have already read the entire file
        fileLength=bytesLength;
    } else {
        // get the file length
        fileLength=T_FileStream_size(f);
    }

    /*
     * get the charset:
     * 1. Unicode signature
     * 2. treat as ISO-8859-1 and read XML encoding="charset"
     * 3. default to UTF-8
     */
    charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
    if(U_SUCCESS(errorCode) && charset!=NULL) {
        // open converter according to Unicode signature
        cnv=ucnv_open(charset, &errorCode);
    } else {
        // read as Latin-1 and parse the XML declaration and encoding
        cnv=ucnv_open("ISO-8859-1", &errorCode);
        if(U_FAILURE(errorCode)) {
            // unexpected error opening Latin-1 converter
            goto exit;
        }

        buffer=src.getBuffer(bytesLength);
        if(buffer==NULL) {
            // unexpected failure to reserve some string capacity
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            goto exit;
        }
        pb=bytes;
        pu=buffer;
        ucnv_toUnicode(
            cnv,
            &pu, buffer+src.getCapacity(),
            &pb, bytes+bytesLength,
            NULL, TRUE, &errorCode);
        src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
        ucnv_close(cnv);
        cnv=NULL;
        if(U_FAILURE(errorCode)) {
            // unexpected error in conversion from Latin-1
            src.remove();
            goto exit;
        }

        // parse XML declaration
        if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
            int32_t declEnd=mXMLDecl.end(errorCode);
            // go beyond <?xml
            int32_t pos=src.indexOf((UChar)x_l)+1;

            mAttrValue.reset(src);
            while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per attribute on this element.
                UnicodeString attName  = mAttrValue.group(1, errorCode);
                UnicodeString attValue = mAttrValue.group(2, errorCode);

                // Trim the quotes from the att value.  These are left over from the original regex
                //   that parsed the attribute, which couldn't conveniently strip them.
                attValue.remove(0,1);                    // one char from the beginning
                attValue.truncate(attValue.length()-1);  // and one from the end.

                if(attName==UNICODE_STRING("encoding", 8)) {
                    length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
                    charset=charsetBuffer;
                    break;
                }
                pos = mAttrValue.end(2, errorCode);
            }
        }

        // The default must also apply when there is no XML declaration at all;
        // otherwise no converter would be opened and the conversion below
        // would be handed a NULL converter.
        if(charset==NULL) {
            // default to UTF-8
            charset="UTF-8";
        }
        cnv=ucnv_open(charset, &errorCode);
    }

    if(U_FAILURE(errorCode)) {
        // unable to open the converter
        goto exit;
    }

    // convert the file contents
    capacity=fileLength;        // estimated capacity
    src.getBuffer(capacity);
    src.releaseBuffer(0);       // zero length
    flush=FALSE;
    for(;;) {
        // convert contents of bytes[bytesLength]
        pb=bytes;
        for(;;) {
            length=src.length();
            buffer=src.getBuffer(capacity);
            if(buffer==NULL) {
                // unexpected failure to reserve some string capacity
                errorCode=U_MEMORY_ALLOCATION_ERROR;
                goto exit;
            }

            pu=buffer+length;
            // Pass the real flush flag so that the final, empty call actually
            // flushes the converter.
            ucnv_toUnicode(
                cnv, &pu, buffer+src.getCapacity(),
                &pb, bytes+bytesLength,
                NULL, flush, &errorCode);
            if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
                // Keep what has been converted so far. On overflow the input
                // pointer has already advanced, so dropping the output here
                // would lose that text for good.
                src.releaseBuffer((int32_t)(pu-buffer));
            } else {
                src.releaseBuffer(0);
            }
            if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
                errorCode=U_ZERO_ERROR;
                capacity=(3*src.getCapacity())/2; // increase capacity by 50%
            } else {
                break;
            }
        }

        if(U_FAILURE(errorCode)) {
            break; // conversion error
        }

        if(flush) {
            break; // completely converted the file
        }

        // read next block
        bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
        if(bytesLength==0) {
            // reached end of file, convert once more to flush the converter
            flush=TRUE;
        }
    }

exit:
    ucnv_close(cnv);
    T_FileStream_close(f);

    if(U_SUCCESS(errorCode)) {
        return parse(src, errorCode);
    } else {
        return NULL;
    }
}
int main (int argc, char** argv) { if (argc != 2) { std::cerr << "Usage: " << argv[0] << " <num-iter>" << std::endl; return EXIT_FAILURE; } const unsigned NUM_ITER = atoi(argv[1]); // open first face in the font FT_Library ft_library = 0; FT_Error error = FT_Init_FreeType(&ft_library); if (error) throw std::runtime_error("Failed to initialize FreeType2 library"); FT_Face ft_face[NUM_EXAMPLES]; FT_New_Face(ft_library, "fonts/DejaVuSerif.ttf", 0, &ft_face[ENGLISH]); FT_New_Face(ft_library, "fonts/amiri-0.104/amiri-regular.ttf", 0, &ft_face[ARABIC]); FT_New_Face(ft_library, "fonts/fireflysung-1.3.0/fireflysung.ttf", 0, &ft_face[CHINESE]); // Get our harfbuzz font structs hb_font_t *hb_ft_font[NUM_EXAMPLES]; hb_ft_font[ENGLISH] = hb_ft_font_create(ft_face[ENGLISH], NULL); hb_ft_font[ARABIC] = hb_ft_font_create(ft_face[ARABIC] , NULL); hb_ft_font[CHINESE] = hb_ft_font_create(ft_face[CHINESE], NULL); { std::cerr << "Starting ICU shaping:" << std::endl; progress_timer timer1(std::clog,"ICU shaping done"); UErrorCode err = U_ZERO_ERROR; for (unsigned i = 0; i < NUM_ITER; ++i) { for (unsigned j = 0; j < NUM_EXAMPLES; ++j) { UnicodeString text = UnicodeString::fromUTF8(texts[j]); int32_t length = text.length(); UnicodeString reordered; UnicodeString shaped; UBiDi *bidi = ubidi_openSized(length, 0, &err); ubidi_setPara(bidi, text.getBuffer(), length, UBIDI_DEFAULT_LTR, 0, &err); ubidi_writeReordered(bidi, reordered.getBuffer(length), length, UBIDI_DO_MIRRORING, &err); ubidi_close(bidi); reordered.releaseBuffer(length); u_shapeArabic(reordered.getBuffer(), length, shaped.getBuffer(length), length, U_SHAPE_LETTERS_SHAPE | U_SHAPE_LENGTH_FIXED_SPACES_NEAR | U_SHAPE_TEXT_DIRECTION_VISUAL_LTR, &err); shaped.releaseBuffer(length); if (U_SUCCESS(err)) { U_NAMESPACE_QUALIFIER StringCharacterIterator iter(shaped); for (iter.setToStart(); iter.hasNext();) { UChar ch = iter.nextPostInc(); int32_t glyph_index = FT_Get_Char_Index(ft_face[j], ch); if (i == 0) { std::cerr << glyph_index << 
":"; } } if (i == 0) std::cerr << std::endl; } } } } { const char **shaper_list = hb_shape_list_shapers(); for ( ;*shaper_list; shaper_list++) { std::cerr << *shaper_list << std::endl; } std::cerr << "Starting Harfbuzz shaping" << std::endl; progress_timer timer2(std::clog,"Harfbuzz shaping done"); const char* const shapers[] = { /*"ot",*/"fallback" }; hb_buffer_t *buffer(hb_buffer_create()); for (unsigned i = 0; i < NUM_ITER; ++i) { for (unsigned j = 0; j < NUM_EXAMPLES; ++j) { UnicodeString text = UnicodeString::fromUTF8(texts[j]); int32_t length = text.length(); hb_buffer_clear_contents(buffer); //hb_buffer_set_unicode_funcs(buffer.get(), hb_icu_get_unicode_funcs()); hb_buffer_pre_allocate(buffer, length); hb_buffer_add_utf16(buffer, text.getBuffer(), text.length(), 0, length); hb_buffer_set_direction(buffer, text_directions[j]); hb_buffer_set_script(buffer, scripts[j]); hb_buffer_set_language(buffer,hb_language_from_string(languages[j], std::strlen(languages[j]))); //hb_shape(hb_ft_font[j], buffer.get(), 0, 0); hb_shape_full(hb_ft_font[j], buffer, 0, 0, shapers); unsigned num_glyphs = hb_buffer_get_length(buffer); hb_glyph_info_t *glyphs = hb_buffer_get_glyph_infos(buffer, NULL); //hb_glyph_position_t *positions = hb_buffer_get_glyph_positions(buffer.get(), NULL); for (unsigned k=0; k<num_glyphs; ++k) { int32_t glyph_index = glyphs[k].codepoint; if (i == 0) { std::cerr << glyph_index << ":"; } } if (i == 0) std::cerr << std::endl; } } hb_buffer_destroy(buffer); } // cleanup for (int j=0; j < NUM_EXAMPLES; ++j) { hb_font_destroy(hb_ft_font[j]); } FT_Done_FreeType(ft_library); return EXIT_SUCCESS; }
/**
 * Measures the string held by info.
 *
 * The text is split into bidi visual runs. Characters of LTR runs are added
 * to info in logical order. Characters of other runs are first copied in
 * reverse; if the first copied character lies in U+0600..U+06FF the copy is
 * passed through u_shapeArabic() before its characters are added, otherwise
 * the reversed copy is added as is.
 *
 * The accumulated width and the largest character height are stored with
 * info.set_dimensions(); both stay 0 if the bidi object cannot be set up.
 */
void font_face_set::get_string_info(string_info & info)
{
    unsigned width = 0;
    unsigned height = 0;
    UErrorCode err = U_ZERO_ERROR;
    UnicodeString const& ustr = info.get_string();
    const UChar * text = ustr.getBuffer();
    UBiDi * bidi = ubidi_openSized(ustr.length(),0,&err);

    if (U_FAILURE(err))
    {
        // No bidi object to work with (and none to close).
        info.set_dimensions(width, height);
        return;
    }

    ubidi_setPara(bidi,text,ustr.length(), UBIDI_DEFAULT_LTR,0,&err);

    if (U_SUCCESS(err))
    {
        int32_t const run_count = ubidi_countRuns(bidi,&err);
        int32_t logicalStart;
        int32_t length;

        for (int32_t run = 0; run < run_count; ++run)
        {
            if (UBIDI_LTR == ubidi_getVisualRun(bidi,run,&logicalStart,&length))
            {
                // Left-to-right run: walk it in logical order.
                do
                {
                    UChar ch = text[logicalStart++];
                    dimension_t char_dim = character_dimensions(ch);
                    info.add_info(ch, char_dim.width, char_dim.height);
                    width += char_dim.width;
                    if (char_dim.height > height) height = char_dim.height;
                } while (--length > 0);
            }
            else
            {
                // Other run: copy its characters back to front.
                logicalStart += length;

                int32_t out_pos = 0;
                int32_t remaining = length;
                UnicodeString arabic;
                UChar * buf = arabic.getBuffer(length);
                do
                {
                    UChar ch = text[--logicalStart];
                    buf[out_pos++] = ch;
                } while (--remaining > 0);
                arabic.releaseBuffer(length);

                UChar const first = *arabic.getBuffer();
                if (first >= 0x0600 && first <= 0x06ff)
                {
                    UnicodeString shaped;
                    u_shapeArabic(arabic.getBuffer(),arabic.length(),shaped.getBuffer(arabic.length()),arabic.length(),
                                  U_SHAPE_LETTERS_SHAPE|U_SHAPE_LENGTH_FIXED_SPACES_NEAR|
                                  U_SHAPE_TEXT_DIRECTION_VISUAL_LTR
                                  ,&err);
                    shaped.releaseBuffer(arabic.length());

                    if (U_SUCCESS(err))
                    {
                        for (int k = 0; k < shaped.length(); ++k)
                        {
                            dimension_t char_dim = character_dimensions(shaped[k]);
                            info.add_info(shaped[k], char_dim.width, char_dim.height);
                            width += char_dim.width;
                            if (char_dim.height > height) height = char_dim.height;
                        }
                    }
                }
                else
                {
                    // Non-Arabic RTL
                    for (int k = 0; k < arabic.length(); ++k)
                    {
                        dimension_t char_dim = character_dimensions(arabic[k]);
                        info.add_info(arabic[k], char_dim.width, char_dim.height);
                        width += char_dim.width;
                        if (char_dim.height > height) height = char_dim.height;
                    }
                }
            }
        }
    }

    ubidi_close(bidi);
    info.set_dimensions(width, height);
}
/**
 * Normalizes the text between offsets.start and offsets.limit one chunk at a
 * time with unorm_next(), using the mode stored in fMode.
 *
 * A chunk is written back with text.handleReplaceBetween() only when
 * unorm_next() reports that it needed normalizing; offsets.limit and
 * offsets.contextLimit are then adjusted by the resulting length change.
 * On return offsets.start is the position up to which the text was processed.
 *
 * @param text          the text to modify in place
 * @param offsets       in/out range description; start, limit and
 *                      contextLimit are updated
 * @param isIncremental when TRUE, processing stops before a chunk that
 *                      reaches the input limit, unless every character of
 *                      that chunk's normalized output is skippable
 */
void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                                      UBool isIncremental) const {
    // start and limit of the input range
    int32_t start = offsets.start;
    int32_t limit = offsets.limit;
    int32_t length, delta;

    // empty (or inverted) range: nothing to do
    if(start >= limit) {
        return;
    }

    // a C code unit iterator, implemented around the Replaceable
    UCharIterator iter;
    uiter_setReplaceable(&iter, &text);

    // the output string and buffer pointer
    UnicodeString output;
    UChar *buffer;
    UBool neededToNormalize;

    UErrorCode errorCode;

    /*
     * Normalize as short chunks at a time as possible even in
     * bulk mode, so that styled text is minimally disrupted.
     * In incremental mode, a chunk that ends with offsets.limit
     * must not be normalized.
     *
     * If it was known that the input text is not styled, then
     * a bulk mode normalization could look like this:
     *

    UChar staticChars[256];
    UnicodeString input;

    length = limit - start;
    input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias

    _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
    input.releaseBuffer(length);

    UErrorCode status = U_ZERO_ERROR;
    Normalizer::normalize(input, fMode, options, output, status);

    text.handleReplaceBetween(start, limit, output);

    int32_t delta = output.length() - length;
    offsets.contextLimit += delta;
    offsets.limit += delta;
    offsets.start = limit + delta;

     *
     */
    while(start < limit) {
        // set the iterator limits for the remaining input range
        // this is a moving target because of the replacements in the text object
        iter.start = iter.index = start;
        iter.limit = limit;

        // incrementally normalize a small chunk of the input
        // (first try: whatever capacity the output string currently has)
        buffer = output.getBuffer(-1);
        errorCode = U_ZERO_ERROR;
        length = unorm_next(&iter, buffer, output.getCapacity(),
                            fMode, 0,
                            TRUE, &neededToNormalize,
                            &errorCode);
        output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);

        if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
            // use a larger output string buffer and do it again from the start
            // (length is the value returned by the failed first call)
            iter.index = start;
            buffer = output.getBuffer(length);
            errorCode = U_ZERO_ERROR;
            length = unorm_next(&iter, buffer, output.getCapacity(),
                                fMode, 0,
                                TRUE, &neededToNormalize,
                                &errorCode);
            output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
        }

        // give up on any remaining error; offsets.start is still updated below
        if(U_FAILURE(errorCode)) {
            break;
        }

        // from here on, limit is the end of the chunk that was just consumed
        limit = iter.index;
        if(isIncremental && limit == iter.limit) {
            // stop in incremental mode when we reach the input limit
            // in case there are additional characters that could change the
            // normalization result

            // UNLESS all characters in the result of the normalization of
            // the last run are in the skippable set
            const UChar *s=output.getBuffer();
            int32_t i=0, outLength=output.length();
            UChar32 c;

            while(i<outLength) {
                U16_NEXT(s, i, outLength, c);
                if(!unorm_isNFSkippable(c, fMode)) {
                    outLength=-1; // I wish C++ had labeled loops and break outer; ...
                    break;
                }
            }
            // outLength<0 is the flag set above: a non-skippable character was found
            if (outLength<0) {
                break;
            }
        }

        if(neededToNormalize) {
            // replace the input chunk with its normalized form
            text.handleReplaceBetween(start, limit, output);

            // update all necessary indexes accordingly
            delta = length - (limit - start);   // length change in the text object
            start = limit += delta;             // the next chunk starts where this one ends, with adjustment
            limit = offsets.limit += delta;     // set the iteration limit to the adjusted end of the input range
            offsets.contextLimit += delta;
        } else {
            // delta == 0
            start = limit;
            limit = offsets.limit;
        }
    }

    offsets.start = start;
}
/*
*******************************************************************************
*
*   created on: 2013jul01
*   created by: Matitiahu Allouche

This function performs a conformance test for implementations of the
Unicode Bidirectional Algorithm, specified in UAX #9: Unicode
Bidirectional Algorithm, at http://www.unicode.org/unicode/reports/tr9/

Each test case is represented in a single line which is read from a file
named BidiCharacterTest.txt.  Empty, blank and comment lines may also appear
in this file.

The format of the test data is specified below.  Note that each test
case constitutes a single line of text; reordering is applied within a
single line and independently of a rendering engine, and rules L3 and L4
are out of scope.

The number sign '#' is the comment character: everything is ignored from
the occurrence of '#' until the end of the line.
Empty lines and lines containing only spaces and/or comments are ignored.

Lines which represent test cases consist of 4 or 5 fields separated by a
semicolon.  Each field consists of tokens separated by whitespace (space
or Tab).  Whitespace before and after semicolons is optional.

Field 0: A sequence of hexadecimal code point values separated by space

Field 1: A value representing the paragraph direction, as follows:
    - 0 represents left-to-right
    - 1 represents right-to-left
    - 2 represents auto-LTR according to rules P2 and P3 of the algorithm
    - 3 represents auto-RTL according to rules P2 and P3 of the algorithm
    - a negative number whose absolute value is taken as paragraph level;
      this may be useful to test cases where the embedding level approaches
      or exceeds the maximum embedding level.

Field 2: The resolved paragraph embedding level.  If the input (field 0)
         includes more than one paragraph, this field represents the
         resolved level of the first paragraph.

Field 3: An ordered list of resulting levels for each token in field 0
         (each token represents one source character).
         The UBA does not assign levels to certain characters (e.g. LRO);
         characters removed in rule X9 are indicated with an 'x'.

Field 4: An ordered list of indices showing the resulting visual ordering
         from left to right; characters with a resolved level of 'x' are
         skipped.  The numbers are zero-based.  Each index corresponds to
         a character in the reordered (visual) string.  It represents the
         index of the source character in the input (field 0).
         This field is optional.  When it is absent, the visual ordering
         is not verified.

Examples:

# This is a comment line.
L L ON R ; 0 ; 0 ; 0 0 0 1 ; 0 1 2 3
L L ON R;0;0;0 0 0 1;0 1 2 3

# Note: in the next line, 'B' represents a block separator, not the letter 'B'.
LRE A B C PDF;2;0;x 2 0 0 x;1 2 3
# Note: in the next line, 'b' represents the letter 'b', not a block separator.
a b c 05d0 05d1 x ; 0 ; 0 ; 0 0 0 1 1 0 ; 0 1 2 4 3 5

a R R x ; 1 ; 1 ; 2 1 1 2
L L R R R B R R L L L B ON ON ; 3 ; 0 ; 0 0 1 1 1 0 1 1 2 2 2 1 1 1

*
*******************************************************************************
*/
void BiDiConformanceTest::TestBidiCharacterTest() {
    IcuTestErrorCode errorCode(*this, "TestBidiCharacterTest");
    const char *sourceTestDataPath=getSourceTestData(errorCode);
    if(errorCode.logIfFailureAndReset("unable to find the source/test/testdata "
                                      "folder (getSourceTestData())")) {
        return;
    }
    // Build "<source test data dir>BidiCharacterTest.txt".
    // NOTE(review): strcpy/strcat into a fixed 400-byte array with no length check.
    char bidiTestPath[400];
    strcpy(bidiTestPath, sourceTestDataPath);
    strcat(bidiTestPath, "BidiCharacterTest.txt");
    LocalStdioFilePointer bidiTestFile(fopen(bidiTestPath, "r"));
    if(bidiTestFile.isNull()) {
        errln("unable to open %s", bidiTestPath);
        return;
    }
    LocalUBiDiPointer ubidi(ubidi_open());
    lineNumber=0;
    levelsCount=0;
    orderingCount=0;
    errorCount=0;
    // One test case per input line; stop once errorCount reaches 20.
    while(errorCount<20 && fgets(line, (int)sizeof(line), bidiTestFile.getAlias())!=NULL) {
        ++lineNumber;
        paraLevelName="N/A";
        inputString="N/A";
        // Remove trailing comments and whitespace.
        char *commentStart=strchr(line, '#');
        if(commentStart!=NULL) {
            *commentStart=0;
        }
        u_rtrim(line);
        const char *start=u_skipWhitespace(line);
        if(*start==0) {
            continue;  // Skip empty and comment-only lines.
        }
        // Parse the code point string in field 0.
        UChar *buffer=inputString.getBuffer(200);
        int32_t length=u_parseString(start, buffer, inputString.getCapacity(), NULL, errorCode);
        if(errorCode.logIfFailureAndReset("Invalid string in field 0")) {
            errln("Input line %d: %s", (int)lineNumber, line);
            // NOTE(review): this path does not call releaseBuffer() before remove().
            inputString.remove();
            continue;
        }
        inputString.releaseBuffer(length);
        // Field 1: paragraph direction.
        start=strchr(start, ';');
        if(start==NULL) {
            errorCount++;
            errln("\nError on line %d: Missing ; separator on line: %s", (int)lineNumber, line);
            continue;
        }
        start=u_skipWhitespace(start+1);
        char *end;
        int32_t paraDirection=(int32_t)strtol(start, &end, 10);
        // UBIDI_MAX_EXPLICIT_LEVEL+2 serves as the "no valid value seen" marker (checked below).
        UBiDiLevel paraLevel=UBIDI_MAX_EXPLICIT_LEVEL+2;
        if(paraDirection==0) {
            paraLevel=0;
            paraLevelName="LTR";
        }
        else if(paraDirection==1) {
            paraLevel=1;
            paraLevelName="RTL";
        }
        else if(paraDirection==2) {
            paraLevel=UBIDI_DEFAULT_LTR;
            paraLevelName="Auto/LTR";
        }
        else if(paraDirection==3) {
            paraLevel=UBIDI_DEFAULT_RTL;
            paraLevelName="Auto/RTL";
        }
        else if(paraDirection<0 && -paraDirection<=(UBIDI_MAX_EXPLICIT_LEVEL+1)) {
            // A negative value gives the paragraph level directly.
            paraLevel=(UBiDiLevel)(-paraDirection);
            sprintf(levelNameString, "%d", (int)paraLevel);
            paraLevelName=levelNameString;
        }
        // Reject: no digits consumed, junk after the number, or a value none of the branches accepted.
        if(end<=start || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) ||
                         paraLevel==(UBIDI_MAX_EXPLICIT_LEVEL+2)) {
            errln("\nError on line %d: Input paragraph direction incorrect at %s", (int)lineNumber, start);
            printErrorLine();
            continue;
        }
        // Field 2: resolved paragraph level (only 0 or 1 is accepted).
        start=u_skipWhitespace(end);
        if(*start!=';') {
            errorCount++;
            errln("\nError on line %d: Missing ; separator on line: %s", (int)lineNumber, line);
            continue;
        }
        start++;
        uint32_t resolvedParaLevel=(uint32_t)strtoul(start, &end, 10);
        if(end<=start || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) ||
           resolvedParaLevel>1) {
            errln("\nError on line %d: Resolved paragraph level incorrect at %s", (int)lineNumber, start);
            printErrorLine();
            continue;
        }
        // Field 3: expected levels.
        start=u_skipWhitespace(end);
        if(*start!=';') {
            errorCount++;
            errln("\nError on line %d: Missing ; separator on line: %s", (int)lineNumber, line);
            // NOTE(review): this ends the whole test, unlike the two earlier
            // missing-separator checks, which continue with the next line.
            return;
        }
        start++;
        if(!parseLevels(start)) {
            continue;
        }
        // Field 4 (optional): expected visual ordering.
        start=u_skipWhitespace(start);
        if(*start==';') {
            if(!parseOrdering(start+1)) {
                continue;
            }
        }
        else
            orderingCount=-1;  // negative count: the ordering check below is skipped

        // Run the algorithm on the input and compare against the expectations.
        ubidi_setPara(ubidi.getAlias(), inputString.getBuffer(), inputString.length(),
                      paraLevel, NULL, errorCode);
        const UBiDiLevel *actualLevels=ubidi_getLevels(ubidi.getAlias(), errorCode);
        if(errorCode.logIfFailureAndReset("ubidi_setPara() or ubidi_getLevels()")) {
            errln("Input line %d: %s", (int)lineNumber, line);
            continue;
        }
        UBiDiLevel actualLevel;
        if((actualLevel=ubidi_getParaLevel(ubidi.getAlias()))!=resolvedParaLevel) {
            printErrorLine();
            errln("\nError on line %d: Wrong resolved paragraph level; expected %d actual %d",
                   (int)lineNumber, resolvedParaLevel, actualLevel);
            continue;
        }
        if(!checkLevels(actualLevels, ubidi_getProcessedLength(ubidi.getAlias()))) {
            continue;
        }
        if(orderingCount>=0 && !checkOrdering(ubidi.getAlias())) {
            continue;
        }
    }
}