/**
 * Returns an upper-cased version of 'str'.
 *
 * The work is delegated to transform(), which is given a callback that maps
 * one code point to its ICU simple uppercase mapping (u_toupper).
 */
std::string toupper(const std::string& str)
{
    auto upperCodePoint = [](uint32_t cp) {
        return u_toupper(static_cast<UChar32>(cp));
    };
    return transform(str, upperCodePoint);
}
/*
 * Swaps the case of every code point of 'input' and returns the result as a
 * new Python object (NULL with a Python exception set on failure).
 *
 * The input is converted to UTF-16 by python_to_icu(), expanded to UTF-32 so
 * that every code point can be examined individually, case-swapped with the
 * ICU simple mappings (u_toupper / u_tolower), and converted back to UTF-16
 * before being handed to icu_to_python().
 *
 * 'self' is unused.
 */
static PyObject* icu_swap_case(PyObject *self, PyObject *input) {
    PyObject *result = NULL;
    UErrorCode status = U_ZERO_ERROR;
    UChar *input_buf = NULL, *output_buf = NULL;
    UChar32 *buf = NULL;
    /* All locals are declared up front so that 'goto end' never jumps over an initialisation. */
    int32_t sz = 0, sz32 = 0, i = 0, cap = 0;

    input_buf = python_to_icu(input, &sz);
    if (input_buf == NULL) goto end;

    /* The buffer capacities below are 3*cap and must still fit into an int32_t. */
    if (sz > INT32_MAX / 3) { PyErr_NoMemory(); goto end; }

    /* calloc(0, n) is allowed to return NULL, which would be misreported as an
     * out-of-memory condition for an empty string: always ask for at least one element. */
    cap = (sz > 0) ? sz : 1;
    output_buf = (UChar*) calloc(3 * (size_t)cap, sizeof(UChar));
    buf = (UChar32*) calloc(2 * (size_t)cap, sizeof(UChar32));
    if (output_buf == NULL || buf == NULL) { PyErr_NoMemory(); goto end; }

    u_strToUTF32(buf, 2 * cap, &sz32, input_buf, sz, &status);
    /* Report a conversion failure right away instead of walking a buffer whose content is not valid. */
    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); goto end; }

    for (i = 0; i < sz32; i++) {
        if (u_islower(buf[i])) buf[i] = u_toupper(buf[i]);
        else if (u_isupper(buf[i])) buf[i] = u_tolower(buf[i]);
    }

    u_strFromUTF32(output_buf, 3 * cap, &sz, buf, sz32, &status);
    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); goto end; }

    result = icu_to_python(output_buf, sz);

end:
    if (input_buf != NULL) free(input_buf);
    if (output_buf != NULL) free(output_buf);
    if (buf != NULL) free(buf);
    return result;
} // }}}
/**
 * Tells whether 'c' counts as an uppercase letter.
 *
 * With an alphabet, the answer comes from IS_UPPER_MACRO. Without one
 * (alphabet==NULL), 'c' must be a letter according to u_is_letter and
 * must be equal to its own u_toupper mapping.
 *
 * Returns 1 for uppercase, 0 otherwise.
 */
int is_upper(unichar c,const Alphabet* alphabet) {
    if (alphabet!=NULL) {
        return IS_UPPER_MACRO(c,alphabet);
    }
    /* No alphabet: non-letters are never uppercase */
    if (u_is_letter(c)==0) {
        return 0;
    }
    if (c==u_toupper(c)) {
        return 1;
    }
    return 0;
}
// --------------------------------------------------------------------------- // RangeToken: Getter methods // --------------------------------------------------------------------------- RangeToken* RangeToken::getCaseInsensitiveToken(TokenFactory* const tokFactory) { if (fCaseIToken == 0 && tokFactory) { bool isNRange = (getTokenType() == T_NRANGE) ? true : false; RangeToken* lwrToken = tokFactory->createRange(isNRange); for (unsigned int i = 0; i < fElemCount - 1; i += 2) { for (XMLInt32 ch = fRanges[i]; ch <= fRanges[i + 1]; ++ch) { #if defined(XML_USE_ICU_TRANSCODER) || defined (XML_USE_UNICONV390_TRANSCODER) const XMLInt32 upperCh = u_toupper(ch); if (upperCh != ch) { lwrToken->addRange(upperCh, upperCh); } const XMLInt32 lowerCh = u_tolower(ch); if (lowerCh != ch) { lwrToken->addRange(lowerCh, lowerCh); } const XMLInt32 titleCh = u_totitle(ch); if (titleCh != ch && titleCh != upperCh) { lwrToken->addRange(titleCh, titleCh); } #else if (ch >= chLatin_A && ch <= chLatin_Z) { ch += chLatin_a - chLatin_A; lwrToken->addRange(ch, ch); } else if (ch >= chLatin_a && ch <= chLatin_z) { ch -= chLatin_a - chLatin_A; lwrToken->addRange(ch, ch); } #endif } } lwrToken->mergeRanges(this); lwrToken->compactRanges(); lwrToken->createMap(); fCaseIToken = lwrToken; } return fCaseIToken; }
/*
Static Function:
AreEqualOrdinalIgnoreCase

Reports whether two code points are the same, or become the same once both
are upper-cased with u_toupper.
*/
static bool AreEqualOrdinalIgnoreCase(UChar32 one, UChar32 two)
{
    const UChar32 dotlessI = 0x0131;

    if (one == two)
    {
        return true;
    }

    // On Windows with InvariantCulture, the LATIN SMALL LETTER DOTLESS I (U+0131)
    // capitalizes to itself, whereas with ICU it capitalizes to LATIN CAPITAL LETTER I (U+0049).
    // To match the Windows invariant behavior, a dotless i is only ever equal to itself
    // (and that case has already been handled above).
    const bool involvesDotlessI = (one == dotlessI) || (two == dotlessI);

    return !involvesDotlessI && u_toupper(one) == u_toupper(two);
}
// Returns the uppercase form of this character.
// With APPLE_CHANGES the mapping comes from u_toupper (truncated to 16 bits);
// otherwise only values below 256 that are not already uppercase according to
// the C library are converted, everything else is returned unchanged.
UChar UChar::toUpper() const
{
#if APPLE_CHANGES
  const unsigned short upper = static_cast<unsigned short>(u_toupper(uc));
  return upper;
#else
  // isupper() is only consulted for values below 256.
  const bool convertible = uc < 256 && !isupper(uc);
  if (!convertible)
    return *this;

  return (unsigned char)toupper(uc);
#endif
}
/**
 * Tells whether 'upper' is an uppercase equivalent of 'lower'.
 *
 * Without an alphabet (alphabet==NULL), 'upper' must be exactly
 * u_toupper(lower). With an alphabet, 'upper' must appear in the
 * '\0'-terminated list that the alphabet associates with 'lower';
 * an index of 0 in pos_in_represent_list means there is no such list.
 *
 * Returns 1 (non-zero) if it is the case, 0 otherwise.
 */
int is_upper_of(unichar lower,unichar upper,const Alphabet* alphabet) {
    if (alphabet==NULL) {
        return upper==u_toupper(lower);
    }
    const int slot=alphabet->pos_in_represent_list[lower];
    if (slot==0) {
        /* No equivalents are recorded for this letter */
        return 0;
    }
    for (const unichar* candidate=alphabet->t_array_collection[slot];*candidate!='\0';candidate++) {
        if (*candidate==upper) {
            return 1;
        }
    }
    return 0;
}
extern "C" int32_t CompareStringOrdinalIgnoreCase(const UChar* lpStr1, int32_t cwStr1Length, const UChar* lpStr2, int32_t cwStr2Length) { assert(lpStr1 != nullptr); assert(cwStr1Length >= 0); assert(lpStr2 != nullptr); assert(cwStr2Length >= 0); int32_t str1Idx = 0; int32_t str2Idx = 0; while (str1Idx < cwStr1Length && str2Idx < cwStr2Length) { UChar32 str1Codepoint; UChar32 str2Codepoint; U16_NEXT(lpStr1, str1Idx, cwStr1Length, str1Codepoint); U16_NEXT(lpStr2, str2Idx, cwStr2Length, str2Codepoint); if (str1Codepoint != str2Codepoint && u_toupper(str1Codepoint) != u_toupper(str2Codepoint)) { return str1Codepoint < str2Codepoint ? -1 : 1; } } if (cwStr1Length < cwStr2Length) { return -1; } if (cwStr2Length < cwStr1Length) { return 1; } return 0; }
// Returns the form of 'name' used to compare file paths regardless of case.
// Outside of __WXMSW__ builds the name is returned untouched; on __WXMSW__
// every character is upper-cased with ICU's u_toupper.
wxString FilePathNormalCase(const wxString& name)
{
#ifdef __WXMSW__
    // Original author's note: wxString::Upper is buggy under Windows, and so
    // is the case-insensitive file name handling of Windows, but not in the
    // same way - hence the per-character ICU mapping.
    wxString normalized;
    BOOST_FOREACH(wxChar ch, name)
    {
        const UChar32 upper = u_toupper((UChar32)(unsigned int)ch);
        normalized.append(1, (wxChar)upper);
    }
    return normalized;
#else
    return name;
#endif
}
/** * * @version 0.4-1 (Marek Gagolewski, 2014-12-07) */ void StriContainerByteSearch::upgradePatternCaseInsensitive() { UChar32 c = 0; R_len_t j = 0; patternLenCaseInsensitive = 0; while (j < patternLen) { U8_NEXT(patternStr, j, patternLen, c); #ifndef NDEBUG if (patternLenCaseInsensitive >= this->kmpMaxSize) throw StriException("!NDEBUG: StriContainerByteSearch::upgradePatternCaseInsensitive()"); #endif patternStrCaseInsensitive[patternLenCaseInsensitive++] = u_toupper(c); } patternStrCaseInsensitive[patternLenCaseInsensitive] = 0; }
// Upper-cases the UTF-16 content of '_str' in place: each code point is read
// with U16_NEXT, mapped with u_toupper and written back with
// U16_APPEND_UNSAFE (no capacity check is done when writing).
void ICUUnicodeSupport::_toUpperCase<2>(StringHolder<2> _str)
{
    if(_str.empty())
        return;

    uint16_t* units = &_str[0];
    const int32_t unitCount = _str.length();
    int32_t readPos = 0;
    int32_t writePos = 0;

    while(readPos != unitCount)
    {
        UChar32 codePoint;
        U16_NEXT(units, readPos, unitCount, codePoint);
        codePoint = u_toupper(codePoint);
        U16_APPEND_UNSAFE(units, writePos, codePoint);
    }
}
/** find last match - KMP
 *
 * Scans the search string backwards from startPos. On success searchPos and
 * searchEnd are set to the byte range of the match; on failure both are set
 * to searchLen.
 *
 * @param startPos where to start
 * @return USEARCH_DONE on no match, otherwise start index
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-11)
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-07)
 *    use BYTESEARCH_CASE_INSENSITIVE
 */
R_len_t StriContainerByteSearch::findFromPosBack_KMP(R_len_t startPos)
{
   int cursor = startPos;
   patternPos = 0;

   if (!(flags&BYTESEARCH_CASE_INSENSITIVE)) {
      // byte-wise comparison
      while (cursor > 0) {
         cursor--;
         while (patternPos >= 0 && patternStr[patternLen-1-patternPos] != searchStr[cursor])
            patternPos = kmpNext[patternPos];
         patternPos++;
         if (patternPos == patternLen) {
            searchEnd = cursor+patternLen;
            searchPos = cursor;
            return searchPos;
         }
      }
   }
   else {
      // code point-wise comparison of upper-cased code points
      while (cursor > 0) {
         UChar32 codePoint;
         U8_PREV(searchStr, 0, cursor, codePoint);
         codePoint = u_toupper(codePoint);
         while (patternPos >= 0 && patternStrCaseInsensitive[patternLenCaseInsensitive-1-patternPos] != codePoint)
            patternPos = kmpNext[patternPos];
         patternPos++;
         if (patternPos == patternLenCaseInsensitive) {
            searchPos = cursor;

            // the end of the match lies patternLenCaseInsensitive
            // code points further
            searchEnd = cursor;
            for (R_len_t remaining = patternLenCaseInsensitive; remaining > 0; remaining--) {
               U8_FWD_1((const uint8_t*)searchStr, searchEnd, searchLen);
            }

            return searchPos;
         }
      }
   }

   // else not found
   searchPos = searchEnd = searchLen;
   return USEARCH_DONE;
}
/** Check whether the search string, cut at a given byte index, ends with the pattern
 *
 * In case-insensitive mode, the code points preceding byteindex are
 * upper-cased with u_toupper() and compared, last to first, with
 * patternStrCaseInsensitive; otherwise the bytes are compared directly.
 *
 * If the beginning of the search string is reached before the whole pattern
 * has been compared, there is no match (nothing is ever read before
 * index 0).
 *
 * @param byteindex byte index in the search string just past the candidate match
 * @return true if the pattern ends at byteindex
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-07)
 */
bool StriContainerByteSearch::endsWith(R_len_t byteindex)
{
   if (flags&BYTESEARCH_CASE_INSENSITIVE) {
      for (R_len_t k = 0; k < patternLenCaseInsensitive; ++k) {
         // U8_PREV requires start < i: the search string may consist of
         // fewer code points than the pattern
         if (byteindex <= 0)
            return false;
         UChar32 c;
         U8_PREV(searchStr, 0, byteindex, c);
         c = u_toupper(c);
         if (patternStrCaseInsensitive[patternLenCaseInsensitive-k-1] != c)
            return false;
      }
   }
   else {
      for (R_len_t k=0; k < patternLen; ++k) {
         // do not index before the beginning of the search string
         if (byteindex-k-1 < 0)
            return false;
         if (searchStr[byteindex-k-1] != patternStr[patternLen-k-1])
            return false;
      }
   }

   return true; // found
}
/** find first match - KMP
 *
 * Scans the search string forwards from startPos. On success searchPos and
 * searchEnd are set to the byte range of the match; on failure both are set
 * to searchLen.
 *
 * @param startPos where to start
 * @return USEARCH_DONE on no match, otherwise start index
 *
 * @version 0.1-?? (Bartek Tartanus, 2013-08-15)
 *          KMP - first approach
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-11)
 *          KMP upgraded; separate method
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-07)
 *    use BYTESEARCH_CASE_INSENSITIVE
 */
R_len_t StriContainerByteSearch::findFromPosFwd_KMP(R_len_t startPos)
{
   int cursor = startPos;
   patternPos = 0;

   if (!(flags&BYTESEARCH_CASE_INSENSITIVE)) {
      // byte-wise comparison
      while (cursor < searchLen) {
         while (patternPos >= 0 && patternStr[patternPos] != searchStr[cursor])
            patternPos = kmpNext[patternPos];
         patternPos++;
         cursor++;
         if (patternPos == patternLen) {
            searchEnd = cursor;
            searchPos = cursor-patternLen;
            return searchPos;
         }
      }
   }
   else {
      // code point-wise comparison of upper-cased code points
      UChar32 codePoint = 0;
      while (cursor < searchLen) {
         U8_NEXT(searchStr, cursor, searchLen, codePoint);
         codePoint = u_toupper(codePoint);
         while (patternPos >= 0 && patternStrCaseInsensitive[patternPos] != codePoint)
            patternPos = kmpNext[patternPos];
         patternPos++;
         if (patternPos == patternLenCaseInsensitive) {
            searchEnd = cursor;

            // the start of the match lies patternLenCaseInsensitive
            // code points before
            searchPos = cursor;
            for (R_len_t remaining = patternLenCaseInsensitive; remaining > 0; remaining--) {
               U8_BACK_1((const uint8_t*)searchStr, 0, searchPos);
            }

            return searchPos;
         }
      }
   }

   // else not found
   searchPos = searchEnd = searchLen;
   return USEARCH_DONE;
}
/** Check whether the pattern occurs in the search string at a given byte index
 *
 * In case-insensitive mode, the code points starting at byteindex are
 * upper-cased with u_toupper() and compared with patternStrCaseInsensitive;
 * otherwise the bytes are compared directly.
 *
 * If the end of the search string (searchLen) is reached before the whole
 * pattern has been compared, there is no match (nothing is ever read at or
 * past searchLen).
 *
 * @param byteindex byte index in the search string where the candidate match starts
 * @return true if the pattern starts at byteindex
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-07)
 */
bool StriContainerByteSearch::startsWith(R_len_t byteindex)
{
   if (flags&BYTESEARCH_CASE_INSENSITIVE) {
      for (R_len_t k = 0; k < patternLenCaseInsensitive; ++k) {
         // U8_NEXT requires i < length: the remaining part of the search
         // string may consist of fewer code points than the pattern
         if (byteindex >= searchLen)
            return false;
         UChar32 c;
         U8_NEXT(searchStr, byteindex, searchLen, c);
         c = u_toupper(c);
         if (patternStrCaseInsensitive[k] != c)
            return false;
      }
   }
   else {
      for (R_len_t k=0; k < patternLen; ++k) {
         // do not index at or past the end of the search string
         if (byteindex+k >= searchLen)
            return false;
         if (searchStr[byteindex+k] != patternStr[k])
            return false;
      }
   }

   return true; // found
}
/* Function: ChangeCase Performs upper or lower casing of a string into a new buffer. No special casing is performed beyond that provided by ICU. */ extern "C" void ChangeCase(const UChar* lpSrc, int32_t cwSrcLength, UChar* lpDst, int32_t cwDstLength, int32_t bToUpper) { // Iterate through the string, decoding the next one or two UTF-16 code units // into a codepoint and updating srcIdx to point to the next UTF-16 code unit // to decode. Then upper or lower case it, write dstCodepoint into lpDst at // offset dstIdx, and update dstIdx. // (The loop here has been manually cloned for each of the four cases, rather // than having a single loop that internally branched based on bToUpper as the // compiler wasn't doing that optimization, and it results in an ~15-20% perf // improvement on longer strings.) UBool isError = FALSE; int32_t srcIdx = 0, dstIdx = 0; UChar32 srcCodepoint, dstCodepoint; if (bToUpper) { while (srcIdx < cwSrcLength) { U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint); dstCodepoint = u_toupper(srcCodepoint); U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError); assert(isError == FALSE && srcIdx == dstIdx); } } else { while (srcIdx < cwSrcLength) { U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint); dstCodepoint = u_tolower(srcCodepoint); U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError); assert(isError == FALSE && srcIdx == dstIdx); } } }
/*
 * Prints a few properties of one code point to stdout: its name, general
 * category, lowercase flag, simple uppercase mapping, digit value and
 * BiDi directional category.
 */
static void printProps(UChar32 codePoint) {
    char buffer[100];
    UErrorCode errorCode;

    /* get the character name */
    errorCode=U_ZERO_ERROR;
    /* make sure that the buffer holds a terminated string whatever u_charName() does on failure */
    buffer[0]=0;
    u_charName(codePoint, U_UNICODE_CHAR_NAME, buffer, (int32_t)sizeof(buffer), &errorCode);
    buffer[sizeof(buffer)-1]=0;

    /* print the code point and the character name */
    /* UChar32 is a 32-bit int: it must be converted explicitly to match the %lx conversion */
    printf("U+%04lx\t%s\n", (unsigned long)codePoint, buffer);

    /* print some properties */
    printf("    general category (numeric enum value): %u\n", (unsigned int)u_charType(codePoint));

    /* note: these APIs do not provide the data from SpecialCasing.txt */
    printf("    is lowercase: %d  uppercase: U+%04lx\n", u_islower(codePoint), (unsigned long)u_toupper(codePoint));

    printf("    is digit: %d  decimal digit value: %d\n", u_isdigit(codePoint), u_charDigitValue(codePoint));

    printf("    BiDi directional category (numeric enum value): %u\n", (unsigned int)u_charDirection(codePoint));
}
/* Function: ChangeCaseInvariant Performs upper or lower casing of a string into a new buffer. Special casing is performed to ensure that invariant casing matches that of Windows in certain situations, e.g. Turkish i's. */ extern "C" void ChangeCaseInvariant(const UChar* lpSrc, int32_t cwSrcLength, UChar* lpDst, int32_t cwDstLength, int32_t bToUpper) { // See algorithmic comment in ChangeCase. UBool isError = FALSE; int32_t srcIdx = 0, dstIdx = 0; UChar32 srcCodepoint, dstCodepoint; if (bToUpper) { while (srcIdx < cwSrcLength) { // On Windows with InvariantCulture, the LATIN SMALL LETTER DOTLESS I (U+0131) // capitalizes to itself, whereas with ICU it capitalizes to LATIN CAPITAL LETTER I (U+0049). // We special case it to match the Windows invariant behavior. U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint); dstCodepoint = ((srcCodepoint == (UChar32)0x0131) ? (UChar32)0x0131 : u_toupper(srcCodepoint)); U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError); assert(isError == FALSE && srcIdx == dstIdx); } } else { while (srcIdx < cwSrcLength) { // On Windows with InvariantCulture, the LATIN CAPITAL LETTER I WITH DOT ABOVE (U+0130) // lower cases to itself, whereas with ICU it lower cases to LATIN SMALL LETTER I (U+0069). // We special case it to match the Windows invariant behavior. U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint); dstCodepoint = ((srcCodepoint == (UChar32)0x0130) ? (UChar32)0x0130 : u_tolower(srcCodepoint)); U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError); assert(isError == FALSE && srcIdx == dstIdx); } } }
/* Function: ChangeCaseTurkish Performs upper or lower casing of a string into a new buffer, performing special casing for Turkish. */ extern "C" void ChangeCaseTurkish(const UChar* lpSrc, int32_t cwSrcLength, UChar* lpDst, int32_t cwDstLength, int32_t bToUpper) { // See algorithmic comment in ChangeCase. UBool isError = FALSE; int32_t srcIdx = 0, dstIdx = 0; UChar32 srcCodepoint, dstCodepoint; if (bToUpper) { while (srcIdx < cwSrcLength) { // In turkish casing, LATIN SMALL LETTER I (U+0069) upper cases to LATIN // CAPITAL LETTER I WITH DOT ABOVE (U+0130). U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint); dstCodepoint = ((srcCodepoint == (UChar32)0x0069) ? (UChar32)0x0130 : u_toupper(srcCodepoint)); U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError); assert(isError == FALSE && srcIdx == dstIdx); } } else { while (srcIdx < cwSrcLength) { // In turkish casing, LATIN CAPITAL LETTER I (U+0049) lower cases to // LATIN SMALL LETTER DOTLESS I (U+0131). U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint); dstCodepoint = ((srcCodepoint == (UChar32)0x0049) ? (UChar32)0x0131 : u_tolower(srcCodepoint)); U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError); assert(isError == FALSE && srcIdx == dstIdx); } } }
/*
 * Looks up the glyph data for character 'c' and the glyph page it was found in (or cached into).
 *
 * Outline of the steps, in order:
 *  1. Asserts that it runs on the main thread.
 *  2. Resolves AutoVariant: when the description asks for small caps and the primary simple font data is not an
 *     SVG font, 'c' is replaced by u_toupper(c) and SmallCapsVariant is selected if that changed the character;
 *     in every other case NormalVariant is selected.
 *  3. When 'mirror' is set, replaces 'c' by u_charMirror(c).
 *  4. Computes the page number (c / GlyphPage::size) and fetches the matching GlyphPageTreeNode from m_pageZero
 *     (page 0) or m_pages, creating it with GlyphPageTreeNode::getRootChild and caching it when it is missing.
 *  5. NormalVariant: walks the node chain (node->getChild) until a page provides font data for 'c' or a system
 *     fallback node is reached. Font data that is neither horizontal nor a text orientation fallback is routed
 *     to the CJK / non-CJK orientation helpers, or switches 'variant' to BrokenIdeographVariant.
 *  6. Any other variant (including one selected during step 5): walks the node chain the same way, but resolves
 *     the glyph through data.fontData->variantFontData(); returns the plain data if there is no variant font
 *     data, and the variant's missing glyph data if the variant page does not have the character.
 *  7. System fallback: encodes 'c' as one or two UTF-16 code units and asks
 *     fontCache().systemFallbackForCharacters() for font data. For NormalVariant the result is stored in the
 *     page (page->setGlyphDataForCharacter) so the lookup is not repeated.
 *  8. If even that fails, uses the missing glyph data of the primary simple font data (cached in the page as
 *     well for NormalVariant).
 *
 * Returns a (GlyphData, GlyphPage*) pair.
 *
 * NOTE(review): this definition is stored on a few very long lines, so the '//' comments and '#if' directives
 * embedded in it are not followed by line breaks - confirm against the upstream formatting before compiling.
 */
std::pair<GlyphData, GlyphPage*> FontGlyphs::glyphDataAndPageForCharacter(const FontDescription& description, UChar32 c, bool mirror, FontDataVariant variant) const { ASSERT(isMainThread()); if (variant == AutoVariant) { if (description.smallCaps() && !primarySimpleFontData(description)->isSVGFont()) { UChar32 upperC = u_toupper(c); if (upperC != c) { c = upperC; variant = SmallCapsVariant; } else variant = NormalVariant; } else variant = NormalVariant; } if (mirror) c = u_charMirror(c); unsigned pageNumber = (c / GlyphPage::size); GlyphPageTreeNode* node = pageNumber ? m_pages.get(pageNumber) : m_pageZero; if (!node) { node = GlyphPageTreeNode::getRootChild(realizeFontDataAt(description, 0), pageNumber); if (pageNumber) m_pages.set(pageNumber, node); else m_pageZero = node; } GlyphPage* page = 0; if (variant == NormalVariant) { // Fastest loop, for the common case (normal variant). while (true) { page = node->page(); if (page) { GlyphData data = page->glyphDataForCharacter(c); if (data.fontData && (data.fontData->platformData().orientation() == Horizontal || data.fontData->isTextOrientationFallback())) return std::make_pair(data, page); if (data.fontData) { if (Font::isCJKIdeographOrSymbol(c)) { if (!data.fontData->hasVerticalGlyphs()) { // Use the broken ideograph font data. The broken ideograph font will use the horizontal width of glyphs // to make sure you get a square (even for broken glyphs like symbols used for punctuation). 
variant = BrokenIdeographVariant; break; } #if PLATFORM(COCOA) else if (data.fontData->platformData().syntheticOblique()) return glyphDataAndPageForCJKCharacterWithoutSyntheticItalic(c, data, page, pageNumber); #endif } else return glyphDataAndPageForNonCJKCharacterWithGlyphOrientation(c, description.nonCJKGlyphOrientation(), data, page, pageNumber); return std::make_pair(data, page); } if (node->isSystemFallback()) break; } node = node->getChild(realizeFontDataAt(description, node->level()), pageNumber); if (pageNumber) m_pages.set(pageNumber, node); else m_pageZero = node; } } if (variant != NormalVariant) { while (true) { page = node->page(); if (page) { GlyphData data = page->glyphDataForCharacter(c); if (data.fontData) { // The variantFontData function should not normally return 0. // But if it does, we will just render the capital letter big. RefPtr<SimpleFontData> variantFontData = data.fontData->variantFontData(description, variant); if (!variantFontData) return std::make_pair(data, page); GlyphPageTreeNode* variantNode = GlyphPageTreeNode::getRootChild(variantFontData.get(), pageNumber); GlyphPage* variantPage = variantNode->page(); if (variantPage) { GlyphData data = variantPage->glyphDataForCharacter(c); if (data.fontData) return std::make_pair(data, variantPage); } // Do not attempt system fallback off the variantFontData. This is the very unlikely case that // a font has the lowercase character but the small caps font does not have its uppercase version. return std::make_pair(variantFontData->missingGlyphData(), page); } if (node->isSystemFallback()) break; } node = node->getChild(realizeFontDataAt(description, node->level()), pageNumber); if (pageNumber) m_pages.set(pageNumber, node); else m_pageZero = node; } } ASSERT(page); ASSERT(node->isSystemFallback()); // System fallback is character-dependent. When we get here, we // know that the character in question isn't in the system fallback // font's glyph page. Try to lazily create it here. 
UChar codeUnits[2]; int codeUnitsLength; if (c <= 0xFFFF) { codeUnits[0] = Font::normalizeSpaces(c); codeUnitsLength = 1; } else { codeUnits[0] = U16_LEAD(c); codeUnits[1] = U16_TRAIL(c); codeUnitsLength = 2; } const SimpleFontData* originalFontData = primaryFontData(description)->fontDataForCharacter(c); RefPtr<SimpleFontData> characterFontData = fontCache().systemFallbackForCharacters(description, originalFontData, m_isForPlatformFont, codeUnits, codeUnitsLength); if (characterFontData) { if (characterFontData->platformData().orientation() == Vertical && !characterFontData->hasVerticalGlyphs() && Font::isCJKIdeographOrSymbol(c)) variant = BrokenIdeographVariant; if (variant != NormalVariant) characterFontData = characterFontData->variantFontData(description, variant); } if (characterFontData) { // Got the fallback glyph and font. GlyphPage* fallbackPage = GlyphPageTreeNode::getRootChild(characterFontData.get(), pageNumber)->page(); GlyphData data = fallbackPage && fallbackPage->fontDataForCharacter(c) ? fallbackPage->glyphDataForCharacter(c) : characterFontData->missingGlyphData(); // Cache it so we don't have to do system fallback again next time. if (variant == NormalVariant) { #if OS(WINCE) // missingGlyphData returns a null character, which is not suitable for GDI to display. // Also, sometimes we cannot map a font for the character on WINCE, but GDI can still // display the character, probably because the font package is not installed correctly. // So we just always set the glyph to be same as the character, and let GDI solve it. 
page->setGlyphDataForCharacter(c, c, characterFontData.get()); characterFontData->setMaxGlyphPageTreeLevel(std::max(characterFontData->maxGlyphPageTreeLevel(), node->level())); return std::make_pair(page->glyphDataForCharacter(c), page); #else page->setGlyphDataForCharacter(c, data.glyph, data.fontData); data.fontData->setMaxGlyphPageTreeLevel(std::max(data.fontData->maxGlyphPageTreeLevel(), node->level())); if (!Font::isCJKIdeographOrSymbol(c) && data.fontData->platformData().orientation() != Horizontal && !data.fontData->isTextOrientationFallback()) return glyphDataAndPageForNonCJKCharacterWithGlyphOrientation(c, description.nonCJKGlyphOrientation(), data, fallbackPage, pageNumber); #endif } return std::make_pair(data, page); } // Even system fallback can fail; use the missing glyph in that case. // FIXME: It would be nicer to use the missing glyph from the last resort font instead. GlyphData data = primarySimpleFontData(description)->missingGlyphData(); if (variant == NormalVariant) { #if OS(WINCE) // See comment about WINCE GDI handling near setGlyphDataForCharacter above. page->setGlyphDataForCharacter(c, c, data.fontData); data.fontData->setMaxGlyphPageTreeLevel(std::max(data.fontData->maxGlyphPageTreeLevel(), node->level())); return std::make_pair(page->glyphDataForCharacter(c), page); #else page->setGlyphDataForCharacter(c, data.glyph, data.fontData); data.fontData->setMaxGlyphPageTreeLevel(std::max(data.fontData->maxGlyphPageTreeLevel(), node->level())); #endif } return std::make_pair(data, page); }
/*
 * Consumes characters from 'textIterator', accumulating their advance widths into m_runWidthSoFar and, when
 * 'glyphBuffer' is not null, appending one glyph per consumed character to it.
 *
 * For every character handed out by textIterator.consume():
 *  - the glyph and font data come from glyphDataForCharacter();
 *  - the width is the tab width for '\t' when the run allows tabs, otherwise the glyph width scaled by
 *    m_run.horizontalGlyphStretch(), replaced by the adjusted space width when word rounding applies and the
 *    glyph is as wide as a space (fixed-pitch font, or the space glyph itself);
 *  - when the font data differs from the previous one (and the width is non-zero), pending font transforms are
 *    applied and, if m_fallbackFonts is set, the non-primary font data is recorded in it; for small caps the
 *    font of the upper-cased character (u_toupper) is recorded instead when the character is not already its
 *    own uppercase;
 *  - letter spacing, justification expansion (m_expansion / m_expansionPerOpportunity) and word spacing are
 *    added when extra spacing is in effect;
 *  - glyph bounds are tracked when m_accountForGlyphBounds is set;
 *  - the "rounding hack" forces word boundaries (and the end of the run, with run rounding) onto integer
 *    positions; the fraction carried between characters lives in widthSinceLastRounding.
 *
 * After the loop the remaining font transforms are applied and m_currentCharacter, m_runWidthSoFar and
 * m_finalRoundingWidth are updated.
 *
 * Returns the number of characters consumed, i.e. the distance the iterator's current character moved.
 *
 * NOTE(review): inside the font-change branch glyphBuffer->size() is called without the null check used
 * elsewhere in this function - presumably shouldApplyFontTransforms() implies a buffer; confirm.
 * NOTE(review): this definition is stored on a few very long lines, so the '//' comments embedded in it are
 * not followed by line breaks - confirm against the upstream formatting before compiling.
 */
inline unsigned WidthIterator::advanceInternal(TextIterator& textIterator, GlyphBuffer* glyphBuffer) { bool rtl = m_run.rtl(); bool hasExtraSpacing = (m_font->letterSpacing() || m_font->wordSpacing() || m_expansion) && !m_run.spacingDisabled(); float widthSinceLastRounding = m_runWidthSoFar; m_runWidthSoFar = floorf(m_runWidthSoFar); widthSinceLastRounding -= m_runWidthSoFar; float lastRoundingWidth = m_finalRoundingWidth; FloatRect bounds; const SimpleFontData* primaryFont = m_font->primaryFont(); const SimpleFontData* lastFontData = primaryFont; int lastGlyphCount = glyphBuffer ? glyphBuffer->size() : 0; UChar32 character = 0; unsigned clusterLength = 0; CharactersTreatedAsSpace charactersTreatedAsSpace; String normalizedSpacesStringCache; while (textIterator.consume(character, clusterLength)) { unsigned advanceLength = clusterLength; int currentCharacter = textIterator.currentCharacter(); const GlyphData& glyphData = glyphDataForCharacter(character, rtl, currentCharacter, advanceLength, normalizedSpacesStringCache); Glyph glyph = glyphData.glyph; const SimpleFontData* fontData = glyphData.fontData; ASSERT(fontData); // Now that we have a glyph and font data, get its width. float width; if (character == '\t' && m_run.allowTabs()) width = m_font->tabWidth(*fontData, m_run.tabSize(), m_run.xPos() + m_runWidthSoFar + widthSinceLastRounding); else { width = fontData->widthForGlyph(glyph); // SVG uses horizontalGlyphStretch(), when textLength is used to stretch/squeeze text. width *= m_run.horizontalGlyphStretch(); // We special case spaces in two ways when applying word rounding. // First, we round spaces to an adjusted width in all fonts. // Second, in fixed-pitch fonts we ensure that all characters that // match the width of the space character have the same width as the space character. 
if (m_run.applyWordRounding() && width == fontData->spaceWidth() && (fontData->pitch() == FixedPitch || glyph == fontData->spaceGlyph())) width = fontData->adjustedSpaceWidth(); } if (fontData != lastFontData && width) { if (shouldApplyFontTransforms()) { m_runWidthSoFar += applyFontTransforms(glyphBuffer, m_run.ltr(), lastGlyphCount, lastFontData, *this, m_typesettingFeatures, charactersTreatedAsSpace); lastGlyphCount = glyphBuffer->size(); // applyFontTransforms doesn't update when there had been only one glyph. } lastFontData = fontData; if (m_fallbackFonts && fontData != primaryFont) { // FIXME: This does a little extra work that could be avoided if // glyphDataForCharacter() returned whether it chose to use a small caps font. if (!m_font->isSmallCaps() || character == u_toupper(character)) m_fallbackFonts->add(fontData); else { const GlyphData& uppercaseGlyphData = m_font->glyphDataForCharacter(u_toupper(character), rtl); if (uppercaseGlyphData.fontData != primaryFont) m_fallbackFonts->add(uppercaseGlyphData.fontData); } } } if (hasExtraSpacing) { // Account for letter-spacing. if (width && m_font->letterSpacing()) width += m_font->letterSpacing(); static bool expandAroundIdeographs = Font::canExpandAroundIdeographsInComplexText(); bool treatAsSpace = Font::treatAsSpace(character); if (treatAsSpace || (expandAroundIdeographs && Font::isCJKIdeographOrSymbol(character))) { // Distribute the run's total expansion evenly over all expansion opportunities in the run. if (m_expansion) { float previousExpansion = m_expansion; if (!treatAsSpace && !m_isAfterExpansion) { // Take the expansion opportunity before this ideograph. m_expansion -= m_expansionPerOpportunity; float expansionAtThisOpportunity = !m_run.applyWordRounding() ? 
m_expansionPerOpportunity : roundf(previousExpansion) - roundf(m_expansion); m_runWidthSoFar += expansionAtThisOpportunity; if (glyphBuffer) { if (glyphBuffer->isEmpty()) { if (m_forTextEmphasis) glyphBuffer->add(fontData->zeroWidthSpaceGlyph(), fontData, m_expansionPerOpportunity, currentCharacter); else glyphBuffer->add(fontData->spaceGlyph(), fontData, expansionAtThisOpportunity, currentCharacter); } else glyphBuffer->expandLastAdvance(expansionAtThisOpportunity); } previousExpansion = m_expansion; } if (m_run.allowsTrailingExpansion() || (m_run.ltr() && currentCharacter + advanceLength < static_cast<size_t>(m_run.length())) || (m_run.rtl() && currentCharacter)) { m_expansion -= m_expansionPerOpportunity; width += !m_run.applyWordRounding() ? m_expansionPerOpportunity : roundf(previousExpansion) - roundf(m_expansion); m_isAfterExpansion = true; } } else m_isAfterExpansion = false; // Account for word spacing. // We apply additional space between "words" by adding width to the space character. if (treatAsSpace && (character != '\t' || !m_run.allowTabs()) && (currentCharacter || character == noBreakSpace) && m_font->wordSpacing()) width += m_font->wordSpacing(); } else m_isAfterExpansion = false; } if (shouldApplyFontTransforms() && glyphBuffer && Font::treatAsSpace(character)) charactersTreatedAsSpace.append(std::make_pair(glyphBuffer->size(), OriginalAdvancesForCharacterTreatedAsSpace(character == ' ', glyphBuffer->size() ? glyphBuffer->advanceAt(glyphBuffer->size() - 1).width() : 0, width))); if (m_accountForGlyphBounds) { bounds = fontData->boundsForGlyph(glyph); if (!currentCharacter) m_firstGlyphOverflow = std::max<float>(0, -bounds.x()); } if (m_forTextEmphasis && !Font::canReceiveTextEmphasis(character)) glyph = 0; // Advance past the character we just dealt with. 
textIterator.advance(advanceLength); float oldWidth = width; // Force characters that are used to determine word boundaries for the rounding hack // to be integer width, so following words will start on an integer boundary. if (m_run.applyWordRounding() && Font::isRoundingHackCharacter(character)) { width = ceilf(width); // Since widthSinceLastRounding can lose precision if we include measurements for // preceding whitespace, we bypass it here. m_runWidthSoFar += width; // Since this is a rounding hack character, we should have reset this sum on the previous // iteration. ASSERT(!widthSinceLastRounding); } else { // Check to see if the next character is a "rounding hack character", if so, adjust // width so that the total run width will be on an integer boundary. if ((m_run.applyWordRounding() && textIterator.currentCharacter() < m_run.length() && Font::isRoundingHackCharacter(*(textIterator.characters()))) || (m_run.applyRunRounding() && textIterator.currentCharacter() >= m_run.length())) { float totalWidth = widthSinceLastRounding + width; widthSinceLastRounding = ceilf(totalWidth); width += widthSinceLastRounding - totalWidth; m_runWidthSoFar += widthSinceLastRounding; widthSinceLastRounding = 0; } else widthSinceLastRounding += width; } if (glyphBuffer) glyphBuffer->add(glyph, fontData, (rtl ? 
oldWidth + lastRoundingWidth : width), currentCharacter); lastRoundingWidth = width - oldWidth; if (m_accountForGlyphBounds) { m_maxGlyphBoundingBoxY = std::max(m_maxGlyphBoundingBoxY, bounds.maxY()); m_minGlyphBoundingBoxY = std::min(m_minGlyphBoundingBoxY, bounds.y()); m_lastGlyphOverflow = std::max<float>(0, bounds.maxX() - width); } } if (shouldApplyFontTransforms()) m_runWidthSoFar += applyFontTransforms(glyphBuffer, m_run.ltr(), lastGlyphCount, lastFontData, *this, m_typesettingFeatures, charactersTreatedAsSpace); unsigned consumedCharacters = textIterator.currentCharacter() - m_currentCharacter; m_currentCharacter = textIterator.currentCharacter(); m_runWidthSoFar += widthSinceLastRounding; m_finalRoundingWidth = lastRoundingWidth; return consumedCharacters; }
/**
 * Explores the given dictionary to match the given word, tolerating a bounded
 * number of spelling errors.
 *
 * The function is recursive: it reads the dictionary state located at 'offset',
 * then tries each outgoing transition against word[pos_word].
 *
 * - If the state is final and the end of the word has been reached, the match
 *   is reported through deal_with_matches(). Exploration stops at the end of
 *   the word in any case.
 * - A transition is followed without error when its letter c is word[pos_word]
 *   or when word[pos_word] is u_toupper(c).
 * - Otherwise, provided that the error counters in 'cfg' have not reached their
 *   maximum, the following hypotheses are explored:
 *     SP_SWAP    two letters swapped (overrides the previously recorded change),
 *     SP_CHANGE  one letter replaced, tagged as default, diacritic, case or
 *                keyboard-neighbour change,
 *     SP_SUPPR   the transition letter is taken without consuming a letter of
 *                the word (pos_word is unchanged in the recursive call),
 *     SP_INSERT  a letter of the word is skipped without following a transition
 *                (same 'offset', pos_word+1), tried once after all transitions.
 *
 * Every hypothesis records a (position, kind) pair in cfg->pairs; the counters,
 * the number of pairs, 'inflected' and 'output' are restored after each
 * recursive call.
 *
 * Parameters:
 *   offset     position of the dictionary state to explore
 *   word       '\0'-terminated word to match
 *   pos_word   current position in 'word'
 *   d          the dictionary
 *   cfg        spell checking configuration: error limits and current counters
 *   output     output accumulated along the explored dictionary path
 *   list       where matches are reported (passed on to deal_with_matches)
 *   base       value passed on to deal_with_matches; set to output->len once a
 *              final state has been visited
 *   inflected  the dictionary form built so far, one letter per followed transition
 */
static void explore_dic(int offset,unichar* word,int pos_word,Dictionary* d,SpellCheckConfig* cfg, Ustring* output,SpellCheckHypothesis* *list,int base,Ustring* inflected) { int original_offset=offset; int original_base=base; int final,n_transitions,inf_code; int z=save_output(output); int size_pairs=cfg->pairs->nbelems; offset=read_dictionary_state(d,offset,&final,&n_transitions,&inf_code); if (final) { if (word[pos_word]=='\0') { /* If we have a match */ deal_with_matches(d,inflected->str,inf_code,output,cfg,base,list); } base=output->len; } /* If we are at the end of the token, then we stop */ if (word[pos_word]=='\0') { return; } unsigned int l2=inflected->len; unichar c; int dest_offset; for (int i=0;i<n_transitions;i++) { restore_output(z,output); offset=read_dictionary_transition(d,offset,&c,&dest_offset,output); /* For backup_output, see comment below */ int backup_output=save_output(output); if (c==word[pos_word] || word[pos_word]==u_toupper(c)) { u_strcat(inflected,c); explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected); } else { /* We deal with the SP_SWAP case, made of 2 SP_CHANGE_XXX */ if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_SWAP!=cfg->max_SP_SWAP && is_letter_swap(cfg,word,pos_word,inflected,c)) { /* We don't modify the number of errors since we override an existing * SP_CHANGE_XXX one */ cfg->current_SP_SWAP++; /* We override the previous change */ int a=cfg->pairs->tab[cfg->pairs->nbelems-2]; int b=cfg->pairs->tab[cfg->pairs->nbelems-1]; cfg->pairs->tab[cfg->pairs->nbelems-2]=pos_word-1; cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_SWAP_DEFAULT; u_strcat(inflected,c); explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected); cfg->pairs->tab[cfg->pairs->nbelems-2]=a; cfg->pairs->tab[cfg->pairs->nbelems-1]=b; cfg->current_SP_SWAP--; } else /* We deal with the SP_CHANGE case */ if (cfg->current_errors!=cfg->max_errors && 
cfg->current_SP_CHANGE!=cfg->max_SP_CHANGE /* We want letters, not spaces or anything else */ && is_letter(c,NULL) /* We do not allow the replacement of a lowercase letter by an uppercase * letter at the beginning of the word like Niserable, unless the whole word * is in uppercase or the letter is the same, module the case */ && (cfg->allow_uppercase_initial || pos_word>0 || (!is_upper(word[0],NULL) || is_upper(word[1],NULL) || word[0]==u_toupper(c)))) { cfg->current_errors++; cfg->current_SP_CHANGE++; /* Now we test all possible kinds of change */ vector_int_add(cfg->pairs,pos_word); u_strcat(inflected,c); /* We always add the default case */ vector_int_add(cfg->pairs,SP_CHANGE_DEFAULT); int n_elem=cfg->pairs->nbelems; explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected); /* Then we test the accent case */ if (u_deaccentuate(c)==u_deaccentuate(word[pos_word])) { /* After a call to explore_dic, we must restore the output. * But, when dealing with SP_CHANGE_XXX ops, we must restore the * output including the output associated to the current transition, * which is why we don't use z (output before the current transition) * but backup_output */ restore_output(backup_output,output); cfg->pairs->nbelems=n_elem; cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_CHANGE_DIACRITIC; explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected); } /* And the case variations */ if (u_tolower(c)==u_tolower(word[pos_word])) { restore_output(backup_output,output); cfg->pairs->nbelems=n_elem; cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_CHANGE_CASE; explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected); } /* And finally the position on keyboard */ if (areCloseOnKeyboard(c,word[pos_word],cfg->keyboard)) { restore_output(backup_output,output); cfg->pairs->nbelems=n_elem; cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_CHANGE_KEYBOARD; explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected); } cfg->pairs->nbelems=size_pairs; 
cfg->current_errors--; cfg->current_SP_CHANGE--; /* End of the SP_CHANGE case */ } } restore_output(backup_output,output); truncate(inflected,l2); /* Now we deal with the SP_SUPPR case */ if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_SUPPR!=cfg->max_SP_SUPPR /* We want letters, not spaces or anything else */ && is_letter(c,NULL)) { cfg->current_errors++; cfg->current_SP_SUPPR++; vector_int_add(cfg->pairs,pos_word); if (pos_word>=1 && c==word[pos_word-1]) { vector_int_add(cfg->pairs,SP_SUPPR_DOUBLE); } else { vector_int_add(cfg->pairs,SP_SUPPR_DEFAULT); } u_strcat(inflected,c); explore_dic(dest_offset,word,pos_word,d,cfg,output,list,original_base,inflected); truncate(inflected,l2); cfg->pairs->nbelems=size_pairs; cfg->current_errors--; cfg->current_SP_SUPPR--; } } restore_output(z,output); /* Finally, we deal with the SP_INSERT case, by calling again the current * function with the same parameters, except pos_word that will be increased of 1 */ if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_INSERT!=cfg->max_SP_INSERT /* We want letters, not spaces or anything else */ && is_letter(word[pos_word],NULL) /* We do not allow the insertion of a capital letter at the beginning of * the word like Astreet, unless the whole word is in uppercase like ASTREET */ && (cfg->allow_uppercase_initial || pos_word>0 || (!is_upper(word[0],NULL) || is_upper(word[1],NULL)))) { cfg->current_errors++; cfg->current_SP_INSERT++; vector_int_add(cfg->pairs,pos_word); if (pos_word>=1 && word[pos_word]==word[pos_word-1]) { vector_int_add(cfg->pairs,SP_INSERT_DOUBLE); } else { vector_int_add(cfg->pairs,SP_INSERT_DEFAULT); } explore_dic(original_offset,word,pos_word+1,d,cfg,output,list,original_base,inflected); truncate(inflected,l2); cfg->pairs->nbelems=size_pairs; cfg->current_errors--; cfg->current_SP_INSERT--; } /* Finally, we restore the output as it was when we enter the function */ restore_output(z,output); }
/** * Takes a given unicode string 'dest' and * replaces any lowercase letter by the set made of itself and * its uppercase equivalent, surrounded with square brackets if * the letter was not already between square brackets. * Examples: * * "For" => "F[oO][rR]" * "F[ao]r" => "F[aAoO][rR]" * * The output is stored in 'src'. The function assumes that 'src' is * wide enough. * * This function is used for morphological filter regular expressions. */ void replace_letter_by_letter_set(const Alphabet* a,unichar* dest,const unichar* src) { int i=0,j=0; char inside_a_set=0; while (src[i]!='\0') { switch (src[i]) { case '\\': if (src[i+1]=='\0') { // there is nothing after a backslash, then we stop, // and the RE compiler may indicate an error dest[j++] = src[i++]; dest[j] = src[i]; return; } if (is_lower(src[i+1],a)) { // this is a lowercase letter in Unitex alphabet : // we don't need "\" and we make expansion "[eE]" ++i; if (!inside_a_set) dest[j++]='['; dest[j++]=src[i]; if (a==NULL) { /* If there is no alphabet file, we just consider the unique * uppercase variant of the letter */ dest[j++]=u_toupper(src[i]); } else { unichar* tbrowse = NULL; int i_pos_in_array_of_string = a->pos_in_represent_list[src[i]]; if (i_pos_in_array_of_string != 0) tbrowse = a->t_array_collection[i_pos_in_array_of_string]; if (tbrowse != NULL) while ((*tbrowse) != '\0') { dest[j++]=*(tbrowse++); } } if (!inside_a_set) dest[j++]=']'; i++; } else { // others cases : // we keep the "\" and the letter dest[j++] = src[i++]; dest[j++] = src[i++]; } break; case '[': dest[j++]=src[i++]; inside_a_set=1; break; case ']': dest[j++]=src[i++]; inside_a_set=0; break; case '.': case '*': case '+': case '?': case '|': case '^': case '$': case ':': case '(': case ')': case '{': case '}': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': dest[j++]=src[i++]; break; default: if (is_lower(src[i],a)) { if (!inside_a_set) dest[j++]='['; dest[j++]=src[i]; if (inside_a_set && 
src[i+1]=='-') { /* Special case: * if we had [a-d], we don't want to turn it into * [aA-dD], but rather into [a-dA-D]. In such a case, * we just use u_toupper */ i=i+2; dest[j++]='-'; dest[j++]=src[i++]; dest[j++]=u_toupper(dest[i-3]); dest[j++]='-'; dest[j++]=u_toupper(src[i-1]); continue; } if (a==NULL) { /* If there is no alphabet file, we just consider the unique * uppercase variant of the letter */ dest[j++]=u_toupper(src[i]); } else { /* If there is an alphabet file, we use it */ unichar* tbrowse = NULL; int i_pos_in_array_of_string = a->pos_in_represent_list[src[i]]; if (i_pos_in_array_of_string != 0) { tbrowse = a->t_array_collection[i_pos_in_array_of_string]; } if (tbrowse != NULL) { while ((*tbrowse) != '\0') { dest[j++]=*(tbrowse++); } } } if (!inside_a_set) dest[j++]=']'; i++; } else { /* Not a lower case letter */ dest[j++]=src[i++]; } } } dest[j]='\0'; }
// Returns the result of u_toupper() for the code point 'c'.
// A temporary BUnicodeChar is constructed before the conversion.
// NOTE(review): presumably the constructor is invoked for an initialization
// side effect -- confirm against the class definition.
uint32
BUnicodeChar::ToUpper(uint32 c)
{
	BUnicodeChar();

	uint32 upperCased = u_toupper(c);
	return upperCased;
}
// Native implementation of the upper-case conversion for a single code point:
// forwards 'codePoint' to u_toupper() and returns its result. The JNIEnv and
// jclass arguments are not used.
static jint Character_toUpperCaseImpl(JNIEnv*, jclass, jint codePoint) {
    const jint upperCased = u_toupper(codePoint);
    return upperCased;
}
//static jint Character_toUpperCaseImpl(JNIEnv*, jclass, jint codePoint) { JNIEXPORT jint JNICALL Java_java_lang_Character_toUpperCaseImpl(JNIEnv*, jclass, jint codePoint) { return u_toupper(codePoint); }
// Helper sets the character attribute properties and sets up the script table. // Does not set tops and bottoms. void SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET* unicharset) { for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) { // Convert any custom ligatures. const char* unichar_str = unicharset->id_to_unichar(unichar_id); for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr; ++i) { if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) { unichar_str = UNICHARSET::kCustomLigatures[i][0]; break; } } // Convert the unichar to UTF32 representation std::vector<char32> uni_vector = UNICHAR::UTF8ToUTF32(unichar_str); // Assume that if the property is true for any character in the string, // then it holds for the whole "character". bool unichar_isalpha = false; bool unichar_islower = false; bool unichar_isupper = false; bool unichar_isdigit = false; bool unichar_ispunct = false; for (char32 u_ch : uni_vector) { if (u_isalpha(u_ch)) unichar_isalpha = true; if (u_islower(u_ch)) unichar_islower = true; if (u_isupper(u_ch)) unichar_isupper = true; if (u_isdigit(u_ch)) unichar_isdigit = true; if (u_ispunct(u_ch)) unichar_ispunct = true; } unicharset->set_isalpha(unichar_id, unichar_isalpha); unicharset->set_islower(unichar_id, unichar_islower); unicharset->set_isupper(unichar_id, unichar_isupper); unicharset->set_isdigit(unichar_id, unichar_isdigit); unicharset->set_ispunctuation(unichar_id, unichar_ispunct); tesseract::IcuErrorCode err; unicharset->set_script(unichar_id, uscript_getName( uscript_getScript(uni_vector[0], err))); const int num_code_points = uni_vector.size(); // Obtain the lower/upper case if needed and record it in the properties. unicharset->set_other_case(unichar_id, unichar_id); if (unichar_islower || unichar_isupper) { std::vector<char32> other_case(num_code_points, 0); for (int i = 0; i < num_code_points; ++i) { // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used. 
// However since they deal with UChars (so need a conversion function // from char32 or UTF8string) and require a meaningful locale string, // for now u_tolower()/u_toupper() are used. other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) : u_tolower(uni_vector[i]); } std::string other_case_uch = UNICHAR::UTF32ToUTF8(other_case); UNICHAR_ID other_case_id = unicharset->unichar_to_id(other_case_uch.c_str()); if (other_case_id != INVALID_UNICHAR_ID) { unicharset->set_other_case(unichar_id, other_case_id); } else if (unichar_id >= SPECIAL_UNICHAR_CODES_COUNT && report_errors) { tprintf("Other case %s of %s is not in unicharset\n", other_case_uch.c_str(), unichar_str); } } // Set RTL property and obtain mirror unichar ID from ICU. std::vector<char32> mirrors(num_code_points, 0); for (int i = 0; i < num_code_points; ++i) { mirrors[i] = u_charMirror(uni_vector[i]); if (i == 0) { // set directionality to that of the 1st code point unicharset->set_direction(unichar_id, static_cast<UNICHARSET::Direction>( u_charDirection(uni_vector[i]))); } } std::string mirror_uch = UNICHAR::UTF32ToUTF8(mirrors); UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str()); if (mirror_uch_id != INVALID_UNICHAR_ID) { unicharset->set_mirror(unichar_id, mirror_uch_id); } else if (report_errors) { tprintf("Mirror %s of %s is not in unicharset\n", mirror_uch.c_str(), unichar_str); } // Record normalized version of this unichar. std::string normed_str; if (unichar_id != 0 && tesseract::NormalizeUTF8String( decompose ? tesseract::UnicodeNormMode::kNFKD : tesseract::UnicodeNormMode::kNFKC, tesseract::OCRNorm::kNormalize, tesseract::GraphemeNorm::kNone, unichar_str, &normed_str) && !normed_str.empty()) { unicharset->set_normed(unichar_id, normed_str.c_str()); } else { unicharset->set_normed(unichar_id, unichar_str); } ASSERT_HOST(unicharset->get_other_case(unichar_id) < unicharset->size()); } unicharset->post_load_setup(); }
// // this function explores the dictionary to decompose the word mot // void explore_state_german(int adresse,unichar* current_component,int pos_in_current_component, const unichar* original_word,int pos_in_original_word,const unichar* decomposition, unichar* dela_line,struct german_word_decomposition_list** L,int n_decomp, const char* left,const char* right, const struct INF_codes* inf_codes,const Alphabet* alphabet, const unsigned char* tableau_bin) { int c; int index,t; c=tableau_bin[adresse]*256+tableau_bin[adresse+1]; if (!(c&32768)) { // if we are in a terminal state index=tableau_bin[adresse+2]*256*256+tableau_bin[adresse+3]*256+tableau_bin[adresse+4]; current_component[pos_in_current_component]='\0'; if (pos_in_current_component>1) { // we don't consider words with a length of 1 if (original_word[pos_in_original_word]=='\0') { // if we have explored the entire original word if (right[index]) { // and if we have a valid right component struct list_ustring* l=inf_codes->codes[index]; while (l!=NULL) { unichar dec[500]; u_strcpy(dec,decomposition); if (dec[0]!='\0') {u_strcat(dec," +++ ");} unichar entry[500]; uncompress_entry(current_component,l->string,entry); u_strcat(dec,entry); unichar new_dela_line[500]; struct dela_entry* tmp_entry=tokenize_DELAF_line(entry,1); if (tmp_entry==NULL) { /* If there was an error in the dictionary, we skip the entry */ l=l->next; continue; } // change case if there is a prefix // prefixes are downcase, nouns (=suffixes) uppercase: // "investitionsObjekte" -> "Investitionsobjekte" if ( u_strlen(dela_line) != 0 ) { // capitalize dela_line dela_line[0] = u_toupper((unichar) dela_line[0]); // downcase lemma and inflected tmp_entry->inflected[0] = u_tolower(tmp_entry->inflected[0]); tmp_entry->lemma[0] = u_tolower(tmp_entry->lemma[0]); } u_strcpy(new_dela_line,dela_line); u_strcat(new_dela_line,tmp_entry->inflected); u_strcat(new_dela_line,","); u_strcat(new_dela_line,dela_line); u_strcat(new_dela_line,tmp_entry->lemma); 
u_strcat(new_dela_line,"."); u_strcat(new_dela_line,tmp_entry->semantic_codes[0]); int k; for (k=1;k<tmp_entry->n_semantic_codes;k++) { u_strcat(new_dela_line,"+"); u_strcat(new_dela_line,tmp_entry->semantic_codes[k]); } for (k=0;k<tmp_entry->n_inflectional_codes;k++) { u_strcat(new_dela_line,":"); u_strcat(new_dela_line,tmp_entry->inflectional_codes[k]); } free_dela_entry(tmp_entry); struct german_word_decomposition* wd=new_german_word_decomposition(); wd->n_parts=n_decomp; u_strcpy(wd->decomposition,dec); u_strcpy(wd->dela_line,new_dela_line); if (check_valid_right_component_for_one_INF_code_german(l->string)) { // if we got a correct right component (N-FF) struct german_word_decomposition_list* wdl=new_german_word_decomposition_list(); wdl->element=wd; wdl->suivant=(*L); (*L)=wdl; } else { free_german_word_decomposition(wd); } l=l->next; } } } else { // else, we must explore the rest of the original word if (left[index]) { // but only if the current component was a valid left one // we go on with the next component unichar dec[2000]; unichar line[500]; u_strcpy(dec,decomposition); if (dec[0]!='\0') {u_strcat(dec," +++ ");} unichar sia_code[500]; unichar entry[500]; get_first_sia_code_german(index,sia_code,inf_codes); uncompress_entry(current_component,sia_code,entry); u_strcat(dec,entry); u_strcpy(line,dela_line); u_strcat(line,current_component); unichar temp[500]; explore_state_german(4,temp,0,original_word,pos_in_original_word, dec,line,L,n_decomp+1,left,right,inf_codes,alphabet,tableau_bin); } } } t=adresse+5; } else { c=c-32768; t=adresse+2; } if (original_word[pos_in_original_word]=='\0') { // if we have finished, we return return; } // if not, we go on with the next letter for (int i=0;i<c;i++) { if (is_equal_or_uppercase((unichar)(tableau_bin[t]*256+tableau_bin[t+1]),original_word[pos_in_original_word],alphabet) || is_equal_or_uppercase(original_word[pos_in_original_word],(unichar)(tableau_bin[t]*256+tableau_bin[t+1]),alphabet)) { 
index=tableau_bin[t+2]*256*256+tableau_bin[t+3]*256+tableau_bin[t+4]; current_component[pos_in_current_component]=(unichar)(tableau_bin[t]*256+tableau_bin[t+1]); explore_state_german(index,current_component,pos_in_current_component+1,original_word,pos_in_original_word+1, decomposition,dela_line,L,n_decomp,left,right,inf_codes,alphabet,tableau_bin); } t=t+5; } }
void UniscribeController::advance(unsigned offset, GlyphBuffer* glyphBuffer) { // FIXME: We really want to be using a newer version of Uniscribe that supports the new OpenType // functions. Those functions would allow us to turn off kerning and ligatures. Without being able // to do that, we will have buggy line breaking and metrics when simple and complex text are close // together (the complex code path will narrow the text because of kerning and ligatures and then // when bidi processing splits into multiple runs, the simple portions will get wider and cause us to // spill off the edge of a line). if (static_cast<int>(offset) > m_end) offset = m_end; int length = offset - m_currentCharacter; if (length <= 0) return; // Itemize the string. const UChar* cp = m_run.data(m_currentCharacter); unsigned baseCharacter = m_currentCharacter; // We break up itemization of the string by fontData and (if needed) the use of small caps. // FIXME: It's inconsistent that we use logical order when itemizing, since this // does not match normal RTL. // FIXME: This function should decode surrogate pairs. Currently it makes little difference that // it does not because the font cache on Windows does not support non-BMP characters. Vector<UChar, 256> smallCapsBuffer; if (m_font.isSmallCaps()) smallCapsBuffer.resize(length); unsigned indexOfFontTransition = m_run.rtl() ? length - 1 : 0; const UChar* curr = m_run.rtl() ? cp + length - 1 : cp; const UChar* end = m_run.rtl() ? cp - 1 : cp + length; const SimpleFontData* fontData; const SimpleFontData* nextFontData = m_font.glyphDataForCharacter(*curr, false).fontData; UChar newC = 0; bool isSmallCaps; bool nextIsSmallCaps = m_font.isSmallCaps() && !(U_GET_GC_MASK(*curr) & U_GC_M_MASK) && (newC = u_toupper(*curr)) != *curr; if (nextIsSmallCaps) smallCapsBuffer[curr - cp] = newC; while (true) { curr = m_run.rtl() ? 
curr - 1 : curr + 1; if (curr == end) break; fontData = nextFontData; isSmallCaps = nextIsSmallCaps; int index = curr - cp; UChar c = *curr; bool forceSmallCaps = isSmallCaps && (U_GET_GC_MASK(c) & U_GC_M_MASK); nextFontData = m_font.glyphDataForCharacter(*curr, false, forceSmallCaps ? SmallCapsVariant : AutoVariant).fontData; if (m_font.isSmallCaps()) { nextIsSmallCaps = forceSmallCaps || (newC = u_toupper(c)) != c; if (nextIsSmallCaps) smallCapsBuffer[index] = forceSmallCaps ? c : newC; } if (m_fallbackFonts && nextFontData != fontData && fontData != m_font.primaryFont()) m_fallbackFonts->add(fontData); if (nextFontData != fontData || nextIsSmallCaps != isSmallCaps) { int itemStart = m_run.rtl() ? index + 1 : indexOfFontTransition; int itemLength = m_run.rtl() ? indexOfFontTransition - index : index - indexOfFontTransition; m_currentCharacter = baseCharacter + itemStart; itemizeShapeAndPlace((isSmallCaps ? smallCapsBuffer.data() : cp) + itemStart, itemLength, fontData, glyphBuffer); indexOfFontTransition = index; } } int itemLength = m_run.rtl() ? indexOfFontTransition + 1 : length - indexOfFontTransition; if (itemLength) { if (m_fallbackFonts && nextFontData != m_font.primaryFont()) m_fallbackFonts->add(nextFontData); int itemStart = m_run.rtl() ? 0 : indexOfFontTransition; m_currentCharacter = baseCharacter + itemStart; itemizeShapeAndPlace((nextIsSmallCaps ? smallCapsBuffer.data() : cp) + itemStart, itemLength, nextFontData, glyphBuffer); } m_currentCharacter = baseCharacter + length; }
// Upper-case conversion for a single code point: forwards 'codePoint' to
// u_toupper() and returns its result.
jint fastiva_vm_Character_C$__toUpperCaseImpl(jint codePoint) {
    const jint upperCased = u_toupper(codePoint);
    return upperCased;
}