Ejemplo n.º 1
0
Archivo: utf.cpp Proyecto: MGKhKhD/meta
/**
 * Builds an upper-cased version of str by handing transform() a callback
 * that maps a single code point through ICU's u_toupper.
 * NOTE(review): presumably transform() decodes str, applies the callback to
 * every code point and re-encodes the result -- confirm against its definition.
 */
std::string toupper(const std::string& str)
{
    auto upper_case = [](uint32_t cp)
    {
        return u_toupper(static_cast<UChar32>(cp));
    };
    return transform(str, upper_case);
}
Ejemplo n.º 2
0
// Swaps the case of every code point of `input`: lowercase code points are
// mapped through u_toupper, uppercase ones through u_tolower, everything else
// is left untouched (simple one-to-one ICU mappings only).
//
// self:  unused.
// input: Python object converted to UTF-16 by python_to_icu().
//
// Returns the object built by icu_to_python(), or NULL on failure. A
// MemoryError is set when a buffer cannot be allocated and a ValueError
// (carrying the ICU error name) when a UTF-16 <-> UTF-32 conversion fails.
// NOTE(review): when python_to_icu() itself returns NULL this function sets no
// exception of its own -- presumably the helper already did; confirm.
static PyObject* icu_swap_case(PyObject *self, PyObject *input) {
    PyObject *result = NULL;
    UErrorCode status = U_ZERO_ERROR;
    UChar *input_buf = NULL, *output_buf = NULL;
    UChar32 *buf = NULL;
    int32_t sz = 0, sz32 = 0, i = 0;

    input_buf = python_to_icu(input, &sz);
    if (input_buf == NULL) goto end;

    // One spare element is allocated so the request is never zero-sized:
    // calloc(0, n) is allowed to return NULL, which used to be reported as a
    // MemoryError for an empty input. The capacities handed to ICU below are
    // unchanged.
    output_buf = (UChar*) calloc(3 * (size_t)sz + 1, sizeof(UChar));
    buf = (UChar32*) calloc(2 * (size_t)sz + 1, sizeof(UChar32));
    if (output_buf == NULL || buf == NULL) { PyErr_NoMemory(); goto end; }

    u_strToUTF32(buf, 2 * sz, &sz32, input_buf, sz, &status);
    // Stop right away on a failed conversion instead of walking the buffer
    // first; the reported error is the same one the later check would raise.
    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); goto end; }

    for (i = 0; i < sz32; i++) {
        if (u_islower(buf[i])) buf[i] = u_toupper(buf[i]);
        else if (u_isupper(buf[i])) buf[i] = u_tolower(buf[i]);
    }
    u_strFromUTF32(output_buf, 3*sz, &sz, buf, sz32, &status);
    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); goto end; }
    result = icu_to_python(output_buf, sz);

end:
    // Single exit point: release whatever was allocated on every path.
    if (input_buf != NULL) free(input_buf);
    if (output_buf != NULL) free(output_buf);
    if (buf != NULL) free(buf);
    return result;

} // }}}
Ejemplo n.º 3
0
/**
 * Tells whether 'c' counts as an uppercase letter.
 *
 * With an alphabet, the answer is whatever IS_UPPER_MACRO yields for it.
 * Without one (alphabet==NULL), 'c' is uppercase when u_is_letter accepts
 * it and u_toupper leaves it unchanged; the result is then 1 or 0.
 */
int is_upper(unichar c,const Alphabet* alphabet) {
if (alphabet!=NULL) {
    return IS_UPPER_MACRO(c,alphabet);
}
/* No alphabet: fall back on the generic letter test */
if (u_is_letter(c) == 0) {
    return 0;
}
if (c == u_toupper(c)) {
    return 1;
}
return 0;
}
Ejemplo n.º 4
0
// ---------------------------------------------------------------------------
//  RangeToken: Getter methods
// ---------------------------------------------------------------------------
// Returns the case-insensitive companion of this range token, building and
// caching it in fCaseIToken on first use.
//
// tokFactory: factory used to create the new token. When it is null and no
//             token has been cached yet, nothing is built and the current
//             (null) fCaseIToken is returned.
//
// The new token receives, for every character of every [start, end] pair in
// fRanges, its case variants, and is then merged with this token's own ranges,
// compacted, and given its lookup map.
RangeToken* RangeToken::getCaseInsensitiveToken(TokenFactory* const tokFactory) {

    if (fCaseIToken == 0 && tokFactory) {

        bool isNRange = (getTokenType() == T_NRANGE) ? true : false;
        RangeToken* lwrToken = tokFactory->createRange(isNRange);

        // Written as "i + 1 < fElemCount" so that an empty range list cannot
        // wrap around in unsigned arithmetic the way "fElemCount - 1" does.
        for (unsigned int i = 0;  i + 1 < fElemCount;  i += 2) {
            for (XMLInt32 ch = fRanges[i];  ch <= fRanges[i + 1];  ++ch) {
#if defined(XML_USE_ICU_TRANSCODER) || defined (XML_USE_UNICONV390_TRANSCODER)
                const XMLInt32  upperCh = u_toupper(ch);

                if (upperCh != ch)
                {
                    lwrToken->addRange(upperCh, upperCh);
                }

                const XMLInt32  lowerCh = u_tolower(ch);

                if (lowerCh != ch)
                {
                    lwrToken->addRange(lowerCh, lowerCh);
                }

                const XMLInt32  titleCh = u_totitle(ch);

                if (titleCh != ch && titleCh != upperCh)
                {
                    lwrToken->addRange(titleCh, titleCh);
                }
#else
                // Only ASCII letters are handled here. The swapped-case value
                // goes into its own variable: overwriting the loop variable
                // made the iteration jump between the two letter blocks, so
                // most characters of a range such as A-Z were never visited.
                if (ch >= chLatin_A && ch <= chLatin_Z)
                {
                    const XMLInt32  lowerCh = ch + (chLatin_a - chLatin_A);

                    lwrToken->addRange(lowerCh, lowerCh);
                }
                else if (ch >= chLatin_a && ch <= chLatin_z)
                {
                    const XMLInt32  upperCh = ch - (chLatin_a - chLatin_A);

                    lwrToken->addRange(upperCh, upperCh);
                }
#endif
            }
        }

        lwrToken->mergeRanges(this);
        lwrToken->compactRanges();
        lwrToken->createMap();

        fCaseIToken = lwrToken;
    }

    return fCaseIToken;
}
Ejemplo n.º 5
0
/*
Static Function:
AreEqualOrdinalIgnoreCase

Reports whether two code points are the same, or become the same once both
are upper-cased with u_toupper.
*/
static bool AreEqualOrdinalIgnoreCase(UChar32 one, UChar32 two)
{
    // LATIN SMALL LETTER DOTLESS I
    const UChar32 dotlessI = 0x0131;

    if (one != two)
    {
        // Windows' InvariantCulture upper-cases U+0131 to itself while ICU maps
        // it to LATIN CAPITAL LETTER I (U+0049). To match the Windows invariant
        // behavior, a dotless i never equals a different code point.
        if (one == dotlessI || two == dotlessI)
        {
            return false;
        }

        return u_toupper(one) == u_toupper(two);
    }

    return true;
}
Ejemplo n.º 6
0
// Returns the upper-case form of this character.
// With APPLE_CHANGES the mapping is delegated to ICU's u_toupper; otherwise
// only code units below 256 that are not already upper case are mapped, using
// the C library's toupper, and everything else is returned unchanged.
UChar UChar::toUpper() const
{
#if !APPLE_CHANGES
  const bool needsMapping = uc < 256 && !isupper(uc);
  if (needsMapping)
    return static_cast<unsigned char>(toupper(uc));

  return *this;
#else
  return static_cast<unsigned short>(u_toupper(uc));
#endif
}
Ejemplo n.º 7
0
/**
 * Tells whether 'upper' is an uppercase equivalent of 'lower'.
 *
 * Without an alphabet (NULL), 'upper' must be exactly u_toupper(lower).
 * With an alphabet, 'lower' selects a '\0'-terminated list of characters
 * through pos_in_represent_list (slot 0 meaning "no list"), and 'upper'
 * must appear in that list. Returns 1 on success, 0 otherwise.
 */
int is_upper_of(unichar lower,unichar upper,const Alphabet* alphabet) {
if (alphabet==NULL) {
   return (upper==u_toupper(lower)) ? 1 : 0;
}
const int slot = alphabet->pos_in_represent_list[lower];
if (slot == 0) {
   return 0;
}
/* Linear scan of the equivalents registered for 'lower' */
for (int k=0; alphabet->t_array_collection[slot][k]!='\0'; k++) {
   if (alphabet->t_array_collection[slot][k]==upper) {
      return 1;
   }
}
return 0;
}
Ejemplo n.º 8
0
extern "C" int32_t
CompareStringOrdinalIgnoreCase(const UChar* lpStr1, int32_t cwStr1Length, const UChar* lpStr2, int32_t cwStr2Length)
{
    assert(lpStr1 != nullptr);
    assert(cwStr1Length >= 0);
    assert(lpStr2 != nullptr);
    assert(cwStr2Length >= 0);

    int32_t str1Idx = 0;
    int32_t str2Idx = 0;

    while (str1Idx < cwStr1Length && str2Idx < cwStr2Length)
    {
        UChar32 str1Codepoint;
        UChar32 str2Codepoint;

        U16_NEXT(lpStr1, str1Idx, cwStr1Length, str1Codepoint);
        U16_NEXT(lpStr2, str2Idx, cwStr2Length, str2Codepoint);

        if (str1Codepoint != str2Codepoint && u_toupper(str1Codepoint) != u_toupper(str2Codepoint))
        {
            return str1Codepoint < str2Codepoint ? -1 : 1;
        }
    }

    if (cwStr1Length < cwStr2Length)
    {
        return -1;
    }

    if (cwStr2Length < cwStr1Length)
    {
        return 1;
    }

    return 0;
}
Ejemplo n.º 9
0
// Returns the form of `name` used for case-normalized file path handling.
// On non-MSW builds the name is returned unchanged; on MSW builds every
// character is upper-cased individually with ICU's u_toupper.
wxString FilePathNormalCase(const wxString& name)
{
#ifdef __WXMSW__
	// wxString::Upper is buggy under Windows
	// and the filename insensitive of Windows is also buggy
	// but they are different
	wxString uppername;
	BOOST_FOREACH(wxChar ch, name)
	{
		const UChar32 codePoint = static_cast<UChar32>(static_cast<unsigned int>(ch));
		const wxChar upperCh = static_cast<wxChar>(u_toupper(codePoint));
		uppername.append(1, upperCh);
	}

	return uppername;
#else
	return name;
#endif
}
/**
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-07)
 */
void StriContainerByteSearch::upgradePatternCaseInsensitive()
{
   UChar32 c = 0;
   R_len_t j = 0;
   patternLenCaseInsensitive = 0;
   while (j < patternLen) {
      U8_NEXT(patternStr, j, patternLen, c);
#ifndef NDEBUG
      if (patternLenCaseInsensitive >= this->kmpMaxSize)
         throw StriException("!NDEBUG: StriContainerByteSearch::upgradePatternCaseInsensitive()");
#endif
      patternStrCaseInsensitive[patternLenCaseInsensitive++] = u_toupper(c);
   }
   patternStrCaseInsensitive[patternLenCaseInsensitive] = 0;
}
Ejemplo n.º 11
0
	// Upper-cases a UTF-16 string in place: each code point is decoded from
	// the buffer, mapped through u_toupper and written back at the write
	// position, which trails (or equals) the read position.
	// NOTE(review): _str is taken by value; whether the caller observes the
	// change depends on how StringHolder shares its storage -- confirm.
	void ICUUnicodeSupport::_toUpperCase<2>(StringHolder<2> _str)
	{
		if(_str.empty())
			return;

		uint16_t* units = &_str[0];
		const int32_t unitCount = _str.length();
		int32_t readPos = 0;
		int32_t writePos = 0;
		while(readPos != unitCount)
		{
			UChar32 codePoint;
			U16_NEXT(units, readPos, unitCount, codePoint);
			const UChar32 upper = u_toupper(codePoint);
			U16_APPEND_UNSAFE(units, writePos, upper);
		}
	}
/** find last match - KMP
 *
 * Scans searchStr backwards from startPos, matching the pattern from its
 * last element towards its first. On success searchPos/searchEnd delimit
 * the match (in bytes); on failure both are set to searchLen.
 *
 * @param startPos where to start
 * @return USEARCH_DONE on no match, otherwise start index
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-11)
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-07)
 *    use BYTESEARCH_CASE_INSENSITIVE
 */
R_len_t StriContainerByteSearch::findFromPosBack_KMP(R_len_t startPos)
{
   int cursor = startPos;
   patternPos = 0;

   if (!(flags&BYTESEARCH_CASE_INSENSITIVE)) {
      // byte-wise comparison
      while (cursor > 0) {
         --cursor;
         while (patternPos >= 0 && patternStr[patternLen-1-patternPos] != searchStr[cursor])
            patternPos = kmpNext[patternPos];
         ++patternPos;
         if (patternPos != patternLen)
            continue;

         searchPos = cursor;
         searchEnd = cursor+patternLen;
         return searchPos;
      }
   }
   else {
      // code point-wise comparison on upper-cased code points
      while (cursor > 0) {
         UChar32 codePoint;
         U8_PREV(searchStr, 0, cursor, codePoint); // moves cursor back
         const UChar32 upper = u_toupper(codePoint);
         while (patternPos >= 0 &&
               patternStrCaseInsensitive[patternLenCaseInsensitive-1-patternPos] != upper)
            patternPos = kmpNext[patternPos];
         ++patternPos;
         if (patternPos != patternLenCaseInsensitive)
            continue;

         searchPos = cursor;

         // the end of the match lies patternLenCaseInsensitive
         // code points after its start
         searchEnd = cursor;
         for (R_len_t left = patternLenCaseInsensitive; left > 0; --left) {
            U8_FWD_1((const uint8_t*)searchStr, searchEnd, searchLen);
         }

         return searchPos;
      }
   }

   // nothing found
   searchPos = searchEnd = searchLen;
   return USEARCH_DONE;
}
/** Check whether the pattern occurs immediately before a given position
 *
 * In case-insensitive mode, code points are read backwards from byteindex
 * and their u_toupper mappings are compared with the prepared pattern;
 * otherwise raw bytes are compared.
 *
 * @param byteindex byte offset in searchStr just past the candidate match
 * @return true if the text ending at byteindex matches the whole pattern;
 *    false otherwise, including when there is not enough text before
 *    byteindex to hold the pattern
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-07)
 */
bool StriContainerByteSearch::endsWith(R_len_t byteindex)
{
   if (flags&BYTESEARCH_CASE_INSENSITIVE) {
      for (R_len_t k = 0; k < patternLenCaseInsensitive; ++k) {
         // U8_PREV pre-decrements its index unconditionally, so it must
         // not be called once the start of the string has been reached
         if (byteindex <= 0)
            return false;
         UChar32 c;
         U8_PREV(searchStr, 0, byteindex, c);
         c = u_toupper(c);
         if (patternStrCaseInsensitive[patternLenCaseInsensitive-k-1] != c)
            return false;
      }
   }
   else {
      // fewer than patternLen bytes available: the loop below would
      // index searchStr with a negative offset
      if (byteindex < patternLen)
         return false;
      for (R_len_t k=0; k < patternLen; ++k)
         if (searchStr[byteindex-k-1] != patternStr[patternLen-k-1])
            return false;
   }

   return true; // found
}
/** find first match - KMP
 *
 * Scans searchStr forwards from startPos. On success searchPos/searchEnd
 * delimit the match (in bytes); on failure both are set to searchLen.
 *
 * @param startPos where to start
 * @return USEARCH_DONE on no match, otherwise start index
 *
 * @version 0.1-?? (Bartek Tartanus, 2013-08-15)
 *          KMP - first approach
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-11)
 *          KMP upgraded; separate method
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-07)
 *    use BYTESEARCH_CASE_INSENSITIVE
 */
R_len_t StriContainerByteSearch::findFromPosFwd_KMP(R_len_t startPos)
{
   int cursor = startPos;
   patternPos = 0;

   if (!(flags&BYTESEARCH_CASE_INSENSITIVE)) {
      // byte-wise comparison
      while (cursor < searchLen) {
         while (patternPos >= 0 && patternStr[patternPos] != searchStr[cursor])
            patternPos = kmpNext[patternPos];
         ++patternPos;
         ++cursor;
         if (patternPos != patternLen)
            continue;

         searchEnd = cursor;
         searchPos = cursor-patternLen;
         return searchPos;
      }
   }
   else {
      // code point-wise comparison on upper-cased code points
      while (cursor < searchLen) {
         UChar32 codePoint = 0;
         U8_NEXT(searchStr, cursor, searchLen, codePoint); // advances cursor
         const UChar32 upper = u_toupper(codePoint);
         while (patternPos >= 0 && patternStrCaseInsensitive[patternPos] != upper)
            patternPos = kmpNext[patternPos];
         ++patternPos;
         if (patternPos != patternLenCaseInsensitive)
            continue;

         searchEnd = cursor;

         // the start of the match lies patternLenCaseInsensitive
         // code points before its end
         searchPos = cursor;
         for (R_len_t left = patternLenCaseInsensitive; left > 0; --left) {
            U8_BACK_1((const uint8_t*)searchStr, 0, searchPos);
         }
         return searchPos;
      }
   }

   // nothing found
   searchPos = searchEnd = searchLen;
   return USEARCH_DONE;
}
/** Check whether the pattern occurs at a given position
 *
 * In case-insensitive mode, code points are read forwards from byteindex
 * and their u_toupper mappings are compared with the prepared pattern;
 * otherwise raw bytes are compared.
 *
 * @param byteindex byte offset in searchStr where the candidate match starts
 * @return true if the text starting at byteindex matches the whole pattern;
 *    false otherwise, including when the text between byteindex and
 *    searchLen is too short to hold the pattern
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-07)
 */
bool StriContainerByteSearch::startsWith(R_len_t byteindex)
{
   if (flags&BYTESEARCH_CASE_INSENSITIVE) {
      for (R_len_t k = 0; k < patternLenCaseInsensitive; ++k) {
         // U8_NEXT reads searchStr[byteindex] unconditionally, so stop
         // here instead of relying on whatever follows the string
         if (byteindex >= searchLen)
            return false;
         UChar32 c;
         U8_NEXT(searchStr, byteindex, searchLen, c);
         c = u_toupper(c);
         if (patternStrCaseInsensitive[k] != c)
            return false;
      }
   }
   else {
      // not enough bytes left: the loop below would read past searchLen
      if (patternLen > searchLen - byteindex)
         return false;
      for (R_len_t k=0; k < patternLen; ++k)
         if (searchStr[byteindex+k] != patternStr[k])
            return false;
   }

   return true; // found
}
Ejemplo n.º 16
0
/*
Function:
ChangeCase

Performs upper or lower casing of a string into a new buffer.
No special casing is performed beyond that provided by ICU.
*/
extern "C" void ChangeCase(const UChar* lpSrc,
                           int32_t cwSrcLength,
                           UChar* lpDst,
                           int32_t cwDstLength,
                           int32_t bToUpper)
{
	// Iterate through the string, decoding the next one or two UTF-16 code units
	// into a codepoint and updating srcIdx to point to the next UTF-16 code unit 
	// to decode.  Then upper or lower case it, write dstCodepoint into lpDst at 
	// offset dstIdx, and update dstIdx.

	// (The loop here has been manually cloned for each of the four cases, rather
	// than having a single loop that internally branched based on bToUpper as the 
	// compiler wasn't doing that optimization, and it results in an ~15-20% perf
	// improvement on longer strings.)

	UBool isError = FALSE;
	int32_t srcIdx = 0, dstIdx = 0;
	UChar32 srcCodepoint, dstCodepoint;

	if (bToUpper)
	{
		while (srcIdx < cwSrcLength)
		{
			U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint);
			dstCodepoint = u_toupper(srcCodepoint);
			U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
			assert(isError == FALSE && srcIdx == dstIdx);
		}
	}
	else
	{
		while (srcIdx < cwSrcLength)
		{
			U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint);
			dstCodepoint = u_tolower(srcCodepoint);
			U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
			assert(isError == FALSE && srcIdx == dstIdx);
		}
	}
}
Ejemplo n.º 17
0
/*
 * Prints a few Unicode properties of codePoint to stdout: its name, general
 * category, lowercase flag and simple uppercase mapping, digit information,
 * and BiDi directional category.
 */
static void
printProps(UChar32 codePoint) {
    char buffer[100];
    UErrorCode errorCode;

    /* get the character name */
    errorCode=U_ZERO_ERROR;
    buffer[0]=0;
    u_charName(codePoint, U_UNICODE_CHAR_NAME, buffer, (int32_t)sizeof(buffer), &errorCode);
    if(U_FAILURE(errorCode)) {
        /* do not print a buffer that may not hold a terminated name */
        buffer[0]=0;
    }

    /* print the code point and the character name */
    /* UChar32 is a 32-bit int: cast explicitly to the type that %lx expects */
    printf("U+%04lx\t%s\n", (unsigned long)codePoint, buffer);

    /* print some properties */
    printf("  general category (numeric enum value): %u\n", (unsigned)u_charType(codePoint));

    /* note: these APIs do not provide the data from SpecialCasing.txt */
    printf("  is lowercase: %d  uppercase: U+%04lx\n", u_islower(codePoint), (unsigned long)u_toupper(codePoint));

    printf("  is digit: %d  decimal digit value: %d\n", u_isdigit(codePoint), u_charDigitValue(codePoint));

    printf("  BiDi directional category (numeric enum value): %u\n", (unsigned)u_charDirection(codePoint));
}
Ejemplo n.º 18
0
/*
Function:
ChangeCaseInvariant

Performs upper or lower casing of a string into a new buffer.
Special casing is performed to ensure that invariant casing 
matches that of Windows in certain situations, e.g. Turkish i's.
*/
extern "C" void ChangeCaseInvariant(const UChar* lpSrc,
                                    int32_t cwSrcLength,
                                    UChar* lpDst,
                                    int32_t cwDstLength,
                                    int32_t bToUpper)
{
	// See algorithmic comment in ChangeCase.

	UBool isError = FALSE;
	int32_t srcIdx = 0, dstIdx = 0;
	UChar32 srcCodepoint, dstCodepoint;

	if (bToUpper)
	{
		while (srcIdx < cwSrcLength)
		{
			// On Windows with InvariantCulture, the LATIN SMALL LETTER DOTLESS I (U+0131)
			// capitalizes to itself, whereas with ICU it capitalizes to LATIN CAPITAL LETTER I (U+0049).
			// We special case it to match the Windows invariant behavior.
			U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint);
			dstCodepoint = ((srcCodepoint == (UChar32)0x0131) ? (UChar32)0x0131 : u_toupper(srcCodepoint));
			U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
			assert(isError == FALSE && srcIdx == dstIdx);
		}
	}
	else
	{
		while (srcIdx < cwSrcLength)
		{
			// On Windows with InvariantCulture, the LATIN CAPITAL LETTER I WITH DOT ABOVE (U+0130)
			// lower cases to itself, whereas with ICU it lower cases to LATIN SMALL LETTER I (U+0069).
			// We special case it to match the Windows invariant behavior.
			U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint);
			dstCodepoint = ((srcCodepoint == (UChar32)0x0130) ? (UChar32)0x0130 : u_tolower(srcCodepoint));
			U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
			assert(isError == FALSE && srcIdx == dstIdx);
		}
	}
}
Ejemplo n.º 19
0
/*
Function:
ChangeCaseTurkish

Performs upper or lower casing of a string into a new buffer, performing special
casing for Turkish.
*/
extern "C" void ChangeCaseTurkish(const UChar* lpSrc,
								  int32_t cwSrcLength,
								  UChar* lpDst,
								  int32_t cwDstLength,
								  int32_t bToUpper)
{
	// See algorithmic comment in ChangeCase.

	UBool isError = FALSE;
	int32_t srcIdx = 0, dstIdx = 0;
	UChar32 srcCodepoint, dstCodepoint;

	if (bToUpper)
	{
		while (srcIdx < cwSrcLength)
		{
			// In turkish casing, LATIN SMALL LETTER I (U+0069) upper cases to LATIN
			// CAPITAL LETTER I WITH DOT ABOVE (U+0130).
			U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint);
			dstCodepoint = ((srcCodepoint == (UChar32)0x0069) ? (UChar32)0x0130 : u_toupper(srcCodepoint));
			U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
			assert(isError == FALSE && srcIdx == dstIdx);
		}
	}
	else
	{
		while (srcIdx < cwSrcLength)
		{
			// In turkish casing, LATIN CAPITAL LETTER I (U+0049) lower cases to
			// LATIN SMALL LETTER DOTLESS I (U+0131).
			U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint);
			dstCodepoint = ((srcCodepoint == (UChar32)0x0049) ? (UChar32)0x0131 : u_tolower(srcCodepoint));
			U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
			assert(isError == FALSE && srcIdx == dstIdx);
		}
	}
}
Ejemplo n.º 20
0
// Finds the glyph data to use for character `c`, together with the glyph page
// it is returned with.
//
// description: font description used to realize font data and to resolve the
//              small-caps / glyph-orientation settings.
// c:           the character; it may be replaced by its uppercase form
//              (small caps) and/or its mirrored form before the lookup.
// mirror:      when true, c is replaced by u_charMirror(c).
// variant:     requested font data variant; AutoVariant is resolved below.
//
// The lookup walks the glyph page tree node for c's page (c / GlyphPage::size),
// descending to child nodes until a page supplies font data or a system
// fallback node is reached. It then tries system fallback through the font
// cache and finally falls back to the primary font's missing glyph. Nodes
// reached along the way are stored back into m_pages / m_pageZero.
// Asserts that it is called on the main thread.
std::pair<GlyphData, GlyphPage*> FontGlyphs::glyphDataAndPageForCharacter(const FontDescription& description, UChar32 c, bool mirror, FontDataVariant variant) const
{
    ASSERT(isMainThread());

    // Resolve AutoVariant: with small caps enabled (and a non-SVG primary font),
    // a character that changes when upper-cased is looked up as its uppercase
    // form in the small-caps variant; everything else uses the normal variant.
    if (variant == AutoVariant) {
        if (description.smallCaps() && !primarySimpleFontData(description)->isSVGFont()) {
            UChar32 upperC = u_toupper(c);
            if (upperC != c) {
                c = upperC;
                variant = SmallCapsVariant;
            } else
                variant = NormalVariant;
        } else
            variant = NormalVariant;
    }

    if (mirror)
        c = u_charMirror(c);

    unsigned pageNumber = (c / GlyphPage::size);

    // Page zero has its own slot; other pages are kept in the m_pages map.
    GlyphPageTreeNode* node = pageNumber ? m_pages.get(pageNumber) : m_pageZero;
    if (!node) {
        node = GlyphPageTreeNode::getRootChild(realizeFontDataAt(description, 0), pageNumber);
        if (pageNumber)
            m_pages.set(pageNumber, node);
        else
            m_pageZero = node;
    }

    GlyphPage* page = 0;
    if (variant == NormalVariant) {
        // Fastest loop, for the common case (normal variant).
        while (true) {
            page = node->page();
            if (page) {
                GlyphData data = page->glyphDataForCharacter(c);
                if (data.fontData && (data.fontData->platformData().orientation() == Horizontal || data.fontData->isTextOrientationFallback()))
                    return std::make_pair(data, page);

                if (data.fontData) {
                    if (Font::isCJKIdeographOrSymbol(c)) {
                        if (!data.fontData->hasVerticalGlyphs()) {
                            // Use the broken ideograph font data. The broken ideograph font will use the horizontal width of glyphs
                            // to make sure you get a square (even for broken glyphs like symbols used for punctuation).
                            variant = BrokenIdeographVariant;
                            break;
                        }
#if PLATFORM(COCOA)
                        else if (data.fontData->platformData().syntheticOblique())
                            return glyphDataAndPageForCJKCharacterWithoutSyntheticItalic(c, data, page, pageNumber);
#endif
                    } else
                        return glyphDataAndPageForNonCJKCharacterWithGlyphOrientation(c, description.nonCJKGlyphOrientation(), data, page, pageNumber);

                    return std::make_pair(data, page);
                }

                if (node->isSystemFallback())
                    break;
            }

            // No usable data at this level: proceed to the child node for the
            // next font in the fallback list and remember it for this page.
            node = node->getChild(realizeFontDataAt(description, node->level()), pageNumber);
            if (pageNumber)
                m_pages.set(pageNumber, node);
            else
                m_pageZero = node;
        }
    }
    // Also entered when the loop above switched to BrokenIdeographVariant.
    if (variant != NormalVariant) {
        while (true) {
            page = node->page();
            if (page) {
                GlyphData data = page->glyphDataForCharacter(c);
                if (data.fontData) {
                    // The variantFontData function should not normally return 0.
                    // But if it does, we will just render the capital letter big.
                    RefPtr<SimpleFontData> variantFontData = data.fontData->variantFontData(description, variant);
                    if (!variantFontData)
                        return std::make_pair(data, page);

                    GlyphPageTreeNode* variantNode = GlyphPageTreeNode::getRootChild(variantFontData.get(), pageNumber);
                    GlyphPage* variantPage = variantNode->page();
                    if (variantPage) {
                        GlyphData data = variantPage->glyphDataForCharacter(c);
                        if (data.fontData)
                            return std::make_pair(data, variantPage);
                    }

                    // Do not attempt system fallback off the variantFontData. This is the very unlikely case that
                    // a font has the lowercase character but the small caps font does not have its uppercase version.
                    return std::make_pair(variantFontData->missingGlyphData(), page);
                }

                if (node->isSystemFallback())
                    break;
            }

            node = node->getChild(realizeFontDataAt(description, node->level()), pageNumber);
            if (pageNumber)
                m_pages.set(pageNumber, node);
            else
                m_pageZero = node;
        }
    }

    ASSERT(page);
    ASSERT(node->isSystemFallback());

    // System fallback is character-dependent. When we get here, we
    // know that the character in question isn't in the system fallback
    // font's glyph page. Try to lazily create it here.
    // The character is handed to the font cache as UTF-16: one code unit for
    // values up to 0xFFFF, otherwise a lead/trail surrogate pair.
    UChar codeUnits[2];
    int codeUnitsLength;
    if (c <= 0xFFFF) {
        codeUnits[0] = Font::normalizeSpaces(c);
        codeUnitsLength = 1;
    } else {
        codeUnits[0] = U16_LEAD(c);
        codeUnits[1] = U16_TRAIL(c);
        codeUnitsLength = 2;
    }
    const SimpleFontData* originalFontData = primaryFontData(description)->fontDataForCharacter(c);
    RefPtr<SimpleFontData> characterFontData = fontCache().systemFallbackForCharacters(description, originalFontData, m_isForPlatformFont, codeUnits, codeUnitsLength);
    if (characterFontData) {
        if (characterFontData->platformData().orientation() == Vertical && !characterFontData->hasVerticalGlyphs() && Font::isCJKIdeographOrSymbol(c))
            variant = BrokenIdeographVariant;
        if (variant != NormalVariant)
            characterFontData = characterFontData->variantFontData(description, variant);
    }
    // Checked again because variantFontData() above may have replaced the
    // fallback font data with a null pointer.
    if (characterFontData) {
        // Got the fallback glyph and font.
        GlyphPage* fallbackPage = GlyphPageTreeNode::getRootChild(characterFontData.get(), pageNumber)->page();
        GlyphData data = fallbackPage && fallbackPage->fontDataForCharacter(c) ? fallbackPage->glyphDataForCharacter(c) : characterFontData->missingGlyphData();
        // Cache it so we don't have to do system fallback again next time.
        if (variant == NormalVariant) {
#if OS(WINCE)
            // missingGlyphData returns a null character, which is not suitable for GDI to display.
            // Also, sometimes we cannot map a font for the character on WINCE, but GDI can still
            // display the character, probably because the font package is not installed correctly.
            // So we just always set the glyph to be same as the character, and let GDI solve it.
            page->setGlyphDataForCharacter(c, c, characterFontData.get());
            characterFontData->setMaxGlyphPageTreeLevel(std::max(characterFontData->maxGlyphPageTreeLevel(), node->level()));
            return std::make_pair(page->glyphDataForCharacter(c), page);
#else
            page->setGlyphDataForCharacter(c, data.glyph, data.fontData);
            data.fontData->setMaxGlyphPageTreeLevel(std::max(data.fontData->maxGlyphPageTreeLevel(), node->level()));
            if (!Font::isCJKIdeographOrSymbol(c) && data.fontData->platformData().orientation() != Horizontal && !data.fontData->isTextOrientationFallback())
                return glyphDataAndPageForNonCJKCharacterWithGlyphOrientation(c, description.nonCJKGlyphOrientation(), data, fallbackPage, pageNumber);
#endif
        }
        return std::make_pair(data, page);
    }

    // Even system fallback can fail; use the missing glyph in that case.
    // FIXME: It would be nicer to use the missing glyph from the last resort font instead.
    GlyphData data = primarySimpleFontData(description)->missingGlyphData();
    if (variant == NormalVariant) {
#if OS(WINCE)
        // See comment about WINCE GDI handling near setGlyphDataForCharacter above.
        page->setGlyphDataForCharacter(c, c, data.fontData);
        data.fontData->setMaxGlyphPageTreeLevel(std::max(data.fontData->maxGlyphPageTreeLevel(), node->level()));
        return std::make_pair(page->glyphDataForCharacter(c), page);
#else
        page->setGlyphDataForCharacter(c, data.glyph, data.fontData);
        data.fontData->setMaxGlyphPageTreeLevel(std::max(data.fontData->maxGlyphPageTreeLevel(), node->level()));
#endif
    }
    return std::make_pair(data, page);
}
// Consumes characters from textIterator and measures them one at a time.
//
// For every consumed character this looks up its glyph and font data, computes
// the glyph's advance width (tabs, glyph stretch, word rounding, letter/word
// spacing and justification expansion included), and, when glyphBuffer is
// non-null, appends the glyph with its advance to it.
//
// textIterator: source of characters; advanced past everything consumed.
// glyphBuffer:  optional output buffer, may be null.
//
// Updates m_runWidthSoFar, m_currentCharacter, m_finalRoundingWidth and, when
// m_accountForGlyphBounds is set, the glyph overflow / bounding box members.
// Returns how far textIterator.currentCharacter() moved past the previous
// m_currentCharacter.
inline unsigned WidthIterator::advanceInternal(TextIterator& textIterator, GlyphBuffer* glyphBuffer)
{
    bool rtl = m_run.rtl();
    bool hasExtraSpacing = (m_font->letterSpacing() || m_font->wordSpacing() || m_expansion) && !m_run.spacingDisabled();

    // Split the running width into its integer part (kept in m_runWidthSoFar)
    // and the fractional remainder accumulated since the last rounding.
    float widthSinceLastRounding = m_runWidthSoFar;
    m_runWidthSoFar = floorf(m_runWidthSoFar);
    widthSinceLastRounding -= m_runWidthSoFar;

    float lastRoundingWidth = m_finalRoundingWidth;
    FloatRect bounds;

    const SimpleFontData* primaryFont = m_font->primaryFont();
    const SimpleFontData* lastFontData = primaryFont;
    int lastGlyphCount = glyphBuffer ? glyphBuffer->size() : 0;

    UChar32 character = 0;
    unsigned clusterLength = 0;
    CharactersTreatedAsSpace charactersTreatedAsSpace;
    String normalizedSpacesStringCache;
    while (textIterator.consume(character, clusterLength)) {
        unsigned advanceLength = clusterLength;
        int currentCharacter = textIterator.currentCharacter();
        const GlyphData& glyphData = glyphDataForCharacter(character, rtl, currentCharacter, advanceLength, normalizedSpacesStringCache);
        Glyph glyph = glyphData.glyph;
        const SimpleFontData* fontData = glyphData.fontData;

        ASSERT(fontData);

        // Now that we have a glyph and font data, get its width.
        float width;
        if (character == '\t' && m_run.allowTabs())
            width = m_font->tabWidth(*fontData, m_run.tabSize(), m_run.xPos() + m_runWidthSoFar + widthSinceLastRounding);
        else {
            width = fontData->widthForGlyph(glyph);

            // SVG uses horizontalGlyphStretch(), when textLength is used to stretch/squeeze text.
            width *= m_run.horizontalGlyphStretch();

            // We special case spaces in two ways when applying word rounding.
            // First, we round spaces to an adjusted width in all fonts.
            // Second, in fixed-pitch fonts we ensure that all characters that
            // match the width of the space character have the same width as the space character.
            if (m_run.applyWordRounding() && width == fontData->spaceWidth() && (fontData->pitch() == FixedPitch || glyph == fontData->spaceGlyph()))
                width = fontData->adjustedSpaceWidth();
        }

        // The font changed (and the glyph has a width): flush font transforms
        // for the glyphs collected so far and record the new fallback font.
        if (fontData != lastFontData && width) {
            if (shouldApplyFontTransforms()) {
                m_runWidthSoFar += applyFontTransforms(glyphBuffer, m_run.ltr(), lastGlyphCount, lastFontData, *this, m_typesettingFeatures, charactersTreatedAsSpace);
                // NOTE(review): glyphBuffer is dereferenced here without the null check used
                // elsewhere in this function -- confirm shouldApplyFontTransforms() implies a buffer.
                lastGlyphCount = glyphBuffer->size(); // applyFontTransforms doesn't update when there had been only one glyph.
            }

            lastFontData = fontData;
            if (m_fallbackFonts && fontData != primaryFont) {
                // FIXME: This does a little extra work that could be avoided if
                // glyphDataForCharacter() returned whether it chose to use a small caps font.
                if (!m_font->isSmallCaps() || character == u_toupper(character))
                    m_fallbackFonts->add(fontData);
                else {
                    const GlyphData& uppercaseGlyphData = m_font->glyphDataForCharacter(u_toupper(character), rtl);
                    if (uppercaseGlyphData.fontData != primaryFont)
                        m_fallbackFonts->add(uppercaseGlyphData.fontData);
                }
            }
        }

        if (hasExtraSpacing) {
            // Account for letter-spacing.
            if (width && m_font->letterSpacing())
                width += m_font->letterSpacing();

            static bool expandAroundIdeographs = Font::canExpandAroundIdeographsInComplexText();
            bool treatAsSpace = Font::treatAsSpace(character);
            if (treatAsSpace || (expandAroundIdeographs && Font::isCJKIdeographOrSymbol(character))) {
                // Distribute the run's total expansion evenly over all expansion opportunities in the run.
                if (m_expansion) {
                    float previousExpansion = m_expansion;
                    if (!treatAsSpace && !m_isAfterExpansion) {
                        // Take the expansion opportunity before this ideograph.
                        m_expansion -= m_expansionPerOpportunity;
                        float expansionAtThisOpportunity = !m_run.applyWordRounding() ? m_expansionPerOpportunity : roundf(previousExpansion) - roundf(m_expansion);
                        m_runWidthSoFar += expansionAtThisOpportunity;
                        if (glyphBuffer) {
                            if (glyphBuffer->isEmpty()) {
                                if (m_forTextEmphasis)
                                    glyphBuffer->add(fontData->zeroWidthSpaceGlyph(), fontData, m_expansionPerOpportunity, currentCharacter);
                                else
                                    glyphBuffer->add(fontData->spaceGlyph(), fontData, expansionAtThisOpportunity, currentCharacter);
                            } else
                                glyphBuffer->expandLastAdvance(expansionAtThisOpportunity);
                        }
                        previousExpansion = m_expansion;
                    }
                    // Take the expansion opportunity after this character, unless it is the
                    // trailing one of the run and trailing expansion is not allowed.
                    if (m_run.allowsTrailingExpansion() || (m_run.ltr() && currentCharacter + advanceLength < static_cast<size_t>(m_run.length()))
                        || (m_run.rtl() && currentCharacter)) {
                        m_expansion -= m_expansionPerOpportunity;
                        width += !m_run.applyWordRounding() ? m_expansionPerOpportunity : roundf(previousExpansion) - roundf(m_expansion);
                        m_isAfterExpansion = true;
                    }
                } else
                    m_isAfterExpansion = false;

                // Account for word spacing.
                // We apply additional space between "words" by adding width to the space character.
                if (treatAsSpace && (character != '\t' || !m_run.allowTabs()) && (currentCharacter || character == noBreakSpace) && m_font->wordSpacing())
                    width += m_font->wordSpacing();
            } else
                m_isAfterExpansion = false;
        }

        // Record the advances around space-like characters for the later
        // applyFontTransforms() call.
        if (shouldApplyFontTransforms() && glyphBuffer && Font::treatAsSpace(character))
            charactersTreatedAsSpace.append(std::make_pair(glyphBuffer->size(),
                OriginalAdvancesForCharacterTreatedAsSpace(character == ' ', glyphBuffer->size() ? glyphBuffer->advanceAt(glyphBuffer->size() - 1).width() : 0, width)));

        if (m_accountForGlyphBounds) {
            bounds = fontData->boundsForGlyph(glyph);
            if (!currentCharacter)
                m_firstGlyphOverflow = std::max<float>(0, -bounds.x());
        }

        // Characters that cannot carry an emphasis mark get glyph 0.
        if (m_forTextEmphasis && !Font::canReceiveTextEmphasis(character))
            glyph = 0;

        // Advance past the character we just dealt with.
        textIterator.advance(advanceLength);

        float oldWidth = width;

        // Force characters that are used to determine word boundaries for the rounding hack
        // to be integer width, so following words will start on an integer boundary.
        if (m_run.applyWordRounding() && Font::isRoundingHackCharacter(character)) {
            width = ceilf(width);

            // Since widthSinceLastRounding can lose precision if we include measurements for
            // preceding whitespace, we bypass it here.
            m_runWidthSoFar += width;

            // Since this is a rounding hack character, we should have reset this sum on the previous
            // iteration.
            ASSERT(!widthSinceLastRounding);
        } else {
            // Check to see if the next character is a "rounding hack character", if so, adjust
            // width so that the total run width will be on an integer boundary.
            if ((m_run.applyWordRounding() && textIterator.currentCharacter() < m_run.length() && Font::isRoundingHackCharacter(*(textIterator.characters())))
                || (m_run.applyRunRounding() && textIterator.currentCharacter() >= m_run.length())) {
                float totalWidth = widthSinceLastRounding + width;
                widthSinceLastRounding = ceilf(totalWidth);
                width += widthSinceLastRounding - totalWidth;
                m_runWidthSoFar += widthSinceLastRounding;
                widthSinceLastRounding = 0;
            } else
                widthSinceLastRounding += width;
        }

        // In RTL runs the rounding adjustment of the previous character is
        // attached to this glyph's unrounded width instead of the rounded width.
        if (glyphBuffer)
            glyphBuffer->add(glyph, fontData, (rtl ? oldWidth + lastRoundingWidth : width), currentCharacter);

        lastRoundingWidth = width - oldWidth;

        if (m_accountForGlyphBounds) {
            m_maxGlyphBoundingBoxY = std::max(m_maxGlyphBoundingBoxY, bounds.maxY());
            m_minGlyphBoundingBoxY = std::min(m_minGlyphBoundingBoxY, bounds.y());
            m_lastGlyphOverflow = std::max<float>(0, bounds.maxX() - width);
        }
    }

    // Apply font transforms to the glyphs collected since the last font change.
    if (shouldApplyFontTransforms())
        m_runWidthSoFar += applyFontTransforms(glyphBuffer, m_run.ltr(), lastGlyphCount, lastFontData, *this, m_typesettingFeatures, charactersTreatedAsSpace);

    unsigned consumedCharacters = textIterator.currentCharacter() - m_currentCharacter;
    m_currentCharacter = textIterator.currentCharacter();
    m_runWidthSoFar += widthSinceLastRounding;
    m_finalRoundingWidth = lastRoundingWidth;
    return consumedCharacters;
}
Ejemplo n.º 22
0
/**
 * Explores the given dictionary to match the given word.
 *
 * The function recursively follows the transitions of the dictionary state
 * located at 'offset', comparing them with word[pos_word]. Besides exact
 * matches (and matches where the word letter is the uppercase of the
 * transition letter), it tries the spelling-error hypotheses allowed by
 * 'cfg': letter swap (SP_SWAP), letter change (SP_CHANGE), letter missing
 * in the word (SP_SUPPR) and extra letter in the word (SP_INSERT).
 *
 * offset:    offset of the dictionary state to read
 * word:      the '\0'-terminated word to look up
 * pos_word:  current position in 'word'
 * d:         the dictionary being explored
 * cfg:       spell-check configuration; its error counters and its 'pairs'
 *            vector are modified during the exploration and restored before
 *            returning
 * output:    dictionary output accumulated along the path; restored on exit
 * list:      passed to deal_with_matches() when a final state is reached at
 *            the end of the word
 * base:      value forwarded to deal_with_matches(); replaced by output->len
 *            once a final state has been crossed
 * inflected: the sequence of transition letters followed so far
 */
static void explore_dic(int offset,unichar* word,int pos_word,Dictionary* d,SpellCheckConfig* cfg,
		Ustring* output,SpellCheckHypothesis* *list,int base,Ustring* inflected) {
/* We keep the entry values, because the SP_SUPPR and SP_INSERT cases
 * below restart from them */
int original_offset=offset;
int original_base=base;
int final,n_transitions,inf_code;
/* z is the state of the output before any transition of this state */
int z=save_output(output);
/* Size used to drop the hypothesis pairs added at this level */
int size_pairs=cfg->pairs->nbelems;
offset=read_dictionary_state(d,offset,&final,&n_transitions,&inf_code);
if (final) {
	if (word[pos_word]=='\0') {
		/* If we have a match */
		deal_with_matches(d,inflected->str,inf_code,output,cfg,base,list);
	}
	base=output->len;
}
/* If we are at the end of the token, then we stop */
if (word[pos_word]=='\0') {
	return;
}
/* l2 is the length of 'inflected' on entry, used to undo our additions */
unsigned int l2=inflected->len;
unichar c;
int dest_offset;
for (int i=0;i<n_transitions;i++) {
	restore_output(z,output);
	offset=read_dictionary_transition(d,offset,&c,&dest_offset,output);
	/* For backup_output, see comment below */
	int backup_output=save_output(output);
	/* Exact match, or the word contains the uppercase variant of the
	 * transition letter */
	if (c==word[pos_word] || word[pos_word]==u_toupper(c)) {
		u_strcat(inflected,c);
		explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
	} else {
		/* We deal with the SP_SWAP case, made of 2 SP_CHANGE_XXX */
		if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_SWAP!=cfg->max_SP_SWAP
				&& is_letter_swap(cfg,word,pos_word,inflected,c)) {
			/* We don't modify the number of errors since we override an existing
			 * SP_CHANGE_XXX one */
			cfg->current_SP_SWAP++;
			/* We override the previous change */
			int a=cfg->pairs->tab[cfg->pairs->nbelems-2];
			int b=cfg->pairs->tab[cfg->pairs->nbelems-1];
			cfg->pairs->tab[cfg->pairs->nbelems-2]=pos_word-1;
			cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_SWAP_DEFAULT;
			u_strcat(inflected,c);
			explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
			/* We put back the pair that we had overridden */
			cfg->pairs->tab[cfg->pairs->nbelems-2]=a;
			cfg->pairs->tab[cfg->pairs->nbelems-1]=b;
			cfg->current_SP_SWAP--;
		} else /* We deal with the SP_CHANGE case */
		       if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_CHANGE!=cfg->max_SP_CHANGE
				/* We want letters, not spaces or anything else */
				&& is_letter(c,NULL)
		        /* We do not allow the replacement of a lowercase letter by an uppercase
		         * letter at the beginning of the word like Niserable, unless the whole word
		         * is in uppercase or the letter is the same, modulo the case */
		        && (cfg->allow_uppercase_initial || pos_word>0 || (!is_upper(word[0],NULL) || is_upper(word[1],NULL) || word[0]==u_toupper(c)))) {
			cfg->current_errors++;
			cfg->current_SP_CHANGE++;
			/* Now we test all possible kinds of change */
			vector_int_add(cfg->pairs,pos_word);
			u_strcat(inflected,c);
			/* We always add the default case */
			vector_int_add(cfg->pairs,SP_CHANGE_DEFAULT);
			/* n_elem is used to reset the pairs vector before each variant,
			 * so that only the kind of the last pair is replaced */
			int n_elem=cfg->pairs->nbelems;
			explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
			/* Then we test the accent case */
			if (u_deaccentuate(c)==u_deaccentuate(word[pos_word])) {
				/* After a call to explore_dic, we must restore the output.
				 * But, when dealing with SP_CHANGE_XXX ops, we must restore the
				 * output including the output associated to the current transition,
				 * which is why we don't use z (output before the current transition)
				 * but backup_output */
				restore_output(backup_output,output);
			    cfg->pairs->nbelems=n_elem;
			    cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_CHANGE_DIACRITIC;
			    explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
			}
			/* And the case variations */
			if (u_tolower(c)==u_tolower(word[pos_word])) {
			    restore_output(backup_output,output);
			    cfg->pairs->nbelems=n_elem;
				cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_CHANGE_CASE;
				explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
			}
			/* And finally the position on keyboard */
			if (areCloseOnKeyboard(c,word[pos_word],cfg->keyboard)) {
			    restore_output(backup_output,output);
			    cfg->pairs->nbelems=n_elem;
				cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_CHANGE_KEYBOARD;
				explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
			}
			cfg->pairs->nbelems=size_pairs;
			cfg->current_errors--;
			cfg->current_SP_CHANGE--;
			/* End of the SP_CHANGE case */
		}
	}
    restore_output(backup_output,output);
	truncate(inflected,l2);
	/* Now we deal with the SP_SUPPR case: we follow the transition
	 * without consuming any letter of the word (pos_word is unchanged) */
	if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_SUPPR!=cfg->max_SP_SUPPR
		/* We want letters, not spaces or anything else */
		&& is_letter(c,NULL)) {
		cfg->current_errors++;
		cfg->current_SP_SUPPR++;
		vector_int_add(cfg->pairs,pos_word);
		/* The transition letter is the same as the previous letter of the word */
		if (pos_word>=1 && c==word[pos_word-1]) {
			vector_int_add(cfg->pairs,SP_SUPPR_DOUBLE);
		} else {
			vector_int_add(cfg->pairs,SP_SUPPR_DEFAULT);
		}
		u_strcat(inflected,c);
		explore_dic(dest_offset,word,pos_word,d,cfg,output,list,original_base,inflected);
		truncate(inflected,l2);
		cfg->pairs->nbelems=size_pairs;
		cfg->current_errors--;
		cfg->current_SP_SUPPR--;
	}
}
restore_output(z,output);
/* Finally, we deal with the SP_INSERT case, by calling again the current
 * function with the same parameters, except pos_word that will be increased of 1 */
if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_INSERT!=cfg->max_SP_INSERT
	/* We want letters, not spaces or anything else */
	&& is_letter(word[pos_word],NULL)
	/* We do not allow the insertion of a capital letter at the beginning of
	 * the word like Astreet, unless the whole word is in uppercase like ASTREET */
    && (cfg->allow_uppercase_initial || pos_word>0 || (!is_upper(word[0],NULL) || is_upper(word[1],NULL)))) {
	cfg->current_errors++;
	cfg->current_SP_INSERT++;
	vector_int_add(cfg->pairs,pos_word);
	/* The skipped letter is the same as the previous letter of the word */
	if (pos_word>=1 && word[pos_word]==word[pos_word-1]) {
		vector_int_add(cfg->pairs,SP_INSERT_DOUBLE);
	} else {
		vector_int_add(cfg->pairs,SP_INSERT_DEFAULT);
	}
	explore_dic(original_offset,word,pos_word+1,d,cfg,output,list,original_base,inflected);
	truncate(inflected,l2);
	cfg->pairs->nbelems=size_pairs;
	cfg->current_errors--;
	cfg->current_SP_INSERT--;
}
/* Finally, we restore the output as it was when we enter the function */
restore_output(z,output);
}
Ejemplo n.º 23
0
/**
 * Takes a given unicode string 'dest' and
 * replaces any lowercase letter by the set made of itself and
 * its uppercase equivalent, surrounded with square brackets if
 * the letter was not already between square brackets.
 * Examples:
 *
 * "For" => "F[oO][rR]"
 * "F[ao]r" => "F[aAoO][rR]"
 *
 * The output is stored in 'src'. The function assumes that 'src' is
 * wide enough.
 *
 * This function is used for morphological filter regular expressions.
 */
void replace_letter_by_letter_set(const Alphabet* a,unichar* dest,const unichar* src) {
int i=0,j=0;
char inside_a_set=0;
while (src[i]!='\0') {
   switch (src[i]) {
      case '\\':
         if (src[i+1]=='\0') {
             // there is nothing after a backslash, then we stop,
             // and the RE compiler may indicate an error
             dest[j++] = src[i++];
             dest[j] = src[i];
             return;
         }
         if (is_lower(src[i+1],a)) {
             // this is a lowercase letter in Unitex alphabet :
             // we don't need "\" and we make expansion "[eE]"
             ++i;
             if (!inside_a_set) dest[j++]='[';
             dest[j++]=src[i];
             if (a==NULL) {
                /* If there is no alphabet file, we just consider the unique
                 * uppercase variant of the letter */
                dest[j++]=u_toupper(src[i]);
             } else {
			 unichar* tbrowse = NULL;
			 int i_pos_in_array_of_string = a->pos_in_represent_list[src[i]];
			 if (i_pos_in_array_of_string != 0)
				 tbrowse = a->t_array_collection[i_pos_in_array_of_string];
			 if (tbrowse != NULL)
				 while ((*tbrowse) != '\0') {
					 dest[j++]=*(tbrowse++);
				 }
             }
             if (!inside_a_set) dest[j++]=']';
             i++;
          } else {
             // others cases :
             // we keep the "\" and the letter
             dest[j++] = src[i++];
             dest[j++] = src[i++];
          }
          break;
       case '[':
          dest[j++]=src[i++];
          inside_a_set=1;
          break;
       case ']':
          dest[j++]=src[i++];
          inside_a_set=0;
          break;
       case '.': case '*': case '+': case '?': case '|': case '^': case '$':
       case ':': case '(': case ')': case '{': case '}': case '1': case '2':
       case '3': case '4': case '5': case '6': case '7': case '8': case '9':
          dest[j++]=src[i++];
          break;
       default:
          if (is_lower(src[i],a)) {
             if (!inside_a_set) dest[j++]='[';
             dest[j++]=src[i];
             if (inside_a_set && src[i+1]=='-') {
            	 /* Special case:
            	  * if we had [a-d], we don't want to turn it into
            	  * [aA-dD], but rather into [a-dA-D]. In such a case,
            	  * we just use u_toupper
            	  */
            	 i=i+2;
            	 dest[j++]='-';
            	 dest[j++]=src[i++];
            	 dest[j++]=u_toupper(dest[i-3]);
            	 dest[j++]='-';
            	 dest[j++]=u_toupper(src[i-1]);
            	 continue;
             }

             if (a==NULL) {
                /* If there is no alphabet file, we just consider the unique
                 * uppercase variant of the letter */
                dest[j++]=u_toupper(src[i]);
             } else {
                /* If there is an alphabet file, we use it */
                unichar* tbrowse = NULL;
                int i_pos_in_array_of_string = a->pos_in_represent_list[src[i]];
                if (i_pos_in_array_of_string != 0) {
                   tbrowse = a->t_array_collection[i_pos_in_array_of_string];
                }
                if (tbrowse != NULL) {
                   while ((*tbrowse) != '\0') {
                      dest[j++]=*(tbrowse++);
                   }
                }
             }
             if (!inside_a_set) dest[j++]=']';
             i++;
         }
          else {
             /* Not a lower case letter */
             dest[j++]=src[i++];
          }
   }
}
dest[j]='\0';
}
Ejemplo n.º 24
0
// Returns the uppercase mapping of the code point 'c', as given by
// u_toupper().
uint32
BUnicodeChar::ToUpper(uint32 c)
{
	// A temporary BUnicodeChar is constructed and discarded before the
	// lookup (presumably for the side effects of its constructor -- confirm).
	BUnicodeChar();

	const uint32 upperCased = u_toupper(c);
	return upperCased;
}
// Native implementation returning the uppercase mapping of 'codePoint' as
// computed by u_toupper(). The JNIEnv and jclass arguments are unused.
static jint Character_toUpperCaseImpl(JNIEnv*, jclass, jint codePoint) {
    const jint upperCased = u_toupper(codePoint);
    return upperCased;
}
Ejemplo n.º 26
0
//static jint Character_toUpperCaseImpl(JNIEnv*, jclass, jint codePoint) {
// Exported JNI entry point returning the uppercase mapping of 'codePoint' as
// computed by u_toupper(). The JNIEnv and jclass arguments are unused.
JNIEXPORT jint JNICALL
Java_java_lang_Character_toUpperCaseImpl(JNIEnv*, jclass, jint codePoint) {
    const jint upperCased = u_toupper(codePoint);
    return upperCased;
}
// Helper sets the character attribute properties and sets up the script table.
// Does not set tops and bottoms.
void SetupBasicProperties(bool report_errors, bool decompose,
                          UNICHARSET* unicharset) {
  for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) {
    // Convert any custom ligatures.
    const char* unichar_str = unicharset->id_to_unichar(unichar_id);
    for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr; ++i) {
      if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) {
        unichar_str = UNICHARSET::kCustomLigatures[i][0];
        break;
      }
    }

    // Convert the unichar to UTF32 representation
    std::vector<char32> uni_vector = UNICHAR::UTF8ToUTF32(unichar_str);

    // Assume that if the property is true for any character in the string,
    // then it holds for the whole "character".
    bool unichar_isalpha = false;
    bool unichar_islower = false;
    bool unichar_isupper = false;
    bool unichar_isdigit = false;
    bool unichar_ispunct = false;

    for (char32 u_ch : uni_vector) {
      if (u_isalpha(u_ch)) unichar_isalpha = true;
      if (u_islower(u_ch)) unichar_islower = true;
      if (u_isupper(u_ch)) unichar_isupper = true;
      if (u_isdigit(u_ch)) unichar_isdigit = true;
      if (u_ispunct(u_ch)) unichar_ispunct = true;
    }

    unicharset->set_isalpha(unichar_id, unichar_isalpha);
    unicharset->set_islower(unichar_id, unichar_islower);
    unicharset->set_isupper(unichar_id, unichar_isupper);
    unicharset->set_isdigit(unichar_id, unichar_isdigit);
    unicharset->set_ispunctuation(unichar_id, unichar_ispunct);

    tesseract::IcuErrorCode err;
    unicharset->set_script(unichar_id, uscript_getName(
        uscript_getScript(uni_vector[0], err)));

    const int num_code_points = uni_vector.size();
    // Obtain the lower/upper case if needed and record it in the properties.
    unicharset->set_other_case(unichar_id, unichar_id);
    if (unichar_islower || unichar_isupper) {
      std::vector<char32> other_case(num_code_points, 0);
      for (int i = 0; i < num_code_points; ++i) {
        // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used.
        // However since they deal with UChars (so need a conversion function
        // from char32 or UTF8string) and require a meaningful locale string,
        // for now u_tolower()/u_toupper() are used.
        other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) :
          u_tolower(uni_vector[i]);
      }
      std::string other_case_uch = UNICHAR::UTF32ToUTF8(other_case);
      UNICHAR_ID other_case_id =
          unicharset->unichar_to_id(other_case_uch.c_str());
      if (other_case_id != INVALID_UNICHAR_ID) {
        unicharset->set_other_case(unichar_id, other_case_id);
      } else if (unichar_id >= SPECIAL_UNICHAR_CODES_COUNT && report_errors) {
        tprintf("Other case %s of %s is not in unicharset\n",
                other_case_uch.c_str(), unichar_str);
      }
    }

    // Set RTL property and obtain mirror unichar ID from ICU.
    std::vector<char32> mirrors(num_code_points, 0);
    for (int i = 0; i < num_code_points; ++i) {
      mirrors[i] = u_charMirror(uni_vector[i]);
      if (i == 0) {  // set directionality to that of the 1st code point
        unicharset->set_direction(unichar_id,
                                  static_cast<UNICHARSET::Direction>(
                                      u_charDirection(uni_vector[i])));
      }
    }
    std::string mirror_uch = UNICHAR::UTF32ToUTF8(mirrors);
    UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str());
    if (mirror_uch_id != INVALID_UNICHAR_ID) {
      unicharset->set_mirror(unichar_id, mirror_uch_id);
    } else if (report_errors) {
      tprintf("Mirror %s of %s is not in unicharset\n",
              mirror_uch.c_str(), unichar_str);
    }

    // Record normalized version of this unichar.
    std::string normed_str;
    if (unichar_id != 0 &&
        tesseract::NormalizeUTF8String(
            decompose ? tesseract::UnicodeNormMode::kNFKD
                      : tesseract::UnicodeNormMode::kNFKC,
            tesseract::OCRNorm::kNormalize, tesseract::GraphemeNorm::kNone,
            unichar_str, &normed_str) &&
        !normed_str.empty()) {
      unicharset->set_normed(unichar_id, normed_str.c_str());
    } else {
      unicharset->set_normed(unichar_id, unichar_str);
    }
    ASSERT_HOST(unicharset->get_other_case(unichar_id) < unicharset->size());
  }
  unicharset->post_load_setup();
}
Ejemplo n.º 28
0
//
// this function explores the dictionary to decompose the word
// 'original_word' into a sequence of components
//
// adresse:                  offset in 'tableau_bin' of the state to explore
// current_component:        buffer receiving the letters of the component
//                           being recognized
// pos_in_current_component: number of letters already stored in
//                           'current_component'
// original_word:            the word to decompose
// pos_in_original_word:     current position in 'original_word'
// decomposition:            decomposition built so far; components are
//                           separated with " +++ "
// dela_line:                concatenation of the components recognized so far;
//                           its first letter may be put in uppercase by this
//                           function
// L:                        list receiving the valid decompositions
// n_decomp:                 number of components of the current analysis
// left, right:              arrays indexed by INF code number, telling if the
//                           code is a valid left/right component
// inf_codes:                the INF codes of the dictionary
// alphabet:                 alphabet used for the case-tolerant comparisons
// tableau_bin:              the binary dictionary
//
// NOTE(review): all the local buffers have fixed sizes (500 or 2000) and are
// filled with u_strcpy/u_strcat without any length check. In particular the
// right-component branch copies 'decomposition' into dec[500] while the
// left-component branch builds it in dec[2000] -- confirm that inputs cannot
// exceed these sizes.
//
void explore_state_german(int adresse,unichar* current_component,int pos_in_current_component,
                   const unichar* original_word,int pos_in_original_word,const unichar* decomposition,
                   unichar* dela_line,struct german_word_decomposition_list** L,int n_decomp,
                   const char* left,const char* right,
                   const struct INF_codes* inf_codes,const Alphabet* alphabet,
                   const unsigned char* tableau_bin) {
int c;
int index,t;
// the state header is a 2-byte big-endian value: bit 15 clear means that the
// state is terminal, and the other bits give the number of transitions
c=tableau_bin[adresse]*256+tableau_bin[adresse+1];
if (!(c&32768)) {
  // if we are in a terminal state
  // the header is followed by a 3-byte INF code number
  index=tableau_bin[adresse+2]*256*256+tableau_bin[adresse+3]*256+tableau_bin[adresse+4];
  current_component[pos_in_current_component]='\0';
  if (pos_in_current_component>1) {
    // we don't consider words with a length of 1
    if (original_word[pos_in_original_word]=='\0') {
      // if we have explored the entire original word
      if (right[index]) {
         // and if we have a valid right component
         struct list_ustring* l=inf_codes->codes[index];
         while (l!=NULL) {
            unichar dec[500];
            u_strcpy(dec,decomposition);
            if (dec[0]!='\0') {u_strcat(dec," +++ ");}
            unichar entry[500];
            uncompress_entry(current_component,l->string,entry);
            u_strcat(dec,entry);
            unichar new_dela_line[500];
            struct dela_entry* tmp_entry=tokenize_DELAF_line(entry,1);
            if (tmp_entry==NULL) {
               /* If there was an error in the dictionary, we skip the entry */
               l=l->next;
               continue;
            }
            // change case if there is a prefix
            // prefixes are downcase, nouns (=suffixes) uppercase:
            // "investitionsObjekte" -> "Investitionsobjekte"
            if ( u_strlen(dela_line) != 0 ) {
              // capitalize dela_line
              dela_line[0] = u_toupper((unichar) dela_line[0]);
              // downcase lemma and inflected
              tmp_entry->inflected[0] = u_tolower(tmp_entry->inflected[0]);
              tmp_entry->lemma[0] = u_tolower(tmp_entry->lemma[0]);
            }
            // we build the line:
            // <prefix><inflected>,<prefix><lemma>.<code>+<code>...:<infl>:<infl>...
            u_strcpy(new_dela_line,dela_line);
            u_strcat(new_dela_line,tmp_entry->inflected);
            u_strcat(new_dela_line,",");
            u_strcat(new_dela_line,dela_line);
            u_strcat(new_dela_line,tmp_entry->lemma);
            u_strcat(new_dela_line,".");
            u_strcat(new_dela_line,tmp_entry->semantic_codes[0]);
            int k;
            for (k=1;k<tmp_entry->n_semantic_codes;k++) {
               u_strcat(new_dela_line,"+");
               u_strcat(new_dela_line,tmp_entry->semantic_codes[k]);
            }
            for (k=0;k<tmp_entry->n_inflectional_codes;k++) {
               u_strcat(new_dela_line,":");
               u_strcat(new_dela_line,tmp_entry->inflectional_codes[k]);
            }
            free_dela_entry(tmp_entry);
            struct german_word_decomposition* wd=new_german_word_decomposition();
            wd->n_parts=n_decomp;
            u_strcpy(wd->decomposition,dec);
            u_strcpy(wd->dela_line,new_dela_line);
            if (check_valid_right_component_for_one_INF_code_german(l->string)) {
               // if we got a correct right component (N-FF)
               // we insert the decomposition at the head of the list
               struct german_word_decomposition_list* wdl=new_german_word_decomposition_list();
               wdl->element=wd;
               wdl->suivant=(*L);
               (*L)=wdl;
            } else {
               free_german_word_decomposition(wd);
            }
            l=l->next;
         }
      }
    }
    else {
      // else, we must explore the rest of the original word
      if (left[index]) {
         // but only if the current component was a valid left one
         // we go on with the next component
         unichar dec[2000];
         unichar line[500];
         u_strcpy(dec,decomposition);
         if (dec[0]!='\0') {u_strcat(dec," +++ ");}
         unichar sia_code[500];
         unichar entry[500];
         get_first_sia_code_german(index,sia_code,inf_codes);
         uncompress_entry(current_component,sia_code,entry);
         u_strcat(dec,entry);
         u_strcpy(line,dela_line);
         u_strcat(line,current_component);
         unichar temp[500];
         // the next component is searched from address 4 with an empty
         // component buffer, at the same position in the original word
         explore_state_german(4,temp,0,original_word,pos_in_original_word,
                  dec,line,L,n_decomp+1,left,right,inf_codes,alphabet,tableau_bin);
      }
    }
  }
  // in a terminal state, the transitions start after the 2-byte header and
  // the 3-byte INF code number
  t=adresse+5;
}
else {
  // non terminal state: we remove the flag bit to get the number of
  // transitions, which start right after the 2-byte header
  c=c-32768;
  t=adresse+2;
}
if (original_word[pos_in_original_word]=='\0') {
   // if we have finished, we return
   return;
}
// if not, we go on with the next letter
// each transition takes 5 bytes: a 2-byte character followed by the 3-byte
// address of the destination state
for (int i=0;i<c;i++) {
  if (is_equal_or_uppercase((unichar)(tableau_bin[t]*256+tableau_bin[t+1]),original_word[pos_in_original_word],alphabet)
      || is_equal_or_uppercase(original_word[pos_in_original_word],(unichar)(tableau_bin[t]*256+tableau_bin[t+1]),alphabet)) {
    index=tableau_bin[t+2]*256*256+tableau_bin[t+3]*256+tableau_bin[t+4];
    // we store the letter of the dictionary, not the one of the word
    current_component[pos_in_current_component]=(unichar)(tableau_bin[t]*256+tableau_bin[t+1]);
    explore_state_german(index,current_component,pos_in_current_component+1,original_word,pos_in_original_word+1,
                  decomposition,dela_line,L,n_decomp,left,right,inf_codes,alphabet,tableau_bin);
  }
  t=t+5;
}
}
Ejemplo n.º 29
0
// Advances the controller from m_currentCharacter up to |offset| (clamped to m_end).
// The characters in between are split into items that share the same font data and the
// same small-caps state; each item is handed to itemizeShapeAndPlace() together with
// |glyphBuffer|. Does nothing when there is no character to consume. On return
// m_currentCharacter has been moved forward by the number of characters consumed.
void UniscribeController::advance(unsigned offset, GlyphBuffer* glyphBuffer)
{
    // FIXME: We really want to be using a newer version of Uniscribe that supports the new OpenType
    // functions.  Those functions would allow us to turn off kerning and ligatures.  Without being able
    // to do that, we will have buggy line breaking and metrics when simple and complex text are close
    // together (the complex code path will narrow the text because of kerning and ligatures and then
    // when bidi processing splits into multiple runs, the simple portions will get wider and cause us to
    // spill off the edge of a line).
    if (static_cast<int>(offset) > m_end)
        offset = m_end;

    int length = offset - m_currentCharacter;
    if (length <= 0)
        return;

    // Itemize the string.
    const UChar* cp = m_run.data(m_currentCharacter);
    unsigned baseCharacter = m_currentCharacter;

    // We break up itemization of the string by fontData and (if needed) the use of small caps.

    // FIXME: It's inconsistent that we use logical order when itemizing, since this
    // does not match normal RTL.

    // FIXME: This function should decode surrogate pairs. Currently it makes little difference that
    // it does not because the font cache on Windows does not support non-BMP characters.

    // Holds the replacement characters of small-caps items, at the same indices as in |cp|.
    // Only sized when the font is small caps.
    Vector<UChar, 256> smallCapsBuffer;
    if (m_font.isSmallCaps())
        smallCapsBuffer.resize(length);

    // For RTL runs the characters are walked from the last one down to the first one.
    unsigned indexOfFontTransition = m_run.rtl() ? length - 1 : 0;
    const UChar* curr = m_run.rtl() ? cp + length  - 1 : cp;
    const UChar* end = m_run.rtl() ? cp - 1 : cp + length;

    const SimpleFontData* fontData;
    const SimpleFontData* nextFontData = m_font.glyphDataForCharacter(*curr, false).fontData;

    UChar newC = 0;

    // The first character is small caps when the font is small caps, the character is not
    // a mark (U_GC_M_MASK) and uppercasing changes it.
    bool isSmallCaps;
    bool nextIsSmallCaps = m_font.isSmallCaps() && !(U_GET_GC_MASK(*curr) & U_GC_M_MASK) && (newC = u_toupper(*curr)) != *curr;

    if (nextIsSmallCaps)
        smallCapsBuffer[curr - cp] = newC;

    while (true) {
        curr = m_run.rtl() ? curr - 1 : curr + 1;
        if (curr == end)
            break;

        // fontData/isSmallCaps describe the item in progress, nextFontData/nextIsSmallCaps
        // describe the character at |curr|.
        fontData = nextFontData;
        isSmallCaps = nextIsSmallCaps;
        int index = curr - cp;
        UChar c = *curr;

        // A mark following a small-caps character stays in the small-caps item and is
        // stored unchanged.
        bool forceSmallCaps = isSmallCaps && (U_GET_GC_MASK(c) & U_GC_M_MASK);
        nextFontData = m_font.glyphDataForCharacter(*curr, false, forceSmallCaps ? SmallCapsVariant : AutoVariant).fontData;
        if (m_font.isSmallCaps()) {
            nextIsSmallCaps = forceSmallCaps || (newC = u_toupper(c)) != c;
            if (nextIsSmallCaps)
                smallCapsBuffer[index] = forceSmallCaps ? c : newC;
        }

        if (m_fallbackFonts && nextFontData != fontData && fontData != m_font.primaryFont())
            m_fallbackFonts->add(fontData);

        // The font data or the small-caps state changes at |curr|: flush the item that
        // ends just before it.
        if (nextFontData != fontData || nextIsSmallCaps != isSmallCaps) {
            int itemStart = m_run.rtl() ? index + 1 : indexOfFontTransition;
            int itemLength = m_run.rtl() ? indexOfFontTransition - index : index - indexOfFontTransition;
            m_currentCharacter = baseCharacter + itemStart;
            itemizeShapeAndPlace((isSmallCaps ? smallCapsBuffer.data() : cp) + itemStart, itemLength, fontData, glyphBuffer);
            indexOfFontTransition = index;
        }
    }
    
    // Flush the last item, which runs from the last transition to the end of the walk.
    int itemLength = m_run.rtl() ? indexOfFontTransition + 1 : length - indexOfFontTransition;
    if (itemLength) {
        if (m_fallbackFonts && nextFontData != m_font.primaryFont())
            m_fallbackFonts->add(nextFontData);

        int itemStart = m_run.rtl() ? 0 : indexOfFontTransition;
        m_currentCharacter = baseCharacter + itemStart;
        itemizeShapeAndPlace((nextIsSmallCaps ? smallCapsBuffer.data() : cp) + itemStart, itemLength, nextFontData, glyphBuffer);
    }

    m_currentCharacter = baseCharacter + length;
}
Ejemplo n.º 30
0
// Returns the uppercase mapping of 'codePoint' as computed by u_toupper().
jint fastiva_vm_Character_C$__toUpperCaseImpl(jint codePoint) {
    const jint upperCased = u_toupper(codePoint);
    return upperCased;
}