int Unicode_CompareRange(const char *str1, // IN: UnicodeIndex str1Start, // IN: UnicodeIndex str1Length, // IN: const char *str2, // IN: UnicodeIndex str2Start, // IN: UnicodeIndex str2Length, // IN: Bool ignoreCase) // IN: { int result = -1; char *substr1 = NULL; char *substr2 = NULL; utf16_t *substr1UTF16 = NULL; utf16_t *substr2UTF16 = NULL; UnicodeIndex i = 0; UnicodeIndex utf16Index; utf16_t codeUnit1; utf16_t codeUnit2; uint32 codePoint1; uint32 codePoint2; /* * TODO: Allocating substrings is a performance hit. We should do this * search in-place. (However, searching UTF-8 requires tender loving * care, and it's just easier to search UTF-16.) */ substr1 = Unicode_Substr(str1, str1Start, str1Length); if (!substr1) { goto out; } substr2 = Unicode_Substr(str2, str2Start, str2Length); if (!substr2) { goto out; } /* * XXX TODO: Need to normalize the incoming strings to NFC or NFD. */ substr1UTF16 = Unicode_GetAllocUTF16(substr1); if (!substr1UTF16) { goto out; } substr2UTF16 = Unicode_GetAllocUTF16(substr2); if (!substr2UTF16) { goto out; } /* * TODO: This is the naive string search algorithm, which is O(n * m). We * can do better with KMP or Boyer-Moore if this proves to be a bottleneck. */ while (TRUE) { codeUnit1 = *(substr1UTF16 + i); codeUnit2 = *(substr2UTF16 + i); /* * TODO: Simple case folding doesn't handle the situation where more * than one code unit is needed to store the result of the case folding. * * This means that German "straBe" (where B = sharp S, U+00DF) will not * match "STRASSE", even though the two strings are the same. */ if (ignoreCase) { codeUnit1 = UnicodeSimpleCaseFold(codeUnit1); codeUnit2 = UnicodeSimpleCaseFold(codeUnit2); } if (codeUnit1 != codeUnit2) { break; } if (codeUnit1 == 0) { // End of both strings reached: strings are equal. result = 0; goto out; } i++; } /* * The two UTF-16 code units differ. 
If they're the first code unit of a * surrogate pair (for Unicode values past U+FFFF), decode the surrogate * pair into a full Unicode code point. */ if (U16_IS_SURROGATE(codeUnit1)) { ssize_t substrUTF16Len = Unicode_UTF16Strlen(substr1UTF16); // U16_NEXT modifies the index, so let it work on a copy. utf16Index = i; // Decode the surrogate if needed. U16_NEXT(substr1UTF16, utf16Index, substrUTF16Len, codePoint1); } else { // Not a surrogate? Then the code point value is the code unit. codePoint1 = codeUnit1; } if (U16_IS_SURROGATE(codeUnit2)) { ssize_t substrUTF16Len = Unicode_UTF16Strlen(substr2UTF16); utf16Index = i; U16_NEXT(substr2UTF16, utf16Index, substrUTF16Len, codePoint2); } else { codePoint2 = codeUnit2; } if (codePoint1 < codePoint2) { result = -1; } else if (codePoint1 > codePoint2) { result = 1; } else { // If we hit the end of the string, we've already gone to 'out'. NOT_REACHED(); } out: free(substr1UTF16); free(substr2UTF16); free(substr1); free(substr2); return result; }
/**
 * Run the checks enabled in the spoof checker (This->fChecks) against a
 * UTF-16 string.
 *
 * @param sc        The spoof checker; validated with SpoofImpl::validateThis().
 * @param text      The UTF-16 text to check.
 * @param length    Number of UChars in text, or -1 if text is NUL terminated.
 *                  Values below -1 set U_ILLEGAL_ARGUMENT_ERROR.
 * @param position  Optional out parameter.  Written only when at least one
 *                  check recorded a failure position; it then receives the
 *                  smallest position recorded by the character-limit and
 *                  invisible-mark checks (and whatever scriptScan() stored).
 * @param status    ICU error code, in/out.
 * @return          Bitwise OR of the USPOOF_* check flags that failed, or 0.
 */
U_CAPI int32_t U_EXPORT2
uspoof_check(const USpoofChecker *sc,
             const UChar *text, int32_t length,
             int32_t *position,
             UErrorCode *status) {

    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (This == NULL) {
        return 0;
    }
    if (length < -1) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    if (length == -1) {
        // It's not worth the bother to handle nul terminated strings everywhere.
        //   Just get the length and be done with it.
        length = u_strlen(text);
    }

    int32_t result = 0;
    // Sentinel meaning "no failure position recorded yet" (largest int32_t).
    int32_t failPos = 0x7fffffff;

    // A count of the number of non-Common or inherited scripts.
    // Needed for both the SINGLE_SCRIPT and the WHOLE/MIXED_SCRIPT_CONFUSABLE tests.
    // Share the computation when possible.  scriptCount == -1 means that we haven't
    // done it yet.
    int32_t scriptCount = -1;

    if ((This->fChecks) & USPOOF_SINGLE_SCRIPT) {
        // NOTE(review): scriptScan() receives failPos and may update it; its
        // exact contract is not visible from this function.
        scriptCount = This->scriptScan(text, length, failPos, *status);
        if (scriptCount >= 2) {
            // Note: scriptCount == 2 covers all cases of the number of scripts >= 2
            result |= USPOOF_SINGLE_SCRIPT;
        }
    }

    if (This->fChecks & USPOOF_CHAR_LIMIT) {
        int32_t i;
        UChar32 c;
        for (i=0; i<length ;) {
            U16_NEXT(text, i, length, c);
            if (!This->fAllowedCharsSet->contains(c)) {
                result |= USPOOF_CHAR_LIMIT;
                // Keep the earliest failure position seen so far.
                if (i < failPos) {
                    failPos = i;
                }
                break;
            }
        }
    }

    if (This->fChecks &
        (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
        // These are the checks that need to be done on NFD input
        NFDBuffer   normalizedInput(text, length, *status);
        const UChar *nfdText = normalizedInput.getBuffer();
        int32_t     nfdLength = normalizedInput.getLength();

        if (This->fChecks & USPOOF_INVISIBLE) {

            // scan for more than one occurrence of the same non-spacing mark
            // in a sequence of non-spacing marks.
            int32_t     i;
            UChar32     c;
            UChar32     firstNonspacingMark = 0;
            UBool       haveMultipleMarks = FALSE;
            UnicodeSet  marksSeenSoFar;   // Set of combining marks in a single combining sequence.

            for (i=0; i<nfdLength ;) {
                U16_NEXT(nfdText, i, nfdLength, c);
                if (u_charType(c) != U_NON_SPACING_MARK) {
                    // A non-mark ends the current combining sequence.
                    firstNonspacingMark = 0;
                    if (haveMultipleMarks) {
                        marksSeenSoFar.clear();
                        haveMultipleMarks = FALSE;
                    }
                    continue;
                }
                if (firstNonspacingMark == 0) {
                    // First mark of a sequence: remember it, but only build the
                    // set once a second mark shows up.
                    firstNonspacingMark = c;
                    continue;
                }
                if (!haveMultipleMarks) {
                    marksSeenSoFar.add(firstNonspacingMark);
                    haveMultipleMarks = TRUE;
                }
                if (marksSeenSoFar.contains(c)) {
                    // report the error, and stop scanning.
                    // No need to find more than the first failure.
                    result |= USPOOF_INVISIBLE;
                    // TODO: Bug 8655: i is a position in the NFD buffer, but what we want
                    //       to give back to our caller is a position in the original input string.
                    //       Until that is fixed, clamp it to the input length.
                    int32_t markPos = i;
                    if (markPos > length) {
                        markPos = length;
                    }
                    // Only lower failPos, exactly as the character-limit check does;
                    // an unconditional assignment here used to discard an earlier,
                    // smaller failure position.
                    if (markPos < failPos) {
                        failPos = markPos;
                    }
                    break;
                }
                marksSeenSoFar.add(c);
            }
        }

        if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
            // The basic test is the same for both whole and mixed script confusables.
            // Compute the set of scripts that every input character has a confusable in.
            // For this computation an input character is always considered to be
            //    confusable with itself in its own script.
            // If the number of such scripts is two or more, and the input consisted of
            //   characters all from a single script, we have a whole script confusable.
            //   (The two scripts will be the original script and the one that is confusable)
            // If the number of such scripts >= one, and the original input contained characters from
            //   more than one script, we have a mixed script confusable.  (We can transform
            //   some of the characters, and end up with a visually similar string all in
            //   one script.)

            if (scriptCount == -1) {
                // Script count not computed by the SINGLE_SCRIPT check; do it now.
                // The position output is not needed here.
                int32_t t;
                scriptCount = This->scriptScan(text, length, t, *status);
            }

            ScriptSet scripts;
            This->wholeScriptCheck(nfdText, nfdLength, &scripts, *status);
            int32_t confusableScriptCount = scripts.countMembers();

            if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
                confusableScriptCount >= 2 &&
                scriptCount == 1) {
                result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
            }

            if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
                confusableScriptCount >= 1 &&
                scriptCount > 1) {
                result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
            }
        }
    }

    // Report a position only if some check actually recorded one.
    if (position != NULL && failPos != 0x7fffffff) {
        *position = failPos;
    }
    return result;
}
/**
 * Compute the confusability "skeleton" of a UTF-16 string.
 *
 * The input is normalized to NFD, each code point is replaced through
 * confusableLookup() using the table selected by `type`, and the result is
 * normalized to NFD again if the replacement left it unnormalized.
 *
 * @param sc            The spoof checker; validated with SpoofImpl::validateThis().
 * @param type          0, USPOOF_SINGLE_SCRIPT_CONFUSABLE, USPOOF_ANY_CASE, or
 *                      both flags OR-ed together.  Anything else sets
 *                      U_ILLEGAL_ARGUMENT_ERROR.
 * @param s             Input text.
 * @param length        Length of s, or -1 (passed through to unorm_normalize()).
 * @param dest          Output buffer; may be NULL only when destCapacity is 0.
 * @param destCapacity  Capacity of dest in UChars; must be >= 0.
 * @param status        ICU error code, in/out.  Set to U_BUFFER_OVERFLOW_ERROR
 *                      when the skeleton does not fit, and to
 *                      U_STRING_NOT_TERMINATED_WARNING when it fits without
 *                      room for the terminating NUL.
 * @return              Length of the skeleton in UChars, or 0 on error.
 */
U_CAPI int32_t U_EXPORT2
uspoof_getSkeleton(const USpoofChecker *sc,
                   uint32_t type,
                   const UChar *s,  int32_t length,
                   UChar *dest, int32_t destCapacity,
                   UErrorCode *status) {

    // TODO:  this function could be sped up a bit
    //        Skip the input normalization when not needed, work from callers data.
    //        Put the initial skeleton straight into the caller's destination buffer.
    //        It probably won't need normalization.
    //        But these would make the structure more complicated.

    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (U_FAILURE(*status)) {
        return 0;
    }
    // A NULL dest with a positive capacity is rejected as well: the copy at the
    // end of this function would otherwise write through the NULL pointer.
    if (length<-1 || destCapacity<0 ||
        (destCapacity==0 && dest!=NULL) ||
        (destCapacity>0 && dest==NULL) ||
        (type & ~(USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE)) != 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // Select the confusable table that matches the requested skeleton type.
    int32_t tableMask = 0;
    switch (type) {
      case 0:
        tableMask = USPOOF_ML_TABLE_FLAG;
        break;
      case USPOOF_SINGLE_SCRIPT_CONFUSABLE:
        tableMask = USPOOF_SL_TABLE_FLAG;
        break;
      case USPOOF_ANY_CASE:
        tableMask = USPOOF_MA_TABLE_FLAG;
        break;
      case USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE:
        tableMask = USPOOF_SA_TABLE_FLAG;
        break;
      default:
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // NFD transform of the user supplied input.
    // Try a stack buffer first; fall back to the heap when it overflows.
    UChar nfdStackBuf[USPOOF_STACK_BUFFER_SIZE];
    UChar *nfdInput = nfdStackBuf;
    int32_t normalizedLen = unorm_normalize(
        s, length, UNORM_NFD, 0, nfdInput, USPOOF_STACK_BUFFER_SIZE, status);
    if (*status == U_BUFFER_OVERFLOW_ERROR) {
        nfdInput = (UChar *)uprv_malloc((normalizedLen+1)*sizeof(UChar));
        if (nfdInput == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }
        *status = U_ZERO_ERROR;
        normalizedLen = unorm_normalize(s, length, UNORM_NFD, 0,
                                        nfdInput, normalizedLen+1, status);
    }
    if (U_FAILURE(*status)) {
        if (nfdInput != nfdStackBuf) {
            uprv_free(nfdInput);
        }
        return 0;
    }

    // buffer to hold the Unicode defined skeleton mappings for a single code point
    UChar buf[USPOOF_MAX_SKELETON_EXPANSION];

    // Apply the skeleton mapping to the NFD normalized input string
    // Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
    int32_t inputIndex = 0;
    UnicodeString skelStr;
    while (inputIndex < normalizedLen) {
        UChar32 c;
        U16_NEXT(nfdInput, inputIndex, normalizedLen, c);
        int32_t replaceLen = This->confusableLookup(c, tableMask, buf);
        skelStr.append(buf, replaceLen);
    }

    if (nfdInput != nfdStackBuf) {
        uprv_free(nfdInput);
    }

    const UChar *result = skelStr.getBuffer();
    int32_t  resultLen  = skelStr.length();
    UChar   *normedResult = NULL;

    // Check the skeleton for NFD, normalize it if needed.
    // Unnormalized results should be very rare.
    if (!unorm_isNormalized(result, resultLen, UNORM_NFD, status)) {
        // FALSE is also what comes back when the check itself failed; do not
        // mistake that for "needs normalization".
        if (U_FAILURE(*status)) {
            return 0;
        }
        // Preflight: ask for the required length with an empty buffer.
        normalizedLen = unorm_normalize(result, resultLen, UNORM_NFD, 0, NULL, 0, status);
        // The preflight is expected to report a buffer overflow; clear only that.
        // Resetting the status unconditionally used to hide genuine errors.
        if (*status == U_BUFFER_OVERFLOW_ERROR) {
            *status = U_ZERO_ERROR;
        }
        if (U_FAILURE(*status)) {
            return 0;
        }
        normedResult = static_cast<UChar *>(uprv_malloc((normalizedLen+1)*sizeof(UChar)));
        if (normedResult == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }
        unorm_normalize(result, resultLen, UNORM_NFD, 0, normedResult, normalizedLen+1, status);
        result = normedResult;
        resultLen = normalizedLen;
    }

    // Copy the skeleton to the caller's buffer
    if (U_SUCCESS(*status)) {
        if (destCapacity == 0 || resultLen > destCapacity) {
            *status = resultLen>destCapacity ? U_BUFFER_OVERFLOW_ERROR
                                             : U_STRING_NOT_TERMINATED_WARNING;
        } else {
            u_memcpy(dest, result, resultLen);
            if (destCapacity > resultLen) {
                dest[resultLen] = 0;
            } else {
                *status = U_STRING_NOT_TERMINATED_WARNING;
            }
        }
    }
    uprv_free(normedResult);
    return resultLen;
}
static void TestNextPrevChar(){ static UChar input[]={0x0061, 0xd800, 0xdc00, 0xdbff, 0xdfff, 0x0062, 0xd841, 0xd7ff, 0xd841, 0xdc41, 0xdc00, 0x0000}; static UChar32 result[]={ /*next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns prev_safe_s*/ 0x0061, 0x0061, 0x0061, 0x0000, 0x0000, 0x0000, 0x10000, 0x10000, 0x10000, 0x120400, 0xdc00, UTF_ERROR_VALUE, 0xdc00, 0xdc00, UTF_ERROR_VALUE, 0x20441, 0x20441, 0x20441, 0x10ffff, 0x10ffff, 0x10ffff, 0xd841, 0xd841, UTF_ERROR_VALUE, 0xdfff, 0xdfff, UTF_ERROR_VALUE, 0xd7ff, 0xd7ff, 0xd7ff, 0x0062, 0x0062, 0x0062, 0xd841, 0xd841, UTF_ERROR_VALUE, 0x1ffff, 0xd841, UTF_ERROR_VALUE, 0x0062, 0x0062, 0x0062, 0xd7ff, 0xd7ff, 0xd7ff, 0x10ffff, 0x10ffff, 0x10ffff, 0x20441, 0x20441, 0x20441, 0xdbff, 0xdbff, UTF_ERROR_VALUE, 0xdc41, 0xdc41, UTF_ERROR_VALUE, 0x10000, 0x10000, 0x10000, 0xdc00, 0xdc00, UTF_ERROR_VALUE, 0xd800, 0xd800, UTF_ERROR_VALUE, 0x0000, 0x0000, 0x0000, 0x0061, 0x0061, 0x0061 }; static uint16_t movedOffset[]={ /*next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns prev_safe_s*/ 1, 1, 1, 11, 11, 11, 3, 3, 3, 9, 10 , 10, 3, 3, 3, 8, 8, 8, 5, 5, 4, 8, 8, 8, 5, 5, 5, 7, 7, 7, 6, 6, 6, 6, 6, 6, 8, 7, 7, 5, 5, 5, 8, 8, 8, 3, 3, 3, 10, 10, 10, 3, 3, 3, 10, 10, 10, 1, 1, 1, 11, 11, 11, 1, 1, 1, 12, 12, 12, 0, 0, 0, }; UChar32 c=0x0000; uint16_t i=0; uint16_t offset=0, setOffset=0; for(offset=0; offset<sizeof(input)/U_SIZEOF_UCHAR; offset++){ setOffset=offset; UTF16_NEXT_CHAR_UNSAFE(input, setOffset, c); if(setOffset != movedOffset[i]){ log_err("ERROR: UTF16_NEXT_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i], setOffset); } if(c != result[i]){ log_err("ERROR: UTF16_NEXT_CHAR_UNSAFE failed for offset=%ld. 
Expected:%lx Got:%lx\n", offset, result[i], c); } setOffset=offset; U16_NEXT_UNSAFE(input, setOffset, c); if(setOffset != movedOffset[i]){ log_err("ERROR: U16_NEXT_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i], setOffset); } if(c != result[i]){ log_err("ERROR: U16_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c); } setOffset=offset; UTF16_NEXT_CHAR_SAFE(input, setOffset, sizeof(input)/U_SIZEOF_UCHAR, c, FALSE); if(setOffset != movedOffset[i+1]){ log_err("ERROR: UTF16_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+1], setOffset); } if(c != result[i+1]){ log_err("ERROR: UTF16_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c); } setOffset=offset; U16_NEXT(input, setOffset, sizeof(input)/U_SIZEOF_UCHAR, c); if(setOffset != movedOffset[i+1]){ log_err("ERROR: U16_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+1], setOffset); } if(c != result[i+1]){ log_err("ERROR: U16_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c); } setOffset=offset; UTF16_NEXT_CHAR_SAFE(input, setOffset, sizeof(input)/U_SIZEOF_UCHAR, c, TRUE); if(setOffset != movedOffset[i+1]){ log_err("ERROR: UTF16_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+2], setOffset); } if(c != result[i+2]){ log_err("ERROR: UTF16_NEXT_CHAR_SAFE(strict) failed for input=%ld. 
Expected:%lx Got:%lx\n", offset, result[i+2], c); } i=(uint16_t)(i+6); } i=0; for(offset=(uint16_t)sizeof(input)/U_SIZEOF_UCHAR; offset > 0; --offset){ setOffset=offset; UTF16_PREV_CHAR_UNSAFE(input, setOffset, c); if(setOffset != movedOffset[i+3]){ log_err("ERROR: UTF16_PREV_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+3], setOffset); } if(c != result[i+3]){ log_err("ERROR: UTF16_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+3], c); } setOffset=offset; U16_PREV_UNSAFE(input, setOffset, c); if(setOffset != movedOffset[i+3]){ log_err("ERROR: U16_PREV_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+3], setOffset); } if(c != result[i+3]){ log_err("ERROR: U16_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+3], c); } setOffset=offset; UTF16_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE); if(setOffset != movedOffset[i+4]){ log_err("ERROR: UTF16_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+4], setOffset); } if(c != result[i+4]){ log_err("ERROR: UTF16_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c); } setOffset=offset; U16_PREV(input, 0, setOffset, c); if(setOffset != movedOffset[i+4]){ log_err("ERROR: U16_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+4], setOffset); } if(c != result[i+4]){ log_err("ERROR: U16_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c); } setOffset=offset; UTF16_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE); if(setOffset != movedOffset[i+5]){ log_err("ERROR: UTF16_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+5], setOffset); } if(c != result[i+5]){ log_err("ERROR: UTF16_PREV_CHAR_SAFE(strict) failed for input=%ld. 
Expected:%lx Got:%lx\n", offset, result[i+5], c); } i=(uint16_t)(i+6); } }
/**
 * Report whether code point c has the binary property `which`.
 *
 * binProps[which] supplies a (column, mask) pair:
 *  - mask != 0: the property is stored directly; the answer is whether any
 *    of the mask bits is set in u_getUnicodeProperties(c, column).
 *  - mask == 0: `column` names the data source (UPROPS_SRC_xxx) and the
 *    matching branch below computes the answer from that source.
 *
 * Every path that does not return explicitly falls through to the final
 * `return FALSE`: an out-of-range `which`, a property a branch does not
 * handle, a failed data load, and the normalization branches when
 * UCONFIG_NO_NORMALIZATION is set.
 */
U_CAPI UBool U_EXPORT2
u_hasBinaryProperty(UChar32 c, UProperty which) {
    /* c is range-checked in the functions that are called from here */
    if(which<UCHAR_BINARY_START || UCHAR_BINARY_LIMIT<=which) {
        /* not a known binary property */
    } else {
        uint32_t mask=binProps[which].mask;
        int32_t column=binProps[which].column;
        if(mask!=0) {
            /* systematic, directly stored properties */
            return (u_getUnicodeProperties(c, column)&mask)!=0;
        } else {
            /* mask==0: column is a UPROPS_SRC_xxx source selector */
            if(column==UPROPS_SRC_CASE) {
                return ucase_hasBinaryProperty(c, which);
            } else if(column==UPROPS_SRC_NORM) {
#if !UCONFIG_NO_NORMALIZATION
                /* normalization properties from unorm.icu */
                switch(which) {
                case UCHAR_SEGMENT_STARTER:
                    return unorm_isCanonSafeStart(c);
                default:
                    break;
                }
#endif
            } else if(column==UPROPS_SRC_NFC) {
#if !UCONFIG_NO_NORMALIZATION
                UErrorCode errorCode=U_ZERO_ERROR;
                switch(which) {
                case UCHAR_FULL_COMPOSITION_EXCLUSION: {
                    // By definition, Full_Composition_Exclusion is the same as NFC_QC=No.
                    const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
                    return U_SUCCESS(errorCode) && impl->isCompNo(impl->getNorm16(c));
                    break;  /* not reached: the return above always leaves the function */
                }
                default: {
                    // UCHAR_NF[CD]_INERT properties
                    // The normalization mode is derived from the property's
                    // distance to UCHAR_NFD_INERT, added to UNORM_NFD.
                    const Normalizer2 *norm2=Normalizer2Factory::getInstance(
                        (UNormalizationMode)(which-UCHAR_NFD_INERT+UNORM_NFD), errorCode);
                    return U_SUCCESS(errorCode) && norm2->isInert(c);
                }
                }
#endif
            } else if(column==UPROPS_SRC_NFKC) {
#if !UCONFIG_NO_NORMALIZATION
                // UCHAR_NFK[CD]_INERT properties
                UErrorCode errorCode=U_ZERO_ERROR;
                const Normalizer2 *norm2=Normalizer2Factory::getInstance(
                    (UNormalizationMode)(which-UCHAR_NFD_INERT+UNORM_NFD), errorCode);
                return U_SUCCESS(errorCode) && norm2->isInert(c);
#endif
            } else if(column==UPROPS_SRC_NFKC_CF) {
                // currently only for UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
#if !UCONFIG_NO_NORMALIZATION
                UErrorCode errorCode=U_ZERO_ERROR;
                const Normalizer2Impl *kcf=Normalizer2Factory::getNFKC_CFImpl(errorCode);
                if(U_SUCCESS(errorCode)) {
                    // Run c through the NFKC_CF data into dest and report
                    // whether the output differs from the one-code-point input.
                    UnicodeString src(c);
                    UnicodeString dest;
                    {
                        // The ReorderingBuffer must be in a block because its destructor
                        // needs to release dest's buffer before we look at its contents.
                        ReorderingBuffer buffer(*kcf, dest);
                        // Small destCapacity for NFKC_CF(c).
                        if(buffer.init(5, errorCode)) {
                            const UChar *srcArray=src.getBuffer();
                            kcf->compose(srcArray, srcArray+src.length(), FALSE,
                                          TRUE, buffer, errorCode);
                        }
                    }
                    return U_SUCCESS(errorCode) && dest!=src;
                }
#endif
            } else if(column==UPROPS_SRC_BIDI) {
                /* bidi/shaping properties */
                const UBiDiProps *bdp=GET_BIDI_PROPS();
                if(bdp!=NULL) {
                    switch(which) {
                    case UCHAR_BIDI_MIRRORED:
                        return ubidi_isMirrored(bdp, c);
                    case UCHAR_BIDI_CONTROL:
                        return ubidi_isBidiControl(bdp, c);
                    case UCHAR_JOIN_CONTROL:
                        return ubidi_isJoinControl(bdp, c);
                    default:
                        break;
                    }
                }
                /* else return FALSE below */
            } else if(column==UPROPS_SRC_CHAR) {
                /* POSIX-style character classes answered by the u_isxxx() functions */
                switch(which) {
                case UCHAR_POSIX_BLANK:
                    return u_isblank(c);
                case UCHAR_POSIX_GRAPH:
                    return u_isgraphPOSIX(c);
                case UCHAR_POSIX_PRINT:
                    return u_isprintPOSIX(c);
                case UCHAR_POSIX_XDIGIT:
                    return u_isxdigit(c);
                default:
                    break;
                }
            } else if(column==UPROPS_SRC_CHAR_AND_PROPSVEC) {
                switch(which) {
                case UCHAR_POSIX_ALNUM:
                    return u_isalnumPOSIX(c);
                default:
                    break;
                }
            } else if(column==UPROPS_SRC_CASE_AND_NORM) {
#if !UCONFIG_NO_NORMALIZATION
                UChar nfdBuffer[4];
                const UChar *nfd;
                int32_t nfdLength;
                UErrorCode errorCode=U_ZERO_ERROR;
                const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode);
                if(U_FAILURE(errorCode)) {
                    return FALSE;
                }
                switch(which) {
                case UCHAR_CHANGES_WHEN_CASEFOLDED:
                    /*
                     * Work on the decomposition of c when it has one.  After
                     * this if/else chain, c>=0 means "a single code point to
                     * fold" and c<0 (U_SENTINEL) means "fold the string
                     * nfd[0..nfdLength[ instead".
                     */
                    nfd=nfcImpl->getDecomposition(c, nfdBuffer, nfdLength);
                    if(nfd!=NULL) {
                        /* c has a decomposition */
                        if(nfdLength==1) {
                            c=nfd[0];  /* single BMP code point */
                        } else if(nfdLength<=U16_MAX_LENGTH) {
                            int32_t i=0;
                            U16_NEXT(nfd, i, nfdLength, c);
                            if(i==nfdLength) {
                                /* single supplementary code point */
                            } else {
                                c=U_SENTINEL;
                            }
                        } else {
                            c=U_SENTINEL;
                        }
                    } else if(c<0) {
                        return FALSE;  /* protect against bad input */
                    }
                    errorCode=U_ZERO_ERROR;
                    if(c>=0) {
                        /* single code point */
                        /* NOTE(review): errorCode from ucase_getSingleton() is not checked here. */
                        const UCaseProps *csp=ucase_getSingleton(&errorCode);
                        const UChar *resultString;
                        /* TRUE when ucase_toFullFolding() returns a non-negative value */
                        return (UBool)(ucase_toFullFolding(csp, c, &resultString, U_FOLD_CASE_DEFAULT)>=0);
                    } else {
                        /* guess some large but stack-friendly capacity */
                        UChar dest[2*UCASE_MAX_STRING_LENGTH];
                        int32_t destLength;
                        destLength=u_strFoldCase(dest, LENGTHOF(dest),
                                                  nfd, nfdLength,
                                                  U_FOLD_CASE_DEFAULT, &errorCode);
                        /* TRUE when folding succeeded and changed the decomposition */
                        return (UBool)(U_SUCCESS(errorCode) &&
                                       0!=u_strCompare(nfd, nfdLength,
                                                       dest, destLength, FALSE));
                    }
                default:
                    break;
                }
#endif
            }
        }
    }
    return FALSE;
}
/**
 * Expand the mapping stored in norms[value] by one level: every code point
 * of the mapping that itself has a mapping (or is a Hangul syllable) is
 * replaced by that mapping (or by its Hangul decomposition).
 *
 * start..end is the code point range that owns norms[value]; a mapping that
 * contains a code point from that range is reported as self-referential.
 * Data errors are printed to stderr and terminate the program with
 * exit(U_INVALID_FORMAT_ERROR).
 *
 * For a round-trip mapping, only the first code point of the mapping may
 * expand, its own mapping must be round-trip too, and the combining class of
 * the last code point of the inner mapping must not exceed that of the code
 * point following the starter in the outer mapping.
 *
 * @return TRUE if the mapping was replaced by an expanded one, FALSE if
 *         norms[value] has no mapping or nothing needed expanding.
 */
UBool Normalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) {
    if(!norms[value].hasMapping()) {
        return FALSE;
    }

    Norm &outer=norms[value];
    const UnicodeString &outerMapping=*outer.mapping;
    const UChar *units=outerMapping.getBuffer();
    int32_t unitCount=outerMapping.length();

    // Stays NULL until the first code point that needs expanding is found.
    UnicodeString *expanded=NULL;
    int32_t pos=0;
    while(pos<unitCount) {
        int32_t cpStart=pos;
        UChar32 cp;
        U16_NEXT(units, pos, unitCount, cp);

        if(start<=cp && cp<=end) {
            fprintf(stderr,
                    "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
                    (long)cp);
            exit(U_INVALID_FORMAT_ERROR);
        }

        const Norm &inner=getNormRef(cp);
        if(inner.hasMapping()) {
            if(outer.mappingType==Norm::ROUND_TRIP) {
                if(cpStart!=0) {
                    // Only the leading code point of a round-trip mapping may expand.
                    fprintf(stderr,
                            "gennorm2 error: "
                            "U+%04lX's round-trip mapping's non-starter "
                            "U+%04lX decomposes, "
                            "not possible in Unicode normalization\n",
                            (long)start, (long)cp);
                    exit(U_INVALID_FORMAT_ERROR);
                }
                if(inner.mappingType!=Norm::ROUND_TRIP) {
                    fprintf(stderr,
                            "gennorm2 error: "
                            "U+%04lX's round-trip mapping's starter "
                            "U+%04lX one-way-decomposes, "
                            "not possible in Unicode normalization\n",
                            (long)start, (long)cp);
                    exit(U_INVALID_FORMAT_ERROR);
                }
                // pos now indexes the code point that follows the starter.
                uint8_t outerFollowingCC=getCC(outerMapping.char32At(pos));
                UChar32 innerLast=inner.mapping->char32At(inner.mapping->length()-1);
                uint8_t innerTrailCC=getCC(innerLast);
                if(innerTrailCC>outerFollowingCC) {
                    fprintf(stderr,
                            "gennorm2 error: "
                            "U+%04lX's round-trip mapping's starter "
                            "U+%04lX decomposes and the "
                            "inner/earlier tccc=%hu > outer/following tccc=%hu, "
                            "not possible in Unicode normalization\n",
                            (long)start, (long)cp,
                            (short)innerTrailCC, (short)outerFollowingCC);
                    exit(U_INVALID_FORMAT_ERROR);
                }
            }
            if(expanded==NULL) {
                // Start the new string with everything that was copied as-is so far.
                expanded=new UnicodeString(outerMapping, 0, cpStart);
            }
            expanded->append(*inner.mapping);
        } else if(Hangul::isHangul(cp)) {
            UChar jamo[3];
            int32_t jamoCount=Hangul::decompose(cp, jamo);
            if(outer.mappingType==Norm::ROUND_TRIP && cpStart!=0) {
                fprintf(stderr,
                        "gennorm2 error: "
                        "U+%04lX's round-trip mapping's non-starter "
                        "U+%04lX decomposes, "
                        "not possible in Unicode normalization\n",
                        (long)start, (long)cp);
                exit(U_INVALID_FORMAT_ERROR);
            }
            if(expanded==NULL) {
                expanded=new UnicodeString(outerMapping, 0, cpStart);
            }
            expanded->append(jamo, jamoCount);
        } else if(expanded!=NULL) {
            // Unchanged code point: carry it over once expansion has begun.
            expanded->append(outerMapping, cpStart, pos-cpStart);
        }
    }

    if(expanded==NULL) {
        return FALSE;
    }

    if(outer.rawMapping==NULL) {
        // Remember the original mapping when decomposing recursively.
        outer.rawMapping=outer.mapping;
    } else {
        delete outer.mapping;
    }
    outer.mapping=expanded;
    // Not outer.setMappingCP(); because the original mapping
    // is most likely to be encodable as a delta.
    return TRUE;
}
int32_t NamePrepTransform::process( const UChar* src, int32_t srcLength, UChar* dest, int32_t destCapacity, UBool allowUnassigned, UParseError* parseError, UErrorCode& status ){ // check error status if(U_FAILURE(status)){ return 0; } //check arguments if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) { status=U_ILLEGAL_ARGUMENT_ERROR; return 0; } UnicodeString b1String; UChar *b1 = b1String.getBuffer(MAX_BUFFER_SIZE); int32_t b1Len; int32_t b1Index = 0; UCharDirection direction=U_CHAR_DIRECTION_COUNT, firstCharDir=U_CHAR_DIRECTION_COUNT; UBool leftToRight=FALSE, rightToLeft=FALSE; b1Len = map(src, srcLength, b1, b1String.getCapacity(), allowUnassigned, parseError, status); b1String.releaseBuffer(b1Len); if(status == U_BUFFER_OVERFLOW_ERROR){ // redo processing of string /* we do not have enough room so grow the buffer*/ b1 = b1String.getBuffer(b1Len); status = U_ZERO_ERROR; // reset error b1Len = map(src, srcLength, b1, b1String.getCapacity(), allowUnassigned, parseError, status); b1String.releaseBuffer(b1Len); } if(U_FAILURE(status)){ b1Len = 0; goto CLEANUP; } for(; b1Index<b1Len; ){ UChar32 ch = 0; U16_NEXT(b1, b1Index, b1Len, ch); if(prohibited.contains(ch) && ch!=0x0020){ status = U_IDNA_PROHIBITED_ERROR; b1Len = 0; goto CLEANUP; } direction = u_charDirection(ch); if(firstCharDir==U_CHAR_DIRECTION_COUNT){ firstCharDir = direction; } if(direction == U_LEFT_TO_RIGHT){ leftToRight = TRUE; } if(direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC){ rightToLeft = TRUE; } } // satisfy 2 if( leftToRight == TRUE && rightToLeft == TRUE){ status = U_IDNA_CHECK_BIDI_ERROR; b1Len = 0; goto CLEANUP; } //satisfy 3 if( rightToLeft == TRUE && !((firstCharDir == U_RIGHT_TO_LEFT || firstCharDir == U_RIGHT_TO_LEFT_ARABIC) && (direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC)) ){ status = U_IDNA_CHECK_BIDI_ERROR; return FALSE; } if(b1Len <= destCapacity){ u_memmove(dest, b1, b1Len); } CLEANUP: return u_terminateUChars(dest, 
destCapacity, b1Len, &status); }
static void TestTable32(void) { static const struct { const char *key; int32_t number; } testcases[]={ { "ooooooooooooooooo", 0 }, { "oooooooooooooooo1", 1 }, { "ooooooooooooooo1o", 2 }, { "oo11ooo1ooo11111o", 25150 }, { "oo11ooo1ooo111111", 25151 }, { "o1111111111111111", 65535 }, { "1oooooooooooooooo", 65536 }, { "1ooooooo11o11ooo1", 65969 }, { "1ooooooo11o11oo1o", 65970 }, { "1ooooooo111oo1111", 65999 } }; /* ### TODO UResourceBundle staticItem={ 0 }; - need to know the size */ UResourceBundle *res, *item; const UChar *s; const char *key; UErrorCode errorCode; int32_t i, j, number, parsedNumber, length, count; errorCode=U_ZERO_ERROR; res=ures_open(loadTestData(&errorCode), "testtable32", &errorCode); if(U_FAILURE(errorCode)) { log_data_err("unable to open testdata/testtable32.res - %s\n", u_errorName(errorCode)); return; } if(ures_getType(res)!=URES_TABLE) { log_data_err("testdata/testtable32.res has type %d instead of URES_TABLE\n", ures_getType(res)); } count=ures_getSize(res); if(count!=66000) { log_err("testdata/testtable32.res should have 66000 entries but has %d\n", count); } /* get the items by index */ item=NULL; for(i=0; i<count; ++i) { item=ures_getByIndex(res, i, item, &errorCode); if(U_FAILURE(errorCode)) { log_err("unable to get item %d of %d in testdata/testtable32.res - %s\n", i, count, u_errorName(errorCode)); break; } key=ures_getKey(item); parsedNumber=parseTable32Key(key); switch(ures_getType(item)) { case URES_STRING: s=ures_getString(item, &length, &errorCode); if(U_FAILURE(errorCode) || s==NULL) { log_err("unable to access the string \"%s\" at %d in testdata/testtable32.res - %s\n", key, i, u_errorName(errorCode)); number=-1; } else { j=0; U16_NEXT(s, j, length, number); } break; case URES_INT: number=ures_getInt(item, &errorCode); if(U_FAILURE(errorCode)) { log_err("unable to access the integer \"%s\" at %d in testdata/testtable32.res - %s\n", key, i, u_errorName(errorCode)); number=-1; } break; default: log_err("unexpected resource type 
%d for \"%s\" at %d in testdata/testtable32.res - %s\n", ures_getType(item), key, i, u_errorName(errorCode)); number=-1; break; } if(number>=0 && number!=parsedNumber) { log_err("\"%s\" at %d in testdata/testtable32.res has a string/int value of %d, expected %d\n", key, i, number, parsedNumber); } } /* search for some items by key */ for(i=0; i<LENGTHOF(testcases); ++i) { item=ures_getByKey(res, testcases[i].key, item, &errorCode); if(U_FAILURE(errorCode)) { log_err("unable to find the key \"%s\" in testdata/testtable32.res - %s\n", testcases[i].key, u_errorName(errorCode)); continue; } switch(ures_getType(item)) { case URES_STRING: s=ures_getString(item, &length, &errorCode); if(U_FAILURE(errorCode) || s==NULL) { log_err("unable to access the string \"%s\" in testdata/testtable32.res - %s\n", testcases[i].key, u_errorName(errorCode)); number=-1; } else { j=0; U16_NEXT(s, j, length, number); } break; case URES_INT: number=ures_getInt(item, &errorCode); if(U_FAILURE(errorCode)) { log_err("unable to access the integer \"%s\" in testdata/testtable32.res - %s\n", testcases[i].key, u_errorName(errorCode)); number=-1; } break; default: log_err("unexpected resource type %d for \"%s\" in testdata/testtable32.res - %s\n", ures_getType(item), testcases[i].key, u_errorName(errorCode)); number=-1; break; } if(number>=0 && number!=testcases[i].number) { log_err("\"%s\" in testdata/testtable32.res has a string/int value of %d, expected %d\n", testcases[i].key, number, testcases[i].number); } key=ures_getKey(item); if(0!=uprv_strcmp(key, testcases[i].key)) { log_err("\"%s\" in testdata/testtable32.res claims to have the key \"%s\"\n", testcases[i].key, key); } } ures_close(item); ures_close(res); }
/**
 * Titlecase src[0..srcLength[ into dest, one break-iterator segment at a time.
 *
 * Segment boundaries come from the BreakIterator stored in csm->iter.  Within
 * each segment the first cased letter is titlecased and the remainder is
 * lowercased, or copied unchanged when U_TITLECASE_NO_LOWERCASE is set.
 * Leading uncased characters are copied as-is unless
 * U_TITLECASE_NO_BREAK_ADJUSTMENT is set, in which case the first code point
 * of the segment is the one passed to ucase_toFullTitle().
 *
 * destIndex keeps counting past destCapacity (the copies are skipped once
 * they no longer fit), so the return value is the full output length; in
 * that case *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR.
 *
 * @return the output length in UChars, or 0 if *pErrorCode was already a
 *         failure on entry.
 */
U_CFUNC int32_t U_CALLCONV
ustrcase_internalToTitle(const UCaseMap *csm,
                         UChar *dest, int32_t destCapacity,
                         const UChar *src, int32_t srcLength,
                         UErrorCode *pErrorCode) {
    const UChar *s;
    UChar32 c;
    int32_t prev, titleStart, titleLimit, idx, destIndex, length;
    UBool isFirstIndex;

    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }

    // Use the C++ abstract base class to minimize dependencies.
    // TODO: Change UCaseMap.iter to store a BreakIterator directly.
    BreakIterator *bi=reinterpret_cast<BreakIterator *>(csm->iter);

    /* set up local variables */
    int32_t locCache=csm->locCache;
    UCaseContext csc=UCASECONTEXT_INITIALIZER;
    csc.p=(void *)src;
    csc.limit=srcLength;
    destIndex=0;
    prev=0;
    isFirstIndex=TRUE;

    /* titlecasing loop */
    while(prev<srcLength) {
        /* find next index where to titlecase */
        if(isFirstIndex) {
            isFirstIndex=FALSE;
            idx=bi->first();
        } else {
            idx=bi->next();
        }
        /* no further boundary, or one beyond the text: treat the rest as one segment */
        if(idx==UBRK_DONE || idx>srcLength) {
            idx=srcLength;
        }

        /*
         * Unicode 4 & 5 section 3.13 Default Case Operations:
         *
         * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
         * #29, "Text Boundaries." Between each pair of word boundaries, find the first
         * cased character F. If F exists, map F to default_title(F); then map each
         * subsequent character C to default_lower(C).
         *
         * In this implementation, segment [prev..index[ into 3 parts:
         * a) uncased characters (copy as-is) [prev..titleStart[
         * b) first case letter (titlecase)         [titleStart..titleLimit[
         * c) subsequent characters (lowercase)                 [titleLimit..index[
         */
        if(prev<idx) {
            /* find and copy uncased characters [prev..titleStart[ */
            titleStart=titleLimit=prev;
            /* c = first code point of the segment; titleLimit moves past it */
            U16_NEXT(src, titleLimit, idx, c);
            if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
                /* Adjust the titlecasing index (titleStart) to the next cased character. */
                for(;;) {
                    titleStart=titleLimit;
                    if(titleLimit==idx) {
                        /*
                         * only uncased characters in [prev..index[
                         * stop with titleStart==titleLimit==index
                         */
                        break;
                    }
                    U16_NEXT(src, titleLimit, idx, c);
                    if(UCASE_NONE!=ucase_getType(csm->csp, c)) {
                        break; /* cased letter at [titleStart..titleLimit[ */
                    }
                }
                /* copy the skipped uncased prefix; only count it if it does not fit */
                length=titleStart-prev;
                if(length>0) {
                    if((destIndex+length)<=destCapacity) {
                        uprv_memcpy(dest+destIndex, src+prev, length*U_SIZEOF_UCHAR);
                    }
                    destIndex+=length;
                }
            }

            /* empty when the segment held only uncased characters */
            if(titleStart<titleLimit) {
                /* titlecase c which is from [titleStart..titleLimit[ */
                csc.cpStart=titleStart;
                csc.cpLimit=titleLimit;
                c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, &csc, &s, csm->locale, &locCache);
                destIndex=appendResult(dest, destIndex, destCapacity, c, s);

                /* Special case Dutch IJ titlecasing */
                /*
                 * In a Dutch locale, when the titlecased letter is I/i and the next
                 * code unit inside the segment is J/j, also append U+004A and move
                 * titleLimit past that second letter so it is not lowercased below.
                 */
                if ( titleStart+1 < idx &&
                     ucase_getCaseLocale(csm->locale,&locCache) == UCASE_LOC_DUTCH &&
                     ( src[titleStart] == (UChar32) 0x0049 || src[titleStart] == (UChar32) 0x0069 ) &&
                     ( src[titleStart+1] == (UChar32) 0x004A || src[titleStart+1] == (UChar32) 0x006A )) {
                            c=(UChar32) 0x004A;
                            destIndex=appendResult(dest, destIndex, destCapacity, c, s);
                            titleLimit++;
                }

                /* lowercase [titleLimit..index[ */
                if(titleLimit<idx) {
                    if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
                        /* Normal operation: Lowercase the rest of the word. */
                        destIndex+=
                            _caseMap(
                                csm, ucase_toFullLower,
                                dest+destIndex, destCapacity-destIndex,
                                src, &csc,
                                titleLimit, idx,
                                pErrorCode);
                    } else {
                        /* Optionally just copy the rest of the word unchanged. */
                        length=idx-titleLimit;
                        if((destIndex+length)<=destCapacity) {
                            uprv_memcpy(dest+destIndex, src+titleLimit, length*U_SIZEOF_UCHAR);
                        }
                        destIndex+=length;
                    }
                }
            }
        }

        prev=idx;
    }

    /* destIndex is the length that would have been needed */
    if(destIndex>destCapacity) {
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }
    return destIndex;
}
/*
 * When we have UBIDI_OUTPUT_REVERSE set on ubidi_writeReordered(), then we
 * semantically write RTL runs in reverse and later reverse them again.
 * Instead, we actually write them in forward order to begin with.
 * However, if the RTL run was to be mirrored, we need to mirror here now
 * since the implicit second reversal must not do it.
 * It looks strange to do mirroring in LTR output, but it is only because
 * we are writing RTL output in reverse.
 *
 * Copies src[0..srcLength[ to dest in forward order, optionally removing
 * BiDi control characters (UBIDI_REMOVE_BIDI_CONTROLS) and/or replacing each
 * code point by its mirror image (UBIDI_DO_MIRRORING).
 *
 * Returns the number of UChars written.  When dest is too small,
 * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR and the return value is the
 * length that would have been needed.  An empty run (srcLength<=0) writes
 * nothing and returns 0.
 */
static int32_t
doWriteForward(const UChar *src, int32_t srcLength,
               UChar *dest, int32_t destSize,
               uint16_t options,
               UErrorCode *pErrorCode) {
    /*
     * Every branch below is a do/while loop that touches src[0] (and possibly
     * dest[0]) before it looks at the remaining length, so an empty run must
     * be handled up front to avoid reading and writing out of bounds.
     */
    if(srcLength<=0) {
        return 0;
    }

    /* optimize for several combinations of options */
    switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING)) {
    case 0: {
        /* simply copy the LTR run to the destination */
        int32_t length=srcLength;
        if(destSize<length) {
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            return srcLength;
        }
        do {
            *dest++=*src++;
        } while(--length>0);
        return srcLength;
    }
    case UBIDI_DO_MIRRORING: {
        /* do mirroring */
        int32_t i=0, j=0;
        UChar32 c;

        if(destSize<srcLength) {
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            return srcLength;
        }
        /* decode one code point at a time, mirror it, re-encode it */
        do {
            U16_NEXT(src, i, srcLength, c);
            c=u_charMirror(c);
            U16_APPEND_UNSAFE(dest, j, c);
        } while(i<srcLength);
        return srcLength;
    }
    case UBIDI_REMOVE_BIDI_CONTROLS: {
        /* copy the LTR run and remove any BiDi control characters */
        int32_t remaining=destSize;
        UChar c;
        do {
            c=*src++;
            if(!IS_BIDI_CONTROL_CHAR(c)) {
                if(--remaining<0) {
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

                    /* preflight the length */
                    while(--srcLength>0) {
                        c=*src++;
                        if(!IS_BIDI_CONTROL_CHAR(c)) {
                            --remaining;
                        }
                    }
                    /* remaining is negative here, so this is the needed length */
                    return destSize-remaining;
                }
                *dest++=c;
            }
        } while(--srcLength>0);
        return destSize-remaining;
    }
    default: {
        /* remove BiDi control characters and do mirroring */
        int32_t remaining=destSize;
        int32_t i, j=0;
        UChar32 c;
        do {
            /* decode the next code point; i is the number of units it used */
            i=0;
            U16_NEXT(src, i, srcLength, c);
            src+=i;
            srcLength-=i;
            if(!IS_BIDI_CONTROL_CHAR(c)) {
                remaining-=i;
                if(remaining<0) {
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

                    /* preflight the length */
                    while(srcLength>0) {
                        c=*src++;
                        if(!IS_BIDI_CONTROL_CHAR(c)) {
                            --remaining;
                        }
                        --srcLength;
                    }
                    return destSize-remaining;
                }
                c=u_charMirror(c);
                U16_APPEND_UNSAFE(dest, j, c);
            }
        } while(srcLength>0);
        return j;
    }
    } /* end of switch */
}
/*
** Advance the tokenizer cursor to the next token.
**
** On success the token text (converted to UTF-8 in pCsr->zBuffer), its
** byte length, its start/end offsets (looked up in pCsr->aOffset) and its
** position counter are written to the OUT parameters and SQLITE_OK is
** returned. SQLITE_DONE is returned once the break iterator is exhausted,
** SQLITE_NOMEM if the output buffer cannot be grown.
*/
static int icuNext(
  sqlite3_tokenizer_cursor *pCursor,
  const char **ppToken,
  int *pnBytes,
  int *piStartOffset,
  int *piEndOffset,
  int *piPosition
){
  IcuCursor *pCsr = (IcuCursor *)pCursor;
  int iStart = 0;
  int iEnd = 0;
  int nByte = 0;

  /* Walk break-iterator segments until one still has content after its
  ** leading whitespace has been skipped. */
  do{
    iStart = ubrk_current(pCsr->pIter);
    iEnd = ubrk_next(pCsr->pIter);
    if( iEnd==UBRK_DONE ) return SQLITE_DONE;

    while( iStart<iEnd ){
      UChar32 c;
      int iAfter = iStart;
      U16_NEXT(pCsr->aChar, iAfter, pCsr->nChar, c);
      if( !u_isspace(c) ) break;
      iStart = iAfter;
    }
    assert(iStart<=iEnd);
  }while( iStart==iEnd );

  /* Convert the segment to UTF-8. If the buffer turns out to be too small,
  ** u_strToUTF8() reports the size it needs in nByte; grow and retry. */
  for(;;){
    UErrorCode status = U_ZERO_ERROR;

    u_strToUTF8(
        pCsr->zBuffer, pCsr->nBuffer, &nByte,
        &pCsr->aChar[iStart], iEnd-iStart,
        &status
    );
    if( nByte<=pCsr->nBuffer ) break;

    if( nByte ){
      char *zGrown = sqlite3_realloc(pCsr->zBuffer, nByte);
      if( !zGrown ) return SQLITE_NOMEM;
      pCsr->zBuffer = zGrown;
      pCsr->nBuffer = nByte;
    }
  }

  *ppToken = pCsr->zBuffer;
  *pnBytes = nByte;
  *piStartOffset = pCsr->aOffset[iStart];
  *piEndOffset = pCsr->aOffset[iEnd];
  *piPosition = pCsr->iToken++;

  return SQLITE_OK;
}
/**
 * Greek string uppercasing with a state machine.
 * Probably simpler than a stateless function that has to figure out complex context-before
 * for each character.
 * TODO: Try to re-consolidate one way or another with the non-Greek function.
 *
 * @param csm          case map; csm->csp and csm->locale are passed on to the ucase_* helpers
 * @param dest         output buffer
 * @param destCapacity capacity of dest in UChars
 * @param src          input text
 * @param srcLength    number of UChars in src
 * @param pErrorCode   set to U_INDEX_OUTOFBOUNDS_ERROR when an append helper returns a
 *                     negative index, or to U_BUFFER_OVERFLOW_ERROR when the final
 *                     output length exceeds destCapacity
 * @return the output length (destIndex), or 0 after an append helper failed
 */
int32_t toUpper(const UCaseMap *csm,
                UChar *dest, int32_t destCapacity,
                const UChar *src, int32_t srcLength,
                UErrorCode *pErrorCode) {
    int32_t locCache = UCASE_LOC_GREEK;
    int32_t destIndex=0;
    // Flags describing what preceded the current character (AFTER_CASED, AFTER_VOWEL_WITH_ACCENT).
    uint32_t state = 0;
    // i is advanced at the bottom of the loop body (i = nextIndex).
    for (int32_t i = 0; i < srcLength;) {
        int32_t nextIndex = i;
        UChar32 c;
        U16_NEXT(src, nextIndex, srcLength, c);
        uint32_t nextState = 0;
        int32_t type = ucase_getTypeOrIgnorable(csm->csp, c);
        if ((type & UCASE_IGNORABLE) != 0) {
            // c is case-ignorable
            nextState |= (state & AFTER_CASED);
        } else if (type != UCASE_NONE) {
            // c is cased
            nextState |= AFTER_CASED;
        }
        uint32_t data = getLetterData(c);
        if (data > 0) {
            // Non-zero letter data: handle c with the Greek-specific rules below.
            uint32_t upper = data & UPPER_MASK;
            // Add a dialytika to this iota or ypsilon vowel
            // if we removed a tonos from the previous vowel,
            // and that previous vowel did not also have (or gain) a dialytika.
            // Adding one only to the final vowel in a longer sequence
            // (which does not occur in normal writing) would require lookahead.
            // Set the same flag as for preserving an existing dialytika.
            if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
                    (upper == 0x399 || upper == 0x3A5)) {
                data |= HAS_DIALYTIKA;
            }
            int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
            if ((data & HAS_YPOGEGRAMMENI) != 0) {
                numYpogegrammeni = 1;
            }
            // Skip combining diacritics after this Greek letter.
            // Their flags are merged into data; nextIndex moves past each one.
            while (nextIndex < srcLength) {
                uint32_t diacriticData = getDiacriticData(src[nextIndex]);
                if (diacriticData != 0) {
                    data |= diacriticData;
                    if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
                        ++numYpogegrammeni;
                    }
                    ++nextIndex;
                } else {
                    break;  // not a Greek diacritic
                }
            }
            // Remember an accented vowel without dialytika for the next iteration.
            if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
                nextState |= AFTER_VOWEL_WITH_ACCENT;
            }
            // Map according to Greek rules.
            UBool addTonos = FALSE;
            if (upper == 0x397 &&
                    (data & HAS_ACCENT) != 0 &&
                    numYpogegrammeni == 0 &&
                    (state & AFTER_CASED) == 0 &&
                    !isFollowedByCasedLetter(csm->csp, src, nextIndex, srcLength)) {
                // Keep disjunctive "or" with (only) a tonos.
                // We use the same "word boundary" conditions as for the Final_Sigma test.
                // NOTE(review): U16_NEXT above has already moved nextIndex past c, so
                // i == nextIndex looks like it can never hold here - confirm the intent.
                if (i == nextIndex) {
                    upper = 0x389;  // Preserve the precomposed form.
                } else {
                    addTonos = TRUE;
                }
            } else if ((data & HAS_DIALYTIKA) != 0) {
                // Preserve a vowel with dialytika in precomposed form if it exists.
                if (upper == 0x399) {
                    upper = 0x3AA;
                    data &= ~HAS_EITHER_DIALYTIKA;
                } else if (upper == 0x3A5) {
                    upper = 0x3AB;
                    data &= ~HAS_EITHER_DIALYTIKA;
                }
            }
            // Emit the base letter, then the marks. Each step is skipped once
            // destIndex has gone negative.
            destIndex=appendUChar(dest, destIndex, destCapacity, (UChar)upper);
            if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) {
                destIndex=appendUChar(dest, destIndex, destCapacity, 0x308);  // restore or add a dialytika
            }
            if (destIndex >= 0 && addTonos) {
                destIndex=appendUChar(dest, destIndex, destCapacity, 0x301);
            }
            while (destIndex >= 0 && numYpogegrammeni > 0) {
                destIndex=appendUChar(dest, destIndex, destCapacity, 0x399);
                --numYpogegrammeni;
            }
            if(destIndex<0) {
                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
        } else {
            // No Greek letter data: use the general full-uppercase mapping.
            const UChar *s;
            UChar32 c2 = 0;
            c=ucase_toFullUpper(csm->csp, c, NULL, NULL, &s, csm->locale, &locCache);
            if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
                /* fast path version of appendResult() for BMP results */
                dest[destIndex++]=(UChar)c2;
            } else {
                destIndex=appendResult(dest, destIndex, destCapacity, c, s);
                if(destIndex<0) {
                    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                    return 0;
                }
            }
        }
        i = nextIndex;
        state = nextState;
    }

    if(destIndex>destCapacity) {
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }
    return destIndex;
}
// FIXME: // - Handle 'Inherited', 'Common' and 'Unknown' // (see http://www.unicode.org/reports/tr24/#Usage_Model ) // For 'Inherited' and 'Common', perhaps we need to // accept another parameter indicating the previous family // and just return it. // - All the characters (or characters up to the point a single // font can cover) need to be taken into account const UChar* getFallbackFamily(const UChar* characters, int length, FontDescription::GenericFamilyType generic, UChar32* charChecked, UScriptCode* scriptChecked) { ASSERT(characters && characters[0] && length > 0); UScriptCode script = USCRIPT_COMMON; // Sometimes characters common to script (e.g. space) is at // the beginning of a string so that we need to skip them // to get a font required to render the string. int i = 0; UChar32 ucs4 = 0; while (i < length && script == USCRIPT_COMMON || script == USCRIPT_INVALID_CODE) { U16_NEXT(characters, i, length, ucs4); UErrorCode err = U_ZERO_ERROR; script = uscript_getScript(ucs4, &err); // silently ignore the error } // For the full-width ASCII characters (U+FF00 - U+FF5E), use the font for // Han (determined in a locale-dependent way above). Full-width ASCII // characters are rather widely used in Japanese and Chinese documents and // they're fully covered by Chinese, Japanese and Korean fonts. if (0xFF00 < ucs4 && ucs4 < 0xFF5F) script = USCRIPT_HAN; // There are a lot of characters in USCRIPT_COMMON that can be covered // by fonts for scripts closely related to them. 
See // http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Script=Common:] // FIXME: make this more efficient with a wider coverage if (script == USCRIPT_COMMON || script == USCRIPT_INHERITED) { UBlockCode block = ublock_getCode(ucs4); switch (block) { case UBLOCK_BASIC_LATIN: script = USCRIPT_LATIN; break; case UBLOCK_CJK_SYMBOLS_AND_PUNCTUATION: script = USCRIPT_HAN; break; case UBLOCK_HIRAGANA: case UBLOCK_KATAKANA: script = USCRIPT_HIRAGANA; break; case UBLOCK_ARABIC: script = USCRIPT_ARABIC; break; case UBLOCK_GREEK: script = USCRIPT_GREEK; break; case UBLOCK_DEVANAGARI: // For Danda and Double Danda (U+0964, U+0965), use a Devanagari // font for now although they're used by other scripts as well. // Without a context, we can't do any better. script = USCRIPT_DEVANAGARI; break; case UBLOCK_ARMENIAN: script = USCRIPT_ARMENIAN; break; case UBLOCK_GEORGIAN: script = USCRIPT_GEORGIAN; break; case UBLOCK_KANNADA: script = USCRIPT_KANNADA; break; } } // Another lame work-around to cover non-BMP characters. const UChar* family = getFontFamilyForScript(script, generic); if (!family) { int plane = ucs4 >> 16; switch (plane) { case 1: family = L"code2001"; break; case 2: family = L"simsun-extb"; break; default: family = L"lucida sans unicode"; } }
static void demoCaseMapInC() { /* * input= * "aB<capital sigma>" * "iI<small dotless i><capital dotted I> " * "<sharp s> <small lig. ffi>" * "<small final sigma><small sigma><capital sigma>" */ static const UChar input[]={ 0x61, 0x42, 0x3a3, 0x69, 0x49, 0x131, 0x130, 0x20, 0xdf, 0x20, 0xfb03, 0x3c2, 0x3c3, 0x3a3, 0 }; UChar buffer[32]; UErrorCode errorCode; UChar32 c; int32_t i, j, length; UBool isError; printf("\n* demoCaseMapInC() ----------------- ***\n\n"); /* * First, use simple case mapping functions which provide * 1:1 code point mappings without context/locale ID. * * Note that some mappings will not be "right" because some "real" * case mappings require context, depend on the locale ID, * and/or result in a change in the number of code points. */ printUString("input string: ", input, -1); /* uppercase */ isError=FALSE; for(i=j=0; j<UPRV_LENGTHOF(buffer) && !isError; /* U16_NEXT post-increments */) { U16_NEXT(input, i, INT32_MAX, c); /* without length because NUL-terminated */ if(c==0) { break; /* stop at terminating NUL, no need to terminate buffer */ } c=u_toupper(c); U16_APPEND(buffer, j, UPRV_LENGTHOF(buffer), c, isError); } printUString("simple-uppercased: ", buffer, j); /* lowercase */ isError=FALSE; for(i=j=0; j<UPRV_LENGTHOF(buffer) && !isError; /* U16_NEXT post-increments */) { U16_NEXT(input, i, INT32_MAX, c); /* without length because NUL-terminated */ if(c==0) { break; /* stop at terminating NUL, no need to terminate buffer */ } c=u_tolower(c); U16_APPEND(buffer, j, UPRV_LENGTHOF(buffer), c, isError); } printUString("simple-lowercased: ", buffer, j); /* titlecase */ isError=FALSE; for(i=j=0; j<UPRV_LENGTHOF(buffer) && !isError; /* U16_NEXT post-increments */) { U16_NEXT(input, i, INT32_MAX, c); /* without length because NUL-terminated */ if(c==0) { break; /* stop at terminating NUL, no need to terminate buffer */ } c=u_totitle(c); U16_APPEND(buffer, j, UPRV_LENGTHOF(buffer), c, isError); } printUString("simple-titlecased: ", buffer, j); /* 
case-fold/default */ isError=FALSE; for(i=j=0; j<UPRV_LENGTHOF(buffer) && !isError; /* U16_NEXT post-increments */) { U16_NEXT(input, i, INT32_MAX, c); /* without length because NUL-terminated */ if(c==0) { break; /* stop at terminating NUL, no need to terminate buffer */ } c=u_foldCase(c, U_FOLD_CASE_DEFAULT); U16_APPEND(buffer, j, UPRV_LENGTHOF(buffer), c, isError); } printUString("simple-case-folded/default: ", buffer, j); /* case-fold/Turkic */ isError=FALSE; for(i=j=0; j<UPRV_LENGTHOF(buffer) && !isError; /* U16_NEXT post-increments */) { U16_NEXT(input, i, INT32_MAX, c); /* without length because NUL-terminated */ if(c==0) { break; /* stop at terminating NUL, no need to terminate buffer */ } c=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I); U16_APPEND(buffer, j, UPRV_LENGTHOF(buffer), c, isError); } printUString("simple-case-folded/Turkic: ", buffer, j); /* * Second, use full case mapping functions which provide * 1:n code point mappings (n can be 0!) and are sensitive to context and locale ID. * * Note that lower/upper/titlecasing take a locale ID while case-folding * has bit flag options instead, by design of the Unicode SpecialCasing.txt UCD file. * * Also, string titlecasing requires a BreakIterator to find starts of words. * The sample code here passes in a NULL pointer; u_strToTitle() will open and close a default * titlecasing BreakIterator automatically. * For production code where many strings are titlecased it would be more efficient * to open a BreakIterator externally and pass it in. 
*/ printUString("\ninput string: ", input, -1); /* lowercase/English */ errorCode=U_ZERO_ERROR; length=u_strToLower(buffer, UPRV_LENGTHOF(buffer), input, -1, "en", &errorCode); if(U_SUCCESS(errorCode)) { printUString("full-lowercased/en: ", buffer, length); } else { printf("error in u_strToLower(en)=%ld error=%s\n", length, u_errorName(errorCode)); } /* lowercase/Turkish */ errorCode=U_ZERO_ERROR; length=u_strToLower(buffer, UPRV_LENGTHOF(buffer), input, -1, "tr", &errorCode); if(U_SUCCESS(errorCode)) { printUString("full-lowercased/tr: ", buffer, length); } else { printf("error in u_strToLower(tr)=%ld error=%s\n", length, u_errorName(errorCode)); } /* uppercase/English */ errorCode=U_ZERO_ERROR; length=u_strToUpper(buffer, UPRV_LENGTHOF(buffer), input, -1, "en", &errorCode); if(U_SUCCESS(errorCode)) { printUString("full-uppercased/en: ", buffer, length); } else { printf("error in u_strToUpper(en)=%ld error=%s\n", length, u_errorName(errorCode)); } /* uppercase/Turkish */ errorCode=U_ZERO_ERROR; length=u_strToUpper(buffer, UPRV_LENGTHOF(buffer), input, -1, "tr", &errorCode); if(U_SUCCESS(errorCode)) { printUString("full-uppercased/tr: ", buffer, length); } else { printf("error in u_strToUpper(tr)=%ld error=%s\n", length, u_errorName(errorCode)); } /* titlecase/English */ errorCode=U_ZERO_ERROR; length=u_strToTitle(buffer, UPRV_LENGTHOF(buffer), input, -1, NULL, "en", &errorCode); if(U_SUCCESS(errorCode)) { printUString("full-titlecased/en: ", buffer, length); } else { printf("error in u_strToTitle(en)=%ld error=%s\n", length, u_errorName(errorCode)); } /* titlecase/Turkish */ errorCode=U_ZERO_ERROR; length=u_strToTitle(buffer, UPRV_LENGTHOF(buffer), input, -1, NULL, "tr", &errorCode); if(U_SUCCESS(errorCode)) { printUString("full-titlecased/tr: ", buffer, length); } else { printf("error in u_strToTitle(tr)=%ld error=%s\n", length, u_errorName(errorCode)); } /* case-fold/default */ errorCode=U_ZERO_ERROR; length=u_strFoldCase(buffer, UPRV_LENGTHOF(buffer), input, 
-1, U_FOLD_CASE_DEFAULT, &errorCode); if(U_SUCCESS(errorCode)) { printUString("full-case-folded/default: ", buffer, length); } else { printf("error in u_strFoldCase(default)=%ld error=%s\n", length, u_errorName(errorCode)); } /* case-fold/Turkic */ errorCode=U_ZERO_ERROR; length=u_strFoldCase(buffer, UPRV_LENGTHOF(buffer), input, -1, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode); if(U_SUCCESS(errorCode)) { printUString("full-case-folded/Turkic: ", buffer, length); } else { printf("error in u_strFoldCase(Turkic)=%ld error=%s\n", length, u_errorName(errorCode)); } }
/*
** Produce the next token from an ICU tokenizer cursor.
**
** Returns SQLITE_OK with all OUT parameters filled in, SQLITE_DONE when
** the break iterator has no further boundaries, or SQLITE_NOMEM when the
** UTF-8 output buffer could not be enlarged.
*/
static int icuNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Tokenizer cursor (used as an IcuCursor) */
  const char **ppToken,               /* OUT: *ppToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  IcuCursor *pCsr = (IcuCursor *)pCursor;

  int iStart = 0;                     /* First UChar of the candidate token */
  int iEnd = 0;                       /* One past its last UChar */
  int nByte = 0;                      /* UTF-8 size reported by u_strToUTF8() */

  /* Find the next segment that is not empty once leading white-space
  ** characters have been dropped from it. */
  do{
    iStart = ubrk_current(pCsr->pIter);
    iEnd = ubrk_next(pCsr->pIter);
    if( iEnd==UBRK_DONE ){
      return SQLITE_DONE;
    }

    for(;;){
      UChar32 c;
      int iPastChar = iStart;
      if( iStart>=iEnd ) break;
      U16_NEXT(pCsr->aChar, iPastChar, pCsr->nChar, c);
      if( !u_isspace(c) ) break;
      iStart = iPastChar;
    }
    assert(iStart<=iEnd);
  }while( iStart==iEnd );

  /* First attempt uses the buffer as it is. Every further attempt first
  ** resizes the buffer to the size requested by the previous attempt. */
  for(;;){
    UErrorCode status = U_ZERO_ERROR;

    u_strToUTF8(
        pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
        &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
        &status                                  /* Output success/failure */
    );
    if( !(nByte>pCsr->nBuffer) ){
      break;
    }

    if( nByte ){
      char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
      if( !zNew ){
        return SQLITE_NOMEM;
      }
      pCsr->zBuffer = zNew;
      pCsr->nBuffer = nByte;
    }
  }

  *ppToken = pCsr->zBuffer;
  *pnBytes = nByte;
  *piStartOffset = pCsr->aOffset[iStart];
  *piEndOffset = pCsr->aOffset[iEnd];
  *piPosition = pCsr->iToken++;

  return SQLITE_OK;
}
/**
 * Checks whether the NFD decomposition of comp can be found, in order, among
 * the code points of segment starting at segmentPos (other code points may
 * be interleaved, i.e. canonical rearrangement is allowed).
 * If so, the leftover code points form the remainder and its equivalents
 * are returned.
 *
 * @param fillinResult table that receives the result and is returned on success
 * @param comp         the composite code point to try
 * @param segment      the segment text
 * @param segLen       length of segment in UChars
 * @param segmentPos   index in segment at which matching starts
 * @param status       ICU error code; nothing is done if it already indicates failure
 * @return fillinResult or the result of getEquivalents2() on success,
 *         NULL on failure or when there is no match
 */
Hashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, const UChar *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }

    // temp starts out as comp and collects the unmatched segment code points after it.
    UnicodeString temp(comp);
    const int32_t inputLen = temp.length();

    UnicodeString decompString;
    nfd.normalize(temp, decompString, status);
    if (U_FAILURE(status)) {
        return NULL;
    }
    if (decompString.isBogus()) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    const UChar *decomp = decompString.getBuffer();
    const int32_t decompLen = decompString.length();

    // Fetch the first decomposition code point that has to be matched.
    int32_t decompPos = 0;
    UChar32 wanted;
    U16_NEXT(decomp, decompPos, decompLen, wanted);

    UBool matchedAll = FALSE;
    int32_t segIndex = segmentPos;
    while (segIndex < segLen) {
        UChar32 segCp;
        U16_NEXT(segment, segIndex, segLen, segCp);

        if (segCp != wanted) {
            // Brute force approach: keep the non-matching code point as remainder.
            // TODO: optimize. Since the combining classes are monotonically
            // increasing after a zero (e.g. 0 5 7 9 0 3), only a few cases can
            // work; if the decomp class <= the segment class we could fail early.
            temp.append(segCp);
            continue;
        }

        if (decompPos == decompLen) {
            // Every decomposition code point was found; the rest of the
            // segment goes into the remainder unchanged.
            temp.append(segment+segIndex, segLen-segIndex);
            matchedAll = TRUE;
            break;
        }
        U16_NEXT(decomp, decompPos, decompLen, wanted);
    }

    if (!matchedAll) {
        return NULL; // decomposition code points left over
    }

    if (inputLen == temp.length()) {
        // Match with an empty remainder.
        fillinResult->put(UnicodeString(), new UnicodeString(), status);
        return fillinResult;
    }

    // Brute force approach:
    // make sure that comp + remainder is canonically equivalent to the segment.
    UnicodeString trial;
    nfd.normalize(temp, trial, status);
    if(U_FAILURE(status) || trial.compare(segment+segmentPos, segLen - segmentPos) != 0) {
        return NULL;
    }

    return getEquivalents2(fillinResult, temp.getBuffer()+inputLen, temp.length()-inputLen, status);
}
/**
 * Normalizes the text between offsets.start and offsets.limit, one small
 * chunk at a time.
 *
 * Each chunk is produced by unorm_next(); if the chunk needed normalizing it
 * is written back with text.handleReplaceBetween() and offsets.limit and
 * offsets.contextLimit are shifted by the resulting length change.
 * offsets.start is set to the position where processing stopped.
 *
 * @param text          the text being transliterated; modified in place
 * @param offsets       in/out range to process
 * @param isIncremental if true, a chunk that reaches the input limit is left
 *                      alone unless every code point of its normalized form
 *                      passes unorm_isNFSkippable()
 */
void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                                      UBool isIncremental) const {
    // start and limit of the input range
    int32_t start = offsets.start;
    int32_t limit = offsets.limit;
    int32_t length, delta;

    if(start >= limit) {
        return;
    }

    // a C code unit iterator, implemented around the Replaceable
    UCharIterator iter;
    uiter_setReplaceable(&iter, &text);

    // the output string and buffer pointer
    UnicodeString output;
    UChar *buffer;
    UBool neededToNormalize;

    UErrorCode errorCode;

    /*
     * Normalize as short chunks at a time as possible even in
     * bulk mode, so that styled text is minimally disrupted.
     * In incremental mode, a chunk that ends with offsets.limit
     * must not be normalized.
     *
     * If it was known that the input text is not styled, then
     * a bulk mode normalization could look like this:
     *

    UChar staticChars[256];
    UnicodeString input;

    length = limit - start;
    input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias

    _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
    input.releaseBuffer(length);

    UErrorCode status = U_ZERO_ERROR;
    Normalizer::normalize(input, fMode, options, output, status);

    text.handleReplaceBetween(start, limit, output);

    int32_t delta = output.length() - length;
    offsets.contextLimit += delta;
    offsets.limit += delta;
    offsets.start = limit + delta;

     *
     */
    while(start < limit) {
        // set the iterator limits for the remaining input range
        // this is a moving target because of the replacements in the text object
        iter.start = iter.index = start;
        iter.limit = limit;

        // incrementally normalize a small chunk of the input
        // NOTE(review): buffer is used without a NULL check - confirm that
        // getBuffer() cannot fail here.
        buffer = output.getBuffer(-1);
        errorCode = U_ZERO_ERROR;
        length = unorm_next(&iter, buffer, output.getCapacity(),
                            fMode, 0,
                            TRUE, &neededToNormalize,
                            &errorCode);
        // on failure the output string is reset to empty
        output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);

        if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
            // use a larger output string buffer and do it again from the start
            // (length holds the value returned by the failed call and is used
            // as the new buffer size)
            iter.index = start;
            buffer = output.getBuffer(length);
            errorCode = U_ZERO_ERROR;
            length = unorm_next(&iter, buffer, output.getCapacity(),
                                fMode, 0,
                                TRUE, &neededToNormalize,
                                &errorCode);
            output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
        }

        if(U_FAILURE(errorCode)) {
            break;
        }

        // the chunk just processed is [start, iter.index)
        limit = iter.index;
        if(isIncremental && limit == iter.limit) {
            // stop in incremental mode when we reach the input limit
            // in case there are additional characters that could change the
            // normalization result

            // UNLESS all characters in the result of the normalization of
            // the last run are in the skippable set
            const UChar *s=output.getBuffer();
            int32_t i=0, outLength=output.length();
            UChar32 c;

            while(i<outLength) {
                U16_NEXT(s, i, outLength, c);
                if(!unorm_isNFSkippable(c, fMode)) {
                    // outLength doubles as the "found a non-skippable character" flag
                    outLength=-1; // I wish C++ had labeled loops and break outer; ...
                    break;
                }
            }
            if (outLength<0) {
                break;
            }
        }

        if(neededToNormalize) {
            // replace the input chunk with its normalized form
            text.handleReplaceBetween(start, limit, output);

            // update all necessary indexes accordingly
            // (the order of the next three statements matters: limit is first
            // the adjusted chunk end, then the adjusted end of the whole range)
            delta = length - (limit - start);   // length change in the text object
            start = limit += delta;             // the next chunk starts where this one ends, with adjustment
            limit = offsets.limit += delta;     // set the iteration limit to the adjusted end of the input range
            offsets.contextLimit += delta;
        } else {
            // delta == 0
            start = limit;
            limit = offsets.limit;
        }
    }

    // report how far processing got
    offsets.start = start;
}
bool HarfBuzzShaper::collectHarfBuzzRuns() { const UChar* normalizedBufferEnd = m_normalizedBuffer.get() + m_normalizedBufferLength; SurrogatePairAwareTextIterator iterator(m_normalizedBuffer.get(), 0, m_normalizedBufferLength, m_normalizedBufferLength); UChar32 character; unsigned clusterLength = 0; unsigned startIndexOfCurrentRun = 0; if (!iterator.consume(character, clusterLength)) return false; const SimpleFontData* nextFontData = m_font->glyphDataForCharacter(character, false).fontData; UErrorCode errorCode = U_ZERO_ERROR; UScriptCode nextScript = uscript_getScript(character, &errorCode); if (U_FAILURE(errorCode)) return false; do { const UChar* currentCharacterPosition = iterator.characters(); const SimpleFontData* currentFontData = nextFontData; UScriptCode currentScript = nextScript; for (iterator.advance(clusterLength); iterator.consume(character, clusterLength); iterator.advance(clusterLength)) { if (Font::treatAsZeroWidthSpace(character)) continue; if (U_GET_GC_MASK(character) & U_GC_M_MASK) { int markLength = clusterLength; const UChar* markCharactersEnd = iterator.characters() + clusterLength; while (markCharactersEnd < normalizedBufferEnd) { UChar32 nextCharacter; int nextCharacterLength = 0; U16_NEXT(markCharactersEnd, nextCharacterLength, normalizedBufferEnd - markCharactersEnd, nextCharacter); if (!(U_GET_GC_MASK(nextCharacter) & U_GC_M_MASK)) break; markLength += nextCharacterLength; markCharactersEnd += nextCharacterLength; } if (currentFontData->canRenderCombiningCharacterSequence(currentCharacterPosition, markCharactersEnd - currentCharacterPosition)) { clusterLength = markLength; continue; } nextFontData = m_font->glyphDataForCharacter(character, false).fontData; } else nextFontData = m_font->glyphDataForCharacter(character, false).fontData; nextScript = uscript_getScript(character, &errorCode); if (U_FAILURE(errorCode)) return false; if ((nextFontData != currentFontData) || ((currentScript != nextScript) && (nextScript != USCRIPT_INHERITED) 
&& (!uscript_hasScript(character, currentScript)))) break; if (nextScript == USCRIPT_INHERITED) nextScript = currentScript; currentCharacterPosition = iterator.characters(); } unsigned numCharactersOfCurrentRun = iterator.currentCharacter() - startIndexOfCurrentRun; hb_script_t script = hb_icu_script_to_script(currentScript); m_harfBuzzRuns.append(HarfBuzzRun::create(currentFontData, startIndexOfCurrentRun, numCharactersOfCurrentRun, m_run.direction(), script)); currentFontData = nextFontData; startIndexOfCurrentRun = iterator.currentCharacter(); } while (iterator.consume(character, clusterLength)); return !m_harfBuzzRuns.isEmpty(); }
/*
 * Scan a NUL-terminated string code point by code point and test each one
 * for membership in the NUL-terminated matchSet.
 *
 * polarity!=0: return the index of the first code point that IS in matchSet.
 * polarity==0: return the index of the first code point that is NOT in it.
 * If no such code point exists, return -(string length)-1.
 */
static int32_t
_matchFromSet(const UChar *string, const UChar *matchSet, UBool polarity) {
    int32_t bmpPrefixLen, setLen, pos, k;
    UChar32 cp, setCp;
    UChar unit, trail;
    UBool found;

    /* leading part of matchSet: single (non-surrogate) code units only */
    bmpPrefixLen = 0;
    while ((unit = matchSet[bmpPrefixLen]) != 0 && U16_IS_SINGLE(unit)) {
        ++bmpPrefixLen;
    }

    /* the rest may mix BMP and supplementary code points */
    setLen = bmpPrefixLen;
    while (matchSet[setLen] != 0) {
        ++setLen;
    }

    pos = 0;
    while ((unit = string[pos]) != 0) {
        int32_t cpStart = pos++;

        found = FALSE;
        if (U16_IS_SINGLE(unit)) {
            /* a single code unit is compared against every unit of the set */
            for (k = 0; k < setLen; ++k) {
                if (unit == matchSet[k]) {
                    found = TRUE;
                    break;
                }
            }
        } else {
            /*
             * Reading string[pos] is safe without a length check:
             * at worst it is the terminating NUL, which is not a trail.
             */
            if (U16_IS_SURROGATE_LEAD(unit) && U16_IS_TRAIL(trail = string[pos])) {
                ++pos;
                cp = U16_GET_SUPPLEMENTARY(unit, trail);
            } else {
                cp = unit; /* unpaired surrogate */
            }

            /* surrogates can only occur after the BMP-only prefix of the set */
            for (k = bmpPrefixLen; k < setLen;) {
                U16_NEXT(matchSet, k, setLen, setCp);
                if (cp == setCp) {
                    found = TRUE;
                    break;
                }
            }
        }

        if (polarity ? found : !found) {
            return cpStart;
        }
    }

    /* Didn't find it. */
    return -pos - 1;
}
/*
 * Exercises the UTF-16 macros with length -1 (NUL-terminated input):
 * U16_NEXT, U16_FWD_1 and U16_FWD_N in a first pass over the string,
 * then U16_GET and U16_SET_CP_LIMIT at every code unit in a second pass.
 * Mismatches are reported through log_err().
 *
 * keep this in sync with utf8tst.c's TestNulTerminated()
 */
static void TestNulTerminated() {
    static const UChar input[]={
        /*  0 */  0x61,
        /*  1 */  0xd801, 0xdc01,
        /*  3 */  0xdc01,
        /*  4 */  0x62,
        /*  5 */  0xd801,
        /*  6 */  0x00
        /*  7 */
    };
    /* the code points that U16_NEXT is expected to deliver, in order */
    static const UChar32 result[]={
        0x61,
        0x10401,
        0xdc01,
        0x62,
        0xd801,
        0
    };

    UChar32 c, c2;
    int32_t i0, i, j, k, expectedIndex;
    int32_t cpIndex;

    /* Pass 1: forward iteration, one code point per round, through the NUL. */
    i=0;
    cpIndex=0;
    for(;;) {
        i0=i;
        U16_NEXT(input, i, -1, c);
        if(c!=result[cpIndex]) {
            log_err("U16_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, result[cpIndex]);
        }

        j=i0;
        U16_FWD_1(input, j, -1);
        if(j!=i) {
            log_err("U16_FWD_1() moved to index %d but U16_NEXT() moved to %d\n", j, i);
        }

        ++cpIndex;
        /*
         * Move by this many code points from the start.
         * U16_FWD_N() stops at the end of the string, that is, at the NUL if necessary.
         */
        if(c==0) {
            expectedIndex=i-1;
        } else {
            expectedIndex=i;
        }
        k=0;
        U16_FWD_N(input, k, -1, cpIndex);
        if(k!=expectedIndex) {
            log_err("U16_FWD_N(code points from 0) moved to index %d but expected %d\n", k, expectedIndex);
        }

        if(c==0) {
            break;
        }
    }

    /* Pass 2: random access at each code unit of each code point. */
    i=0;
    for(;;) {
        i0=i;
        j=i0;
        U16_NEXT(input, i, -1, c);
        do {
            U16_GET(input, 0, j, -1, c2);
            if(c2!=c) {
                log_err("U16_NEXT(from %d)=U+%04x != U+%04x=U16_GET(at %d)\n", i0, c, c2, j);
            }
            /* U16_SET_CP_LIMIT moves from a non-lead byte to the limit of the code point */
            k=j+1;
            U16_SET_CP_LIMIT(input, 0, k, -1);
            if(k!=i) {
                log_err("U16_NEXT() moved to %d but U16_SET_CP_LIMIT(%d) moved to %d\n", i, j+1, k);
            }
        } while(++j<i);

        if(c==0) {
            break;
        }
    }
}