static void TestSerialized() { uint16_t buffer[1000]; USerializedSet sset; USet *set; UErrorCode errorCode; UChar32 c; int32_t length; /* use a pattern that generates both BMP and supplementary code points */ U_STRING_DECL(pattern, "[:Cf:]", 6); U_STRING_INIT(pattern, "[:Cf:]", 6); errorCode=U_ZERO_ERROR; set=uset_openPattern(pattern, -1, &errorCode); if(U_FAILURE(errorCode)) { log_data_err("uset_openPattern([:Cf:]) failed - %s (Are you missing data?)\n", u_errorName(errorCode)); return; } length=uset_serialize(set, buffer, UPRV_LENGTHOF(buffer), &errorCode); if(U_FAILURE(errorCode)) { log_err("unable to uset_serialize([:Cf:]) - %s\n", u_errorName(errorCode)); uset_close(set); return; } uset_getSerializedSet(&sset, buffer, length); for(c=0; c<=0x10ffff; ++c) { if(uset_contains(set, c)!=uset_serializedContains(&sset, c)) { log_err("uset_contains(U+%04x)!=uset_serializedContains(U+%04x)\n", c); break; } } uset_close(set); }
U_CFUNC UBool U_EXPORT2 unorm_isCanonSafeStart(UChar32 c) { #if UNORM_HARDCODE_DATA if(auxTrie.index!=NULL) { #else UErrorCode errorCode=U_ZERO_ERROR; if(_haveData(errorCode) && auxTrie.index!=NULL) { #endif uint16_t aux=UTRIE2_GET16(&auxTrie, c); return (UBool)((aux&_NORM_AUX_UNSAFE_MASK)==0); } else { return FALSE; } } U_CAPI UBool U_EXPORT2 unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet) { #if !UNORM_HARDCODE_DATA UErrorCode errorCode=U_ZERO_ERROR; #endif if( fillSet!=NULL && (uint32_t)c<=0x10ffff && #if !UNORM_HARDCODE_DATA _haveData(errorCode) && #endif canonStartSets!=NULL ) { const uint16_t *table; int32_t i, start, limit; /* * binary search for c * * There are two search tables, * one for BMP code points and one for supplementary ones. * See unormimp.h for details. */ if(c<=0xffff) { table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]; start=0; limit=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]; /* each entry is a pair { c, result } */ while(start<limit-2) { i=(uint16_t)(((start+limit)/4)*2); /* (start+limit)/2 and address pairs */ if(c<table[i]) { limit=i; } else { start=i; } } /* found? */ if(c==table[start]) { i=table[start+1]; if((i&_NORM_CANON_SET_BMP_MASK)==_NORM_CANON_SET_BMP_IS_INDEX) { /* result 01xxxxxx xxxxxx contains index x to a USerializedSet */ i&=(_NORM_MAX_CANON_SETS-1); return uset_getSerializedSet(fillSet, canonStartSets+i, canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i); } else { /* other result values are BMP code points for single-code point sets */ uset_setSerializedToOne(fillSet, (UChar32)i); return TRUE; } } } else { uint16_t high, low, h; table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]+ canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]; start=0; limit=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]; high=(uint16_t)(c>>16); low=(uint16_t)c; /* each entry is a triplet { high(c), low(c), result } */ while(start<limit-3) { i=(uint16_t)(((start+limit)/6)*3); /* (start+limit)/2 and address triplets */ h=table[i]&0x1f; /* high word */ if(high<h || (high==h && low<table[i+1])) { limit=i; } else { start=i; } } /* found? */ h=table[start]; if(high==(h&0x1f) && low==table[start+1]) { i=table[start+2]; if((h&0x8000)==0) { /* the result is an index to a USerializedSet */ return uset_getSerializedSet(fillSet, canonStartSets+i, canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i); } else { /* * single-code point set {x} in * triplet { 100xxxxx 000hhhhh llllllll llllllll xxxxxxxx xxxxxxxx } */ i|=((int32_t)h&0x1f00)<<8; /* add high bits from high(c) */ uset_setSerializedToOne(fillSet, (UChar32)i); return TRUE; } } } } return FALSE; /* not found */ }
// --------------------------------------------------------------------------- // RangeToken: Getter methods // --------------------------------------------------------------------------- RangeToken* RangeToken::getCaseInsensitiveToken(TokenFactory* const tokFactory) { if (fCaseIToken == 0 && tokFactory && fRanges) { bool isNRange = (getTokenType() == T_NRANGE) ? true : false; RangeToken* lwrToken = tokFactory->createRange(isNRange); #if XERCES_USE_TRANSCODER_ICU && ((U_ICU_VERSION_MAJOR_NUM > 2) || (U_ICU_VERSION_MAJOR_NUM == 2 && U_ICU_VERSION_MINOR_NUM >=4)) UChar* rangeStr=(UChar*)fMemoryManager->allocate(40*fElemCount*sizeof(UChar)); ArrayJanitor<UChar> janRange(rangeStr, fMemoryManager); int c=0; rangeStr[c++] = chOpenSquare; for (unsigned int i = 0; i < fElemCount - 1; i += 2) { XMLCh buffer[10]; XMLSize_t len, j; rangeStr[c++] = chBackSlash; rangeStr[c++] = chLatin_U; XMLString::binToText(fRanges[i], buffer, 10, 16, fMemoryManager); len = XMLString::stringLen(buffer); for(j=0;j<(8-len);j++) rangeStr[c++] = chDigit_0; XMLCh* p=buffer; while(*p) rangeStr[c++] = *p++; if(fRanges[i+1]!=fRanges[i]) { rangeStr[c++] = chDash; rangeStr[c++] = chBackSlash; rangeStr[c++] = chLatin_U; XMLString::binToText(fRanges[i+1], buffer, 10, 16, fMemoryManager); len = XMLString::stringLen(buffer); for(j=0;j<(8-len);j++) rangeStr[c++] = chDigit_0; p=buffer; while(*p) rangeStr[c++] = *p++; } } rangeStr[c++] = chCloseSquare; rangeStr[c++] = chNull; UErrorCode ec=U_ZERO_ERROR; USet* range=uset_openPatternOptions(rangeStr, -1, USET_CASE_INSENSITIVE, &ec); if(range) { ec = U_ZERO_ERROR; uint32_t cbCount=uset_serialize(range, NULL, 0, &ec); uint16_t* buffer=(uint16_t*)fMemoryManager->allocate(cbCount*sizeof(uint16_t)); ArrayJanitor<uint16_t> janSet(buffer, fMemoryManager); ec = U_ZERO_ERROR; uset_serialize(range, buffer, cbCount, &ec); USerializedSet serializedSet; uset_getSerializedSet(&serializedSet, buffer, cbCount); int32_t nSets=uset_getSerializedRangeCount(&serializedSet); for(int32_t i=0; i<nSets; i++) { UChar32 start, end; uset_getSerializedRange(&serializedSet, i, &start, &end); lwrToken->addRange(start, end); } // does this release the memory allocated by the set? uset_setSerializedToOne(&serializedSet, 32); uset_close(range); } #else unsigned int exceptIndex = 0; for (unsigned int i = 0; i < fElemCount - 1; i += 2) { for (XMLInt32 ch = fRanges[i]; ch <= fRanges[i + 1]; ++ch) { #if XERCES_USE_TRANSCODER_ICU const XMLInt32 upperCh = u_toupper(ch); if (upperCh != ch) { lwrToken->addRange(upperCh, upperCh); } const XMLInt32 lowerCh = u_tolower(ch); if (lowerCh != ch) { lwrToken->addRange(lowerCh, lowerCh); } const XMLInt32 titleCh = u_totitle(ch); if (titleCh != ch && titleCh != upperCh) { lwrToken->addRange(titleCh, titleCh); } #else if (ch >= chLatin_A && ch <= chLatin_Z) { ch += chLatin_a - chLatin_A; lwrToken->addRange(ch, ch); } else if (ch >= chLatin_a && ch <= chLatin_z) { ch -= chLatin_a - chLatin_A; lwrToken->addRange(ch, ch); } #endif const unsigned int exceptionsSize = sizeof(s_exceptions) / sizeof(s_exceptions[0]); // Add any exception chars. These are characters where the the // case mapping is not symmetric. (Unicode case mappings are not isomorphic...) while (exceptIndex < exceptionsSize) { if (s_exceptions[exceptIndex].baseChar < ch) { ++exceptIndex; } else if (s_exceptions[exceptIndex].baseChar == ch) { const XMLInt32 matchingChar = s_exceptions[exceptIndex].matchingChar; lwrToken->addRange( matchingChar, matchingChar); ++exceptIndex; } else { break; } } } } lwrToken->mergeRanges(this); #endif lwrToken->compactRanges(); lwrToken->createMap(); fCaseIToken = lwrToken; // TODO(dbertoni) This is a temporary hack until we can change the ABI. // See Jira issue XERCESC-1866 for more details. // Overload the fCaseIToken data member to be the case-insensitive token // that's caching the case-insensitive one. We need this because tokens // have varying lifetimes. fCaseIToken->setCaseInsensitiveToken(this); } return fCaseIToken; }