// SpoofData::initPtrs() // Initialize the pointers to the various sections of the raw data. // // This function is used both during the Trie building process (multiple // times, as the individual data sections are added), and // during the opening of a Spoof Checker from prebuilt data. // // The pointers for non-existent data sections (identified by an offset of 0) // are set to NULL. // // Note: During building the data, adding each new data section // reallocs the raw data area, which likely relocates it, which // in turn requires reinitializing all of the pointers into it, hence // multiple calls to this function during building. // void SpoofData::initPtrs(UErrorCode &status) { fCFUKeys = NULL; fCFUValues = NULL; fCFUStringLengths = NULL; fCFUStrings = NULL; if (U_FAILURE(status)) { return; } if (fRawData->fCFUKeys != 0) { fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys); } if (fRawData->fCFUStringIndex != 0) { fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex); } if (fRawData->fCFUStringLengths != 0) { fCFUStringLengths = (SpoofStringLengthsElement *)((char *)fRawData + fRawData->fCFUStringLengths); } if (fRawData->fCFUStringTable != 0) { fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable); } if (fAnyCaseTrie == NULL && fRawData->fAnyCaseTrie != 0) { fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, (char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status); } if (fLowerCaseTrie == NULL && fRawData->fLowerCaseTrie != 0) { fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, (char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status); } if (fRawData->fScriptSets != 0) { fScriptSets = (ScriptSet *)((char *)fRawData + fRawData->fScriptSets); } }
void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) { if (U_FAILURE(status)) { return; } fHeader = data; if (fHeader->fMagic != 0xb1a0 || !isDataVersionAcceptable(fHeader->fFormatVersion)) { status = U_INVALID_FORMAT_ERROR; return; } // Note: in ICU version 3.2 and earlier, there was a formatVersion 1 // that is no longer supported. At that time fFormatVersion was // an int32_t field, rather than an array of 4 bytes. fDontFreeData = FALSE; if (data->fFTableLen != 0) { fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable); } if (data->fRTableLen != 0) { fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable); } fTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, (uint8_t *)data + fHeader->fTrie, fHeader->fTrieLen, NULL, // *actual length &status); if (U_FAILURE(status)) { return; } fRuleSource = (UChar *)((char *)data + fHeader->fRuleSource); fRuleString.setTo(TRUE, fRuleSource, -1); U_ASSERT(data->fRuleSourceLen > 0); fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable); fStatusMaxIdx = data->fStatusTableLen / sizeof(int32_t); fRefCount = 1; #ifdef RBBI_DEBUG char *debugEnv = getenv("U_RBBIDEBUG"); if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();} #endif }
void LoadedNormalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return; } memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode); if(U_FAILURE(errorCode)) { return; } const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory); const int32_t *inIndexes=(const int32_t *)inBytes; int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4; if(indexesLength<=IX_MIN_MAYBE_YES) { errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes. return; } int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET]; int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; ownedTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, inBytes+offset, nextOffset-offset, NULL, &errorCode); if(U_FAILURE(errorCode)) { return; } offset=nextOffset; nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; const uint16_t *inExtraData=(const uint16_t *)(inBytes+offset); // smallFCD: new in formatVersion 2 offset=nextOffset; const uint8_t *inSmallFCD=inBytes+offset; init(inIndexes, ownedTrie, inExtraData, inSmallFCD); }
/*
 * unserialize a selector
 *
 * buffer  Serialized selector; must be non-NULL and have its low two address bits clear.
 * length  Number of bytes at buffer; must be positive.
 * status  ICU error code; nothing is done if it already indicates failure.
 *
 * Returns a newly allocated selector, or NULL with *status set on error.
 * When the data has to be swapped, the selector refers to a private swapped copy
 * (sel->swapped); otherwise it refers directly into the caller's buffer.
 */
U_CAPI UConverterSelector* U_EXPORT2
ucnvsel_openFromSerialized(const void* buffer, int32_t length, UErrorCode* status) {
  // check if already failed
  if (U_FAILURE(*status)) {
    return NULL;
  }

  // ensure args make sense!
  const uint8_t *cursor = (const uint8_t *)buffer;
  if (length <= 0 || cursor == NULL || U_POINTER_MASK_LSB(cursor, 3) != 0) {
    *status = U_ILLEGAL_ARGUMENT_ERROR;
    return NULL;
  }

  // header
  if (length < 32) {
    // not even enough space for a minimal header
    *status = U_INDEX_OUTOFBOUNDS_ERROR;
    return NULL;
  }
  const DataHeader *header = (const DataHeader *)cursor;
  if (header->dataHeader.magic1 != 0xda ||
      header->dataHeader.magic2 != 0x27 ||
      header->info.dataFormat[0] != 0x43 ||
      header->info.dataFormat[1] != 0x53 ||
      header->info.dataFormat[2] != 0x65 ||
      header->info.dataFormat[3] != 0x6c) {
    /* header not valid or dataFormat not recognized */
    *status = U_INVALID_FORMAT_ERROR;
    return NULL;
  }
  if (header->info.formatVersion[0] != 1) {
    *status = U_UNSUPPORTED_ERROR;
    return NULL;
  }

  uint8_t *swapped = NULL;
  const bool needsSwap =
      header->info.isBigEndian != U_IS_BIG_ENDIAN ||
      header->info.charsetFamily != U_CHARSET_FAMILY;
  if (needsSwap) {
    // swap the data
    UDataSwapper *swapper =
      udata_openSwapperForInputData(cursor, length, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, status);
    // First pass (length -1, no output buffer) only reports the required size.
    int32_t swappedSize = ucnvsel_swap(swapper, cursor, -1, NULL, status);
    if (U_FAILURE(*status)) {
      udata_closeSwapper(swapper);
      return NULL;
    }
    if (length < swappedSize) {
      udata_closeSwapper(swapper);
      *status = U_INDEX_OUTOFBOUNDS_ERROR;
      return NULL;
    }
    swapped = (uint8_t*)uprv_malloc(swappedSize);
    if (swapped == NULL) {
      udata_closeSwapper(swapper);
      *status = U_MEMORY_ALLOCATION_ERROR;
      return NULL;
    }
    // Second pass writes the swapped copy.
    ucnvsel_swap(swapper, cursor, length, swapped, status);
    udata_closeSwapper(swapper);
    if (U_FAILURE(*status)) {
      uprv_free(swapped);
      return NULL;
    }
    // From here on, read from the swapped copy.
    cursor = swapped;
    header = (const DataHeader *)cursor;
  }

  if (length < (header->dataHeader.headerSize + 16 * 4)) {
    // not even enough space for the header and the indexes
    uprv_free(swapped);
    *status = U_INDEX_OUTOFBOUNDS_ERROR;
    return NULL;
  }
  cursor += header->dataHeader.headerSize;
  length -= header->dataHeader.headerSize;

  // indexes
  const int32_t *indexes = (const int32_t *)cursor;
  if (length < indexes[UCNVSEL_INDEX_SIZE]) {
    uprv_free(swapped);
    *status = U_INDEX_OUTOFBOUNDS_ERROR;
    return NULL;
  }
  cursor += UCNVSEL_INDEX_COUNT * 4;

  // create and populate the selector object
  UConverterSelector* selector = (UConverterSelector*)uprv_malloc(sizeof(UConverterSelector));
  char **nameTable =
    (char **)uprv_malloc(
      indexes[UCNVSEL_INDEX_NAMES_COUNT] * sizeof(char *));
  if (selector == NULL || nameTable == NULL) {
    uprv_free(swapped);
    uprv_free(selector);
    uprv_free(nameTable);
    *status = U_MEMORY_ALLOCATION_ERROR;
    return NULL;
  }
  uprv_memset(selector, 0, sizeof(UConverterSelector));
  selector->pvCount = indexes[UCNVSEL_INDEX_PV_COUNT];
  selector->encodings = nameTable;
  selector->encodingsCount = indexes[UCNVSEL_INDEX_NAMES_COUNT];
  selector->encodingStrLength = indexes[UCNVSEL_INDEX_NAMES_LENGTH];
  selector->swapped = swapped;

  // trie
  selector->trie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
                                             cursor, indexes[UCNVSEL_INDEX_TRIE_SIZE], NULL,
                                             status);
  cursor += indexes[UCNVSEL_INDEX_TRIE_SIZE];
  if (U_FAILURE(*status)) {
    // The selector already holds nameTable and swapped, so ucnvsel_close() is
    // handed the whole object on this path.
    ucnvsel_close(selector);
    return NULL;
  }

  // bit vectors
  selector->pv = (uint32_t *)cursor;
  cursor += selector->pvCount * 4;

  // encoding names: consecutive NUL-terminated strings
  char *nameCursor = (char *)cursor;
  for (int32_t n = 0; n < selector->encodingsCount; ++n) {
    nameTable[n] = nameCursor;
    nameCursor += uprv_strlen(nameCursor) + 1;
  }
  cursor += selector->encodingStrLength;

  return selector;
}
// Parse serialized collation data and point the tailoring at its parts.
//
//   base       Base tailoring, or NULL. When non-NULL, inBytes must start with a
//              DataHeader, which is validated and then skipped, and the UCA versions
//              of base and tailoring must match.
//   inBytes    Start of the serialized data.
//   inLength   Number of bytes at inBytes. Every length check below is guarded by
//              0 <= inLength, so a negative value skips those checks.
//   tailoring  Receives the trie and the pointers into inBytes.
//   errorCode  ICU error code; nothing is done if it already indicates failure.
//
// NOTE(review): the definition is cut off in this excerpt after the root-elements
// section; the remainder of the function (and its closing brace) is not visible here.
void
CollationDataReader::read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength,
                          CollationTailoring &tailoring, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    if(base != NULL) {
        // With a base, the input carries a data header in front of the indexes.
        if(inBytes == NULL || (0 <= inLength && inLength < 24)) {
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        const DataHeader *header = reinterpret_cast<const DataHeader *>(inBytes);
        if(!(header->dataHeader.magic1 == 0xda && header->dataHeader.magic2 == 0x27 &&
                isAcceptable(tailoring.version, NULL, NULL, &header->info))) {
            errorCode = U_INVALID_FORMAT_ERROR;
            return;
        }
        if(base->getUCAVersion() != tailoring.getUCAVersion()) {
            errorCode = U_COLLATOR_VERSION_MISMATCH;
            return;
        }
        // Skip the header; all offsets below are relative to the indexes.
        int32_t headerLength = header->dataHeader.headerSize;
        inBytes += headerLength;
        if(inLength >= 0) {
            inLength -= headerLength;
        }
    }

    if(inBytes == NULL || (0 <= inLength && inLength < 8)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    const int32_t *inIndexes = reinterpret_cast<const int32_t *>(inBytes);
    int32_t indexesLength = inIndexes[IX_INDEXES_LENGTH];
    if(indexesLength < 2 || (0 <= inLength && inLength < indexesLength * 4)) {
        errorCode = U_INVALID_FORMAT_ERROR;  // Not enough indexes.
        return;
    }

    // Assume that the tailoring data is in initial state,
    // with NULL pointers and 0 lengths.

    // Set pointers to non-empty data parts.
    // Do this in order of their byte offsets. (Should help porting to Java.)

    int32_t index;  // one of the indexes[] slots
    int32_t offset;  // byte offset for the index part
    int32_t length;  // number of bytes in the index part

    // Determine the total data size recorded in the indexes and check it against inLength.
    if(indexesLength > IX_TOTAL_SIZE) {
        length = inIndexes[IX_TOTAL_SIZE];
    } else if(indexesLength > IX_REORDER_CODES_OFFSET) {
        length = inIndexes[indexesLength - 1];
    } else {
        length = 0;  // only indexes, and inLength was already checked for them
    }
    if(0 <= inLength && inLength < length) {
        errorCode = U_INVALID_FORMAT_ERROR;
        return;
    }

    const CollationData *baseData = base == NULL ? NULL : base->data;
    const int32_t *reorderCodes = NULL;
    int32_t reorderCodesLength = 0;
    // Each part's byte length is the difference between two consecutive index values.
    index = IX_REORDER_CODES_OFFSET;
    offset = getIndex(inIndexes, indexesLength, index);
    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    if(length >= 4) {
        if(baseData == NULL) {
            // We assume for collation settings that
            // the base data does not have a reordering.
            errorCode = U_INVALID_FORMAT_ERROR;
            return;
        }
        reorderCodes = reinterpret_cast<const int32_t *>(inBytes + offset);
        reorderCodesLength = length / 4;
    }

    // There should be a reorder table only if there are reorder codes.
    // However, when there are reorder codes the reorder table may be omitted to reduce
    // the data size.
    const uint8_t *reorderTable = NULL;
    index = IX_REORDER_TABLE_OFFSET;
    offset = getIndex(inIndexes, indexesLength, index);
    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    if(length >= 256) {
        if(reorderCodesLength == 0) {
            errorCode = U_INVALID_FORMAT_ERROR;  // Reordering table without reordering codes.
            return;
        }
        reorderTable = inBytes + offset;
    } else {
        // If we have reorder codes, then build the reorderTable at the end,
        // when the CollationData is otherwise complete.
    }

    // The numeric primary in the top byte of the options must agree with the base data.
    if(baseData != NULL && baseData->numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000)) {
        errorCode = U_INVALID_FORMAT_ERROR;
        return;
    }
    CollationData *data = NULL;  // Remains NULL if there are no mappings.

    index = IX_TRIE_OFFSET;
    offset = getIndex(inIndexes, indexesLength, index);
    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    if(length >= 8) {
        if(!tailoring.ensureOwnedData(errorCode)) { return; }
        data = tailoring.ownedData;
        data->base = baseData;
        data->numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000;
        data->trie = tailoring.trie = utrie2_openFromSerialized(
            UTRIE2_32_VALUE_BITS, inBytes + offset, length, NULL,
            &errorCode);
        if(U_FAILURE(errorCode)) { return; }
    } else if(baseData != NULL) {
        // Use the base data. Only the settings are tailored.
        tailoring.data = baseData;
    } else {
        errorCode = U_INVALID_FORMAT_ERROR;  // No mappings.
        return;
    }

    // 64-bit CEs: 8 bytes per entry.
    index = IX_CES_OFFSET;
    offset = getIndex(inIndexes, indexesLength, index);
    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    if(length >= 8) {
        if(data == NULL) {
            errorCode = U_INVALID_FORMAT_ERROR;  // Tailored ces without tailored trie.
            return;
        }
        data->ces = reinterpret_cast<const int64_t *>(inBytes + offset);
        data->cesLength = length / 8;
    }

    // 32-bit CE32s: 4 bytes per entry.
    index = IX_CE32S_OFFSET;
    offset = getIndex(inIndexes, indexesLength, index);
    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    if(length >= 4) {
        if(data == NULL) {
            errorCode = U_INVALID_FORMAT_ERROR;  // Tailored ce32s without tailored trie.
            return;
        }
        data->ce32s = reinterpret_cast<const uint32_t *>(inBytes + offset);
        data->ce32sLength = length / 4;
    }

    // A non-negative start value is an index into ce32s[]; otherwise the Jamo CE32s
    // are taken from the base data when there is tailored data and a base.
    int32_t jamoCE32sStart = getIndex(inIndexes, indexesLength, IX_JAMO_CE32S_START);
    if(jamoCE32sStart >= 0) {
        if(data == NULL || data->ce32s == NULL) {
            errorCode = U_INVALID_FORMAT_ERROR;  // Index into non-existent ce32s[].
            return;
        }
        data->jamoCE32s = data->ce32s + jamoCE32sStart;
    } else if(data == NULL) {
        // Nothing to do.
    } else if(baseData != NULL) {
        data->jamoCE32s = baseData->jamoCE32s;
    } else {
        errorCode = U_INVALID_FORMAT_ERROR;  // No Jamo CE32s for Hangul processing.
        return;
    }

    index = IX_ROOT_ELEMENTS_OFFSET;
    offset = getIndex(inIndexes, indexesLength, index);
    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    if(length >= 4) {
        // From here on, length counts 32-bit root elements rather than bytes.
        length /= 4;
        if(data == NULL || length <= CollationRootElements::IX_SEC_TER_BOUNDARIES) {
            errorCode = U_INVALID_FORMAT_ERROR;
            return;
        }
        data->rootElements = reinterpret_cast<const uint32_t *>(inBytes + offset);
        data->rootElementsLength = length;
        uint32_t commonSecTer = data->rootElements[CollationRootElements::IX_COMMON_SEC_AND_TER_CE];
        if(commonSecTer != Collation::COMMON_SEC_AND_TER_CE) {
            errorCode = U_INVALID_FORMAT_ERROR;
            return;
        }
        uint32_t secTerBoundaries = data->rootElements[CollationRootElements::IX_SEC_TER_BOUNDARIES];
        if((secTerBoundaries >> 24) < CollationKeys::SEC_COMMON_HIGH) {
            // [fixed last secondary common byte] is too low,
            // and secondary weights would collide with compressed common secondaries.
            errorCode = U_INVALID_FORMAT_ERROR;
            return;
        }
    }
// Build the Whole Script Confusable data // // TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class, // because everything is local to this one build function anyhow, // OR // break this function into more reasonably sized pieces, with // state in WSConfusableDataBuilder. // void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS, int32_t confusablesWSLen, UParseError *pe, UErrorCode &status) { if (U_FAILURE(status)) { return; } URegularExpression *parseRegexp = NULL; int32_t inputLen = 0; UChar *input = NULL; int32_t lineNum = 0; UVector *scriptSets = NULL; uint32_t rtScriptSetsCount = 2; UTrie2 *anyCaseTrie = NULL; UTrie2 *lowerCaseTrie = NULL; anyCaseTrie = utrie2_open(0, 0, &status); lowerCaseTrie = utrie2_open(0, 0, &status); // The scriptSets vector provides a mapping from TRIE values to the set of scripts. // // Reserved TRIE values: // 0: Code point has no whole script confusables. // 1: Code point is of script Common or Inherited. // These code points do not participate in whole script confusable detection. // (This is logically equivalent to saying that they contain confusables in // all scripts) // // Because Trie values are indexes into the ScriptSets vector, pre-fill // vector positions 0 and 1 to avoid conflicts with the reserved values. 
scriptSets = new UVector(status); if (scriptSets == NULL) { status = U_MEMORY_ALLOCATION_ERROR; goto cleanup; } scriptSets->addElement((void *)NULL, status); scriptSets->addElement((void *)NULL, status); // Convert the user input data from UTF-8 to UChar (UTF-16) u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status); if (status != U_BUFFER_OVERFLOW_ERROR) { goto cleanup; } status = U_ZERO_ERROR; input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar))); if (input == NULL) { status = U_MEMORY_ALLOCATION_ERROR; goto cleanup; } u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status); parseRegexp = uregex_openC(parseExp, 0, NULL, &status); // Zap any Byte Order Mark at the start of input. Changing it to a space is benign // given the syntax of the input. if (*input == 0xfeff) { *input = 0x20; } // Parse the input, one line per iteration of this loop. uregex_setText(parseRegexp, input, inputLen, &status); while (uregex_findNext(parseRegexp, &status)) { lineNum++; UChar line[200]; uregex_group(parseRegexp, 0, line, 200, &status); if (uregex_start(parseRegexp, 1, &status) >= 0) { // this was a blank or comment line. continue; } if (uregex_start(parseRegexp, 8, &status) >= 0) { // input file syntax error. status = U_PARSE_ERROR; goto cleanup; } if (U_FAILURE(status)) { goto cleanup; } // Pick up the start and optional range end code points from the parsed line. UChar32 startCodePoint = SpoofImpl::ScanHex( input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status); UChar32 endCodePoint = startCodePoint; if (uregex_start(parseRegexp, 3, &status) >=0) { endCodePoint = SpoofImpl::ScanHex( input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status); } // Extract the two script names from the source line. We need these in an 8 bit // default encoding (will be EBCDIC on IBM mainframes) in order to pass them on // to the ICU u_getPropertyValueEnum() function. 
Ugh. char srcScriptName[20]; char targScriptName[20]; extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status); extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status); UScriptCode srcScript = static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName)); UScriptCode targScript = static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName)); if (U_FAILURE(status)) { goto cleanup; } if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) { status = U_INVALID_FORMAT_ERROR; goto cleanup; } // select the table - (A) any case or (L) lower case only UTrie2 *table = anyCaseTrie; if (uregex_start(parseRegexp, 7, &status) >= 0) { table = lowerCaseTrie; } // Build the set of scripts containing confusable characters for // the code point(s) specified in this input line. // Sanity check that the script of the source code point is the same // as the source script indicated in the input file. Failure of this check is // an error in the input file. // Include the source script in the set (needed for Mixed Script Confusable detection). 
// UChar32 cp; for (cp=startCodePoint; cp<=endCodePoint; cp++) { int32_t setIndex = utrie2_get32(table, cp); BuilderScriptSet *bsset = NULL; if (setIndex > 0) { U_ASSERT(setIndex < scriptSets->size()); bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex)); } else { bsset = new BuilderScriptSet(); if (bsset == NULL) { status = U_MEMORY_ALLOCATION_ERROR; goto cleanup; } bsset->codePoint = cp; bsset->trie = table; bsset->sset = new ScriptSet(); setIndex = scriptSets->size(); bsset->index = setIndex; bsset->rindex = 0; if (bsset->sset == NULL) { status = U_MEMORY_ALLOCATION_ERROR; goto cleanup; } scriptSets->addElement(bsset, status); utrie2_set32(table, cp, setIndex, &status); } bsset->sset->Union(targScript); bsset->sset->Union(srcScript); if (U_FAILURE(status)) { goto cleanup; } UScriptCode cpScript = uscript_getScript(cp, &status); if (cpScript != srcScript) { status = U_INVALID_FORMAT_ERROR; goto cleanup; } } } // Eliminate duplicate script sets. At this point we have a separate // script set for every code point that had data in the input file. // // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them // // printf("Number of scriptSets: %d\n", scriptSets->size()); { int32_t duplicateCount = 0; rtScriptSetsCount = 2; for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) { BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri)); if (outerSet->index != static_cast<uint32_t>(outeri)) { // This set was already identified as a duplicate. // It will not be allocated a position in the runtime array of ScriptSets. 
continue; } outerSet->rindex = rtScriptSetsCount++; for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) { BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri)); if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) { delete innerSet->sset; innerSet->scriptSetOwned = FALSE; innerSet->sset = outerSet->sset; innerSet->index = outeri; innerSet->rindex = outerSet->rindex; duplicateCount++; } // But this doesn't get all. We need to fix the TRIE. } } // printf("Number of distinct script sets: %d\n", rtScriptSetsCount); } // Update the Trie values to be reflect the run time script indexes (after duplicate merging). // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets // are unused, which is why the loop index starts at 2.) { for (int32_t i=2; i<scriptSets->size(); i++) { BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); if (bSet->rindex != (uint32_t)i) { utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status); } } } // For code points with script==Common or script==Inherited, // Set the reserved value of 1 into both Tries. These characters do not participate // in Whole Script Confusable detection; this reserved value is the means // by which they are detected. 
{ UnicodeSet ignoreSet; ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); UnicodeSet inheritedSet; inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); ignoreSet.addAll(inheritedSet); for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) { UChar32 rangeStart = ignoreSet.getRangeStart(rn); UChar32 rangeEnd = ignoreSet.getRangeEnd(rn); utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); } } // Serialize the data to the Spoof Detector { utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status); int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status); // printf("Any case Trie size: %d\n", size); if (status != U_BUFFER_OVERFLOW_ERROR) { goto cleanup; } status = U_ZERO_ERROR; spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit; spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size; spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie; void *where = spImpl->fSpoofData->reserveSpace(size, status); utrie2_serialize(anyCaseTrie, where, size, &status); utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status); size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status); // printf("Lower case Trie size: %d\n", size); if (status != U_BUFFER_OVERFLOW_ERROR) { goto cleanup; } status = U_ZERO_ERROR; spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit; spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size; spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie; where = spImpl->fSpoofData->reserveSpace(size, status); utrie2_serialize(lowerCaseTrie, where, size, &status); spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit; spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount; ScriptSet *rtScriptSets = static_cast<ScriptSet *> (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status)); uint32_t rindex = 2; for (int32_t i=2; 
i<scriptSets->size(); i++) { BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); if (bSet->rindex < rindex) { // We have already copied this script set to the serialized data. continue; } U_ASSERT(rindex == bSet->rindex); rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits. rindex++; } } // Open new utrie2s from the serialized data. We don't want to keep the ones // we just built because we would then have two copies of the data, one internal to // the utries that we have already constructed, and one in the serialized data area. // An alternative would be to not pre-serialize the Trie data, but that makes the // spoof detector data different, depending on how the detector was constructed. // It's simpler to keep the data always the same. spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized( UTRIE2_16_VALUE_BITS, (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie, spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, NULL, &status); spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized( UTRIE2_16_VALUE_BITS, (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie, spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, NULL, &status); cleanup: if (U_FAILURE(status)) { pe->line = lineNum; } uregex_close(parseRegexp); uprv_free(input); int32_t i; for (i=0; i<scriptSets->size(); i++) { BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); delete bsset; } delete scriptSets; utrie2_close(anyCaseTrie); utrie2_close(lowerCaseTrie); return; }