//----------------------------------------------------------------------------------- // // getTrieSize() Return the size that will be required to serialize the Trie. // //----------------------------------------------------------------------------------- int32_t RBBISetBuilder::getTrieSize() { if (U_FAILURE(*fStatus)) { return 0; } utrie2_freeze(fTrie, UTRIE2_16_VALUE_BITS, fStatus); fTrieSize = utrie2_serialize(fTrie, NULL, // Buffer 0, // Capacity fStatus); if (*fStatus == U_BUFFER_OVERFLOW_ERROR) { *fStatus = U_ZERO_ERROR; } // RBBIDebugPrintf("Trie table size is %d\n", trieSize); return fTrieSize; }
/*
 * Serialize a selector.
 *
 * Layout written to the buffer, in order: the data header (padded to a
 * multiple of 16 bytes), the indexes array, the serialized trie, the pv
 * array (4 bytes per entry) and the encoding name strings.
 *
 * Returns the total number of bytes required. If that exceeds
 * bufferCapacity, *status is set to U_BUFFER_OVERFLOW_ERROR and nothing
 * is written. Returns 0 when *status is already a failure on entry, when
 * the arguments are invalid, or when sizing the trie fails.
 */
U_CAPI int32_t U_EXPORT2
ucnvsel_serialize(const UConverterSelector* sel,
                  void* buffer, int32_t bufferCapacity, UErrorCode* status) {
  // Nothing to do if the caller is already in an error state.
  if (U_FAILURE(*status)) {
    return 0;
  }

  // Validate the destination: a negative capacity is never acceptable, and a
  // non-empty buffer must be non-NULL with its low 3 address bits clear.
  uint8_t *out = static_cast<uint8_t *>(buffer);
  if (bufferCapacity < 0) {
    *status = U_ILLEGAL_ARGUMENT_ERROR;
    return 0;
  }
  if (bufferCapacity > 0 && (out == NULL || U_POINTER_MASK_LSB(out, 3) != 0)) {
    *status = U_ILLEGAL_ARGUMENT_ERROR;
    return 0;
  }

  // Preflight the trie to learn its serialized size. A buffer overflow is the
  // expected result here; any other failure aborts.
  const int32_t trieBytes = utrie2_serialize(sel->trie, NULL, 0, status);
  if (U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
    return 0;
  }
  *status = U_ZERO_ERROR;

  // Prepare the data header.
  DataHeader header;
  uprv_memset(&header, 0, sizeof(header));
  header.dataHeader.headerSize = (uint16_t)((sizeof(header) + 15) & ~15);
  header.dataHeader.magic1 = 0xda;
  header.dataHeader.magic2 = 0x27;
  uprv_memcpy(&header.info, &dataInfo, sizeof(dataInfo));

  int32_t indexes[UCNVSEL_INDEX_COUNT] = {
    trieBytes,
    sel->pvCount,
    sel->encodingsCount,
    sel->encodingStrLength
  };

  // Byte counts of each section of the serialized form.
  const int32_t headerBytes = header.dataHeader.headerSize;
  const int32_t indexBytes = (int32_t)sizeof(indexes);
  const int32_t pvBytes = sel->pvCount * 4;
  const int32_t namesBytes = sel->encodingStrLength;

  const int32_t totalSize =
    headerBytes + indexBytes + trieBytes + pvBytes + namesBytes;
  indexes[UCNVSEL_INDEX_SIZE] = totalSize - headerBytes;

  if (totalSize > bufferCapacity) {
    *status = U_BUFFER_OVERFLOW_ERROR;
    return totalSize;
  }

  // Everything fits: emit the sections one after another.
  uprv_memcpy(out, &header, sizeof(header));
  // Zero the padding between the header struct and the rounded-up header size.
  uprv_memset(out + sizeof(header), 0, headerBytes - sizeof(header));
  out += headerBytes;

  uprv_memcpy(out, indexes, indexBytes);
  out += indexBytes;

  utrie2_serialize(sel->trie, out, trieBytes, status);
  out += trieBytes;

  uprv_memcpy(out, sel->pv, pvBytes);
  out += pvBytes;

  uprv_memcpy(out, sel->encodings[0], namesBytes);

  return totalSize;
}
// Finishes the bidi properties data: builds the mirroring table, freezes and
// serializes the trie into trieBlock, trims the two Joining_Group arrays to the
// ranges that actually carry values, pads their combined length to a multiple
// of 4, and fills in the indexes[] array. Prints statistics unless beQuiet.
// On failure, errorCode is set and the function returns early.
void
BiDiPropsBuilder::build(UErrorCode &errorCode) {
    makeMirror(errorCode);
    if(U_FAILURE(errorCode)) {
        return;
    }

    utrie2_freeze(pTrie, UTRIE2_16_VALUE_BITS, &errorCode);
    trieSize=utrie2_serialize(pTrie, trieBlock, sizeof(trieBlock), &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr,
                "genprops error: utrie2_freeze()+utrie2_serialize() failed: %s (length %ld)\n",
                u_errorName(errorCode), (long)trieSize);
        return;
    }

    // Finish jgArray & jgArray2.
    // First range: trim trailing, then leading, entries without a Joining_Group.
    // The limit is found first so that an empty range yields
    // start==limit==MIN_JG_START.
    UChar32 limit1=MAX_JG_LIMIT;  // One past the last code point with a Joining_Group.
    while(MIN_JG_START<limit1 &&
            jgArray[limit1-MIN_JG_START-1]==U_JG_NO_JOINING_GROUP) {
        --limit1;
    }
    UChar32 start1=MIN_JG_START;  // First code point with a Joining_Group.
    while(start1<limit1 &&
            jgArray[start1-MIN_JG_START]==U_JG_NO_JOINING_GROUP) {
        ++start1;
    }

    // Second range, same procedure.
    UChar32 limit2=MAX_JG_LIMIT2;
    while(MIN_JG_START2<limit2 &&
            jgArray2[limit2-MIN_JG_START2-1]==U_JG_NO_JOINING_GROUP) {
        --limit2;
    }
    UChar32 start2=MIN_JG_START2;
    while(start2<limit2 &&
            jgArray2[start2-MIN_JG_START2]==U_JG_NO_JOINING_GROUP) {
        ++start2;
    }

    // Pad the total Joining_Group arrays length to a multiple of 4.
    // Prefer rounding down starts before rounding up limits
    // so that we are guaranteed not to increase the limits beyond
    // the end of the arrays' code point ranges.
    int32_t jgLength=limit1-start1+limit2-start2;
    for(; (jgLength&3)!=0; ++jgLength) {
        if(start1<limit1 && (start1&3)!=0) {
            --start1;
        } else if(start2<limit2 && (start2&3)!=0) {
            --start2;
        } else if(start1<limit1) {
            ++limit1;
        } else {
            ++limit2;
        }
    }

    indexes[UBIDI_IX_JG_START]=start1;
    indexes[UBIDI_IX_JG_LIMIT]=limit1;
    indexes[UBIDI_IX_JG_START2]=start2;
    indexes[UBIDI_IX_JG_LIMIT2]=limit2;

    indexes[UBIDI_IX_TRIE_SIZE]=trieSize;
    indexes[UBIDI_IX_MIRROR_LENGTH]=mirrorTop;
    const int32_t mirrorBytes=4*mirrorTop;
    indexes[UBIDI_IX_LENGTH]=
        (int32_t)sizeof(indexes)+
        trieSize+
        mirrorBytes+
        jgLength;

    // Pack the maximum property values into a single index word.
    indexes[UBIDI_MAX_VALUES_INDEX]=
        ((int32_t)U_CHAR_DIRECTION_COUNT-1)|
        (((int32_t)U_JT_COUNT-1)<<UBIDI_JT_SHIFT)|
        (((int32_t)U_BPT_COUNT-1)<<UBIDI_BPT_SHIFT)|
        (((int32_t)U_JG_COUNT-1)<<UBIDI_MAX_JG_SHIFT);

    if(beQuiet) {
        return;
    }
    puts("* ubidi.icu stats *");
    printf("trie size in bytes: %5d\n", (int)trieSize);
    printf("size in bytes of mirroring table: %5d\n", (int)mirrorBytes);
    printf("length of Joining_Group array: %5d (U+%04x..U+%04x)\n",
           (int)(limit1-start1), (int)start1, (int)(limit1-1));
    printf("length of Joining_Group array 2: %5d (U+%04x..U+%04x)\n",
           (int)(limit2-start2), (int)start2, (int)(limit2-1));
    printf("data size: %5d\n", (int)indexes[UBIDI_IX_LENGTH]);
}
int main(int argc, char** argv) { // Create a value array of all possible code points. const UChar32 size = kMaxCodepoint + 1; CharacterProperty* values = new CharacterProperty[size]; memset(values, 0, sizeof(CharacterProperty) * size); setRanges(values, cjkIdeographRanges, ARRAY_LENGTH(cjkIdeographRanges), CharacterProperty::isCJKIdeographOrSymbol); setRanges(values, cjkSymbolRanges, ARRAY_LENGTH(cjkSymbolRanges), CharacterProperty::isCJKIdeographOrSymbol); setValues(values, cjkIsolatedSymbolsArray, ARRAY_LENGTH(cjkIsolatedSymbolsArray), CharacterProperty::isCJKIdeographOrSymbol); setRanges(values, isUprightInMixedVerticalRanges, ARRAY_LENGTH(isUprightInMixedVerticalRanges), CharacterProperty::isUprightInMixedVertical); setValues(values, isUprightInMixedVerticalArray, ARRAY_LENGTH(isUprightInMixedVerticalArray), CharacterProperty::isUprightInMixedVertical); // Create a trie from the value array. UErrorCode error = U_ZERO_ERROR; UTrie2* trie = utrie2_open(0, 0, &error); assert(error == U_ZERO_ERROR); UChar32 start = 0; CharacterProperty value = values[0]; for (UChar32 c = 1;; c++) { if (c < size && values[c] == value) continue; if (static_cast<uint32_t>(value)) { utrie2_setRange32(trie, start, c - 1, static_cast<uint32_t>(value), TRUE, &error); assert(error == U_ZERO_ERROR); } if (c >= size) break; start = c; value = values[start]; } // Freeze and serialize the trie to a byte array. utrie2_freeze(trie, UTrie2ValueBits::UTRIE2_16_VALUE_BITS, &error); assert(error == U_ZERO_ERROR); int32_t serializedSize = utrie2_serialize(trie, nullptr, 0, &error); error = U_ZERO_ERROR; uint8_t* serialized = new uint8_t[serializedSize]; serializedSize = utrie2_serialize(trie, serialized, serializedSize, &error); assert(error == U_ZERO_ERROR); // Write the serialized array to the source file. if (argc <= 1) { generate(stdout, serializedSize, serialized); } else { FILE* fp = fopen(argv[1], "wb"); generate(fp, serializedSize, serialized); fclose(fp); } utrie2_close(trie); return 0; }
/**
 * Writes collation data into dest and fills in the indexes[] array, or, when
 * dest is too small, computes the number of bytes that would be needed.
 *
 * For non-base data a DataHeader is written first (possibly padded by 4 bytes
 * so that the 64-bit CEs end up 8-aligned); for base data headerSize is 0 and
 * no header is written here. After the header come the indexes and then the
 * data items in the order in which their offsets are assigned below.
 *
 * @param isBase       TRUE when writing the base (root) data.
 * @param dataVersion  Copied into the header's info.dataVersion (non-base only).
 * @param data         Source of the trie, CEs, CE32s, contexts, unsafe-backward
 *                     set, fast Latin table, scripts and compressible bytes.
 * @param settings     Source of the options, reorder codes and reorder table.
 * @param rootElements Copied at IX_ROOT_ELEMENTS_OFFSET.
 * @param rootElementsLength Number of 4-byte units in rootElements.
 * @param indexes      Output array. Every index up to and including
 *                     IX_TOTAL_SIZE is assigned here regardless of
 *                     indexesLength, so the array must have room for them;
 *                     only the first indexesLength entries are copied to dest.
 * @param dest         Output buffer; may be NULL only if capacity is 0.
 * @param capacity     Size of dest in bytes; must not be negative.
 * @param errorCode    In/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR for bad
 *                     dest/capacity, to U_BUFFER_OVERFLOW_ERROR when the data
 *                     does not fit, or to the error of a nested serialization.
 * @return headerSize + totalSize (also when U_BUFFER_OVERFLOW_ERROR is set);
 *         0 if errorCode was already a failure on entry, the arguments are
 *         invalid, or a nested step fails with anything other than a buffer
 *         overflow.
 */
int32_t
CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
                           const CollationData &data, const CollationSettings &settings,
                           const void *rootElements, int32_t rootElementsLength,
                           int32_t indexes[], uint8_t *dest, int32_t capacity,
                           UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }
    if(capacity < 0 || (capacity > 0 && dest == NULL)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // Figure out which data items to write before settling on
    // the indexes length and writing offsets.
    // For any data item, we need to write the start and limit offsets,
    // so the indexes length must be at least index-of-start-offset + 2.
    int32_t indexesLength;
    UBool hasMappings;
    UnicodeSet unsafeBackwardSet;
    const CollationData *baseData = data.base;

    // The fast Latin version occupies the bits above the low 16 of IX_OPTIONS;
    // it is 0 when there is no fast Latin table.
    int32_t fastLatinVersion;
    if(data.fastLatinTable != NULL) {
        fastLatinVersion = (int32_t)CollationFastLatin::VERSION << 16;
    } else {
        fastLatinVersion = 0;
    }
    int32_t fastLatinTableLength = 0;

    if(isBase) {
        // For the root collator, we write an even number of indexes
        // so that we start with an 8-aligned offset.
        indexesLength = CollationDataReader::IX_TOTAL_SIZE + 1;
        U_ASSERT(settings.reorderCodesLength == 0);
        hasMappings = TRUE;
        unsafeBackwardSet = *data.unsafeBackwardSet;
        fastLatinTableLength = data.fastLatinTableLength;
    } else if(baseData == NULL) {
        // No base data to compare against: no mappings are written.
        hasMappings = FALSE;
        if(settings.reorderCodesLength == 0) {
            // only options
            indexesLength = CollationDataReader::IX_OPTIONS + 1;  // no limit offset here
        } else {
            // only options, reorder codes, and the reorder table
            indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + 2;
        }
    } else {
        hasMappings = TRUE;
        // Tailored mappings, and what else?
        // Check in ascending order of optional tailoring data items.
        indexesLength = CollationDataReader::IX_CE32S_OFFSET + 2;
        if(data.contextsLength != 0) {
            indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + 2;
        }
        // Only the difference from the base's unsafe-backward set is kept.
        unsafeBackwardSet.addAll(*data.unsafeBackwardSet).removeAll(*baseData->unsafeBackwardSet);
        if(!unsafeBackwardSet.isEmpty()) {
            indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + 2;
        }
        // The fast Latin table is written only if it is not the base's table.
        if(data.fastLatinTable != baseData->fastLatinTable) {
            fastLatinTableLength = data.fastLatinTableLength;
            indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + 2;
        }
    }

    UVector32 codesAndRanges(errorCode);
    const int32_t *reorderCodes = settings.reorderCodes;
    int32_t reorderCodesLength = settings.reorderCodesLength;
    if(settings.hasReordering() &&
            CollationSettings::reorderTableHasSplitBytes(settings.reorderTable)) {
        // Rebuild the full list of reorder ranges.
        // The list in the settings is truncated for efficiency.
        data.makeReorderRanges(reorderCodes, reorderCodesLength, codesAndRanges, errorCode);
        // Write the codes, then the ranges.
        for(int32_t i = 0; i < reorderCodesLength; ++i) {
            codesAndRanges.insertElementAt(reorderCodes[i], i, errorCode);
        }
        if(U_FAILURE(errorCode)) { return 0; }
        // From here on the reorder codes come from the combined codes+ranges vector.
        reorderCodes = codesAndRanges.getBuffer();
        reorderCodesLength = codesAndRanges.size();
    }

    int32_t headerSize;
    if(isBase) {
        headerSize = 0;  // udata_create() writes the header
    } else {
        DataHeader header;
        header.dataHeader.magic1 = 0xda;
        header.dataHeader.magic2 = 0x27;
        uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo));
        uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo));
        headerSize = (int32_t)sizeof(header);
        U_ASSERT((headerSize & 3) == 0);  // multiple of 4 bytes
        if(hasMappings && data.cesLength != 0) {
            // Sum of the sizes of the data items which are
            // not automatically multiples of 8 bytes and which are placed before the CEs.
            int32_t sum = headerSize + (indexesLength + reorderCodesLength) * 4;
            if((sum & 7) != 0) {
                // We need to add padding somewhere so that the 64-bit CEs are 8-aligned.
                // We add to the header size here.
                // Alternatively, we could increment the indexesLength
                // or add a few bytes to the reorderTable.
                headerSize += 4;
            }
        }
        header.dataHeader.headerSize = (uint16_t)headerSize;
        if(headerSize <= capacity) {
            uprv_memcpy(dest, &header, sizeof(header));
            // Write 00 bytes so that the padding is not mistaken for a copyright string.
            uprv_memset(dest + sizeof(header), 0, headerSize - (int32_t)sizeof(header));
            // From here on dest/capacity describe the space after the header.
            dest += headerSize;
            capacity -= headerSize;
        } else {
            // The header does not fit: switch to size-only (preflight) mode.
            dest = NULL;
            capacity = 0;
        }
    }

    indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength;
    U_ASSERT((settings.options & ~0xffff) == 0);
    indexes[CollationDataReader::IX_OPTIONS] =
            data.numericPrimary | fastLatinVersion | settings.options;
    indexes[CollationDataReader::IX_RESERVED2] = 0;
    indexes[CollationDataReader::IX_RESERVED3] = 0;

    // Byte offsets of data items all start from the start of the indexes.
    // We add the headerSize at the very end.
    int32_t totalSize = indexesLength * 4;

    // Element offset of the Jamo CE32s within ce32s, or -1 when not written.
    if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) {
        indexes[CollationDataReader::IX_JAMO_CE32S_START] = data.jamoCE32s - data.ce32s;
    } else {
        indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1;
    }

    indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize;
    totalSize += reorderCodesLength * 4;

    indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize;
    if(settings.reorderTable != NULL) {
        totalSize += 256;
    }

    indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize;
    if(hasMappings) {
        // The trie is serialized straight into dest when there is room left;
        // otherwise only its length is computed. A local error code is used so
        // that a buffer overflow here does not abort the size computation.
        UErrorCode errorCode2 = U_ZERO_ERROR;
        int32_t length;
        if(totalSize < capacity) {
            length = utrie2_serialize(data.trie, dest + totalSize,
                                      capacity - totalSize, &errorCode2);
        } else {
            length = utrie2_serialize(data.trie, NULL, 0, &errorCode2);
        }
        if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
            errorCode = errorCode2;
            return 0;
        }
        // The trie size should be a multiple of 8 bytes due to the way
        // compactIndex2(UNewTrie2 *trie) currently works.
        U_ASSERT((length & 7) == 0);
        totalSize += length;
    }

    indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize;
    indexes[CollationDataReader::IX_CES_OFFSET] = totalSize;
    if(hasMappings && data.cesLength != 0) {
        // The 8-byte CEs must start at an 8-aligned position counted from the
        // start of the header.
        U_ASSERT(((headerSize + totalSize) & 7) == 0);
        totalSize += data.cesLength * 8;
    }

    indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize;
    indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize;
    if(hasMappings) {
        totalSize += data.ce32sLength * 4;
    }

    indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize;
    totalSize += rootElementsLength * 4;

    indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize;
    if(hasMappings) {
        totalSize += data.contextsLength * 2;
    }

    indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize;
    if(hasMappings && !unsafeBackwardSet.isEmpty()) {
        // Same pattern as for the trie: serialize in place if there is room,
        // otherwise only compute the length (in 16-bit units).
        UErrorCode errorCode2 = U_ZERO_ERROR;
        int32_t length;
        if(totalSize < capacity) {
            uint16_t *p = reinterpret_cast<uint16_t *>(dest + totalSize);
            length = unsafeBackwardSet.serialize(
                    p, (capacity - totalSize) / 2, errorCode2);
        } else {
            length = unsafeBackwardSet.serialize(NULL, 0, errorCode2);
        }
        if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
            errorCode = errorCode2;
            return 0;
        }
        totalSize += length * 2;
    }

    indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize;
    totalSize += fastLatinTableLength * 2;

    // Script data is written for base data only: numScripts, followed by
    // numScripts + 16 units of scriptsIndex, followed by scriptStarts.
    UnicodeString scripts;
    indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize;
    if(isBase) {
        scripts.append((UChar)data.numScripts);
        scripts.append(reinterpret_cast<const UChar *>(data.scriptsIndex), data.numScripts + 16);
        scripts.append(reinterpret_cast<const UChar *>(data.scriptStarts), data.scriptStartsLength);
        totalSize += scripts.length() * 2;
    }

    indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize;
    if(isBase) {
        totalSize += 256;
    }

    indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize;
    indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize;

    // Nothing further is copied unless everything fits; the caller gets the
    // required size (including the header) together with the overflow error.
    if(totalSize > capacity) {
        errorCode = U_BUFFER_OVERFLOW_ERROR;
        return headerSize + totalSize;
    }

    uprv_memcpy(dest, indexes, indexesLength * 4);
    copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, reorderCodes, dest);
    copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest);
    // The trie has already been serialized into the dest buffer.
    copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest);
    copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest);
    copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest);
    copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest);
    // The unsafeBackwardSet has already been serialized into the dest buffer.
    copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest);
    copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, scripts.getBuffer(), dest);
    copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest);

    return headerSize + totalSize;
}
// Build the Whole Script Confusable data // // TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class, // because everything is local to this one build function anyhow, // OR // break this function into more reasonably sized pieces, with // state in WSConfusableDataBuilder. // void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS, int32_t confusablesWSLen, UParseError *pe, UErrorCode &status) { if (U_FAILURE(status)) { return; } URegularExpression *parseRegexp = NULL; int32_t inputLen = 0; UChar *input = NULL; int32_t lineNum = 0; UVector *scriptSets = NULL; uint32_t rtScriptSetsCount = 2; UTrie2 *anyCaseTrie = NULL; UTrie2 *lowerCaseTrie = NULL; anyCaseTrie = utrie2_open(0, 0, &status); lowerCaseTrie = utrie2_open(0, 0, &status); // The scriptSets vector provides a mapping from TRIE values to the set of scripts. // // Reserved TRIE values: // 0: Code point has no whole script confusables. // 1: Code point is of script Common or Inherited. // These code points do not participate in whole script confusable detection. // (This is logically equivalent to saying that they contain confusables in // all scripts) // // Because Trie values are indexes into the ScriptSets vector, pre-fill // vector positions 0 and 1 to avoid conflicts with the reserved values. 
scriptSets = new UVector(status); if (scriptSets == NULL) { status = U_MEMORY_ALLOCATION_ERROR; goto cleanup; } scriptSets->addElement((void *)NULL, status); scriptSets->addElement((void *)NULL, status); // Convert the user input data from UTF-8 to UChar (UTF-16) u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status); if (status != U_BUFFER_OVERFLOW_ERROR) { goto cleanup; } status = U_ZERO_ERROR; input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar))); if (input == NULL) { status = U_MEMORY_ALLOCATION_ERROR; goto cleanup; } u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status); parseRegexp = uregex_openC(parseExp, 0, NULL, &status); // Zap any Byte Order Mark at the start of input. Changing it to a space is benign // given the syntax of the input. if (*input == 0xfeff) { *input = 0x20; } // Parse the input, one line per iteration of this loop. uregex_setText(parseRegexp, input, inputLen, &status); while (uregex_findNext(parseRegexp, &status)) { lineNum++; UChar line[200]; uregex_group(parseRegexp, 0, line, 200, &status); if (uregex_start(parseRegexp, 1, &status) >= 0) { // this was a blank or comment line. continue; } if (uregex_start(parseRegexp, 8, &status) >= 0) { // input file syntax error. status = U_PARSE_ERROR; goto cleanup; } if (U_FAILURE(status)) { goto cleanup; } // Pick up the start and optional range end code points from the parsed line. UChar32 startCodePoint = SpoofImpl::ScanHex( input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status); UChar32 endCodePoint = startCodePoint; if (uregex_start(parseRegexp, 3, &status) >=0) { endCodePoint = SpoofImpl::ScanHex( input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status); } // Extract the two script names from the source line. We need these in an 8 bit // default encoding (will be EBCDIC on IBM mainframes) in order to pass them on // to the ICU u_getPropertyValueEnum() function. 
Ugh. char srcScriptName[20]; char targScriptName[20]; extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status); extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status); UScriptCode srcScript = static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName)); UScriptCode targScript = static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName)); if (U_FAILURE(status)) { goto cleanup; } if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) { status = U_INVALID_FORMAT_ERROR; goto cleanup; } // select the table - (A) any case or (L) lower case only UTrie2 *table = anyCaseTrie; if (uregex_start(parseRegexp, 7, &status) >= 0) { table = lowerCaseTrie; } // Build the set of scripts containing confusable characters for // the code point(s) specified in this input line. // Sanity check that the script of the source code point is the same // as the source script indicated in the input file. Failure of this check is // an error in the input file. // Include the source script in the set (needed for Mixed Script Confusable detection). 
// UChar32 cp; for (cp=startCodePoint; cp<=endCodePoint; cp++) { int32_t setIndex = utrie2_get32(table, cp); BuilderScriptSet *bsset = NULL; if (setIndex > 0) { U_ASSERT(setIndex < scriptSets->size()); bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex)); } else { bsset = new BuilderScriptSet(); if (bsset == NULL) { status = U_MEMORY_ALLOCATION_ERROR; goto cleanup; } bsset->codePoint = cp; bsset->trie = table; bsset->sset = new ScriptSet(); setIndex = scriptSets->size(); bsset->index = setIndex; bsset->rindex = 0; if (bsset->sset == NULL) { status = U_MEMORY_ALLOCATION_ERROR; goto cleanup; } scriptSets->addElement(bsset, status); utrie2_set32(table, cp, setIndex, &status); } bsset->sset->Union(targScript); bsset->sset->Union(srcScript); if (U_FAILURE(status)) { goto cleanup; } UScriptCode cpScript = uscript_getScript(cp, &status); if (cpScript != srcScript) { status = U_INVALID_FORMAT_ERROR; goto cleanup; } } } // Eliminate duplicate script sets. At this point we have a separate // script set for every code point that had data in the input file. // // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them // // printf("Number of scriptSets: %d\n", scriptSets->size()); { int32_t duplicateCount = 0; rtScriptSetsCount = 2; for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) { BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri)); if (outerSet->index != static_cast<uint32_t>(outeri)) { // This set was already identified as a duplicate. // It will not be allocated a position in the runtime array of ScriptSets. 
continue; } outerSet->rindex = rtScriptSetsCount++; for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) { BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri)); if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) { delete innerSet->sset; innerSet->scriptSetOwned = FALSE; innerSet->sset = outerSet->sset; innerSet->index = outeri; innerSet->rindex = outerSet->rindex; duplicateCount++; } // But this doesn't get all. We need to fix the TRIE. } } // printf("Number of distinct script sets: %d\n", rtScriptSetsCount); } // Update the Trie values to be reflect the run time script indexes (after duplicate merging). // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets // are unused, which is why the loop index starts at 2.) { for (int32_t i=2; i<scriptSets->size(); i++) { BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); if (bSet->rindex != (uint32_t)i) { utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status); } } } // For code points with script==Common or script==Inherited, // Set the reserved value of 1 into both Tries. These characters do not participate // in Whole Script Confusable detection; this reserved value is the means // by which they are detected. 
{ UnicodeSet ignoreSet; ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); UnicodeSet inheritedSet; inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); ignoreSet.addAll(inheritedSet); for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) { UChar32 rangeStart = ignoreSet.getRangeStart(rn); UChar32 rangeEnd = ignoreSet.getRangeEnd(rn); utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); } } // Serialize the data to the Spoof Detector { utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status); int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status); // printf("Any case Trie size: %d\n", size); if (status != U_BUFFER_OVERFLOW_ERROR) { goto cleanup; } status = U_ZERO_ERROR; spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit; spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size; spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie; void *where = spImpl->fSpoofData->reserveSpace(size, status); utrie2_serialize(anyCaseTrie, where, size, &status); utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status); size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status); // printf("Lower case Trie size: %d\n", size); if (status != U_BUFFER_OVERFLOW_ERROR) { goto cleanup; } status = U_ZERO_ERROR; spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit; spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size; spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie; where = spImpl->fSpoofData->reserveSpace(size, status); utrie2_serialize(lowerCaseTrie, where, size, &status); spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit; spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount; ScriptSet *rtScriptSets = static_cast<ScriptSet *> (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status)); uint32_t rindex = 2; for (int32_t i=2; 
i<scriptSets->size(); i++) { BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); if (bSet->rindex < rindex) { // We have already copied this script set to the serialized data. continue; } U_ASSERT(rindex == bSet->rindex); rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits. rindex++; } } // Open new utrie2s from the serialized data. We don't want to keep the ones // we just built because we would then have two copies of the data, one internal to // the utries that we have already constructed, and one in the serialized data area. // An alternative would be to not pre-serialize the Trie data, but that makes the // spoof detector data different, depending on how the detector was constructed. // It's simpler to keep the data always the same. spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized( UTRIE2_16_VALUE_BITS, (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie, spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, NULL, &status); spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized( UTRIE2_16_VALUE_BITS, (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie, spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, NULL, &status); cleanup: if (U_FAILURE(status)) { pe->line = lineNum; } uregex_close(parseRegexp); uprv_free(input); int32_t i; for (i=0; i<scriptSets->size(); i++) { BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); delete bsset; } delete scriptSets; utrie2_close(anyCaseTrie); utrie2_close(lowerCaseTrie); return; }
//----------------------------------------------------------------------------------- // // serializeTrie() Put the serialized trie at the specified address. // Trust the caller to have given us enough memory. // getTrieSize() MUST be called first. // //----------------------------------------------------------------------------------- void RBBISetBuilder::serializeTrie(uint8_t *where) { utrie2_serialize(fTrie, where, // Buffer fTrieSize, // Capacity fStatus); }