Esempio n. 1
0
//-----------------------------------------------------------------------------------
//
//  getTrieSize()    Return the size that will be required to serialize the Trie.
//
//-----------------------------------------------------------------------------------
int32_t RBBISetBuilder::getTrieSize()  {
    if (U_FAILURE(*fStatus)) {
        return 0;
    }
    utrie2_freeze(fTrie, UTRIE2_16_VALUE_BITS, fStatus);
    fTrieSize  = utrie2_serialize(fTrie,
                                  NULL,                // Buffer
                                  0,                   // Capacity
                                  fStatus);
    if (*fStatus == U_BUFFER_OVERFLOW_ERROR) {
        *fStatus = U_ZERO_ERROR;
    }
    // RBBIDebugPrintf("Trie table size is %d\n", trieSize);
    return fTrieSize;
}
Esempio n. 2
0
/* serialize a selector */
U_CAPI int32_t U_EXPORT2
ucnvsel_serialize(const UConverterSelector* sel,
                  void* buffer, int32_t bufferCapacity, UErrorCode* status) {
  // check if already failed
  if (U_FAILURE(*status)) {
    return 0;
  }
  // ensure args make sense!
  uint8_t *p = (uint8_t *)buffer;
  if (bufferCapacity < 0 ||
      (bufferCapacity > 0 && (p == NULL || (U_POINTER_MASK_LSB(p, 3) != 0)))
  ) {
    *status = U_ILLEGAL_ARGUMENT_ERROR;
    return 0;
  }
  // add up the size of the serialized form
  int32_t serializedTrieSize = utrie2_serialize(sel->trie, NULL, 0, status);
  if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
    return 0;
  }
  *status = U_ZERO_ERROR;

  DataHeader header;
  uprv_memset(&header, 0, sizeof(header));
  header.dataHeader.headerSize = (uint16_t)((sizeof(header) + 15) & ~15);
  header.dataHeader.magic1 = 0xda;
  header.dataHeader.magic2 = 0x27;
  uprv_memcpy(&header.info, &dataInfo, sizeof(dataInfo));

  int32_t indexes[UCNVSEL_INDEX_COUNT] = {
    serializedTrieSize,
    sel->pvCount,
    sel->encodingsCount,
    sel->encodingStrLength
  };

  int32_t totalSize =
    header.dataHeader.headerSize +
    (int32_t)sizeof(indexes) +
    serializedTrieSize +
    sel->pvCount * 4 +
    sel->encodingStrLength;
  indexes[UCNVSEL_INDEX_SIZE] = totalSize - header.dataHeader.headerSize;
  if (totalSize > bufferCapacity) {
    *status = U_BUFFER_OVERFLOW_ERROR;
    return totalSize;
  }
  // ok, save!
  int32_t length = header.dataHeader.headerSize;
  uprv_memcpy(p, &header, sizeof(header));
  uprv_memset(p + sizeof(header), 0, length - sizeof(header));
  p += length;

  length = (int32_t)sizeof(indexes);
  uprv_memcpy(p, indexes, length);
  p += length;

  utrie2_serialize(sel->trie, p, serializedTrieSize, status);
  p += serializedTrieSize;

  length = sel->pvCount * 4;
  uprv_memcpy(p, sel->pv, length);
  p += length;

  uprv_memcpy(p, sel->encodings[0], sel->encodingStrLength);
  p += sel->encodingStrLength;

  return totalSize;
}
Esempio n. 3
0
void
BiDiPropsBuilder::build(UErrorCode &errorCode) {
    makeMirror(errorCode);
    if(U_FAILURE(errorCode)) { return; }

    utrie2_freeze(pTrie, UTRIE2_16_VALUE_BITS, &errorCode);
    trieSize=utrie2_serialize(pTrie, trieBlock, sizeof(trieBlock), &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "genprops error: utrie2_freeze()+utrie2_serialize() failed: %s (length %ld)\n",
                u_errorName(errorCode), (long)trieSize);
        return;
    }

    // Finish jgArray & jgArray2.
    UChar32 jgStart;  // First code point with a Joining_Group, first range.
    UChar32 jgLimit;  // One past the last one.
    // Find the end of the range first, so that if it's empty we
    // get jgStart=jgLimit=MIN_JG_START.
    for(jgLimit=MAX_JG_LIMIT;
        MIN_JG_START<jgLimit && jgArray[jgLimit-MIN_JG_START-1]==U_JG_NO_JOINING_GROUP;
        --jgLimit) {}
    for(jgStart=MIN_JG_START;
        jgStart<jgLimit && jgArray[jgStart-MIN_JG_START]==U_JG_NO_JOINING_GROUP;
        ++jgStart) {}

    UChar32 jgStart2;  // First code point with a Joining_Group, second range.
    UChar32 jgLimit2;  // One past the last one.
    for(jgLimit2=MAX_JG_LIMIT2;
        MIN_JG_START2<jgLimit2 && jgArray2[jgLimit2-MIN_JG_START2-1]==U_JG_NO_JOINING_GROUP;
        --jgLimit2) {}
    for(jgStart2=MIN_JG_START2;
        jgStart2<jgLimit2 && jgArray2[jgStart2-MIN_JG_START2]==U_JG_NO_JOINING_GROUP;
        ++jgStart2) {}

    // Pad the total Joining_Group arrays length to a multiple of 4.
    // Prefer rounding down starts before rounding up limits
    // so that we are guaranteed not to increase the limits beyond
    // the end of the arrays' code point ranges.
    int32_t jgLength=jgLimit-jgStart+jgLimit2-jgStart2;
    while(jgLength&3) {
        if((jgStart<jgLimit) && (jgStart&3)) {
            --jgStart;
        } else if((jgStart2<jgLimit2) && (jgStart2&3)) {
            --jgStart2;
        } else if(jgStart<jgLimit) {
            ++jgLimit;
        } else {
            ++jgLimit2;
        }
        ++jgLength;
    }
    indexes[UBIDI_IX_JG_START]=jgStart;
    indexes[UBIDI_IX_JG_LIMIT]=jgLimit;
    indexes[UBIDI_IX_JG_START2]=jgStart2;
    indexes[UBIDI_IX_JG_LIMIT2]=jgLimit2;

    indexes[UBIDI_IX_TRIE_SIZE]=trieSize;
    indexes[UBIDI_IX_MIRROR_LENGTH]=mirrorTop;
    indexes[UBIDI_IX_LENGTH]=
        (int32_t)sizeof(indexes)+
        trieSize+
        4*mirrorTop+
        jgLength;

    if(!beQuiet) {
        puts("* ubidi.icu stats *");
        printf("trie size in bytes:                    %5d\n", (int)trieSize);
        printf("size in bytes of mirroring table:      %5d\n", (int)(4*mirrorTop));
        printf("length of Joining_Group array:         %5d (U+%04x..U+%04x)\n",
               (int)(jgLimit-jgStart), (int)jgStart, (int)(jgLimit-1));
        printf("length of Joining_Group array 2:       %5d (U+%04x..U+%04x)\n",
               (int)(jgLimit2-jgStart2), (int)jgStart2, (int)(jgLimit2-1));
        printf("data size:                             %5d\n", (int)indexes[UBIDI_IX_LENGTH]);
    }

    indexes[UBIDI_MAX_VALUES_INDEX]=
        ((int32_t)U_CHAR_DIRECTION_COUNT-1)|
        (((int32_t)U_JT_COUNT-1)<<UBIDI_JT_SHIFT)|
        (((int32_t)U_BPT_COUNT-1)<<UBIDI_BPT_SHIFT)|
        (((int32_t)U_JG_COUNT-1)<<UBIDI_MAX_JG_SHIFT);
}
int main(int argc, char** argv)
{
    // Create a value array of all possible code points.
    const UChar32 size = kMaxCodepoint + 1;
    CharacterProperty* values = new CharacterProperty[size];
    memset(values, 0, sizeof(CharacterProperty) * size);

    setRanges(values,
        cjkIdeographRanges, ARRAY_LENGTH(cjkIdeographRanges),
        CharacterProperty::isCJKIdeographOrSymbol);
    setRanges(values,
        cjkSymbolRanges, ARRAY_LENGTH(cjkSymbolRanges),
        CharacterProperty::isCJKIdeographOrSymbol);
    setValues(values,
        cjkIsolatedSymbolsArray, ARRAY_LENGTH(cjkIsolatedSymbolsArray),
        CharacterProperty::isCJKIdeographOrSymbol);

    setRanges(values,
        isUprightInMixedVerticalRanges,
        ARRAY_LENGTH(isUprightInMixedVerticalRanges),
        CharacterProperty::isUprightInMixedVertical);
    setValues(values,
        isUprightInMixedVerticalArray,
        ARRAY_LENGTH(isUprightInMixedVerticalArray),
        CharacterProperty::isUprightInMixedVertical);

    // Create a trie from the value array.
    UErrorCode error = U_ZERO_ERROR;
    UTrie2* trie = utrie2_open(0, 0, &error);
    assert(error == U_ZERO_ERROR);
    UChar32 start = 0;
    CharacterProperty value = values[0];
    for (UChar32 c = 1;; c++) {
        if (c < size && values[c] == value)
            continue;
        if (static_cast<uint32_t>(value)) {
            utrie2_setRange32(trie, start, c - 1,
                static_cast<uint32_t>(value), TRUE, &error);
            assert(error == U_ZERO_ERROR);
        }
        if (c >= size)
            break;
        start = c;
        value = values[start];
    }

    // Freeze and serialize the trie to a byte array.
    utrie2_freeze(trie, UTrie2ValueBits::UTRIE2_16_VALUE_BITS, &error);
    assert(error == U_ZERO_ERROR);
    int32_t serializedSize = utrie2_serialize(trie, nullptr, 0, &error);
    error = U_ZERO_ERROR;
    uint8_t* serialized = new uint8_t[serializedSize];
    serializedSize = utrie2_serialize(trie, serialized, serializedSize, &error);
    assert(error == U_ZERO_ERROR);

    // Write the serialized array to the source file.
    if (argc <= 1) {
        generate(stdout, serializedSize, serialized);
    } else {
        FILE* fp = fopen(argv[1], "wb");
        generate(fp, serializedSize, serialized);
        fclose(fp);
    }

    utrie2_close(trie);

    return 0;
}
Esempio n. 5
0
int32_t
CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
                           const CollationData &data, const CollationSettings &settings,
                           const void *rootElements, int32_t rootElementsLength,
                           int32_t indexes[], uint8_t *dest, int32_t capacity,
                           UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }
    if(capacity < 0 || (capacity > 0 && dest == NULL)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // Figure out which data items to write before settling on
    // the indexes length and writing offsets.
    // For any data item, we need to write the start and limit offsets,
    // so the indexes length must be at least index-of-start-offset + 2.
    int32_t indexesLength;
    UBool hasMappings;
    UnicodeSet unsafeBackwardSet;
    const CollationData *baseData = data.base;

    int32_t fastLatinVersion;
    if(data.fastLatinTable != NULL) {
        fastLatinVersion = (int32_t)CollationFastLatin::VERSION << 16;
    } else {
        fastLatinVersion = 0;
    }
    int32_t fastLatinTableLength = 0;

    if(isBase) {
        // For the root collator, we write an even number of indexes
        // so that we start with an 8-aligned offset.
        indexesLength = CollationDataReader::IX_TOTAL_SIZE + 1;
        U_ASSERT(settings.reorderCodesLength == 0);
        hasMappings = TRUE;
        unsafeBackwardSet = *data.unsafeBackwardSet;
        fastLatinTableLength = data.fastLatinTableLength;
    } else if(baseData == NULL) {
        hasMappings = FALSE;
        if(settings.reorderCodesLength == 0) {
            // only options
            indexesLength = CollationDataReader::IX_OPTIONS + 1;  // no limit offset here
        } else {
            // only options, reorder codes, and the reorder table
            indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + 2;
        }
    } else {
        hasMappings = TRUE;
        // Tailored mappings, and what else?
        // Check in ascending order of optional tailoring data items.
        indexesLength = CollationDataReader::IX_CE32S_OFFSET + 2;
        if(data.contextsLength != 0) {
            indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + 2;
        }
        unsafeBackwardSet.addAll(*data.unsafeBackwardSet).removeAll(*baseData->unsafeBackwardSet);
        if(!unsafeBackwardSet.isEmpty()) {
            indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + 2;
        }
        if(data.fastLatinTable != baseData->fastLatinTable) {
            fastLatinTableLength = data.fastLatinTableLength;
            indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + 2;
        }
    }

    UVector32 codesAndRanges(errorCode);
    const int32_t *reorderCodes = settings.reorderCodes;
    int32_t reorderCodesLength = settings.reorderCodesLength;
    if(settings.hasReordering() &&
            CollationSettings::reorderTableHasSplitBytes(settings.reorderTable)) {
        // Rebuild the full list of reorder ranges.
        // The list in the settings is truncated for efficiency.
        data.makeReorderRanges(reorderCodes, reorderCodesLength, codesAndRanges, errorCode);
        // Write the codes, then the ranges.
        for(int32_t i = 0; i < reorderCodesLength; ++i) {
            codesAndRanges.insertElementAt(reorderCodes[i], i, errorCode);
        }
        if(U_FAILURE(errorCode)) { return 0; }
        reorderCodes = codesAndRanges.getBuffer();
        reorderCodesLength = codesAndRanges.size();
    }

    int32_t headerSize;
    if(isBase) {
        headerSize = 0;  // udata_create() writes the header
    } else {
        DataHeader header;
        header.dataHeader.magic1 = 0xda;
        header.dataHeader.magic2 = 0x27;
        uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo));
        uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo));
        headerSize = (int32_t)sizeof(header);
        U_ASSERT((headerSize & 3) == 0);  // multiple of 4 bytes
        if(hasMappings && data.cesLength != 0) {
            // Sum of the sizes of the data items which are
            // not automatically multiples of 8 bytes and which are placed before the CEs.
            int32_t sum = headerSize + (indexesLength + reorderCodesLength) * 4;
            if((sum & 7) != 0) {
                // We need to add padding somewhere so that the 64-bit CEs are 8-aligned.
                // We add to the header size here.
                // Alternatively, we could increment the indexesLength
                // or add a few bytes to the reorderTable.
                headerSize += 4;
            }
        }
        header.dataHeader.headerSize = (uint16_t)headerSize;
        if(headerSize <= capacity) {
            uprv_memcpy(dest, &header, sizeof(header));
            // Write 00 bytes so that the padding is not mistaken for a copyright string.
            uprv_memset(dest + sizeof(header), 0, headerSize - (int32_t)sizeof(header));
            dest += headerSize;
            capacity -= headerSize;
        } else {
            dest = NULL;
            capacity = 0;
        }
    }

    indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength;
    U_ASSERT((settings.options & ~0xffff) == 0);
    indexes[CollationDataReader::IX_OPTIONS] =
            data.numericPrimary | fastLatinVersion | settings.options;
    indexes[CollationDataReader::IX_RESERVED2] = 0;
    indexes[CollationDataReader::IX_RESERVED3] = 0;

    // Byte offsets of data items all start from the start of the indexes.
    // We add the headerSize at the very end.
    int32_t totalSize = indexesLength * 4;

    if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) {
        indexes[CollationDataReader::IX_JAMO_CE32S_START] = data.jamoCE32s - data.ce32s;
    } else {
        indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1;
    }

    indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize;
    totalSize += reorderCodesLength * 4;

    indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize;
    if(settings.reorderTable != NULL) {
        totalSize += 256;
    }

    indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize;
    if(hasMappings) {
        UErrorCode errorCode2 = U_ZERO_ERROR;
        int32_t length;
        if(totalSize < capacity) {
            length = utrie2_serialize(data.trie, dest + totalSize,
                                      capacity - totalSize, &errorCode2);
        } else {
            length = utrie2_serialize(data.trie, NULL, 0, &errorCode2);
        }
        if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
            errorCode = errorCode2;
            return 0;
        }
        // The trie size should be a multiple of 8 bytes due to the way
        // compactIndex2(UNewTrie2 *trie) currently works.
        U_ASSERT((length & 7) == 0);
        totalSize += length;
    }

    indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize;
    indexes[CollationDataReader::IX_CES_OFFSET] = totalSize;
    if(hasMappings && data.cesLength != 0) {
        U_ASSERT(((headerSize + totalSize) & 7) == 0);
        totalSize += data.cesLength * 8;
    }

    indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize;
    indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize;
    if(hasMappings) {
        totalSize += data.ce32sLength * 4;
    }

    indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize;
    totalSize += rootElementsLength * 4;

    indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize;
    if(hasMappings) {
        totalSize += data.contextsLength * 2;
    }

    indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize;
    if(hasMappings && !unsafeBackwardSet.isEmpty()) {
        UErrorCode errorCode2 = U_ZERO_ERROR;
        int32_t length;
        if(totalSize < capacity) {
            uint16_t *p = reinterpret_cast<uint16_t *>(dest + totalSize);
            length = unsafeBackwardSet.serialize(
                    p, (capacity - totalSize) / 2, errorCode2);
        } else {
            length = unsafeBackwardSet.serialize(NULL, 0, errorCode2);
        }
        if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
            errorCode = errorCode2;
            return 0;
        }
        totalSize += length * 2;
    }

    indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize;
    totalSize += fastLatinTableLength * 2;

    UnicodeString scripts;
    indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize;
    if(isBase) {
        scripts.append((UChar)data.numScripts);
        scripts.append(reinterpret_cast<const UChar *>(data.scriptsIndex), data.numScripts + 16);
        scripts.append(reinterpret_cast<const UChar *>(data.scriptStarts), data.scriptStartsLength);
        totalSize += scripts.length() * 2;
    }

    indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize;
    if(isBase) {
        totalSize += 256;
    }

    indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize;
    indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize;

    if(totalSize > capacity) {
        errorCode = U_BUFFER_OVERFLOW_ERROR;
        return headerSize + totalSize;
    }

    uprv_memcpy(dest, indexes, indexesLength * 4);
    copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, reorderCodes, dest);
    copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest);
    // The trie has already been serialized into the dest buffer.
    copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest);
    copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest);
    copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest);
    copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest);
    // The unsafeBackwardSet has already been serialized into the dest buffer.
    copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest);
    copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, scripts.getBuffer(), dest);
    copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest);

    return headerSize + totalSize;
}
//  Build the Whole Script Confusable data
//
//     TODO:  Reorganize.  Either get rid of the WSConfusableDataBuilder class,
//                         because everything is local to this one build function anyhow,
//                           OR
//                         break this function into more reasonably sized pieces, with
//                         state in WSConfusableDataBuilder.
//
void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
          int32_t confusablesWSLen, UParseError *pe, UErrorCode &status) 
{
    if (U_FAILURE(status)) {
        return;
    }
    URegularExpression *parseRegexp = NULL;
    int32_t             inputLen    = 0;
    UChar              *input       = NULL;
    int32_t             lineNum     = 0;
    
    UVector            *scriptSets        = NULL;
    uint32_t            rtScriptSetsCount = 2;

    UTrie2             *anyCaseTrie   = NULL;
    UTrie2             *lowerCaseTrie = NULL;

    anyCaseTrie = utrie2_open(0, 0, &status);
    lowerCaseTrie = utrie2_open(0, 0, &status);
    

    // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
    //
    // Reserved TRIE values:
    //   0:  Code point has no whole script confusables.
    //   1:  Code point is of script Common or Inherited.
    //       These code points do not participate in whole script confusable detection.
    //       (This is logically equivalent to saying that they contain confusables in
    //        all scripts)
    //
    // Because Trie values are indexes into the ScriptSets vector, pre-fill
    // vector positions 0 and 1 to avoid conflicts with the reserved values.
    
    scriptSets = new UVector(status);
    if (scriptSets == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        goto cleanup;
    }
    scriptSets->addElement((void *)NULL, status);
    scriptSets->addElement((void *)NULL, status);

    // Convert the user input data from UTF-8 to UChar (UTF-16)
    u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        goto cleanup;
    }
    status = U_ZERO_ERROR;
    input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
    if (input == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        goto cleanup;
    }
    u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);



    parseRegexp = uregex_openC(parseExp, 0, NULL, &status);
    
    // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
    //   given the syntax of the input.
    if (*input == 0xfeff) {
        *input = 0x20;
    }

    // Parse the input, one line per iteration of this loop.
    uregex_setText(parseRegexp, input, inputLen, &status);
    while (uregex_findNext(parseRegexp, &status)) {
        lineNum++;
        UChar  line[200];
        uregex_group(parseRegexp, 0, line, 200, &status);
        if (uregex_start(parseRegexp, 1, &status) >= 0) {
            // this was a blank or comment line.
            continue;
        }
        if (uregex_start(parseRegexp, 8, &status) >= 0) {
            // input file syntax error.
            status = U_PARSE_ERROR;
            goto cleanup;
        }
        if (U_FAILURE(status)) {
            goto cleanup;
        }

        // Pick up the start and optional range end code points from the parsed line.
        UChar32  startCodePoint = SpoofImpl::ScanHex(
            input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
        UChar32  endCodePoint = startCodePoint;
        if (uregex_start(parseRegexp, 3, &status) >=0) {
            endCodePoint = SpoofImpl::ScanHex(
                input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
        }

        // Extract the two script names from the source line.  We need these in an 8 bit
        //   default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
        //   to the ICU u_getPropertyValueEnum() function.  Ugh.
        char  srcScriptName[20];
        char  targScriptName[20];
        extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
        extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
        UScriptCode srcScript  =
            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
        UScriptCode targScript =
            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
        if (U_FAILURE(status)) {
            goto cleanup;
        }
        if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
            status = U_INVALID_FORMAT_ERROR;
            goto cleanup;
        }

        // select the table - (A) any case or (L) lower case only
        UTrie2 *table = anyCaseTrie;
        if (uregex_start(parseRegexp, 7, &status) >= 0) {
            table = lowerCaseTrie;
        }

        // Build the set of scripts containing confusable characters for
        //   the code point(s) specified in this input line.
        // Sanity check that the script of the source code point is the same
        //   as the source script indicated in the input file.  Failure of this check is
        //   an error in the input file.
        // Include the source script in the set (needed for Mixed Script Confusable detection).
        //
        UChar32 cp;
        for (cp=startCodePoint; cp<=endCodePoint; cp++) {
            int32_t setIndex = utrie2_get32(table, cp);
            BuilderScriptSet *bsset = NULL;
            if (setIndex > 0) {
                U_ASSERT(setIndex < scriptSets->size());
                bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
            } else {
                bsset = new BuilderScriptSet();
                if (bsset == NULL) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    goto cleanup;
                }
                bsset->codePoint = cp;
                bsset->trie = table;
                bsset->sset = new ScriptSet();
                setIndex = scriptSets->size();
                bsset->index = setIndex;
                bsset->rindex = 0;
                if (bsset->sset == NULL) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    goto cleanup;
                }
                scriptSets->addElement(bsset, status);
                utrie2_set32(table, cp, setIndex, &status);
            }
            bsset->sset->Union(targScript);
            bsset->sset->Union(srcScript);

            if (U_FAILURE(status)) {
                goto cleanup;
            }
            UScriptCode cpScript = uscript_getScript(cp, &status);
            if (cpScript != srcScript) {
                status = U_INVALID_FORMAT_ERROR;
                goto cleanup;
            }
        }
    }

    // Eliminate duplicate script sets.  At this point we have a separate
    // script set for every code point that had data in the input file.
    //
    // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
    //
    // printf("Number of scriptSets: %d\n", scriptSets->size());
    {
        int32_t duplicateCount = 0;
        rtScriptSetsCount = 2;
        for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
            BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
            if (outerSet->index != static_cast<uint32_t>(outeri)) {
                // This set was already identified as a duplicate.
                //   It will not be allocated a position in the runtime array of ScriptSets.
                continue;
            }
            outerSet->rindex = rtScriptSetsCount++;
            for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
                BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
                if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
                    delete innerSet->sset;
                    innerSet->scriptSetOwned = FALSE;
                    innerSet->sset = outerSet->sset;
                    innerSet->index = outeri;
                    innerSet->rindex = outerSet->rindex;
                    duplicateCount++;
                }
                // But this doesn't get all.  We need to fix the TRIE.
            }
        }
        // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
    }

    

    // Update the Trie values to be reflect the run time script indexes (after duplicate merging).
    //    (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
    //     are unused, which is why the loop index starts at 2.)
    {
        for (int32_t i=2; i<scriptSets->size(); i++) {
            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
            if (bSet->rindex != (uint32_t)i) {
                utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
            }
        }
    }

    // For code points with script==Common or script==Inherited,
    //   Set the reserved value of 1 into both Tries.  These characters do not participate
    //   in Whole Script Confusable detection; this reserved value is the means
    //   by which they are detected.
    {
        UnicodeSet ignoreSet;
        ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
        UnicodeSet inheritedSet;
        inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
        ignoreSet.addAll(inheritedSet);
        for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
            UChar32 rangeStart = ignoreSet.getRangeStart(rn);
            UChar32 rangeEnd   = ignoreSet.getRangeEnd(rn);
            utrie2_setRange32(anyCaseTrie,   rangeStart, rangeEnd, 1, TRUE, &status);
            utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
        }
    }

    // Serialize the data to the Spoof Detector
    {
        utrie2_freeze(anyCaseTrie,   UTRIE2_16_VALUE_BITS, &status);
        int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
        // printf("Any case Trie size: %d\n", size);
        if (status != U_BUFFER_OVERFLOW_ERROR) {
            goto cleanup;
        }
        status = U_ZERO_ERROR;
        spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
        spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
        spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
        void *where = spImpl->fSpoofData->reserveSpace(size, status);
        utrie2_serialize(anyCaseTrie, where, size, &status);
        
        utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
        size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
        // printf("Lower case Trie size: %d\n", size);
        if (status != U_BUFFER_OVERFLOW_ERROR) {
            goto cleanup;
        }
        status = U_ZERO_ERROR;
        spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
        spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
        spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
        where = spImpl->fSpoofData->reserveSpace(size, status);
        utrie2_serialize(lowerCaseTrie, where, size, &status);

        spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
        spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
        ScriptSet *rtScriptSets =  static_cast<ScriptSet *>
            (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
        uint32_t rindex = 2;
        for (int32_t i=2; i<scriptSets->size(); i++) {
            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
            if (bSet->rindex < rindex) {
                // We have already copied this script set to the serialized data.
                continue;
            }
            U_ASSERT(rindex == bSet->rindex);
            rtScriptSets[rindex] = *bSet->sset;   // Assignment of a ScriptSet just copies the bits.
            rindex++;
        }
    }

    // Open new utrie2s from the serialized data.  We don't want to keep the ones
    //   we just built because we would then have two copies of the data, one internal to
    //   the utries that we have already constructed, and one in the serialized data area.
    //   An alternative would be to not pre-serialize the Trie data, but that makes the
    //   spoof detector data different, depending on how the detector was constructed.
    //   It's simpler to keep the data always the same.
    
    spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
            UTRIE2_16_VALUE_BITS,
            (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
            spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
            NULL,
            &status);

    spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
            UTRIE2_16_VALUE_BITS,
            (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
            spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
            NULL,
            &status);

    

cleanup:
    if (U_FAILURE(status)) {
        pe->line = lineNum;
    }
    uregex_close(parseRegexp);
    uprv_free(input);

    int32_t i;
    for (i=0; i<scriptSets->size(); i++) {
        BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
        delete bsset;
    }
    delete scriptSets;
    utrie2_close(anyCaseTrie);
    utrie2_close(lowerCaseTrie);
    return;
}
Esempio n. 7
0
//-----------------------------------------------------------------------------------
//
//  serializeTrie()   Put the serialized trie at the specified address.
//                    Trust the caller to have given us enough memory.
//                    getTrieSize() MUST be called first.
//
//-----------------------------------------------------------------------------------
void RBBISetBuilder::serializeTrie(uint8_t *where) {
    utrie2_serialize(fTrie,
                     where,                   // Buffer
                     fTrieSize,               // Capacity
                     fStatus);
}