U_CAPI UTrie2 * U_EXPORT2
upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) {
    UPVecToUTrie2Context toUTrie2={ NULL };
    upvec_compact(pv, upvec_compactToUTrie2Handler, &toUTrie2, pErrorCode);
    utrie2_freeze(toUTrie2.trie, UTRIE2_16_VALUE_BITS, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        utrie2_close(toUTrie2.trie);
        toUTrie2.trie=NULL;
    }
    return toUTrie2.trie;
}
Exemple #2
0
//-----------------------------------------------------------------------------------
//
//  getTrieSize()    Return the size that will be required to serialize the Trie.
//
//-----------------------------------------------------------------------------------
int32_t RBBISetBuilder::getTrieSize()  {
    if (U_FAILURE(*fStatus)) {
        return 0;
    }
    utrie2_freeze(fTrie, UTRIE2_16_VALUE_BITS, fStatus);
    fTrieSize  = utrie2_serialize(fTrie,
                                  NULL,                // Buffer
                                  0,                   // Capacity
                                  fStatus);
    if (*fStatus == U_BUFFER_OVERFLOW_ERROR) {
        *fStatus = U_ZERO_ERROR;
    }
    // RBBIDebugPrintf("Trie table size is %d\n", trieSize);
    return fTrieSize;
}
Exemple #3
0
/* Almost the same as utrie2_cloneAsThawed() but copies a UTrie and freezes the clone. */
U_CAPI UTrie2 * U_EXPORT2
utrie2_fromUTrie(const UTrie *trie1, uint32_t errorValue, UErrorCode *pErrorCode) {
    NewTrieAndStatus context;
    UChar lead;

    if(U_FAILURE(*pErrorCode)) {
        return NULL;
    }
    if(trie1==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    context.trie=utrie2_open(trie1->initialValue, errorValue, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return NULL;
    }
    context.exclusiveLimit=TRUE;
    context.errorCode=*pErrorCode;
    utrie_enum(trie1, NULL, copyEnumRange, &context);
    *pErrorCode=context.errorCode;
    for(lead=0xd800; lead<0xdc00; ++lead) {
        uint32_t value;
        if(trie1->data32==NULL) {
            value=UTRIE_GET16_FROM_LEAD(trie1, lead);
        } else {
            value=UTRIE_GET32_FROM_LEAD(trie1, lead);
        }
        if(value!=trie1->initialValue) {
            utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode);
        }
    }
    if(U_SUCCESS(*pErrorCode)) {
        utrie2_freeze(context.trie,
                      trie1->data32!=NULL ? UTRIE2_32_VALUE_BITS : UTRIE2_16_VALUE_BITS,
                      pErrorCode);
    }
#ifdef UTRIE2_DEBUG
    if(U_SUCCESS(*pErrorCode)) {
        utrie_printLengths(trie1);
        utrie2_printLengths(context.trie, "fromUTrie");
    }
#endif
    if(U_FAILURE(*pErrorCode)) {
        utrie2_close(context.trie);
        context.trie=NULL;
    }
    return context.trie;
}
void
BiDiPropsBuilder::build(UErrorCode &errorCode) {
    makeMirror(errorCode);
    if(U_FAILURE(errorCode)) { return; }

    utrie2_freeze(pTrie, UTRIE2_16_VALUE_BITS, &errorCode);
    trieSize=utrie2_serialize(pTrie, trieBlock, sizeof(trieBlock), &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "genprops error: utrie2_freeze()+utrie2_serialize() failed: %s (length %ld)\n",
                u_errorName(errorCode), (long)trieSize);
        return;
    }

    // Finish jgArray & jgArray2.
    UChar32 jgStart;  // First code point with a Joining_Group, first range.
    UChar32 jgLimit;  // One past the last one.
    // Find the end of the range first, so that if it's empty we
    // get jgStart=jgLimit=MIN_JG_START.
    for(jgLimit=MAX_JG_LIMIT;
        MIN_JG_START<jgLimit && jgArray[jgLimit-MIN_JG_START-1]==U_JG_NO_JOINING_GROUP;
        --jgLimit) {}
    for(jgStart=MIN_JG_START;
        jgStart<jgLimit && jgArray[jgStart-MIN_JG_START]==U_JG_NO_JOINING_GROUP;
        ++jgStart) {}

    UChar32 jgStart2;  // First code point with a Joining_Group, second range.
    UChar32 jgLimit2;  // One past the last one.
    for(jgLimit2=MAX_JG_LIMIT2;
        MIN_JG_START2<jgLimit2 && jgArray2[jgLimit2-MIN_JG_START2-1]==U_JG_NO_JOINING_GROUP;
        --jgLimit2) {}
    for(jgStart2=MIN_JG_START2;
        jgStart2<jgLimit2 && jgArray2[jgStart2-MIN_JG_START2]==U_JG_NO_JOINING_GROUP;
        ++jgStart2) {}

    // Pad the total Joining_Group arrays length to a multiple of 4.
    // Prefer rounding down starts before rounding up limits
    // so that we are guaranteed not to increase the limits beyond
    // the end of the arrays' code point ranges.
    int32_t jgLength=jgLimit-jgStart+jgLimit2-jgStart2;
    while(jgLength&3) {
        if((jgStart<jgLimit) && (jgStart&3)) {
            --jgStart;
        } else if((jgStart2<jgLimit2) && (jgStart2&3)) {
            --jgStart2;
        } else if(jgStart<jgLimit) {
            ++jgLimit;
        } else {
            ++jgLimit2;
        }
        ++jgLength;
    }
    indexes[UBIDI_IX_JG_START]=jgStart;
    indexes[UBIDI_IX_JG_LIMIT]=jgLimit;
    indexes[UBIDI_IX_JG_START2]=jgStart2;
    indexes[UBIDI_IX_JG_LIMIT2]=jgLimit2;

    indexes[UBIDI_IX_TRIE_SIZE]=trieSize;
    indexes[UBIDI_IX_MIRROR_LENGTH]=mirrorTop;
    indexes[UBIDI_IX_LENGTH]=
        (int32_t)sizeof(indexes)+
        trieSize+
        4*mirrorTop+
        jgLength;

    if(!beQuiet) {
        puts("* ubidi.icu stats *");
        printf("trie size in bytes:                    %5d\n", (int)trieSize);
        printf("size in bytes of mirroring table:      %5d\n", (int)(4*mirrorTop));
        printf("length of Joining_Group array:         %5d (U+%04x..U+%04x)\n",
               (int)(jgLimit-jgStart), (int)jgStart, (int)(jgLimit-1));
        printf("length of Joining_Group array 2:       %5d (U+%04x..U+%04x)\n",
               (int)(jgLimit2-jgStart2), (int)jgStart2, (int)(jgLimit2-1));
        printf("data size:                             %5d\n", (int)indexes[UBIDI_IX_LENGTH]);
    }

    indexes[UBIDI_MAX_VALUES_INDEX]=
        ((int32_t)U_CHAR_DIRECTION_COUNT-1)|
        (((int32_t)U_JT_COUNT-1)<<UBIDI_JT_SHIFT)|
        (((int32_t)U_BPT_COUNT-1)<<UBIDI_BPT_SHIFT)|
        (((int32_t)U_JG_COUNT-1)<<UBIDI_MAX_JG_SHIFT);
}
Exemple #5
0
extern void
generateData(const char *dataDir, UBool csource) {
    static int32_t indexes[UBIDI_IX_TOP]={
        UBIDI_IX_TOP
    };
    static uint8_t trieBlock[40000];
    static uint8_t jgArray[0x300]; /* at most for U+0600..U+08FF */

    const uint32_t *row;
    UChar32 start, end, prev, jgStart;
    int32_t i;

    UNewDataMemory *pData;
    UNewTrie *pTrie;
    UErrorCode errorCode=U_ZERO_ERROR;
    int32_t trieSize;
    long dataLength;

    makeMirror();

    pTrie=utrie_open(NULL, NULL, 20000, 0, 0, TRUE);
    if(pTrie==NULL) {
        fprintf(stderr, "genbidi error: unable to create a UNewTrie\n");
        exit(U_MEMORY_ALLOCATION_ERROR);
    }

    prev=jgStart=0;
    for(i=0; (row=upvec_getRow(pv, i, &start, &end))!=NULL && start<UPVEC_FIRST_SPECIAL_CP; ++i) {
        /* store most values from vector column 0 in the trie */
        if(!utrie_setRange32(pTrie, start, end+1, *row, TRUE)) {
            fprintf(stderr, "genbidi error: unable to set trie value (overflow)\n");
            exit(U_BUFFER_OVERFLOW_ERROR);
        }

        /* store Joining_Group values from vector column 1 in a simple byte array */
        if(row[1]!=0) {
            if(start<0x600 || 0x8ff<end) {
                fprintf(stderr, "genbidi error: Joining_Group for out-of-range code points U+%04lx..U+%04lx\n",
                        (long)start, (long)end);
                exit(U_ILLEGAL_ARGUMENT_ERROR);
            }

            if(prev==0) {
                /* first code point with any value */
                prev=jgStart=start;
            } else {
                /* add No_Joining_Group for code points between prev and start */
                while(prev<start) {
                    jgArray[prev++ -jgStart]=0;
                }
            }

            /* set Joining_Group value for start..end */
            while(prev<=end) {
                jgArray[prev++ -jgStart]=(uint8_t)row[1];
            }
        }
    }

    /* finish jgArray, pad to multiple of 4 */
    while((prev-jgStart)&3) {
        jgArray[prev++ -jgStart]=0;
    }
    indexes[UBIDI_IX_JG_START]=jgStart;
    indexes[UBIDI_IX_JG_LIMIT]=prev;

    trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "genbidi error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize);
        exit(errorCode);
    }

    indexes[UBIDI_IX_TRIE_SIZE]=trieSize;
    indexes[UBIDI_IX_MIRROR_LENGTH]=mirrorTop;
    indexes[UBIDI_IX_LENGTH]=
        (int32_t)sizeof(indexes)+
        trieSize+
        4*mirrorTop+
        (prev-jgStart);

    if(beVerbose) {
        printf("trie size in bytes:                    %5d\n", (int)trieSize);
        printf("size in bytes of mirroring table:      %5d\n", (int)(4*mirrorTop));
        printf("length of Joining_Group array:         %5d (U+%04x..U+%04x)\n", (int)(prev-jgStart), (int)jgStart, (int)(prev-1));
        printf("data size:                             %5d\n", (int)indexes[UBIDI_IX_LENGTH]);
    }

    indexes[UBIDI_MAX_VALUES_INDEX]=
        ((int32_t)U_CHAR_DIRECTION_COUNT-1)|
        (((int32_t)U_JT_COUNT-1)<<UBIDI_JT_SHIFT)|
        (((int32_t)U_JG_COUNT-1)<<UBIDI_MAX_JG_SHIFT);

    if(csource) {
        /* write .c file for hardcoded data */
        UTrie trie={ NULL };
        UTrie2 *trie2;
        FILE *f;

        utrie_unserialize(&trie, trieBlock, trieSize, &errorCode);
        if(U_FAILURE(errorCode)) {
            fprintf(
                stderr,
                "genbidi error: failed to utrie_unserialize(ubidi.icu trie) - %s\n",
                u_errorName(errorCode));
            exit(errorCode);
        }

        /* use UTrie2 */
        dataInfo.formatVersion[0]=2;
        dataInfo.formatVersion[2]=0;
        dataInfo.formatVersion[3]=0;
        trie2=utrie2_fromUTrie(&trie, 0, &errorCode);
        if(U_FAILURE(errorCode)) {
            fprintf(
                stderr,
                "genbidi error: utrie2_fromUTrie() failed - %s\n",
                u_errorName(errorCode));
            exit(errorCode);
        }
        {
            /* delete lead surrogate code unit values */
            UChar lead;
            trie2=utrie2_cloneAsThawed(trie2, &errorCode);
            for(lead=0xd800; lead<0xdc00; ++lead) {
                utrie2_set32ForLeadSurrogateCodeUnit(trie2, lead, trie2->initialValue, &errorCode);
            }
            utrie2_freeze(trie2, UTRIE2_16_VALUE_BITS, &errorCode);
            if(U_FAILURE(errorCode)) {
                fprintf(
                    stderr,
                    "genbidi error: deleting lead surrogate code unit values failed - %s\n",
                    u_errorName(errorCode));
                exit(errorCode);
            }
        }

        f=usrc_create(dataDir, "ubidi_props_data.c");
        if(f!=NULL) {
            usrc_writeArray(f,
                "static const UVersionInfo ubidi_props_dataVersion={",
                dataInfo.dataVersion, 8, 4,
                "};\n\n");
            usrc_writeArray(f,
                "static const int32_t ubidi_props_indexes[UBIDI_IX_TOP]={",
                indexes, 32, UBIDI_IX_TOP,
                "};\n\n");
            usrc_writeUTrie2Arrays(f,
                "static const uint16_t ubidi_props_trieIndex[%ld]={\n", NULL,
                trie2,
                "\n};\n\n");
            usrc_writeArray(f,
                "static const uint32_t ubidi_props_mirrors[%ld]={\n",
                mirrors, 32, mirrorTop,
                "\n};\n\n");
            usrc_writeArray(f,
                "static const uint8_t ubidi_props_jgArray[%ld]={\n",
                jgArray, 8, prev-jgStart,
                "\n};\n\n");
            fputs(
                "static const UBiDiProps ubidi_props_singleton={\n"
                "  NULL,\n"
                "  ubidi_props_indexes,\n"
                "  ubidi_props_mirrors,\n"
                "  ubidi_props_jgArray,\n",
                f);
            usrc_writeUTrie2Struct(f,
                "  {\n",
                trie2, "ubidi_props_trieIndex", NULL,
                "  },\n");
            usrc_writeArray(f, "  { ", dataInfo.formatVersion, 8, 4, " }\n");
            fputs("};\n", f);
            fclose(f);
        }
        utrie2_close(trie2);
    } else {
        /* write the data */
        pData=udata_create(dataDir, UBIDI_DATA_TYPE, UBIDI_DATA_NAME, &dataInfo,
                        haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
        if(U_FAILURE(errorCode)) {
            fprintf(stderr, "genbidi: unable to create data memory, %s\n", u_errorName(errorCode));
            exit(errorCode);
        }

        udata_writeBlock(pData, indexes, sizeof(indexes));
        udata_writeBlock(pData, trieBlock, trieSize);
        udata_writeBlock(pData, mirrors, 4*mirrorTop);
        udata_writeBlock(pData, jgArray, prev-jgStart);

        /* finish up */
        dataLength=udata_finish(pData, &errorCode);
        if(U_FAILURE(errorCode)) {
            fprintf(stderr, "genbidi: error %d writing the output file\n", errorCode);
            exit(errorCode);
        }

        if(dataLength!=indexes[UBIDI_IX_LENGTH]) {
            fprintf(stderr, "genbidi: data length %ld != calculated size %d\n",
                dataLength, (int)indexes[UBIDI_IX_LENGTH]);
            exit(U_INTERNAL_PROGRAM_ERROR);
        }
    }

    utrie_close(pTrie);
    upvec_close(pv);
}
int main(int argc, char** argv)
{
    // Create a value array of all possible code points.
    const UChar32 size = kMaxCodepoint + 1;
    CharacterProperty* values = new CharacterProperty[size];
    memset(values, 0, sizeof(CharacterProperty) * size);

    setRanges(values,
        cjkIdeographRanges, ARRAY_LENGTH(cjkIdeographRanges),
        CharacterProperty::isCJKIdeographOrSymbol);
    setRanges(values,
        cjkSymbolRanges, ARRAY_LENGTH(cjkSymbolRanges),
        CharacterProperty::isCJKIdeographOrSymbol);
    setValues(values,
        cjkIsolatedSymbolsArray, ARRAY_LENGTH(cjkIsolatedSymbolsArray),
        CharacterProperty::isCJKIdeographOrSymbol);

    setRanges(values,
        isUprightInMixedVerticalRanges,
        ARRAY_LENGTH(isUprightInMixedVerticalRanges),
        CharacterProperty::isUprightInMixedVertical);
    setValues(values,
        isUprightInMixedVerticalArray,
        ARRAY_LENGTH(isUprightInMixedVerticalArray),
        CharacterProperty::isUprightInMixedVertical);

    // Create a trie from the value array.
    UErrorCode error = U_ZERO_ERROR;
    UTrie2* trie = utrie2_open(0, 0, &error);
    assert(error == U_ZERO_ERROR);
    UChar32 start = 0;
    CharacterProperty value = values[0];
    for (UChar32 c = 1;; c++) {
        if (c < size && values[c] == value)
            continue;
        if (static_cast<uint32_t>(value)) {
            utrie2_setRange32(trie, start, c - 1,
                static_cast<uint32_t>(value), TRUE, &error);
            assert(error == U_ZERO_ERROR);
        }
        if (c >= size)
            break;
        start = c;
        value = values[start];
    }

    // Freeze and serialize the trie to a byte array.
    utrie2_freeze(trie, UTrie2ValueBits::UTRIE2_16_VALUE_BITS, &error);
    assert(error == U_ZERO_ERROR);
    int32_t serializedSize = utrie2_serialize(trie, nullptr, 0, &error);
    error = U_ZERO_ERROR;
    uint8_t* serialized = new uint8_t[serializedSize];
    serializedSize = utrie2_serialize(trie, serialized, serializedSize, &error);
    assert(error == U_ZERO_ERROR);

    // Write the serialized array to the source file.
    if (argc <= 1) {
        generate(stdout, serializedSize, serialized);
    } else {
        FILE* fp = fopen(argv[1], "wb");
        generate(fp, serializedSize, serialized);
        fclose(fp);
    }

    utrie2_close(trie);

    return 0;
}
Exemple #7
0
extern void
generateData(const char *dataDir, UBool csource) {
    static int32_t indexes[UPROPS_INDEX_COUNT]={
        0, 0, 0, 0,
        0, 0, 0, 0,
        0, 0, 0, 0,
        0, 0, 0, 0
    };
    static uint8_t trieBlock[40000];
    static uint8_t additionalProps[120000];

    UNewDataMemory *pData;
    UErrorCode errorCode=U_ZERO_ERROR;
    uint32_t size = 0;
    int32_t trieSize, additionalPropsSize, offset;
    long dataLength;

    trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize);
        exit(errorCode);
    }

    offset=sizeof(indexes)/4;               /* uint32_t offset to the properties trie */

    /* round up trie size to 4-alignment */
    trieSize=(trieSize+3)&~3;
    offset+=trieSize>>2;
    indexes[UPROPS_PROPS32_INDEX]=          /* set indexes to the same offsets for empty */
    indexes[UPROPS_EXCEPTIONS_INDEX]=       /* structures from the old format version 3 */
    indexes[UPROPS_EXCEPTIONS_TOP_INDEX]=   /* so that less runtime code has to be changed */
    indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset;

    if(beVerbose) {
        printf("trie size in bytes:                    %5u\n", (int)trieSize);
    }

    if(csource) {
        /* write .c file for hardcoded data */
        UTrie trie={ NULL };
        UTrie2 *trie2;
        FILE *f;

        utrie_unserialize(&trie, trieBlock, trieSize, &errorCode);
        if(U_FAILURE(errorCode)) {
            fprintf(
                stderr,
                "genprops error: failed to utrie_unserialize(uprops.icu main trie) - %s\n",
                u_errorName(errorCode));
            exit(errorCode);
        }

        /* use UTrie2 */
        trie2=utrie2_fromUTrie(&trie, 0, &errorCode);
        if(U_FAILURE(errorCode)) {
            fprintf(
                stderr,
                "genprops error: utrie2_fromUTrie() failed - %s\n",
                u_errorName(errorCode));
            exit(errorCode);
        }
        {
            /* delete lead surrogate code unit values */
            UChar lead;
            trie2=utrie2_cloneAsThawed(trie2, &errorCode);
            for(lead=0xd800; lead<0xdc00; ++lead) {
                utrie2_set32ForLeadSurrogateCodeUnit(trie2, lead, trie2->initialValue, &errorCode);
            }
            utrie2_freeze(trie2, UTRIE2_16_VALUE_BITS, &errorCode);
            if(U_FAILURE(errorCode)) {
                fprintf(
                    stderr,
                    "genprops error: deleting lead surrogate code unit values failed - %s\n",
                    u_errorName(errorCode));
                exit(errorCode);
            }
        }

        f=usrc_create(dataDir, "uchar_props_data.c");
        if(f!=NULL) {
            /* unused
            usrc_writeArray(f,
                "static const UVersionInfo formatVersion={",
                dataInfo.formatVersion, 8, 4,
                "};\n\n");
             */
            usrc_writeArray(f,
                "static const UVersionInfo dataVersion={",
                dataInfo.dataVersion, 8, 4,
                "};\n\n");
            usrc_writeUTrie2Arrays(f,
                "static const uint16_t propsTrie_index[%ld]={\n", NULL,
                trie2,
                "\n};\n\n");
            usrc_writeUTrie2Struct(f,
                "static const UTrie2 propsTrie={\n",
                trie2, "propsTrie_index", NULL,
                "};\n\n");

            additionalPropsSize=writeAdditionalData(f, additionalProps, sizeof(additionalProps), indexes);
            size=4*offset+additionalPropsSize;      /* total size of data */

            usrc_writeArray(f,
                "static const int32_t indexes[UPROPS_INDEX_COUNT]={",
                indexes, 32, UPROPS_INDEX_COUNT,
                "};\n\n");
            fclose(f);
        }
        utrie2_close(trie2);
    } else {
        /* write the data */
        pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
                        haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
        if(U_FAILURE(errorCode)) {
            fprintf(stderr, "genprops: unable to create data memory, %s\n", u_errorName(errorCode));
            exit(errorCode);
        }

        additionalPropsSize=writeAdditionalData(NULL, additionalProps, sizeof(additionalProps), indexes);
        size=4*offset+additionalPropsSize;      /* total size of data */

        udata_writeBlock(pData, indexes, sizeof(indexes));
        udata_writeBlock(pData, trieBlock, trieSize);
        udata_writeBlock(pData, additionalProps, additionalPropsSize);

        /* finish up */
        dataLength=udata_finish(pData, &errorCode);
        if(U_FAILURE(errorCode)) {
            fprintf(stderr, "genprops: error %d writing the output file\n", errorCode);
            exit(errorCode);
        }

        if(dataLength!=(long)size) {
            fprintf(stderr, "genprops: data length %ld != calculated size %lu\n",
                dataLength, (unsigned long)size);
            exit(U_INTERNAL_PROGRAM_ERROR);
        }
    }

    if(beVerbose) {
        printf("data size:                            %6lu\n", (unsigned long)size);
    }
}
//  Build the Whole Script Confusable data
//
//     TODO:  Reorganize.  Either get rid of the WSConfusableDataBuilder class,
//                         because everything is local to this one build function anyhow,
//                           OR
//                         break this function into more reasonably sized pieces, with
//                         state in WSConfusableDataBuilder.
//
void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
          int32_t confusablesWSLen, UParseError *pe, UErrorCode &status) 
{
    if (U_FAILURE(status)) {
        return;
    }
    URegularExpression *parseRegexp = NULL;
    int32_t             inputLen    = 0;
    UChar              *input       = NULL;
    int32_t             lineNum     = 0;
    
    UVector            *scriptSets        = NULL;
    uint32_t            rtScriptSetsCount = 2;

    UTrie2             *anyCaseTrie   = NULL;
    UTrie2             *lowerCaseTrie = NULL;

    anyCaseTrie = utrie2_open(0, 0, &status);
    lowerCaseTrie = utrie2_open(0, 0, &status);
    

    // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
    //
    // Reserved TRIE values:
    //   0:  Code point has no whole script confusables.
    //   1:  Code point is of script Common or Inherited.
    //       These code points do not participate in whole script confusable detection.
    //       (This is logically equivalent to saying that they contain confusables in
    //        all scripts)
    //
    // Because Trie values are indexes into the ScriptSets vector, pre-fill
    // vector positions 0 and 1 to avoid conflicts with the reserved values.
    
    scriptSets = new UVector(status);
    if (scriptSets == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        goto cleanup;
    }
    scriptSets->addElement((void *)NULL, status);
    scriptSets->addElement((void *)NULL, status);

    // Convert the user input data from UTF-8 to UChar (UTF-16)
    u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        goto cleanup;
    }
    status = U_ZERO_ERROR;
    input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
    if (input == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        goto cleanup;
    }
    u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);



    parseRegexp = uregex_openC(parseExp, 0, NULL, &status);
    
    // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
    //   given the syntax of the input.
    if (*input == 0xfeff) {
        *input = 0x20;
    }

    // Parse the input, one line per iteration of this loop.
    uregex_setText(parseRegexp, input, inputLen, &status);
    while (uregex_findNext(parseRegexp, &status)) {
        lineNum++;
        UChar  line[200];
        uregex_group(parseRegexp, 0, line, 200, &status);
        if (uregex_start(parseRegexp, 1, &status) >= 0) {
            // this was a blank or comment line.
            continue;
        }
        if (uregex_start(parseRegexp, 8, &status) >= 0) {
            // input file syntax error.
            status = U_PARSE_ERROR;
            goto cleanup;
        }
        if (U_FAILURE(status)) {
            goto cleanup;
        }

        // Pick up the start and optional range end code points from the parsed line.
        UChar32  startCodePoint = SpoofImpl::ScanHex(
            input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
        UChar32  endCodePoint = startCodePoint;
        if (uregex_start(parseRegexp, 3, &status) >=0) {
            endCodePoint = SpoofImpl::ScanHex(
                input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
        }

        // Extract the two script names from the source line.  We need these in an 8 bit
        //   default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
        //   to the ICU u_getPropertyValueEnum() function.  Ugh.
        char  srcScriptName[20];
        char  targScriptName[20];
        extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
        extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
        UScriptCode srcScript  =
            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
        UScriptCode targScript =
            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
        if (U_FAILURE(status)) {
            goto cleanup;
        }
        if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
            status = U_INVALID_FORMAT_ERROR;
            goto cleanup;
        }

        // select the table - (A) any case or (L) lower case only
        UTrie2 *table = anyCaseTrie;
        if (uregex_start(parseRegexp, 7, &status) >= 0) {
            table = lowerCaseTrie;
        }

        // Build the set of scripts containing confusable characters for
        //   the code point(s) specified in this input line.
        // Sanity check that the script of the source code point is the same
        //   as the source script indicated in the input file.  Failure of this check is
        //   an error in the input file.
        // Include the source script in the set (needed for Mixed Script Confusable detection).
        //
        UChar32 cp;
        for (cp=startCodePoint; cp<=endCodePoint; cp++) {
            int32_t setIndex = utrie2_get32(table, cp);
            BuilderScriptSet *bsset = NULL;
            if (setIndex > 0) {
                U_ASSERT(setIndex < scriptSets->size());
                bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
            } else {
                bsset = new BuilderScriptSet();
                if (bsset == NULL) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    goto cleanup;
                }
                bsset->codePoint = cp;
                bsset->trie = table;
                bsset->sset = new ScriptSet();
                setIndex = scriptSets->size();
                bsset->index = setIndex;
                bsset->rindex = 0;
                if (bsset->sset == NULL) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    goto cleanup;
                }
                scriptSets->addElement(bsset, status);
                utrie2_set32(table, cp, setIndex, &status);
            }
            bsset->sset->Union(targScript);
            bsset->sset->Union(srcScript);

            if (U_FAILURE(status)) {
                goto cleanup;
            }
            UScriptCode cpScript = uscript_getScript(cp, &status);
            if (cpScript != srcScript) {
                status = U_INVALID_FORMAT_ERROR;
                goto cleanup;
            }
        }
    }

    // Eliminate duplicate script sets.  At this point we have a separate
    // script set for every code point that had data in the input file.
    //
    // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
    //
    // printf("Number of scriptSets: %d\n", scriptSets->size());
    {
        int32_t duplicateCount = 0;
        rtScriptSetsCount = 2;
        for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
            BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
            if (outerSet->index != static_cast<uint32_t>(outeri)) {
                // This set was already identified as a duplicate.
                //   It will not be allocated a position in the runtime array of ScriptSets.
                continue;
            }
            outerSet->rindex = rtScriptSetsCount++;
            for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
                BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
                if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
                    delete innerSet->sset;
                    innerSet->scriptSetOwned = FALSE;
                    innerSet->sset = outerSet->sset;
                    innerSet->index = outeri;
                    innerSet->rindex = outerSet->rindex;
                    duplicateCount++;
                }
                // But this doesn't get all.  We need to fix the TRIE.
            }
        }
        // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
    }

    

    // Update the Trie values to be reflect the run time script indexes (after duplicate merging).
    //    (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
    //     are unused, which is why the loop index starts at 2.)
    {
        for (int32_t i=2; i<scriptSets->size(); i++) {
            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
            if (bSet->rindex != (uint32_t)i) {
                utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
            }
        }
    }

    // For code points with script==Common or script==Inherited,
    //   Set the reserved value of 1 into both Tries.  These characters do not participate
    //   in Whole Script Confusable detection; this reserved value is the means
    //   by which they are detected.
    {
        UnicodeSet ignoreSet;
        ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
        UnicodeSet inheritedSet;
        inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
        ignoreSet.addAll(inheritedSet);
        for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
            UChar32 rangeStart = ignoreSet.getRangeStart(rn);
            UChar32 rangeEnd   = ignoreSet.getRangeEnd(rn);
            utrie2_setRange32(anyCaseTrie,   rangeStart, rangeEnd, 1, TRUE, &status);
            utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
        }
    }

    // Serialize the data to the Spoof Detector
    {
        utrie2_freeze(anyCaseTrie,   UTRIE2_16_VALUE_BITS, &status);
        int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
        // printf("Any case Trie size: %d\n", size);
        if (status != U_BUFFER_OVERFLOW_ERROR) {
            goto cleanup;
        }
        status = U_ZERO_ERROR;
        spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
        spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
        spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
        void *where = spImpl->fSpoofData->reserveSpace(size, status);
        utrie2_serialize(anyCaseTrie, where, size, &status);
        
        utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
        size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
        // printf("Lower case Trie size: %d\n", size);
        if (status != U_BUFFER_OVERFLOW_ERROR) {
            goto cleanup;
        }
        status = U_ZERO_ERROR;
        spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
        spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
        spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
        where = spImpl->fSpoofData->reserveSpace(size, status);
        utrie2_serialize(lowerCaseTrie, where, size, &status);

        spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
        spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
        ScriptSet *rtScriptSets =  static_cast<ScriptSet *>
            (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
        uint32_t rindex = 2;
        for (int32_t i=2; i<scriptSets->size(); i++) {
            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
            if (bSet->rindex < rindex) {
                // We have already copied this script set to the serialized data.
                continue;
            }
            U_ASSERT(rindex == bSet->rindex);
            rtScriptSets[rindex] = *bSet->sset;   // Assignment of a ScriptSet just copies the bits.
            rindex++;
        }
    }

    // Open new utrie2s from the serialized data.  We don't want to keep the ones
    //   we just built because we would then have two copies of the data, one internal to
    //   the utries that we have already constructed, and one in the serialized data area.
    //   An alternative would be to not pre-serialize the Trie data, but that makes the
    //   spoof detector data different, depending on how the detector was constructed.
    //   It's simpler to keep the data always the same.
    
    spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
            UTRIE2_16_VALUE_BITS,
            (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
            spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
            NULL,
            &status);

    spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
            UTRIE2_16_VALUE_BITS,
            (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
            spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
            NULL,
            &status);

    

cleanup:
    if (U_FAILURE(status)) {
        pe->line = lineNum;
    }
    uregex_close(parseRegexp);
    uprv_free(input);

    int32_t i;
    for (i=0; i<scriptSets->size(); i++) {
        BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
        delete bsset;
    }
    delete scriptSets;
    utrie2_close(anyCaseTrie);
    utrie2_close(lowerCaseTrie);
    return;
}
U_CFUNC int32_t
writeAdditionalData(FILE *f, uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_COUNT]) {
    const uint32_t *pvArray;
    int32_t pvRows, pvCount;
    int32_t length;
    UErrorCode errorCode;

    pvArray=upvec_getArray(pv, &pvRows, NULL);
    pvCount=pvRows*UPROPS_VECTOR_WORDS;

    errorCode=U_ZERO_ERROR;
    length=utrie_serialize(newTrie, p, capacity, NULL, TRUE, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "genprops error: unable to serialize trie for additional properties: %s\n", u_errorName(errorCode));
        exit(errorCode);
    }
    if(p!=NULL) {
        if(beVerbose) {
            printf("size in bytes of additional props trie:%5u\n", (int)length);
        }
        if(f!=NULL) {
            UTrie trie={ NULL };
            UTrie2 *trie2;

            utrie_unserialize(&trie, p, length, &errorCode);
            if(U_FAILURE(errorCode)) {
                fprintf(
                    stderr,
                    "genprops error: failed to utrie_unserialize(trie for additional properties) - %s\n",
                    u_errorName(errorCode));
                exit(errorCode);
            }

            /* use UTrie2 */
            trie2=utrie2_fromUTrie(&trie, trie.initialValue, &errorCode);
            if(U_FAILURE(errorCode)) {
                fprintf(
                    stderr,
                    "genprops error: utrie2_fromUTrie() failed - %s\n",
                    u_errorName(errorCode));
                exit(errorCode);
            }
            {
                /* delete lead surrogate code unit values */
                UChar lead;
                trie2=utrie2_cloneAsThawed(trie2, &errorCode);
                for(lead=0xd800; lead<0xdc00; ++lead) {
                    utrie2_set32ForLeadSurrogateCodeUnit(trie2, lead, trie2->initialValue, &errorCode);
                }
                utrie2_freeze(trie2, UTRIE2_16_VALUE_BITS, &errorCode);
                if(U_FAILURE(errorCode)) {
                    fprintf(
                        stderr,
                        "genbidi error: deleting lead surrogate code unit values failed - %s\n",
                        u_errorName(errorCode));
                    exit(errorCode);
                }
            }

            usrc_writeUTrie2Arrays(f,
                "static const uint16_t propsVectorsTrie_index[%ld]={\n", NULL,
                trie2,
                "\n};\n\n");
            usrc_writeUTrie2Struct(f,
                "static const UTrie2 propsVectorsTrie={\n",
                trie2, "propsVectorsTrie_index", NULL,
                "};\n\n");

            utrie2_close(trie2);
        }

        p+=length;
        capacity-=length;

        /* set indexes */
        indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=
            indexes[UPROPS_ADDITIONAL_TRIE_INDEX]+length/4;
        indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS;
        indexes[UPROPS_RESERVED_INDEX]=
            indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount;

        indexes[UPROPS_MAX_VALUES_INDEX]=
            (((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)|
            (((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
            (((int32_t)USCRIPT_CODE_LIMIT-1)&UPROPS_SCRIPT_MASK);
        indexes[UPROPS_MAX_VALUES_2_INDEX]=
            (((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)|
            (((int32_t)U_SB_COUNT-1)<<UPROPS_SB_SHIFT)|
            (((int32_t)U_WB_COUNT-1)<<UPROPS_WB_SHIFT)|
            (((int32_t)U_GCB_COUNT-1)<<UPROPS_GCB_SHIFT)|
            ((int32_t)U_DT_COUNT-1);
    }

    if(p!=NULL && (pvCount*4)<=capacity) {
        if(f!=NULL) {
            usrc_writeArray(f,
                "static const uint32_t propsVectors[%ld]={\n",
                pvArray, 32, pvCount,
                "};\n\n");
            fprintf(f, "static const int32_t countPropsVectors=%ld;\n", (long)pvCount);
            fprintf(f, "static const int32_t propsVectorsColumns=%ld;\n", (long)indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]);
        } else {
            uprv_memcpy(p, pvArray, pvCount*4);
        }
        if(beVerbose) {
            printf("number of additional props vectors:    %5u\n", (int)pvRows);
            printf("number of 32-bit words per vector:     %5u\n", UPROPS_VECTOR_WORDS);
        }
    }
    length+=pvCount*4;

    return length;
}