Example #1
0
/**
 * Implements {@link Transliterator#handleTransliterate}.
 */
void BreakTransliterator::handleTransliterate(Replaceable & text, UTransPosition & offsets,
        UBool isIncremental) const
{

	UErrorCode status = U_ZERO_ERROR;
	boundaries->removeAllElements();
	BreakTransliterator * nonConstThis = (BreakTransliterator *)this;
	nonConstThis->getBreakIterator(); // Lazy-create it if necessary
	UnicodeString sText = replaceableAsString(text);
	bi->setText(sText);
	bi->preceding(offsets.start);

	// To make things much easier, we will stack the boundaries, and then insert at the end.
	// generally, we won't need too many, since we will be filtered.

	int32_t boundary;
	for (boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next())
	{
		if (boundary == 0) { continue; }
		// HACK: Check to see that preceeding item was a letter

		UChar32 cp = sText.char32At(boundary - 1);
		int type = u_charType(cp);
		//System.out.println(Integer.toString(cp,16) + " (before): " + type);
		if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) { continue; }

		cp = sText.char32At(boundary);
		type = u_charType(cp);
		//System.out.println(Integer.toString(cp,16) + " (after): " + type);
		if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) { continue; }

		boundaries->addElement(boundary, status);
		// printf("Boundary at %d\n", boundary);
	}

	int delta = 0;
	int lastBoundary = 0;

	if (boundaries->size() != 0)   // if we found something, adjust
	{
		delta = boundaries->size() * fInsertion.length();
		lastBoundary = boundaries->lastElementi();

		// we do this from the end backwards, so that we don't have to keep updating.

		while (boundaries->size() > 0)
		{
			boundary = boundaries->popi();
			text.handleReplaceBetween(boundary, boundary, fInsertion);
		}
	}

	// Now fix up the return values
	offsets.contextLimit += delta;
	offsets.limit += delta;
	offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;

	// TODO:  do something with U_FAILURE(status);
	//        (need to look at transliterators overall, not just here.)
}
Example #2
0
static void U_CALLCONV
unicodeDataLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    char *end;
    UErrorCode errorCode;
    UChar32 c;

    errorCode=U_ZERO_ERROR;

    /* get the character code, field 0 */
    c=(UChar32)uprv_strtoul(fields[0][0], &end, 16);
    if(end<=fields[0][0] || end!=fields[0][1]) {
        fprintf(stderr, "genbidi: syntax error in field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get Mirrored flag, field 9 */
    if(*fields[9][0]=='Y') {
        upvec_setValue(pv, c, c, 0, U_MASK(UBIDI_IS_MIRRORED_SHIFT), U_MASK(UBIDI_IS_MIRRORED_SHIFT), &errorCode);
        if(U_FAILURE(*pErrorCode)) {
            fprintf(stderr, "genbidi error: unable to set 'is mirrored' for U+%04lx, code: %s\n",
                            (long)c, u_errorName(errorCode));
            exit(errorCode);
        }
    } else if(fields[9][1]-fields[9][0]!=1 || *fields[9][0]!='N') {
        fprintf(stderr, "genbidi: syntax error in field 9 at U+%04lx\n",
            (long)c);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
}
Example #3
0
static inline bool shouldKeepAfter(UChar lastCh, UChar ch, UChar nextCh)
{
    UChar preCh = U_MASK(u_charType(ch)) & U_GC_M_MASK ? lastCh : ch;
    return U_MASK(u_charType(preCh)) & (U_GC_L_MASK | U_GC_N_MASK)
        && !WTF::Unicode::hasLineBreakingPropertyComplexContext(preCh)
        && U_MASK(u_charType(nextCh)) & (U_GC_L_MASK | U_GC_N_MASK)
        && !WTF::Unicode::hasLineBreakingPropertyComplexContext(nextCh);
}
void
BiDiPropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
                           UErrorCode &errorCode) {
    if(U_FAILURE(errorCode) || newValues.containsNone(relevantProps)) { return; }

    UChar32 start=props.start;
    UChar32 end=props.end;

    // The runtime code relies on this invariant for returning both bmg and bpb
    // from the same data.
    int32_t bpt=props.getIntProp(UCHAR_BIDI_PAIRED_BRACKET_TYPE);
    if(!(bpt==0 ? props.bpb==U_SENTINEL : props.bpb==props.bmg)) {
        fprintf(stderr,
                "genprops error: invariant not true: "
                "if(bpt==None) then bpb=<none> else bpb=bmg\n");
        return;
    }
    int32_t delta=encodeBidiMirroringGlyph(start, end, props.bmg, errorCode);
    uint32_t value=(uint32_t)delta<<UBIDI_MIRROR_DELTA_SHIFT;
    if(props.binProps[UCHAR_BIDI_MIRRORED]) {
        value|=U_MASK(UBIDI_IS_MIRRORED_SHIFT);
    }
    if(props.binProps[UCHAR_BIDI_CONTROL]) {
        value|=U_MASK(UBIDI_BIDI_CONTROL_SHIFT);
    }
    if(props.binProps[UCHAR_JOIN_CONTROL]) {
        value|=U_MASK(UBIDI_JOIN_CONTROL_SHIFT);
    }
    value|=(uint32_t)bpt<<UBIDI_BPT_SHIFT;
    value|=(uint32_t)props.getIntProp(UCHAR_JOINING_TYPE)<<UBIDI_JT_SHIFT;
    value|=(uint32_t)props.getIntProp(UCHAR_BIDI_CLASS);
    utrie2_setRange32(pTrie, start, end, value, TRUE, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "genprops error: BiDiPropsBuilder utrie2_setRange32() failed - %s\n",
                u_errorName(errorCode));
        return;
    }

    // Store Joining_Group values from vector column 1 in simple byte arrays.
    int32_t jg=props.getIntProp(UCHAR_JOINING_GROUP);
    for(UChar32 c=start; c<=end; ++c) {
        int32_t jgStart;
        if(MIN_JG_START<=c && c<MAX_JG_LIMIT) {
            jgArray[c-MIN_JG_START]=(uint8_t)jg;
        } else if(MIN_JG_START2<=c && c<MAX_JG_LIMIT2) {
            jgArray2[c-MIN_JG_START2]=(uint8_t)jg;
        } else if(jg!=U_JG_NO_JOINING_GROUP) {
            fprintf(stderr, "genprops error: Joining_Group for out-of-range code points U+%04lx..U+%04lx\n",
                    (long)start, (long)end);
            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
    }
}
Example #5
0
static void U_CALLCONV
binariesLineFn(void *context,
               char *fields[][2], int32_t fieldCount,
               UErrorCode *pErrorCode) {
    const Binaries *bin;
    char *s;
    uint32_t start, end, uv;
    int32_t i;

    bin=(const Binaries *)context;

    u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
        exit(*pErrorCode);
    }

    /* parse binary property name */
    s=(char *)u_skipWhitespace(fields[1][0]);
    for(i=0;; ++i) {
        if(i==bin->binariesCount) {
            /* ignore unrecognized properties */
            if(beVerbose) {
                addIgnoredProp(s, fields[1][1]);
            }
            return;
        }
        if(isToken(bin->binaries[i].propName, s)) {
            break;
        }
    }

    if(bin->binaries[i].vecShift>=32) {
        fprintf(stderr, "genprops error: shift value %d>=32 for %s %s\n",
                        (int)bin->binaries[i].vecShift, bin->ucdFile, bin->binaries[i].propName);
        exit(U_INTERNAL_PROGRAM_ERROR);
    }
    uv=U_MASK(bin->binaries[i].vecShift);

    if(start==0 && end==0x10ffff) {
        /* Also set bits for initialValue and errorValue. */
        end=UPVEC_MAX_CP;
    }
    upvec_setValue(pv, start, end, bin->binaries[i].vecWord, uv, uv, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops error: unable to set %s code: %s\n",
                        bin->binaries[i].propName, u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}
Example #6
0
      symbol_type operator()(const symbol_type& symbol) const
      {
	const std::string& word = static_cast<const std::string&>(symbol);
	icu::UnicodeString uword = icu::UnicodeString::fromUTF8(icu::StringPiece(word.data(), word.size()));
	
	Unicode& impl = const_cast<Unicode&>(*this);
	
	bool dg = false;
	uint32_t gc = 0;
	uscript_type sc(script_.size(), false);
	
	icu::StringCharacterIterator iter(uword);
	for (iter.setToStart(); iter.hasNext(); /**/) {
	  const UChar32 ch = iter.next32PostInc();

	  dg |= (u_getNumericValue(ch) != U_NO_NUMERIC_VALUE);
	  gc |= u_getIntPropertyValue(ch, UCHAR_GENERAL_CATEGORY_MASK);
	  sc[u_getIntPropertyValue(ch, UCHAR_SCRIPT)] = true;
	}
	
	std::string signature = "<unk";

	for (int i = 1; i < U_CHAR_CATEGORY_COUNT; ++ i)
	  if (gc & U_MASK(i)) {
	    signature += "-";
	    signature += general_category_[i];
	  }
	
	for (int i = 1; i < USCRIPT_CODE_LIMIT; ++ i)
	  if (sc[i]) {
	    signature += "-";
	    signature += script_[i];
	  }
	
	if (dg)
	  signature += "-NUM";
	
	signature += '>';
	
	return signature;
      }
Example #7
0
U_CAPI UBool U_EXPORT2
u_isUWhiteSpace(UChar32 c) {
    return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_WHITE_SPACE))!=0;
}
Example #8
0
U_CAPI UBool U_EXPORT2
u_isUAlphabetic(UChar32 c) {
    return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_ALPHABETIC))!=0;
}
Example #9
0
static UBool isRegionalIndicator(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
    // Property starts are a subset of lb=RI etc.
    return 0x1F1E6<=c && c<=0x1F1FF;
}

static const BinaryProperty binProps[UCHAR_BINARY_LIMIT]={
    /*
     * column and mask values for binary properties from u_getUnicodeProperties().
     * Must be in order of corresponding UProperty,
     * and there must be exactly one entry per binary UProperty.
     *
     * Properties with mask==0 are handled in code.
     * For them, column is the UPropertySource value.
     */
    { 1,                U_MASK(UPROPS_ALPHABETIC), defaultContains },
    { 1,                U_MASK(UPROPS_ASCII_HEX_DIGIT), defaultContains },
    { UPROPS_SRC_BIDI,  0, isBidiControl },
    { UPROPS_SRC_BIDI,  0, isMirrored },
    { 1,                U_MASK(UPROPS_DASH), defaultContains },
    { 1,                U_MASK(UPROPS_DEFAULT_IGNORABLE_CODE_POINT), defaultContains },
    { 1,                U_MASK(UPROPS_DEPRECATED), defaultContains },
    { 1,                U_MASK(UPROPS_DIACRITIC), defaultContains },
    { 1,                U_MASK(UPROPS_EXTENDER), defaultContains },
    { UPROPS_SRC_NFC,   0, hasFullCompositionExclusion },
    { 1,                U_MASK(UPROPS_GRAPHEME_BASE), defaultContains },
    { 1,                U_MASK(UPROPS_GRAPHEME_EXTEND), defaultContains },
    { 1,                U_MASK(UPROPS_GRAPHEME_LINK), defaultContains },
    { 1,                U_MASK(UPROPS_HEX_DIGIT), defaultContains },
    { 1,                U_MASK(UPROPS_HYPHEN), defaultContains },
    { 1,                U_MASK(UPROPS_ID_CONTINUE), defaultContains },
Example #10
0
static inline bool isUnicodeCategoryLetterOrNumber(UChar lastCh, UChar ch)
{
    UChar32 ch32 = U16_IS_LEAD(lastCh) && U16_IS_TRAIL(ch) ? U16_GET_SUPPLEMENTARY(lastCh, ch) : ch;
    return (U_MASK(u_charType(ch32)) & (U_GC_L_MASK | U_GC_N_MASK));
}
Example #11
0
    const char *propName;
    int32_t vecWord;
    uint32_t vecValue, vecMask;
};
typedef struct Binary Binary;

struct Binaries {
    const char *ucdFile;
    const Binary *binaries;
    int32_t binariesCount;
};
typedef struct Binaries Binaries;

static const Binary
propListNames[]={
    { "Bidi_Control",                       0, U_MASK(UBIDI_BIDI_CONTROL_SHIFT), U_MASK(UBIDI_BIDI_CONTROL_SHIFT) },
    { "Join_Control",                       0, U_MASK(UBIDI_JOIN_CONTROL_SHIFT), U_MASK(UBIDI_JOIN_CONTROL_SHIFT) }
};

static const Binaries
propListBinaries={
    "PropList", propListNames, LENGTHOF(propListNames)
};

static void U_CALLCONV
binariesLineFn(void *context,
               char *fields[][2], int32_t fieldCount,
               UErrorCode *pErrorCode) {
    const Binaries *bin;
    char *s;
    uint32_t start, end;
Example #12
0
static const Binary
propListNames[]={
    { "Soft_Dotted",                        0, UCASE_SOFT_DOTTED,   UCASE_DOT_MASK }
};

static const Binaries
propListBinaries={
    "PropList", propListNames, LENGTHOF(propListNames)
};

static const Binary
derCorePropsNames[]={
    { "Lowercase",                          0, UCASE_LOWER,         UCASE_TYPE_MASK },
    { "Uppercase",                          0, UCASE_UPPER,         UCASE_TYPE_MASK },
    /* Unicode 5.2 adds Case_Ignorable as a public property. See comments in store.c. */
    { "Case_Ignorable",                     1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) }
};

static const Binaries
derCorePropsBinaries={
    "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames)
};

/*
 * Treat Word_Break=MidLetter and MidNumLet as a single binary property.
 * We need not distinguish between them because both add to case-ignorable.
 * We ignore all other Word_Break values.
 */
static const Binary
wordBreakNames[]={
    { "MidLetter",                          1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) },
Example #13
0
/* general properties API functions ----------------------------------------- */

static const struct {
    int32_t column;
    uint32_t mask;
} binProps[UCHAR_BINARY_LIMIT]={
    /*
     * column and mask values for binary properties from u_getUnicodeProperties().
     * Must be in order of corresponding UProperty,
     * and there must be exactly one entry per binary UProperty.
     *
     * Properties with mask 0 are handled in code.
     * For them, column is the UPropertySource value.
     */
    {  1,               U_MASK(UPROPS_ALPHABETIC) },
    {  1,               U_MASK(UPROPS_ASCII_HEX_DIGIT) },
    { UPROPS_SRC_BIDI,  0 },                                    /* UCHAR_BIDI_CONTROL */
    { UPROPS_SRC_BIDI,  0 },                                    /* UCHAR_BIDI_MIRRORED */
    {  1,               U_MASK(UPROPS_DASH) },
    {  1,               U_MASK(UPROPS_DEFAULT_IGNORABLE_CODE_POINT) },
    {  1,               U_MASK(UPROPS_DEPRECATED) },
    {  1,               U_MASK(UPROPS_DIACRITIC) },
    {  1,               U_MASK(UPROPS_EXTENDER) },
    { UPROPS_SRC_NFC,  0 },                                     /* UCHAR_FULL_COMPOSITION_EXCLUSION */
    {  1,               U_MASK(UPROPS_GRAPHEME_BASE) },
    {  1,               U_MASK(UPROPS_GRAPHEME_EXTEND) },
    {  1,               U_MASK(UPROPS_GRAPHEME_LINK) },
    {  1,               U_MASK(UPROPS_HEX_DIGIT) },
    {  1,               U_MASK(UPROPS_HYPHEN) },
    {  1,               U_MASK(UPROPS_ID_CONTINUE) },
extern void
setProps(Props *p) {
    UErrorCode errorCode;
    uint32_t value, oldValue;
    int32_t delta;
    UBool isCaseIgnorable;

    /* get the non-UnicodeData.txt properties */
    value=oldValue=upvec_getValue(pv, p->code, 0);

    /* default: map to self */
    delta=0;

    if(p->gc==U_TITLECASE_LETTER) {
        /* the Titlecase property is read late, from UnicodeData.txt */
        value|=UCASE_TITLE;
    }

    if(p->upperCase!=0) {
        /* uppercase mapping as delta if the character is lowercase */
        if((value&UCASE_TYPE_MASK)==UCASE_LOWER) {
            delta=p->upperCase-p->code;
        } else {
            value|=UCASE_EXCEPTION;
        }
    }
    if(p->lowerCase!=0) {
        /* lowercase mapping as delta if the character is uppercase or titlecase */
        if((value&UCASE_TYPE_MASK)>=UCASE_UPPER) {
            delta=p->lowerCase-p->code;
        } else {
            value|=UCASE_EXCEPTION;
        }
    }
    if(p->upperCase!=p->titleCase) {
        value|=UCASE_EXCEPTION;
    }
    if(p->closure[0]!=0) {
        value|=UCASE_EXCEPTION;
    }
    if(p->specialCasing!=NULL) {
        value|=UCASE_EXCEPTION;
    }
    if(p->caseFolding!=NULL) {
        value|=UCASE_EXCEPTION;
    }

    if(delta<UCASE_MIN_DELTA || UCASE_MAX_DELTA<delta) {
        value|=UCASE_EXCEPTION;
    }

    if(p->cc!=0) {
        if(value&UCASE_DOT_MASK) {
            fprintf(stderr, "gencase: a soft-dotted character has cc!=0\n");
            exit(U_INTERNAL_PROGRAM_ERROR);
        }
        if(p->cc==230) {
            value|=UCASE_ABOVE;
        } else {
            value|=UCASE_OTHER_ACCENT;
        }
    }

    /* encode case-ignorable as delta==1 on uncased characters */
    isCaseIgnorable=FALSE;
    if((value&UCASE_TYPE_MASK)==UCASE_NONE) {
        if(ucdVersion>=UNI_4_1) {
            /*
             * Unicode 4.1 and up: (D47a) Word_Break=MidLetter or Mn, Me, Cf, Lm, Sk
             * Unicode 5.1 and up: Word_Break=(MidLetter or MidNumLet) or Mn, Me, Cf, Lm, Sk
             *   The UGENCASE_IS_MID_LETTER_SHIFT bit is set for both WB=MidLetter and WB=MidNumLet.
             */
            if(
                (U_MASK(p->gc)&(U_GC_MN_MASK|U_GC_ME_MASK|U_GC_CF_MASK|U_GC_LM_MASK|U_GC_SK_MASK))!=0 ||
                (upvec_getValue(pv, p->code, 1)&U_MASK(UGENCASE_IS_MID_LETTER_SHIFT))!=0
            ) {
                isCaseIgnorable=TRUE;
            }
        } else {
            /* before Unicode 4.1: Mn, Me, Cf, Lm, Sk or 0027 or 00AD or 2019 */
            if(
                (U_MASK(p->gc)&(U_GC_MN_MASK|U_GC_ME_MASK|U_GC_CF_MASK|U_GC_LM_MASK|U_GC_SK_MASK))!=0 ||
                p->code==0x27 || p->code==0xad || p->code==0x2019
            ) {
                isCaseIgnorable=TRUE;
            }
        }
    }

    if(isCaseIgnorable && p->code!=0x307) {
        /*
         * We use one of the delta/exception bits, which works because we only
         * store the case-ignorable flag for uncased characters.
         * There is no delta for uncased characters (see checks above).
         * If there is an exception for an uncased, case-ignorable character
         * (although there should not be any case mappings if it's uncased)
         * then we have a problem.
         * There is one character which is case-ignorable but has an exception:
         * U+0307 is uncased, Mn, has conditional special casing and
         * is therefore handled in code instead.
         */
        if(value&UCASE_EXCEPTION) {
            fprintf(stderr, "gencase error: unable to encode case-ignorable for U+%04lx with exceptions\n",
                            (unsigned long)p->code);
            exit(U_INTERNAL_PROGRAM_ERROR);
        }

        delta=1;
    }

    /* handle exceptions */
    if(value&UCASE_EXCEPTION) {
        /* simply store exceptions for later processing and encoding */
        value|=(uint32_t)exceptionsCount<<UGENCASE_EXC_SHIFT;
        uprv_memcpy(excProps+exceptionsCount, p, sizeof(*p));
        if(++exceptionsCount==MAX_EXC_COUNT) {
            fprintf(stderr, "gencase: too many exceptions\n");
            exit(U_INDEX_OUTOFBOUNDS_ERROR);
        }
    } else {
        /* store the simple case mapping delta */
        value|=((uint32_t)delta<<UCASE_DELTA_SHIFT)&UCASE_DELTA_MASK;
    }

    errorCode=U_ZERO_ERROR;
    if(value!=oldValue) {
        upvec_setValue(pv, p->code, p->code, 0, value, 0xffffffff, &errorCode);
        if(U_FAILURE(errorCode)) {
            fprintf(stderr, "gencase error: unable to set case mapping values, code: %s\n",
                            u_errorName(errorCode));
            exit(errorCode);
        }
    }

    /* add the multi-character case folding to the "unfold" data */
    if(p->caseFolding!=NULL) {
        int32_t length=p->caseFolding->full[0];
        if(length>1 && u_strHasMoreChar32Than(p->caseFolding->full+1, length, 1)) {
            addUnfolding(p->code, p->caseFolding->full+1, length);
        }
    }
}