/** * Implements {@link Transliterator#handleTransliterate}. */ void BreakTransliterator::handleTransliterate(Replaceable & text, UTransPosition & offsets, UBool isIncremental) const { UErrorCode status = U_ZERO_ERROR; boundaries->removeAllElements(); BreakTransliterator * nonConstThis = (BreakTransliterator *)this; nonConstThis->getBreakIterator(); // Lazy-create it if necessary UnicodeString sText = replaceableAsString(text); bi->setText(sText); bi->preceding(offsets.start); // To make things much easier, we will stack the boundaries, and then insert at the end. // generally, we won't need too many, since we will be filtered. int32_t boundary; for (boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) { if (boundary == 0) { continue; } // HACK: Check to see that preceeding item was a letter UChar32 cp = sText.char32At(boundary - 1); int type = u_charType(cp); //System.out.println(Integer.toString(cp,16) + " (before): " + type); if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) { continue; } cp = sText.char32At(boundary); type = u_charType(cp); //System.out.println(Integer.toString(cp,16) + " (after): " + type); if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) { continue; } boundaries->addElement(boundary, status); // printf("Boundary at %d\n", boundary); } int delta = 0; int lastBoundary = 0; if (boundaries->size() != 0) // if we found something, adjust { delta = boundaries->size() * fInsertion.length(); lastBoundary = boundaries->lastElementi(); // we do this from the end backwards, so that we don't have to keep updating. while (boundaries->size() > 0) { boundary = boundaries->popi(); text.handleReplaceBetween(boundary, boundary, fInsertion); } } // Now fix up the return values offsets.contextLimit += delta; offsets.limit += delta; offsets.start = isIncremental ? lastBoundary + delta : offsets.limit; // TODO: do something with U_FAILURE(status); // (need to look at transliterators overall, not just here.) }
/**
 * Per-line callback that handles one parsed line of character data.
 *
 * Field 0 must be a hexadecimal code point that fills the whole field.
 * Field 9 is the Mirrored flag: if it starts with 'Y', the "is mirrored" bit
 * is set for the code point in the properties vector pv; otherwise the field
 * must be exactly "N". Any violation prints a message and terminates the
 * program.
 *
 * @param context     unused
 * @param fields      array of [start, limit) pointer pairs, one per field
 * @param fieldCount  unused
 * @param pErrorCode  set to U_PARSE_ERROR before exiting on a syntax error
 */
static void U_CALLCONV
unicodeDataLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    char *end;
    UErrorCode errorCode;
    UChar32 c;

    errorCode=U_ZERO_ERROR;

    /* get the character code, field 0 */
    c=(UChar32)uprv_strtoul(fields[0][0], &end, 16);
    if(end<=fields[0][0] || end!=fields[0][1]) {
        fprintf(stderr, "genbidi: syntax error in field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get Mirrored flag, field 9 */
    if(*fields[9][0]=='Y') {
        upvec_setValue(pv, c, c, 0, U_MASK(UBIDI_IS_MIRRORED_SHIFT), U_MASK(UBIDI_IS_MIRRORED_SHIFT), &errorCode);
        /*
         * upvec_setValue() reports into the local errorCode, so that is the
         * variable that must be tested here (not *pErrorCode).
         */
        if(U_FAILURE(errorCode)) {
            fprintf(stderr, "genbidi error: unable to set 'is mirrored' for U+%04lx, code: %s\n",
                            (long)c, u_errorName(errorCode));
            exit(errorCode);
        }
    } else if(fields[9][1]-fields[9][0]!=1 || *fields[9][0]!='N') {
        fprintf(stderr, "genbidi: syntax error in field 9 at U+%04lx\n", (long)c);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
}
// Decides from three neighbouring code units whether the position after |ch|
// qualifies: both the preceding character and |nextCh| must be a letter or
// number (general categories L or N) and must not have the complex-context
// line breaking property. When |ch| is a mark (category M), |lastCh| is used
// as the preceding character instead of |ch|.
static inline bool shouldKeepAfter(UChar lastCh, UChar ch, UChar nextCh)
{
    const uint32_t letterOrNumber = U_GC_L_MASK | U_GC_N_MASK;

    // A mark defers to the character before it.
    UChar preCh = ch;
    if ((U_MASK(u_charType(ch)) & U_GC_M_MASK) != 0)
        preCh = lastCh;

    if ((U_MASK(u_charType(preCh)) & letterOrNumber) == 0)
        return false;
    if (WTF::Unicode::hasLineBreakingPropertyComplexContext(preCh))
        return false;
    if ((U_MASK(u_charType(nextCh)) & letterOrNumber) == 0)
        return false;
    return !WTF::Unicode::hasLineBreakingPropertyComplexContext(nextCh);
}
void BiDiPropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues, UErrorCode &errorCode) { if(U_FAILURE(errorCode) || newValues.containsNone(relevantProps)) { return; } UChar32 start=props.start; UChar32 end=props.end; // The runtime code relies on this invariant for returning both bmg and bpb // from the same data. int32_t bpt=props.getIntProp(UCHAR_BIDI_PAIRED_BRACKET_TYPE); if(!(bpt==0 ? props.bpb==U_SENTINEL : props.bpb==props.bmg)) { fprintf(stderr, "genprops error: invariant not true: " "if(bpt==None) then bpb=<none> else bpb=bmg\n"); return; } int32_t delta=encodeBidiMirroringGlyph(start, end, props.bmg, errorCode); uint32_t value=(uint32_t)delta<<UBIDI_MIRROR_DELTA_SHIFT; if(props.binProps[UCHAR_BIDI_MIRRORED]) { value|=U_MASK(UBIDI_IS_MIRRORED_SHIFT); } if(props.binProps[UCHAR_BIDI_CONTROL]) { value|=U_MASK(UBIDI_BIDI_CONTROL_SHIFT); } if(props.binProps[UCHAR_JOIN_CONTROL]) { value|=U_MASK(UBIDI_JOIN_CONTROL_SHIFT); } value|=(uint32_t)bpt<<UBIDI_BPT_SHIFT; value|=(uint32_t)props.getIntProp(UCHAR_JOINING_TYPE)<<UBIDI_JT_SHIFT; value|=(uint32_t)props.getIntProp(UCHAR_BIDI_CLASS); utrie2_setRange32(pTrie, start, end, value, TRUE, &errorCode); if(U_FAILURE(errorCode)) { fprintf(stderr, "genprops error: BiDiPropsBuilder utrie2_setRange32() failed - %s\n", u_errorName(errorCode)); return; } // Store Joining_Group values from vector column 1 in simple byte arrays. int32_t jg=props.getIntProp(UCHAR_JOINING_GROUP); for(UChar32 c=start; c<=end; ++c) { int32_t jgStart; if(MIN_JG_START<=c && c<MAX_JG_LIMIT) { jgArray[c-MIN_JG_START]=(uint8_t)jg; } else if(MIN_JG_START2<=c && c<MAX_JG_LIMIT2) { jgArray2[c-MIN_JG_START2]=(uint8_t)jg; } else if(jg!=U_JG_NO_JOINING_GROUP) { fprintf(stderr, "genprops error: Joining_Group for out-of-range code points U+%04lx..U+%04lx\n", (long)start, (long)end); errorCode=U_ILLEGAL_ARGUMENT_ERROR; return; } } }
/**
 * Per-line callback for a file of binary properties.
 *
 * Parses the code point range in field 0 and the property name in field 1,
 * looks the name up in the Binaries table passed as context, and sets the
 * matching bit for the range in the properties vector pv. Names that are not
 * in the table are ignored (and recorded via addIgnoredProp() when beVerbose
 * is set). Parse and storage errors print a message and terminate the program.
 *
 * @param context     pointer to the const Binaries table to search
 * @param fields      array of [start, limit) pointer pairs, one per field
 * @param fieldCount  unused
 * @param pErrorCode  in/out ICU error code
 */
static void U_CALLCONV
binariesLineFn(void *context,
               char *fields[][2], int32_t fieldCount,
               UErrorCode *pErrorCode) {
    const Binaries *table=(const Binaries *)context;
    char *name;
    uint32_t first, last, bit;
    int32_t idx;

    u_parseCodePointRange(fields[0][0], &first, &last, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", table->ucdFile, fields[0][0]);
        exit(*pErrorCode);
    }

    /* parse binary property name */
    name=(char *)u_skipWhitespace(fields[1][0]);
    idx=0;
    while(idx!=table->binariesCount && !isToken(table->binaries[idx].propName, name)) {
        ++idx;
    }
    if(idx==table->binariesCount) {
        /* ignore unrecognized properties */
        if(beVerbose) {
            addIgnoredProp(name, fields[1][1]);
        }
        return;
    }

    /* the bit index must fit into a 32-bit vector word */
    if(table->binaries[idx].vecShift>=32) {
        fprintf(stderr, "genprops error: shift value %d>=32 for %s %s\n",
                        (int)table->binaries[idx].vecShift, table->ucdFile, table->binaries[idx].propName);
        exit(U_INTERNAL_PROGRAM_ERROR);
    }
    bit=U_MASK(table->binaries[idx].vecShift);

    if(first==0 && last==0x10ffff) {
        /* Also set bits for initialValue and errorValue. */
        last=UPVEC_MAX_CP;
    }
    upvec_setValue(pv, first, last, table->binaries[idx].vecWord, bit, bit, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops error: unable to set %s code: %s\n",
                        table->binaries[idx].propName, u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}
// Builds a signature of the form "<unk-GC...-SCRIPT...[-NUM]>" for a symbol.
//
// The symbol is converted from UTF-8 and scanned code point by code point,
// accumulating:
//   - the union of general-category mask bits,
//   - the set of script codes seen,
//   - whether any code point has a numeric value.
// The signature then lists the names (from general_category_ and script_) of
// every category and script seen with index >= 1, followed by "-NUM" if a
// numeric code point occurred.
symbol_type operator()(const symbol_type& symbol) const
{
  const std::string& word = static_cast<const std::string&>(symbol);
  icu::UnicodeString uword = icu::UnicodeString::fromUTF8(icu::StringPiece(word.data(), word.size()));

  bool dg = false;
  uint32_t gc = 0;
  // NOTE(review): sc is sized by script_.size() but indexed by script code
  // and read up to USCRIPT_CODE_LIMIT -- presumably script_ has at least
  // USCRIPT_CODE_LIMIT entries; confirm.
  uscript_type sc(script_.size(), false);

  icu::StringCharacterIterator iter(uword);
  for (iter.setToStart(); iter.hasNext(); /**/) {
    const UChar32 ch = iter.next32PostInc();

    dg |= (u_getNumericValue(ch) != U_NO_NUMERIC_VALUE);
    gc |= u_getIntPropertyValue(ch, UCHAR_GENERAL_CATEGORY_MASK);
    sc[u_getIntPropertyValue(ch, UCHAR_SCRIPT)] = true;
  }

  std::string signature = "<unk";

  // Index 0 is skipped in both tables.
  for (int i = 1; i < U_CHAR_CATEGORY_COUNT; ++ i)
    if (gc & U_MASK(i)) {
      signature += "-";
      signature += general_category_[i];
    }

  for (int i = 1; i < USCRIPT_CODE_LIMIT; ++ i)
    if (sc[i]) {
      signature += "-";
      signature += script_[i];
    }

  if (dg)
    signature += "-NUM";

  signature += '>';

  return signature;
}
/*
 * Returns whether the UPROPS_WHITE_SPACE bit is set in the value that
 * u_getUnicodeProperties(c, 1) returns for code point c.
 */
U_CAPI UBool U_EXPORT2
u_isUWhiteSpace(UChar32 c) {
    const uint32_t whiteSpaceBit=U_MASK(UPROPS_WHITE_SPACE);
    if((u_getUnicodeProperties(c, 1)&whiteSpaceBit)!=0) {
        return 1;
    }
    return 0;
}
/*
 * Returns whether the UPROPS_ALPHABETIC bit is set in the value that
 * u_getUnicodeProperties(c, 1) returns for code point c.
 */
U_CAPI UBool U_EXPORT2
u_isUAlphabetic(UChar32 c) {
    const uint32_t alphabeticBit=U_MASK(UPROPS_ALPHABETIC);
    if((u_getUnicodeProperties(c, 1)&alphabeticBit)!=0) {
        return 1;
    }
    return 0;
}
/**
 * Binary-property "contains" function for regional indicators.
 * Returns whether c lies in the range U+1F1E6..U+1F1FF (inclusive).
 * The BinaryProperty and UProperty arguments are not used.
 */
static UBool isRegionalIndicator(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
    // Property starts are a subset of lb=RI etc.
    return 0x1F1E6<=c && c<=0x1F1FF;
}

static const BinaryProperty binProps[UCHAR_BINARY_LIMIT]={
    /*
     * column and mask values for binary properties from u_getUnicodeProperties().
     * Must be in order of corresponding UProperty,
     * and there must be exactly one entry per binary UProperty.
     *
     * Properties with mask==0 are handled in code.
     * For them, column is the UPropertySource value.
     */
    { 1,                U_MASK(UPROPS_ALPHABETIC), defaultContains },
    { 1,                U_MASK(UPROPS_ASCII_HEX_DIGIT), defaultContains },
    { UPROPS_SRC_BIDI,  0, isBidiControl },
    { UPROPS_SRC_BIDI,  0, isMirrored },
    { 1,                U_MASK(UPROPS_DASH), defaultContains },
    { 1,                U_MASK(UPROPS_DEFAULT_IGNORABLE_CODE_POINT), defaultContains },
    { 1,                U_MASK(UPROPS_DEPRECATED), defaultContains },
    { 1,                U_MASK(UPROPS_DIACRITIC), defaultContains },
    { 1,                U_MASK(UPROPS_EXTENDER), defaultContains },
    { UPROPS_SRC_NFC,   0, hasFullCompositionExclusion },
    { 1,                U_MASK(UPROPS_GRAPHEME_BASE), defaultContains },
    { 1,                U_MASK(UPROPS_GRAPHEME_EXTEND), defaultContains },
    { 1,                U_MASK(UPROPS_GRAPHEME_LINK), defaultContains },
    { 1,                U_MASK(UPROPS_HEX_DIGIT), defaultContains },
    { 1,                U_MASK(UPROPS_HYPHEN), defaultContains },
    { 1,                U_MASK(UPROPS_ID_CONTINUE), defaultContains },
// Returns true if the character ending at |ch| has general category L (letter)
// or N (number). If |lastCh| and |ch| form a UTF-16 lead/trail surrogate pair,
// the supplementary code point they encode is tested; otherwise |ch| alone is.
static inline bool isUnicodeCategoryLetterOrNumber(UChar lastCh, UChar ch)
{
    UChar32 codePoint = ch;
    if (U16_IS_LEAD(lastCh) && U16_IS_TRAIL(ch))
        codePoint = U16_GET_SUPPLEMENTARY(lastCh, ch);

    const uint32_t categoryBit = U_MASK(u_charType(codePoint));
    return (categoryBit & (U_GC_L_MASK | U_GC_N_MASK)) != 0;
}
const char *propName; int32_t vecWord; uint32_t vecValue, vecMask; };
typedef struct Binary Binary;

/*
 * A table of Binary entries together with a UCD file name:
 * binaries points to binariesCount entries, and ucdFile is the name string
 * that goes with them (for example "PropList" below).
 */
struct Binaries {
    const char *ucdFile;
    const Binary *binaries;
    int32_t binariesCount;
};
typedef struct Binaries Binaries;

/*
 * Entries for the "Bidi_Control" and "Join_Control" property names.
 * Each one uses vector word 0 and the same single bit as both value and mask.
 */
static const Binary
propListNames[]={
    { "Bidi_Control",                       0, U_MASK(UBIDI_BIDI_CONTROL_SHIFT), U_MASK(UBIDI_BIDI_CONTROL_SHIFT) },
    { "Join_Control",                       0, U_MASK(UBIDI_JOIN_CONTROL_SHIFT), U_MASK(UBIDI_JOIN_CONTROL_SHIFT) }
};

/* The propListNames table, labeled with the name "PropList". */
static const Binaries
propListBinaries={
    "PropList", propListNames, LENGTHOF(propListNames)
};

static void U_CALLCONV
binariesLineFn(void *context,
               char *fields[][2], int32_t fieldCount,
               UErrorCode *pErrorCode) {
    const Binaries *bin;
    char *s;
    uint32_t start, end;
/* Single entry for "Soft_Dotted": word 0, value UCASE_SOFT_DOTTED under UCASE_DOT_MASK. */
static const Binary
propListNames[]={
    { "Soft_Dotted",                        0, UCASE_SOFT_DOTTED,   UCASE_DOT_MASK }
};

/* The propListNames table, labeled with the name "PropList". */
static const Binaries
propListBinaries={
    "PropList", propListNames, LENGTHOF(propListNames)
};

/*
 * Entries for "Lowercase" and "Uppercase" (word 0, case type under
 * UCASE_TYPE_MASK) and for "Case_Ignorable" (word 1, the
 * UGENCASE_IS_MID_LETTER_SHIFT bit as both value and mask).
 */
static const Binary
derCorePropsNames[]={
    { "Lowercase",                          0, UCASE_LOWER,         UCASE_TYPE_MASK },
    { "Uppercase",                          0, UCASE_UPPER,         UCASE_TYPE_MASK },
    /* Unicode 5.2 adds Case_Ignorable as a public property. See comments in store.c. */
    { "Case_Ignorable",                     1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) }
};

/* The derCorePropsNames table, labeled with the name "DerivedCoreProperties". */
static const Binaries
derCorePropsBinaries={
    "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames)
};

/*
 * Treat Word_Break=MidLetter and MidNumLet as a single binary property.
 * We need not distinguish between them because both add to case-ignorable.
 * We ignore all other Word_Break values.
 */
static const Binary
wordBreakNames[]={
    { "MidLetter",                          1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) },
/* general properties API functions ----------------------------------------- */ static const struct { int32_t column; uint32_t mask; } binProps[UCHAR_BINARY_LIMIT]={ /* * column and mask values for binary properties from u_getUnicodeProperties(). * Must be in order of corresponding UProperty, * and there must be exactly one entry per binary UProperty. * * Properties with mask 0 are handled in code. * For them, column is the UPropertySource value. */ { 1, U_MASK(UPROPS_ALPHABETIC) }, { 1, U_MASK(UPROPS_ASCII_HEX_DIGIT) }, { UPROPS_SRC_BIDI, 0 }, /* UCHAR_BIDI_CONTROL */ { UPROPS_SRC_BIDI, 0 }, /* UCHAR_BIDI_MIRRORED */ { 1, U_MASK(UPROPS_DASH) }, { 1, U_MASK(UPROPS_DEFAULT_IGNORABLE_CODE_POINT) }, { 1, U_MASK(UPROPS_DEPRECATED) }, { 1, U_MASK(UPROPS_DIACRITIC) }, { 1, U_MASK(UPROPS_EXTENDER) }, { UPROPS_SRC_NFC, 0 }, /* UCHAR_FULL_COMPOSITION_EXCLUSION */ { 1, U_MASK(UPROPS_GRAPHEME_BASE) }, { 1, U_MASK(UPROPS_GRAPHEME_EXTEND) }, { 1, U_MASK(UPROPS_GRAPHEME_LINK) }, { 1, U_MASK(UPROPS_HEX_DIGIT) }, { 1, U_MASK(UPROPS_HYPHEN) }, { 1, U_MASK(UPROPS_ID_CONTINUE) },
/*
 * Computes the case-properties word for one code point and stores it.
 *
 * Starts from the value already present in word 0 of the properties vector pv
 * for p->code, adds the bits derived from the fields of *p, and writes the
 * result back if it changed. A simple mapping is stored as a delta;
 * anything that cannot be expressed that way sets UCASE_EXCEPTION and copies
 * *p into excProps for later processing. A multi-character full case folding
 * is additionally passed to addUnfolding().
 *
 * Fatal conditions print a message to stderr and call exit().
 */
extern void
setProps(Props *p) {
    UErrorCode errorCode;
    uint32_t value, oldValue;
    int32_t delta;
    UBool isCaseIgnorable;

    /* get the non-UnicodeData.txt properties */
    value=oldValue=upvec_getValue(pv, p->code, 0);

    /* default: map to self */
    delta=0;

    if(p->gc==U_TITLECASE_LETTER) {
        /* the Titlecase property is read late, from UnicodeData.txt */
        value|=UCASE_TITLE;
    }

    if(p->upperCase!=0) {
        /* uppercase mapping as delta if the character is lowercase */
        if((value&UCASE_TYPE_MASK)==UCASE_LOWER) {
            delta=p->upperCase-p->code;
        } else {
            value|=UCASE_EXCEPTION;
        }
    }
    if(p->lowerCase!=0) {
        /* lowercase mapping as delta if the character is uppercase or titlecase */
        if((value&UCASE_TYPE_MASK)>=UCASE_UPPER) {
            delta=p->lowerCase-p->code;
        } else {
            value|=UCASE_EXCEPTION;
        }
    }
    /* each of the following conditions forces an exception entry */
    if(p->upperCase!=p->titleCase) {
        value|=UCASE_EXCEPTION;
    }
    if(p->closure[0]!=0) {
        value|=UCASE_EXCEPTION;
    }
    if(p->specialCasing!=NULL) {
        value|=UCASE_EXCEPTION;
    }
    if(p->caseFolding!=NULL) {
        value|=UCASE_EXCEPTION;
    }

    /* a delta outside UCASE_MIN_DELTA..UCASE_MAX_DELTA also needs an exception */
    if(delta<UCASE_MIN_DELTA || UCASE_MAX_DELTA<delta) {
        value|=UCASE_EXCEPTION;
    }

    if(p->cc!=0) {
        /* the soft-dotted and cc!=0 encodings cannot both be set */
        if(value&UCASE_DOT_MASK) {
            fprintf(stderr, "gencase: a soft-dotted character has cc!=0\n");
            exit(U_INTERNAL_PROGRAM_ERROR);
        }
        if(p->cc==230) {
            value|=UCASE_ABOVE;
        } else {
            value|=UCASE_OTHER_ACCENT;
        }
    }

    /* encode case-ignorable as delta==1 on uncased characters */
    isCaseIgnorable=FALSE;
    if((value&UCASE_TYPE_MASK)==UCASE_NONE) {
        if(ucdVersion>=UNI_4_1) {
            /*
             * Unicode 4.1 and up: (D47a) Word_Break=MidLetter or Mn, Me, Cf, Lm, Sk
             * Unicode 5.1 and up: Word_Break=(MidLetter or MidNumLet) or Mn, Me, Cf, Lm, Sk
             * The UGENCASE_IS_MID_LETTER_SHIFT bit is set for both WB=MidLetter and WB=MidNumLet.
             */
            if(
                (U_MASK(p->gc)&(U_GC_MN_MASK|U_GC_ME_MASK|U_GC_CF_MASK|U_GC_LM_MASK|U_GC_SK_MASK))!=0 ||
                (upvec_getValue(pv, p->code, 1)&U_MASK(UGENCASE_IS_MID_LETTER_SHIFT))!=0
            ) {
                isCaseIgnorable=TRUE;
            }
        } else {
            /* before Unicode 4.1: Mn, Me, Cf, Lm, Sk or 0027 or 00AD or 2019 */
            if(
                (U_MASK(p->gc)&(U_GC_MN_MASK|U_GC_ME_MASK|U_GC_CF_MASK|U_GC_LM_MASK|U_GC_SK_MASK))!=0 ||
                p->code==0x27 ||
                p->code==0xad ||
                p->code==0x2019
            ) {
                isCaseIgnorable=TRUE;
            }
        }
    }

    if(isCaseIgnorable && p->code!=0x307) {
        /*
         * We use one of the delta/exception bits, which works because we only
         * store the case-ignorable flag for uncased characters.
         * There is no delta for uncased characters (see checks above).
         * If there is an exception for an uncased, case-ignorable character
         * (although there should not be any case mappings if it's uncased)
         * then we have a problem.
         * There is one character which is case-ignorable but has an exception:
         * U+0307 is uncased, Mn, has conditional special casing and
         * is therefore handled in code instead.
         */
        if(value&UCASE_EXCEPTION) {
            fprintf(stderr, "gencase error: unable to encode case-ignorable for U+%04lx with exceptions\n",
                            (unsigned long)p->code);
            exit(U_INTERNAL_PROGRAM_ERROR);
        }

        delta=1;
    }

    /* handle exceptions */
    if(value&UCASE_EXCEPTION) {
        /* simply store exceptions for later processing and encoding */
        value|=(uint32_t)exceptionsCount<<UGENCASE_EXC_SHIFT;
        uprv_memcpy(excProps+exceptionsCount, p, sizeof(*p));
        /* the count is checked after storing, so reaching MAX_EXC_COUNT is fatal */
        if(++exceptionsCount==MAX_EXC_COUNT) {
            fprintf(stderr, "gencase: too many exceptions\n");
            exit(U_INDEX_OUTOFBOUNDS_ERROR);
        }
    } else {
        /* store the simple case mapping delta */
        value|=((uint32_t)delta<<UCASE_DELTA_SHIFT)&UCASE_DELTA_MASK;
    }

    /* write the word back only if something changed */
    errorCode=U_ZERO_ERROR;
    if(value!=oldValue) {
        upvec_setValue(pv, p->code, p->code, 0, value, 0xffffffff, &errorCode);
        if(U_FAILURE(errorCode)) {
            fprintf(stderr, "gencase error: unable to set case mapping values, code: %s\n",
                            u_errorName(errorCode));
            exit(errorCode);
        }
    }

    /* add the multi-character case folding to the "unfold" data */
    if(p->caseFolding!=NULL) {
        /* full[0] holds the length; the folded string starts at full+1 */
        int32_t length=p->caseFolding->full[0];
        if(length>1 && u_strHasMoreChar32Than(p->caseFolding->full+1, length, 1)) {
            addUnfolding(p->code, p->caseFolding->full+1, length);
        }
    }
}