U_NAMESPACE_BEGIN

/**
 * Parse an integer at pos, either of the form \d+ or of the form
 * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex,
 * or octal format.
 *
 * @param rule  the text to parse.
 * @param pos   INPUT-OUTPUT parameter.  On input, the first
 *              character to parse.  On output, the character after the
 *              last parsed character.  Left unchanged when no digit was
 *              consumed or when the value does not fit in an int32_t.
 * @param limit index one past the last character that may be examined.
 * @return the parsed value; 0 when nothing was parsed or on overflow.
 */
int32_t ICU_Utility::parseInteger(const UnicodeString& rule, int32_t& pos, int32_t limit) {
    int32_t count = 0;   // number of digits consumed so far
    int32_t value = 0;
    int32_t p = pos;
    int8_t radix = 10;

    if (p < limit && rule.charAt(p) == 48 /*0*/) {
        if (p+1 < limit && (rule.charAt(p+1) == 0x78 /*x*/ || rule.charAt(p+1) == 0x58 /*X*/)) {
            // "0x" / "0X" prefix: hexadecimal; the prefix itself is not a digit.
            p += 2;
            radix = 16;
        } else {
            // A leading '0' selects octal and already counts as one digit,
            // so a lone "0" parses successfully.
            p++;
            count = 1;
            radix = 8;
        }
    }

    while (p < limit) {
        int32_t d = u_digit(rule.charAt(p++), radix);
        if (d < 0) {
            // Not a digit in this radix: un-read it and stop.
            --p;
            break;
        }
        ++count;
        // Accumulate in 64 bits so that overflow is detected without relying
        // on signed wrap-around (undefined behaviour), and so that a '0' digit
        // seen while the value is still 0 (e.g. "007", "0x0") is not mistaken
        // for an overflow.
        int64_t v = (int64_t)value * radix + d;
        if (v > INT32_MAX) {
            // Too many input digits for an int32_t: report failure by
            // returning 0 without advancing pos.
            return 0;
        }
        value = (int32_t)v;
    }
    if (count > 0) {
        pos = p;
    }
    return value;
}
// // replaceCharRefs // // replace the char entities < & { ካ etc. in a string // with the corresponding actual character. // void UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) { UnicodeString result; UnicodeString replacement; int i; mAmps.reset(s); // See the initialization for the regex matcher mAmps. // Which entity we've matched is determined by which capture group has content, // which is flaged by start() of that group not being -1. while (mAmps.find()) { if (mAmps.start(1, status) != -1) { replacement.setTo((UChar)x_AMP); } else if (mAmps.start(2, status) != -1) { replacement.setTo((UChar)x_LT); } else if (mAmps.start(3, status) != -1) { replacement.setTo((UChar)x_GT); } else if (mAmps.start(4, status) != -1) { replacement.setTo((UChar)x_APOS); } else if (mAmps.start(5, status) != -1) { replacement.setTo((UChar)x_QUOT); } else if (mAmps.start(6, status) != -1) { UnicodeString hexString = mAmps.group(6, status); UChar32 val = 0; for (i=0; i<hexString.length(); i++) { val = (val << 4) + u_digit(hexString.charAt(i), 16); } // TODO: some verification that the character is valid replacement.setTo(val); } else if (mAmps.start(7, status) != -1) { UnicodeString decimalString = mAmps.group(7, status); UChar32 val = 0; for (i=0; i<decimalString.length(); i++) { val = val*10 + u_digit(decimalString.charAt(i), 10); } // TODO: some verification that the character is valid replacement.setTo(val); } else { // An unrecognized &entity; Leave it alone. // TODO: check that it really looks like an entity, and is not some // random & in the text. replacement = mAmps.group((int32_t)0, status); } mAmps.appendReplacement(result, replacement, status); } mAmps.appendTail(result); s = result; }
/**
 * Stores the first UTF-16 code unit of `value` into the Java `char` field
 * named `fieldName` on `obj` (looked up on JniConstants::localeDataClass).
 *
 * Does nothing when `value` is empty, or when the field lookup fails.
 */
static void setCharField(JNIEnv* env, jobject obj, const char* fieldName, const UnicodeString& value) {
    if (value.length() == 0) {
        return;
    }
    jfieldID fid = env->GetFieldID(JniConstants::localeDataClass, fieldName, "C");
    if (!fid) {
        // GetFieldID returns NULL (with a Java exception pending) when the
        // field cannot be resolved; passing that to SetCharField is invalid.
        // Leave the pending exception for the caller to surface.
        return;
    }
    env->SetCharField(obj, fid, value.charAt(0));
}
/**
 * Returns true when `child` begins with `root` and the match is either the
 * whole of `child` or is immediately followed by UNDERSCORE_CHAR.
 */
UBool
LocaleUtility::isFallbackOf(const UnicodeString& root, const UnicodeString& child)
{
    // root must occur at the very start of child.
    if (child.indexOf(root) != 0) {
        return FALSE;
    }
    const int32_t rootLength = root.length();
    if (child.length() == rootLength) {
        return TRUE;
    }
    return child.charAt(rootLength) == UNDERSCORE_CHAR;
}
/**
 * Converts a series of space-separated hexadecimal numbers in `in` into a
 * string made of the corresponding code points.
 *
 * Characters other than space and hex digits are ignored.  A number is only
 * emitted when its accumulated value is greater than zero.
 */
static UnicodeString parseHex(const UnicodeString &in) {
    UnicodeString decoded;
    UChar32 pending = 0;   // value of the number currently being read

    const int32_t inputLength = in.length();
    for (int32_t idx = 0; idx < inputLength; ++idx) {
        const UChar unit = in.charAt(idx);

        if (unit == 0x20) {   // Space: terminates the current number
            if (pending > 0) {
                decoded.append(pending);
                pending = 0;
            }
            continue;
        }

        const UBool isDecimalDigit = (unit >= 0x30 && unit <= 0x39);
        const UBool isHexLetter = (unit >= 0x41 && unit <= 0x46) || (unit >= 0x61 && unit <= 0x66);
        if (isDecimalDigit) {
            pending = (pending << 4) + (unit - 0x30);
        } else if (isHexLetter) {
            // Low nibble of 'A'..'F' / 'a'..'f' is 1..6; add 9 to get 10..15.
            pending = (pending << 4) + (unit & 0x0f) + 9;
        }
        // else do something with bad input.
    }

    // Flush a trailing number that was not followed by a space.
    if (pending > 0) {
        decoded.append(pending);
    }
    return decoded;
}
// fixQuotes unescapes single quotes. Don''t -> Don't. Letter 'j' -> Letter j. // Modifies s in place. static void fixQuotes(UnicodeString& s) { QuoteState state = OUTSIDE; int32_t len = s.length(); int32_t dest = 0; for (int32_t i = 0; i < len; ++i) { UChar ch = s.charAt(i); if (ch == u_apos) { if (state == INSIDE_EMPTY) { s.setCharAt(dest, ch); ++dest; } } else { s.setCharAt(dest, ch); ++dest; } // Update state switch (state) { case OUTSIDE: state = ch == u_apos ? INSIDE_EMPTY : OUTSIDE; break; case INSIDE_EMPTY: case INSIDE_FULL: state = ch == u_apos ? OUTSIDE : INSIDE_FULL; break; default: break; } } s.truncate(dest); }
// Converts input[start, end) to UTF-8, prints a debug trace line, and appends
// a new run (text, script_, lang_, converted direction) to runs_.
void TextGroup::addRun(const UnicodeString &input, UBiDiDirection direction, int32_t start, int32_t end)
{
    const int32_t runLength = end - start;

    std::string utf8;
    input.tempSubString(start, runLength).toUTF8String(utf8);

    // Debug trace: the run text and the first code unit of the whole input.
    printf("Hominlinx-->======TextGroup::addRun[%s]==== %d\n", utf8.c_str(), input.charAt(0));

    runs_.emplace_back(utf8, script_, lang_, uciDirectionToHB(direction));
}
TestReplaceable (const UnicodeString& text, const UnicodeString& newStyles) { chars = text; UnicodeString s; for (int i = 0; i < text.length(); ++i) { if (i < newStyles.length()) { s.append(newStyles.charAt(i)); } else { if (text.charAt(i) == NO_STYLE_MARK) { s.append(NO_STYLE); } else { s.append((UChar)(i + 0x0031)); } } } this->styles = s; }
/** * Parse a pattern string starting at offset pos. Keywords are * matched case-insensitively. Spaces may be skipped and may be * optional or required. Integer values may be parsed, and if * they are, they will be returned in the given array. If * successful, the offset of the next non-space character is * returned. On failure, -1 is returned. * @param pattern must only contain lowercase characters, which * will match their uppercase equivalents as well. A space * character matches one or more required spaces. A '~' character * matches zero or more optional spaces. A '#' character matches * an integer and stores it in parsedInts, which the caller must * ensure has enough capacity. * @param parsedInts array to receive parsed integers. Caller * must ensure that parsedInts.length is >= the number of '#' * signs in 'pattern'. * @return the position after the last character parsed, or -1 if * the parse failed */ int32_t ICU_Utility::parsePattern(const UnicodeString& rule, int32_t pos, int32_t limit, const UnicodeString& pattern, int32_t* parsedInts) { // TODO Update this to handle surrogates int32_t p; int32_t intCount = 0; // number of integers parsed for (int32_t i=0; i<pattern.length(); ++i) { UChar cpat = pattern.charAt(i); UChar c; switch (cpat) { case 32 /*' '*/: if (pos >= limit) { return -1; } c = rule.charAt(pos++); if (!PatternProps::isWhiteSpace(c)) { return -1; } // FALL THROUGH to skipWhitespace U_FALLTHROUGH; case 126 /*'~'*/: pos = skipWhitespace(rule, pos); break; case 35 /*'#'*/: p = pos; parsedInts[intCount++] = parseInteger(rule, p, limit); if (p == pos) { // Syntax error; failed to parse integer return -1; } pos = p; break; default: if (pos >= limit) { return -1; } c = (UChar) u_tolower(rule.charAt(pos++)); if (c != cpat) { return -1; } break; } } return pos; }
/**
 * Returns TRUE when argKeyword is non-empty, its first character classifies
 * as tStartKeyword, and every character classifies as either tStartKeyword
 * or tContinueKeyword.  Returns FALSE otherwise.
 */
UBool SelectFormat::checkValidKeyword(const UnicodeString& argKeyword ) const{
    const int32_t keywordLength = argKeyword.length();
    if (keywordLength < 1) {
        return FALSE;
    }

    // The first character must be a keyword-start character.
    if (classifyCharacter(argKeyword.charAt(0)) != tStartKeyword) {
        return FALSE;
    }

    for (int32_t pos = 0; pos < keywordLength; ++pos) {
        const CharacterClass kind = classifyCharacter(argKeyword.charAt(pos));
        const UBool allowed = (kind == tStartKeyword || kind == tContinueKeyword);
        if (!allowed) {
            return FALSE;
        }
    }
    return TRUE;
}
int32_t MessagePattern::parseArgNumber(const UnicodeString &s, int32_t start, int32_t limit) { // If the identifier contains only ASCII digits, then it is an argument _number_ // and must not have leading zeros (except "0" itself). // Otherwise it is an argument _name_. if(start>=limit) { return UMSGPAT_ARG_NAME_NOT_VALID; } int32_t number; // Defer numeric errors until we know there are only digits. UBool badNumber; UChar c=s.charAt(start++); if(c==0x30) { if(start==limit) { return 0; } else { number=0; badNumber=TRUE; // leading zero } } else if(0x31<=c && c<=0x39) { number=c-0x30; badNumber=FALSE; } else { return UMSGPAT_ARG_NAME_NOT_NUMBER; } while(start<limit) { c=s.charAt(start++); if(0x30<=c && c<=0x39) { if(number>=INT32_MAX/10) { badNumber=TRUE; // overflow } number=number*10+(c-0x30); } else { return UMSGPAT_ARG_NAME_NOT_NUMBER; } } // There are only ASCII digits. if(badNumber) { return UMSGPAT_ARG_NAME_NOT_VALID; } else { return number; } }
U_NAMESPACE_BEGIN

/**
 * Parse an integer at pos in decimal (\d+), hexadecimal (0x[0-9A-Fa-f]+)
 * or octal (0[0-7]+) notation.
 *
 * @param rule  the text to parse.
 * @param pos   in/out: on input the first character to parse; on output the
 *              character after the last parsed character.  Left unchanged
 *              when no digit was consumed or when the value does not fit in
 *              an int32_t.
 * @param limit index one past the last character that may be examined.
 * @return the parsed value; 0 when nothing was parsed or on overflow.
 */
int32_t ICU_Utility::parseInteger(const UnicodeString& rule, int32_t& pos, int32_t limit) {
    int32_t count = 0;   // number of digits consumed so far
    int32_t value = 0;
    int32_t p = pos;
    int8_t radix = 10;

    if (p < limit && rule.charAt(p) == 48 /*0*/) {
        if (p+1 < limit && (rule.charAt(p+1) == 0x78 /*x*/ || rule.charAt(p+1) == 0x58 /*X*/)) {
            // "0x" / "0X" prefix: hexadecimal; the prefix itself is not a digit.
            p += 2;
            radix = 16;
        } else {
            // A leading '0' selects octal and already counts as one digit,
            // so a lone "0" parses successfully.
            p++;
            count = 1;
            radix = 8;
        }
    }

    while (p < limit) {
        int32_t d = u_digit(rule.charAt(p++), radix);
        if (d < 0) {
            // Not a digit in this radix: un-read it and stop.
            --p;
            break;
        }
        ++count;
        // Accumulate in 64 bits so that overflow is detected without relying
        // on signed wrap-around (undefined behaviour), and so that a '0' digit
        // seen while the value is still 0 (e.g. "007", "0x0") is not mistaken
        // for an overflow.
        int64_t v = (int64_t)value * radix + d;
        if (v > INT32_MAX) {
            // Too many input digits for an int32_t: report failure by
            // returning 0 without advancing pos.
            return 0;
        }
        value = (int32_t)v;
    }
    if (count > 0) {
        pos = p;
    }
    return value;
}
// For every i in [0, max), compares the first code unit of test.select(i)
// with result[i].  Returns TRUE only when all of them match; all indices are
// evaluated even after a mismatch.
UBool checkEqual(const PluralRules &test, char *result, int32_t max) {
    UBool allMatch = TRUE;
    UnicodeString keyword;
    for (int32_t n = 0; n < max; ++n) {
        keyword = test.select(n);
        const UBool sameFirstChar = (keyword.charAt(0) == result[n]);
        if (!sameFirstChar) {
            allMatch = FALSE;
        }
    }
    return allMatch;
}
// Debug helper: prints each code unit of s with "%c", then pads with spaces
// until at least minWidth columns have been written.
U_CFUNC void RBBI_DEBUG_printUnicodeString(const UnicodeString &s, int minWidth)
{
    const int length = s.length();

    int column = 0;
    while (column < length) {
        RBBIDebugPrintf("%c", s.charAt(column));
        ++column;
    }
    // column == length here; pad the remainder of the field.
    while (column < minWidth) {
        RBBIDebugPrintf(" ");
        ++column;
    }
}
// Checks that UnicodeString can be allocated and released through the
// (file, line) forms of operator new / operator delete.  Compiled only when
// U_HAVE_DEBUG_LOCATION_NEW is set.
void UObjectTest::TestMFCCompatibility() {
#if U_HAVE_DEBUG_LOCATION_NEW
    /* Make sure that it compiles with MFC's debuggable new usage. */
    const UChar kProbeChar = 0x0040;
    UnicodeString *probe = new(__FILE__, __LINE__) UnicodeString();
    probe->append(kProbeChar);
    // Is it usable?
    if (probe->charAt(0) != kProbeChar) {
        errln("debug new doesn't work.");
    }
    UnicodeString::operator delete(probe, __FILE__, __LINE__);
#endif
}
void PluralRules::getNextLocale(const UnicodeString& localeData, int32_t* curIndex, UnicodeString& localeName) { int32_t i=*curIndex; localeName.remove(); while (i< localeData.length()) { if ( (localeData.charAt(i)!= SPACE) && (localeData.charAt(i)!= COMMA) ) { break; } i++; } while (i< localeData.length()) { if ( (localeData.charAt(i)== SPACE) || (localeData.charAt(i)== COMMA) ) { break; } localeName+=localeData.charAt(i++); } *curIndex=i; }
// Computes the restriction level of a string, according to UTS 39 section 5.2. URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const { // Section 5.2 step 1: if (!fAllowedCharsSet->containsAll(input)) { return USPOOF_UNRESTRICTIVE; } // Section 5.2 step 2 // Java use a static UnicodeSet for this test. In C++, avoid the static variable // and just do a simple for loop. UBool allASCII = TRUE; for (int32_t i=0, length=input.length(); i<length; i++) { if (input.charAt(i) > 0x7f) { allASCII = FALSE; break; } } if (allASCII) { return USPOOF_ASCII; } // Section 5.2 steps 3: ScriptSet resolvedScriptSet; getResolvedScriptSet(input, resolvedScriptSet, status); if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; } // Section 5.2 step 4: if (!resolvedScriptSet.isEmpty()) { return USPOOF_SINGLE_SCRIPT_RESTRICTIVE; } // Section 5.2 step 5: ScriptSet resolvedNoLatn; getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status); if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; } // Section 5.2 step 6: if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status) || resolvedNoLatn.test(USCRIPT_JAPANESE, status) || resolvedNoLatn.test(USCRIPT_KOREAN, status)) { return USPOOF_HIGHLY_RESTRICTIVE; } // Section 5.2 step 7: if (!resolvedNoLatn.isEmpty() && !resolvedNoLatn.test(USCRIPT_CYRILLIC, status) && !resolvedNoLatn.test(USCRIPT_GREEK, status) && !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) { return USPOOF_MODERATELY_RESTRICTIVE; } // Section 5.2 step 8: return USPOOF_MINIMALLY_RESTRICTIVE; }
int32_t NumberStringBuilder::insert(int32_t index, const UnicodeString &unistr, Field field, UErrorCode &status) { if (unistr.length() == 0) { // Nothing to insert. return 0; } else if (unistr.length() == 1) { // Fast path: insert using insertCodePoint. return insertCodePoint(index, unistr.charAt(0), field, status); } else { return insert(index, unistr, 0, unistr.length(), field, status); } }
void fixStyles(int32_t start, int32_t limit, int32_t newLen) { UChar newStyle = NO_STYLE; if (start != limit && styles.charAt(start) != NO_STYLE) { newStyle = styles.charAt(start); } else if (start > 0 && getCharAt(start-1) != NO_STYLE_MARK) { newStyle = styles.charAt(start-1); } else if (limit < styles.length()) { newStyle = styles.charAt(limit); } // dumb implementation for now. UnicodeString s; for (int i = 0; i < newLen; ++i) { // this doesn't really handle an embedded NO_STYLE_MARK // in the middle of a long run of characters right -- but // that case shouldn't happen anyway if (getCharAt(start+i) == NO_STYLE_MARK) { s.append(NO_STYLE); } else { s.append(newStyle); } } styles.replaceBetween(start, limit, s); }
// Inserts unistr[start, end) at index, tagging every inserted code unit with
// field.  Returns end - start, also when prepareForInsert reports a failure
// through status (in which case nothing is written).
int32_t NumberStringBuilder::insert(int32_t index, const UnicodeString &unistr, int32_t start,
                                    int32_t end, Field field, UErrorCode &status) {
    const int32_t length = end - start;
    const int32_t base = prepareForInsert(index, length, status);
    if (U_FAILURE(status)) {
        return length;
    }
    // Walk source and destination positions in lock-step.
    for (int32_t src = start, dst = base; src < end; ++src, ++dst) {
        getCharPtr()[dst] = unistr.charAt(src);
        getFieldPtr()[dst] = field;
    }
    return length;
}
void ReplaceableTest::check(const UnicodeString& transliteratorName, const UnicodeString& test, const UnicodeString& shouldProduceStyles) { UErrorCode status = U_ZERO_ERROR; TestReplaceable *tr = new TestReplaceable(test, ""); UnicodeString expectedStyles = shouldProduceStyles; UnicodeString original = tr->toString(); Transliterator* t; if (transliteratorName.charAt(0) == 0x2A /*'*'*/) { UnicodeString rules(transliteratorName); rules.remove(0,1); UParseError pe; t = Transliterator::createFromRules("test", rules, UTRANS_FORWARD, pe, status); // test clone() TestReplaceable *tr2 = (TestReplaceable *)tr->clone(); if(tr2 != NULL) { delete tr; tr = tr2; } } else { t = Transliterator::createInstance(transliteratorName, UTRANS_FORWARD, status); } if (U_FAILURE(status)) { log("FAIL: failed to create the "); log(transliteratorName); errln(" transliterator."); delete tr; return; } t->transliterate(*tr); UnicodeString newStyles = tr->getStyles(); if (newStyles != expectedStyles) { errln("FAIL Styles: " + transliteratorName + "{" + original + "} => " + tr->toString() + "; should be {" + expectedStyles + "}!"); } else { log("OK: "); log(transliteratorName); log("("); log(original); log(") => "); logln(tr->toString()); } delete tr; delete t; }
// populatePrefixSuffix Adds a specific prefix-suffix pair to result for a // given variant and log10 value. // variant is 'zero', 'one', 'two', 'few', 'many', or 'other'. // formatStr is the format string from which the prefix and suffix are // extracted. It is usually of form 'Pefix 000 suffix'. // populatePrefixSuffix returns the number of 0's found in formatStr // before the decimal point. // In the special case that formatStr contains only spaces for prefix // and suffix, populatePrefixSuffix returns log10Value + 1. static int32_t populatePrefixSuffix( const char* variant, int32_t log10Value, const UnicodeString& formatStr, UHashtable* result, UBool overwrite, UErrorCode& status) { if (U_FAILURE(status)) { return 0; } int32_t firstIdx = formatStr.indexOf(kZero, UPRV_LENGTHOF(kZero), 0); // We must have 0's in format string. if (firstIdx == -1) { status = U_INTERNAL_PROGRAM_ERROR; return 0; } int32_t lastIdx = formatStr.lastIndexOf(kZero, UPRV_LENGTHOF(kZero), firstIdx); CDFUnit* unit = createCDFUnit(variant, log10Value, result, status); if (U_FAILURE(status)) { return 0; } // Return -1 if we are not overwriting an existing value if (unit->isSet() && !overwrite) { return -1; } unit->markAsSet(); // Everything up to first 0 is the prefix unit->prefix = formatStr.tempSubString(0, firstIdx); fixQuotes(unit->prefix); // Everything beyond the last 0 is the suffix unit->suffix = formatStr.tempSubString(lastIdx + 1); fixQuotes(unit->suffix); // If there is effectively no prefix or suffix, ignore the actual number of // 0's and act as if the number of 0's matches the size of the number. if (onlySpaces(unit->prefix) && onlySpaces(unit->suffix)) { return log10Value + 1; } // Calculate number of zeros before decimal point int32_t idx = firstIdx + 1; while (idx <= lastIdx && formatStr.charAt(idx) == u_0) { ++idx; } return (idx - firstIdx); }
/**
 * Replaces every NUMBER_SIGN in `message` that is not nested inside braces
 * with the formatted `number`.
 *
 * @param number   the number to format with numberFormat.
 * @param message  the selected message text.
 * @param appendTo receives the formatted number first, and is then
 *                 overwritten with the final result.  It is left untouched
 *                 when `message` is empty.
 * @param pos      field position passed through to numberFormat->format().
 * @return the message with the substitutions applied (empty when `message`
 *         is empty).
 */
UnicodeString
PluralFormat::insertFormattedNumber(double number,
                                    UnicodeString& message,
                                    UnicodeString& appendTo,
                                    FieldPosition& pos) const {
    UnicodeString result;
    int32_t braceStack = 0;   // current brace nesting depth
    int32_t startIndex = 0;   // start of the text not yet copied to result

    if (message.length() == 0) {
        return result;
    }
    appendTo = numberFormat->format(number, appendTo, pos);
    for (int32_t i = 0; i < message.length(); ++i) {
        switch (message.charAt(i)) {
        case LEFTBRACE:
            ++braceStack;
            break;
        case RIGHTBRACE:
            --braceStack;
            break;
        case NUMBER_SIGN:
            if (braceStack == 0) {
                // The third constructor argument is a LENGTH, not an end
                // index: copy only the text between the previous
                // substitution and this '#'.
                result += UnicodeString(message, startIndex, i - startIndex);
                result += appendTo;
                startIndex = i + 1;
            }
            break;
        }
    }
    // Copy whatever follows the last substitution.
    if (startIndex < message.length()) {
        result += UnicodeString(message, startIndex, message.length() - startIndex);
    }
    appendTo = result;
    return result;
}
// // RBBISymbolTable::parseReference This function from the abstract symbol table interface // looks for a $variable name in the source text. // It does not look it up, only scans for it. // It is used by the UnicodeSet parser. // // This implementation is lifted pretty much verbatim // from the rules based transliterator implementation. // I didn't see an obvious way of sharing it. // UnicodeString RBBISymbolTable::parseReference(const UnicodeString& text, ParsePosition& pos, int32_t limit) const { int32_t start = pos.getIndex(); int32_t i = start; UnicodeString result; while (i < limit) { UChar c = text.charAt(i); if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) { break; } ++i; } if (i == start) { // No valid name chars return result; // Indicate failure with empty string } pos.setIndex(i); text.extractBetween(start, i, result); return result; }
// Removes the white space that precedes each semicolon-terminated piece of
// description (including white space at the very start).  Each piece is
// copied from its first non-white-space character through its semicolon; a
// final piece without a semicolon is copied through the end of the string.
// description is replaced with the result.
void
RuleBasedNumberFormat::stripWhitespace(UnicodeString& description)
{
    UnicodeString stripped;
    const int32_t length = description.length();

    int32_t cursor = 0;
    while (cursor < length) {
        // seek to the first non-whitespace character...
        while (cursor < length && PatternProps::isWhiteSpace(description.charAt(cursor))) {
            ++cursor;
        }

        // locate the next semicolon in the text
        const int32_t semicolon = description.indexOf(gSemiColon, cursor);
        if (semicolon == -1) {
            // no semicolon: copy the rest of the string and finish (this is
            // what keeps the last rule in the description)
            stripped.append(description, cursor, length - cursor);
            break;
        }
        if (semicolon < length) {
            // copy from the current position up to and including the semicolon
            stripped.append(description, cursor, semicolon + 1 - cursor);
            cursor = semicolon + 1;
        } else {
            // we've seeked off the end of the string: terminate the loop
            break;
        }
    }

    description.setTo(stripped);
}
static void printUnicodeString(const char *announce, const UnicodeString &s) { static char out[200]; int32_t i, length; // output the string, converted to the platform encoding // Note for Windows: The "platform encoding" defaults to the "ANSI codepage", // which is different from the "OEM codepage" in the console window. // However, if you pipe the output into a file and look at it with Notepad // or similar, then "ANSI" characters will show correctly. // Production code should be aware of what encoding is required, // and use a UConverter or at least a charset name explicitly. out[s.extract(0, 99, out)]=0; printf("%s%s {", announce, out); // output the code units (not code points) length=s.length(); for(i=0; i<length; ++i) { printf(" %04x", s.charAt(i)); } printf(" }\n"); }
int32_t NumberStringBuilder::splice(int32_t startThis, int32_t endThis, const UnicodeString &unistr, int32_t startOther, int32_t endOther, Field field, UErrorCode& status) { int32_t thisLength = endThis - startThis; int32_t otherLength = endOther - startOther; int32_t count = otherLength - thisLength; int32_t position; if (count > 0) { // Overall, chars need to be added. position = prepareForInsert(startThis, count, status); } else { // Overall, chars need to be removed or kept the same. position = remove(startThis, -count); } if (U_FAILURE(status)) { return count; } for (int32_t i = 0; i < otherLength; i++) { getCharPtr()[position + i] = unistr.charAt(startOther + i); getFieldPtr()[position + i] = field; } return count; }
//constructor NamePrepTransform::NamePrepTransform(UParseError& parseError, UErrorCode& status) : unassigned(), prohibited(), labelSeparatorSet(){ mapping = NULL; bundle = NULL; const char* testDataName = IntlTest::loadTestData(status); if(U_FAILURE(status)){ return; } bundle = ures_openDirect(testDataName,"idna_rules",&status); if(bundle != NULL && U_SUCCESS(status)){ // create the mapping transliterator int32_t ruleLen = 0; const UChar* ruleUChar = ures_getStringByKey(bundle, "MapNFKC",&ruleLen, &status); int32_t mapRuleLen = 0; const UChar *mapRuleUChar = ures_getStringByKey(bundle, "MapNoNormalization", &mapRuleLen, &status); UnicodeString rule(mapRuleUChar, mapRuleLen); rule.append(ruleUChar, ruleLen); mapping = Transliterator::createFromRules(UnicodeString("NamePrepTransform", ""), rule, UTRANS_FORWARD, parseError,status); if(U_FAILURE(status)) { return; } //create the unassigned set int32_t patternLen =0; const UChar* pattern = ures_getStringByKey(bundle,"UnassignedSet",&patternLen, &status); unassigned.applyPattern(UnicodeString(pattern, patternLen), status); //create prohibited set patternLen=0; pattern = ures_getStringByKey(bundle,"ProhibitedSet",&patternLen, &status); UnicodeString test(pattern,patternLen); prohibited.applyPattern(test,status); #ifdef DEBUG if(U_FAILURE(status)){ printf("Construction of Unicode set failed\n"); } if(U_SUCCESS(status)){ if(prohibited.contains((UChar) 0x644)){ printf("The string contains 0x644 ... damn !!\n"); } UnicodeString temp; prohibited.toPattern(temp,TRUE); for(int32_t i=0;i<temp.length();i++){ printf("%c", (char)temp.charAt(i)); } printf("\n"); } #endif //create label separator set patternLen=0; pattern = ures_getStringByKey(bundle,"LabelSeparatorSet",&patternLen, &status); labelSeparatorSet.applyPattern(UnicodeString(pattern,patternLen),status); } if(U_SUCCESS(status) && (mapping == NULL) ){ status = U_MEMORY_ALLOCATION_ERROR; delete mapping; ures_close(bundle); mapping = NULL; bundle = NULL; } }
/**
 * Append c to buf, unless buf is empty or buf already ends in c.
 */
static void _smartAppend(UnicodeString& buf, UChar c) {
    const int32_t length = buf.length();
    if (length == 0) {
        return;     // never start an empty buffer with c
    }
    if (buf.charAt(length - 1) == c) {
        return;     // already ends in c
    }
    buf.append(c);
}
//--------------------------------------------------------------------- // // dump Output the compiled form of the pattern. // Debugging function only. // //--------------------------------------------------------------------- void RegexPattern::dumpOp(int32_t index) const { (void)index; // Suppress warnings in non-debug build. #if defined(REGEX_DEBUG) static const char * const opNames[] = {URX_OPCODE_NAMES}; int32_t op = fCompiledPat->elementAti(index); int32_t val = URX_VAL(op); int32_t type = URX_TYPE(op); int32_t pinnedType = type; if ((uint32_t)pinnedType >= sizeof(opNames)/sizeof(char *)) { pinnedType = 0; } printf("%4d %08x %-15s ", index, op, opNames[pinnedType]); switch (type) { case URX_NOP: case URX_DOTANY: case URX_DOTANY_ALL: case URX_FAIL: case URX_CARET: case URX_DOLLAR: case URX_BACKSLASH_G: case URX_BACKSLASH_X: case URX_END: case URX_DOLLAR_M: case URX_CARET_M: // Types with no operand field of interest. break; case URX_RESERVED_OP: case URX_START_CAPTURE: case URX_END_CAPTURE: case URX_STATE_SAVE: case URX_JMP: case URX_JMP_SAV: case URX_JMP_SAV_X: case URX_BACKSLASH_B: case URX_BACKSLASH_BU: case URX_BACKSLASH_D: case URX_BACKSLASH_Z: case URX_STRING_LEN: case URX_CTR_INIT: case URX_CTR_INIT_NG: case URX_CTR_LOOP: case URX_CTR_LOOP_NG: case URX_RELOC_OPRND: case URX_STO_SP: case URX_LD_SP: case URX_BACKREF: case URX_STO_INP_LOC: case URX_JMPX: case URX_LA_START: case URX_LA_END: case URX_BACKREF_I: case URX_LB_START: case URX_LB_CONT: case URX_LB_END: case URX_LBN_CONT: case URX_LBN_END: case URX_LOOP_C: case URX_LOOP_DOT_I: case URX_BACKSLASH_H: case URX_BACKSLASH_R: case URX_BACKSLASH_V: // types with an integer operand field. 
printf("%d", val); break; case URX_ONECHAR: case URX_ONECHAR_I: printf("%c", val<256?val:'?'); break; case URX_STRING: case URX_STRING_I: { int32_t lengthOp = fCompiledPat->elementAti(index+1); U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN); int32_t length = URX_VAL(lengthOp); int32_t i; for (i=val; i<val+length; i++) { UChar c = fLiteralText[i]; if (c < 32 || c >= 256) {c = '.';} printf("%c", c); } } break; case URX_SETREF: case URX_LOOP_SR_I: { UnicodeString s; UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val); set->toPattern(s, TRUE); for (int32_t i=0; i<s.length(); i++) { printf("%c", s.charAt(i)); } } break; case URX_STATIC_SETREF: case URX_STAT_SETREF_N: { UnicodeString s; if (val & URX_NEG_SET) { printf("NOT "); val &= ~URX_NEG_SET; } UnicodeSet *set = fStaticSets[val]; set->toPattern(s, TRUE); for (int32_t i=0; i<s.length(); i++) { printf("%c", s.charAt(i)); } } break; default: printf("??????"); break; } printf("\n"); #endif }