/**
 * Lexes a heredoc body once the opening "<<<IDENTIFIER" line has been read.
 *
 * Difference from PHP: heredocs are treated exactly like single quoted
 * strings. The PHP scanner distinguishes strings with embedded variables;
 * this lexer does not care about them, so a successfully terminated heredoc
 * is always reported as T_CONSTANT_ENCAPSED_STRING.
 *
 * @param buffer the lexer buffer; Current is expected to be just past the
 *        newline that ends the opening line
 * @return T_CONSTANT_ENCAPSED_STRING when the closing identifier was found,
 *         otherwise the non-zero code returned by SkipToIdentifier()
 */
int pelet::HandleHeredoc(BufferClass *buffer) {

	// The closing identifier lies after the "<<<" (3 chars) and before the
	// newline that Current has already passed; trim() takes care of newlines
	// that are longer than one character.
	UnicodeString identifier(buffer->TokenStart + 3, buffer->Current - buffer->TokenStart - 3 - 1);
	identifier.trim();

	// the identifier may be wrapped in double quotes; drop them
	const UnicodeString doubleQuote = UNICODE_STRING("\"", 1);
	if (identifier.startsWith(doubleQuote)) {
		identifier.remove(0, 1);
	}
	if (identifier.endsWith(doubleQuote)) {
		identifier.remove(identifier.length() - 1, 1);
	}

	// make sure there is something to look at before scanning the body
	if ((buffer->Limit - buffer->Current) < 2) {
		buffer->AppendToLexeme(1);
	}

	const int status = pelet::SkipToIdentifier(buffer, identifier);
	if (status != 0) {
		return status;
	}
	return T_CONSTANT_ENCAPSED_STRING;
}
/**
 * Builds the three frozen "ignorable character" sets.
 *
 * If any of the three allocations returns NULL, every set is deleted, the
 * members are reset to NULL and status is set to U_MEMORY_ALLOCATION_ERROR.
 *
 * @param status passed to each UnicodeSet constructor; overwritten with
 *        U_MEMORY_ALLOCATION_ERROR on allocation failure
 */
SimpleDateFormatStaticSets::SimpleDateFormatStaticSets(UErrorCode &status)
: fDateIgnorables(NULL),
  fTimeIgnorables(NULL),
  fOtherIgnorables(NULL)
{
    fDateIgnorables  = new UnicodeSet(UNICODE_STRING("[-,./[:whitespace:]]", 20), status);
    fTimeIgnorables  = new UnicodeSet(UNICODE_STRING("[-.:[:whitespace:]]", 19),  status);
    fOtherIgnorables = new UnicodeSet(UNICODE_STRING("[:whitespace:]", 14),       status);

    if (fDateIgnorables != NULL && fTimeIgnorables != NULL && fOtherIgnorables != NULL) {
        // All allocations succeeded: freeze the sets and we are done.
        fDateIgnorables->freeze();
        fTimeIgnorables->freeze();
        fOtherIgnorables->freeze();
        return;
    }

    // At least one allocation failed: release everything and report it.
    delete fDateIgnorables;
    fDateIgnorables = NULL;
    delete fTimeIgnorables;
    fTimeIgnorables = NULL;
    delete fOtherIgnorables;
    fOtherIgnorables = NULL;

    status = U_MEMORY_ALLOCATION_ERROR;
}
static void demoUnicodeStringInit() { // *** Make sure to read about invariant characters in utypes.h! *** // Initialization of Unicode strings from C literals works _only_ for // invariant characters! printf("\n* demoUnicodeStringInit() ---------- ***\n\n"); // the string literal is 32 chars long - this must be counted for the macro UnicodeString invariantOnly=UNICODE_STRING("such characters are safe 123 %-.", 32); /* * In C, we need two macros: one to declare the UChar[] array, and * one to populate it; the second one is a noop on platforms where * wchar_t is compatible with UChar and ASCII-based. * The length of the string literal must be counted for both macros. */ /* declare the invString array for the string */ U_STRING_DECL(invString, "such characters are safe 123 %-.", 32); /* populate it with the characters */ U_STRING_INIT(invString, "such characters are safe 123 %-.", 32); // compare the C and C++ strings printf("C and C++ Unicode strings are equal: %d\n", invariantOnly==UnicodeString(TRUE, invString, 32)); /* * convert between char * and UChar * strings that * contain only invariant characters */ static const char *cs1="such characters are safe 123 %-."; static UChar us1[40]; static char cs2[40]; u_charsToUChars(cs1, us1, 33); /* include the terminating NUL */ u_UCharsToChars(us1, cs2, 33); printf("char * -> UChar * -> char * with only " "invariant characters: \"%s\"\n", cs2); // initialize a UnicodeString from a string literal that contains // escape sequences written with invariant characters // do not forget to duplicate the backslashes for ICU to see them // then, count each double backslash only once! UnicodeString german=UNICODE_STRING( "Sch\\u00f6nes Auto: \\u20ac 11240.\\fPrivates Zeichen: \\U00102345\\n", 64). 
unescape(); printUnicodeString("german UnicodeString from unescaping:\n ", german); /* * C: convert and unescape a char * string with only invariant * characters to fill a UChar * string */ UChar buffer[200]; int32_t length; length=u_unescape( "Sch\\u00f6nes Auto: \\u20ac 11240.\\fPrivates Zeichen: \\U00102345\\n", buffer, UPRV_LENGTHOF(buffer)); printf("german C Unicode string from char * unescaping: (length %d)\n ", length); printUnicodeString("", UnicodeString(buffer)); }
/**
 * Advances the buffer line by line until a line that consists of the given
 * identifier (optionally followed by a semicolon) is found.
 *
 * @param buffer the lexer buffer to advance
 * @param identifier the heredoc / nowdoc closing identifier (taken by value;
 *        a semicolon is appended to the local copy to simplify the comparison)
 * @return 0 when the identifier line was found, T_ERROR_UNTERMINATED_STRING
 *         when a 0 character is reached first
 */
int pelet::SkipToIdentifier(BufferClass *buffer, UnicodeString identifier) {

	// every candidate line is normalized to end in a semicolon, so give the
	// identifier one too
	identifier.append(';');
	const UnicodeString semicolon = UNICODE_STRING(";", 1);

	/*
	 * Read one line at a time. If the line is the identifier we stop; if we
	 * reach the end, the heredoc is unterminated.
	 * Be careful: do NOT cache buffer->Current, it may change any time
	 * buffer->AppendToLexeme is called.
	 */
	UChar c = *buffer->Current;
	for (;;) {
		UnicodeString line;
		for (; c != 0 && c != '\n' && c != '\r'; c = *(++buffer->Current)) {
			line.append(c);

			// only fill the buffer when it is close to being used up; this
			// prevents useless copying of the buffer to remove slack
			if ((buffer->Limit - buffer->Current) < 2) {
				buffer->AppendToLexeme(1);
			}
		}
		if (c == 0) {
			return T_ERROR_UNTERMINATED_STRING;
		}

		// we are eating up a newline here; count it, otherwise line numbering
		// in lint errors will be wrong
		// NOTE(review): '\r' and '\n' each end a "line" here, so a "\r\n" pair
		// looks like it is counted twice - confirm against IncrementLine().
		buffer->IncrementLine();

		const bool hasEndingSemicolon = line.endsWith(semicolon);
		if (!hasEndingSemicolon) {
			line.append(semicolon);
		}
		if (line.compare(identifier) == 0) {

			// semicolons and newlines are NOT part of the nowdoc; the parser
			// will look for the semicolon, so step back onto it
			// (the semicolon is OPTIONAL for heredoc / nowdoc)
			if (hasEndingSemicolon) {
				buffer->Current--;
			}
			return 0;
		}

		// not the closing line: step over the line terminator and go on
		if ((buffer->Limit - buffer->Current) < 2) {
			buffer->AppendToLexeme(1);
		}
		c = *(++buffer->Current);
	}
}
void RBBISetBuilder::printRanges() { RangeDescriptor * rlRange; int i; RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n"); for (rlRange = fRangeList; rlRange != 0; rlRange = rlRange->fNext) { RBBIDebugPrintf("%2i %4x-%4x ", rlRange->fNum, rlRange->fStartChar, rlRange->fEndChar); for (i = 0; i < rlRange->fIncludesSets->size(); i++) { RBBINode * usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); UnicodeString setName = UNICODE_STRING("anon", 4); RBBINode * setRef = usetNode->fParent; if (setRef != NULL) { RBBINode * varRef = setRef->fParent; if (varRef != NULL && varRef->fType == RBBINode::varRef) { setName = varRef->fText; } } RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" "); } RBBIDebugPrintf("\n"); } }
void RBBISetBuilder::printSets() { int i; RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n"); for (i=0; ; i++) { RBBINode *usetNode; RBBINode *setRef; RBBINode *varRef; UnicodeString setName; usetNode = (RBBINode *)fRB->fUSetNodes->elementAt(i); if (usetNode == NULL) { break; } RBBIDebugPrintf("%3d ", i); setName = UNICODE_STRING("anonymous", 9); setRef = usetNode->fParent; if (setRef != NULL) { varRef = setRef->fParent; if (varRef != NULL && varRef->fType == RBBINode::varRef) { setName = varRef->fText; } } RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" "); RBBI_DEBUG_printUnicodeString(usetNode->fText); RBBIDebugPrintf("\n"); if (usetNode->fLeftChild != NULL) { usetNode->fLeftChild->printTree(TRUE); } } RBBIDebugPrintf("\n"); }
/**
 * Constructs the transliterator registered under the ID "Any-BreakInternal".
 * The insertion text is initialized from SPACE, no break iterator is set yet
 * and a fresh UVector32 is allocated for the boundaries.
 *
 * NOTE(review): the UErrorCode handed to the UVector32 constructor is a local
 * and is never inspected - confirm that users of `boundaries` cope with a
 * failed allocation.
 *
 * @param adoptedFilter forwarded to the Transliterator base class
 */
BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
    Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
    fInsertion(SPACE)
{
    UErrorCode status = U_ZERO_ERROR;
    boundaries = new UVector32(status);
    bi = NULL;
}
void MessageFormatRegressionTest::Test4142938() { UnicodeString pat = CharsToUnicodeString("''Vous'' {0,choice,0#n''|1#}avez s\\u00E9lectionn\\u00E9 " "{0,choice,0#aucun|1#{0}} client{0,choice,0#s|1#|2#s} " "personnel{0,choice,0#s|1#|2#s}."); UErrorCode status = U_ZERO_ERROR; MessageFormat *mf = new MessageFormat(pat, status); failure(status, "new MessageFormat"); UnicodeString PREFIX [] = { CharsToUnicodeString("'Vous' n'avez s\\u00E9lectionn\\u00E9 aucun clients personnels."), CharsToUnicodeString("'Vous' avez s\\u00E9lectionn\\u00E9 "), CharsToUnicodeString("'Vous' avez s\\u00E9lectionn\\u00E9 ") }; UnicodeString SUFFIX [] = { UnicodeString(), UNICODE_STRING(" client personnel.", 18), UNICODE_STRING(" clients personnels.", 20) }; for (int i=0; i<3; i++) { UnicodeString out; //out = mf->format(new Object[]{new Integer(i)}); Formattable objs [] = { Formattable((int32_t)i) }; FieldPosition pos(FieldPosition::DONT_CARE); out = mf->format(objs, 1, out, pos, status); if (!failure(status, "mf->format", TRUE)) { if (SUFFIX[i] == "") { if (out != PREFIX[i]) errln((UnicodeString)"" + i + ": Got \"" + out + "\"; Want \"" + PREFIX[i] + "\""); } else { if (!out.startsWith(PREFIX[i]) || !out.endsWith(SUFFIX[i])) errln((UnicodeString)"" + i + ": Got \"" + out + "\"; Want \"" + PREFIX[i] + "\"...\"" + SUFFIX[i] + "\""); } } } delete mf; }
void PluralRulesTest::testOrdinal() { IcuTestErrorCode errorCode(*this, "testOrdinal"); LocalPointer<PluralRules> pr(PluralRules::forLocale("en", UPLURAL_TYPE_ORDINAL, errorCode)); if (errorCode.logIfFailureAndReset("PluralRules::forLocale(en, UPLURAL_TYPE_ORDINAL) failed")) { return; } UnicodeString keyword = pr->select(2.); if (keyword != UNICODE_STRING("two", 3)) { dataerrln("PluralRules(en-ordinal).select(2) failed"); } }
void RBBISetBuilder::printRangeGroups() { RangeDescriptor * rlRange; RangeDescriptor * tRange; int i; int lastPrintedGroupNum = 0; RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n"); for (rlRange = fRangeList; rlRange != 0; rlRange = rlRange->fNext) { int groupNum = rlRange->fNum & 0xbfff; if (groupNum > lastPrintedGroupNum) { lastPrintedGroupNum = groupNum; RBBIDebugPrintf("%2i ", groupNum); if (rlRange->fNum & 0x4000) { RBBIDebugPrintf(" <DICT> ");} for (i = 0; i < rlRange->fIncludesSets->size(); i++) { RBBINode * usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); UnicodeString setName = UNICODE_STRING("anon", 4); RBBINode * setRef = usetNode->fParent; if (setRef != NULL) { RBBINode * varRef = setRef->fParent; if (varRef != NULL && varRef->fType == RBBINode::varRef) { setName = varRef->fText; } } RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" "); } i = 0; for (tRange = rlRange; tRange != 0; tRange = tRange->fNext) { if (tRange->fNum == rlRange->fNum) { if (i++ % 5 == 0) { RBBIDebugPrintf("\n "); } RBBIDebugPrintf(" %05x-%05x", tRange->fStartChar, tRange->fEndChar); } } RBBIDebugPrintf("\n"); } } RBBIDebugPrintf("\n"); }
void StringTest::Test_UNICODE_STRING() { UnicodeString ustringVar=UNICODE_STRING("aZ0 -", 5); if( ustringVar.length()!=5 || ustringVar[0]!=0x61 || ustringVar[1]!=0x5a || ustringVar[2]!=0x30 || ustringVar[3]!=0x20 || ustringVar[4]!=0x2d ) { errln("Test_UNICODE_STRING: UNICODE_STRING does not work right! " "See unistr.h and utypes.h with platform.h."); } }
//------------------------------------------------------------------------------------- // // RangeDescriptor::setDictionaryFlag // // Character Category Numbers that include characters from // the original Unicode Set named "dictionary" have bit 14 // set to 1. The RBBI runtime engine uses this to trigger // use of the word dictionary. // // This function looks through the Unicode Sets that it // (the range) includes, and sets the bit in fNum when // "dictionary" is among them. // // TODO: a faster way would be to find the set node for // "dictionary" just once, rather than looking it // up by name every time. // //------------------------------------------------------------------------------------- void RangeDescriptor::setDictionaryFlag() { int i; for (i=0; i<this->fIncludesSets->size(); i++) { RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i); UnicodeString setName; RBBINode *setRef = usetNode->fParent; if (setRef != NULL) { RBBINode *varRef = setRef->fParent; if (varRef != NULL && varRef->fType == RBBINode::varRef) { setName = varRef->fText; } } if (setName.compare(UNICODE_STRING("dictionary", 10)) == 0) { // TODO: no string literals. this->fNum |= 0x4000; break; } } }
U_CDECL_END

/**
 * Constructs the transliterator registered under the ID "Name-Any" and fills
 * the `legal` set through uprv_getCharNameCharacters().
 *
 * @param adoptedFilter forwarded to the Transliterator base class
 */
NameUnicodeTransliterator::NameUnicodeTransliterator(UnicodeFilter* adoptedFilter) :
    Transliterator(UNICODE_STRING("Name-Any", 8), adoptedFilter) {

    UnicodeSet *legalSet = &legal;

    // Only the add callback is provided; the remaining USetAdder slots
    // (addRange, addString, remove) are not needed here.
    USetAdder legalAdder = {
        (USet *)legalSet,   // USet* == UnicodeSet*
        _set_add,
        NULL,
        NULL,
        NULL
    };
    uprv_getCharNameCharacters(&legalAdder);
}
// Default constructor: the iterator starts out on the fixed text "abc".
SubStringCharIter()
{
    setText(UNICODE_STRING("abc", 3));
}
void DataDrivenNumberFormatTestSuite::run(const char *fileName, UBool runAllTests) { fFileLineNumber = 0; fFormatTestNumber = 0; UErrorCode status = U_ZERO_ERROR; for (int32_t i = 0; i < UPRV_LENGTHOF(fPreviousFormatters); ++i) { delete fPreviousFormatters[i]; fPreviousFormatters[i] = newFormatter(status); } if (!assertSuccess("Can't create previous formatters", status)) { return; } CharString path(getSourceTestData(status), status); path.appendPathPart(fileName, status); const char *codePage = "UTF-8"; LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, &status)); if (!assertSuccess("Can't open data file", status)) { return; } UnicodeString columnValues[kNumberFormatTestTupleFieldCount]; ENumberFormatTestTupleField columnTypes[kNumberFormatTestTupleFieldCount]; int32_t columnCount; int32_t state = 0; while(U_SUCCESS(status)) { // Read a new line if necessary. if(fFileLine.isEmpty()) { if(!readLine(f.getAlias(), status)) { break; } if (fFileLine.isEmpty() && state == 2) { state = 0; } continue; } if (fFileLine.startsWith("//")) { fFileLine.remove(); continue; } // Initial setup of test. 
if (state == 0) { if (fFileLine.startsWith(UNICODE_STRING("test ", 5))) { fFileTestName = fFileLine; fTuple.clear(); } else if(fFileLine.startsWith(UNICODE_STRING("set ", 4))) { setTupleField(status); } else if(fFileLine.startsWith(UNICODE_STRING("begin", 5))) { state = 1; } else { showError("Unrecognized verb."); return; } // column specification } else if (state == 1) { columnCount = splitBy(columnValues, UPRV_LENGTHOF(columnValues), 0x9); for (int32_t i = 0; i < columnCount; ++i) { columnTypes[i] = NumberFormatTestTuple::getFieldByName( columnValues[i]); if (columnTypes[i] == kNumberFormatTestTupleFieldCount) { showError("Unrecognized field name."); return; } } state = 2; // run the tests } else { int32_t columnsInThisRow = splitBy(columnValues, columnCount, 0x9); for (int32_t i = 0; i < columnsInThisRow; ++i) { fTuple.setField( columnTypes[i], columnValues[i].unescape(), status); } for (int32_t i = columnsInThisRow; i < columnCount; ++i) { fTuple.clearField(columnTypes[i], status); } if (U_FAILURE(status)) { showError("Invalid column values"); return; } if (!breaksC() || runAllTests) { UnicodeString errorMessage; if (!isPass(fTuple, errorMessage, status)) { showFailure(errorMessage); } } } fFileLine.remove(); } }
/**
 * Reads an XML file, converts it to Unicode and parses it.
 *
 * The charset is determined as follows:
 *   1. from a Unicode signature at the start of the file
 *   2. otherwise the first block is read as ISO-8859-1 and the
 *      encoding="charset" attribute of the XML declaration is used
 *   3. otherwise (no declaration, or a declaration without an encoding
 *      attribute) UTF-8 is assumed
 *
 * @param filename  path of the file to open (opened with mode "rb")
 * @param errorCode ICU in/out error code; nothing is done if it already
 *                  indicates a failure. Set to U_FILE_ACCESS_ERROR if the
 *                  file cannot be opened and to U_MEMORY_ALLOCATION_ERROR
 *                  if string capacity cannot be reserved.
 * @return the result of parse() on the converted contents, or NULL on failure
 */
UXMLElement *
UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
    char bytes[4096], charsetBuffer[100];
    FileStream *f;
    const char *charset, *pb;
    UnicodeString src;
    UConverter *cnv=NULL;   // never left indeterminate; ucnv_close(NULL) at exit is harmless
    UChar *buffer, *pu;
    int32_t fileLength, bytesLength, length, capacity;
    UBool flush;

    if(U_FAILURE(errorCode)) {
        return NULL;
    }

    f=T_FileStream_open(filename, "rb");
    if(f==NULL) {
        errorCode=U_FILE_ACCESS_ERROR;
        return NULL;
    }

    bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
    if(bytesLength<(int32_t)sizeof(bytes)) {
        // we have already read the entire file
        fileLength=bytesLength;
    } else {
        // get the file length
        fileLength=T_FileStream_size(f);
    }

    /*
     * get the charset:
     * 1. Unicode signature
     * 2. treat as ISO-8859-1 and read XML encoding="charset"
     * 3. default to UTF-8
     */
    charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
    if(U_SUCCESS(errorCode) && charset!=NULL) {
        // open converter according to Unicode signature
        cnv=ucnv_open(charset, &errorCode);
    } else {
        // read as Latin-1 and parse the XML declaration and encoding
        cnv=ucnv_open("ISO-8859-1", &errorCode);
        if(U_FAILURE(errorCode)) {
            // unexpected error opening Latin-1 converter
            goto exit;
        }

        buffer=src.getBuffer(bytesLength);
        if(buffer==NULL) {
            // unexpected failure to reserve some string capacity
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            goto exit;
        }
        pb=bytes;
        pu=buffer;
        ucnv_toUnicode(
            cnv,
            &pu, buffer+src.getCapacity(),
            &pb, bytes+bytesLength,
            NULL, TRUE, &errorCode);
        src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
        ucnv_close(cnv);
        cnv=NULL;
        if(U_FAILURE(errorCode)) {
            // unexpected error in conversion from Latin-1
            src.remove();
            goto exit;
        }

        // parse XML declaration
        if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
            int32_t declEnd=mXMLDecl.end(errorCode);
            // go beyond <?xml
            int32_t pos=src.indexOf((UChar)x_l)+1;

            mAttrValue.reset(src);
            while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per attribute on this element.
                UnicodeString attName  = mAttrValue.group(1, errorCode);
                UnicodeString attValue = mAttrValue.group(2, errorCode);

                // Trim the quotes from the att value.  These are left over from the original regex
                //   that parsed the attribute, which couldn't conveniently strip them.
                attValue.remove(0,1);                    // one char from the beginning
                attValue.truncate(attValue.length()-1);  // and one from the end.

                if(attName==UNICODE_STRING("encoding", 8)) {
                    length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
                    // An over-long encoding name may fill the buffer without
                    // room for a terminator; make sure the name handed to
                    // ucnv_open() is always NUL-terminated.
                    charsetBuffer[sizeof(charsetBuffer)-1]=0;
                    charset=charsetBuffer;
                    break;
                }
                pos = mAttrValue.end(2, errorCode);
            }
        }

        // This must happen whether or not an XML declaration was found:
        // otherwise a file with neither signature nor declaration would
        // reach the conversion loop below without a converter.
        if(charset==NULL) {
            // default to UTF-8
            charset="UTF-8";
        }
        cnv=ucnv_open(charset, &errorCode);
    }

    if(U_FAILURE(errorCode)) {
        // unable to open the converter
        goto exit;
    }

    // convert the file contents
    capacity=fileLength;        // estimated capacity
    src.getBuffer(capacity);
    src.releaseBuffer(0);       // zero length
    flush=FALSE;
    for(;;) {
        // convert contents of bytes[bytesLength]
        pb=bytes;
        for(;;) {
            length=src.length();
            buffer=src.getBuffer(capacity);
            if(buffer==NULL) {
                // unexpected failure to reserve some string capacity
                errorCode=U_MEMORY_ALLOCATION_ERROR;
                goto exit;
            }

            // append to what has been converted so far
            pu=buffer+length;
            ucnv_toUnicode(
                cnv, &pu, buffer+src.getCapacity(),
                &pb, bytes+bytesLength,
                NULL, FALSE, &errorCode);
            src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
            if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
                errorCode=U_ZERO_ERROR;
                capacity=(3*src.getCapacity())/2; // increase capacity by 50%
            } else {
                break;
            }
        }

        if(U_FAILURE(errorCode)) {
            break; // conversion error
        }

        if(flush) {
            break; // completely converted the file
        }

        // read next block
        bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
        if(bytesLength==0) {
            // reached end of file, convert once more to flush the converter
            flush=TRUE;
        }
    };

exit:
    ucnv_close(cnv);
    T_FileStream_close(f);

    if(U_SUCCESS(errorCode)) {
        return parse(src, errorCode);
    } else {
        return NULL;
    }
}
/**
 * Formats the calendar's date/time, using a relative day string (as returned
 * by getStringForDay() for the day difference to now) in place of the date
 * part when one is available.
 *
 * @param cal      the calendar holding the date/time to format
 * @param appendTo the string the result is appended to
 * @param pos      field position, passed through to the underlying formatter
 * @return appendTo
 */
UnicodeString& RelativeDateFormat::format(  Calendar& cal,
                                UnicodeString& appendTo,
                                FieldPosition& pos) const {

    UErrorCode status = U_ZERO_ERROR;
    UDisplayContext capitalizationContext = getContext(UDISPCTX_TYPE_CAPITALIZATION, status);

    // how many days lie between 'cal' and now?
    int dayDiff = dayDifference(cal, status);

    // look up the relative string for that difference, if there is one
    UnicodeString relativeDayString;
    int32_t relativeLength = 0;
    const UChar *relativeChars = getStringForDay(dayDiff, relativeLength, status);
    if (U_SUCCESS(status) && relativeChars != NULL) {
        relativeDayString.setTo(relativeChars, relativeLength);
    }

    if (!relativeDayString.isEmpty() && !fDatePattern.isEmpty() &&
        (fTimePattern.isEmpty() || fCombinedFormat == NULL || fCombinedHasDateAtStart)) {
#if !UCONFIG_NO_BREAK_ITERATION
        // capitalize relativeDayString according to context for relative, set formatter no context
        if ( u_islower(relativeDayString.char32At(0)) && fCapitalizationBrkIter!= NULL &&
             ( capitalizationContext==UDISPCTX_CAPITALIZATION_FOR_BEGINNING_OF_SENTENCE ||
               (capitalizationContext==UDISPCTX_CAPITALIZATION_FOR_UI_LIST_OR_MENU && fCapitalizationOfRelativeUnitsForUIListMenu) ||
               (capitalizationContext==UDISPCTX_CAPITALIZATION_FOR_STANDALONE && fCapitalizationOfRelativeUnitsForStandAlone) ) ) {
            // titlecase first word of relativeDayString
            relativeDayString.toTitle(fCapitalizationBrkIter, fLocale, U_TITLECASE_NO_LOWERCASE | U_TITLECASE_NO_BREAK_ADJUSTMENT);
        }
#endif
        fDateTimeFormatter->setContext(UDISPCTX_CAPITALIZATION_NONE, status);
    } else {
        // set our context for the formatter
        fDateTimeFormatter->setContext(capitalizationContext, status);
    }

    // Case 1: time only.
    if (fDatePattern.isEmpty()) {
        fDateTimeFormatter->applyPattern(fTimePattern);
        fDateTimeFormatter->format(cal,appendTo,pos);
        return appendTo;
    }

    // Case 2: date only (or nothing to combine date and time with).
    if (fTimePattern.isEmpty() || fCombinedFormat == NULL) {
        if (!relativeDayString.isEmpty()) {
            appendTo.append(relativeDayString);
        } else {
            fDateTimeFormatter->applyPattern(fDatePattern);
            fDateTimeFormatter->format(cal,appendTo,pos);
        }
        return appendTo;
    }

    // Case 3: date and time, combined through fCombinedFormat.
    UnicodeString datePattern;
    if (!relativeDayString.isEmpty()) {
        // Need to quote the relativeDayString to make it a legal date pattern
        relativeDayString.findAndReplace(UNICODE_STRING("'", 1), UNICODE_STRING("''", 2)); // double any existing APOSTROPHE
        relativeDayString.insert(0, APOSTROPHE); // add APOSTROPHE at beginning...
        relativeDayString.append(APOSTROPHE); // and at end
        datePattern.setTo(relativeDayString);
    } else {
        datePattern.setTo(fDatePattern);
    }
    UnicodeString combinedPattern;
    fCombinedFormat->format(fTimePattern, datePattern, combinedPattern, status);
    fDateTimeFormatter->applyPattern(combinedPattern);
    fDateTimeFormatter->format(cal,appendTo,pos);

    return appendTo;
}
/**
 * Constructs the transliterator registered under the ID "Any-Title".
 * NULL is passed to the CaseMapTransliterator base class as its second
 * argument.
 */
TitlecaseTransliterator::TitlecaseTransliterator()
    : CaseMapTransliterator(UNICODE_STRING("Any-Title", 9), NULL)
{
    // Two characters of preceding context are needed,
    // e.g. to handle the case of "can't"
    setMaximumContextLength(2);
}
/**
 * Constructs the transliterator registered under the ID "Any-Name".
 *
 * @param adoptedFilter forwarded to the Transliterator base class
 */
UnicodeNameTransliterator::UnicodeNameTransliterator(UnicodeFilter* adoptedFilter)
    : Transliterator(UNICODE_STRING("Any-Name", 8), adoptedFilter)
{
}
//----------------------------------------------------------------------------------------
//
//  doParseAction        Do some action during rule parsing.
//                       Called by the parse state machine.
//                       Actions build the parse tree and Unicode Sets,
//                       and maintain the parse stack for nested expressions.
//
//                       Returns TRUE unless the action asks to stop: the error
//                       actions doRuleError, doTagExpectedError,
//                       doRuleErrorAssignExpr, an undefined variable in
//                       doCheckVarDef, doExit and unknown actions all set the
//                       return value to FALSE.
//
//                       NOTE(review): most cases dereference the node returned by
//                       pushNewNode() without a check; only doStartVariableName
//                       tests the status first.  Confirm that pushNewNode()
//                       cannot return NULL on these paths.
//
//                       TODO:  unify EParseAction and RBBI_RuleParseAction enum types.
//                              They represent exactly the same thing.  They're separate
//                              only to work around enum forward declaration restrictions
//                              in some compilers, while at the same time avoiding multiple
//                              definitions problems.  I'm sure that there's a better way.
//
//----------------------------------------------------------------------------------------
UBool RBBIRuleScanner::doParseActions(EParseAction action)
{
    RBBINode *n       = NULL;

    UBool   returnVal = TRUE;

    switch ((RBBI_RuleParseAction)action) {

    case doExprStart:
        pushNewNode(RBBINode::opStart);
        fRuleNum++;
        break;


    case doExprOrOperator:
        {
            // The operand on top of the stack becomes the left child of a new OR node.
            fixOpStack(RBBINode::precOpCat);
            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
            RBBINode  *orNode      = pushNewNode(RBBINode::opOr);
            orNode->fLeftChild     = operandNode;
            operandNode->fParent   = orNode;
        }
        break;

    case doExprCatOperator:
        // concatenation operator.
        // For the implicit concatenation of adjacent terms in an expression that are
        //   not separated by any other operator.  Action is invoked between the
        //   actions for the two terms.
        {
            fixOpStack(RBBINode::precOpCat);
            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
            RBBINode  *catNode     = pushNewNode(RBBINode::opCat);
            catNode->fLeftChild    = operandNode;
            operandNode->fParent   = catNode;
        }
        break;

    case doLParen:
        // Open Paren.
        //   The openParen node is a dummy operation type with a low precedence,
        //     which has the effect of ensuring that any real binary op that
        //     follows within the parens binds more tightly to the operands than
        //     stuff outside of the parens.
        pushNewNode(RBBINode::opLParen);
        break;

    case doExprRParen:
        fixOpStack(RBBINode::precLParen);
        break;

    case doNOP:
        break;

    case doStartAssign:
        // We've just scanned "$variable = "
        // The top of the node stack has the $variable ref node.

        // Save the start position of the RHS text in the StartExpression node
        //   that precedes the $variableReference node on the stack.
        //   This will eventually be used when saving the full $variable replacement
        //   text as a string.
        n = fNodeStack[fNodeStackPtr-1];
        n->fFirstPos = fNextIndex;              // move past the '='

        // Push a new start-of-expression node; needed to keep parse of the
        //   RHS expression happy.
        pushNewNode(RBBINode::opStart);
        break;




    case doEndAssign:
        {
            // We have reached the end of an assignment statement.
            //   Current scan char is the ';' that terminates the assignment.

            // Terminate expression, leaves expression parse tree rooted in TOS node.
            fixOpStack(RBBINode::precStart);

            RBBINode *startExprNode  = fNodeStack[fNodeStackPtr-2];
            RBBINode *varRefNode     = fNodeStack[fNodeStackPtr-1];
            RBBINode *RHSExprNode    = fNodeStack[fNodeStackPtr];

            // Save original text of right side of assignment, excluding the terminating ';'
            //  in the root of the node for the right-hand-side expression.
            RHSExprNode->fFirstPos = startExprNode->fFirstPos;
            RHSExprNode->fLastPos  = fScanIndex;
            fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText);

            // Expression parse tree becomes l. child of the $variable reference node.
            varRefNode->fLeftChild = RHSExprNode;
            RHSExprNode->fParent   = varRefNode;

            // Make a symbol table entry for the $variableRef node.
            fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus);
            if (U_FAILURE(*fRB->fStatus)) {
                // This is a round-about way to get the parse position set
                //  so that duplicate symbols error messages include a line number.
                UErrorCode t = *fRB->fStatus;
                *fRB->fStatus = U_ZERO_ERROR;
                error(t);
            }

            // Clean up the stack: the start node is deleted here and all three
            //  entries (start, $variable ref, RHS expression) are popped.
            delete startExprNode;
            fNodeStackPtr-=3;
            break;
        }

    case doEndOfRule:
        {
        fixOpStack(RBBINode::precStart);      // Terminate expression, leaves expression
        if (U_FAILURE(*fRB->fStatus)) {       //   parse tree rooted in TOS node.
            break;
        }
#ifdef RBBI_DEBUG
        if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");}
#endif
        U_ASSERT(fNodeStackPtr == 1);

        // If this rule includes a look-ahead '/', add an endMark node to the
        //   expression tree.
        if (fLookAheadRule) {
            RBBINode  *thisRule       = fNodeStack[fNodeStackPtr];
            RBBINode  *endNode        = pushNewNode(RBBINode::endMark);
            RBBINode  *catNode        = pushNewNode(RBBINode::opCat);
            // Pop the two nodes just pushed; catNode then replaces the rule
            //   in the stack slot the rule occupied.
            fNodeStackPtr -= 2;
            catNode->fLeftChild       = thisRule;
            catNode->fRightChild      = endNode;
            fNodeStack[fNodeStackPtr] = catNode;
            endNode->fVal             = fRuleNum;
            endNode->fLookAheadEnd    = TRUE;
        }

        // All rule expressions are ORed together.
        // The ';' that terminates an expression really just functions as a '|' with
        //   a low operator precedence.
        //
        // Each of the four sets of rules are collected separately.
        //  (forward, reverse, safe_forward, safe_reverse)
        //  OR this rule into the appropriate group of them.
        //
        RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : fRB->fDefaultTree);

        if (*destRules != NULL) {
            // This is not the first rule encountered.
            // OR previous stuff  (from *destRules)
            // with the current rule expression (on the Node Stack)
            //  with the resulting OR expression going to *destRules
            //
            RBBINode  *thisRule    = fNodeStack[fNodeStackPtr];
            RBBINode  *prevRules   = *destRules;
            RBBINode  *orNode      = pushNewNode(RBBINode::opOr);
            orNode->fLeftChild     = prevRules;
            prevRules->fParent     = orNode;
            orNode->fRightChild    = thisRule;
            thisRule->fParent      = orNode;
            *destRules             = orNode;
        }
        else
        {
            // This is the first rule encountered (for this direction).
            // Just move its parse tree from the stack to *destRules.
            *destRules = fNodeStack[fNodeStackPtr];
        }
        fReverseRule   = FALSE;   // in preparation for the next rule.
        fLookAheadRule = FALSE;
        fNodeStackPtr  = 0;
        }
        break;


    case doRuleError:
        error(U_BRK_RULE_SYNTAX);
        returnVal = FALSE;
        break;


    case doVariableNameExpectedErr:
        error(U_BRK_RULE_SYNTAX);
        break;


    //
    //  Unary operands  + ? *
    //    These all appear after the operand to which they apply.
    //    When we hit one, the operand (may be a whole sub expression)
    //    will be on the top of the stack.
    //    Unary Operator becomes TOS, with the old TOS as its one child.
    case doUnaryOpPlus:
        {
            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
            RBBINode  *plusNode    = pushNewNode(RBBINode::opPlus);
            plusNode->fLeftChild   = operandNode;
            operandNode->fParent   = plusNode;
        }
        break;

    case doUnaryOpQuestion:
        {
            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
            RBBINode  *qNode       = pushNewNode(RBBINode::opQuestion);
            qNode->fLeftChild      = operandNode;
            operandNode->fParent   = qNode;
        }
        break;

    case doUnaryOpStar:
        {
            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
            RBBINode  *starNode    = pushNewNode(RBBINode::opStar);
            starNode->fLeftChild   = operandNode;
            operandNode->fParent   = starNode;
        }
        break;

    case doRuleChar:
        // A "Rule Character" is any single character that is a literal part
        // of the regular expression.  Like a, b and c in the expression "(abc*) | [:L:]"
        // These are pretty uncommon in break rules; the terms are more commonly
        //  sets.  To keep things uniform, treat these characters like
        //  sets that just happen to contain only one character.
        {
            n = pushNewNode(RBBINode::setRef);
            findSetFor(fC.fChar, n);
            n->fFirstPos = fScanIndex;
            n->fLastPos  = fNextIndex;
            fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
            break;
        }

    case doDotAny:
        // scanned a ".", meaning match any single character.
        {
            n = pushNewNode(RBBINode::setRef);
            findSetFor(kAny, n);
            n->fFirstPos = fScanIndex;
            n->fLastPos  = fNextIndex;
            fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
            break;
        }

    case doSlash:
        // Scanned a '/', which identifies a look-ahead break position in a rule.
        n = pushNewNode(RBBINode::lookAhead);
        n->fVal      = fRuleNum;
        n->fFirstPos = fScanIndex;
        n->fLastPos  = fNextIndex;
        fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
        fLookAheadRule = TRUE;
        break;


    case doStartTagValue:
        // Scanned a '{', the opening delimiter for a tag value within a rule.
        n = pushNewNode(RBBINode::tag);
        n->fVal      = 0;
        n->fFirstPos = fScanIndex;
        n->fLastPos  = fNextIndex;
        break;

    case doTagDigit:
        // Just scanned a decimal digit that's part of a tag value;
        //   accumulate it into the value of the tag node on top of the stack.
        {
            n = fNodeStack[fNodeStackPtr];
            uint32_t v = u_charDigitValue(fC.fChar);
            U_ASSERT(v < 10);
            n->fVal = n->fVal*10 + v;
            break;
        }

    case doTagValue:
        // End of a tag value: record its end position and save its text.
        n = fNodeStack[fNodeStackPtr];
        n->fLastPos = fNextIndex;
        fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
        break;

    case doTagExpectedError:
        error(U_BRK_MALFORMED_RULE_TAG);
        returnVal = FALSE;
        break;

    case doOptionStart:
        // Scanning a !!option.   At the start of string.
        fOptionStart = fScanIndex;
        break;

    case doOptionEnd:
        {
            // The option name is the rule text from fOptionStart up to the scan position.
            UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart);
            if (opt == UNICODE_STRING("chain", 5)) {
                fRB->fChainRules = TRUE;
            } else if (opt == UNICODE_STRING("LBCMNoChain", 11)) {
                fRB->fLBCMNoChain = TRUE;
            } else if (opt == UNICODE_STRING("forward", 7)) {
                fRB->fDefaultTree   = &fRB->fForwardTree;
            } else if (opt == UNICODE_STRING("reverse", 7)) {
                fRB->fDefaultTree   = &fRB->fReverseTree;
            } else if (opt == UNICODE_STRING("safe_forward", 12)) {
                fRB->fDefaultTree   = &fRB->fSafeFwdTree;
            } else if (opt == UNICODE_STRING("safe_reverse", 12)) {
                fRB->fDefaultTree   = &fRB->fSafeRevTree;
            } else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) {
                fRB->fLookAheadHardBreak = TRUE;
            } else {
                error(U_BRK_UNRECOGNIZED_OPTION);
            }
        }
        break;

    case doReverseDir:
        fReverseRule = TRUE;
        break;

    case doStartVariableName:
        n = pushNewNode(RBBINode::varRef);
        if (U_FAILURE(*fRB->fStatus)) {
            break;
        }
        n->fFirstPos = fScanIndex;
        break;

    case doEndVariableName:
        n = fNodeStack[fNodeStackPtr];
        if (n==NULL || n->fType != RBBINode::varRef) {
            error(U_BRK_INTERNAL_ERROR);
            break;
        }
        n->fLastPos = fScanIndex;
        // The name text starts one position after fFirstPos.
        fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText);
        // Look the newly scanned name up in the symbol table
        //   If there's an entry, set the l. child of the var ref to the replacement expression.
        //   (We also pass through here when scanning assignments, but no harm is done, other
        //    than a slight wasted effort that seems hard to avoid.  Lookup will be null)
        n->fLeftChild = fSymbolTable->lookupNode(n->fText);
        break;

    case doCheckVarDef:
        // A variable reference without a replacement expression is undefined.
        n = fNodeStack[fNodeStackPtr];
        if (n->fLeftChild == NULL) {
            error(U_BRK_UNDEFINED_VARIABLE);
            returnVal = FALSE;
        }
        break;

    case doExprFinished:
        break;

    case doRuleErrorAssignExpr:
        error(U_BRK_ASSIGN_ERROR);
        returnVal = FALSE;
        break;

    case doExit:
        returnVal = FALSE;
        break;

    case doScanUnicodeSet:
        scanSet();
        break;

    default:
        // An action this function does not know about.
        error(U_BRK_INTERNAL_ERROR);
        returnVal = FALSE;
        break;
    }
    return returnVal;
}
/**
 * Constructs the transliterator registered under the ID "Any-Lower";
 * ucase_toFullLower is handed to the CaseMapTransliterator base class as
 * its second argument.
 */
LowercaseTransliterator::LowercaseTransliterator()
    : CaseMapTransliterator(UNICODE_STRING("Any-Lower", 9), ucase_toFullLower)
{
}
// Returns the name of the wrapped dataset: m_pDataset is viewed as a
// UGC::UGDatasetVector and the C string of its name is converted with
// UNICODE_STRING.
_tstring SMCDatasetVector::GetName()
{
	UGC::UGDatasetVector* vectorDataset = (UGC::UGDatasetVector*)m_pDataset;

	return UNICODE_STRING(vectorDataset->GetName().Cstr());
}
// Creates the service under the name "Break Iterator" and registers an
// ICUBreakIteratorFactory with it.
// NOTE(review): the status passed to registerFactory() is a local that is
// never inspected afterwards - confirm that a failed registration is benign.
ICUBreakIteratorService()
    : ICULocaleService(UNICODE_STRING("Break Iterator", 14))
{
    UErrorCode status = U_ZERO_ERROR;
    registerFactory(new ICUBreakIteratorFactory(), status);
}
/**
 * Constructs the transliterator registered under the ID "Any-Upper";
 * ucase_toFullUpper is handed to the CaseMapTransliterator base class as
 * its second argument.
 */
UppercaseTransliterator::UppercaseTransliterator()
    : CaseMapTransliterator(UNICODE_STRING("Any-Upper", 9), ucase_toFullUpper)
{
}