// --------------------------------------------------------------------------- // BlockRangeFactory: Range creation methods // --------------------------------------------------------------------------- void BlockRangeFactory::buildRanges() { if (fRangesCreated) return; if (!fKeywordsInitialized) { initializeKeywordMap(); } RangeTokenMap* rangeTokMap = RangeTokenMap::instance(); TokenFactory* tokFactory = rangeTokMap->getTokenFactory(); //for performance, once the desired specials and private use are found //don't need to compareString anymore bool foundSpecial = false; bool foundPrivate = false; for (int i=0; i < BLOCKNAMESIZE; i++) { RangeToken* tok = tokFactory->createRange(); tok->addRange(blockRanges[i*2], blockRanges[(i*2)+1]); if (!foundSpecial && XMLString::equals((XMLCh*)fgBlockNames[i] , (XMLCh*) fgBlockIsSpecials)) { tok->addRange(0xFFF0, 0xFFFD); foundSpecial = true; } if (!foundPrivate && XMLString::equals((XMLCh*)fgBlockNames[i] , (XMLCh*) fgBlockIsPrivateUse)) { tok->addRange(0xF0000, 0xFFFFD); tok->addRange(0x100000, 0x10FFFD); foundPrivate = true; } rangeTokMap->setRangeToken(fgBlockNames[i], tok); } fRangesCreated = true; }
/**
 * Builds a new range token covering every code point in [0, UTF16_MAX] that
 * is NOT covered by the ranges stored in tok.
 *
 *   for RANGE:  creates the complement.
 *   for NRANGE: creates the same-meaning RANGE.
 *
 * tok is sorted and compacted as a side effect. An empty tok (no ranges
 * stored) yields the full range [0, UTF16_MAX].
 *
 * @param tok         token to complement; must be of type T_RANGE or T_NRANGE
 * @param tokFactory  factory used to create the returned token
 * @param manager     memory manager handed to the exception on a bad argument
 * @return the newly created token, flagged as already compacted
 * @throws IllegalArgumentException if tok is neither T_RANGE nor T_NRANGE
 */
Token* RangeToken::complementRanges(RangeToken* const tok,
                                    TokenFactory* const tokFactory,
                                    MemoryManager* const manager) {

    if (tok->getTokenType() != T_RANGE
        && tok->getTokenType() != T_NRANGE)
        ThrowXMLwithMemMgr(IllegalArgumentException, XMLExcepts::Regex_ComplementRangesInvalidArg, manager);

    tok->sortRanges();
    tok->compactRanges();

    RangeToken* rangeTok = tokFactory->createRange();

    // Nothing stored: the complement of the empty set is everything. This
    // must be handled before indexing fRanges, which may be null, and before
    // the unsigned arithmetic on fElemCount below could wrap around.
    if (tok->fRanges == 0 || tok->fElemCount == 0) {
        rangeTok->addRange(0, UTF16_MAX);
        rangeTok->fCompacted = true;
        return rangeTok;
    }

    XMLInt32 lastElem = tok->fRanges[tok->fElemCount - 1];

    // Gap in front of the first stored range.
    if (tok->fRanges[0] > 0) {
        rangeTok->addRange(0, tok->fRanges[0] - 1);
    }

    // Gaps between consecutive stored ranges: fRanges[i] is the end of one
    // range and fRanges[i+1] the start of the next one.
    for (unsigned int i = 1; i + 2 < tok->fElemCount; i += 2) {
        rangeTok->addRange(tok->fRanges[i] + 1, tok->fRanges[i+1] - 1);
    }

    // Gap after the last stored range.
    if (lastElem != UTF16_MAX) {
        rangeTok->addRange(lastElem + 1, UTF16_MAX);
    }

    rangeTok->fCompacted = true;

    return rangeTok;
}
/**
 * Tries to match the single character at the current position against the
 * range token attached to op.
 *
 * @param context     matching context supplying the text and its limit
 * @param op          operation whose token is the range to test against
 * @param offset      current position; updated only when the match succeeds
 * @param direction   > 0 to read the character at offset, otherwise the one
 *                    before it
 * @param ignoreCase  when true the token's case-insensitive variant is used
 * @return true if the character belongs to the range
 */
bool RegularExpression::matchRange(Context* const context, const Op* const op,
                                   int& offset, const short direction,
                                   const bool ignoreCase)
{
    int pos = (direction > 0) ? offset : offset - 1;

    if (pos < 0 || pos >= context->fLimit)
        return false;

    XMLInt32 strCh = 0;
    if (!context->nextCh(strCh, pos, direction))
        return false;

    RangeToken* rangeTok = (RangeToken *) op->getToken();
    if (ignoreCase)
        rangeTok = rangeTok->getCaseInsensitiveToken(fTokenFactory);

    if (!rangeTok->match(strCh))
        return false;

    // Moving forward consumes the character; moving backward leaves the
    // offset on the character that was just examined.
    if (direction > 0)
        ++pos;
    offset = pos;

    return true;
}
// --------------------------------------------------------------------------- // RangeToken: Getter methods // --------------------------------------------------------------------------- RangeToken* RangeToken::getCaseInsensitiveToken(TokenFactory* const tokFactory) { if (fCaseIToken == 0 && tokFactory) { bool isNRange = (getTokenType() == T_NRANGE) ? true : false; RangeToken* lwrToken = tokFactory->createRange(isNRange); for (unsigned int i = 0; i < fElemCount - 1; i += 2) { for (XMLInt32 ch = fRanges[i]; ch <= fRanges[i + 1]; ++ch) { #if defined(XML_USE_ICU_TRANSCODER) || defined (XML_USE_UNICONV390_TRANSCODER) const XMLInt32 upperCh = u_toupper(ch); if (upperCh != ch) { lwrToken->addRange(upperCh, upperCh); } const XMLInt32 lowerCh = u_tolower(ch); if (lowerCh != ch) { lwrToken->addRange(lowerCh, lowerCh); } const XMLInt32 titleCh = u_totitle(ch); if (titleCh != ch && titleCh != upperCh) { lwrToken->addRange(titleCh, titleCh); } #else if (ch >= chLatin_A && ch <= chLatin_Z) { ch += chLatin_a - chLatin_A; lwrToken->addRange(ch, ch); } else if (ch >= chLatin_a && ch <= chLatin_z) { ch -= chLatin_a - chLatin_A; lwrToken->addRange(ch, ch); } #endif } } lwrToken->mergeRanges(this); lwrToken->compactRanges(); lwrToken->createMap(); fCaseIToken = lwrToken; } return fCaseIToken; }
// --------------------------------------------------------------------------- // RangeToken: Getter methods // --------------------------------------------------------------------------- RangeToken* RangeToken::getCaseInsensitiveToken(TokenFactory* const tokFactory) { // REVIST // We will not build a token with case insenstive ranges // For now we will return a copy of ourselves. if (fCaseIToken == 0 && tokFactory) { bool isNRange = (getTokenType() == T_NRANGE) ? true : false; RangeToken* lwrToken = tokFactory->createRange(isNRange); lwrToken->mergeRanges(this); fCaseIToken = lwrToken; } return fCaseIToken; }
/**
 * Tries to match the single character at the current position against the
 * range token attached to op.
 *
 * REVISIT: ignoreCase is accepted but not honoured yet; the character is
 * always tested against the token as-is (a normal, case-sensitive match).
 *
 * @param context     matching context supplying the text and its limit
 * @param op          operation whose token is the range to test against
 * @param offset      current position; updated only when the match succeeds
 * @param direction   > 0 to read the character at offset, otherwise the one
 *                    before it
 * @param ignoreCase  currently has no effect on the result
 * @return true if the character belongs to the range
 */
bool RegularExpression::matchRange(Context* const context, const Op* const op,
                                   int& offset, const short direction,
                                   const bool ignoreCase)
{
    (void) ignoreCase;  // see REVISIT note above

    int pos = (direction > 0) ? offset : offset - 1;

    if (pos < 0 || pos >= context->fLimit)
        return false;

    XMLInt32 strCh = 0;
    if (!context->nextCh(strCh, pos, direction))
        return false;

    RangeToken* rangeTok = (RangeToken *) op->getToken();

    if (!rangeTok->match(strCh))
        return false;

    // Moving forward consumes the character; moving backward leaves the
    // offset on the character that was just examined.
    if (direction > 0)
        ++pos;
    offset = pos;

    return true;
}
void RangeToken::mergeRanges(const Token *const tok) { if (tok->getTokenType() != this->getTokenType()) ThrowXMLwithMemMgr(IllegalArgumentException, XMLExcepts::Regex_MergeRangesTypeMismatch, fMemoryManager); RangeToken* rangeTok = (RangeToken *) tok; if (rangeTok->fRanges == 0) return; fCaseIToken = 0; sortRanges(); rangeTok->sortRanges(); if (fRanges == 0) { fMaxCount = rangeTok->fMaxCount; fRanges = (XMLInt32*) fMemoryManager->allocate ( fMaxCount * sizeof(XMLInt32) );//new XMLInt32[fMaxCount]; for (unsigned int index = 0; index < rangeTok->fElemCount; index++) { fRanges[index] = rangeTok->fRanges[index]; } fElemCount = rangeTok->fElemCount; return; } unsigned int newMaxCount = (fElemCount + rangeTok->fElemCount >= fMaxCount) ? fMaxCount + rangeTok->fMaxCount : fMaxCount; XMLInt32* result = (XMLInt32*) fMemoryManager->allocate ( newMaxCount * sizeof(XMLInt32) );//new XMLInt32[newMaxCount]; for (unsigned int i=0, j=0, k=0; i < fElemCount || j < rangeTok->fElemCount;) { if (i >= fElemCount) { for (int count = 0; count < 2; count++) { result[k++] = rangeTok->fRanges[j++]; } } else if (j >= rangeTok->fElemCount) { for (int count = 0; count < 2; count++) { result[k++] = fRanges[i++]; } } else if (rangeTok->fRanges[j] < fRanges[i] || (rangeTok->fRanges[j] == fRanges[i] && rangeTok->fRanges[j+1] < fRanges[i+1])) { for (int count = 0; count < 2; count++) { result[k++] = rangeTok->fRanges[j++]; } } else { for (int count = 0; count < 2; count++) { result[k++] = fRanges[i++]; } } } fMemoryManager->deallocate(fRanges);//delete [] fRanges; fElemCount += rangeTok->fElemCount; fRanges = result; fMaxCount = newMaxCount; }
bool RegularExpression::matches(const XMLCh* const expression, const int start, const int end, Match* const pMatch , MemoryManager* const manager) { if (fOperations == 0) prepare(); Context context(manager); int strLength = XMLString::stringLen(expression); context.reset(expression, strLength, start, end, fNoClosures); bool adoptMatch = false; Match* lMatch = pMatch; if (lMatch != 0) { lMatch->setNoGroups(fNoGroups); } else if (fHasBackReferences) { lMatch = new (fMemoryManager) Match(fMemoryManager); lMatch->setNoGroups(fNoGroups); adoptMatch = true; } if (context.fAdoptMatch) delete context.fMatch; context.fMatch = lMatch; context.fAdoptMatch = adoptMatch; if (isSet(fOptions, XMLSCHEMA_MODE)) { int matchEnd = match(&context, fOperations, context.fStart, 1); if (matchEnd == context.fLimit) { if (context.fMatch != 0) { context.fMatch->setStartPos(0, context.fStart); context.fMatch->setEndPos(0, matchEnd); } return true; } return false; } /* * If the pattern has only fixed string, use Boyer-Moore */ if (fFixedStringOnly) { int ret = fBMPattern->matches(expression, context.fStart, context.fLimit); if (ret >= 0) { if (context.fMatch != 0) { context.fMatch->setStartPos(0, ret); context.fMatch->setEndPos(0, ret + strLength); } return true; } return false; } /* * If the pattern contains a fixed string, we check with Boyer-Moore * whether the text contains the fixed string or not. 
If not found * return false */ if (fFixedString != 0) { int ret = fBMPattern->matches(expression, context.fStart, context.fLimit); if (ret < 0) { // No match return false; } } int limit = context.fLimit - fMinLength; int matchStart; int matchEnd = -1; /* * Check whether the expression start with ".*" */ if (fOperations != 0 && fOperations->getOpType() == Op::O_CLOSURE && fOperations->getChild()->getOpType() == Op::O_DOT) { if (isSet(fOptions, SINGLE_LINE)) { matchStart = context.fStart; matchEnd = match(&context, fOperations, matchStart, 1); } else { bool previousIsEOL = true; for (matchStart=context.fStart; matchStart<=limit; matchStart++) { XMLCh ch = expression[matchStart]; if (RegxUtil::isEOLChar(ch)) { previousIsEOL = true; } else { if (previousIsEOL) { if (0 <= (matchEnd = match(&context, fOperations, matchStart, 1))) break; } previousIsEOL = false; } } } } else { /* * Optimization against the first char */ if (fFirstChar != 0) { bool ignoreCase = isSet(fOptions, IGNORE_CASE); RangeToken* range = fFirstChar; if (ignoreCase) range = fFirstChar->getCaseInsensitiveToken(fTokenFactory); for (matchStart=context.fStart; matchStart<=limit; matchStart++) { XMLInt32 ch; if (!context.nextCh(ch, matchStart, 1)) break; if (!range->match(ch)) { if (!ignoreCase) continue; // Perform case insensitive match // REVISIT continue; } if (0 <= (matchEnd = match(&context,fOperations,matchStart,1))) break; } } else { /* * Straightforward matching */ for (matchStart=context.fStart; matchStart<=limit; matchStart++) { if (0 <= (matchEnd = match(&context,fOperations,matchStart,1))) break; } } } if (matchEnd >= 0) { if (context.fMatch != 0) { context.fMatch->setStartPos(0, matchStart); context.fMatch->setEndPos(0, matchEnd); } return true; } return false; }
/* * Prepares for matching. This method is called just before starting matching */ void RegularExpression::prepare() { XMLMutexLock lockInit(&fMutex); compile(fTokenTree); fMinLength = fTokenTree->getMinLength(); fFirstChar = 0; if (!isSet(fOptions, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION) && !isSet(fOptions, XMLSCHEMA_MODE)) { RangeToken* rangeTok = fTokenFactory->createRange(); int result = fTokenTree->analyzeFirstCharacter(rangeTok, fOptions, fTokenFactory); if (result == Token::FC_TERMINAL) { rangeTok->compactRanges(); fFirstChar = rangeTok; } } if (fOperations != 0 && fOperations->getNextOp() == 0 && (fOperations->getOpType() == Op::O_STRING || fOperations->getOpType() == Op::O_CHAR) ) { fFixedStringOnly = true; if (fOperations->getOpType() == Op::O_STRING) { fMemoryManager->deallocate(fFixedString);//delete [] fFixedString; fFixedString = XMLString::replicate(fOperations->getLiteral(), fMemoryManager); } else{ XMLInt32 ch = fOperations->getData(); if ( ch >= 0x10000) { // add as constant fMemoryManager->deallocate(fFixedString);//delete [] fFixedString; fFixedString = RegxUtil::decomposeToSurrogates(ch, fMemoryManager); } else { XMLCh* dummyStr = (XMLCh*) fMemoryManager->allocate(2 * sizeof(XMLCh));//new XMLCh[2]; dummyStr[0] = (XMLCh) fOperations->getData(); dummyStr[1] = chNull; fMemoryManager->deallocate(fFixedString);//delete [] fFixedString; fFixedString = dummyStr; } } fBMPattern = new (fMemoryManager) BMPattern(fFixedString, 256, isSet(fOptions, IGNORE_CASE), fMemoryManager); } else if (!isSet(fOptions, XMLSCHEMA_MODE) && !isSet(fOptions, PROHIBIT_FIXED_STRING_OPTIMIZATION)) { int fixedOpts = 0; Token* tok = fTokenTree->findFixedString(fOptions, fixedOpts); fMemoryManager->deallocate(fFixedString);//delete [] fFixedString; fFixedString = (tok == 0) ? 
0 : XMLString::replicate(tok->getString(), fMemoryManager); if (fFixedString != 0 && XMLString::stringLen(fFixedString) < 2) { fMemoryManager->deallocate(fFixedString);//delete [] fFixedString; fFixedString = 0; } if (fFixedString != 0) { fBMPattern = new (fMemoryManager) BMPattern(fFixedString, 256, isSet(fixedOpts, IGNORE_CASE)); } } }
// --------------------------------------------------------------------------- // ASCIIRangeFactory: Range creation methods // --------------------------------------------------------------------------- void ASCIIRangeFactory::buildRanges() { if (fRangesCreated) return; if (!fKeywordsInitialized) { initializeKeywordMap(); } RangeTokenMap* rangeTokMap = RangeTokenMap::instance(); TokenFactory* tokFactory = rangeTokMap->getTokenFactory(); // Create space ranges RangeToken* tok = tokFactory->createRange(); tok->addRange(chHTab, chHTab); tok->addRange(chLF, chLF); tok->addRange(chFF, chFF); tok->addRange(chCR, chCR); tok->addRange(chSpace, chSpace); rangeTokMap->setRangeToken(fgASCIISpace, tok); // Create digits ranges tok = tokFactory->createRange(); tok->addRange(chDigit_0, chDigit_9); rangeTokMap->setRangeToken(fgASCIIDigit, tok); // Create word ranges tok = tokFactory->createRange(); tok->addRange(chDigit_0, chDigit_9); tok->addRange(chLatin_A, chLatin_Z); tok->addRange(chUnderscore, chUnderscore); tok->addRange(chLatin_a, chLatin_z); rangeTokMap->setRangeToken(fgASCIIWord, tok); // Create xdigit ranges tok = tokFactory->createRange(); tok->addRange(chDigit_0, chDigit_9); tok->addRange(chLatin_A, chLatin_F); tok->addRange(chLatin_a, chLatin_a); rangeTokMap->setRangeToken(fgASCIIXDigit, tok); // Create ascii ranges tok = tokFactory->createRange(); tok->addRange(0x00, 0x7F); rangeTokMap->setRangeToken(fgASCII, tok); fRangesCreated = true; }
// --------------------------------------------------------------------------- // ASCIIRangeFactory: Range creation methods // --------------------------------------------------------------------------- void ASCIIRangeFactory::buildRanges(RangeTokenMap *rangeTokMap) { if (fRangesCreated) return; if (!fKeywordsInitialized) { initializeKeywordMap(rangeTokMap); } TokenFactory* tokFactory = rangeTokMap->getTokenFactory(); // Create space ranges RangeToken* tok = tokFactory->createRange(); tok->addRange(chHTab, chHTab); tok->addRange(chLF, chLF); tok->addRange(chFF, chFF); tok->addRange(chCR, chCR); tok->addRange(chSpace, chSpace); // Build the internal map. tok->createMap(); rangeTokMap->setRangeToken(fgASCIISpace, tok); tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory); // Build the internal map. tok->createMap(); rangeTokMap->setRangeToken(fgASCIISpace, tok , true); // Create digits ranges tok = tokFactory->createRange(); tok->addRange(chDigit_0, chDigit_9); // Build the internal map. tok->createMap(); rangeTokMap->setRangeToken(fgASCIIDigit, tok); tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory); // Build the internal map. tok->createMap(); rangeTokMap->setRangeToken(fgASCIIDigit, tok , true); // Create word ranges tok = tokFactory->createRange(); tok->addRange(chDigit_0, chDigit_9); tok->addRange(chLatin_A, chLatin_Z); tok->addRange(chUnderscore, chUnderscore); tok->addRange(chLatin_a, chLatin_z); // Build the internal map. tok->createMap(); rangeTokMap->setRangeToken(fgASCIIWord, tok); tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory); // Build the internal map. tok->createMap(); rangeTokMap->setRangeToken(fgASCIIWord, tok , true); // Create xdigit ranges tok = tokFactory->createRange(); tok->addRange(chDigit_0, chDigit_9); tok->addRange(chLatin_A, chLatin_F); tok->addRange(chLatin_a, chLatin_a); // Build the internal map. 
tok->createMap(); rangeTokMap->setRangeToken(fgASCIIXDigit, tok); tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory); // Build the internal map. tok->createMap(); rangeTokMap->setRangeToken(fgASCIIXDigit, tok , true); // Create ascii ranges tok = tokFactory->createRange(); tok->addRange(0x00, 0x7F); // Build the internal map. tok->createMap(); rangeTokMap->setRangeToken(fgASCII, tok); tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory); // Build the internal map. tok->createMap(); rangeTokMap->setRangeToken(fgASCII, tok , true); fRangesCreated = true; }
// ---------------------------------------------------------------------------
//  XMLRangeFactory: Range creation methods
// ---------------------------------------------------------------------------
/**
 * Creates the XML range tokens (space, digit, name char, initial name char
 * and word) from the character tables gWhitespaceChars, gDigitChars,
 * gBaseChars, gIdeographicChars, gCombiningChars and gExtenderChars, and
 * registers them with rangeTokMap.
 *
 * For every category two tokens are registered under the same name: the
 * range itself and, with the extra `true` argument to setRangeToken, the
 * token returned by RangeToken::complementRanges for it.
 *
 * The range arrays are allocated from XMLPlatformUtils::fgMemoryManager and
 * handed to the tokens through setRangeValues().
 *
 * The method is a no-op once fRangesCreated is set.
 *
 * @param rangeTokMap map that receives the tokens and supplies the token
 *                    factory
 */
void XMLRangeFactory::buildRanges(RangeTokenMap *rangeTokMap) {

    if (fRangesCreated)
        return;

    if (!fKeywordsInitialized) {
        initializeKeywordMap(rangeTokMap);
    }

    TokenFactory* tokFactory = rangeTokMap->getTokenFactory();

    // Create space ranges
    unsigned int wsTblLen = getTableLen(gWhitespaceChars);
    RangeToken* tok = tokFactory->createRange();
    XMLInt32* wsRange = (XMLInt32*) XMLPlatformUtils::fgMemoryManager->allocate
    (
        wsTblLen * sizeof(XMLInt32)
    );//new XMLInt32[wsTblLen];

    // The token is given the array first; setupRange() then fills it in.
    tok->setRangeValues(wsRange, wsTblLen);
    setupRange(wsRange, gWhitespaceChars, 0);

    // Build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgXMLSpace, tok);

    tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory);
    // Build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgXMLSpace, tok , true);

    // Create digits ranges
    tok = tokFactory->createRange();
    unsigned int digitTblLen = getTableLen(gDigitChars);
    XMLInt32* digitRange = (XMLInt32*) XMLPlatformUtils::fgMemoryManager->allocate
    (
        digitTblLen * sizeof(XMLInt32)
    );//new XMLInt32[digitTblLen];

    tok->setRangeValues(digitRange, digitTblLen);
    setupRange(digitRange, gDigitChars, 0);

    // Build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgXMLDigit, tok);

    tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory);
    // Build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgXMLDigit, tok , true);

    // Build word ranges: base chars, then ideographic chars, then digits,
    // laid out back to back in one array.
    unsigned int baseTblLen = getTableLen(gBaseChars);
    unsigned int ideoTblLen = getTableLen(gIdeographicChars);
    unsigned int wordRangeLen = baseTblLen + ideoTblLen + digitTblLen;
    XMLInt32* wordRange = (XMLInt32*) XMLPlatformUtils::fgMemoryManager->allocate
    (
        wordRangeLen * sizeof(XMLInt32)
    );//new XMLInt32[wordRangeLen];
    // The janitor holds wordRange until it is handed to the word token near
    // the end of this function (see janWordRange.orphan() below).
    ArrayJanitor<XMLInt32> janWordRange(wordRange, XMLPlatformUtils::fgMemoryManager);

    setupRange(wordRange, gBaseChars, 0);
    setupRange(wordRange, gIdeographicChars, baseTblLen);
    // NOTE(review): digitRange belongs to the digit token, which was sorted
    // and compacted by complementRanges() above - confirm that its contents
    // and length are still those written by setupRange().
    memcpy(wordRange + baseTblLen + ideoTblLen, digitRange, digitTblLen * sizeof(XMLInt32));

    // Create NameChar ranges: the word ranges plus combining chars, extender
    // chars and the four single characters '-', ':', '.', '_' (hence the
    // 8 extra slots, one [low, high] pair each).
    tok = tokFactory->createRange();
    unsigned int combTblLen = getTableLen(gCombiningChars);
    unsigned int extTblLen = getTableLen(gExtenderChars);
    unsigned int nameTblLen = wordRangeLen + combTblLen + extTblLen;
    XMLInt32* nameRange = (XMLInt32*) XMLPlatformUtils::fgMemoryManager->allocate
    (
        (nameTblLen + 8) * sizeof(XMLInt32)
    );//new XMLInt32[nameTblLen + 8];

    tok->setRangeValues(nameRange, nameTblLen + 8);
    memcpy(nameRange, wordRange, wordRangeLen * sizeof(XMLInt32));
    setupRange(nameRange, gCombiningChars, wordRangeLen);
    setupRange(nameRange, gExtenderChars, wordRangeLen + combTblLen);
    // nameTblLen is used as the write cursor for the appended pairs.
    nameRange[nameTblLen++] = chDash;
    nameRange[nameTblLen++] = chDash;
    nameRange[nameTblLen++] = chColon;
    nameRange[nameTblLen++] = chColon;
    nameRange[nameTblLen++] = chPeriod;
    nameRange[nameTblLen++] = chPeriod;
    nameRange[nameTblLen++] = chUnderscore;
    nameRange[nameTblLen++] = chUnderscore;
    tok->sortRanges();
    tok->compactRanges();

    // Build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgXMLNameChar, tok);

    tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory);
    // Build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgXMLNameChar, tok , true);

    // Create initialNameChar ranges: base and ideographic chars (the first
    // baseTblLen + ideoTblLen entries of wordRange) plus ':' and '_'.
    tok = tokFactory->createRange();
    unsigned int initialNameTblLen = baseTblLen + ideoTblLen;
    XMLInt32* initialNameRange = (XMLInt32*) XMLPlatformUtils::fgMemoryManager->allocate
    (
        (initialNameTblLen + 4) * sizeof(XMLInt32)
    );//new XMLInt32[initialNameTblLen + 4];

    tok->setRangeValues(initialNameRange, initialNameTblLen + 4);
    memcpy(initialNameRange, wordRange, initialNameTblLen * sizeof(XMLInt32));
    // initialNameTblLen is used as the write cursor for the appended pairs.
    initialNameRange[initialNameTblLen++] = chColon;
    initialNameRange[initialNameTblLen++] = chColon;
    initialNameRange[initialNameTblLen++] = chUnderscore;
    initialNameRange[initialNameTblLen++] = chUnderscore;
    tok->sortRanges();
    tok->compactRanges();

    // Build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgXMLInitialNameChar, tok);

    tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory);
    // Build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgXMLInitialNameChar, tok , true);

    // Create word range. This is done last because wordRange was still
    // needed unsorted as the copy source for the two name ranges above.
    tok = tokFactory->createRange();
    tok->setRangeValues(wordRange, wordRangeLen);
    // The token holds the array now; release it from the janitor.
    janWordRange.orphan();
    tok->sortRanges();
    tok->compactRanges();

    // Build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgXMLWord, tok);

    tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory);
    // Build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgXMLWord, tok , true);

    fRangesCreated = true;
}
// --------------------------------------------------------------------------- // UnicodeRangeFactory: Range creation methods // --------------------------------------------------------------------------- void UnicodeRangeFactory::buildRanges(RangeTokenMap *rangeTokMap) { if (fRangesCreated) return; if (!fKeywordsInitialized) { initializeKeywordMap(rangeTokMap); } TokenFactory* tokFactory = rangeTokMap->getTokenFactory(); RangeToken* ranges[UNICATEGSIZE]; RangeToken* tok; for (int i=0; i < UNICATEGSIZE; i++) { ranges[i] = tokFactory->createRange(); } for (int j=0; j < 0x10000; j++) { unsigned short charType = XMLUniCharacter::getType(j); ranges[charType]->addRange(j, j); charType = getUniCategory(charType); ranges[charType]->addRange(j, j); } ranges[XMLUniCharacter::UNASSIGNED]->addRange(0x10000, Token::UTF16_MAX); for (int k=0; k < UNICATEGSIZE; k++) { tok = (RangeToken*) RangeToken::complementRanges(ranges[k], tokFactory); // build the internal map. tok->createMap(); rangeTokMap->setRangeToken(uniCategNames[k], ranges[k]); rangeTokMap->setRangeToken(uniCategNames[k], tok , true); } // Create all range tok = tokFactory->createRange(); tok->addRange(0, Token::UTF16_MAX); // build the internal map. tok->createMap(); rangeTokMap->setRangeToken(fgUniAll, tok); // Create alpha range tok = tokFactory->createRange(); tok->mergeRanges(ranges[XMLUniCharacter::UPPERCASE_LETTER]); tok->mergeRanges(ranges[XMLUniCharacter::LOWERCASE_LETTER]); tok->mergeRanges(ranges[XMLUniCharacter::OTHER_LETTER]); // build the internal map. tok->createMap(); rangeTokMap->setRangeToken(fgUniIsAlpha, tok); // Create alpha-num range RangeToken* alnumTok = tokFactory->createRange(); alnumTok->mergeRanges(tok); alnumTok->mergeRanges(ranges[XMLUniCharacter::DECIMAL_DIGIT_NUMBER]); // build the internal map. 
alnumTok->createMap(); rangeTokMap->setRangeToken(fgUniIsAlnum, alnumTok); // Create word range tok = tokFactory->createRange(); tok->mergeRanges(alnumTok); tok->addRange(chUnderscore, chUnderscore); // build the internal map. tok->createMap(); rangeTokMap->setRangeToken(fgUniIsWord, tok); tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory); // build the internal map. tok->createMap(); rangeTokMap->setRangeToken(fgUniIsWord, tok , true); // Create assigned range tok = (RangeToken*)RangeToken::complementRanges( ranges[XMLUniCharacter::UNASSIGNED], tokFactory, tokFactory->getMemoryManager()); // build the internal map. tok->createMap(); rangeTokMap->setRangeToken(fgUniAssigned,tok); // Create space range tok = tokFactory->createRange(); tok->mergeRanges(ranges[XMLUniCharacter::SPACE_SEPARATOR]); tok->mergeRanges(ranges[XMLUniCharacter::LINE_SEPARATOR]); //tok->mergeRanges(ranges[XMLUniCharacter::PARAGRAPH_SEPARATOR]); // build the internal map. tok->createMap(); rangeTokMap->setRangeToken(fgUniIsSpace, tok); tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory); // build the internal map. tok->createMap(); rangeTokMap->setRangeToken(fgUniIsSpace, tok , true); // build the internal maps. for (int l=0; l < UNICATEGSIZE; l++) { ranges[l]->createMap(); } fRangesCreated = true; }
/**
 * Parses a bracketed character class, starting at the opening '[' that is
 * the current token, and returns the range token describing it.
 *
 * Handled inside the brackets: a leading '^' (negation), single characters,
 * "from-to" ranges, escapes (shorthands such as \d \w \s \i \c, \p / \P
 * properties, and escapes decoded through decodeEscaped()) and the
 * character class subtraction token (REGX_T_XMLSCHEMA_CC_SUBTRACTION),
 * which is parsed by a recursive call.
 *
 * @param useNRange for a negated class: true marks the returned token as
 *                  T_NRANGE, false replaces it by its complement
 * @return the sorted and compacted range token
 * @throws ParseException on malformed input (Parser_Atom5, Parser_CC2,
 *         Parser_CC5, Parser_CC6, Parser_Ope3)
 */
RangeToken* RegxParser::parseCharacterClass(const bool useNRange) {

    setParseContext(regexParserStateInBrackets);
    processNext();

    RangeToken* tok = 0;
    bool isNRange = false;

    // A '^' directly after '[' negates the class.
    if (getState() == REGX_T_CHAR && getCharData() == chCaret) {
        isNRange = true;
        processNext();
    }

    tok = fTokenFactory->createRange();

    parserState type;
    bool firstLoop = true;
    bool wasDecoded;    // set when the current char came from the escape "\-"

    while ( (type = getState()) != REGX_T_EOF) {

        wasDecoded = false;

        // single range | from-to-range | subtraction
        // A ']' closes the class, except as the very first item.
        if (type == REGX_T_CHAR && getCharData() == chCloseSquare && !firstLoop)
            break;

        XMLInt32 ch = getCharData();
        bool end = false;   // true when the item was fully handled as a shorthand/property

        if (type == REGX_T_BACKSOLIDUS) {

            switch(ch) {
            case chLatin_d:
            case chLatin_D:
            case chLatin_w:
            case chLatin_W:
            case chLatin_s:
            case chLatin_S:
            case chLatin_i:
            case chLatin_I:
            case chLatin_c:
            case chLatin_C:
                {
                    tok->mergeRanges(getTokenForShorthand(ch));
                    end = true;
                }
                break;
            case chLatin_p:
            case chLatin_P:
                {
                    RangeToken* tok2 = processBacksolidus_pP(ch);

                    if (tok2 == 0) {
                        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom5, getMemoryManager());
                    }

                    tok->mergeRanges(tok2);
                    end = true;
                }
                break;
            case chDash:
                wasDecoded = true;
                // fall thru to default.
            default:
                ch = decodeEscaped();
            }
        } // end if REGX_T_BACKSOLIDUS
        else if (type == REGX_T_XMLSCHEMA_CC_SUBTRACTION && !firstLoop) {

            // The subtraction applies to the positive form of the class, so
            // a pending negation is resolved first.
            if (isNRange)
            {
                tok = RangeToken::complementRanges(tok, fTokenFactory, fMemoryManager);
                isNRange=false;
            }
            RangeToken* rangeTok = parseCharacterClass(false);
            tok->subtractRanges(rangeTok);

            // After the subtracted class the enclosing class must close.
            if (getState() != REGX_T_CHAR || getCharData() != chCloseSquare) {
                ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC5, getMemoryManager());
            }
            break;
        } // end if REGX_T_XMLSCHEMA...

        // From here on getState()/getCharData() describe the token AFTER the
        // one held in type/ch.
        processNext();

        if (!end) {

            if (type == REGX_T_CHAR
                && (ch == chOpenSquare
                    || ch == chCloseSquare
                    || (ch == chDash && getCharData() == chCloseSquare && firstLoop))) {
                // if regex = [-] then invalid...
                // '[', ']', '-' not allowed and should be escaped
                // NOTE(review): ch is an XMLInt32 stored into an XMLCh here -
                // confirm how values wider than XMLCh should be reported.
                XMLCh chStr[] = { ch, chNull };
                ThrowXMLwithMemMgr2(ParseException,XMLExcepts::Parser_CC6, chStr, chStr, getMemoryManager());
            }

            // Two consecutive unescaped '-' are rejected.
            if (ch == chDash && getCharData() == chDash && getState() != REGX_T_BACKSOLIDUS && !wasDecoded) {
                XMLCh chStr[] = { ch, chNull };
                ThrowXMLwithMemMgr2(ParseException,XMLExcepts::Parser_CC6, chStr, chStr, getMemoryManager());
            }

            if (getState() != REGX_T_CHAR || getCharData() != chDash) {
                // Not followed by '-': a single character.
                tok->addRange(ch, ch);
            }
            else {

                // Followed by '-': look at what comes after the dash.
                processNext();
                if ((type = getState()) == REGX_T_EOF)
                    ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC2, getMemoryManager());

                if (type == REGX_T_CHAR && getCharData() == chCloseSquare) {
                    // "x-]": the dash is a literal character.
                    tok->addRange(ch, ch);
                    tok->addRange(chDash, chDash);
                }
                else if (type == REGX_T_XMLSCHEMA_CC_SUBTRACTION) {
                    static const XMLCh dashStr[] = { chDash, chNull};
                    ThrowXMLwithMemMgr2(ParseException, XMLExcepts::Parser_CC6, dashStr, dashStr, getMemoryManager());
                }
                else {

                    XMLInt32 rangeEnd = getCharData();
                    // Captured before any escape decoding below, so error
                    // messages show the character as read.
                    XMLCh rangeEndStr[] = { rangeEnd, chNull };

                    if (type == REGX_T_CHAR) {
                        if (rangeEnd == chOpenSquare
                            || rangeEnd == chCloseSquare
                            || rangeEnd == chDash)
                            // '[', ']', '-' not allowed and should be escaped
                            ThrowXMLwithMemMgr2(ParseException, XMLExcepts::Parser_CC6, rangeEndStr, rangeEndStr, getMemoryManager());
                    }
                    else if (type == REGX_T_BACKSOLIDUS) {
                        rangeEnd = decodeEscaped();
                    }

                    processNext();

                    // The range must be given in ascending order.
                    if (ch > rangeEnd) {
                        XMLCh chStr[] = { ch, chNull };
                        ThrowXMLwithMemMgr2(ParseException,XMLExcepts::Parser_Ope3, rangeEndStr, chStr, getMemoryManager());
                    }

                    tok->addRange(ch, rangeEnd);
                }
            }
        }
        firstLoop = false;
    }

    // Input ended before the class was closed.
    if (getState() == REGX_T_EOF)
        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC2, getMemoryManager());

    if (isNRange)
    {
        if(useNRange)
            tok->setTokenType(Token::T_NRANGE);
        else
            tok = RangeToken::complementRanges(tok, fTokenFactory, fMemoryManager);
    }

    tok->sortRanges();
    tok->compactRanges();

    // If the case-insensitive option is enabled, we need to
    // have the new RangeToken instance build its internal
    // case-insensitive RangeToken.
    if (RegularExpression::isSet(fOptions, RegularExpression::IGNORE_CASE))
    {
        tok->getCaseInsensitiveToken(fTokenFactory);
    }

    setParseContext(regexParserStateNormal);
    processNext();

    return tok;
}
// --------------------------------------------------------------------------- // RangeToken: Getter methods // --------------------------------------------------------------------------- RangeToken* RangeToken::getCaseInsensitiveToken(TokenFactory* const tokFactory) { if (fCaseIToken == 0 && tokFactory && fRanges) { bool isNRange = (getTokenType() == T_NRANGE) ? true : false; RangeToken* lwrToken = tokFactory->createRange(isNRange); #if XERCES_USE_TRANSCODER_ICU && ((U_ICU_VERSION_MAJOR_NUM > 2) || (U_ICU_VERSION_MAJOR_NUM == 2 && U_ICU_VERSION_MINOR_NUM >=4)) UChar* rangeStr=(UChar*)fMemoryManager->allocate(40*fElemCount*sizeof(UChar)); ArrayJanitor<UChar> janRange(rangeStr, fMemoryManager); int c=0; rangeStr[c++] = chOpenSquare; for (unsigned int i = 0; i < fElemCount - 1; i += 2) { XMLCh buffer[10]; XMLSize_t len, j; rangeStr[c++] = chBackSlash; rangeStr[c++] = chLatin_U; XMLString::binToText(fRanges[i], buffer, 10, 16, fMemoryManager); len = XMLString::stringLen(buffer); for(j=0;j<(8-len);j++) rangeStr[c++] = chDigit_0; XMLCh* p=buffer; while(*p) rangeStr[c++] = *p++; if(fRanges[i+1]!=fRanges[i]) { rangeStr[c++] = chDash; rangeStr[c++] = chBackSlash; rangeStr[c++] = chLatin_U; XMLString::binToText(fRanges[i+1], buffer, 10, 16, fMemoryManager); len = XMLString::stringLen(buffer); for(j=0;j<(8-len);j++) rangeStr[c++] = chDigit_0; p=buffer; while(*p) rangeStr[c++] = *p++; } } rangeStr[c++] = chCloseSquare; rangeStr[c++] = chNull; UErrorCode ec=U_ZERO_ERROR; USet* range=uset_openPatternOptions(rangeStr, -1, USET_CASE_INSENSITIVE, &ec); if(range) { ec = U_ZERO_ERROR; uint32_t cbCount=uset_serialize(range, NULL, 0, &ec); uint16_t* buffer=(uint16_t*)fMemoryManager->allocate(cbCount*sizeof(uint16_t)); ArrayJanitor<uint16_t> janSet(buffer, fMemoryManager); ec = U_ZERO_ERROR; uset_serialize(range, buffer, cbCount, &ec); USerializedSet serializedSet; uset_getSerializedSet(&serializedSet, buffer, cbCount); int32_t nSets=uset_getSerializedRangeCount(&serializedSet); 
for(int32_t i=0; i<nSets; i++) { UChar32 start, end; uset_getSerializedRange(&serializedSet, i, &start, &end); lwrToken->addRange(start, end); } // does this release the memory allocated by the set? uset_setSerializedToOne(&serializedSet, 32); uset_close(range); } #else unsigned int exceptIndex = 0; for (unsigned int i = 0; i < fElemCount - 1; i += 2) { for (XMLInt32 ch = fRanges[i]; ch <= fRanges[i + 1]; ++ch) { #if XERCES_USE_TRANSCODER_ICU const XMLInt32 upperCh = u_toupper(ch); if (upperCh != ch) { lwrToken->addRange(upperCh, upperCh); } const XMLInt32 lowerCh = u_tolower(ch); if (lowerCh != ch) { lwrToken->addRange(lowerCh, lowerCh); } const XMLInt32 titleCh = u_totitle(ch); if (titleCh != ch && titleCh != upperCh) { lwrToken->addRange(titleCh, titleCh); } #else if (ch >= chLatin_A && ch <= chLatin_Z) { ch += chLatin_a - chLatin_A; lwrToken->addRange(ch, ch); } else if (ch >= chLatin_a && ch <= chLatin_z) { ch -= chLatin_a - chLatin_A; lwrToken->addRange(ch, ch); } #endif const unsigned int exceptionsSize = sizeof(s_exceptions) / sizeof(s_exceptions[0]); // Add any exception chars. These are characters where the the // case mapping is not symmetric. (Unicode case mappings are not isomorphic...) while (exceptIndex < exceptionsSize) { if (s_exceptions[exceptIndex].baseChar < ch) { ++exceptIndex; } else if (s_exceptions[exceptIndex].baseChar == ch) { const XMLInt32 matchingChar = s_exceptions[exceptIndex].matchingChar; lwrToken->addRange( matchingChar, matchingChar); ++exceptIndex; } else { break; } } } } lwrToken->mergeRanges(this); #endif lwrToken->compactRanges(); lwrToken->createMap(); fCaseIToken = lwrToken; // TODO(dbertoni) This is a temporary hack until we can change the ABI. // See Jira issue XERCESC-1866 for more details. // Overload the fCaseIToken data member to be the case-insensitive token // that's caching the case-insensitive one. We need this because tokens // have varying lifetimes. 
fCaseIToken->setCaseInsensitiveToken(this); } return fCaseIToken; }