Example #1
0
// ---------------------------------------------------------------------------
//  BlockRangeFactory: Range creation methods
// ---------------------------------------------------------------------------
void BlockRangeFactory::buildRanges() {

    if (fRangesCreated)
        return;

    if (!fKeywordsInitialized) {
        initializeKeywordMap();
    }

    RangeTokenMap* rangeTokMap = RangeTokenMap::instance();
    TokenFactory* tokFactory = rangeTokMap->getTokenFactory();

    //for performance, once the desired specials and private use are found
    //don't need to compareString anymore
    bool foundSpecial = false;
    bool foundPrivate = false;

    for (int i=0; i < BLOCKNAMESIZE; i++) {
        RangeToken* tok = tokFactory->createRange();
        tok->addRange(blockRanges[i*2], blockRanges[(i*2)+1]);

        if (!foundSpecial && XMLString::equals((XMLCh*)fgBlockNames[i] , (XMLCh*) fgBlockIsSpecials)) {
            tok->addRange(0xFFF0, 0xFFFD);
            foundSpecial = true;
        }
        if (!foundPrivate && XMLString::equals((XMLCh*)fgBlockNames[i] , (XMLCh*) fgBlockIsPrivateUse)) {
            tok->addRange(0xF0000, 0xFFFFD);
            tok->addRange(0x100000, 0x10FFFD);
            foundPrivate = true;
        }
        rangeTokMap->setRangeToken(fgBlockNames[i], tok);
    }

    fRangesCreated = true;
}
Example #2
0
/**
  * for RANGE: Creates complement.
  * for NRANGE: Creates the same meaning RANGE.
  */
Token* RangeToken::complementRanges(RangeToken* const tok,
                                    TokenFactory* const tokFactory,
                                    MemoryManager* const manager) {

    if (tok->getTokenType() != T_RANGE && tok->getTokenType() != T_NRANGE)
        ThrowXMLwithMemMgr(IllegalArgumentException, XMLExcepts::Regex_ComplementRangesInvalidArg, manager);

    tok->sortRanges();
    tok->compactRanges();

    XMLInt32 lastElem = tok->fRanges[tok->fElemCount - 1];
    RangeToken* rangeTok = tokFactory->createRange();

    if (tok->fRanges[0] > 0) {
        rangeTok->addRange(0, tok->fRanges[0] - 1);
    }

    for (unsigned int i= 1; i< tok->fElemCount - 2; i += 2) {
        rangeTok->addRange(tok->fRanges[i] + 1, tok->fRanges[i+1] - 1);
    }

    if (lastElem != UTF16_MAX) {
        rangeTok->addRange(lastElem + 1, UTF16_MAX);
    }

    rangeTok->fCompacted = true;

    return rangeTok;
}
Example #3
0
bool RegularExpression::matchRange(Context* const context, const Op* const op,
								   int& offset, const short direction,
								   const bool ignoreCase)
{
	int tmpOffset = direction > 0 ? offset : offset - 1;

	if (tmpOffset >= context->fLimit || tmpOffset < 0)
		return false;

	XMLInt32 strCh = 0;
	
	if (!context->nextCh(strCh, tmpOffset, direction))
		return false;

	RangeToken* tok = (RangeToken *) op->getToken();
	bool match = false;

	if (ignoreCase) {
		tok = tok->getCaseInsensitiveToken(fTokenFactory);
	}

    match = tok->match(strCh);

	if (!match)
		return false;

	offset = (direction > 0) ? ++tmpOffset : tmpOffset;

	return true;
}
Example #4
0
// ---------------------------------------------------------------------------
//  RangeToken: Getter methods
// ---------------------------------------------------------------------------
RangeToken* RangeToken::getCaseInsensitiveToken(TokenFactory* const tokFactory) {

    if (fCaseIToken == 0 && tokFactory) {

        bool isNRange = (getTokenType() == T_NRANGE) ? true : false;
        RangeToken* lwrToken = tokFactory->createRange(isNRange);

        for (unsigned int i = 0;  i < fElemCount - 1;  i += 2) {
            for (XMLInt32 ch = fRanges[i];  ch <= fRanges[i + 1];  ++ch) {
#if defined(XML_USE_ICU_TRANSCODER) || defined (XML_USE_UNICONV390_TRANSCODER)
                const XMLInt32  upperCh = u_toupper(ch);

                if (upperCh != ch)
                {
                    lwrToken->addRange(upperCh, upperCh);
                }

                const XMLInt32  lowerCh = u_tolower(ch);

                if (lowerCh != ch)
                {
                    lwrToken->addRange(lowerCh, lowerCh);
                }

                const XMLInt32  titleCh = u_totitle(ch);

                if (titleCh != ch && titleCh != upperCh)
                {
                    lwrToken->addRange(titleCh, titleCh);
                }
#else
                if (ch >= chLatin_A && ch <= chLatin_Z)
                {
                    ch += chLatin_a - chLatin_A;

                    lwrToken->addRange(ch, ch);
                }
                else if (ch >= chLatin_a && ch <= chLatin_z)
                {
                    ch -= chLatin_a - chLatin_A;

                    lwrToken->addRange(ch, ch);
                }
#endif
            }
        }

        lwrToken->mergeRanges(this);
        lwrToken->compactRanges();
        lwrToken->createMap();

        fCaseIToken = lwrToken;
    }

    return fCaseIToken;
}
Example #5
0
// ---------------------------------------------------------------------------
//  RangeToken: Getter methods
// ---------------------------------------------------------------------------
RangeToken* RangeToken::getCaseInsensitiveToken(TokenFactory* const tokFactory) {

    // REVIST
    // We will not build a token with case insenstive ranges
    // For now we will return a copy of ourselves.
    if (fCaseIToken == 0 && tokFactory) {

        bool isNRange = (getTokenType() == T_NRANGE) ? true : false;
        RangeToken* lwrToken = tokFactory->createRange(isNRange);

        lwrToken->mergeRanges(this);
        fCaseIToken = lwrToken;
    }

    return fCaseIToken;
}
Example #6
0
bool RegularExpression::matchRange(Context* const context, const Op* const op,
								   int& offset, const short direction,
								   const bool ignoreCase)
{
	int tmpOffset = direction > 0 ? offset : offset - 1;

	if (tmpOffset >= context->fLimit || tmpOffset < 0)
		return false;

	XMLInt32 strCh = 0;
	
	if (!context->nextCh(strCh, tmpOffset, direction))
		return false;

	RangeToken* tok = (RangeToken *) op->getToken();
	bool match = false;

	if (ignoreCase) {

		//REVISIT we should match ignoring case, but for now
		//we will do a normal match
		//tok = tok->getCaseInsensitiveToken();
		//if (!token->match(strCh)) {

		//	if (strCh > 0x10000)
		//		return -1;
			// Do case insensitive matching - uppercase match
			// or lowercase match
		//}
		match = tok->match(strCh);
	}
	else
		match = tok->match(strCh);

	if (!match)
		return false;

	offset = (direction > 0) ? ++tmpOffset : tmpOffset;

	return true;
}
Example #7
0
void RangeToken::mergeRanges(const Token *const tok) {


    if (tok->getTokenType() != this->getTokenType())
        ThrowXMLwithMemMgr(IllegalArgumentException, XMLExcepts::Regex_MergeRangesTypeMismatch, fMemoryManager);

    RangeToken* rangeTok = (RangeToken *) tok;

    if (rangeTok->fRanges == 0)
        return;

    fCaseIToken = 0;
    sortRanges();
    rangeTok->sortRanges();

    if (fRanges == 0) {

        fMaxCount = rangeTok->fMaxCount;
        fRanges = (XMLInt32*) fMemoryManager->allocate
        (
            fMaxCount * sizeof(XMLInt32)
        );//new XMLInt32[fMaxCount];
        for (unsigned int index = 0; index < rangeTok->fElemCount; index++) {
            fRanges[index] = rangeTok->fRanges[index];
        }

        fElemCount = rangeTok->fElemCount;
        return;
    }

    unsigned int newMaxCount = (fElemCount + rangeTok->fElemCount >= fMaxCount)
                                 ? fMaxCount + rangeTok->fMaxCount : fMaxCount;
    XMLInt32* result = (XMLInt32*) fMemoryManager->allocate
    (
        newMaxCount * sizeof(XMLInt32)
    );//new XMLInt32[newMaxCount];

    for (unsigned int i=0, j=0, k=0; i < fElemCount || j < rangeTok->fElemCount;) {

        if (i >= fElemCount) {

            for (int count = 0; count < 2; count++) {
                result[k++] = rangeTok->fRanges[j++];
            }
        }
        else if (j >= rangeTok->fElemCount) {

            for (int count = 0; count < 2; count++) {
                result[k++] = fRanges[i++];
            }
        }
        else if (rangeTok->fRanges[j] < fRanges[i]
                 || (rangeTok->fRanges[j] == fRanges[i]
                     && rangeTok->fRanges[j+1] < fRanges[i+1])) {

            for (int count = 0; count < 2; count++) {
                result[k++] = rangeTok->fRanges[j++];
            }
        }
        else {

            for (int count = 0; count < 2; count++) {

                result[k++] = fRanges[i++];
            }
        }
    }

    fMemoryManager->deallocate(fRanges);//delete [] fRanges;
    fElemCount += rangeTok->fElemCount;
    fRanges = result;
    fMaxCount = newMaxCount;
}
Example #8
0
bool RegularExpression::matches(const XMLCh* const expression, const int start,
                                const int end, Match* const pMatch
                                , MemoryManager* const manager)	{
		
	if (fOperations == 0)
		prepare();

	Context context(manager);
	int		 strLength = XMLString::stringLen(expression);

    context.reset(expression, strLength, start, end, fNoClosures);

	bool adoptMatch = false;
	Match* lMatch = pMatch;

	if (lMatch != 0) {
		lMatch->setNoGroups(fNoGroups);
	}
	else if (fHasBackReferences) {

		lMatch = new (fMemoryManager) Match(fMemoryManager);
		lMatch->setNoGroups(fNoGroups);
		adoptMatch = true;
	}

	if (context.fAdoptMatch)
		delete context.fMatch;
    context.fMatch = lMatch;
	context.fAdoptMatch = adoptMatch;

	if (isSet(fOptions, XMLSCHEMA_MODE)) {

		int matchEnd = match(&context, fOperations, context.fStart, 1);

		if (matchEnd == context.fLimit) {

			if (context.fMatch != 0) {

				context.fMatch->setStartPos(0, context.fStart);
				context.fMatch->setEndPos(0, matchEnd);
			}		
			return true;
		}

		return false;
	}

	/*
	 *	If the pattern has only fixed string, use Boyer-Moore
	 */
	if (fFixedStringOnly) {

		int ret = fBMPattern->matches(expression, context.fStart,
			                          context.fLimit);
		if (ret >= 0) {

			if (context.fMatch != 0) {
				context.fMatch->setStartPos(0, ret);
				context.fMatch->setEndPos(0, ret + strLength);
			}		
			return true;
		}		
		return false;
	}

	/*
	 *	If the pattern contains a fixed string, we check with Boyer-Moore
	 *	whether the text contains the fixed string or not. If not found
	 *	return false
	 */
	if (fFixedString != 0) {

		int ret = fBMPattern->matches(expression, context.fStart,
                                      context.fLimit);

		if (ret < 0) { // No match
			return false;
		}
	}

	int limit = context.fLimit - fMinLength;
	int matchStart;
	int matchEnd = -1;

	/*
	 *	Check whether the expression start with ".*"
	 */
	if (fOperations != 0 && fOperations->getOpType() == Op::O_CLOSURE
        && fOperations->getChild()->getOpType() == Op::O_DOT) {

		if (isSet(fOptions, SINGLE_LINE)) {
			matchStart = context.fStart;
			matchEnd = match(&context, fOperations, matchStart, 1);
		}
		else {
			bool previousIsEOL = true;

			for (matchStart=context.fStart; matchStart<=limit; matchStart++) {

				XMLCh ch = expression[matchStart];
				if (RegxUtil::isEOLChar(ch)) {
					previousIsEOL = true;
				}
				else {

					if (previousIsEOL) {
						if (0 <= (matchEnd = match(&context, fOperations,
                                                   matchStart, 1)))
                            break;
					}

					previousIsEOL = false;
				}
			}
		}
	}
	else {
        /*
         *	Optimization against the first char
         */
		if (fFirstChar != 0) {
			bool ignoreCase = isSet(fOptions, IGNORE_CASE);
			RangeToken* range = fFirstChar;

			if (ignoreCase)
				range = fFirstChar->getCaseInsensitiveToken(fTokenFactory);

			for (matchStart=context.fStart; matchStart<=limit; matchStart++) {

                XMLInt32 ch;

				if (!context.nextCh(ch, matchStart, 1))
					break;

				if (!range->match(ch)) {

					if (!ignoreCase)
						continue;

					// Perform case insensitive match
					// REVISIT
					continue;
				}

				if (0 <= (matchEnd = match(&context,fOperations,matchStart,1)))
					break;
            }
		}
		else {

            /*
             *	Straightforward matching
             */
			for (matchStart=context.fStart; matchStart<=limit; matchStart++) {

				if (0 <= (matchEnd = match(&context,fOperations,matchStart,1)))
					break;
			}
		}
	}

	if (matchEnd >= 0) {

		if (context.fMatch != 0) {

			context.fMatch->setStartPos(0, matchStart);
			context.fMatch->setEndPos(0, matchEnd);
		}		
		return true;
	}
	return false;
}
Example #9
0
/*
 * Prepares for matching. This method is called just before starting matching
 */
void RegularExpression::prepare() {

	XMLMutexLock lockInit(&fMutex);

	compile(fTokenTree);

	fMinLength = fTokenTree->getMinLength();
	fFirstChar = 0;

	if (!isSet(fOptions, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION) &&
		!isSet(fOptions, XMLSCHEMA_MODE))							{

		RangeToken* rangeTok = fTokenFactory->createRange();
		int result = fTokenTree->analyzeFirstCharacter(rangeTok, fOptions, fTokenFactory);

		if (result == Token::FC_TERMINAL) {

			rangeTok->compactRanges();
			fFirstChar = rangeTok;
		}
	}

	if (fOperations != 0 && fOperations->getNextOp() == 0 &&
		(fOperations->getOpType() == Op::O_STRING ||
		 fOperations->getOpType() == Op::O_CHAR) )			 {

		fFixedStringOnly = true;

		if (fOperations->getOpType() == Op::O_STRING) {
			fMemoryManager->deallocate(fFixedString);//delete [] fFixedString;
			fFixedString = XMLString::replicate(fOperations->getLiteral(), fMemoryManager);
		}
		else{
			
			XMLInt32 ch = fOperations->getData();

			if ( ch >= 0x10000) { // add as constant
				fMemoryManager->deallocate(fFixedString);//delete [] fFixedString;
				fFixedString = RegxUtil::decomposeToSurrogates(ch, fMemoryManager);
			}
			else {

				XMLCh* dummyStr = (XMLCh*) fMemoryManager->allocate(2 * sizeof(XMLCh));//new XMLCh[2];
				dummyStr[0] = (XMLCh) fOperations->getData();
				dummyStr[1] = chNull;
				fMemoryManager->deallocate(fFixedString);//delete [] fFixedString;
				fFixedString = dummyStr;
			}
		}

		fBMPattern = new (fMemoryManager) BMPattern(fFixedString, 256,
								  isSet(fOptions, IGNORE_CASE), fMemoryManager);
	}
	else if (!isSet(fOptions, XMLSCHEMA_MODE) &&
			 !isSet(fOptions, PROHIBIT_FIXED_STRING_OPTIMIZATION)) {

		int fixedOpts = 0;
		Token* tok = fTokenTree->findFixedString(fOptions, fixedOpts);

		fMemoryManager->deallocate(fFixedString);//delete [] fFixedString;

		fFixedString = (tok == 0) ? 0
			: XMLString::replicate(tok->getString(), fMemoryManager);

		if (fFixedString != 0 && XMLString::stringLen(fFixedString) < 2) {

			fMemoryManager->deallocate(fFixedString);//delete [] fFixedString;
			fFixedString = 0;
		}
		
		if (fFixedString != 0) {

			fBMPattern = new (fMemoryManager) BMPattern(fFixedString, 256,
									   isSet(fixedOpts, IGNORE_CASE));
		}
	}
}
Example #10
0
// ---------------------------------------------------------------------------
//  ASCIIRangeFactory: Range creation methods
// ---------------------------------------------------------------------------
void ASCIIRangeFactory::buildRanges() {

    if (fRangesCreated)
        return;

    if (!fKeywordsInitialized) {
        initializeKeywordMap();
    }

    RangeTokenMap* rangeTokMap = RangeTokenMap::instance();
    TokenFactory* tokFactory = rangeTokMap->getTokenFactory();

    // Create space ranges
    RangeToken* tok = tokFactory->createRange();
    tok->addRange(chHTab, chHTab);
	tok->addRange(chLF, chLF);
	tok->addRange(chFF, chFF);
	tok->addRange(chCR, chCR);
	tok->addRange(chSpace, chSpace);
	rangeTokMap->setRangeToken(fgASCIISpace, tok);

    // Create digits ranges
    tok = tokFactory->createRange();
    tok->addRange(chDigit_0, chDigit_9);
    rangeTokMap->setRangeToken(fgASCIIDigit, tok);

    // Create word ranges
    tok = tokFactory->createRange();
    tok->addRange(chDigit_0, chDigit_9);
	tok->addRange(chLatin_A, chLatin_Z);
    tok->addRange(chUnderscore, chUnderscore);
	tok->addRange(chLatin_a, chLatin_z);
    rangeTokMap->setRangeToken(fgASCIIWord, tok);

    // Create xdigit ranges
    tok = tokFactory->createRange();
    tok->addRange(chDigit_0, chDigit_9);
    tok->addRange(chLatin_A, chLatin_F);
    tok->addRange(chLatin_a, chLatin_a);
    rangeTokMap->setRangeToken(fgASCIIXDigit, tok);

    // Create ascii ranges
    tok = tokFactory->createRange();
    tok->addRange(0x00, 0x7F);
    rangeTokMap->setRangeToken(fgASCII, tok);

    fRangesCreated = true;
}
// ---------------------------------------------------------------------------
//  ASCIIRangeFactory: Range creation methods
// ---------------------------------------------------------------------------
void ASCIIRangeFactory::buildRanges(RangeTokenMap *rangeTokMap) {

    if (fRangesCreated)
        return;

    if (!fKeywordsInitialized) {
        initializeKeywordMap(rangeTokMap);
    }

    TokenFactory* tokFactory = rangeTokMap->getTokenFactory();

    // Create space ranges
    RangeToken* tok = tokFactory->createRange();
    tok->addRange(chHTab, chHTab);
    tok->addRange(chLF, chLF);
    tok->addRange(chFF, chFF);
    tok->addRange(chCR, chCR);
    tok->addRange(chSpace, chSpace);

    // Build the internal map.
    tok->createMap();

    rangeTokMap->setRangeToken(fgASCIISpace, tok);

    tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory);

    // Build the internal map.
    tok->createMap();

    rangeTokMap->setRangeToken(fgASCIISpace, tok , true);

    // Create digits ranges
    tok = tokFactory->createRange();
    tok->addRange(chDigit_0, chDigit_9);

    // Build the internal map.
    tok->createMap();

    rangeTokMap->setRangeToken(fgASCIIDigit, tok);

    tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory);

    // Build the internal map.
    tok->createMap();

    rangeTokMap->setRangeToken(fgASCIIDigit, tok , true);

    // Create word ranges
    tok = tokFactory->createRange();
    tok->addRange(chDigit_0, chDigit_9);
    tok->addRange(chLatin_A, chLatin_Z);
    tok->addRange(chUnderscore, chUnderscore);
    tok->addRange(chLatin_a, chLatin_z);
    // Build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgASCIIWord, tok);

    tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory);
    // Build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgASCIIWord, tok , true);

    // Create xdigit ranges
    tok = tokFactory->createRange();
    tok->addRange(chDigit_0, chDigit_9);
    tok->addRange(chLatin_A, chLatin_F);
    tok->addRange(chLatin_a, chLatin_a);
    // Build the internal map.
    tok->createMap();

    rangeTokMap->setRangeToken(fgASCIIXDigit, tok);

    tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory);
    // Build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgASCIIXDigit, tok , true);

    // Create ascii ranges
    tok = tokFactory->createRange();
    tok->addRange(0x00, 0x7F);
    // Build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgASCII, tok);

    tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory);
    // Build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgASCII, tok , true);

    fRangesCreated = true;
}
Example #12
0
// ---------------------------------------------------------------------------
//  XMLRangeFactory: Range creation methods
// ---------------------------------------------------------------------------
void XMLRangeFactory::buildRanges(RangeTokenMap *rangeTokMap) {

    if (fRangesCreated)
        return;

    if (!fKeywordsInitialized) {
        initializeKeywordMap(rangeTokMap);
    }

    TokenFactory* tokFactory = rangeTokMap->getTokenFactory();

    // Create space ranges
    unsigned int wsTblLen = getTableLen(gWhitespaceChars);
    RangeToken* tok = tokFactory->createRange();
    XMLInt32* wsRange = (XMLInt32*) XMLPlatformUtils::fgMemoryManager->allocate
                        (
                            wsTblLen * sizeof(XMLInt32)
                        );//new XMLInt32[wsTblLen];

    tok->setRangeValues(wsRange, wsTblLen);
    setupRange(wsRange, gWhitespaceChars, 0);
    // Build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgXMLSpace, tok);

    tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory);
    // Build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgXMLSpace, tok , true);

    // Create digits ranges
    tok = tokFactory->createRange();
    unsigned int digitTblLen = getTableLen(gDigitChars);
    XMLInt32* digitRange = (XMLInt32*) XMLPlatformUtils::fgMemoryManager->allocate
                           (
                               digitTblLen * sizeof(XMLInt32)
                           );//new XMLInt32[digitTblLen];

    tok->setRangeValues(digitRange, digitTblLen);
    setupRange(digitRange, gDigitChars, 0);
    // Build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgXMLDigit, tok);

    tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory);
    // Build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgXMLDigit, tok , true);

    // Build word ranges
    unsigned int baseTblLen = getTableLen(gBaseChars);
    unsigned int ideoTblLen = getTableLen(gIdeographicChars);
    unsigned int wordRangeLen = baseTblLen + ideoTblLen + digitTblLen;
    XMLInt32* wordRange = (XMLInt32*) XMLPlatformUtils::fgMemoryManager->allocate
                          (
                              wordRangeLen * sizeof(XMLInt32)
                          );//new XMLInt32[wordRangeLen];
    ArrayJanitor<XMLInt32> janWordRange(wordRange, XMLPlatformUtils::fgMemoryManager);

    setupRange(wordRange, gBaseChars, 0);
    setupRange(wordRange, gIdeographicChars, baseTblLen);
    memcpy(wordRange + baseTblLen + ideoTblLen, digitRange, digitTblLen * sizeof(XMLInt32));

    // Create NameChar ranges
    tok = tokFactory->createRange();
    unsigned int combTblLen = getTableLen(gCombiningChars);
    unsigned int extTblLen = getTableLen(gExtenderChars);
    unsigned int nameTblLen = wordRangeLen + combTblLen + extTblLen;
    XMLInt32* nameRange = (XMLInt32*) XMLPlatformUtils::fgMemoryManager->allocate
                          (
                              (nameTblLen + 8) * sizeof(XMLInt32)
                          );//new XMLInt32[nameTblLen + 8];

    tok->setRangeValues(nameRange, nameTblLen + 8);
    memcpy(nameRange, wordRange, wordRangeLen * sizeof(XMLInt32));
    setupRange(nameRange, gCombiningChars, wordRangeLen);
    setupRange(nameRange, gExtenderChars, wordRangeLen + combTblLen);
    nameRange[nameTblLen++] = chDash;
    nameRange[nameTblLen++] = chDash;
    nameRange[nameTblLen++] = chColon;
    nameRange[nameTblLen++] = chColon;
    nameRange[nameTblLen++] = chPeriod;
    nameRange[nameTblLen++] = chPeriod;
    nameRange[nameTblLen++] = chUnderscore;
    nameRange[nameTblLen++] = chUnderscore;
    tok->sortRanges();
    tok->compactRanges();
    // Build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgXMLNameChar, tok);

    tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory);
    // Build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgXMLNameChar, tok , true);

    // Create initialNameChar ranges
    tok = tokFactory->createRange();
    unsigned int initialNameTblLen = baseTblLen + ideoTblLen;
    XMLInt32* initialNameRange = (XMLInt32*) XMLPlatformUtils::fgMemoryManager->allocate
                                 (
                                     (initialNameTblLen + 4) * sizeof(XMLInt32)
                                 );//new XMLInt32[initialNameTblLen + 4];

    tok->setRangeValues(initialNameRange, initialNameTblLen + 4);
    memcpy(initialNameRange, wordRange, initialNameTblLen * sizeof(XMLInt32));
    initialNameRange[initialNameTblLen++] = chColon;
    initialNameRange[initialNameTblLen++] = chColon;
    initialNameRange[initialNameTblLen++] = chUnderscore;
    initialNameRange[initialNameTblLen++] = chUnderscore;
    tok->sortRanges();
    tok->compactRanges();
    // Build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgXMLInitialNameChar, tok);

    tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory);
    // Build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgXMLInitialNameChar, tok , true);

    // Create word range
    tok = tokFactory->createRange();
    tok->setRangeValues(wordRange, wordRangeLen);
    janWordRange.orphan();
    tok->sortRanges();
    tok->compactRanges();
    // Build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgXMLWord, tok);

    tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory);
    // Build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgXMLWord, tok , true);

    fRangesCreated = true;
}
Example #13
0
// ---------------------------------------------------------------------------
//  UnicodeRangeFactory: Range creation methods
// ---------------------------------------------------------------------------
void UnicodeRangeFactory::buildRanges(RangeTokenMap *rangeTokMap) {

    if (fRangesCreated)
        return;

    if (!fKeywordsInitialized) {
        initializeKeywordMap(rangeTokMap);
    }

    TokenFactory* tokFactory = rangeTokMap->getTokenFactory();
	RangeToken* ranges[UNICATEGSIZE];
    RangeToken* tok;

    for (int i=0; i < UNICATEGSIZE; i++) {
        ranges[i] = tokFactory->createRange();
    }

    for (int j=0; j < 0x10000; j++) {

        unsigned short charType = XMLUniCharacter::getType(j);

        ranges[charType]->addRange(j, j);
        charType = getUniCategory(charType);
        ranges[charType]->addRange(j, j);
    }

    ranges[XMLUniCharacter::UNASSIGNED]->addRange(0x10000, Token::UTF16_MAX);

    for (int k=0; k < UNICATEGSIZE; k++) {
        tok = (RangeToken*) RangeToken::complementRanges(ranges[k], tokFactory);
        // build the internal map.
        tok->createMap();
        rangeTokMap->setRangeToken(uniCategNames[k], ranges[k]);
        rangeTokMap->setRangeToken(uniCategNames[k], tok , true);
    }

    // Create all range
    tok = tokFactory->createRange();
    tok->addRange(0, Token::UTF16_MAX);
    // build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgUniAll, tok);

    // Create alpha range
    tok = tokFactory->createRange();
    tok->mergeRanges(ranges[XMLUniCharacter::UPPERCASE_LETTER]);
    tok->mergeRanges(ranges[XMLUniCharacter::LOWERCASE_LETTER]);
    tok->mergeRanges(ranges[XMLUniCharacter::OTHER_LETTER]);
    // build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgUniIsAlpha, tok);

    // Create alpha-num range
    RangeToken* alnumTok = tokFactory->createRange();
    alnumTok->mergeRanges(tok);
    alnumTok->mergeRanges(ranges[XMLUniCharacter::DECIMAL_DIGIT_NUMBER]);
    // build the internal map.
    alnumTok->createMap();
    rangeTokMap->setRangeToken(fgUniIsAlnum, alnumTok);

    // Create word range
    tok = tokFactory->createRange();
    tok->mergeRanges(alnumTok);
    tok->addRange(chUnderscore, chUnderscore);
    // build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgUniIsWord, tok);

    tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory);
    // build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgUniIsWord, tok , true);

    // Create assigned range
    tok = (RangeToken*)RangeToken::complementRanges(
                ranges[XMLUniCharacter::UNASSIGNED],
		        tokFactory,
                tokFactory->getMemoryManager());
    // build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgUniAssigned,tok);

    // Create space range
    tok = tokFactory->createRange();
    tok->mergeRanges(ranges[XMLUniCharacter::SPACE_SEPARATOR]);
    tok->mergeRanges(ranges[XMLUniCharacter::LINE_SEPARATOR]);
    //tok->mergeRanges(ranges[XMLUniCharacter::PARAGRAPH_SEPARATOR]);
    // build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgUniIsSpace, tok);

    tok = (RangeToken*) RangeToken::complementRanges(tok, tokFactory);
    // build the internal map.
    tok->createMap();
    rangeTokMap->setRangeToken(fgUniIsSpace, tok , true);

    // build the internal maps.
    for (int l=0; l < UNICATEGSIZE; l++) {
        ranges[l]->createMap();
    }

    fRangesCreated = true;
}
Example #14
0
RangeToken* RegxParser::parseCharacterClass(const bool useNRange) {

    setParseContext(regexParserStateInBrackets);
    processNext();

    RangeToken* tok = 0;
    bool isNRange = false;

    if (getState() == REGX_T_CHAR && getCharData() == chCaret) {
        isNRange = true;
        processNext();
    }
    tok = fTokenFactory->createRange();

    parserState type;
    bool firstLoop = true;
    bool wasDecoded;

    while ( (type = getState()) != REGX_T_EOF) {

        wasDecoded = false;

        // single range | from-to-range | subtraction
        if (type == REGX_T_CHAR && getCharData() == chCloseSquare && !firstLoop)
            break;

        XMLInt32 ch = getCharData();
        bool     end = false;

        if (type == REGX_T_BACKSOLIDUS) {

            switch(ch) {
            case chLatin_d:
            case chLatin_D:
            case chLatin_w:
            case chLatin_W:
            case chLatin_s:
            case chLatin_S:
            case chLatin_i:
            case chLatin_I:
            case chLatin_c:
            case chLatin_C:
                {
                    tok->mergeRanges(getTokenForShorthand(ch));
                    end = true;
                }
                break;
            case chLatin_p:
            case chLatin_P:
                {                    
                    RangeToken* tok2 = processBacksolidus_pP(ch);

                    if (tok2 == 0) {
                        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom5, getMemoryManager());
                    }

                    tok->mergeRanges(tok2);
                    end = true;
                }
                break;
            case chDash:
                wasDecoded = true;
                // fall thru to default.
            default:
                ch = decodeEscaped();
            }
        } // end if REGX_T_BACKSOLIDUS
        else if (type == REGX_T_XMLSCHEMA_CC_SUBTRACTION && !firstLoop) {

            if (isNRange)
            {
                tok = RangeToken::complementRanges(tok, fTokenFactory, fMemoryManager);
                isNRange=false;
            }
            RangeToken* rangeTok = parseCharacterClass(false);
            tok->subtractRanges(rangeTok);

            if (getState() != REGX_T_CHAR || getCharData() != chCloseSquare) {
                ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC5, getMemoryManager());
            }
            break;
        } // end if REGX_T_XMLSCHEMA...

        processNext();

        if (!end) {

            if (type == REGX_T_CHAR
                && (ch == chOpenSquare
                    || ch == chCloseSquare
                    || (ch == chDash && getCharData() == chCloseSquare && firstLoop))) {
                // if regex = [-] then invalid...
                // '[', ']', '-' not allowed and should be escaped
                XMLCh chStr[] = { ch, chNull };
                ThrowXMLwithMemMgr2(ParseException,XMLExcepts::Parser_CC6, chStr, chStr, getMemoryManager());
            }
            if (ch == chDash && getCharData() == chDash && getState() != REGX_T_BACKSOLIDUS && !wasDecoded) {
                XMLCh chStr[] = { ch, chNull };
                ThrowXMLwithMemMgr2(ParseException,XMLExcepts::Parser_CC6, chStr, chStr, getMemoryManager());
            }

            if (getState() != REGX_T_CHAR || getCharData() != chDash) {
                tok->addRange(ch, ch);
            }
            else {

                processNext();
                if ((type = getState()) == REGX_T_EOF)
                    ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC2, getMemoryManager());

                if (type == REGX_T_CHAR && getCharData() == chCloseSquare) {
                    tok->addRange(ch, ch);
                    tok->addRange(chDash, chDash);
                }
                else if (type == REGX_T_XMLSCHEMA_CC_SUBTRACTION) {

                    static const XMLCh dashStr[] = { chDash, chNull};
                    ThrowXMLwithMemMgr2(ParseException, XMLExcepts::Parser_CC6, dashStr, dashStr, getMemoryManager());
                }
                else {

                    XMLInt32 rangeEnd = getCharData();
                    XMLCh rangeEndStr[] = { rangeEnd, chNull };

                    if (type == REGX_T_CHAR) {

                        if (rangeEnd == chOpenSquare
                            || rangeEnd == chCloseSquare
                            || rangeEnd == chDash)
                            // '[', ']', '-' not allowed and should be escaped
                            ThrowXMLwithMemMgr2(ParseException, XMLExcepts::Parser_CC6, rangeEndStr, rangeEndStr, getMemoryManager());
                    }
                    else if (type == REGX_T_BACKSOLIDUS) {
                        rangeEnd = decodeEscaped();
                    }

                    processNext();

                    if (ch > rangeEnd) {
                        XMLCh chStr[] = { ch, chNull };
                        ThrowXMLwithMemMgr2(ParseException,XMLExcepts::Parser_Ope3, rangeEndStr, chStr, getMemoryManager());
                    }

                    tok->addRange(ch, rangeEnd);
                }
            }
        }
        firstLoop = false;
    }

    if (getState() == REGX_T_EOF)
        ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC2, getMemoryManager());

    if (isNRange)
    {
        if(useNRange)
            tok->setTokenType(Token::T_NRANGE);
        else
            tok = RangeToken::complementRanges(tok, fTokenFactory, fMemoryManager);
    }

    tok->sortRanges();
    tok->compactRanges();

    // If the case-insensitive option is enabled, we need to
    // have the new RangeToken instance build its internal
    // case-insensitive RangeToken.
    if (RegularExpression::isSet(fOptions, RegularExpression::IGNORE_CASE))
    {
        tok->getCaseInsensitiveToken(fTokenFactory);
    }

    setParseContext(regexParserStateNormal);
    processNext();

    return tok;
}
Example #15
0
// ---------------------------------------------------------------------------
//  RangeToken: Getter methods
// ---------------------------------------------------------------------------
RangeToken* RangeToken::getCaseInsensitiveToken(TokenFactory* const tokFactory) {

    if (fCaseIToken == 0 && tokFactory && fRanges) {

        bool isNRange = (getTokenType() == T_NRANGE) ? true : false;
        RangeToken* lwrToken = tokFactory->createRange(isNRange);

#if XERCES_USE_TRANSCODER_ICU && ((U_ICU_VERSION_MAJOR_NUM > 2) || (U_ICU_VERSION_MAJOR_NUM == 2 && U_ICU_VERSION_MINOR_NUM >=4))
        UChar* rangeStr=(UChar*)fMemoryManager->allocate(40*fElemCount*sizeof(UChar));
        ArrayJanitor<UChar> janRange(rangeStr, fMemoryManager);
        int c=0;
        rangeStr[c++] = chOpenSquare;
        for (unsigned int i = 0;  i < fElemCount - 1;  i += 2) {
            XMLCh buffer[10];
            XMLSize_t len, j;

            rangeStr[c++] = chBackSlash;
            rangeStr[c++] = chLatin_U;
            XMLString::binToText(fRanges[i], buffer, 10, 16, fMemoryManager);
            len = XMLString::stringLen(buffer);
            for(j=0;j<(8-len);j++)
                rangeStr[c++] = chDigit_0;
            XMLCh* p=buffer;
            while(*p)
                rangeStr[c++] = *p++;
            if(fRanges[i+1]!=fRanges[i])
            {
                rangeStr[c++] = chDash;
                rangeStr[c++] = chBackSlash;
                rangeStr[c++] = chLatin_U;
                XMLString::binToText(fRanges[i+1], buffer, 10, 16, fMemoryManager);
                len = XMLString::stringLen(buffer);
                for(j=0;j<(8-len);j++)
                    rangeStr[c++] = chDigit_0;
                p=buffer;
                while(*p)
                    rangeStr[c++] = *p++;
            }
        }
        rangeStr[c++] = chCloseSquare;
        rangeStr[c++] = chNull;
        UErrorCode ec=U_ZERO_ERROR;
        USet* range=uset_openPatternOptions(rangeStr, -1, USET_CASE_INSENSITIVE, &ec);
        if(range)
        {
            ec = U_ZERO_ERROR;
            uint32_t cbCount=uset_serialize(range, NULL, 0, &ec);
            uint16_t* buffer=(uint16_t*)fMemoryManager->allocate(cbCount*sizeof(uint16_t));
            ArrayJanitor<uint16_t> janSet(buffer, fMemoryManager);
            ec = U_ZERO_ERROR;
            uset_serialize(range, buffer, cbCount, &ec);
            USerializedSet serializedSet;
            uset_getSerializedSet(&serializedSet, buffer, cbCount);
            int32_t nSets=uset_getSerializedRangeCount(&serializedSet);
            for(int32_t i=0; i<nSets; i++)
            {
                UChar32 start, end;
                uset_getSerializedRange(&serializedSet, i, &start, &end);
                lwrToken->addRange(start, end);
            }
            // does this release the memory allocated by the set?
            uset_setSerializedToOne(&serializedSet, 32);
            uset_close(range);
        }
#else
        unsigned int exceptIndex = 0;

        for (unsigned int i = 0;  i < fElemCount - 1;  i += 2) {
            for (XMLInt32 ch = fRanges[i];  ch <= fRanges[i + 1];  ++ch) {
#if XERCES_USE_TRANSCODER_ICU
                const XMLInt32  upperCh = u_toupper(ch);

                if (upperCh != ch)
                {
                    lwrToken->addRange(upperCh, upperCh);
                }

                const XMLInt32  lowerCh = u_tolower(ch);

                if (lowerCh != ch)
                {
                    lwrToken->addRange(lowerCh, lowerCh);
                }

                const XMLInt32  titleCh = u_totitle(ch);

                if (titleCh != ch && titleCh != upperCh)
                {
                    lwrToken->addRange(titleCh, titleCh);
                }
#else
                if (ch >= chLatin_A && ch <= chLatin_Z)
                {
                    ch += chLatin_a - chLatin_A;

                    lwrToken->addRange(ch, ch);
                }
                else if (ch >= chLatin_a && ch <= chLatin_z)
                {
                    ch -= chLatin_a - chLatin_A;

                    lwrToken->addRange(ch, ch);
                }
#endif

                const unsigned int  exceptionsSize =
                    sizeof(s_exceptions) / sizeof(s_exceptions[0]);

                // Add any exception chars.  These are characters where the the
                // case mapping is not symmetric.  (Unicode case mappings are not isomorphic...)
                while (exceptIndex < exceptionsSize)
                {
                    if (s_exceptions[exceptIndex].baseChar < ch)
                    {
                        ++exceptIndex;
                    }
                    else if (s_exceptions[exceptIndex].baseChar == ch)
                    {
                        const XMLInt32  matchingChar =
                            s_exceptions[exceptIndex].matchingChar;

                        lwrToken->addRange(
                            matchingChar,
                            matchingChar);

                        ++exceptIndex;
                    }
                    else
                    {
                        break;
                    }
                }
            }
        }

        lwrToken->mergeRanges(this);
#endif
        lwrToken->compactRanges();
        lwrToken->createMap();

        fCaseIToken = lwrToken;
        // TODO(dbertoni) This is a temporary hack until we can change the ABI.
        // See Jira issue XERCESC-1866 for more details.
        // Overload the fCaseIToken data member to be the case-insensitive token
        // that's caching the case-insensitive one.  We need this because tokens
        // have varying lifetimes.
        fCaseIToken->setCaseInsensitiveToken(this);
    }

    return fCaseIToken;
}